Untitled
unknown
plain_text
2 years ago
5.8 kB
9
Indexable
#include <intrin.h>
#include <malloc.h>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <memory>
// Arithmetic kernels declared with C linkage; none of them is defined in this file.
// NOTE(review): presumably implemented in a separate assembly/object file — confirm.
//
// main() advances its pointers by 8 floats per avx_* call, 4 floats per sse_* call
// and 1 float per scalar* call, so each call presumably processes that many
// elements starting at the given addresses — confirm against the implementations.
// NOTE(review): whether the vector kernels require 32-/16-byte aligned pointers
// cannot be determined from here; main() only ever passes suitably aligned ones.

// 256-bit (AVX) variants: binary ops read `a` and `b`, sqrt reads only `a`;
// all take `result` as the output pointer.
extern "C" void avx_add(float* a, float* b, float* result);
extern "C" void avx_mul(float* a, float* b, float* result);
extern "C" void avx_sqrt(float* a, float* result);
// 128-bit (SSE) variants with the same parameter layout.
extern "C" void sse_add(float* a, float* b, float* result);
extern "C" void sse_mul(float* a, float* b, float* result);
extern "C" void sse_sqrt(float* a, float* result);
// Scalar variants with the same parameter layout.
extern "C" void scalaradd(float* a, float* b, float* result);
extern "C" void scalarmul(float* a, float* b, float* result);
extern "C" void scalarsqrt(float* a, float* result);
namespace {

/// Deleter that hands storage obtained from _aligned_malloc back to _aligned_free.
struct AlignedFloatDeleter {
    void operator()(float* block) const noexcept { _aligned_free(block); }
};

/// Owning handle for an aligned float buffer; the buffer is released on every exit path.
using AlignedFloatBuffer = std::unique_ptr<float[], AlignedFloatDeleter>;

/// Allocates `count` floats aligned to sizeof(__m256).
/// @return an owning handle, or an empty handle if the allocation failed.
AlignedFloatBuffer allocate_floats(int count) {
    void* block = _aligned_malloc(static_cast<std::size_t>(count) * sizeof(float), sizeof(__m256));
    return AlignedFloatBuffer(static_cast<float*>(block));
}

/// Times one full pass over the arrays and prints "<label>: <n> microseconds".
/// @param label  text printed before the elapsed time
/// @param size   number of elements in the arrays
/// @param step   elements consumed per kernel call (the loop increment)
/// @param kernel callable invoked as kernel(i) for i = 0, step, 2*step, ... < size
template <typename Kernel>
void run_timed(const char* label, int size, int step, Kernel kernel) {
    // steady_clock is monotonic, which is what interval measurement needs.
    const auto begin = std::chrono::steady_clock::now();
    for (int i = 0; i < size; i += step) {
        kernel(i);
    }
    const auto finish = std::chrono::steady_clock::now();
    const auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(finish - begin);
    std::printf("%s: %lld microseconds\n", label, static_cast<long long>(elapsed.count()));
}

}  // namespace

/// Fills two aligned input arrays, then times the AVX, SSE and scalar kernels
/// (multiply, square root, add) over the whole array and prints each duration.
/// @return 0 on success, EXIT_FAILURE if a buffer could not be allocated.
int main() {
    // Rounded down to a multiple of 32 so every loop stride below divides it evenly.
    const int size = (100000 >> 5) << 5; // Adjust the size based on your requirements
    constexpr int sse_step = static_cast<int>(16 / sizeof(float)); // floats per 128-bit vector
    constexpr int avx_step = static_cast<int>(32 / sizeof(float)); // floats per 256-bit vector

    const AlignedFloatBuffer a_buf = allocate_floats(size);
    const AlignedFloatBuffer b_buf = allocate_floats(size);
    const AlignedFloatBuffer result_buf = allocate_floats(size);
    if (!a_buf || !b_buf || !result_buf) {
        // Without this check a failed allocation would be dereferenced below.
        std::fprintf(stderr, "Failed to allocate aligned buffers for %d floats\n", size);
        return EXIT_FAILURE;
    }
    float* const a = a_buf.get();
    float* const b = b_buf.get();
    float* const result = result_buf.get();

    // Initialize arrays with some values
    for (int i = 0; i < size; ++i) {
        a[i] = static_cast<float>(i);
        b[i] = static_cast<float>(i + 1);
    }
    std::printf("Created Arrays [A][B][Result]: size -> %d\n", size);

    // Untimed pass over all three arrays before any measurement.
    // NOTE(review): presumably a cache/page warm-up so the first timed kernel is not penalized.
    for (int i = 0; i < size; i += sse_step) {
        sse_mul(a + i, b + i, result + i);
    }

    // Benchmark the 256-bit AVX kernels
    run_timed("AVX 256 Multiplication", size, avx_step, [=](int i) { avx_mul(a + i, b + i, result + i); });
    run_timed("AVX 256 Square Root", size, avx_step, [=](int i) { avx_sqrt(a + i, result + i); });
    run_timed("AVX 256 Addition", size, avx_step, [=](int i) { avx_add(a + i, b + i, result + i); });

    // Benchmark the 128-bit SSE kernels
    run_timed("SSE2 Multiplication", size, sse_step, [=](int i) { sse_mul(a + i, b + i, result + i); });
    run_timed("SSE2 Square Root", size, sse_step, [=](int i) { sse_sqrt(a + i, result + i); });
    run_timed("SSE2 Addition", size, sse_step, [=](int i) { sse_add(a + i, b + i, result + i); });

    // Benchmark the scalar kernels, one element per call
    run_timed("Scalar Add", size, 1, [=](int i) { scalaradd(a + i, b + i, result + i); });
    run_timed("Scalar Mult", size, 1, [=](int i) { scalarmul(a + i, b + i, result + i); });
    run_timed("Scalar Square Root", size, 1, [=](int i) { scalarsqrt(a + i, result + i); });

    // Cleanup happens automatically when the buffer handles go out of scope.
    return 0;
}
Leave a Comment