Untitled
#include <intrin.h>
#include <malloc.h>

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <memory>

// Kernels under test. They are defined outside this translation unit with C
// linkage; only their signatures are visible here.
// NOTE(review): the loop strides in main() assume each avx_* call consumes one
// 256-bit vector (8 floats), each sse_* call one 128-bit vector (4 floats) and
// each scalar* call a single float -- confirm against the kernel sources.
extern "C" void avx_add(float* a, float* b, float* result);
extern "C" void avx_mul(float* a, float* b, float* result);
extern "C" void avx_sqrt(float* a, float* result);
extern "C" void sse_add(float* a, float* b, float* result);
extern "C" void sse_mul(float* a, float* b, float* result);
extern "C" void sse_sqrt(float* a, float* result);
extern "C" void scalaradd(float* a, float* b, float* result);
extern "C" void scalarmul(float* a, float* b, float* result);
extern "C" void scalarsqrt(float* a, float* result);

namespace {

// Number of floats per array: 100000 rounded down to a multiple of 32 so that
// both the 4-wide and the 8-wide loops cover the arrays exactly, with no tail.
constexpr int kElementCount = (100000 >> 5) << 5;

// Floats advanced per kernel call for the 256-bit and 128-bit variants.
constexpr int kAvxStride = static_cast<int>(sizeof(__m256) / sizeof(float));
constexpr int kSseStride = static_cast<int>(sizeof(__m128) / sizeof(float));

// Releases memory obtained from _aligned_malloc.
struct AlignedDeleter {
    void operator()(float* block) const noexcept { _aligned_free(block); }
};

// Owning handle for an aligned float array; frees it on scope exit.
using AlignedFloats = std::unique_ptr<float[], AlignedDeleter>;

// Allocates `count` floats aligned to sizeof(__m256) bytes.
// Returns an empty handle when the allocation fails.
AlignedFloats allocate_floats(int count) {
    return AlignedFloats(static_cast<float*>(
        _aligned_malloc(count * sizeof(float), sizeof(__m256))));
}

// Runs `body` once, measures its wall-clock duration with
// std::chrono::high_resolution_clock and prints
// "<label>: <n> microseconds" to stdout.
template <typename Body>
void time_and_report(const char* label, Body&& body) {
    const auto begin = std::chrono::high_resolution_clock::now();
    body();
    const auto finish = std::chrono::high_resolution_clock::now();
    const auto elapsed =
        std::chrono::duration_cast<std::chrono::microseconds>(finish - begin);
    std::printf("%s: %lld microseconds\n", label,
                static_cast<long long>(elapsed.count()));
}

}  // namespace

// Times one full pass of every AVX, SSE and scalar kernel over the same input
// arrays and prints the elapsed microseconds for each.
// Returns 0 on success, 1 if the working arrays could not be allocated.
int main() {
    const int size = kElementCount;  // Adjust the size based on your requirements

    const AlignedFloats a_block = allocate_floats(size);
    const AlignedFloats b_block = allocate_floats(size);
    const AlignedFloats result_block = allocate_floats(size);
    if (!a_block || !b_block || !result_block) {
        std::fprintf(stderr, "Failed to allocate aligned arrays of %d floats\n", size);
        return 1;
    }

    float* const a = a_block.get();
    float* const b = b_block.get();
    float* const result = result_block.get();

    // Initialize arrays with some values: a[i] = i, b[i] = i + 1.
    for (int i = 0; i < size; ++i) {
        a[i] = static_cast<float>(i);
        b[i] = static_cast<float>(i + 1);
    }
    std::printf("Created Arrays [A][B][Result]: size -> %d\n", size);

    // Untimed pass over the data before any measurement starts.
    // NOTE(review): presumably a warm-up (caches / page faults) -- confirm intent.
    for (int i = 0; i < size; i += kSseStride) {
        sse_mul(a + i, b + i, result + i);
    }

    // --- AVX (256-bit) kernels ---
    time_and_report("AVX 256 Multiplication", [=] {
        for (int i = 0; i < size; i += kAvxStride) {
            avx_mul(a + i, b + i, result + i);
        }
    });
    time_and_report("AVX 256 Square Root", [=] {
        for (int i = 0; i < size; i += kAvxStride) {
            avx_sqrt(a + i, result + i);
        }
    });
    time_and_report("AVX 256 Addition", [=] {
        for (int i = 0; i < size; i += kAvxStride) {
            avx_add(a + i, b + i, result + i);
        }
    });

    // --- SSE (128-bit) kernels ---
    time_and_report("SSE2 Multiplication", [=] {
        for (int i = 0; i < size; i += kSseStride) {
            sse_mul(a + i, b + i, result + i);
        }
    });
    time_and_report("SSE2 Square Root", [=] {
        for (int i = 0; i < size; i += kSseStride) {
            sse_sqrt(a + i, result + i);
        }
    });
    time_and_report("SSE2 Addition", [=] {
        for (int i = 0; i < size; i += kSseStride) {
            sse_add(a + i, b + i, result + i);
        }
    });

    // --- Scalar kernels (one element per call) ---
    time_and_report("Scalar Add", [=] {
        for (int i = 0; i < size; ++i) {
            scalaradd(a + i, b + i, result + i);
        }
    });
    time_and_report("Scalar Mult", [=] {
        for (int i = 0; i < size; ++i) {
            scalarmul(a + i, b + i, result + i);
        }
    });
    time_and_report("Scalar Square Root", [=] {
        for (int i = 0; i < size; ++i) {
            scalarsqrt(a + i, result + i);
        }
    });

    // The AlignedFloats handles release the arrays when they go out of scope.
    return 0;
}
Leave a Comment