Untitled

mail@pastecode.io avatar
unknown
plain_text
a year ago
5.8 kB
3
Indexable
#include <intrin.h>
#include <malloc.h>

#include <chrono>
#include <cstdint>
#include <cstdio>

// Arithmetic kernels with C linkage. Their definitions are not in this file
// (presumably a separate assembly/C object -- TODO confirm).
// Binary kernels take (a, b, result); unary kernels take (a, result).
// How many floats each call consumes is decided by the kernel itself; main()
// advances 8 floats per avx_* call, 4 per sse_* call and 1 per scalar* call.
extern "C" {

// AVX variants (reported by main() as "AVX 256").
void avx_add(float* a, float* b, float* result);
void avx_mul(float* a, float* b, float* result);
void avx_sqrt(float* a, float* result);

// SSE variants (reported by main() as "SSE2").
void sse_add(float* a, float* b, float* result);
void sse_mul(float* a, float* b, float* result);
void sse_sqrt(float* a, float* result);

// Scalar variants (reported by main() as "Scalar").
void scalaradd(float* a, float* b, float* result);
void scalarmul(float* a, float* b, float* result);
void scalarsqrt(float* a, float* result);

} // extern "C"


int main() {
    const int size = (100000 >> 5) << 5; // Adjust the size based on your requirements

    float* a = static_cast<float*>(_aligned_malloc(size * sizeof(float), sizeof(__m256)));
    float* b = static_cast<float*>(_aligned_malloc(size * sizeof(float), sizeof(__m256)));
    float* result = static_cast<float*>(_aligned_malloc(size * sizeof(float), sizeof(__m256)));

    // Initialize arrays with some values
    for (int i = 0; i < size; ++i) 
    {
        a[i] = static_cast<float>(i);
        b[i] = static_cast<float>(i + 1);
    }

    printf("Created Arrays [A][B][Result]: size -> %d\n", size);
    auto start_time = std::chrono::high_resolution_clock::now();
    auto end_time = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);

    for (int i = 0; i < size; i += (16 / sizeof(float)))
    {
        sse_mul(a + i, b + i, result + i);
    }

    // Benchmark SSE2 multiplication
    start_time = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < size; i += (32 / sizeof(float)))
    {
        avx_mul(a + i, b + i, result + i);
    }
    end_time = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
    printf("AVX 256 Multiplication: %lld microseconds\n", static_cast<long long>(duration.count()));


    // Benchmark SSE2 square root
    start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < size; i += (32 / sizeof(float)))
    {
        avx_sqrt(a + i, result + i);
    }
    end_time = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
    printf("AVX 256  Square Root: %lld microseconds\n", static_cast<long long>(duration.count()));

    // Benchmark SSE2 addition
    start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < size; i += (32 / sizeof(float)))
    {
       avx_add(a + i, b + i, result + i);
    }
    end_time = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
    printf("AVX 256  Addition: %lld microseconds\n", static_cast<long long>(duration.count()));


    // Benchmark SSE2 multiplication
    start_time = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < size; i += (16 / sizeof(float)))
    {
        sse_mul(a + i, b + i, result + i);
    }
    end_time = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
    printf("SSE2 Multiplication: %lld microseconds\n", static_cast<long long>(duration.count()));


    // Benchmark SSE2 square root
    start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < size; i += (16 / sizeof(float)))
    {
        sse_sqrt(a + i, result + i);
    }
    end_time = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
    printf("SSE2 Square Root: %lld microseconds\n", static_cast<long long>(duration.count()));

    // Benchmark SSE2 addition
    start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < size; i += (16 / sizeof(float)))
    {
        sse_add(a + i, b + i, result + i);
    }
    end_time = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
    printf("SSE2 Addition: %lld microseconds\n", static_cast<long long>(duration.count()));


    // Benchmark SSE2 square root
    start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < size; i++)
    {
        scalaradd(a + i, b + i, result + i);
    }
    end_time = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
    printf("Scalar Add: %lld microseconds\n", static_cast<long long>(duration.count()));


    // Benchmark SSE2 square root
    start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < size; i++)
    {
        scalarmul(a + i, b + i, result + i);
    }
    end_time = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
    printf("Scalar Mult: %lld microseconds\n", static_cast<long long>(duration.count()));


    // Benchmark SSE2 square root
    start_time = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < size; ++i)
    {
        scalarsqrt(a + i, result + i);
    }
    end_time = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
    printf("Scalar Square Root: %lld microseconds\n", static_cast<long long>(duration.count()));

    // Cleanup
    _aligned_free(a);
    _aligned_free(b);
    _aligned_free(result);

    return 0;
}
Leave a Comment