gppd AVX2

 avatar
user_3093867
c_cpp
6 months ago
7.2 kB
3
Indexable
//! AVX2
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#define PNG_NO_SETJMP
#include <sched.h>
#include <assert.h>
#include <png.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <immintrin.h> // For AVX2 intrinsics
// #include <xmmintrin.h> // SSE
// #include <emmintrin.h> // SSE2

#define CHUNK_SIZE 16 // Number of rows to process in each chunk

// Structure to hold thread arguments and shared data
typedef struct
{
    int width;
    int height;
    int iters;
    double left, right, lower, upper;
    int *image;
    int next_row;
    pthread_mutex_t mutex;
} thread_data_t;

// Function to write PNG file (unchanged)
void write_png(const char *filename, int iters, int width, int height, const int *buffer)
{
    FILE *fp = fopen(filename, "wb");
    assert(fp);
    png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
    assert(png_ptr);
    png_infop info_ptr = png_create_info_struct(png_ptr);
    assert(info_ptr);
    png_init_io(png_ptr, fp);
    png_set_IHDR(png_ptr, info_ptr, width, height, 8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
                 PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
    png_set_filter(png_ptr, 0, PNG_NO_FILTERS);
    png_write_info(png_ptr, info_ptr);
    png_set_compression_level(png_ptr, 1);
    size_t row_size = 3 * width * sizeof(png_byte);
    png_bytep row = (png_bytep)malloc(row_size);
    for (int y = 0; y < height; ++y)
    {
        memset(row, 0, row_size);
        for (int x = 0; x < width; ++x)
        {
            int p = buffer[(height - 1 - y) * width + x];
            png_bytep color = row + x * 3;
            if (p != iters)
            {
                if (p & 16)
                {
                    color[0] = 240;
                    color[1] = color[2] = p % 16 * 16;
                }
                else
                {
                    color[0] = p % 16 * 16;
                }
            }
        }
        png_write_row(png_ptr, row);
    }
    free(row);
    png_write_end(png_ptr, NULL);
    png_destroy_write_struct(&png_ptr, &info_ptr);
    fclose(fp);
}

void *compute_mandelbrot(void *arg)
{
    thread_data_t *data = (thread_data_t *)arg;
    int start_row, end_row;

    __m256d x0_vec, y0_vec, x_vec, y_vec, length_squared_vec;
    __m256d two_vec = _mm256_set1_pd(2.0);
    __m256d four_vec = _mm256_set1_pd(4.0);

    while (1)
    {
        pthread_mutex_lock(&data->mutex);
        start_row = data->next_row;
        end_row = start_row + CHUNK_SIZE;
        if (end_row > data->height)
            end_row = data->height;
        data->next_row = end_row;
        pthread_mutex_unlock(&data->mutex);

        if (start_row >= data->height)
            break;

        for (int j = start_row; j < end_row; ++j)
        {
            double y0 = j * ((data->upper - data->lower) / data->height) + data->lower;
            y0_vec = _mm256_set1_pd(y0);

            for (int i = 0; i < data->width; i += 4)
            {
                double x0_1 = i * ((data->right - data->left) / data->width) + data->left;
                double x0_2 = (i + 1) * ((data->right - data->left) / data->width) + data->left;
                double x0_3 = (i + 2) * ((data->right - data->left) / data->width) + data->left;
                double x0_4 = (i + 3) * ((data->right - data->left) / data->width) + data->left;
                x0_vec = _mm256_set_pd(x0_4, x0_3, x0_2, x0_1);

                x_vec = _mm256_setzero_pd();
                y_vec = _mm256_setzero_pd();
                length_squared_vec = _mm256_setzero_pd();

                __m256i repeats_vec = _mm256_setzero_si256();
                __m256i ones_vec = _mm256_set1_epi64x(1);
                __m256i max_iters_vec = _mm256_set1_epi64x(data->iters);

                for (int k = 0; k < data->iters; ++k)
                {
                    __m256d xy_vec = _mm256_mul_pd(x_vec, y_vec);
                    __m256d xx_vec = _mm256_mul_pd(x_vec, x_vec);
                    __m256d yy_vec = _mm256_mul_pd(y_vec, y_vec);

                    y_vec = _mm256_add_pd(_mm256_mul_pd(two_vec, xy_vec), y0_vec);
                    x_vec = _mm256_add_pd(_mm256_sub_pd(xx_vec, yy_vec), x0_vec);

                    length_squared_vec = _mm256_add_pd(xx_vec, yy_vec);

                    __m256d mask_vec = _mm256_cmp_pd(length_squared_vec, four_vec, _CMP_LT_OQ);
                    __m256i mask_int_vec = _mm256_castpd_si256(mask_vec);

                    if (_mm256_movemask_pd(mask_vec) == 0)
                        break;

                    repeats_vec = _mm256_add_epi64(_mm256_and_si256(mask_int_vec, ones_vec), repeats_vec);
                }

                // Convert __m256i to array of integers
                int64_t results[4];
                _mm256_storeu_si256((__m256i *)results, repeats_vec);

                for (int k = 0; k < 4; ++k)
                {
                    if (i + k < data->width)
                    {
                        data->image[j * data->width + i + k] = results[k];
                    }
                    else
                    {
                        break; // Exit the loop if we've reached the end of the row
                    }
                }
            }
        }
    }

    return NULL;
}

int main(int argc, char **argv)
{
    /* detect how many CPUs are available */
    cpu_set_t cpu_set;
    sched_getaffinity(0, sizeof(cpu_set), &cpu_set);
    int num_cpus = CPU_COUNT(&cpu_set);
    printf("%d cpus available\n", num_cpus);

    /* argument parsing */
    assert(argc == 9);
    const char *filename = argv[1];
    int iters = strtol(argv[2], 0, 10);
    double left = strtod(argv[3], 0);
    double right = strtod(argv[4], 0);
    double lower = strtod(argv[5], 0);
    double upper = strtod(argv[6], 0);
    int width = strtol(argv[7], 0, 10);
    int height = strtol(argv[8], 0, 10);

    /* allocate memory for image */
    int *image = (int *)malloc(width * height * sizeof(int));
    assert(image);

    /* initialize thread data */
    thread_data_t thread_data = {
        .width = width,
        .height = height,
        .iters = iters,
        .left = left,
        .right = right,
        .lower = lower,
        .upper = upper,
        .image = image,
        .next_row = 0};
    pthread_mutex_init(&thread_data.mutex, NULL);

    /* create threads */
    int num_threads = num_cpus; // Use the number of available CPUs
    pthread_t threads[num_threads];

    for (int i = 0; i < num_threads; ++i)
    {
        pthread_create(&threads[i], NULL, compute_mandelbrot, &thread_data);
    }

    /* wait for threads to finish */
    for (int i = 0; i < num_threads; ++i)
    {
        pthread_join(threads[i], NULL);
    }

    /* cleanup */
    pthread_mutex_destroy(&thread_data.mutex);

    /* draw and cleanup */
    write_png(filename, iters, width, height, image);
    free(image);

    return 0;
}
Editor is loading...
Leave a Comment