gppd AVX2
user_3093867
c_cpp
a year ago
7.2 kB
5
Indexable
//! AVX2
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#define PNG_NO_SETJMP
#include <sched.h>
#include <assert.h>
#include <png.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <immintrin.h> // For AVX2 intrinsics
// #include <xmmintrin.h> // SSE
// #include <emmintrin.h> // SSE2
#define CHUNK_SIZE 16 // Number of rows to process in each chunk
// Structure to hold thread arguments and shared data
typedef struct
{
int width;
int height;
int iters;
double left, right, lower, upper;
int *image;
int next_row;
pthread_mutex_t mutex;
} thread_data_t;
// Function to write PNG file (unchanged)
void write_png(const char *filename, int iters, int width, int height, const int *buffer)
{
FILE *fp = fopen(filename, "wb");
assert(fp);
png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
assert(png_ptr);
png_infop info_ptr = png_create_info_struct(png_ptr);
assert(info_ptr);
png_init_io(png_ptr, fp);
png_set_IHDR(png_ptr, info_ptr, width, height, 8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
png_set_filter(png_ptr, 0, PNG_NO_FILTERS);
png_write_info(png_ptr, info_ptr);
png_set_compression_level(png_ptr, 1);
size_t row_size = 3 * width * sizeof(png_byte);
png_bytep row = (png_bytep)malloc(row_size);
for (int y = 0; y < height; ++y)
{
memset(row, 0, row_size);
for (int x = 0; x < width; ++x)
{
int p = buffer[(height - 1 - y) * width + x];
png_bytep color = row + x * 3;
if (p != iters)
{
if (p & 16)
{
color[0] = 240;
color[1] = color[2] = p % 16 * 16;
}
else
{
color[0] = p % 16 * 16;
}
}
}
png_write_row(png_ptr, row);
}
free(row);
png_write_end(png_ptr, NULL);
png_destroy_write_struct(&png_ptr, &info_ptr);
fclose(fp);
}
void *compute_mandelbrot(void *arg)
{
thread_data_t *data = (thread_data_t *)arg;
int start_row, end_row;
__m256d x0_vec, y0_vec, x_vec, y_vec, length_squared_vec;
__m256d two_vec = _mm256_set1_pd(2.0);
__m256d four_vec = _mm256_set1_pd(4.0);
while (1)
{
pthread_mutex_lock(&data->mutex);
start_row = data->next_row;
end_row = start_row + CHUNK_SIZE;
if (end_row > data->height)
end_row = data->height;
data->next_row = end_row;
pthread_mutex_unlock(&data->mutex);
if (start_row >= data->height)
break;
for (int j = start_row; j < end_row; ++j)
{
double y0 = j * ((data->upper - data->lower) / data->height) + data->lower;
y0_vec = _mm256_set1_pd(y0);
for (int i = 0; i < data->width; i += 4)
{
double x0_1 = i * ((data->right - data->left) / data->width) + data->left;
double x0_2 = (i + 1) * ((data->right - data->left) / data->width) + data->left;
double x0_3 = (i + 2) * ((data->right - data->left) / data->width) + data->left;
double x0_4 = (i + 3) * ((data->right - data->left) / data->width) + data->left;
x0_vec = _mm256_set_pd(x0_4, x0_3, x0_2, x0_1);
x_vec = _mm256_setzero_pd();
y_vec = _mm256_setzero_pd();
length_squared_vec = _mm256_setzero_pd();
__m256i repeats_vec = _mm256_setzero_si256();
__m256i ones_vec = _mm256_set1_epi64x(1);
__m256i max_iters_vec = _mm256_set1_epi64x(data->iters);
for (int k = 0; k < data->iters; ++k)
{
__m256d xy_vec = _mm256_mul_pd(x_vec, y_vec);
__m256d xx_vec = _mm256_mul_pd(x_vec, x_vec);
__m256d yy_vec = _mm256_mul_pd(y_vec, y_vec);
y_vec = _mm256_add_pd(_mm256_mul_pd(two_vec, xy_vec), y0_vec);
x_vec = _mm256_add_pd(_mm256_sub_pd(xx_vec, yy_vec), x0_vec);
length_squared_vec = _mm256_add_pd(xx_vec, yy_vec);
__m256d mask_vec = _mm256_cmp_pd(length_squared_vec, four_vec, _CMP_LT_OQ);
__m256i mask_int_vec = _mm256_castpd_si256(mask_vec);
if (_mm256_movemask_pd(mask_vec) == 0)
break;
repeats_vec = _mm256_add_epi64(_mm256_and_si256(mask_int_vec, ones_vec), repeats_vec);
}
// Convert __m256i to array of integers
int64_t results[4];
_mm256_storeu_si256((__m256i *)results, repeats_vec);
for (int k = 0; k < 4; ++k)
{
if (i + k < data->width)
{
data->image[j * data->width + i + k] = results[k];
}
else
{
break; // Exit the loop if we've reached the end of the row
}
}
}
}
}
return NULL;
}
int main(int argc, char **argv)
{
/* detect how many CPUs are available */
cpu_set_t cpu_set;
sched_getaffinity(0, sizeof(cpu_set), &cpu_set);
int num_cpus = CPU_COUNT(&cpu_set);
printf("%d cpus available\n", num_cpus);
/* argument parsing */
assert(argc == 9);
const char *filename = argv[1];
int iters = strtol(argv[2], 0, 10);
double left = strtod(argv[3], 0);
double right = strtod(argv[4], 0);
double lower = strtod(argv[5], 0);
double upper = strtod(argv[6], 0);
int width = strtol(argv[7], 0, 10);
int height = strtol(argv[8], 0, 10);
/* allocate memory for image */
int *image = (int *)malloc(width * height * sizeof(int));
assert(image);
/* initialize thread data */
thread_data_t thread_data = {
.width = width,
.height = height,
.iters = iters,
.left = left,
.right = right,
.lower = lower,
.upper = upper,
.image = image,
.next_row = 0};
pthread_mutex_init(&thread_data.mutex, NULL);
/* create threads */
int num_threads = num_cpus; // Use the number of available CPUs
pthread_t threads[num_threads];
for (int i = 0; i < num_threads; ++i)
{
pthread_create(&threads[i], NULL, compute_mandelbrot, &thread_data);
}
/* wait for threads to finish */
for (int i = 0; i < num_threads; ++i)
{
pthread_join(threads[i], NULL);
}
/* cleanup */
pthread_mutex_destroy(&thread_data.mutex);
/* draw and cleanup */
write_png(filename, iters, width, height, image);
free(image);
return 0;
}
Editor is loading...
Leave a Comment