gppd AVX2
user_3093867
c_cpp
6 months ago
7.2 kB
3
Indexable
//! AVX2 #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #define PNG_NO_SETJMP #include <sched.h> #include <assert.h> #include <png.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <pthread.h> #include <immintrin.h> // For AVX2 intrinsics // #include <xmmintrin.h> // SSE // #include <emmintrin.h> // SSE2 #define CHUNK_SIZE 16 // Number of rows to process in each chunk // Structure to hold thread arguments and shared data typedef struct { int width; int height; int iters; double left, right, lower, upper; int *image; int next_row; pthread_mutex_t mutex; } thread_data_t; // Function to write PNG file (unchanged) void write_png(const char *filename, int iters, int width, int height, const int *buffer) { FILE *fp = fopen(filename, "wb"); assert(fp); png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); assert(png_ptr); png_infop info_ptr = png_create_info_struct(png_ptr); assert(info_ptr); png_init_io(png_ptr, fp); png_set_IHDR(png_ptr, info_ptr, width, height, 8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT); png_set_filter(png_ptr, 0, PNG_NO_FILTERS); png_write_info(png_ptr, info_ptr); png_set_compression_level(png_ptr, 1); size_t row_size = 3 * width * sizeof(png_byte); png_bytep row = (png_bytep)malloc(row_size); for (int y = 0; y < height; ++y) { memset(row, 0, row_size); for (int x = 0; x < width; ++x) { int p = buffer[(height - 1 - y) * width + x]; png_bytep color = row + x * 3; if (p != iters) { if (p & 16) { color[0] = 240; color[1] = color[2] = p % 16 * 16; } else { color[0] = p % 16 * 16; } } } png_write_row(png_ptr, row); } free(row); png_write_end(png_ptr, NULL); png_destroy_write_struct(&png_ptr, &info_ptr); fclose(fp); } void *compute_mandelbrot(void *arg) { thread_data_t *data = (thread_data_t *)arg; int start_row, end_row; __m256d x0_vec, y0_vec, x_vec, y_vec, length_squared_vec; __m256d two_vec = _mm256_set1_pd(2.0); __m256d four_vec = _mm256_set1_pd(4.0); while (1) { pthread_mutex_lock(&data->mutex); start_row = data->next_row; end_row = start_row + CHUNK_SIZE; if (end_row > data->height) end_row = data->height; data->next_row = end_row; pthread_mutex_unlock(&data->mutex); if (start_row >= data->height) break; for (int j = start_row; j < end_row; ++j) { double y0 = j * ((data->upper - data->lower) / data->height) + data->lower; y0_vec = _mm256_set1_pd(y0); for (int i = 0; i < data->width; i += 4) { double x0_1 = i * ((data->right - data->left) / data->width) + data->left; double x0_2 = (i + 1) * ((data->right - data->left) / data->width) + data->left; double x0_3 = (i + 2) * ((data->right - data->left) / data->width) + data->left; double x0_4 = (i + 3) * ((data->right - data->left) / data->width) + data->left; x0_vec = _mm256_set_pd(x0_4, x0_3, x0_2, x0_1); x_vec = _mm256_setzero_pd(); y_vec = _mm256_setzero_pd(); length_squared_vec = _mm256_setzero_pd(); __m256i repeats_vec = _mm256_setzero_si256(); __m256i ones_vec = _mm256_set1_epi64x(1); __m256i max_iters_vec = _mm256_set1_epi64x(data->iters); for (int k = 0; k < data->iters; ++k) { __m256d xy_vec = _mm256_mul_pd(x_vec, y_vec); __m256d xx_vec = _mm256_mul_pd(x_vec, x_vec); __m256d yy_vec = _mm256_mul_pd(y_vec, y_vec); y_vec = _mm256_add_pd(_mm256_mul_pd(two_vec, xy_vec), y0_vec); x_vec = _mm256_add_pd(_mm256_sub_pd(xx_vec, yy_vec), x0_vec); length_squared_vec = _mm256_add_pd(xx_vec, yy_vec); __m256d mask_vec = _mm256_cmp_pd(length_squared_vec, four_vec, _CMP_LT_OQ); __m256i mask_int_vec = _mm256_castpd_si256(mask_vec); if (_mm256_movemask_pd(mask_vec) == 0) break; repeats_vec = _mm256_add_epi64(_mm256_and_si256(mask_int_vec, ones_vec), repeats_vec); } // Convert __m256i to array of integers int64_t results[4]; _mm256_storeu_si256((__m256i *)results, repeats_vec); for (int k = 0; k < 4; ++k) { if (i + k < data->width) { data->image[j * data->width + i + k] = results[k]; } else { break; // Exit the loop if we've reached the end of the row } } } } } return NULL; } int main(int argc, char **argv) { /* detect how many CPUs are available */ cpu_set_t cpu_set; sched_getaffinity(0, sizeof(cpu_set), &cpu_set); int num_cpus = CPU_COUNT(&cpu_set); printf("%d cpus available\n", num_cpus); /* argument parsing */ assert(argc == 9); const char *filename = argv[1]; int iters = strtol(argv[2], 0, 10); double left = strtod(argv[3], 0); double right = strtod(argv[4], 0); double lower = strtod(argv[5], 0); double upper = strtod(argv[6], 0); int width = strtol(argv[7], 0, 10); int height = strtol(argv[8], 0, 10); /* allocate memory for image */ int *image = (int *)malloc(width * height * sizeof(int)); assert(image); /* initialize thread data */ thread_data_t thread_data = { .width = width, .height = height, .iters = iters, .left = left, .right = right, .lower = lower, .upper = upper, .image = image, .next_row = 0}; pthread_mutex_init(&thread_data.mutex, NULL); /* create threads */ int num_threads = num_cpus; // Use the number of available CPUs pthread_t threads[num_threads]; for (int i = 0; i < num_threads; ++i) { pthread_create(&threads[i], NULL, compute_mandelbrot, &thread_data); } /* wait for threads to finish */ for (int i = 0; i < num_threads; ++i) { pthread_join(threads[i], NULL); } /* cleanup */ pthread_mutex_destroy(&thread_data.mutex); /* draw and cleanup */ write_png(filename, iters, width, height, image); free(image); return 0; }
Editor is loading...
Leave a Comment