Untitled
unknown
plain_text
3 years ago
3.0 kB
4
Indexable
void conv_filter_avx(int imgHeight, int imgWidth, int imgWidthF, short *filter, unsigned char *imgSrcExt, unsigned char *imgDst) { // Generate the vector-coefficients __m256i filter_laplace[25]; for (int i = 0; i < 25; i++) { filter_laplace[i] = _mm256_set1_epi16(filter[i]); } #if USE_OMP==1 #pragma omp parallel for #else #endif for (int row=0; row<imgHeight; row++) { int wr_base = row*imgWidth*3; int rd_base = row*imgWidthF*3; // step through all components in a line for (int col=0; col<(imgWidth*3); col=col+16) { __m256i fval; fval = _mm256_set1_epi16(0); int rd_offset = 0; for (int fy=0; fy<FILTER_H; fy++) { #if LOOP_UNROLL==0 for (int fx=0; fx<FILTER_W; fx++) #endif { int pixel_src = rd_base + rd_offset; __m128i load_data; __m256i conv_res, mul_res; #if LOOP_UNROLL==0 load_data = _mm_lddqu_si128((__m128i *)(imgSrcExt + pixel_src)); conv_res = _mm256_cvtepu8_epi16(load_data); mul_res = _mm256_mullo_epi16(conv_res, filter_laplace[fy * 5 + fx]); fval = _mm256_add_epi16(fval, mul_res); rd_offset = rd_offset + 3; #else load_data = _mm_lddqu_si128((__m128i *)(imgSrcExt + pixel_src + 0)); conv_res = _mm256_cvtepu8_epi16(load_data); mul_res = _mm256_mullo_epi16(conv_res, filter_laplace[fy * 5 + 0]); fval = _mm256_add_epi16(fval, mul_res); load_data = _mm_lddqu_si128((__m128i *)(imgSrcExt + pixel_src + 3)); conv_res = _mm256_cvtepu8_epi16(load_data); mul_res = _mm256_mullo_epi16(conv_res, filter_laplace[fy * 5 + 1]); fval = _mm256_add_epi16(fval, mul_res); load_data = _mm_lddqu_si128((__m128i *)(imgSrcExt + pixel_src + 6)); conv_res = _mm256_cvtepu8_epi16(load_data); mul_res = _mm256_mullo_epi16(conv_res, filter_laplace[fy * 5 + 2]); fval = _mm256_add_epi16(fval, mul_res); load_data = _mm_lddqu_si128((__m128i *)(imgSrcExt + pixel_src + 9)); conv_res = _mm256_cvtepu8_epi16(load_data); mul_res = _mm256_mullo_epi16(conv_res, filter_laplace[fy * 5 + 3]); fval = _mm256_add_epi16(fval, mul_res); load_data = _mm_lddqu_si128((__m128i *)(imgSrcExt + pixel_src + 12)); conv_res = _mm256_cvtepu8_epi16(load_data); mul_res = _mm256_mullo_epi16(conv_res, filter_laplace[fy * 5 + 4]); fval = _mm256_add_epi16(fval, mul_res); #endif } #if LOOP_UNROLL==0 rd_offset = rd_offset - 5*3 + imgWidthF*3; #else rd_offset = rd_offset + imgWidthF * 3; #endif } __m256i sat_val; sat_val = _mm256_set1_epi16(255); fval = _mm256_min_epi16(fval, sat_val); sat_val = _mm256_set1_epi16(0); fval = _mm256_max_epi16(fval, sat_val); __m256i *res_vec_ptr; res_vec_ptr = &fval; short *res_short_ptr; res_short_ptr = (short*)(res_vec_ptr); for (int i = 0; i < 16; i++) { *(imgDst + wr_base + i) = *(res_short_ptr + i); } wr_base = wr_base + 16; rd_base = rd_base + 16; } } }
Editor is loading...