/*
 * Paste metadata (not part of the program):
 * Untitled, unknown, plain_text, 3 years ago, 3.0 kB, 9, Indexable
 */
/*
 * Convolve an image of interleaved 3-byte pixels with a 5x5 kernel using
 * AVX2 integer arithmetic.
 *
 * Each step handles 16 consecutive component bytes of one output row: for
 * every kernel tap, 16 source bytes are loaded, zero-extended to 16-bit
 * lanes, multiplied by the broadcast coefficient and accumulated. The sum is
 * clamped to [0, 255] and written to the destination as bytes.
 *
 * Parameters:
 *   imgHeight  number of output rows.
 *   imgWidth   output width in pixels; a destination row is imgWidth*3 bytes.
 *   imgWidthF  source width in pixels; a source row is imgWidthF*3 bytes.
 *   filter     25 kernel coefficients, row-major (tap (fy, fx) is at fy*5+fx).
 *   imgSrcExt  source image. Output row r, byte c reads the source starting
 *              at row r, byte c (presumably the source is border-extended so
 *              that the kernel is centred -- confirm against the caller).
 *   imgDst     destination image, imgHeight * imgWidth * 3 bytes.
 *
 * Notes:
 *   - Products and sums are kept in 16-bit lanes and wrap on overflow.
 *   - The coefficient table and tap indexing are hard-wired to a 5x5 kernel;
 *     FILTER_W and FILTER_H are expected to both be 5.
 *   - Every tap loads a full 16 bytes from the source, also for the last
 *     (possibly partial) chunk of a row. The caller must make sure those
 *     bytes are readable.
 *   - Only bytes belonging to the current destination row are written, so
 *     imgWidth*3 does not have to be a multiple of 16.
 */
void conv_filter_avx(int imgHeight, int imgWidth, int imgWidthF,
                     short *filter, unsigned char *imgSrcExt, unsigned char *imgDst)
{
    // Broadcast each kernel coefficient into all sixteen 16-bit lanes.
    __m256i filter_laplace[25];
    for (int i = 0; i < 25; i++)
    {
        filter_laplace[i] = _mm256_set1_epi16(filter[i]);
    }

    // Number of component bytes in one destination row.
    const int row_bytes = imgWidth * 3;

#if USE_OMP==1
#pragma omp parallel for
#endif
    for (int row = 0; row < imgHeight; row++)
    {
        int wr_base = row * imgWidth * 3;
        int rd_base = row * imgWidthF * 3;

        // Step through all components of the row, 16 bytes at a time.
        for (int col = 0; col < row_bytes; col = col + 16)
        {
            __m256i fval = _mm256_set1_epi16(0);
            int rd_offset = 0;

            for (int fy = 0; fy < FILTER_H; fy++)
            {
#if LOOP_UNROLL==0
                for (int fx = 0; fx < FILTER_W; fx++)
#endif
                {
                    int pixel_src = rd_base + rd_offset;
                    __m128i load_data;
                    __m256i conv_res, mul_res;
#if LOOP_UNROLL==0
                    load_data = _mm_lddqu_si128((__m128i *)(imgSrcExt + pixel_src));
                    conv_res = _mm256_cvtepu8_epi16(load_data);
                    mul_res = _mm256_mullo_epi16(conv_res, filter_laplace[fy * 5 + fx]);
                    fval = _mm256_add_epi16(fval, mul_res);
                    // Next tap: same component of the pixel to the right.
                    rd_offset = rd_offset + 3;
#else
                    // Five taps of this kernel row, 3 bytes (one pixel) apart.
                    load_data = _mm_lddqu_si128((__m128i *)(imgSrcExt + pixel_src + 0));
                    conv_res = _mm256_cvtepu8_epi16(load_data);
                    mul_res = _mm256_mullo_epi16(conv_res, filter_laplace[fy * 5 + 0]);
                    fval = _mm256_add_epi16(fval, mul_res);

                    load_data = _mm_lddqu_si128((__m128i *)(imgSrcExt + pixel_src + 3));
                    conv_res = _mm256_cvtepu8_epi16(load_data);
                    mul_res = _mm256_mullo_epi16(conv_res, filter_laplace[fy * 5 + 1]);
                    fval = _mm256_add_epi16(fval, mul_res);

                    load_data = _mm_lddqu_si128((__m128i *)(imgSrcExt + pixel_src + 6));
                    conv_res = _mm256_cvtepu8_epi16(load_data);
                    mul_res = _mm256_mullo_epi16(conv_res, filter_laplace[fy * 5 + 2]);
                    fval = _mm256_add_epi16(fval, mul_res);

                    load_data = _mm_lddqu_si128((__m128i *)(imgSrcExt + pixel_src + 9));
                    conv_res = _mm256_cvtepu8_epi16(load_data);
                    mul_res = _mm256_mullo_epi16(conv_res, filter_laplace[fy * 5 + 3]);
                    fval = _mm256_add_epi16(fval, mul_res);

                    load_data = _mm_lddqu_si128((__m128i *)(imgSrcExt + pixel_src + 12));
                    conv_res = _mm256_cvtepu8_epi16(load_data);
                    mul_res = _mm256_mullo_epi16(conv_res, filter_laplace[fy * 5 + 4]);
                    fval = _mm256_add_epi16(fval, mul_res);
#endif
                }
#if LOOP_UNROLL==0
                // Rewind the five taps just visited and move one source row down.
                rd_offset = rd_offset - 5*3 + imgWidthF*3;
#else
                // Move one source row down.
                rd_offset = rd_offset + imgWidthF * 3;
#endif
            }

            // Clamp every 16-bit lane to the byte range [0, 255].
            fval = _mm256_min_epi16(fval, _mm256_set1_epi16(255));
            fval = _mm256_max_epi16(fval, _mm256_set1_epi16(0));

            // The last chunk of a row may hold fewer than 16 valid bytes when
            // imgWidth*3 is not a multiple of 16. Storing only the valid ones
            // keeps the write inside this row: no spill into the next row
            // (which another thread may own under OpenMP) and no write past
            // the end of imgDst on the last row.
            int remaining = row_bytes - col;
            int store_count = (remaining < 16) ? remaining : 16;

            const short *res_short_ptr = (const short *)(&fval);
            for (int i = 0; i < store_count; i++)
            {
                imgDst[wr_base + i] = (unsigned char)res_short_ptr[i];
            }

            wr_base = wr_base + 16;
            rd_base = rd_base + 16;
        }
    }
}