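/*
 * Assembler Language HW7
 *
 * 2023/01/06 PM: 2:45
 *
 * Paste-site metadata retained from the original upload (pastecode.io):
 *   mail@pastecode.io avatar
 *   unknown
 *   c_cpp
 *   2 years ago
 *   2.6 kB
 *   2
 *   Indexable
 *   Never
 */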
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
#include <emmintrin.h>

#define height 2    // 17280
#define width  2    // 30720

/* Number of 32-bit pixels held by one 128-bit SSE2 register. */
enum { PIXELS_PER_VECTOR = 4 };

/*
 * Converts four packed 32-bit pixels to grayscale.
 *
 * Each pixel is treated as four bytes: bits 31..24 are passed through
 * unchanged, bits 23..16 are weighted by 0.299, bits 15..8 by 0.587 and
 * bits 7..0 by 0.114.  The weighted sum is rounded to an integer by
 * _mm_cvtps_epi32 and replicated into the three low bytes of the result.
 *
 * pixels: four pixels, one per 32-bit lane.
 * Returns the four converted pixels in the same lane order.
 */
static __m128i grayscale_pixels(__m128i pixels)
{
    /* Broadcast constants to every lane so all four pixels are treated alike. */
    const __m128i byte_mask = _mm_set1_epi32(0xff);
    const __m128 weight_hi  = _mm_set1_ps(0.299f);
    const __m128 weight_mid = _mm_set1_ps(0.587f);
    const __m128 weight_lo  = _mm_set1_ps(0.114f);

    /* Logical shift: the top byte needs no mask, lower bytes do. */
    const __m128i top = _mm_srli_epi32(pixels, 24);
    const __m128 hi   = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(pixels, 16), byte_mask));
    const __m128 mid  = _mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(pixels, 8), byte_mask));
    const __m128 lo   = _mm_cvtepi32_ps(_mm_and_si128(pixels, byte_mask));

    const __m128 weighted = _mm_add_ps(
        _mm_mul_ps(lo, weight_lo),
        _mm_add_ps(_mm_mul_ps(hi, weight_hi), _mm_mul_ps(mid, weight_mid)));
    const __m128i gray = _mm_cvtps_epi32(weighted);

    /* The weights sum to 1.0, so gray fits in one byte and the fields never overlap. */
    return _mm_or_si128(
        _mm_or_si128(_mm_slli_epi32(top, 24), _mm_slli_epi32(gray, 16)),
        _mm_or_si128(_mm_slli_epi32(gray, 8), gray));
}

/*
 * Prints `prefix` followed by the first `count` values of `lanes`,
 * separated by single spaces and terminated by a newline.
 */
static void print_lanes(const char *prefix, const int *lanes, size_t count)
{
    fputs(prefix, stdout);
    for (size_t i = 0; i < count; i++) {
        printf("%s%d", i == 0 ? "" : " ", lanes[i]);
    }
    putchar('\n');
}

/*
 * Fills a height x width bitmap with rand() values, converts it to
 * grayscale four pixels at a time with SSE2, and prints for every group
 * the top byte of each pixel ("shift 24: ...") followed by the converted
 * pixels.
 *
 * Returns 0 on success, EXIT_FAILURE if the bitmap cannot be allocated.
 */
int main(void)
{
    const size_t pixel_count = (size_t)(height) * (size_t)(width);

    /*
     * Round the buffer up to whole vectors: this keeps the size a multiple of
     * the 16-byte alignment (required by C11 aligned_alloc) and lets the tail
     * be processed with the same aligned vector load as every other group.
     */
    const size_t padded_count =
        (pixel_count + PIXELS_PER_VECTOR - 1) / PIXELS_PER_VECTOR * PIXELS_PER_VECTOR;

    int *pBitmap = aligned_alloc(16, padded_count * sizeof *pBitmap);
    if (pBitmap == NULL) {
        fprintf(stderr, "failed to allocate %zu bytes for the bitmap\n",
                padded_count * sizeof *pBitmap);
        return EXIT_FAILURE;
    }

    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++) {
            pBitmap[col + row * width] = rand();
        }
    }
    /* Padding lanes are loaded by the last vector, so give them a defined value. */
    for (size_t i = pixel_count; i < padded_count; i++) {
        pBitmap[i] = 0;
    }

    for (size_t base = 0; base < pixel_count; base += PIXELS_PER_VECTOR) {
        const size_t remaining = pixel_count - base;
        const size_t valid = remaining < PIXELS_PER_VECTOR ? remaining : PIXELS_PER_VECTOR;

        /* Load this group's pixels; nothing is carried over between groups. */
        const __m128i pixels = _mm_load_si128((const __m128i *)(pBitmap + base));

        /* Spill all 128 bits to an int array before printing individual lanes. */
        int lanes[PIXELS_PER_VECTOR];
        _mm_storeu_si128((__m128i *)lanes, _mm_srli_epi32(pixels, 24));
        print_lanes("shift 24: ", lanes, valid);

        _mm_storeu_si128((__m128i *)lanes, grayscale_pixels(pixels));
        print_lanes("", lanes, valid);
    }

    free(pBitmap);
    return 0;
}