PTX
user_5573880
ptx (NVIDIA GPU virtual ISA; the paste-site tag "assembly_x86" was incorrect)
a year ago
8.8 kB
7
Indexable
//
// NVIDIA PTX module, ISA version 7.6, targeting sm_75 (Turing), 64-bit addresses.
//
// Contains two kernels (names demangle from the Itanium-mangled entry symbols):
//   1) wmma_example(__half*, __half*, float*, int, int, int, float, float)
//      - a tensor-core matrix-multiply kernel using 16x16x16 WMMA fragments
//        (f16 inputs, f32 accumulate) with an alpha/beta epilogue:
//        d = param_6 * (A*B) + param_7 * C  (per accumulator element).
//      - NOTE(review): structure matches NVIDIA's well-known "wmma_example"
//        GEMM sample (params presumably a, b, c, M, N, K, alpha, beta) —
//        confirm against the originating CUDA source.
//   2) convertFp32ToFp16(__half*, float*, int)
//      - elementwise float -> half conversion with round-to-nearest-even.
//
.version 7.6
.target sm_75
.address_size 64

.visible .entry _Z12wmma_exampleP6__halfS0_Pfiiiff(
	.param .u64 _Z12wmma_exampleP6__halfS0_Pfiiiff_param_0,   // A: const __half* (loaded col-major, stride = param_3)
	.param .u64 _Z12wmma_exampleP6__halfS0_Pfiiiff_param_1,   // B: const __half* (loaded col-major, stride = param_5)
	.param .u64 _Z12wmma_exampleP6__halfS0_Pfiiiff_param_2,   // C/D: float* (loaded & stored col-major, stride = param_3)
	.param .u32 _Z12wmma_exampleP6__halfS0_Pfiiiff_param_3,   // int: bound for x tile origin; also A/C leading dimension
	.param .u32 _Z12wmma_exampleP6__halfS0_Pfiiiff_param_4,   // int: bound for y tile origin
	.param .u32 _Z12wmma_exampleP6__halfS0_Pfiiiff_param_5,   // int: reduction extent (K); also B leading dimension
	.param .f32 _Z12wmma_exampleP6__halfS0_Pfiiiff_param_6,   // float: scale on the A*B accumulator ("alpha")
	.param .f32 _Z12wmma_exampleP6__halfS0_Pfiiiff_param_7    // float: scale on the loaded C fragment ("beta")
)
{
	.reg .pred %p<15>;
	.reg .f32 %f<254>;
	.reg .b32 %r<147>;
	.reg .b64 %rd<33>;

	// ---- Parameter loads and tile-origin computation -------------------
	ld.param.u64 %rd7, [_Z12wmma_exampleP6__halfS0_Pfiiiff_param_0];
	ld.param.u64 %rd8, [_Z12wmma_exampleP6__halfS0_Pfiiiff_param_1];
	ld.param.u32 %r18, [_Z12wmma_exampleP6__halfS0_Pfiiiff_param_3];
	ld.param.u32 %r20, [_Z12wmma_exampleP6__halfS0_Pfiiiff_param_4];
	ld.param.u32 %r19, [_Z12wmma_exampleP6__halfS0_Pfiiiff_param_5];
	cvta.to.global.u64 %rd1, %rd7;                  // A -> global address space
	mov.u32 %r21, %ntid.x;
	mov.u32 %r22, %ctaid.x;
	mov.u32 %r23, %tid.x;
	mad.lo.s32 %r24, %r22, %r21, %r23;              // global thread id in x
	mov.u32 %r25, %ntid.y;
	mov.u32 %r26, %ctaid.y;
	mov.u32 %r27, WARP_SZ;
	div.u32 %r28, %r24, %r27;                       // warpM = threadIdX / warpSize (one warp per tile in x)
	mov.u32 %r29, %tid.y;
	mad.lo.s32 %r30, %r26, %r25, %r29;              // warpN = global thread id in y
	cvta.to.global.u64 %rd2, %rd8;                  // B -> global address space
	shl.b32 %r1, %r28, 4;                           // x tile origin = warpM * 16
	shl.b32 %r2, %r30, 4;                           // y tile origin = warpN * 16
	setp.lt.s32 %p2, %r1, %r18;
	setp.lt.s32 %p3, %r2, %r20;
	and.pred %p1, %p3, %p2;                         // %p1: tile origin in bounds (guards every WMMA op below)
	setp.lt.s32 %p4, %r19, 1;                       // K < 1 -> skip the whole reduction loop
	// Zero the 8 f32 accumulator fragment registers (fill_fragment(acc, 0)).
	mov.f32 %f189, 0f00000000;
	mov.f32 %f188, %f189;
	mov.f32 %f187, %f189;
	mov.f32 %f186, %f189;
	mov.f32 %f185, %f189;
	mov.f32 %f184, %f189;
	mov.f32 %f183, %f189;
	mov.f32 %f182, %f189;
	@%p4 bra $L__BB0_17;

	// ---- Reduction-loop setup ------------------------------------------
	// Trip count over K in steps of 16: %r4 = ((K-1) >> 4) + 1.
	// %r5 = %r4 & 3 is the remainder after 4x unrolling; the unrolled body
	// below advances k by 64 per iteration.
	mul.lo.s32 %r3, %r2, %r19;                      // (y tile origin) * K: base element offset into B
	add.s32 %r32, %r19, -1;
	shr.u32 %r33, %r32, 4;
	add.s32 %r4, %r33, 1;
	and.b32 %r5, %r4, 3;
	setp.lt.u32 %p5, %r32, 48;                      // fewer than 4 k-steps -> remainder loop only
	mov.u32 %r144, 0;                               // %r144 = k (current reduction offset)
	mov.f32 %f182, 0f00000000;                      // (accumulators re-zeroed on this path as well)
	mov.f32 %f183, %f182;
	mov.f32 %f184, %f182;
	mov.f32 %f185, %f182;
	mov.f32 %f186, %f182;
	mov.f32 %f187, %f182;
	mov.f32 %f188, %f182;
	mov.f32 %f189, %f182;
	@%p5 bra $L__BB0_12;
	sub.s32 %r143, %r4, %r5;                        // %r143 = number of k-steps handled by the 4x-unrolled loop
	not.pred %p6, %p1;                              // %p6: out of bounds -> skip each load/mma group

// ---- Main loop, unrolled 4x: each group is load.a / load.b / mma ------
// A fragment address: A + ((k+off)*param_3 + xOrigin) * sizeof(f16)
// B fragment address: B + ((k+off) + yOrigin*param_5) * sizeof(f16)
// Accumulators {%f189..%f182} are both input and output of every mma.
$L__BB0_3:                                          // group 0: k + 0
	@%p6 bra $L__BB0_5;
	mad.lo.s32 %r35, %r144, %r18, %r1;
	mul.wide.s32 %rd9, %r35, 2;
	add.s64 %rd10, %rd1, %rd9;
	wmma.load.a.sync.aligned.col.m16n16k16.global.f16 {%r36, %r37, %r38, %r39, %r40, %r41, %r42, %r43}, [%rd10], %r18;
	add.s32 %r44, %r144, %r3;
	mul.wide.s32 %rd11, %r44, 2;
	add.s64 %rd12, %rd2, %rd11;
	wmma.load.b.sync.aligned.col.m16n16k16.global.f16 {%r45, %r46, %r47, %r48, %r49, %r50, %r51, %r52}, [%rd12], %r19;
	wmma.mma.sync.aligned.col.col.m16n16k16.f32.f32 {%f189, %f188, %f187, %f186, %f185, %f184, %f183, %f182}, {%r36, %r37, %r38, %r39, %r40, %r41, %r42, %r43}, {%r45, %r46, %r47, %r48, %r49, %r50, %r51, %r52}, {%f189, %f188, %f187, %f186, %f185, %f184, %f183, %f182};
$L__BB0_5:                                          // group 1: k + 16
	@%p6 bra $L__BB0_7;
	add.s32 %r53, %r144, 16;
	mad.lo.s32 %r54, %r53, %r18, %r1;
	mul.wide.s32 %rd13, %r54, 2;
	add.s64 %rd14, %rd1, %rd13;
	wmma.load.a.sync.aligned.col.m16n16k16.global.f16 {%r55, %r56, %r57, %r58, %r59, %r60, %r61, %r62}, [%rd14], %r18;
	add.s32 %r63, %r53, %r3;
	mul.wide.s32 %rd15, %r63, 2;
	add.s64 %rd16, %rd2, %rd15;
	wmma.load.b.sync.aligned.col.m16n16k16.global.f16 {%r64, %r65, %r66, %r67, %r68, %r69, %r70, %r71}, [%rd16], %r19;
	wmma.mma.sync.aligned.col.col.m16n16k16.f32.f32 {%f189, %f188, %f187, %f186, %f185, %f184, %f183, %f182}, {%r55, %r56, %r57, %r58, %r59, %r60, %r61, %r62}, {%r64, %r65, %r66, %r67, %r68, %r69, %r70, %r71}, {%f189, %f188, %f187, %f186, %f185, %f184, %f183, %f182};
$L__BB0_7:                                          // group 2: k + 32
	@%p6 bra $L__BB0_9;
	add.s32 %r72, %r144, 32;
	mad.lo.s32 %r73, %r72, %r18, %r1;
	mul.wide.s32 %rd17, %r73, 2;
	add.s64 %rd18, %rd1, %rd17;
	wmma.load.a.sync.aligned.col.m16n16k16.global.f16 {%r74, %r75, %r76, %r77, %r78, %r79, %r80, %r81}, [%rd18], %r18;
	add.s32 %r82, %r72, %r3;
	mul.wide.s32 %rd19, %r82, 2;
	add.s64 %rd20, %rd2, %rd19;
	wmma.load.b.sync.aligned.col.m16n16k16.global.f16 {%r83, %r84, %r85, %r86, %r87, %r88, %r89, %r90}, [%rd20], %r19;
	wmma.mma.sync.aligned.col.col.m16n16k16.f32.f32 {%f189, %f188, %f187, %f186, %f185, %f184, %f183, %f182}, {%r74, %r75, %r76, %r77, %r78, %r79, %r80, %r81}, {%r83, %r84, %r85, %r86, %r87, %r88, %r89, %r90}, {%f189, %f188, %f187, %f186, %f185, %f184, %f183, %f182};
$L__BB0_9:                                          // group 3: k + 48
	@%p6 bra $L__BB0_11;
	add.s32 %r91, %r144, 48;
	mad.lo.s32 %r92, %r91, %r18, %r1;
	mul.wide.s32 %rd21, %r92, 2;
	add.s64 %rd22, %rd1, %rd21;
	wmma.load.a.sync.aligned.col.m16n16k16.global.f16 {%r93, %r94, %r95, %r96, %r97, %r98, %r99, %r100}, [%rd22], %r18;
	add.s32 %r101, %r91, %r3;
	mul.wide.s32 %rd23, %r101, 2;
	add.s64 %rd24, %rd2, %rd23;
	wmma.load.b.sync.aligned.col.m16n16k16.global.f16 {%r102, %r103, %r104, %r105, %r106, %r107, %r108, %r109}, [%rd24], %r19;
	wmma.mma.sync.aligned.col.col.m16n16k16.f32.f32 {%f189, %f188, %f187, %f186, %f185, %f184, %f183, %f182}, {%r93, %r94, %r95, %r96, %r97, %r98, %r99, %r100}, {%r102, %r103, %r104, %r105, %r106, %r107, %r108, %r109}, {%f189, %f188, %f187, %f186, %f185, %f184, %f183, %f182};
$L__BB0_11:                                         // advance k by 4 tiles (64) and count down
	add.s32 %r144, %r144, 64;
	add.s32 %r143, %r143, -4;
	setp.ne.s32 %p10, %r143, 0;
	@%p10 bra $L__BB0_3;

// ---- Remainder loop setup: (((K-1)>>4)+1) & 3 leftover k-steps --------
$L__BB0_12:
	add.s32 %r137, %r19, -1;                        // recompute remainder count (same formula as above)
	shr.u32 %r136, %r137, 4;
	add.s32 %r135, %r136, 1;
	and.b32 %r134, %r135, 3;
	setp.eq.s32 %p11, %r134, 0;
	@%p11 bra $L__BB0_17;                           // no remainder -> straight to epilogue
	add.s32 %r141, %r19, -1;
	shr.u32 %r140, %r141, 4;
	add.s32 %r139, %r140, 1;
	and.b32 %r146, %r139, 3;                        // %r146 = remaining trip count
	add.s32 %r110, %r144, %r3;
	mul.wide.s32 %rd25, %r110, 2;
	add.s64 %rd32, %rd2, %rd25;                     // %rd32 = running B fragment pointer (advances 32 B = 16 halves)
	mul.lo.s32 %r145, %r144, %r18;                  // %r145 = running A row offset = k * param_3
	shl.b32 %r13, %r18, 4;                          // per-iteration A offset step = 16 * param_3
	not.pred %p12, %p1;
$L__BB0_14:
	.pragma "nounroll";                             // compiler directive: keep the remainder loop rolled
	@%p12 bra $L__BB0_16;
	add.s32 %r111, %r145, %r1;
	mul.wide.s32 %rd26, %r111, 2;
	add.s64 %rd27, %rd1, %rd26;
	wmma.load.a.sync.aligned.col.m16n16k16.global.f16 {%r112, %r113, %r114, %r115, %r116, %r117, %r118, %r119}, [%rd27], %r18;
	wmma.load.b.sync.aligned.col.m16n16k16.global.f16 {%r120, %r121, %r122, %r123, %r124, %r125, %r126, %r127}, [%rd32], %r19;
	wmma.mma.sync.aligned.col.col.m16n16k16.f32.f32 {%f189, %f188, %f187, %f186, %f185, %f184, %f183, %f182}, {%r112, %r113, %r114, %r115, %r116, %r117, %r118, %r119}, {%r120, %r121, %r122, %r123, %r124, %r125, %r126, %r127}, {%f189, %f188, %f187, %f186, %f185, %f184, %f183, %f182};
$L__BB0_16:
	add.s64 %rd32, %rd32, 32;
	add.s32 %r145, %r145, %r13;
	add.s32 %r146, %r146, -1;
	setp.ne.s32 %p13, %r146, 0;
	@%p13 bra $L__BB0_14;

// ---- Epilogue: d = param_6 * acc + param_7 * c, guarded by bounds -----
$L__BB0_17:
	not.pred %p14, %p1;
	@%p14 bra $L__BB0_19;
	ld.param.f32 %f173, [_Z12wmma_exampleP6__halfS0_Pfiiiff_param_6];
	ld.param.f32 %f172, [_Z12wmma_exampleP6__halfS0_Pfiiiff_param_7];
	ld.param.u64 %rd31, [_Z12wmma_exampleP6__halfS0_Pfiiiff_param_2];
	mov.u32 %r133, %tid.y;                          // warpN recomputed here (rematerialized, not kept live)
	mov.u32 %r132, %ntid.y;
	mov.u32 %r131, %ctaid.y;
	mad.lo.s32 %r130, %r131, %r132, %r133;
	shl.b32 %r129, %r130, 4;
	cvta.to.global.u64 %rd28, %rd31;
	mad.lo.s32 %r128, %r129, %r18, %r1;             // C element offset = yOrigin*param_3 + xOrigin
	mul.wide.s32 %rd29, %r128, 4;                   // * sizeof(float)
	add.s64 %rd30, %rd28, %rd29;
	wmma.load.c.sync.aligned.col.m16n16k16.global.f32 {%f148, %f149, %f150, %f151, %f152, %f153, %f154, %f155}, [%rd30], %r18;
	// Per fragment element: d_i = acc_i * param_6 + c_i * param_7
	mul.f32 %f156, %f148, %f172;
	fma.rn.f32 %f157, %f189, %f173, %f156;
	mul.f32 %f158, %f149, %f172;
	fma.rn.f32 %f159, %f188, %f173, %f158;
	mul.f32 %f160, %f150, %f172;
	fma.rn.f32 %f161, %f187, %f173, %f160;
	mul.f32 %f162, %f151, %f172;
	fma.rn.f32 %f163, %f186, %f173, %f162;
	mul.f32 %f164, %f152, %f172;
	fma.rn.f32 %f165, %f185, %f173, %f164;
	mul.f32 %f166, %f153, %f172;
	fma.rn.f32 %f167, %f184, %f173, %f166;
	mul.f32 %f168, %f154, %f172;
	fma.rn.f32 %f169, %f183, %f173, %f168;
	mul.f32 %f170, %f155, %f172;
	fma.rn.f32 %f171, %f182, %f173, %f170;
	// Store result back to the same C tile address, same layout/stride.
	wmma.store.d.sync.aligned.col.m16n16k16.global.f32 [%rd30], {%f157, %f159, %f161, %f163, %f165, %f167, %f169, %f171}, %r18;
$L__BB0_19:
	ret;
}

// convertFp32ToFp16(__half* out, float* in, int n):
// one thread per element; out[i] = (half)in[i], round-to-nearest-even.
.visible .entry _Z17convertFp32ToFp16P6__halfPfi(
	.param .u64 _Z17convertFp32ToFp16P6__halfPfi_param_0,   // __half* destination
	.param .u64 _Z17convertFp32ToFp16P6__halfPfi_param_1,   // float*  source
	.param .u32 _Z17convertFp32ToFp16P6__halfPfi_param_2    // element count
)
{
	.reg .pred %p<2>;
	.reg .b16 %rs<2>;
	.reg .f32 %f<2>;
	.reg .b32 %r<6>;
	.reg .b64 %rd<9>;

	ld.param.u64 %rd1, [_Z17convertFp32ToFp16P6__halfPfi_param_0];
	ld.param.u64 %rd2, [_Z17convertFp32ToFp16P6__halfPfi_param_1];
	ld.param.u32 %r2, [_Z17convertFp32ToFp16P6__halfPfi_param_2];
	mov.u32 %r3, %ntid.x;
	mov.u32 %r4, %ctaid.x;
	mov.u32 %r5, %tid.x;
	mad.lo.s32 %r1, %r3, %r4, %r5;                  // i = blockDim.x * blockIdx.x + threadIdx.x
	setp.ge.s32 %p1, %r1, %r2;                      // i >= n -> nothing to do
	@%p1 bra $L__BB1_2;
	cvta.to.global.u64 %rd3, %rd2;
	mul.wide.s32 %rd4, %r1, 4;                      // source offset: i * sizeof(float)
	add.s64 %rd5, %rd3, %rd4;
	ld.global.f32 %f1, [%rd5];
	{ cvt.rn.f16.f32 %rs1, %f1;}                    // f32 -> f16, round-to-nearest-even (braces suggest inline-asm origin — unverified)
	cvta.to.global.u64 %rd6, %rd1;
	mul.wide.s32 %rd7, %r1, 2;                      // destination offset: i * sizeof(__half)
	add.s64 %rd8, %rd6, %rd7;
	st.global.u16 [%rd8], %rs1;
$L__BB1_2:
	ret;
}
Editor is loading...
Leave a Comment