// 0-hero's picture
// Add files using upload-large-folder tool
// 71c6277 verified
//
// Generated by LLVM NVPTX Back-End
//
// Module-level directives: PTX ISA version 8.2, compiled for the sm_89
// target, using 64-bit addresses.
.version 8.2
.target sm_89
.address_size 64
// .globl triton__0d1d2de
// Prototype of the external device function __assertfail (defined outside
// this module). It takes five parameters and returns nothing:
//   param_0 (.b64), param_1 (.b64), param_2 (.b32), param_3 (.b64), param_4 (.b64)
// The only call site in this file passes, in order, the addresses of
// assertMessage_0 and assertFile_0, the constant 883, the address of
// assertFunc_0, and the constant 1.
// NOTE(review): this matches the CUDA device-runtime signature
// (message, file, line, function, charSize) -- confirm against the CUDA headers.
.extern .func __assertfail
(
.param .b64 __assertfail_param_0,
.param .b64 __assertfail_param_1,
.param .b32 __assertfail_param_2,
.param .b64 __assertfail_param_3,
.param .b64 __assertfail_param_4
)
;
// ASCII strings whose addresses are handed to __assertfail when the index
// bounds check in the kernel fails. Each array ends with an explicit 0 byte:
// the call passes a character size of 1 and no length, so the consumer has to
// find the end of each string itself and must not run past the array.
//
// assertFunc_0    = "_call_with_frames_removed"
.global .align 1 .b8 assertFunc_0[26] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100, 0};
// assertFile_0    = "<frozen importlib._bootstrap_external>"
.global .align 1 .b8 assertFile_0[39] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62, 0};
// assertMessage_0 = "index out of bounds: 0 <= tmp7 < 50257"
.global .align 1 .b8 assertMessage_0[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 55, 32, 60, 32, 53, 48, 50, 53, 55, 0};
// Shared-memory byte array declared extern with no size, so its size is
// supplied from outside this module. The kernel below uses it as scratch
// space; the highest byte it touches is offset 2047 (an 8-byte store at
// offset 8*254 + 8).
.extern .shared .align 1 .b8 global_smem[];
// Kernel triton__0d1d2de
//
// What the code below does, per thread block (at most 128 threads, see .maxntid):
//   1. Reads 512 consecutive signed 64-bit values from param_0, starting at
//      element ctaid.x * 512. Each thread loads 4 of them.
//   2. Maps every value v to an index: v == -1 becomes 0; a remaining negative
//      value gets 50257 added.
//   3. Checks 0 <= index < 50257 (single unsigned compare) and calls
//      __assertfail if any flag this thread reads back indicates failure.
//   4. Stores the 32-bit pattern 0xBF800000 (-1.0 as IEEE-754 binary32) to
//      param_1 + row * 201028 + index * 4, where row is the element's position
//      in the param_0 array and 201028 = 4 * 50257.
//
// Parameters:
//   param_0  base address of the 64-bit input values (read with ld.global).
//   param_1  base address for the 32-bit stores (written with st.global).
//   param_2  declared but never loaded in this body, so no element-count
//            masking is applied: all loads/stores are predicated on %p20,
//            which is set to constant true.
//            NOTE(review): presumably the element count -- confirm with the
//            launcher that the grid covers exactly a multiple of 512 elements.
.visible .entry triton__0d1d2de(
.param .u64 triton__0d1d2de_param_0,
.param .u64 triton__0d1d2de_param_1,
.param .u64 triton__0d1d2de_param_2
)
.maxntid 128, 1, 1
{
.reg .pred %p<24>;
.reg .b16 %rs<21>;
.reg .b32 %r<21>;
.reg .b64 %rd<58>;
.loc 1 18 0
$L__func_begin0:
.loc 1 18 0
// %rd9 = output base (param_1), %rd16 = input base (param_0).
ld.param.u64 %rd9, [triton__0d1d2de_param_1];
ld.param.u64 %rd16, [triton__0d1d2de_param_0];
$L__tmp0:
.loc 1 21 36
// %r1 = thread index within the block (0..127).
// %r2 = 2 * %r1         : first block-local element owned by this thread.
// %r5 = %r2 | 1         : second element (%r2 is even, so this is %r2 + 1).
// %r6 = %r2 | 256       : third element (%r2 < 256, so this is %r2 + 256).
mov.u32 %r4, %tid.x;
and.b32 %r1, %r4, 127;
shl.b32 %r2, %r1, 1;
or.b32 %r5, %r2, 1;
or.b32 %r6, %r2, 256;
.loc 1 20 28
mov.u32 %r3, %ctaid.x;
.loc 1 20 46
// %rd1 = ctaid.x * 512 (sign-extended to 64 bits): first element of this block.
mul.wide.s32 %rd1, %r3, 512;
cvt.u64.u32 %rd17, %r2;
cvt.u64.u32 %rd18, %r6;
.loc 1 21 23
// %rd1 is a multiple of 512 and the local offsets are below 512, so the ORs
// here act as additions: %rd2 / %rd3 are global element indices.
or.b64 %rd2, %rd1, %rd17;
or.b64 %rd3, %rd1, %rd18;
.loc 1 24 30
// %rd12 = &in[%rd2] (8 bytes per element); %rd15 = %rd12 + 2048 = &in[%rd2 + 256].
shl.b64 %rd19, %rd2, 3;
add.s64 %rd12, %rd16, %rd19;
add.s64 %rd15, %rd12, 2048;
// %p20 is constant true; it predicates every global load and store below.
mov.pred %p20, -1;
.loc 1 24 35
// Two 128-bit vector loads: {%rd10, %rd11} = in[%rd2], in[%rd2 + 1] and
// {%rd13, %rd14} = in[%rd2 + 256], in[%rd2 + 257]. Destinations are zeroed
// first (irrelevant here because the predicate is always true).
mov.u64 %rd10, 0x0;
mov.u64 %rd11, 0x0;
@%p20 ld.global.v2.b64 { %rd10, %rd11 }, [ %rd12 + 0 ];
mov.u64 %rd13, 0x0;
mov.u64 %rd14, 0x0;
@%p20 ld.global.v2.b64 { %rd13, %rd14 }, [ %rd15 + 0 ];
.loc 1 26 19
// Predicate per value: is it exactly -1?
setp.eq.s64 %p3, %rd14, -1;
setp.eq.s64 %p4, %rd13, -1;
setp.eq.s64 %p5, %rd11, -1;
setp.eq.s64 %p6, %rd10, -1;
.loc 1 28 32
// Replace -1 with 0, keep every other value.
selp.b64 %rd20, 0, %rd10, %p6;
selp.b64 %rd21, 0, %rd11, %p5;
selp.b64 %rd22, 0, %rd13, %p4;
selp.b64 %rd23, 0, %rd14, %p3;
.loc 1 29 18
// Candidate wrapped value: value + 50257.
add.s64 %rd24, %rd23, 50257;
add.s64 %rd25, %rd22, 50257;
add.s64 %rd26, %rd21, 50257;
add.s64 %rd27, %rd20, 50257;
.loc 1 30 18
// Is the (post-replacement) value negative?
setp.lt.s64 %p7, %rd23, 0;
setp.lt.s64 %p8, %rd22, 0;
setp.lt.s64 %p9, %rd21, 0;
setp.lt.s64 %p10, %rd20, 0;
.loc 1 31 32
// Final index: wrapped value if negative, otherwise the value itself.
// %rd7, %rd6, %rd5, %rd4 correspond to in[%rd2], in[%rd2+1], in[%rd2+256],
// in[%rd2+257] respectively.
selp.b64 %rd7, %rd27, %rd20, %p10;
selp.b64 %rd6, %rd26, %rd21, %p9;
selp.b64 %rd5, %rd25, %rd22, %p8;
selp.b64 %rd4, %rd24, %rd23, %p7;
.loc 1 32 36
// In-range flags: an unsigned compare against 50257 covers both
// "index >= 0" and "index < 50257" at once.
setp.lt.u64 %p11, %rd4, 50257;
setp.lt.u64 %p12, %rd5, 50257;
setp.lt.u64 %p13, %rd6, 50257;
setp.lt.u64 %p14, %rd7, 50257;
// Exchange the flags through shared memory. Round 1 handles the first pair:
// each thread writes its two flags as bytes at offsets %r2 and %r2 + 1, then
// reads back the bytes at offsets %r1 and %r1 + 128, i.e. the flags for
// block-local elements tid and tid + 128.
mov.u32 %r7, global_smem;
add.s32 %r8, %r7, %r2;
selp.u16 %rs1, 1, 0, %p14;
st.shared.u8 [%r8], %rs1;
// %rd8 = %r5 widened to 64 bits; used after the assert for address math.
cvt.u64.u32 %rd8, %r5;
selp.u16 %rs2, 1, 0, %p13;
st.shared.u8 [%r8+1], %rs2;
bar.sync 0;
add.s32 %r9, %r7, %r1;
ld.shared.u8 %rs3, [%r9];
ld.shared.u8 %rs4, [%r9+128];
// Barrier before the same shared bytes are overwritten in round 2.
bar.sync 0;
// Round 2: same exchange for the second pair (elements at local offset
// 256 and above); the bytes read back are the flags for elements
// tid + 256 and tid + 384.
selp.u16 %rs5, 1, 0, %p12;
st.shared.u8 [%r8], %rs5;
selp.u16 %rs6, 1, 0, %p11;
st.shared.u8 [%r8+1], %rs6;
bar.sync 0;
ld.shared.u8 %rs7, [%r9];
ld.shared.u8 %rs8, [%r9+128];
// Build a 4-bit failure mask from the four flags read back: a bit is set
// when the corresponding flag byte is 0 (index out of range).
//   bit 2 <- %rs7 == 0, bit 3 <- %rs8 == 0, bit 0 <- %rs4 == 0, bit 1 <- %rs3 == 0
setp.eq.s16 %p15, %rs7, 0;
selp.u16 %rs9, 1, 0, %p15;
shl.b16 %rs10, %rs9, 2;
setp.eq.s16 %p16, %rs8, 0;
selp.u16 %rs11, -1, 0, %p16;
shl.b16 %rs12, %rs11, 3;
or.b16 %rs13, %rs12, %rs10;
setp.eq.s16 %p17, %rs4, 0;
selp.u16 %rs14, 1, 0, %p17;
setp.eq.s16 %p18, %rs3, 0;
selp.u16 %rs15, -1, 0, %p18;
shl.b16 %rs16, %rs15, 1;
or.b16 %rs17, %rs14, %rs16;
and.b16 %rs18, %rs17, 3;
or.b16 %rs19, %rs18, %rs13;
.loc 1 32 51
// Mask == 0 means all four indices are in range: skip the assert call.
and.b16 %rs20, %rs19, 15;
setp.eq.s16 %p19, %rs20, 0;
@%p19 bra $L__BB0_2;
// Failure path: convert the three string addresses to generic addresses and
// call __assertfail(assertMessage_0, assertFile_0, 883, assertFunc_0, 1).
// There is no exit/trap after the call in this file, so if __assertfail
// returns, execution falls through into $L__BB0_2 and the stores still run.
mov.u64 %rd28, assertMessage_0;
cvta.global.u64 %rd29, %rd28;
mov.u64 %rd30, assertFile_0;
cvta.global.u64 %rd31, %rd30;
mov.u64 %rd32, assertFunc_0;
cvta.global.u64 %rd33, %rd32;
mov.b32 %r10, 883;
mov.u64 %rd34, 1;
{ // callseq 0, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd29;
.param .b64 param1;
st.param.b64 [param1+0], %rd31;
.param .b32 param2;
st.param.b32 [param2+0], %r10;
.param .b64 param3;
st.param.b64 [param3+0], %rd33;
.param .b64 param4;
st.param.b64 [param4+0], %rd34;
call.uni
__assertfail,
(
param0,
param1,
param2,
param3,
param4
);
} // callseq 0
$L__BB0_2:
.loc 1 21 36
// Fourth element owned by this thread: local offset %r2 | 257 (= %r2 + 257).
or.b32 %r15, %r2, 257;
cvt.u64.u32 %rd39, %r15;
.loc 1 21 23
// Global element indices of the fourth (%rd40) and second (%rd41) elements.
or.b64 %rd40, %rd1, %rd39;
or.b64 %rd41, %rd1, %rd8;
.loc 1 34 25
// Destination address per element:
//   param_1 + index * 4 + element_position * 201028
// (201028 = 4 * 50257). Results: %rd45, %rd49, %rd53, %rd57 for elements
// %rd2, %rd41, %rd3, %rd40 with indices %rd7, %rd6, %rd5, %rd4.
shl.b64 %rd42, %rd7, 2;
add.s64 %rd43, %rd9, %rd42;
mul.lo.s64 %rd44, %rd2, 201028;
add.s64 %rd45, %rd43, %rd44;
shl.b64 %rd46, %rd6, 2;
add.s64 %rd47, %rd9, %rd46;
mul.lo.s64 %rd48, %rd41, 201028;
add.s64 %rd49, %rd47, %rd48;
shl.b64 %rd50, %rd5, 2;
add.s64 %rd51, %rd9, %rd50;
mul.lo.s64 %rd52, %rd3, 201028;
add.s64 %rd53, %rd51, %rd52;
shl.b64 %rd54, %rd4, 2;
add.s64 %rd55, %rd9, %rd54;
mul.lo.s64 %rd56, %rd40, 201028;
add.s64 %rd57, %rd55, %rd56;
.loc 1 34 51
// Exchange the 64-bit destination addresses through shared memory with the
// same pattern used for the flags: write two addresses at byte offsets
// 8 * %r2 and 8 * %r2 + 8, read back the ones at 8 * %r1 and 8 * %r1 + 1024
// (block-local elements tid and tid + 128). The leading barrier keeps the
// writes from overtaking earlier shared-memory reads.
bar.sync 0;
shl.b32 %r16, %r2, 3;
add.s32 %r18, %r7, %r16;
st.shared.u64 [%r18], %rd45;
st.shared.u64 [%r18+8], %rd49;
bar.sync 0;
shl.b32 %r19, %r1, 3;
add.s32 %r20, %r7, %r19;
ld.shared.u64 %rd35, [%r20];
ld.shared.u64 %rd36, [%r20+1024];
bar.sync 0;
// Second round for the upper pair (elements tid + 256 and tid + 384).
st.shared.u64 [%r18], %rd53;
st.shared.u64 [%r18+8], %rd57;
bar.sync 0;
ld.shared.u64 %rd37, [%r20];
ld.shared.u64 %rd38, [%r20+1024];
// -1082130432 is the signed spelling of 0xBF800000, the IEEE-754 binary32
// encoding of -1.0. Write it to the four destination addresses.
mov.b32 %r11, -1082130432;
@%p20 st.global.b32 [ %rd35 + 0 ], { %r11 };
@%p20 st.global.b32 [ %rd36 + 0 ], { %r11 };
@%p20 st.global.b32 [ %rd37 + 0 ], { %r11 };
@%p20 st.global.b32 [ %rd38 + 0 ], { %r11 };
.loc 1 34 4
ret;
$L__tmp1:
$L__func_end0:
}
// Source file number 1, the file that every ".loc 1 <line> <col>" directive
// in the kernel refers to.
.file 1 "/tmp/torchinductor_root/hl/chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py"
// Debug abbreviation table, emitted as raw bytes. It holds two abbreviation
// entries (codes 1 and 2) that the .debug_info section refers to.
// NOTE(review): the tag/attribute/form names below are read off the DWARF
// constant tables -- confirm against the DWARF specification before relying
// on them.
.section .debug_abbrev
{
// Abbreviation 1: tag 17 (compile unit), has children = 1.
.b8 1
.b8 17
.b8 1
// Attribute/form pairs: 37/8 (producer, string), 19/5 (language, data2),
// 3/8 (name, string), 16/6 (stmt_list, data4), 27/8 (comp_dir, string).
.b8 37
.b8 8
.b8 19
.b8 5
.b8 3
.b8 8
.b8 16
.b8 6
.b8 27
.b8 8
// Two-byte ULEB128 attribute 180,66 = 0x2134 with form 12 (flag).
.b8 180
.b8 66
.b8 12
// 17/1 (low_pc, address), 18/1 (high_pc, address).
.b8 17
.b8 1
.b8 18
.b8 1
// End of abbreviation 1.
.b8 0
.b8 0
// Abbreviation 2: tag 46 (subprogram), has children = 0.
.b8 2
.b8 46
.b8 0
// 17/1 (low_pc, address), 18/1 (high_pc, address), 64/10 (frame_base, block1).
.b8 17
.b8 1
.b8 18
.b8 1
.b8 64
.b8 10
// Two-byte ULEB128 attribute 135,64 = 0x2007 with form 8 (string).
.b8 135
.b8 64
.b8 8
// 3/8 (name, string), 58/11 (decl_file, data1), 59/11 (decl_line, data1),
// 63/12 (external, flag).
.b8 3
.b8 8
.b8 58
.b8 11
.b8 59
.b8 11
.b8 63
.b8 12
// End of abbreviation 2, then end of the table.
.b8 0
.b8 0
.b8 0
}
// Debug information section, emitted as raw bytes: one unit header, one
// entry using abbreviation 1 and one child entry using abbreviation 2.
// The embedded ASCII strings are decoded in the comments below.
// NOTE(review): field meanings follow the DWARF layout implied by the
// abbreviation table -- confirm against the DWARF specification.
.section .debug_info
{
// Unit header: length 176, 16-bit version 2, offset into .debug_abbrev,
// address size 8.
.b32 176
.b8 2
.b8 0
.b32 .debug_abbrev
.b8 8
// Entry with abbreviation code 1.
.b8 1
// NUL-terminated string "triton".
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
// 16-bit value 2 (language field).
.b8 2
.b8 0
// NUL-terminated string
// "chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py".
.b8 99
.b8 104
.b8 108
.b8 114
.b8 107
.b8 103
.b8 112
.b8 118
.b8 118
.b8 98
.b8 100
.b8 105
.b8 122
.b8 100
.b8 122
.b8 55
.b8 115
.b8 108
.b8 108
.b8 113
.b8 117
.b8 101
.b8 116
.b8 50
.b8 106
.b8 55
.b8 122
.b8 104
.b8 116
.b8 101
.b8 115
.b8 54
.b8 109
.b8 101
.b8 104
.b8 54
.b8 107
.b8 101
.b8 110
.b8 114
.b8 113
.b8 120
.b8 111
.b8 118
.b8 50
.b8 54
.b8 109
.b8 115
.b8 119
.b8 118
.b8 119
.b8 55
.b8 46
.b8 112
.b8 121
.b8 0
// Offset into the .debug_line section.
.b32 .debug_line
// NUL-terminated string "/tmp/torchinductor_root/hl".
.b8 47
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 114
.b8 111
.b8 111
.b8 116
.b8 47
.b8 104
.b8 108
.b8 0
// Flag = 1, then the address range of the unit: the kernel's begin/end labels.
.b8 1
.b64 $L__func_begin0
.b64 $L__func_end0
// Child entry with abbreviation code 2, covering the same address range.
.b8 2
.b64 $L__func_begin0
.b64 $L__func_end0
// One-byte block: length 1, content 156 (frame base expression).
.b8 1
.b8 156
// NUL-terminated string "triton__0d1d2de" (first of two name strings).
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 101
.b8 0
// NUL-terminated string "triton__0d1d2de" (second name string).
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 101
.b8 0
// Declaration file 1, declaration line 18, external flag 1.
.b8 1
.b8 18
.b8 1
// Terminator for the children of the first entry.
.b8 0
}
// Public-names lookup table: one record naming "triton__0d1d2de".
// NOTE(review): field meanings follow the DWARF .debug_pubnames layout --
// confirm against the DWARF specification.
.section .debug_pubnames
{
// Length of the table body, computed from the two labels below.
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
// 16-bit version 2, offset of the unit in .debug_info, unit size 180
// (the 176-byte unit length plus its 4-byte length field).
.b8 2
.b8 0
.b32 .debug_info
.b32 180
// Offset 125 within the unit, followed by the NUL-terminated name
// "triton__0d1d2de".
.b32 125
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 50
.b8 100
.b8 101
.b8 0
// Zero offset terminates the list.
.b32 0
$L__pubNames_end0:
}
// Public-types lookup table: header only, with no type records before the
// zero terminator.
.section .debug_pubtypes
{
// Length of the table body, computed from the two labels below.
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
// 16-bit version 2, offset of the unit in .debug_info, unit size 180.
.b8 2
.b8 0
.b32 .debug_info
.b32 180
// Zero offset terminates the (empty) list.
.b32 0
$L__pubTypes_end0:
}
// Empty location-list section.
.section .debug_loc { }