|
// |
|
// Generated by LLVM NVPTX Back-End |
|
// |
|
|
|
// Module header: PTX version 8.2, compiled for target sm_89, with 64-bit addresses. |
.version 8.2 |
|
.target sm_89 |
|
.address_size 64 |
|
|
|
// .globl triton__0d1d2de |
|
// Declaration of the externally defined function __assertfail (no body in this module). |
// The single call site in triton__0d1d2de passes, in order: |
//   param_0 (.b64) - generic address of assertMessage_0 |
//   param_1 (.b64) - generic address of assertFile_0 |
//   param_2 (.b32) - the constant 883 (NOTE(review): presumably a source line number - confirm) |
//   param_3 (.b64) - generic address of assertFunc_0 |
//   param_4 (.b64) - the constant 1 (NOTE(review): presumably the character size - confirm) |
.extern .func __assertfail |
|
( |
|
.param .b64 __assertfail_param_0, |
|
.param .b64 __assertfail_param_1, |
|
.param .b32 __assertfail_param_2, |
|
.param .b64 __assertfail_param_3, |
|
.param .b64 __assertfail_param_4 |
|
) |
|
|
|
// Text handed to __assertfail by triton__0d1d2de (function name, file name, message). |
// Each array now ends with an explicit 0 byte so the callee can find the end of the text; |
// the previous definitions held only the characters and had no terminator. |
// assertFunc_0 = "_call_with_frames_removed" + NUL |
.global .align 1 .b8 assertFunc_0[26] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100, 0} |
|
// assertFile_0 = "<frozen importlib._bootstrap_external>" + NUL |
.global .align 1 .b8 assertFile_0[39] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62, 0} |
|
// assertMessage_0 = "index out of bounds: 0 <= tmp7 < 50257" + NUL |
.global .align 1 .b8 assertMessage_0[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 55, 32, 60, 32, 53, 48, 50, 53, 55, 0} |
|
// Externally sized shared-memory byte array used by triton__0d1d2de as scratch space. |
// That kernel touches byte offsets 0..2047 from this symbol and performs 64-bit loads/stores on it. |
// NOTE(review): alignment is declared as 1 while the kernel issues 8-byte accesses at the base - |
// confirm the launcher provides at least 2048 bytes and a suitably aligned base. |
.extern .shared .align 1 .b8 global_smem[] |
|
|
|
// Entry kernel triton__0d1d2de. |
// Parameters (all .u64): |
//   param_0 - base address read as 64-bit elements (two v2.b64 loads per thread). |
//   param_1 - base address used to form the 32-bit store destinations. |
//   param_2 - declared but never loaded in this body. |
// Launch shape: at most 128 threads per block (.maxntid); %ctaid.x selects a 512-element tile. |
// With e = ctaid.x*512 + 2*(tid.x & 127), each thread loads elements e, e+1, e+256, e+257 and computes |
//   v = (v == -1) ? 0 : v;   v = (v < 0) ? v + 50257 : v; |
// Any v outside 0 <= v < 50257 leads to a call to __assertfail. |
// Finally the 32-bit constant -1082130432 is stored to param_1 + 4*v + 201028*element for every element. |
// Range flags and destination addresses are exchanged between threads through global_smem, |
// so a thread checks and stores for elements that other threads loaded. |
.visible .entry triton__0d1d2de( |
|
.param .u64 triton__0d1d2de_param_0, |
|
.param .u64 triton__0d1d2de_param_1, |
|
.param .u64 triton__0d1d2de_param_2 |
|
) |
|
.maxntid 128, 1, 1 |
|
{ |
|
.reg .pred %p<24> |
|
.reg .b16 %rs<21> |
|
.reg .b32 %r<21> |
|
.reg .b64 %rd<58> |
|
.loc 1 18 0 |
|
$L__func_begin0: |
|
.loc 1 18 0 |
|
|
|
// Load the two pointer parameters the body uses: %rd9 = param_1, %rd16 = param_0. |
ld.param.u64 %rd9, [triton__0d1d2de_param_1] |
|
ld.param.u64 %rd16, [triton__0d1d2de_param_0] |
|
$L__tmp0: |
|
.loc 1 21 36 |
|
// %r1 = tid.x & 127; %r2 = 2*%r1 is this thread's first offset inside the tile. |
// %r5 = %r2 + 1 and %r6 = %r2 + 256 (the OR-ed bits are clear in %r2, so OR acts as add). |
mov.u32 %r4, %tid.x |
|
and.b32 %r1, %r4, 127 |
|
shl.b32 %r2, %r1, 1 |
|
or.b32 %r5, %r2, 1 |
|
or.b32 %r6, %r2, 256 |
|
.loc 1 20 28 |
|
mov.u32 %r3, %ctaid.x |
|
.loc 1 20 46 |
|
// %rd1 = ctaid.x * 512, widened as a signed 32-bit value: first element index of this block's tile. |
mul.wide.s32 %rd1, %r3, 512 |
|
cvt.u64.u32 %rd17, %r2 |
|
cvt.u64.u32 %rd18, %r6 |
|
.loc 1 21 23 |
|
// The low 9 bits of %rd1 are zero, so these ORs act as additions: %rd2 = e, %rd3 = e + 256. |
or.b64 %rd2, %rd1, %rd17 |
|
or.b64 %rd3, %rd1, %rd18 |
|
.loc 1 24 30 |
|
// %rd12 = param_0 + 8*e; %rd15 is 2048 bytes (256 eight-byte elements) further on. |
shl.b64 %rd19, %rd2, 3 |
|
add.s64 %rd12, %rd16, %rd19 |
|
add.s64 %rd15, %rd12, 2048 |
|
// %p20 is constant true, so the predicated loads and stores below always execute. |
mov.pred %p20, -1 |
|
.loc 1 24 35 |
|
// Load elements e, e+1 into %rd10, %rd11 and elements e+256, e+257 into %rd13, %rd14. |
mov.u64 %rd10, 0x0 |
|
mov.u64 %rd11, 0x0 |
|
@%p20 ld.global.v2.b64 { %rd10, %rd11 }, [ %rd12 + 0 ] |
|
mov.u64 %rd13, 0x0 |
|
mov.u64 %rd14, 0x0 |
|
@%p20 ld.global.v2.b64 { %rd13, %rd14 }, [ %rd15 + 0 ] |
|
.loc 1 26 19 |
|
// Replace the value -1 by 0 (results in %rd20..%rd23). |
setp.eq.s64 %p3, %rd14, -1 |
|
setp.eq.s64 %p4, %rd13, -1 |
|
setp.eq.s64 %p5, %rd11, -1 |
|
setp.eq.s64 %p6, %rd10, -1 |
|
.loc 1 28 32 |
|
selp.b64 %rd20, 0, %rd10, %p6 |
|
selp.b64 %rd21, 0, %rd11, %p5 |
|
selp.b64 %rd22, 0, %rd13, %p4 |
|
selp.b64 %rd23, 0, %rd14, %p3 |
|
.loc 1 29 18 |
|
// Wrap negative values: v + 50257 is selected when v < 0. |
// Results: %rd7 (element e), %rd6 (e+1), %rd5 (e+256), %rd4 (e+257). |
add.s64 %rd24, %rd23, 50257 |
|
add.s64 %rd25, %rd22, 50257 |
|
add.s64 %rd26, %rd21, 50257 |
|
add.s64 %rd27, %rd20, 50257 |
|
.loc 1 30 18 |
|
setp.lt.s64 %p7, %rd23, 0 |
|
setp.lt.s64 %p8, %rd22, 0 |
|
setp.lt.s64 %p9, %rd21, 0 |
|
setp.lt.s64 %p10, %rd20, 0 |
|
.loc 1 31 32 |
|
selp.b64 %rd7, %rd27, %rd20, %p10 |
|
selp.b64 %rd6, %rd26, %rd21, %p9 |
|
selp.b64 %rd5, %rd25, %rd22, %p8 |
|
selp.b64 %rd4, %rd24, %rd23, %p7 |
|
.loc 1 32 36 |
|
// One unsigned compare per value tests 0 <= v < 50257 (a still-negative v is a huge unsigned number). |
setp.lt.u64 %p11, %rd4, 50257 |
|
setp.lt.u64 %p12, %rd5, 50257 |
|
setp.lt.u64 %p13, %rd6, 50257 |
|
setp.lt.u64 %p14, %rd7, 50257 |
|
// Exchange the in-range flags through shared memory: each thread writes bytes 2*tid and 2*tid+1, |
// then (after the barrier) reads bytes tid and tid+128. Round one: flags for elements e and e+1. |
mov.u32 %r7, global_smem |
|
add.s32 %r8, %r7, %r2 |
|
selp.u16 %rs1, 1, 0, %p14 |
|
st.shared.u8 [%r8], %rs1 |
|
cvt.u64.u32 %rd8, %r5 |
|
selp.u16 %rs2, 1, 0, %p13 |
|
st.shared.u8 [%r8+1], %rs2 |
|
bar.sync 0 |
|
add.s32 %r9, %r7, %r1 |
|
ld.shared.u8 %rs3, [%r9] |
|
ld.shared.u8 %rs4, [%r9+128] |
|
// Round two reuses the same bytes for the flags of elements e+256 and e+257; |
// the barrier separates the round-one reads from these overwriting stores. |
bar.sync 0 |
|
selp.u16 %rs5, 1, 0, %p12 |
|
st.shared.u8 [%r8], %rs5 |
|
selp.u16 %rs6, 1, 0, %p11 |
|
st.shared.u8 [%r8+1], %rs6 |
|
bar.sync 0 |
|
ld.shared.u8 %rs7, [%r9] |
|
ld.shared.u8 %rs8, [%r9+128] |
|
// Build a 4-bit mask with one bit set per flag that was read back as 0 (value out of range): |
// bit 0 <- %rs4, bit 1 <- %rs3, bit 2 <- %rs7, bit 3 <- %rs8. |
setp.eq.s16 %p15, %rs7, 0 |
|
selp.u16 %rs9, 1, 0, %p15 |
|
shl.b16 %rs10, %rs9, 2 |
|
setp.eq.s16 %p16, %rs8, 0 |
|
selp.u16 %rs11, -1, 0, %p16 |
|
shl.b16 %rs12, %rs11, 3 |
|
or.b16 %rs13, %rs12, %rs10 |
|
setp.eq.s16 %p17, %rs4, 0 |
|
selp.u16 %rs14, 1, 0, %p17 |
|
setp.eq.s16 %p18, %rs3, 0 |
|
selp.u16 %rs15, -1, 0, %p18 |
|
shl.b16 %rs16, %rs15, 1 |
|
or.b16 %rs17, %rs14, %rs16 |
|
and.b16 %rs18, %rs17, 3 |
|
or.b16 %rs19, %rs18, %rs13 |
|
.loc 1 32 51 |
|
// Mask == 0 means all four flags seen by this thread were in range: skip the assert call. |
and.b16 %rs20, %rs19, 15 |
|
setp.eq.s16 %p19, %rs20, 0 |
|
@%p19 bra $L__BB0_2 |
|
// Out-of-range path: convert the three string symbols to generic addresses and call |
// __assertfail(message, file, 883, function, 1). |
mov.u64 %rd28, assertMessage_0 |
|
cvta.global.u64 %rd29, %rd28 |
|
mov.u64 %rd30, assertFile_0 |
|
cvta.global.u64 %rd31, %rd30 |
|
mov.u64 %rd32, assertFunc_0 |
|
cvta.global.u64 %rd33, %rd32 |
|
mov.b32 %r10, 883 |
|
mov.u64 %rd34, 1 |
|
{ // callseq 0, 0 |
|
.reg .b32 temp_param_reg |
|
.param .b64 param0 |
|
st.param.b64 [param0+0], %rd29 |
|
.param .b64 param1 |
|
st.param.b64 [param1+0], %rd31 |
|
.param .b32 param2 |
|
st.param.b32 [param2+0], %r10 |
|
.param .b64 param3 |
|
st.param.b64 [param3+0], %rd33 |
|
.param .b64 param4 |
|
st.param.b64 [param4+0], %rd34 |
|
call.uni |
|
__assertfail, |
|
( |
|
param0, |
|
param1, |
|
param2, |
|
param3, |
|
param4 |
|
) |
|
} // callseq 0 |
|
$L__BB0_2: |
|
.loc 1 21 36 |
|
// Remaining element indices: %rd40 = e + 257, %rd41 = e + 1 (OR acts as add, as above). |
or.b32 %r15, %r2, 257 |
|
cvt.u64.u32 %rd39, %r15 |
|
.loc 1 21 23 |
|
or.b64 %rd40, %rd1, %rd39 |
|
or.b64 %rd41, %rd1, %rd8 |
|
.loc 1 34 25 |
|
// Destination address = param_1 + 4*v + 201028*element, where 201028 = 4 * 50257. |
// %rd45, %rd49, %rd53, %rd57 belong to elements e, e+1, e+256, e+257 respectively. |
shl.b64 %rd42, %rd7, 2 |
|
add.s64 %rd43, %rd9, %rd42 |
|
mul.lo.s64 %rd44, %rd2, 201028 |
|
add.s64 %rd45, %rd43, %rd44 |
|
shl.b64 %rd46, %rd6, 2 |
|
add.s64 %rd47, %rd9, %rd46 |
|
mul.lo.s64 %rd48, %rd41, 201028 |
|
add.s64 %rd49, %rd47, %rd48 |
|
shl.b64 %rd50, %rd5, 2 |
|
add.s64 %rd51, %rd9, %rd50 |
|
mul.lo.s64 %rd52, %rd3, 201028 |
|
add.s64 %rd53, %rd51, %rd52 |
|
shl.b64 %rd54, %rd4, 2 |
|
add.s64 %rd55, %rd9, %rd54 |
|
mul.lo.s64 %rd56, %rd40, 201028 |
|
add.s64 %rd57, %rd55, %rd56 |
|
.loc 1 34 51 |
|
// Exchange the destination addresses through shared memory using 8-byte slots: |
// each thread writes slots 2*tid and 2*tid+1, then reads slots tid and tid+128 (byte offset +1024). |
// Round one carries %rd45/%rd49, round two %rd53/%rd57; barriers separate every write and read phase. |
bar.sync 0 |
|
shl.b32 %r16, %r2, 3 |
|
add.s32 %r18, %r7, %r16 |
|
st.shared.u64 [%r18], %rd45 |
|
st.shared.u64 [%r18+8], %rd49 |
|
bar.sync 0 |
|
shl.b32 %r19, %r1, 3 |
|
add.s32 %r20, %r7, %r19 |
|
ld.shared.u64 %rd35, [%r20] |
|
ld.shared.u64 %rd36, [%r20+1024] |
|
bar.sync 0 |
|
st.shared.u64 [%r18], %rd53 |
|
st.shared.u64 [%r18+8], %rd57 |
|
bar.sync 0 |
|
ld.shared.u64 %rd37, [%r20] |
|
ld.shared.u64 %rd38, [%r20+1024] |
|
// Store the same 32-bit constant through each of the four received addresses. |
// -1082130432 == 0xBF800000; NOTE(review): that is the bit pattern of -1.0 if the destination is read as f32 - confirm. |
mov.b32 %r11, -1082130432 |
|
@%p20 st.global.b32 [ %rd35 + 0 ], { %r11 } |
|
@%p20 st.global.b32 [ %rd36 + 0 ], { %r11 } |
|
@%p20 st.global.b32 [ %rd37 + 0 ], { %r11 } |
|
@%p20 st.global.b32 [ %rd38 + 0 ], { %r11 } |
|
.loc 1 34 4 |
|
ret |
|
$L__tmp1: |
|
$L__func_end0: |
|
|
|
} |
|
// Source file number 1, the file index used by every ".loc 1 ..." directive in the kernel above. |
.file 1 "/tmp/torchinductor_root/hl/chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py" |
|
// DWARF abbreviation table referenced by .debug_info. |
// The names in the comments below are the DWARF specification's names for these numeric codes |
// (0x2134 and 0x2007 are vendor extension attributes); the byte values themselves are unchanged. |
.section .debug_abbrev |
|
{ |
|
// Abbreviation 1: tag 17 (DW_TAG_compile_unit), has children (1). |
.b8 1 |
|
.b8 17 |
|
.b8 1 |
|
// 37 = DW_AT_producer, form 8 = DW_FORM_string |
.b8 37 |
|
.b8 8 |
|
// 19 = DW_AT_language, form 5 = DW_FORM_data2 |
.b8 19 |
|
.b8 5 |
|
// 3 = DW_AT_name, form 8 = DW_FORM_string |
.b8 3 |
|
.b8 8 |
|
// 16 = DW_AT_stmt_list, form 6 = DW_FORM_data4 |
.b8 16 |
|
.b8 6 |
|
// 27 = DW_AT_comp_dir, form 8 = DW_FORM_string |
.b8 27 |
|
.b8 8 |
|
// 180, 66 = ULEB128 0x2134 (DW_AT_GNU_pubnames), form 12 = DW_FORM_flag |
.b8 180 |
|
.b8 66 |
|
.b8 12 |
|
// 17 = DW_AT_low_pc, form 1 = DW_FORM_addr |
.b8 17 |
|
.b8 1 |
|
// 18 = DW_AT_high_pc, form 1 = DW_FORM_addr |
.b8 18 |
|
.b8 1 |
|
// 0, 0 terminates the attribute list of abbreviation 1. |
.b8 0 |
|
.b8 0 |
|
// Abbreviation 2: tag 46 (DW_TAG_subprogram), no children (0). |
.b8 2 |
|
.b8 46 |
|
.b8 0 |
|
// 17 = DW_AT_low_pc, form 1 = DW_FORM_addr |
.b8 17 |
|
.b8 1 |
|
// 18 = DW_AT_high_pc, form 1 = DW_FORM_addr |
.b8 18 |
|
.b8 1 |
|
// 64 = DW_AT_frame_base, form 10 = DW_FORM_block1 |
.b8 64 |
|
.b8 10 |
|
// 135, 64 = ULEB128 0x2007 (DW_AT_MIPS_linkage_name), form 8 = DW_FORM_string |
.b8 135 |
|
.b8 64 |
|
.b8 8 |
|
// 3 = DW_AT_name, form 8 = DW_FORM_string |
.b8 3 |
|
.b8 8 |
|
// 58 = DW_AT_decl_file, form 11 = DW_FORM_data1 |
.b8 58 |
|
.b8 11 |
|
// 59 = DW_AT_decl_line, form 11 = DW_FORM_data1 |
.b8 59 |
|
.b8 11 |
|
// 63 = DW_AT_external, form 12 = DW_FORM_flag |
.b8 63 |
|
.b8 12 |
|
// 0, 0 terminates the attribute list of abbreviation 2; the final 0 ends the table. |
.b8 0 |
|
.b8 0 |
|
.b8 0 |
|
} |
|
// DWARF .debug_info contribution: one compile unit containing one subprogram entry. |
// Field meanings follow the layout declared in .debug_abbrev (abbreviations 1 and 2). |
.section .debug_info |
|
{ |
|
// Unit header: length 176 (bytes after this field), version 2, abbreviation table reference, address size 8. |
.b32 176 |
|
.b8 2 |
|
.b8 0 |
|
.b32 .debug_abbrev |
|
.b8 8 |
|
// Entry using abbreviation 1 (compile unit). |
.b8 1 |
|
// Producer string: "triton" + NUL |
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 0 |
|
// Language code, 16-bit little-endian value 2 (DW_LANG_C in the DWARF specification). |
.b8 2 |
|
.b8 0 |
|
// Name string: "chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py" + NUL |
.b8 99 |
|
.b8 104 |
|
.b8 108 |
|
.b8 114 |
|
.b8 107 |
|
.b8 103 |
|
.b8 112 |
|
.b8 118 |
|
.b8 118 |
|
.b8 98 |
|
.b8 100 |
|
.b8 105 |
|
.b8 122 |
|
.b8 100 |
|
.b8 122 |
|
.b8 55 |
|
.b8 115 |
|
.b8 108 |
|
.b8 108 |
|
.b8 113 |
|
.b8 117 |
|
.b8 101 |
|
.b8 116 |
|
.b8 50 |
|
.b8 106 |
|
.b8 55 |
|
.b8 122 |
|
.b8 104 |
|
.b8 116 |
|
.b8 101 |
|
.b8 115 |
|
.b8 54 |
|
.b8 109 |
|
.b8 101 |
|
.b8 104 |
|
.b8 54 |
|
.b8 107 |
|
.b8 101 |
|
.b8 110 |
|
.b8 114 |
|
.b8 113 |
|
.b8 120 |
|
.b8 111 |
|
.b8 118 |
|
.b8 50 |
|
.b8 54 |
|
.b8 109 |
|
.b8 115 |
|
.b8 119 |
|
.b8 118 |
|
.b8 119 |
|
.b8 55 |
|
.b8 46 |
|
.b8 112 |
|
.b8 121 |
|
.b8 0 |
|
// Statement-list attribute: reference to the .debug_line section. |
.b32 .debug_line |
|
// Compilation directory string: "/tmp/torchinductor_root/hl" + NUL |
.b8 47 |
|
.b8 116 |
|
.b8 109 |
|
.b8 112 |
|
.b8 47 |
|
.b8 116 |
|
.b8 111 |
|
.b8 114 |
|
.b8 99 |
|
.b8 104 |
|
.b8 105 |
|
.b8 110 |
|
.b8 100 |
|
.b8 117 |
|
.b8 99 |
|
.b8 116 |
|
.b8 111 |
|
.b8 114 |
|
.b8 95 |
|
.b8 114 |
|
.b8 111 |
|
.b8 111 |
|
.b8 116 |
|
.b8 47 |
|
.b8 104 |
|
.b8 108 |
|
.b8 0 |
|
// Flag attribute 0x2134 = 1, then the compile unit's low/high PC (function begin/end labels). |
.b8 1 |
|
.b64 $L__func_begin0 |
|
.b64 $L__func_end0 |
|
// Entry using abbreviation 2 (subprogram): low/high PC span the kernel body. |
.b8 2 |
|
.b64 $L__func_begin0 |
|
.b64 $L__func_end0 |
|
// Frame base: 1-byte block holding opcode 156 (0x9c, DW_OP_call_frame_cfa in the DWARF specification). |
.b8 1 |
|
.b8 156 |
|
// Linkage name string: "triton__0d1d2de" + NUL |
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 95 |
|
.b8 95 |
|
.b8 48 |
|
.b8 100 |
|
.b8 49 |
|
.b8 100 |
|
.b8 50 |
|
.b8 100 |
|
.b8 101 |
|
.b8 0 |
|
// Name string: "triton__0d1d2de" + NUL |
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 95 |
|
.b8 95 |
|
.b8 48 |
|
.b8 100 |
|
.b8 49 |
|
.b8 100 |
|
.b8 50 |
|
.b8 100 |
|
.b8 101 |
|
.b8 0 |
|
// Declaration file 1, declaration line 18, external flag 1; the final 0 ends the compile unit's children. |
.b8 1 |
|
.b8 18 |
|
.b8 1 |
|
.b8 0 |
|
} |
|
// DWARF .debug_pubnames: a single name entry for the kernel. |
.section .debug_pubnames |
|
{ |
|
// Length of the data between the start and end labels below. |
.b32 $L__pubNames_end0-$L__pubNames_start0 |
|
$L__pubNames_start0: |
|
// Version 2, reference to .debug_info, and 180 = size of the .debug_info unit (176 + 4-byte length field). |
.b8 2 |
|
.b8 0 |
|
.b32 .debug_info |
|
.b32 180 |
|
// Offset 125 within .debug_info: where the subprogram entry starts, followed by its name. |
.b32 125 |
|
// "triton__0d1d2de" + NUL |
.b8 116 |
|
.b8 114 |
|
.b8 105 |
|
.b8 116 |
|
.b8 111 |
|
.b8 110 |
|
.b8 95 |
|
.b8 95 |
|
.b8 48 |
|
.b8 100 |
|
.b8 49 |
|
.b8 100 |
|
.b8 50 |
|
.b8 100 |
|
.b8 101 |
|
.b8 0 |
|
// Zero offset terminates the list. |
.b32 0 |
|
$L__pubNames_end0: |
|
} |
|
// DWARF .debug_pubtypes: header only; the list is terminated immediately, so it names no types. |
.section .debug_pubtypes |
|
{ |
|
.b32 $L__pubTypes_end0-$L__pubTypes_start0 |
|
$L__pubTypes_start0: |
|
// Version 2, reference to .debug_info, size 180, then the zero terminator. |
.b8 2 |
|
.b8 0 |
|
.b32 .debug_info |
|
.b32 180 |
|
.b32 0 |
|
$L__pubTypes_end0: |
|
} |
|
// Empty .debug_loc section: it contains no entries. |
.section .debug_loc { } |
|
|