//
// Generated by LLVM NVPTX Back-End
//
// Module overview (derived from the instructions below):
//   * One kernel, triton__0d1d2d3d4d5d6d7d8de9de, launched with at most
//     64 threads per CTA (.maxntid 64, 1, 1).
//   * Each CTA handles 256 consecutive elements starting at ctaid.x * 256;
//     each thread handles 4 consecutive elements starting at 4 * (tid.x & 63).
//   * Per element: x = in1 (f32) + in2 + in3 + in4 (bf16 widened to f32).
//     The CTA then computes mean = sum(x) / 256, the mean of squared
//     deviations var = sum((x - mean)^2) / 256, rstd = rsqrt(var + eps),
//     and writes (x - mean) * rstd * w as bf16.
//   * The per-CTA rstd and mean are each written once, by thread 0.
//

.version 8.2
.target sm_89
.address_size 64

	// .globl	triton__0d1d2d3d4d5d6d7d8de9de
// Shared-memory scratch for the cross-warp reductions. The size is not
// declared here; the kernel only touches byte offsets 0..7 (two 32-bit slots).
.extern .shared .align 1 .b8 global_smem[];
// ASCII bytes spell "__CUDA_FTZ" followed by NUL. Nothing in this file
// references the symbol.
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};

// Kernel parameters as used by the body:
//   param_0 (u64 ptr): written with rstd (32-bit) at index ctaid.x
//   param_1 (u64 ptr): read as 4 x 32-bit values per thread, used as f32
//   param_2 (u64 ptr): read as 4 x bf16 per thread
//   param_3 (u64 ptr): read as 4 x bf16 per thread
//   param_4 (u64 ptr): read as 4 x bf16 per thread
//   param_5 (u64 ptr): read as 4 x 32-bit values per thread, indexed by the
//                      in-CTA offset only (same data for every CTA); used as
//                      a per-element f32 multiplier
//   param_6 (u64 ptr): written with mean (32-bit) at index ctaid.x
//   param_7 (u64 ptr): written with 4 x bf16 results per thread
//   param_8, param_9 (u32): declared but never loaded in this body
.visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
	.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
	.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
	.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
)
.maxntid 64, 1, 1
{
	.reg .pred %p<31>;
	.reg .b16 %rs<17>;
	.reg .b32 %r<103>;
	.reg .f32 %f<86>;
	.reg .b64 %rd<21>;
	.loc 1 18 0
$L__func_begin0:
	.loc 1 18 0

	// ---- Parameter loads and index computation -------------------------
	ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
	ld.param.u64 %rd10, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
$L__tmp0:
	.loc 1 26 26
	mov.u32 %r68, %tid.x;
	and.b32 %r69, %r68, 31;                 // %r69 = lane id within the warp
	ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
	ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
	ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
	and.b32 %r70, %r68, 63;                 // %r70 = tid.x & 63
	ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
	shl.b32 %r71, %r70, 2;                  // %r71 = 4 * %r70: first of this thread's 4 elements
	ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
	ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
	.loc 1 23 28
	mov.u32 %r1, %ctaid.x;
	.loc 1 30 40
	shl.b32 %r72, %r1, 8;                   // %r72 = ctaid.x * 256
	.loc 1 30 36
	or.b32 %r73, %r72, %r71;                // %r73 = global element index (low 8 bits of %r72 are zero)
	.loc 1 30 30
	mul.wide.s32 %rd17, %r73, 4;            // byte offset for 32-bit elements
	add.s64 %rd1, %rd10, %rd17;
	mov.b32 %r6, 0;                         // fallback value for masked-off loads
	mov.pred %p1, -1;                       // constant-true mask: the @!%p1 moves below never execute

	// ---- Load 4 x 32-bit from param_1 -> %r2..%r5 ----------------------
	.loc 1 30 46
	mov.u32 %r2, 0x0;
	mov.u32 %r3, 0x0;
	mov.u32 %r4, 0x0;
	mov.u32 %r5, 0x0;
	@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
	@!%p1 mov.u32 %r2, %r6;
	@!%p1 mov.u32 %r3, %r6;
	@!%p1 mov.u32 %r4, %r6;
	@!%p1 mov.u32 %r5, %r6;
	mov.b32 %f1, %r4;                       // element 2 of param_1
	mov.b32 %f2, %r5;                       // element 3 of param_1

	// ---- Load 4 x bf16 from param_2 -> %f3..%f6 ------------------------
	.loc 1 31 30
	mul.wide.s32 %rd18, %r73, 2;            // byte offset for 16-bit elements (reused for param_3/4/7)
	add.s64 %rd2, %rd11, %rd18;
	.loc 1 31 46
	mov.u32 %r10, 0x0;
	mov.u32 %r11, 0x0;
	@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
	@!%p1 mov.u32 %r10, %r6;
	@!%p1 mov.u32 %r11, %r6;
	// Split each 32-bit word into its low and high 16-bit halves.
	cvt.u16.u32 %rs1, %r10;
	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
	cvt.u16.u32 %rs3, %r11;
	{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
	.loc 1 31 67
	cvt.f32.bf16 %r14, %rs1;
	mov.b32 %f3, %r14;                      // element 0
	cvt.f32.bf16 %r15, %rs2;
	mov.b32 %f4, %r15;                      // element 1
	cvt.f32.bf16 %r16, %rs3;
	mov.b32 %f5, %r16;                      // element 2
	cvt.f32.bf16 %r17, %rs4;
	mov.b32 %f6, %r17;                      // element 3

	// ---- Load 4 x bf16 from param_3 -> %f7..%f10 -----------------------
	.loc 1 32 30
	add.s64 %rd3, %rd12, %rd18;
	.loc 1 32 46
	mov.u32 %r18, 0x0;
	mov.u32 %r19, 0x0;
	@%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
	@!%p1 mov.u32 %r18, %r6;
	@!%p1 mov.u32 %r19, %r6;
	cvt.u16.u32 %rs5, %r18;
	{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
	cvt.u16.u32 %rs7, %r19;
	{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
	.loc 1 32 67
	cvt.f32.bf16 %r22, %rs5;
	mov.b32 %f7, %r22;
	cvt.f32.bf16 %r23, %rs6;
	mov.b32 %f8, %r23;
	cvt.f32.bf16 %r24, %rs7;
	mov.b32 %f9, %r24;
	cvt.f32.bf16 %r25, %rs8;
	mov.b32 %f10, %r25;

	// ---- Load 4 x bf16 from param_4 -> %f11..%f14 ----------------------
	.loc 1 33 30
	add.s64 %rd4, %rd13, %rd18;
	.loc 1 33 46
	mov.u32 %r26, 0x0;
	mov.u32 %r27, 0x0;
	@%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ];
	@!%p1 mov.u32 %r26, %r6;
	@!%p1 mov.u32 %r27, %r6;
	cvt.u16.u32 %rs9, %r26;
	{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; }
	cvt.u16.u32 %rs11, %r27;
	{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; }
	.loc 1 33 67
	cvt.f32.bf16 %r30, %rs9;
	mov.b32 %f11, %r30;
	cvt.f32.bf16 %r31, %rs10;
	mov.b32 %f12, %r31;
	cvt.f32.bf16 %r32, %rs11;
	mov.b32 %f13, %r32;
	cvt.f32.bf16 %r33, %rs12;
	mov.b32 %f14, %r33;

	// ---- Load 4 x 32-bit from param_5 -> %r34..%r37 --------------------
	// Indexed by %r71 only (no ctaid term), with an L1 evict_last hint.
	.loc 1 34 31
	mul.wide.u32 %rd19, %r71, 4;
	add.s64 %rd5, %rd14, %rd19;
	.loc 1 34 36
	mov.u32 %r34, 0x0;
	mov.u32 %r35, 0x0;
	mov.u32 %r36, 0x0;
	mov.u32 %r37, 0x0;
	@%p1 ld.global.L1::evict_last.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd5 + 0 ];
	@!%p1 mov.u32 %r34, %r6;
	@!%p1 mov.u32 %r35, %r6;
	@!%p1 mov.u32 %r36, %r6;
	@!%p1 mov.u32 %r37, %r6;

	// ---- x[i] = param_1[i] + param_2[i] + param_3[i] + param_4[i] ------
	// Results: %f26 = x[0], %f25 = x[1], %f27 = x[2], %f28 = x[3].
	.loc 1 36 18
	add.f32 %f15, %f5, %f1;
	add.f32 %f16, %f6, %f2;
	.loc 1 38 18
	add.f32 %f17, %f15, %f9;
	add.f32 %f18, %f16, %f10;
	.loc 1 30 46
	mov.b32 %f19, %r2;                      // element 0 of param_1
	mov.b32 %f20, %r3;                      // element 1 of param_1
	.loc 1 36 18
	add.f32 %f21, %f4, %f20;
	add.f32 %f22, %f3, %f19;
	.loc 1 38 18
	add.f32 %f23, %f22, %f7;
	add.f32 %f24, %f21, %f8;
	.loc 1 40 18
	add.f32 %f25, %f24, %f12;
	add.f32 %f26, %f23, %f11;
	add.f32 %f27, %f17, %f13;
	add.f32 %f28, %f18, %f14;

	// ---- Reduction 1: sum of x over the CTA ----------------------------
	// Per-thread partial sum of its 4 elements.
$L__tmp1:
	.loc 2 233 15
	add.f32 %f29, %f26, %f25;
	add.f32 %f30, %f29, %f27;
	add.f32 %f31, %f30, %f28;
	// Intra-warp butterfly reduction: xor-lane offsets 16, 8, 4, 2, 1 with a
	// full member mask (-1). After the last step every lane holds the warp sum.
$L__tmp2:
	.loc 2 243 36
	mov.b32 %r74, %f31;
	shfl.sync.bfly.b32 %r75, %r74, 16, 31, -1;
	mov.b32 %f32, %r75;
$L__tmp3:
	.loc 2 233 15
	add.f32 %f33, %f31, %f32;
$L__tmp4:
	.loc 2 243 36
	mov.b32 %r76, %f33;
	shfl.sync.bfly.b32 %r77, %r76, 8, 31, -1;
	mov.b32 %f34, %r77;
$L__tmp5:
	.loc 2 233 15
	add.f32 %f35, %f33, %f34;
$L__tmp6:
	.loc 2 243 36
	mov.b32 %r78, %f35;
	shfl.sync.bfly.b32 %r79, %r78, 4, 31, -1;
	mov.b32 %f36, %r79;
$L__tmp7:
	.loc 2 233 15
	add.f32 %f37, %f35, %f36;
$L__tmp8:
	.loc 2 243 36
	mov.b32 %r80, %f37;
	shfl.sync.bfly.b32 %r81, %r80, 2, 31, -1;
	mov.b32 %f38, %r81;
$L__tmp9:
	.loc 2 233 15
	add.f32 %f39, %f37, %f38;
$L__tmp10:
	.loc 2 243 36
	mov.b32 %r82, %f39;
	shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1;
	mov.b32 %f40, %r83;
$L__tmp11:
	.loc 2 233 15
	add.f32 %f41, %f39, %f40;               // %f41 = warp-wide sum
	// Cross-warp step: lane 0 of each warp stores its sum in shared memory.
$L__tmp12:
	.loc 2 243 36
	setp.eq.s32 %p20, %r69, 0;              // %p20 = (lane == 0)
	shr.u32 %r84, %r68, 3;
	and.b32 %r85, %r84, 4;                  // %r85 = 4 if bit 5 of tid.x is set, else 0 (slot per warp)
	mov.u32 %r86, global_smem;
	add.s32 %r42, %r86, %r85;
	mov.b32 %r43, %f41;
	@%p20 st.shared.b32 [ %r42 + 0 ], %r43;
	bar.sync 0;
	// Threads 0 and 1 each read one slot and combine them with one more
	// butterfly step. %r44 is left unwritten for threads with tid.x >= 2;
	// their result is discarded because only %p22 threads store it.
	setp.lt.s32 %p21, %r68, 2;              // %p21 = (tid.x < 2)
	shl.b32 %r87, %r68, 2;
	add.s32 %r45, %r86, %r87;               // &global_smem[4 * tid.x]
	@%p21 ld.shared.b32 %r44, [ %r45 + 0 ];
	mov.b32 %f42, %r44;
	shfl.sync.bfly.b32 %r88, %r44, 1, 31, -1;
	mov.b32 %f43, %r88;
$L__tmp13:
	.loc 2 233 15
	add.f32 %f44, %f42, %f43;
$L__tmp14:
	.loc 2 243 36
	and.b32 %r89, %r68, 1;
	setp.eq.b32 %p29, %r89, 1;
	not.pred %p30, %p29;
	and.pred %p22, %p21, %p30;              // %p22 = (tid.x < 2) && (tid.x even), i.e. thread 0
	mov.b32 %r47, %f44;
	@%p22 st.shared.b32 [ %r45 + 0 ], %r47; // CTA total goes to slot 0
	bar.sync 0;
	ld.shared.f32 %f45, [global_smem];      // every thread reads the CTA total
$L__tmp15:
	.loc 3 8 15
	add.f32 %f46, %f45, 0f00000000;         // + 0.0

	// ---- mean = sum / 256 ----------------------------------------------
$L__tmp16:
	.loc 1 48 20
	mov.b32 %r49, %f46;
	mov.b32 %r50, 1132462080;               // 0x43800000 = 256.0f; reused by the second division
	div.full.f32 %r67, %r49, %r50;
	mov.b32 %f47, %r67;                     // %f47 / %r67 = mean
	.loc 1 49 20
	sub.f32 %f48, %f26, %f47;               // d[0] = x[0] - mean
	sub.f32 %f49, %f25, %f47;               // d[1]
	sub.f32 %f50, %f27, %f47;               // d[2]
	sub.f32 %f51, %f28, %f47;               // d[3]
	.loc 1 50 20
	mul.f32 %f52, %f49, %f49;

	// ---- Reduction 2: sum of d^2 over the CTA --------------------------
	// Same shape as reduction 1. The barrier here precedes reuse of the
	// shared slots that every thread just read.
$L__tmp17:
	.loc 2 243 36
	bar.sync 0;
$L__tmp18:
	.loc 2 233 15
	fma.rn.f32 %f53, %f48, %f48, %f52;
	fma.rn.f32 %f54, %f50, %f50, %f53;
	fma.rn.f32 %f55, %f51, %f51, %f54;      // per-thread sum of squared deviations
$L__tmp19:
	.loc 2 243 36
	mov.b32 %r90, %f55;
	shfl.sync.bfly.b32 %r91, %r90, 16, 31, -1;
	mov.b32 %f56, %r91;
$L__tmp20:
	.loc 2 233 15
	add.f32 %f57, %f55, %f56;
$L__tmp21:
	.loc 2 243 36
	mov.b32 %r92, %f57;
	shfl.sync.bfly.b32 %r93, %r92, 8, 31, -1;
	mov.b32 %f58, %r93;
$L__tmp22:
	.loc 2 233 15
	add.f32 %f59, %f57, %f58;
$L__tmp23:
	.loc 2 243 36
	mov.b32 %r94, %f59;
	shfl.sync.bfly.b32 %r95, %r94, 4, 31, -1;
	mov.b32 %f60, %r95;
$L__tmp24:
	.loc 2 233 15
	add.f32 %f61, %f59, %f60;
$L__tmp25:
	.loc 2 243 36
	mov.b32 %r96, %f61;
	shfl.sync.bfly.b32 %r97, %r96, 2, 31, -1;
	mov.b32 %f62, %r97;
$L__tmp26:
	.loc 2 233 15
	add.f32 %f63, %f61, %f62;
$L__tmp27:
	.loc 2 243 36
	mov.b32 %r98, %f63;
	shfl.sync.bfly.b32 %r99, %r98, 1, 31, -1;
	mov.b32 %f64, %r99;
$L__tmp28:
	.loc 2 233 15
	add.f32 %f65, %f63, %f64;               // warp-wide sum of squares
$L__tmp29:
	.loc 2 243 36
	mov.b32 %r52, %f65;
	@%p20 st.shared.b32 [ %r42 + 0 ], %r52;
	bar.sync 0;
	@%p21 ld.shared.b32 %r53, [ %r45 + 0 ];
	mov.b32 %f66, %r53;
	shfl.sync.bfly.b32 %r100, %r53, 1, 31, -1;
	mov.b32 %f67, %r100;
$L__tmp30:
	.loc 2 233 15
	add.f32 %f68, %f66, %f67;
$L__tmp31:
	.loc 2 243 36
	mov.b32 %r56, %f68;
	@%p22 st.shared.b32 [ %r45 + 0 ], %r56;
	bar.sync 0;
	ld.shared.f32 %f69, [global_smem];
$L__tmp32:
	.loc 3 8 15
	add.f32 %f70, %f69, 0f00000000;

	// ---- rstd = rsqrt(sum_sq / 256 + eps) ------------------------------
$L__tmp33:
	.loc 1 55 20
	mov.b32 %r58, %f70;
	div.full.f32 %r57, %r58, %r50;          // divide by 256.0f
	mov.b32 %f71, %r57;
	.loc 1 57 20
	add.f32 %f72, %f71, 0f3727C5AC;         // bit pattern is approximately 1e-5
	.loc 1 58 26
	rsqrt.approx.ftz.f32 %f73, %f72;        // %f73 = rstd

	// ---- out[i] = d[i] * rstd * w[i] -----------------------------------
	.loc 1 34 36
	mov.b32 %f74, %r37;                     // w[3]
	mov.b32 %f75, %r36;                     // w[2]
	mov.b32 %f76, %r35;                     // w[1]
	mov.b32 %f77, %r34;                     // w[0]
	.loc 1 60 20
	mul.f32 %f78, %f48, %f73;
	mul.f32 %f79, %f49, %f73;
	mul.f32 %f80, %f50, %f73;
	mul.f32 %f81, %f51, %f73;
	.loc 1 61 20
	mul.f32 %f82, %f78, %f77;
	mul.f32 %f83, %f79, %f76;
	mul.f32 %f84, %f80, %f75;
	mul.f32 %f85, %f81, %f74;
	.loc 1 63 4
	bar.sync 0;

	// ---- Stores --------------------------------------------------------
	.loc 1 64 28
	mul.wide.s32 %rd20, %r1, 4;             // byte offset of this CTA's 32-bit slot
	add.s64 %rd6, %rd9, %rd20;
	.loc 1 64 40
	setp.eq.s32 %p26, %r70, 0;              // only the thread with (tid.x & 63) == 0 writes per-CTA scalars
	mov.b32 %r60, %f73;
	@%p26 st.global.b32 [ %rd6 + 0 ], { %r60 };     // param_0[ctaid.x] = rstd
	.loc 1 65 25
	add.s64 %rd7, %rd16, %rd18;
	.loc 1 65 48
	// Round each result to bf16 and pack pairs into 32-bit words
	// (first listed register goes to the low half).
	mov.b32 %r61, %f82;
	cvt.rn.bf16.f32 %rs13, %r61;
	mov.b32 %r62, %f83;
	cvt.rn.bf16.f32 %rs14, %r62;
	mov.b32 %r63, %f84;
	cvt.rn.bf16.f32 %rs15, %r63;
	mov.b32 %r64, %f85;
	cvt.rn.bf16.f32 %rs16, %r64;
	mov.b32 %r101, {%rs13, %rs14};
	mov.b32 %r102, {%rs15, %rs16};
	@%p1 st.global.v2.b32 [ %rd7 + 0 ], { %r101, %r102 };   // param_7[idx .. idx+3]
	.loc 1 66 25
	add.s64 %rd8, %rd15, %rd20;
	.loc 1 66 37
	@%p26 st.global.b32 [ %rd8 + 0 ], { %r67 };     // param_6[ctaid.x] = mean
	.loc 1 66 4
	ret;
$L__tmp34:
$L__func_end0:

}
	// .globl	__nv_rsqrtf
// Device function: returns rsqrt.approx.ftz.f32 of its single f32 argument.
// The kernel above does not call it; it issues the same instruction inline.
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
	.param .b32 __nv_rsqrtf_param_0
)
{
	.reg .f32 %f<3>;
$L__func_begin1:

	ld.param.f32 %f1, [__nv_rsqrtf_param_0];
	rsqrt.approx.ftz.f32 %f2, %f1;
	st.param.f32 [func_retval0+0], %f2;
	ret;
$L__func_end1:

}
// Source files referenced by the ".loc <file> <line> <col>" directives above.
	.file 1 "/tmp/torchinductor_root/dx/cdxa5yqgsimvskocpuiz4ajfrjfcwys3opyrdv53xfphj4576qx7.py"
	.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
	.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"

// ---- Debug sections (raw bytes emitted by the compiler) ----------------
// NOTE(review): the grouping below reads these bytes as DWARF records
// (abbreviation declarations, then DIEs). That interpretation is an
// assumption -- confirm against the DWARF specification before editing.
// The length/offset words (.b32 407, .b32 411, .b32 125) presumably depend
// on the exact byte counts, so do not add or remove entries by hand.
	.section .debug_abbrev
	{
// One declaration per line: code, tag, has-children flag, then
// (attribute, form) pairs terminated by 0 0.
.b8 1 .b8 17 .b8 1 .b8 37 .b8 8 .b8 19 .b8 5 .b8 3 .b8 8 .b8 16 .b8 6 .b8 27 .b8 8 .b8 180 .b8 66 .b8 12 .b8 17 .b8 1 .b8 18 .b8 1 .b8 0 .b8 0
.b8 2 .b8 46 .b8 0 .b8 135 .b8 64 .b8 8 .b8 3 .b8 8 .b8 58 .b8 11 .b8 59 .b8 11 .b8 63 .b8 12 .b8 32 .b8 11 .b8 0 .b8 0
.b8 3 .b8 46 .b8 1 .b8 17 .b8 1 .b8 18 .b8 1 .b8 64 .b8 10 .b8 49 .b8 19 .b8 0 .b8 0
.b8 4 .b8 29 .b8 1 .b8 49 .b8 19 .b8 17 .b8 1 .b8 18 .b8 1 .b8 88 .b8 11 .b8 89 .b8 11 .b8 87 .b8 11 .b8 0 .b8 0
.b8 5 .b8 29 .b8 0 .b8 49 .b8 19 .b8 17 .b8 1 .b8 18 .b8 1 .b8 88 .b8 11 .b8 89 .b8 11 .b8 87 .b8 11 .b8 0 .b8 0
.b8 0
	}
	.section .debug_info
	{
.b32 407
.b8 2 .b8 0
.b32 .debug_abbrev
.b8 8
.b8 1
// ASCII "triton" + NUL
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 0
.b8 2 .b8 0
// ASCII "cdxa5yqgsimvskocpuiz4ajfrjfcwys3opyrdv53xfphj4576qx7.py" + NUL
.b8 99 .b8 100 .b8 120 .b8 97 .b8 53 .b8 121 .b8 113 .b8 103 .b8 115 .b8 105 .b8 109 .b8 118 .b8 115 .b8 107
.b8 111 .b8 99 .b8 112 .b8 117 .b8 105 .b8 122 .b8 52 .b8 97 .b8 106 .b8 102 .b8 114 .b8 106 .b8 102 .b8 99
.b8 119 .b8 121 .b8 115 .b8 51 .b8 111 .b8 112 .b8 121 .b8 114 .b8 100 .b8 118 .b8 53 .b8 51 .b8 120 .b8 102
.b8 112 .b8 104 .b8 106 .b8 52 .b8 53 .b8 55 .b8 54 .b8 113 .b8 120 .b8 55 .b8 46 .b8 112 .b8 121 .b8 0
.b32 .debug_line
// ASCII "/tmp/torchinductor_root/dx" + NUL
.b8 47 .b8 116 .b8 109 .b8 112 .b8 47
.b8 116 .b8 111 .b8 114 .b8 99 .b8 104 .b8 105 .b8 110 .b8 100 .b8 117 .b8 99 .b8 116 .b8 111 .b8 114 .b8 95 .b8 114 .b8 111 .b8 111 .b8 116
.b8 47 .b8 100 .b8 120 .b8 0
.b8 1
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 2
// ASCII "triton__0d1d2d3d4d5d6d7d8de9de" + NUL (appears twice)
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95
.b8 48 .b8 100 .b8 49 .b8 100 .b8 50 .b8 100 .b8 51 .b8 100
.b8 52 .b8 100 .b8 53 .b8 100 .b8 54 .b8 100 .b8 55 .b8 100
.b8 56 .b8 100 .b8 101 .b8 57 .b8 100 .b8 101 .b8 0
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95
.b8 48 .b8 100 .b8 49 .b8 100 .b8 50 .b8 100 .b8 51 .b8 100
.b8 52 .b8 100 .b8 53 .b8 100 .b8 54 .b8 100 .b8 55 .b8 100
.b8 56 .b8 100 .b8 101 .b8 57 .b8 100 .b8 101 .b8 0
.b8 1 .b8 18 .b8 1 .b8 1
.b8 3
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 1 .b8 156
.b32 125
// Records below each carry a [start label, end label) pair from the kernel
// body followed by three bytes; the ranges match the two reduction regions.
.b8 4 .b32 125 .b64 $L__tmp1 .b64 $L__tmp14 .b8 2 .b8 45 .b8 59
.b8 5 .b32 125 .b64 $L__tmp1 .b64 $L__tmp14 .b8 2 .b8 243 .b8 36
.b8 0
.b8 5 .b32 125 .b64 $L__tmp2 .b64 $L__tmp15 .b8 2 .b8 45 .b8 59
.b8 5 .b32 125 .b64 $L__tmp15 .b64 $L__tmp16 .b8 3 .b8 45 .b8 45
.b8 5 .b32 125 .b64 $L__tmp17 .b64 $L__tmp32 .b8 2 .b8 53 .b8 59
.b8 4 .b32 125 .b64 $L__tmp18 .b64 $L__tmp31 .b8 2 .b8 53 .b8 59
.b8 5 .b32 125 .b64 $L__tmp18 .b64 $L__tmp31 .b8 2 .b8 243 .b8 36
.b8 0
.b8 5 .b32 125 .b64 $L__tmp32 .b64 $L__tmp33 .b8 3 .b8 53 .b8 45
.b8 0
.b8 0
	}
	.section .debug_pubnames
	{
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
.b8 2 .b8 0
.b32 .debug_info
.b32 411
.b32 125
// ASCII "triton__0d1d2d3d4d5d6d7d8de9de" + NUL
.b8 116 .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 95
.b8 48 .b8 100 .b8 49 .b8 100 .b8 50 .b8 100 .b8 51 .b8 100
.b8 52 .b8 100 .b8 53 .b8 100 .b8 54 .b8 100 .b8 55 .b8 100
.b8 56 .b8 100 .b8 101 .b8 57 .b8 100 .b8 101 .b8 0
.b32 0
$L__pubNames_end0:
	}
	.section .debug_pubtypes
	{
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
.b8 2 .b8 0
.b32 .debug_info
.b32 411
.b32 0
$L__pubTypes_end0:
	}
	.section .debug_loc { }