0-hero
/

gpt2-pos-encoding-experiment-100B

Model card Files Files and versions Community

0-hero committed on Sep 27, 2024

Commit

e7aa429

verified ·

1 Parent(s): 0eeffdd

Add files using upload-large-folder tool

Browse files

Files changed (11) hide show

.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.llir +85 -0
.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ttgir +152 -0
.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ttir +153 -0
.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.llir +109 -0
.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.llir +600 -0
.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ttgir +169 -0
.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttgir +18 -0
.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ttgir +86 -0
.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.cubin +0 -0
.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.llir +760 -0
.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.llir +839 -0

.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.llir ADDED Viewed

	@@ -0,0 +1,85 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+; Kernel triton__0d1d2de: reads 32-bit floats through %0, converts each one to
+; bf16 with PTX cvt.rn.bf16.f32, and writes the 16-bit results through %1 at the
+; same element index.
+;   %0  global (addrspace 1) pointer, indexed as float
+;   %1  global (addrspace 1) pointer, indexed as i16
+;   %2  i32, not referenced anywhere in the body
+; Each CTA covers 1024 consecutive elements (ctaid.x << 10) and each thread
+; handles 8 of them ((tid.x << 3) & 1016). Every load/store predicate is the
+; constant `i1 true`, so nothing in this function masks out-of-range elements.
+define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
+  ; %9 = ctaid.x * 1024 + ((tid.x * 8) & 1016) is the first element of this thread;
+  ; %10 = %9 | 4 is the start of its second group of four (the low bits are zero,
+  ; so each `or` behaves like an add).
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %5 = shl i32 %4, 3, !dbg !8
+  %6 = and i32 %5, 1016, !dbg !8
+  %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
+  %8 = shl i32 %7, 10, !dbg !10
+  %9 = or i32 %8, %6, !dbg !11
+  %10 = or i32 %9, 4, !dbg !11
+  %11 = sext i32 %9 to i64, !dbg !12
+  %12 = getelementptr float, ptr addrspace(1) %0, i64 %11, !dbg !12
+  %13 = sext i32 %10 to i64, !dbg !12
+  %14 = getelementptr float, ptr addrspace(1) %0, i64 %13, !dbg !12
+  ; First vector load: 4 x b32 from %12. The outputs are zeroed before the
+  ; predicated ld.global.v4.b32; the predicate operand is `i1 true`.
+  %15 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %12, i1 true) #1, !dbg !13
+  %16 = extractvalue { i32, i32, i32, i32 } %15, 0, !dbg !13
+  %17 = extractvalue { i32, i32, i32, i32 } %15, 1, !dbg !13
+  %18 = extractvalue { i32, i32, i32, i32 } %15, 2, !dbg !13
+  %19 = extractvalue { i32, i32, i32, i32 } %15, 3, !dbg !13
+  %20 = bitcast i32 %16 to float, !dbg !13
+  %21 = bitcast i32 %17 to float, !dbg !13
+  %22 = bitcast i32 %18 to float, !dbg !13
+  %23 = bitcast i32 %19 to float, !dbg !13
+  ; Second vector load: the next 4 floats, from %14.
+  %24 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %14, i1 true) #1, !dbg !13
+  %25 = extractvalue { i32, i32, i32, i32 } %24, 0, !dbg !13
+  %26 = extractvalue { i32, i32, i32, i32 } %24, 1, !dbg !13
+  %27 = extractvalue { i32, i32, i32, i32 } %24, 2, !dbg !13
+  %28 = extractvalue { i32, i32, i32, i32 } %24, 3, !dbg !13
+  %29 = bitcast i32 %25 to float, !dbg !13
+  %30 = bitcast i32 %26 to float, !dbg !13
+  %31 = bitcast i32 %27 to float, !dbg !13
+  %32 = bitcast i32 %28 to float, !dbg !13
+  ; Destination address: same element index %11, but with 16-bit elements.
+  %33 = getelementptr i16, ptr addrspace(1) %1, i64 %11, !dbg !14
+  ; Convert the 8 floats to bf16, one cvt.rn.bf16.f32 per value.
+  %34 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %20) #1, !dbg !15
+  %35 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %21) #1, !dbg !15
+  %36 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %22) #1, !dbg !15
+  %37 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %23) #1, !dbg !15
+  %38 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %29) #1, !dbg !15
+  %39 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %30) #1, !dbg !15
+  %40 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %31) #1, !dbg !15
+  %41 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %32) #1, !dbg !15
+  ; Pack consecutive bf16 pairs into <2 x i16> vectors and reinterpret each as
+  ; one i32, giving four 32-bit words for the vector store.
+  %42 = insertelement <2 x i16> undef, i16 %34, i64 0, !dbg !15
+  %43 = insertelement <2 x i16> %42, i16 %35, i64 1, !dbg !15
+  %44 = bitcast <2 x i16> %43 to i32, !dbg !15
+  %45 = insertelement <2 x i16> undef, i16 %36, i64 0, !dbg !15
+  %46 = insertelement <2 x i16> %45, i16 %37, i64 1, !dbg !15
+  %47 = bitcast <2 x i16> %46 to i32, !dbg !15
+  %48 = insertelement <2 x i16> undef, i16 %38, i64 0, !dbg !15
+  %49 = insertelement <2 x i16> %48, i16 %39, i64 1, !dbg !15
+  %50 = bitcast <2 x i16> %49 to i32, !dbg !15
+  %51 = insertelement <2 x i16> undef, i16 %40, i64 0, !dbg !15
+  %52 = insertelement <2 x i16> %51, i16 %41, i64 1, !dbg !15
+  %53 = bitcast <2 x i16> %52 to i32, !dbg !15
+  ; A single st.global.v4.b32 writes all 8 bf16 values; predicate is `i1 true`.
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %44, i32 %47, i32 %50, i32 %53, ptr addrspace(1) %33, i1 true) #1, !dbg !15
+  ret void, !dbg !16
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { nounwind }
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+; !3 tags the function as a kernel and !4 sets maxntidx = 128; the list below
+; names each of the two entries twice.
+!nvvm.annotations = !{!3, !4, !4, !3}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "c5tryp5qwkhreijk7s5x327wofz54lwj4kvctuqdzv2vrf2xyons.py", directory: "/tmp/torchinductor_root/5t")
+!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
+!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+; !8 .. !16: line/column positions in the Python source file !2, referenced by
+; the !dbg attachments in the function body.
+!8 = !DILocation(line: 21, column: 36, scope: !5)
+!9 = !DILocation(line: 20, column: 28, scope: !5)
+!10 = !DILocation(line: 20, column: 33, scope: !5)
+!11 = !DILocation(line: 21, column: 23, scope: !5)
+!12 = !DILocation(line: 24, column: 30, scope: !5)
+!13 = !DILocation(line: 24, column: 35, scope: !5)
+!14 = !DILocation(line: 26, column: 25, scope: !5)
+!15 = !DILocation(line: 26, column: 36, scope: !5)
+!16 = !DILocation(line: 26, column: 4, scope: !5)

.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,152 @@

+// Layouts used below. #blocked spreads a 64x4 tile over 8 warps with one element
+// per thread (8x4 threads per warp). #blocked1 puts all 32 threads of a warp along
+// dim 0; in this module it is only used for the index values that feed tt.assert.
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  // Each program handles 64 rows (program_id * 64 + [0, 64)) of 256 columns,
+  // 4 columns per loop step. With r = row id and c = column:
+  //   idx[r]  = %arg0[r], negative values wrapped by +50257, asserted in [0, 50257)
+  //   x[r, c] = %arg1[idx[r] * 256 + c] + %arg2[(r rem 512) * 256 + c] + f32(%arg3[r * 256 + c])
+  // Pass 1 keeps a running (mean, M2, count) of x per row and lane (Welford update),
+  // then tt.reduce merges the 4 lanes of each row.
+  // Pass 2 recomputes x and stores
+  //   %arg5[r * 256 + c] = bf16((x - mean) * rsqrt(M2 / 256 + 9.99999974E-6) * %arg4[c])
+  // %arg6 and %arg7 are not referenced in the body.
+  // NOTE(review): the access pattern looks like an embedding gather plus two adds
+  // followed by a weight-only layer norm; the IR carries no names to confirm that.
+  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
+    %cst_0 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
+    %cst_1 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
+    %cst_2 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
+    %cst_3 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
+    %cst_4 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
+    %cst_5 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
+    %c0_i32 = arith.constant 0 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %c256_i32 = arith.constant 256 : i32
+    %cst_6 = arith.constant dense<1.000000e+00> : tensor<64x4xf32, #blocked>
+    %cst_7 = arith.constant 0.000000e+00 : f32
+    %cst_8 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked>
+    %cst_9 = arith.constant dense<0.000000e+00> : tensor<1x4xf32, #blocked>
+    %cst_10 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked>
+    %cst_11 = arith.constant dense<256> : tensor<1x4xi32, #blocked>
+    %cst_12 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
+    %cst_13 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
+    %c64_i32 = arith.constant 64 : i32
+    // Row ids for this program, built once per layout: %8 (#blocked), %9 (#blocked1).
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c64_i32 : i32
+    %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+    %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
+    %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
+    %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
+    %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
+    %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
+    %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
+    // Column lane offsets [0, 4) as a 1x4 tensor.
+    %10 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+    %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x4xi32, #blocked>
+    // Load one i64 index per row from %arg0, unmasked, once per layout.
+    %12 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
+    %13 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
+    %14 = tt.addptr %12, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
+    %15 = tt.addptr %13, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
+    %16 = tt.load %14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
+    %17 = tt.load %15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
+    // Row offsets: %20 = (row rem 512) * 256 into %arg2; %23 = row * 256 into %arg3 and %arg5.
+    %18 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
+    %19 = arith.muli %18, %cst_0 : tensor<64x1xi32, #blocked>
+    %20 = tt.broadcast %19 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked>
+    %21 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
+    %22 = arith.muli %8, %cst_0 : tensor<64x1xi32, #blocked>
+    %23 = tt.broadcast %22 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked>
+    %24 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>, #blocked>
+    // Wrap negative indices by adding 50257 (%29 / %30), then build the bounds
+    // predicate %33 = (0 <= idx) && (idx < 50257) on the #blocked1 copy.
+    %25 = arith.addi %16, %cst_3 : tensor<64x1xi64, #blocked>
+    %26 = arith.addi %17, %cst_4 : tensor<64x1xi64, #blocked1>
+    %27 = arith.cmpi slt, %16, %cst_2 : tensor<64x1xi64, #blocked>
+    %28 = arith.cmpi slt, %17, %cst_5 : tensor<64x1xi64, #blocked1>
+    %29 = arith.select %27, %25, %16 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
+    %30 = arith.select %28, %26, %17 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
+    %31 = arith.cmpi sge, %30, %cst_5 : tensor<64x1xi64, #blocked1>
+    %32 = arith.cmpi slt, %30, %cst_4 : tensor<64x1xi64, #blocked1>
+    %33 = arith.andi %31, %32 : tensor<64x1xi1, #blocked1>
+    // %35 = idx * 256: 64-bit row offset into %arg1.
+    %34 = arith.muli %29, %cst_1 : tensor<64x1xi64, #blocked>
+    %35 = tt.broadcast %34 : (tensor<64x1xi64, #blocked>) -> tensor<64x4xi64, #blocked>
+    %36 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
+    // Pass 1. iter_args (%arg9, %arg10, %arg11) = running (mean, M2, count), all starting at 0.
+    %37:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg9 = %cst_10, %arg10 = %cst_10, %arg11 = %cst_10) -> (tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>)  : i32 {
+      // Columns of this step (%47) and their mask (%48: col < 256). With bounds
+      // 0..256, step 4 and 4 lanes the largest column is 255, so the mask is always true.
+      %46 = tt.splat %arg8 : (i32) -> tensor<1x4xi32, #blocked>
+      %47 = arith.addi %46, %11 : tensor<1x4xi32, #blocked>
+      %48 = arith.cmpi slt, %47, %cst_11 : tensor<1x4xi32, #blocked>
+      %49 = tt.broadcast %47 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked>
+      // %53: %arg2 term.
+      %50 = arith.addi %49, %20 : tensor<64x4xi32, #blocked>
+      %51 = tt.addptr %21, %50 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi32, #blocked>
+      %52 = tt.broadcast %48 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
+      %53 = tt.load %51, %52, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
+      // %57: %arg3 term, widened from bf16 to f32.
+      %54 = arith.addi %49, %23 : tensor<64x4xi32, #blocked>
+      %55 = tt.addptr %24, %54 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi32, #blocked>
+      %56 = tt.load %55, %52, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xbf16, #blocked>
+      %57 = arith.extf %56 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked>
+      // %33 is computed before the loop and does not change; the assert sits inside the loop body.
+      tt.assert %33, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
+      // %62: %arg1 term, addressed with the 64-bit offset idx * 256 + col.
+      %58 = arith.extsi %47 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked>
+      %59 = tt.broadcast %58 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
+      %60 = arith.addi %59, %35 : tensor<64x4xi64, #blocked>
+      %61 = tt.addptr %36, %60 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi64, #blocked>
+      %62 = tt.load %61, %52, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
+      // x = %64 = sum of the three terms.
+      %63 = arith.addf %62, %53 : tensor<64x4xf32, #blocked>
+      %64 = arith.addf %63, %57 : tensor<64x4xf32, #blocked>
+      // Welford update: delta = x - mean; count' = count + 1; mean' = mean + delta / count';
+      // M2' = M2 + delta * (x - mean').
+      %65 = arith.subf %64, %arg9 : tensor<64x4xf32, #blocked>
+      %66 = arith.addf %arg11, %cst_6 : tensor<64x4xf32, #blocked>
+      %67 = arith.divf %65, %66 : tensor<64x4xf32, #blocked>
+      %68 = arith.addf %arg9, %67 : tensor<64x4xf32, #blocked>
+      %69 = arith.subf %64, %68 : tensor<64x4xf32, #blocked>
+      %70 = arith.mulf %65, %69 : tensor<64x4xf32, #blocked>
+      %71 = arith.addf %arg10, %70 : tensor<64x4xf32, #blocked>
+      // Masked-off lanes keep their previous state.
+      %72 = arith.select %52, %68, %arg9 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
+      %73 = arith.select %52, %71, %arg10 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
+      %74 = arith.select %52, %66, %arg11 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
+      scf.yield %72, %73, %74 : tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>
+    }
+    // Merge the 4 per-lane states of each row along axis 1. Block arguments:
+    // (%arg8, %arg9, %arg10) = (mean, M2, count) of one side, (%arg11, %arg12, %arg13) of the other.
+    %38:3 = "tt.reduce"(%37#0, %37#1, %37#2) <{axis = 1 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
+      // delta = mean_b - mean_a; total = count_a + count_b;
+      // w = count_b / total, forced to 0 when total == 0.
+      %46 = arith.subf %arg11, %arg8 : f32
+      %47 = arith.addf %arg10, %arg13 : f32
+      %48 = arith.cmpf oeq, %47, %cst_7 : f32
+      %49 = arith.divf %arg13, %47 : f32
+      %50 = arith.select %48, %cst_7, %49 : f32
+      // mean = mean_a + delta * w; M2 = M2_a + M2_b + delta^2 * count_a * w.
+      %51 = arith.mulf %46, %50 : f32
+      %52 = arith.addf %arg8, %51 : f32
+      %53 = arith.addf %arg9, %arg12 : f32
+      %54 = arith.mulf %46, %46 : f32
+      %55 = arith.mulf %54, %arg10 : f32
+      %56 = arith.mulf %55, %50 : f32
+      %57 = arith.addf %53, %56 : f32
+      tt.reduce.return %52, %57, %47 : f32, f32, f32
+    }) : (tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
+    // %39 / %42 = per-row mean; %40 = per-row M2; %44 = M2 / 256 + 9.99999974E-6.
+    %39 = tt.expand_dims %38#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
+    %40 = tt.expand_dims %38#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
+    %41 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x4x!tt.ptr<f32, 1>, #blocked>
+    %42 = tt.broadcast %39 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
+    %43 = arith.divf %40, %cst_13 : tensor<64x1xf32, #blocked>
+    %44 = arith.addf %43, %cst_12 : tensor<64x1xf32, #blocked>
+    %45 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>, #blocked>
+    // Pass 2: reload the three inputs, normalise, scale by %arg4[col], store bf16 to %arg5.
+    scf.for %arg8 = %c0_i32 to %c256_i32 step %c4_i32  : i32 {
+      %46 = tt.splat %arg8 : (i32) -> tensor<1x4xi32, #blocked>
+      %47 = arith.addi %46, %11 : tensor<1x4xi32, #blocked>
+      %48 = arith.cmpi slt, %47, %cst_11 : tensor<1x4xi32, #blocked>
+      %49 = tt.broadcast %47 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked>
+      %50 = arith.addi %49, %20 : tensor<64x4xi32, #blocked>
+      %51 = tt.addptr %21, %50 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi32, #blocked>
+      %52 = tt.broadcast %48 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
+      %53 = tt.load %51, %52, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
+      %54 = arith.addi %49, %23 : tensor<64x4xi32, #blocked>
+      %55 = tt.addptr %24, %54 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi32, #blocked>
+      %56 = tt.load %55, %52, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16, #blocked>
+      %57 = arith.extf %56 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked>
+      // %59: per-column scale read from %arg4, shared by all 64 rows.
+      %58 = tt.addptr %41, %47 : tensor<1x4x!tt.ptr<f32, 1>, #blocked>, tensor<1x4xi32, #blocked>
+      %59 = tt.load %58, %48, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32, #blocked>
+      tt.assert %33, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
+      %60 = arith.extsi %47 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked>
+      %61 = tt.broadcast %60 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
+      %62 = arith.addi %61, %35 : tensor<64x4xi64, #blocked>
+      %63 = tt.addptr %36, %62 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi64, #blocked>
+      %64 = tt.load %63, %52, %cst_10 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
+      // x (%66) is rebuilt exactly as in pass 1; %67 = x - mean.
+      %65 = arith.addf %64, %53 : tensor<64x4xf32, #blocked>
+      %66 = arith.addf %65, %57 : tensor<64x4xf32, #blocked>
+      %67 = arith.subf %66, %42 : tensor<64x4xf32, #blocked>
+      // rsqrt of the loop-invariant %44, via libdevice __nv_rsqrtf.
+      %68 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
+      %69 = tt.broadcast %68 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
+      %70 = arith.mulf %67, %69 : tensor<64x4xf32, #blocked>
+      %71 = tt.broadcast %59 : (tensor<1x4xf32, #blocked>) -> tensor<64x4xf32, #blocked>
+      %72 = arith.mulf %70, %71 : tensor<64x4xf32, #blocked>
+      // Output uses the same row * 256 + col offset (%54) as the %arg3 read.
+      %73 = tt.addptr %45, %54 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi32, #blocked>
+      %74 = arith.truncf %72 : tensor<64x4xf32, #blocked> to tensor<64x4xbf16, #blocked>
+      tt.store %73, %74, %52 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16, #blocked>
+    }
+    tt.return
+  }
+}

.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ttir ADDED Viewed

	@@ -0,0 +1,153 @@

+module {
+  // Each program handles 64 rows (program_id * 64 + [0, 64)) of 256 columns,
+  // 4 columns per loop step. With r = row id and c = column:
+  //   idx[r]  = %arg0[r], negative values wrapped by +50257, asserted in [0, 50257)
+  //   x[r, c] = %arg1[idx[r] * 256 + c] + %arg2[(r rem 512) * 256 + c] + f32(%arg3[r * 256 + c])
+  // Pass 1 keeps a running (mean, M2, count) of x per row and lane (Welford update),
+  // then tt.reduce merges the 4 lanes of each row.
+  // Pass 2 recomputes x and stores
+  //   %arg5[r * 256 + c] = bf16((x - mean) * rsqrt(M2 / 256 + 9.99999974E-6) * %arg4[c])
+  // %arg6 and %arg7 are not referenced in the body.
+  // NOTE(review): the access pattern looks like an embedding gather plus two adds
+  // followed by a weight-only layer norm; the IR carries no names to confirm that.
+  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<64x4xbf16>
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %cst_1 = arith.constant dense<1.000000e+00> : tensor<64x4xf32>
+    %c256_i32 = arith.constant 256 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_2 = arith.constant dense<256> : tensor<64x1xi64>
+    %cst_3 = arith.constant dense<0> : tensor<64x1xi64>
+    %cst_4 = arith.constant dense<50257> : tensor<64x1xi64>
+    %cst_5 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
+    %cst_6 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
+    %cst_7 = arith.constant dense<0.000000e+00> : tensor<1x4xf32>
+    %cst_8 = arith.constant dense<0.000000e+00> : tensor<64x4xf32>
+    %cst_9 = arith.constant dense<256> : tensor<64x1xi32>
+    %cst_10 = arith.constant dense<256> : tensor<1x4xi32>
+    %cst_11 = arith.constant dense<512> : tensor<64x1xi32>
+    %c64_i32 = arith.constant 64 : i32
+    // %5: row ids for this program, program_id * 64 + [0, 64).
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c64_i32 : i32
+    %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
+    %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
+    %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
+    %5 = arith.addi %4, %3 : tensor<64x1xi32>
+    // %7: column lane offsets [0, 4) as a 1x4 tensor.
+    %6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
+    %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32>
+    // %10: one i64 index per row, loaded unmasked from %arg0.
+    %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
+    %9 = tt.addptr %8, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
+    %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
+    // Row offsets: %13 = (row rem 512) * 256 into %arg2; %16 = row * 256 into %arg3.
+    %11 = arith.remsi %5, %cst_11 : tensor<64x1xi32>
+    %12 = arith.muli %11, %cst_9 : tensor<64x1xi32>
+    %13 = tt.broadcast %12 : (tensor<64x1xi32>) -> tensor<64x4xi32>
+    %14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
+    %15 = arith.muli %5, %cst_9 : tensor<64x1xi32>
+    %16 = tt.broadcast %15 : (tensor<64x1xi32>) -> tensor<64x4xi32>
+    %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
+    // Wrap negative indices by adding 50257 (%20), then build the bounds
+    // predicate %23 = (0 <= idx) && (idx < 50257).
+    %18 = arith.addi %10, %cst_4 : tensor<64x1xi64>
+    %19 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64>
+    %20 = arith.select %19, %18, %10 : tensor<64x1xi1>, tensor<64x1xi64>
+    %21 = arith.cmpi sge, %20, %cst_3 : tensor<64x1xi64>
+    %22 = arith.cmpi slt, %20, %cst_4 : tensor<64x1xi64>
+    %23 = arith.andi %21, %22 : tensor<64x1xi1>
+    // %25 = idx * 256: 64-bit row offset into %arg1.
+    %24 = arith.muli %20, %cst_2 : tensor<64x1xi64>
+    %25 = tt.broadcast %24 : (tensor<64x1xi64>) -> tensor<64x4xi64>
+    %26 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
+    // Pass 1. iter_args (%arg9, %arg10, %arg11) = running (mean, M2, count), all starting at 0.
+    %27:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>)  : i32 {
+      // Columns of this step (%52) and their mask (%53: col < 256). With bounds
+      // 0..256, step 4 and 4 lanes the largest column is 255, so the mask is always true.
+      %51 = tt.splat %arg8 : (i32) -> tensor<1x4xi32>
+      %52 = arith.addi %51, %7 : tensor<1x4xi32>
+      %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x4xi32>
+      %54 = tt.broadcast %52 : (tensor<1x4xi32>) -> tensor<64x4xi32>
+      // %58: %arg2 term.
+      %55 = arith.addi %54, %13 : tensor<64x4xi32>
+      %56 = tt.addptr %14, %55 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
+      %57 = tt.broadcast %53 : (tensor<1x4xi1>) -> tensor<64x4xi1>
+      %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
+      // %62: %arg3 term, widened from bf16 to f32.
+      %59 = arith.addi %54, %16 : tensor<64x4xi32>
+      %60 = tt.addptr %17, %59 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi32>
+      %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xbf16>
+      %62 = arith.extf %61 : tensor<64x4xbf16> to tensor<64x4xf32>
+      // %23 is computed before the loop and does not change; the assert sits inside the loop body.
+      tt.assert %23, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
+      // %67: %arg1 term, addressed with the 64-bit offset idx * 256 + col.
+      %63 = arith.extsi %52 : tensor<1x4xi32> to tensor<1x4xi64>
+      %64 = tt.broadcast %63 : (tensor<1x4xi64>) -> tensor<64x4xi64>
+      %65 = arith.addi %64, %25 : tensor<64x4xi64>
+      %66 = tt.addptr %26, %65 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
+      %67 = tt.load %66, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
+      // x = %69 = sum of the three terms.
+      %68 = arith.addf %67, %58 : tensor<64x4xf32>
+      %69 = arith.addf %68, %62 : tensor<64x4xf32>
+      // Welford update: delta = x - mean; count' = count + 1; mean' = mean + delta / count';
+      // M2' = M2 + delta * (x - mean').
+      %70 = arith.subf %69, %arg9 : tensor<64x4xf32>
+      %71 = arith.addf %arg11, %cst_1 : tensor<64x4xf32>
+      %72 = arith.divf %70, %71 : tensor<64x4xf32>
+      %73 = arith.addf %arg9, %72 : tensor<64x4xf32>
+      %74 = arith.subf %69, %73 : tensor<64x4xf32>
+      %75 = arith.mulf %70, %74 : tensor<64x4xf32>
+      %76 = arith.addf %arg10, %75 : tensor<64x4xf32>
+      // Masked-off lanes keep their previous state.
+      %77 = arith.select %57, %73, %arg9 : tensor<64x4xi1>, tensor<64x4xf32>
+      %78 = arith.select %57, %76, %arg10 : tensor<64x4xi1>, tensor<64x4xf32>
+      %79 = arith.select %57, %71, %arg11 : tensor<64x4xi1>, tensor<64x4xf32>
+      scf.yield %77, %78, %79 : tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>
+    }
+    // Merge the 4 per-lane states of each row along axis 1. Block arguments:
+    // (%arg8, %arg9, %arg10) = (mean, M2, count) of one side, (%arg11, %arg12, %arg13) of the other.
+    %28:3 = "tt.reduce"(%27#0, %27#1, %27#2) <{axis = 1 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
+      // delta = mean_b - mean_a; total = count_a + count_b;
+      // w = count_b / total, forced to 0 when total == 0.
+      %51 = arith.subf %arg11, %arg8 : f32
+      %52 = arith.addf %arg10, %arg13 : f32
+      %53 = arith.cmpf oeq, %52, %cst_0 : f32
+      %54 = arith.divf %arg13, %52 : f32
+      %55 = arith.select %53, %cst_0, %54 : f32
+      // mean = mean_a + delta * w; M2 = M2_a + M2_b + delta^2 * count_a * w.
+      %56 = arith.mulf %51, %55 : f32
+      %57 = arith.addf %arg8, %56 : f32
+      %58 = arith.addf %arg9, %arg12 : f32
+      %59 = arith.mulf %51, %51 : f32
+      %60 = arith.mulf %59, %arg10 : f32
+      %61 = arith.mulf %60, %55 : f32
+      %62 = arith.addf %58, %61 : f32
+      tt.reduce.return %57, %62, %52 : f32, f32, f32
+    }) : (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
+    // %29 = per-row mean; %30 = per-row M2.
+    %29 = tt.expand_dims %28#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
+    %30 = tt.expand_dims %28#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
+    // %31 .. %46 repeat, operation for operation, the offset / index-wrap / bounds
+    // computations already done in %12 .. %26, this time for use by pass 2.
+    %31 = arith.muli %11, %cst_9 : tensor<64x1xi32>
+    %32 = tt.broadcast %31 : (tensor<64x1xi32>) -> tensor<64x4xi32>
+    %33 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
+    %34 = arith.muli %5, %cst_9 : tensor<64x1xi32>
+    %35 = tt.broadcast %34 : (tensor<64x1xi32>) -> tensor<64x4xi32>
+    %36 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
+    %37 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x4x!tt.ptr<f32, 1>>
+    %38 = arith.addi %10, %cst_4 : tensor<64x1xi64>
+    %39 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64>
+    %40 = arith.select %39, %38, %10 : tensor<64x1xi1>, tensor<64x1xi64>
+    %41 = arith.cmpi sge, %40, %cst_3 : tensor<64x1xi64>
+    %42 = arith.cmpi slt, %40, %cst_4 : tensor<64x1xi64>
+    %43 = arith.andi %41, %42 : tensor<64x1xi1>
+    %44 = arith.muli %40, %cst_2 : tensor<64x1xi64>
+    %45 = tt.broadcast %44 : (tensor<64x1xi64>) -> tensor<64x4xi64>
+    %46 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
+    // %47 = mean broadcast to 64x4; %49 = M2 / 256 + 9.99999974E-6.
+    %47 = tt.broadcast %29 : (tensor<64x1xf32>) -> tensor<64x4xf32>
+    %48 = arith.divf %30, %cst_6 : tensor<64x1xf32>
+    %49 = arith.addf %48, %cst_5 : tensor<64x1xf32>
+    %50 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
+    // Pass 2: reload the three inputs, normalise, scale by %arg4[col], store bf16 to %arg5.
+    scf.for %arg8 = %c0_i32 to %c256_i32 step %c4_i32  : i32 {
+      %51 = tt.splat %arg8 : (i32) -> tensor<1x4xi32>
+      %52 = arith.addi %51, %7 : tensor<1x4xi32>
+      %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x4xi32>
+      %54 = tt.broadcast %52 : (tensor<1x4xi32>) -> tensor<64x4xi32>
+      %55 = arith.addi %54, %32 : tensor<64x4xi32>
+      %56 = tt.addptr %33, %55 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
+      %57 = tt.broadcast %53 : (tensor<1x4xi1>) -> tensor<64x4xi1>
+      %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
+      %59 = arith.addi %54, %35 : tensor<64x4xi32>
+      %60 = tt.addptr %36, %59 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi32>
+      %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16>
+      %62 = arith.extf %61 : tensor<64x4xbf16> to tensor<64x4xf32>
+      // %64: per-column scale read from %arg4, shared by all 64 rows.
+      %63 = tt.addptr %37, %52 : tensor<1x4x!tt.ptr<f32, 1>>, tensor<1x4xi32>
+      %64 = tt.load %63, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32>
+      tt.assert %43, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
+      %65 = arith.extsi %52 : tensor<1x4xi32> to tensor<1x4xi64>
+      %66 = tt.broadcast %65 : (tensor<1x4xi64>) -> tensor<64x4xi64>
+      %67 = arith.addi %66, %45 : tensor<64x4xi64>
+      %68 = tt.addptr %46, %67 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
+      %69 = tt.load %68, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32>
+      // x (%71) is rebuilt exactly as in pass 1; %72 = x - mean.
+      %70 = arith.addf %69, %58 : tensor<64x4xf32>
+      %71 = arith.addf %70, %62 : tensor<64x4xf32>
+      %72 = arith.subf %71, %47 : tensor<64x4xf32>
+      // rsqrt of the loop-invariant %49, via libdevice __nv_rsqrtf.
+      %73 = tt.extern_elementwise %49 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
+      %74 = tt.broadcast %73 : (tensor<64x1xf32>) -> tensor<64x4xf32>
+      %75 = arith.mulf %72, %74 : tensor<64x4xf32>
+      %76 = tt.broadcast %64 : (tensor<1x4xf32>) -> tensor<64x4xf32>
+      %77 = arith.mulf %75, %76 : tensor<64x4xf32>
+      // Output uses the same row * 256 + col offset (%59) as the %arg3 read.
+      %78 = tt.addptr %50, %59 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi32>
+      %79 = arith.truncf %77 : tensor<64x4xf32> to tensor<64x4xbf16>
+      tt.store %78, %79, %57 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16>
+    }
+    tt.return
+  }
+}

.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.llir ADDED Viewed

	@@ -0,0 +1,109 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+; Kernel triton__0d1d2d34e: sums 8 floats read through %1 and 8 i64 values read
+; through %2 (both at index tid.x & 7), then stores
+; float_sum / (float)int_sum through %0.
+;   %0      global (addrspace 1) pointer, receives one 32-bit value
+;   %1      global (addrspace 1) pointer, indexed as float
+;   %2      global (addrspace 1) pointer, indexed as i64
+;   %3, %4  i32, not referenced anywhere in the body
+; The sums use butterfly shuffles with lane masks 4, 2, 1, so every lane ends up
+; holding the total of its 8-lane group.
+define void @triton__0d1d2d34e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
+  ; Element index for this thread: tid.x & 7.
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %7 = and i32 %6, 7, !dbg !8
+  %8 = zext nneg i32 %7 to i64, !dbg !9
+  ; Load one float through %1; both predicates are `i1 true` and the fallback value is 0.
+  %9 = getelementptr float, ptr addrspace(1) %1, i64 %8, !dbg !9
+  %10 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %9, i1 true, i32 0, i1 true) #3, !dbg !10
+  %11 = bitcast i32 %10 to float, !dbg !10
+  ; Load one i64 through %2, again with constant-true predicates.
+  %12 = getelementptr i64, ptr addrspace(1) %2, i64 %8, !dbg !11
+  %13 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];\0A\09@!$3 mov.u64 $0, 0x0;", "=l,l,b,b"(ptr addrspace(1) %12, i1 true, i1 true) #3, !dbg !12
+  ; Float sum: three shuffle + fadd rounds (lane masks 4, 2, 1); result in %24.
+  %14 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %10, i32 4, i32 31), !dbg !13
+  %15 = bitcast i32 %14 to float, !dbg !13
+  %16 = fadd float %11, %15, !dbg !17
+  %17 = bitcast float %16 to i32, !dbg !13
+  %18 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %17, i32 2, i32 31), !dbg !13
+  %19 = bitcast i32 %18 to float, !dbg !13
+  %20 = fadd float %16, %19, !dbg !17
+  %21 = bitcast float %20 to i32, !dbg !13
+  %22 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %21, i32 1, i32 31), !dbg !13
+  %23 = bitcast i32 %22 to float, !dbg !13
+  %24 = fadd float %20, %23, !dbg !17
+  ; i64 sum: the shuffle intrinsic moves 32 bits, so each round shuffles the two
+  ; i32 halves separately, reassembles an i64 and adds it. Round 1: lane mask 4.
+  %25 = trunc i64 %13 to i32, !dbg !21
+  %26 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %25, i32 4, i32 31), !dbg !21
+  %bc = bitcast i64 %13 to <2 x i32>, !dbg !21
+  %27 = extractelement <2 x i32> %bc, i64 1, !dbg !21
+  %28 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %27, i32 4, i32 31), !dbg !21
+  %29 = insertelement <2 x i32> undef, i32 %26, i64 0, !dbg !21
+  %30 = insertelement <2 x i32> %29, i32 %28, i64 1, !dbg !21
+  %31 = bitcast <2 x i32> %30 to i64, !dbg !21
+  %32 = add i64 %13, %31, !dbg !23
+  ; Round 2: lane mask 2.
+  %33 = trunc i64 %32 to i32, !dbg !21
+  %34 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %33, i32 2, i32 31), !dbg !21
+  %bc1 = bitcast i64 %32 to <2 x i32>, !dbg !21
+  %35 = extractelement <2 x i32> %bc1, i64 1, !dbg !21
+  %36 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %35, i32 2, i32 31), !dbg !21
+  %37 = insertelement <2 x i32> undef, i32 %34, i64 0, !dbg !21
+  %38 = insertelement <2 x i32> %37, i32 %36, i64 1, !dbg !21
+  %39 = bitcast <2 x i32> %38 to i64, !dbg !21
+  %40 = add i64 %32, %39, !dbg !23
+  ; Round 3: lane mask 1; the total is %48.
+  %41 = trunc i64 %40 to i32, !dbg !21
+  %42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %41, i32 1, i32 31), !dbg !21
+  %bc2 = bitcast i64 %40 to <2 x i32>, !dbg !21
+  %43 = extractelement <2 x i32> %bc2, i64 1, !dbg !21
+  %44 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %43, i32 1, i32 31), !dbg !21
+  %45 = insertelement <2 x i32> undef, i32 %42, i64 0, !dbg !21
+  %46 = insertelement <2 x i32> %45, i32 %44, i64 1, !dbg !21
+  %47 = bitcast <2 x i32> %46 to i64, !dbg !21
+  %48 = add i64 %40, %47, !dbg !23
+  ; Convert the integer total to float and divide with PTX div.full.f32.
+  %49 = sitofp i64 %48 to float, !dbg !26
+  %50 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %24, float %49) #3, !dbg !27
+  tail call void @llvm.nvvm.barrier0(), !dbg !28
+  ; The store is predicated on (tid.x & 63) == 0, so only threads whose id is a
+  ; multiple of 64 write the result through %0.
+  %51 = and i32 %6, 63, !dbg !29
+  %52 = icmp eq i32 %51, 0, !dbg !29
+  %53 = bitcast float %50 to i32, !dbg !29
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %53, ptr addrspace(1) %0, i1 %52) #3, !dbg !29
+  ret void, !dbg !30
+}
+; External NVVM intrinsics called by @triton__0d1d2d34e, followed by the
+; attribute groups (#0-#3) that the declarations and call sites refer to.
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+; Reads the tid.x special register; called once at the top of the kernel.
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+; Butterfly shuffle on i32 values; the kernel calls it with (i32 -1, value, lane mask, i32 31).
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+; Function Attrs: convergent nocallback nounwind
+; Barrier intrinsic; called once, just before the final predicated store.
+declare void @llvm.nvvm.barrier0() #2
+; #0: attributes of the tid.x read (memory(none), speculatable).
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; #1: attributes of the shuffle (convergent, touches only inaccessible memory).
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+; #2: attributes of the barrier (convergent).
+attributes #2 = { convergent nocallback nounwind }
+; #3: attached to every inline-asm call site in the kernel.
+attributes #3 = { nounwind }
+; Module-level metadata: module flags, debug compile unit, and NVVM annotations.
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+; The annotation list references !3 and !4 twice each, as emitted.
+!nvvm.annotations = !{!3, !4, !4, !3}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+; !2: the Python file that the kernel's debug locations refer to.
+!2 = !DIFile(filename: "c7zrzealf5bsn7qskl6y72zb73mh5bzf6uskuswp33lv4y5kk64w.py", directory: "/tmp/torchinductor_root/7z")
+; !3 tags @triton__0d1d2d34e with "kernel" = 1; !4 tags it with "maxntidx" = 64.
+!3 = !{ptr @triton__0d1d2d34e, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2d34e, !"maxntidx", i32 64}
+; !5: debug subprogram for the kernel (declared at line 18 of !2); its type list !7 is empty.
+!5 = distinct !DISubprogram(name: "triton__0d1d2d34e", linkageName: "triton__0d1d2d34e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+; !8-!12: locations scoped directly to the kernel (!5): the tid.x read and the two loads.
+!8 = !DILocation(line: 25, column: 34, scope: !5)
+!9 = !DILocation(line: 28, column: 30, scope: !5)
+!10 = !DILocation(line: 28, column: 35, scope: !5)
+!11 = !DILocation(line: 29, column: 30, scope: !5)
+!12 = !DILocation(line: 29, column: 35, scope: !5)
+; !13-!20: locations for the float sum. The scopes (!14, !18) use file !15
+; (standard.py), and the inlinedAt chains end at line 32, column 24.
+!13 = !DILocation(line: 243, column: 36, scope: !14, inlinedAt: !16)
+!14 = distinct !DILexicalBlockFile(scope: !5, file: !15, discriminator: 0)
+!15 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!16 = !DILocation(line: 32, column: 24, scope: !14)
+!17 = !DILocation(line: 233, column: 15, scope: !18, inlinedAt: !19)
+!18 = distinct !DILexicalBlockFile(scope: !14, file: !15, discriminator: 0)
+!19 = !DILocation(line: 243, column: 36, scope: !18, inlinedAt: !20)
+!20 = !DILocation(line: 32, column: 24, scope: !18)
+; !21-!25: the same structure for the i64 sum; the inlinedAt chains end at line 35, column 24.
+!21 = !DILocation(line: 243, column: 36, scope: !14, inlinedAt: !22)
+!22 = !DILocation(line: 35, column: 24, scope: !14)
+!23 = !DILocation(line: 233, column: 15, scope: !18, inlinedAt: !24)
+!24 = !DILocation(line: 243, column: 36, scope: !18, inlinedAt: !25)
+!25 = !DILocation(line: 35, column: 24, scope: !18)
+; !26-!30: int-to-float conversion, division, barrier, predicated store, and return.
+!26 = !DILocation(line: 36, column: 20, scope: !5)
+!27 = !DILocation(line: 37, column: 19, scope: !5)
+!28 = !DILocation(line: 38, column: 4, scope: !5)
+!29 = !DILocation(line: 39, column: 71, scope: !5)
+!30 = !DILocation(line: 39, column: 4, scope: !5)

.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.llir ADDED Viewed

	@@ -0,0 +1,600 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+; Constant strings passed to @__assertfail by the kernel below: function name,
+; file path, and message for each of the two bounds checks (suffix _0 and _1).
+; NOTE(review): none of these arrays includes a trailing \00 (the array lengths
+; equal the visible character counts); confirm that @__assertfail tolerates this.
+@assertFunc_1 = internal constant [8 x i8] c"<module>"
+@assertFile_1 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py"
+@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
+@assertFunc_0 = internal constant [8 x i8] c"<module>"
+@assertFile_0 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py"
+@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
+; Externally defined, zero-length byte array in addrspace(3); the kernel indexes
+; it as its scratch buffer for the cross-thread exchange steps.
+@global_smem = external addrspace(3) global [0 x i8]
+; NUL-terminated name "__CUDA_FTZ"; passed to @__nvvm_reflect to choose between
+; the ftz and non-ftz rsqrt intrinsics.
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+; Assertion-failure handler; call sites pass (message, file, i32 1892, function, i64 1).
+declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
+define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %10 = and i32 %9, 31, !dbg !10
+  %11 = lshr i32 %9, 6, !dbg !10
+  %12 = and i32 %11, 1, !dbg !10
+  %13 = and i32 %9, 1, !dbg !10
+  %urem = shl i32 %9, 1, !dbg !11
+  %14 = and i32 %urem, 126, !dbg !11
+  %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
+  %16 = shl i32 %15, 1, !dbg !13
+  %17 = or i32 %16, %12, !dbg !14
+  %18 = or i32 %16, %13, !dbg !14
+  %19 = sext i32 %17 to i64, !dbg !15
+  %20 = getelementptr i64, ptr addrspace(1) %0, i64 %19, !dbg !15
+  %21 = sext i32 %18 to i64, !dbg !15
+  %22 = getelementptr i64, ptr addrspace(1) %0, i64 %21, !dbg !15
+  %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
+  %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
+  %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
+  %26 = srem i32 %17, 512, !dbg !17
+  %27 = shl nsw i32 %26, 8, !dbg !18
+  %28 = shl i32 %17, 8, !dbg !19
+  %29 = add i64 %25, 50257, !dbg !20
+  %30 = icmp slt i64 %23, 0, !dbg !21
+  %31 = icmp slt i64 %25, 0, !dbg !21
+  %32 = select i1 %31, i64 %29, i64 %25, !dbg !22
+  %33 = icmp ugt i64 %32, 50256, !dbg !23
+  %34 = shl i64 %23, 8, !dbg !24
+  %35 = add i64 %34, 12865792, !dbg !24
+  %36 = select i1 %30, i64 %35, i64 %34, !dbg !24
+  %37 = getelementptr float, ptr addrspace(1) %1, i64 %36
+  %38 = or i32 %14, %27, !dbg !25
+  %39 = sext i32 %38 to i64, !dbg !26
+  %40 = getelementptr float, ptr addrspace(1) %2, i64 %39, !dbg !26
+  %41 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %40, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !27
+  %42 = extractvalue { i32, i32 } %41, 0, !dbg !27
+  %43 = extractvalue { i32, i32 } %41, 1, !dbg !27
+  %44 = insertelement <2 x i32> poison, i32 %42, i64 0, !dbg !27
+  %45 = insertelement <2 x i32> %44, i32 %43, i64 1, !dbg !27
+  %46 = bitcast <2 x i32> %45 to <2 x float>, !dbg !27
+  %47 = or i32 %14, %28, !dbg !28
+  %48 = sext i32 %47 to i64, !dbg !29
+  %49 = getelementptr i16, ptr addrspace(1) %3, i64 %48, !dbg !29
+  %50 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %49, i1 true, i32 0, i1 true) #6, !dbg !30
+  %51 = trunc i32 %50 to i16, !dbg !30
+  %extelt.offset2 = lshr i32 %50, 16, !dbg !30
+  %52 = trunc i32 %extelt.offset2 to i16, !dbg !30
+  %53 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %51) #6, !dbg !31
+  %54 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %52) #6, !dbg !31
+  br i1 %33, label %55, label %56, !dbg !32
+55:                                               ; preds = %8
+  tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 1892, ptr nonnull @assertFunc_0, i64 1), !dbg !32
+  br label %56, !dbg !32
+56:                                               ; preds = %55, %8
+  %57 = zext nneg i32 %14 to i64, !dbg !33
+  %58 = getelementptr float, ptr addrspace(1) %37, i64 %57, !dbg !34
+  %59 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %58, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
+  %60 = extractvalue { i32, i32 } %59, 0, !dbg !35
+  %61 = extractvalue { i32, i32 } %59, 1, !dbg !35
+  %62 = insertelement <2 x i32> poison, i32 %60, i64 0, !dbg !35
+  %63 = insertelement <2 x i32> %62, i32 %61, i64 1, !dbg !35
+  %64 = bitcast <2 x i32> %63 to <2 x float>, !dbg !35
+  %65 = fadd <2 x float> %46, %64, !dbg !36
+  %66 = insertelement <2 x float> poison, float %53, i64 0, !dbg !37
+  %67 = insertelement <2 x float> %66, float %54, i64 1, !dbg !37
+  %68 = fadd <2 x float> %67, %65, !dbg !37
+  %69 = extractelement <2 x float> %68, i64 0, !dbg !38
+  %70 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %69, float 1.000000e+00) #6, !dbg !38
+  %71 = extractelement <2 x float> %68, i64 1, !dbg !38
+  %72 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %71, float 1.000000e+00) #6, !dbg !38
+  %73 = insertelement <2 x float> poison, float %70, i64 0, !dbg !42
+  %74 = insertelement <2 x float> %73, float %72, i64 1, !dbg !42
+  %75 = fadd <2 x float> %74, zeroinitializer, !dbg !42
+  %76 = fsub <2 x float> %68, %75, !dbg !43
+  %77 = fmul <2 x float> %68, %76, !dbg !44
+  %78 = fadd <2 x float> %77, zeroinitializer, !dbg !45
+  %79 = or i32 %14, 128, !dbg !46
+  %80 = or i32 %79, %27, !dbg !25
+  %81 = sext i32 %80 to i64, !dbg !26
+  %82 = getelementptr float, ptr addrspace(1) %2, i64 %81, !dbg !26
+  %83 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %82, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !27
+  %84 = extractvalue { i32, i32 } %83, 0, !dbg !27
+  %85 = extractvalue { i32, i32 } %83, 1, !dbg !27
+  %86 = insertelement <2 x i32> poison, i32 %84, i64 0, !dbg !27
+  %87 = insertelement <2 x i32> %86, i32 %85, i64 1, !dbg !27
+  %88 = bitcast <2 x i32> %87 to <2 x float>, !dbg !27
+  %89 = or i32 %79, %28, !dbg !28
+  %90 = sext i32 %89 to i64, !dbg !29
+  %91 = getelementptr i16, ptr addrspace(1) %3, i64 %90, !dbg !29
+  %92 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %91, i1 true, i32 0, i1 true) #6, !dbg !30
+  %93 = trunc i32 %92 to i16, !dbg !30
+  %extelt.offset2.1 = lshr i32 %92, 16, !dbg !30
+  %94 = trunc i32 %extelt.offset2.1 to i16, !dbg !30
+  %95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %93) #6, !dbg !31
+  %96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %94) #6, !dbg !31
+  br i1 %33, label %97, label %98, !dbg !32
+97:                                               ; preds = %56
+  tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 1892, ptr nonnull @assertFunc_0, i64 1), !dbg !32
+  br label %98, !dbg !32
+98:                                               ; preds = %97, %56
+  %99 = zext nneg i32 %79 to i64, !dbg !33
+  %100 = getelementptr float, ptr addrspace(1) %37, i64 %99, !dbg !34
+  %101 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %100, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
+  %102 = extractvalue { i32, i32 } %101, 0, !dbg !35
+  %103 = extractvalue { i32, i32 } %101, 1, !dbg !35
+  %104 = insertelement <2 x i32> poison, i32 %102, i64 0, !dbg !35
+  %105 = insertelement <2 x i32> %104, i32 %103, i64 1, !dbg !35
+  %106 = bitcast <2 x i32> %105 to <2 x float>, !dbg !35
+  %107 = fadd <2 x float> %88, %106, !dbg !36
+  %108 = insertelement <2 x float> poison, float %95, i64 0, !dbg !37
+  %109 = insertelement <2 x float> %108, float %96, i64 1, !dbg !37
+  %110 = fadd <2 x float> %109, %107, !dbg !37
+  %111 = fsub <2 x float> %110, %75, !dbg !47
+  %112 = extractelement <2 x float> %111, i64 0, !dbg !38
+  %113 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %112, float 2.000000e+00) #6, !dbg !38
+  %114 = extractelement <2 x float> %111, i64 1, !dbg !38
+  %115 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %114, float 2.000000e+00) #6, !dbg !38
+  %116 = insertelement <2 x float> poison, float %113, i64 0, !dbg !42
+  %117 = insertelement <2 x float> %116, float %115, i64 1, !dbg !42
+  %118 = fadd <2 x float> %75, %117, !dbg !42
+  %119 = fsub <2 x float> %110, %118, !dbg !43
+  %120 = fmul <2 x float> %111, %119, !dbg !44
+  %121 = fadd <2 x float> %78, %120, !dbg !45
+  %122 = lshr i32 %9, 5, !dbg !10
+  %123 = and i32 %122, 1, !dbg !11
+  %124 = and i32 %9, 127, !dbg !11
+  %125 = zext nneg i32 %124 to i64, !dbg !48
+  %126 = getelementptr float, ptr addrspace(3) @global_smem, i64 %125, !dbg !48
+  store <1 x float> <float 2.000000e+00>, ptr addrspace(3) %126, align 4, !dbg !48
+  %127 = add nuw nsw i32 %124, 130, !dbg !48
+  %128 = zext nneg i32 %127 to i64, !dbg !48
+  %129 = getelementptr float, ptr addrspace(3) @global_smem, i64 %128, !dbg !48
+  store <1 x float> <float 2.000000e+00>, ptr addrspace(3) %129, align 4, !dbg !48
+  tail call void @llvm.nvvm.barrier0(), !dbg !48
+  %130 = mul nuw nsw i32 %12, 130, !dbg !48
+  %131 = add nuw nsw i32 %130, %14, !dbg !48
+  %132 = zext nneg i32 %131 to i64, !dbg !48
+  %133 = getelementptr float, ptr addrspace(3) @global_smem, i64 %132, !dbg !48
+  %134 = load float, ptr addrspace(3) %133, align 8, !dbg !48
+  %135 = getelementptr inbounds <2 x float>, ptr addrspace(3) %133, i64 0, i64 1, !dbg !48
+  %136 = load float, ptr addrspace(3) %135, align 4, !dbg !48
+  tail call void @llvm.nvvm.barrier0(), !dbg !49
+  %137 = extractelement <2 x float> %118, i64 0, !dbg !51
+  %138 = extractelement <2 x float> %118, i64 1, !dbg !55
+  %139 = fsub float %138, %137, !dbg !55
+  %140 = fadd float %134, %136, !dbg !56
+  %141 = fcmp oeq float %140, 0.000000e+00, !dbg !57
+  %142 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %136, float %140) #6, !dbg !58
+  %143 = select i1 %141, float 0.000000e+00, float %142, !dbg !59
+  %144 = fmul float %139, %143, !dbg !60
+  %145 = fadd float %137, %144, !dbg !51
+  %shift = shufflevector <2 x float> %121, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !61
+  %146 = fadd <2 x float> %121, %shift, !dbg !61
+  %147 = extractelement <2 x float> %146, i64 0, !dbg !61
+  %148 = fmul float %139, %139, !dbg !62
+  %149 = fmul float %148, %134, !dbg !63
+  %150 = fmul float %149, %143, !dbg !64
+  %151 = fadd float %147, %150, !dbg !65
+  %152 = bitcast float %145 to i32, !dbg !49
+  %153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 16, i32 31), !dbg !49
+  %154 = bitcast i32 %153 to float, !dbg !49
+  %155 = bitcast float %151 to i32, !dbg !49
+  %156 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %155, i32 16, i32 31), !dbg !49
+  %157 = bitcast i32 %156 to float, !dbg !49
+  %158 = bitcast float %140 to i32, !dbg !49
+  %159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %158, i32 16, i32 31), !dbg !49
+  %160 = bitcast i32 %159 to float, !dbg !49
+  %161 = fsub float %154, %145, !dbg !55
+  %162 = fadd float %140, %160, !dbg !56
+  %163 = fcmp oeq float %162, 0.000000e+00, !dbg !57
+  %164 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %160, float %162) #6, !dbg !58
+  %165 = select i1 %163, float 0.000000e+00, float %164, !dbg !59
+  %166 = fmul float %161, %165, !dbg !60
+  %167 = fadd float %145, %166, !dbg !51
+  %168 = fadd float %151, %157, !dbg !61
+  %169 = fmul float %161, %161, !dbg !62
+  %170 = fmul float %140, %169, !dbg !63
+  %171 = fmul float %170, %165, !dbg !64
+  %172 = fadd float %168, %171, !dbg !65
+  %173 = bitcast float %167 to i32, !dbg !49
+  %174 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %173, i32 8, i32 31), !dbg !49
+  %175 = bitcast i32 %174 to float, !dbg !49
+  %176 = bitcast float %172 to i32, !dbg !49
+  %177 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 8, i32 31), !dbg !49
+  %178 = bitcast i32 %177 to float, !dbg !49
+  %179 = bitcast float %162 to i32, !dbg !49
+  %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 8, i32 31), !dbg !49
+  %181 = bitcast i32 %180 to float, !dbg !49
+  %182 = fsub float %175, %167, !dbg !55
+  %183 = fadd float %162, %181, !dbg !56
+  %184 = fcmp oeq float %183, 0.000000e+00, !dbg !57
+  %185 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %181, float %183) #6, !dbg !58
+  %186 = select i1 %184, float 0.000000e+00, float %185, !dbg !59
+  %187 = fmul float %182, %186, !dbg !60
+  %188 = fadd float %167, %187, !dbg !51
+  %189 = fadd float %172, %178, !dbg !61
+  %190 = fmul float %182, %182, !dbg !62
+  %191 = fmul float %162, %190, !dbg !63
+  %192 = fmul float %186, %191, !dbg !64
+  %193 = fadd float %189, %192, !dbg !65
+  %194 = bitcast float %188 to i32, !dbg !49
+  %195 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %194, i32 4, i32 31), !dbg !49
+  %196 = bitcast i32 %195 to float, !dbg !49
+  %197 = bitcast float %193 to i32, !dbg !49
+  %198 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 4, i32 31), !dbg !49
+  %199 = bitcast i32 %198 to float, !dbg !49
+  %200 = bitcast float %183 to i32, !dbg !49
+  %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 4, i32 31), !dbg !49
+  %202 = bitcast i32 %201 to float, !dbg !49
+  %203 = fsub float %196, %188, !dbg !55
+  %204 = fadd float %183, %202, !dbg !56
+  %205 = fcmp oeq float %204, 0.000000e+00, !dbg !57
+  %206 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %202, float %204) #6, !dbg !58
+  %207 = select i1 %205, float 0.000000e+00, float %206, !dbg !59
+  %208 = fmul float %203, %207, !dbg !60
+  %209 = fadd float %188, %208, !dbg !51
+  %210 = fadd float %193, %199, !dbg !61
+  %211 = fmul float %203, %203, !dbg !62
+  %212 = fmul float %183, %211, !dbg !63
+  %213 = fmul float %207, %212, !dbg !64
+  %214 = fadd float %210, %213, !dbg !65
+  %215 = bitcast float %209 to i32, !dbg !49
+  %216 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %215, i32 2, i32 31), !dbg !49
+  %217 = bitcast i32 %216 to float, !dbg !49
+  %218 = bitcast float %214 to i32, !dbg !49
+  %219 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %218, i32 2, i32 31), !dbg !49
+  %220 = bitcast i32 %219 to float, !dbg !49
+  %221 = bitcast float %204 to i32, !dbg !49
+  %222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 2, i32 31), !dbg !49
+  %223 = bitcast i32 %222 to float, !dbg !49
+  %224 = fsub float %217, %209, !dbg !55
+  %225 = fadd float %204, %223, !dbg !56
+  %226 = fcmp oeq float %225, 0.000000e+00, !dbg !57
+  %227 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %223, float %225) #6, !dbg !58
+  %228 = select i1 %226, float 0.000000e+00, float %227, !dbg !59
+  %229 = fmul float %224, %228, !dbg !60
+  %230 = fadd float %209, %229, !dbg !51
+  %231 = fadd float %214, %220, !dbg !61
+  %232 = fmul float %224, %224, !dbg !62
+  %233 = fmul float %204, %232, !dbg !63
+  %234 = fmul float %228, %233, !dbg !64
+  %235 = fadd float %231, %234, !dbg !65
+  %236 = bitcast float %230 to i32, !dbg !49
+  %237 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %236, i32 1, i32 31), !dbg !49
+  %238 = bitcast i32 %237 to float, !dbg !49
+  %239 = bitcast float %235 to i32, !dbg !49
+  %240 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %239, i32 1, i32 31), !dbg !49
+  %241 = bitcast i32 %240 to float, !dbg !49
+  %242 = bitcast float %225 to i32, !dbg !49
+  %243 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %242, i32 1, i32 31), !dbg !49
+  %244 = bitcast i32 %243 to float, !dbg !49
+  %245 = fsub float %238, %230, !dbg !55
+  %246 = fadd float %225, %244, !dbg !56
+  %247 = fcmp oeq float %246, 0.000000e+00, !dbg !57
+  %248 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %244, float %246) #6, !dbg !58
+  %249 = select i1 %247, float 0.000000e+00, float %248, !dbg !59
+  %250 = fmul float %245, %249, !dbg !60
+  %251 = fadd float %230, %250, !dbg !51
+  %252 = fadd float %235, %241, !dbg !61
+  %253 = fmul float %245, %245, !dbg !62
+  %254 = fmul float %225, %253, !dbg !63
+  %255 = fmul float %249, %254, !dbg !64
+  %256 = fadd float %252, %255, !dbg !65
+  %257 = icmp eq i32 %10, 0, !dbg !49
+  %258 = shl nuw nsw i32 %12, 1, !dbg !49
+  %259 = or i32 %258, %123, !dbg !49
+  %260 = zext nneg i32 %259 to i64, !dbg !49
+  %261 = getelementptr float, ptr addrspace(3) @global_smem, i64 %260, !dbg !49
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %261, float %251, i1 %257) #6, !dbg !49
+  %262 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %260, !dbg !49
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %262, float %256, i1 %257) #6, !dbg !49
+  %263 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %260, !dbg !49
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %263, float %246, i1 %257) #6, !dbg !49
+  tail call void @llvm.nvvm.barrier0(), !dbg !49
+  %264 = icmp slt i32 %9, 4, !dbg !49
+  %265 = sext i32 %9 to i64, !dbg !49
+  %266 = getelementptr float, ptr addrspace(3) @global_smem, i64 %265, !dbg !49
+  %267 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %266, i1 %264) #6, !dbg !49
+  %268 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %265, !dbg !49
+  %269 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %268, i1 %264) #6, !dbg !49
+  %270 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %265, !dbg !49
+  %271 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %270, i1 %264) #6, !dbg !49
+  %272 = bitcast float %267 to i32, !dbg !49
+  %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 1, i32 31), !dbg !49
+  %274 = bitcast i32 %273 to float, !dbg !49
+  %275 = bitcast float %269 to i32, !dbg !49
+  %276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %275, i32 1, i32 31), !dbg !49
+  %277 = bitcast i32 %276 to float, !dbg !49
+  %278 = bitcast float %271 to i32, !dbg !49
+  %279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %278, i32 1, i32 31), !dbg !49
+  %280 = bitcast i32 %279 to float, !dbg !49
+  %281 = fsub float %274, %267, !dbg !55
+  %282 = fadd float %271, %280, !dbg !56
+  %283 = fcmp oeq float %282, 0.000000e+00, !dbg !57
+  %284 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %280, float %282) #6, !dbg !58
+  %285 = select i1 %283, float 0.000000e+00, float %284, !dbg !59
+  %286 = fmul float %281, %285, !dbg !60
+  %287 = fadd float %267, %286, !dbg !51
+  %288 = fadd float %269, %277, !dbg !61
+  %289 = fmul float %281, %281, !dbg !62
+  %290 = fmul float %271, %289, !dbg !63
+  %291 = fmul float %290, %285, !dbg !64
+  %292 = fadd float %288, %291, !dbg !65
+  %293 = icmp eq i32 %13, 0, !dbg !49
+  %294 = and i1 %264, %293, !dbg !49
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %266, float %287, i1 %294) #6, !dbg !49
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %268, float %292, i1 %294) #6, !dbg !49
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %270, float %282, i1 %294) #6, !dbg !49
+  tail call void @llvm.nvvm.barrier0(), !dbg !49
+  %295 = zext nneg i32 %258 to i64, !dbg !49
+  %296 = getelementptr float, ptr addrspace(3) @global_smem, i64 %295, !dbg !49
+  %297 = load float, ptr addrspace(3) %296, align 4, !dbg !49
+  %298 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %295, !dbg !49
+  %299 = load float, ptr addrspace(3) %298, align 4, !dbg !49
+  %300 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %299, float 2.560000e+02) #6, !dbg !66
+  %301 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %299, float 2.560000e+02) #6, !dbg !66
+  %302 = fadd float %300, 0x3EE4F8B580000000, !dbg !67
+  %303 = getelementptr float, ptr addrspace(3) @global_smem, i64 %57
+  %304 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %40, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
+  %305 = extractvalue { i32, i32 } %304, 0, !dbg !68
+  %306 = extractvalue { i32, i32 } %304, 1, !dbg !68
+  %307 = bitcast i32 %305 to float, !dbg !68
+  %308 = bitcast i32 %306 to float, !dbg !68
+  %309 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %49, i1 true, i32 0, i1 true) #6, !dbg !69
+  %310 = trunc i32 %309 to i16, !dbg !69
+  %extelt.offset = lshr i32 %309, 16, !dbg !69
+  %311 = trunc i32 %extelt.offset to i16, !dbg !69
+  %312 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %310) #6, !dbg !70
+  %313 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %311) #6, !dbg !70
+  %314 = getelementptr float, ptr addrspace(1) %4, i64 %125, !dbg !71
+  %315 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %314, i1 true, i32 0, i1 true) #6, !dbg !72
+  br i1 %33, label %316, label %317, !dbg !73
+316:                                              ; preds = %98
+  tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 1892, ptr nonnull @assertFunc_1, i64 1), !dbg !73
+  br label %317, !dbg !73
+317:                                              ; preds = %316, %98
+  %318 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %58, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !74
+  %319 = extractvalue { i32, i32 } %318, 0, !dbg !74
+  %320 = extractvalue { i32, i32 } %318, 1, !dbg !74
+  %321 = bitcast i32 %319 to float, !dbg !74
+  %322 = bitcast i32 %320 to float, !dbg !74
+  %323 = fadd float %307, %321, !dbg !75
+  %324 = fadd float %308, %322, !dbg !75
+  %325 = fadd float %312, %323, !dbg !76
+  %326 = fadd float %313, %324, !dbg !76
+  %327 = fsub float %325, %297, !dbg !77
+  %328 = fsub float %326, %297, !dbg !77
+  %329 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !78
+  %.not.i = icmp eq i32 %329, 0, !dbg !78
+  br i1 %.not.i, label %332, label %330, !dbg !78
+330:                                              ; preds = %317
+  %331 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %302), !dbg !78
+  br label %__nv_rsqrtf.exit, !dbg !78
+332:                                              ; preds = %317
+  %333 = tail call float @llvm.nvvm.rsqrt.approx.f(float %302), !dbg !78
+  br label %__nv_rsqrtf.exit, !dbg !78
+__nv_rsqrtf.exit:                                 ; preds = %330, %332
+  %.0.i = phi float [ %331, %330 ], [ %333, %332 ], !dbg !78
+  %334 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !78
+  %335 = fmul float %327, %.0.i, !dbg !79
+  %336 = fmul float %328, %.0.i, !dbg !79
+  tail call void @llvm.nvvm.barrier0(), !dbg !80
+  store i32 %315, ptr addrspace(3) %126, align 4, !dbg !80
+  tail call void @llvm.nvvm.barrier0(), !dbg !80
+  %337 = load float, ptr addrspace(3) %303, align 8, !dbg !80
+  %338 = getelementptr inbounds <2 x float>, ptr addrspace(3) %303, i64 0, i64 1, !dbg !80
+  %339 = load float, ptr addrspace(3) %338, align 4, !dbg !80
+  %340 = fmul float %335, %337, !dbg !80
+  %341 = fmul float %336, %339, !dbg !80
+  %342 = getelementptr i16, ptr addrspace(1) %5, i64 %48, !dbg !81
+  %343 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %340) #6, !dbg !82
+  %344 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %341) #6, !dbg !82
+  %345 = insertelement <2 x i16> undef, i16 %343, i64 0, !dbg !82
+  %346 = insertelement <2 x i16> %345, i16 %344, i64 1, !dbg !82
+  %347 = bitcast <2 x i16> %346 to i32, !dbg !82
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %347, ptr addrspace(1) %342, i1 true) #6, !dbg !82
+  %348 = or i32 %124, 128, !dbg !83
+  %349 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %82, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
+  %350 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %91, i1 true, i32 0, i1 true) #6, !dbg !69
+  %351 = trunc i32 %350 to i16, !dbg !69
+  %extelt.offset.1 = lshr i32 %350, 16, !dbg !69
+  %352 = trunc i32 %extelt.offset.1 to i16, !dbg !69
+  %353 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %351) #6, !dbg !70
+  %354 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %352) #6, !dbg !70
+  %355 = zext nneg i32 %348 to i64, !dbg !71
+  %356 = getelementptr float, ptr addrspace(1) %4, i64 %355, !dbg !71
+  %357 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %356, i1 true, i32 0, i1 true) #6, !dbg !72
+  br i1 %33, label %358, label %359, !dbg !73
+358:                                              ; preds = %__nv_rsqrtf.exit
+  tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 1892, ptr nonnull @assertFunc_1, i64 1), !dbg !73
+  br label %359, !dbg !73
+359:                                              ; preds = %358, %__nv_rsqrtf.exit
+  %360 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %100, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !74
+  %361 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !78
+  %.not.i.1 = icmp eq i32 %361, 0, !dbg !78
+  br i1 %.not.i.1, label %364, label %362, !dbg !78
+362:                                              ; preds = %359
+  %363 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %302), !dbg !78
+  br label %__nv_rsqrtf.exit.1, !dbg !78
+364:                                              ; preds = %359
+  %365 = tail call float @llvm.nvvm.rsqrt.approx.f(float %302), !dbg !78
+  br label %__nv_rsqrtf.exit.1, !dbg !78
+__nv_rsqrtf.exit.1:                               ; preds = %364, %362
+  %.0.i.1 = phi float [ %363, %362 ], [ %365, %364 ], !dbg !78
+  %366 = extractvalue { i32, i32 } %349, 1, !dbg !68
+  %367 = bitcast i32 %366 to float, !dbg !68
+  %368 = extractvalue { i32, i32 } %360, 1, !dbg !74
+  %369 = bitcast i32 %368 to float, !dbg !74
+  %370 = fadd float %367, %369, !dbg !75
+  %371 = fadd float %354, %370, !dbg !76
+  %372 = fsub float %371, %297, !dbg !77
+  %373 = extractvalue { i32, i32 } %349, 0, !dbg !68
+  %374 = bitcast i32 %373 to float, !dbg !68
+  %375 = extractvalue { i32, i32 } %360, 0, !dbg !74
+  %376 = bitcast i32 %375 to float, !dbg !74
+  %377 = fadd float %374, %376, !dbg !75
+  %378 = fadd float %353, %377, !dbg !76
+  %379 = fsub float %378, %297, !dbg !77
+  %380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !78
+  %381 = fmul float %379, %.0.i.1, !dbg !79
+  %382 = fmul float %372, %.0.i.1, !dbg !79
+  tail call void @llvm.nvvm.barrier0(), !dbg !80
+  store i32 %357, ptr addrspace(3) %126, align 4, !dbg !80
+  tail call void @llvm.nvvm.barrier0(), !dbg !80
+  %383 = load float, ptr addrspace(3) %303, align 8, !dbg !80
+  %384 = load float, ptr addrspace(3) %338, align 4, !dbg !80
+  %385 = fmul float %381, %383, !dbg !80
+  %386 = fmul float %382, %384, !dbg !80
+  %387 = getelementptr i16, ptr addrspace(1) %5, i64 %90, !dbg !81
+  %388 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %385) #6, !dbg !82
+  %389 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %386) #6, !dbg !82
+  %390 = insertelement <2 x i16> undef, i16 %388, i64 0, !dbg !82
+  %391 = insertelement <2 x i16> %390, i16 %389, i64 1, !dbg !82
+  %392 = bitcast <2 x i16> %391 to i32, !dbg !82
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %392, ptr addrspace(1) %387, i1 true) #6, !dbg !82
+  ret void, !dbg !84
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #1
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+; __nv_rsqrtf(x): returns the result of one of two NVVM rsqrt.approx
+; intrinsics applied to %x. The variant is picked at the top of the function
+; from the value returned by @__nvvm_reflect(@.str):
+;   nonzero -> @llvm.nvvm.rsqrt.approx.ftz.f
+;   zero    -> @llvm.nvvm.rsqrt.approx.f
+; NOTE(review): the contents of @.str are not visible in this part of the
+; dump; presumably it is the "__CUDA_FTZ" query — confirm against the global.
+; Function Attrs: alwaysinline nounwind
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
+  %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
+  %.not = icmp eq i32 %1, 0
+  br i1 %.not, label %4, label %2
+; Reflect value != 0: use the .ftz variant.
+2:                                                ; preds = %0
+  %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
+  br label %6
+; Reflect value == 0: use the non-.ftz variant.
+4:                                                ; preds = %0
+  %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
+  br label %6
+; Join point: select whichever intrinsic result was computed.
+6:                                                ; preds = %4, %2
+  %.0 = phi float [ %3, %2 ], [ %5, %4 ]
+  ret float %.0
+}
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { nounwind }
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
+!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 128}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 22, column: 44, scope: !7)
+!11 = !DILocation(line: 24, column: 33, scope: !7)
+!12 = !DILocation(line: 21, column: 28, scope: !7)
+!13 = !DILocation(line: 21, column: 33, scope: !7)
+!14 = !DILocation(line: 22, column: 23, scope: !7)
+!15 = !DILocation(line: 26, column: 30, scope: !7)
+!16 = !DILocation(line: 26, column: 35, scope: !7)
+!17 = !DILocation(line: 27, column: 18, scope: !7)
+!18 = !DILocation(line: 35, column: 44, scope: !7)
+!19 = !DILocation(line: 36, column: 44, scope: !7)
+!20 = !DILocation(line: 37, column: 22, scope: !7)
+!21 = !DILocation(line: 38, column: 22, scope: !7)
+!22 = !DILocation(line: 39, column: 36, scope: !7)
+!23 = !DILocation(line: 40, column: 40, scope: !7)
+!24 = !DILocation(line: 41, column: 44, scope: !7)
+!25 = !DILocation(line: 35, column: 40, scope: !7)
+!26 = !DILocation(line: 35, column: 34, scope: !7)
+!27 = !DILocation(line: 35, column: 50, scope: !7)
+!28 = !DILocation(line: 36, column: 40, scope: !7)
+!29 = !DILocation(line: 36, column: 34, scope: !7)
+!30 = !DILocation(line: 36, column: 50, scope: !7)
+!31 = !DILocation(line: 36, column: 101, scope: !7)
+!32 = !DILocation(line: 40, column: 55, scope: !7)
+!33 = !DILocation(line: 41, column: 40, scope: !7)
+!34 = !DILocation(line: 41, column: 34, scope: !7)
+!35 = !DILocation(line: 41, column: 52, scope: !7)
+!36 = !DILocation(line: 42, column: 22, scope: !7)
+!37 = !DILocation(line: 44, column: 22, scope: !7)
+!38 = !DILocation(line: 98, column: 30, scope: !39, inlinedAt: !41)
+!39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
+!40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!41 = !DILocation(line: 47, column: 41, scope: !39)
+!42 = !DILocation(line: 98, column: 22, scope: !39, inlinedAt: !41)
+!43 = !DILocation(line: 101, column: 30, scope: !39, inlinedAt: !41)
+!44 = !DILocation(line: 101, column: 22, scope: !39, inlinedAt: !41)
+!45 = !DILocation(line: 50, column: 50, scope: !7)
+!46 = !DILocation(line: 32, column: 27, scope: !7)
+!47 = !DILocation(line: 96, column: 20, scope: !39, inlinedAt: !41)
+!48 = !DILocation(line: 31, column: 36, scope: !7)
+!49 = !DILocation(line: 120, column: 46, scope: !39, inlinedAt: !50)
+!50 = !DILocation(line: 53, column: 44, scope: !39)
+!51 = !DILocation(line: 112, column: 17, scope: !52, inlinedAt: !53)
+!52 = distinct !DILexicalBlockFile(scope: !39, file: !40, discriminator: 0)
+!53 = !DILocation(line: 120, column: 46, scope: !52, inlinedAt: !54)
+!54 = !DILocation(line: 53, column: 44, scope: !52)
+!55 = !DILocation(line: 108, column: 21, scope: !52, inlinedAt: !53)
+!56 = !DILocation(line: 109, column: 28, scope: !52, inlinedAt: !53)
+!57 = !DILocation(line: 110, column: 39, scope: !52, inlinedAt: !53)
+!58 = !DILocation(line: 110, column: 60, scope: !52, inlinedAt: !53)
+!59 = !DILocation(line: 110, column: 49, scope: !52, inlinedAt: !53)
+!60 = !DILocation(line: 112, column: 25, scope: !52, inlinedAt: !53)
+!61 = !DILocation(line: 113, column: 15, scope: !52, inlinedAt: !53)
+!62 = !DILocation(line: 113, column: 30, scope: !52, inlinedAt: !53)
+!63 = !DILocation(line: 113, column: 38, scope: !52, inlinedAt: !53)
+!64 = !DILocation(line: 113, column: 49, scope: !52, inlinedAt: !53)
+!65 = !DILocation(line: 113, column: 22, scope: !52, inlinedAt: !53)
+!66 = !DILocation(line: 75, column: 24, scope: !7)
+!67 = !DILocation(line: 77, column: 24, scope: !7)
+!68 = !DILocation(line: 62, column: 51, scope: !7)
+!69 = !DILocation(line: 63, column: 51, scope: !7)
+!70 = !DILocation(line: 63, column: 103, scope: !7)
+!71 = !DILocation(line: 64, column: 35, scope: !7)
+!72 = !DILocation(line: 64, column: 40, scope: !7)
+!73 = !DILocation(line: 68, column: 57, scope: !7)
+!74 = !DILocation(line: 69, column: 54, scope: !7)
+!75 = !DILocation(line: 70, column: 24, scope: !7)
+!76 = !DILocation(line: 72, column: 24, scope: !7)
+!77 = !DILocation(line: 73, column: 24, scope: !7)
+!78 = !DILocation(line: 78, column: 30, scope: !7)
+!79 = !DILocation(line: 79, column: 24, scope: !7)
+!80 = !DILocation(line: 80, column: 24, scope: !7)
+!81 = !DILocation(line: 82, column: 29, scope: !7)
+!82 = !DILocation(line: 82, column: 52, scope: !7)
+!83 = !DILocation(line: 59, column: 27, scope: !7)
+!84 = !DILocation(line: 58, column: 4, scope: !7)

.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,169 @@

+// Triton GPU IR for kernel @triton__0d1d2d3d4d5d6de7de (num-warps = 4, compute-capability = 89).
+// Each program instance handles 2 rows (row = program_id * 2 + {0, 1}) of 256 columns,
+// walked in two chunks of 128 columns, in two passes:
+//   pass 1 accumulates per-row mean and sum of squared deviations of
+//          x = arg1[idx * 256 + col] + arg2[(row % 512) * 256 + col] + f32(arg3[row * 256 + col]),
+//          where idx is the i64 loaded from arg0[row] (negative values get 50257 added);
+//   pass 2 recomputes x and stores bf16((x - mean) * rsqrt(m2 / 256 + eps) * arg4[col])
+//          to arg5[row * 256 + col], with eps = %cst_14.
+// NOTE(review): this has the shape of an embedding gather followed by a layer norm —
+// confirm against the generating Inductor source before relying on that reading.
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    // Constants; several values are duplicated once per layout (#blocked / #blocked1 / #blocked2).
+    %cst = arith.constant dense<512> : tensor<2x1xi32, #blocked>
+    %cst_0 = arith.constant dense<256> : tensor<1x128xi32, #blocked>
+    %cst_1 = arith.constant dense<256> : tensor<2x1xi32, #blocked>
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<2x128xf32, #blocked>
+    %cst_3 = arith.constant dense<1.000000e+00> : tensor<2x128xf32, #blocked>
+    %cst_4 = arith.constant dense<256> : tensor<2x1xi64, #blocked>
+    %cst_5 = arith.constant dense<0> : tensor<2x1xi64, #blocked>
+    %cst_6 = arith.constant dense<50257> : tensor<2x1xi64, #blocked>
+    %cst_7 = arith.constant dense<50257> : tensor<2x1xi64, #blocked1>
+    %cst_8 = arith.constant dense<0> : tensor<2x1xi64, #blocked1>
+    %c0_i32 = arith.constant 0 : i32
+    %c128_i32 = arith.constant 128 : i32
+    %c256_i32 = arith.constant 256 : i32
+    %cst_9 = arith.constant dense<1.000000e+00> : tensor<2x128xf32, #blocked2>
+    %cst_10 = arith.constant 0.000000e+00 : f32
+    %cst_11 = arith.constant dense<0.000000e+00> : tensor<1x128xf32, #blocked2>
+    %cst_12 = arith.constant dense<0.000000e+00> : tensor<2x128xf32, #blocked2>
+    %cst_13 = arith.constant dense<256> : tensor<1x128xi32, #blocked2>
+    %cst_14 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32, #blocked>
+    %cst_15 = arith.constant dense<2.560000e+02> : tensor<2x1xf32, #blocked>
+    %cst_16 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16, #blocked>
+    %c2_i32 = arith.constant 2 : i32
+    // Row indices for this program: program_id * 2 + [0, 2), built in two layouts (%8 and %9).
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c2_i32 : i32
+    %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %3 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+    %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xi32, #blocked>
+    %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<2x1xi32, #blocked1>
+    %6 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked>
+    %7 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked1>
+    %8 = arith.addi %6, %4 : tensor<2x1xi32, #blocked>
+    %9 = arith.addi %7, %5 : tensor<2x1xi32, #blocked1>
+    // Column offsets [0, 128) within a chunk, in #blocked (%12) and #blocked2 (%13).
+    %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+    %11 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
+    %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x128xi32, #blocked>
+    %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x128xi32, #blocked2>
+    // Load the per-row i64 index from arg0[row], once per layout (%18 and %19).
+    %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked>
+    %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked1>
+    %16 = tt.addptr %14, %8 : tensor<2x1x!tt.ptr<i64, 1>, #blocked>, tensor<2x1xi32, #blocked>
+    %17 = tt.addptr %15, %9 : tensor<2x1x!tt.ptr<i64, 1>, #blocked1>, tensor<2x1xi32, #blocked1>
+    %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked>
+    %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked1>
+    // Row base offsets: %22 = (row % 512) * 256 for arg2, %25 = row * 256 for arg3 (reused for arg5 in pass 2).
+    %20 = arith.remsi %8, %cst : tensor<2x1xi32, #blocked>
+    %21 = arith.muli %20, %cst_1 : tensor<2x1xi32, #blocked>
+    %22 = tt.broadcast %21 : (tensor<2x1xi32, #blocked>) -> tensor<2x128xi32, #blocked>
+    %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x128x!tt.ptr<f32, 1>, #blocked>
+    %24 = arith.muli %8, %cst_1 : tensor<2x1xi32, #blocked>
+    %25 = tt.broadcast %24 : (tensor<2x1xi32, #blocked>) -> tensor<2x128xi32, #blocked>
+    %26 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<2x128x!tt.ptr<bf16, 1>, #blocked>
+    // Wrap negative indices by adding 50257 (%31, %32); %35 is the 0 <= idx < 50257 predicate used by the asserts.
+    %27 = arith.addi %18, %cst_6 : tensor<2x1xi64, #blocked>
+    %28 = arith.addi %19, %cst_7 : tensor<2x1xi64, #blocked1>
+    %29 = arith.cmpi slt, %18, %cst_5 : tensor<2x1xi64, #blocked>
+    %30 = arith.cmpi slt, %19, %cst_8 : tensor<2x1xi64, #blocked1>
+    %31 = arith.select %29, %27, %18 : tensor<2x1xi1, #blocked>, tensor<2x1xi64, #blocked>
+    %32 = arith.select %30, %28, %19 : tensor<2x1xi1, #blocked1>, tensor<2x1xi64, #blocked1>
+    %33 = arith.cmpi sge, %32, %cst_8 : tensor<2x1xi64, #blocked1>
+    %34 = arith.cmpi slt, %32, %cst_7 : tensor<2x1xi64, #blocked1>
+    %35 = arith.andi %33, %34 : tensor<2x1xi1, #blocked1>
+    // %37 = idx * 256: base offset of the indexed row inside arg1.
+    %36 = arith.muli %31, %cst_4 : tensor<2x1xi64, #blocked>
+    %37 = tt.broadcast %36 : (tensor<2x1xi64, #blocked>) -> tensor<2x128xi64, #blocked>
+    %38 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x128x!tt.ptr<f32, 1>, #blocked>
+    // Pass 1: columns 0..256 in steps of 128. Loop-carried state, all starting at zero:
+    //   %arg9 = running mean, %arg10 = running sum of squared deviations,
+    //   %arg12 = element count (#blocked), %arg11 = the same count kept in #blocked2.
+    %39:4 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 iter_args(%arg9 = %cst_2, %arg10 = %cst_2, %arg11 = %cst_12, %arg12 = %cst_2) -> (tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked2>, tensor<2x128xf32, #blocked>)  : i32 {
+      // Absolute column index for this chunk and the col < 256 mask, per layout.
+      %49 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked>
+      %50 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked2>
+      %51 = arith.addi %49, %12 : tensor<1x128xi32, #blocked>
+      %52 = arith.addi %50, %13 : tensor<1x128xi32, #blocked2>
+      %53 = arith.cmpi slt, %51, %cst_0 : tensor<1x128xi32, #blocked>
+      %54 = arith.cmpi slt, %52, %cst_13 : tensor<1x128xi32, #blocked2>
+      %55 = tt.broadcast %51 : (tensor<1x128xi32, #blocked>) -> tensor<2x128xi32, #blocked>
+      // %60 = arg2[(row % 512) * 256 + col], masked, 0.0 where masked off.
+      %56 = arith.addi %55, %22 : tensor<2x128xi32, #blocked>
+      %57 = tt.addptr %23, %56 : tensor<2x128x!tt.ptr<f32, 1>, #blocked>, tensor<2x128xi32, #blocked>
+      %58 = tt.broadcast %53 : (tensor<1x128xi1, #blocked>) -> tensor<2x128xi1, #blocked>
+      %59 = tt.broadcast %54 : (tensor<1x128xi1, #blocked2>) -> tensor<2x128xi1, #blocked2>
+      %60 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x128xf32, #blocked>
+      // %64 = f32(arg3[row * 256 + col]), masked.
+      %61 = arith.addi %55, %25 : tensor<2x128xi32, #blocked>
+      %62 = tt.addptr %26, %61 : tensor<2x128x!tt.ptr<bf16, 1>, #blocked>, tensor<2x128xi32, #blocked>
+      %63 = tt.load %62, %58, %cst_16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x128xbf16, #blocked>
+      %64 = arith.extf %63 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked>
+      // Bounds check on the wrapped index (%35 was computed before the loop).
+      tt.assert %35, "index out of bounds: 0 <= tmp3 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<2x1xi1, #blocked1>
+      // %69 = arg1[idx * 256 + col], masked; offsets are i64 because idx is i64.
+      %65 = arith.extsi %51 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked>
+      %66 = tt.broadcast %65 : (tensor<1x128xi64, #blocked>) -> tensor<2x128xi64, #blocked>
+      %67 = arith.addi %66, %37 : tensor<2x128xi64, #blocked>
+      %68 = tt.addptr %38, %67 : tensor<2x128x!tt.ptr<f32, 1>, #blocked>, tensor<2x128xi64, #blocked>
+      %69 = tt.load %68, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x128xf32, #blocked>
+      // x = %71 = arg1 value + arg2 value + arg3 value.
+      %70 = arith.addf %69, %60 : tensor<2x128xf32, #blocked>
+      %71 = arith.addf %70, %64 : tensor<2x128xf32, #blocked>
+      // Running-moment update (Welford-style):
+      //   %72 = x - mean_old, %73 = count + 1, %76 = mean_old + %72 / %73,
+      //   %79 = m2_old + %72 * (x - mean_new).
+      %72 = arith.subf %71, %arg9 : tensor<2x128xf32, #blocked>
+      %73 = arith.addf %arg12, %cst_3 : tensor<2x128xf32, #blocked>
+      %74 = arith.addf %arg11, %cst_9 : tensor<2x128xf32, #blocked2>
+      %75 = arith.divf %72, %73 : tensor<2x128xf32, #blocked>
+      %76 = arith.addf %arg9, %75 : tensor<2x128xf32, #blocked>
+      %77 = arith.subf %71, %76 : tensor<2x128xf32, #blocked>
+      %78 = arith.mulf %72, %77 : tensor<2x128xf32, #blocked>
+      %79 = arith.addf %arg10, %78 : tensor<2x128xf32, #blocked>
+      // Masked-off lanes keep their previous accumulator values.
+      %80 = arith.select %58, %76, %arg9 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked>
+      %81 = arith.select %58, %79, %arg10 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked>
+      %82 = arith.select %58, %73, %arg12 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked>
+      %83 = arith.select %59, %74, %arg11 : tensor<2x128xi1, #blocked2>, tensor<2x128xf32, #blocked2>
+      scf.yield %80, %81, %83, %82 : tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked2>, tensor<2x128xf32, #blocked>
+    }
+    // The count fed to the reduction is the #blocked2 copy (%39#2), converted to #blocked.
+    %40 = triton_gpu.convert_layout %39#2 : (tensor<2x128xf32, #blocked2>) -> tensor<2x128xf32, #blocked>
+    // Reduce (mean, m2, weight) triples along axis 1. In the combiner, (%arg8, %arg9, %arg10)
+    // is one operand triple and (%arg11, %arg12, %arg13) the other:
+    //   %50 = summed weight, %53 = %arg13 / %50 (forced to 0 when %50 == 0),
+    //   %55 = %arg8 + (%arg11 - %arg8) * %53,
+    //   %60 = %arg9 + %arg12 + (%arg11 - %arg8)^2 * %arg10 * %53.
+    %41:3 = "tt.reduce"(%39#0, %39#1, %40) <{axis = 1 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
+      %49 = arith.subf %arg11, %arg8 : f32
+      %50 = arith.addf %arg10, %arg13 : f32
+      %51 = arith.cmpf oeq, %50, %cst_10 : f32
+      %52 = arith.divf %arg13, %50 : f32
+      %53 = arith.select %51, %cst_10, %52 : f32
+      %54 = arith.mulf %49, %53 : f32
+      %55 = arith.addf %arg8, %54 : f32
+      %56 = arith.addf %arg9, %arg12 : f32
+      %57 = arith.mulf %49, %49 : f32
+      %58 = arith.mulf %57, %arg10 : f32
+      %59 = arith.mulf %58, %53 : f32
+      %60 = arith.addf %56, %59 : f32
+      tt.reduce.return %55, %60, %50 : f32, f32, f32
+    }) : (tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked>) -> (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
+    // %42 / %45 = per-row mean; %43 = per-row m2; %47 = m2 / 256 + %cst_14.
+    %42 = tt.expand_dims %41#0 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
+    %43 = tt.expand_dims %41#1 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
+    %44 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x128x!tt.ptr<f32, 1>, #blocked2>
+    %45 = tt.broadcast %42 : (tensor<2x1xf32, #blocked>) -> tensor<2x128xf32, #blocked>
+    %46 = arith.divf %43, %cst_15 : tensor<2x1xf32, #blocked>
+    %47 = arith.addf %46, %cst_14 : tensor<2x1xf32, #blocked>
+    %48 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<2x128x!tt.ptr<bf16, 1>, #blocked>
+    // Pass 2: reload the same three inputs, normalise, scale by arg4[col], store bf16 to arg5.
+    scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32  : i32 {
+      %49 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked>
+      %50 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked2>
+      %51 = arith.addi %49, %12 : tensor<1x128xi32, #blocked>
+      %52 = arith.addi %50, %13 : tensor<1x128xi32, #blocked2>
+      %53 = arith.cmpi slt, %51, %cst_0 : tensor<1x128xi32, #blocked>
+      %54 = arith.cmpi slt, %52, %cst_13 : tensor<1x128xi32, #blocked2>
+      %55 = tt.broadcast %51 : (tensor<1x128xi32, #blocked>) -> tensor<2x128xi32, #blocked>
+      // %59 = arg2[(row % 512) * 256 + col], masked.
+      %56 = arith.addi %55, %22 : tensor<2x128xi32, #blocked>
+      %57 = tt.addptr %23, %56 : tensor<2x128x!tt.ptr<f32, 1>, #blocked>, tensor<2x128xi32, #blocked>
+      %58 = tt.broadcast %53 : (tensor<1x128xi1, #blocked>) -> tensor<2x128xi1, #blocked>
+      %59 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x128xf32, #blocked>
+      // %63 = f32(arg3[row * 256 + col]), masked; %60 is reused below as the arg5 store offset.
+      %60 = arith.addi %55, %25 : tensor<2x128xi32, #blocked>
+      %61 = tt.addptr %26, %60 : tensor<2x128x!tt.ptr<bf16, 1>, #blocked>, tensor<2x128xi32, #blocked>
+      %62 = tt.load %61, %58, %cst_16 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x128xbf16, #blocked>
+      %63 = arith.extf %62 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked>
+      // %65 = arg4[col], loaded in #blocked2 and converted to #blocked further down (%77).
+      %64 = tt.addptr %44, %52 : tensor<1x128x!tt.ptr<f32, 1>, #blocked2>, tensor<1x128xi32, #blocked2>
+      %65 = tt.load %64, %54, %cst_11 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x128xf32, #blocked2>
+      // Same bounds predicate (%35) as in pass 1.
+      tt.assert %35, "index out of bounds: 0 <= tmp16 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<2x1xi1, #blocked1>
+      // %70 = arg1[idx * 256 + col], masked.
+      %66 = arith.extsi %51 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked>
+      %67 = tt.broadcast %66 : (tensor<1x128xi64, #blocked>) -> tensor<2x128xi64, #blocked>
+      %68 = arith.addi %67, %37 : tensor<2x128xi64, #blocked>
+      %69 = tt.addptr %38, %68 : tensor<2x128x!tt.ptr<f32, 1>, #blocked>, tensor<2x128xi64, #blocked>
+      %70 = tt.load %69, %58, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x128xf32, #blocked>
+      // x = %72; %73 = x - mean.
+      %71 = arith.addf %70, %59 : tensor<2x128xf32, #blocked>
+      %72 = arith.addf %71, %63 : tensor<2x128xf32, #blocked>
+      %73 = arith.subf %72, %45 : tensor<2x128xf32, #blocked>
+      // NOTE(review): %47 does not depend on %arg8, so this __nv_rsqrtf call is loop-invariant as emitted here.
+      %74 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked>) -> tensor<2x1xf32, #blocked>
+      %75 = tt.broadcast %74 : (tensor<2x1xf32, #blocked>) -> tensor<2x128xf32, #blocked>
+      %76 = arith.mulf %73, %75 : tensor<2x128xf32, #blocked>
+      %77 = triton_gpu.convert_layout %65 : (tensor<1x128xf32, #blocked2>) -> tensor<1x128xf32, #blocked>
+      %78 = tt.broadcast %77 : (tensor<1x128xf32, #blocked>) -> tensor<2x128xf32, #blocked>
+      %79 = arith.mulf %76, %78 : tensor<2x128xf32, #blocked>
+      // Truncate to bf16 and store to arg5[row * 256 + col] under the column mask.
+      %80 = tt.addptr %48, %60 : tensor<2x128x!tt.ptr<bf16, 1>, #blocked>, tensor<2x128xi32, #blocked>
+      %81 = arith.truncf %79 : tensor<2x128xf32, #blocked> to tensor<2x128xbf16, #blocked>
+      tt.store %80, %81, %58 {cache = 1 : i32, evict = 1 : i32} : tensor<2x128xbf16, #blocked>
+    }
+    tt.return
+  }
+}

.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,18 @@

+// Triton GPU IR for kernel @triton__0d1de (num-warps = 4, compute-capability = 89).
+// Each program instance covers 256 consecutive indices i = program_id * 256 + [0, 256)
+// and, where i < 512, stores i (sign-extended to i64) to arg0[i].
+#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<512> : tensor<256xi32, #blocked>
+    %c256_i32 = arith.constant 256 : i32
+    // %4 = global element index for each of the 256 lanes of this program.
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c256_i32 : i32
+    %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
+    %3 = tt.splat %1 : (i32) -> tensor<256xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<256xi32, #blocked>
+    // %5 = store mask (index < 512).
+    %5 = arith.cmpi slt, %4, %cst : tensor<256xi32, #blocked>
+    %6 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<256x!tt.ptr<i64, 1>, #blocked>
+    %7 = tt.addptr %6, %4 : tensor<256x!tt.ptr<i64, 1>, #blocked>, tensor<256xi32, #blocked>
+    // The stored value is the index itself, widened to i64.
+    %8 = arith.extsi %4 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked>
+    tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<256xi64, #blocked>
+    tt.return
+  }
+}

.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,86 @@

+// Triton GPU IR for kernel @triton__0d1d2d3d4d5d6d7d8d9d10de11de (num-warps = 2, compute-capability = 89).
+// Each program instance handles one 256-element row at element offset program_id * 256:
+//   s    = arg1 + f32(arg2) + f32(arg3) + f32(arg4) + f32(arg5)   -> stored to arg7 (row offset)
+//   mean = sum(s) / 256                                           -> stored to arg8[program_id]
+//   r    = __nv_rsqrtf(sum((s - mean)^2) / 256 + %cst_0)          -> stored to arg0[program_id]
+//   out  = bf16((s - mean) * r * arg6[col])                       -> stored to arg9 (row offset)
+// NOTE(review): this has the shape of a residual sum followed by a layer norm —
+// confirm against the generating Inductor source before relying on that reading.
+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
+    %cst_0 = arith.constant 9.99999974E-6 : f32
+    %cst_1 = arith.constant 2.560000e+02 : f32
+    %cst_2 = arith.constant 0.000000e+00 : f32
+    %c256_i32 = arith.constant 256 : i32
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
+    // %1 = column index [0, 256); %2 = column < 256 mask; %5 = column + program_id * 256.
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
+    %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
+    %3 = arith.muli %0, %c256_i32 : i32
+    %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
+    %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
+    // Masked loads of the five row inputs; the bf16 ones are widened to f32.
+    %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %17 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %21 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %23 = tt.load %22, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %24 = arith.extf %23 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    // %27 = arg6[col]: indexed by the column only (%1), not by the row offset.
+    %25 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    // %31 = s, the elementwise sum of the five inputs.
+    %28 = arith.addf %8, %12 : tensor<256xf32, #blocked>
+    %29 = arith.addf %28, %16 : tensor<256xf32, #blocked>
+    %30 = arith.addf %29, %20 : tensor<256xf32, #blocked>
+    %31 = arith.addf %30, %24 : tensor<256xf32, #blocked>
+    // Row sum of s (masked lanes contribute 0), then mean = sum / 256 (%35).
+    %32 = arith.select %2, %31, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
+    ^bb0(%arg12: f32, %arg13: f32):
+      %59 = arith.addf %arg12, %arg13 : f32
+      tt.reduce.return %59 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %34 = arith.addf %33, %cst_2 : f32
+    %35 = arith.divf %34, %cst_1 : f32
+    %36 = tt.splat %35 : (f32) -> tensor<1xf32, #blocked1>
+    %37 = tt.splat %35 : (f32) -> tensor<256xf32, #blocked>
+    // %38 = s - mean; row sum of its square, divided by 256, plus %cst_0, gives %44.
+    %38 = arith.subf %31, %37 : tensor<256xf32, #blocked>
+    %39 = arith.mulf %38, %38 : tensor<256xf32, #blocked>
+    %40 = arith.select %2, %39, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %41 = "tt.reduce"(%40) <{axis = 0 : i32}> ({
+    ^bb0(%arg12: f32, %arg13: f32):
+      %59 = arith.addf %arg12, %arg13 : f32
+      tt.reduce.return %59 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %42 = arith.addf %41, %cst_2 : f32
+    %43 = arith.divf %42, %cst_1 : f32
+    %44 = arith.addf %43, %cst_0 : f32
+    // %45 = r, the libdevice rsqrt of %44; %49 = (s - mean) * r * arg6[col].
+    %45 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+    %46 = tt.splat %45 : (f32) -> tensor<1xf32, #blocked1>
+    %47 = tt.splat %45 : (f32) -> tensor<256xf32, #blocked>
+    %48 = arith.mulf %38, %47 : tensor<256xf32, #blocked>
+    %49 = arith.mulf %48, %27 : tensor<256xf32, #blocked>
+    // Store s to arg7 at the row offset.
+    %50 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    tt.store %51, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
+    // NOTE(review): the reason for this barrier is not evident from this dump — presumably arg7 may alias an input buffer; confirm.
+    gpu.barrier
+    // Store r to arg0[program_id].
+    %52 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
+    %53 = tt.splat %52 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
+    tt.store %53, %46 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
+    // Store the bf16-truncated normalised, scaled row to arg9 at the row offset.
+    %54 = tt.splat %arg9 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %55 = tt.addptr %54, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %56 = arith.truncf %49 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
+    tt.store %55, %56, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
+    // Store the mean to arg8[program_id].
+    %57 = tt.addptr %arg8, %0 : !tt.ptr<f32, 1>, i32
+    %58 = tt.splat %57 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
+    tt.store %58, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
+    tt.return
+  }
+}

.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.cubin ADDED Viewed

Binary file (42.6 kB). View file

.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.llir ADDED Viewed

	@@ -0,0 +1,760 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+@assertFunc_1 = internal constant [8 x i8] c"<module>" ; presumably the function-name string for the tmp13 bounds assert; its call site is not in the visible part of the function - TODO confirm
+@assertFile_1 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py" ; presumably the file-name string for the tmp13 bounds assert - TODO confirm
+@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257" ; array length equals the text length, so there is no trailing \00
+@assertFunc_0 = internal constant [8 x i8] c"<module>" ; passed as the 4th argument of @__assertfail on the failing branch of the tmp3 bounds check
+@assertFile_0 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py" ; passed as the 2nd argument of @__assertfail for the tmp3 bounds check
+@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257" ; passed as the 1st argument of @__assertfail; NOTE(review): like the other assert strings it has no \00 terminator, unlike @.str - confirm @__assertfail does not require NUL-terminated strings
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8] ; addrspace(3) (NVPTX shared memory) byte array declared with zero length; the kernel below indexes it as float slots to exchange per-thread values around llvm.nvvm.barrier0
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 ; NUL-terminated; no use appears in the visible part of the module - presumably referenced by inlined libdevice code, TODO confirm
+declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr ; external assert handler; the visible call passes (message, file, i32 1892, function, i64 1)
+define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %9 = lshr i32 %8, 4, !dbg !10
+  %10 = and i32 %9, 15, !dbg !10
+  %11 = and i32 %8, 15, !dbg !10
+  %12 = shl nuw nsw i32 %11, 3, !dbg !11
+  %13 = or i32 %12, 4, !dbg !11
+  %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
+  %15 = shl i32 %14, 4, !dbg !13
+  %16 = or i32 %15, %10, !dbg !14
+  %17 = or i32 %15, %11, !dbg !14
+  %18 = sext i32 %16 to i64, !dbg !15
+  %19 = getelementptr i64, ptr addrspace(1) %0, i64 %18, !dbg !15
+  %20 = sext i32 %17 to i64, !dbg !15
+  %21 = getelementptr i64, ptr addrspace(1) %0, i64 %20, !dbg !15
+  %22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
+  %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
+  %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
+  %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
+  %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
+  %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
+  %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
+  %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
+  %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
+  %31 = srem i32 %16, 512, !dbg !17
+  %32 = shl nsw i32 %31, 8, !dbg !18
+  %33 = add i64 %30, 50257, !dbg !19
+  %34 = icmp slt i64 %22, 0, !dbg !20
+  %35 = icmp slt i64 %30, 0, !dbg !20
+  %36 = select i1 %35, i64 %33, i64 %30, !dbg !21
+  %37 = icmp ugt i64 %36, 50256, !dbg !22
+  %38 = shl i64 %22, 8, !dbg !23
+  %39 = add i64 %38, 12865792, !dbg !23
+  %40 = select i1 %34, i64 %39, i64 %38, !dbg !23
+  %41 = getelementptr float, ptr addrspace(1) %1, i64 %40
+  br label %42, !dbg !24
+42:                                               ; preds = %7, %104
+  %43 = phi float [ 0.000000e+00, %7 ], [ %143, %104 ]
+  %44 = phi float [ 0.000000e+00, %7 ], [ %144, %104 ]
+  %45 = phi float [ 0.000000e+00, %7 ], [ %145, %104 ]
+  %46 = phi float [ 0.000000e+00, %7 ], [ %146, %104 ]
+  %47 = phi float [ 0.000000e+00, %7 ], [ %147, %104 ]
+  %48 = phi float [ 0.000000e+00, %7 ], [ %148, %104 ]
+  %49 = phi float [ 0.000000e+00, %7 ], [ %149, %104 ]
+  %50 = phi float [ 0.000000e+00, %7 ], [ %150, %104 ]
+  %51 = phi float [ 0.000000e+00, %7 ], [ %151, %104 ]
+  %52 = phi float [ 0.000000e+00, %7 ], [ %152, %104 ]
+  %53 = phi float [ 0.000000e+00, %7 ], [ %153, %104 ]
+  %54 = phi float [ 0.000000e+00, %7 ], [ %154, %104 ]
+  %55 = phi float [ 0.000000e+00, %7 ], [ %155, %104 ]
+  %56 = phi float [ 0.000000e+00, %7 ], [ %156, %104 ]
+  %57 = phi float [ 0.000000e+00, %7 ], [ %157, %104 ]
+  %58 = phi float [ 0.000000e+00, %7 ], [ %158, %104 ]
+  %59 = phi float [ 0.000000e+00, %7 ], [ %191, %104 ]
+  %60 = phi float [ 0.000000e+00, %7 ], [ %192, %104 ]
+  %61 = phi float [ 0.000000e+00, %7 ], [ %193, %104 ]
+  %62 = phi float [ 0.000000e+00, %7 ], [ %194, %104 ]
+  %63 = phi float [ 0.000000e+00, %7 ], [ %195, %104 ]
+  %64 = phi float [ 0.000000e+00, %7 ], [ %196, %104 ]
+  %65 = phi float [ 0.000000e+00, %7 ], [ %197, %104 ]
+  %66 = phi float [ 0.000000e+00, %7 ], [ %198, %104 ]
+  %67 = phi float [ 0.000000e+00, %7 ], [ %167, %104 ]
+  %68 = phi float [ 0.000000e+00, %7 ], [ %168, %104 ]
+  %69 = phi float [ 0.000000e+00, %7 ], [ %169, %104 ]
+  %70 = phi float [ 0.000000e+00, %7 ], [ %170, %104 ]
+  %71 = phi float [ 0.000000e+00, %7 ], [ %171, %104 ]
+  %72 = phi float [ 0.000000e+00, %7 ], [ %172, %104 ]
+  %73 = phi float [ 0.000000e+00, %7 ], [ %173, %104 ]
+  %74 = phi float [ 0.000000e+00, %7 ], [ %174, %104 ]
+  %75 = phi i1 [ true, %7 ], [ false, %104 ]
+  %76 = phi i32 [ 0, %7 ], [ 128, %104 ]
+  %77 = or i32 %76, %12, !dbg !25
+  %78 = or i32 %76, %13, !dbg !25
+  %79 = or i32 %77, %32, !dbg !26
+  %80 = or i32 %78, %32, !dbg !26
+  %81 = sext i32 %79 to i64, !dbg !27
+  %82 = getelementptr float, ptr addrspace(1) %2, i64 %81, !dbg !27
+  %83 = sext i32 %80 to i64, !dbg !27
+  %84 = getelementptr float, ptr addrspace(1) %2, i64 %83, !dbg !27
+  %85 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %82, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
+  %86 = extractvalue { i32, i32, i32, i32 } %85, 0, !dbg !28
+  %87 = extractvalue { i32, i32, i32, i32 } %85, 1, !dbg !28
+  %88 = extractvalue { i32, i32, i32, i32 } %85, 2, !dbg !28
+  %89 = extractvalue { i32, i32, i32, i32 } %85, 3, !dbg !28
+  %90 = bitcast i32 %86 to float, !dbg !28
+  %91 = bitcast i32 %87 to float, !dbg !28
+  %92 = bitcast i32 %88 to float, !dbg !28
+  %93 = bitcast i32 %89 to float, !dbg !28
+  %94 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %84, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
+  %95 = extractvalue { i32, i32, i32, i32 } %94, 0, !dbg !28
+  %96 = extractvalue { i32, i32, i32, i32 } %94, 1, !dbg !28
+  %97 = extractvalue { i32, i32, i32, i32 } %94, 2, !dbg !28
+  %98 = extractvalue { i32, i32, i32, i32 } %94, 3, !dbg !28
+  %99 = bitcast i32 %95 to float, !dbg !28
+  %100 = bitcast i32 %96 to float, !dbg !28
+  %101 = bitcast i32 %97 to float, !dbg !28
+  %102 = bitcast i32 %98 to float, !dbg !28
+  br i1 %37, label %103, label %104, !dbg !29
+103:                                              ; preds = %42
+  tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 1892, ptr nonnull @assertFunc_0, i64 1), !dbg !29
+  br label %104, !dbg !29
+104:                                              ; preds = %103, %42
+  %105 = zext nneg i32 %77 to i64, !dbg !30
+  %106 = zext nneg i32 %78 to i64, !dbg !30
+  %107 = getelementptr float, ptr addrspace(1) %41, i64 %105, !dbg !31
+  %108 = getelementptr float, ptr addrspace(1) %41, i64 %106, !dbg !31
+  %109 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %107, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
+  %110 = extractvalue { i32, i32, i32, i32 } %109, 0, !dbg !32
+  %111 = extractvalue { i32, i32, i32, i32 } %109, 1, !dbg !32
+  %112 = extractvalue { i32, i32, i32, i32 } %109, 2, !dbg !32
+  %113 = extractvalue { i32, i32, i32, i32 } %109, 3, !dbg !32
+  %114 = bitcast i32 %110 to float, !dbg !32
+  %115 = bitcast i32 %111 to float, !dbg !32
+  %116 = bitcast i32 %112 to float, !dbg !32
+  %117 = bitcast i32 %113 to float, !dbg !32
+  %118 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %108, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
+  %119 = extractvalue { i32, i32, i32, i32 } %118, 0, !dbg !32
+  %120 = extractvalue { i32, i32, i32, i32 } %118, 1, !dbg !32
+  %121 = extractvalue { i32, i32, i32, i32 } %118, 2, !dbg !32
+  %122 = extractvalue { i32, i32, i32, i32 } %118, 3, !dbg !32
+  %123 = bitcast i32 %119 to float, !dbg !32
+  %124 = bitcast i32 %120 to float, !dbg !32
+  %125 = bitcast i32 %121 to float, !dbg !32
+  %126 = bitcast i32 %122 to float, !dbg !32
+  %127 = fadd float %90, %114, !dbg !33
+  %128 = fadd float %91, %115, !dbg !33
+  %129 = fadd float %92, %116, !dbg !33
+  %130 = fadd float %93, %117, !dbg !33
+  %131 = fadd float %99, %123, !dbg !33
+  %132 = fadd float %100, %124, !dbg !33
+  %133 = fadd float %101, %125, !dbg !33
+  %134 = fadd float %102, %126, !dbg !33
+  %135 = fsub float %127, %67, !dbg !34
+  %136 = fsub float %128, %68, !dbg !34
+  %137 = fsub float %129, %69, !dbg !34
+  %138 = fsub float %130, %70, !dbg !34
+  %139 = fsub float %131, %71, !dbg !34
+  %140 = fsub float %132, %72, !dbg !34
+  %141 = fsub float %133, %73, !dbg !34
+  %142 = fsub float %134, %74, !dbg !34
+  %143 = fadd float %43, 1.000000e+00, !dbg !38
+  %144 = fadd float %44, 1.000000e+00, !dbg !38
+  %145 = fadd float %45, 1.000000e+00, !dbg !38
+  %146 = fadd float %46, 1.000000e+00, !dbg !38
+  %147 = fadd float %47, 1.000000e+00, !dbg !38
+  %148 = fadd float %48, 1.000000e+00, !dbg !38
+  %149 = fadd float %49, 1.000000e+00, !dbg !38
+  %150 = fadd float %50, 1.000000e+00, !dbg !38
+  %151 = fadd float %51, 1.000000e+00, !dbg !38
+  %152 = fadd float %52, 1.000000e+00, !dbg !38
+  %153 = fadd float %53, 1.000000e+00, !dbg !38
+  %154 = fadd float %54, 1.000000e+00, !dbg !38
+  %155 = fadd float %55, 1.000000e+00, !dbg !38
+  %156 = fadd float %56, 1.000000e+00, !dbg !38
+  %157 = fadd float %57, 1.000000e+00, !dbg !38
+  %158 = fadd float %58, 1.000000e+00, !dbg !38
+  %159 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %135, float %143) #6, !dbg !39
+  %160 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %136, float %144) #6, !dbg !39
+  %161 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %137, float %145) #6, !dbg !39
+  %162 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %138, float %146) #6, !dbg !39
+  %163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %139, float %147) #6, !dbg !39
+  %164 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %140, float %148) #6, !dbg !39
+  %165 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %141, float %149) #6, !dbg !39
+  %166 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %142, float %150) #6, !dbg !39
+  %167 = fadd float %67, %159, !dbg !40
+  %168 = fadd float %68, %160, !dbg !40
+  %169 = fadd float %69, %161, !dbg !40
+  %170 = fadd float %70, %162, !dbg !40
+  %171 = fadd float %71, %163, !dbg !40
+  %172 = fadd float %72, %164, !dbg !40
+  %173 = fadd float %73, %165, !dbg !40
+  %174 = fadd float %74, %166, !dbg !40
+  %175 = fsub float %127, %167, !dbg !41
+  %176 = fsub float %128, %168, !dbg !41
+  %177 = fsub float %129, %169, !dbg !41
+  %178 = fsub float %130, %170, !dbg !41
+  %179 = fsub float %131, %171, !dbg !41
+  %180 = fsub float %132, %172, !dbg !41
+  %181 = fsub float %133, %173, !dbg !41
+  %182 = fsub float %134, %174, !dbg !41
+  %183 = fmul float %135, %175, !dbg !42
+  %184 = fmul float %136, %176, !dbg !42
+  %185 = fmul float %137, %177, !dbg !42
+  %186 = fmul float %138, %178, !dbg !42
+  %187 = fmul float %139, %179, !dbg !42
+  %188 = fmul float %140, %180, !dbg !42
+  %189 = fmul float %141, %181, !dbg !42
+  %190 = fmul float %142, %182, !dbg !42
+  %191 = fadd float %59, %183, !dbg !43
+  %192 = fadd float %60, %184, !dbg !43
+  %193 = fadd float %61, %185, !dbg !43
+  %194 = fadd float %62, %186, !dbg !43
+  %195 = fadd float %63, %187, !dbg !43
+  %196 = fadd float %64, %188, !dbg !43
+  %197 = fadd float %65, %189, !dbg !43
+  %198 = fadd float %66, %190, !dbg !43
+  br i1 %75, label %42, label %199, !dbg !24
+199:                                              ; preds = %104
+  %200 = and i32 %8, 127, !dbg !11
+  %201 = and i32 %8, 128, !dbg !24
+  %.not = icmp eq i32 %201, 0, !dbg !24
+  %202 = select i1 %.not, i32 0, i32 136, !dbg !24
+  %203 = add nuw nsw i32 %202, %200, !dbg !24
+  %204 = zext nneg i32 %203 to i64, !dbg !24
+  %205 = getelementptr float, ptr addrspace(3) @global_smem, i64 %204, !dbg !24
+  %206 = insertelement <1 x float> undef, float %151, i64 0, !dbg !24
+  store <1 x float> %206, ptr addrspace(3) %205, align 4, !dbg !24
+  %207 = add nuw nsw i32 %200, 272, !dbg !24
+  %208 = add nuw nsw i32 %207, %202, !dbg !24
+  %209 = zext nneg i32 %208 to i64, !dbg !24
+  %210 = getelementptr float, ptr addrspace(3) @global_smem, i64 %209, !dbg !24
+  %211 = insertelement <1 x float> undef, float %152, i64 0, !dbg !24
+  store <1 x float> %211, ptr addrspace(3) %210, align 4, !dbg !24
+  %212 = add nuw nsw i32 %200, 544, !dbg !24
+  %213 = add nuw nsw i32 %212, %202, !dbg !24
+  %214 = zext nneg i32 %213 to i64, !dbg !24
+  %215 = getelementptr float, ptr addrspace(3) @global_smem, i64 %214, !dbg !24
+  %216 = insertelement <1 x float> undef, float %153, i64 0, !dbg !24
+  store <1 x float> %216, ptr addrspace(3) %215, align 4, !dbg !24
+  %217 = add nuw nsw i32 %200, 816, !dbg !24
+  %218 = add nuw nsw i32 %217, %202, !dbg !24
+  %219 = zext nneg i32 %218 to i64, !dbg !24
+  %220 = getelementptr float, ptr addrspace(3) @global_smem, i64 %219, !dbg !24
+  %221 = insertelement <1 x float> undef, float %154, i64 0, !dbg !24
+  store <1 x float> %221, ptr addrspace(3) %220, align 4, !dbg !24
+  %222 = add nuw nsw i32 %200, 1088, !dbg !24
+  %223 = add nuw nsw i32 %222, %202, !dbg !24
+  %224 = zext nneg i32 %223 to i64, !dbg !24
+  %225 = getelementptr float, ptr addrspace(3) @global_smem, i64 %224, !dbg !24
+  %226 = insertelement <1 x float> undef, float %155, i64 0, !dbg !24
+  store <1 x float> %226, ptr addrspace(3) %225, align 4, !dbg !24
+  %227 = add nuw nsw i32 %200, 1360, !dbg !24
+  %228 = add nuw nsw i32 %227, %202, !dbg !24
+  %229 = zext nneg i32 %228 to i64, !dbg !24
+  %230 = getelementptr float, ptr addrspace(3) @global_smem, i64 %229, !dbg !24
+  %231 = insertelement <1 x float> undef, float %156, i64 0, !dbg !24
+  store <1 x float> %231, ptr addrspace(3) %230, align 4, !dbg !24
+  %232 = add nuw nsw i32 %200, 1632, !dbg !24
+  %233 = add nuw nsw i32 %232, %202, !dbg !24
+  %234 = zext nneg i32 %233 to i64, !dbg !24
+  %235 = getelementptr float, ptr addrspace(3) @global_smem, i64 %234, !dbg !24
+  %236 = insertelement <1 x float> undef, float %157, i64 0, !dbg !24
+  store <1 x float> %236, ptr addrspace(3) %235, align 4, !dbg !24
+  %237 = add nuw nsw i32 %200, 1904, !dbg !24
+  %238 = add nuw nsw i32 %237, %202, !dbg !24
+  %239 = zext nneg i32 %238 to i64, !dbg !24
+  %240 = getelementptr float, ptr addrspace(3) @global_smem, i64 %239, !dbg !24
+  %241 = insertelement <1 x float> undef, float %158, i64 0, !dbg !24
+  store <1 x float> %241, ptr addrspace(3) %240, align 4, !dbg !24
+  tail call void @llvm.nvvm.barrier0(), !dbg !24
+  %242 = mul nuw nsw i32 %10, 136, !dbg !24
+  %243 = add nuw nsw i32 %242, %12, !dbg !24
+  %244 = zext nneg i32 %243 to i64, !dbg !24
+  %245 = getelementptr float, ptr addrspace(3) @global_smem, i64 %244, !dbg !24
+  %246 = load float, ptr addrspace(3) %245, align 32, !dbg !24
+  %247 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 1, !dbg !24
+  %248 = load float, ptr addrspace(3) %247, align 4, !dbg !24
+  %249 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 2, !dbg !24
+  %250 = load float, ptr addrspace(3) %249, align 8, !dbg !24
+  %251 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 3, !dbg !24
+  %252 = load float, ptr addrspace(3) %251, align 4, !dbg !24
+  %253 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 4, !dbg !24
+  %254 = load float, ptr addrspace(3) %253, align 16, !dbg !24
+  %255 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 5, !dbg !24
+  %256 = load float, ptr addrspace(3) %255, align 4, !dbg !24
+  %257 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 6, !dbg !24
+  %258 = load float, ptr addrspace(3) %257, align 8, !dbg !24
+  %259 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 7, !dbg !24
+  %260 = load float, ptr addrspace(3) %259, align 4, !dbg !24
+  %261 = fsub float %168, %167, !dbg !44
+  %262 = fadd float %246, %248, !dbg !48
+  %263 = fcmp oeq float %262, 0.000000e+00, !dbg !49
+  %264 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %248, float %262) #6, !dbg !50
+  %265 = select i1 %263, float 0.000000e+00, float %264, !dbg !51
+  %266 = fmul float %261, %265, !dbg !52
+  %267 = fadd float %167, %266, !dbg !53
+  %268 = fadd float %191, %192, !dbg !54
+  %269 = fmul float %261, %261, !dbg !55
+  %270 = fmul float %269, %246, !dbg !56
+  %271 = fmul float %270, %265, !dbg !57
+  %272 = fadd float %268, %271, !dbg !58
+  %273 = fsub float %169, %267, !dbg !44
+  %274 = fadd float %250, %262, !dbg !48
+  %275 = fcmp oeq float %274, 0.000000e+00, !dbg !49
+  %276 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %250, float %274) #6, !dbg !50
+  %277 = select i1 %275, float 0.000000e+00, float %276, !dbg !51
+  %278 = fmul float %277, %273, !dbg !52
+  %279 = fadd float %267, %278, !dbg !53
+  %280 = fadd float %193, %272, !dbg !54
+  %281 = fmul float %273, %273, !dbg !55
+  %282 = fmul float %262, %281, !dbg !56
+  %283 = fmul float %277, %282, !dbg !57
+  %284 = fadd float %280, %283, !dbg !58
+  %285 = fsub float %170, %279, !dbg !44
+  %286 = fadd float %252, %274, !dbg !48
+  %287 = fcmp oeq float %286, 0.000000e+00, !dbg !49
+  %288 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %252, float %286) #6, !dbg !50
+  %289 = select i1 %287, float 0.000000e+00, float %288, !dbg !51
+  %290 = fmul float %289, %285, !dbg !52
+  %291 = fadd float %279, %290, !dbg !53
+  %292 = fadd float %194, %284, !dbg !54
+  %293 = fmul float %285, %285, !dbg !55
+  %294 = fmul float %274, %293, !dbg !56
+  %295 = fmul float %289, %294, !dbg !57
+  %296 = fadd float %292, %295, !dbg !58
+  %297 = fsub float %171, %291, !dbg !44
+  %298 = fadd float %254, %286, !dbg !48
+  %299 = fcmp oeq float %298, 0.000000e+00, !dbg !49
+  %300 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %254, float %298) #6, !dbg !50
+  %301 = select i1 %299, float 0.000000e+00, float %300, !dbg !51
+  %302 = fmul float %301, %297, !dbg !52
+  %303 = fadd float %291, %302, !dbg !53
+  %304 = fadd float %195, %296, !dbg !54
+  %305 = fmul float %297, %297, !dbg !55
+  %306 = fmul float %286, %305, !dbg !56
+  %307 = fmul float %301, %306, !dbg !57
+  %308 = fadd float %304, %307, !dbg !58
+  %309 = fsub float %172, %303, !dbg !44
+  %310 = fadd float %256, %298, !dbg !48
+  %311 = fcmp oeq float %310, 0.000000e+00, !dbg !49
+  %312 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %256, float %310) #6, !dbg !50
+  %313 = select i1 %311, float 0.000000e+00, float %312, !dbg !51
+  %314 = fmul float %313, %309, !dbg !52
+  %315 = fadd float %303, %314, !dbg !53
+  %316 = fadd float %196, %308, !dbg !54
+  %317 = fmul float %309, %309, !dbg !55
+  %318 = fmul float %298, %317, !dbg !56
+  %319 = fmul float %313, %318, !dbg !57
+  %320 = fadd float %316, %319, !dbg !58
+  %321 = fsub float %173, %315, !dbg !44
+  %322 = fadd float %258, %310, !dbg !48
+  %323 = fcmp oeq float %322, 0.000000e+00, !dbg !49
+  %324 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %258, float %322) #6, !dbg !50
+  %325 = select i1 %323, float 0.000000e+00, float %324, !dbg !51
+  %326 = fmul float %325, %321, !dbg !52
+  %327 = fadd float %315, %326, !dbg !53
+  %328 = fadd float %197, %320, !dbg !54
+  %329 = fmul float %321, %321, !dbg !55
+  %330 = fmul float %310, %329, !dbg !56
+  %331 = fmul float %325, %330, !dbg !57
+  %332 = fadd float %328, %331, !dbg !58
+  %333 = fsub float %174, %327, !dbg !44
+  %334 = fadd float %260, %322, !dbg !48
+  %335 = fcmp oeq float %334, 0.000000e+00, !dbg !49
+  %336 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %260, float %334) #6, !dbg !50
+  %337 = select i1 %335, float 0.000000e+00, float %336, !dbg !51
+  %338 = fmul float %337, %333, !dbg !52
+  %339 = fadd float %327, %338, !dbg !53
+  %340 = fadd float %198, %332, !dbg !54
+  %341 = fmul float %333, %333, !dbg !55
+  %342 = fmul float %322, %341, !dbg !56
+  %343 = fmul float %337, %342, !dbg !57
+  %344 = fadd float %340, %343, !dbg !58
+  %345 = bitcast float %339 to i32, !dbg !59
+  %346 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %345, i32 8, i32 31), !dbg !59
+  %347 = bitcast i32 %346 to float, !dbg !59
+  %348 = bitcast float %344 to i32, !dbg !59
+  %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %348, i32 8, i32 31), !dbg !59
+  %350 = bitcast i32 %349 to float, !dbg !59
+  %351 = bitcast float %334 to i32, !dbg !59
+  %352 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %351, i32 8, i32 31), !dbg !59
+  %353 = bitcast i32 %352 to float, !dbg !59
+  %354 = fsub float %347, %339, !dbg !44
+  %355 = fadd float %334, %353, !dbg !48
+  %356 = fcmp oeq float %355, 0.000000e+00, !dbg !49
+  %357 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %353, float %355) #6, !dbg !50
+  %358 = select i1 %356, float 0.000000e+00, float %357, !dbg !51
+  %359 = fmul float %358, %354, !dbg !52
+  %360 = fadd float %339, %359, !dbg !53
+  %361 = fadd float %344, %350, !dbg !54
+  %362 = fmul float %354, %354, !dbg !55
+  %363 = fmul float %334, %362, !dbg !56
+  %364 = fmul float %358, %363, !dbg !57
+  %365 = fadd float %361, %364, !dbg !58
+  %366 = bitcast float %360 to i32, !dbg !59
+  %367 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %366, i32 4, i32 31), !dbg !59
+  %368 = bitcast i32 %367 to float, !dbg !59
+  %369 = bitcast float %365 to i32, !dbg !59
+  %370 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %369, i32 4, i32 31), !dbg !59
+  %371 = bitcast i32 %370 to float, !dbg !59
+  %372 = bitcast float %355 to i32, !dbg !59
+  %373 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %372, i32 4, i32 31), !dbg !59
+  %374 = bitcast i32 %373 to float, !dbg !59
+  %375 = fsub float %368, %360, !dbg !44
+  %376 = fadd float %355, %374, !dbg !48
+  %377 = fcmp oeq float %376, 0.000000e+00, !dbg !49
+  %378 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %374, float %376) #6, !dbg !50
+  %379 = select i1 %377, float 0.000000e+00, float %378, !dbg !51
+  %380 = fmul float %379, %375, !dbg !52
+  %381 = fadd float %360, %380, !dbg !53
+  %382 = fadd float %365, %371, !dbg !54
+  %383 = fmul float %375, %375, !dbg !55
+  %384 = fmul float %355, %383, !dbg !56
+  %385 = fmul float %379, %384, !dbg !57
+  %386 = fadd float %382, %385, !dbg !58
+  %387 = bitcast float %381 to i32, !dbg !59
+  %388 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %387, i32 2, i32 31), !dbg !59
+  %389 = bitcast i32 %388 to float, !dbg !59
+  %390 = bitcast float %386 to i32, !dbg !59
+  %391 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %390, i32 2, i32 31), !dbg !59
+  %392 = bitcast i32 %391 to float, !dbg !59
+  %393 = bitcast float %376 to i32, !dbg !59
+  %394 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %393, i32 2, i32 31), !dbg !59
+  %395 = bitcast i32 %394 to float, !dbg !59
+  %396 = fsub float %389, %381, !dbg !44
+  %397 = fadd float %376, %395, !dbg !48
+  %398 = fcmp oeq float %397, 0.000000e+00, !dbg !49
+  %399 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %395, float %397) #6, !dbg !50
+  %400 = select i1 %398, float 0.000000e+00, float %399, !dbg !51
+  %401 = fmul float %400, %396, !dbg !52
+  %402 = fadd float %381, %401, !dbg !53
+  %403 = fadd float %386, %392, !dbg !54
+  %404 = fmul float %396, %396, !dbg !55
+  %405 = fmul float %376, %404, !dbg !56
+  %406 = fmul float %400, %405, !dbg !57
+  %407 = fadd float %403, %406, !dbg !58
+  %408 = bitcast float %402 to i32, !dbg !59
+  %409 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %408, i32 1, i32 31), !dbg !59
+  %410 = bitcast i32 %409 to float, !dbg !59
+  %411 = bitcast float %407 to i32, !dbg !59
+  %412 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %411, i32 1, i32 31), !dbg !59
+  %413 = bitcast i32 %412 to float, !dbg !59
+  %414 = bitcast float %397 to i32, !dbg !59
+  %415 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %414, i32 1, i32 31), !dbg !59
+  %416 = bitcast i32 %415 to float, !dbg !59
+  %417 = fsub float %410, %402, !dbg !44
+  %418 = fadd float %397, %416, !dbg !48
+  %419 = fcmp oeq float %418, 0.000000e+00, !dbg !49
+  %420 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %416, float %418) #6, !dbg !50
+  %421 = select i1 %419, float 0.000000e+00, float %420, !dbg !51
+  %422 = fmul float %421, %417, !dbg !52
+  %423 = fadd float %402, %422, !dbg !53
+  %424 = fadd float %407, %413, !dbg !54
+  %425 = fmul float %417, %417, !dbg !55
+  %426 = fmul float %397, %425, !dbg !56
+  %427 = fmul float %421, %426, !dbg !57
+  %428 = fadd float %424, %427, !dbg !58
+  %429 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
+  %430 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
+  %431 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
+  %432 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
+  %433 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
+  %434 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
+  %435 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
+  %436 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
+  %437 = fadd float %429, 0x3EE4F8B580000000, !dbg !62
+  %438 = shl i32 %16, 8, !dbg !63
+  br label %439, !dbg !64
+439:                                              ; preds = %199, %__nv_rsqrtf.exit
+  %440 = phi i1 [ true, %199 ], [ false, %__nv_rsqrtf.exit ]
+  %441 = phi i32 [ 0, %199 ], [ 128, %__nv_rsqrtf.exit ]
+  %442 = or i32 %441, %12, !dbg !65
+  %443 = or i32 %441, %13, !dbg !65
+  %444 = or i32 %442, %32, !dbg !66
+  %445 = or i32 %443, %32, !dbg !66
+  %446 = sext i32 %444 to i64, !dbg !67
+  %447 = getelementptr float, ptr addrspace(1) %2, i64 %446, !dbg !67
+  %448 = sext i32 %445 to i64, !dbg !67
+  %449 = getelementptr float, ptr addrspace(1) %2, i64 %448, !dbg !67
+  %450 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %447, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
+  %451 = extractvalue { i32, i32, i32, i32 } %450, 0, !dbg !68
+  %452 = extractvalue { i32, i32, i32, i32 } %450, 1, !dbg !68
+  %453 = extractvalue { i32, i32, i32, i32 } %450, 2, !dbg !68
+  %454 = extractvalue { i32, i32, i32, i32 } %450, 3, !dbg !68
+  %455 = bitcast i32 %451 to float, !dbg !68
+  %456 = bitcast i32 %452 to float, !dbg !68
+  %457 = bitcast i32 %453 to float, !dbg !68
+  %458 = bitcast i32 %454 to float, !dbg !68
+  %459 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %449, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
+  %460 = extractvalue { i32, i32, i32, i32 } %459, 0, !dbg !68
+  %461 = extractvalue { i32, i32, i32, i32 } %459, 1, !dbg !68
+  %462 = extractvalue { i32, i32, i32, i32 } %459, 2, !dbg !68
+  %463 = extractvalue { i32, i32, i32, i32 } %459, 3, !dbg !68
+  %464 = bitcast i32 %460 to float, !dbg !68
+  %465 = bitcast i32 %461 to float, !dbg !68
+  %466 = bitcast i32 %462 to float, !dbg !68
+  %467 = bitcast i32 %463 to float, !dbg !68
+  %468 = zext nneg i32 %442 to i64, !dbg !69
+  %469 = getelementptr float, ptr addrspace(1) %3, i64 %468, !dbg !69
+  %470 = zext nneg i32 %443 to i64, !dbg !69
+  %471 = getelementptr float, ptr addrspace(1) %3, i64 %470, !dbg !69
+  %472 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %469, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
+  %473 = extractvalue { i32, i32, i32, i32 } %472, 0, !dbg !70
+  %474 = extractvalue { i32, i32, i32, i32 } %472, 1, !dbg !70
+  %475 = extractvalue { i32, i32, i32, i32 } %472, 2, !dbg !70
+  %476 = extractvalue { i32, i32, i32, i32 } %472, 3, !dbg !70
+  %477 = bitcast i32 %473 to float, !dbg !70
+  %478 = bitcast i32 %474 to float, !dbg !70
+  %479 = bitcast i32 %475 to float, !dbg !70
+  %480 = bitcast i32 %476 to float, !dbg !70
+  %481 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %471, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
+  %482 = extractvalue { i32, i32, i32, i32 } %481, 0, !dbg !70
+  %483 = extractvalue { i32, i32, i32, i32 } %481, 1, !dbg !70
+  %484 = extractvalue { i32, i32, i32, i32 } %481, 2, !dbg !70
+  %485 = extractvalue { i32, i32, i32, i32 } %481, 3, !dbg !70
+  %486 = bitcast i32 %482 to float, !dbg !70
+  %487 = bitcast i32 %483 to float, !dbg !70
+  %488 = bitcast i32 %484 to float, !dbg !70
+  %489 = bitcast i32 %485 to float, !dbg !70
+  br i1 %37, label %490, label %491, !dbg !71
+490:                                              ; preds = %439
+  tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 1892, ptr nonnull @assertFunc_1, i64 1), !dbg !71
+  br label %491, !dbg !71
+491:                                              ; preds = %490, %439
+  %492 = getelementptr float, ptr addrspace(1) %41, i64 %468, !dbg !72
+  %493 = getelementptr float, ptr addrspace(1) %41, i64 %470, !dbg !72
+  %494 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %492, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
+  %495 = extractvalue { i32, i32, i32, i32 } %494, 0, !dbg !73
+  %496 = extractvalue { i32, i32, i32, i32 } %494, 1, !dbg !73
+  %497 = extractvalue { i32, i32, i32, i32 } %494, 2, !dbg !73
+  %498 = extractvalue { i32, i32, i32, i32 } %494, 3, !dbg !73
+  %499 = bitcast i32 %495 to float, !dbg !73
+  %500 = bitcast i32 %496 to float, !dbg !73
+  %501 = bitcast i32 %497 to float, !dbg !73
+  %502 = bitcast i32 %498 to float, !dbg !73
+  %503 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %493, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
+  %504 = extractvalue { i32, i32, i32, i32 } %503, 0, !dbg !73
+  %505 = extractvalue { i32, i32, i32, i32 } %503, 1, !dbg !73
+  %506 = extractvalue { i32, i32, i32, i32 } %503, 2, !dbg !73
+  %507 = extractvalue { i32, i32, i32, i32 } %503, 3, !dbg !73
+  %508 = bitcast i32 %504 to float, !dbg !73
+  %509 = bitcast i32 %505 to float, !dbg !73
+  %510 = bitcast i32 %506 to float, !dbg !73
+  %511 = bitcast i32 %507 to float, !dbg !73
+  %512 = fadd float %455, %499, !dbg !74
+  %513 = fadd float %456, %500, !dbg !74
+  %514 = fadd float %457, %501, !dbg !74
+  %515 = fadd float %458, %502, !dbg !74
+  %516 = fadd float %464, %508, !dbg !74
+  %517 = fadd float %465, %509, !dbg !74
+  %518 = fadd float %466, %510, !dbg !74
+  %519 = fadd float %467, %511, !dbg !74
+  %520 = fsub float %512, %423, !dbg !75
+  %521 = fsub float %513, %423, !dbg !75
+  %522 = fsub float %514, %423, !dbg !75
+  %523 = fsub float %515, %423, !dbg !75
+  %524 = fsub float %516, %423, !dbg !75
+  %525 = fsub float %517, %423, !dbg !75
+  %526 = fsub float %518, %423, !dbg !75
+  %527 = fsub float %519, %423, !dbg !75
+  %528 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %.not.i = icmp eq i32 %528, 0, !dbg !76
+  br i1 %.not.i, label %531, label %529, !dbg !76
+529:                                              ; preds = %491
+  %530 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %437), !dbg !76
+  br label %__nv_rsqrtf.exit, !dbg !76
+531:                                              ; preds = %491
+  %532 = tail call float @llvm.nvvm.rsqrt.approx.f(float %437), !dbg !76
+  br label %__nv_rsqrtf.exit, !dbg !76
+__nv_rsqrtf.exit:                                 ; preds = %529, %531
+  %.0.i = phi float [ %530, %529 ], [ %532, %531 ], !dbg !76
+  %533 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %535 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %536 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %537 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %538 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %539 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %540 = fmul float %520, %.0.i, !dbg !77
+  %541 = fmul float %521, %.0.i, !dbg !77
+  %542 = fmul float %522, %.0.i, !dbg !77
+  %543 = fmul float %523, %.0.i, !dbg !77
+  %544 = fmul float %524, %.0.i, !dbg !77
+  %545 = fmul float %525, %.0.i, !dbg !77
+  %546 = fmul float %526, %.0.i, !dbg !77
+  %547 = fmul float %527, %.0.i, !dbg !77
+  %548 = fmul float %540, %477, !dbg !78
+  %549 = fmul float %541, %478, !dbg !78
+  %550 = fmul float %542, %479, !dbg !78
+  %551 = fmul float %543, %480, !dbg !78
+  %552 = fmul float %544, %486, !dbg !78
+  %553 = fmul float %545, %487, !dbg !78
+  %554 = fmul float %546, %488, !dbg !78
+  %555 = fmul float %547, %489, !dbg !78
+  %556 = or i32 %442, %438, !dbg !79
+  %557 = sext i32 %556 to i64, !dbg !80
+  %558 = getelementptr i16, ptr addrspace(1) %4, i64 %557, !dbg !80
+  %559 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %548) #6, !dbg !81
+  %560 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %549) #6, !dbg !81
+  %561 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %550) #6, !dbg !81
+  %562 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %551) #6, !dbg !81
+  %563 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %552) #6, !dbg !81
+  %564 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %553) #6, !dbg !81
+  %565 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %554) #6, !dbg !81
+  %566 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %555) #6, !dbg !81
+  %567 = insertelement <2 x i16> undef, i16 %559, i64 0, !dbg !81
+  %568 = insertelement <2 x i16> %567, i16 %560, i64 1, !dbg !81
+  %569 = bitcast <2 x i16> %568 to i32, !dbg !81
+  %570 = insertelement <2 x i16> undef, i16 %561, i64 0, !dbg !81
+  %571 = insertelement <2 x i16> %570, i16 %562, i64 1, !dbg !81
+  %572 = bitcast <2 x i16> %571 to i32, !dbg !81
+  %573 = insertelement <2 x i16> undef, i16 %563, i64 0, !dbg !81
+  %574 = insertelement <2 x i16> %573, i16 %564, i64 1, !dbg !81
+  %575 = bitcast <2 x i16> %574 to i32, !dbg !81
+  %576 = insertelement <2 x i16> undef, i16 %565, i64 0, !dbg !81
+  %577 = insertelement <2 x i16> %576, i16 %566, i64 1, !dbg !81
+  %578 = bitcast <2 x i16> %577 to i32, !dbg !81
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %569, i32 %572, i32 %575, i32 %578, ptr addrspace(1) %558, i1 true) #6, !dbg !81
+  br i1 %440, label %439, label %579, !dbg !64
+579:                                              ; preds = %__nv_rsqrtf.exit
+  ret void, !dbg !82
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #1
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+; Function Attrs: alwaysinline nounwind
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
+entry:
+  ; Query __nvvm_reflect with the @.str key; the returned i32 picks which
+  ; approximate rsqrt intrinsic is applied to %x.
+  %reflect = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
+  %reflect.is.zero = icmp eq i32 %reflect, 0
+  br i1 %reflect.is.zero, label %plain, label %ftz
+ftz:                                              ; preds = %entry
+  ; Non-zero reflect value: use the .ftz flavour of the intrinsic.
+  %rsqrt.ftz = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
+  br label %done
+plain:                                            ; preds = %entry
+  ; Zero reflect value: use the non-.ftz flavour of the intrinsic.
+  %rsqrt.plain = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
+  br label %done
+done:                                             ; preds = %plain, %ftz
+  ; Merge the value produced by whichever branch ran and return it.
+  %result = phi float [ %rsqrt.ftz, %ftz ], [ %rsqrt.plain, %plain ]
+  ret float %result
+}
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { nounwind }
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx")
+!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 256}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 22, column: 44, scope: !7)
+!11 = !DILocation(line: 24, column: 33, scope: !7)
+!12 = !DILocation(line: 21, column: 28, scope: !7)
+!13 = !DILocation(line: 21, column: 33, scope: !7)
+!14 = !DILocation(line: 22, column: 23, scope: !7)
+!15 = !DILocation(line: 26, column: 30, scope: !7)
+!16 = !DILocation(line: 26, column: 35, scope: !7)
+!17 = !DILocation(line: 27, column: 18, scope: !7)
+!18 = !DILocation(line: 35, column: 44, scope: !7)
+!19 = !DILocation(line: 36, column: 22, scope: !7)
+!20 = !DILocation(line: 37, column: 22, scope: !7)
+!21 = !DILocation(line: 38, column: 36, scope: !7)
+!22 = !DILocation(line: 39, column: 40, scope: !7)
+!23 = !DILocation(line: 40, column: 44, scope: !7)
+!24 = !DILocation(line: 31, column: 36, scope: !7)
+!25 = !DILocation(line: 32, column: 27, scope: !7)
+!26 = !DILocation(line: 35, column: 40, scope: !7)
+!27 = !DILocation(line: 35, column: 34, scope: !7)
+!28 = !DILocation(line: 35, column: 50, scope: !7)
+!29 = !DILocation(line: 39, column: 55, scope: !7)
+!30 = !DILocation(line: 40, column: 40, scope: !7)
+!31 = !DILocation(line: 40, column: 34, scope: !7)
+!32 = !DILocation(line: 40, column: 52, scope: !7)
+!33 = !DILocation(line: 41, column: 22, scope: !7)
+!34 = !DILocation(line: 96, column: 20, scope: !35, inlinedAt: !37)
+!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
+!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!37 = !DILocation(line: 44, column: 38, scope: !35)
+!38 = !DILocation(line: 97, column: 26, scope: !35, inlinedAt: !37)
+!39 = !DILocation(line: 98, column: 30, scope: !35, inlinedAt: !37)
+!40 = !DILocation(line: 98, column: 22, scope: !35, inlinedAt: !37)
+!41 = !DILocation(line: 101, column: 30, scope: !35, inlinedAt: !37)
+!42 = !DILocation(line: 101, column: 22, scope: !35, inlinedAt: !37)
+!43 = !DILocation(line: 47, column: 48, scope: !7)
+!44 = !DILocation(line: 108, column: 21, scope: !45, inlinedAt: !46)
+!45 = distinct !DILexicalBlockFile(scope: !35, file: !36, discriminator: 0)
+!46 = !DILocation(line: 120, column: 46, scope: !45, inlinedAt: !47)
+!47 = !DILocation(line: 50, column: 41, scope: !45)
+!48 = !DILocation(line: 109, column: 28, scope: !45, inlinedAt: !46)
+!49 = !DILocation(line: 110, column: 39, scope: !45, inlinedAt: !46)
+!50 = !DILocation(line: 110, column: 60, scope: !45, inlinedAt: !46)
+!51 = !DILocation(line: 110, column: 49, scope: !45, inlinedAt: !46)
+!52 = !DILocation(line: 112, column: 25, scope: !45, inlinedAt: !46)
+!53 = !DILocation(line: 112, column: 17, scope: !45, inlinedAt: !46)
+!54 = !DILocation(line: 113, column: 15, scope: !45, inlinedAt: !46)
+!55 = !DILocation(line: 113, column: 30, scope: !45, inlinedAt: !46)
+!56 = !DILocation(line: 113, column: 38, scope: !45, inlinedAt: !46)
+!57 = !DILocation(line: 113, column: 49, scope: !45, inlinedAt: !46)
+!58 = !DILocation(line: 113, column: 22, scope: !45, inlinedAt: !46)
+!59 = !DILocation(line: 120, column: 46, scope: !35, inlinedAt: !60)
+!60 = !DILocation(line: 50, column: 41, scope: !35)
+!61 = !DILocation(line: 69, column: 23, scope: !7)
+!62 = !DILocation(line: 71, column: 24, scope: !7)
+!63 = !DILocation(line: 76, column: 39, scope: !7)
+!64 = !DILocation(line: 55, column: 36, scope: !7)
+!65 = !DILocation(line: 56, column: 27, scope: !7)
+!66 = !DILocation(line: 59, column: 41, scope: !7)
+!67 = !DILocation(line: 59, column: 35, scope: !7)
+!68 = !DILocation(line: 59, column: 51, scope: !7)
+!69 = !DILocation(line: 60, column: 35, scope: !7)
+!70 = !DILocation(line: 60, column: 40, scope: !7)
+!71 = !DILocation(line: 64, column: 57, scope: !7)
+!72 = !DILocation(line: 65, column: 35, scope: !7)
+!73 = !DILocation(line: 65, column: 54, scope: !7)
+!74 = !DILocation(line: 66, column: 24, scope: !7)
+!75 = !DILocation(line: 67, column: 24, scope: !7)
+!76 = !DILocation(line: 72, column: 30, scope: !7)
+!77 = !DILocation(line: 73, column: 24, scope: !7)
+!78 = !DILocation(line: 74, column: 24, scope: !7)
+!79 = !DILocation(line: 76, column: 35, scope: !7)
+!80 = !DILocation(line: 76, column: 29, scope: !7)
+!81 = !DILocation(line: 76, column: 52, scope: !7)
+!82 = !DILocation(line: 55, column: 4, scope: !7)

.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.llir ADDED Viewed

	@@ -0,0 +1,839 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+@assertFunc_1 = internal constant [8 x i8] c"<module>"  ; module-level string constants follow; this one is 8 bytes with no trailing NUL. Its use site is not in the visible part of the kernel (presumably passed to @__assertfail like the _0 triple; confirm)
+@assertFile_1 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py"  ; file-path string paired with assertFunc_1 / assertMessage_1
+@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"  ; message text for the tmp16 bounds check
+@assertFunc_0 = internal constant [8 x i8] c"<module>"  ; passed as the fourth argument of the @__assertfail call in the kernel
+@assertFile_0 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py"  ; passed as the second argument of that @__assertfail call
+@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"  ; passed as the first argument of that @__assertfail call
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]  ; externally defined zero-length byte array in addrspace(3); the kernel indexes it with float-typed GEPs and stores floats into it
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1  ; NUL-terminated "__CUDA_FTZ" string
+declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
+define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %10 = lshr i32 %9, 4, !dbg !10
+  %11 = and i32 %10, 15, !dbg !10
+  %12 = and i32 %9, 15, !dbg !10
+  %13 = shl nuw nsw i32 %12, 3, !dbg !11
+  %14 = or i32 %13, 4, !dbg !11
+  %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
+  %16 = shl i32 %15, 4, !dbg !13
+  %17 = or i32 %16, %11, !dbg !14
+  %18 = or i32 %16, %12, !dbg !14
+  %19 = sext i32 %17 to i64, !dbg !15
+  %20 = getelementptr i64, ptr addrspace(1) %0, i64 %19, !dbg !15
+  %21 = sext i32 %18 to i64, !dbg !15
+  %22 = getelementptr i64, ptr addrspace(1) %0, i64 %21, !dbg !15
+  %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
+  %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
+  %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
+  %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
+  %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
+  %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
+  %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
+  %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
+  %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
+  %32 = srem i32 %17, 512, !dbg !17
+  %33 = shl nsw i32 %32, 8, !dbg !18
+  %34 = shl i32 %17, 8, !dbg !19
+  %35 = add i64 %31, 50257, !dbg !20
+  %36 = icmp slt i64 %23, 0, !dbg !21
+  %37 = icmp slt i64 %31, 0, !dbg !21
+  %38 = select i1 %37, i64 %35, i64 %31, !dbg !22
+  %39 = icmp ugt i64 %38, 50256, !dbg !23
+  %40 = shl i64 %23, 8, !dbg !24
+  %41 = add i64 %40, 12865792, !dbg !24
+  %42 = select i1 %36, i64 %41, i64 %40, !dbg !24
+  %43 = getelementptr float, ptr addrspace(1) %1, i64 %42
+  br label %44, !dbg !25
+44:                                               ; preds = %8, %130
+  %45 = phi float [ 0.000000e+00, %8 ], [ %177, %130 ]
+  %46 = phi float [ 0.000000e+00, %8 ], [ %178, %130 ]
+  %47 = phi float [ 0.000000e+00, %8 ], [ %179, %130 ]
+  %48 = phi float [ 0.000000e+00, %8 ], [ %180, %130 ]
+  %49 = phi float [ 0.000000e+00, %8 ], [ %181, %130 ]
+  %50 = phi float [ 0.000000e+00, %8 ], [ %182, %130 ]
+  %51 = phi float [ 0.000000e+00, %8 ], [ %183, %130 ]
+  %52 = phi float [ 0.000000e+00, %8 ], [ %184, %130 ]
+  %53 = phi float [ 0.000000e+00, %8 ], [ %185, %130 ]
+  %54 = phi float [ 0.000000e+00, %8 ], [ %186, %130 ]
+  %55 = phi float [ 0.000000e+00, %8 ], [ %187, %130 ]
+  %56 = phi float [ 0.000000e+00, %8 ], [ %188, %130 ]
+  %57 = phi float [ 0.000000e+00, %8 ], [ %189, %130 ]
+  %58 = phi float [ 0.000000e+00, %8 ], [ %190, %130 ]
+  %59 = phi float [ 0.000000e+00, %8 ], [ %191, %130 ]
+  %60 = phi float [ 0.000000e+00, %8 ], [ %192, %130 ]
+  %61 = phi float [ 0.000000e+00, %8 ], [ %225, %130 ]
+  %62 = phi float [ 0.000000e+00, %8 ], [ %226, %130 ]
+  %63 = phi float [ 0.000000e+00, %8 ], [ %227, %130 ]
+  %64 = phi float [ 0.000000e+00, %8 ], [ %228, %130 ]
+  %65 = phi float [ 0.000000e+00, %8 ], [ %229, %130 ]
+  %66 = phi float [ 0.000000e+00, %8 ], [ %230, %130 ]
+  %67 = phi float [ 0.000000e+00, %8 ], [ %231, %130 ]
+  %68 = phi float [ 0.000000e+00, %8 ], [ %232, %130 ]
+  %69 = phi float [ 0.000000e+00, %8 ], [ %201, %130 ]
+  %70 = phi float [ 0.000000e+00, %8 ], [ %202, %130 ]
+  %71 = phi float [ 0.000000e+00, %8 ], [ %203, %130 ]
+  %72 = phi float [ 0.000000e+00, %8 ], [ %204, %130 ]
+  %73 = phi float [ 0.000000e+00, %8 ], [ %205, %130 ]
+  %74 = phi float [ 0.000000e+00, %8 ], [ %206, %130 ]
+  %75 = phi float [ 0.000000e+00, %8 ], [ %207, %130 ]
+  %76 = phi float [ 0.000000e+00, %8 ], [ %208, %130 ]
+  %77 = phi i1 [ true, %8 ], [ false, %130 ]
+  %78 = phi i32 [ 0, %8 ], [ 128, %130 ]
+  %79 = or i32 %78, %13, !dbg !26
+  %80 = or i32 %78, %14, !dbg !26
+  %81 = or i32 %79, %33, !dbg !27
+  %82 = or i32 %80, %33, !dbg !27
+  %83 = sext i32 %81 to i64, !dbg !28
+  %84 = getelementptr float, ptr addrspace(1) %2, i64 %83, !dbg !28
+  %85 = sext i32 %82 to i64, !dbg !28
+  %86 = getelementptr float, ptr addrspace(1) %2, i64 %85, !dbg !28
+  %87 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %84, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
+  %88 = extractvalue { i32, i32, i32, i32 } %87, 0, !dbg !29
+  %89 = extractvalue { i32, i32, i32, i32 } %87, 1, !dbg !29
+  %90 = extractvalue { i32, i32, i32, i32 } %87, 2, !dbg !29
+  %91 = extractvalue { i32, i32, i32, i32 } %87, 3, !dbg !29
+  %92 = bitcast i32 %88 to float, !dbg !29
+  %93 = bitcast i32 %89 to float, !dbg !29
+  %94 = bitcast i32 %90 to float, !dbg !29
+  %95 = bitcast i32 %91 to float, !dbg !29
+  %96 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %86, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
+  %97 = extractvalue { i32, i32, i32, i32 } %96, 0, !dbg !29
+  %98 = extractvalue { i32, i32, i32, i32 } %96, 1, !dbg !29
+  %99 = extractvalue { i32, i32, i32, i32 } %96, 2, !dbg !29
+  %100 = extractvalue { i32, i32, i32, i32 } %96, 3, !dbg !29
+  %101 = bitcast i32 %97 to float, !dbg !29
+  %102 = bitcast i32 %98 to float, !dbg !29
+  %103 = bitcast i32 %99 to float, !dbg !29
+  %104 = bitcast i32 %100 to float, !dbg !29
+  %105 = or i32 %79, %34, !dbg !30
+  %106 = sext i32 %105 to i64, !dbg !31
+  %107 = getelementptr i16, ptr addrspace(1) %3, i64 %106, !dbg !31
+  %108 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %107, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
+  %109 = extractvalue { i32, i32, i32, i32 } %108, 0, !dbg !32
+  %110 = extractvalue { i32, i32, i32, i32 } %108, 1, !dbg !32
+  %111 = extractvalue { i32, i32, i32, i32 } %108, 2, !dbg !32
+  %112 = extractvalue { i32, i32, i32, i32 } %108, 3, !dbg !32
+  %113 = trunc i32 %109 to i16, !dbg !32
+  %extelt.offset5 = lshr i32 %109, 16, !dbg !32
+  %114 = trunc i32 %extelt.offset5 to i16, !dbg !32
+  %115 = trunc i32 %110 to i16, !dbg !32
+  %extelt.offset6 = lshr i32 %110, 16, !dbg !32
+  %116 = trunc i32 %extelt.offset6 to i16, !dbg !32
+  %117 = trunc i32 %111 to i16, !dbg !32
+  %extelt.offset7 = lshr i32 %111, 16, !dbg !32
+  %118 = trunc i32 %extelt.offset7 to i16, !dbg !32
+  %119 = trunc i32 %112 to i16, !dbg !32
+  %extelt.offset8 = lshr i32 %112, 16, !dbg !32
+  %120 = trunc i32 %extelt.offset8 to i16, !dbg !32
+  %121 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %113) #6, !dbg !33
+  %122 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %114) #6, !dbg !33
+  %123 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %115) #6, !dbg !33
+  %124 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %116) #6, !dbg !33
+  %125 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %117) #6, !dbg !33
+  %126 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %118) #6, !dbg !33
+  %127 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %119) #6, !dbg !33
+  %128 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %120) #6, !dbg !33
+  br i1 %39, label %129, label %130, !dbg !34
+129:                                              ; preds = %44
+  tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 1892, ptr nonnull @assertFunc_0, i64 1), !dbg !34
+  br label %130, !dbg !34
+130:                                              ; preds = %129, %44
+  %131 = zext nneg i32 %79 to i64, !dbg !35
+  %132 = zext nneg i32 %80 to i64, !dbg !35
+  %133 = getelementptr float, ptr addrspace(1) %43, i64 %131, !dbg !36
+  %134 = getelementptr float, ptr addrspace(1) %43, i64 %132, !dbg !36
+  %135 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %133, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
+  %136 = extractvalue { i32, i32, i32, i32 } %135, 0, !dbg !37
+  %137 = extractvalue { i32, i32, i32, i32 } %135, 1, !dbg !37
+  %138 = extractvalue { i32, i32, i32, i32 } %135, 2, !dbg !37
+  %139 = extractvalue { i32, i32, i32, i32 } %135, 3, !dbg !37
+  %140 = bitcast i32 %136 to float, !dbg !37
+  %141 = bitcast i32 %137 to float, !dbg !37
+  %142 = bitcast i32 %138 to float, !dbg !37
+  %143 = bitcast i32 %139 to float, !dbg !37
+  %144 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %134, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
+  %145 = extractvalue { i32, i32, i32, i32 } %144, 0, !dbg !37
+  %146 = extractvalue { i32, i32, i32, i32 } %144, 1, !dbg !37
+  %147 = extractvalue { i32, i32, i32, i32 } %144, 2, !dbg !37
+  %148 = extractvalue { i32, i32, i32, i32 } %144, 3, !dbg !37
+  %149 = bitcast i32 %145 to float, !dbg !37
+  %150 = bitcast i32 %146 to float, !dbg !37
+  %151 = bitcast i32 %147 to float, !dbg !37
+  %152 = bitcast i32 %148 to float, !dbg !37
+  %153 = fadd float %92, %140, !dbg !38
+  %154 = fadd float %93, %141, !dbg !38
+  %155 = fadd float %94, %142, !dbg !38
+  %156 = fadd float %95, %143, !dbg !38
+  %157 = fadd float %101, %149, !dbg !38
+  %158 = fadd float %102, %150, !dbg !38
+  %159 = fadd float %103, %151, !dbg !38
+  %160 = fadd float %104, %152, !dbg !38
+  %161 = fadd float %121, %153, !dbg !39
+  %162 = fadd float %122, %154, !dbg !39
+  %163 = fadd float %123, %155, !dbg !39
+  %164 = fadd float %124, %156, !dbg !39
+  %165 = fadd float %125, %157, !dbg !39
+  %166 = fadd float %126, %158, !dbg !39
+  %167 = fadd float %127, %159, !dbg !39
+  %168 = fadd float %128, %160, !dbg !39
+  %169 = fsub float %161, %69, !dbg !40
+  %170 = fsub float %162, %70, !dbg !40
+  %171 = fsub float %163, %71, !dbg !40
+  %172 = fsub float %164, %72, !dbg !40
+  %173 = fsub float %165, %73, !dbg !40
+  %174 = fsub float %166, %74, !dbg !40
+  %175 = fsub float %167, %75, !dbg !40
+  %176 = fsub float %168, %76, !dbg !40
+  %177 = fadd float %45, 1.000000e+00, !dbg !44
+  %178 = fadd float %46, 1.000000e+00, !dbg !44
+  %179 = fadd float %47, 1.000000e+00, !dbg !44
+  %180 = fadd float %48, 1.000000e+00, !dbg !44
+  %181 = fadd float %49, 1.000000e+00, !dbg !44
+  %182 = fadd float %50, 1.000000e+00, !dbg !44
+  %183 = fadd float %51, 1.000000e+00, !dbg !44
+  %184 = fadd float %52, 1.000000e+00, !dbg !44
+  %185 = fadd float %53, 1.000000e+00, !dbg !44
+  %186 = fadd float %54, 1.000000e+00, !dbg !44
+  %187 = fadd float %55, 1.000000e+00, !dbg !44
+  %188 = fadd float %56, 1.000000e+00, !dbg !44
+  %189 = fadd float %57, 1.000000e+00, !dbg !44
+  %190 = fadd float %58, 1.000000e+00, !dbg !44
+  %191 = fadd float %59, 1.000000e+00, !dbg !44
+  %192 = fadd float %60, 1.000000e+00, !dbg !44
+  %193 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %169, float %177) #6, !dbg !45
+  %194 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %170, float %178) #6, !dbg !45
+  %195 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %171, float %179) #6, !dbg !45
+  %196 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %172, float %180) #6, !dbg !45
+  %197 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %173, float %181) #6, !dbg !45
+  %198 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %174, float %182) #6, !dbg !45
+  %199 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %175, float %183) #6, !dbg !45
+  %200 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %176, float %184) #6, !dbg !45
+  %201 = fadd float %69, %193, !dbg !46
+  %202 = fadd float %70, %194, !dbg !46
+  %203 = fadd float %71, %195, !dbg !46
+  %204 = fadd float %72, %196, !dbg !46
+  %205 = fadd float %73, %197, !dbg !46
+  %206 = fadd float %74, %198, !dbg !46
+  %207 = fadd float %75, %199, !dbg !46
+  %208 = fadd float %76, %200, !dbg !46
+  %209 = fsub float %161, %201, !dbg !47
+  %210 = fsub float %162, %202, !dbg !47
+  %211 = fsub float %163, %203, !dbg !47
+  %212 = fsub float %164, %204, !dbg !47
+  %213 = fsub float %165, %205, !dbg !47
+  %214 = fsub float %166, %206, !dbg !47
+  %215 = fsub float %167, %207, !dbg !47
+  %216 = fsub float %168, %208, !dbg !47
+  %217 = fmul float %169, %209, !dbg !48
+  %218 = fmul float %170, %210, !dbg !48
+  %219 = fmul float %171, %211, !dbg !48
+  %220 = fmul float %172, %212, !dbg !48
+  %221 = fmul float %173, %213, !dbg !48
+  %222 = fmul float %174, %214, !dbg !48
+  %223 = fmul float %175, %215, !dbg !48
+  %224 = fmul float %176, %216, !dbg !48
+  %225 = fadd float %61, %217, !dbg !49
+  %226 = fadd float %62, %218, !dbg !49
+  %227 = fadd float %63, %219, !dbg !49
+  %228 = fadd float %64, %220, !dbg !49
+  %229 = fadd float %65, %221, !dbg !49
+  %230 = fadd float %66, %222, !dbg !49
+  %231 = fadd float %67, %223, !dbg !49
+  %232 = fadd float %68, %224, !dbg !49
+  br i1 %77, label %44, label %233, !dbg !25
+233:                                              ; preds = %130
+  %234 = and i32 %9, 127, !dbg !11
+  %235 = and i32 %9, 128, !dbg !25
+  %.not = icmp eq i32 %235, 0, !dbg !25
+  %236 = select i1 %.not, i32 0, i32 136, !dbg !25
+  %237 = add nuw nsw i32 %236, %234, !dbg !25
+  %238 = zext nneg i32 %237 to i64, !dbg !25
+  %239 = getelementptr float, ptr addrspace(3) @global_smem, i64 %238, !dbg !25
+  %240 = insertelement <1 x float> undef, float %185, i64 0, !dbg !25
+  store <1 x float> %240, ptr addrspace(3) %239, align 4, !dbg !25
+  %241 = add nuw nsw i32 %234, 272, !dbg !25
+  %242 = add nuw nsw i32 %241, %236, !dbg !25
+  %243 = zext nneg i32 %242 to i64, !dbg !25
+  %244 = getelementptr float, ptr addrspace(3) @global_smem, i64 %243, !dbg !25
+  %245 = insertelement <1 x float> undef, float %186, i64 0, !dbg !25
+  store <1 x float> %245, ptr addrspace(3) %244, align 4, !dbg !25
+  %246 = add nuw nsw i32 %234, 544, !dbg !25
+  %247 = add nuw nsw i32 %246, %236, !dbg !25
+  %248 = zext nneg i32 %247 to i64, !dbg !25
+  %249 = getelementptr float, ptr addrspace(3) @global_smem, i64 %248, !dbg !25
+  %250 = insertelement <1 x float> undef, float %187, i64 0, !dbg !25
+  store <1 x float> %250, ptr addrspace(3) %249, align 4, !dbg !25
+  %251 = add nuw nsw i32 %234, 816, !dbg !25
+  %252 = add nuw nsw i32 %251, %236, !dbg !25
+  %253 = zext nneg i32 %252 to i64, !dbg !25
+  %254 = getelementptr float, ptr addrspace(3) @global_smem, i64 %253, !dbg !25
+  %255 = insertelement <1 x float> undef, float %188, i64 0, !dbg !25
+  store <1 x float> %255, ptr addrspace(3) %254, align 4, !dbg !25
+  %256 = add nuw nsw i32 %234, 1088, !dbg !25
+  %257 = add nuw nsw i32 %256, %236, !dbg !25
+  %258 = zext nneg i32 %257 to i64, !dbg !25
+  %259 = getelementptr float, ptr addrspace(3) @global_smem, i64 %258, !dbg !25
+  %260 = insertelement <1 x float> undef, float %189, i64 0, !dbg !25
+  store <1 x float> %260, ptr addrspace(3) %259, align 4, !dbg !25
+  %261 = add nuw nsw i32 %234, 1360, !dbg !25
+  %262 = add nuw nsw i32 %261, %236, !dbg !25
+  %263 = zext nneg i32 %262 to i64, !dbg !25
+  %264 = getelementptr float, ptr addrspace(3) @global_smem, i64 %263, !dbg !25
+  %265 = insertelement <1 x float> undef, float %190, i64 0, !dbg !25
+  store <1 x float> %265, ptr addrspace(3) %264, align 4, !dbg !25
+  %266 = add nuw nsw i32 %234, 1632, !dbg !25
+  %267 = add nuw nsw i32 %266, %236, !dbg !25
+  %268 = zext nneg i32 %267 to i64, !dbg !25
+  %269 = getelementptr float, ptr addrspace(3) @global_smem, i64 %268, !dbg !25
+  %270 = insertelement <1 x float> undef, float %191, i64 0, !dbg !25
+  store <1 x float> %270, ptr addrspace(3) %269, align 4, !dbg !25
+  %271 = add nuw nsw i32 %234, 1904, !dbg !25
+  %272 = add nuw nsw i32 %271, %236, !dbg !25
+  %273 = zext nneg i32 %272 to i64, !dbg !25
+  %274 = getelementptr float, ptr addrspace(3) @global_smem, i64 %273, !dbg !25
+  %275 = insertelement <1 x float> undef, float %192, i64 0, !dbg !25
+  store <1 x float> %275, ptr addrspace(3) %274, align 4, !dbg !25
+  tail call void @llvm.nvvm.barrier0(), !dbg !25
+  %276 = mul nuw nsw i32 %11, 136, !dbg !25
+  %277 = add nuw nsw i32 %276, %13, !dbg !25
+  %278 = zext nneg i32 %277 to i64, !dbg !25
+  %279 = getelementptr float, ptr addrspace(3) @global_smem, i64 %278, !dbg !25
+  %280 = load float, ptr addrspace(3) %279, align 32, !dbg !25
+  %281 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 1, !dbg !25
+  %282 = load float, ptr addrspace(3) %281, align 4, !dbg !25
+  %283 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 2, !dbg !25
+  %284 = load float, ptr addrspace(3) %283, align 8, !dbg !25
+  %285 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 3, !dbg !25
+  %286 = load float, ptr addrspace(3) %285, align 4, !dbg !25
+  %287 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 4, !dbg !25
+  %288 = load float, ptr addrspace(3) %287, align 16, !dbg !25
+  %289 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 5, !dbg !25
+  %290 = load float, ptr addrspace(3) %289, align 4, !dbg !25
+  %291 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 6, !dbg !25
+  %292 = load float, ptr addrspace(3) %291, align 8, !dbg !25
+  %293 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 7, !dbg !25
+  %294 = load float, ptr addrspace(3) %293, align 4, !dbg !25
+  %295 = fsub float %202, %201, !dbg !50
+  %296 = fadd float %280, %282, !dbg !54
+  %297 = fcmp oeq float %296, 0.000000e+00, !dbg !55
+  %298 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %282, float %296) #6, !dbg !56
+  %299 = select i1 %297, float 0.000000e+00, float %298, !dbg !57
+  %300 = fmul float %295, %299, !dbg !58
+  %301 = fadd float %201, %300, !dbg !59
+  %302 = fadd float %225, %226, !dbg !60
+  %303 = fmul float %295, %295, !dbg !61
+  %304 = fmul float %303, %280, !dbg !62
+  %305 = fmul float %304, %299, !dbg !63
+  %306 = fadd float %302, %305, !dbg !64
+  %307 = fsub float %203, %301, !dbg !50
+  %308 = fadd float %284, %296, !dbg !54
+  %309 = fcmp oeq float %308, 0.000000e+00, !dbg !55
+  %310 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float %308) #6, !dbg !56
+  %311 = select i1 %309, float 0.000000e+00, float %310, !dbg !57
+  %312 = fmul float %311, %307, !dbg !58
+  %313 = fadd float %301, %312, !dbg !59
+  %314 = fadd float %227, %306, !dbg !60
+  %315 = fmul float %307, %307, !dbg !61
+  %316 = fmul float %296, %315, !dbg !62
+  %317 = fmul float %311, %316, !dbg !63
+  %318 = fadd float %314, %317, !dbg !64
+  %319 = fsub float %204, %313, !dbg !50
+  %320 = fadd float %286, %308, !dbg !54
+  %321 = fcmp oeq float %320, 0.000000e+00, !dbg !55
+  %322 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %286, float %320) #6, !dbg !56
+  %323 = select i1 %321, float 0.000000e+00, float %322, !dbg !57
+  %324 = fmul float %323, %319, !dbg !58
+  %325 = fadd float %313, %324, !dbg !59
+  %326 = fadd float %228, %318, !dbg !60
+  %327 = fmul float %319, %319, !dbg !61
+  %328 = fmul float %308, %327, !dbg !62
+  %329 = fmul float %323, %328, !dbg !63
+  %330 = fadd float %326, %329, !dbg !64
+  %331 = fsub float %205, %325, !dbg !50
+  %332 = fadd float %288, %320, !dbg !54
+  %333 = fcmp oeq float %332, 0.000000e+00, !dbg !55
+  %334 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %288, float %332) #6, !dbg !56
+  %335 = select i1 %333, float 0.000000e+00, float %334, !dbg !57
+  %336 = fmul float %335, %331, !dbg !58
+  %337 = fadd float %325, %336, !dbg !59
+  %338 = fadd float %229, %330, !dbg !60
+  %339 = fmul float %331, %331, !dbg !61
+  %340 = fmul float %320, %339, !dbg !62
+  %341 = fmul float %335, %340, !dbg !63
+  %342 = fadd float %338, %341, !dbg !64
+  %343 = fsub float %206, %337, !dbg !50
+  %344 = fadd float %290, %332, !dbg !54
+  %345 = fcmp oeq float %344, 0.000000e+00, !dbg !55
+  %346 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %290, float %344) #6, !dbg !56
+  %347 = select i1 %345, float 0.000000e+00, float %346, !dbg !57
+  %348 = fmul float %347, %343, !dbg !58
+  %349 = fadd float %337, %348, !dbg !59
+  %350 = fadd float %230, %342, !dbg !60
+  %351 = fmul float %343, %343, !dbg !61
+  %352 = fmul float %332, %351, !dbg !62
+  %353 = fmul float %347, %352, !dbg !63
+  %354 = fadd float %350, %353, !dbg !64
+  %355 = fsub float %207, %349, !dbg !50
+  %356 = fadd float %292, %344, !dbg !54
+  %357 = fcmp oeq float %356, 0.000000e+00, !dbg !55
+  %358 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %292, float %356) #6, !dbg !56
+  %359 = select i1 %357, float 0.000000e+00, float %358, !dbg !57
+  %360 = fmul float %359, %355, !dbg !58
+  %361 = fadd float %349, %360, !dbg !59
+  %362 = fadd float %231, %354, !dbg !60
+  %363 = fmul float %355, %355, !dbg !61
+  %364 = fmul float %344, %363, !dbg !62
+  %365 = fmul float %359, %364, !dbg !63
+  %366 = fadd float %362, %365, !dbg !64
+  %367 = fsub float %208, %361, !dbg !50
+  %368 = fadd float %294, %356, !dbg !54
+  %369 = fcmp oeq float %368, 0.000000e+00, !dbg !55
+  %370 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %294, float %368) #6, !dbg !56
+  %371 = select i1 %369, float 0.000000e+00, float %370, !dbg !57
+  %372 = fmul float %371, %367, !dbg !58
+  %373 = fadd float %361, %372, !dbg !59
+  %374 = fadd float %232, %366, !dbg !60
+  %375 = fmul float %367, %367, !dbg !61
+  %376 = fmul float %356, %375, !dbg !62
+  %377 = fmul float %371, %376, !dbg !63
+  %378 = fadd float %374, %377, !dbg !64
+  %379 = bitcast float %373 to i32, !dbg !65
+  %380 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %379, i32 8, i32 31), !dbg !65
+  %381 = bitcast i32 %380 to float, !dbg !65
+  %382 = bitcast float %378 to i32, !dbg !65
+  %383 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %382, i32 8, i32 31), !dbg !65
+  %384 = bitcast i32 %383 to float, !dbg !65
+  %385 = bitcast float %368 to i32, !dbg !65
+  %386 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %385, i32 8, i32 31), !dbg !65
+  %387 = bitcast i32 %386 to float, !dbg !65
+  %388 = fsub float %381, %373, !dbg !50
+  %389 = fadd float %368, %387, !dbg !54
+  %390 = fcmp oeq float %389, 0.000000e+00, !dbg !55
+  %391 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %387, float %389) #6, !dbg !56
+  %392 = select i1 %390, float 0.000000e+00, float %391, !dbg !57
+  %393 = fmul float %392, %388, !dbg !58
+  %394 = fadd float %373, %393, !dbg !59
+  %395 = fadd float %378, %384, !dbg !60
+  %396 = fmul float %388, %388, !dbg !61
+  %397 = fmul float %368, %396, !dbg !62
+  %398 = fmul float %392, %397, !dbg !63
+  %399 = fadd float %395, %398, !dbg !64
+  %400 = bitcast float %394 to i32, !dbg !65
+  %401 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %400, i32 4, i32 31), !dbg !65
+  %402 = bitcast i32 %401 to float, !dbg !65
+  %403 = bitcast float %399 to i32, !dbg !65
+  %404 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %403, i32 4, i32 31), !dbg !65
+  %405 = bitcast i32 %404 to float, !dbg !65
+  %406 = bitcast float %389 to i32, !dbg !65
+  %407 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %406, i32 4, i32 31), !dbg !65
+  %408 = bitcast i32 %407 to float, !dbg !65
+  %409 = fsub float %402, %394, !dbg !50
+  %410 = fadd float %389, %408, !dbg !54
+  %411 = fcmp oeq float %410, 0.000000e+00, !dbg !55
+  %412 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %408, float %410) #6, !dbg !56
+  %413 = select i1 %411, float 0.000000e+00, float %412, !dbg !57
+  %414 = fmul float %413, %409, !dbg !58
+  %415 = fadd float %394, %414, !dbg !59
+  %416 = fadd float %399, %405, !dbg !60
+  %417 = fmul float %409, %409, !dbg !61
+  %418 = fmul float %389, %417, !dbg !62
+  %419 = fmul float %413, %418, !dbg !63
+  %420 = fadd float %416, %419, !dbg !64
+  %421 = bitcast float %415 to i32, !dbg !65
+  %422 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %421, i32 2, i32 31), !dbg !65
+  %423 = bitcast i32 %422 to float, !dbg !65
+  %424 = bitcast float %420 to i32, !dbg !65
+  %425 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %424, i32 2, i32 31), !dbg !65
+  %426 = bitcast i32 %425 to float, !dbg !65
+  %427 = bitcast float %410 to i32, !dbg !65
+  %428 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %427, i32 2, i32 31), !dbg !65
+  %429 = bitcast i32 %428 to float, !dbg !65
+  %430 = fsub float %423, %415, !dbg !50
+  %431 = fadd float %410, %429, !dbg !54
+  %432 = fcmp oeq float %431, 0.000000e+00, !dbg !55
+  %433 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %429, float %431) #6, !dbg !56
+  %434 = select i1 %432, float 0.000000e+00, float %433, !dbg !57
+  %435 = fmul float %434, %430, !dbg !58
+  %436 = fadd float %415, %435, !dbg !59
+  %437 = fadd float %420, %426, !dbg !60
+  %438 = fmul float %430, %430, !dbg !61
+  %439 = fmul float %410, %438, !dbg !62
+  %440 = fmul float %434, %439, !dbg !63
+  %441 = fadd float %437, %440, !dbg !64
+  %442 = bitcast float %436 to i32, !dbg !65
+  %443 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %442, i32 1, i32 31), !dbg !65
+  %444 = bitcast i32 %443 to float, !dbg !65
+  %445 = bitcast float %441 to i32, !dbg !65
+  %446 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %445, i32 1, i32 31), !dbg !65
+  %447 = bitcast i32 %446 to float, !dbg !65
+  %448 = bitcast float %431 to i32, !dbg !65
+  %449 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %448, i32 1, i32 31), !dbg !65
+  %450 = bitcast i32 %449 to float, !dbg !65
+  %451 = fsub float %444, %436, !dbg !50
+  %452 = fadd float %431, %450, !dbg !54
+  %453 = fcmp oeq float %452, 0.000000e+00, !dbg !55
+  %454 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %450, float %452) #6, !dbg !56
+  %455 = select i1 %453, float 0.000000e+00, float %454, !dbg !57
+  %456 = fmul float %455, %451, !dbg !58
+  %457 = fadd float %436, %456, !dbg !59
+  %458 = fadd float %441, %447, !dbg !60
+  %459 = fmul float %451, %451, !dbg !61
+  %460 = fmul float %431, %459, !dbg !62
+  %461 = fmul float %455, %460, !dbg !63
+  %462 = fadd float %458, %461, !dbg !64
+  %463 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
+  %464 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
+  %465 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
+  %466 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
+  %467 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
+  %468 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
+  %469 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
+  %470 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
+  %471 = fadd float %463, 0x3EE4F8B580000000, !dbg !68
+  br label %472, !dbg !69
+472:                                              ; preds = %233, %__nv_rsqrtf.exit
+  %473 = phi i1 [ true, %233 ], [ false, %__nv_rsqrtf.exit ]
+  %474 = phi i32 [ 0, %233 ], [ 128, %__nv_rsqrtf.exit ]
+  %475 = or i32 %474, %13, !dbg !70
+  %476 = or i32 %474, %14, !dbg !70
+  %477 = or i32 %475, %33, !dbg !71
+  %478 = or i32 %476, %33, !dbg !71
+  %479 = sext i32 %477 to i64, !dbg !72
+  %480 = getelementptr float, ptr addrspace(1) %2, i64 %479, !dbg !72
+  %481 = sext i32 %478 to i64, !dbg !72
+  %482 = getelementptr float, ptr addrspace(1) %2, i64 %481, !dbg !72
+  %483 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %480, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
+  %484 = extractvalue { i32, i32, i32, i32 } %483, 0, !dbg !73
+  %485 = extractvalue { i32, i32, i32, i32 } %483, 1, !dbg !73
+  %486 = extractvalue { i32, i32, i32, i32 } %483, 2, !dbg !73
+  %487 = extractvalue { i32, i32, i32, i32 } %483, 3, !dbg !73
+  %488 = bitcast i32 %484 to float, !dbg !73
+  %489 = bitcast i32 %485 to float, !dbg !73
+  %490 = bitcast i32 %486 to float, !dbg !73
+  %491 = bitcast i32 %487 to float, !dbg !73
+  %492 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %482, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
+  %493 = extractvalue { i32, i32, i32, i32 } %492, 0, !dbg !73
+  %494 = extractvalue { i32, i32, i32, i32 } %492, 1, !dbg !73
+  %495 = extractvalue { i32, i32, i32, i32 } %492, 2, !dbg !73
+  %496 = extractvalue { i32, i32, i32, i32 } %492, 3, !dbg !73
+  %497 = bitcast i32 %493 to float, !dbg !73
+  %498 = bitcast i32 %494 to float, !dbg !73
+  %499 = bitcast i32 %495 to float, !dbg !73
+  %500 = bitcast i32 %496 to float, !dbg !73
+  %501 = or i32 %475, %34, !dbg !74
+  %502 = sext i32 %501 to i64, !dbg !75
+  %503 = getelementptr i16, ptr addrspace(1) %3, i64 %502, !dbg !75
+  %504 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %503, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76
+  %505 = extractvalue { i32, i32, i32, i32 } %504, 0, !dbg !76
+  %506 = extractvalue { i32, i32, i32, i32 } %504, 1, !dbg !76
+  %507 = extractvalue { i32, i32, i32, i32 } %504, 2, !dbg !76
+  %508 = extractvalue { i32, i32, i32, i32 } %504, 3, !dbg !76
+  %509 = trunc i32 %505 to i16, !dbg !76
+  %extelt.offset = lshr i32 %505, 16, !dbg !76
+  %510 = trunc i32 %extelt.offset to i16, !dbg !76
+  %511 = trunc i32 %506 to i16, !dbg !76
+  %extelt.offset2 = lshr i32 %506, 16, !dbg !76
+  %512 = trunc i32 %extelt.offset2 to i16, !dbg !76
+  %513 = trunc i32 %507 to i16, !dbg !76
+  %extelt.offset3 = lshr i32 %507, 16, !dbg !76
+  %514 = trunc i32 %extelt.offset3 to i16, !dbg !76
+  %515 = trunc i32 %508 to i16, !dbg !76
+  %extelt.offset4 = lshr i32 %508, 16, !dbg !76
+  %516 = trunc i32 %extelt.offset4 to i16, !dbg !76
+  %517 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %509) #6, !dbg !77
+  %518 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %510) #6, !dbg !77
+  %519 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %511) #6, !dbg !77
+  %520 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %512) #6, !dbg !77
+  %521 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %513) #6, !dbg !77
+  %522 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %514) #6, !dbg !77
+  %523 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %515) #6, !dbg !77
+  %524 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %516) #6, !dbg !77
+  %525 = zext nneg i32 %475 to i64, !dbg !78
+  %526 = getelementptr float, ptr addrspace(1) %4, i64 %525, !dbg !78
+  %527 = zext nneg i32 %476 to i64, !dbg !78
+  %528 = getelementptr float, ptr addrspace(1) %4, i64 %527, !dbg !78
+  %529 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %526, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
+  %530 = extractvalue { i32, i32, i32, i32 } %529, 0, !dbg !79
+  %531 = extractvalue { i32, i32, i32, i32 } %529, 1, !dbg !79
+  %532 = extractvalue { i32, i32, i32, i32 } %529, 2, !dbg !79
+  %533 = extractvalue { i32, i32, i32, i32 } %529, 3, !dbg !79
+  %534 = bitcast i32 %530 to float, !dbg !79
+  %535 = bitcast i32 %531 to float, !dbg !79
+  %536 = bitcast i32 %532 to float, !dbg !79
+  %537 = bitcast i32 %533 to float, !dbg !79
+  %538 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %528, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
+  %539 = extractvalue { i32, i32, i32, i32 } %538, 0, !dbg !79
+  %540 = extractvalue { i32, i32, i32, i32 } %538, 1, !dbg !79
+  %541 = extractvalue { i32, i32, i32, i32 } %538, 2, !dbg !79
+  %542 = extractvalue { i32, i32, i32, i32 } %538, 3, !dbg !79
+  %543 = bitcast i32 %539 to float, !dbg !79
+  %544 = bitcast i32 %540 to float, !dbg !79
+  %545 = bitcast i32 %541 to float, !dbg !79
+  %546 = bitcast i32 %542 to float, !dbg !79
+  br i1 %39, label %547, label %548, !dbg !80
+547:                                              ; preds = %472
+  tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 1892, ptr nonnull @assertFunc_1, i64 1), !dbg !80
+  br label %548, !dbg !80
+548:                                              ; preds = %547, %472
+  %549 = getelementptr float, ptr addrspace(1) %43, i64 %525, !dbg !81
+  %550 = getelementptr float, ptr addrspace(1) %43, i64 %527, !dbg !81
+  %551 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %549, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
+  %552 = extractvalue { i32, i32, i32, i32 } %551, 0, !dbg !82
+  %553 = extractvalue { i32, i32, i32, i32 } %551, 1, !dbg !82
+  %554 = extractvalue { i32, i32, i32, i32 } %551, 2, !dbg !82
+  %555 = extractvalue { i32, i32, i32, i32 } %551, 3, !dbg !82
+  %556 = bitcast i32 %552 to float, !dbg !82
+  %557 = bitcast i32 %553 to float, !dbg !82
+  %558 = bitcast i32 %554 to float, !dbg !82
+  %559 = bitcast i32 %555 to float, !dbg !82
+  %560 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %550, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
+  %561 = extractvalue { i32, i32, i32, i32 } %560, 0, !dbg !82
+  %562 = extractvalue { i32, i32, i32, i32 } %560, 1, !dbg !82
+  %563 = extractvalue { i32, i32, i32, i32 } %560, 2, !dbg !82
+  %564 = extractvalue { i32, i32, i32, i32 } %560, 3, !dbg !82
+  %565 = bitcast i32 %561 to float, !dbg !82
+  %566 = bitcast i32 %562 to float, !dbg !82
+  %567 = bitcast i32 %563 to float, !dbg !82
+  %568 = bitcast i32 %564 to float, !dbg !82
+  %569 = fadd float %488, %556, !dbg !83
+  %570 = fadd float %489, %557, !dbg !83
+  %571 = fadd float %490, %558, !dbg !83
+  %572 = fadd float %491, %559, !dbg !83
+  %573 = fadd float %497, %565, !dbg !83
+  %574 = fadd float %498, %566, !dbg !83
+  %575 = fadd float %499, %567, !dbg !83
+  %576 = fadd float %500, %568, !dbg !83
+  %577 = fadd float %517, %569, !dbg !84
+  %578 = fadd float %518, %570, !dbg !84
+  %579 = fadd float %519, %571, !dbg !84
+  %580 = fadd float %520, %572, !dbg !84
+  %581 = fadd float %521, %573, !dbg !84
+  %582 = fadd float %522, %574, !dbg !84
+  %583 = fadd float %523, %575, !dbg !84
+  %584 = fadd float %524, %576, !dbg !84
+  %585 = fsub float %577, %457, !dbg !85
+  %586 = fsub float %578, %457, !dbg !85
+  %587 = fsub float %579, %457, !dbg !85
+  %588 = fsub float %580, %457, !dbg !85
+  %589 = fsub float %581, %457, !dbg !85
+  %590 = fsub float %582, %457, !dbg !85
+  %591 = fsub float %583, %457, !dbg !85
+  %592 = fsub float %584, %457, !dbg !85
+  %593 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
+  %.not.i = icmp eq i32 %593, 0, !dbg !86
+  br i1 %.not.i, label %596, label %594, !dbg !86
+594:                                              ; preds = %548
+  %595 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %471), !dbg !86
+  br label %__nv_rsqrtf.exit, !dbg !86
+596:                                              ; preds = %548
+  %597 = tail call float @llvm.nvvm.rsqrt.approx.f(float %471), !dbg !86
+  br label %__nv_rsqrtf.exit, !dbg !86
+__nv_rsqrtf.exit:                                 ; preds = %594, %596
+  %.0.i = phi float [ %595, %594 ], [ %597, %596 ], !dbg !86
+  %598 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
+  %599 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
+  %600 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
+  %601 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
+  %602 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
+  %603 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
+  %604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
+  %605 = fmul float %585, %.0.i, !dbg !87
+  %606 = fmul float %586, %.0.i, !dbg !87
+  %607 = fmul float %587, %.0.i, !dbg !87
+  %608 = fmul float %588, %.0.i, !dbg !87
+  %609 = fmul float %589, %.0.i, !dbg !87
+  %610 = fmul float %590, %.0.i, !dbg !87
+  %611 = fmul float %591, %.0.i, !dbg !87
+  %612 = fmul float %592, %.0.i, !dbg !87
+  %613 = fmul float %605, %534, !dbg !88
+  %614 = fmul float %606, %535, !dbg !88
+  %615 = fmul float %607, %536, !dbg !88
+  %616 = fmul float %608, %537, !dbg !88
+  %617 = fmul float %609, %543, !dbg !88
+  %618 = fmul float %610, %544, !dbg !88
+  %619 = fmul float %611, %545, !dbg !88
+  %620 = fmul float %612, %546, !dbg !88
+  %621 = getelementptr i16, ptr addrspace(1) %5, i64 %502, !dbg !89
+  %622 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %613) #6, !dbg !90
+  %623 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %614) #6, !dbg !90
+  %624 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %615) #6, !dbg !90
+  %625 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %616) #6, !dbg !90
+  %626 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %617) #6, !dbg !90
+  %627 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %618) #6, !dbg !90
+  %628 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %619) #6, !dbg !90
+  %629 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %620) #6, !dbg !90
+  %630 = insertelement <2 x i16> undef, i16 %622, i64 0, !dbg !90
+  %631 = insertelement <2 x i16> %630, i16 %623, i64 1, !dbg !90
+  %632 = bitcast <2 x i16> %631 to i32, !dbg !90
+  %633 = insertelement <2 x i16> undef, i16 %624, i64 0, !dbg !90
+  %634 = insertelement <2 x i16> %633, i16 %625, i64 1, !dbg !90
+  %635 = bitcast <2 x i16> %634 to i32, !dbg !90
+  %636 = insertelement <2 x i16> undef, i16 %626, i64 0, !dbg !90
+  %637 = insertelement <2 x i16> %636, i16 %627, i64 1, !dbg !90
+  %638 = bitcast <2 x i16> %637 to i32, !dbg !90
+  %639 = insertelement <2 x i16> undef, i16 %628, i64 0, !dbg !90
+  %640 = insertelement <2 x i16> %639, i16 %629, i64 1, !dbg !90
+  %641 = bitcast <2 x i16> %640 to i32, !dbg !90
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %632, i32 %635, i32 %638, i32 %641, ptr addrspace(1) %621, i1 true) #6, !dbg !90
+  br i1 %473, label %472, label %642, !dbg !69
+642:                                              ; preds = %__nv_rsqrtf.exit
+  ret void, !dbg !91
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #1
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+; Function Attrs: alwaysinline nounwind
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { ; returns %x passed through one of two llvm.nvvm.rsqrt.approx intrinsics, chosen by an __nvvm_reflect query; attribute group #3 marks this function alwaysinline
+  %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 ; query keyed by @.str (defined outside this view; presumably the flush-to-zero setting -- TODO confirm)
+  %.not = icmp eq i32 %1, 0 ; true when the reflect query returned 0
+  br i1 %.not, label %4, label %2 ; result == 0 -> block %4 (intrinsic without .ftz); result != 0 -> block %2 (.ftz intrinsic)
+2:                                                ; preds = %0
+  %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) ; .ftz variant, reached only when the reflect result is nonzero
+  br label %6
+4:                                                ; preds = %0
+  %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) ; variant without the .ftz suffix, reached only when the reflect result is zero
+  br label %6
+6:                                                ; preds = %4, %2
+  %.0 = phi float [ %3, %2 ], [ %5, %4 ] ; merge: value from whichever intrinsic call executed
+  ret float %.0
+}
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { nounwind }
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
+!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 256}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 22, column: 44, scope: !7)
+!11 = !DILocation(line: 24, column: 33, scope: !7)
+!12 = !DILocation(line: 21, column: 28, scope: !7)
+!13 = !DILocation(line: 21, column: 33, scope: !7)
+!14 = !DILocation(line: 22, column: 23, scope: !7)
+!15 = !DILocation(line: 26, column: 30, scope: !7)
+!16 = !DILocation(line: 26, column: 35, scope: !7)
+!17 = !DILocation(line: 27, column: 18, scope: !7)
+!18 = !DILocation(line: 35, column: 44, scope: !7)
+!19 = !DILocation(line: 36, column: 44, scope: !7)
+!20 = !DILocation(line: 37, column: 22, scope: !7)
+!21 = !DILocation(line: 38, column: 22, scope: !7)
+!22 = !DILocation(line: 39, column: 36, scope: !7)
+!23 = !DILocation(line: 40, column: 40, scope: !7)
+!24 = !DILocation(line: 41, column: 44, scope: !7)
+!25 = !DILocation(line: 31, column: 36, scope: !7)
+!26 = !DILocation(line: 32, column: 27, scope: !7)
+!27 = !DILocation(line: 35, column: 40, scope: !7)
+!28 = !DILocation(line: 35, column: 34, scope: !7)
+!29 = !DILocation(line: 35, column: 50, scope: !7)
+!30 = !DILocation(line: 36, column: 40, scope: !7)
+!31 = !DILocation(line: 36, column: 34, scope: !7)
+!32 = !DILocation(line: 36, column: 50, scope: !7)
+!33 = !DILocation(line: 36, column: 101, scope: !7)
+!34 = !DILocation(line: 40, column: 55, scope: !7)
+!35 = !DILocation(line: 41, column: 40, scope: !7)
+!36 = !DILocation(line: 41, column: 34, scope: !7)
+!37 = !DILocation(line: 41, column: 52, scope: !7)
+!38 = !DILocation(line: 42, column: 22, scope: !7)
+!39 = !DILocation(line: 44, column: 22, scope: !7)
+!40 = !DILocation(line: 96, column: 20, scope: !41, inlinedAt: !43)
+!41 = distinct !DILexicalBlockFile(scope: !7, file: !42, discriminator: 0)
+!42 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!43 = !DILocation(line: 47, column: 41, scope: !41)
+!44 = !DILocation(line: 97, column: 26, scope: !41, inlinedAt: !43)
+!45 = !DILocation(line: 98, column: 30, scope: !41, inlinedAt: !43)
+!46 = !DILocation(line: 98, column: 22, scope: !41, inlinedAt: !43)
+!47 = !DILocation(line: 101, column: 30, scope: !41, inlinedAt: !43)
+!48 = !DILocation(line: 101, column: 22, scope: !41, inlinedAt: !43)
+!49 = !DILocation(line: 50, column: 50, scope: !7)
+!50 = !DILocation(line: 108, column: 21, scope: !51, inlinedAt: !52)
+!51 = distinct !DILexicalBlockFile(scope: !41, file: !42, discriminator: 0)
+!52 = !DILocation(line: 120, column: 46, scope: !51, inlinedAt: !53)
+!53 = !DILocation(line: 53, column: 44, scope: !51)
+!54 = !DILocation(line: 109, column: 28, scope: !51, inlinedAt: !52)
+!55 = !DILocation(line: 110, column: 39, scope: !51, inlinedAt: !52)
+!56 = !DILocation(line: 110, column: 60, scope: !51, inlinedAt: !52)
+!57 = !DILocation(line: 110, column: 49, scope: !51, inlinedAt: !52)
+!58 = !DILocation(line: 112, column: 25, scope: !51, inlinedAt: !52)
+!59 = !DILocation(line: 112, column: 17, scope: !51, inlinedAt: !52)
+!60 = !DILocation(line: 113, column: 15, scope: !51, inlinedAt: !52)
+!61 = !DILocation(line: 113, column: 30, scope: !51, inlinedAt: !52)
+!62 = !DILocation(line: 113, column: 38, scope: !51, inlinedAt: !52)
+!63 = !DILocation(line: 113, column: 49, scope: !51, inlinedAt: !52)
+!64 = !DILocation(line: 113, column: 22, scope: !51, inlinedAt: !52)
+!65 = !DILocation(line: 120, column: 46, scope: !41, inlinedAt: !66)
+!66 = !DILocation(line: 53, column: 44, scope: !41)
+!67 = !DILocation(line: 75, column: 24, scope: !7)
+!68 = !DILocation(line: 77, column: 24, scope: !7)
+!69 = !DILocation(line: 58, column: 36, scope: !7)
+!70 = !DILocation(line: 59, column: 27, scope: !7)
+!71 = !DILocation(line: 62, column: 41, scope: !7)
+!72 = !DILocation(line: 62, column: 35, scope: !7)
+!73 = !DILocation(line: 62, column: 51, scope: !7)
+!74 = !DILocation(line: 63, column: 41, scope: !7)
+!75 = !DILocation(line: 63, column: 35, scope: !7)
+!76 = !DILocation(line: 63, column: 51, scope: !7)
+!77 = !DILocation(line: 63, column: 103, scope: !7)
+!78 = !DILocation(line: 64, column: 35, scope: !7)
+!79 = !DILocation(line: 64, column: 40, scope: !7)
+!80 = !DILocation(line: 68, column: 57, scope: !7)
+!81 = !DILocation(line: 69, column: 35, scope: !7)
+!82 = !DILocation(line: 69, column: 54, scope: !7)
+!83 = !DILocation(line: 70, column: 24, scope: !7)
+!84 = !DILocation(line: 72, column: 24, scope: !7)
+!85 = !DILocation(line: 73, column: 24, scope: !7)
+!86 = !DILocation(line: 78, column: 30, scope: !7)
+!87 = !DILocation(line: 79, column: 24, scope: !7)
+!88 = !DILocation(line: 80, column: 24, scope: !7)
+!89 = !DILocation(line: 82, column: 29, scope: !7)
+!90 = !DILocation(line: 82, column: 52, scope: !7)
+!91 = !DILocation(line: 58, column: 4, scope: !7)