0-hero commited on
Commit
e7aa429
·
verified ·
1 Parent(s): 0eeffdd

Add files using upload-large-folder tool

Browse files
.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.llir ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
5
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %5 = shl i32 %4, 3, !dbg !8
7
+ %6 = and i32 %5, 1016, !dbg !8
8
+ %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
9
+ %8 = shl i32 %7, 10, !dbg !10
10
+ %9 = or i32 %8, %6, !dbg !11
11
+ %10 = or i32 %9, 4, !dbg !11
12
+ %11 = sext i32 %9 to i64, !dbg !12
13
+ %12 = getelementptr float, ptr addrspace(1) %0, i64 %11, !dbg !12
14
+ %13 = sext i32 %10 to i64, !dbg !12
15
+ %14 = getelementptr float, ptr addrspace(1) %0, i64 %13, !dbg !12
16
+ %15 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %12, i1 true) #1, !dbg !13
17
+ %16 = extractvalue { i32, i32, i32, i32 } %15, 0, !dbg !13
18
+ %17 = extractvalue { i32, i32, i32, i32 } %15, 1, !dbg !13
19
+ %18 = extractvalue { i32, i32, i32, i32 } %15, 2, !dbg !13
20
+ %19 = extractvalue { i32, i32, i32, i32 } %15, 3, !dbg !13
21
+ %20 = bitcast i32 %16 to float, !dbg !13
22
+ %21 = bitcast i32 %17 to float, !dbg !13
23
+ %22 = bitcast i32 %18 to float, !dbg !13
24
+ %23 = bitcast i32 %19 to float, !dbg !13
25
+ %24 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %14, i1 true) #1, !dbg !13
26
+ %25 = extractvalue { i32, i32, i32, i32 } %24, 0, !dbg !13
27
+ %26 = extractvalue { i32, i32, i32, i32 } %24, 1, !dbg !13
28
+ %27 = extractvalue { i32, i32, i32, i32 } %24, 2, !dbg !13
29
+ %28 = extractvalue { i32, i32, i32, i32 } %24, 3, !dbg !13
30
+ %29 = bitcast i32 %25 to float, !dbg !13
31
+ %30 = bitcast i32 %26 to float, !dbg !13
32
+ %31 = bitcast i32 %27 to float, !dbg !13
33
+ %32 = bitcast i32 %28 to float, !dbg !13
34
+ %33 = getelementptr i16, ptr addrspace(1) %1, i64 %11, !dbg !14
35
+ %34 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %20) #1, !dbg !15
36
+ %35 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %21) #1, !dbg !15
37
+ %36 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %22) #1, !dbg !15
38
+ %37 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %23) #1, !dbg !15
39
+ %38 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %29) #1, !dbg !15
40
+ %39 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %30) #1, !dbg !15
41
+ %40 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %31) #1, !dbg !15
42
+ %41 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %32) #1, !dbg !15
43
+ %42 = insertelement <2 x i16> undef, i16 %34, i64 0, !dbg !15
44
+ %43 = insertelement <2 x i16> %42, i16 %35, i64 1, !dbg !15
45
+ %44 = bitcast <2 x i16> %43 to i32, !dbg !15
46
+ %45 = insertelement <2 x i16> undef, i16 %36, i64 0, !dbg !15
47
+ %46 = insertelement <2 x i16> %45, i16 %37, i64 1, !dbg !15
48
+ %47 = bitcast <2 x i16> %46 to i32, !dbg !15
49
+ %48 = insertelement <2 x i16> undef, i16 %38, i64 0, !dbg !15
50
+ %49 = insertelement <2 x i16> %48, i16 %39, i64 1, !dbg !15
51
+ %50 = bitcast <2 x i16> %49 to i32, !dbg !15
52
+ %51 = insertelement <2 x i16> undef, i16 %40, i64 0, !dbg !15
53
+ %52 = insertelement <2 x i16> %51, i16 %41, i64 1, !dbg !15
54
+ %53 = bitcast <2 x i16> %52 to i32, !dbg !15
55
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %44, i32 %47, i32 %50, i32 %53, ptr addrspace(1) %33, i1 true) #1, !dbg !15
56
+ ret void, !dbg !16
57
+ }
58
+
59
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
60
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
61
+
62
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
63
+ attributes #1 = { nounwind }
64
+
65
+ !llvm.module.flags = !{!0}
66
+ !llvm.dbg.cu = !{!1}
67
+ !nvvm.annotations = !{!3, !4, !4, !3}
68
+
69
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
70
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
71
+ !2 = !DIFile(filename: "c5tryp5qwkhreijk7s5x327wofz54lwj4kvctuqdzv2vrf2xyons.py", directory: "/tmp/torchinductor_root/5t")
72
+ !3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
73
+ !4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
74
+ !5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
75
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
76
+ !7 = !{}
77
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
78
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
79
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
80
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
81
+ !12 = !DILocation(line: 24, column: 30, scope: !5)
82
+ !13 = !DILocation(line: 24, column: 35, scope: !5)
83
+ !14 = !DILocation(line: 26, column: 25, scope: !5)
84
+ !15 = !DILocation(line: 26, column: 36, scope: !5)
85
+ !16 = !DILocation(line: 26, column: 4, scope: !5)
.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ttgir ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
6
+ %cst_0 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
7
+ %cst_1 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
8
+ %cst_2 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
9
+ %cst_3 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
10
+ %cst_4 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
11
+ %cst_5 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
12
+ %c0_i32 = arith.constant 0 : i32
13
+ %c4_i32 = arith.constant 4 : i32
14
+ %c256_i32 = arith.constant 256 : i32
15
+ %cst_6 = arith.constant dense<1.000000e+00> : tensor<64x4xf32, #blocked>
16
+ %cst_7 = arith.constant 0.000000e+00 : f32
17
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked>
18
+ %cst_9 = arith.constant dense<0.000000e+00> : tensor<1x4xf32, #blocked>
19
+ %cst_10 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked>
20
+ %cst_11 = arith.constant dense<256> : tensor<1x4xi32, #blocked>
21
+ %cst_12 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
22
+ %cst_13 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
23
+ %c64_i32 = arith.constant 64 : i32
24
+ %0 = tt.get_program_id x : i32
25
+ %1 = arith.muli %0, %c64_i32 : i32
26
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
27
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
28
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
29
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
30
+ %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
31
+ %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
32
+ %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
33
+ %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
34
+ %10 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
35
+ %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x4xi32, #blocked>
36
+ %12 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
37
+ %13 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
38
+ %14 = tt.addptr %12, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
39
+ %15 = tt.addptr %13, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
40
+ %16 = tt.load %14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
41
+ %17 = tt.load %15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
42
+ %18 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
43
+ %19 = arith.muli %18, %cst_0 : tensor<64x1xi32, #blocked>
44
+ %20 = tt.broadcast %19 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked>
45
+ %21 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
46
+ %22 = arith.muli %8, %cst_0 : tensor<64x1xi32, #blocked>
47
+ %23 = tt.broadcast %22 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked>
48
+ %24 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>, #blocked>
49
+ %25 = arith.addi %16, %cst_3 : tensor<64x1xi64, #blocked>
50
+ %26 = arith.addi %17, %cst_4 : tensor<64x1xi64, #blocked1>
51
+ %27 = arith.cmpi slt, %16, %cst_2 : tensor<64x1xi64, #blocked>
52
+ %28 = arith.cmpi slt, %17, %cst_5 : tensor<64x1xi64, #blocked1>
53
+ %29 = arith.select %27, %25, %16 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
54
+ %30 = arith.select %28, %26, %17 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
55
+ %31 = arith.cmpi sge, %30, %cst_5 : tensor<64x1xi64, #blocked1>
56
+ %32 = arith.cmpi slt, %30, %cst_4 : tensor<64x1xi64, #blocked1>
57
+ %33 = arith.andi %31, %32 : tensor<64x1xi1, #blocked1>
58
+ %34 = arith.muli %29, %cst_1 : tensor<64x1xi64, #blocked>
59
+ %35 = tt.broadcast %34 : (tensor<64x1xi64, #blocked>) -> tensor<64x4xi64, #blocked>
60
+ %36 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
61
+ %37:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg9 = %cst_10, %arg10 = %cst_10, %arg11 = %cst_10) -> (tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>) : i32 {
62
+ %46 = tt.splat %arg8 : (i32) -> tensor<1x4xi32, #blocked>
63
+ %47 = arith.addi %46, %11 : tensor<1x4xi32, #blocked>
64
+ %48 = arith.cmpi slt, %47, %cst_11 : tensor<1x4xi32, #blocked>
65
+ %49 = tt.broadcast %47 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked>
66
+ %50 = arith.addi %49, %20 : tensor<64x4xi32, #blocked>
67
+ %51 = tt.addptr %21, %50 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi32, #blocked>
68
+ %52 = tt.broadcast %48 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
69
+ %53 = tt.load %51, %52, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
70
+ %54 = arith.addi %49, %23 : tensor<64x4xi32, #blocked>
71
+ %55 = tt.addptr %24, %54 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi32, #blocked>
72
+ %56 = tt.load %55, %52, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xbf16, #blocked>
73
+ %57 = arith.extf %56 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked>
74
+ tt.assert %33, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
75
+ %58 = arith.extsi %47 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked>
76
+ %59 = tt.broadcast %58 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
77
+ %60 = arith.addi %59, %35 : tensor<64x4xi64, #blocked>
78
+ %61 = tt.addptr %36, %60 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi64, #blocked>
79
+ %62 = tt.load %61, %52, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
80
+ %63 = arith.addf %62, %53 : tensor<64x4xf32, #blocked>
81
+ %64 = arith.addf %63, %57 : tensor<64x4xf32, #blocked>
82
+ %65 = arith.subf %64, %arg9 : tensor<64x4xf32, #blocked>
83
+ %66 = arith.addf %arg11, %cst_6 : tensor<64x4xf32, #blocked>
84
+ %67 = arith.divf %65, %66 : tensor<64x4xf32, #blocked>
85
+ %68 = arith.addf %arg9, %67 : tensor<64x4xf32, #blocked>
86
+ %69 = arith.subf %64, %68 : tensor<64x4xf32, #blocked>
87
+ %70 = arith.mulf %65, %69 : tensor<64x4xf32, #blocked>
88
+ %71 = arith.addf %arg10, %70 : tensor<64x4xf32, #blocked>
89
+ %72 = arith.select %52, %68, %arg9 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
90
+ %73 = arith.select %52, %71, %arg10 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
91
+ %74 = arith.select %52, %66, %arg11 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
92
+ scf.yield %72, %73, %74 : tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>
93
+ }
94
+ %38:3 = "tt.reduce"(%37#0, %37#1, %37#2) <{axis = 1 : i32}> ({
95
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
96
+ %46 = arith.subf %arg11, %arg8 : f32
97
+ %47 = arith.addf %arg10, %arg13 : f32
98
+ %48 = arith.cmpf oeq, %47, %cst_7 : f32
99
+ %49 = arith.divf %arg13, %47 : f32
100
+ %50 = arith.select %48, %cst_7, %49 : f32
101
+ %51 = arith.mulf %46, %50 : f32
102
+ %52 = arith.addf %arg8, %51 : f32
103
+ %53 = arith.addf %arg9, %arg12 : f32
104
+ %54 = arith.mulf %46, %46 : f32
105
+ %55 = arith.mulf %54, %arg10 : f32
106
+ %56 = arith.mulf %55, %50 : f32
107
+ %57 = arith.addf %53, %56 : f32
108
+ tt.reduce.return %52, %57, %47 : f32, f32, f32
109
+ }) : (tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
110
+ %39 = tt.expand_dims %38#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
111
+ %40 = tt.expand_dims %38#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
112
+ %41 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x4x!tt.ptr<f32, 1>, #blocked>
113
+ %42 = tt.broadcast %39 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
114
+ %43 = arith.divf %40, %cst_13 : tensor<64x1xf32, #blocked>
115
+ %44 = arith.addf %43, %cst_12 : tensor<64x1xf32, #blocked>
116
+ %45 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>, #blocked>
117
+ scf.for %arg8 = %c0_i32 to %c256_i32 step %c4_i32 : i32 {
118
+ %46 = tt.splat %arg8 : (i32) -> tensor<1x4xi32, #blocked>
119
+ %47 = arith.addi %46, %11 : tensor<1x4xi32, #blocked>
120
+ %48 = arith.cmpi slt, %47, %cst_11 : tensor<1x4xi32, #blocked>
121
+ %49 = tt.broadcast %47 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked>
122
+ %50 = arith.addi %49, %20 : tensor<64x4xi32, #blocked>
123
+ %51 = tt.addptr %21, %50 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi32, #blocked>
124
+ %52 = tt.broadcast %48 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
125
+ %53 = tt.load %51, %52, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
126
+ %54 = arith.addi %49, %23 : tensor<64x4xi32, #blocked>
127
+ %55 = tt.addptr %24, %54 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi32, #blocked>
128
+ %56 = tt.load %55, %52, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16, #blocked>
129
+ %57 = arith.extf %56 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked>
130
+ %58 = tt.addptr %41, %47 : tensor<1x4x!tt.ptr<f32, 1>, #blocked>, tensor<1x4xi32, #blocked>
131
+ %59 = tt.load %58, %48, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32, #blocked>
132
+ tt.assert %33, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
133
+ %60 = arith.extsi %47 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked>
134
+ %61 = tt.broadcast %60 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
135
+ %62 = arith.addi %61, %35 : tensor<64x4xi64, #blocked>
136
+ %63 = tt.addptr %36, %62 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi64, #blocked>
137
+ %64 = tt.load %63, %52, %cst_10 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
138
+ %65 = arith.addf %64, %53 : tensor<64x4xf32, #blocked>
139
+ %66 = arith.addf %65, %57 : tensor<64x4xf32, #blocked>
140
+ %67 = arith.subf %66, %42 : tensor<64x4xf32, #blocked>
141
+ %68 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
142
+ %69 = tt.broadcast %68 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
143
+ %70 = arith.mulf %67, %69 : tensor<64x4xf32, #blocked>
144
+ %71 = tt.broadcast %59 : (tensor<1x4xf32, #blocked>) -> tensor<64x4xf32, #blocked>
145
+ %72 = arith.mulf %70, %71 : tensor<64x4xf32, #blocked>
146
+ %73 = tt.addptr %45, %54 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi32, #blocked>
147
+ %74 = arith.truncf %72 : tensor<64x4xf32, #blocked> to tensor<64x4xbf16, #blocked>
148
+ tt.store %73, %74, %52 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16, #blocked>
149
+ }
150
+ tt.return
151
+ }
152
+ }
.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ttir ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x4xbf16>
4
+ %cst_0 = arith.constant 0.000000e+00 : f32
5
+ %cst_1 = arith.constant dense<1.000000e+00> : tensor<64x4xf32>
6
+ %c256_i32 = arith.constant 256 : i32
7
+ %c4_i32 = arith.constant 4 : i32
8
+ %c0_i32 = arith.constant 0 : i32
9
+ %cst_2 = arith.constant dense<256> : tensor<64x1xi64>
10
+ %cst_3 = arith.constant dense<0> : tensor<64x1xi64>
11
+ %cst_4 = arith.constant dense<50257> : tensor<64x1xi64>
12
+ %cst_5 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
13
+ %cst_6 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
14
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<1x4xf32>
15
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<64x4xf32>
16
+ %cst_9 = arith.constant dense<256> : tensor<64x1xi32>
17
+ %cst_10 = arith.constant dense<256> : tensor<1x4xi32>
18
+ %cst_11 = arith.constant dense<512> : tensor<64x1xi32>
19
+ %c64_i32 = arith.constant 64 : i32
20
+ %0 = tt.get_program_id x : i32
21
+ %1 = arith.muli %0, %c64_i32 : i32
22
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
23
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
24
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
25
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
26
+ %6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
27
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32>
28
+ %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
29
+ %9 = tt.addptr %8, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
30
+ %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
31
+ %11 = arith.remsi %5, %cst_11 : tensor<64x1xi32>
32
+ %12 = arith.muli %11, %cst_9 : tensor<64x1xi32>
33
+ %13 = tt.broadcast %12 : (tensor<64x1xi32>) -> tensor<64x4xi32>
34
+ %14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
35
+ %15 = arith.muli %5, %cst_9 : tensor<64x1xi32>
36
+ %16 = tt.broadcast %15 : (tensor<64x1xi32>) -> tensor<64x4xi32>
37
+ %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
38
+ %18 = arith.addi %10, %cst_4 : tensor<64x1xi64>
39
+ %19 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64>
40
+ %20 = arith.select %19, %18, %10 : tensor<64x1xi1>, tensor<64x1xi64>
41
+ %21 = arith.cmpi sge, %20, %cst_3 : tensor<64x1xi64>
42
+ %22 = arith.cmpi slt, %20, %cst_4 : tensor<64x1xi64>
43
+ %23 = arith.andi %21, %22 : tensor<64x1xi1>
44
+ %24 = arith.muli %20, %cst_2 : tensor<64x1xi64>
45
+ %25 = tt.broadcast %24 : (tensor<64x1xi64>) -> tensor<64x4xi64>
46
+ %26 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
47
+ %27:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) : i32 {
48
+ %51 = tt.splat %arg8 : (i32) -> tensor<1x4xi32>
49
+ %52 = arith.addi %51, %7 : tensor<1x4xi32>
50
+ %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x4xi32>
51
+ %54 = tt.broadcast %52 : (tensor<1x4xi32>) -> tensor<64x4xi32>
52
+ %55 = arith.addi %54, %13 : tensor<64x4xi32>
53
+ %56 = tt.addptr %14, %55 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
54
+ %57 = tt.broadcast %53 : (tensor<1x4xi1>) -> tensor<64x4xi1>
55
+ %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
56
+ %59 = arith.addi %54, %16 : tensor<64x4xi32>
57
+ %60 = tt.addptr %17, %59 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi32>
58
+ %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xbf16>
59
+ %62 = arith.extf %61 : tensor<64x4xbf16> to tensor<64x4xf32>
60
+ tt.assert %23, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
61
+ %63 = arith.extsi %52 : tensor<1x4xi32> to tensor<1x4xi64>
62
+ %64 = tt.broadcast %63 : (tensor<1x4xi64>) -> tensor<64x4xi64>
63
+ %65 = arith.addi %64, %25 : tensor<64x4xi64>
64
+ %66 = tt.addptr %26, %65 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
65
+ %67 = tt.load %66, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
66
+ %68 = arith.addf %67, %58 : tensor<64x4xf32>
67
+ %69 = arith.addf %68, %62 : tensor<64x4xf32>
68
+ %70 = arith.subf %69, %arg9 : tensor<64x4xf32>
69
+ %71 = arith.addf %arg11, %cst_1 : tensor<64x4xf32>
70
+ %72 = arith.divf %70, %71 : tensor<64x4xf32>
71
+ %73 = arith.addf %arg9, %72 : tensor<64x4xf32>
72
+ %74 = arith.subf %69, %73 : tensor<64x4xf32>
73
+ %75 = arith.mulf %70, %74 : tensor<64x4xf32>
74
+ %76 = arith.addf %arg10, %75 : tensor<64x4xf32>
75
+ %77 = arith.select %57, %73, %arg9 : tensor<64x4xi1>, tensor<64x4xf32>
76
+ %78 = arith.select %57, %76, %arg10 : tensor<64x4xi1>, tensor<64x4xf32>
77
+ %79 = arith.select %57, %71, %arg11 : tensor<64x4xi1>, tensor<64x4xf32>
78
+ scf.yield %77, %78, %79 : tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>
79
+ }
80
+ %28:3 = "tt.reduce"(%27#0, %27#1, %27#2) <{axis = 1 : i32}> ({
81
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
82
+ %51 = arith.subf %arg11, %arg8 : f32
83
+ %52 = arith.addf %arg10, %arg13 : f32
84
+ %53 = arith.cmpf oeq, %52, %cst_0 : f32
85
+ %54 = arith.divf %arg13, %52 : f32
86
+ %55 = arith.select %53, %cst_0, %54 : f32
87
+ %56 = arith.mulf %51, %55 : f32
88
+ %57 = arith.addf %arg8, %56 : f32
89
+ %58 = arith.addf %arg9, %arg12 : f32
90
+ %59 = arith.mulf %51, %51 : f32
91
+ %60 = arith.mulf %59, %arg10 : f32
92
+ %61 = arith.mulf %60, %55 : f32
93
+ %62 = arith.addf %58, %61 : f32
94
+ tt.reduce.return %57, %62, %52 : f32, f32, f32
95
+ }) : (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
96
+ %29 = tt.expand_dims %28#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
97
+ %30 = tt.expand_dims %28#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
98
+ %31 = arith.muli %11, %cst_9 : tensor<64x1xi32>
99
+ %32 = tt.broadcast %31 : (tensor<64x1xi32>) -> tensor<64x4xi32>
100
+ %33 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
101
+ %34 = arith.muli %5, %cst_9 : tensor<64x1xi32>
102
+ %35 = tt.broadcast %34 : (tensor<64x1xi32>) -> tensor<64x4xi32>
103
+ %36 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
104
+ %37 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x4x!tt.ptr<f32, 1>>
105
+ %38 = arith.addi %10, %cst_4 : tensor<64x1xi64>
106
+ %39 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64>
107
+ %40 = arith.select %39, %38, %10 : tensor<64x1xi1>, tensor<64x1xi64>
108
+ %41 = arith.cmpi sge, %40, %cst_3 : tensor<64x1xi64>
109
+ %42 = arith.cmpi slt, %40, %cst_4 : tensor<64x1xi64>
110
+ %43 = arith.andi %41, %42 : tensor<64x1xi1>
111
+ %44 = arith.muli %40, %cst_2 : tensor<64x1xi64>
112
+ %45 = tt.broadcast %44 : (tensor<64x1xi64>) -> tensor<64x4xi64>
113
+ %46 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
114
+ %47 = tt.broadcast %29 : (tensor<64x1xf32>) -> tensor<64x4xf32>
115
+ %48 = arith.divf %30, %cst_6 : tensor<64x1xf32>
116
+ %49 = arith.addf %48, %cst_5 : tensor<64x1xf32>
117
+ %50 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
118
+ scf.for %arg8 = %c0_i32 to %c256_i32 step %c4_i32 : i32 {
119
+ %51 = tt.splat %arg8 : (i32) -> tensor<1x4xi32>
120
+ %52 = arith.addi %51, %7 : tensor<1x4xi32>
121
+ %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x4xi32>
122
+ %54 = tt.broadcast %52 : (tensor<1x4xi32>) -> tensor<64x4xi32>
123
+ %55 = arith.addi %54, %32 : tensor<64x4xi32>
124
+ %56 = tt.addptr %33, %55 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
125
+ %57 = tt.broadcast %53 : (tensor<1x4xi1>) -> tensor<64x4xi1>
126
+ %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
127
+ %59 = arith.addi %54, %35 : tensor<64x4xi32>
128
+ %60 = tt.addptr %36, %59 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi32>
129
+ %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16>
130
+ %62 = arith.extf %61 : tensor<64x4xbf16> to tensor<64x4xf32>
131
+ %63 = tt.addptr %37, %52 : tensor<1x4x!tt.ptr<f32, 1>>, tensor<1x4xi32>
132
+ %64 = tt.load %63, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32>
133
+ tt.assert %43, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
134
+ %65 = arith.extsi %52 : tensor<1x4xi32> to tensor<1x4xi64>
135
+ %66 = tt.broadcast %65 : (tensor<1x4xi64>) -> tensor<64x4xi64>
136
+ %67 = arith.addi %66, %45 : tensor<64x4xi64>
137
+ %68 = tt.addptr %46, %67 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
138
+ %69 = tt.load %68, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32>
139
+ %70 = arith.addf %69, %58 : tensor<64x4xf32>
140
+ %71 = arith.addf %70, %62 : tensor<64x4xf32>
141
+ %72 = arith.subf %71, %47 : tensor<64x4xf32>
142
+ %73 = tt.extern_elementwise %49 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
143
+ %74 = tt.broadcast %73 : (tensor<64x1xf32>) -> tensor<64x4xf32>
144
+ %75 = arith.mulf %72, %74 : tensor<64x4xf32>
145
+ %76 = tt.broadcast %64 : (tensor<1x4xf32>) -> tensor<64x4xf32>
146
+ %77 = arith.mulf %75, %76 : tensor<64x4xf32>
147
+ %78 = tt.addptr %50, %59 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi32>
148
+ %79 = arith.truncf %77 : tensor<64x4xf32> to tensor<64x4xbf16>
149
+ tt.store %78, %79, %57 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16>
150
+ }
151
+ tt.return
152
+ }
153
+ }
.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.llir ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1d2d34e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
5
+ %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %7 = and i32 %6, 7, !dbg !8
7
+ %8 = zext nneg i32 %7 to i64, !dbg !9
8
+ %9 = getelementptr float, ptr addrspace(1) %1, i64 %8, !dbg !9
9
+ %10 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %9, i1 true, i32 0, i1 true) #3, !dbg !10
10
+ %11 = bitcast i32 %10 to float, !dbg !10
11
+ %12 = getelementptr i64, ptr addrspace(1) %2, i64 %8, !dbg !11
12
+ %13 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];\0A\09@!$3 mov.u64 $0, 0x0;", "=l,l,b,b"(ptr addrspace(1) %12, i1 true, i1 true) #3, !dbg !12
13
+ %14 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %10, i32 4, i32 31), !dbg !13
14
+ %15 = bitcast i32 %14 to float, !dbg !13
15
+ %16 = fadd float %11, %15, !dbg !17
16
+ %17 = bitcast float %16 to i32, !dbg !13
17
+ %18 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %17, i32 2, i32 31), !dbg !13
18
+ %19 = bitcast i32 %18 to float, !dbg !13
19
+ %20 = fadd float %16, %19, !dbg !17
20
+ %21 = bitcast float %20 to i32, !dbg !13
21
+ %22 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %21, i32 1, i32 31), !dbg !13
22
+ %23 = bitcast i32 %22 to float, !dbg !13
23
+ %24 = fadd float %20, %23, !dbg !17
24
+ %25 = trunc i64 %13 to i32, !dbg !21
25
+ %26 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %25, i32 4, i32 31), !dbg !21
26
+ %bc = bitcast i64 %13 to <2 x i32>, !dbg !21
27
+ %27 = extractelement <2 x i32> %bc, i64 1, !dbg !21
28
+ %28 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %27, i32 4, i32 31), !dbg !21
29
+ %29 = insertelement <2 x i32> undef, i32 %26, i64 0, !dbg !21
30
+ %30 = insertelement <2 x i32> %29, i32 %28, i64 1, !dbg !21
31
+ %31 = bitcast <2 x i32> %30 to i64, !dbg !21
32
+ %32 = add i64 %13, %31, !dbg !23
33
+ %33 = trunc i64 %32 to i32, !dbg !21
34
+ %34 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %33, i32 2, i32 31), !dbg !21
35
+ %bc1 = bitcast i64 %32 to <2 x i32>, !dbg !21
36
+ %35 = extractelement <2 x i32> %bc1, i64 1, !dbg !21
37
+ %36 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %35, i32 2, i32 31), !dbg !21
38
+ %37 = insertelement <2 x i32> undef, i32 %34, i64 0, !dbg !21
39
+ %38 = insertelement <2 x i32> %37, i32 %36, i64 1, !dbg !21
40
+ %39 = bitcast <2 x i32> %38 to i64, !dbg !21
41
+ %40 = add i64 %32, %39, !dbg !23
42
+ %41 = trunc i64 %40 to i32, !dbg !21
43
+ %42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %41, i32 1, i32 31), !dbg !21
44
+ %bc2 = bitcast i64 %40 to <2 x i32>, !dbg !21
45
+ %43 = extractelement <2 x i32> %bc2, i64 1, !dbg !21
46
+ %44 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %43, i32 1, i32 31), !dbg !21
47
+ %45 = insertelement <2 x i32> undef, i32 %42, i64 0, !dbg !21
48
+ %46 = insertelement <2 x i32> %45, i32 %44, i64 1, !dbg !21
49
+ %47 = bitcast <2 x i32> %46 to i64, !dbg !21
50
+ %48 = add i64 %40, %47, !dbg !23
51
+ %49 = sitofp i64 %48 to float, !dbg !26
52
+ %50 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %24, float %49) #3, !dbg !27
53
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
54
+ %51 = and i32 %6, 63, !dbg !29
55
+ %52 = icmp eq i32 %51, 0, !dbg !29
56
+ %53 = bitcast float %50 to i32, !dbg !29
57
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %53, ptr addrspace(1) %0, i1 %52) #3, !dbg !29
58
+ ret void, !dbg !30
59
+ }
60
+
61
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
62
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
63
+
64
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
65
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
66
+
67
+ ; Function Attrs: convergent nocallback nounwind
68
+ declare void @llvm.nvvm.barrier0() #2
69
+
70
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
71
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
72
+ attributes #2 = { convergent nocallback nounwind }
73
+ attributes #3 = { nounwind }
74
+
75
+ !llvm.module.flags = !{!0}
76
+ !llvm.dbg.cu = !{!1}
77
+ !nvvm.annotations = !{!3, !4, !4, !3}
78
+
79
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
80
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
81
+ !2 = !DIFile(filename: "c7zrzealf5bsn7qskl6y72zb73mh5bzf6uskuswp33lv4y5kk64w.py", directory: "/tmp/torchinductor_root/7z")
82
+ !3 = !{ptr @triton__0d1d2d34e, !"kernel", i32 1}
83
+ !4 = !{ptr @triton__0d1d2d34e, !"maxntidx", i32 64}
84
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d34e", linkageName: "triton__0d1d2d34e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
85
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
86
+ !7 = !{}
87
+ !8 = !DILocation(line: 25, column: 34, scope: !5)
88
+ !9 = !DILocation(line: 28, column: 30, scope: !5)
89
+ !10 = !DILocation(line: 28, column: 35, scope: !5)
90
+ !11 = !DILocation(line: 29, column: 30, scope: !5)
91
+ !12 = !DILocation(line: 29, column: 35, scope: !5)
92
+ !13 = !DILocation(line: 243, column: 36, scope: !14, inlinedAt: !16)
93
+ !14 = distinct !DILexicalBlockFile(scope: !5, file: !15, discriminator: 0)
94
+ !15 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
95
+ !16 = !DILocation(line: 32, column: 24, scope: !14)
96
+ !17 = !DILocation(line: 233, column: 15, scope: !18, inlinedAt: !19)
97
+ !18 = distinct !DILexicalBlockFile(scope: !14, file: !15, discriminator: 0)
98
+ !19 = !DILocation(line: 243, column: 36, scope: !18, inlinedAt: !20)
99
+ !20 = !DILocation(line: 32, column: 24, scope: !18)
100
+ !21 = !DILocation(line: 243, column: 36, scope: !14, inlinedAt: !22)
101
+ !22 = !DILocation(line: 35, column: 24, scope: !14)
102
+ !23 = !DILocation(line: 233, column: 15, scope: !18, inlinedAt: !24)
103
+ !24 = !DILocation(line: 243, column: 36, scope: !18, inlinedAt: !25)
104
+ !25 = !DILocation(line: 35, column: 24, scope: !18)
105
+ !26 = !DILocation(line: 36, column: 20, scope: !5)
106
+ !27 = !DILocation(line: 37, column: 19, scope: !5)
107
+ !28 = !DILocation(line: 38, column: 4, scope: !5)
108
+ !29 = !DILocation(line: 39, column: 71, scope: !5)
109
+ !30 = !DILocation(line: 39, column: 4, scope: !5)
.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.llir ADDED
@@ -0,0 +1,600 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [8 x i8] c"<module>"
5
+ @assertFile_1 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
7
+ @assertFunc_0 = internal constant [8 x i8] c"<module>"
8
+ @assertFile_0 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @global_smem = external addrspace(3) global [0 x i8]
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
14
+
15
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
16
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %10 = and i32 %9, 31, !dbg !10
18
+ %11 = lshr i32 %9, 6, !dbg !10
19
+ %12 = and i32 %11, 1, !dbg !10
20
+ %13 = and i32 %9, 1, !dbg !10
21
+ %urem = shl i32 %9, 1, !dbg !11
22
+ %14 = and i32 %urem, 126, !dbg !11
23
+ %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
24
+ %16 = shl i32 %15, 1, !dbg !13
25
+ %17 = or i32 %16, %12, !dbg !14
26
+ %18 = or i32 %16, %13, !dbg !14
27
+ %19 = sext i32 %17 to i64, !dbg !15
28
+ %20 = getelementptr i64, ptr addrspace(1) %0, i64 %19, !dbg !15
29
+ %21 = sext i32 %18 to i64, !dbg !15
30
+ %22 = getelementptr i64, ptr addrspace(1) %0, i64 %21, !dbg !15
31
+ %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
32
+ %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
33
+ %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
34
+ %26 = srem i32 %17, 512, !dbg !17
35
+ %27 = shl nsw i32 %26, 8, !dbg !18
36
+ %28 = shl i32 %17, 8, !dbg !19
37
+ %29 = add i64 %25, 50257, !dbg !20
38
+ %30 = icmp slt i64 %23, 0, !dbg !21
39
+ %31 = icmp slt i64 %25, 0, !dbg !21
40
+ %32 = select i1 %31, i64 %29, i64 %25, !dbg !22
41
+ %33 = icmp ugt i64 %32, 50256, !dbg !23
42
+ %34 = shl i64 %23, 8, !dbg !24
43
+ %35 = add i64 %34, 12865792, !dbg !24
44
+ %36 = select i1 %30, i64 %35, i64 %34, !dbg !24
45
+ %37 = getelementptr float, ptr addrspace(1) %1, i64 %36
46
+ %38 = or i32 %14, %27, !dbg !25
47
+ %39 = sext i32 %38 to i64, !dbg !26
48
+ %40 = getelementptr float, ptr addrspace(1) %2, i64 %39, !dbg !26
49
+ %41 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %40, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !27
50
+ %42 = extractvalue { i32, i32 } %41, 0, !dbg !27
51
+ %43 = extractvalue { i32, i32 } %41, 1, !dbg !27
52
+ %44 = insertelement <2 x i32> poison, i32 %42, i64 0, !dbg !27
53
+ %45 = insertelement <2 x i32> %44, i32 %43, i64 1, !dbg !27
54
+ %46 = bitcast <2 x i32> %45 to <2 x float>, !dbg !27
55
+ %47 = or i32 %14, %28, !dbg !28
56
+ %48 = sext i32 %47 to i64, !dbg !29
57
+ %49 = getelementptr i16, ptr addrspace(1) %3, i64 %48, !dbg !29
58
+ %50 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %49, i1 true, i32 0, i1 true) #6, !dbg !30
59
+ %51 = trunc i32 %50 to i16, !dbg !30
60
+ %extelt.offset2 = lshr i32 %50, 16, !dbg !30
61
+ %52 = trunc i32 %extelt.offset2 to i16, !dbg !30
62
+ %53 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %51) #6, !dbg !31
63
+ %54 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %52) #6, !dbg !31
64
+ br i1 %33, label %55, label %56, !dbg !32
65
+
66
+ 55: ; preds = %8
67
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 1892, ptr nonnull @assertFunc_0, i64 1), !dbg !32
68
+ br label %56, !dbg !32
69
+
70
+ 56: ; preds = %55, %8
71
+ %57 = zext nneg i32 %14 to i64, !dbg !33
72
+ %58 = getelementptr float, ptr addrspace(1) %37, i64 %57, !dbg !34
73
+ %59 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %58, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
74
+ %60 = extractvalue { i32, i32 } %59, 0, !dbg !35
75
+ %61 = extractvalue { i32, i32 } %59, 1, !dbg !35
76
+ %62 = insertelement <2 x i32> poison, i32 %60, i64 0, !dbg !35
77
+ %63 = insertelement <2 x i32> %62, i32 %61, i64 1, !dbg !35
78
+ %64 = bitcast <2 x i32> %63 to <2 x float>, !dbg !35
79
+ %65 = fadd <2 x float> %46, %64, !dbg !36
80
+ %66 = insertelement <2 x float> poison, float %53, i64 0, !dbg !37
81
+ %67 = insertelement <2 x float> %66, float %54, i64 1, !dbg !37
82
+ %68 = fadd <2 x float> %67, %65, !dbg !37
83
+ %69 = extractelement <2 x float> %68, i64 0, !dbg !38
84
+ %70 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %69, float 1.000000e+00) #6, !dbg !38
85
+ %71 = extractelement <2 x float> %68, i64 1, !dbg !38
86
+ %72 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %71, float 1.000000e+00) #6, !dbg !38
87
+ %73 = insertelement <2 x float> poison, float %70, i64 0, !dbg !42
88
+ %74 = insertelement <2 x float> %73, float %72, i64 1, !dbg !42
89
+ %75 = fadd <2 x float> %74, zeroinitializer, !dbg !42
90
+ %76 = fsub <2 x float> %68, %75, !dbg !43
91
+ %77 = fmul <2 x float> %68, %76, !dbg !44
92
+ %78 = fadd <2 x float> %77, zeroinitializer, !dbg !45
93
+ %79 = or i32 %14, 128, !dbg !46
94
+ %80 = or i32 %79, %27, !dbg !25
95
+ %81 = sext i32 %80 to i64, !dbg !26
96
+ %82 = getelementptr float, ptr addrspace(1) %2, i64 %81, !dbg !26
97
+ %83 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %82, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !27
98
+ %84 = extractvalue { i32, i32 } %83, 0, !dbg !27
99
+ %85 = extractvalue { i32, i32 } %83, 1, !dbg !27
100
+ %86 = insertelement <2 x i32> poison, i32 %84, i64 0, !dbg !27
101
+ %87 = insertelement <2 x i32> %86, i32 %85, i64 1, !dbg !27
102
+ %88 = bitcast <2 x i32> %87 to <2 x float>, !dbg !27
103
+ %89 = or i32 %79, %28, !dbg !28
104
+ %90 = sext i32 %89 to i64, !dbg !29
105
+ %91 = getelementptr i16, ptr addrspace(1) %3, i64 %90, !dbg !29
106
+ %92 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %91, i1 true, i32 0, i1 true) #6, !dbg !30
107
+ %93 = trunc i32 %92 to i16, !dbg !30
108
+ %extelt.offset2.1 = lshr i32 %92, 16, !dbg !30
109
+ %94 = trunc i32 %extelt.offset2.1 to i16, !dbg !30
110
+ %95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %93) #6, !dbg !31
111
+ %96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %94) #6, !dbg !31
112
+ br i1 %33, label %97, label %98, !dbg !32
113
+
114
+ 97: ; preds = %56
115
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 1892, ptr nonnull @assertFunc_0, i64 1), !dbg !32
116
+ br label %98, !dbg !32
117
+
118
+ 98: ; preds = %97, %56
119
+ %99 = zext nneg i32 %79 to i64, !dbg !33
120
+ %100 = getelementptr float, ptr addrspace(1) %37, i64 %99, !dbg !34
121
+ %101 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %100, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
122
+ %102 = extractvalue { i32, i32 } %101, 0, !dbg !35
123
+ %103 = extractvalue { i32, i32 } %101, 1, !dbg !35
124
+ %104 = insertelement <2 x i32> poison, i32 %102, i64 0, !dbg !35
125
+ %105 = insertelement <2 x i32> %104, i32 %103, i64 1, !dbg !35
126
+ %106 = bitcast <2 x i32> %105 to <2 x float>, !dbg !35
127
+ %107 = fadd <2 x float> %88, %106, !dbg !36
128
+ %108 = insertelement <2 x float> poison, float %95, i64 0, !dbg !37
129
+ %109 = insertelement <2 x float> %108, float %96, i64 1, !dbg !37
130
+ %110 = fadd <2 x float> %109, %107, !dbg !37
131
+ %111 = fsub <2 x float> %110, %75, !dbg !47
132
+ %112 = extractelement <2 x float> %111, i64 0, !dbg !38
133
+ %113 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %112, float 2.000000e+00) #6, !dbg !38
134
+ %114 = extractelement <2 x float> %111, i64 1, !dbg !38
135
+ %115 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %114, float 2.000000e+00) #6, !dbg !38
136
+ %116 = insertelement <2 x float> poison, float %113, i64 0, !dbg !42
137
+ %117 = insertelement <2 x float> %116, float %115, i64 1, !dbg !42
138
+ %118 = fadd <2 x float> %75, %117, !dbg !42
139
+ %119 = fsub <2 x float> %110, %118, !dbg !43
140
+ %120 = fmul <2 x float> %111, %119, !dbg !44
141
+ %121 = fadd <2 x float> %78, %120, !dbg !45
142
+ %122 = lshr i32 %9, 5, !dbg !10
143
+ %123 = and i32 %122, 1, !dbg !11
144
+ %124 = and i32 %9, 127, !dbg !11
145
+ %125 = zext nneg i32 %124 to i64, !dbg !48
146
+ %126 = getelementptr float, ptr addrspace(3) @global_smem, i64 %125, !dbg !48
147
+ store <1 x float> <float 2.000000e+00>, ptr addrspace(3) %126, align 4, !dbg !48
148
+ %127 = add nuw nsw i32 %124, 130, !dbg !48
149
+ %128 = zext nneg i32 %127 to i64, !dbg !48
150
+ %129 = getelementptr float, ptr addrspace(3) @global_smem, i64 %128, !dbg !48
151
+ store <1 x float> <float 2.000000e+00>, ptr addrspace(3) %129, align 4, !dbg !48
152
+ tail call void @llvm.nvvm.barrier0(), !dbg !48
153
+ %130 = mul nuw nsw i32 %12, 130, !dbg !48
154
+ %131 = add nuw nsw i32 %130, %14, !dbg !48
155
+ %132 = zext nneg i32 %131 to i64, !dbg !48
156
+ %133 = getelementptr float, ptr addrspace(3) @global_smem, i64 %132, !dbg !48
157
+ %134 = load float, ptr addrspace(3) %133, align 8, !dbg !48
158
+ %135 = getelementptr inbounds <2 x float>, ptr addrspace(3) %133, i64 0, i64 1, !dbg !48
159
+ %136 = load float, ptr addrspace(3) %135, align 4, !dbg !48
160
+ tail call void @llvm.nvvm.barrier0(), !dbg !49
161
+ %137 = extractelement <2 x float> %118, i64 0, !dbg !51
162
+ %138 = extractelement <2 x float> %118, i64 1, !dbg !55
163
+ %139 = fsub float %138, %137, !dbg !55
164
+ %140 = fadd float %134, %136, !dbg !56
165
+ %141 = fcmp oeq float %140, 0.000000e+00, !dbg !57
166
+ %142 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %136, float %140) #6, !dbg !58
167
+ %143 = select i1 %141, float 0.000000e+00, float %142, !dbg !59
168
+ %144 = fmul float %139, %143, !dbg !60
169
+ %145 = fadd float %137, %144, !dbg !51
170
+ %shift = shufflevector <2 x float> %121, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !61
171
+ %146 = fadd <2 x float> %121, %shift, !dbg !61
172
+ %147 = extractelement <2 x float> %146, i64 0, !dbg !61
173
+ %148 = fmul float %139, %139, !dbg !62
174
+ %149 = fmul float %148, %134, !dbg !63
175
+ %150 = fmul float %149, %143, !dbg !64
176
+ %151 = fadd float %147, %150, !dbg !65
177
+ %152 = bitcast float %145 to i32, !dbg !49
178
+ %153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 16, i32 31), !dbg !49
179
+ %154 = bitcast i32 %153 to float, !dbg !49
180
+ %155 = bitcast float %151 to i32, !dbg !49
181
+ %156 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %155, i32 16, i32 31), !dbg !49
182
+ %157 = bitcast i32 %156 to float, !dbg !49
183
+ %158 = bitcast float %140 to i32, !dbg !49
184
+ %159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %158, i32 16, i32 31), !dbg !49
185
+ %160 = bitcast i32 %159 to float, !dbg !49
186
+ %161 = fsub float %154, %145, !dbg !55
187
+ %162 = fadd float %140, %160, !dbg !56
188
+ %163 = fcmp oeq float %162, 0.000000e+00, !dbg !57
189
+ %164 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %160, float %162) #6, !dbg !58
190
+ %165 = select i1 %163, float 0.000000e+00, float %164, !dbg !59
191
+ %166 = fmul float %161, %165, !dbg !60
192
+ %167 = fadd float %145, %166, !dbg !51
193
+ %168 = fadd float %151, %157, !dbg !61
194
+ %169 = fmul float %161, %161, !dbg !62
195
+ %170 = fmul float %140, %169, !dbg !63
196
+ %171 = fmul float %170, %165, !dbg !64
197
+ %172 = fadd float %168, %171, !dbg !65
198
+ %173 = bitcast float %167 to i32, !dbg !49
199
+ %174 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %173, i32 8, i32 31), !dbg !49
200
+ %175 = bitcast i32 %174 to float, !dbg !49
201
+ %176 = bitcast float %172 to i32, !dbg !49
202
+ %177 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 8, i32 31), !dbg !49
203
+ %178 = bitcast i32 %177 to float, !dbg !49
204
+ %179 = bitcast float %162 to i32, !dbg !49
205
+ %180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 8, i32 31), !dbg !49
206
+ %181 = bitcast i32 %180 to float, !dbg !49
207
+ %182 = fsub float %175, %167, !dbg !55
208
+ %183 = fadd float %162, %181, !dbg !56
209
+ %184 = fcmp oeq float %183, 0.000000e+00, !dbg !57
210
+ %185 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %181, float %183) #6, !dbg !58
211
+ %186 = select i1 %184, float 0.000000e+00, float %185, !dbg !59
212
+ %187 = fmul float %182, %186, !dbg !60
213
+ %188 = fadd float %167, %187, !dbg !51
214
+ %189 = fadd float %172, %178, !dbg !61
215
+ %190 = fmul float %182, %182, !dbg !62
216
+ %191 = fmul float %162, %190, !dbg !63
217
+ %192 = fmul float %186, %191, !dbg !64
218
+ %193 = fadd float %189, %192, !dbg !65
219
+ %194 = bitcast float %188 to i32, !dbg !49
220
+ %195 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %194, i32 4, i32 31), !dbg !49
221
+ %196 = bitcast i32 %195 to float, !dbg !49
222
+ %197 = bitcast float %193 to i32, !dbg !49
223
+ %198 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 4, i32 31), !dbg !49
224
+ %199 = bitcast i32 %198 to float, !dbg !49
225
+ %200 = bitcast float %183 to i32, !dbg !49
226
+ %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 4, i32 31), !dbg !49
227
+ %202 = bitcast i32 %201 to float, !dbg !49
228
+ %203 = fsub float %196, %188, !dbg !55
229
+ %204 = fadd float %183, %202, !dbg !56
230
+ %205 = fcmp oeq float %204, 0.000000e+00, !dbg !57
231
+ %206 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %202, float %204) #6, !dbg !58
232
+ %207 = select i1 %205, float 0.000000e+00, float %206, !dbg !59
233
+ %208 = fmul float %203, %207, !dbg !60
234
+ %209 = fadd float %188, %208, !dbg !51
235
+ %210 = fadd float %193, %199, !dbg !61
236
+ %211 = fmul float %203, %203, !dbg !62
237
+ %212 = fmul float %183, %211, !dbg !63
238
+ %213 = fmul float %207, %212, !dbg !64
239
+ %214 = fadd float %210, %213, !dbg !65
240
+ %215 = bitcast float %209 to i32, !dbg !49
241
+ %216 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %215, i32 2, i32 31), !dbg !49
242
+ %217 = bitcast i32 %216 to float, !dbg !49
243
+ %218 = bitcast float %214 to i32, !dbg !49
244
+ %219 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %218, i32 2, i32 31), !dbg !49
245
+ %220 = bitcast i32 %219 to float, !dbg !49
246
+ %221 = bitcast float %204 to i32, !dbg !49
247
+ %222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 2, i32 31), !dbg !49
248
+ %223 = bitcast i32 %222 to float, !dbg !49
249
+ %224 = fsub float %217, %209, !dbg !55
250
+ %225 = fadd float %204, %223, !dbg !56
251
+ %226 = fcmp oeq float %225, 0.000000e+00, !dbg !57
252
+ %227 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %223, float %225) #6, !dbg !58
253
+ %228 = select i1 %226, float 0.000000e+00, float %227, !dbg !59
254
+ %229 = fmul float %224, %228, !dbg !60
255
+ %230 = fadd float %209, %229, !dbg !51
256
+ %231 = fadd float %214, %220, !dbg !61
257
+ %232 = fmul float %224, %224, !dbg !62
258
+ %233 = fmul float %204, %232, !dbg !63
259
+ %234 = fmul float %228, %233, !dbg !64
260
+ %235 = fadd float %231, %234, !dbg !65
261
+ %236 = bitcast float %230 to i32, !dbg !49
262
+ %237 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %236, i32 1, i32 31), !dbg !49
263
+ %238 = bitcast i32 %237 to float, !dbg !49
264
+ %239 = bitcast float %235 to i32, !dbg !49
265
+ %240 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %239, i32 1, i32 31), !dbg !49
266
+ %241 = bitcast i32 %240 to float, !dbg !49
267
+ %242 = bitcast float %225 to i32, !dbg !49
268
+ %243 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %242, i32 1, i32 31), !dbg !49
269
+ %244 = bitcast i32 %243 to float, !dbg !49
270
+ %245 = fsub float %238, %230, !dbg !55
271
+ %246 = fadd float %225, %244, !dbg !56
272
+ %247 = fcmp oeq float %246, 0.000000e+00, !dbg !57
273
+ %248 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %244, float %246) #6, !dbg !58
274
+ %249 = select i1 %247, float 0.000000e+00, float %248, !dbg !59
275
+ %250 = fmul float %245, %249, !dbg !60
276
+ %251 = fadd float %230, %250, !dbg !51
277
+ %252 = fadd float %235, %241, !dbg !61
278
+ %253 = fmul float %245, %245, !dbg !62
279
+ %254 = fmul float %225, %253, !dbg !63
280
+ %255 = fmul float %249, %254, !dbg !64
281
+ %256 = fadd float %252, %255, !dbg !65
282
+ %257 = icmp eq i32 %10, 0, !dbg !49
283
+ %258 = shl nuw nsw i32 %12, 1, !dbg !49
284
+ %259 = or i32 %258, %123, !dbg !49
285
+ %260 = zext nneg i32 %259 to i64, !dbg !49
286
+ %261 = getelementptr float, ptr addrspace(3) @global_smem, i64 %260, !dbg !49
287
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %261, float %251, i1 %257) #6, !dbg !49
288
+ %262 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %260, !dbg !49
289
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %262, float %256, i1 %257) #6, !dbg !49
290
+ %263 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %260, !dbg !49
291
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %263, float %246, i1 %257) #6, !dbg !49
292
+ tail call void @llvm.nvvm.barrier0(), !dbg !49
293
+ %264 = icmp slt i32 %9, 4, !dbg !49
294
+ %265 = sext i32 %9 to i64, !dbg !49
295
+ %266 = getelementptr float, ptr addrspace(3) @global_smem, i64 %265, !dbg !49
296
+ %267 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %266, i1 %264) #6, !dbg !49
297
+ %268 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %265, !dbg !49
298
+ %269 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %268, i1 %264) #6, !dbg !49
299
+ %270 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %265, !dbg !49
300
+ %271 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %270, i1 %264) #6, !dbg !49
301
+ %272 = bitcast float %267 to i32, !dbg !49
302
+ %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 1, i32 31), !dbg !49
303
+ %274 = bitcast i32 %273 to float, !dbg !49
304
+ %275 = bitcast float %269 to i32, !dbg !49
305
+ %276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %275, i32 1, i32 31), !dbg !49
306
+ %277 = bitcast i32 %276 to float, !dbg !49
307
+ %278 = bitcast float %271 to i32, !dbg !49
308
+ %279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %278, i32 1, i32 31), !dbg !49
309
+ %280 = bitcast i32 %279 to float, !dbg !49
310
+ %281 = fsub float %274, %267, !dbg !55
311
+ %282 = fadd float %271, %280, !dbg !56
312
+ %283 = fcmp oeq float %282, 0.000000e+00, !dbg !57
313
+ %284 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %280, float %282) #6, !dbg !58
314
+ %285 = select i1 %283, float 0.000000e+00, float %284, !dbg !59
315
+ %286 = fmul float %281, %285, !dbg !60
316
+ %287 = fadd float %267, %286, !dbg !51
317
+ %288 = fadd float %269, %277, !dbg !61
318
+ %289 = fmul float %281, %281, !dbg !62
319
+ %290 = fmul float %271, %289, !dbg !63
320
+ %291 = fmul float %290, %285, !dbg !64
321
+ %292 = fadd float %288, %291, !dbg !65
322
+ %293 = icmp eq i32 %13, 0, !dbg !49
323
+ %294 = and i1 %264, %293, !dbg !49
324
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %266, float %287, i1 %294) #6, !dbg !49
325
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %268, float %292, i1 %294) #6, !dbg !49
326
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %270, float %282, i1 %294) #6, !dbg !49
327
+ tail call void @llvm.nvvm.barrier0(), !dbg !49
328
+ %295 = zext nneg i32 %258 to i64, !dbg !49
329
+ %296 = getelementptr float, ptr addrspace(3) @global_smem, i64 %295, !dbg !49
330
+ %297 = load float, ptr addrspace(3) %296, align 4, !dbg !49
331
+ %298 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %295, !dbg !49
332
+ %299 = load float, ptr addrspace(3) %298, align 4, !dbg !49
333
+ %300 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %299, float 2.560000e+02) #6, !dbg !66
334
+ %301 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %299, float 2.560000e+02) #6, !dbg !66
335
+ %302 = fadd float %300, 0x3EE4F8B580000000, !dbg !67
336
+ %303 = getelementptr float, ptr addrspace(3) @global_smem, i64 %57
337
+ %304 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %40, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
338
+ %305 = extractvalue { i32, i32 } %304, 0, !dbg !68
339
+ %306 = extractvalue { i32, i32 } %304, 1, !dbg !68
340
+ %307 = bitcast i32 %305 to float, !dbg !68
341
+ %308 = bitcast i32 %306 to float, !dbg !68
342
+ %309 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %49, i1 true, i32 0, i1 true) #6, !dbg !69
343
+ %310 = trunc i32 %309 to i16, !dbg !69
344
+ %extelt.offset = lshr i32 %309, 16, !dbg !69
345
+ %311 = trunc i32 %extelt.offset to i16, !dbg !69
346
+ %312 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %310) #6, !dbg !70
347
+ %313 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %311) #6, !dbg !70
348
+ %314 = getelementptr float, ptr addrspace(1) %4, i64 %125, !dbg !71
349
+ %315 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %314, i1 true, i32 0, i1 true) #6, !dbg !72
350
+ br i1 %33, label %316, label %317, !dbg !73
351
+
352
+ 316: ; preds = %98
353
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 1892, ptr nonnull @assertFunc_1, i64 1), !dbg !73
354
+ br label %317, !dbg !73
355
+
356
+ 317: ; preds = %316, %98
357
+ %318 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %58, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !74
358
+ %319 = extractvalue { i32, i32 } %318, 0, !dbg !74
359
+ %320 = extractvalue { i32, i32 } %318, 1, !dbg !74
360
+ %321 = bitcast i32 %319 to float, !dbg !74
361
+ %322 = bitcast i32 %320 to float, !dbg !74
362
+ %323 = fadd float %307, %321, !dbg !75
363
+ %324 = fadd float %308, %322, !dbg !75
364
+ %325 = fadd float %312, %323, !dbg !76
365
+ %326 = fadd float %313, %324, !dbg !76
366
+ %327 = fsub float %325, %297, !dbg !77
367
+ %328 = fsub float %326, %297, !dbg !77
368
+ %329 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !78
369
+ %.not.i = icmp eq i32 %329, 0, !dbg !78
370
+ br i1 %.not.i, label %332, label %330, !dbg !78
371
+
372
+ 330: ; preds = %317
373
+ %331 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %302), !dbg !78
374
+ br label %__nv_rsqrtf.exit, !dbg !78
375
+
376
+ 332: ; preds = %317
377
+ %333 = tail call float @llvm.nvvm.rsqrt.approx.f(float %302), !dbg !78
378
+ br label %__nv_rsqrtf.exit, !dbg !78
379
+
380
+ __nv_rsqrtf.exit: ; preds = %330, %332
381
+ %.0.i = phi float [ %331, %330 ], [ %333, %332 ], !dbg !78
382
+ %334 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !78
383
+ %335 = fmul float %327, %.0.i, !dbg !79
384
+ %336 = fmul float %328, %.0.i, !dbg !79
385
+ tail call void @llvm.nvvm.barrier0(), !dbg !80
386
+ store i32 %315, ptr addrspace(3) %126, align 4, !dbg !80
387
+ tail call void @llvm.nvvm.barrier0(), !dbg !80
388
+ %337 = load float, ptr addrspace(3) %303, align 8, !dbg !80
389
+ %338 = getelementptr inbounds <2 x float>, ptr addrspace(3) %303, i64 0, i64 1, !dbg !80
390
+ %339 = load float, ptr addrspace(3) %338, align 4, !dbg !80
391
+ %340 = fmul float %335, %337, !dbg !80
392
+ %341 = fmul float %336, %339, !dbg !80
393
+ %342 = getelementptr i16, ptr addrspace(1) %5, i64 %48, !dbg !81
394
+ %343 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %340) #6, !dbg !82
395
+ %344 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %341) #6, !dbg !82
396
+ %345 = insertelement <2 x i16> undef, i16 %343, i64 0, !dbg !82
397
+ %346 = insertelement <2 x i16> %345, i16 %344, i64 1, !dbg !82
398
+ %347 = bitcast <2 x i16> %346 to i32, !dbg !82
399
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %347, ptr addrspace(1) %342, i1 true) #6, !dbg !82
400
+ %348 = or i32 %124, 128, !dbg !83
401
+ %349 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %82, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
402
+ %350 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %91, i1 true, i32 0, i1 true) #6, !dbg !69
403
+ %351 = trunc i32 %350 to i16, !dbg !69
404
+ %extelt.offset.1 = lshr i32 %350, 16, !dbg !69
405
+ %352 = trunc i32 %extelt.offset.1 to i16, !dbg !69
406
+ %353 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %351) #6, !dbg !70
407
+ %354 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %352) #6, !dbg !70
408
+ %355 = zext nneg i32 %348 to i64, !dbg !71
409
+ %356 = getelementptr float, ptr addrspace(1) %4, i64 %355, !dbg !71
410
+ %357 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %356, i1 true, i32 0, i1 true) #6, !dbg !72
411
+ br i1 %33, label %358, label %359, !dbg !73
412
+
413
+ 358: ; preds = %__nv_rsqrtf.exit
414
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 1892, ptr nonnull @assertFunc_1, i64 1), !dbg !73
415
+ br label %359, !dbg !73
416
+
417
+ 359: ; preds = %358, %__nv_rsqrtf.exit
418
+ %360 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %100, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !74
419
+ %361 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !78
420
+ %.not.i.1 = icmp eq i32 %361, 0, !dbg !78
421
+ br i1 %.not.i.1, label %364, label %362, !dbg !78
422
+
423
+ 362: ; preds = %359
424
+ %363 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %302), !dbg !78
425
+ br label %__nv_rsqrtf.exit.1, !dbg !78
426
+
427
+ 364: ; preds = %359
428
+ %365 = tail call float @llvm.nvvm.rsqrt.approx.f(float %302), !dbg !78
429
+ br label %__nv_rsqrtf.exit.1, !dbg !78
430
+
431
+ __nv_rsqrtf.exit.1: ; preds = %364, %362
432
+ %.0.i.1 = phi float [ %363, %362 ], [ %365, %364 ], !dbg !78
433
+ %366 = extractvalue { i32, i32 } %349, 1, !dbg !68
434
+ %367 = bitcast i32 %366 to float, !dbg !68
435
+ %368 = extractvalue { i32, i32 } %360, 1, !dbg !74
436
+ %369 = bitcast i32 %368 to float, !dbg !74
437
+ %370 = fadd float %367, %369, !dbg !75
438
+ %371 = fadd float %354, %370, !dbg !76
439
+ %372 = fsub float %371, %297, !dbg !77
440
+ %373 = extractvalue { i32, i32 } %349, 0, !dbg !68
441
+ %374 = bitcast i32 %373 to float, !dbg !68
442
+ %375 = extractvalue { i32, i32 } %360, 0, !dbg !74
443
+ %376 = bitcast i32 %375 to float, !dbg !74
444
+ %377 = fadd float %374, %376, !dbg !75
445
+ %378 = fadd float %353, %377, !dbg !76
446
+ %379 = fsub float %378, %297, !dbg !77
447
+ %380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !78
448
+ %381 = fmul float %379, %.0.i.1, !dbg !79
449
+ %382 = fmul float %372, %.0.i.1, !dbg !79
450
+ tail call void @llvm.nvvm.barrier0(), !dbg !80
451
+ store i32 %357, ptr addrspace(3) %126, align 4, !dbg !80
452
+ tail call void @llvm.nvvm.barrier0(), !dbg !80
453
+ %383 = load float, ptr addrspace(3) %303, align 8, !dbg !80
454
+ %384 = load float, ptr addrspace(3) %338, align 4, !dbg !80
455
+ %385 = fmul float %381, %383, !dbg !80
456
+ %386 = fmul float %382, %384, !dbg !80
457
+ %387 = getelementptr i16, ptr addrspace(1) %5, i64 %90, !dbg !81
458
+ %388 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %385) #6, !dbg !82
459
+ %389 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %386) #6, !dbg !82
460
+ %390 = insertelement <2 x i16> undef, i16 %388, i64 0, !dbg !82
461
+ %391 = insertelement <2 x i16> %390, i16 %389, i64 1, !dbg !82
462
+ %392 = bitcast <2 x i16> %391 to i32, !dbg !82
463
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %392, ptr addrspace(1) %387, i1 true) #6, !dbg !82
464
+ ret void, !dbg !84
465
+ }
466
+
467
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
468
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
469
+
470
+ ; Function Attrs: convergent nocallback nounwind
471
+ declare void @llvm.nvvm.barrier0() #1
472
+
473
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
474
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
475
+
476
+ ; Function Attrs: alwaysinline nounwind
477
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
478
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
479
+ %.not = icmp eq i32 %1, 0
480
+ br i1 %.not, label %4, label %2
481
+
482
+ 2: ; preds = %0
483
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
484
+ br label %6
485
+
486
+ 4: ; preds = %0
487
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
488
+ br label %6
489
+
490
+ 6: ; preds = %4, %2
491
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
492
+ ret float %.0
493
+ }
494
+
495
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
496
+
497
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
498
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
499
+
500
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
501
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
502
+
503
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
504
+ attributes #1 = { convergent nocallback nounwind }
505
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
506
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
507
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
508
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
509
+ attributes #6 = { nounwind }
510
+
511
+ !llvm.module.flags = !{!0, !1}
512
+ !llvm.dbg.cu = !{!2}
513
+ !nvvm.annotations = !{!4, !5, !5, !4}
514
+ !llvm.ident = !{!6}
515
+
516
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
517
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
518
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
519
+ !3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
520
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
521
+ !5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 128}
522
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
523
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
524
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
525
+ !9 = !{}
526
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
527
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
528
+ !12 = !DILocation(line: 21, column: 28, scope: !7)
529
+ !13 = !DILocation(line: 21, column: 33, scope: !7)
530
+ !14 = !DILocation(line: 22, column: 23, scope: !7)
531
+ !15 = !DILocation(line: 26, column: 30, scope: !7)
532
+ !16 = !DILocation(line: 26, column: 35, scope: !7)
533
+ !17 = !DILocation(line: 27, column: 18, scope: !7)
534
+ !18 = !DILocation(line: 35, column: 44, scope: !7)
535
+ !19 = !DILocation(line: 36, column: 44, scope: !7)
536
+ !20 = !DILocation(line: 37, column: 22, scope: !7)
537
+ !21 = !DILocation(line: 38, column: 22, scope: !7)
538
+ !22 = !DILocation(line: 39, column: 36, scope: !7)
539
+ !23 = !DILocation(line: 40, column: 40, scope: !7)
540
+ !24 = !DILocation(line: 41, column: 44, scope: !7)
541
+ !25 = !DILocation(line: 35, column: 40, scope: !7)
542
+ !26 = !DILocation(line: 35, column: 34, scope: !7)
543
+ !27 = !DILocation(line: 35, column: 50, scope: !7)
544
+ !28 = !DILocation(line: 36, column: 40, scope: !7)
545
+ !29 = !DILocation(line: 36, column: 34, scope: !7)
546
+ !30 = !DILocation(line: 36, column: 50, scope: !7)
547
+ !31 = !DILocation(line: 36, column: 101, scope: !7)
548
+ !32 = !DILocation(line: 40, column: 55, scope: !7)
549
+ !33 = !DILocation(line: 41, column: 40, scope: !7)
550
+ !34 = !DILocation(line: 41, column: 34, scope: !7)
551
+ !35 = !DILocation(line: 41, column: 52, scope: !7)
552
+ !36 = !DILocation(line: 42, column: 22, scope: !7)
553
+ !37 = !DILocation(line: 44, column: 22, scope: !7)
554
+ !38 = !DILocation(line: 98, column: 30, scope: !39, inlinedAt: !41)
555
+ !39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
556
+ !40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
557
+ !41 = !DILocation(line: 47, column: 41, scope: !39)
558
+ !42 = !DILocation(line: 98, column: 22, scope: !39, inlinedAt: !41)
559
+ !43 = !DILocation(line: 101, column: 30, scope: !39, inlinedAt: !41)
560
+ !44 = !DILocation(line: 101, column: 22, scope: !39, inlinedAt: !41)
561
+ !45 = !DILocation(line: 50, column: 50, scope: !7)
562
+ !46 = !DILocation(line: 32, column: 27, scope: !7)
563
+ !47 = !DILocation(line: 96, column: 20, scope: !39, inlinedAt: !41)
564
+ !48 = !DILocation(line: 31, column: 36, scope: !7)
565
+ !49 = !DILocation(line: 120, column: 46, scope: !39, inlinedAt: !50)
566
+ !50 = !DILocation(line: 53, column: 44, scope: !39)
567
+ !51 = !DILocation(line: 112, column: 17, scope: !52, inlinedAt: !53)
568
+ !52 = distinct !DILexicalBlockFile(scope: !39, file: !40, discriminator: 0)
569
+ !53 = !DILocation(line: 120, column: 46, scope: !52, inlinedAt: !54)
570
+ !54 = !DILocation(line: 53, column: 44, scope: !52)
571
+ !55 = !DILocation(line: 108, column: 21, scope: !52, inlinedAt: !53)
572
+ !56 = !DILocation(line: 109, column: 28, scope: !52, inlinedAt: !53)
573
+ !57 = !DILocation(line: 110, column: 39, scope: !52, inlinedAt: !53)
574
+ !58 = !DILocation(line: 110, column: 60, scope: !52, inlinedAt: !53)
575
+ !59 = !DILocation(line: 110, column: 49, scope: !52, inlinedAt: !53)
576
+ !60 = !DILocation(line: 112, column: 25, scope: !52, inlinedAt: !53)
577
+ !61 = !DILocation(line: 113, column: 15, scope: !52, inlinedAt: !53)
578
+ !62 = !DILocation(line: 113, column: 30, scope: !52, inlinedAt: !53)
579
+ !63 = !DILocation(line: 113, column: 38, scope: !52, inlinedAt: !53)
580
+ !64 = !DILocation(line: 113, column: 49, scope: !52, inlinedAt: !53)
581
+ !65 = !DILocation(line: 113, column: 22, scope: !52, inlinedAt: !53)
582
+ !66 = !DILocation(line: 75, column: 24, scope: !7)
583
+ !67 = !DILocation(line: 77, column: 24, scope: !7)
584
+ !68 = !DILocation(line: 62, column: 51, scope: !7)
585
+ !69 = !DILocation(line: 63, column: 51, scope: !7)
586
+ !70 = !DILocation(line: 63, column: 103, scope: !7)
587
+ !71 = !DILocation(line: 64, column: 35, scope: !7)
588
+ !72 = !DILocation(line: 64, column: 40, scope: !7)
589
+ !73 = !DILocation(line: 68, column: 57, scope: !7)
590
+ !74 = !DILocation(line: 69, column: 54, scope: !7)
591
+ !75 = !DILocation(line: 70, column: 24, scope: !7)
592
+ !76 = !DILocation(line: 72, column: 24, scope: !7)
593
+ !77 = !DILocation(line: 73, column: 24, scope: !7)
594
+ !78 = !DILocation(line: 78, column: 30, scope: !7)
595
+ !79 = !DILocation(line: 79, column: 24, scope: !7)
596
+ !80 = !DILocation(line: 80, column: 24, scope: !7)
597
+ !81 = !DILocation(line: 82, column: 29, scope: !7)
598
+ !82 = !DILocation(line: 82, column: 52, scope: !7)
599
+ !83 = !DILocation(line: 59, column: 27, scope: !7)
600
+ !84 = !DILocation(line: 58, column: 4, scope: !7)
.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ttgir ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<512> : tensor<2x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x128xi32, #blocked>
8
+ %cst_1 = arith.constant dense<256> : tensor<2x1xi32, #blocked>
9
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<2x128xf32, #blocked>
10
+ %cst_3 = arith.constant dense<1.000000e+00> : tensor<2x128xf32, #blocked>
11
+ %cst_4 = arith.constant dense<256> : tensor<2x1xi64, #blocked>
12
+ %cst_5 = arith.constant dense<0> : tensor<2x1xi64, #blocked>
13
+ %cst_6 = arith.constant dense<50257> : tensor<2x1xi64, #blocked>
14
+ %cst_7 = arith.constant dense<50257> : tensor<2x1xi64, #blocked1>
15
+ %cst_8 = arith.constant dense<0> : tensor<2x1xi64, #blocked1>
16
+ %c0_i32 = arith.constant 0 : i32
17
+ %c128_i32 = arith.constant 128 : i32
18
+ %c256_i32 = arith.constant 256 : i32
19
+ %cst_9 = arith.constant dense<1.000000e+00> : tensor<2x128xf32, #blocked2>
20
+ %cst_10 = arith.constant 0.000000e+00 : f32
21
+ %cst_11 = arith.constant dense<0.000000e+00> : tensor<1x128xf32, #blocked2>
22
+ %cst_12 = arith.constant dense<0.000000e+00> : tensor<2x128xf32, #blocked2>
23
+ %cst_13 = arith.constant dense<256> : tensor<1x128xi32, #blocked2>
24
+ %cst_14 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32, #blocked>
25
+ %cst_15 = arith.constant dense<2.560000e+02> : tensor<2x1xf32, #blocked>
26
+ %cst_16 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16, #blocked>
27
+ %c2_i32 = arith.constant 2 : i32
28
+ %0 = tt.get_program_id x : i32
29
+ %1 = arith.muli %0, %c2_i32 : i32
30
+ %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
31
+ %3 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
32
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xi32, #blocked>
33
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<2x1xi32, #blocked1>
34
+ %6 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked>
35
+ %7 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked1>
36
+ %8 = arith.addi %6, %4 : tensor<2x1xi32, #blocked>
37
+ %9 = arith.addi %7, %5 : tensor<2x1xi32, #blocked1>
38
+ %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
39
+ %11 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
40
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x128xi32, #blocked>
41
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x128xi32, #blocked2>
42
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked>
43
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked1>
44
+ %16 = tt.addptr %14, %8 : tensor<2x1x!tt.ptr<i64, 1>, #blocked>, tensor<2x1xi32, #blocked>
45
+ %17 = tt.addptr %15, %9 : tensor<2x1x!tt.ptr<i64, 1>, #blocked1>, tensor<2x1xi32, #blocked1>
46
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked>
47
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked1>
48
+ %20 = arith.remsi %8, %cst : tensor<2x1xi32, #blocked>
49
+ %21 = arith.muli %20, %cst_1 : tensor<2x1xi32, #blocked>
50
+ %22 = tt.broadcast %21 : (tensor<2x1xi32, #blocked>) -> tensor<2x128xi32, #blocked>
51
+ %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x128x!tt.ptr<f32, 1>, #blocked>
52
+ %24 = arith.muli %8, %cst_1 : tensor<2x1xi32, #blocked>
53
+ %25 = tt.broadcast %24 : (tensor<2x1xi32, #blocked>) -> tensor<2x128xi32, #blocked>
54
+ %26 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<2x128x!tt.ptr<bf16, 1>, #blocked>
55
+ %27 = arith.addi %18, %cst_6 : tensor<2x1xi64, #blocked>
56
+ %28 = arith.addi %19, %cst_7 : tensor<2x1xi64, #blocked1>
57
+ %29 = arith.cmpi slt, %18, %cst_5 : tensor<2x1xi64, #blocked>
58
+ %30 = arith.cmpi slt, %19, %cst_8 : tensor<2x1xi64, #blocked1>
59
+ %31 = arith.select %29, %27, %18 : tensor<2x1xi1, #blocked>, tensor<2x1xi64, #blocked>
60
+ %32 = arith.select %30, %28, %19 : tensor<2x1xi1, #blocked1>, tensor<2x1xi64, #blocked1>
61
+ %33 = arith.cmpi sge, %32, %cst_8 : tensor<2x1xi64, #blocked1>
62
+ %34 = arith.cmpi slt, %32, %cst_7 : tensor<2x1xi64, #blocked1>
63
+ %35 = arith.andi %33, %34 : tensor<2x1xi1, #blocked1>
64
+ %36 = arith.muli %31, %cst_4 : tensor<2x1xi64, #blocked>
65
+ %37 = tt.broadcast %36 : (tensor<2x1xi64, #blocked>) -> tensor<2x128xi64, #blocked>
66
+ %38 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x128x!tt.ptr<f32, 1>, #blocked>
67
+ %39:4 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 iter_args(%arg9 = %cst_2, %arg10 = %cst_2, %arg11 = %cst_12, %arg12 = %cst_2) -> (tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked2>, tensor<2x128xf32, #blocked>) : i32 {
68
+ %49 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked>
69
+ %50 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked2>
70
+ %51 = arith.addi %49, %12 : tensor<1x128xi32, #blocked>
71
+ %52 = arith.addi %50, %13 : tensor<1x128xi32, #blocked2>
72
+ %53 = arith.cmpi slt, %51, %cst_0 : tensor<1x128xi32, #blocked>
73
+ %54 = arith.cmpi slt, %52, %cst_13 : tensor<1x128xi32, #blocked2>
74
+ %55 = tt.broadcast %51 : (tensor<1x128xi32, #blocked>) -> tensor<2x128xi32, #blocked>
75
+ %56 = arith.addi %55, %22 : tensor<2x128xi32, #blocked>
76
+ %57 = tt.addptr %23, %56 : tensor<2x128x!tt.ptr<f32, 1>, #blocked>, tensor<2x128xi32, #blocked>
77
+ %58 = tt.broadcast %53 : (tensor<1x128xi1, #blocked>) -> tensor<2x128xi1, #blocked>
78
+ %59 = tt.broadcast %54 : (tensor<1x128xi1, #blocked2>) -> tensor<2x128xi1, #blocked2>
79
+ %60 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x128xf32, #blocked>
80
+ %61 = arith.addi %55, %25 : tensor<2x128xi32, #blocked>
81
+ %62 = tt.addptr %26, %61 : tensor<2x128x!tt.ptr<bf16, 1>, #blocked>, tensor<2x128xi32, #blocked>
82
+ %63 = tt.load %62, %58, %cst_16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x128xbf16, #blocked>
83
+ %64 = arith.extf %63 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked>
84
+ tt.assert %35, "index out of bounds: 0 <= tmp3 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<2x1xi1, #blocked1>
85
+ %65 = arith.extsi %51 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked>
86
+ %66 = tt.broadcast %65 : (tensor<1x128xi64, #blocked>) -> tensor<2x128xi64, #blocked>
87
+ %67 = arith.addi %66, %37 : tensor<2x128xi64, #blocked>
88
+ %68 = tt.addptr %38, %67 : tensor<2x128x!tt.ptr<f32, 1>, #blocked>, tensor<2x128xi64, #blocked>
89
+ %69 = tt.load %68, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x128xf32, #blocked>
90
+ %70 = arith.addf %69, %60 : tensor<2x128xf32, #blocked>
91
+ %71 = arith.addf %70, %64 : tensor<2x128xf32, #blocked>
92
+ %72 = arith.subf %71, %arg9 : tensor<2x128xf32, #blocked>
93
+ %73 = arith.addf %arg12, %cst_3 : tensor<2x128xf32, #blocked>
94
+ %74 = arith.addf %arg11, %cst_9 : tensor<2x128xf32, #blocked2>
95
+ %75 = arith.divf %72, %73 : tensor<2x128xf32, #blocked>
96
+ %76 = arith.addf %arg9, %75 : tensor<2x128xf32, #blocked>
97
+ %77 = arith.subf %71, %76 : tensor<2x128xf32, #blocked>
98
+ %78 = arith.mulf %72, %77 : tensor<2x128xf32, #blocked>
99
+ %79 = arith.addf %arg10, %78 : tensor<2x128xf32, #blocked>
100
+ %80 = arith.select %58, %76, %arg9 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked>
101
+ %81 = arith.select %58, %79, %arg10 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked>
102
+ %82 = arith.select %58, %73, %arg12 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked>
103
+ %83 = arith.select %59, %74, %arg11 : tensor<2x128xi1, #blocked2>, tensor<2x128xf32, #blocked2>
104
+ scf.yield %80, %81, %83, %82 : tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked2>, tensor<2x128xf32, #blocked>
105
+ }
106
+ %40 = triton_gpu.convert_layout %39#2 : (tensor<2x128xf32, #blocked2>) -> tensor<2x128xf32, #blocked>
107
+ %41:3 = "tt.reduce"(%39#0, %39#1, %40) <{axis = 1 : i32}> ({
108
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
109
+ %49 = arith.subf %arg11, %arg8 : f32
110
+ %50 = arith.addf %arg10, %arg13 : f32
111
+ %51 = arith.cmpf oeq, %50, %cst_10 : f32
112
+ %52 = arith.divf %arg13, %50 : f32
113
+ %53 = arith.select %51, %cst_10, %52 : f32
114
+ %54 = arith.mulf %49, %53 : f32
115
+ %55 = arith.addf %arg8, %54 : f32
116
+ %56 = arith.addf %arg9, %arg12 : f32
117
+ %57 = arith.mulf %49, %49 : f32
118
+ %58 = arith.mulf %57, %arg10 : f32
119
+ %59 = arith.mulf %58, %53 : f32
120
+ %60 = arith.addf %56, %59 : f32
121
+ tt.reduce.return %55, %60, %50 : f32, f32, f32
122
+ }) : (tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked>) -> (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
123
+ %42 = tt.expand_dims %41#0 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
124
+ %43 = tt.expand_dims %41#1 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
125
+ %44 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x128x!tt.ptr<f32, 1>, #blocked2>
126
+ %45 = tt.broadcast %42 : (tensor<2x1xf32, #blocked>) -> tensor<2x128xf32, #blocked>
127
+ %46 = arith.divf %43, %cst_15 : tensor<2x1xf32, #blocked>
128
+ %47 = arith.addf %46, %cst_14 : tensor<2x1xf32, #blocked>
129
+ %48 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<2x128x!tt.ptr<bf16, 1>, #blocked>
130
+ scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 : i32 {
131
+ %49 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked>
132
+ %50 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked2>
133
+ %51 = arith.addi %49, %12 : tensor<1x128xi32, #blocked>
134
+ %52 = arith.addi %50, %13 : tensor<1x128xi32, #blocked2>
135
+ %53 = arith.cmpi slt, %51, %cst_0 : tensor<1x128xi32, #blocked>
136
+ %54 = arith.cmpi slt, %52, %cst_13 : tensor<1x128xi32, #blocked2>
137
+ %55 = tt.broadcast %51 : (tensor<1x128xi32, #blocked>) -> tensor<2x128xi32, #blocked>
138
+ %56 = arith.addi %55, %22 : tensor<2x128xi32, #blocked>
139
+ %57 = tt.addptr %23, %56 : tensor<2x128x!tt.ptr<f32, 1>, #blocked>, tensor<2x128xi32, #blocked>
140
+ %58 = tt.broadcast %53 : (tensor<1x128xi1, #blocked>) -> tensor<2x128xi1, #blocked>
141
+ %59 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x128xf32, #blocked>
142
+ %60 = arith.addi %55, %25 : tensor<2x128xi32, #blocked>
143
+ %61 = tt.addptr %26, %60 : tensor<2x128x!tt.ptr<bf16, 1>, #blocked>, tensor<2x128xi32, #blocked>
144
+ %62 = tt.load %61, %58, %cst_16 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x128xbf16, #blocked>
145
+ %63 = arith.extf %62 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked>
146
+ %64 = tt.addptr %44, %52 : tensor<1x128x!tt.ptr<f32, 1>, #blocked2>, tensor<1x128xi32, #blocked2>
147
+ %65 = tt.load %64, %54, %cst_11 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x128xf32, #blocked2>
148
+ tt.assert %35, "index out of bounds: 0 <= tmp16 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<2x1xi1, #blocked1>
149
+ %66 = arith.extsi %51 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked>
150
+ %67 = tt.broadcast %66 : (tensor<1x128xi64, #blocked>) -> tensor<2x128xi64, #blocked>
151
+ %68 = arith.addi %67, %37 : tensor<2x128xi64, #blocked>
152
+ %69 = tt.addptr %38, %68 : tensor<2x128x!tt.ptr<f32, 1>, #blocked>, tensor<2x128xi64, #blocked>
153
+ %70 = tt.load %69, %58, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x128xf32, #blocked>
154
+ %71 = arith.addf %70, %59 : tensor<2x128xf32, #blocked>
155
+ %72 = arith.addf %71, %63 : tensor<2x128xf32, #blocked>
156
+ %73 = arith.subf %72, %45 : tensor<2x128xf32, #blocked>
157
+ %74 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked>) -> tensor<2x1xf32, #blocked>
158
+ %75 = tt.broadcast %74 : (tensor<2x1xf32, #blocked>) -> tensor<2x128xf32, #blocked>
159
+ %76 = arith.mulf %73, %75 : tensor<2x128xf32, #blocked>
160
+ %77 = triton_gpu.convert_layout %65 : (tensor<1x128xf32, #blocked2>) -> tensor<1x128xf32, #blocked>
161
+ %78 = tt.broadcast %77 : (tensor<1x128xf32, #blocked>) -> tensor<2x128xf32, #blocked>
162
+ %79 = arith.mulf %76, %78 : tensor<2x128xf32, #blocked>
163
+ %80 = tt.addptr %48, %60 : tensor<2x128x!tt.ptr<bf16, 1>, #blocked>, tensor<2x128xi32, #blocked>
164
+ %81 = arith.truncf %79 : tensor<2x128xf32, #blocked> to tensor<2x128xbf16, #blocked>
165
+ tt.store %80, %81, %58 {cache = 1 : i32, evict = 1 : i32} : tensor<2x128xbf16, #blocked>
166
+ }
167
+ tt.return
168
+ }
169
+ }
.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttgir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<512> : tensor<256xi32, #blocked>
5
+ %c256_i32 = arith.constant 256 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c256_i32 : i32
8
+ %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
9
+ %3 = tt.splat %1 : (i32) -> tensor<256xi32, #blocked>
10
+ %4 = arith.addi %3, %2 : tensor<256xi32, #blocked>
11
+ %5 = arith.cmpi slt, %4, %cst : tensor<256xi32, #blocked>
12
+ %6 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<256x!tt.ptr<i64, 1>, #blocked>
13
+ %7 = tt.addptr %6, %4 : tensor<256x!tt.ptr<i64, 1>, #blocked>, tensor<256xi32, #blocked>
14
+ %8 = arith.extsi %4 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked>
15
+ tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<256xi64, #blocked>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ttgir ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
6
+ %cst_0 = arith.constant 9.99999974E-6 : f32
7
+ %cst_1 = arith.constant 2.560000e+02 : f32
8
+ %cst_2 = arith.constant 0.000000e+00 : f32
9
+ %c256_i32 = arith.constant 256 : i32
10
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
11
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
12
+ %0 = tt.get_program_id x : i32
13
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
14
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
15
+ %3 = arith.muli %0, %c256_i32 : i32
16
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
17
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
18
+ %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
19
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
20
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
21
+ %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
22
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
23
+ %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
24
+ %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
25
+ %13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
26
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
27
+ %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
28
+ %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
29
+ %17 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
30
+ %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
31
+ %19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
32
+ %20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
33
+ %21 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
34
+ %22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
35
+ %23 = tt.load %22, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
36
+ %24 = arith.extf %23 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
37
+ %25 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
38
+ %26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
39
+ %27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
40
+ %28 = arith.addf %8, %12 : tensor<256xf32, #blocked>
41
+ %29 = arith.addf %28, %16 : tensor<256xf32, #blocked>
42
+ %30 = arith.addf %29, %20 : tensor<256xf32, #blocked>
43
+ %31 = arith.addf %30, %24 : tensor<256xf32, #blocked>
44
+ %32 = arith.select %2, %31, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
45
+ %33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
46
+ ^bb0(%arg12: f32, %arg13: f32):
47
+ %59 = arith.addf %arg12, %arg13 : f32
48
+ tt.reduce.return %59 : f32
49
+ }) : (tensor<256xf32, #blocked>) -> f32
50
+ %34 = arith.addf %33, %cst_2 : f32
51
+ %35 = arith.divf %34, %cst_1 : f32
52
+ %36 = tt.splat %35 : (f32) -> tensor<1xf32, #blocked1>
53
+ %37 = tt.splat %35 : (f32) -> tensor<256xf32, #blocked>
54
+ %38 = arith.subf %31, %37 : tensor<256xf32, #blocked>
55
+ %39 = arith.mulf %38, %38 : tensor<256xf32, #blocked>
56
+ %40 = arith.select %2, %39, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
57
+ %41 = "tt.reduce"(%40) <{axis = 0 : i32}> ({
58
+ ^bb0(%arg12: f32, %arg13: f32):
59
+ %59 = arith.addf %arg12, %arg13 : f32
60
+ tt.reduce.return %59 : f32
61
+ }) : (tensor<256xf32, #blocked>) -> f32
62
+ %42 = arith.addf %41, %cst_2 : f32
63
+ %43 = arith.divf %42, %cst_1 : f32
64
+ %44 = arith.addf %43, %cst_0 : f32
65
+ %45 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
66
+ %46 = tt.splat %45 : (f32) -> tensor<1xf32, #blocked1>
67
+ %47 = tt.splat %45 : (f32) -> tensor<256xf32, #blocked>
68
+ %48 = arith.mulf %38, %47 : tensor<256xf32, #blocked>
69
+ %49 = arith.mulf %48, %27 : tensor<256xf32, #blocked>
70
+ %50 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
71
+ %51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
72
+ tt.store %51, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
73
+ gpu.barrier
74
+ %52 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
75
+ %53 = tt.splat %52 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
76
+ tt.store %53, %46 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
77
+ %54 = tt.splat %arg9 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
78
+ %55 = tt.addptr %54, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
79
+ %56 = arith.truncf %49 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
80
+ tt.store %55, %56, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
81
+ %57 = tt.addptr %arg8, %0 : !tt.ptr<f32, 1>, i32
82
+ %58 = tt.splat %57 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
83
+ tt.store %58, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
84
+ tt.return
85
+ }
86
+ }
.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.cubin ADDED
Binary file (42.6 kB). View file
 
.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.llir ADDED
@@ -0,0 +1,760 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [8 x i8] c"<module>"
5
+ @assertFile_1 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
7
+ @assertFunc_0 = internal constant [8 x i8] c"<module>"
8
+ @assertFile_0 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
14
+
15
+ define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
16
+ %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %9 = lshr i32 %8, 4, !dbg !10
18
+ %10 = and i32 %9, 15, !dbg !10
19
+ %11 = and i32 %8, 15, !dbg !10
20
+ %12 = shl nuw nsw i32 %11, 3, !dbg !11
21
+ %13 = or i32 %12, 4, !dbg !11
22
+ %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
23
+ %15 = shl i32 %14, 4, !dbg !13
24
+ %16 = or i32 %15, %10, !dbg !14
25
+ %17 = or i32 %15, %11, !dbg !14
26
+ %18 = sext i32 %16 to i64, !dbg !15
27
+ %19 = getelementptr i64, ptr addrspace(1) %0, i64 %18, !dbg !15
28
+ %20 = sext i32 %17 to i64, !dbg !15
29
+ %21 = getelementptr i64, ptr addrspace(1) %0, i64 %20, !dbg !15
30
+ %22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
31
+ %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
32
+ %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
33
+ %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
34
+ %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
35
+ %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
36
+ %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
37
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
38
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
39
+ %31 = srem i32 %16, 512, !dbg !17
40
+ %32 = shl nsw i32 %31, 8, !dbg !18
41
+ %33 = add i64 %30, 50257, !dbg !19
42
+ %34 = icmp slt i64 %22, 0, !dbg !20
43
+ %35 = icmp slt i64 %30, 0, !dbg !20
44
+ %36 = select i1 %35, i64 %33, i64 %30, !dbg !21
45
+ %37 = icmp ugt i64 %36, 50256, !dbg !22
46
+ %38 = shl i64 %22, 8, !dbg !23
47
+ %39 = add i64 %38, 12865792, !dbg !23
48
+ %40 = select i1 %34, i64 %39, i64 %38, !dbg !23
49
+ %41 = getelementptr float, ptr addrspace(1) %1, i64 %40
50
+ br label %42, !dbg !24
51
+
52
+ 42: ; preds = %7, %104
53
+ %43 = phi float [ 0.000000e+00, %7 ], [ %143, %104 ]
54
+ %44 = phi float [ 0.000000e+00, %7 ], [ %144, %104 ]
55
+ %45 = phi float [ 0.000000e+00, %7 ], [ %145, %104 ]
56
+ %46 = phi float [ 0.000000e+00, %7 ], [ %146, %104 ]
57
+ %47 = phi float [ 0.000000e+00, %7 ], [ %147, %104 ]
58
+ %48 = phi float [ 0.000000e+00, %7 ], [ %148, %104 ]
59
+ %49 = phi float [ 0.000000e+00, %7 ], [ %149, %104 ]
60
+ %50 = phi float [ 0.000000e+00, %7 ], [ %150, %104 ]
61
+ %51 = phi float [ 0.000000e+00, %7 ], [ %151, %104 ]
62
+ %52 = phi float [ 0.000000e+00, %7 ], [ %152, %104 ]
63
+ %53 = phi float [ 0.000000e+00, %7 ], [ %153, %104 ]
64
+ %54 = phi float [ 0.000000e+00, %7 ], [ %154, %104 ]
65
+ %55 = phi float [ 0.000000e+00, %7 ], [ %155, %104 ]
66
+ %56 = phi float [ 0.000000e+00, %7 ], [ %156, %104 ]
67
+ %57 = phi float [ 0.000000e+00, %7 ], [ %157, %104 ]
68
+ %58 = phi float [ 0.000000e+00, %7 ], [ %158, %104 ]
69
+ %59 = phi float [ 0.000000e+00, %7 ], [ %191, %104 ]
70
+ %60 = phi float [ 0.000000e+00, %7 ], [ %192, %104 ]
71
+ %61 = phi float [ 0.000000e+00, %7 ], [ %193, %104 ]
72
+ %62 = phi float [ 0.000000e+00, %7 ], [ %194, %104 ]
73
+ %63 = phi float [ 0.000000e+00, %7 ], [ %195, %104 ]
74
+ %64 = phi float [ 0.000000e+00, %7 ], [ %196, %104 ]
75
+ %65 = phi float [ 0.000000e+00, %7 ], [ %197, %104 ]
76
+ %66 = phi float [ 0.000000e+00, %7 ], [ %198, %104 ]
77
+ %67 = phi float [ 0.000000e+00, %7 ], [ %167, %104 ]
78
+ %68 = phi float [ 0.000000e+00, %7 ], [ %168, %104 ]
79
+ %69 = phi float [ 0.000000e+00, %7 ], [ %169, %104 ]
80
+ %70 = phi float [ 0.000000e+00, %7 ], [ %170, %104 ]
81
+ %71 = phi float [ 0.000000e+00, %7 ], [ %171, %104 ]
82
+ %72 = phi float [ 0.000000e+00, %7 ], [ %172, %104 ]
83
+ %73 = phi float [ 0.000000e+00, %7 ], [ %173, %104 ]
84
+ %74 = phi float [ 0.000000e+00, %7 ], [ %174, %104 ]
85
+ %75 = phi i1 [ true, %7 ], [ false, %104 ]
86
+ %76 = phi i32 [ 0, %7 ], [ 128, %104 ]
87
+ %77 = or i32 %76, %12, !dbg !25
88
+ %78 = or i32 %76, %13, !dbg !25
89
+ %79 = or i32 %77, %32, !dbg !26
90
+ %80 = or i32 %78, %32, !dbg !26
91
+ %81 = sext i32 %79 to i64, !dbg !27
92
+ %82 = getelementptr float, ptr addrspace(1) %2, i64 %81, !dbg !27
93
+ %83 = sext i32 %80 to i64, !dbg !27
94
+ %84 = getelementptr float, ptr addrspace(1) %2, i64 %83, !dbg !27
95
+ %85 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %82, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
96
+ %86 = extractvalue { i32, i32, i32, i32 } %85, 0, !dbg !28
97
+ %87 = extractvalue { i32, i32, i32, i32 } %85, 1, !dbg !28
98
+ %88 = extractvalue { i32, i32, i32, i32 } %85, 2, !dbg !28
99
+ %89 = extractvalue { i32, i32, i32, i32 } %85, 3, !dbg !28
100
+ %90 = bitcast i32 %86 to float, !dbg !28
101
+ %91 = bitcast i32 %87 to float, !dbg !28
102
+ %92 = bitcast i32 %88 to float, !dbg !28
103
+ %93 = bitcast i32 %89 to float, !dbg !28
104
+ %94 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %84, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
105
+ %95 = extractvalue { i32, i32, i32, i32 } %94, 0, !dbg !28
106
+ %96 = extractvalue { i32, i32, i32, i32 } %94, 1, !dbg !28
107
+ %97 = extractvalue { i32, i32, i32, i32 } %94, 2, !dbg !28
108
+ %98 = extractvalue { i32, i32, i32, i32 } %94, 3, !dbg !28
109
+ %99 = bitcast i32 %95 to float, !dbg !28
110
+ %100 = bitcast i32 %96 to float, !dbg !28
111
+ %101 = bitcast i32 %97 to float, !dbg !28
112
+ %102 = bitcast i32 %98 to float, !dbg !28
113
+ br i1 %37, label %103, label %104, !dbg !29
114
+
115
+ 103: ; preds = %42
116
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 1892, ptr nonnull @assertFunc_0, i64 1), !dbg !29
117
+ br label %104, !dbg !29
118
+
119
+ 104: ; preds = %103, %42
120
+ %105 = zext nneg i32 %77 to i64, !dbg !30
121
+ %106 = zext nneg i32 %78 to i64, !dbg !30
122
+ %107 = getelementptr float, ptr addrspace(1) %41, i64 %105, !dbg !31
123
+ %108 = getelementptr float, ptr addrspace(1) %41, i64 %106, !dbg !31
124
+ %109 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %107, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
125
+ %110 = extractvalue { i32, i32, i32, i32 } %109, 0, !dbg !32
126
+ %111 = extractvalue { i32, i32, i32, i32 } %109, 1, !dbg !32
127
+ %112 = extractvalue { i32, i32, i32, i32 } %109, 2, !dbg !32
128
+ %113 = extractvalue { i32, i32, i32, i32 } %109, 3, !dbg !32
129
+ %114 = bitcast i32 %110 to float, !dbg !32
130
+ %115 = bitcast i32 %111 to float, !dbg !32
131
+ %116 = bitcast i32 %112 to float, !dbg !32
132
+ %117 = bitcast i32 %113 to float, !dbg !32
133
+ %118 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %108, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
134
+ %119 = extractvalue { i32, i32, i32, i32 } %118, 0, !dbg !32
135
+ %120 = extractvalue { i32, i32, i32, i32 } %118, 1, !dbg !32
136
+ %121 = extractvalue { i32, i32, i32, i32 } %118, 2, !dbg !32
137
+ %122 = extractvalue { i32, i32, i32, i32 } %118, 3, !dbg !32
138
+ %123 = bitcast i32 %119 to float, !dbg !32
139
+ %124 = bitcast i32 %120 to float, !dbg !32
140
+ %125 = bitcast i32 %121 to float, !dbg !32
141
+ %126 = bitcast i32 %122 to float, !dbg !32
142
+ %127 = fadd float %90, %114, !dbg !33
143
+ %128 = fadd float %91, %115, !dbg !33
144
+ %129 = fadd float %92, %116, !dbg !33
145
+ %130 = fadd float %93, %117, !dbg !33
146
+ %131 = fadd float %99, %123, !dbg !33
147
+ %132 = fadd float %100, %124, !dbg !33
148
+ %133 = fadd float %101, %125, !dbg !33
149
+ %134 = fadd float %102, %126, !dbg !33
150
+ %135 = fsub float %127, %67, !dbg !34
151
+ %136 = fsub float %128, %68, !dbg !34
152
+ %137 = fsub float %129, %69, !dbg !34
153
+ %138 = fsub float %130, %70, !dbg !34
154
+ %139 = fsub float %131, %71, !dbg !34
155
+ %140 = fsub float %132, %72, !dbg !34
156
+ %141 = fsub float %133, %73, !dbg !34
157
+ %142 = fsub float %134, %74, !dbg !34
158
+ %143 = fadd float %43, 1.000000e+00, !dbg !38
159
+ %144 = fadd float %44, 1.000000e+00, !dbg !38
160
+ %145 = fadd float %45, 1.000000e+00, !dbg !38
161
+ %146 = fadd float %46, 1.000000e+00, !dbg !38
162
+ %147 = fadd float %47, 1.000000e+00, !dbg !38
163
+ %148 = fadd float %48, 1.000000e+00, !dbg !38
164
+ %149 = fadd float %49, 1.000000e+00, !dbg !38
165
+ %150 = fadd float %50, 1.000000e+00, !dbg !38
166
+ %151 = fadd float %51, 1.000000e+00, !dbg !38
167
+ %152 = fadd float %52, 1.000000e+00, !dbg !38
168
+ %153 = fadd float %53, 1.000000e+00, !dbg !38
169
+ %154 = fadd float %54, 1.000000e+00, !dbg !38
170
+ %155 = fadd float %55, 1.000000e+00, !dbg !38
171
+ %156 = fadd float %56, 1.000000e+00, !dbg !38
172
+ %157 = fadd float %57, 1.000000e+00, !dbg !38
173
+ %158 = fadd float %58, 1.000000e+00, !dbg !38
174
+ %159 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %135, float %143) #6, !dbg !39
175
+ %160 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %136, float %144) #6, !dbg !39
176
+ %161 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %137, float %145) #6, !dbg !39
177
+ %162 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %138, float %146) #6, !dbg !39
178
+ %163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %139, float %147) #6, !dbg !39
179
+ %164 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %140, float %148) #6, !dbg !39
180
+ %165 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %141, float %149) #6, !dbg !39
181
+ %166 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %142, float %150) #6, !dbg !39
182
+ %167 = fadd float %67, %159, !dbg !40
183
+ %168 = fadd float %68, %160, !dbg !40
184
+ %169 = fadd float %69, %161, !dbg !40
185
+ %170 = fadd float %70, %162, !dbg !40
186
+ %171 = fadd float %71, %163, !dbg !40
187
+ %172 = fadd float %72, %164, !dbg !40
188
+ %173 = fadd float %73, %165, !dbg !40
189
+ %174 = fadd float %74, %166, !dbg !40
190
+ %175 = fsub float %127, %167, !dbg !41
191
+ %176 = fsub float %128, %168, !dbg !41
192
+ %177 = fsub float %129, %169, !dbg !41
193
+ %178 = fsub float %130, %170, !dbg !41
194
+ %179 = fsub float %131, %171, !dbg !41
195
+ %180 = fsub float %132, %172, !dbg !41
196
+ %181 = fsub float %133, %173, !dbg !41
197
+ %182 = fsub float %134, %174, !dbg !41
198
+ %183 = fmul float %135, %175, !dbg !42
199
+ %184 = fmul float %136, %176, !dbg !42
200
+ %185 = fmul float %137, %177, !dbg !42
201
+ %186 = fmul float %138, %178, !dbg !42
202
+ %187 = fmul float %139, %179, !dbg !42
203
+ %188 = fmul float %140, %180, !dbg !42
204
+ %189 = fmul float %141, %181, !dbg !42
205
+ %190 = fmul float %142, %182, !dbg !42
206
+ %191 = fadd float %59, %183, !dbg !43
207
+ %192 = fadd float %60, %184, !dbg !43
208
+ %193 = fadd float %61, %185, !dbg !43
209
+ %194 = fadd float %62, %186, !dbg !43
210
+ %195 = fadd float %63, %187, !dbg !43
211
+ %196 = fadd float %64, %188, !dbg !43
212
+ %197 = fadd float %65, %189, !dbg !43
213
+ %198 = fadd float %66, %190, !dbg !43
214
+ br i1 %75, label %42, label %199, !dbg !24
215
+
216
+ 199: ; preds = %104
217
+ %200 = and i32 %8, 127, !dbg !11
218
+ %201 = and i32 %8, 128, !dbg !24
219
+ %.not = icmp eq i32 %201, 0, !dbg !24
220
+ %202 = select i1 %.not, i32 0, i32 136, !dbg !24
221
+ %203 = add nuw nsw i32 %202, %200, !dbg !24
222
+ %204 = zext nneg i32 %203 to i64, !dbg !24
223
+ %205 = getelementptr float, ptr addrspace(3) @global_smem, i64 %204, !dbg !24
224
+ %206 = insertelement <1 x float> undef, float %151, i64 0, !dbg !24
225
+ store <1 x float> %206, ptr addrspace(3) %205, align 4, !dbg !24
226
+ %207 = add nuw nsw i32 %200, 272, !dbg !24
227
+ %208 = add nuw nsw i32 %207, %202, !dbg !24
228
+ %209 = zext nneg i32 %208 to i64, !dbg !24
229
+ %210 = getelementptr float, ptr addrspace(3) @global_smem, i64 %209, !dbg !24
230
+ %211 = insertelement <1 x float> undef, float %152, i64 0, !dbg !24
231
+ store <1 x float> %211, ptr addrspace(3) %210, align 4, !dbg !24
232
+ %212 = add nuw nsw i32 %200, 544, !dbg !24
233
+ %213 = add nuw nsw i32 %212, %202, !dbg !24
234
+ %214 = zext nneg i32 %213 to i64, !dbg !24
235
+ %215 = getelementptr float, ptr addrspace(3) @global_smem, i64 %214, !dbg !24
236
+ %216 = insertelement <1 x float> undef, float %153, i64 0, !dbg !24
237
+ store <1 x float> %216, ptr addrspace(3) %215, align 4, !dbg !24
238
+ %217 = add nuw nsw i32 %200, 816, !dbg !24
239
+ %218 = add nuw nsw i32 %217, %202, !dbg !24
240
+ %219 = zext nneg i32 %218 to i64, !dbg !24
241
+ %220 = getelementptr float, ptr addrspace(3) @global_smem, i64 %219, !dbg !24
242
+ %221 = insertelement <1 x float> undef, float %154, i64 0, !dbg !24
243
+ store <1 x float> %221, ptr addrspace(3) %220, align 4, !dbg !24
244
+ %222 = add nuw nsw i32 %200, 1088, !dbg !24
245
+ %223 = add nuw nsw i32 %222, %202, !dbg !24
246
+ %224 = zext nneg i32 %223 to i64, !dbg !24
247
+ %225 = getelementptr float, ptr addrspace(3) @global_smem, i64 %224, !dbg !24
248
+ %226 = insertelement <1 x float> undef, float %155, i64 0, !dbg !24
249
+ store <1 x float> %226, ptr addrspace(3) %225, align 4, !dbg !24
250
+ %227 = add nuw nsw i32 %200, 1360, !dbg !24
251
+ %228 = add nuw nsw i32 %227, %202, !dbg !24
252
+ %229 = zext nneg i32 %228 to i64, !dbg !24
253
+ %230 = getelementptr float, ptr addrspace(3) @global_smem, i64 %229, !dbg !24
254
+ %231 = insertelement <1 x float> undef, float %156, i64 0, !dbg !24
255
+ store <1 x float> %231, ptr addrspace(3) %230, align 4, !dbg !24
256
+ %232 = add nuw nsw i32 %200, 1632, !dbg !24
257
+ %233 = add nuw nsw i32 %232, %202, !dbg !24
258
+ %234 = zext nneg i32 %233 to i64, !dbg !24
259
+ %235 = getelementptr float, ptr addrspace(3) @global_smem, i64 %234, !dbg !24
260
+ %236 = insertelement <1 x float> undef, float %157, i64 0, !dbg !24
261
+ store <1 x float> %236, ptr addrspace(3) %235, align 4, !dbg !24
262
+ %237 = add nuw nsw i32 %200, 1904, !dbg !24
263
+ %238 = add nuw nsw i32 %237, %202, !dbg !24
264
+ %239 = zext nneg i32 %238 to i64, !dbg !24
265
+ %240 = getelementptr float, ptr addrspace(3) @global_smem, i64 %239, !dbg !24
266
+ %241 = insertelement <1 x float> undef, float %158, i64 0, !dbg !24
267
+ store <1 x float> %241, ptr addrspace(3) %240, align 4, !dbg !24
268
+ tail call void @llvm.nvvm.barrier0(), !dbg !24
269
+ %242 = mul nuw nsw i32 %10, 136, !dbg !24
270
+ %243 = add nuw nsw i32 %242, %12, !dbg !24
271
+ %244 = zext nneg i32 %243 to i64, !dbg !24
272
+ %245 = getelementptr float, ptr addrspace(3) @global_smem, i64 %244, !dbg !24
273
+ %246 = load float, ptr addrspace(3) %245, align 32, !dbg !24
274
+ %247 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 1, !dbg !24
275
+ %248 = load float, ptr addrspace(3) %247, align 4, !dbg !24
276
+ %249 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 2, !dbg !24
277
+ %250 = load float, ptr addrspace(3) %249, align 8, !dbg !24
278
+ %251 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 3, !dbg !24
279
+ %252 = load float, ptr addrspace(3) %251, align 4, !dbg !24
280
+ %253 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 4, !dbg !24
281
+ %254 = load float, ptr addrspace(3) %253, align 16, !dbg !24
282
+ %255 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 5, !dbg !24
283
+ %256 = load float, ptr addrspace(3) %255, align 4, !dbg !24
284
+ %257 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 6, !dbg !24
285
+ %258 = load float, ptr addrspace(3) %257, align 8, !dbg !24
286
+ %259 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 7, !dbg !24
287
+ %260 = load float, ptr addrspace(3) %259, align 4, !dbg !24
288
+ %261 = fsub float %168, %167, !dbg !44
289
+ %262 = fadd float %246, %248, !dbg !48
290
+ %263 = fcmp oeq float %262, 0.000000e+00, !dbg !49
291
+ %264 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %248, float %262) #6, !dbg !50
292
+ %265 = select i1 %263, float 0.000000e+00, float %264, !dbg !51
293
+ %266 = fmul float %261, %265, !dbg !52
294
+ %267 = fadd float %167, %266, !dbg !53
295
+ %268 = fadd float %191, %192, !dbg !54
296
+ %269 = fmul float %261, %261, !dbg !55
297
+ %270 = fmul float %269, %246, !dbg !56
298
+ %271 = fmul float %270, %265, !dbg !57
299
+ %272 = fadd float %268, %271, !dbg !58
300
+ %273 = fsub float %169, %267, !dbg !44
301
+ %274 = fadd float %250, %262, !dbg !48
302
+ %275 = fcmp oeq float %274, 0.000000e+00, !dbg !49
303
+ %276 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %250, float %274) #6, !dbg !50
304
+ %277 = select i1 %275, float 0.000000e+00, float %276, !dbg !51
305
+ %278 = fmul float %277, %273, !dbg !52
306
+ %279 = fadd float %267, %278, !dbg !53
307
+ %280 = fadd float %193, %272, !dbg !54
308
+ %281 = fmul float %273, %273, !dbg !55
309
+ %282 = fmul float %262, %281, !dbg !56
310
+ %283 = fmul float %277, %282, !dbg !57
311
+ %284 = fadd float %280, %283, !dbg !58
312
+ %285 = fsub float %170, %279, !dbg !44
313
+ %286 = fadd float %252, %274, !dbg !48
314
+ %287 = fcmp oeq float %286, 0.000000e+00, !dbg !49
315
+ %288 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %252, float %286) #6, !dbg !50
316
+ %289 = select i1 %287, float 0.000000e+00, float %288, !dbg !51
317
+ %290 = fmul float %289, %285, !dbg !52
318
+ %291 = fadd float %279, %290, !dbg !53
319
+ %292 = fadd float %194, %284, !dbg !54
320
+ %293 = fmul float %285, %285, !dbg !55
321
+ %294 = fmul float %274, %293, !dbg !56
322
+ %295 = fmul float %289, %294, !dbg !57
323
+ %296 = fadd float %292, %295, !dbg !58
324
+ %297 = fsub float %171, %291, !dbg !44
325
+ %298 = fadd float %254, %286, !dbg !48
326
+ %299 = fcmp oeq float %298, 0.000000e+00, !dbg !49
327
+ %300 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %254, float %298) #6, !dbg !50
328
+ %301 = select i1 %299, float 0.000000e+00, float %300, !dbg !51
329
+ %302 = fmul float %301, %297, !dbg !52
330
+ %303 = fadd float %291, %302, !dbg !53
331
+ %304 = fadd float %195, %296, !dbg !54
332
+ %305 = fmul float %297, %297, !dbg !55
333
+ %306 = fmul float %286, %305, !dbg !56
334
+ %307 = fmul float %301, %306, !dbg !57
335
+ %308 = fadd float %304, %307, !dbg !58
336
+ %309 = fsub float %172, %303, !dbg !44
337
+ %310 = fadd float %256, %298, !dbg !48
338
+ %311 = fcmp oeq float %310, 0.000000e+00, !dbg !49
339
+ %312 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %256, float %310) #6, !dbg !50
340
+ %313 = select i1 %311, float 0.000000e+00, float %312, !dbg !51
341
+ %314 = fmul float %313, %309, !dbg !52
342
+ %315 = fadd float %303, %314, !dbg !53
343
+ %316 = fadd float %196, %308, !dbg !54
344
+ %317 = fmul float %309, %309, !dbg !55
345
+ %318 = fmul float %298, %317, !dbg !56
346
+ %319 = fmul float %313, %318, !dbg !57
347
+ %320 = fadd float %316, %319, !dbg !58
348
+ %321 = fsub float %173, %315, !dbg !44
349
+ %322 = fadd float %258, %310, !dbg !48
350
+ %323 = fcmp oeq float %322, 0.000000e+00, !dbg !49
351
+ %324 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %258, float %322) #6, !dbg !50
352
+ %325 = select i1 %323, float 0.000000e+00, float %324, !dbg !51
353
+ %326 = fmul float %325, %321, !dbg !52
354
+ %327 = fadd float %315, %326, !dbg !53
355
+ %328 = fadd float %197, %320, !dbg !54
356
+ %329 = fmul float %321, %321, !dbg !55
357
+ %330 = fmul float %310, %329, !dbg !56
358
+ %331 = fmul float %325, %330, !dbg !57
359
+ %332 = fadd float %328, %331, !dbg !58
360
+ %333 = fsub float %174, %327, !dbg !44
361
+ %334 = fadd float %260, %322, !dbg !48
362
+ %335 = fcmp oeq float %334, 0.000000e+00, !dbg !49
363
+ %336 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %260, float %334) #6, !dbg !50
364
+ %337 = select i1 %335, float 0.000000e+00, float %336, !dbg !51
365
+ %338 = fmul float %337, %333, !dbg !52
366
+ %339 = fadd float %327, %338, !dbg !53
367
+ %340 = fadd float %198, %332, !dbg !54
368
+ %341 = fmul float %333, %333, !dbg !55
369
+ %342 = fmul float %322, %341, !dbg !56
370
+ %343 = fmul float %337, %342, !dbg !57
371
+ %344 = fadd float %340, %343, !dbg !58
372
+ %345 = bitcast float %339 to i32, !dbg !59
373
+ %346 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %345, i32 8, i32 31), !dbg !59
374
+ %347 = bitcast i32 %346 to float, !dbg !59
375
+ %348 = bitcast float %344 to i32, !dbg !59
376
+ %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %348, i32 8, i32 31), !dbg !59
377
+ %350 = bitcast i32 %349 to float, !dbg !59
378
+ %351 = bitcast float %334 to i32, !dbg !59
379
+ %352 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %351, i32 8, i32 31), !dbg !59
380
+ %353 = bitcast i32 %352 to float, !dbg !59
381
+ %354 = fsub float %347, %339, !dbg !44
382
+ %355 = fadd float %334, %353, !dbg !48
383
+ %356 = fcmp oeq float %355, 0.000000e+00, !dbg !49
384
+ %357 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %353, float %355) #6, !dbg !50
385
+ %358 = select i1 %356, float 0.000000e+00, float %357, !dbg !51
386
+ %359 = fmul float %358, %354, !dbg !52
387
+ %360 = fadd float %339, %359, !dbg !53
388
+ %361 = fadd float %344, %350, !dbg !54
389
+ %362 = fmul float %354, %354, !dbg !55
390
+ %363 = fmul float %334, %362, !dbg !56
391
+ %364 = fmul float %358, %363, !dbg !57
392
+ %365 = fadd float %361, %364, !dbg !58
393
+ %366 = bitcast float %360 to i32, !dbg !59
394
+ %367 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %366, i32 4, i32 31), !dbg !59
395
+ %368 = bitcast i32 %367 to float, !dbg !59
396
+ %369 = bitcast float %365 to i32, !dbg !59
397
+ %370 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %369, i32 4, i32 31), !dbg !59
398
+ %371 = bitcast i32 %370 to float, !dbg !59
399
+ %372 = bitcast float %355 to i32, !dbg !59
400
+ %373 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %372, i32 4, i32 31), !dbg !59
401
+ %374 = bitcast i32 %373 to float, !dbg !59
402
+ %375 = fsub float %368, %360, !dbg !44
403
+ %376 = fadd float %355, %374, !dbg !48
404
+ %377 = fcmp oeq float %376, 0.000000e+00, !dbg !49
405
+ %378 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %374, float %376) #6, !dbg !50
406
+ %379 = select i1 %377, float 0.000000e+00, float %378, !dbg !51
407
+ %380 = fmul float %379, %375, !dbg !52
408
+ %381 = fadd float %360, %380, !dbg !53
409
+ %382 = fadd float %365, %371, !dbg !54
410
+ %383 = fmul float %375, %375, !dbg !55
411
+ %384 = fmul float %355, %383, !dbg !56
412
+ %385 = fmul float %379, %384, !dbg !57
413
+ %386 = fadd float %382, %385, !dbg !58
414
+ %387 = bitcast float %381 to i32, !dbg !59
415
+ %388 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %387, i32 2, i32 31), !dbg !59
416
+ %389 = bitcast i32 %388 to float, !dbg !59
417
+ %390 = bitcast float %386 to i32, !dbg !59
418
+ %391 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %390, i32 2, i32 31), !dbg !59
419
+ %392 = bitcast i32 %391 to float, !dbg !59
420
+ %393 = bitcast float %376 to i32, !dbg !59
421
+ %394 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %393, i32 2, i32 31), !dbg !59
422
+ %395 = bitcast i32 %394 to float, !dbg !59
423
+ %396 = fsub float %389, %381, !dbg !44
424
+ %397 = fadd float %376, %395, !dbg !48
425
+ %398 = fcmp oeq float %397, 0.000000e+00, !dbg !49
426
+ %399 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %395, float %397) #6, !dbg !50
427
+ %400 = select i1 %398, float 0.000000e+00, float %399, !dbg !51
428
+ %401 = fmul float %400, %396, !dbg !52
429
+ %402 = fadd float %381, %401, !dbg !53
430
+ %403 = fadd float %386, %392, !dbg !54
431
+ %404 = fmul float %396, %396, !dbg !55
432
+ %405 = fmul float %376, %404, !dbg !56
433
+ %406 = fmul float %400, %405, !dbg !57
434
+ %407 = fadd float %403, %406, !dbg !58
435
+ %408 = bitcast float %402 to i32, !dbg !59
436
+ %409 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %408, i32 1, i32 31), !dbg !59
437
+ %410 = bitcast i32 %409 to float, !dbg !59
438
+ %411 = bitcast float %407 to i32, !dbg !59
439
+ %412 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %411, i32 1, i32 31), !dbg !59
440
+ %413 = bitcast i32 %412 to float, !dbg !59
441
+ %414 = bitcast float %397 to i32, !dbg !59
442
+ %415 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %414, i32 1, i32 31), !dbg !59
443
+ %416 = bitcast i32 %415 to float, !dbg !59
444
+ %417 = fsub float %410, %402, !dbg !44
445
+ %418 = fadd float %397, %416, !dbg !48
446
+ %419 = fcmp oeq float %418, 0.000000e+00, !dbg !49
447
+ %420 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %416, float %418) #6, !dbg !50
448
+ %421 = select i1 %419, float 0.000000e+00, float %420, !dbg !51
449
+ %422 = fmul float %421, %417, !dbg !52
450
+ %423 = fadd float %402, %422, !dbg !53
451
+ %424 = fadd float %407, %413, !dbg !54
452
+ %425 = fmul float %417, %417, !dbg !55
453
+ %426 = fmul float %397, %425, !dbg !56
454
+ %427 = fmul float %421, %426, !dbg !57
455
+ %428 = fadd float %424, %427, !dbg !58
456
+ %429 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
457
+ %430 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
458
+ %431 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
459
+ %432 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
460
+ %433 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
461
+ %434 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
462
+ %435 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
463
+ %436 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
464
+ %437 = fadd float %429, 0x3EE4F8B580000000, !dbg !62
465
+ %438 = shl i32 %16, 8, !dbg !63
466
+ br label %439, !dbg !64
467
+
468
+ 439: ; preds = %199, %__nv_rsqrtf.exit
469
+ %440 = phi i1 [ true, %199 ], [ false, %__nv_rsqrtf.exit ]
470
+ %441 = phi i32 [ 0, %199 ], [ 128, %__nv_rsqrtf.exit ]
471
+ %442 = or i32 %441, %12, !dbg !65
472
+ %443 = or i32 %441, %13, !dbg !65
473
+ %444 = or i32 %442, %32, !dbg !66
474
+ %445 = or i32 %443, %32, !dbg !66
475
+ %446 = sext i32 %444 to i64, !dbg !67
476
+ %447 = getelementptr float, ptr addrspace(1) %2, i64 %446, !dbg !67
477
+ %448 = sext i32 %445 to i64, !dbg !67
478
+ %449 = getelementptr float, ptr addrspace(1) %2, i64 %448, !dbg !67
479
+ %450 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %447, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
480
+ %451 = extractvalue { i32, i32, i32, i32 } %450, 0, !dbg !68
481
+ %452 = extractvalue { i32, i32, i32, i32 } %450, 1, !dbg !68
482
+ %453 = extractvalue { i32, i32, i32, i32 } %450, 2, !dbg !68
483
+ %454 = extractvalue { i32, i32, i32, i32 } %450, 3, !dbg !68
484
+ %455 = bitcast i32 %451 to float, !dbg !68
485
+ %456 = bitcast i32 %452 to float, !dbg !68
486
+ %457 = bitcast i32 %453 to float, !dbg !68
487
+ %458 = bitcast i32 %454 to float, !dbg !68
488
+ %459 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %449, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
489
+ %460 = extractvalue { i32, i32, i32, i32 } %459, 0, !dbg !68
490
+ %461 = extractvalue { i32, i32, i32, i32 } %459, 1, !dbg !68
491
+ %462 = extractvalue { i32, i32, i32, i32 } %459, 2, !dbg !68
492
+ %463 = extractvalue { i32, i32, i32, i32 } %459, 3, !dbg !68
493
+ %464 = bitcast i32 %460 to float, !dbg !68
494
+ %465 = bitcast i32 %461 to float, !dbg !68
495
+ %466 = bitcast i32 %462 to float, !dbg !68
496
+ %467 = bitcast i32 %463 to float, !dbg !68
497
+ %468 = zext nneg i32 %442 to i64, !dbg !69
498
+ %469 = getelementptr float, ptr addrspace(1) %3, i64 %468, !dbg !69
499
+ %470 = zext nneg i32 %443 to i64, !dbg !69
500
+ %471 = getelementptr float, ptr addrspace(1) %3, i64 %470, !dbg !69
501
+ %472 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %469, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
502
+ %473 = extractvalue { i32, i32, i32, i32 } %472, 0, !dbg !70
503
+ %474 = extractvalue { i32, i32, i32, i32 } %472, 1, !dbg !70
504
+ %475 = extractvalue { i32, i32, i32, i32 } %472, 2, !dbg !70
505
+ %476 = extractvalue { i32, i32, i32, i32 } %472, 3, !dbg !70
506
+ %477 = bitcast i32 %473 to float, !dbg !70
507
+ %478 = bitcast i32 %474 to float, !dbg !70
508
+ %479 = bitcast i32 %475 to float, !dbg !70
509
+ %480 = bitcast i32 %476 to float, !dbg !70
510
+ %481 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %471, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
511
+ %482 = extractvalue { i32, i32, i32, i32 } %481, 0, !dbg !70
512
+ %483 = extractvalue { i32, i32, i32, i32 } %481, 1, !dbg !70
513
+ %484 = extractvalue { i32, i32, i32, i32 } %481, 2, !dbg !70
514
+ %485 = extractvalue { i32, i32, i32, i32 } %481, 3, !dbg !70
515
+ %486 = bitcast i32 %482 to float, !dbg !70
516
+ %487 = bitcast i32 %483 to float, !dbg !70
517
+ %488 = bitcast i32 %484 to float, !dbg !70
518
+ %489 = bitcast i32 %485 to float, !dbg !70
519
+ br i1 %37, label %490, label %491, !dbg !71
520
+
521
+ 490: ; preds = %439
522
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 1892, ptr nonnull @assertFunc_1, i64 1), !dbg !71
523
+ br label %491, !dbg !71
524
+
525
+ 491: ; preds = %490, %439
526
+ %492 = getelementptr float, ptr addrspace(1) %41, i64 %468, !dbg !72
527
+ %493 = getelementptr float, ptr addrspace(1) %41, i64 %470, !dbg !72
528
+ %494 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %492, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
529
+ %495 = extractvalue { i32, i32, i32, i32 } %494, 0, !dbg !73
530
+ %496 = extractvalue { i32, i32, i32, i32 } %494, 1, !dbg !73
531
+ %497 = extractvalue { i32, i32, i32, i32 } %494, 2, !dbg !73
532
+ %498 = extractvalue { i32, i32, i32, i32 } %494, 3, !dbg !73
533
+ %499 = bitcast i32 %495 to float, !dbg !73
534
+ %500 = bitcast i32 %496 to float, !dbg !73
535
+ %501 = bitcast i32 %497 to float, !dbg !73
536
+ %502 = bitcast i32 %498 to float, !dbg !73
537
+ %503 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %493, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
538
+ %504 = extractvalue { i32, i32, i32, i32 } %503, 0, !dbg !73
539
+ %505 = extractvalue { i32, i32, i32, i32 } %503, 1, !dbg !73
540
+ %506 = extractvalue { i32, i32, i32, i32 } %503, 2, !dbg !73
541
+ %507 = extractvalue { i32, i32, i32, i32 } %503, 3, !dbg !73
542
+ %508 = bitcast i32 %504 to float, !dbg !73
543
+ %509 = bitcast i32 %505 to float, !dbg !73
544
+ %510 = bitcast i32 %506 to float, !dbg !73
545
+ %511 = bitcast i32 %507 to float, !dbg !73
546
+ %512 = fadd float %455, %499, !dbg !74
547
+ %513 = fadd float %456, %500, !dbg !74
548
+ %514 = fadd float %457, %501, !dbg !74
549
+ %515 = fadd float %458, %502, !dbg !74
550
+ %516 = fadd float %464, %508, !dbg !74
551
+ %517 = fadd float %465, %509, !dbg !74
552
+ %518 = fadd float %466, %510, !dbg !74
553
+ %519 = fadd float %467, %511, !dbg !74
554
+ %520 = fsub float %512, %423, !dbg !75
555
+ %521 = fsub float %513, %423, !dbg !75
556
+ %522 = fsub float %514, %423, !dbg !75
557
+ %523 = fsub float %515, %423, !dbg !75
558
+ %524 = fsub float %516, %423, !dbg !75
559
+ %525 = fsub float %517, %423, !dbg !75
560
+ %526 = fsub float %518, %423, !dbg !75
561
+ %527 = fsub float %519, %423, !dbg !75
562
+ %528 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
563
+ %.not.i = icmp eq i32 %528, 0, !dbg !76
564
+ br i1 %.not.i, label %531, label %529, !dbg !76
565
+
566
+ 529: ; preds = %491
567
+ %530 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %437), !dbg !76
568
+ br label %__nv_rsqrtf.exit, !dbg !76
569
+
570
+ 531: ; preds = %491
571
+ %532 = tail call float @llvm.nvvm.rsqrt.approx.f(float %437), !dbg !76
572
+ br label %__nv_rsqrtf.exit, !dbg !76
573
+
574
+ __nv_rsqrtf.exit: ; preds = %529, %531
575
+ %.0.i = phi float [ %530, %529 ], [ %532, %531 ], !dbg !76
576
+ %533 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
577
+ %534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
578
+ %535 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
579
+ %536 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
580
+ %537 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
581
+ %538 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
582
+ %539 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
583
+ %540 = fmul float %520, %.0.i, !dbg !77
584
+ %541 = fmul float %521, %.0.i, !dbg !77
585
+ %542 = fmul float %522, %.0.i, !dbg !77
586
+ %543 = fmul float %523, %.0.i, !dbg !77
587
+ %544 = fmul float %524, %.0.i, !dbg !77
588
+ %545 = fmul float %525, %.0.i, !dbg !77
589
+ %546 = fmul float %526, %.0.i, !dbg !77
590
+ %547 = fmul float %527, %.0.i, !dbg !77
591
+ %548 = fmul float %540, %477, !dbg !78
592
+ %549 = fmul float %541, %478, !dbg !78
593
+ %550 = fmul float %542, %479, !dbg !78
594
+ %551 = fmul float %543, %480, !dbg !78
595
+ %552 = fmul float %544, %486, !dbg !78
596
+ %553 = fmul float %545, %487, !dbg !78
597
+ %554 = fmul float %546, %488, !dbg !78
598
+ %555 = fmul float %547, %489, !dbg !78
599
+ %556 = or i32 %442, %438, !dbg !79
600
+ %557 = sext i32 %556 to i64, !dbg !80
601
+ %558 = getelementptr i16, ptr addrspace(1) %4, i64 %557, !dbg !80
602
+ %559 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %548) #6, !dbg !81
603
+ %560 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %549) #6, !dbg !81
604
+ %561 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %550) #6, !dbg !81
605
+ %562 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %551) #6, !dbg !81
606
+ %563 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %552) #6, !dbg !81
607
+ %564 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %553) #6, !dbg !81
608
+ %565 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %554) #6, !dbg !81
609
+ %566 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %555) #6, !dbg !81
610
+ %567 = insertelement <2 x i16> undef, i16 %559, i64 0, !dbg !81
611
+ %568 = insertelement <2 x i16> %567, i16 %560, i64 1, !dbg !81
612
+ %569 = bitcast <2 x i16> %568 to i32, !dbg !81
613
+ %570 = insertelement <2 x i16> undef, i16 %561, i64 0, !dbg !81
614
+ %571 = insertelement <2 x i16> %570, i16 %562, i64 1, !dbg !81
615
+ %572 = bitcast <2 x i16> %571 to i32, !dbg !81
616
+ %573 = insertelement <2 x i16> undef, i16 %563, i64 0, !dbg !81
617
+ %574 = insertelement <2 x i16> %573, i16 %564, i64 1, !dbg !81
618
+ %575 = bitcast <2 x i16> %574 to i32, !dbg !81
619
+ %576 = insertelement <2 x i16> undef, i16 %565, i64 0, !dbg !81
620
+ %577 = insertelement <2 x i16> %576, i16 %566, i64 1, !dbg !81
621
+ %578 = bitcast <2 x i16> %577 to i32, !dbg !81
622
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %569, i32 %572, i32 %575, i32 %578, ptr addrspace(1) %558, i1 true) #6, !dbg !81
623
+ br i1 %440, label %439, label %579, !dbg !64
624
+
625
+ 579: ; preds = %__nv_rsqrtf.exit
626
+ ret void, !dbg !82
627
+ }
628
+
629
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
630
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
631
+
632
+ ; Function Attrs: convergent nocallback nounwind
633
+ declare void @llvm.nvvm.barrier0() #1
634
+
635
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
636
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
637
+
638
+ ; Function Attrs: alwaysinline nounwind
639
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
640
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
641
+ %.not = icmp eq i32 %1, 0
642
+ br i1 %.not, label %4, label %2
643
+
644
+ 2: ; preds = %0
645
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
646
+ br label %6
647
+
648
+ 4: ; preds = %0
649
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
650
+ br label %6
651
+
652
+ 6: ; preds = %4, %2
653
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
654
+ ret float %.0
655
+ }
656
+
657
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
658
+
659
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
660
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
661
+
662
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
663
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
664
+
665
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
666
+ attributes #1 = { convergent nocallback nounwind }
667
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
668
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
669
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
670
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
671
+ attributes #6 = { nounwind }
672
+
673
+ !llvm.module.flags = !{!0, !1}
674
+ !llvm.dbg.cu = !{!2}
675
+ !nvvm.annotations = !{!4, !5, !5, !4}
676
+ !llvm.ident = !{!6}
677
+
678
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
679
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
680
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
681
+ !3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx")
682
+ !4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
683
+ !5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 256}
684
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
685
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
686
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
687
+ !9 = !{}
688
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
689
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
690
+ !12 = !DILocation(line: 21, column: 28, scope: !7)
691
+ !13 = !DILocation(line: 21, column: 33, scope: !7)
692
+ !14 = !DILocation(line: 22, column: 23, scope: !7)
693
+ !15 = !DILocation(line: 26, column: 30, scope: !7)
694
+ !16 = !DILocation(line: 26, column: 35, scope: !7)
695
+ !17 = !DILocation(line: 27, column: 18, scope: !7)
696
+ !18 = !DILocation(line: 35, column: 44, scope: !7)
697
+ !19 = !DILocation(line: 36, column: 22, scope: !7)
698
+ !20 = !DILocation(line: 37, column: 22, scope: !7)
699
+ !21 = !DILocation(line: 38, column: 36, scope: !7)
700
+ !22 = !DILocation(line: 39, column: 40, scope: !7)
701
+ !23 = !DILocation(line: 40, column: 44, scope: !7)
702
+ !24 = !DILocation(line: 31, column: 36, scope: !7)
703
+ !25 = !DILocation(line: 32, column: 27, scope: !7)
704
+ !26 = !DILocation(line: 35, column: 40, scope: !7)
705
+ !27 = !DILocation(line: 35, column: 34, scope: !7)
706
+ !28 = !DILocation(line: 35, column: 50, scope: !7)
707
+ !29 = !DILocation(line: 39, column: 55, scope: !7)
708
+ !30 = !DILocation(line: 40, column: 40, scope: !7)
709
+ !31 = !DILocation(line: 40, column: 34, scope: !7)
710
+ !32 = !DILocation(line: 40, column: 52, scope: !7)
711
+ !33 = !DILocation(line: 41, column: 22, scope: !7)
712
+ !34 = !DILocation(line: 96, column: 20, scope: !35, inlinedAt: !37)
713
+ !35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
714
+ !36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
715
+ !37 = !DILocation(line: 44, column: 38, scope: !35)
716
+ !38 = !DILocation(line: 97, column: 26, scope: !35, inlinedAt: !37)
717
+ !39 = !DILocation(line: 98, column: 30, scope: !35, inlinedAt: !37)
718
+ !40 = !DILocation(line: 98, column: 22, scope: !35, inlinedAt: !37)
719
+ !41 = !DILocation(line: 101, column: 30, scope: !35, inlinedAt: !37)
720
+ !42 = !DILocation(line: 101, column: 22, scope: !35, inlinedAt: !37)
721
+ !43 = !DILocation(line: 47, column: 48, scope: !7)
722
+ !44 = !DILocation(line: 108, column: 21, scope: !45, inlinedAt: !46)
723
+ !45 = distinct !DILexicalBlockFile(scope: !35, file: !36, discriminator: 0)
724
+ !46 = !DILocation(line: 120, column: 46, scope: !45, inlinedAt: !47)
725
+ !47 = !DILocation(line: 50, column: 41, scope: !45)
726
+ !48 = !DILocation(line: 109, column: 28, scope: !45, inlinedAt: !46)
727
+ !49 = !DILocation(line: 110, column: 39, scope: !45, inlinedAt: !46)
728
+ !50 = !DILocation(line: 110, column: 60, scope: !45, inlinedAt: !46)
729
+ !51 = !DILocation(line: 110, column: 49, scope: !45, inlinedAt: !46)
730
+ !52 = !DILocation(line: 112, column: 25, scope: !45, inlinedAt: !46)
731
+ !53 = !DILocation(line: 112, column: 17, scope: !45, inlinedAt: !46)
732
+ !54 = !DILocation(line: 113, column: 15, scope: !45, inlinedAt: !46)
733
+ !55 = !DILocation(line: 113, column: 30, scope: !45, inlinedAt: !46)
734
+ !56 = !DILocation(line: 113, column: 38, scope: !45, inlinedAt: !46)
735
+ !57 = !DILocation(line: 113, column: 49, scope: !45, inlinedAt: !46)
736
+ !58 = !DILocation(line: 113, column: 22, scope: !45, inlinedAt: !46)
737
+ !59 = !DILocation(line: 120, column: 46, scope: !35, inlinedAt: !60)
738
+ !60 = !DILocation(line: 50, column: 41, scope: !35)
739
+ !61 = !DILocation(line: 69, column: 23, scope: !7)
740
+ !62 = !DILocation(line: 71, column: 24, scope: !7)
741
+ !63 = !DILocation(line: 76, column: 39, scope: !7)
742
+ !64 = !DILocation(line: 55, column: 36, scope: !7)
743
+ !65 = !DILocation(line: 56, column: 27, scope: !7)
744
+ !66 = !DILocation(line: 59, column: 41, scope: !7)
745
+ !67 = !DILocation(line: 59, column: 35, scope: !7)
746
+ !68 = !DILocation(line: 59, column: 51, scope: !7)
747
+ !69 = !DILocation(line: 60, column: 35, scope: !7)
748
+ !70 = !DILocation(line: 60, column: 40, scope: !7)
749
+ !71 = !DILocation(line: 64, column: 57, scope: !7)
750
+ !72 = !DILocation(line: 65, column: 35, scope: !7)
751
+ !73 = !DILocation(line: 65, column: 54, scope: !7)
752
+ !74 = !DILocation(line: 66, column: 24, scope: !7)
753
+ !75 = !DILocation(line: 67, column: 24, scope: !7)
754
+ !76 = !DILocation(line: 72, column: 30, scope: !7)
755
+ !77 = !DILocation(line: 73, column: 24, scope: !7)
756
+ !78 = !DILocation(line: 74, column: 24, scope: !7)
757
+ !79 = !DILocation(line: 76, column: 35, scope: !7)
758
+ !80 = !DILocation(line: 76, column: 29, scope: !7)
759
+ !81 = !DILocation(line: 76, column: 52, scope: !7)
760
+ !82 = !DILocation(line: 55, column: 4, scope: !7)
.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.llir ADDED
@@ -0,0 +1,839 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [8 x i8] c"<module>"
5
+ @assertFile_1 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
7
+ @assertFunc_0 = internal constant [8 x i8] c"<module>"
8
+ @assertFile_0 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
14
+
15
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
16
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %10 = lshr i32 %9, 4, !dbg !10
18
+ %11 = and i32 %10, 15, !dbg !10
19
+ %12 = and i32 %9, 15, !dbg !10
20
+ %13 = shl nuw nsw i32 %12, 3, !dbg !11
21
+ %14 = or i32 %13, 4, !dbg !11
22
+ %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
23
+ %16 = shl i32 %15, 4, !dbg !13
24
+ %17 = or i32 %16, %11, !dbg !14
25
+ %18 = or i32 %16, %12, !dbg !14
26
+ %19 = sext i32 %17 to i64, !dbg !15
27
+ %20 = getelementptr i64, ptr addrspace(1) %0, i64 %19, !dbg !15
28
+ %21 = sext i32 %18 to i64, !dbg !15
29
+ %22 = getelementptr i64, ptr addrspace(1) %0, i64 %21, !dbg !15
30
+ %23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
31
+ %24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
32
+ %25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
33
+ %26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
34
+ %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
35
+ %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
36
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
37
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
38
+ %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
39
+ %32 = srem i32 %17, 512, !dbg !17
40
+ %33 = shl nsw i32 %32, 8, !dbg !18
41
+ %34 = shl i32 %17, 8, !dbg !19
42
+ %35 = add i64 %31, 50257, !dbg !20
43
+ %36 = icmp slt i64 %23, 0, !dbg !21
44
+ %37 = icmp slt i64 %31, 0, !dbg !21
45
+ %38 = select i1 %37, i64 %35, i64 %31, !dbg !22
46
+ %39 = icmp ugt i64 %38, 50256, !dbg !23
47
+ %40 = shl i64 %23, 8, !dbg !24
48
+ %41 = add i64 %40, 12865792, !dbg !24
49
+ %42 = select i1 %36, i64 %41, i64 %40, !dbg !24
50
+ %43 = getelementptr float, ptr addrspace(1) %1, i64 %42
51
+ br label %44, !dbg !25
52
+
53
+ 44: ; preds = %8, %130
54
+ %45 = phi float [ 0.000000e+00, %8 ], [ %177, %130 ]
55
+ %46 = phi float [ 0.000000e+00, %8 ], [ %178, %130 ]
56
+ %47 = phi float [ 0.000000e+00, %8 ], [ %179, %130 ]
57
+ %48 = phi float [ 0.000000e+00, %8 ], [ %180, %130 ]
58
+ %49 = phi float [ 0.000000e+00, %8 ], [ %181, %130 ]
59
+ %50 = phi float [ 0.000000e+00, %8 ], [ %182, %130 ]
60
+ %51 = phi float [ 0.000000e+00, %8 ], [ %183, %130 ]
61
+ %52 = phi float [ 0.000000e+00, %8 ], [ %184, %130 ]
62
+ %53 = phi float [ 0.000000e+00, %8 ], [ %185, %130 ]
63
+ %54 = phi float [ 0.000000e+00, %8 ], [ %186, %130 ]
64
+ %55 = phi float [ 0.000000e+00, %8 ], [ %187, %130 ]
65
+ %56 = phi float [ 0.000000e+00, %8 ], [ %188, %130 ]
66
+ %57 = phi float [ 0.000000e+00, %8 ], [ %189, %130 ]
67
+ %58 = phi float [ 0.000000e+00, %8 ], [ %190, %130 ]
68
+ %59 = phi float [ 0.000000e+00, %8 ], [ %191, %130 ]
69
+ %60 = phi float [ 0.000000e+00, %8 ], [ %192, %130 ]
70
+ %61 = phi float [ 0.000000e+00, %8 ], [ %225, %130 ]
71
+ %62 = phi float [ 0.000000e+00, %8 ], [ %226, %130 ]
72
+ %63 = phi float [ 0.000000e+00, %8 ], [ %227, %130 ]
73
+ %64 = phi float [ 0.000000e+00, %8 ], [ %228, %130 ]
74
+ %65 = phi float [ 0.000000e+00, %8 ], [ %229, %130 ]
75
+ %66 = phi float [ 0.000000e+00, %8 ], [ %230, %130 ]
76
+ %67 = phi float [ 0.000000e+00, %8 ], [ %231, %130 ]
77
+ %68 = phi float [ 0.000000e+00, %8 ], [ %232, %130 ]
78
+ %69 = phi float [ 0.000000e+00, %8 ], [ %201, %130 ]
79
+ %70 = phi float [ 0.000000e+00, %8 ], [ %202, %130 ]
80
+ %71 = phi float [ 0.000000e+00, %8 ], [ %203, %130 ]
81
+ %72 = phi float [ 0.000000e+00, %8 ], [ %204, %130 ]
82
+ %73 = phi float [ 0.000000e+00, %8 ], [ %205, %130 ]
83
+ %74 = phi float [ 0.000000e+00, %8 ], [ %206, %130 ]
84
+ %75 = phi float [ 0.000000e+00, %8 ], [ %207, %130 ]
85
+ %76 = phi float [ 0.000000e+00, %8 ], [ %208, %130 ]
86
+ %77 = phi i1 [ true, %8 ], [ false, %130 ]
87
+ %78 = phi i32 [ 0, %8 ], [ 128, %130 ]
88
+ %79 = or i32 %78, %13, !dbg !26
89
+ %80 = or i32 %78, %14, !dbg !26
90
+ %81 = or i32 %79, %33, !dbg !27
91
+ %82 = or i32 %80, %33, !dbg !27
92
+ %83 = sext i32 %81 to i64, !dbg !28
93
+ %84 = getelementptr float, ptr addrspace(1) %2, i64 %83, !dbg !28
94
+ %85 = sext i32 %82 to i64, !dbg !28
95
+ %86 = getelementptr float, ptr addrspace(1) %2, i64 %85, !dbg !28
96
+ %87 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %84, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
97
+ %88 = extractvalue { i32, i32, i32, i32 } %87, 0, !dbg !29
98
+ %89 = extractvalue { i32, i32, i32, i32 } %87, 1, !dbg !29
99
+ %90 = extractvalue { i32, i32, i32, i32 } %87, 2, !dbg !29
100
+ %91 = extractvalue { i32, i32, i32, i32 } %87, 3, !dbg !29
101
+ %92 = bitcast i32 %88 to float, !dbg !29
102
+ %93 = bitcast i32 %89 to float, !dbg !29
103
+ %94 = bitcast i32 %90 to float, !dbg !29
104
+ %95 = bitcast i32 %91 to float, !dbg !29
105
+ %96 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %86, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
106
+ %97 = extractvalue { i32, i32, i32, i32 } %96, 0, !dbg !29
107
+ %98 = extractvalue { i32, i32, i32, i32 } %96, 1, !dbg !29
108
+ %99 = extractvalue { i32, i32, i32, i32 } %96, 2, !dbg !29
109
+ %100 = extractvalue { i32, i32, i32, i32 } %96, 3, !dbg !29
110
+ %101 = bitcast i32 %97 to float, !dbg !29
111
+ %102 = bitcast i32 %98 to float, !dbg !29
112
+ %103 = bitcast i32 %99 to float, !dbg !29
113
+ %104 = bitcast i32 %100 to float, !dbg !29
114
+ %105 = or i32 %79, %34, !dbg !30
115
+ %106 = sext i32 %105 to i64, !dbg !31
116
+ %107 = getelementptr i16, ptr addrspace(1) %3, i64 %106, !dbg !31
117
+ %108 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %107, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
118
+ %109 = extractvalue { i32, i32, i32, i32 } %108, 0, !dbg !32
119
+ %110 = extractvalue { i32, i32, i32, i32 } %108, 1, !dbg !32
120
+ %111 = extractvalue { i32, i32, i32, i32 } %108, 2, !dbg !32
121
+ %112 = extractvalue { i32, i32, i32, i32 } %108, 3, !dbg !32
122
+ %113 = trunc i32 %109 to i16, !dbg !32
123
+ %extelt.offset5 = lshr i32 %109, 16, !dbg !32
124
+ %114 = trunc i32 %extelt.offset5 to i16, !dbg !32
125
+ %115 = trunc i32 %110 to i16, !dbg !32
126
+ %extelt.offset6 = lshr i32 %110, 16, !dbg !32
127
+ %116 = trunc i32 %extelt.offset6 to i16, !dbg !32
128
+ %117 = trunc i32 %111 to i16, !dbg !32
129
+ %extelt.offset7 = lshr i32 %111, 16, !dbg !32
130
+ %118 = trunc i32 %extelt.offset7 to i16, !dbg !32
131
+ %119 = trunc i32 %112 to i16, !dbg !32
132
+ %extelt.offset8 = lshr i32 %112, 16, !dbg !32
133
+ %120 = trunc i32 %extelt.offset8 to i16, !dbg !32
134
+ %121 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %113) #6, !dbg !33
135
+ %122 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %114) #6, !dbg !33
136
+ %123 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %115) #6, !dbg !33
137
+ %124 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %116) #6, !dbg !33
138
+ %125 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %117) #6, !dbg !33
139
+ %126 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %118) #6, !dbg !33
140
+ %127 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %119) #6, !dbg !33
141
+ %128 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %120) #6, !dbg !33
142
+ br i1 %39, label %129, label %130, !dbg !34
143
+
144
+ 129: ; preds = %44
145
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 1892, ptr nonnull @assertFunc_0, i64 1), !dbg !34
146
+ br label %130, !dbg !34
147
+
148
+ 130: ; preds = %129, %44
149
+ %131 = zext nneg i32 %79 to i64, !dbg !35
150
+ %132 = zext nneg i32 %80 to i64, !dbg !35
151
+ %133 = getelementptr float, ptr addrspace(1) %43, i64 %131, !dbg !36
152
+ %134 = getelementptr float, ptr addrspace(1) %43, i64 %132, !dbg !36
153
+ %135 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %133, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
154
+ %136 = extractvalue { i32, i32, i32, i32 } %135, 0, !dbg !37
155
+ %137 = extractvalue { i32, i32, i32, i32 } %135, 1, !dbg !37
156
+ %138 = extractvalue { i32, i32, i32, i32 } %135, 2, !dbg !37
157
+ %139 = extractvalue { i32, i32, i32, i32 } %135, 3, !dbg !37
158
+ %140 = bitcast i32 %136 to float, !dbg !37
159
+ %141 = bitcast i32 %137 to float, !dbg !37
160
+ %142 = bitcast i32 %138 to float, !dbg !37
161
+ %143 = bitcast i32 %139 to float, !dbg !37
162
+ %144 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %134, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
163
+ %145 = extractvalue { i32, i32, i32, i32 } %144, 0, !dbg !37
164
+ %146 = extractvalue { i32, i32, i32, i32 } %144, 1, !dbg !37
165
+ %147 = extractvalue { i32, i32, i32, i32 } %144, 2, !dbg !37
166
+ %148 = extractvalue { i32, i32, i32, i32 } %144, 3, !dbg !37
167
+ %149 = bitcast i32 %145 to float, !dbg !37
168
+ %150 = bitcast i32 %146 to float, !dbg !37
169
+ %151 = bitcast i32 %147 to float, !dbg !37
170
+ %152 = bitcast i32 %148 to float, !dbg !37
171
+ %153 = fadd float %92, %140, !dbg !38
172
+ %154 = fadd float %93, %141, !dbg !38
173
+ %155 = fadd float %94, %142, !dbg !38
174
+ %156 = fadd float %95, %143, !dbg !38
175
+ %157 = fadd float %101, %149, !dbg !38
176
+ %158 = fadd float %102, %150, !dbg !38
177
+ %159 = fadd float %103, %151, !dbg !38
178
+ %160 = fadd float %104, %152, !dbg !38
179
+ %161 = fadd float %121, %153, !dbg !39
180
+ %162 = fadd float %122, %154, !dbg !39
181
+ %163 = fadd float %123, %155, !dbg !39
182
+ %164 = fadd float %124, %156, !dbg !39
183
+ %165 = fadd float %125, %157, !dbg !39
184
+ %166 = fadd float %126, %158, !dbg !39
185
+ %167 = fadd float %127, %159, !dbg !39
186
+ %168 = fadd float %128, %160, !dbg !39
187
+ %169 = fsub float %161, %69, !dbg !40
188
+ %170 = fsub float %162, %70, !dbg !40
189
+ %171 = fsub float %163, %71, !dbg !40
190
+ %172 = fsub float %164, %72, !dbg !40
191
+ %173 = fsub float %165, %73, !dbg !40
192
+ %174 = fsub float %166, %74, !dbg !40
193
+ %175 = fsub float %167, %75, !dbg !40
194
+ %176 = fsub float %168, %76, !dbg !40
195
+ %177 = fadd float %45, 1.000000e+00, !dbg !44
196
+ %178 = fadd float %46, 1.000000e+00, !dbg !44
197
+ %179 = fadd float %47, 1.000000e+00, !dbg !44
198
+ %180 = fadd float %48, 1.000000e+00, !dbg !44
199
+ %181 = fadd float %49, 1.000000e+00, !dbg !44
200
+ %182 = fadd float %50, 1.000000e+00, !dbg !44
201
+ %183 = fadd float %51, 1.000000e+00, !dbg !44
202
+ %184 = fadd float %52, 1.000000e+00, !dbg !44
203
+ %185 = fadd float %53, 1.000000e+00, !dbg !44
204
+ %186 = fadd float %54, 1.000000e+00, !dbg !44
205
+ %187 = fadd float %55, 1.000000e+00, !dbg !44
206
+ %188 = fadd float %56, 1.000000e+00, !dbg !44
207
+ %189 = fadd float %57, 1.000000e+00, !dbg !44
208
+ %190 = fadd float %58, 1.000000e+00, !dbg !44
209
+ %191 = fadd float %59, 1.000000e+00, !dbg !44
210
+ %192 = fadd float %60, 1.000000e+00, !dbg !44
211
+ %193 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %169, float %177) #6, !dbg !45
212
+ %194 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %170, float %178) #6, !dbg !45
213
+ %195 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %171, float %179) #6, !dbg !45
214
+ %196 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %172, float %180) #6, !dbg !45
215
+ %197 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %173, float %181) #6, !dbg !45
216
+ %198 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %174, float %182) #6, !dbg !45
217
+ %199 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %175, float %183) #6, !dbg !45
218
+ %200 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %176, float %184) #6, !dbg !45
219
+ %201 = fadd float %69, %193, !dbg !46
220
+ %202 = fadd float %70, %194, !dbg !46
221
+ %203 = fadd float %71, %195, !dbg !46
222
+ %204 = fadd float %72, %196, !dbg !46
223
+ %205 = fadd float %73, %197, !dbg !46
224
+ %206 = fadd float %74, %198, !dbg !46
225
+ %207 = fadd float %75, %199, !dbg !46
226
+ %208 = fadd float %76, %200, !dbg !46
227
+ %209 = fsub float %161, %201, !dbg !47
228
+ %210 = fsub float %162, %202, !dbg !47
229
+ %211 = fsub float %163, %203, !dbg !47
230
+ %212 = fsub float %164, %204, !dbg !47
231
+ %213 = fsub float %165, %205, !dbg !47
232
+ %214 = fsub float %166, %206, !dbg !47
233
+ %215 = fsub float %167, %207, !dbg !47
234
+ %216 = fsub float %168, %208, !dbg !47
235
+ %217 = fmul float %169, %209, !dbg !48
236
+ %218 = fmul float %170, %210, !dbg !48
237
+ %219 = fmul float %171, %211, !dbg !48
238
+ %220 = fmul float %172, %212, !dbg !48
239
+ %221 = fmul float %173, %213, !dbg !48
240
+ %222 = fmul float %174, %214, !dbg !48
241
+ %223 = fmul float %175, %215, !dbg !48
242
+ %224 = fmul float %176, %216, !dbg !48
243
+ %225 = fadd float %61, %217, !dbg !49
244
+ %226 = fadd float %62, %218, !dbg !49
245
+ %227 = fadd float %63, %219, !dbg !49
246
+ %228 = fadd float %64, %220, !dbg !49
247
+ %229 = fadd float %65, %221, !dbg !49
248
+ %230 = fadd float %66, %222, !dbg !49
249
+ %231 = fadd float %67, %223, !dbg !49
250
+ %232 = fadd float %68, %224, !dbg !49
251
+ br i1 %77, label %44, label %233, !dbg !25
252
+
253
+ 233: ; preds = %130
254
+ %234 = and i32 %9, 127, !dbg !11
255
+ %235 = and i32 %9, 128, !dbg !25
256
+ %.not = icmp eq i32 %235, 0, !dbg !25
257
+ %236 = select i1 %.not, i32 0, i32 136, !dbg !25
258
+ %237 = add nuw nsw i32 %236, %234, !dbg !25
259
+ %238 = zext nneg i32 %237 to i64, !dbg !25
260
+ %239 = getelementptr float, ptr addrspace(3) @global_smem, i64 %238, !dbg !25
261
+ %240 = insertelement <1 x float> undef, float %185, i64 0, !dbg !25
262
+ store <1 x float> %240, ptr addrspace(3) %239, align 4, !dbg !25
263
+ %241 = add nuw nsw i32 %234, 272, !dbg !25
264
+ %242 = add nuw nsw i32 %241, %236, !dbg !25
265
+ %243 = zext nneg i32 %242 to i64, !dbg !25
266
+ %244 = getelementptr float, ptr addrspace(3) @global_smem, i64 %243, !dbg !25
267
+ %245 = insertelement <1 x float> undef, float %186, i64 0, !dbg !25
268
+ store <1 x float> %245, ptr addrspace(3) %244, align 4, !dbg !25
269
+ %246 = add nuw nsw i32 %234, 544, !dbg !25
270
+ %247 = add nuw nsw i32 %246, %236, !dbg !25
271
+ %248 = zext nneg i32 %247 to i64, !dbg !25
272
+ %249 = getelementptr float, ptr addrspace(3) @global_smem, i64 %248, !dbg !25
273
+ %250 = insertelement <1 x float> undef, float %187, i64 0, !dbg !25
274
+ store <1 x float> %250, ptr addrspace(3) %249, align 4, !dbg !25
275
+ %251 = add nuw nsw i32 %234, 816, !dbg !25
276
+ %252 = add nuw nsw i32 %251, %236, !dbg !25
277
+ %253 = zext nneg i32 %252 to i64, !dbg !25
278
+ %254 = getelementptr float, ptr addrspace(3) @global_smem, i64 %253, !dbg !25
279
+ %255 = insertelement <1 x float> undef, float %188, i64 0, !dbg !25
280
+ store <1 x float> %255, ptr addrspace(3) %254, align 4, !dbg !25
281
+ %256 = add nuw nsw i32 %234, 1088, !dbg !25
282
+ %257 = add nuw nsw i32 %256, %236, !dbg !25
283
+ %258 = zext nneg i32 %257 to i64, !dbg !25
284
+ %259 = getelementptr float, ptr addrspace(3) @global_smem, i64 %258, !dbg !25
285
+ %260 = insertelement <1 x float> undef, float %189, i64 0, !dbg !25
286
+ store <1 x float> %260, ptr addrspace(3) %259, align 4, !dbg !25
287
+ %261 = add nuw nsw i32 %234, 1360, !dbg !25
288
+ %262 = add nuw nsw i32 %261, %236, !dbg !25
289
+ %263 = zext nneg i32 %262 to i64, !dbg !25
290
+ %264 = getelementptr float, ptr addrspace(3) @global_smem, i64 %263, !dbg !25
291
+ %265 = insertelement <1 x float> undef, float %190, i64 0, !dbg !25
292
+ store <1 x float> %265, ptr addrspace(3) %264, align 4, !dbg !25
293
+ %266 = add nuw nsw i32 %234, 1632, !dbg !25
294
+ %267 = add nuw nsw i32 %266, %236, !dbg !25
295
+ %268 = zext nneg i32 %267 to i64, !dbg !25
296
+ %269 = getelementptr float, ptr addrspace(3) @global_smem, i64 %268, !dbg !25
297
+ %270 = insertelement <1 x float> undef, float %191, i64 0, !dbg !25
298
+ store <1 x float> %270, ptr addrspace(3) %269, align 4, !dbg !25
299
+ %271 = add nuw nsw i32 %234, 1904, !dbg !25
300
+ %272 = add nuw nsw i32 %271, %236, !dbg !25
301
+ %273 = zext nneg i32 %272 to i64, !dbg !25
302
+ %274 = getelementptr float, ptr addrspace(3) @global_smem, i64 %273, !dbg !25
303
+ %275 = insertelement <1 x float> undef, float %192, i64 0, !dbg !25
304
+ store <1 x float> %275, ptr addrspace(3) %274, align 4, !dbg !25
305
+ tail call void @llvm.nvvm.barrier0(), !dbg !25
306
+ %276 = mul nuw nsw i32 %11, 136, !dbg !25
307
+ %277 = add nuw nsw i32 %276, %13, !dbg !25
308
+ %278 = zext nneg i32 %277 to i64, !dbg !25
309
+ %279 = getelementptr float, ptr addrspace(3) @global_smem, i64 %278, !dbg !25
310
+ %280 = load float, ptr addrspace(3) %279, align 32, !dbg !25
311
+ %281 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 1, !dbg !25
312
+ %282 = load float, ptr addrspace(3) %281, align 4, !dbg !25
313
+ %283 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 2, !dbg !25
314
+ %284 = load float, ptr addrspace(3) %283, align 8, !dbg !25
315
+ %285 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 3, !dbg !25
316
+ %286 = load float, ptr addrspace(3) %285, align 4, !dbg !25
317
+ %287 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 4, !dbg !25
318
+ %288 = load float, ptr addrspace(3) %287, align 16, !dbg !25
319
+ %289 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 5, !dbg !25
320
+ %290 = load float, ptr addrspace(3) %289, align 4, !dbg !25
321
+ %291 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 6, !dbg !25
322
+ %292 = load float, ptr addrspace(3) %291, align 8, !dbg !25
323
+ %293 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 7, !dbg !25
324
+ %294 = load float, ptr addrspace(3) %293, align 4, !dbg !25
325
+ %295 = fsub float %202, %201, !dbg !50
326
+ %296 = fadd float %280, %282, !dbg !54
327
+ %297 = fcmp oeq float %296, 0.000000e+00, !dbg !55
328
+ %298 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %282, float %296) #6, !dbg !56
329
+ %299 = select i1 %297, float 0.000000e+00, float %298, !dbg !57
330
+ %300 = fmul float %295, %299, !dbg !58
331
+ %301 = fadd float %201, %300, !dbg !59
332
+ %302 = fadd float %225, %226, !dbg !60
333
+ %303 = fmul float %295, %295, !dbg !61
334
+ %304 = fmul float %303, %280, !dbg !62
335
+ %305 = fmul float %304, %299, !dbg !63
336
+ %306 = fadd float %302, %305, !dbg !64
337
+ %307 = fsub float %203, %301, !dbg !50
338
+ %308 = fadd float %284, %296, !dbg !54
339
+ %309 = fcmp oeq float %308, 0.000000e+00, !dbg !55
340
+ %310 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float %308) #6, !dbg !56
341
+ %311 = select i1 %309, float 0.000000e+00, float %310, !dbg !57
342
+ %312 = fmul float %311, %307, !dbg !58
343
+ %313 = fadd float %301, %312, !dbg !59
344
+ %314 = fadd float %227, %306, !dbg !60
345
+ %315 = fmul float %307, %307, !dbg !61
346
+ %316 = fmul float %296, %315, !dbg !62
347
+ %317 = fmul float %311, %316, !dbg !63
348
+ %318 = fadd float %314, %317, !dbg !64
349
+ %319 = fsub float %204, %313, !dbg !50
350
+ %320 = fadd float %286, %308, !dbg !54
351
+ %321 = fcmp oeq float %320, 0.000000e+00, !dbg !55
352
+ %322 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %286, float %320) #6, !dbg !56
353
+ %323 = select i1 %321, float 0.000000e+00, float %322, !dbg !57
354
+ %324 = fmul float %323, %319, !dbg !58
355
+ %325 = fadd float %313, %324, !dbg !59
356
+ %326 = fadd float %228, %318, !dbg !60
357
+ %327 = fmul float %319, %319, !dbg !61
358
+ %328 = fmul float %308, %327, !dbg !62
359
+ %329 = fmul float %323, %328, !dbg !63
360
+ %330 = fadd float %326, %329, !dbg !64
361
+ %331 = fsub float %205, %325, !dbg !50
362
+ %332 = fadd float %288, %320, !dbg !54
363
+ %333 = fcmp oeq float %332, 0.000000e+00, !dbg !55
364
+ %334 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %288, float %332) #6, !dbg !56
365
+ %335 = select i1 %333, float 0.000000e+00, float %334, !dbg !57
366
+ %336 = fmul float %335, %331, !dbg !58
367
+ %337 = fadd float %325, %336, !dbg !59
368
+ %338 = fadd float %229, %330, !dbg !60
369
+ %339 = fmul float %331, %331, !dbg !61
370
+ %340 = fmul float %320, %339, !dbg !62
371
+ %341 = fmul float %335, %340, !dbg !63
372
+ %342 = fadd float %338, %341, !dbg !64
373
+ %343 = fsub float %206, %337, !dbg !50
374
+ %344 = fadd float %290, %332, !dbg !54
375
+ %345 = fcmp oeq float %344, 0.000000e+00, !dbg !55
376
+ %346 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %290, float %344) #6, !dbg !56
377
+ %347 = select i1 %345, float 0.000000e+00, float %346, !dbg !57
378
+ %348 = fmul float %347, %343, !dbg !58
379
+ %349 = fadd float %337, %348, !dbg !59
380
+ %350 = fadd float %230, %342, !dbg !60
381
+ %351 = fmul float %343, %343, !dbg !61
382
+ %352 = fmul float %332, %351, !dbg !62
383
+ %353 = fmul float %347, %352, !dbg !63
384
+ %354 = fadd float %350, %353, !dbg !64
385
+ %355 = fsub float %207, %349, !dbg !50
386
+ %356 = fadd float %292, %344, !dbg !54
387
+ %357 = fcmp oeq float %356, 0.000000e+00, !dbg !55
388
+ %358 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %292, float %356) #6, !dbg !56
389
+ %359 = select i1 %357, float 0.000000e+00, float %358, !dbg !57
390
+ %360 = fmul float %359, %355, !dbg !58
391
+ %361 = fadd float %349, %360, !dbg !59
392
+ %362 = fadd float %231, %354, !dbg !60
393
+ %363 = fmul float %355, %355, !dbg !61
394
+ %364 = fmul float %344, %363, !dbg !62
395
+ %365 = fmul float %359, %364, !dbg !63
396
+ %366 = fadd float %362, %365, !dbg !64
397
+ %367 = fsub float %208, %361, !dbg !50
398
+ %368 = fadd float %294, %356, !dbg !54
399
+ %369 = fcmp oeq float %368, 0.000000e+00, !dbg !55
400
+ %370 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %294, float %368) #6, !dbg !56
401
+ %371 = select i1 %369, float 0.000000e+00, float %370, !dbg !57
402
+ %372 = fmul float %371, %367, !dbg !58
403
+ %373 = fadd float %361, %372, !dbg !59
404
+ %374 = fadd float %232, %366, !dbg !60
405
+ %375 = fmul float %367, %367, !dbg !61
406
+ %376 = fmul float %356, %375, !dbg !62
407
+ %377 = fmul float %371, %376, !dbg !63
408
+ %378 = fadd float %374, %377, !dbg !64
409
+ %379 = bitcast float %373 to i32, !dbg !65
410
+ %380 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %379, i32 8, i32 31), !dbg !65
411
+ %381 = bitcast i32 %380 to float, !dbg !65
412
+ %382 = bitcast float %378 to i32, !dbg !65
413
+ %383 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %382, i32 8, i32 31), !dbg !65
414
+ %384 = bitcast i32 %383 to float, !dbg !65
415
+ %385 = bitcast float %368 to i32, !dbg !65
416
+ %386 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %385, i32 8, i32 31), !dbg !65
417
+ %387 = bitcast i32 %386 to float, !dbg !65
418
+ %388 = fsub float %381, %373, !dbg !50
419
+ %389 = fadd float %368, %387, !dbg !54
420
+ %390 = fcmp oeq float %389, 0.000000e+00, !dbg !55
421
+ %391 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %387, float %389) #6, !dbg !56
422
+ %392 = select i1 %390, float 0.000000e+00, float %391, !dbg !57
423
+ %393 = fmul float %392, %388, !dbg !58
424
+ %394 = fadd float %373, %393, !dbg !59
425
+ %395 = fadd float %378, %384, !dbg !60
426
+ %396 = fmul float %388, %388, !dbg !61
427
+ %397 = fmul float %368, %396, !dbg !62
428
+ %398 = fmul float %392, %397, !dbg !63
429
+ %399 = fadd float %395, %398, !dbg !64
430
+ %400 = bitcast float %394 to i32, !dbg !65
431
+ %401 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %400, i32 4, i32 31), !dbg !65
432
+ %402 = bitcast i32 %401 to float, !dbg !65
433
+ %403 = bitcast float %399 to i32, !dbg !65
434
+ %404 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %403, i32 4, i32 31), !dbg !65
435
+ %405 = bitcast i32 %404 to float, !dbg !65
436
+ %406 = bitcast float %389 to i32, !dbg !65
437
+ %407 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %406, i32 4, i32 31), !dbg !65
438
+ %408 = bitcast i32 %407 to float, !dbg !65
439
+ %409 = fsub float %402, %394, !dbg !50
440
+ %410 = fadd float %389, %408, !dbg !54
441
+ %411 = fcmp oeq float %410, 0.000000e+00, !dbg !55
442
+ %412 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %408, float %410) #6, !dbg !56
443
+ %413 = select i1 %411, float 0.000000e+00, float %412, !dbg !57
444
+ %414 = fmul float %413, %409, !dbg !58
445
+ %415 = fadd float %394, %414, !dbg !59
446
+ %416 = fadd float %399, %405, !dbg !60
447
+ %417 = fmul float %409, %409, !dbg !61
448
+ %418 = fmul float %389, %417, !dbg !62
449
+ %419 = fmul float %413, %418, !dbg !63
450
+ %420 = fadd float %416, %419, !dbg !64
451
+ %421 = bitcast float %415 to i32, !dbg !65
452
+ %422 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %421, i32 2, i32 31), !dbg !65
453
+ %423 = bitcast i32 %422 to float, !dbg !65
454
+ %424 = bitcast float %420 to i32, !dbg !65
455
+ %425 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %424, i32 2, i32 31), !dbg !65
456
+ %426 = bitcast i32 %425 to float, !dbg !65
457
+ %427 = bitcast float %410 to i32, !dbg !65
458
+ %428 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %427, i32 2, i32 31), !dbg !65
459
+ %429 = bitcast i32 %428 to float, !dbg !65
460
+ %430 = fsub float %423, %415, !dbg !50
461
+ %431 = fadd float %410, %429, !dbg !54
462
+ %432 = fcmp oeq float %431, 0.000000e+00, !dbg !55
463
+ %433 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %429, float %431) #6, !dbg !56
464
+ %434 = select i1 %432, float 0.000000e+00, float %433, !dbg !57
465
+ %435 = fmul float %434, %430, !dbg !58
466
+ %436 = fadd float %415, %435, !dbg !59
467
+ %437 = fadd float %420, %426, !dbg !60
468
+ %438 = fmul float %430, %430, !dbg !61
469
+ %439 = fmul float %410, %438, !dbg !62
470
+ %440 = fmul float %434, %439, !dbg !63
471
+ %441 = fadd float %437, %440, !dbg !64
472
+ %442 = bitcast float %436 to i32, !dbg !65
473
+ %443 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %442, i32 1, i32 31), !dbg !65
474
+ %444 = bitcast i32 %443 to float, !dbg !65
475
+ %445 = bitcast float %441 to i32, !dbg !65
476
+ %446 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %445, i32 1, i32 31), !dbg !65
477
+ %447 = bitcast i32 %446 to float, !dbg !65
478
+ %448 = bitcast float %431 to i32, !dbg !65
479
+ %449 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %448, i32 1, i32 31), !dbg !65
480
+ %450 = bitcast i32 %449 to float, !dbg !65
481
+ %451 = fsub float %444, %436, !dbg !50
482
+ %452 = fadd float %431, %450, !dbg !54
483
+ %453 = fcmp oeq float %452, 0.000000e+00, !dbg !55
484
+ %454 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %450, float %452) #6, !dbg !56
485
+ %455 = select i1 %453, float 0.000000e+00, float %454, !dbg !57
486
+ %456 = fmul float %455, %451, !dbg !58
487
+ %457 = fadd float %436, %456, !dbg !59
488
+ %458 = fadd float %441, %447, !dbg !60
489
+ %459 = fmul float %451, %451, !dbg !61
490
+ %460 = fmul float %431, %459, !dbg !62
491
+ %461 = fmul float %455, %460, !dbg !63
492
+ %462 = fadd float %458, %461, !dbg !64
493
+ %463 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
494
+ %464 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
495
+ %465 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
496
+ %466 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
497
+ %467 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
498
+ %468 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
499
+ %469 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
500
+ %470 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
501
+ %471 = fadd float %463, 0x3EE4F8B580000000, !dbg !68
502
+ br label %472, !dbg !69
503
+
504
+ 472: ; preds = %233, %__nv_rsqrtf.exit
505
+ %473 = phi i1 [ true, %233 ], [ false, %__nv_rsqrtf.exit ]
506
+ %474 = phi i32 [ 0, %233 ], [ 128, %__nv_rsqrtf.exit ]
507
+ %475 = or i32 %474, %13, !dbg !70
508
+ %476 = or i32 %474, %14, !dbg !70
509
+ %477 = or i32 %475, %33, !dbg !71
510
+ %478 = or i32 %476, %33, !dbg !71
511
+ %479 = sext i32 %477 to i64, !dbg !72
512
+ %480 = getelementptr float, ptr addrspace(1) %2, i64 %479, !dbg !72
513
+ %481 = sext i32 %478 to i64, !dbg !72
514
+ %482 = getelementptr float, ptr addrspace(1) %2, i64 %481, !dbg !72
515
+ %483 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %480, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
516
+ %484 = extractvalue { i32, i32, i32, i32 } %483, 0, !dbg !73
517
+ %485 = extractvalue { i32, i32, i32, i32 } %483, 1, !dbg !73
518
+ %486 = extractvalue { i32, i32, i32, i32 } %483, 2, !dbg !73
519
+ %487 = extractvalue { i32, i32, i32, i32 } %483, 3, !dbg !73
520
+ %488 = bitcast i32 %484 to float, !dbg !73
521
+ %489 = bitcast i32 %485 to float, !dbg !73
522
+ %490 = bitcast i32 %486 to float, !dbg !73
523
+ %491 = bitcast i32 %487 to float, !dbg !73
524
+ %492 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %482, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
525
+ %493 = extractvalue { i32, i32, i32, i32 } %492, 0, !dbg !73
526
+ %494 = extractvalue { i32, i32, i32, i32 } %492, 1, !dbg !73
527
+ %495 = extractvalue { i32, i32, i32, i32 } %492, 2, !dbg !73
528
+ %496 = extractvalue { i32, i32, i32, i32 } %492, 3, !dbg !73
529
+ %497 = bitcast i32 %493 to float, !dbg !73
530
+ %498 = bitcast i32 %494 to float, !dbg !73
531
+ %499 = bitcast i32 %495 to float, !dbg !73
532
+ %500 = bitcast i32 %496 to float, !dbg !73
533
+ %501 = or i32 %475, %34, !dbg !74
534
+ %502 = sext i32 %501 to i64, !dbg !75
535
+ %503 = getelementptr i16, ptr addrspace(1) %3, i64 %502, !dbg !75
536
+ %504 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %503, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76
537
+ %505 = extractvalue { i32, i32, i32, i32 } %504, 0, !dbg !76
538
+ %506 = extractvalue { i32, i32, i32, i32 } %504, 1, !dbg !76
539
+ %507 = extractvalue { i32, i32, i32, i32 } %504, 2, !dbg !76
540
+ %508 = extractvalue { i32, i32, i32, i32 } %504, 3, !dbg !76
541
+ %509 = trunc i32 %505 to i16, !dbg !76
542
+ %extelt.offset = lshr i32 %505, 16, !dbg !76
543
+ %510 = trunc i32 %extelt.offset to i16, !dbg !76
544
+ %511 = trunc i32 %506 to i16, !dbg !76
545
+ %extelt.offset2 = lshr i32 %506, 16, !dbg !76
546
+ %512 = trunc i32 %extelt.offset2 to i16, !dbg !76
547
+ %513 = trunc i32 %507 to i16, !dbg !76
548
+ %extelt.offset3 = lshr i32 %507, 16, !dbg !76
549
+ %514 = trunc i32 %extelt.offset3 to i16, !dbg !76
550
+ %515 = trunc i32 %508 to i16, !dbg !76
551
+ %extelt.offset4 = lshr i32 %508, 16, !dbg !76
552
+ %516 = trunc i32 %extelt.offset4 to i16, !dbg !76
553
+ %517 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %509) #6, !dbg !77
554
+ %518 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %510) #6, !dbg !77
555
+ %519 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %511) #6, !dbg !77
556
+ %520 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %512) #6, !dbg !77
557
+ %521 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %513) #6, !dbg !77
558
+ %522 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %514) #6, !dbg !77
559
+ %523 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %515) #6, !dbg !77
560
+ %524 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %516) #6, !dbg !77
561
+ %525 = zext nneg i32 %475 to i64, !dbg !78
562
+ %526 = getelementptr float, ptr addrspace(1) %4, i64 %525, !dbg !78
563
+ %527 = zext nneg i32 %476 to i64, !dbg !78
564
+ %528 = getelementptr float, ptr addrspace(1) %4, i64 %527, !dbg !78
565
+ %529 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %526, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
566
+ %530 = extractvalue { i32, i32, i32, i32 } %529, 0, !dbg !79
567
+ %531 = extractvalue { i32, i32, i32, i32 } %529, 1, !dbg !79
568
+ %532 = extractvalue { i32, i32, i32, i32 } %529, 2, !dbg !79
569
+ %533 = extractvalue { i32, i32, i32, i32 } %529, 3, !dbg !79
570
+ %534 = bitcast i32 %530 to float, !dbg !79
571
+ %535 = bitcast i32 %531 to float, !dbg !79
572
+ %536 = bitcast i32 %532 to float, !dbg !79
573
+ %537 = bitcast i32 %533 to float, !dbg !79
574
+ %538 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %528, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
575
+ %539 = extractvalue { i32, i32, i32, i32 } %538, 0, !dbg !79
576
+ %540 = extractvalue { i32, i32, i32, i32 } %538, 1, !dbg !79
577
+ %541 = extractvalue { i32, i32, i32, i32 } %538, 2, !dbg !79
578
+ %542 = extractvalue { i32, i32, i32, i32 } %538, 3, !dbg !79
579
+ %543 = bitcast i32 %539 to float, !dbg !79
580
+ %544 = bitcast i32 %540 to float, !dbg !79
581
+ %545 = bitcast i32 %541 to float, !dbg !79
582
+ %546 = bitcast i32 %542 to float, !dbg !79
583
+ br i1 %39, label %547, label %548, !dbg !80
584
+
585
+ 547: ; preds = %472
586
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 1892, ptr nonnull @assertFunc_1, i64 1), !dbg !80
587
+ br label %548, !dbg !80
588
+
589
+ 548: ; preds = %547, %472
590
+ %549 = getelementptr float, ptr addrspace(1) %43, i64 %525, !dbg !81
591
+ %550 = getelementptr float, ptr addrspace(1) %43, i64 %527, !dbg !81
592
+ %551 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %549, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
593
+ %552 = extractvalue { i32, i32, i32, i32 } %551, 0, !dbg !82
594
+ %553 = extractvalue { i32, i32, i32, i32 } %551, 1, !dbg !82
595
+ %554 = extractvalue { i32, i32, i32, i32 } %551, 2, !dbg !82
596
+ %555 = extractvalue { i32, i32, i32, i32 } %551, 3, !dbg !82
597
+ %556 = bitcast i32 %552 to float, !dbg !82
598
+ %557 = bitcast i32 %553 to float, !dbg !82
599
+ %558 = bitcast i32 %554 to float, !dbg !82
600
+ %559 = bitcast i32 %555 to float, !dbg !82
601
+ %560 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %550, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
602
+ %561 = extractvalue { i32, i32, i32, i32 } %560, 0, !dbg !82
603
+ %562 = extractvalue { i32, i32, i32, i32 } %560, 1, !dbg !82
604
+ %563 = extractvalue { i32, i32, i32, i32 } %560, 2, !dbg !82
605
+ %564 = extractvalue { i32, i32, i32, i32 } %560, 3, !dbg !82
606
+ %565 = bitcast i32 %561 to float, !dbg !82
607
+ %566 = bitcast i32 %562 to float, !dbg !82
608
+ %567 = bitcast i32 %563 to float, !dbg !82
609
+ %568 = bitcast i32 %564 to float, !dbg !82
610
+ %569 = fadd float %488, %556, !dbg !83
611
+ %570 = fadd float %489, %557, !dbg !83
612
+ %571 = fadd float %490, %558, !dbg !83
613
+ %572 = fadd float %491, %559, !dbg !83
614
+ %573 = fadd float %497, %565, !dbg !83
615
+ %574 = fadd float %498, %566, !dbg !83
616
+ %575 = fadd float %499, %567, !dbg !83
617
+ %576 = fadd float %500, %568, !dbg !83
618
+ %577 = fadd float %517, %569, !dbg !84
619
+ %578 = fadd float %518, %570, !dbg !84
620
+ %579 = fadd float %519, %571, !dbg !84
621
+ %580 = fadd float %520, %572, !dbg !84
622
+ %581 = fadd float %521, %573, !dbg !84
623
+ %582 = fadd float %522, %574, !dbg !84
624
+ %583 = fadd float %523, %575, !dbg !84
625
+ %584 = fadd float %524, %576, !dbg !84
626
+ %585 = fsub float %577, %457, !dbg !85
627
+ %586 = fsub float %578, %457, !dbg !85
628
+ %587 = fsub float %579, %457, !dbg !85
629
+ %588 = fsub float %580, %457, !dbg !85
630
+ %589 = fsub float %581, %457, !dbg !85
631
+ %590 = fsub float %582, %457, !dbg !85
632
+ %591 = fsub float %583, %457, !dbg !85
633
+ %592 = fsub float %584, %457, !dbg !85
634
+ %593 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
635
+ %.not.i = icmp eq i32 %593, 0, !dbg !86
636
+ br i1 %.not.i, label %596, label %594, !dbg !86
637
+
638
+ 594: ; preds = %548
639
+ %595 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %471), !dbg !86
640
+ br label %__nv_rsqrtf.exit, !dbg !86
641
+
642
+ 596: ; preds = %548
643
+ %597 = tail call float @llvm.nvvm.rsqrt.approx.f(float %471), !dbg !86
644
+ br label %__nv_rsqrtf.exit, !dbg !86
645
+
646
+ __nv_rsqrtf.exit: ; preds = %594, %596
647
+ %.0.i = phi float [ %595, %594 ], [ %597, %596 ], !dbg !86
648
+ %598 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
649
+ %599 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
650
+ %600 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
651
+ %601 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
652
+ %602 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
653
+ %603 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
654
+ %604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
655
+ %605 = fmul float %585, %.0.i, !dbg !87
656
+ %606 = fmul float %586, %.0.i, !dbg !87
657
+ %607 = fmul float %587, %.0.i, !dbg !87
658
+ %608 = fmul float %588, %.0.i, !dbg !87
659
+ %609 = fmul float %589, %.0.i, !dbg !87
660
+ %610 = fmul float %590, %.0.i, !dbg !87
661
+ %611 = fmul float %591, %.0.i, !dbg !87
662
+ %612 = fmul float %592, %.0.i, !dbg !87
663
+ %613 = fmul float %605, %534, !dbg !88
664
+ %614 = fmul float %606, %535, !dbg !88
665
+ %615 = fmul float %607, %536, !dbg !88
666
+ %616 = fmul float %608, %537, !dbg !88
667
+ %617 = fmul float %609, %543, !dbg !88
668
+ %618 = fmul float %610, %544, !dbg !88
669
+ %619 = fmul float %611, %545, !dbg !88
670
+ %620 = fmul float %612, %546, !dbg !88
671
+ %621 = getelementptr i16, ptr addrspace(1) %5, i64 %502, !dbg !89
672
+ %622 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %613) #6, !dbg !90
673
+ %623 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %614) #6, !dbg !90
674
+ %624 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %615) #6, !dbg !90
675
+ %625 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %616) #6, !dbg !90
676
+ %626 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %617) #6, !dbg !90
677
+ %627 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %618) #6, !dbg !90
678
+ %628 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %619) #6, !dbg !90
679
+ %629 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %620) #6, !dbg !90
680
+ %630 = insertelement <2 x i16> undef, i16 %622, i64 0, !dbg !90
681
+ %631 = insertelement <2 x i16> %630, i16 %623, i64 1, !dbg !90
682
+ %632 = bitcast <2 x i16> %631 to i32, !dbg !90
683
+ %633 = insertelement <2 x i16> undef, i16 %624, i64 0, !dbg !90
684
+ %634 = insertelement <2 x i16> %633, i16 %625, i64 1, !dbg !90
685
+ %635 = bitcast <2 x i16> %634 to i32, !dbg !90
686
+ %636 = insertelement <2 x i16> undef, i16 %626, i64 0, !dbg !90
687
+ %637 = insertelement <2 x i16> %636, i16 %627, i64 1, !dbg !90
688
+ %638 = bitcast <2 x i16> %637 to i32, !dbg !90
689
+ %639 = insertelement <2 x i16> undef, i16 %628, i64 0, !dbg !90
690
+ %640 = insertelement <2 x i16> %639, i16 %629, i64 1, !dbg !90
691
+ %641 = bitcast <2 x i16> %640 to i32, !dbg !90
692
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %632, i32 %635, i32 %638, i32 %641, ptr addrspace(1) %621, i1 true) #6, !dbg !90
693
+ br i1 %473, label %472, label %642, !dbg !69
694
+
695
+ 642: ; preds = %__nv_rsqrtf.exit
696
+ ret void, !dbg !91
697
+ }
698
+
699
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
700
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
701
+
702
+ ; Function Attrs: convergent nocallback nounwind
703
+ declare void @llvm.nvvm.barrier0() #1
704
+
705
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
706
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
707
+
708
+ ; Function Attrs: alwaysinline nounwind
709
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { ; Returns the result of one of two NVVM approximate-rsqrt intrinsics applied to %x; the variant is picked from the value returned by @__nvvm_reflect.
710
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 ; reflect query keyed by @.str (defined outside this view; presumably the FTZ setting, cf. the "nvvm-reflect-ftz" module flag -- TODO confirm)
711
+ %.not = icmp eq i32 %1, 0 ; true when the reflect query returned 0
712
+ br i1 %.not, label %4, label %2 ; 0 -> block 4 (non-ftz intrinsic), non-zero -> block 2 (ftz intrinsic)
713
+
714
+ 2: ; preds = %0 -- taken when the reflect query returned non-zero
715
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) ; approximate rsqrt, .ftz variant
716
+ br label %6
717
+
718
+ 4: ; preds = %0 -- taken when the reflect query returned 0
719
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) ; approximate rsqrt, non-.ftz variant
720
+ br label %6
721
+
722
+ 6: ; preds = %4, %2 -- join point of both variants
723
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ] ; select the value produced by whichever predecessor ran
724
+ ret float %.0 ; hand the selected result back to the caller
725
+ }
726
+
727
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
728
+
729
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
730
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
731
+
732
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
733
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
734
+
735
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
736
+ attributes #1 = { convergent nocallback nounwind }
737
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
738
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
739
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
740
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
741
+ attributes #6 = { nounwind }
742
+
743
+ !llvm.module.flags = !{!0, !1}
744
+ !llvm.dbg.cu = !{!2}
745
+ !nvvm.annotations = !{!4, !5, !5, !4}
746
+ !llvm.ident = !{!6}
747
+
748
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
749
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
750
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
751
+ !3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
752
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
753
+ !5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 256}
754
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
755
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
756
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
757
+ !9 = !{}
758
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
759
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
760
+ !12 = !DILocation(line: 21, column: 28, scope: !7)
761
+ !13 = !DILocation(line: 21, column: 33, scope: !7)
762
+ !14 = !DILocation(line: 22, column: 23, scope: !7)
763
+ !15 = !DILocation(line: 26, column: 30, scope: !7)
764
+ !16 = !DILocation(line: 26, column: 35, scope: !7)
765
+ !17 = !DILocation(line: 27, column: 18, scope: !7)
766
+ !18 = !DILocation(line: 35, column: 44, scope: !7)
767
+ !19 = !DILocation(line: 36, column: 44, scope: !7)
768
+ !20 = !DILocation(line: 37, column: 22, scope: !7)
769
+ !21 = !DILocation(line: 38, column: 22, scope: !7)
770
+ !22 = !DILocation(line: 39, column: 36, scope: !7)
771
+ !23 = !DILocation(line: 40, column: 40, scope: !7)
772
+ !24 = !DILocation(line: 41, column: 44, scope: !7)
773
+ !25 = !DILocation(line: 31, column: 36, scope: !7)
774
+ !26 = !DILocation(line: 32, column: 27, scope: !7)
775
+ !27 = !DILocation(line: 35, column: 40, scope: !7)
776
+ !28 = !DILocation(line: 35, column: 34, scope: !7)
777
+ !29 = !DILocation(line: 35, column: 50, scope: !7)
778
+ !30 = !DILocation(line: 36, column: 40, scope: !7)
779
+ !31 = !DILocation(line: 36, column: 34, scope: !7)
780
+ !32 = !DILocation(line: 36, column: 50, scope: !7)
781
+ !33 = !DILocation(line: 36, column: 101, scope: !7)
782
+ !34 = !DILocation(line: 40, column: 55, scope: !7)
783
+ !35 = !DILocation(line: 41, column: 40, scope: !7)
784
+ !36 = !DILocation(line: 41, column: 34, scope: !7)
785
+ !37 = !DILocation(line: 41, column: 52, scope: !7)
786
+ !38 = !DILocation(line: 42, column: 22, scope: !7)
787
+ !39 = !DILocation(line: 44, column: 22, scope: !7)
788
+ !40 = !DILocation(line: 96, column: 20, scope: !41, inlinedAt: !43)
789
+ !41 = distinct !DILexicalBlockFile(scope: !7, file: !42, discriminator: 0)
790
+ !42 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
791
+ !43 = !DILocation(line: 47, column: 41, scope: !41)
792
+ !44 = !DILocation(line: 97, column: 26, scope: !41, inlinedAt: !43)
793
+ !45 = !DILocation(line: 98, column: 30, scope: !41, inlinedAt: !43)
794
+ !46 = !DILocation(line: 98, column: 22, scope: !41, inlinedAt: !43)
795
+ !47 = !DILocation(line: 101, column: 30, scope: !41, inlinedAt: !43)
796
+ !48 = !DILocation(line: 101, column: 22, scope: !41, inlinedAt: !43)
797
+ !49 = !DILocation(line: 50, column: 50, scope: !7)
798
+ !50 = !DILocation(line: 108, column: 21, scope: !51, inlinedAt: !52)
799
+ !51 = distinct !DILexicalBlockFile(scope: !41, file: !42, discriminator: 0)
800
+ !52 = !DILocation(line: 120, column: 46, scope: !51, inlinedAt: !53)
801
+ !53 = !DILocation(line: 53, column: 44, scope: !51)
802
+ !54 = !DILocation(line: 109, column: 28, scope: !51, inlinedAt: !52)
803
+ !55 = !DILocation(line: 110, column: 39, scope: !51, inlinedAt: !52)
804
+ !56 = !DILocation(line: 110, column: 60, scope: !51, inlinedAt: !52)
805
+ !57 = !DILocation(line: 110, column: 49, scope: !51, inlinedAt: !52)
806
+ !58 = !DILocation(line: 112, column: 25, scope: !51, inlinedAt: !52)
807
+ !59 = !DILocation(line: 112, column: 17, scope: !51, inlinedAt: !52)
808
+ !60 = !DILocation(line: 113, column: 15, scope: !51, inlinedAt: !52)
809
+ !61 = !DILocation(line: 113, column: 30, scope: !51, inlinedAt: !52)
810
+ !62 = !DILocation(line: 113, column: 38, scope: !51, inlinedAt: !52)
811
+ !63 = !DILocation(line: 113, column: 49, scope: !51, inlinedAt: !52)
812
+ !64 = !DILocation(line: 113, column: 22, scope: !51, inlinedAt: !52)
813
+ !65 = !DILocation(line: 120, column: 46, scope: !41, inlinedAt: !66)
814
+ !66 = !DILocation(line: 53, column: 44, scope: !41)
815
+ !67 = !DILocation(line: 75, column: 24, scope: !7)
816
+ !68 = !DILocation(line: 77, column: 24, scope: !7)
817
+ !69 = !DILocation(line: 58, column: 36, scope: !7)
818
+ !70 = !DILocation(line: 59, column: 27, scope: !7)
819
+ !71 = !DILocation(line: 62, column: 41, scope: !7)
820
+ !72 = !DILocation(line: 62, column: 35, scope: !7)
821
+ !73 = !DILocation(line: 62, column: 51, scope: !7)
822
+ !74 = !DILocation(line: 63, column: 41, scope: !7)
823
+ !75 = !DILocation(line: 63, column: 35, scope: !7)
824
+ !76 = !DILocation(line: 63, column: 51, scope: !7)
825
+ !77 = !DILocation(line: 63, column: 103, scope: !7)
826
+ !78 = !DILocation(line: 64, column: 35, scope: !7)
827
+ !79 = !DILocation(line: 64, column: 40, scope: !7)
828
+ !80 = !DILocation(line: 68, column: 57, scope: !7)
829
+ !81 = !DILocation(line: 69, column: 35, scope: !7)
830
+ !82 = !DILocation(line: 69, column: 54, scope: !7)
831
+ !83 = !DILocation(line: 70, column: 24, scope: !7)
832
+ !84 = !DILocation(line: 72, column: 24, scope: !7)
833
+ !85 = !DILocation(line: 73, column: 24, scope: !7)
834
+ !86 = !DILocation(line: 78, column: 30, scope: !7)
835
+ !87 = !DILocation(line: 79, column: 24, scope: !7)
836
+ !88 = !DILocation(line: 80, column: 24, scope: !7)
837
+ !89 = !DILocation(line: 82, column: 29, scope: !7)
838
+ !90 = !DILocation(line: 82, column: 52, scope: !7)
839
+ !91 = !DILocation(line: 58, column: 4, scope: !7)