0-hero commited on
Commit
6f0bac9
·
verified ·
1 Parent(s): 934a9ba

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.cubin +0 -0
  2. .triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttgir +125 -0
  3. .triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ptx +782 -0
  4. .triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.cubin +0 -0
  5. .triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.llir +156 -0
  6. .triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ptx +525 -0
  7. .triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ttgir +92 -0
  8. .triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ttir +99 -0
  9. .triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ttir +99 -0
  10. .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.cubin +0 -0
  11. .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.llir +43 -0
  12. .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ptx +278 -0
  13. .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttgir +18 -0
  14. .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttir +17 -0
  15. .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.cubin +0 -0
  16. .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.llir +296 -0
  17. .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ptx +743 -0
  18. .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttgir +73 -0
  19. .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttir +72 -0
  20. .triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.ttir +98 -0
  21. .triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.cubin +0 -0
  22. .triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ptx +1054 -0
  23. .triton/dump/510522bb05917b836ed253751364fcad/triton_.cubin +0 -0
  24. .triton/dump/510522bb05917b836ed253751364fcad/triton_.llir +1211 -0
  25. .triton/dump/510522bb05917b836ed253751364fcad/triton_.ptx +1810 -0
  26. .triton/dump/510522bb05917b836ed253751364fcad/triton_.ttir +137 -0
  27. .triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.llir +1360 -0
  28. .triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ttir +151 -0
  29. .triton/dump/550b88a9db74a71f80def697002389b5/triton_.cubin +0 -0
  30. .triton/dump/550b88a9db74a71f80def697002389b5/triton_.llir +269 -0
  31. .triton/dump/550b88a9db74a71f80def697002389b5/triton_.ptx +642 -0
  32. .triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttgir +60 -0
  33. .triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttir +53 -0
  34. .triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ttgir +66 -0
  35. .triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.cubin +0 -0
  36. .triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.llir +45 -0
  37. .triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ptx +279 -0
  38. .triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttgir +16 -0
  39. .triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttir +15 -0
  40. .triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ttgir +110 -0
  41. .triton/dump/962d1809855a53123762906133b1d960/triton_.cubin +0 -0
  42. .triton/dump/962d1809855a53123762906133b1d960/triton_.llir +48 -0
  43. .triton/dump/962d1809855a53123762906133b1d960/triton_.ptx +282 -0
  44. .triton/dump/962d1809855a53123762906133b1d960/triton_.ttgir +18 -0
  45. .triton/dump/962d1809855a53123762906133b1d960/triton_.ttir +17 -0
  46. .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.cubin +0 -0
  47. .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.llir +368 -0
  48. .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ptx +771 -0
  49. .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttgir +127 -0
  50. .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttir +100 -0
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.cubin ADDED
Binary file (29 kB). View file
 
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttgir ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<512> : tensor<2x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked>
8
+ %cst_1 = arith.constant dense<256> : tensor<1x256xi32, #blocked1>
9
+ %cst_2 = arith.constant dense<256> : tensor<2x1xi32, #blocked>
10
+ %cst_3 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked>
11
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked>
12
+ %cst_5 = arith.constant dense<256> : tensor<2x1xi64, #blocked>
13
+ %cst_6 = arith.constant dense<50257> : tensor<2x1xi64, #blocked>
14
+ %cst_7 = arith.constant dense<0> : tensor<2x1xi64, #blocked>
15
+ %cst_8 = arith.constant dense<0> : tensor<2x1xi64, #blocked2>
16
+ %cst_9 = arith.constant dense<50257> : tensor<2x1xi64, #blocked2>
17
+ %cst_10 = arith.constant 0.000000e+00 : f32
18
+ %cst_11 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32, #blocked>
19
+ %cst_12 = arith.constant dense<2.560000e+02> : tensor<2x1xf32, #blocked>
20
+ %cst_13 = arith.constant dense<0.000000e+00> : tensor<2x256xf32, #blocked>
21
+ %cst_14 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked1>
22
+ %c2_i32 = arith.constant 2 : i32
23
+ %0 = tt.get_program_id x : i32
24
+ %1 = arith.muli %0, %c2_i32 : i32
25
+ %2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
26
+ %3 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
27
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xi32, #blocked>
28
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<2x1xi32, #blocked2>
29
+ %6 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked>
30
+ %7 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked2>
31
+ %8 = arith.addi %6, %4 : tensor<2x1xi32, #blocked>
32
+ %9 = arith.addi %7, %5 : tensor<2x1xi32, #blocked2>
33
+ %10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
34
+ %11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
35
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
36
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x256xi32, #blocked1>
37
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked>
38
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked2>
39
+ %16 = tt.addptr %14, %8 : tensor<2x1x!tt.ptr<i64, 1>, #blocked>, tensor<2x1xi32, #blocked>
40
+ %17 = tt.addptr %15, %9 : tensor<2x1x!tt.ptr<i64, 1>, #blocked2>, tensor<2x1xi32, #blocked2>
41
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked>
42
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked2>
43
+ %20 = arith.remsi %8, %cst : tensor<2x1xi32, #blocked>
44
+ %21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked>
45
+ %22 = arith.cmpi slt, %13, %cst_1 : tensor<1x256xi32, #blocked1>
46
+ %23 = arith.muli %20, %cst_2 : tensor<2x1xi32, #blocked>
47
+ %24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<2x256xi32, #blocked>
48
+ %25 = tt.broadcast %23 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked>
49
+ %26 = arith.addi %24, %25 : tensor<2x256xi32, #blocked>
50
+ %27 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>, #blocked>
51
+ %28 = tt.addptr %27, %26 : tensor<2x256x!tt.ptr<f32, 1>, #blocked>, tensor<2x256xi32, #blocked>
52
+ %29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<2x256xi1, #blocked>
53
+ %30 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
54
+ %31 = arith.addi %18, %cst_6 : tensor<2x1xi64, #blocked>
55
+ %32 = arith.addi %19, %cst_9 : tensor<2x1xi64, #blocked2>
56
+ %33 = arith.cmpi slt, %18, %cst_7 : tensor<2x1xi64, #blocked>
57
+ %34 = arith.cmpi slt, %19, %cst_8 : tensor<2x1xi64, #blocked2>
58
+ %35 = arith.select %33, %31, %18 : tensor<2x1xi1, #blocked>, tensor<2x1xi64, #blocked>
59
+ %36 = arith.select %34, %32, %19 : tensor<2x1xi1, #blocked2>, tensor<2x1xi64, #blocked2>
60
+ %37 = arith.cmpi sge, %36, %cst_8 : tensor<2x1xi64, #blocked2>
61
+ %38 = arith.cmpi slt, %36, %cst_9 : tensor<2x1xi64, #blocked2>
62
+ %39 = arith.andi %37, %38 : tensor<2x1xi1, #blocked2>
63
+ tt.assert %39, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2>
64
+ %40 = arith.muli %35, %cst_5 : tensor<2x1xi64, #blocked>
65
+ %41 = tt.broadcast %40 : (tensor<2x1xi64, #blocked>) -> tensor<2x256xi64, #blocked>
66
+ %42 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
67
+ %43 = tt.broadcast %42 : (tensor<1x256xi64, #blocked>) -> tensor<2x256xi64, #blocked>
68
+ %44 = arith.addi %43, %41 : tensor<2x256xi64, #blocked>
69
+ %45 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>, #blocked>
70
+ %46 = tt.addptr %45, %44 : tensor<2x256x!tt.ptr<f32, 1>, #blocked>, tensor<2x256xi64, #blocked>
71
+ %47 = tt.load %46, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
72
+ %48 = arith.addf %47, %30 : tensor<2x256xf32, #blocked>
73
+ %49 = arith.addf %48, %cst_13 : tensor<2x256xf32, #blocked>
74
+ %50 = arith.subf %48, %49 : tensor<2x256xf32, #blocked>
75
+ %51 = arith.mulf %48, %50 : tensor<2x256xf32, #blocked>
76
+ %52 = arith.addf %51, %cst_13 : tensor<2x256xf32, #blocked>
77
+ %53 = arith.select %29, %49, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked>
78
+ %54 = arith.select %29, %52, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked>
79
+ %55 = arith.select %21, %cst_3, %cst_4 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
80
+ %56 = tt.broadcast %55 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked>
81
+ %57:3 = "tt.reduce"(%53, %54, %56) <{axis = 1 : i32}> ({
82
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
83
+ %82 = arith.subf %arg10, %arg7 : f32
84
+ %83 = arith.addf %arg9, %arg12 : f32
85
+ %84 = arith.cmpf oeq, %83, %cst_10 : f32
86
+ %85 = arith.divf %arg12, %83 : f32
87
+ %86 = arith.select %84, %cst_10, %85 : f32
88
+ %87 = arith.mulf %82, %86 : f32
89
+ %88 = arith.addf %arg7, %87 : f32
90
+ %89 = arith.addf %arg8, %arg11 : f32
91
+ %90 = arith.mulf %82, %82 : f32
92
+ %91 = arith.mulf %90, %arg9 : f32
93
+ %92 = arith.mulf %91, %86 : f32
94
+ %93 = arith.addf %89, %92 : f32
95
+ tt.reduce.return %88, %93, %83 : f32, f32, f32
96
+ }) : (tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>) -> (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
97
+ %58 = tt.expand_dims %57#0 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
98
+ %59 = tt.expand_dims %57#1 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
99
+ %60 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
100
+ %61 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked1>
101
+ %62 = tt.addptr %61, %13 : tensor<1x256x!tt.ptr<f32, 1>, #blocked1>, tensor<1x256xi32, #blocked1>
102
+ %63 = tt.load %62, %22, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked1>
103
+ tt.assert %39, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2>
104
+ %64 = tt.load %46, %29, %cst_13 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
105
+ %65 = arith.addf %64, %60 : tensor<2x256xf32, #blocked>
106
+ %66 = tt.broadcast %58 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked>
107
+ %67 = arith.subf %65, %66 : tensor<2x256xf32, #blocked>
108
+ %68 = arith.divf %59, %cst_12 : tensor<2x1xf32, #blocked>
109
+ %69 = arith.addf %68, %cst_11 : tensor<2x1xf32, #blocked>
110
+ %70 = tt.extern_elementwise %69 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked>) -> tensor<2x1xf32, #blocked>
111
+ %71 = tt.broadcast %70 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked>
112
+ %72 = arith.mulf %67, %71 : tensor<2x256xf32, #blocked>
113
+ %73 = triton_gpu.convert_layout %63 : (tensor<1x256xf32, #blocked1>) -> tensor<1x256xf32, #blocked>
114
+ %74 = tt.broadcast %73 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked>
115
+ %75 = arith.mulf %72, %74 : tensor<2x256xf32, #blocked>
116
+ %76 = arith.muli %8, %cst_2 : tensor<2x1xi32, #blocked>
117
+ %77 = tt.broadcast %76 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked>
118
+ %78 = arith.addi %24, %77 : tensor<2x256xi32, #blocked>
119
+ %79 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>, #blocked>
120
+ %80 = tt.addptr %79, %78 : tensor<2x256x!tt.ptr<bf16, 1>, #blocked>, tensor<2x256xi32, #blocked>
121
+ %81 = arith.truncf %75 : tensor<2x256xf32, #blocked> to tensor<2x256xbf16, #blocked>
122
+ tt.store %80, %81, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16, #blocked>
123
+ tt.return
124
+ }
125
+ }
.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ptx ADDED
@@ -0,0 +1,782 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7d8de9de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
13
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
14
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
20
+ .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
21
+ .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
22
+ .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
23
+ )
24
+ .maxntid 64, 1, 1
25
+ {
26
+ .reg .pred %p<45>;
27
+ .reg .b16 %rs<5>;
28
+ .reg .b32 %r<106>;
29
+ .reg .f32 %f<90>;
30
+ .reg .b64 %rd<44>;
31
+ .loc 1 18 0
32
+ $L__func_begin0:
33
+ .loc 1 18 0
34
+
35
+ ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
36
+ ld.param.u64 %rd26, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
37
+ $L__tmp0:
38
+ .loc 1 26 26
39
+ mov.u32 %r74, %tid.x;
40
+ and.b32 %r75, %r74, 31;
41
+ ld.param.u64 %rd27, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
42
+ ld.param.u64 %rd28, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
43
+ ld.param.u64 %rd29, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
44
+ shl.b32 %r76, %r74, 2;
45
+ ld.param.u64 %rd30, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
46
+ and.b32 %r77, %r76, 252;
47
+ ld.param.u64 %rd31, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
48
+ ld.param.u64 %rd32, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
49
+ .loc 1 23 28
50
+ mov.u32 %r1, %ctaid.x;
51
+ .loc 1 30 40
52
+ shl.b32 %r78, %r1, 8;
53
+ .loc 1 30 36
54
+ or.b32 %r79, %r78, %r77;
55
+ .loc 1 30 30
56
+ mul.wide.s32 %rd33, %r79, 2;
57
+ add.s64 %rd1, %rd26, %rd33;
58
+ mov.b32 %r4, 0;
59
+ mov.pred %p1, -1;
60
+ .loc 1 30 46
61
+ mov.u32 %r2, 0x0;
62
+ mov.u32 %r3, 0x0;
63
+ @%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ];
64
+ @!%p1 mov.u32 %r2, %r4;
65
+ @!%p1 mov.u32 %r3, %r4;
66
+ cvt.u16.u32 %rs1, %r2;
67
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
68
+ cvt.u16.u32 %rs3, %r3;
69
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
70
+ .loc 1 30 67
71
+ cvt.f32.bf16 %r6, %rs1;
72
+ mov.b32 %f1, %r6;
73
+ cvt.f32.bf16 %r7, %rs2;
74
+ mov.b32 %f2, %r7;
75
+ cvt.f32.bf16 %r8, %rs3;
76
+ mov.b32 %f3, %r8;
77
+ cvt.f32.bf16 %r9, %rs4;
78
+ mov.b32 %f4, %r9;
79
+ .loc 1 31 30
80
+ cvt.u64.u32 %rd34, %r77;
81
+ mul.wide.u32 %rd35, %r77, 4;
82
+ add.s64 %rd2, %rd27, %rd35;
83
+ .loc 1 31 35
84
+ mov.u32 %r10, 0x0;
85
+ mov.u32 %r11, 0x0;
86
+ mov.u32 %r12, 0x0;
87
+ mov.u32 %r13, 0x0;
88
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
89
+ @!%p1 mov.u32 %r10, %r4;
90
+ @!%p1 mov.u32 %r11, %r4;
91
+ @!%p1 mov.u32 %r12, %r4;
92
+ @!%p1 mov.u32 %r13, %r4;
93
+ mov.b32 %f5, %r10;
94
+ mov.b32 %f6, %r11;
95
+ mov.b32 %f7, %r12;
96
+ mov.b32 %f8, %r13;
97
+ .loc 1 32 30
98
+ mul.wide.s32 %rd36, %r79, 4;
99
+ add.s64 %rd3, %rd28, %rd36;
100
+ .loc 1 32 46
101
+ mov.u32 %r18, 0x0;
102
+ mov.u32 %r19, 0x0;
103
+ mov.u32 %r20, 0x0;
104
+ mov.u32 %r21, 0x0;
105
+ @%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
106
+ @!%p1 mov.u32 %r18, %r4;
107
+ @!%p1 mov.u32 %r19, %r4;
108
+ @!%p1 mov.u32 %r20, %r4;
109
+ @!%p1 mov.u32 %r21, %r4;
110
+ mov.b32 %f9, %r18;
111
+ mov.b32 %f10, %r19;
112
+ mov.b32 %f11, %r20;
113
+ mov.b32 %f12, %r21;
114
+ .loc 1 33 30
115
+ mul.wide.s32 %rd37, %r1, 4;
116
+ add.s64 %rd4, %rd29, %rd37;
117
+ .loc 1 33 35
118
+ mov.u32 %r26, 0x0;
119
+ @%p1 ld.global.L1::evict_last.b32 { %r26 }, [ %rd4 + 0 ];
120
+ mov.b32 %f13, %r26;
121
+ mov.u32 %r27, 0x0;
122
+ @%p1 ld.global.L1::evict_last.b32 { %r27 }, [ %rd4 + 0 ];
123
+ mov.u32 %r28, 0x0;
124
+ @%p1 ld.global.L1::evict_last.b32 { %r28 }, [ %rd4 + 0 ];
125
+ mov.u32 %r29, 0x0;
126
+ @%p1 ld.global.L1::evict_last.b32 { %r29 }, [ %rd4 + 0 ];
127
+ .loc 1 34 31
128
+ add.s64 %rd8, %rd30, %rd37;
129
+ .loc 1 34 36
130
+ mov.u32 %r55, 0x0;
131
+ @%p1 ld.global.L1::evict_last.b32 { %r55 }, [ %rd8 + 0 ];
132
+ mov.b32 %f14, %r55;
133
+ mov.u32 %r31, 0x0;
134
+ @%p1 ld.global.L1::evict_last.b32 { %r31 }, [ %rd8 + 0 ];
135
+ mov.u32 %r32, 0x0;
136
+ @%p1 ld.global.L1::evict_last.b32 { %r32 }, [ %rd8 + 0 ];
137
+ mov.u32 %r33, 0x0;
138
+ @%p1 ld.global.L1::evict_last.b32 { %r33 }, [ %rd8 + 0 ];
139
+ .loc 1 35 31
140
+ mul.wide.s32 %rd38, %r1, 8;
141
+ add.s64 %rd13, %rd31, %rd38;
142
+ .loc 1 35 36
143
+ mov.u64 %rd12, 0x0;
144
+ @%p1 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd13 + 0 ];
145
+ mov.u64 %rd14, 0x0;
146
+ @%p1 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd13 + 0 ];
147
+ mov.u64 %rd16, 0x0;
148
+ @%p1 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd13 + 0 ];
149
+ mov.u64 %rd18, 0x0;
150
+ @%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd13 + 0 ];
151
+ .loc 1 36 35
152
+ add.s64 %rd20, %rd25, %rd36;
153
+ .loc 1 36 51
154
+ mov.u32 %r34, 0x0;
155
+ mov.u32 %r35, 0x0;
156
+ mov.u32 %r36, 0x0;
157
+ mov.u32 %r37, 0x0;
158
+ @%p1 ld.global.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd20 + 0 ];
159
+ @!%p1 mov.u32 %r34, %r4;
160
+ @!%p1 mov.u32 %r35, %r4;
161
+ @!%p1 mov.u32 %r36, %r4;
162
+ @!%p1 mov.u32 %r37, %r4;
163
+ mov.b32 %f15, %r34;
164
+ mov.b32 %f16, %r35;
165
+ mov.b32 %f17, %r36;
166
+ mov.b32 %f18, %r37;
167
+ .loc 1 38 18
168
+ mul.f32 %f19, %f1, %f5;
169
+ mul.f32 %f20, %f2, %f6;
170
+ mul.f32 %f21, %f3, %f7;
171
+ mul.f32 %f22, %f4, %f8;
172
+ $L__tmp1:
173
+ .loc 2 233 15
174
+ fma.rn.f32 %f23, %f1, %f5, %f20;
175
+ fma.rn.f32 %f24, %f3, %f7, %f23;
176
+ fma.rn.f32 %f25, %f4, %f8, %f24;
177
+ $L__tmp2:
178
+ .loc 2 243 36
179
+ mov.b32 %r80, %f25;
180
+ shfl.sync.bfly.b32 %r81, %r80, 16, 31, -1;
181
+ mov.b32 %f26, %r81;
182
+ $L__tmp3:
183
+ .loc 2 233 15
184
+ add.f32 %f27, %f25, %f26;
185
+ $L__tmp4:
186
+ .loc 2 243 36
187
+ mov.b32 %r82, %f27;
188
+ shfl.sync.bfly.b32 %r83, %r82, 8, 31, -1;
189
+ mov.b32 %f28, %r83;
190
+ $L__tmp5:
191
+ .loc 2 233 15
192
+ add.f32 %f29, %f27, %f28;
193
+ $L__tmp6:
194
+ .loc 2 243 36
195
+ mov.b32 %r84, %f29;
196
+ shfl.sync.bfly.b32 %r85, %r84, 4, 31, -1;
197
+ mov.b32 %f30, %r85;
198
+ $L__tmp7:
199
+ .loc 2 233 15
200
+ add.f32 %f31, %f29, %f30;
201
+ $L__tmp8:
202
+ .loc 2 243 36
203
+ mov.b32 %r86, %f31;
204
+ shfl.sync.bfly.b32 %r87, %r86, 2, 31, -1;
205
+ mov.b32 %f32, %r87;
206
+ $L__tmp9:
207
+ .loc 2 233 15
208
+ add.f32 %f33, %f31, %f32;
209
+ $L__tmp10:
210
+ .loc 2 243 36
211
+ mov.b32 %r88, %f33;
212
+ shfl.sync.bfly.b32 %r89, %r88, 1, 31, -1;
213
+ mov.b32 %f34, %r89;
214
+ $L__tmp11:
215
+ .loc 2 233 15
216
+ add.f32 %f35, %f33, %f34;
217
+ $L__tmp12:
218
+ .loc 2 243 36
219
+ setp.eq.s32 %p31, %r75, 0;
220
+ shr.u32 %r90, %r74, 3;
221
+ and.b32 %r91, %r90, 4;
222
+ mov.u32 %r92, global_smem;
223
+ add.s32 %r42, %r92, %r91;
224
+ mov.b32 %r43, %f35;
225
+ @%p31 st.shared.b32 [ %r42 + 0 ], %r43;
226
+ bar.sync 0;
227
+ setp.lt.s32 %p32, %r74, 2;
228
+ add.s32 %r45, %r92, %r76;
229
+ @%p32 ld.shared.b32 %r44, [ %r45 + 0 ];
230
+ mov.b32 %f36, %r44;
231
+ shfl.sync.bfly.b32 %r93, %r44, 1, 31, -1;
232
+ mov.b32 %f37, %r93;
233
+ $L__tmp13:
234
+ .loc 2 233 15
235
+ add.f32 %f38, %f36, %f37;
236
+ $L__tmp14:
237
+ .loc 2 243 36
238
+ and.b32 %r94, %r74, 1;
239
+ setp.eq.b32 %p41, %r94, 1;
240
+ not.pred %p42, %p41;
241
+ and.pred %p33, %p32, %p42;
242
+ mov.b32 %r47, %f38;
243
+ @%p33 st.shared.b32 [ %r45 + 0 ], %r47;
244
+ bar.sync 0;
245
+ ld.shared.f32 %f39, [global_smem];
246
+ $L__tmp15:
247
+ .loc 3 8 15
248
+ add.f32 %f40, %f39, 0f00000000;
249
+ $L__tmp16:
250
+ .loc 1 42 19
251
+ sub.f32 %f41, %f9, %f13;
252
+ sub.f32 %f42, %f10, %f13;
253
+ sub.f32 %f43, %f11, %f13;
254
+ sub.f32 %f44, %f12, %f13;
255
+ .loc 1 43 20
256
+ mul.f32 %f45, %f41, %f14;
257
+ mul.f32 %f46, %f42, %f14;
258
+ mul.f32 %f47, %f43, %f14;
259
+ mul.f32 %f48, %f44, %f14;
260
+ .loc 1 44 19
261
+ mul.f32 %f49, %f20, %f46;
262
+ $L__tmp17:
263
+ .loc 2 243 36
264
+ bar.sync 0;
265
+ $L__tmp18:
266
+ .loc 2 233 15
267
+ fma.rn.f32 %f50, %f19, %f45, %f49;
268
+ fma.rn.f32 %f51, %f21, %f47, %f50;
269
+ fma.rn.f32 %f52, %f22, %f48, %f51;
270
+ $L__tmp19:
271
+ .loc 2 243 36
272
+ mov.b32 %r95, %f52;
273
+ shfl.sync.bfly.b32 %r96, %r95, 16, 31, -1;
274
+ mov.b32 %f53, %r96;
275
+ $L__tmp20:
276
+ .loc 2 233 15
277
+ add.f32 %f54, %f52, %f53;
278
+ $L__tmp21:
279
+ .loc 2 243 36
280
+ mov.b32 %r97, %f54;
281
+ shfl.sync.bfly.b32 %r98, %r97, 8, 31, -1;
282
+ mov.b32 %f55, %r98;
283
+ $L__tmp22:
284
+ .loc 2 233 15
285
+ add.f32 %f56, %f54, %f55;
286
+ $L__tmp23:
287
+ .loc 2 243 36
288
+ mov.b32 %r99, %f56;
289
+ shfl.sync.bfly.b32 %r100, %r99, 4, 31, -1;
290
+ mov.b32 %f57, %r100;
291
+ $L__tmp24:
292
+ .loc 2 233 15
293
+ add.f32 %f58, %f56, %f57;
294
+ $L__tmp25:
295
+ .loc 2 243 36
296
+ mov.b32 %r101, %f58;
297
+ shfl.sync.bfly.b32 %r102, %r101, 2, 31, -1;
298
+ mov.b32 %f59, %r102;
299
+ $L__tmp26:
300
+ .loc 2 233 15
301
+ add.f32 %f60, %f58, %f59;
302
+ $L__tmp27:
303
+ .loc 2 243 36
304
+ mov.b32 %r103, %f60;
305
+ shfl.sync.bfly.b32 %r104, %r103, 1, 31, -1;
306
+ mov.b32 %f61, %r104;
307
+ $L__tmp28:
308
+ .loc 2 233 15
309
+ add.f32 %f62, %f60, %f61;
310
+ $L__tmp29:
311
+ .loc 2 243 36
312
+ mov.b32 %r49, %f62;
313
+ @%p31 st.shared.b32 [ %r42 + 0 ], %r49;
314
+ bar.sync 0;
315
+ @%p32 ld.shared.b32 %r50, [ %r45 + 0 ];
316
+ mov.b32 %f63, %r50;
317
+ shfl.sync.bfly.b32 %r105, %r50, 1, 31, -1;
318
+ mov.b32 %f64, %r105;
319
+ $L__tmp30:
320
+ .loc 2 233 15
321
+ add.f32 %f65, %f63, %f64;
322
+ $L__tmp31:
323
+ .loc 2 243 36
324
+ mov.b32 %r53, %f65;
325
+ @%p33 st.shared.b32 [ %r45 + 0 ], %r53;
326
+ bar.sync 0;
327
+ ld.shared.f32 %f66, [global_smem];
328
+ $L__tmp32:
329
+ .loc 3 8 15
330
+ add.f32 %f67, %f66, 0f00000000;
331
+ $L__tmp33:
332
+ .loc 1 49 21
333
+ setp.eq.s64 %p43, %rd12, -1;
334
+ mov.b32 %r56, 1132462080;
335
+ .loc 1 51 20
336
+ div.full.f32 %r54, %r55, %r56;
337
+ mov.b32 %f68, %r54;
338
+ .loc 1 53 20
339
+ neg.f32 %f69, %f40;
340
+ fma.rn.f32 %f70, %f19, 0f43800000, %f69;
341
+ fma.rn.f32 %f71, %f20, 0f43800000, %f69;
342
+ fma.rn.f32 %f72, %f21, 0f43800000, %f69;
343
+ fma.rn.f32 %f73, %f22, 0f43800000, %f69;
344
+ .loc 1 55 20
345
+ neg.f32 %f74, %f45;
346
+ fma.rn.f32 %f75, %f74, %f67, %f70;
347
+ neg.f32 %f76, %f46;
348
+ fma.rn.f32 %f77, %f76, %f67, %f71;
349
+ neg.f32 %f78, %f47;
350
+ fma.rn.f32 %f79, %f78, %f67, %f72;
351
+ neg.f32 %f80, %f48;
352
+ fma.rn.f32 %f81, %f80, %f67, %f73;
353
+ .loc 1 57 20
354
+ fma.rn.f32 %f82, %f68, %f75, %f15;
355
+ fma.rn.f32 %f83, %f68, %f77, %f16;
356
+ fma.rn.f32 %f84, %f68, %f79, %f17;
357
+ fma.rn.f32 %f85, %f68, %f81, %f18;
358
+ .loc 1 59 35
359
+ selp.f32 %f86, 0f00000000, %f82, %p43;
360
+ selp.f32 %f87, 0f00000000, %f83, %p43;
361
+ selp.f32 %f88, 0f00000000, %f84, %p43;
362
+ selp.f32 %f89, 0f00000000, %f85, %p43;
363
+ .loc 1 61 20
364
+ setp.lt.s64 %p44, %rd12, 0;
365
+ .loc 1 63 56
366
+ shl.b64 %rd39, %rd12, 8;
367
+ add.s64 %rd40, %rd39, 12865792;
368
+ selp.b64 %rd41, %rd40, %rd39, %p44;
369
+ .loc 1 63 52
370
+ or.b64 %rd42, %rd41, %rd34;
371
+ .loc 1 63 30
372
+ shl.b64 %rd43, %rd42, 2;
373
+ add.s64 %rd21, %rd32, %rd43;
374
+ add.s64 %rd22, %rd21, 4;
375
+ add.s64 %rd23, %rd21, 8;
376
+ add.s64 %rd24, %rd21, 12;
377
+ .loc 1 63 83
378
+ mov.b32 %r67, %f86;
379
+ mov.u32 %r66, 0x0;
380
+ @%p1 atom.global.gpu.acq_rel.add.f32 %r66, [ %rd21 + 0 ], %r67;
381
+ mov.b32 %r69, %f87;
382
+ mov.u32 %r68, 0x0;
383
+ @%p1 atom.global.gpu.acq_rel.add.f32 %r68, [ %rd22 + 0 ], %r69;
384
+ mov.b32 %r71, %f88;
385
+ mov.u32 %r70, 0x0;
386
+ @%p1 atom.global.gpu.acq_rel.add.f32 %r70, [ %rd23 + 0 ], %r71;
387
+ mov.b32 %r73, %f89;
388
+ mov.u32 %r72, 0x0;
389
+ @%p1 atom.global.gpu.acq_rel.add.f32 %r72, [ %rd24 + 0 ], %r73;
390
+ .loc 1 63 4
391
+ ret;
392
+ $L__tmp34:
393
+ $L__func_end0:
394
+
395
+ }
396
+ .file 1 "/tmp/torchinductor_root/qr/cqryxm46jcxyr3qdktqirn53eap7h3pjjqiqavyqqyvflabjpvmd.py"
397
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
398
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
399
+ .section .debug_abbrev
400
+ {
401
+ .b8 1
402
+ .b8 17
403
+ .b8 1
404
+ .b8 37
405
+ .b8 8
406
+ .b8 19
407
+ .b8 5
408
+ .b8 3
409
+ .b8 8
410
+ .b8 16
411
+ .b8 6
412
+ .b8 27
413
+ .b8 8
414
+ .b8 180
415
+ .b8 66
416
+ .b8 12
417
+ .b8 17
418
+ .b8 1
419
+ .b8 18
420
+ .b8 1
421
+ .b8 0
422
+ .b8 0
423
+ .b8 2
424
+ .b8 46
425
+ .b8 0
426
+ .b8 135
427
+ .b8 64
428
+ .b8 8
429
+ .b8 3
430
+ .b8 8
431
+ .b8 58
432
+ .b8 11
433
+ .b8 59
434
+ .b8 11
435
+ .b8 63
436
+ .b8 12
437
+ .b8 32
438
+ .b8 11
439
+ .b8 0
440
+ .b8 0
441
+ .b8 3
442
+ .b8 46
443
+ .b8 1
444
+ .b8 17
445
+ .b8 1
446
+ .b8 18
447
+ .b8 1
448
+ .b8 64
449
+ .b8 10
450
+ .b8 49
451
+ .b8 19
452
+ .b8 0
453
+ .b8 0
454
+ .b8 4
455
+ .b8 29
456
+ .b8 1
457
+ .b8 49
458
+ .b8 19
459
+ .b8 17
460
+ .b8 1
461
+ .b8 18
462
+ .b8 1
463
+ .b8 88
464
+ .b8 11
465
+ .b8 89
466
+ .b8 11
467
+ .b8 87
468
+ .b8 11
469
+ .b8 0
470
+ .b8 0
471
+ .b8 5
472
+ .b8 29
473
+ .b8 0
474
+ .b8 49
475
+ .b8 19
476
+ .b8 17
477
+ .b8 1
478
+ .b8 18
479
+ .b8 1
480
+ .b8 88
481
+ .b8 11
482
+ .b8 89
483
+ .b8 11
484
+ .b8 87
485
+ .b8 11
486
+ .b8 0
487
+ .b8 0
488
+ .b8 0
489
+ }
490
+ .section .debug_info
491
+ {
492
+ .b32 407
493
+ .b8 2
494
+ .b8 0
495
+ .b32 .debug_abbrev
496
+ .b8 8
497
+ .b8 1
498
+ .b8 116
499
+ .b8 114
500
+ .b8 105
501
+ .b8 116
502
+ .b8 111
503
+ .b8 110
504
+ .b8 0
505
+ .b8 2
506
+ .b8 0
507
+ .b8 99
508
+ .b8 113
509
+ .b8 114
510
+ .b8 121
511
+ .b8 120
512
+ .b8 109
513
+ .b8 52
514
+ .b8 54
515
+ .b8 106
516
+ .b8 99
517
+ .b8 120
518
+ .b8 121
519
+ .b8 114
520
+ .b8 51
521
+ .b8 113
522
+ .b8 100
523
+ .b8 107
524
+ .b8 116
525
+ .b8 113
526
+ .b8 105
527
+ .b8 114
528
+ .b8 110
529
+ .b8 53
530
+ .b8 51
531
+ .b8 101
532
+ .b8 97
533
+ .b8 112
534
+ .b8 55
535
+ .b8 104
536
+ .b8 51
537
+ .b8 112
538
+ .b8 106
539
+ .b8 106
540
+ .b8 113
541
+ .b8 105
542
+ .b8 113
543
+ .b8 97
544
+ .b8 118
545
+ .b8 121
546
+ .b8 113
547
+ .b8 113
548
+ .b8 121
549
+ .b8 118
550
+ .b8 102
551
+ .b8 108
552
+ .b8 97
553
+ .b8 98
554
+ .b8 106
555
+ .b8 112
556
+ .b8 118
557
+ .b8 109
558
+ .b8 100
559
+ .b8 46
560
+ .b8 112
561
+ .b8 121
562
+ .b8 0
563
+ .b32 .debug_line
564
+ .b8 47
565
+ .b8 116
566
+ .b8 109
567
+ .b8 112
568
+ .b8 47
569
+ .b8 116
570
+ .b8 111
571
+ .b8 114
572
+ .b8 99
573
+ .b8 104
574
+ .b8 105
575
+ .b8 110
576
+ .b8 100
577
+ .b8 117
578
+ .b8 99
579
+ .b8 116
580
+ .b8 111
581
+ .b8 114
582
+ .b8 95
583
+ .b8 114
584
+ .b8 111
585
+ .b8 111
586
+ .b8 116
587
+ .b8 47
588
+ .b8 113
589
+ .b8 114
590
+ .b8 0
591
+ .b8 1
592
+ .b64 $L__func_begin0
593
+ .b64 $L__func_end0
594
+ .b8 2
595
+ .b8 116
596
+ .b8 114
597
+ .b8 105
598
+ .b8 116
599
+ .b8 111
600
+ .b8 110
601
+ .b8 95
602
+ .b8 95
603
+ .b8 48
604
+ .b8 100
605
+ .b8 49
606
+ .b8 100
607
+ .b8 50
608
+ .b8 100
609
+ .b8 51
610
+ .b8 100
611
+ .b8 52
612
+ .b8 100
613
+ .b8 53
614
+ .b8 100
615
+ .b8 54
616
+ .b8 100
617
+ .b8 55
618
+ .b8 100
619
+ .b8 56
620
+ .b8 100
621
+ .b8 101
622
+ .b8 57
623
+ .b8 100
624
+ .b8 101
625
+ .b8 0
626
+ .b8 116
627
+ .b8 114
628
+ .b8 105
629
+ .b8 116
630
+ .b8 111
631
+ .b8 110
632
+ .b8 95
633
+ .b8 95
634
+ .b8 48
635
+ .b8 100
636
+ .b8 49
637
+ .b8 100
638
+ .b8 50
639
+ .b8 100
640
+ .b8 51
641
+ .b8 100
642
+ .b8 52
643
+ .b8 100
644
+ .b8 53
645
+ .b8 100
646
+ .b8 54
647
+ .b8 100
648
+ .b8 55
649
+ .b8 100
650
+ .b8 56
651
+ .b8 100
652
+ .b8 101
653
+ .b8 57
654
+ .b8 100
655
+ .b8 101
656
+ .b8 0
657
+ .b8 1
658
+ .b8 18
659
+ .b8 1
660
+ .b8 1
661
+ .b8 3
662
+ .b64 $L__func_begin0
663
+ .b64 $L__func_end0
664
+ .b8 1
665
+ .b8 156
666
+ .b32 125
667
+ .b8 4
668
+ .b32 125
669
+ .b64 $L__tmp1
670
+ .b64 $L__tmp14
671
+ .b8 2
672
+ .b8 41
673
+ .b8 57
674
+ .b8 5
675
+ .b32 125
676
+ .b64 $L__tmp1
677
+ .b64 $L__tmp14
678
+ .b8 2
679
+ .b8 243
680
+ .b8 36
681
+ .b8 0
682
+ .b8 5
683
+ .b32 125
684
+ .b64 $L__tmp2
685
+ .b64 $L__tmp15
686
+ .b8 2
687
+ .b8 41
688
+ .b8 57
689
+ .b8 5
690
+ .b32 125
691
+ .b64 $L__tmp15
692
+ .b64 $L__tmp16
693
+ .b8 3
694
+ .b8 41
695
+ .b8 44
696
+ .b8 5
697
+ .b32 125
698
+ .b64 $L__tmp17
699
+ .b64 $L__tmp32
700
+ .b8 2
701
+ .b8 47
702
+ .b8 59
703
+ .b8 4
704
+ .b32 125
705
+ .b64 $L__tmp18
706
+ .b64 $L__tmp31
707
+ .b8 2
708
+ .b8 47
709
+ .b8 59
710
+ .b8 5
711
+ .b32 125
712
+ .b64 $L__tmp18
713
+ .b64 $L__tmp31
714
+ .b8 2
715
+ .b8 243
716
+ .b8 36
717
+ .b8 0
718
+ .b8 5
719
+ .b32 125
720
+ .b64 $L__tmp32
721
+ .b64 $L__tmp33
722
+ .b8 3
723
+ .b8 47
724
+ .b8 45
725
+ .b8 0
726
+ .b8 0
727
+ }
728
+ .section .debug_pubnames
729
+ {
730
+ .b32 $L__pubNames_end0-$L__pubNames_start0
731
+ $L__pubNames_start0:
732
+ .b8 2
733
+ .b8 0
734
+ .b32 .debug_info
735
+ .b32 411
736
+ .b32 125
737
+ .b8 116
738
+ .b8 114
739
+ .b8 105
740
+ .b8 116
741
+ .b8 111
742
+ .b8 110
743
+ .b8 95
744
+ .b8 95
745
+ .b8 48
746
+ .b8 100
747
+ .b8 49
748
+ .b8 100
749
+ .b8 50
750
+ .b8 100
751
+ .b8 51
752
+ .b8 100
753
+ .b8 52
754
+ .b8 100
755
+ .b8 53
756
+ .b8 100
757
+ .b8 54
758
+ .b8 100
759
+ .b8 55
760
+ .b8 100
761
+ .b8 56
762
+ .b8 100
763
+ .b8 101
764
+ .b8 57
765
+ .b8 100
766
+ .b8 101
767
+ .b8 0
768
+ .b32 0
769
+ $L__pubNames_end0:
770
+ }
771
+ .section .debug_pubtypes
772
+ {
773
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
774
+ $L__pubTypes_start0:
775
+ .b8 2
776
+ .b8 0
777
+ .b32 .debug_info
778
+ .b32 411
779
+ .b32 0
780
+ $L__pubTypes_end0:
781
+ }
782
+ .section .debug_loc { }
.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.cubin ADDED
Binary file (10.5 kB). View file
 
.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.llir ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 {
5
+ %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %11 = lshr i32 %10, 2, !dbg !8
7
+ %12 = and i32 %11, 63, !dbg !8
8
+ %13 = and i32 %10, 3, !dbg !9
9
+ %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !10
10
+ %15 = sext i32 %14 to i64, !dbg !11
11
+ %16 = shl nsw i64 %15, 6, !dbg !12
12
+ %17 = zext nneg i32 %12 to i64
13
+ %18 = or i64 %16, %17, !dbg !13
14
+ %19 = getelementptr i64, ptr addrspace(1) %1, i64 %18, !dbg !14
15
+ %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #2, !dbg !15
16
+ %21 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #2, !dbg !16
17
+ %22 = bitcast i32 %21 to float, !dbg !16
18
+ %23 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #2, !dbg !17
19
+ %24 = bitcast i32 %23 to float, !dbg !17
20
+ %25 = mul nsw i64 %18, 50257, !dbg !18
21
+ %.not = icmp eq i64 %20, -1, !dbg !19
22
+ %26 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %22, float %24) #2, !dbg !20
23
+ %27 = select i1 %.not, float 0.000000e+00, float %26, !dbg !21
24
+ %28 = getelementptr float, ptr addrspace(1) %0, i64 %25
25
+ br label %29, !dbg !22
26
+
27
+ 29: ; preds = %9, %29
28
+ %30 = phi float [ 0.000000e+00, %9 ], [ %40, %29 ]
29
+ %31 = phi i32 [ 0, %9 ], [ %41, %29 ]
30
+ %32 = or i32 %31, %13, !dbg !23
31
+ %33 = zext nneg i32 %32 to i64, !dbg !23
32
+ %34 = icmp ult i32 %32, 50257, !dbg !24
33
+ %35 = getelementptr float, ptr addrspace(1) %28, i64 %33, !dbg !25
34
+ %36 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %35, i1 %34, i32 0, i1 %34) #2, !dbg !26
35
+ %37 = bitcast i32 %36 to float, !dbg !26
36
+ %38 = fmul float %27, %37, !dbg !27
37
+ %39 = select i1 %34, float %38, float -0.000000e+00, !dbg !28
38
+ %40 = fadd float %30, %39, !dbg !28
39
+ %41 = add nuw nsw i32 %31, 4, !dbg !22
40
+ %42 = icmp ult i32 %31, 50253, !dbg !22
41
+ br i1 %42, label %29, label %43, !dbg !22
42
+
43
+ 43: ; preds = %29
44
+ %44 = bitcast float %40 to i32, !dbg !29
45
+ %45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 2, i32 31), !dbg !29
46
+ %46 = bitcast i32 %45 to float, !dbg !29
47
+ %47 = fadd float %40, %46, !dbg !33
48
+ %48 = bitcast float %47 to i32, !dbg !29
49
+ %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 1, i32 31), !dbg !29
50
+ %50 = bitcast i32 %49 to float, !dbg !29
51
+ %51 = fadd float %47, %50, !dbg !33
52
+ br label %52, !dbg !37
53
+
54
+ 52: ; preds = %43, %52
55
+ %53 = phi i32 [ 0, %43 ], [ %75, %52 ]
56
+ %54 = or i32 %53, %13, !dbg !38
57
+ %55 = zext nneg i32 %54 to i64, !dbg !38
58
+ %56 = icmp ult i32 %54, 50257, !dbg !39
59
+ %57 = add nsw i64 %25, %55, !dbg !40
60
+ %58 = getelementptr i16, ptr addrspace(1) %4, i64 %57, !dbg !41
61
+ %59 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %58, i1 %56, i16 0, i1 %56) #2, !dbg !42
62
+ %60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #2, !dbg !43
63
+ %61 = getelementptr float, ptr addrspace(1) %0, i64 %57, !dbg !44
64
+ %62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %61, i1 %56, i32 0, i1 %56) #2, !dbg !45
65
+ %63 = bitcast i32 %62 to float, !dbg !45
66
+ %64 = getelementptr i16, ptr addrspace(1) %5, i64 %57, !dbg !46
67
+ %65 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %64, i1 %56, i16 0, i1 %56) #2, !dbg !47
68
+ %66 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %65) #2, !dbg !48
69
+ %67 = fmul float %27, %63, !dbg !49
70
+ %68 = fmul float %66, 0x3FF7154760000000, !dbg !50
71
+ %69 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %68) #2, !dbg !50
72
+ %70 = fmul float %51, %69, !dbg !51
73
+ %71 = fsub float %67, %70, !dbg !52
74
+ %72 = fadd float %60, %71, !dbg !53
75
+ %73 = getelementptr i16, ptr addrspace(1) %6, i64 %57, !dbg !54
76
+ %74 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %72) #2, !dbg !55
77
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %74, ptr addrspace(1) %73, i1 %56) #2, !dbg !55
78
+ %75 = add nuw nsw i32 %53, 4, !dbg !37
79
+ %76 = icmp ult i32 %53, 50253, !dbg !37
80
+ br i1 %76, label %52, label %77, !dbg !37
81
+
82
+ 77: ; preds = %52
83
+ ret void, !dbg !56
84
+ }
85
+
86
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
87
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
88
+
89
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
90
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
91
+
92
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
93
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
94
+ attributes #2 = { nounwind }
95
+
96
+ !llvm.module.flags = !{!0}
97
+ !llvm.dbg.cu = !{!1}
98
+ !nvvm.annotations = !{!3, !4, !4, !3}
99
+
100
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
101
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
102
+ !2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")
103
+ !3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
104
+ !4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256}
105
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
106
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
107
+ !7 = !{}
108
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
109
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
110
+ !10 = !DILocation(line: 21, column: 28, scope: !5)
111
+ !11 = !DILocation(line: 21, column: 34, scope: !5)
112
+ !12 = !DILocation(line: 21, column: 46, scope: !5)
113
+ !13 = !DILocation(line: 22, column: 23, scope: !5)
114
+ !14 = !DILocation(line: 26, column: 30, scope: !5)
115
+ !15 = !DILocation(line: 26, column: 35, scope: !5)
116
+ !16 = !DILocation(line: 27, column: 19, scope: !5)
117
+ !17 = !DILocation(line: 29, column: 19, scope: !5)
118
+ !18 = !DILocation(line: 36, column: 46, scope: !5)
119
+ !19 = !DILocation(line: 38, column: 23, scope: !5)
120
+ !20 = !DILocation(line: 39, column: 22, scope: !5)
121
+ !21 = !DILocation(line: 41, column: 37, scope: !5)
122
+ !22 = !DILocation(line: 32, column: 36, scope: !5)
123
+ !23 = !DILocation(line: 33, column: 27, scope: !5)
124
+ !24 = !DILocation(line: 34, column: 25, scope: !5)
125
+ !25 = !DILocation(line: 36, column: 34, scope: !5)
126
+ !26 = !DILocation(line: 36, column: 52, scope: !5)
127
+ !27 = !DILocation(line: 42, column: 23, scope: !5)
128
+ !28 = !DILocation(line: 45, column: 40, scope: !5)
129
+ !29 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !32)
130
+ !30 = distinct !DILexicalBlockFile(scope: !5, file: !31, discriminator: 0)
131
+ !31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
132
+ !32 = !DILocation(line: 46, column: 27, scope: !30)
133
+ !33 = !DILocation(line: 233, column: 15, scope: !34, inlinedAt: !35)
134
+ !34 = distinct !DILexicalBlockFile(scope: !30, file: !31, discriminator: 0)
135
+ !35 = !DILocation(line: 243, column: 36, scope: !34, inlinedAt: !36)
136
+ !36 = !DILocation(line: 46, column: 27, scope: !34)
137
+ !37 = !DILocation(line: 51, column: 36, scope: !5)
138
+ !38 = !DILocation(line: 52, column: 27, scope: !5)
139
+ !39 = !DILocation(line: 53, column: 25, scope: !5)
140
+ !40 = !DILocation(line: 55, column: 41, scope: !5)
141
+ !41 = !DILocation(line: 55, column: 35, scope: !5)
142
+ !42 = !DILocation(line: 55, column: 53, scope: !5)
143
+ !43 = !DILocation(line: 55, column: 105, scope: !5)
144
+ !44 = !DILocation(line: 56, column: 35, scope: !5)
145
+ !45 = !DILocation(line: 56, column: 53, scope: !5)
146
+ !46 = !DILocation(line: 57, column: 35, scope: !5)
147
+ !47 = !DILocation(line: 57, column: 53, scope: !5)
148
+ !48 = !DILocation(line: 57, column: 105, scope: !5)
149
+ !49 = !DILocation(line: 63, column: 24, scope: !5)
150
+ !50 = !DILocation(line: 65, column: 23, scope: !5)
151
+ !51 = !DILocation(line: 66, column: 24, scope: !5)
152
+ !52 = !DILocation(line: 67, column: 24, scope: !5)
153
+ !53 = !DILocation(line: 69, column: 24, scope: !5)
154
+ !54 = !DILocation(line: 70, column: 29, scope: !5)
155
+ !55 = !DILocation(line: 70, column: 54, scope: !5)
156
+ !56 = !DILocation(line: 51, column: 4, scope: !5)
.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ptx ADDED
@@ -0,0 +1,525 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7de8
10
+
11
+ .visible .entry triton__0d1d2d3d4d5d6d7de8(
12
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
13
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
14
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
20
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
21
+ )
22
+ .maxntid 256, 1, 1
23
+ {
24
+ .reg .pred %p<16>;
25
+ .reg .b16 %rs<9>;
26
+ .reg .b32 %r<31>;
27
+ .reg .f32 %f<23>;
28
+ .reg .b64 %rd<51>;
29
+ .loc 1 18 0
30
+ $L__func_begin0:
31
+ .loc 1 18 0
32
+
33
+ ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7de8_param_6];
34
+ ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7de8_param_5];
35
+ ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7de8_param_4];
36
+ ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6d7de8_param_0];
37
+ ld.param.u64 %rd26, [triton__0d1d2d3d4d5d6d7de8_param_1];
38
+ $L__tmp0:
39
+ .loc 1 22 44
40
+ mov.u32 %r13, %tid.x;
41
+ ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6d7de8_param_2];
42
+ bfe.u32 %r14, %r13, 2, 6;
43
+ ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6d7de8_param_3];
44
+ .loc 1 24 33
45
+ and.b32 %r1, %r13, 3;
46
+ .loc 1 21 28
47
+ mov.u32 %r6, %ctaid.x;
48
+ .loc 1 21 34
49
+ cvt.s64.s32 %rd1, %r6;
50
+ .loc 1 21 46
51
+ mul.wide.s32 %rd27, %r6, 64;
52
+ cvt.u64.u32 %rd2, %r14;
53
+ .loc 1 22 23
54
+ or.b64 %rd28, %rd27, %rd2;
55
+ .loc 1 26 30
56
+ shl.b64 %rd29, %rd28, 3;
57
+ add.s64 %rd22, %rd26, %rd29;
58
+ mov.pred %p1, -1;
59
+ .loc 1 26 35
60
+ mov.u64 %rd21, 0x0;
61
+ @%p1 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd22 + 0 ];
62
+ .loc 1 27 19
63
+ mov.u32 %r10, 0x0;
64
+ @%p1 ld.global.b32 { %r10 }, [ %rd23 + 0 ];
65
+ .loc 1 29 19
66
+ mov.u32 %r11, 0x0;
67
+ @%p1 ld.global.b32 { %r11 }, [ %rd24 + 0 ];
68
+ .loc 1 38 23
69
+ setp.eq.s64 %p4, %rd21, -1;
70
+ .loc 1 39 22
71
+ div.full.f32 %r9, %r10, %r11;
72
+ mov.b32 %f6, %r9;
73
+ .loc 1 41 37
74
+ selp.f32 %f1, 0f00000000, %f6, %p4;
75
+ .loc 1 32 36
76
+ mul.wide.s32 %rd30, %r6, 12865792;
77
+ mul.wide.u32 %rd31, %r14, 201028;
78
+ add.s64 %rd32, %rd30, %rd31;
79
+ cvt.u64.u32 %rd33, %r13;
80
+ and.b64 %rd3, %rd33, 3;
81
+ mul.wide.u32 %rd34, %r1, 4;
82
+ add.s64 %rd35, %rd32, %rd34;
83
+ add.s64 %rd50, %rd25, %rd35;
84
+ mov.f32 %f22, 0f00000000;
85
+ mov.b32 %r29, -4;
86
+ mov.u64 %rd46, %rd50;
87
+ $L__BB0_1:
88
+ add.s32 %r29, %r29, 4;
89
+ .loc 1 33 27
90
+ add.s32 %r17, %r29, %r1;
91
+ .loc 1 34 25
92
+ setp.lt.u32 %p5, %r17, 50257;
93
+ mov.b32 %r16, 0;
94
+ .loc 1 36 52
95
+ mov.u32 %r15, 0x0;
96
+ @%p5 ld.global.L1::evict_last.b32 { %r15 }, [ %rd46 + 0 ];
97
+ @!%p5 mov.u32 %r15, %r16;
98
+ mov.b32 %f7, %r15;
99
+ .loc 1 42 23
100
+ mul.f32 %f8, %f1, %f7;
101
+ .loc 1 45 40
102
+ selp.f32 %f9, %f8, 0f80000000, %p5;
103
+ add.f32 %f22, %f22, %f9;
104
+ .loc 1 32 36
105
+ add.s64 %rd46, %rd46, 16;
106
+ setp.lt.u32 %p7, %r29, 50253;
107
+ @%p7 bra $L__BB0_1;
108
+ $L__tmp1:
109
+ .loc 2 243 36
110
+ mov.b32 %r19, %f22;
111
+ shfl.sync.bfly.b32 %r20, %r19, 2, 31, -1;
112
+ mov.b32 %f10, %r20;
113
+ $L__tmp2:
114
+ .loc 2 233 15
115
+ add.f32 %f11, %f22, %f10;
116
+ $L__tmp3:
117
+ .loc 2 243 36
118
+ mov.b32 %r21, %f11;
119
+ shfl.sync.bfly.b32 %r22, %r21, 1, 31, -1;
120
+ mov.b32 %f12, %r22;
121
+ $L__tmp4:
122
+ .loc 2 233 15
123
+ add.f32 %f4, %f11, %f12;
124
+ $L__tmp5:
125
+ .loc 1 51 36
126
+ mul.lo.s64 %rd37, %rd1, 3216448;
127
+ mul.lo.s64 %rd38, %rd2, 50257;
128
+ add.s64 %rd39, %rd37, %rd38;
129
+ add.s64 %rd40, %rd39, %rd3;
130
+ shl.b64 %rd41, %rd40, 1;
131
+ add.s64 %rd49, %rd20, %rd41;
132
+ add.s64 %rd48, %rd19, %rd41;
133
+ add.s64 %rd47, %rd18, %rd41;
134
+ mov.b32 %r30, -4;
135
+ mov.u16 %rs2, 0;
136
+ $L__BB0_3:
137
+ add.s32 %r30, %r30, 4;
138
+ .loc 1 52 27
139
+ add.s32 %r28, %r30, %r1;
140
+ .loc 1 53 25
141
+ setp.lt.u32 %p8, %r28, 50257;
142
+ .loc 1 55 53
143
+ mov.u16 %rs1, 0x0;
144
+ @%p8 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd47 + 0 ];
145
+ @!%p8 mov.u16 %rs1, %rs2;
146
+ .loc 1 55 105
147
+ cvt.f32.bf16 %r23, %rs1;
148
+ mov.b32 %f15, %r23;
149
+ .loc 1 56 53
150
+ mov.u32 %r24, 0x0;
151
+ @%p8 ld.global.L1::evict_first.b32 { %r24 }, [ %rd50 + 0 ];
152
+ @!%p8 mov.u32 %r24, %r16;
153
+ mov.b32 %f16, %r24;
154
+ .loc 1 57 53
155
+ mov.u16 %rs4, 0x0;
156
+ @%p8 ld.global.L1::evict_first.b16 { %rs4 }, [ %rd48 + 0 ];
157
+ @!%p8 mov.u16 %rs4, %rs2;
158
+ .loc 1 57 105
159
+ cvt.f32.bf16 %r26, %rs4;
160
+ mov.b32 %f17, %r26;
161
+ .loc 1 65 23
162
+ mul.f32 %f14, %f17, 0f3FB8AA3B;
163
+ ex2.approx.f32 %f13, %f14;
164
+ .loc 1 66 24
165
+ mul.f32 %f18, %f4, %f13;
166
+ .loc 1 67 24
167
+ neg.f32 %f19, %f18;
168
+ fma.rn.f32 %f20, %f1, %f16, %f19;
169
+ .loc 1 69 24
170
+ add.f32 %f21, %f15, %f20;
171
+ .loc 1 70 54
172
+ mov.b32 %r27, %f21;
173
+ cvt.rn.bf16.f32 %rs7, %r27;
174
+ @%p8 st.global.b16 [ %rd49 + 0 ], { %rs7 };
175
+ .loc 1 51 36
176
+ add.s64 %rd50, %rd50, 16;
177
+ add.s64 %rd49, %rd49, 8;
178
+ add.s64 %rd48, %rd48, 8;
179
+ add.s64 %rd47, %rd47, 8;
180
+ setp.lt.u32 %p15, %r30, 50253;
181
+ @%p15 bra $L__BB0_3;
182
+ .loc 1 51 4
183
+ ret;
184
+ $L__tmp6:
185
+ $L__func_end0:
186
+
187
+ }
188
+ .file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
189
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
190
+ .section .debug_abbrev
191
+ {
192
+ .b8 1
193
+ .b8 17
194
+ .b8 1
195
+ .b8 37
196
+ .b8 8
197
+ .b8 19
198
+ .b8 5
199
+ .b8 3
200
+ .b8 8
201
+ .b8 16
202
+ .b8 6
203
+ .b8 27
204
+ .b8 8
205
+ .b8 180
206
+ .b8 66
207
+ .b8 12
208
+ .b8 17
209
+ .b8 1
210
+ .b8 18
211
+ .b8 1
212
+ .b8 0
213
+ .b8 0
214
+ .b8 2
215
+ .b8 46
216
+ .b8 0
217
+ .b8 135
218
+ .b8 64
219
+ .b8 8
220
+ .b8 3
221
+ .b8 8
222
+ .b8 58
223
+ .b8 11
224
+ .b8 59
225
+ .b8 11
226
+ .b8 63
227
+ .b8 12
228
+ .b8 32
229
+ .b8 11
230
+ .b8 0
231
+ .b8 0
232
+ .b8 3
233
+ .b8 46
234
+ .b8 1
235
+ .b8 17
236
+ .b8 1
237
+ .b8 18
238
+ .b8 1
239
+ .b8 64
240
+ .b8 10
241
+ .b8 49
242
+ .b8 19
243
+ .b8 0
244
+ .b8 0
245
+ .b8 4
246
+ .b8 29
247
+ .b8 0
248
+ .b8 49
249
+ .b8 19
250
+ .b8 17
251
+ .b8 1
252
+ .b8 18
253
+ .b8 1
254
+ .b8 88
255
+ .b8 11
256
+ .b8 89
257
+ .b8 11
258
+ .b8 87
259
+ .b8 11
260
+ .b8 0
261
+ .b8 0
262
+ .b8 5
263
+ .b8 29
264
+ .b8 1
265
+ .b8 49
266
+ .b8 19
267
+ .b8 17
268
+ .b8 1
269
+ .b8 18
270
+ .b8 1
271
+ .b8 88
272
+ .b8 11
273
+ .b8 89
274
+ .b8 11
275
+ .b8 87
276
+ .b8 11
277
+ .b8 0
278
+ .b8 0
279
+ .b8 0
280
+ }
281
+ .section .debug_info
282
+ {
283
+ .b32 278
284
+ .b8 2
285
+ .b8 0
286
+ .b32 .debug_abbrev
287
+ .b8 8
288
+ .b8 1
289
+ .b8 116
290
+ .b8 114
291
+ .b8 105
292
+ .b8 116
293
+ .b8 111
294
+ .b8 110
295
+ .b8 0
296
+ .b8 2
297
+ .b8 0
298
+ .b8 99
299
+ .b8 107
300
+ .b8 122
301
+ .b8 103
302
+ .b8 108
303
+ .b8 55
304
+ .b8 116
305
+ .b8 104
306
+ .b8 98
307
+ .b8 52
308
+ .b8 120
309
+ .b8 100
310
+ .b8 102
311
+ .b8 107
312
+ .b8 102
313
+ .b8 110
314
+ .b8 100
315
+ .b8 50
316
+ .b8 116
317
+ .b8 105
318
+ .b8 100
319
+ .b8 107
320
+ .b8 115
321
+ .b8 54
322
+ .b8 109
323
+ .b8 116
324
+ .b8 53
325
+ .b8 102
326
+ .b8 51
327
+ .b8 104
328
+ .b8 97
329
+ .b8 117
330
+ .b8 119
331
+ .b8 102
332
+ .b8 121
333
+ .b8 106
334
+ .b8 102
335
+ .b8 108
336
+ .b8 98
337
+ .b8 116
338
+ .b8 122
339
+ .b8 121
340
+ .b8 101
341
+ .b8 112
342
+ .b8 111
343
+ .b8 53
344
+ .b8 111
345
+ .b8 120
346
+ .b8 107
347
+ .b8 118
348
+ .b8 104
349
+ .b8 107
350
+ .b8 46
351
+ .b8 112
352
+ .b8 121
353
+ .b8 0
354
+ .b32 .debug_line
355
+ .b8 47
356
+ .b8 116
357
+ .b8 109
358
+ .b8 112
359
+ .b8 47
360
+ .b8 116
361
+ .b8 111
362
+ .b8 114
363
+ .b8 99
364
+ .b8 104
365
+ .b8 105
366
+ .b8 110
367
+ .b8 100
368
+ .b8 117
369
+ .b8 99
370
+ .b8 116
371
+ .b8 111
372
+ .b8 114
373
+ .b8 95
374
+ .b8 114
375
+ .b8 111
376
+ .b8 111
377
+ .b8 116
378
+ .b8 47
379
+ .b8 107
380
+ .b8 122
381
+ .b8 0
382
+ .b8 1
383
+ .b64 $L__func_begin0
384
+ .b64 $L__func_end0
385
+ .b8 2
386
+ .b8 116
387
+ .b8 114
388
+ .b8 105
389
+ .b8 116
390
+ .b8 111
391
+ .b8 110
392
+ .b8 95
393
+ .b8 95
394
+ .b8 48
395
+ .b8 100
396
+ .b8 49
397
+ .b8 100
398
+ .b8 50
399
+ .b8 100
400
+ .b8 51
401
+ .b8 100
402
+ .b8 52
403
+ .b8 100
404
+ .b8 53
405
+ .b8 100
406
+ .b8 54
407
+ .b8 100
408
+ .b8 55
409
+ .b8 100
410
+ .b8 101
411
+ .b8 56
412
+ .b8 0
413
+ .b8 116
414
+ .b8 114
415
+ .b8 105
416
+ .b8 116
417
+ .b8 111
418
+ .b8 110
419
+ .b8 95
420
+ .b8 95
421
+ .b8 48
422
+ .b8 100
423
+ .b8 49
424
+ .b8 100
425
+ .b8 50
426
+ .b8 100
427
+ .b8 51
428
+ .b8 100
429
+ .b8 52
430
+ .b8 100
431
+ .b8 53
432
+ .b8 100
433
+ .b8 54
434
+ .b8 100
435
+ .b8 55
436
+ .b8 100
437
+ .b8 101
438
+ .b8 56
439
+ .b8 0
440
+ .b8 1
441
+ .b8 18
442
+ .b8 1
443
+ .b8 1
444
+ .b8 3
445
+ .b64 $L__func_begin0
446
+ .b64 $L__func_end0
447
+ .b8 1
448
+ .b8 156
449
+ .b32 125
450
+ .b8 4
451
+ .b32 125
452
+ .b64 $L__tmp1
453
+ .b64 $L__tmp4
454
+ .b8 2
455
+ .b8 46
456
+ .b8 27
457
+ .b8 5
458
+ .b32 125
459
+ .b64 $L__tmp2
460
+ .b64 $L__tmp5
461
+ .b8 2
462
+ .b8 46
463
+ .b8 27
464
+ .b8 4
465
+ .b32 125
466
+ .b64 $L__tmp2
467
+ .b64 $L__tmp5
468
+ .b8 2
469
+ .b8 243
470
+ .b8 36
471
+ .b8 0
472
+ .b8 0
473
+ .b8 0
474
+ }
475
+ .section .debug_pubnames
476
+ {
477
+ .b32 $L__pubNames_end0-$L__pubNames_start0
478
+ $L__pubNames_start0:
479
+ .b8 2
480
+ .b8 0
481
+ .b32 .debug_info
482
+ .b32 282
483
+ .b32 125
484
+ .b8 116
485
+ .b8 114
486
+ .b8 105
487
+ .b8 116
488
+ .b8 111
489
+ .b8 110
490
+ .b8 95
491
+ .b8 95
492
+ .b8 48
493
+ .b8 100
494
+ .b8 49
495
+ .b8 100
496
+ .b8 50
497
+ .b8 100
498
+ .b8 51
499
+ .b8 100
500
+ .b8 52
501
+ .b8 100
502
+ .b8 53
503
+ .b8 100
504
+ .b8 54
505
+ .b8 100
506
+ .b8 55
507
+ .b8 100
508
+ .b8 101
509
+ .b8 56
510
+ .b8 0
511
+ .b32 0
512
+ $L__pubNames_end0:
513
+ }
514
+ .section .debug_pubtypes
515
+ {
516
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
517
+ $L__pubTypes_start0:
518
+ .b8 2
519
+ .b8 0
520
+ .b32 .debug_info
521
+ .b32 282
522
+ .b32 0
523
+ $L__pubTypes_end0:
524
+ }
525
+ .section .debug_loc { }
.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ttgir ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x1xf32, #blocked>
5
+ %cst_0 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
6
+ %cst_1 = arith.constant dense<-1> : tensor<64x1xi64, #blocked>
7
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked>
8
+ %c64_i64 = arith.constant 64 : i64
9
+ %cst_3 = arith.constant dense<50257> : tensor<1x4xi64, #blocked>
10
+ %c0_i32 = arith.constant 0 : i32
11
+ %c4_i32 = arith.constant 4 : i32
12
+ %c50257_i32 = arith.constant 50257 : i32
13
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked>
14
+ %0 = tt.get_program_id x : i32
15
+ %1 = arith.extsi %0 : i32 to i64
16
+ %2 = arith.muli %1, %c64_i64 : i64
17
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
18
+ %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
19
+ %5 = arith.extsi %4 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked>
20
+ %6 = tt.splat %2 : (i64) -> tensor<64x1xi64, #blocked>
21
+ %7 = arith.addi %6, %5 : tensor<64x1xi64, #blocked>
22
+ %8 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
23
+ %9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x4xi32, #blocked>
24
+ %10 = arith.extsi %9 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked>
25
+ %11 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
26
+ %12 = tt.addptr %11, %7 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi64, #blocked>
27
+ %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
28
+ %14 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
29
+ %15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
30
+ %16 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
31
+ %17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
32
+ %18 = arith.muli %7, %cst_0 : tensor<64x1xi64, #blocked>
33
+ %19 = tt.broadcast %18 : (tensor<64x1xi64, #blocked>) -> tensor<64x4xi64, #blocked>
34
+ %20 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
35
+ %21 = arith.cmpi ne, %13, %cst_1 : tensor<64x1xi64, #blocked>
36
+ %22 = arith.divf %15, %17 : f32
37
+ %23 = tt.splat %22 : (f32) -> tensor<64x1xf32, #blocked>
38
+ %24 = arith.select %21, %23, %cst : tensor<64x1xi1, #blocked>, tensor<64x1xf32, #blocked>
39
+ %25 = tt.broadcast %24 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
40
+ %26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c4_i32 iter_args(%arg10 = %cst_2) -> (tensor<64x4xf32, #blocked>) : i32 {
41
+ %33 = arith.extsi %arg9 : i32 to i64
42
+ %34 = tt.splat %33 : (i64) -> tensor<1x4xi64, #blocked>
43
+ %35 = arith.addi %34, %10 : tensor<1x4xi64, #blocked>
44
+ %36 = arith.cmpi slt, %35, %cst_3 : tensor<1x4xi64, #blocked>
45
+ %37 = tt.broadcast %35 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
46
+ %38 = arith.addi %37, %19 : tensor<64x4xi64, #blocked>
47
+ %39 = tt.addptr %20, %38 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi64, #blocked>
48
+ %40 = tt.broadcast %36 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
49
+ %41 = tt.load %39, %40, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
50
+ %42 = arith.mulf %41, %25 : tensor<64x4xf32, #blocked>
51
+ %43 = arith.addf %arg10, %42 : tensor<64x4xf32, #blocked>
52
+ %44 = arith.select %40, %43, %arg10 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
53
+ scf.yield %44 : tensor<64x4xf32, #blocked>
54
+ }
55
+ %27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({
56
+ ^bb0(%arg9: f32, %arg10: f32):
57
+ %33 = arith.addf %arg9, %arg10 : f32
58
+ tt.reduce.return %33 : f32
59
+ }) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
60
+ %28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
61
+ %29 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>, #blocked>
62
+ %30 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>, #blocked>
63
+ %31 = tt.broadcast %28 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
64
+ %32 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>, #blocked>
65
+ scf.for %arg9 = %c0_i32 to %c50257_i32 step %c4_i32 : i32 {
66
+ %33 = arith.extsi %arg9 : i32 to i64
67
+ %34 = tt.splat %33 : (i64) -> tensor<1x4xi64, #blocked>
68
+ %35 = arith.addi %34, %10 : tensor<1x4xi64, #blocked>
69
+ %36 = arith.cmpi slt, %35, %cst_3 : tensor<1x4xi64, #blocked>
70
+ %37 = tt.broadcast %35 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
71
+ %38 = arith.addi %37, %19 : tensor<64x4xi64, #blocked>
72
+ %39 = tt.addptr %29, %38 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi64, #blocked>
73
+ %40 = tt.broadcast %36 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
74
+ %41 = tt.load %39, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16, #blocked>
75
+ %42 = arith.extf %41 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked>
76
+ %43 = tt.addptr %20, %38 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi64, #blocked>
77
+ %44 = tt.load %43, %40, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
78
+ %45 = tt.addptr %30, %38 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi64, #blocked>
79
+ %46 = tt.load %45, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16, #blocked>
80
+ %47 = arith.extf %46 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked>
81
+ %48 = arith.mulf %44, %25 : tensor<64x4xf32, #blocked>
82
+ %49 = math.exp %47 : tensor<64x4xf32, #blocked>
83
+ %50 = arith.mulf %49, %31 : tensor<64x4xf32, #blocked>
84
+ %51 = arith.subf %48, %50 : tensor<64x4xf32, #blocked>
85
+ %52 = arith.addf %42, %51 : tensor<64x4xf32, #blocked>
86
+ %53 = tt.addptr %32, %38 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi64, #blocked>
87
+ %54 = arith.truncf %52 : tensor<64x4xf32, #blocked> to tensor<64x4xbf16, #blocked>
88
+ tt.store %53, %54, %40 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16, #blocked>
89
+ }
90
+ tt.return
91
+ }
92
+ }
.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ttir ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x4xbf16>
4
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x1xf32>
5
+ %c50257_i32 = arith.constant 50257 : i32
6
+ %c4_i32 = arith.constant 4 : i32
7
+ %c0_i32 = arith.constant 0 : i32
8
+ %cst_1 = arith.constant dense<50257> : tensor<64x1xi64>
9
+ %cst_2 = arith.constant dense<50257> : tensor<1x4xi64>
10
+ %c64_i64 = arith.constant 64 : i64
11
+ %cst_3 = arith.constant dense<-1> : tensor<64x1xi64>
12
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x4xf32>
13
+ %0 = tt.get_program_id x : i32
14
+ %1 = arith.extsi %0 : i32 to i64
15
+ %2 = arith.muli %1, %c64_i64 : i64
16
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
17
+ %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
18
+ %5 = arith.extsi %4 : tensor<64x1xi32> to tensor<64x1xi64>
19
+ %6 = tt.splat %2 : (i64) -> tensor<64x1xi64>
20
+ %7 = arith.addi %6, %5 : tensor<64x1xi64>
21
+ %8 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
22
+ %9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32>
23
+ %10 = arith.extsi %9 : tensor<1x4xi32> to tensor<1x4xi64>
24
+ %11 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
25
+ %12 = tt.addptr %11, %7 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi64>
26
+ %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
27
+ %14 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
28
+ %15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
29
+ %16 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
30
+ %17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
31
+ %18 = arith.muli %7, %cst_1 : tensor<64x1xi64>
32
+ %19 = tt.broadcast %18 : (tensor<64x1xi64>) -> tensor<64x4xi64>
33
+ %20 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
34
+ %21 = arith.cmpi ne, %13, %cst_3 : tensor<64x1xi64>
35
+ %22 = arith.divf %15, %17 : f32
36
+ %23 = tt.splat %22 : (f32) -> tensor<64x1xf32>
37
+ %24 = arith.select %21, %23, %cst_0 : tensor<64x1xi1>, tensor<64x1xf32>
38
+ %25 = tt.broadcast %24 : (tensor<64x1xf32>) -> tensor<64x4xf32>
39
+ %26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c4_i32 iter_args(%arg10 = %cst_4) -> (tensor<64x4xf32>) : i32 {
40
+ %41 = arith.extsi %arg9 : i32 to i64
41
+ %42 = tt.splat %41 : (i64) -> tensor<1x4xi64>
42
+ %43 = arith.addi %42, %10 : tensor<1x4xi64>
43
+ %44 = arith.cmpi slt, %43, %cst_2 : tensor<1x4xi64>
44
+ %45 = tt.broadcast %43 : (tensor<1x4xi64>) -> tensor<64x4xi64>
45
+ %46 = arith.addi %45, %19 : tensor<64x4xi64>
46
+ %47 = tt.addptr %20, %46 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
47
+ %48 = tt.broadcast %44 : (tensor<1x4xi1>) -> tensor<64x4xi1>
48
+ %49 = tt.load %47, %48, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
49
+ %50 = arith.mulf %49, %25 : tensor<64x4xf32>
50
+ %51 = arith.addf %arg10, %50 : tensor<64x4xf32>
51
+ %52 = arith.select %48, %51, %arg10 : tensor<64x4xi1>, tensor<64x4xf32>
52
+ scf.yield %52 : tensor<64x4xf32>
53
+ }
54
+ %27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({
55
+ ^bb0(%arg9: f32, %arg10: f32):
56
+ %41 = arith.addf %arg9, %arg10 : f32
57
+ tt.reduce.return %41 : f32
58
+ }) : (tensor<64x4xf32>) -> tensor<64xf32>
59
+ %28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
60
+ %29 = arith.muli %7, %cst_1 : tensor<64x1xi64>
61
+ %30 = tt.broadcast %29 : (tensor<64x1xi64>) -> tensor<64x4xi64>
62
+ %31 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
63
+ %32 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
64
+ %33 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
65
+ %34 = arith.cmpi ne, %13, %cst_3 : tensor<64x1xi64>
66
+ %35 = arith.divf %15, %17 : f32
67
+ %36 = tt.splat %35 : (f32) -> tensor<64x1xf32>
68
+ %37 = arith.select %34, %36, %cst_0 : tensor<64x1xi1>, tensor<64x1xf32>
69
+ %38 = tt.broadcast %37 : (tensor<64x1xf32>) -> tensor<64x4xf32>
70
+ %39 = tt.broadcast %28 : (tensor<64x1xf32>) -> tensor<64x4xf32>
71
+ %40 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
72
+ scf.for %arg9 = %c0_i32 to %c50257_i32 step %c4_i32 : i32 {
73
+ %41 = arith.extsi %arg9 : i32 to i64
74
+ %42 = tt.splat %41 : (i64) -> tensor<1x4xi64>
75
+ %43 = arith.addi %42, %10 : tensor<1x4xi64>
76
+ %44 = arith.cmpi slt, %43, %cst_2 : tensor<1x4xi64>
77
+ %45 = tt.broadcast %43 : (tensor<1x4xi64>) -> tensor<64x4xi64>
78
+ %46 = arith.addi %45, %30 : tensor<64x4xi64>
79
+ %47 = tt.addptr %31, %46 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi64>
80
+ %48 = tt.broadcast %44 : (tensor<1x4xi1>) -> tensor<64x4xi1>
81
+ %49 = tt.load %47, %48, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16>
82
+ %50 = arith.extf %49 : tensor<64x4xbf16> to tensor<64x4xf32>
83
+ %51 = tt.addptr %32, %46 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
84
+ %52 = tt.load %51, %48, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32>
85
+ %53 = tt.addptr %33, %46 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi64>
86
+ %54 = tt.load %53, %48, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16>
87
+ %55 = arith.extf %54 : tensor<64x4xbf16> to tensor<64x4xf32>
88
+ %56 = arith.mulf %52, %38 : tensor<64x4xf32>
89
+ %57 = math.exp %55 : tensor<64x4xf32>
90
+ %58 = arith.mulf %57, %39 : tensor<64x4xf32>
91
+ %59 = arith.subf %56, %58 : tensor<64x4xf32>
92
+ %60 = arith.addf %50, %59 : tensor<64x4xf32>
93
+ %61 = tt.addptr %40, %46 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi64>
94
+ %62 = arith.truncf %60 : tensor<64x4xf32> to tensor<64x4xbf16>
95
+ tt.store %61, %62, %48 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16>
96
+ }
97
+ tt.return
98
+ }
99
+ }
.triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ttir ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<8x512xbf16>
4
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<8x1xf32>
5
+ %c50257_i32 = arith.constant 50257 : i32
6
+ %c512_i32 = arith.constant 512 : i32
7
+ %c0_i32 = arith.constant 0 : i32
8
+ %cst_1 = arith.constant dense<50257> : tensor<8x1xi64>
9
+ %cst_2 = arith.constant dense<50257> : tensor<1x512xi64>
10
+ %c8_i64 = arith.constant 8 : i64
11
+ %cst_3 = arith.constant dense<-1> : tensor<8x1xi64>
12
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<8x512xf32>
13
+ %0 = tt.get_program_id x : i32
14
+ %1 = arith.extsi %0 : i32 to i64
15
+ %2 = arith.muli %1, %c8_i64 : i64
16
+ %3 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
17
+ %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<8xi32>) -> tensor<8x1xi32>
18
+ %5 = arith.extsi %4 : tensor<8x1xi32> to tensor<8x1xi64>
19
+ %6 = tt.splat %2 : (i64) -> tensor<8x1xi64>
20
+ %7 = arith.addi %6, %5 : tensor<8x1xi64>
21
+ %8 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
22
+ %9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<512xi32>) -> tensor<1x512xi32>
23
+ %10 = arith.extsi %9 : tensor<1x512xi32> to tensor<1x512xi64>
24
+ %11 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<8x1x!tt.ptr<i64, 1>>
25
+ %12 = tt.addptr %11, %7 : tensor<8x1x!tt.ptr<i64, 1>>, tensor<8x1xi64>
26
+ %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<8x1xi64>
27
+ %14 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
28
+ %15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
29
+ %16 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
30
+ %17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
31
+ %18 = arith.muli %7, %cst_1 : tensor<8x1xi64>
32
+ %19 = tt.broadcast %18 : (tensor<8x1xi64>) -> tensor<8x512xi64>
33
+ %20 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<8x512x!tt.ptr<f32, 1>>
34
+ %21 = arith.cmpi ne, %13, %cst_3 : tensor<8x1xi64>
35
+ %22 = arith.divf %15, %17 : f32
36
+ %23 = tt.splat %22 : (f32) -> tensor<8x1xf32>
37
+ %24 = arith.select %21, %23, %cst_0 : tensor<8x1xi1>, tensor<8x1xf32>
38
+ %25 = tt.broadcast %24 : (tensor<8x1xf32>) -> tensor<8x512xf32>
39
+ %26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c512_i32 iter_args(%arg10 = %cst_4) -> (tensor<8x512xf32>) : i32 {
40
+ %41 = arith.extsi %arg9 : i32 to i64
41
+ %42 = tt.splat %41 : (i64) -> tensor<1x512xi64>
42
+ %43 = arith.addi %42, %10 : tensor<1x512xi64>
43
+ %44 = arith.cmpi slt, %43, %cst_2 : tensor<1x512xi64>
44
+ %45 = tt.broadcast %43 : (tensor<1x512xi64>) -> tensor<8x512xi64>
45
+ %46 = arith.addi %45, %19 : tensor<8x512xi64>
46
+ %47 = tt.addptr %20, %46 : tensor<8x512x!tt.ptr<f32, 1>>, tensor<8x512xi64>
47
+ %48 = tt.broadcast %44 : (tensor<1x512xi1>) -> tensor<8x512xi1>
48
+ %49 = tt.load %47, %48, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<8x512xf32>
49
+ %50 = arith.mulf %49, %25 : tensor<8x512xf32>
50
+ %51 = arith.addf %arg10, %50 : tensor<8x512xf32>
51
+ %52 = arith.select %48, %51, %arg10 : tensor<8x512xi1>, tensor<8x512xf32>
52
+ scf.yield %52 : tensor<8x512xf32>
53
+ }
54
+ %27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({
55
+ ^bb0(%arg9: f32, %arg10: f32):
56
+ %41 = arith.addf %arg9, %arg10 : f32
57
+ tt.reduce.return %41 : f32
58
+ }) : (tensor<8x512xf32>) -> tensor<8xf32>
59
+ %28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<8xf32>) -> tensor<8x1xf32>
60
+ %29 = arith.muli %7, %cst_1 : tensor<8x1xi64>
61
+ %30 = tt.broadcast %29 : (tensor<8x1xi64>) -> tensor<8x512xi64>
62
+ %31 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<8x512x!tt.ptr<bf16, 1>>
63
+ %32 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<8x512x!tt.ptr<f32, 1>>
64
+ %33 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<8x512x!tt.ptr<bf16, 1>>
65
+ %34 = arith.cmpi ne, %13, %cst_3 : tensor<8x1xi64>
66
+ %35 = arith.divf %15, %17 : f32
67
+ %36 = tt.splat %35 : (f32) -> tensor<8x1xf32>
68
+ %37 = arith.select %34, %36, %cst_0 : tensor<8x1xi1>, tensor<8x1xf32>
69
+ %38 = tt.broadcast %37 : (tensor<8x1xf32>) -> tensor<8x512xf32>
70
+ %39 = tt.broadcast %28 : (tensor<8x1xf32>) -> tensor<8x512xf32>
71
+ %40 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<8x512x!tt.ptr<bf16, 1>>
72
+ scf.for %arg9 = %c0_i32 to %c50257_i32 step %c512_i32 : i32 {
73
+ %41 = arith.extsi %arg9 : i32 to i64
74
+ %42 = tt.splat %41 : (i64) -> tensor<1x512xi64>
75
+ %43 = arith.addi %42, %10 : tensor<1x512xi64>
76
+ %44 = arith.cmpi slt, %43, %cst_2 : tensor<1x512xi64>
77
+ %45 = tt.broadcast %43 : (tensor<1x512xi64>) -> tensor<8x512xi64>
78
+ %46 = arith.addi %45, %30 : tensor<8x512xi64>
79
+ %47 = tt.addptr %31, %46 : tensor<8x512x!tt.ptr<bf16, 1>>, tensor<8x512xi64>
80
+ %48 = tt.broadcast %44 : (tensor<1x512xi1>) -> tensor<8x512xi1>
81
+ %49 = tt.load %47, %48, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xbf16>
82
+ %50 = arith.extf %49 : tensor<8x512xbf16> to tensor<8x512xf32>
83
+ %51 = tt.addptr %32, %46 : tensor<8x512x!tt.ptr<f32, 1>>, tensor<8x512xi64>
84
+ %52 = tt.load %51, %48, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xf32>
85
+ %53 = tt.addptr %33, %46 : tensor<8x512x!tt.ptr<bf16, 1>>, tensor<8x512xi64>
86
+ %54 = tt.load %53, %48, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xbf16>
87
+ %55 = arith.extf %54 : tensor<8x512xbf16> to tensor<8x512xf32>
88
+ %56 = arith.mulf %52, %38 : tensor<8x512xf32>
89
+ %57 = math.exp %55 : tensor<8x512xf32>
90
+ %58 = arith.mulf %57, %39 : tensor<8x512xf32>
91
+ %59 = arith.subf %56, %58 : tensor<8x512xf32>
92
+ %60 = arith.addf %50, %59 : tensor<8x512xf32>
93
+ %61 = tt.addptr %40, %46 : tensor<8x512x!tt.ptr<bf16, 1>>, tensor<8x512xi64>
94
+ %62 = arith.truncf %60 : tensor<8x512xf32> to tensor<8x512xbf16>
95
+ tt.store %61, %62, %48 {cache = 1 : i32, evict = 1 : i32} : tensor<8x512xbf16>
96
+ }
97
+ tt.return
98
+ }
99
+ }
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.cubin ADDED
Binary file (4.52 kB). View file
 
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.llir ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 {
5
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %4 = shl i32 %3, 1, !dbg !8
7
+ %5 = and i32 %4, 510, !dbg !8
8
+ %6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
9
+ %7 = shl i32 %6, 9, !dbg !10
10
+ %8 = or i32 %7, %5, !dbg !11
11
+ %9 = icmp slt i32 %8, 12865792, !dbg !12
12
+ %10 = sext i32 %8 to i64, !dbg !13
13
+ %11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !13
14
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 0, i32 0, ptr addrspace(1) %11, i1 %9) #1, !dbg !14
15
+ ret void, !dbg !15
16
+ }
17
+
18
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
19
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
20
+
21
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
22
+ attributes #1 = { nounwind }
23
+
24
+ !llvm.module.flags = !{!0}
25
+ !llvm.dbg.cu = !{!1}
26
+ !nvvm.annotations = !{!3, !4, !4, !3}
27
+
28
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
29
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
30
+ !2 = !DIFile(filename: "c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py", directory: "/tmp/torchinductor_root/4y")
31
+ !3 = !{ptr @triton__0d1de, !"kernel", i32 1}
32
+ !4 = !{ptr @triton__0d1de, !"maxntidx", i32 256}
33
+ !5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
34
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
35
+ !7 = !{}
36
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
37
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
38
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
39
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
40
+ !12 = !DILocation(line: 22, column: 21, scope: !5)
41
+ !13 = !DILocation(line: 25, column: 25, scope: !5)
42
+ !14 = !DILocation(line: 25, column: 36, scope: !5)
43
+ !15 = !DILocation(line: 25, column: 4, scope: !5)
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ptx ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1de
10
+
11
+ .visible .entry triton__0d1de(
12
+ .param .u64 triton__0d1de_param_0,
13
+ .param .u32 triton__0d1de_param_1
14
+ )
15
+ .maxntid 256, 1, 1
16
+ {
17
+ .reg .pred %p<2>;
18
+ .reg .b32 %r<9>;
19
+ .reg .b64 %rd<4>;
20
+ .loc 1 18 0
21
+ $L__func_begin0:
22
+ .loc 1 18 0
23
+
24
+ ld.param.u64 %rd2, [triton__0d1de_param_0];
25
+ $L__tmp0:
26
+ .loc 1 21 36
27
+ mov.u32 %r4, %tid.x;
28
+ shl.b32 %r5, %r4, 1;
29
+ and.b32 %r6, %r5, 510;
30
+ .loc 1 20 28
31
+ mov.u32 %r1, %ctaid.x;
32
+ .loc 1 20 33
33
+ shl.b32 %r7, %r1, 9;
34
+ .loc 1 21 23
35
+ or.b32 %r8, %r7, %r6;
36
+ .loc 1 22 21
37
+ setp.lt.s32 %p1, %r8, 12865792;
38
+ .loc 1 25 25
39
+ mul.wide.s32 %rd3, %r8, 4;
40
+ add.s64 %rd1, %rd2, %rd3;
41
+ mov.b32 %r2, 0;
42
+ .loc 1 25 36
43
+ @%p1 st.global.v2.b32 [ %rd1 + 0 ], { %r2, %r2 };
44
+ .loc 1 25 4
45
+ ret;
46
+ $L__tmp1:
47
+ $L__func_end0:
48
+
49
+ }
50
+ .file 1 "/tmp/torchinductor_root/4y/c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py"
51
+ .section .debug_abbrev
52
+ {
53
+ .b8 1
54
+ .b8 17
55
+ .b8 1
56
+ .b8 37
57
+ .b8 8
58
+ .b8 19
59
+ .b8 5
60
+ .b8 3
61
+ .b8 8
62
+ .b8 16
63
+ .b8 6
64
+ .b8 27
65
+ .b8 8
66
+ .b8 180
67
+ .b8 66
68
+ .b8 12
69
+ .b8 17
70
+ .b8 1
71
+ .b8 18
72
+ .b8 1
73
+ .b8 0
74
+ .b8 0
75
+ .b8 2
76
+ .b8 46
77
+ .b8 0
78
+ .b8 17
79
+ .b8 1
80
+ .b8 18
81
+ .b8 1
82
+ .b8 64
83
+ .b8 10
84
+ .b8 135
85
+ .b8 64
86
+ .b8 8
87
+ .b8 3
88
+ .b8 8
89
+ .b8 58
90
+ .b8 11
91
+ .b8 59
92
+ .b8 11
93
+ .b8 63
94
+ .b8 12
95
+ .b8 0
96
+ .b8 0
97
+ .b8 0
98
+ }
99
+ .section .debug_info
100
+ {
101
+ .b32 172
102
+ .b8 2
103
+ .b8 0
104
+ .b32 .debug_abbrev
105
+ .b8 8
106
+ .b8 1
107
+ .b8 116
108
+ .b8 114
109
+ .b8 105
110
+ .b8 116
111
+ .b8 111
112
+ .b8 110
113
+ .b8 0
114
+ .b8 2
115
+ .b8 0
116
+ .b8 99
117
+ .b8 52
118
+ .b8 121
119
+ .b8 115
120
+ .b8 101
121
+ .b8 108
122
+ .b8 100
123
+ .b8 119
124
+ .b8 109
125
+ .b8 117
126
+ .b8 51
127
+ .b8 116
128
+ .b8 111
129
+ .b8 53
130
+ .b8 50
131
+ .b8 112
132
+ .b8 98
133
+ .b8 104
134
+ .b8 50
135
+ .b8 109
136
+ .b8 100
137
+ .b8 50
138
+ .b8 111
139
+ .b8 101
140
+ .b8 117
141
+ .b8 102
142
+ .b8 114
143
+ .b8 113
144
+ .b8 51
145
+ .b8 102
146
+ .b8 99
147
+ .b8 100
148
+ .b8 109
149
+ .b8 97
150
+ .b8 112
151
+ .b8 107
152
+ .b8 116
153
+ .b8 52
154
+ .b8 110
155
+ .b8 120
156
+ .b8 100
157
+ .b8 122
158
+ .b8 109
159
+ .b8 121
160
+ .b8 113
161
+ .b8 116
162
+ .b8 103
163
+ .b8 100
164
+ .b8 50
165
+ .b8 121
166
+ .b8 115
167
+ .b8 112
168
+ .b8 46
169
+ .b8 112
170
+ .b8 121
171
+ .b8 0
172
+ .b32 .debug_line
173
+ .b8 47
174
+ .b8 116
175
+ .b8 109
176
+ .b8 112
177
+ .b8 47
178
+ .b8 116
179
+ .b8 111
180
+ .b8 114
181
+ .b8 99
182
+ .b8 104
183
+ .b8 105
184
+ .b8 110
185
+ .b8 100
186
+ .b8 117
187
+ .b8 99
188
+ .b8 116
189
+ .b8 111
190
+ .b8 114
191
+ .b8 95
192
+ .b8 114
193
+ .b8 111
194
+ .b8 111
195
+ .b8 116
196
+ .b8 47
197
+ .b8 52
198
+ .b8 121
199
+ .b8 0
200
+ .b8 1
201
+ .b64 $L__func_begin0
202
+ .b64 $L__func_end0
203
+ .b8 2
204
+ .b64 $L__func_begin0
205
+ .b64 $L__func_end0
206
+ .b8 1
207
+ .b8 156
208
+ .b8 116
209
+ .b8 114
210
+ .b8 105
211
+ .b8 116
212
+ .b8 111
213
+ .b8 110
214
+ .b8 95
215
+ .b8 95
216
+ .b8 48
217
+ .b8 100
218
+ .b8 49
219
+ .b8 100
220
+ .b8 101
221
+ .b8 0
222
+ .b8 116
223
+ .b8 114
224
+ .b8 105
225
+ .b8 116
226
+ .b8 111
227
+ .b8 110
228
+ .b8 95
229
+ .b8 95
230
+ .b8 48
231
+ .b8 100
232
+ .b8 49
233
+ .b8 100
234
+ .b8 101
235
+ .b8 0
236
+ .b8 1
237
+ .b8 18
238
+ .b8 1
239
+ .b8 0
240
+ }
241
+ .section .debug_pubnames
242
+ {
243
+ .b32 $L__pubNames_end0-$L__pubNames_start0
244
+ $L__pubNames_start0:
245
+ .b8 2
246
+ .b8 0
247
+ .b32 .debug_info
248
+ .b32 176
249
+ .b32 125
250
+ .b8 116
251
+ .b8 114
252
+ .b8 105
253
+ .b8 116
254
+ .b8 111
255
+ .b8 110
256
+ .b8 95
257
+ .b8 95
258
+ .b8 48
259
+ .b8 100
260
+ .b8 49
261
+ .b8 100
262
+ .b8 101
263
+ .b8 0
264
+ .b32 0
265
+ $L__pubNames_end0:
266
+ }
267
+ .section .debug_pubtypes
268
+ {
269
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
270
+ $L__pubTypes_start0:
271
+ .b8 2
272
+ .b8 0
273
+ .b32 .debug_info
274
+ .b32 176
275
+ .b32 0
276
+ $L__pubTypes_end0:
277
+ }
278
+ .section .debug_loc { }
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttgir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<12865792> : tensor<512xi32, #blocked>
5
+ %c512_i32 = arith.constant 512 : i32
6
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked>
7
+ %0 = tt.get_program_id x : i32
8
+ %1 = arith.muli %0, %c512_i32 : i32
9
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
10
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
11
+ %4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
12
+ %5 = arith.cmpi slt, %4, %cst : tensor<512xi32, #blocked>
13
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
14
+ %7 = tt.addptr %6, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
15
+ tt.store %7, %cst_0, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttir ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<512xf32>
4
+ %cst_0 = arith.constant dense<12865792> : tensor<512xi32>
5
+ %c512_i32 = arith.constant 512 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c512_i32 : i32
8
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
9
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32>
10
+ %4 = arith.addi %3, %2 : tensor<512xi32>
11
+ %5 = arith.cmpi slt, %4, %cst_0 : tensor<512xi32>
12
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
13
+ %7 = tt.addptr %6, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
14
+ tt.store %7, %cst, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32>
15
+ tt.return
16
+ }
17
+ }
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.cubin ADDED
Binary file (15 kB). View file
 
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.llir ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3d4d5d6d7de8de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8) local_unnamed_addr !dbg !5 {
7
+ %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %11 = and i32 %10, 31, !dbg !8
9
+ %12 = lshr i32 %10, 5, !dbg !8
10
+ %13 = and i32 %12, 1, !dbg !8
11
+ %urem = shl i32 %10, 2, !dbg !8
12
+ %14 = and i32 %urem, 252, !dbg !8
13
+ %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
14
+ %16 = shl i32 %15, 8, !dbg !10
15
+ %17 = or i32 %16, %14, !dbg !11
16
+ %18 = sext i32 %17 to i64, !dbg !12
17
+ %19 = getelementptr i16, ptr addrspace(1) %1, i64 %18, !dbg !12
18
+ %20 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %19, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13
19
+ %21 = extractvalue { i32, i32 } %20, 0, !dbg !13
20
+ %22 = extractvalue { i32, i32 } %20, 1, !dbg !13
21
+ %23 = trunc i32 %21 to i16, !dbg !13
22
+ %extelt.offset = lshr i32 %21, 16, !dbg !13
23
+ %24 = trunc i32 %extelt.offset to i16, !dbg !13
24
+ %25 = trunc i32 %22 to i16, !dbg !13
25
+ %extelt.offset1 = lshr i32 %22, 16, !dbg !13
26
+ %26 = trunc i32 %extelt.offset1 to i16, !dbg !13
27
+ %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #3, !dbg !14
28
+ %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #3, !dbg !14
29
+ %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #3, !dbg !14
30
+ %30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %26) #3, !dbg !14
31
+ %31 = zext nneg i32 %14 to i64, !dbg !15
32
+ %32 = getelementptr float, ptr addrspace(1) %2, i64 %31, !dbg !15
33
+ %33 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %32, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16
34
+ %34 = extractvalue { i32, i32, i32, i32 } %33, 0, !dbg !16
35
+ %35 = extractvalue { i32, i32, i32, i32 } %33, 1, !dbg !16
36
+ %36 = extractvalue { i32, i32, i32, i32 } %33, 2, !dbg !16
37
+ %37 = extractvalue { i32, i32, i32, i32 } %33, 3, !dbg !16
38
+ %38 = bitcast i32 %34 to float, !dbg !16
39
+ %39 = bitcast i32 %35 to float, !dbg !16
40
+ %40 = bitcast i32 %36 to float, !dbg !16
41
+ %41 = bitcast i32 %37 to float, !dbg !16
42
+ %42 = getelementptr float, ptr addrspace(1) %3, i64 %18, !dbg !17
43
+ %43 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %42, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18
44
+ %44 = extractvalue { i32, i32, i32, i32 } %43, 0, !dbg !18
45
+ %45 = extractvalue { i32, i32, i32, i32 } %43, 1, !dbg !18
46
+ %46 = extractvalue { i32, i32, i32, i32 } %43, 2, !dbg !18
47
+ %47 = extractvalue { i32, i32, i32, i32 } %43, 3, !dbg !18
48
+ %48 = bitcast i32 %44 to float, !dbg !18
49
+ %49 = bitcast i32 %45 to float, !dbg !18
50
+ %50 = bitcast i32 %46 to float, !dbg !18
51
+ %51 = bitcast i32 %47 to float, !dbg !18
52
+ %52 = sext i32 %15 to i64, !dbg !19
53
+ %53 = getelementptr float, ptr addrspace(1) %4, i64 %52, !dbg !19
54
+ %54 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20
55
+ %55 = bitcast i32 %54 to float, !dbg !20
56
+ %56 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20
57
+ %57 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20
58
+ %58 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20
59
+ %59 = getelementptr float, ptr addrspace(1) %5, i64 %52, !dbg !21
60
+ %60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22
61
+ %61 = bitcast i32 %60 to float, !dbg !22
62
+ %62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22
63
+ %63 = bitcast i32 %62 to float, !dbg !22
64
+ %64 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22
65
+ %65 = bitcast i32 %64 to float, !dbg !22
66
+ %66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22
67
+ %67 = bitcast i32 %66 to float, !dbg !22
68
+ %68 = getelementptr float, ptr addrspace(1) %0, i64 %18, !dbg !23
69
+ %69 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %68, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !24
70
+ %70 = extractvalue { i32, i32, i32, i32 } %69, 0, !dbg !24
71
+ %71 = extractvalue { i32, i32, i32, i32 } %69, 1, !dbg !24
72
+ %72 = extractvalue { i32, i32, i32, i32 } %69, 2, !dbg !24
73
+ %73 = extractvalue { i32, i32, i32, i32 } %69, 3, !dbg !24
74
+ %74 = bitcast i32 %70 to float, !dbg !24
75
+ %75 = bitcast i32 %71 to float, !dbg !24
76
+ %76 = bitcast i32 %72 to float, !dbg !24
77
+ %77 = bitcast i32 %73 to float, !dbg !24
78
+ %78 = fmul float %27, %38, !dbg !25
79
+ %79 = fmul float %28, %39, !dbg !25
80
+ %80 = fmul float %29, %40, !dbg !25
81
+ %81 = fmul float %30, %41, !dbg !25
82
+ %82 = fadd float %78, %79, !dbg !26
83
+ %83 = fadd float %80, %82, !dbg !26
84
+ %84 = fadd float %81, %83, !dbg !26
85
+ %85 = bitcast float %84 to i32, !dbg !32
86
+ %86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 16, i32 31), !dbg !32
87
+ %87 = bitcast i32 %86 to float, !dbg !32
88
+ %88 = fadd float %84, %87, !dbg !26
89
+ %89 = bitcast float %88 to i32, !dbg !32
90
+ %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 8, i32 31), !dbg !32
91
+ %91 = bitcast i32 %90 to float, !dbg !32
92
+ %92 = fadd float %88, %91, !dbg !26
93
+ %93 = bitcast float %92 to i32, !dbg !32
94
+ %94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 4, i32 31), !dbg !32
95
+ %95 = bitcast i32 %94 to float, !dbg !32
96
+ %96 = fadd float %92, %95, !dbg !26
97
+ %97 = bitcast float %96 to i32, !dbg !32
98
+ %98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 2, i32 31), !dbg !32
99
+ %99 = bitcast i32 %98 to float, !dbg !32
100
+ %100 = fadd float %96, %99, !dbg !26
101
+ %101 = bitcast float %100 to i32, !dbg !32
102
+ %102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 1, i32 31), !dbg !32
103
+ %103 = bitcast i32 %102 to float, !dbg !32
104
+ %104 = fadd float %100, %103, !dbg !26
105
+ %105 = icmp eq i32 %11, 0, !dbg !32
106
+ %106 = zext nneg i32 %13 to i64, !dbg !32
107
+ %107 = getelementptr float, ptr addrspace(3) @global_smem, i64 %106, !dbg !32
108
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, float %104, i1 %105) #3, !dbg !32
109
+ tail call void @llvm.nvvm.barrier0(), !dbg !32
110
+ %108 = icmp slt i32 %10, 2, !dbg !32
111
+ %109 = sext i32 %10 to i64, !dbg !32
112
+ %110 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !32
113
+ %111 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %110, i1 %108) #3, !dbg !32
114
+ %112 = bitcast float %111 to i32, !dbg !32
115
+ %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !32
116
+ %114 = bitcast i32 %113 to float, !dbg !32
117
+ %115 = fadd float %111, %114, !dbg !26
118
+ %116 = and i32 %10, 1, !dbg !32
119
+ %117 = icmp eq i32 %116, 0, !dbg !32
120
+ %118 = and i1 %108, %117, !dbg !32
121
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %115, i1 %118) #3, !dbg !32
122
+ tail call void @llvm.nvvm.barrier0(), !dbg !32
123
+ %119 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
124
+ %120 = fadd float %119, 0.000000e+00, !dbg !34
125
+ %121 = fsub float %48, %55, !dbg !38
126
+ %122 = fsub float %49, %55, !dbg !38
127
+ %123 = fsub float %50, %55, !dbg !38
128
+ %124 = fsub float %51, %55, !dbg !38
129
+ %125 = fmul float %121, %61, !dbg !39
130
+ %126 = fmul float %122, %61, !dbg !39
131
+ %127 = fmul float %123, %61, !dbg !39
132
+ %128 = fmul float %124, %61, !dbg !39
133
+ %129 = fmul float %78, %125, !dbg !40
134
+ %130 = fmul float %79, %126, !dbg !40
135
+ %131 = fmul float %80, %127, !dbg !40
136
+ %132 = fmul float %81, %128, !dbg !40
137
+ tail call void @llvm.nvvm.barrier0(), !dbg !41
138
+ %133 = fadd float %129, %130, !dbg !43
139
+ %134 = fadd float %131, %133, !dbg !43
140
+ %135 = fadd float %132, %134, !dbg !43
141
+ %136 = bitcast float %135 to i32, !dbg !41
142
+ %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 16, i32 31), !dbg !41
143
+ %138 = bitcast i32 %137 to float, !dbg !41
144
+ %139 = fadd float %135, %138, !dbg !43
145
+ %140 = bitcast float %139 to i32, !dbg !41
146
+ %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 8, i32 31), !dbg !41
147
+ %142 = bitcast i32 %141 to float, !dbg !41
148
+ %143 = fadd float %139, %142, !dbg !43
149
+ %144 = bitcast float %143 to i32, !dbg !41
150
+ %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 4, i32 31), !dbg !41
151
+ %146 = bitcast i32 %145 to float, !dbg !41
152
+ %147 = fadd float %143, %146, !dbg !43
153
+ %148 = bitcast float %147 to i32, !dbg !41
154
+ %149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 2, i32 31), !dbg !41
155
+ %150 = bitcast i32 %149 to float, !dbg !41
156
+ %151 = fadd float %147, %150, !dbg !43
157
+ %152 = bitcast float %151 to i32, !dbg !41
158
+ %153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 1, i32 31), !dbg !41
159
+ %154 = bitcast i32 %153 to float, !dbg !41
160
+ %155 = fadd float %151, %154, !dbg !43
161
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, float %155, i1 %105) #3, !dbg !41
162
+ tail call void @llvm.nvvm.barrier0(), !dbg !41
163
+ %156 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %110, i1 %108) #3, !dbg !41
164
+ %157 = bitcast float %156 to i32, !dbg !41
165
+ %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !41
166
+ %159 = bitcast i32 %158 to float, !dbg !41
167
+ %160 = fadd float %156, %159, !dbg !43
168
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %160, i1 %118) #3, !dbg !41
169
+ tail call void @llvm.nvvm.barrier0(), !dbg !41
170
+ %161 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !41
171
+ %162 = fadd float %161, 0.000000e+00, !dbg !46
172
+ %163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %61, float 2.560000e+02) #3, !dbg !48
173
+ %164 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %63, float 2.560000e+02) #3, !dbg !48
174
+ %165 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %65, float 2.560000e+02) #3, !dbg !48
175
+ %166 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %67, float 2.560000e+02) #3, !dbg !48
176
+ %167 = fmul float %78, 2.560000e+02, !dbg !49
177
+ %168 = fmul float %79, 2.560000e+02, !dbg !49
178
+ %169 = fmul float %80, 2.560000e+02, !dbg !49
179
+ %170 = fmul float %81, 2.560000e+02, !dbg !49
180
+ %171 = fsub float %167, %120, !dbg !50
181
+ %172 = fsub float %168, %120, !dbg !50
182
+ %173 = fsub float %169, %120, !dbg !50
183
+ %174 = fsub float %170, %120, !dbg !50
184
+ %175 = fmul float %125, %162, !dbg !51
185
+ %176 = fmul float %126, %162, !dbg !51
186
+ %177 = fmul float %127, %162, !dbg !51
187
+ %178 = fmul float %128, %162, !dbg !51
188
+ %179 = fsub float %171, %175, !dbg !52
189
+ %180 = fsub float %172, %176, !dbg !52
190
+ %181 = fsub float %173, %177, !dbg !52
191
+ %182 = fsub float %174, %178, !dbg !52
192
+ %183 = fmul float %163, %179, !dbg !53
193
+ %184 = fmul float %163, %180, !dbg !53
194
+ %185 = fmul float %163, %181, !dbg !53
195
+ %186 = fmul float %163, %182, !dbg !53
196
+ %187 = fadd float %183, %74, !dbg !54
197
+ %188 = fadd float %184, %75, !dbg !54
198
+ %189 = fadd float %185, %76, !dbg !54
199
+ %190 = fadd float %186, %77, !dbg !54
200
+ %191 = bitcast float %187 to i32, !dbg !55
201
+ %192 = bitcast float %188 to i32, !dbg !55
202
+ %193 = bitcast float %189 to i32, !dbg !55
203
+ %194 = bitcast float %190 to i32, !dbg !55
204
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %191, i32 %192, i32 %193, i32 %194, ptr addrspace(1) %68, i1 true) #3, !dbg !55
205
+ %195 = getelementptr i16, ptr addrspace(1) %6, i64 %18, !dbg !56
206
+ %196 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %187) #3, !dbg !57
207
+ %197 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %188) #3, !dbg !57
208
+ %198 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %189) #3, !dbg !57
209
+ %199 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %190) #3, !dbg !57
210
+ %200 = insertelement <2 x i16> undef, i16 %196, i64 0, !dbg !57
211
+ %201 = insertelement <2 x i16> %200, i16 %197, i64 1, !dbg !57
212
+ %202 = bitcast <2 x i16> %201 to i32, !dbg !57
213
+ %203 = insertelement <2 x i16> undef, i16 %198, i64 0, !dbg !57
214
+ %204 = insertelement <2 x i16> %203, i16 %199, i64 1, !dbg !57
215
+ %205 = bitcast <2 x i16> %204 to i32, !dbg !57
216
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %202, i32 %205, ptr addrspace(1) %195, i1 true) #3, !dbg !57
217
+ ret void, !dbg !58
218
+ }
219
+
220
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
221
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
222
+
223
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
224
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
225
+
226
+ ; Function Attrs: convergent nocallback nounwind
227
+ declare void @llvm.nvvm.barrier0() #2
228
+
229
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
230
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
231
+ attributes #2 = { convergent nocallback nounwind }
232
+ attributes #3 = { nounwind }
233
+
234
+ !llvm.module.flags = !{!0}
235
+ !llvm.dbg.cu = !{!1}
236
+ !nvvm.annotations = !{!3, !4, !4, !3}
237
+
238
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
239
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
240
+ !2 = !DIFile(filename: "csned4hyxpgwu5ttubs3r7uxkjq5yfl3zh6c2sozobtkek2uzfcv.py", directory: "/tmp/torchinductor_root/sn")
241
+ !3 = !{ptr @triton__0d1d2d3d4d5d6d7de8de, !"kernel", i32 1}
242
+ !4 = !{ptr @triton__0d1d2d3d4d5d6d7de8de, !"maxntidx", i32 64}
243
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8de", linkageName: "triton__0d1d2d3d4d5d6d7de8de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
244
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
245
+ !7 = !{}
246
+ !8 = !DILocation(line: 26, column: 26, scope: !5)
247
+ !9 = !DILocation(line: 23, column: 28, scope: !5)
248
+ !10 = !DILocation(line: 30, column: 40, scope: !5)
249
+ !11 = !DILocation(line: 30, column: 36, scope: !5)
250
+ !12 = !DILocation(line: 30, column: 30, scope: !5)
251
+ !13 = !DILocation(line: 30, column: 46, scope: !5)
252
+ !14 = !DILocation(line: 30, column: 67, scope: !5)
253
+ !15 = !DILocation(line: 31, column: 30, scope: !5)
254
+ !16 = !DILocation(line: 31, column: 35, scope: !5)
255
+ !17 = !DILocation(line: 32, column: 30, scope: !5)
256
+ !18 = !DILocation(line: 32, column: 46, scope: !5)
257
+ !19 = !DILocation(line: 33, column: 30, scope: !5)
258
+ !20 = !DILocation(line: 33, column: 35, scope: !5)
259
+ !21 = !DILocation(line: 34, column: 31, scope: !5)
260
+ !22 = !DILocation(line: 34, column: 36, scope: !5)
261
+ !23 = !DILocation(line: 35, column: 35, scope: !5)
262
+ !24 = !DILocation(line: 35, column: 51, scope: !5)
263
+ !25 = !DILocation(line: 37, column: 18, scope: !5)
264
+ !26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
265
+ !27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
266
+ !28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
267
+ !29 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0)
268
+ !30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
269
+ !31 = !DILocation(line: 40, column: 57, scope: !27)
270
+ !32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
271
+ !33 = !DILocation(line: 40, column: 57, scope: !29)
272
+ !34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37)
273
+ !35 = distinct !DILexicalBlockFile(scope: !5, file: !36, discriminator: 0)
274
+ !36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
275
+ !37 = !DILocation(line: 40, column: 44, scope: !35)
276
+ !38 = !DILocation(line: 41, column: 19, scope: !5)
277
+ !39 = !DILocation(line: 42, column: 20, scope: !5)
278
+ !40 = !DILocation(line: 43, column: 19, scope: !5)
279
+ !41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42)
280
+ !42 = !DILocation(line: 46, column: 59, scope: !29)
281
+ !43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44)
282
+ !44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45)
283
+ !45 = !DILocation(line: 46, column: 59, scope: !27)
284
+ !46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47)
285
+ !47 = !DILocation(line: 46, column: 45, scope: !35)
286
+ !48 = !DILocation(line: 48, column: 20, scope: !5)
287
+ !49 = !DILocation(line: 49, column: 19, scope: !5)
288
+ !50 = !DILocation(line: 50, column: 20, scope: !5)
289
+ !51 = !DILocation(line: 51, column: 20, scope: !5)
290
+ !52 = !DILocation(line: 52, column: 20, scope: !5)
291
+ !53 = !DILocation(line: 53, column: 20, scope: !5)
292
+ !54 = !DILocation(line: 54, column: 20, scope: !5)
293
+ !55 = !DILocation(line: 56, column: 51, scope: !5)
294
+ !56 = !DILocation(line: 57, column: 25, scope: !5)
295
+ !57 = !DILocation(line: 57, column: 48, scope: !5)
296
+ !58 = !DILocation(line: 57, column: 4, scope: !5)
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ptx ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7de8de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3d4d5d6d7de8de(
13
+ .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_0,
14
+ .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_1,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_2,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_3,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_4,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_5,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7de8de_param_6,
20
+ .param .u32 triton__0d1d2d3d4d5d6d7de8de_param_7,
21
+ .param .u32 triton__0d1d2d3d4d5d6d7de8de_param_8
22
+ )
23
+ .maxntid 64, 1, 1
24
+ {
25
+ .reg .pred %p<37>;
26
+ .reg .b16 %rs<9>;
27
+ .reg .b32 %r<110>;
28
+ .reg .f32 %f<86>;
29
+ .reg .b64 %rd<26>;
30
+ .loc 1 18 0
31
+ $L__func_begin0:
32
+ .loc 1 18 0
33
+
34
+ ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7de8de_param_0];
35
+ ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7de8de_param_1];
36
+ $L__tmp0:
37
+ .loc 1 26 26
38
+ mov.u32 %r76, %tid.x;
39
+ and.b32 %r77, %r76, 31;
40
+ ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7de8de_param_2];
41
+ ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7de8de_param_3];
42
+ ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7de8de_param_4];
43
+ shl.b32 %r78, %r76, 2;
44
+ ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7de8de_param_5];
45
+ and.b32 %r79, %r78, 252;
46
+ ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7de8de_param_6];
47
+ .loc 1 23 28
48
+ mov.u32 %r1, %ctaid.x;
49
+ .loc 1 30 40
50
+ shl.b32 %r80, %r1, 8;
51
+ .loc 1 30 36
52
+ or.b32 %r81, %r80, %r79;
53
+ .loc 1 30 30
54
+ mul.wide.s32 %rd22, %r81, 2;
55
+ add.s64 %rd1, %rd16, %rd22;
56
+ mov.b32 %r4, 0;
57
+ mov.pred %p1, -1;
58
+ .loc 1 30 46
59
+ mov.u32 %r2, 0x0;
60
+ mov.u32 %r3, 0x0;
61
+ @%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ];
62
+ @!%p1 mov.u32 %r2, %r4;
63
+ @!%p1 mov.u32 %r3, %r4;
64
+ cvt.u16.u32 %rs1, %r2;
65
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
66
+ cvt.u16.u32 %rs3, %r3;
67
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
68
+ .loc 1 30 67
69
+ cvt.f32.bf16 %r6, %rs1;
70
+ mov.b32 %f1, %r6;
71
+ cvt.f32.bf16 %r7, %rs2;
72
+ mov.b32 %f2, %r7;
73
+ cvt.f32.bf16 %r8, %rs3;
74
+ mov.b32 %f3, %r8;
75
+ cvt.f32.bf16 %r9, %rs4;
76
+ mov.b32 %f4, %r9;
77
+ .loc 1 31 30
78
+ mul.wide.u32 %rd23, %r79, 4;
79
+ add.s64 %rd2, %rd17, %rd23;
80
+ .loc 1 31 35
81
+ mov.u32 %r10, 0x0;
82
+ mov.u32 %r11, 0x0;
83
+ mov.u32 %r12, 0x0;
84
+ mov.u32 %r13, 0x0;
85
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
86
+ @!%p1 mov.u32 %r10, %r4;
87
+ @!%p1 mov.u32 %r11, %r4;
88
+ @!%p1 mov.u32 %r12, %r4;
89
+ @!%p1 mov.u32 %r13, %r4;
90
+ mov.b32 %f5, %r10;
91
+ mov.b32 %f6, %r11;
92
+ mov.b32 %f7, %r12;
93
+ mov.b32 %f8, %r13;
94
+ .loc 1 32 30
95
+ mul.wide.s32 %rd24, %r81, 4;
96
+ add.s64 %rd3, %rd18, %rd24;
97
+ .loc 1 32 46
98
+ mov.u32 %r18, 0x0;
99
+ mov.u32 %r19, 0x0;
100
+ mov.u32 %r20, 0x0;
101
+ mov.u32 %r21, 0x0;
102
+ @%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
103
+ @!%p1 mov.u32 %r18, %r4;
104
+ @!%p1 mov.u32 %r19, %r4;
105
+ @!%p1 mov.u32 %r20, %r4;
106
+ @!%p1 mov.u32 %r21, %r4;
107
+ mov.b32 %f9, %r18;
108
+ mov.b32 %f10, %r19;
109
+ mov.b32 %f11, %r20;
110
+ mov.b32 %f12, %r21;
111
+ .loc 1 33 30
112
+ mul.wide.s32 %rd25, %r1, 4;
113
+ add.s64 %rd4, %rd19, %rd25;
114
+ .loc 1 33 35
115
+ mov.u32 %r26, 0x0;
116
+ @%p1 ld.global.L1::evict_last.b32 { %r26 }, [ %rd4 + 0 ];
117
+ mov.b32 %f13, %r26;
118
+ mov.u32 %r27, 0x0;
119
+ @%p1 ld.global.L1::evict_last.b32 { %r27 }, [ %rd4 + 0 ];
120
+ mov.u32 %r28, 0x0;
121
+ @%p1 ld.global.L1::evict_last.b32 { %r28 }, [ %rd4 + 0 ];
122
+ mov.u32 %r29, 0x0;
123
+ @%p1 ld.global.L1::evict_last.b32 { %r29 }, [ %rd4 + 0 ];
124
+ .loc 1 34 31
125
+ add.s64 %rd8, %rd20, %rd25;
126
+ .loc 1 34 36
127
+ mov.u32 %r55, 0x0;
128
+ @%p1 ld.global.L1::evict_last.b32 { %r55 }, [ %rd8 + 0 ];
129
+ mov.b32 %f14, %r55;
130
+ mov.u32 %r31, 0x0;
131
+ @%p1 ld.global.L1::evict_last.b32 { %r31 }, [ %rd8 + 0 ];
132
+ mov.u32 %r32, 0x0;
133
+ @%p1 ld.global.L1::evict_last.b32 { %r32 }, [ %rd8 + 0 ];
134
+ mov.u32 %r33, 0x0;
135
+ @%p1 ld.global.L1::evict_last.b32 { %r33 }, [ %rd8 + 0 ];
136
+ .loc 1 35 35
137
+ add.s64 %rd12, %rd15, %rd24;
138
+ .loc 1 35 51
139
+ mov.u32 %r34, 0x0;
140
+ mov.u32 %r35, 0x0;
141
+ mov.u32 %r36, 0x0;
142
+ mov.u32 %r37, 0x0;
143
+ @%p1 ld.global.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd12 + 0 ];
144
+ @!%p1 mov.u32 %r34, %r4;
145
+ @!%p1 mov.u32 %r35, %r4;
146
+ @!%p1 mov.u32 %r36, %r4;
147
+ @!%p1 mov.u32 %r37, %r4;
148
+ mov.b32 %f15, %r34;
149
+ mov.b32 %f16, %r35;
150
+ mov.b32 %f17, %r36;
151
+ mov.b32 %f18, %r37;
152
+ .loc 1 37 18
153
+ mul.f32 %f19, %f1, %f5;
154
+ mul.f32 %f20, %f2, %f6;
155
+ mul.f32 %f21, %f3, %f7;
156
+ mul.f32 %f22, %f4, %f8;
157
+ $L__tmp1:
158
+ .loc 2 233 15
159
+ fma.rn.f32 %f23, %f1, %f5, %f20;
160
+ fma.rn.f32 %f24, %f3, %f7, %f23;
161
+ fma.rn.f32 %f25, %f4, %f8, %f24;
162
+ $L__tmp2:
163
+ .loc 2 243 36
164
+ mov.b32 %r82, %f25;
165
+ shfl.sync.bfly.b32 %r83, %r82, 16, 31, -1;
166
+ mov.b32 %f26, %r83;
167
+ $L__tmp3:
168
+ .loc 2 233 15
169
+ add.f32 %f27, %f25, %f26;
170
+ $L__tmp4:
171
+ .loc 2 243 36
172
+ mov.b32 %r84, %f27;
173
+ shfl.sync.bfly.b32 %r85, %r84, 8, 31, -1;
174
+ mov.b32 %f28, %r85;
175
+ $L__tmp5:
176
+ .loc 2 233 15
177
+ add.f32 %f29, %f27, %f28;
178
+ $L__tmp6:
179
+ .loc 2 243 36
180
+ mov.b32 %r86, %f29;
181
+ shfl.sync.bfly.b32 %r87, %r86, 4, 31, -1;
182
+ mov.b32 %f30, %r87;
183
+ $L__tmp7:
184
+ .loc 2 233 15
185
+ add.f32 %f31, %f29, %f30;
186
+ $L__tmp8:
187
+ .loc 2 243 36
188
+ mov.b32 %r88, %f31;
189
+ shfl.sync.bfly.b32 %r89, %r88, 2, 31, -1;
190
+ mov.b32 %f32, %r89;
191
+ $L__tmp9:
192
+ .loc 2 233 15
193
+ add.f32 %f33, %f31, %f32;
194
+ $L__tmp10:
195
+ .loc 2 243 36
196
+ mov.b32 %r90, %f33;
197
+ shfl.sync.bfly.b32 %r91, %r90, 1, 31, -1;
198
+ mov.b32 %f34, %r91;
199
+ $L__tmp11:
200
+ .loc 2 233 15
201
+ add.f32 %f35, %f33, %f34;
202
+ $L__tmp12:
203
+ .loc 2 243 36
204
+ setp.eq.s32 %p27, %r77, 0;
205
+ shr.u32 %r92, %r76, 3;
206
+ and.b32 %r93, %r92, 4;
207
+ mov.u32 %r94, global_smem;
208
+ add.s32 %r42, %r94, %r93;
209
+ mov.b32 %r43, %f35;
210
+ @%p27 st.shared.b32 [ %r42 + 0 ], %r43;
211
+ bar.sync 0;
212
+ setp.lt.s32 %p28, %r76, 2;
213
+ add.s32 %r45, %r94, %r78;
214
+ @%p28 ld.shared.b32 %r44, [ %r45 + 0 ];
215
+ mov.b32 %f36, %r44;
216
+ shfl.sync.bfly.b32 %r95, %r44, 1, 31, -1;
217
+ mov.b32 %f37, %r95;
218
+ $L__tmp13:
219
+ .loc 2 233 15
220
+ add.f32 %f38, %f36, %f37;
221
+ $L__tmp14:
222
+ .loc 2 243 36
223
+ and.b32 %r96, %r76, 1;
224
+ setp.eq.b32 %p35, %r96, 1;
225
+ not.pred %p36, %p35;
226
+ and.pred %p29, %p28, %p36;
227
+ mov.b32 %r47, %f38;
228
+ @%p29 st.shared.b32 [ %r45 + 0 ], %r47;
229
+ bar.sync 0;
230
+ ld.shared.f32 %f39, [global_smem];
231
+ $L__tmp15:
232
+ .loc 3 8 15
233
+ add.f32 %f40, %f39, 0f00000000;
234
+ $L__tmp16:
235
+ .loc 1 41 19
236
+ sub.f32 %f41, %f9, %f13;
237
+ sub.f32 %f42, %f10, %f13;
238
+ sub.f32 %f43, %f11, %f13;
239
+ sub.f32 %f44, %f12, %f13;
240
+ .loc 1 42 20
241
+ mul.f32 %f45, %f41, %f14;
242
+ mul.f32 %f46, %f42, %f14;
243
+ mul.f32 %f47, %f43, %f14;
244
+ mul.f32 %f48, %f44, %f14;
245
+ .loc 1 43 19
246
+ mul.f32 %f49, %f20, %f46;
247
+ $L__tmp17:
248
+ .loc 2 243 36
249
+ bar.sync 0;
250
+ $L__tmp18:
251
+ .loc 2 233 15
252
+ fma.rn.f32 %f50, %f19, %f45, %f49;
253
+ fma.rn.f32 %f51, %f21, %f47, %f50;
254
+ fma.rn.f32 %f52, %f22, %f48, %f51;
255
+ $L__tmp19:
256
+ .loc 2 243 36
257
+ mov.b32 %r97, %f52;
258
+ shfl.sync.bfly.b32 %r98, %r97, 16, 31, -1;
259
+ mov.b32 %f53, %r98;
260
+ $L__tmp20:
261
+ .loc 2 233 15
262
+ add.f32 %f54, %f52, %f53;
263
+ $L__tmp21:
264
+ .loc 2 243 36
265
+ mov.b32 %r99, %f54;
266
+ shfl.sync.bfly.b32 %r100, %r99, 8, 31, -1;
267
+ mov.b32 %f55, %r100;
268
+ $L__tmp22:
269
+ .loc 2 233 15
270
+ add.f32 %f56, %f54, %f55;
271
+ $L__tmp23:
272
+ .loc 2 243 36
273
+ mov.b32 %r101, %f56;
274
+ shfl.sync.bfly.b32 %r102, %r101, 4, 31, -1;
275
+ mov.b32 %f57, %r102;
276
+ $L__tmp24:
277
+ .loc 2 233 15
278
+ add.f32 %f58, %f56, %f57;
279
+ $L__tmp25:
280
+ .loc 2 243 36
281
+ mov.b32 %r103, %f58;
282
+ shfl.sync.bfly.b32 %r104, %r103, 2, 31, -1;
283
+ mov.b32 %f59, %r104;
284
+ $L__tmp26:
285
+ .loc 2 233 15
286
+ add.f32 %f60, %f58, %f59;
287
+ $L__tmp27:
288
+ .loc 2 243 36
289
+ mov.b32 %r105, %f60;
290
+ shfl.sync.bfly.b32 %r106, %r105, 1, 31, -1;
291
+ mov.b32 %f61, %r106;
292
+ $L__tmp28:
293
+ .loc 2 233 15
294
+ add.f32 %f62, %f60, %f61;
295
+ $L__tmp29:
296
+ .loc 2 243 36
297
+ mov.b32 %r49, %f62;
298
+ @%p27 st.shared.b32 [ %r42 + 0 ], %r49;
299
+ bar.sync 0;
300
+ @%p28 ld.shared.b32 %r50, [ %r45 + 0 ];
301
+ mov.b32 %f63, %r50;
302
+ shfl.sync.bfly.b32 %r107, %r50, 1, 31, -1;
303
+ mov.b32 %f64, %r107;
304
+ $L__tmp30:
305
+ .loc 2 233 15
306
+ add.f32 %f65, %f63, %f64;
307
+ $L__tmp31:
308
+ .loc 2 243 36
309
+ mov.b32 %r53, %f65;
310
+ @%p29 st.shared.b32 [ %r45 + 0 ], %r53;
311
+ bar.sync 0;
312
+ ld.shared.f32 %f66, [global_smem];
313
+ $L__tmp32:
314
+ .loc 3 8 15
315
+ add.f32 %f67, %f66, 0f00000000;
316
+ mov.b32 %r56, 1132462080;
317
+ $L__tmp33:
318
+ .loc 1 48 20
319
+ div.full.f32 %r54, %r55, %r56;
320
+ mov.b32 %f68, %r54;
321
+ .loc 1 50 20
322
+ neg.f32 %f69, %f40;
323
+ fma.rn.f32 %f70, %f19, 0f43800000, %f69;
324
+ fma.rn.f32 %f71, %f20, 0f43800000, %f69;
325
+ fma.rn.f32 %f72, %f21, 0f43800000, %f69;
326
+ fma.rn.f32 %f73, %f22, 0f43800000, %f69;
327
+ .loc 1 52 20
328
+ neg.f32 %f74, %f45;
329
+ fma.rn.f32 %f75, %f74, %f67, %f70;
330
+ neg.f32 %f76, %f46;
331
+ fma.rn.f32 %f77, %f76, %f67, %f71;
332
+ neg.f32 %f78, %f47;
333
+ fma.rn.f32 %f79, %f78, %f67, %f72;
334
+ neg.f32 %f80, %f48;
335
+ fma.rn.f32 %f81, %f80, %f67, %f73;
336
+ .loc 1 54 20
337
+ fma.rn.f32 %f82, %f68, %f75, %f15;
338
+ fma.rn.f32 %f83, %f68, %f77, %f16;
339
+ fma.rn.f32 %f84, %f68, %f79, %f17;
340
+ fma.rn.f32 %f85, %f68, %f81, %f18;
341
+ .loc 1 56 51
342
+ mov.b32 %r66, %f82;
343
+ mov.b32 %r67, %f83;
344
+ mov.b32 %r68, %f84;
345
+ mov.b32 %r69, %f85;
346
+ @%p1 st.global.v4.b32 [ %rd12 + 0 ], { %r66, %r67, %r68, %r69 };
347
+ .loc 1 57 25
348
+ add.s64 %rd14, %rd21, %rd22;
349
+ .loc 1 57 48
350
+ cvt.rn.bf16.f32 %rs5, %r66;
351
+ cvt.rn.bf16.f32 %rs6, %r67;
352
+ cvt.rn.bf16.f32 %rs7, %r68;
353
+ cvt.rn.bf16.f32 %rs8, %r69;
354
+ mov.b32 %r108, {%rs5, %rs6};
355
+ mov.b32 %r109, {%rs7, %rs8};
356
+ @%p1 st.global.v2.b32 [ %rd14 + 0 ], { %r108, %r109 };
357
+ .loc 1 57 4
358
+ ret;
359
+ $L__tmp34:
360
+ $L__func_end0:
361
+
362
+ }
363
+ .file 1 "/tmp/torchinductor_root/sn/csned4hyxpgwu5ttubs3r7uxkjq5yfl3zh6c2sozobtkek2uzfcv.py"
364
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
365
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
366
+ .section .debug_abbrev
367
+ {
368
+ .b8 1
369
+ .b8 17
370
+ .b8 1
371
+ .b8 37
372
+ .b8 8
373
+ .b8 19
374
+ .b8 5
375
+ .b8 3
376
+ .b8 8
377
+ .b8 16
378
+ .b8 6
379
+ .b8 27
380
+ .b8 8
381
+ .b8 180
382
+ .b8 66
383
+ .b8 12
384
+ .b8 17
385
+ .b8 1
386
+ .b8 18
387
+ .b8 1
388
+ .b8 0
389
+ .b8 0
390
+ .b8 2
391
+ .b8 46
392
+ .b8 0
393
+ .b8 135
394
+ .b8 64
395
+ .b8 8
396
+ .b8 3
397
+ .b8 8
398
+ .b8 58
399
+ .b8 11
400
+ .b8 59
401
+ .b8 11
402
+ .b8 63
403
+ .b8 12
404
+ .b8 32
405
+ .b8 11
406
+ .b8 0
407
+ .b8 0
408
+ .b8 3
409
+ .b8 46
410
+ .b8 1
411
+ .b8 17
412
+ .b8 1
413
+ .b8 18
414
+ .b8 1
415
+ .b8 64
416
+ .b8 10
417
+ .b8 49
418
+ .b8 19
419
+ .b8 0
420
+ .b8 0
421
+ .b8 4
422
+ .b8 29
423
+ .b8 1
424
+ .b8 49
425
+ .b8 19
426
+ .b8 17
427
+ .b8 1
428
+ .b8 18
429
+ .b8 1
430
+ .b8 88
431
+ .b8 11
432
+ .b8 89
433
+ .b8 11
434
+ .b8 87
435
+ .b8 11
436
+ .b8 0
437
+ .b8 0
438
+ .b8 5
439
+ .b8 29
440
+ .b8 0
441
+ .b8 49
442
+ .b8 19
443
+ .b8 17
444
+ .b8 1
445
+ .b8 18
446
+ .b8 1
447
+ .b8 88
448
+ .b8 11
449
+ .b8 89
450
+ .b8 11
451
+ .b8 87
452
+ .b8 11
453
+ .b8 0
454
+ .b8 0
455
+ .b8 0
456
+ }
457
+ .section .debug_info
458
+ {
459
+ .b32 403
460
+ .b8 2
461
+ .b8 0
462
+ .b32 .debug_abbrev
463
+ .b8 8
464
+ .b8 1
465
+ .b8 116
466
+ .b8 114
467
+ .b8 105
468
+ .b8 116
469
+ .b8 111
470
+ .b8 110
471
+ .b8 0
472
+ .b8 2
473
+ .b8 0
474
+ .b8 99
475
+ .b8 115
476
+ .b8 110
477
+ .b8 101
478
+ .b8 100
479
+ .b8 52
480
+ .b8 104
481
+ .b8 121
482
+ .b8 120
483
+ .b8 112
484
+ .b8 103
485
+ .b8 119
486
+ .b8 117
487
+ .b8 53
488
+ .b8 116
489
+ .b8 116
490
+ .b8 117
491
+ .b8 98
492
+ .b8 115
493
+ .b8 51
494
+ .b8 114
495
+ .b8 55
496
+ .b8 117
497
+ .b8 120
498
+ .b8 107
499
+ .b8 106
500
+ .b8 113
501
+ .b8 53
502
+ .b8 121
503
+ .b8 102
504
+ .b8 108
505
+ .b8 51
506
+ .b8 122
507
+ .b8 104
508
+ .b8 54
509
+ .b8 99
510
+ .b8 50
511
+ .b8 115
512
+ .b8 111
513
+ .b8 122
514
+ .b8 111
515
+ .b8 98
516
+ .b8 116
517
+ .b8 107
518
+ .b8 101
519
+ .b8 107
520
+ .b8 50
521
+ .b8 117
522
+ .b8 122
523
+ .b8 102
524
+ .b8 99
525
+ .b8 118
526
+ .b8 46
527
+ .b8 112
528
+ .b8 121
529
+ .b8 0
530
+ .b32 .debug_line
531
+ .b8 47
532
+ .b8 116
533
+ .b8 109
534
+ .b8 112
535
+ .b8 47
536
+ .b8 116
537
+ .b8 111
538
+ .b8 114
539
+ .b8 99
540
+ .b8 104
541
+ .b8 105
542
+ .b8 110
543
+ .b8 100
544
+ .b8 117
545
+ .b8 99
546
+ .b8 116
547
+ .b8 111
548
+ .b8 114
549
+ .b8 95
550
+ .b8 114
551
+ .b8 111
552
+ .b8 111
553
+ .b8 116
554
+ .b8 47
555
+ .b8 115
556
+ .b8 110
557
+ .b8 0
558
+ .b8 1
559
+ .b64 $L__func_begin0
560
+ .b64 $L__func_end0
561
+ .b8 2
562
+ .b8 116
563
+ .b8 114
564
+ .b8 105
565
+ .b8 116
566
+ .b8 111
567
+ .b8 110
568
+ .b8 95
569
+ .b8 95
570
+ .b8 48
571
+ .b8 100
572
+ .b8 49
573
+ .b8 100
574
+ .b8 50
575
+ .b8 100
576
+ .b8 51
577
+ .b8 100
578
+ .b8 52
579
+ .b8 100
580
+ .b8 53
581
+ .b8 100
582
+ .b8 54
583
+ .b8 100
584
+ .b8 55
585
+ .b8 100
586
+ .b8 101
587
+ .b8 56
588
+ .b8 100
589
+ .b8 101
590
+ .b8 0
591
+ .b8 116
592
+ .b8 114
593
+ .b8 105
594
+ .b8 116
595
+ .b8 111
596
+ .b8 110
597
+ .b8 95
598
+ .b8 95
599
+ .b8 48
600
+ .b8 100
601
+ .b8 49
602
+ .b8 100
603
+ .b8 50
604
+ .b8 100
605
+ .b8 51
606
+ .b8 100
607
+ .b8 52
608
+ .b8 100
609
+ .b8 53
610
+ .b8 100
611
+ .b8 54
612
+ .b8 100
613
+ .b8 55
614
+ .b8 100
615
+ .b8 101
616
+ .b8 56
617
+ .b8 100
618
+ .b8 101
619
+ .b8 0
620
+ .b8 1
621
+ .b8 18
622
+ .b8 1
623
+ .b8 1
624
+ .b8 3
625
+ .b64 $L__func_begin0
626
+ .b64 $L__func_end0
627
+ .b8 1
628
+ .b8 156
629
+ .b32 125
630
+ .b8 4
631
+ .b32 125
632
+ .b64 $L__tmp1
633
+ .b64 $L__tmp14
634
+ .b8 2
635
+ .b8 40
636
+ .b8 57
637
+ .b8 5
638
+ .b32 125
639
+ .b64 $L__tmp1
640
+ .b64 $L__tmp14
641
+ .b8 2
642
+ .b8 243
643
+ .b8 36
644
+ .b8 0
645
+ .b8 5
646
+ .b32 125
647
+ .b64 $L__tmp2
648
+ .b64 $L__tmp15
649
+ .b8 2
650
+ .b8 40
651
+ .b8 57
652
+ .b8 5
653
+ .b32 125
654
+ .b64 $L__tmp15
655
+ .b64 $L__tmp16
656
+ .b8 3
657
+ .b8 40
658
+ .b8 44
659
+ .b8 5
660
+ .b32 125
661
+ .b64 $L__tmp17
662
+ .b64 $L__tmp32
663
+ .b8 2
664
+ .b8 46
665
+ .b8 59
666
+ .b8 4
667
+ .b32 125
668
+ .b64 $L__tmp18
669
+ .b64 $L__tmp31
670
+ .b8 2
671
+ .b8 46
672
+ .b8 59
673
+ .b8 5
674
+ .b32 125
675
+ .b64 $L__tmp18
676
+ .b64 $L__tmp31
677
+ .b8 2
678
+ .b8 243
679
+ .b8 36
680
+ .b8 0
681
+ .b8 5
682
+ .b32 125
683
+ .b64 $L__tmp32
684
+ .b64 $L__tmp33
685
+ .b8 3
686
+ .b8 46
687
+ .b8 45
688
+ .b8 0
689
+ .b8 0
690
+ }
691
+ .section .debug_pubnames
692
+ {
693
+ .b32 $L__pubNames_end0-$L__pubNames_start0
694
+ $L__pubNames_start0:
695
+ .b8 2
696
+ .b8 0
697
+ .b32 .debug_info
698
+ .b32 407
699
+ .b32 125
700
+ .b8 116
701
+ .b8 114
702
+ .b8 105
703
+ .b8 116
704
+ .b8 111
705
+ .b8 110
706
+ .b8 95
707
+ .b8 95
708
+ .b8 48
709
+ .b8 100
710
+ .b8 49
711
+ .b8 100
712
+ .b8 50
713
+ .b8 100
714
+ .b8 51
715
+ .b8 100
716
+ .b8 52
717
+ .b8 100
718
+ .b8 53
719
+ .b8 100
720
+ .b8 54
721
+ .b8 100
722
+ .b8 55
723
+ .b8 100
724
+ .b8 101
725
+ .b8 56
726
+ .b8 100
727
+ .b8 101
728
+ .b8 0
729
+ .b32 0
730
+ $L__pubNames_end0:
731
+ }
732
+ .section .debug_pubtypes
733
+ {
734
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
735
+ $L__pubTypes_start0:
736
+ .b8 2
737
+ .b8 0
738
+ .b32 .debug_info
739
+ .b32 407
740
+ .b32 0
741
+ $L__pubTypes_end0:
742
+ }
743
+ .section .debug_loc { }
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttgir ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
5
+ %cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked>
6
+ %cst_1 = arith.constant 0.000000e+00 : f32
7
+ %c256_i32 = arith.constant 256 : i32
8
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
9
+ %cst_3 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked>
10
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
11
+ %0 = tt.get_program_id x : i32
12
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
13
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
14
+ %3 = arith.muli %0, %c256_i32 : i32
15
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
16
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
17
+ %6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
18
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
19
+ %8 = tt.load %7, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
20
+ %9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
21
+ %10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
22
+ %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
23
+ %12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
24
+ %13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
25
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
26
+ %15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
27
+ %16 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
28
+ %17 = tt.splat %16 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
29
+ %18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
30
+ %19 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
31
+ %20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
32
+ %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
33
+ %22 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
34
+ %23 = tt.addptr %22, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
35
+ %24 = tt.load %23, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
36
+ %25 = arith.mulf %9, %12 : tensor<256xf32, #blocked>
37
+ %26 = arith.select %2, %25, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
38
+ %27 = "tt.reduce"(%26) <{axis = 0 : i32}> ({
39
+ ^bb0(%arg9: f32, %arg10: f32):
40
+ %50 = arith.addf %arg9, %arg10 : f32
41
+ tt.reduce.return %50 : f32
42
+ }) : (tensor<256xf32, #blocked>) -> f32
43
+ %28 = arith.addf %27, %cst_1 : f32
44
+ %29 = tt.broadcast %18 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
45
+ %30 = arith.subf %15, %29 : tensor<256xf32, #blocked>
46
+ %31 = tt.broadcast %21 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
47
+ %32 = arith.mulf %30, %31 : tensor<256xf32, #blocked>
48
+ %33 = arith.mulf %25, %32 : tensor<256xf32, #blocked>
49
+ %34 = arith.select %2, %33, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
50
+ %35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({
51
+ ^bb0(%arg9: f32, %arg10: f32):
52
+ %50 = arith.addf %arg9, %arg10 : f32
53
+ tt.reduce.return %50 : f32
54
+ }) : (tensor<256xf32, #blocked>) -> f32
55
+ %36 = arith.addf %35, %cst_1 : f32
56
+ %37 = arith.divf %21, %cst_0 : tensor<1xf32, #blocked>
57
+ %38 = arith.mulf %25, %cst_3 : tensor<256xf32, #blocked>
58
+ %39 = tt.splat %28 : (f32) -> tensor<256xf32, #blocked>
59
+ %40 = arith.subf %38, %39 : tensor<256xf32, #blocked>
60
+ %41 = tt.splat %36 : (f32) -> tensor<256xf32, #blocked>
61
+ %42 = arith.mulf %32, %41 : tensor<256xf32, #blocked>
62
+ %43 = arith.subf %40, %42 : tensor<256xf32, #blocked>
63
+ %44 = tt.broadcast %37 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
64
+ %45 = arith.mulf %44, %43 : tensor<256xf32, #blocked>
65
+ %46 = arith.addf %24, %45 : tensor<256xf32, #blocked>
66
+ tt.store %23, %46, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
67
+ %47 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
68
+ %48 = tt.addptr %47, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
69
+ %49 = arith.truncf %46 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
70
+ tt.store %48, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
71
+ tt.return
72
+ }
73
+ }
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttir ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c256_i32 = arith.constant 256 : i32
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
5
+ %cst_0 = arith.constant 0.000000e+00 : f32
6
+ %cst_1 = arith.constant dense<0.000000e+00> : tensor<256xf32>
7
+ %cst_2 = arith.constant dense<2.560000e+02> : tensor<256xf32>
8
+ %cst_3 = arith.constant dense<2.560000e+02> : tensor<1xf32>
9
+ %cst_4 = arith.constant dense<256> : tensor<256xi32>
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
12
+ %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
13
+ %3 = arith.muli %0, %c256_i32 : i32
14
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32>
15
+ %5 = arith.addi %1, %4 : tensor<256xi32>
16
+ %6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
17
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
18
+ %8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
19
+ %9 = arith.extf %8 : tensor<256xbf16> to tensor<256xf32>
20
+ %10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
21
+ %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
22
+ %12 = tt.load %11, %2, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
23
+ %13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
24
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
25
+ %15 = tt.load %14, %2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
26
+ %16 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
27
+ %17 = tt.splat %16 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
28
+ %18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32>
29
+ %19 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
30
+ %20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
31
+ %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32>
32
+ %22 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
33
+ %23 = tt.addptr %22, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
34
+ %24 = tt.load %23, %2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
35
+ %25 = arith.mulf %9, %12 : tensor<256xf32>
36
+ %26 = arith.select %2, %25, %cst_1 : tensor<256xi1>, tensor<256xf32>
37
+ %27 = "tt.reduce"(%26) <{axis = 0 : i32}> ({
38
+ ^bb0(%arg9: f32, %arg10: f32):
39
+ %50 = arith.addf %arg9, %arg10 : f32
40
+ tt.reduce.return %50 : f32
41
+ }) : (tensor<256xf32>) -> f32
42
+ %28 = arith.addf %27, %cst_0 : f32
43
+ %29 = tt.broadcast %18 : (tensor<1xf32>) -> tensor<256xf32>
44
+ %30 = arith.subf %15, %29 : tensor<256xf32>
45
+ %31 = tt.broadcast %21 : (tensor<1xf32>) -> tensor<256xf32>
46
+ %32 = arith.mulf %30, %31 : tensor<256xf32>
47
+ %33 = arith.mulf %25, %32 : tensor<256xf32>
48
+ %34 = arith.select %2, %33, %cst_1 : tensor<256xi1>, tensor<256xf32>
49
+ %35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({
50
+ ^bb0(%arg9: f32, %arg10: f32):
51
+ %50 = arith.addf %arg9, %arg10 : f32
52
+ tt.reduce.return %50 : f32
53
+ }) : (tensor<256xf32>) -> f32
54
+ %36 = arith.addf %35, %cst_0 : f32
55
+ %37 = arith.divf %21, %cst_3 : tensor<1xf32>
56
+ %38 = arith.mulf %25, %cst_2 : tensor<256xf32>
57
+ %39 = tt.splat %28 : (f32) -> tensor<256xf32>
58
+ %40 = arith.subf %38, %39 : tensor<256xf32>
59
+ %41 = tt.splat %36 : (f32) -> tensor<256xf32>
60
+ %42 = arith.mulf %32, %41 : tensor<256xf32>
61
+ %43 = arith.subf %40, %42 : tensor<256xf32>
62
+ %44 = tt.broadcast %37 : (tensor<1xf32>) -> tensor<256xf32>
63
+ %45 = arith.mulf %44, %43 : tensor<256xf32>
64
+ %46 = arith.addf %24, %45 : tensor<256xf32>
65
+ tt.store %23, %46, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
66
+ %47 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
67
+ %48 = tt.addptr %47, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
68
+ %49 = arith.truncf %46 : tensor<256xf32> to tensor<256xbf16>
69
+ tt.store %48, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
70
+ tt.return
71
+ }
72
+ }
.triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.ttir ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16>
4
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x1xf32>
5
+ %c50257_i32 = arith.constant 50257 : i32
6
+ %c64_i32 = arith.constant 64 : i32
7
+ %c0_i32 = arith.constant 0 : i32
8
+ %cst_1 = arith.constant dense<50257> : tensor<64x1xi64>
9
+ %cst_2 = arith.constant dense<50257> : tensor<1x64xi64>
10
+ %c64_i64 = arith.constant 64 : i64
11
+ %cst_3 = arith.constant dense<-1> : tensor<64x1xi64>
12
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x64xf32>
13
+ %0 = tt.get_program_id x : i32
14
+ %1 = arith.extsi %0 : i32 to i64
15
+ %2 = arith.muli %1, %c64_i64 : i64
16
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
17
+ %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
18
+ %5 = arith.extsi %4 : tensor<64x1xi32> to tensor<64x1xi64>
19
+ %6 = tt.splat %2 : (i64) -> tensor<64x1xi64>
20
+ %7 = arith.addi %6, %5 : tensor<64x1xi64>
21
+ %8 = tt.expand_dims %3 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32>
22
+ %9 = arith.extsi %8 : tensor<1x64xi32> to tensor<1x64xi64>
23
+ %10 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
24
+ %11 = tt.addptr %10, %7 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi64>
25
+ %12 = tt.load %11 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
26
+ %13 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
27
+ %14 = tt.load %13 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
28
+ %15 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
29
+ %16 = tt.load %15 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
30
+ %17 = arith.muli %7, %cst_1 : tensor<64x1xi64>
31
+ %18 = tt.broadcast %17 : (tensor<64x1xi64>) -> tensor<64x64xi64>
32
+ %19 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
33
+ %20 = arith.cmpi ne, %12, %cst_3 : tensor<64x1xi64>
34
+ %21 = arith.divf %14, %16 : f32
35
+ %22 = tt.splat %21 : (f32) -> tensor<64x1xf32>
36
+ %23 = arith.select %20, %22, %cst_0 : tensor<64x1xi1>, tensor<64x1xf32>
37
+ %24 = tt.broadcast %23 : (tensor<64x1xf32>) -> tensor<64x64xf32>
38
+ %25 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c64_i32 iter_args(%arg10 = %cst_4) -> (tensor<64x64xf32>) : i32 {
39
+ %40 = arith.extsi %arg9 : i32 to i64
40
+ %41 = tt.splat %40 : (i64) -> tensor<1x64xi64>
41
+ %42 = arith.addi %41, %9 : tensor<1x64xi64>
42
+ %43 = arith.cmpi slt, %42, %cst_2 : tensor<1x64xi64>
43
+ %44 = tt.broadcast %42 : (tensor<1x64xi64>) -> tensor<64x64xi64>
44
+ %45 = arith.addi %44, %18 : tensor<64x64xi64>
45
+ %46 = tt.addptr %19, %45 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi64>
46
+ %47 = tt.broadcast %43 : (tensor<1x64xi1>) -> tensor<64x64xi1>
47
+ %48 = tt.load %46, %47, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
48
+ %49 = arith.mulf %48, %24 : tensor<64x64xf32>
49
+ %50 = arith.addf %arg10, %49 : tensor<64x64xf32>
50
+ %51 = arith.select %47, %50, %arg10 : tensor<64x64xi1>, tensor<64x64xf32>
51
+ scf.yield %51 : tensor<64x64xf32>
52
+ }
53
+ %26 = "tt.reduce"(%25) <{axis = 1 : i32}> ({
54
+ ^bb0(%arg9: f32, %arg10: f32):
55
+ %40 = arith.addf %arg9, %arg10 : f32
56
+ tt.reduce.return %40 : f32
57
+ }) : (tensor<64x64xf32>) -> tensor<64xf32>
58
+ %27 = tt.expand_dims %26 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
59
+ %28 = arith.muli %7, %cst_1 : tensor<64x1xi64>
60
+ %29 = tt.broadcast %28 : (tensor<64x1xi64>) -> tensor<64x64xi64>
61
+ %30 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
62
+ %31 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
63
+ %32 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
64
+ %33 = arith.cmpi ne, %12, %cst_3 : tensor<64x1xi64>
65
+ %34 = arith.divf %14, %16 : f32
66
+ %35 = tt.splat %34 : (f32) -> tensor<64x1xf32>
67
+ %36 = arith.select %33, %35, %cst_0 : tensor<64x1xi1>, tensor<64x1xf32>
68
+ %37 = tt.broadcast %36 : (tensor<64x1xf32>) -> tensor<64x64xf32>
69
+ %38 = tt.broadcast %27 : (tensor<64x1xf32>) -> tensor<64x64xf32>
70
+ %39 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
71
+ scf.for %arg9 = %c0_i32 to %c50257_i32 step %c64_i32 : i32 {
72
+ %40 = arith.extsi %arg9 : i32 to i64
73
+ %41 = tt.splat %40 : (i64) -> tensor<1x64xi64>
74
+ %42 = arith.addi %41, %9 : tensor<1x64xi64>
75
+ %43 = arith.cmpi slt, %42, %cst_2 : tensor<1x64xi64>
76
+ %44 = tt.broadcast %42 : (tensor<1x64xi64>) -> tensor<64x64xi64>
77
+ %45 = arith.addi %44, %29 : tensor<64x64xi64>
78
+ %46 = tt.addptr %30, %45 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi64>
79
+ %47 = tt.broadcast %43 : (tensor<1x64xi1>) -> tensor<64x64xi1>
80
+ %48 = tt.load %46, %47, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xbf16>
81
+ %49 = arith.extf %48 : tensor<64x64xbf16> to tensor<64x64xf32>
82
+ %50 = tt.addptr %31, %45 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi64>
83
+ %51 = tt.load %50, %47, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32>
84
+ %52 = tt.addptr %32, %45 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi64>
85
+ %53 = tt.load %52, %47, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xbf16>
86
+ %54 = arith.extf %53 : tensor<64x64xbf16> to tensor<64x64xf32>
87
+ %55 = arith.mulf %51, %37 : tensor<64x64xf32>
88
+ %56 = math.exp %54 : tensor<64x64xf32>
89
+ %57 = arith.mulf %56, %38 : tensor<64x64xf32>
90
+ %58 = arith.subf %55, %57 : tensor<64x64xf32>
91
+ %59 = arith.addf %49, %58 : tensor<64x64xf32>
92
+ %60 = tt.addptr %39, %45 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi64>
93
+ %61 = arith.truncf %59 : tensor<64x64xf32> to tensor<64x64xbf16>
94
+ tt.store %60, %61, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16>
95
+ }
96
+ tt.return
97
+ }
98
+ }
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.cubin ADDED
Binary file (31.3 kB). View file
 
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ptx ADDED
@@ -0,0 +1,1054 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6de7de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
23
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .extern .shared .align 1 .b8 global_smem[];
26
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
27
+
28
+ .visible .entry triton__0d1d2d3d4d5d6de7de(
29
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
30
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
31
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
32
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
33
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
34
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
35
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
36
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
37
+ )
38
+ .maxntid 128, 1, 1
39
+ {
40
+ .reg .pred %p<56>;
41
+ .reg .b16 %rs<13>;
42
+ .reg .b32 %r<185>;
43
+ .reg .f32 %f<169>;
44
+ .reg .b64 %rd<59>;
45
+ .loc 1 18 0
46
+ $L__func_begin0:
47
+ .loc 1 18 0
48
+
49
+ ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6de7de_param_4];
50
+ ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6de7de_param_1];
51
+ ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6de7de_param_0];
52
+ $L__tmp0:
53
+ .loc 1 22 44
54
+ mov.u32 %r1, %tid.x;
55
+ and.b32 %r2, %r1, 31;
56
+ ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6de7de_param_2];
57
+ ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6de7de_param_3];
58
+ bfe.u32 %r3, %r1, 6, 1;
59
+ and.b32 %r4, %r1, 1;
60
+ .loc 1 24 33
61
+ bfe.u32 %r5, %r1, 5, 1;
62
+ shl.b32 %r31, %r1, 2;
63
+ and.b32 %r6, %r31, 252;
64
+ shl.b32 %r32, %r1, 1;
65
+ and.b32 %r7, %r32, 254;
66
+ .loc 1 21 28
67
+ mov.u32 %r14, %ctaid.x;
68
+ .loc 1 21 33
69
+ shl.b32 %r33, %r14, 1;
70
+ .loc 1 22 23
71
+ or.b32 %r34, %r33, %r3;
72
+ or.b32 %r35, %r33, %r4;
73
+ .loc 1 26 30
74
+ mul.wide.s32 %rd25, %r34, 8;
75
+ add.s64 %rd11, %rd22, %rd25;
76
+ mul.wide.s32 %rd26, %r35, 8;
77
+ add.s64 %rd19, %rd22, %rd26;
78
+ mov.pred %p50, -1;
79
+ .loc 1 26 35
80
+ mov.u64 %rd10, 0x0;
81
+ @%p50 ld.global.L1::evict_last.b64 { %rd10 }, [ %rd11 + 0 ];
82
+ mov.u64 %rd12, 0x0;
83
+ @%p50 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd11 + 0 ];
84
+ mov.u64 %rd14, 0x0;
85
+ @%p50 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd11 + 0 ];
86
+ mov.u64 %rd16, 0x0;
87
+ @%p50 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd11 + 0 ];
88
+ mov.u64 %rd18, 0x0;
89
+ @%p50 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd19 + 0 ];
90
+ .loc 1 27 18
91
+ bfe.s32 %r36, %r14, 30, 1;
92
+ shr.u32 %r37, %r36, 23;
93
+ add.s32 %r38, %r34, %r37;
94
+ and.b32 %r39, %r38, 16776704;
95
+ sub.s32 %r40, %r34, %r39;
96
+ .loc 1 35 44
97
+ shl.b32 %r41, %r40, 8;
98
+ .loc 1 35 40
99
+ or.b32 %r42, %r41, %r6;
100
+ .loc 1 35 34
101
+ mul.wide.s32 %rd27, %r42, 4;
102
+ add.s64 %rd38, %rd23, %rd27;
103
+ mov.b32 %r155, 0;
104
+ .loc 1 35 50
105
+ mov.u32 %r15, 0x0;
106
+ mov.u32 %r16, 0x0;
107
+ mov.u32 %r17, 0x0;
108
+ mov.u32 %r18, 0x0;
109
+ @%p50 ld.global.L1::evict_last.v4.b32 { %r15, %r16, %r17, %r18 }, [ %rd38 + 0 ];
110
+ @!%p50 mov.u32 %r15, %r155;
111
+ @!%p50 mov.u32 %r16, %r155;
112
+ @!%p50 mov.u32 %r17, %r155;
113
+ @!%p50 mov.u32 %r18, %r155;
114
+ mov.b32 %f2, %r15;
115
+ mov.b32 %f1, %r16;
116
+ mov.b32 %f3, %r17;
117
+ mov.b32 %f4, %r18;
118
+ .loc 1 36 44
119
+ shl.b32 %r43, %r34, 8;
120
+ .loc 1 36 40
121
+ or.b32 %r44, %r43, %r6;
122
+ .loc 1 36 34
123
+ mul.wide.s32 %rd28, %r44, 2;
124
+ add.s64 %rd39, %rd24, %rd28;
125
+ .loc 1 36 50
126
+ mov.u32 %r23, 0x0;
127
+ mov.u32 %r24, 0x0;
128
+ @%p50 ld.global.L1::evict_last.v2.b32 { %r23, %r24 }, [ %rd39 + 0 ];
129
+ @!%p50 mov.u32 %r23, %r155;
130
+ @!%p50 mov.u32 %r24, %r155;
131
+ cvt.u16.u32 %rs1, %r23;
132
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r23; }
133
+ cvt.u16.u32 %rs3, %r24;
134
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r24; }
135
+ .loc 1 36 101
136
+ cvt.f32.bf16 %r27, %rs1;
137
+ mov.b32 %f5, %r27;
138
+ cvt.f32.bf16 %r28, %rs2;
139
+ mov.b32 %f6, %r28;
140
+ cvt.f32.bf16 %r29, %rs3;
141
+ mov.b32 %f7, %r29;
142
+ cvt.f32.bf16 %r30, %rs4;
143
+ mov.b32 %f8, %r30;
144
+ .loc 1 37 22
145
+ add.s64 %rd29, %rd18, 50257;
146
+ .loc 1 38 22
147
+ setp.lt.s64 %p14, %rd18, 0;
148
+ .loc 1 39 36
149
+ selp.b64 %rd5, %rd29, %rd18, %p14;
150
+ .loc 1 40 40
151
+ setp.lt.u64 %p15, %rd5, 50257;
152
+ mov.b32 %r184, 883;
153
+ mov.u64 %rd58, 1;
154
+ .loc 1 40 55
155
+ @%p15 bra $L__BB0_2;
156
+ mov.u64 %rd30, assertMessage_0;
157
+ cvta.global.u64 %rd31, %rd30;
158
+ mov.u64 %rd32, assertFile_0;
159
+ cvta.global.u64 %rd33, %rd32;
160
+ mov.u64 %rd34, assertFunc_0;
161
+ cvta.global.u64 %rd35, %rd34;
162
+ { // callseq 4, 0
163
+ .reg .b32 temp_param_reg;
164
+ .param .b64 param0;
165
+ st.param.b64 [param0+0], %rd31;
166
+ .param .b64 param1;
167
+ st.param.b64 [param1+0], %rd33;
168
+ .param .b32 param2;
169
+ st.param.b32 [param2+0], %r184;
170
+ .param .b64 param3;
171
+ st.param.b64 [param3+0], %rd35;
172
+ .param .b64 param4;
173
+ st.param.b64 [param4+0], %rd58;
174
+ call.uni
175
+ __assertfail,
176
+ (
177
+ param0,
178
+ param1,
179
+ param2,
180
+ param3,
181
+ param4
182
+ );
183
+ } // callseq 4
184
+ $L__BB0_2:
185
+ .loc 1 0 55
186
+ ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_5];
187
+ cvt.s64.s32 %rd3, %r44;
188
+ .loc 1 38 22
189
+ setp.lt.s64 %p42, %rd10, 0;
190
+ .loc 1 41 44
191
+ shl.b64 %rd41, %rd10, 8;
192
+ add.s64 %rd42, %rd41, 12865792;
193
+ selp.b64 %rd43, %rd42, %rd41, %p42;
194
+ cvt.u64.u32 %rd44, %r6;
195
+ .loc 1 41 40
196
+ or.b64 %rd45, %rd43, %rd44;
197
+ .loc 1 41 34
198
+ shl.b64 %rd46, %rd45, 2;
199
+ add.s64 %rd55, %rd7, %rd46;
200
+ .loc 1 41 52
201
+ mov.u32 %r46, 0x0;
202
+ mov.u32 %r47, 0x0;
203
+ mov.u32 %r48, 0x0;
204
+ mov.u32 %r49, 0x0;
205
+ @%p50 ld.global.L1::evict_last.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd55 + 0 ];
206
+ @!%p50 mov.u32 %r46, %r155;
207
+ @!%p50 mov.u32 %r47, %r155;
208
+ @!%p50 mov.u32 %r48, %r155;
209
+ @!%p50 mov.u32 %r49, %r155;
210
+ mov.b32 %f15, %r48;
211
+ mov.b32 %f16, %r49;
212
+ .loc 1 42 22
213
+ add.f32 %f17, %f3, %f15;
214
+ add.f32 %f18, %f4, %f16;
215
+ .loc 1 44 22
216
+ add.f32 %f19, %f7, %f17;
217
+ add.f32 %f20, %f8, %f18;
218
+ .loc 1 41 52
219
+ mov.b32 %f21, %r46;
220
+ mov.b32 %f22, %r47;
221
+ .loc 1 42 22
222
+ add.f32 %f23, %f1, %f22;
223
+ add.f32 %f24, %f2, %f21;
224
+ .loc 1 44 22
225
+ add.f32 %f25, %f5, %f24;
226
+ add.f32 %f26, %f6, %f23;
227
+ $L__tmp1:
228
+ .loc 2 98 22
229
+ add.f32 %f27, %f26, 0f00000000;
230
+ add.f32 %f28, %f25, 0f00000000;
231
+ add.f32 %f29, %f19, 0f00000000;
232
+ add.f32 %f30, %f20, 0f00000000;
233
+ .loc 2 101 30
234
+ sub.f32 %f31, %f25, %f28;
235
+ sub.f32 %f32, %f26, %f27;
236
+ sub.f32 %f33, %f19, %f29;
237
+ sub.f32 %f34, %f20, %f30;
238
+ .loc 2 101 13
239
+ fma.rn.f32 %f35, %f25, %f31, 0f00000000;
240
+ fma.rn.f32 %f36, %f26, %f32, 0f00000000;
241
+ fma.rn.f32 %f37, %f19, %f33, 0f00000000;
242
+ fma.rn.f32 %f38, %f20, %f34, 0f00000000;
243
+ $L__tmp2:
244
+ .loc 2 108 21
245
+ sub.f32 %f39, %f27, %f28;
246
+ mov.b32 %r55, 1065353216;
247
+ mov.b32 %r56, 1073741824;
248
+ .loc 2 110 60
249
+ div.full.f32 %r54, %r55, %r56;
250
+ mov.b32 %f40, %r54;
251
+ .loc 2 112 17
252
+ fma.rn.f32 %f41, %f40, %f39, %f28;
253
+ .loc 2 113 15
254
+ add.f32 %f42, %f35, %f36;
255
+ .loc 2 113 30
256
+ mul.f32 %f43, %f39, %f39;
257
+ .loc 2 113 22
258
+ fma.rn.f32 %f44, %f40, %f43, %f42;
259
+ .loc 2 108 21
260
+ sub.f32 %f45, %f29, %f41;
261
+ mov.b32 %r59, 1077936128;
262
+ .loc 2 110 60
263
+ div.full.f32 %r57, %r55, %r59;
264
+ mov.b32 %f46, %r57;
265
+ .loc 2 112 17
266
+ fma.rn.f32 %f47, %f46, %f45, %f41;
267
+ .loc 2 113 15
268
+ add.f32 %f48, %f37, %f44;
269
+ .loc 2 113 30
270
+ mul.f32 %f49, %f45, %f45;
271
+ .loc 2 113 38
272
+ fma.rn.f32 %f50, %f45, %f45, %f49;
273
+ .loc 2 113 22
274
+ fma.rn.f32 %f51, %f46, %f50, %f48;
275
+ .loc 2 108 21
276
+ sub.f32 %f52, %f30, %f47;
277
+ mov.b32 %r62, 1082130432;
278
+ .loc 2 110 60
279
+ div.full.f32 %r60, %r55, %r62;
280
+ mov.b32 %f53, %r60;
281
+ .loc 2 112 17
282
+ fma.rn.f32 %f54, %f53, %f52, %f47;
283
+ .loc 2 113 15
284
+ add.f32 %f55, %f38, %f51;
285
+ .loc 2 113 30
286
+ mul.f32 %f56, %f52, %f52;
287
+ .loc 2 113 38
288
+ mul.f32 %f57, %f56, 0f40400000;
289
+ .loc 2 113 22
290
+ fma.rn.f32 %f58, %f53, %f57, %f55;
291
+ $L__tmp3:
292
+ .loc 2 120 46
293
+ mov.b32 %r119, %f54;
294
+ shfl.sync.bfly.b32 %r120, %r119, 16, 31, -1;
295
+ mov.b32 %f59, %r120;
296
+ mov.b32 %r121, %f58;
297
+ shfl.sync.bfly.b32 %r122, %r121, 16, 31, -1;
298
+ mov.b32 %f60, %r122;
299
+ shfl.sync.bfly.b32 %r64, %r62, 16, 31, -1;
300
+ mov.b32 %f61, %r64;
301
+ $L__tmp4:
302
+ .loc 2 108 21
303
+ sub.f32 %f62, %f59, %f54;
304
+ .loc 2 109 28
305
+ add.f32 %f63, %f61, 0f40800000;
306
+ .loc 2 110 39
307
+ setp.eq.f32 %p43, %f63, 0f00000000;
308
+ .loc 2 110 60
309
+ mov.b32 %r65, %f63;
310
+ div.full.f32 %r63, %r64, %r65;
311
+ mov.b32 %f64, %r63;
312
+ .loc 2 110 49
313
+ selp.f32 %f65, 0f00000000, %f64, %p43;
314
+ .loc 2 112 17
315
+ fma.rn.f32 %f66, %f65, %f62, %f54;
316
+ .loc 2 113 15
317
+ add.f32 %f67, %f58, %f60;
318
+ .loc 2 113 30
319
+ mul.f32 %f68, %f62, %f62;
320
+ .loc 2 113 38
321
+ mul.f32 %f69, %f68, 0f40800000;
322
+ .loc 2 113 22
323
+ fma.rn.f32 %f70, %f65, %f69, %f67;
324
+ $L__tmp5:
325
+ .loc 2 120 46
326
+ mov.b32 %r123, %f66;
327
+ shfl.sync.bfly.b32 %r124, %r123, 8, 31, -1;
328
+ mov.b32 %f71, %r124;
329
+ mov.b32 %r125, %f70;
330
+ shfl.sync.bfly.b32 %r126, %r125, 8, 31, -1;
331
+ mov.b32 %f72, %r126;
332
+ shfl.sync.bfly.b32 %r67, %r65, 8, 31, -1;
333
+ mov.b32 %f73, %r67;
334
+ $L__tmp6:
335
+ .loc 2 108 21
336
+ sub.f32 %f74, %f71, %f66;
337
+ .loc 2 109 28
338
+ add.f32 %f75, %f63, %f73;
339
+ .loc 2 110 39
340
+ setp.eq.f32 %p44, %f75, 0f00000000;
341
+ .loc 2 110 60
342
+ mov.b32 %r68, %f75;
343
+ div.full.f32 %r66, %r67, %r68;
344
+ mov.b32 %f76, %r66;
345
+ .loc 2 110 49
346
+ selp.f32 %f77, 0f00000000, %f76, %p44;
347
+ .loc 2 112 17
348
+ fma.rn.f32 %f78, %f77, %f74, %f66;
349
+ .loc 2 113 15
350
+ add.f32 %f79, %f70, %f72;
351
+ .loc 2 113 30
352
+ mul.f32 %f80, %f74, %f74;
353
+ .loc 2 113 38
354
+ mul.f32 %f81, %f63, %f80;
355
+ .loc 2 113 22
356
+ fma.rn.f32 %f82, %f77, %f81, %f79;
357
+ $L__tmp7:
358
+ .loc 2 120 46
359
+ mov.b32 %r127, %f78;
360
+ shfl.sync.bfly.b32 %r128, %r127, 4, 31, -1;
361
+ mov.b32 %f83, %r128;
362
+ mov.b32 %r129, %f82;
363
+ shfl.sync.bfly.b32 %r130, %r129, 4, 31, -1;
364
+ mov.b32 %f84, %r130;
365
+ shfl.sync.bfly.b32 %r70, %r68, 4, 31, -1;
366
+ mov.b32 %f85, %r70;
367
+ $L__tmp8:
368
+ .loc 2 108 21
369
+ sub.f32 %f86, %f83, %f78;
370
+ .loc 2 109 28
371
+ add.f32 %f87, %f75, %f85;
372
+ .loc 2 110 39
373
+ setp.eq.f32 %p45, %f87, 0f00000000;
374
+ .loc 2 110 60
375
+ mov.b32 %r71, %f87;
376
+ div.full.f32 %r69, %r70, %r71;
377
+ mov.b32 %f88, %r69;
378
+ .loc 2 110 49
379
+ selp.f32 %f89, 0f00000000, %f88, %p45;
380
+ .loc 2 112 17
381
+ fma.rn.f32 %f90, %f89, %f86, %f78;
382
+ .loc 2 113 15
383
+ add.f32 %f91, %f82, %f84;
384
+ .loc 2 113 30
385
+ mul.f32 %f92, %f86, %f86;
386
+ .loc 2 113 38
387
+ mul.f32 %f93, %f75, %f92;
388
+ .loc 2 113 22
389
+ fma.rn.f32 %f94, %f89, %f93, %f91;
390
+ $L__tmp9:
391
+ .loc 2 120 46
392
+ mov.b32 %r131, %f90;
393
+ shfl.sync.bfly.b32 %r132, %r131, 2, 31, -1;
394
+ mov.b32 %f95, %r132;
395
+ mov.b32 %r133, %f94;
396
+ shfl.sync.bfly.b32 %r134, %r133, 2, 31, -1;
397
+ mov.b32 %f96, %r134;
398
+ shfl.sync.bfly.b32 %r73, %r71, 2, 31, -1;
399
+ mov.b32 %f97, %r73;
400
+ $L__tmp10:
401
+ .loc 2 108 21
402
+ sub.f32 %f98, %f95, %f90;
403
+ .loc 2 109 28
404
+ add.f32 %f99, %f87, %f97;
405
+ .loc 2 110 39
406
+ setp.eq.f32 %p46, %f99, 0f00000000;
407
+ .loc 2 110 60
408
+ mov.b32 %r74, %f99;
409
+ div.full.f32 %r72, %r73, %r74;
410
+ mov.b32 %f100, %r72;
411
+ .loc 2 110 49
412
+ selp.f32 %f101, 0f00000000, %f100, %p46;
413
+ .loc 2 112 17
414
+ fma.rn.f32 %f102, %f101, %f98, %f90;
415
+ .loc 2 113 15
416
+ add.f32 %f103, %f94, %f96;
417
+ .loc 2 113 30
418
+ mul.f32 %f104, %f98, %f98;
419
+ .loc 2 113 38
420
+ mul.f32 %f105, %f87, %f104;
421
+ .loc 2 113 22
422
+ fma.rn.f32 %f106, %f101, %f105, %f103;
423
+ $L__tmp11:
424
+ .loc 2 120 46
425
+ mov.b32 %r135, %f102;
426
+ shfl.sync.bfly.b32 %r136, %r135, 1, 31, -1;
427
+ mov.b32 %f107, %r136;
428
+ mov.b32 %r137, %f106;
429
+ shfl.sync.bfly.b32 %r138, %r137, 1, 31, -1;
430
+ mov.b32 %f108, %r138;
431
+ shfl.sync.bfly.b32 %r76, %r74, 1, 31, -1;
432
+ mov.b32 %f109, %r76;
433
+ $L__tmp12:
434
+ .loc 2 108 21
435
+ sub.f32 %f110, %f107, %f102;
436
+ .loc 2 109 28
437
+ add.f32 %f111, %f99, %f109;
438
+ .loc 2 110 39
439
+ setp.eq.f32 %p47, %f111, 0f00000000;
440
+ .loc 2 110 60
441
+ mov.b32 %r77, %f111;
442
+ div.full.f32 %r75, %r76, %r77;
443
+ mov.b32 %f112, %r75;
444
+ .loc 2 110 49
445
+ selp.f32 %f113, 0f00000000, %f112, %p47;
446
+ .loc 2 112 17
447
+ fma.rn.f32 %f114, %f113, %f110, %f102;
448
+ .loc 2 113 15
449
+ add.f32 %f115, %f106, %f108;
450
+ .loc 2 113 30
451
+ mul.f32 %f116, %f110, %f110;
452
+ .loc 2 113 38
453
+ mul.f32 %f117, %f99, %f116;
454
+ .loc 2 113 22
455
+ fma.rn.f32 %f118, %f113, %f117, %f115;
456
+ $L__tmp13:
457
+ .loc 2 120 46
458
+ setp.eq.s32 %p21, %r2, 0;
459
+ shl.b32 %r139, %r5, 2;
460
+ shl.b32 %r140, %r3, 3;
461
+ or.b32 %r141, %r140, %r139;
462
+ mov.u32 %r142, global_smem;
463
+ add.s32 %r78, %r142, %r141;
464
+ mov.b32 %r79, %f114;
465
+ @%p21 st.shared.b32 [ %r78 + 0 ], %r79;
466
+ add.s32 %r143, %r142, 16;
467
+ add.s32 %r80, %r143, %r141;
468
+ mov.b32 %r81, %f118;
469
+ @%p21 st.shared.b32 [ %r80 + 0 ], %r81;
470
+ add.s32 %r144, %r142, 32;
471
+ add.s32 %r82, %r144, %r141;
472
+ @%p21 st.shared.b32 [ %r82 + 0 ], %r77;
473
+ bar.sync 0;
474
+ setp.lt.s32 %p24, %r1, 4;
475
+ add.s32 %r85, %r142, %r31;
476
+ @%p24 ld.shared.b32 %r84, [ %r85 + 0 ];
477
+ mov.b32 %f119, %r84;
478
+ add.s32 %r87, %r143, %r31;
479
+ @%p24 ld.shared.b32 %r86, [ %r87 + 0 ];
480
+ mov.b32 %f120, %r86;
481
+ add.s32 %r89, %r144, %r31;
482
+ @%p24 ld.shared.b32 %r88, [ %r89 + 0 ];
483
+ mov.b32 %f121, %r88;
484
+ shfl.sync.bfly.b32 %r146, %r84, 1, 31, -1;
485
+ mov.b32 %f122, %r146;
486
+ shfl.sync.bfly.b32 %r147, %r86, 1, 31, -1;
487
+ mov.b32 %f123, %r147;
488
+ shfl.sync.bfly.b32 %r91, %r88, 1, 31, -1;
489
+ mov.b32 %f124, %r91;
490
+ $L__tmp14:
491
+ .loc 2 108 21
492
+ sub.f32 %f125, %f122, %f119;
493
+ .loc 2 109 28
494
+ add.f32 %f126, %f121, %f124;
495
+ .loc 2 110 39
496
+ setp.eq.f32 %p48, %f126, 0f00000000;
497
+ .loc 2 110 60
498
+ mov.b32 %r92, %f126;
499
+ div.full.f32 %r90, %r91, %r92;
500
+ mov.b32 %f127, %r90;
501
+ .loc 2 110 49
502
+ selp.f32 %f128, 0f00000000, %f127, %p48;
503
+ .loc 2 112 17
504
+ fma.rn.f32 %f129, %f125, %f128, %f119;
505
+ .loc 2 113 15
506
+ add.f32 %f130, %f120, %f123;
507
+ .loc 2 113 30
508
+ mul.f32 %f131, %f125, %f125;
509
+ .loc 2 113 38
510
+ mul.f32 %f132, %f121, %f131;
511
+ .loc 2 113 22
512
+ fma.rn.f32 %f133, %f132, %f128, %f130;
513
+ $L__tmp15:
514
+ .loc 2 120 46
515
+ setp.eq.s32 %p49, %r4, 0;
516
+ and.pred %p27, %p24, %p49;
517
+ mov.b32 %r94, %f129;
518
+ @%p27 st.shared.b32 [ %r85 + 0 ], %r94;
519
+ mov.b32 %r96, %f133;
520
+ @%p27 st.shared.b32 [ %r87 + 0 ], %r96;
521
+ @%p27 st.shared.b32 [ %r89 + 0 ], %r92;
522
+ bar.sync 0;
523
+ add.s32 %r148, %r142, %r140;
524
+ ld.shared.f32 %f9, [%r148];
525
+ add.s32 %r149, %r143, %r140;
526
+ ld.shared.f32 %f10, [%r149];
527
+ $L__tmp16:
528
+ .loc 1 62 51
529
+ mov.u32 %r99, 0x0;
530
+ mov.u32 %r100, 0x0;
531
+ mov.u32 %r101, 0x0;
532
+ mov.u32 %r102, 0x0;
533
+ @%p50 ld.global.L1::evict_last.v4.b32 { %r99, %r100, %r101, %r102 }, [ %rd38 + 0 ];
534
+ @!%p50 mov.u32 %r99, %r155;
535
+ @!%p50 mov.u32 %r100, %r155;
536
+ @!%p50 mov.u32 %r101, %r155;
537
+ @!%p50 mov.u32 %r102, %r155;
538
+ .loc 1 63 51
539
+ mov.u32 %r107, 0x0;
540
+ mov.u32 %r108, 0x0;
541
+ @%p50 ld.global.L1::evict_first.v2.b32 { %r107, %r108 }, [ %rd39 + 0 ];
542
+ @!%p50 mov.u32 %r107, %r155;
543
+ @!%p50 mov.u32 %r108, %r155;
544
+ cvt.u16.u32 %rs5, %r107;
545
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r107; }
546
+ cvt.u16.u32 %rs7, %r108;
547
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r108; }
548
+ .loc 1 63 103
549
+ cvt.f32.bf16 %r111, %rs5;
550
+ mov.b32 %f11, %r111;
551
+ cvt.f32.bf16 %r112, %rs6;
552
+ mov.b32 %f12, %r112;
553
+ cvt.f32.bf16 %r113, %rs7;
554
+ mov.b32 %f13, %r113;
555
+ cvt.f32.bf16 %r114, %rs8;
556
+ mov.b32 %f14, %r114;
557
+ .loc 1 64 35
558
+ mul.wide.u32 %rd47, %r7, 4;
559
+ add.s64 %rd40, %rd8, %rd47;
560
+ .loc 1 64 40
561
+ mov.u32 %r115, 0x0;
562
+ mov.u32 %r116, 0x0;
563
+ @%p50 ld.global.L1::evict_last.v2.b32 { %r115, %r116 }, [ %rd40 + 0 ];
564
+ @!%p50 mov.u32 %r115, %r155;
565
+ @!%p50 mov.u32 %r116, %r155;
566
+ .loc 1 68 57
567
+ @%p15 bra $L__BB0_4;
568
+ mov.u64 %rd48, assertMessage_1;
569
+ cvta.global.u64 %rd49, %rd48;
570
+ mov.u64 %rd50, assertFile_1;
571
+ cvta.global.u64 %rd51, %rd50;
572
+ mov.u64 %rd52, assertFunc_1;
573
+ cvta.global.u64 %rd53, %rd52;
574
+ { // callseq 5, 0
575
+ .reg .b32 temp_param_reg;
576
+ .param .b64 param0;
577
+ st.param.b64 [param0+0], %rd49;
578
+ .param .b64 param1;
579
+ st.param.b64 [param1+0], %rd51;
580
+ .param .b32 param2;
581
+ st.param.b32 [param2+0], %r184;
582
+ .param .b64 param3;
583
+ st.param.b64 [param3+0], %rd53;
584
+ .param .b64 param4;
585
+ st.param.b64 [param4+0], %rd58;
586
+ call.uni
587
+ __assertfail,
588
+ (
589
+ param0,
590
+ param1,
591
+ param2,
592
+ param3,
593
+ param4
594
+ );
595
+ } // callseq 5
596
+ $L__BB0_4:
597
+ .loc 1 69 54
598
+ mov.u32 %r151, 0x0;
599
+ mov.u32 %r152, 0x0;
600
+ mov.u32 %r153, 0x0;
601
+ mov.u32 %r154, 0x0;
602
+ @%p50 ld.global.L1::evict_first.v4.b32 { %r151, %r152, %r153, %r154 }, [ %rd55 + 0 ];
603
+ @!%p50 mov.u32 %r151, %r155;
604
+ @!%p50 mov.u32 %r152, %r155;
605
+ @!%p50 mov.u32 %r153, %r155;
606
+ @!%p50 mov.u32 %r154, %r155;
607
+ .loc 1 75 24
608
+ mov.b32 %r160, %f10;
609
+ mov.b32 %r161, 1132462080;
610
+ div.full.f32 %r159, %r160, %r161;
611
+ mov.b32 %f134, %r159;
612
+ .loc 1 77 24
613
+ add.f32 %f135, %f134, 0f3727C5AC;
614
+ .loc 1 78 30
615
+ rsqrt.approx.ftz.f32 %f136, %f135;
616
+ .loc 1 69 54
617
+ mov.b32 %f137, %r154;
618
+ .loc 1 62 51
619
+ mov.b32 %f138, %r102;
620
+ .loc 1 70 24
621
+ add.f32 %f139, %f138, %f137;
622
+ .loc 1 72 24
623
+ add.f32 %f140, %f14, %f139;
624
+ .loc 1 73 24
625
+ sub.f32 %f141, %f140, %f9;
626
+ .loc 1 69 54
627
+ mov.b32 %f142, %r153;
628
+ .loc 1 62 51
629
+ mov.b32 %f143, %r101;
630
+ .loc 1 70 24
631
+ add.f32 %f144, %f143, %f142;
632
+ .loc 1 72 24
633
+ add.f32 %f145, %f13, %f144;
634
+ .loc 1 73 24
635
+ sub.f32 %f146, %f145, %f9;
636
+ .loc 1 69 54
637
+ mov.b32 %f147, %r152;
638
+ .loc 1 62 51
639
+ mov.b32 %f148, %r100;
640
+ .loc 1 70 24
641
+ add.f32 %f149, %f148, %f147;
642
+ .loc 1 72 24
643
+ add.f32 %f150, %f12, %f149;
644
+ .loc 1 73 24
645
+ sub.f32 %f151, %f150, %f9;
646
+ .loc 1 69 54
647
+ mov.b32 %f152, %r151;
648
+ .loc 1 62 51
649
+ mov.b32 %f153, %r99;
650
+ .loc 1 70 24
651
+ add.f32 %f154, %f153, %f152;
652
+ .loc 1 72 24
653
+ add.f32 %f155, %f11, %f154;
654
+ .loc 1 73 24
655
+ sub.f32 %f156, %f155, %f9;
656
+ .loc 1 79 24
657
+ mul.f32 %f157, %f156, %f136;
658
+ mul.f32 %f158, %f151, %f136;
659
+ mul.f32 %f159, %f146, %f136;
660
+ mul.f32 %f160, %f141, %f136;
661
+ .loc 1 80 24
662
+ bar.sync 0;
663
+ shl.b32 %r177, %r7, 2;
664
+ add.s32 %r179, %r142, %r177;
665
+ st.shared.v2.u32 [%r179], {%r115, %r116};
666
+ bar.sync 0;
667
+ shl.b32 %r180, %r6, 2;
668
+ add.s32 %r181, %r142, %r180;
669
+ ld.shared.v4.f32 {%f161, %f162, %f163, %f164}, [%r181];
670
+ mul.f32 %f165, %f157, %f161;
671
+ mul.f32 %f166, %f158, %f162;
672
+ mul.f32 %f167, %f159, %f163;
673
+ mul.f32 %f168, %f160, %f164;
674
+ .loc 1 82 29
675
+ shl.b64 %rd57, %rd3, 1;
676
+ add.s64 %rd56, %rd9, %rd57;
677
+ .loc 1 82 52
678
+ mov.b32 %r171, %f165;
679
+ cvt.rn.bf16.f32 %rs9, %r171;
680
+ mov.b32 %r172, %f166;
681
+ cvt.rn.bf16.f32 %rs10, %r172;
682
+ mov.b32 %r173, %f167;
683
+ cvt.rn.bf16.f32 %rs11, %r173;
684
+ mov.b32 %r174, %f168;
685
+ cvt.rn.bf16.f32 %rs12, %r174;
686
+ mov.b32 %r182, {%rs9, %rs10};
687
+ mov.b32 %r183, {%rs11, %rs12};
688
+ @%p50 st.global.v2.b32 [ %rd56 + 0 ], { %r182, %r183 };
689
+ .loc 1 58 4
690
+ ret;
691
+ $L__tmp17:
692
+ $L__func_end0:
693
+
694
+ }
695
+ // .globl __nv_rsqrtf
696
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
697
+ .param .b32 __nv_rsqrtf_param_0
698
+ )
699
+ {
700
+ .reg .f32 %f<3>;
701
+ $L__func_begin1:
702
+
703
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
704
+ rsqrt.approx.ftz.f32 %f2, %f1;
705
+ st.param.f32 [func_retval0+0], %f2;
706
+ ret;
707
+ $L__func_end1:
708
+
709
+ }
710
+ .file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py"
711
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
712
+ .section .debug_abbrev
713
+ {
714
+ .b8 1
715
+ .b8 17
716
+ .b8 1
717
+ .b8 37
718
+ .b8 8
719
+ .b8 19
720
+ .b8 5
721
+ .b8 3
722
+ .b8 8
723
+ .b8 16
724
+ .b8 6
725
+ .b8 27
726
+ .b8 8
727
+ .b8 180
728
+ .b8 66
729
+ .b8 12
730
+ .b8 17
731
+ .b8 1
732
+ .b8 18
733
+ .b8 1
734
+ .b8 0
735
+ .b8 0
736
+ .b8 2
737
+ .b8 46
738
+ .b8 0
739
+ .b8 135
740
+ .b8 64
741
+ .b8 8
742
+ .b8 3
743
+ .b8 8
744
+ .b8 58
745
+ .b8 11
746
+ .b8 59
747
+ .b8 11
748
+ .b8 63
749
+ .b8 12
750
+ .b8 32
751
+ .b8 11
752
+ .b8 0
753
+ .b8 0
754
+ .b8 3
755
+ .b8 46
756
+ .b8 1
757
+ .b8 17
758
+ .b8 1
759
+ .b8 18
760
+ .b8 1
761
+ .b8 64
762
+ .b8 10
763
+ .b8 49
764
+ .b8 19
765
+ .b8 0
766
+ .b8 0
767
+ .b8 4
768
+ .b8 29
769
+ .b8 0
770
+ .b8 49
771
+ .b8 19
772
+ .b8 17
773
+ .b8 1
774
+ .b8 18
775
+ .b8 1
776
+ .b8 88
777
+ .b8 11
778
+ .b8 89
779
+ .b8 11
780
+ .b8 87
781
+ .b8 11
782
+ .b8 0
783
+ .b8 0
784
+ .b8 5
785
+ .b8 29
786
+ .b8 1
787
+ .b8 49
788
+ .b8 19
789
+ .b8 17
790
+ .b8 1
791
+ .b8 18
792
+ .b8 1
793
+ .b8 88
794
+ .b8 11
795
+ .b8 89
796
+ .b8 11
797
+ .b8 87
798
+ .b8 11
799
+ .b8 0
800
+ .b8 0
801
+ .b8 0
802
+ }
803
+ .section .debug_info
804
+ {
805
+ .b32 302
806
+ .b8 2
807
+ .b8 0
808
+ .b32 .debug_abbrev
809
+ .b8 8
810
+ .b8 1
811
+ .b8 116
812
+ .b8 114
813
+ .b8 105
814
+ .b8 116
815
+ .b8 111
816
+ .b8 110
817
+ .b8 0
818
+ .b8 2
819
+ .b8 0
820
+ .b8 99
821
+ .b8 112
822
+ .b8 110
823
+ .b8 51
824
+ .b8 108
825
+ .b8 97
826
+ .b8 119
827
+ .b8 103
828
+ .b8 54
829
+ .b8 53
830
+ .b8 108
831
+ .b8 112
832
+ .b8 105
833
+ .b8 54
834
+ .b8 51
835
+ .b8 103
836
+ .b8 118
837
+ .b8 54
838
+ .b8 99
839
+ .b8 54
840
+ .b8 112
841
+ .b8 110
842
+ .b8 52
843
+ .b8 111
844
+ .b8 105
845
+ .b8 107
846
+ .b8 104
847
+ .b8 103
848
+ .b8 54
849
+ .b8 113
850
+ .b8 118
851
+ .b8 97
852
+ .b8 50
853
+ .b8 104
854
+ .b8 50
855
+ .b8 113
856
+ .b8 106
857
+ .b8 100
858
+ .b8 112
859
+ .b8 120
860
+ .b8 101
861
+ .b8 54
862
+ .b8 113
863
+ .b8 106
864
+ .b8 52
865
+ .b8 108
866
+ .b8 118
867
+ .b8 116
868
+ .b8 116
869
+ .b8 119
870
+ .b8 101
871
+ .b8 122
872
+ .b8 46
873
+ .b8 112
874
+ .b8 121
875
+ .b8 0
876
+ .b32 .debug_line
877
+ .b8 47
878
+ .b8 116
879
+ .b8 109
880
+ .b8 112
881
+ .b8 47
882
+ .b8 116
883
+ .b8 111
884
+ .b8 114
885
+ .b8 99
886
+ .b8 104
887
+ .b8 105
888
+ .b8 110
889
+ .b8 100
890
+ .b8 117
891
+ .b8 99
892
+ .b8 116
893
+ .b8 111
894
+ .b8 114
895
+ .b8 95
896
+ .b8 114
897
+ .b8 111
898
+ .b8 111
899
+ .b8 116
900
+ .b8 47
901
+ .b8 112
902
+ .b8 110
903
+ .b8 0
904
+ .b8 1
905
+ .b64 $L__func_begin0
906
+ .b64 $L__func_end0
907
+ .b8 2
908
+ .b8 116
909
+ .b8 114
910
+ .b8 105
911
+ .b8 116
912
+ .b8 111
913
+ .b8 110
914
+ .b8 95
915
+ .b8 95
916
+ .b8 48
917
+ .b8 100
918
+ .b8 49
919
+ .b8 100
920
+ .b8 50
921
+ .b8 100
922
+ .b8 51
923
+ .b8 100
924
+ .b8 52
925
+ .b8 100
926
+ .b8 53
927
+ .b8 100
928
+ .b8 54
929
+ .b8 100
930
+ .b8 101
931
+ .b8 55
932
+ .b8 100
933
+ .b8 101
934
+ .b8 0
935
+ .b8 116
936
+ .b8 114
937
+ .b8 105
938
+ .b8 116
939
+ .b8 111
940
+ .b8 110
941
+ .b8 95
942
+ .b8 95
943
+ .b8 48
944
+ .b8 100
945
+ .b8 49
946
+ .b8 100
947
+ .b8 50
948
+ .b8 100
949
+ .b8 51
950
+ .b8 100
951
+ .b8 52
952
+ .b8 100
953
+ .b8 53
954
+ .b8 100
955
+ .b8 54
956
+ .b8 100
957
+ .b8 101
958
+ .b8 55
959
+ .b8 100
960
+ .b8 101
961
+ .b8 0
962
+ .b8 1
963
+ .b8 18
964
+ .b8 1
965
+ .b8 1
966
+ .b8 3
967
+ .b64 $L__func_begin0
968
+ .b64 $L__func_end0
969
+ .b8 1
970
+ .b8 156
971
+ .b32 125
972
+ .b8 4
973
+ .b32 125
974
+ .b64 $L__tmp1
975
+ .b64 $L__tmp2
976
+ .b8 2
977
+ .b8 47
978
+ .b8 41
979
+ .b8 5
980
+ .b32 125
981
+ .b64 $L__tmp2
982
+ .b64 $L__tmp15
983
+ .b8 2
984
+ .b8 53
985
+ .b8 44
986
+ .b8 4
987
+ .b32 125
988
+ .b64 $L__tmp2
989
+ .b64 $L__tmp15
990
+ .b8 2
991
+ .b8 120
992
+ .b8 46
993
+ .b8 0
994
+ .b8 4
995
+ .b32 125
996
+ .b64 $L__tmp3
997
+ .b64 $L__tmp16
998
+ .b8 2
999
+ .b8 53
1000
+ .b8 44
1001
+ .b8 0
1002
+ .b8 0
1003
+ }
1004
+ .section .debug_pubnames
1005
+ {
1006
+ .b32 $L__pubNames_end0-$L__pubNames_start0
1007
+ $L__pubNames_start0:
1008
+ .b8 2
1009
+ .b8 0
1010
+ .b32 .debug_info
1011
+ .b32 306
1012
+ .b32 125
1013
+ .b8 116
1014
+ .b8 114
1015
+ .b8 105
1016
+ .b8 116
1017
+ .b8 111
1018
+ .b8 110
1019
+ .b8 95
1020
+ .b8 95
1021
+ .b8 48
1022
+ .b8 100
1023
+ .b8 49
1024
+ .b8 100
1025
+ .b8 50
1026
+ .b8 100
1027
+ .b8 51
1028
+ .b8 100
1029
+ .b8 52
1030
+ .b8 100
1031
+ .b8 53
1032
+ .b8 100
1033
+ .b8 54
1034
+ .b8 100
1035
+ .b8 101
1036
+ .b8 55
1037
+ .b8 100
1038
+ .b8 101
1039
+ .b8 0
1040
+ .b32 0
1041
+ $L__pubNames_end0:
1042
+ }
1043
+ .section .debug_pubtypes
1044
+ {
1045
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1046
+ $L__pubTypes_start0:
1047
+ .b8 2
1048
+ .b8 0
1049
+ .b32 .debug_info
1050
+ .b32 306
1051
+ .b32 0
1052
+ $L__pubTypes_end0:
1053
+ }
1054
+ .section .debug_loc { }
.triton/dump/510522bb05917b836ed253751364fcad/triton_.cubin ADDED
Binary file (66.2 kB). View file
 
.triton/dump/510522bb05917b836ed253751364fcad/triton_.llir ADDED
@@ -0,0 +1,1211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
14
+
15
+ define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
16
+ %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %9 = lshr i32 %8, 3, !dbg !10
18
+ %10 = and i32 %9, 31, !dbg !10
19
+ %11 = and i32 %8, 63, !dbg !10
20
+ %12 = shl i32 %8, 3, !dbg !11
21
+ %13 = and i32 %12, 56, !dbg !11
22
+ %14 = or i32 %13, 4, !dbg !11
23
+ %15 = lshr i32 %8, 6, !dbg !12
24
+ %16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13
25
+ %17 = shl i32 %16, 6, !dbg !14
26
+ %18 = or i32 %17, %10, !dbg !15
27
+ %19 = or i32 %18, 32, !dbg !15
28
+ %20 = or i32 %17, %11, !dbg !15
29
+ %21 = sext i32 %18 to i64, !dbg !16
30
+ %22 = getelementptr i64, ptr addrspace(1) %0, i64 %21, !dbg !16
31
+ %23 = sext i32 %19 to i64, !dbg !16
32
+ %24 = getelementptr i64, ptr addrspace(1) %0, i64 %23, !dbg !16
33
+ %25 = sext i32 %20 to i64, !dbg !16
34
+ %26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !16
35
+ %27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
36
+ %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
37
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
38
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
39
+ %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
40
+ %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
41
+ %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
42
+ %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
43
+ %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
44
+ %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
45
+ %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
46
+ %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
47
+ %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
48
+ %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
49
+ %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
50
+ %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
51
+ %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
52
+ %44 = srem i32 %18, 512, !dbg !18
53
+ %45 = srem i32 %19, 512, !dbg !18
54
+ %46 = shl nsw i32 %44, 8, !dbg !19
55
+ %47 = shl nsw i32 %45, 8, !dbg !19
56
+ %48 = add i64 %43, 50257, !dbg !20
57
+ %49 = icmp slt i64 %27, 0, !dbg !21
58
+ %50 = icmp slt i64 %35, 0, !dbg !21
59
+ %51 = icmp slt i64 %43, 0, !dbg !21
60
+ %52 = select i1 %51, i64 %48, i64 %43, !dbg !22
61
+ %53 = icmp ugt i64 %52, 50256, !dbg !23
62
+ %54 = shl i64 %27, 8, !dbg !24
63
+ %55 = add i64 %54, 12865792, !dbg !24
64
+ %56 = select i1 %49, i64 %55, i64 %54, !dbg !24
65
+ %57 = shl i64 %35, 8, !dbg !24
66
+ %58 = add i64 %57, 12865792, !dbg !24
67
+ %59 = select i1 %50, i64 %58, i64 %57, !dbg !24
68
+ %60 = getelementptr float, ptr addrspace(1) %1, i64 %56
69
+ %61 = getelementptr float, ptr addrspace(1) %1, i64 %59
70
+ br label %62, !dbg !12
71
+
72
+ 62: ; preds = %7, %179
73
+ %63 = phi float [ 0.000000e+00, %7 ], [ %254, %179 ]
74
+ %64 = phi float [ 0.000000e+00, %7 ], [ %255, %179 ]
75
+ %65 = phi float [ 0.000000e+00, %7 ], [ %256, %179 ]
76
+ %66 = phi float [ 0.000000e+00, %7 ], [ %257, %179 ]
77
+ %67 = phi float [ 0.000000e+00, %7 ], [ %258, %179 ]
78
+ %68 = phi float [ 0.000000e+00, %7 ], [ %259, %179 ]
79
+ %69 = phi float [ 0.000000e+00, %7 ], [ %260, %179 ]
80
+ %70 = phi float [ 0.000000e+00, %7 ], [ %261, %179 ]
81
+ %71 = phi float [ 0.000000e+00, %7 ], [ %262, %179 ]
82
+ %72 = phi float [ 0.000000e+00, %7 ], [ %263, %179 ]
83
+ %73 = phi float [ 0.000000e+00, %7 ], [ %264, %179 ]
84
+ %74 = phi float [ 0.000000e+00, %7 ], [ %265, %179 ]
85
+ %75 = phi float [ 0.000000e+00, %7 ], [ %266, %179 ]
86
+ %76 = phi float [ 0.000000e+00, %7 ], [ %267, %179 ]
87
+ %77 = phi float [ 0.000000e+00, %7 ], [ %268, %179 ]
88
+ %78 = phi float [ 0.000000e+00, %7 ], [ %269, %179 ]
89
+ %79 = phi float [ 0.000000e+00, %7 ], [ %270, %179 ]
90
+ %80 = phi float [ 0.000000e+00, %7 ], [ %271, %179 ]
91
+ %81 = phi float [ 0.000000e+00, %7 ], [ %272, %179 ]
92
+ %82 = phi float [ 0.000000e+00, %7 ], [ %273, %179 ]
93
+ %83 = phi float [ 0.000000e+00, %7 ], [ %274, %179 ]
94
+ %84 = phi float [ 0.000000e+00, %7 ], [ %275, %179 ]
95
+ %85 = phi float [ 0.000000e+00, %7 ], [ %276, %179 ]
96
+ %86 = phi float [ 0.000000e+00, %7 ], [ %277, %179 ]
97
+ %87 = phi float [ 0.000000e+00, %7 ], [ %278, %179 ]
98
+ %88 = phi float [ 0.000000e+00, %7 ], [ %279, %179 ]
99
+ %89 = phi float [ 0.000000e+00, %7 ], [ %280, %179 ]
100
+ %90 = phi float [ 0.000000e+00, %7 ], [ %281, %179 ]
101
+ %91 = phi float [ 0.000000e+00, %7 ], [ %282, %179 ]
102
+ %92 = phi float [ 0.000000e+00, %7 ], [ %283, %179 ]
103
+ %93 = phi float [ 0.000000e+00, %7 ], [ %284, %179 ]
104
+ %94 = phi float [ 0.000000e+00, %7 ], [ %285, %179 ]
105
+ %95 = phi float [ 0.000000e+00, %7 ], [ %350, %179 ]
106
+ %96 = phi float [ 0.000000e+00, %7 ], [ %351, %179 ]
107
+ %97 = phi float [ 0.000000e+00, %7 ], [ %352, %179 ]
108
+ %98 = phi float [ 0.000000e+00, %7 ], [ %353, %179 ]
109
+ %99 = phi float [ 0.000000e+00, %7 ], [ %354, %179 ]
110
+ %100 = phi float [ 0.000000e+00, %7 ], [ %355, %179 ]
111
+ %101 = phi float [ 0.000000e+00, %7 ], [ %356, %179 ]
112
+ %102 = phi float [ 0.000000e+00, %7 ], [ %357, %179 ]
113
+ %103 = phi float [ 0.000000e+00, %7 ], [ %358, %179 ]
114
+ %104 = phi float [ 0.000000e+00, %7 ], [ %359, %179 ]
115
+ %105 = phi float [ 0.000000e+00, %7 ], [ %360, %179 ]
116
+ %106 = phi float [ 0.000000e+00, %7 ], [ %361, %179 ]
117
+ %107 = phi float [ 0.000000e+00, %7 ], [ %362, %179 ]
118
+ %108 = phi float [ 0.000000e+00, %7 ], [ %363, %179 ]
119
+ %109 = phi float [ 0.000000e+00, %7 ], [ %364, %179 ]
120
+ %110 = phi float [ 0.000000e+00, %7 ], [ %365, %179 ]
121
+ %111 = phi float [ 0.000000e+00, %7 ], [ %302, %179 ]
122
+ %112 = phi float [ 0.000000e+00, %7 ], [ %303, %179 ]
123
+ %113 = phi float [ 0.000000e+00, %7 ], [ %304, %179 ]
124
+ %114 = phi float [ 0.000000e+00, %7 ], [ %305, %179 ]
125
+ %115 = phi float [ 0.000000e+00, %7 ], [ %306, %179 ]
126
+ %116 = phi float [ 0.000000e+00, %7 ], [ %307, %179 ]
127
+ %117 = phi float [ 0.000000e+00, %7 ], [ %308, %179 ]
128
+ %118 = phi float [ 0.000000e+00, %7 ], [ %309, %179 ]
129
+ %119 = phi float [ 0.000000e+00, %7 ], [ %310, %179 ]
130
+ %120 = phi float [ 0.000000e+00, %7 ], [ %311, %179 ]
131
+ %121 = phi float [ 0.000000e+00, %7 ], [ %312, %179 ]
132
+ %122 = phi float [ 0.000000e+00, %7 ], [ %313, %179 ]
133
+ %123 = phi float [ 0.000000e+00, %7 ], [ %314, %179 ]
134
+ %124 = phi float [ 0.000000e+00, %7 ], [ %315, %179 ]
135
+ %125 = phi float [ 0.000000e+00, %7 ], [ %316, %179 ]
136
+ %126 = phi float [ 0.000000e+00, %7 ], [ %317, %179 ]
137
+ %127 = phi i32 [ 0, %7 ], [ %366, %179 ]
138
+ %128 = or i32 %127, %13, !dbg !25
139
+ %129 = or i32 %127, %14, !dbg !25
140
+ %130 = add i32 %128, %46, !dbg !26
141
+ %131 = add i32 %129, %46, !dbg !26
142
+ %132 = add i32 %128, %47, !dbg !26
143
+ %133 = add i32 %129, %47, !dbg !26
144
+ %134 = sext i32 %130 to i64, !dbg !27
145
+ %135 = getelementptr float, ptr addrspace(1) %2, i64 %134, !dbg !27
146
+ %136 = sext i32 %131 to i64, !dbg !27
147
+ %137 = getelementptr float, ptr addrspace(1) %2, i64 %136, !dbg !27
148
+ %138 = sext i32 %132 to i64, !dbg !27
149
+ %139 = getelementptr float, ptr addrspace(1) %2, i64 %138, !dbg !27
150
+ %140 = sext i32 %133 to i64, !dbg !27
151
+ %141 = getelementptr float, ptr addrspace(1) %2, i64 %140, !dbg !27
152
+ %142 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %135, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
153
+ %143 = extractvalue { i32, i32, i32, i32 } %142, 0, !dbg !28
154
+ %144 = extractvalue { i32, i32, i32, i32 } %142, 1, !dbg !28
155
+ %145 = extractvalue { i32, i32, i32, i32 } %142, 2, !dbg !28
156
+ %146 = extractvalue { i32, i32, i32, i32 } %142, 3, !dbg !28
157
+ %147 = bitcast i32 %143 to float, !dbg !28
158
+ %148 = bitcast i32 %144 to float, !dbg !28
159
+ %149 = bitcast i32 %145 to float, !dbg !28
160
+ %150 = bitcast i32 %146 to float, !dbg !28
161
+ %151 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %137, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
162
+ %152 = extractvalue { i32, i32, i32, i32 } %151, 0, !dbg !28
163
+ %153 = extractvalue { i32, i32, i32, i32 } %151, 1, !dbg !28
164
+ %154 = extractvalue { i32, i32, i32, i32 } %151, 2, !dbg !28
165
+ %155 = extractvalue { i32, i32, i32, i32 } %151, 3, !dbg !28
166
+ %156 = bitcast i32 %152 to float, !dbg !28
167
+ %157 = bitcast i32 %153 to float, !dbg !28
168
+ %158 = bitcast i32 %154 to float, !dbg !28
169
+ %159 = bitcast i32 %155 to float, !dbg !28
170
+ %160 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %139, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
171
+ %161 = extractvalue { i32, i32, i32, i32 } %160, 0, !dbg !28
172
+ %162 = extractvalue { i32, i32, i32, i32 } %160, 1, !dbg !28
173
+ %163 = extractvalue { i32, i32, i32, i32 } %160, 2, !dbg !28
174
+ %164 = extractvalue { i32, i32, i32, i32 } %160, 3, !dbg !28
175
+ %165 = bitcast i32 %161 to float, !dbg !28
176
+ %166 = bitcast i32 %162 to float, !dbg !28
177
+ %167 = bitcast i32 %163 to float, !dbg !28
178
+ %168 = bitcast i32 %164 to float, !dbg !28
179
+ %169 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %141, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
180
+ %170 = extractvalue { i32, i32, i32, i32 } %169, 0, !dbg !28
181
+ %171 = extractvalue { i32, i32, i32, i32 } %169, 1, !dbg !28
182
+ %172 = extractvalue { i32, i32, i32, i32 } %169, 2, !dbg !28
183
+ %173 = extractvalue { i32, i32, i32, i32 } %169, 3, !dbg !28
184
+ %174 = bitcast i32 %170 to float, !dbg !28
185
+ %175 = bitcast i32 %171 to float, !dbg !28
186
+ %176 = bitcast i32 %172 to float, !dbg !28
187
+ %177 = bitcast i32 %173 to float, !dbg !28
188
+ br i1 %53, label %178, label %179, !dbg !29
189
+
190
+ 178: ; preds = %62
191
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !29
192
+ br label %179, !dbg !29
193
+
194
+ 179: ; preds = %178, %62
195
+ %180 = zext nneg i32 %128 to i64, !dbg !30
196
+ %181 = zext nneg i32 %129 to i64, !dbg !30
197
+ %182 = getelementptr float, ptr addrspace(1) %60, i64 %180, !dbg !31
198
+ %183 = getelementptr float, ptr addrspace(1) %60, i64 %181, !dbg !31
199
+ %184 = getelementptr float, ptr addrspace(1) %61, i64 %180, !dbg !31
200
+ %185 = getelementptr float, ptr addrspace(1) %61, i64 %181, !dbg !31
201
+ %186 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %182, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
202
+ %187 = extractvalue { i32, i32, i32, i32 } %186, 0, !dbg !32
203
+ %188 = extractvalue { i32, i32, i32, i32 } %186, 1, !dbg !32
204
+ %189 = extractvalue { i32, i32, i32, i32 } %186, 2, !dbg !32
205
+ %190 = extractvalue { i32, i32, i32, i32 } %186, 3, !dbg !32
206
+ %191 = bitcast i32 %187 to float, !dbg !32
207
+ %192 = bitcast i32 %188 to float, !dbg !32
208
+ %193 = bitcast i32 %189 to float, !dbg !32
209
+ %194 = bitcast i32 %190 to float, !dbg !32
210
+ %195 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %183, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
211
+ %196 = extractvalue { i32, i32, i32, i32 } %195, 0, !dbg !32
212
+ %197 = extractvalue { i32, i32, i32, i32 } %195, 1, !dbg !32
213
+ %198 = extractvalue { i32, i32, i32, i32 } %195, 2, !dbg !32
214
+ %199 = extractvalue { i32, i32, i32, i32 } %195, 3, !dbg !32
215
+ %200 = bitcast i32 %196 to float, !dbg !32
216
+ %201 = bitcast i32 %197 to float, !dbg !32
217
+ %202 = bitcast i32 %198 to float, !dbg !32
218
+ %203 = bitcast i32 %199 to float, !dbg !32
219
+ %204 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %184, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
220
+ %205 = extractvalue { i32, i32, i32, i32 } %204, 0, !dbg !32
221
+ %206 = extractvalue { i32, i32, i32, i32 } %204, 1, !dbg !32
222
+ %207 = extractvalue { i32, i32, i32, i32 } %204, 2, !dbg !32
223
+ %208 = extractvalue { i32, i32, i32, i32 } %204, 3, !dbg !32
224
+ %209 = bitcast i32 %205 to float, !dbg !32
225
+ %210 = bitcast i32 %206 to float, !dbg !32
226
+ %211 = bitcast i32 %207 to float, !dbg !32
227
+ %212 = bitcast i32 %208 to float, !dbg !32
228
+ %213 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %185, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
229
+ %214 = extractvalue { i32, i32, i32, i32 } %213, 0, !dbg !32
230
+ %215 = extractvalue { i32, i32, i32, i32 } %213, 1, !dbg !32
231
+ %216 = extractvalue { i32, i32, i32, i32 } %213, 2, !dbg !32
232
+ %217 = extractvalue { i32, i32, i32, i32 } %213, 3, !dbg !32
233
+ %218 = bitcast i32 %214 to float, !dbg !32
234
+ %219 = bitcast i32 %215 to float, !dbg !32
235
+ %220 = bitcast i32 %216 to float, !dbg !32
236
+ %221 = bitcast i32 %217 to float, !dbg !32
237
+ %222 = fadd float %147, %191, !dbg !33
238
+ %223 = fadd float %148, %192, !dbg !33
239
+ %224 = fadd float %149, %193, !dbg !33
240
+ %225 = fadd float %150, %194, !dbg !33
241
+ %226 = fadd float %156, %200, !dbg !33
242
+ %227 = fadd float %157, %201, !dbg !33
243
+ %228 = fadd float %158, %202, !dbg !33
244
+ %229 = fadd float %159, %203, !dbg !33
245
+ %230 = fadd float %165, %209, !dbg !33
246
+ %231 = fadd float %166, %210, !dbg !33
247
+ %232 = fadd float %167, %211, !dbg !33
248
+ %233 = fadd float %168, %212, !dbg !33
249
+ %234 = fadd float %174, %218, !dbg !33
250
+ %235 = fadd float %175, %219, !dbg !33
251
+ %236 = fadd float %176, %220, !dbg !33
252
+ %237 = fadd float %177, %221, !dbg !33
253
+ %238 = fsub float %222, %111, !dbg !34
254
+ %239 = fsub float %223, %112, !dbg !34
255
+ %240 = fsub float %224, %113, !dbg !34
256
+ %241 = fsub float %225, %114, !dbg !34
257
+ %242 = fsub float %226, %115, !dbg !34
258
+ %243 = fsub float %227, %116, !dbg !34
259
+ %244 = fsub float %228, %117, !dbg !34
260
+ %245 = fsub float %229, %118, !dbg !34
261
+ %246 = fsub float %230, %119, !dbg !34
262
+ %247 = fsub float %231, %120, !dbg !34
263
+ %248 = fsub float %232, %121, !dbg !34
264
+ %249 = fsub float %233, %122, !dbg !34
265
+ %250 = fsub float %234, %123, !dbg !34
266
+ %251 = fsub float %235, %124, !dbg !34
267
+ %252 = fsub float %236, %125, !dbg !34
268
+ %253 = fsub float %237, %126, !dbg !34
269
+ %254 = fadd float %63, 1.000000e+00, !dbg !38
270
+ %255 = fadd float %64, 1.000000e+00, !dbg !38
271
+ %256 = fadd float %65, 1.000000e+00, !dbg !38
272
+ %257 = fadd float %66, 1.000000e+00, !dbg !38
273
+ %258 = fadd float %67, 1.000000e+00, !dbg !38
274
+ %259 = fadd float %68, 1.000000e+00, !dbg !38
275
+ %260 = fadd float %69, 1.000000e+00, !dbg !38
276
+ %261 = fadd float %70, 1.000000e+00, !dbg !38
277
+ %262 = fadd float %71, 1.000000e+00, !dbg !38
278
+ %263 = fadd float %72, 1.000000e+00, !dbg !38
279
+ %264 = fadd float %73, 1.000000e+00, !dbg !38
280
+ %265 = fadd float %74, 1.000000e+00, !dbg !38
281
+ %266 = fadd float %75, 1.000000e+00, !dbg !38
282
+ %267 = fadd float %76, 1.000000e+00, !dbg !38
283
+ %268 = fadd float %77, 1.000000e+00, !dbg !38
284
+ %269 = fadd float %78, 1.000000e+00, !dbg !38
285
+ %270 = fadd float %79, 1.000000e+00, !dbg !38
286
+ %271 = fadd float %80, 1.000000e+00, !dbg !38
287
+ %272 = fadd float %81, 1.000000e+00, !dbg !38
288
+ %273 = fadd float %82, 1.000000e+00, !dbg !38
289
+ %274 = fadd float %83, 1.000000e+00, !dbg !38
290
+ %275 = fadd float %84, 1.000000e+00, !dbg !38
291
+ %276 = fadd float %85, 1.000000e+00, !dbg !38
292
+ %277 = fadd float %86, 1.000000e+00, !dbg !38
293
+ %278 = fadd float %87, 1.000000e+00, !dbg !38
294
+ %279 = fadd float %88, 1.000000e+00, !dbg !38
295
+ %280 = fadd float %89, 1.000000e+00, !dbg !38
296
+ %281 = fadd float %90, 1.000000e+00, !dbg !38
297
+ %282 = fadd float %91, 1.000000e+00, !dbg !38
298
+ %283 = fadd float %92, 1.000000e+00, !dbg !38
299
+ %284 = fadd float %93, 1.000000e+00, !dbg !38
300
+ %285 = fadd float %94, 1.000000e+00, !dbg !38
301
+ %286 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %238, float %254) #6, !dbg !39
302
+ %287 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %239, float %255) #6, !dbg !39
303
+ %288 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %240, float %256) #6, !dbg !39
304
+ %289 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %241, float %257) #6, !dbg !39
305
+ %290 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %242, float %258) #6, !dbg !39
306
+ %291 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %243, float %259) #6, !dbg !39
307
+ %292 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %244, float %260) #6, !dbg !39
308
+ %293 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %245, float %261) #6, !dbg !39
309
+ %294 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %246, float %262) #6, !dbg !39
310
+ %295 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %247, float %263) #6, !dbg !39
311
+ %296 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %248, float %264) #6, !dbg !39
312
+ %297 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %249, float %265) #6, !dbg !39
313
+ %298 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %250, float %266) #6, !dbg !39
314
+ %299 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %251, float %267) #6, !dbg !39
315
+ %300 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %252, float %268) #6, !dbg !39
316
+ %301 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %253, float %269) #6, !dbg !39
317
+ %302 = fadd float %111, %286, !dbg !40
318
+ %303 = fadd float %112, %287, !dbg !40
319
+ %304 = fadd float %113, %288, !dbg !40
320
+ %305 = fadd float %114, %289, !dbg !40
321
+ %306 = fadd float %115, %290, !dbg !40
322
+ %307 = fadd float %116, %291, !dbg !40
323
+ %308 = fadd float %117, %292, !dbg !40
324
+ %309 = fadd float %118, %293, !dbg !40
325
+ %310 = fadd float %119, %294, !dbg !40
326
+ %311 = fadd float %120, %295, !dbg !40
327
+ %312 = fadd float %121, %296, !dbg !40
328
+ %313 = fadd float %122, %297, !dbg !40
329
+ %314 = fadd float %123, %298, !dbg !40
330
+ %315 = fadd float %124, %299, !dbg !40
331
+ %316 = fadd float %125, %300, !dbg !40
332
+ %317 = fadd float %126, %301, !dbg !40
333
+ %318 = fsub float %222, %302, !dbg !41
334
+ %319 = fsub float %223, %303, !dbg !41
335
+ %320 = fsub float %224, %304, !dbg !41
336
+ %321 = fsub float %225, %305, !dbg !41
337
+ %322 = fsub float %226, %306, !dbg !41
338
+ %323 = fsub float %227, %307, !dbg !41
339
+ %324 = fsub float %228, %308, !dbg !41
340
+ %325 = fsub float %229, %309, !dbg !41
341
+ %326 = fsub float %230, %310, !dbg !41
342
+ %327 = fsub float %231, %311, !dbg !41
343
+ %328 = fsub float %232, %312, !dbg !41
344
+ %329 = fsub float %233, %313, !dbg !41
345
+ %330 = fsub float %234, %314, !dbg !41
346
+ %331 = fsub float %235, %315, !dbg !41
347
+ %332 = fsub float %236, %316, !dbg !41
348
+ %333 = fsub float %237, %317, !dbg !41
349
+ %334 = fmul float %238, %318, !dbg !42
350
+ %335 = fmul float %239, %319, !dbg !42
351
+ %336 = fmul float %240, %320, !dbg !42
352
+ %337 = fmul float %241, %321, !dbg !42
353
+ %338 = fmul float %242, %322, !dbg !42
354
+ %339 = fmul float %243, %323, !dbg !42
355
+ %340 = fmul float %244, %324, !dbg !42
356
+ %341 = fmul float %245, %325, !dbg !42
357
+ %342 = fmul float %246, %326, !dbg !42
358
+ %343 = fmul float %247, %327, !dbg !42
359
+ %344 = fmul float %248, %328, !dbg !42
360
+ %345 = fmul float %249, %329, !dbg !42
361
+ %346 = fmul float %250, %330, !dbg !42
362
+ %347 = fmul float %251, %331, !dbg !42
363
+ %348 = fmul float %252, %332, !dbg !42
364
+ %349 = fmul float %253, %333, !dbg !42
365
+ %350 = fadd float %95, %334, !dbg !43
366
+ %351 = fadd float %96, %335, !dbg !43
367
+ %352 = fadd float %97, %336, !dbg !43
368
+ %353 = fadd float %98, %337, !dbg !43
369
+ %354 = fadd float %99, %338, !dbg !43
370
+ %355 = fadd float %100, %339, !dbg !43
371
+ %356 = fadd float %101, %340, !dbg !43
372
+ %357 = fadd float %102, %341, !dbg !43
373
+ %358 = fadd float %103, %342, !dbg !43
374
+ %359 = fadd float %104, %343, !dbg !43
375
+ %360 = fadd float %105, %344, !dbg !43
376
+ %361 = fadd float %106, %345, !dbg !43
377
+ %362 = fadd float %107, %346, !dbg !43
378
+ %363 = fadd float %108, %347, !dbg !43
379
+ %364 = fadd float %109, %348, !dbg !43
380
+ %365 = fadd float %110, %349, !dbg !43
381
+ %366 = add nuw nsw i32 %127, 64, !dbg !12
382
+ %367 = icmp ult i32 %127, 192, !dbg !12
383
+ br i1 %367, label %62, label %368, !dbg !12
384
+
385
+ 368: ; preds = %179
386
+ %369 = and i32 %15, 3, !dbg !12
387
+ %370 = mul nuw nsw i32 %369, 72, !dbg !12
388
+ %371 = add nuw nsw i32 %370, %11, !dbg !12
389
+ %372 = zext nneg i32 %371 to i64, !dbg !12
390
+ %373 = getelementptr float, ptr addrspace(3) @global_smem, i64 %372, !dbg !12
391
+ %374 = insertelement <1 x float> undef, float %270, i64 0, !dbg !12
392
+ store <1 x float> %374, ptr addrspace(3) %373, align 4, !dbg !12
393
+ %375 = add nuw nsw i32 %11, 288, !dbg !12
394
+ %376 = add nuw nsw i32 %375, %370, !dbg !12
395
+ %377 = zext nneg i32 %376 to i64, !dbg !12
396
+ %378 = getelementptr float, ptr addrspace(3) @global_smem, i64 %377, !dbg !12
397
+ %379 = insertelement <1 x float> undef, float %271, i64 0, !dbg !12
398
+ store <1 x float> %379, ptr addrspace(3) %378, align 4, !dbg !12
399
+ %380 = or i32 %11, 576, !dbg !12
400
+ %381 = add nuw nsw i32 %380, %370, !dbg !12
401
+ %382 = zext nneg i32 %381 to i64, !dbg !12
402
+ %383 = getelementptr float, ptr addrspace(3) @global_smem, i64 %382, !dbg !12
403
+ %384 = insertelement <1 x float> undef, float %272, i64 0, !dbg !12
404
+ store <1 x float> %384, ptr addrspace(3) %383, align 4, !dbg !12
405
+ %385 = add nuw nsw i32 %11, 864, !dbg !12
406
+ %386 = add nuw nsw i32 %385, %370, !dbg !12
407
+ %387 = zext nneg i32 %386 to i64, !dbg !12
408
+ %388 = getelementptr float, ptr addrspace(3) @global_smem, i64 %387, !dbg !12
409
+ %389 = insertelement <1 x float> undef, float %273, i64 0, !dbg !12
410
+ store <1 x float> %389, ptr addrspace(3) %388, align 4, !dbg !12
411
+ %390 = or i32 %11, 1152, !dbg !12
412
+ %391 = add nuw nsw i32 %390, %370, !dbg !12
413
+ %392 = zext nneg i32 %391 to i64, !dbg !12
414
+ %393 = getelementptr float, ptr addrspace(3) @global_smem, i64 %392, !dbg !12
415
+ %394 = insertelement <1 x float> undef, float %274, i64 0, !dbg !12
416
+ store <1 x float> %394, ptr addrspace(3) %393, align 4, !dbg !12
417
+ %395 = add nuw nsw i32 %11, 1440, !dbg !12
418
+ %396 = add nuw nsw i32 %395, %370, !dbg !12
419
+ %397 = zext nneg i32 %396 to i64, !dbg !12
420
+ %398 = getelementptr float, ptr addrspace(3) @global_smem, i64 %397, !dbg !12
421
+ %399 = insertelement <1 x float> undef, float %275, i64 0, !dbg !12
422
+ store <1 x float> %399, ptr addrspace(3) %398, align 4, !dbg !12
423
+ %400 = or i32 %11, 1728, !dbg !12
424
+ %401 = add nuw nsw i32 %400, %370, !dbg !12
425
+ %402 = zext nneg i32 %401 to i64, !dbg !12
426
+ %403 = getelementptr float, ptr addrspace(3) @global_smem, i64 %402, !dbg !12
427
+ %404 = insertelement <1 x float> undef, float %276, i64 0, !dbg !12
428
+ store <1 x float> %404, ptr addrspace(3) %403, align 4, !dbg !12
429
+ %405 = add nuw nsw i32 %11, 2016, !dbg !12
430
+ %406 = add nuw nsw i32 %405, %370, !dbg !12
431
+ %407 = zext nneg i32 %406 to i64, !dbg !12
432
+ %408 = getelementptr float, ptr addrspace(3) @global_smem, i64 %407, !dbg !12
433
+ %409 = insertelement <1 x float> undef, float %277, i64 0, !dbg !12
434
+ store <1 x float> %409, ptr addrspace(3) %408, align 4, !dbg !12
435
+ tail call void @llvm.nvvm.barrier0(), !dbg !12
436
+ %410 = mul nuw nsw i32 %10, 72, !dbg !12
437
+ %411 = add nuw nsw i32 %410, %13, !dbg !12
438
+ %412 = zext nneg i32 %411 to i64, !dbg !12
439
+ %413 = getelementptr float, ptr addrspace(3) @global_smem, i64 %412, !dbg !12
440
+ %414 = load float, ptr addrspace(3) %413, align 32, !dbg !12
441
+ %415 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 1, !dbg !12
442
+ %416 = load float, ptr addrspace(3) %415, align 4, !dbg !12
443
+ %417 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 2, !dbg !12
444
+ %418 = load float, ptr addrspace(3) %417, align 8, !dbg !12
445
+ %419 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 3, !dbg !12
446
+ %420 = load float, ptr addrspace(3) %419, align 4, !dbg !12
447
+ %421 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 4, !dbg !12
448
+ %422 = load float, ptr addrspace(3) %421, align 16, !dbg !12
449
+ %423 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 5, !dbg !12
450
+ %424 = load float, ptr addrspace(3) %423, align 4, !dbg !12
451
+ %425 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 6, !dbg !12
452
+ %426 = load float, ptr addrspace(3) %425, align 8, !dbg !12
453
+ %427 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 7, !dbg !12
454
+ %428 = load float, ptr addrspace(3) %427, align 4, !dbg !12
455
+ tail call void @llvm.nvvm.barrier0(), !dbg !12
456
+ %429 = insertelement <1 x float> undef, float %278, i64 0, !dbg !12
457
+ store <1 x float> %429, ptr addrspace(3) %373, align 4, !dbg !12
458
+ %430 = insertelement <1 x float> undef, float %279, i64 0, !dbg !12
459
+ store <1 x float> %430, ptr addrspace(3) %378, align 4, !dbg !12
460
+ %431 = insertelement <1 x float> undef, float %280, i64 0, !dbg !12
461
+ store <1 x float> %431, ptr addrspace(3) %383, align 4, !dbg !12
462
+ %432 = insertelement <1 x float> undef, float %281, i64 0, !dbg !12
463
+ store <1 x float> %432, ptr addrspace(3) %388, align 4, !dbg !12
464
+ %433 = insertelement <1 x float> undef, float %282, i64 0, !dbg !12
465
+ store <1 x float> %433, ptr addrspace(3) %393, align 4, !dbg !12
466
+ %434 = insertelement <1 x float> undef, float %283, i64 0, !dbg !12
467
+ store <1 x float> %434, ptr addrspace(3) %398, align 4, !dbg !12
468
+ %435 = insertelement <1 x float> undef, float %284, i64 0, !dbg !12
469
+ store <1 x float> %435, ptr addrspace(3) %403, align 4, !dbg !12
470
+ %436 = insertelement <1 x float> undef, float %285, i64 0, !dbg !12
471
+ store <1 x float> %436, ptr addrspace(3) %408, align 4, !dbg !12
472
+ tail call void @llvm.nvvm.barrier0(), !dbg !12
473
+ %437 = load float, ptr addrspace(3) %413, align 32, !dbg !12
474
+ %438 = load float, ptr addrspace(3) %415, align 4, !dbg !12
475
+ %439 = load float, ptr addrspace(3) %417, align 8, !dbg !12
476
+ %440 = load float, ptr addrspace(3) %419, align 4, !dbg !12
477
+ %441 = load float, ptr addrspace(3) %421, align 16, !dbg !12
478
+ %442 = load float, ptr addrspace(3) %423, align 4, !dbg !12
479
+ %443 = load float, ptr addrspace(3) %425, align 8, !dbg !12
480
+ %444 = load float, ptr addrspace(3) %427, align 4, !dbg !12
481
+ %445 = fsub float %303, %302, !dbg !44
482
+ %446 = fadd float %414, %416, !dbg !48
483
+ %447 = fcmp oeq float %446, 0.000000e+00, !dbg !49
484
+ %448 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %416, float %446) #6, !dbg !50
485
+ %449 = select i1 %447, float 0.000000e+00, float %448, !dbg !51
486
+ %450 = fmul float %445, %449, !dbg !52
487
+ %451 = fadd float %302, %450, !dbg !53
488
+ %452 = fadd float %350, %351, !dbg !54
489
+ %453 = fmul float %445, %445, !dbg !55
490
+ %454 = fmul float %453, %414, !dbg !56
491
+ %455 = fmul float %454, %449, !dbg !57
492
+ %456 = fadd float %452, %455, !dbg !58
493
+ %457 = fsub float %304, %451, !dbg !44
494
+ %458 = fadd float %418, %446, !dbg !48
495
+ %459 = fcmp oeq float %458, 0.000000e+00, !dbg !49
496
+ %460 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %418, float %458) #6, !dbg !50
497
+ %461 = select i1 %459, float 0.000000e+00, float %460, !dbg !51
498
+ %462 = fmul float %461, %457, !dbg !52
499
+ %463 = fadd float %451, %462, !dbg !53
500
+ %464 = fadd float %352, %456, !dbg !54
501
+ %465 = fmul float %457, %457, !dbg !55
502
+ %466 = fmul float %446, %465, !dbg !56
503
+ %467 = fmul float %461, %466, !dbg !57
504
+ %468 = fadd float %464, %467, !dbg !58
505
+ %469 = fsub float %305, %463, !dbg !44
506
+ %470 = fadd float %420, %458, !dbg !48
507
+ %471 = fcmp oeq float %470, 0.000000e+00, !dbg !49
508
+ %472 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %420, float %470) #6, !dbg !50
509
+ %473 = select i1 %471, float 0.000000e+00, float %472, !dbg !51
510
+ %474 = fmul float %473, %469, !dbg !52
511
+ %475 = fadd float %463, %474, !dbg !53
512
+ %476 = fadd float %353, %468, !dbg !54
513
+ %477 = fmul float %469, %469, !dbg !55
514
+ %478 = fmul float %458, %477, !dbg !56
515
+ %479 = fmul float %473, %478, !dbg !57
516
+ %480 = fadd float %476, %479, !dbg !58
517
+ %481 = fsub float %306, %475, !dbg !44
518
+ %482 = fadd float %422, %470, !dbg !48
519
+ %483 = fcmp oeq float %482, 0.000000e+00, !dbg !49
520
+ %484 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %422, float %482) #6, !dbg !50
521
+ %485 = select i1 %483, float 0.000000e+00, float %484, !dbg !51
522
+ %486 = fmul float %485, %481, !dbg !52
523
+ %487 = fadd float %475, %486, !dbg !53
524
+ %488 = fadd float %354, %480, !dbg !54
525
+ %489 = fmul float %481, %481, !dbg !55
526
+ %490 = fmul float %470, %489, !dbg !56
527
+ %491 = fmul float %485, %490, !dbg !57
528
+ %492 = fadd float %488, %491, !dbg !58
529
+ %493 = fsub float %307, %487, !dbg !44
530
+ %494 = fadd float %424, %482, !dbg !48
531
+ %495 = fcmp oeq float %494, 0.000000e+00, !dbg !49
532
+ %496 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %424, float %494) #6, !dbg !50
533
+ %497 = select i1 %495, float 0.000000e+00, float %496, !dbg !51
534
+ %498 = fmul float %497, %493, !dbg !52
535
+ %499 = fadd float %487, %498, !dbg !53
536
+ %500 = fadd float %355, %492, !dbg !54
537
+ %501 = fmul float %493, %493, !dbg !55
538
+ %502 = fmul float %482, %501, !dbg !56
539
+ %503 = fmul float %497, %502, !dbg !57
540
+ %504 = fadd float %500, %503, !dbg !58
541
+ %505 = fsub float %308, %499, !dbg !44
542
+ %506 = fadd float %426, %494, !dbg !48
543
+ %507 = fcmp oeq float %506, 0.000000e+00, !dbg !49
544
+ %508 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %426, float %506) #6, !dbg !50
545
+ %509 = select i1 %507, float 0.000000e+00, float %508, !dbg !51
546
+ %510 = fmul float %509, %505, !dbg !52
547
+ %511 = fadd float %499, %510, !dbg !53
548
+ %512 = fadd float %356, %504, !dbg !54
549
+ %513 = fmul float %505, %505, !dbg !55
550
+ %514 = fmul float %494, %513, !dbg !56
551
+ %515 = fmul float %509, %514, !dbg !57
552
+ %516 = fadd float %512, %515, !dbg !58
553
+ %517 = fsub float %309, %511, !dbg !44
554
+ %518 = fadd float %428, %506, !dbg !48
555
+ %519 = fcmp oeq float %518, 0.000000e+00, !dbg !49
556
+ %520 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float %518) #6, !dbg !50
557
+ %521 = select i1 %519, float 0.000000e+00, float %520, !dbg !51
558
+ %522 = fmul float %521, %517, !dbg !52
559
+ %523 = fadd float %511, %522, !dbg !53
560
+ %524 = fadd float %357, %516, !dbg !54
561
+ %525 = fmul float %517, %517, !dbg !55
562
+ %526 = fmul float %506, %525, !dbg !56
563
+ %527 = fmul float %521, %526, !dbg !57
564
+ %528 = fadd float %524, %527, !dbg !58
565
+ %529 = fsub float %311, %310, !dbg !44
566
+ %530 = fadd float %437, %438, !dbg !48
567
+ %531 = fcmp oeq float %530, 0.000000e+00, !dbg !49
568
+ %532 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %438, float %530) #6, !dbg !50
569
+ %533 = select i1 %531, float 0.000000e+00, float %532, !dbg !51
570
+ %534 = fmul float %529, %533, !dbg !52
571
+ %535 = fadd float %310, %534, !dbg !53
572
+ %536 = fadd float %358, %359, !dbg !54
573
+ %537 = fmul float %529, %529, !dbg !55
574
+ %538 = fmul float %537, %437, !dbg !56
575
+ %539 = fmul float %538, %533, !dbg !57
576
+ %540 = fadd float %536, %539, !dbg !58
577
+ %541 = fsub float %312, %535, !dbg !44
578
+ %542 = fadd float %439, %530, !dbg !48
579
+ %543 = fcmp oeq float %542, 0.000000e+00, !dbg !49
580
+ %544 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %439, float %542) #6, !dbg !50
581
+ %545 = select i1 %543, float 0.000000e+00, float %544, !dbg !51
582
+ %546 = fmul float %545, %541, !dbg !52
583
+ %547 = fadd float %535, %546, !dbg !53
584
+ %548 = fadd float %360, %540, !dbg !54
585
+ %549 = fmul float %541, %541, !dbg !55
586
+ %550 = fmul float %530, %549, !dbg !56
587
+ %551 = fmul float %545, %550, !dbg !57
588
+ %552 = fadd float %548, %551, !dbg !58
589
+ %553 = fsub float %313, %547, !dbg !44
590
+ %554 = fadd float %440, %542, !dbg !48
591
+ %555 = fcmp oeq float %554, 0.000000e+00, !dbg !49
592
+ %556 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %440, float %554) #6, !dbg !50
593
+ %557 = select i1 %555, float 0.000000e+00, float %556, !dbg !51
594
+ %558 = fmul float %557, %553, !dbg !52
595
+ %559 = fadd float %547, %558, !dbg !53
596
+ %560 = fadd float %361, %552, !dbg !54
597
+ %561 = fmul float %553, %553, !dbg !55
598
+ %562 = fmul float %542, %561, !dbg !56
599
+ %563 = fmul float %557, %562, !dbg !57
600
+ %564 = fadd float %560, %563, !dbg !58
601
+ %565 = fsub float %314, %559, !dbg !44
602
+ %566 = fadd float %441, %554, !dbg !48
603
+ %567 = fcmp oeq float %566, 0.000000e+00, !dbg !49
604
+ %568 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %441, float %566) #6, !dbg !50
605
+ %569 = select i1 %567, float 0.000000e+00, float %568, !dbg !51
606
+ %570 = fmul float %569, %565, !dbg !52
607
+ %571 = fadd float %559, %570, !dbg !53
608
+ %572 = fadd float %362, %564, !dbg !54
609
+ %573 = fmul float %565, %565, !dbg !55
610
+ %574 = fmul float %554, %573, !dbg !56
611
+ %575 = fmul float %569, %574, !dbg !57
612
+ %576 = fadd float %572, %575, !dbg !58
613
+ %577 = fsub float %315, %571, !dbg !44
614
+ %578 = fadd float %442, %566, !dbg !48
615
+ %579 = fcmp oeq float %578, 0.000000e+00, !dbg !49
616
+ %580 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %442, float %578) #6, !dbg !50
617
+ %581 = select i1 %579, float 0.000000e+00, float %580, !dbg !51
618
+ %582 = fmul float %581, %577, !dbg !52
619
+ %583 = fadd float %571, %582, !dbg !53
620
+ %584 = fadd float %363, %576, !dbg !54
621
+ %585 = fmul float %577, %577, !dbg !55
622
+ %586 = fmul float %566, %585, !dbg !56
623
+ %587 = fmul float %581, %586, !dbg !57
624
+ %588 = fadd float %584, %587, !dbg !58
625
+ %589 = fsub float %316, %583, !dbg !44
626
+ %590 = fadd float %443, %578, !dbg !48
627
+ %591 = fcmp oeq float %590, 0.000000e+00, !dbg !49
628
+ %592 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %443, float %590) #6, !dbg !50
629
+ %593 = select i1 %591, float 0.000000e+00, float %592, !dbg !51
630
+ %594 = fmul float %593, %589, !dbg !52
631
+ %595 = fadd float %583, %594, !dbg !53
632
+ %596 = fadd float %364, %588, !dbg !54
633
+ %597 = fmul float %589, %589, !dbg !55
634
+ %598 = fmul float %578, %597, !dbg !56
635
+ %599 = fmul float %593, %598, !dbg !57
636
+ %600 = fadd float %596, %599, !dbg !58
637
+ %601 = fsub float %317, %595, !dbg !44
638
+ %602 = fadd float %444, %590, !dbg !48
639
+ %603 = fcmp oeq float %602, 0.000000e+00, !dbg !49
640
+ %604 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %444, float %602) #6, !dbg !50
641
+ %605 = select i1 %603, float 0.000000e+00, float %604, !dbg !51
642
+ %606 = fmul float %605, %601, !dbg !52
643
+ %607 = fadd float %595, %606, !dbg !53
644
+ %608 = fadd float %365, %600, !dbg !54
645
+ %609 = fmul float %601, %601, !dbg !55
646
+ %610 = fmul float %590, %609, !dbg !56
647
+ %611 = fmul float %605, %610, !dbg !57
648
+ %612 = fadd float %608, %611, !dbg !58
649
+ %613 = bitcast float %523 to i32, !dbg !59
650
+ %614 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %613, i32 4, i32 31), !dbg !59
651
+ %615 = bitcast i32 %614 to float, !dbg !59
652
+ %616 = bitcast float %528 to i32, !dbg !59
653
+ %617 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %616, i32 4, i32 31), !dbg !59
654
+ %618 = bitcast i32 %617 to float, !dbg !59
655
+ %619 = bitcast float %518 to i32, !dbg !59
656
+ %620 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %619, i32 4, i32 31), !dbg !59
657
+ %621 = bitcast i32 %620 to float, !dbg !59
658
+ %622 = fsub float %615, %523, !dbg !44
659
+ %623 = fadd float %518, %621, !dbg !48
660
+ %624 = fcmp oeq float %623, 0.000000e+00, !dbg !49
661
+ %625 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %621, float %623) #6, !dbg !50
662
+ %626 = select i1 %624, float 0.000000e+00, float %625, !dbg !51
663
+ %627 = fmul float %626, %622, !dbg !52
664
+ %628 = fadd float %523, %627, !dbg !53
665
+ %629 = fadd float %528, %618, !dbg !54
666
+ %630 = fmul float %622, %622, !dbg !55
667
+ %631 = fmul float %518, %630, !dbg !56
668
+ %632 = fmul float %626, %631, !dbg !57
669
+ %633 = fadd float %629, %632, !dbg !58
670
+ %634 = bitcast float %628 to i32, !dbg !59
671
+ %635 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %634, i32 2, i32 31), !dbg !59
672
+ %636 = bitcast i32 %635 to float, !dbg !59
673
+ %637 = bitcast float %633 to i32, !dbg !59
674
+ %638 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %637, i32 2, i32 31), !dbg !59
675
+ %639 = bitcast i32 %638 to float, !dbg !59
676
+ %640 = bitcast float %623 to i32, !dbg !59
677
+ %641 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %640, i32 2, i32 31), !dbg !59
678
+ %642 = bitcast i32 %641 to float, !dbg !59
679
+ %643 = fsub float %636, %628, !dbg !44
680
+ %644 = fadd float %623, %642, !dbg !48
681
+ %645 = fcmp oeq float %644, 0.000000e+00, !dbg !49
682
+ %646 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %642, float %644) #6, !dbg !50
683
+ %647 = select i1 %645, float 0.000000e+00, float %646, !dbg !51
684
+ %648 = fmul float %647, %643, !dbg !52
685
+ %649 = fadd float %628, %648, !dbg !53
686
+ %650 = fadd float %633, %639, !dbg !54
687
+ %651 = fmul float %643, %643, !dbg !55
688
+ %652 = fmul float %623, %651, !dbg !56
689
+ %653 = fmul float %647, %652, !dbg !57
690
+ %654 = fadd float %650, %653, !dbg !58
691
+ %655 = bitcast float %649 to i32, !dbg !59
692
+ %656 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %655, i32 1, i32 31), !dbg !59
693
+ %657 = bitcast i32 %656 to float, !dbg !59
694
+ %658 = bitcast float %654 to i32, !dbg !59
695
+ %659 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %658, i32 1, i32 31), !dbg !59
696
+ %660 = bitcast i32 %659 to float, !dbg !59
697
+ %661 = bitcast float %644 to i32, !dbg !59
698
+ %662 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %661, i32 1, i32 31), !dbg !59
699
+ %663 = bitcast i32 %662 to float, !dbg !59
700
+ %664 = fsub float %657, %649, !dbg !44
701
+ %665 = fadd float %644, %663, !dbg !48
702
+ %666 = fcmp oeq float %665, 0.000000e+00, !dbg !49
703
+ %667 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %663, float %665) #6, !dbg !50
704
+ %668 = select i1 %666, float 0.000000e+00, float %667, !dbg !51
705
+ %669 = fmul float %664, %668, !dbg !52
706
+ %670 = fadd float %649, %669, !dbg !53
707
+ %671 = fadd float %654, %660, !dbg !54
708
+ %672 = fmul float %664, %664, !dbg !55
709
+ %673 = fmul float %644, %672, !dbg !56
710
+ %674 = fmul float %668, %673, !dbg !57
711
+ %675 = fadd float %671, %674, !dbg !58
712
+ %676 = bitcast float %607 to i32, !dbg !59
713
+ %677 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %676, i32 4, i32 31), !dbg !59
714
+ %678 = bitcast i32 %677 to float, !dbg !59
715
+ %679 = bitcast float %612 to i32, !dbg !59
716
+ %680 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %679, i32 4, i32 31), !dbg !59
717
+ %681 = bitcast i32 %680 to float, !dbg !59
718
+ %682 = bitcast float %602 to i32, !dbg !59
719
+ %683 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %682, i32 4, i32 31), !dbg !59
720
+ %684 = bitcast i32 %683 to float, !dbg !59
721
+ %685 = fsub float %678, %607, !dbg !44
722
+ %686 = fadd float %602, %684, !dbg !48
723
+ %687 = fcmp oeq float %686, 0.000000e+00, !dbg !49
724
+ %688 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %684, float %686) #6, !dbg !50
725
+ %689 = select i1 %687, float 0.000000e+00, float %688, !dbg !51
726
+ %690 = fmul float %685, %689, !dbg !52
727
+ %691 = fadd float %607, %690, !dbg !53
728
+ %692 = fadd float %612, %681, !dbg !54
729
+ %693 = fmul float %685, %685, !dbg !55
730
+ %694 = fmul float %602, %693, !dbg !56
731
+ %695 = fmul float %694, %689, !dbg !57
732
+ %696 = fadd float %692, %695, !dbg !58
733
+ %697 = bitcast float %691 to i32, !dbg !59
734
+ %698 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %697, i32 2, i32 31), !dbg !59
735
+ %699 = bitcast i32 %698 to float, !dbg !59
736
+ %700 = bitcast float %696 to i32, !dbg !59
737
+ %701 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %700, i32 2, i32 31), !dbg !59
738
+ %702 = bitcast i32 %701 to float, !dbg !59
739
+ %703 = bitcast float %686 to i32, !dbg !59
740
+ %704 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %703, i32 2, i32 31), !dbg !59
741
+ %705 = bitcast i32 %704 to float, !dbg !59
742
+ %706 = fsub float %699, %691, !dbg !44
743
+ %707 = fadd float %686, %705, !dbg !48
744
+ %708 = fcmp oeq float %707, 0.000000e+00, !dbg !49
745
+ %709 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %705, float %707) #6, !dbg !50
746
+ %710 = select i1 %708, float 0.000000e+00, float %709, !dbg !51
747
+ %711 = fmul float %706, %710, !dbg !52
748
+ %712 = fadd float %691, %711, !dbg !53
749
+ %713 = fadd float %696, %702, !dbg !54
750
+ %714 = fmul float %706, %706, !dbg !55
751
+ %715 = fmul float %686, %714, !dbg !56
752
+ %716 = fmul float %710, %715, !dbg !57
753
+ %717 = fadd float %713, %716, !dbg !58
754
+ %718 = bitcast float %712 to i32, !dbg !59
755
+ %719 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %718, i32 1, i32 31), !dbg !59
756
+ %720 = bitcast i32 %719 to float, !dbg !59
757
+ %721 = bitcast float %717 to i32, !dbg !59
758
+ %722 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %721, i32 1, i32 31), !dbg !59
759
+ %723 = bitcast i32 %722 to float, !dbg !59
760
+ %724 = bitcast float %707 to i32, !dbg !59
761
+ %725 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %724, i32 1, i32 31), !dbg !59
762
+ %726 = bitcast i32 %725 to float, !dbg !59
763
+ %727 = fsub float %720, %712, !dbg !44
764
+ %728 = fadd float %707, %726, !dbg !48
765
+ %729 = fcmp oeq float %728, 0.000000e+00, !dbg !49
766
+ %730 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %726, float %728) #6, !dbg !50
767
+ %731 = select i1 %729, float 0.000000e+00, float %730, !dbg !51
768
+ %732 = fmul float %727, %731, !dbg !52
769
+ %733 = fadd float %712, %732, !dbg !53
770
+ %734 = fadd float %717, %723, !dbg !54
771
+ %735 = fmul float %727, %727, !dbg !55
772
+ %736 = fmul float %707, %735, !dbg !56
773
+ %737 = fmul float %731, %736, !dbg !57
774
+ %738 = fadd float %734, %737, !dbg !58
775
+ %739 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
776
+ %740 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
777
+ %741 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
778
+ %742 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
779
+ %743 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
780
+ %744 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
781
+ %745 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
782
+ %746 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
783
+ %747 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
784
+ %748 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
785
+ %749 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
786
+ %750 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
787
+ %751 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
788
+ %752 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
789
+ %753 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
790
+ %754 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
791
+ %755 = fadd float %739, 0x3EE4F8B580000000, !dbg !62
792
+ %756 = fadd float %747, 0x3EE4F8B580000000, !dbg !62
793
+ %757 = shl i32 %18, 8, !dbg !63
794
+ %758 = shl i32 %19, 8, !dbg !63
795
+ br label %759, !dbg !64
796
+
797
+ 759: ; preds = %368, %__nv_rsqrtf.exit25
798
+ %760 = phi i32 [ 0, %368 ], [ %1009, %__nv_rsqrtf.exit25 ]
799
+ %761 = or i32 %760, %13, !dbg !65
800
+ %762 = or i32 %760, %14, !dbg !65
801
+ %763 = add i32 %761, %46, !dbg !66
802
+ %764 = add i32 %762, %46, !dbg !66
803
+ %765 = add i32 %761, %47, !dbg !66
804
+ %766 = add i32 %762, %47, !dbg !66
805
+ %767 = sext i32 %763 to i64, !dbg !67
806
+ %768 = getelementptr float, ptr addrspace(1) %2, i64 %767, !dbg !67
807
+ %769 = sext i32 %764 to i64, !dbg !67
808
+ %770 = getelementptr float, ptr addrspace(1) %2, i64 %769, !dbg !67
809
+ %771 = sext i32 %765 to i64, !dbg !67
810
+ %772 = getelementptr float, ptr addrspace(1) %2, i64 %771, !dbg !67
811
+ %773 = sext i32 %766 to i64, !dbg !67
812
+ %774 = getelementptr float, ptr addrspace(1) %2, i64 %773, !dbg !67
813
+ %775 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %768, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
814
+ %776 = extractvalue { i32, i32, i32, i32 } %775, 0, !dbg !68
815
+ %777 = extractvalue { i32, i32, i32, i32 } %775, 1, !dbg !68
816
+ %778 = extractvalue { i32, i32, i32, i32 } %775, 2, !dbg !68
817
+ %779 = extractvalue { i32, i32, i32, i32 } %775, 3, !dbg !68
818
+ %780 = bitcast i32 %776 to float, !dbg !68
819
+ %781 = bitcast i32 %777 to float, !dbg !68
820
+ %782 = bitcast i32 %778 to float, !dbg !68
821
+ %783 = bitcast i32 %779 to float, !dbg !68
822
+ %784 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %770, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
823
+ %785 = extractvalue { i32, i32, i32, i32 } %784, 0, !dbg !68
824
+ %786 = extractvalue { i32, i32, i32, i32 } %784, 1, !dbg !68
825
+ %787 = extractvalue { i32, i32, i32, i32 } %784, 2, !dbg !68
826
+ %788 = extractvalue { i32, i32, i32, i32 } %784, 3, !dbg !68
827
+ %789 = bitcast i32 %785 to float, !dbg !68
828
+ %790 = bitcast i32 %786 to float, !dbg !68
829
+ %791 = bitcast i32 %787 to float, !dbg !68
830
+ %792 = bitcast i32 %788 to float, !dbg !68
831
+ %793 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %772, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
832
+ %794 = extractvalue { i32, i32, i32, i32 } %793, 0, !dbg !68
833
+ %795 = extractvalue { i32, i32, i32, i32 } %793, 1, !dbg !68
834
+ %796 = extractvalue { i32, i32, i32, i32 } %793, 2, !dbg !68
835
+ %797 = extractvalue { i32, i32, i32, i32 } %793, 3, !dbg !68
836
+ %798 = bitcast i32 %794 to float, !dbg !68
837
+ %799 = bitcast i32 %795 to float, !dbg !68
838
+ %800 = bitcast i32 %796 to float, !dbg !68
839
+ %801 = bitcast i32 %797 to float, !dbg !68
840
+ %802 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %774, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
841
+ %803 = extractvalue { i32, i32, i32, i32 } %802, 0, !dbg !68
842
+ %804 = extractvalue { i32, i32, i32, i32 } %802, 1, !dbg !68
843
+ %805 = extractvalue { i32, i32, i32, i32 } %802, 2, !dbg !68
844
+ %806 = extractvalue { i32, i32, i32, i32 } %802, 3, !dbg !68
845
+ %807 = bitcast i32 %803 to float, !dbg !68
846
+ %808 = bitcast i32 %804 to float, !dbg !68
847
+ %809 = bitcast i32 %805 to float, !dbg !68
848
+ %810 = bitcast i32 %806 to float, !dbg !68
849
+ %811 = zext nneg i32 %761 to i64, !dbg !69
850
+ %812 = getelementptr float, ptr addrspace(1) %3, i64 %811, !dbg !69
851
+ %813 = zext nneg i32 %762 to i64, !dbg !69
852
+ %814 = getelementptr float, ptr addrspace(1) %3, i64 %813, !dbg !69
853
+ %815 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %812, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
854
+ %816 = extractvalue { i32, i32, i32, i32 } %815, 0, !dbg !70
855
+ %817 = extractvalue { i32, i32, i32, i32 } %815, 1, !dbg !70
856
+ %818 = extractvalue { i32, i32, i32, i32 } %815, 2, !dbg !70
857
+ %819 = extractvalue { i32, i32, i32, i32 } %815, 3, !dbg !70
858
+ %820 = bitcast i32 %816 to float, !dbg !70
859
+ %821 = bitcast i32 %817 to float, !dbg !70
860
+ %822 = bitcast i32 %818 to float, !dbg !70
861
+ %823 = bitcast i32 %819 to float, !dbg !70
862
+ %824 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %814, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
863
+ %825 = extractvalue { i32, i32, i32, i32 } %824, 0, !dbg !70
864
+ %826 = extractvalue { i32, i32, i32, i32 } %824, 1, !dbg !70
865
+ %827 = extractvalue { i32, i32, i32, i32 } %824, 2, !dbg !70
866
+ %828 = extractvalue { i32, i32, i32, i32 } %824, 3, !dbg !70
867
+ %829 = bitcast i32 %825 to float, !dbg !70
868
+ %830 = bitcast i32 %826 to float, !dbg !70
869
+ %831 = bitcast i32 %827 to float, !dbg !70
870
+ %832 = bitcast i32 %828 to float, !dbg !70
871
+ br i1 %53, label %833, label %834, !dbg !71
872
+
873
+ 833: ; preds = %759
874
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !71
875
+ br label %834, !dbg !71
876
+
877
+ 834: ; preds = %833, %759
878
+ %835 = getelementptr float, ptr addrspace(1) %60, i64 %811, !dbg !72
879
+ %836 = getelementptr float, ptr addrspace(1) %60, i64 %813, !dbg !72
880
+ %837 = getelementptr float, ptr addrspace(1) %61, i64 %811, !dbg !72
881
+ %838 = getelementptr float, ptr addrspace(1) %61, i64 %813, !dbg !72
882
+ %839 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %835, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
883
+ %840 = extractvalue { i32, i32, i32, i32 } %839, 0, !dbg !73
884
+ %841 = extractvalue { i32, i32, i32, i32 } %839, 1, !dbg !73
885
+ %842 = extractvalue { i32, i32, i32, i32 } %839, 2, !dbg !73
886
+ %843 = extractvalue { i32, i32, i32, i32 } %839, 3, !dbg !73
887
+ %844 = bitcast i32 %840 to float, !dbg !73
888
+ %845 = bitcast i32 %841 to float, !dbg !73
889
+ %846 = bitcast i32 %842 to float, !dbg !73
890
+ %847 = bitcast i32 %843 to float, !dbg !73
891
+ %848 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %836, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
892
+ %849 = extractvalue { i32, i32, i32, i32 } %848, 0, !dbg !73
893
+ %850 = extractvalue { i32, i32, i32, i32 } %848, 1, !dbg !73
894
+ %851 = extractvalue { i32, i32, i32, i32 } %848, 2, !dbg !73
895
+ %852 = extractvalue { i32, i32, i32, i32 } %848, 3, !dbg !73
896
+ %853 = bitcast i32 %849 to float, !dbg !73
897
+ %854 = bitcast i32 %850 to float, !dbg !73
898
+ %855 = bitcast i32 %851 to float, !dbg !73
899
+ %856 = bitcast i32 %852 to float, !dbg !73
900
+ %857 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %837, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
901
+ %858 = extractvalue { i32, i32, i32, i32 } %857, 0, !dbg !73
902
+ %859 = extractvalue { i32, i32, i32, i32 } %857, 1, !dbg !73
903
+ %860 = extractvalue { i32, i32, i32, i32 } %857, 2, !dbg !73
904
+ %861 = extractvalue { i32, i32, i32, i32 } %857, 3, !dbg !73
905
+ %862 = bitcast i32 %858 to float, !dbg !73
906
+ %863 = bitcast i32 %859 to float, !dbg !73
907
+ %864 = bitcast i32 %860 to float, !dbg !73
908
+ %865 = bitcast i32 %861 to float, !dbg !73
909
+ %866 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %838, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
910
+ %867 = extractvalue { i32, i32, i32, i32 } %866, 0, !dbg !73
911
+ %868 = extractvalue { i32, i32, i32, i32 } %866, 1, !dbg !73
912
+ %869 = extractvalue { i32, i32, i32, i32 } %866, 2, !dbg !73
913
+ %870 = extractvalue { i32, i32, i32, i32 } %866, 3, !dbg !73
914
+ %871 = bitcast i32 %867 to float, !dbg !73
915
+ %872 = bitcast i32 %868 to float, !dbg !73
916
+ %873 = bitcast i32 %869 to float, !dbg !73
917
+ %874 = bitcast i32 %870 to float, !dbg !73
918
+ %875 = fadd float %780, %844, !dbg !74
919
+ %876 = fadd float %781, %845, !dbg !74
920
+ %877 = fadd float %782, %846, !dbg !74
921
+ %878 = fadd float %783, %847, !dbg !74
922
+ %879 = fadd float %789, %853, !dbg !74
923
+ %880 = fadd float %790, %854, !dbg !74
924
+ %881 = fadd float %791, %855, !dbg !74
925
+ %882 = fadd float %792, %856, !dbg !74
926
+ %883 = fadd float %798, %862, !dbg !74
927
+ %884 = fadd float %799, %863, !dbg !74
928
+ %885 = fadd float %800, %864, !dbg !74
929
+ %886 = fadd float %801, %865, !dbg !74
930
+ %887 = fadd float %807, %871, !dbg !74
931
+ %888 = fadd float %808, %872, !dbg !74
932
+ %889 = fadd float %809, %873, !dbg !74
933
+ %890 = fadd float %810, %874, !dbg !74
934
+ %891 = fsub float %875, %670, !dbg !75
935
+ %892 = fsub float %876, %670, !dbg !75
936
+ %893 = fsub float %877, %670, !dbg !75
937
+ %894 = fsub float %878, %670, !dbg !75
938
+ %895 = fsub float %879, %670, !dbg !75
939
+ %896 = fsub float %880, %670, !dbg !75
940
+ %897 = fsub float %881, %670, !dbg !75
941
+ %898 = fsub float %882, %670, !dbg !75
942
+ %899 = fsub float %883, %733, !dbg !75
943
+ %900 = fsub float %884, %733, !dbg !75
944
+ %901 = fsub float %885, %733, !dbg !75
945
+ %902 = fsub float %886, %733, !dbg !75
946
+ %903 = fsub float %887, %733, !dbg !75
947
+ %904 = fsub float %888, %733, !dbg !75
948
+ %905 = fsub float %889, %733, !dbg !75
949
+ %906 = fsub float %890, %733, !dbg !75
950
+ %907 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
951
+ %.not.i = icmp eq i32 %907, 0, !dbg !76
952
+ br i1 %.not.i, label %910, label %908, !dbg !76
953
+
954
+ 908: ; preds = %834
955
+ %909 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %755), !dbg !76
956
+ br label %__nv_rsqrtf.exit, !dbg !76
957
+
958
+ 910: ; preds = %834
959
+ %911 = tail call float @llvm.nvvm.rsqrt.approx.f(float %755), !dbg !76
960
+ br label %__nv_rsqrtf.exit, !dbg !76
961
+
962
+ __nv_rsqrtf.exit: ; preds = %908, %910
963
+ %.0.i = phi float [ %909, %908 ], [ %911, %910 ], !dbg !76
964
+ %912 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
965
+ %913 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
966
+ %914 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
967
+ %915 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
968
+ %916 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
969
+ %917 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
970
+ %918 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
971
+ %919 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
972
+ %.not.i23 = icmp eq i32 %919, 0, !dbg !76
973
+ br i1 %.not.i23, label %922, label %920, !dbg !76
974
+
975
+ 920: ; preds = %__nv_rsqrtf.exit
976
+ %921 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %756), !dbg !76
977
+ br label %__nv_rsqrtf.exit25, !dbg !76
978
+
979
+ 922: ; preds = %__nv_rsqrtf.exit
980
+ %923 = tail call float @llvm.nvvm.rsqrt.approx.f(float %756), !dbg !76
981
+ br label %__nv_rsqrtf.exit25, !dbg !76
982
+
983
+ __nv_rsqrtf.exit25: ; preds = %920, %922
984
+ %.0.i24 = phi float [ %921, %920 ], [ %923, %922 ], !dbg !76
985
+ %924 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
986
+ %925 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
987
+ %926 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
988
+ %927 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
989
+ %928 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
990
+ %929 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
991
+ %930 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
992
+ %931 = fmul float %891, %.0.i, !dbg !77
993
+ %932 = fmul float %892, %.0.i, !dbg !77
994
+ %933 = fmul float %893, %.0.i, !dbg !77
995
+ %934 = fmul float %894, %.0.i, !dbg !77
996
+ %935 = fmul float %895, %.0.i, !dbg !77
997
+ %936 = fmul float %896, %.0.i, !dbg !77
998
+ %937 = fmul float %897, %.0.i, !dbg !77
999
+ %938 = fmul float %898, %.0.i, !dbg !77
1000
+ %939 = fmul float %899, %.0.i24, !dbg !77
1001
+ %940 = fmul float %900, %.0.i24, !dbg !77
1002
+ %941 = fmul float %901, %.0.i24, !dbg !77
1003
+ %942 = fmul float %902, %.0.i24, !dbg !77
1004
+ %943 = fmul float %903, %.0.i24, !dbg !77
1005
+ %944 = fmul float %904, %.0.i24, !dbg !77
1006
+ %945 = fmul float %905, %.0.i24, !dbg !77
1007
+ %946 = fmul float %906, %.0.i24, !dbg !77
1008
+ %947 = fmul float %931, %820, !dbg !78
1009
+ %948 = fmul float %932, %821, !dbg !78
1010
+ %949 = fmul float %933, %822, !dbg !78
1011
+ %950 = fmul float %934, %823, !dbg !78
1012
+ %951 = fmul float %935, %829, !dbg !78
1013
+ %952 = fmul float %936, %830, !dbg !78
1014
+ %953 = fmul float %937, %831, !dbg !78
1015
+ %954 = fmul float %938, %832, !dbg !78
1016
+ %955 = fmul float %939, %820, !dbg !78
1017
+ %956 = fmul float %940, %821, !dbg !78
1018
+ %957 = fmul float %941, %822, !dbg !78
1019
+ %958 = fmul float %942, %823, !dbg !78
1020
+ %959 = fmul float %943, %829, !dbg !78
1021
+ %960 = fmul float %944, %830, !dbg !78
1022
+ %961 = fmul float %945, %831, !dbg !78
1023
+ %962 = fmul float %946, %832, !dbg !78
1024
+ %963 = add i32 %761, %757, !dbg !79
1025
+ %964 = add i32 %761, %758, !dbg !79
1026
+ %965 = sext i32 %963 to i64, !dbg !80
1027
+ %966 = getelementptr i16, ptr addrspace(1) %4, i64 %965, !dbg !80
1028
+ %967 = sext i32 %964 to i64, !dbg !80
1029
+ %968 = getelementptr i16, ptr addrspace(1) %4, i64 %967, !dbg !80
1030
+ %969 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %947) #6, !dbg !81
1031
+ %970 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %948) #6, !dbg !81
1032
+ %971 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %949) #6, !dbg !81
1033
+ %972 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %950) #6, !dbg !81
1034
+ %973 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %951) #6, !dbg !81
1035
+ %974 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %952) #6, !dbg !81
1036
+ %975 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %953) #6, !dbg !81
1037
+ %976 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %954) #6, !dbg !81
1038
+ %977 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %955) #6, !dbg !81
1039
+ %978 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %956) #6, !dbg !81
1040
+ %979 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %957) #6, !dbg !81
1041
+ %980 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %958) #6, !dbg !81
1042
+ %981 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %959) #6, !dbg !81
1043
+ %982 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %960) #6, !dbg !81
1044
+ %983 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %961) #6, !dbg !81
1045
+ %984 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %962) #6, !dbg !81
1046
+ %985 = insertelement <2 x i16> undef, i16 %969, i64 0, !dbg !81
1047
+ %986 = insertelement <2 x i16> %985, i16 %970, i64 1, !dbg !81
1048
+ %987 = bitcast <2 x i16> %986 to i32, !dbg !81
1049
+ %988 = insertelement <2 x i16> undef, i16 %971, i64 0, !dbg !81
1050
+ %989 = insertelement <2 x i16> %988, i16 %972, i64 1, !dbg !81
1051
+ %990 = bitcast <2 x i16> %989 to i32, !dbg !81
1052
+ %991 = insertelement <2 x i16> undef, i16 %973, i64 0, !dbg !81
1053
+ %992 = insertelement <2 x i16> %991, i16 %974, i64 1, !dbg !81
1054
+ %993 = bitcast <2 x i16> %992 to i32, !dbg !81
1055
+ %994 = insertelement <2 x i16> undef, i16 %975, i64 0, !dbg !81
1056
+ %995 = insertelement <2 x i16> %994, i16 %976, i64 1, !dbg !81
1057
+ %996 = bitcast <2 x i16> %995 to i32, !dbg !81
1058
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %987, i32 %990, i32 %993, i32 %996, ptr addrspace(1) %966, i1 true) #6, !dbg !81
1059
+ %997 = insertelement <2 x i16> undef, i16 %977, i64 0, !dbg !81
1060
+ %998 = insertelement <2 x i16> %997, i16 %978, i64 1, !dbg !81
1061
+ %999 = bitcast <2 x i16> %998 to i32, !dbg !81
1062
+ %1000 = insertelement <2 x i16> undef, i16 %979, i64 0, !dbg !81
1063
+ %1001 = insertelement <2 x i16> %1000, i16 %980, i64 1, !dbg !81
1064
+ %1002 = bitcast <2 x i16> %1001 to i32, !dbg !81
1065
+ %1003 = insertelement <2 x i16> undef, i16 %981, i64 0, !dbg !81
1066
+ %1004 = insertelement <2 x i16> %1003, i16 %982, i64 1, !dbg !81
1067
+ %1005 = bitcast <2 x i16> %1004 to i32, !dbg !81
1068
+ %1006 = insertelement <2 x i16> undef, i16 %983, i64 0, !dbg !81
1069
+ %1007 = insertelement <2 x i16> %1006, i16 %984, i64 1, !dbg !81
1070
+ %1008 = bitcast <2 x i16> %1007 to i32, !dbg !81
1071
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %999, i32 %1002, i32 %1005, i32 %1008, ptr addrspace(1) %968, i1 true) #6, !dbg !81
1072
+ %1009 = add nuw nsw i32 %760, 64, !dbg !64
1073
+ %1010 = icmp ult i32 %760, 192, !dbg !64
1074
+ br i1 %1010, label %759, label %1011, !dbg !64
1075
+
1076
+ 1011: ; preds = %__nv_rsqrtf.exit25
1077
+ ret void, !dbg !82
1078
+ }
1079
+
1080
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
1081
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
1082
+
1083
+ ; Function Attrs: convergent nocallback nounwind
1084
+ declare void @llvm.nvvm.barrier0() #1
1085
+
1086
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
1087
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
1088
+
1089
+ ; Function Attrs: alwaysinline nounwind
1090
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
1091
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
1092
+ %.not = icmp eq i32 %1, 0
1093
+ br i1 %.not, label %4, label %2
1094
+
1095
+ 2: ; preds = %0
1096
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
1097
+ br label %6
1098
+
1099
+ 4: ; preds = %0
1100
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
1101
+ br label %6
1102
+
1103
+ 6: ; preds = %4, %2
1104
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
1105
+ ret float %.0
1106
+ }
1107
+
1108
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
1109
+
1110
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
1111
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
1112
+
1113
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
1114
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
1115
+
1116
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
1117
+ attributes #1 = { convergent nocallback nounwind }
1118
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
1119
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
1120
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
1121
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
1122
+ attributes #6 = { nounwind }
1123
+
1124
+ !llvm.module.flags = !{!0, !1}
1125
+ !llvm.dbg.cu = !{!2}
1126
+ !nvvm.annotations = !{!4, !5, !5, !4}
1127
+ !llvm.ident = !{!6}
1128
+
1129
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
1130
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
1131
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
1132
+ !3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx")
1133
+ !4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
1134
+ !5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 256}
1135
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
1136
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
1137
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
1138
+ !9 = !{}
1139
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
1140
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
1141
+ !12 = !DILocation(line: 31, column: 36, scope: !7)
1142
+ !13 = !DILocation(line: 21, column: 28, scope: !7)
1143
+ !14 = !DILocation(line: 21, column: 33, scope: !7)
1144
+ !15 = !DILocation(line: 22, column: 23, scope: !7)
1145
+ !16 = !DILocation(line: 26, column: 30, scope: !7)
1146
+ !17 = !DILocation(line: 26, column: 35, scope: !7)
1147
+ !18 = !DILocation(line: 27, column: 18, scope: !7)
1148
+ !19 = !DILocation(line: 35, column: 44, scope: !7)
1149
+ !20 = !DILocation(line: 36, column: 22, scope: !7)
1150
+ !21 = !DILocation(line: 37, column: 22, scope: !7)
1151
+ !22 = !DILocation(line: 38, column: 36, scope: !7)
1152
+ !23 = !DILocation(line: 39, column: 40, scope: !7)
1153
+ !24 = !DILocation(line: 40, column: 44, scope: !7)
1154
+ !25 = !DILocation(line: 32, column: 27, scope: !7)
1155
+ !26 = !DILocation(line: 35, column: 40, scope: !7)
1156
+ !27 = !DILocation(line: 35, column: 34, scope: !7)
1157
+ !28 = !DILocation(line: 35, column: 50, scope: !7)
1158
+ !29 = !DILocation(line: 39, column: 55, scope: !7)
1159
+ !30 = !DILocation(line: 40, column: 40, scope: !7)
1160
+ !31 = !DILocation(line: 40, column: 34, scope: !7)
1161
+ !32 = !DILocation(line: 40, column: 52, scope: !7)
1162
+ !33 = !DILocation(line: 41, column: 22, scope: !7)
1163
+ !34 = !DILocation(line: 96, column: 20, scope: !35, inlinedAt: !37)
1164
+ !35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
1165
+ !36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
1166
+ !37 = !DILocation(line: 44, column: 38, scope: !35)
1167
+ !38 = !DILocation(line: 97, column: 26, scope: !35, inlinedAt: !37)
1168
+ !39 = !DILocation(line: 98, column: 30, scope: !35, inlinedAt: !37)
1169
+ !40 = !DILocation(line: 98, column: 22, scope: !35, inlinedAt: !37)
1170
+ !41 = !DILocation(line: 101, column: 30, scope: !35, inlinedAt: !37)
1171
+ !42 = !DILocation(line: 101, column: 22, scope: !35, inlinedAt: !37)
1172
+ !43 = !DILocation(line: 47, column: 48, scope: !7)
1173
+ !44 = !DILocation(line: 108, column: 21, scope: !45, inlinedAt: !46)
1174
+ !45 = distinct !DILexicalBlockFile(scope: !35, file: !36, discriminator: 0)
1175
+ !46 = !DILocation(line: 120, column: 46, scope: !45, inlinedAt: !47)
1176
+ !47 = !DILocation(line: 50, column: 41, scope: !45)
1177
+ !48 = !DILocation(line: 109, column: 28, scope: !45, inlinedAt: !46)
1178
+ !49 = !DILocation(line: 110, column: 39, scope: !45, inlinedAt: !46)
1179
+ !50 = !DILocation(line: 110, column: 60, scope: !45, inlinedAt: !46)
1180
+ !51 = !DILocation(line: 110, column: 49, scope: !45, inlinedAt: !46)
1181
+ !52 = !DILocation(line: 112, column: 25, scope: !45, inlinedAt: !46)
1182
+ !53 = !DILocation(line: 112, column: 17, scope: !45, inlinedAt: !46)
1183
+ !54 = !DILocation(line: 113, column: 15, scope: !45, inlinedAt: !46)
1184
+ !55 = !DILocation(line: 113, column: 30, scope: !45, inlinedAt: !46)
1185
+ !56 = !DILocation(line: 113, column: 38, scope: !45, inlinedAt: !46)
1186
+ !57 = !DILocation(line: 113, column: 49, scope: !45, inlinedAt: !46)
1187
+ !58 = !DILocation(line: 113, column: 22, scope: !45, inlinedAt: !46)
1188
+ !59 = !DILocation(line: 120, column: 46, scope: !35, inlinedAt: !60)
1189
+ !60 = !DILocation(line: 50, column: 41, scope: !35)
1190
+ !61 = !DILocation(line: 69, column: 23, scope: !7)
1191
+ !62 = !DILocation(line: 71, column: 24, scope: !7)
1192
+ !63 = !DILocation(line: 76, column: 39, scope: !7)
1193
+ !64 = !DILocation(line: 55, column: 36, scope: !7)
1194
+ !65 = !DILocation(line: 56, column: 27, scope: !7)
1195
+ !66 = !DILocation(line: 59, column: 41, scope: !7)
1196
+ !67 = !DILocation(line: 59, column: 35, scope: !7)
1197
+ !68 = !DILocation(line: 59, column: 51, scope: !7)
1198
+ !69 = !DILocation(line: 60, column: 35, scope: !7)
1199
+ !70 = !DILocation(line: 60, column: 40, scope: !7)
1200
+ !71 = !DILocation(line: 64, column: 57, scope: !7)
1201
+ !72 = !DILocation(line: 65, column: 35, scope: !7)
1202
+ !73 = !DILocation(line: 65, column: 54, scope: !7)
1203
+ !74 = !DILocation(line: 66, column: 24, scope: !7)
1204
+ !75 = !DILocation(line: 67, column: 24, scope: !7)
1205
+ !76 = !DILocation(line: 72, column: 30, scope: !7)
1206
+ !77 = !DILocation(line: 73, column: 24, scope: !7)
1207
+ !78 = !DILocation(line: 74, column: 24, scope: !7)
1208
+ !79 = !DILocation(line: 76, column: 35, scope: !7)
1209
+ !80 = !DILocation(line: 76, column: 29, scope: !7)
1210
+ !81 = !DILocation(line: 76, column: 52, scope: !7)
1211
+ !82 = !DILocation(line: 55, column: 4, scope: !7)
.triton/dump/510522bb05917b836ed253751364fcad/triton_.ptx ADDED
@@ -0,0 +1,1810 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5de6de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
23
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .extern .shared .align 1 .b8 global_smem[];
26
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
27
+
28
+ .visible .entry triton__0d1d2d3d4d5de6de(
29
+ .param .u64 triton__0d1d2d3d4d5de6de_param_0,
30
+ .param .u64 triton__0d1d2d3d4d5de6de_param_1,
31
+ .param .u64 triton__0d1d2d3d4d5de6de_param_2,
32
+ .param .u64 triton__0d1d2d3d4d5de6de_param_3,
33
+ .param .u64 triton__0d1d2d3d4d5de6de_param_4,
34
+ .param .u32 triton__0d1d2d3d4d5de6de_param_5,
35
+ .param .u32 triton__0d1d2d3d4d5de6de_param_6
36
+ )
37
+ .maxntid 256, 1, 1
38
+ {
39
+ .reg .pred %p<137>;
40
+ .reg .b16 %rs<17>;
41
+ .reg .b32 %r<408>;
42
+ .reg .f32 %f<614>;
43
+ .reg .b64 %rd<107>;
44
+ .loc 1 18 0
45
+ $L__func_begin0:
46
+ .loc 1 18 0
47
+
48
+ ld.param.u64 %rd13, [triton__0d1d2d3d4d5de6de_param_4];
49
+ ld.param.u64 %rd12, [triton__0d1d2d3d4d5de6de_param_3];
50
+ ld.param.u64 %rd49, [triton__0d1d2d3d4d5de6de_param_0];
51
+ ld.param.u64 %rd50, [triton__0d1d2d3d4d5de6de_param_1];
52
+ $L__tmp0:
53
+ .loc 1 22 44
54
+ mov.u32 %r13, %tid.x;
55
+ ld.param.u64 %rd51, [triton__0d1d2d3d4d5de6de_param_2];
56
+ bfe.u32 %r1, %r13, 3, 5;
57
+ and.b32 %r2, %r13, 63;
58
+ .loc 1 24 33
59
+ shl.b32 %r14, %r13, 3;
60
+ and.b32 %r3, %r14, 56;
61
+ .loc 1 31 36
62
+ shr.u32 %r4, %r13, 6;
63
+ .loc 1 21 28
64
+ mov.u32 %r11, %ctaid.x;
65
+ .loc 1 21 33
66
+ shl.b32 %r15, %r11, 6;
67
+ .loc 1 22 23
68
+ or.b32 %r16, %r15, %r1;
69
+ or.b32 %r17, %r16, 32;
70
+ or.b32 %r18, %r15, %r2;
71
+ .loc 1 26 30
72
+ mul.wide.s32 %rd52, %r16, 8;
73
+ add.s64 %rd15, %rd49, %rd52;
74
+ add.s64 %rd31, %rd15, 256;
75
+ mul.wide.s32 %rd53, %r18, 8;
76
+ add.s64 %rd47, %rd49, %rd53;
77
+ mov.pred %p1, -1;
78
+ .loc 1 26 35
79
+ mov.u64 %rd14, 0x0;
80
+ @%p1 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd15 + 0 ];
81
+ mov.u64 %rd16, 0x0;
82
+ @%p1 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd15 + 0 ];
83
+ mov.u64 %rd18, 0x0;
84
+ @%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd15 + 0 ];
85
+ mov.u64 %rd20, 0x0;
86
+ @%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd15 + 0 ];
87
+ mov.u64 %rd22, 0x0;
88
+ @%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd15 + 0 ];
89
+ mov.u64 %rd24, 0x0;
90
+ @%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd15 + 0 ];
91
+ mov.u64 %rd26, 0x0;
92
+ @%p1 ld.global.L1::evict_last.b64 { %rd26 }, [ %rd15 + 0 ];
93
+ mov.u64 %rd28, 0x0;
94
+ @%p1 ld.global.L1::evict_last.b64 { %rd28 }, [ %rd15 + 0 ];
95
+ mov.u64 %rd30, 0x0;
96
+ @%p1 ld.global.L1::evict_last.b64 { %rd30 }, [ %rd31 + 0 ];
97
+ mov.u64 %rd32, 0x0;
98
+ @%p1 ld.global.L1::evict_last.b64 { %rd32 }, [ %rd31 + 0 ];
99
+ mov.u64 %rd34, 0x0;
100
+ @%p1 ld.global.L1::evict_last.b64 { %rd34 }, [ %rd31 + 0 ];
101
+ mov.u64 %rd36, 0x0;
102
+ @%p1 ld.global.L1::evict_last.b64 { %rd36 }, [ %rd31 + 0 ];
103
+ mov.u64 %rd38, 0x0;
104
+ @%p1 ld.global.L1::evict_last.b64 { %rd38 }, [ %rd31 + 0 ];
105
+ mov.u64 %rd40, 0x0;
106
+ @%p1 ld.global.L1::evict_last.b64 { %rd40 }, [ %rd31 + 0 ];
107
+ mov.u64 %rd42, 0x0;
108
+ @%p1 ld.global.L1::evict_last.b64 { %rd42 }, [ %rd31 + 0 ];
109
+ mov.u64 %rd44, 0x0;
110
+ @%p1 ld.global.L1::evict_last.b64 { %rd44 }, [ %rd31 + 0 ];
111
+ mov.u64 %rd46, 0x0;
112
+ @%p1 ld.global.L1::evict_last.b64 { %rd46 }, [ %rd47 + 0 ];
113
+ .loc 1 27 18
114
+ bfe.s32 %r19, %r11, 25, 1;
115
+ shr.u32 %r20, %r19, 23;
116
+ add.s32 %r21, %r16, %r20;
117
+ and.b32 %r22, %r21, 16776704;
118
+ sub.s32 %r23, %r16, %r22;
119
+ add.s32 %r24, %r17, %r20;
120
+ and.b32 %r25, %r24, 16776704;
121
+ sub.s32 %r26, %r17, %r25;
122
+ .loc 1 35 44
123
+ shl.b32 %r27, %r23, 8;
124
+ shl.b32 %r28, %r26, 8;
125
+ .loc 1 36 22
126
+ add.s64 %rd54, %rd46, 50257;
127
+ .loc 1 37 22
128
+ setp.lt.s64 %p18, %rd14, 0;
129
+ setp.lt.s64 %p19, %rd30, 0;
130
+ setp.lt.s64 %p20, %rd46, 0;
131
+ .loc 1 38 36
132
+ selp.b64 %rd1, %rd54, %rd46, %p20;
133
+ .loc 1 40 44
134
+ shl.b64 %rd55, %rd14, 8;
135
+ add.s64 %rd56, %rd55, 12865792;
136
+ selp.b64 %rd57, %rd56, %rd55, %p18;
137
+ shl.b64 %rd58, %rd30, 8;
138
+ add.s64 %rd59, %rd58, 12865792;
139
+ selp.b64 %rd60, %rd59, %rd58, %p19;
140
+ .loc 1 31 36
141
+ and.b32 %r29, %r13, 7;
142
+ mul.wide.u32 %rd2, %r29, 32;
143
+ shl.b64 %rd61, %rd60, 2;
144
+ or.b64 %rd62, %rd2, %rd61;
145
+ add.s64 %rd3, %rd50, %rd62;
146
+ shl.b64 %rd63, %rd57, 2;
147
+ or.b64 %rd64, %rd2, %rd63;
148
+ add.s64 %rd4, %rd50, %rd64;
149
+ or.b32 %r30, %r28, %r3;
150
+ mul.wide.s32 %rd65, %r30, 4;
151
+ add.s64 %rd5, %rd51, %rd65;
152
+ or.b32 %r31, %r27, %r3;
153
+ mul.wide.s32 %rd66, %r31, 4;
154
+ add.s64 %rd6, %rd51, %rd66;
155
+ mov.f32 %f550, 0f00000000;
156
+ mov.u64 %rd105, 0;
157
+ mov.b32 %r406, -64;
158
+ mov.f32 %f551, %f550;
159
+ mov.f32 %f552, %f550;
160
+ mov.f32 %f553, %f550;
161
+ mov.f32 %f554, %f550;
162
+ mov.f32 %f555, %f550;
163
+ mov.f32 %f556, %f550;
164
+ mov.f32 %f557, %f550;
165
+ mov.f32 %f558, %f550;
166
+ mov.f32 %f559, %f550;
167
+ mov.f32 %f560, %f550;
168
+ mov.f32 %f561, %f550;
169
+ mov.f32 %f562, %f550;
170
+ mov.f32 %f563, %f550;
171
+ mov.f32 %f564, %f550;
172
+ mov.f32 %f565, %f550;
173
+ mov.f32 %f566, %f550;
174
+ mov.f32 %f567, %f550;
175
+ mov.f32 %f568, %f550;
176
+ mov.f32 %f569, %f550;
177
+ mov.f32 %f570, %f550;
178
+ mov.f32 %f571, %f550;
179
+ mov.f32 %f572, %f550;
180
+ mov.f32 %f573, %f550;
181
+ mov.f32 %f574, %f550;
182
+ mov.f32 %f575, %f550;
183
+ mov.f32 %f576, %f550;
184
+ mov.f32 %f577, %f550;
185
+ mov.f32 %f578, %f550;
186
+ mov.f32 %f579, %f550;
187
+ mov.f32 %f580, %f550;
188
+ mov.f32 %f581, %f550;
189
+ mov.f32 %f582, %f550;
190
+ mov.f32 %f583, %f550;
191
+ mov.f32 %f584, %f550;
192
+ mov.f32 %f585, %f550;
193
+ mov.f32 %f586, %f550;
194
+ mov.f32 %f587, %f550;
195
+ mov.f32 %f588, %f550;
196
+ mov.f32 %f589, %f550;
197
+ mov.f32 %f590, %f550;
198
+ mov.f32 %f591, %f550;
199
+ mov.f32 %f592, %f550;
200
+ mov.f32 %f593, %f550;
201
+ mov.f32 %f594, %f550;
202
+ mov.f32 %f595, %f550;
203
+ mov.f32 %f596, %f550;
204
+ mov.f32 %f597, %f550;
205
+ mov.f32 %f598, %f550;
206
+ mov.f32 %f599, %f550;
207
+ mov.f32 %f600, %f550;
208
+ mov.f32 %f601, %f550;
209
+ mov.f32 %f602, %f550;
210
+ mov.f32 %f603, %f550;
211
+ mov.f32 %f604, %f550;
212
+ mov.f32 %f605, %f550;
213
+ mov.f32 %f606, %f550;
214
+ mov.f32 %f607, %f550;
215
+ mov.f32 %f608, %f550;
216
+ mov.f32 %f609, %f550;
217
+ mov.f32 %f610, %f550;
218
+ mov.f32 %f611, %f550;
219
+ mov.f32 %f612, %f550;
220
+ mov.f32 %f613, %f550;
221
+ bra.uni $L__BB0_1;
222
+ $L__BB0_3:
223
+ .loc 1 40 40
224
+ add.s64 %rd78, %rd4, %rd105;
225
+ .loc 1 40 34
226
+ add.s64 %rd79, %rd78, 16;
227
+ add.s64 %rd80, %rd3, %rd105;
228
+ .loc 1 40 52
229
+ add.s64 %rd81, %rd80, 16;
230
+ mov.u32 %r65, 0x0;
231
+ mov.u32 %r66, 0x0;
232
+ mov.u32 %r67, 0x0;
233
+ mov.u32 %r68, 0x0;
234
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r65, %r66, %r67, %r68 }, [ %rd78 + 0 ];
235
+ @!%p1 mov.u32 %r65, %r342;
236
+ @!%p1 mov.u32 %r66, %r342;
237
+ @!%p1 mov.u32 %r67, %r342;
238
+ @!%p1 mov.u32 %r68, %r342;
239
+ mov.b32 %f174, %r65;
240
+ mov.b32 %f175, %r66;
241
+ mov.b32 %f176, %r67;
242
+ mov.b32 %f177, %r68;
243
+ mov.u32 %r73, 0x0;
244
+ mov.u32 %r74, 0x0;
245
+ mov.u32 %r75, 0x0;
246
+ mov.u32 %r76, 0x0;
247
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r73, %r74, %r75, %r76 }, [ %rd79 + 0 ];
248
+ @!%p1 mov.u32 %r73, %r342;
249
+ @!%p1 mov.u32 %r74, %r342;
250
+ @!%p1 mov.u32 %r75, %r342;
251
+ @!%p1 mov.u32 %r76, %r342;
252
+ mov.b32 %f178, %r73;
253
+ mov.b32 %f179, %r74;
254
+ mov.b32 %f180, %r75;
255
+ mov.b32 %f181, %r76;
256
+ mov.u32 %r81, 0x0;
257
+ mov.u32 %r82, 0x0;
258
+ mov.u32 %r83, 0x0;
259
+ mov.u32 %r84, 0x0;
260
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r81, %r82, %r83, %r84 }, [ %rd80 + 0 ];
261
+ @!%p1 mov.u32 %r81, %r342;
262
+ @!%p1 mov.u32 %r82, %r342;
263
+ @!%p1 mov.u32 %r83, %r342;
264
+ @!%p1 mov.u32 %r84, %r342;
265
+ mov.b32 %f182, %r81;
266
+ mov.b32 %f183, %r82;
267
+ mov.b32 %f184, %r83;
268
+ mov.b32 %f185, %r84;
269
+ mov.u32 %r89, 0x0;
270
+ mov.u32 %r90, 0x0;
271
+ mov.u32 %r91, 0x0;
272
+ mov.u32 %r92, 0x0;
273
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r89, %r90, %r91, %r92 }, [ %rd81 + 0 ];
274
+ @!%p1 mov.u32 %r89, %r342;
275
+ @!%p1 mov.u32 %r90, %r342;
276
+ @!%p1 mov.u32 %r91, %r342;
277
+ @!%p1 mov.u32 %r92, %r342;
278
+ mov.b32 %f186, %r89;
279
+ mov.b32 %f187, %r90;
280
+ mov.b32 %f188, %r91;
281
+ mov.b32 %f189, %r92;
282
+ .loc 1 41 22
283
+ add.f32 %f190, %f65, %f174;
284
+ add.f32 %f191, %f66, %f175;
285
+ add.f32 %f192, %f67, %f176;
286
+ add.f32 %f193, %f68, %f177;
287
+ add.f32 %f194, %f69, %f178;
288
+ add.f32 %f195, %f70, %f179;
289
+ add.f32 %f196, %f71, %f180;
290
+ add.f32 %f197, %f72, %f181;
291
+ add.f32 %f198, %f73, %f182;
292
+ add.f32 %f199, %f74, %f183;
293
+ add.f32 %f200, %f75, %f184;
294
+ add.f32 %f201, %f76, %f185;
295
+ add.f32 %f202, %f77, %f186;
296
+ add.f32 %f203, %f78, %f187;
297
+ add.f32 %f204, %f79, %f188;
298
+ add.f32 %f205, %f80, %f189;
299
+ $L__tmp1:
300
+ .loc 2 96 20
301
+ sub.f32 %f206, %f190, %f598;
302
+ sub.f32 %f207, %f191, %f599;
303
+ sub.f32 %f208, %f192, %f600;
304
+ sub.f32 %f209, %f193, %f601;
305
+ sub.f32 %f210, %f194, %f602;
306
+ sub.f32 %f211, %f195, %f603;
307
+ sub.f32 %f212, %f196, %f604;
308
+ sub.f32 %f213, %f197, %f605;
309
+ sub.f32 %f214, %f198, %f606;
310
+ sub.f32 %f215, %f199, %f607;
311
+ sub.f32 %f216, %f200, %f608;
312
+ sub.f32 %f217, %f201, %f609;
313
+ sub.f32 %f218, %f202, %f610;
314
+ sub.f32 %f219, %f203, %f611;
315
+ sub.f32 %f220, %f204, %f612;
316
+ sub.f32 %f221, %f205, %f613;
317
+ .loc 2 97 26
318
+ add.f32 %f550, %f550, 0f3F800000;
319
+ add.f32 %f551, %f551, 0f3F800000;
320
+ add.f32 %f552, %f552, 0f3F800000;
321
+ add.f32 %f553, %f553, 0f3F800000;
322
+ add.f32 %f554, %f554, 0f3F800000;
323
+ add.f32 %f555, %f555, 0f3F800000;
324
+ add.f32 %f556, %f556, 0f3F800000;
325
+ add.f32 %f557, %f557, 0f3F800000;
326
+ add.f32 %f558, %f558, 0f3F800000;
327
+ add.f32 %f559, %f559, 0f3F800000;
328
+ add.f32 %f560, %f560, 0f3F800000;
329
+ add.f32 %f561, %f561, 0f3F800000;
330
+ add.f32 %f562, %f562, 0f3F800000;
331
+ add.f32 %f563, %f563, 0f3F800000;
332
+ add.f32 %f564, %f564, 0f3F800000;
333
+ add.f32 %f565, %f565, 0f3F800000;
334
+ add.f32 %f566, %f566, 0f3F800000;
335
+ add.f32 %f567, %f567, 0f3F800000;
336
+ add.f32 %f568, %f568, 0f3F800000;
337
+ add.f32 %f569, %f569, 0f3F800000;
338
+ add.f32 %f570, %f570, 0f3F800000;
339
+ add.f32 %f571, %f571, 0f3F800000;
340
+ add.f32 %f572, %f572, 0f3F800000;
341
+ add.f32 %f573, %f573, 0f3F800000;
342
+ add.f32 %f574, %f574, 0f3F800000;
343
+ add.f32 %f575, %f575, 0f3F800000;
344
+ add.f32 %f576, %f576, 0f3F800000;
345
+ add.f32 %f577, %f577, 0f3F800000;
346
+ add.f32 %f578, %f578, 0f3F800000;
347
+ add.f32 %f579, %f579, 0f3F800000;
348
+ add.f32 %f580, %f580, 0f3F800000;
349
+ add.f32 %f581, %f581, 0f3F800000;
350
+ .loc 2 98 30
351
+ mov.b32 %r98, %f206;
352
+ mov.b32 %r99, %f550;
353
+ div.full.f32 %r97, %r98, %r99;
354
+ mov.b32 %f222, %r97;
355
+ mov.b32 %r101, %f207;
356
+ mov.b32 %r102, %f551;
357
+ div.full.f32 %r100, %r101, %r102;
358
+ mov.b32 %f223, %r100;
359
+ mov.b32 %r104, %f208;
360
+ mov.b32 %r105, %f552;
361
+ div.full.f32 %r103, %r104, %r105;
362
+ mov.b32 %f224, %r103;
363
+ mov.b32 %r107, %f209;
364
+ mov.b32 %r108, %f553;
365
+ div.full.f32 %r106, %r107, %r108;
366
+ mov.b32 %f225, %r106;
367
+ mov.b32 %r110, %f210;
368
+ mov.b32 %r111, %f554;
369
+ div.full.f32 %r109, %r110, %r111;
370
+ mov.b32 %f226, %r109;
371
+ mov.b32 %r113, %f211;
372
+ mov.b32 %r114, %f555;
373
+ div.full.f32 %r112, %r113, %r114;
374
+ mov.b32 %f227, %r112;
375
+ mov.b32 %r116, %f212;
376
+ mov.b32 %r117, %f556;
377
+ div.full.f32 %r115, %r116, %r117;
378
+ mov.b32 %f228, %r115;
379
+ mov.b32 %r119, %f213;
380
+ mov.b32 %r120, %f557;
381
+ div.full.f32 %r118, %r119, %r120;
382
+ mov.b32 %f229, %r118;
383
+ mov.b32 %r122, %f214;
384
+ mov.b32 %r123, %f558;
385
+ div.full.f32 %r121, %r122, %r123;
386
+ mov.b32 %f230, %r121;
387
+ mov.b32 %r125, %f215;
388
+ mov.b32 %r126, %f559;
389
+ div.full.f32 %r124, %r125, %r126;
390
+ mov.b32 %f231, %r124;
391
+ mov.b32 %r128, %f216;
392
+ mov.b32 %r129, %f560;
393
+ div.full.f32 %r127, %r128, %r129;
394
+ mov.b32 %f232, %r127;
395
+ mov.b32 %r131, %f217;
396
+ mov.b32 %r132, %f561;
397
+ div.full.f32 %r130, %r131, %r132;
398
+ mov.b32 %f233, %r130;
399
+ mov.b32 %r134, %f218;
400
+ mov.b32 %r135, %f562;
401
+ div.full.f32 %r133, %r134, %r135;
402
+ mov.b32 %f234, %r133;
403
+ mov.b32 %r137, %f219;
404
+ mov.b32 %r138, %f563;
405
+ div.full.f32 %r136, %r137, %r138;
406
+ mov.b32 %f235, %r136;
407
+ mov.b32 %r140, %f220;
408
+ mov.b32 %r141, %f564;
409
+ div.full.f32 %r139, %r140, %r141;
410
+ mov.b32 %f236, %r139;
411
+ mov.b32 %r143, %f221;
412
+ mov.b32 %r144, %f565;
413
+ div.full.f32 %r142, %r143, %r144;
414
+ mov.b32 %f237, %r142;
415
+ .loc 2 98 22
416
+ add.f32 %f598, %f598, %f222;
417
+ add.f32 %f599, %f599, %f223;
418
+ add.f32 %f600, %f600, %f224;
419
+ add.f32 %f601, %f601, %f225;
420
+ add.f32 %f602, %f602, %f226;
421
+ add.f32 %f603, %f603, %f227;
422
+ add.f32 %f604, %f604, %f228;
423
+ add.f32 %f605, %f605, %f229;
424
+ add.f32 %f606, %f606, %f230;
425
+ add.f32 %f607, %f607, %f231;
426
+ add.f32 %f608, %f608, %f232;
427
+ add.f32 %f609, %f609, %f233;
428
+ add.f32 %f610, %f610, %f234;
429
+ add.f32 %f611, %f611, %f235;
430
+ add.f32 %f612, %f612, %f236;
431
+ add.f32 %f613, %f613, %f237;
432
+ .loc 2 101 30
433
+ sub.f32 %f238, %f190, %f598;
434
+ sub.f32 %f239, %f191, %f599;
435
+ sub.f32 %f240, %f192, %f600;
436
+ sub.f32 %f241, %f193, %f601;
437
+ sub.f32 %f242, %f194, %f602;
438
+ sub.f32 %f243, %f195, %f603;
439
+ sub.f32 %f244, %f196, %f604;
440
+ sub.f32 %f245, %f197, %f605;
441
+ sub.f32 %f246, %f198, %f606;
442
+ sub.f32 %f247, %f199, %f607;
443
+ sub.f32 %f248, %f200, %f608;
444
+ sub.f32 %f249, %f201, %f609;
445
+ sub.f32 %f250, %f202, %f610;
446
+ sub.f32 %f251, %f203, %f611;
447
+ sub.f32 %f252, %f204, %f612;
448
+ sub.f32 %f253, %f205, %f613;
449
+ $L__tmp2:
450
+ .loc 1 47 48
451
+ fma.rn.f32 %f582, %f206, %f238, %f582;
452
+ fma.rn.f32 %f583, %f207, %f239, %f583;
453
+ fma.rn.f32 %f584, %f208, %f240, %f584;
454
+ fma.rn.f32 %f585, %f209, %f241, %f585;
455
+ fma.rn.f32 %f586, %f210, %f242, %f586;
456
+ fma.rn.f32 %f587, %f211, %f243, %f587;
457
+ fma.rn.f32 %f588, %f212, %f244, %f588;
458
+ fma.rn.f32 %f589, %f213, %f245, %f589;
459
+ fma.rn.f32 %f590, %f214, %f246, %f590;
460
+ fma.rn.f32 %f591, %f215, %f247, %f591;
461
+ fma.rn.f32 %f592, %f216, %f248, %f592;
462
+ fma.rn.f32 %f593, %f217, %f249, %f593;
463
+ fma.rn.f32 %f594, %f218, %f250, %f594;
464
+ fma.rn.f32 %f595, %f219, %f251, %f595;
465
+ fma.rn.f32 %f596, %f220, %f252, %f596;
466
+ fma.rn.f32 %f597, %f221, %f253, %f597;
467
+ .loc 1 31 36
468
+ add.s64 %rd105, %rd105, 256;
469
+ add.s32 %r406, %r406, 64;
470
+ setp.lt.u32 %p62, %r406, 192;
471
+ @%p62 bra $L__BB0_1;
472
+ bra.uni $L__BB0_4;
473
+ $L__BB0_1:
474
+ .loc 1 39 40
475
+ setp.lt.u64 %p41, %rd1, 50257;
476
+ .loc 1 35 34
477
+ add.s64 %rd67, %rd6, %rd105;
478
+ add.s64 %rd68, %rd67, 16;
479
+ add.s64 %rd69, %rd5, %rd105;
480
+ .loc 1 35 50
481
+ add.s64 %rd70, %rd69, 16;
482
+ mov.b32 %r342, 0;
483
+ mov.u32 %r32, 0x0;
484
+ mov.u32 %r33, 0x0;
485
+ mov.u32 %r34, 0x0;
486
+ mov.u32 %r35, 0x0;
487
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r32, %r33, %r34, %r35 }, [ %rd67 + 0 ];
488
+ @!%p1 mov.u32 %r32, %r342;
489
+ @!%p1 mov.u32 %r33, %r342;
490
+ @!%p1 mov.u32 %r34, %r342;
491
+ @!%p1 mov.u32 %r35, %r342;
492
+ mov.b32 %f65, %r32;
493
+ mov.b32 %f66, %r33;
494
+ mov.b32 %f67, %r34;
495
+ mov.b32 %f68, %r35;
496
+ mov.u32 %r40, 0x0;
497
+ mov.u32 %r41, 0x0;
498
+ mov.u32 %r42, 0x0;
499
+ mov.u32 %r43, 0x0;
500
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r40, %r41, %r42, %r43 }, [ %rd68 + 0 ];
501
+ @!%p1 mov.u32 %r40, %r342;
502
+ @!%p1 mov.u32 %r41, %r342;
503
+ @!%p1 mov.u32 %r42, %r342;
504
+ @!%p1 mov.u32 %r43, %r342;
505
+ mov.b32 %f69, %r40;
506
+ mov.b32 %f70, %r41;
507
+ mov.b32 %f71, %r42;
508
+ mov.b32 %f72, %r43;
509
+ mov.u32 %r48, 0x0;
510
+ mov.u32 %r49, 0x0;
511
+ mov.u32 %r50, 0x0;
512
+ mov.u32 %r51, 0x0;
513
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r48, %r49, %r50, %r51 }, [ %rd69 + 0 ];
514
+ @!%p1 mov.u32 %r48, %r342;
515
+ @!%p1 mov.u32 %r49, %r342;
516
+ @!%p1 mov.u32 %r50, %r342;
517
+ @!%p1 mov.u32 %r51, %r342;
518
+ mov.b32 %f73, %r48;
519
+ mov.b32 %f74, %r49;
520
+ mov.b32 %f75, %r50;
521
+ mov.b32 %f76, %r51;
522
+ mov.u32 %r56, 0x0;
523
+ mov.u32 %r57, 0x0;
524
+ mov.u32 %r58, 0x0;
525
+ mov.u32 %r59, 0x0;
526
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r56, %r57, %r58, %r59 }, [ %rd70 + 0 ];
527
+ @!%p1 mov.u32 %r56, %r342;
528
+ @!%p1 mov.u32 %r57, %r342;
529
+ @!%p1 mov.u32 %r58, %r342;
530
+ @!%p1 mov.u32 %r59, %r342;
531
+ mov.b32 %f77, %r56;
532
+ mov.b32 %f78, %r57;
533
+ mov.b32 %f79, %r58;
534
+ mov.b32 %f80, %r59;
535
+ mov.b32 %r405, 883;
536
+ mov.u64 %rd104, 1;
537
+ .loc 1 39 55
538
+ @%p41 bra $L__BB0_3;
539
+ mov.u64 %rd71, assertMessage_0;
540
+ cvta.global.u64 %rd72, %rd71;
541
+ mov.u64 %rd73, assertFile_0;
542
+ cvta.global.u64 %rd74, %rd73;
543
+ mov.u64 %rd75, assertFunc_0;
544
+ cvta.global.u64 %rd76, %rd75;
545
+ { // callseq 6, 0
546
+ .reg .b32 temp_param_reg;
547
+ .param .b64 param0;
548
+ st.param.b64 [param0+0], %rd72;
549
+ .param .b64 param1;
550
+ st.param.b64 [param1+0], %rd74;
551
+ .param .b32 param2;
552
+ st.param.b32 [param2+0], %r405;
553
+ .param .b64 param3;
554
+ st.param.b64 [param3+0], %rd76;
555
+ .param .b64 param4;
556
+ st.param.b64 [param4+0], %rd104;
557
+ call.uni
558
+ __assertfail,
559
+ (
560
+ param0,
561
+ param1,
562
+ param2,
563
+ param3,
564
+ param4
565
+ );
566
+ } // callseq 6
567
+ bra.uni $L__BB0_3;
568
+ $L__BB0_4:
569
+ .loc 1 31 36
570
+ and.b32 %r254, %r4, 3;
571
+ mad.lo.s32 %r255, %r254, 72, %r2;
572
+ shl.b32 %r256, %r255, 2;
573
+ mov.u32 %r257, global_smem;
574
+ add.s32 %r258, %r257, %r256;
575
+ st.shared.f32 [%r258], %f566;
576
+ st.shared.f32 [%r258+1152], %f567;
577
+ st.shared.f32 [%r258+2304], %f568;
578
+ st.shared.f32 [%r258+3456], %f569;
579
+ st.shared.f32 [%r258+4608], %f570;
580
+ st.shared.f32 [%r258+5760], %f571;
581
+ st.shared.f32 [%r258+6912], %f572;
582
+ st.shared.f32 [%r258+8064], %f573;
583
+ bar.sync 0;
584
+ mad.lo.s32 %r259, %r1, 72, %r3;
585
+ shl.b32 %r260, %r259, 2;
586
+ add.s32 %r261, %r257, %r260;
587
+ ld.shared.v4.f32 {%f254, %f255, %f256, %f257}, [%r261];
588
+ ld.shared.v4.f32 {%f258, %f259, %f260, %f261}, [%r261+16];
589
+ bar.sync 0;
590
+ st.shared.f32 [%r258], %f574;
591
+ st.shared.f32 [%r258+1152], %f575;
592
+ st.shared.f32 [%r258+2304], %f576;
593
+ st.shared.f32 [%r258+3456], %f577;
594
+ st.shared.f32 [%r258+4608], %f578;
595
+ st.shared.f32 [%r258+5760], %f579;
596
+ st.shared.f32 [%r258+6912], %f580;
597
+ st.shared.f32 [%r258+8064], %f581;
598
+ bar.sync 0;
599
+ ld.shared.v4.f32 {%f262, %f263, %f264, %f265}, [%r261];
600
+ ld.shared.v4.f32 {%f266, %f267, %f268, %f269}, [%r261+16];
601
+ $L__tmp3:
602
+ .loc 2 108 21
603
+ sub.f32 %f270, %f599, %f598;
604
+ .loc 2 109 28
605
+ add.f32 %f271, %f254, %f255;
606
+ .loc 2 110 39
607
+ setp.eq.f32 %p63, %f271, 0f00000000;
608
+ .loc 2 110 60
609
+ mov.b32 %r146, %f255;
610
+ mov.b32 %r147, %f271;
611
+ div.full.f32 %r145, %r146, %r147;
612
+ mov.b32 %f272, %r145;
613
+ .loc 2 110 49
614
+ selp.f32 %f273, 0f00000000, %f272, %p63;
615
+ .loc 2 112 17
616
+ fma.rn.f32 %f274, %f270, %f273, %f598;
617
+ .loc 2 113 15
618
+ add.f32 %f275, %f582, %f583;
619
+ .loc 2 113 30
620
+ mul.f32 %f276, %f270, %f270;
621
+ .loc 2 113 38
622
+ mul.f32 %f277, %f276, %f254;
623
+ .loc 2 113 22
624
+ fma.rn.f32 %f278, %f277, %f273, %f275;
625
+ .loc 2 108 21
626
+ sub.f32 %f279, %f600, %f274;
627
+ .loc 2 109 28
628
+ add.f32 %f280, %f256, %f271;
629
+ .loc 2 110 39
630
+ setp.eq.f32 %p64, %f280, 0f00000000;
631
+ .loc 2 110 60
632
+ mov.b32 %r150, %f280;
633
+ mov.b32 %r149, %f256;
634
+ div.full.f32 %r148, %r149, %r150;
635
+ mov.b32 %f281, %r148;
636
+ .loc 2 110 49
637
+ selp.f32 %f282, 0f00000000, %f281, %p64;
638
+ .loc 2 112 17
639
+ fma.rn.f32 %f283, %f282, %f279, %f274;
640
+ .loc 2 113 15
641
+ add.f32 %f284, %f584, %f278;
642
+ .loc 2 113 30
643
+ mul.f32 %f285, %f279, %f279;
644
+ .loc 2 113 38
645
+ mul.f32 %f286, %f271, %f285;
646
+ .loc 2 113 22
647
+ fma.rn.f32 %f287, %f282, %f286, %f284;
648
+ .loc 2 108 21
649
+ sub.f32 %f288, %f601, %f283;
650
+ .loc 2 109 28
651
+ add.f32 %f289, %f257, %f280;
652
+ .loc 2 110 39
653
+ setp.eq.f32 %p65, %f289, 0f00000000;
654
+ .loc 2 110 60
655
+ mov.b32 %r153, %f289;
656
+ mov.b32 %r152, %f257;
657
+ div.full.f32 %r151, %r152, %r153;
658
+ mov.b32 %f290, %r151;
659
+ .loc 2 110 49
660
+ selp.f32 %f291, 0f00000000, %f290, %p65;
661
+ .loc 2 112 17
662
+ fma.rn.f32 %f292, %f291, %f288, %f283;
663
+ .loc 2 113 15
664
+ add.f32 %f293, %f585, %f287;
665
+ .loc 2 113 30
666
+ mul.f32 %f294, %f288, %f288;
667
+ .loc 2 113 38
668
+ mul.f32 %f295, %f280, %f294;
669
+ .loc 2 113 22
670
+ fma.rn.f32 %f296, %f291, %f295, %f293;
671
+ .loc 2 108 21
672
+ sub.f32 %f297, %f602, %f292;
673
+ .loc 2 109 28
674
+ add.f32 %f298, %f258, %f289;
675
+ .loc 2 110 39
676
+ setp.eq.f32 %p66, %f298, 0f00000000;
677
+ .loc 2 110 60
678
+ mov.b32 %r156, %f298;
679
+ mov.b32 %r155, %f258;
680
+ div.full.f32 %r154, %r155, %r156;
681
+ mov.b32 %f299, %r154;
682
+ .loc 2 110 49
683
+ selp.f32 %f300, 0f00000000, %f299, %p66;
684
+ .loc 2 112 17
685
+ fma.rn.f32 %f301, %f300, %f297, %f292;
686
+ .loc 2 113 15
687
+ add.f32 %f302, %f586, %f296;
688
+ .loc 2 113 30
689
+ mul.f32 %f303, %f297, %f297;
690
+ .loc 2 113 38
691
+ mul.f32 %f304, %f289, %f303;
692
+ .loc 2 113 22
693
+ fma.rn.f32 %f305, %f300, %f304, %f302;
694
+ .loc 2 108 21
695
+ sub.f32 %f306, %f603, %f301;
696
+ .loc 2 109 28
697
+ add.f32 %f307, %f259, %f298;
698
+ .loc 2 110 39
699
+ setp.eq.f32 %p67, %f307, 0f00000000;
700
+ .loc 2 110 60
701
+ mov.b32 %r159, %f307;
702
+ mov.b32 %r158, %f259;
703
+ div.full.f32 %r157, %r158, %r159;
704
+ mov.b32 %f308, %r157;
705
+ .loc 2 110 49
706
+ selp.f32 %f309, 0f00000000, %f308, %p67;
707
+ .loc 2 112 17
708
+ fma.rn.f32 %f310, %f309, %f306, %f301;
709
+ .loc 2 113 15
710
+ add.f32 %f311, %f587, %f305;
711
+ .loc 2 113 30
712
+ mul.f32 %f312, %f306, %f306;
713
+ .loc 2 113 38
714
+ mul.f32 %f313, %f298, %f312;
715
+ .loc 2 113 22
716
+ fma.rn.f32 %f314, %f309, %f313, %f311;
717
+ .loc 2 108 21
718
+ sub.f32 %f315, %f604, %f310;
719
+ .loc 2 109 28
720
+ add.f32 %f316, %f260, %f307;
721
+ .loc 2 110 39
722
+ setp.eq.f32 %p68, %f316, 0f00000000;
723
+ .loc 2 110 60
724
+ mov.b32 %r162, %f316;
725
+ mov.b32 %r161, %f260;
726
+ div.full.f32 %r160, %r161, %r162;
727
+ mov.b32 %f317, %r160;
728
+ .loc 2 110 49
729
+ selp.f32 %f318, 0f00000000, %f317, %p68;
730
+ .loc 2 112 17
731
+ fma.rn.f32 %f319, %f318, %f315, %f310;
732
+ .loc 2 113 15
733
+ add.f32 %f320, %f588, %f314;
734
+ .loc 2 113 30
735
+ mul.f32 %f321, %f315, %f315;
736
+ .loc 2 113 38
737
+ mul.f32 %f322, %f307, %f321;
738
+ .loc 2 113 22
739
+ fma.rn.f32 %f323, %f318, %f322, %f320;
740
+ .loc 2 108 21
741
+ sub.f32 %f324, %f605, %f319;
742
+ .loc 2 109 28
743
+ add.f32 %f325, %f261, %f316;
744
+ .loc 2 110 39
745
+ setp.eq.f32 %p69, %f325, 0f00000000;
746
+ .loc 2 110 60
747
+ mov.b32 %r165, %f325;
748
+ mov.b32 %r164, %f261;
749
+ div.full.f32 %r163, %r164, %r165;
750
+ mov.b32 %f326, %r163;
751
+ .loc 2 110 49
752
+ selp.f32 %f327, 0f00000000, %f326, %p69;
753
+ .loc 2 112 17
754
+ fma.rn.f32 %f328, %f327, %f324, %f319;
755
+ .loc 2 113 15
756
+ add.f32 %f329, %f589, %f323;
757
+ .loc 2 113 30
758
+ mul.f32 %f330, %f324, %f324;
759
+ .loc 2 113 38
760
+ mul.f32 %f331, %f316, %f330;
761
+ .loc 2 113 22
762
+ fma.rn.f32 %f332, %f327, %f331, %f329;
763
+ .loc 2 108 21
764
+ sub.f32 %f333, %f607, %f606;
765
+ .loc 2 109 28
766
+ add.f32 %f334, %f262, %f263;
767
+ .loc 2 110 39
768
+ setp.eq.f32 %p70, %f334, 0f00000000;
769
+ .loc 2 110 60
770
+ mov.b32 %r167, %f263;
771
+ mov.b32 %r168, %f334;
772
+ div.full.f32 %r166, %r167, %r168;
773
+ mov.b32 %f335, %r166;
774
+ .loc 2 110 49
775
+ selp.f32 %f336, 0f00000000, %f335, %p70;
776
+ .loc 2 112 17
777
+ fma.rn.f32 %f337, %f333, %f336, %f606;
778
+ .loc 2 113 15
779
+ add.f32 %f338, %f590, %f591;
780
+ .loc 2 113 30
781
+ mul.f32 %f339, %f333, %f333;
782
+ .loc 2 113 38
783
+ mul.f32 %f340, %f339, %f262;
784
+ .loc 2 113 22
785
+ fma.rn.f32 %f341, %f340, %f336, %f338;
786
+ .loc 2 108 21
787
+ sub.f32 %f342, %f608, %f337;
788
+ .loc 2 109 28
789
+ add.f32 %f343, %f264, %f334;
790
+ .loc 2 110 39
791
+ setp.eq.f32 %p71, %f343, 0f00000000;
792
+ .loc 2 110 60
793
+ mov.b32 %r171, %f343;
794
+ mov.b32 %r170, %f264;
795
+ div.full.f32 %r169, %r170, %r171;
796
+ mov.b32 %f344, %r169;
797
+ .loc 2 110 49
798
+ selp.f32 %f345, 0f00000000, %f344, %p71;
799
+ .loc 2 112 17
800
+ fma.rn.f32 %f346, %f345, %f342, %f337;
801
+ .loc 2 113 15
802
+ add.f32 %f347, %f592, %f341;
803
+ .loc 2 113 30
804
+ mul.f32 %f348, %f342, %f342;
805
+ .loc 2 113 38
806
+ mul.f32 %f349, %f334, %f348;
807
+ .loc 2 113 22
808
+ fma.rn.f32 %f350, %f345, %f349, %f347;
809
+ .loc 2 108 21
810
+ sub.f32 %f351, %f609, %f346;
811
+ .loc 2 109 28
812
+ add.f32 %f352, %f265, %f343;
813
+ .loc 2 110 39
814
+ setp.eq.f32 %p72, %f352, 0f00000000;
815
+ .loc 2 110 60
816
+ mov.b32 %r174, %f352;
817
+ mov.b32 %r173, %f265;
818
+ div.full.f32 %r172, %r173, %r174;
819
+ mov.b32 %f353, %r172;
820
+ .loc 2 110 49
821
+ selp.f32 %f354, 0f00000000, %f353, %p72;
822
+ .loc 2 112 17
823
+ fma.rn.f32 %f355, %f354, %f351, %f346;
824
+ .loc 2 113 15
825
+ add.f32 %f356, %f593, %f350;
826
+ .loc 2 113 30
827
+ mul.f32 %f357, %f351, %f351;
828
+ .loc 2 113 38
829
+ mul.f32 %f358, %f343, %f357;
830
+ .loc 2 113 22
831
+ fma.rn.f32 %f359, %f354, %f358, %f356;
832
+ .loc 2 108 21
833
+ sub.f32 %f360, %f610, %f355;
834
+ .loc 2 109 28
835
+ add.f32 %f361, %f266, %f352;
836
+ .loc 2 110 39
837
+ setp.eq.f32 %p73, %f361, 0f00000000;
838
+ .loc 2 110 60
839
+ mov.b32 %r177, %f361;
840
+ mov.b32 %r176, %f266;
841
+ div.full.f32 %r175, %r176, %r177;
842
+ mov.b32 %f362, %r175;
843
+ .loc 2 110 49
844
+ selp.f32 %f363, 0f00000000, %f362, %p73;
845
+ .loc 2 112 17
846
+ fma.rn.f32 %f364, %f363, %f360, %f355;
847
+ .loc 2 113 15
848
+ add.f32 %f365, %f594, %f359;
849
+ .loc 2 113 30
850
+ mul.f32 %f366, %f360, %f360;
851
+ .loc 2 113 38
852
+ mul.f32 %f367, %f352, %f366;
853
+ .loc 2 113 22
854
+ fma.rn.f32 %f368, %f363, %f367, %f365;
855
+ .loc 2 108 21
856
+ sub.f32 %f369, %f611, %f364;
857
+ .loc 2 109 28
858
+ add.f32 %f370, %f267, %f361;
859
+ .loc 2 110 39
860
+ setp.eq.f32 %p74, %f370, 0f00000000;
861
+ .loc 2 110 60
862
+ mov.b32 %r180, %f370;
863
+ mov.b32 %r179, %f267;
864
+ div.full.f32 %r178, %r179, %r180;
865
+ mov.b32 %f371, %r178;
866
+ .loc 2 110 49
867
+ selp.f32 %f372, 0f00000000, %f371, %p74;
868
+ .loc 2 112 17
869
+ fma.rn.f32 %f373, %f372, %f369, %f364;
870
+ .loc 2 113 15
871
+ add.f32 %f374, %f595, %f368;
872
+ .loc 2 113 30
873
+ mul.f32 %f375, %f369, %f369;
874
+ .loc 2 113 38
875
+ mul.f32 %f376, %f361, %f375;
876
+ .loc 2 113 22
877
+ fma.rn.f32 %f377, %f372, %f376, %f374;
878
+ .loc 2 108 21
879
+ sub.f32 %f378, %f612, %f373;
880
+ .loc 2 109 28
881
+ add.f32 %f379, %f268, %f370;
882
+ .loc 2 110 39
883
+ setp.eq.f32 %p75, %f379, 0f00000000;
884
+ .loc 2 110 60
885
+ mov.b32 %r183, %f379;
886
+ mov.b32 %r182, %f268;
887
+ div.full.f32 %r181, %r182, %r183;
888
+ mov.b32 %f380, %r181;
889
+ .loc 2 110 49
890
+ selp.f32 %f381, 0f00000000, %f380, %p75;
891
+ .loc 2 112 17
892
+ fma.rn.f32 %f382, %f381, %f378, %f373;
893
+ .loc 2 113 15
894
+ add.f32 %f383, %f596, %f377;
895
+ .loc 2 113 30
896
+ mul.f32 %f384, %f378, %f378;
897
+ .loc 2 113 38
898
+ mul.f32 %f385, %f370, %f384;
899
+ .loc 2 113 22
900
+ fma.rn.f32 %f386, %f381, %f385, %f383;
901
+ .loc 2 108 21
902
+ sub.f32 %f387, %f613, %f382;
903
+ .loc 2 109 28
904
+ add.f32 %f388, %f269, %f379;
905
+ .loc 2 110 39
906
+ setp.eq.f32 %p76, %f388, 0f00000000;
907
+ .loc 2 110 60
908
+ mov.b32 %r186, %f388;
909
+ mov.b32 %r185, %f269;
910
+ div.full.f32 %r184, %r185, %r186;
911
+ mov.b32 %f389, %r184;
912
+ .loc 2 110 49
913
+ selp.f32 %f390, 0f00000000, %f389, %p76;
914
+ .loc 2 112 17
915
+ fma.rn.f32 %f391, %f390, %f387, %f382;
916
+ .loc 2 113 15
917
+ add.f32 %f392, %f597, %f386;
918
+ .loc 2 113 30
919
+ mul.f32 %f393, %f387, %f387;
920
+ .loc 2 113 38
921
+ mul.f32 %f394, %f379, %f393;
922
+ .loc 2 113 22
923
+ fma.rn.f32 %f395, %f390, %f394, %f392;
924
+ $L__tmp4:
925
+ .loc 2 120 46
926
+ mov.b32 %r262, %f328;
927
+ shfl.sync.bfly.b32 %r263, %r262, 4, 31, -1;
928
+ mov.b32 %f396, %r263;
929
+ mov.b32 %r264, %f332;
930
+ shfl.sync.bfly.b32 %r265, %r264, 4, 31, -1;
931
+ mov.b32 %f397, %r265;
932
+ shfl.sync.bfly.b32 %r188, %r165, 4, 31, -1;
933
+ mov.b32 %f398, %r188;
934
+ $L__tmp5:
935
+ .loc 2 108 21
936
+ sub.f32 %f399, %f396, %f328;
937
+ .loc 2 109 28
938
+ add.f32 %f400, %f325, %f398;
939
+ .loc 2 110 39
940
+ setp.eq.f32 %p77, %f400, 0f00000000;
941
+ .loc 2 110 60
942
+ mov.b32 %r189, %f400;
943
+ div.full.f32 %r187, %r188, %r189;
944
+ mov.b32 %f401, %r187;
945
+ .loc 2 110 49
946
+ selp.f32 %f402, 0f00000000, %f401, %p77;
947
+ .loc 2 112 17
948
+ fma.rn.f32 %f403, %f402, %f399, %f328;
949
+ .loc 2 113 15
950
+ add.f32 %f404, %f332, %f397;
951
+ .loc 2 113 30
952
+ mul.f32 %f405, %f399, %f399;
953
+ .loc 2 113 38
954
+ mul.f32 %f406, %f325, %f405;
955
+ .loc 2 113 22
956
+ fma.rn.f32 %f407, %f402, %f406, %f404;
957
+ $L__tmp6:
958
+ .loc 2 120 46
959
+ mov.b32 %r266, %f403;
960
+ shfl.sync.bfly.b32 %r267, %r266, 2, 31, -1;
961
+ mov.b32 %f408, %r267;
962
+ mov.b32 %r268, %f407;
963
+ shfl.sync.bfly.b32 %r269, %r268, 2, 31, -1;
964
+ mov.b32 %f409, %r269;
965
+ shfl.sync.bfly.b32 %r191, %r189, 2, 31, -1;
966
+ mov.b32 %f410, %r191;
967
+ $L__tmp7:
968
+ .loc 2 108 21
969
+ sub.f32 %f411, %f408, %f403;
970
+ .loc 2 109 28
971
+ add.f32 %f412, %f400, %f410;
972
+ .loc 2 110 39
973
+ setp.eq.f32 %p78, %f412, 0f00000000;
974
+ .loc 2 110 60
975
+ mov.b32 %r192, %f412;
976
+ div.full.f32 %r190, %r191, %r192;
977
+ mov.b32 %f413, %r190;
978
+ .loc 2 110 49
979
+ selp.f32 %f414, 0f00000000, %f413, %p78;
980
+ .loc 2 112 17
981
+ fma.rn.f32 %f415, %f414, %f411, %f403;
982
+ .loc 2 113 15
983
+ add.f32 %f416, %f407, %f409;
984
+ .loc 2 113 30
985
+ mul.f32 %f417, %f411, %f411;
986
+ .loc 2 113 38
987
+ mul.f32 %f418, %f400, %f417;
988
+ .loc 2 113 22
989
+ fma.rn.f32 %f419, %f414, %f418, %f416;
990
+ $L__tmp8:
991
+ .loc 2 120 46
992
+ mov.b32 %r270, %f415;
993
+ shfl.sync.bfly.b32 %r271, %r270, 1, 31, -1;
994
+ mov.b32 %f420, %r271;
995
+ mov.b32 %r272, %f419;
996
+ shfl.sync.bfly.b32 %r273, %r272, 1, 31, -1;
997
+ mov.b32 %f421, %r273;
998
+ shfl.sync.bfly.b32 %r194, %r192, 1, 31, -1;
999
+ mov.b32 %f422, %r194;
1000
+ $L__tmp9:
1001
+ .loc 2 108 21
1002
+ sub.f32 %f423, %f420, %f415;
1003
+ .loc 2 109 28
1004
+ add.f32 %f424, %f412, %f422;
1005
+ .loc 2 110 39
1006
+ setp.eq.f32 %p79, %f424, 0f00000000;
1007
+ .loc 2 110 60
1008
+ mov.b32 %r195, %f424;
1009
+ div.full.f32 %r193, %r194, %r195;
1010
+ mov.b32 %f425, %r193;
1011
+ .loc 2 110 49
1012
+ selp.f32 %f426, 0f00000000, %f425, %p79;
1013
+ .loc 2 112 17
1014
+ fma.rn.f32 %f145, %f423, %f426, %f415;
1015
+ .loc 2 113 15
1016
+ add.f32 %f427, %f419, %f421;
1017
+ .loc 2 113 30
1018
+ mul.f32 %f428, %f423, %f423;
1019
+ .loc 2 113 38
1020
+ mul.f32 %f429, %f412, %f428;
1021
+ .loc 2 113 22
1022
+ fma.rn.f32 %f430, %f426, %f429, %f427;
1023
+ $L__tmp10:
1024
+ .loc 2 120 46
1025
+ mov.b32 %r274, %f391;
1026
+ shfl.sync.bfly.b32 %r275, %r274, 4, 31, -1;
1027
+ mov.b32 %f431, %r275;
1028
+ mov.b32 %r276, %f395;
1029
+ shfl.sync.bfly.b32 %r277, %r276, 4, 31, -1;
1030
+ mov.b32 %f432, %r277;
1031
+ shfl.sync.bfly.b32 %r197, %r186, 4, 31, -1;
1032
+ mov.b32 %f433, %r197;
1033
+ $L__tmp11:
1034
+ .loc 2 108 21
1035
+ sub.f32 %f434, %f431, %f391;
1036
+ .loc 2 109 28
1037
+ add.f32 %f435, %f388, %f433;
1038
+ .loc 2 110 39
1039
+ setp.eq.f32 %p80, %f435, 0f00000000;
1040
+ .loc 2 110 60
1041
+ mov.b32 %r198, %f435;
1042
+ div.full.f32 %r196, %r197, %r198;
1043
+ mov.b32 %f436, %r196;
1044
+ .loc 2 110 49
1045
+ selp.f32 %f437, 0f00000000, %f436, %p80;
1046
+ .loc 2 112 17
1047
+ fma.rn.f32 %f438, %f434, %f437, %f391;
1048
+ .loc 2 113 15
1049
+ add.f32 %f439, %f395, %f432;
1050
+ .loc 2 113 30
1051
+ mul.f32 %f440, %f434, %f434;
1052
+ .loc 2 113 38
1053
+ mul.f32 %f441, %f388, %f440;
1054
+ .loc 2 113 22
1055
+ fma.rn.f32 %f442, %f441, %f437, %f439;
1056
+ $L__tmp12:
1057
+ .loc 2 120 46
1058
+ mov.b32 %r278, %f438;
1059
+ shfl.sync.bfly.b32 %r279, %r278, 2, 31, -1;
1060
+ mov.b32 %f443, %r279;
1061
+ mov.b32 %r280, %f442;
1062
+ shfl.sync.bfly.b32 %r281, %r280, 2, 31, -1;
1063
+ mov.b32 %f444, %r281;
1064
+ shfl.sync.bfly.b32 %r200, %r198, 2, 31, -1;
1065
+ mov.b32 %f445, %r200;
1066
+ $L__tmp13:
1067
+ .loc 2 108 21
1068
+ sub.f32 %f446, %f443, %f438;
1069
+ .loc 2 109 28
1070
+ add.f32 %f447, %f435, %f445;
1071
+ .loc 2 110 39
1072
+ setp.eq.f32 %p81, %f447, 0f00000000;
1073
+ .loc 2 110 60
1074
+ mov.b32 %r201, %f447;
1075
+ div.full.f32 %r199, %r200, %r201;
1076
+ mov.b32 %f448, %r199;
1077
+ .loc 2 110 49
1078
+ selp.f32 %f449, 0f00000000, %f448, %p81;
1079
+ .loc 2 112 17
1080
+ fma.rn.f32 %f450, %f446, %f449, %f438;
1081
+ .loc 2 113 15
1082
+ add.f32 %f451, %f442, %f444;
1083
+ .loc 2 113 30
1084
+ mul.f32 %f452, %f446, %f446;
1085
+ .loc 2 113 38
1086
+ mul.f32 %f453, %f435, %f452;
1087
+ .loc 2 113 22
1088
+ fma.rn.f32 %f454, %f449, %f453, %f451;
1089
+ $L__tmp14:
1090
+ .loc 2 120 46
1091
+ mov.b32 %r282, %f450;
1092
+ shfl.sync.bfly.b32 %r283, %r282, 1, 31, -1;
1093
+ mov.b32 %f455, %r283;
1094
+ mov.b32 %r284, %f454;
1095
+ shfl.sync.bfly.b32 %r285, %r284, 1, 31, -1;
1096
+ mov.b32 %f456, %r285;
1097
+ shfl.sync.bfly.b32 %r203, %r201, 1, 31, -1;
1098
+ mov.b32 %f457, %r203;
1099
+ $L__tmp15:
1100
+ .loc 2 108 21
1101
+ sub.f32 %f458, %f455, %f450;
1102
+ .loc 2 109 28
1103
+ add.f32 %f459, %f447, %f457;
1104
+ .loc 2 110 39
1105
+ setp.eq.f32 %p82, %f459, 0f00000000;
1106
+ .loc 2 110 60
1107
+ mov.b32 %r204, %f459;
1108
+ div.full.f32 %r202, %r203, %r204;
1109
+ mov.b32 %f460, %r202;
1110
+ .loc 2 110 49
1111
+ selp.f32 %f461, 0f00000000, %f460, %p82;
1112
+ .loc 2 112 17
1113
+ fma.rn.f32 %f146, %f458, %f461, %f450;
1114
+ .loc 2 113 15
1115
+ add.f32 %f462, %f454, %f456;
1116
+ .loc 2 113 30
1117
+ mul.f32 %f463, %f458, %f458;
1118
+ .loc 2 113 38
1119
+ mul.f32 %f464, %f447, %f463;
1120
+ .loc 2 113 22
1121
+ fma.rn.f32 %f465, %f461, %f464, %f462;
1122
+ $L__tmp16:
1123
+ .loc 1 69 23
1124
+ mov.b32 %r206, %f430;
1125
+ mov.b32 %r207, 1132462080;
1126
+ div.full.f32 %r205, %r206, %r207;
1127
+ mov.b32 %f466, %r205;
1128
+ mov.b32 %r230, %f465;
1129
+ div.full.f32 %r229, %r230, %r207;
1130
+ mov.b32 %f467, %r229;
1131
+ .loc 1 71 24
1132
+ add.f32 %f147, %f466, 0f3727C5AC;
1133
+ add.f32 %f148, %f467, 0f3727C5AC;
1134
+ .loc 1 55 36
1135
+ add.s64 %rd9, %rd12, %rd2;
1136
+ shl.b32 %r286, %r11, 14;
1137
+ shl.b32 %r287, %r1, 8;
1138
+ or.b32 %r288, %r286, %r287;
1139
+ or.b32 %r8, %r288, %r3;
1140
+ mov.u64 %rd106, 0;
1141
+ mov.b32 %r407, -64;
1142
+ rsqrt.approx.ftz.f32 %f516, %f147;
1143
+ rsqrt.approx.ftz.f32 %f517, %f148;
1144
+ bra.uni $L__BB0_5;
1145
+ $L__BB0_7:
1146
+ .loc 1 65 35
1147
+ add.s64 %rd96, %rd4, %rd106;
1148
+ add.s64 %rd97, %rd96, 16;
1149
+ add.s64 %rd98, %rd3, %rd106;
1150
+ .loc 1 65 54
1151
+ add.s64 %rd99, %rd98, 16;
1152
+ mov.u32 %r338, 0x0;
1153
+ mov.u32 %r339, 0x0;
1154
+ mov.u32 %r340, 0x0;
1155
+ mov.u32 %r341, 0x0;
1156
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r338, %r339, %r340, %r341 }, [ %rd96 + 0 ];
1157
+ @!%p1 mov.u32 %r338, %r342;
1158
+ @!%p1 mov.u32 %r339, %r342;
1159
+ @!%p1 mov.u32 %r340, %r342;
1160
+ @!%p1 mov.u32 %r341, %r342;
1161
+ mov.b32 %f468, %r338;
1162
+ mov.b32 %f469, %r339;
1163
+ mov.b32 %f470, %r340;
1164
+ mov.b32 %f471, %r341;
1165
+ mov.u32 %r346, 0x0;
1166
+ mov.u32 %r347, 0x0;
1167
+ mov.u32 %r348, 0x0;
1168
+ mov.u32 %r349, 0x0;
1169
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r346, %r347, %r348, %r349 }, [ %rd97 + 0 ];
1170
+ @!%p1 mov.u32 %r346, %r342;
1171
+ @!%p1 mov.u32 %r347, %r342;
1172
+ @!%p1 mov.u32 %r348, %r342;
1173
+ @!%p1 mov.u32 %r349, %r342;
1174
+ mov.b32 %f472, %r346;
1175
+ mov.b32 %f473, %r347;
1176
+ mov.b32 %f474, %r348;
1177
+ mov.b32 %f475, %r349;
1178
+ mov.u32 %r354, 0x0;
1179
+ mov.u32 %r355, 0x0;
1180
+ mov.u32 %r356, 0x0;
1181
+ mov.u32 %r357, 0x0;
1182
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r354, %r355, %r356, %r357 }, [ %rd98 + 0 ];
1183
+ @!%p1 mov.u32 %r354, %r342;
1184
+ @!%p1 mov.u32 %r355, %r342;
1185
+ @!%p1 mov.u32 %r356, %r342;
1186
+ @!%p1 mov.u32 %r357, %r342;
1187
+ mov.b32 %f476, %r354;
1188
+ mov.b32 %f477, %r355;
1189
+ mov.b32 %f478, %r356;
1190
+ mov.b32 %f479, %r357;
1191
+ mov.u32 %r362, 0x0;
1192
+ mov.u32 %r363, 0x0;
1193
+ mov.u32 %r364, 0x0;
1194
+ mov.u32 %r365, 0x0;
1195
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r362, %r363, %r364, %r365 }, [ %rd99 + 0 ];
1196
+ @!%p1 mov.u32 %r362, %r342;
1197
+ @!%p1 mov.u32 %r363, %r342;
1198
+ @!%p1 mov.u32 %r364, %r342;
1199
+ @!%p1 mov.u32 %r365, %r342;
1200
+ mov.b32 %f480, %r362;
1201
+ mov.b32 %f481, %r363;
1202
+ mov.b32 %f482, %r364;
1203
+ mov.b32 %f483, %r365;
1204
+ .loc 1 66 24
1205
+ add.f32 %f484, %f149, %f468;
1206
+ add.f32 %f485, %f150, %f469;
1207
+ add.f32 %f486, %f151, %f470;
1208
+ add.f32 %f487, %f152, %f471;
1209
+ add.f32 %f488, %f153, %f472;
1210
+ add.f32 %f489, %f154, %f473;
1211
+ add.f32 %f490, %f155, %f474;
1212
+ add.f32 %f491, %f156, %f475;
1213
+ add.f32 %f492, %f157, %f476;
1214
+ add.f32 %f493, %f158, %f477;
1215
+ add.f32 %f494, %f159, %f478;
1216
+ add.f32 %f495, %f160, %f479;
1217
+ add.f32 %f496, %f161, %f480;
1218
+ add.f32 %f497, %f162, %f481;
1219
+ add.f32 %f498, %f163, %f482;
1220
+ add.f32 %f499, %f164, %f483;
1221
+ .loc 1 67 24
1222
+ sub.f32 %f500, %f484, %f145;
1223
+ sub.f32 %f501, %f485, %f145;
1224
+ sub.f32 %f502, %f486, %f145;
1225
+ sub.f32 %f503, %f487, %f145;
1226
+ sub.f32 %f504, %f488, %f145;
1227
+ sub.f32 %f505, %f489, %f145;
1228
+ sub.f32 %f506, %f490, %f145;
1229
+ sub.f32 %f507, %f491, %f145;
1230
+ sub.f32 %f508, %f492, %f146;
1231
+ sub.f32 %f509, %f493, %f146;
1232
+ sub.f32 %f510, %f494, %f146;
1233
+ sub.f32 %f511, %f495, %f146;
1234
+ sub.f32 %f512, %f496, %f146;
1235
+ sub.f32 %f513, %f497, %f146;
1236
+ sub.f32 %f514, %f498, %f146;
1237
+ sub.f32 %f515, %f499, %f146;
1238
+ .loc 1 73 24
1239
+ mul.f32 %f518, %f500, %f516;
1240
+ mul.f32 %f519, %f501, %f516;
1241
+ mul.f32 %f520, %f502, %f516;
1242
+ mul.f32 %f521, %f503, %f516;
1243
+ mul.f32 %f522, %f504, %f516;
1244
+ mul.f32 %f523, %f505, %f516;
1245
+ mul.f32 %f524, %f506, %f516;
1246
+ mul.f32 %f525, %f507, %f516;
1247
+ mul.f32 %f526, %f508, %f517;
1248
+ mul.f32 %f527, %f509, %f517;
1249
+ mul.f32 %f528, %f510, %f517;
1250
+ mul.f32 %f529, %f511, %f517;
1251
+ mul.f32 %f530, %f512, %f517;
1252
+ mul.f32 %f531, %f513, %f517;
1253
+ mul.f32 %f532, %f514, %f517;
1254
+ mul.f32 %f533, %f515, %f517;
1255
+ .loc 1 74 24
1256
+ mul.f32 %f534, %f518, %f165;
1257
+ mul.f32 %f535, %f519, %f166;
1258
+ mul.f32 %f536, %f520, %f167;
1259
+ mul.f32 %f537, %f521, %f168;
1260
+ mul.f32 %f538, %f522, %f169;
1261
+ mul.f32 %f539, %f523, %f170;
1262
+ mul.f32 %f540, %f524, %f171;
1263
+ mul.f32 %f541, %f525, %f172;
1264
+ mul.f32 %f542, %f526, %f165;
1265
+ mul.f32 %f543, %f527, %f166;
1266
+ mul.f32 %f544, %f528, %f167;
1267
+ mul.f32 %f545, %f529, %f168;
1268
+ mul.f32 %f546, %f530, %f169;
1269
+ mul.f32 %f547, %f531, %f170;
1270
+ mul.f32 %f548, %f532, %f171;
1271
+ mul.f32 %f549, %f533, %f172;
1272
+ .loc 1 76 35
1273
+ add.s32 %r394, %r8, %r407;
1274
+ add.s32 %r395, %r394, 64;
1275
+ .loc 1 76 29
1276
+ add.s32 %r396, %r394, 8256;
1277
+ mul.wide.s32 %rd102, %r395, 2;
1278
+ add.s64 %rd100, %rd13, %rd102;
1279
+ mul.wide.s32 %rd103, %r396, 2;
1280
+ add.s64 %rd101, %rd13, %rd103;
1281
+ .loc 1 76 52
1282
+ mov.b32 %r370, %f534;
1283
+ cvt.rn.bf16.f32 %rs1, %r370;
1284
+ mov.b32 %r371, %f535;
1285
+ cvt.rn.bf16.f32 %rs2, %r371;
1286
+ mov.b32 %r372, %f536;
1287
+ cvt.rn.bf16.f32 %rs3, %r372;
1288
+ mov.b32 %r373, %f537;
1289
+ cvt.rn.bf16.f32 %rs4, %r373;
1290
+ mov.b32 %r374, %f538;
1291
+ cvt.rn.bf16.f32 %rs5, %r374;
1292
+ mov.b32 %r375, %f539;
1293
+ cvt.rn.bf16.f32 %rs6, %r375;
1294
+ mov.b32 %r376, %f540;
1295
+ cvt.rn.bf16.f32 %rs7, %r376;
1296
+ mov.b32 %r377, %f541;
1297
+ cvt.rn.bf16.f32 %rs8, %r377;
1298
+ mov.b32 %r378, %f542;
1299
+ cvt.rn.bf16.f32 %rs9, %r378;
1300
+ mov.b32 %r379, %f543;
1301
+ cvt.rn.bf16.f32 %rs10, %r379;
1302
+ mov.b32 %r380, %f544;
1303
+ cvt.rn.bf16.f32 %rs11, %r380;
1304
+ mov.b32 %r381, %f545;
1305
+ cvt.rn.bf16.f32 %rs12, %r381;
1306
+ mov.b32 %r382, %f546;
1307
+ cvt.rn.bf16.f32 %rs13, %r382;
1308
+ mov.b32 %r383, %f547;
1309
+ cvt.rn.bf16.f32 %rs14, %r383;
1310
+ mov.b32 %r384, %f548;
1311
+ cvt.rn.bf16.f32 %rs15, %r384;
1312
+ mov.b32 %r385, %f549;
1313
+ cvt.rn.bf16.f32 %rs16, %r385;
1314
+ mov.b32 %r397, {%rs1, %rs2};
1315
+ mov.b32 %r398, {%rs3, %rs4};
1316
+ mov.b32 %r399, {%rs5, %rs6};
1317
+ mov.b32 %r400, {%rs7, %rs8};
1318
+ @%p1 st.global.v4.b32 [ %rd100 + 0 ], { %r397, %r398, %r399, %r400 };
1319
+ mov.b32 %r401, {%rs9, %rs10};
1320
+ mov.b32 %r402, {%rs11, %rs12};
1321
+ mov.b32 %r403, {%rs13, %rs14};
1322
+ mov.b32 %r404, {%rs15, %rs16};
1323
+ @%p1 st.global.v4.b32 [ %rd101 + 0 ], { %r401, %r402, %r403, %r404 };
1324
+ .loc 1 55 36
1325
+ add.s64 %rd106, %rd106, 256;
1326
+ add.s32 %r407, %r407, 64;
1327
+ setp.lt.u32 %p136, %r407, 192;
1328
+ @%p136 bra $L__BB0_5;
1329
+ bra.uni $L__BB0_8;
1330
+ $L__BB0_5:
1331
+ .loc 1 59 35
1332
+ add.s64 %rd83, %rd6, %rd106;
1333
+ add.s64 %rd84, %rd83, 16;
1334
+ add.s64 %rd85, %rd5, %rd106;
1335
+ .loc 1 59 51
1336
+ add.s64 %rd86, %rd85, 16;
1337
+ mov.u32 %r289, 0x0;
1338
+ mov.u32 %r290, 0x0;
1339
+ mov.u32 %r291, 0x0;
1340
+ mov.u32 %r292, 0x0;
1341
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r289, %r290, %r291, %r292 }, [ %rd83 + 0 ];
1342
+ @!%p1 mov.u32 %r289, %r342;
1343
+ @!%p1 mov.u32 %r290, %r342;
1344
+ @!%p1 mov.u32 %r291, %r342;
1345
+ @!%p1 mov.u32 %r292, %r342;
1346
+ mov.b32 %f149, %r289;
1347
+ mov.b32 %f150, %r290;
1348
+ mov.b32 %f151, %r291;
1349
+ mov.b32 %f152, %r292;
1350
+ mov.u32 %r297, 0x0;
1351
+ mov.u32 %r298, 0x0;
1352
+ mov.u32 %r299, 0x0;
1353
+ mov.u32 %r300, 0x0;
1354
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r297, %r298, %r299, %r300 }, [ %rd84 + 0 ];
1355
+ @!%p1 mov.u32 %r297, %r342;
1356
+ @!%p1 mov.u32 %r298, %r342;
1357
+ @!%p1 mov.u32 %r299, %r342;
1358
+ @!%p1 mov.u32 %r300, %r342;
1359
+ mov.b32 %f153, %r297;
1360
+ mov.b32 %f154, %r298;
1361
+ mov.b32 %f155, %r299;
1362
+ mov.b32 %f156, %r300;
1363
+ mov.u32 %r305, 0x0;
1364
+ mov.u32 %r306, 0x0;
1365
+ mov.u32 %r307, 0x0;
1366
+ mov.u32 %r308, 0x0;
1367
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r305, %r306, %r307, %r308 }, [ %rd85 + 0 ];
1368
+ @!%p1 mov.u32 %r305, %r342;
1369
+ @!%p1 mov.u32 %r306, %r342;
1370
+ @!%p1 mov.u32 %r307, %r342;
1371
+ @!%p1 mov.u32 %r308, %r342;
1372
+ mov.b32 %f157, %r305;
1373
+ mov.b32 %f158, %r306;
1374
+ mov.b32 %f159, %r307;
1375
+ mov.b32 %f160, %r308;
1376
+ mov.u32 %r313, 0x0;
1377
+ mov.u32 %r314, 0x0;
1378
+ mov.u32 %r315, 0x0;
1379
+ mov.u32 %r316, 0x0;
1380
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r313, %r314, %r315, %r316 }, [ %rd86 + 0 ];
1381
+ @!%p1 mov.u32 %r313, %r342;
1382
+ @!%p1 mov.u32 %r314, %r342;
1383
+ @!%p1 mov.u32 %r315, %r342;
1384
+ @!%p1 mov.u32 %r316, %r342;
1385
+ mov.b32 %f161, %r313;
1386
+ mov.b32 %f162, %r314;
1387
+ mov.b32 %f163, %r315;
1388
+ mov.b32 %f164, %r316;
1389
+ .loc 1 60 35
1390
+ add.s64 %rd87, %rd9, %rd106;
1391
+ .loc 1 60 40
1392
+ add.s64 %rd88, %rd87, 16;
1393
+ mov.u32 %r321, 0x0;
1394
+ mov.u32 %r322, 0x0;
1395
+ mov.u32 %r323, 0x0;
1396
+ mov.u32 %r324, 0x0;
1397
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r321, %r322, %r323, %r324 }, [ %rd87 + 0 ];
1398
+ @!%p1 mov.u32 %r321, %r342;
1399
+ @!%p1 mov.u32 %r322, %r342;
1400
+ @!%p1 mov.u32 %r323, %r342;
1401
+ @!%p1 mov.u32 %r324, %r342;
1402
+ mov.b32 %f165, %r321;
1403
+ mov.b32 %f166, %r322;
1404
+ mov.b32 %f167, %r323;
1405
+ mov.b32 %f168, %r324;
1406
+ mov.u32 %r329, 0x0;
1407
+ mov.u32 %r330, 0x0;
1408
+ mov.u32 %r331, 0x0;
1409
+ mov.u32 %r332, 0x0;
1410
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r329, %r330, %r331, %r332 }, [ %rd88 + 0 ];
1411
+ @!%p1 mov.u32 %r329, %r342;
1412
+ @!%p1 mov.u32 %r330, %r342;
1413
+ @!%p1 mov.u32 %r331, %r342;
1414
+ @!%p1 mov.u32 %r332, %r342;
1415
+ mov.b32 %f169, %r329;
1416
+ mov.b32 %f170, %r330;
1417
+ mov.b32 %f171, %r331;
1418
+ mov.b32 %f172, %r332;
1419
+ .loc 1 64 57
1420
+ @%p41 bra $L__BB0_7;
1421
+ mov.u64 %rd89, assertMessage_1;
1422
+ cvta.global.u64 %rd90, %rd89;
1423
+ mov.u64 %rd91, assertFile_1;
1424
+ cvta.global.u64 %rd92, %rd91;
1425
+ mov.u64 %rd93, assertFunc_1;
1426
+ cvta.global.u64 %rd94, %rd93;
1427
+ { // callseq 7, 0
1428
+ .reg .b32 temp_param_reg;
1429
+ .param .b64 param0;
1430
+ st.param.b64 [param0+0], %rd90;
1431
+ .param .b64 param1;
1432
+ st.param.b64 [param1+0], %rd92;
1433
+ .param .b32 param2;
1434
+ st.param.b32 [param2+0], %r405;
1435
+ .param .b64 param3;
1436
+ st.param.b64 [param3+0], %rd94;
1437
+ .param .b64 param4;
1438
+ st.param.b64 [param4+0], %rd104;
1439
+ call.uni
1440
+ __assertfail,
1441
+ (
1442
+ param0,
1443
+ param1,
1444
+ param2,
1445
+ param3,
1446
+ param4
1447
+ );
1448
+ } // callseq 7
1449
+ bra.uni $L__BB0_7;
1450
+ $L__BB0_8:
1451
+ .loc 1 55 4
1452
+ ret;
1453
+ $L__tmp17:
1454
+ $L__func_end0:
1455
+
1456
+ }
1457
+ // .globl __nv_rsqrtf
1458
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
1459
+ .param .b32 __nv_rsqrtf_param_0
1460
+ )
1461
+ {
1462
+ .reg .f32 %f<3>;
1463
+ $L__func_begin1:
1464
+
1465
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
1466
+ rsqrt.approx.ftz.f32 %f2, %f1;
1467
+ st.param.f32 [func_retval0+0], %f2;
1468
+ ret;
1469
+ $L__func_end1:
1470
+
1471
+ }
1472
+ .file 1 "/tmp/torchinductor_root/gx/cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py"
1473
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
1474
+ .section .debug_abbrev
1475
+ {
1476
+ .b8 1
1477
+ .b8 17
1478
+ .b8 1
1479
+ .b8 37
1480
+ .b8 8
1481
+ .b8 19
1482
+ .b8 5
1483
+ .b8 3
1484
+ .b8 8
1485
+ .b8 16
1486
+ .b8 6
1487
+ .b8 27
1488
+ .b8 8
1489
+ .b8 180
1490
+ .b8 66
1491
+ .b8 12
1492
+ .b8 17
1493
+ .b8 1
1494
+ .b8 18
1495
+ .b8 1
1496
+ .b8 0
1497
+ .b8 0
1498
+ .b8 2
1499
+ .b8 46
1500
+ .b8 0
1501
+ .b8 135
1502
+ .b8 64
1503
+ .b8 8
1504
+ .b8 3
1505
+ .b8 8
1506
+ .b8 58
1507
+ .b8 11
1508
+ .b8 59
1509
+ .b8 11
1510
+ .b8 63
1511
+ .b8 12
1512
+ .b8 32
1513
+ .b8 11
1514
+ .b8 0
1515
+ .b8 0
1516
+ .b8 3
1517
+ .b8 46
1518
+ .b8 1
1519
+ .b8 17
1520
+ .b8 1
1521
+ .b8 18
1522
+ .b8 1
1523
+ .b8 64
1524
+ .b8 10
1525
+ .b8 49
1526
+ .b8 19
1527
+ .b8 0
1528
+ .b8 0
1529
+ .b8 4
1530
+ .b8 29
1531
+ .b8 0
1532
+ .b8 49
1533
+ .b8 19
1534
+ .b8 17
1535
+ .b8 1
1536
+ .b8 18
1537
+ .b8 1
1538
+ .b8 88
1539
+ .b8 11
1540
+ .b8 89
1541
+ .b8 11
1542
+ .b8 87
1543
+ .b8 11
1544
+ .b8 0
1545
+ .b8 0
1546
+ .b8 5
1547
+ .b8 29
1548
+ .b8 1
1549
+ .b8 49
1550
+ .b8 19
1551
+ .b8 17
1552
+ .b8 1
1553
+ .b8 18
1554
+ .b8 1
1555
+ .b8 88
1556
+ .b8 11
1557
+ .b8 89
1558
+ .b8 11
1559
+ .b8 87
1560
+ .b8 11
1561
+ .b8 0
1562
+ .b8 0
1563
+ .b8 0
1564
+ }
1565
+ .section .debug_info
1566
+ {
1567
+ .b32 298
1568
+ .b8 2
1569
+ .b8 0
1570
+ .b32 .debug_abbrev
1571
+ .b8 8
1572
+ .b8 1
1573
+ .b8 116
1574
+ .b8 114
1575
+ .b8 105
1576
+ .b8 116
1577
+ .b8 111
1578
+ .b8 110
1579
+ .b8 0
1580
+ .b8 2
1581
+ .b8 0
1582
+ .b8 99
1583
+ .b8 103
1584
+ .b8 120
1585
+ .b8 53
1586
+ .b8 108
1587
+ .b8 120
1588
+ .b8 112
1589
+ .b8 117
1590
+ .b8 101
1591
+ .b8 120
1592
+ .b8 112
1593
+ .b8 105
1594
+ .b8 110
1595
+ .b8 100
1596
+ .b8 106
1597
+ .b8 52
1598
+ .b8 100
1599
+ .b8 115
1600
+ .b8 109
1601
+ .b8 106
1602
+ .b8 122
1603
+ .b8 53
1604
+ .b8 120
1605
+ .b8 52
1606
+ .b8 50
1607
+ .b8 117
1608
+ .b8 104
1609
+ .b8 121
1610
+ .b8 121
1611
+ .b8 55
1612
+ .b8 105
1613
+ .b8 115
1614
+ .b8 107
1615
+ .b8 101
1616
+ .b8 118
1617
+ .b8 113
1618
+ .b8 55
1619
+ .b8 111
1620
+ .b8 118
1621
+ .b8 122
1622
+ .b8 112
1623
+ .b8 119
1624
+ .b8 97
1625
+ .b8 103
1626
+ .b8 98
1627
+ .b8 51
1628
+ .b8 116
1629
+ .b8 53
1630
+ .b8 112
1631
+ .b8 111
1632
+ .b8 119
1633
+ .b8 106
1634
+ .b8 46
1635
+ .b8 112
1636
+ .b8 121
1637
+ .b8 0
1638
+ .b32 .debug_line
1639
+ .b8 47
1640
+ .b8 116
1641
+ .b8 109
1642
+ .b8 112
1643
+ .b8 47
1644
+ .b8 116
1645
+ .b8 111
1646
+ .b8 114
1647
+ .b8 99
1648
+ .b8 104
1649
+ .b8 105
1650
+ .b8 110
1651
+ .b8 100
1652
+ .b8 117
1653
+ .b8 99
1654
+ .b8 116
1655
+ .b8 111
1656
+ .b8 114
1657
+ .b8 95
1658
+ .b8 114
1659
+ .b8 111
1660
+ .b8 111
1661
+ .b8 116
1662
+ .b8 47
1663
+ .b8 103
1664
+ .b8 120
1665
+ .b8 0
1666
+ .b8 1
1667
+ .b64 $L__func_begin0
1668
+ .b64 $L__func_end0
1669
+ .b8 2
1670
+ .b8 116
1671
+ .b8 114
1672
+ .b8 105
1673
+ .b8 116
1674
+ .b8 111
1675
+ .b8 110
1676
+ .b8 95
1677
+ .b8 95
1678
+ .b8 48
1679
+ .b8 100
1680
+ .b8 49
1681
+ .b8 100
1682
+ .b8 50
1683
+ .b8 100
1684
+ .b8 51
1685
+ .b8 100
1686
+ .b8 52
1687
+ .b8 100
1688
+ .b8 53
1689
+ .b8 100
1690
+ .b8 101
1691
+ .b8 54
1692
+ .b8 100
1693
+ .b8 101
1694
+ .b8 0
1695
+ .b8 116
1696
+ .b8 114
1697
+ .b8 105
1698
+ .b8 116
1699
+ .b8 111
1700
+ .b8 110
1701
+ .b8 95
1702
+ .b8 95
1703
+ .b8 48
1704
+ .b8 100
1705
+ .b8 49
1706
+ .b8 100
1707
+ .b8 50
1708
+ .b8 100
1709
+ .b8 51
1710
+ .b8 100
1711
+ .b8 52
1712
+ .b8 100
1713
+ .b8 53
1714
+ .b8 100
1715
+ .b8 101
1716
+ .b8 54
1717
+ .b8 100
1718
+ .b8 101
1719
+ .b8 0
1720
+ .b8 1
1721
+ .b8 18
1722
+ .b8 1
1723
+ .b8 1
1724
+ .b8 3
1725
+ .b64 $L__func_begin0
1726
+ .b64 $L__func_end0
1727
+ .b8 1
1728
+ .b8 156
1729
+ .b32 125
1730
+ .b8 4
1731
+ .b32 125
1732
+ .b64 $L__tmp1
1733
+ .b64 $L__tmp2
1734
+ .b8 2
1735
+ .b8 44
1736
+ .b8 38
1737
+ .b8 5
1738
+ .b32 125
1739
+ .b64 $L__tmp3
1740
+ .b64 $L__tmp16
1741
+ .b8 2
1742
+ .b8 50
1743
+ .b8 41
1744
+ .b8 4
1745
+ .b32 125
1746
+ .b64 $L__tmp3
1747
+ .b64 $L__tmp16
1748
+ .b8 2
1749
+ .b8 120
1750
+ .b8 46
1751
+ .b8 0
1752
+ .b8 4
1753
+ .b32 125
1754
+ .b64 $L__tmp4
1755
+ .b64 $L__tmp15
1756
+ .b8 2
1757
+ .b8 50
1758
+ .b8 41
1759
+ .b8 0
1760
+ .b8 0
1761
+ }
1762
+ .section .debug_pubnames
1763
+ {
1764
+ .b32 $L__pubNames_end0-$L__pubNames_start0
1765
+ $L__pubNames_start0:
1766
+ .b8 2
1767
+ .b8 0
1768
+ .b32 .debug_info
1769
+ .b32 302
1770
+ .b32 125
1771
+ .b8 116
1772
+ .b8 114
1773
+ .b8 105
1774
+ .b8 116
1775
+ .b8 111
1776
+ .b8 110
1777
+ .b8 95
1778
+ .b8 95
1779
+ .b8 48
1780
+ .b8 100
1781
+ .b8 49
1782
+ .b8 100
1783
+ .b8 50
1784
+ .b8 100
1785
+ .b8 51
1786
+ .b8 100
1787
+ .b8 52
1788
+ .b8 100
1789
+ .b8 53
1790
+ .b8 100
1791
+ .b8 101
1792
+ .b8 54
1793
+ .b8 100
1794
+ .b8 101
1795
+ .b8 0
1796
+ .b32 0
1797
+ $L__pubNames_end0:
1798
+ }
1799
+ .section .debug_pubtypes
1800
+ {
1801
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1802
+ $L__pubTypes_start0:
1803
+ .b8 2
1804
+ .b8 0
1805
+ .b32 .debug_info
1806
+ .b32 302
1807
+ .b32 0
1808
+ $L__pubTypes_end0:
1809
+ }
1810
+ .section .debug_loc { }
.triton/dump/510522bb05917b836ed253751364fcad/triton_.ttir ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant 0.000000e+00 : f32
4
+ %cst_0 = arith.constant dense<1.000000e+00> : tensor<64x64xf32>
5
+ %c256_i32 = arith.constant 256 : i32
6
+ %c64_i32 = arith.constant 64 : i32
7
+ %c0_i32 = arith.constant 0 : i32
8
+ %cst_1 = arith.constant dense<256> : tensor<64x1xi64>
9
+ %cst_2 = arith.constant dense<0> : tensor<64x1xi64>
10
+ %cst_3 = arith.constant dense<50257> : tensor<64x1xi64>
11
+ %cst_4 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
12
+ %cst_5 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
13
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x64xf32>
14
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x64xf32>
15
+ %cst_8 = arith.constant dense<256> : tensor<64x1xi32>
16
+ %cst_9 = arith.constant dense<256> : tensor<1x64xi32>
17
+ %cst_10 = arith.constant dense<512> : tensor<64x1xi32>
18
+ %0 = tt.get_program_id x : i32
19
+ %1 = arith.muli %0, %c64_i32 : i32
20
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
21
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
22
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
23
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
24
+ %6 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32>
25
+ %7 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
26
+ %8 = tt.addptr %7, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
27
+ %9 = tt.load %8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
28
+ %10 = arith.remsi %5, %cst_10 : tensor<64x1xi32>
29
+ %11 = arith.muli %10, %cst_8 : tensor<64x1xi32>
30
+ %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x64xi32>
31
+ %13 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
32
+ %14 = arith.addi %9, %cst_3 : tensor<64x1xi64>
33
+ %15 = arith.cmpi slt, %9, %cst_2 : tensor<64x1xi64>
34
+ %16 = arith.select %15, %14, %9 : tensor<64x1xi1>, tensor<64x1xi64>
35
+ %17 = arith.cmpi sge, %16, %cst_2 : tensor<64x1xi64>
36
+ %18 = arith.cmpi slt, %16, %cst_3 : tensor<64x1xi64>
37
+ %19 = arith.andi %17, %18 : tensor<64x1xi1>
38
+ %20 = arith.muli %16, %cst_1 : tensor<64x1xi64>
39
+ %21 = tt.broadcast %20 : (tensor<64x1xi64>) -> tensor<64x64xi64>
40
+ %22 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
41
+ %23:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg8 = %cst_7, %arg9 = %cst_7, %arg10 = %cst_7) -> (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) : i32 {
42
+ %46 = tt.splat %arg7 : (i32) -> tensor<1x64xi32>
43
+ %47 = arith.addi %46, %6 : tensor<1x64xi32>
44
+ %48 = arith.cmpi slt, %47, %cst_9 : tensor<1x64xi32>
45
+ %49 = tt.broadcast %47 : (tensor<1x64xi32>) -> tensor<64x64xi32>
46
+ %50 = arith.addi %49, %12 : tensor<64x64xi32>
47
+ %51 = tt.addptr %13, %50 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi32>
48
+ %52 = tt.broadcast %48 : (tensor<1x64xi1>) -> tensor<64x64xi1>
49
+ %53 = tt.load %51, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
50
+ tt.assert %19, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
51
+ %54 = arith.extsi %47 : tensor<1x64xi32> to tensor<1x64xi64>
52
+ %55 = tt.broadcast %54 : (tensor<1x64xi64>) -> tensor<64x64xi64>
53
+ %56 = arith.addi %55, %21 : tensor<64x64xi64>
54
+ %57 = tt.addptr %22, %56 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi64>
55
+ %58 = tt.load %57, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
56
+ %59 = arith.addf %58, %53 : tensor<64x64xf32>
57
+ %60 = arith.subf %59, %arg8 : tensor<64x64xf32>
58
+ %61 = arith.addf %arg10, %cst_0 : tensor<64x64xf32>
59
+ %62 = arith.divf %60, %61 : tensor<64x64xf32>
60
+ %63 = arith.addf %arg8, %62 : tensor<64x64xf32>
61
+ %64 = arith.subf %59, %63 : tensor<64x64xf32>
62
+ %65 = arith.mulf %60, %64 : tensor<64x64xf32>
63
+ %66 = arith.addf %arg9, %65 : tensor<64x64xf32>
64
+ %67 = arith.select %52, %63, %arg8 : tensor<64x64xi1>, tensor<64x64xf32>
65
+ %68 = arith.select %52, %66, %arg9 : tensor<64x64xi1>, tensor<64x64xf32>
66
+ %69 = arith.select %52, %61, %arg10 : tensor<64x64xi1>, tensor<64x64xf32>
67
+ scf.yield %67, %68, %69 : tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>
68
+ }
69
+ %24:3 = "tt.reduce"(%23#0, %23#1, %23#2) <{axis = 1 : i32}> ({
70
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
71
+ %46 = arith.subf %arg10, %arg7 : f32
72
+ %47 = arith.addf %arg9, %arg12 : f32
73
+ %48 = arith.cmpf oeq, %47, %cst : f32
74
+ %49 = arith.divf %arg12, %47 : f32
75
+ %50 = arith.select %48, %cst, %49 : f32
76
+ %51 = arith.mulf %46, %50 : f32
77
+ %52 = arith.addf %arg7, %51 : f32
78
+ %53 = arith.addf %arg8, %arg11 : f32
79
+ %54 = arith.mulf %46, %46 : f32
80
+ %55 = arith.mulf %54, %arg9 : f32
81
+ %56 = arith.mulf %55, %50 : f32
82
+ %57 = arith.addf %53, %56 : f32
83
+ tt.reduce.return %52, %57, %47 : f32, f32, f32
84
+ }) : (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
85
+ %25 = tt.expand_dims %24#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
86
+ %26 = tt.expand_dims %24#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
87
+ %27 = arith.muli %10, %cst_8 : tensor<64x1xi32>
88
+ %28 = tt.broadcast %27 : (tensor<64x1xi32>) -> tensor<64x64xi32>
89
+ %29 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
90
+ %30 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x64x!tt.ptr<f32, 1>>
91
+ %31 = arith.addi %9, %cst_3 : tensor<64x1xi64>
92
+ %32 = arith.cmpi slt, %9, %cst_2 : tensor<64x1xi64>
93
+ %33 = arith.select %32, %31, %9 : tensor<64x1xi1>, tensor<64x1xi64>
94
+ %34 = arith.cmpi sge, %33, %cst_2 : tensor<64x1xi64>
95
+ %35 = arith.cmpi slt, %33, %cst_3 : tensor<64x1xi64>
96
+ %36 = arith.andi %34, %35 : tensor<64x1xi1>
97
+ %37 = arith.muli %33, %cst_1 : tensor<64x1xi64>
98
+ %38 = tt.broadcast %37 : (tensor<64x1xi64>) -> tensor<64x64xi64>
99
+ %39 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
100
+ %40 = tt.broadcast %25 : (tensor<64x1xf32>) -> tensor<64x64xf32>
101
+ %41 = arith.divf %26, %cst_5 : tensor<64x1xf32>
102
+ %42 = arith.addf %41, %cst_4 : tensor<64x1xf32>
103
+ %43 = arith.muli %5, %cst_8 : tensor<64x1xi32>
104
+ %44 = tt.broadcast %43 : (tensor<64x1xi32>) -> tensor<64x64xi32>
105
+ %45 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
106
+ scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 : i32 {
107
+ %46 = tt.splat %arg7 : (i32) -> tensor<1x64xi32>
108
+ %47 = arith.addi %46, %6 : tensor<1x64xi32>
109
+ %48 = arith.cmpi slt, %47, %cst_9 : tensor<1x64xi32>
110
+ %49 = tt.broadcast %47 : (tensor<1x64xi32>) -> tensor<64x64xi32>
111
+ %50 = arith.addi %49, %28 : tensor<64x64xi32>
112
+ %51 = tt.addptr %29, %50 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi32>
113
+ %52 = tt.broadcast %48 : (tensor<1x64xi1>) -> tensor<64x64xi1>
114
+ %53 = tt.load %51, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
115
+ %54 = tt.addptr %30, %47 : tensor<1x64x!tt.ptr<f32, 1>>, tensor<1x64xi32>
116
+ %55 = tt.load %54, %48, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32>
117
+ tt.assert %36, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
118
+ %56 = arith.extsi %47 : tensor<1x64xi32> to tensor<1x64xi64>
119
+ %57 = tt.broadcast %56 : (tensor<1x64xi64>) -> tensor<64x64xi64>
120
+ %58 = arith.addi %57, %38 : tensor<64x64xi64>
121
+ %59 = tt.addptr %39, %58 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi64>
122
+ %60 = tt.load %59, %52, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32>
123
+ %61 = arith.addf %60, %53 : tensor<64x64xf32>
124
+ %62 = arith.subf %61, %40 : tensor<64x64xf32>
125
+ %63 = tt.extern_elementwise %42 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
126
+ %64 = tt.broadcast %63 : (tensor<64x1xf32>) -> tensor<64x64xf32>
127
+ %65 = arith.mulf %62, %64 : tensor<64x64xf32>
128
+ %66 = tt.broadcast %55 : (tensor<1x64xf32>) -> tensor<64x64xf32>
129
+ %67 = arith.mulf %65, %66 : tensor<64x64xf32>
130
+ %68 = arith.addi %49, %44 : tensor<64x64xi32>
131
+ %69 = tt.addptr %45, %68 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi32>
132
+ %70 = arith.truncf %67 : tensor<64x64xf32> to tensor<64x64xbf16>
133
+ tt.store %69, %70, %52 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16>
134
+ }
135
+ tt.return
136
+ }
137
+ }
.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.llir ADDED
@@ -0,0 +1,1360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
5
+ @assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
6
+ @assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
7
+ @assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
8
+ @assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
9
+ @assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
10
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
11
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
12
+
13
+ declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
14
+
15
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
16
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
17
+ %10 = lshr i32 %9, 3, !dbg !10
18
+ %11 = and i32 %10, 31, !dbg !10
19
+ %12 = and i32 %9, 63, !dbg !10
20
+ %13 = shl i32 %9, 3, !dbg !11
21
+ %14 = and i32 %13, 56, !dbg !11
22
+ %15 = or i32 %14, 4, !dbg !11
23
+ %16 = lshr i32 %9, 6, !dbg !12
24
+ %17 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13
25
+ %18 = shl i32 %17, 6, !dbg !14
26
+ %19 = or i32 %18, %11, !dbg !15
27
+ %20 = or i32 %19, 32, !dbg !15
28
+ %21 = or i32 %18, %12, !dbg !15
29
+ %22 = sext i32 %19 to i64, !dbg !16
30
+ %23 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !16
31
+ %24 = sext i32 %20 to i64, !dbg !16
32
+ %25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !16
33
+ %26 = sext i32 %21 to i64, !dbg !16
34
+ %27 = getelementptr i64, ptr addrspace(1) %0, i64 %26, !dbg !16
35
+ %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
36
+ %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
37
+ %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
38
+ %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
39
+ %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
40
+ %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
41
+ %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
42
+ %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
43
+ %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
44
+ %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
45
+ %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
46
+ %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
47
+ %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
48
+ %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
49
+ %42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
50
+ %43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
51
+ %44 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %27, i1 true) #6, !dbg !17
52
+ %45 = srem i32 %19, 512, !dbg !18
53
+ %46 = srem i32 %20, 512, !dbg !18
54
+ %47 = shl nsw i32 %45, 8, !dbg !19
55
+ %48 = shl nsw i32 %46, 8, !dbg !19
56
+ %49 = shl i32 %19, 8, !dbg !20
57
+ %50 = shl i32 %20, 8, !dbg !20
58
+ %51 = add i64 %44, 50257, !dbg !21
59
+ %52 = icmp slt i64 %28, 0, !dbg !22
60
+ %53 = icmp slt i64 %36, 0, !dbg !22
61
+ %54 = icmp slt i64 %44, 0, !dbg !22
62
+ %55 = select i1 %54, i64 %51, i64 %44, !dbg !23
63
+ %56 = icmp ugt i64 %55, 50256, !dbg !24
64
+ %57 = shl i64 %28, 8, !dbg !25
65
+ %58 = add i64 %57, 12865792, !dbg !25
66
+ %59 = select i1 %52, i64 %58, i64 %57, !dbg !25
67
+ %60 = shl i64 %36, 8, !dbg !25
68
+ %61 = add i64 %60, 12865792, !dbg !25
69
+ %62 = select i1 %53, i64 %61, i64 %60, !dbg !25
70
+ %63 = getelementptr float, ptr addrspace(1) %1, i64 %59
71
+ %64 = getelementptr float, ptr addrspace(1) %1, i64 %62
72
+ br label %65, !dbg !12
73
+
74
+ 65: ; preds = %8, %230
75
+ %66 = phi float [ 0.000000e+00, %8 ], [ %321, %230 ]
76
+ %67 = phi float [ 0.000000e+00, %8 ], [ %322, %230 ]
77
+ %68 = phi float [ 0.000000e+00, %8 ], [ %323, %230 ]
78
+ %69 = phi float [ 0.000000e+00, %8 ], [ %324, %230 ]
79
+ %70 = phi float [ 0.000000e+00, %8 ], [ %325, %230 ]
80
+ %71 = phi float [ 0.000000e+00, %8 ], [ %326, %230 ]
81
+ %72 = phi float [ 0.000000e+00, %8 ], [ %327, %230 ]
82
+ %73 = phi float [ 0.000000e+00, %8 ], [ %328, %230 ]
83
+ %74 = phi float [ 0.000000e+00, %8 ], [ %329, %230 ]
84
+ %75 = phi float [ 0.000000e+00, %8 ], [ %330, %230 ]
85
+ %76 = phi float [ 0.000000e+00, %8 ], [ %331, %230 ]
86
+ %77 = phi float [ 0.000000e+00, %8 ], [ %332, %230 ]
87
+ %78 = phi float [ 0.000000e+00, %8 ], [ %333, %230 ]
88
+ %79 = phi float [ 0.000000e+00, %8 ], [ %334, %230 ]
89
+ %80 = phi float [ 0.000000e+00, %8 ], [ %335, %230 ]
90
+ %81 = phi float [ 0.000000e+00, %8 ], [ %336, %230 ]
91
+ %82 = phi float [ 0.000000e+00, %8 ], [ %337, %230 ]
92
+ %83 = phi float [ 0.000000e+00, %8 ], [ %338, %230 ]
93
+ %84 = phi float [ 0.000000e+00, %8 ], [ %339, %230 ]
94
+ %85 = phi float [ 0.000000e+00, %8 ], [ %340, %230 ]
95
+ %86 = phi float [ 0.000000e+00, %8 ], [ %341, %230 ]
96
+ %87 = phi float [ 0.000000e+00, %8 ], [ %342, %230 ]
97
+ %88 = phi float [ 0.000000e+00, %8 ], [ %343, %230 ]
98
+ %89 = phi float [ 0.000000e+00, %8 ], [ %344, %230 ]
99
+ %90 = phi float [ 0.000000e+00, %8 ], [ %345, %230 ]
100
+ %91 = phi float [ 0.000000e+00, %8 ], [ %346, %230 ]
101
+ %92 = phi float [ 0.000000e+00, %8 ], [ %347, %230 ]
102
+ %93 = phi float [ 0.000000e+00, %8 ], [ %348, %230 ]
103
+ %94 = phi float [ 0.000000e+00, %8 ], [ %349, %230 ]
104
+ %95 = phi float [ 0.000000e+00, %8 ], [ %350, %230 ]
105
+ %96 = phi float [ 0.000000e+00, %8 ], [ %351, %230 ]
106
+ %97 = phi float [ 0.000000e+00, %8 ], [ %352, %230 ]
107
+ %98 = phi float [ 0.000000e+00, %8 ], [ %417, %230 ]
108
+ %99 = phi float [ 0.000000e+00, %8 ], [ %418, %230 ]
109
+ %100 = phi float [ 0.000000e+00, %8 ], [ %419, %230 ]
110
+ %101 = phi float [ 0.000000e+00, %8 ], [ %420, %230 ]
111
+ %102 = phi float [ 0.000000e+00, %8 ], [ %421, %230 ]
112
+ %103 = phi float [ 0.000000e+00, %8 ], [ %422, %230 ]
113
+ %104 = phi float [ 0.000000e+00, %8 ], [ %423, %230 ]
114
+ %105 = phi float [ 0.000000e+00, %8 ], [ %424, %230 ]
115
+ %106 = phi float [ 0.000000e+00, %8 ], [ %425, %230 ]
116
+ %107 = phi float [ 0.000000e+00, %8 ], [ %426, %230 ]
117
+ %108 = phi float [ 0.000000e+00, %8 ], [ %427, %230 ]
118
+ %109 = phi float [ 0.000000e+00, %8 ], [ %428, %230 ]
119
+ %110 = phi float [ 0.000000e+00, %8 ], [ %429, %230 ]
120
+ %111 = phi float [ 0.000000e+00, %8 ], [ %430, %230 ]
121
+ %112 = phi float [ 0.000000e+00, %8 ], [ %431, %230 ]
122
+ %113 = phi float [ 0.000000e+00, %8 ], [ %432, %230 ]
123
+ %114 = phi float [ 0.000000e+00, %8 ], [ %369, %230 ]
124
+ %115 = phi float [ 0.000000e+00, %8 ], [ %370, %230 ]
125
+ %116 = phi float [ 0.000000e+00, %8 ], [ %371, %230 ]
126
+ %117 = phi float [ 0.000000e+00, %8 ], [ %372, %230 ]
127
+ %118 = phi float [ 0.000000e+00, %8 ], [ %373, %230 ]
128
+ %119 = phi float [ 0.000000e+00, %8 ], [ %374, %230 ]
129
+ %120 = phi float [ 0.000000e+00, %8 ], [ %375, %230 ]
130
+ %121 = phi float [ 0.000000e+00, %8 ], [ %376, %230 ]
131
+ %122 = phi float [ 0.000000e+00, %8 ], [ %377, %230 ]
132
+ %123 = phi float [ 0.000000e+00, %8 ], [ %378, %230 ]
133
+ %124 = phi float [ 0.000000e+00, %8 ], [ %379, %230 ]
134
+ %125 = phi float [ 0.000000e+00, %8 ], [ %380, %230 ]
135
+ %126 = phi float [ 0.000000e+00, %8 ], [ %381, %230 ]
136
+ %127 = phi float [ 0.000000e+00, %8 ], [ %382, %230 ]
137
+ %128 = phi float [ 0.000000e+00, %8 ], [ %383, %230 ]
138
+ %129 = phi float [ 0.000000e+00, %8 ], [ %384, %230 ]
139
+ %130 = phi i32 [ 0, %8 ], [ %433, %230 ]
140
+ %131 = or i32 %130, %14, !dbg !26
141
+ %132 = or i32 %130, %15, !dbg !26
142
+ %133 = add i32 %131, %47, !dbg !27
143
+ %134 = add i32 %132, %47, !dbg !27
144
+ %135 = add i32 %131, %48, !dbg !27
145
+ %136 = add i32 %132, %48, !dbg !27
146
+ %137 = sext i32 %133 to i64, !dbg !28
147
+ %138 = getelementptr float, ptr addrspace(1) %2, i64 %137, !dbg !28
148
+ %139 = sext i32 %134 to i64, !dbg !28
149
+ %140 = getelementptr float, ptr addrspace(1) %2, i64 %139, !dbg !28
150
+ %141 = sext i32 %135 to i64, !dbg !28
151
+ %142 = getelementptr float, ptr addrspace(1) %2, i64 %141, !dbg !28
152
+ %143 = sext i32 %136 to i64, !dbg !28
153
+ %144 = getelementptr float, ptr addrspace(1) %2, i64 %143, !dbg !28
154
+ %145 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %138, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
155
+ %146 = extractvalue { i32, i32, i32, i32 } %145, 0, !dbg !29
156
+ %147 = extractvalue { i32, i32, i32, i32 } %145, 1, !dbg !29
157
+ %148 = extractvalue { i32, i32, i32, i32 } %145, 2, !dbg !29
158
+ %149 = extractvalue { i32, i32, i32, i32 } %145, 3, !dbg !29
159
+ %150 = bitcast i32 %146 to float, !dbg !29
160
+ %151 = bitcast i32 %147 to float, !dbg !29
161
+ %152 = bitcast i32 %148 to float, !dbg !29
162
+ %153 = bitcast i32 %149 to float, !dbg !29
163
+ %154 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %140, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
164
+ %155 = extractvalue { i32, i32, i32, i32 } %154, 0, !dbg !29
165
+ %156 = extractvalue { i32, i32, i32, i32 } %154, 1, !dbg !29
166
+ %157 = extractvalue { i32, i32, i32, i32 } %154, 2, !dbg !29
167
+ %158 = extractvalue { i32, i32, i32, i32 } %154, 3, !dbg !29
168
+ %159 = bitcast i32 %155 to float, !dbg !29
169
+ %160 = bitcast i32 %156 to float, !dbg !29
170
+ %161 = bitcast i32 %157 to float, !dbg !29
171
+ %162 = bitcast i32 %158 to float, !dbg !29
172
+ %163 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %142, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
173
+ %164 = extractvalue { i32, i32, i32, i32 } %163, 0, !dbg !29
174
+ %165 = extractvalue { i32, i32, i32, i32 } %163, 1, !dbg !29
175
+ %166 = extractvalue { i32, i32, i32, i32 } %163, 2, !dbg !29
176
+ %167 = extractvalue { i32, i32, i32, i32 } %163, 3, !dbg !29
177
+ %168 = bitcast i32 %164 to float, !dbg !29
178
+ %169 = bitcast i32 %165 to float, !dbg !29
179
+ %170 = bitcast i32 %166 to float, !dbg !29
180
+ %171 = bitcast i32 %167 to float, !dbg !29
181
+ %172 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %144, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
182
+ %173 = extractvalue { i32, i32, i32, i32 } %172, 0, !dbg !29
183
+ %174 = extractvalue { i32, i32, i32, i32 } %172, 1, !dbg !29
184
+ %175 = extractvalue { i32, i32, i32, i32 } %172, 2, !dbg !29
185
+ %176 = extractvalue { i32, i32, i32, i32 } %172, 3, !dbg !29
186
+ %177 = bitcast i32 %173 to float, !dbg !29
187
+ %178 = bitcast i32 %174 to float, !dbg !29
188
+ %179 = bitcast i32 %175 to float, !dbg !29
189
+ %180 = bitcast i32 %176 to float, !dbg !29
190
+ %181 = add i32 %131, %49, !dbg !30
191
+ %182 = add i32 %131, %50, !dbg !30
192
+ %183 = sext i32 %181 to i64, !dbg !31
193
+ %184 = getelementptr i16, ptr addrspace(1) %3, i64 %183, !dbg !31
194
+ %185 = sext i32 %182 to i64, !dbg !31
195
+ %186 = getelementptr i16, ptr addrspace(1) %3, i64 %185, !dbg !31
196
+ %187 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %184, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
197
+ %188 = extractvalue { i32, i32, i32, i32 } %187, 0, !dbg !32
198
+ %189 = extractvalue { i32, i32, i32, i32 } %187, 1, !dbg !32
199
+ %190 = extractvalue { i32, i32, i32, i32 } %187, 2, !dbg !32
200
+ %191 = extractvalue { i32, i32, i32, i32 } %187, 3, !dbg !32
201
+ %192 = trunc i32 %188 to i16, !dbg !32
202
+ %extelt.offset9 = lshr i32 %188, 16, !dbg !32
203
+ %193 = trunc i32 %extelt.offset9 to i16, !dbg !32
204
+ %194 = trunc i32 %189 to i16, !dbg !32
205
+ %extelt.offset10 = lshr i32 %189, 16, !dbg !32
206
+ %195 = trunc i32 %extelt.offset10 to i16, !dbg !32
207
+ %196 = trunc i32 %190 to i16, !dbg !32
208
+ %extelt.offset11 = lshr i32 %190, 16, !dbg !32
209
+ %197 = trunc i32 %extelt.offset11 to i16, !dbg !32
210
+ %198 = trunc i32 %191 to i16, !dbg !32
211
+ %extelt.offset12 = lshr i32 %191, 16, !dbg !32
212
+ %199 = trunc i32 %extelt.offset12 to i16, !dbg !32
213
+ %200 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %186, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
214
+ %201 = extractvalue { i32, i32, i32, i32 } %200, 0, !dbg !32
215
+ %202 = extractvalue { i32, i32, i32, i32 } %200, 1, !dbg !32
216
+ %203 = extractvalue { i32, i32, i32, i32 } %200, 2, !dbg !32
217
+ %204 = extractvalue { i32, i32, i32, i32 } %200, 3, !dbg !32
218
+ %205 = trunc i32 %201 to i16, !dbg !32
219
+ %extelt.offset13 = lshr i32 %201, 16, !dbg !32
220
+ %206 = trunc i32 %extelt.offset13 to i16, !dbg !32
221
+ %207 = trunc i32 %202 to i16, !dbg !32
222
+ %extelt.offset14 = lshr i32 %202, 16, !dbg !32
223
+ %208 = trunc i32 %extelt.offset14 to i16, !dbg !32
224
+ %209 = trunc i32 %203 to i16, !dbg !32
225
+ %extelt.offset15 = lshr i32 %203, 16, !dbg !32
226
+ %210 = trunc i32 %extelt.offset15 to i16, !dbg !32
227
+ %211 = trunc i32 %204 to i16, !dbg !32
228
+ %extelt.offset16 = lshr i32 %204, 16, !dbg !32
229
+ %212 = trunc i32 %extelt.offset16 to i16, !dbg !32
230
+ %213 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %192) #6, !dbg !33
231
+ %214 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %193) #6, !dbg !33
232
+ %215 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %194) #6, !dbg !33
233
+ %216 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %195) #6, !dbg !33
234
+ %217 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %196) #6, !dbg !33
235
+ %218 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %197) #6, !dbg !33
236
+ %219 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %198) #6, !dbg !33
237
+ %220 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %199) #6, !dbg !33
238
+ %221 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %205) #6, !dbg !33
239
+ %222 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %206) #6, !dbg !33
240
+ %223 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %207) #6, !dbg !33
241
+ %224 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %208) #6, !dbg !33
242
+ %225 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %209) #6, !dbg !33
243
+ %226 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %210) #6, !dbg !33
244
+ %227 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %211) #6, !dbg !33
245
+ %228 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %212) #6, !dbg !33
246
+ br i1 %56, label %229, label %230, !dbg !34
247
+
248
+ 229: ; preds = %65
249
+ tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !34
250
+ br label %230, !dbg !34
251
+
252
+ 230: ; preds = %229, %65
253
+ %231 = zext nneg i32 %131 to i64, !dbg !35
254
+ %232 = zext nneg i32 %132 to i64, !dbg !35
255
+ %233 = getelementptr float, ptr addrspace(1) %63, i64 %231, !dbg !36
256
+ %234 = getelementptr float, ptr addrspace(1) %63, i64 %232, !dbg !36
257
+ %235 = getelementptr float, ptr addrspace(1) %64, i64 %231, !dbg !36
258
+ %236 = getelementptr float, ptr addrspace(1) %64, i64 %232, !dbg !36
259
+ %237 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %233, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
260
+ %238 = extractvalue { i32, i32, i32, i32 } %237, 0, !dbg !37
261
+ %239 = extractvalue { i32, i32, i32, i32 } %237, 1, !dbg !37
262
+ %240 = extractvalue { i32, i32, i32, i32 } %237, 2, !dbg !37
263
+ %241 = extractvalue { i32, i32, i32, i32 } %237, 3, !dbg !37
264
+ %242 = bitcast i32 %238 to float, !dbg !37
265
+ %243 = bitcast i32 %239 to float, !dbg !37
266
+ %244 = bitcast i32 %240 to float, !dbg !37
267
+ %245 = bitcast i32 %241 to float, !dbg !37
268
+ %246 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %234, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
269
+ %247 = extractvalue { i32, i32, i32, i32 } %246, 0, !dbg !37
270
+ %248 = extractvalue { i32, i32, i32, i32 } %246, 1, !dbg !37
271
+ %249 = extractvalue { i32, i32, i32, i32 } %246, 2, !dbg !37
272
+ %250 = extractvalue { i32, i32, i32, i32 } %246, 3, !dbg !37
273
+ %251 = bitcast i32 %247 to float, !dbg !37
274
+ %252 = bitcast i32 %248 to float, !dbg !37
275
+ %253 = bitcast i32 %249 to float, !dbg !37
276
+ %254 = bitcast i32 %250 to float, !dbg !37
277
+ %255 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %235, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
278
+ %256 = extractvalue { i32, i32, i32, i32 } %255, 0, !dbg !37
279
+ %257 = extractvalue { i32, i32, i32, i32 } %255, 1, !dbg !37
280
+ %258 = extractvalue { i32, i32, i32, i32 } %255, 2, !dbg !37
281
+ %259 = extractvalue { i32, i32, i32, i32 } %255, 3, !dbg !37
282
+ %260 = bitcast i32 %256 to float, !dbg !37
283
+ %261 = bitcast i32 %257 to float, !dbg !37
284
+ %262 = bitcast i32 %258 to float, !dbg !37
285
+ %263 = bitcast i32 %259 to float, !dbg !37
286
+ %264 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %236, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
287
+ %265 = extractvalue { i32, i32, i32, i32 } %264, 0, !dbg !37
288
+ %266 = extractvalue { i32, i32, i32, i32 } %264, 1, !dbg !37
289
+ %267 = extractvalue { i32, i32, i32, i32 } %264, 2, !dbg !37
290
+ %268 = extractvalue { i32, i32, i32, i32 } %264, 3, !dbg !37
291
+ %269 = bitcast i32 %265 to float, !dbg !37
292
+ %270 = bitcast i32 %266 to float, !dbg !37
293
+ %271 = bitcast i32 %267 to float, !dbg !37
294
+ %272 = bitcast i32 %268 to float, !dbg !37
295
+ %273 = fadd float %150, %242, !dbg !38
296
+ %274 = fadd float %151, %243, !dbg !38
297
+ %275 = fadd float %152, %244, !dbg !38
298
+ %276 = fadd float %153, %245, !dbg !38
299
+ %277 = fadd float %159, %251, !dbg !38
300
+ %278 = fadd float %160, %252, !dbg !38
301
+ %279 = fadd float %161, %253, !dbg !38
302
+ %280 = fadd float %162, %254, !dbg !38
303
+ %281 = fadd float %168, %260, !dbg !38
304
+ %282 = fadd float %169, %261, !dbg !38
305
+ %283 = fadd float %170, %262, !dbg !38
306
+ %284 = fadd float %171, %263, !dbg !38
307
+ %285 = fadd float %177, %269, !dbg !38
308
+ %286 = fadd float %178, %270, !dbg !38
309
+ %287 = fadd float %179, %271, !dbg !38
310
+ %288 = fadd float %180, %272, !dbg !38
311
+ %289 = fadd float %213, %273, !dbg !39
312
+ %290 = fadd float %214, %274, !dbg !39
313
+ %291 = fadd float %215, %275, !dbg !39
314
+ %292 = fadd float %216, %276, !dbg !39
315
+ %293 = fadd float %217, %277, !dbg !39
316
+ %294 = fadd float %218, %278, !dbg !39
317
+ %295 = fadd float %219, %279, !dbg !39
318
+ %296 = fadd float %220, %280, !dbg !39
319
+ %297 = fadd float %221, %281, !dbg !39
320
+ %298 = fadd float %222, %282, !dbg !39
321
+ %299 = fadd float %223, %283, !dbg !39
322
+ %300 = fadd float %224, %284, !dbg !39
323
+ %301 = fadd float %225, %285, !dbg !39
324
+ %302 = fadd float %226, %286, !dbg !39
325
+ %303 = fadd float %227, %287, !dbg !39
326
+ %304 = fadd float %228, %288, !dbg !39
327
+ %305 = fsub float %289, %114, !dbg !40
328
+ %306 = fsub float %290, %115, !dbg !40
329
+ %307 = fsub float %291, %116, !dbg !40
330
+ %308 = fsub float %292, %117, !dbg !40
331
+ %309 = fsub float %293, %118, !dbg !40
332
+ %310 = fsub float %294, %119, !dbg !40
333
+ %311 = fsub float %295, %120, !dbg !40
334
+ %312 = fsub float %296, %121, !dbg !40
335
+ %313 = fsub float %297, %122, !dbg !40
336
+ %314 = fsub float %298, %123, !dbg !40
337
+ %315 = fsub float %299, %124, !dbg !40
338
+ %316 = fsub float %300, %125, !dbg !40
339
+ %317 = fsub float %301, %126, !dbg !40
340
+ %318 = fsub float %302, %127, !dbg !40
341
+ %319 = fsub float %303, %128, !dbg !40
342
+ %320 = fsub float %304, %129, !dbg !40
343
+ %321 = fadd float %66, 1.000000e+00, !dbg !44
344
+ %322 = fadd float %67, 1.000000e+00, !dbg !44
345
+ %323 = fadd float %68, 1.000000e+00, !dbg !44
346
+ %324 = fadd float %69, 1.000000e+00, !dbg !44
347
+ %325 = fadd float %70, 1.000000e+00, !dbg !44
348
+ %326 = fadd float %71, 1.000000e+00, !dbg !44
349
+ %327 = fadd float %72, 1.000000e+00, !dbg !44
350
+ %328 = fadd float %73, 1.000000e+00, !dbg !44
351
+ %329 = fadd float %74, 1.000000e+00, !dbg !44
352
+ %330 = fadd float %75, 1.000000e+00, !dbg !44
353
+ %331 = fadd float %76, 1.000000e+00, !dbg !44
354
+ %332 = fadd float %77, 1.000000e+00, !dbg !44
355
+ %333 = fadd float %78, 1.000000e+00, !dbg !44
356
+ %334 = fadd float %79, 1.000000e+00, !dbg !44
357
+ %335 = fadd float %80, 1.000000e+00, !dbg !44
358
+ %336 = fadd float %81, 1.000000e+00, !dbg !44
359
+ %337 = fadd float %82, 1.000000e+00, !dbg !44
360
+ %338 = fadd float %83, 1.000000e+00, !dbg !44
361
+ %339 = fadd float %84, 1.000000e+00, !dbg !44
362
+ %340 = fadd float %85, 1.000000e+00, !dbg !44
363
+ %341 = fadd float %86, 1.000000e+00, !dbg !44
364
+ %342 = fadd float %87, 1.000000e+00, !dbg !44
365
+ %343 = fadd float %88, 1.000000e+00, !dbg !44
366
+ %344 = fadd float %89, 1.000000e+00, !dbg !44
367
+ %345 = fadd float %90, 1.000000e+00, !dbg !44
368
+ %346 = fadd float %91, 1.000000e+00, !dbg !44
369
+ %347 = fadd float %92, 1.000000e+00, !dbg !44
370
+ %348 = fadd float %93, 1.000000e+00, !dbg !44
371
+ %349 = fadd float %94, 1.000000e+00, !dbg !44
372
+ %350 = fadd float %95, 1.000000e+00, !dbg !44
373
+ %351 = fadd float %96, 1.000000e+00, !dbg !44
374
+ %352 = fadd float %97, 1.000000e+00, !dbg !44
375
+ %353 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %305, float %321) #6, !dbg !45
376
+ %354 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %306, float %322) #6, !dbg !45
377
+ %355 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %307, float %323) #6, !dbg !45
378
+ %356 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %308, float %324) #6, !dbg !45
379
+ %357 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %309, float %325) #6, !dbg !45
380
+ %358 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %310, float %326) #6, !dbg !45
381
+ %359 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %311, float %327) #6, !dbg !45
382
+ %360 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %312, float %328) #6, !dbg !45
383
+ %361 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %313, float %329) #6, !dbg !45
384
+ %362 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %314, float %330) #6, !dbg !45
385
+ %363 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %315, float %331) #6, !dbg !45
386
+ %364 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %316, float %332) #6, !dbg !45
387
+ %365 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %317, float %333) #6, !dbg !45
388
+ %366 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %318, float %334) #6, !dbg !45
389
+ %367 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %319, float %335) #6, !dbg !45
390
+ %368 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %320, float %336) #6, !dbg !45
391
+ %369 = fadd float %114, %353, !dbg !46
392
+ %370 = fadd float %115, %354, !dbg !46
393
+ %371 = fadd float %116, %355, !dbg !46
394
+ %372 = fadd float %117, %356, !dbg !46
395
+ %373 = fadd float %118, %357, !dbg !46
396
+ %374 = fadd float %119, %358, !dbg !46
397
+ %375 = fadd float %120, %359, !dbg !46
398
+ %376 = fadd float %121, %360, !dbg !46
399
+ %377 = fadd float %122, %361, !dbg !46
400
+ %378 = fadd float %123, %362, !dbg !46
401
+ %379 = fadd float %124, %363, !dbg !46
402
+ %380 = fadd float %125, %364, !dbg !46
403
+ %381 = fadd float %126, %365, !dbg !46
404
+ %382 = fadd float %127, %366, !dbg !46
405
+ %383 = fadd float %128, %367, !dbg !46
406
+ %384 = fadd float %129, %368, !dbg !46
407
+ %385 = fsub float %289, %369, !dbg !47
408
+ %386 = fsub float %290, %370, !dbg !47
409
+ %387 = fsub float %291, %371, !dbg !47
410
+ %388 = fsub float %292, %372, !dbg !47
411
+ %389 = fsub float %293, %373, !dbg !47
412
+ %390 = fsub float %294, %374, !dbg !47
413
+ %391 = fsub float %295, %375, !dbg !47
414
+ %392 = fsub float %296, %376, !dbg !47
415
+ %393 = fsub float %297, %377, !dbg !47
416
+ %394 = fsub float %298, %378, !dbg !47
417
+ %395 = fsub float %299, %379, !dbg !47
418
+ %396 = fsub float %300, %380, !dbg !47
419
+ %397 = fsub float %301, %381, !dbg !47
420
+ %398 = fsub float %302, %382, !dbg !47
421
+ %399 = fsub float %303, %383, !dbg !47
422
+ %400 = fsub float %304, %384, !dbg !47
423
+ %401 = fmul float %305, %385, !dbg !48
424
+ %402 = fmul float %306, %386, !dbg !48
425
+ %403 = fmul float %307, %387, !dbg !48
426
+ %404 = fmul float %308, %388, !dbg !48
427
+ %405 = fmul float %309, %389, !dbg !48
428
+ %406 = fmul float %310, %390, !dbg !48
429
+ %407 = fmul float %311, %391, !dbg !48
430
+ %408 = fmul float %312, %392, !dbg !48
431
+ %409 = fmul float %313, %393, !dbg !48
432
+ %410 = fmul float %314, %394, !dbg !48
433
+ %411 = fmul float %315, %395, !dbg !48
434
+ %412 = fmul float %316, %396, !dbg !48
435
+ %413 = fmul float %317, %397, !dbg !48
436
+ %414 = fmul float %318, %398, !dbg !48
437
+ %415 = fmul float %319, %399, !dbg !48
438
+ %416 = fmul float %320, %400, !dbg !48
439
+ %417 = fadd float %98, %401, !dbg !49
440
+ %418 = fadd float %99, %402, !dbg !49
441
+ %419 = fadd float %100, %403, !dbg !49
442
+ %420 = fadd float %101, %404, !dbg !49
443
+ %421 = fadd float %102, %405, !dbg !49
444
+ %422 = fadd float %103, %406, !dbg !49
445
+ %423 = fadd float %104, %407, !dbg !49
446
+ %424 = fadd float %105, %408, !dbg !49
447
+ %425 = fadd float %106, %409, !dbg !49
448
+ %426 = fadd float %107, %410, !dbg !49
449
+ %427 = fadd float %108, %411, !dbg !49
450
+ %428 = fadd float %109, %412, !dbg !49
451
+ %429 = fadd float %110, %413, !dbg !49
452
+ %430 = fadd float %111, %414, !dbg !49
453
+ %431 = fadd float %112, %415, !dbg !49
454
+ %432 = fadd float %113, %416, !dbg !49
455
+ %433 = add nuw nsw i32 %130, 64, !dbg !12
456
+ %434 = icmp ult i32 %130, 192, !dbg !12
457
+ br i1 %434, label %65, label %435, !dbg !12
458
+
459
+ 435: ; preds = %230
460
+ %436 = and i32 %16, 3, !dbg !12
461
+ %437 = mul nuw nsw i32 %436, 72, !dbg !12
462
+ %438 = add nuw nsw i32 %437, %12, !dbg !12
463
+ %439 = zext nneg i32 %438 to i64, !dbg !12
464
+ %440 = getelementptr float, ptr addrspace(3) @global_smem, i64 %439, !dbg !12
465
+ %441 = insertelement <1 x float> undef, float %337, i64 0, !dbg !12
466
+ store <1 x float> %441, ptr addrspace(3) %440, align 4, !dbg !12
467
+ %442 = add nuw nsw i32 %12, 288, !dbg !12
468
+ %443 = add nuw nsw i32 %442, %437, !dbg !12
469
+ %444 = zext nneg i32 %443 to i64, !dbg !12
470
+ %445 = getelementptr float, ptr addrspace(3) @global_smem, i64 %444, !dbg !12
471
+ %446 = insertelement <1 x float> undef, float %338, i64 0, !dbg !12
472
+ store <1 x float> %446, ptr addrspace(3) %445, align 4, !dbg !12
473
+ %447 = or i32 %12, 576, !dbg !12
474
+ %448 = add nuw nsw i32 %447, %437, !dbg !12
475
+ %449 = zext nneg i32 %448 to i64, !dbg !12
476
+ %450 = getelementptr float, ptr addrspace(3) @global_smem, i64 %449, !dbg !12
477
+ %451 = insertelement <1 x float> undef, float %339, i64 0, !dbg !12
478
+ store <1 x float> %451, ptr addrspace(3) %450, align 4, !dbg !12
479
+ %452 = add nuw nsw i32 %12, 864, !dbg !12
480
+ %453 = add nuw nsw i32 %452, %437, !dbg !12
481
+ %454 = zext nneg i32 %453 to i64, !dbg !12
482
+ %455 = getelementptr float, ptr addrspace(3) @global_smem, i64 %454, !dbg !12
483
+ %456 = insertelement <1 x float> undef, float %340, i64 0, !dbg !12
484
+ store <1 x float> %456, ptr addrspace(3) %455, align 4, !dbg !12
485
+ %457 = or i32 %12, 1152, !dbg !12
486
+ %458 = add nuw nsw i32 %457, %437, !dbg !12
487
+ %459 = zext nneg i32 %458 to i64, !dbg !12
488
+ %460 = getelementptr float, ptr addrspace(3) @global_smem, i64 %459, !dbg !12
489
+ %461 = insertelement <1 x float> undef, float %341, i64 0, !dbg !12
490
+ store <1 x float> %461, ptr addrspace(3) %460, align 4, !dbg !12
491
+ %462 = add nuw nsw i32 %12, 1440, !dbg !12
492
+ %463 = add nuw nsw i32 %462, %437, !dbg !12
493
+ %464 = zext nneg i32 %463 to i64, !dbg !12
494
+ %465 = getelementptr float, ptr addrspace(3) @global_smem, i64 %464, !dbg !12
495
+ %466 = insertelement <1 x float> undef, float %342, i64 0, !dbg !12
496
+ store <1 x float> %466, ptr addrspace(3) %465, align 4, !dbg !12
497
+ %467 = or i32 %12, 1728, !dbg !12
498
+ %468 = add nuw nsw i32 %467, %437, !dbg !12
499
+ %469 = zext nneg i32 %468 to i64, !dbg !12
500
+ %470 = getelementptr float, ptr addrspace(3) @global_smem, i64 %469, !dbg !12
501
+ %471 = insertelement <1 x float> undef, float %343, i64 0, !dbg !12
502
+ store <1 x float> %471, ptr addrspace(3) %470, align 4, !dbg !12
503
+ %472 = add nuw nsw i32 %12, 2016, !dbg !12
504
+ %473 = add nuw nsw i32 %472, %437, !dbg !12
505
+ %474 = zext nneg i32 %473 to i64, !dbg !12
506
+ %475 = getelementptr float, ptr addrspace(3) @global_smem, i64 %474, !dbg !12
507
+ %476 = insertelement <1 x float> undef, float %344, i64 0, !dbg !12
508
+ store <1 x float> %476, ptr addrspace(3) %475, align 4, !dbg !12
509
+ tail call void @llvm.nvvm.barrier0(), !dbg !12
510
+ %477 = mul nuw nsw i32 %11, 72, !dbg !12
511
+ %478 = add nuw nsw i32 %477, %14, !dbg !12
512
+ %479 = zext nneg i32 %478 to i64, !dbg !12
513
+ %480 = getelementptr float, ptr addrspace(3) @global_smem, i64 %479, !dbg !12
514
+ %481 = load float, ptr addrspace(3) %480, align 32, !dbg !12
515
+ %482 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 1, !dbg !12
516
+ %483 = load float, ptr addrspace(3) %482, align 4, !dbg !12
517
+ %484 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 2, !dbg !12
518
+ %485 = load float, ptr addrspace(3) %484, align 8, !dbg !12
519
+ %486 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 3, !dbg !12
520
+ %487 = load float, ptr addrspace(3) %486, align 4, !dbg !12
521
+ %488 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 4, !dbg !12
522
+ %489 = load float, ptr addrspace(3) %488, align 16, !dbg !12
523
+ %490 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 5, !dbg !12
524
+ %491 = load float, ptr addrspace(3) %490, align 4, !dbg !12
525
+ %492 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 6, !dbg !12
526
+ %493 = load float, ptr addrspace(3) %492, align 8, !dbg !12
527
+ %494 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 7, !dbg !12
528
+ %495 = load float, ptr addrspace(3) %494, align 4, !dbg !12
529
+ tail call void @llvm.nvvm.barrier0(), !dbg !12
530
+ %496 = insertelement <1 x float> undef, float %345, i64 0, !dbg !12
531
+ store <1 x float> %496, ptr addrspace(3) %440, align 4, !dbg !12
532
+ %497 = insertelement <1 x float> undef, float %346, i64 0, !dbg !12
533
+ store <1 x float> %497, ptr addrspace(3) %445, align 4, !dbg !12
534
+ %498 = insertelement <1 x float> undef, float %347, i64 0, !dbg !12
535
+ store <1 x float> %498, ptr addrspace(3) %450, align 4, !dbg !12
536
+ %499 = insertelement <1 x float> undef, float %348, i64 0, !dbg !12
537
+ store <1 x float> %499, ptr addrspace(3) %455, align 4, !dbg !12
538
+ %500 = insertelement <1 x float> undef, float %349, i64 0, !dbg !12
539
+ store <1 x float> %500, ptr addrspace(3) %460, align 4, !dbg !12
540
+ %501 = insertelement <1 x float> undef, float %350, i64 0, !dbg !12
541
+ store <1 x float> %501, ptr addrspace(3) %465, align 4, !dbg !12
542
+ %502 = insertelement <1 x float> undef, float %351, i64 0, !dbg !12
543
+ store <1 x float> %502, ptr addrspace(3) %470, align 4, !dbg !12
544
+ %503 = insertelement <1 x float> undef, float %352, i64 0, !dbg !12
545
+ store <1 x float> %503, ptr addrspace(3) %475, align 4, !dbg !12
546
+ tail call void @llvm.nvvm.barrier0(), !dbg !12
547
+ %504 = load float, ptr addrspace(3) %480, align 32, !dbg !12
548
+ %505 = load float, ptr addrspace(3) %482, align 4, !dbg !12
549
+ %506 = load float, ptr addrspace(3) %484, align 8, !dbg !12
550
+ %507 = load float, ptr addrspace(3) %486, align 4, !dbg !12
551
+ %508 = load float, ptr addrspace(3) %488, align 16, !dbg !12
552
+ %509 = load float, ptr addrspace(3) %490, align 4, !dbg !12
553
+ %510 = load float, ptr addrspace(3) %492, align 8, !dbg !12
554
+ %511 = load float, ptr addrspace(3) %494, align 4, !dbg !12
555
+ %512 = fsub float %370, %369, !dbg !50
556
+ %513 = fadd float %481, %483, !dbg !54
557
+ %514 = fcmp oeq float %513, 0.000000e+00, !dbg !55
558
+ %515 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %483, float %513) #6, !dbg !56
559
+ %516 = select i1 %514, float 0.000000e+00, float %515, !dbg !57
560
+ %517 = fmul float %512, %516, !dbg !58
561
+ %518 = fadd float %369, %517, !dbg !59
562
+ %519 = fadd float %417, %418, !dbg !60
563
+ %520 = fmul float %512, %512, !dbg !61
564
+ %521 = fmul float %520, %481, !dbg !62
565
+ %522 = fmul float %521, %516, !dbg !63
566
+ %523 = fadd float %519, %522, !dbg !64
567
+ %524 = fsub float %371, %518, !dbg !50
568
+ %525 = fadd float %485, %513, !dbg !54
569
+ %526 = fcmp oeq float %525, 0.000000e+00, !dbg !55
570
+ %527 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %485, float %525) #6, !dbg !56
571
+ %528 = select i1 %526, float 0.000000e+00, float %527, !dbg !57
572
+ %529 = fmul float %528, %524, !dbg !58
573
+ %530 = fadd float %518, %529, !dbg !59
574
+ %531 = fadd float %419, %523, !dbg !60
575
+ %532 = fmul float %524, %524, !dbg !61
576
+ %533 = fmul float %513, %532, !dbg !62
577
+ %534 = fmul float %528, %533, !dbg !63
578
+ %535 = fadd float %531, %534, !dbg !64
579
+ %536 = fsub float %372, %530, !dbg !50
580
+ %537 = fadd float %487, %525, !dbg !54
581
+ %538 = fcmp oeq float %537, 0.000000e+00, !dbg !55
582
+ %539 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %487, float %537) #6, !dbg !56
583
+ %540 = select i1 %538, float 0.000000e+00, float %539, !dbg !57
584
+ %541 = fmul float %540, %536, !dbg !58
585
+ %542 = fadd float %530, %541, !dbg !59
586
+ %543 = fadd float %420, %535, !dbg !60
587
+ %544 = fmul float %536, %536, !dbg !61
588
+ %545 = fmul float %525, %544, !dbg !62
589
+ %546 = fmul float %540, %545, !dbg !63
590
+ %547 = fadd float %543, %546, !dbg !64
591
+ %548 = fsub float %373, %542, !dbg !50
592
+ %549 = fadd float %489, %537, !dbg !54
593
+ %550 = fcmp oeq float %549, 0.000000e+00, !dbg !55
594
+ %551 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %489, float %549) #6, !dbg !56
595
+ %552 = select i1 %550, float 0.000000e+00, float %551, !dbg !57
596
+ %553 = fmul float %552, %548, !dbg !58
597
+ %554 = fadd float %542, %553, !dbg !59
598
+ %555 = fadd float %421, %547, !dbg !60
599
+ %556 = fmul float %548, %548, !dbg !61
600
+ %557 = fmul float %537, %556, !dbg !62
601
+ %558 = fmul float %552, %557, !dbg !63
602
+ %559 = fadd float %555, %558, !dbg !64
603
+ %560 = fsub float %374, %554, !dbg !50
604
+ %561 = fadd float %491, %549, !dbg !54
605
+ %562 = fcmp oeq float %561, 0.000000e+00, !dbg !55
606
+ %563 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %491, float %561) #6, !dbg !56
607
+ %564 = select i1 %562, float 0.000000e+00, float %563, !dbg !57
608
+ %565 = fmul float %564, %560, !dbg !58
609
+ %566 = fadd float %554, %565, !dbg !59
610
+ %567 = fadd float %422, %559, !dbg !60
611
+ %568 = fmul float %560, %560, !dbg !61
612
+ %569 = fmul float %549, %568, !dbg !62
613
+ %570 = fmul float %564, %569, !dbg !63
614
+ %571 = fadd float %567, %570, !dbg !64
615
+ %572 = fsub float %375, %566, !dbg !50
616
+ %573 = fadd float %493, %561, !dbg !54
617
+ %574 = fcmp oeq float %573, 0.000000e+00, !dbg !55
618
+ %575 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %493, float %573) #6, !dbg !56
619
+ %576 = select i1 %574, float 0.000000e+00, float %575, !dbg !57
620
+ %577 = fmul float %576, %572, !dbg !58
621
+ %578 = fadd float %566, %577, !dbg !59
622
+ %579 = fadd float %423, %571, !dbg !60
623
+ %580 = fmul float %572, %572, !dbg !61
624
+ %581 = fmul float %561, %580, !dbg !62
625
+ %582 = fmul float %576, %581, !dbg !63
626
+ %583 = fadd float %579, %582, !dbg !64
627
+ %584 = fsub float %376, %578, !dbg !50
628
+ %585 = fadd float %495, %573, !dbg !54
629
+ %586 = fcmp oeq float %585, 0.000000e+00, !dbg !55
630
+ %587 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %495, float %585) #6, !dbg !56
631
+ %588 = select i1 %586, float 0.000000e+00, float %587, !dbg !57
632
+ %589 = fmul float %588, %584, !dbg !58
633
+ %590 = fadd float %578, %589, !dbg !59
634
+ %591 = fadd float %424, %583, !dbg !60
635
+ %592 = fmul float %584, %584, !dbg !61
636
+ %593 = fmul float %573, %592, !dbg !62
637
+ %594 = fmul float %588, %593, !dbg !63
638
+ %595 = fadd float %591, %594, !dbg !64
639
+ %596 = fsub float %378, %377, !dbg !50
640
+ %597 = fadd float %504, %505, !dbg !54
641
+ %598 = fcmp oeq float %597, 0.000000e+00, !dbg !55
642
+ %599 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %505, float %597) #6, !dbg !56
643
+ %600 = select i1 %598, float 0.000000e+00, float %599, !dbg !57
644
+ %601 = fmul float %596, %600, !dbg !58
645
+ %602 = fadd float %377, %601, !dbg !59
646
+ %603 = fadd float %425, %426, !dbg !60
647
+ %604 = fmul float %596, %596, !dbg !61
648
+ %605 = fmul float %604, %504, !dbg !62
649
+ %606 = fmul float %605, %600, !dbg !63
650
+ %607 = fadd float %603, %606, !dbg !64
651
+ %608 = fsub float %379, %602, !dbg !50
652
+ %609 = fadd float %506, %597, !dbg !54
653
+ %610 = fcmp oeq float %609, 0.000000e+00, !dbg !55
654
+ %611 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %506, float %609) #6, !dbg !56
655
+ %612 = select i1 %610, float 0.000000e+00, float %611, !dbg !57
656
+ %613 = fmul float %612, %608, !dbg !58
657
+ %614 = fadd float %602, %613, !dbg !59
658
+ %615 = fadd float %427, %607, !dbg !60
659
+ %616 = fmul float %608, %608, !dbg !61
660
+ %617 = fmul float %597, %616, !dbg !62
661
+ %618 = fmul float %612, %617, !dbg !63
662
+ %619 = fadd float %615, %618, !dbg !64
663
+ %620 = fsub float %380, %614, !dbg !50
664
+ %621 = fadd float %507, %609, !dbg !54
665
+ %622 = fcmp oeq float %621, 0.000000e+00, !dbg !55
666
+ %623 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %507, float %621) #6, !dbg !56
667
+ %624 = select i1 %622, float 0.000000e+00, float %623, !dbg !57
668
+ %625 = fmul float %624, %620, !dbg !58
669
+ %626 = fadd float %614, %625, !dbg !59
670
+ %627 = fadd float %428, %619, !dbg !60
671
+ %628 = fmul float %620, %620, !dbg !61
672
+ %629 = fmul float %609, %628, !dbg !62
673
+ %630 = fmul float %624, %629, !dbg !63
674
+ %631 = fadd float %627, %630, !dbg !64
675
+ %632 = fsub float %381, %626, !dbg !50
676
+ %633 = fadd float %508, %621, !dbg !54
677
+ %634 = fcmp oeq float %633, 0.000000e+00, !dbg !55
678
+ %635 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %508, float %633) #6, !dbg !56
679
+ %636 = select i1 %634, float 0.000000e+00, float %635, !dbg !57
680
+ %637 = fmul float %636, %632, !dbg !58
681
+ %638 = fadd float %626, %637, !dbg !59
682
+ %639 = fadd float %429, %631, !dbg !60
683
+ %640 = fmul float %632, %632, !dbg !61
684
+ %641 = fmul float %621, %640, !dbg !62
685
+ %642 = fmul float %636, %641, !dbg !63
686
+ %643 = fadd float %639, %642, !dbg !64
687
+ %644 = fsub float %382, %638, !dbg !50
688
+ %645 = fadd float %509, %633, !dbg !54
689
+ %646 = fcmp oeq float %645, 0.000000e+00, !dbg !55
690
+ %647 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %509, float %645) #6, !dbg !56
691
+ %648 = select i1 %646, float 0.000000e+00, float %647, !dbg !57
692
+ %649 = fmul float %648, %644, !dbg !58
693
+ %650 = fadd float %638, %649, !dbg !59
694
+ %651 = fadd float %430, %643, !dbg !60
695
+ %652 = fmul float %644, %644, !dbg !61
696
+ %653 = fmul float %633, %652, !dbg !62
697
+ %654 = fmul float %648, %653, !dbg !63
698
+ %655 = fadd float %651, %654, !dbg !64
699
+ %656 = fsub float %383, %650, !dbg !50
700
+ %657 = fadd float %510, %645, !dbg !54
701
+ %658 = fcmp oeq float %657, 0.000000e+00, !dbg !55
702
+ %659 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %510, float %657) #6, !dbg !56
703
+ %660 = select i1 %658, float 0.000000e+00, float %659, !dbg !57
704
+ %661 = fmul float %660, %656, !dbg !58
705
+ %662 = fadd float %650, %661, !dbg !59
706
+ %663 = fadd float %431, %655, !dbg !60
707
+ %664 = fmul float %656, %656, !dbg !61
708
+ %665 = fmul float %645, %664, !dbg !62
709
+ %666 = fmul float %660, %665, !dbg !63
710
+ %667 = fadd float %663, %666, !dbg !64
711
+ %668 = fsub float %384, %662, !dbg !50
712
+ %669 = fadd float %511, %657, !dbg !54
713
+ %670 = fcmp oeq float %669, 0.000000e+00, !dbg !55
714
+ %671 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %511, float %669) #6, !dbg !56
715
+ %672 = select i1 %670, float 0.000000e+00, float %671, !dbg !57
716
+ %673 = fmul float %672, %668, !dbg !58
717
+ %674 = fadd float %662, %673, !dbg !59
718
+ %675 = fadd float %432, %667, !dbg !60
719
+ %676 = fmul float %668, %668, !dbg !61
720
+ %677 = fmul float %657, %676, !dbg !62
721
+ %678 = fmul float %672, %677, !dbg !63
722
+ %679 = fadd float %675, %678, !dbg !64
723
+ %680 = bitcast float %590 to i32, !dbg !65
724
+ %681 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %680, i32 4, i32 31), !dbg !65
725
+ %682 = bitcast i32 %681 to float, !dbg !65
726
+ %683 = bitcast float %595 to i32, !dbg !65
727
+ %684 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %683, i32 4, i32 31), !dbg !65
728
+ %685 = bitcast i32 %684 to float, !dbg !65
729
+ %686 = bitcast float %585 to i32, !dbg !65
730
+ %687 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %686, i32 4, i32 31), !dbg !65
731
+ %688 = bitcast i32 %687 to float, !dbg !65
732
+ %689 = fsub float %682, %590, !dbg !50
733
+ %690 = fadd float %585, %688, !dbg !54
734
+ %691 = fcmp oeq float %690, 0.000000e+00, !dbg !55
735
+ %692 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %688, float %690) #6, !dbg !56
736
+ %693 = select i1 %691, float 0.000000e+00, float %692, !dbg !57
737
+ %694 = fmul float %693, %689, !dbg !58
738
+ %695 = fadd float %590, %694, !dbg !59
739
+ %696 = fadd float %595, %685, !dbg !60
740
+ %697 = fmul float %689, %689, !dbg !61
741
+ %698 = fmul float %585, %697, !dbg !62
742
+ %699 = fmul float %693, %698, !dbg !63
743
+ %700 = fadd float %696, %699, !dbg !64
744
+ %701 = bitcast float %695 to i32, !dbg !65
745
+ %702 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %701, i32 2, i32 31), !dbg !65
746
+ %703 = bitcast i32 %702 to float, !dbg !65
747
+ %704 = bitcast float %700 to i32, !dbg !65
748
+ %705 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %704, i32 2, i32 31), !dbg !65
749
+ %706 = bitcast i32 %705 to float, !dbg !65
750
+ %707 = bitcast float %690 to i32, !dbg !65
751
+ %708 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %707, i32 2, i32 31), !dbg !65
752
+ %709 = bitcast i32 %708 to float, !dbg !65
753
+ %710 = fsub float %703, %695, !dbg !50
754
+ %711 = fadd float %690, %709, !dbg !54
755
+ %712 = fcmp oeq float %711, 0.000000e+00, !dbg !55
756
+ %713 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %709, float %711) #6, !dbg !56
757
+ %714 = select i1 %712, float 0.000000e+00, float %713, !dbg !57
758
+ %715 = fmul float %714, %710, !dbg !58
759
+ %716 = fadd float %695, %715, !dbg !59
760
+ %717 = fadd float %700, %706, !dbg !60
761
+ %718 = fmul float %710, %710, !dbg !61
762
+ %719 = fmul float %690, %718, !dbg !62
763
+ %720 = fmul float %714, %719, !dbg !63
764
+ %721 = fadd float %717, %720, !dbg !64
765
+ %722 = bitcast float %716 to i32, !dbg !65
766
+ %723 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %722, i32 1, i32 31), !dbg !65
767
+ %724 = bitcast i32 %723 to float, !dbg !65
768
+ %725 = bitcast float %721 to i32, !dbg !65
769
+ %726 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %725, i32 1, i32 31), !dbg !65
770
+ %727 = bitcast i32 %726 to float, !dbg !65
771
+ %728 = bitcast float %711 to i32, !dbg !65
772
+ %729 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %728, i32 1, i32 31), !dbg !65
773
+ %730 = bitcast i32 %729 to float, !dbg !65
774
+ %731 = fsub float %724, %716, !dbg !50
775
+ %732 = fadd float %711, %730, !dbg !54
776
+ %733 = fcmp oeq float %732, 0.000000e+00, !dbg !55
777
+ %734 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %730, float %732) #6, !dbg !56
778
+ %735 = select i1 %733, float 0.000000e+00, float %734, !dbg !57
779
+ %736 = fmul float %731, %735, !dbg !58
780
+ %737 = fadd float %716, %736, !dbg !59
781
+ %738 = fadd float %721, %727, !dbg !60
782
+ %739 = fmul float %731, %731, !dbg !61
783
+ %740 = fmul float %711, %739, !dbg !62
784
+ %741 = fmul float %735, %740, !dbg !63
785
+ %742 = fadd float %738, %741, !dbg !64
786
+ %743 = bitcast float %674 to i32, !dbg !65
787
+ %744 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %743, i32 4, i32 31), !dbg !65
788
+ %745 = bitcast i32 %744 to float, !dbg !65
789
+ %746 = bitcast float %679 to i32, !dbg !65
790
+ %747 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %746, i32 4, i32 31), !dbg !65
791
+ %748 = bitcast i32 %747 to float, !dbg !65
792
+ %749 = bitcast float %669 to i32, !dbg !65
793
+ %750 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %749, i32 4, i32 31), !dbg !65
794
+ %751 = bitcast i32 %750 to float, !dbg !65
795
+ %752 = fsub float %745, %674, !dbg !50
796
+ %753 = fadd float %669, %751, !dbg !54
797
+ %754 = fcmp oeq float %753, 0.000000e+00, !dbg !55
798
+ %755 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %751, float %753) #6, !dbg !56
799
+ %756 = select i1 %754, float 0.000000e+00, float %755, !dbg !57
800
+ %757 = fmul float %752, %756, !dbg !58
801
+ %758 = fadd float %674, %757, !dbg !59
802
+ %759 = fadd float %679, %748, !dbg !60
803
+ %760 = fmul float %752, %752, !dbg !61
804
+ %761 = fmul float %669, %760, !dbg !62
805
+ %762 = fmul float %761, %756, !dbg !63
806
+ %763 = fadd float %759, %762, !dbg !64
807
+ %764 = bitcast float %758 to i32, !dbg !65
808
+ %765 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %764, i32 2, i32 31), !dbg !65
809
+ %766 = bitcast i32 %765 to float, !dbg !65
810
+ %767 = bitcast float %763 to i32, !dbg !65
811
+ %768 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %767, i32 2, i32 31), !dbg !65
812
+ %769 = bitcast i32 %768 to float, !dbg !65
813
+ %770 = bitcast float %753 to i32, !dbg !65
814
+ %771 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %770, i32 2, i32 31), !dbg !65
815
+ %772 = bitcast i32 %771 to float, !dbg !65
816
+ %773 = fsub float %766, %758, !dbg !50
817
+ %774 = fadd float %753, %772, !dbg !54
818
+ %775 = fcmp oeq float %774, 0.000000e+00, !dbg !55
819
+ %776 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %772, float %774) #6, !dbg !56
820
+ %777 = select i1 %775, float 0.000000e+00, float %776, !dbg !57
821
+ %778 = fmul float %773, %777, !dbg !58
822
+ %779 = fadd float %758, %778, !dbg !59
823
+ %780 = fadd float %763, %769, !dbg !60
824
+ %781 = fmul float %773, %773, !dbg !61
825
+ %782 = fmul float %753, %781, !dbg !62
826
+ %783 = fmul float %777, %782, !dbg !63
827
+ %784 = fadd float %780, %783, !dbg !64
828
+ %785 = bitcast float %779 to i32, !dbg !65
829
+ %786 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %785, i32 1, i32 31), !dbg !65
830
+ %787 = bitcast i32 %786 to float, !dbg !65
831
+ %788 = bitcast float %784 to i32, !dbg !65
832
+ %789 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %788, i32 1, i32 31), !dbg !65
833
+ %790 = bitcast i32 %789 to float, !dbg !65
834
+ %791 = bitcast float %774 to i32, !dbg !65
835
+ %792 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %791, i32 1, i32 31), !dbg !65
836
+ %793 = bitcast i32 %792 to float, !dbg !65
837
+ %794 = fsub float %787, %779, !dbg !50
838
+ %795 = fadd float %774, %793, !dbg !54
839
+ %796 = fcmp oeq float %795, 0.000000e+00, !dbg !55
840
+ %797 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %793, float %795) #6, !dbg !56
841
+ %798 = select i1 %796, float 0.000000e+00, float %797, !dbg !57
842
+ %799 = fmul float %794, %798, !dbg !58
843
+ %800 = fadd float %779, %799, !dbg !59
844
+ %801 = fadd float %784, %790, !dbg !60
845
+ %802 = fmul float %794, %794, !dbg !61
846
+ %803 = fmul float %774, %802, !dbg !62
847
+ %804 = fmul float %798, %803, !dbg !63
848
+ %805 = fadd float %801, %804, !dbg !64
849
+ %806 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
850
+ %807 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
851
+ %808 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
852
+ %809 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
853
+ %810 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
854
+ %811 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
855
+ %812 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
856
+ %813 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
857
+ %814 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
858
+ %815 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
859
+ %816 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
860
+ %817 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
861
+ %818 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
862
+ %819 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
863
+ %820 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
864
+ %821 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
865
+ %822 = fadd float %806, 0x3EE4F8B580000000, !dbg !68
866
+ %823 = fadd float %814, 0x3EE4F8B580000000, !dbg !68
867
+ br label %824, !dbg !69
868
+
869
+ 824: ; preds = %435, %__nv_rsqrtf.exit40
870
+ %825 = phi i32 [ 0, %435 ], [ %1134, %__nv_rsqrtf.exit40 ]
871
+ %826 = or i32 %825, %14, !dbg !70
872
+ %827 = or i32 %825, %15, !dbg !70
873
+ %828 = add i32 %826, %47, !dbg !71
874
+ %829 = add i32 %827, %47, !dbg !71
875
+ %830 = add i32 %826, %48, !dbg !71
876
+ %831 = add i32 %827, %48, !dbg !71
877
+ %832 = sext i32 %828 to i64, !dbg !72
878
+ %833 = getelementptr float, ptr addrspace(1) %2, i64 %832, !dbg !72
879
+ %834 = sext i32 %829 to i64, !dbg !72
880
+ %835 = getelementptr float, ptr addrspace(1) %2, i64 %834, !dbg !72
881
+ %836 = sext i32 %830 to i64, !dbg !72
882
+ %837 = getelementptr float, ptr addrspace(1) %2, i64 %836, !dbg !72
883
+ %838 = sext i32 %831 to i64, !dbg !72
884
+ %839 = getelementptr float, ptr addrspace(1) %2, i64 %838, !dbg !72
885
+ %840 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %833, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
886
+ %841 = extractvalue { i32, i32, i32, i32 } %840, 0, !dbg !73
887
+ %842 = extractvalue { i32, i32, i32, i32 } %840, 1, !dbg !73
888
+ %843 = extractvalue { i32, i32, i32, i32 } %840, 2, !dbg !73
889
+ %844 = extractvalue { i32, i32, i32, i32 } %840, 3, !dbg !73
890
+ %845 = bitcast i32 %841 to float, !dbg !73
891
+ %846 = bitcast i32 %842 to float, !dbg !73
892
+ %847 = bitcast i32 %843 to float, !dbg !73
893
+ %848 = bitcast i32 %844 to float, !dbg !73
894
+ %849 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %835, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
895
+ %850 = extractvalue { i32, i32, i32, i32 } %849, 0, !dbg !73
896
+ %851 = extractvalue { i32, i32, i32, i32 } %849, 1, !dbg !73
897
+ %852 = extractvalue { i32, i32, i32, i32 } %849, 2, !dbg !73
898
+ %853 = extractvalue { i32, i32, i32, i32 } %849, 3, !dbg !73
899
+ %854 = bitcast i32 %850 to float, !dbg !73
900
+ %855 = bitcast i32 %851 to float, !dbg !73
901
+ %856 = bitcast i32 %852 to float, !dbg !73
902
+ %857 = bitcast i32 %853 to float, !dbg !73
903
+ %858 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %837, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
904
+ %859 = extractvalue { i32, i32, i32, i32 } %858, 0, !dbg !73
905
+ %860 = extractvalue { i32, i32, i32, i32 } %858, 1, !dbg !73
906
+ %861 = extractvalue { i32, i32, i32, i32 } %858, 2, !dbg !73
907
+ %862 = extractvalue { i32, i32, i32, i32 } %858, 3, !dbg !73
908
+ %863 = bitcast i32 %859 to float, !dbg !73
909
+ %864 = bitcast i32 %860 to float, !dbg !73
910
+ %865 = bitcast i32 %861 to float, !dbg !73
911
+ %866 = bitcast i32 %862 to float, !dbg !73
912
+ %867 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %839, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
913
+ %868 = extractvalue { i32, i32, i32, i32 } %867, 0, !dbg !73
914
+ %869 = extractvalue { i32, i32, i32, i32 } %867, 1, !dbg !73
915
+ %870 = extractvalue { i32, i32, i32, i32 } %867, 2, !dbg !73
916
+ %871 = extractvalue { i32, i32, i32, i32 } %867, 3, !dbg !73
917
+ %872 = bitcast i32 %868 to float, !dbg !73
918
+ %873 = bitcast i32 %869 to float, !dbg !73
919
+ %874 = bitcast i32 %870 to float, !dbg !73
920
+ %875 = bitcast i32 %871 to float, !dbg !73
921
+ %876 = add i32 %826, %49, !dbg !74
922
+ %877 = add i32 %826, %50, !dbg !74
923
+ %878 = sext i32 %876 to i64, !dbg !75
924
+ %879 = getelementptr i16, ptr addrspace(1) %3, i64 %878, !dbg !75
925
+ %880 = sext i32 %877 to i64, !dbg !75
926
+ %881 = getelementptr i16, ptr addrspace(1) %3, i64 %880, !dbg !75
927
+ %882 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %879, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76
928
+ %883 = extractvalue { i32, i32, i32, i32 } %882, 0, !dbg !76
929
+ %884 = extractvalue { i32, i32, i32, i32 } %882, 1, !dbg !76
930
+ %885 = extractvalue { i32, i32, i32, i32 } %882, 2, !dbg !76
931
+ %886 = extractvalue { i32, i32, i32, i32 } %882, 3, !dbg !76
932
+ %887 = trunc i32 %883 to i16, !dbg !76
933
+ %extelt.offset = lshr i32 %883, 16, !dbg !76
934
+ %888 = trunc i32 %extelt.offset to i16, !dbg !76
935
+ %889 = trunc i32 %884 to i16, !dbg !76
936
+ %extelt.offset2 = lshr i32 %884, 16, !dbg !76
937
+ %890 = trunc i32 %extelt.offset2 to i16, !dbg !76
938
+ %891 = trunc i32 %885 to i16, !dbg !76
939
+ %extelt.offset3 = lshr i32 %885, 16, !dbg !76
940
+ %892 = trunc i32 %extelt.offset3 to i16, !dbg !76
941
+ %893 = trunc i32 %886 to i16, !dbg !76
942
+ %extelt.offset4 = lshr i32 %886, 16, !dbg !76
943
+ %894 = trunc i32 %extelt.offset4 to i16, !dbg !76
944
+ %895 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %881, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76
945
+ %896 = extractvalue { i32, i32, i32, i32 } %895, 0, !dbg !76
946
+ %897 = extractvalue { i32, i32, i32, i32 } %895, 1, !dbg !76
947
+ %898 = extractvalue { i32, i32, i32, i32 } %895, 2, !dbg !76
948
+ %899 = extractvalue { i32, i32, i32, i32 } %895, 3, !dbg !76
949
+ %900 = trunc i32 %896 to i16, !dbg !76
950
+ %extelt.offset5 = lshr i32 %896, 16, !dbg !76
951
+ %901 = trunc i32 %extelt.offset5 to i16, !dbg !76
952
+ %902 = trunc i32 %897 to i16, !dbg !76
953
+ %extelt.offset6 = lshr i32 %897, 16, !dbg !76
954
+ %903 = trunc i32 %extelt.offset6 to i16, !dbg !76
955
+ %904 = trunc i32 %898 to i16, !dbg !76
956
+ %extelt.offset7 = lshr i32 %898, 16, !dbg !76
957
+ %905 = trunc i32 %extelt.offset7 to i16, !dbg !76
958
+ %906 = trunc i32 %899 to i16, !dbg !76
959
+ %extelt.offset8 = lshr i32 %899, 16, !dbg !76
960
+ %907 = trunc i32 %extelt.offset8 to i16, !dbg !76
961
+ %908 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %887) #6, !dbg !77
962
+ %909 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %888) #6, !dbg !77
963
+ %910 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %889) #6, !dbg !77
964
+ %911 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %890) #6, !dbg !77
965
+ %912 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %891) #6, !dbg !77
966
+ %913 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %892) #6, !dbg !77
967
+ %914 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %893) #6, !dbg !77
968
+ %915 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %894) #6, !dbg !77
969
+ %916 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %900) #6, !dbg !77
970
+ %917 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %901) #6, !dbg !77
971
+ %918 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %902) #6, !dbg !77
972
+ %919 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %903) #6, !dbg !77
973
+ %920 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %904) #6, !dbg !77
974
+ %921 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %905) #6, !dbg !77
975
+ %922 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %906) #6, !dbg !77
976
+ %923 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %907) #6, !dbg !77
977
+ %924 = zext nneg i32 %826 to i64, !dbg !78
978
+ %925 = getelementptr float, ptr addrspace(1) %4, i64 %924, !dbg !78
979
+ %926 = zext nneg i32 %827 to i64, !dbg !78
980
+ %927 = getelementptr float, ptr addrspace(1) %4, i64 %926, !dbg !78
981
+ %928 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %925, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
982
+ %929 = extractvalue { i32, i32, i32, i32 } %928, 0, !dbg !79
983
+ %930 = extractvalue { i32, i32, i32, i32 } %928, 1, !dbg !79
984
+ %931 = extractvalue { i32, i32, i32, i32 } %928, 2, !dbg !79
985
+ %932 = extractvalue { i32, i32, i32, i32 } %928, 3, !dbg !79
986
+ %933 = bitcast i32 %929 to float, !dbg !79
987
+ %934 = bitcast i32 %930 to float, !dbg !79
988
+ %935 = bitcast i32 %931 to float, !dbg !79
989
+ %936 = bitcast i32 %932 to float, !dbg !79
990
+ %937 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %927, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
991
+ %938 = extractvalue { i32, i32, i32, i32 } %937, 0, !dbg !79
992
+ %939 = extractvalue { i32, i32, i32, i32 } %937, 1, !dbg !79
993
+ %940 = extractvalue { i32, i32, i32, i32 } %937, 2, !dbg !79
994
+ %941 = extractvalue { i32, i32, i32, i32 } %937, 3, !dbg !79
995
+ %942 = bitcast i32 %938 to float, !dbg !79
996
+ %943 = bitcast i32 %939 to float, !dbg !79
997
+ %944 = bitcast i32 %940 to float, !dbg !79
998
+ %945 = bitcast i32 %941 to float, !dbg !79
999
+ br i1 %56, label %946, label %947, !dbg !80
1000
+
1001
+ 946: ; preds = %824
1002
+ tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !80
1003
+ br label %947, !dbg !80
1004
+
1005
+ 947: ; preds = %946, %824
1006
+ %948 = getelementptr float, ptr addrspace(1) %63, i64 %924, !dbg !81
1007
+ %949 = getelementptr float, ptr addrspace(1) %63, i64 %926, !dbg !81
1008
+ %950 = getelementptr float, ptr addrspace(1) %64, i64 %924, !dbg !81
1009
+ %951 = getelementptr float, ptr addrspace(1) %64, i64 %926, !dbg !81
1010
+ %952 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %948, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
1011
+ %953 = extractvalue { i32, i32, i32, i32 } %952, 0, !dbg !82
1012
+ %954 = extractvalue { i32, i32, i32, i32 } %952, 1, !dbg !82
1013
+ %955 = extractvalue { i32, i32, i32, i32 } %952, 2, !dbg !82
1014
+ %956 = extractvalue { i32, i32, i32, i32 } %952, 3, !dbg !82
1015
+ %957 = bitcast i32 %953 to float, !dbg !82
1016
+ %958 = bitcast i32 %954 to float, !dbg !82
1017
+ %959 = bitcast i32 %955 to float, !dbg !82
1018
+ %960 = bitcast i32 %956 to float, !dbg !82
1019
+ %961 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %949, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
1020
+ %962 = extractvalue { i32, i32, i32, i32 } %961, 0, !dbg !82
1021
+ %963 = extractvalue { i32, i32, i32, i32 } %961, 1, !dbg !82
1022
+ %964 = extractvalue { i32, i32, i32, i32 } %961, 2, !dbg !82
1023
+ %965 = extractvalue { i32, i32, i32, i32 } %961, 3, !dbg !82
1024
+ %966 = bitcast i32 %962 to float, !dbg !82
1025
+ %967 = bitcast i32 %963 to float, !dbg !82
1026
+ %968 = bitcast i32 %964 to float, !dbg !82
1027
+ %969 = bitcast i32 %965 to float, !dbg !82
1028
+ %970 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %950, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
1029
+ %971 = extractvalue { i32, i32, i32, i32 } %970, 0, !dbg !82
1030
+ %972 = extractvalue { i32, i32, i32, i32 } %970, 1, !dbg !82
1031
+ %973 = extractvalue { i32, i32, i32, i32 } %970, 2, !dbg !82
1032
+ %974 = extractvalue { i32, i32, i32, i32 } %970, 3, !dbg !82
1033
+ %975 = bitcast i32 %971 to float, !dbg !82
1034
+ %976 = bitcast i32 %972 to float, !dbg !82
1035
+ %977 = bitcast i32 %973 to float, !dbg !82
1036
+ %978 = bitcast i32 %974 to float, !dbg !82
1037
+ %979 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %951, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
1038
+ %980 = extractvalue { i32, i32, i32, i32 } %979, 0, !dbg !82
1039
+ %981 = extractvalue { i32, i32, i32, i32 } %979, 1, !dbg !82
1040
+ %982 = extractvalue { i32, i32, i32, i32 } %979, 2, !dbg !82
1041
+ %983 = extractvalue { i32, i32, i32, i32 } %979, 3, !dbg !82
1042
+ %984 = bitcast i32 %980 to float, !dbg !82
1043
+ %985 = bitcast i32 %981 to float, !dbg !82
1044
+ %986 = bitcast i32 %982 to float, !dbg !82
1045
+ %987 = bitcast i32 %983 to float, !dbg !82
1046
+ %988 = fadd float %845, %957, !dbg !83
1047
+ %989 = fadd float %846, %958, !dbg !83
1048
+ %990 = fadd float %847, %959, !dbg !83
1049
+ %991 = fadd float %848, %960, !dbg !83
1050
+ %992 = fadd float %854, %966, !dbg !83
1051
+ %993 = fadd float %855, %967, !dbg !83
1052
+ %994 = fadd float %856, %968, !dbg !83
1053
+ %995 = fadd float %857, %969, !dbg !83
1054
+ %996 = fadd float %863, %975, !dbg !83
1055
+ %997 = fadd float %864, %976, !dbg !83
1056
+ %998 = fadd float %865, %977, !dbg !83
1057
+ %999 = fadd float %866, %978, !dbg !83
1058
+ %1000 = fadd float %872, %984, !dbg !83
1059
+ %1001 = fadd float %873, %985, !dbg !83
1060
+ %1002 = fadd float %874, %986, !dbg !83
1061
+ %1003 = fadd float %875, %987, !dbg !83
1062
+ %1004 = fadd float %908, %988, !dbg !84
1063
+ %1005 = fadd float %909, %989, !dbg !84
1064
+ %1006 = fadd float %910, %990, !dbg !84
1065
+ %1007 = fadd float %911, %991, !dbg !84
1066
+ %1008 = fadd float %912, %992, !dbg !84
1067
+ %1009 = fadd float %913, %993, !dbg !84
1068
+ %1010 = fadd float %914, %994, !dbg !84
1069
+ %1011 = fadd float %915, %995, !dbg !84
1070
+ %1012 = fadd float %916, %996, !dbg !84
1071
+ %1013 = fadd float %917, %997, !dbg !84
1072
+ %1014 = fadd float %918, %998, !dbg !84
1073
+ %1015 = fadd float %919, %999, !dbg !84
1074
+ %1016 = fadd float %920, %1000, !dbg !84
1075
+ %1017 = fadd float %921, %1001, !dbg !84
1076
+ %1018 = fadd float %922, %1002, !dbg !84
1077
+ %1019 = fadd float %923, %1003, !dbg !84
1078
+ %1020 = fsub float %1004, %737, !dbg !85
1079
+ %1021 = fsub float %1005, %737, !dbg !85
1080
+ %1022 = fsub float %1006, %737, !dbg !85
1081
+ %1023 = fsub float %1007, %737, !dbg !85
1082
+ %1024 = fsub float %1008, %737, !dbg !85
1083
+ %1025 = fsub float %1009, %737, !dbg !85
1084
+ %1026 = fsub float %1010, %737, !dbg !85
1085
+ %1027 = fsub float %1011, %737, !dbg !85
1086
+ %1028 = fsub float %1012, %800, !dbg !85
1087
+ %1029 = fsub float %1013, %800, !dbg !85
1088
+ %1030 = fsub float %1014, %800, !dbg !85
1089
+ %1031 = fsub float %1015, %800, !dbg !85
1090
+ %1032 = fsub float %1016, %800, !dbg !85
1091
+ %1033 = fsub float %1017, %800, !dbg !85
1092
+ %1034 = fsub float %1018, %800, !dbg !85
1093
+ %1035 = fsub float %1019, %800, !dbg !85
1094
+ %1036 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1095
+ %.not.i = icmp eq i32 %1036, 0, !dbg !86
1096
+ br i1 %.not.i, label %1039, label %1037, !dbg !86
1097
+
1098
+ 1037: ; preds = %947
1099
+ %1038 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %822), !dbg !86
1100
+ br label %__nv_rsqrtf.exit, !dbg !86
1101
+
1102
+ 1039: ; preds = %947
1103
+ %1040 = tail call float @llvm.nvvm.rsqrt.approx.f(float %822), !dbg !86
1104
+ br label %__nv_rsqrtf.exit, !dbg !86
1105
+
1106
+ __nv_rsqrtf.exit: ; preds = %1037, %1039
1107
+ %.0.i = phi float [ %1038, %1037 ], [ %1040, %1039 ], !dbg !86
1108
+ %1041 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1109
+ %1042 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1110
+ %1043 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1111
+ %1044 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1112
+ %1045 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1113
+ %1046 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1114
+ %1047 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1115
+ %1048 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1116
+ %.not.i38 = icmp eq i32 %1048, 0, !dbg !86
1117
+ br i1 %.not.i38, label %1051, label %1049, !dbg !86
1118
+
1119
+ 1049: ; preds = %__nv_rsqrtf.exit
1120
+ %1050 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %823), !dbg !86
1121
+ br label %__nv_rsqrtf.exit40, !dbg !86
1122
+
1123
+ 1051: ; preds = %__nv_rsqrtf.exit
1124
+ %1052 = tail call float @llvm.nvvm.rsqrt.approx.f(float %823), !dbg !86
1125
+ br label %__nv_rsqrtf.exit40, !dbg !86
1126
+
1127
+ __nv_rsqrtf.exit40: ; preds = %1049, %1051
1128
+ %.0.i39 = phi float [ %1050, %1049 ], [ %1052, %1051 ], !dbg !86
1129
+ %1053 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1130
+ %1054 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1131
+ %1055 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1132
+ %1056 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1133
+ %1057 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1134
+ %1058 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1135
+ %1059 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
1136
+ %1060 = fmul float %1020, %.0.i, !dbg !87
1137
+ %1061 = fmul float %1021, %.0.i, !dbg !87
1138
+ %1062 = fmul float %1022, %.0.i, !dbg !87
1139
+ %1063 = fmul float %1023, %.0.i, !dbg !87
1140
+ %1064 = fmul float %1024, %.0.i, !dbg !87
1141
+ %1065 = fmul float %1025, %.0.i, !dbg !87
1142
+ %1066 = fmul float %1026, %.0.i, !dbg !87
1143
+ %1067 = fmul float %1027, %.0.i, !dbg !87
1144
+ %1068 = fmul float %1028, %.0.i39, !dbg !87
1145
+ %1069 = fmul float %1029, %.0.i39, !dbg !87
1146
+ %1070 = fmul float %1030, %.0.i39, !dbg !87
1147
+ %1071 = fmul float %1031, %.0.i39, !dbg !87
1148
+ %1072 = fmul float %1032, %.0.i39, !dbg !87
1149
+ %1073 = fmul float %1033, %.0.i39, !dbg !87
1150
+ %1074 = fmul float %1034, %.0.i39, !dbg !87
1151
+ %1075 = fmul float %1035, %.0.i39, !dbg !87
1152
+ %1076 = fmul float %1060, %933, !dbg !88
1153
+ %1077 = fmul float %1061, %934, !dbg !88
1154
+ %1078 = fmul float %1062, %935, !dbg !88
1155
+ %1079 = fmul float %1063, %936, !dbg !88
1156
+ %1080 = fmul float %1064, %942, !dbg !88
1157
+ %1081 = fmul float %1065, %943, !dbg !88
1158
+ %1082 = fmul float %1066, %944, !dbg !88
1159
+ %1083 = fmul float %1067, %945, !dbg !88
1160
+ %1084 = fmul float %1068, %933, !dbg !88
1161
+ %1085 = fmul float %1069, %934, !dbg !88
1162
+ %1086 = fmul float %1070, %935, !dbg !88
1163
+ %1087 = fmul float %1071, %936, !dbg !88
1164
+ %1088 = fmul float %1072, %942, !dbg !88
1165
+ %1089 = fmul float %1073, %943, !dbg !88
1166
+ %1090 = fmul float %1074, %944, !dbg !88
1167
+ %1091 = fmul float %1075, %945, !dbg !88
1168
+ %1092 = getelementptr i16, ptr addrspace(1) %5, i64 %878, !dbg !89
1169
+ %1093 = getelementptr i16, ptr addrspace(1) %5, i64 %880, !dbg !89
1170
+ %1094 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1076) #6, !dbg !90
1171
+ %1095 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1077) #6, !dbg !90
1172
+ %1096 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1078) #6, !dbg !90
1173
+ %1097 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1079) #6, !dbg !90
1174
+ %1098 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1080) #6, !dbg !90
1175
+ %1099 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1081) #6, !dbg !90
1176
+ %1100 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1082) #6, !dbg !90
1177
+ %1101 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1083) #6, !dbg !90
1178
+ %1102 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1084) #6, !dbg !90
1179
+ %1103 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1085) #6, !dbg !90
1180
+ %1104 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1086) #6, !dbg !90
1181
+ %1105 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1087) #6, !dbg !90
1182
+ %1106 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1088) #6, !dbg !90
1183
+ %1107 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1089) #6, !dbg !90
1184
+ %1108 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1090) #6, !dbg !90
1185
+ %1109 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1091) #6, !dbg !90
1186
+ %1110 = insertelement <2 x i16> undef, i16 %1094, i64 0, !dbg !90
1187
+ %1111 = insertelement <2 x i16> %1110, i16 %1095, i64 1, !dbg !90
1188
+ %1112 = bitcast <2 x i16> %1111 to i32, !dbg !90
1189
+ %1113 = insertelement <2 x i16> undef, i16 %1096, i64 0, !dbg !90
1190
+ %1114 = insertelement <2 x i16> %1113, i16 %1097, i64 1, !dbg !90
1191
+ %1115 = bitcast <2 x i16> %1114 to i32, !dbg !90
1192
+ %1116 = insertelement <2 x i16> undef, i16 %1098, i64 0, !dbg !90
1193
+ %1117 = insertelement <2 x i16> %1116, i16 %1099, i64 1, !dbg !90
1194
+ %1118 = bitcast <2 x i16> %1117 to i32, !dbg !90
1195
+ %1119 = insertelement <2 x i16> undef, i16 %1100, i64 0, !dbg !90
1196
+ %1120 = insertelement <2 x i16> %1119, i16 %1101, i64 1, !dbg !90
1197
+ %1121 = bitcast <2 x i16> %1120 to i32, !dbg !90
1198
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1112, i32 %1115, i32 %1118, i32 %1121, ptr addrspace(1) %1092, i1 true) #6, !dbg !90
1199
+ %1122 = insertelement <2 x i16> undef, i16 %1102, i64 0, !dbg !90
1200
+ %1123 = insertelement <2 x i16> %1122, i16 %1103, i64 1, !dbg !90
1201
+ %1124 = bitcast <2 x i16> %1123 to i32, !dbg !90
1202
+ %1125 = insertelement <2 x i16> undef, i16 %1104, i64 0, !dbg !90
1203
+ %1126 = insertelement <2 x i16> %1125, i16 %1105, i64 1, !dbg !90
1204
+ %1127 = bitcast <2 x i16> %1126 to i32, !dbg !90
1205
+ %1128 = insertelement <2 x i16> undef, i16 %1106, i64 0, !dbg !90
1206
+ %1129 = insertelement <2 x i16> %1128, i16 %1107, i64 1, !dbg !90
1207
+ %1130 = bitcast <2 x i16> %1129 to i32, !dbg !90
1208
+ %1131 = insertelement <2 x i16> undef, i16 %1108, i64 0, !dbg !90
1209
+ %1132 = insertelement <2 x i16> %1131, i16 %1109, i64 1, !dbg !90
1210
+ %1133 = bitcast <2 x i16> %1132 to i32, !dbg !90
1211
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1124, i32 %1127, i32 %1130, i32 %1133, ptr addrspace(1) %1093, i1 true) #6, !dbg !90
1212
+ %1134 = add nuw nsw i32 %825, 64, !dbg !69
1213
+ %1135 = icmp ult i32 %825, 192, !dbg !69
1214
+ br i1 %1135, label %824, label %1136, !dbg !69
1215
+
1216
+ 1136: ; preds = %__nv_rsqrtf.exit40
1217
+ ret void, !dbg !91
1218
+ }
1219
+
1220
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
1221
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
1222
+
1223
+ ; Function Attrs: convergent nocallback nounwind
1224
+ declare void @llvm.nvvm.barrier0() #1
1225
+
1226
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
1227
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
1228
+
1229
+ ; Function Attrs: alwaysinline nounwind
1230
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
1231
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
1232
+ %.not = icmp eq i32 %1, 0
1233
+ br i1 %.not, label %4, label %2
1234
+
1235
+ 2: ; preds = %0
1236
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
1237
+ br label %6
1238
+
1239
+ 4: ; preds = %0
1240
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
1241
+ br label %6
1242
+
1243
+ 6: ; preds = %4, %2
1244
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
1245
+ ret float %.0
1246
+ }
1247
+
1248
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
1249
+
1250
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
1251
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
1252
+
1253
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
1254
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
1255
+
1256
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
1257
+ attributes #1 = { convergent nocallback nounwind }
1258
+ attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
1259
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
1260
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
1261
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
1262
+ attributes #6 = { nounwind }
1263
+
1264
+ !llvm.module.flags = !{!0, !1}
1265
+ !llvm.dbg.cu = !{!2}
1266
+ !nvvm.annotations = !{!4, !5, !5, !4}
1267
+ !llvm.ident = !{!6}
1268
+
1269
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
1270
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
1271
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
1272
+ !3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
1273
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
1274
+ !5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 256}
1275
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
1276
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
1277
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
1278
+ !9 = !{}
1279
+ !10 = !DILocation(line: 22, column: 44, scope: !7)
1280
+ !11 = !DILocation(line: 24, column: 33, scope: !7)
1281
+ !12 = !DILocation(line: 31, column: 36, scope: !7)
1282
+ !13 = !DILocation(line: 21, column: 28, scope: !7)
1283
+ !14 = !DILocation(line: 21, column: 33, scope: !7)
1284
+ !15 = !DILocation(line: 22, column: 23, scope: !7)
1285
+ !16 = !DILocation(line: 26, column: 30, scope: !7)
1286
+ !17 = !DILocation(line: 26, column: 35, scope: !7)
1287
+ !18 = !DILocation(line: 27, column: 18, scope: !7)
1288
+ !19 = !DILocation(line: 35, column: 44, scope: !7)
1289
+ !20 = !DILocation(line: 36, column: 44, scope: !7)
1290
+ !21 = !DILocation(line: 37, column: 22, scope: !7)
1291
+ !22 = !DILocation(line: 38, column: 22, scope: !7)
1292
+ !23 = !DILocation(line: 39, column: 36, scope: !7)
1293
+ !24 = !DILocation(line: 40, column: 40, scope: !7)
1294
+ !25 = !DILocation(line: 41, column: 44, scope: !7)
1295
+ !26 = !DILocation(line: 32, column: 27, scope: !7)
1296
+ !27 = !DILocation(line: 35, column: 40, scope: !7)
1297
+ !28 = !DILocation(line: 35, column: 34, scope: !7)
1298
+ !29 = !DILocation(line: 35, column: 50, scope: !7)
1299
+ !30 = !DILocation(line: 36, column: 40, scope: !7)
1300
+ !31 = !DILocation(line: 36, column: 34, scope: !7)
1301
+ !32 = !DILocation(line: 36, column: 50, scope: !7)
1302
+ !33 = !DILocation(line: 36, column: 101, scope: !7)
1303
+ !34 = !DILocation(line: 40, column: 55, scope: !7)
1304
+ !35 = !DILocation(line: 41, column: 40, scope: !7)
1305
+ !36 = !DILocation(line: 41, column: 34, scope: !7)
1306
+ !37 = !DILocation(line: 41, column: 52, scope: !7)
1307
+ !38 = !DILocation(line: 42, column: 22, scope: !7)
1308
+ !39 = !DILocation(line: 44, column: 22, scope: !7)
1309
+ !40 = !DILocation(line: 96, column: 20, scope: !41, inlinedAt: !43)
1310
+ !41 = distinct !DILexicalBlockFile(scope: !7, file: !42, discriminator: 0)
1311
+ !42 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
1312
+ !43 = !DILocation(line: 47, column: 41, scope: !41)
1313
+ !44 = !DILocation(line: 97, column: 26, scope: !41, inlinedAt: !43)
1314
+ !45 = !DILocation(line: 98, column: 30, scope: !41, inlinedAt: !43)
1315
+ !46 = !DILocation(line: 98, column: 22, scope: !41, inlinedAt: !43)
1316
+ !47 = !DILocation(line: 101, column: 30, scope: !41, inlinedAt: !43)
1317
+ !48 = !DILocation(line: 101, column: 22, scope: !41, inlinedAt: !43)
1318
+ !49 = !DILocation(line: 50, column: 50, scope: !7)
1319
+ !50 = !DILocation(line: 108, column: 21, scope: !51, inlinedAt: !52)
1320
+ !51 = distinct !DILexicalBlockFile(scope: !41, file: !42, discriminator: 0)
1321
+ !52 = !DILocation(line: 120, column: 46, scope: !51, inlinedAt: !53)
1322
+ !53 = !DILocation(line: 53, column: 44, scope: !51)
1323
+ !54 = !DILocation(line: 109, column: 28, scope: !51, inlinedAt: !52)
1324
+ !55 = !DILocation(line: 110, column: 39, scope: !51, inlinedAt: !52)
1325
+ !56 = !DILocation(line: 110, column: 60, scope: !51, inlinedAt: !52)
1326
+ !57 = !DILocation(line: 110, column: 49, scope: !51, inlinedAt: !52)
1327
+ !58 = !DILocation(line: 112, column: 25, scope: !51, inlinedAt: !52)
1328
+ !59 = !DILocation(line: 112, column: 17, scope: !51, inlinedAt: !52)
1329
+ !60 = !DILocation(line: 113, column: 15, scope: !51, inlinedAt: !52)
1330
+ !61 = !DILocation(line: 113, column: 30, scope: !51, inlinedAt: !52)
1331
+ !62 = !DILocation(line: 113, column: 38, scope: !51, inlinedAt: !52)
1332
+ !63 = !DILocation(line: 113, column: 49, scope: !51, inlinedAt: !52)
1333
+ !64 = !DILocation(line: 113, column: 22, scope: !51, inlinedAt: !52)
1334
+ !65 = !DILocation(line: 120, column: 46, scope: !41, inlinedAt: !66)
1335
+ !66 = !DILocation(line: 53, column: 44, scope: !41)
1336
+ !67 = !DILocation(line: 75, column: 24, scope: !7)
1337
+ !68 = !DILocation(line: 77, column: 24, scope: !7)
1338
+ !69 = !DILocation(line: 58, column: 36, scope: !7)
1339
+ !70 = !DILocation(line: 59, column: 27, scope: !7)
1340
+ !71 = !DILocation(line: 62, column: 41, scope: !7)
1341
+ !72 = !DILocation(line: 62, column: 35, scope: !7)
1342
+ !73 = !DILocation(line: 62, column: 51, scope: !7)
1343
+ !74 = !DILocation(line: 63, column: 41, scope: !7)
1344
+ !75 = !DILocation(line: 63, column: 35, scope: !7)
1345
+ !76 = !DILocation(line: 63, column: 51, scope: !7)
1346
+ !77 = !DILocation(line: 63, column: 103, scope: !7)
1347
+ !78 = !DILocation(line: 64, column: 35, scope: !7)
1348
+ !79 = !DILocation(line: 64, column: 40, scope: !7)
1349
+ !80 = !DILocation(line: 68, column: 57, scope: !7)
1350
+ !81 = !DILocation(line: 69, column: 35, scope: !7)
1351
+ !82 = !DILocation(line: 69, column: 54, scope: !7)
1352
+ !83 = !DILocation(line: 70, column: 24, scope: !7)
1353
+ !84 = !DILocation(line: 72, column: 24, scope: !7)
1354
+ !85 = !DILocation(line: 73, column: 24, scope: !7)
1355
+ !86 = !DILocation(line: 78, column: 30, scope: !7)
1356
+ !87 = !DILocation(line: 79, column: 24, scope: !7)
1357
+ !88 = !DILocation(line: 80, column: 24, scope: !7)
1358
+ !89 = !DILocation(line: 82, column: 29, scope: !7)
1359
+ !90 = !DILocation(line: 82, column: 52, scope: !7)
1360
+ !91 = !DILocation(line: 58, column: 4, scope: !7)
.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ttir ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16>
4
+ %cst_0 = arith.constant 0.000000e+00 : f32
5
+ %cst_1 = arith.constant dense<1.000000e+00> : tensor<64x64xf32>
6
+ %c256_i32 = arith.constant 256 : i32
7
+ %c64_i32 = arith.constant 64 : i32
8
+ %c0_i32 = arith.constant 0 : i32
9
+ %cst_2 = arith.constant dense<256> : tensor<64x1xi64>
10
+ %cst_3 = arith.constant dense<0> : tensor<64x1xi64>
11
+ %cst_4 = arith.constant dense<50257> : tensor<64x1xi64>
12
+ %cst_5 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
13
+ %cst_6 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
14
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<1x64xf32>
15
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<64x64xf32>
16
+ %cst_9 = arith.constant dense<256> : tensor<64x1xi32>
17
+ %cst_10 = arith.constant dense<256> : tensor<1x64xi32>
18
+ %cst_11 = arith.constant dense<512> : tensor<64x1xi32>
19
+ %0 = tt.get_program_id x : i32
20
+ %1 = arith.muli %0, %c64_i32 : i32
21
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
22
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
23
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
24
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
25
+ %6 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32>
26
+ %7 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
27
+ %8 = tt.addptr %7, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
28
+ %9 = tt.load %8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
29
+ %10 = arith.remsi %5, %cst_11 : tensor<64x1xi32>
30
+ %11 = arith.muli %10, %cst_9 : tensor<64x1xi32>
31
+ %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x64xi32>
32
+ %13 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
33
+ %14 = arith.muli %5, %cst_9 : tensor<64x1xi32>
34
+ %15 = tt.broadcast %14 : (tensor<64x1xi32>) -> tensor<64x64xi32>
35
+ %16 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
36
+ %17 = arith.addi %9, %cst_4 : tensor<64x1xi64>
37
+ %18 = arith.cmpi slt, %9, %cst_3 : tensor<64x1xi64>
38
+ %19 = arith.select %18, %17, %9 : tensor<64x1xi1>, tensor<64x1xi64>
39
+ %20 = arith.cmpi sge, %19, %cst_3 : tensor<64x1xi64>
40
+ %21 = arith.cmpi slt, %19, %cst_4 : tensor<64x1xi64>
41
+ %22 = arith.andi %20, %21 : tensor<64x1xi1>
42
+ %23 = arith.muli %19, %cst_2 : tensor<64x1xi64>
43
+ %24 = tt.broadcast %23 : (tensor<64x1xi64>) -> tensor<64x64xi64>
44
+ %25 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
45
+ %26:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) : i32 {
46
+ %50 = tt.splat %arg8 : (i32) -> tensor<1x64xi32>
47
+ %51 = arith.addi %50, %6 : tensor<1x64xi32>
48
+ %52 = arith.cmpi slt, %51, %cst_10 : tensor<1x64xi32>
49
+ %53 = tt.broadcast %51 : (tensor<1x64xi32>) -> tensor<64x64xi32>
50
+ %54 = arith.addi %53, %12 : tensor<64x64xi32>
51
+ %55 = tt.addptr %13, %54 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi32>
52
+ %56 = tt.broadcast %52 : (tensor<1x64xi1>) -> tensor<64x64xi1>
53
+ %57 = tt.load %55, %56, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
54
+ %58 = arith.addi %53, %15 : tensor<64x64xi32>
55
+ %59 = tt.addptr %16, %58 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi32>
56
+ %60 = tt.load %59, %56, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xbf16>
57
+ %61 = arith.extf %60 : tensor<64x64xbf16> to tensor<64x64xf32>
58
+ tt.assert %22, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
59
+ %62 = arith.extsi %51 : tensor<1x64xi32> to tensor<1x64xi64>
60
+ %63 = tt.broadcast %62 : (tensor<1x64xi64>) -> tensor<64x64xi64>
61
+ %64 = arith.addi %63, %24 : tensor<64x64xi64>
62
+ %65 = tt.addptr %25, %64 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi64>
63
+ %66 = tt.load %65, %56, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
64
+ %67 = arith.addf %66, %57 : tensor<64x64xf32>
65
+ %68 = arith.addf %67, %61 : tensor<64x64xf32>
66
+ %69 = arith.subf %68, %arg9 : tensor<64x64xf32>
67
+ %70 = arith.addf %arg11, %cst_1 : tensor<64x64xf32>
68
+ %71 = arith.divf %69, %70 : tensor<64x64xf32>
69
+ %72 = arith.addf %arg9, %71 : tensor<64x64xf32>
70
+ %73 = arith.subf %68, %72 : tensor<64x64xf32>
71
+ %74 = arith.mulf %69, %73 : tensor<64x64xf32>
72
+ %75 = arith.addf %arg10, %74 : tensor<64x64xf32>
73
+ %76 = arith.select %56, %72, %arg9 : tensor<64x64xi1>, tensor<64x64xf32>
74
+ %77 = arith.select %56, %75, %arg10 : tensor<64x64xi1>, tensor<64x64xf32>
75
+ %78 = arith.select %56, %70, %arg11 : tensor<64x64xi1>, tensor<64x64xf32>
76
+ scf.yield %76, %77, %78 : tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>
77
+ }
78
+ %27:3 = "tt.reduce"(%26#0, %26#1, %26#2) <{axis = 1 : i32}> ({
79
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
80
+ %50 = arith.subf %arg11, %arg8 : f32
81
+ %51 = arith.addf %arg10, %arg13 : f32
82
+ %52 = arith.cmpf oeq, %51, %cst_0 : f32
83
+ %53 = arith.divf %arg13, %51 : f32
84
+ %54 = arith.select %52, %cst_0, %53 : f32
85
+ %55 = arith.mulf %50, %54 : f32
86
+ %56 = arith.addf %arg8, %55 : f32
87
+ %57 = arith.addf %arg9, %arg12 : f32
88
+ %58 = arith.mulf %50, %50 : f32
89
+ %59 = arith.mulf %58, %arg10 : f32
90
+ %60 = arith.mulf %59, %54 : f32
91
+ %61 = arith.addf %57, %60 : f32
92
+ tt.reduce.return %56, %61, %51 : f32, f32, f32
93
+ }) : (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
94
+ %28 = tt.expand_dims %27#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
95
+ %29 = tt.expand_dims %27#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
96
+ %30 = arith.muli %10, %cst_9 : tensor<64x1xi32>
97
+ %31 = tt.broadcast %30 : (tensor<64x1xi32>) -> tensor<64x64xi32>
98
+ %32 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
99
+ %33 = arith.muli %5, %cst_9 : tensor<64x1xi32>
100
+ %34 = tt.broadcast %33 : (tensor<64x1xi32>) -> tensor<64x64xi32>
101
+ %35 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
102
+ %36 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x64x!tt.ptr<f32, 1>>
103
+ %37 = arith.addi %9, %cst_4 : tensor<64x1xi64>
104
+ %38 = arith.cmpi slt, %9, %cst_3 : tensor<64x1xi64>
105
+ %39 = arith.select %38, %37, %9 : tensor<64x1xi1>, tensor<64x1xi64>
106
+ %40 = arith.cmpi sge, %39, %cst_3 : tensor<64x1xi64>
107
+ %41 = arith.cmpi slt, %39, %cst_4 : tensor<64x1xi64>
108
+ %42 = arith.andi %40, %41 : tensor<64x1xi1>
109
+ %43 = arith.muli %39, %cst_2 : tensor<64x1xi64>
110
+ %44 = tt.broadcast %43 : (tensor<64x1xi64>) -> tensor<64x64xi64>
111
+ %45 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
112
+ %46 = tt.broadcast %28 : (tensor<64x1xf32>) -> tensor<64x64xf32>
113
+ %47 = arith.divf %29, %cst_6 : tensor<64x1xf32>
114
+ %48 = arith.addf %47, %cst_5 : tensor<64x1xf32>
115
+ %49 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
116
+ scf.for %arg8 = %c0_i32 to %c256_i32 step %c64_i32 : i32 {
117
+ %50 = tt.splat %arg8 : (i32) -> tensor<1x64xi32>
118
+ %51 = arith.addi %50, %6 : tensor<1x64xi32>
119
+ %52 = arith.cmpi slt, %51, %cst_10 : tensor<1x64xi32>
120
+ %53 = tt.broadcast %51 : (tensor<1x64xi32>) -> tensor<64x64xi32>
121
+ %54 = arith.addi %53, %31 : tensor<64x64xi32>
122
+ %55 = tt.addptr %32, %54 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi32>
123
+ %56 = tt.broadcast %52 : (tensor<1x64xi1>) -> tensor<64x64xi1>
124
+ %57 = tt.load %55, %56, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
125
+ %58 = arith.addi %53, %34 : tensor<64x64xi32>
126
+ %59 = tt.addptr %35, %58 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi32>
127
+ %60 = tt.load %59, %56, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xbf16>
128
+ %61 = arith.extf %60 : tensor<64x64xbf16> to tensor<64x64xf32>
129
+ %62 = tt.addptr %36, %51 : tensor<1x64x!tt.ptr<f32, 1>>, tensor<1x64xi32>
130
+ %63 = tt.load %62, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32>
131
+ tt.assert %42, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
132
+ %64 = arith.extsi %51 : tensor<1x64xi32> to tensor<1x64xi64>
133
+ %65 = tt.broadcast %64 : (tensor<1x64xi64>) -> tensor<64x64xi64>
134
+ %66 = arith.addi %65, %44 : tensor<64x64xi64>
135
+ %67 = tt.addptr %45, %66 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi64>
136
+ %68 = tt.load %67, %56, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32>
137
+ %69 = arith.addf %68, %57 : tensor<64x64xf32>
138
+ %70 = arith.addf %69, %61 : tensor<64x64xf32>
139
+ %71 = arith.subf %70, %46 : tensor<64x64xf32>
140
+ %72 = tt.extern_elementwise %48 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
141
+ %73 = tt.broadcast %72 : (tensor<64x1xf32>) -> tensor<64x64xf32>
142
+ %74 = arith.mulf %71, %73 : tensor<64x64xf32>
143
+ %75 = tt.broadcast %63 : (tensor<1x64xf32>) -> tensor<64x64xf32>
144
+ %76 = arith.mulf %74, %75 : tensor<64x64xf32>
145
+ %77 = tt.addptr %49, %58 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi32>
146
+ %78 = arith.truncf %76 : tensor<64x64xf32> to tensor<64x64xbf16>
147
+ tt.store %77, %78, %56 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16>
148
+ }
149
+ tt.return
150
+ }
151
+ }
.triton/dump/550b88a9db74a71f80def697002389b5/triton_.cubin ADDED
Binary file (13.7 kB). View file
 
.triton/dump/550b88a9db74a71f80def697002389b5/triton_.llir ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
7
+ %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %7 = and i32 %6, 31, !dbg !8
9
+ %8 = lshr i32 %6, 5, !dbg !8
10
+ %9 = shl i32 %6, 2, !dbg !8
11
+ %10 = and i32 %9, 12, !dbg !8
12
+ %11 = and i32 %6, 15, !dbg !8
13
+ %12 = and i32 %8, 7, !dbg !9
14
+ %13 = lshr i32 %7, 2, !dbg !9
15
+ %14 = shl nuw nsw i32 %12, 3, !dbg !9
16
+ %15 = or i32 %14, %13, !dbg !9
17
+ %16 = or i32 %15, 64, !dbg !9
18
+ %17 = or i32 %10, 1, !dbg !10
19
+ %18 = or i32 %10, 2, !dbg !10
20
+ %19 = or i32 %10, 3, !dbg !10
21
+ %20 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !14
22
+ %21 = shl i32 %20, 4, !dbg !15
23
+ %22 = or i32 %21, %10, !dbg !16
24
+ %23 = or i32 %21, %11, !dbg !16
25
+ %24 = icmp ult i32 %16, 120, !dbg !17
26
+ %25 = shl nuw nsw i32 %15, 17, !dbg !18
27
+ %26 = shl nuw nsw i32 %16, 17, !dbg !18
28
+ %27 = add i32 %22, %25, !dbg !19
29
+ %28 = add i32 %22, %26, !dbg !19
30
+ %29 = sext i32 %27 to i64, !dbg !20
31
+ %30 = getelementptr float, ptr addrspace(1) %0, i64 %29, !dbg !20
32
+ %31 = sext i32 %28 to i64, !dbg !20
33
+ %32 = getelementptr float, ptr addrspace(1) %0, i64 %31, !dbg !20
34
+ %33 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %30, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
35
+ %34 = extractvalue { i32, i32, i32, i32 } %33, 0, !dbg !21
36
+ %35 = extractvalue { i32, i32, i32, i32 } %33, 1, !dbg !21
37
+ %36 = extractvalue { i32, i32, i32, i32 } %33, 2, !dbg !21
38
+ %37 = extractvalue { i32, i32, i32, i32 } %33, 3, !dbg !21
39
+ %38 = bitcast i32 %34 to float, !dbg !21
40
+ %39 = bitcast i32 %35 to float, !dbg !21
41
+ %40 = bitcast i32 %36 to float, !dbg !21
42
+ %41 = bitcast i32 %37 to float, !dbg !21
43
+ %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %32, i1 %24, i32 0, i1 %24, i32 0, i1 %24, i32 0, i1 %24, i32 0, i1 %24) #3, !dbg !21
44
+ %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !21
45
+ %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !21
46
+ %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !21
47
+ %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !21
48
+ %47 = bitcast i32 %43 to float, !dbg !21
49
+ %48 = bitcast i32 %44 to float, !dbg !21
50
+ %49 = bitcast i32 %45 to float, !dbg !21
51
+ %50 = bitcast i32 %46 to float, !dbg !21
52
+ %51 = fadd float %38, 0.000000e+00, !dbg !22
53
+ %52 = fadd float %39, 0.000000e+00, !dbg !22
54
+ %53 = fadd float %40, 0.000000e+00, !dbg !22
55
+ %54 = fadd float %41, 0.000000e+00, !dbg !22
56
+ %55 = fadd float %47, 0.000000e+00, !dbg !22
57
+ %56 = fadd float %48, 0.000000e+00, !dbg !22
58
+ %57 = fadd float %49, 0.000000e+00, !dbg !22
59
+ %58 = fadd float %50, 0.000000e+00, !dbg !22
60
+ %59 = select i1 %24, float %55, float 0.000000e+00, !dbg !23
61
+ %60 = select i1 %24, float %56, float 0.000000e+00, !dbg !23
62
+ %61 = select i1 %24, float %57, float 0.000000e+00, !dbg !23
63
+ %62 = select i1 %24, float %58, float 0.000000e+00, !dbg !23
64
+ %63 = fadd float %51, %59, !dbg !24
65
+ %64 = fadd float %52, %60, !dbg !24
66
+ %65 = fadd float %53, %61, !dbg !24
67
+ %66 = fadd float %54, %62, !dbg !24
68
+ %67 = bitcast float %63 to i32, !dbg !10
69
+ %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 16, i32 31), !dbg !10
70
+ %69 = bitcast i32 %68 to float, !dbg !10
71
+ %70 = fadd float %63, %69, !dbg !24
72
+ %71 = bitcast float %70 to i32, !dbg !10
73
+ %72 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 8, i32 31), !dbg !10
74
+ %73 = bitcast i32 %72 to float, !dbg !10
75
+ %74 = fadd float %70, %73, !dbg !24
76
+ %75 = bitcast float %74 to i32, !dbg !10
77
+ %76 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %75, i32 4, i32 31), !dbg !10
78
+ %77 = bitcast i32 %76 to float, !dbg !10
79
+ %78 = fadd float %74, %77, !dbg !24
80
+ %79 = bitcast float %64 to i32, !dbg !10
81
+ %80 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %79, i32 16, i32 31), !dbg !10
82
+ %81 = bitcast i32 %80 to float, !dbg !10
83
+ %82 = fadd float %64, %81, !dbg !24
84
+ %83 = bitcast float %82 to i32, !dbg !10
85
+ %84 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %83, i32 8, i32 31), !dbg !10
86
+ %85 = bitcast i32 %84 to float, !dbg !10
87
+ %86 = fadd float %82, %85, !dbg !24
88
+ %87 = bitcast float %86 to i32, !dbg !10
89
+ %88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 4, i32 31), !dbg !10
90
+ %89 = bitcast i32 %88 to float, !dbg !10
91
+ %90 = fadd float %86, %89, !dbg !24
92
+ %91 = bitcast float %65 to i32, !dbg !10
93
+ %92 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %91, i32 16, i32 31), !dbg !10
94
+ %93 = bitcast i32 %92 to float, !dbg !10
95
+ %94 = fadd float %65, %93, !dbg !24
96
+ %95 = bitcast float %94 to i32, !dbg !10
97
+ %96 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %95, i32 8, i32 31), !dbg !10
98
+ %97 = bitcast i32 %96 to float, !dbg !10
99
+ %98 = fadd float %94, %97, !dbg !24
100
+ %99 = bitcast float %98 to i32, !dbg !10
101
+ %100 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %99, i32 4, i32 31), !dbg !10
102
+ %101 = bitcast i32 %100 to float, !dbg !10
103
+ %102 = fadd float %98, %101, !dbg !24
104
+ %103 = bitcast float %66 to i32, !dbg !10
105
+ %104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %103, i32 16, i32 31), !dbg !10
106
+ %105 = bitcast i32 %104 to float, !dbg !10
107
+ %106 = fadd float %66, %105, !dbg !24
108
+ %107 = bitcast float %106 to i32, !dbg !10
109
+ %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 8, i32 31), !dbg !10
110
+ %109 = bitcast i32 %108 to float, !dbg !10
111
+ %110 = fadd float %106, %109, !dbg !24
112
+ %111 = bitcast float %110 to i32, !dbg !10
113
+ %112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %111, i32 4, i32 31), !dbg !10
114
+ %113 = bitcast i32 %112 to float, !dbg !10
115
+ %114 = fadd float %110, %113, !dbg !24
116
+ %115 = icmp ult i32 %7, 4, !dbg !10
117
+ %116 = shl nuw nsw i32 %10, 3, !dbg !10
118
+ %117 = or i32 %116, %12, !dbg !10
119
+ %118 = zext nneg i32 %117 to i64, !dbg !10
120
+ %119 = getelementptr float, ptr addrspace(3) @global_smem, i64 %118, !dbg !10
121
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %119, float %78, i1 %115) #3, !dbg !10
122
+ %120 = shl nuw nsw i32 %17, 3, !dbg !10
123
+ %121 = or i32 %120, %12, !dbg !10
124
+ %122 = zext nneg i32 %121 to i64, !dbg !10
125
+ %123 = getelementptr float, ptr addrspace(3) @global_smem, i64 %122, !dbg !10
126
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %123, float %90, i1 %115) #3, !dbg !10
127
+ %124 = shl nuw nsw i32 %18, 3, !dbg !10
128
+ %125 = or i32 %124, %12, !dbg !10
129
+ %126 = zext nneg i32 %125 to i64, !dbg !10
130
+ %127 = getelementptr float, ptr addrspace(3) @global_smem, i64 %126, !dbg !10
131
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %127, float %102, i1 %115) #3, !dbg !10
132
+ %128 = shl nuw nsw i32 %19, 3, !dbg !10
133
+ %129 = or i32 %128, %12, !dbg !10
134
+ %130 = zext nneg i32 %129 to i64, !dbg !10
135
+ %131 = getelementptr float, ptr addrspace(3) @global_smem, i64 %130, !dbg !10
136
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %131, float %114, i1 %115) #3, !dbg !10
137
+ tail call void @llvm.nvvm.barrier0(), !dbg !10
138
+ %132 = icmp slt i32 %6, 128, !dbg !10
139
+ %133 = sext i32 %6 to i64, !dbg !10
140
+ %134 = getelementptr float, ptr addrspace(3) @global_smem, i64 %133, !dbg !10
141
+ %135 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %134, i1 %132) #3, !dbg !10
142
+ %136 = bitcast float %135 to i32, !dbg !10
143
+ %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 4, i32 31), !dbg !10
144
+ %138 = bitcast i32 %137 to float, !dbg !10
145
+ %139 = fadd float %135, %138, !dbg !24
146
+ %140 = bitcast float %139 to i32, !dbg !10
147
+ %141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 2, i32 31), !dbg !10
148
+ %142 = bitcast i32 %141 to float, !dbg !10
149
+ %143 = fadd float %139, %142, !dbg !24
150
+ %144 = bitcast float %143 to i32, !dbg !10
151
+ %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 1, i32 31), !dbg !10
152
+ %146 = bitcast i32 %145 to float, !dbg !10
153
+ %147 = fadd float %143, %146, !dbg !24
154
+ %148 = and i32 %6, 7, !dbg !10
155
+ %149 = icmp eq i32 %148, 0, !dbg !10
156
+ %150 = and i1 %132, %149, !dbg !10
157
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %134, float %147, i1 %150) #3, !dbg !10
158
+ tail call void @llvm.nvvm.barrier0(), !dbg !10
159
+ %151 = zext nneg i32 %116 to i64, !dbg !10
160
+ %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !10
161
+ %153 = load float, ptr addrspace(3) %152, align 4, !dbg !10
162
+ %154 = zext nneg i32 %120 to i64, !dbg !10
163
+ %155 = getelementptr float, ptr addrspace(3) @global_smem, i64 %154, !dbg !10
164
+ %156 = load float, ptr addrspace(3) %155, align 4, !dbg !10
165
+ %157 = zext nneg i32 %124 to i64, !dbg !10
166
+ %158 = getelementptr float, ptr addrspace(3) @global_smem, i64 %157, !dbg !10
167
+ %159 = load float, ptr addrspace(3) %158, align 4, !dbg !10
168
+ %160 = zext nneg i32 %128 to i64, !dbg !10
169
+ %161 = getelementptr float, ptr addrspace(3) @global_smem, i64 %160, !dbg !10
170
+ %162 = load float, ptr addrspace(3) %161, align 4, !dbg !10
171
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
172
+ %163 = zext nneg i32 %10 to i64, !dbg !28
173
+ %164 = getelementptr float, ptr addrspace(3) @global_smem, i64 %163, !dbg !28
174
+ %165 = insertelement <1 x float> undef, float %153, i64 0, !dbg !28
175
+ store <1 x float> %165, ptr addrspace(3) %164, align 4, !dbg !28
176
+ %166 = zext nneg i32 %17 to i64, !dbg !28
177
+ %167 = getelementptr float, ptr addrspace(3) @global_smem, i64 %166, !dbg !28
178
+ %168 = insertelement <1 x float> undef, float %156, i64 0, !dbg !28
179
+ store <1 x float> %168, ptr addrspace(3) %167, align 4, !dbg !28
180
+ %169 = zext nneg i32 %18 to i64, !dbg !28
181
+ %170 = getelementptr float, ptr addrspace(3) @global_smem, i64 %169, !dbg !28
182
+ %171 = insertelement <1 x float> undef, float %159, i64 0, !dbg !28
183
+ store <1 x float> %171, ptr addrspace(3) %170, align 4, !dbg !28
184
+ %172 = zext nneg i32 %19 to i64, !dbg !28
185
+ %173 = getelementptr float, ptr addrspace(3) @global_smem, i64 %172, !dbg !28
186
+ %174 = insertelement <1 x float> undef, float %162, i64 0, !dbg !28
187
+ store <1 x float> %174, ptr addrspace(3) %173, align 4, !dbg !28
188
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
189
+ %175 = zext nneg i32 %11 to i64, !dbg !28
190
+ %176 = getelementptr float, ptr addrspace(3) @global_smem, i64 %175, !dbg !28
191
+ %177 = load <1 x float>, ptr addrspace(3) %176, align 4, !dbg !28
192
+ %.frozen = freeze i32 %23
193
+ %178 = sdiv i32 %.frozen, 256, !dbg !29
194
+ %179 = mul i32 %178, 256
195
+ %.decomposed = sub i32 %.frozen, %179
196
+ %180 = sext i32 %178 to i64, !dbg !30
197
+ %181 = getelementptr i64, ptr addrspace(1) %1, i64 %180, !dbg !30
198
+ %182 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %181, i1 true) #3, !dbg !31
199
+ %183 = lshr i64 %182, 54, !dbg !32
200
+ %184 = and i64 %183, 512, !dbg !32
201
+ %185 = add i64 %184, %182, !dbg !32
202
+ %186 = shl i64 %185, 8, !dbg !33
203
+ %187 = sext i32 %.decomposed to i64, !dbg !34
204
+ %188 = getelementptr float, ptr addrspace(1) %2, i64 %186, !dbg !35
205
+ %189 = getelementptr float, ptr addrspace(1) %188, i64 %187, !dbg !35
206
+ %190 = lshr i32 %7, 4, !dbg !36
207
+ %191 = shl nuw nsw i32 %12, 1, !dbg !36
208
+ %192 = or i32 %191, %190, !dbg !36
209
+ %193 = icmp eq i32 %192, 0, !dbg !36
210
+ %194 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %189, <1 x float> %177, i1 %193) #3, !dbg !36
211
+ ret void, !dbg !37
212
+ }
213
+
214
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
215
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
216
+
217
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
218
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
219
+
220
+ ; Function Attrs: convergent nocallback nounwind
221
+ declare void @llvm.nvvm.barrier0() #2
222
+
223
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
224
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
225
+ attributes #2 = { convergent nocallback nounwind }
226
+ attributes #3 = { nounwind }
227
+
228
+ !llvm.module.flags = !{!0}
229
+ !llvm.dbg.cu = !{!1}
230
+ !nvvm.annotations = !{!3, !4, !4, !3}
231
+
232
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
233
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
234
+ !2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
235
+ !3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
236
+ !4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256}
237
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
238
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
239
+ !7 = !{}
240
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
241
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
242
+ !10 = !DILocation(line: 243, column: 36, scope: !11, inlinedAt: !13)
243
+ !11 = distinct !DILexicalBlockFile(scope: !5, file: !12, discriminator: 0)
244
+ !12 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
245
+ !13 = !DILocation(line: 35, column: 25, scope: !11)
246
+ !14 = !DILocation(line: 21, column: 28, scope: !5)
247
+ !15 = !DILocation(line: 21, column: 33, scope: !5)
248
+ !16 = !DILocation(line: 22, column: 23, scope: !5)
249
+ !17 = !DILocation(line: 29, column: 25, scope: !5)
250
+ !18 = !DILocation(line: 31, column: 47, scope: !5)
251
+ !19 = !DILocation(line: 31, column: 40, scope: !5)
252
+ !20 = !DILocation(line: 31, column: 34, scope: !5)
253
+ !21 = !DILocation(line: 31, column: 53, scope: !5)
254
+ !22 = !DILocation(line: 33, column: 23, scope: !5)
255
+ !23 = !DILocation(line: 34, column: 38, scope: !5)
256
+ !24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
257
+ !25 = distinct !DILexicalBlockFile(scope: !11, file: !12, discriminator: 0)
258
+ !26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
259
+ !27 = !DILocation(line: 35, column: 25, scope: !25)
260
+ !28 = !DILocation(line: 35, column: 28, scope: !5)
261
+ !29 = !DILocation(line: 36, column: 20, scope: !5)
262
+ !30 = !DILocation(line: 38, column: 30, scope: !5)
263
+ !31 = !DILocation(line: 38, column: 35, scope: !5)
264
+ !32 = !DILocation(line: 41, column: 32, scope: !5)
265
+ !33 = !DILocation(line: 45, column: 40, scope: !5)
266
+ !34 = !DILocation(line: 45, column: 36, scope: !5)
267
+ !35 = !DILocation(line: 45, column: 30, scope: !5)
268
+ !36 = !DILocation(line: 45, column: 55, scope: !5)
269
+ !37 = !DILocation(line: 45, column: 4, scope: !5)
.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ptx ADDED
@@ -0,0 +1,642 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3de4e
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3de4e(
13
+ .param .u64 triton__0d1d2d3de4e_param_0,
14
+ .param .u64 triton__0d1d2d3de4e_param_1,
15
+ .param .u64 triton__0d1d2d3de4e_param_2,
16
+ .param .u32 triton__0d1d2d3de4e_param_3,
17
+ .param .u32 triton__0d1d2d3de4e_param_4
18
+ )
19
+ .maxntid 256, 1, 1
20
+ {
21
+ .reg .pred %p<20>;
22
+ .reg .b32 %r<107>;
23
+ .reg .f32 %f<60>;
24
+ .reg .b64 %rd<18>;
25
+ .loc 1 18 0
26
+ $L__func_begin0:
27
+ .loc 1 18 0
28
+
29
+ ld.param.u64 %rd6, [triton__0d1d2d3de4e_param_0];
30
+ ld.param.u64 %rd7, [triton__0d1d2d3de4e_param_1];
31
+ $L__tmp0:
32
+ .loc 1 22 44
33
+ mov.u32 %r32, %tid.x;
34
+ and.b32 %r33, %r32, 31;
35
+ ld.param.u64 %rd8, [triton__0d1d2d3de4e_param_2];
36
+ shl.b32 %r34, %r32, 2;
37
+ and.b32 %r35, %r34, 12;
38
+ and.b32 %r36, %r32, 15;
39
+ .loc 1 24 33
40
+ bfe.u32 %r37, %r32, 5, 3;
41
+ bfe.u32 %r38, %r32, 2, 3;
42
+ shl.b32 %r39, %r37, 3;
43
+ or.b32 %r40, %r39, %r38;
44
+ or.b32 %r41, %r40, 64;
45
+ .loc 1 21 28
46
+ mov.u32 %r1, %ctaid.x;
47
+ .loc 1 21 33
48
+ shl.b32 %r42, %r1, 4;
49
+ .loc 1 22 23
50
+ or.b32 %r43, %r42, %r35;
51
+ or.b32 %r44, %r42, %r36;
52
+ .loc 1 29 25
53
+ setp.lt.u32 %p6, %r41, 120;
54
+ .loc 1 31 47
55
+ shl.b32 %r45, %r40, 17;
56
+ shl.b32 %r46, %r41, 17;
57
+ .loc 1 31 40
58
+ add.s32 %r47, %r43, %r45;
59
+ add.s32 %r48, %r43, %r46;
60
+ .loc 1 31 34
61
+ mul.wide.s32 %rd9, %r47, 4;
62
+ add.s64 %rd1, %rd6, %rd9;
63
+ mul.wide.s32 %rd10, %r48, 4;
64
+ add.s64 %rd2, %rd6, %rd10;
65
+ mov.b32 %r6, 0;
66
+ mov.pred %p1, -1;
67
+ .loc 1 31 53
68
+ mov.u32 %r2, 0x0;
69
+ mov.u32 %r3, 0x0;
70
+ mov.u32 %r4, 0x0;
71
+ mov.u32 %r5, 0x0;
72
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
73
+ @!%p1 mov.u32 %r2, %r6;
74
+ @!%p1 mov.u32 %r3, %r6;
75
+ @!%p1 mov.u32 %r4, %r6;
76
+ @!%p1 mov.u32 %r5, %r6;
77
+ mov.b32 %f1, %r2;
78
+ mov.b32 %f2, %r3;
79
+ mov.b32 %f3, %r4;
80
+ mov.b32 %f4, %r5;
81
+ mov.u32 %r10, 0x0;
82
+ mov.u32 %r11, 0x0;
83
+ mov.u32 %r12, 0x0;
84
+ mov.u32 %r13, 0x0;
85
+ @%p6 ld.global.L1::evict_first.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
86
+ @!%p6 mov.u32 %r10, %r6;
87
+ @!%p6 mov.u32 %r11, %r6;
88
+ @!%p6 mov.u32 %r12, %r6;
89
+ @!%p6 mov.u32 %r13, %r6;
90
+ mov.b32 %f5, %r10;
91
+ mov.b32 %f6, %r11;
92
+ mov.b32 %f7, %r12;
93
+ mov.b32 %f8, %r13;
94
+ .loc 1 33 23
95
+ add.f32 %f9, %f1, 0f00000000;
96
+ add.f32 %f10, %f2, 0f00000000;
97
+ add.f32 %f11, %f3, 0f00000000;
98
+ add.f32 %f12, %f4, 0f00000000;
99
+ add.f32 %f13, %f5, 0f00000000;
100
+ add.f32 %f14, %f6, 0f00000000;
101
+ add.f32 %f15, %f7, 0f00000000;
102
+ add.f32 %f16, %f8, 0f00000000;
103
+ .loc 1 34 38
104
+ selp.f32 %f17, %f13, 0f00000000, %p6;
105
+ selp.f32 %f18, %f14, 0f00000000, %p6;
106
+ selp.f32 %f19, %f15, 0f00000000, %p6;
107
+ selp.f32 %f20, %f16, 0f00000000, %p6;
108
+ $L__tmp1:
109
+ .loc 2 233 15
110
+ add.f32 %f21, %f9, %f17;
111
+ add.f32 %f22, %f10, %f18;
112
+ add.f32 %f23, %f11, %f19;
113
+ add.f32 %f24, %f12, %f20;
114
+ $L__tmp2:
115
+ .loc 2 243 36
116
+ mov.b32 %r49, %f21;
117
+ shfl.sync.bfly.b32 %r50, %r49, 16, 31, -1;
118
+ mov.b32 %f25, %r50;
119
+ $L__tmp3:
120
+ .loc 2 233 15
121
+ add.f32 %f26, %f21, %f25;
122
+ $L__tmp4:
123
+ .loc 2 243 36
124
+ mov.b32 %r51, %f26;
125
+ shfl.sync.bfly.b32 %r52, %r51, 8, 31, -1;
126
+ mov.b32 %f27, %r52;
127
+ $L__tmp5:
128
+ .loc 2 233 15
129
+ add.f32 %f28, %f26, %f27;
130
+ $L__tmp6:
131
+ .loc 2 243 36
132
+ mov.b32 %r53, %f28;
133
+ shfl.sync.bfly.b32 %r54, %r53, 4, 31, -1;
134
+ mov.b32 %f29, %r54;
135
+ $L__tmp7:
136
+ .loc 2 233 15
137
+ add.f32 %f30, %f28, %f29;
138
+ $L__tmp8:
139
+ .loc 2 243 36
140
+ mov.b32 %r55, %f22;
141
+ shfl.sync.bfly.b32 %r56, %r55, 16, 31, -1;
142
+ mov.b32 %f31, %r56;
143
+ $L__tmp9:
144
+ .loc 2 233 15
145
+ add.f32 %f32, %f22, %f31;
146
+ $L__tmp10:
147
+ .loc 2 243 36
148
+ mov.b32 %r57, %f32;
149
+ shfl.sync.bfly.b32 %r58, %r57, 8, 31, -1;
150
+ mov.b32 %f33, %r58;
151
+ $L__tmp11:
152
+ .loc 2 233 15
153
+ add.f32 %f34, %f32, %f33;
154
+ $L__tmp12:
155
+ .loc 2 243 36
156
+ mov.b32 %r59, %f34;
157
+ shfl.sync.bfly.b32 %r60, %r59, 4, 31, -1;
158
+ mov.b32 %f35, %r60;
159
+ $L__tmp13:
160
+ .loc 2 233 15
161
+ add.f32 %f36, %f34, %f35;
162
+ $L__tmp14:
163
+ .loc 2 243 36
164
+ mov.b32 %r61, %f23;
165
+ shfl.sync.bfly.b32 %r62, %r61, 16, 31, -1;
166
+ mov.b32 %f37, %r62;
167
+ $L__tmp15:
168
+ .loc 2 233 15
169
+ add.f32 %f38, %f23, %f37;
170
+ $L__tmp16:
171
+ .loc 2 243 36
172
+ mov.b32 %r63, %f38;
173
+ shfl.sync.bfly.b32 %r64, %r63, 8, 31, -1;
174
+ mov.b32 %f39, %r64;
175
+ $L__tmp17:
176
+ .loc 2 233 15
177
+ add.f32 %f40, %f38, %f39;
178
+ $L__tmp18:
179
+ .loc 2 243 36
180
+ mov.b32 %r65, %f40;
181
+ shfl.sync.bfly.b32 %r66, %r65, 4, 31, -1;
182
+ mov.b32 %f41, %r66;
183
+ $L__tmp19:
184
+ .loc 2 233 15
185
+ add.f32 %f42, %f40, %f41;
186
+ $L__tmp20:
187
+ .loc 2 243 36
188
+ mov.b32 %r67, %f24;
189
+ shfl.sync.bfly.b32 %r68, %r67, 16, 31, -1;
190
+ mov.b32 %f43, %r68;
191
+ $L__tmp21:
192
+ .loc 2 233 15
193
+ add.f32 %f44, %f24, %f43;
194
+ $L__tmp22:
195
+ .loc 2 243 36
196
+ mov.b32 %r69, %f44;
197
+ shfl.sync.bfly.b32 %r70, %r69, 8, 31, -1;
198
+ mov.b32 %f45, %r70;
199
+ $L__tmp23:
200
+ .loc 2 233 15
201
+ add.f32 %f46, %f44, %f45;
202
+ $L__tmp24:
203
+ .loc 2 243 36
204
+ mov.b32 %r71, %f46;
205
+ shfl.sync.bfly.b32 %r72, %r71, 4, 31, -1;
206
+ mov.b32 %f47, %r72;
207
+ $L__tmp25:
208
+ .loc 2 233 15
209
+ add.f32 %f48, %f46, %f47;
210
+ $L__tmp26:
211
+ .loc 2 243 36
212
+ setp.lt.u32 %p11, %r33, 4;
213
+ shl.b32 %r73, %r37, 2;
214
+ shl.b32 %r74, %r35, 5;
215
+ or.b32 %r75, %r74, %r73;
216
+ mov.u32 %r76, global_smem;
217
+ add.s32 %r18, %r76, %r75;
218
+ mov.b32 %r19, %f30;
219
+ @%p11 st.shared.b32 [ %r18 + 0 ], %r19;
220
+ or.b32 %r77, %r74, 32;
221
+ or.b32 %r78, %r77, %r73;
222
+ add.s32 %r20, %r76, %r78;
223
+ mov.b32 %r21, %f36;
224
+ @%p11 st.shared.b32 [ %r20 + 0 ], %r21;
225
+ or.b32 %r79, %r74, 64;
226
+ or.b32 %r80, %r79, %r73;
227
+ add.s32 %r22, %r76, %r80;
228
+ mov.b32 %r23, %f42;
229
+ @%p11 st.shared.b32 [ %r22 + 0 ], %r23;
230
+ or.b32 %r81, %r74, 96;
231
+ or.b32 %r82, %r81, %r73;
232
+ add.s32 %r24, %r76, %r82;
233
+ mov.b32 %r25, %f48;
234
+ @%p11 st.shared.b32 [ %r24 + 0 ], %r25;
235
+ bar.sync 0;
236
+ setp.lt.s32 %p15, %r32, 128;
237
+ add.s32 %r27, %r76, %r34;
238
+ @%p15 ld.shared.b32 %r26, [ %r27 + 0 ];
239
+ mov.b32 %f49, %r26;
240
+ shfl.sync.bfly.b32 %r83, %r26, 4, 31, -1;
241
+ mov.b32 %f50, %r83;
242
+ $L__tmp27:
243
+ .loc 2 233 15
244
+ add.f32 %f51, %f49, %f50;
245
+ $L__tmp28:
246
+ .loc 2 243 36
247
+ mov.b32 %r84, %f51;
248
+ shfl.sync.bfly.b32 %r85, %r84, 2, 31, -1;
249
+ mov.b32 %f52, %r85;
250
+ $L__tmp29:
251
+ .loc 2 233 15
252
+ add.f32 %f53, %f51, %f52;
253
+ $L__tmp30:
254
+ .loc 2 243 36
255
+ mov.b32 %r86, %f53;
256
+ shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1;
257
+ mov.b32 %f54, %r87;
258
+ $L__tmp31:
259
+ .loc 2 233 15
260
+ add.f32 %f55, %f53, %f54;
261
+ $L__tmp32:
262
+ .loc 2 243 36
263
+ and.b32 %r88, %r32, 7;
264
+ setp.eq.s32 %p19, %r88, 0;
265
+ and.pred %p16, %p15, %p19;
266
+ mov.b32 %r29, %f55;
267
+ @%p16 st.shared.b32 [ %r27 + 0 ], %r29;
268
+ bar.sync 0;
269
+ add.s32 %r89, %r76, %r74;
270
+ ld.shared.f32 %f56, [%r89];
271
+ add.s32 %r90, %r76, %r77;
272
+ ld.shared.f32 %f57, [%r90];
273
+ add.s32 %r91, %r76, %r79;
274
+ ld.shared.f32 %f58, [%r91];
275
+ add.s32 %r92, %r76, %r81;
276
+ ld.shared.f32 %f59, [%r92];
277
+ $L__tmp33:
278
+ .loc 1 35 28
279
+ bar.sync 0;
280
+ shl.b32 %r93, %r35, 2;
281
+ add.s32 %r94, %r76, %r93;
282
+ st.shared.f32 [%r94], %f56;
283
+ st.shared.f32 [%r94+4], %f57;
284
+ st.shared.f32 [%r94+8], %f58;
285
+ st.shared.f32 [%r94+12], %f59;
286
+ bar.sync 0;
287
+ shl.b32 %r95, %r36, 2;
288
+ add.s32 %r96, %r76, %r95;
289
+ .loc 1 36 20
290
+ shr.s32 %r98, %r44, 31;
291
+ shr.u32 %r99, %r98, 24;
292
+ add.s32 %r100, %r44, %r99;
293
+ shr.s32 %r101, %r100, 8;
294
+ and.b32 %r102, %r100, -256;
295
+ sub.s32 %r103, %r44, %r102;
296
+ .loc 1 38 30
297
+ mul.wide.s32 %rd11, %r101, 8;
298
+ add.s64 %rd4, %rd7, %rd11;
299
+ .loc 1 45 55
300
+ ld.shared.u32 %r31, [%r96];
301
+ .loc 1 38 35
302
+ mov.u64 %rd3, 0x0;
303
+ @%p1 ld.global.L1::evict_last.b64 { %rd3 }, [ %rd4 + 0 ];
304
+ .loc 1 41 32
305
+ shr.u64 %rd12, %rd3, 54;
306
+ and.b64 %rd13, %rd12, 512;
307
+ add.s64 %rd14, %rd13, %rd3;
308
+ .loc 1 45 30
309
+ shl.b64 %rd15, %rd14, 10;
310
+ add.s64 %rd16, %rd8, %rd15;
311
+ mul.wide.s32 %rd17, %r103, 4;
312
+ add.s64 %rd5, %rd16, %rd17;
313
+ .loc 1 45 55
314
+ bfe.u32 %r104, %r32, 4, 1;
315
+ shl.b32 %r105, %r37, 1;
316
+ or.b32 %r106, %r105, %r104;
317
+ setp.eq.s32 %p18, %r106, 0;
318
+ mov.u32 %r30, 0x0;
319
+ @%p18 atom.global.gpu.acq_rel.add.f32 %r30, [ %rd5 + 0 ], %r31;
320
+ .loc 1 45 4
321
+ ret;
322
+ $L__tmp34:
323
+ $L__func_end0:
324
+
325
+ }
326
+ .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
327
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
328
+ .section .debug_abbrev
329
+ {
330
+ .b8 1
331
+ .b8 17
332
+ .b8 1
333
+ .b8 37
334
+ .b8 8
335
+ .b8 19
336
+ .b8 5
337
+ .b8 3
338
+ .b8 8
339
+ .b8 16
340
+ .b8 6
341
+ .b8 27
342
+ .b8 8
343
+ .b8 180
344
+ .b8 66
345
+ .b8 12
346
+ .b8 17
347
+ .b8 1
348
+ .b8 18
349
+ .b8 1
350
+ .b8 0
351
+ .b8 0
352
+ .b8 2
353
+ .b8 46
354
+ .b8 0
355
+ .b8 135
356
+ .b8 64
357
+ .b8 8
358
+ .b8 3
359
+ .b8 8
360
+ .b8 58
361
+ .b8 11
362
+ .b8 59
363
+ .b8 11
364
+ .b8 63
365
+ .b8 12
366
+ .b8 32
367
+ .b8 11
368
+ .b8 0
369
+ .b8 0
370
+ .b8 3
371
+ .b8 46
372
+ .b8 1
373
+ .b8 17
374
+ .b8 1
375
+ .b8 18
376
+ .b8 1
377
+ .b8 64
378
+ .b8 10
379
+ .b8 49
380
+ .b8 19
381
+ .b8 0
382
+ .b8 0
383
+ .b8 4
384
+ .b8 29
385
+ .b8 1
386
+ .b8 49
387
+ .b8 19
388
+ .b8 17
389
+ .b8 1
390
+ .b8 18
391
+ .b8 1
392
+ .b8 88
393
+ .b8 11
394
+ .b8 89
395
+ .b8 11
396
+ .b8 87
397
+ .b8 11
398
+ .b8 0
399
+ .b8 0
400
+ .b8 5
401
+ .b8 29
402
+ .b8 0
403
+ .b8 49
404
+ .b8 19
405
+ .b8 17
406
+ .b8 1
407
+ .b8 18
408
+ .b8 1
409
+ .b8 88
410
+ .b8 11
411
+ .b8 89
412
+ .b8 11
413
+ .b8 87
414
+ .b8 11
415
+ .b8 0
416
+ .b8 0
417
+ .b8 0
418
+ }
419
+ .section .debug_info
420
+ {
421
+ .b32 264
422
+ .b8 2
423
+ .b8 0
424
+ .b32 .debug_abbrev
425
+ .b8 8
426
+ .b8 1
427
+ .b8 116
428
+ .b8 114
429
+ .b8 105
430
+ .b8 116
431
+ .b8 111
432
+ .b8 110
433
+ .b8 0
434
+ .b8 2
435
+ .b8 0
436
+ .b8 99
437
+ .b8 54
438
+ .b8 105
439
+ .b8 107
440
+ .b8 53
441
+ .b8 118
442
+ .b8 120
443
+ .b8 55
444
+ .b8 112
445
+ .b8 50
446
+ .b8 50
447
+ .b8 102
448
+ .b8 112
449
+ .b8 107
450
+ .b8 52
451
+ .b8 100
452
+ .b8 99
453
+ .b8 118
454
+ .b8 104
455
+ .b8 53
456
+ .b8 53
457
+ .b8 122
458
+ .b8 105
459
+ .b8 109
460
+ .b8 119
461
+ .b8 52
462
+ .b8 116
463
+ .b8 53
464
+ .b8 110
465
+ .b8 114
466
+ .b8 53
467
+ .b8 122
468
+ .b8 110
469
+ .b8 50
470
+ .b8 98
471
+ .b8 55
472
+ .b8 105
473
+ .b8 110
474
+ .b8 117
475
+ .b8 106
476
+ .b8 120
477
+ .b8 106
478
+ .b8 97
479
+ .b8 117
480
+ .b8 120
481
+ .b8 115
482
+ .b8 104
483
+ .b8 108
484
+ .b8 106
485
+ .b8 117
486
+ .b8 109
487
+ .b8 109
488
+ .b8 46
489
+ .b8 112
490
+ .b8 121
491
+ .b8 0
492
+ .b32 .debug_line
493
+ .b8 47
494
+ .b8 116
495
+ .b8 109
496
+ .b8 112
497
+ .b8 47
498
+ .b8 116
499
+ .b8 111
500
+ .b8 114
501
+ .b8 99
502
+ .b8 104
503
+ .b8 105
504
+ .b8 110
505
+ .b8 100
506
+ .b8 117
507
+ .b8 99
508
+ .b8 116
509
+ .b8 111
510
+ .b8 114
511
+ .b8 95
512
+ .b8 114
513
+ .b8 111
514
+ .b8 111
515
+ .b8 116
516
+ .b8 47
517
+ .b8 54
518
+ .b8 105
519
+ .b8 0
520
+ .b8 1
521
+ .b64 $L__func_begin0
522
+ .b64 $L__func_end0
523
+ .b8 2
524
+ .b8 116
525
+ .b8 114
526
+ .b8 105
527
+ .b8 116
528
+ .b8 111
529
+ .b8 110
530
+ .b8 95
531
+ .b8 95
532
+ .b8 48
533
+ .b8 100
534
+ .b8 49
535
+ .b8 100
536
+ .b8 50
537
+ .b8 100
538
+ .b8 51
539
+ .b8 100
540
+ .b8 101
541
+ .b8 52
542
+ .b8 101
543
+ .b8 0
544
+ .b8 116
545
+ .b8 114
546
+ .b8 105
547
+ .b8 116
548
+ .b8 111
549
+ .b8 110
550
+ .b8 95
551
+ .b8 95
552
+ .b8 48
553
+ .b8 100
554
+ .b8 49
555
+ .b8 100
556
+ .b8 50
557
+ .b8 100
558
+ .b8 51
559
+ .b8 100
560
+ .b8 101
561
+ .b8 52
562
+ .b8 101
563
+ .b8 0
564
+ .b8 1
565
+ .b8 18
566
+ .b8 1
567
+ .b8 1
568
+ .b8 3
569
+ .b64 $L__func_begin0
570
+ .b64 $L__func_end0
571
+ .b8 1
572
+ .b8 156
573
+ .b32 125
574
+ .b8 4
575
+ .b32 125
576
+ .b64 $L__tmp1
577
+ .b64 $L__tmp32
578
+ .b8 2
579
+ .b8 35
580
+ .b8 25
581
+ .b8 5
582
+ .b32 125
583
+ .b64 $L__tmp1
584
+ .b64 $L__tmp32
585
+ .b8 2
586
+ .b8 243
587
+ .b8 36
588
+ .b8 0
589
+ .b8 5
590
+ .b32 125
591
+ .b64 $L__tmp2
592
+ .b64 $L__tmp33
593
+ .b8 2
594
+ .b8 35
595
+ .b8 25
596
+ .b8 0
597
+ .b8 0
598
+ }
599
+ .section .debug_pubnames
600
+ {
601
+ .b32 $L__pubNames_end0-$L__pubNames_start0
602
+ $L__pubNames_start0:
603
+ .b8 2
604
+ .b8 0
605
+ .b32 .debug_info
606
+ .b32 268
607
+ .b32 125
608
+ .b8 116
609
+ .b8 114
610
+ .b8 105
611
+ .b8 116
612
+ .b8 111
613
+ .b8 110
614
+ .b8 95
615
+ .b8 95
616
+ .b8 48
617
+ .b8 100
618
+ .b8 49
619
+ .b8 100
620
+ .b8 50
621
+ .b8 100
622
+ .b8 51
623
+ .b8 100
624
+ .b8 101
625
+ .b8 52
626
+ .b8 101
627
+ .b8 0
628
+ .b32 0
629
+ $L__pubNames_end0:
630
+ }
631
+ .section .debug_pubtypes
632
+ {
633
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
634
+ $L__pubTypes_start0:
635
+ .b8 2
636
+ .b8 0
637
+ .b32 .debug_info
638
+ .b32 268
639
+ .b32 0
640
+ $L__pubTypes_end0:
641
+ }
642
+ .section .debug_loc { }
.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttgir ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<256> : tensor<16x1xi64, #blocked>
6
+ %cst_0 = arith.constant dense<0> : tensor<16x1xi64, #blocked>
7
+ %cst_1 = arith.constant dense<512> : tensor<16x1xi64, #blocked>
8
+ %cst_2 = arith.constant dense<256> : tensor<16x1xi32, #blocked>
9
+ %cst_3 = arith.constant dense<131072> : tensor<1x128xi32, #blocked1>
10
+ %cst_4 = arith.constant dense<120> : tensor<1x128xi32, #blocked1>
11
+ %cst_5 = arith.constant dense<0.000000e+00> : tensor<16x128xf32, #blocked1>
12
+ %cst_6 = arith.constant dense<true> : tensor<16x1xi1, #blocked>
13
+ %c16_i32 = arith.constant 16 : i32
14
+ %0 = tt.get_program_id x : i32
15
+ %1 = arith.muli %0, %c16_i32 : i32
16
+ %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
17
+ %3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
18
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1>
19
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked>
20
+ %6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked1>
21
+ %7 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked>
22
+ %8 = arith.addi %6, %4 : tensor<16x1xi32, #blocked1>
23
+ %9 = arith.addi %7, %5 : tensor<16x1xi32, #blocked>
24
+ %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
25
+ %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x128xi32, #blocked1>
26
+ %12 = arith.cmpi slt, %11, %cst_4 : tensor<1x128xi32, #blocked1>
27
+ %13 = arith.muli %11, %cst_3 : tensor<1x128xi32, #blocked1>
28
+ %14 = tt.broadcast %8 : (tensor<16x1xi32, #blocked1>) -> tensor<16x128xi32, #blocked1>
29
+ %15 = tt.broadcast %13 : (tensor<1x128xi32, #blocked1>) -> tensor<16x128xi32, #blocked1>
30
+ %16 = arith.addi %14, %15 : tensor<16x128xi32, #blocked1>
31
+ %17 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>, #blocked1>
32
+ %18 = tt.addptr %17, %16 : tensor<16x128x!tt.ptr<f32, 1>, #blocked1>, tensor<16x128xi32, #blocked1>
33
+ %19 = tt.broadcast %12 : (tensor<1x128xi1, #blocked1>) -> tensor<16x128xi1, #blocked1>
34
+ %20 = tt.load %18, %19, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32, #blocked1>
35
+ %21 = arith.addf %20, %cst_5 : tensor<16x128xf32, #blocked1>
36
+ %22 = arith.select %19, %21, %cst_5 : tensor<16x128xi1, #blocked1>, tensor<16x128xf32, #blocked1>
37
+ %23 = "tt.reduce"(%22) <{axis = 1 : i32}> ({
38
+ ^bb0(%arg5: f32, %arg6: f32):
39
+ %40 = arith.addf %arg5, %arg6 : f32
40
+ tt.reduce.return %40 : f32
41
+ }) : (tensor<16x128xf32, #blocked1>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
42
+ %24 = triton_gpu.convert_layout %23 : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
43
+ %25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked>
44
+ %26 = arith.divsi %9, %cst_2 : tensor<16x1xi32, #blocked>
45
+ %27 = arith.remsi %9, %cst_2 : tensor<16x1xi32, #blocked>
46
+ %28 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked>
47
+ %29 = tt.addptr %28, %26 : tensor<16x1x!tt.ptr<i64, 1>, #blocked>, tensor<16x1xi32, #blocked>
48
+ %30 = tt.load %29 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked>
49
+ %31 = arith.addi %30, %cst_1 : tensor<16x1xi64, #blocked>
50
+ %32 = arith.cmpi slt, %30, %cst_0 : tensor<16x1xi64, #blocked>
51
+ %33 = arith.select %32, %31, %30 : tensor<16x1xi1, #blocked>, tensor<16x1xi64, #blocked>
52
+ %34 = arith.muli %33, %cst : tensor<16x1xi64, #blocked>
53
+ %35 = arith.extsi %27 : tensor<16x1xi32, #blocked> to tensor<16x1xi64, #blocked>
54
+ %36 = arith.addi %35, %34 : tensor<16x1xi64, #blocked>
55
+ %37 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x1x!tt.ptr<f32, 1>, #blocked>
56
+ %38 = tt.addptr %37, %36 : tensor<16x1x!tt.ptr<f32, 1>, #blocked>, tensor<16x1xi64, #blocked>
57
+ %39 = "tt.atomic_rmw"(%38, %25, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<16x1x!tt.ptr<f32, 1>, #blocked>, tensor<16x1xf32, #blocked>, tensor<16x1xi1, #blocked>) -> tensor<16x1xf32, #blocked>
58
+ tt.return
59
+ }
60
+ }
.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttir ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<256> : tensor<16x1xi64>
4
+ %cst_0 = arith.constant dense<0> : tensor<16x1xi64>
5
+ %cst_1 = arith.constant dense<512> : tensor<16x1xi64>
6
+ %cst_2 = arith.constant dense<true> : tensor<16x1xi1>
7
+ %cst_3 = arith.constant dense<256> : tensor<16x1xi32>
8
+ %cst_4 = arith.constant dense<131072> : tensor<1x128xi32>
9
+ %cst_5 = arith.constant dense<120> : tensor<1x128xi32>
10
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<16x128xf32>
11
+ %c16_i32 = arith.constant 16 : i32
12
+ %0 = tt.get_program_id x : i32
13
+ %1 = arith.muli %0, %c16_i32 : i32
14
+ %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
15
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32>
16
+ %4 = tt.splat %1 : (i32) -> tensor<16x1xi32>
17
+ %5 = arith.addi %4, %3 : tensor<16x1xi32>
18
+ %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
19
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32>
20
+ %8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32>
21
+ %9 = arith.muli %7, %cst_4 : tensor<1x128xi32>
22
+ %10 = tt.broadcast %5 : (tensor<16x1xi32>) -> tensor<16x128xi32>
23
+ %11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<16x128xi32>
24
+ %12 = arith.addi %10, %11 : tensor<16x128xi32>
25
+ %13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>>
26
+ %14 = tt.addptr %13, %12 : tensor<16x128x!tt.ptr<f32, 1>>, tensor<16x128xi32>
27
+ %15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<16x128xi1>
28
+ %16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32>
29
+ %17 = arith.addf %16, %cst_6 : tensor<16x128xf32>
30
+ %18 = arith.select %15, %17, %cst_6 : tensor<16x128xi1>, tensor<16x128xf32>
31
+ %19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({
32
+ ^bb0(%arg5: f32, %arg6: f32):
33
+ %35 = arith.addf %arg5, %arg6 : f32
34
+ tt.reduce.return %35 : f32
35
+ }) : (tensor<16x128xf32>) -> tensor<16xf32>
36
+ %20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32>
37
+ %21 = arith.divsi %5, %cst_3 : tensor<16x1xi32>
38
+ %22 = arith.remsi %5, %cst_3 : tensor<16x1xi32>
39
+ %23 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>>
40
+ %24 = tt.addptr %23, %21 : tensor<16x1x!tt.ptr<i64, 1>>, tensor<16x1xi32>
41
+ %25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64>
42
+ %26 = arith.addi %25, %cst_1 : tensor<16x1xi64>
43
+ %27 = arith.cmpi slt, %25, %cst_0 : tensor<16x1xi64>
44
+ %28 = arith.select %27, %26, %25 : tensor<16x1xi1>, tensor<16x1xi64>
45
+ %29 = arith.muli %28, %cst : tensor<16x1xi64>
46
+ %30 = arith.extsi %22 : tensor<16x1xi32> to tensor<16x1xi64>
47
+ %31 = arith.addi %30, %29 : tensor<16x1xi64>
48
+ %32 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x1x!tt.ptr<f32, 1>>
49
+ %33 = tt.addptr %32, %31 : tensor<16x1x!tt.ptr<f32, 1>>, tensor<16x1xi64>
50
+ %34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<16x1x!tt.ptr<f32, 1>>, tensor<16x1xf32>, tensor<16x1xi1>) -> tensor<16x1xf32>
51
+ tt.return
52
+ }
53
+ }
.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ttgir ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
5
+ %cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked>
6
+ %cst_1 = arith.constant 0.000000e+00 : f32
7
+ %c256_i32 = arith.constant 256 : i32
8
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
9
+ %cst_3 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked>
10
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
11
+ %0 = tt.get_program_id x : i32
12
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
13
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
14
+ %3 = arith.muli %0, %c256_i32 : i32
15
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
16
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
17
+ %6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
18
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
19
+ %8 = tt.load %7, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
20
+ %9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
21
+ %10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
22
+ %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
23
+ %12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
24
+ %13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
25
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
26
+ %15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
27
+ %16 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
28
+ %17 = tt.addptr %16, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
29
+ %18 = tt.load %17, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
30
+ %19 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
31
+ %20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
32
+ %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
33
+ %22 = arith.mulf %9, %12 : tensor<256xf32, #blocked>
34
+ %23 = arith.select %2, %22, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
35
+ %24 = "tt.reduce"(%23) <{axis = 0 : i32}> ({
36
+ ^bb0(%arg8: f32, %arg9: f32):
37
+ %43 = arith.addf %arg8, %arg9 : f32
38
+ tt.reduce.return %43 : f32
39
+ }) : (tensor<256xf32, #blocked>) -> f32
40
+ %25 = arith.addf %24, %cst_1 : f32
41
+ %26 = arith.mulf %22, %15 : tensor<256xf32, #blocked>
42
+ %27 = arith.select %2, %26, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
43
+ %28 = "tt.reduce"(%27) <{axis = 0 : i32}> ({
44
+ ^bb0(%arg8: f32, %arg9: f32):
45
+ %43 = arith.addf %arg8, %arg9 : f32
46
+ tt.reduce.return %43 : f32
47
+ }) : (tensor<256xf32, #blocked>) -> f32
48
+ %29 = arith.addf %28, %cst_1 : f32
49
+ %30 = arith.divf %21, %cst_0 : tensor<1xf32, #blocked>
50
+ %31 = arith.mulf %22, %cst_3 : tensor<256xf32, #blocked>
51
+ %32 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked>
52
+ %33 = arith.subf %31, %32 : tensor<256xf32, #blocked>
53
+ %34 = tt.splat %29 : (f32) -> tensor<256xf32, #blocked>
54
+ %35 = arith.mulf %15, %34 : tensor<256xf32, #blocked>
55
+ %36 = arith.subf %33, %35 : tensor<256xf32, #blocked>
56
+ %37 = tt.broadcast %30 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
57
+ %38 = arith.mulf %37, %36 : tensor<256xf32, #blocked>
58
+ %39 = arith.addf %18, %38 : tensor<256xf32, #blocked>
59
+ tt.store %17, %39, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
60
+ %40 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
61
+ %41 = tt.addptr %40, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
62
+ %42 = arith.truncf %39 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
63
+ tt.store %41, %42, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
64
+ tt.return
65
+ }
66
+ }
.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.cubin ADDED
Binary file (4.65 kB). View file
 
.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.llir ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 {
5
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %4 = shl i32 %3, 2, !dbg !8
7
+ %5 = and i32 %4, 508, !dbg !8
8
+ %6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
9
+ %7 = shl i32 %6, 10, !dbg !10
10
+ %8 = or i32 %7, %5, !dbg !11
11
+ %9 = or i32 %8, 512, !dbg !11
12
+ %10 = sext i32 %8 to i64, !dbg !12
13
+ %11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12
14
+ %12 = sext i32 %9 to i64, !dbg !12
15
+ %13 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !12
16
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %11, i1 true) #1, !dbg !13
17
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %13, i1 true) #1, !dbg !13
18
+ ret void, !dbg !14
19
+ }
20
+
21
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
22
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
23
+
24
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
25
+ attributes #1 = { nounwind }
26
+
27
+ !llvm.module.flags = !{!0}
28
+ !llvm.dbg.cu = !{!1}
29
+ !nvvm.annotations = !{!3, !4, !4, !3}
30
+
31
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
32
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
33
+ !2 = !DIFile(filename: "c7w5r66fcggm6aokktzwmg24mlevq2hqdw2bgwzwlovrel6re5ym.py", directory: "/tmp/torchinductor_root/7w")
34
+ !3 = !{ptr @triton__0d1de, !"kernel", i32 1}
35
+ !4 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
36
+ !5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
37
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
38
+ !7 = !{}
39
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
40
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
41
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
42
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
43
+ !12 = !DILocation(line: 25, column: 25, scope: !5)
44
+ !13 = !DILocation(line: 25, column: 36, scope: !5)
45
+ !14 = !DILocation(line: 25, column: 4, scope: !5)
.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ptx ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1de
10
+
11
+ .visible .entry triton__0d1de(
12
+ .param .u64 triton__0d1de_param_0,
13
+ .param .u32 triton__0d1de_param_1
14
+ )
15
+ .maxntid 128, 1, 1
16
+ {
17
+ .reg .pred %p<3>;
18
+ .reg .b32 %r<15>;
19
+ .reg .b64 %rd<5>;
20
+ .loc 1 18 0
21
+ $L__func_begin0:
22
+ .loc 1 18 0
23
+
24
+ ld.param.u64 %rd3, [triton__0d1de_param_0];
25
+ $L__tmp0:
26
+ .loc 1 21 36
27
+ mov.u32 %r10, %tid.x;
28
+ shl.b32 %r11, %r10, 2;
29
+ and.b32 %r12, %r11, 508;
30
+ .loc 1 20 28
31
+ mov.u32 %r1, %ctaid.x;
32
+ .loc 1 20 33
33
+ shl.b32 %r13, %r1, 10;
34
+ .loc 1 21 23
35
+ or.b32 %r14, %r13, %r12;
36
+ .loc 1 25 25
37
+ mul.wide.s32 %rd4, %r14, 4;
38
+ add.s64 %rd1, %rd3, %rd4;
39
+ add.s64 %rd2, %rd1, 2048;
40
+ mov.b32 %r2, 0;
41
+ mov.pred %p1, -1;
42
+ .loc 1 25 36
43
+ @%p1 st.global.v4.b32 [ %rd1 + 0 ], { %r2, %r2, %r2, %r2 };
44
+ @%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r2, %r2, %r2, %r2 };
45
+ .loc 1 25 4
46
+ ret;
47
+ $L__tmp1:
48
+ $L__func_end0:
49
+
50
+ }
51
+ .file 1 "/tmp/torchinductor_root/7w/c7w5r66fcggm6aokktzwmg24mlevq2hqdw2bgwzwlovrel6re5ym.py"
52
+ .section .debug_abbrev
53
+ {
54
+ .b8 1
55
+ .b8 17
56
+ .b8 1
57
+ .b8 37
58
+ .b8 8
59
+ .b8 19
60
+ .b8 5
61
+ .b8 3
62
+ .b8 8
63
+ .b8 16
64
+ .b8 6
65
+ .b8 27
66
+ .b8 8
67
+ .b8 180
68
+ .b8 66
69
+ .b8 12
70
+ .b8 17
71
+ .b8 1
72
+ .b8 18
73
+ .b8 1
74
+ .b8 0
75
+ .b8 0
76
+ .b8 2
77
+ .b8 46
78
+ .b8 0
79
+ .b8 17
80
+ .b8 1
81
+ .b8 18
82
+ .b8 1
83
+ .b8 64
84
+ .b8 10
85
+ .b8 135
86
+ .b8 64
87
+ .b8 8
88
+ .b8 3
89
+ .b8 8
90
+ .b8 58
91
+ .b8 11
92
+ .b8 59
93
+ .b8 11
94
+ .b8 63
95
+ .b8 12
96
+ .b8 0
97
+ .b8 0
98
+ .b8 0
99
+ }
100
+ .section .debug_info
101
+ {
102
+ .b32 172
103
+ .b8 2
104
+ .b8 0
105
+ .b32 .debug_abbrev
106
+ .b8 8
107
+ .b8 1
108
+ .b8 116
109
+ .b8 114
110
+ .b8 105
111
+ .b8 116
112
+ .b8 111
113
+ .b8 110
114
+ .b8 0
115
+ .b8 2
116
+ .b8 0
117
+ .b8 99
118
+ .b8 55
119
+ .b8 119
120
+ .b8 53
121
+ .b8 114
122
+ .b8 54
123
+ .b8 54
124
+ .b8 102
125
+ .b8 99
126
+ .b8 103
127
+ .b8 103
128
+ .b8 109
129
+ .b8 54
130
+ .b8 97
131
+ .b8 111
132
+ .b8 107
133
+ .b8 107
134
+ .b8 116
135
+ .b8 122
136
+ .b8 119
137
+ .b8 109
138
+ .b8 103
139
+ .b8 50
140
+ .b8 52
141
+ .b8 109
142
+ .b8 108
143
+ .b8 101
144
+ .b8 118
145
+ .b8 113
146
+ .b8 50
147
+ .b8 104
148
+ .b8 113
149
+ .b8 100
150
+ .b8 119
151
+ .b8 50
152
+ .b8 98
153
+ .b8 103
154
+ .b8 119
155
+ .b8 122
156
+ .b8 119
157
+ .b8 108
158
+ .b8 111
159
+ .b8 118
160
+ .b8 114
161
+ .b8 101
162
+ .b8 108
163
+ .b8 54
164
+ .b8 114
165
+ .b8 101
166
+ .b8 53
167
+ .b8 121
168
+ .b8 109
169
+ .b8 46
170
+ .b8 112
171
+ .b8 121
172
+ .b8 0
173
+ .b32 .debug_line
174
+ .b8 47
175
+ .b8 116
176
+ .b8 109
177
+ .b8 112
178
+ .b8 47
179
+ .b8 116
180
+ .b8 111
181
+ .b8 114
182
+ .b8 99
183
+ .b8 104
184
+ .b8 105
185
+ .b8 110
186
+ .b8 100
187
+ .b8 117
188
+ .b8 99
189
+ .b8 116
190
+ .b8 111
191
+ .b8 114
192
+ .b8 95
193
+ .b8 114
194
+ .b8 111
195
+ .b8 111
196
+ .b8 116
197
+ .b8 47
198
+ .b8 55
199
+ .b8 119
200
+ .b8 0
201
+ .b8 1
202
+ .b64 $L__func_begin0
203
+ .b64 $L__func_end0
204
+ .b8 2
205
+ .b64 $L__func_begin0
206
+ .b64 $L__func_end0
207
+ .b8 1
208
+ .b8 156
209
+ .b8 116
210
+ .b8 114
211
+ .b8 105
212
+ .b8 116
213
+ .b8 111
214
+ .b8 110
215
+ .b8 95
216
+ .b8 95
217
+ .b8 48
218
+ .b8 100
219
+ .b8 49
220
+ .b8 100
221
+ .b8 101
222
+ .b8 0
223
+ .b8 116
224
+ .b8 114
225
+ .b8 105
226
+ .b8 116
227
+ .b8 111
228
+ .b8 110
229
+ .b8 95
230
+ .b8 95
231
+ .b8 48
232
+ .b8 100
233
+ .b8 49
234
+ .b8 100
235
+ .b8 101
236
+ .b8 0
237
+ .b8 1
238
+ .b8 18
239
+ .b8 1
240
+ .b8 0
241
+ }
242
+ .section .debug_pubnames
243
+ {
244
+ .b32 $L__pubNames_end0-$L__pubNames_start0
245
+ $L__pubNames_start0:
246
+ .b8 2
247
+ .b8 0
248
+ .b32 .debug_info
249
+ .b32 176
250
+ .b32 125
251
+ .b8 116
252
+ .b8 114
253
+ .b8 105
254
+ .b8 116
255
+ .b8 111
256
+ .b8 110
257
+ .b8 95
258
+ .b8 95
259
+ .b8 48
260
+ .b8 100
261
+ .b8 49
262
+ .b8 100
263
+ .b8 101
264
+ .b8 0
265
+ .b32 0
266
+ $L__pubNames_end0:
267
+ }
268
+ .section .debug_pubtypes
269
+ {
270
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
271
+ $L__pubTypes_start0:
272
+ .b8 2
273
+ .b8 0
274
+ .b32 .debug_info
275
+ .b32 176
276
+ .b32 0
277
+ $L__pubTypes_end0:
278
+ }
279
+ .section .debug_loc { }
.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttgir ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked>
5
+ %c1024_i32 = arith.constant 1024 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c1024_i32 : i32
8
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
9
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
10
+ %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
11
+ %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
12
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi32, #blocked>
13
+ tt.store %6, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked>
14
+ tt.return
15
+ }
16
+ }
.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttir ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32>
4
+ %c1024_i32 = arith.constant 1024 : i32
5
+ %0 = tt.get_program_id x : i32
6
+ %1 = arith.muli %0, %c1024_i32 : i32
7
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
8
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
9
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
10
+ %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
11
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
12
+ tt.store %6, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
13
+ tt.return
14
+ }
15
+ }
.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ttgir ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5d6e7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg6: i64 {tt.max_divisibility = 8 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<7680> : tensor<1x2048xi64, #blocked>
7
+ %cst_0 = arith.constant dense<7680> : tensor<1x2048xi64, #blocked1>
8
+ %cst_1 = arith.constant dense<50257> : tensor<1x2048xi64, #blocked>
9
+ %c385973760_i64 = arith.constant 385973760 : i64
10
+ %c7680_i64 = arith.constant 7680 : i64
11
+ %c8_i64 = arith.constant 8 : i64
12
+ %cst_2 = arith.constant dense<-1> : tensor<1x2048xi64, #blocked>
13
+ %cst_3 = arith.constant dense<0> : tensor<1x2048xi64, #blocked>
14
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked1>
15
+ %cst_5 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked>
16
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked1>
17
+ %c0_i32 = arith.constant 0 : i32
18
+ %c7680_i32 = arith.constant 7680 : i32
19
+ %c2048_i32 = arith.constant 2048 : i32
20
+ %0 = tt.get_program_id x : i32
21
+ %1 = arith.extsi %0 : i32 to i64
22
+ %2 = arith.cmpi slt, %1, %c8_i64 : i64
23
+ %3 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
24
+ %4 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
25
+ %5 = tt.expand_dims %3 {axis = 0 : i32} : (tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x2048xi32, #blocked>
26
+ %6 = tt.expand_dims %4 {axis = 0 : i32} : (tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x2048xi32, #blocked1>
27
+ %7 = arith.extsi %5 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked>
28
+ %8 = arith.extsi %6 : tensor<1x2048xi32, #blocked1> to tensor<1x2048xi64, #blocked1>
29
+ %9 = arith.muli %1, %c7680_i64 : i64
30
+ %10 = tt.splat %9 : (i64) -> tensor<1x2048xi64, #blocked>
31
+ %11 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<1x2048x!tt.ptr<i64, 1>, #blocked>
32
+ %12 = tt.splat %2 : (i1) -> tensor<1x2048xi1, #blocked>
33
+ %13 = tt.splat %2 : (i1) -> tensor<1x2048xi1, #blocked1>
34
+ %14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>, #blocked>
35
+ %15 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>, #blocked>
36
+ %16 = arith.muli %1, %c385973760_i64 : i64
37
+ %17 = tt.splat %16 : (i64) -> tensor<1x2048xi64, #blocked>
38
+ %18 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>
39
+ %19:2 = scf.for %arg8 = %c0_i32 to %c7680_i32 step %c2048_i32 iter_args(%arg9 = %cst_4, %arg10 = %cst_3) -> (tensor<1x2048xf32, #blocked1>, tensor<1x2048xi64, #blocked>) : i32 {
40
+ %30 = arith.extsi %arg8 : i32 to i64
41
+ %31 = tt.splat %30 : (i64) -> tensor<1x2048xi64, #blocked>
42
+ %32 = tt.splat %30 : (i64) -> tensor<1x2048xi64, #blocked1>
43
+ %33 = arith.addi %31, %7 : tensor<1x2048xi64, #blocked>
44
+ %34 = arith.addi %32, %8 : tensor<1x2048xi64, #blocked1>
45
+ %35 = arith.cmpi slt, %33, %cst : tensor<1x2048xi64, #blocked>
46
+ %36 = arith.cmpi slt, %34, %cst_0 : tensor<1x2048xi64, #blocked1>
47
+ %37 = arith.addi %33, %10 : tensor<1x2048xi64, #blocked>
48
+ %38 = tt.addptr %11, %37 : tensor<1x2048x!tt.ptr<i64, 1>, #blocked>, tensor<1x2048xi64, #blocked>
49
+ %39 = arith.andi %35, %12 : tensor<1x2048xi1, #blocked>
50
+ %40 = arith.andi %36, %13 : tensor<1x2048xi1, #blocked1>
51
+ %41 = tt.load %38, %39, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xi64, #blocked>
52
+ %42 = tt.addptr %14, %37 : tensor<1x2048x!tt.ptr<f32, 1>, #blocked>, tensor<1x2048xi64, #blocked>
53
+ %43 = tt.load %42, %39, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xf32, #blocked>
54
+ %44 = triton_gpu.convert_layout %43 : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked1>
55
+ %45 = tt.addptr %15, %37 : tensor<1x2048x!tt.ptr<f32, 1>, #blocked>, tensor<1x2048xi64, #blocked>
56
+ %46 = tt.load %45, %39, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xf32, #blocked>
57
+ %47 = arith.cmpi ne, %41, %cst_2 : tensor<1x2048xi64, #blocked>
58
+ %48 = triton_gpu.convert_layout %47 : (tensor<1x2048xi1, #blocked>) -> tensor<1x2048xi1, #blocked1>
59
+ %49 = arith.select %47, %41, %cst_3 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked>
60
+ %50 = arith.addi %49, %cst_1 : tensor<1x2048xi64, #blocked>
61
+ %51 = arith.cmpi slt, %49, %cst_3 : tensor<1x2048xi64, #blocked>
62
+ %52 = arith.select %51, %50, %49 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked>
63
+ %53 = arith.cmpi sge, %52, %cst_3 : tensor<1x2048xi64, #blocked>
64
+ %54 = arith.cmpi slt, %52, %cst_1 : tensor<1x2048xi64, #blocked>
65
+ %55 = arith.andi %53, %54 : tensor<1x2048xi1, #blocked>
66
+ %56 = triton_gpu.convert_layout %55 : (tensor<1x2048xi1, #blocked>) -> tensor<1x2048xi1, #blocked2>
67
+ tt.assert %56, "index out of bounds: 0 <= tmp7 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1x2048xi1, #blocked2>
68
+ %57 = arith.muli %33, %cst_1 : tensor<1x2048xi64, #blocked>
69
+ %58 = arith.addi %52, %57 : tensor<1x2048xi64, #blocked>
70
+ %59 = arith.addi %58, %17 : tensor<1x2048xi64, #blocked>
71
+ %60 = tt.addptr %18, %59 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
72
+ %61 = triton_gpu.convert_layout %60 : (tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked1>
73
+ %62 = tt.load %61, %40, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xbf16, #blocked1>
74
+ %63 = arith.extf %62 : tensor<1x2048xbf16, #blocked1> to tensor<1x2048xf32, #blocked1>
75
+ %64 = arith.subf %63, %44 : tensor<1x2048xf32, #blocked1>
76
+ %65 = math.log %46 : tensor<1x2048xf32, #blocked>
77
+ %66 = triton_gpu.convert_layout %65 : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked1>
78
+ %67 = arith.subf %64, %66 : tensor<1x2048xf32, #blocked1>
79
+ %68 = arith.subf %cst_4, %67 : tensor<1x2048xf32, #blocked1>
80
+ %69 = arith.select %48, %68, %cst_4 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xf32, #blocked1>
81
+ %70 = arith.addf %arg9, %69 : tensor<1x2048xf32, #blocked1>
82
+ %71 = arith.select %40, %70, %arg9 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xf32, #blocked1>
83
+ %72 = arith.extui %47 : tensor<1x2048xi1, #blocked> to tensor<1x2048xi64, #blocked>
84
+ %73 = arith.addi %arg10, %72 : tensor<1x2048xi64, #blocked>
85
+ %74 = arith.select %39, %73, %arg10 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked>
86
+ scf.yield %71, %74 : tensor<1x2048xf32, #blocked1>, tensor<1x2048xi64, #blocked>
87
+ }
88
+ %20 = "tt.reduce"(%19#0) <{axis = 1 : i32}> ({
89
+ ^bb0(%arg8: f32, %arg9: f32):
90
+ %30 = arith.addf %arg8, %arg9 : f32
91
+ tt.reduce.return %30 : f32
92
+ }) : (tensor<1x2048xf32, #blocked1>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
93
+ %21 = tt.expand_dims %20 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<1x1xf32, #blocked1>
94
+ %22 = tt.addptr %arg4, %1 : !tt.ptr<f32, 1>, i64
95
+ %23 = tt.splat %22 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>, #blocked1>
96
+ %24 = tt.splat %2 : (i1) -> tensor<1x1xi1, #blocked1>
97
+ tt.store %23, %21, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32, #blocked1>
98
+ %25 = "tt.reduce"(%19#1) <{axis = 1 : i32}> ({
99
+ ^bb0(%arg8: i64, %arg9: i64):
100
+ %30 = arith.addi %arg8, %arg9 : i64
101
+ tt.reduce.return %30 : i64
102
+ }) : (tensor<1x2048xi64, #blocked>) -> tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
103
+ %26 = triton_gpu.convert_layout %25 : (tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
104
+ %27 = tt.expand_dims %26 {axis = 1 : i32} : (tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<1x1xi64, #blocked1>
105
+ %28 = tt.addptr %arg5, %1 : !tt.ptr<i64, 1>, i64
106
+ %29 = tt.splat %28 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked1>
107
+ tt.store %29, %27, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64, #blocked1>
108
+ tt.return
109
+ }
110
+ }
.triton/dump/962d1809855a53123762906133b1d960/triton_.cubin ADDED
Binary file (4.9 kB). View file
 
.triton/dump/962d1809855a53123762906133b1d960/triton_.llir ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 {
5
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %4 = shl i32 %3, 2, !dbg !8
7
+ %5 = and i32 %4, 508, !dbg !8
8
+ %6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
9
+ %7 = shl i32 %6, 10, !dbg !10
10
+ %8 = or i32 %7, %5, !dbg !11
11
+ %9 = or i32 %8, 512, !dbg !11
12
+ %10 = icmp slt i32 %8, 12865792, !dbg !12
13
+ %11 = icmp slt i32 %9, 12865792, !dbg !12
14
+ %12 = sext i32 %8 to i64, !dbg !13
15
+ %13 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !13
16
+ %14 = sext i32 %9 to i64, !dbg !13
17
+ %15 = getelementptr float, ptr addrspace(1) %0, i64 %14, !dbg !13
18
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %13, i1 %10) #1, !dbg !14
19
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %15, i1 %11) #1, !dbg !14
20
+ ret void, !dbg !15
21
+ }
22
+
23
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
24
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
25
+
26
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
27
+ attributes #1 = { nounwind }
28
+
29
+ !llvm.module.flags = !{!0}
30
+ !llvm.dbg.cu = !{!1}
31
+ !nvvm.annotations = !{!3, !4, !4, !3}
32
+
33
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
34
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
35
+ !2 = !DIFile(filename: "c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py", directory: "/tmp/torchinductor_root/4y")
36
+ !3 = !{ptr @triton__0d1de, !"kernel", i32 1}
37
+ !4 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
38
+ !5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
39
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
40
+ !7 = !{}
41
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
42
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
43
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
44
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
45
+ !12 = !DILocation(line: 22, column: 21, scope: !5)
46
+ !13 = !DILocation(line: 25, column: 25, scope: !5)
47
+ !14 = !DILocation(line: 25, column: 36, scope: !5)
48
+ !15 = !DILocation(line: 25, column: 4, scope: !5)
.triton/dump/962d1809855a53123762906133b1d960/triton_.ptx ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1de
10
+
11
+ .visible .entry triton__0d1de(
12
+ .param .u64 triton__0d1de_param_0,
13
+ .param .u32 triton__0d1de_param_1
14
+ )
15
+ .maxntid 128, 1, 1
16
+ {
17
+ .reg .pred %p<3>;
18
+ .reg .b32 %r<16>;
19
+ .reg .b64 %rd<5>;
20
+ .loc 1 18 0
21
+ $L__func_begin0:
22
+ .loc 1 18 0
23
+
24
+ ld.param.u64 %rd3, [triton__0d1de_param_0];
25
+ $L__tmp0:
26
+ .loc 1 21 36
27
+ mov.u32 %r10, %tid.x;
28
+ shl.b32 %r11, %r10, 2;
29
+ and.b32 %r12, %r11, 508;
30
+ .loc 1 20 28
31
+ mov.u32 %r1, %ctaid.x;
32
+ .loc 1 20 33
33
+ shl.b32 %r13, %r1, 10;
34
+ .loc 1 21 23
35
+ or.b32 %r14, %r13, %r12;
36
+ or.b32 %r15, %r14, 512;
37
+ .loc 1 22 21
38
+ setp.lt.s32 %p1, %r14, 12865792;
39
+ setp.lt.s32 %p2, %r15, 12865792;
40
+ .loc 1 25 25
41
+ mul.wide.s32 %rd4, %r14, 4;
42
+ add.s64 %rd1, %rd3, %rd4;
43
+ add.s64 %rd2, %rd1, 2048;
44
+ mov.b32 %r2, 0;
45
+ .loc 1 25 36
46
+ @%p1 st.global.v4.b32 [ %rd1 + 0 ], { %r2, %r2, %r2, %r2 };
47
+ @%p2 st.global.v4.b32 [ %rd2 + 0 ], { %r2, %r2, %r2, %r2 };
48
+ .loc 1 25 4
49
+ ret;
50
+ $L__tmp1:
51
+ $L__func_end0:
52
+
53
+ }
54
+ .file 1 "/tmp/torchinductor_root/4y/c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py"
55
+ .section .debug_abbrev
56
+ {
57
+ .b8 1
58
+ .b8 17
59
+ .b8 1
60
+ .b8 37
61
+ .b8 8
62
+ .b8 19
63
+ .b8 5
64
+ .b8 3
65
+ .b8 8
66
+ .b8 16
67
+ .b8 6
68
+ .b8 27
69
+ .b8 8
70
+ .b8 180
71
+ .b8 66
72
+ .b8 12
73
+ .b8 17
74
+ .b8 1
75
+ .b8 18
76
+ .b8 1
77
+ .b8 0
78
+ .b8 0
79
+ .b8 2
80
+ .b8 46
81
+ .b8 0
82
+ .b8 17
83
+ .b8 1
84
+ .b8 18
85
+ .b8 1
86
+ .b8 64
87
+ .b8 10
88
+ .b8 135
89
+ .b8 64
90
+ .b8 8
91
+ .b8 3
92
+ .b8 8
93
+ .b8 58
94
+ .b8 11
95
+ .b8 59
96
+ .b8 11
97
+ .b8 63
98
+ .b8 12
99
+ .b8 0
100
+ .b8 0
101
+ .b8 0
102
+ }
103
+ .section .debug_info
104
+ {
105
+ .b32 172
106
+ .b8 2
107
+ .b8 0
108
+ .b32 .debug_abbrev
109
+ .b8 8
110
+ .b8 1
111
+ .b8 116
112
+ .b8 114
113
+ .b8 105
114
+ .b8 116
115
+ .b8 111
116
+ .b8 110
117
+ .b8 0
118
+ .b8 2
119
+ .b8 0
120
+ .b8 99
121
+ .b8 52
122
+ .b8 121
123
+ .b8 115
124
+ .b8 101
125
+ .b8 108
126
+ .b8 100
127
+ .b8 119
128
+ .b8 109
129
+ .b8 117
130
+ .b8 51
131
+ .b8 116
132
+ .b8 111
133
+ .b8 53
134
+ .b8 50
135
+ .b8 112
136
+ .b8 98
137
+ .b8 104
138
+ .b8 50
139
+ .b8 109
140
+ .b8 100
141
+ .b8 50
142
+ .b8 111
143
+ .b8 101
144
+ .b8 117
145
+ .b8 102
146
+ .b8 114
147
+ .b8 113
148
+ .b8 51
149
+ .b8 102
150
+ .b8 99
151
+ .b8 100
152
+ .b8 109
153
+ .b8 97
154
+ .b8 112
155
+ .b8 107
156
+ .b8 116
157
+ .b8 52
158
+ .b8 110
159
+ .b8 120
160
+ .b8 100
161
+ .b8 122
162
+ .b8 109
163
+ .b8 121
164
+ .b8 113
165
+ .b8 116
166
+ .b8 103
167
+ .b8 100
168
+ .b8 50
169
+ .b8 121
170
+ .b8 115
171
+ .b8 112
172
+ .b8 46
173
+ .b8 112
174
+ .b8 121
175
+ .b8 0
176
+ .b32 .debug_line
177
+ .b8 47
178
+ .b8 116
179
+ .b8 109
180
+ .b8 112
181
+ .b8 47
182
+ .b8 116
183
+ .b8 111
184
+ .b8 114
185
+ .b8 99
186
+ .b8 104
187
+ .b8 105
188
+ .b8 110
189
+ .b8 100
190
+ .b8 117
191
+ .b8 99
192
+ .b8 116
193
+ .b8 111
194
+ .b8 114
195
+ .b8 95
196
+ .b8 114
197
+ .b8 111
198
+ .b8 111
199
+ .b8 116
200
+ .b8 47
201
+ .b8 52
202
+ .b8 121
203
+ .b8 0
204
+ .b8 1
205
+ .b64 $L__func_begin0
206
+ .b64 $L__func_end0
207
+ .b8 2
208
+ .b64 $L__func_begin0
209
+ .b64 $L__func_end0
210
+ .b8 1
211
+ .b8 156
212
+ .b8 116
213
+ .b8 114
214
+ .b8 105
215
+ .b8 116
216
+ .b8 111
217
+ .b8 110
218
+ .b8 95
219
+ .b8 95
220
+ .b8 48
221
+ .b8 100
222
+ .b8 49
223
+ .b8 100
224
+ .b8 101
225
+ .b8 0
226
+ .b8 116
227
+ .b8 114
228
+ .b8 105
229
+ .b8 116
230
+ .b8 111
231
+ .b8 110
232
+ .b8 95
233
+ .b8 95
234
+ .b8 48
235
+ .b8 100
236
+ .b8 49
237
+ .b8 100
238
+ .b8 101
239
+ .b8 0
240
+ .b8 1
241
+ .b8 18
242
+ .b8 1
243
+ .b8 0
244
+ }
245
+ .section .debug_pubnames
246
+ {
247
+ .b32 $L__pubNames_end0-$L__pubNames_start0
248
+ $L__pubNames_start0:
249
+ .b8 2
250
+ .b8 0
251
+ .b32 .debug_info
252
+ .b32 176
253
+ .b32 125
254
+ .b8 116
255
+ .b8 114
256
+ .b8 105
257
+ .b8 116
258
+ .b8 111
259
+ .b8 110
260
+ .b8 95
261
+ .b8 95
262
+ .b8 48
263
+ .b8 100
264
+ .b8 49
265
+ .b8 100
266
+ .b8 101
267
+ .b8 0
268
+ .b32 0
269
+ $L__pubNames_end0:
270
+ }
271
+ .section .debug_pubtypes
272
+ {
273
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
274
+ $L__pubTypes_start0:
275
+ .b8 2
276
+ .b8 0
277
+ .b32 .debug_info
278
+ .b32 176
279
+ .b32 0
280
+ $L__pubTypes_end0:
281
+ }
282
+ .section .debug_loc { }
.triton/dump/962d1809855a53123762906133b1d960/triton_.ttgir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<12865792> : tensor<1024xi32, #blocked>
5
+ %c1024_i32 = arith.constant 1024 : i32
6
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked>
7
+ %0 = tt.get_program_id x : i32
8
+ %1 = arith.muli %0, %c1024_i32 : i32
9
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
10
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
11
+ %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
12
+ %5 = arith.cmpi slt, %4, %cst : tensor<1024xi32, #blocked>
13
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
14
+ %7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi32, #blocked>
15
+ tt.store %7, %cst_0, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/962d1809855a53123762906133b1d960/triton_.ttir ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32>
4
+ %cst_0 = arith.constant dense<12865792> : tensor<1024xi32>
5
+ %c1024_i32 = arith.constant 1024 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c1024_i32 : i32
8
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
9
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
10
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
11
+ %5 = arith.cmpi slt, %4, %cst_0 : tensor<1024xi32>
12
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
13
+ %7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
14
+ tt.store %7, %cst, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
15
+ tt.return
16
+ }
17
+ }
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.cubin ADDED
Binary file (49.4 kB). View file
 
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.llir ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3d4d5d6d7d8d9d10de11de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, i32 %10, i32 %11) local_unnamed_addr !dbg !5 {
7
+ %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %14 = and i32 %13, 31, !dbg !8
9
+ %15 = lshr i32 %13, 5, !dbg !8
10
+ %16 = shl i32 %13, 2, !dbg !8
11
+ %17 = and i32 %16, 60, !dbg !8
12
+ %18 = and i32 %15, 3, !dbg !8
13
+ %19 = lshr i32 %14, 1, !dbg !8
14
+ %20 = shl nuw nsw i32 %18, 4, !dbg !8
15
+ %21 = or i32 %20, %19, !dbg !8
16
+ %22 = and i32 %16, 4, !dbg !9
17
+ %23 = lshr i32 %14, 4, !dbg !9
18
+ %24 = shl nuw nsw i32 %18, 1, !dbg !9
19
+ %25 = or i32 %24, %23, !dbg !9
20
+ %26 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
21
+ %27 = shl i32 %26, 6, !dbg !11
22
+ %28 = or i32 %27, %17, !dbg !12
23
+ %29 = or i32 %27, %21, !dbg !12
24
+ %.frozen = freeze i32 %28
25
+ %30 = sdiv i32 %.frozen, 256, !dbg !13
26
+ %31 = mul i32 %30, 256
27
+ %.decomposed = sub i32 %.frozen, %31
28
+ %32 = sdiv i32 %29, 256, !dbg !13
29
+ %33 = shl i32 %30, 15, !dbg !14
30
+ %34 = shl nsw i32 %32, 7, !dbg !15
31
+ %35 = add i32 %33, %.decomposed
32
+ %36 = mul nuw nsw i32 %17, 12
33
+ %37 = or i32 %25, %36
34
+ %38 = zext nneg i32 %37 to i64
35
+ %39 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %38
36
+ %40 = or i32 %36, 12
37
+ %41 = add nuw nsw i32 %40, %25
38
+ %42 = zext nneg i32 %41 to i64
39
+ %43 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %42
40
+ %44 = add nuw nsw i32 %36, 24
41
+ %45 = or i32 %44, %25
42
+ %46 = zext nneg i32 %45 to i64
43
+ %47 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %46
44
+ %48 = add nuw nsw i32 %36, 36
45
+ %49 = add nuw nsw i32 %48, %25
46
+ %50 = zext nneg i32 %49 to i64
47
+ %51 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %50
48
+ %52 = mul nuw nsw i32 %21, 12
49
+ %53 = add nuw nsw i32 %52, %22
50
+ %54 = zext nneg i32 %53 to i64
51
+ %55 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %54
52
+ %56 = getelementptr float, ptr addrspace(3) @global_smem, i64 %38
53
+ %57 = getelementptr float, ptr addrspace(3) @global_smem, i64 %42
54
+ %58 = getelementptr float, ptr addrspace(3) @global_smem, i64 %46
55
+ %59 = getelementptr float, ptr addrspace(3) @global_smem, i64 %50
56
+ %60 = getelementptr float, ptr addrspace(3) @global_smem, i64 %54
57
+ %61 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 1
58
+ %62 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 2
59
+ %63 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 3
60
+ br label %64, !dbg !16
61
+
62
+ 64: ; preds = %12, %64
63
+ %65 = phi i32 [ 0, %12 ], [ %205, %64 ]
64
+ %66 = phi <8 x float> [ zeroinitializer, %12 ], [ %204, %64 ]
65
+ %67 = or i32 %65, %22, !dbg !17
66
+ %68 = or i32 %65, %25, !dbg !17
67
+ %69 = shl i32 %68, 8, !dbg !18
68
+ %70 = add i32 %35, %69, !dbg !19
69
+ %71 = sext i32 %70 to i64, !dbg !20
70
+ %72 = getelementptr i16, ptr addrspace(1) %0, i64 %71, !dbg !20
71
+ %73 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %72, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
72
+ %74 = extractvalue { i32, i32 } %73, 0, !dbg !21
73
+ %75 = extractvalue { i32, i32 } %73, 1, !dbg !21
74
+ %76 = trunc i32 %74 to i16, !dbg !21
75
+ %extelt.offset = lshr i32 %74, 16, !dbg !21
76
+ %77 = trunc i32 %extelt.offset to i16, !dbg !21
77
+ %78 = trunc i32 %75 to i16, !dbg !21
78
+ %extelt.offset1 = lshr i32 %75, 16, !dbg !21
79
+ %79 = trunc i32 %extelt.offset1 to i16, !dbg !21
80
+ tail call void @llvm.nvvm.barrier0(), !dbg !22
81
+ %80 = insertelement <1 x i16> undef, i16 %76, i64 0, !dbg !22
82
+ store <1 x i16> %80, ptr addrspace(3) %39, align 2, !dbg !22
83
+ %81 = insertelement <1 x i16> undef, i16 %77, i64 0, !dbg !22
84
+ store <1 x i16> %81, ptr addrspace(3) %43, align 2, !dbg !22
85
+ %82 = insertelement <1 x i16> undef, i16 %78, i64 0, !dbg !22
86
+ store <1 x i16> %82, ptr addrspace(3) %47, align 2, !dbg !22
87
+ %83 = insertelement <1 x i16> undef, i16 %79, i64 0, !dbg !22
88
+ store <1 x i16> %83, ptr addrspace(3) %51, align 2, !dbg !22
89
+ tail call void @llvm.nvvm.barrier0(), !dbg !22
90
+ %84 = load i16, ptr addrspace(3) %55, align 8, !dbg !22
91
+ %85 = load i16, ptr addrspace(3) %61, align 2, !dbg !22
92
+ %86 = load i16, ptr addrspace(3) %62, align 4, !dbg !22
93
+ %87 = load i16, ptr addrspace(3) %63, align 2, !dbg !22
94
+ %88 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %84) #3, !dbg !22
95
+ %89 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #3, !dbg !22
96
+ %90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %86) #3, !dbg !22
97
+ %91 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %87) #3, !dbg !22
98
+ %92 = getelementptr float, ptr addrspace(1) %1, i64 %71, !dbg !23
99
+ %93 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %92, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !24
100
+ %94 = extractvalue { i32, i32, i32, i32 } %93, 0, !dbg !24
101
+ %95 = extractvalue { i32, i32, i32, i32 } %93, 1, !dbg !24
102
+ %96 = extractvalue { i32, i32, i32, i32 } %93, 2, !dbg !24
103
+ %97 = extractvalue { i32, i32, i32, i32 } %93, 3, !dbg !24
104
+ %98 = bitcast i32 %94 to float, !dbg !24
105
+ %99 = bitcast i32 %95 to float, !dbg !24
106
+ %100 = bitcast i32 %96 to float, !dbg !24
107
+ %101 = bitcast i32 %97 to float, !dbg !24
108
+ tail call void @llvm.nvvm.barrier0(), !dbg !24
109
+ %102 = insertelement <1 x float> undef, float %98, i64 0, !dbg !24
110
+ store <1 x float> %102, ptr addrspace(3) %56, align 4, !dbg !24
111
+ %103 = insertelement <1 x float> undef, float %99, i64 0, !dbg !24
112
+ store <1 x float> %103, ptr addrspace(3) %57, align 4, !dbg !24
113
+ %104 = insertelement <1 x float> undef, float %100, i64 0, !dbg !24
114
+ store <1 x float> %104, ptr addrspace(3) %58, align 4, !dbg !24
115
+ %105 = insertelement <1 x float> undef, float %101, i64 0, !dbg !24
116
+ store <1 x float> %105, ptr addrspace(3) %59, align 4, !dbg !24
117
+ tail call void @llvm.nvvm.barrier0(), !dbg !24
118
+ %106 = load <4 x float>, ptr addrspace(3) %60, align 16, !dbg !24
119
+ %107 = getelementptr i16, ptr addrspace(1) %2, i64 %71, !dbg !25
120
+ %108 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %107, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !26
121
+ %109 = extractvalue { i32, i32 } %108, 0, !dbg !26
122
+ %110 = extractvalue { i32, i32 } %108, 1, !dbg !26
123
+ %111 = trunc i32 %109 to i16, !dbg !26
124
+ %extelt.offset2 = lshr i32 %109, 16, !dbg !26
125
+ %112 = trunc i32 %extelt.offset2 to i16, !dbg !26
126
+ %113 = trunc i32 %110 to i16, !dbg !26
127
+ %extelt.offset3 = lshr i32 %110, 16, !dbg !26
128
+ %114 = trunc i32 %extelt.offset3 to i16, !dbg !26
129
+ %115 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %111) #3, !dbg !27
130
+ %116 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %112) #3, !dbg !27
131
+ %117 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %113) #3, !dbg !27
132
+ %118 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %114) #3, !dbg !27
133
+ %119 = add i32 %67, %34, !dbg !28
134
+ %120 = sext i32 %119 to i64, !dbg !29
135
+ %121 = getelementptr float, ptr addrspace(1) %3, i64 %120, !dbg !29
136
+ %122 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %121, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !30
137
+ %123 = extractvalue { i32, i32, i32, i32 } %122, 0, !dbg !30
138
+ %124 = extractvalue { i32, i32, i32, i32 } %122, 1, !dbg !30
139
+ %125 = extractvalue { i32, i32, i32, i32 } %122, 2, !dbg !30
140
+ %126 = extractvalue { i32, i32, i32, i32 } %122, 3, !dbg !30
141
+ %127 = getelementptr float, ptr addrspace(1) %4, i64 %120, !dbg !31
142
+ %128 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %127, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !32
143
+ %129 = extractvalue { i32, i32, i32, i32 } %128, 0, !dbg !32
144
+ %130 = extractvalue { i32, i32, i32, i32 } %128, 1, !dbg !32
145
+ %131 = extractvalue { i32, i32, i32, i32 } %128, 2, !dbg !32
146
+ %132 = extractvalue { i32, i32, i32, i32 } %128, 3, !dbg !32
147
+ %133 = getelementptr i16, ptr addrspace(1) %5, i64 %71, !dbg !33
148
+ %134 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %133, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !34
149
+ %135 = extractvalue { i32, i32 } %134, 0, !dbg !34
150
+ %136 = extractvalue { i32, i32 } %134, 1, !dbg !34
151
+ %137 = trunc i32 %135 to i16, !dbg !34
152
+ %extelt.offset4 = lshr i32 %135, 16, !dbg !34
153
+ %138 = trunc i32 %extelt.offset4 to i16, !dbg !34
154
+ %139 = trunc i32 %136 to i16, !dbg !34
155
+ %extelt.offset5 = lshr i32 %136, 16, !dbg !34
156
+ %140 = trunc i32 %extelt.offset5 to i16, !dbg !34
157
+ tail call void @llvm.nvvm.barrier0(), !dbg !35
158
+ %141 = insertelement <1 x i16> undef, i16 %137, i64 0, !dbg !35
159
+ store <1 x i16> %141, ptr addrspace(3) %39, align 2, !dbg !35
160
+ %142 = insertelement <1 x i16> undef, i16 %138, i64 0, !dbg !35
161
+ store <1 x i16> %142, ptr addrspace(3) %43, align 2, !dbg !35
162
+ %143 = insertelement <1 x i16> undef, i16 %139, i64 0, !dbg !35
163
+ store <1 x i16> %143, ptr addrspace(3) %47, align 2, !dbg !35
164
+ %144 = insertelement <1 x i16> undef, i16 %140, i64 0, !dbg !35
165
+ store <1 x i16> %144, ptr addrspace(3) %51, align 2, !dbg !35
166
+ tail call void @llvm.nvvm.barrier0(), !dbg !35
167
+ %145 = load i16, ptr addrspace(3) %55, align 8, !dbg !35
168
+ %146 = load i16, ptr addrspace(3) %61, align 2, !dbg !35
169
+ %147 = load i16, ptr addrspace(3) %62, align 4, !dbg !35
170
+ %148 = load i16, ptr addrspace(3) %63, align 2, !dbg !35
171
+ %149 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %145) #3, !dbg !35
172
+ %150 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %146) #3, !dbg !35
173
+ %151 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %147) #3, !dbg !35
174
+ %152 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %148) #3, !dbg !35
175
+ %153 = getelementptr float, ptr addrspace(1) %6, i64 %120, !dbg !36
176
+ %154 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %153, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !37
177
+ %155 = extractvalue { i32, i32, i32, i32 } %154, 0, !dbg !37
178
+ %156 = extractvalue { i32, i32, i32, i32 } %154, 1, !dbg !37
179
+ %157 = extractvalue { i32, i32, i32, i32 } %154, 2, !dbg !37
180
+ %158 = extractvalue { i32, i32, i32, i32 } %154, 3, !dbg !37
181
+ %159 = getelementptr float, ptr addrspace(1) %7, i64 %120, !dbg !38
182
+ %160 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %159, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !39
183
+ %161 = extractvalue { i32, i32, i32, i32 } %160, 0, !dbg !39
184
+ %162 = extractvalue { i32, i32, i32, i32 } %160, 1, !dbg !39
185
+ %163 = extractvalue { i32, i32, i32, i32 } %160, 2, !dbg !39
186
+ %164 = extractvalue { i32, i32, i32, i32 } %160, 3, !dbg !39
187
+ %165 = fadd float %115, %98, !dbg !40
188
+ %166 = fadd float %116, %99, !dbg !40
189
+ %167 = fadd float %117, %100, !dbg !40
190
+ %168 = fadd float %118, %101, !dbg !40
191
+ tail call void @llvm.nvvm.barrier0(), !dbg !40
192
+ %169 = insertelement <1 x float> undef, float %165, i64 0, !dbg !40
193
+ store <1 x float> %169, ptr addrspace(3) %56, align 4, !dbg !40
194
+ %170 = insertelement <1 x float> undef, float %166, i64 0, !dbg !40
195
+ store <1 x float> %170, ptr addrspace(3) %57, align 4, !dbg !40
196
+ %171 = insertelement <1 x float> undef, float %167, i64 0, !dbg !40
197
+ store <1 x float> %171, ptr addrspace(3) %58, align 4, !dbg !40
198
+ %172 = insertelement <1 x float> undef, float %168, i64 0, !dbg !40
199
+ store <1 x float> %172, ptr addrspace(3) %59, align 4, !dbg !40
200
+ tail call void @llvm.nvvm.barrier0(), !dbg !40
201
+ %173 = load <4 x float>, ptr addrspace(3) %60, align 16, !dbg !40
202
+ %174 = insertelement <8 x i32> poison, i32 %155, i64 0, !dbg !37
203
+ %175 = insertelement <8 x i32> %174, i32 %156, i64 1, !dbg !37
204
+ %176 = insertelement <8 x i32> %175, i32 %157, i64 2, !dbg !37
205
+ %177 = insertelement <8 x i32> %176, i32 %158, i64 3, !dbg !37
206
+ %178 = insertelement <8 x i32> %177, i32 %123, i64 4, !dbg !37
207
+ %179 = insertelement <8 x i32> %178, i32 %124, i64 5, !dbg !37
208
+ %180 = insertelement <8 x i32> %179, i32 %125, i64 6, !dbg !37
209
+ %181 = insertelement <8 x i32> %180, i32 %126, i64 7, !dbg !37
210
+ %182 = bitcast <8 x i32> %181 to <8 x float>, !dbg !37
211
+ %183 = insertelement <8 x i32> poison, i32 %161, i64 0, !dbg !39
212
+ %184 = insertelement <8 x i32> %183, i32 %162, i64 1, !dbg !39
213
+ %185 = insertelement <8 x i32> %184, i32 %163, i64 2, !dbg !39
214
+ %186 = insertelement <8 x i32> %185, i32 %164, i64 3, !dbg !39
215
+ %187 = insertelement <8 x i32> %186, i32 %129, i64 4, !dbg !39
216
+ %188 = insertelement <8 x i32> %187, i32 %130, i64 5, !dbg !39
217
+ %189 = insertelement <8 x i32> %188, i32 %131, i64 6, !dbg !39
218
+ %190 = insertelement <8 x i32> %189, i32 %132, i64 7, !dbg !39
219
+ %191 = bitcast <8 x i32> %190 to <8 x float>, !dbg !39
220
+ %192 = shufflevector <4 x float> %106, <4 x float> %173, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, !dbg !41
221
+ %193 = fsub <8 x float> %192, %182, !dbg !41
222
+ %194 = fmul <8 x float> %193, %191, !dbg !42
223
+ %195 = insertelement <8 x float> poison, float %149, i64 0, !dbg !43
224
+ %196 = insertelement <8 x float> %195, float %150, i64 1, !dbg !43
225
+ %197 = insertelement <8 x float> %196, float %151, i64 2, !dbg !43
226
+ %198 = insertelement <8 x float> %197, float %152, i64 3, !dbg !43
227
+ %199 = insertelement <8 x float> %198, float %88, i64 4, !dbg !43
228
+ %200 = insertelement <8 x float> %199, float %89, i64 5, !dbg !43
229
+ %201 = insertelement <8 x float> %200, float %90, i64 6, !dbg !43
230
+ %202 = insertelement <8 x float> %201, float %91, i64 7, !dbg !43
231
+ %203 = fmul <8 x float> %202, %194, !dbg !43
232
+ %204 = fadd <8 x float> %66, %203, !dbg !44
233
+ %205 = add nuw nsw i32 %65, 8, !dbg !16
234
+ %206 = icmp ult i32 %65, 120, !dbg !16
235
+ br i1 %206, label %64, label %207, !dbg !16
236
+
237
+ 207: ; preds = %64
238
+ %208 = and i32 %13, 63, !dbg !8
239
+ %209 = or i32 %27, %208, !dbg !12
240
+ %shift = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>, !dbg !45
241
+ %210 = fadd <8 x float> %204, %shift, !dbg !45
242
+ %shift28 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 6, i32 poison, i32 poison, i32 poison>, !dbg !45
243
+ %211 = fadd <8 x float> %shift28, %210, !dbg !45
244
+ %shift29 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison, i32 poison, i32 poison>, !dbg !45
245
+ %212 = fadd <8 x float> %shift29, %211, !dbg !45
246
+ %213 = extractelement <8 x float> %212, i64 4, !dbg !45
247
+ %214 = bitcast float %213 to i32, !dbg !51
248
+ %215 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 1, i32 31), !dbg !51
249
+ %216 = bitcast i32 %215 to float, !dbg !51
250
+ %217 = fadd float %213, %216, !dbg !45
251
+ tail call void @llvm.nvvm.barrier0(), !dbg !53
252
+ %218 = zext nneg i32 %21 to i64, !dbg !53
253
+ %219 = getelementptr float, ptr addrspace(3) @global_smem, i64 %218, !dbg !53
254
+ %220 = insertelement <1 x float> undef, float %217, i64 0, !dbg !53
255
+ store <1 x float> %220, ptr addrspace(3) %219, align 4, !dbg !53
256
+ tail call void @llvm.nvvm.barrier0(), !dbg !53
257
+ %221 = zext nneg i32 %208 to i64, !dbg !53
258
+ %222 = getelementptr float, ptr addrspace(3) @global_smem, i64 %221, !dbg !53
259
+ %223 = load i32, ptr addrspace(3) %222, align 4, !dbg !53
260
+ %224 = sext i32 %209 to i64, !dbg !54
261
+ %225 = getelementptr float, ptr addrspace(1) %8, i64 %224, !dbg !54
262
+ %226 = and i32 %13, 64, !dbg !55
263
+ %227 = icmp eq i32 %226, 0, !dbg !55
264
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %223, ptr addrspace(1) %225, i1 %227) #3, !dbg !55
265
+ %shift30 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !56
266
+ %228 = fadd <8 x float> %204, %shift30, !dbg !56
267
+ %shift31 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !56
268
+ %229 = fadd <8 x float> %shift31, %228, !dbg !56
269
+ %shift32 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !56
270
+ %230 = fadd <8 x float> %shift32, %229, !dbg !56
271
+ %231 = extractelement <8 x float> %230, i64 0, !dbg !56
272
+ %232 = bitcast float %231 to i32, !dbg !59
273
+ %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !59
274
+ %234 = bitcast i32 %233 to float, !dbg !59
275
+ %235 = fadd float %231, %234, !dbg !56
276
+ tail call void @llvm.nvvm.barrier0(), !dbg !61
277
+ %236 = insertelement <1 x float> undef, float %235, i64 0, !dbg !61
278
+ store <1 x float> %236, ptr addrspace(3) %219, align 4, !dbg !61
279
+ tail call void @llvm.nvvm.barrier0(), !dbg !61
280
+ %237 = load i32, ptr addrspace(3) %222, align 4, !dbg !61
281
+ %238 = getelementptr float, ptr addrspace(1) %9, i64 %224, !dbg !62
282
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %237, ptr addrspace(1) %238, i1 %227) #3, !dbg !63
283
+ ret void, !dbg !64
284
+ }
285
+
286
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
287
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
288
+
289
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
290
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
291
+
292
+ ; Function Attrs: convergent nocallback nounwind
293
+ declare void @llvm.nvvm.barrier0() #2
294
+
295
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
296
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
297
+ attributes #2 = { convergent nocallback nounwind }
298
+ attributes #3 = { nounwind }
299
+
300
+ !llvm.module.flags = !{!0}
301
+ !llvm.dbg.cu = !{!1}
302
+ !nvvm.annotations = !{!3, !4, !4, !3}
303
+
304
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
305
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
306
+ !2 = !DIFile(filename: "c3xxszvgtfnjb7welqvr33z4cqouxhqjy3dpwa2qmmx2xto6sgvz.py", directory: "/tmp/torchinductor_root/3x")
307
+ !3 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10de11de, !"kernel", i32 1}
308
+ !4 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10de11de, !"maxntidx", i32 128}
309
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8d9d10de11de", linkageName: "triton__0d1d2d3d4d5d6d7d8d9d10de11de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
310
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
311
+ !7 = !{}
312
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
313
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
314
+ !10 = !DILocation(line: 21, column: 28, scope: !5)
315
+ !11 = !DILocation(line: 21, column: 33, scope: !5)
316
+ !12 = !DILocation(line: 22, column: 23, scope: !5)
317
+ !13 = !DILocation(line: 26, column: 20, scope: !5)
318
+ !14 = !DILocation(line: 34, column: 57, scope: !5)
319
+ !15 = !DILocation(line: 37, column: 44, scope: !5)
320
+ !16 = !DILocation(line: 30, column: 36, scope: !5)
321
+ !17 = !DILocation(line: 31, column: 27, scope: !5)
322
+ !18 = !DILocation(line: 34, column: 44, scope: !5)
323
+ !19 = !DILocation(line: 34, column: 51, scope: !5)
324
+ !20 = !DILocation(line: 34, column: 34, scope: !5)
325
+ !21 = !DILocation(line: 34, column: 63, scope: !5)
326
+ !22 = !DILocation(line: 34, column: 115, scope: !5)
327
+ !23 = !DILocation(line: 35, column: 34, scope: !5)
328
+ !24 = !DILocation(line: 35, column: 63, scope: !5)
329
+ !25 = !DILocation(line: 36, column: 34, scope: !5)
330
+ !26 = !DILocation(line: 36, column: 63, scope: !5)
331
+ !27 = !DILocation(line: 36, column: 115, scope: !5)
332
+ !28 = !DILocation(line: 37, column: 40, scope: !5)
333
+ !29 = !DILocation(line: 37, column: 34, scope: !5)
334
+ !30 = !DILocation(line: 37, column: 50, scope: !5)
335
+ !31 = !DILocation(line: 38, column: 34, scope: !5)
336
+ !32 = !DILocation(line: 38, column: 50, scope: !5)
337
+ !33 = !DILocation(line: 39, column: 35, scope: !5)
338
+ !34 = !DILocation(line: 39, column: 64, scope: !5)
339
+ !35 = !DILocation(line: 39, column: 116, scope: !5)
340
+ !36 = !DILocation(line: 40, column: 35, scope: !5)
341
+ !37 = !DILocation(line: 40, column: 51, scope: !5)
342
+ !38 = !DILocation(line: 41, column: 35, scope: !5)
343
+ !39 = !DILocation(line: 41, column: 51, scope: !5)
344
+ !40 = !DILocation(line: 44, column: 22, scope: !5)
345
+ !41 = !DILocation(line: 52, column: 23, scope: !5)
346
+ !42 = !DILocation(line: 53, column: 24, scope: !5)
347
+ !43 = !DILocation(line: 54, column: 24, scope: !5)
348
+ !44 = !DILocation(line: 57, column: 40, scope: !5)
349
+ !45 = !DILocation(line: 233, column: 15, scope: !46, inlinedAt: !49)
350
+ !46 = distinct !DILexicalBlockFile(scope: !48, file: !47, discriminator: 0)
351
+ !47 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
352
+ !48 = distinct !DILexicalBlockFile(scope: !5, file: !47, discriminator: 0)
353
+ !49 = !DILocation(line: 243, column: 36, scope: !46, inlinedAt: !50)
354
+ !50 = !DILocation(line: 58, column: 27, scope: !46)
355
+ !51 = !DILocation(line: 243, column: 36, scope: !48, inlinedAt: !52)
356
+ !52 = !DILocation(line: 58, column: 27, scope: !48)
357
+ !53 = !DILocation(line: 58, column: 30, scope: !5)
358
+ !54 = !DILocation(line: 59, column: 25, scope: !5)
359
+ !55 = !DILocation(line: 59, column: 37, scope: !5)
360
+ !56 = !DILocation(line: 233, column: 15, scope: !46, inlinedAt: !57)
361
+ !57 = !DILocation(line: 243, column: 36, scope: !46, inlinedAt: !58)
362
+ !58 = !DILocation(line: 60, column: 27, scope: !46)
363
+ !59 = !DILocation(line: 243, column: 36, scope: !48, inlinedAt: !60)
364
+ !60 = !DILocation(line: 60, column: 27, scope: !48)
365
+ !61 = !DILocation(line: 60, column: 30, scope: !5)
366
+ !62 = !DILocation(line: 61, column: 25, scope: !5)
367
+ !63 = !DILocation(line: 61, column: 37, scope: !5)
368
+ !64 = !DILocation(line: 61, column: 4, scope: !5)
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ptx ADDED
@@ -0,0 +1,771 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7d8d9d10de11de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3d4d5d6d7d8d9d10de11de(
13
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0,
14
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6,
20
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7,
21
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8,
22
+ .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9,
23
+ .param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_10,
24
+ .param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_11
25
+ )
26
+ .maxntid 128, 1, 1
27
+ {
28
+ .reg .pred %p<38>;
29
+ .reg .b16 %rs<13>;
30
+ .reg .b32 %r<135>;
31
+ .reg .f32 %f<103>;
32
+ .reg .b64 %rd<41>;
33
+ .loc 1 18 0
34
+ $L__func_begin0:
35
+ .loc 1 18 0
36
+
37
+ ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9];
38
+ ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8];
39
+ ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5];
40
+ ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2];
41
+ ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1];
42
+ ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0];
43
+ $L__tmp0:
44
+ .loc 1 22 44
45
+ mov.u32 %r1, %tid.x;
46
+ ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3];
47
+ shl.b32 %r17, %r1, 2;
48
+ ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4];
49
+ and.b32 %r18, %r17, 60;
50
+ bfe.u32 %r19, %r1, 5, 2;
51
+ ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6];
52
+ bfe.u32 %r20, %r1, 1, 4;
53
+ ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7];
54
+ shl.b32 %r21, %r19, 4;
55
+ or.b32 %r2, %r21, %r20;
56
+ .loc 1 24 33
57
+ and.b32 %r22, %r17, 4;
58
+ bfe.u32 %r23, %r1, 4, 1;
59
+ shl.b32 %r24, %r19, 1;
60
+ or.b32 %r25, %r24, %r23;
61
+ .loc 1 21 28
62
+ mov.u32 %r15, %ctaid.x;
63
+ .loc 1 21 33
64
+ shl.b32 %r3, %r15, 6;
65
+ .loc 1 22 23
66
+ or.b32 %r26, %r3, %r18;
67
+ or.b32 %r27, %r3, %r2;
68
+ .loc 1 26 20
69
+ shr.s32 %r29, %r26, 31;
70
+ shr.u32 %r30, %r29, 24;
71
+ add.s32 %r31, %r26, %r30;
72
+ shr.s32 %r32, %r31, 8;
73
+ bfe.s32 %r33, %r15, 25, 1;
74
+ shr.u32 %r34, %r33, 24;
75
+ add.s32 %r35, %r27, %r34;
76
+ shr.s32 %r36, %r35, 8;
77
+ .loc 1 37 44
78
+ shl.b32 %r37, %r36, 7;
79
+ mul.lo.s32 %r38, %r18, 12;
80
+ or.b32 %r39, %r25, %r38;
81
+ shl.b32 %r40, %r39, 1;
82
+ mov.u32 %r41, global_smem;
83
+ add.s32 %r4, %r41, %r40;
84
+ mad.lo.s32 %r42, %r2, 12, %r22;
85
+ shl.b32 %r43, %r42, 1;
86
+ add.s32 %r6, %r41, %r43;
87
+ shl.b32 %r44, %r39, 2;
88
+ add.s32 %r7, %r41, %r44;
89
+ shl.b32 %r45, %r42, 2;
90
+ add.s32 %r9, %r41, %r45;
91
+ .loc 1 30 36
92
+ mad.lo.s32 %r46, %r32, 32512, %r26;
93
+ shl.b32 %r47, %r19, 9;
94
+ add.s32 %r48, %r46, %r47;
95
+ shl.b32 %r49, %r23, 8;
96
+ add.s32 %r133, %r48, %r49;
97
+ or.b32 %r50, %r37, %r22;
98
+ mul.wide.s32 %rd23, %r50, 4;
99
+ add.s64 %rd40, %rd22, %rd23;
100
+ add.s64 %rd39, %rd21, %rd23;
101
+ add.s64 %rd38, %rd20, %rd23;
102
+ add.s64 %rd37, %rd19, %rd23;
103
+ mov.f32 %f95, 0f00000000;
104
+ mov.b32 %r134, -8;
105
+ mov.pred %p1, -1;
106
+ mov.f32 %f96, %f95;
107
+ mov.f32 %f97, %f95;
108
+ mov.f32 %f98, %f95;
109
+ mov.f32 %f99, %f95;
110
+ mov.f32 %f100, %f95;
111
+ mov.f32 %f101, %f95;
112
+ mov.f32 %f102, %f95;
113
+ $L__BB0_1:
114
+ .loc 1 34 34
115
+ mul.wide.s32 %rd32, %r133, 2;
116
+ add.s64 %rd24, %rd13, %rd32;
117
+ mov.b32 %r53, 0;
118
+ .loc 1 34 63
119
+ mov.u32 %r51, 0x0;
120
+ mov.u32 %r52, 0x0;
121
+ @%p1 ld.global.L1::evict_first.v2.b32 { %r51, %r52 }, [ %rd24 + 0 ];
122
+ @!%p1 mov.u32 %r51, %r53;
123
+ @!%p1 mov.u32 %r52, %r53;
124
+ shr.u32 %r115, %r51, 16;
125
+ shr.u32 %r116, %r52, 16;
126
+ .loc 1 34 115
127
+ bar.sync 0;
128
+ st.shared.u16 [%r4], %r51;
129
+ st.shared.u16 [%r4+24], %r115;
130
+ st.shared.u16 [%r4+48], %r52;
131
+ st.shared.u16 [%r4+72], %r116;
132
+ bar.sync 0;
133
+ ld.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%r6];
134
+ cvt.f32.bf16 %r55, %rs1;
135
+ mov.b32 %f25, %r55;
136
+ cvt.f32.bf16 %r56, %rs2;
137
+ mov.b32 %f26, %r56;
138
+ cvt.f32.bf16 %r57, %rs3;
139
+ mov.b32 %f27, %r57;
140
+ cvt.f32.bf16 %r58, %rs4;
141
+ mov.b32 %f28, %r58;
142
+ .loc 1 35 34
143
+ mul.wide.s32 %rd33, %r133, 4;
144
+ add.s64 %rd25, %rd14, %rd33;
145
+ .loc 1 35 63
146
+ mov.u32 %r59, 0x0;
147
+ mov.u32 %r60, 0x0;
148
+ mov.u32 %r61, 0x0;
149
+ mov.u32 %r62, 0x0;
150
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r59, %r60, %r61, %r62 }, [ %rd25 + 0 ];
151
+ @!%p1 mov.u32 %r59, %r53;
152
+ @!%p1 mov.u32 %r60, %r53;
153
+ @!%p1 mov.u32 %r61, %r53;
154
+ @!%p1 mov.u32 %r62, %r53;
155
+ mov.b32 %f29, %r59;
156
+ mov.b32 %f30, %r60;
157
+ mov.b32 %f31, %r61;
158
+ mov.b32 %f32, %r62;
159
+ bar.sync 0;
160
+ st.shared.u32 [%r7], %r59;
161
+ st.shared.u32 [%r7+48], %r60;
162
+ st.shared.u32 [%r7+96], %r61;
163
+ st.shared.u32 [%r7+144], %r62;
164
+ bar.sync 0;
165
+ ld.shared.v4.f32 {%f33, %f34, %f35, %f36}, [%r9];
166
+ .loc 1 36 34
167
+ add.s64 %rd26, %rd15, %rd32;
168
+ .loc 1 36 63
169
+ mov.u32 %r67, 0x0;
170
+ mov.u32 %r68, 0x0;
171
+ @%p1 ld.global.L1::evict_first.v2.b32 { %r67, %r68 }, [ %rd26 + 0 ];
172
+ @!%p1 mov.u32 %r67, %r53;
173
+ @!%p1 mov.u32 %r68, %r53;
174
+ cvt.u16.u32 %rs5, %r67;
175
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r67; }
176
+ cvt.u16.u32 %rs7, %r68;
177
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r68; }
178
+ .loc 1 36 115
179
+ cvt.f32.bf16 %r71, %rs5;
180
+ mov.b32 %f37, %r71;
181
+ cvt.f32.bf16 %r72, %rs6;
182
+ mov.b32 %f38, %r72;
183
+ cvt.f32.bf16 %r73, %rs7;
184
+ mov.b32 %f39, %r73;
185
+ cvt.f32.bf16 %r74, %rs8;
186
+ mov.b32 %f40, %r74;
187
+ .loc 1 37 50
188
+ mov.u32 %r75, 0x0;
189
+ mov.u32 %r76, 0x0;
190
+ mov.u32 %r77, 0x0;
191
+ mov.u32 %r78, 0x0;
192
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r75, %r76, %r77, %r78 }, [ %rd37 + 0 ];
193
+ @!%p1 mov.u32 %r75, %r53;
194
+ @!%p1 mov.u32 %r76, %r53;
195
+ @!%p1 mov.u32 %r77, %r53;
196
+ @!%p1 mov.u32 %r78, %r53;
197
+ .loc 1 38 50
198
+ mov.u32 %r83, 0x0;
199
+ mov.u32 %r84, 0x0;
200
+ mov.u32 %r85, 0x0;
201
+ mov.u32 %r86, 0x0;
202
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r83, %r84, %r85, %r86 }, [ %rd38 + 0 ];
203
+ @!%p1 mov.u32 %r83, %r53;
204
+ @!%p1 mov.u32 %r84, %r53;
205
+ @!%p1 mov.u32 %r85, %r53;
206
+ @!%p1 mov.u32 %r86, %r53;
207
+ .loc 1 39 35
208
+ add.s64 %rd29, %rd16, %rd32;
209
+ .loc 1 39 64
210
+ mov.u32 %r91, 0x0;
211
+ mov.u32 %r92, 0x0;
212
+ @%p1 ld.global.L1::evict_first.v2.b32 { %r91, %r92 }, [ %rd29 + 0 ];
213
+ @!%p1 mov.u32 %r91, %r53;
214
+ @!%p1 mov.u32 %r92, %r53;
215
+ shr.u32 %r117, %r91, 16;
216
+ shr.u32 %r118, %r92, 16;
217
+ .loc 1 39 116
218
+ bar.sync 0;
219
+ st.shared.u16 [%r4], %r91;
220
+ st.shared.u16 [%r4+24], %r117;
221
+ st.shared.u16 [%r4+48], %r92;
222
+ st.shared.u16 [%r4+72], %r118;
223
+ bar.sync 0;
224
+ ld.shared.v4.u16 {%rs9, %rs10, %rs11, %rs12}, [%r6];
225
+ cvt.f32.bf16 %r95, %rs9;
226
+ mov.b32 %f41, %r95;
227
+ cvt.f32.bf16 %r96, %rs10;
228
+ mov.b32 %f42, %r96;
229
+ cvt.f32.bf16 %r97, %rs11;
230
+ mov.b32 %f43, %r97;
231
+ cvt.f32.bf16 %r98, %rs12;
232
+ mov.b32 %f44, %r98;
233
+ .loc 1 40 51
234
+ mov.u32 %r99, 0x0;
235
+ mov.u32 %r100, 0x0;
236
+ mov.u32 %r101, 0x0;
237
+ mov.u32 %r102, 0x0;
238
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r99, %r100, %r101, %r102 }, [ %rd39 + 0 ];
239
+ @!%p1 mov.u32 %r99, %r53;
240
+ @!%p1 mov.u32 %r100, %r53;
241
+ @!%p1 mov.u32 %r101, %r53;
242
+ @!%p1 mov.u32 %r102, %r53;
243
+ .loc 1 41 51
244
+ mov.u32 %r107, 0x0;
245
+ mov.u32 %r108, 0x0;
246
+ mov.u32 %r109, 0x0;
247
+ mov.u32 %r110, 0x0;
248
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r107, %r108, %r109, %r110 }, [ %rd40 + 0 ];
249
+ @!%p1 mov.u32 %r107, %r53;
250
+ @!%p1 mov.u32 %r108, %r53;
251
+ @!%p1 mov.u32 %r109, %r53;
252
+ @!%p1 mov.u32 %r110, %r53;
253
+ .loc 1 44 22
254
+ add.f32 %f45, %f37, %f29;
255
+ add.f32 %f46, %f38, %f30;
256
+ add.f32 %f47, %f39, %f31;
257
+ add.f32 %f48, %f40, %f32;
258
+ bar.sync 0;
259
+ st.shared.f32 [%r7], %f45;
260
+ st.shared.f32 [%r7+48], %f46;
261
+ st.shared.f32 [%r7+96], %f47;
262
+ st.shared.f32 [%r7+144], %f48;
263
+ bar.sync 0;
264
+ ld.shared.v4.f32 {%f49, %f50, %f51, %f52}, [%r9];
265
+ .loc 1 40 51
266
+ mov.b32 %f53, %r75;
267
+ mov.b32 %f54, %r76;
268
+ mov.b32 %f55, %r77;
269
+ mov.b32 %f56, %r78;
270
+ mov.b32 %f57, %r99;
271
+ mov.b32 %f58, %r100;
272
+ mov.b32 %f59, %r101;
273
+ mov.b32 %f60, %r102;
274
+ .loc 1 41 51
275
+ mov.b32 %f61, %r110;
276
+ mov.b32 %f62, %r109;
277
+ mov.b32 %f63, %r108;
278
+ mov.b32 %f64, %r107;
279
+ mov.b32 %f65, %r86;
280
+ mov.b32 %f66, %r85;
281
+ mov.b32 %f67, %r84;
282
+ mov.b32 %f68, %r83;
283
+ .loc 1 52 23
284
+ sub.f32 %f69, %f36, %f60;
285
+ sub.f32 %f70, %f35, %f59;
286
+ sub.f32 %f71, %f34, %f58;
287
+ sub.f32 %f72, %f33, %f57;
288
+ sub.f32 %f73, %f52, %f56;
289
+ sub.f32 %f74, %f51, %f55;
290
+ sub.f32 %f75, %f50, %f54;
291
+ sub.f32 %f76, %f49, %f53;
292
+ .loc 1 53 24
293
+ mul.f32 %f77, %f76, %f68;
294
+ mul.f32 %f78, %f75, %f67;
295
+ mul.f32 %f79, %f74, %f66;
296
+ mul.f32 %f80, %f73, %f65;
297
+ mul.f32 %f81, %f72, %f64;
298
+ mul.f32 %f82, %f71, %f63;
299
+ mul.f32 %f83, %f70, %f62;
300
+ mul.f32 %f84, %f69, %f61;
301
+ .loc 1 57 40
302
+ fma.rn.f32 %f98, %f44, %f84, %f98;
303
+ fma.rn.f32 %f97, %f43, %f83, %f97;
304
+ fma.rn.f32 %f96, %f42, %f82, %f96;
305
+ fma.rn.f32 %f95, %f41, %f81, %f95;
306
+ fma.rn.f32 %f102, %f28, %f80, %f102;
307
+ fma.rn.f32 %f101, %f27, %f79, %f101;
308
+ fma.rn.f32 %f100, %f26, %f78, %f100;
309
+ fma.rn.f32 %f99, %f25, %f77, %f99;
310
+ .loc 1 30 36
311
+ add.s32 %r134, %r134, 8;
312
+ add.s32 %r133, %r133, 2048;
313
+ add.s64 %rd40, %rd40, 32;
314
+ add.s64 %rd39, %rd39, 32;
315
+ add.s64 %rd38, %rd38, 32;
316
+ add.s64 %rd37, %rd37, 32;
317
+ setp.lt.u32 %p35, %r134, 120;
318
+ @%p35 bra $L__BB0_1;
319
+ .loc 1 22 44
320
+ and.b32 %r121, %r1, 63;
321
+ .loc 1 22 23
322
+ or.b32 %r122, %r3, %r121;
323
+ $L__tmp1:
324
+ .loc 2 233 15
325
+ add.f32 %f85, %f99, %f100;
326
+ add.f32 %f86, %f101, %f85;
327
+ add.f32 %f87, %f102, %f86;
328
+ $L__tmp2:
329
+ .loc 2 243 36
330
+ mov.b32 %r123, %f87;
331
+ shfl.sync.bfly.b32 %r124, %r123, 1, 31, -1;
332
+ mov.b32 %f88, %r124;
333
+ $L__tmp3:
334
+ .loc 2 233 15
335
+ add.f32 %f89, %f87, %f88;
336
+ $L__tmp4:
337
+ .loc 1 58 30
338
+ bar.sync 0;
339
+ shl.b32 %r125, %r2, 2;
340
+ add.s32 %r127, %r41, %r125;
341
+ st.shared.f32 [%r127], %f89;
342
+ bar.sync 0;
343
+ shl.b32 %r128, %r121, 2;
344
+ add.s32 %r129, %r41, %r128;
345
+ ld.shared.u32 %r119, [%r129];
346
+ .loc 1 59 25
347
+ mul.wide.s32 %rd36, %r122, 4;
348
+ add.s64 %rd34, %rd17, %rd36;
349
+ .loc 1 59 37
350
+ and.b32 %r130, %r1, 64;
351
+ setp.eq.s32 %p36, %r130, 0;
352
+ @%p36 st.global.b32 [ %rd34 + 0 ], { %r119 };
353
+ $L__tmp5:
354
+ .loc 2 233 15
355
+ add.f32 %f90, %f95, %f96;
356
+ add.f32 %f91, %f97, %f90;
357
+ add.f32 %f92, %f98, %f91;
358
+ $L__tmp6:
359
+ .loc 2 243 36
360
+ mov.b32 %r131, %f92;
361
+ shfl.sync.bfly.b32 %r132, %r131, 1, 31, -1;
362
+ mov.b32 %f93, %r132;
363
+ $L__tmp7:
364
+ .loc 2 233 15
365
+ add.f32 %f94, %f92, %f93;
366
+ $L__tmp8:
367
+ .loc 1 60 30
368
+ bar.sync 0;
369
+ st.shared.f32 [%r127], %f94;
370
+ bar.sync 0;
371
+ ld.shared.u32 %r120, [%r129];
372
+ .loc 1 61 25
373
+ add.s64 %rd35, %rd18, %rd36;
374
+ .loc 1 61 37
375
+ @%p36 st.global.b32 [ %rd35 + 0 ], { %r120 };
376
+ .loc 1 61 4
377
+ ret;
378
+ $L__tmp9:
379
+ $L__func_end0:
380
+
381
+ }
382
+ .file 1 "/tmp/torchinductor_root/3x/c3xxszvgtfnjb7welqvr33z4cqouxhqjy3dpwa2qmmx2xto6sgvz.py"
383
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
384
+ .section .debug_abbrev
385
+ {
386
+ .b8 1
387
+ .b8 17
388
+ .b8 1
389
+ .b8 37
390
+ .b8 8
391
+ .b8 19
392
+ .b8 5
393
+ .b8 3
394
+ .b8 8
395
+ .b8 16
396
+ .b8 6
397
+ .b8 27
398
+ .b8 8
399
+ .b8 180
400
+ .b8 66
401
+ .b8 12
402
+ .b8 17
403
+ .b8 1
404
+ .b8 18
405
+ .b8 1
406
+ .b8 0
407
+ .b8 0
408
+ .b8 2
409
+ .b8 46
410
+ .b8 0
411
+ .b8 135
412
+ .b8 64
413
+ .b8 8
414
+ .b8 3
415
+ .b8 8
416
+ .b8 58
417
+ .b8 11
418
+ .b8 59
419
+ .b8 11
420
+ .b8 63
421
+ .b8 12
422
+ .b8 32
423
+ .b8 11
424
+ .b8 0
425
+ .b8 0
426
+ .b8 3
427
+ .b8 46
428
+ .b8 1
429
+ .b8 17
430
+ .b8 1
431
+ .b8 18
432
+ .b8 1
433
+ .b8 64
434
+ .b8 10
435
+ .b8 49
436
+ .b8 19
437
+ .b8 0
438
+ .b8 0
439
+ .b8 4
440
+ .b8 29
441
+ .b8 1
442
+ .b8 49
443
+ .b8 19
444
+ .b8 17
445
+ .b8 1
446
+ .b8 18
447
+ .b8 1
448
+ .b8 88
449
+ .b8 11
450
+ .b8 89
451
+ .b8 11
452
+ .b8 87
453
+ .b8 11
454
+ .b8 0
455
+ .b8 0
456
+ .b8 5
457
+ .b8 29
458
+ .b8 0
459
+ .b8 49
460
+ .b8 19
461
+ .b8 17
462
+ .b8 1
463
+ .b8 18
464
+ .b8 1
465
+ .b8 88
466
+ .b8 11
467
+ .b8 89
468
+ .b8 11
469
+ .b8 87
470
+ .b8 11
471
+ .b8 0
472
+ .b8 0
473
+ .b8 0
474
+ }
475
+ .section .debug_info
476
+ {
477
+ .b32 371
478
+ .b8 2
479
+ .b8 0
480
+ .b32 .debug_abbrev
481
+ .b8 8
482
+ .b8 1
483
+ .b8 116
484
+ .b8 114
485
+ .b8 105
486
+ .b8 116
487
+ .b8 111
488
+ .b8 110
489
+ .b8 0
490
+ .b8 2
491
+ .b8 0
492
+ .b8 99
493
+ .b8 51
494
+ .b8 120
495
+ .b8 120
496
+ .b8 115
497
+ .b8 122
498
+ .b8 118
499
+ .b8 103
500
+ .b8 116
501
+ .b8 102
502
+ .b8 110
503
+ .b8 106
504
+ .b8 98
505
+ .b8 55
506
+ .b8 119
507
+ .b8 101
508
+ .b8 108
509
+ .b8 113
510
+ .b8 118
511
+ .b8 114
512
+ .b8 51
513
+ .b8 51
514
+ .b8 122
515
+ .b8 52
516
+ .b8 99
517
+ .b8 113
518
+ .b8 111
519
+ .b8 117
520
+ .b8 120
521
+ .b8 104
522
+ .b8 113
523
+ .b8 106
524
+ .b8 121
525
+ .b8 51
526
+ .b8 100
527
+ .b8 112
528
+ .b8 119
529
+ .b8 97
530
+ .b8 50
531
+ .b8 113
532
+ .b8 109
533
+ .b8 109
534
+ .b8 120
535
+ .b8 50
536
+ .b8 120
537
+ .b8 116
538
+ .b8 111
539
+ .b8 54
540
+ .b8 115
541
+ .b8 103
542
+ .b8 118
543
+ .b8 122
544
+ .b8 46
545
+ .b8 112
546
+ .b8 121
547
+ .b8 0
548
+ .b32 .debug_line
549
+ .b8 47
550
+ .b8 116
551
+ .b8 109
552
+ .b8 112
553
+ .b8 47
554
+ .b8 116
555
+ .b8 111
556
+ .b8 114
557
+ .b8 99
558
+ .b8 104
559
+ .b8 105
560
+ .b8 110
561
+ .b8 100
562
+ .b8 117
563
+ .b8 99
564
+ .b8 116
565
+ .b8 111
566
+ .b8 114
567
+ .b8 95
568
+ .b8 114
569
+ .b8 111
570
+ .b8 111
571
+ .b8 116
572
+ .b8 47
573
+ .b8 51
574
+ .b8 120
575
+ .b8 0
576
+ .b8 1
577
+ .b64 $L__func_begin0
578
+ .b64 $L__func_end0
579
+ .b8 2
580
+ .b8 116
581
+ .b8 114
582
+ .b8 105
583
+ .b8 116
584
+ .b8 111
585
+ .b8 110
586
+ .b8 95
587
+ .b8 95
588
+ .b8 48
589
+ .b8 100
590
+ .b8 49
591
+ .b8 100
592
+ .b8 50
593
+ .b8 100
594
+ .b8 51
595
+ .b8 100
596
+ .b8 52
597
+ .b8 100
598
+ .b8 53
599
+ .b8 100
600
+ .b8 54
601
+ .b8 100
602
+ .b8 55
603
+ .b8 100
604
+ .b8 56
605
+ .b8 100
606
+ .b8 57
607
+ .b8 100
608
+ .b8 49
609
+ .b8 48
610
+ .b8 100
611
+ .b8 101
612
+ .b8 49
613
+ .b8 49
614
+ .b8 100
615
+ .b8 101
616
+ .b8 0
617
+ .b8 116
618
+ .b8 114
619
+ .b8 105
620
+ .b8 116
621
+ .b8 111
622
+ .b8 110
623
+ .b8 95
624
+ .b8 95
625
+ .b8 48
626
+ .b8 100
627
+ .b8 49
628
+ .b8 100
629
+ .b8 50
630
+ .b8 100
631
+ .b8 51
632
+ .b8 100
633
+ .b8 52
634
+ .b8 100
635
+ .b8 53
636
+ .b8 100
637
+ .b8 54
638
+ .b8 100
639
+ .b8 55
640
+ .b8 100
641
+ .b8 56
642
+ .b8 100
643
+ .b8 57
644
+ .b8 100
645
+ .b8 49
646
+ .b8 48
647
+ .b8 100
648
+ .b8 101
649
+ .b8 49
650
+ .b8 49
651
+ .b8 100
652
+ .b8 101
653
+ .b8 0
654
+ .b8 1
655
+ .b8 18
656
+ .b8 1
657
+ .b8 1
658
+ .b8 3
659
+ .b64 $L__func_begin0
660
+ .b64 $L__func_end0
661
+ .b8 1
662
+ .b8 156
663
+ .b32 125
664
+ .b8 4
665
+ .b32 125
666
+ .b64 $L__tmp1
667
+ .b64 $L__tmp4
668
+ .b8 2
669
+ .b8 58
670
+ .b8 27
671
+ .b8 5
672
+ .b32 125
673
+ .b64 $L__tmp1
674
+ .b64 $L__tmp4
675
+ .b8 2
676
+ .b8 243
677
+ .b8 36
678
+ .b8 0
679
+ .b8 5
680
+ .b32 125
681
+ .b64 $L__tmp2
682
+ .b64 $L__tmp3
683
+ .b8 2
684
+ .b8 58
685
+ .b8 27
686
+ .b8 4
687
+ .b32 125
688
+ .b64 $L__tmp5
689
+ .b64 $L__tmp8
690
+ .b8 2
691
+ .b8 60
692
+ .b8 27
693
+ .b8 5
694
+ .b32 125
695
+ .b64 $L__tmp5
696
+ .b64 $L__tmp8
697
+ .b8 2
698
+ .b8 243
699
+ .b8 36
700
+ .b8 0
701
+ .b8 5
702
+ .b32 125
703
+ .b64 $L__tmp6
704
+ .b64 $L__tmp7
705
+ .b8 2
706
+ .b8 60
707
+ .b8 27
708
+ .b8 0
709
+ .b8 0
710
+ }
711
+ .section .debug_pubnames
712
+ {
713
+ .b32 $L__pubNames_end0-$L__pubNames_start0
714
+ $L__pubNames_start0:
715
+ .b8 2
716
+ .b8 0
717
+ .b32 .debug_info
718
+ .b32 375
719
+ .b32 125
720
+ .b8 116
721
+ .b8 114
722
+ .b8 105
723
+ .b8 116
724
+ .b8 111
725
+ .b8 110
726
+ .b8 95
727
+ .b8 95
728
+ .b8 48
729
+ .b8 100
730
+ .b8 49
731
+ .b8 100
732
+ .b8 50
733
+ .b8 100
734
+ .b8 51
735
+ .b8 100
736
+ .b8 52
737
+ .b8 100
738
+ .b8 53
739
+ .b8 100
740
+ .b8 54
741
+ .b8 100
742
+ .b8 55
743
+ .b8 100
744
+ .b8 56
745
+ .b8 100
746
+ .b8 57
747
+ .b8 100
748
+ .b8 49
749
+ .b8 48
750
+ .b8 100
751
+ .b8 101
752
+ .b8 49
753
+ .b8 49
754
+ .b8 100
755
+ .b8 101
756
+ .b8 0
757
+ .b32 0
758
+ $L__pubNames_end0:
759
+ }
760
+ .section .debug_pubtypes
761
+ {
762
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
763
+ $L__pubTypes_start0:
764
+ .b8 2
765
+ .b8 0
766
+ .b32 .debug_info
767
+ .b32 375
768
+ .b32 0
769
+ $L__pubTypes_end0:
770
+ }
771
+ .section .debug_loc { }
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttgir ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<256> : tensor<64x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<64x1xi32, #blocked1>
8
+ %cst_1 = arith.constant dense<128> : tensor<64x1xi32, #blocked1>
9
+ %cst_2 = arith.constant dense<32768> : tensor<64x1xi32, #blocked>
10
+ %cst_3 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
11
+ %cst_4 = arith.constant dense<128> : tensor<1x8xi32, #blocked1>
12
+ %cst_5 = arith.constant dense<128> : tensor<1x8xi32, #blocked>
13
+ %c0_i32 = arith.constant 0 : i32
14
+ %c128_i32 = arith.constant 128 : i32
15
+ %c8_i32 = arith.constant 8 : i32
16
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked1>
17
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
18
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked>
19
+ %c64_i32 = arith.constant 64 : i32
20
+ %0 = tt.get_program_id x : i32
21
+ %1 = arith.muli %0, %c64_i32 : i32
22
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
23
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
24
+ %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
25
+ %5 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
26
+ %6 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
27
+ %7 = tt.expand_dims %4 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xi32, #blocked2>
28
+ %8 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
29
+ %9 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
30
+ %10 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked2>
31
+ %11 = arith.addi %8, %5 : tensor<64x1xi32, #blocked>
32
+ %12 = arith.addi %9, %6 : tensor<64x1xi32, #blocked1>
33
+ %13 = arith.addi %10, %7 : tensor<64x1xi32, #blocked2>
34
+ %14 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
35
+ %15 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
36
+ %16 = tt.expand_dims %14 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x8xi32, #blocked1>
37
+ %17 = tt.expand_dims %15 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
38
+ %18 = arith.remsi %11, %cst : tensor<64x1xi32, #blocked>
39
+ %19 = arith.divsi %11, %cst : tensor<64x1xi32, #blocked>
40
+ %20 = arith.divsi %12, %cst_0 : tensor<64x1xi32, #blocked1>
41
+ %21 = tt.broadcast %18 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
42
+ %22 = arith.muli %19, %cst_2 : tensor<64x1xi32, #blocked>
43
+ %23 = tt.broadcast %22 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
44
+ %24 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
45
+ %25 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
46
+ %26 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
47
+ %27 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked1>
48
+ %28 = tt.broadcast %27 : (tensor<64x1xi32, #blocked1>) -> tensor<64x8xi32, #blocked1>
49
+ %29 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
50
+ %30 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
51
+ %31 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
52
+ %32 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
53
+ %33 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
54
+ %34:2 = scf.for %arg12 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg13 = %cst_6, %arg14 = %cst_6) -> (tensor<64x8xf32, #blocked1>, tensor<64x8xf32, #blocked1>) : i32 {
55
+ %45 = tt.splat %arg12 : (i32) -> tensor<1x8xi32, #blocked1>
56
+ %46 = tt.splat %arg12 : (i32) -> tensor<1x8xi32, #blocked>
57
+ %47 = arith.addi %45, %16 : tensor<1x8xi32, #blocked1>
58
+ %48 = arith.addi %46, %17 : tensor<1x8xi32, #blocked>
59
+ %49 = arith.cmpi slt, %47, %cst_4 : tensor<1x8xi32, #blocked1>
60
+ %50 = arith.cmpi slt, %48, %cst_5 : tensor<1x8xi32, #blocked>
61
+ %51 = arith.muli %48, %cst_3 : tensor<1x8xi32, #blocked>
62
+ %52 = tt.broadcast %51 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
63
+ %53 = arith.addi %21, %52 : tensor<64x8xi32, #blocked>
64
+ %54 = arith.addi %53, %23 : tensor<64x8xi32, #blocked>
65
+ %55 = tt.addptr %24, %54 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
66
+ %56 = tt.broadcast %49 : (tensor<1x8xi1, #blocked1>) -> tensor<64x8xi1, #blocked1>
67
+ %57 = tt.broadcast %50 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
68
+ %58 = tt.load %55, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
69
+ %59 = triton_gpu.convert_layout %58 : (tensor<64x8xbf16, #blocked>) -> tensor<64x8xbf16, #blocked1>
70
+ %60 = arith.extf %59 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1>
71
+ %61 = tt.addptr %25, %54 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
72
+ %62 = tt.load %61, %57, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
73
+ %63 = triton_gpu.convert_layout %62 : (tensor<64x8xf32, #blocked>) -> tensor<64x8xf32, #blocked1>
74
+ %64 = tt.addptr %26, %54 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
75
+ %65 = tt.load %64, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
76
+ %66 = arith.extf %65 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
77
+ %67 = tt.broadcast %47 : (tensor<1x8xi32, #blocked1>) -> tensor<64x8xi32, #blocked1>
78
+ %68 = arith.addi %67, %28 : tensor<64x8xi32, #blocked1>
79
+ %69 = tt.addptr %29, %68 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
80
+ %70 = tt.load %69, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
81
+ %71 = tt.addptr %30, %68 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
82
+ %72 = tt.load %71, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
83
+ %73 = tt.addptr %31, %54 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
84
+ %74 = tt.load %73, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
85
+ %75 = triton_gpu.convert_layout %74 : (tensor<64x8xbf16, #blocked>) -> tensor<64x8xbf16, #blocked1>
86
+ %76 = arith.extf %75 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1>
87
+ %77 = tt.addptr %32, %68 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
88
+ %78 = tt.load %77, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
89
+ %79 = tt.addptr %33, %68 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
90
+ %80 = tt.load %79, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
91
+ %81 = arith.addf %62, %66 : tensor<64x8xf32, #blocked>
92
+ %82 = triton_gpu.convert_layout %81 : (tensor<64x8xf32, #blocked>) -> tensor<64x8xf32, #blocked1>
93
+ %83 = arith.subf %82, %70 : tensor<64x8xf32, #blocked1>
94
+ %84 = arith.mulf %83, %72 : tensor<64x8xf32, #blocked1>
95
+ %85 = arith.mulf %60, %84 : tensor<64x8xf32, #blocked1>
96
+ %86 = arith.addf %arg13, %85 : tensor<64x8xf32, #blocked1>
97
+ %87 = arith.select %56, %86, %arg13 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1>
98
+ %88 = arith.subf %63, %78 : tensor<64x8xf32, #blocked1>
99
+ %89 = arith.mulf %88, %80 : tensor<64x8xf32, #blocked1>
100
+ %90 = arith.mulf %76, %89 : tensor<64x8xf32, #blocked1>
101
+ %91 = arith.addf %arg14, %90 : tensor<64x8xf32, #blocked1>
102
+ %92 = arith.select %56, %91, %arg14 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1>
103
+ scf.yield %87, %92 : tensor<64x8xf32, #blocked1>, tensor<64x8xf32, #blocked1>
104
+ }
105
+ %35 = "tt.reduce"(%34#0) <{axis = 1 : i32}> ({
106
+ ^bb0(%arg12: f32, %arg13: f32):
107
+ %45 = arith.addf %arg12, %arg13 : f32
108
+ tt.reduce.return %45 : f32
109
+ }) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
110
+ %36 = triton_gpu.convert_layout %35 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
111
+ %37 = tt.expand_dims %36 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xf32, #blocked2>
112
+ %38 = tt.splat %arg8 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked2>
113
+ %39 = tt.addptr %38, %13 : tensor<64x1x!tt.ptr<f32, 1>, #blocked2>, tensor<64x1xi32, #blocked2>
114
+ tt.store %39, %37 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked2>
115
+ %40 = "tt.reduce"(%34#1) <{axis = 1 : i32}> ({
116
+ ^bb0(%arg12: f32, %arg13: f32):
117
+ %45 = arith.addf %arg12, %arg13 : f32
118
+ tt.reduce.return %45 : f32
119
+ }) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
120
+ %41 = triton_gpu.convert_layout %40 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
121
+ %42 = tt.expand_dims %41 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xf32, #blocked2>
122
+ %43 = tt.splat %arg9 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked2>
123
+ %44 = tt.addptr %43, %13 : tensor<64x1x!tt.ptr<f32, 1>, #blocked2>, tensor<64x1xi32, #blocked2>
124
+ tt.store %44, %42 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked2>
125
+ tt.return
126
+ }
127
+ }
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttir ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16>
4
+ %c8_i32 = arith.constant 8 : i32
5
+ %c128_i32 = arith.constant 128 : i32
6
+ %c0_i32 = arith.constant 0 : i32
7
+ %cst_0 = arith.constant dense<128> : tensor<64x1xi32>
8
+ %cst_1 = arith.constant dense<32768> : tensor<64x1xi32>
9
+ %cst_2 = arith.constant dense<256> : tensor<1x8xi32>
10
+ %cst_3 = arith.constant dense<128> : tensor<1x8xi32>
11
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
12
+ %cst_5 = arith.constant dense<256> : tensor<64x1xi32>
13
+ %c64_i32 = arith.constant 64 : i32
14
+ %0 = tt.get_program_id x : i32
15
+ %1 = arith.muli %0, %c64_i32 : i32
16
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
17
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
18
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
19
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
20
+ %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
21
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
22
+ %8 = arith.remsi %5, %cst_5 : tensor<64x1xi32>
23
+ %9 = arith.divsi %5, %cst_5 : tensor<64x1xi32>
24
+ %10 = tt.broadcast %8 : (tensor<64x1xi32>) -> tensor<64x8xi32>
25
+ %11 = arith.muli %9, %cst_1 : tensor<64x1xi32>
26
+ %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x8xi32>
27
+ %13 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
28
+ %14 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
29
+ %15 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
30
+ %16 = arith.muli %9, %cst_0 : tensor<64x1xi32>
31
+ %17 = tt.broadcast %16 : (tensor<64x1xi32>) -> tensor<64x8xi32>
32
+ %18 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
33
+ %19 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
34
+ %20 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
35
+ %21 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
36
+ %22 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
37
+ %23:2 = scf.for %arg12 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg13 = %cst_4, %arg14 = %cst_4) -> (tensor<64x8xf32>, tensor<64x8xf32>) : i32 {
38
+ %32 = tt.splat %arg12 : (i32) -> tensor<1x8xi32>
39
+ %33 = arith.addi %32, %7 : tensor<1x8xi32>
40
+ %34 = arith.cmpi slt, %33, %cst_3 : tensor<1x8xi32>
41
+ %35 = arith.muli %33, %cst_2 : tensor<1x8xi32>
42
+ %36 = tt.broadcast %35 : (tensor<1x8xi32>) -> tensor<64x8xi32>
43
+ %37 = arith.addi %10, %36 : tensor<64x8xi32>
44
+ %38 = arith.addi %37, %12 : tensor<64x8xi32>
45
+ %39 = tt.addptr %13, %38 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
46
+ %40 = tt.broadcast %34 : (tensor<1x8xi1>) -> tensor<64x8xi1>
47
+ %41 = tt.load %39, %40, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
48
+ %42 = arith.extf %41 : tensor<64x8xbf16> to tensor<64x8xf32>
49
+ %43 = tt.addptr %14, %38 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
50
+ %44 = tt.load %43, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
51
+ %45 = tt.addptr %15, %38 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
52
+ %46 = tt.load %45, %40, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
53
+ %47 = arith.extf %46 : tensor<64x8xbf16> to tensor<64x8xf32>
54
+ %48 = tt.broadcast %33 : (tensor<1x8xi32>) -> tensor<64x8xi32>
55
+ %49 = arith.addi %48, %17 : tensor<64x8xi32>
56
+ %50 = tt.addptr %18, %49 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
57
+ %51 = tt.load %50, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
58
+ %52 = tt.addptr %19, %49 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
59
+ %53 = tt.load %52, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
60
+ %54 = tt.addptr %20, %38 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
61
+ %55 = tt.load %54, %40, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
62
+ %56 = arith.extf %55 : tensor<64x8xbf16> to tensor<64x8xf32>
63
+ %57 = tt.addptr %21, %49 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
64
+ %58 = tt.load %57, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
65
+ %59 = tt.addptr %22, %49 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
66
+ %60 = tt.load %59, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
67
+ %61 = arith.addf %44, %47 : tensor<64x8xf32>
68
+ %62 = arith.subf %61, %51 : tensor<64x8xf32>
69
+ %63 = arith.mulf %62, %53 : tensor<64x8xf32>
70
+ %64 = arith.mulf %42, %63 : tensor<64x8xf32>
71
+ %65 = arith.addf %arg13, %64 : tensor<64x8xf32>
72
+ %66 = arith.select %40, %65, %arg13 : tensor<64x8xi1>, tensor<64x8xf32>
73
+ %67 = arith.subf %44, %58 : tensor<64x8xf32>
74
+ %68 = arith.mulf %67, %60 : tensor<64x8xf32>
75
+ %69 = arith.mulf %56, %68 : tensor<64x8xf32>
76
+ %70 = arith.addf %arg14, %69 : tensor<64x8xf32>
77
+ %71 = arith.select %40, %70, %arg14 : tensor<64x8xi1>, tensor<64x8xf32>
78
+ scf.yield %66, %71 : tensor<64x8xf32>, tensor<64x8xf32>
79
+ }
80
+ %24 = "tt.reduce"(%23#0) <{axis = 1 : i32}> ({
81
+ ^bb0(%arg12: f32, %arg13: f32):
82
+ %32 = arith.addf %arg12, %arg13 : f32
83
+ tt.reduce.return %32 : f32
84
+ }) : (tensor<64x8xf32>) -> tensor<64xf32>
85
+ %25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
86
+ %26 = tt.splat %arg8 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
87
+ %27 = tt.addptr %26, %5 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi32>
88
+ tt.store %27, %25 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32>
89
+ %28 = "tt.reduce"(%23#1) <{axis = 1 : i32}> ({
90
+ ^bb0(%arg12: f32, %arg13: f32):
91
+ %32 = arith.addf %arg12, %arg13 : f32
92
+ tt.reduce.return %32 : f32
93
+ }) : (tensor<64x8xf32>) -> tensor<64xf32>
94
+ %29 = tt.expand_dims %28 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
95
+ %30 = tt.splat %arg9 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
96
+ %31 = tt.addptr %30, %5 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi32>
97
+ tt.store %31, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32>
98
+ tt.return
99
+ }
100
+ }