Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.cubin +0 -0
- .triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttgir +125 -0
- .triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ptx +782 -0
- .triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.cubin +0 -0
- .triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.llir +156 -0
- .triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ptx +525 -0
- .triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ttgir +92 -0
- .triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ttir +99 -0
- .triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ttir +99 -0
- .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.cubin +0 -0
- .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.llir +43 -0
- .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ptx +278 -0
- .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttgir +18 -0
- .triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttir +17 -0
- .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.cubin +0 -0
- .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.llir +296 -0
- .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ptx +743 -0
- .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttgir +73 -0
- .triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttir +72 -0
- .triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.ttir +98 -0
- .triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.cubin +0 -0
- .triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ptx +1054 -0
- .triton/dump/510522bb05917b836ed253751364fcad/triton_.cubin +0 -0
- .triton/dump/510522bb05917b836ed253751364fcad/triton_.llir +1211 -0
- .triton/dump/510522bb05917b836ed253751364fcad/triton_.ptx +1810 -0
- .triton/dump/510522bb05917b836ed253751364fcad/triton_.ttir +137 -0
- .triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.llir +1360 -0
- .triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ttir +151 -0
- .triton/dump/550b88a9db74a71f80def697002389b5/triton_.cubin +0 -0
- .triton/dump/550b88a9db74a71f80def697002389b5/triton_.llir +269 -0
- .triton/dump/550b88a9db74a71f80def697002389b5/triton_.ptx +642 -0
- .triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttgir +60 -0
- .triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttir +53 -0
- .triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ttgir +66 -0
- .triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.cubin +0 -0
- .triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.llir +45 -0
- .triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ptx +279 -0
- .triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttgir +16 -0
- .triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttir +15 -0
- .triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ttgir +110 -0
- .triton/dump/962d1809855a53123762906133b1d960/triton_.cubin +0 -0
- .triton/dump/962d1809855a53123762906133b1d960/triton_.llir +48 -0
- .triton/dump/962d1809855a53123762906133b1d960/triton_.ptx +282 -0
- .triton/dump/962d1809855a53123762906133b1d960/triton_.ttgir +18 -0
- .triton/dump/962d1809855a53123762906133b1d960/triton_.ttir +17 -0
- .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.cubin +0 -0
- .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.llir +368 -0
- .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ptx +771 -0
- .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttgir +127 -0
- .triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttir +100 -0
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.cubin
ADDED
Binary file (29 kB). View file
|
|
.triton/dump/1ed98b0d136db679153ca6a42fff755c/triton_.ttgir
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
4 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
5 |
+
tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
6 |
+
%cst = arith.constant dense<512> : tensor<2x1xi32, #blocked>
|
7 |
+
%cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked>
|
8 |
+
%cst_1 = arith.constant dense<256> : tensor<1x256xi32, #blocked1>
|
9 |
+
%cst_2 = arith.constant dense<256> : tensor<2x1xi32, #blocked>
|
10 |
+
%cst_3 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked>
|
11 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked>
|
12 |
+
%cst_5 = arith.constant dense<256> : tensor<2x1xi64, #blocked>
|
13 |
+
%cst_6 = arith.constant dense<50257> : tensor<2x1xi64, #blocked>
|
14 |
+
%cst_7 = arith.constant dense<0> : tensor<2x1xi64, #blocked>
|
15 |
+
%cst_8 = arith.constant dense<0> : tensor<2x1xi64, #blocked2>
|
16 |
+
%cst_9 = arith.constant dense<50257> : tensor<2x1xi64, #blocked2>
|
17 |
+
%cst_10 = arith.constant 0.000000e+00 : f32
|
18 |
+
%cst_11 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32, #blocked>
|
19 |
+
%cst_12 = arith.constant dense<2.560000e+02> : tensor<2x1xf32, #blocked>
|
20 |
+
%cst_13 = arith.constant dense<0.000000e+00> : tensor<2x256xf32, #blocked>
|
21 |
+
%cst_14 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked1>
|
22 |
+
%c2_i32 = arith.constant 2 : i32
|
23 |
+
%0 = tt.get_program_id x : i32
|
24 |
+
%1 = arith.muli %0, %c2_i32 : i32
|
25 |
+
%2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
26 |
+
%3 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
|
27 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xi32, #blocked>
|
28 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<2x1xi32, #blocked2>
|
29 |
+
%6 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked>
|
30 |
+
%7 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked2>
|
31 |
+
%8 = arith.addi %6, %4 : tensor<2x1xi32, #blocked>
|
32 |
+
%9 = arith.addi %7, %5 : tensor<2x1xi32, #blocked2>
|
33 |
+
%10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
34 |
+
%11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
|
35 |
+
%12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
|
36 |
+
%13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x256xi32, #blocked1>
|
37 |
+
%14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked>
|
38 |
+
%15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked2>
|
39 |
+
%16 = tt.addptr %14, %8 : tensor<2x1x!tt.ptr<i64, 1>, #blocked>, tensor<2x1xi32, #blocked>
|
40 |
+
%17 = tt.addptr %15, %9 : tensor<2x1x!tt.ptr<i64, 1>, #blocked2>, tensor<2x1xi32, #blocked2>
|
41 |
+
%18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked>
|
42 |
+
%19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked2>
|
43 |
+
%20 = arith.remsi %8, %cst : tensor<2x1xi32, #blocked>
|
44 |
+
%21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked>
|
45 |
+
%22 = arith.cmpi slt, %13, %cst_1 : tensor<1x256xi32, #blocked1>
|
46 |
+
%23 = arith.muli %20, %cst_2 : tensor<2x1xi32, #blocked>
|
47 |
+
%24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<2x256xi32, #blocked>
|
48 |
+
%25 = tt.broadcast %23 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked>
|
49 |
+
%26 = arith.addi %24, %25 : tensor<2x256xi32, #blocked>
|
50 |
+
%27 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>, #blocked>
|
51 |
+
%28 = tt.addptr %27, %26 : tensor<2x256x!tt.ptr<f32, 1>, #blocked>, tensor<2x256xi32, #blocked>
|
52 |
+
%29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<2x256xi1, #blocked>
|
53 |
+
%30 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
|
54 |
+
%31 = arith.addi %18, %cst_6 : tensor<2x1xi64, #blocked>
|
55 |
+
%32 = arith.addi %19, %cst_9 : tensor<2x1xi64, #blocked2>
|
56 |
+
%33 = arith.cmpi slt, %18, %cst_7 : tensor<2x1xi64, #blocked>
|
57 |
+
%34 = arith.cmpi slt, %19, %cst_8 : tensor<2x1xi64, #blocked2>
|
58 |
+
%35 = arith.select %33, %31, %18 : tensor<2x1xi1, #blocked>, tensor<2x1xi64, #blocked>
|
59 |
+
%36 = arith.select %34, %32, %19 : tensor<2x1xi1, #blocked2>, tensor<2x1xi64, #blocked2>
|
60 |
+
%37 = arith.cmpi sge, %36, %cst_8 : tensor<2x1xi64, #blocked2>
|
61 |
+
%38 = arith.cmpi slt, %36, %cst_9 : tensor<2x1xi64, #blocked2>
|
62 |
+
%39 = arith.andi %37, %38 : tensor<2x1xi1, #blocked2>
|
63 |
+
tt.assert %39, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2>
|
64 |
+
%40 = arith.muli %35, %cst_5 : tensor<2x1xi64, #blocked>
|
65 |
+
%41 = tt.broadcast %40 : (tensor<2x1xi64, #blocked>) -> tensor<2x256xi64, #blocked>
|
66 |
+
%42 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
|
67 |
+
%43 = tt.broadcast %42 : (tensor<1x256xi64, #blocked>) -> tensor<2x256xi64, #blocked>
|
68 |
+
%44 = arith.addi %43, %41 : tensor<2x256xi64, #blocked>
|
69 |
+
%45 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x256x!tt.ptr<f32, 1>, #blocked>
|
70 |
+
%46 = tt.addptr %45, %44 : tensor<2x256x!tt.ptr<f32, 1>, #blocked>, tensor<2x256xi64, #blocked>
|
71 |
+
%47 = tt.load %46, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
|
72 |
+
%48 = arith.addf %47, %30 : tensor<2x256xf32, #blocked>
|
73 |
+
%49 = arith.addf %48, %cst_13 : tensor<2x256xf32, #blocked>
|
74 |
+
%50 = arith.subf %48, %49 : tensor<2x256xf32, #blocked>
|
75 |
+
%51 = arith.mulf %48, %50 : tensor<2x256xf32, #blocked>
|
76 |
+
%52 = arith.addf %51, %cst_13 : tensor<2x256xf32, #blocked>
|
77 |
+
%53 = arith.select %29, %49, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked>
|
78 |
+
%54 = arith.select %29, %52, %cst_13 : tensor<2x256xi1, #blocked>, tensor<2x256xf32, #blocked>
|
79 |
+
%55 = arith.select %21, %cst_3, %cst_4 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
|
80 |
+
%56 = tt.broadcast %55 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked>
|
81 |
+
%57:3 = "tt.reduce"(%53, %54, %56) <{axis = 1 : i32}> ({
|
82 |
+
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
|
83 |
+
%82 = arith.subf %arg10, %arg7 : f32
|
84 |
+
%83 = arith.addf %arg9, %arg12 : f32
|
85 |
+
%84 = arith.cmpf oeq, %83, %cst_10 : f32
|
86 |
+
%85 = arith.divf %arg12, %83 : f32
|
87 |
+
%86 = arith.select %84, %cst_10, %85 : f32
|
88 |
+
%87 = arith.mulf %82, %86 : f32
|
89 |
+
%88 = arith.addf %arg7, %87 : f32
|
90 |
+
%89 = arith.addf %arg8, %arg11 : f32
|
91 |
+
%90 = arith.mulf %82, %82 : f32
|
92 |
+
%91 = arith.mulf %90, %arg9 : f32
|
93 |
+
%92 = arith.mulf %91, %86 : f32
|
94 |
+
%93 = arith.addf %89, %92 : f32
|
95 |
+
tt.reduce.return %88, %93, %83 : f32, f32, f32
|
96 |
+
}) : (tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>, tensor<2x256xf32, #blocked>) -> (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
|
97 |
+
%58 = tt.expand_dims %57#0 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
|
98 |
+
%59 = tt.expand_dims %57#1 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
|
99 |
+
%60 = tt.load %28, %29, %cst_13 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
|
100 |
+
%61 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked1>
|
101 |
+
%62 = tt.addptr %61, %13 : tensor<1x256x!tt.ptr<f32, 1>, #blocked1>, tensor<1x256xi32, #blocked1>
|
102 |
+
%63 = tt.load %62, %22, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked1>
|
103 |
+
tt.assert %39, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<2x1xi1, #blocked2>
|
104 |
+
%64 = tt.load %46, %29, %cst_13 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x256xf32, #blocked>
|
105 |
+
%65 = arith.addf %64, %60 : tensor<2x256xf32, #blocked>
|
106 |
+
%66 = tt.broadcast %58 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked>
|
107 |
+
%67 = arith.subf %65, %66 : tensor<2x256xf32, #blocked>
|
108 |
+
%68 = arith.divf %59, %cst_12 : tensor<2x1xf32, #blocked>
|
109 |
+
%69 = arith.addf %68, %cst_11 : tensor<2x1xf32, #blocked>
|
110 |
+
%70 = tt.extern_elementwise %69 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked>) -> tensor<2x1xf32, #blocked>
|
111 |
+
%71 = tt.broadcast %70 : (tensor<2x1xf32, #blocked>) -> tensor<2x256xf32, #blocked>
|
112 |
+
%72 = arith.mulf %67, %71 : tensor<2x256xf32, #blocked>
|
113 |
+
%73 = triton_gpu.convert_layout %63 : (tensor<1x256xf32, #blocked1>) -> tensor<1x256xf32, #blocked>
|
114 |
+
%74 = tt.broadcast %73 : (tensor<1x256xf32, #blocked>) -> tensor<2x256xf32, #blocked>
|
115 |
+
%75 = arith.mulf %72, %74 : tensor<2x256xf32, #blocked>
|
116 |
+
%76 = arith.muli %8, %cst_2 : tensor<2x1xi32, #blocked>
|
117 |
+
%77 = tt.broadcast %76 : (tensor<2x1xi32, #blocked>) -> tensor<2x256xi32, #blocked>
|
118 |
+
%78 = arith.addi %24, %77 : tensor<2x256xi32, #blocked>
|
119 |
+
%79 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<2x256x!tt.ptr<bf16, 1>, #blocked>
|
120 |
+
%80 = tt.addptr %79, %78 : tensor<2x256x!tt.ptr<bf16, 1>, #blocked>, tensor<2x256xi32, #blocked>
|
121 |
+
%81 = arith.truncf %75 : tensor<2x256xf32, #blocked> to tensor<2x256xbf16, #blocked>
|
122 |
+
tt.store %80, %81, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<2x256xbf16, #blocked>
|
123 |
+
tt.return
|
124 |
+
}
|
125 |
+
}
|
.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ptx
ADDED
@@ -0,0 +1,782 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6d7d8de9de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3d4d5d6d7d8de9de(
|
13 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6,
|
20 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7,
|
21 |
+
.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8,
|
22 |
+
.param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9
|
23 |
+
)
|
24 |
+
.maxntid 64, 1, 1
|
25 |
+
{
|
26 |
+
.reg .pred %p<45>;
|
27 |
+
.reg .b16 %rs<5>;
|
28 |
+
.reg .b32 %r<106>;
|
29 |
+
.reg .f32 %f<90>;
|
30 |
+
.reg .b64 %rd<44>;
|
31 |
+
.loc 1 18 0
|
32 |
+
$L__func_begin0:
|
33 |
+
.loc 1 18 0
|
34 |
+
|
35 |
+
ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6d7d8de9de_param_0];
|
36 |
+
ld.param.u64 %rd26, [triton__0d1d2d3d4d5d6d7d8de9de_param_1];
|
37 |
+
$L__tmp0:
|
38 |
+
.loc 1 26 26
|
39 |
+
mov.u32 %r74, %tid.x;
|
40 |
+
and.b32 %r75, %r74, 31;
|
41 |
+
ld.param.u64 %rd27, [triton__0d1d2d3d4d5d6d7d8de9de_param_2];
|
42 |
+
ld.param.u64 %rd28, [triton__0d1d2d3d4d5d6d7d8de9de_param_3];
|
43 |
+
ld.param.u64 %rd29, [triton__0d1d2d3d4d5d6d7d8de9de_param_4];
|
44 |
+
shl.b32 %r76, %r74, 2;
|
45 |
+
ld.param.u64 %rd30, [triton__0d1d2d3d4d5d6d7d8de9de_param_5];
|
46 |
+
and.b32 %r77, %r76, 252;
|
47 |
+
ld.param.u64 %rd31, [triton__0d1d2d3d4d5d6d7d8de9de_param_6];
|
48 |
+
ld.param.u64 %rd32, [triton__0d1d2d3d4d5d6d7d8de9de_param_7];
|
49 |
+
.loc 1 23 28
|
50 |
+
mov.u32 %r1, %ctaid.x;
|
51 |
+
.loc 1 30 40
|
52 |
+
shl.b32 %r78, %r1, 8;
|
53 |
+
.loc 1 30 36
|
54 |
+
or.b32 %r79, %r78, %r77;
|
55 |
+
.loc 1 30 30
|
56 |
+
mul.wide.s32 %rd33, %r79, 2;
|
57 |
+
add.s64 %rd1, %rd26, %rd33;
|
58 |
+
mov.b32 %r4, 0;
|
59 |
+
mov.pred %p1, -1;
|
60 |
+
.loc 1 30 46
|
61 |
+
mov.u32 %r2, 0x0;
|
62 |
+
mov.u32 %r3, 0x0;
|
63 |
+
@%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ];
|
64 |
+
@!%p1 mov.u32 %r2, %r4;
|
65 |
+
@!%p1 mov.u32 %r3, %r4;
|
66 |
+
cvt.u16.u32 %rs1, %r2;
|
67 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
68 |
+
cvt.u16.u32 %rs3, %r3;
|
69 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
|
70 |
+
.loc 1 30 67
|
71 |
+
cvt.f32.bf16 %r6, %rs1;
|
72 |
+
mov.b32 %f1, %r6;
|
73 |
+
cvt.f32.bf16 %r7, %rs2;
|
74 |
+
mov.b32 %f2, %r7;
|
75 |
+
cvt.f32.bf16 %r8, %rs3;
|
76 |
+
mov.b32 %f3, %r8;
|
77 |
+
cvt.f32.bf16 %r9, %rs4;
|
78 |
+
mov.b32 %f4, %r9;
|
79 |
+
.loc 1 31 30
|
80 |
+
cvt.u64.u32 %rd34, %r77;
|
81 |
+
mul.wide.u32 %rd35, %r77, 4;
|
82 |
+
add.s64 %rd2, %rd27, %rd35;
|
83 |
+
.loc 1 31 35
|
84 |
+
mov.u32 %r10, 0x0;
|
85 |
+
mov.u32 %r11, 0x0;
|
86 |
+
mov.u32 %r12, 0x0;
|
87 |
+
mov.u32 %r13, 0x0;
|
88 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
|
89 |
+
@!%p1 mov.u32 %r10, %r4;
|
90 |
+
@!%p1 mov.u32 %r11, %r4;
|
91 |
+
@!%p1 mov.u32 %r12, %r4;
|
92 |
+
@!%p1 mov.u32 %r13, %r4;
|
93 |
+
mov.b32 %f5, %r10;
|
94 |
+
mov.b32 %f6, %r11;
|
95 |
+
mov.b32 %f7, %r12;
|
96 |
+
mov.b32 %f8, %r13;
|
97 |
+
.loc 1 32 30
|
98 |
+
mul.wide.s32 %rd36, %r79, 4;
|
99 |
+
add.s64 %rd3, %rd28, %rd36;
|
100 |
+
.loc 1 32 46
|
101 |
+
mov.u32 %r18, 0x0;
|
102 |
+
mov.u32 %r19, 0x0;
|
103 |
+
mov.u32 %r20, 0x0;
|
104 |
+
mov.u32 %r21, 0x0;
|
105 |
+
@%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
|
106 |
+
@!%p1 mov.u32 %r18, %r4;
|
107 |
+
@!%p1 mov.u32 %r19, %r4;
|
108 |
+
@!%p1 mov.u32 %r20, %r4;
|
109 |
+
@!%p1 mov.u32 %r21, %r4;
|
110 |
+
mov.b32 %f9, %r18;
|
111 |
+
mov.b32 %f10, %r19;
|
112 |
+
mov.b32 %f11, %r20;
|
113 |
+
mov.b32 %f12, %r21;
|
114 |
+
.loc 1 33 30
|
115 |
+
mul.wide.s32 %rd37, %r1, 4;
|
116 |
+
add.s64 %rd4, %rd29, %rd37;
|
117 |
+
.loc 1 33 35
|
118 |
+
mov.u32 %r26, 0x0;
|
119 |
+
@%p1 ld.global.L1::evict_last.b32 { %r26 }, [ %rd4 + 0 ];
|
120 |
+
mov.b32 %f13, %r26;
|
121 |
+
mov.u32 %r27, 0x0;
|
122 |
+
@%p1 ld.global.L1::evict_last.b32 { %r27 }, [ %rd4 + 0 ];
|
123 |
+
mov.u32 %r28, 0x0;
|
124 |
+
@%p1 ld.global.L1::evict_last.b32 { %r28 }, [ %rd4 + 0 ];
|
125 |
+
mov.u32 %r29, 0x0;
|
126 |
+
@%p1 ld.global.L1::evict_last.b32 { %r29 }, [ %rd4 + 0 ];
|
127 |
+
.loc 1 34 31
|
128 |
+
add.s64 %rd8, %rd30, %rd37;
|
129 |
+
.loc 1 34 36
|
130 |
+
mov.u32 %r55, 0x0;
|
131 |
+
@%p1 ld.global.L1::evict_last.b32 { %r55 }, [ %rd8 + 0 ];
|
132 |
+
mov.b32 %f14, %r55;
|
133 |
+
mov.u32 %r31, 0x0;
|
134 |
+
@%p1 ld.global.L1::evict_last.b32 { %r31 }, [ %rd8 + 0 ];
|
135 |
+
mov.u32 %r32, 0x0;
|
136 |
+
@%p1 ld.global.L1::evict_last.b32 { %r32 }, [ %rd8 + 0 ];
|
137 |
+
mov.u32 %r33, 0x0;
|
138 |
+
@%p1 ld.global.L1::evict_last.b32 { %r33 }, [ %rd8 + 0 ];
|
139 |
+
.loc 1 35 31
|
140 |
+
mul.wide.s32 %rd38, %r1, 8;
|
141 |
+
add.s64 %rd13, %rd31, %rd38;
|
142 |
+
.loc 1 35 36
|
143 |
+
mov.u64 %rd12, 0x0;
|
144 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd13 + 0 ];
|
145 |
+
mov.u64 %rd14, 0x0;
|
146 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd13 + 0 ];
|
147 |
+
mov.u64 %rd16, 0x0;
|
148 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd13 + 0 ];
|
149 |
+
mov.u64 %rd18, 0x0;
|
150 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd13 + 0 ];
|
151 |
+
.loc 1 36 35
|
152 |
+
add.s64 %rd20, %rd25, %rd36;
|
153 |
+
.loc 1 36 51
|
154 |
+
mov.u32 %r34, 0x0;
|
155 |
+
mov.u32 %r35, 0x0;
|
156 |
+
mov.u32 %r36, 0x0;
|
157 |
+
mov.u32 %r37, 0x0;
|
158 |
+
@%p1 ld.global.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd20 + 0 ];
|
159 |
+
@!%p1 mov.u32 %r34, %r4;
|
160 |
+
@!%p1 mov.u32 %r35, %r4;
|
161 |
+
@!%p1 mov.u32 %r36, %r4;
|
162 |
+
@!%p1 mov.u32 %r37, %r4;
|
163 |
+
mov.b32 %f15, %r34;
|
164 |
+
mov.b32 %f16, %r35;
|
165 |
+
mov.b32 %f17, %r36;
|
166 |
+
mov.b32 %f18, %r37;
|
167 |
+
.loc 1 38 18
|
168 |
+
mul.f32 %f19, %f1, %f5;
|
169 |
+
mul.f32 %f20, %f2, %f6;
|
170 |
+
mul.f32 %f21, %f3, %f7;
|
171 |
+
mul.f32 %f22, %f4, %f8;
|
172 |
+
$L__tmp1:
|
173 |
+
.loc 2 233 15
|
174 |
+
fma.rn.f32 %f23, %f1, %f5, %f20;
|
175 |
+
fma.rn.f32 %f24, %f3, %f7, %f23;
|
176 |
+
fma.rn.f32 %f25, %f4, %f8, %f24;
|
177 |
+
$L__tmp2:
|
178 |
+
.loc 2 243 36
|
179 |
+
mov.b32 %r80, %f25;
|
180 |
+
shfl.sync.bfly.b32 %r81, %r80, 16, 31, -1;
|
181 |
+
mov.b32 %f26, %r81;
|
182 |
+
$L__tmp3:
|
183 |
+
.loc 2 233 15
|
184 |
+
add.f32 %f27, %f25, %f26;
|
185 |
+
$L__tmp4:
|
186 |
+
.loc 2 243 36
|
187 |
+
mov.b32 %r82, %f27;
|
188 |
+
shfl.sync.bfly.b32 %r83, %r82, 8, 31, -1;
|
189 |
+
mov.b32 %f28, %r83;
|
190 |
+
$L__tmp5:
|
191 |
+
.loc 2 233 15
|
192 |
+
add.f32 %f29, %f27, %f28;
|
193 |
+
$L__tmp6:
|
194 |
+
.loc 2 243 36
|
195 |
+
mov.b32 %r84, %f29;
|
196 |
+
shfl.sync.bfly.b32 %r85, %r84, 4, 31, -1;
|
197 |
+
mov.b32 %f30, %r85;
|
198 |
+
$L__tmp7:
|
199 |
+
.loc 2 233 15
|
200 |
+
add.f32 %f31, %f29, %f30;
|
201 |
+
$L__tmp8:
|
202 |
+
.loc 2 243 36
|
203 |
+
mov.b32 %r86, %f31;
|
204 |
+
shfl.sync.bfly.b32 %r87, %r86, 2, 31, -1;
|
205 |
+
mov.b32 %f32, %r87;
|
206 |
+
$L__tmp9:
|
207 |
+
.loc 2 233 15
|
208 |
+
add.f32 %f33, %f31, %f32;
|
209 |
+
$L__tmp10:
|
210 |
+
.loc 2 243 36
|
211 |
+
mov.b32 %r88, %f33;
|
212 |
+
shfl.sync.bfly.b32 %r89, %r88, 1, 31, -1;
|
213 |
+
mov.b32 %f34, %r89;
|
214 |
+
$L__tmp11:
|
215 |
+
.loc 2 233 15
|
216 |
+
add.f32 %f35, %f33, %f34;
|
217 |
+
$L__tmp12:
|
218 |
+
.loc 2 243 36
|
219 |
+
setp.eq.s32 %p31, %r75, 0;
|
220 |
+
shr.u32 %r90, %r74, 3;
|
221 |
+
and.b32 %r91, %r90, 4;
|
222 |
+
mov.u32 %r92, global_smem;
|
223 |
+
add.s32 %r42, %r92, %r91;
|
224 |
+
mov.b32 %r43, %f35;
|
225 |
+
@%p31 st.shared.b32 [ %r42 + 0 ], %r43;
|
226 |
+
bar.sync 0;
|
227 |
+
setp.lt.s32 %p32, %r74, 2;
|
228 |
+
add.s32 %r45, %r92, %r76;
|
229 |
+
@%p32 ld.shared.b32 %r44, [ %r45 + 0 ];
|
230 |
+
mov.b32 %f36, %r44;
|
231 |
+
shfl.sync.bfly.b32 %r93, %r44, 1, 31, -1;
|
232 |
+
mov.b32 %f37, %r93;
|
233 |
+
$L__tmp13:
|
234 |
+
.loc 2 233 15
|
235 |
+
add.f32 %f38, %f36, %f37;
|
236 |
+
$L__tmp14:
|
237 |
+
.loc 2 243 36
|
238 |
+
and.b32 %r94, %r74, 1;
|
239 |
+
setp.eq.b32 %p41, %r94, 1;
|
240 |
+
not.pred %p42, %p41;
|
241 |
+
and.pred %p33, %p32, %p42;
|
242 |
+
mov.b32 %r47, %f38;
|
243 |
+
@%p33 st.shared.b32 [ %r45 + 0 ], %r47;
|
244 |
+
bar.sync 0;
|
245 |
+
ld.shared.f32 %f39, [global_smem];
|
246 |
+
$L__tmp15:
|
247 |
+
.loc 3 8 15
|
248 |
+
add.f32 %f40, %f39, 0f00000000;
|
249 |
+
$L__tmp16:
|
250 |
+
.loc 1 42 19
|
251 |
+
sub.f32 %f41, %f9, %f13;
|
252 |
+
sub.f32 %f42, %f10, %f13;
|
253 |
+
sub.f32 %f43, %f11, %f13;
|
254 |
+
sub.f32 %f44, %f12, %f13;
|
255 |
+
.loc 1 43 20
|
256 |
+
mul.f32 %f45, %f41, %f14;
|
257 |
+
mul.f32 %f46, %f42, %f14;
|
258 |
+
mul.f32 %f47, %f43, %f14;
|
259 |
+
mul.f32 %f48, %f44, %f14;
|
260 |
+
.loc 1 44 19
|
261 |
+
mul.f32 %f49, %f20, %f46;
|
262 |
+
$L__tmp17:
|
263 |
+
.loc 2 243 36
|
264 |
+
bar.sync 0;
|
265 |
+
$L__tmp18:
|
266 |
+
.loc 2 233 15
|
267 |
+
fma.rn.f32 %f50, %f19, %f45, %f49;
|
268 |
+
fma.rn.f32 %f51, %f21, %f47, %f50;
|
269 |
+
fma.rn.f32 %f52, %f22, %f48, %f51;
|
270 |
+
$L__tmp19:
|
271 |
+
.loc 2 243 36
|
272 |
+
mov.b32 %r95, %f52;
|
273 |
+
shfl.sync.bfly.b32 %r96, %r95, 16, 31, -1;
|
274 |
+
mov.b32 %f53, %r96;
|
275 |
+
$L__tmp20:
|
276 |
+
.loc 2 233 15
|
277 |
+
add.f32 %f54, %f52, %f53;
|
278 |
+
$L__tmp21:
|
279 |
+
.loc 2 243 36
|
280 |
+
mov.b32 %r97, %f54;
|
281 |
+
shfl.sync.bfly.b32 %r98, %r97, 8, 31, -1;
|
282 |
+
mov.b32 %f55, %r98;
|
283 |
+
$L__tmp22:
|
284 |
+
.loc 2 233 15
|
285 |
+
add.f32 %f56, %f54, %f55;
|
286 |
+
$L__tmp23:
|
287 |
+
.loc 2 243 36
|
288 |
+
mov.b32 %r99, %f56;
|
289 |
+
shfl.sync.bfly.b32 %r100, %r99, 4, 31, -1;
|
290 |
+
mov.b32 %f57, %r100;
|
291 |
+
$L__tmp24:
|
292 |
+
.loc 2 233 15
|
293 |
+
add.f32 %f58, %f56, %f57;
|
294 |
+
$L__tmp25:
|
295 |
+
.loc 2 243 36
|
296 |
+
mov.b32 %r101, %f58;
|
297 |
+
shfl.sync.bfly.b32 %r102, %r101, 2, 31, -1;
|
298 |
+
mov.b32 %f59, %r102;
|
299 |
+
$L__tmp26:
|
300 |
+
.loc 2 233 15
|
301 |
+
add.f32 %f60, %f58, %f59;
|
302 |
+
$L__tmp27:
|
303 |
+
.loc 2 243 36
|
304 |
+
mov.b32 %r103, %f60;
|
305 |
+
shfl.sync.bfly.b32 %r104, %r103, 1, 31, -1;
|
306 |
+
mov.b32 %f61, %r104;
|
307 |
+
$L__tmp28:
|
308 |
+
.loc 2 233 15
|
309 |
+
add.f32 %f62, %f60, %f61;
|
310 |
+
$L__tmp29:
|
311 |
+
.loc 2 243 36
|
312 |
+
mov.b32 %r49, %f62;
|
313 |
+
@%p31 st.shared.b32 [ %r42 + 0 ], %r49;
|
314 |
+
bar.sync 0;
|
315 |
+
@%p32 ld.shared.b32 %r50, [ %r45 + 0 ];
|
316 |
+
mov.b32 %f63, %r50;
|
317 |
+
shfl.sync.bfly.b32 %r105, %r50, 1, 31, -1;
|
318 |
+
mov.b32 %f64, %r105;
|
319 |
+
$L__tmp30:
|
320 |
+
.loc 2 233 15
|
321 |
+
add.f32 %f65, %f63, %f64;
|
322 |
+
$L__tmp31:
|
323 |
+
.loc 2 243 36
|
324 |
+
mov.b32 %r53, %f65;
|
325 |
+
@%p33 st.shared.b32 [ %r45 + 0 ], %r53;
|
326 |
+
bar.sync 0;
|
327 |
+
ld.shared.f32 %f66, [global_smem];
|
328 |
+
$L__tmp32:
|
329 |
+
.loc 3 8 15
|
330 |
+
add.f32 %f67, %f66, 0f00000000;
|
331 |
+
$L__tmp33:
|
332 |
+
.loc 1 49 21
|
333 |
+
setp.eq.s64 %p43, %rd12, -1;
|
334 |
+
mov.b32 %r56, 1132462080;
|
335 |
+
.loc 1 51 20
|
336 |
+
div.full.f32 %r54, %r55, %r56;
|
337 |
+
mov.b32 %f68, %r54;
|
338 |
+
.loc 1 53 20
|
339 |
+
neg.f32 %f69, %f40;
|
340 |
+
fma.rn.f32 %f70, %f19, 0f43800000, %f69;
|
341 |
+
fma.rn.f32 %f71, %f20, 0f43800000, %f69;
|
342 |
+
fma.rn.f32 %f72, %f21, 0f43800000, %f69;
|
343 |
+
fma.rn.f32 %f73, %f22, 0f43800000, %f69;
|
344 |
+
.loc 1 55 20
|
345 |
+
neg.f32 %f74, %f45;
|
346 |
+
fma.rn.f32 %f75, %f74, %f67, %f70;
|
347 |
+
neg.f32 %f76, %f46;
|
348 |
+
fma.rn.f32 %f77, %f76, %f67, %f71;
|
349 |
+
neg.f32 %f78, %f47;
|
350 |
+
fma.rn.f32 %f79, %f78, %f67, %f72;
|
351 |
+
neg.f32 %f80, %f48;
|
352 |
+
fma.rn.f32 %f81, %f80, %f67, %f73;
|
353 |
+
.loc 1 57 20
|
354 |
+
fma.rn.f32 %f82, %f68, %f75, %f15;
|
355 |
+
fma.rn.f32 %f83, %f68, %f77, %f16;
|
356 |
+
fma.rn.f32 %f84, %f68, %f79, %f17;
|
357 |
+
fma.rn.f32 %f85, %f68, %f81, %f18;
|
358 |
+
.loc 1 59 35
|
359 |
+
selp.f32 %f86, 0f00000000, %f82, %p43;
|
360 |
+
selp.f32 %f87, 0f00000000, %f83, %p43;
|
361 |
+
selp.f32 %f88, 0f00000000, %f84, %p43;
|
362 |
+
selp.f32 %f89, 0f00000000, %f85, %p43;
|
363 |
+
.loc 1 61 20
|
364 |
+
setp.lt.s64 %p44, %rd12, 0;
|
365 |
+
.loc 1 63 56
|
366 |
+
shl.b64 %rd39, %rd12, 8;
|
367 |
+
add.s64 %rd40, %rd39, 12865792;
|
368 |
+
selp.b64 %rd41, %rd40, %rd39, %p44;
|
369 |
+
.loc 1 63 52
|
370 |
+
or.b64 %rd42, %rd41, %rd34;
|
371 |
+
.loc 1 63 30
|
372 |
+
shl.b64 %rd43, %rd42, 2;
|
373 |
+
add.s64 %rd21, %rd32, %rd43;
|
374 |
+
add.s64 %rd22, %rd21, 4;
|
375 |
+
add.s64 %rd23, %rd21, 8;
|
376 |
+
add.s64 %rd24, %rd21, 12;
|
377 |
+
.loc 1 63 83
|
378 |
+
mov.b32 %r67, %f86;
|
379 |
+
mov.u32 %r66, 0x0;
|
380 |
+
@%p1 atom.global.gpu.acq_rel.add.f32 %r66, [ %rd21 + 0 ], %r67;
|
381 |
+
mov.b32 %r69, %f87;
|
382 |
+
mov.u32 %r68, 0x0;
|
383 |
+
@%p1 atom.global.gpu.acq_rel.add.f32 %r68, [ %rd22 + 0 ], %r69;
|
384 |
+
mov.b32 %r71, %f88;
|
385 |
+
mov.u32 %r70, 0x0;
|
386 |
+
@%p1 atom.global.gpu.acq_rel.add.f32 %r70, [ %rd23 + 0 ], %r71;
|
387 |
+
mov.b32 %r73, %f89;
|
388 |
+
mov.u32 %r72, 0x0;
|
389 |
+
@%p1 atom.global.gpu.acq_rel.add.f32 %r72, [ %rd24 + 0 ], %r73;
|
390 |
+
.loc 1 63 4
|
391 |
+
ret;
|
392 |
+
$L__tmp34:
|
393 |
+
$L__func_end0:
|
394 |
+
|
395 |
+
}
|
396 |
+
.file 1 "/tmp/torchinductor_root/qr/cqryxm46jcxyr3qdktqirn53eap7h3pjjqiqavyqqyvflabjpvmd.py"
|
397 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
398 |
+
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
399 |
+
.section .debug_abbrev
|
400 |
+
{
|
401 |
+
.b8 1
|
402 |
+
.b8 17
|
403 |
+
.b8 1
|
404 |
+
.b8 37
|
405 |
+
.b8 8
|
406 |
+
.b8 19
|
407 |
+
.b8 5
|
408 |
+
.b8 3
|
409 |
+
.b8 8
|
410 |
+
.b8 16
|
411 |
+
.b8 6
|
412 |
+
.b8 27
|
413 |
+
.b8 8
|
414 |
+
.b8 180
|
415 |
+
.b8 66
|
416 |
+
.b8 12
|
417 |
+
.b8 17
|
418 |
+
.b8 1
|
419 |
+
.b8 18
|
420 |
+
.b8 1
|
421 |
+
.b8 0
|
422 |
+
.b8 0
|
423 |
+
.b8 2
|
424 |
+
.b8 46
|
425 |
+
.b8 0
|
426 |
+
.b8 135
|
427 |
+
.b8 64
|
428 |
+
.b8 8
|
429 |
+
.b8 3
|
430 |
+
.b8 8
|
431 |
+
.b8 58
|
432 |
+
.b8 11
|
433 |
+
.b8 59
|
434 |
+
.b8 11
|
435 |
+
.b8 63
|
436 |
+
.b8 12
|
437 |
+
.b8 32
|
438 |
+
.b8 11
|
439 |
+
.b8 0
|
440 |
+
.b8 0
|
441 |
+
.b8 3
|
442 |
+
.b8 46
|
443 |
+
.b8 1
|
444 |
+
.b8 17
|
445 |
+
.b8 1
|
446 |
+
.b8 18
|
447 |
+
.b8 1
|
448 |
+
.b8 64
|
449 |
+
.b8 10
|
450 |
+
.b8 49
|
451 |
+
.b8 19
|
452 |
+
.b8 0
|
453 |
+
.b8 0
|
454 |
+
.b8 4
|
455 |
+
.b8 29
|
456 |
+
.b8 1
|
457 |
+
.b8 49
|
458 |
+
.b8 19
|
459 |
+
.b8 17
|
460 |
+
.b8 1
|
461 |
+
.b8 18
|
462 |
+
.b8 1
|
463 |
+
.b8 88
|
464 |
+
.b8 11
|
465 |
+
.b8 89
|
466 |
+
.b8 11
|
467 |
+
.b8 87
|
468 |
+
.b8 11
|
469 |
+
.b8 0
|
470 |
+
.b8 0
|
471 |
+
.b8 5
|
472 |
+
.b8 29
|
473 |
+
.b8 0
|
474 |
+
.b8 49
|
475 |
+
.b8 19
|
476 |
+
.b8 17
|
477 |
+
.b8 1
|
478 |
+
.b8 18
|
479 |
+
.b8 1
|
480 |
+
.b8 88
|
481 |
+
.b8 11
|
482 |
+
.b8 89
|
483 |
+
.b8 11
|
484 |
+
.b8 87
|
485 |
+
.b8 11
|
486 |
+
.b8 0
|
487 |
+
.b8 0
|
488 |
+
.b8 0
|
489 |
+
}
|
490 |
+
.section .debug_info
|
491 |
+
{
|
492 |
+
.b32 407
|
493 |
+
.b8 2
|
494 |
+
.b8 0
|
495 |
+
.b32 .debug_abbrev
|
496 |
+
.b8 8
|
497 |
+
.b8 1
|
498 |
+
.b8 116
|
499 |
+
.b8 114
|
500 |
+
.b8 105
|
501 |
+
.b8 116
|
502 |
+
.b8 111
|
503 |
+
.b8 110
|
504 |
+
.b8 0
|
505 |
+
.b8 2
|
506 |
+
.b8 0
|
507 |
+
.b8 99
|
508 |
+
.b8 113
|
509 |
+
.b8 114
|
510 |
+
.b8 121
|
511 |
+
.b8 120
|
512 |
+
.b8 109
|
513 |
+
.b8 52
|
514 |
+
.b8 54
|
515 |
+
.b8 106
|
516 |
+
.b8 99
|
517 |
+
.b8 120
|
518 |
+
.b8 121
|
519 |
+
.b8 114
|
520 |
+
.b8 51
|
521 |
+
.b8 113
|
522 |
+
.b8 100
|
523 |
+
.b8 107
|
524 |
+
.b8 116
|
525 |
+
.b8 113
|
526 |
+
.b8 105
|
527 |
+
.b8 114
|
528 |
+
.b8 110
|
529 |
+
.b8 53
|
530 |
+
.b8 51
|
531 |
+
.b8 101
|
532 |
+
.b8 97
|
533 |
+
.b8 112
|
534 |
+
.b8 55
|
535 |
+
.b8 104
|
536 |
+
.b8 51
|
537 |
+
.b8 112
|
538 |
+
.b8 106
|
539 |
+
.b8 106
|
540 |
+
.b8 113
|
541 |
+
.b8 105
|
542 |
+
.b8 113
|
543 |
+
.b8 97
|
544 |
+
.b8 118
|
545 |
+
.b8 121
|
546 |
+
.b8 113
|
547 |
+
.b8 113
|
548 |
+
.b8 121
|
549 |
+
.b8 118
|
550 |
+
.b8 102
|
551 |
+
.b8 108
|
552 |
+
.b8 97
|
553 |
+
.b8 98
|
554 |
+
.b8 106
|
555 |
+
.b8 112
|
556 |
+
.b8 118
|
557 |
+
.b8 109
|
558 |
+
.b8 100
|
559 |
+
.b8 46
|
560 |
+
.b8 112
|
561 |
+
.b8 121
|
562 |
+
.b8 0
|
563 |
+
.b32 .debug_line
|
564 |
+
.b8 47
|
565 |
+
.b8 116
|
566 |
+
.b8 109
|
567 |
+
.b8 112
|
568 |
+
.b8 47
|
569 |
+
.b8 116
|
570 |
+
.b8 111
|
571 |
+
.b8 114
|
572 |
+
.b8 99
|
573 |
+
.b8 104
|
574 |
+
.b8 105
|
575 |
+
.b8 110
|
576 |
+
.b8 100
|
577 |
+
.b8 117
|
578 |
+
.b8 99
|
579 |
+
.b8 116
|
580 |
+
.b8 111
|
581 |
+
.b8 114
|
582 |
+
.b8 95
|
583 |
+
.b8 114
|
584 |
+
.b8 111
|
585 |
+
.b8 111
|
586 |
+
.b8 116
|
587 |
+
.b8 47
|
588 |
+
.b8 113
|
589 |
+
.b8 114
|
590 |
+
.b8 0
|
591 |
+
.b8 1
|
592 |
+
.b64 $L__func_begin0
|
593 |
+
.b64 $L__func_end0
|
594 |
+
.b8 2
|
595 |
+
.b8 116
|
596 |
+
.b8 114
|
597 |
+
.b8 105
|
598 |
+
.b8 116
|
599 |
+
.b8 111
|
600 |
+
.b8 110
|
601 |
+
.b8 95
|
602 |
+
.b8 95
|
603 |
+
.b8 48
|
604 |
+
.b8 100
|
605 |
+
.b8 49
|
606 |
+
.b8 100
|
607 |
+
.b8 50
|
608 |
+
.b8 100
|
609 |
+
.b8 51
|
610 |
+
.b8 100
|
611 |
+
.b8 52
|
612 |
+
.b8 100
|
613 |
+
.b8 53
|
614 |
+
.b8 100
|
615 |
+
.b8 54
|
616 |
+
.b8 100
|
617 |
+
.b8 55
|
618 |
+
.b8 100
|
619 |
+
.b8 56
|
620 |
+
.b8 100
|
621 |
+
.b8 101
|
622 |
+
.b8 57
|
623 |
+
.b8 100
|
624 |
+
.b8 101
|
625 |
+
.b8 0
|
626 |
+
.b8 116
|
627 |
+
.b8 114
|
628 |
+
.b8 105
|
629 |
+
.b8 116
|
630 |
+
.b8 111
|
631 |
+
.b8 110
|
632 |
+
.b8 95
|
633 |
+
.b8 95
|
634 |
+
.b8 48
|
635 |
+
.b8 100
|
636 |
+
.b8 49
|
637 |
+
.b8 100
|
638 |
+
.b8 50
|
639 |
+
.b8 100
|
640 |
+
.b8 51
|
641 |
+
.b8 100
|
642 |
+
.b8 52
|
643 |
+
.b8 100
|
644 |
+
.b8 53
|
645 |
+
.b8 100
|
646 |
+
.b8 54
|
647 |
+
.b8 100
|
648 |
+
.b8 55
|
649 |
+
.b8 100
|
650 |
+
.b8 56
|
651 |
+
.b8 100
|
652 |
+
.b8 101
|
653 |
+
.b8 57
|
654 |
+
.b8 100
|
655 |
+
.b8 101
|
656 |
+
.b8 0
|
657 |
+
.b8 1
|
658 |
+
.b8 18
|
659 |
+
.b8 1
|
660 |
+
.b8 1
|
661 |
+
.b8 3
|
662 |
+
.b64 $L__func_begin0
|
663 |
+
.b64 $L__func_end0
|
664 |
+
.b8 1
|
665 |
+
.b8 156
|
666 |
+
.b32 125
|
667 |
+
.b8 4
|
668 |
+
.b32 125
|
669 |
+
.b64 $L__tmp1
|
670 |
+
.b64 $L__tmp14
|
671 |
+
.b8 2
|
672 |
+
.b8 41
|
673 |
+
.b8 57
|
674 |
+
.b8 5
|
675 |
+
.b32 125
|
676 |
+
.b64 $L__tmp1
|
677 |
+
.b64 $L__tmp14
|
678 |
+
.b8 2
|
679 |
+
.b8 243
|
680 |
+
.b8 36
|
681 |
+
.b8 0
|
682 |
+
.b8 5
|
683 |
+
.b32 125
|
684 |
+
.b64 $L__tmp2
|
685 |
+
.b64 $L__tmp15
|
686 |
+
.b8 2
|
687 |
+
.b8 41
|
688 |
+
.b8 57
|
689 |
+
.b8 5
|
690 |
+
.b32 125
|
691 |
+
.b64 $L__tmp15
|
692 |
+
.b64 $L__tmp16
|
693 |
+
.b8 3
|
694 |
+
.b8 41
|
695 |
+
.b8 44
|
696 |
+
.b8 5
|
697 |
+
.b32 125
|
698 |
+
.b64 $L__tmp17
|
699 |
+
.b64 $L__tmp32
|
700 |
+
.b8 2
|
701 |
+
.b8 47
|
702 |
+
.b8 59
|
703 |
+
.b8 4
|
704 |
+
.b32 125
|
705 |
+
.b64 $L__tmp18
|
706 |
+
.b64 $L__tmp31
|
707 |
+
.b8 2
|
708 |
+
.b8 47
|
709 |
+
.b8 59
|
710 |
+
.b8 5
|
711 |
+
.b32 125
|
712 |
+
.b64 $L__tmp18
|
713 |
+
.b64 $L__tmp31
|
714 |
+
.b8 2
|
715 |
+
.b8 243
|
716 |
+
.b8 36
|
717 |
+
.b8 0
|
718 |
+
.b8 5
|
719 |
+
.b32 125
|
720 |
+
.b64 $L__tmp32
|
721 |
+
.b64 $L__tmp33
|
722 |
+
.b8 3
|
723 |
+
.b8 47
|
724 |
+
.b8 45
|
725 |
+
.b8 0
|
726 |
+
.b8 0
|
727 |
+
}
|
728 |
+
.section .debug_pubnames
|
729 |
+
{
|
730 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
731 |
+
$L__pubNames_start0:
|
732 |
+
.b8 2
|
733 |
+
.b8 0
|
734 |
+
.b32 .debug_info
|
735 |
+
.b32 411
|
736 |
+
.b32 125
|
737 |
+
.b8 116
|
738 |
+
.b8 114
|
739 |
+
.b8 105
|
740 |
+
.b8 116
|
741 |
+
.b8 111
|
742 |
+
.b8 110
|
743 |
+
.b8 95
|
744 |
+
.b8 95
|
745 |
+
.b8 48
|
746 |
+
.b8 100
|
747 |
+
.b8 49
|
748 |
+
.b8 100
|
749 |
+
.b8 50
|
750 |
+
.b8 100
|
751 |
+
.b8 51
|
752 |
+
.b8 100
|
753 |
+
.b8 52
|
754 |
+
.b8 100
|
755 |
+
.b8 53
|
756 |
+
.b8 100
|
757 |
+
.b8 54
|
758 |
+
.b8 100
|
759 |
+
.b8 55
|
760 |
+
.b8 100
|
761 |
+
.b8 56
|
762 |
+
.b8 100
|
763 |
+
.b8 101
|
764 |
+
.b8 57
|
765 |
+
.b8 100
|
766 |
+
.b8 101
|
767 |
+
.b8 0
|
768 |
+
.b32 0
|
769 |
+
$L__pubNames_end0:
|
770 |
+
}
|
771 |
+
.section .debug_pubtypes
|
772 |
+
{
|
773 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
774 |
+
$L__pubTypes_start0:
|
775 |
+
.b8 2
|
776 |
+
.b8 0
|
777 |
+
.b32 .debug_info
|
778 |
+
.b32 411
|
779 |
+
.b32 0
|
780 |
+
$L__pubTypes_end0:
|
781 |
+
}
|
782 |
+
.section .debug_loc { }
|
.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.cubin
ADDED
Binary file (10.5 kB). View file
|
|
.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.llir
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 {
|
5 |
+
%10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%11 = lshr i32 %10, 2, !dbg !8
|
7 |
+
%12 = and i32 %11, 63, !dbg !8
|
8 |
+
%13 = and i32 %10, 3, !dbg !9
|
9 |
+
%14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !10
|
10 |
+
%15 = sext i32 %14 to i64, !dbg !11
|
11 |
+
%16 = shl nsw i64 %15, 6, !dbg !12
|
12 |
+
%17 = zext nneg i32 %12 to i64
|
13 |
+
%18 = or i64 %16, %17, !dbg !13
|
14 |
+
%19 = getelementptr i64, ptr addrspace(1) %1, i64 %18, !dbg !14
|
15 |
+
%20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #2, !dbg !15
|
16 |
+
%21 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #2, !dbg !16
|
17 |
+
%22 = bitcast i32 %21 to float, !dbg !16
|
18 |
+
%23 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #2, !dbg !17
|
19 |
+
%24 = bitcast i32 %23 to float, !dbg !17
|
20 |
+
%25 = mul nsw i64 %18, 50257, !dbg !18
|
21 |
+
%.not = icmp eq i64 %20, -1, !dbg !19
|
22 |
+
%26 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %22, float %24) #2, !dbg !20
|
23 |
+
%27 = select i1 %.not, float 0.000000e+00, float %26, !dbg !21
|
24 |
+
%28 = getelementptr float, ptr addrspace(1) %0, i64 %25
|
25 |
+
br label %29, !dbg !22
|
26 |
+
|
27 |
+
29: ; preds = %9, %29
|
28 |
+
%30 = phi float [ 0.000000e+00, %9 ], [ %40, %29 ]
|
29 |
+
%31 = phi i32 [ 0, %9 ], [ %41, %29 ]
|
30 |
+
%32 = or i32 %31, %13, !dbg !23
|
31 |
+
%33 = zext nneg i32 %32 to i64, !dbg !23
|
32 |
+
%34 = icmp ult i32 %32, 50257, !dbg !24
|
33 |
+
%35 = getelementptr float, ptr addrspace(1) %28, i64 %33, !dbg !25
|
34 |
+
%36 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %35, i1 %34, i32 0, i1 %34) #2, !dbg !26
|
35 |
+
%37 = bitcast i32 %36 to float, !dbg !26
|
36 |
+
%38 = fmul float %27, %37, !dbg !27
|
37 |
+
%39 = select i1 %34, float %38, float -0.000000e+00, !dbg !28
|
38 |
+
%40 = fadd float %30, %39, !dbg !28
|
39 |
+
%41 = add nuw nsw i32 %31, 4, !dbg !22
|
40 |
+
%42 = icmp ult i32 %31, 50253, !dbg !22
|
41 |
+
br i1 %42, label %29, label %43, !dbg !22
|
42 |
+
|
43 |
+
43: ; preds = %29
|
44 |
+
%44 = bitcast float %40 to i32, !dbg !29
|
45 |
+
%45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 2, i32 31), !dbg !29
|
46 |
+
%46 = bitcast i32 %45 to float, !dbg !29
|
47 |
+
%47 = fadd float %40, %46, !dbg !33
|
48 |
+
%48 = bitcast float %47 to i32, !dbg !29
|
49 |
+
%49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 1, i32 31), !dbg !29
|
50 |
+
%50 = bitcast i32 %49 to float, !dbg !29
|
51 |
+
%51 = fadd float %47, %50, !dbg !33
|
52 |
+
br label %52, !dbg !37
|
53 |
+
|
54 |
+
52: ; preds = %43, %52
|
55 |
+
%53 = phi i32 [ 0, %43 ], [ %75, %52 ]
|
56 |
+
%54 = or i32 %53, %13, !dbg !38
|
57 |
+
%55 = zext nneg i32 %54 to i64, !dbg !38
|
58 |
+
%56 = icmp ult i32 %54, 50257, !dbg !39
|
59 |
+
%57 = add nsw i64 %25, %55, !dbg !40
|
60 |
+
%58 = getelementptr i16, ptr addrspace(1) %4, i64 %57, !dbg !41
|
61 |
+
%59 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %58, i1 %56, i16 0, i1 %56) #2, !dbg !42
|
62 |
+
%60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #2, !dbg !43
|
63 |
+
%61 = getelementptr float, ptr addrspace(1) %0, i64 %57, !dbg !44
|
64 |
+
%62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %61, i1 %56, i32 0, i1 %56) #2, !dbg !45
|
65 |
+
%63 = bitcast i32 %62 to float, !dbg !45
|
66 |
+
%64 = getelementptr i16, ptr addrspace(1) %5, i64 %57, !dbg !46
|
67 |
+
%65 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %64, i1 %56, i16 0, i1 %56) #2, !dbg !47
|
68 |
+
%66 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %65) #2, !dbg !48
|
69 |
+
%67 = fmul float %27, %63, !dbg !49
|
70 |
+
%68 = fmul float %66, 0x3FF7154760000000, !dbg !50
|
71 |
+
%69 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %68) #2, !dbg !50
|
72 |
+
%70 = fmul float %51, %69, !dbg !51
|
73 |
+
%71 = fsub float %67, %70, !dbg !52
|
74 |
+
%72 = fadd float %60, %71, !dbg !53
|
75 |
+
%73 = getelementptr i16, ptr addrspace(1) %6, i64 %57, !dbg !54
|
76 |
+
%74 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %72) #2, !dbg !55
|
77 |
+
tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %74, ptr addrspace(1) %73, i1 %56) #2, !dbg !55
|
78 |
+
%75 = add nuw nsw i32 %53, 4, !dbg !37
|
79 |
+
%76 = icmp ult i32 %53, 50253, !dbg !37
|
80 |
+
br i1 %76, label %52, label %77, !dbg !37
|
81 |
+
|
82 |
+
77: ; preds = %52
|
83 |
+
ret void, !dbg !56
|
84 |
+
}
|
85 |
+
|
86 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
87 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
88 |
+
|
89 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
90 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
91 |
+
|
92 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
93 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
94 |
+
attributes #2 = { nounwind }
|
95 |
+
|
96 |
+
!llvm.module.flags = !{!0}
|
97 |
+
!llvm.dbg.cu = !{!1}
|
98 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
99 |
+
|
100 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
101 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
102 |
+
!2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")
|
103 |
+
!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
|
104 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256}
|
105 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
106 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
107 |
+
!7 = !{}
|
108 |
+
!8 = !DILocation(line: 22, column: 44, scope: !5)
|
109 |
+
!9 = !DILocation(line: 24, column: 33, scope: !5)
|
110 |
+
!10 = !DILocation(line: 21, column: 28, scope: !5)
|
111 |
+
!11 = !DILocation(line: 21, column: 34, scope: !5)
|
112 |
+
!12 = !DILocation(line: 21, column: 46, scope: !5)
|
113 |
+
!13 = !DILocation(line: 22, column: 23, scope: !5)
|
114 |
+
!14 = !DILocation(line: 26, column: 30, scope: !5)
|
115 |
+
!15 = !DILocation(line: 26, column: 35, scope: !5)
|
116 |
+
!16 = !DILocation(line: 27, column: 19, scope: !5)
|
117 |
+
!17 = !DILocation(line: 29, column: 19, scope: !5)
|
118 |
+
!18 = !DILocation(line: 36, column: 46, scope: !5)
|
119 |
+
!19 = !DILocation(line: 38, column: 23, scope: !5)
|
120 |
+
!20 = !DILocation(line: 39, column: 22, scope: !5)
|
121 |
+
!21 = !DILocation(line: 41, column: 37, scope: !5)
|
122 |
+
!22 = !DILocation(line: 32, column: 36, scope: !5)
|
123 |
+
!23 = !DILocation(line: 33, column: 27, scope: !5)
|
124 |
+
!24 = !DILocation(line: 34, column: 25, scope: !5)
|
125 |
+
!25 = !DILocation(line: 36, column: 34, scope: !5)
|
126 |
+
!26 = !DILocation(line: 36, column: 52, scope: !5)
|
127 |
+
!27 = !DILocation(line: 42, column: 23, scope: !5)
|
128 |
+
!28 = !DILocation(line: 45, column: 40, scope: !5)
|
129 |
+
!29 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !32)
|
130 |
+
!30 = distinct !DILexicalBlockFile(scope: !5, file: !31, discriminator: 0)
|
131 |
+
!31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
132 |
+
!32 = !DILocation(line: 46, column: 27, scope: !30)
|
133 |
+
!33 = !DILocation(line: 233, column: 15, scope: !34, inlinedAt: !35)
|
134 |
+
!34 = distinct !DILexicalBlockFile(scope: !30, file: !31, discriminator: 0)
|
135 |
+
!35 = !DILocation(line: 243, column: 36, scope: !34, inlinedAt: !36)
|
136 |
+
!36 = !DILocation(line: 46, column: 27, scope: !34)
|
137 |
+
!37 = !DILocation(line: 51, column: 36, scope: !5)
|
138 |
+
!38 = !DILocation(line: 52, column: 27, scope: !5)
|
139 |
+
!39 = !DILocation(line: 53, column: 25, scope: !5)
|
140 |
+
!40 = !DILocation(line: 55, column: 41, scope: !5)
|
141 |
+
!41 = !DILocation(line: 55, column: 35, scope: !5)
|
142 |
+
!42 = !DILocation(line: 55, column: 53, scope: !5)
|
143 |
+
!43 = !DILocation(line: 55, column: 105, scope: !5)
|
144 |
+
!44 = !DILocation(line: 56, column: 35, scope: !5)
|
145 |
+
!45 = !DILocation(line: 56, column: 53, scope: !5)
|
146 |
+
!46 = !DILocation(line: 57, column: 35, scope: !5)
|
147 |
+
!47 = !DILocation(line: 57, column: 53, scope: !5)
|
148 |
+
!48 = !DILocation(line: 57, column: 105, scope: !5)
|
149 |
+
!49 = !DILocation(line: 63, column: 24, scope: !5)
|
150 |
+
!50 = !DILocation(line: 65, column: 23, scope: !5)
|
151 |
+
!51 = !DILocation(line: 66, column: 24, scope: !5)
|
152 |
+
!52 = !DILocation(line: 67, column: 24, scope: !5)
|
153 |
+
!53 = !DILocation(line: 69, column: 24, scope: !5)
|
154 |
+
!54 = !DILocation(line: 70, column: 29, scope: !5)
|
155 |
+
!55 = !DILocation(line: 70, column: 54, scope: !5)
|
156 |
+
!56 = !DILocation(line: 51, column: 4, scope: !5)
|
.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ptx
ADDED
@@ -0,0 +1,525 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6d7de8
|
10 |
+
|
11 |
+
.visible .entry triton__0d1d2d3d4d5d6d7de8(
|
12 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
|
13 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
|
20 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
|
21 |
+
)
|
22 |
+
.maxntid 256, 1, 1
|
23 |
+
{
|
24 |
+
.reg .pred %p<16>;
|
25 |
+
.reg .b16 %rs<9>;
|
26 |
+
.reg .b32 %r<31>;
|
27 |
+
.reg .f32 %f<23>;
|
28 |
+
.reg .b64 %rd<51>;
|
29 |
+
.loc 1 18 0
|
30 |
+
$L__func_begin0:
|
31 |
+
.loc 1 18 0
|
32 |
+
|
33 |
+
ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7de8_param_6];
|
34 |
+
ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7de8_param_5];
|
35 |
+
ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7de8_param_4];
|
36 |
+
ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6d7de8_param_0];
|
37 |
+
ld.param.u64 %rd26, [triton__0d1d2d3d4d5d6d7de8_param_1];
|
38 |
+
$L__tmp0:
|
39 |
+
.loc 1 22 44
|
40 |
+
mov.u32 %r13, %tid.x;
|
41 |
+
ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6d7de8_param_2];
|
42 |
+
bfe.u32 %r14, %r13, 2, 6;
|
43 |
+
ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6d7de8_param_3];
|
44 |
+
.loc 1 24 33
|
45 |
+
and.b32 %r1, %r13, 3;
|
46 |
+
.loc 1 21 28
|
47 |
+
mov.u32 %r6, %ctaid.x;
|
48 |
+
.loc 1 21 34
|
49 |
+
cvt.s64.s32 %rd1, %r6;
|
50 |
+
.loc 1 21 46
|
51 |
+
mul.wide.s32 %rd27, %r6, 64;
|
52 |
+
cvt.u64.u32 %rd2, %r14;
|
53 |
+
.loc 1 22 23
|
54 |
+
or.b64 %rd28, %rd27, %rd2;
|
55 |
+
.loc 1 26 30
|
56 |
+
shl.b64 %rd29, %rd28, 3;
|
57 |
+
add.s64 %rd22, %rd26, %rd29;
|
58 |
+
mov.pred %p1, -1;
|
59 |
+
.loc 1 26 35
|
60 |
+
mov.u64 %rd21, 0x0;
|
61 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd22 + 0 ];
|
62 |
+
.loc 1 27 19
|
63 |
+
mov.u32 %r10, 0x0;
|
64 |
+
@%p1 ld.global.b32 { %r10 }, [ %rd23 + 0 ];
|
65 |
+
.loc 1 29 19
|
66 |
+
mov.u32 %r11, 0x0;
|
67 |
+
@%p1 ld.global.b32 { %r11 }, [ %rd24 + 0 ];
|
68 |
+
.loc 1 38 23
|
69 |
+
setp.eq.s64 %p4, %rd21, -1;
|
70 |
+
.loc 1 39 22
|
71 |
+
div.full.f32 %r9, %r10, %r11;
|
72 |
+
mov.b32 %f6, %r9;
|
73 |
+
.loc 1 41 37
|
74 |
+
selp.f32 %f1, 0f00000000, %f6, %p4;
|
75 |
+
.loc 1 32 36
|
76 |
+
mul.wide.s32 %rd30, %r6, 12865792;
|
77 |
+
mul.wide.u32 %rd31, %r14, 201028;
|
78 |
+
add.s64 %rd32, %rd30, %rd31;
|
79 |
+
cvt.u64.u32 %rd33, %r13;
|
80 |
+
and.b64 %rd3, %rd33, 3;
|
81 |
+
mul.wide.u32 %rd34, %r1, 4;
|
82 |
+
add.s64 %rd35, %rd32, %rd34;
|
83 |
+
add.s64 %rd50, %rd25, %rd35;
|
84 |
+
mov.f32 %f22, 0f00000000;
|
85 |
+
mov.b32 %r29, -4;
|
86 |
+
mov.u64 %rd46, %rd50;
|
87 |
+
$L__BB0_1:
|
88 |
+
add.s32 %r29, %r29, 4;
|
89 |
+
.loc 1 33 27
|
90 |
+
add.s32 %r17, %r29, %r1;
|
91 |
+
.loc 1 34 25
|
92 |
+
setp.lt.u32 %p5, %r17, 50257;
|
93 |
+
mov.b32 %r16, 0;
|
94 |
+
.loc 1 36 52
|
95 |
+
mov.u32 %r15, 0x0;
|
96 |
+
@%p5 ld.global.L1::evict_last.b32 { %r15 }, [ %rd46 + 0 ];
|
97 |
+
@!%p5 mov.u32 %r15, %r16;
|
98 |
+
mov.b32 %f7, %r15;
|
99 |
+
.loc 1 42 23
|
100 |
+
mul.f32 %f8, %f1, %f7;
|
101 |
+
.loc 1 45 40
|
102 |
+
selp.f32 %f9, %f8, 0f80000000, %p5;
|
103 |
+
add.f32 %f22, %f22, %f9;
|
104 |
+
.loc 1 32 36
|
105 |
+
add.s64 %rd46, %rd46, 16;
|
106 |
+
setp.lt.u32 %p7, %r29, 50253;
|
107 |
+
@%p7 bra $L__BB0_1;
|
108 |
+
$L__tmp1:
|
109 |
+
.loc 2 243 36
|
110 |
+
mov.b32 %r19, %f22;
|
111 |
+
shfl.sync.bfly.b32 %r20, %r19, 2, 31, -1;
|
112 |
+
mov.b32 %f10, %r20;
|
113 |
+
$L__tmp2:
|
114 |
+
.loc 2 233 15
|
115 |
+
add.f32 %f11, %f22, %f10;
|
116 |
+
$L__tmp3:
|
117 |
+
.loc 2 243 36
|
118 |
+
mov.b32 %r21, %f11;
|
119 |
+
shfl.sync.bfly.b32 %r22, %r21, 1, 31, -1;
|
120 |
+
mov.b32 %f12, %r22;
|
121 |
+
$L__tmp4:
|
122 |
+
.loc 2 233 15
|
123 |
+
add.f32 %f4, %f11, %f12;
|
124 |
+
$L__tmp5:
|
125 |
+
.loc 1 51 36
|
126 |
+
mul.lo.s64 %rd37, %rd1, 3216448;
|
127 |
+
mul.lo.s64 %rd38, %rd2, 50257;
|
128 |
+
add.s64 %rd39, %rd37, %rd38;
|
129 |
+
add.s64 %rd40, %rd39, %rd3;
|
130 |
+
shl.b64 %rd41, %rd40, 1;
|
131 |
+
add.s64 %rd49, %rd20, %rd41;
|
132 |
+
add.s64 %rd48, %rd19, %rd41;
|
133 |
+
add.s64 %rd47, %rd18, %rd41;
|
134 |
+
mov.b32 %r30, -4;
|
135 |
+
mov.u16 %rs2, 0;
|
136 |
+
$L__BB0_3:
|
137 |
+
add.s32 %r30, %r30, 4;
|
138 |
+
.loc 1 52 27
|
139 |
+
add.s32 %r28, %r30, %r1;
|
140 |
+
.loc 1 53 25
|
141 |
+
setp.lt.u32 %p8, %r28, 50257;
|
142 |
+
.loc 1 55 53
|
143 |
+
mov.u16 %rs1, 0x0;
|
144 |
+
@%p8 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd47 + 0 ];
|
145 |
+
@!%p8 mov.u16 %rs1, %rs2;
|
146 |
+
.loc 1 55 105
|
147 |
+
cvt.f32.bf16 %r23, %rs1;
|
148 |
+
mov.b32 %f15, %r23;
|
149 |
+
.loc 1 56 53
|
150 |
+
mov.u32 %r24, 0x0;
|
151 |
+
@%p8 ld.global.L1::evict_first.b32 { %r24 }, [ %rd50 + 0 ];
|
152 |
+
@!%p8 mov.u32 %r24, %r16;
|
153 |
+
mov.b32 %f16, %r24;
|
154 |
+
.loc 1 57 53
|
155 |
+
mov.u16 %rs4, 0x0;
|
156 |
+
@%p8 ld.global.L1::evict_first.b16 { %rs4 }, [ %rd48 + 0 ];
|
157 |
+
@!%p8 mov.u16 %rs4, %rs2;
|
158 |
+
.loc 1 57 105
|
159 |
+
cvt.f32.bf16 %r26, %rs4;
|
160 |
+
mov.b32 %f17, %r26;
|
161 |
+
.loc 1 65 23
|
162 |
+
mul.f32 %f14, %f17, 0f3FB8AA3B;
|
163 |
+
ex2.approx.f32 %f13, %f14;
|
164 |
+
.loc 1 66 24
|
165 |
+
mul.f32 %f18, %f4, %f13;
|
166 |
+
.loc 1 67 24
|
167 |
+
neg.f32 %f19, %f18;
|
168 |
+
fma.rn.f32 %f20, %f1, %f16, %f19;
|
169 |
+
.loc 1 69 24
|
170 |
+
add.f32 %f21, %f15, %f20;
|
171 |
+
.loc 1 70 54
|
172 |
+
mov.b32 %r27, %f21;
|
173 |
+
cvt.rn.bf16.f32 %rs7, %r27;
|
174 |
+
@%p8 st.global.b16 [ %rd49 + 0 ], { %rs7 };
|
175 |
+
.loc 1 51 36
|
176 |
+
add.s64 %rd50, %rd50, 16;
|
177 |
+
add.s64 %rd49, %rd49, 8;
|
178 |
+
add.s64 %rd48, %rd48, 8;
|
179 |
+
add.s64 %rd47, %rd47, 8;
|
180 |
+
setp.lt.u32 %p15, %r30, 50253;
|
181 |
+
@%p15 bra $L__BB0_3;
|
182 |
+
.loc 1 51 4
|
183 |
+
ret;
|
184 |
+
$L__tmp6:
|
185 |
+
$L__func_end0:
|
186 |
+
|
187 |
+
}
|
188 |
+
.file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
|
189 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
190 |
+
.section .debug_abbrev
|
191 |
+
{
|
192 |
+
.b8 1
|
193 |
+
.b8 17
|
194 |
+
.b8 1
|
195 |
+
.b8 37
|
196 |
+
.b8 8
|
197 |
+
.b8 19
|
198 |
+
.b8 5
|
199 |
+
.b8 3
|
200 |
+
.b8 8
|
201 |
+
.b8 16
|
202 |
+
.b8 6
|
203 |
+
.b8 27
|
204 |
+
.b8 8
|
205 |
+
.b8 180
|
206 |
+
.b8 66
|
207 |
+
.b8 12
|
208 |
+
.b8 17
|
209 |
+
.b8 1
|
210 |
+
.b8 18
|
211 |
+
.b8 1
|
212 |
+
.b8 0
|
213 |
+
.b8 0
|
214 |
+
.b8 2
|
215 |
+
.b8 46
|
216 |
+
.b8 0
|
217 |
+
.b8 135
|
218 |
+
.b8 64
|
219 |
+
.b8 8
|
220 |
+
.b8 3
|
221 |
+
.b8 8
|
222 |
+
.b8 58
|
223 |
+
.b8 11
|
224 |
+
.b8 59
|
225 |
+
.b8 11
|
226 |
+
.b8 63
|
227 |
+
.b8 12
|
228 |
+
.b8 32
|
229 |
+
.b8 11
|
230 |
+
.b8 0
|
231 |
+
.b8 0
|
232 |
+
.b8 3
|
233 |
+
.b8 46
|
234 |
+
.b8 1
|
235 |
+
.b8 17
|
236 |
+
.b8 1
|
237 |
+
.b8 18
|
238 |
+
.b8 1
|
239 |
+
.b8 64
|
240 |
+
.b8 10
|
241 |
+
.b8 49
|
242 |
+
.b8 19
|
243 |
+
.b8 0
|
244 |
+
.b8 0
|
245 |
+
.b8 4
|
246 |
+
.b8 29
|
247 |
+
.b8 0
|
248 |
+
.b8 49
|
249 |
+
.b8 19
|
250 |
+
.b8 17
|
251 |
+
.b8 1
|
252 |
+
.b8 18
|
253 |
+
.b8 1
|
254 |
+
.b8 88
|
255 |
+
.b8 11
|
256 |
+
.b8 89
|
257 |
+
.b8 11
|
258 |
+
.b8 87
|
259 |
+
.b8 11
|
260 |
+
.b8 0
|
261 |
+
.b8 0
|
262 |
+
.b8 5
|
263 |
+
.b8 29
|
264 |
+
.b8 1
|
265 |
+
.b8 49
|
266 |
+
.b8 19
|
267 |
+
.b8 17
|
268 |
+
.b8 1
|
269 |
+
.b8 18
|
270 |
+
.b8 1
|
271 |
+
.b8 88
|
272 |
+
.b8 11
|
273 |
+
.b8 89
|
274 |
+
.b8 11
|
275 |
+
.b8 87
|
276 |
+
.b8 11
|
277 |
+
.b8 0
|
278 |
+
.b8 0
|
279 |
+
.b8 0
|
280 |
+
}
|
281 |
+
.section .debug_info
|
282 |
+
{
|
283 |
+
.b32 278
|
284 |
+
.b8 2
|
285 |
+
.b8 0
|
286 |
+
.b32 .debug_abbrev
|
287 |
+
.b8 8
|
288 |
+
.b8 1
|
289 |
+
.b8 116
|
290 |
+
.b8 114
|
291 |
+
.b8 105
|
292 |
+
.b8 116
|
293 |
+
.b8 111
|
294 |
+
.b8 110
|
295 |
+
.b8 0
|
296 |
+
.b8 2
|
297 |
+
.b8 0
|
298 |
+
.b8 99
|
299 |
+
.b8 107
|
300 |
+
.b8 122
|
301 |
+
.b8 103
|
302 |
+
.b8 108
|
303 |
+
.b8 55
|
304 |
+
.b8 116
|
305 |
+
.b8 104
|
306 |
+
.b8 98
|
307 |
+
.b8 52
|
308 |
+
.b8 120
|
309 |
+
.b8 100
|
310 |
+
.b8 102
|
311 |
+
.b8 107
|
312 |
+
.b8 102
|
313 |
+
.b8 110
|
314 |
+
.b8 100
|
315 |
+
.b8 50
|
316 |
+
.b8 116
|
317 |
+
.b8 105
|
318 |
+
.b8 100
|
319 |
+
.b8 107
|
320 |
+
.b8 115
|
321 |
+
.b8 54
|
322 |
+
.b8 109
|
323 |
+
.b8 116
|
324 |
+
.b8 53
|
325 |
+
.b8 102
|
326 |
+
.b8 51
|
327 |
+
.b8 104
|
328 |
+
.b8 97
|
329 |
+
.b8 117
|
330 |
+
.b8 119
|
331 |
+
.b8 102
|
332 |
+
.b8 121
|
333 |
+
.b8 106
|
334 |
+
.b8 102
|
335 |
+
.b8 108
|
336 |
+
.b8 98
|
337 |
+
.b8 116
|
338 |
+
.b8 122
|
339 |
+
.b8 121
|
340 |
+
.b8 101
|
341 |
+
.b8 112
|
342 |
+
.b8 111
|
343 |
+
.b8 53
|
344 |
+
.b8 111
|
345 |
+
.b8 120
|
346 |
+
.b8 107
|
347 |
+
.b8 118
|
348 |
+
.b8 104
|
349 |
+
.b8 107
|
350 |
+
.b8 46
|
351 |
+
.b8 112
|
352 |
+
.b8 121
|
353 |
+
.b8 0
|
354 |
+
.b32 .debug_line
|
355 |
+
.b8 47
|
356 |
+
.b8 116
|
357 |
+
.b8 109
|
358 |
+
.b8 112
|
359 |
+
.b8 47
|
360 |
+
.b8 116
|
361 |
+
.b8 111
|
362 |
+
.b8 114
|
363 |
+
.b8 99
|
364 |
+
.b8 104
|
365 |
+
.b8 105
|
366 |
+
.b8 110
|
367 |
+
.b8 100
|
368 |
+
.b8 117
|
369 |
+
.b8 99
|
370 |
+
.b8 116
|
371 |
+
.b8 111
|
372 |
+
.b8 114
|
373 |
+
.b8 95
|
374 |
+
.b8 114
|
375 |
+
.b8 111
|
376 |
+
.b8 111
|
377 |
+
.b8 116
|
378 |
+
.b8 47
|
379 |
+
.b8 107
|
380 |
+
.b8 122
|
381 |
+
.b8 0
|
382 |
+
.b8 1
|
383 |
+
.b64 $L__func_begin0
|
384 |
+
.b64 $L__func_end0
|
385 |
+
.b8 2
|
386 |
+
.b8 116
|
387 |
+
.b8 114
|
388 |
+
.b8 105
|
389 |
+
.b8 116
|
390 |
+
.b8 111
|
391 |
+
.b8 110
|
392 |
+
.b8 95
|
393 |
+
.b8 95
|
394 |
+
.b8 48
|
395 |
+
.b8 100
|
396 |
+
.b8 49
|
397 |
+
.b8 100
|
398 |
+
.b8 50
|
399 |
+
.b8 100
|
400 |
+
.b8 51
|
401 |
+
.b8 100
|
402 |
+
.b8 52
|
403 |
+
.b8 100
|
404 |
+
.b8 53
|
405 |
+
.b8 100
|
406 |
+
.b8 54
|
407 |
+
.b8 100
|
408 |
+
.b8 55
|
409 |
+
.b8 100
|
410 |
+
.b8 101
|
411 |
+
.b8 56
|
412 |
+
.b8 0
|
413 |
+
.b8 116
|
414 |
+
.b8 114
|
415 |
+
.b8 105
|
416 |
+
.b8 116
|
417 |
+
.b8 111
|
418 |
+
.b8 110
|
419 |
+
.b8 95
|
420 |
+
.b8 95
|
421 |
+
.b8 48
|
422 |
+
.b8 100
|
423 |
+
.b8 49
|
424 |
+
.b8 100
|
425 |
+
.b8 50
|
426 |
+
.b8 100
|
427 |
+
.b8 51
|
428 |
+
.b8 100
|
429 |
+
.b8 52
|
430 |
+
.b8 100
|
431 |
+
.b8 53
|
432 |
+
.b8 100
|
433 |
+
.b8 54
|
434 |
+
.b8 100
|
435 |
+
.b8 55
|
436 |
+
.b8 100
|
437 |
+
.b8 101
|
438 |
+
.b8 56
|
439 |
+
.b8 0
|
440 |
+
.b8 1
|
441 |
+
.b8 18
|
442 |
+
.b8 1
|
443 |
+
.b8 1
|
444 |
+
.b8 3
|
445 |
+
.b64 $L__func_begin0
|
446 |
+
.b64 $L__func_end0
|
447 |
+
.b8 1
|
448 |
+
.b8 156
|
449 |
+
.b32 125
|
450 |
+
.b8 4
|
451 |
+
.b32 125
|
452 |
+
.b64 $L__tmp1
|
453 |
+
.b64 $L__tmp4
|
454 |
+
.b8 2
|
455 |
+
.b8 46
|
456 |
+
.b8 27
|
457 |
+
.b8 5
|
458 |
+
.b32 125
|
459 |
+
.b64 $L__tmp2
|
460 |
+
.b64 $L__tmp5
|
461 |
+
.b8 2
|
462 |
+
.b8 46
|
463 |
+
.b8 27
|
464 |
+
.b8 4
|
465 |
+
.b32 125
|
466 |
+
.b64 $L__tmp2
|
467 |
+
.b64 $L__tmp5
|
468 |
+
.b8 2
|
469 |
+
.b8 243
|
470 |
+
.b8 36
|
471 |
+
.b8 0
|
472 |
+
.b8 0
|
473 |
+
.b8 0
|
474 |
+
}
|
475 |
+
.section .debug_pubnames
|
476 |
+
{
|
477 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
478 |
+
$L__pubNames_start0:
|
479 |
+
.b8 2
|
480 |
+
.b8 0
|
481 |
+
.b32 .debug_info
|
482 |
+
.b32 282
|
483 |
+
.b32 125
|
484 |
+
.b8 116
|
485 |
+
.b8 114
|
486 |
+
.b8 105
|
487 |
+
.b8 116
|
488 |
+
.b8 111
|
489 |
+
.b8 110
|
490 |
+
.b8 95
|
491 |
+
.b8 95
|
492 |
+
.b8 48
|
493 |
+
.b8 100
|
494 |
+
.b8 49
|
495 |
+
.b8 100
|
496 |
+
.b8 50
|
497 |
+
.b8 100
|
498 |
+
.b8 51
|
499 |
+
.b8 100
|
500 |
+
.b8 52
|
501 |
+
.b8 100
|
502 |
+
.b8 53
|
503 |
+
.b8 100
|
504 |
+
.b8 54
|
505 |
+
.b8 100
|
506 |
+
.b8 55
|
507 |
+
.b8 100
|
508 |
+
.b8 101
|
509 |
+
.b8 56
|
510 |
+
.b8 0
|
511 |
+
.b32 0
|
512 |
+
$L__pubNames_end0:
|
513 |
+
}
|
514 |
+
.section .debug_pubtypes
|
515 |
+
{
|
516 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
517 |
+
$L__pubTypes_start0:
|
518 |
+
.b8 2
|
519 |
+
.b8 0
|
520 |
+
.b32 .debug_info
|
521 |
+
.b32 282
|
522 |
+
.b32 0
|
523 |
+
$L__pubTypes_end0:
|
524 |
+
}
|
525 |
+
.section .debug_loc { }
|
.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ttgir
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<64x1xf32, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
|
6 |
+
%cst_1 = arith.constant dense<-1> : tensor<64x1xi64, #blocked>
|
7 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked>
|
8 |
+
%c64_i64 = arith.constant 64 : i64
|
9 |
+
%cst_3 = arith.constant dense<50257> : tensor<1x4xi64, #blocked>
|
10 |
+
%c0_i32 = arith.constant 0 : i32
|
11 |
+
%c4_i32 = arith.constant 4 : i32
|
12 |
+
%c50257_i32 = arith.constant 50257 : i32
|
13 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked>
|
14 |
+
%0 = tt.get_program_id x : i32
|
15 |
+
%1 = arith.extsi %0 : i32 to i64
|
16 |
+
%2 = arith.muli %1, %c64_i64 : i64
|
17 |
+
%3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
18 |
+
%4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
|
19 |
+
%5 = arith.extsi %4 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked>
|
20 |
+
%6 = tt.splat %2 : (i64) -> tensor<64x1xi64, #blocked>
|
21 |
+
%7 = arith.addi %6, %5 : tensor<64x1xi64, #blocked>
|
22 |
+
%8 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
23 |
+
%9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x4xi32, #blocked>
|
24 |
+
%10 = arith.extsi %9 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked>
|
25 |
+
%11 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
|
26 |
+
%12 = tt.addptr %11, %7 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi64, #blocked>
|
27 |
+
%13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
|
28 |
+
%14 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
|
29 |
+
%15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
|
30 |
+
%16 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
|
31 |
+
%17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
|
32 |
+
%18 = arith.muli %7, %cst_0 : tensor<64x1xi64, #blocked>
|
33 |
+
%19 = tt.broadcast %18 : (tensor<64x1xi64, #blocked>) -> tensor<64x4xi64, #blocked>
|
34 |
+
%20 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
|
35 |
+
%21 = arith.cmpi ne, %13, %cst_1 : tensor<64x1xi64, #blocked>
|
36 |
+
%22 = arith.divf %15, %17 : f32
|
37 |
+
%23 = tt.splat %22 : (f32) -> tensor<64x1xf32, #blocked>
|
38 |
+
%24 = arith.select %21, %23, %cst : tensor<64x1xi1, #blocked>, tensor<64x1xf32, #blocked>
|
39 |
+
%25 = tt.broadcast %24 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
|
40 |
+
%26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c4_i32 iter_args(%arg10 = %cst_2) -> (tensor<64x4xf32, #blocked>) : i32 {
|
41 |
+
%33 = arith.extsi %arg9 : i32 to i64
|
42 |
+
%34 = tt.splat %33 : (i64) -> tensor<1x4xi64, #blocked>
|
43 |
+
%35 = arith.addi %34, %10 : tensor<1x4xi64, #blocked>
|
44 |
+
%36 = arith.cmpi slt, %35, %cst_3 : tensor<1x4xi64, #blocked>
|
45 |
+
%37 = tt.broadcast %35 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
|
46 |
+
%38 = arith.addi %37, %19 : tensor<64x4xi64, #blocked>
|
47 |
+
%39 = tt.addptr %20, %38 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi64, #blocked>
|
48 |
+
%40 = tt.broadcast %36 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
|
49 |
+
%41 = tt.load %39, %40, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
|
50 |
+
%42 = arith.mulf %41, %25 : tensor<64x4xf32, #blocked>
|
51 |
+
%43 = arith.addf %arg10, %42 : tensor<64x4xf32, #blocked>
|
52 |
+
%44 = arith.select %40, %43, %arg10 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
|
53 |
+
scf.yield %44 : tensor<64x4xf32, #blocked>
|
54 |
+
}
|
55 |
+
%27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({
|
56 |
+
^bb0(%arg9: f32, %arg10: f32):
|
57 |
+
%33 = arith.addf %arg9, %arg10 : f32
|
58 |
+
tt.reduce.return %33 : f32
|
59 |
+
}) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
60 |
+
%28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
|
61 |
+
%29 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>, #blocked>
|
62 |
+
%30 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>, #blocked>
|
63 |
+
%31 = tt.broadcast %28 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
|
64 |
+
%32 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>, #blocked>
|
65 |
+
scf.for %arg9 = %c0_i32 to %c50257_i32 step %c4_i32 : i32 {
|
66 |
+
%33 = arith.extsi %arg9 : i32 to i64
|
67 |
+
%34 = tt.splat %33 : (i64) -> tensor<1x4xi64, #blocked>
|
68 |
+
%35 = arith.addi %34, %10 : tensor<1x4xi64, #blocked>
|
69 |
+
%36 = arith.cmpi slt, %35, %cst_3 : tensor<1x4xi64, #blocked>
|
70 |
+
%37 = tt.broadcast %35 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
|
71 |
+
%38 = arith.addi %37, %19 : tensor<64x4xi64, #blocked>
|
72 |
+
%39 = tt.addptr %29, %38 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi64, #blocked>
|
73 |
+
%40 = tt.broadcast %36 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
|
74 |
+
%41 = tt.load %39, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16, #blocked>
|
75 |
+
%42 = arith.extf %41 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked>
|
76 |
+
%43 = tt.addptr %20, %38 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi64, #blocked>
|
77 |
+
%44 = tt.load %43, %40, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
|
78 |
+
%45 = tt.addptr %30, %38 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi64, #blocked>
|
79 |
+
%46 = tt.load %45, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16, #blocked>
|
80 |
+
%47 = arith.extf %46 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked>
|
81 |
+
%48 = arith.mulf %44, %25 : tensor<64x4xf32, #blocked>
|
82 |
+
%49 = math.exp %47 : tensor<64x4xf32, #blocked>
|
83 |
+
%50 = arith.mulf %49, %31 : tensor<64x4xf32, #blocked>
|
84 |
+
%51 = arith.subf %48, %50 : tensor<64x4xf32, #blocked>
|
85 |
+
%52 = arith.addf %42, %51 : tensor<64x4xf32, #blocked>
|
86 |
+
%53 = tt.addptr %32, %38 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi64, #blocked>
|
87 |
+
%54 = arith.truncf %52 : tensor<64x4xf32, #blocked> to tensor<64x4xbf16, #blocked>
|
88 |
+
tt.store %53, %54, %40 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16, #blocked>
|
89 |
+
}
|
90 |
+
tt.return
|
91 |
+
}
|
92 |
+
}
|
.triton/dump/24a29ccba6fa75ba50c8f1e68f7098d8/triton_.ttir
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<64x4xbf16>
|
4 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<64x1xf32>
|
5 |
+
%c50257_i32 = arith.constant 50257 : i32
|
6 |
+
%c4_i32 = arith.constant 4 : i32
|
7 |
+
%c0_i32 = arith.constant 0 : i32
|
8 |
+
%cst_1 = arith.constant dense<50257> : tensor<64x1xi64>
|
9 |
+
%cst_2 = arith.constant dense<50257> : tensor<1x4xi64>
|
10 |
+
%c64_i64 = arith.constant 64 : i64
|
11 |
+
%cst_3 = arith.constant dense<-1> : tensor<64x1xi64>
|
12 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<64x4xf32>
|
13 |
+
%0 = tt.get_program_id x : i32
|
14 |
+
%1 = arith.extsi %0 : i32 to i64
|
15 |
+
%2 = arith.muli %1, %c64_i64 : i64
|
16 |
+
%3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
|
17 |
+
%4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
|
18 |
+
%5 = arith.extsi %4 : tensor<64x1xi32> to tensor<64x1xi64>
|
19 |
+
%6 = tt.splat %2 : (i64) -> tensor<64x1xi64>
|
20 |
+
%7 = arith.addi %6, %5 : tensor<64x1xi64>
|
21 |
+
%8 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
|
22 |
+
%9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32>
|
23 |
+
%10 = arith.extsi %9 : tensor<1x4xi32> to tensor<1x4xi64>
|
24 |
+
%11 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
|
25 |
+
%12 = tt.addptr %11, %7 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi64>
|
26 |
+
%13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
|
27 |
+
%14 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
|
28 |
+
%15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
|
29 |
+
%16 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
|
30 |
+
%17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
|
31 |
+
%18 = arith.muli %7, %cst_1 : tensor<64x1xi64>
|
32 |
+
%19 = tt.broadcast %18 : (tensor<64x1xi64>) -> tensor<64x4xi64>
|
33 |
+
%20 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
|
34 |
+
%21 = arith.cmpi ne, %13, %cst_3 : tensor<64x1xi64>
|
35 |
+
%22 = arith.divf %15, %17 : f32
|
36 |
+
%23 = tt.splat %22 : (f32) -> tensor<64x1xf32>
|
37 |
+
%24 = arith.select %21, %23, %cst_0 : tensor<64x1xi1>, tensor<64x1xf32>
|
38 |
+
%25 = tt.broadcast %24 : (tensor<64x1xf32>) -> tensor<64x4xf32>
|
39 |
+
%26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c4_i32 iter_args(%arg10 = %cst_4) -> (tensor<64x4xf32>) : i32 {
|
40 |
+
%41 = arith.extsi %arg9 : i32 to i64
|
41 |
+
%42 = tt.splat %41 : (i64) -> tensor<1x4xi64>
|
42 |
+
%43 = arith.addi %42, %10 : tensor<1x4xi64>
|
43 |
+
%44 = arith.cmpi slt, %43, %cst_2 : tensor<1x4xi64>
|
44 |
+
%45 = tt.broadcast %43 : (tensor<1x4xi64>) -> tensor<64x4xi64>
|
45 |
+
%46 = arith.addi %45, %19 : tensor<64x4xi64>
|
46 |
+
%47 = tt.addptr %20, %46 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
|
47 |
+
%48 = tt.broadcast %44 : (tensor<1x4xi1>) -> tensor<64x4xi1>
|
48 |
+
%49 = tt.load %47, %48, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
|
49 |
+
%50 = arith.mulf %49, %25 : tensor<64x4xf32>
|
50 |
+
%51 = arith.addf %arg10, %50 : tensor<64x4xf32>
|
51 |
+
%52 = arith.select %48, %51, %arg10 : tensor<64x4xi1>, tensor<64x4xf32>
|
52 |
+
scf.yield %52 : tensor<64x4xf32>
|
53 |
+
}
|
54 |
+
%27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({
|
55 |
+
^bb0(%arg9: f32, %arg10: f32):
|
56 |
+
%41 = arith.addf %arg9, %arg10 : f32
|
57 |
+
tt.reduce.return %41 : f32
|
58 |
+
}) : (tensor<64x4xf32>) -> tensor<64xf32>
|
59 |
+
%28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
60 |
+
%29 = arith.muli %7, %cst_1 : tensor<64x1xi64>
|
61 |
+
%30 = tt.broadcast %29 : (tensor<64x1xi64>) -> tensor<64x4xi64>
|
62 |
+
%31 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
|
63 |
+
%32 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
|
64 |
+
%33 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
|
65 |
+
%34 = arith.cmpi ne, %13, %cst_3 : tensor<64x1xi64>
|
66 |
+
%35 = arith.divf %15, %17 : f32
|
67 |
+
%36 = tt.splat %35 : (f32) -> tensor<64x1xf32>
|
68 |
+
%37 = arith.select %34, %36, %cst_0 : tensor<64x1xi1>, tensor<64x1xf32>
|
69 |
+
%38 = tt.broadcast %37 : (tensor<64x1xf32>) -> tensor<64x4xf32>
|
70 |
+
%39 = tt.broadcast %28 : (tensor<64x1xf32>) -> tensor<64x4xf32>
|
71 |
+
%40 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
|
72 |
+
scf.for %arg9 = %c0_i32 to %c50257_i32 step %c4_i32 : i32 {
|
73 |
+
%41 = arith.extsi %arg9 : i32 to i64
|
74 |
+
%42 = tt.splat %41 : (i64) -> tensor<1x4xi64>
|
75 |
+
%43 = arith.addi %42, %10 : tensor<1x4xi64>
|
76 |
+
%44 = arith.cmpi slt, %43, %cst_2 : tensor<1x4xi64>
|
77 |
+
%45 = tt.broadcast %43 : (tensor<1x4xi64>) -> tensor<64x4xi64>
|
78 |
+
%46 = arith.addi %45, %30 : tensor<64x4xi64>
|
79 |
+
%47 = tt.addptr %31, %46 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi64>
|
80 |
+
%48 = tt.broadcast %44 : (tensor<1x4xi1>) -> tensor<64x4xi1>
|
81 |
+
%49 = tt.load %47, %48, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16>
|
82 |
+
%50 = arith.extf %49 : tensor<64x4xbf16> to tensor<64x4xf32>
|
83 |
+
%51 = tt.addptr %32, %46 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
|
84 |
+
%52 = tt.load %51, %48, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32>
|
85 |
+
%53 = tt.addptr %33, %46 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi64>
|
86 |
+
%54 = tt.load %53, %48, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16>
|
87 |
+
%55 = arith.extf %54 : tensor<64x4xbf16> to tensor<64x4xf32>
|
88 |
+
%56 = arith.mulf %52, %38 : tensor<64x4xf32>
|
89 |
+
%57 = math.exp %55 : tensor<64x4xf32>
|
90 |
+
%58 = arith.mulf %57, %39 : tensor<64x4xf32>
|
91 |
+
%59 = arith.subf %56, %58 : tensor<64x4xf32>
|
92 |
+
%60 = arith.addf %50, %59 : tensor<64x4xf32>
|
93 |
+
%61 = tt.addptr %40, %46 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi64>
|
94 |
+
%62 = arith.truncf %60 : tensor<64x4xf32> to tensor<64x4xbf16>
|
95 |
+
tt.store %61, %62, %48 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16>
|
96 |
+
}
|
97 |
+
tt.return
|
98 |
+
}
|
99 |
+
}
|
.triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ttir
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<8x512xbf16>
|
4 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<8x1xf32>
|
5 |
+
%c50257_i32 = arith.constant 50257 : i32
|
6 |
+
%c512_i32 = arith.constant 512 : i32
|
7 |
+
%c0_i32 = arith.constant 0 : i32
|
8 |
+
%cst_1 = arith.constant dense<50257> : tensor<8x1xi64>
|
9 |
+
%cst_2 = arith.constant dense<50257> : tensor<1x512xi64>
|
10 |
+
%c8_i64 = arith.constant 8 : i64
|
11 |
+
%cst_3 = arith.constant dense<-1> : tensor<8x1xi64>
|
12 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<8x512xf32>
|
13 |
+
%0 = tt.get_program_id x : i32
|
14 |
+
%1 = arith.extsi %0 : i32 to i64
|
15 |
+
%2 = arith.muli %1, %c8_i64 : i64
|
16 |
+
%3 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
|
17 |
+
%4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<8xi32>) -> tensor<8x1xi32>
|
18 |
+
%5 = arith.extsi %4 : tensor<8x1xi32> to tensor<8x1xi64>
|
19 |
+
%6 = tt.splat %2 : (i64) -> tensor<8x1xi64>
|
20 |
+
%7 = arith.addi %6, %5 : tensor<8x1xi64>
|
21 |
+
%8 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
|
22 |
+
%9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<512xi32>) -> tensor<1x512xi32>
|
23 |
+
%10 = arith.extsi %9 : tensor<1x512xi32> to tensor<1x512xi64>
|
24 |
+
%11 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<8x1x!tt.ptr<i64, 1>>
|
25 |
+
%12 = tt.addptr %11, %7 : tensor<8x1x!tt.ptr<i64, 1>>, tensor<8x1xi64>
|
26 |
+
%13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<8x1xi64>
|
27 |
+
%14 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
|
28 |
+
%15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
|
29 |
+
%16 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
|
30 |
+
%17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
|
31 |
+
%18 = arith.muli %7, %cst_1 : tensor<8x1xi64>
|
32 |
+
%19 = tt.broadcast %18 : (tensor<8x1xi64>) -> tensor<8x512xi64>
|
33 |
+
%20 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<8x512x!tt.ptr<f32, 1>>
|
34 |
+
%21 = arith.cmpi ne, %13, %cst_3 : tensor<8x1xi64>
|
35 |
+
%22 = arith.divf %15, %17 : f32
|
36 |
+
%23 = tt.splat %22 : (f32) -> tensor<8x1xf32>
|
37 |
+
%24 = arith.select %21, %23, %cst_0 : tensor<8x1xi1>, tensor<8x1xf32>
|
38 |
+
%25 = tt.broadcast %24 : (tensor<8x1xf32>) -> tensor<8x512xf32>
|
39 |
+
%26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c512_i32 iter_args(%arg10 = %cst_4) -> (tensor<8x512xf32>) : i32 {
|
40 |
+
%41 = arith.extsi %arg9 : i32 to i64
|
41 |
+
%42 = tt.splat %41 : (i64) -> tensor<1x512xi64>
|
42 |
+
%43 = arith.addi %42, %10 : tensor<1x512xi64>
|
43 |
+
%44 = arith.cmpi slt, %43, %cst_2 : tensor<1x512xi64>
|
44 |
+
%45 = tt.broadcast %43 : (tensor<1x512xi64>) -> tensor<8x512xi64>
|
45 |
+
%46 = arith.addi %45, %19 : tensor<8x512xi64>
|
46 |
+
%47 = tt.addptr %20, %46 : tensor<8x512x!tt.ptr<f32, 1>>, tensor<8x512xi64>
|
47 |
+
%48 = tt.broadcast %44 : (tensor<1x512xi1>) -> tensor<8x512xi1>
|
48 |
+
%49 = tt.load %47, %48, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<8x512xf32>
|
49 |
+
%50 = arith.mulf %49, %25 : tensor<8x512xf32>
|
50 |
+
%51 = arith.addf %arg10, %50 : tensor<8x512xf32>
|
51 |
+
%52 = arith.select %48, %51, %arg10 : tensor<8x512xi1>, tensor<8x512xf32>
|
52 |
+
scf.yield %52 : tensor<8x512xf32>
|
53 |
+
}
|
54 |
+
%27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({
|
55 |
+
^bb0(%arg9: f32, %arg10: f32):
|
56 |
+
%41 = arith.addf %arg9, %arg10 : f32
|
57 |
+
tt.reduce.return %41 : f32
|
58 |
+
}) : (tensor<8x512xf32>) -> tensor<8xf32>
|
59 |
+
%28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<8xf32>) -> tensor<8x1xf32>
|
60 |
+
%29 = arith.muli %7, %cst_1 : tensor<8x1xi64>
|
61 |
+
%30 = tt.broadcast %29 : (tensor<8x1xi64>) -> tensor<8x512xi64>
|
62 |
+
%31 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<8x512x!tt.ptr<bf16, 1>>
|
63 |
+
%32 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<8x512x!tt.ptr<f32, 1>>
|
64 |
+
%33 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<8x512x!tt.ptr<bf16, 1>>
|
65 |
+
%34 = arith.cmpi ne, %13, %cst_3 : tensor<8x1xi64>
|
66 |
+
%35 = arith.divf %15, %17 : f32
|
67 |
+
%36 = tt.splat %35 : (f32) -> tensor<8x1xf32>
|
68 |
+
%37 = arith.select %34, %36, %cst_0 : tensor<8x1xi1>, tensor<8x1xf32>
|
69 |
+
%38 = tt.broadcast %37 : (tensor<8x1xf32>) -> tensor<8x512xf32>
|
70 |
+
%39 = tt.broadcast %28 : (tensor<8x1xf32>) -> tensor<8x512xf32>
|
71 |
+
%40 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<8x512x!tt.ptr<bf16, 1>>
|
72 |
+
scf.for %arg9 = %c0_i32 to %c50257_i32 step %c512_i32 : i32 {
|
73 |
+
%41 = arith.extsi %arg9 : i32 to i64
|
74 |
+
%42 = tt.splat %41 : (i64) -> tensor<1x512xi64>
|
75 |
+
%43 = arith.addi %42, %10 : tensor<1x512xi64>
|
76 |
+
%44 = arith.cmpi slt, %43, %cst_2 : tensor<1x512xi64>
|
77 |
+
%45 = tt.broadcast %43 : (tensor<1x512xi64>) -> tensor<8x512xi64>
|
78 |
+
%46 = arith.addi %45, %30 : tensor<8x512xi64>
|
79 |
+
%47 = tt.addptr %31, %46 : tensor<8x512x!tt.ptr<bf16, 1>>, tensor<8x512xi64>
|
80 |
+
%48 = tt.broadcast %44 : (tensor<1x512xi1>) -> tensor<8x512xi1>
|
81 |
+
%49 = tt.load %47, %48, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xbf16>
|
82 |
+
%50 = arith.extf %49 : tensor<8x512xbf16> to tensor<8x512xf32>
|
83 |
+
%51 = tt.addptr %32, %46 : tensor<8x512x!tt.ptr<f32, 1>>, tensor<8x512xi64>
|
84 |
+
%52 = tt.load %51, %48, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xf32>
|
85 |
+
%53 = tt.addptr %33, %46 : tensor<8x512x!tt.ptr<bf16, 1>>, tensor<8x512xi64>
|
86 |
+
%54 = tt.load %53, %48, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xbf16>
|
87 |
+
%55 = arith.extf %54 : tensor<8x512xbf16> to tensor<8x512xf32>
|
88 |
+
%56 = arith.mulf %52, %38 : tensor<8x512xf32>
|
89 |
+
%57 = math.exp %55 : tensor<8x512xf32>
|
90 |
+
%58 = arith.mulf %57, %39 : tensor<8x512xf32>
|
91 |
+
%59 = arith.subf %56, %58 : tensor<8x512xf32>
|
92 |
+
%60 = arith.addf %50, %59 : tensor<8x512xf32>
|
93 |
+
%61 = tt.addptr %40, %46 : tensor<8x512x!tt.ptr<bf16, 1>>, tensor<8x512xi64>
|
94 |
+
%62 = arith.truncf %60 : tensor<8x512xf32> to tensor<8x512xbf16>
|
95 |
+
tt.store %61, %62, %48 {cache = 1 : i32, evict = 1 : i32} : tensor<8x512xbf16>
|
96 |
+
}
|
97 |
+
tt.return
|
98 |
+
}
|
99 |
+
}
|
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.cubin
ADDED
Binary file (4.52 kB). View file
|
|
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.llir
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 {
|
5 |
+
%3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%4 = shl i32 %3, 1, !dbg !8
|
7 |
+
%5 = and i32 %4, 510, !dbg !8
|
8 |
+
%6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
9 |
+
%7 = shl i32 %6, 9, !dbg !10
|
10 |
+
%8 = or i32 %7, %5, !dbg !11
|
11 |
+
%9 = icmp slt i32 %8, 12865792, !dbg !12
|
12 |
+
%10 = sext i32 %8 to i64, !dbg !13
|
13 |
+
%11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !13
|
14 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 0, i32 0, ptr addrspace(1) %11, i1 %9) #1, !dbg !14
|
15 |
+
ret void, !dbg !15
|
16 |
+
}
|
17 |
+
|
18 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
19 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
20 |
+
|
21 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
22 |
+
attributes #1 = { nounwind }
|
23 |
+
|
24 |
+
!llvm.module.flags = !{!0}
|
25 |
+
!llvm.dbg.cu = !{!1}
|
26 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
27 |
+
|
28 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
29 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
30 |
+
!2 = !DIFile(filename: "c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py", directory: "/tmp/torchinductor_root/4y")
|
31 |
+
!3 = !{ptr @triton__0d1de, !"kernel", i32 1}
|
32 |
+
!4 = !{ptr @triton__0d1de, !"maxntidx", i32 256}
|
33 |
+
!5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
34 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
35 |
+
!7 = !{}
|
36 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
37 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
38 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
39 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
40 |
+
!12 = !DILocation(line: 22, column: 21, scope: !5)
|
41 |
+
!13 = !DILocation(line: 25, column: 25, scope: !5)
|
42 |
+
!14 = !DILocation(line: 25, column: 36, scope: !5)
|
43 |
+
!15 = !DILocation(line: 25, column: 4, scope: !5)
|
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ptx
ADDED
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1de
|
10 |
+
|
11 |
+
.visible .entry triton__0d1de(
|
12 |
+
.param .u64 triton__0d1de_param_0,
|
13 |
+
.param .u32 triton__0d1de_param_1
|
14 |
+
)
|
15 |
+
.maxntid 256, 1, 1
|
16 |
+
{
|
17 |
+
.reg .pred %p<2>;
|
18 |
+
.reg .b32 %r<9>;
|
19 |
+
.reg .b64 %rd<4>;
|
20 |
+
.loc 1 18 0
|
21 |
+
$L__func_begin0:
|
22 |
+
.loc 1 18 0
|
23 |
+
|
24 |
+
ld.param.u64 %rd2, [triton__0d1de_param_0];
|
25 |
+
$L__tmp0:
|
26 |
+
.loc 1 21 36
|
27 |
+
mov.u32 %r4, %tid.x;
|
28 |
+
shl.b32 %r5, %r4, 1;
|
29 |
+
and.b32 %r6, %r5, 510;
|
30 |
+
.loc 1 20 28
|
31 |
+
mov.u32 %r1, %ctaid.x;
|
32 |
+
.loc 1 20 33
|
33 |
+
shl.b32 %r7, %r1, 9;
|
34 |
+
.loc 1 21 23
|
35 |
+
or.b32 %r8, %r7, %r6;
|
36 |
+
.loc 1 22 21
|
37 |
+
setp.lt.s32 %p1, %r8, 12865792;
|
38 |
+
.loc 1 25 25
|
39 |
+
mul.wide.s32 %rd3, %r8, 4;
|
40 |
+
add.s64 %rd1, %rd2, %rd3;
|
41 |
+
mov.b32 %r2, 0;
|
42 |
+
.loc 1 25 36
|
43 |
+
@%p1 st.global.v2.b32 [ %rd1 + 0 ], { %r2, %r2 };
|
44 |
+
.loc 1 25 4
|
45 |
+
ret;
|
46 |
+
$L__tmp1:
|
47 |
+
$L__func_end0:
|
48 |
+
|
49 |
+
}
|
50 |
+
.file 1 "/tmp/torchinductor_root/4y/c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py"
|
51 |
+
.section .debug_abbrev
|
52 |
+
{
|
53 |
+
.b8 1
|
54 |
+
.b8 17
|
55 |
+
.b8 1
|
56 |
+
.b8 37
|
57 |
+
.b8 8
|
58 |
+
.b8 19
|
59 |
+
.b8 5
|
60 |
+
.b8 3
|
61 |
+
.b8 8
|
62 |
+
.b8 16
|
63 |
+
.b8 6
|
64 |
+
.b8 27
|
65 |
+
.b8 8
|
66 |
+
.b8 180
|
67 |
+
.b8 66
|
68 |
+
.b8 12
|
69 |
+
.b8 17
|
70 |
+
.b8 1
|
71 |
+
.b8 18
|
72 |
+
.b8 1
|
73 |
+
.b8 0
|
74 |
+
.b8 0
|
75 |
+
.b8 2
|
76 |
+
.b8 46
|
77 |
+
.b8 0
|
78 |
+
.b8 17
|
79 |
+
.b8 1
|
80 |
+
.b8 18
|
81 |
+
.b8 1
|
82 |
+
.b8 64
|
83 |
+
.b8 10
|
84 |
+
.b8 135
|
85 |
+
.b8 64
|
86 |
+
.b8 8
|
87 |
+
.b8 3
|
88 |
+
.b8 8
|
89 |
+
.b8 58
|
90 |
+
.b8 11
|
91 |
+
.b8 59
|
92 |
+
.b8 11
|
93 |
+
.b8 63
|
94 |
+
.b8 12
|
95 |
+
.b8 0
|
96 |
+
.b8 0
|
97 |
+
.b8 0
|
98 |
+
}
|
99 |
+
.section .debug_info
|
100 |
+
{
|
101 |
+
.b32 172
|
102 |
+
.b8 2
|
103 |
+
.b8 0
|
104 |
+
.b32 .debug_abbrev
|
105 |
+
.b8 8
|
106 |
+
.b8 1
|
107 |
+
.b8 116
|
108 |
+
.b8 114
|
109 |
+
.b8 105
|
110 |
+
.b8 116
|
111 |
+
.b8 111
|
112 |
+
.b8 110
|
113 |
+
.b8 0
|
114 |
+
.b8 2
|
115 |
+
.b8 0
|
116 |
+
.b8 99
|
117 |
+
.b8 52
|
118 |
+
.b8 121
|
119 |
+
.b8 115
|
120 |
+
.b8 101
|
121 |
+
.b8 108
|
122 |
+
.b8 100
|
123 |
+
.b8 119
|
124 |
+
.b8 109
|
125 |
+
.b8 117
|
126 |
+
.b8 51
|
127 |
+
.b8 116
|
128 |
+
.b8 111
|
129 |
+
.b8 53
|
130 |
+
.b8 50
|
131 |
+
.b8 112
|
132 |
+
.b8 98
|
133 |
+
.b8 104
|
134 |
+
.b8 50
|
135 |
+
.b8 109
|
136 |
+
.b8 100
|
137 |
+
.b8 50
|
138 |
+
.b8 111
|
139 |
+
.b8 101
|
140 |
+
.b8 117
|
141 |
+
.b8 102
|
142 |
+
.b8 114
|
143 |
+
.b8 113
|
144 |
+
.b8 51
|
145 |
+
.b8 102
|
146 |
+
.b8 99
|
147 |
+
.b8 100
|
148 |
+
.b8 109
|
149 |
+
.b8 97
|
150 |
+
.b8 112
|
151 |
+
.b8 107
|
152 |
+
.b8 116
|
153 |
+
.b8 52
|
154 |
+
.b8 110
|
155 |
+
.b8 120
|
156 |
+
.b8 100
|
157 |
+
.b8 122
|
158 |
+
.b8 109
|
159 |
+
.b8 121
|
160 |
+
.b8 113
|
161 |
+
.b8 116
|
162 |
+
.b8 103
|
163 |
+
.b8 100
|
164 |
+
.b8 50
|
165 |
+
.b8 121
|
166 |
+
.b8 115
|
167 |
+
.b8 112
|
168 |
+
.b8 46
|
169 |
+
.b8 112
|
170 |
+
.b8 121
|
171 |
+
.b8 0
|
172 |
+
.b32 .debug_line
|
173 |
+
.b8 47
|
174 |
+
.b8 116
|
175 |
+
.b8 109
|
176 |
+
.b8 112
|
177 |
+
.b8 47
|
178 |
+
.b8 116
|
179 |
+
.b8 111
|
180 |
+
.b8 114
|
181 |
+
.b8 99
|
182 |
+
.b8 104
|
183 |
+
.b8 105
|
184 |
+
.b8 110
|
185 |
+
.b8 100
|
186 |
+
.b8 117
|
187 |
+
.b8 99
|
188 |
+
.b8 116
|
189 |
+
.b8 111
|
190 |
+
.b8 114
|
191 |
+
.b8 95
|
192 |
+
.b8 114
|
193 |
+
.b8 111
|
194 |
+
.b8 111
|
195 |
+
.b8 116
|
196 |
+
.b8 47
|
197 |
+
.b8 52
|
198 |
+
.b8 121
|
199 |
+
.b8 0
|
200 |
+
.b8 1
|
201 |
+
.b64 $L__func_begin0
|
202 |
+
.b64 $L__func_end0
|
203 |
+
.b8 2
|
204 |
+
.b64 $L__func_begin0
|
205 |
+
.b64 $L__func_end0
|
206 |
+
.b8 1
|
207 |
+
.b8 156
|
208 |
+
.b8 116
|
209 |
+
.b8 114
|
210 |
+
.b8 105
|
211 |
+
.b8 116
|
212 |
+
.b8 111
|
213 |
+
.b8 110
|
214 |
+
.b8 95
|
215 |
+
.b8 95
|
216 |
+
.b8 48
|
217 |
+
.b8 100
|
218 |
+
.b8 49
|
219 |
+
.b8 100
|
220 |
+
.b8 101
|
221 |
+
.b8 0
|
222 |
+
.b8 116
|
223 |
+
.b8 114
|
224 |
+
.b8 105
|
225 |
+
.b8 116
|
226 |
+
.b8 111
|
227 |
+
.b8 110
|
228 |
+
.b8 95
|
229 |
+
.b8 95
|
230 |
+
.b8 48
|
231 |
+
.b8 100
|
232 |
+
.b8 49
|
233 |
+
.b8 100
|
234 |
+
.b8 101
|
235 |
+
.b8 0
|
236 |
+
.b8 1
|
237 |
+
.b8 18
|
238 |
+
.b8 1
|
239 |
+
.b8 0
|
240 |
+
}
|
241 |
+
.section .debug_pubnames
|
242 |
+
{
|
243 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
244 |
+
$L__pubNames_start0:
|
245 |
+
.b8 2
|
246 |
+
.b8 0
|
247 |
+
.b32 .debug_info
|
248 |
+
.b32 176
|
249 |
+
.b32 125
|
250 |
+
.b8 116
|
251 |
+
.b8 114
|
252 |
+
.b8 105
|
253 |
+
.b8 116
|
254 |
+
.b8 111
|
255 |
+
.b8 110
|
256 |
+
.b8 95
|
257 |
+
.b8 95
|
258 |
+
.b8 48
|
259 |
+
.b8 100
|
260 |
+
.b8 49
|
261 |
+
.b8 100
|
262 |
+
.b8 101
|
263 |
+
.b8 0
|
264 |
+
.b32 0
|
265 |
+
$L__pubNames_end0:
|
266 |
+
}
|
267 |
+
.section .debug_pubtypes
|
268 |
+
{
|
269 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
270 |
+
$L__pubTypes_start0:
|
271 |
+
.b8 2
|
272 |
+
.b8 0
|
273 |
+
.b32 .debug_info
|
274 |
+
.b32 176
|
275 |
+
.b32 0
|
276 |
+
$L__pubTypes_end0:
|
277 |
+
}
|
278 |
+
.section .debug_loc { }
|
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttgir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<12865792> : tensor<512xi32, #blocked>
|
5 |
+
%c512_i32 = arith.constant 512 : i32
|
6 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked>
|
7 |
+
%0 = tt.get_program_id x : i32
|
8 |
+
%1 = arith.muli %0, %c512_i32 : i32
|
9 |
+
%2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
|
10 |
+
%3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
|
11 |
+
%4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
|
12 |
+
%5 = arith.cmpi slt, %4, %cst : tensor<512xi32, #blocked>
|
13 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
|
14 |
+
%7 = tt.addptr %6, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
|
15 |
+
tt.store %7, %cst_0, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/36c33e585c0636c631d3aeea97f0cc97/triton_.ttir
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<512xf32>
|
4 |
+
%cst_0 = arith.constant dense<12865792> : tensor<512xi32>
|
5 |
+
%c512_i32 = arith.constant 512 : i32
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.muli %0, %c512_i32 : i32
|
8 |
+
%2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
|
9 |
+
%3 = tt.splat %1 : (i32) -> tensor<512xi32>
|
10 |
+
%4 = arith.addi %3, %2 : tensor<512xi32>
|
11 |
+
%5 = arith.cmpi slt, %4, %cst_0 : tensor<512xi32>
|
12 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
|
13 |
+
%7 = tt.addptr %6, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
|
14 |
+
tt.store %7, %cst, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32>
|
15 |
+
tt.return
|
16 |
+
}
|
17 |
+
}
|
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.cubin
ADDED
Binary file (15 kB). View file
|
|
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.llir
ADDED
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3d4d5d6d7de8de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8) local_unnamed_addr !dbg !5 {
|
7 |
+
%10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%11 = and i32 %10, 31, !dbg !8
|
9 |
+
%12 = lshr i32 %10, 5, !dbg !8
|
10 |
+
%13 = and i32 %12, 1, !dbg !8
|
11 |
+
%urem = shl i32 %10, 2, !dbg !8
|
12 |
+
%14 = and i32 %urem, 252, !dbg !8
|
13 |
+
%15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
|
14 |
+
%16 = shl i32 %15, 8, !dbg !10
|
15 |
+
%17 = or i32 %16, %14, !dbg !11
|
16 |
+
%18 = sext i32 %17 to i64, !dbg !12
|
17 |
+
%19 = getelementptr i16, ptr addrspace(1) %1, i64 %18, !dbg !12
|
18 |
+
%20 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %19, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13
|
19 |
+
%21 = extractvalue { i32, i32 } %20, 0, !dbg !13
|
20 |
+
%22 = extractvalue { i32, i32 } %20, 1, !dbg !13
|
21 |
+
%23 = trunc i32 %21 to i16, !dbg !13
|
22 |
+
%extelt.offset = lshr i32 %21, 16, !dbg !13
|
23 |
+
%24 = trunc i32 %extelt.offset to i16, !dbg !13
|
24 |
+
%25 = trunc i32 %22 to i16, !dbg !13
|
25 |
+
%extelt.offset1 = lshr i32 %22, 16, !dbg !13
|
26 |
+
%26 = trunc i32 %extelt.offset1 to i16, !dbg !13
|
27 |
+
%27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #3, !dbg !14
|
28 |
+
%28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #3, !dbg !14
|
29 |
+
%29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #3, !dbg !14
|
30 |
+
%30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %26) #3, !dbg !14
|
31 |
+
%31 = zext nneg i32 %14 to i64, !dbg !15
|
32 |
+
%32 = getelementptr float, ptr addrspace(1) %2, i64 %31, !dbg !15
|
33 |
+
%33 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %32, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16
|
34 |
+
%34 = extractvalue { i32, i32, i32, i32 } %33, 0, !dbg !16
|
35 |
+
%35 = extractvalue { i32, i32, i32, i32 } %33, 1, !dbg !16
|
36 |
+
%36 = extractvalue { i32, i32, i32, i32 } %33, 2, !dbg !16
|
37 |
+
%37 = extractvalue { i32, i32, i32, i32 } %33, 3, !dbg !16
|
38 |
+
%38 = bitcast i32 %34 to float, !dbg !16
|
39 |
+
%39 = bitcast i32 %35 to float, !dbg !16
|
40 |
+
%40 = bitcast i32 %36 to float, !dbg !16
|
41 |
+
%41 = bitcast i32 %37 to float, !dbg !16
|
42 |
+
%42 = getelementptr float, ptr addrspace(1) %3, i64 %18, !dbg !17
|
43 |
+
%43 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %42, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18
|
44 |
+
%44 = extractvalue { i32, i32, i32, i32 } %43, 0, !dbg !18
|
45 |
+
%45 = extractvalue { i32, i32, i32, i32 } %43, 1, !dbg !18
|
46 |
+
%46 = extractvalue { i32, i32, i32, i32 } %43, 2, !dbg !18
|
47 |
+
%47 = extractvalue { i32, i32, i32, i32 } %43, 3, !dbg !18
|
48 |
+
%48 = bitcast i32 %44 to float, !dbg !18
|
49 |
+
%49 = bitcast i32 %45 to float, !dbg !18
|
50 |
+
%50 = bitcast i32 %46 to float, !dbg !18
|
51 |
+
%51 = bitcast i32 %47 to float, !dbg !18
|
52 |
+
%52 = sext i32 %15 to i64, !dbg !19
|
53 |
+
%53 = getelementptr float, ptr addrspace(1) %4, i64 %52, !dbg !19
|
54 |
+
%54 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20
|
55 |
+
%55 = bitcast i32 %54 to float, !dbg !20
|
56 |
+
%56 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20
|
57 |
+
%57 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20
|
58 |
+
%58 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %53, i1 true) #3, !dbg !20
|
59 |
+
%59 = getelementptr float, ptr addrspace(1) %5, i64 %52, !dbg !21
|
60 |
+
%60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22
|
61 |
+
%61 = bitcast i32 %60 to float, !dbg !22
|
62 |
+
%62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22
|
63 |
+
%63 = bitcast i32 %62 to float, !dbg !22
|
64 |
+
%64 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22
|
65 |
+
%65 = bitcast i32 %64 to float, !dbg !22
|
66 |
+
%66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %59, i1 true) #3, !dbg !22
|
67 |
+
%67 = bitcast i32 %66 to float, !dbg !22
|
68 |
+
%68 = getelementptr float, ptr addrspace(1) %0, i64 %18, !dbg !23
|
69 |
+
%69 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %68, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !24
|
70 |
+
%70 = extractvalue { i32, i32, i32, i32 } %69, 0, !dbg !24
|
71 |
+
%71 = extractvalue { i32, i32, i32, i32 } %69, 1, !dbg !24
|
72 |
+
%72 = extractvalue { i32, i32, i32, i32 } %69, 2, !dbg !24
|
73 |
+
%73 = extractvalue { i32, i32, i32, i32 } %69, 3, !dbg !24
|
74 |
+
%74 = bitcast i32 %70 to float, !dbg !24
|
75 |
+
%75 = bitcast i32 %71 to float, !dbg !24
|
76 |
+
%76 = bitcast i32 %72 to float, !dbg !24
|
77 |
+
%77 = bitcast i32 %73 to float, !dbg !24
|
78 |
+
%78 = fmul float %27, %38, !dbg !25
|
79 |
+
%79 = fmul float %28, %39, !dbg !25
|
80 |
+
%80 = fmul float %29, %40, !dbg !25
|
81 |
+
%81 = fmul float %30, %41, !dbg !25
|
82 |
+
%82 = fadd float %78, %79, !dbg !26
|
83 |
+
%83 = fadd float %80, %82, !dbg !26
|
84 |
+
%84 = fadd float %81, %83, !dbg !26
|
85 |
+
%85 = bitcast float %84 to i32, !dbg !32
|
86 |
+
%86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 16, i32 31), !dbg !32
|
87 |
+
%87 = bitcast i32 %86 to float, !dbg !32
|
88 |
+
%88 = fadd float %84, %87, !dbg !26
|
89 |
+
%89 = bitcast float %88 to i32, !dbg !32
|
90 |
+
%90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 8, i32 31), !dbg !32
|
91 |
+
%91 = bitcast i32 %90 to float, !dbg !32
|
92 |
+
%92 = fadd float %88, %91, !dbg !26
|
93 |
+
%93 = bitcast float %92 to i32, !dbg !32
|
94 |
+
%94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 4, i32 31), !dbg !32
|
95 |
+
%95 = bitcast i32 %94 to float, !dbg !32
|
96 |
+
%96 = fadd float %92, %95, !dbg !26
|
97 |
+
%97 = bitcast float %96 to i32, !dbg !32
|
98 |
+
%98 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %97, i32 2, i32 31), !dbg !32
|
99 |
+
%99 = bitcast i32 %98 to float, !dbg !32
|
100 |
+
%100 = fadd float %96, %99, !dbg !26
|
101 |
+
%101 = bitcast float %100 to i32, !dbg !32
|
102 |
+
%102 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %101, i32 1, i32 31), !dbg !32
|
103 |
+
%103 = bitcast i32 %102 to float, !dbg !32
|
104 |
+
%104 = fadd float %100, %103, !dbg !26
|
105 |
+
%105 = icmp eq i32 %11, 0, !dbg !32
|
106 |
+
%106 = zext nneg i32 %13 to i64, !dbg !32
|
107 |
+
%107 = getelementptr float, ptr addrspace(3) @global_smem, i64 %106, !dbg !32
|
108 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, float %104, i1 %105) #3, !dbg !32
|
109 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !32
|
110 |
+
%108 = icmp slt i32 %10, 2, !dbg !32
|
111 |
+
%109 = sext i32 %10 to i64, !dbg !32
|
112 |
+
%110 = getelementptr float, ptr addrspace(3) @global_smem, i64 %109, !dbg !32
|
113 |
+
%111 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %110, i1 %108) #3, !dbg !32
|
114 |
+
%112 = bitcast float %111 to i32, !dbg !32
|
115 |
+
%113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !32
|
116 |
+
%114 = bitcast i32 %113 to float, !dbg !32
|
117 |
+
%115 = fadd float %111, %114, !dbg !26
|
118 |
+
%116 = and i32 %10, 1, !dbg !32
|
119 |
+
%117 = icmp eq i32 %116, 0, !dbg !32
|
120 |
+
%118 = and i1 %108, %117, !dbg !32
|
121 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %115, i1 %118) #3, !dbg !32
|
122 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !32
|
123 |
+
%119 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
|
124 |
+
%120 = fadd float %119, 0.000000e+00, !dbg !34
|
125 |
+
%121 = fsub float %48, %55, !dbg !38
|
126 |
+
%122 = fsub float %49, %55, !dbg !38
|
127 |
+
%123 = fsub float %50, %55, !dbg !38
|
128 |
+
%124 = fsub float %51, %55, !dbg !38
|
129 |
+
%125 = fmul float %121, %61, !dbg !39
|
130 |
+
%126 = fmul float %122, %61, !dbg !39
|
131 |
+
%127 = fmul float %123, %61, !dbg !39
|
132 |
+
%128 = fmul float %124, %61, !dbg !39
|
133 |
+
%129 = fmul float %78, %125, !dbg !40
|
134 |
+
%130 = fmul float %79, %126, !dbg !40
|
135 |
+
%131 = fmul float %80, %127, !dbg !40
|
136 |
+
%132 = fmul float %81, %128, !dbg !40
|
137 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !41
|
138 |
+
%133 = fadd float %129, %130, !dbg !43
|
139 |
+
%134 = fadd float %131, %133, !dbg !43
|
140 |
+
%135 = fadd float %132, %134, !dbg !43
|
141 |
+
%136 = bitcast float %135 to i32, !dbg !41
|
142 |
+
%137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 16, i32 31), !dbg !41
|
143 |
+
%138 = bitcast i32 %137 to float, !dbg !41
|
144 |
+
%139 = fadd float %135, %138, !dbg !43
|
145 |
+
%140 = bitcast float %139 to i32, !dbg !41
|
146 |
+
%141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 8, i32 31), !dbg !41
|
147 |
+
%142 = bitcast i32 %141 to float, !dbg !41
|
148 |
+
%143 = fadd float %139, %142, !dbg !43
|
149 |
+
%144 = bitcast float %143 to i32, !dbg !41
|
150 |
+
%145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 4, i32 31), !dbg !41
|
151 |
+
%146 = bitcast i32 %145 to float, !dbg !41
|
152 |
+
%147 = fadd float %143, %146, !dbg !43
|
153 |
+
%148 = bitcast float %147 to i32, !dbg !41
|
154 |
+
%149 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %148, i32 2, i32 31), !dbg !41
|
155 |
+
%150 = bitcast i32 %149 to float, !dbg !41
|
156 |
+
%151 = fadd float %147, %150, !dbg !43
|
157 |
+
%152 = bitcast float %151 to i32, !dbg !41
|
158 |
+
%153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 1, i32 31), !dbg !41
|
159 |
+
%154 = bitcast i32 %153 to float, !dbg !41
|
160 |
+
%155 = fadd float %151, %154, !dbg !43
|
161 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %107, float %155, i1 %105) #3, !dbg !41
|
162 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !41
|
163 |
+
%156 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %110, i1 %108) #3, !dbg !41
|
164 |
+
%157 = bitcast float %156 to i32, !dbg !41
|
165 |
+
%158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 1, i32 31), !dbg !41
|
166 |
+
%159 = bitcast i32 %158 to float, !dbg !41
|
167 |
+
%160 = fadd float %156, %159, !dbg !43
|
168 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %110, float %160, i1 %118) #3, !dbg !41
|
169 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !41
|
170 |
+
%161 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !41
|
171 |
+
%162 = fadd float %161, 0.000000e+00, !dbg !46
|
172 |
+
%163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %61, float 2.560000e+02) #3, !dbg !48
|
173 |
+
%164 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %63, float 2.560000e+02) #3, !dbg !48
|
174 |
+
%165 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %65, float 2.560000e+02) #3, !dbg !48
|
175 |
+
%166 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %67, float 2.560000e+02) #3, !dbg !48
|
176 |
+
%167 = fmul float %78, 2.560000e+02, !dbg !49
|
177 |
+
%168 = fmul float %79, 2.560000e+02, !dbg !49
|
178 |
+
%169 = fmul float %80, 2.560000e+02, !dbg !49
|
179 |
+
%170 = fmul float %81, 2.560000e+02, !dbg !49
|
180 |
+
%171 = fsub float %167, %120, !dbg !50
|
181 |
+
%172 = fsub float %168, %120, !dbg !50
|
182 |
+
%173 = fsub float %169, %120, !dbg !50
|
183 |
+
%174 = fsub float %170, %120, !dbg !50
|
184 |
+
%175 = fmul float %125, %162, !dbg !51
|
185 |
+
%176 = fmul float %126, %162, !dbg !51
|
186 |
+
%177 = fmul float %127, %162, !dbg !51
|
187 |
+
%178 = fmul float %128, %162, !dbg !51
|
188 |
+
%179 = fsub float %171, %175, !dbg !52
|
189 |
+
%180 = fsub float %172, %176, !dbg !52
|
190 |
+
%181 = fsub float %173, %177, !dbg !52
|
191 |
+
%182 = fsub float %174, %178, !dbg !52
|
192 |
+
%183 = fmul float %163, %179, !dbg !53
|
193 |
+
%184 = fmul float %163, %180, !dbg !53
|
194 |
+
%185 = fmul float %163, %181, !dbg !53
|
195 |
+
%186 = fmul float %163, %182, !dbg !53
|
196 |
+
%187 = fadd float %183, %74, !dbg !54
|
197 |
+
%188 = fadd float %184, %75, !dbg !54
|
198 |
+
%189 = fadd float %185, %76, !dbg !54
|
199 |
+
%190 = fadd float %186, %77, !dbg !54
|
200 |
+
%191 = bitcast float %187 to i32, !dbg !55
|
201 |
+
%192 = bitcast float %188 to i32, !dbg !55
|
202 |
+
%193 = bitcast float %189 to i32, !dbg !55
|
203 |
+
%194 = bitcast float %190 to i32, !dbg !55
|
204 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %191, i32 %192, i32 %193, i32 %194, ptr addrspace(1) %68, i1 true) #3, !dbg !55
|
205 |
+
%195 = getelementptr i16, ptr addrspace(1) %6, i64 %18, !dbg !56
|
206 |
+
%196 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %187) #3, !dbg !57
|
207 |
+
%197 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %188) #3, !dbg !57
|
208 |
+
%198 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %189) #3, !dbg !57
|
209 |
+
%199 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %190) #3, !dbg !57
|
210 |
+
%200 = insertelement <2 x i16> undef, i16 %196, i64 0, !dbg !57
|
211 |
+
%201 = insertelement <2 x i16> %200, i16 %197, i64 1, !dbg !57
|
212 |
+
%202 = bitcast <2 x i16> %201 to i32, !dbg !57
|
213 |
+
%203 = insertelement <2 x i16> undef, i16 %198, i64 0, !dbg !57
|
214 |
+
%204 = insertelement <2 x i16> %203, i16 %199, i64 1, !dbg !57
|
215 |
+
%205 = bitcast <2 x i16> %204 to i32, !dbg !57
|
216 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %202, i32 %205, ptr addrspace(1) %195, i1 true) #3, !dbg !57
|
217 |
+
ret void, !dbg !58
|
218 |
+
}
|
219 |
+
|
220 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
221 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
222 |
+
|
223 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
224 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
225 |
+
|
226 |
+
; Function Attrs: convergent nocallback nounwind
|
227 |
+
declare void @llvm.nvvm.barrier0() #2
|
228 |
+
|
229 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
230 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
231 |
+
attributes #2 = { convergent nocallback nounwind }
|
232 |
+
attributes #3 = { nounwind }
|
233 |
+
|
234 |
+
!llvm.module.flags = !{!0}
|
235 |
+
!llvm.dbg.cu = !{!1}
|
236 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
237 |
+
|
238 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
239 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
240 |
+
!2 = !DIFile(filename: "csned4hyxpgwu5ttubs3r7uxkjq5yfl3zh6c2sozobtkek2uzfcv.py", directory: "/tmp/torchinductor_root/sn")
|
241 |
+
!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8de, !"kernel", i32 1}
|
242 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8de, !"maxntidx", i32 64}
|
243 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8de", linkageName: "triton__0d1d2d3d4d5d6d7de8de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
244 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
245 |
+
!7 = !{}
|
246 |
+
!8 = !DILocation(line: 26, column: 26, scope: !5)
|
247 |
+
!9 = !DILocation(line: 23, column: 28, scope: !5)
|
248 |
+
!10 = !DILocation(line: 30, column: 40, scope: !5)
|
249 |
+
!11 = !DILocation(line: 30, column: 36, scope: !5)
|
250 |
+
!12 = !DILocation(line: 30, column: 30, scope: !5)
|
251 |
+
!13 = !DILocation(line: 30, column: 46, scope: !5)
|
252 |
+
!14 = !DILocation(line: 30, column: 67, scope: !5)
|
253 |
+
!15 = !DILocation(line: 31, column: 30, scope: !5)
|
254 |
+
!16 = !DILocation(line: 31, column: 35, scope: !5)
|
255 |
+
!17 = !DILocation(line: 32, column: 30, scope: !5)
|
256 |
+
!18 = !DILocation(line: 32, column: 46, scope: !5)
|
257 |
+
!19 = !DILocation(line: 33, column: 30, scope: !5)
|
258 |
+
!20 = !DILocation(line: 33, column: 35, scope: !5)
|
259 |
+
!21 = !DILocation(line: 34, column: 31, scope: !5)
|
260 |
+
!22 = !DILocation(line: 34, column: 36, scope: !5)
|
261 |
+
!23 = !DILocation(line: 35, column: 35, scope: !5)
|
262 |
+
!24 = !DILocation(line: 35, column: 51, scope: !5)
|
263 |
+
!25 = !DILocation(line: 37, column: 18, scope: !5)
|
264 |
+
!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
|
265 |
+
!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
|
266 |
+
!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
267 |
+
!29 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0)
|
268 |
+
!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
|
269 |
+
!31 = !DILocation(line: 40, column: 57, scope: !27)
|
270 |
+
!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
|
271 |
+
!33 = !DILocation(line: 40, column: 57, scope: !29)
|
272 |
+
!34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37)
|
273 |
+
!35 = distinct !DILexicalBlockFile(scope: !5, file: !36, discriminator: 0)
|
274 |
+
!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
275 |
+
!37 = !DILocation(line: 40, column: 44, scope: !35)
|
276 |
+
!38 = !DILocation(line: 41, column: 19, scope: !5)
|
277 |
+
!39 = !DILocation(line: 42, column: 20, scope: !5)
|
278 |
+
!40 = !DILocation(line: 43, column: 19, scope: !5)
|
279 |
+
!41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42)
|
280 |
+
!42 = !DILocation(line: 46, column: 59, scope: !29)
|
281 |
+
!43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44)
|
282 |
+
!44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45)
|
283 |
+
!45 = !DILocation(line: 46, column: 59, scope: !27)
|
284 |
+
!46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47)
|
285 |
+
!47 = !DILocation(line: 46, column: 45, scope: !35)
|
286 |
+
!48 = !DILocation(line: 48, column: 20, scope: !5)
|
287 |
+
!49 = !DILocation(line: 49, column: 19, scope: !5)
|
288 |
+
!50 = !DILocation(line: 50, column: 20, scope: !5)
|
289 |
+
!51 = !DILocation(line: 51, column: 20, scope: !5)
|
290 |
+
!52 = !DILocation(line: 52, column: 20, scope: !5)
|
291 |
+
!53 = !DILocation(line: 53, column: 20, scope: !5)
|
292 |
+
!54 = !DILocation(line: 54, column: 20, scope: !5)
|
293 |
+
!55 = !DILocation(line: 56, column: 51, scope: !5)
|
294 |
+
!56 = !DILocation(line: 57, column: 25, scope: !5)
|
295 |
+
!57 = !DILocation(line: 57, column: 48, scope: !5)
|
296 |
+
!58 = !DILocation(line: 57, column: 4, scope: !5)
|
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ptx
ADDED
@@ -0,0 +1,743 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6d7de8de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3d4d5d6d7de8de(
|
13 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_2,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_3,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_4,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_5,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_6,
|
20 |
+
.param .u32 triton__0d1d2d3d4d5d6d7de8de_param_7,
|
21 |
+
.param .u32 triton__0d1d2d3d4d5d6d7de8de_param_8
|
22 |
+
)
|
23 |
+
.maxntid 64, 1, 1
|
24 |
+
{
|
25 |
+
.reg .pred %p<37>;
|
26 |
+
.reg .b16 %rs<9>;
|
27 |
+
.reg .b32 %r<110>;
|
28 |
+
.reg .f32 %f<86>;
|
29 |
+
.reg .b64 %rd<26>;
|
30 |
+
.loc 1 18 0
|
31 |
+
$L__func_begin0:
|
32 |
+
.loc 1 18 0
|
33 |
+
|
34 |
+
ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7de8de_param_0];
|
35 |
+
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7de8de_param_1];
|
36 |
+
$L__tmp0:
|
37 |
+
.loc 1 26 26
|
38 |
+
mov.u32 %r76, %tid.x;
|
39 |
+
and.b32 %r77, %r76, 31;
|
40 |
+
ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7de8de_param_2];
|
41 |
+
ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7de8de_param_3];
|
42 |
+
ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7de8de_param_4];
|
43 |
+
shl.b32 %r78, %r76, 2;
|
44 |
+
ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7de8de_param_5];
|
45 |
+
and.b32 %r79, %r78, 252;
|
46 |
+
ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7de8de_param_6];
|
47 |
+
.loc 1 23 28
|
48 |
+
mov.u32 %r1, %ctaid.x;
|
49 |
+
.loc 1 30 40
|
50 |
+
shl.b32 %r80, %r1, 8;
|
51 |
+
.loc 1 30 36
|
52 |
+
or.b32 %r81, %r80, %r79;
|
53 |
+
.loc 1 30 30
|
54 |
+
mul.wide.s32 %rd22, %r81, 2;
|
55 |
+
add.s64 %rd1, %rd16, %rd22;
|
56 |
+
mov.b32 %r4, 0;
|
57 |
+
mov.pred %p1, -1;
|
58 |
+
.loc 1 30 46
|
59 |
+
mov.u32 %r2, 0x0;
|
60 |
+
mov.u32 %r3, 0x0;
|
61 |
+
@%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ];
|
62 |
+
@!%p1 mov.u32 %r2, %r4;
|
63 |
+
@!%p1 mov.u32 %r3, %r4;
|
64 |
+
cvt.u16.u32 %rs1, %r2;
|
65 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
66 |
+
cvt.u16.u32 %rs3, %r3;
|
67 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
|
68 |
+
.loc 1 30 67
|
69 |
+
cvt.f32.bf16 %r6, %rs1;
|
70 |
+
mov.b32 %f1, %r6;
|
71 |
+
cvt.f32.bf16 %r7, %rs2;
|
72 |
+
mov.b32 %f2, %r7;
|
73 |
+
cvt.f32.bf16 %r8, %rs3;
|
74 |
+
mov.b32 %f3, %r8;
|
75 |
+
cvt.f32.bf16 %r9, %rs4;
|
76 |
+
mov.b32 %f4, %r9;
|
77 |
+
.loc 1 31 30
|
78 |
+
mul.wide.u32 %rd23, %r79, 4;
|
79 |
+
add.s64 %rd2, %rd17, %rd23;
|
80 |
+
.loc 1 31 35
|
81 |
+
mov.u32 %r10, 0x0;
|
82 |
+
mov.u32 %r11, 0x0;
|
83 |
+
mov.u32 %r12, 0x0;
|
84 |
+
mov.u32 %r13, 0x0;
|
85 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
|
86 |
+
@!%p1 mov.u32 %r10, %r4;
|
87 |
+
@!%p1 mov.u32 %r11, %r4;
|
88 |
+
@!%p1 mov.u32 %r12, %r4;
|
89 |
+
@!%p1 mov.u32 %r13, %r4;
|
90 |
+
mov.b32 %f5, %r10;
|
91 |
+
mov.b32 %f6, %r11;
|
92 |
+
mov.b32 %f7, %r12;
|
93 |
+
mov.b32 %f8, %r13;
|
94 |
+
.loc 1 32 30
|
95 |
+
mul.wide.s32 %rd24, %r81, 4;
|
96 |
+
add.s64 %rd3, %rd18, %rd24;
|
97 |
+
.loc 1 32 46
|
98 |
+
mov.u32 %r18, 0x0;
|
99 |
+
mov.u32 %r19, 0x0;
|
100 |
+
mov.u32 %r20, 0x0;
|
101 |
+
mov.u32 %r21, 0x0;
|
102 |
+
@%p1 ld.global.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
|
103 |
+
@!%p1 mov.u32 %r18, %r4;
|
104 |
+
@!%p1 mov.u32 %r19, %r4;
|
105 |
+
@!%p1 mov.u32 %r20, %r4;
|
106 |
+
@!%p1 mov.u32 %r21, %r4;
|
107 |
+
mov.b32 %f9, %r18;
|
108 |
+
mov.b32 %f10, %r19;
|
109 |
+
mov.b32 %f11, %r20;
|
110 |
+
mov.b32 %f12, %r21;
|
111 |
+
.loc 1 33 30
|
112 |
+
mul.wide.s32 %rd25, %r1, 4;
|
113 |
+
add.s64 %rd4, %rd19, %rd25;
|
114 |
+
.loc 1 33 35
|
115 |
+
mov.u32 %r26, 0x0;
|
116 |
+
@%p1 ld.global.L1::evict_last.b32 { %r26 }, [ %rd4 + 0 ];
|
117 |
+
mov.b32 %f13, %r26;
|
118 |
+
mov.u32 %r27, 0x0;
|
119 |
+
@%p1 ld.global.L1::evict_last.b32 { %r27 }, [ %rd4 + 0 ];
|
120 |
+
mov.u32 %r28, 0x0;
|
121 |
+
@%p1 ld.global.L1::evict_last.b32 { %r28 }, [ %rd4 + 0 ];
|
122 |
+
mov.u32 %r29, 0x0;
|
123 |
+
@%p1 ld.global.L1::evict_last.b32 { %r29 }, [ %rd4 + 0 ];
|
124 |
+
.loc 1 34 31
|
125 |
+
add.s64 %rd8, %rd20, %rd25;
|
126 |
+
.loc 1 34 36
|
127 |
+
mov.u32 %r55, 0x0;
|
128 |
+
@%p1 ld.global.L1::evict_last.b32 { %r55 }, [ %rd8 + 0 ];
|
129 |
+
mov.b32 %f14, %r55;
|
130 |
+
mov.u32 %r31, 0x0;
|
131 |
+
@%p1 ld.global.L1::evict_last.b32 { %r31 }, [ %rd8 + 0 ];
|
132 |
+
mov.u32 %r32, 0x0;
|
133 |
+
@%p1 ld.global.L1::evict_last.b32 { %r32 }, [ %rd8 + 0 ];
|
134 |
+
mov.u32 %r33, 0x0;
|
135 |
+
@%p1 ld.global.L1::evict_last.b32 { %r33 }, [ %rd8 + 0 ];
|
136 |
+
.loc 1 35 35
|
137 |
+
add.s64 %rd12, %rd15, %rd24;
|
138 |
+
.loc 1 35 51
|
139 |
+
mov.u32 %r34, 0x0;
|
140 |
+
mov.u32 %r35, 0x0;
|
141 |
+
mov.u32 %r36, 0x0;
|
142 |
+
mov.u32 %r37, 0x0;
|
143 |
+
@%p1 ld.global.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd12 + 0 ];
|
144 |
+
@!%p1 mov.u32 %r34, %r4;
|
145 |
+
@!%p1 mov.u32 %r35, %r4;
|
146 |
+
@!%p1 mov.u32 %r36, %r4;
|
147 |
+
@!%p1 mov.u32 %r37, %r4;
|
148 |
+
mov.b32 %f15, %r34;
|
149 |
+
mov.b32 %f16, %r35;
|
150 |
+
mov.b32 %f17, %r36;
|
151 |
+
mov.b32 %f18, %r37;
|
152 |
+
.loc 1 37 18
|
153 |
+
mul.f32 %f19, %f1, %f5;
|
154 |
+
mul.f32 %f20, %f2, %f6;
|
155 |
+
mul.f32 %f21, %f3, %f7;
|
156 |
+
mul.f32 %f22, %f4, %f8;
|
157 |
+
$L__tmp1:
|
158 |
+
.loc 2 233 15
|
159 |
+
fma.rn.f32 %f23, %f1, %f5, %f20;
|
160 |
+
fma.rn.f32 %f24, %f3, %f7, %f23;
|
161 |
+
fma.rn.f32 %f25, %f4, %f8, %f24;
|
162 |
+
$L__tmp2:
|
163 |
+
.loc 2 243 36
|
164 |
+
mov.b32 %r82, %f25;
|
165 |
+
shfl.sync.bfly.b32 %r83, %r82, 16, 31, -1;
|
166 |
+
mov.b32 %f26, %r83;
|
167 |
+
$L__tmp3:
|
168 |
+
.loc 2 233 15
|
169 |
+
add.f32 %f27, %f25, %f26;
|
170 |
+
$L__tmp4:
|
171 |
+
.loc 2 243 36
|
172 |
+
mov.b32 %r84, %f27;
|
173 |
+
shfl.sync.bfly.b32 %r85, %r84, 8, 31, -1;
|
174 |
+
mov.b32 %f28, %r85;
|
175 |
+
$L__tmp5:
|
176 |
+
.loc 2 233 15
|
177 |
+
add.f32 %f29, %f27, %f28;
|
178 |
+
$L__tmp6:
|
179 |
+
.loc 2 243 36
|
180 |
+
mov.b32 %r86, %f29;
|
181 |
+
shfl.sync.bfly.b32 %r87, %r86, 4, 31, -1;
|
182 |
+
mov.b32 %f30, %r87;
|
183 |
+
$L__tmp7:
|
184 |
+
.loc 2 233 15
|
185 |
+
add.f32 %f31, %f29, %f30;
|
186 |
+
$L__tmp8:
|
187 |
+
.loc 2 243 36
|
188 |
+
mov.b32 %r88, %f31;
|
189 |
+
shfl.sync.bfly.b32 %r89, %r88, 2, 31, -1;
|
190 |
+
mov.b32 %f32, %r89;
|
191 |
+
$L__tmp9:
|
192 |
+
.loc 2 233 15
|
193 |
+
add.f32 %f33, %f31, %f32;
|
194 |
+
$L__tmp10:
|
195 |
+
.loc 2 243 36
|
196 |
+
mov.b32 %r90, %f33;
|
197 |
+
shfl.sync.bfly.b32 %r91, %r90, 1, 31, -1;
|
198 |
+
mov.b32 %f34, %r91;
|
199 |
+
$L__tmp11:
|
200 |
+
.loc 2 233 15
|
201 |
+
add.f32 %f35, %f33, %f34;
|
202 |
+
$L__tmp12:
|
203 |
+
.loc 2 243 36
|
204 |
+
setp.eq.s32 %p27, %r77, 0;
|
205 |
+
shr.u32 %r92, %r76, 3;
|
206 |
+
and.b32 %r93, %r92, 4;
|
207 |
+
mov.u32 %r94, global_smem;
|
208 |
+
add.s32 %r42, %r94, %r93;
|
209 |
+
mov.b32 %r43, %f35;
|
210 |
+
@%p27 st.shared.b32 [ %r42 + 0 ], %r43;
|
211 |
+
bar.sync 0;
|
212 |
+
setp.lt.s32 %p28, %r76, 2;
|
213 |
+
add.s32 %r45, %r94, %r78;
|
214 |
+
@%p28 ld.shared.b32 %r44, [ %r45 + 0 ];
|
215 |
+
mov.b32 %f36, %r44;
|
216 |
+
shfl.sync.bfly.b32 %r95, %r44, 1, 31, -1;
|
217 |
+
mov.b32 %f37, %r95;
|
218 |
+
$L__tmp13:
|
219 |
+
.loc 2 233 15
|
220 |
+
add.f32 %f38, %f36, %f37;
|
221 |
+
$L__tmp14:
|
222 |
+
.loc 2 243 36
|
223 |
+
and.b32 %r96, %r76, 1;
|
224 |
+
setp.eq.b32 %p35, %r96, 1;
|
225 |
+
not.pred %p36, %p35;
|
226 |
+
and.pred %p29, %p28, %p36;
|
227 |
+
mov.b32 %r47, %f38;
|
228 |
+
@%p29 st.shared.b32 [ %r45 + 0 ], %r47;
|
229 |
+
bar.sync 0;
|
230 |
+
ld.shared.f32 %f39, [global_smem];
|
231 |
+
$L__tmp15:
|
232 |
+
.loc 3 8 15
|
233 |
+
add.f32 %f40, %f39, 0f00000000;
|
234 |
+
$L__tmp16:
|
235 |
+
.loc 1 41 19
|
236 |
+
sub.f32 %f41, %f9, %f13;
|
237 |
+
sub.f32 %f42, %f10, %f13;
|
238 |
+
sub.f32 %f43, %f11, %f13;
|
239 |
+
sub.f32 %f44, %f12, %f13;
|
240 |
+
.loc 1 42 20
|
241 |
+
mul.f32 %f45, %f41, %f14;
|
242 |
+
mul.f32 %f46, %f42, %f14;
|
243 |
+
mul.f32 %f47, %f43, %f14;
|
244 |
+
mul.f32 %f48, %f44, %f14;
|
245 |
+
.loc 1 43 19
|
246 |
+
mul.f32 %f49, %f20, %f46;
|
247 |
+
$L__tmp17:
|
248 |
+
.loc 2 243 36
|
249 |
+
bar.sync 0;
|
250 |
+
$L__tmp18:
|
251 |
+
.loc 2 233 15
|
252 |
+
fma.rn.f32 %f50, %f19, %f45, %f49;
|
253 |
+
fma.rn.f32 %f51, %f21, %f47, %f50;
|
254 |
+
fma.rn.f32 %f52, %f22, %f48, %f51;
|
255 |
+
$L__tmp19:
|
256 |
+
.loc 2 243 36
|
257 |
+
mov.b32 %r97, %f52;
|
258 |
+
shfl.sync.bfly.b32 %r98, %r97, 16, 31, -1;
|
259 |
+
mov.b32 %f53, %r98;
|
260 |
+
$L__tmp20:
|
261 |
+
.loc 2 233 15
|
262 |
+
add.f32 %f54, %f52, %f53;
|
263 |
+
$L__tmp21:
|
264 |
+
.loc 2 243 36
|
265 |
+
mov.b32 %r99, %f54;
|
266 |
+
shfl.sync.bfly.b32 %r100, %r99, 8, 31, -1;
|
267 |
+
mov.b32 %f55, %r100;
|
268 |
+
$L__tmp22:
|
269 |
+
.loc 2 233 15
|
270 |
+
add.f32 %f56, %f54, %f55;
|
271 |
+
$L__tmp23:
|
272 |
+
.loc 2 243 36
|
273 |
+
mov.b32 %r101, %f56;
|
274 |
+
shfl.sync.bfly.b32 %r102, %r101, 4, 31, -1;
|
275 |
+
mov.b32 %f57, %r102;
|
276 |
+
$L__tmp24:
|
277 |
+
.loc 2 233 15
|
278 |
+
add.f32 %f58, %f56, %f57;
|
279 |
+
$L__tmp25:
|
280 |
+
.loc 2 243 36
|
281 |
+
mov.b32 %r103, %f58;
|
282 |
+
shfl.sync.bfly.b32 %r104, %r103, 2, 31, -1;
|
283 |
+
mov.b32 %f59, %r104;
|
284 |
+
$L__tmp26:
|
285 |
+
.loc 2 233 15
|
286 |
+
add.f32 %f60, %f58, %f59;
|
287 |
+
$L__tmp27:
|
288 |
+
.loc 2 243 36
|
289 |
+
mov.b32 %r105, %f60;
|
290 |
+
shfl.sync.bfly.b32 %r106, %r105, 1, 31, -1;
|
291 |
+
mov.b32 %f61, %r106;
|
292 |
+
$L__tmp28:
|
293 |
+
.loc 2 233 15
|
294 |
+
add.f32 %f62, %f60, %f61;
|
295 |
+
$L__tmp29:
|
296 |
+
.loc 2 243 36
|
297 |
+
mov.b32 %r49, %f62;
|
298 |
+
@%p27 st.shared.b32 [ %r42 + 0 ], %r49;
|
299 |
+
bar.sync 0;
|
300 |
+
@%p28 ld.shared.b32 %r50, [ %r45 + 0 ];
|
301 |
+
mov.b32 %f63, %r50;
|
302 |
+
shfl.sync.bfly.b32 %r107, %r50, 1, 31, -1;
|
303 |
+
mov.b32 %f64, %r107;
|
304 |
+
$L__tmp30:
|
305 |
+
.loc 2 233 15
|
306 |
+
add.f32 %f65, %f63, %f64;
|
307 |
+
$L__tmp31:
|
308 |
+
.loc 2 243 36
|
309 |
+
mov.b32 %r53, %f65;
|
310 |
+
@%p29 st.shared.b32 [ %r45 + 0 ], %r53;
|
311 |
+
bar.sync 0;
|
312 |
+
ld.shared.f32 %f66, [global_smem];
|
313 |
+
$L__tmp32:
|
314 |
+
.loc 3 8 15
|
315 |
+
add.f32 %f67, %f66, 0f00000000;
|
316 |
+
mov.b32 %r56, 1132462080;
|
317 |
+
$L__tmp33:
|
318 |
+
.loc 1 48 20
|
319 |
+
div.full.f32 %r54, %r55, %r56;
|
320 |
+
mov.b32 %f68, %r54;
|
321 |
+
.loc 1 50 20
|
322 |
+
neg.f32 %f69, %f40;
|
323 |
+
fma.rn.f32 %f70, %f19, 0f43800000, %f69;
|
324 |
+
fma.rn.f32 %f71, %f20, 0f43800000, %f69;
|
325 |
+
fma.rn.f32 %f72, %f21, 0f43800000, %f69;
|
326 |
+
fma.rn.f32 %f73, %f22, 0f43800000, %f69;
|
327 |
+
.loc 1 52 20
|
328 |
+
neg.f32 %f74, %f45;
|
329 |
+
fma.rn.f32 %f75, %f74, %f67, %f70;
|
330 |
+
neg.f32 %f76, %f46;
|
331 |
+
fma.rn.f32 %f77, %f76, %f67, %f71;
|
332 |
+
neg.f32 %f78, %f47;
|
333 |
+
fma.rn.f32 %f79, %f78, %f67, %f72;
|
334 |
+
neg.f32 %f80, %f48;
|
335 |
+
fma.rn.f32 %f81, %f80, %f67, %f73;
|
336 |
+
.loc 1 54 20
|
337 |
+
fma.rn.f32 %f82, %f68, %f75, %f15;
|
338 |
+
fma.rn.f32 %f83, %f68, %f77, %f16;
|
339 |
+
fma.rn.f32 %f84, %f68, %f79, %f17;
|
340 |
+
fma.rn.f32 %f85, %f68, %f81, %f18;
|
341 |
+
.loc 1 56 51
|
342 |
+
mov.b32 %r66, %f82;
|
343 |
+
mov.b32 %r67, %f83;
|
344 |
+
mov.b32 %r68, %f84;
|
345 |
+
mov.b32 %r69, %f85;
|
346 |
+
@%p1 st.global.v4.b32 [ %rd12 + 0 ], { %r66, %r67, %r68, %r69 };
|
347 |
+
.loc 1 57 25
|
348 |
+
add.s64 %rd14, %rd21, %rd22;
|
349 |
+
.loc 1 57 48
|
350 |
+
cvt.rn.bf16.f32 %rs5, %r66;
|
351 |
+
cvt.rn.bf16.f32 %rs6, %r67;
|
352 |
+
cvt.rn.bf16.f32 %rs7, %r68;
|
353 |
+
cvt.rn.bf16.f32 %rs8, %r69;
|
354 |
+
mov.b32 %r108, {%rs5, %rs6};
|
355 |
+
mov.b32 %r109, {%rs7, %rs8};
|
356 |
+
@%p1 st.global.v2.b32 [ %rd14 + 0 ], { %r108, %r109 };
|
357 |
+
.loc 1 57 4
|
358 |
+
ret;
|
359 |
+
$L__tmp34:
|
360 |
+
$L__func_end0:
|
361 |
+
|
362 |
+
}
|
363 |
+
.file 1 "/tmp/torchinductor_root/sn/csned4hyxpgwu5ttubs3r7uxkjq5yfl3zh6c2sozobtkek2uzfcv.py"
|
364 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
365 |
+
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
366 |
+
.section .debug_abbrev
|
367 |
+
{
|
368 |
+
.b8 1
|
369 |
+
.b8 17
|
370 |
+
.b8 1
|
371 |
+
.b8 37
|
372 |
+
.b8 8
|
373 |
+
.b8 19
|
374 |
+
.b8 5
|
375 |
+
.b8 3
|
376 |
+
.b8 8
|
377 |
+
.b8 16
|
378 |
+
.b8 6
|
379 |
+
.b8 27
|
380 |
+
.b8 8
|
381 |
+
.b8 180
|
382 |
+
.b8 66
|
383 |
+
.b8 12
|
384 |
+
.b8 17
|
385 |
+
.b8 1
|
386 |
+
.b8 18
|
387 |
+
.b8 1
|
388 |
+
.b8 0
|
389 |
+
.b8 0
|
390 |
+
.b8 2
|
391 |
+
.b8 46
|
392 |
+
.b8 0
|
393 |
+
.b8 135
|
394 |
+
.b8 64
|
395 |
+
.b8 8
|
396 |
+
.b8 3
|
397 |
+
.b8 8
|
398 |
+
.b8 58
|
399 |
+
.b8 11
|
400 |
+
.b8 59
|
401 |
+
.b8 11
|
402 |
+
.b8 63
|
403 |
+
.b8 12
|
404 |
+
.b8 32
|
405 |
+
.b8 11
|
406 |
+
.b8 0
|
407 |
+
.b8 0
|
408 |
+
.b8 3
|
409 |
+
.b8 46
|
410 |
+
.b8 1
|
411 |
+
.b8 17
|
412 |
+
.b8 1
|
413 |
+
.b8 18
|
414 |
+
.b8 1
|
415 |
+
.b8 64
|
416 |
+
.b8 10
|
417 |
+
.b8 49
|
418 |
+
.b8 19
|
419 |
+
.b8 0
|
420 |
+
.b8 0
|
421 |
+
.b8 4
|
422 |
+
.b8 29
|
423 |
+
.b8 1
|
424 |
+
.b8 49
|
425 |
+
.b8 19
|
426 |
+
.b8 17
|
427 |
+
.b8 1
|
428 |
+
.b8 18
|
429 |
+
.b8 1
|
430 |
+
.b8 88
|
431 |
+
.b8 11
|
432 |
+
.b8 89
|
433 |
+
.b8 11
|
434 |
+
.b8 87
|
435 |
+
.b8 11
|
436 |
+
.b8 0
|
437 |
+
.b8 0
|
438 |
+
.b8 5
|
439 |
+
.b8 29
|
440 |
+
.b8 0
|
441 |
+
.b8 49
|
442 |
+
.b8 19
|
443 |
+
.b8 17
|
444 |
+
.b8 1
|
445 |
+
.b8 18
|
446 |
+
.b8 1
|
447 |
+
.b8 88
|
448 |
+
.b8 11
|
449 |
+
.b8 89
|
450 |
+
.b8 11
|
451 |
+
.b8 87
|
452 |
+
.b8 11
|
453 |
+
.b8 0
|
454 |
+
.b8 0
|
455 |
+
.b8 0
|
456 |
+
}
|
457 |
+
.section .debug_info
|
458 |
+
{
|
459 |
+
.b32 403
|
460 |
+
.b8 2
|
461 |
+
.b8 0
|
462 |
+
.b32 .debug_abbrev
|
463 |
+
.b8 8
|
464 |
+
.b8 1
|
465 |
+
.b8 116
|
466 |
+
.b8 114
|
467 |
+
.b8 105
|
468 |
+
.b8 116
|
469 |
+
.b8 111
|
470 |
+
.b8 110
|
471 |
+
.b8 0
|
472 |
+
.b8 2
|
473 |
+
.b8 0
|
474 |
+
.b8 99
|
475 |
+
.b8 115
|
476 |
+
.b8 110
|
477 |
+
.b8 101
|
478 |
+
.b8 100
|
479 |
+
.b8 52
|
480 |
+
.b8 104
|
481 |
+
.b8 121
|
482 |
+
.b8 120
|
483 |
+
.b8 112
|
484 |
+
.b8 103
|
485 |
+
.b8 119
|
486 |
+
.b8 117
|
487 |
+
.b8 53
|
488 |
+
.b8 116
|
489 |
+
.b8 116
|
490 |
+
.b8 117
|
491 |
+
.b8 98
|
492 |
+
.b8 115
|
493 |
+
.b8 51
|
494 |
+
.b8 114
|
495 |
+
.b8 55
|
496 |
+
.b8 117
|
497 |
+
.b8 120
|
498 |
+
.b8 107
|
499 |
+
.b8 106
|
500 |
+
.b8 113
|
501 |
+
.b8 53
|
502 |
+
.b8 121
|
503 |
+
.b8 102
|
504 |
+
.b8 108
|
505 |
+
.b8 51
|
506 |
+
.b8 122
|
507 |
+
.b8 104
|
508 |
+
.b8 54
|
509 |
+
.b8 99
|
510 |
+
.b8 50
|
511 |
+
.b8 115
|
512 |
+
.b8 111
|
513 |
+
.b8 122
|
514 |
+
.b8 111
|
515 |
+
.b8 98
|
516 |
+
.b8 116
|
517 |
+
.b8 107
|
518 |
+
.b8 101
|
519 |
+
.b8 107
|
520 |
+
.b8 50
|
521 |
+
.b8 117
|
522 |
+
.b8 122
|
523 |
+
.b8 102
|
524 |
+
.b8 99
|
525 |
+
.b8 118
|
526 |
+
.b8 46
|
527 |
+
.b8 112
|
528 |
+
.b8 121
|
529 |
+
.b8 0
|
530 |
+
.b32 .debug_line
|
531 |
+
.b8 47
|
532 |
+
.b8 116
|
533 |
+
.b8 109
|
534 |
+
.b8 112
|
535 |
+
.b8 47
|
536 |
+
.b8 116
|
537 |
+
.b8 111
|
538 |
+
.b8 114
|
539 |
+
.b8 99
|
540 |
+
.b8 104
|
541 |
+
.b8 105
|
542 |
+
.b8 110
|
543 |
+
.b8 100
|
544 |
+
.b8 117
|
545 |
+
.b8 99
|
546 |
+
.b8 116
|
547 |
+
.b8 111
|
548 |
+
.b8 114
|
549 |
+
.b8 95
|
550 |
+
.b8 114
|
551 |
+
.b8 111
|
552 |
+
.b8 111
|
553 |
+
.b8 116
|
554 |
+
.b8 47
|
555 |
+
.b8 115
|
556 |
+
.b8 110
|
557 |
+
.b8 0
|
558 |
+
.b8 1
|
559 |
+
.b64 $L__func_begin0
|
560 |
+
.b64 $L__func_end0
|
561 |
+
.b8 2
|
562 |
+
.b8 116
|
563 |
+
.b8 114
|
564 |
+
.b8 105
|
565 |
+
.b8 116
|
566 |
+
.b8 111
|
567 |
+
.b8 110
|
568 |
+
.b8 95
|
569 |
+
.b8 95
|
570 |
+
.b8 48
|
571 |
+
.b8 100
|
572 |
+
.b8 49
|
573 |
+
.b8 100
|
574 |
+
.b8 50
|
575 |
+
.b8 100
|
576 |
+
.b8 51
|
577 |
+
.b8 100
|
578 |
+
.b8 52
|
579 |
+
.b8 100
|
580 |
+
.b8 53
|
581 |
+
.b8 100
|
582 |
+
.b8 54
|
583 |
+
.b8 100
|
584 |
+
.b8 55
|
585 |
+
.b8 100
|
586 |
+
.b8 101
|
587 |
+
.b8 56
|
588 |
+
.b8 100
|
589 |
+
.b8 101
|
590 |
+
.b8 0
|
591 |
+
.b8 116
|
592 |
+
.b8 114
|
593 |
+
.b8 105
|
594 |
+
.b8 116
|
595 |
+
.b8 111
|
596 |
+
.b8 110
|
597 |
+
.b8 95
|
598 |
+
.b8 95
|
599 |
+
.b8 48
|
600 |
+
.b8 100
|
601 |
+
.b8 49
|
602 |
+
.b8 100
|
603 |
+
.b8 50
|
604 |
+
.b8 100
|
605 |
+
.b8 51
|
606 |
+
.b8 100
|
607 |
+
.b8 52
|
608 |
+
.b8 100
|
609 |
+
.b8 53
|
610 |
+
.b8 100
|
611 |
+
.b8 54
|
612 |
+
.b8 100
|
613 |
+
.b8 55
|
614 |
+
.b8 100
|
615 |
+
.b8 101
|
616 |
+
.b8 56
|
617 |
+
.b8 100
|
618 |
+
.b8 101
|
619 |
+
.b8 0
|
620 |
+
.b8 1
|
621 |
+
.b8 18
|
622 |
+
.b8 1
|
623 |
+
.b8 1
|
624 |
+
.b8 3
|
625 |
+
.b64 $L__func_begin0
|
626 |
+
.b64 $L__func_end0
|
627 |
+
.b8 1
|
628 |
+
.b8 156
|
629 |
+
.b32 125
|
630 |
+
.b8 4
|
631 |
+
.b32 125
|
632 |
+
.b64 $L__tmp1
|
633 |
+
.b64 $L__tmp14
|
634 |
+
.b8 2
|
635 |
+
.b8 40
|
636 |
+
.b8 57
|
637 |
+
.b8 5
|
638 |
+
.b32 125
|
639 |
+
.b64 $L__tmp1
|
640 |
+
.b64 $L__tmp14
|
641 |
+
.b8 2
|
642 |
+
.b8 243
|
643 |
+
.b8 36
|
644 |
+
.b8 0
|
645 |
+
.b8 5
|
646 |
+
.b32 125
|
647 |
+
.b64 $L__tmp2
|
648 |
+
.b64 $L__tmp15
|
649 |
+
.b8 2
|
650 |
+
.b8 40
|
651 |
+
.b8 57
|
652 |
+
.b8 5
|
653 |
+
.b32 125
|
654 |
+
.b64 $L__tmp15
|
655 |
+
.b64 $L__tmp16
|
656 |
+
.b8 3
|
657 |
+
.b8 40
|
658 |
+
.b8 44
|
659 |
+
.b8 5
|
660 |
+
.b32 125
|
661 |
+
.b64 $L__tmp17
|
662 |
+
.b64 $L__tmp32
|
663 |
+
.b8 2
|
664 |
+
.b8 46
|
665 |
+
.b8 59
|
666 |
+
.b8 4
|
667 |
+
.b32 125
|
668 |
+
.b64 $L__tmp18
|
669 |
+
.b64 $L__tmp31
|
670 |
+
.b8 2
|
671 |
+
.b8 46
|
672 |
+
.b8 59
|
673 |
+
.b8 5
|
674 |
+
.b32 125
|
675 |
+
.b64 $L__tmp18
|
676 |
+
.b64 $L__tmp31
|
677 |
+
.b8 2
|
678 |
+
.b8 243
|
679 |
+
.b8 36
|
680 |
+
.b8 0
|
681 |
+
.b8 5
|
682 |
+
.b32 125
|
683 |
+
.b64 $L__tmp32
|
684 |
+
.b64 $L__tmp33
|
685 |
+
.b8 3
|
686 |
+
.b8 46
|
687 |
+
.b8 45
|
688 |
+
.b8 0
|
689 |
+
.b8 0
|
690 |
+
}
|
691 |
+
.section .debug_pubnames
|
692 |
+
{
|
693 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
694 |
+
$L__pubNames_start0:
|
695 |
+
.b8 2
|
696 |
+
.b8 0
|
697 |
+
.b32 .debug_info
|
698 |
+
.b32 407
|
699 |
+
.b32 125
|
700 |
+
.b8 116
|
701 |
+
.b8 114
|
702 |
+
.b8 105
|
703 |
+
.b8 116
|
704 |
+
.b8 111
|
705 |
+
.b8 110
|
706 |
+
.b8 95
|
707 |
+
.b8 95
|
708 |
+
.b8 48
|
709 |
+
.b8 100
|
710 |
+
.b8 49
|
711 |
+
.b8 100
|
712 |
+
.b8 50
|
713 |
+
.b8 100
|
714 |
+
.b8 51
|
715 |
+
.b8 100
|
716 |
+
.b8 52
|
717 |
+
.b8 100
|
718 |
+
.b8 53
|
719 |
+
.b8 100
|
720 |
+
.b8 54
|
721 |
+
.b8 100
|
722 |
+
.b8 55
|
723 |
+
.b8 100
|
724 |
+
.b8 101
|
725 |
+
.b8 56
|
726 |
+
.b8 100
|
727 |
+
.b8 101
|
728 |
+
.b8 0
|
729 |
+
.b32 0
|
730 |
+
$L__pubNames_end0:
|
731 |
+
}
|
732 |
+
.section .debug_pubtypes
|
733 |
+
{
|
734 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
735 |
+
$L__pubTypes_start0:
|
736 |
+
.b8 2
|
737 |
+
.b8 0
|
738 |
+
.b32 .debug_info
|
739 |
+
.b32 407
|
740 |
+
.b32 0
|
741 |
+
$L__pubTypes_end0:
|
742 |
+
}
|
743 |
+
.section .debug_loc { }
|
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttgir
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked>
|
6 |
+
%cst_1 = arith.constant 0.000000e+00 : f32
|
7 |
+
%c256_i32 = arith.constant 256 : i32
|
8 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
9 |
+
%cst_3 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked>
|
10 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
11 |
+
%0 = tt.get_program_id x : i32
|
12 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
13 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
14 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
15 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
|
16 |
+
%5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
|
17 |
+
%6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
18 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
19 |
+
%8 = tt.load %7, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
20 |
+
%9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
21 |
+
%10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
22 |
+
%11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
23 |
+
%12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
24 |
+
%13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
25 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
26 |
+
%15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
27 |
+
%16 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
|
28 |
+
%17 = tt.splat %16 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
|
29 |
+
%18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
|
30 |
+
%19 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
|
31 |
+
%20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
|
32 |
+
%21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
|
33 |
+
%22 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
34 |
+
%23 = tt.addptr %22, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
35 |
+
%24 = tt.load %23, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
36 |
+
%25 = arith.mulf %9, %12 : tensor<256xf32, #blocked>
|
37 |
+
%26 = arith.select %2, %25, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
38 |
+
%27 = "tt.reduce"(%26) <{axis = 0 : i32}> ({
|
39 |
+
^bb0(%arg9: f32, %arg10: f32):
|
40 |
+
%50 = arith.addf %arg9, %arg10 : f32
|
41 |
+
tt.reduce.return %50 : f32
|
42 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
43 |
+
%28 = arith.addf %27, %cst_1 : f32
|
44 |
+
%29 = tt.broadcast %18 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
|
45 |
+
%30 = arith.subf %15, %29 : tensor<256xf32, #blocked>
|
46 |
+
%31 = tt.broadcast %21 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
|
47 |
+
%32 = arith.mulf %30, %31 : tensor<256xf32, #blocked>
|
48 |
+
%33 = arith.mulf %25, %32 : tensor<256xf32, #blocked>
|
49 |
+
%34 = arith.select %2, %33, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
50 |
+
%35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({
|
51 |
+
^bb0(%arg9: f32, %arg10: f32):
|
52 |
+
%50 = arith.addf %arg9, %arg10 : f32
|
53 |
+
tt.reduce.return %50 : f32
|
54 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
55 |
+
%36 = arith.addf %35, %cst_1 : f32
|
56 |
+
%37 = arith.divf %21, %cst_0 : tensor<1xf32, #blocked>
|
57 |
+
%38 = arith.mulf %25, %cst_3 : tensor<256xf32, #blocked>
|
58 |
+
%39 = tt.splat %28 : (f32) -> tensor<256xf32, #blocked>
|
59 |
+
%40 = arith.subf %38, %39 : tensor<256xf32, #blocked>
|
60 |
+
%41 = tt.splat %36 : (f32) -> tensor<256xf32, #blocked>
|
61 |
+
%42 = arith.mulf %32, %41 : tensor<256xf32, #blocked>
|
62 |
+
%43 = arith.subf %40, %42 : tensor<256xf32, #blocked>
|
63 |
+
%44 = tt.broadcast %37 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
|
64 |
+
%45 = arith.mulf %44, %43 : tensor<256xf32, #blocked>
|
65 |
+
%46 = arith.addf %24, %45 : tensor<256xf32, #blocked>
|
66 |
+
tt.store %23, %46, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
|
67 |
+
%47 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
68 |
+
%48 = tt.addptr %47, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
69 |
+
%49 = arith.truncf %46 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
|
70 |
+
tt.store %48, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
|
71 |
+
tt.return
|
72 |
+
}
|
73 |
+
}
|
.triton/dump/3791d630ba27aad5e647360045d2f1b5/triton_.ttir
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c256_i32 = arith.constant 256 : i32
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
|
5 |
+
%cst_0 = arith.constant 0.000000e+00 : f32
|
6 |
+
%cst_1 = arith.constant dense<0.000000e+00> : tensor<256xf32>
|
7 |
+
%cst_2 = arith.constant dense<2.560000e+02> : tensor<256xf32>
|
8 |
+
%cst_3 = arith.constant dense<2.560000e+02> : tensor<1xf32>
|
9 |
+
%cst_4 = arith.constant dense<256> : tensor<256xi32>
|
10 |
+
%0 = tt.get_program_id x : i32
|
11 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
12 |
+
%2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
|
13 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
14 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32>
|
15 |
+
%5 = arith.addi %1, %4 : tensor<256xi32>
|
16 |
+
%6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
17 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
18 |
+
%8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
19 |
+
%9 = arith.extf %8 : tensor<256xbf16> to tensor<256xf32>
|
20 |
+
%10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
21 |
+
%11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
22 |
+
%12 = tt.load %11, %2, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
|
23 |
+
%13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
24 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
25 |
+
%15 = tt.load %14, %2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
26 |
+
%16 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
|
27 |
+
%17 = tt.splat %16 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
|
28 |
+
%18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32>
|
29 |
+
%19 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
|
30 |
+
%20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
|
31 |
+
%21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32>
|
32 |
+
%22 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
33 |
+
%23 = tt.addptr %22, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
34 |
+
%24 = tt.load %23, %2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
35 |
+
%25 = arith.mulf %9, %12 : tensor<256xf32>
|
36 |
+
%26 = arith.select %2, %25, %cst_1 : tensor<256xi1>, tensor<256xf32>
|
37 |
+
%27 = "tt.reduce"(%26) <{axis = 0 : i32}> ({
|
38 |
+
^bb0(%arg9: f32, %arg10: f32):
|
39 |
+
%50 = arith.addf %arg9, %arg10 : f32
|
40 |
+
tt.reduce.return %50 : f32
|
41 |
+
}) : (tensor<256xf32>) -> f32
|
42 |
+
%28 = arith.addf %27, %cst_0 : f32
|
43 |
+
%29 = tt.broadcast %18 : (tensor<1xf32>) -> tensor<256xf32>
|
44 |
+
%30 = arith.subf %15, %29 : tensor<256xf32>
|
45 |
+
%31 = tt.broadcast %21 : (tensor<1xf32>) -> tensor<256xf32>
|
46 |
+
%32 = arith.mulf %30, %31 : tensor<256xf32>
|
47 |
+
%33 = arith.mulf %25, %32 : tensor<256xf32>
|
48 |
+
%34 = arith.select %2, %33, %cst_1 : tensor<256xi1>, tensor<256xf32>
|
49 |
+
%35 = "tt.reduce"(%34) <{axis = 0 : i32}> ({
|
50 |
+
^bb0(%arg9: f32, %arg10: f32):
|
51 |
+
%50 = arith.addf %arg9, %arg10 : f32
|
52 |
+
tt.reduce.return %50 : f32
|
53 |
+
}) : (tensor<256xf32>) -> f32
|
54 |
+
%36 = arith.addf %35, %cst_0 : f32
|
55 |
+
%37 = arith.divf %21, %cst_3 : tensor<1xf32>
|
56 |
+
%38 = arith.mulf %25, %cst_2 : tensor<256xf32>
|
57 |
+
%39 = tt.splat %28 : (f32) -> tensor<256xf32>
|
58 |
+
%40 = arith.subf %38, %39 : tensor<256xf32>
|
59 |
+
%41 = tt.splat %36 : (f32) -> tensor<256xf32>
|
60 |
+
%42 = arith.mulf %32, %41 : tensor<256xf32>
|
61 |
+
%43 = arith.subf %40, %42 : tensor<256xf32>
|
62 |
+
%44 = tt.broadcast %37 : (tensor<1xf32>) -> tensor<256xf32>
|
63 |
+
%45 = arith.mulf %44, %43 : tensor<256xf32>
|
64 |
+
%46 = arith.addf %24, %45 : tensor<256xf32>
|
65 |
+
tt.store %23, %46, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
|
66 |
+
%47 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
67 |
+
%48 = tt.addptr %47, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
68 |
+
%49 = arith.truncf %46 : tensor<256xf32> to tensor<256xbf16>
|
69 |
+
tt.store %48, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
|
70 |
+
tt.return
|
71 |
+
}
|
72 |
+
}
|
.triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.ttir
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16>
|
4 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<64x1xf32>
|
5 |
+
%c50257_i32 = arith.constant 50257 : i32
|
6 |
+
%c64_i32 = arith.constant 64 : i32
|
7 |
+
%c0_i32 = arith.constant 0 : i32
|
8 |
+
%cst_1 = arith.constant dense<50257> : tensor<64x1xi64>
|
9 |
+
%cst_2 = arith.constant dense<50257> : tensor<1x64xi64>
|
10 |
+
%c64_i64 = arith.constant 64 : i64
|
11 |
+
%cst_3 = arith.constant dense<-1> : tensor<64x1xi64>
|
12 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<64x64xf32>
|
13 |
+
%0 = tt.get_program_id x : i32
|
14 |
+
%1 = arith.extsi %0 : i32 to i64
|
15 |
+
%2 = arith.muli %1, %c64_i64 : i64
|
16 |
+
%3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
|
17 |
+
%4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
|
18 |
+
%5 = arith.extsi %4 : tensor<64x1xi32> to tensor<64x1xi64>
|
19 |
+
%6 = tt.splat %2 : (i64) -> tensor<64x1xi64>
|
20 |
+
%7 = arith.addi %6, %5 : tensor<64x1xi64>
|
21 |
+
%8 = tt.expand_dims %3 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32>
|
22 |
+
%9 = arith.extsi %8 : tensor<1x64xi32> to tensor<1x64xi64>
|
23 |
+
%10 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
|
24 |
+
%11 = tt.addptr %10, %7 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi64>
|
25 |
+
%12 = tt.load %11 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
|
26 |
+
%13 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
|
27 |
+
%14 = tt.load %13 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
|
28 |
+
%15 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
|
29 |
+
%16 = tt.load %15 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
|
30 |
+
%17 = arith.muli %7, %cst_1 : tensor<64x1xi64>
|
31 |
+
%18 = tt.broadcast %17 : (tensor<64x1xi64>) -> tensor<64x64xi64>
|
32 |
+
%19 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
|
33 |
+
%20 = arith.cmpi ne, %12, %cst_3 : tensor<64x1xi64>
|
34 |
+
%21 = arith.divf %14, %16 : f32
|
35 |
+
%22 = tt.splat %21 : (f32) -> tensor<64x1xf32>
|
36 |
+
%23 = arith.select %20, %22, %cst_0 : tensor<64x1xi1>, tensor<64x1xf32>
|
37 |
+
%24 = tt.broadcast %23 : (tensor<64x1xf32>) -> tensor<64x64xf32>
|
38 |
+
%25 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c64_i32 iter_args(%arg10 = %cst_4) -> (tensor<64x64xf32>) : i32 {
|
39 |
+
%40 = arith.extsi %arg9 : i32 to i64
|
40 |
+
%41 = tt.splat %40 : (i64) -> tensor<1x64xi64>
|
41 |
+
%42 = arith.addi %41, %9 : tensor<1x64xi64>
|
42 |
+
%43 = arith.cmpi slt, %42, %cst_2 : tensor<1x64xi64>
|
43 |
+
%44 = tt.broadcast %42 : (tensor<1x64xi64>) -> tensor<64x64xi64>
|
44 |
+
%45 = arith.addi %44, %18 : tensor<64x64xi64>
|
45 |
+
%46 = tt.addptr %19, %45 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi64>
|
46 |
+
%47 = tt.broadcast %43 : (tensor<1x64xi1>) -> tensor<64x64xi1>
|
47 |
+
%48 = tt.load %46, %47, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
|
48 |
+
%49 = arith.mulf %48, %24 : tensor<64x64xf32>
|
49 |
+
%50 = arith.addf %arg10, %49 : tensor<64x64xf32>
|
50 |
+
%51 = arith.select %47, %50, %arg10 : tensor<64x64xi1>, tensor<64x64xf32>
|
51 |
+
scf.yield %51 : tensor<64x64xf32>
|
52 |
+
}
|
53 |
+
%26 = "tt.reduce"(%25) <{axis = 1 : i32}> ({
|
54 |
+
^bb0(%arg9: f32, %arg10: f32):
|
55 |
+
%40 = arith.addf %arg9, %arg10 : f32
|
56 |
+
tt.reduce.return %40 : f32
|
57 |
+
}) : (tensor<64x64xf32>) -> tensor<64xf32>
|
58 |
+
%27 = tt.expand_dims %26 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
59 |
+
%28 = arith.muli %7, %cst_1 : tensor<64x1xi64>
|
60 |
+
%29 = tt.broadcast %28 : (tensor<64x1xi64>) -> tensor<64x64xi64>
|
61 |
+
%30 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
|
62 |
+
%31 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
|
63 |
+
%32 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
|
64 |
+
%33 = arith.cmpi ne, %12, %cst_3 : tensor<64x1xi64>
|
65 |
+
%34 = arith.divf %14, %16 : f32
|
66 |
+
%35 = tt.splat %34 : (f32) -> tensor<64x1xf32>
|
67 |
+
%36 = arith.select %33, %35, %cst_0 : tensor<64x1xi1>, tensor<64x1xf32>
|
68 |
+
%37 = tt.broadcast %36 : (tensor<64x1xf32>) -> tensor<64x64xf32>
|
69 |
+
%38 = tt.broadcast %27 : (tensor<64x1xf32>) -> tensor<64x64xf32>
|
70 |
+
%39 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
|
71 |
+
scf.for %arg9 = %c0_i32 to %c50257_i32 step %c64_i32 : i32 {
|
72 |
+
%40 = arith.extsi %arg9 : i32 to i64
|
73 |
+
%41 = tt.splat %40 : (i64) -> tensor<1x64xi64>
|
74 |
+
%42 = arith.addi %41, %9 : tensor<1x64xi64>
|
75 |
+
%43 = arith.cmpi slt, %42, %cst_2 : tensor<1x64xi64>
|
76 |
+
%44 = tt.broadcast %42 : (tensor<1x64xi64>) -> tensor<64x64xi64>
|
77 |
+
%45 = arith.addi %44, %29 : tensor<64x64xi64>
|
78 |
+
%46 = tt.addptr %30, %45 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi64>
|
79 |
+
%47 = tt.broadcast %43 : (tensor<1x64xi1>) -> tensor<64x64xi1>
|
80 |
+
%48 = tt.load %46, %47, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xbf16>
|
81 |
+
%49 = arith.extf %48 : tensor<64x64xbf16> to tensor<64x64xf32>
|
82 |
+
%50 = tt.addptr %31, %45 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi64>
|
83 |
+
%51 = tt.load %50, %47, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32>
|
84 |
+
%52 = tt.addptr %32, %45 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi64>
|
85 |
+
%53 = tt.load %52, %47, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xbf16>
|
86 |
+
%54 = arith.extf %53 : tensor<64x64xbf16> to tensor<64x64xf32>
|
87 |
+
%55 = arith.mulf %51, %37 : tensor<64x64xf32>
|
88 |
+
%56 = math.exp %54 : tensor<64x64xf32>
|
89 |
+
%57 = arith.mulf %56, %38 : tensor<64x64xf32>
|
90 |
+
%58 = arith.subf %55, %57 : tensor<64x64xf32>
|
91 |
+
%59 = arith.addf %49, %58 : tensor<64x64xf32>
|
92 |
+
%60 = tt.addptr %39, %45 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi64>
|
93 |
+
%61 = arith.truncf %59 : tensor<64x64xf32> to tensor<64x64xbf16>
|
94 |
+
tt.store %60, %61, %47 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16>
|
95 |
+
}
|
96 |
+
tt.return
|
97 |
+
}
|
98 |
+
}
|
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.cubin
ADDED
Binary file (31.3 kB). View file
|
|
.triton/dump/473cf6e25c3e63117cd59fc0ed04b89f/triton_.ptx
ADDED
@@ -0,0 +1,1054 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6de7de
|
10 |
+
.extern .func __assertfail
|
11 |
+
(
|
12 |
+
.param .b64 __assertfail_param_0,
|
13 |
+
.param .b64 __assertfail_param_1,
|
14 |
+
.param .b32 __assertfail_param_2,
|
15 |
+
.param .b64 __assertfail_param_3,
|
16 |
+
.param .b64 __assertfail_param_4
|
17 |
+
)
|
18 |
+
;
|
19 |
+
.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
20 |
+
.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
21 |
+
.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
|
22 |
+
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
23 |
+
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
24 |
+
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
|
25 |
+
.extern .shared .align 1 .b8 global_smem[];
|
26 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
27 |
+
|
28 |
+
.visible .entry triton__0d1d2d3d4d5d6de7de(
|
29 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
|
30 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
|
31 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
|
32 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
|
33 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
|
34 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
|
35 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
|
36 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
|
37 |
+
)
|
38 |
+
.maxntid 128, 1, 1
|
39 |
+
{
|
40 |
+
.reg .pred %p<56>;
|
41 |
+
.reg .b16 %rs<13>;
|
42 |
+
.reg .b32 %r<185>;
|
43 |
+
.reg .f32 %f<169>;
|
44 |
+
.reg .b64 %rd<59>;
|
45 |
+
.loc 1 18 0
|
46 |
+
$L__func_begin0:
|
47 |
+
.loc 1 18 0
|
48 |
+
|
49 |
+
ld.param.u64 %rd8, [triton__0d1d2d3d4d5d6de7de_param_4];
|
50 |
+
ld.param.u64 %rd7, [triton__0d1d2d3d4d5d6de7de_param_1];
|
51 |
+
ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6de7de_param_0];
|
52 |
+
$L__tmp0:
|
53 |
+
.loc 1 22 44
|
54 |
+
mov.u32 %r1, %tid.x;
|
55 |
+
and.b32 %r2, %r1, 31;
|
56 |
+
ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6de7de_param_2];
|
57 |
+
ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6de7de_param_3];
|
58 |
+
bfe.u32 %r3, %r1, 6, 1;
|
59 |
+
and.b32 %r4, %r1, 1;
|
60 |
+
.loc 1 24 33
|
61 |
+
bfe.u32 %r5, %r1, 5, 1;
|
62 |
+
shl.b32 %r31, %r1, 2;
|
63 |
+
and.b32 %r6, %r31, 252;
|
64 |
+
shl.b32 %r32, %r1, 1;
|
65 |
+
and.b32 %r7, %r32, 254;
|
66 |
+
.loc 1 21 28
|
67 |
+
mov.u32 %r14, %ctaid.x;
|
68 |
+
.loc 1 21 33
|
69 |
+
shl.b32 %r33, %r14, 1;
|
70 |
+
.loc 1 22 23
|
71 |
+
or.b32 %r34, %r33, %r3;
|
72 |
+
or.b32 %r35, %r33, %r4;
|
73 |
+
.loc 1 26 30
|
74 |
+
mul.wide.s32 %rd25, %r34, 8;
|
75 |
+
add.s64 %rd11, %rd22, %rd25;
|
76 |
+
mul.wide.s32 %rd26, %r35, 8;
|
77 |
+
add.s64 %rd19, %rd22, %rd26;
|
78 |
+
mov.pred %p50, -1;
|
79 |
+
.loc 1 26 35
|
80 |
+
mov.u64 %rd10, 0x0;
|
81 |
+
@%p50 ld.global.L1::evict_last.b64 { %rd10 }, [ %rd11 + 0 ];
|
82 |
+
mov.u64 %rd12, 0x0;
|
83 |
+
@%p50 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd11 + 0 ];
|
84 |
+
mov.u64 %rd14, 0x0;
|
85 |
+
@%p50 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd11 + 0 ];
|
86 |
+
mov.u64 %rd16, 0x0;
|
87 |
+
@%p50 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd11 + 0 ];
|
88 |
+
mov.u64 %rd18, 0x0;
|
89 |
+
@%p50 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd19 + 0 ];
|
90 |
+
.loc 1 27 18
|
91 |
+
bfe.s32 %r36, %r14, 30, 1;
|
92 |
+
shr.u32 %r37, %r36, 23;
|
93 |
+
add.s32 %r38, %r34, %r37;
|
94 |
+
and.b32 %r39, %r38, 16776704;
|
95 |
+
sub.s32 %r40, %r34, %r39;
|
96 |
+
.loc 1 35 44
|
97 |
+
shl.b32 %r41, %r40, 8;
|
98 |
+
.loc 1 35 40
|
99 |
+
or.b32 %r42, %r41, %r6;
|
100 |
+
.loc 1 35 34
|
101 |
+
mul.wide.s32 %rd27, %r42, 4;
|
102 |
+
add.s64 %rd38, %rd23, %rd27;
|
103 |
+
mov.b32 %r155, 0;
|
104 |
+
.loc 1 35 50
|
105 |
+
mov.u32 %r15, 0x0;
|
106 |
+
mov.u32 %r16, 0x0;
|
107 |
+
mov.u32 %r17, 0x0;
|
108 |
+
mov.u32 %r18, 0x0;
|
109 |
+
@%p50 ld.global.L1::evict_last.v4.b32 { %r15, %r16, %r17, %r18 }, [ %rd38 + 0 ];
|
110 |
+
@!%p50 mov.u32 %r15, %r155;
|
111 |
+
@!%p50 mov.u32 %r16, %r155;
|
112 |
+
@!%p50 mov.u32 %r17, %r155;
|
113 |
+
@!%p50 mov.u32 %r18, %r155;
|
114 |
+
mov.b32 %f2, %r15;
|
115 |
+
mov.b32 %f1, %r16;
|
116 |
+
mov.b32 %f3, %r17;
|
117 |
+
mov.b32 %f4, %r18;
|
118 |
+
.loc 1 36 44
|
119 |
+
shl.b32 %r43, %r34, 8;
|
120 |
+
.loc 1 36 40
|
121 |
+
or.b32 %r44, %r43, %r6;
|
122 |
+
.loc 1 36 34
|
123 |
+
mul.wide.s32 %rd28, %r44, 2;
|
124 |
+
add.s64 %rd39, %rd24, %rd28;
|
125 |
+
.loc 1 36 50
|
126 |
+
mov.u32 %r23, 0x0;
|
127 |
+
mov.u32 %r24, 0x0;
|
128 |
+
@%p50 ld.global.L1::evict_last.v2.b32 { %r23, %r24 }, [ %rd39 + 0 ];
|
129 |
+
@!%p50 mov.u32 %r23, %r155;
|
130 |
+
@!%p50 mov.u32 %r24, %r155;
|
131 |
+
cvt.u16.u32 %rs1, %r23;
|
132 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r23; }
|
133 |
+
cvt.u16.u32 %rs3, %r24;
|
134 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r24; }
|
135 |
+
.loc 1 36 101
|
136 |
+
cvt.f32.bf16 %r27, %rs1;
|
137 |
+
mov.b32 %f5, %r27;
|
138 |
+
cvt.f32.bf16 %r28, %rs2;
|
139 |
+
mov.b32 %f6, %r28;
|
140 |
+
cvt.f32.bf16 %r29, %rs3;
|
141 |
+
mov.b32 %f7, %r29;
|
142 |
+
cvt.f32.bf16 %r30, %rs4;
|
143 |
+
mov.b32 %f8, %r30;
|
144 |
+
.loc 1 37 22
|
145 |
+
add.s64 %rd29, %rd18, 50257;
|
146 |
+
.loc 1 38 22
|
147 |
+
setp.lt.s64 %p14, %rd18, 0;
|
148 |
+
.loc 1 39 36
|
149 |
+
selp.b64 %rd5, %rd29, %rd18, %p14;
|
150 |
+
.loc 1 40 40
|
151 |
+
setp.lt.u64 %p15, %rd5, 50257;
|
152 |
+
mov.b32 %r184, 883;
|
153 |
+
mov.u64 %rd58, 1;
|
154 |
+
.loc 1 40 55
|
155 |
+
@%p15 bra $L__BB0_2;
|
156 |
+
mov.u64 %rd30, assertMessage_0;
|
157 |
+
cvta.global.u64 %rd31, %rd30;
|
158 |
+
mov.u64 %rd32, assertFile_0;
|
159 |
+
cvta.global.u64 %rd33, %rd32;
|
160 |
+
mov.u64 %rd34, assertFunc_0;
|
161 |
+
cvta.global.u64 %rd35, %rd34;
|
162 |
+
{ // callseq 4, 0
|
163 |
+
.reg .b32 temp_param_reg;
|
164 |
+
.param .b64 param0;
|
165 |
+
st.param.b64 [param0+0], %rd31;
|
166 |
+
.param .b64 param1;
|
167 |
+
st.param.b64 [param1+0], %rd33;
|
168 |
+
.param .b32 param2;
|
169 |
+
st.param.b32 [param2+0], %r184;
|
170 |
+
.param .b64 param3;
|
171 |
+
st.param.b64 [param3+0], %rd35;
|
172 |
+
.param .b64 param4;
|
173 |
+
st.param.b64 [param4+0], %rd58;
|
174 |
+
call.uni
|
175 |
+
__assertfail,
|
176 |
+
(
|
177 |
+
param0,
|
178 |
+
param1,
|
179 |
+
param2,
|
180 |
+
param3,
|
181 |
+
param4
|
182 |
+
);
|
183 |
+
} // callseq 4
|
184 |
+
$L__BB0_2:
|
185 |
+
.loc 1 0 55
|
186 |
+
ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_5];
|
187 |
+
cvt.s64.s32 %rd3, %r44;
|
188 |
+
.loc 1 38 22
|
189 |
+
setp.lt.s64 %p42, %rd10, 0;
|
190 |
+
.loc 1 41 44
|
191 |
+
shl.b64 %rd41, %rd10, 8;
|
192 |
+
add.s64 %rd42, %rd41, 12865792;
|
193 |
+
selp.b64 %rd43, %rd42, %rd41, %p42;
|
194 |
+
cvt.u64.u32 %rd44, %r6;
|
195 |
+
.loc 1 41 40
|
196 |
+
or.b64 %rd45, %rd43, %rd44;
|
197 |
+
.loc 1 41 34
|
198 |
+
shl.b64 %rd46, %rd45, 2;
|
199 |
+
add.s64 %rd55, %rd7, %rd46;
|
200 |
+
.loc 1 41 52
|
201 |
+
mov.u32 %r46, 0x0;
|
202 |
+
mov.u32 %r47, 0x0;
|
203 |
+
mov.u32 %r48, 0x0;
|
204 |
+
mov.u32 %r49, 0x0;
|
205 |
+
@%p50 ld.global.L1::evict_last.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd55 + 0 ];
|
206 |
+
@!%p50 mov.u32 %r46, %r155;
|
207 |
+
@!%p50 mov.u32 %r47, %r155;
|
208 |
+
@!%p50 mov.u32 %r48, %r155;
|
209 |
+
@!%p50 mov.u32 %r49, %r155;
|
210 |
+
mov.b32 %f15, %r48;
|
211 |
+
mov.b32 %f16, %r49;
|
212 |
+
.loc 1 42 22
|
213 |
+
add.f32 %f17, %f3, %f15;
|
214 |
+
add.f32 %f18, %f4, %f16;
|
215 |
+
.loc 1 44 22
|
216 |
+
add.f32 %f19, %f7, %f17;
|
217 |
+
add.f32 %f20, %f8, %f18;
|
218 |
+
.loc 1 41 52
|
219 |
+
mov.b32 %f21, %r46;
|
220 |
+
mov.b32 %f22, %r47;
|
221 |
+
.loc 1 42 22
|
222 |
+
add.f32 %f23, %f1, %f22;
|
223 |
+
add.f32 %f24, %f2, %f21;
|
224 |
+
.loc 1 44 22
|
225 |
+
add.f32 %f25, %f5, %f24;
|
226 |
+
add.f32 %f26, %f6, %f23;
|
227 |
+
$L__tmp1:
|
228 |
+
.loc 2 98 22
|
229 |
+
add.f32 %f27, %f26, 0f00000000;
|
230 |
+
add.f32 %f28, %f25, 0f00000000;
|
231 |
+
add.f32 %f29, %f19, 0f00000000;
|
232 |
+
add.f32 %f30, %f20, 0f00000000;
|
233 |
+
.loc 2 101 30
|
234 |
+
sub.f32 %f31, %f25, %f28;
|
235 |
+
sub.f32 %f32, %f26, %f27;
|
236 |
+
sub.f32 %f33, %f19, %f29;
|
237 |
+
sub.f32 %f34, %f20, %f30;
|
238 |
+
.loc 2 101 13
|
239 |
+
fma.rn.f32 %f35, %f25, %f31, 0f00000000;
|
240 |
+
fma.rn.f32 %f36, %f26, %f32, 0f00000000;
|
241 |
+
fma.rn.f32 %f37, %f19, %f33, 0f00000000;
|
242 |
+
fma.rn.f32 %f38, %f20, %f34, 0f00000000;
|
243 |
+
$L__tmp2:
|
244 |
+
.loc 2 108 21
|
245 |
+
sub.f32 %f39, %f27, %f28;
|
246 |
+
mov.b32 %r55, 1065353216;
|
247 |
+
mov.b32 %r56, 1073741824;
|
248 |
+
.loc 2 110 60
|
249 |
+
div.full.f32 %r54, %r55, %r56;
|
250 |
+
mov.b32 %f40, %r54;
|
251 |
+
.loc 2 112 17
|
252 |
+
fma.rn.f32 %f41, %f40, %f39, %f28;
|
253 |
+
.loc 2 113 15
|
254 |
+
add.f32 %f42, %f35, %f36;
|
255 |
+
.loc 2 113 30
|
256 |
+
mul.f32 %f43, %f39, %f39;
|
257 |
+
.loc 2 113 22
|
258 |
+
fma.rn.f32 %f44, %f40, %f43, %f42;
|
259 |
+
.loc 2 108 21
|
260 |
+
sub.f32 %f45, %f29, %f41;
|
261 |
+
mov.b32 %r59, 1077936128;
|
262 |
+
.loc 2 110 60
|
263 |
+
div.full.f32 %r57, %r55, %r59;
|
264 |
+
mov.b32 %f46, %r57;
|
265 |
+
.loc 2 112 17
|
266 |
+
fma.rn.f32 %f47, %f46, %f45, %f41;
|
267 |
+
.loc 2 113 15
|
268 |
+
add.f32 %f48, %f37, %f44;
|
269 |
+
.loc 2 113 30
|
270 |
+
mul.f32 %f49, %f45, %f45;
|
271 |
+
.loc 2 113 38
|
272 |
+
fma.rn.f32 %f50, %f45, %f45, %f49;
|
273 |
+
.loc 2 113 22
|
274 |
+
fma.rn.f32 %f51, %f46, %f50, %f48;
|
275 |
+
.loc 2 108 21
|
276 |
+
sub.f32 %f52, %f30, %f47;
|
277 |
+
mov.b32 %r62, 1082130432;
|
278 |
+
.loc 2 110 60
|
279 |
+
div.full.f32 %r60, %r55, %r62;
|
280 |
+
mov.b32 %f53, %r60;
|
281 |
+
.loc 2 112 17
|
282 |
+
fma.rn.f32 %f54, %f53, %f52, %f47;
|
283 |
+
.loc 2 113 15
|
284 |
+
add.f32 %f55, %f38, %f51;
|
285 |
+
.loc 2 113 30
|
286 |
+
mul.f32 %f56, %f52, %f52;
|
287 |
+
.loc 2 113 38
|
288 |
+
mul.f32 %f57, %f56, 0f40400000;
|
289 |
+
.loc 2 113 22
|
290 |
+
fma.rn.f32 %f58, %f53, %f57, %f55;
|
291 |
+
$L__tmp3:
|
292 |
+
.loc 2 120 46
|
293 |
+
mov.b32 %r119, %f54;
|
294 |
+
shfl.sync.bfly.b32 %r120, %r119, 16, 31, -1;
|
295 |
+
mov.b32 %f59, %r120;
|
296 |
+
mov.b32 %r121, %f58;
|
297 |
+
shfl.sync.bfly.b32 %r122, %r121, 16, 31, -1;
|
298 |
+
mov.b32 %f60, %r122;
|
299 |
+
shfl.sync.bfly.b32 %r64, %r62, 16, 31, -1;
|
300 |
+
mov.b32 %f61, %r64;
|
301 |
+
$L__tmp4:
|
302 |
+
.loc 2 108 21
|
303 |
+
sub.f32 %f62, %f59, %f54;
|
304 |
+
.loc 2 109 28
|
305 |
+
add.f32 %f63, %f61, 0f40800000;
|
306 |
+
.loc 2 110 39
|
307 |
+
setp.eq.f32 %p43, %f63, 0f00000000;
|
308 |
+
.loc 2 110 60
|
309 |
+
mov.b32 %r65, %f63;
|
310 |
+
div.full.f32 %r63, %r64, %r65;
|
311 |
+
mov.b32 %f64, %r63;
|
312 |
+
.loc 2 110 49
|
313 |
+
selp.f32 %f65, 0f00000000, %f64, %p43;
|
314 |
+
.loc 2 112 17
|
315 |
+
fma.rn.f32 %f66, %f65, %f62, %f54;
|
316 |
+
.loc 2 113 15
|
317 |
+
add.f32 %f67, %f58, %f60;
|
318 |
+
.loc 2 113 30
|
319 |
+
mul.f32 %f68, %f62, %f62;
|
320 |
+
.loc 2 113 38
|
321 |
+
mul.f32 %f69, %f68, 0f40800000;
|
322 |
+
.loc 2 113 22
|
323 |
+
fma.rn.f32 %f70, %f65, %f69, %f67;
|
324 |
+
$L__tmp5:
|
325 |
+
.loc 2 120 46
|
326 |
+
mov.b32 %r123, %f66;
|
327 |
+
shfl.sync.bfly.b32 %r124, %r123, 8, 31, -1;
|
328 |
+
mov.b32 %f71, %r124;
|
329 |
+
mov.b32 %r125, %f70;
|
330 |
+
shfl.sync.bfly.b32 %r126, %r125, 8, 31, -1;
|
331 |
+
mov.b32 %f72, %r126;
|
332 |
+
shfl.sync.bfly.b32 %r67, %r65, 8, 31, -1;
|
333 |
+
mov.b32 %f73, %r67;
|
334 |
+
$L__tmp6:
|
335 |
+
.loc 2 108 21
|
336 |
+
sub.f32 %f74, %f71, %f66;
|
337 |
+
.loc 2 109 28
|
338 |
+
add.f32 %f75, %f63, %f73;
|
339 |
+
.loc 2 110 39
|
340 |
+
setp.eq.f32 %p44, %f75, 0f00000000;
|
341 |
+
.loc 2 110 60
|
342 |
+
mov.b32 %r68, %f75;
|
343 |
+
div.full.f32 %r66, %r67, %r68;
|
344 |
+
mov.b32 %f76, %r66;
|
345 |
+
.loc 2 110 49
|
346 |
+
selp.f32 %f77, 0f00000000, %f76, %p44;
|
347 |
+
.loc 2 112 17
|
348 |
+
fma.rn.f32 %f78, %f77, %f74, %f66;
|
349 |
+
.loc 2 113 15
|
350 |
+
add.f32 %f79, %f70, %f72;
|
351 |
+
.loc 2 113 30
|
352 |
+
mul.f32 %f80, %f74, %f74;
|
353 |
+
.loc 2 113 38
|
354 |
+
mul.f32 %f81, %f63, %f80;
|
355 |
+
.loc 2 113 22
|
356 |
+
fma.rn.f32 %f82, %f77, %f81, %f79;
|
357 |
+
$L__tmp7:
|
358 |
+
.loc 2 120 46
|
359 |
+
mov.b32 %r127, %f78;
|
360 |
+
shfl.sync.bfly.b32 %r128, %r127, 4, 31, -1;
|
361 |
+
mov.b32 %f83, %r128;
|
362 |
+
mov.b32 %r129, %f82;
|
363 |
+
shfl.sync.bfly.b32 %r130, %r129, 4, 31, -1;
|
364 |
+
mov.b32 %f84, %r130;
|
365 |
+
shfl.sync.bfly.b32 %r70, %r68, 4, 31, -1;
|
366 |
+
mov.b32 %f85, %r70;
|
367 |
+
$L__tmp8:
|
368 |
+
.loc 2 108 21
|
369 |
+
sub.f32 %f86, %f83, %f78;
|
370 |
+
.loc 2 109 28
|
371 |
+
add.f32 %f87, %f75, %f85;
|
372 |
+
.loc 2 110 39
|
373 |
+
setp.eq.f32 %p45, %f87, 0f00000000;
|
374 |
+
.loc 2 110 60
|
375 |
+
mov.b32 %r71, %f87;
|
376 |
+
div.full.f32 %r69, %r70, %r71;
|
377 |
+
mov.b32 %f88, %r69;
|
378 |
+
.loc 2 110 49
|
379 |
+
selp.f32 %f89, 0f00000000, %f88, %p45;
|
380 |
+
.loc 2 112 17
|
381 |
+
fma.rn.f32 %f90, %f89, %f86, %f78;
|
382 |
+
.loc 2 113 15
|
383 |
+
add.f32 %f91, %f82, %f84;
|
384 |
+
.loc 2 113 30
|
385 |
+
mul.f32 %f92, %f86, %f86;
|
386 |
+
.loc 2 113 38
|
387 |
+
mul.f32 %f93, %f75, %f92;
|
388 |
+
.loc 2 113 22
|
389 |
+
fma.rn.f32 %f94, %f89, %f93, %f91;
|
390 |
+
$L__tmp9:
|
391 |
+
.loc 2 120 46
|
392 |
+
mov.b32 %r131, %f90;
|
393 |
+
shfl.sync.bfly.b32 %r132, %r131, 2, 31, -1;
|
394 |
+
mov.b32 %f95, %r132;
|
395 |
+
mov.b32 %r133, %f94;
|
396 |
+
shfl.sync.bfly.b32 %r134, %r133, 2, 31, -1;
|
397 |
+
mov.b32 %f96, %r134;
|
398 |
+
shfl.sync.bfly.b32 %r73, %r71, 2, 31, -1;
|
399 |
+
mov.b32 %f97, %r73;
|
400 |
+
$L__tmp10:
|
401 |
+
.loc 2 108 21
|
402 |
+
sub.f32 %f98, %f95, %f90;
|
403 |
+
.loc 2 109 28
|
404 |
+
add.f32 %f99, %f87, %f97;
|
405 |
+
.loc 2 110 39
|
406 |
+
setp.eq.f32 %p46, %f99, 0f00000000;
|
407 |
+
.loc 2 110 60
|
408 |
+
mov.b32 %r74, %f99;
|
409 |
+
div.full.f32 %r72, %r73, %r74;
|
410 |
+
mov.b32 %f100, %r72;
|
411 |
+
.loc 2 110 49
|
412 |
+
selp.f32 %f101, 0f00000000, %f100, %p46;
|
413 |
+
.loc 2 112 17
|
414 |
+
fma.rn.f32 %f102, %f101, %f98, %f90;
|
415 |
+
.loc 2 113 15
|
416 |
+
add.f32 %f103, %f94, %f96;
|
417 |
+
.loc 2 113 30
|
418 |
+
mul.f32 %f104, %f98, %f98;
|
419 |
+
.loc 2 113 38
|
420 |
+
mul.f32 %f105, %f87, %f104;
|
421 |
+
.loc 2 113 22
|
422 |
+
fma.rn.f32 %f106, %f101, %f105, %f103;
|
423 |
+
$L__tmp11:
|
424 |
+
.loc 2 120 46
|
425 |
+
mov.b32 %r135, %f102;
|
426 |
+
shfl.sync.bfly.b32 %r136, %r135, 1, 31, -1;
|
427 |
+
mov.b32 %f107, %r136;
|
428 |
+
mov.b32 %r137, %f106;
|
429 |
+
shfl.sync.bfly.b32 %r138, %r137, 1, 31, -1;
|
430 |
+
mov.b32 %f108, %r138;
|
431 |
+
shfl.sync.bfly.b32 %r76, %r74, 1, 31, -1;
|
432 |
+
mov.b32 %f109, %r76;
|
433 |
+
$L__tmp12:
|
434 |
+
.loc 2 108 21
|
435 |
+
sub.f32 %f110, %f107, %f102;
|
436 |
+
.loc 2 109 28
|
437 |
+
add.f32 %f111, %f99, %f109;
|
438 |
+
.loc 2 110 39
|
439 |
+
setp.eq.f32 %p47, %f111, 0f00000000;
|
440 |
+
.loc 2 110 60
|
441 |
+
mov.b32 %r77, %f111;
|
442 |
+
div.full.f32 %r75, %r76, %r77;
|
443 |
+
mov.b32 %f112, %r75;
|
444 |
+
.loc 2 110 49
|
445 |
+
selp.f32 %f113, 0f00000000, %f112, %p47;
|
446 |
+
.loc 2 112 17
|
447 |
+
fma.rn.f32 %f114, %f113, %f110, %f102;
|
448 |
+
.loc 2 113 15
|
449 |
+
add.f32 %f115, %f106, %f108;
|
450 |
+
.loc 2 113 30
|
451 |
+
mul.f32 %f116, %f110, %f110;
|
452 |
+
.loc 2 113 38
|
453 |
+
mul.f32 %f117, %f99, %f116;
|
454 |
+
.loc 2 113 22
|
455 |
+
fma.rn.f32 %f118, %f113, %f117, %f115;
|
456 |
+
$L__tmp13:
|
457 |
+
.loc 2 120 46
|
458 |
+
setp.eq.s32 %p21, %r2, 0;
|
459 |
+
shl.b32 %r139, %r5, 2;
|
460 |
+
shl.b32 %r140, %r3, 3;
|
461 |
+
or.b32 %r141, %r140, %r139;
|
462 |
+
mov.u32 %r142, global_smem;
|
463 |
+
add.s32 %r78, %r142, %r141;
|
464 |
+
mov.b32 %r79, %f114;
|
465 |
+
@%p21 st.shared.b32 [ %r78 + 0 ], %r79;
|
466 |
+
add.s32 %r143, %r142, 16;
|
467 |
+
add.s32 %r80, %r143, %r141;
|
468 |
+
mov.b32 %r81, %f118;
|
469 |
+
@%p21 st.shared.b32 [ %r80 + 0 ], %r81;
|
470 |
+
add.s32 %r144, %r142, 32;
|
471 |
+
add.s32 %r82, %r144, %r141;
|
472 |
+
@%p21 st.shared.b32 [ %r82 + 0 ], %r77;
|
473 |
+
bar.sync 0;
|
474 |
+
setp.lt.s32 %p24, %r1, 4;
|
475 |
+
add.s32 %r85, %r142, %r31;
|
476 |
+
@%p24 ld.shared.b32 %r84, [ %r85 + 0 ];
|
477 |
+
mov.b32 %f119, %r84;
|
478 |
+
add.s32 %r87, %r143, %r31;
|
479 |
+
@%p24 ld.shared.b32 %r86, [ %r87 + 0 ];
|
480 |
+
mov.b32 %f120, %r86;
|
481 |
+
add.s32 %r89, %r144, %r31;
|
482 |
+
@%p24 ld.shared.b32 %r88, [ %r89 + 0 ];
|
483 |
+
mov.b32 %f121, %r88;
|
484 |
+
shfl.sync.bfly.b32 %r146, %r84, 1, 31, -1;
|
485 |
+
mov.b32 %f122, %r146;
|
486 |
+
shfl.sync.bfly.b32 %r147, %r86, 1, 31, -1;
|
487 |
+
mov.b32 %f123, %r147;
|
488 |
+
shfl.sync.bfly.b32 %r91, %r88, 1, 31, -1;
|
489 |
+
mov.b32 %f124, %r91;
|
490 |
+
$L__tmp14:
|
491 |
+
.loc 2 108 21
|
492 |
+
sub.f32 %f125, %f122, %f119;
|
493 |
+
.loc 2 109 28
|
494 |
+
add.f32 %f126, %f121, %f124;
|
495 |
+
.loc 2 110 39
|
496 |
+
setp.eq.f32 %p48, %f126, 0f00000000;
|
497 |
+
.loc 2 110 60
|
498 |
+
mov.b32 %r92, %f126;
|
499 |
+
div.full.f32 %r90, %r91, %r92;
|
500 |
+
mov.b32 %f127, %r90;
|
501 |
+
.loc 2 110 49
|
502 |
+
selp.f32 %f128, 0f00000000, %f127, %p48;
|
503 |
+
.loc 2 112 17
|
504 |
+
fma.rn.f32 %f129, %f125, %f128, %f119;
|
505 |
+
.loc 2 113 15
|
506 |
+
add.f32 %f130, %f120, %f123;
|
507 |
+
.loc 2 113 30
|
508 |
+
mul.f32 %f131, %f125, %f125;
|
509 |
+
.loc 2 113 38
|
510 |
+
mul.f32 %f132, %f121, %f131;
|
511 |
+
.loc 2 113 22
|
512 |
+
fma.rn.f32 %f133, %f132, %f128, %f130;
|
513 |
+
$L__tmp15:
|
514 |
+
.loc 2 120 46
|
515 |
+
setp.eq.s32 %p49, %r4, 0;
|
516 |
+
and.pred %p27, %p24, %p49;
|
517 |
+
mov.b32 %r94, %f129;
|
518 |
+
@%p27 st.shared.b32 [ %r85 + 0 ], %r94;
|
519 |
+
mov.b32 %r96, %f133;
|
520 |
+
@%p27 st.shared.b32 [ %r87 + 0 ], %r96;
|
521 |
+
@%p27 st.shared.b32 [ %r89 + 0 ], %r92;
|
522 |
+
bar.sync 0;
|
523 |
+
add.s32 %r148, %r142, %r140;
|
524 |
+
ld.shared.f32 %f9, [%r148];
|
525 |
+
add.s32 %r149, %r143, %r140;
|
526 |
+
ld.shared.f32 %f10, [%r149];
|
527 |
+
$L__tmp16:
|
528 |
+
.loc 1 62 51
|
529 |
+
mov.u32 %r99, 0x0;
|
530 |
+
mov.u32 %r100, 0x0;
|
531 |
+
mov.u32 %r101, 0x0;
|
532 |
+
mov.u32 %r102, 0x0;
|
533 |
+
@%p50 ld.global.L1::evict_last.v4.b32 { %r99, %r100, %r101, %r102 }, [ %rd38 + 0 ];
|
534 |
+
@!%p50 mov.u32 %r99, %r155;
|
535 |
+
@!%p50 mov.u32 %r100, %r155;
|
536 |
+
@!%p50 mov.u32 %r101, %r155;
|
537 |
+
@!%p50 mov.u32 %r102, %r155;
|
538 |
+
.loc 1 63 51
|
539 |
+
mov.u32 %r107, 0x0;
|
540 |
+
mov.u32 %r108, 0x0;
|
541 |
+
@%p50 ld.global.L1::evict_first.v2.b32 { %r107, %r108 }, [ %rd39 + 0 ];
|
542 |
+
@!%p50 mov.u32 %r107, %r155;
|
543 |
+
@!%p50 mov.u32 %r108, %r155;
|
544 |
+
cvt.u16.u32 %rs5, %r107;
|
545 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r107; }
|
546 |
+
cvt.u16.u32 %rs7, %r108;
|
547 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r108; }
|
548 |
+
.loc 1 63 103
|
549 |
+
cvt.f32.bf16 %r111, %rs5;
|
550 |
+
mov.b32 %f11, %r111;
|
551 |
+
cvt.f32.bf16 %r112, %rs6;
|
552 |
+
mov.b32 %f12, %r112;
|
553 |
+
cvt.f32.bf16 %r113, %rs7;
|
554 |
+
mov.b32 %f13, %r113;
|
555 |
+
cvt.f32.bf16 %r114, %rs8;
|
556 |
+
mov.b32 %f14, %r114;
|
557 |
+
.loc 1 64 35
|
558 |
+
mul.wide.u32 %rd47, %r7, 4;
|
559 |
+
add.s64 %rd40, %rd8, %rd47;
|
560 |
+
.loc 1 64 40
|
561 |
+
mov.u32 %r115, 0x0;
|
562 |
+
mov.u32 %r116, 0x0;
|
563 |
+
@%p50 ld.global.L1::evict_last.v2.b32 { %r115, %r116 }, [ %rd40 + 0 ];
|
564 |
+
@!%p50 mov.u32 %r115, %r155;
|
565 |
+
@!%p50 mov.u32 %r116, %r155;
|
566 |
+
.loc 1 68 57
|
567 |
+
@%p15 bra $L__BB0_4;
|
568 |
+
mov.u64 %rd48, assertMessage_1;
|
569 |
+
cvta.global.u64 %rd49, %rd48;
|
570 |
+
mov.u64 %rd50, assertFile_1;
|
571 |
+
cvta.global.u64 %rd51, %rd50;
|
572 |
+
mov.u64 %rd52, assertFunc_1;
|
573 |
+
cvta.global.u64 %rd53, %rd52;
|
574 |
+
{ // callseq 5, 0
|
575 |
+
.reg .b32 temp_param_reg;
|
576 |
+
.param .b64 param0;
|
577 |
+
st.param.b64 [param0+0], %rd49;
|
578 |
+
.param .b64 param1;
|
579 |
+
st.param.b64 [param1+0], %rd51;
|
580 |
+
.param .b32 param2;
|
581 |
+
st.param.b32 [param2+0], %r184;
|
582 |
+
.param .b64 param3;
|
583 |
+
st.param.b64 [param3+0], %rd53;
|
584 |
+
.param .b64 param4;
|
585 |
+
st.param.b64 [param4+0], %rd58;
|
586 |
+
call.uni
|
587 |
+
__assertfail,
|
588 |
+
(
|
589 |
+
param0,
|
590 |
+
param1,
|
591 |
+
param2,
|
592 |
+
param3,
|
593 |
+
param4
|
594 |
+
);
|
595 |
+
} // callseq 5
|
596 |
+
$L__BB0_4:
|
597 |
+
.loc 1 69 54
|
598 |
+
mov.u32 %r151, 0x0;
|
599 |
+
mov.u32 %r152, 0x0;
|
600 |
+
mov.u32 %r153, 0x0;
|
601 |
+
mov.u32 %r154, 0x0;
|
602 |
+
@%p50 ld.global.L1::evict_first.v4.b32 { %r151, %r152, %r153, %r154 }, [ %rd55 + 0 ];
|
603 |
+
@!%p50 mov.u32 %r151, %r155;
|
604 |
+
@!%p50 mov.u32 %r152, %r155;
|
605 |
+
@!%p50 mov.u32 %r153, %r155;
|
606 |
+
@!%p50 mov.u32 %r154, %r155;
|
607 |
+
.loc 1 75 24
|
608 |
+
mov.b32 %r160, %f10;
|
609 |
+
mov.b32 %r161, 1132462080;
|
610 |
+
div.full.f32 %r159, %r160, %r161;
|
611 |
+
mov.b32 %f134, %r159;
|
612 |
+
.loc 1 77 24
|
613 |
+
add.f32 %f135, %f134, 0f3727C5AC;
|
614 |
+
.loc 1 78 30
|
615 |
+
rsqrt.approx.ftz.f32 %f136, %f135;
|
616 |
+
.loc 1 69 54
|
617 |
+
mov.b32 %f137, %r154;
|
618 |
+
.loc 1 62 51
|
619 |
+
mov.b32 %f138, %r102;
|
620 |
+
.loc 1 70 24
|
621 |
+
add.f32 %f139, %f138, %f137;
|
622 |
+
.loc 1 72 24
|
623 |
+
add.f32 %f140, %f14, %f139;
|
624 |
+
.loc 1 73 24
|
625 |
+
sub.f32 %f141, %f140, %f9;
|
626 |
+
.loc 1 69 54
|
627 |
+
mov.b32 %f142, %r153;
|
628 |
+
.loc 1 62 51
|
629 |
+
mov.b32 %f143, %r101;
|
630 |
+
.loc 1 70 24
|
631 |
+
add.f32 %f144, %f143, %f142;
|
632 |
+
.loc 1 72 24
|
633 |
+
add.f32 %f145, %f13, %f144;
|
634 |
+
.loc 1 73 24
|
635 |
+
sub.f32 %f146, %f145, %f9;
|
636 |
+
.loc 1 69 54
|
637 |
+
mov.b32 %f147, %r152;
|
638 |
+
.loc 1 62 51
|
639 |
+
mov.b32 %f148, %r100;
|
640 |
+
.loc 1 70 24
|
641 |
+
add.f32 %f149, %f148, %f147;
|
642 |
+
.loc 1 72 24
|
643 |
+
add.f32 %f150, %f12, %f149;
|
644 |
+
.loc 1 73 24
|
645 |
+
sub.f32 %f151, %f150, %f9;
|
646 |
+
.loc 1 69 54
|
647 |
+
mov.b32 %f152, %r151;
|
648 |
+
.loc 1 62 51
|
649 |
+
mov.b32 %f153, %r99;
|
650 |
+
.loc 1 70 24
|
651 |
+
add.f32 %f154, %f153, %f152;
|
652 |
+
.loc 1 72 24
|
653 |
+
add.f32 %f155, %f11, %f154;
|
654 |
+
.loc 1 73 24
|
655 |
+
sub.f32 %f156, %f155, %f9;
|
656 |
+
.loc 1 79 24
|
657 |
+
mul.f32 %f157, %f156, %f136;
|
658 |
+
mul.f32 %f158, %f151, %f136;
|
659 |
+
mul.f32 %f159, %f146, %f136;
|
660 |
+
mul.f32 %f160, %f141, %f136;
|
661 |
+
.loc 1 80 24
|
662 |
+
bar.sync 0;
|
663 |
+
shl.b32 %r177, %r7, 2;
|
664 |
+
add.s32 %r179, %r142, %r177;
|
665 |
+
st.shared.v2.u32 [%r179], {%r115, %r116};
|
666 |
+
bar.sync 0;
|
667 |
+
shl.b32 %r180, %r6, 2;
|
668 |
+
add.s32 %r181, %r142, %r180;
|
669 |
+
ld.shared.v4.f32 {%f161, %f162, %f163, %f164}, [%r181];
|
670 |
+
mul.f32 %f165, %f157, %f161;
|
671 |
+
mul.f32 %f166, %f158, %f162;
|
672 |
+
mul.f32 %f167, %f159, %f163;
|
673 |
+
mul.f32 %f168, %f160, %f164;
|
674 |
+
.loc 1 82 29
|
675 |
+
shl.b64 %rd57, %rd3, 1;
|
676 |
+
add.s64 %rd56, %rd9, %rd57;
|
677 |
+
.loc 1 82 52
|
678 |
+
mov.b32 %r171, %f165;
|
679 |
+
cvt.rn.bf16.f32 %rs9, %r171;
|
680 |
+
mov.b32 %r172, %f166;
|
681 |
+
cvt.rn.bf16.f32 %rs10, %r172;
|
682 |
+
mov.b32 %r173, %f167;
|
683 |
+
cvt.rn.bf16.f32 %rs11, %r173;
|
684 |
+
mov.b32 %r174, %f168;
|
685 |
+
cvt.rn.bf16.f32 %rs12, %r174;
|
686 |
+
mov.b32 %r182, {%rs9, %rs10};
|
687 |
+
mov.b32 %r183, {%rs11, %rs12};
|
688 |
+
@%p50 st.global.v2.b32 [ %rd56 + 0 ], { %r182, %r183 };
|
689 |
+
.loc 1 58 4
|
690 |
+
ret;
|
691 |
+
$L__tmp17:
|
692 |
+
$L__func_end0:
|
693 |
+
|
694 |
+
}
|
695 |
+
// .globl __nv_rsqrtf
|
696 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
697 |
+
.param .b32 __nv_rsqrtf_param_0
|
698 |
+
)
|
699 |
+
{
|
700 |
+
.reg .f32 %f<3>;
|
701 |
+
$L__func_begin1:
|
702 |
+
|
703 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
704 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
705 |
+
st.param.f32 [func_retval0+0], %f2;
|
706 |
+
ret;
|
707 |
+
$L__func_end1:
|
708 |
+
|
709 |
+
}
|
710 |
+
.file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py"
|
711 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
712 |
+
.section .debug_abbrev
|
713 |
+
{
|
714 |
+
.b8 1
|
715 |
+
.b8 17
|
716 |
+
.b8 1
|
717 |
+
.b8 37
|
718 |
+
.b8 8
|
719 |
+
.b8 19
|
720 |
+
.b8 5
|
721 |
+
.b8 3
|
722 |
+
.b8 8
|
723 |
+
.b8 16
|
724 |
+
.b8 6
|
725 |
+
.b8 27
|
726 |
+
.b8 8
|
727 |
+
.b8 180
|
728 |
+
.b8 66
|
729 |
+
.b8 12
|
730 |
+
.b8 17
|
731 |
+
.b8 1
|
732 |
+
.b8 18
|
733 |
+
.b8 1
|
734 |
+
.b8 0
|
735 |
+
.b8 0
|
736 |
+
.b8 2
|
737 |
+
.b8 46
|
738 |
+
.b8 0
|
739 |
+
.b8 135
|
740 |
+
.b8 64
|
741 |
+
.b8 8
|
742 |
+
.b8 3
|
743 |
+
.b8 8
|
744 |
+
.b8 58
|
745 |
+
.b8 11
|
746 |
+
.b8 59
|
747 |
+
.b8 11
|
748 |
+
.b8 63
|
749 |
+
.b8 12
|
750 |
+
.b8 32
|
751 |
+
.b8 11
|
752 |
+
.b8 0
|
753 |
+
.b8 0
|
754 |
+
.b8 3
|
755 |
+
.b8 46
|
756 |
+
.b8 1
|
757 |
+
.b8 17
|
758 |
+
.b8 1
|
759 |
+
.b8 18
|
760 |
+
.b8 1
|
761 |
+
.b8 64
|
762 |
+
.b8 10
|
763 |
+
.b8 49
|
764 |
+
.b8 19
|
765 |
+
.b8 0
|
766 |
+
.b8 0
|
767 |
+
.b8 4
|
768 |
+
.b8 29
|
769 |
+
.b8 0
|
770 |
+
.b8 49
|
771 |
+
.b8 19
|
772 |
+
.b8 17
|
773 |
+
.b8 1
|
774 |
+
.b8 18
|
775 |
+
.b8 1
|
776 |
+
.b8 88
|
777 |
+
.b8 11
|
778 |
+
.b8 89
|
779 |
+
.b8 11
|
780 |
+
.b8 87
|
781 |
+
.b8 11
|
782 |
+
.b8 0
|
783 |
+
.b8 0
|
784 |
+
.b8 5
|
785 |
+
.b8 29
|
786 |
+
.b8 1
|
787 |
+
.b8 49
|
788 |
+
.b8 19
|
789 |
+
.b8 17
|
790 |
+
.b8 1
|
791 |
+
.b8 18
|
792 |
+
.b8 1
|
793 |
+
.b8 88
|
794 |
+
.b8 11
|
795 |
+
.b8 89
|
796 |
+
.b8 11
|
797 |
+
.b8 87
|
798 |
+
.b8 11
|
799 |
+
.b8 0
|
800 |
+
.b8 0
|
801 |
+
.b8 0
|
802 |
+
}
|
803 |
+
.section .debug_info
|
804 |
+
{
|
805 |
+
.b32 302
|
806 |
+
.b8 2
|
807 |
+
.b8 0
|
808 |
+
.b32 .debug_abbrev
|
809 |
+
.b8 8
|
810 |
+
.b8 1
|
811 |
+
.b8 116
|
812 |
+
.b8 114
|
813 |
+
.b8 105
|
814 |
+
.b8 116
|
815 |
+
.b8 111
|
816 |
+
.b8 110
|
817 |
+
.b8 0
|
818 |
+
.b8 2
|
819 |
+
.b8 0
|
820 |
+
.b8 99
|
821 |
+
.b8 112
|
822 |
+
.b8 110
|
823 |
+
.b8 51
|
824 |
+
.b8 108
|
825 |
+
.b8 97
|
826 |
+
.b8 119
|
827 |
+
.b8 103
|
828 |
+
.b8 54
|
829 |
+
.b8 53
|
830 |
+
.b8 108
|
831 |
+
.b8 112
|
832 |
+
.b8 105
|
833 |
+
.b8 54
|
834 |
+
.b8 51
|
835 |
+
.b8 103
|
836 |
+
.b8 118
|
837 |
+
.b8 54
|
838 |
+
.b8 99
|
839 |
+
.b8 54
|
840 |
+
.b8 112
|
841 |
+
.b8 110
|
842 |
+
.b8 52
|
843 |
+
.b8 111
|
844 |
+
.b8 105
|
845 |
+
.b8 107
|
846 |
+
.b8 104
|
847 |
+
.b8 103
|
848 |
+
.b8 54
|
849 |
+
.b8 113
|
850 |
+
.b8 118
|
851 |
+
.b8 97
|
852 |
+
.b8 50
|
853 |
+
.b8 104
|
854 |
+
.b8 50
|
855 |
+
.b8 113
|
856 |
+
.b8 106
|
857 |
+
.b8 100
|
858 |
+
.b8 112
|
859 |
+
.b8 120
|
860 |
+
.b8 101
|
861 |
+
.b8 54
|
862 |
+
.b8 113
|
863 |
+
.b8 106
|
864 |
+
.b8 52
|
865 |
+
.b8 108
|
866 |
+
.b8 118
|
867 |
+
.b8 116
|
868 |
+
.b8 116
|
869 |
+
.b8 119
|
870 |
+
.b8 101
|
871 |
+
.b8 122
|
872 |
+
.b8 46
|
873 |
+
.b8 112
|
874 |
+
.b8 121
|
875 |
+
.b8 0
|
876 |
+
.b32 .debug_line
|
877 |
+
.b8 47
|
878 |
+
.b8 116
|
879 |
+
.b8 109
|
880 |
+
.b8 112
|
881 |
+
.b8 47
|
882 |
+
.b8 116
|
883 |
+
.b8 111
|
884 |
+
.b8 114
|
885 |
+
.b8 99
|
886 |
+
.b8 104
|
887 |
+
.b8 105
|
888 |
+
.b8 110
|
889 |
+
.b8 100
|
890 |
+
.b8 117
|
891 |
+
.b8 99
|
892 |
+
.b8 116
|
893 |
+
.b8 111
|
894 |
+
.b8 114
|
895 |
+
.b8 95
|
896 |
+
.b8 114
|
897 |
+
.b8 111
|
898 |
+
.b8 111
|
899 |
+
.b8 116
|
900 |
+
.b8 47
|
901 |
+
.b8 112
|
902 |
+
.b8 110
|
903 |
+
.b8 0
|
904 |
+
.b8 1
|
905 |
+
.b64 $L__func_begin0
|
906 |
+
.b64 $L__func_end0
|
907 |
+
.b8 2
|
908 |
+
.b8 116
|
909 |
+
.b8 114
|
910 |
+
.b8 105
|
911 |
+
.b8 116
|
912 |
+
.b8 111
|
913 |
+
.b8 110
|
914 |
+
.b8 95
|
915 |
+
.b8 95
|
916 |
+
.b8 48
|
917 |
+
.b8 100
|
918 |
+
.b8 49
|
919 |
+
.b8 100
|
920 |
+
.b8 50
|
921 |
+
.b8 100
|
922 |
+
.b8 51
|
923 |
+
.b8 100
|
924 |
+
.b8 52
|
925 |
+
.b8 100
|
926 |
+
.b8 53
|
927 |
+
.b8 100
|
928 |
+
.b8 54
|
929 |
+
.b8 100
|
930 |
+
.b8 101
|
931 |
+
.b8 55
|
932 |
+
.b8 100
|
933 |
+
.b8 101
|
934 |
+
.b8 0
|
935 |
+
.b8 116
|
936 |
+
.b8 114
|
937 |
+
.b8 105
|
938 |
+
.b8 116
|
939 |
+
.b8 111
|
940 |
+
.b8 110
|
941 |
+
.b8 95
|
942 |
+
.b8 95
|
943 |
+
.b8 48
|
944 |
+
.b8 100
|
945 |
+
.b8 49
|
946 |
+
.b8 100
|
947 |
+
.b8 50
|
948 |
+
.b8 100
|
949 |
+
.b8 51
|
950 |
+
.b8 100
|
951 |
+
.b8 52
|
952 |
+
.b8 100
|
953 |
+
.b8 53
|
954 |
+
.b8 100
|
955 |
+
.b8 54
|
956 |
+
.b8 100
|
957 |
+
.b8 101
|
958 |
+
.b8 55
|
959 |
+
.b8 100
|
960 |
+
.b8 101
|
961 |
+
.b8 0
|
962 |
+
.b8 1
|
963 |
+
.b8 18
|
964 |
+
.b8 1
|
965 |
+
.b8 1
|
966 |
+
.b8 3
|
967 |
+
.b64 $L__func_begin0
|
968 |
+
.b64 $L__func_end0
|
969 |
+
.b8 1
|
970 |
+
.b8 156
|
971 |
+
.b32 125
|
972 |
+
.b8 4
|
973 |
+
.b32 125
|
974 |
+
.b64 $L__tmp1
|
975 |
+
.b64 $L__tmp2
|
976 |
+
.b8 2
|
977 |
+
.b8 47
|
978 |
+
.b8 41
|
979 |
+
.b8 5
|
980 |
+
.b32 125
|
981 |
+
.b64 $L__tmp2
|
982 |
+
.b64 $L__tmp15
|
983 |
+
.b8 2
|
984 |
+
.b8 53
|
985 |
+
.b8 44
|
986 |
+
.b8 4
|
987 |
+
.b32 125
|
988 |
+
.b64 $L__tmp2
|
989 |
+
.b64 $L__tmp15
|
990 |
+
.b8 2
|
991 |
+
.b8 120
|
992 |
+
.b8 46
|
993 |
+
.b8 0
|
994 |
+
.b8 4
|
995 |
+
.b32 125
|
996 |
+
.b64 $L__tmp3
|
997 |
+
.b64 $L__tmp16
|
998 |
+
.b8 2
|
999 |
+
.b8 53
|
1000 |
+
.b8 44
|
1001 |
+
.b8 0
|
1002 |
+
.b8 0
|
1003 |
+
}
|
1004 |
+
.section .debug_pubnames
|
1005 |
+
{
|
1006 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
1007 |
+
$L__pubNames_start0:
|
1008 |
+
.b8 2
|
1009 |
+
.b8 0
|
1010 |
+
.b32 .debug_info
|
1011 |
+
.b32 306
|
1012 |
+
.b32 125
|
1013 |
+
.b8 116
|
1014 |
+
.b8 114
|
1015 |
+
.b8 105
|
1016 |
+
.b8 116
|
1017 |
+
.b8 111
|
1018 |
+
.b8 110
|
1019 |
+
.b8 95
|
1020 |
+
.b8 95
|
1021 |
+
.b8 48
|
1022 |
+
.b8 100
|
1023 |
+
.b8 49
|
1024 |
+
.b8 100
|
1025 |
+
.b8 50
|
1026 |
+
.b8 100
|
1027 |
+
.b8 51
|
1028 |
+
.b8 100
|
1029 |
+
.b8 52
|
1030 |
+
.b8 100
|
1031 |
+
.b8 53
|
1032 |
+
.b8 100
|
1033 |
+
.b8 54
|
1034 |
+
.b8 100
|
1035 |
+
.b8 101
|
1036 |
+
.b8 55
|
1037 |
+
.b8 100
|
1038 |
+
.b8 101
|
1039 |
+
.b8 0
|
1040 |
+
.b32 0
|
1041 |
+
$L__pubNames_end0:
|
1042 |
+
}
|
1043 |
+
.section .debug_pubtypes
|
1044 |
+
{
|
1045 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
1046 |
+
$L__pubTypes_start0:
|
1047 |
+
.b8 2
|
1048 |
+
.b8 0
|
1049 |
+
.b32 .debug_info
|
1050 |
+
.b32 306
|
1051 |
+
.b32 0
|
1052 |
+
$L__pubTypes_end0:
|
1053 |
+
}
|
1054 |
+
.section .debug_loc { }
|
.triton/dump/510522bb05917b836ed253751364fcad/triton_.cubin
ADDED
Binary file (66.2 kB). View file
|
|
.triton/dump/510522bb05917b836ed253751364fcad/triton_.llir
ADDED
@@ -0,0 +1,1211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
|
5 |
+
@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
6 |
+
@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
|
7 |
+
@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
|
8 |
+
@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
9 |
+
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
|
10 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
|
11 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
12 |
+
|
13 |
+
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
|
14 |
+
|
15 |
+
define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
|
16 |
+
%8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
17 |
+
%9 = lshr i32 %8, 3, !dbg !10
|
18 |
+
%10 = and i32 %9, 31, !dbg !10
|
19 |
+
%11 = and i32 %8, 63, !dbg !10
|
20 |
+
%12 = shl i32 %8, 3, !dbg !11
|
21 |
+
%13 = and i32 %12, 56, !dbg !11
|
22 |
+
%14 = or i32 %13, 4, !dbg !11
|
23 |
+
%15 = lshr i32 %8, 6, !dbg !12
|
24 |
+
%16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13
|
25 |
+
%17 = shl i32 %16, 6, !dbg !14
|
26 |
+
%18 = or i32 %17, %10, !dbg !15
|
27 |
+
%19 = or i32 %18, 32, !dbg !15
|
28 |
+
%20 = or i32 %17, %11, !dbg !15
|
29 |
+
%21 = sext i32 %18 to i64, !dbg !16
|
30 |
+
%22 = getelementptr i64, ptr addrspace(1) %0, i64 %21, !dbg !16
|
31 |
+
%23 = sext i32 %19 to i64, !dbg !16
|
32 |
+
%24 = getelementptr i64, ptr addrspace(1) %0, i64 %23, !dbg !16
|
33 |
+
%25 = sext i32 %20 to i64, !dbg !16
|
34 |
+
%26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !16
|
35 |
+
%27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
|
36 |
+
%28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
|
37 |
+
%29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
|
38 |
+
%30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
|
39 |
+
%31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
|
40 |
+
%32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
|
41 |
+
%33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
|
42 |
+
%34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !17
|
43 |
+
%35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
|
44 |
+
%36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
|
45 |
+
%37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
|
46 |
+
%38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
|
47 |
+
%39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
|
48 |
+
%40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
|
49 |
+
%41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
|
50 |
+
%42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %24, i1 true) #6, !dbg !17
|
51 |
+
%43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
|
52 |
+
%44 = srem i32 %18, 512, !dbg !18
|
53 |
+
%45 = srem i32 %19, 512, !dbg !18
|
54 |
+
%46 = shl nsw i32 %44, 8, !dbg !19
|
55 |
+
%47 = shl nsw i32 %45, 8, !dbg !19
|
56 |
+
%48 = add i64 %43, 50257, !dbg !20
|
57 |
+
%49 = icmp slt i64 %27, 0, !dbg !21
|
58 |
+
%50 = icmp slt i64 %35, 0, !dbg !21
|
59 |
+
%51 = icmp slt i64 %43, 0, !dbg !21
|
60 |
+
%52 = select i1 %51, i64 %48, i64 %43, !dbg !22
|
61 |
+
%53 = icmp ugt i64 %52, 50256, !dbg !23
|
62 |
+
%54 = shl i64 %27, 8, !dbg !24
|
63 |
+
%55 = add i64 %54, 12865792, !dbg !24
|
64 |
+
%56 = select i1 %49, i64 %55, i64 %54, !dbg !24
|
65 |
+
%57 = shl i64 %35, 8, !dbg !24
|
66 |
+
%58 = add i64 %57, 12865792, !dbg !24
|
67 |
+
%59 = select i1 %50, i64 %58, i64 %57, !dbg !24
|
68 |
+
%60 = getelementptr float, ptr addrspace(1) %1, i64 %56
|
69 |
+
%61 = getelementptr float, ptr addrspace(1) %1, i64 %59
|
70 |
+
br label %62, !dbg !12
|
71 |
+
|
72 |
+
62: ; preds = %7, %179
|
73 |
+
%63 = phi float [ 0.000000e+00, %7 ], [ %254, %179 ]
|
74 |
+
%64 = phi float [ 0.000000e+00, %7 ], [ %255, %179 ]
|
75 |
+
%65 = phi float [ 0.000000e+00, %7 ], [ %256, %179 ]
|
76 |
+
%66 = phi float [ 0.000000e+00, %7 ], [ %257, %179 ]
|
77 |
+
%67 = phi float [ 0.000000e+00, %7 ], [ %258, %179 ]
|
78 |
+
%68 = phi float [ 0.000000e+00, %7 ], [ %259, %179 ]
|
79 |
+
%69 = phi float [ 0.000000e+00, %7 ], [ %260, %179 ]
|
80 |
+
%70 = phi float [ 0.000000e+00, %7 ], [ %261, %179 ]
|
81 |
+
%71 = phi float [ 0.000000e+00, %7 ], [ %262, %179 ]
|
82 |
+
%72 = phi float [ 0.000000e+00, %7 ], [ %263, %179 ]
|
83 |
+
%73 = phi float [ 0.000000e+00, %7 ], [ %264, %179 ]
|
84 |
+
%74 = phi float [ 0.000000e+00, %7 ], [ %265, %179 ]
|
85 |
+
%75 = phi float [ 0.000000e+00, %7 ], [ %266, %179 ]
|
86 |
+
%76 = phi float [ 0.000000e+00, %7 ], [ %267, %179 ]
|
87 |
+
%77 = phi float [ 0.000000e+00, %7 ], [ %268, %179 ]
|
88 |
+
%78 = phi float [ 0.000000e+00, %7 ], [ %269, %179 ]
|
89 |
+
%79 = phi float [ 0.000000e+00, %7 ], [ %270, %179 ]
|
90 |
+
%80 = phi float [ 0.000000e+00, %7 ], [ %271, %179 ]
|
91 |
+
%81 = phi float [ 0.000000e+00, %7 ], [ %272, %179 ]
|
92 |
+
%82 = phi float [ 0.000000e+00, %7 ], [ %273, %179 ]
|
93 |
+
%83 = phi float [ 0.000000e+00, %7 ], [ %274, %179 ]
|
94 |
+
%84 = phi float [ 0.000000e+00, %7 ], [ %275, %179 ]
|
95 |
+
%85 = phi float [ 0.000000e+00, %7 ], [ %276, %179 ]
|
96 |
+
%86 = phi float [ 0.000000e+00, %7 ], [ %277, %179 ]
|
97 |
+
%87 = phi float [ 0.000000e+00, %7 ], [ %278, %179 ]
|
98 |
+
%88 = phi float [ 0.000000e+00, %7 ], [ %279, %179 ]
|
99 |
+
%89 = phi float [ 0.000000e+00, %7 ], [ %280, %179 ]
|
100 |
+
%90 = phi float [ 0.000000e+00, %7 ], [ %281, %179 ]
|
101 |
+
%91 = phi float [ 0.000000e+00, %7 ], [ %282, %179 ]
|
102 |
+
%92 = phi float [ 0.000000e+00, %7 ], [ %283, %179 ]
|
103 |
+
%93 = phi float [ 0.000000e+00, %7 ], [ %284, %179 ]
|
104 |
+
%94 = phi float [ 0.000000e+00, %7 ], [ %285, %179 ]
|
105 |
+
%95 = phi float [ 0.000000e+00, %7 ], [ %350, %179 ]
|
106 |
+
%96 = phi float [ 0.000000e+00, %7 ], [ %351, %179 ]
|
107 |
+
%97 = phi float [ 0.000000e+00, %7 ], [ %352, %179 ]
|
108 |
+
%98 = phi float [ 0.000000e+00, %7 ], [ %353, %179 ]
|
109 |
+
%99 = phi float [ 0.000000e+00, %7 ], [ %354, %179 ]
|
110 |
+
%100 = phi float [ 0.000000e+00, %7 ], [ %355, %179 ]
|
111 |
+
%101 = phi float [ 0.000000e+00, %7 ], [ %356, %179 ]
|
112 |
+
%102 = phi float [ 0.000000e+00, %7 ], [ %357, %179 ]
|
113 |
+
%103 = phi float [ 0.000000e+00, %7 ], [ %358, %179 ]
|
114 |
+
%104 = phi float [ 0.000000e+00, %7 ], [ %359, %179 ]
|
115 |
+
%105 = phi float [ 0.000000e+00, %7 ], [ %360, %179 ]
|
116 |
+
%106 = phi float [ 0.000000e+00, %7 ], [ %361, %179 ]
|
117 |
+
%107 = phi float [ 0.000000e+00, %7 ], [ %362, %179 ]
|
118 |
+
%108 = phi float [ 0.000000e+00, %7 ], [ %363, %179 ]
|
119 |
+
%109 = phi float [ 0.000000e+00, %7 ], [ %364, %179 ]
|
120 |
+
%110 = phi float [ 0.000000e+00, %7 ], [ %365, %179 ]
|
121 |
+
%111 = phi float [ 0.000000e+00, %7 ], [ %302, %179 ]
|
122 |
+
%112 = phi float [ 0.000000e+00, %7 ], [ %303, %179 ]
|
123 |
+
%113 = phi float [ 0.000000e+00, %7 ], [ %304, %179 ]
|
124 |
+
%114 = phi float [ 0.000000e+00, %7 ], [ %305, %179 ]
|
125 |
+
%115 = phi float [ 0.000000e+00, %7 ], [ %306, %179 ]
|
126 |
+
%116 = phi float [ 0.000000e+00, %7 ], [ %307, %179 ]
|
127 |
+
%117 = phi float [ 0.000000e+00, %7 ], [ %308, %179 ]
|
128 |
+
%118 = phi float [ 0.000000e+00, %7 ], [ %309, %179 ]
|
129 |
+
%119 = phi float [ 0.000000e+00, %7 ], [ %310, %179 ]
|
130 |
+
%120 = phi float [ 0.000000e+00, %7 ], [ %311, %179 ]
|
131 |
+
%121 = phi float [ 0.000000e+00, %7 ], [ %312, %179 ]
|
132 |
+
%122 = phi float [ 0.000000e+00, %7 ], [ %313, %179 ]
|
133 |
+
%123 = phi float [ 0.000000e+00, %7 ], [ %314, %179 ]
|
134 |
+
%124 = phi float [ 0.000000e+00, %7 ], [ %315, %179 ]
|
135 |
+
%125 = phi float [ 0.000000e+00, %7 ], [ %316, %179 ]
|
136 |
+
%126 = phi float [ 0.000000e+00, %7 ], [ %317, %179 ]
|
137 |
+
%127 = phi i32 [ 0, %7 ], [ %366, %179 ]
|
138 |
+
%128 = or i32 %127, %13, !dbg !25
|
139 |
+
%129 = or i32 %127, %14, !dbg !25
|
140 |
+
%130 = add i32 %128, %46, !dbg !26
|
141 |
+
%131 = add i32 %129, %46, !dbg !26
|
142 |
+
%132 = add i32 %128, %47, !dbg !26
|
143 |
+
%133 = add i32 %129, %47, !dbg !26
|
144 |
+
%134 = sext i32 %130 to i64, !dbg !27
|
145 |
+
%135 = getelementptr float, ptr addrspace(1) %2, i64 %134, !dbg !27
|
146 |
+
%136 = sext i32 %131 to i64, !dbg !27
|
147 |
+
%137 = getelementptr float, ptr addrspace(1) %2, i64 %136, !dbg !27
|
148 |
+
%138 = sext i32 %132 to i64, !dbg !27
|
149 |
+
%139 = getelementptr float, ptr addrspace(1) %2, i64 %138, !dbg !27
|
150 |
+
%140 = sext i32 %133 to i64, !dbg !27
|
151 |
+
%141 = getelementptr float, ptr addrspace(1) %2, i64 %140, !dbg !27
|
152 |
+
%142 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %135, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
|
153 |
+
%143 = extractvalue { i32, i32, i32, i32 } %142, 0, !dbg !28
|
154 |
+
%144 = extractvalue { i32, i32, i32, i32 } %142, 1, !dbg !28
|
155 |
+
%145 = extractvalue { i32, i32, i32, i32 } %142, 2, !dbg !28
|
156 |
+
%146 = extractvalue { i32, i32, i32, i32 } %142, 3, !dbg !28
|
157 |
+
%147 = bitcast i32 %143 to float, !dbg !28
|
158 |
+
%148 = bitcast i32 %144 to float, !dbg !28
|
159 |
+
%149 = bitcast i32 %145 to float, !dbg !28
|
160 |
+
%150 = bitcast i32 %146 to float, !dbg !28
|
161 |
+
%151 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %137, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
|
162 |
+
%152 = extractvalue { i32, i32, i32, i32 } %151, 0, !dbg !28
|
163 |
+
%153 = extractvalue { i32, i32, i32, i32 } %151, 1, !dbg !28
|
164 |
+
%154 = extractvalue { i32, i32, i32, i32 } %151, 2, !dbg !28
|
165 |
+
%155 = extractvalue { i32, i32, i32, i32 } %151, 3, !dbg !28
|
166 |
+
%156 = bitcast i32 %152 to float, !dbg !28
|
167 |
+
%157 = bitcast i32 %153 to float, !dbg !28
|
168 |
+
%158 = bitcast i32 %154 to float, !dbg !28
|
169 |
+
%159 = bitcast i32 %155 to float, !dbg !28
|
170 |
+
%160 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %139, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
|
171 |
+
%161 = extractvalue { i32, i32, i32, i32 } %160, 0, !dbg !28
|
172 |
+
%162 = extractvalue { i32, i32, i32, i32 } %160, 1, !dbg !28
|
173 |
+
%163 = extractvalue { i32, i32, i32, i32 } %160, 2, !dbg !28
|
174 |
+
%164 = extractvalue { i32, i32, i32, i32 } %160, 3, !dbg !28
|
175 |
+
%165 = bitcast i32 %161 to float, !dbg !28
|
176 |
+
%166 = bitcast i32 %162 to float, !dbg !28
|
177 |
+
%167 = bitcast i32 %163 to float, !dbg !28
|
178 |
+
%168 = bitcast i32 %164 to float, !dbg !28
|
179 |
+
%169 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %141, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
|
180 |
+
%170 = extractvalue { i32, i32, i32, i32 } %169, 0, !dbg !28
|
181 |
+
%171 = extractvalue { i32, i32, i32, i32 } %169, 1, !dbg !28
|
182 |
+
%172 = extractvalue { i32, i32, i32, i32 } %169, 2, !dbg !28
|
183 |
+
%173 = extractvalue { i32, i32, i32, i32 } %169, 3, !dbg !28
|
184 |
+
%174 = bitcast i32 %170 to float, !dbg !28
|
185 |
+
%175 = bitcast i32 %171 to float, !dbg !28
|
186 |
+
%176 = bitcast i32 %172 to float, !dbg !28
|
187 |
+
%177 = bitcast i32 %173 to float, !dbg !28
|
188 |
+
br i1 %53, label %178, label %179, !dbg !29
|
189 |
+
|
190 |
+
178: ; preds = %62
|
191 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !29
|
192 |
+
br label %179, !dbg !29
|
193 |
+
|
194 |
+
179: ; preds = %178, %62
|
195 |
+
%180 = zext nneg i32 %128 to i64, !dbg !30
|
196 |
+
%181 = zext nneg i32 %129 to i64, !dbg !30
|
197 |
+
%182 = getelementptr float, ptr addrspace(1) %60, i64 %180, !dbg !31
|
198 |
+
%183 = getelementptr float, ptr addrspace(1) %60, i64 %181, !dbg !31
|
199 |
+
%184 = getelementptr float, ptr addrspace(1) %61, i64 %180, !dbg !31
|
200 |
+
%185 = getelementptr float, ptr addrspace(1) %61, i64 %181, !dbg !31
|
201 |
+
%186 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %182, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
|
202 |
+
%187 = extractvalue { i32, i32, i32, i32 } %186, 0, !dbg !32
|
203 |
+
%188 = extractvalue { i32, i32, i32, i32 } %186, 1, !dbg !32
|
204 |
+
%189 = extractvalue { i32, i32, i32, i32 } %186, 2, !dbg !32
|
205 |
+
%190 = extractvalue { i32, i32, i32, i32 } %186, 3, !dbg !32
|
206 |
+
%191 = bitcast i32 %187 to float, !dbg !32
|
207 |
+
%192 = bitcast i32 %188 to float, !dbg !32
|
208 |
+
%193 = bitcast i32 %189 to float, !dbg !32
|
209 |
+
%194 = bitcast i32 %190 to float, !dbg !32
|
210 |
+
%195 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %183, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
|
211 |
+
%196 = extractvalue { i32, i32, i32, i32 } %195, 0, !dbg !32
|
212 |
+
%197 = extractvalue { i32, i32, i32, i32 } %195, 1, !dbg !32
|
213 |
+
%198 = extractvalue { i32, i32, i32, i32 } %195, 2, !dbg !32
|
214 |
+
%199 = extractvalue { i32, i32, i32, i32 } %195, 3, !dbg !32
|
215 |
+
%200 = bitcast i32 %196 to float, !dbg !32
|
216 |
+
%201 = bitcast i32 %197 to float, !dbg !32
|
217 |
+
%202 = bitcast i32 %198 to float, !dbg !32
|
218 |
+
%203 = bitcast i32 %199 to float, !dbg !32
|
219 |
+
%204 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %184, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
|
220 |
+
%205 = extractvalue { i32, i32, i32, i32 } %204, 0, !dbg !32
|
221 |
+
%206 = extractvalue { i32, i32, i32, i32 } %204, 1, !dbg !32
|
222 |
+
%207 = extractvalue { i32, i32, i32, i32 } %204, 2, !dbg !32
|
223 |
+
%208 = extractvalue { i32, i32, i32, i32 } %204, 3, !dbg !32
|
224 |
+
%209 = bitcast i32 %205 to float, !dbg !32
|
225 |
+
%210 = bitcast i32 %206 to float, !dbg !32
|
226 |
+
%211 = bitcast i32 %207 to float, !dbg !32
|
227 |
+
%212 = bitcast i32 %208 to float, !dbg !32
|
228 |
+
%213 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %185, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
|
229 |
+
%214 = extractvalue { i32, i32, i32, i32 } %213, 0, !dbg !32
|
230 |
+
%215 = extractvalue { i32, i32, i32, i32 } %213, 1, !dbg !32
|
231 |
+
%216 = extractvalue { i32, i32, i32, i32 } %213, 2, !dbg !32
|
232 |
+
%217 = extractvalue { i32, i32, i32, i32 } %213, 3, !dbg !32
|
233 |
+
%218 = bitcast i32 %214 to float, !dbg !32
|
234 |
+
%219 = bitcast i32 %215 to float, !dbg !32
|
235 |
+
%220 = bitcast i32 %216 to float, !dbg !32
|
236 |
+
%221 = bitcast i32 %217 to float, !dbg !32
|
237 |
+
%222 = fadd float %147, %191, !dbg !33
|
238 |
+
%223 = fadd float %148, %192, !dbg !33
|
239 |
+
%224 = fadd float %149, %193, !dbg !33
|
240 |
+
%225 = fadd float %150, %194, !dbg !33
|
241 |
+
%226 = fadd float %156, %200, !dbg !33
|
242 |
+
%227 = fadd float %157, %201, !dbg !33
|
243 |
+
%228 = fadd float %158, %202, !dbg !33
|
244 |
+
%229 = fadd float %159, %203, !dbg !33
|
245 |
+
%230 = fadd float %165, %209, !dbg !33
|
246 |
+
%231 = fadd float %166, %210, !dbg !33
|
247 |
+
%232 = fadd float %167, %211, !dbg !33
|
248 |
+
%233 = fadd float %168, %212, !dbg !33
|
249 |
+
%234 = fadd float %174, %218, !dbg !33
|
250 |
+
%235 = fadd float %175, %219, !dbg !33
|
251 |
+
%236 = fadd float %176, %220, !dbg !33
|
252 |
+
%237 = fadd float %177, %221, !dbg !33
|
253 |
+
%238 = fsub float %222, %111, !dbg !34
|
254 |
+
%239 = fsub float %223, %112, !dbg !34
|
255 |
+
%240 = fsub float %224, %113, !dbg !34
|
256 |
+
%241 = fsub float %225, %114, !dbg !34
|
257 |
+
%242 = fsub float %226, %115, !dbg !34
|
258 |
+
%243 = fsub float %227, %116, !dbg !34
|
259 |
+
%244 = fsub float %228, %117, !dbg !34
|
260 |
+
%245 = fsub float %229, %118, !dbg !34
|
261 |
+
%246 = fsub float %230, %119, !dbg !34
|
262 |
+
%247 = fsub float %231, %120, !dbg !34
|
263 |
+
%248 = fsub float %232, %121, !dbg !34
|
264 |
+
%249 = fsub float %233, %122, !dbg !34
|
265 |
+
%250 = fsub float %234, %123, !dbg !34
|
266 |
+
%251 = fsub float %235, %124, !dbg !34
|
267 |
+
%252 = fsub float %236, %125, !dbg !34
|
268 |
+
%253 = fsub float %237, %126, !dbg !34
|
269 |
+
%254 = fadd float %63, 1.000000e+00, !dbg !38
|
270 |
+
%255 = fadd float %64, 1.000000e+00, !dbg !38
|
271 |
+
%256 = fadd float %65, 1.000000e+00, !dbg !38
|
272 |
+
%257 = fadd float %66, 1.000000e+00, !dbg !38
|
273 |
+
%258 = fadd float %67, 1.000000e+00, !dbg !38
|
274 |
+
%259 = fadd float %68, 1.000000e+00, !dbg !38
|
275 |
+
%260 = fadd float %69, 1.000000e+00, !dbg !38
|
276 |
+
%261 = fadd float %70, 1.000000e+00, !dbg !38
|
277 |
+
%262 = fadd float %71, 1.000000e+00, !dbg !38
|
278 |
+
%263 = fadd float %72, 1.000000e+00, !dbg !38
|
279 |
+
%264 = fadd float %73, 1.000000e+00, !dbg !38
|
280 |
+
%265 = fadd float %74, 1.000000e+00, !dbg !38
|
281 |
+
%266 = fadd float %75, 1.000000e+00, !dbg !38
|
282 |
+
%267 = fadd float %76, 1.000000e+00, !dbg !38
|
283 |
+
%268 = fadd float %77, 1.000000e+00, !dbg !38
|
284 |
+
%269 = fadd float %78, 1.000000e+00, !dbg !38
|
285 |
+
%270 = fadd float %79, 1.000000e+00, !dbg !38
|
286 |
+
%271 = fadd float %80, 1.000000e+00, !dbg !38
|
287 |
+
%272 = fadd float %81, 1.000000e+00, !dbg !38
|
288 |
+
%273 = fadd float %82, 1.000000e+00, !dbg !38
|
289 |
+
%274 = fadd float %83, 1.000000e+00, !dbg !38
|
290 |
+
%275 = fadd float %84, 1.000000e+00, !dbg !38
|
291 |
+
%276 = fadd float %85, 1.000000e+00, !dbg !38
|
292 |
+
%277 = fadd float %86, 1.000000e+00, !dbg !38
|
293 |
+
%278 = fadd float %87, 1.000000e+00, !dbg !38
|
294 |
+
%279 = fadd float %88, 1.000000e+00, !dbg !38
|
295 |
+
%280 = fadd float %89, 1.000000e+00, !dbg !38
|
296 |
+
%281 = fadd float %90, 1.000000e+00, !dbg !38
|
297 |
+
%282 = fadd float %91, 1.000000e+00, !dbg !38
|
298 |
+
%283 = fadd float %92, 1.000000e+00, !dbg !38
|
299 |
+
%284 = fadd float %93, 1.000000e+00, !dbg !38
|
300 |
+
%285 = fadd float %94, 1.000000e+00, !dbg !38
|
301 |
+
%286 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %238, float %254) #6, !dbg !39
|
302 |
+
%287 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %239, float %255) #6, !dbg !39
|
303 |
+
%288 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %240, float %256) #6, !dbg !39
|
304 |
+
%289 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %241, float %257) #6, !dbg !39
|
305 |
+
%290 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %242, float %258) #6, !dbg !39
|
306 |
+
%291 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %243, float %259) #6, !dbg !39
|
307 |
+
%292 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %244, float %260) #6, !dbg !39
|
308 |
+
%293 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %245, float %261) #6, !dbg !39
|
309 |
+
%294 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %246, float %262) #6, !dbg !39
|
310 |
+
%295 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %247, float %263) #6, !dbg !39
|
311 |
+
%296 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %248, float %264) #6, !dbg !39
|
312 |
+
%297 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %249, float %265) #6, !dbg !39
|
313 |
+
%298 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %250, float %266) #6, !dbg !39
|
314 |
+
%299 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %251, float %267) #6, !dbg !39
|
315 |
+
%300 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %252, float %268) #6, !dbg !39
|
316 |
+
%301 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %253, float %269) #6, !dbg !39
|
317 |
+
%302 = fadd float %111, %286, !dbg !40
|
318 |
+
%303 = fadd float %112, %287, !dbg !40
|
319 |
+
%304 = fadd float %113, %288, !dbg !40
|
320 |
+
%305 = fadd float %114, %289, !dbg !40
|
321 |
+
%306 = fadd float %115, %290, !dbg !40
|
322 |
+
%307 = fadd float %116, %291, !dbg !40
|
323 |
+
%308 = fadd float %117, %292, !dbg !40
|
324 |
+
%309 = fadd float %118, %293, !dbg !40
|
325 |
+
%310 = fadd float %119, %294, !dbg !40
|
326 |
+
%311 = fadd float %120, %295, !dbg !40
|
327 |
+
%312 = fadd float %121, %296, !dbg !40
|
328 |
+
%313 = fadd float %122, %297, !dbg !40
|
329 |
+
%314 = fadd float %123, %298, !dbg !40
|
330 |
+
%315 = fadd float %124, %299, !dbg !40
|
331 |
+
%316 = fadd float %125, %300, !dbg !40
|
332 |
+
%317 = fadd float %126, %301, !dbg !40
|
333 |
+
%318 = fsub float %222, %302, !dbg !41
|
334 |
+
%319 = fsub float %223, %303, !dbg !41
|
335 |
+
%320 = fsub float %224, %304, !dbg !41
|
336 |
+
%321 = fsub float %225, %305, !dbg !41
|
337 |
+
%322 = fsub float %226, %306, !dbg !41
|
338 |
+
%323 = fsub float %227, %307, !dbg !41
|
339 |
+
%324 = fsub float %228, %308, !dbg !41
|
340 |
+
%325 = fsub float %229, %309, !dbg !41
|
341 |
+
%326 = fsub float %230, %310, !dbg !41
|
342 |
+
%327 = fsub float %231, %311, !dbg !41
|
343 |
+
%328 = fsub float %232, %312, !dbg !41
|
344 |
+
%329 = fsub float %233, %313, !dbg !41
|
345 |
+
%330 = fsub float %234, %314, !dbg !41
|
346 |
+
%331 = fsub float %235, %315, !dbg !41
|
347 |
+
%332 = fsub float %236, %316, !dbg !41
|
348 |
+
%333 = fsub float %237, %317, !dbg !41
|
349 |
+
%334 = fmul float %238, %318, !dbg !42
|
350 |
+
%335 = fmul float %239, %319, !dbg !42
|
351 |
+
%336 = fmul float %240, %320, !dbg !42
|
352 |
+
%337 = fmul float %241, %321, !dbg !42
|
353 |
+
%338 = fmul float %242, %322, !dbg !42
|
354 |
+
%339 = fmul float %243, %323, !dbg !42
|
355 |
+
%340 = fmul float %244, %324, !dbg !42
|
356 |
+
%341 = fmul float %245, %325, !dbg !42
|
357 |
+
%342 = fmul float %246, %326, !dbg !42
|
358 |
+
%343 = fmul float %247, %327, !dbg !42
|
359 |
+
%344 = fmul float %248, %328, !dbg !42
|
360 |
+
%345 = fmul float %249, %329, !dbg !42
|
361 |
+
%346 = fmul float %250, %330, !dbg !42
|
362 |
+
%347 = fmul float %251, %331, !dbg !42
|
363 |
+
%348 = fmul float %252, %332, !dbg !42
|
364 |
+
%349 = fmul float %253, %333, !dbg !42
|
365 |
+
%350 = fadd float %95, %334, !dbg !43
|
366 |
+
%351 = fadd float %96, %335, !dbg !43
|
367 |
+
%352 = fadd float %97, %336, !dbg !43
|
368 |
+
%353 = fadd float %98, %337, !dbg !43
|
369 |
+
%354 = fadd float %99, %338, !dbg !43
|
370 |
+
%355 = fadd float %100, %339, !dbg !43
|
371 |
+
%356 = fadd float %101, %340, !dbg !43
|
372 |
+
%357 = fadd float %102, %341, !dbg !43
|
373 |
+
%358 = fadd float %103, %342, !dbg !43
|
374 |
+
%359 = fadd float %104, %343, !dbg !43
|
375 |
+
%360 = fadd float %105, %344, !dbg !43
|
376 |
+
%361 = fadd float %106, %345, !dbg !43
|
377 |
+
%362 = fadd float %107, %346, !dbg !43
|
378 |
+
%363 = fadd float %108, %347, !dbg !43
|
379 |
+
%364 = fadd float %109, %348, !dbg !43
|
380 |
+
%365 = fadd float %110, %349, !dbg !43
|
381 |
+
%366 = add nuw nsw i32 %127, 64, !dbg !12
|
382 |
+
%367 = icmp ult i32 %127, 192, !dbg !12
|
383 |
+
br i1 %367, label %62, label %368, !dbg !12
|
384 |
+
|
385 |
+
368: ; preds = %179
|
386 |
+
%369 = and i32 %15, 3, !dbg !12
|
387 |
+
%370 = mul nuw nsw i32 %369, 72, !dbg !12
|
388 |
+
%371 = add nuw nsw i32 %370, %11, !dbg !12
|
389 |
+
%372 = zext nneg i32 %371 to i64, !dbg !12
|
390 |
+
%373 = getelementptr float, ptr addrspace(3) @global_smem, i64 %372, !dbg !12
|
391 |
+
%374 = insertelement <1 x float> undef, float %270, i64 0, !dbg !12
|
392 |
+
store <1 x float> %374, ptr addrspace(3) %373, align 4, !dbg !12
|
393 |
+
%375 = add nuw nsw i32 %11, 288, !dbg !12
|
394 |
+
%376 = add nuw nsw i32 %375, %370, !dbg !12
|
395 |
+
%377 = zext nneg i32 %376 to i64, !dbg !12
|
396 |
+
%378 = getelementptr float, ptr addrspace(3) @global_smem, i64 %377, !dbg !12
|
397 |
+
%379 = insertelement <1 x float> undef, float %271, i64 0, !dbg !12
|
398 |
+
store <1 x float> %379, ptr addrspace(3) %378, align 4, !dbg !12
|
399 |
+
%380 = or i32 %11, 576, !dbg !12
|
400 |
+
%381 = add nuw nsw i32 %380, %370, !dbg !12
|
401 |
+
%382 = zext nneg i32 %381 to i64, !dbg !12
|
402 |
+
%383 = getelementptr float, ptr addrspace(3) @global_smem, i64 %382, !dbg !12
|
403 |
+
%384 = insertelement <1 x float> undef, float %272, i64 0, !dbg !12
|
404 |
+
store <1 x float> %384, ptr addrspace(3) %383, align 4, !dbg !12
|
405 |
+
%385 = add nuw nsw i32 %11, 864, !dbg !12
|
406 |
+
%386 = add nuw nsw i32 %385, %370, !dbg !12
|
407 |
+
%387 = zext nneg i32 %386 to i64, !dbg !12
|
408 |
+
%388 = getelementptr float, ptr addrspace(3) @global_smem, i64 %387, !dbg !12
|
409 |
+
%389 = insertelement <1 x float> undef, float %273, i64 0, !dbg !12
|
410 |
+
store <1 x float> %389, ptr addrspace(3) %388, align 4, !dbg !12
|
411 |
+
%390 = or i32 %11, 1152, !dbg !12
|
412 |
+
%391 = add nuw nsw i32 %390, %370, !dbg !12
|
413 |
+
%392 = zext nneg i32 %391 to i64, !dbg !12
|
414 |
+
%393 = getelementptr float, ptr addrspace(3) @global_smem, i64 %392, !dbg !12
|
415 |
+
%394 = insertelement <1 x float> undef, float %274, i64 0, !dbg !12
|
416 |
+
store <1 x float> %394, ptr addrspace(3) %393, align 4, !dbg !12
|
417 |
+
%395 = add nuw nsw i32 %11, 1440, !dbg !12
|
418 |
+
%396 = add nuw nsw i32 %395, %370, !dbg !12
|
419 |
+
%397 = zext nneg i32 %396 to i64, !dbg !12
|
420 |
+
%398 = getelementptr float, ptr addrspace(3) @global_smem, i64 %397, !dbg !12
|
421 |
+
%399 = insertelement <1 x float> undef, float %275, i64 0, !dbg !12
|
422 |
+
store <1 x float> %399, ptr addrspace(3) %398, align 4, !dbg !12
|
423 |
+
%400 = or i32 %11, 1728, !dbg !12
|
424 |
+
%401 = add nuw nsw i32 %400, %370, !dbg !12
|
425 |
+
%402 = zext nneg i32 %401 to i64, !dbg !12
|
426 |
+
%403 = getelementptr float, ptr addrspace(3) @global_smem, i64 %402, !dbg !12
|
427 |
+
%404 = insertelement <1 x float> undef, float %276, i64 0, !dbg !12
|
428 |
+
store <1 x float> %404, ptr addrspace(3) %403, align 4, !dbg !12
|
429 |
+
%405 = add nuw nsw i32 %11, 2016, !dbg !12
|
430 |
+
%406 = add nuw nsw i32 %405, %370, !dbg !12
|
431 |
+
%407 = zext nneg i32 %406 to i64, !dbg !12
|
432 |
+
%408 = getelementptr float, ptr addrspace(3) @global_smem, i64 %407, !dbg !12
|
433 |
+
%409 = insertelement <1 x float> undef, float %277, i64 0, !dbg !12
|
434 |
+
store <1 x float> %409, ptr addrspace(3) %408, align 4, !dbg !12
|
435 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !12
|
436 |
+
%410 = mul nuw nsw i32 %10, 72, !dbg !12
|
437 |
+
%411 = add nuw nsw i32 %410, %13, !dbg !12
|
438 |
+
%412 = zext nneg i32 %411 to i64, !dbg !12
|
439 |
+
%413 = getelementptr float, ptr addrspace(3) @global_smem, i64 %412, !dbg !12
|
440 |
+
%414 = load float, ptr addrspace(3) %413, align 32, !dbg !12
|
441 |
+
%415 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 1, !dbg !12
|
442 |
+
%416 = load float, ptr addrspace(3) %415, align 4, !dbg !12
|
443 |
+
%417 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 2, !dbg !12
|
444 |
+
%418 = load float, ptr addrspace(3) %417, align 8, !dbg !12
|
445 |
+
%419 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 3, !dbg !12
|
446 |
+
%420 = load float, ptr addrspace(3) %419, align 4, !dbg !12
|
447 |
+
%421 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 4, !dbg !12
|
448 |
+
%422 = load float, ptr addrspace(3) %421, align 16, !dbg !12
|
449 |
+
%423 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 5, !dbg !12
|
450 |
+
%424 = load float, ptr addrspace(3) %423, align 4, !dbg !12
|
451 |
+
%425 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 6, !dbg !12
|
452 |
+
%426 = load float, ptr addrspace(3) %425, align 8, !dbg !12
|
453 |
+
%427 = getelementptr inbounds <8 x float>, ptr addrspace(3) %413, i64 0, i64 7, !dbg !12
|
454 |
+
%428 = load float, ptr addrspace(3) %427, align 4, !dbg !12
|
455 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !12
|
456 |
+
%429 = insertelement <1 x float> undef, float %278, i64 0, !dbg !12
|
457 |
+
store <1 x float> %429, ptr addrspace(3) %373, align 4, !dbg !12
|
458 |
+
%430 = insertelement <1 x float> undef, float %279, i64 0, !dbg !12
|
459 |
+
store <1 x float> %430, ptr addrspace(3) %378, align 4, !dbg !12
|
460 |
+
%431 = insertelement <1 x float> undef, float %280, i64 0, !dbg !12
|
461 |
+
store <1 x float> %431, ptr addrspace(3) %383, align 4, !dbg !12
|
462 |
+
%432 = insertelement <1 x float> undef, float %281, i64 0, !dbg !12
|
463 |
+
store <1 x float> %432, ptr addrspace(3) %388, align 4, !dbg !12
|
464 |
+
%433 = insertelement <1 x float> undef, float %282, i64 0, !dbg !12
|
465 |
+
store <1 x float> %433, ptr addrspace(3) %393, align 4, !dbg !12
|
466 |
+
%434 = insertelement <1 x float> undef, float %283, i64 0, !dbg !12
|
467 |
+
store <1 x float> %434, ptr addrspace(3) %398, align 4, !dbg !12
|
468 |
+
%435 = insertelement <1 x float> undef, float %284, i64 0, !dbg !12
|
469 |
+
store <1 x float> %435, ptr addrspace(3) %403, align 4, !dbg !12
|
470 |
+
%436 = insertelement <1 x float> undef, float %285, i64 0, !dbg !12
|
471 |
+
store <1 x float> %436, ptr addrspace(3) %408, align 4, !dbg !12
|
472 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !12
|
473 |
+
%437 = load float, ptr addrspace(3) %413, align 32, !dbg !12
|
474 |
+
%438 = load float, ptr addrspace(3) %415, align 4, !dbg !12
|
475 |
+
%439 = load float, ptr addrspace(3) %417, align 8, !dbg !12
|
476 |
+
%440 = load float, ptr addrspace(3) %419, align 4, !dbg !12
|
477 |
+
%441 = load float, ptr addrspace(3) %421, align 16, !dbg !12
|
478 |
+
%442 = load float, ptr addrspace(3) %423, align 4, !dbg !12
|
479 |
+
%443 = load float, ptr addrspace(3) %425, align 8, !dbg !12
|
480 |
+
%444 = load float, ptr addrspace(3) %427, align 4, !dbg !12
|
481 |
+
%445 = fsub float %303, %302, !dbg !44
|
482 |
+
%446 = fadd float %414, %416, !dbg !48
|
483 |
+
%447 = fcmp oeq float %446, 0.000000e+00, !dbg !49
|
484 |
+
%448 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %416, float %446) #6, !dbg !50
|
485 |
+
%449 = select i1 %447, float 0.000000e+00, float %448, !dbg !51
|
486 |
+
%450 = fmul float %445, %449, !dbg !52
|
487 |
+
%451 = fadd float %302, %450, !dbg !53
|
488 |
+
%452 = fadd float %350, %351, !dbg !54
|
489 |
+
%453 = fmul float %445, %445, !dbg !55
|
490 |
+
%454 = fmul float %453, %414, !dbg !56
|
491 |
+
%455 = fmul float %454, %449, !dbg !57
|
492 |
+
%456 = fadd float %452, %455, !dbg !58
|
493 |
+
%457 = fsub float %304, %451, !dbg !44
|
494 |
+
%458 = fadd float %418, %446, !dbg !48
|
495 |
+
%459 = fcmp oeq float %458, 0.000000e+00, !dbg !49
|
496 |
+
%460 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %418, float %458) #6, !dbg !50
|
497 |
+
%461 = select i1 %459, float 0.000000e+00, float %460, !dbg !51
|
498 |
+
%462 = fmul float %461, %457, !dbg !52
|
499 |
+
%463 = fadd float %451, %462, !dbg !53
|
500 |
+
%464 = fadd float %352, %456, !dbg !54
|
501 |
+
%465 = fmul float %457, %457, !dbg !55
|
502 |
+
%466 = fmul float %446, %465, !dbg !56
|
503 |
+
%467 = fmul float %461, %466, !dbg !57
|
504 |
+
%468 = fadd float %464, %467, !dbg !58
|
505 |
+
%469 = fsub float %305, %463, !dbg !44
|
506 |
+
%470 = fadd float %420, %458, !dbg !48
|
507 |
+
%471 = fcmp oeq float %470, 0.000000e+00, !dbg !49
|
508 |
+
%472 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %420, float %470) #6, !dbg !50
|
509 |
+
%473 = select i1 %471, float 0.000000e+00, float %472, !dbg !51
|
510 |
+
%474 = fmul float %473, %469, !dbg !52
|
511 |
+
%475 = fadd float %463, %474, !dbg !53
|
512 |
+
%476 = fadd float %353, %468, !dbg !54
|
513 |
+
%477 = fmul float %469, %469, !dbg !55
|
514 |
+
%478 = fmul float %458, %477, !dbg !56
|
515 |
+
%479 = fmul float %473, %478, !dbg !57
|
516 |
+
%480 = fadd float %476, %479, !dbg !58
|
517 |
+
%481 = fsub float %306, %475, !dbg !44
|
518 |
+
%482 = fadd float %422, %470, !dbg !48
|
519 |
+
%483 = fcmp oeq float %482, 0.000000e+00, !dbg !49
|
520 |
+
%484 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %422, float %482) #6, !dbg !50
|
521 |
+
%485 = select i1 %483, float 0.000000e+00, float %484, !dbg !51
|
522 |
+
%486 = fmul float %485, %481, !dbg !52
|
523 |
+
%487 = fadd float %475, %486, !dbg !53
|
524 |
+
%488 = fadd float %354, %480, !dbg !54
|
525 |
+
%489 = fmul float %481, %481, !dbg !55
|
526 |
+
%490 = fmul float %470, %489, !dbg !56
|
527 |
+
%491 = fmul float %485, %490, !dbg !57
|
528 |
+
%492 = fadd float %488, %491, !dbg !58
|
529 |
+
%493 = fsub float %307, %487, !dbg !44
|
530 |
+
%494 = fadd float %424, %482, !dbg !48
|
531 |
+
%495 = fcmp oeq float %494, 0.000000e+00, !dbg !49
|
532 |
+
%496 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %424, float %494) #6, !dbg !50
|
533 |
+
%497 = select i1 %495, float 0.000000e+00, float %496, !dbg !51
|
534 |
+
%498 = fmul float %497, %493, !dbg !52
|
535 |
+
%499 = fadd float %487, %498, !dbg !53
|
536 |
+
%500 = fadd float %355, %492, !dbg !54
|
537 |
+
%501 = fmul float %493, %493, !dbg !55
|
538 |
+
%502 = fmul float %482, %501, !dbg !56
|
539 |
+
%503 = fmul float %497, %502, !dbg !57
|
540 |
+
%504 = fadd float %500, %503, !dbg !58
|
541 |
+
%505 = fsub float %308, %499, !dbg !44
|
542 |
+
%506 = fadd float %426, %494, !dbg !48
|
543 |
+
%507 = fcmp oeq float %506, 0.000000e+00, !dbg !49
|
544 |
+
%508 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %426, float %506) #6, !dbg !50
|
545 |
+
%509 = select i1 %507, float 0.000000e+00, float %508, !dbg !51
|
546 |
+
%510 = fmul float %509, %505, !dbg !52
|
547 |
+
%511 = fadd float %499, %510, !dbg !53
|
548 |
+
%512 = fadd float %356, %504, !dbg !54
|
549 |
+
%513 = fmul float %505, %505, !dbg !55
|
550 |
+
%514 = fmul float %494, %513, !dbg !56
|
551 |
+
%515 = fmul float %509, %514, !dbg !57
|
552 |
+
%516 = fadd float %512, %515, !dbg !58
|
553 |
+
%517 = fsub float %309, %511, !dbg !44
|
554 |
+
%518 = fadd float %428, %506, !dbg !48
|
555 |
+
%519 = fcmp oeq float %518, 0.000000e+00, !dbg !49
|
556 |
+
%520 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float %518) #6, !dbg !50
|
557 |
+
%521 = select i1 %519, float 0.000000e+00, float %520, !dbg !51
|
558 |
+
%522 = fmul float %521, %517, !dbg !52
|
559 |
+
%523 = fadd float %511, %522, !dbg !53
|
560 |
+
%524 = fadd float %357, %516, !dbg !54
|
561 |
+
%525 = fmul float %517, %517, !dbg !55
|
562 |
+
%526 = fmul float %506, %525, !dbg !56
|
563 |
+
%527 = fmul float %521, %526, !dbg !57
|
564 |
+
%528 = fadd float %524, %527, !dbg !58
|
565 |
+
%529 = fsub float %311, %310, !dbg !44
|
566 |
+
%530 = fadd float %437, %438, !dbg !48
|
567 |
+
%531 = fcmp oeq float %530, 0.000000e+00, !dbg !49
|
568 |
+
%532 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %438, float %530) #6, !dbg !50
|
569 |
+
%533 = select i1 %531, float 0.000000e+00, float %532, !dbg !51
|
570 |
+
%534 = fmul float %529, %533, !dbg !52
|
571 |
+
%535 = fadd float %310, %534, !dbg !53
|
572 |
+
%536 = fadd float %358, %359, !dbg !54
|
573 |
+
%537 = fmul float %529, %529, !dbg !55
|
574 |
+
%538 = fmul float %537, %437, !dbg !56
|
575 |
+
%539 = fmul float %538, %533, !dbg !57
|
576 |
+
%540 = fadd float %536, %539, !dbg !58
|
577 |
+
%541 = fsub float %312, %535, !dbg !44
|
578 |
+
%542 = fadd float %439, %530, !dbg !48
|
579 |
+
%543 = fcmp oeq float %542, 0.000000e+00, !dbg !49
|
580 |
+
%544 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %439, float %542) #6, !dbg !50
|
581 |
+
%545 = select i1 %543, float 0.000000e+00, float %544, !dbg !51
|
582 |
+
%546 = fmul float %545, %541, !dbg !52
|
583 |
+
%547 = fadd float %535, %546, !dbg !53
|
584 |
+
%548 = fadd float %360, %540, !dbg !54
|
585 |
+
%549 = fmul float %541, %541, !dbg !55
|
586 |
+
%550 = fmul float %530, %549, !dbg !56
|
587 |
+
%551 = fmul float %545, %550, !dbg !57
|
588 |
+
%552 = fadd float %548, %551, !dbg !58
|
589 |
+
%553 = fsub float %313, %547, !dbg !44
|
590 |
+
%554 = fadd float %440, %542, !dbg !48
|
591 |
+
%555 = fcmp oeq float %554, 0.000000e+00, !dbg !49
|
592 |
+
%556 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %440, float %554) #6, !dbg !50
|
593 |
+
%557 = select i1 %555, float 0.000000e+00, float %556, !dbg !51
|
594 |
+
%558 = fmul float %557, %553, !dbg !52
|
595 |
+
%559 = fadd float %547, %558, !dbg !53
|
596 |
+
%560 = fadd float %361, %552, !dbg !54
|
597 |
+
%561 = fmul float %553, %553, !dbg !55
|
598 |
+
%562 = fmul float %542, %561, !dbg !56
|
599 |
+
%563 = fmul float %557, %562, !dbg !57
|
600 |
+
%564 = fadd float %560, %563, !dbg !58
|
601 |
+
%565 = fsub float %314, %559, !dbg !44
|
602 |
+
%566 = fadd float %441, %554, !dbg !48
|
603 |
+
%567 = fcmp oeq float %566, 0.000000e+00, !dbg !49
|
604 |
+
%568 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %441, float %566) #6, !dbg !50
|
605 |
+
%569 = select i1 %567, float 0.000000e+00, float %568, !dbg !51
|
606 |
+
%570 = fmul float %569, %565, !dbg !52
|
607 |
+
%571 = fadd float %559, %570, !dbg !53
|
608 |
+
%572 = fadd float %362, %564, !dbg !54
|
609 |
+
%573 = fmul float %565, %565, !dbg !55
|
610 |
+
%574 = fmul float %554, %573, !dbg !56
|
611 |
+
%575 = fmul float %569, %574, !dbg !57
|
612 |
+
%576 = fadd float %572, %575, !dbg !58
|
613 |
+
%577 = fsub float %315, %571, !dbg !44
|
614 |
+
%578 = fadd float %442, %566, !dbg !48
|
615 |
+
%579 = fcmp oeq float %578, 0.000000e+00, !dbg !49
|
616 |
+
%580 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %442, float %578) #6, !dbg !50
|
617 |
+
%581 = select i1 %579, float 0.000000e+00, float %580, !dbg !51
|
618 |
+
%582 = fmul float %581, %577, !dbg !52
|
619 |
+
%583 = fadd float %571, %582, !dbg !53
|
620 |
+
%584 = fadd float %363, %576, !dbg !54
|
621 |
+
%585 = fmul float %577, %577, !dbg !55
|
622 |
+
%586 = fmul float %566, %585, !dbg !56
|
623 |
+
%587 = fmul float %581, %586, !dbg !57
|
624 |
+
%588 = fadd float %584, %587, !dbg !58
|
625 |
+
%589 = fsub float %316, %583, !dbg !44
|
626 |
+
%590 = fadd float %443, %578, !dbg !48
|
627 |
+
%591 = fcmp oeq float %590, 0.000000e+00, !dbg !49
|
628 |
+
%592 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %443, float %590) #6, !dbg !50
|
629 |
+
%593 = select i1 %591, float 0.000000e+00, float %592, !dbg !51
|
630 |
+
%594 = fmul float %593, %589, !dbg !52
|
631 |
+
%595 = fadd float %583, %594, !dbg !53
|
632 |
+
%596 = fadd float %364, %588, !dbg !54
|
633 |
+
%597 = fmul float %589, %589, !dbg !55
|
634 |
+
%598 = fmul float %578, %597, !dbg !56
|
635 |
+
%599 = fmul float %593, %598, !dbg !57
|
636 |
+
%600 = fadd float %596, %599, !dbg !58
|
637 |
+
%601 = fsub float %317, %595, !dbg !44
|
638 |
+
%602 = fadd float %444, %590, !dbg !48
|
639 |
+
%603 = fcmp oeq float %602, 0.000000e+00, !dbg !49
|
640 |
+
%604 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %444, float %602) #6, !dbg !50
|
641 |
+
%605 = select i1 %603, float 0.000000e+00, float %604, !dbg !51
|
642 |
+
%606 = fmul float %605, %601, !dbg !52
|
643 |
+
%607 = fadd float %595, %606, !dbg !53
|
644 |
+
%608 = fadd float %365, %600, !dbg !54
|
645 |
+
%609 = fmul float %601, %601, !dbg !55
|
646 |
+
%610 = fmul float %590, %609, !dbg !56
|
647 |
+
%611 = fmul float %605, %610, !dbg !57
|
648 |
+
%612 = fadd float %608, %611, !dbg !58
|
649 |
+
%613 = bitcast float %523 to i32, !dbg !59
|
650 |
+
%614 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %613, i32 4, i32 31), !dbg !59
|
651 |
+
%615 = bitcast i32 %614 to float, !dbg !59
|
652 |
+
%616 = bitcast float %528 to i32, !dbg !59
|
653 |
+
%617 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %616, i32 4, i32 31), !dbg !59
|
654 |
+
%618 = bitcast i32 %617 to float, !dbg !59
|
655 |
+
%619 = bitcast float %518 to i32, !dbg !59
|
656 |
+
%620 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %619, i32 4, i32 31), !dbg !59
|
657 |
+
%621 = bitcast i32 %620 to float, !dbg !59
|
658 |
+
%622 = fsub float %615, %523, !dbg !44
|
659 |
+
%623 = fadd float %518, %621, !dbg !48
|
660 |
+
%624 = fcmp oeq float %623, 0.000000e+00, !dbg !49
|
661 |
+
%625 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %621, float %623) #6, !dbg !50
|
662 |
+
%626 = select i1 %624, float 0.000000e+00, float %625, !dbg !51
|
663 |
+
%627 = fmul float %626, %622, !dbg !52
|
664 |
+
%628 = fadd float %523, %627, !dbg !53
|
665 |
+
%629 = fadd float %528, %618, !dbg !54
|
666 |
+
%630 = fmul float %622, %622, !dbg !55
|
667 |
+
%631 = fmul float %518, %630, !dbg !56
|
668 |
+
%632 = fmul float %626, %631, !dbg !57
|
669 |
+
%633 = fadd float %629, %632, !dbg !58
|
670 |
+
%634 = bitcast float %628 to i32, !dbg !59
|
671 |
+
%635 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %634, i32 2, i32 31), !dbg !59
|
672 |
+
%636 = bitcast i32 %635 to float, !dbg !59
|
673 |
+
%637 = bitcast float %633 to i32, !dbg !59
|
674 |
+
%638 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %637, i32 2, i32 31), !dbg !59
|
675 |
+
%639 = bitcast i32 %638 to float, !dbg !59
|
676 |
+
%640 = bitcast float %623 to i32, !dbg !59
|
677 |
+
%641 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %640, i32 2, i32 31), !dbg !59
|
678 |
+
%642 = bitcast i32 %641 to float, !dbg !59
|
679 |
+
%643 = fsub float %636, %628, !dbg !44
|
680 |
+
%644 = fadd float %623, %642, !dbg !48
|
681 |
+
%645 = fcmp oeq float %644, 0.000000e+00, !dbg !49
|
682 |
+
%646 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %642, float %644) #6, !dbg !50
|
683 |
+
%647 = select i1 %645, float 0.000000e+00, float %646, !dbg !51
|
684 |
+
%648 = fmul float %647, %643, !dbg !52
|
685 |
+
%649 = fadd float %628, %648, !dbg !53
|
686 |
+
%650 = fadd float %633, %639, !dbg !54
|
687 |
+
%651 = fmul float %643, %643, !dbg !55
|
688 |
+
%652 = fmul float %623, %651, !dbg !56
|
689 |
+
%653 = fmul float %647, %652, !dbg !57
|
690 |
+
%654 = fadd float %650, %653, !dbg !58
|
691 |
+
%655 = bitcast float %649 to i32, !dbg !59
|
692 |
+
%656 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %655, i32 1, i32 31), !dbg !59
|
693 |
+
%657 = bitcast i32 %656 to float, !dbg !59
|
694 |
+
%658 = bitcast float %654 to i32, !dbg !59
|
695 |
+
%659 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %658, i32 1, i32 31), !dbg !59
|
696 |
+
%660 = bitcast i32 %659 to float, !dbg !59
|
697 |
+
%661 = bitcast float %644 to i32, !dbg !59
|
698 |
+
%662 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %661, i32 1, i32 31), !dbg !59
|
699 |
+
%663 = bitcast i32 %662 to float, !dbg !59
|
700 |
+
%664 = fsub float %657, %649, !dbg !44
|
701 |
+
%665 = fadd float %644, %663, !dbg !48
|
702 |
+
%666 = fcmp oeq float %665, 0.000000e+00, !dbg !49
|
703 |
+
%667 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %663, float %665) #6, !dbg !50
|
704 |
+
%668 = select i1 %666, float 0.000000e+00, float %667, !dbg !51
|
705 |
+
%669 = fmul float %664, %668, !dbg !52
|
706 |
+
%670 = fadd float %649, %669, !dbg !53
|
707 |
+
%671 = fadd float %654, %660, !dbg !54
|
708 |
+
%672 = fmul float %664, %664, !dbg !55
|
709 |
+
%673 = fmul float %644, %672, !dbg !56
|
710 |
+
%674 = fmul float %668, %673, !dbg !57
|
711 |
+
%675 = fadd float %671, %674, !dbg !58
|
712 |
+
%676 = bitcast float %607 to i32, !dbg !59
|
713 |
+
%677 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %676, i32 4, i32 31), !dbg !59
|
714 |
+
%678 = bitcast i32 %677 to float, !dbg !59
|
715 |
+
%679 = bitcast float %612 to i32, !dbg !59
|
716 |
+
%680 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %679, i32 4, i32 31), !dbg !59
|
717 |
+
%681 = bitcast i32 %680 to float, !dbg !59
|
718 |
+
%682 = bitcast float %602 to i32, !dbg !59
|
719 |
+
%683 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %682, i32 4, i32 31), !dbg !59
|
720 |
+
%684 = bitcast i32 %683 to float, !dbg !59
|
721 |
+
%685 = fsub float %678, %607, !dbg !44
|
722 |
+
%686 = fadd float %602, %684, !dbg !48
|
723 |
+
%687 = fcmp oeq float %686, 0.000000e+00, !dbg !49
|
724 |
+
%688 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %684, float %686) #6, !dbg !50
|
725 |
+
%689 = select i1 %687, float 0.000000e+00, float %688, !dbg !51
|
726 |
+
%690 = fmul float %685, %689, !dbg !52
|
727 |
+
%691 = fadd float %607, %690, !dbg !53
|
728 |
+
%692 = fadd float %612, %681, !dbg !54
|
729 |
+
%693 = fmul float %685, %685, !dbg !55
|
730 |
+
%694 = fmul float %602, %693, !dbg !56
|
731 |
+
%695 = fmul float %694, %689, !dbg !57
|
732 |
+
%696 = fadd float %692, %695, !dbg !58
|
733 |
+
%697 = bitcast float %691 to i32, !dbg !59
|
734 |
+
%698 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %697, i32 2, i32 31), !dbg !59
|
735 |
+
%699 = bitcast i32 %698 to float, !dbg !59
|
736 |
+
%700 = bitcast float %696 to i32, !dbg !59
|
737 |
+
%701 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %700, i32 2, i32 31), !dbg !59
|
738 |
+
%702 = bitcast i32 %701 to float, !dbg !59
|
739 |
+
%703 = bitcast float %686 to i32, !dbg !59
|
740 |
+
%704 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %703, i32 2, i32 31), !dbg !59
|
741 |
+
%705 = bitcast i32 %704 to float, !dbg !59
|
742 |
+
%706 = fsub float %699, %691, !dbg !44
|
743 |
+
%707 = fadd float %686, %705, !dbg !48
|
744 |
+
%708 = fcmp oeq float %707, 0.000000e+00, !dbg !49
|
745 |
+
%709 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %705, float %707) #6, !dbg !50
|
746 |
+
%710 = select i1 %708, float 0.000000e+00, float %709, !dbg !51
|
747 |
+
%711 = fmul float %706, %710, !dbg !52
|
748 |
+
%712 = fadd float %691, %711, !dbg !53
|
749 |
+
%713 = fadd float %696, %702, !dbg !54
|
750 |
+
%714 = fmul float %706, %706, !dbg !55
|
751 |
+
%715 = fmul float %686, %714, !dbg !56
|
752 |
+
%716 = fmul float %710, %715, !dbg !57
|
753 |
+
%717 = fadd float %713, %716, !dbg !58
|
754 |
+
%718 = bitcast float %712 to i32, !dbg !59
|
755 |
+
%719 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %718, i32 1, i32 31), !dbg !59
|
756 |
+
%720 = bitcast i32 %719 to float, !dbg !59
|
757 |
+
%721 = bitcast float %717 to i32, !dbg !59
|
758 |
+
%722 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %721, i32 1, i32 31), !dbg !59
|
759 |
+
%723 = bitcast i32 %722 to float, !dbg !59
|
760 |
+
%724 = bitcast float %707 to i32, !dbg !59
|
761 |
+
%725 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %724, i32 1, i32 31), !dbg !59
|
762 |
+
%726 = bitcast i32 %725 to float, !dbg !59
|
763 |
+
%727 = fsub float %720, %712, !dbg !44
|
764 |
+
%728 = fadd float %707, %726, !dbg !48
|
765 |
+
%729 = fcmp oeq float %728, 0.000000e+00, !dbg !49
|
766 |
+
%730 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %726, float %728) #6, !dbg !50
|
767 |
+
%731 = select i1 %729, float 0.000000e+00, float %730, !dbg !51
|
768 |
+
%732 = fmul float %727, %731, !dbg !52
|
769 |
+
%733 = fadd float %712, %732, !dbg !53
|
770 |
+
%734 = fadd float %717, %723, !dbg !54
|
771 |
+
%735 = fmul float %727, %727, !dbg !55
|
772 |
+
%736 = fmul float %707, %735, !dbg !56
|
773 |
+
%737 = fmul float %731, %736, !dbg !57
|
774 |
+
%738 = fadd float %734, %737, !dbg !58
|
775 |
+
%739 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
|
776 |
+
%740 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
|
777 |
+
%741 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
|
778 |
+
%742 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
|
779 |
+
%743 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
|
780 |
+
%744 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
|
781 |
+
%745 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
|
782 |
+
%746 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %675, float 2.560000e+02) #6, !dbg !61
|
783 |
+
%747 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
|
784 |
+
%748 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
|
785 |
+
%749 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
|
786 |
+
%750 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
|
787 |
+
%751 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
|
788 |
+
%752 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
|
789 |
+
%753 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
|
790 |
+
%754 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %738, float 2.560000e+02) #6, !dbg !61
|
791 |
+
%755 = fadd float %739, 0x3EE4F8B580000000, !dbg !62
|
792 |
+
%756 = fadd float %747, 0x3EE4F8B580000000, !dbg !62
|
793 |
+
%757 = shl i32 %18, 8, !dbg !63
|
794 |
+
%758 = shl i32 %19, 8, !dbg !63
|
795 |
+
br label %759, !dbg !64
|
796 |
+
|
797 |
+
759: ; preds = %368, %__nv_rsqrtf.exit25
|
798 |
+
%760 = phi i32 [ 0, %368 ], [ %1009, %__nv_rsqrtf.exit25 ]
|
799 |
+
%761 = or i32 %760, %13, !dbg !65
|
800 |
+
%762 = or i32 %760, %14, !dbg !65
|
801 |
+
%763 = add i32 %761, %46, !dbg !66
|
802 |
+
%764 = add i32 %762, %46, !dbg !66
|
803 |
+
%765 = add i32 %761, %47, !dbg !66
|
804 |
+
%766 = add i32 %762, %47, !dbg !66
|
805 |
+
%767 = sext i32 %763 to i64, !dbg !67
|
806 |
+
%768 = getelementptr float, ptr addrspace(1) %2, i64 %767, !dbg !67
|
807 |
+
%769 = sext i32 %764 to i64, !dbg !67
|
808 |
+
%770 = getelementptr float, ptr addrspace(1) %2, i64 %769, !dbg !67
|
809 |
+
%771 = sext i32 %765 to i64, !dbg !67
|
810 |
+
%772 = getelementptr float, ptr addrspace(1) %2, i64 %771, !dbg !67
|
811 |
+
%773 = sext i32 %766 to i64, !dbg !67
|
812 |
+
%774 = getelementptr float, ptr addrspace(1) %2, i64 %773, !dbg !67
|
813 |
+
%775 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %768, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
|
814 |
+
%776 = extractvalue { i32, i32, i32, i32 } %775, 0, !dbg !68
|
815 |
+
%777 = extractvalue { i32, i32, i32, i32 } %775, 1, !dbg !68
|
816 |
+
%778 = extractvalue { i32, i32, i32, i32 } %775, 2, !dbg !68
|
817 |
+
%779 = extractvalue { i32, i32, i32, i32 } %775, 3, !dbg !68
|
818 |
+
%780 = bitcast i32 %776 to float, !dbg !68
|
819 |
+
%781 = bitcast i32 %777 to float, !dbg !68
|
820 |
+
%782 = bitcast i32 %778 to float, !dbg !68
|
821 |
+
%783 = bitcast i32 %779 to float, !dbg !68
|
822 |
+
%784 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %770, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
|
823 |
+
%785 = extractvalue { i32, i32, i32, i32 } %784, 0, !dbg !68
|
824 |
+
%786 = extractvalue { i32, i32, i32, i32 } %784, 1, !dbg !68
|
825 |
+
%787 = extractvalue { i32, i32, i32, i32 } %784, 2, !dbg !68
|
826 |
+
%788 = extractvalue { i32, i32, i32, i32 } %784, 3, !dbg !68
|
827 |
+
%789 = bitcast i32 %785 to float, !dbg !68
|
828 |
+
%790 = bitcast i32 %786 to float, !dbg !68
|
829 |
+
%791 = bitcast i32 %787 to float, !dbg !68
|
830 |
+
%792 = bitcast i32 %788 to float, !dbg !68
|
831 |
+
%793 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %772, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
|
832 |
+
%794 = extractvalue { i32, i32, i32, i32 } %793, 0, !dbg !68
|
833 |
+
%795 = extractvalue { i32, i32, i32, i32 } %793, 1, !dbg !68
|
834 |
+
%796 = extractvalue { i32, i32, i32, i32 } %793, 2, !dbg !68
|
835 |
+
%797 = extractvalue { i32, i32, i32, i32 } %793, 3, !dbg !68
|
836 |
+
%798 = bitcast i32 %794 to float, !dbg !68
|
837 |
+
%799 = bitcast i32 %795 to float, !dbg !68
|
838 |
+
%800 = bitcast i32 %796 to float, !dbg !68
|
839 |
+
%801 = bitcast i32 %797 to float, !dbg !68
|
840 |
+
%802 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %774, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
|
841 |
+
%803 = extractvalue { i32, i32, i32, i32 } %802, 0, !dbg !68
|
842 |
+
%804 = extractvalue { i32, i32, i32, i32 } %802, 1, !dbg !68
|
843 |
+
%805 = extractvalue { i32, i32, i32, i32 } %802, 2, !dbg !68
|
844 |
+
%806 = extractvalue { i32, i32, i32, i32 } %802, 3, !dbg !68
|
845 |
+
%807 = bitcast i32 %803 to float, !dbg !68
|
846 |
+
%808 = bitcast i32 %804 to float, !dbg !68
|
847 |
+
%809 = bitcast i32 %805 to float, !dbg !68
|
848 |
+
%810 = bitcast i32 %806 to float, !dbg !68
|
849 |
+
%811 = zext nneg i32 %761 to i64, !dbg !69
|
850 |
+
%812 = getelementptr float, ptr addrspace(1) %3, i64 %811, !dbg !69
|
851 |
+
%813 = zext nneg i32 %762 to i64, !dbg !69
|
852 |
+
%814 = getelementptr float, ptr addrspace(1) %3, i64 %813, !dbg !69
|
853 |
+
%815 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %812, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
|
854 |
+
%816 = extractvalue { i32, i32, i32, i32 } %815, 0, !dbg !70
|
855 |
+
%817 = extractvalue { i32, i32, i32, i32 } %815, 1, !dbg !70
|
856 |
+
%818 = extractvalue { i32, i32, i32, i32 } %815, 2, !dbg !70
|
857 |
+
%819 = extractvalue { i32, i32, i32, i32 } %815, 3, !dbg !70
|
858 |
+
%820 = bitcast i32 %816 to float, !dbg !70
|
859 |
+
%821 = bitcast i32 %817 to float, !dbg !70
|
860 |
+
%822 = bitcast i32 %818 to float, !dbg !70
|
861 |
+
%823 = bitcast i32 %819 to float, !dbg !70
|
862 |
+
%824 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %814, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
|
863 |
+
%825 = extractvalue { i32, i32, i32, i32 } %824, 0, !dbg !70
|
864 |
+
%826 = extractvalue { i32, i32, i32, i32 } %824, 1, !dbg !70
|
865 |
+
%827 = extractvalue { i32, i32, i32, i32 } %824, 2, !dbg !70
|
866 |
+
%828 = extractvalue { i32, i32, i32, i32 } %824, 3, !dbg !70
|
867 |
+
%829 = bitcast i32 %825 to float, !dbg !70
|
868 |
+
%830 = bitcast i32 %826 to float, !dbg !70
|
869 |
+
%831 = bitcast i32 %827 to float, !dbg !70
|
870 |
+
%832 = bitcast i32 %828 to float, !dbg !70
|
871 |
+
br i1 %53, label %833, label %834, !dbg !71
|
872 |
+
|
873 |
+
833: ; preds = %759
|
874 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !71
|
875 |
+
br label %834, !dbg !71
|
876 |
+
|
877 |
+
834: ; preds = %833, %759
|
878 |
+
%835 = getelementptr float, ptr addrspace(1) %60, i64 %811, !dbg !72
|
879 |
+
%836 = getelementptr float, ptr addrspace(1) %60, i64 %813, !dbg !72
|
880 |
+
%837 = getelementptr float, ptr addrspace(1) %61, i64 %811, !dbg !72
|
881 |
+
%838 = getelementptr float, ptr addrspace(1) %61, i64 %813, !dbg !72
|
882 |
+
%839 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %835, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
|
883 |
+
%840 = extractvalue { i32, i32, i32, i32 } %839, 0, !dbg !73
|
884 |
+
%841 = extractvalue { i32, i32, i32, i32 } %839, 1, !dbg !73
|
885 |
+
%842 = extractvalue { i32, i32, i32, i32 } %839, 2, !dbg !73
|
886 |
+
%843 = extractvalue { i32, i32, i32, i32 } %839, 3, !dbg !73
|
887 |
+
%844 = bitcast i32 %840 to float, !dbg !73
|
888 |
+
%845 = bitcast i32 %841 to float, !dbg !73
|
889 |
+
%846 = bitcast i32 %842 to float, !dbg !73
|
890 |
+
%847 = bitcast i32 %843 to float, !dbg !73
|
891 |
+
%848 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %836, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
|
892 |
+
%849 = extractvalue { i32, i32, i32, i32 } %848, 0, !dbg !73
|
893 |
+
%850 = extractvalue { i32, i32, i32, i32 } %848, 1, !dbg !73
|
894 |
+
%851 = extractvalue { i32, i32, i32, i32 } %848, 2, !dbg !73
|
895 |
+
%852 = extractvalue { i32, i32, i32, i32 } %848, 3, !dbg !73
|
896 |
+
%853 = bitcast i32 %849 to float, !dbg !73
|
897 |
+
%854 = bitcast i32 %850 to float, !dbg !73
|
898 |
+
%855 = bitcast i32 %851 to float, !dbg !73
|
899 |
+
%856 = bitcast i32 %852 to float, !dbg !73
|
900 |
+
%857 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %837, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
|
901 |
+
%858 = extractvalue { i32, i32, i32, i32 } %857, 0, !dbg !73
|
902 |
+
%859 = extractvalue { i32, i32, i32, i32 } %857, 1, !dbg !73
|
903 |
+
%860 = extractvalue { i32, i32, i32, i32 } %857, 2, !dbg !73
|
904 |
+
%861 = extractvalue { i32, i32, i32, i32 } %857, 3, !dbg !73
|
905 |
+
%862 = bitcast i32 %858 to float, !dbg !73
|
906 |
+
%863 = bitcast i32 %859 to float, !dbg !73
|
907 |
+
%864 = bitcast i32 %860 to float, !dbg !73
|
908 |
+
%865 = bitcast i32 %861 to float, !dbg !73
|
909 |
+
%866 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %838, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
|
910 |
+
%867 = extractvalue { i32, i32, i32, i32 } %866, 0, !dbg !73
|
911 |
+
%868 = extractvalue { i32, i32, i32, i32 } %866, 1, !dbg !73
|
912 |
+
%869 = extractvalue { i32, i32, i32, i32 } %866, 2, !dbg !73
|
913 |
+
%870 = extractvalue { i32, i32, i32, i32 } %866, 3, !dbg !73
|
914 |
+
%871 = bitcast i32 %867 to float, !dbg !73
|
915 |
+
%872 = bitcast i32 %868 to float, !dbg !73
|
916 |
+
%873 = bitcast i32 %869 to float, !dbg !73
|
917 |
+
%874 = bitcast i32 %870 to float, !dbg !73
|
918 |
+
%875 = fadd float %780, %844, !dbg !74
|
919 |
+
%876 = fadd float %781, %845, !dbg !74
|
920 |
+
%877 = fadd float %782, %846, !dbg !74
|
921 |
+
%878 = fadd float %783, %847, !dbg !74
|
922 |
+
%879 = fadd float %789, %853, !dbg !74
|
923 |
+
%880 = fadd float %790, %854, !dbg !74
|
924 |
+
%881 = fadd float %791, %855, !dbg !74
|
925 |
+
%882 = fadd float %792, %856, !dbg !74
|
926 |
+
%883 = fadd float %798, %862, !dbg !74
|
927 |
+
%884 = fadd float %799, %863, !dbg !74
|
928 |
+
%885 = fadd float %800, %864, !dbg !74
|
929 |
+
%886 = fadd float %801, %865, !dbg !74
|
930 |
+
%887 = fadd float %807, %871, !dbg !74
|
931 |
+
%888 = fadd float %808, %872, !dbg !74
|
932 |
+
%889 = fadd float %809, %873, !dbg !74
|
933 |
+
%890 = fadd float %810, %874, !dbg !74
|
934 |
+
%891 = fsub float %875, %670, !dbg !75
|
935 |
+
%892 = fsub float %876, %670, !dbg !75
|
936 |
+
%893 = fsub float %877, %670, !dbg !75
|
937 |
+
%894 = fsub float %878, %670, !dbg !75
|
938 |
+
%895 = fsub float %879, %670, !dbg !75
|
939 |
+
%896 = fsub float %880, %670, !dbg !75
|
940 |
+
%897 = fsub float %881, %670, !dbg !75
|
941 |
+
%898 = fsub float %882, %670, !dbg !75
|
942 |
+
%899 = fsub float %883, %733, !dbg !75
|
943 |
+
%900 = fsub float %884, %733, !dbg !75
|
944 |
+
%901 = fsub float %885, %733, !dbg !75
|
945 |
+
%902 = fsub float %886, %733, !dbg !75
|
946 |
+
%903 = fsub float %887, %733, !dbg !75
|
947 |
+
%904 = fsub float %888, %733, !dbg !75
|
948 |
+
%905 = fsub float %889, %733, !dbg !75
|
949 |
+
%906 = fsub float %890, %733, !dbg !75
|
950 |
+
%907 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
951 |
+
%.not.i = icmp eq i32 %907, 0, !dbg !76
|
952 |
+
br i1 %.not.i, label %910, label %908, !dbg !76
|
953 |
+
|
954 |
+
908: ; preds = %834
|
955 |
+
%909 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %755), !dbg !76
|
956 |
+
br label %__nv_rsqrtf.exit, !dbg !76
|
957 |
+
|
958 |
+
910: ; preds = %834
|
959 |
+
%911 = tail call float @llvm.nvvm.rsqrt.approx.f(float %755), !dbg !76
|
960 |
+
br label %__nv_rsqrtf.exit, !dbg !76
|
961 |
+
|
962 |
+
__nv_rsqrtf.exit: ; preds = %908, %910
|
963 |
+
%.0.i = phi float [ %909, %908 ], [ %911, %910 ], !dbg !76
|
964 |
+
%912 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
965 |
+
%913 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
966 |
+
%914 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
967 |
+
%915 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
968 |
+
%916 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
969 |
+
%917 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
970 |
+
%918 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
971 |
+
%919 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
972 |
+
%.not.i23 = icmp eq i32 %919, 0, !dbg !76
|
973 |
+
br i1 %.not.i23, label %922, label %920, !dbg !76
|
974 |
+
|
975 |
+
920: ; preds = %__nv_rsqrtf.exit
|
976 |
+
%921 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %756), !dbg !76
|
977 |
+
br label %__nv_rsqrtf.exit25, !dbg !76
|
978 |
+
|
979 |
+
922: ; preds = %__nv_rsqrtf.exit
|
980 |
+
%923 = tail call float @llvm.nvvm.rsqrt.approx.f(float %756), !dbg !76
|
981 |
+
br label %__nv_rsqrtf.exit25, !dbg !76
|
982 |
+
|
983 |
+
__nv_rsqrtf.exit25: ; preds = %920, %922
|
984 |
+
%.0.i24 = phi float [ %921, %920 ], [ %923, %922 ], !dbg !76
|
985 |
+
%924 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
986 |
+
%925 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
987 |
+
%926 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
988 |
+
%927 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
989 |
+
%928 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
990 |
+
%929 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
991 |
+
%930 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
992 |
+
%931 = fmul float %891, %.0.i, !dbg !77
|
993 |
+
%932 = fmul float %892, %.0.i, !dbg !77
|
994 |
+
%933 = fmul float %893, %.0.i, !dbg !77
|
995 |
+
%934 = fmul float %894, %.0.i, !dbg !77
|
996 |
+
%935 = fmul float %895, %.0.i, !dbg !77
|
997 |
+
%936 = fmul float %896, %.0.i, !dbg !77
|
998 |
+
%937 = fmul float %897, %.0.i, !dbg !77
|
999 |
+
%938 = fmul float %898, %.0.i, !dbg !77
|
1000 |
+
%939 = fmul float %899, %.0.i24, !dbg !77
|
1001 |
+
%940 = fmul float %900, %.0.i24, !dbg !77
|
1002 |
+
%941 = fmul float %901, %.0.i24, !dbg !77
|
1003 |
+
%942 = fmul float %902, %.0.i24, !dbg !77
|
1004 |
+
%943 = fmul float %903, %.0.i24, !dbg !77
|
1005 |
+
%944 = fmul float %904, %.0.i24, !dbg !77
|
1006 |
+
%945 = fmul float %905, %.0.i24, !dbg !77
|
1007 |
+
%946 = fmul float %906, %.0.i24, !dbg !77
|
1008 |
+
%947 = fmul float %931, %820, !dbg !78
|
1009 |
+
%948 = fmul float %932, %821, !dbg !78
|
1010 |
+
%949 = fmul float %933, %822, !dbg !78
|
1011 |
+
%950 = fmul float %934, %823, !dbg !78
|
1012 |
+
%951 = fmul float %935, %829, !dbg !78
|
1013 |
+
%952 = fmul float %936, %830, !dbg !78
|
1014 |
+
%953 = fmul float %937, %831, !dbg !78
|
1015 |
+
%954 = fmul float %938, %832, !dbg !78
|
1016 |
+
%955 = fmul float %939, %820, !dbg !78
|
1017 |
+
%956 = fmul float %940, %821, !dbg !78
|
1018 |
+
%957 = fmul float %941, %822, !dbg !78
|
1019 |
+
%958 = fmul float %942, %823, !dbg !78
|
1020 |
+
%959 = fmul float %943, %829, !dbg !78
|
1021 |
+
%960 = fmul float %944, %830, !dbg !78
|
1022 |
+
%961 = fmul float %945, %831, !dbg !78
|
1023 |
+
%962 = fmul float %946, %832, !dbg !78
|
1024 |
+
%963 = add i32 %761, %757, !dbg !79
|
1025 |
+
%964 = add i32 %761, %758, !dbg !79
|
1026 |
+
%965 = sext i32 %963 to i64, !dbg !80
|
1027 |
+
%966 = getelementptr i16, ptr addrspace(1) %4, i64 %965, !dbg !80
|
1028 |
+
%967 = sext i32 %964 to i64, !dbg !80
|
1029 |
+
%968 = getelementptr i16, ptr addrspace(1) %4, i64 %967, !dbg !80
|
1030 |
+
%969 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %947) #6, !dbg !81
|
1031 |
+
%970 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %948) #6, !dbg !81
|
1032 |
+
%971 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %949) #6, !dbg !81
|
1033 |
+
%972 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %950) #6, !dbg !81
|
1034 |
+
%973 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %951) #6, !dbg !81
|
1035 |
+
%974 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %952) #6, !dbg !81
|
1036 |
+
%975 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %953) #6, !dbg !81
|
1037 |
+
%976 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %954) #6, !dbg !81
|
1038 |
+
%977 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %955) #6, !dbg !81
|
1039 |
+
%978 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %956) #6, !dbg !81
|
1040 |
+
%979 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %957) #6, !dbg !81
|
1041 |
+
%980 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %958) #6, !dbg !81
|
1042 |
+
%981 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %959) #6, !dbg !81
|
1043 |
+
%982 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %960) #6, !dbg !81
|
1044 |
+
%983 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %961) #6, !dbg !81
|
1045 |
+
%984 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %962) #6, !dbg !81
|
1046 |
+
%985 = insertelement <2 x i16> undef, i16 %969, i64 0, !dbg !81
|
1047 |
+
%986 = insertelement <2 x i16> %985, i16 %970, i64 1, !dbg !81
|
1048 |
+
%987 = bitcast <2 x i16> %986 to i32, !dbg !81
|
1049 |
+
%988 = insertelement <2 x i16> undef, i16 %971, i64 0, !dbg !81
|
1050 |
+
%989 = insertelement <2 x i16> %988, i16 %972, i64 1, !dbg !81
|
1051 |
+
%990 = bitcast <2 x i16> %989 to i32, !dbg !81
|
1052 |
+
%991 = insertelement <2 x i16> undef, i16 %973, i64 0, !dbg !81
|
1053 |
+
%992 = insertelement <2 x i16> %991, i16 %974, i64 1, !dbg !81
|
1054 |
+
%993 = bitcast <2 x i16> %992 to i32, !dbg !81
|
1055 |
+
%994 = insertelement <2 x i16> undef, i16 %975, i64 0, !dbg !81
|
1056 |
+
%995 = insertelement <2 x i16> %994, i16 %976, i64 1, !dbg !81
|
1057 |
+
%996 = bitcast <2 x i16> %995 to i32, !dbg !81
|
1058 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %987, i32 %990, i32 %993, i32 %996, ptr addrspace(1) %966, i1 true) #6, !dbg !81
|
1059 |
+
%997 = insertelement <2 x i16> undef, i16 %977, i64 0, !dbg !81
|
1060 |
+
%998 = insertelement <2 x i16> %997, i16 %978, i64 1, !dbg !81
|
1061 |
+
%999 = bitcast <2 x i16> %998 to i32, !dbg !81
|
1062 |
+
%1000 = insertelement <2 x i16> undef, i16 %979, i64 0, !dbg !81
|
1063 |
+
%1001 = insertelement <2 x i16> %1000, i16 %980, i64 1, !dbg !81
|
1064 |
+
%1002 = bitcast <2 x i16> %1001 to i32, !dbg !81
|
1065 |
+
%1003 = insertelement <2 x i16> undef, i16 %981, i64 0, !dbg !81
|
1066 |
+
%1004 = insertelement <2 x i16> %1003, i16 %982, i64 1, !dbg !81
|
1067 |
+
%1005 = bitcast <2 x i16> %1004 to i32, !dbg !81
|
1068 |
+
%1006 = insertelement <2 x i16> undef, i16 %983, i64 0, !dbg !81
|
1069 |
+
%1007 = insertelement <2 x i16> %1006, i16 %984, i64 1, !dbg !81
|
1070 |
+
%1008 = bitcast <2 x i16> %1007 to i32, !dbg !81
|
1071 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %999, i32 %1002, i32 %1005, i32 %1008, ptr addrspace(1) %968, i1 true) #6, !dbg !81
|
1072 |
+
%1009 = add nuw nsw i32 %760, 64, !dbg !64
|
1073 |
+
%1010 = icmp ult i32 %760, 192, !dbg !64
|
1074 |
+
br i1 %1010, label %759, label %1011, !dbg !64
|
1075 |
+
|
1076 |
+
1011: ; preds = %__nv_rsqrtf.exit25
|
1077 |
+
ret void, !dbg !82
|
1078 |
+
}
|
1079 |
+
|
1080 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
1081 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
1082 |
+
|
1083 |
+
; Function Attrs: convergent nocallback nounwind
|
1084 |
+
declare void @llvm.nvvm.barrier0() #1
|
1085 |
+
|
1086 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
1087 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
|
1088 |
+
|
1089 |
+
; Function Attrs: alwaysinline nounwind
|
1090 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
1091 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
1092 |
+
%.not = icmp eq i32 %1, 0
|
1093 |
+
br i1 %.not, label %4, label %2
|
1094 |
+
|
1095 |
+
2: ; preds = %0
|
1096 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
1097 |
+
br label %6
|
1098 |
+
|
1099 |
+
4: ; preds = %0
|
1100 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
1101 |
+
br label %6
|
1102 |
+
|
1103 |
+
6: ; preds = %4, %2
|
1104 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
1105 |
+
ret float %.0
|
1106 |
+
}
|
1107 |
+
|
1108 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
1109 |
+
|
1110 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
1111 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
1112 |
+
|
1113 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
1114 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
1115 |
+
|
1116 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
1117 |
+
attributes #1 = { convergent nocallback nounwind }
|
1118 |
+
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
1119 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
1120 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
1121 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
1122 |
+
attributes #6 = { nounwind }
|
1123 |
+
|
1124 |
+
!llvm.module.flags = !{!0, !1}
|
1125 |
+
!llvm.dbg.cu = !{!2}
|
1126 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
1127 |
+
!llvm.ident = !{!6}
|
1128 |
+
|
1129 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
1130 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
1131 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
1132 |
+
!3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx")
|
1133 |
+
!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
|
1134 |
+
!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 256}
|
1135 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
1136 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
1137 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
1138 |
+
!9 = !{}
|
1139 |
+
!10 = !DILocation(line: 22, column: 44, scope: !7)
|
1140 |
+
!11 = !DILocation(line: 24, column: 33, scope: !7)
|
1141 |
+
!12 = !DILocation(line: 31, column: 36, scope: !7)
|
1142 |
+
!13 = !DILocation(line: 21, column: 28, scope: !7)
|
1143 |
+
!14 = !DILocation(line: 21, column: 33, scope: !7)
|
1144 |
+
!15 = !DILocation(line: 22, column: 23, scope: !7)
|
1145 |
+
!16 = !DILocation(line: 26, column: 30, scope: !7)
|
1146 |
+
!17 = !DILocation(line: 26, column: 35, scope: !7)
|
1147 |
+
!18 = !DILocation(line: 27, column: 18, scope: !7)
|
1148 |
+
!19 = !DILocation(line: 35, column: 44, scope: !7)
|
1149 |
+
!20 = !DILocation(line: 36, column: 22, scope: !7)
|
1150 |
+
!21 = !DILocation(line: 37, column: 22, scope: !7)
|
1151 |
+
!22 = !DILocation(line: 38, column: 36, scope: !7)
|
1152 |
+
!23 = !DILocation(line: 39, column: 40, scope: !7)
|
1153 |
+
!24 = !DILocation(line: 40, column: 44, scope: !7)
|
1154 |
+
!25 = !DILocation(line: 32, column: 27, scope: !7)
|
1155 |
+
!26 = !DILocation(line: 35, column: 40, scope: !7)
|
1156 |
+
!27 = !DILocation(line: 35, column: 34, scope: !7)
|
1157 |
+
!28 = !DILocation(line: 35, column: 50, scope: !7)
|
1158 |
+
!29 = !DILocation(line: 39, column: 55, scope: !7)
|
1159 |
+
!30 = !DILocation(line: 40, column: 40, scope: !7)
|
1160 |
+
!31 = !DILocation(line: 40, column: 34, scope: !7)
|
1161 |
+
!32 = !DILocation(line: 40, column: 52, scope: !7)
|
1162 |
+
!33 = !DILocation(line: 41, column: 22, scope: !7)
|
1163 |
+
!34 = !DILocation(line: 96, column: 20, scope: !35, inlinedAt: !37)
|
1164 |
+
!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
|
1165 |
+
!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
1166 |
+
!37 = !DILocation(line: 44, column: 38, scope: !35)
|
1167 |
+
!38 = !DILocation(line: 97, column: 26, scope: !35, inlinedAt: !37)
|
1168 |
+
!39 = !DILocation(line: 98, column: 30, scope: !35, inlinedAt: !37)
|
1169 |
+
!40 = !DILocation(line: 98, column: 22, scope: !35, inlinedAt: !37)
|
1170 |
+
!41 = !DILocation(line: 101, column: 30, scope: !35, inlinedAt: !37)
|
1171 |
+
!42 = !DILocation(line: 101, column: 22, scope: !35, inlinedAt: !37)
|
1172 |
+
!43 = !DILocation(line: 47, column: 48, scope: !7)
|
1173 |
+
!44 = !DILocation(line: 108, column: 21, scope: !45, inlinedAt: !46)
|
1174 |
+
!45 = distinct !DILexicalBlockFile(scope: !35, file: !36, discriminator: 0)
|
1175 |
+
!46 = !DILocation(line: 120, column: 46, scope: !45, inlinedAt: !47)
|
1176 |
+
!47 = !DILocation(line: 50, column: 41, scope: !45)
|
1177 |
+
!48 = !DILocation(line: 109, column: 28, scope: !45, inlinedAt: !46)
|
1178 |
+
!49 = !DILocation(line: 110, column: 39, scope: !45, inlinedAt: !46)
|
1179 |
+
!50 = !DILocation(line: 110, column: 60, scope: !45, inlinedAt: !46)
|
1180 |
+
!51 = !DILocation(line: 110, column: 49, scope: !45, inlinedAt: !46)
|
1181 |
+
!52 = !DILocation(line: 112, column: 25, scope: !45, inlinedAt: !46)
|
1182 |
+
!53 = !DILocation(line: 112, column: 17, scope: !45, inlinedAt: !46)
|
1183 |
+
!54 = !DILocation(line: 113, column: 15, scope: !45, inlinedAt: !46)
|
1184 |
+
!55 = !DILocation(line: 113, column: 30, scope: !45, inlinedAt: !46)
|
1185 |
+
!56 = !DILocation(line: 113, column: 38, scope: !45, inlinedAt: !46)
|
1186 |
+
!57 = !DILocation(line: 113, column: 49, scope: !45, inlinedAt: !46)
|
1187 |
+
!58 = !DILocation(line: 113, column: 22, scope: !45, inlinedAt: !46)
|
1188 |
+
!59 = !DILocation(line: 120, column: 46, scope: !35, inlinedAt: !60)
|
1189 |
+
!60 = !DILocation(line: 50, column: 41, scope: !35)
|
1190 |
+
!61 = !DILocation(line: 69, column: 23, scope: !7)
|
1191 |
+
!62 = !DILocation(line: 71, column: 24, scope: !7)
|
1192 |
+
!63 = !DILocation(line: 76, column: 39, scope: !7)
|
1193 |
+
!64 = !DILocation(line: 55, column: 36, scope: !7)
|
1194 |
+
!65 = !DILocation(line: 56, column: 27, scope: !7)
|
1195 |
+
!66 = !DILocation(line: 59, column: 41, scope: !7)
|
1196 |
+
!67 = !DILocation(line: 59, column: 35, scope: !7)
|
1197 |
+
!68 = !DILocation(line: 59, column: 51, scope: !7)
|
1198 |
+
!69 = !DILocation(line: 60, column: 35, scope: !7)
|
1199 |
+
!70 = !DILocation(line: 60, column: 40, scope: !7)
|
1200 |
+
!71 = !DILocation(line: 64, column: 57, scope: !7)
|
1201 |
+
!72 = !DILocation(line: 65, column: 35, scope: !7)
|
1202 |
+
!73 = !DILocation(line: 65, column: 54, scope: !7)
|
1203 |
+
!74 = !DILocation(line: 66, column: 24, scope: !7)
|
1204 |
+
!75 = !DILocation(line: 67, column: 24, scope: !7)
|
1205 |
+
!76 = !DILocation(line: 72, column: 30, scope: !7)
|
1206 |
+
!77 = !DILocation(line: 73, column: 24, scope: !7)
|
1207 |
+
!78 = !DILocation(line: 74, column: 24, scope: !7)
|
1208 |
+
!79 = !DILocation(line: 76, column: 35, scope: !7)
|
1209 |
+
!80 = !DILocation(line: 76, column: 29, scope: !7)
|
1210 |
+
!81 = !DILocation(line: 76, column: 52, scope: !7)
|
1211 |
+
!82 = !DILocation(line: 55, column: 4, scope: !7)
|
.triton/dump/510522bb05917b836ed253751364fcad/triton_.ptx
ADDED
@@ -0,0 +1,1810 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5de6de
|
10 |
+
.extern .func __assertfail
|
11 |
+
(
|
12 |
+
.param .b64 __assertfail_param_0,
|
13 |
+
.param .b64 __assertfail_param_1,
|
14 |
+
.param .b32 __assertfail_param_2,
|
15 |
+
.param .b64 __assertfail_param_3,
|
16 |
+
.param .b64 __assertfail_param_4
|
17 |
+
)
|
18 |
+
;
|
19 |
+
.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
20 |
+
.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
21 |
+
.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55};
|
22 |
+
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
23 |
+
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
24 |
+
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
|
25 |
+
.extern .shared .align 1 .b8 global_smem[];
|
26 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
27 |
+
|
28 |
+
.visible .entry triton__0d1d2d3d4d5de6de(
|
29 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_0,
|
30 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_1,
|
31 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_2,
|
32 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_3,
|
33 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_4,
|
34 |
+
.param .u32 triton__0d1d2d3d4d5de6de_param_5,
|
35 |
+
.param .u32 triton__0d1d2d3d4d5de6de_param_6
|
36 |
+
)
|
37 |
+
.maxntid 256, 1, 1
|
38 |
+
{
|
39 |
+
.reg .pred %p<137>;
|
40 |
+
.reg .b16 %rs<17>;
|
41 |
+
.reg .b32 %r<408>;
|
42 |
+
.reg .f32 %f<614>;
|
43 |
+
.reg .b64 %rd<107>;
|
44 |
+
.loc 1 18 0
|
45 |
+
$L__func_begin0:
|
46 |
+
.loc 1 18 0
|
47 |
+
|
48 |
+
ld.param.u64 %rd13, [triton__0d1d2d3d4d5de6de_param_4];
|
49 |
+
ld.param.u64 %rd12, [triton__0d1d2d3d4d5de6de_param_3];
|
50 |
+
ld.param.u64 %rd49, [triton__0d1d2d3d4d5de6de_param_0];
|
51 |
+
ld.param.u64 %rd50, [triton__0d1d2d3d4d5de6de_param_1];
|
52 |
+
$L__tmp0:
|
53 |
+
.loc 1 22 44
|
54 |
+
mov.u32 %r13, %tid.x;
|
55 |
+
ld.param.u64 %rd51, [triton__0d1d2d3d4d5de6de_param_2];
|
56 |
+
bfe.u32 %r1, %r13, 3, 5;
|
57 |
+
and.b32 %r2, %r13, 63;
|
58 |
+
.loc 1 24 33
|
59 |
+
shl.b32 %r14, %r13, 3;
|
60 |
+
and.b32 %r3, %r14, 56;
|
61 |
+
.loc 1 31 36
|
62 |
+
shr.u32 %r4, %r13, 6;
|
63 |
+
.loc 1 21 28
|
64 |
+
mov.u32 %r11, %ctaid.x;
|
65 |
+
.loc 1 21 33
|
66 |
+
shl.b32 %r15, %r11, 6;
|
67 |
+
.loc 1 22 23
|
68 |
+
or.b32 %r16, %r15, %r1;
|
69 |
+
or.b32 %r17, %r16, 32;
|
70 |
+
or.b32 %r18, %r15, %r2;
|
71 |
+
.loc 1 26 30
|
72 |
+
mul.wide.s32 %rd52, %r16, 8;
|
73 |
+
add.s64 %rd15, %rd49, %rd52;
|
74 |
+
add.s64 %rd31, %rd15, 256;
|
75 |
+
mul.wide.s32 %rd53, %r18, 8;
|
76 |
+
add.s64 %rd47, %rd49, %rd53;
|
77 |
+
mov.pred %p1, -1;
|
78 |
+
.loc 1 26 35
|
79 |
+
mov.u64 %rd14, 0x0;
|
80 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd15 + 0 ];
|
81 |
+
mov.u64 %rd16, 0x0;
|
82 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd15 + 0 ];
|
83 |
+
mov.u64 %rd18, 0x0;
|
84 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd15 + 0 ];
|
85 |
+
mov.u64 %rd20, 0x0;
|
86 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd15 + 0 ];
|
87 |
+
mov.u64 %rd22, 0x0;
|
88 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd15 + 0 ];
|
89 |
+
mov.u64 %rd24, 0x0;
|
90 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd15 + 0 ];
|
91 |
+
mov.u64 %rd26, 0x0;
|
92 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd26 }, [ %rd15 + 0 ];
|
93 |
+
mov.u64 %rd28, 0x0;
|
94 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd28 }, [ %rd15 + 0 ];
|
95 |
+
mov.u64 %rd30, 0x0;
|
96 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd30 }, [ %rd31 + 0 ];
|
97 |
+
mov.u64 %rd32, 0x0;
|
98 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd32 }, [ %rd31 + 0 ];
|
99 |
+
mov.u64 %rd34, 0x0;
|
100 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd34 }, [ %rd31 + 0 ];
|
101 |
+
mov.u64 %rd36, 0x0;
|
102 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd36 }, [ %rd31 + 0 ];
|
103 |
+
mov.u64 %rd38, 0x0;
|
104 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd38 }, [ %rd31 + 0 ];
|
105 |
+
mov.u64 %rd40, 0x0;
|
106 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd40 }, [ %rd31 + 0 ];
|
107 |
+
mov.u64 %rd42, 0x0;
|
108 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd42 }, [ %rd31 + 0 ];
|
109 |
+
mov.u64 %rd44, 0x0;
|
110 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd44 }, [ %rd31 + 0 ];
|
111 |
+
mov.u64 %rd46, 0x0;
|
112 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd46 }, [ %rd47 + 0 ];
|
113 |
+
.loc 1 27 18
|
114 |
+
bfe.s32 %r19, %r11, 25, 1;
|
115 |
+
shr.u32 %r20, %r19, 23;
|
116 |
+
add.s32 %r21, %r16, %r20;
|
117 |
+
and.b32 %r22, %r21, 16776704;
|
118 |
+
sub.s32 %r23, %r16, %r22;
|
119 |
+
add.s32 %r24, %r17, %r20;
|
120 |
+
and.b32 %r25, %r24, 16776704;
|
121 |
+
sub.s32 %r26, %r17, %r25;
|
122 |
+
.loc 1 35 44
|
123 |
+
shl.b32 %r27, %r23, 8;
|
124 |
+
shl.b32 %r28, %r26, 8;
|
125 |
+
.loc 1 36 22
|
126 |
+
add.s64 %rd54, %rd46, 50257;
|
127 |
+
.loc 1 37 22
|
128 |
+
setp.lt.s64 %p18, %rd14, 0;
|
129 |
+
setp.lt.s64 %p19, %rd30, 0;
|
130 |
+
setp.lt.s64 %p20, %rd46, 0;
|
131 |
+
.loc 1 38 36
|
132 |
+
selp.b64 %rd1, %rd54, %rd46, %p20;
|
133 |
+
.loc 1 40 44
|
134 |
+
shl.b64 %rd55, %rd14, 8;
|
135 |
+
add.s64 %rd56, %rd55, 12865792;
|
136 |
+
selp.b64 %rd57, %rd56, %rd55, %p18;
|
137 |
+
shl.b64 %rd58, %rd30, 8;
|
138 |
+
add.s64 %rd59, %rd58, 12865792;
|
139 |
+
selp.b64 %rd60, %rd59, %rd58, %p19;
|
140 |
+
.loc 1 31 36
|
141 |
+
and.b32 %r29, %r13, 7;
|
142 |
+
mul.wide.u32 %rd2, %r29, 32;
|
143 |
+
shl.b64 %rd61, %rd60, 2;
|
144 |
+
or.b64 %rd62, %rd2, %rd61;
|
145 |
+
add.s64 %rd3, %rd50, %rd62;
|
146 |
+
shl.b64 %rd63, %rd57, 2;
|
147 |
+
or.b64 %rd64, %rd2, %rd63;
|
148 |
+
add.s64 %rd4, %rd50, %rd64;
|
149 |
+
or.b32 %r30, %r28, %r3;
|
150 |
+
mul.wide.s32 %rd65, %r30, 4;
|
151 |
+
add.s64 %rd5, %rd51, %rd65;
|
152 |
+
or.b32 %r31, %r27, %r3;
|
153 |
+
mul.wide.s32 %rd66, %r31, 4;
|
154 |
+
add.s64 %rd6, %rd51, %rd66;
|
155 |
+
mov.f32 %f550, 0f00000000;
|
156 |
+
mov.u64 %rd105, 0;
|
157 |
+
mov.b32 %r406, -64;
|
158 |
+
mov.f32 %f551, %f550;
|
159 |
+
mov.f32 %f552, %f550;
|
160 |
+
mov.f32 %f553, %f550;
|
161 |
+
mov.f32 %f554, %f550;
|
162 |
+
mov.f32 %f555, %f550;
|
163 |
+
mov.f32 %f556, %f550;
|
164 |
+
mov.f32 %f557, %f550;
|
165 |
+
mov.f32 %f558, %f550;
|
166 |
+
mov.f32 %f559, %f550;
|
167 |
+
mov.f32 %f560, %f550;
|
168 |
+
mov.f32 %f561, %f550;
|
169 |
+
mov.f32 %f562, %f550;
|
170 |
+
mov.f32 %f563, %f550;
|
171 |
+
mov.f32 %f564, %f550;
|
172 |
+
mov.f32 %f565, %f550;
|
173 |
+
mov.f32 %f566, %f550;
|
174 |
+
mov.f32 %f567, %f550;
|
175 |
+
mov.f32 %f568, %f550;
|
176 |
+
mov.f32 %f569, %f550;
|
177 |
+
mov.f32 %f570, %f550;
|
178 |
+
mov.f32 %f571, %f550;
|
179 |
+
mov.f32 %f572, %f550;
|
180 |
+
mov.f32 %f573, %f550;
|
181 |
+
mov.f32 %f574, %f550;
|
182 |
+
mov.f32 %f575, %f550;
|
183 |
+
mov.f32 %f576, %f550;
|
184 |
+
mov.f32 %f577, %f550;
|
185 |
+
mov.f32 %f578, %f550;
|
186 |
+
mov.f32 %f579, %f550;
|
187 |
+
mov.f32 %f580, %f550;
|
188 |
+
mov.f32 %f581, %f550;
|
189 |
+
mov.f32 %f582, %f550;
|
190 |
+
mov.f32 %f583, %f550;
|
191 |
+
mov.f32 %f584, %f550;
|
192 |
+
mov.f32 %f585, %f550;
|
193 |
+
mov.f32 %f586, %f550;
|
194 |
+
mov.f32 %f587, %f550;
|
195 |
+
mov.f32 %f588, %f550;
|
196 |
+
mov.f32 %f589, %f550;
|
197 |
+
mov.f32 %f590, %f550;
|
198 |
+
mov.f32 %f591, %f550;
|
199 |
+
mov.f32 %f592, %f550;
|
200 |
+
mov.f32 %f593, %f550;
|
201 |
+
mov.f32 %f594, %f550;
|
202 |
+
mov.f32 %f595, %f550;
|
203 |
+
mov.f32 %f596, %f550;
|
204 |
+
mov.f32 %f597, %f550;
|
205 |
+
mov.f32 %f598, %f550;
|
206 |
+
mov.f32 %f599, %f550;
|
207 |
+
mov.f32 %f600, %f550;
|
208 |
+
mov.f32 %f601, %f550;
|
209 |
+
mov.f32 %f602, %f550;
|
210 |
+
mov.f32 %f603, %f550;
|
211 |
+
mov.f32 %f604, %f550;
|
212 |
+
mov.f32 %f605, %f550;
|
213 |
+
mov.f32 %f606, %f550;
|
214 |
+
mov.f32 %f607, %f550;
|
215 |
+
mov.f32 %f608, %f550;
|
216 |
+
mov.f32 %f609, %f550;
|
217 |
+
mov.f32 %f610, %f550;
|
218 |
+
mov.f32 %f611, %f550;
|
219 |
+
mov.f32 %f612, %f550;
|
220 |
+
mov.f32 %f613, %f550;
|
221 |
+
bra.uni $L__BB0_1;
|
222 |
+
$L__BB0_3:
|
223 |
+
.loc 1 40 40
|
224 |
+
add.s64 %rd78, %rd4, %rd105;
|
225 |
+
.loc 1 40 34
|
226 |
+
add.s64 %rd79, %rd78, 16;
|
227 |
+
add.s64 %rd80, %rd3, %rd105;
|
228 |
+
.loc 1 40 52
|
229 |
+
add.s64 %rd81, %rd80, 16;
|
230 |
+
mov.u32 %r65, 0x0;
|
231 |
+
mov.u32 %r66, 0x0;
|
232 |
+
mov.u32 %r67, 0x0;
|
233 |
+
mov.u32 %r68, 0x0;
|
234 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r65, %r66, %r67, %r68 }, [ %rd78 + 0 ];
|
235 |
+
@!%p1 mov.u32 %r65, %r342;
|
236 |
+
@!%p1 mov.u32 %r66, %r342;
|
237 |
+
@!%p1 mov.u32 %r67, %r342;
|
238 |
+
@!%p1 mov.u32 %r68, %r342;
|
239 |
+
mov.b32 %f174, %r65;
|
240 |
+
mov.b32 %f175, %r66;
|
241 |
+
mov.b32 %f176, %r67;
|
242 |
+
mov.b32 %f177, %r68;
|
243 |
+
mov.u32 %r73, 0x0;
|
244 |
+
mov.u32 %r74, 0x0;
|
245 |
+
mov.u32 %r75, 0x0;
|
246 |
+
mov.u32 %r76, 0x0;
|
247 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r73, %r74, %r75, %r76 }, [ %rd79 + 0 ];
|
248 |
+
@!%p1 mov.u32 %r73, %r342;
|
249 |
+
@!%p1 mov.u32 %r74, %r342;
|
250 |
+
@!%p1 mov.u32 %r75, %r342;
|
251 |
+
@!%p1 mov.u32 %r76, %r342;
|
252 |
+
mov.b32 %f178, %r73;
|
253 |
+
mov.b32 %f179, %r74;
|
254 |
+
mov.b32 %f180, %r75;
|
255 |
+
mov.b32 %f181, %r76;
|
256 |
+
mov.u32 %r81, 0x0;
|
257 |
+
mov.u32 %r82, 0x0;
|
258 |
+
mov.u32 %r83, 0x0;
|
259 |
+
mov.u32 %r84, 0x0;
|
260 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r81, %r82, %r83, %r84 }, [ %rd80 + 0 ];
|
261 |
+
@!%p1 mov.u32 %r81, %r342;
|
262 |
+
@!%p1 mov.u32 %r82, %r342;
|
263 |
+
@!%p1 mov.u32 %r83, %r342;
|
264 |
+
@!%p1 mov.u32 %r84, %r342;
|
265 |
+
mov.b32 %f182, %r81;
|
266 |
+
mov.b32 %f183, %r82;
|
267 |
+
mov.b32 %f184, %r83;
|
268 |
+
mov.b32 %f185, %r84;
|
269 |
+
mov.u32 %r89, 0x0;
|
270 |
+
mov.u32 %r90, 0x0;
|
271 |
+
mov.u32 %r91, 0x0;
|
272 |
+
mov.u32 %r92, 0x0;
|
273 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r89, %r90, %r91, %r92 }, [ %rd81 + 0 ];
|
274 |
+
@!%p1 mov.u32 %r89, %r342;
|
275 |
+
@!%p1 mov.u32 %r90, %r342;
|
276 |
+
@!%p1 mov.u32 %r91, %r342;
|
277 |
+
@!%p1 mov.u32 %r92, %r342;
|
278 |
+
mov.b32 %f186, %r89;
|
279 |
+
mov.b32 %f187, %r90;
|
280 |
+
mov.b32 %f188, %r91;
|
281 |
+
mov.b32 %f189, %r92;
|
282 |
+
.loc 1 41 22
|
283 |
+
add.f32 %f190, %f65, %f174;
|
284 |
+
add.f32 %f191, %f66, %f175;
|
285 |
+
add.f32 %f192, %f67, %f176;
|
286 |
+
add.f32 %f193, %f68, %f177;
|
287 |
+
add.f32 %f194, %f69, %f178;
|
288 |
+
add.f32 %f195, %f70, %f179;
|
289 |
+
add.f32 %f196, %f71, %f180;
|
290 |
+
add.f32 %f197, %f72, %f181;
|
291 |
+
add.f32 %f198, %f73, %f182;
|
292 |
+
add.f32 %f199, %f74, %f183;
|
293 |
+
add.f32 %f200, %f75, %f184;
|
294 |
+
add.f32 %f201, %f76, %f185;
|
295 |
+
add.f32 %f202, %f77, %f186;
|
296 |
+
add.f32 %f203, %f78, %f187;
|
297 |
+
add.f32 %f204, %f79, %f188;
|
298 |
+
add.f32 %f205, %f80, %f189;
|
299 |
+
$L__tmp1:
|
300 |
+
.loc 2 96 20
|
301 |
+
sub.f32 %f206, %f190, %f598;
|
302 |
+
sub.f32 %f207, %f191, %f599;
|
303 |
+
sub.f32 %f208, %f192, %f600;
|
304 |
+
sub.f32 %f209, %f193, %f601;
|
305 |
+
sub.f32 %f210, %f194, %f602;
|
306 |
+
sub.f32 %f211, %f195, %f603;
|
307 |
+
sub.f32 %f212, %f196, %f604;
|
308 |
+
sub.f32 %f213, %f197, %f605;
|
309 |
+
sub.f32 %f214, %f198, %f606;
|
310 |
+
sub.f32 %f215, %f199, %f607;
|
311 |
+
sub.f32 %f216, %f200, %f608;
|
312 |
+
sub.f32 %f217, %f201, %f609;
|
313 |
+
sub.f32 %f218, %f202, %f610;
|
314 |
+
sub.f32 %f219, %f203, %f611;
|
315 |
+
sub.f32 %f220, %f204, %f612;
|
316 |
+
sub.f32 %f221, %f205, %f613;
|
317 |
+
.loc 2 97 26
|
318 |
+
add.f32 %f550, %f550, 0f3F800000;
|
319 |
+
add.f32 %f551, %f551, 0f3F800000;
|
320 |
+
add.f32 %f552, %f552, 0f3F800000;
|
321 |
+
add.f32 %f553, %f553, 0f3F800000;
|
322 |
+
add.f32 %f554, %f554, 0f3F800000;
|
323 |
+
add.f32 %f555, %f555, 0f3F800000;
|
324 |
+
add.f32 %f556, %f556, 0f3F800000;
|
325 |
+
add.f32 %f557, %f557, 0f3F800000;
|
326 |
+
add.f32 %f558, %f558, 0f3F800000;
|
327 |
+
add.f32 %f559, %f559, 0f3F800000;
|
328 |
+
add.f32 %f560, %f560, 0f3F800000;
|
329 |
+
add.f32 %f561, %f561, 0f3F800000;
|
330 |
+
add.f32 %f562, %f562, 0f3F800000;
|
331 |
+
add.f32 %f563, %f563, 0f3F800000;
|
332 |
+
add.f32 %f564, %f564, 0f3F800000;
|
333 |
+
add.f32 %f565, %f565, 0f3F800000;
|
334 |
+
add.f32 %f566, %f566, 0f3F800000;
|
335 |
+
add.f32 %f567, %f567, 0f3F800000;
|
336 |
+
add.f32 %f568, %f568, 0f3F800000;
|
337 |
+
add.f32 %f569, %f569, 0f3F800000;
|
338 |
+
add.f32 %f570, %f570, 0f3F800000;
|
339 |
+
add.f32 %f571, %f571, 0f3F800000;
|
340 |
+
add.f32 %f572, %f572, 0f3F800000;
|
341 |
+
add.f32 %f573, %f573, 0f3F800000;
|
342 |
+
add.f32 %f574, %f574, 0f3F800000;
|
343 |
+
add.f32 %f575, %f575, 0f3F800000;
|
344 |
+
add.f32 %f576, %f576, 0f3F800000;
|
345 |
+
add.f32 %f577, %f577, 0f3F800000;
|
346 |
+
add.f32 %f578, %f578, 0f3F800000;
|
347 |
+
add.f32 %f579, %f579, 0f3F800000;
|
348 |
+
add.f32 %f580, %f580, 0f3F800000;
|
349 |
+
add.f32 %f581, %f581, 0f3F800000;
|
350 |
+
.loc 2 98 30
|
351 |
+
mov.b32 %r98, %f206;
|
352 |
+
mov.b32 %r99, %f550;
|
353 |
+
div.full.f32 %r97, %r98, %r99;
|
354 |
+
mov.b32 %f222, %r97;
|
355 |
+
mov.b32 %r101, %f207;
|
356 |
+
mov.b32 %r102, %f551;
|
357 |
+
div.full.f32 %r100, %r101, %r102;
|
358 |
+
mov.b32 %f223, %r100;
|
359 |
+
mov.b32 %r104, %f208;
|
360 |
+
mov.b32 %r105, %f552;
|
361 |
+
div.full.f32 %r103, %r104, %r105;
|
362 |
+
mov.b32 %f224, %r103;
|
363 |
+
mov.b32 %r107, %f209;
|
364 |
+
mov.b32 %r108, %f553;
|
365 |
+
div.full.f32 %r106, %r107, %r108;
|
366 |
+
mov.b32 %f225, %r106;
|
367 |
+
mov.b32 %r110, %f210;
|
368 |
+
mov.b32 %r111, %f554;
|
369 |
+
div.full.f32 %r109, %r110, %r111;
|
370 |
+
mov.b32 %f226, %r109;
|
371 |
+
mov.b32 %r113, %f211;
|
372 |
+
mov.b32 %r114, %f555;
|
373 |
+
div.full.f32 %r112, %r113, %r114;
|
374 |
+
mov.b32 %f227, %r112;
|
375 |
+
mov.b32 %r116, %f212;
|
376 |
+
mov.b32 %r117, %f556;
|
377 |
+
div.full.f32 %r115, %r116, %r117;
|
378 |
+
mov.b32 %f228, %r115;
|
379 |
+
mov.b32 %r119, %f213;
|
380 |
+
mov.b32 %r120, %f557;
|
381 |
+
div.full.f32 %r118, %r119, %r120;
|
382 |
+
mov.b32 %f229, %r118;
|
383 |
+
mov.b32 %r122, %f214;
|
384 |
+
mov.b32 %r123, %f558;
|
385 |
+
div.full.f32 %r121, %r122, %r123;
|
386 |
+
mov.b32 %f230, %r121;
|
387 |
+
mov.b32 %r125, %f215;
|
388 |
+
mov.b32 %r126, %f559;
|
389 |
+
div.full.f32 %r124, %r125, %r126;
|
390 |
+
mov.b32 %f231, %r124;
|
391 |
+
mov.b32 %r128, %f216;
|
392 |
+
mov.b32 %r129, %f560;
|
393 |
+
div.full.f32 %r127, %r128, %r129;
|
394 |
+
mov.b32 %f232, %r127;
|
395 |
+
mov.b32 %r131, %f217;
|
396 |
+
mov.b32 %r132, %f561;
|
397 |
+
div.full.f32 %r130, %r131, %r132;
|
398 |
+
mov.b32 %f233, %r130;
|
399 |
+
mov.b32 %r134, %f218;
|
400 |
+
mov.b32 %r135, %f562;
|
401 |
+
div.full.f32 %r133, %r134, %r135;
|
402 |
+
mov.b32 %f234, %r133;
|
403 |
+
mov.b32 %r137, %f219;
|
404 |
+
mov.b32 %r138, %f563;
|
405 |
+
div.full.f32 %r136, %r137, %r138;
|
406 |
+
mov.b32 %f235, %r136;
|
407 |
+
mov.b32 %r140, %f220;
|
408 |
+
mov.b32 %r141, %f564;
|
409 |
+
div.full.f32 %r139, %r140, %r141;
|
410 |
+
mov.b32 %f236, %r139;
|
411 |
+
mov.b32 %r143, %f221;
|
412 |
+
mov.b32 %r144, %f565;
|
413 |
+
div.full.f32 %r142, %r143, %r144;
|
414 |
+
mov.b32 %f237, %r142;
|
415 |
+
.loc 2 98 22
|
416 |
+
add.f32 %f598, %f598, %f222;
|
417 |
+
add.f32 %f599, %f599, %f223;
|
418 |
+
add.f32 %f600, %f600, %f224;
|
419 |
+
add.f32 %f601, %f601, %f225;
|
420 |
+
add.f32 %f602, %f602, %f226;
|
421 |
+
add.f32 %f603, %f603, %f227;
|
422 |
+
add.f32 %f604, %f604, %f228;
|
423 |
+
add.f32 %f605, %f605, %f229;
|
424 |
+
add.f32 %f606, %f606, %f230;
|
425 |
+
add.f32 %f607, %f607, %f231;
|
426 |
+
add.f32 %f608, %f608, %f232;
|
427 |
+
add.f32 %f609, %f609, %f233;
|
428 |
+
add.f32 %f610, %f610, %f234;
|
429 |
+
add.f32 %f611, %f611, %f235;
|
430 |
+
add.f32 %f612, %f612, %f236;
|
431 |
+
add.f32 %f613, %f613, %f237;
|
432 |
+
.loc 2 101 30
|
433 |
+
sub.f32 %f238, %f190, %f598;
|
434 |
+
sub.f32 %f239, %f191, %f599;
|
435 |
+
sub.f32 %f240, %f192, %f600;
|
436 |
+
sub.f32 %f241, %f193, %f601;
|
437 |
+
sub.f32 %f242, %f194, %f602;
|
438 |
+
sub.f32 %f243, %f195, %f603;
|
439 |
+
sub.f32 %f244, %f196, %f604;
|
440 |
+
sub.f32 %f245, %f197, %f605;
|
441 |
+
sub.f32 %f246, %f198, %f606;
|
442 |
+
sub.f32 %f247, %f199, %f607;
|
443 |
+
sub.f32 %f248, %f200, %f608;
|
444 |
+
sub.f32 %f249, %f201, %f609;
|
445 |
+
sub.f32 %f250, %f202, %f610;
|
446 |
+
sub.f32 %f251, %f203, %f611;
|
447 |
+
sub.f32 %f252, %f204, %f612;
|
448 |
+
sub.f32 %f253, %f205, %f613;
|
449 |
+
$L__tmp2:
|
450 |
+
.loc 1 47 48
|
451 |
+
fma.rn.f32 %f582, %f206, %f238, %f582;
|
452 |
+
fma.rn.f32 %f583, %f207, %f239, %f583;
|
453 |
+
fma.rn.f32 %f584, %f208, %f240, %f584;
|
454 |
+
fma.rn.f32 %f585, %f209, %f241, %f585;
|
455 |
+
fma.rn.f32 %f586, %f210, %f242, %f586;
|
456 |
+
fma.rn.f32 %f587, %f211, %f243, %f587;
|
457 |
+
fma.rn.f32 %f588, %f212, %f244, %f588;
|
458 |
+
fma.rn.f32 %f589, %f213, %f245, %f589;
|
459 |
+
fma.rn.f32 %f590, %f214, %f246, %f590;
|
460 |
+
fma.rn.f32 %f591, %f215, %f247, %f591;
|
461 |
+
fma.rn.f32 %f592, %f216, %f248, %f592;
|
462 |
+
fma.rn.f32 %f593, %f217, %f249, %f593;
|
463 |
+
fma.rn.f32 %f594, %f218, %f250, %f594;
|
464 |
+
fma.rn.f32 %f595, %f219, %f251, %f595;
|
465 |
+
fma.rn.f32 %f596, %f220, %f252, %f596;
|
466 |
+
fma.rn.f32 %f597, %f221, %f253, %f597;
|
467 |
+
.loc 1 31 36
|
468 |
+
add.s64 %rd105, %rd105, 256;
|
469 |
+
add.s32 %r406, %r406, 64;
|
470 |
+
setp.lt.u32 %p62, %r406, 192;
|
471 |
+
@%p62 bra $L__BB0_1;
|
472 |
+
bra.uni $L__BB0_4;
|
473 |
+
$L__BB0_1:
|
474 |
+
.loc 1 39 40
|
475 |
+
setp.lt.u64 %p41, %rd1, 50257;
|
476 |
+
.loc 1 35 34
|
477 |
+
add.s64 %rd67, %rd6, %rd105;
|
478 |
+
add.s64 %rd68, %rd67, 16;
|
479 |
+
add.s64 %rd69, %rd5, %rd105;
|
480 |
+
.loc 1 35 50
|
481 |
+
add.s64 %rd70, %rd69, 16;
|
482 |
+
mov.b32 %r342, 0;
|
483 |
+
mov.u32 %r32, 0x0;
|
484 |
+
mov.u32 %r33, 0x0;
|
485 |
+
mov.u32 %r34, 0x0;
|
486 |
+
mov.u32 %r35, 0x0;
|
487 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r32, %r33, %r34, %r35 }, [ %rd67 + 0 ];
|
488 |
+
@!%p1 mov.u32 %r32, %r342;
|
489 |
+
@!%p1 mov.u32 %r33, %r342;
|
490 |
+
@!%p1 mov.u32 %r34, %r342;
|
491 |
+
@!%p1 mov.u32 %r35, %r342;
|
492 |
+
mov.b32 %f65, %r32;
|
493 |
+
mov.b32 %f66, %r33;
|
494 |
+
mov.b32 %f67, %r34;
|
495 |
+
mov.b32 %f68, %r35;
|
496 |
+
mov.u32 %r40, 0x0;
|
497 |
+
mov.u32 %r41, 0x0;
|
498 |
+
mov.u32 %r42, 0x0;
|
499 |
+
mov.u32 %r43, 0x0;
|
500 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r40, %r41, %r42, %r43 }, [ %rd68 + 0 ];
|
501 |
+
@!%p1 mov.u32 %r40, %r342;
|
502 |
+
@!%p1 mov.u32 %r41, %r342;
|
503 |
+
@!%p1 mov.u32 %r42, %r342;
|
504 |
+
@!%p1 mov.u32 %r43, %r342;
|
505 |
+
mov.b32 %f69, %r40;
|
506 |
+
mov.b32 %f70, %r41;
|
507 |
+
mov.b32 %f71, %r42;
|
508 |
+
mov.b32 %f72, %r43;
|
509 |
+
mov.u32 %r48, 0x0;
|
510 |
+
mov.u32 %r49, 0x0;
|
511 |
+
mov.u32 %r50, 0x0;
|
512 |
+
mov.u32 %r51, 0x0;
|
513 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r48, %r49, %r50, %r51 }, [ %rd69 + 0 ];
|
514 |
+
@!%p1 mov.u32 %r48, %r342;
|
515 |
+
@!%p1 mov.u32 %r49, %r342;
|
516 |
+
@!%p1 mov.u32 %r50, %r342;
|
517 |
+
@!%p1 mov.u32 %r51, %r342;
|
518 |
+
mov.b32 %f73, %r48;
|
519 |
+
mov.b32 %f74, %r49;
|
520 |
+
mov.b32 %f75, %r50;
|
521 |
+
mov.b32 %f76, %r51;
|
522 |
+
mov.u32 %r56, 0x0;
|
523 |
+
mov.u32 %r57, 0x0;
|
524 |
+
mov.u32 %r58, 0x0;
|
525 |
+
mov.u32 %r59, 0x0;
|
526 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r56, %r57, %r58, %r59 }, [ %rd70 + 0 ];
|
527 |
+
@!%p1 mov.u32 %r56, %r342;
|
528 |
+
@!%p1 mov.u32 %r57, %r342;
|
529 |
+
@!%p1 mov.u32 %r58, %r342;
|
530 |
+
@!%p1 mov.u32 %r59, %r342;
|
531 |
+
mov.b32 %f77, %r56;
|
532 |
+
mov.b32 %f78, %r57;
|
533 |
+
mov.b32 %f79, %r58;
|
534 |
+
mov.b32 %f80, %r59;
|
535 |
+
mov.b32 %r405, 883;
|
536 |
+
mov.u64 %rd104, 1;
|
537 |
+
.loc 1 39 55
|
538 |
+
@%p41 bra $L__BB0_3;
|
539 |
+
mov.u64 %rd71, assertMessage_0;
|
540 |
+
cvta.global.u64 %rd72, %rd71;
|
541 |
+
mov.u64 %rd73, assertFile_0;
|
542 |
+
cvta.global.u64 %rd74, %rd73;
|
543 |
+
mov.u64 %rd75, assertFunc_0;
|
544 |
+
cvta.global.u64 %rd76, %rd75;
|
545 |
+
{ // callseq 6, 0
|
546 |
+
.reg .b32 temp_param_reg;
|
547 |
+
.param .b64 param0;
|
548 |
+
st.param.b64 [param0+0], %rd72;
|
549 |
+
.param .b64 param1;
|
550 |
+
st.param.b64 [param1+0], %rd74;
|
551 |
+
.param .b32 param2;
|
552 |
+
st.param.b32 [param2+0], %r405;
|
553 |
+
.param .b64 param3;
|
554 |
+
st.param.b64 [param3+0], %rd76;
|
555 |
+
.param .b64 param4;
|
556 |
+
st.param.b64 [param4+0], %rd104;
|
557 |
+
call.uni
|
558 |
+
__assertfail,
|
559 |
+
(
|
560 |
+
param0,
|
561 |
+
param1,
|
562 |
+
param2,
|
563 |
+
param3,
|
564 |
+
param4
|
565 |
+
);
|
566 |
+
} // callseq 6
|
567 |
+
bra.uni $L__BB0_3;
|
568 |
+
$L__BB0_4:
|
569 |
+
.loc 1 31 36
|
570 |
+
and.b32 %r254, %r4, 3;
|
571 |
+
mad.lo.s32 %r255, %r254, 72, %r2;
|
572 |
+
shl.b32 %r256, %r255, 2;
|
573 |
+
mov.u32 %r257, global_smem;
|
574 |
+
add.s32 %r258, %r257, %r256;
|
575 |
+
st.shared.f32 [%r258], %f566;
|
576 |
+
st.shared.f32 [%r258+1152], %f567;
|
577 |
+
st.shared.f32 [%r258+2304], %f568;
|
578 |
+
st.shared.f32 [%r258+3456], %f569;
|
579 |
+
st.shared.f32 [%r258+4608], %f570;
|
580 |
+
st.shared.f32 [%r258+5760], %f571;
|
581 |
+
st.shared.f32 [%r258+6912], %f572;
|
582 |
+
st.shared.f32 [%r258+8064], %f573;
|
583 |
+
bar.sync 0;
|
584 |
+
mad.lo.s32 %r259, %r1, 72, %r3;
|
585 |
+
shl.b32 %r260, %r259, 2;
|
586 |
+
add.s32 %r261, %r257, %r260;
|
587 |
+
ld.shared.v4.f32 {%f254, %f255, %f256, %f257}, [%r261];
|
588 |
+
ld.shared.v4.f32 {%f258, %f259, %f260, %f261}, [%r261+16];
|
589 |
+
bar.sync 0;
|
590 |
+
st.shared.f32 [%r258], %f574;
|
591 |
+
st.shared.f32 [%r258+1152], %f575;
|
592 |
+
st.shared.f32 [%r258+2304], %f576;
|
593 |
+
st.shared.f32 [%r258+3456], %f577;
|
594 |
+
st.shared.f32 [%r258+4608], %f578;
|
595 |
+
st.shared.f32 [%r258+5760], %f579;
|
596 |
+
st.shared.f32 [%r258+6912], %f580;
|
597 |
+
st.shared.f32 [%r258+8064], %f581;
|
598 |
+
bar.sync 0;
|
599 |
+
ld.shared.v4.f32 {%f262, %f263, %f264, %f265}, [%r261];
|
600 |
+
ld.shared.v4.f32 {%f266, %f267, %f268, %f269}, [%r261+16];
|
601 |
+
$L__tmp3:
|
602 |
+
.loc 2 108 21
|
603 |
+
sub.f32 %f270, %f599, %f598;
|
604 |
+
.loc 2 109 28
|
605 |
+
add.f32 %f271, %f254, %f255;
|
606 |
+
.loc 2 110 39
|
607 |
+
setp.eq.f32 %p63, %f271, 0f00000000;
|
608 |
+
.loc 2 110 60
|
609 |
+
mov.b32 %r146, %f255;
|
610 |
+
mov.b32 %r147, %f271;
|
611 |
+
div.full.f32 %r145, %r146, %r147;
|
612 |
+
mov.b32 %f272, %r145;
|
613 |
+
.loc 2 110 49
|
614 |
+
selp.f32 %f273, 0f00000000, %f272, %p63;
|
615 |
+
.loc 2 112 17
|
616 |
+
fma.rn.f32 %f274, %f270, %f273, %f598;
|
617 |
+
.loc 2 113 15
|
618 |
+
add.f32 %f275, %f582, %f583;
|
619 |
+
.loc 2 113 30
|
620 |
+
mul.f32 %f276, %f270, %f270;
|
621 |
+
.loc 2 113 38
|
622 |
+
mul.f32 %f277, %f276, %f254;
|
623 |
+
.loc 2 113 22
|
624 |
+
fma.rn.f32 %f278, %f277, %f273, %f275;
|
625 |
+
.loc 2 108 21
|
626 |
+
sub.f32 %f279, %f600, %f274;
|
627 |
+
.loc 2 109 28
|
628 |
+
add.f32 %f280, %f256, %f271;
|
629 |
+
.loc 2 110 39
|
630 |
+
setp.eq.f32 %p64, %f280, 0f00000000;
|
631 |
+
.loc 2 110 60
|
632 |
+
mov.b32 %r150, %f280;
|
633 |
+
mov.b32 %r149, %f256;
|
634 |
+
div.full.f32 %r148, %r149, %r150;
|
635 |
+
mov.b32 %f281, %r148;
|
636 |
+
.loc 2 110 49
|
637 |
+
selp.f32 %f282, 0f00000000, %f281, %p64;
|
638 |
+
.loc 2 112 17
|
639 |
+
fma.rn.f32 %f283, %f282, %f279, %f274;
|
640 |
+
.loc 2 113 15
|
641 |
+
add.f32 %f284, %f584, %f278;
|
642 |
+
.loc 2 113 30
|
643 |
+
mul.f32 %f285, %f279, %f279;
|
644 |
+
.loc 2 113 38
|
645 |
+
mul.f32 %f286, %f271, %f285;
|
646 |
+
.loc 2 113 22
|
647 |
+
fma.rn.f32 %f287, %f282, %f286, %f284;
|
648 |
+
.loc 2 108 21
|
649 |
+
sub.f32 %f288, %f601, %f283;
|
650 |
+
.loc 2 109 28
|
651 |
+
add.f32 %f289, %f257, %f280;
|
652 |
+
.loc 2 110 39
|
653 |
+
setp.eq.f32 %p65, %f289, 0f00000000;
|
654 |
+
.loc 2 110 60
|
655 |
+
mov.b32 %r153, %f289;
|
656 |
+
mov.b32 %r152, %f257;
|
657 |
+
div.full.f32 %r151, %r152, %r153;
|
658 |
+
mov.b32 %f290, %r151;
|
659 |
+
.loc 2 110 49
|
660 |
+
selp.f32 %f291, 0f00000000, %f290, %p65;
|
661 |
+
.loc 2 112 17
|
662 |
+
fma.rn.f32 %f292, %f291, %f288, %f283;
|
663 |
+
.loc 2 113 15
|
664 |
+
add.f32 %f293, %f585, %f287;
|
665 |
+
.loc 2 113 30
|
666 |
+
mul.f32 %f294, %f288, %f288;
|
667 |
+
.loc 2 113 38
|
668 |
+
mul.f32 %f295, %f280, %f294;
|
669 |
+
.loc 2 113 22
|
670 |
+
fma.rn.f32 %f296, %f291, %f295, %f293;
|
671 |
+
.loc 2 108 21
|
672 |
+
sub.f32 %f297, %f602, %f292;
|
673 |
+
.loc 2 109 28
|
674 |
+
add.f32 %f298, %f258, %f289;
|
675 |
+
.loc 2 110 39
|
676 |
+
setp.eq.f32 %p66, %f298, 0f00000000;
|
677 |
+
.loc 2 110 60
|
678 |
+
mov.b32 %r156, %f298;
|
679 |
+
mov.b32 %r155, %f258;
|
680 |
+
div.full.f32 %r154, %r155, %r156;
|
681 |
+
mov.b32 %f299, %r154;
|
682 |
+
.loc 2 110 49
|
683 |
+
selp.f32 %f300, 0f00000000, %f299, %p66;
|
684 |
+
.loc 2 112 17
|
685 |
+
fma.rn.f32 %f301, %f300, %f297, %f292;
|
686 |
+
.loc 2 113 15
|
687 |
+
add.f32 %f302, %f586, %f296;
|
688 |
+
.loc 2 113 30
|
689 |
+
mul.f32 %f303, %f297, %f297;
|
690 |
+
.loc 2 113 38
|
691 |
+
mul.f32 %f304, %f289, %f303;
|
692 |
+
.loc 2 113 22
|
693 |
+
fma.rn.f32 %f305, %f300, %f304, %f302;
|
694 |
+
.loc 2 108 21
|
695 |
+
sub.f32 %f306, %f603, %f301;
|
696 |
+
.loc 2 109 28
|
697 |
+
add.f32 %f307, %f259, %f298;
|
698 |
+
.loc 2 110 39
|
699 |
+
setp.eq.f32 %p67, %f307, 0f00000000;
|
700 |
+
.loc 2 110 60
|
701 |
+
mov.b32 %r159, %f307;
|
702 |
+
mov.b32 %r158, %f259;
|
703 |
+
div.full.f32 %r157, %r158, %r159;
|
704 |
+
mov.b32 %f308, %r157;
|
705 |
+
.loc 2 110 49
|
706 |
+
selp.f32 %f309, 0f00000000, %f308, %p67;
|
707 |
+
.loc 2 112 17
|
708 |
+
fma.rn.f32 %f310, %f309, %f306, %f301;
|
709 |
+
.loc 2 113 15
|
710 |
+
add.f32 %f311, %f587, %f305;
|
711 |
+
.loc 2 113 30
|
712 |
+
mul.f32 %f312, %f306, %f306;
|
713 |
+
.loc 2 113 38
|
714 |
+
mul.f32 %f313, %f298, %f312;
|
715 |
+
.loc 2 113 22
|
716 |
+
fma.rn.f32 %f314, %f309, %f313, %f311;
|
717 |
+
.loc 2 108 21
|
718 |
+
sub.f32 %f315, %f604, %f310;
|
719 |
+
.loc 2 109 28
|
720 |
+
add.f32 %f316, %f260, %f307;
|
721 |
+
.loc 2 110 39
|
722 |
+
setp.eq.f32 %p68, %f316, 0f00000000;
|
723 |
+
.loc 2 110 60
|
724 |
+
mov.b32 %r162, %f316;
|
725 |
+
mov.b32 %r161, %f260;
|
726 |
+
div.full.f32 %r160, %r161, %r162;
|
727 |
+
mov.b32 %f317, %r160;
|
728 |
+
.loc 2 110 49
|
729 |
+
selp.f32 %f318, 0f00000000, %f317, %p68;
|
730 |
+
.loc 2 112 17
|
731 |
+
fma.rn.f32 %f319, %f318, %f315, %f310;
|
732 |
+
.loc 2 113 15
|
733 |
+
add.f32 %f320, %f588, %f314;
|
734 |
+
.loc 2 113 30
|
735 |
+
mul.f32 %f321, %f315, %f315;
|
736 |
+
.loc 2 113 38
|
737 |
+
mul.f32 %f322, %f307, %f321;
|
738 |
+
.loc 2 113 22
|
739 |
+
fma.rn.f32 %f323, %f318, %f322, %f320;
|
740 |
+
.loc 2 108 21
|
741 |
+
sub.f32 %f324, %f605, %f319;
|
742 |
+
.loc 2 109 28
|
743 |
+
add.f32 %f325, %f261, %f316;
|
744 |
+
.loc 2 110 39
|
745 |
+
setp.eq.f32 %p69, %f325, 0f00000000;
|
746 |
+
.loc 2 110 60
|
747 |
+
mov.b32 %r165, %f325;
|
748 |
+
mov.b32 %r164, %f261;
|
749 |
+
div.full.f32 %r163, %r164, %r165;
|
750 |
+
mov.b32 %f326, %r163;
|
751 |
+
.loc 2 110 49
|
752 |
+
selp.f32 %f327, 0f00000000, %f326, %p69;
|
753 |
+
.loc 2 112 17
|
754 |
+
fma.rn.f32 %f328, %f327, %f324, %f319;
|
755 |
+
.loc 2 113 15
|
756 |
+
add.f32 %f329, %f589, %f323;
|
757 |
+
.loc 2 113 30
|
758 |
+
mul.f32 %f330, %f324, %f324;
|
759 |
+
.loc 2 113 38
|
760 |
+
mul.f32 %f331, %f316, %f330;
|
761 |
+
.loc 2 113 22
|
762 |
+
fma.rn.f32 %f332, %f327, %f331, %f329;
|
763 |
+
.loc 2 108 21
|
764 |
+
sub.f32 %f333, %f607, %f606;
|
765 |
+
.loc 2 109 28
|
766 |
+
add.f32 %f334, %f262, %f263;
|
767 |
+
.loc 2 110 39
|
768 |
+
setp.eq.f32 %p70, %f334, 0f00000000;
|
769 |
+
.loc 2 110 60
|
770 |
+
mov.b32 %r167, %f263;
|
771 |
+
mov.b32 %r168, %f334;
|
772 |
+
div.full.f32 %r166, %r167, %r168;
|
773 |
+
mov.b32 %f335, %r166;
|
774 |
+
.loc 2 110 49
|
775 |
+
selp.f32 %f336, 0f00000000, %f335, %p70;
|
776 |
+
.loc 2 112 17
|
777 |
+
fma.rn.f32 %f337, %f333, %f336, %f606;
|
778 |
+
.loc 2 113 15
|
779 |
+
add.f32 %f338, %f590, %f591;
|
780 |
+
.loc 2 113 30
|
781 |
+
mul.f32 %f339, %f333, %f333;
|
782 |
+
.loc 2 113 38
|
783 |
+
mul.f32 %f340, %f339, %f262;
|
784 |
+
.loc 2 113 22
|
785 |
+
fma.rn.f32 %f341, %f340, %f336, %f338;
|
786 |
+
.loc 2 108 21
|
787 |
+
sub.f32 %f342, %f608, %f337;
|
788 |
+
.loc 2 109 28
|
789 |
+
add.f32 %f343, %f264, %f334;
|
790 |
+
.loc 2 110 39
|
791 |
+
setp.eq.f32 %p71, %f343, 0f00000000;
|
792 |
+
.loc 2 110 60
|
793 |
+
mov.b32 %r171, %f343;
|
794 |
+
mov.b32 %r170, %f264;
|
795 |
+
div.full.f32 %r169, %r170, %r171;
|
796 |
+
mov.b32 %f344, %r169;
|
797 |
+
.loc 2 110 49
|
798 |
+
selp.f32 %f345, 0f00000000, %f344, %p71;
|
799 |
+
.loc 2 112 17
|
800 |
+
fma.rn.f32 %f346, %f345, %f342, %f337;
|
801 |
+
.loc 2 113 15
|
802 |
+
add.f32 %f347, %f592, %f341;
|
803 |
+
.loc 2 113 30
|
804 |
+
mul.f32 %f348, %f342, %f342;
|
805 |
+
.loc 2 113 38
|
806 |
+
mul.f32 %f349, %f334, %f348;
|
807 |
+
.loc 2 113 22
|
808 |
+
fma.rn.f32 %f350, %f345, %f349, %f347;
|
809 |
+
.loc 2 108 21
|
810 |
+
sub.f32 %f351, %f609, %f346;
|
811 |
+
.loc 2 109 28
|
812 |
+
add.f32 %f352, %f265, %f343;
|
813 |
+
.loc 2 110 39
|
814 |
+
setp.eq.f32 %p72, %f352, 0f00000000;
|
815 |
+
.loc 2 110 60
|
816 |
+
mov.b32 %r174, %f352;
|
817 |
+
mov.b32 %r173, %f265;
|
818 |
+
div.full.f32 %r172, %r173, %r174;
|
819 |
+
mov.b32 %f353, %r172;
|
820 |
+
.loc 2 110 49
|
821 |
+
selp.f32 %f354, 0f00000000, %f353, %p72;
|
822 |
+
.loc 2 112 17
|
823 |
+
fma.rn.f32 %f355, %f354, %f351, %f346;
|
824 |
+
.loc 2 113 15
|
825 |
+
add.f32 %f356, %f593, %f350;
|
826 |
+
.loc 2 113 30
|
827 |
+
mul.f32 %f357, %f351, %f351;
|
828 |
+
.loc 2 113 38
|
829 |
+
mul.f32 %f358, %f343, %f357;
|
830 |
+
.loc 2 113 22
|
831 |
+
fma.rn.f32 %f359, %f354, %f358, %f356;
|
832 |
+
.loc 2 108 21
|
833 |
+
sub.f32 %f360, %f610, %f355;
|
834 |
+
.loc 2 109 28
|
835 |
+
add.f32 %f361, %f266, %f352;
|
836 |
+
.loc 2 110 39
|
837 |
+
setp.eq.f32 %p73, %f361, 0f00000000;
|
838 |
+
.loc 2 110 60
|
839 |
+
mov.b32 %r177, %f361;
|
840 |
+
mov.b32 %r176, %f266;
|
841 |
+
div.full.f32 %r175, %r176, %r177;
|
842 |
+
mov.b32 %f362, %r175;
|
843 |
+
.loc 2 110 49
|
844 |
+
selp.f32 %f363, 0f00000000, %f362, %p73;
|
845 |
+
.loc 2 112 17
|
846 |
+
fma.rn.f32 %f364, %f363, %f360, %f355;
|
847 |
+
.loc 2 113 15
|
848 |
+
add.f32 %f365, %f594, %f359;
|
849 |
+
.loc 2 113 30
|
850 |
+
mul.f32 %f366, %f360, %f360;
|
851 |
+
.loc 2 113 38
|
852 |
+
mul.f32 %f367, %f352, %f366;
|
853 |
+
.loc 2 113 22
|
854 |
+
fma.rn.f32 %f368, %f363, %f367, %f365;
|
855 |
+
.loc 2 108 21
|
856 |
+
sub.f32 %f369, %f611, %f364;
|
857 |
+
.loc 2 109 28
|
858 |
+
add.f32 %f370, %f267, %f361;
|
859 |
+
.loc 2 110 39
|
860 |
+
setp.eq.f32 %p74, %f370, 0f00000000;
|
861 |
+
.loc 2 110 60
|
862 |
+
mov.b32 %r180, %f370;
|
863 |
+
mov.b32 %r179, %f267;
|
864 |
+
div.full.f32 %r178, %r179, %r180;
|
865 |
+
mov.b32 %f371, %r178;
|
866 |
+
.loc 2 110 49
|
867 |
+
selp.f32 %f372, 0f00000000, %f371, %p74;
|
868 |
+
.loc 2 112 17
|
869 |
+
fma.rn.f32 %f373, %f372, %f369, %f364;
|
870 |
+
.loc 2 113 15
|
871 |
+
add.f32 %f374, %f595, %f368;
|
872 |
+
.loc 2 113 30
|
873 |
+
mul.f32 %f375, %f369, %f369;
|
874 |
+
.loc 2 113 38
|
875 |
+
mul.f32 %f376, %f361, %f375;
|
876 |
+
.loc 2 113 22
|
877 |
+
fma.rn.f32 %f377, %f372, %f376, %f374;
|
878 |
+
.loc 2 108 21
|
879 |
+
sub.f32 %f378, %f612, %f373;
|
880 |
+
.loc 2 109 28
|
881 |
+
add.f32 %f379, %f268, %f370;
|
882 |
+
.loc 2 110 39
|
883 |
+
setp.eq.f32 %p75, %f379, 0f00000000;
|
884 |
+
.loc 2 110 60
|
885 |
+
mov.b32 %r183, %f379;
|
886 |
+
mov.b32 %r182, %f268;
|
887 |
+
div.full.f32 %r181, %r182, %r183;
|
888 |
+
mov.b32 %f380, %r181;
|
889 |
+
.loc 2 110 49
|
890 |
+
selp.f32 %f381, 0f00000000, %f380, %p75;
|
891 |
+
.loc 2 112 17
|
892 |
+
fma.rn.f32 %f382, %f381, %f378, %f373;
|
893 |
+
.loc 2 113 15
|
894 |
+
add.f32 %f383, %f596, %f377;
|
895 |
+
.loc 2 113 30
|
896 |
+
mul.f32 %f384, %f378, %f378;
|
897 |
+
.loc 2 113 38
|
898 |
+
mul.f32 %f385, %f370, %f384;
|
899 |
+
.loc 2 113 22
|
900 |
+
fma.rn.f32 %f386, %f381, %f385, %f383;
|
901 |
+
.loc 2 108 21
|
902 |
+
sub.f32 %f387, %f613, %f382;
|
903 |
+
.loc 2 109 28
|
904 |
+
add.f32 %f388, %f269, %f379;
|
905 |
+
.loc 2 110 39
|
906 |
+
setp.eq.f32 %p76, %f388, 0f00000000;
|
907 |
+
.loc 2 110 60
|
908 |
+
mov.b32 %r186, %f388;
|
909 |
+
mov.b32 %r185, %f269;
|
910 |
+
div.full.f32 %r184, %r185, %r186;
|
911 |
+
mov.b32 %f389, %r184;
|
912 |
+
.loc 2 110 49
|
913 |
+
selp.f32 %f390, 0f00000000, %f389, %p76;
|
914 |
+
.loc 2 112 17
|
915 |
+
fma.rn.f32 %f391, %f390, %f387, %f382;
|
916 |
+
.loc 2 113 15
|
917 |
+
add.f32 %f392, %f597, %f386;
|
918 |
+
.loc 2 113 30
|
919 |
+
mul.f32 %f393, %f387, %f387;
|
920 |
+
.loc 2 113 38
|
921 |
+
mul.f32 %f394, %f379, %f393;
|
922 |
+
.loc 2 113 22
|
923 |
+
fma.rn.f32 %f395, %f390, %f394, %f392;
|
924 |
+
$L__tmp4:
|
925 |
+
.loc 2 120 46
|
926 |
+
mov.b32 %r262, %f328;
|
927 |
+
shfl.sync.bfly.b32 %r263, %r262, 4, 31, -1;
|
928 |
+
mov.b32 %f396, %r263;
|
929 |
+
mov.b32 %r264, %f332;
|
930 |
+
shfl.sync.bfly.b32 %r265, %r264, 4, 31, -1;
|
931 |
+
mov.b32 %f397, %r265;
|
932 |
+
shfl.sync.bfly.b32 %r188, %r165, 4, 31, -1;
|
933 |
+
mov.b32 %f398, %r188;
|
934 |
+
$L__tmp5:
|
935 |
+
.loc 2 108 21
|
936 |
+
sub.f32 %f399, %f396, %f328;
|
937 |
+
.loc 2 109 28
|
938 |
+
add.f32 %f400, %f325, %f398;
|
939 |
+
.loc 2 110 39
|
940 |
+
setp.eq.f32 %p77, %f400, 0f00000000;
|
941 |
+
.loc 2 110 60
|
942 |
+
mov.b32 %r189, %f400;
|
943 |
+
div.full.f32 %r187, %r188, %r189;
|
944 |
+
mov.b32 %f401, %r187;
|
945 |
+
.loc 2 110 49
|
946 |
+
selp.f32 %f402, 0f00000000, %f401, %p77;
|
947 |
+
.loc 2 112 17
|
948 |
+
fma.rn.f32 %f403, %f402, %f399, %f328;
|
949 |
+
.loc 2 113 15
|
950 |
+
add.f32 %f404, %f332, %f397;
|
951 |
+
.loc 2 113 30
|
952 |
+
mul.f32 %f405, %f399, %f399;
|
953 |
+
.loc 2 113 38
|
954 |
+
mul.f32 %f406, %f325, %f405;
|
955 |
+
.loc 2 113 22
|
956 |
+
fma.rn.f32 %f407, %f402, %f406, %f404;
|
957 |
+
$L__tmp6:
|
958 |
+
.loc 2 120 46
|
959 |
+
mov.b32 %r266, %f403;
|
960 |
+
shfl.sync.bfly.b32 %r267, %r266, 2, 31, -1;
|
961 |
+
mov.b32 %f408, %r267;
|
962 |
+
mov.b32 %r268, %f407;
|
963 |
+
shfl.sync.bfly.b32 %r269, %r268, 2, 31, -1;
|
964 |
+
mov.b32 %f409, %r269;
|
965 |
+
shfl.sync.bfly.b32 %r191, %r189, 2, 31, -1;
|
966 |
+
mov.b32 %f410, %r191;
|
967 |
+
$L__tmp7:
|
968 |
+
.loc 2 108 21
|
969 |
+
sub.f32 %f411, %f408, %f403;
|
970 |
+
.loc 2 109 28
|
971 |
+
add.f32 %f412, %f400, %f410;
|
972 |
+
.loc 2 110 39
|
973 |
+
setp.eq.f32 %p78, %f412, 0f00000000;
|
974 |
+
.loc 2 110 60
|
975 |
+
mov.b32 %r192, %f412;
|
976 |
+
div.full.f32 %r190, %r191, %r192;
|
977 |
+
mov.b32 %f413, %r190;
|
978 |
+
.loc 2 110 49
|
979 |
+
selp.f32 %f414, 0f00000000, %f413, %p78;
|
980 |
+
.loc 2 112 17
|
981 |
+
fma.rn.f32 %f415, %f414, %f411, %f403;
|
982 |
+
.loc 2 113 15
|
983 |
+
add.f32 %f416, %f407, %f409;
|
984 |
+
.loc 2 113 30
|
985 |
+
mul.f32 %f417, %f411, %f411;
|
986 |
+
.loc 2 113 38
|
987 |
+
mul.f32 %f418, %f400, %f417;
|
988 |
+
.loc 2 113 22
|
989 |
+
fma.rn.f32 %f419, %f414, %f418, %f416;
|
990 |
+
$L__tmp8:
|
991 |
+
.loc 2 120 46
|
992 |
+
mov.b32 %r270, %f415;
|
993 |
+
shfl.sync.bfly.b32 %r271, %r270, 1, 31, -1;
|
994 |
+
mov.b32 %f420, %r271;
|
995 |
+
mov.b32 %r272, %f419;
|
996 |
+
shfl.sync.bfly.b32 %r273, %r272, 1, 31, -1;
|
997 |
+
mov.b32 %f421, %r273;
|
998 |
+
shfl.sync.bfly.b32 %r194, %r192, 1, 31, -1;
|
999 |
+
mov.b32 %f422, %r194;
|
1000 |
+
$L__tmp9:
|
1001 |
+
.loc 2 108 21
|
1002 |
+
sub.f32 %f423, %f420, %f415;
|
1003 |
+
.loc 2 109 28
|
1004 |
+
add.f32 %f424, %f412, %f422;
|
1005 |
+
.loc 2 110 39
|
1006 |
+
setp.eq.f32 %p79, %f424, 0f00000000;
|
1007 |
+
.loc 2 110 60
|
1008 |
+
mov.b32 %r195, %f424;
|
1009 |
+
div.full.f32 %r193, %r194, %r195;
|
1010 |
+
mov.b32 %f425, %r193;
|
1011 |
+
.loc 2 110 49
|
1012 |
+
selp.f32 %f426, 0f00000000, %f425, %p79;
|
1013 |
+
.loc 2 112 17
|
1014 |
+
fma.rn.f32 %f145, %f423, %f426, %f415;
|
1015 |
+
.loc 2 113 15
|
1016 |
+
add.f32 %f427, %f419, %f421;
|
1017 |
+
.loc 2 113 30
|
1018 |
+
mul.f32 %f428, %f423, %f423;
|
1019 |
+
.loc 2 113 38
|
1020 |
+
mul.f32 %f429, %f412, %f428;
|
1021 |
+
.loc 2 113 22
|
1022 |
+
fma.rn.f32 %f430, %f426, %f429, %f427;
|
1023 |
+
$L__tmp10:
|
1024 |
+
.loc 2 120 46
|
1025 |
+
mov.b32 %r274, %f391;
|
1026 |
+
shfl.sync.bfly.b32 %r275, %r274, 4, 31, -1;
|
1027 |
+
mov.b32 %f431, %r275;
|
1028 |
+
mov.b32 %r276, %f395;
|
1029 |
+
shfl.sync.bfly.b32 %r277, %r276, 4, 31, -1;
|
1030 |
+
mov.b32 %f432, %r277;
|
1031 |
+
shfl.sync.bfly.b32 %r197, %r186, 4, 31, -1;
|
1032 |
+
mov.b32 %f433, %r197;
|
1033 |
+
$L__tmp11:
|
1034 |
+
.loc 2 108 21
|
1035 |
+
sub.f32 %f434, %f431, %f391;
|
1036 |
+
.loc 2 109 28
|
1037 |
+
add.f32 %f435, %f388, %f433;
|
1038 |
+
.loc 2 110 39
|
1039 |
+
setp.eq.f32 %p80, %f435, 0f00000000;
|
1040 |
+
.loc 2 110 60
|
1041 |
+
mov.b32 %r198, %f435;
|
1042 |
+
div.full.f32 %r196, %r197, %r198;
|
1043 |
+
mov.b32 %f436, %r196;
|
1044 |
+
.loc 2 110 49
|
1045 |
+
selp.f32 %f437, 0f00000000, %f436, %p80;
|
1046 |
+
.loc 2 112 17
|
1047 |
+
fma.rn.f32 %f438, %f434, %f437, %f391;
|
1048 |
+
.loc 2 113 15
|
1049 |
+
add.f32 %f439, %f395, %f432;
|
1050 |
+
.loc 2 113 30
|
1051 |
+
mul.f32 %f440, %f434, %f434;
|
1052 |
+
.loc 2 113 38
|
1053 |
+
mul.f32 %f441, %f388, %f440;
|
1054 |
+
.loc 2 113 22
|
1055 |
+
fma.rn.f32 %f442, %f441, %f437, %f439;
|
1056 |
+
$L__tmp12:
|
1057 |
+
.loc 2 120 46
|
1058 |
+
mov.b32 %r278, %f438;
|
1059 |
+
shfl.sync.bfly.b32 %r279, %r278, 2, 31, -1;
|
1060 |
+
mov.b32 %f443, %r279;
|
1061 |
+
mov.b32 %r280, %f442;
|
1062 |
+
shfl.sync.bfly.b32 %r281, %r280, 2, 31, -1;
|
1063 |
+
mov.b32 %f444, %r281;
|
1064 |
+
shfl.sync.bfly.b32 %r200, %r198, 2, 31, -1;
|
1065 |
+
mov.b32 %f445, %r200;
|
1066 |
+
$L__tmp13:
|
1067 |
+
.loc 2 108 21
|
1068 |
+
sub.f32 %f446, %f443, %f438;
|
1069 |
+
.loc 2 109 28
|
1070 |
+
add.f32 %f447, %f435, %f445;
|
1071 |
+
.loc 2 110 39
|
1072 |
+
setp.eq.f32 %p81, %f447, 0f00000000;
|
1073 |
+
.loc 2 110 60
|
1074 |
+
mov.b32 %r201, %f447;
|
1075 |
+
div.full.f32 %r199, %r200, %r201;
|
1076 |
+
mov.b32 %f448, %r199;
|
1077 |
+
.loc 2 110 49
|
1078 |
+
selp.f32 %f449, 0f00000000, %f448, %p81;
|
1079 |
+
.loc 2 112 17
|
1080 |
+
fma.rn.f32 %f450, %f446, %f449, %f438;
|
1081 |
+
.loc 2 113 15
|
1082 |
+
add.f32 %f451, %f442, %f444;
|
1083 |
+
.loc 2 113 30
|
1084 |
+
mul.f32 %f452, %f446, %f446;
|
1085 |
+
.loc 2 113 38
|
1086 |
+
mul.f32 %f453, %f435, %f452;
|
1087 |
+
.loc 2 113 22
|
1088 |
+
fma.rn.f32 %f454, %f449, %f453, %f451;
|
1089 |
+
$L__tmp14:
|
1090 |
+
.loc 2 120 46
|
1091 |
+
mov.b32 %r282, %f450;
|
1092 |
+
shfl.sync.bfly.b32 %r283, %r282, 1, 31, -1;
|
1093 |
+
mov.b32 %f455, %r283;
|
1094 |
+
mov.b32 %r284, %f454;
|
1095 |
+
shfl.sync.bfly.b32 %r285, %r284, 1, 31, -1;
|
1096 |
+
mov.b32 %f456, %r285;
|
1097 |
+
shfl.sync.bfly.b32 %r203, %r201, 1, 31, -1;
|
1098 |
+
mov.b32 %f457, %r203;
|
1099 |
+
$L__tmp15:
|
1100 |
+
.loc 2 108 21
|
1101 |
+
sub.f32 %f458, %f455, %f450;
|
1102 |
+
.loc 2 109 28
|
1103 |
+
add.f32 %f459, %f447, %f457;
|
1104 |
+
.loc 2 110 39
|
1105 |
+
setp.eq.f32 %p82, %f459, 0f00000000;
|
1106 |
+
.loc 2 110 60
|
1107 |
+
mov.b32 %r204, %f459;
|
1108 |
+
div.full.f32 %r202, %r203, %r204;
|
1109 |
+
mov.b32 %f460, %r202;
|
1110 |
+
.loc 2 110 49
|
1111 |
+
selp.f32 %f461, 0f00000000, %f460, %p82;
|
1112 |
+
.loc 2 112 17
|
1113 |
+
fma.rn.f32 %f146, %f458, %f461, %f450;
|
1114 |
+
.loc 2 113 15
|
1115 |
+
add.f32 %f462, %f454, %f456;
|
1116 |
+
.loc 2 113 30
|
1117 |
+
mul.f32 %f463, %f458, %f458;
|
1118 |
+
.loc 2 113 38
|
1119 |
+
mul.f32 %f464, %f447, %f463;
|
1120 |
+
.loc 2 113 22
|
1121 |
+
fma.rn.f32 %f465, %f461, %f464, %f462;
|
1122 |
+
$L__tmp16:
|
1123 |
+
.loc 1 69 23
|
1124 |
+
mov.b32 %r206, %f430;
|
1125 |
+
mov.b32 %r207, 1132462080;
|
1126 |
+
div.full.f32 %r205, %r206, %r207;
|
1127 |
+
mov.b32 %f466, %r205;
|
1128 |
+
mov.b32 %r230, %f465;
|
1129 |
+
div.full.f32 %r229, %r230, %r207;
|
1130 |
+
mov.b32 %f467, %r229;
|
1131 |
+
.loc 1 71 24
|
1132 |
+
add.f32 %f147, %f466, 0f3727C5AC;
|
1133 |
+
add.f32 %f148, %f467, 0f3727C5AC;
|
1134 |
+
.loc 1 55 36
|
1135 |
+
add.s64 %rd9, %rd12, %rd2;
|
1136 |
+
shl.b32 %r286, %r11, 14;
|
1137 |
+
shl.b32 %r287, %r1, 8;
|
1138 |
+
or.b32 %r288, %r286, %r287;
|
1139 |
+
or.b32 %r8, %r288, %r3;
|
1140 |
+
mov.u64 %rd106, 0;
|
1141 |
+
mov.b32 %r407, -64;
|
1142 |
+
rsqrt.approx.ftz.f32 %f516, %f147;
|
1143 |
+
rsqrt.approx.ftz.f32 %f517, %f148;
|
1144 |
+
bra.uni $L__BB0_5;
|
1145 |
+
$L__BB0_7:
|
1146 |
+
.loc 1 65 35
|
1147 |
+
add.s64 %rd96, %rd4, %rd106;
|
1148 |
+
add.s64 %rd97, %rd96, 16;
|
1149 |
+
add.s64 %rd98, %rd3, %rd106;
|
1150 |
+
.loc 1 65 54
|
1151 |
+
add.s64 %rd99, %rd98, 16;
|
1152 |
+
mov.u32 %r338, 0x0;
|
1153 |
+
mov.u32 %r339, 0x0;
|
1154 |
+
mov.u32 %r340, 0x0;
|
1155 |
+
mov.u32 %r341, 0x0;
|
1156 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r338, %r339, %r340, %r341 }, [ %rd96 + 0 ];
|
1157 |
+
@!%p1 mov.u32 %r338, %r342;
|
1158 |
+
@!%p1 mov.u32 %r339, %r342;
|
1159 |
+
@!%p1 mov.u32 %r340, %r342;
|
1160 |
+
@!%p1 mov.u32 %r341, %r342;
|
1161 |
+
mov.b32 %f468, %r338;
|
1162 |
+
mov.b32 %f469, %r339;
|
1163 |
+
mov.b32 %f470, %r340;
|
1164 |
+
mov.b32 %f471, %r341;
|
1165 |
+
mov.u32 %r346, 0x0;
|
1166 |
+
mov.u32 %r347, 0x0;
|
1167 |
+
mov.u32 %r348, 0x0;
|
1168 |
+
mov.u32 %r349, 0x0;
|
1169 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r346, %r347, %r348, %r349 }, [ %rd97 + 0 ];
|
1170 |
+
@!%p1 mov.u32 %r346, %r342;
|
1171 |
+
@!%p1 mov.u32 %r347, %r342;
|
1172 |
+
@!%p1 mov.u32 %r348, %r342;
|
1173 |
+
@!%p1 mov.u32 %r349, %r342;
|
1174 |
+
mov.b32 %f472, %r346;
|
1175 |
+
mov.b32 %f473, %r347;
|
1176 |
+
mov.b32 %f474, %r348;
|
1177 |
+
mov.b32 %f475, %r349;
|
1178 |
+
mov.u32 %r354, 0x0;
|
1179 |
+
mov.u32 %r355, 0x0;
|
1180 |
+
mov.u32 %r356, 0x0;
|
1181 |
+
mov.u32 %r357, 0x0;
|
1182 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r354, %r355, %r356, %r357 }, [ %rd98 + 0 ];
|
1183 |
+
@!%p1 mov.u32 %r354, %r342;
|
1184 |
+
@!%p1 mov.u32 %r355, %r342;
|
1185 |
+
@!%p1 mov.u32 %r356, %r342;
|
1186 |
+
@!%p1 mov.u32 %r357, %r342;
|
1187 |
+
mov.b32 %f476, %r354;
|
1188 |
+
mov.b32 %f477, %r355;
|
1189 |
+
mov.b32 %f478, %r356;
|
1190 |
+
mov.b32 %f479, %r357;
|
1191 |
+
mov.u32 %r362, 0x0;
|
1192 |
+
mov.u32 %r363, 0x0;
|
1193 |
+
mov.u32 %r364, 0x0;
|
1194 |
+
mov.u32 %r365, 0x0;
|
1195 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r362, %r363, %r364, %r365 }, [ %rd99 + 0 ];
|
1196 |
+
@!%p1 mov.u32 %r362, %r342;
|
1197 |
+
@!%p1 mov.u32 %r363, %r342;
|
1198 |
+
@!%p1 mov.u32 %r364, %r342;
|
1199 |
+
@!%p1 mov.u32 %r365, %r342;
|
1200 |
+
mov.b32 %f480, %r362;
|
1201 |
+
mov.b32 %f481, %r363;
|
1202 |
+
mov.b32 %f482, %r364;
|
1203 |
+
mov.b32 %f483, %r365;
|
1204 |
+
.loc 1 66 24
|
1205 |
+
add.f32 %f484, %f149, %f468;
|
1206 |
+
add.f32 %f485, %f150, %f469;
|
1207 |
+
add.f32 %f486, %f151, %f470;
|
1208 |
+
add.f32 %f487, %f152, %f471;
|
1209 |
+
add.f32 %f488, %f153, %f472;
|
1210 |
+
add.f32 %f489, %f154, %f473;
|
1211 |
+
add.f32 %f490, %f155, %f474;
|
1212 |
+
add.f32 %f491, %f156, %f475;
|
1213 |
+
add.f32 %f492, %f157, %f476;
|
1214 |
+
add.f32 %f493, %f158, %f477;
|
1215 |
+
add.f32 %f494, %f159, %f478;
|
1216 |
+
add.f32 %f495, %f160, %f479;
|
1217 |
+
add.f32 %f496, %f161, %f480;
|
1218 |
+
add.f32 %f497, %f162, %f481;
|
1219 |
+
add.f32 %f498, %f163, %f482;
|
1220 |
+
add.f32 %f499, %f164, %f483;
|
1221 |
+
.loc 1 67 24
|
1222 |
+
sub.f32 %f500, %f484, %f145;
|
1223 |
+
sub.f32 %f501, %f485, %f145;
|
1224 |
+
sub.f32 %f502, %f486, %f145;
|
1225 |
+
sub.f32 %f503, %f487, %f145;
|
1226 |
+
sub.f32 %f504, %f488, %f145;
|
1227 |
+
sub.f32 %f505, %f489, %f145;
|
1228 |
+
sub.f32 %f506, %f490, %f145;
|
1229 |
+
sub.f32 %f507, %f491, %f145;
|
1230 |
+
sub.f32 %f508, %f492, %f146;
|
1231 |
+
sub.f32 %f509, %f493, %f146;
|
1232 |
+
sub.f32 %f510, %f494, %f146;
|
1233 |
+
sub.f32 %f511, %f495, %f146;
|
1234 |
+
sub.f32 %f512, %f496, %f146;
|
1235 |
+
sub.f32 %f513, %f497, %f146;
|
1236 |
+
sub.f32 %f514, %f498, %f146;
|
1237 |
+
sub.f32 %f515, %f499, %f146;
|
1238 |
+
.loc 1 73 24
|
1239 |
+
mul.f32 %f518, %f500, %f516;
|
1240 |
+
mul.f32 %f519, %f501, %f516;
|
1241 |
+
mul.f32 %f520, %f502, %f516;
|
1242 |
+
mul.f32 %f521, %f503, %f516;
|
1243 |
+
mul.f32 %f522, %f504, %f516;
|
1244 |
+
mul.f32 %f523, %f505, %f516;
|
1245 |
+
mul.f32 %f524, %f506, %f516;
|
1246 |
+
mul.f32 %f525, %f507, %f516;
|
1247 |
+
mul.f32 %f526, %f508, %f517;
|
1248 |
+
mul.f32 %f527, %f509, %f517;
|
1249 |
+
mul.f32 %f528, %f510, %f517;
|
1250 |
+
mul.f32 %f529, %f511, %f517;
|
1251 |
+
mul.f32 %f530, %f512, %f517;
|
1252 |
+
mul.f32 %f531, %f513, %f517;
|
1253 |
+
mul.f32 %f532, %f514, %f517;
|
1254 |
+
mul.f32 %f533, %f515, %f517;
|
1255 |
+
.loc 1 74 24
|
1256 |
+
mul.f32 %f534, %f518, %f165;
|
1257 |
+
mul.f32 %f535, %f519, %f166;
|
1258 |
+
mul.f32 %f536, %f520, %f167;
|
1259 |
+
mul.f32 %f537, %f521, %f168;
|
1260 |
+
mul.f32 %f538, %f522, %f169;
|
1261 |
+
mul.f32 %f539, %f523, %f170;
|
1262 |
+
mul.f32 %f540, %f524, %f171;
|
1263 |
+
mul.f32 %f541, %f525, %f172;
|
1264 |
+
mul.f32 %f542, %f526, %f165;
|
1265 |
+
mul.f32 %f543, %f527, %f166;
|
1266 |
+
mul.f32 %f544, %f528, %f167;
|
1267 |
+
mul.f32 %f545, %f529, %f168;
|
1268 |
+
mul.f32 %f546, %f530, %f169;
|
1269 |
+
mul.f32 %f547, %f531, %f170;
|
1270 |
+
mul.f32 %f548, %f532, %f171;
|
1271 |
+
mul.f32 %f549, %f533, %f172;
|
1272 |
+
.loc 1 76 35
|
1273 |
+
add.s32 %r394, %r8, %r407;
|
1274 |
+
add.s32 %r395, %r394, 64;
|
1275 |
+
.loc 1 76 29
|
1276 |
+
add.s32 %r396, %r394, 8256;
|
1277 |
+
mul.wide.s32 %rd102, %r395, 2;
|
1278 |
+
add.s64 %rd100, %rd13, %rd102;
|
1279 |
+
mul.wide.s32 %rd103, %r396, 2;
|
1280 |
+
add.s64 %rd101, %rd13, %rd103;
|
1281 |
+
.loc 1 76 52
|
1282 |
+
mov.b32 %r370, %f534;
|
1283 |
+
cvt.rn.bf16.f32 %rs1, %r370;
|
1284 |
+
mov.b32 %r371, %f535;
|
1285 |
+
cvt.rn.bf16.f32 %rs2, %r371;
|
1286 |
+
mov.b32 %r372, %f536;
|
1287 |
+
cvt.rn.bf16.f32 %rs3, %r372;
|
1288 |
+
mov.b32 %r373, %f537;
|
1289 |
+
cvt.rn.bf16.f32 %rs4, %r373;
|
1290 |
+
mov.b32 %r374, %f538;
|
1291 |
+
cvt.rn.bf16.f32 %rs5, %r374;
|
1292 |
+
mov.b32 %r375, %f539;
|
1293 |
+
cvt.rn.bf16.f32 %rs6, %r375;
|
1294 |
+
mov.b32 %r376, %f540;
|
1295 |
+
cvt.rn.bf16.f32 %rs7, %r376;
|
1296 |
+
mov.b32 %r377, %f541;
|
1297 |
+
cvt.rn.bf16.f32 %rs8, %r377;
|
1298 |
+
mov.b32 %r378, %f542;
|
1299 |
+
cvt.rn.bf16.f32 %rs9, %r378;
|
1300 |
+
mov.b32 %r379, %f543;
|
1301 |
+
cvt.rn.bf16.f32 %rs10, %r379;
|
1302 |
+
mov.b32 %r380, %f544;
|
1303 |
+
cvt.rn.bf16.f32 %rs11, %r380;
|
1304 |
+
mov.b32 %r381, %f545;
|
1305 |
+
cvt.rn.bf16.f32 %rs12, %r381;
|
1306 |
+
mov.b32 %r382, %f546;
|
1307 |
+
cvt.rn.bf16.f32 %rs13, %r382;
|
1308 |
+
mov.b32 %r383, %f547;
|
1309 |
+
cvt.rn.bf16.f32 %rs14, %r383;
|
1310 |
+
mov.b32 %r384, %f548;
|
1311 |
+
cvt.rn.bf16.f32 %rs15, %r384;
|
1312 |
+
mov.b32 %r385, %f549;
|
1313 |
+
cvt.rn.bf16.f32 %rs16, %r385;
|
1314 |
+
mov.b32 %r397, {%rs1, %rs2};
|
1315 |
+
mov.b32 %r398, {%rs3, %rs4};
|
1316 |
+
mov.b32 %r399, {%rs5, %rs6};
|
1317 |
+
mov.b32 %r400, {%rs7, %rs8};
|
1318 |
+
@%p1 st.global.v4.b32 [ %rd100 + 0 ], { %r397, %r398, %r399, %r400 };
|
1319 |
+
mov.b32 %r401, {%rs9, %rs10};
|
1320 |
+
mov.b32 %r402, {%rs11, %rs12};
|
1321 |
+
mov.b32 %r403, {%rs13, %rs14};
|
1322 |
+
mov.b32 %r404, {%rs15, %rs16};
|
1323 |
+
@%p1 st.global.v4.b32 [ %rd101 + 0 ], { %r401, %r402, %r403, %r404 };
|
1324 |
+
.loc 1 55 36
|
1325 |
+
add.s64 %rd106, %rd106, 256;
|
1326 |
+
add.s32 %r407, %r407, 64;
|
1327 |
+
setp.lt.u32 %p136, %r407, 192;
|
1328 |
+
@%p136 bra $L__BB0_5;
|
1329 |
+
bra.uni $L__BB0_8;
|
1330 |
+
$L__BB0_5:
|
1331 |
+
.loc 1 59 35
|
1332 |
+
add.s64 %rd83, %rd6, %rd106;
|
1333 |
+
add.s64 %rd84, %rd83, 16;
|
1334 |
+
add.s64 %rd85, %rd5, %rd106;
|
1335 |
+
.loc 1 59 51
|
1336 |
+
add.s64 %rd86, %rd85, 16;
|
1337 |
+
mov.u32 %r289, 0x0;
|
1338 |
+
mov.u32 %r290, 0x0;
|
1339 |
+
mov.u32 %r291, 0x0;
|
1340 |
+
mov.u32 %r292, 0x0;
|
1341 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r289, %r290, %r291, %r292 }, [ %rd83 + 0 ];
|
1342 |
+
@!%p1 mov.u32 %r289, %r342;
|
1343 |
+
@!%p1 mov.u32 %r290, %r342;
|
1344 |
+
@!%p1 mov.u32 %r291, %r342;
|
1345 |
+
@!%p1 mov.u32 %r292, %r342;
|
1346 |
+
mov.b32 %f149, %r289;
|
1347 |
+
mov.b32 %f150, %r290;
|
1348 |
+
mov.b32 %f151, %r291;
|
1349 |
+
mov.b32 %f152, %r292;
|
1350 |
+
mov.u32 %r297, 0x0;
|
1351 |
+
mov.u32 %r298, 0x0;
|
1352 |
+
mov.u32 %r299, 0x0;
|
1353 |
+
mov.u32 %r300, 0x0;
|
1354 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r297, %r298, %r299, %r300 }, [ %rd84 + 0 ];
|
1355 |
+
@!%p1 mov.u32 %r297, %r342;
|
1356 |
+
@!%p1 mov.u32 %r298, %r342;
|
1357 |
+
@!%p1 mov.u32 %r299, %r342;
|
1358 |
+
@!%p1 mov.u32 %r300, %r342;
|
1359 |
+
mov.b32 %f153, %r297;
|
1360 |
+
mov.b32 %f154, %r298;
|
1361 |
+
mov.b32 %f155, %r299;
|
1362 |
+
mov.b32 %f156, %r300;
|
1363 |
+
mov.u32 %r305, 0x0;
|
1364 |
+
mov.u32 %r306, 0x0;
|
1365 |
+
mov.u32 %r307, 0x0;
|
1366 |
+
mov.u32 %r308, 0x0;
|
1367 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r305, %r306, %r307, %r308 }, [ %rd85 + 0 ];
|
1368 |
+
@!%p1 mov.u32 %r305, %r342;
|
1369 |
+
@!%p1 mov.u32 %r306, %r342;
|
1370 |
+
@!%p1 mov.u32 %r307, %r342;
|
1371 |
+
@!%p1 mov.u32 %r308, %r342;
|
1372 |
+
mov.b32 %f157, %r305;
|
1373 |
+
mov.b32 %f158, %r306;
|
1374 |
+
mov.b32 %f159, %r307;
|
1375 |
+
mov.b32 %f160, %r308;
|
1376 |
+
mov.u32 %r313, 0x0;
|
1377 |
+
mov.u32 %r314, 0x0;
|
1378 |
+
mov.u32 %r315, 0x0;
|
1379 |
+
mov.u32 %r316, 0x0;
|
1380 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r313, %r314, %r315, %r316 }, [ %rd86 + 0 ];
|
1381 |
+
@!%p1 mov.u32 %r313, %r342;
|
1382 |
+
@!%p1 mov.u32 %r314, %r342;
|
1383 |
+
@!%p1 mov.u32 %r315, %r342;
|
1384 |
+
@!%p1 mov.u32 %r316, %r342;
|
1385 |
+
mov.b32 %f161, %r313;
|
1386 |
+
mov.b32 %f162, %r314;
|
1387 |
+
mov.b32 %f163, %r315;
|
1388 |
+
mov.b32 %f164, %r316;
|
1389 |
+
.loc 1 60 35
|
1390 |
+
add.s64 %rd87, %rd9, %rd106;
|
1391 |
+
.loc 1 60 40
|
1392 |
+
add.s64 %rd88, %rd87, 16;
|
1393 |
+
mov.u32 %r321, 0x0;
|
1394 |
+
mov.u32 %r322, 0x0;
|
1395 |
+
mov.u32 %r323, 0x0;
|
1396 |
+
mov.u32 %r324, 0x0;
|
1397 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r321, %r322, %r323, %r324 }, [ %rd87 + 0 ];
|
1398 |
+
@!%p1 mov.u32 %r321, %r342;
|
1399 |
+
@!%p1 mov.u32 %r322, %r342;
|
1400 |
+
@!%p1 mov.u32 %r323, %r342;
|
1401 |
+
@!%p1 mov.u32 %r324, %r342;
|
1402 |
+
mov.b32 %f165, %r321;
|
1403 |
+
mov.b32 %f166, %r322;
|
1404 |
+
mov.b32 %f167, %r323;
|
1405 |
+
mov.b32 %f168, %r324;
|
1406 |
+
mov.u32 %r329, 0x0;
|
1407 |
+
mov.u32 %r330, 0x0;
|
1408 |
+
mov.u32 %r331, 0x0;
|
1409 |
+
mov.u32 %r332, 0x0;
|
1410 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r329, %r330, %r331, %r332 }, [ %rd88 + 0 ];
|
1411 |
+
@!%p1 mov.u32 %r329, %r342;
|
1412 |
+
@!%p1 mov.u32 %r330, %r342;
|
1413 |
+
@!%p1 mov.u32 %r331, %r342;
|
1414 |
+
@!%p1 mov.u32 %r332, %r342;
|
1415 |
+
mov.b32 %f169, %r329;
|
1416 |
+
mov.b32 %f170, %r330;
|
1417 |
+
mov.b32 %f171, %r331;
|
1418 |
+
mov.b32 %f172, %r332;
|
1419 |
+
.loc 1 64 57
|
1420 |
+
@%p41 bra $L__BB0_7;
|
1421 |
+
mov.u64 %rd89, assertMessage_1;
|
1422 |
+
cvta.global.u64 %rd90, %rd89;
|
1423 |
+
mov.u64 %rd91, assertFile_1;
|
1424 |
+
cvta.global.u64 %rd92, %rd91;
|
1425 |
+
mov.u64 %rd93, assertFunc_1;
|
1426 |
+
cvta.global.u64 %rd94, %rd93;
|
1427 |
+
{ // callseq 7, 0
|
1428 |
+
.reg .b32 temp_param_reg;
|
1429 |
+
.param .b64 param0;
|
1430 |
+
st.param.b64 [param0+0], %rd90;
|
1431 |
+
.param .b64 param1;
|
1432 |
+
st.param.b64 [param1+0], %rd92;
|
1433 |
+
.param .b32 param2;
|
1434 |
+
st.param.b32 [param2+0], %r405;
|
1435 |
+
.param .b64 param3;
|
1436 |
+
st.param.b64 [param3+0], %rd94;
|
1437 |
+
.param .b64 param4;
|
1438 |
+
st.param.b64 [param4+0], %rd104;
|
1439 |
+
call.uni
|
1440 |
+
__assertfail,
|
1441 |
+
(
|
1442 |
+
param0,
|
1443 |
+
param1,
|
1444 |
+
param2,
|
1445 |
+
param3,
|
1446 |
+
param4
|
1447 |
+
);
|
1448 |
+
} // callseq 7
|
1449 |
+
bra.uni $L__BB0_7;
|
1450 |
+
$L__BB0_8:
|
1451 |
+
.loc 1 55 4
|
1452 |
+
ret;
|
1453 |
+
$L__tmp17:
|
1454 |
+
$L__func_end0:
|
1455 |
+
|
1456 |
+
}
|
1457 |
+
// .globl __nv_rsqrtf
|
1458 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
1459 |
+
.param .b32 __nv_rsqrtf_param_0
|
1460 |
+
)
|
1461 |
+
{
|
1462 |
+
.reg .f32 %f<3>;
|
1463 |
+
$L__func_begin1:
|
1464 |
+
|
1465 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
1466 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
1467 |
+
st.param.f32 [func_retval0+0], %f2;
|
1468 |
+
ret;
|
1469 |
+
$L__func_end1:
|
1470 |
+
|
1471 |
+
}
|
1472 |
+
.file 1 "/tmp/torchinductor_root/gx/cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py"
|
1473 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
1474 |
+
.section .debug_abbrev
|
1475 |
+
{
|
1476 |
+
.b8 1
|
1477 |
+
.b8 17
|
1478 |
+
.b8 1
|
1479 |
+
.b8 37
|
1480 |
+
.b8 8
|
1481 |
+
.b8 19
|
1482 |
+
.b8 5
|
1483 |
+
.b8 3
|
1484 |
+
.b8 8
|
1485 |
+
.b8 16
|
1486 |
+
.b8 6
|
1487 |
+
.b8 27
|
1488 |
+
.b8 8
|
1489 |
+
.b8 180
|
1490 |
+
.b8 66
|
1491 |
+
.b8 12
|
1492 |
+
.b8 17
|
1493 |
+
.b8 1
|
1494 |
+
.b8 18
|
1495 |
+
.b8 1
|
1496 |
+
.b8 0
|
1497 |
+
.b8 0
|
1498 |
+
.b8 2
|
1499 |
+
.b8 46
|
1500 |
+
.b8 0
|
1501 |
+
.b8 135
|
1502 |
+
.b8 64
|
1503 |
+
.b8 8
|
1504 |
+
.b8 3
|
1505 |
+
.b8 8
|
1506 |
+
.b8 58
|
1507 |
+
.b8 11
|
1508 |
+
.b8 59
|
1509 |
+
.b8 11
|
1510 |
+
.b8 63
|
1511 |
+
.b8 12
|
1512 |
+
.b8 32
|
1513 |
+
.b8 11
|
1514 |
+
.b8 0
|
1515 |
+
.b8 0
|
1516 |
+
.b8 3
|
1517 |
+
.b8 46
|
1518 |
+
.b8 1
|
1519 |
+
.b8 17
|
1520 |
+
.b8 1
|
1521 |
+
.b8 18
|
1522 |
+
.b8 1
|
1523 |
+
.b8 64
|
1524 |
+
.b8 10
|
1525 |
+
.b8 49
|
1526 |
+
.b8 19
|
1527 |
+
.b8 0
|
1528 |
+
.b8 0
|
1529 |
+
.b8 4
|
1530 |
+
.b8 29
|
1531 |
+
.b8 0
|
1532 |
+
.b8 49
|
1533 |
+
.b8 19
|
1534 |
+
.b8 17
|
1535 |
+
.b8 1
|
1536 |
+
.b8 18
|
1537 |
+
.b8 1
|
1538 |
+
.b8 88
|
1539 |
+
.b8 11
|
1540 |
+
.b8 89
|
1541 |
+
.b8 11
|
1542 |
+
.b8 87
|
1543 |
+
.b8 11
|
1544 |
+
.b8 0
|
1545 |
+
.b8 0
|
1546 |
+
.b8 5
|
1547 |
+
.b8 29
|
1548 |
+
.b8 1
|
1549 |
+
.b8 49
|
1550 |
+
.b8 19
|
1551 |
+
.b8 17
|
1552 |
+
.b8 1
|
1553 |
+
.b8 18
|
1554 |
+
.b8 1
|
1555 |
+
.b8 88
|
1556 |
+
.b8 11
|
1557 |
+
.b8 89
|
1558 |
+
.b8 11
|
1559 |
+
.b8 87
|
1560 |
+
.b8 11
|
1561 |
+
.b8 0
|
1562 |
+
.b8 0
|
1563 |
+
.b8 0
|
1564 |
+
}
|
1565 |
+
.section .debug_info
|
1566 |
+
{
|
1567 |
+
.b32 298
|
1568 |
+
.b8 2
|
1569 |
+
.b8 0
|
1570 |
+
.b32 .debug_abbrev
|
1571 |
+
.b8 8
|
1572 |
+
.b8 1
|
1573 |
+
.b8 116
|
1574 |
+
.b8 114
|
1575 |
+
.b8 105
|
1576 |
+
.b8 116
|
1577 |
+
.b8 111
|
1578 |
+
.b8 110
|
1579 |
+
.b8 0
|
1580 |
+
.b8 2
|
1581 |
+
.b8 0
|
1582 |
+
.b8 99
|
1583 |
+
.b8 103
|
1584 |
+
.b8 120
|
1585 |
+
.b8 53
|
1586 |
+
.b8 108
|
1587 |
+
.b8 120
|
1588 |
+
.b8 112
|
1589 |
+
.b8 117
|
1590 |
+
.b8 101
|
1591 |
+
.b8 120
|
1592 |
+
.b8 112
|
1593 |
+
.b8 105
|
1594 |
+
.b8 110
|
1595 |
+
.b8 100
|
1596 |
+
.b8 106
|
1597 |
+
.b8 52
|
1598 |
+
.b8 100
|
1599 |
+
.b8 115
|
1600 |
+
.b8 109
|
1601 |
+
.b8 106
|
1602 |
+
.b8 122
|
1603 |
+
.b8 53
|
1604 |
+
.b8 120
|
1605 |
+
.b8 52
|
1606 |
+
.b8 50
|
1607 |
+
.b8 117
|
1608 |
+
.b8 104
|
1609 |
+
.b8 121
|
1610 |
+
.b8 121
|
1611 |
+
.b8 55
|
1612 |
+
.b8 105
|
1613 |
+
.b8 115
|
1614 |
+
.b8 107
|
1615 |
+
.b8 101
|
1616 |
+
.b8 118
|
1617 |
+
.b8 113
|
1618 |
+
.b8 55
|
1619 |
+
.b8 111
|
1620 |
+
.b8 118
|
1621 |
+
.b8 122
|
1622 |
+
.b8 112
|
1623 |
+
.b8 119
|
1624 |
+
.b8 97
|
1625 |
+
.b8 103
|
1626 |
+
.b8 98
|
1627 |
+
.b8 51
|
1628 |
+
.b8 116
|
1629 |
+
.b8 53
|
1630 |
+
.b8 112
|
1631 |
+
.b8 111
|
1632 |
+
.b8 119
|
1633 |
+
.b8 106
|
1634 |
+
.b8 46
|
1635 |
+
.b8 112
|
1636 |
+
.b8 121
|
1637 |
+
.b8 0
|
1638 |
+
.b32 .debug_line
|
1639 |
+
.b8 47
|
1640 |
+
.b8 116
|
1641 |
+
.b8 109
|
1642 |
+
.b8 112
|
1643 |
+
.b8 47
|
1644 |
+
.b8 116
|
1645 |
+
.b8 111
|
1646 |
+
.b8 114
|
1647 |
+
.b8 99
|
1648 |
+
.b8 104
|
1649 |
+
.b8 105
|
1650 |
+
.b8 110
|
1651 |
+
.b8 100
|
1652 |
+
.b8 117
|
1653 |
+
.b8 99
|
1654 |
+
.b8 116
|
1655 |
+
.b8 111
|
1656 |
+
.b8 114
|
1657 |
+
.b8 95
|
1658 |
+
.b8 114
|
1659 |
+
.b8 111
|
1660 |
+
.b8 111
|
1661 |
+
.b8 116
|
1662 |
+
.b8 47
|
1663 |
+
.b8 103
|
1664 |
+
.b8 120
|
1665 |
+
.b8 0
|
1666 |
+
.b8 1
|
1667 |
+
.b64 $L__func_begin0
|
1668 |
+
.b64 $L__func_end0
|
1669 |
+
.b8 2
|
1670 |
+
.b8 116
|
1671 |
+
.b8 114
|
1672 |
+
.b8 105
|
1673 |
+
.b8 116
|
1674 |
+
.b8 111
|
1675 |
+
.b8 110
|
1676 |
+
.b8 95
|
1677 |
+
.b8 95
|
1678 |
+
.b8 48
|
1679 |
+
.b8 100
|
1680 |
+
.b8 49
|
1681 |
+
.b8 100
|
1682 |
+
.b8 50
|
1683 |
+
.b8 100
|
1684 |
+
.b8 51
|
1685 |
+
.b8 100
|
1686 |
+
.b8 52
|
1687 |
+
.b8 100
|
1688 |
+
.b8 53
|
1689 |
+
.b8 100
|
1690 |
+
.b8 101
|
1691 |
+
.b8 54
|
1692 |
+
.b8 100
|
1693 |
+
.b8 101
|
1694 |
+
.b8 0
|
1695 |
+
.b8 116
|
1696 |
+
.b8 114
|
1697 |
+
.b8 105
|
1698 |
+
.b8 116
|
1699 |
+
.b8 111
|
1700 |
+
.b8 110
|
1701 |
+
.b8 95
|
1702 |
+
.b8 95
|
1703 |
+
.b8 48
|
1704 |
+
.b8 100
|
1705 |
+
.b8 49
|
1706 |
+
.b8 100
|
1707 |
+
.b8 50
|
1708 |
+
.b8 100
|
1709 |
+
.b8 51
|
1710 |
+
.b8 100
|
1711 |
+
.b8 52
|
1712 |
+
.b8 100
|
1713 |
+
.b8 53
|
1714 |
+
.b8 100
|
1715 |
+
.b8 101
|
1716 |
+
.b8 54
|
1717 |
+
.b8 100
|
1718 |
+
.b8 101
|
1719 |
+
.b8 0
|
1720 |
+
.b8 1
|
1721 |
+
.b8 18
|
1722 |
+
.b8 1
|
1723 |
+
.b8 1
|
1724 |
+
.b8 3
|
1725 |
+
.b64 $L__func_begin0
|
1726 |
+
.b64 $L__func_end0
|
1727 |
+
.b8 1
|
1728 |
+
.b8 156
|
1729 |
+
.b32 125
|
1730 |
+
.b8 4
|
1731 |
+
.b32 125
|
1732 |
+
.b64 $L__tmp1
|
1733 |
+
.b64 $L__tmp2
|
1734 |
+
.b8 2
|
1735 |
+
.b8 44
|
1736 |
+
.b8 38
|
1737 |
+
.b8 5
|
1738 |
+
.b32 125
|
1739 |
+
.b64 $L__tmp3
|
1740 |
+
.b64 $L__tmp16
|
1741 |
+
.b8 2
|
1742 |
+
.b8 50
|
1743 |
+
.b8 41
|
1744 |
+
.b8 4
|
1745 |
+
.b32 125
|
1746 |
+
.b64 $L__tmp3
|
1747 |
+
.b64 $L__tmp16
|
1748 |
+
.b8 2
|
1749 |
+
.b8 120
|
1750 |
+
.b8 46
|
1751 |
+
.b8 0
|
1752 |
+
.b8 4
|
1753 |
+
.b32 125
|
1754 |
+
.b64 $L__tmp4
|
1755 |
+
.b64 $L__tmp15
|
1756 |
+
.b8 2
|
1757 |
+
.b8 50
|
1758 |
+
.b8 41
|
1759 |
+
.b8 0
|
1760 |
+
.b8 0
|
1761 |
+
}
|
1762 |
+
.section .debug_pubnames
|
1763 |
+
{
|
1764 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
1765 |
+
$L__pubNames_start0:
|
1766 |
+
.b8 2
|
1767 |
+
.b8 0
|
1768 |
+
.b32 .debug_info
|
1769 |
+
.b32 302
|
1770 |
+
.b32 125
|
1771 |
+
.b8 116
|
1772 |
+
.b8 114
|
1773 |
+
.b8 105
|
1774 |
+
.b8 116
|
1775 |
+
.b8 111
|
1776 |
+
.b8 110
|
1777 |
+
.b8 95
|
1778 |
+
.b8 95
|
1779 |
+
.b8 48
|
1780 |
+
.b8 100
|
1781 |
+
.b8 49
|
1782 |
+
.b8 100
|
1783 |
+
.b8 50
|
1784 |
+
.b8 100
|
1785 |
+
.b8 51
|
1786 |
+
.b8 100
|
1787 |
+
.b8 52
|
1788 |
+
.b8 100
|
1789 |
+
.b8 53
|
1790 |
+
.b8 100
|
1791 |
+
.b8 101
|
1792 |
+
.b8 54
|
1793 |
+
.b8 100
|
1794 |
+
.b8 101
|
1795 |
+
.b8 0
|
1796 |
+
.b32 0
|
1797 |
+
$L__pubNames_end0:
|
1798 |
+
}
|
1799 |
+
.section .debug_pubtypes
|
1800 |
+
{
|
1801 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
1802 |
+
$L__pubTypes_start0:
|
1803 |
+
.b8 2
|
1804 |
+
.b8 0
|
1805 |
+
.b32 .debug_info
|
1806 |
+
.b32 302
|
1807 |
+
.b32 0
|
1808 |
+
$L__pubTypes_end0:
|
1809 |
+
}
|
1810 |
+
.section .debug_loc { }
|
.triton/dump/510522bb05917b836ed253751364fcad/triton_.ttir
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant 0.000000e+00 : f32
|
4 |
+
%cst_0 = arith.constant dense<1.000000e+00> : tensor<64x64xf32>
|
5 |
+
%c256_i32 = arith.constant 256 : i32
|
6 |
+
%c64_i32 = arith.constant 64 : i32
|
7 |
+
%c0_i32 = arith.constant 0 : i32
|
8 |
+
%cst_1 = arith.constant dense<256> : tensor<64x1xi64>
|
9 |
+
%cst_2 = arith.constant dense<0> : tensor<64x1xi64>
|
10 |
+
%cst_3 = arith.constant dense<50257> : tensor<64x1xi64>
|
11 |
+
%cst_4 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
|
12 |
+
%cst_5 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
|
13 |
+
%cst_6 = arith.constant dense<0.000000e+00> : tensor<1x64xf32>
|
14 |
+
%cst_7 = arith.constant dense<0.000000e+00> : tensor<64x64xf32>
|
15 |
+
%cst_8 = arith.constant dense<256> : tensor<64x1xi32>
|
16 |
+
%cst_9 = arith.constant dense<256> : tensor<1x64xi32>
|
17 |
+
%cst_10 = arith.constant dense<512> : tensor<64x1xi32>
|
18 |
+
%0 = tt.get_program_id x : i32
|
19 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
20 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
|
21 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
|
22 |
+
%4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
|
23 |
+
%5 = arith.addi %4, %3 : tensor<64x1xi32>
|
24 |
+
%6 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32>
|
25 |
+
%7 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
|
26 |
+
%8 = tt.addptr %7, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
|
27 |
+
%9 = tt.load %8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
|
28 |
+
%10 = arith.remsi %5, %cst_10 : tensor<64x1xi32>
|
29 |
+
%11 = arith.muli %10, %cst_8 : tensor<64x1xi32>
|
30 |
+
%12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x64xi32>
|
31 |
+
%13 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
|
32 |
+
%14 = arith.addi %9, %cst_3 : tensor<64x1xi64>
|
33 |
+
%15 = arith.cmpi slt, %9, %cst_2 : tensor<64x1xi64>
|
34 |
+
%16 = arith.select %15, %14, %9 : tensor<64x1xi1>, tensor<64x1xi64>
|
35 |
+
%17 = arith.cmpi sge, %16, %cst_2 : tensor<64x1xi64>
|
36 |
+
%18 = arith.cmpi slt, %16, %cst_3 : tensor<64x1xi64>
|
37 |
+
%19 = arith.andi %17, %18 : tensor<64x1xi1>
|
38 |
+
%20 = arith.muli %16, %cst_1 : tensor<64x1xi64>
|
39 |
+
%21 = tt.broadcast %20 : (tensor<64x1xi64>) -> tensor<64x64xi64>
|
40 |
+
%22 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
|
41 |
+
%23:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg8 = %cst_7, %arg9 = %cst_7, %arg10 = %cst_7) -> (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) : i32 {
|
42 |
+
%46 = tt.splat %arg7 : (i32) -> tensor<1x64xi32>
|
43 |
+
%47 = arith.addi %46, %6 : tensor<1x64xi32>
|
44 |
+
%48 = arith.cmpi slt, %47, %cst_9 : tensor<1x64xi32>
|
45 |
+
%49 = tt.broadcast %47 : (tensor<1x64xi32>) -> tensor<64x64xi32>
|
46 |
+
%50 = arith.addi %49, %12 : tensor<64x64xi32>
|
47 |
+
%51 = tt.addptr %13, %50 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi32>
|
48 |
+
%52 = tt.broadcast %48 : (tensor<1x64xi1>) -> tensor<64x64xi1>
|
49 |
+
%53 = tt.load %51, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
|
50 |
+
tt.assert %19, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
|
51 |
+
%54 = arith.extsi %47 : tensor<1x64xi32> to tensor<1x64xi64>
|
52 |
+
%55 = tt.broadcast %54 : (tensor<1x64xi64>) -> tensor<64x64xi64>
|
53 |
+
%56 = arith.addi %55, %21 : tensor<64x64xi64>
|
54 |
+
%57 = tt.addptr %22, %56 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi64>
|
55 |
+
%58 = tt.load %57, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
|
56 |
+
%59 = arith.addf %58, %53 : tensor<64x64xf32>
|
57 |
+
%60 = arith.subf %59, %arg8 : tensor<64x64xf32>
|
58 |
+
%61 = arith.addf %arg10, %cst_0 : tensor<64x64xf32>
|
59 |
+
%62 = arith.divf %60, %61 : tensor<64x64xf32>
|
60 |
+
%63 = arith.addf %arg8, %62 : tensor<64x64xf32>
|
61 |
+
%64 = arith.subf %59, %63 : tensor<64x64xf32>
|
62 |
+
%65 = arith.mulf %60, %64 : tensor<64x64xf32>
|
63 |
+
%66 = arith.addf %arg9, %65 : tensor<64x64xf32>
|
64 |
+
%67 = arith.select %52, %63, %arg8 : tensor<64x64xi1>, tensor<64x64xf32>
|
65 |
+
%68 = arith.select %52, %66, %arg9 : tensor<64x64xi1>, tensor<64x64xf32>
|
66 |
+
%69 = arith.select %52, %61, %arg10 : tensor<64x64xi1>, tensor<64x64xf32>
|
67 |
+
scf.yield %67, %68, %69 : tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>
|
68 |
+
}
|
69 |
+
%24:3 = "tt.reduce"(%23#0, %23#1, %23#2) <{axis = 1 : i32}> ({
|
70 |
+
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
|
71 |
+
%46 = arith.subf %arg10, %arg7 : f32
|
72 |
+
%47 = arith.addf %arg9, %arg12 : f32
|
73 |
+
%48 = arith.cmpf oeq, %47, %cst : f32
|
74 |
+
%49 = arith.divf %arg12, %47 : f32
|
75 |
+
%50 = arith.select %48, %cst, %49 : f32
|
76 |
+
%51 = arith.mulf %46, %50 : f32
|
77 |
+
%52 = arith.addf %arg7, %51 : f32
|
78 |
+
%53 = arith.addf %arg8, %arg11 : f32
|
79 |
+
%54 = arith.mulf %46, %46 : f32
|
80 |
+
%55 = arith.mulf %54, %arg9 : f32
|
81 |
+
%56 = arith.mulf %55, %50 : f32
|
82 |
+
%57 = arith.addf %53, %56 : f32
|
83 |
+
tt.reduce.return %52, %57, %47 : f32, f32, f32
|
84 |
+
}) : (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
|
85 |
+
%25 = tt.expand_dims %24#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
86 |
+
%26 = tt.expand_dims %24#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
87 |
+
%27 = arith.muli %10, %cst_8 : tensor<64x1xi32>
|
88 |
+
%28 = tt.broadcast %27 : (tensor<64x1xi32>) -> tensor<64x64xi32>
|
89 |
+
%29 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
|
90 |
+
%30 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x64x!tt.ptr<f32, 1>>
|
91 |
+
%31 = arith.addi %9, %cst_3 : tensor<64x1xi64>
|
92 |
+
%32 = arith.cmpi slt, %9, %cst_2 : tensor<64x1xi64>
|
93 |
+
%33 = arith.select %32, %31, %9 : tensor<64x1xi1>, tensor<64x1xi64>
|
94 |
+
%34 = arith.cmpi sge, %33, %cst_2 : tensor<64x1xi64>
|
95 |
+
%35 = arith.cmpi slt, %33, %cst_3 : tensor<64x1xi64>
|
96 |
+
%36 = arith.andi %34, %35 : tensor<64x1xi1>
|
97 |
+
%37 = arith.muli %33, %cst_1 : tensor<64x1xi64>
|
98 |
+
%38 = tt.broadcast %37 : (tensor<64x1xi64>) -> tensor<64x64xi64>
|
99 |
+
%39 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
|
100 |
+
%40 = tt.broadcast %25 : (tensor<64x1xf32>) -> tensor<64x64xf32>
|
101 |
+
%41 = arith.divf %26, %cst_5 : tensor<64x1xf32>
|
102 |
+
%42 = arith.addf %41, %cst_4 : tensor<64x1xf32>
|
103 |
+
%43 = arith.muli %5, %cst_8 : tensor<64x1xi32>
|
104 |
+
%44 = tt.broadcast %43 : (tensor<64x1xi32>) -> tensor<64x64xi32>
|
105 |
+
%45 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
|
106 |
+
scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 : i32 {
|
107 |
+
%46 = tt.splat %arg7 : (i32) -> tensor<1x64xi32>
|
108 |
+
%47 = arith.addi %46, %6 : tensor<1x64xi32>
|
109 |
+
%48 = arith.cmpi slt, %47, %cst_9 : tensor<1x64xi32>
|
110 |
+
%49 = tt.broadcast %47 : (tensor<1x64xi32>) -> tensor<64x64xi32>
|
111 |
+
%50 = arith.addi %49, %28 : tensor<64x64xi32>
|
112 |
+
%51 = tt.addptr %29, %50 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi32>
|
113 |
+
%52 = tt.broadcast %48 : (tensor<1x64xi1>) -> tensor<64x64xi1>
|
114 |
+
%53 = tt.load %51, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
|
115 |
+
%54 = tt.addptr %30, %47 : tensor<1x64x!tt.ptr<f32, 1>>, tensor<1x64xi32>
|
116 |
+
%55 = tt.load %54, %48, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32>
|
117 |
+
tt.assert %36, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
|
118 |
+
%56 = arith.extsi %47 : tensor<1x64xi32> to tensor<1x64xi64>
|
119 |
+
%57 = tt.broadcast %56 : (tensor<1x64xi64>) -> tensor<64x64xi64>
|
120 |
+
%58 = arith.addi %57, %38 : tensor<64x64xi64>
|
121 |
+
%59 = tt.addptr %39, %58 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi64>
|
122 |
+
%60 = tt.load %59, %52, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32>
|
123 |
+
%61 = arith.addf %60, %53 : tensor<64x64xf32>
|
124 |
+
%62 = arith.subf %61, %40 : tensor<64x64xf32>
|
125 |
+
%63 = tt.extern_elementwise %42 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
|
126 |
+
%64 = tt.broadcast %63 : (tensor<64x1xf32>) -> tensor<64x64xf32>
|
127 |
+
%65 = arith.mulf %62, %64 : tensor<64x64xf32>
|
128 |
+
%66 = tt.broadcast %55 : (tensor<1x64xf32>) -> tensor<64x64xf32>
|
129 |
+
%67 = arith.mulf %65, %66 : tensor<64x64xf32>
|
130 |
+
%68 = arith.addi %49, %44 : tensor<64x64xi32>
|
131 |
+
%69 = tt.addptr %45, %68 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi32>
|
132 |
+
%70 = arith.truncf %67 : tensor<64x64xf32> to tensor<64x64xbf16>
|
133 |
+
tt.store %69, %70, %52 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16>
|
134 |
+
}
|
135 |
+
tt.return
|
136 |
+
}
|
137 |
+
}
|
.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.llir
ADDED
@@ -0,0 +1,1360 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
|
5 |
+
@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
6 |
+
@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
|
7 |
+
@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
|
8 |
+
@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
|
9 |
+
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
|
10 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
|
11 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
12 |
+
|
13 |
+
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
|
14 |
+
|
15 |
+
define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
|
16 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
17 |
+
%10 = lshr i32 %9, 3, !dbg !10
|
18 |
+
%11 = and i32 %10, 31, !dbg !10
|
19 |
+
%12 = and i32 %9, 63, !dbg !10
|
20 |
+
%13 = shl i32 %9, 3, !dbg !11
|
21 |
+
%14 = and i32 %13, 56, !dbg !11
|
22 |
+
%15 = or i32 %14, 4, !dbg !11
|
23 |
+
%16 = lshr i32 %9, 6, !dbg !12
|
24 |
+
%17 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13
|
25 |
+
%18 = shl i32 %17, 6, !dbg !14
|
26 |
+
%19 = or i32 %18, %11, !dbg !15
|
27 |
+
%20 = or i32 %19, 32, !dbg !15
|
28 |
+
%21 = or i32 %18, %12, !dbg !15
|
29 |
+
%22 = sext i32 %19 to i64, !dbg !16
|
30 |
+
%23 = getelementptr i64, ptr addrspace(1) %0, i64 %22, !dbg !16
|
31 |
+
%24 = sext i32 %20 to i64, !dbg !16
|
32 |
+
%25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !16
|
33 |
+
%26 = sext i32 %21 to i64, !dbg !16
|
34 |
+
%27 = getelementptr i64, ptr addrspace(1) %0, i64 %26, !dbg !16
|
35 |
+
%28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
|
36 |
+
%29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
|
37 |
+
%30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
|
38 |
+
%31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
|
39 |
+
%32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
|
40 |
+
%33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
|
41 |
+
%34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
|
42 |
+
%35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %23, i1 true) #6, !dbg !17
|
43 |
+
%36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
|
44 |
+
%37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
|
45 |
+
%38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
|
46 |
+
%39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
|
47 |
+
%40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
|
48 |
+
%41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
|
49 |
+
%42 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
|
50 |
+
%43 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
|
51 |
+
%44 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %27, i1 true) #6, !dbg !17
|
52 |
+
%45 = srem i32 %19, 512, !dbg !18
|
53 |
+
%46 = srem i32 %20, 512, !dbg !18
|
54 |
+
%47 = shl nsw i32 %45, 8, !dbg !19
|
55 |
+
%48 = shl nsw i32 %46, 8, !dbg !19
|
56 |
+
%49 = shl i32 %19, 8, !dbg !20
|
57 |
+
%50 = shl i32 %20, 8, !dbg !20
|
58 |
+
%51 = add i64 %44, 50257, !dbg !21
|
59 |
+
%52 = icmp slt i64 %28, 0, !dbg !22
|
60 |
+
%53 = icmp slt i64 %36, 0, !dbg !22
|
61 |
+
%54 = icmp slt i64 %44, 0, !dbg !22
|
62 |
+
%55 = select i1 %54, i64 %51, i64 %44, !dbg !23
|
63 |
+
%56 = icmp ugt i64 %55, 50256, !dbg !24
|
64 |
+
%57 = shl i64 %28, 8, !dbg !25
|
65 |
+
%58 = add i64 %57, 12865792, !dbg !25
|
66 |
+
%59 = select i1 %52, i64 %58, i64 %57, !dbg !25
|
67 |
+
%60 = shl i64 %36, 8, !dbg !25
|
68 |
+
%61 = add i64 %60, 12865792, !dbg !25
|
69 |
+
%62 = select i1 %53, i64 %61, i64 %60, !dbg !25
|
70 |
+
%63 = getelementptr float, ptr addrspace(1) %1, i64 %59
|
71 |
+
%64 = getelementptr float, ptr addrspace(1) %1, i64 %62
|
72 |
+
br label %65, !dbg !12
|
73 |
+
|
74 |
+
65: ; preds = %8, %230
|
75 |
+
%66 = phi float [ 0.000000e+00, %8 ], [ %321, %230 ]
|
76 |
+
%67 = phi float [ 0.000000e+00, %8 ], [ %322, %230 ]
|
77 |
+
%68 = phi float [ 0.000000e+00, %8 ], [ %323, %230 ]
|
78 |
+
%69 = phi float [ 0.000000e+00, %8 ], [ %324, %230 ]
|
79 |
+
%70 = phi float [ 0.000000e+00, %8 ], [ %325, %230 ]
|
80 |
+
%71 = phi float [ 0.000000e+00, %8 ], [ %326, %230 ]
|
81 |
+
%72 = phi float [ 0.000000e+00, %8 ], [ %327, %230 ]
|
82 |
+
%73 = phi float [ 0.000000e+00, %8 ], [ %328, %230 ]
|
83 |
+
%74 = phi float [ 0.000000e+00, %8 ], [ %329, %230 ]
|
84 |
+
%75 = phi float [ 0.000000e+00, %8 ], [ %330, %230 ]
|
85 |
+
%76 = phi float [ 0.000000e+00, %8 ], [ %331, %230 ]
|
86 |
+
%77 = phi float [ 0.000000e+00, %8 ], [ %332, %230 ]
|
87 |
+
%78 = phi float [ 0.000000e+00, %8 ], [ %333, %230 ]
|
88 |
+
%79 = phi float [ 0.000000e+00, %8 ], [ %334, %230 ]
|
89 |
+
%80 = phi float [ 0.000000e+00, %8 ], [ %335, %230 ]
|
90 |
+
%81 = phi float [ 0.000000e+00, %8 ], [ %336, %230 ]
|
91 |
+
%82 = phi float [ 0.000000e+00, %8 ], [ %337, %230 ]
|
92 |
+
%83 = phi float [ 0.000000e+00, %8 ], [ %338, %230 ]
|
93 |
+
%84 = phi float [ 0.000000e+00, %8 ], [ %339, %230 ]
|
94 |
+
%85 = phi float [ 0.000000e+00, %8 ], [ %340, %230 ]
|
95 |
+
%86 = phi float [ 0.000000e+00, %8 ], [ %341, %230 ]
|
96 |
+
%87 = phi float [ 0.000000e+00, %8 ], [ %342, %230 ]
|
97 |
+
%88 = phi float [ 0.000000e+00, %8 ], [ %343, %230 ]
|
98 |
+
%89 = phi float [ 0.000000e+00, %8 ], [ %344, %230 ]
|
99 |
+
%90 = phi float [ 0.000000e+00, %8 ], [ %345, %230 ]
|
100 |
+
%91 = phi float [ 0.000000e+00, %8 ], [ %346, %230 ]
|
101 |
+
%92 = phi float [ 0.000000e+00, %8 ], [ %347, %230 ]
|
102 |
+
%93 = phi float [ 0.000000e+00, %8 ], [ %348, %230 ]
|
103 |
+
%94 = phi float [ 0.000000e+00, %8 ], [ %349, %230 ]
|
104 |
+
%95 = phi float [ 0.000000e+00, %8 ], [ %350, %230 ]
|
105 |
+
%96 = phi float [ 0.000000e+00, %8 ], [ %351, %230 ]
|
106 |
+
%97 = phi float [ 0.000000e+00, %8 ], [ %352, %230 ]
|
107 |
+
%98 = phi float [ 0.000000e+00, %8 ], [ %417, %230 ]
|
108 |
+
%99 = phi float [ 0.000000e+00, %8 ], [ %418, %230 ]
|
109 |
+
%100 = phi float [ 0.000000e+00, %8 ], [ %419, %230 ]
|
110 |
+
%101 = phi float [ 0.000000e+00, %8 ], [ %420, %230 ]
|
111 |
+
%102 = phi float [ 0.000000e+00, %8 ], [ %421, %230 ]
|
112 |
+
%103 = phi float [ 0.000000e+00, %8 ], [ %422, %230 ]
|
113 |
+
%104 = phi float [ 0.000000e+00, %8 ], [ %423, %230 ]
|
114 |
+
%105 = phi float [ 0.000000e+00, %8 ], [ %424, %230 ]
|
115 |
+
%106 = phi float [ 0.000000e+00, %8 ], [ %425, %230 ]
|
116 |
+
%107 = phi float [ 0.000000e+00, %8 ], [ %426, %230 ]
|
117 |
+
%108 = phi float [ 0.000000e+00, %8 ], [ %427, %230 ]
|
118 |
+
%109 = phi float [ 0.000000e+00, %8 ], [ %428, %230 ]
|
119 |
+
%110 = phi float [ 0.000000e+00, %8 ], [ %429, %230 ]
|
120 |
+
%111 = phi float [ 0.000000e+00, %8 ], [ %430, %230 ]
|
121 |
+
%112 = phi float [ 0.000000e+00, %8 ], [ %431, %230 ]
|
122 |
+
%113 = phi float [ 0.000000e+00, %8 ], [ %432, %230 ]
|
123 |
+
%114 = phi float [ 0.000000e+00, %8 ], [ %369, %230 ]
|
124 |
+
%115 = phi float [ 0.000000e+00, %8 ], [ %370, %230 ]
|
125 |
+
%116 = phi float [ 0.000000e+00, %8 ], [ %371, %230 ]
|
126 |
+
%117 = phi float [ 0.000000e+00, %8 ], [ %372, %230 ]
|
127 |
+
%118 = phi float [ 0.000000e+00, %8 ], [ %373, %230 ]
|
128 |
+
%119 = phi float [ 0.000000e+00, %8 ], [ %374, %230 ]
|
129 |
+
%120 = phi float [ 0.000000e+00, %8 ], [ %375, %230 ]
|
130 |
+
%121 = phi float [ 0.000000e+00, %8 ], [ %376, %230 ]
|
131 |
+
%122 = phi float [ 0.000000e+00, %8 ], [ %377, %230 ]
|
132 |
+
%123 = phi float [ 0.000000e+00, %8 ], [ %378, %230 ]
|
133 |
+
%124 = phi float [ 0.000000e+00, %8 ], [ %379, %230 ]
|
134 |
+
%125 = phi float [ 0.000000e+00, %8 ], [ %380, %230 ]
|
135 |
+
%126 = phi float [ 0.000000e+00, %8 ], [ %381, %230 ]
|
136 |
+
%127 = phi float [ 0.000000e+00, %8 ], [ %382, %230 ]
|
137 |
+
%128 = phi float [ 0.000000e+00, %8 ], [ %383, %230 ]
|
138 |
+
%129 = phi float [ 0.000000e+00, %8 ], [ %384, %230 ]
|
139 |
+
%130 = phi i32 [ 0, %8 ], [ %433, %230 ]
|
140 |
+
%131 = or i32 %130, %14, !dbg !26
|
141 |
+
%132 = or i32 %130, %15, !dbg !26
|
142 |
+
%133 = add i32 %131, %47, !dbg !27
|
143 |
+
%134 = add i32 %132, %47, !dbg !27
|
144 |
+
%135 = add i32 %131, %48, !dbg !27
|
145 |
+
%136 = add i32 %132, %48, !dbg !27
|
146 |
+
%137 = sext i32 %133 to i64, !dbg !28
|
147 |
+
%138 = getelementptr float, ptr addrspace(1) %2, i64 %137, !dbg !28
|
148 |
+
%139 = sext i32 %134 to i64, !dbg !28
|
149 |
+
%140 = getelementptr float, ptr addrspace(1) %2, i64 %139, !dbg !28
|
150 |
+
%141 = sext i32 %135 to i64, !dbg !28
|
151 |
+
%142 = getelementptr float, ptr addrspace(1) %2, i64 %141, !dbg !28
|
152 |
+
%143 = sext i32 %136 to i64, !dbg !28
|
153 |
+
%144 = getelementptr float, ptr addrspace(1) %2, i64 %143, !dbg !28
|
154 |
+
%145 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %138, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
|
155 |
+
%146 = extractvalue { i32, i32, i32, i32 } %145, 0, !dbg !29
|
156 |
+
%147 = extractvalue { i32, i32, i32, i32 } %145, 1, !dbg !29
|
157 |
+
%148 = extractvalue { i32, i32, i32, i32 } %145, 2, !dbg !29
|
158 |
+
%149 = extractvalue { i32, i32, i32, i32 } %145, 3, !dbg !29
|
159 |
+
%150 = bitcast i32 %146 to float, !dbg !29
|
160 |
+
%151 = bitcast i32 %147 to float, !dbg !29
|
161 |
+
%152 = bitcast i32 %148 to float, !dbg !29
|
162 |
+
%153 = bitcast i32 %149 to float, !dbg !29
|
163 |
+
%154 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %140, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
|
164 |
+
%155 = extractvalue { i32, i32, i32, i32 } %154, 0, !dbg !29
|
165 |
+
%156 = extractvalue { i32, i32, i32, i32 } %154, 1, !dbg !29
|
166 |
+
%157 = extractvalue { i32, i32, i32, i32 } %154, 2, !dbg !29
|
167 |
+
%158 = extractvalue { i32, i32, i32, i32 } %154, 3, !dbg !29
|
168 |
+
%159 = bitcast i32 %155 to float, !dbg !29
|
169 |
+
%160 = bitcast i32 %156 to float, !dbg !29
|
170 |
+
%161 = bitcast i32 %157 to float, !dbg !29
|
171 |
+
%162 = bitcast i32 %158 to float, !dbg !29
|
172 |
+
%163 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %142, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
|
173 |
+
%164 = extractvalue { i32, i32, i32, i32 } %163, 0, !dbg !29
|
174 |
+
%165 = extractvalue { i32, i32, i32, i32 } %163, 1, !dbg !29
|
175 |
+
%166 = extractvalue { i32, i32, i32, i32 } %163, 2, !dbg !29
|
176 |
+
%167 = extractvalue { i32, i32, i32, i32 } %163, 3, !dbg !29
|
177 |
+
%168 = bitcast i32 %164 to float, !dbg !29
|
178 |
+
%169 = bitcast i32 %165 to float, !dbg !29
|
179 |
+
%170 = bitcast i32 %166 to float, !dbg !29
|
180 |
+
%171 = bitcast i32 %167 to float, !dbg !29
|
181 |
+
%172 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %144, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
|
182 |
+
%173 = extractvalue { i32, i32, i32, i32 } %172, 0, !dbg !29
|
183 |
+
%174 = extractvalue { i32, i32, i32, i32 } %172, 1, !dbg !29
|
184 |
+
%175 = extractvalue { i32, i32, i32, i32 } %172, 2, !dbg !29
|
185 |
+
%176 = extractvalue { i32, i32, i32, i32 } %172, 3, !dbg !29
|
186 |
+
%177 = bitcast i32 %173 to float, !dbg !29
|
187 |
+
%178 = bitcast i32 %174 to float, !dbg !29
|
188 |
+
%179 = bitcast i32 %175 to float, !dbg !29
|
189 |
+
%180 = bitcast i32 %176 to float, !dbg !29
|
190 |
+
%181 = add i32 %131, %49, !dbg !30
|
191 |
+
%182 = add i32 %131, %50, !dbg !30
|
192 |
+
%183 = sext i32 %181 to i64, !dbg !31
|
193 |
+
%184 = getelementptr i16, ptr addrspace(1) %3, i64 %183, !dbg !31
|
194 |
+
%185 = sext i32 %182 to i64, !dbg !31
|
195 |
+
%186 = getelementptr i16, ptr addrspace(1) %3, i64 %185, !dbg !31
|
196 |
+
%187 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %184, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
|
197 |
+
%188 = extractvalue { i32, i32, i32, i32 } %187, 0, !dbg !32
|
198 |
+
%189 = extractvalue { i32, i32, i32, i32 } %187, 1, !dbg !32
|
199 |
+
%190 = extractvalue { i32, i32, i32, i32 } %187, 2, !dbg !32
|
200 |
+
%191 = extractvalue { i32, i32, i32, i32 } %187, 3, !dbg !32
|
201 |
+
%192 = trunc i32 %188 to i16, !dbg !32
|
202 |
+
%extelt.offset9 = lshr i32 %188, 16, !dbg !32
|
203 |
+
%193 = trunc i32 %extelt.offset9 to i16, !dbg !32
|
204 |
+
%194 = trunc i32 %189 to i16, !dbg !32
|
205 |
+
%extelt.offset10 = lshr i32 %189, 16, !dbg !32
|
206 |
+
%195 = trunc i32 %extelt.offset10 to i16, !dbg !32
|
207 |
+
%196 = trunc i32 %190 to i16, !dbg !32
|
208 |
+
%extelt.offset11 = lshr i32 %190, 16, !dbg !32
|
209 |
+
%197 = trunc i32 %extelt.offset11 to i16, !dbg !32
|
210 |
+
%198 = trunc i32 %191 to i16, !dbg !32
|
211 |
+
%extelt.offset12 = lshr i32 %191, 16, !dbg !32
|
212 |
+
%199 = trunc i32 %extelt.offset12 to i16, !dbg !32
|
213 |
+
%200 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %186, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
|
214 |
+
%201 = extractvalue { i32, i32, i32, i32 } %200, 0, !dbg !32
|
215 |
+
%202 = extractvalue { i32, i32, i32, i32 } %200, 1, !dbg !32
|
216 |
+
%203 = extractvalue { i32, i32, i32, i32 } %200, 2, !dbg !32
|
217 |
+
%204 = extractvalue { i32, i32, i32, i32 } %200, 3, !dbg !32
|
218 |
+
%205 = trunc i32 %201 to i16, !dbg !32
|
219 |
+
%extelt.offset13 = lshr i32 %201, 16, !dbg !32
|
220 |
+
%206 = trunc i32 %extelt.offset13 to i16, !dbg !32
|
221 |
+
%207 = trunc i32 %202 to i16, !dbg !32
|
222 |
+
%extelt.offset14 = lshr i32 %202, 16, !dbg !32
|
223 |
+
%208 = trunc i32 %extelt.offset14 to i16, !dbg !32
|
224 |
+
%209 = trunc i32 %203 to i16, !dbg !32
|
225 |
+
%extelt.offset15 = lshr i32 %203, 16, !dbg !32
|
226 |
+
%210 = trunc i32 %extelt.offset15 to i16, !dbg !32
|
227 |
+
%211 = trunc i32 %204 to i16, !dbg !32
|
228 |
+
%extelt.offset16 = lshr i32 %204, 16, !dbg !32
|
229 |
+
%212 = trunc i32 %extelt.offset16 to i16, !dbg !32
|
230 |
+
%213 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %192) #6, !dbg !33
|
231 |
+
%214 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %193) #6, !dbg !33
|
232 |
+
%215 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %194) #6, !dbg !33
|
233 |
+
%216 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %195) #6, !dbg !33
|
234 |
+
%217 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %196) #6, !dbg !33
|
235 |
+
%218 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %197) #6, !dbg !33
|
236 |
+
%219 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %198) #6, !dbg !33
|
237 |
+
%220 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %199) #6, !dbg !33
|
238 |
+
%221 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %205) #6, !dbg !33
|
239 |
+
%222 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %206) #6, !dbg !33
|
240 |
+
%223 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %207) #6, !dbg !33
|
241 |
+
%224 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %208) #6, !dbg !33
|
242 |
+
%225 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %209) #6, !dbg !33
|
243 |
+
%226 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %210) #6, !dbg !33
|
244 |
+
%227 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %211) #6, !dbg !33
|
245 |
+
%228 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %212) #6, !dbg !33
|
246 |
+
br i1 %56, label %229, label %230, !dbg !34
|
247 |
+
|
248 |
+
229: ; preds = %65
|
249 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !34
|
250 |
+
br label %230, !dbg !34
|
251 |
+
|
252 |
+
230: ; preds = %229, %65
|
253 |
+
%231 = zext nneg i32 %131 to i64, !dbg !35
|
254 |
+
%232 = zext nneg i32 %132 to i64, !dbg !35
|
255 |
+
%233 = getelementptr float, ptr addrspace(1) %63, i64 %231, !dbg !36
|
256 |
+
%234 = getelementptr float, ptr addrspace(1) %63, i64 %232, !dbg !36
|
257 |
+
%235 = getelementptr float, ptr addrspace(1) %64, i64 %231, !dbg !36
|
258 |
+
%236 = getelementptr float, ptr addrspace(1) %64, i64 %232, !dbg !36
|
259 |
+
%237 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %233, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
|
260 |
+
%238 = extractvalue { i32, i32, i32, i32 } %237, 0, !dbg !37
|
261 |
+
%239 = extractvalue { i32, i32, i32, i32 } %237, 1, !dbg !37
|
262 |
+
%240 = extractvalue { i32, i32, i32, i32 } %237, 2, !dbg !37
|
263 |
+
%241 = extractvalue { i32, i32, i32, i32 } %237, 3, !dbg !37
|
264 |
+
%242 = bitcast i32 %238 to float, !dbg !37
|
265 |
+
%243 = bitcast i32 %239 to float, !dbg !37
|
266 |
+
%244 = bitcast i32 %240 to float, !dbg !37
|
267 |
+
%245 = bitcast i32 %241 to float, !dbg !37
|
268 |
+
%246 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %234, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
|
269 |
+
%247 = extractvalue { i32, i32, i32, i32 } %246, 0, !dbg !37
|
270 |
+
%248 = extractvalue { i32, i32, i32, i32 } %246, 1, !dbg !37
|
271 |
+
%249 = extractvalue { i32, i32, i32, i32 } %246, 2, !dbg !37
|
272 |
+
%250 = extractvalue { i32, i32, i32, i32 } %246, 3, !dbg !37
|
273 |
+
%251 = bitcast i32 %247 to float, !dbg !37
|
274 |
+
%252 = bitcast i32 %248 to float, !dbg !37
|
275 |
+
%253 = bitcast i32 %249 to float, !dbg !37
|
276 |
+
%254 = bitcast i32 %250 to float, !dbg !37
|
277 |
+
%255 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %235, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
|
278 |
+
%256 = extractvalue { i32, i32, i32, i32 } %255, 0, !dbg !37
|
279 |
+
%257 = extractvalue { i32, i32, i32, i32 } %255, 1, !dbg !37
|
280 |
+
%258 = extractvalue { i32, i32, i32, i32 } %255, 2, !dbg !37
|
281 |
+
%259 = extractvalue { i32, i32, i32, i32 } %255, 3, !dbg !37
|
282 |
+
%260 = bitcast i32 %256 to float, !dbg !37
|
283 |
+
%261 = bitcast i32 %257 to float, !dbg !37
|
284 |
+
%262 = bitcast i32 %258 to float, !dbg !37
|
285 |
+
%263 = bitcast i32 %259 to float, !dbg !37
|
286 |
+
%264 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %236, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
|
287 |
+
%265 = extractvalue { i32, i32, i32, i32 } %264, 0, !dbg !37
|
288 |
+
%266 = extractvalue { i32, i32, i32, i32 } %264, 1, !dbg !37
|
289 |
+
%267 = extractvalue { i32, i32, i32, i32 } %264, 2, !dbg !37
|
290 |
+
%268 = extractvalue { i32, i32, i32, i32 } %264, 3, !dbg !37
|
291 |
+
%269 = bitcast i32 %265 to float, !dbg !37
|
292 |
+
%270 = bitcast i32 %266 to float, !dbg !37
|
293 |
+
%271 = bitcast i32 %267 to float, !dbg !37
|
294 |
+
%272 = bitcast i32 %268 to float, !dbg !37
|
295 |
+
%273 = fadd float %150, %242, !dbg !38
|
296 |
+
%274 = fadd float %151, %243, !dbg !38
|
297 |
+
%275 = fadd float %152, %244, !dbg !38
|
298 |
+
%276 = fadd float %153, %245, !dbg !38
|
299 |
+
%277 = fadd float %159, %251, !dbg !38
|
300 |
+
%278 = fadd float %160, %252, !dbg !38
|
301 |
+
%279 = fadd float %161, %253, !dbg !38
|
302 |
+
%280 = fadd float %162, %254, !dbg !38
|
303 |
+
%281 = fadd float %168, %260, !dbg !38
|
304 |
+
%282 = fadd float %169, %261, !dbg !38
|
305 |
+
%283 = fadd float %170, %262, !dbg !38
|
306 |
+
%284 = fadd float %171, %263, !dbg !38
|
307 |
+
%285 = fadd float %177, %269, !dbg !38
|
308 |
+
%286 = fadd float %178, %270, !dbg !38
|
309 |
+
%287 = fadd float %179, %271, !dbg !38
|
310 |
+
%288 = fadd float %180, %272, !dbg !38
|
311 |
+
%289 = fadd float %213, %273, !dbg !39
|
312 |
+
%290 = fadd float %214, %274, !dbg !39
|
313 |
+
%291 = fadd float %215, %275, !dbg !39
|
314 |
+
%292 = fadd float %216, %276, !dbg !39
|
315 |
+
%293 = fadd float %217, %277, !dbg !39
|
316 |
+
%294 = fadd float %218, %278, !dbg !39
|
317 |
+
%295 = fadd float %219, %279, !dbg !39
|
318 |
+
%296 = fadd float %220, %280, !dbg !39
|
319 |
+
%297 = fadd float %221, %281, !dbg !39
|
320 |
+
%298 = fadd float %222, %282, !dbg !39
|
321 |
+
%299 = fadd float %223, %283, !dbg !39
|
322 |
+
%300 = fadd float %224, %284, !dbg !39
|
323 |
+
%301 = fadd float %225, %285, !dbg !39
|
324 |
+
%302 = fadd float %226, %286, !dbg !39
|
325 |
+
%303 = fadd float %227, %287, !dbg !39
|
326 |
+
%304 = fadd float %228, %288, !dbg !39
|
327 |
+
%305 = fsub float %289, %114, !dbg !40
|
328 |
+
%306 = fsub float %290, %115, !dbg !40
|
329 |
+
%307 = fsub float %291, %116, !dbg !40
|
330 |
+
%308 = fsub float %292, %117, !dbg !40
|
331 |
+
%309 = fsub float %293, %118, !dbg !40
|
332 |
+
%310 = fsub float %294, %119, !dbg !40
|
333 |
+
%311 = fsub float %295, %120, !dbg !40
|
334 |
+
%312 = fsub float %296, %121, !dbg !40
|
335 |
+
%313 = fsub float %297, %122, !dbg !40
|
336 |
+
%314 = fsub float %298, %123, !dbg !40
|
337 |
+
%315 = fsub float %299, %124, !dbg !40
|
338 |
+
%316 = fsub float %300, %125, !dbg !40
|
339 |
+
%317 = fsub float %301, %126, !dbg !40
|
340 |
+
%318 = fsub float %302, %127, !dbg !40
|
341 |
+
%319 = fsub float %303, %128, !dbg !40
|
342 |
+
%320 = fsub float %304, %129, !dbg !40
|
343 |
+
%321 = fadd float %66, 1.000000e+00, !dbg !44
|
344 |
+
%322 = fadd float %67, 1.000000e+00, !dbg !44
|
345 |
+
%323 = fadd float %68, 1.000000e+00, !dbg !44
|
346 |
+
%324 = fadd float %69, 1.000000e+00, !dbg !44
|
347 |
+
%325 = fadd float %70, 1.000000e+00, !dbg !44
|
348 |
+
%326 = fadd float %71, 1.000000e+00, !dbg !44
|
349 |
+
%327 = fadd float %72, 1.000000e+00, !dbg !44
|
350 |
+
%328 = fadd float %73, 1.000000e+00, !dbg !44
|
351 |
+
%329 = fadd float %74, 1.000000e+00, !dbg !44
|
352 |
+
%330 = fadd float %75, 1.000000e+00, !dbg !44
|
353 |
+
%331 = fadd float %76, 1.000000e+00, !dbg !44
|
354 |
+
%332 = fadd float %77, 1.000000e+00, !dbg !44
|
355 |
+
%333 = fadd float %78, 1.000000e+00, !dbg !44
|
356 |
+
%334 = fadd float %79, 1.000000e+00, !dbg !44
|
357 |
+
%335 = fadd float %80, 1.000000e+00, !dbg !44
|
358 |
+
%336 = fadd float %81, 1.000000e+00, !dbg !44
|
359 |
+
%337 = fadd float %82, 1.000000e+00, !dbg !44
|
360 |
+
%338 = fadd float %83, 1.000000e+00, !dbg !44
|
361 |
+
%339 = fadd float %84, 1.000000e+00, !dbg !44
|
362 |
+
%340 = fadd float %85, 1.000000e+00, !dbg !44
|
363 |
+
%341 = fadd float %86, 1.000000e+00, !dbg !44
|
364 |
+
%342 = fadd float %87, 1.000000e+00, !dbg !44
|
365 |
+
%343 = fadd float %88, 1.000000e+00, !dbg !44
|
366 |
+
%344 = fadd float %89, 1.000000e+00, !dbg !44
|
367 |
+
%345 = fadd float %90, 1.000000e+00, !dbg !44
|
368 |
+
%346 = fadd float %91, 1.000000e+00, !dbg !44
|
369 |
+
%347 = fadd float %92, 1.000000e+00, !dbg !44
|
370 |
+
%348 = fadd float %93, 1.000000e+00, !dbg !44
|
371 |
+
%349 = fadd float %94, 1.000000e+00, !dbg !44
|
372 |
+
%350 = fadd float %95, 1.000000e+00, !dbg !44
|
373 |
+
%351 = fadd float %96, 1.000000e+00, !dbg !44
|
374 |
+
%352 = fadd float %97, 1.000000e+00, !dbg !44
|
375 |
+
%353 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %305, float %321) #6, !dbg !45
|
376 |
+
%354 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %306, float %322) #6, !dbg !45
|
377 |
+
%355 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %307, float %323) #6, !dbg !45
|
378 |
+
%356 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %308, float %324) #6, !dbg !45
|
379 |
+
%357 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %309, float %325) #6, !dbg !45
|
380 |
+
%358 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %310, float %326) #6, !dbg !45
|
381 |
+
%359 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %311, float %327) #6, !dbg !45
|
382 |
+
%360 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %312, float %328) #6, !dbg !45
|
383 |
+
%361 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %313, float %329) #6, !dbg !45
|
384 |
+
%362 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %314, float %330) #6, !dbg !45
|
385 |
+
%363 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %315, float %331) #6, !dbg !45
|
386 |
+
%364 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %316, float %332) #6, !dbg !45
|
387 |
+
%365 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %317, float %333) #6, !dbg !45
|
388 |
+
%366 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %318, float %334) #6, !dbg !45
|
389 |
+
%367 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %319, float %335) #6, !dbg !45
|
390 |
+
%368 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %320, float %336) #6, !dbg !45
|
391 |
+
%369 = fadd float %114, %353, !dbg !46
|
392 |
+
%370 = fadd float %115, %354, !dbg !46
|
393 |
+
%371 = fadd float %116, %355, !dbg !46
|
394 |
+
%372 = fadd float %117, %356, !dbg !46
|
395 |
+
%373 = fadd float %118, %357, !dbg !46
|
396 |
+
%374 = fadd float %119, %358, !dbg !46
|
397 |
+
%375 = fadd float %120, %359, !dbg !46
|
398 |
+
%376 = fadd float %121, %360, !dbg !46
|
399 |
+
%377 = fadd float %122, %361, !dbg !46
|
400 |
+
%378 = fadd float %123, %362, !dbg !46
|
401 |
+
%379 = fadd float %124, %363, !dbg !46
|
402 |
+
%380 = fadd float %125, %364, !dbg !46
|
403 |
+
%381 = fadd float %126, %365, !dbg !46
|
404 |
+
%382 = fadd float %127, %366, !dbg !46
|
405 |
+
%383 = fadd float %128, %367, !dbg !46
|
406 |
+
%384 = fadd float %129, %368, !dbg !46
|
407 |
+
%385 = fsub float %289, %369, !dbg !47
|
408 |
+
%386 = fsub float %290, %370, !dbg !47
|
409 |
+
%387 = fsub float %291, %371, !dbg !47
|
410 |
+
%388 = fsub float %292, %372, !dbg !47
|
411 |
+
%389 = fsub float %293, %373, !dbg !47
|
412 |
+
%390 = fsub float %294, %374, !dbg !47
|
413 |
+
%391 = fsub float %295, %375, !dbg !47
|
414 |
+
%392 = fsub float %296, %376, !dbg !47
|
415 |
+
%393 = fsub float %297, %377, !dbg !47
|
416 |
+
%394 = fsub float %298, %378, !dbg !47
|
417 |
+
%395 = fsub float %299, %379, !dbg !47
|
418 |
+
%396 = fsub float %300, %380, !dbg !47
|
419 |
+
%397 = fsub float %301, %381, !dbg !47
|
420 |
+
%398 = fsub float %302, %382, !dbg !47
|
421 |
+
%399 = fsub float %303, %383, !dbg !47
|
422 |
+
%400 = fsub float %304, %384, !dbg !47
|
423 |
+
%401 = fmul float %305, %385, !dbg !48
|
424 |
+
%402 = fmul float %306, %386, !dbg !48
|
425 |
+
%403 = fmul float %307, %387, !dbg !48
|
426 |
+
%404 = fmul float %308, %388, !dbg !48
|
427 |
+
%405 = fmul float %309, %389, !dbg !48
|
428 |
+
%406 = fmul float %310, %390, !dbg !48
|
429 |
+
%407 = fmul float %311, %391, !dbg !48
|
430 |
+
%408 = fmul float %312, %392, !dbg !48
|
431 |
+
%409 = fmul float %313, %393, !dbg !48
|
432 |
+
%410 = fmul float %314, %394, !dbg !48
|
433 |
+
%411 = fmul float %315, %395, !dbg !48
|
434 |
+
%412 = fmul float %316, %396, !dbg !48
|
435 |
+
%413 = fmul float %317, %397, !dbg !48
|
436 |
+
%414 = fmul float %318, %398, !dbg !48
|
437 |
+
%415 = fmul float %319, %399, !dbg !48
|
438 |
+
%416 = fmul float %320, %400, !dbg !48
|
439 |
+
%417 = fadd float %98, %401, !dbg !49
|
440 |
+
%418 = fadd float %99, %402, !dbg !49
|
441 |
+
%419 = fadd float %100, %403, !dbg !49
|
442 |
+
%420 = fadd float %101, %404, !dbg !49
|
443 |
+
%421 = fadd float %102, %405, !dbg !49
|
444 |
+
%422 = fadd float %103, %406, !dbg !49
|
445 |
+
%423 = fadd float %104, %407, !dbg !49
|
446 |
+
%424 = fadd float %105, %408, !dbg !49
|
447 |
+
%425 = fadd float %106, %409, !dbg !49
|
448 |
+
%426 = fadd float %107, %410, !dbg !49
|
449 |
+
%427 = fadd float %108, %411, !dbg !49
|
450 |
+
%428 = fadd float %109, %412, !dbg !49
|
451 |
+
%429 = fadd float %110, %413, !dbg !49
|
452 |
+
%430 = fadd float %111, %414, !dbg !49
|
453 |
+
%431 = fadd float %112, %415, !dbg !49
|
454 |
+
%432 = fadd float %113, %416, !dbg !49
|
455 |
+
%433 = add nuw nsw i32 %130, 64, !dbg !12
|
456 |
+
%434 = icmp ult i32 %130, 192, !dbg !12
|
457 |
+
br i1 %434, label %65, label %435, !dbg !12
|
458 |
+
|
459 |
+
435: ; preds = %230
|
460 |
+
%436 = and i32 %16, 3, !dbg !12
|
461 |
+
%437 = mul nuw nsw i32 %436, 72, !dbg !12
|
462 |
+
%438 = add nuw nsw i32 %437, %12, !dbg !12
|
463 |
+
%439 = zext nneg i32 %438 to i64, !dbg !12
|
464 |
+
%440 = getelementptr float, ptr addrspace(3) @global_smem, i64 %439, !dbg !12
|
465 |
+
%441 = insertelement <1 x float> undef, float %337, i64 0, !dbg !12
|
466 |
+
store <1 x float> %441, ptr addrspace(3) %440, align 4, !dbg !12
|
467 |
+
%442 = add nuw nsw i32 %12, 288, !dbg !12
|
468 |
+
%443 = add nuw nsw i32 %442, %437, !dbg !12
|
469 |
+
%444 = zext nneg i32 %443 to i64, !dbg !12
|
470 |
+
%445 = getelementptr float, ptr addrspace(3) @global_smem, i64 %444, !dbg !12
|
471 |
+
%446 = insertelement <1 x float> undef, float %338, i64 0, !dbg !12
|
472 |
+
store <1 x float> %446, ptr addrspace(3) %445, align 4, !dbg !12
|
473 |
+
%447 = or i32 %12, 576, !dbg !12
|
474 |
+
%448 = add nuw nsw i32 %447, %437, !dbg !12
|
475 |
+
%449 = zext nneg i32 %448 to i64, !dbg !12
|
476 |
+
%450 = getelementptr float, ptr addrspace(3) @global_smem, i64 %449, !dbg !12
|
477 |
+
%451 = insertelement <1 x float> undef, float %339, i64 0, !dbg !12
|
478 |
+
store <1 x float> %451, ptr addrspace(3) %450, align 4, !dbg !12
|
479 |
+
%452 = add nuw nsw i32 %12, 864, !dbg !12
|
480 |
+
%453 = add nuw nsw i32 %452, %437, !dbg !12
|
481 |
+
%454 = zext nneg i32 %453 to i64, !dbg !12
|
482 |
+
%455 = getelementptr float, ptr addrspace(3) @global_smem, i64 %454, !dbg !12
|
483 |
+
%456 = insertelement <1 x float> undef, float %340, i64 0, !dbg !12
|
484 |
+
store <1 x float> %456, ptr addrspace(3) %455, align 4, !dbg !12
|
485 |
+
%457 = or i32 %12, 1152, !dbg !12
|
486 |
+
%458 = add nuw nsw i32 %457, %437, !dbg !12
|
487 |
+
%459 = zext nneg i32 %458 to i64, !dbg !12
|
488 |
+
%460 = getelementptr float, ptr addrspace(3) @global_smem, i64 %459, !dbg !12
|
489 |
+
%461 = insertelement <1 x float> undef, float %341, i64 0, !dbg !12
|
490 |
+
store <1 x float> %461, ptr addrspace(3) %460, align 4, !dbg !12
|
491 |
+
%462 = add nuw nsw i32 %12, 1440, !dbg !12
|
492 |
+
%463 = add nuw nsw i32 %462, %437, !dbg !12
|
493 |
+
%464 = zext nneg i32 %463 to i64, !dbg !12
|
494 |
+
%465 = getelementptr float, ptr addrspace(3) @global_smem, i64 %464, !dbg !12
|
495 |
+
%466 = insertelement <1 x float> undef, float %342, i64 0, !dbg !12
|
496 |
+
store <1 x float> %466, ptr addrspace(3) %465, align 4, !dbg !12
|
497 |
+
%467 = or i32 %12, 1728, !dbg !12
|
498 |
+
%468 = add nuw nsw i32 %467, %437, !dbg !12
|
499 |
+
%469 = zext nneg i32 %468 to i64, !dbg !12
|
500 |
+
%470 = getelementptr float, ptr addrspace(3) @global_smem, i64 %469, !dbg !12
|
501 |
+
%471 = insertelement <1 x float> undef, float %343, i64 0, !dbg !12
|
502 |
+
store <1 x float> %471, ptr addrspace(3) %470, align 4, !dbg !12
|
503 |
+
%472 = add nuw nsw i32 %12, 2016, !dbg !12
|
504 |
+
%473 = add nuw nsw i32 %472, %437, !dbg !12
|
505 |
+
%474 = zext nneg i32 %473 to i64, !dbg !12
|
506 |
+
%475 = getelementptr float, ptr addrspace(3) @global_smem, i64 %474, !dbg !12
|
507 |
+
%476 = insertelement <1 x float> undef, float %344, i64 0, !dbg !12
|
508 |
+
store <1 x float> %476, ptr addrspace(3) %475, align 4, !dbg !12
|
509 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !12
|
510 |
+
%477 = mul nuw nsw i32 %11, 72, !dbg !12
|
511 |
+
%478 = add nuw nsw i32 %477, %14, !dbg !12
|
512 |
+
%479 = zext nneg i32 %478 to i64, !dbg !12
|
513 |
+
%480 = getelementptr float, ptr addrspace(3) @global_smem, i64 %479, !dbg !12
|
514 |
+
%481 = load float, ptr addrspace(3) %480, align 32, !dbg !12
|
515 |
+
%482 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 1, !dbg !12
|
516 |
+
%483 = load float, ptr addrspace(3) %482, align 4, !dbg !12
|
517 |
+
%484 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 2, !dbg !12
|
518 |
+
%485 = load float, ptr addrspace(3) %484, align 8, !dbg !12
|
519 |
+
%486 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 3, !dbg !12
|
520 |
+
%487 = load float, ptr addrspace(3) %486, align 4, !dbg !12
|
521 |
+
%488 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 4, !dbg !12
|
522 |
+
%489 = load float, ptr addrspace(3) %488, align 16, !dbg !12
|
523 |
+
%490 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 5, !dbg !12
|
524 |
+
%491 = load float, ptr addrspace(3) %490, align 4, !dbg !12
|
525 |
+
%492 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 6, !dbg !12
|
526 |
+
%493 = load float, ptr addrspace(3) %492, align 8, !dbg !12
|
527 |
+
%494 = getelementptr inbounds <8 x float>, ptr addrspace(3) %480, i64 0, i64 7, !dbg !12
|
528 |
+
%495 = load float, ptr addrspace(3) %494, align 4, !dbg !12
|
529 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !12
|
530 |
+
%496 = insertelement <1 x float> undef, float %345, i64 0, !dbg !12
|
531 |
+
store <1 x float> %496, ptr addrspace(3) %440, align 4, !dbg !12
|
532 |
+
%497 = insertelement <1 x float> undef, float %346, i64 0, !dbg !12
|
533 |
+
store <1 x float> %497, ptr addrspace(3) %445, align 4, !dbg !12
|
534 |
+
%498 = insertelement <1 x float> undef, float %347, i64 0, !dbg !12
|
535 |
+
store <1 x float> %498, ptr addrspace(3) %450, align 4, !dbg !12
|
536 |
+
%499 = insertelement <1 x float> undef, float %348, i64 0, !dbg !12
|
537 |
+
store <1 x float> %499, ptr addrspace(3) %455, align 4, !dbg !12
|
538 |
+
%500 = insertelement <1 x float> undef, float %349, i64 0, !dbg !12
|
539 |
+
store <1 x float> %500, ptr addrspace(3) %460, align 4, !dbg !12
|
540 |
+
%501 = insertelement <1 x float> undef, float %350, i64 0, !dbg !12
|
541 |
+
store <1 x float> %501, ptr addrspace(3) %465, align 4, !dbg !12
|
542 |
+
%502 = insertelement <1 x float> undef, float %351, i64 0, !dbg !12
|
543 |
+
store <1 x float> %502, ptr addrspace(3) %470, align 4, !dbg !12
|
544 |
+
%503 = insertelement <1 x float> undef, float %352, i64 0, !dbg !12
|
545 |
+
store <1 x float> %503, ptr addrspace(3) %475, align 4, !dbg !12
|
546 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !12
|
547 |
+
%504 = load float, ptr addrspace(3) %480, align 32, !dbg !12
|
548 |
+
%505 = load float, ptr addrspace(3) %482, align 4, !dbg !12
|
549 |
+
%506 = load float, ptr addrspace(3) %484, align 8, !dbg !12
|
550 |
+
%507 = load float, ptr addrspace(3) %486, align 4, !dbg !12
|
551 |
+
%508 = load float, ptr addrspace(3) %488, align 16, !dbg !12
|
552 |
+
%509 = load float, ptr addrspace(3) %490, align 4, !dbg !12
|
553 |
+
%510 = load float, ptr addrspace(3) %492, align 8, !dbg !12
|
554 |
+
%511 = load float, ptr addrspace(3) %494, align 4, !dbg !12
|
555 |
+
%512 = fsub float %370, %369, !dbg !50
|
556 |
+
%513 = fadd float %481, %483, !dbg !54
|
557 |
+
%514 = fcmp oeq float %513, 0.000000e+00, !dbg !55
|
558 |
+
%515 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %483, float %513) #6, !dbg !56
|
559 |
+
%516 = select i1 %514, float 0.000000e+00, float %515, !dbg !57
|
560 |
+
%517 = fmul float %512, %516, !dbg !58
|
561 |
+
%518 = fadd float %369, %517, !dbg !59
|
562 |
+
%519 = fadd float %417, %418, !dbg !60
|
563 |
+
%520 = fmul float %512, %512, !dbg !61
|
564 |
+
%521 = fmul float %520, %481, !dbg !62
|
565 |
+
%522 = fmul float %521, %516, !dbg !63
|
566 |
+
%523 = fadd float %519, %522, !dbg !64
|
567 |
+
%524 = fsub float %371, %518, !dbg !50
|
568 |
+
%525 = fadd float %485, %513, !dbg !54
|
569 |
+
%526 = fcmp oeq float %525, 0.000000e+00, !dbg !55
|
570 |
+
%527 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %485, float %525) #6, !dbg !56
|
571 |
+
%528 = select i1 %526, float 0.000000e+00, float %527, !dbg !57
|
572 |
+
%529 = fmul float %528, %524, !dbg !58
|
573 |
+
%530 = fadd float %518, %529, !dbg !59
|
574 |
+
%531 = fadd float %419, %523, !dbg !60
|
575 |
+
%532 = fmul float %524, %524, !dbg !61
|
576 |
+
%533 = fmul float %513, %532, !dbg !62
|
577 |
+
%534 = fmul float %528, %533, !dbg !63
|
578 |
+
%535 = fadd float %531, %534, !dbg !64
|
579 |
+
%536 = fsub float %372, %530, !dbg !50
|
580 |
+
%537 = fadd float %487, %525, !dbg !54
|
581 |
+
%538 = fcmp oeq float %537, 0.000000e+00, !dbg !55
|
582 |
+
%539 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %487, float %537) #6, !dbg !56
|
583 |
+
%540 = select i1 %538, float 0.000000e+00, float %539, !dbg !57
|
584 |
+
%541 = fmul float %540, %536, !dbg !58
|
585 |
+
%542 = fadd float %530, %541, !dbg !59
|
586 |
+
%543 = fadd float %420, %535, !dbg !60
|
587 |
+
%544 = fmul float %536, %536, !dbg !61
|
588 |
+
%545 = fmul float %525, %544, !dbg !62
|
589 |
+
%546 = fmul float %540, %545, !dbg !63
|
590 |
+
%547 = fadd float %543, %546, !dbg !64
|
591 |
+
%548 = fsub float %373, %542, !dbg !50
|
592 |
+
%549 = fadd float %489, %537, !dbg !54
|
593 |
+
%550 = fcmp oeq float %549, 0.000000e+00, !dbg !55
|
594 |
+
%551 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %489, float %549) #6, !dbg !56
|
595 |
+
%552 = select i1 %550, float 0.000000e+00, float %551, !dbg !57
|
596 |
+
%553 = fmul float %552, %548, !dbg !58
|
597 |
+
%554 = fadd float %542, %553, !dbg !59
|
598 |
+
%555 = fadd float %421, %547, !dbg !60
|
599 |
+
%556 = fmul float %548, %548, !dbg !61
|
600 |
+
%557 = fmul float %537, %556, !dbg !62
|
601 |
+
%558 = fmul float %552, %557, !dbg !63
|
602 |
+
%559 = fadd float %555, %558, !dbg !64
|
603 |
+
%560 = fsub float %374, %554, !dbg !50
|
604 |
+
%561 = fadd float %491, %549, !dbg !54
|
605 |
+
%562 = fcmp oeq float %561, 0.000000e+00, !dbg !55
|
606 |
+
%563 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %491, float %561) #6, !dbg !56
|
607 |
+
%564 = select i1 %562, float 0.000000e+00, float %563, !dbg !57
|
608 |
+
%565 = fmul float %564, %560, !dbg !58
|
609 |
+
%566 = fadd float %554, %565, !dbg !59
|
610 |
+
%567 = fadd float %422, %559, !dbg !60
|
611 |
+
%568 = fmul float %560, %560, !dbg !61
|
612 |
+
%569 = fmul float %549, %568, !dbg !62
|
613 |
+
%570 = fmul float %564, %569, !dbg !63
|
614 |
+
%571 = fadd float %567, %570, !dbg !64
|
615 |
+
%572 = fsub float %375, %566, !dbg !50
|
616 |
+
%573 = fadd float %493, %561, !dbg !54
|
617 |
+
%574 = fcmp oeq float %573, 0.000000e+00, !dbg !55
|
618 |
+
%575 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %493, float %573) #6, !dbg !56
|
619 |
+
%576 = select i1 %574, float 0.000000e+00, float %575, !dbg !57
|
620 |
+
%577 = fmul float %576, %572, !dbg !58
|
621 |
+
%578 = fadd float %566, %577, !dbg !59
|
622 |
+
%579 = fadd float %423, %571, !dbg !60
|
623 |
+
%580 = fmul float %572, %572, !dbg !61
|
624 |
+
%581 = fmul float %561, %580, !dbg !62
|
625 |
+
%582 = fmul float %576, %581, !dbg !63
|
626 |
+
%583 = fadd float %579, %582, !dbg !64
|
627 |
+
%584 = fsub float %376, %578, !dbg !50
|
628 |
+
%585 = fadd float %495, %573, !dbg !54
|
629 |
+
%586 = fcmp oeq float %585, 0.000000e+00, !dbg !55
|
630 |
+
%587 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %495, float %585) #6, !dbg !56
|
631 |
+
%588 = select i1 %586, float 0.000000e+00, float %587, !dbg !57
|
632 |
+
%589 = fmul float %588, %584, !dbg !58
|
633 |
+
%590 = fadd float %578, %589, !dbg !59
|
634 |
+
%591 = fadd float %424, %583, !dbg !60
|
635 |
+
%592 = fmul float %584, %584, !dbg !61
|
636 |
+
%593 = fmul float %573, %592, !dbg !62
|
637 |
+
%594 = fmul float %588, %593, !dbg !63
|
638 |
+
%595 = fadd float %591, %594, !dbg !64
|
639 |
+
%596 = fsub float %378, %377, !dbg !50
|
640 |
+
%597 = fadd float %504, %505, !dbg !54
|
641 |
+
%598 = fcmp oeq float %597, 0.000000e+00, !dbg !55
|
642 |
+
%599 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %505, float %597) #6, !dbg !56
|
643 |
+
%600 = select i1 %598, float 0.000000e+00, float %599, !dbg !57
|
644 |
+
%601 = fmul float %596, %600, !dbg !58
|
645 |
+
%602 = fadd float %377, %601, !dbg !59
|
646 |
+
%603 = fadd float %425, %426, !dbg !60
|
647 |
+
%604 = fmul float %596, %596, !dbg !61
|
648 |
+
%605 = fmul float %604, %504, !dbg !62
|
649 |
+
%606 = fmul float %605, %600, !dbg !63
|
650 |
+
%607 = fadd float %603, %606, !dbg !64
|
651 |
+
%608 = fsub float %379, %602, !dbg !50
|
652 |
+
%609 = fadd float %506, %597, !dbg !54
|
653 |
+
%610 = fcmp oeq float %609, 0.000000e+00, !dbg !55
|
654 |
+
%611 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %506, float %609) #6, !dbg !56
|
655 |
+
%612 = select i1 %610, float 0.000000e+00, float %611, !dbg !57
|
656 |
+
%613 = fmul float %612, %608, !dbg !58
|
657 |
+
%614 = fadd float %602, %613, !dbg !59
|
658 |
+
%615 = fadd float %427, %607, !dbg !60
|
659 |
+
%616 = fmul float %608, %608, !dbg !61
|
660 |
+
%617 = fmul float %597, %616, !dbg !62
|
661 |
+
%618 = fmul float %612, %617, !dbg !63
|
662 |
+
%619 = fadd float %615, %618, !dbg !64
|
663 |
+
%620 = fsub float %380, %614, !dbg !50
|
664 |
+
%621 = fadd float %507, %609, !dbg !54
|
665 |
+
%622 = fcmp oeq float %621, 0.000000e+00, !dbg !55
|
666 |
+
%623 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %507, float %621) #6, !dbg !56
|
667 |
+
%624 = select i1 %622, float 0.000000e+00, float %623, !dbg !57
|
668 |
+
%625 = fmul float %624, %620, !dbg !58
|
669 |
+
%626 = fadd float %614, %625, !dbg !59
|
670 |
+
%627 = fadd float %428, %619, !dbg !60
|
671 |
+
%628 = fmul float %620, %620, !dbg !61
|
672 |
+
%629 = fmul float %609, %628, !dbg !62
|
673 |
+
%630 = fmul float %624, %629, !dbg !63
|
674 |
+
%631 = fadd float %627, %630, !dbg !64
|
675 |
+
%632 = fsub float %381, %626, !dbg !50
|
676 |
+
%633 = fadd float %508, %621, !dbg !54
|
677 |
+
%634 = fcmp oeq float %633, 0.000000e+00, !dbg !55
|
678 |
+
%635 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %508, float %633) #6, !dbg !56
|
679 |
+
%636 = select i1 %634, float 0.000000e+00, float %635, !dbg !57
|
680 |
+
%637 = fmul float %636, %632, !dbg !58
|
681 |
+
%638 = fadd float %626, %637, !dbg !59
|
682 |
+
%639 = fadd float %429, %631, !dbg !60
|
683 |
+
%640 = fmul float %632, %632, !dbg !61
|
684 |
+
%641 = fmul float %621, %640, !dbg !62
|
685 |
+
%642 = fmul float %636, %641, !dbg !63
|
686 |
+
%643 = fadd float %639, %642, !dbg !64
|
687 |
+
%644 = fsub float %382, %638, !dbg !50
|
688 |
+
%645 = fadd float %509, %633, !dbg !54
|
689 |
+
%646 = fcmp oeq float %645, 0.000000e+00, !dbg !55
|
690 |
+
%647 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %509, float %645) #6, !dbg !56
|
691 |
+
%648 = select i1 %646, float 0.000000e+00, float %647, !dbg !57
|
692 |
+
%649 = fmul float %648, %644, !dbg !58
|
693 |
+
%650 = fadd float %638, %649, !dbg !59
|
694 |
+
%651 = fadd float %430, %643, !dbg !60
|
695 |
+
%652 = fmul float %644, %644, !dbg !61
|
696 |
+
%653 = fmul float %633, %652, !dbg !62
|
697 |
+
%654 = fmul float %648, %653, !dbg !63
|
698 |
+
%655 = fadd float %651, %654, !dbg !64
|
699 |
+
%656 = fsub float %383, %650, !dbg !50
|
700 |
+
%657 = fadd float %510, %645, !dbg !54
|
701 |
+
%658 = fcmp oeq float %657, 0.000000e+00, !dbg !55
|
702 |
+
%659 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %510, float %657) #6, !dbg !56
|
703 |
+
%660 = select i1 %658, float 0.000000e+00, float %659, !dbg !57
|
704 |
+
%661 = fmul float %660, %656, !dbg !58
|
705 |
+
%662 = fadd float %650, %661, !dbg !59
|
706 |
+
%663 = fadd float %431, %655, !dbg !60
|
707 |
+
%664 = fmul float %656, %656, !dbg !61
|
708 |
+
%665 = fmul float %645, %664, !dbg !62
|
709 |
+
%666 = fmul float %660, %665, !dbg !63
|
710 |
+
%667 = fadd float %663, %666, !dbg !64
|
711 |
+
%668 = fsub float %384, %662, !dbg !50
|
712 |
+
%669 = fadd float %511, %657, !dbg !54
|
713 |
+
%670 = fcmp oeq float %669, 0.000000e+00, !dbg !55
|
714 |
+
%671 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %511, float %669) #6, !dbg !56
|
715 |
+
%672 = select i1 %670, float 0.000000e+00, float %671, !dbg !57
|
716 |
+
%673 = fmul float %672, %668, !dbg !58
|
717 |
+
%674 = fadd float %662, %673, !dbg !59
|
718 |
+
%675 = fadd float %432, %667, !dbg !60
|
719 |
+
%676 = fmul float %668, %668, !dbg !61
|
720 |
+
%677 = fmul float %657, %676, !dbg !62
|
721 |
+
%678 = fmul float %672, %677, !dbg !63
|
722 |
+
%679 = fadd float %675, %678, !dbg !64
|
723 |
+
%680 = bitcast float %590 to i32, !dbg !65
|
724 |
+
%681 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %680, i32 4, i32 31), !dbg !65
|
725 |
+
%682 = bitcast i32 %681 to float, !dbg !65
|
726 |
+
%683 = bitcast float %595 to i32, !dbg !65
|
727 |
+
%684 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %683, i32 4, i32 31), !dbg !65
|
728 |
+
%685 = bitcast i32 %684 to float, !dbg !65
|
729 |
+
%686 = bitcast float %585 to i32, !dbg !65
|
730 |
+
%687 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %686, i32 4, i32 31), !dbg !65
|
731 |
+
%688 = bitcast i32 %687 to float, !dbg !65
|
732 |
+
%689 = fsub float %682, %590, !dbg !50
|
733 |
+
%690 = fadd float %585, %688, !dbg !54
|
734 |
+
%691 = fcmp oeq float %690, 0.000000e+00, !dbg !55
|
735 |
+
%692 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %688, float %690) #6, !dbg !56
|
736 |
+
%693 = select i1 %691, float 0.000000e+00, float %692, !dbg !57
|
737 |
+
%694 = fmul float %693, %689, !dbg !58
|
738 |
+
%695 = fadd float %590, %694, !dbg !59
|
739 |
+
%696 = fadd float %595, %685, !dbg !60
|
740 |
+
%697 = fmul float %689, %689, !dbg !61
|
741 |
+
%698 = fmul float %585, %697, !dbg !62
|
742 |
+
%699 = fmul float %693, %698, !dbg !63
|
743 |
+
%700 = fadd float %696, %699, !dbg !64
|
744 |
+
%701 = bitcast float %695 to i32, !dbg !65
|
745 |
+
%702 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %701, i32 2, i32 31), !dbg !65
|
746 |
+
%703 = bitcast i32 %702 to float, !dbg !65
|
747 |
+
%704 = bitcast float %700 to i32, !dbg !65
|
748 |
+
%705 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %704, i32 2, i32 31), !dbg !65
|
749 |
+
%706 = bitcast i32 %705 to float, !dbg !65
|
750 |
+
%707 = bitcast float %690 to i32, !dbg !65
|
751 |
+
%708 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %707, i32 2, i32 31), !dbg !65
|
752 |
+
%709 = bitcast i32 %708 to float, !dbg !65
|
753 |
+
%710 = fsub float %703, %695, !dbg !50
|
754 |
+
%711 = fadd float %690, %709, !dbg !54
|
755 |
+
%712 = fcmp oeq float %711, 0.000000e+00, !dbg !55
|
756 |
+
%713 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %709, float %711) #6, !dbg !56
|
757 |
+
%714 = select i1 %712, float 0.000000e+00, float %713, !dbg !57
|
758 |
+
%715 = fmul float %714, %710, !dbg !58
|
759 |
+
%716 = fadd float %695, %715, !dbg !59
|
760 |
+
%717 = fadd float %700, %706, !dbg !60
|
761 |
+
%718 = fmul float %710, %710, !dbg !61
|
762 |
+
%719 = fmul float %690, %718, !dbg !62
|
763 |
+
%720 = fmul float %714, %719, !dbg !63
|
764 |
+
%721 = fadd float %717, %720, !dbg !64
|
765 |
+
%722 = bitcast float %716 to i32, !dbg !65
|
766 |
+
%723 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %722, i32 1, i32 31), !dbg !65
|
767 |
+
%724 = bitcast i32 %723 to float, !dbg !65
|
768 |
+
%725 = bitcast float %721 to i32, !dbg !65
|
769 |
+
%726 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %725, i32 1, i32 31), !dbg !65
|
770 |
+
%727 = bitcast i32 %726 to float, !dbg !65
|
771 |
+
%728 = bitcast float %711 to i32, !dbg !65
|
772 |
+
%729 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %728, i32 1, i32 31), !dbg !65
|
773 |
+
%730 = bitcast i32 %729 to float, !dbg !65
|
774 |
+
%731 = fsub float %724, %716, !dbg !50
|
775 |
+
%732 = fadd float %711, %730, !dbg !54
|
776 |
+
%733 = fcmp oeq float %732, 0.000000e+00, !dbg !55
|
777 |
+
%734 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %730, float %732) #6, !dbg !56
|
778 |
+
%735 = select i1 %733, float 0.000000e+00, float %734, !dbg !57
|
779 |
+
%736 = fmul float %731, %735, !dbg !58
|
780 |
+
%737 = fadd float %716, %736, !dbg !59
|
781 |
+
%738 = fadd float %721, %727, !dbg !60
|
782 |
+
%739 = fmul float %731, %731, !dbg !61
|
783 |
+
%740 = fmul float %711, %739, !dbg !62
|
784 |
+
%741 = fmul float %735, %740, !dbg !63
|
785 |
+
%742 = fadd float %738, %741, !dbg !64
|
786 |
+
%743 = bitcast float %674 to i32, !dbg !65
|
787 |
+
%744 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %743, i32 4, i32 31), !dbg !65
|
788 |
+
%745 = bitcast i32 %744 to float, !dbg !65
|
789 |
+
%746 = bitcast float %679 to i32, !dbg !65
|
790 |
+
%747 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %746, i32 4, i32 31), !dbg !65
|
791 |
+
%748 = bitcast i32 %747 to float, !dbg !65
|
792 |
+
%749 = bitcast float %669 to i32, !dbg !65
|
793 |
+
%750 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %749, i32 4, i32 31), !dbg !65
|
794 |
+
%751 = bitcast i32 %750 to float, !dbg !65
|
795 |
+
%752 = fsub float %745, %674, !dbg !50
|
796 |
+
%753 = fadd float %669, %751, !dbg !54
|
797 |
+
%754 = fcmp oeq float %753, 0.000000e+00, !dbg !55
|
798 |
+
%755 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %751, float %753) #6, !dbg !56
|
799 |
+
%756 = select i1 %754, float 0.000000e+00, float %755, !dbg !57
|
800 |
+
%757 = fmul float %752, %756, !dbg !58
|
801 |
+
%758 = fadd float %674, %757, !dbg !59
|
802 |
+
%759 = fadd float %679, %748, !dbg !60
|
803 |
+
%760 = fmul float %752, %752, !dbg !61
|
804 |
+
%761 = fmul float %669, %760, !dbg !62
|
805 |
+
%762 = fmul float %761, %756, !dbg !63
|
806 |
+
%763 = fadd float %759, %762, !dbg !64
|
807 |
+
%764 = bitcast float %758 to i32, !dbg !65
|
808 |
+
%765 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %764, i32 2, i32 31), !dbg !65
|
809 |
+
%766 = bitcast i32 %765 to float, !dbg !65
|
810 |
+
%767 = bitcast float %763 to i32, !dbg !65
|
811 |
+
%768 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %767, i32 2, i32 31), !dbg !65
|
812 |
+
%769 = bitcast i32 %768 to float, !dbg !65
|
813 |
+
%770 = bitcast float %753 to i32, !dbg !65
|
814 |
+
%771 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %770, i32 2, i32 31), !dbg !65
|
815 |
+
%772 = bitcast i32 %771 to float, !dbg !65
|
816 |
+
%773 = fsub float %766, %758, !dbg !50
|
817 |
+
%774 = fadd float %753, %772, !dbg !54
|
818 |
+
%775 = fcmp oeq float %774, 0.000000e+00, !dbg !55
|
819 |
+
%776 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %772, float %774) #6, !dbg !56
|
820 |
+
%777 = select i1 %775, float 0.000000e+00, float %776, !dbg !57
|
821 |
+
%778 = fmul float %773, %777, !dbg !58
|
822 |
+
%779 = fadd float %758, %778, !dbg !59
|
823 |
+
%780 = fadd float %763, %769, !dbg !60
|
824 |
+
%781 = fmul float %773, %773, !dbg !61
|
825 |
+
%782 = fmul float %753, %781, !dbg !62
|
826 |
+
%783 = fmul float %777, %782, !dbg !63
|
827 |
+
%784 = fadd float %780, %783, !dbg !64
|
828 |
+
%785 = bitcast float %779 to i32, !dbg !65
|
829 |
+
%786 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %785, i32 1, i32 31), !dbg !65
|
830 |
+
%787 = bitcast i32 %786 to float, !dbg !65
|
831 |
+
%788 = bitcast float %784 to i32, !dbg !65
|
832 |
+
%789 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %788, i32 1, i32 31), !dbg !65
|
833 |
+
%790 = bitcast i32 %789 to float, !dbg !65
|
834 |
+
%791 = bitcast float %774 to i32, !dbg !65
|
835 |
+
%792 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %791, i32 1, i32 31), !dbg !65
|
836 |
+
%793 = bitcast i32 %792 to float, !dbg !65
|
837 |
+
%794 = fsub float %787, %779, !dbg !50
|
838 |
+
%795 = fadd float %774, %793, !dbg !54
|
839 |
+
%796 = fcmp oeq float %795, 0.000000e+00, !dbg !55
|
840 |
+
%797 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %793, float %795) #6, !dbg !56
|
841 |
+
%798 = select i1 %796, float 0.000000e+00, float %797, !dbg !57
|
842 |
+
%799 = fmul float %794, %798, !dbg !58
|
843 |
+
%800 = fadd float %779, %799, !dbg !59
|
844 |
+
%801 = fadd float %784, %790, !dbg !60
|
845 |
+
%802 = fmul float %794, %794, !dbg !61
|
846 |
+
%803 = fmul float %774, %802, !dbg !62
|
847 |
+
%804 = fmul float %798, %803, !dbg !63
|
848 |
+
%805 = fadd float %801, %804, !dbg !64
|
849 |
+
%806 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
|
850 |
+
%807 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
|
851 |
+
%808 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
|
852 |
+
%809 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
|
853 |
+
%810 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
|
854 |
+
%811 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
|
855 |
+
%812 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
|
856 |
+
%813 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %742, float 2.560000e+02) #6, !dbg !67
|
857 |
+
%814 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
|
858 |
+
%815 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
|
859 |
+
%816 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
|
860 |
+
%817 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
|
861 |
+
%818 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
|
862 |
+
%819 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
|
863 |
+
%820 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
|
864 |
+
%821 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %805, float 2.560000e+02) #6, !dbg !67
|
865 |
+
%822 = fadd float %806, 0x3EE4F8B580000000, !dbg !68
|
866 |
+
%823 = fadd float %814, 0x3EE4F8B580000000, !dbg !68
|
867 |
+
br label %824, !dbg !69
|
868 |
+
|
869 |
+
824: ; preds = %435, %__nv_rsqrtf.exit40
|
870 |
+
%825 = phi i32 [ 0, %435 ], [ %1134, %__nv_rsqrtf.exit40 ]
|
871 |
+
%826 = or i32 %825, %14, !dbg !70
|
872 |
+
%827 = or i32 %825, %15, !dbg !70
|
873 |
+
%828 = add i32 %826, %47, !dbg !71
|
874 |
+
%829 = add i32 %827, %47, !dbg !71
|
875 |
+
%830 = add i32 %826, %48, !dbg !71
|
876 |
+
%831 = add i32 %827, %48, !dbg !71
|
877 |
+
%832 = sext i32 %828 to i64, !dbg !72
|
878 |
+
%833 = getelementptr float, ptr addrspace(1) %2, i64 %832, !dbg !72
|
879 |
+
%834 = sext i32 %829 to i64, !dbg !72
|
880 |
+
%835 = getelementptr float, ptr addrspace(1) %2, i64 %834, !dbg !72
|
881 |
+
%836 = sext i32 %830 to i64, !dbg !72
|
882 |
+
%837 = getelementptr float, ptr addrspace(1) %2, i64 %836, !dbg !72
|
883 |
+
%838 = sext i32 %831 to i64, !dbg !72
|
884 |
+
%839 = getelementptr float, ptr addrspace(1) %2, i64 %838, !dbg !72
|
885 |
+
%840 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %833, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
|
886 |
+
%841 = extractvalue { i32, i32, i32, i32 } %840, 0, !dbg !73
|
887 |
+
%842 = extractvalue { i32, i32, i32, i32 } %840, 1, !dbg !73
|
888 |
+
%843 = extractvalue { i32, i32, i32, i32 } %840, 2, !dbg !73
|
889 |
+
%844 = extractvalue { i32, i32, i32, i32 } %840, 3, !dbg !73
|
890 |
+
%845 = bitcast i32 %841 to float, !dbg !73
|
891 |
+
%846 = bitcast i32 %842 to float, !dbg !73
|
892 |
+
%847 = bitcast i32 %843 to float, !dbg !73
|
893 |
+
%848 = bitcast i32 %844 to float, !dbg !73
|
894 |
+
%849 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %835, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
|
895 |
+
%850 = extractvalue { i32, i32, i32, i32 } %849, 0, !dbg !73
|
896 |
+
%851 = extractvalue { i32, i32, i32, i32 } %849, 1, !dbg !73
|
897 |
+
%852 = extractvalue { i32, i32, i32, i32 } %849, 2, !dbg !73
|
898 |
+
%853 = extractvalue { i32, i32, i32, i32 } %849, 3, !dbg !73
|
899 |
+
%854 = bitcast i32 %850 to float, !dbg !73
|
900 |
+
%855 = bitcast i32 %851 to float, !dbg !73
|
901 |
+
%856 = bitcast i32 %852 to float, !dbg !73
|
902 |
+
%857 = bitcast i32 %853 to float, !dbg !73
|
903 |
+
%858 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %837, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
|
904 |
+
%859 = extractvalue { i32, i32, i32, i32 } %858, 0, !dbg !73
|
905 |
+
%860 = extractvalue { i32, i32, i32, i32 } %858, 1, !dbg !73
|
906 |
+
%861 = extractvalue { i32, i32, i32, i32 } %858, 2, !dbg !73
|
907 |
+
%862 = extractvalue { i32, i32, i32, i32 } %858, 3, !dbg !73
|
908 |
+
%863 = bitcast i32 %859 to float, !dbg !73
|
909 |
+
%864 = bitcast i32 %860 to float, !dbg !73
|
910 |
+
%865 = bitcast i32 %861 to float, !dbg !73
|
911 |
+
%866 = bitcast i32 %862 to float, !dbg !73
|
912 |
+
%867 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %839, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
|
913 |
+
%868 = extractvalue { i32, i32, i32, i32 } %867, 0, !dbg !73
|
914 |
+
%869 = extractvalue { i32, i32, i32, i32 } %867, 1, !dbg !73
|
915 |
+
%870 = extractvalue { i32, i32, i32, i32 } %867, 2, !dbg !73
|
916 |
+
%871 = extractvalue { i32, i32, i32, i32 } %867, 3, !dbg !73
|
917 |
+
%872 = bitcast i32 %868 to float, !dbg !73
|
918 |
+
%873 = bitcast i32 %869 to float, !dbg !73
|
919 |
+
%874 = bitcast i32 %870 to float, !dbg !73
|
920 |
+
%875 = bitcast i32 %871 to float, !dbg !73
|
921 |
+
%876 = add i32 %826, %49, !dbg !74
|
922 |
+
%877 = add i32 %826, %50, !dbg !74
|
923 |
+
%878 = sext i32 %876 to i64, !dbg !75
|
924 |
+
%879 = getelementptr i16, ptr addrspace(1) %3, i64 %878, !dbg !75
|
925 |
+
%880 = sext i32 %877 to i64, !dbg !75
|
926 |
+
%881 = getelementptr i16, ptr addrspace(1) %3, i64 %880, !dbg !75
|
927 |
+
%882 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %879, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76
|
928 |
+
%883 = extractvalue { i32, i32, i32, i32 } %882, 0, !dbg !76
|
929 |
+
%884 = extractvalue { i32, i32, i32, i32 } %882, 1, !dbg !76
|
930 |
+
%885 = extractvalue { i32, i32, i32, i32 } %882, 2, !dbg !76
|
931 |
+
%886 = extractvalue { i32, i32, i32, i32 } %882, 3, !dbg !76
|
932 |
+
%887 = trunc i32 %883 to i16, !dbg !76
|
933 |
+
%extelt.offset = lshr i32 %883, 16, !dbg !76
|
934 |
+
%888 = trunc i32 %extelt.offset to i16, !dbg !76
|
935 |
+
%889 = trunc i32 %884 to i16, !dbg !76
|
936 |
+
%extelt.offset2 = lshr i32 %884, 16, !dbg !76
|
937 |
+
%890 = trunc i32 %extelt.offset2 to i16, !dbg !76
|
938 |
+
%891 = trunc i32 %885 to i16, !dbg !76
|
939 |
+
%extelt.offset3 = lshr i32 %885, 16, !dbg !76
|
940 |
+
%892 = trunc i32 %extelt.offset3 to i16, !dbg !76
|
941 |
+
%893 = trunc i32 %886 to i16, !dbg !76
|
942 |
+
%extelt.offset4 = lshr i32 %886, 16, !dbg !76
|
943 |
+
%894 = trunc i32 %extelt.offset4 to i16, !dbg !76
|
944 |
+
%895 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %881, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76
|
945 |
+
%896 = extractvalue { i32, i32, i32, i32 } %895, 0, !dbg !76
|
946 |
+
%897 = extractvalue { i32, i32, i32, i32 } %895, 1, !dbg !76
|
947 |
+
%898 = extractvalue { i32, i32, i32, i32 } %895, 2, !dbg !76
|
948 |
+
%899 = extractvalue { i32, i32, i32, i32 } %895, 3, !dbg !76
|
949 |
+
%900 = trunc i32 %896 to i16, !dbg !76
|
950 |
+
%extelt.offset5 = lshr i32 %896, 16, !dbg !76
|
951 |
+
%901 = trunc i32 %extelt.offset5 to i16, !dbg !76
|
952 |
+
%902 = trunc i32 %897 to i16, !dbg !76
|
953 |
+
%extelt.offset6 = lshr i32 %897, 16, !dbg !76
|
954 |
+
%903 = trunc i32 %extelt.offset6 to i16, !dbg !76
|
955 |
+
%904 = trunc i32 %898 to i16, !dbg !76
|
956 |
+
%extelt.offset7 = lshr i32 %898, 16, !dbg !76
|
957 |
+
%905 = trunc i32 %extelt.offset7 to i16, !dbg !76
|
958 |
+
%906 = trunc i32 %899 to i16, !dbg !76
|
959 |
+
%extelt.offset8 = lshr i32 %899, 16, !dbg !76
|
960 |
+
%907 = trunc i32 %extelt.offset8 to i16, !dbg !76
|
961 |
+
%908 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %887) #6, !dbg !77
|
962 |
+
%909 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %888) #6, !dbg !77
|
963 |
+
%910 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %889) #6, !dbg !77
|
964 |
+
%911 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %890) #6, !dbg !77
|
965 |
+
%912 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %891) #6, !dbg !77
|
966 |
+
%913 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %892) #6, !dbg !77
|
967 |
+
%914 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %893) #6, !dbg !77
|
968 |
+
%915 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %894) #6, !dbg !77
|
969 |
+
%916 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %900) #6, !dbg !77
|
970 |
+
%917 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %901) #6, !dbg !77
|
971 |
+
%918 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %902) #6, !dbg !77
|
972 |
+
%919 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %903) #6, !dbg !77
|
973 |
+
%920 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %904) #6, !dbg !77
|
974 |
+
%921 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %905) #6, !dbg !77
|
975 |
+
%922 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %906) #6, !dbg !77
|
976 |
+
%923 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %907) #6, !dbg !77
|
977 |
+
%924 = zext nneg i32 %826 to i64, !dbg !78
|
978 |
+
%925 = getelementptr float, ptr addrspace(1) %4, i64 %924, !dbg !78
|
979 |
+
%926 = zext nneg i32 %827 to i64, !dbg !78
|
980 |
+
%927 = getelementptr float, ptr addrspace(1) %4, i64 %926, !dbg !78
|
981 |
+
%928 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %925, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
|
982 |
+
%929 = extractvalue { i32, i32, i32, i32 } %928, 0, !dbg !79
|
983 |
+
%930 = extractvalue { i32, i32, i32, i32 } %928, 1, !dbg !79
|
984 |
+
%931 = extractvalue { i32, i32, i32, i32 } %928, 2, !dbg !79
|
985 |
+
%932 = extractvalue { i32, i32, i32, i32 } %928, 3, !dbg !79
|
986 |
+
%933 = bitcast i32 %929 to float, !dbg !79
|
987 |
+
%934 = bitcast i32 %930 to float, !dbg !79
|
988 |
+
%935 = bitcast i32 %931 to float, !dbg !79
|
989 |
+
%936 = bitcast i32 %932 to float, !dbg !79
|
990 |
+
%937 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %927, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
|
991 |
+
%938 = extractvalue { i32, i32, i32, i32 } %937, 0, !dbg !79
|
992 |
+
%939 = extractvalue { i32, i32, i32, i32 } %937, 1, !dbg !79
|
993 |
+
%940 = extractvalue { i32, i32, i32, i32 } %937, 2, !dbg !79
|
994 |
+
%941 = extractvalue { i32, i32, i32, i32 } %937, 3, !dbg !79
|
995 |
+
%942 = bitcast i32 %938 to float, !dbg !79
|
996 |
+
%943 = bitcast i32 %939 to float, !dbg !79
|
997 |
+
%944 = bitcast i32 %940 to float, !dbg !79
|
998 |
+
%945 = bitcast i32 %941 to float, !dbg !79
|
999 |
+
br i1 %56, label %946, label %947, !dbg !80
|
1000 |
+
|
1001 |
+
946: ; preds = %824
|
1002 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !80
|
1003 |
+
br label %947, !dbg !80
|
1004 |
+
|
1005 |
+
947: ; preds = %946, %824
|
1006 |
+
%948 = getelementptr float, ptr addrspace(1) %63, i64 %924, !dbg !81
|
1007 |
+
%949 = getelementptr float, ptr addrspace(1) %63, i64 %926, !dbg !81
|
1008 |
+
%950 = getelementptr float, ptr addrspace(1) %64, i64 %924, !dbg !81
|
1009 |
+
%951 = getelementptr float, ptr addrspace(1) %64, i64 %926, !dbg !81
|
1010 |
+
%952 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %948, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
|
1011 |
+
%953 = extractvalue { i32, i32, i32, i32 } %952, 0, !dbg !82
|
1012 |
+
%954 = extractvalue { i32, i32, i32, i32 } %952, 1, !dbg !82
|
1013 |
+
%955 = extractvalue { i32, i32, i32, i32 } %952, 2, !dbg !82
|
1014 |
+
%956 = extractvalue { i32, i32, i32, i32 } %952, 3, !dbg !82
|
1015 |
+
%957 = bitcast i32 %953 to float, !dbg !82
|
1016 |
+
%958 = bitcast i32 %954 to float, !dbg !82
|
1017 |
+
%959 = bitcast i32 %955 to float, !dbg !82
|
1018 |
+
%960 = bitcast i32 %956 to float, !dbg !82
|
1019 |
+
%961 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %949, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
|
1020 |
+
%962 = extractvalue { i32, i32, i32, i32 } %961, 0, !dbg !82
|
1021 |
+
%963 = extractvalue { i32, i32, i32, i32 } %961, 1, !dbg !82
|
1022 |
+
%964 = extractvalue { i32, i32, i32, i32 } %961, 2, !dbg !82
|
1023 |
+
%965 = extractvalue { i32, i32, i32, i32 } %961, 3, !dbg !82
|
1024 |
+
%966 = bitcast i32 %962 to float, !dbg !82
|
1025 |
+
%967 = bitcast i32 %963 to float, !dbg !82
|
1026 |
+
%968 = bitcast i32 %964 to float, !dbg !82
|
1027 |
+
%969 = bitcast i32 %965 to float, !dbg !82
|
1028 |
+
%970 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %950, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
|
1029 |
+
%971 = extractvalue { i32, i32, i32, i32 } %970, 0, !dbg !82
|
1030 |
+
%972 = extractvalue { i32, i32, i32, i32 } %970, 1, !dbg !82
|
1031 |
+
%973 = extractvalue { i32, i32, i32, i32 } %970, 2, !dbg !82
|
1032 |
+
%974 = extractvalue { i32, i32, i32, i32 } %970, 3, !dbg !82
|
1033 |
+
%975 = bitcast i32 %971 to float, !dbg !82
|
1034 |
+
%976 = bitcast i32 %972 to float, !dbg !82
|
1035 |
+
%977 = bitcast i32 %973 to float, !dbg !82
|
1036 |
+
%978 = bitcast i32 %974 to float, !dbg !82
|
1037 |
+
%979 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %951, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
|
1038 |
+
%980 = extractvalue { i32, i32, i32, i32 } %979, 0, !dbg !82
|
1039 |
+
%981 = extractvalue { i32, i32, i32, i32 } %979, 1, !dbg !82
|
1040 |
+
%982 = extractvalue { i32, i32, i32, i32 } %979, 2, !dbg !82
|
1041 |
+
%983 = extractvalue { i32, i32, i32, i32 } %979, 3, !dbg !82
|
1042 |
+
%984 = bitcast i32 %980 to float, !dbg !82
|
1043 |
+
%985 = bitcast i32 %981 to float, !dbg !82
|
1044 |
+
%986 = bitcast i32 %982 to float, !dbg !82
|
1045 |
+
%987 = bitcast i32 %983 to float, !dbg !82
|
1046 |
+
%988 = fadd float %845, %957, !dbg !83
|
1047 |
+
%989 = fadd float %846, %958, !dbg !83
|
1048 |
+
%990 = fadd float %847, %959, !dbg !83
|
1049 |
+
%991 = fadd float %848, %960, !dbg !83
|
1050 |
+
%992 = fadd float %854, %966, !dbg !83
|
1051 |
+
%993 = fadd float %855, %967, !dbg !83
|
1052 |
+
%994 = fadd float %856, %968, !dbg !83
|
1053 |
+
%995 = fadd float %857, %969, !dbg !83
|
1054 |
+
%996 = fadd float %863, %975, !dbg !83
|
1055 |
+
%997 = fadd float %864, %976, !dbg !83
|
1056 |
+
%998 = fadd float %865, %977, !dbg !83
|
1057 |
+
%999 = fadd float %866, %978, !dbg !83
|
1058 |
+
%1000 = fadd float %872, %984, !dbg !83
|
1059 |
+
%1001 = fadd float %873, %985, !dbg !83
|
1060 |
+
%1002 = fadd float %874, %986, !dbg !83
|
1061 |
+
%1003 = fadd float %875, %987, !dbg !83
|
1062 |
+
%1004 = fadd float %908, %988, !dbg !84
|
1063 |
+
%1005 = fadd float %909, %989, !dbg !84
|
1064 |
+
%1006 = fadd float %910, %990, !dbg !84
|
1065 |
+
%1007 = fadd float %911, %991, !dbg !84
|
1066 |
+
%1008 = fadd float %912, %992, !dbg !84
|
1067 |
+
%1009 = fadd float %913, %993, !dbg !84
|
1068 |
+
%1010 = fadd float %914, %994, !dbg !84
|
1069 |
+
%1011 = fadd float %915, %995, !dbg !84
|
1070 |
+
%1012 = fadd float %916, %996, !dbg !84
|
1071 |
+
%1013 = fadd float %917, %997, !dbg !84
|
1072 |
+
%1014 = fadd float %918, %998, !dbg !84
|
1073 |
+
%1015 = fadd float %919, %999, !dbg !84
|
1074 |
+
%1016 = fadd float %920, %1000, !dbg !84
|
1075 |
+
%1017 = fadd float %921, %1001, !dbg !84
|
1076 |
+
%1018 = fadd float %922, %1002, !dbg !84
|
1077 |
+
%1019 = fadd float %923, %1003, !dbg !84
|
1078 |
+
%1020 = fsub float %1004, %737, !dbg !85
|
1079 |
+
%1021 = fsub float %1005, %737, !dbg !85
|
1080 |
+
%1022 = fsub float %1006, %737, !dbg !85
|
1081 |
+
%1023 = fsub float %1007, %737, !dbg !85
|
1082 |
+
%1024 = fsub float %1008, %737, !dbg !85
|
1083 |
+
%1025 = fsub float %1009, %737, !dbg !85
|
1084 |
+
%1026 = fsub float %1010, %737, !dbg !85
|
1085 |
+
%1027 = fsub float %1011, %737, !dbg !85
|
1086 |
+
%1028 = fsub float %1012, %800, !dbg !85
|
1087 |
+
%1029 = fsub float %1013, %800, !dbg !85
|
1088 |
+
%1030 = fsub float %1014, %800, !dbg !85
|
1089 |
+
%1031 = fsub float %1015, %800, !dbg !85
|
1090 |
+
%1032 = fsub float %1016, %800, !dbg !85
|
1091 |
+
%1033 = fsub float %1017, %800, !dbg !85
|
1092 |
+
%1034 = fsub float %1018, %800, !dbg !85
|
1093 |
+
%1035 = fsub float %1019, %800, !dbg !85
|
1094 |
+
%1036 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
1095 |
+
%.not.i = icmp eq i32 %1036, 0, !dbg !86
|
1096 |
+
br i1 %.not.i, label %1039, label %1037, !dbg !86
|
1097 |
+
|
1098 |
+
1037: ; preds = %947
|
1099 |
+
%1038 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %822), !dbg !86
|
1100 |
+
br label %__nv_rsqrtf.exit, !dbg !86
|
1101 |
+
|
1102 |
+
1039: ; preds = %947
|
1103 |
+
%1040 = tail call float @llvm.nvvm.rsqrt.approx.f(float %822), !dbg !86
|
1104 |
+
br label %__nv_rsqrtf.exit, !dbg !86
|
1105 |
+
|
1106 |
+
__nv_rsqrtf.exit: ; preds = %1037, %1039
|
1107 |
+
%.0.i = phi float [ %1038, %1037 ], [ %1040, %1039 ], !dbg !86
|
1108 |
+
%1041 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
1109 |
+
%1042 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
1110 |
+
%1043 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
1111 |
+
%1044 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
1112 |
+
%1045 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
1113 |
+
%1046 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
1114 |
+
%1047 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
1115 |
+
%1048 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
1116 |
+
%.not.i38 = icmp eq i32 %1048, 0, !dbg !86
|
1117 |
+
br i1 %.not.i38, label %1051, label %1049, !dbg !86
|
1118 |
+
|
1119 |
+
1049: ; preds = %__nv_rsqrtf.exit
|
1120 |
+
%1050 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %823), !dbg !86
|
1121 |
+
br label %__nv_rsqrtf.exit40, !dbg !86
|
1122 |
+
|
1123 |
+
1051: ; preds = %__nv_rsqrtf.exit
|
1124 |
+
%1052 = tail call float @llvm.nvvm.rsqrt.approx.f(float %823), !dbg !86
|
1125 |
+
br label %__nv_rsqrtf.exit40, !dbg !86
|
1126 |
+
|
1127 |
+
__nv_rsqrtf.exit40: ; preds = %1049, %1051
|
1128 |
+
%.0.i39 = phi float [ %1050, %1049 ], [ %1052, %1051 ], !dbg !86
|
1129 |
+
%1053 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
1130 |
+
%1054 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
1131 |
+
%1055 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
1132 |
+
%1056 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
1133 |
+
%1057 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
1134 |
+
%1058 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
1135 |
+
%1059 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
1136 |
+
%1060 = fmul float %1020, %.0.i, !dbg !87
|
1137 |
+
%1061 = fmul float %1021, %.0.i, !dbg !87
|
1138 |
+
%1062 = fmul float %1022, %.0.i, !dbg !87
|
1139 |
+
%1063 = fmul float %1023, %.0.i, !dbg !87
|
1140 |
+
%1064 = fmul float %1024, %.0.i, !dbg !87
|
1141 |
+
%1065 = fmul float %1025, %.0.i, !dbg !87
|
1142 |
+
%1066 = fmul float %1026, %.0.i, !dbg !87
|
1143 |
+
%1067 = fmul float %1027, %.0.i, !dbg !87
|
1144 |
+
%1068 = fmul float %1028, %.0.i39, !dbg !87
|
1145 |
+
%1069 = fmul float %1029, %.0.i39, !dbg !87
|
1146 |
+
%1070 = fmul float %1030, %.0.i39, !dbg !87
|
1147 |
+
%1071 = fmul float %1031, %.0.i39, !dbg !87
|
1148 |
+
%1072 = fmul float %1032, %.0.i39, !dbg !87
|
1149 |
+
%1073 = fmul float %1033, %.0.i39, !dbg !87
|
1150 |
+
%1074 = fmul float %1034, %.0.i39, !dbg !87
|
1151 |
+
%1075 = fmul float %1035, %.0.i39, !dbg !87
|
1152 |
+
%1076 = fmul float %1060, %933, !dbg !88
|
1153 |
+
%1077 = fmul float %1061, %934, !dbg !88
|
1154 |
+
%1078 = fmul float %1062, %935, !dbg !88
|
1155 |
+
%1079 = fmul float %1063, %936, !dbg !88
|
1156 |
+
%1080 = fmul float %1064, %942, !dbg !88
|
1157 |
+
%1081 = fmul float %1065, %943, !dbg !88
|
1158 |
+
%1082 = fmul float %1066, %944, !dbg !88
|
1159 |
+
%1083 = fmul float %1067, %945, !dbg !88
|
1160 |
+
%1084 = fmul float %1068, %933, !dbg !88
|
1161 |
+
%1085 = fmul float %1069, %934, !dbg !88
|
1162 |
+
%1086 = fmul float %1070, %935, !dbg !88
|
1163 |
+
%1087 = fmul float %1071, %936, !dbg !88
|
1164 |
+
%1088 = fmul float %1072, %942, !dbg !88
|
1165 |
+
%1089 = fmul float %1073, %943, !dbg !88
|
1166 |
+
%1090 = fmul float %1074, %944, !dbg !88
|
1167 |
+
%1091 = fmul float %1075, %945, !dbg !88
|
1168 |
+
%1092 = getelementptr i16, ptr addrspace(1) %5, i64 %878, !dbg !89
|
1169 |
+
%1093 = getelementptr i16, ptr addrspace(1) %5, i64 %880, !dbg !89
|
1170 |
+
%1094 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1076) #6, !dbg !90
|
1171 |
+
%1095 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1077) #6, !dbg !90
|
1172 |
+
%1096 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1078) #6, !dbg !90
|
1173 |
+
%1097 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1079) #6, !dbg !90
|
1174 |
+
%1098 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1080) #6, !dbg !90
|
1175 |
+
%1099 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1081) #6, !dbg !90
|
1176 |
+
%1100 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1082) #6, !dbg !90
|
1177 |
+
%1101 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1083) #6, !dbg !90
|
1178 |
+
%1102 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1084) #6, !dbg !90
|
1179 |
+
%1103 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1085) #6, !dbg !90
|
1180 |
+
%1104 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1086) #6, !dbg !90
|
1181 |
+
%1105 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1087) #6, !dbg !90
|
1182 |
+
%1106 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1088) #6, !dbg !90
|
1183 |
+
%1107 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1089) #6, !dbg !90
|
1184 |
+
%1108 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1090) #6, !dbg !90
|
1185 |
+
%1109 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %1091) #6, !dbg !90
|
1186 |
+
%1110 = insertelement <2 x i16> undef, i16 %1094, i64 0, !dbg !90
|
1187 |
+
%1111 = insertelement <2 x i16> %1110, i16 %1095, i64 1, !dbg !90
|
1188 |
+
%1112 = bitcast <2 x i16> %1111 to i32, !dbg !90
|
1189 |
+
%1113 = insertelement <2 x i16> undef, i16 %1096, i64 0, !dbg !90
|
1190 |
+
%1114 = insertelement <2 x i16> %1113, i16 %1097, i64 1, !dbg !90
|
1191 |
+
%1115 = bitcast <2 x i16> %1114 to i32, !dbg !90
|
1192 |
+
%1116 = insertelement <2 x i16> undef, i16 %1098, i64 0, !dbg !90
|
1193 |
+
%1117 = insertelement <2 x i16> %1116, i16 %1099, i64 1, !dbg !90
|
1194 |
+
%1118 = bitcast <2 x i16> %1117 to i32, !dbg !90
|
1195 |
+
%1119 = insertelement <2 x i16> undef, i16 %1100, i64 0, !dbg !90
|
1196 |
+
%1120 = insertelement <2 x i16> %1119, i16 %1101, i64 1, !dbg !90
|
1197 |
+
%1121 = bitcast <2 x i16> %1120 to i32, !dbg !90
|
1198 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1112, i32 %1115, i32 %1118, i32 %1121, ptr addrspace(1) %1092, i1 true) #6, !dbg !90
|
1199 |
+
%1122 = insertelement <2 x i16> undef, i16 %1102, i64 0, !dbg !90
|
1200 |
+
%1123 = insertelement <2 x i16> %1122, i16 %1103, i64 1, !dbg !90
|
1201 |
+
%1124 = bitcast <2 x i16> %1123 to i32, !dbg !90
|
1202 |
+
%1125 = insertelement <2 x i16> undef, i16 %1104, i64 0, !dbg !90
|
1203 |
+
%1126 = insertelement <2 x i16> %1125, i16 %1105, i64 1, !dbg !90
|
1204 |
+
%1127 = bitcast <2 x i16> %1126 to i32, !dbg !90
|
1205 |
+
%1128 = insertelement <2 x i16> undef, i16 %1106, i64 0, !dbg !90
|
1206 |
+
%1129 = insertelement <2 x i16> %1128, i16 %1107, i64 1, !dbg !90
|
1207 |
+
%1130 = bitcast <2 x i16> %1129 to i32, !dbg !90
|
1208 |
+
%1131 = insertelement <2 x i16> undef, i16 %1108, i64 0, !dbg !90
|
1209 |
+
%1132 = insertelement <2 x i16> %1131, i16 %1109, i64 1, !dbg !90
|
1210 |
+
%1133 = bitcast <2 x i16> %1132 to i32, !dbg !90
|
1211 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %1124, i32 %1127, i32 %1130, i32 %1133, ptr addrspace(1) %1093, i1 true) #6, !dbg !90
|
1212 |
+
%1134 = add nuw nsw i32 %825, 64, !dbg !69
|
1213 |
+
%1135 = icmp ult i32 %825, 192, !dbg !69
|
1214 |
+
br i1 %1135, label %824, label %1136, !dbg !69
|
1215 |
+
|
1216 |
+
1136: ; preds = %__nv_rsqrtf.exit40
|
1217 |
+
ret void, !dbg !91
|
1218 |
+
}
|
1219 |
+
|
1220 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
1221 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
1222 |
+
|
1223 |
+
; Function Attrs: convergent nocallback nounwind
|
1224 |
+
declare void @llvm.nvvm.barrier0() #1
|
1225 |
+
|
1226 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
1227 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
|
1228 |
+
|
1229 |
+
; Function Attrs: alwaysinline nounwind
|
1230 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
1231 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
1232 |
+
%.not = icmp eq i32 %1, 0
|
1233 |
+
br i1 %.not, label %4, label %2
|
1234 |
+
|
1235 |
+
2: ; preds = %0
|
1236 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
1237 |
+
br label %6
|
1238 |
+
|
1239 |
+
4: ; preds = %0
|
1240 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
1241 |
+
br label %6
|
1242 |
+
|
1243 |
+
6: ; preds = %4, %2
|
1244 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
1245 |
+
ret float %.0
|
1246 |
+
}
|
1247 |
+
|
1248 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
1249 |
+
|
1250 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
1251 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
1252 |
+
|
1253 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
1254 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
1255 |
+
|
1256 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
1257 |
+
attributes #1 = { convergent nocallback nounwind }
|
1258 |
+
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
1259 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
1260 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
1261 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
1262 |
+
attributes #6 = { nounwind }
|
1263 |
+
|
1264 |
+
!llvm.module.flags = !{!0, !1}
|
1265 |
+
!llvm.dbg.cu = !{!2}
|
1266 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
1267 |
+
!llvm.ident = !{!6}
|
1268 |
+
|
1269 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
1270 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
1271 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
1272 |
+
!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
|
1273 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
|
1274 |
+
!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 256}
|
1275 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
1276 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
1277 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
1278 |
+
!9 = !{}
|
1279 |
+
!10 = !DILocation(line: 22, column: 44, scope: !7)
|
1280 |
+
!11 = !DILocation(line: 24, column: 33, scope: !7)
|
1281 |
+
!12 = !DILocation(line: 31, column: 36, scope: !7)
|
1282 |
+
!13 = !DILocation(line: 21, column: 28, scope: !7)
|
1283 |
+
!14 = !DILocation(line: 21, column: 33, scope: !7)
|
1284 |
+
!15 = !DILocation(line: 22, column: 23, scope: !7)
|
1285 |
+
!16 = !DILocation(line: 26, column: 30, scope: !7)
|
1286 |
+
!17 = !DILocation(line: 26, column: 35, scope: !7)
|
1287 |
+
!18 = !DILocation(line: 27, column: 18, scope: !7)
|
1288 |
+
!19 = !DILocation(line: 35, column: 44, scope: !7)
|
1289 |
+
!20 = !DILocation(line: 36, column: 44, scope: !7)
|
1290 |
+
!21 = !DILocation(line: 37, column: 22, scope: !7)
|
1291 |
+
!22 = !DILocation(line: 38, column: 22, scope: !7)
|
1292 |
+
!23 = !DILocation(line: 39, column: 36, scope: !7)
|
1293 |
+
!24 = !DILocation(line: 40, column: 40, scope: !7)
|
1294 |
+
!25 = !DILocation(line: 41, column: 44, scope: !7)
|
1295 |
+
!26 = !DILocation(line: 32, column: 27, scope: !7)
|
1296 |
+
!27 = !DILocation(line: 35, column: 40, scope: !7)
|
1297 |
+
!28 = !DILocation(line: 35, column: 34, scope: !7)
|
1298 |
+
!29 = !DILocation(line: 35, column: 50, scope: !7)
|
1299 |
+
!30 = !DILocation(line: 36, column: 40, scope: !7)
|
1300 |
+
!31 = !DILocation(line: 36, column: 34, scope: !7)
|
1301 |
+
!32 = !DILocation(line: 36, column: 50, scope: !7)
|
1302 |
+
!33 = !DILocation(line: 36, column: 101, scope: !7)
|
1303 |
+
!34 = !DILocation(line: 40, column: 55, scope: !7)
|
1304 |
+
!35 = !DILocation(line: 41, column: 40, scope: !7)
|
1305 |
+
!36 = !DILocation(line: 41, column: 34, scope: !7)
|
1306 |
+
!37 = !DILocation(line: 41, column: 52, scope: !7)
|
1307 |
+
!38 = !DILocation(line: 42, column: 22, scope: !7)
|
1308 |
+
!39 = !DILocation(line: 44, column: 22, scope: !7)
|
1309 |
+
!40 = !DILocation(line: 96, column: 20, scope: !41, inlinedAt: !43)
|
1310 |
+
!41 = distinct !DILexicalBlockFile(scope: !7, file: !42, discriminator: 0)
|
1311 |
+
!42 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
1312 |
+
!43 = !DILocation(line: 47, column: 41, scope: !41)
|
1313 |
+
!44 = !DILocation(line: 97, column: 26, scope: !41, inlinedAt: !43)
|
1314 |
+
!45 = !DILocation(line: 98, column: 30, scope: !41, inlinedAt: !43)
|
1315 |
+
!46 = !DILocation(line: 98, column: 22, scope: !41, inlinedAt: !43)
|
1316 |
+
!47 = !DILocation(line: 101, column: 30, scope: !41, inlinedAt: !43)
|
1317 |
+
!48 = !DILocation(line: 101, column: 22, scope: !41, inlinedAt: !43)
|
1318 |
+
!49 = !DILocation(line: 50, column: 50, scope: !7)
|
1319 |
+
!50 = !DILocation(line: 108, column: 21, scope: !51, inlinedAt: !52)
|
1320 |
+
!51 = distinct !DILexicalBlockFile(scope: !41, file: !42, discriminator: 0)
|
1321 |
+
!52 = !DILocation(line: 120, column: 46, scope: !51, inlinedAt: !53)
|
1322 |
+
!53 = !DILocation(line: 53, column: 44, scope: !51)
|
1323 |
+
!54 = !DILocation(line: 109, column: 28, scope: !51, inlinedAt: !52)
|
1324 |
+
!55 = !DILocation(line: 110, column: 39, scope: !51, inlinedAt: !52)
|
1325 |
+
!56 = !DILocation(line: 110, column: 60, scope: !51, inlinedAt: !52)
|
1326 |
+
!57 = !DILocation(line: 110, column: 49, scope: !51, inlinedAt: !52)
|
1327 |
+
!58 = !DILocation(line: 112, column: 25, scope: !51, inlinedAt: !52)
|
1328 |
+
!59 = !DILocation(line: 112, column: 17, scope: !51, inlinedAt: !52)
|
1329 |
+
!60 = !DILocation(line: 113, column: 15, scope: !51, inlinedAt: !52)
|
1330 |
+
!61 = !DILocation(line: 113, column: 30, scope: !51, inlinedAt: !52)
|
1331 |
+
!62 = !DILocation(line: 113, column: 38, scope: !51, inlinedAt: !52)
|
1332 |
+
!63 = !DILocation(line: 113, column: 49, scope: !51, inlinedAt: !52)
|
1333 |
+
!64 = !DILocation(line: 113, column: 22, scope: !51, inlinedAt: !52)
|
1334 |
+
!65 = !DILocation(line: 120, column: 46, scope: !41, inlinedAt: !66)
|
1335 |
+
!66 = !DILocation(line: 53, column: 44, scope: !41)
|
1336 |
+
!67 = !DILocation(line: 75, column: 24, scope: !7)
|
1337 |
+
!68 = !DILocation(line: 77, column: 24, scope: !7)
|
1338 |
+
!69 = !DILocation(line: 58, column: 36, scope: !7)
|
1339 |
+
!70 = !DILocation(line: 59, column: 27, scope: !7)
|
1340 |
+
!71 = !DILocation(line: 62, column: 41, scope: !7)
|
1341 |
+
!72 = !DILocation(line: 62, column: 35, scope: !7)
|
1342 |
+
!73 = !DILocation(line: 62, column: 51, scope: !7)
|
1343 |
+
!74 = !DILocation(line: 63, column: 41, scope: !7)
|
1344 |
+
!75 = !DILocation(line: 63, column: 35, scope: !7)
|
1345 |
+
!76 = !DILocation(line: 63, column: 51, scope: !7)
|
1346 |
+
!77 = !DILocation(line: 63, column: 103, scope: !7)
|
1347 |
+
!78 = !DILocation(line: 64, column: 35, scope: !7)
|
1348 |
+
!79 = !DILocation(line: 64, column: 40, scope: !7)
|
1349 |
+
!80 = !DILocation(line: 68, column: 57, scope: !7)
|
1350 |
+
!81 = !DILocation(line: 69, column: 35, scope: !7)
|
1351 |
+
!82 = !DILocation(line: 69, column: 54, scope: !7)
|
1352 |
+
!83 = !DILocation(line: 70, column: 24, scope: !7)
|
1353 |
+
!84 = !DILocation(line: 72, column: 24, scope: !7)
|
1354 |
+
!85 = !DILocation(line: 73, column: 24, scope: !7)
|
1355 |
+
!86 = !DILocation(line: 78, column: 30, scope: !7)
|
1356 |
+
!87 = !DILocation(line: 79, column: 24, scope: !7)
|
1357 |
+
!88 = !DILocation(line: 80, column: 24, scope: !7)
|
1358 |
+
!89 = !DILocation(line: 82, column: 29, scope: !7)
|
1359 |
+
!90 = !DILocation(line: 82, column: 52, scope: !7)
|
1360 |
+
!91 = !DILocation(line: 58, column: 4, scope: !7)
|
.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ttir
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<64x64xbf16>
|
4 |
+
%cst_0 = arith.constant 0.000000e+00 : f32
|
5 |
+
%cst_1 = arith.constant dense<1.000000e+00> : tensor<64x64xf32>
|
6 |
+
%c256_i32 = arith.constant 256 : i32
|
7 |
+
%c64_i32 = arith.constant 64 : i32
|
8 |
+
%c0_i32 = arith.constant 0 : i32
|
9 |
+
%cst_2 = arith.constant dense<256> : tensor<64x1xi64>
|
10 |
+
%cst_3 = arith.constant dense<0> : tensor<64x1xi64>
|
11 |
+
%cst_4 = arith.constant dense<50257> : tensor<64x1xi64>
|
12 |
+
%cst_5 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
|
13 |
+
%cst_6 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
|
14 |
+
%cst_7 = arith.constant dense<0.000000e+00> : tensor<1x64xf32>
|
15 |
+
%cst_8 = arith.constant dense<0.000000e+00> : tensor<64x64xf32>
|
16 |
+
%cst_9 = arith.constant dense<256> : tensor<64x1xi32>
|
17 |
+
%cst_10 = arith.constant dense<256> : tensor<1x64xi32>
|
18 |
+
%cst_11 = arith.constant dense<512> : tensor<64x1xi32>
|
19 |
+
%0 = tt.get_program_id x : i32
|
20 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
21 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
|
22 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
|
23 |
+
%4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
|
24 |
+
%5 = arith.addi %4, %3 : tensor<64x1xi32>
|
25 |
+
%6 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32>
|
26 |
+
%7 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
|
27 |
+
%8 = tt.addptr %7, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
|
28 |
+
%9 = tt.load %8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
|
29 |
+
%10 = arith.remsi %5, %cst_11 : tensor<64x1xi32>
|
30 |
+
%11 = arith.muli %10, %cst_9 : tensor<64x1xi32>
|
31 |
+
%12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x64xi32>
|
32 |
+
%13 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
|
33 |
+
%14 = arith.muli %5, %cst_9 : tensor<64x1xi32>
|
34 |
+
%15 = tt.broadcast %14 : (tensor<64x1xi32>) -> tensor<64x64xi32>
|
35 |
+
%16 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
|
36 |
+
%17 = arith.addi %9, %cst_4 : tensor<64x1xi64>
|
37 |
+
%18 = arith.cmpi slt, %9, %cst_3 : tensor<64x1xi64>
|
38 |
+
%19 = arith.select %18, %17, %9 : tensor<64x1xi1>, tensor<64x1xi64>
|
39 |
+
%20 = arith.cmpi sge, %19, %cst_3 : tensor<64x1xi64>
|
40 |
+
%21 = arith.cmpi slt, %19, %cst_4 : tensor<64x1xi64>
|
41 |
+
%22 = arith.andi %20, %21 : tensor<64x1xi1>
|
42 |
+
%23 = arith.muli %19, %cst_2 : tensor<64x1xi64>
|
43 |
+
%24 = tt.broadcast %23 : (tensor<64x1xi64>) -> tensor<64x64xi64>
|
44 |
+
%25 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
|
45 |
+
%26:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) : i32 {
|
46 |
+
%50 = tt.splat %arg8 : (i32) -> tensor<1x64xi32>
|
47 |
+
%51 = arith.addi %50, %6 : tensor<1x64xi32>
|
48 |
+
%52 = arith.cmpi slt, %51, %cst_10 : tensor<1x64xi32>
|
49 |
+
%53 = tt.broadcast %51 : (tensor<1x64xi32>) -> tensor<64x64xi32>
|
50 |
+
%54 = arith.addi %53, %12 : tensor<64x64xi32>
|
51 |
+
%55 = tt.addptr %13, %54 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi32>
|
52 |
+
%56 = tt.broadcast %52 : (tensor<1x64xi1>) -> tensor<64x64xi1>
|
53 |
+
%57 = tt.load %55, %56, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
|
54 |
+
%58 = arith.addi %53, %15 : tensor<64x64xi32>
|
55 |
+
%59 = tt.addptr %16, %58 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi32>
|
56 |
+
%60 = tt.load %59, %56, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xbf16>
|
57 |
+
%61 = arith.extf %60 : tensor<64x64xbf16> to tensor<64x64xf32>
|
58 |
+
tt.assert %22, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
|
59 |
+
%62 = arith.extsi %51 : tensor<1x64xi32> to tensor<1x64xi64>
|
60 |
+
%63 = tt.broadcast %62 : (tensor<1x64xi64>) -> tensor<64x64xi64>
|
61 |
+
%64 = arith.addi %63, %24 : tensor<64x64xi64>
|
62 |
+
%65 = tt.addptr %25, %64 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi64>
|
63 |
+
%66 = tt.load %65, %56, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
|
64 |
+
%67 = arith.addf %66, %57 : tensor<64x64xf32>
|
65 |
+
%68 = arith.addf %67, %61 : tensor<64x64xf32>
|
66 |
+
%69 = arith.subf %68, %arg9 : tensor<64x64xf32>
|
67 |
+
%70 = arith.addf %arg11, %cst_1 : tensor<64x64xf32>
|
68 |
+
%71 = arith.divf %69, %70 : tensor<64x64xf32>
|
69 |
+
%72 = arith.addf %arg9, %71 : tensor<64x64xf32>
|
70 |
+
%73 = arith.subf %68, %72 : tensor<64x64xf32>
|
71 |
+
%74 = arith.mulf %69, %73 : tensor<64x64xf32>
|
72 |
+
%75 = arith.addf %arg10, %74 : tensor<64x64xf32>
|
73 |
+
%76 = arith.select %56, %72, %arg9 : tensor<64x64xi1>, tensor<64x64xf32>
|
74 |
+
%77 = arith.select %56, %75, %arg10 : tensor<64x64xi1>, tensor<64x64xf32>
|
75 |
+
%78 = arith.select %56, %70, %arg11 : tensor<64x64xi1>, tensor<64x64xf32>
|
76 |
+
scf.yield %76, %77, %78 : tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>
|
77 |
+
}
|
78 |
+
%27:3 = "tt.reduce"(%26#0, %26#1, %26#2) <{axis = 1 : i32}> ({
|
79 |
+
^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
|
80 |
+
%50 = arith.subf %arg11, %arg8 : f32
|
81 |
+
%51 = arith.addf %arg10, %arg13 : f32
|
82 |
+
%52 = arith.cmpf oeq, %51, %cst_0 : f32
|
83 |
+
%53 = arith.divf %arg13, %51 : f32
|
84 |
+
%54 = arith.select %52, %cst_0, %53 : f32
|
85 |
+
%55 = arith.mulf %50, %54 : f32
|
86 |
+
%56 = arith.addf %arg8, %55 : f32
|
87 |
+
%57 = arith.addf %arg9, %arg12 : f32
|
88 |
+
%58 = arith.mulf %50, %50 : f32
|
89 |
+
%59 = arith.mulf %58, %arg10 : f32
|
90 |
+
%60 = arith.mulf %59, %54 : f32
|
91 |
+
%61 = arith.addf %57, %60 : f32
|
92 |
+
tt.reduce.return %56, %61, %51 : f32, f32, f32
|
93 |
+
}) : (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
|
94 |
+
%28 = tt.expand_dims %27#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
95 |
+
%29 = tt.expand_dims %27#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
96 |
+
%30 = arith.muli %10, %cst_9 : tensor<64x1xi32>
|
97 |
+
%31 = tt.broadcast %30 : (tensor<64x1xi32>) -> tensor<64x64xi32>
|
98 |
+
%32 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
|
99 |
+
%33 = arith.muli %5, %cst_9 : tensor<64x1xi32>
|
100 |
+
%34 = tt.broadcast %33 : (tensor<64x1xi32>) -> tensor<64x64xi32>
|
101 |
+
%35 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
|
102 |
+
%36 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x64x!tt.ptr<f32, 1>>
|
103 |
+
%37 = arith.addi %9, %cst_4 : tensor<64x1xi64>
|
104 |
+
%38 = arith.cmpi slt, %9, %cst_3 : tensor<64x1xi64>
|
105 |
+
%39 = arith.select %38, %37, %9 : tensor<64x1xi1>, tensor<64x1xi64>
|
106 |
+
%40 = arith.cmpi sge, %39, %cst_3 : tensor<64x1xi64>
|
107 |
+
%41 = arith.cmpi slt, %39, %cst_4 : tensor<64x1xi64>
|
108 |
+
%42 = arith.andi %40, %41 : tensor<64x1xi1>
|
109 |
+
%43 = arith.muli %39, %cst_2 : tensor<64x1xi64>
|
110 |
+
%44 = tt.broadcast %43 : (tensor<64x1xi64>) -> tensor<64x64xi64>
|
111 |
+
%45 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>>
|
112 |
+
%46 = tt.broadcast %28 : (tensor<64x1xf32>) -> tensor<64x64xf32>
|
113 |
+
%47 = arith.divf %29, %cst_6 : tensor<64x1xf32>
|
114 |
+
%48 = arith.addf %47, %cst_5 : tensor<64x1xf32>
|
115 |
+
%49 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>>
|
116 |
+
scf.for %arg8 = %c0_i32 to %c256_i32 step %c64_i32 : i32 {
|
117 |
+
%50 = tt.splat %arg8 : (i32) -> tensor<1x64xi32>
|
118 |
+
%51 = arith.addi %50, %6 : tensor<1x64xi32>
|
119 |
+
%52 = arith.cmpi slt, %51, %cst_10 : tensor<1x64xi32>
|
120 |
+
%53 = tt.broadcast %51 : (tensor<1x64xi32>) -> tensor<64x64xi32>
|
121 |
+
%54 = arith.addi %53, %31 : tensor<64x64xi32>
|
122 |
+
%55 = tt.addptr %32, %54 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi32>
|
123 |
+
%56 = tt.broadcast %52 : (tensor<1x64xi1>) -> tensor<64x64xi1>
|
124 |
+
%57 = tt.load %55, %56, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32>
|
125 |
+
%58 = arith.addi %53, %34 : tensor<64x64xi32>
|
126 |
+
%59 = tt.addptr %35, %58 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi32>
|
127 |
+
%60 = tt.load %59, %56, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xbf16>
|
128 |
+
%61 = arith.extf %60 : tensor<64x64xbf16> to tensor<64x64xf32>
|
129 |
+
%62 = tt.addptr %36, %51 : tensor<1x64x!tt.ptr<f32, 1>>, tensor<1x64xi32>
|
130 |
+
%63 = tt.load %62, %52, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32>
|
131 |
+
tt.assert %42, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
|
132 |
+
%64 = arith.extsi %51 : tensor<1x64xi32> to tensor<1x64xi64>
|
133 |
+
%65 = tt.broadcast %64 : (tensor<1x64xi64>) -> tensor<64x64xi64>
|
134 |
+
%66 = arith.addi %65, %44 : tensor<64x64xi64>
|
135 |
+
%67 = tt.addptr %45, %66 : tensor<64x64x!tt.ptr<f32, 1>>, tensor<64x64xi64>
|
136 |
+
%68 = tt.load %67, %56, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32>
|
137 |
+
%69 = arith.addf %68, %57 : tensor<64x64xf32>
|
138 |
+
%70 = arith.addf %69, %61 : tensor<64x64xf32>
|
139 |
+
%71 = arith.subf %70, %46 : tensor<64x64xf32>
|
140 |
+
%72 = tt.extern_elementwise %48 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
|
141 |
+
%73 = tt.broadcast %72 : (tensor<64x1xf32>) -> tensor<64x64xf32>
|
142 |
+
%74 = arith.mulf %71, %73 : tensor<64x64xf32>
|
143 |
+
%75 = tt.broadcast %63 : (tensor<1x64xf32>) -> tensor<64x64xf32>
|
144 |
+
%76 = arith.mulf %74, %75 : tensor<64x64xf32>
|
145 |
+
%77 = tt.addptr %49, %58 : tensor<64x64x!tt.ptr<bf16, 1>>, tensor<64x64xi32>
|
146 |
+
%78 = arith.truncf %76 : tensor<64x64xf32> to tensor<64x64xbf16>
|
147 |
+
tt.store %77, %78, %56 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16>
|
148 |
+
}
|
149 |
+
tt.return
|
150 |
+
}
|
151 |
+
}
|
.triton/dump/550b88a9db74a71f80def697002389b5/triton_.cubin
ADDED
Binary file (13.7 kB). View file
|
|
.triton/dump/550b88a9db74a71f80def697002389b5/triton_.llir
ADDED
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
|
7 |
+
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%7 = and i32 %6, 31, !dbg !8
|
9 |
+
%8 = lshr i32 %6, 5, !dbg !8
|
10 |
+
%9 = shl i32 %6, 2, !dbg !8
|
11 |
+
%10 = and i32 %9, 12, !dbg !8
|
12 |
+
%11 = and i32 %6, 15, !dbg !8
|
13 |
+
%12 = and i32 %8, 7, !dbg !9
|
14 |
+
%13 = lshr i32 %7, 2, !dbg !9
|
15 |
+
%14 = shl nuw nsw i32 %12, 3, !dbg !9
|
16 |
+
%15 = or i32 %14, %13, !dbg !9
|
17 |
+
%16 = or i32 %15, 64, !dbg !9
|
18 |
+
%17 = or i32 %10, 1, !dbg !10
|
19 |
+
%18 = or i32 %10, 2, !dbg !10
|
20 |
+
%19 = or i32 %10, 3, !dbg !10
|
21 |
+
%20 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !14
|
22 |
+
%21 = shl i32 %20, 4, !dbg !15
|
23 |
+
%22 = or i32 %21, %10, !dbg !16
|
24 |
+
%23 = or i32 %21, %11, !dbg !16
|
25 |
+
%24 = icmp ult i32 %16, 120, !dbg !17
|
26 |
+
%25 = shl nuw nsw i32 %15, 17, !dbg !18
|
27 |
+
%26 = shl nuw nsw i32 %16, 17, !dbg !18
|
28 |
+
%27 = add i32 %22, %25, !dbg !19
|
29 |
+
%28 = add i32 %22, %26, !dbg !19
|
30 |
+
%29 = sext i32 %27 to i64, !dbg !20
|
31 |
+
%30 = getelementptr float, ptr addrspace(1) %0, i64 %29, !dbg !20
|
32 |
+
%31 = sext i32 %28 to i64, !dbg !20
|
33 |
+
%32 = getelementptr float, ptr addrspace(1) %0, i64 %31, !dbg !20
|
34 |
+
%33 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %30, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
|
35 |
+
%34 = extractvalue { i32, i32, i32, i32 } %33, 0, !dbg !21
|
36 |
+
%35 = extractvalue { i32, i32, i32, i32 } %33, 1, !dbg !21
|
37 |
+
%36 = extractvalue { i32, i32, i32, i32 } %33, 2, !dbg !21
|
38 |
+
%37 = extractvalue { i32, i32, i32, i32 } %33, 3, !dbg !21
|
39 |
+
%38 = bitcast i32 %34 to float, !dbg !21
|
40 |
+
%39 = bitcast i32 %35 to float, !dbg !21
|
41 |
+
%40 = bitcast i32 %36 to float, !dbg !21
|
42 |
+
%41 = bitcast i32 %37 to float, !dbg !21
|
43 |
+
%42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %32, i1 %24, i32 0, i1 %24, i32 0, i1 %24, i32 0, i1 %24, i32 0, i1 %24) #3, !dbg !21
|
44 |
+
%43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !21
|
45 |
+
%44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !21
|
46 |
+
%45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !21
|
47 |
+
%46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !21
|
48 |
+
%47 = bitcast i32 %43 to float, !dbg !21
|
49 |
+
%48 = bitcast i32 %44 to float, !dbg !21
|
50 |
+
%49 = bitcast i32 %45 to float, !dbg !21
|
51 |
+
%50 = bitcast i32 %46 to float, !dbg !21
|
52 |
+
%51 = fadd float %38, 0.000000e+00, !dbg !22
|
53 |
+
%52 = fadd float %39, 0.000000e+00, !dbg !22
|
54 |
+
%53 = fadd float %40, 0.000000e+00, !dbg !22
|
55 |
+
%54 = fadd float %41, 0.000000e+00, !dbg !22
|
56 |
+
%55 = fadd float %47, 0.000000e+00, !dbg !22
|
57 |
+
%56 = fadd float %48, 0.000000e+00, !dbg !22
|
58 |
+
%57 = fadd float %49, 0.000000e+00, !dbg !22
|
59 |
+
%58 = fadd float %50, 0.000000e+00, !dbg !22
|
60 |
+
%59 = select i1 %24, float %55, float 0.000000e+00, !dbg !23
|
61 |
+
%60 = select i1 %24, float %56, float 0.000000e+00, !dbg !23
|
62 |
+
%61 = select i1 %24, float %57, float 0.000000e+00, !dbg !23
|
63 |
+
%62 = select i1 %24, float %58, float 0.000000e+00, !dbg !23
|
64 |
+
%63 = fadd float %51, %59, !dbg !24
|
65 |
+
%64 = fadd float %52, %60, !dbg !24
|
66 |
+
%65 = fadd float %53, %61, !dbg !24
|
67 |
+
%66 = fadd float %54, %62, !dbg !24
|
68 |
+
%67 = bitcast float %63 to i32, !dbg !10
|
69 |
+
%68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 16, i32 31), !dbg !10
|
70 |
+
%69 = bitcast i32 %68 to float, !dbg !10
|
71 |
+
%70 = fadd float %63, %69, !dbg !24
|
72 |
+
%71 = bitcast float %70 to i32, !dbg !10
|
73 |
+
%72 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %71, i32 8, i32 31), !dbg !10
|
74 |
+
%73 = bitcast i32 %72 to float, !dbg !10
|
75 |
+
%74 = fadd float %70, %73, !dbg !24
|
76 |
+
%75 = bitcast float %74 to i32, !dbg !10
|
77 |
+
%76 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %75, i32 4, i32 31), !dbg !10
|
78 |
+
%77 = bitcast i32 %76 to float, !dbg !10
|
79 |
+
%78 = fadd float %74, %77, !dbg !24
|
80 |
+
%79 = bitcast float %64 to i32, !dbg !10
|
81 |
+
%80 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %79, i32 16, i32 31), !dbg !10
|
82 |
+
%81 = bitcast i32 %80 to float, !dbg !10
|
83 |
+
%82 = fadd float %64, %81, !dbg !24
|
84 |
+
%83 = bitcast float %82 to i32, !dbg !10
|
85 |
+
%84 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %83, i32 8, i32 31), !dbg !10
|
86 |
+
%85 = bitcast i32 %84 to float, !dbg !10
|
87 |
+
%86 = fadd float %82, %85, !dbg !24
|
88 |
+
%87 = bitcast float %86 to i32, !dbg !10
|
89 |
+
%88 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %87, i32 4, i32 31), !dbg !10
|
90 |
+
%89 = bitcast i32 %88 to float, !dbg !10
|
91 |
+
%90 = fadd float %86, %89, !dbg !24
|
92 |
+
%91 = bitcast float %65 to i32, !dbg !10
|
93 |
+
%92 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %91, i32 16, i32 31), !dbg !10
|
94 |
+
%93 = bitcast i32 %92 to float, !dbg !10
|
95 |
+
%94 = fadd float %65, %93, !dbg !24
|
96 |
+
%95 = bitcast float %94 to i32, !dbg !10
|
97 |
+
%96 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %95, i32 8, i32 31), !dbg !10
|
98 |
+
%97 = bitcast i32 %96 to float, !dbg !10
|
99 |
+
%98 = fadd float %94, %97, !dbg !24
|
100 |
+
%99 = bitcast float %98 to i32, !dbg !10
|
101 |
+
%100 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %99, i32 4, i32 31), !dbg !10
|
102 |
+
%101 = bitcast i32 %100 to float, !dbg !10
|
103 |
+
%102 = fadd float %98, %101, !dbg !24
|
104 |
+
%103 = bitcast float %66 to i32, !dbg !10
|
105 |
+
%104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %103, i32 16, i32 31), !dbg !10
|
106 |
+
%105 = bitcast i32 %104 to float, !dbg !10
|
107 |
+
%106 = fadd float %66, %105, !dbg !24
|
108 |
+
%107 = bitcast float %106 to i32, !dbg !10
|
109 |
+
%108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 8, i32 31), !dbg !10
|
110 |
+
%109 = bitcast i32 %108 to float, !dbg !10
|
111 |
+
%110 = fadd float %106, %109, !dbg !24
|
112 |
+
%111 = bitcast float %110 to i32, !dbg !10
|
113 |
+
%112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %111, i32 4, i32 31), !dbg !10
|
114 |
+
%113 = bitcast i32 %112 to float, !dbg !10
|
115 |
+
%114 = fadd float %110, %113, !dbg !24
|
116 |
+
%115 = icmp ult i32 %7, 4, !dbg !10
|
117 |
+
%116 = shl nuw nsw i32 %10, 3, !dbg !10
|
118 |
+
%117 = or i32 %116, %12, !dbg !10
|
119 |
+
%118 = zext nneg i32 %117 to i64, !dbg !10
|
120 |
+
%119 = getelementptr float, ptr addrspace(3) @global_smem, i64 %118, !dbg !10
|
121 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %119, float %78, i1 %115) #3, !dbg !10
|
122 |
+
%120 = shl nuw nsw i32 %17, 3, !dbg !10
|
123 |
+
%121 = or i32 %120, %12, !dbg !10
|
124 |
+
%122 = zext nneg i32 %121 to i64, !dbg !10
|
125 |
+
%123 = getelementptr float, ptr addrspace(3) @global_smem, i64 %122, !dbg !10
|
126 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %123, float %90, i1 %115) #3, !dbg !10
|
127 |
+
%124 = shl nuw nsw i32 %18, 3, !dbg !10
|
128 |
+
%125 = or i32 %124, %12, !dbg !10
|
129 |
+
%126 = zext nneg i32 %125 to i64, !dbg !10
|
130 |
+
%127 = getelementptr float, ptr addrspace(3) @global_smem, i64 %126, !dbg !10
|
131 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %127, float %102, i1 %115) #3, !dbg !10
|
132 |
+
%128 = shl nuw nsw i32 %19, 3, !dbg !10
|
133 |
+
%129 = or i32 %128, %12, !dbg !10
|
134 |
+
%130 = zext nneg i32 %129 to i64, !dbg !10
|
135 |
+
%131 = getelementptr float, ptr addrspace(3) @global_smem, i64 %130, !dbg !10
|
136 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %131, float %114, i1 %115) #3, !dbg !10
|
137 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !10
|
138 |
+
%132 = icmp slt i32 %6, 128, !dbg !10
|
139 |
+
%133 = sext i32 %6 to i64, !dbg !10
|
140 |
+
%134 = getelementptr float, ptr addrspace(3) @global_smem, i64 %133, !dbg !10
|
141 |
+
%135 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %134, i1 %132) #3, !dbg !10
|
142 |
+
%136 = bitcast float %135 to i32, !dbg !10
|
143 |
+
%137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 4, i32 31), !dbg !10
|
144 |
+
%138 = bitcast i32 %137 to float, !dbg !10
|
145 |
+
%139 = fadd float %135, %138, !dbg !24
|
146 |
+
%140 = bitcast float %139 to i32, !dbg !10
|
147 |
+
%141 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %140, i32 2, i32 31), !dbg !10
|
148 |
+
%142 = bitcast i32 %141 to float, !dbg !10
|
149 |
+
%143 = fadd float %139, %142, !dbg !24
|
150 |
+
%144 = bitcast float %143 to i32, !dbg !10
|
151 |
+
%145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 1, i32 31), !dbg !10
|
152 |
+
%146 = bitcast i32 %145 to float, !dbg !10
|
153 |
+
%147 = fadd float %143, %146, !dbg !24
|
154 |
+
%148 = and i32 %6, 7, !dbg !10
|
155 |
+
%149 = icmp eq i32 %148, 0, !dbg !10
|
156 |
+
%150 = and i1 %132, %149, !dbg !10
|
157 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %134, float %147, i1 %150) #3, !dbg !10
|
158 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !10
|
159 |
+
%151 = zext nneg i32 %116 to i64, !dbg !10
|
160 |
+
%152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !10
|
161 |
+
%153 = load float, ptr addrspace(3) %152, align 4, !dbg !10
|
162 |
+
%154 = zext nneg i32 %120 to i64, !dbg !10
|
163 |
+
%155 = getelementptr float, ptr addrspace(3) @global_smem, i64 %154, !dbg !10
|
164 |
+
%156 = load float, ptr addrspace(3) %155, align 4, !dbg !10
|
165 |
+
%157 = zext nneg i32 %124 to i64, !dbg !10
|
166 |
+
%158 = getelementptr float, ptr addrspace(3) @global_smem, i64 %157, !dbg !10
|
167 |
+
%159 = load float, ptr addrspace(3) %158, align 4, !dbg !10
|
168 |
+
%160 = zext nneg i32 %128 to i64, !dbg !10
|
169 |
+
%161 = getelementptr float, ptr addrspace(3) @global_smem, i64 %160, !dbg !10
|
170 |
+
%162 = load float, ptr addrspace(3) %161, align 4, !dbg !10
|
171 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !28
|
172 |
+
%163 = zext nneg i32 %10 to i64, !dbg !28
|
173 |
+
%164 = getelementptr float, ptr addrspace(3) @global_smem, i64 %163, !dbg !28
|
174 |
+
%165 = insertelement <1 x float> undef, float %153, i64 0, !dbg !28
|
175 |
+
store <1 x float> %165, ptr addrspace(3) %164, align 4, !dbg !28
|
176 |
+
%166 = zext nneg i32 %17 to i64, !dbg !28
|
177 |
+
%167 = getelementptr float, ptr addrspace(3) @global_smem, i64 %166, !dbg !28
|
178 |
+
%168 = insertelement <1 x float> undef, float %156, i64 0, !dbg !28
|
179 |
+
store <1 x float> %168, ptr addrspace(3) %167, align 4, !dbg !28
|
180 |
+
%169 = zext nneg i32 %18 to i64, !dbg !28
|
181 |
+
%170 = getelementptr float, ptr addrspace(3) @global_smem, i64 %169, !dbg !28
|
182 |
+
%171 = insertelement <1 x float> undef, float %159, i64 0, !dbg !28
|
183 |
+
store <1 x float> %171, ptr addrspace(3) %170, align 4, !dbg !28
|
184 |
+
%172 = zext nneg i32 %19 to i64, !dbg !28
|
185 |
+
%173 = getelementptr float, ptr addrspace(3) @global_smem, i64 %172, !dbg !28
|
186 |
+
%174 = insertelement <1 x float> undef, float %162, i64 0, !dbg !28
|
187 |
+
store <1 x float> %174, ptr addrspace(3) %173, align 4, !dbg !28
|
188 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !28
|
189 |
+
%175 = zext nneg i32 %11 to i64, !dbg !28
|
190 |
+
%176 = getelementptr float, ptr addrspace(3) @global_smem, i64 %175, !dbg !28
|
191 |
+
%177 = load <1 x float>, ptr addrspace(3) %176, align 4, !dbg !28
|
192 |
+
%.frozen = freeze i32 %23
|
193 |
+
%178 = sdiv i32 %.frozen, 256, !dbg !29
|
194 |
+
%179 = mul i32 %178, 256
|
195 |
+
%.decomposed = sub i32 %.frozen, %179
|
196 |
+
%180 = sext i32 %178 to i64, !dbg !30
|
197 |
+
%181 = getelementptr i64, ptr addrspace(1) %1, i64 %180, !dbg !30
|
198 |
+
%182 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %181, i1 true) #3, !dbg !31
|
199 |
+
%183 = lshr i64 %182, 54, !dbg !32
|
200 |
+
%184 = and i64 %183, 512, !dbg !32
|
201 |
+
%185 = add i64 %184, %182, !dbg !32
|
202 |
+
%186 = shl i64 %185, 8, !dbg !33
|
203 |
+
%187 = sext i32 %.decomposed to i64, !dbg !34
|
204 |
+
%188 = getelementptr float, ptr addrspace(1) %2, i64 %186, !dbg !35
|
205 |
+
%189 = getelementptr float, ptr addrspace(1) %188, i64 %187, !dbg !35
|
206 |
+
%190 = lshr i32 %7, 4, !dbg !36
|
207 |
+
%191 = shl nuw nsw i32 %12, 1, !dbg !36
|
208 |
+
%192 = or i32 %191, %190, !dbg !36
|
209 |
+
%193 = icmp eq i32 %192, 0, !dbg !36
|
210 |
+
%194 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %189, <1 x float> %177, i1 %193) #3, !dbg !36
|
211 |
+
ret void, !dbg !37
|
212 |
+
}
|
213 |
+
|
214 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
215 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
216 |
+
|
217 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
218 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
219 |
+
|
220 |
+
; Function Attrs: convergent nocallback nounwind
|
221 |
+
declare void @llvm.nvvm.barrier0() #2
|
222 |
+
|
223 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
224 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
225 |
+
attributes #2 = { convergent nocallback nounwind }
|
226 |
+
attributes #3 = { nounwind }
|
227 |
+
|
228 |
+
!llvm.module.flags = !{!0}
|
229 |
+
!llvm.dbg.cu = !{!1}
|
230 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
231 |
+
|
232 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
233 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
234 |
+
!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
|
235 |
+
!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
|
236 |
+
!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256}
|
237 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
238 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
239 |
+
!7 = !{}
|
240 |
+
!8 = !DILocation(line: 22, column: 44, scope: !5)
|
241 |
+
!9 = !DILocation(line: 24, column: 33, scope: !5)
|
242 |
+
!10 = !DILocation(line: 243, column: 36, scope: !11, inlinedAt: !13)
|
243 |
+
!11 = distinct !DILexicalBlockFile(scope: !5, file: !12, discriminator: 0)
|
244 |
+
!12 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
245 |
+
!13 = !DILocation(line: 35, column: 25, scope: !11)
|
246 |
+
!14 = !DILocation(line: 21, column: 28, scope: !5)
|
247 |
+
!15 = !DILocation(line: 21, column: 33, scope: !5)
|
248 |
+
!16 = !DILocation(line: 22, column: 23, scope: !5)
|
249 |
+
!17 = !DILocation(line: 29, column: 25, scope: !5)
|
250 |
+
!18 = !DILocation(line: 31, column: 47, scope: !5)
|
251 |
+
!19 = !DILocation(line: 31, column: 40, scope: !5)
|
252 |
+
!20 = !DILocation(line: 31, column: 34, scope: !5)
|
253 |
+
!21 = !DILocation(line: 31, column: 53, scope: !5)
|
254 |
+
!22 = !DILocation(line: 33, column: 23, scope: !5)
|
255 |
+
!23 = !DILocation(line: 34, column: 38, scope: !5)
|
256 |
+
!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
|
257 |
+
!25 = distinct !DILexicalBlockFile(scope: !11, file: !12, discriminator: 0)
|
258 |
+
!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
|
259 |
+
!27 = !DILocation(line: 35, column: 25, scope: !25)
|
260 |
+
!28 = !DILocation(line: 35, column: 28, scope: !5)
|
261 |
+
!29 = !DILocation(line: 36, column: 20, scope: !5)
|
262 |
+
!30 = !DILocation(line: 38, column: 30, scope: !5)
|
263 |
+
!31 = !DILocation(line: 38, column: 35, scope: !5)
|
264 |
+
!32 = !DILocation(line: 41, column: 32, scope: !5)
|
265 |
+
!33 = !DILocation(line: 45, column: 40, scope: !5)
|
266 |
+
!34 = !DILocation(line: 45, column: 36, scope: !5)
|
267 |
+
!35 = !DILocation(line: 45, column: 30, scope: !5)
|
268 |
+
!36 = !DILocation(line: 45, column: 55, scope: !5)
|
269 |
+
!37 = !DILocation(line: 45, column: 4, scope: !5)
|
.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ptx
ADDED
@@ -0,0 +1,642 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3de4e
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3de4e(
|
13 |
+
.param .u64 triton__0d1d2d3de4e_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3de4e_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3de4e_param_2,
|
16 |
+
.param .u32 triton__0d1d2d3de4e_param_3,
|
17 |
+
.param .u32 triton__0d1d2d3de4e_param_4
|
18 |
+
)
|
19 |
+
.maxntid 256, 1, 1
|
20 |
+
{
|
21 |
+
.reg .pred %p<20>;
|
22 |
+
.reg .b32 %r<107>;
|
23 |
+
.reg .f32 %f<60>;
|
24 |
+
.reg .b64 %rd<18>;
|
25 |
+
.loc 1 18 0
|
26 |
+
$L__func_begin0:
|
27 |
+
.loc 1 18 0
|
28 |
+
|
29 |
+
ld.param.u64 %rd6, [triton__0d1d2d3de4e_param_0];
|
30 |
+
ld.param.u64 %rd7, [triton__0d1d2d3de4e_param_1];
|
31 |
+
$L__tmp0:
|
32 |
+
.loc 1 22 44
|
33 |
+
mov.u32 %r32, %tid.x;
|
34 |
+
and.b32 %r33, %r32, 31;
|
35 |
+
ld.param.u64 %rd8, [triton__0d1d2d3de4e_param_2];
|
36 |
+
shl.b32 %r34, %r32, 2;
|
37 |
+
and.b32 %r35, %r34, 12;
|
38 |
+
and.b32 %r36, %r32, 15;
|
39 |
+
.loc 1 24 33
|
40 |
+
bfe.u32 %r37, %r32, 5, 3;
|
41 |
+
bfe.u32 %r38, %r32, 2, 3;
|
42 |
+
shl.b32 %r39, %r37, 3;
|
43 |
+
or.b32 %r40, %r39, %r38;
|
44 |
+
or.b32 %r41, %r40, 64;
|
45 |
+
.loc 1 21 28
|
46 |
+
mov.u32 %r1, %ctaid.x;
|
47 |
+
.loc 1 21 33
|
48 |
+
shl.b32 %r42, %r1, 4;
|
49 |
+
.loc 1 22 23
|
50 |
+
or.b32 %r43, %r42, %r35;
|
51 |
+
or.b32 %r44, %r42, %r36;
|
52 |
+
.loc 1 29 25
|
53 |
+
setp.lt.u32 %p6, %r41, 120;
|
54 |
+
.loc 1 31 47
|
55 |
+
shl.b32 %r45, %r40, 17;
|
56 |
+
shl.b32 %r46, %r41, 17;
|
57 |
+
.loc 1 31 40
|
58 |
+
add.s32 %r47, %r43, %r45;
|
59 |
+
add.s32 %r48, %r43, %r46;
|
60 |
+
.loc 1 31 34
|
61 |
+
mul.wide.s32 %rd9, %r47, 4;
|
62 |
+
add.s64 %rd1, %rd6, %rd9;
|
63 |
+
mul.wide.s32 %rd10, %r48, 4;
|
64 |
+
add.s64 %rd2, %rd6, %rd10;
|
65 |
+
mov.b32 %r6, 0;
|
66 |
+
mov.pred %p1, -1;
|
67 |
+
.loc 1 31 53
|
68 |
+
mov.u32 %r2, 0x0;
|
69 |
+
mov.u32 %r3, 0x0;
|
70 |
+
mov.u32 %r4, 0x0;
|
71 |
+
mov.u32 %r5, 0x0;
|
72 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
73 |
+
@!%p1 mov.u32 %r2, %r6;
|
74 |
+
@!%p1 mov.u32 %r3, %r6;
|
75 |
+
@!%p1 mov.u32 %r4, %r6;
|
76 |
+
@!%p1 mov.u32 %r5, %r6;
|
77 |
+
mov.b32 %f1, %r2;
|
78 |
+
mov.b32 %f2, %r3;
|
79 |
+
mov.b32 %f3, %r4;
|
80 |
+
mov.b32 %f4, %r5;
|
81 |
+
mov.u32 %r10, 0x0;
|
82 |
+
mov.u32 %r11, 0x0;
|
83 |
+
mov.u32 %r12, 0x0;
|
84 |
+
mov.u32 %r13, 0x0;
|
85 |
+
@%p6 ld.global.L1::evict_first.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
|
86 |
+
@!%p6 mov.u32 %r10, %r6;
|
87 |
+
@!%p6 mov.u32 %r11, %r6;
|
88 |
+
@!%p6 mov.u32 %r12, %r6;
|
89 |
+
@!%p6 mov.u32 %r13, %r6;
|
90 |
+
mov.b32 %f5, %r10;
|
91 |
+
mov.b32 %f6, %r11;
|
92 |
+
mov.b32 %f7, %r12;
|
93 |
+
mov.b32 %f8, %r13;
|
94 |
+
.loc 1 33 23
|
95 |
+
add.f32 %f9, %f1, 0f00000000;
|
96 |
+
add.f32 %f10, %f2, 0f00000000;
|
97 |
+
add.f32 %f11, %f3, 0f00000000;
|
98 |
+
add.f32 %f12, %f4, 0f00000000;
|
99 |
+
add.f32 %f13, %f5, 0f00000000;
|
100 |
+
add.f32 %f14, %f6, 0f00000000;
|
101 |
+
add.f32 %f15, %f7, 0f00000000;
|
102 |
+
add.f32 %f16, %f8, 0f00000000;
|
103 |
+
.loc 1 34 38
|
104 |
+
selp.f32 %f17, %f13, 0f00000000, %p6;
|
105 |
+
selp.f32 %f18, %f14, 0f00000000, %p6;
|
106 |
+
selp.f32 %f19, %f15, 0f00000000, %p6;
|
107 |
+
selp.f32 %f20, %f16, 0f00000000, %p6;
|
108 |
+
$L__tmp1:
|
109 |
+
.loc 2 233 15
|
110 |
+
add.f32 %f21, %f9, %f17;
|
111 |
+
add.f32 %f22, %f10, %f18;
|
112 |
+
add.f32 %f23, %f11, %f19;
|
113 |
+
add.f32 %f24, %f12, %f20;
|
114 |
+
$L__tmp2:
|
115 |
+
.loc 2 243 36
|
116 |
+
mov.b32 %r49, %f21;
|
117 |
+
shfl.sync.bfly.b32 %r50, %r49, 16, 31, -1;
|
118 |
+
mov.b32 %f25, %r50;
|
119 |
+
$L__tmp3:
|
120 |
+
.loc 2 233 15
|
121 |
+
add.f32 %f26, %f21, %f25;
|
122 |
+
$L__tmp4:
|
123 |
+
.loc 2 243 36
|
124 |
+
mov.b32 %r51, %f26;
|
125 |
+
shfl.sync.bfly.b32 %r52, %r51, 8, 31, -1;
|
126 |
+
mov.b32 %f27, %r52;
|
127 |
+
$L__tmp5:
|
128 |
+
.loc 2 233 15
|
129 |
+
add.f32 %f28, %f26, %f27;
|
130 |
+
$L__tmp6:
|
131 |
+
.loc 2 243 36
|
132 |
+
mov.b32 %r53, %f28;
|
133 |
+
shfl.sync.bfly.b32 %r54, %r53, 4, 31, -1;
|
134 |
+
mov.b32 %f29, %r54;
|
135 |
+
$L__tmp7:
|
136 |
+
.loc 2 233 15
|
137 |
+
add.f32 %f30, %f28, %f29;
|
138 |
+
$L__tmp8:
|
139 |
+
.loc 2 243 36
|
140 |
+
mov.b32 %r55, %f22;
|
141 |
+
shfl.sync.bfly.b32 %r56, %r55, 16, 31, -1;
|
142 |
+
mov.b32 %f31, %r56;
|
143 |
+
$L__tmp9:
|
144 |
+
.loc 2 233 15
|
145 |
+
add.f32 %f32, %f22, %f31;
|
146 |
+
$L__tmp10:
|
147 |
+
.loc 2 243 36
|
148 |
+
mov.b32 %r57, %f32;
|
149 |
+
shfl.sync.bfly.b32 %r58, %r57, 8, 31, -1;
|
150 |
+
mov.b32 %f33, %r58;
|
151 |
+
$L__tmp11:
|
152 |
+
.loc 2 233 15
|
153 |
+
add.f32 %f34, %f32, %f33;
|
154 |
+
$L__tmp12:
|
155 |
+
.loc 2 243 36
|
156 |
+
mov.b32 %r59, %f34;
|
157 |
+
shfl.sync.bfly.b32 %r60, %r59, 4, 31, -1;
|
158 |
+
mov.b32 %f35, %r60;
|
159 |
+
$L__tmp13:
|
160 |
+
.loc 2 233 15
|
161 |
+
add.f32 %f36, %f34, %f35;
|
162 |
+
$L__tmp14:
|
163 |
+
.loc 2 243 36
|
164 |
+
mov.b32 %r61, %f23;
|
165 |
+
shfl.sync.bfly.b32 %r62, %r61, 16, 31, -1;
|
166 |
+
mov.b32 %f37, %r62;
|
167 |
+
$L__tmp15:
|
168 |
+
.loc 2 233 15
|
169 |
+
add.f32 %f38, %f23, %f37;
|
170 |
+
$L__tmp16:
|
171 |
+
.loc 2 243 36
|
172 |
+
mov.b32 %r63, %f38;
|
173 |
+
shfl.sync.bfly.b32 %r64, %r63, 8, 31, -1;
|
174 |
+
mov.b32 %f39, %r64;
|
175 |
+
$L__tmp17:
|
176 |
+
.loc 2 233 15
|
177 |
+
add.f32 %f40, %f38, %f39;
|
178 |
+
$L__tmp18:
|
179 |
+
.loc 2 243 36
|
180 |
+
mov.b32 %r65, %f40;
|
181 |
+
shfl.sync.bfly.b32 %r66, %r65, 4, 31, -1;
|
182 |
+
mov.b32 %f41, %r66;
|
183 |
+
$L__tmp19:
|
184 |
+
.loc 2 233 15
|
185 |
+
add.f32 %f42, %f40, %f41;
|
186 |
+
$L__tmp20:
|
187 |
+
.loc 2 243 36
|
188 |
+
mov.b32 %r67, %f24;
|
189 |
+
shfl.sync.bfly.b32 %r68, %r67, 16, 31, -1;
|
190 |
+
mov.b32 %f43, %r68;
|
191 |
+
$L__tmp21:
|
192 |
+
.loc 2 233 15
|
193 |
+
add.f32 %f44, %f24, %f43;
|
194 |
+
$L__tmp22:
|
195 |
+
.loc 2 243 36
|
196 |
+
mov.b32 %r69, %f44;
|
197 |
+
shfl.sync.bfly.b32 %r70, %r69, 8, 31, -1;
|
198 |
+
mov.b32 %f45, %r70;
|
199 |
+
$L__tmp23:
|
200 |
+
.loc 2 233 15
|
201 |
+
add.f32 %f46, %f44, %f45;
|
202 |
+
$L__tmp24:
|
203 |
+
.loc 2 243 36
|
204 |
+
mov.b32 %r71, %f46;
|
205 |
+
shfl.sync.bfly.b32 %r72, %r71, 4, 31, -1;
|
206 |
+
mov.b32 %f47, %r72;
|
207 |
+
$L__tmp25:
|
208 |
+
.loc 2 233 15
|
209 |
+
add.f32 %f48, %f46, %f47;
|
210 |
+
$L__tmp26:
|
211 |
+
.loc 2 243 36
|
212 |
+
setp.lt.u32 %p11, %r33, 4;
|
213 |
+
shl.b32 %r73, %r37, 2;
|
214 |
+
shl.b32 %r74, %r35, 5;
|
215 |
+
or.b32 %r75, %r74, %r73;
|
216 |
+
mov.u32 %r76, global_smem;
|
217 |
+
add.s32 %r18, %r76, %r75;
|
218 |
+
mov.b32 %r19, %f30;
|
219 |
+
@%p11 st.shared.b32 [ %r18 + 0 ], %r19;
|
220 |
+
or.b32 %r77, %r74, 32;
|
221 |
+
or.b32 %r78, %r77, %r73;
|
222 |
+
add.s32 %r20, %r76, %r78;
|
223 |
+
mov.b32 %r21, %f36;
|
224 |
+
@%p11 st.shared.b32 [ %r20 + 0 ], %r21;
|
225 |
+
or.b32 %r79, %r74, 64;
|
226 |
+
or.b32 %r80, %r79, %r73;
|
227 |
+
add.s32 %r22, %r76, %r80;
|
228 |
+
mov.b32 %r23, %f42;
|
229 |
+
@%p11 st.shared.b32 [ %r22 + 0 ], %r23;
|
230 |
+
or.b32 %r81, %r74, 96;
|
231 |
+
or.b32 %r82, %r81, %r73;
|
232 |
+
add.s32 %r24, %r76, %r82;
|
233 |
+
mov.b32 %r25, %f48;
|
234 |
+
@%p11 st.shared.b32 [ %r24 + 0 ], %r25;
|
235 |
+
bar.sync 0;
|
236 |
+
setp.lt.s32 %p15, %r32, 128;
|
237 |
+
add.s32 %r27, %r76, %r34;
|
238 |
+
@%p15 ld.shared.b32 %r26, [ %r27 + 0 ];
|
239 |
+
mov.b32 %f49, %r26;
|
240 |
+
shfl.sync.bfly.b32 %r83, %r26, 4, 31, -1;
|
241 |
+
mov.b32 %f50, %r83;
|
242 |
+
$L__tmp27:
|
243 |
+
.loc 2 233 15
|
244 |
+
add.f32 %f51, %f49, %f50;
|
245 |
+
$L__tmp28:
|
246 |
+
.loc 2 243 36
|
247 |
+
mov.b32 %r84, %f51;
|
248 |
+
shfl.sync.bfly.b32 %r85, %r84, 2, 31, -1;
|
249 |
+
mov.b32 %f52, %r85;
|
250 |
+
$L__tmp29:
|
251 |
+
.loc 2 233 15
|
252 |
+
add.f32 %f53, %f51, %f52;
|
253 |
+
$L__tmp30:
|
254 |
+
.loc 2 243 36
|
255 |
+
mov.b32 %r86, %f53;
|
256 |
+
shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1;
|
257 |
+
mov.b32 %f54, %r87;
|
258 |
+
$L__tmp31:
|
259 |
+
.loc 2 233 15
|
260 |
+
add.f32 %f55, %f53, %f54;
|
261 |
+
$L__tmp32:
|
262 |
+
.loc 2 243 36
|
263 |
+
and.b32 %r88, %r32, 7;
|
264 |
+
setp.eq.s32 %p19, %r88, 0;
|
265 |
+
and.pred %p16, %p15, %p19;
|
266 |
+
mov.b32 %r29, %f55;
|
267 |
+
@%p16 st.shared.b32 [ %r27 + 0 ], %r29;
|
268 |
+
bar.sync 0;
|
269 |
+
add.s32 %r89, %r76, %r74;
|
270 |
+
ld.shared.f32 %f56, [%r89];
|
271 |
+
add.s32 %r90, %r76, %r77;
|
272 |
+
ld.shared.f32 %f57, [%r90];
|
273 |
+
add.s32 %r91, %r76, %r79;
|
274 |
+
ld.shared.f32 %f58, [%r91];
|
275 |
+
add.s32 %r92, %r76, %r81;
|
276 |
+
ld.shared.f32 %f59, [%r92];
|
277 |
+
$L__tmp33:
|
278 |
+
.loc 1 35 28
|
279 |
+
bar.sync 0;
|
280 |
+
shl.b32 %r93, %r35, 2;
|
281 |
+
add.s32 %r94, %r76, %r93;
|
282 |
+
st.shared.f32 [%r94], %f56;
|
283 |
+
st.shared.f32 [%r94+4], %f57;
|
284 |
+
st.shared.f32 [%r94+8], %f58;
|
285 |
+
st.shared.f32 [%r94+12], %f59;
|
286 |
+
bar.sync 0;
|
287 |
+
shl.b32 %r95, %r36, 2;
|
288 |
+
add.s32 %r96, %r76, %r95;
|
289 |
+
.loc 1 36 20
|
290 |
+
shr.s32 %r98, %r44, 31;
|
291 |
+
shr.u32 %r99, %r98, 24;
|
292 |
+
add.s32 %r100, %r44, %r99;
|
293 |
+
shr.s32 %r101, %r100, 8;
|
294 |
+
and.b32 %r102, %r100, -256;
|
295 |
+
sub.s32 %r103, %r44, %r102;
|
296 |
+
.loc 1 38 30
|
297 |
+
mul.wide.s32 %rd11, %r101, 8;
|
298 |
+
add.s64 %rd4, %rd7, %rd11;
|
299 |
+
.loc 1 45 55
|
300 |
+
ld.shared.u32 %r31, [%r96];
|
301 |
+
.loc 1 38 35
|
302 |
+
mov.u64 %rd3, 0x0;
|
303 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd3 }, [ %rd4 + 0 ];
|
304 |
+
.loc 1 41 32
|
305 |
+
shr.u64 %rd12, %rd3, 54;
|
306 |
+
and.b64 %rd13, %rd12, 512;
|
307 |
+
add.s64 %rd14, %rd13, %rd3;
|
308 |
+
.loc 1 45 30
|
309 |
+
shl.b64 %rd15, %rd14, 10;
|
310 |
+
add.s64 %rd16, %rd8, %rd15;
|
311 |
+
mul.wide.s32 %rd17, %r103, 4;
|
312 |
+
add.s64 %rd5, %rd16, %rd17;
|
313 |
+
.loc 1 45 55
|
314 |
+
bfe.u32 %r104, %r32, 4, 1;
|
315 |
+
shl.b32 %r105, %r37, 1;
|
316 |
+
or.b32 %r106, %r105, %r104;
|
317 |
+
setp.eq.s32 %p18, %r106, 0;
|
318 |
+
mov.u32 %r30, 0x0;
|
319 |
+
@%p18 atom.global.gpu.acq_rel.add.f32 %r30, [ %rd5 + 0 ], %r31;
|
320 |
+
.loc 1 45 4
|
321 |
+
ret;
|
322 |
+
$L__tmp34:
|
323 |
+
$L__func_end0:
|
324 |
+
|
325 |
+
}
|
326 |
+
.file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
|
327 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
328 |
+
.section .debug_abbrev
|
329 |
+
{
|
330 |
+
.b8 1
|
331 |
+
.b8 17
|
332 |
+
.b8 1
|
333 |
+
.b8 37
|
334 |
+
.b8 8
|
335 |
+
.b8 19
|
336 |
+
.b8 5
|
337 |
+
.b8 3
|
338 |
+
.b8 8
|
339 |
+
.b8 16
|
340 |
+
.b8 6
|
341 |
+
.b8 27
|
342 |
+
.b8 8
|
343 |
+
.b8 180
|
344 |
+
.b8 66
|
345 |
+
.b8 12
|
346 |
+
.b8 17
|
347 |
+
.b8 1
|
348 |
+
.b8 18
|
349 |
+
.b8 1
|
350 |
+
.b8 0
|
351 |
+
.b8 0
|
352 |
+
.b8 2
|
353 |
+
.b8 46
|
354 |
+
.b8 0
|
355 |
+
.b8 135
|
356 |
+
.b8 64
|
357 |
+
.b8 8
|
358 |
+
.b8 3
|
359 |
+
.b8 8
|
360 |
+
.b8 58
|
361 |
+
.b8 11
|
362 |
+
.b8 59
|
363 |
+
.b8 11
|
364 |
+
.b8 63
|
365 |
+
.b8 12
|
366 |
+
.b8 32
|
367 |
+
.b8 11
|
368 |
+
.b8 0
|
369 |
+
.b8 0
|
370 |
+
.b8 3
|
371 |
+
.b8 46
|
372 |
+
.b8 1
|
373 |
+
.b8 17
|
374 |
+
.b8 1
|
375 |
+
.b8 18
|
376 |
+
.b8 1
|
377 |
+
.b8 64
|
378 |
+
.b8 10
|
379 |
+
.b8 49
|
380 |
+
.b8 19
|
381 |
+
.b8 0
|
382 |
+
.b8 0
|
383 |
+
.b8 4
|
384 |
+
.b8 29
|
385 |
+
.b8 1
|
386 |
+
.b8 49
|
387 |
+
.b8 19
|
388 |
+
.b8 17
|
389 |
+
.b8 1
|
390 |
+
.b8 18
|
391 |
+
.b8 1
|
392 |
+
.b8 88
|
393 |
+
.b8 11
|
394 |
+
.b8 89
|
395 |
+
.b8 11
|
396 |
+
.b8 87
|
397 |
+
.b8 11
|
398 |
+
.b8 0
|
399 |
+
.b8 0
|
400 |
+
.b8 5
|
401 |
+
.b8 29
|
402 |
+
.b8 0
|
403 |
+
.b8 49
|
404 |
+
.b8 19
|
405 |
+
.b8 17
|
406 |
+
.b8 1
|
407 |
+
.b8 18
|
408 |
+
.b8 1
|
409 |
+
.b8 88
|
410 |
+
.b8 11
|
411 |
+
.b8 89
|
412 |
+
.b8 11
|
413 |
+
.b8 87
|
414 |
+
.b8 11
|
415 |
+
.b8 0
|
416 |
+
.b8 0
|
417 |
+
.b8 0
|
418 |
+
}
|
419 |
+
.section .debug_info
|
420 |
+
{
|
421 |
+
.b32 264
|
422 |
+
.b8 2
|
423 |
+
.b8 0
|
424 |
+
.b32 .debug_abbrev
|
425 |
+
.b8 8
|
426 |
+
.b8 1
|
427 |
+
.b8 116
|
428 |
+
.b8 114
|
429 |
+
.b8 105
|
430 |
+
.b8 116
|
431 |
+
.b8 111
|
432 |
+
.b8 110
|
433 |
+
.b8 0
|
434 |
+
.b8 2
|
435 |
+
.b8 0
|
436 |
+
.b8 99
|
437 |
+
.b8 54
|
438 |
+
.b8 105
|
439 |
+
.b8 107
|
440 |
+
.b8 53
|
441 |
+
.b8 118
|
442 |
+
.b8 120
|
443 |
+
.b8 55
|
444 |
+
.b8 112
|
445 |
+
.b8 50
|
446 |
+
.b8 50
|
447 |
+
.b8 102
|
448 |
+
.b8 112
|
449 |
+
.b8 107
|
450 |
+
.b8 52
|
451 |
+
.b8 100
|
452 |
+
.b8 99
|
453 |
+
.b8 118
|
454 |
+
.b8 104
|
455 |
+
.b8 53
|
456 |
+
.b8 53
|
457 |
+
.b8 122
|
458 |
+
.b8 105
|
459 |
+
.b8 109
|
460 |
+
.b8 119
|
461 |
+
.b8 52
|
462 |
+
.b8 116
|
463 |
+
.b8 53
|
464 |
+
.b8 110
|
465 |
+
.b8 114
|
466 |
+
.b8 53
|
467 |
+
.b8 122
|
468 |
+
.b8 110
|
469 |
+
.b8 50
|
470 |
+
.b8 98
|
471 |
+
.b8 55
|
472 |
+
.b8 105
|
473 |
+
.b8 110
|
474 |
+
.b8 117
|
475 |
+
.b8 106
|
476 |
+
.b8 120
|
477 |
+
.b8 106
|
478 |
+
.b8 97
|
479 |
+
.b8 117
|
480 |
+
.b8 120
|
481 |
+
.b8 115
|
482 |
+
.b8 104
|
483 |
+
.b8 108
|
484 |
+
.b8 106
|
485 |
+
.b8 117
|
486 |
+
.b8 109
|
487 |
+
.b8 109
|
488 |
+
.b8 46
|
489 |
+
.b8 112
|
490 |
+
.b8 121
|
491 |
+
.b8 0
|
492 |
+
.b32 .debug_line
|
493 |
+
.b8 47
|
494 |
+
.b8 116
|
495 |
+
.b8 109
|
496 |
+
.b8 112
|
497 |
+
.b8 47
|
498 |
+
.b8 116
|
499 |
+
.b8 111
|
500 |
+
.b8 114
|
501 |
+
.b8 99
|
502 |
+
.b8 104
|
503 |
+
.b8 105
|
504 |
+
.b8 110
|
505 |
+
.b8 100
|
506 |
+
.b8 117
|
507 |
+
.b8 99
|
508 |
+
.b8 116
|
509 |
+
.b8 111
|
510 |
+
.b8 114
|
511 |
+
.b8 95
|
512 |
+
.b8 114
|
513 |
+
.b8 111
|
514 |
+
.b8 111
|
515 |
+
.b8 116
|
516 |
+
.b8 47
|
517 |
+
.b8 54
|
518 |
+
.b8 105
|
519 |
+
.b8 0
|
520 |
+
.b8 1
|
521 |
+
.b64 $L__func_begin0
|
522 |
+
.b64 $L__func_end0
|
523 |
+
.b8 2
|
524 |
+
.b8 116
|
525 |
+
.b8 114
|
526 |
+
.b8 105
|
527 |
+
.b8 116
|
528 |
+
.b8 111
|
529 |
+
.b8 110
|
530 |
+
.b8 95
|
531 |
+
.b8 95
|
532 |
+
.b8 48
|
533 |
+
.b8 100
|
534 |
+
.b8 49
|
535 |
+
.b8 100
|
536 |
+
.b8 50
|
537 |
+
.b8 100
|
538 |
+
.b8 51
|
539 |
+
.b8 100
|
540 |
+
.b8 101
|
541 |
+
.b8 52
|
542 |
+
.b8 101
|
543 |
+
.b8 0
|
544 |
+
.b8 116
|
545 |
+
.b8 114
|
546 |
+
.b8 105
|
547 |
+
.b8 116
|
548 |
+
.b8 111
|
549 |
+
.b8 110
|
550 |
+
.b8 95
|
551 |
+
.b8 95
|
552 |
+
.b8 48
|
553 |
+
.b8 100
|
554 |
+
.b8 49
|
555 |
+
.b8 100
|
556 |
+
.b8 50
|
557 |
+
.b8 100
|
558 |
+
.b8 51
|
559 |
+
.b8 100
|
560 |
+
.b8 101
|
561 |
+
.b8 52
|
562 |
+
.b8 101
|
563 |
+
.b8 0
|
564 |
+
.b8 1
|
565 |
+
.b8 18
|
566 |
+
.b8 1
|
567 |
+
.b8 1
|
568 |
+
.b8 3
|
569 |
+
.b64 $L__func_begin0
|
570 |
+
.b64 $L__func_end0
|
571 |
+
.b8 1
|
572 |
+
.b8 156
|
573 |
+
.b32 125
|
574 |
+
.b8 4
|
575 |
+
.b32 125
|
576 |
+
.b64 $L__tmp1
|
577 |
+
.b64 $L__tmp32
|
578 |
+
.b8 2
|
579 |
+
.b8 35
|
580 |
+
.b8 25
|
581 |
+
.b8 5
|
582 |
+
.b32 125
|
583 |
+
.b64 $L__tmp1
|
584 |
+
.b64 $L__tmp32
|
585 |
+
.b8 2
|
586 |
+
.b8 243
|
587 |
+
.b8 36
|
588 |
+
.b8 0
|
589 |
+
.b8 5
|
590 |
+
.b32 125
|
591 |
+
.b64 $L__tmp2
|
592 |
+
.b64 $L__tmp33
|
593 |
+
.b8 2
|
594 |
+
.b8 35
|
595 |
+
.b8 25
|
596 |
+
.b8 0
|
597 |
+
.b8 0
|
598 |
+
}
|
599 |
+
.section .debug_pubnames
|
600 |
+
{
|
601 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
602 |
+
$L__pubNames_start0:
|
603 |
+
.b8 2
|
604 |
+
.b8 0
|
605 |
+
.b32 .debug_info
|
606 |
+
.b32 268
|
607 |
+
.b32 125
|
608 |
+
.b8 116
|
609 |
+
.b8 114
|
610 |
+
.b8 105
|
611 |
+
.b8 116
|
612 |
+
.b8 111
|
613 |
+
.b8 110
|
614 |
+
.b8 95
|
615 |
+
.b8 95
|
616 |
+
.b8 48
|
617 |
+
.b8 100
|
618 |
+
.b8 49
|
619 |
+
.b8 100
|
620 |
+
.b8 50
|
621 |
+
.b8 100
|
622 |
+
.b8 51
|
623 |
+
.b8 100
|
624 |
+
.b8 101
|
625 |
+
.b8 52
|
626 |
+
.b8 101
|
627 |
+
.b8 0
|
628 |
+
.b32 0
|
629 |
+
$L__pubNames_end0:
|
630 |
+
}
|
631 |
+
.section .debug_pubtypes
|
632 |
+
{
|
633 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
634 |
+
$L__pubTypes_start0:
|
635 |
+
.b8 2
|
636 |
+
.b8 0
|
637 |
+
.b32 .debug_info
|
638 |
+
.b32 268
|
639 |
+
.b32 0
|
640 |
+
$L__pubTypes_end0:
|
641 |
+
}
|
642 |
+
.section .debug_loc { }
|
.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttgir
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<256> : tensor<16x1xi64, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<0> : tensor<16x1xi64, #blocked>
|
7 |
+
%cst_1 = arith.constant dense<512> : tensor<16x1xi64, #blocked>
|
8 |
+
%cst_2 = arith.constant dense<256> : tensor<16x1xi32, #blocked>
|
9 |
+
%cst_3 = arith.constant dense<131072> : tensor<1x128xi32, #blocked1>
|
10 |
+
%cst_4 = arith.constant dense<120> : tensor<1x128xi32, #blocked1>
|
11 |
+
%cst_5 = arith.constant dense<0.000000e+00> : tensor<16x128xf32, #blocked1>
|
12 |
+
%cst_6 = arith.constant dense<true> : tensor<16x1xi1, #blocked>
|
13 |
+
%c16_i32 = arith.constant 16 : i32
|
14 |
+
%0 = tt.get_program_id x : i32
|
15 |
+
%1 = arith.muli %0, %c16_i32 : i32
|
16 |
+
%2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
17 |
+
%3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
18 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1>
|
19 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked>
|
20 |
+
%6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked1>
|
21 |
+
%7 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked>
|
22 |
+
%8 = arith.addi %6, %4 : tensor<16x1xi32, #blocked1>
|
23 |
+
%9 = arith.addi %7, %5 : tensor<16x1xi32, #blocked>
|
24 |
+
%10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
|
25 |
+
%11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x128xi32, #blocked1>
|
26 |
+
%12 = arith.cmpi slt, %11, %cst_4 : tensor<1x128xi32, #blocked1>
|
27 |
+
%13 = arith.muli %11, %cst_3 : tensor<1x128xi32, #blocked1>
|
28 |
+
%14 = tt.broadcast %8 : (tensor<16x1xi32, #blocked1>) -> tensor<16x128xi32, #blocked1>
|
29 |
+
%15 = tt.broadcast %13 : (tensor<1x128xi32, #blocked1>) -> tensor<16x128xi32, #blocked1>
|
30 |
+
%16 = arith.addi %14, %15 : tensor<16x128xi32, #blocked1>
|
31 |
+
%17 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>, #blocked1>
|
32 |
+
%18 = tt.addptr %17, %16 : tensor<16x128x!tt.ptr<f32, 1>, #blocked1>, tensor<16x128xi32, #blocked1>
|
33 |
+
%19 = tt.broadcast %12 : (tensor<1x128xi1, #blocked1>) -> tensor<16x128xi1, #blocked1>
|
34 |
+
%20 = tt.load %18, %19, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32, #blocked1>
|
35 |
+
%21 = arith.addf %20, %cst_5 : tensor<16x128xf32, #blocked1>
|
36 |
+
%22 = arith.select %19, %21, %cst_5 : tensor<16x128xi1, #blocked1>, tensor<16x128xf32, #blocked1>
|
37 |
+
%23 = "tt.reduce"(%22) <{axis = 1 : i32}> ({
|
38 |
+
^bb0(%arg5: f32, %arg6: f32):
|
39 |
+
%40 = arith.addf %arg5, %arg6 : f32
|
40 |
+
tt.reduce.return %40 : f32
|
41 |
+
}) : (tensor<16x128xf32, #blocked1>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
42 |
+
%24 = triton_gpu.convert_layout %23 : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
43 |
+
%25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked>
|
44 |
+
%26 = arith.divsi %9, %cst_2 : tensor<16x1xi32, #blocked>
|
45 |
+
%27 = arith.remsi %9, %cst_2 : tensor<16x1xi32, #blocked>
|
46 |
+
%28 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked>
|
47 |
+
%29 = tt.addptr %28, %26 : tensor<16x1x!tt.ptr<i64, 1>, #blocked>, tensor<16x1xi32, #blocked>
|
48 |
+
%30 = tt.load %29 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked>
|
49 |
+
%31 = arith.addi %30, %cst_1 : tensor<16x1xi64, #blocked>
|
50 |
+
%32 = arith.cmpi slt, %30, %cst_0 : tensor<16x1xi64, #blocked>
|
51 |
+
%33 = arith.select %32, %31, %30 : tensor<16x1xi1, #blocked>, tensor<16x1xi64, #blocked>
|
52 |
+
%34 = arith.muli %33, %cst : tensor<16x1xi64, #blocked>
|
53 |
+
%35 = arith.extsi %27 : tensor<16x1xi32, #blocked> to tensor<16x1xi64, #blocked>
|
54 |
+
%36 = arith.addi %35, %34 : tensor<16x1xi64, #blocked>
|
55 |
+
%37 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x1x!tt.ptr<f32, 1>, #blocked>
|
56 |
+
%38 = tt.addptr %37, %36 : tensor<16x1x!tt.ptr<f32, 1>, #blocked>, tensor<16x1xi64, #blocked>
|
57 |
+
%39 = "tt.atomic_rmw"(%38, %25, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<16x1x!tt.ptr<f32, 1>, #blocked>, tensor<16x1xf32, #blocked>, tensor<16x1xi1, #blocked>) -> tensor<16x1xf32, #blocked>
|
58 |
+
tt.return
|
59 |
+
}
|
60 |
+
}
|
.triton/dump/550b88a9db74a71f80def697002389b5/triton_.ttir
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<256> : tensor<16x1xi64>
|
4 |
+
%cst_0 = arith.constant dense<0> : tensor<16x1xi64>
|
5 |
+
%cst_1 = arith.constant dense<512> : tensor<16x1xi64>
|
6 |
+
%cst_2 = arith.constant dense<true> : tensor<16x1xi1>
|
7 |
+
%cst_3 = arith.constant dense<256> : tensor<16x1xi32>
|
8 |
+
%cst_4 = arith.constant dense<131072> : tensor<1x128xi32>
|
9 |
+
%cst_5 = arith.constant dense<120> : tensor<1x128xi32>
|
10 |
+
%cst_6 = arith.constant dense<0.000000e+00> : tensor<16x128xf32>
|
11 |
+
%c16_i32 = arith.constant 16 : i32
|
12 |
+
%0 = tt.get_program_id x : i32
|
13 |
+
%1 = arith.muli %0, %c16_i32 : i32
|
14 |
+
%2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
|
15 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32>
|
16 |
+
%4 = tt.splat %1 : (i32) -> tensor<16x1xi32>
|
17 |
+
%5 = arith.addi %4, %3 : tensor<16x1xi32>
|
18 |
+
%6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
|
19 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32>
|
20 |
+
%8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32>
|
21 |
+
%9 = arith.muli %7, %cst_4 : tensor<1x128xi32>
|
22 |
+
%10 = tt.broadcast %5 : (tensor<16x1xi32>) -> tensor<16x128xi32>
|
23 |
+
%11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<16x128xi32>
|
24 |
+
%12 = arith.addi %10, %11 : tensor<16x128xi32>
|
25 |
+
%13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<16x128x!tt.ptr<f32, 1>>
|
26 |
+
%14 = tt.addptr %13, %12 : tensor<16x128x!tt.ptr<f32, 1>>, tensor<16x128xi32>
|
27 |
+
%15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<16x128xi1>
|
28 |
+
%16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32>
|
29 |
+
%17 = arith.addf %16, %cst_6 : tensor<16x128xf32>
|
30 |
+
%18 = arith.select %15, %17, %cst_6 : tensor<16x128xi1>, tensor<16x128xf32>
|
31 |
+
%19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({
|
32 |
+
^bb0(%arg5: f32, %arg6: f32):
|
33 |
+
%35 = arith.addf %arg5, %arg6 : f32
|
34 |
+
tt.reduce.return %35 : f32
|
35 |
+
}) : (tensor<16x128xf32>) -> tensor<16xf32>
|
36 |
+
%20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32>
|
37 |
+
%21 = arith.divsi %5, %cst_3 : tensor<16x1xi32>
|
38 |
+
%22 = arith.remsi %5, %cst_3 : tensor<16x1xi32>
|
39 |
+
%23 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>>
|
40 |
+
%24 = tt.addptr %23, %21 : tensor<16x1x!tt.ptr<i64, 1>>, tensor<16x1xi32>
|
41 |
+
%25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64>
|
42 |
+
%26 = arith.addi %25, %cst_1 : tensor<16x1xi64>
|
43 |
+
%27 = arith.cmpi slt, %25, %cst_0 : tensor<16x1xi64>
|
44 |
+
%28 = arith.select %27, %26, %25 : tensor<16x1xi1>, tensor<16x1xi64>
|
45 |
+
%29 = arith.muli %28, %cst : tensor<16x1xi64>
|
46 |
+
%30 = arith.extsi %22 : tensor<16x1xi32> to tensor<16x1xi64>
|
47 |
+
%31 = arith.addi %30, %29 : tensor<16x1xi64>
|
48 |
+
%32 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x1x!tt.ptr<f32, 1>>
|
49 |
+
%33 = tt.addptr %32, %31 : tensor<16x1x!tt.ptr<f32, 1>>, tensor<16x1xi64>
|
50 |
+
%34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<16x1x!tt.ptr<f32, 1>>, tensor<16x1xf32>, tensor<16x1xi1>) -> tensor<16x1xf32>
|
51 |
+
tt.return
|
52 |
+
}
|
53 |
+
}
|
.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.ttgir
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked>
|
6 |
+
%cst_1 = arith.constant 0.000000e+00 : f32
|
7 |
+
%c256_i32 = arith.constant 256 : i32
|
8 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
9 |
+
%cst_3 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked>
|
10 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
11 |
+
%0 = tt.get_program_id x : i32
|
12 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
13 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
14 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
15 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
|
16 |
+
%5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
|
17 |
+
%6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
18 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
19 |
+
%8 = tt.load %7, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
20 |
+
%9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
21 |
+
%10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
22 |
+
%11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
23 |
+
%12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
24 |
+
%13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
25 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
26 |
+
%15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
27 |
+
%16 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
28 |
+
%17 = tt.addptr %16, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
29 |
+
%18 = tt.load %17, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
30 |
+
%19 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
|
31 |
+
%20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
|
32 |
+
%21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
|
33 |
+
%22 = arith.mulf %9, %12 : tensor<256xf32, #blocked>
|
34 |
+
%23 = arith.select %2, %22, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
35 |
+
%24 = "tt.reduce"(%23) <{axis = 0 : i32}> ({
|
36 |
+
^bb0(%arg8: f32, %arg9: f32):
|
37 |
+
%43 = arith.addf %arg8, %arg9 : f32
|
38 |
+
tt.reduce.return %43 : f32
|
39 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
40 |
+
%25 = arith.addf %24, %cst_1 : f32
|
41 |
+
%26 = arith.mulf %22, %15 : tensor<256xf32, #blocked>
|
42 |
+
%27 = arith.select %2, %26, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
43 |
+
%28 = "tt.reduce"(%27) <{axis = 0 : i32}> ({
|
44 |
+
^bb0(%arg8: f32, %arg9: f32):
|
45 |
+
%43 = arith.addf %arg8, %arg9 : f32
|
46 |
+
tt.reduce.return %43 : f32
|
47 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
48 |
+
%29 = arith.addf %28, %cst_1 : f32
|
49 |
+
%30 = arith.divf %21, %cst_0 : tensor<1xf32, #blocked>
|
50 |
+
%31 = arith.mulf %22, %cst_3 : tensor<256xf32, #blocked>
|
51 |
+
%32 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked>
|
52 |
+
%33 = arith.subf %31, %32 : tensor<256xf32, #blocked>
|
53 |
+
%34 = tt.splat %29 : (f32) -> tensor<256xf32, #blocked>
|
54 |
+
%35 = arith.mulf %15, %34 : tensor<256xf32, #blocked>
|
55 |
+
%36 = arith.subf %33, %35 : tensor<256xf32, #blocked>
|
56 |
+
%37 = tt.broadcast %30 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
|
57 |
+
%38 = arith.mulf %37, %36 : tensor<256xf32, #blocked>
|
58 |
+
%39 = arith.addf %18, %38 : tensor<256xf32, #blocked>
|
59 |
+
tt.store %17, %39, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
|
60 |
+
%40 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
61 |
+
%41 = tt.addptr %40, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
62 |
+
%42 = arith.truncf %39 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
|
63 |
+
tt.store %41, %42, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
|
64 |
+
tt.return
|
65 |
+
}
|
66 |
+
}
|
.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.cubin
ADDED
Binary file (4.65 kB). View file
|
|
.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.llir
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 {
|
5 |
+
%3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%4 = shl i32 %3, 2, !dbg !8
|
7 |
+
%5 = and i32 %4, 508, !dbg !8
|
8 |
+
%6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
9 |
+
%7 = shl i32 %6, 10, !dbg !10
|
10 |
+
%8 = or i32 %7, %5, !dbg !11
|
11 |
+
%9 = or i32 %8, 512, !dbg !11
|
12 |
+
%10 = sext i32 %8 to i64, !dbg !12
|
13 |
+
%11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12
|
14 |
+
%12 = sext i32 %9 to i64, !dbg !12
|
15 |
+
%13 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !12
|
16 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %11, i1 true) #1, !dbg !13
|
17 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %13, i1 true) #1, !dbg !13
|
18 |
+
ret void, !dbg !14
|
19 |
+
}
|
20 |
+
|
21 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
22 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
23 |
+
|
24 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
25 |
+
attributes #1 = { nounwind }
|
26 |
+
|
27 |
+
!llvm.module.flags = !{!0}
|
28 |
+
!llvm.dbg.cu = !{!1}
|
29 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
30 |
+
|
31 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
32 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
33 |
+
!2 = !DIFile(filename: "c7w5r66fcggm6aokktzwmg24mlevq2hqdw2bgwzwlovrel6re5ym.py", directory: "/tmp/torchinductor_root/7w")
|
34 |
+
!3 = !{ptr @triton__0d1de, !"kernel", i32 1}
|
35 |
+
!4 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
|
36 |
+
!5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
37 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
38 |
+
!7 = !{}
|
39 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
40 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
41 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
42 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
43 |
+
!12 = !DILocation(line: 25, column: 25, scope: !5)
|
44 |
+
!13 = !DILocation(line: 25, column: 36, scope: !5)
|
45 |
+
!14 = !DILocation(line: 25, column: 4, scope: !5)
|
.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ptx
ADDED
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1de
|
10 |
+
|
11 |
+
.visible .entry triton__0d1de(
|
12 |
+
.param .u64 triton__0d1de_param_0,
|
13 |
+
.param .u32 triton__0d1de_param_1
|
14 |
+
)
|
15 |
+
.maxntid 128, 1, 1
|
16 |
+
{
|
17 |
+
.reg .pred %p<3>;
|
18 |
+
.reg .b32 %r<15>;
|
19 |
+
.reg .b64 %rd<5>;
|
20 |
+
.loc 1 18 0
|
21 |
+
$L__func_begin0:
|
22 |
+
.loc 1 18 0
|
23 |
+
|
24 |
+
ld.param.u64 %rd3, [triton__0d1de_param_0];
|
25 |
+
$L__tmp0:
|
26 |
+
.loc 1 21 36
|
27 |
+
mov.u32 %r10, %tid.x;
|
28 |
+
shl.b32 %r11, %r10, 2;
|
29 |
+
and.b32 %r12, %r11, 508;
|
30 |
+
.loc 1 20 28
|
31 |
+
mov.u32 %r1, %ctaid.x;
|
32 |
+
.loc 1 20 33
|
33 |
+
shl.b32 %r13, %r1, 10;
|
34 |
+
.loc 1 21 23
|
35 |
+
or.b32 %r14, %r13, %r12;
|
36 |
+
.loc 1 25 25
|
37 |
+
mul.wide.s32 %rd4, %r14, 4;
|
38 |
+
add.s64 %rd1, %rd3, %rd4;
|
39 |
+
add.s64 %rd2, %rd1, 2048;
|
40 |
+
mov.b32 %r2, 0;
|
41 |
+
mov.pred %p1, -1;
|
42 |
+
.loc 1 25 36
|
43 |
+
@%p1 st.global.v4.b32 [ %rd1 + 0 ], { %r2, %r2, %r2, %r2 };
|
44 |
+
@%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r2, %r2, %r2, %r2 };
|
45 |
+
.loc 1 25 4
|
46 |
+
ret;
|
47 |
+
$L__tmp1:
|
48 |
+
$L__func_end0:
|
49 |
+
|
50 |
+
}
|
51 |
+
.file 1 "/tmp/torchinductor_root/7w/c7w5r66fcggm6aokktzwmg24mlevq2hqdw2bgwzwlovrel6re5ym.py"
|
52 |
+
.section .debug_abbrev
|
53 |
+
{
|
54 |
+
.b8 1
|
55 |
+
.b8 17
|
56 |
+
.b8 1
|
57 |
+
.b8 37
|
58 |
+
.b8 8
|
59 |
+
.b8 19
|
60 |
+
.b8 5
|
61 |
+
.b8 3
|
62 |
+
.b8 8
|
63 |
+
.b8 16
|
64 |
+
.b8 6
|
65 |
+
.b8 27
|
66 |
+
.b8 8
|
67 |
+
.b8 180
|
68 |
+
.b8 66
|
69 |
+
.b8 12
|
70 |
+
.b8 17
|
71 |
+
.b8 1
|
72 |
+
.b8 18
|
73 |
+
.b8 1
|
74 |
+
.b8 0
|
75 |
+
.b8 0
|
76 |
+
.b8 2
|
77 |
+
.b8 46
|
78 |
+
.b8 0
|
79 |
+
.b8 17
|
80 |
+
.b8 1
|
81 |
+
.b8 18
|
82 |
+
.b8 1
|
83 |
+
.b8 64
|
84 |
+
.b8 10
|
85 |
+
.b8 135
|
86 |
+
.b8 64
|
87 |
+
.b8 8
|
88 |
+
.b8 3
|
89 |
+
.b8 8
|
90 |
+
.b8 58
|
91 |
+
.b8 11
|
92 |
+
.b8 59
|
93 |
+
.b8 11
|
94 |
+
.b8 63
|
95 |
+
.b8 12
|
96 |
+
.b8 0
|
97 |
+
.b8 0
|
98 |
+
.b8 0
|
99 |
+
}
|
100 |
+
.section .debug_info
|
101 |
+
{
|
102 |
+
.b32 172
|
103 |
+
.b8 2
|
104 |
+
.b8 0
|
105 |
+
.b32 .debug_abbrev
|
106 |
+
.b8 8
|
107 |
+
.b8 1
|
108 |
+
.b8 116
|
109 |
+
.b8 114
|
110 |
+
.b8 105
|
111 |
+
.b8 116
|
112 |
+
.b8 111
|
113 |
+
.b8 110
|
114 |
+
.b8 0
|
115 |
+
.b8 2
|
116 |
+
.b8 0
|
117 |
+
.b8 99
|
118 |
+
.b8 55
|
119 |
+
.b8 119
|
120 |
+
.b8 53
|
121 |
+
.b8 114
|
122 |
+
.b8 54
|
123 |
+
.b8 54
|
124 |
+
.b8 102
|
125 |
+
.b8 99
|
126 |
+
.b8 103
|
127 |
+
.b8 103
|
128 |
+
.b8 109
|
129 |
+
.b8 54
|
130 |
+
.b8 97
|
131 |
+
.b8 111
|
132 |
+
.b8 107
|
133 |
+
.b8 107
|
134 |
+
.b8 116
|
135 |
+
.b8 122
|
136 |
+
.b8 119
|
137 |
+
.b8 109
|
138 |
+
.b8 103
|
139 |
+
.b8 50
|
140 |
+
.b8 52
|
141 |
+
.b8 109
|
142 |
+
.b8 108
|
143 |
+
.b8 101
|
144 |
+
.b8 118
|
145 |
+
.b8 113
|
146 |
+
.b8 50
|
147 |
+
.b8 104
|
148 |
+
.b8 113
|
149 |
+
.b8 100
|
150 |
+
.b8 119
|
151 |
+
.b8 50
|
152 |
+
.b8 98
|
153 |
+
.b8 103
|
154 |
+
.b8 119
|
155 |
+
.b8 122
|
156 |
+
.b8 119
|
157 |
+
.b8 108
|
158 |
+
.b8 111
|
159 |
+
.b8 118
|
160 |
+
.b8 114
|
161 |
+
.b8 101
|
162 |
+
.b8 108
|
163 |
+
.b8 54
|
164 |
+
.b8 114
|
165 |
+
.b8 101
|
166 |
+
.b8 53
|
167 |
+
.b8 121
|
168 |
+
.b8 109
|
169 |
+
.b8 46
|
170 |
+
.b8 112
|
171 |
+
.b8 121
|
172 |
+
.b8 0
|
173 |
+
.b32 .debug_line
|
174 |
+
.b8 47
|
175 |
+
.b8 116
|
176 |
+
.b8 109
|
177 |
+
.b8 112
|
178 |
+
.b8 47
|
179 |
+
.b8 116
|
180 |
+
.b8 111
|
181 |
+
.b8 114
|
182 |
+
.b8 99
|
183 |
+
.b8 104
|
184 |
+
.b8 105
|
185 |
+
.b8 110
|
186 |
+
.b8 100
|
187 |
+
.b8 117
|
188 |
+
.b8 99
|
189 |
+
.b8 116
|
190 |
+
.b8 111
|
191 |
+
.b8 114
|
192 |
+
.b8 95
|
193 |
+
.b8 114
|
194 |
+
.b8 111
|
195 |
+
.b8 111
|
196 |
+
.b8 116
|
197 |
+
.b8 47
|
198 |
+
.b8 55
|
199 |
+
.b8 119
|
200 |
+
.b8 0
|
201 |
+
.b8 1
|
202 |
+
.b64 $L__func_begin0
|
203 |
+
.b64 $L__func_end0
|
204 |
+
.b8 2
|
205 |
+
.b64 $L__func_begin0
|
206 |
+
.b64 $L__func_end0
|
207 |
+
.b8 1
|
208 |
+
.b8 156
|
209 |
+
.b8 116
|
210 |
+
.b8 114
|
211 |
+
.b8 105
|
212 |
+
.b8 116
|
213 |
+
.b8 111
|
214 |
+
.b8 110
|
215 |
+
.b8 95
|
216 |
+
.b8 95
|
217 |
+
.b8 48
|
218 |
+
.b8 100
|
219 |
+
.b8 49
|
220 |
+
.b8 100
|
221 |
+
.b8 101
|
222 |
+
.b8 0
|
223 |
+
.b8 116
|
224 |
+
.b8 114
|
225 |
+
.b8 105
|
226 |
+
.b8 116
|
227 |
+
.b8 111
|
228 |
+
.b8 110
|
229 |
+
.b8 95
|
230 |
+
.b8 95
|
231 |
+
.b8 48
|
232 |
+
.b8 100
|
233 |
+
.b8 49
|
234 |
+
.b8 100
|
235 |
+
.b8 101
|
236 |
+
.b8 0
|
237 |
+
.b8 1
|
238 |
+
.b8 18
|
239 |
+
.b8 1
|
240 |
+
.b8 0
|
241 |
+
}
|
242 |
+
.section .debug_pubnames
|
243 |
+
{
|
244 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
245 |
+
$L__pubNames_start0:
|
246 |
+
.b8 2
|
247 |
+
.b8 0
|
248 |
+
.b32 .debug_info
|
249 |
+
.b32 176
|
250 |
+
.b32 125
|
251 |
+
.b8 116
|
252 |
+
.b8 114
|
253 |
+
.b8 105
|
254 |
+
.b8 116
|
255 |
+
.b8 111
|
256 |
+
.b8 110
|
257 |
+
.b8 95
|
258 |
+
.b8 95
|
259 |
+
.b8 48
|
260 |
+
.b8 100
|
261 |
+
.b8 49
|
262 |
+
.b8 100
|
263 |
+
.b8 101
|
264 |
+
.b8 0
|
265 |
+
.b32 0
|
266 |
+
$L__pubNames_end0:
|
267 |
+
}
|
268 |
+
.section .debug_pubtypes
|
269 |
+
{
|
270 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
271 |
+
$L__pubTypes_start0:
|
272 |
+
.b8 2
|
273 |
+
.b8 0
|
274 |
+
.b32 .debug_info
|
275 |
+
.b32 176
|
276 |
+
.b32 0
|
277 |
+
$L__pubTypes_end0:
|
278 |
+
}
|
279 |
+
.section .debug_loc { }
|
.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttgir
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked>
|
5 |
+
%c1024_i32 = arith.constant 1024 : i32
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
8 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
9 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
|
10 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
|
11 |
+
%5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
|
12 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi32, #blocked>
|
13 |
+
tt.store %6, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked>
|
14 |
+
tt.return
|
15 |
+
}
|
16 |
+
}
|
.triton/dump/7b1a931e36ddc741e8bf98e3cbffe01d/triton_.ttir
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<1024xf32>
|
4 |
+
%c1024_i32 = arith.constant 1024 : i32
|
5 |
+
%0 = tt.get_program_id x : i32
|
6 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
7 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
8 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32>
|
9 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32>
|
10 |
+
%5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
|
11 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
|
12 |
+
tt.store %6, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
|
13 |
+
tt.return
|
14 |
+
}
|
15 |
+
}
|
.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ttgir
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
4 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
5 |
+
tt.func public @triton__0d1d2d3d4d5d6e7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg6: i64 {tt.max_divisibility = 8 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
6 |
+
%cst = arith.constant dense<7680> : tensor<1x2048xi64, #blocked>
|
7 |
+
%cst_0 = arith.constant dense<7680> : tensor<1x2048xi64, #blocked1>
|
8 |
+
%cst_1 = arith.constant dense<50257> : tensor<1x2048xi64, #blocked>
|
9 |
+
%c385973760_i64 = arith.constant 385973760 : i64
|
10 |
+
%c7680_i64 = arith.constant 7680 : i64
|
11 |
+
%c8_i64 = arith.constant 8 : i64
|
12 |
+
%cst_2 = arith.constant dense<-1> : tensor<1x2048xi64, #blocked>
|
13 |
+
%cst_3 = arith.constant dense<0> : tensor<1x2048xi64, #blocked>
|
14 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked1>
|
15 |
+
%cst_5 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked>
|
16 |
+
%cst_6 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked1>
|
17 |
+
%c0_i32 = arith.constant 0 : i32
|
18 |
+
%c7680_i32 = arith.constant 7680 : i32
|
19 |
+
%c2048_i32 = arith.constant 2048 : i32
|
20 |
+
%0 = tt.get_program_id x : i32
|
21 |
+
%1 = arith.extsi %0 : i32 to i64
|
22 |
+
%2 = arith.cmpi slt, %1, %c8_i64 : i64
|
23 |
+
%3 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
24 |
+
%4 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
|
25 |
+
%5 = tt.expand_dims %3 {axis = 0 : i32} : (tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x2048xi32, #blocked>
|
26 |
+
%6 = tt.expand_dims %4 {axis = 0 : i32} : (tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x2048xi32, #blocked1>
|
27 |
+
%7 = arith.extsi %5 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked>
|
28 |
+
%8 = arith.extsi %6 : tensor<1x2048xi32, #blocked1> to tensor<1x2048xi64, #blocked1>
|
29 |
+
%9 = arith.muli %1, %c7680_i64 : i64
|
30 |
+
%10 = tt.splat %9 : (i64) -> tensor<1x2048xi64, #blocked>
|
31 |
+
%11 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<1x2048x!tt.ptr<i64, 1>, #blocked>
|
32 |
+
%12 = tt.splat %2 : (i1) -> tensor<1x2048xi1, #blocked>
|
33 |
+
%13 = tt.splat %2 : (i1) -> tensor<1x2048xi1, #blocked1>
|
34 |
+
%14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>, #blocked>
|
35 |
+
%15 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>, #blocked>
|
36 |
+
%16 = arith.muli %1, %c385973760_i64 : i64
|
37 |
+
%17 = tt.splat %16 : (i64) -> tensor<1x2048xi64, #blocked>
|
38 |
+
%18 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>
|
39 |
+
%19:2 = scf.for %arg8 = %c0_i32 to %c7680_i32 step %c2048_i32 iter_args(%arg9 = %cst_4, %arg10 = %cst_3) -> (tensor<1x2048xf32, #blocked1>, tensor<1x2048xi64, #blocked>) : i32 {
|
40 |
+
%30 = arith.extsi %arg8 : i32 to i64
|
41 |
+
%31 = tt.splat %30 : (i64) -> tensor<1x2048xi64, #blocked>
|
42 |
+
%32 = tt.splat %30 : (i64) -> tensor<1x2048xi64, #blocked1>
|
43 |
+
%33 = arith.addi %31, %7 : tensor<1x2048xi64, #blocked>
|
44 |
+
%34 = arith.addi %32, %8 : tensor<1x2048xi64, #blocked1>
|
45 |
+
%35 = arith.cmpi slt, %33, %cst : tensor<1x2048xi64, #blocked>
|
46 |
+
%36 = arith.cmpi slt, %34, %cst_0 : tensor<1x2048xi64, #blocked1>
|
47 |
+
%37 = arith.addi %33, %10 : tensor<1x2048xi64, #blocked>
|
48 |
+
%38 = tt.addptr %11, %37 : tensor<1x2048x!tt.ptr<i64, 1>, #blocked>, tensor<1x2048xi64, #blocked>
|
49 |
+
%39 = arith.andi %35, %12 : tensor<1x2048xi1, #blocked>
|
50 |
+
%40 = arith.andi %36, %13 : tensor<1x2048xi1, #blocked1>
|
51 |
+
%41 = tt.load %38, %39, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xi64, #blocked>
|
52 |
+
%42 = tt.addptr %14, %37 : tensor<1x2048x!tt.ptr<f32, 1>, #blocked>, tensor<1x2048xi64, #blocked>
|
53 |
+
%43 = tt.load %42, %39, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xf32, #blocked>
|
54 |
+
%44 = triton_gpu.convert_layout %43 : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked1>
|
55 |
+
%45 = tt.addptr %15, %37 : tensor<1x2048x!tt.ptr<f32, 1>, #blocked>, tensor<1x2048xi64, #blocked>
|
56 |
+
%46 = tt.load %45, %39, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xf32, #blocked>
|
57 |
+
%47 = arith.cmpi ne, %41, %cst_2 : tensor<1x2048xi64, #blocked>
|
58 |
+
%48 = triton_gpu.convert_layout %47 : (tensor<1x2048xi1, #blocked>) -> tensor<1x2048xi1, #blocked1>
|
59 |
+
%49 = arith.select %47, %41, %cst_3 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked>
|
60 |
+
%50 = arith.addi %49, %cst_1 : tensor<1x2048xi64, #blocked>
|
61 |
+
%51 = arith.cmpi slt, %49, %cst_3 : tensor<1x2048xi64, #blocked>
|
62 |
+
%52 = arith.select %51, %50, %49 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked>
|
63 |
+
%53 = arith.cmpi sge, %52, %cst_3 : tensor<1x2048xi64, #blocked>
|
64 |
+
%54 = arith.cmpi slt, %52, %cst_1 : tensor<1x2048xi64, #blocked>
|
65 |
+
%55 = arith.andi %53, %54 : tensor<1x2048xi1, #blocked>
|
66 |
+
%56 = triton_gpu.convert_layout %55 : (tensor<1x2048xi1, #blocked>) -> tensor<1x2048xi1, #blocked2>
|
67 |
+
tt.assert %56, "index out of bounds: 0 <= tmp7 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1x2048xi1, #blocked2>
|
68 |
+
%57 = arith.muli %33, %cst_1 : tensor<1x2048xi64, #blocked>
|
69 |
+
%58 = arith.addi %52, %57 : tensor<1x2048xi64, #blocked>
|
70 |
+
%59 = arith.addi %58, %17 : tensor<1x2048xi64, #blocked>
|
71 |
+
%60 = tt.addptr %18, %59 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
|
72 |
+
%61 = triton_gpu.convert_layout %60 : (tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked1>
|
73 |
+
%62 = tt.load %61, %40, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xbf16, #blocked1>
|
74 |
+
%63 = arith.extf %62 : tensor<1x2048xbf16, #blocked1> to tensor<1x2048xf32, #blocked1>
|
75 |
+
%64 = arith.subf %63, %44 : tensor<1x2048xf32, #blocked1>
|
76 |
+
%65 = math.log %46 : tensor<1x2048xf32, #blocked>
|
77 |
+
%66 = triton_gpu.convert_layout %65 : (tensor<1x2048xf32, #blocked>) -> tensor<1x2048xf32, #blocked1>
|
78 |
+
%67 = arith.subf %64, %66 : tensor<1x2048xf32, #blocked1>
|
79 |
+
%68 = arith.subf %cst_4, %67 : tensor<1x2048xf32, #blocked1>
|
80 |
+
%69 = arith.select %48, %68, %cst_4 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xf32, #blocked1>
|
81 |
+
%70 = arith.addf %arg9, %69 : tensor<1x2048xf32, #blocked1>
|
82 |
+
%71 = arith.select %40, %70, %arg9 : tensor<1x2048xi1, #blocked1>, tensor<1x2048xf32, #blocked1>
|
83 |
+
%72 = arith.extui %47 : tensor<1x2048xi1, #blocked> to tensor<1x2048xi64, #blocked>
|
84 |
+
%73 = arith.addi %arg10, %72 : tensor<1x2048xi64, #blocked>
|
85 |
+
%74 = arith.select %39, %73, %arg10 : tensor<1x2048xi1, #blocked>, tensor<1x2048xi64, #blocked>
|
86 |
+
scf.yield %71, %74 : tensor<1x2048xf32, #blocked1>, tensor<1x2048xi64, #blocked>
|
87 |
+
}
|
88 |
+
%20 = "tt.reduce"(%19#0) <{axis = 1 : i32}> ({
|
89 |
+
^bb0(%arg8: f32, %arg9: f32):
|
90 |
+
%30 = arith.addf %arg8, %arg9 : f32
|
91 |
+
tt.reduce.return %30 : f32
|
92 |
+
}) : (tensor<1x2048xf32, #blocked1>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
93 |
+
%21 = tt.expand_dims %20 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<1x1xf32, #blocked1>
|
94 |
+
%22 = tt.addptr %arg4, %1 : !tt.ptr<f32, 1>, i64
|
95 |
+
%23 = tt.splat %22 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>, #blocked1>
|
96 |
+
%24 = tt.splat %2 : (i1) -> tensor<1x1xi1, #blocked1>
|
97 |
+
tt.store %23, %21, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32, #blocked1>
|
98 |
+
%25 = "tt.reduce"(%19#1) <{axis = 1 : i32}> ({
|
99 |
+
^bb0(%arg8: i64, %arg9: i64):
|
100 |
+
%30 = arith.addi %arg8, %arg9 : i64
|
101 |
+
tt.reduce.return %30 : i64
|
102 |
+
}) : (tensor<1x2048xi64, #blocked>) -> tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
103 |
+
%26 = triton_gpu.convert_layout %25 : (tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
104 |
+
%27 = tt.expand_dims %26 {axis = 1 : i32} : (tensor<1xi64, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<1x1xi64, #blocked1>
|
105 |
+
%28 = tt.addptr %arg5, %1 : !tt.ptr<i64, 1>, i64
|
106 |
+
%29 = tt.splat %28 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked1>
|
107 |
+
tt.store %29, %27, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xi64, #blocked1>
|
108 |
+
tt.return
|
109 |
+
}
|
110 |
+
}
|
.triton/dump/962d1809855a53123762906133b1d960/triton_.cubin
ADDED
Binary file (4.9 kB). View file
|
|
.triton/dump/962d1809855a53123762906133b1d960/triton_.llir
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !5 {
|
5 |
+
%3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%4 = shl i32 %3, 2, !dbg !8
|
7 |
+
%5 = and i32 %4, 508, !dbg !8
|
8 |
+
%6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
9 |
+
%7 = shl i32 %6, 10, !dbg !10
|
10 |
+
%8 = or i32 %7, %5, !dbg !11
|
11 |
+
%9 = or i32 %8, 512, !dbg !11
|
12 |
+
%10 = icmp slt i32 %8, 12865792, !dbg !12
|
13 |
+
%11 = icmp slt i32 %9, 12865792, !dbg !12
|
14 |
+
%12 = sext i32 %8 to i64, !dbg !13
|
15 |
+
%13 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !13
|
16 |
+
%14 = sext i32 %9 to i64, !dbg !13
|
17 |
+
%15 = getelementptr float, ptr addrspace(1) %0, i64 %14, !dbg !13
|
18 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %13, i1 %10) #1, !dbg !14
|
19 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %15, i1 %11) #1, !dbg !14
|
20 |
+
ret void, !dbg !15
|
21 |
+
}
|
22 |
+
|
23 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
24 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
25 |
+
|
26 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
27 |
+
attributes #1 = { nounwind }
|
28 |
+
|
29 |
+
!llvm.module.flags = !{!0}
|
30 |
+
!llvm.dbg.cu = !{!1}
|
31 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
32 |
+
|
33 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
34 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
35 |
+
!2 = !DIFile(filename: "c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py", directory: "/tmp/torchinductor_root/4y")
|
36 |
+
!3 = !{ptr @triton__0d1de, !"kernel", i32 1}
|
37 |
+
!4 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
|
38 |
+
!5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
39 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
40 |
+
!7 = !{}
|
41 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
42 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
43 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
44 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
45 |
+
!12 = !DILocation(line: 22, column: 21, scope: !5)
|
46 |
+
!13 = !DILocation(line: 25, column: 25, scope: !5)
|
47 |
+
!14 = !DILocation(line: 25, column: 36, scope: !5)
|
48 |
+
!15 = !DILocation(line: 25, column: 4, scope: !5)
|
.triton/dump/962d1809855a53123762906133b1d960/triton_.ptx
ADDED
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1de
|
10 |
+
|
11 |
+
.visible .entry triton__0d1de(
|
12 |
+
.param .u64 triton__0d1de_param_0,
|
13 |
+
.param .u32 triton__0d1de_param_1
|
14 |
+
)
|
15 |
+
.maxntid 128, 1, 1
|
16 |
+
{
|
17 |
+
.reg .pred %p<3>;
|
18 |
+
.reg .b32 %r<16>;
|
19 |
+
.reg .b64 %rd<5>;
|
20 |
+
.loc 1 18 0
|
21 |
+
$L__func_begin0:
|
22 |
+
.loc 1 18 0
|
23 |
+
|
24 |
+
ld.param.u64 %rd3, [triton__0d1de_param_0];
|
25 |
+
$L__tmp0:
|
26 |
+
.loc 1 21 36
|
27 |
+
mov.u32 %r10, %tid.x;
|
28 |
+
shl.b32 %r11, %r10, 2;
|
29 |
+
and.b32 %r12, %r11, 508;
|
30 |
+
.loc 1 20 28
|
31 |
+
mov.u32 %r1, %ctaid.x;
|
32 |
+
.loc 1 20 33
|
33 |
+
shl.b32 %r13, %r1, 10;
|
34 |
+
.loc 1 21 23
|
35 |
+
or.b32 %r14, %r13, %r12;
|
36 |
+
or.b32 %r15, %r14, 512;
|
37 |
+
.loc 1 22 21
|
38 |
+
setp.lt.s32 %p1, %r14, 12865792;
|
39 |
+
setp.lt.s32 %p2, %r15, 12865792;
|
40 |
+
.loc 1 25 25
|
41 |
+
mul.wide.s32 %rd4, %r14, 4;
|
42 |
+
add.s64 %rd1, %rd3, %rd4;
|
43 |
+
add.s64 %rd2, %rd1, 2048;
|
44 |
+
mov.b32 %r2, 0;
|
45 |
+
.loc 1 25 36
|
46 |
+
@%p1 st.global.v4.b32 [ %rd1 + 0 ], { %r2, %r2, %r2, %r2 };
|
47 |
+
@%p2 st.global.v4.b32 [ %rd2 + 0 ], { %r2, %r2, %r2, %r2 };
|
48 |
+
.loc 1 25 4
|
49 |
+
ret;
|
50 |
+
$L__tmp1:
|
51 |
+
$L__func_end0:
|
52 |
+
|
53 |
+
}
|
54 |
+
.file 1 "/tmp/torchinductor_root/4y/c4yseldwmu3to52pbh2md2oeufrq3fcdmapkt4nxdzmyqtgd2ysp.py"
|
55 |
+
.section .debug_abbrev
|
56 |
+
{
|
57 |
+
.b8 1
|
58 |
+
.b8 17
|
59 |
+
.b8 1
|
60 |
+
.b8 37
|
61 |
+
.b8 8
|
62 |
+
.b8 19
|
63 |
+
.b8 5
|
64 |
+
.b8 3
|
65 |
+
.b8 8
|
66 |
+
.b8 16
|
67 |
+
.b8 6
|
68 |
+
.b8 27
|
69 |
+
.b8 8
|
70 |
+
.b8 180
|
71 |
+
.b8 66
|
72 |
+
.b8 12
|
73 |
+
.b8 17
|
74 |
+
.b8 1
|
75 |
+
.b8 18
|
76 |
+
.b8 1
|
77 |
+
.b8 0
|
78 |
+
.b8 0
|
79 |
+
.b8 2
|
80 |
+
.b8 46
|
81 |
+
.b8 0
|
82 |
+
.b8 17
|
83 |
+
.b8 1
|
84 |
+
.b8 18
|
85 |
+
.b8 1
|
86 |
+
.b8 64
|
87 |
+
.b8 10
|
88 |
+
.b8 135
|
89 |
+
.b8 64
|
90 |
+
.b8 8
|
91 |
+
.b8 3
|
92 |
+
.b8 8
|
93 |
+
.b8 58
|
94 |
+
.b8 11
|
95 |
+
.b8 59
|
96 |
+
.b8 11
|
97 |
+
.b8 63
|
98 |
+
.b8 12
|
99 |
+
.b8 0
|
100 |
+
.b8 0
|
101 |
+
.b8 0
|
102 |
+
}
|
103 |
+
.section .debug_info
|
104 |
+
{
|
105 |
+
.b32 172
|
106 |
+
.b8 2
|
107 |
+
.b8 0
|
108 |
+
.b32 .debug_abbrev
|
109 |
+
.b8 8
|
110 |
+
.b8 1
|
111 |
+
.b8 116
|
112 |
+
.b8 114
|
113 |
+
.b8 105
|
114 |
+
.b8 116
|
115 |
+
.b8 111
|
116 |
+
.b8 110
|
117 |
+
.b8 0
|
118 |
+
.b8 2
|
119 |
+
.b8 0
|
120 |
+
.b8 99
|
121 |
+
.b8 52
|
122 |
+
.b8 121
|
123 |
+
.b8 115
|
124 |
+
.b8 101
|
125 |
+
.b8 108
|
126 |
+
.b8 100
|
127 |
+
.b8 119
|
128 |
+
.b8 109
|
129 |
+
.b8 117
|
130 |
+
.b8 51
|
131 |
+
.b8 116
|
132 |
+
.b8 111
|
133 |
+
.b8 53
|
134 |
+
.b8 50
|
135 |
+
.b8 112
|
136 |
+
.b8 98
|
137 |
+
.b8 104
|
138 |
+
.b8 50
|
139 |
+
.b8 109
|
140 |
+
.b8 100
|
141 |
+
.b8 50
|
142 |
+
.b8 111
|
143 |
+
.b8 101
|
144 |
+
.b8 117
|
145 |
+
.b8 102
|
146 |
+
.b8 114
|
147 |
+
.b8 113
|
148 |
+
.b8 51
|
149 |
+
.b8 102
|
150 |
+
.b8 99
|
151 |
+
.b8 100
|
152 |
+
.b8 109
|
153 |
+
.b8 97
|
154 |
+
.b8 112
|
155 |
+
.b8 107
|
156 |
+
.b8 116
|
157 |
+
.b8 52
|
158 |
+
.b8 110
|
159 |
+
.b8 120
|
160 |
+
.b8 100
|
161 |
+
.b8 122
|
162 |
+
.b8 109
|
163 |
+
.b8 121
|
164 |
+
.b8 113
|
165 |
+
.b8 116
|
166 |
+
.b8 103
|
167 |
+
.b8 100
|
168 |
+
.b8 50
|
169 |
+
.b8 121
|
170 |
+
.b8 115
|
171 |
+
.b8 112
|
172 |
+
.b8 46
|
173 |
+
.b8 112
|
174 |
+
.b8 121
|
175 |
+
.b8 0
|
176 |
+
.b32 .debug_line
|
177 |
+
.b8 47
|
178 |
+
.b8 116
|
179 |
+
.b8 109
|
180 |
+
.b8 112
|
181 |
+
.b8 47
|
182 |
+
.b8 116
|
183 |
+
.b8 111
|
184 |
+
.b8 114
|
185 |
+
.b8 99
|
186 |
+
.b8 104
|
187 |
+
.b8 105
|
188 |
+
.b8 110
|
189 |
+
.b8 100
|
190 |
+
.b8 117
|
191 |
+
.b8 99
|
192 |
+
.b8 116
|
193 |
+
.b8 111
|
194 |
+
.b8 114
|
195 |
+
.b8 95
|
196 |
+
.b8 114
|
197 |
+
.b8 111
|
198 |
+
.b8 111
|
199 |
+
.b8 116
|
200 |
+
.b8 47
|
201 |
+
.b8 52
|
202 |
+
.b8 121
|
203 |
+
.b8 0
|
204 |
+
.b8 1
|
205 |
+
.b64 $L__func_begin0
|
206 |
+
.b64 $L__func_end0
|
207 |
+
.b8 2
|
208 |
+
.b64 $L__func_begin0
|
209 |
+
.b64 $L__func_end0
|
210 |
+
.b8 1
|
211 |
+
.b8 156
|
212 |
+
.b8 116
|
213 |
+
.b8 114
|
214 |
+
.b8 105
|
215 |
+
.b8 116
|
216 |
+
.b8 111
|
217 |
+
.b8 110
|
218 |
+
.b8 95
|
219 |
+
.b8 95
|
220 |
+
.b8 48
|
221 |
+
.b8 100
|
222 |
+
.b8 49
|
223 |
+
.b8 100
|
224 |
+
.b8 101
|
225 |
+
.b8 0
|
226 |
+
.b8 116
|
227 |
+
.b8 114
|
228 |
+
.b8 105
|
229 |
+
.b8 116
|
230 |
+
.b8 111
|
231 |
+
.b8 110
|
232 |
+
.b8 95
|
233 |
+
.b8 95
|
234 |
+
.b8 48
|
235 |
+
.b8 100
|
236 |
+
.b8 49
|
237 |
+
.b8 100
|
238 |
+
.b8 101
|
239 |
+
.b8 0
|
240 |
+
.b8 1
|
241 |
+
.b8 18
|
242 |
+
.b8 1
|
243 |
+
.b8 0
|
244 |
+
}
|
245 |
+
.section .debug_pubnames
|
246 |
+
{
|
247 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
248 |
+
$L__pubNames_start0:
|
249 |
+
.b8 2
|
250 |
+
.b8 0
|
251 |
+
.b32 .debug_info
|
252 |
+
.b32 176
|
253 |
+
.b32 125
|
254 |
+
.b8 116
|
255 |
+
.b8 114
|
256 |
+
.b8 105
|
257 |
+
.b8 116
|
258 |
+
.b8 111
|
259 |
+
.b8 110
|
260 |
+
.b8 95
|
261 |
+
.b8 95
|
262 |
+
.b8 48
|
263 |
+
.b8 100
|
264 |
+
.b8 49
|
265 |
+
.b8 100
|
266 |
+
.b8 101
|
267 |
+
.b8 0
|
268 |
+
.b32 0
|
269 |
+
$L__pubNames_end0:
|
270 |
+
}
|
271 |
+
.section .debug_pubtypes
|
272 |
+
{
|
273 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
274 |
+
$L__pubTypes_start0:
|
275 |
+
.b8 2
|
276 |
+
.b8 0
|
277 |
+
.b32 .debug_info
|
278 |
+
.b32 176
|
279 |
+
.b32 0
|
280 |
+
$L__pubTypes_end0:
|
281 |
+
}
|
282 |
+
.section .debug_loc { }
|
.triton/dump/962d1809855a53123762906133b1d960/triton_.ttgir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<12865792> : tensor<1024xi32, #blocked>
|
5 |
+
%c1024_i32 = arith.constant 1024 : i32
|
6 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked>
|
7 |
+
%0 = tt.get_program_id x : i32
|
8 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
9 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
10 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
|
11 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
|
12 |
+
%5 = arith.cmpi slt, %4, %cst : tensor<1024xi32, #blocked>
|
13 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
|
14 |
+
%7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi32, #blocked>
|
15 |
+
tt.store %7, %cst_0, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/962d1809855a53123762906133b1d960/triton_.ttir
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<1024xf32>
|
4 |
+
%cst_0 = arith.constant dense<12865792> : tensor<1024xi32>
|
5 |
+
%c1024_i32 = arith.constant 1024 : i32
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
8 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
9 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32>
|
10 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32>
|
11 |
+
%5 = arith.cmpi slt, %4, %cst_0 : tensor<1024xi32>
|
12 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
|
13 |
+
%7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
|
14 |
+
tt.store %7, %cst, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
|
15 |
+
tt.return
|
16 |
+
}
|
17 |
+
}
|
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.cubin
ADDED
Binary file (49.4 kB). View file
|
|
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.llir
ADDED
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3d4d5d6d7d8d9d10de11de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, i32 %10, i32 %11) local_unnamed_addr !dbg !5 {
|
7 |
+
%13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%14 = and i32 %13, 31, !dbg !8
|
9 |
+
%15 = lshr i32 %13, 5, !dbg !8
|
10 |
+
%16 = shl i32 %13, 2, !dbg !8
|
11 |
+
%17 = and i32 %16, 60, !dbg !8
|
12 |
+
%18 = and i32 %15, 3, !dbg !8
|
13 |
+
%19 = lshr i32 %14, 1, !dbg !8
|
14 |
+
%20 = shl nuw nsw i32 %18, 4, !dbg !8
|
15 |
+
%21 = or i32 %20, %19, !dbg !8
|
16 |
+
%22 = and i32 %16, 4, !dbg !9
|
17 |
+
%23 = lshr i32 %14, 4, !dbg !9
|
18 |
+
%24 = shl nuw nsw i32 %18, 1, !dbg !9
|
19 |
+
%25 = or i32 %24, %23, !dbg !9
|
20 |
+
%26 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
|
21 |
+
%27 = shl i32 %26, 6, !dbg !11
|
22 |
+
%28 = or i32 %27, %17, !dbg !12
|
23 |
+
%29 = or i32 %27, %21, !dbg !12
|
24 |
+
%.frozen = freeze i32 %28
|
25 |
+
%30 = sdiv i32 %.frozen, 256, !dbg !13
|
26 |
+
%31 = mul i32 %30, 256
|
27 |
+
%.decomposed = sub i32 %.frozen, %31
|
28 |
+
%32 = sdiv i32 %29, 256, !dbg !13
|
29 |
+
%33 = shl i32 %30, 15, !dbg !14
|
30 |
+
%34 = shl nsw i32 %32, 7, !dbg !15
|
31 |
+
%35 = add i32 %33, %.decomposed
|
32 |
+
%36 = mul nuw nsw i32 %17, 12
|
33 |
+
%37 = or i32 %25, %36
|
34 |
+
%38 = zext nneg i32 %37 to i64
|
35 |
+
%39 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %38
|
36 |
+
%40 = or i32 %36, 12
|
37 |
+
%41 = add nuw nsw i32 %40, %25
|
38 |
+
%42 = zext nneg i32 %41 to i64
|
39 |
+
%43 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %42
|
40 |
+
%44 = add nuw nsw i32 %36, 24
|
41 |
+
%45 = or i32 %44, %25
|
42 |
+
%46 = zext nneg i32 %45 to i64
|
43 |
+
%47 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %46
|
44 |
+
%48 = add nuw nsw i32 %36, 36
|
45 |
+
%49 = add nuw nsw i32 %48, %25
|
46 |
+
%50 = zext nneg i32 %49 to i64
|
47 |
+
%51 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %50
|
48 |
+
%52 = mul nuw nsw i32 %21, 12
|
49 |
+
%53 = add nuw nsw i32 %52, %22
|
50 |
+
%54 = zext nneg i32 %53 to i64
|
51 |
+
%55 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %54
|
52 |
+
%56 = getelementptr float, ptr addrspace(3) @global_smem, i64 %38
|
53 |
+
%57 = getelementptr float, ptr addrspace(3) @global_smem, i64 %42
|
54 |
+
%58 = getelementptr float, ptr addrspace(3) @global_smem, i64 %46
|
55 |
+
%59 = getelementptr float, ptr addrspace(3) @global_smem, i64 %50
|
56 |
+
%60 = getelementptr float, ptr addrspace(3) @global_smem, i64 %54
|
57 |
+
%61 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 1
|
58 |
+
%62 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 2
|
59 |
+
%63 = getelementptr inbounds <4 x i16>, ptr addrspace(3) %55, i64 0, i64 3
|
60 |
+
br label %64, !dbg !16
|
61 |
+
|
62 |
+
64: ; preds = %12, %64
|
63 |
+
%65 = phi i32 [ 0, %12 ], [ %205, %64 ]
|
64 |
+
%66 = phi <8 x float> [ zeroinitializer, %12 ], [ %204, %64 ]
|
65 |
+
%67 = or i32 %65, %22, !dbg !17
|
66 |
+
%68 = or i32 %65, %25, !dbg !17
|
67 |
+
%69 = shl i32 %68, 8, !dbg !18
|
68 |
+
%70 = add i32 %35, %69, !dbg !19
|
69 |
+
%71 = sext i32 %70 to i64, !dbg !20
|
70 |
+
%72 = getelementptr i16, ptr addrspace(1) %0, i64 %71, !dbg !20
|
71 |
+
%73 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %72, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
|
72 |
+
%74 = extractvalue { i32, i32 } %73, 0, !dbg !21
|
73 |
+
%75 = extractvalue { i32, i32 } %73, 1, !dbg !21
|
74 |
+
%76 = trunc i32 %74 to i16, !dbg !21
|
75 |
+
%extelt.offset = lshr i32 %74, 16, !dbg !21
|
76 |
+
%77 = trunc i32 %extelt.offset to i16, !dbg !21
|
77 |
+
%78 = trunc i32 %75 to i16, !dbg !21
|
78 |
+
%extelt.offset1 = lshr i32 %75, 16, !dbg !21
|
79 |
+
%79 = trunc i32 %extelt.offset1 to i16, !dbg !21
|
80 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !22
|
81 |
+
%80 = insertelement <1 x i16> undef, i16 %76, i64 0, !dbg !22
|
82 |
+
store <1 x i16> %80, ptr addrspace(3) %39, align 2, !dbg !22
|
83 |
+
%81 = insertelement <1 x i16> undef, i16 %77, i64 0, !dbg !22
|
84 |
+
store <1 x i16> %81, ptr addrspace(3) %43, align 2, !dbg !22
|
85 |
+
%82 = insertelement <1 x i16> undef, i16 %78, i64 0, !dbg !22
|
86 |
+
store <1 x i16> %82, ptr addrspace(3) %47, align 2, !dbg !22
|
87 |
+
%83 = insertelement <1 x i16> undef, i16 %79, i64 0, !dbg !22
|
88 |
+
store <1 x i16> %83, ptr addrspace(3) %51, align 2, !dbg !22
|
89 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !22
|
90 |
+
%84 = load i16, ptr addrspace(3) %55, align 8, !dbg !22
|
91 |
+
%85 = load i16, ptr addrspace(3) %61, align 2, !dbg !22
|
92 |
+
%86 = load i16, ptr addrspace(3) %62, align 4, !dbg !22
|
93 |
+
%87 = load i16, ptr addrspace(3) %63, align 2, !dbg !22
|
94 |
+
%88 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %84) #3, !dbg !22
|
95 |
+
%89 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #3, !dbg !22
|
96 |
+
%90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %86) #3, !dbg !22
|
97 |
+
%91 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %87) #3, !dbg !22
|
98 |
+
%92 = getelementptr float, ptr addrspace(1) %1, i64 %71, !dbg !23
|
99 |
+
%93 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %92, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !24
|
100 |
+
%94 = extractvalue { i32, i32, i32, i32 } %93, 0, !dbg !24
|
101 |
+
%95 = extractvalue { i32, i32, i32, i32 } %93, 1, !dbg !24
|
102 |
+
%96 = extractvalue { i32, i32, i32, i32 } %93, 2, !dbg !24
|
103 |
+
%97 = extractvalue { i32, i32, i32, i32 } %93, 3, !dbg !24
|
104 |
+
%98 = bitcast i32 %94 to float, !dbg !24
|
105 |
+
%99 = bitcast i32 %95 to float, !dbg !24
|
106 |
+
%100 = bitcast i32 %96 to float, !dbg !24
|
107 |
+
%101 = bitcast i32 %97 to float, !dbg !24
|
108 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !24
|
109 |
+
%102 = insertelement <1 x float> undef, float %98, i64 0, !dbg !24
|
110 |
+
store <1 x float> %102, ptr addrspace(3) %56, align 4, !dbg !24
|
111 |
+
%103 = insertelement <1 x float> undef, float %99, i64 0, !dbg !24
|
112 |
+
store <1 x float> %103, ptr addrspace(3) %57, align 4, !dbg !24
|
113 |
+
%104 = insertelement <1 x float> undef, float %100, i64 0, !dbg !24
|
114 |
+
store <1 x float> %104, ptr addrspace(3) %58, align 4, !dbg !24
|
115 |
+
%105 = insertelement <1 x float> undef, float %101, i64 0, !dbg !24
|
116 |
+
store <1 x float> %105, ptr addrspace(3) %59, align 4, !dbg !24
|
117 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !24
|
118 |
+
%106 = load <4 x float>, ptr addrspace(3) %60, align 16, !dbg !24
|
119 |
+
%107 = getelementptr i16, ptr addrspace(1) %2, i64 %71, !dbg !25
|
120 |
+
%108 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %107, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !26
|
121 |
+
%109 = extractvalue { i32, i32 } %108, 0, !dbg !26
|
122 |
+
%110 = extractvalue { i32, i32 } %108, 1, !dbg !26
|
123 |
+
%111 = trunc i32 %109 to i16, !dbg !26
|
124 |
+
%extelt.offset2 = lshr i32 %109, 16, !dbg !26
|
125 |
+
%112 = trunc i32 %extelt.offset2 to i16, !dbg !26
|
126 |
+
%113 = trunc i32 %110 to i16, !dbg !26
|
127 |
+
%extelt.offset3 = lshr i32 %110, 16, !dbg !26
|
128 |
+
%114 = trunc i32 %extelt.offset3 to i16, !dbg !26
|
129 |
+
%115 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %111) #3, !dbg !27
|
130 |
+
%116 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %112) #3, !dbg !27
|
131 |
+
%117 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %113) #3, !dbg !27
|
132 |
+
%118 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %114) #3, !dbg !27
|
133 |
+
%119 = add i32 %67, %34, !dbg !28
|
134 |
+
%120 = sext i32 %119 to i64, !dbg !29
|
135 |
+
%121 = getelementptr float, ptr addrspace(1) %3, i64 %120, !dbg !29
|
136 |
+
%122 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %121, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !30
|
137 |
+
%123 = extractvalue { i32, i32, i32, i32 } %122, 0, !dbg !30
|
138 |
+
%124 = extractvalue { i32, i32, i32, i32 } %122, 1, !dbg !30
|
139 |
+
%125 = extractvalue { i32, i32, i32, i32 } %122, 2, !dbg !30
|
140 |
+
%126 = extractvalue { i32, i32, i32, i32 } %122, 3, !dbg !30
|
141 |
+
%127 = getelementptr float, ptr addrspace(1) %4, i64 %120, !dbg !31
|
142 |
+
%128 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %127, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !32
|
143 |
+
%129 = extractvalue { i32, i32, i32, i32 } %128, 0, !dbg !32
|
144 |
+
%130 = extractvalue { i32, i32, i32, i32 } %128, 1, !dbg !32
|
145 |
+
%131 = extractvalue { i32, i32, i32, i32 } %128, 2, !dbg !32
|
146 |
+
%132 = extractvalue { i32, i32, i32, i32 } %128, 3, !dbg !32
|
147 |
+
%133 = getelementptr i16, ptr addrspace(1) %5, i64 %71, !dbg !33
|
148 |
+
%134 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %133, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !34
|
149 |
+
%135 = extractvalue { i32, i32 } %134, 0, !dbg !34
|
150 |
+
%136 = extractvalue { i32, i32 } %134, 1, !dbg !34
|
151 |
+
%137 = trunc i32 %135 to i16, !dbg !34
|
152 |
+
%extelt.offset4 = lshr i32 %135, 16, !dbg !34
|
153 |
+
%138 = trunc i32 %extelt.offset4 to i16, !dbg !34
|
154 |
+
%139 = trunc i32 %136 to i16, !dbg !34
|
155 |
+
%extelt.offset5 = lshr i32 %136, 16, !dbg !34
|
156 |
+
%140 = trunc i32 %extelt.offset5 to i16, !dbg !34
|
157 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !35
|
158 |
+
%141 = insertelement <1 x i16> undef, i16 %137, i64 0, !dbg !35
|
159 |
+
store <1 x i16> %141, ptr addrspace(3) %39, align 2, !dbg !35
|
160 |
+
%142 = insertelement <1 x i16> undef, i16 %138, i64 0, !dbg !35
|
161 |
+
store <1 x i16> %142, ptr addrspace(3) %43, align 2, !dbg !35
|
162 |
+
%143 = insertelement <1 x i16> undef, i16 %139, i64 0, !dbg !35
|
163 |
+
store <1 x i16> %143, ptr addrspace(3) %47, align 2, !dbg !35
|
164 |
+
%144 = insertelement <1 x i16> undef, i16 %140, i64 0, !dbg !35
|
165 |
+
store <1 x i16> %144, ptr addrspace(3) %51, align 2, !dbg !35
|
166 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !35
|
167 |
+
%145 = load i16, ptr addrspace(3) %55, align 8, !dbg !35
|
168 |
+
%146 = load i16, ptr addrspace(3) %61, align 2, !dbg !35
|
169 |
+
%147 = load i16, ptr addrspace(3) %62, align 4, !dbg !35
|
170 |
+
%148 = load i16, ptr addrspace(3) %63, align 2, !dbg !35
|
171 |
+
%149 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %145) #3, !dbg !35
|
172 |
+
%150 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %146) #3, !dbg !35
|
173 |
+
%151 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %147) #3, !dbg !35
|
174 |
+
%152 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %148) #3, !dbg !35
|
175 |
+
%153 = getelementptr float, ptr addrspace(1) %6, i64 %120, !dbg !36
|
176 |
+
%154 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %153, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !37
|
177 |
+
%155 = extractvalue { i32, i32, i32, i32 } %154, 0, !dbg !37
|
178 |
+
%156 = extractvalue { i32, i32, i32, i32 } %154, 1, !dbg !37
|
179 |
+
%157 = extractvalue { i32, i32, i32, i32 } %154, 2, !dbg !37
|
180 |
+
%158 = extractvalue { i32, i32, i32, i32 } %154, 3, !dbg !37
|
181 |
+
%159 = getelementptr float, ptr addrspace(1) %7, i64 %120, !dbg !38
|
182 |
+
%160 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %159, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !39
|
183 |
+
%161 = extractvalue { i32, i32, i32, i32 } %160, 0, !dbg !39
|
184 |
+
%162 = extractvalue { i32, i32, i32, i32 } %160, 1, !dbg !39
|
185 |
+
%163 = extractvalue { i32, i32, i32, i32 } %160, 2, !dbg !39
|
186 |
+
%164 = extractvalue { i32, i32, i32, i32 } %160, 3, !dbg !39
|
187 |
+
%165 = fadd float %115, %98, !dbg !40
|
188 |
+
%166 = fadd float %116, %99, !dbg !40
|
189 |
+
%167 = fadd float %117, %100, !dbg !40
|
190 |
+
%168 = fadd float %118, %101, !dbg !40
|
191 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !40
|
192 |
+
%169 = insertelement <1 x float> undef, float %165, i64 0, !dbg !40
|
193 |
+
store <1 x float> %169, ptr addrspace(3) %56, align 4, !dbg !40
|
194 |
+
%170 = insertelement <1 x float> undef, float %166, i64 0, !dbg !40
|
195 |
+
store <1 x float> %170, ptr addrspace(3) %57, align 4, !dbg !40
|
196 |
+
%171 = insertelement <1 x float> undef, float %167, i64 0, !dbg !40
|
197 |
+
store <1 x float> %171, ptr addrspace(3) %58, align 4, !dbg !40
|
198 |
+
%172 = insertelement <1 x float> undef, float %168, i64 0, !dbg !40
|
199 |
+
store <1 x float> %172, ptr addrspace(3) %59, align 4, !dbg !40
|
200 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !40
|
201 |
+
%173 = load <4 x float>, ptr addrspace(3) %60, align 16, !dbg !40
|
202 |
+
%174 = insertelement <8 x i32> poison, i32 %155, i64 0, !dbg !37
|
203 |
+
%175 = insertelement <8 x i32> %174, i32 %156, i64 1, !dbg !37
|
204 |
+
%176 = insertelement <8 x i32> %175, i32 %157, i64 2, !dbg !37
|
205 |
+
%177 = insertelement <8 x i32> %176, i32 %158, i64 3, !dbg !37
|
206 |
+
%178 = insertelement <8 x i32> %177, i32 %123, i64 4, !dbg !37
|
207 |
+
%179 = insertelement <8 x i32> %178, i32 %124, i64 5, !dbg !37
|
208 |
+
%180 = insertelement <8 x i32> %179, i32 %125, i64 6, !dbg !37
|
209 |
+
%181 = insertelement <8 x i32> %180, i32 %126, i64 7, !dbg !37
|
210 |
+
%182 = bitcast <8 x i32> %181 to <8 x float>, !dbg !37
|
211 |
+
%183 = insertelement <8 x i32> poison, i32 %161, i64 0, !dbg !39
|
212 |
+
%184 = insertelement <8 x i32> %183, i32 %162, i64 1, !dbg !39
|
213 |
+
%185 = insertelement <8 x i32> %184, i32 %163, i64 2, !dbg !39
|
214 |
+
%186 = insertelement <8 x i32> %185, i32 %164, i64 3, !dbg !39
|
215 |
+
%187 = insertelement <8 x i32> %186, i32 %129, i64 4, !dbg !39
|
216 |
+
%188 = insertelement <8 x i32> %187, i32 %130, i64 5, !dbg !39
|
217 |
+
%189 = insertelement <8 x i32> %188, i32 %131, i64 6, !dbg !39
|
218 |
+
%190 = insertelement <8 x i32> %189, i32 %132, i64 7, !dbg !39
|
219 |
+
%191 = bitcast <8 x i32> %190 to <8 x float>, !dbg !39
|
220 |
+
%192 = shufflevector <4 x float> %106, <4 x float> %173, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, !dbg !41
|
221 |
+
%193 = fsub <8 x float> %192, %182, !dbg !41
|
222 |
+
%194 = fmul <8 x float> %193, %191, !dbg !42
|
223 |
+
%195 = insertelement <8 x float> poison, float %149, i64 0, !dbg !43
|
224 |
+
%196 = insertelement <8 x float> %195, float %150, i64 1, !dbg !43
|
225 |
+
%197 = insertelement <8 x float> %196, float %151, i64 2, !dbg !43
|
226 |
+
%198 = insertelement <8 x float> %197, float %152, i64 3, !dbg !43
|
227 |
+
%199 = insertelement <8 x float> %198, float %88, i64 4, !dbg !43
|
228 |
+
%200 = insertelement <8 x float> %199, float %89, i64 5, !dbg !43
|
229 |
+
%201 = insertelement <8 x float> %200, float %90, i64 6, !dbg !43
|
230 |
+
%202 = insertelement <8 x float> %201, float %91, i64 7, !dbg !43
|
231 |
+
%203 = fmul <8 x float> %202, %194, !dbg !43
|
232 |
+
%204 = fadd <8 x float> %66, %203, !dbg !44
|
233 |
+
%205 = add nuw nsw i32 %65, 8, !dbg !16
|
234 |
+
%206 = icmp ult i32 %65, 120, !dbg !16
|
235 |
+
br i1 %206, label %64, label %207, !dbg !16
|
236 |
+
|
237 |
+
207: ; preds = %64
|
238 |
+
%208 = and i32 %13, 63, !dbg !8
|
239 |
+
%209 = or i32 %27, %208, !dbg !12
|
240 |
+
%shift = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>, !dbg !45
|
241 |
+
%210 = fadd <8 x float> %204, %shift, !dbg !45
|
242 |
+
%shift28 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 6, i32 poison, i32 poison, i32 poison>, !dbg !45
|
243 |
+
%211 = fadd <8 x float> %shift28, %210, !dbg !45
|
244 |
+
%shift29 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison, i32 poison, i32 poison>, !dbg !45
|
245 |
+
%212 = fadd <8 x float> %shift29, %211, !dbg !45
|
246 |
+
%213 = extractelement <8 x float> %212, i64 4, !dbg !45
|
247 |
+
%214 = bitcast float %213 to i32, !dbg !51
|
248 |
+
%215 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %214, i32 1, i32 31), !dbg !51
|
249 |
+
%216 = bitcast i32 %215 to float, !dbg !51
|
250 |
+
%217 = fadd float %213, %216, !dbg !45
|
251 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !53
|
252 |
+
%218 = zext nneg i32 %21 to i64, !dbg !53
|
253 |
+
%219 = getelementptr float, ptr addrspace(3) @global_smem, i64 %218, !dbg !53
|
254 |
+
%220 = insertelement <1 x float> undef, float %217, i64 0, !dbg !53
|
255 |
+
store <1 x float> %220, ptr addrspace(3) %219, align 4, !dbg !53
|
256 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !53
|
257 |
+
%221 = zext nneg i32 %208 to i64, !dbg !53
|
258 |
+
%222 = getelementptr float, ptr addrspace(3) @global_smem, i64 %221, !dbg !53
|
259 |
+
%223 = load i32, ptr addrspace(3) %222, align 4, !dbg !53
|
260 |
+
%224 = sext i32 %209 to i64, !dbg !54
|
261 |
+
%225 = getelementptr float, ptr addrspace(1) %8, i64 %224, !dbg !54
|
262 |
+
%226 = and i32 %13, 64, !dbg !55
|
263 |
+
%227 = icmp eq i32 %226, 0, !dbg !55
|
264 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %223, ptr addrspace(1) %225, i1 %227) #3, !dbg !55
|
265 |
+
%shift30 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !56
|
266 |
+
%228 = fadd <8 x float> %204, %shift30, !dbg !56
|
267 |
+
%shift31 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !56
|
268 |
+
%229 = fadd <8 x float> %shift31, %228, !dbg !56
|
269 |
+
%shift32 = shufflevector <8 x float> %204, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !56
|
270 |
+
%230 = fadd <8 x float> %shift32, %229, !dbg !56
|
271 |
+
%231 = extractelement <8 x float> %230, i64 0, !dbg !56
|
272 |
+
%232 = bitcast float %231 to i32, !dbg !59
|
273 |
+
%233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !59
|
274 |
+
%234 = bitcast i32 %233 to float, !dbg !59
|
275 |
+
%235 = fadd float %231, %234, !dbg !56
|
276 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !61
|
277 |
+
%236 = insertelement <1 x float> undef, float %235, i64 0, !dbg !61
|
278 |
+
store <1 x float> %236, ptr addrspace(3) %219, align 4, !dbg !61
|
279 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !61
|
280 |
+
%237 = load i32, ptr addrspace(3) %222, align 4, !dbg !61
|
281 |
+
%238 = getelementptr float, ptr addrspace(1) %9, i64 %224, !dbg !62
|
282 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %237, ptr addrspace(1) %238, i1 %227) #3, !dbg !63
|
283 |
+
ret void, !dbg !64
|
284 |
+
}
|
285 |
+
|
286 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
287 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
288 |
+
|
289 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
290 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
291 |
+
|
292 |
+
; Function Attrs: convergent nocallback nounwind
|
293 |
+
declare void @llvm.nvvm.barrier0() #2
|
294 |
+
|
295 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
296 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
297 |
+
attributes #2 = { convergent nocallback nounwind }
|
298 |
+
attributes #3 = { nounwind }
|
299 |
+
|
300 |
+
!llvm.module.flags = !{!0}
|
301 |
+
!llvm.dbg.cu = !{!1}
|
302 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
303 |
+
|
304 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
305 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
306 |
+
!2 = !DIFile(filename: "c3xxszvgtfnjb7welqvr33z4cqouxhqjy3dpwa2qmmx2xto6sgvz.py", directory: "/tmp/torchinductor_root/3x")
|
307 |
+
!3 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10de11de, !"kernel", i32 1}
|
308 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10de11de, !"maxntidx", i32 128}
|
309 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8d9d10de11de", linkageName: "triton__0d1d2d3d4d5d6d7d8d9d10de11de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
310 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
311 |
+
!7 = !{}
|
312 |
+
!8 = !DILocation(line: 22, column: 44, scope: !5)
|
313 |
+
!9 = !DILocation(line: 24, column: 33, scope: !5)
|
314 |
+
!10 = !DILocation(line: 21, column: 28, scope: !5)
|
315 |
+
!11 = !DILocation(line: 21, column: 33, scope: !5)
|
316 |
+
!12 = !DILocation(line: 22, column: 23, scope: !5)
|
317 |
+
!13 = !DILocation(line: 26, column: 20, scope: !5)
|
318 |
+
!14 = !DILocation(line: 34, column: 57, scope: !5)
|
319 |
+
!15 = !DILocation(line: 37, column: 44, scope: !5)
|
320 |
+
!16 = !DILocation(line: 30, column: 36, scope: !5)
|
321 |
+
!17 = !DILocation(line: 31, column: 27, scope: !5)
|
322 |
+
!18 = !DILocation(line: 34, column: 44, scope: !5)
|
323 |
+
!19 = !DILocation(line: 34, column: 51, scope: !5)
|
324 |
+
!20 = !DILocation(line: 34, column: 34, scope: !5)
|
325 |
+
!21 = !DILocation(line: 34, column: 63, scope: !5)
|
326 |
+
!22 = !DILocation(line: 34, column: 115, scope: !5)
|
327 |
+
!23 = !DILocation(line: 35, column: 34, scope: !5)
|
328 |
+
!24 = !DILocation(line: 35, column: 63, scope: !5)
|
329 |
+
!25 = !DILocation(line: 36, column: 34, scope: !5)
|
330 |
+
!26 = !DILocation(line: 36, column: 63, scope: !5)
|
331 |
+
!27 = !DILocation(line: 36, column: 115, scope: !5)
|
332 |
+
!28 = !DILocation(line: 37, column: 40, scope: !5)
|
333 |
+
!29 = !DILocation(line: 37, column: 34, scope: !5)
|
334 |
+
!30 = !DILocation(line: 37, column: 50, scope: !5)
|
335 |
+
!31 = !DILocation(line: 38, column: 34, scope: !5)
|
336 |
+
!32 = !DILocation(line: 38, column: 50, scope: !5)
|
337 |
+
!33 = !DILocation(line: 39, column: 35, scope: !5)
|
338 |
+
!34 = !DILocation(line: 39, column: 64, scope: !5)
|
339 |
+
!35 = !DILocation(line: 39, column: 116, scope: !5)
|
340 |
+
!36 = !DILocation(line: 40, column: 35, scope: !5)
|
341 |
+
!37 = !DILocation(line: 40, column: 51, scope: !5)
|
342 |
+
!38 = !DILocation(line: 41, column: 35, scope: !5)
|
343 |
+
!39 = !DILocation(line: 41, column: 51, scope: !5)
|
344 |
+
!40 = !DILocation(line: 44, column: 22, scope: !5)
|
345 |
+
!41 = !DILocation(line: 52, column: 23, scope: !5)
|
346 |
+
!42 = !DILocation(line: 53, column: 24, scope: !5)
|
347 |
+
!43 = !DILocation(line: 54, column: 24, scope: !5)
|
348 |
+
!44 = !DILocation(line: 57, column: 40, scope: !5)
|
349 |
+
!45 = !DILocation(line: 233, column: 15, scope: !46, inlinedAt: !49)
|
350 |
+
!46 = distinct !DILexicalBlockFile(scope: !48, file: !47, discriminator: 0)
|
351 |
+
!47 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
352 |
+
!48 = distinct !DILexicalBlockFile(scope: !5, file: !47, discriminator: 0)
|
353 |
+
!49 = !DILocation(line: 243, column: 36, scope: !46, inlinedAt: !50)
|
354 |
+
!50 = !DILocation(line: 58, column: 27, scope: !46)
|
355 |
+
!51 = !DILocation(line: 243, column: 36, scope: !48, inlinedAt: !52)
|
356 |
+
!52 = !DILocation(line: 58, column: 27, scope: !48)
|
357 |
+
!53 = !DILocation(line: 58, column: 30, scope: !5)
|
358 |
+
!54 = !DILocation(line: 59, column: 25, scope: !5)
|
359 |
+
!55 = !DILocation(line: 59, column: 37, scope: !5)
|
360 |
+
!56 = !DILocation(line: 233, column: 15, scope: !46, inlinedAt: !57)
|
361 |
+
!57 = !DILocation(line: 243, column: 36, scope: !46, inlinedAt: !58)
|
362 |
+
!58 = !DILocation(line: 60, column: 27, scope: !46)
|
363 |
+
!59 = !DILocation(line: 243, column: 36, scope: !48, inlinedAt: !60)
|
364 |
+
!60 = !DILocation(line: 60, column: 27, scope: !48)
|
365 |
+
!61 = !DILocation(line: 60, column: 30, scope: !5)
|
366 |
+
!62 = !DILocation(line: 61, column: 25, scope: !5)
|
367 |
+
!63 = !DILocation(line: 61, column: 37, scope: !5)
|
368 |
+
!64 = !DILocation(line: 61, column: 4, scope: !5)
|
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ptx
ADDED
@@ -0,0 +1,771 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6d7d8d9d10de11de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3d4d5d6d7d8d9d10de11de(
|
13 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5,
|
19 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6,
|
20 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7,
|
21 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8,
|
22 |
+
.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9,
|
23 |
+
.param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_10,
|
24 |
+
.param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_11
|
25 |
+
)
|
26 |
+
.maxntid 128, 1, 1
|
27 |
+
{
|
28 |
+
.reg .pred %p<38>;
|
29 |
+
.reg .b16 %rs<13>;
|
30 |
+
.reg .b32 %r<135>;
|
31 |
+
.reg .f32 %f<103>;
|
32 |
+
.reg .b64 %rd<41>;
|
33 |
+
.loc 1 18 0
|
34 |
+
$L__func_begin0:
|
35 |
+
.loc 1 18 0
|
36 |
+
|
37 |
+
ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9];
|
38 |
+
ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8];
|
39 |
+
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5];
|
40 |
+
ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2];
|
41 |
+
ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1];
|
42 |
+
ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0];
|
43 |
+
$L__tmp0:
|
44 |
+
.loc 1 22 44
|
45 |
+
mov.u32 %r1, %tid.x;
|
46 |
+
ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3];
|
47 |
+
shl.b32 %r17, %r1, 2;
|
48 |
+
ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4];
|
49 |
+
and.b32 %r18, %r17, 60;
|
50 |
+
bfe.u32 %r19, %r1, 5, 2;
|
51 |
+
ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6];
|
52 |
+
bfe.u32 %r20, %r1, 1, 4;
|
53 |
+
ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7];
|
54 |
+
shl.b32 %r21, %r19, 4;
|
55 |
+
or.b32 %r2, %r21, %r20;
|
56 |
+
.loc 1 24 33
|
57 |
+
and.b32 %r22, %r17, 4;
|
58 |
+
bfe.u32 %r23, %r1, 4, 1;
|
59 |
+
shl.b32 %r24, %r19, 1;
|
60 |
+
or.b32 %r25, %r24, %r23;
|
61 |
+
.loc 1 21 28
|
62 |
+
mov.u32 %r15, %ctaid.x;
|
63 |
+
.loc 1 21 33
|
64 |
+
shl.b32 %r3, %r15, 6;
|
65 |
+
.loc 1 22 23
|
66 |
+
or.b32 %r26, %r3, %r18;
|
67 |
+
or.b32 %r27, %r3, %r2;
|
68 |
+
.loc 1 26 20
|
69 |
+
shr.s32 %r29, %r26, 31;
|
70 |
+
shr.u32 %r30, %r29, 24;
|
71 |
+
add.s32 %r31, %r26, %r30;
|
72 |
+
shr.s32 %r32, %r31, 8;
|
73 |
+
bfe.s32 %r33, %r15, 25, 1;
|
74 |
+
shr.u32 %r34, %r33, 24;
|
75 |
+
add.s32 %r35, %r27, %r34;
|
76 |
+
shr.s32 %r36, %r35, 8;
|
77 |
+
.loc 1 37 44
|
78 |
+
shl.b32 %r37, %r36, 7;
|
79 |
+
mul.lo.s32 %r38, %r18, 12;
|
80 |
+
or.b32 %r39, %r25, %r38;
|
81 |
+
shl.b32 %r40, %r39, 1;
|
82 |
+
mov.u32 %r41, global_smem;
|
83 |
+
add.s32 %r4, %r41, %r40;
|
84 |
+
mad.lo.s32 %r42, %r2, 12, %r22;
|
85 |
+
shl.b32 %r43, %r42, 1;
|
86 |
+
add.s32 %r6, %r41, %r43;
|
87 |
+
shl.b32 %r44, %r39, 2;
|
88 |
+
add.s32 %r7, %r41, %r44;
|
89 |
+
shl.b32 %r45, %r42, 2;
|
90 |
+
add.s32 %r9, %r41, %r45;
|
91 |
+
.loc 1 30 36
|
92 |
+
mad.lo.s32 %r46, %r32, 32512, %r26;
|
93 |
+
shl.b32 %r47, %r19, 9;
|
94 |
+
add.s32 %r48, %r46, %r47;
|
95 |
+
shl.b32 %r49, %r23, 8;
|
96 |
+
add.s32 %r133, %r48, %r49;
|
97 |
+
or.b32 %r50, %r37, %r22;
|
98 |
+
mul.wide.s32 %rd23, %r50, 4;
|
99 |
+
add.s64 %rd40, %rd22, %rd23;
|
100 |
+
add.s64 %rd39, %rd21, %rd23;
|
101 |
+
add.s64 %rd38, %rd20, %rd23;
|
102 |
+
add.s64 %rd37, %rd19, %rd23;
|
103 |
+
mov.f32 %f95, 0f00000000;
|
104 |
+
mov.b32 %r134, -8;
|
105 |
+
mov.pred %p1, -1;
|
106 |
+
mov.f32 %f96, %f95;
|
107 |
+
mov.f32 %f97, %f95;
|
108 |
+
mov.f32 %f98, %f95;
|
109 |
+
mov.f32 %f99, %f95;
|
110 |
+
mov.f32 %f100, %f95;
|
111 |
+
mov.f32 %f101, %f95;
|
112 |
+
mov.f32 %f102, %f95;
|
113 |
+
$L__BB0_1:
|
114 |
+
.loc 1 34 34
|
115 |
+
mul.wide.s32 %rd32, %r133, 2;
|
116 |
+
add.s64 %rd24, %rd13, %rd32;
|
117 |
+
mov.b32 %r53, 0;
|
118 |
+
.loc 1 34 63
|
119 |
+
mov.u32 %r51, 0x0;
|
120 |
+
mov.u32 %r52, 0x0;
|
121 |
+
@%p1 ld.global.L1::evict_first.v2.b32 { %r51, %r52 }, [ %rd24 + 0 ];
|
122 |
+
@!%p1 mov.u32 %r51, %r53;
|
123 |
+
@!%p1 mov.u32 %r52, %r53;
|
124 |
+
shr.u32 %r115, %r51, 16;
|
125 |
+
shr.u32 %r116, %r52, 16;
|
126 |
+
.loc 1 34 115
|
127 |
+
bar.sync 0;
|
128 |
+
st.shared.u16 [%r4], %r51;
|
129 |
+
st.shared.u16 [%r4+24], %r115;
|
130 |
+
st.shared.u16 [%r4+48], %r52;
|
131 |
+
st.shared.u16 [%r4+72], %r116;
|
132 |
+
bar.sync 0;
|
133 |
+
ld.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%r6];
|
134 |
+
cvt.f32.bf16 %r55, %rs1;
|
135 |
+
mov.b32 %f25, %r55;
|
136 |
+
cvt.f32.bf16 %r56, %rs2;
|
137 |
+
mov.b32 %f26, %r56;
|
138 |
+
cvt.f32.bf16 %r57, %rs3;
|
139 |
+
mov.b32 %f27, %r57;
|
140 |
+
cvt.f32.bf16 %r58, %rs4;
|
141 |
+
mov.b32 %f28, %r58;
|
142 |
+
.loc 1 35 34
|
143 |
+
mul.wide.s32 %rd33, %r133, 4;
|
144 |
+
add.s64 %rd25, %rd14, %rd33;
|
145 |
+
.loc 1 35 63
|
146 |
+
mov.u32 %r59, 0x0;
|
147 |
+
mov.u32 %r60, 0x0;
|
148 |
+
mov.u32 %r61, 0x0;
|
149 |
+
mov.u32 %r62, 0x0;
|
150 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r59, %r60, %r61, %r62 }, [ %rd25 + 0 ];
|
151 |
+
@!%p1 mov.u32 %r59, %r53;
|
152 |
+
@!%p1 mov.u32 %r60, %r53;
|
153 |
+
@!%p1 mov.u32 %r61, %r53;
|
154 |
+
@!%p1 mov.u32 %r62, %r53;
|
155 |
+
mov.b32 %f29, %r59;
|
156 |
+
mov.b32 %f30, %r60;
|
157 |
+
mov.b32 %f31, %r61;
|
158 |
+
mov.b32 %f32, %r62;
|
159 |
+
bar.sync 0;
|
160 |
+
st.shared.u32 [%r7], %r59;
|
161 |
+
st.shared.u32 [%r7+48], %r60;
|
162 |
+
st.shared.u32 [%r7+96], %r61;
|
163 |
+
st.shared.u32 [%r7+144], %r62;
|
164 |
+
bar.sync 0;
|
165 |
+
ld.shared.v4.f32 {%f33, %f34, %f35, %f36}, [%r9];
|
166 |
+
.loc 1 36 34
|
167 |
+
add.s64 %rd26, %rd15, %rd32;
|
168 |
+
.loc 1 36 63
|
169 |
+
mov.u32 %r67, 0x0;
|
170 |
+
mov.u32 %r68, 0x0;
|
171 |
+
@%p1 ld.global.L1::evict_first.v2.b32 { %r67, %r68 }, [ %rd26 + 0 ];
|
172 |
+
@!%p1 mov.u32 %r67, %r53;
|
173 |
+
@!%p1 mov.u32 %r68, %r53;
|
174 |
+
cvt.u16.u32 %rs5, %r67;
|
175 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r67; }
|
176 |
+
cvt.u16.u32 %rs7, %r68;
|
177 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r68; }
|
178 |
+
.loc 1 36 115
|
179 |
+
cvt.f32.bf16 %r71, %rs5;
|
180 |
+
mov.b32 %f37, %r71;
|
181 |
+
cvt.f32.bf16 %r72, %rs6;
|
182 |
+
mov.b32 %f38, %r72;
|
183 |
+
cvt.f32.bf16 %r73, %rs7;
|
184 |
+
mov.b32 %f39, %r73;
|
185 |
+
cvt.f32.bf16 %r74, %rs8;
|
186 |
+
mov.b32 %f40, %r74;
|
187 |
+
.loc 1 37 50
|
188 |
+
mov.u32 %r75, 0x0;
|
189 |
+
mov.u32 %r76, 0x0;
|
190 |
+
mov.u32 %r77, 0x0;
|
191 |
+
mov.u32 %r78, 0x0;
|
192 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r75, %r76, %r77, %r78 }, [ %rd37 + 0 ];
|
193 |
+
@!%p1 mov.u32 %r75, %r53;
|
194 |
+
@!%p1 mov.u32 %r76, %r53;
|
195 |
+
@!%p1 mov.u32 %r77, %r53;
|
196 |
+
@!%p1 mov.u32 %r78, %r53;
|
197 |
+
.loc 1 38 50
|
198 |
+
mov.u32 %r83, 0x0;
|
199 |
+
mov.u32 %r84, 0x0;
|
200 |
+
mov.u32 %r85, 0x0;
|
201 |
+
mov.u32 %r86, 0x0;
|
202 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r83, %r84, %r85, %r86 }, [ %rd38 + 0 ];
|
203 |
+
@!%p1 mov.u32 %r83, %r53;
|
204 |
+
@!%p1 mov.u32 %r84, %r53;
|
205 |
+
@!%p1 mov.u32 %r85, %r53;
|
206 |
+
@!%p1 mov.u32 %r86, %r53;
|
207 |
+
.loc 1 39 35
|
208 |
+
add.s64 %rd29, %rd16, %rd32;
|
209 |
+
.loc 1 39 64
|
210 |
+
mov.u32 %r91, 0x0;
|
211 |
+
mov.u32 %r92, 0x0;
|
212 |
+
@%p1 ld.global.L1::evict_first.v2.b32 { %r91, %r92 }, [ %rd29 + 0 ];
|
213 |
+
@!%p1 mov.u32 %r91, %r53;
|
214 |
+
@!%p1 mov.u32 %r92, %r53;
|
215 |
+
shr.u32 %r117, %r91, 16;
|
216 |
+
shr.u32 %r118, %r92, 16;
|
217 |
+
.loc 1 39 116
|
218 |
+
bar.sync 0;
|
219 |
+
st.shared.u16 [%r4], %r91;
|
220 |
+
st.shared.u16 [%r4+24], %r117;
|
221 |
+
st.shared.u16 [%r4+48], %r92;
|
222 |
+
st.shared.u16 [%r4+72], %r118;
|
223 |
+
bar.sync 0;
|
224 |
+
ld.shared.v4.u16 {%rs9, %rs10, %rs11, %rs12}, [%r6];
|
225 |
+
cvt.f32.bf16 %r95, %rs9;
|
226 |
+
mov.b32 %f41, %r95;
|
227 |
+
cvt.f32.bf16 %r96, %rs10;
|
228 |
+
mov.b32 %f42, %r96;
|
229 |
+
cvt.f32.bf16 %r97, %rs11;
|
230 |
+
mov.b32 %f43, %r97;
|
231 |
+
cvt.f32.bf16 %r98, %rs12;
|
232 |
+
mov.b32 %f44, %r98;
|
233 |
+
.loc 1 40 51
|
234 |
+
mov.u32 %r99, 0x0;
|
235 |
+
mov.u32 %r100, 0x0;
|
236 |
+
mov.u32 %r101, 0x0;
|
237 |
+
mov.u32 %r102, 0x0;
|
238 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r99, %r100, %r101, %r102 }, [ %rd39 + 0 ];
|
239 |
+
@!%p1 mov.u32 %r99, %r53;
|
240 |
+
@!%p1 mov.u32 %r100, %r53;
|
241 |
+
@!%p1 mov.u32 %r101, %r53;
|
242 |
+
@!%p1 mov.u32 %r102, %r53;
|
243 |
+
.loc 1 41 51
|
244 |
+
mov.u32 %r107, 0x0;
|
245 |
+
mov.u32 %r108, 0x0;
|
246 |
+
mov.u32 %r109, 0x0;
|
247 |
+
mov.u32 %r110, 0x0;
|
248 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r107, %r108, %r109, %r110 }, [ %rd40 + 0 ];
|
249 |
+
@!%p1 mov.u32 %r107, %r53;
|
250 |
+
@!%p1 mov.u32 %r108, %r53;
|
251 |
+
@!%p1 mov.u32 %r109, %r53;
|
252 |
+
@!%p1 mov.u32 %r110, %r53;
|
253 |
+
.loc 1 44 22
|
254 |
+
add.f32 %f45, %f37, %f29;
|
255 |
+
add.f32 %f46, %f38, %f30;
|
256 |
+
add.f32 %f47, %f39, %f31;
|
257 |
+
add.f32 %f48, %f40, %f32;
|
258 |
+
bar.sync 0;
|
259 |
+
st.shared.f32 [%r7], %f45;
|
260 |
+
st.shared.f32 [%r7+48], %f46;
|
261 |
+
st.shared.f32 [%r7+96], %f47;
|
262 |
+
st.shared.f32 [%r7+144], %f48;
|
263 |
+
bar.sync 0;
|
264 |
+
ld.shared.v4.f32 {%f49, %f50, %f51, %f52}, [%r9];
|
265 |
+
.loc 1 40 51
|
266 |
+
mov.b32 %f53, %r75;
|
267 |
+
mov.b32 %f54, %r76;
|
268 |
+
mov.b32 %f55, %r77;
|
269 |
+
mov.b32 %f56, %r78;
|
270 |
+
mov.b32 %f57, %r99;
|
271 |
+
mov.b32 %f58, %r100;
|
272 |
+
mov.b32 %f59, %r101;
|
273 |
+
mov.b32 %f60, %r102;
|
274 |
+
.loc 1 41 51
|
275 |
+
mov.b32 %f61, %r110;
|
276 |
+
mov.b32 %f62, %r109;
|
277 |
+
mov.b32 %f63, %r108;
|
278 |
+
mov.b32 %f64, %r107;
|
279 |
+
mov.b32 %f65, %r86;
|
280 |
+
mov.b32 %f66, %r85;
|
281 |
+
mov.b32 %f67, %r84;
|
282 |
+
mov.b32 %f68, %r83;
|
283 |
+
.loc 1 52 23
|
284 |
+
sub.f32 %f69, %f36, %f60;
|
285 |
+
sub.f32 %f70, %f35, %f59;
|
286 |
+
sub.f32 %f71, %f34, %f58;
|
287 |
+
sub.f32 %f72, %f33, %f57;
|
288 |
+
sub.f32 %f73, %f52, %f56;
|
289 |
+
sub.f32 %f74, %f51, %f55;
|
290 |
+
sub.f32 %f75, %f50, %f54;
|
291 |
+
sub.f32 %f76, %f49, %f53;
|
292 |
+
.loc 1 53 24
|
293 |
+
mul.f32 %f77, %f76, %f68;
|
294 |
+
mul.f32 %f78, %f75, %f67;
|
295 |
+
mul.f32 %f79, %f74, %f66;
|
296 |
+
mul.f32 %f80, %f73, %f65;
|
297 |
+
mul.f32 %f81, %f72, %f64;
|
298 |
+
mul.f32 %f82, %f71, %f63;
|
299 |
+
mul.f32 %f83, %f70, %f62;
|
300 |
+
mul.f32 %f84, %f69, %f61;
|
301 |
+
.loc 1 57 40
|
302 |
+
fma.rn.f32 %f98, %f44, %f84, %f98;
|
303 |
+
fma.rn.f32 %f97, %f43, %f83, %f97;
|
304 |
+
fma.rn.f32 %f96, %f42, %f82, %f96;
|
305 |
+
fma.rn.f32 %f95, %f41, %f81, %f95;
|
306 |
+
fma.rn.f32 %f102, %f28, %f80, %f102;
|
307 |
+
fma.rn.f32 %f101, %f27, %f79, %f101;
|
308 |
+
fma.rn.f32 %f100, %f26, %f78, %f100;
|
309 |
+
fma.rn.f32 %f99, %f25, %f77, %f99;
|
310 |
+
.loc 1 30 36
|
311 |
+
add.s32 %r134, %r134, 8;
|
312 |
+
add.s32 %r133, %r133, 2048;
|
313 |
+
add.s64 %rd40, %rd40, 32;
|
314 |
+
add.s64 %rd39, %rd39, 32;
|
315 |
+
add.s64 %rd38, %rd38, 32;
|
316 |
+
add.s64 %rd37, %rd37, 32;
|
317 |
+
setp.lt.u32 %p35, %r134, 120;
|
318 |
+
@%p35 bra $L__BB0_1;
|
319 |
+
.loc 1 22 44
|
320 |
+
and.b32 %r121, %r1, 63;
|
321 |
+
.loc 1 22 23
|
322 |
+
or.b32 %r122, %r3, %r121;
|
323 |
+
$L__tmp1:
|
324 |
+
.loc 2 233 15
|
325 |
+
add.f32 %f85, %f99, %f100;
|
326 |
+
add.f32 %f86, %f101, %f85;
|
327 |
+
add.f32 %f87, %f102, %f86;
|
328 |
+
$L__tmp2:
|
329 |
+
.loc 2 243 36
|
330 |
+
mov.b32 %r123, %f87;
|
331 |
+
shfl.sync.bfly.b32 %r124, %r123, 1, 31, -1;
|
332 |
+
mov.b32 %f88, %r124;
|
333 |
+
$L__tmp3:
|
334 |
+
.loc 2 233 15
|
335 |
+
add.f32 %f89, %f87, %f88;
|
336 |
+
$L__tmp4:
|
337 |
+
.loc 1 58 30
|
338 |
+
bar.sync 0;
|
339 |
+
shl.b32 %r125, %r2, 2;
|
340 |
+
add.s32 %r127, %r41, %r125;
|
341 |
+
st.shared.f32 [%r127], %f89;
|
342 |
+
bar.sync 0;
|
343 |
+
shl.b32 %r128, %r121, 2;
|
344 |
+
add.s32 %r129, %r41, %r128;
|
345 |
+
ld.shared.u32 %r119, [%r129];
|
346 |
+
.loc 1 59 25
|
347 |
+
mul.wide.s32 %rd36, %r122, 4;
|
348 |
+
add.s64 %rd34, %rd17, %rd36;
|
349 |
+
.loc 1 59 37
|
350 |
+
and.b32 %r130, %r1, 64;
|
351 |
+
setp.eq.s32 %p36, %r130, 0;
|
352 |
+
@%p36 st.global.b32 [ %rd34 + 0 ], { %r119 };
|
353 |
+
$L__tmp5:
|
354 |
+
.loc 2 233 15
|
355 |
+
add.f32 %f90, %f95, %f96;
|
356 |
+
add.f32 %f91, %f97, %f90;
|
357 |
+
add.f32 %f92, %f98, %f91;
|
358 |
+
$L__tmp6:
|
359 |
+
.loc 2 243 36
|
360 |
+
mov.b32 %r131, %f92;
|
361 |
+
shfl.sync.bfly.b32 %r132, %r131, 1, 31, -1;
|
362 |
+
mov.b32 %f93, %r132;
|
363 |
+
$L__tmp7:
|
364 |
+
.loc 2 233 15
|
365 |
+
add.f32 %f94, %f92, %f93;
|
366 |
+
$L__tmp8:
|
367 |
+
.loc 1 60 30
|
368 |
+
bar.sync 0;
|
369 |
+
st.shared.f32 [%r127], %f94;
|
370 |
+
bar.sync 0;
|
371 |
+
ld.shared.u32 %r120, [%r129];
|
372 |
+
.loc 1 61 25
|
373 |
+
add.s64 %rd35, %rd18, %rd36;
|
374 |
+
.loc 1 61 37
|
375 |
+
@%p36 st.global.b32 [ %rd35 + 0 ], { %r120 };
|
376 |
+
.loc 1 61 4
|
377 |
+
ret;
|
378 |
+
$L__tmp9:
|
379 |
+
$L__func_end0:
|
380 |
+
|
381 |
+
}
|
382 |
+
.file 1 "/tmp/torchinductor_root/3x/c3xxszvgtfnjb7welqvr33z4cqouxhqjy3dpwa2qmmx2xto6sgvz.py"
|
383 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
384 |
+
.section .debug_abbrev
|
385 |
+
{
|
386 |
+
.b8 1
|
387 |
+
.b8 17
|
388 |
+
.b8 1
|
389 |
+
.b8 37
|
390 |
+
.b8 8
|
391 |
+
.b8 19
|
392 |
+
.b8 5
|
393 |
+
.b8 3
|
394 |
+
.b8 8
|
395 |
+
.b8 16
|
396 |
+
.b8 6
|
397 |
+
.b8 27
|
398 |
+
.b8 8
|
399 |
+
.b8 180
|
400 |
+
.b8 66
|
401 |
+
.b8 12
|
402 |
+
.b8 17
|
403 |
+
.b8 1
|
404 |
+
.b8 18
|
405 |
+
.b8 1
|
406 |
+
.b8 0
|
407 |
+
.b8 0
|
408 |
+
.b8 2
|
409 |
+
.b8 46
|
410 |
+
.b8 0
|
411 |
+
.b8 135
|
412 |
+
.b8 64
|
413 |
+
.b8 8
|
414 |
+
.b8 3
|
415 |
+
.b8 8
|
416 |
+
.b8 58
|
417 |
+
.b8 11
|
418 |
+
.b8 59
|
419 |
+
.b8 11
|
420 |
+
.b8 63
|
421 |
+
.b8 12
|
422 |
+
.b8 32
|
423 |
+
.b8 11
|
424 |
+
.b8 0
|
425 |
+
.b8 0
|
426 |
+
.b8 3
|
427 |
+
.b8 46
|
428 |
+
.b8 1
|
429 |
+
.b8 17
|
430 |
+
.b8 1
|
431 |
+
.b8 18
|
432 |
+
.b8 1
|
433 |
+
.b8 64
|
434 |
+
.b8 10
|
435 |
+
.b8 49
|
436 |
+
.b8 19
|
437 |
+
.b8 0
|
438 |
+
.b8 0
|
439 |
+
.b8 4
|
440 |
+
.b8 29
|
441 |
+
.b8 1
|
442 |
+
.b8 49
|
443 |
+
.b8 19
|
444 |
+
.b8 17
|
445 |
+
.b8 1
|
446 |
+
.b8 18
|
447 |
+
.b8 1
|
448 |
+
.b8 88
|
449 |
+
.b8 11
|
450 |
+
.b8 89
|
451 |
+
.b8 11
|
452 |
+
.b8 87
|
453 |
+
.b8 11
|
454 |
+
.b8 0
|
455 |
+
.b8 0
|
456 |
+
.b8 5
|
457 |
+
.b8 29
|
458 |
+
.b8 0
|
459 |
+
.b8 49
|
460 |
+
.b8 19
|
461 |
+
.b8 17
|
462 |
+
.b8 1
|
463 |
+
.b8 18
|
464 |
+
.b8 1
|
465 |
+
.b8 88
|
466 |
+
.b8 11
|
467 |
+
.b8 89
|
468 |
+
.b8 11
|
469 |
+
.b8 87
|
470 |
+
.b8 11
|
471 |
+
.b8 0
|
472 |
+
.b8 0
|
473 |
+
.b8 0
|
474 |
+
}
|
475 |
+
.section .debug_info
|
476 |
+
{
|
477 |
+
.b32 371
|
478 |
+
.b8 2
|
479 |
+
.b8 0
|
480 |
+
.b32 .debug_abbrev
|
481 |
+
.b8 8
|
482 |
+
.b8 1
|
483 |
+
.b8 116
|
484 |
+
.b8 114
|
485 |
+
.b8 105
|
486 |
+
.b8 116
|
487 |
+
.b8 111
|
488 |
+
.b8 110
|
489 |
+
.b8 0
|
490 |
+
.b8 2
|
491 |
+
.b8 0
|
492 |
+
.b8 99
|
493 |
+
.b8 51
|
494 |
+
.b8 120
|
495 |
+
.b8 120
|
496 |
+
.b8 115
|
497 |
+
.b8 122
|
498 |
+
.b8 118
|
499 |
+
.b8 103
|
500 |
+
.b8 116
|
501 |
+
.b8 102
|
502 |
+
.b8 110
|
503 |
+
.b8 106
|
504 |
+
.b8 98
|
505 |
+
.b8 55
|
506 |
+
.b8 119
|
507 |
+
.b8 101
|
508 |
+
.b8 108
|
509 |
+
.b8 113
|
510 |
+
.b8 118
|
511 |
+
.b8 114
|
512 |
+
.b8 51
|
513 |
+
.b8 51
|
514 |
+
.b8 122
|
515 |
+
.b8 52
|
516 |
+
.b8 99
|
517 |
+
.b8 113
|
518 |
+
.b8 111
|
519 |
+
.b8 117
|
520 |
+
.b8 120
|
521 |
+
.b8 104
|
522 |
+
.b8 113
|
523 |
+
.b8 106
|
524 |
+
.b8 121
|
525 |
+
.b8 51
|
526 |
+
.b8 100
|
527 |
+
.b8 112
|
528 |
+
.b8 119
|
529 |
+
.b8 97
|
530 |
+
.b8 50
|
531 |
+
.b8 113
|
532 |
+
.b8 109
|
533 |
+
.b8 109
|
534 |
+
.b8 120
|
535 |
+
.b8 50
|
536 |
+
.b8 120
|
537 |
+
.b8 116
|
538 |
+
.b8 111
|
539 |
+
.b8 54
|
540 |
+
.b8 115
|
541 |
+
.b8 103
|
542 |
+
.b8 118
|
543 |
+
.b8 122
|
544 |
+
.b8 46
|
545 |
+
.b8 112
|
546 |
+
.b8 121
|
547 |
+
.b8 0
|
548 |
+
.b32 .debug_line
|
549 |
+
.b8 47
|
550 |
+
.b8 116
|
551 |
+
.b8 109
|
552 |
+
.b8 112
|
553 |
+
.b8 47
|
554 |
+
.b8 116
|
555 |
+
.b8 111
|
556 |
+
.b8 114
|
557 |
+
.b8 99
|
558 |
+
.b8 104
|
559 |
+
.b8 105
|
560 |
+
.b8 110
|
561 |
+
.b8 100
|
562 |
+
.b8 117
|
563 |
+
.b8 99
|
564 |
+
.b8 116
|
565 |
+
.b8 111
|
566 |
+
.b8 114
|
567 |
+
.b8 95
|
568 |
+
.b8 114
|
569 |
+
.b8 111
|
570 |
+
.b8 111
|
571 |
+
.b8 116
|
572 |
+
.b8 47
|
573 |
+
.b8 51
|
574 |
+
.b8 120
|
575 |
+
.b8 0
|
576 |
+
.b8 1
|
577 |
+
.b64 $L__func_begin0
|
578 |
+
.b64 $L__func_end0
|
579 |
+
.b8 2
|
580 |
+
.b8 116
|
581 |
+
.b8 114
|
582 |
+
.b8 105
|
583 |
+
.b8 116
|
584 |
+
.b8 111
|
585 |
+
.b8 110
|
586 |
+
.b8 95
|
587 |
+
.b8 95
|
588 |
+
.b8 48
|
589 |
+
.b8 100
|
590 |
+
.b8 49
|
591 |
+
.b8 100
|
592 |
+
.b8 50
|
593 |
+
.b8 100
|
594 |
+
.b8 51
|
595 |
+
.b8 100
|
596 |
+
.b8 52
|
597 |
+
.b8 100
|
598 |
+
.b8 53
|
599 |
+
.b8 100
|
600 |
+
.b8 54
|
601 |
+
.b8 100
|
602 |
+
.b8 55
|
603 |
+
.b8 100
|
604 |
+
.b8 56
|
605 |
+
.b8 100
|
606 |
+
.b8 57
|
607 |
+
.b8 100
|
608 |
+
.b8 49
|
609 |
+
.b8 48
|
610 |
+
.b8 100
|
611 |
+
.b8 101
|
612 |
+
.b8 49
|
613 |
+
.b8 49
|
614 |
+
.b8 100
|
615 |
+
.b8 101
|
616 |
+
.b8 0
|
617 |
+
.b8 116
|
618 |
+
.b8 114
|
619 |
+
.b8 105
|
620 |
+
.b8 116
|
621 |
+
.b8 111
|
622 |
+
.b8 110
|
623 |
+
.b8 95
|
624 |
+
.b8 95
|
625 |
+
.b8 48
|
626 |
+
.b8 100
|
627 |
+
.b8 49
|
628 |
+
.b8 100
|
629 |
+
.b8 50
|
630 |
+
.b8 100
|
631 |
+
.b8 51
|
632 |
+
.b8 100
|
633 |
+
.b8 52
|
634 |
+
.b8 100
|
635 |
+
.b8 53
|
636 |
+
.b8 100
|
637 |
+
.b8 54
|
638 |
+
.b8 100
|
639 |
+
.b8 55
|
640 |
+
.b8 100
|
641 |
+
.b8 56
|
642 |
+
.b8 100
|
643 |
+
.b8 57
|
644 |
+
.b8 100
|
645 |
+
.b8 49
|
646 |
+
.b8 48
|
647 |
+
.b8 100
|
648 |
+
.b8 101
|
649 |
+
.b8 49
|
650 |
+
.b8 49
|
651 |
+
.b8 100
|
652 |
+
.b8 101
|
653 |
+
.b8 0
|
654 |
+
.b8 1
|
655 |
+
.b8 18
|
656 |
+
.b8 1
|
657 |
+
.b8 1
|
658 |
+
.b8 3
|
659 |
+
.b64 $L__func_begin0
|
660 |
+
.b64 $L__func_end0
|
661 |
+
.b8 1
|
662 |
+
.b8 156
|
663 |
+
.b32 125
|
664 |
+
.b8 4
|
665 |
+
.b32 125
|
666 |
+
.b64 $L__tmp1
|
667 |
+
.b64 $L__tmp4
|
668 |
+
.b8 2
|
669 |
+
.b8 58
|
670 |
+
.b8 27
|
671 |
+
.b8 5
|
672 |
+
.b32 125
|
673 |
+
.b64 $L__tmp1
|
674 |
+
.b64 $L__tmp4
|
675 |
+
.b8 2
|
676 |
+
.b8 243
|
677 |
+
.b8 36
|
678 |
+
.b8 0
|
679 |
+
.b8 5
|
680 |
+
.b32 125
|
681 |
+
.b64 $L__tmp2
|
682 |
+
.b64 $L__tmp3
|
683 |
+
.b8 2
|
684 |
+
.b8 58
|
685 |
+
.b8 27
|
686 |
+
.b8 4
|
687 |
+
.b32 125
|
688 |
+
.b64 $L__tmp5
|
689 |
+
.b64 $L__tmp8
|
690 |
+
.b8 2
|
691 |
+
.b8 60
|
692 |
+
.b8 27
|
693 |
+
.b8 5
|
694 |
+
.b32 125
|
695 |
+
.b64 $L__tmp5
|
696 |
+
.b64 $L__tmp8
|
697 |
+
.b8 2
|
698 |
+
.b8 243
|
699 |
+
.b8 36
|
700 |
+
.b8 0
|
701 |
+
.b8 5
|
702 |
+
.b32 125
|
703 |
+
.b64 $L__tmp6
|
704 |
+
.b64 $L__tmp7
|
705 |
+
.b8 2
|
706 |
+
.b8 60
|
707 |
+
.b8 27
|
708 |
+
.b8 0
|
709 |
+
.b8 0
|
710 |
+
}
|
711 |
+
.section .debug_pubnames
|
712 |
+
{
|
713 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
714 |
+
$L__pubNames_start0:
|
715 |
+
.b8 2
|
716 |
+
.b8 0
|
717 |
+
.b32 .debug_info
|
718 |
+
.b32 375
|
719 |
+
.b32 125
|
720 |
+
.b8 116
|
721 |
+
.b8 114
|
722 |
+
.b8 105
|
723 |
+
.b8 116
|
724 |
+
.b8 111
|
725 |
+
.b8 110
|
726 |
+
.b8 95
|
727 |
+
.b8 95
|
728 |
+
.b8 48
|
729 |
+
.b8 100
|
730 |
+
.b8 49
|
731 |
+
.b8 100
|
732 |
+
.b8 50
|
733 |
+
.b8 100
|
734 |
+
.b8 51
|
735 |
+
.b8 100
|
736 |
+
.b8 52
|
737 |
+
.b8 100
|
738 |
+
.b8 53
|
739 |
+
.b8 100
|
740 |
+
.b8 54
|
741 |
+
.b8 100
|
742 |
+
.b8 55
|
743 |
+
.b8 100
|
744 |
+
.b8 56
|
745 |
+
.b8 100
|
746 |
+
.b8 57
|
747 |
+
.b8 100
|
748 |
+
.b8 49
|
749 |
+
.b8 48
|
750 |
+
.b8 100
|
751 |
+
.b8 101
|
752 |
+
.b8 49
|
753 |
+
.b8 49
|
754 |
+
.b8 100
|
755 |
+
.b8 101
|
756 |
+
.b8 0
|
757 |
+
.b32 0
|
758 |
+
$L__pubNames_end0:
|
759 |
+
}
|
760 |
+
.section .debug_pubtypes
|
761 |
+
{
|
762 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
763 |
+
$L__pubTypes_start0:
|
764 |
+
.b8 2
|
765 |
+
.b8 0
|
766 |
+
.b32 .debug_info
|
767 |
+
.b32 375
|
768 |
+
.b32 0
|
769 |
+
$L__pubTypes_end0:
|
770 |
+
}
|
771 |
+
.section .debug_loc { }
|
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttgir
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
4 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
5 |
+
tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
6 |
+
%cst = arith.constant dense<256> : tensor<64x1xi32, #blocked>
|
7 |
+
%cst_0 = arith.constant dense<256> : tensor<64x1xi32, #blocked1>
|
8 |
+
%cst_1 = arith.constant dense<128> : tensor<64x1xi32, #blocked1>
|
9 |
+
%cst_2 = arith.constant dense<32768> : tensor<64x1xi32, #blocked>
|
10 |
+
%cst_3 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
|
11 |
+
%cst_4 = arith.constant dense<128> : tensor<1x8xi32, #blocked1>
|
12 |
+
%cst_5 = arith.constant dense<128> : tensor<1x8xi32, #blocked>
|
13 |
+
%c0_i32 = arith.constant 0 : i32
|
14 |
+
%c128_i32 = arith.constant 128 : i32
|
15 |
+
%c8_i32 = arith.constant 8 : i32
|
16 |
+
%cst_6 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked1>
|
17 |
+
%cst_7 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
|
18 |
+
%cst_8 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked>
|
19 |
+
%c64_i32 = arith.constant 64 : i32
|
20 |
+
%0 = tt.get_program_id x : i32
|
21 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
22 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
23 |
+
%3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
24 |
+
%4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
|
25 |
+
%5 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
|
26 |
+
%6 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
|
27 |
+
%7 = tt.expand_dims %4 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xi32, #blocked2>
|
28 |
+
%8 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
|
29 |
+
%9 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
|
30 |
+
%10 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked2>
|
31 |
+
%11 = arith.addi %8, %5 : tensor<64x1xi32, #blocked>
|
32 |
+
%12 = arith.addi %9, %6 : tensor<64x1xi32, #blocked1>
|
33 |
+
%13 = arith.addi %10, %7 : tensor<64x1xi32, #blocked2>
|
34 |
+
%14 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
|
35 |
+
%15 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
36 |
+
%16 = tt.expand_dims %14 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x8xi32, #blocked1>
|
37 |
+
%17 = tt.expand_dims %15 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
|
38 |
+
%18 = arith.remsi %11, %cst : tensor<64x1xi32, #blocked>
|
39 |
+
%19 = arith.divsi %11, %cst : tensor<64x1xi32, #blocked>
|
40 |
+
%20 = arith.divsi %12, %cst_0 : tensor<64x1xi32, #blocked1>
|
41 |
+
%21 = tt.broadcast %18 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
42 |
+
%22 = arith.muli %19, %cst_2 : tensor<64x1xi32, #blocked>
|
43 |
+
%23 = tt.broadcast %22 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
44 |
+
%24 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
|
45 |
+
%25 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
|
46 |
+
%26 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
|
47 |
+
%27 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked1>
|
48 |
+
%28 = tt.broadcast %27 : (tensor<64x1xi32, #blocked1>) -> tensor<64x8xi32, #blocked1>
|
49 |
+
%29 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
|
50 |
+
%30 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
|
51 |
+
%31 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
|
52 |
+
%32 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
|
53 |
+
%33 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked1>
|
54 |
+
%34:2 = scf.for %arg12 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg13 = %cst_6, %arg14 = %cst_6) -> (tensor<64x8xf32, #blocked1>, tensor<64x8xf32, #blocked1>) : i32 {
|
55 |
+
%45 = tt.splat %arg12 : (i32) -> tensor<1x8xi32, #blocked1>
|
56 |
+
%46 = tt.splat %arg12 : (i32) -> tensor<1x8xi32, #blocked>
|
57 |
+
%47 = arith.addi %45, %16 : tensor<1x8xi32, #blocked1>
|
58 |
+
%48 = arith.addi %46, %17 : tensor<1x8xi32, #blocked>
|
59 |
+
%49 = arith.cmpi slt, %47, %cst_4 : tensor<1x8xi32, #blocked1>
|
60 |
+
%50 = arith.cmpi slt, %48, %cst_5 : tensor<1x8xi32, #blocked>
|
61 |
+
%51 = arith.muli %48, %cst_3 : tensor<1x8xi32, #blocked>
|
62 |
+
%52 = tt.broadcast %51 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
63 |
+
%53 = arith.addi %21, %52 : tensor<64x8xi32, #blocked>
|
64 |
+
%54 = arith.addi %53, %23 : tensor<64x8xi32, #blocked>
|
65 |
+
%55 = tt.addptr %24, %54 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
66 |
+
%56 = tt.broadcast %49 : (tensor<1x8xi1, #blocked1>) -> tensor<64x8xi1, #blocked1>
|
67 |
+
%57 = tt.broadcast %50 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
|
68 |
+
%58 = tt.load %55, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
|
69 |
+
%59 = triton_gpu.convert_layout %58 : (tensor<64x8xbf16, #blocked>) -> tensor<64x8xbf16, #blocked1>
|
70 |
+
%60 = arith.extf %59 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1>
|
71 |
+
%61 = tt.addptr %25, %54 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
72 |
+
%62 = tt.load %61, %57, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
|
73 |
+
%63 = triton_gpu.convert_layout %62 : (tensor<64x8xf32, #blocked>) -> tensor<64x8xf32, #blocked1>
|
74 |
+
%64 = tt.addptr %26, %54 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
75 |
+
%65 = tt.load %64, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
|
76 |
+
%66 = arith.extf %65 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
|
77 |
+
%67 = tt.broadcast %47 : (tensor<1x8xi32, #blocked1>) -> tensor<64x8xi32, #blocked1>
|
78 |
+
%68 = arith.addi %67, %28 : tensor<64x8xi32, #blocked1>
|
79 |
+
%69 = tt.addptr %29, %68 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
|
80 |
+
%70 = tt.load %69, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
|
81 |
+
%71 = tt.addptr %30, %68 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
|
82 |
+
%72 = tt.load %71, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
|
83 |
+
%73 = tt.addptr %31, %54 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
84 |
+
%74 = tt.load %73, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
|
85 |
+
%75 = triton_gpu.convert_layout %74 : (tensor<64x8xbf16, #blocked>) -> tensor<64x8xbf16, #blocked1>
|
86 |
+
%76 = arith.extf %75 : tensor<64x8xbf16, #blocked1> to tensor<64x8xf32, #blocked1>
|
87 |
+
%77 = tt.addptr %32, %68 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
|
88 |
+
%78 = tt.load %77, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
|
89 |
+
%79 = tt.addptr %33, %68 : tensor<64x8x!tt.ptr<f32, 1>, #blocked1>, tensor<64x8xi32, #blocked1>
|
90 |
+
%80 = tt.load %79, %56, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked1>
|
91 |
+
%81 = arith.addf %62, %66 : tensor<64x8xf32, #blocked>
|
92 |
+
%82 = triton_gpu.convert_layout %81 : (tensor<64x8xf32, #blocked>) -> tensor<64x8xf32, #blocked1>
|
93 |
+
%83 = arith.subf %82, %70 : tensor<64x8xf32, #blocked1>
|
94 |
+
%84 = arith.mulf %83, %72 : tensor<64x8xf32, #blocked1>
|
95 |
+
%85 = arith.mulf %60, %84 : tensor<64x8xf32, #blocked1>
|
96 |
+
%86 = arith.addf %arg13, %85 : tensor<64x8xf32, #blocked1>
|
97 |
+
%87 = arith.select %56, %86, %arg13 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1>
|
98 |
+
%88 = arith.subf %63, %78 : tensor<64x8xf32, #blocked1>
|
99 |
+
%89 = arith.mulf %88, %80 : tensor<64x8xf32, #blocked1>
|
100 |
+
%90 = arith.mulf %76, %89 : tensor<64x8xf32, #blocked1>
|
101 |
+
%91 = arith.addf %arg14, %90 : tensor<64x8xf32, #blocked1>
|
102 |
+
%92 = arith.select %56, %91, %arg14 : tensor<64x8xi1, #blocked1>, tensor<64x8xf32, #blocked1>
|
103 |
+
scf.yield %87, %92 : tensor<64x8xf32, #blocked1>, tensor<64x8xf32, #blocked1>
|
104 |
+
}
|
105 |
+
%35 = "tt.reduce"(%34#0) <{axis = 1 : i32}> ({
|
106 |
+
^bb0(%arg12: f32, %arg13: f32):
|
107 |
+
%45 = arith.addf %arg12, %arg13 : f32
|
108 |
+
tt.reduce.return %45 : f32
|
109 |
+
}) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
110 |
+
%36 = triton_gpu.convert_layout %35 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
|
111 |
+
%37 = tt.expand_dims %36 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xf32, #blocked2>
|
112 |
+
%38 = tt.splat %arg8 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked2>
|
113 |
+
%39 = tt.addptr %38, %13 : tensor<64x1x!tt.ptr<f32, 1>, #blocked2>, tensor<64x1xi32, #blocked2>
|
114 |
+
tt.store %39, %37 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked2>
|
115 |
+
%40 = "tt.reduce"(%34#1) <{axis = 1 : i32}> ({
|
116 |
+
^bb0(%arg12: f32, %arg13: f32):
|
117 |
+
%45 = arith.addf %arg12, %arg13 : f32
|
118 |
+
tt.reduce.return %45 : f32
|
119 |
+
}) : (tensor<64x8xf32, #blocked1>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
120 |
+
%41 = triton_gpu.convert_layout %40 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
|
121 |
+
%42 = tt.expand_dims %41 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xf32, #blocked2>
|
122 |
+
%43 = tt.splat %arg9 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked2>
|
123 |
+
%44 = tt.addptr %43, %13 : tensor<64x1x!tt.ptr<f32, 1>, #blocked2>, tensor<64x1xi32, #blocked2>
|
124 |
+
tt.store %44, %42 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked2>
|
125 |
+
tt.return
|
126 |
+
}
|
127 |
+
}
|
.triton/dump/99e09ac62cbb6a1b0eda5bd8218743c3/triton_.ttir
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Triton TTIR dump of one generated kernel (tensor types carry no layout
// encoding at this stage). Each program instance covers 64 consecutive
// indices along axis 0 and writes two f32 sums per index, each accumulated
// over 128 axis-1 positions in chunks of 8.
module {
|
2 |
+
// Arguments as used in the body below:
//   %arg0, %arg2, %arg5        : bf16 inputs, read at offset %38.
//   %arg1                      : f32 input, read at offset %38.
//   %arg3, %arg4, %arg6, %arg7 : f32 inputs, read at offset %49.
//   %arg8, %arg9               : f32 outputs, one element per axis-0 index (%5).
//   %arg10, %arg11             : i32 values that are not referenced in this body.
tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16>
|
4 |
+
%c8_i32 = arith.constant 8 : i32
|
5 |
+
%c128_i32 = arith.constant 128 : i32
|
6 |
+
%c0_i32 = arith.constant 0 : i32
|
7 |
+
%cst_0 = arith.constant dense<128> : tensor<64x1xi32>
|
8 |
+
%cst_1 = arith.constant dense<32768> : tensor<64x1xi32>
|
9 |
+
%cst_2 = arith.constant dense<256> : tensor<1x8xi32>
|
10 |
+
%cst_3 = arith.constant dense<128> : tensor<1x8xi32>
|
11 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
|
12 |
+
%cst_5 = arith.constant dense<256> : tensor<64x1xi32>
|
13 |
+
%c64_i32 = arith.constant 64 : i32
|
14 |
+
// Axis-0 index handled by this program: %5 = program_id(x) * 64 + [0, 64).
%0 = tt.get_program_id x : i32
|
15 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
16 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
|
17 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
|
18 |
+
%4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
|
19 |
+
%5 = arith.addi %4, %3 : tensor<64x1xi32>
|
20 |
+
// Lane offsets [0, 8) along axis 1, added to the loop counter inside the loop.
%6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
|
21 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
|
22 |
+
// Split the axis-0 index into %8 = index mod 256 and %9 = index div 256.
%8 = arith.remsi %5, %cst_5 : tensor<64x1xi32>
|
23 |
+
%9 = arith.divsi %5, %cst_5 : tensor<64x1xi32>
|
24 |
+
%10 = tt.broadcast %8 : (tensor<64x1xi32>) -> tensor<64x8xi32>
|
25 |
+
// %12 = (index div 256) * 32768, the loop-invariant term of offset %38.
%11 = arith.muli %9, %cst_1 : tensor<64x1xi32>
|
26 |
+
%12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x8xi32>
|
27 |
+
%13 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
|
28 |
+
%14 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
29 |
+
%15 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
|
30 |
+
// %17 = (index div 256) * 128, the loop-invariant term of offset %49.
%16 = arith.muli %9, %cst_0 : tensor<64x1xi32>
|
31 |
+
%17 = tt.broadcast %16 : (tensor<64x1xi32>) -> tensor<64x8xi32>
|
32 |
+
%18 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
33 |
+
%19 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
34 |
+
%20 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
|
35 |
+
%21 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
36 |
+
%22 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
37 |
+
// Walk axis-1 positions 0..127 in steps of 8, carrying two 64x8 f32
// accumulators (%arg13, %arg14) that both start at zero (%cst_4).
%23:2 = scf.for %arg12 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg13 = %cst_4, %arg14 = %cst_4) -> (tensor<64x8xf32>, tensor<64x8xf32>) : i32 {
|
38 |
+
// %33 = current axis-1 position for each of the 8 lanes.
%32 = tt.splat %arg12 : (i32) -> tensor<1x8xi32>
|
39 |
+
%33 = arith.addi %32, %7 : tensor<1x8xi32>
|
40 |
+
// Mask: position < 128. With the loop bounds above the largest position is
// 120 + 7 = 127, so the mask is true on every lane.
%34 = arith.cmpi slt, %33, %cst_3 : tensor<1x8xi32>
|
41 |
+
// Offset %38 = (index mod 256) + position * 256 + (index div 256) * 32768.
%35 = arith.muli %33, %cst_2 : tensor<1x8xi32>
|
42 |
+
%36 = tt.broadcast %35 : (tensor<1x8xi32>) -> tensor<64x8xi32>
|
43 |
+
%37 = arith.addi %10, %36 : tensor<64x8xi32>
|
44 |
+
%38 = arith.addi %37, %12 : tensor<64x8xi32>
|
45 |
+
%39 = tt.addptr %13, %38 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
|
46 |
+
%40 = tt.broadcast %34 : (tensor<1x8xi1>) -> tensor<64x8xi1>
|
47 |
+
// Masked loads below use zero (%cst / %cst_4) as the fallback value; bf16
// values are widened to f32 before any arithmetic.
%41 = tt.load %39, %40, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
|
48 |
+
%42 = arith.extf %41 : tensor<64x8xbf16> to tensor<64x8xf32>
|
49 |
+
%43 = tt.addptr %14, %38 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
|
50 |
+
%44 = tt.load %43, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
|
51 |
+
%45 = tt.addptr %15, %38 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
|
52 |
+
%46 = tt.load %45, %40, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
|
53 |
+
%47 = arith.extf %46 : tensor<64x8xbf16> to tensor<64x8xf32>
|
54 |
+
// Offset %49 = position + (index div 256) * 128.
%48 = tt.broadcast %33 : (tensor<1x8xi32>) -> tensor<64x8xi32>
|
55 |
+
%49 = arith.addi %48, %17 : tensor<64x8xi32>
|
56 |
+
%50 = tt.addptr %18, %49 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
|
57 |
+
%51 = tt.load %50, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
|
58 |
+
%52 = tt.addptr %19, %49 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
|
59 |
+
%53 = tt.load %52, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
|
60 |
+
%54 = tt.addptr %20, %38 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
|
61 |
+
%55 = tt.load %54, %40, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
|
62 |
+
%56 = arith.extf %55 : tensor<64x8xbf16> to tensor<64x8xf32>
|
63 |
+
%57 = tt.addptr %21, %49 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
|
64 |
+
%58 = tt.load %57, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
|
65 |
+
%59 = tt.addptr %22, %49 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
|
66 |
+
%60 = tt.load %59, %40, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
|
67 |
+
// First term: ext(%arg0) * (((%arg1 + ext(%arg2)) - %arg3) * %arg4),
// added to accumulator %arg13.
%61 = arith.addf %44, %47 : tensor<64x8xf32>
|
68 |
+
%62 = arith.subf %61, %51 : tensor<64x8xf32>
|
69 |
+
%63 = arith.mulf %62, %53 : tensor<64x8xf32>
|
70 |
+
%64 = arith.mulf %42, %63 : tensor<64x8xf32>
|
71 |
+
%65 = arith.addf %arg13, %64 : tensor<64x8xf32>
|
72 |
+
// Masked-off lanes keep the previous accumulator value.
%66 = arith.select %40, %65, %arg13 : tensor<64x8xi1>, tensor<64x8xf32>
|
73 |
+
// Second term: ext(%arg5) * ((%arg1 - %arg6) * %arg7),
// added to accumulator %arg14 under the same mask.
%67 = arith.subf %44, %58 : tensor<64x8xf32>
|
74 |
+
%68 = arith.mulf %67, %60 : tensor<64x8xf32>
|
75 |
+
%69 = arith.mulf %56, %68 : tensor<64x8xf32>
|
76 |
+
%70 = arith.addf %arg14, %69 : tensor<64x8xf32>
|
77 |
+
%71 = arith.select %40, %70, %arg14 : tensor<64x8xi1>, tensor<64x8xf32>
|
78 |
+
scf.yield %66, %71 : tensor<64x8xf32>, tensor<64x8xf32>
|
79 |
+
}
|
80 |
+
// Add-reduce the first accumulator along axis 1 (64x8 -> 64) and store one
// f32 per axis-0 index to %arg8 + %5. The store carries no mask operand.
%24 = "tt.reduce"(%23#0) <{axis = 1 : i32}> ({
|
81 |
+
^bb0(%arg12: f32, %arg13: f32):
|
82 |
+
%32 = arith.addf %arg12, %arg13 : f32
|
83 |
+
tt.reduce.return %32 : f32
|
84 |
+
}) : (tensor<64x8xf32>) -> tensor<64xf32>
|
85 |
+
%25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
86 |
+
%26 = tt.splat %arg8 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
|
87 |
+
%27 = tt.addptr %26, %5 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi32>
|
88 |
+
tt.store %27, %25 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32>
|
89 |
+
// Same add-reduction for the second accumulator, stored to %arg9 + %5.
%28 = "tt.reduce"(%23#1) <{axis = 1 : i32}> ({
|
90 |
+
^bb0(%arg12: f32, %arg13: f32):
|
91 |
+
%32 = arith.addf %arg12, %arg13 : f32
|
92 |
+
tt.reduce.return %32 : f32
|
93 |
+
}) : (tensor<64x8xf32>) -> tensor<64xf32>
|
94 |
+
%29 = tt.expand_dims %28 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
95 |
+
%30 = tt.splat %arg9 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
|
96 |
+
%31 = tt.addptr %30, %5 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi32>
|
97 |
+
tt.store %31, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32>
|
98 |
+
tt.return
|
99 |
+
}
|
100 |
+
}
|