Add files using upload-large-folder tool

- .triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.cubin +0 -0
- .triton/dump/4c6ad48573c74d55ed79384f6b432d50/triton_.ttgir +19 -0
- .triton/dump/510522bb05917b836ed253751364fcad/triton_.ttgir +153 -0
- .triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ttgir +164 -0
- .triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ptx +277 -0
- .triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.llir +0 -0
- .triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.cubin +0 -0
- .triton/dump/fac03406d1136fc802dac111a1efea36/triton_.llir +443 -0
- .triton/dump/fac03406d1136fc802dac111a1efea36/triton_.ttgir +81 -0
- wandb/run-20240926_055222-14kj2390/files/wandb-metadata.json +60 -0
- wandb/run-20240926_055222-14kj2390/logs/debug-core.log +14 -0
.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.cubin
ADDED
Binary file (16.8 kB).
.triton/dump/4c6ad48573c74d55ed79384f6b432d50/triton_.ttgir
ADDED
@@ -0,0 +1,19 @@
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
    %c1024_i32 = arith.constant 1024 : i32
    %0 = tt.get_program_id x : i32
    %1 = arith.muli %0, %c1024_i32 : i32
    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
    %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
    %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
    %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
    %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi32, #blocked>
    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked>
    %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
    %9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
    %10 = arith.truncf %7 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
    tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
    tt.return
  }
}
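For orientation, this TTGIR is a trivial elementwise kernel: each program loads 1024 contiguous fp32 values, truncates them to bf16 (arith.truncf), and stores them. A minimal Triton sketch that lowers to IR of this shape is given below; the kernel and argument names are illustrative and not taken from the dump, and the real Inductor-generated source may differ (for example it may mask the tail block).

    import triton
    import triton.language as tl

    @triton.jit
    def cast_fp32_to_bf16(in_ptr, out_ptr, xnumel, XBLOCK: tl.constexpr):
        # One program handles a contiguous block of XBLOCK (1024 above) elements.
        offsets = tl.program_id(0) * XBLOCK + tl.arange(0, XBLOCK)
        x = tl.load(in_ptr + offsets)                   # tt.load of f32
        tl.store(out_ptr + offsets, x.to(tl.bfloat16))  # arith.truncf + tt.store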
.triton/dump/510522bb05917b836ed253751364fcad/triton_.ttgir
ADDED
@@ -0,0 +1,153 @@
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
  tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
    %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
    %cst_0 = arith.constant dense<256> : tensor<1x64xi32, #blocked>
    %cst_1 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
    %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked>
    %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x64xf32, #blocked>
    %cst_4 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
    %cst_5 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
    %cst_6 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
    %cst_7 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
    %cst_8 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
    %cst_9 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
    %c0_i32 = arith.constant 0 : i32
    %c64_i32 = arith.constant 64 : i32
    %c256_i32 = arith.constant 256 : i32
    %cst_10 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked2>
    %cst_11 = arith.constant 0.000000e+00 : f32
    %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked2>
    %cst_13 = arith.constant dense<256> : tensor<1x64xi32, #blocked2>
    %cst_14 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
    %cst_15 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
    %0 = tt.get_program_id x : i32
    %1 = arith.muli %0, %c64_i32 : i32
    %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
    %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
    %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
    %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
    %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
    %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
    %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
    %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
    %10 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
    %11 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
    %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x64xi32, #blocked>
    %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x64xi32, #blocked2>
    %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
    %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
    %16 = tt.addptr %14, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
    %17 = tt.addptr %15, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
    %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
    %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
    %20 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
    %21 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked>
    %22 = tt.broadcast %21 : (tensor<64x1xi32, #blocked>) -> tensor<64x64xi32, #blocked>
    %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
    %24 = arith.addi %18, %cst_7 : tensor<64x1xi64, #blocked>
    %25 = arith.addi %19, %cst_8 : tensor<64x1xi64, #blocked1>
    %26 = arith.cmpi slt, %18, %cst_6 : tensor<64x1xi64, #blocked>
    %27 = arith.cmpi slt, %19, %cst_9 : tensor<64x1xi64, #blocked1>
    %28 = arith.select %26, %24, %18 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
    %29 = arith.select %27, %25, %19 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
    %30 = arith.cmpi sge, %29, %cst_9 : tensor<64x1xi64, #blocked1>
    %31 = arith.cmpi slt, %29, %cst_8 : tensor<64x1xi64, #blocked1>
    %32 = arith.andi %30, %31 : tensor<64x1xi1, #blocked1>
    %33 = arith.muli %28, %cst_5 : tensor<64x1xi64, #blocked>
    %34 = tt.broadcast %33 : (tensor<64x1xi64, #blocked>) -> tensor<64x64xi64, #blocked>
    %35 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
    %36:4 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg8 = %cst_2, %arg9 = %cst_2, %arg10 = %cst_12, %arg11 = %cst_2) -> (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked>) : i32 {
      %48 = tt.splat %arg7 : (i32) -> tensor<1x64xi32, #blocked>
      %49 = tt.splat %arg7 : (i32) -> tensor<1x64xi32, #blocked2>
      %50 = arith.addi %48, %12 : tensor<1x64xi32, #blocked>
      %51 = arith.addi %49, %13 : tensor<1x64xi32, #blocked2>
      %52 = arith.cmpi slt, %50, %cst_0 : tensor<1x64xi32, #blocked>
      %53 = arith.cmpi slt, %51, %cst_13 : tensor<1x64xi32, #blocked2>
      %54 = tt.broadcast %50 : (tensor<1x64xi32, #blocked>) -> tensor<64x64xi32, #blocked>
      %55 = arith.addi %54, %22 : tensor<64x64xi32, #blocked>
      %56 = tt.addptr %23, %55 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi32, #blocked>
      %57 = tt.broadcast %52 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
      %58 = tt.broadcast %53 : (tensor<1x64xi1, #blocked2>) -> tensor<64x64xi1, #blocked2>
      %59 = tt.load %56, %57, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
      tt.assert %32, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
      %60 = arith.extsi %50 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
      %61 = tt.broadcast %60 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
      %62 = arith.addi %61, %34 : tensor<64x64xi64, #blocked>
      %63 = tt.addptr %35, %62 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
      %64 = tt.load %63, %57, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
      %65 = arith.addf %64, %59 : tensor<64x64xf32, #blocked>
      %66 = arith.subf %65, %arg8 : tensor<64x64xf32, #blocked>
      %67 = arith.addf %arg11, %cst_4 : tensor<64x64xf32, #blocked>
      %68 = arith.addf %arg10, %cst_10 : tensor<64x64xf32, #blocked2>
      %69 = arith.divf %66, %67 : tensor<64x64xf32, #blocked>
      %70 = arith.addf %arg8, %69 : tensor<64x64xf32, #blocked>
      %71 = arith.subf %65, %70 : tensor<64x64xf32, #blocked>
      %72 = arith.mulf %66, %71 : tensor<64x64xf32, #blocked>
      %73 = arith.addf %arg9, %72 : tensor<64x64xf32, #blocked>
      %74 = arith.select %57, %70, %arg8 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
      %75 = arith.select %57, %73, %arg9 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
      %76 = arith.select %57, %67, %arg11 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
      %77 = arith.select %58, %68, %arg10 : tensor<64x64xi1, #blocked2>, tensor<64x64xf32, #blocked2>
      scf.yield %74, %75, %77, %76 : tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked>
    }
    %37 = triton_gpu.convert_layout %36#2 : (tensor<64x64xf32, #blocked2>) -> tensor<64x64xf32, #blocked>
    %38:3 = "tt.reduce"(%36#0, %36#1, %37) <{axis = 1 : i32}> ({
    ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
      %48 = arith.subf %arg10, %arg7 : f32
      %49 = arith.addf %arg9, %arg12 : f32
      %50 = arith.cmpf oeq, %49, %cst_11 : f32
      %51 = arith.divf %arg12, %49 : f32
      %52 = arith.select %50, %cst_11, %51 : f32
      %53 = arith.mulf %48, %52 : f32
      %54 = arith.addf %arg7, %53 : f32
      %55 = arith.addf %arg8, %arg11 : f32
      %56 = arith.mulf %48, %48 : f32
      %57 = arith.mulf %56, %arg9 : f32
      %58 = arith.mulf %57, %52 : f32
      %59 = arith.addf %55, %58 : f32
      tt.reduce.return %54, %59, %49 : f32, f32, f32
    }) : (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
    %39 = tt.expand_dims %38#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
    %40 = tt.expand_dims %38#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
    %41 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x64x!tt.ptr<f32, 1>, #blocked>
    %42 = tt.broadcast %39 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
    %43 = arith.divf %40, %cst_15 : tensor<64x1xf32, #blocked>
    %44 = arith.addf %43, %cst_14 : tensor<64x1xf32, #blocked>
    %45 = arith.muli %8, %cst_1 : tensor<64x1xi32, #blocked>
    %46 = tt.broadcast %45 : (tensor<64x1xi32, #blocked>) -> tensor<64x64xi32, #blocked>
    %47 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>, #blocked>
    scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 : i32 {
      %48 = tt.splat %arg7 : (i32) -> tensor<1x64xi32, #blocked>
      %49 = arith.addi %48, %12 : tensor<1x64xi32, #blocked>
      %50 = arith.cmpi slt, %49, %cst_0 : tensor<1x64xi32, #blocked>
      %51 = tt.broadcast %49 : (tensor<1x64xi32, #blocked>) -> tensor<64x64xi32, #blocked>
      %52 = arith.addi %51, %22 : tensor<64x64xi32, #blocked>
      %53 = tt.addptr %23, %52 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi32, #blocked>
      %54 = tt.broadcast %50 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
      %55 = tt.load %53, %54, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
      %56 = tt.addptr %41, %49 : tensor<1x64x!tt.ptr<f32, 1>, #blocked>, tensor<1x64xi32, #blocked>
      %57 = tt.load %56, %50, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32, #blocked>
      tt.assert %32, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
      %58 = arith.extsi %49 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
      %59 = tt.broadcast %58 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
      %60 = arith.addi %59, %34 : tensor<64x64xi64, #blocked>
      %61 = tt.addptr %35, %60 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
      %62 = tt.load %61, %54, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
      %63 = arith.addf %62, %55 : tensor<64x64xf32, #blocked>
      %64 = arith.subf %63, %42 : tensor<64x64xf32, #blocked>
      %65 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
      %66 = tt.broadcast %65 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
      %67 = arith.mulf %64, %66 : tensor<64x64xf32, #blocked>
      %68 = tt.broadcast %57 : (tensor<1x64xf32, #blocked>) -> tensor<64x64xf32, #blocked>
      %69 = arith.mulf %67, %68 : tensor<64x64xf32, #blocked>
      %70 = arith.addi %51, %46 : tensor<64x64xi32, #blocked>
      %71 = tt.addptr %47, %70 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi32, #blocked>
      %72 = arith.truncf %69 : tensor<64x64xf32, #blocked> to tensor<64x64xbf16, #blocked>
      tt.store %71, %72, %54 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16, #blocked>
    }
    tt.return
  }
}
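Read end to end, this kernel fuses an embedding lookup with a LayerNorm: token indices are bounds-checked against a 50257-entry table (the tt.assert), token and position embeddings are gathered and summed, per-row mean and M2 are accumulated with a Welford-style update over 256 features (the tt.reduce region merges partial means, M2 values and counts), and each row is normalized with rsqrt(var/256 + 1e-5), scaled by a weight vector and stored as bf16. A rough PyTorch-level equivalent, with all shapes, argument roles and names assumed for illustration, is:

    import torch

    def embed_layernorm(idx, tok_emb, pos_emb, ln_weight, eps=1e-5):
        # idx: (rows,) int64 token ids; tok_emb: (50257, 256); pos_emb: (512, 256); ln_weight: (256,)
        pos = torch.arange(idx.numel(), device=idx.device) % 512
        x = tok_emb[idx] + pos_emb[pos]                      # the two masked tt.load streams, summed
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, unbiased=False, keepdim=True)    # M2 / 256, as in the reduce epilogue
        y = (x - mean) * torch.rsqrt(var + eps) * ln_weight  # no bias term appears in the IR
        return y.to(torch.bfloat16)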
.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ttgir
ADDED
@@ -0,0 +1,164 @@
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
    %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
    %cst_0 = arith.constant dense<256> : tensor<1x64xi32, #blocked>
    %cst_1 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
    %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked>
    %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x64xf32, #blocked>
    %cst_4 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
    %cst_5 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
    %cst_6 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
    %cst_7 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
    %cst_8 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
    %cst_9 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
    %c0_i32 = arith.constant 0 : i32
    %c64_i32 = arith.constant 64 : i32
    %c256_i32 = arith.constant 256 : i32
    %cst_10 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked2>
    %cst_11 = arith.constant 0.000000e+00 : f32
    %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked2>
    %cst_13 = arith.constant dense<256> : tensor<1x64xi32, #blocked2>
    %cst_14 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
    %cst_15 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
    %cst_16 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked>
    %0 = tt.get_program_id x : i32
    %1 = arith.muli %0, %c64_i32 : i32
    %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
    %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
    %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
    %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
    %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
    %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
    %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
    %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
    %10 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
    %11 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
    %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x64xi32, #blocked>
    %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x64xi32, #blocked2>
    %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
    %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
    %16 = tt.addptr %14, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
    %17 = tt.addptr %15, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
    %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
    %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
    %20 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
    %21 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked>
    %22 = tt.broadcast %21 : (tensor<64x1xi32, #blocked>) -> tensor<64x64xi32, #blocked>
    %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
    %24 = arith.muli %8, %cst_1 : tensor<64x1xi32, #blocked>
    %25 = tt.broadcast %24 : (tensor<64x1xi32, #blocked>) -> tensor<64x64xi32, #blocked>
    %26 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>, #blocked>
    %27 = arith.addi %18, %cst_7 : tensor<64x1xi64, #blocked>
    %28 = arith.addi %19, %cst_8 : tensor<64x1xi64, #blocked1>
    %29 = arith.cmpi slt, %18, %cst_6 : tensor<64x1xi64, #blocked>
    %30 = arith.cmpi slt, %19, %cst_9 : tensor<64x1xi64, #blocked1>
    %31 = arith.select %29, %27, %18 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
    %32 = arith.select %30, %28, %19 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
    %33 = arith.cmpi sge, %32, %cst_9 : tensor<64x1xi64, #blocked1>
    %34 = arith.cmpi slt, %32, %cst_8 : tensor<64x1xi64, #blocked1>
    %35 = arith.andi %33, %34 : tensor<64x1xi1, #blocked1>
    %36 = arith.muli %31, %cst_5 : tensor<64x1xi64, #blocked>
    %37 = tt.broadcast %36 : (tensor<64x1xi64, #blocked>) -> tensor<64x64xi64, #blocked>
    %38 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
    %39:4 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg9 = %cst_2, %arg10 = %cst_2, %arg11 = %cst_12, %arg12 = %cst_2) -> (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked>) : i32 {
      %49 = tt.splat %arg8 : (i32) -> tensor<1x64xi32, #blocked>
      %50 = tt.splat %arg8 : (i32) -> tensor<1x64xi32, #blocked2>
      %51 = arith.addi %49, %12 : tensor<1x64xi32, #blocked>
      %52 = arith.addi %50, %13 : tensor<1x64xi32, #blocked2>
      %53 = arith.cmpi slt, %51, %cst_0 : tensor<1x64xi32, #blocked>
      %54 = arith.cmpi slt, %52, %cst_13 : tensor<1x64xi32, #blocked2>
      %55 = tt.broadcast %51 : (tensor<1x64xi32, #blocked>) -> tensor<64x64xi32, #blocked>
      %56 = arith.addi %55, %22 : tensor<64x64xi32, #blocked>
      %57 = tt.addptr %23, %56 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi32, #blocked>
      %58 = tt.broadcast %53 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
      %59 = tt.broadcast %54 : (tensor<1x64xi1, #blocked2>) -> tensor<64x64xi1, #blocked2>
      %60 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
      %61 = arith.addi %55, %25 : tensor<64x64xi32, #blocked>
      %62 = tt.addptr %26, %61 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi32, #blocked>
      %63 = tt.load %62, %58, %cst_16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xbf16, #blocked>
      %64 = arith.extf %63 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked>
      tt.assert %35, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
      %65 = arith.extsi %51 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
      %66 = tt.broadcast %65 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
      %67 = arith.addi %66, %37 : tensor<64x64xi64, #blocked>
      %68 = tt.addptr %38, %67 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
      %69 = tt.load %68, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
      %70 = arith.addf %69, %60 : tensor<64x64xf32, #blocked>
      %71 = arith.addf %70, %64 : tensor<64x64xf32, #blocked>
      %72 = arith.subf %71, %arg9 : tensor<64x64xf32, #blocked>
      %73 = arith.addf %arg12, %cst_4 : tensor<64x64xf32, #blocked>
      %74 = arith.addf %arg11, %cst_10 : tensor<64x64xf32, #blocked2>
      %75 = arith.divf %72, %73 : tensor<64x64xf32, #blocked>
      %76 = arith.addf %arg9, %75 : tensor<64x64xf32, #blocked>
      %77 = arith.subf %71, %76 : tensor<64x64xf32, #blocked>
      %78 = arith.mulf %72, %77 : tensor<64x64xf32, #blocked>
      %79 = arith.addf %arg10, %78 : tensor<64x64xf32, #blocked>
      %80 = arith.select %58, %76, %arg9 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
      %81 = arith.select %58, %79, %arg10 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
      %82 = arith.select %58, %73, %arg12 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
      %83 = arith.select %59, %74, %arg11 : tensor<64x64xi1, #blocked2>, tensor<64x64xf32, #blocked2>
      scf.yield %80, %81, %83, %82 : tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked>
    }
    %40 = triton_gpu.convert_layout %39#2 : (tensor<64x64xf32, #blocked2>) -> tensor<64x64xf32, #blocked>
    %41:3 = "tt.reduce"(%39#0, %39#1, %40) <{axis = 1 : i32}> ({
    ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
      %49 = arith.subf %arg11, %arg8 : f32
      %50 = arith.addf %arg10, %arg13 : f32
      %51 = arith.cmpf oeq, %50, %cst_11 : f32
      %52 = arith.divf %arg13, %50 : f32
      %53 = arith.select %51, %cst_11, %52 : f32
      %54 = arith.mulf %49, %53 : f32
      %55 = arith.addf %arg8, %54 : f32
      %56 = arith.addf %arg9, %arg12 : f32
      %57 = arith.mulf %49, %49 : f32
      %58 = arith.mulf %57, %arg10 : f32
      %59 = arith.mulf %58, %53 : f32
      %60 = arith.addf %56, %59 : f32
      tt.reduce.return %55, %60, %50 : f32, f32, f32
    }) : (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
    %42 = tt.expand_dims %41#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
    %43 = tt.expand_dims %41#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
    %44 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x64x!tt.ptr<f32, 1>, #blocked>
    %45 = tt.broadcast %42 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
    %46 = arith.divf %43, %cst_15 : tensor<64x1xf32, #blocked>
    %47 = arith.addf %46, %cst_14 : tensor<64x1xf32, #blocked>
    %48 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>, #blocked>
    scf.for %arg8 = %c0_i32 to %c256_i32 step %c64_i32 : i32 {
      %49 = tt.splat %arg8 : (i32) -> tensor<1x64xi32, #blocked>
      %50 = arith.addi %49, %12 : tensor<1x64xi32, #blocked>
      %51 = arith.cmpi slt, %50, %cst_0 : tensor<1x64xi32, #blocked>
      %52 = tt.broadcast %50 : (tensor<1x64xi32, #blocked>) -> tensor<64x64xi32, #blocked>
      %53 = arith.addi %52, %22 : tensor<64x64xi32, #blocked>
      %54 = tt.addptr %23, %53 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi32, #blocked>
      %55 = tt.broadcast %51 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
      %56 = tt.load %54, %55, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
      %57 = arith.addi %52, %25 : tensor<64x64xi32, #blocked>
      %58 = tt.addptr %26, %57 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi32, #blocked>
      %59 = tt.load %58, %55, %cst_16 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xbf16, #blocked>
      %60 = arith.extf %59 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked>
      %61 = tt.addptr %44, %50 : tensor<1x64x!tt.ptr<f32, 1>, #blocked>, tensor<1x64xi32, #blocked>
      %62 = tt.load %61, %51, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32, #blocked>
      tt.assert %35, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
      %63 = arith.extsi %50 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
      %64 = tt.broadcast %63 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
      %65 = arith.addi %64, %37 : tensor<64x64xi64, #blocked>
      %66 = tt.addptr %38, %65 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
      %67 = tt.load %66, %55, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
      %68 = arith.addf %67, %56 : tensor<64x64xf32, #blocked>
      %69 = arith.addf %68, %60 : tensor<64x64xf32, #blocked>
      %70 = arith.subf %69, %45 : tensor<64x64xf32, #blocked>
      %71 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
      %72 = tt.broadcast %71 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
      %73 = arith.mulf %70, %72 : tensor<64x64xf32, #blocked>
      %74 = tt.broadcast %62 : (tensor<1x64xf32, #blocked>) -> tensor<64x64xf32, #blocked>
      %75 = arith.mulf %73, %74 : tensor<64x64xf32, #blocked>
      %76 = tt.addptr %48, %57 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi32, #blocked>
      %77 = arith.truncf %75 : tensor<64x64xf32, #blocked> to tensor<64x64xbf16, #blocked>
      tt.store %76, %77, %55 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16, #blocked>
    }
    tt.return
  }
}
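This is the same fused embedding + LayerNorm pattern as the previous dump, with one additional bf16 input (%arg3) that is upcast to fp32 and added to the embedding sum before the statistics are computed, and a second bf16 pointer (%arg5) receiving the output. In terms of the sketch above, the delta is roughly the following; calling the extra operand a residual is an assumption, only the additional bf16 load is visible in the IR.

    # delta relative to embed_layernorm() above ("residual" is an assumed name)
    x = tok_emb[idx] + pos_emb[pos] + residual.float()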
.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ptx
ADDED
@@ -0,0 +1,277 @@
//
// Generated by LLVM NVPTX Back-End
//

.version 8.2
.target sm_89
.address_size 64

// .globl triton__0d1de

.visible .entry triton__0d1de(
    .param .u64 triton__0d1de_param_0,
    .param .u32 triton__0d1de_param_1
)
.maxntid 128, 1, 1
{
    .reg .pred %p<2>;
    .reg .b32 %r<6>;
    .reg .b64 %rd<5>;
    .loc 1 18 0
$L__func_begin0:
    .loc 1 18 0

    ld.param.u64 %rd3, [triton__0d1de_param_0];
$L__tmp0:
    .loc 1 21 36
    mov.u32 %r2, %tid.x;
    and.b32 %r3, %r2, 127;
    .loc 1 20 28
    mov.u32 %r1, %ctaid.x;
    .loc 1 20 33
    shl.b32 %r4, %r1, 7;
    .loc 1 21 23
    or.b32 %r5, %r4, %r3;
    .loc 1 22 21
    setp.lt.s32 %p1, %r5, 512;
    .loc 1 25 25
    cvt.s64.s32 %rd1, %r5;
    mul.wide.s32 %rd4, %r5, 8;
    add.s64 %rd2, %rd3, %rd4;
    .loc 1 25 36
    @%p1 st.global.b64 [ %rd2 + 0 ], { %rd1 };
    .loc 1 25 4
    ret;
$L__tmp1:
$L__func_end0:

}
.file 1 "/tmp/torchinductor_root/wx/cwxxgxdevnyc453z7hh4nxzgmvlhh6suwokktps3dw62btskgxt4.py"
.section .debug_abbrev
{
.b8 1
.b8 17
.b8 1
.b8 37
.b8 8
.b8 19
.b8 5
.b8 3
.b8 8
.b8 16
.b8 6
.b8 27
.b8 8
.b8 180
.b8 66
.b8 12
.b8 17
.b8 1
.b8 18
.b8 1
.b8 0
.b8 0
.b8 2
.b8 46
.b8 0
.b8 17
.b8 1
.b8 18
.b8 1
.b8 64
.b8 10
.b8 135
.b8 64
.b8 8
.b8 3
.b8 8
.b8 58
.b8 11
.b8 59
.b8 11
.b8 63
.b8 12
.b8 0
.b8 0
.b8 0
}
.section .debug_info
{
.b32 172
.b8 2
.b8 0
.b32 .debug_abbrev
.b8 8
.b8 1
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 0
.b8 2
.b8 0
.b8 99
.b8 119
.b8 120
.b8 120
.b8 103
.b8 120
.b8 100
.b8 101
.b8 118
.b8 110
.b8 121
.b8 99
.b8 52
.b8 53
.b8 51
.b8 122
.b8 55
.b8 104
.b8 104
.b8 52
.b8 110
.b8 120
.b8 122
.b8 103
.b8 109
.b8 118
.b8 108
.b8 104
.b8 104
.b8 54
.b8 115
.b8 117
.b8 119
.b8 111
.b8 107
.b8 107
.b8 116
.b8 112
.b8 115
.b8 51
.b8 100
.b8 119
.b8 54
.b8 50
.b8 98
.b8 116
.b8 115
.b8 107
.b8 103
.b8 120
.b8 116
.b8 52
.b8 46
.b8 112
.b8 121
.b8 0
.b32 .debug_line
.b8 47
.b8 116
.b8 109
.b8 112
.b8 47
.b8 116
.b8 111
.b8 114
.b8 99
.b8 104
.b8 105
.b8 110
.b8 100
.b8 117
.b8 99
.b8 116
.b8 111
.b8 114
.b8 95
.b8 114
.b8 111
.b8 111
.b8 116
.b8 47
.b8 119
.b8 120
.b8 0
.b8 1
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 2
.b64 $L__func_begin0
.b64 $L__func_end0
.b8 1
.b8 156
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 101
.b8 0
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 101
.b8 0
.b8 1
.b8 18
.b8 1
.b8 0
}
.section .debug_pubnames
{
.b32 $L__pubNames_end0-$L__pubNames_start0
$L__pubNames_start0:
.b8 2
.b8 0
.b32 .debug_info
.b32 176
.b32 125
.b8 116
.b8 114
.b8 105
.b8 116
.b8 111
.b8 110
.b8 95
.b8 95
.b8 48
.b8 100
.b8 49
.b8 100
.b8 101
.b8 0
.b32 0
$L__pubNames_end0:
}
.section .debug_pubtypes
{
.b32 $L__pubTypes_end0-$L__pubTypes_start0
$L__pubTypes_start0:
.b8 2
.b8 0
.b32 .debug_info
.b32 176
.b32 0
$L__pubTypes_end0:
}
.section .debug_loc { }
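The kernel body above is tiny: each of the 128 threads per block computes its global linear index (ctaid.x * 128 + tid.x) and, when that index is below the bound (specialized to 512 here), stores the index itself as a 64-bit integer, i.e. it materializes an arange in memory. A Triton sketch of this shape, with illustrative names and the bound passed as a runtime argument rather than baked in, might be:

    import triton
    import triton.language as tl

    @triton.jit
    def write_index_i64(out_ptr, xnumel, XBLOCK: tl.constexpr):
        # Mirrors the PTX: global index = ctaid.x * XBLOCK + tid.x, stored as s64 when in bounds.
        idx = tl.program_id(0) * XBLOCK + tl.arange(0, XBLOCK)
        tl.store(out_ptr + idx, idx.to(tl.int64), mask=idx < xnumel)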
.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.llir
ADDED
The diff for this file is too large to render.
.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.cubin
ADDED
Binary file (4.78 kB).
.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.llir
ADDED
@@ -0,0 +1,443 @@
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3de4(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4) local_unnamed_addr !dbg !5 {
|
7 |
+
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%7 = lshr i32 %6, 5, !dbg !8
|
9 |
+
%urem = and i32 %6, 255, !dbg !8
|
10 |
+
%8 = or i32 %urem, 256, !dbg !8
|
11 |
+
%9 = or i32 %urem, 512, !dbg !8
|
12 |
+
%10 = or i32 %urem, 768, !dbg !8
|
13 |
+
%11 = or i32 %urem, 1024, !dbg !8
|
14 |
+
%12 = or i32 %urem, 1280, !dbg !8
|
15 |
+
%13 = or i32 %urem, 1536, !dbg !8
|
16 |
+
%14 = or i32 %urem, 1792, !dbg !8
|
17 |
+
%15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
|
18 |
+
%16 = sext i32 %15 to i64, !dbg !10
|
19 |
+
%17 = insertelement <8 x i32> poison, i32 %urem, i64 0
|
20 |
+
%18 = insertelement <8 x i32> %17, i32 %8, i64 1
|
21 |
+
%19 = insertelement <8 x i32> %18, i32 %9, i64 2
|
22 |
+
%20 = insertelement <8 x i32> %19, i32 %10, i64 3
|
23 |
+
%21 = insertelement <8 x i32> %20, i32 %11, i64 4
|
24 |
+
%22 = insertelement <8 x i32> %21, i32 %12, i64 5
|
25 |
+
%23 = insertelement <8 x i32> %22, i32 %13, i64 6
|
26 |
+
%24 = insertelement <8 x i32> %23, i32 %14, i64 7
|
27 |
+
%25 = zext <8 x i32> %24 to <8 x i64>
|
28 |
+
%26 = mul nsw i64 %16, 50257, !dbg !11
|
29 |
+
%invariant.gep = getelementptr i16, ptr addrspace(1) %0, i64 %26, !dbg !12
|
30 |
+
br label %27, !dbg !12
|
31 |
+
|
32 |
+
27: ; preds = %5, %27
|
33 |
+
%28 = phi i32 [ 0, %5 ], [ %80, %27 ]
|
34 |
+
%29 = phi <8 x float> [ <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>, %5 ], [ %79, %27 ]
|
35 |
+
%30 = zext nneg i32 %28 to i64, !dbg !13
|
36 |
+
%31 = fcmp ord <8 x float> %29, zeroinitializer, !dbg !14
|
37 |
+
%32 = insertelement <8 x i64> poison, i64 %30, i64 0, !dbg !13
|
38 |
+
%33 = shufflevector <8 x i64> %32, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !13
|
39 |
+
%34 = or <8 x i64> %33, %25, !dbg !13
|
40 |
+
%35 = icmp ult <8 x i64> %34, <i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !18
|
41 |
+
%36 = extractelement <8 x i64> %34, i64 0, !dbg !19
|
42 |
+
%gep = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %36, !dbg !19
|
43 |
+
%37 = extractelement <8 x i64> %34, i64 1, !dbg !19
|
44 |
+
%gep20 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %37, !dbg !19
|
45 |
+
%38 = extractelement <8 x i64> %34, i64 2, !dbg !19
|
46 |
+
%gep22 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %38, !dbg !19
|
47 |
+
%39 = extractelement <8 x i64> %34, i64 3, !dbg !19
|
48 |
+
%gep24 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %39, !dbg !19
|
49 |
+
%40 = extractelement <8 x i64> %34, i64 4, !dbg !19
|
50 |
+
%gep26 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %40, !dbg !19
|
51 |
+
%41 = extractelement <8 x i64> %34, i64 5, !dbg !19
|
52 |
+
%gep28 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %41, !dbg !19
|
53 |
+
%42 = extractelement <8 x i64> %34, i64 6, !dbg !19
|
54 |
+
%gep30 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %42, !dbg !19
|
55 |
+
%43 = extractelement <8 x i64> %34, i64 7, !dbg !19
|
56 |
+
%gep32 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %43, !dbg !19
|
57 |
+
%44 = extractelement <8 x i1> %35, i64 0, !dbg !20
|
58 |
+
%45 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep, i1 %44, i16 0, i1 %44) #3, !dbg !20
|
59 |
+
%46 = extractelement <8 x i1> %35, i64 1, !dbg !20
|
60 |
+
%47 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep20, i1 %46, i16 0, i1 %46) #3, !dbg !20
|
61 |
+
%48 = extractelement <8 x i1> %35, i64 2, !dbg !20
|
62 |
+
%49 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep22, i1 %48, i16 0, i1 %48) #3, !dbg !20
|
63 |
+
%50 = extractelement <8 x i1> %35, i64 3, !dbg !20
|
64 |
+
%51 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep24, i1 %50, i16 0, i1 %50) #3, !dbg !20
|
65 |
+
%52 = extractelement <8 x i1> %35, i64 4, !dbg !20
|
66 |
+
%53 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep26, i1 %52, i16 0, i1 %52) #3, !dbg !20
|
67 |
+
%54 = extractelement <8 x i1> %35, i64 5, !dbg !20
|
68 |
+
%55 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep28, i1 %54, i16 0, i1 %54) #3, !dbg !20
|
69 |
+
%56 = extractelement <8 x i1> %35, i64 6, !dbg !20
|
70 |
+
%57 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep30, i1 %56, i16 0, i1 %56) #3, !dbg !20
|
71 |
+
%58 = extractelement <8 x i1> %35, i64 7, !dbg !20
|
72 |
+
%59 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep32, i1 %58, i16 0, i1 %58) #3, !dbg !20
|
73 |
+
%60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #3, !dbg !21
|
74 |
+
%61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #3, !dbg !21
|
75 |
+
%62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #3, !dbg !21
|
76 |
+
%63 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %51) #3, !dbg !21
|
77 |
+
%64 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %53) #3, !dbg !21
|
78 |
+
%65 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %55) #3, !dbg !21
|
79 |
+
%66 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #3, !dbg !21
|
80 |
+
%67 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #3, !dbg !21
|
81 |
+
%68 = insertelement <8 x float> poison, float %60, i64 0, !dbg !22
|
82 |
+
%69 = insertelement <8 x float> %68, float %61, i64 1, !dbg !22
|
83 |
+
%70 = insertelement <8 x float> %69, float %62, i64 2, !dbg !22
|
84 |
+
%71 = insertelement <8 x float> %70, float %63, i64 3, !dbg !22
|
85 |
+
%72 = insertelement <8 x float> %71, float %64, i64 4, !dbg !22
|
86 |
+
%73 = insertelement <8 x float> %72, float %65, i64 5, !dbg !22
|
87 |
+
%74 = insertelement <8 x float> %73, float %66, i64 6, !dbg !22
|
88 |
+
%75 = insertelement <8 x float> %74, float %67, i64 7, !dbg !22
|
89 |
+
%76 = fcmp ule <8 x float> %29, %75, !dbg !22
|
90 |
+
%77 = and <8 x i1> %31, %76, !dbg !23
|
91 |
+
%78 = and <8 x i1> %35, %77, !dbg !24
|
92 |
+
%79 = select <8 x i1> %78, <8 x float> %75, <8 x float> %29, !dbg !24
|
93 |
+
%80 = add nuw nsw i32 %28, 2048, !dbg !12
|
94 |
+
%81 = icmp ult i32 %28, 48209, !dbg !12
|
95 |
+
br i1 %81, label %27, label %82, !dbg !12
|
96 |
+
|
97 |
+
82: ; preds = %27
|
98 |
+
%83 = and i32 %6, 31, !dbg !8
|
99 |
+
%84 = and i32 %7, 7, !dbg !8
|
100 |
+
%85 = extractelement <8 x float> %79, i64 0, !dbg !25
|
101 |
+
%86 = extractelement <8 x float> %79, i64 1, !dbg !25
|
102 |
+
%87 = fcmp ogt float %85, %86, !dbg !25
|
103 |
+
%88 = fcmp uno float %85, 0.000000e+00, !dbg !29
|
104 |
+
%89 = or i1 %87, %88, !dbg !30
|
105 |
+
%90 = select i1 %89, float %85, float %86, !dbg !31
|
106 |
+
%91 = extractelement <8 x float> %79, i64 2, !dbg !25
|
107 |
+
%92 = fcmp ogt float %90, %91, !dbg !25
|
108 |
+
%93 = fcmp uno float %90, 0.000000e+00, !dbg !29
|
109 |
+
%94 = or i1 %92, %93, !dbg !30
|
110 |
+
%95 = select i1 %94, float %90, float %91, !dbg !31
|
111 |
+
%96 = extractelement <8 x float> %79, i64 3, !dbg !25
|
112 |
+
%97 = fcmp ogt float %95, %96, !dbg !25
|
113 |
+
%98 = fcmp uno float %95, 0.000000e+00, !dbg !29
|
114 |
+
%99 = or i1 %97, %98, !dbg !30
|
115 |
+
%100 = select i1 %99, float %95, float %96, !dbg !31
|
116 |
+
%101 = extractelement <8 x float> %79, i64 4, !dbg !25
|
117 |
+
%102 = fcmp ogt float %100, %101, !dbg !25
|
118 |
+
%103 = fcmp uno float %100, 0.000000e+00, !dbg !29
|
119 |
+
%104 = or i1 %102, %103, !dbg !30
|
120 |
+
%105 = select i1 %104, float %100, float %101, !dbg !31
|
121 |
+
%106 = extractelement <8 x float> %79, i64 5, !dbg !25
|
122 |
+
%107 = fcmp ogt float %105, %106, !dbg !25
|
123 |
+
%108 = fcmp uno float %105, 0.000000e+00, !dbg !29
|
124 |
+
%109 = or i1 %107, %108, !dbg !30
|
125 |
+
%110 = select i1 %109, float %105, float %106, !dbg !31
|
126 |
+
%111 = extractelement <8 x float> %79, i64 6, !dbg !25
|
127 |
+
%112 = fcmp ogt float %110, %111, !dbg !25
|
128 |
+
%113 = fcmp uno float %110, 0.000000e+00, !dbg !29
|
129 |
+
%114 = or i1 %112, %113, !dbg !30
|
130 |
+
%115 = select i1 %114, float %110, float %111, !dbg !31
|
131 |
+
%116 = extractelement <8 x float> %79, i64 7, !dbg !25
|
132 |
+
%117 = fcmp ogt float %115, %116, !dbg !25
|
133 |
+
%118 = fcmp uno float %115, 0.000000e+00, !dbg !29
|
134 |
+
%119 = or i1 %117, %118, !dbg !30
|
135 |
+
%120 = select i1 %119, float %115, float %116, !dbg !31
|
136 |
+
%121 = bitcast float %120 to i32, !dbg !32
|
137 |
+
%122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 16, i32 31), !dbg !32
|
138 |
+
%123 = bitcast i32 %122 to float, !dbg !32
|
139 |
+
%124 = fcmp ogt float %120, %123, !dbg !25
|
140 |
+
%125 = fcmp uno float %120, 0.000000e+00, !dbg !29
|
141 |
+
%126 = or i1 %125, %124, !dbg !30
|
142 |
+
%127 = select i1 %126, float %120, float %123, !dbg !31
|
143 |
+
%128 = bitcast float %127 to i32, !dbg !32
|
144 |
+
%129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 8, i32 31), !dbg !32
|
145 |
+
%130 = bitcast i32 %129 to float, !dbg !32
|
146 |
+
%131 = fcmp ogt float %127, %130, !dbg !25
|
147 |
+
%132 = fcmp uno float %127, 0.000000e+00, !dbg !29
|
148 |
+
%133 = or i1 %131, %132, !dbg !30
|
149 |
+
%134 = select i1 %133, float %127, float %130, !dbg !31
|
150 |
+
%135 = bitcast float %134 to i32, !dbg !32
|
151 |
+
%136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 4, i32 31), !dbg !32
|
152 |
+
%137 = bitcast i32 %136 to float, !dbg !32
|
153 |
+
%138 = fcmp ogt float %134, %137, !dbg !25
|
154 |
+
%139 = fcmp uno float %134, 0.000000e+00, !dbg !29
|
155 |
+
%140 = or i1 %138, %139, !dbg !30
|
156 |
+
%141 = select i1 %140, float %134, float %137, !dbg !31
|
157 |
+
%142 = bitcast float %141 to i32, !dbg !32
|
158 |
+
%143 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 2, i32 31), !dbg !32
|
159 |
+
%144 = bitcast i32 %143 to float, !dbg !32
|
160 |
+
%145 = fcmp ogt float %141, %144, !dbg !25
|
161 |
+
%146 = fcmp uno float %141, 0.000000e+00, !dbg !29
|
162 |
+
%147 = or i1 %145, %146, !dbg !30
|
163 |
+
%148 = select i1 %147, float %141, float %144, !dbg !31
|
164 |
+
%149 = bitcast float %148 to i32, !dbg !32
|
165 |
+
%150 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %149, i32 1, i32 31), !dbg !32
|
166 |
+
%151 = bitcast i32 %150 to float, !dbg !32
|
167 |
+
%152 = fcmp ogt float %148, %151, !dbg !25
|
168 |
+
%153 = fcmp uno float %148, 0.000000e+00, !dbg !29
|
169 |
+
%154 = or i1 %152, %153, !dbg !30
|
170 |
+
%155 = select i1 %154, float %148, float %151, !dbg !31
|
171 |
+
%156 = icmp eq i32 %83, 0, !dbg !32
|
172 |
+
%157 = zext nneg i32 %84 to i64, !dbg !32
|
173 |
+
%158 = getelementptr float, ptr addrspace(3) @global_smem, i64 %157, !dbg !32
|
174 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %158, float %155, i1 %156) #3, !dbg !32
|
175 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !32
|
176 |
+
%159 = icmp slt i32 %6, 8, !dbg !32
|
177 |
+
%160 = sext i32 %6 to i64, !dbg !32
|
178 |
+
%161 = getelementptr float, ptr addrspace(3) @global_smem, i64 %160, !dbg !32
|
179 |
+
%162 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %161, i1 %159) #3, !dbg !32
|
180 |
+
%163 = bitcast float %162 to i32, !dbg !32
|
181 |
+
%164 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %163, i32 4, i32 31), !dbg !32
|
182 |
+
%165 = bitcast i32 %164 to float, !dbg !32
|
183 |
+
%166 = fcmp ogt float %162, %165, !dbg !25
|
184 |
+
%167 = fcmp uno float %162, 0.000000e+00, !dbg !29
|
185 |
+
%168 = or i1 %167, %166, !dbg !30
|
186 |
+
%169 = select i1 %168, float %162, float %165, !dbg !31
|
187 |
+
%170 = bitcast float %169 to i32, !dbg !32
|
188 |
+
%171 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %170, i32 2, i32 31), !dbg !32
|
189 |
+
%172 = bitcast i32 %171 to float, !dbg !32
|
190 |
+
%173 = fcmp ogt float %169, %172, !dbg !25
|
191 |
+
%174 = fcmp uno float %169, 0.000000e+00, !dbg !29
|
192 |
+
%175 = or i1 %173, %174, !dbg !30
|
193 |
+
%176 = select i1 %175, float %169, float %172, !dbg !31
|
194 |
+
%177 = bitcast float %176 to i32, !dbg !32
|
195 |
+
%178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 1, i32 31), !dbg !32
|
196 |
+
%179 = bitcast i32 %178 to float, !dbg !32
|
197 |
+
%180 = fcmp ogt float %176, %179, !dbg !25
|
198 |
+
%181 = fcmp uno float %176, 0.000000e+00, !dbg !29
|
199 |
+
%182 = or i1 %180, %181, !dbg !30
|
200 |
+
%183 = select i1 %182, float %176, float %179, !dbg !31
|
201 |
+
%184 = and i32 %6, 7, !dbg !32
|
202 |
+
%185 = icmp eq i32 %184, 0, !dbg !32
|
203 |
+
%186 = and i1 %159, %185, !dbg !32
|
204 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %161, float %183, i1 %186) #3, !dbg !32
|
205 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !32
|
206 |
+
%187 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
|
207 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !34
|
208 |
+
%188 = insertelement <1 x float> undef, float %187, i64 0, !dbg !34
|
209 |
+
store <1 x float> %188, ptr addrspace(3) @global_smem, align 4, !dbg !34
|
210 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !34
|
211 |
+
%189 = load i32, ptr addrspace(3) @global_smem, align 4, !dbg !34
|
212 |
+
%190 = getelementptr float, ptr addrspace(1) %1, i64 %16, !dbg !35
|
213 |
+
%191 = icmp eq i32 %urem, 0, !dbg !36
|
214 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %189, ptr addrspace(1) %190, i1 %191) #3, !dbg !36
|
215 |
+
br label %192, !dbg !37
|
216 |
+
|
217 |
+
192: ; preds = %82, %192
|
218 |
+
%193 = phi i32 [ 0, %82 ], [ %266, %192 ]
|
219 |
+
%194 = phi <8 x float> [ zeroinitializer, %82 ], [ %265, %192 ]
|
220 |
+
%195 = zext nneg i32 %193 to i64, !dbg !38
|
221 |
+
%196 = insertelement <8 x i64> poison, i64 %195, i64 0, !dbg !38
|
222 |
+
%197 = shufflevector <8 x i64> %196, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !38
|
223 |
+
%198 = or <8 x i64> %197, %25, !dbg !38
|
224 |
+
%199 = icmp ult <8 x i64> %198, <i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !39
|
225 |
+
%200 = extractelement <8 x i64> %198, i64 0, !dbg !40
|
226 |
+
%gep34 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %200, !dbg !40
|
227 |
+
%201 = extractelement <8 x i64> %198, i64 1, !dbg !40
|
228 |
+
%gep36 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %201, !dbg !40
|
229 |
+
%202 = extractelement <8 x i64> %198, i64 2, !dbg !40
|
230 |
+
%gep38 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %202, !dbg !40
|
231 |
+
%203 = extractelement <8 x i64> %198, i64 3, !dbg !40
|
232 |
+
%gep40 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %203, !dbg !40
|
233 |
+
%204 = extractelement <8 x i64> %198, i64 4, !dbg !40
|
234 |
+
%gep42 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %204, !dbg !40
|
235 |
+
%205 = extractelement <8 x i64> %198, i64 5, !dbg !40
|
236 |
+
%gep44 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %205, !dbg !40
|
237 |
+
%206 = extractelement <8 x i64> %198, i64 6, !dbg !40
|
238 |
+
%gep46 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %206, !dbg !40
|
239 |
+
%207 = extractelement <8 x i64> %198, i64 7, !dbg !40
|
240 |
+
%gep48 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %207, !dbg !40
|
241 |
+
%208 = extractelement <8 x i1> %199, i64 0, !dbg !41
|
242 |
+
%209 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep34, i1 %208, i16 0, i1 %208) #3, !dbg !41
|
243 |
+
%210 = extractelement <8 x i1> %199, i64 1, !dbg !41
|
244 |
+
%211 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep36, i1 %210, i16 0, i1 %210) #3, !dbg !41
|
245 |
+
%212 = extractelement <8 x i1> %199, i64 2, !dbg !41
|
246 |
+
%213 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep38, i1 %212, i16 0, i1 %212) #3, !dbg !41
|
247 |
+
%214 = extractelement <8 x i1> %199, i64 3, !dbg !41
|
248 |
+
%215 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep40, i1 %214, i16 0, i1 %214) #3, !dbg !41
|
249 |
+
%216 = extractelement <8 x i1> %199, i64 4, !dbg !41
|
250 |
+
%217 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep42, i1 %216, i16 0, i1 %216) #3, !dbg !41
|
251 |
+
%218 = extractelement <8 x i1> %199, i64 5, !dbg !41
|
252 |
+
%219 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep44, i1 %218, i16 0, i1 %218) #3, !dbg !41
|
253 |
+
%220 = extractelement <8 x i1> %199, i64 6, !dbg !41
|
254 |
+
%221 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep46, i1 %220, i16 0, i1 %220) #3, !dbg !41
|
255 |
+
%222 = extractelement <8 x i1> %199, i64 7, !dbg !41
|
256 |
+
%223 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep48, i1 %222, i16 0, i1 %222) #3, !dbg !41
|
257 |
+
%224 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %209) #3, !dbg !42
|
258 |
+
%225 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %211) #3, !dbg !42
|
259 |
+
%226 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %213) #3, !dbg !42
|
260 |
+
%227 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %215) #3, !dbg !42
|
261 |
+
%228 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %217) #3, !dbg !42
|
262 |
+
%229 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %219) #3, !dbg !42
|
263 |
+
%230 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %221) #3, !dbg !42
|
264 |
+
%231 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %223) #3, !dbg !42
|
265 |
+
%232 = fsub float %224, %187, !dbg !43
|
266 |
+
%233 = fsub float %225, %187, !dbg !43
|
267 |
+
%234 = fsub float %226, %187, !dbg !43
|
268 |
+
%235 = fsub float %227, %187, !dbg !43
|
269 |
+
%236 = fsub float %228, %187, !dbg !43
|
270 |
+
%237 = fsub float %229, %187, !dbg !43
|
271 |
+
%238 = fsub float %230, %187, !dbg !43
|
272 |
+
%239 = fsub float %231, %187, !dbg !43
|
273 |
+
%240 = fmul float %232, 0x3FF7154760000000, !dbg !44
|
274 |
+
%241 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %240) #3, !dbg !44
|
275 |
+
%242 = fmul float %233, 0x3FF7154760000000, !dbg !44
|
276 |
+
%243 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %242) #3, !dbg !44
|
277 |
+
%244 = fmul float %234, 0x3FF7154760000000, !dbg !44
|
278 |
+
%245 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %244) #3, !dbg !44
|
279 |
+
%246 = fmul float %235, 0x3FF7154760000000, !dbg !44
|
280 |
+
%247 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %246) #3, !dbg !44
|
281 |
+
%248 = fmul float %236, 0x3FF7154760000000, !dbg !44
|
282 |
+
%249 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %248) #3, !dbg !44
|
283 |
+
%250 = fmul float %237, 0x3FF7154760000000, !dbg !44
|
284 |
+
%251 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %250) #3, !dbg !44
|
285 |
+
%252 = fmul float %238, 0x3FF7154760000000, !dbg !44
|
286 |
+
%253 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %252) #3, !dbg !44
|
287 |
+
%254 = fmul float %239, 0x3FF7154760000000, !dbg !44
|
288 |
+
%255 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %254) #3, !dbg !44
|
289 |
+
%256 = insertelement <8 x float> poison, float %241, i64 0, !dbg !45
|
290 |
+
%257 = insertelement <8 x float> %256, float %243, i64 1, !dbg !45
|
291 |
+
%258 = insertelement <8 x float> %257, float %245, i64 2, !dbg !45
|
292 |
+
%259 = insertelement <8 x float> %258, float %247, i64 3, !dbg !45
|
293 |
+
%260 = insertelement <8 x float> %259, float %249, i64 4, !dbg !45
|
294 |
+
%261 = insertelement <8 x float> %260, float %251, i64 5, !dbg !45
|
295 |
+
%262 = insertelement <8 x float> %261, float %253, i64 6, !dbg !45
|
296 |
+
%263 = insertelement <8 x float> %262, float %255, i64 7, !dbg !45
|
297 |
+
%264 = select <8 x i1> %199, <8 x float> %263, <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !45
|
298 |
+
%265 = fadd <8 x float> %194, %264, !dbg !45
|
299 |
+
%266 = add nuw nsw i32 %193, 2048, !dbg !37
|
300 |
+
%267 = icmp ult i32 %193, 48209, !dbg !37
|
301 |
+
br i1 %267, label %192, label %268, !dbg !37
|
302 |
+
|
303 |
+
268: ; preds = %192
|
304 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !46
|
305 |
+
%shift = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
|
306 |
+
%269 = fadd <8 x float> %265, %shift, !dbg !50
|
307 |
+
%shift94 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
|
308 |
+
%270 = fadd <8 x float> %shift94, %269, !dbg !50
|
309 |
+
%shift95 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
|
310 |
+
%271 = fadd <8 x float> %shift95, %270, !dbg !50
|
311 |
+
%shift96 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
|
312 |
+
%272 = fadd <8 x float> %shift96, %271, !dbg !50
|
313 |
+
%shift97 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
|
314 |
+
%273 = fadd <8 x float> %shift97, %272, !dbg !50
|
315 |
+
%shift98 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
|
316 |
+
%274 = fadd <8 x float> %shift98, %273, !dbg !50
|
317 |
+
%shift99 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
|
318 |
+
%275 = fadd <8 x float> %shift99, %274, !dbg !50
|
319 |
+
%276 = extractelement <8 x float> %275, i64 0, !dbg !50
|
320 |
+
%277 = bitcast float %276 to i32, !dbg !46
|
321 |
+
%278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 16, i32 31), !dbg !46
|
322 |
+
%279 = bitcast i32 %278 to float, !dbg !46
|
323 |
+
%280 = fadd float %276, %279, !dbg !50
|
324 |
+
%281 = bitcast float %280 to i32, !dbg !46
|
325 |
+
%282 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %281, i32 8, i32 31), !dbg !46
|
326 |
+
%283 = bitcast i32 %282 to float, !dbg !46
|
327 |
+
%284 = fadd float %280, %283, !dbg !50
|
328 |
+
%285 = bitcast float %284 to i32, !dbg !46
|
329 |
+
%286 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 4, i32 31), !dbg !46
|
330 |
+
%287 = bitcast i32 %286 to float, !dbg !46
|
331 |
+
%288 = fadd float %284, %287, !dbg !50
|
332 |
+
%289 = bitcast float %288 to i32, !dbg !46
|
333 |
+
%290 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %289, i32 2, i32 31), !dbg !46
|
334 |
+
%291 = bitcast i32 %290 to float, !dbg !46
|
335 |
+
%292 = fadd float %288, %291, !dbg !50
|
336 |
+
%293 = bitcast float %292 to i32, !dbg !46
|
337 |
+
%294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 1, i32 31), !dbg !46
|
338 |
+
%295 = bitcast i32 %294 to float, !dbg !46
|
339 |
+
%296 = fadd float %292, %295, !dbg !50
|
340 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %158, float %296, i1 %156) #3, !dbg !46
|
341 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !46
|
342 |
+
%297 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %161, i1 %159) #3, !dbg !46
|
343 |
+
%298 = bitcast float %297 to i32, !dbg !46
|
344 |
+
%299 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 4, i32 31), !dbg !46
|
345 |
+
%300 = bitcast i32 %299 to float, !dbg !46
|
346 |
+
%301 = fadd float %297, %300, !dbg !50
|
347 |
+
%302 = bitcast float %301 to i32, !dbg !46
|
348 |
+
%303 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %302, i32 2, i32 31), !dbg !46
|
349 |
+
%304 = bitcast i32 %303 to float, !dbg !46
|
350 |
+
%305 = fadd float %301, %304, !dbg !50
|
351 |
+
%306 = bitcast float %305 to i32, !dbg !46
|
352 |
+
%307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 1, i32 31), !dbg !46
|
353 |
+
%308 = bitcast i32 %307 to float, !dbg !46
|
354 |
+
%309 = fadd float %305, %308, !dbg !50
|
355 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %161, float %309, i1 %186) #3, !dbg !46
|
356 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !46
|
357 |
+
%310 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !46
|
358 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !54
|
359 |
+
%311 = insertelement <1 x float> undef, float %310, i64 0, !dbg !54
|
360 |
+
store <1 x float> %311, ptr addrspace(3) @global_smem, align 4, !dbg !54
|
361 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !54
|
362 |
+
%312 = load i32, ptr addrspace(3) @global_smem, align 4, !dbg !54
|
363 |
+
%313 = getelementptr float, ptr addrspace(1) %2, i64 %16, !dbg !55
|
364 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %312, ptr addrspace(1) %313, i1 %191) #3, !dbg !56
|
365 |
+
ret void, !dbg !57
|
366 |
+
}
|
367 |
+
|
368 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
369 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
370 |
+
|
371 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
372 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
373 |
+
|
374 |
+
; Function Attrs: convergent nocallback nounwind
|
375 |
+
declare void @llvm.nvvm.barrier0() #2
|
376 |
+
|
377 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
378 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
379 |
+
attributes #2 = { convergent nocallback nounwind }
|
380 |
+
attributes #3 = { nounwind }
|
381 |
+
|
382 |
+
!llvm.module.flags = !{!0}
|
383 |
+
!llvm.dbg.cu = !{!1}
|
384 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
385 |
+
|
386 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
387 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
388 |
+
!2 = !DIFile(filename: "ccyhhqogjmaiuaq7b54att75rswph7r3hvxgfmkjyupj74n77r6i.py", directory: "/tmp/torchinductor_root/cy")
|
389 |
+
!3 = !{ptr @triton__0d1d2d3de4, !"kernel", i32 1}
|
390 |
+
!4 = !{ptr @triton__0d1d2d3de4, !"maxntidx", i32 256}
|
391 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4", linkageName: "triton__0d1d2d3de4", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
392 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
393 |
+
!7 = !{}
|
394 |
+
!8 = !DILocation(line: 24, column: 33, scope: !5)
|
395 |
+
!9 = !DILocation(line: 21, column: 28, scope: !5)
|
396 |
+
!10 = !DILocation(line: 21, column: 34, scope: !5)
|
397 |
+
!11 = !DILocation(line: 31, column: 46, scope: !5)
|
398 |
+
!12 = !DILocation(line: 27, column: 36, scope: !5)
|
399 |
+
!13 = !DILocation(line: 28, column: 27, scope: !5)
|
400 |
+
!14 = !DILocation(line: 38, column: 21, scope: !15, inlinedAt: !17)
|
401 |
+
!15 = distinct !DILexicalBlockFile(scope: !5, file: !16, discriminator: 0)
|
402 |
+
!16 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
403 |
+
!17 = !DILocation(line: 34, column: 45, scope: !15)
|
404 |
+
!18 = !DILocation(line: 29, column: 25, scope: !5)
|
405 |
+
!19 = !DILocation(line: 31, column: 34, scope: !5)
|
406 |
+
!20 = !DILocation(line: 31, column: 52, scope: !5)
|
407 |
+
!21 = !DILocation(line: 31, column: 103, scope: !5)
|
408 |
+
!22 = !DILocation(line: 36, column: 15, scope: !15, inlinedAt: !17)
|
409 |
+
!23 = !DILocation(line: 38, column: 16, scope: !15, inlinedAt: !17)
|
410 |
+
!24 = !DILocation(line: 0, scope: !5)
|
411 |
+
!25 = !DILocation(line: 36, column: 15, scope: !26, inlinedAt: !27)
|
412 |
+
!26 = distinct !DILexicalBlockFile(scope: !15, file: !16, discriminator: 0)
|
413 |
+
!27 = !DILocation(line: 49, column: 29, scope: !26, inlinedAt: !28)
|
414 |
+
!28 = !DILocation(line: 36, column: 38, scope: !26)
|
415 |
+
!29 = !DILocation(line: 38, column: 21, scope: !26, inlinedAt: !27)
|
416 |
+
!30 = !DILocation(line: 38, column: 16, scope: !26, inlinedAt: !27)
|
417 |
+
!31 = !DILocation(line: 39, column: 29, scope: !26, inlinedAt: !27)
|
418 |
+
!32 = !DILocation(line: 49, column: 29, scope: !15, inlinedAt: !33)
|
419 |
+
!33 = !DILocation(line: 36, column: 38, scope: !15)
|
420 |
+
!34 = !DILocation(line: 36, column: 41, scope: !5)
|
421 |
+
!35 = !DILocation(line: 37, column: 25, scope: !5)
|
422 |
+
!36 = !DILocation(line: 37, column: 36, scope: !5)
|
423 |
+
!37 = !DILocation(line: 39, column: 36, scope: !5)
|
424 |
+
!38 = !DILocation(line: 40, column: 27, scope: !5)
|
425 |
+
!39 = !DILocation(line: 41, column: 25, scope: !5)
|
426 |
+
!40 = !DILocation(line: 43, column: 34, scope: !5)
|
427 |
+
!41 = !DILocation(line: 43, column: 52, scope: !5)
|
428 |
+
!42 = !DILocation(line: 43, column: 104, scope: !5)
|
429 |
+
!43 = !DILocation(line: 45, column: 22, scope: !5)
|
430 |
+
!44 = !DILocation(line: 46, column: 22, scope: !5)
|
431 |
+
!45 = !DILocation(line: 49, column: 40, scope: !5)
|
432 |
+
!46 = !DILocation(line: 243, column: 36, scope: !47, inlinedAt: !49)
|
433 |
+
!47 = distinct !DILexicalBlockFile(scope: !5, file: !48, discriminator: 0)
|
434 |
+
!48 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
435 |
+
!49 = !DILocation(line: 50, column: 27, scope: !47)
|
436 |
+
!50 = !DILocation(line: 233, column: 15, scope: !51, inlinedAt: !52)
|
437 |
+
!51 = distinct !DILexicalBlockFile(scope: !47, file: !48, discriminator: 0)
|
438 |
+
!52 = !DILocation(line: 243, column: 36, scope: !51, inlinedAt: !53)
|
439 |
+
!53 = !DILocation(line: 50, column: 27, scope: !51)
|
440 |
+
!54 = !DILocation(line: 50, column: 30, scope: !5)
|
441 |
+
!55 = !DILocation(line: 51, column: 25, scope: !5)
|
442 |
+
!56 = !DILocation(line: 51, column: 37, scope: !5)
|
443 |
+
!57 = !DILocation(line: 51, column: 4, scope: !5)
|
.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.ttgir
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3de4(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i64) attributes {noinline = false} {
|
5 |
+
%c50257_i64 = arith.constant 50257 : i64
|
6 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked>
|
7 |
+
%cst_0 = arith.constant dense<true> : tensor<1x2048xi1, #blocked>
|
8 |
+
%c50257_i32 = arith.constant 50257 : i32
|
9 |
+
%c2048_i32 = arith.constant 2048 : i32
|
10 |
+
%c0_i32 = arith.constant 0 : i32
|
11 |
+
%cst_1 = arith.constant dense<50257> : tensor<1x2048xi64, #blocked>
|
12 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked>
|
13 |
+
%cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked>
|
14 |
+
%0 = tt.get_program_id x : i32
|
15 |
+
%1 = arith.extsi %0 : i32 to i64
|
16 |
+
%2 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
17 |
+
%3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x2048xi32, #blocked>
|
18 |
+
%4 = arith.extsi %3 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked>
|
19 |
+
%5 = arith.muli %1, %c50257_i64 : i64
|
20 |
+
%6 = tt.splat %5 : (i64) -> tensor<1x2048xi64, #blocked>
|
21 |
+
%7 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>
|
22 |
+
%8 = scf.for %arg5 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg6 = %cst_3) -> (tensor<1x2048xf32, #blocked>) : i32 {
|
23 |
+
%22 = arith.extsi %arg5 : i32 to i64
|
24 |
+
%23 = tt.splat %22 : (i64) -> tensor<1x2048xi64, #blocked>
|
25 |
+
%24 = arith.addi %23, %4 : tensor<1x2048xi64, #blocked>
|
26 |
+
%25 = arith.cmpi slt, %24, %cst_1 : tensor<1x2048xi64, #blocked>
|
27 |
+
%26 = arith.addi %24, %6 : tensor<1x2048xi64, #blocked>
|
28 |
+
%27 = tt.addptr %7, %26 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
|
29 |
+
%28 = tt.load %27, %25, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xbf16, #blocked>
|
30 |
+
%29 = arith.extf %28 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked>
|
31 |
+
%30 = arith.cmpf ogt, %arg6, %29 : tensor<1x2048xf32, #blocked>
|
32 |
+
%31 = arith.cmpf une, %arg6, %arg6 : tensor<1x2048xf32, #blocked>
|
33 |
+
%32 = arith.ori %30, %31 : tensor<1x2048xi1, #blocked>
|
34 |
+
%33 = arith.xori %32, %cst_0 : tensor<1x2048xi1, #blocked>
|
35 |
+
%34 = arith.andi %25, %33 : tensor<1x2048xi1, #blocked>
|
36 |
+
%35 = arith.select %34, %29, %arg6 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked>
|
37 |
+
scf.yield %35 : tensor<1x2048xf32, #blocked>
|
38 |
+
}
|
39 |
+
%9 = "tt.reduce"(%8) <{axis = 1 : i32}> ({
|
40 |
+
^bb0(%arg5: f32, %arg6: f32):
|
41 |
+
%22 = arith.cmpf ogt, %arg5, %arg6 : f32
|
42 |
+
%23 = arith.cmpf une, %arg5, %arg5 : f32
|
43 |
+
%24 = arith.ori %22, %23 : i1
|
44 |
+
%25 = arith.select %24, %arg5, %arg6 : f32
|
45 |
+
tt.reduce.return %25 : f32
|
46 |
+
}) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
47 |
+
%10 = triton_gpu.convert_layout %9 : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
48 |
+
%11 = tt.expand_dims %10 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<1x1xf32, #blocked1>
|
49 |
+
%12 = tt.expand_dims %9 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked>
|
50 |
+
%13 = tt.addptr %arg1, %1 : !tt.ptr<f32, 1>, i64
|
51 |
+
%14 = tt.splat %13 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>, #blocked1>
|
52 |
+
tt.store %14, %11 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32, #blocked1>
|
53 |
+
%15 = tt.broadcast %12 : (tensor<1x1xf32, #blocked>) -> tensor<1x2048xf32, #blocked>
|
54 |
+
%16 = scf.for %arg5 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg6 = %cst_2) -> (tensor<1x2048xf32, #blocked>) : i32 {
|
55 |
+
%22 = arith.extsi %arg5 : i32 to i64
|
56 |
+
%23 = tt.splat %22 : (i64) -> tensor<1x2048xi64, #blocked>
|
57 |
+
%24 = arith.addi %23, %4 : tensor<1x2048xi64, #blocked>
|
58 |
+
%25 = arith.cmpi slt, %24, %cst_1 : tensor<1x2048xi64, #blocked>
|
59 |
+
%26 = arith.addi %24, %6 : tensor<1x2048xi64, #blocked>
|
60 |
+
%27 = tt.addptr %7, %26 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
|
61 |
+
%28 = tt.load %27, %25, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16, #blocked>
|
62 |
+
%29 = arith.extf %28 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked>
|
63 |
+
%30 = arith.subf %29, %15 : tensor<1x2048xf32, #blocked>
|
64 |
+
%31 = math.exp %30 : tensor<1x2048xf32, #blocked>
|
65 |
+
%32 = arith.addf %arg6, %31 : tensor<1x2048xf32, #blocked>
|
66 |
+
%33 = arith.select %25, %32, %arg6 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked>
|
67 |
+
scf.yield %33 : tensor<1x2048xf32, #blocked>
|
68 |
+
}
|
69 |
+
%17 = "tt.reduce"(%16) <{axis = 1 : i32}> ({
|
70 |
+
^bb0(%arg5: f32, %arg6: f32):
|
71 |
+
%22 = arith.addf %arg5, %arg6 : f32
|
72 |
+
tt.reduce.return %22 : f32
|
73 |
+
}) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
74 |
+
%18 = triton_gpu.convert_layout %17 : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
75 |
+
%19 = tt.expand_dims %18 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<1x1xf32, #blocked1>
|
76 |
+
%20 = tt.addptr %arg2, %1 : !tt.ptr<f32, 1>, i64
|
77 |
+
%21 = tt.splat %20 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>, #blocked1>
|
78 |
+
tt.store %21, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32, #blocked1>
|
79 |
+
tt.return
|
80 |
+
}
|
81 |
+
}
|
wandb/run-20240926_055222-14kj2390/files/wandb-metadata.json
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"os": "Linux-5.15.0-113-generic-x86_64-with-glibc2.35",
|
3 |
+
"python": "3.10.12",
|
4 |
+
"startedAt": "2024-09-26T05:52:22.950984Z",
|
5 |
+
"args": [
|
6 |
+
"--batch_size=120"
|
7 |
+
],
|
8 |
+
"program": "/root/train.py",
|
9 |
+
"codePath": "train.py",
|
10 |
+
"email": "prasadchandalada@gmail.com",
|
11 |
+
"root": "/root",
|
12 |
+
"host": "184d1c0992ce",
|
13 |
+
"username": "root",
|
14 |
+
"executable": "/usr/bin/python",
|
15 |
+
"codePathLocal": "train.py",
|
16 |
+
"cpu_count": 64,
|
17 |
+
"cpu_count_logical": 128,
|
18 |
+
"gpu": "[NVIDIA L40S, NVIDIA L40S, NVIDIA L40S, NVIDIA L40S]",
|
19 |
+
"gpu_count": 4,
|
20 |
+
"disk": {
|
21 |
+
"/": {
|
22 |
+
"total": "542239621120",
|
23 |
+
"used": "400404738048"
|
24 |
+
}
|
25 |
+
},
|
26 |
+
"memory": {
|
27 |
+
"total": "811327934464"
|
28 |
+
},
|
29 |
+
"cpu": {
|
30 |
+
"count": 64,
|
31 |
+
"countLogical": 128
|
32 |
+
},
|
33 |
+
"gpu_nvidia": [
|
34 |
+
{
|
35 |
+
"name": "NVIDIA L40S",
|
36 |
+
"memoryTotal": "48305799168",
|
37 |
+
"cudaCores": 18176,
|
38 |
+
"architecture": "Ada"
|
39 |
+
},
|
40 |
+
{
|
41 |
+
"name": "NVIDIA L40S",
|
42 |
+
"memoryTotal": "48305799168",
|
43 |
+
"cudaCores": 18176,
|
44 |
+
"architecture": "Ada"
|
45 |
+
},
|
46 |
+
{
|
47 |
+
"name": "NVIDIA L40S",
|
48 |
+
"memoryTotal": "48305799168",
|
49 |
+
"cudaCores": 18176,
|
50 |
+
"architecture": "Ada"
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"name": "NVIDIA L40S",
|
54 |
+
"memoryTotal": "48305799168",
|
55 |
+
"cudaCores": 18176,
|
56 |
+
"architecture": "Ada"
|
57 |
+
}
|
58 |
+
],
|
59 |
+
"cudaVersion": "12.2"
|
60 |
+
}
|
wandb/run-20240926_055222-14kj2390/logs/debug-core.log
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"time":"2024-09-26T05:52:22.315197362Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp0irn9n95/port-986.txt","pid":986,"debug":false,"disable-analytics":false}
|
2 |
+
{"time":"2024-09-26T05:52:22.315233622Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
|
3 |
+
{"time":"2024-09-26T05:52:22.316073319Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":986}
|
4 |
+
{"time":"2024-09-26T05:52:22.316064076Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":41193,"Zone":""}}
|
5 |
+
{"time":"2024-09-26T05:52:22.505125081Z","level":"INFO","msg":"created new connection","id":"127.0.0.1:42070"}
|
6 |
+
{"time":"2024-09-26T05:52:22.951875072Z","level":"INFO","msg":"connection init received","streamId":"14kj2390","id":"127.0.0.1:42070"}
|
7 |
+
{"time":"2024-09-26T05:52:22.952304043Z","level":"ERROR","msg":"error creating symlink","error":"symlink /root/.cache/wandb/logs/core-debug-20240926_055222.log /root/wandb/run-20240926_055222-14kj2390/logs/debug-core.log: file exists"}
|
8 |
+
{"time":"2024-09-26T05:52:22.955991404Z","level":"INFO","msg":"connection init completed","streamId":"14kj2390","id":"127.0.0.1:42070"}
|
9 |
+
{"time":"2024-09-26T12:39:40.244212691Z","level":"INFO","msg":"handle finish received","streamId":"14kj2390","id":"127.0.0.1:42070"}
|
10 |
+
{"time":"2024-09-26T12:39:41.460220703Z","level":"INFO","msg":"connection: teardown","id":"127.0.0.1:42070"}
|
11 |
+
{"time":"2024-09-26T12:39:41.460275234Z","level":"INFO","msg":"server is shutting down"}
|
12 |
+
{"time":"2024-09-26T12:39:41.460350917Z","level":"INFO","msg":"closed connection","id":"127.0.0.1:42070"}
|
13 |
+
{"time":"2024-09-26T12:39:41.460369816Z","level":"INFO","msg":"connection closed","id":"127.0.0.1:42070"}
|
14 |
+
{"time":"2024-09-26T12:39:41.460376796Z","level":"INFO","msg":"server is closed"}
|