0-hero commited on
Commit
0eeffdd
·
verified ·
1 Parent(s): 00b5d9e

Add files using upload-large-folder tool

Browse files
.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.cubin ADDED
Binary file (16.8 kB). View file
 
.triton/dump/4c6ad48573c74d55ed79384f6b432d50/triton_.ttgir ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %c1024_i32 = arith.constant 1024 : i32
5
+ %0 = tt.get_program_id x : i32
6
+ %1 = arith.muli %0, %c1024_i32 : i32
7
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
8
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
9
+ %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
10
+ %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
11
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi32, #blocked>
12
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked>
13
+ %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
14
+ %9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
15
+ %10 = arith.truncf %7 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
16
+ tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
17
+ tt.return
18
+ }
19
+ }
.triton/dump/510522bb05917b836ed253751364fcad/triton_.ttgir ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x64xi32, #blocked>
8
+ %cst_1 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
9
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked>
10
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x64xf32, #blocked>
11
+ %cst_4 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
12
+ %cst_5 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
13
+ %cst_6 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
14
+ %cst_7 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
15
+ %cst_8 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
16
+ %cst_9 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
17
+ %c0_i32 = arith.constant 0 : i32
18
+ %c64_i32 = arith.constant 64 : i32
19
+ %c256_i32 = arith.constant 256 : i32
20
+ %cst_10 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked2>
21
+ %cst_11 = arith.constant 0.000000e+00 : f32
22
+ %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked2>
23
+ %cst_13 = arith.constant dense<256> : tensor<1x64xi32, #blocked2>
24
+ %cst_14 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
25
+ %cst_15 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
26
+ %0 = tt.get_program_id x : i32
27
+ %1 = arith.muli %0, %c64_i32 : i32
28
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
29
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
30
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
31
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
32
+ %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
33
+ %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
34
+ %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
35
+ %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
36
+ %10 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
37
+ %11 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
38
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x64xi32, #blocked>
39
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x64xi32, #blocked2>
40
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
41
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
42
+ %16 = tt.addptr %14, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
43
+ %17 = tt.addptr %15, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
44
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
45
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
46
+ %20 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
47
+ %21 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked>
48
+ %22 = tt.broadcast %21 : (tensor<64x1xi32, #blocked>) -> tensor<64x64xi32, #blocked>
49
+ %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
50
+ %24 = arith.addi %18, %cst_7 : tensor<64x1xi64, #blocked>
51
+ %25 = arith.addi %19, %cst_8 : tensor<64x1xi64, #blocked1>
52
+ %26 = arith.cmpi slt, %18, %cst_6 : tensor<64x1xi64, #blocked>
53
+ %27 = arith.cmpi slt, %19, %cst_9 : tensor<64x1xi64, #blocked1>
54
+ %28 = arith.select %26, %24, %18 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
55
+ %29 = arith.select %27, %25, %19 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
56
+ %30 = arith.cmpi sge, %29, %cst_9 : tensor<64x1xi64, #blocked1>
57
+ %31 = arith.cmpi slt, %29, %cst_8 : tensor<64x1xi64, #blocked1>
58
+ %32 = arith.andi %30, %31 : tensor<64x1xi1, #blocked1>
59
+ %33 = arith.muli %28, %cst_5 : tensor<64x1xi64, #blocked>
60
+ %34 = tt.broadcast %33 : (tensor<64x1xi64, #blocked>) -> tensor<64x64xi64, #blocked>
61
+ %35 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
62
+ %36:4 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg8 = %cst_2, %arg9 = %cst_2, %arg10 = %cst_12, %arg11 = %cst_2) -> (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked>) : i32 {
63
+ %48 = tt.splat %arg7 : (i32) -> tensor<1x64xi32, #blocked>
64
+ %49 = tt.splat %arg7 : (i32) -> tensor<1x64xi32, #blocked2>
65
+ %50 = arith.addi %48, %12 : tensor<1x64xi32, #blocked>
66
+ %51 = arith.addi %49, %13 : tensor<1x64xi32, #blocked2>
67
+ %52 = arith.cmpi slt, %50, %cst_0 : tensor<1x64xi32, #blocked>
68
+ %53 = arith.cmpi slt, %51, %cst_13 : tensor<1x64xi32, #blocked2>
69
+ %54 = tt.broadcast %50 : (tensor<1x64xi32, #blocked>) -> tensor<64x64xi32, #blocked>
70
+ %55 = arith.addi %54, %22 : tensor<64x64xi32, #blocked>
71
+ %56 = tt.addptr %23, %55 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi32, #blocked>
72
+ %57 = tt.broadcast %52 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
73
+ %58 = tt.broadcast %53 : (tensor<1x64xi1, #blocked2>) -> tensor<64x64xi1, #blocked2>
74
+ %59 = tt.load %56, %57, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
75
+ tt.assert %32, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
76
+ %60 = arith.extsi %50 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
77
+ %61 = tt.broadcast %60 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
78
+ %62 = arith.addi %61, %34 : tensor<64x64xi64, #blocked>
79
+ %63 = tt.addptr %35, %62 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
80
+ %64 = tt.load %63, %57, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
81
+ %65 = arith.addf %64, %59 : tensor<64x64xf32, #blocked>
82
+ %66 = arith.subf %65, %arg8 : tensor<64x64xf32, #blocked>
83
+ %67 = arith.addf %arg11, %cst_4 : tensor<64x64xf32, #blocked>
84
+ %68 = arith.addf %arg10, %cst_10 : tensor<64x64xf32, #blocked2>
85
+ %69 = arith.divf %66, %67 : tensor<64x64xf32, #blocked>
86
+ %70 = arith.addf %arg8, %69 : tensor<64x64xf32, #blocked>
87
+ %71 = arith.subf %65, %70 : tensor<64x64xf32, #blocked>
88
+ %72 = arith.mulf %66, %71 : tensor<64x64xf32, #blocked>
89
+ %73 = arith.addf %arg9, %72 : tensor<64x64xf32, #blocked>
90
+ %74 = arith.select %57, %70, %arg8 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
91
+ %75 = arith.select %57, %73, %arg9 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
92
+ %76 = arith.select %57, %67, %arg11 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
93
+ %77 = arith.select %58, %68, %arg10 : tensor<64x64xi1, #blocked2>, tensor<64x64xf32, #blocked2>
94
+ scf.yield %74, %75, %77, %76 : tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked>
95
+ }
96
+ %37 = triton_gpu.convert_layout %36#2 : (tensor<64x64xf32, #blocked2>) -> tensor<64x64xf32, #blocked>
97
+ %38:3 = "tt.reduce"(%36#0, %36#1, %37) <{axis = 1 : i32}> ({
98
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
99
+ %48 = arith.subf %arg10, %arg7 : f32
100
+ %49 = arith.addf %arg9, %arg12 : f32
101
+ %50 = arith.cmpf oeq, %49, %cst_11 : f32
102
+ %51 = arith.divf %arg12, %49 : f32
103
+ %52 = arith.select %50, %cst_11, %51 : f32
104
+ %53 = arith.mulf %48, %52 : f32
105
+ %54 = arith.addf %arg7, %53 : f32
106
+ %55 = arith.addf %arg8, %arg11 : f32
107
+ %56 = arith.mulf %48, %48 : f32
108
+ %57 = arith.mulf %56, %arg9 : f32
109
+ %58 = arith.mulf %57, %52 : f32
110
+ %59 = arith.addf %55, %58 : f32
111
+ tt.reduce.return %54, %59, %49 : f32, f32, f32
112
+ }) : (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
113
+ %39 = tt.expand_dims %38#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
114
+ %40 = tt.expand_dims %38#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
115
+ %41 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x64x!tt.ptr<f32, 1>, #blocked>
116
+ %42 = tt.broadcast %39 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
117
+ %43 = arith.divf %40, %cst_15 : tensor<64x1xf32, #blocked>
118
+ %44 = arith.addf %43, %cst_14 : tensor<64x1xf32, #blocked>
119
+ %45 = arith.muli %8, %cst_1 : tensor<64x1xi32, #blocked>
120
+ %46 = tt.broadcast %45 : (tensor<64x1xi32, #blocked>) -> tensor<64x64xi32, #blocked>
121
+ %47 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>, #blocked>
122
+ scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 : i32 {
123
+ %48 = tt.splat %arg7 : (i32) -> tensor<1x64xi32, #blocked>
124
+ %49 = arith.addi %48, %12 : tensor<1x64xi32, #blocked>
125
+ %50 = arith.cmpi slt, %49, %cst_0 : tensor<1x64xi32, #blocked>
126
+ %51 = tt.broadcast %49 : (tensor<1x64xi32, #blocked>) -> tensor<64x64xi32, #blocked>
127
+ %52 = arith.addi %51, %22 : tensor<64x64xi32, #blocked>
128
+ %53 = tt.addptr %23, %52 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi32, #blocked>
129
+ %54 = tt.broadcast %50 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
130
+ %55 = tt.load %53, %54, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
131
+ %56 = tt.addptr %41, %49 : tensor<1x64x!tt.ptr<f32, 1>, #blocked>, tensor<1x64xi32, #blocked>
132
+ %57 = tt.load %56, %50, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32, #blocked>
133
+ tt.assert %32, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
134
+ %58 = arith.extsi %49 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
135
+ %59 = tt.broadcast %58 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
136
+ %60 = arith.addi %59, %34 : tensor<64x64xi64, #blocked>
137
+ %61 = tt.addptr %35, %60 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
138
+ %62 = tt.load %61, %54, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
139
+ %63 = arith.addf %62, %55 : tensor<64x64xf32, #blocked>
140
+ %64 = arith.subf %63, %42 : tensor<64x64xf32, #blocked>
141
+ %65 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
142
+ %66 = tt.broadcast %65 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
143
+ %67 = arith.mulf %64, %66 : tensor<64x64xf32, #blocked>
144
+ %68 = tt.broadcast %57 : (tensor<1x64xf32, #blocked>) -> tensor<64x64xf32, #blocked>
145
+ %69 = arith.mulf %67, %68 : tensor<64x64xf32, #blocked>
146
+ %70 = arith.addi %51, %46 : tensor<64x64xi32, #blocked>
147
+ %71 = tt.addptr %47, %70 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi32, #blocked>
148
+ %72 = arith.truncf %69 : tensor<64x64xf32, #blocked> to tensor<64x64xbf16, #blocked>
149
+ tt.store %71, %72, %54 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16, #blocked>
150
+ }
151
+ tt.return
152
+ }
153
+ }
.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ttgir ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x64xi32, #blocked>
8
+ %cst_1 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
9
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked>
10
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x64xf32, #blocked>
11
+ %cst_4 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
12
+ %cst_5 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
13
+ %cst_6 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
14
+ %cst_7 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
15
+ %cst_8 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
16
+ %cst_9 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
17
+ %c0_i32 = arith.constant 0 : i32
18
+ %c64_i32 = arith.constant 64 : i32
19
+ %c256_i32 = arith.constant 256 : i32
20
+ %cst_10 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked2>
21
+ %cst_11 = arith.constant 0.000000e+00 : f32
22
+ %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked2>
23
+ %cst_13 = arith.constant dense<256> : tensor<1x64xi32, #blocked2>
24
+ %cst_14 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
25
+ %cst_15 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
26
+ %cst_16 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked>
27
+ %0 = tt.get_program_id x : i32
28
+ %1 = arith.muli %0, %c64_i32 : i32
29
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
30
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
31
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
32
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
33
+ %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
34
+ %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
35
+ %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
36
+ %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
37
+ %10 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
38
+ %11 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
39
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x64xi32, #blocked>
40
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x64xi32, #blocked2>
41
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
42
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
43
+ %16 = tt.addptr %14, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
44
+ %17 = tt.addptr %15, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
45
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
46
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
47
+ %20 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
48
+ %21 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked>
49
+ %22 = tt.broadcast %21 : (tensor<64x1xi32, #blocked>) -> tensor<64x64xi32, #blocked>
50
+ %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
51
+ %24 = arith.muli %8, %cst_1 : tensor<64x1xi32, #blocked>
52
+ %25 = tt.broadcast %24 : (tensor<64x1xi32, #blocked>) -> tensor<64x64xi32, #blocked>
53
+ %26 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>, #blocked>
54
+ %27 = arith.addi %18, %cst_7 : tensor<64x1xi64, #blocked>
55
+ %28 = arith.addi %19, %cst_8 : tensor<64x1xi64, #blocked1>
56
+ %29 = arith.cmpi slt, %18, %cst_6 : tensor<64x1xi64, #blocked>
57
+ %30 = arith.cmpi slt, %19, %cst_9 : tensor<64x1xi64, #blocked1>
58
+ %31 = arith.select %29, %27, %18 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
59
+ %32 = arith.select %30, %28, %19 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
60
+ %33 = arith.cmpi sge, %32, %cst_9 : tensor<64x1xi64, #blocked1>
61
+ %34 = arith.cmpi slt, %32, %cst_8 : tensor<64x1xi64, #blocked1>
62
+ %35 = arith.andi %33, %34 : tensor<64x1xi1, #blocked1>
63
+ %36 = arith.muli %31, %cst_5 : tensor<64x1xi64, #blocked>
64
+ %37 = tt.broadcast %36 : (tensor<64x1xi64, #blocked>) -> tensor<64x64xi64, #blocked>
65
+ %38 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
66
+ %39:4 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg9 = %cst_2, %arg10 = %cst_2, %arg11 = %cst_12, %arg12 = %cst_2) -> (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked>) : i32 {
67
+ %49 = tt.splat %arg8 : (i32) -> tensor<1x64xi32, #blocked>
68
+ %50 = tt.splat %arg8 : (i32) -> tensor<1x64xi32, #blocked2>
69
+ %51 = arith.addi %49, %12 : tensor<1x64xi32, #blocked>
70
+ %52 = arith.addi %50, %13 : tensor<1x64xi32, #blocked2>
71
+ %53 = arith.cmpi slt, %51, %cst_0 : tensor<1x64xi32, #blocked>
72
+ %54 = arith.cmpi slt, %52, %cst_13 : tensor<1x64xi32, #blocked2>
73
+ %55 = tt.broadcast %51 : (tensor<1x64xi32, #blocked>) -> tensor<64x64xi32, #blocked>
74
+ %56 = arith.addi %55, %22 : tensor<64x64xi32, #blocked>
75
+ %57 = tt.addptr %23, %56 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi32, #blocked>
76
+ %58 = tt.broadcast %53 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
77
+ %59 = tt.broadcast %54 : (tensor<1x64xi1, #blocked2>) -> tensor<64x64xi1, #blocked2>
78
+ %60 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
79
+ %61 = arith.addi %55, %25 : tensor<64x64xi32, #blocked>
80
+ %62 = tt.addptr %26, %61 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi32, #blocked>
81
+ %63 = tt.load %62, %58, %cst_16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xbf16, #blocked>
82
+ %64 = arith.extf %63 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked>
83
+ tt.assert %35, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
84
+ %65 = arith.extsi %51 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
85
+ %66 = tt.broadcast %65 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
86
+ %67 = arith.addi %66, %37 : tensor<64x64xi64, #blocked>
87
+ %68 = tt.addptr %38, %67 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
88
+ %69 = tt.load %68, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
89
+ %70 = arith.addf %69, %60 : tensor<64x64xf32, #blocked>
90
+ %71 = arith.addf %70, %64 : tensor<64x64xf32, #blocked>
91
+ %72 = arith.subf %71, %arg9 : tensor<64x64xf32, #blocked>
92
+ %73 = arith.addf %arg12, %cst_4 : tensor<64x64xf32, #blocked>
93
+ %74 = arith.addf %arg11, %cst_10 : tensor<64x64xf32, #blocked2>
94
+ %75 = arith.divf %72, %73 : tensor<64x64xf32, #blocked>
95
+ %76 = arith.addf %arg9, %75 : tensor<64x64xf32, #blocked>
96
+ %77 = arith.subf %71, %76 : tensor<64x64xf32, #blocked>
97
+ %78 = arith.mulf %72, %77 : tensor<64x64xf32, #blocked>
98
+ %79 = arith.addf %arg10, %78 : tensor<64x64xf32, #blocked>
99
+ %80 = arith.select %58, %76, %arg9 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
100
+ %81 = arith.select %58, %79, %arg10 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
101
+ %82 = arith.select %58, %73, %arg12 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
102
+ %83 = arith.select %59, %74, %arg11 : tensor<64x64xi1, #blocked2>, tensor<64x64xf32, #blocked2>
103
+ scf.yield %80, %81, %83, %82 : tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked>
104
+ }
105
+ %40 = triton_gpu.convert_layout %39#2 : (tensor<64x64xf32, #blocked2>) -> tensor<64x64xf32, #blocked>
106
+ %41:3 = "tt.reduce"(%39#0, %39#1, %40) <{axis = 1 : i32}> ({
107
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
108
+ %49 = arith.subf %arg11, %arg8 : f32
109
+ %50 = arith.addf %arg10, %arg13 : f32
110
+ %51 = arith.cmpf oeq, %50, %cst_11 : f32
111
+ %52 = arith.divf %arg13, %50 : f32
112
+ %53 = arith.select %51, %cst_11, %52 : f32
113
+ %54 = arith.mulf %49, %53 : f32
114
+ %55 = arith.addf %arg8, %54 : f32
115
+ %56 = arith.addf %arg9, %arg12 : f32
116
+ %57 = arith.mulf %49, %49 : f32
117
+ %58 = arith.mulf %57, %arg10 : f32
118
+ %59 = arith.mulf %58, %53 : f32
119
+ %60 = arith.addf %56, %59 : f32
120
+ tt.reduce.return %55, %60, %50 : f32, f32, f32
121
+ }) : (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
122
+ %42 = tt.expand_dims %41#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
123
+ %43 = tt.expand_dims %41#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
124
+ %44 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x64x!tt.ptr<f32, 1>, #blocked>
125
+ %45 = tt.broadcast %42 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
126
+ %46 = arith.divf %43, %cst_15 : tensor<64x1xf32, #blocked>
127
+ %47 = arith.addf %46, %cst_14 : tensor<64x1xf32, #blocked>
128
+ %48 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>, #blocked>
129
+ scf.for %arg8 = %c0_i32 to %c256_i32 step %c64_i32 : i32 {
130
+ %49 = tt.splat %arg8 : (i32) -> tensor<1x64xi32, #blocked>
131
+ %50 = arith.addi %49, %12 : tensor<1x64xi32, #blocked>
132
+ %51 = arith.cmpi slt, %50, %cst_0 : tensor<1x64xi32, #blocked>
133
+ %52 = tt.broadcast %50 : (tensor<1x64xi32, #blocked>) -> tensor<64x64xi32, #blocked>
134
+ %53 = arith.addi %52, %22 : tensor<64x64xi32, #blocked>
135
+ %54 = tt.addptr %23, %53 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi32, #blocked>
136
+ %55 = tt.broadcast %51 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
137
+ %56 = tt.load %54, %55, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
138
+ %57 = arith.addi %52, %25 : tensor<64x64xi32, #blocked>
139
+ %58 = tt.addptr %26, %57 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi32, #blocked>
140
+ %59 = tt.load %58, %55, %cst_16 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xbf16, #blocked>
141
+ %60 = arith.extf %59 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked>
142
+ %61 = tt.addptr %44, %50 : tensor<1x64x!tt.ptr<f32, 1>, #blocked>, tensor<1x64xi32, #blocked>
143
+ %62 = tt.load %61, %51, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32, #blocked>
144
+ tt.assert %35, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
145
+ %63 = arith.extsi %50 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
146
+ %64 = tt.broadcast %63 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
147
+ %65 = arith.addi %64, %37 : tensor<64x64xi64, #blocked>
148
+ %66 = tt.addptr %38, %65 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
149
+ %67 = tt.load %66, %55, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>
150
+ %68 = arith.addf %67, %56 : tensor<64x64xf32, #blocked>
151
+ %69 = arith.addf %68, %60 : tensor<64x64xf32, #blocked>
152
+ %70 = arith.subf %69, %45 : tensor<64x64xf32, #blocked>
153
+ %71 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
154
+ %72 = tt.broadcast %71 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
155
+ %73 = arith.mulf %70, %72 : tensor<64x64xf32, #blocked>
156
+ %74 = tt.broadcast %62 : (tensor<1x64xf32, #blocked>) -> tensor<64x64xf32, #blocked>
157
+ %75 = arith.mulf %73, %74 : tensor<64x64xf32, #blocked>
158
+ %76 = tt.addptr %48, %57 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi32, #blocked>
159
+ %77 = arith.truncf %75 : tensor<64x64xf32, #blocked> to tensor<64x64xbf16, #blocked>
160
+ tt.store %76, %77, %55 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16, #blocked>
161
+ }
162
+ tt.return
163
+ }
164
+ }
.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ptx ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1de
10
+
11
+ .visible .entry triton__0d1de(
12
+ .param .u64 triton__0d1de_param_0,
13
+ .param .u32 triton__0d1de_param_1
14
+ )
15
+ .maxntid 128, 1, 1
16
+ {
17
+ .reg .pred %p<2>;
18
+ .reg .b32 %r<6>;
19
+ .reg .b64 %rd<5>;
20
+ .loc 1 18 0
21
+ $L__func_begin0:
22
+ .loc 1 18 0
23
+
24
+ ld.param.u64 %rd3, [triton__0d1de_param_0];
25
+ $L__tmp0:
26
+ .loc 1 21 36
27
+ mov.u32 %r2, %tid.x;
28
+ and.b32 %r3, %r2, 127;
29
+ .loc 1 20 28
30
+ mov.u32 %r1, %ctaid.x;
31
+ .loc 1 20 33
32
+ shl.b32 %r4, %r1, 7;
33
+ .loc 1 21 23
34
+ or.b32 %r5, %r4, %r3;
35
+ .loc 1 22 21
36
+ setp.lt.s32 %p1, %r5, 512;
37
+ .loc 1 25 25
38
+ cvt.s64.s32 %rd1, %r5;
39
+ mul.wide.s32 %rd4, %r5, 8;
40
+ add.s64 %rd2, %rd3, %rd4;
41
+ .loc 1 25 36
42
+ @%p1 st.global.b64 [ %rd2 + 0 ], { %rd1 };
43
+ .loc 1 25 4
44
+ ret;
45
+ $L__tmp1:
46
+ $L__func_end0:
47
+
48
+ }
49
+ .file 1 "/tmp/torchinductor_root/wx/cwxxgxdevnyc453z7hh4nxzgmvlhh6suwokktps3dw62btskgxt4.py"
50
+ .section .debug_abbrev
51
+ {
52
+ .b8 1
53
+ .b8 17
54
+ .b8 1
55
+ .b8 37
56
+ .b8 8
57
+ .b8 19
58
+ .b8 5
59
+ .b8 3
60
+ .b8 8
61
+ .b8 16
62
+ .b8 6
63
+ .b8 27
64
+ .b8 8
65
+ .b8 180
66
+ .b8 66
67
+ .b8 12
68
+ .b8 17
69
+ .b8 1
70
+ .b8 18
71
+ .b8 1
72
+ .b8 0
73
+ .b8 0
74
+ .b8 2
75
+ .b8 46
76
+ .b8 0
77
+ .b8 17
78
+ .b8 1
79
+ .b8 18
80
+ .b8 1
81
+ .b8 64
82
+ .b8 10
83
+ .b8 135
84
+ .b8 64
85
+ .b8 8
86
+ .b8 3
87
+ .b8 8
88
+ .b8 58
89
+ .b8 11
90
+ .b8 59
91
+ .b8 11
92
+ .b8 63
93
+ .b8 12
94
+ .b8 0
95
+ .b8 0
96
+ .b8 0
97
+ }
98
+ .section .debug_info
99
+ {
100
+ .b32 172
101
+ .b8 2
102
+ .b8 0
103
+ .b32 .debug_abbrev
104
+ .b8 8
105
+ .b8 1
106
+ .b8 116
107
+ .b8 114
108
+ .b8 105
109
+ .b8 116
110
+ .b8 111
111
+ .b8 110
112
+ .b8 0
113
+ .b8 2
114
+ .b8 0
115
+ .b8 99
116
+ .b8 119
117
+ .b8 120
118
+ .b8 120
119
+ .b8 103
120
+ .b8 120
121
+ .b8 100
122
+ .b8 101
123
+ .b8 118
124
+ .b8 110
125
+ .b8 121
126
+ .b8 99
127
+ .b8 52
128
+ .b8 53
129
+ .b8 51
130
+ .b8 122
131
+ .b8 55
132
+ .b8 104
133
+ .b8 104
134
+ .b8 52
135
+ .b8 110
136
+ .b8 120
137
+ .b8 122
138
+ .b8 103
139
+ .b8 109
140
+ .b8 118
141
+ .b8 108
142
+ .b8 104
143
+ .b8 104
144
+ .b8 54
145
+ .b8 115
146
+ .b8 117
147
+ .b8 119
148
+ .b8 111
149
+ .b8 107
150
+ .b8 107
151
+ .b8 116
152
+ .b8 112
153
+ .b8 115
154
+ .b8 51
155
+ .b8 100
156
+ .b8 119
157
+ .b8 54
158
+ .b8 50
159
+ .b8 98
160
+ .b8 116
161
+ .b8 115
162
+ .b8 107
163
+ .b8 103
164
+ .b8 120
165
+ .b8 116
166
+ .b8 52
167
+ .b8 46
168
+ .b8 112
169
+ .b8 121
170
+ .b8 0
171
+ .b32 .debug_line
172
+ .b8 47
173
+ .b8 116
174
+ .b8 109
175
+ .b8 112
176
+ .b8 47
177
+ .b8 116
178
+ .b8 111
179
+ .b8 114
180
+ .b8 99
181
+ .b8 104
182
+ .b8 105
183
+ .b8 110
184
+ .b8 100
185
+ .b8 117
186
+ .b8 99
187
+ .b8 116
188
+ .b8 111
189
+ .b8 114
190
+ .b8 95
191
+ .b8 114
192
+ .b8 111
193
+ .b8 111
194
+ .b8 116
195
+ .b8 47
196
+ .b8 119
197
+ .b8 120
198
+ .b8 0
199
+ .b8 1
200
+ .b64 $L__func_begin0
201
+ .b64 $L__func_end0
202
+ .b8 2
203
+ .b64 $L__func_begin0
204
+ .b64 $L__func_end0
205
+ .b8 1
206
+ .b8 156
207
+ .b8 116
208
+ .b8 114
209
+ .b8 105
210
+ .b8 116
211
+ .b8 111
212
+ .b8 110
213
+ .b8 95
214
+ .b8 95
215
+ .b8 48
216
+ .b8 100
217
+ .b8 49
218
+ .b8 100
219
+ .b8 101
220
+ .b8 0
221
+ .b8 116
222
+ .b8 114
223
+ .b8 105
224
+ .b8 116
225
+ .b8 111
226
+ .b8 110
227
+ .b8 95
228
+ .b8 95
229
+ .b8 48
230
+ .b8 100
231
+ .b8 49
232
+ .b8 100
233
+ .b8 101
234
+ .b8 0
235
+ .b8 1
236
+ .b8 18
237
+ .b8 1
238
+ .b8 0
239
+ }
240
+ .section .debug_pubnames
241
+ {
242
+ .b32 $L__pubNames_end0-$L__pubNames_start0
243
+ $L__pubNames_start0:
244
+ .b8 2
245
+ .b8 0
246
+ .b32 .debug_info
247
+ .b32 176
248
+ .b32 125
249
+ .b8 116
250
+ .b8 114
251
+ .b8 105
252
+ .b8 116
253
+ .b8 111
254
+ .b8 110
255
+ .b8 95
256
+ .b8 95
257
+ .b8 48
258
+ .b8 100
259
+ .b8 49
260
+ .b8 100
261
+ .b8 101
262
+ .b8 0
263
+ .b32 0
264
+ $L__pubNames_end0:
265
+ }
266
+ .section .debug_pubtypes
267
+ {
268
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
269
+ $L__pubTypes_start0:
270
+ .b8 2
271
+ .b8 0
272
+ .b32 .debug_info
273
+ .b32 176
274
+ .b32 0
275
+ $L__pubTypes_end0:
276
+ }
277
+ .section .debug_loc { }
.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.llir ADDED
The diff for this file is too large to render. See raw diff
 
.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.cubin ADDED
Binary file (4.78 kB). View file
 
.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.llir ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3de4(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4) local_unnamed_addr !dbg !5 {
7
+ %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %7 = lshr i32 %6, 5, !dbg !8
9
+ %urem = and i32 %6, 255, !dbg !8
10
+ %8 = or i32 %urem, 256, !dbg !8
11
+ %9 = or i32 %urem, 512, !dbg !8
12
+ %10 = or i32 %urem, 768, !dbg !8
13
+ %11 = or i32 %urem, 1024, !dbg !8
14
+ %12 = or i32 %urem, 1280, !dbg !8
15
+ %13 = or i32 %urem, 1536, !dbg !8
16
+ %14 = or i32 %urem, 1792, !dbg !8
17
+ %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
18
+ %16 = sext i32 %15 to i64, !dbg !10
19
+ %17 = insertelement <8 x i32> poison, i32 %urem, i64 0
20
+ %18 = insertelement <8 x i32> %17, i32 %8, i64 1
21
+ %19 = insertelement <8 x i32> %18, i32 %9, i64 2
22
+ %20 = insertelement <8 x i32> %19, i32 %10, i64 3
23
+ %21 = insertelement <8 x i32> %20, i32 %11, i64 4
24
+ %22 = insertelement <8 x i32> %21, i32 %12, i64 5
25
+ %23 = insertelement <8 x i32> %22, i32 %13, i64 6
26
+ %24 = insertelement <8 x i32> %23, i32 %14, i64 7
27
+ %25 = zext <8 x i32> %24 to <8 x i64>
28
+ %26 = mul nsw i64 %16, 50257, !dbg !11
29
+ %invariant.gep = getelementptr i16, ptr addrspace(1) %0, i64 %26, !dbg !12
30
+ br label %27, !dbg !12
31
+
32
+ 27: ; preds = %5, %27
33
+ %28 = phi i32 [ 0, %5 ], [ %80, %27 ]
34
+ %29 = phi <8 x float> [ <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>, %5 ], [ %79, %27 ]
35
+ %30 = zext nneg i32 %28 to i64, !dbg !13
36
+ %31 = fcmp ord <8 x float> %29, zeroinitializer, !dbg !14
37
+ %32 = insertelement <8 x i64> poison, i64 %30, i64 0, !dbg !13
38
+ %33 = shufflevector <8 x i64> %32, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !13
39
+ %34 = or <8 x i64> %33, %25, !dbg !13
40
+ %35 = icmp ult <8 x i64> %34, <i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !18
41
+ %36 = extractelement <8 x i64> %34, i64 0, !dbg !19
42
+ %gep = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %36, !dbg !19
43
+ %37 = extractelement <8 x i64> %34, i64 1, !dbg !19
44
+ %gep20 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %37, !dbg !19
45
+ %38 = extractelement <8 x i64> %34, i64 2, !dbg !19
46
+ %gep22 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %38, !dbg !19
47
+ %39 = extractelement <8 x i64> %34, i64 3, !dbg !19
48
+ %gep24 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %39, !dbg !19
49
+ %40 = extractelement <8 x i64> %34, i64 4, !dbg !19
50
+ %gep26 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %40, !dbg !19
51
+ %41 = extractelement <8 x i64> %34, i64 5, !dbg !19
52
+ %gep28 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %41, !dbg !19
53
+ %42 = extractelement <8 x i64> %34, i64 6, !dbg !19
54
+ %gep30 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %42, !dbg !19
55
+ %43 = extractelement <8 x i64> %34, i64 7, !dbg !19
56
+ %gep32 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %43, !dbg !19
57
+ %44 = extractelement <8 x i1> %35, i64 0, !dbg !20
58
+ %45 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep, i1 %44, i16 0, i1 %44) #3, !dbg !20
59
+ %46 = extractelement <8 x i1> %35, i64 1, !dbg !20
60
+ %47 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep20, i1 %46, i16 0, i1 %46) #3, !dbg !20
61
+ %48 = extractelement <8 x i1> %35, i64 2, !dbg !20
62
+ %49 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep22, i1 %48, i16 0, i1 %48) #3, !dbg !20
63
+ %50 = extractelement <8 x i1> %35, i64 3, !dbg !20
64
+ %51 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep24, i1 %50, i16 0, i1 %50) #3, !dbg !20
65
+ %52 = extractelement <8 x i1> %35, i64 4, !dbg !20
66
+ %53 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep26, i1 %52, i16 0, i1 %52) #3, !dbg !20
67
+ %54 = extractelement <8 x i1> %35, i64 5, !dbg !20
68
+ %55 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep28, i1 %54, i16 0, i1 %54) #3, !dbg !20
69
+ %56 = extractelement <8 x i1> %35, i64 6, !dbg !20
70
+ %57 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep30, i1 %56, i16 0, i1 %56) #3, !dbg !20
71
+ %58 = extractelement <8 x i1> %35, i64 7, !dbg !20
72
+ %59 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep32, i1 %58, i16 0, i1 %58) #3, !dbg !20
73
+ %60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #3, !dbg !21
74
+ %61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #3, !dbg !21
75
+ %62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #3, !dbg !21
76
+ %63 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %51) #3, !dbg !21
77
+ %64 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %53) #3, !dbg !21
78
+ %65 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %55) #3, !dbg !21
79
+ %66 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #3, !dbg !21
80
+ %67 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #3, !dbg !21
81
+ %68 = insertelement <8 x float> poison, float %60, i64 0, !dbg !22
82
+ %69 = insertelement <8 x float> %68, float %61, i64 1, !dbg !22
83
+ %70 = insertelement <8 x float> %69, float %62, i64 2, !dbg !22
84
+ %71 = insertelement <8 x float> %70, float %63, i64 3, !dbg !22
85
+ %72 = insertelement <8 x float> %71, float %64, i64 4, !dbg !22
86
+ %73 = insertelement <8 x float> %72, float %65, i64 5, !dbg !22
87
+ %74 = insertelement <8 x float> %73, float %66, i64 6, !dbg !22
88
+ %75 = insertelement <8 x float> %74, float %67, i64 7, !dbg !22
89
+ %76 = fcmp ule <8 x float> %29, %75, !dbg !22
90
+ %77 = and <8 x i1> %31, %76, !dbg !23
91
+ %78 = and <8 x i1> %35, %77, !dbg !24
92
+ %79 = select <8 x i1> %78, <8 x float> %75, <8 x float> %29, !dbg !24
93
+ %80 = add nuw nsw i32 %28, 2048, !dbg !12
94
+ %81 = icmp ult i32 %28, 48209, !dbg !12
95
+ br i1 %81, label %27, label %82, !dbg !12
96
+
97
+ 82: ; preds = %27
98
+ %83 = and i32 %6, 31, !dbg !8
99
+ %84 = and i32 %7, 7, !dbg !8
100
+ %85 = extractelement <8 x float> %79, i64 0, !dbg !25
101
+ %86 = extractelement <8 x float> %79, i64 1, !dbg !25
102
+ %87 = fcmp ogt float %85, %86, !dbg !25
103
+ %88 = fcmp uno float %85, 0.000000e+00, !dbg !29
104
+ %89 = or i1 %87, %88, !dbg !30
105
+ %90 = select i1 %89, float %85, float %86, !dbg !31
106
+ %91 = extractelement <8 x float> %79, i64 2, !dbg !25
107
+ %92 = fcmp ogt float %90, %91, !dbg !25
108
+ %93 = fcmp uno float %90, 0.000000e+00, !dbg !29
109
+ %94 = or i1 %92, %93, !dbg !30
110
+ %95 = select i1 %94, float %90, float %91, !dbg !31
111
+ %96 = extractelement <8 x float> %79, i64 3, !dbg !25
112
+ %97 = fcmp ogt float %95, %96, !dbg !25
113
+ %98 = fcmp uno float %95, 0.000000e+00, !dbg !29
114
+ %99 = or i1 %97, %98, !dbg !30
115
+ %100 = select i1 %99, float %95, float %96, !dbg !31
116
+ %101 = extractelement <8 x float> %79, i64 4, !dbg !25
117
+ %102 = fcmp ogt float %100, %101, !dbg !25
118
+ %103 = fcmp uno float %100, 0.000000e+00, !dbg !29
119
+ %104 = or i1 %102, %103, !dbg !30
120
+ %105 = select i1 %104, float %100, float %101, !dbg !31
121
+ %106 = extractelement <8 x float> %79, i64 5, !dbg !25
122
+ %107 = fcmp ogt float %105, %106, !dbg !25
123
+ %108 = fcmp uno float %105, 0.000000e+00, !dbg !29
124
+ %109 = or i1 %107, %108, !dbg !30
125
+ %110 = select i1 %109, float %105, float %106, !dbg !31
126
+ %111 = extractelement <8 x float> %79, i64 6, !dbg !25
127
+ %112 = fcmp ogt float %110, %111, !dbg !25
128
+ %113 = fcmp uno float %110, 0.000000e+00, !dbg !29
129
+ %114 = or i1 %112, %113, !dbg !30
130
+ %115 = select i1 %114, float %110, float %111, !dbg !31
131
+ %116 = extractelement <8 x float> %79, i64 7, !dbg !25
132
+ %117 = fcmp ogt float %115, %116, !dbg !25
133
+ %118 = fcmp uno float %115, 0.000000e+00, !dbg !29
134
+ %119 = or i1 %117, %118, !dbg !30
135
+ %120 = select i1 %119, float %115, float %116, !dbg !31
136
+ %121 = bitcast float %120 to i32, !dbg !32
137
+ %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 16, i32 31), !dbg !32
138
+ %123 = bitcast i32 %122 to float, !dbg !32
139
+ %124 = fcmp ogt float %120, %123, !dbg !25
140
+ %125 = fcmp uno float %120, 0.000000e+00, !dbg !29
141
+ %126 = or i1 %125, %124, !dbg !30
142
+ %127 = select i1 %126, float %120, float %123, !dbg !31
143
+ %128 = bitcast float %127 to i32, !dbg !32
144
+ %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 8, i32 31), !dbg !32
145
+ %130 = bitcast i32 %129 to float, !dbg !32
146
+ %131 = fcmp ogt float %127, %130, !dbg !25
147
+ %132 = fcmp uno float %127, 0.000000e+00, !dbg !29
148
+ %133 = or i1 %131, %132, !dbg !30
149
+ %134 = select i1 %133, float %127, float %130, !dbg !31
150
+ %135 = bitcast float %134 to i32, !dbg !32
151
+ %136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 4, i32 31), !dbg !32
152
+ %137 = bitcast i32 %136 to float, !dbg !32
153
+ %138 = fcmp ogt float %134, %137, !dbg !25
154
+ %139 = fcmp uno float %134, 0.000000e+00, !dbg !29
155
+ %140 = or i1 %138, %139, !dbg !30
156
+ %141 = select i1 %140, float %134, float %137, !dbg !31
157
+ %142 = bitcast float %141 to i32, !dbg !32
158
+ %143 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 2, i32 31), !dbg !32
159
+ %144 = bitcast i32 %143 to float, !dbg !32
160
+ %145 = fcmp ogt float %141, %144, !dbg !25
161
+ %146 = fcmp uno float %141, 0.000000e+00, !dbg !29
162
+ %147 = or i1 %145, %146, !dbg !30
163
+ %148 = select i1 %147, float %141, float %144, !dbg !31
164
+ %149 = bitcast float %148 to i32, !dbg !32
165
+ %150 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %149, i32 1, i32 31), !dbg !32
166
+ %151 = bitcast i32 %150 to float, !dbg !32
167
+ %152 = fcmp ogt float %148, %151, !dbg !25
168
+ %153 = fcmp uno float %148, 0.000000e+00, !dbg !29
169
+ %154 = or i1 %152, %153, !dbg !30
170
+ %155 = select i1 %154, float %148, float %151, !dbg !31
171
+ %156 = icmp eq i32 %83, 0, !dbg !32
172
+ %157 = zext nneg i32 %84 to i64, !dbg !32
173
+ %158 = getelementptr float, ptr addrspace(3) @global_smem, i64 %157, !dbg !32
174
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %158, float %155, i1 %156) #3, !dbg !32
175
+ tail call void @llvm.nvvm.barrier0(), !dbg !32
176
+ %159 = icmp slt i32 %6, 8, !dbg !32
177
+ %160 = sext i32 %6 to i64, !dbg !32
178
+ %161 = getelementptr float, ptr addrspace(3) @global_smem, i64 %160, !dbg !32
179
+ %162 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %161, i1 %159) #3, !dbg !32
180
+ %163 = bitcast float %162 to i32, !dbg !32
181
+ %164 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %163, i32 4, i32 31), !dbg !32
182
+ %165 = bitcast i32 %164 to float, !dbg !32
183
+ %166 = fcmp ogt float %162, %165, !dbg !25
184
+ %167 = fcmp uno float %162, 0.000000e+00, !dbg !29
185
+ %168 = or i1 %167, %166, !dbg !30
186
+ %169 = select i1 %168, float %162, float %165, !dbg !31
187
+ %170 = bitcast float %169 to i32, !dbg !32
188
+ %171 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %170, i32 2, i32 31), !dbg !32
189
+ %172 = bitcast i32 %171 to float, !dbg !32
190
+ %173 = fcmp ogt float %169, %172, !dbg !25
191
+ %174 = fcmp uno float %169, 0.000000e+00, !dbg !29
192
+ %175 = or i1 %173, %174, !dbg !30
193
+ %176 = select i1 %175, float %169, float %172, !dbg !31
194
+ %177 = bitcast float %176 to i32, !dbg !32
195
+ %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 1, i32 31), !dbg !32
196
+ %179 = bitcast i32 %178 to float, !dbg !32
197
+ %180 = fcmp ogt float %176, %179, !dbg !25
198
+ %181 = fcmp uno float %176, 0.000000e+00, !dbg !29
199
+ %182 = or i1 %180, %181, !dbg !30
200
+ %183 = select i1 %182, float %176, float %179, !dbg !31
201
+ %184 = and i32 %6, 7, !dbg !32
202
+ %185 = icmp eq i32 %184, 0, !dbg !32
203
+ %186 = and i1 %159, %185, !dbg !32
204
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %161, float %183, i1 %186) #3, !dbg !32
205
+ tail call void @llvm.nvvm.barrier0(), !dbg !32
206
+ %187 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
207
+ tail call void @llvm.nvvm.barrier0(), !dbg !34
208
+ %188 = insertelement <1 x float> undef, float %187, i64 0, !dbg !34
209
+ store <1 x float> %188, ptr addrspace(3) @global_smem, align 4, !dbg !34
210
+ tail call void @llvm.nvvm.barrier0(), !dbg !34
211
+ %189 = load i32, ptr addrspace(3) @global_smem, align 4, !dbg !34
212
+ %190 = getelementptr float, ptr addrspace(1) %1, i64 %16, !dbg !35
213
+ %191 = icmp eq i32 %urem, 0, !dbg !36
214
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %189, ptr addrspace(1) %190, i1 %191) #3, !dbg !36
215
+ br label %192, !dbg !37
216
+
217
+ 192: ; preds = %82, %192
218
+ %193 = phi i32 [ 0, %82 ], [ %266, %192 ]
219
+ %194 = phi <8 x float> [ zeroinitializer, %82 ], [ %265, %192 ]
220
+ %195 = zext nneg i32 %193 to i64, !dbg !38
221
+ %196 = insertelement <8 x i64> poison, i64 %195, i64 0, !dbg !38
222
+ %197 = shufflevector <8 x i64> %196, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !38
223
+ %198 = or <8 x i64> %197, %25, !dbg !38
224
+ %199 = icmp ult <8 x i64> %198, <i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !39
225
+ %200 = extractelement <8 x i64> %198, i64 0, !dbg !40
226
+ %gep34 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %200, !dbg !40
227
+ %201 = extractelement <8 x i64> %198, i64 1, !dbg !40
228
+ %gep36 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %201, !dbg !40
229
+ %202 = extractelement <8 x i64> %198, i64 2, !dbg !40
230
+ %gep38 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %202, !dbg !40
231
+ %203 = extractelement <8 x i64> %198, i64 3, !dbg !40
232
+ %gep40 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %203, !dbg !40
233
+ %204 = extractelement <8 x i64> %198, i64 4, !dbg !40
234
+ %gep42 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %204, !dbg !40
235
+ %205 = extractelement <8 x i64> %198, i64 5, !dbg !40
236
+ %gep44 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %205, !dbg !40
237
+ %206 = extractelement <8 x i64> %198, i64 6, !dbg !40
238
+ %gep46 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %206, !dbg !40
239
+ %207 = extractelement <8 x i64> %198, i64 7, !dbg !40
240
+ %gep48 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %207, !dbg !40
241
+ %208 = extractelement <8 x i1> %199, i64 0, !dbg !41
242
+ %209 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep34, i1 %208, i16 0, i1 %208) #3, !dbg !41
243
+ %210 = extractelement <8 x i1> %199, i64 1, !dbg !41
244
+ %211 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep36, i1 %210, i16 0, i1 %210) #3, !dbg !41
245
+ %212 = extractelement <8 x i1> %199, i64 2, !dbg !41
246
+ %213 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep38, i1 %212, i16 0, i1 %212) #3, !dbg !41
247
+ %214 = extractelement <8 x i1> %199, i64 3, !dbg !41
248
+ %215 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep40, i1 %214, i16 0, i1 %214) #3, !dbg !41
249
+ %216 = extractelement <8 x i1> %199, i64 4, !dbg !41
250
+ %217 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep42, i1 %216, i16 0, i1 %216) #3, !dbg !41
251
+ %218 = extractelement <8 x i1> %199, i64 5, !dbg !41
252
+ %219 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep44, i1 %218, i16 0, i1 %218) #3, !dbg !41
253
+ %220 = extractelement <8 x i1> %199, i64 6, !dbg !41
254
+ %221 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep46, i1 %220, i16 0, i1 %220) #3, !dbg !41
255
+ %222 = extractelement <8 x i1> %199, i64 7, !dbg !41
256
+ %223 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep48, i1 %222, i16 0, i1 %222) #3, !dbg !41
257
+ %224 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %209) #3, !dbg !42
258
+ %225 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %211) #3, !dbg !42
259
+ %226 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %213) #3, !dbg !42
260
+ %227 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %215) #3, !dbg !42
261
+ %228 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %217) #3, !dbg !42
262
+ %229 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %219) #3, !dbg !42
263
+ %230 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %221) #3, !dbg !42
264
+ %231 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %223) #3, !dbg !42
265
+ %232 = fsub float %224, %187, !dbg !43
266
+ %233 = fsub float %225, %187, !dbg !43
267
+ %234 = fsub float %226, %187, !dbg !43
268
+ %235 = fsub float %227, %187, !dbg !43
269
+ %236 = fsub float %228, %187, !dbg !43
270
+ %237 = fsub float %229, %187, !dbg !43
271
+ %238 = fsub float %230, %187, !dbg !43
272
+ %239 = fsub float %231, %187, !dbg !43
273
+ %240 = fmul float %232, 0x3FF7154760000000, !dbg !44
274
+ %241 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %240) #3, !dbg !44
275
+ %242 = fmul float %233, 0x3FF7154760000000, !dbg !44
276
+ %243 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %242) #3, !dbg !44
277
+ %244 = fmul float %234, 0x3FF7154760000000, !dbg !44
278
+ %245 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %244) #3, !dbg !44
279
+ %246 = fmul float %235, 0x3FF7154760000000, !dbg !44
280
+ %247 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %246) #3, !dbg !44
281
+ %248 = fmul float %236, 0x3FF7154760000000, !dbg !44
282
+ %249 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %248) #3, !dbg !44
283
+ %250 = fmul float %237, 0x3FF7154760000000, !dbg !44
284
+ %251 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %250) #3, !dbg !44
285
+ %252 = fmul float %238, 0x3FF7154760000000, !dbg !44
286
+ %253 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %252) #3, !dbg !44
287
+ %254 = fmul float %239, 0x3FF7154760000000, !dbg !44
288
+ %255 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %254) #3, !dbg !44
289
+ %256 = insertelement <8 x float> poison, float %241, i64 0, !dbg !45
290
+ %257 = insertelement <8 x float> %256, float %243, i64 1, !dbg !45
291
+ %258 = insertelement <8 x float> %257, float %245, i64 2, !dbg !45
292
+ %259 = insertelement <8 x float> %258, float %247, i64 3, !dbg !45
293
+ %260 = insertelement <8 x float> %259, float %249, i64 4, !dbg !45
294
+ %261 = insertelement <8 x float> %260, float %251, i64 5, !dbg !45
295
+ %262 = insertelement <8 x float> %261, float %253, i64 6, !dbg !45
296
+ %263 = insertelement <8 x float> %262, float %255, i64 7, !dbg !45
297
+ %264 = select <8 x i1> %199, <8 x float> %263, <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !45
298
+ %265 = fadd <8 x float> %194, %264, !dbg !45
299
+ %266 = add nuw nsw i32 %193, 2048, !dbg !37
300
+ %267 = icmp ult i32 %193, 48209, !dbg !37
301
+ br i1 %267, label %192, label %268, !dbg !37
302
+
303
+ 268: ; preds = %192
304
+ tail call void @llvm.nvvm.barrier0(), !dbg !46
305
+ %shift = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
306
+ %269 = fadd <8 x float> %265, %shift, !dbg !50
307
+ %shift94 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
308
+ %270 = fadd <8 x float> %shift94, %269, !dbg !50
309
+ %shift95 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
310
+ %271 = fadd <8 x float> %shift95, %270, !dbg !50
311
+ %shift96 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
312
+ %272 = fadd <8 x float> %shift96, %271, !dbg !50
313
+ %shift97 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
314
+ %273 = fadd <8 x float> %shift97, %272, !dbg !50
315
+ %shift98 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
316
+ %274 = fadd <8 x float> %shift98, %273, !dbg !50
317
+ %shift99 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
318
+ %275 = fadd <8 x float> %shift99, %274, !dbg !50
319
+ %276 = extractelement <8 x float> %275, i64 0, !dbg !50
320
+ %277 = bitcast float %276 to i32, !dbg !46
321
+ %278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 16, i32 31), !dbg !46
322
+ %279 = bitcast i32 %278 to float, !dbg !46
323
+ %280 = fadd float %276, %279, !dbg !50
324
+ %281 = bitcast float %280 to i32, !dbg !46
325
+ %282 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %281, i32 8, i32 31), !dbg !46
326
+ %283 = bitcast i32 %282 to float, !dbg !46
327
+ %284 = fadd float %280, %283, !dbg !50
328
+ %285 = bitcast float %284 to i32, !dbg !46
329
+ %286 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 4, i32 31), !dbg !46
330
+ %287 = bitcast i32 %286 to float, !dbg !46
331
+ %288 = fadd float %284, %287, !dbg !50
332
+ %289 = bitcast float %288 to i32, !dbg !46
333
+ %290 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %289, i32 2, i32 31), !dbg !46
334
+ %291 = bitcast i32 %290 to float, !dbg !46
335
+ %292 = fadd float %288, %291, !dbg !50
336
+ %293 = bitcast float %292 to i32, !dbg !46
337
+ %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 1, i32 31), !dbg !46
338
+ %295 = bitcast i32 %294 to float, !dbg !46
339
+ %296 = fadd float %292, %295, !dbg !50
340
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %158, float %296, i1 %156) #3, !dbg !46
341
+ tail call void @llvm.nvvm.barrier0(), !dbg !46
342
+ %297 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %161, i1 %159) #3, !dbg !46
343
+ %298 = bitcast float %297 to i32, !dbg !46
344
+ %299 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 4, i32 31), !dbg !46
345
+ %300 = bitcast i32 %299 to float, !dbg !46
346
+ %301 = fadd float %297, %300, !dbg !50
347
+ %302 = bitcast float %301 to i32, !dbg !46
348
+ %303 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %302, i32 2, i32 31), !dbg !46
349
+ %304 = bitcast i32 %303 to float, !dbg !46
350
+ %305 = fadd float %301, %304, !dbg !50
351
+ %306 = bitcast float %305 to i32, !dbg !46
352
+ %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 1, i32 31), !dbg !46
353
+ %308 = bitcast i32 %307 to float, !dbg !46
354
+ %309 = fadd float %305, %308, !dbg !50
355
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %161, float %309, i1 %186) #3, !dbg !46
356
+ tail call void @llvm.nvvm.barrier0(), !dbg !46
357
+ %310 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !46
358
+ tail call void @llvm.nvvm.barrier0(), !dbg !54
359
+ %311 = insertelement <1 x float> undef, float %310, i64 0, !dbg !54
360
+ store <1 x float> %311, ptr addrspace(3) @global_smem, align 4, !dbg !54
361
+ tail call void @llvm.nvvm.barrier0(), !dbg !54
362
+ %312 = load i32, ptr addrspace(3) @global_smem, align 4, !dbg !54
363
+ %313 = getelementptr float, ptr addrspace(1) %2, i64 %16, !dbg !55
364
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %312, ptr addrspace(1) %313, i1 %191) #3, !dbg !56
365
+ ret void, !dbg !57
366
+ }
367
+
368
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
369
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
370
+
371
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
372
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
373
+
374
+ ; Function Attrs: convergent nocallback nounwind
375
+ declare void @llvm.nvvm.barrier0() #2
376
+
377
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
378
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
379
+ attributes #2 = { convergent nocallback nounwind }
380
+ attributes #3 = { nounwind }
381
+
382
+ !llvm.module.flags = !{!0}
383
+ !llvm.dbg.cu = !{!1}
384
+ !nvvm.annotations = !{!3, !4, !4, !3}
385
+
386
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
387
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
388
+ !2 = !DIFile(filename: "ccyhhqogjmaiuaq7b54att75rswph7r3hvxgfmkjyupj74n77r6i.py", directory: "/tmp/torchinductor_root/cy")
389
+ !3 = !{ptr @triton__0d1d2d3de4, !"kernel", i32 1}
390
+ !4 = !{ptr @triton__0d1d2d3de4, !"maxntidx", i32 256}
391
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3de4", linkageName: "triton__0d1d2d3de4", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
392
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
393
+ !7 = !{}
394
+ !8 = !DILocation(line: 24, column: 33, scope: !5)
395
+ !9 = !DILocation(line: 21, column: 28, scope: !5)
396
+ !10 = !DILocation(line: 21, column: 34, scope: !5)
397
+ !11 = !DILocation(line: 31, column: 46, scope: !5)
398
+ !12 = !DILocation(line: 27, column: 36, scope: !5)
399
+ !13 = !DILocation(line: 28, column: 27, scope: !5)
400
+ !14 = !DILocation(line: 38, column: 21, scope: !15, inlinedAt: !17)
401
+ !15 = distinct !DILexicalBlockFile(scope: !5, file: !16, discriminator: 0)
402
+ !16 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
403
+ !17 = !DILocation(line: 34, column: 45, scope: !15)
404
+ !18 = !DILocation(line: 29, column: 25, scope: !5)
405
+ !19 = !DILocation(line: 31, column: 34, scope: !5)
406
+ !20 = !DILocation(line: 31, column: 52, scope: !5)
407
+ !21 = !DILocation(line: 31, column: 103, scope: !5)
408
+ !22 = !DILocation(line: 36, column: 15, scope: !15, inlinedAt: !17)
409
+ !23 = !DILocation(line: 38, column: 16, scope: !15, inlinedAt: !17)
410
+ !24 = !DILocation(line: 0, scope: !5)
411
+ !25 = !DILocation(line: 36, column: 15, scope: !26, inlinedAt: !27)
412
+ !26 = distinct !DILexicalBlockFile(scope: !15, file: !16, discriminator: 0)
413
+ !27 = !DILocation(line: 49, column: 29, scope: !26, inlinedAt: !28)
414
+ !28 = !DILocation(line: 36, column: 38, scope: !26)
415
+ !29 = !DILocation(line: 38, column: 21, scope: !26, inlinedAt: !27)
416
+ !30 = !DILocation(line: 38, column: 16, scope: !26, inlinedAt: !27)
417
+ !31 = !DILocation(line: 39, column: 29, scope: !26, inlinedAt: !27)
418
+ !32 = !DILocation(line: 49, column: 29, scope: !15, inlinedAt: !33)
419
+ !33 = !DILocation(line: 36, column: 38, scope: !15)
420
+ !34 = !DILocation(line: 36, column: 41, scope: !5)
421
+ !35 = !DILocation(line: 37, column: 25, scope: !5)
422
+ !36 = !DILocation(line: 37, column: 36, scope: !5)
423
+ !37 = !DILocation(line: 39, column: 36, scope: !5)
424
+ !38 = !DILocation(line: 40, column: 27, scope: !5)
425
+ !39 = !DILocation(line: 41, column: 25, scope: !5)
426
+ !40 = !DILocation(line: 43, column: 34, scope: !5)
427
+ !41 = !DILocation(line: 43, column: 52, scope: !5)
428
+ !42 = !DILocation(line: 43, column: 104, scope: !5)
429
+ !43 = !DILocation(line: 45, column: 22, scope: !5)
430
+ !44 = !DILocation(line: 46, column: 22, scope: !5)
431
+ !45 = !DILocation(line: 49, column: 40, scope: !5)
432
+ !46 = !DILocation(line: 243, column: 36, scope: !47, inlinedAt: !49)
433
+ !47 = distinct !DILexicalBlockFile(scope: !5, file: !48, discriminator: 0)
434
+ !48 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
435
+ !49 = !DILocation(line: 50, column: 27, scope: !47)
436
+ !50 = !DILocation(line: 233, column: 15, scope: !51, inlinedAt: !52)
437
+ !51 = distinct !DILexicalBlockFile(scope: !47, file: !48, discriminator: 0)
438
+ !52 = !DILocation(line: 243, column: 36, scope: !51, inlinedAt: !53)
439
+ !53 = !DILocation(line: 50, column: 27, scope: !51)
440
+ !54 = !DILocation(line: 50, column: 30, scope: !5)
441
+ !55 = !DILocation(line: 51, column: 25, scope: !5)
442
+ !56 = !DILocation(line: 51, column: 37, scope: !5)
443
+ !57 = !DILocation(line: 51, column: 4, scope: !5)
.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.ttgir ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3de4(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i64) attributes {noinline = false} {
5
+ %c50257_i64 = arith.constant 50257 : i64
6
+ %cst = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked>
7
+ %cst_0 = arith.constant dense<true> : tensor<1x2048xi1, #blocked>
8
+ %c50257_i32 = arith.constant 50257 : i32
9
+ %c2048_i32 = arith.constant 2048 : i32
10
+ %c0_i32 = arith.constant 0 : i32
11
+ %cst_1 = arith.constant dense<50257> : tensor<1x2048xi64, #blocked>
12
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked>
13
+ %cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked>
14
+ %0 = tt.get_program_id x : i32
15
+ %1 = arith.extsi %0 : i32 to i64
16
+ %2 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
17
+ %3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x2048xi32, #blocked>
18
+ %4 = arith.extsi %3 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked>
19
+ %5 = arith.muli %1, %c50257_i64 : i64
20
+ %6 = tt.splat %5 : (i64) -> tensor<1x2048xi64, #blocked>
21
+ %7 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>
22
+ %8 = scf.for %arg5 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg6 = %cst_3) -> (tensor<1x2048xf32, #blocked>) : i32 {
23
+ %22 = arith.extsi %arg5 : i32 to i64
24
+ %23 = tt.splat %22 : (i64) -> tensor<1x2048xi64, #blocked>
25
+ %24 = arith.addi %23, %4 : tensor<1x2048xi64, #blocked>
26
+ %25 = arith.cmpi slt, %24, %cst_1 : tensor<1x2048xi64, #blocked>
27
+ %26 = arith.addi %24, %6 : tensor<1x2048xi64, #blocked>
28
+ %27 = tt.addptr %7, %26 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
29
+ %28 = tt.load %27, %25, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xbf16, #blocked>
30
+ %29 = arith.extf %28 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked>
31
+ %30 = arith.cmpf ogt, %arg6, %29 : tensor<1x2048xf32, #blocked>
32
+ %31 = arith.cmpf une, %arg6, %arg6 : tensor<1x2048xf32, #blocked>
33
+ %32 = arith.ori %30, %31 : tensor<1x2048xi1, #blocked>
34
+ %33 = arith.xori %32, %cst_0 : tensor<1x2048xi1, #blocked>
35
+ %34 = arith.andi %25, %33 : tensor<1x2048xi1, #blocked>
36
+ %35 = arith.select %34, %29, %arg6 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked>
37
+ scf.yield %35 : tensor<1x2048xf32, #blocked>
38
+ }
39
+ %9 = "tt.reduce"(%8) <{axis = 1 : i32}> ({
40
+ ^bb0(%arg5: f32, %arg6: f32):
41
+ %22 = arith.cmpf ogt, %arg5, %arg6 : f32
42
+ %23 = arith.cmpf une, %arg5, %arg5 : f32
43
+ %24 = arith.ori %22, %23 : i1
44
+ %25 = arith.select %24, %arg5, %arg6 : f32
45
+ tt.reduce.return %25 : f32
46
+ }) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
47
+ %10 = triton_gpu.convert_layout %9 : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
48
+ %11 = tt.expand_dims %10 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<1x1xf32, #blocked1>
49
+ %12 = tt.expand_dims %9 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked>
50
+ %13 = tt.addptr %arg1, %1 : !tt.ptr<f32, 1>, i64
51
+ %14 = tt.splat %13 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>, #blocked1>
52
+ tt.store %14, %11 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32, #blocked1>
53
+ %15 = tt.broadcast %12 : (tensor<1x1xf32, #blocked>) -> tensor<1x2048xf32, #blocked>
54
+ %16 = scf.for %arg5 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg6 = %cst_2) -> (tensor<1x2048xf32, #blocked>) : i32 {
55
+ %22 = arith.extsi %arg5 : i32 to i64
56
+ %23 = tt.splat %22 : (i64) -> tensor<1x2048xi64, #blocked>
57
+ %24 = arith.addi %23, %4 : tensor<1x2048xi64, #blocked>
58
+ %25 = arith.cmpi slt, %24, %cst_1 : tensor<1x2048xi64, #blocked>
59
+ %26 = arith.addi %24, %6 : tensor<1x2048xi64, #blocked>
60
+ %27 = tt.addptr %7, %26 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
61
+ %28 = tt.load %27, %25, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16, #blocked>
62
+ %29 = arith.extf %28 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked>
63
+ %30 = arith.subf %29, %15 : tensor<1x2048xf32, #blocked>
64
+ %31 = math.exp %30 : tensor<1x2048xf32, #blocked>
65
+ %32 = arith.addf %arg6, %31 : tensor<1x2048xf32, #blocked>
66
+ %33 = arith.select %25, %32, %arg6 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked>
67
+ scf.yield %33 : tensor<1x2048xf32, #blocked>
68
+ }
69
+ %17 = "tt.reduce"(%16) <{axis = 1 : i32}> ({
70
+ ^bb0(%arg5: f32, %arg6: f32):
71
+ %22 = arith.addf %arg5, %arg6 : f32
72
+ tt.reduce.return %22 : f32
73
+ }) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
74
+ %18 = triton_gpu.convert_layout %17 : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
75
+ %19 = tt.expand_dims %18 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<1x1xf32, #blocked1>
76
+ %20 = tt.addptr %arg2, %1 : !tt.ptr<f32, 1>, i64
77
+ %21 = tt.splat %20 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>, #blocked1>
78
+ tt.store %21, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32, #blocked1>
79
+ tt.return
80
+ }
81
+ }
wandb/run-20240926_055222-14kj2390/files/wandb-metadata.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-113-generic-x86_64-with-glibc2.35",
3
+ "python": "3.10.12",
4
+ "startedAt": "2024-09-26T05:52:22.950984Z",
5
+ "args": [
6
+ "--batch_size=120"
7
+ ],
8
+ "program": "/root/train.py",
9
+ "codePath": "train.py",
10
+ "email": "prasadchandalada@gmail.com",
11
+ "root": "/root",
12
+ "host": "184d1c0992ce",
13
+ "username": "root",
14
+ "executable": "/usr/bin/python",
15
+ "codePathLocal": "train.py",
16
+ "cpu_count": 64,
17
+ "cpu_count_logical": 128,
18
+ "gpu": "[NVIDIA L40S, NVIDIA L40S, NVIDIA L40S, NVIDIA L40S]",
19
+ "gpu_count": 4,
20
+ "disk": {
21
+ "/": {
22
+ "total": "542239621120",
23
+ "used": "400404738048"
24
+ }
25
+ },
26
+ "memory": {
27
+ "total": "811327934464"
28
+ },
29
+ "cpu": {
30
+ "count": 64,
31
+ "countLogical": 128
32
+ },
33
+ "gpu_nvidia": [
34
+ {
35
+ "name": "NVIDIA L40S",
36
+ "memoryTotal": "48305799168",
37
+ "cudaCores": 18176,
38
+ "architecture": "Ada"
39
+ },
40
+ {
41
+ "name": "NVIDIA L40S",
42
+ "memoryTotal": "48305799168",
43
+ "cudaCores": 18176,
44
+ "architecture": "Ada"
45
+ },
46
+ {
47
+ "name": "NVIDIA L40S",
48
+ "memoryTotal": "48305799168",
49
+ "cudaCores": 18176,
50
+ "architecture": "Ada"
51
+ },
52
+ {
53
+ "name": "NVIDIA L40S",
54
+ "memoryTotal": "48305799168",
55
+ "cudaCores": 18176,
56
+ "architecture": "Ada"
57
+ }
58
+ ],
59
+ "cudaVersion": "12.2"
60
+ }
wandb/run-20240926_055222-14kj2390/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-09-26T05:52:22.315197362Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp0irn9n95/port-986.txt","pid":986,"debug":false,"disable-analytics":false}
2
+ {"time":"2024-09-26T05:52:22.315233622Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
3
+ {"time":"2024-09-26T05:52:22.316073319Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":986}
4
+ {"time":"2024-09-26T05:52:22.316064076Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":41193,"Zone":""}}
5
+ {"time":"2024-09-26T05:52:22.505125081Z","level":"INFO","msg":"created new connection","id":"127.0.0.1:42070"}
6
+ {"time":"2024-09-26T05:52:22.951875072Z","level":"INFO","msg":"connection init received","streamId":"14kj2390","id":"127.0.0.1:42070"}
7
+ {"time":"2024-09-26T05:52:22.952304043Z","level":"ERROR","msg":"error creating symlink","error":"symlink /root/.cache/wandb/logs/core-debug-20240926_055222.log /root/wandb/run-20240926_055222-14kj2390/logs/debug-core.log: file exists"}
8
+ {"time":"2024-09-26T05:52:22.955991404Z","level":"INFO","msg":"connection init completed","streamId":"14kj2390","id":"127.0.0.1:42070"}
9
+ {"time":"2024-09-26T12:39:40.244212691Z","level":"INFO","msg":"handle finish received","streamId":"14kj2390","id":"127.0.0.1:42070"}
10
+ {"time":"2024-09-26T12:39:41.460220703Z","level":"INFO","msg":"connection: teardown","id":"127.0.0.1:42070"}
11
+ {"time":"2024-09-26T12:39:41.460275234Z","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2024-09-26T12:39:41.460350917Z","level":"INFO","msg":"closed connection","id":"127.0.0.1:42070"}
13
+ {"time":"2024-09-26T12:39:41.460369816Z","level":"INFO","msg":"connection closed","id":"127.0.0.1:42070"}
14
+ {"time":"2024-09-26T12:39:41.460376796Z","level":"INFO","msg":"server is closed"}