0-hero commited on
Commit
00b5d9e
·
verified ·
1 Parent(s): 4f38035

Add files using upload-large-folder tool

Browse files
.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.cubin ADDED
Binary file (28.5 kB). View file
 
.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ptx ADDED
@@ -0,0 +1,951 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6de7de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
23
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .extern .shared .align 1 .b8 global_smem[];
26
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
27
+
28
+ .visible .entry triton__0d1d2d3d4d5d6de7de(
29
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
30
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
31
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
32
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
33
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
34
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
35
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
36
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
37
+ )
38
+ .maxntid 128, 1, 1
39
+ {
40
+ .reg .pred %p<48>;
41
+ .reg .b16 %rs<13>;
42
+ .reg .b32 %r<158>;
43
+ .reg .f32 %f<164>;
44
+ .reg .b64 %rd<73>;
45
+ .loc 1 18 0
46
+ $L__func_begin0:
47
+ .loc 1 18 0
48
+
49
+ ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6de7de_param_5];
50
+ ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6de7de_param_4];
51
+ ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6de7de_param_3];
52
+ ld.param.u64 %rd30, [triton__0d1d2d3d4d5d6de7de_param_0];
53
+ ld.param.u64 %rd31, [triton__0d1d2d3d4d5d6de7de_param_1];
54
+ $L__tmp0:
55
+ .loc 1 22 44
56
+ mov.u32 %r13, %tid.x;
57
+ and.b32 %r1, %r13, 31;
58
+ ld.param.u64 %rd32, [triton__0d1d2d3d4d5d6de7de_param_2];
59
+ bfe.u32 %r14, %r13, 5, 2;
60
+ bfe.u32 %r15, %r13, 1, 4;
61
+ shl.b32 %r16, %r14, 4;
62
+ or.b32 %r2, %r16, %r15;
63
+ and.b32 %r17, %r13, 63;
64
+ .loc 1 24 33
65
+ shl.b32 %r18, %r13, 2;
66
+ and.b32 %r3, %r18, 4;
67
+ and.b32 %r4, %r13, 7;
68
+ .loc 1 31 36
69
+ shl.b32 %r5, %r14, 2;
70
+ .loc 1 21 28
71
+ mov.u32 %r11, %ctaid.x;
72
+ .loc 1 21 33
73
+ shl.b32 %r19, %r11, 6;
74
+ .loc 1 22 23
75
+ or.b32 %r20, %r19, %r2;
76
+ or.b32 %r21, %r19, %r17;
77
+ .loc 1 26 30
78
+ mul.wide.s32 %rd33, %r20, 8;
79
+ add.s64 %rd21, %rd30, %rd33;
80
+ mul.wide.s32 %rd34, %r21, 8;
81
+ add.s64 %rd29, %rd30, %rd34;
82
+ mov.pred %p1, -1;
83
+ .loc 1 26 35
84
+ mov.u64 %rd20, 0x0;
85
+ @%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ];
86
+ mov.u64 %rd22, 0x0;
87
+ @%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd21 + 0 ];
88
+ mov.u64 %rd24, 0x0;
89
+ @%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd21 + 0 ];
90
+ mov.u64 %rd26, 0x0;
91
+ @%p1 ld.global.L1::evict_last.b64 { %rd26 }, [ %rd21 + 0 ];
92
+ mov.u64 %rd28, 0x0;
93
+ @%p1 ld.global.L1::evict_last.b64 { %rd28 }, [ %rd29 + 0 ];
94
+ .loc 1 27 18
95
+ bfe.s32 %r22, %r11, 25, 1;
96
+ shr.u32 %r23, %r22, 23;
97
+ add.s32 %r24, %r20, %r23;
98
+ and.b32 %r25, %r24, 16776704;
99
+ sub.s32 %r26, %r20, %r25;
100
+ .loc 1 35 44
101
+ shl.b32 %r27, %r26, 8;
102
+ .loc 1 37 22
103
+ add.s64 %rd35, %rd28, 50257;
104
+ .loc 1 38 22
105
+ setp.lt.s64 %p6, %rd20, 0;
106
+ setp.lt.s64 %p7, %rd28, 0;
107
+ .loc 1 39 36
108
+ selp.b64 %rd1, %rd35, %rd28, %p7;
109
+ .loc 1 41 44
110
+ shl.b64 %rd36, %rd20, 8;
111
+ add.s64 %rd37, %rd36, 12865792;
112
+ selp.b64 %rd38, %rd37, %rd36, %p6;
113
+ .loc 1 31 36
114
+ and.b32 %r28, %r13, 1;
115
+ mul.wide.u32 %rd2, %r28, 16;
116
+ shl.b64 %rd39, %rd38, 2;
117
+ or.b64 %rd40, %rd2, %rd39;
118
+ add.s64 %rd72, %rd31, %rd40;
119
+ shl.b32 %r29, %r11, 14;
120
+ shl.b32 %r30, %r14, 12;
121
+ or.b32 %r31, %r29, %r30;
122
+ shl.b32 %r32, %r15, 8;
123
+ or.b32 %r33, %r31, %r32;
124
+ or.b32 %r6, %r33, %r3;
125
+ or.b32 %r34, %r27, %r3;
126
+ mul.wide.s32 %rd41, %r34, 4;
127
+ add.s64 %rd70, %rd32, %rd41;
128
+ mov.f32 %f148, 0f00000000;
129
+ mov.b32 %r156, -8;
130
+ mov.u64 %rd68, %rd70;
131
+ mov.u64 %rd69, %rd72;
132
+ mov.f32 %f149, %f148;
133
+ mov.f32 %f150, %f148;
134
+ mov.f32 %f151, %f148;
135
+ mov.f32 %f152, %f148;
136
+ mov.f32 %f153, %f148;
137
+ mov.f32 %f154, %f148;
138
+ mov.f32 %f155, %f148;
139
+ mov.f32 %f156, %f148;
140
+ mov.f32 %f157, %f148;
141
+ mov.f32 %f158, %f148;
142
+ mov.f32 %f159, %f148;
143
+ mov.f32 %f160, %f148;
144
+ mov.f32 %f161, %f148;
145
+ mov.f32 %f162, %f148;
146
+ mov.f32 %f163, %f148;
147
+ bra.uni $L__BB0_1;
148
+ $L__BB0_3:
149
+ .loc 1 0 0
150
+ mov.b32 %f17, %r35;
151
+ mov.b32 %f18, %r36;
152
+ mov.b32 %f19, %r37;
153
+ mov.b32 %f20, %r38;
154
+ cvt.u16.u32 %rs1, %r43;
155
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r43; }
156
+ cvt.u16.u32 %rs3, %r44;
157
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r44; }
158
+ cvt.f32.bf16 %r47, %rs1;
159
+ mov.b32 %f21, %r47;
160
+ cvt.f32.bf16 %r48, %rs2;
161
+ mov.b32 %f22, %r48;
162
+ cvt.f32.bf16 %r49, %rs3;
163
+ mov.b32 %f23, %r49;
164
+ cvt.f32.bf16 %r50, %rs4;
165
+ mov.b32 %f24, %r50;
166
+ .loc 1 41 52
167
+ mov.u32 %r54, 0x0;
168
+ mov.u32 %r55, 0x0;
169
+ mov.u32 %r56, 0x0;
170
+ mov.u32 %r57, 0x0;
171
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r54, %r55, %r56, %r57 }, [ %rd69 + 0 ];
172
+ @!%p1 mov.u32 %r54, %r143;
173
+ @!%p1 mov.u32 %r55, %r143;
174
+ @!%p1 mov.u32 %r56, %r143;
175
+ @!%p1 mov.u32 %r57, %r143;
176
+ mov.b32 %f56, %r54;
177
+ mov.b32 %f57, %r55;
178
+ mov.b32 %f58, %r56;
179
+ mov.b32 %f59, %r57;
180
+ .loc 1 42 22
181
+ add.f32 %f60, %f17, %f56;
182
+ add.f32 %f61, %f18, %f57;
183
+ add.f32 %f62, %f19, %f58;
184
+ add.f32 %f63, %f20, %f59;
185
+ .loc 1 44 22
186
+ add.f32 %f64, %f21, %f60;
187
+ add.f32 %f65, %f22, %f61;
188
+ add.f32 %f66, %f23, %f62;
189
+ add.f32 %f67, %f24, %f63;
190
+ $L__tmp1:
191
+ .loc 2 96 20
192
+ sub.f32 %f68, %f64, %f160;
193
+ sub.f32 %f69, %f65, %f161;
194
+ sub.f32 %f70, %f66, %f162;
195
+ sub.f32 %f71, %f67, %f163;
196
+ .loc 2 97 26
197
+ add.f32 %f148, %f148, 0f3F800000;
198
+ add.f32 %f149, %f149, 0f3F800000;
199
+ add.f32 %f150, %f150, 0f3F800000;
200
+ add.f32 %f151, %f151, 0f3F800000;
201
+ add.f32 %f152, %f152, 0f3F800000;
202
+ add.f32 %f153, %f153, 0f3F800000;
203
+ add.f32 %f154, %f154, 0f3F800000;
204
+ add.f32 %f155, %f155, 0f3F800000;
205
+ .loc 2 98 30
206
+ mov.b32 %r63, %f68;
207
+ mov.b32 %r64, %f148;
208
+ div.full.f32 %r62, %r63, %r64;
209
+ mov.b32 %f72, %r62;
210
+ mov.b32 %r66, %f69;
211
+ mov.b32 %r67, %f149;
212
+ div.full.f32 %r65, %r66, %r67;
213
+ mov.b32 %f73, %r65;
214
+ mov.b32 %r69, %f70;
215
+ mov.b32 %r70, %f150;
216
+ div.full.f32 %r68, %r69, %r70;
217
+ mov.b32 %f74, %r68;
218
+ mov.b32 %r72, %f71;
219
+ mov.b32 %r73, %f151;
220
+ div.full.f32 %r71, %r72, %r73;
221
+ mov.b32 %f75, %r71;
222
+ .loc 2 98 22
223
+ add.f32 %f160, %f160, %f72;
224
+ add.f32 %f161, %f161, %f73;
225
+ add.f32 %f162, %f162, %f74;
226
+ add.f32 %f163, %f163, %f75;
227
+ .loc 2 101 30
228
+ sub.f32 %f76, %f64, %f160;
229
+ sub.f32 %f77, %f65, %f161;
230
+ sub.f32 %f78, %f66, %f162;
231
+ sub.f32 %f79, %f67, %f163;
232
+ $L__tmp2:
233
+ .loc 1 50 50
234
+ fma.rn.f32 %f156, %f68, %f76, %f156;
235
+ fma.rn.f32 %f157, %f69, %f77, %f157;
236
+ fma.rn.f32 %f158, %f70, %f78, %f158;
237
+ fma.rn.f32 %f159, %f71, %f79, %f159;
238
+ .loc 1 31 36
239
+ add.s32 %r156, %r156, 8;
240
+ add.s64 %rd69, %rd69, 32;
241
+ add.s64 %rd68, %rd68, 32;
242
+ setp.lt.u32 %p22, %r156, 248;
243
+ @%p22 bra $L__BB0_1;
244
+ bra.uni $L__BB0_4;
245
+ $L__BB0_1:
246
+ .loc 1 40 40
247
+ setp.lt.u64 %p16, %rd1, 50257;
248
+ mov.b32 %r143, 0;
249
+ .loc 1 35 50
250
+ mov.u32 %r35, 0x0;
251
+ mov.u32 %r36, 0x0;
252
+ mov.u32 %r37, 0x0;
253
+ mov.u32 %r38, 0x0;
254
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd68 + 0 ];
255
+ @!%p1 mov.u32 %r35, %r143;
256
+ @!%p1 mov.u32 %r36, %r143;
257
+ @!%p1 mov.u32 %r37, %r143;
258
+ @!%p1 mov.u32 %r38, %r143;
259
+ .loc 1 36 34
260
+ add.s32 %r51, %r6, %r156;
261
+ add.s32 %r52, %r51, 8;
262
+ mul.wide.s32 %rd44, %r52, 2;
263
+ add.s64 %rd43, %rd17, %rd44;
264
+ .loc 1 36 50
265
+ mov.u32 %r43, 0x0;
266
+ mov.u32 %r44, 0x0;
267
+ @%p1 ld.global.L1::evict_last.v2.b32 { %r43, %r44 }, [ %rd43 + 0 ];
268
+ @!%p1 mov.u32 %r43, %r143;
269
+ @!%p1 mov.u32 %r44, %r143;
270
+ mov.b32 %r155, 883;
271
+ mov.u64 %rd67, 1;
272
+ .loc 1 40 55
273
+ @%p16 bra $L__BB0_3;
274
+ mov.u64 %rd45, assertMessage_0;
275
+ cvta.global.u64 %rd46, %rd45;
276
+ mov.u64 %rd47, assertFile_0;
277
+ cvta.global.u64 %rd48, %rd47;
278
+ mov.u64 %rd49, assertFunc_0;
279
+ cvta.global.u64 %rd50, %rd49;
280
+ { // callseq 2, 0
281
+ .reg .b32 temp_param_reg;
282
+ .param .b64 param0;
283
+ st.param.b64 [param0+0], %rd46;
284
+ .param .b64 param1;
285
+ st.param.b64 [param1+0], %rd48;
286
+ .param .b32 param2;
287
+ st.param.b32 [param2+0], %r155;
288
+ .param .b64 param3;
289
+ st.param.b64 [param3+0], %rd50;
290
+ .param .b64 param4;
291
+ st.param.b64 [param4+0], %rd67;
292
+ call.uni
293
+ __assertfail,
294
+ (
295
+ param0,
296
+ param1,
297
+ param2,
298
+ param3,
299
+ param4
300
+ );
301
+ } // callseq 2
302
+ bra.uni $L__BB0_3;
303
+ $L__BB0_4:
304
+ .loc 1 31 36
305
+ shr.u32 %r99, %r1, 3;
306
+ or.b32 %r100, %r5, %r99;
307
+ mad.lo.s32 %r101, %r100, 12, %r4;
308
+ shl.b32 %r102, %r101, 2;
309
+ mov.u32 %r103, global_smem;
310
+ add.s32 %r104, %r103, %r102;
311
+ st.shared.f32 [%r104], %f152;
312
+ st.shared.f32 [%r104+768], %f153;
313
+ st.shared.f32 [%r104+1536], %f154;
314
+ st.shared.f32 [%r104+2304], %f155;
315
+ bar.sync 0;
316
+ mad.lo.s32 %r105, %r2, 12, %r3;
317
+ shl.b32 %r106, %r105, 2;
318
+ add.s32 %r107, %r103, %r106;
319
+ ld.shared.v4.f32 {%f80, %f81, %f82, %f83}, [%r107];
320
+ $L__tmp3:
321
+ .loc 2 108 21
322
+ sub.f32 %f84, %f161, %f160;
323
+ .loc 2 109 28
324
+ add.f32 %f85, %f80, %f81;
325
+ .loc 2 110 39
326
+ setp.eq.f32 %p23, %f85, 0f00000000;
327
+ .loc 2 110 60
328
+ mov.b32 %r75, %f81;
329
+ mov.b32 %r76, %f85;
330
+ div.full.f32 %r74, %r75, %r76;
331
+ mov.b32 %f86, %r74;
332
+ .loc 2 110 49
333
+ selp.f32 %f87, 0f00000000, %f86, %p23;
334
+ .loc 2 112 17
335
+ fma.rn.f32 %f88, %f84, %f87, %f160;
336
+ .loc 2 113 15
337
+ add.f32 %f89, %f156, %f157;
338
+ .loc 2 113 30
339
+ mul.f32 %f90, %f84, %f84;
340
+ .loc 2 113 38
341
+ mul.f32 %f91, %f90, %f80;
342
+ .loc 2 113 22
343
+ fma.rn.f32 %f92, %f91, %f87, %f89;
344
+ .loc 2 108 21
345
+ sub.f32 %f93, %f162, %f88;
346
+ .loc 2 109 28
347
+ add.f32 %f94, %f82, %f85;
348
+ .loc 2 110 39
349
+ setp.eq.f32 %p24, %f94, 0f00000000;
350
+ .loc 2 110 60
351
+ mov.b32 %r79, %f94;
352
+ mov.b32 %r78, %f82;
353
+ div.full.f32 %r77, %r78, %r79;
354
+ mov.b32 %f95, %r77;
355
+ .loc 2 110 49
356
+ selp.f32 %f96, 0f00000000, %f95, %p24;
357
+ .loc 2 112 17
358
+ fma.rn.f32 %f97, %f96, %f93, %f88;
359
+ .loc 2 113 15
360
+ add.f32 %f98, %f158, %f92;
361
+ .loc 2 113 30
362
+ mul.f32 %f99, %f93, %f93;
363
+ .loc 2 113 38
364
+ mul.f32 %f100, %f85, %f99;
365
+ .loc 2 113 22
366
+ fma.rn.f32 %f101, %f96, %f100, %f98;
367
+ .loc 2 108 21
368
+ sub.f32 %f102, %f163, %f97;
369
+ .loc 2 109 28
370
+ add.f32 %f103, %f83, %f94;
371
+ .loc 2 110 39
372
+ setp.eq.f32 %p25, %f103, 0f00000000;
373
+ .loc 2 110 60
374
+ mov.b32 %r82, %f103;
375
+ mov.b32 %r81, %f83;
376
+ div.full.f32 %r80, %r81, %r82;
377
+ mov.b32 %f104, %r80;
378
+ .loc 2 110 49
379
+ selp.f32 %f105, 0f00000000, %f104, %p25;
380
+ .loc 2 112 17
381
+ fma.rn.f32 %f106, %f105, %f102, %f97;
382
+ .loc 2 113 15
383
+ add.f32 %f107, %f159, %f101;
384
+ .loc 2 113 30
385
+ mul.f32 %f108, %f102, %f102;
386
+ .loc 2 113 38
387
+ mul.f32 %f109, %f94, %f108;
388
+ .loc 2 113 22
389
+ fma.rn.f32 %f110, %f105, %f109, %f107;
390
+ $L__tmp4:
391
+ .loc 2 120 46
392
+ mov.b32 %r108, %f106;
393
+ shfl.sync.bfly.b32 %r109, %r108, 1, 31, -1;
394
+ mov.b32 %f111, %r109;
395
+ mov.b32 %r110, %f110;
396
+ shfl.sync.bfly.b32 %r111, %r110, 1, 31, -1;
397
+ mov.b32 %f112, %r111;
398
+ shfl.sync.bfly.b32 %r84, %r82, 1, 31, -1;
399
+ mov.b32 %f113, %r84;
400
+ $L__tmp5:
401
+ .loc 2 108 21
402
+ sub.f32 %f114, %f111, %f106;
403
+ .loc 2 109 28
404
+ add.f32 %f115, %f103, %f113;
405
+ .loc 2 110 39
406
+ setp.eq.f32 %p26, %f115, 0f00000000;
407
+ .loc 2 110 60
408
+ mov.b32 %r85, %f115;
409
+ div.full.f32 %r83, %r84, %r85;
410
+ mov.b32 %f116, %r83;
411
+ .loc 2 110 49
412
+ selp.f32 %f117, 0f00000000, %f116, %p26;
413
+ .loc 2 112 17
414
+ fma.rn.f32 %f41, %f117, %f114, %f106;
415
+ .loc 2 113 15
416
+ add.f32 %f118, %f110, %f112;
417
+ .loc 2 113 30
418
+ mul.f32 %f119, %f114, %f114;
419
+ .loc 2 113 38
420
+ mul.f32 %f120, %f103, %f119;
421
+ .loc 2 113 22
422
+ fma.rn.f32 %f121, %f117, %f120, %f118;
423
+ $L__tmp6:
424
+ .loc 1 75 24
425
+ mov.b32 %r87, %f121;
426
+ mov.b32 %r88, 1132462080;
427
+ div.full.f32 %r86, %r87, %r88;
428
+ mov.b32 %f122, %r86;
429
+ .loc 1 77 24
430
+ add.f32 %f42, %f122, 0f3727C5AC;
431
+ .loc 1 58 36
432
+ add.s64 %rd71, %rd18, %rd2;
433
+ mov.b32 %r157, -8;
434
+ rsqrt.approx.ftz.f32 %f139, %f42;
435
+ bra.uni $L__BB0_5;
436
+ $L__BB0_7:
437
+ .loc 1 0 0
438
+ mov.b32 %f43, %r112;
439
+ mov.b32 %f44, %r113;
440
+ mov.b32 %f45, %r114;
441
+ mov.b32 %f46, %r115;
442
+ cvt.s64.s32 %rd13, %r137;
443
+ mov.b32 %f47, %r124;
444
+ mov.b32 %f48, %r125;
445
+ mov.b32 %f49, %r126;
446
+ mov.b32 %f50, %r127;
447
+ mov.b32 %f51, %r128;
448
+ mov.b32 %f52, %r129;
449
+ mov.b32 %f53, %r130;
450
+ mov.b32 %f54, %r131;
451
+ .loc 1 69 54
452
+ mov.u32 %r139, 0x0;
453
+ mov.u32 %r140, 0x0;
454
+ mov.u32 %r141, 0x0;
455
+ mov.u32 %r142, 0x0;
456
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r139, %r140, %r141, %r142 }, [ %rd72 + 0 ];
457
+ @!%p1 mov.u32 %r139, %r143;
458
+ @!%p1 mov.u32 %r140, %r143;
459
+ @!%p1 mov.u32 %r141, %r143;
460
+ @!%p1 mov.u32 %r142, %r143;
461
+ mov.b32 %f123, %r139;
462
+ mov.b32 %f124, %r140;
463
+ mov.b32 %f125, %r141;
464
+ mov.b32 %f126, %r142;
465
+ .loc 1 70 24
466
+ add.f32 %f127, %f43, %f123;
467
+ add.f32 %f128, %f44, %f124;
468
+ add.f32 %f129, %f45, %f125;
469
+ add.f32 %f130, %f46, %f126;
470
+ .loc 1 72 24
471
+ add.f32 %f131, %f47, %f127;
472
+ add.f32 %f132, %f48, %f128;
473
+ add.f32 %f133, %f49, %f129;
474
+ add.f32 %f134, %f50, %f130;
475
+ .loc 1 73 24
476
+ sub.f32 %f135, %f131, %f41;
477
+ sub.f32 %f136, %f132, %f41;
478
+ sub.f32 %f137, %f133, %f41;
479
+ sub.f32 %f138, %f134, %f41;
480
+ .loc 1 79 24
481
+ mul.f32 %f140, %f135, %f139;
482
+ mul.f32 %f141, %f136, %f139;
483
+ mul.f32 %f142, %f137, %f139;
484
+ mul.f32 %f143, %f138, %f139;
485
+ .loc 1 80 24
486
+ mul.f32 %f144, %f140, %f51;
487
+ mul.f32 %f145, %f141, %f52;
488
+ mul.f32 %f146, %f142, %f53;
489
+ mul.f32 %f147, %f143, %f54;
490
+ .loc 1 82 29
491
+ shl.b64 %rd66, %rd13, 1;
492
+ add.s64 %rd65, %rd19, %rd66;
493
+ .loc 1 82 52
494
+ mov.b32 %r147, %f144;
495
+ cvt.rn.bf16.f32 %rs9, %r147;
496
+ mov.b32 %r148, %f145;
497
+ cvt.rn.bf16.f32 %rs10, %r148;
498
+ mov.b32 %r149, %f146;
499
+ cvt.rn.bf16.f32 %rs11, %r149;
500
+ mov.b32 %r150, %f147;
501
+ cvt.rn.bf16.f32 %rs12, %r150;
502
+ mov.b32 %r153, {%rs9, %rs10};
503
+ mov.b32 %r154, {%rs11, %rs12};
504
+ @%p1 st.global.v2.b32 [ %rd65 + 0 ], { %r153, %r154 };
505
+ .loc 1 58 36
506
+ add.s32 %r157, %r157, 8;
507
+ add.s64 %rd72, %rd72, 32;
508
+ add.s64 %rd71, %rd71, 32;
509
+ add.s64 %rd70, %rd70, 32;
510
+ setp.lt.u32 %p47, %r157, 248;
511
+ @%p47 bra $L__BB0_5;
512
+ bra.uni $L__BB0_8;
513
+ $L__BB0_5:
514
+ .loc 1 62 51
515
+ mov.u32 %r112, 0x0;
516
+ mov.u32 %r113, 0x0;
517
+ mov.u32 %r114, 0x0;
518
+ mov.u32 %r115, 0x0;
519
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r112, %r113, %r114, %r115 }, [ %rd70 + 0 ];
520
+ @!%p1 mov.u32 %r112, %r143;
521
+ @!%p1 mov.u32 %r113, %r143;
522
+ @!%p1 mov.u32 %r114, %r143;
523
+ @!%p1 mov.u32 %r115, %r143;
524
+ .loc 1 63 35
525
+ add.s32 %r136, %r6, %r157;
526
+ add.s32 %r137, %r136, 8;
527
+ mul.wide.s32 %rd56, %r137, 2;
528
+ add.s64 %rd54, %rd17, %rd56;
529
+ .loc 1 63 51
530
+ mov.u32 %r120, 0x0;
531
+ mov.u32 %r121, 0x0;
532
+ @%p1 ld.global.L1::evict_first.v2.b32 { %r120, %r121 }, [ %rd54 + 0 ];
533
+ @!%p1 mov.u32 %r120, %r143;
534
+ @!%p1 mov.u32 %r121, %r143;
535
+ cvt.u16.u32 %rs5, %r120;
536
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r120; }
537
+ cvt.u16.u32 %rs7, %r121;
538
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r121; }
539
+ .loc 1 63 103
540
+ cvt.f32.bf16 %r124, %rs5;
541
+ cvt.f32.bf16 %r125, %rs6;
542
+ cvt.f32.bf16 %r126, %rs7;
543
+ cvt.f32.bf16 %r127, %rs8;
544
+ .loc 1 64 40
545
+ mov.u32 %r128, 0x0;
546
+ mov.u32 %r129, 0x0;
547
+ mov.u32 %r130, 0x0;
548
+ mov.u32 %r131, 0x0;
549
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r128, %r129, %r130, %r131 }, [ %rd71 + 0 ];
550
+ @!%p1 mov.u32 %r128, %r143;
551
+ @!%p1 mov.u32 %r129, %r143;
552
+ @!%p1 mov.u32 %r130, %r143;
553
+ @!%p1 mov.u32 %r131, %r143;
554
+ .loc 1 68 57
555
+ @%p16 bra $L__BB0_7;
556
+ mov.u64 %rd57, assertMessage_1;
557
+ cvta.global.u64 %rd58, %rd57;
558
+ mov.u64 %rd59, assertFile_1;
559
+ cvta.global.u64 %rd60, %rd59;
560
+ mov.u64 %rd61, assertFunc_1;
561
+ cvta.global.u64 %rd62, %rd61;
562
+ { // callseq 3, 0
563
+ .reg .b32 temp_param_reg;
564
+ .param .b64 param0;
565
+ st.param.b64 [param0+0], %rd58;
566
+ .param .b64 param1;
567
+ st.param.b64 [param1+0], %rd60;
568
+ .param .b32 param2;
569
+ st.param.b32 [param2+0], %r155;
570
+ .param .b64 param3;
571
+ st.param.b64 [param3+0], %rd62;
572
+ .param .b64 param4;
573
+ st.param.b64 [param4+0], %rd67;
574
+ call.uni
575
+ __assertfail,
576
+ (
577
+ param0,
578
+ param1,
579
+ param2,
580
+ param3,
581
+ param4
582
+ );
583
+ } // callseq 3
584
+ bra.uni $L__BB0_7;
585
+ $L__BB0_8:
586
+ .loc 1 58 4
587
+ ret;
588
+ $L__tmp7:
589
+ $L__func_end0:
590
+
591
+ }
592
+ // .globl __nv_rsqrtf
593
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
594
+ .param .b32 __nv_rsqrtf_param_0
595
+ )
596
+ {
597
+ .reg .f32 %f<3>;
598
+ $L__func_begin1:
599
+
600
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
601
+ rsqrt.approx.ftz.f32 %f2, %f1;
602
+ st.param.f32 [func_retval0+0], %f2;
603
+ ret;
604
+ $L__func_end1:
605
+
606
+ }
607
+ .file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py"
608
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
609
+ .section .debug_abbrev
610
+ {
611
+ .b8 1
612
+ .b8 17
613
+ .b8 1
614
+ .b8 37
615
+ .b8 8
616
+ .b8 19
617
+ .b8 5
618
+ .b8 3
619
+ .b8 8
620
+ .b8 16
621
+ .b8 6
622
+ .b8 27
623
+ .b8 8
624
+ .b8 180
625
+ .b8 66
626
+ .b8 12
627
+ .b8 17
628
+ .b8 1
629
+ .b8 18
630
+ .b8 1
631
+ .b8 0
632
+ .b8 0
633
+ .b8 2
634
+ .b8 46
635
+ .b8 0
636
+ .b8 135
637
+ .b8 64
638
+ .b8 8
639
+ .b8 3
640
+ .b8 8
641
+ .b8 58
642
+ .b8 11
643
+ .b8 59
644
+ .b8 11
645
+ .b8 63
646
+ .b8 12
647
+ .b8 32
648
+ .b8 11
649
+ .b8 0
650
+ .b8 0
651
+ .b8 3
652
+ .b8 46
653
+ .b8 1
654
+ .b8 17
655
+ .b8 1
656
+ .b8 18
657
+ .b8 1
658
+ .b8 64
659
+ .b8 10
660
+ .b8 49
661
+ .b8 19
662
+ .b8 0
663
+ .b8 0
664
+ .b8 4
665
+ .b8 29
666
+ .b8 0
667
+ .b8 49
668
+ .b8 19
669
+ .b8 17
670
+ .b8 1
671
+ .b8 18
672
+ .b8 1
673
+ .b8 88
674
+ .b8 11
675
+ .b8 89
676
+ .b8 11
677
+ .b8 87
678
+ .b8 11
679
+ .b8 0
680
+ .b8 0
681
+ .b8 5
682
+ .b8 29
683
+ .b8 1
684
+ .b8 49
685
+ .b8 19
686
+ .b8 17
687
+ .b8 1
688
+ .b8 18
689
+ .b8 1
690
+ .b8 88
691
+ .b8 11
692
+ .b8 89
693
+ .b8 11
694
+ .b8 87
695
+ .b8 11
696
+ .b8 0
697
+ .b8 0
698
+ .b8 0
699
+ }
700
+ .section .debug_info
701
+ {
702
+ .b32 302
703
+ .b8 2
704
+ .b8 0
705
+ .b32 .debug_abbrev
706
+ .b8 8
707
+ .b8 1
708
+ .b8 116
709
+ .b8 114
710
+ .b8 105
711
+ .b8 116
712
+ .b8 111
713
+ .b8 110
714
+ .b8 0
715
+ .b8 2
716
+ .b8 0
717
+ .b8 99
718
+ .b8 112
719
+ .b8 110
720
+ .b8 51
721
+ .b8 108
722
+ .b8 97
723
+ .b8 119
724
+ .b8 103
725
+ .b8 54
726
+ .b8 53
727
+ .b8 108
728
+ .b8 112
729
+ .b8 105
730
+ .b8 54
731
+ .b8 51
732
+ .b8 103
733
+ .b8 118
734
+ .b8 54
735
+ .b8 99
736
+ .b8 54
737
+ .b8 112
738
+ .b8 110
739
+ .b8 52
740
+ .b8 111
741
+ .b8 105
742
+ .b8 107
743
+ .b8 104
744
+ .b8 103
745
+ .b8 54
746
+ .b8 113
747
+ .b8 118
748
+ .b8 97
749
+ .b8 50
750
+ .b8 104
751
+ .b8 50
752
+ .b8 113
753
+ .b8 106
754
+ .b8 100
755
+ .b8 112
756
+ .b8 120
757
+ .b8 101
758
+ .b8 54
759
+ .b8 113
760
+ .b8 106
761
+ .b8 52
762
+ .b8 108
763
+ .b8 118
764
+ .b8 116
765
+ .b8 116
766
+ .b8 119
767
+ .b8 101
768
+ .b8 122
769
+ .b8 46
770
+ .b8 112
771
+ .b8 121
772
+ .b8 0
773
+ .b32 .debug_line
774
+ .b8 47
775
+ .b8 116
776
+ .b8 109
777
+ .b8 112
778
+ .b8 47
779
+ .b8 116
780
+ .b8 111
781
+ .b8 114
782
+ .b8 99
783
+ .b8 104
784
+ .b8 105
785
+ .b8 110
786
+ .b8 100
787
+ .b8 117
788
+ .b8 99
789
+ .b8 116
790
+ .b8 111
791
+ .b8 114
792
+ .b8 95
793
+ .b8 114
794
+ .b8 111
795
+ .b8 111
796
+ .b8 116
797
+ .b8 47
798
+ .b8 112
799
+ .b8 110
800
+ .b8 0
801
+ .b8 1
802
+ .b64 $L__func_begin0
803
+ .b64 $L__func_end0
804
+ .b8 2
805
+ .b8 116
806
+ .b8 114
807
+ .b8 105
808
+ .b8 116
809
+ .b8 111
810
+ .b8 110
811
+ .b8 95
812
+ .b8 95
813
+ .b8 48
814
+ .b8 100
815
+ .b8 49
816
+ .b8 100
817
+ .b8 50
818
+ .b8 100
819
+ .b8 51
820
+ .b8 100
821
+ .b8 52
822
+ .b8 100
823
+ .b8 53
824
+ .b8 100
825
+ .b8 54
826
+ .b8 100
827
+ .b8 101
828
+ .b8 55
829
+ .b8 100
830
+ .b8 101
831
+ .b8 0
832
+ .b8 116
833
+ .b8 114
834
+ .b8 105
835
+ .b8 116
836
+ .b8 111
837
+ .b8 110
838
+ .b8 95
839
+ .b8 95
840
+ .b8 48
841
+ .b8 100
842
+ .b8 49
843
+ .b8 100
844
+ .b8 50
845
+ .b8 100
846
+ .b8 51
847
+ .b8 100
848
+ .b8 52
849
+ .b8 100
850
+ .b8 53
851
+ .b8 100
852
+ .b8 54
853
+ .b8 100
854
+ .b8 101
855
+ .b8 55
856
+ .b8 100
857
+ .b8 101
858
+ .b8 0
859
+ .b8 1
860
+ .b8 18
861
+ .b8 1
862
+ .b8 1
863
+ .b8 3
864
+ .b64 $L__func_begin0
865
+ .b64 $L__func_end0
866
+ .b8 1
867
+ .b8 156
868
+ .b32 125
869
+ .b8 4
870
+ .b32 125
871
+ .b64 $L__tmp1
872
+ .b64 $L__tmp2
873
+ .b8 2
874
+ .b8 47
875
+ .b8 41
876
+ .b8 5
877
+ .b32 125
878
+ .b64 $L__tmp3
879
+ .b64 $L__tmp6
880
+ .b8 2
881
+ .b8 53
882
+ .b8 44
883
+ .b8 4
884
+ .b32 125
885
+ .b64 $L__tmp3
886
+ .b64 $L__tmp6
887
+ .b8 2
888
+ .b8 120
889
+ .b8 46
890
+ .b8 0
891
+ .b8 4
892
+ .b32 125
893
+ .b64 $L__tmp4
894
+ .b64 $L__tmp5
895
+ .b8 2
896
+ .b8 53
897
+ .b8 44
898
+ .b8 0
899
+ .b8 0
900
+ }
901
+ .section .debug_pubnames
902
+ {
903
+ .b32 $L__pubNames_end0-$L__pubNames_start0
904
+ $L__pubNames_start0:
905
+ .b8 2
906
+ .b8 0
907
+ .b32 .debug_info
908
+ .b32 306
909
+ .b32 125
910
+ .b8 116
911
+ .b8 114
912
+ .b8 105
913
+ .b8 116
914
+ .b8 111
915
+ .b8 110
916
+ .b8 95
917
+ .b8 95
918
+ .b8 48
919
+ .b8 100
920
+ .b8 49
921
+ .b8 100
922
+ .b8 50
923
+ .b8 100
924
+ .b8 51
925
+ .b8 100
926
+ .b8 52
927
+ .b8 100
928
+ .b8 53
929
+ .b8 100
930
+ .b8 54
931
+ .b8 100
932
+ .b8 101
933
+ .b8 55
934
+ .b8 100
935
+ .b8 101
936
+ .b8 0
937
+ .b32 0
938
+ $L__pubNames_end0:
939
+ }
940
+ .section .debug_pubtypes
941
+ {
942
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
943
+ $L__pubTypes_start0:
944
+ .b8 2
945
+ .b8 0
946
+ .b32 .debug_info
947
+ .b32 306
948
+ .b32 0
949
+ $L__pubTypes_end0:
950
+ }
951
+ .section .debug_loc { }
.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttir ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16>
4
+ %cst_0 = arith.constant 0.000000e+00 : f32
5
+ %cst_1 = arith.constant dense<1.000000e+00> : tensor<64x8xf32>
6
+ %c256_i32 = arith.constant 256 : i32
7
+ %c8_i32 = arith.constant 8 : i32
8
+ %c0_i32 = arith.constant 0 : i32
9
+ %cst_2 = arith.constant dense<256> : tensor<64x1xi64>
10
+ %cst_3 = arith.constant dense<0> : tensor<64x1xi64>
11
+ %cst_4 = arith.constant dense<50257> : tensor<64x1xi64>
12
+ %cst_5 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
13
+ %cst_6 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
14
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<1x8xf32>
15
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
16
+ %cst_9 = arith.constant dense<256> : tensor<64x1xi32>
17
+ %cst_10 = arith.constant dense<256> : tensor<1x8xi32>
18
+ %cst_11 = arith.constant dense<512> : tensor<64x1xi32>
19
+ %c64_i32 = arith.constant 64 : i32
20
+ %0 = tt.get_program_id x : i32
21
+ %1 = arith.muli %0, %c64_i32 : i32
22
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
23
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
24
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
25
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
26
+ %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
27
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
28
+ %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
29
+ %9 = tt.addptr %8, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
30
+ %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
31
+ %11 = arith.remsi %5, %cst_11 : tensor<64x1xi32>
32
+ %12 = arith.muli %11, %cst_9 : tensor<64x1xi32>
33
+ %13 = tt.broadcast %12 : (tensor<64x1xi32>) -> tensor<64x8xi32>
34
+ %14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
35
+ %15 = arith.muli %5, %cst_9 : tensor<64x1xi32>
36
+ %16 = tt.broadcast %15 : (tensor<64x1xi32>) -> tensor<64x8xi32>
37
+ %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
38
+ %18 = arith.addi %10, %cst_4 : tensor<64x1xi64>
39
+ %19 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64>
40
+ %20 = arith.select %19, %18, %10 : tensor<64x1xi1>, tensor<64x1xi64>
41
+ %21 = arith.cmpi sge, %20, %cst_3 : tensor<64x1xi64>
42
+ %22 = arith.cmpi slt, %20, %cst_4 : tensor<64x1xi64>
43
+ %23 = arith.andi %21, %22 : tensor<64x1xi1>
44
+ %24 = arith.muli %20, %cst_2 : tensor<64x1xi64>
45
+ %25 = tt.broadcast %24 : (tensor<64x1xi64>) -> tensor<64x8xi64>
46
+ %26 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
47
+ %27:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<64x8xf32>, tensor<64x8xf32>, tensor<64x8xf32>) : i32 {
48
+ %51 = tt.splat %arg8 : (i32) -> tensor<1x8xi32>
49
+ %52 = arith.addi %51, %7 : tensor<1x8xi32>
50
+ %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x8xi32>
51
+ %54 = tt.broadcast %52 : (tensor<1x8xi32>) -> tensor<64x8xi32>
52
+ %55 = arith.addi %54, %13 : tensor<64x8xi32>
53
+ %56 = tt.addptr %14, %55 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
54
+ %57 = tt.broadcast %53 : (tensor<1x8xi1>) -> tensor<64x8xi1>
55
+ %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
56
+ %59 = arith.addi %54, %16 : tensor<64x8xi32>
57
+ %60 = tt.addptr %17, %59 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
58
+ %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xbf16>
59
+ %62 = arith.extf %61 : tensor<64x8xbf16> to tensor<64x8xf32>
60
+ tt.assert %23, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
61
+ %63 = arith.extsi %52 : tensor<1x8xi32> to tensor<1x8xi64>
62
+ %64 = tt.broadcast %63 : (tensor<1x8xi64>) -> tensor<64x8xi64>
63
+ %65 = arith.addi %64, %25 : tensor<64x8xi64>
64
+ %66 = tt.addptr %26, %65 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi64>
65
+ %67 = tt.load %66, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
66
+ %68 = arith.addf %67, %58 : tensor<64x8xf32>
67
+ %69 = arith.addf %68, %62 : tensor<64x8xf32>
68
+ %70 = arith.subf %69, %arg9 : tensor<64x8xf32>
69
+ %71 = arith.addf %arg11, %cst_1 : tensor<64x8xf32>
70
+ %72 = arith.divf %70, %71 : tensor<64x8xf32>
71
+ %73 = arith.addf %arg9, %72 : tensor<64x8xf32>
72
+ %74 = arith.subf %69, %73 : tensor<64x8xf32>
73
+ %75 = arith.mulf %70, %74 : tensor<64x8xf32>
74
+ %76 = arith.addf %arg10, %75 : tensor<64x8xf32>
75
+ %77 = arith.select %57, %73, %arg9 : tensor<64x8xi1>, tensor<64x8xf32>
76
+ %78 = arith.select %57, %76, %arg10 : tensor<64x8xi1>, tensor<64x8xf32>
77
+ %79 = arith.select %57, %71, %arg11 : tensor<64x8xi1>, tensor<64x8xf32>
78
+ scf.yield %77, %78, %79 : tensor<64x8xf32>, tensor<64x8xf32>, tensor<64x8xf32>
79
+ }
80
+ %28:3 = "tt.reduce"(%27#0, %27#1, %27#2) <{axis = 1 : i32}> ({
81
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
82
+ %51 = arith.subf %arg11, %arg8 : f32
83
+ %52 = arith.addf %arg10, %arg13 : f32
84
+ %53 = arith.cmpf oeq, %52, %cst_0 : f32
85
+ %54 = arith.divf %arg13, %52 : f32
86
+ %55 = arith.select %53, %cst_0, %54 : f32
87
+ %56 = arith.mulf %51, %55 : f32
88
+ %57 = arith.addf %arg8, %56 : f32
89
+ %58 = arith.addf %arg9, %arg12 : f32
90
+ %59 = arith.mulf %51, %51 : f32
91
+ %60 = arith.mulf %59, %arg10 : f32
92
+ %61 = arith.mulf %60, %55 : f32
93
+ %62 = arith.addf %58, %61 : f32
94
+ tt.reduce.return %57, %62, %52 : f32, f32, f32
95
+ }) : (tensor<64x8xf32>, tensor<64x8xf32>, tensor<64x8xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
96
+ %29 = tt.expand_dims %28#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
97
+ %30 = tt.expand_dims %28#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
98
+ %31 = arith.muli %11, %cst_9 : tensor<64x1xi32>
99
+ %32 = tt.broadcast %31 : (tensor<64x1xi32>) -> tensor<64x8xi32>
100
+ %33 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
101
+ %34 = arith.muli %5, %cst_9 : tensor<64x1xi32>
102
+ %35 = tt.broadcast %34 : (tensor<64x1xi32>) -> tensor<64x8xi32>
103
+ %36 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
104
+ %37 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>>
105
+ %38 = arith.addi %10, %cst_4 : tensor<64x1xi64>
106
+ %39 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64>
107
+ %40 = arith.select %39, %38, %10 : tensor<64x1xi1>, tensor<64x1xi64>
108
+ %41 = arith.cmpi sge, %40, %cst_3 : tensor<64x1xi64>
109
+ %42 = arith.cmpi slt, %40, %cst_4 : tensor<64x1xi64>
110
+ %43 = arith.andi %41, %42 : tensor<64x1xi1>
111
+ %44 = arith.muli %40, %cst_2 : tensor<64x1xi64>
112
+ %45 = tt.broadcast %44 : (tensor<64x1xi64>) -> tensor<64x8xi64>
113
+ %46 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
114
+ %47 = tt.broadcast %29 : (tensor<64x1xf32>) -> tensor<64x8xf32>
115
+ %48 = arith.divf %30, %cst_6 : tensor<64x1xf32>
116
+ %49 = arith.addf %48, %cst_5 : tensor<64x1xf32>
117
+ %50 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
118
+ scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 : i32 {
119
+ %51 = tt.splat %arg8 : (i32) -> tensor<1x8xi32>
120
+ %52 = arith.addi %51, %7 : tensor<1x8xi32>
121
+ %53 = arith.cmpi slt, %52, %cst_10 : tensor<1x8xi32>
122
+ %54 = tt.broadcast %52 : (tensor<1x8xi32>) -> tensor<64x8xi32>
123
+ %55 = arith.addi %54, %32 : tensor<64x8xi32>
124
+ %56 = tt.addptr %33, %55 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
125
+ %57 = tt.broadcast %53 : (tensor<1x8xi1>) -> tensor<64x8xi1>
126
+ %58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
127
+ %59 = arith.addi %54, %35 : tensor<64x8xi32>
128
+ %60 = tt.addptr %36, %59 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
129
+ %61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
130
+ %62 = arith.extf %61 : tensor<64x8xbf16> to tensor<64x8xf32>
131
+ %63 = tt.addptr %37, %52 : tensor<1x8x!tt.ptr<f32, 1>>, tensor<1x8xi32>
132
+ %64 = tt.load %63, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x8xf32>
133
+ tt.assert %43, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
134
+ %65 = arith.extsi %52 : tensor<1x8xi32> to tensor<1x8xi64>
135
+ %66 = tt.broadcast %65 : (tensor<1x8xi64>) -> tensor<64x8xi64>
136
+ %67 = arith.addi %66, %45 : tensor<64x8xi64>
137
+ %68 = tt.addptr %46, %67 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi64>
138
+ %69 = tt.load %68, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
139
+ %70 = arith.addf %69, %58 : tensor<64x8xf32>
140
+ %71 = arith.addf %70, %62 : tensor<64x8xf32>
141
+ %72 = arith.subf %71, %47 : tensor<64x8xf32>
142
+ %73 = tt.extern_elementwise %49 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
143
+ %74 = tt.broadcast %73 : (tensor<64x1xf32>) -> tensor<64x8xf32>
144
+ %75 = arith.mulf %72, %74 : tensor<64x8xf32>
145
+ %76 = tt.broadcast %64 : (tensor<1x8xf32>) -> tensor<64x8xf32>
146
+ %77 = arith.mulf %75, %76 : tensor<64x8xf32>
147
+ %78 = tt.addptr %50, %59 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
148
+ %79 = arith.truncf %77 : tensor<64x8xf32> to tensor<64x8xbf16>
149
+ tt.store %78, %79, %57 {cache = 1 : i32, evict = 1 : i32} : tensor<64x8xbf16>
150
+ }
151
+ tt.return
152
+ }
153
+ }
.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.ttgir ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
6
+ %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
8
+ %cst_1 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
9
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
10
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x8xf32, #blocked>
11
+ %cst_4 = arith.constant dense<1.000000e+00> : tensor<64x8xf32, #blocked>
12
+ %cst_5 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
13
+ %cst_6 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
14
+ %cst_7 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
15
+ %cst_8 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
16
+ %cst_9 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
17
+ %c0_i32 = arith.constant 0 : i32
18
+ %c8_i32 = arith.constant 8 : i32
19
+ %c256_i32 = arith.constant 256 : i32
20
+ %cst_10 = arith.constant dense<1.000000e+00> : tensor<64x8xf32, #blocked2>
21
+ %cst_11 = arith.constant 0.000000e+00 : f32
22
+ %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked2>
23
+ %cst_13 = arith.constant dense<256> : tensor<1x8xi32, #blocked2>
24
+ %cst_14 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
25
+ %cst_15 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
26
+ %c64_i32 = arith.constant 64 : i32
27
+ %0 = tt.get_program_id x : i32
28
+ %1 = arith.muli %0, %c64_i32 : i32
29
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
30
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
31
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
32
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
33
+ %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
34
+ %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
35
+ %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
36
+ %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
37
+ %10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
38
+ %11 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
39
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
40
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x8xi32, #blocked2>
41
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
42
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
43
+ %16 = tt.addptr %14, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
44
+ %17 = tt.addptr %15, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
45
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
46
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
47
+ %20 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
48
+ %21 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked>
49
+ %22 = tt.broadcast %21 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
50
+ %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
51
+ %24 = arith.addi %18, %cst_7 : tensor<64x1xi64, #blocked>
52
+ %25 = arith.addi %19, %cst_8 : tensor<64x1xi64, #blocked1>
53
+ %26 = arith.cmpi slt, %18, %cst_6 : tensor<64x1xi64, #blocked>
54
+ %27 = arith.cmpi slt, %19, %cst_9 : tensor<64x1xi64, #blocked1>
55
+ %28 = arith.select %26, %24, %18 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
56
+ %29 = arith.select %27, %25, %19 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
57
+ %30 = arith.cmpi sge, %29, %cst_9 : tensor<64x1xi64, #blocked1>
58
+ %31 = arith.cmpi slt, %29, %cst_8 : tensor<64x1xi64, #blocked1>
59
+ %32 = arith.andi %30, %31 : tensor<64x1xi1, #blocked1>
60
+ %33 = arith.muli %28, %cst_5 : tensor<64x1xi64, #blocked>
61
+ %34 = tt.broadcast %33 : (tensor<64x1xi64, #blocked>) -> tensor<64x8xi64, #blocked>
62
+ %35 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
63
+ %36:4 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c8_i32 iter_args(%arg8 = %cst_2, %arg9 = %cst_2, %arg10 = %cst_12, %arg11 = %cst_2) -> (tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked2>, tensor<64x8xf32, #blocked>) : i32 {
64
+ %48 = tt.splat %arg7 : (i32) -> tensor<1x8xi32, #blocked>
65
+ %49 = tt.splat %arg7 : (i32) -> tensor<1x8xi32, #blocked2>
66
+ %50 = arith.addi %48, %12 : tensor<1x8xi32, #blocked>
67
+ %51 = arith.addi %49, %13 : tensor<1x8xi32, #blocked2>
68
+ %52 = arith.cmpi slt, %50, %cst_0 : tensor<1x8xi32, #blocked>
69
+ %53 = arith.cmpi slt, %51, %cst_13 : tensor<1x8xi32, #blocked2>
70
+ %54 = tt.broadcast %50 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
71
+ %55 = arith.addi %54, %22 : tensor<64x8xi32, #blocked>
72
+ %56 = tt.addptr %23, %55 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
73
+ %57 = tt.broadcast %52 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
74
+ %58 = tt.broadcast %53 : (tensor<1x8xi1, #blocked2>) -> tensor<64x8xi1, #blocked2>
75
+ %59 = tt.load %56, %57, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
76
+ tt.assert %32, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
77
+ %60 = arith.extsi %50 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked>
78
+ %61 = tt.broadcast %60 : (tensor<1x8xi64, #blocked>) -> tensor<64x8xi64, #blocked>
79
+ %62 = arith.addi %61, %34 : tensor<64x8xi64, #blocked>
80
+ %63 = tt.addptr %35, %62 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi64, #blocked>
81
+ %64 = tt.load %63, %57, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
82
+ %65 = arith.addf %64, %59 : tensor<64x8xf32, #blocked>
83
+ %66 = arith.subf %65, %arg8 : tensor<64x8xf32, #blocked>
84
+ %67 = arith.addf %arg11, %cst_4 : tensor<64x8xf32, #blocked>
85
+ %68 = arith.addf %arg10, %cst_10 : tensor<64x8xf32, #blocked2>
86
+ %69 = arith.divf %66, %67 : tensor<64x8xf32, #blocked>
87
+ %70 = arith.addf %arg8, %69 : tensor<64x8xf32, #blocked>
88
+ %71 = arith.subf %65, %70 : tensor<64x8xf32, #blocked>
89
+ %72 = arith.mulf %66, %71 : tensor<64x8xf32, #blocked>
90
+ %73 = arith.addf %arg9, %72 : tensor<64x8xf32, #blocked>
91
+ %74 = arith.select %57, %70, %arg8 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
92
+ %75 = arith.select %57, %73, %arg9 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
93
+ %76 = arith.select %57, %67, %arg11 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
94
+ %77 = arith.select %58, %68, %arg10 : tensor<64x8xi1, #blocked2>, tensor<64x8xf32, #blocked2>
95
+ scf.yield %74, %75, %77, %76 : tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked2>, tensor<64x8xf32, #blocked>
96
+ }
97
+ %37 = triton_gpu.convert_layout %36#2 : (tensor<64x8xf32, #blocked2>) -> tensor<64x8xf32, #blocked>
98
+ %38:3 = "tt.reduce"(%36#0, %36#1, %37) <{axis = 1 : i32}> ({
99
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
100
+ %48 = arith.subf %arg10, %arg7 : f32
101
+ %49 = arith.addf %arg9, %arg12 : f32
102
+ %50 = arith.cmpf oeq, %49, %cst_11 : f32
103
+ %51 = arith.divf %arg12, %49 : f32
104
+ %52 = arith.select %50, %cst_11, %51 : f32
105
+ %53 = arith.mulf %48, %52 : f32
106
+ %54 = arith.addf %arg7, %53 : f32
107
+ %55 = arith.addf %arg8, %arg11 : f32
108
+ %56 = arith.mulf %48, %48 : f32
109
+ %57 = arith.mulf %56, %arg9 : f32
110
+ %58 = arith.mulf %57, %52 : f32
111
+ %59 = arith.addf %55, %58 : f32
112
+ tt.reduce.return %54, %59, %49 : f32, f32, f32
113
+ }) : (tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
114
+ %39 = tt.expand_dims %38#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
115
+ %40 = tt.expand_dims %38#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
116
+ %41 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>, #blocked>
117
+ %42 = tt.broadcast %39 : (tensor<64x1xf32, #blocked>) -> tensor<64x8xf32, #blocked>
118
+ %43 = arith.divf %40, %cst_15 : tensor<64x1xf32, #blocked>
119
+ %44 = arith.addf %43, %cst_14 : tensor<64x1xf32, #blocked>
120
+ %45 = arith.muli %8, %cst_1 : tensor<64x1xi32, #blocked>
121
+ %46 = tt.broadcast %45 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
122
+ %47 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
123
+ scf.for %arg7 = %c0_i32 to %c256_i32 step %c8_i32 : i32 {
124
+ %48 = tt.splat %arg7 : (i32) -> tensor<1x8xi32, #blocked>
125
+ %49 = arith.addi %48, %12 : tensor<1x8xi32, #blocked>
126
+ %50 = arith.cmpi slt, %49, %cst_0 : tensor<1x8xi32, #blocked>
127
+ %51 = tt.broadcast %49 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
128
+ %52 = arith.addi %51, %22 : tensor<64x8xi32, #blocked>
129
+ %53 = tt.addptr %23, %52 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
130
+ %54 = tt.broadcast %50 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
131
+ %55 = tt.load %53, %54, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
132
+ %56 = tt.addptr %41, %49 : tensor<1x8x!tt.ptr<f32, 1>, #blocked>, tensor<1x8xi32, #blocked>
133
+ %57 = tt.load %56, %50, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x8xf32, #blocked>
134
+ tt.assert %32, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
135
+ %58 = arith.extsi %49 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked>
136
+ %59 = tt.broadcast %58 : (tensor<1x8xi64, #blocked>) -> tensor<64x8xi64, #blocked>
137
+ %60 = arith.addi %59, %34 : tensor<64x8xi64, #blocked>
138
+ %61 = tt.addptr %35, %60 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi64, #blocked>
139
+ %62 = tt.load %61, %54, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
140
+ %63 = arith.addf %62, %55 : tensor<64x8xf32, #blocked>
141
+ %64 = arith.subf %63, %42 : tensor<64x8xf32, #blocked>
142
+ %65 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
143
+ %66 = tt.broadcast %65 : (tensor<64x1xf32, #blocked>) -> tensor<64x8xf32, #blocked>
144
+ %67 = arith.mulf %64, %66 : tensor<64x8xf32, #blocked>
145
+ %68 = tt.broadcast %57 : (tensor<1x8xf32, #blocked>) -> tensor<64x8xf32, #blocked>
146
+ %69 = arith.mulf %67, %68 : tensor<64x8xf32, #blocked>
147
+ %70 = arith.addi %51, %46 : tensor<64x8xi32, #blocked>
148
+ %71 = tt.addptr %47, %70 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
149
+ %72 = arith.truncf %69 : tensor<64x8xf32, #blocked> to tensor<64x8xbf16, #blocked>
150
+ tt.store %71, %72, %54 {cache = 1 : i32, evict = 1 : i32} : tensor<64x8xbf16, #blocked>
151
+ }
152
+ tt.return
153
+ }
154
+ }
.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ptx ADDED
@@ -0,0 +1,723 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5de6de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
12
+
13
+ .visible .entry triton__0d1d2d3d4d5de6de(
14
+ .param .u64 triton__0d1d2d3d4d5de6de_param_0,
15
+ .param .u64 triton__0d1d2d3d4d5de6de_param_1,
16
+ .param .u64 triton__0d1d2d3d4d5de6de_param_2,
17
+ .param .u64 triton__0d1d2d3d4d5de6de_param_3,
18
+ .param .u64 triton__0d1d2d3d4d5de6de_param_4,
19
+ .param .u32 triton__0d1d2d3d4d5de6de_param_5,
20
+ .param .u32 triton__0d1d2d3d4d5de6de_param_6
21
+ )
22
+ .maxntid 64, 1, 1
23
+ {
24
+ .reg .pred %p<26>;
25
+ .reg .b16 %rs<13>;
26
+ .reg .b32 %r<92>;
27
+ .reg .f32 %f<78>;
28
+ .reg .b64 %rd<14>;
29
+ .loc 1 18 0
30
+ $L__func_begin0:
31
+ .loc 1 18 0
32
+
33
+ ld.param.u64 %rd6, [triton__0d1d2d3d4d5de6de_param_0];
34
+ ld.param.u64 %rd7, [triton__0d1d2d3d4d5de6de_param_1];
35
+ $L__tmp0:
36
+ .loc 1 26 26
37
+ mov.u32 %r58, %tid.x;
38
+ and.b32 %r59, %r58, 31;
39
+ ld.param.u64 %rd8, [triton__0d1d2d3d4d5de6de_param_2];
40
+ ld.param.u64 %rd9, [triton__0d1d2d3d4d5de6de_param_3];
41
+ ld.param.u64 %rd10, [triton__0d1d2d3d4d5de6de_param_4];
42
+ shl.b32 %r60, %r58, 2;
43
+ and.b32 %r61, %r60, 252;
44
+ .loc 1 23 28
45
+ mov.u32 %r1, %ctaid.x;
46
+ .loc 1 30 40
47
+ shl.b32 %r62, %r1, 8;
48
+ .loc 1 30 36
49
+ or.b32 %r63, %r62, %r61;
50
+ .loc 1 30 30
51
+ mul.wide.s32 %rd11, %r63, 4;
52
+ add.s64 %rd1, %rd6, %rd11;
53
+ mov.b32 %r6, 0;
54
+ mov.pred %p1, -1;
55
+ .loc 1 30 46
56
+ mov.u32 %r2, 0x0;
57
+ mov.u32 %r3, 0x0;
58
+ mov.u32 %r4, 0x0;
59
+ mov.u32 %r5, 0x0;
60
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
61
+ @!%p1 mov.u32 %r2, %r6;
62
+ @!%p1 mov.u32 %r3, %r6;
63
+ @!%p1 mov.u32 %r4, %r6;
64
+ @!%p1 mov.u32 %r5, %r6;
65
+ mov.b32 %f1, %r4;
66
+ mov.b32 %f2, %r5;
67
+ .loc 1 31 30
68
+ mul.wide.s32 %rd12, %r63, 2;
69
+ add.s64 %rd2, %rd7, %rd12;
70
+ .loc 1 31 46
71
+ mov.u32 %r10, 0x0;
72
+ mov.u32 %r11, 0x0;
73
+ @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
74
+ @!%p1 mov.u32 %r10, %r6;
75
+ @!%p1 mov.u32 %r11, %r6;
76
+ cvt.u16.u32 %rs1, %r10;
77
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
78
+ cvt.u16.u32 %rs3, %r11;
79
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
80
+ .loc 1 31 67
81
+ cvt.f32.bf16 %r14, %rs1;
82
+ mov.b32 %f3, %r14;
83
+ cvt.f32.bf16 %r15, %rs2;
84
+ mov.b32 %f4, %r15;
85
+ cvt.f32.bf16 %r16, %rs3;
86
+ mov.b32 %f5, %r16;
87
+ cvt.f32.bf16 %r17, %rs4;
88
+ mov.b32 %f6, %r17;
89
+ .loc 1 32 30
90
+ add.s64 %rd3, %rd8, %rd12;
91
+ .loc 1 32 46
92
+ mov.u32 %r18, 0x0;
93
+ mov.u32 %r19, 0x0;
94
+ @%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
95
+ @!%p1 mov.u32 %r18, %r6;
96
+ @!%p1 mov.u32 %r19, %r6;
97
+ cvt.u16.u32 %rs5, %r18;
98
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
99
+ cvt.u16.u32 %rs7, %r19;
100
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
101
+ .loc 1 32 67
102
+ cvt.f32.bf16 %r22, %rs5;
103
+ mov.b32 %f7, %r22;
104
+ cvt.f32.bf16 %r23, %rs6;
105
+ mov.b32 %f8, %r23;
106
+ cvt.f32.bf16 %r24, %rs7;
107
+ mov.b32 %f9, %r24;
108
+ cvt.f32.bf16 %r25, %rs8;
109
+ mov.b32 %f10, %r25;
110
+ .loc 1 33 31
111
+ mul.wide.u32 %rd13, %r61, 4;
112
+ add.s64 %rd4, %rd9, %rd13;
113
+ .loc 1 33 36
114
+ mov.u32 %r26, 0x0;
115
+ mov.u32 %r27, 0x0;
116
+ mov.u32 %r28, 0x0;
117
+ mov.u32 %r29, 0x0;
118
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ];
119
+ @!%p1 mov.u32 %r26, %r6;
120
+ @!%p1 mov.u32 %r27, %r6;
121
+ @!%p1 mov.u32 %r28, %r6;
122
+ @!%p1 mov.u32 %r29, %r6;
123
+ .loc 1 35 18
124
+ add.f32 %f11, %f5, %f1;
125
+ add.f32 %f12, %f6, %f2;
126
+ .loc 1 30 46
127
+ mov.b32 %f13, %r3;
128
+ mov.b32 %f14, %r2;
129
+ .loc 1 35 18
130
+ add.f32 %f15, %f3, %f14;
131
+ add.f32 %f16, %f4, %f13;
132
+ .loc 1 37 18
133
+ add.f32 %f17, %f16, %f8;
134
+ add.f32 %f18, %f15, %f7;
135
+ add.f32 %f19, %f11, %f9;
136
+ add.f32 %f20, %f12, %f10;
137
+ $L__tmp1:
138
+ .loc 2 233 15
139
+ add.f32 %f21, %f18, %f17;
140
+ add.f32 %f22, %f21, %f19;
141
+ add.f32 %f23, %f22, %f20;
142
+ $L__tmp2:
143
+ .loc 2 243 36
144
+ mov.b32 %r64, %f23;
145
+ shfl.sync.bfly.b32 %r65, %r64, 16, 31, -1;
146
+ mov.b32 %f24, %r65;
147
+ $L__tmp3:
148
+ .loc 2 233 15
149
+ add.f32 %f25, %f23, %f24;
150
+ $L__tmp4:
151
+ .loc 2 243 36
152
+ mov.b32 %r66, %f25;
153
+ shfl.sync.bfly.b32 %r67, %r66, 8, 31, -1;
154
+ mov.b32 %f26, %r67;
155
+ $L__tmp5:
156
+ .loc 2 233 15
157
+ add.f32 %f27, %f25, %f26;
158
+ $L__tmp6:
159
+ .loc 2 243 36
160
+ mov.b32 %r68, %f27;
161
+ shfl.sync.bfly.b32 %r69, %r68, 4, 31, -1;
162
+ mov.b32 %f28, %r69;
163
+ $L__tmp7:
164
+ .loc 2 233 15
165
+ add.f32 %f29, %f27, %f28;
166
+ $L__tmp8:
167
+ .loc 2 243 36
168
+ mov.b32 %r70, %f29;
169
+ shfl.sync.bfly.b32 %r71, %r70, 2, 31, -1;
170
+ mov.b32 %f30, %r71;
171
+ $L__tmp9:
172
+ .loc 2 233 15
173
+ add.f32 %f31, %f29, %f30;
174
+ $L__tmp10:
175
+ .loc 2 243 36
176
+ mov.b32 %r72, %f31;
177
+ shfl.sync.bfly.b32 %r73, %r72, 1, 31, -1;
178
+ mov.b32 %f32, %r73;
179
+ $L__tmp11:
180
+ .loc 2 233 15
181
+ add.f32 %f33, %f31, %f32;
182
+ $L__tmp12:
183
+ .loc 2 243 36
184
+ setp.eq.s32 %p17, %r59, 0;
185
+ shr.u32 %r74, %r58, 3;
186
+ and.b32 %r75, %r74, 4;
187
+ mov.u32 %r76, global_smem;
188
+ add.s32 %r34, %r76, %r75;
189
+ mov.b32 %r35, %f33;
190
+ @%p17 st.shared.b32 [ %r34 + 0 ], %r35;
191
+ bar.sync 0;
192
+ setp.lt.s32 %p18, %r58, 2;
193
+ add.s32 %r37, %r76, %r60;
194
+ @%p18 ld.shared.b32 %r36, [ %r37 + 0 ];
195
+ mov.b32 %f34, %r36;
196
+ shfl.sync.bfly.b32 %r77, %r36, 1, 31, -1;
197
+ mov.b32 %f35, %r77;
198
+ $L__tmp13:
199
+ .loc 2 233 15
200
+ add.f32 %f36, %f34, %f35;
201
+ $L__tmp14:
202
+ .loc 2 243 36
203
+ and.b32 %r78, %r58, 1;
204
+ setp.eq.b32 %p24, %r78, 1;
205
+ not.pred %p25, %p24;
206
+ and.pred %p19, %p18, %p25;
207
+ mov.b32 %r39, %f36;
208
+ @%p19 st.shared.b32 [ %r37 + 0 ], %r39;
209
+ bar.sync 0;
210
+ ld.shared.f32 %f37, [global_smem];
211
+ $L__tmp15:
212
+ .loc 3 8 15
213
+ add.f32 %f38, %f37, 0f00000000;
214
+ $L__tmp16:
215
+ .loc 1 45 20
216
+ mov.b32 %r41, %f38;
217
+ mov.b32 %r42, 1132462080;
218
+ div.full.f32 %r40, %r41, %r42;
219
+ mov.b32 %f39, %r40;
220
+ .loc 1 46 19
221
+ sub.f32 %f40, %f18, %f39;
222
+ sub.f32 %f41, %f17, %f39;
223
+ sub.f32 %f42, %f19, %f39;
224
+ sub.f32 %f43, %f20, %f39;
225
+ .loc 1 47 20
226
+ mul.f32 %f44, %f41, %f41;
227
+ $L__tmp17:
228
+ .loc 2 243 36
229
+ bar.sync 0;
230
+ $L__tmp18:
231
+ .loc 2 233 15
232
+ fma.rn.f32 %f45, %f40, %f40, %f44;
233
+ fma.rn.f32 %f46, %f42, %f42, %f45;
234
+ fma.rn.f32 %f47, %f43, %f43, %f46;
235
+ $L__tmp19:
236
+ .loc 2 243 36
237
+ mov.b32 %r79, %f47;
238
+ shfl.sync.bfly.b32 %r80, %r79, 16, 31, -1;
239
+ mov.b32 %f48, %r80;
240
+ $L__tmp20:
241
+ .loc 2 233 15
242
+ add.f32 %f49, %f47, %f48;
243
+ $L__tmp21:
244
+ .loc 2 243 36
245
+ mov.b32 %r81, %f49;
246
+ shfl.sync.bfly.b32 %r82, %r81, 8, 31, -1;
247
+ mov.b32 %f50, %r82;
248
+ $L__tmp22:
249
+ .loc 2 233 15
250
+ add.f32 %f51, %f49, %f50;
251
+ $L__tmp23:
252
+ .loc 2 243 36
253
+ mov.b32 %r83, %f51;
254
+ shfl.sync.bfly.b32 %r84, %r83, 4, 31, -1;
255
+ mov.b32 %f52, %r84;
256
+ $L__tmp24:
257
+ .loc 2 233 15
258
+ add.f32 %f53, %f51, %f52;
259
+ $L__tmp25:
260
+ .loc 2 243 36
261
+ mov.b32 %r85, %f53;
262
+ shfl.sync.bfly.b32 %r86, %r85, 2, 31, -1;
263
+ mov.b32 %f54, %r86;
264
+ $L__tmp26:
265
+ .loc 2 233 15
266
+ add.f32 %f55, %f53, %f54;
267
+ $L__tmp27:
268
+ .loc 2 243 36
269
+ mov.b32 %r87, %f55;
270
+ shfl.sync.bfly.b32 %r88, %r87, 1, 31, -1;
271
+ mov.b32 %f56, %r88;
272
+ $L__tmp28:
273
+ .loc 2 233 15
274
+ add.f32 %f57, %f55, %f56;
275
+ $L__tmp29:
276
+ .loc 2 243 36
277
+ mov.b32 %r44, %f57;
278
+ @%p17 st.shared.b32 [ %r34 + 0 ], %r44;
279
+ bar.sync 0;
280
+ @%p18 ld.shared.b32 %r45, [ %r37 + 0 ];
281
+ mov.b32 %f58, %r45;
282
+ shfl.sync.bfly.b32 %r89, %r45, 1, 31, -1;
283
+ mov.b32 %f59, %r89;
284
+ $L__tmp30:
285
+ .loc 2 233 15
286
+ add.f32 %f60, %f58, %f59;
287
+ $L__tmp31:
288
+ .loc 2 243 36
289
+ mov.b32 %r48, %f60;
290
+ @%p19 st.shared.b32 [ %r37 + 0 ], %r48;
291
+ bar.sync 0;
292
+ ld.shared.f32 %f61, [global_smem];
293
+ $L__tmp32:
294
+ .loc 3 8 15
295
+ add.f32 %f62, %f61, 0f00000000;
296
+ $L__tmp33:
297
+ .loc 1 53 20
298
+ mov.b32 %r50, %f62;
299
+ div.full.f32 %r49, %r50, %r42;
300
+ mov.b32 %f63, %r49;
301
+ .loc 1 55 20
302
+ add.f32 %f64, %f63, 0f3727C5AC;
303
+ .loc 1 56 26
304
+ rsqrt.approx.ftz.f32 %f65, %f64;
305
+ .loc 1 33 36
306
+ mov.b32 %f66, %r29;
307
+ mov.b32 %f67, %r28;
308
+ mov.b32 %f68, %r27;
309
+ mov.b32 %f69, %r26;
310
+ .loc 1 57 20
311
+ mul.f32 %f70, %f40, %f65;
312
+ mul.f32 %f71, %f41, %f65;
313
+ mul.f32 %f72, %f42, %f65;
314
+ mul.f32 %f73, %f43, %f65;
315
+ .loc 1 58 20
316
+ mul.f32 %f74, %f70, %f69;
317
+ mul.f32 %f75, %f71, %f68;
318
+ mul.f32 %f76, %f72, %f67;
319
+ mul.f32 %f77, %f73, %f66;
320
+ .loc 1 60 25
321
+ add.s64 %rd5, %rd10, %rd12;
322
+ .loc 1 60 48
323
+ mov.b32 %r52, %f74;
324
+ cvt.rn.bf16.f32 %rs9, %r52;
325
+ mov.b32 %r53, %f75;
326
+ cvt.rn.bf16.f32 %rs10, %r53;
327
+ mov.b32 %r54, %f76;
328
+ cvt.rn.bf16.f32 %rs11, %r54;
329
+ mov.b32 %r55, %f77;
330
+ cvt.rn.bf16.f32 %rs12, %r55;
331
+ mov.b32 %r90, {%rs9, %rs10};
332
+ mov.b32 %r91, {%rs11, %rs12};
333
+ @%p1 st.global.v2.b32 [ %rd5 + 0 ], { %r90, %r91 };
334
+ .loc 1 60 4
335
+ ret;
336
+ $L__tmp34:
337
+ $L__func_end0:
338
+
339
+ }
340
+ // .globl __nv_rsqrtf
341
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
342
+ .param .b32 __nv_rsqrtf_param_0
343
+ )
344
+ {
345
+ .reg .f32 %f<3>;
346
+ $L__func_begin1:
347
+
348
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
349
+ rsqrt.approx.ftz.f32 %f2, %f1;
350
+ st.param.f32 [func_retval0+0], %f2;
351
+ ret;
352
+ $L__func_end1:
353
+
354
+ }
355
+ .file 1 "/tmp/torchinductor_root/do/cdohrmmhfsykzlva6pepxaa7gf7klw7w5jzorpspyaldhfg3acr2.py"
356
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
357
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
358
+ .section .debug_abbrev
359
+ {
360
+ .b8 1
361
+ .b8 17
362
+ .b8 1
363
+ .b8 37
364
+ .b8 8
365
+ .b8 19
366
+ .b8 5
367
+ .b8 3
368
+ .b8 8
369
+ .b8 16
370
+ .b8 6
371
+ .b8 27
372
+ .b8 8
373
+ .b8 180
374
+ .b8 66
375
+ .b8 12
376
+ .b8 17
377
+ .b8 1
378
+ .b8 18
379
+ .b8 1
380
+ .b8 0
381
+ .b8 0
382
+ .b8 2
383
+ .b8 46
384
+ .b8 0
385
+ .b8 135
386
+ .b8 64
387
+ .b8 8
388
+ .b8 3
389
+ .b8 8
390
+ .b8 58
391
+ .b8 11
392
+ .b8 59
393
+ .b8 11
394
+ .b8 63
395
+ .b8 12
396
+ .b8 32
397
+ .b8 11
398
+ .b8 0
399
+ .b8 0
400
+ .b8 3
401
+ .b8 46
402
+ .b8 1
403
+ .b8 17
404
+ .b8 1
405
+ .b8 18
406
+ .b8 1
407
+ .b8 64
408
+ .b8 10
409
+ .b8 49
410
+ .b8 19
411
+ .b8 0
412
+ .b8 0
413
+ .b8 4
414
+ .b8 29
415
+ .b8 1
416
+ .b8 49
417
+ .b8 19
418
+ .b8 17
419
+ .b8 1
420
+ .b8 18
421
+ .b8 1
422
+ .b8 88
423
+ .b8 11
424
+ .b8 89
425
+ .b8 11
426
+ .b8 87
427
+ .b8 11
428
+ .b8 0
429
+ .b8 0
430
+ .b8 5
431
+ .b8 29
432
+ .b8 0
433
+ .b8 49
434
+ .b8 19
435
+ .b8 17
436
+ .b8 1
437
+ .b8 18
438
+ .b8 1
439
+ .b8 88
440
+ .b8 11
441
+ .b8 89
442
+ .b8 11
443
+ .b8 87
444
+ .b8 11
445
+ .b8 0
446
+ .b8 0
447
+ .b8 0
448
+ }
449
+ .section .debug_info
450
+ {
451
+ .b32 395
452
+ .b8 2
453
+ .b8 0
454
+ .b32 .debug_abbrev
455
+ .b8 8
456
+ .b8 1
457
+ .b8 116
458
+ .b8 114
459
+ .b8 105
460
+ .b8 116
461
+ .b8 111
462
+ .b8 110
463
+ .b8 0
464
+ .b8 2
465
+ .b8 0
466
+ .b8 99
467
+ .b8 100
468
+ .b8 111
469
+ .b8 104
470
+ .b8 114
471
+ .b8 109
472
+ .b8 109
473
+ .b8 104
474
+ .b8 102
475
+ .b8 115
476
+ .b8 121
477
+ .b8 107
478
+ .b8 122
479
+ .b8 108
480
+ .b8 118
481
+ .b8 97
482
+ .b8 54
483
+ .b8 112
484
+ .b8 101
485
+ .b8 112
486
+ .b8 120
487
+ .b8 97
488
+ .b8 97
489
+ .b8 55
490
+ .b8 103
491
+ .b8 102
492
+ .b8 55
493
+ .b8 107
494
+ .b8 108
495
+ .b8 119
496
+ .b8 55
497
+ .b8 119
498
+ .b8 53
499
+ .b8 106
500
+ .b8 122
501
+ .b8 111
502
+ .b8 114
503
+ .b8 112
504
+ .b8 115
505
+ .b8 112
506
+ .b8 121
507
+ .b8 97
508
+ .b8 108
509
+ .b8 100
510
+ .b8 104
511
+ .b8 102
512
+ .b8 103
513
+ .b8 51
514
+ .b8 97
515
+ .b8 99
516
+ .b8 114
517
+ .b8 50
518
+ .b8 46
519
+ .b8 112
520
+ .b8 121
521
+ .b8 0
522
+ .b32 .debug_line
523
+ .b8 47
524
+ .b8 116
525
+ .b8 109
526
+ .b8 112
527
+ .b8 47
528
+ .b8 116
529
+ .b8 111
530
+ .b8 114
531
+ .b8 99
532
+ .b8 104
533
+ .b8 105
534
+ .b8 110
535
+ .b8 100
536
+ .b8 117
537
+ .b8 99
538
+ .b8 116
539
+ .b8 111
540
+ .b8 114
541
+ .b8 95
542
+ .b8 114
543
+ .b8 111
544
+ .b8 111
545
+ .b8 116
546
+ .b8 47
547
+ .b8 100
548
+ .b8 111
549
+ .b8 0
550
+ .b8 1
551
+ .b64 $L__func_begin0
552
+ .b64 $L__func_end0
553
+ .b8 2
554
+ .b8 116
555
+ .b8 114
556
+ .b8 105
557
+ .b8 116
558
+ .b8 111
559
+ .b8 110
560
+ .b8 95
561
+ .b8 95
562
+ .b8 48
563
+ .b8 100
564
+ .b8 49
565
+ .b8 100
566
+ .b8 50
567
+ .b8 100
568
+ .b8 51
569
+ .b8 100
570
+ .b8 52
571
+ .b8 100
572
+ .b8 53
573
+ .b8 100
574
+ .b8 101
575
+ .b8 54
576
+ .b8 100
577
+ .b8 101
578
+ .b8 0
579
+ .b8 116
580
+ .b8 114
581
+ .b8 105
582
+ .b8 116
583
+ .b8 111
584
+ .b8 110
585
+ .b8 95
586
+ .b8 95
587
+ .b8 48
588
+ .b8 100
589
+ .b8 49
590
+ .b8 100
591
+ .b8 50
592
+ .b8 100
593
+ .b8 51
594
+ .b8 100
595
+ .b8 52
596
+ .b8 100
597
+ .b8 53
598
+ .b8 100
599
+ .b8 101
600
+ .b8 54
601
+ .b8 100
602
+ .b8 101
603
+ .b8 0
604
+ .b8 1
605
+ .b8 18
606
+ .b8 1
607
+ .b8 1
608
+ .b8 3
609
+ .b64 $L__func_begin0
610
+ .b64 $L__func_end0
611
+ .b8 1
612
+ .b8 156
613
+ .b32 125
614
+ .b8 4
615
+ .b32 125
616
+ .b64 $L__tmp1
617
+ .b64 $L__tmp14
618
+ .b8 2
619
+ .b8 42
620
+ .b8 59
621
+ .b8 5
622
+ .b32 125
623
+ .b64 $L__tmp1
624
+ .b64 $L__tmp14
625
+ .b8 2
626
+ .b8 243
627
+ .b8 36
628
+ .b8 0
629
+ .b8 5
630
+ .b32 125
631
+ .b64 $L__tmp2
632
+ .b64 $L__tmp15
633
+ .b8 2
634
+ .b8 42
635
+ .b8 59
636
+ .b8 5
637
+ .b32 125
638
+ .b64 $L__tmp15
639
+ .b64 $L__tmp16
640
+ .b8 3
641
+ .b8 42
642
+ .b8 45
643
+ .b8 5
644
+ .b32 125
645
+ .b64 $L__tmp17
646
+ .b64 $L__tmp32
647
+ .b8 2
648
+ .b8 50
649
+ .b8 59
650
+ .b8 4
651
+ .b32 125
652
+ .b64 $L__tmp18
653
+ .b64 $L__tmp31
654
+ .b8 2
655
+ .b8 50
656
+ .b8 59
657
+ .b8 5
658
+ .b32 125
659
+ .b64 $L__tmp18
660
+ .b64 $L__tmp31
661
+ .b8 2
662
+ .b8 243
663
+ .b8 36
664
+ .b8 0
665
+ .b8 5
666
+ .b32 125
667
+ .b64 $L__tmp32
668
+ .b64 $L__tmp33
669
+ .b8 3
670
+ .b8 50
671
+ .b8 45
672
+ .b8 0
673
+ .b8 0
674
+ }
675
+ .section .debug_pubnames
676
+ {
677
+ .b32 $L__pubNames_end0-$L__pubNames_start0
678
+ $L__pubNames_start0:
679
+ .b8 2
680
+ .b8 0
681
+ .b32 .debug_info
682
+ .b32 399
683
+ .b32 125
684
+ .b8 116
685
+ .b8 114
686
+ .b8 105
687
+ .b8 116
688
+ .b8 111
689
+ .b8 110
690
+ .b8 95
691
+ .b8 95
692
+ .b8 48
693
+ .b8 100
694
+ .b8 49
695
+ .b8 100
696
+ .b8 50
697
+ .b8 100
698
+ .b8 51
699
+ .b8 100
700
+ .b8 52
701
+ .b8 100
702
+ .b8 53
703
+ .b8 100
704
+ .b8 101
705
+ .b8 54
706
+ .b8 100
707
+ .b8 101
708
+ .b8 0
709
+ .b32 0
710
+ $L__pubNames_end0:
711
+ }
712
+ .section .debug_pubtypes
713
+ {
714
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
715
+ $L__pubTypes_start0:
716
+ .b8 2
717
+ .b8 0
718
+ .b32 .debug_info
719
+ .b32 399
720
+ .b32 0
721
+ $L__pubTypes_end0:
722
+ }
723
+ .section .debug_loc { }