Add files using upload-large-folder tool
Browse files- .triton/dump/0471aff594c8c8b8715b81c529738739/triton_.cubin +0 -0
- .triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ptx +951 -0
- .triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttir +153 -0
- .triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.ttgir +154 -0
- .triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ptx +723 -0
.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.cubin
ADDED
Binary file (28.5 kB). View file
|
|
.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ptx
ADDED
@@ -0,0 +1,951 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6de7de
|
10 |
+
.extern .func __assertfail
|
11 |
+
(
|
12 |
+
.param .b64 __assertfail_param_0,
|
13 |
+
.param .b64 __assertfail_param_1,
|
14 |
+
.param .b32 __assertfail_param_2,
|
15 |
+
.param .b64 __assertfail_param_3,
|
16 |
+
.param .b64 __assertfail_param_4
|
17 |
+
)
|
18 |
+
;
|
19 |
+
.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
20 |
+
.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
21 |
+
.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
|
22 |
+
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
23 |
+
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
24 |
+
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
|
25 |
+
.extern .shared .align 1 .b8 global_smem[];
|
26 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
27 |
+
|
28 |
+
.visible .entry triton__0d1d2d3d4d5d6de7de(
|
29 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
|
30 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
|
31 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
|
32 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
|
33 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
|
34 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
|
35 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
|
36 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
|
37 |
+
)
|
38 |
+
.maxntid 128, 1, 1
|
39 |
+
{
|
40 |
+
.reg .pred %p<48>;
|
41 |
+
.reg .b16 %rs<13>;
|
42 |
+
.reg .b32 %r<158>;
|
43 |
+
.reg .f32 %f<164>;
|
44 |
+
.reg .b64 %rd<73>;
|
45 |
+
.loc 1 18 0
|
46 |
+
$L__func_begin0:
|
47 |
+
.loc 1 18 0
|
48 |
+
|
49 |
+
ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6de7de_param_5];
|
50 |
+
ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6de7de_param_4];
|
51 |
+
ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6de7de_param_3];
|
52 |
+
ld.param.u64 %rd30, [triton__0d1d2d3d4d5d6de7de_param_0];
|
53 |
+
ld.param.u64 %rd31, [triton__0d1d2d3d4d5d6de7de_param_1];
|
54 |
+
$L__tmp0:
|
55 |
+
.loc 1 22 44
|
56 |
+
mov.u32 %r13, %tid.x;
|
57 |
+
and.b32 %r1, %r13, 31;
|
58 |
+
ld.param.u64 %rd32, [triton__0d1d2d3d4d5d6de7de_param_2];
|
59 |
+
bfe.u32 %r14, %r13, 5, 2;
|
60 |
+
bfe.u32 %r15, %r13, 1, 4;
|
61 |
+
shl.b32 %r16, %r14, 4;
|
62 |
+
or.b32 %r2, %r16, %r15;
|
63 |
+
and.b32 %r17, %r13, 63;
|
64 |
+
.loc 1 24 33
|
65 |
+
shl.b32 %r18, %r13, 2;
|
66 |
+
and.b32 %r3, %r18, 4;
|
67 |
+
and.b32 %r4, %r13, 7;
|
68 |
+
.loc 1 31 36
|
69 |
+
shl.b32 %r5, %r14, 2;
|
70 |
+
.loc 1 21 28
|
71 |
+
mov.u32 %r11, %ctaid.x;
|
72 |
+
.loc 1 21 33
|
73 |
+
shl.b32 %r19, %r11, 6;
|
74 |
+
.loc 1 22 23
|
75 |
+
or.b32 %r20, %r19, %r2;
|
76 |
+
or.b32 %r21, %r19, %r17;
|
77 |
+
.loc 1 26 30
|
78 |
+
mul.wide.s32 %rd33, %r20, 8;
|
79 |
+
add.s64 %rd21, %rd30, %rd33;
|
80 |
+
mul.wide.s32 %rd34, %r21, 8;
|
81 |
+
add.s64 %rd29, %rd30, %rd34;
|
82 |
+
mov.pred %p1, -1;
|
83 |
+
.loc 1 26 35
|
84 |
+
mov.u64 %rd20, 0x0;
|
85 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ];
|
86 |
+
mov.u64 %rd22, 0x0;
|
87 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd21 + 0 ];
|
88 |
+
mov.u64 %rd24, 0x0;
|
89 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd21 + 0 ];
|
90 |
+
mov.u64 %rd26, 0x0;
|
91 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd26 }, [ %rd21 + 0 ];
|
92 |
+
mov.u64 %rd28, 0x0;
|
93 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd28 }, [ %rd29 + 0 ];
|
94 |
+
.loc 1 27 18
|
95 |
+
bfe.s32 %r22, %r11, 25, 1;
|
96 |
+
shr.u32 %r23, %r22, 23;
|
97 |
+
add.s32 %r24, %r20, %r23;
|
98 |
+
and.b32 %r25, %r24, 16776704;
|
99 |
+
sub.s32 %r26, %r20, %r25;
|
100 |
+
.loc 1 35 44
|
101 |
+
shl.b32 %r27, %r26, 8;
|
102 |
+
.loc 1 37 22
|
103 |
+
add.s64 %rd35, %rd28, 50257;
|
104 |
+
.loc 1 38 22
|
105 |
+
setp.lt.s64 %p6, %rd20, 0;
|
106 |
+
setp.lt.s64 %p7, %rd28, 0;
|
107 |
+
.loc 1 39 36
|
108 |
+
selp.b64 %rd1, %rd35, %rd28, %p7;
|
109 |
+
.loc 1 41 44
|
110 |
+
shl.b64 %rd36, %rd20, 8;
|
111 |
+
add.s64 %rd37, %rd36, 12865792;
|
112 |
+
selp.b64 %rd38, %rd37, %rd36, %p6;
|
113 |
+
.loc 1 31 36
|
114 |
+
and.b32 %r28, %r13, 1;
|
115 |
+
mul.wide.u32 %rd2, %r28, 16;
|
116 |
+
shl.b64 %rd39, %rd38, 2;
|
117 |
+
or.b64 %rd40, %rd2, %rd39;
|
118 |
+
add.s64 %rd72, %rd31, %rd40;
|
119 |
+
shl.b32 %r29, %r11, 14;
|
120 |
+
shl.b32 %r30, %r14, 12;
|
121 |
+
or.b32 %r31, %r29, %r30;
|
122 |
+
shl.b32 %r32, %r15, 8;
|
123 |
+
or.b32 %r33, %r31, %r32;
|
124 |
+
or.b32 %r6, %r33, %r3;
|
125 |
+
or.b32 %r34, %r27, %r3;
|
126 |
+
mul.wide.s32 %rd41, %r34, 4;
|
127 |
+
add.s64 %rd70, %rd32, %rd41;
|
128 |
+
mov.f32 %f148, 0f00000000;
|
129 |
+
mov.b32 %r156, -8;
|
130 |
+
mov.u64 %rd68, %rd70;
|
131 |
+
mov.u64 %rd69, %rd72;
|
132 |
+
mov.f32 %f149, %f148;
|
133 |
+
mov.f32 %f150, %f148;
|
134 |
+
mov.f32 %f151, %f148;
|
135 |
+
mov.f32 %f152, %f148;
|
136 |
+
mov.f32 %f153, %f148;
|
137 |
+
mov.f32 %f154, %f148;
|
138 |
+
mov.f32 %f155, %f148;
|
139 |
+
mov.f32 %f156, %f148;
|
140 |
+
mov.f32 %f157, %f148;
|
141 |
+
mov.f32 %f158, %f148;
|
142 |
+
mov.f32 %f159, %f148;
|
143 |
+
mov.f32 %f160, %f148;
|
144 |
+
mov.f32 %f161, %f148;
|
145 |
+
mov.f32 %f162, %f148;
|
146 |
+
mov.f32 %f163, %f148;
|
147 |
+
bra.uni $L__BB0_1;
|
148 |
+
$L__BB0_3:
|
149 |
+
.loc 1 0 0
|
150 |
+
mov.b32 %f17, %r35;
|
151 |
+
mov.b32 %f18, %r36;
|
152 |
+
mov.b32 %f19, %r37;
|
153 |
+
mov.b32 %f20, %r38;
|
154 |
+
cvt.u16.u32 %rs1, %r43;
|
155 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r43; }
|
156 |
+
cvt.u16.u32 %rs3, %r44;
|
157 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r44; }
|
158 |
+
cvt.f32.bf16 %r47, %rs1;
|
159 |
+
mov.b32 %f21, %r47;
|
160 |
+
cvt.f32.bf16 %r48, %rs2;
|
161 |
+
mov.b32 %f22, %r48;
|
162 |
+
cvt.f32.bf16 %r49, %rs3;
|
163 |
+
mov.b32 %f23, %r49;
|
164 |
+
cvt.f32.bf16 %r50, %rs4;
|
165 |
+
mov.b32 %f24, %r50;
|
166 |
+
.loc 1 41 52
|
167 |
+
mov.u32 %r54, 0x0;
|
168 |
+
mov.u32 %r55, 0x0;
|
169 |
+
mov.u32 %r56, 0x0;
|
170 |
+
mov.u32 %r57, 0x0;
|
171 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r54, %r55, %r56, %r57 }, [ %rd69 + 0 ];
|
172 |
+
@!%p1 mov.u32 %r54, %r143;
|
173 |
+
@!%p1 mov.u32 %r55, %r143;
|
174 |
+
@!%p1 mov.u32 %r56, %r143;
|
175 |
+
@!%p1 mov.u32 %r57, %r143;
|
176 |
+
mov.b32 %f56, %r54;
|
177 |
+
mov.b32 %f57, %r55;
|
178 |
+
mov.b32 %f58, %r56;
|
179 |
+
mov.b32 %f59, %r57;
|
180 |
+
.loc 1 42 22
|
181 |
+
add.f32 %f60, %f17, %f56;
|
182 |
+
add.f32 %f61, %f18, %f57;
|
183 |
+
add.f32 %f62, %f19, %f58;
|
184 |
+
add.f32 %f63, %f20, %f59;
|
185 |
+
.loc 1 44 22
|
186 |
+
add.f32 %f64, %f21, %f60;
|
187 |
+
add.f32 %f65, %f22, %f61;
|
188 |
+
add.f32 %f66, %f23, %f62;
|
189 |
+
add.f32 %f67, %f24, %f63;
|
190 |
+
$L__tmp1:
|
191 |
+
.loc 2 96 20
|
192 |
+
sub.f32 %f68, %f64, %f160;
|
193 |
+
sub.f32 %f69, %f65, %f161;
|
194 |
+
sub.f32 %f70, %f66, %f162;
|
195 |
+
sub.f32 %f71, %f67, %f163;
|
196 |
+
.loc 2 97 26
|
197 |
+
add.f32 %f148, %f148, 0f3F800000;
|
198 |
+
add.f32 %f149, %f149, 0f3F800000;
|
199 |
+
add.f32 %f150, %f150, 0f3F800000;
|
200 |
+
add.f32 %f151, %f151, 0f3F800000;
|
201 |
+
add.f32 %f152, %f152, 0f3F800000;
|
202 |
+
add.f32 %f153, %f153, 0f3F800000;
|
203 |
+
add.f32 %f154, %f154, 0f3F800000;
|
204 |
+
add.f32 %f155, %f155, 0f3F800000;
|
205 |
+
.loc 2 98 30
|
206 |
+
mov.b32 %r63, %f68;
|
207 |
+
mov.b32 %r64, %f148;
|
208 |
+
div.full.f32 %r62, %r63, %r64;
|
209 |
+
mov.b32 %f72, %r62;
|
210 |
+
mov.b32 %r66, %f69;
|
211 |
+
mov.b32 %r67, %f149;
|
212 |
+
div.full.f32 %r65, %r66, %r67;
|
213 |
+
mov.b32 %f73, %r65;
|
214 |
+
mov.b32 %r69, %f70;
|
215 |
+
mov.b32 %r70, %f150;
|
216 |
+
div.full.f32 %r68, %r69, %r70;
|
217 |
+
mov.b32 %f74, %r68;
|
218 |
+
mov.b32 %r72, %f71;
|
219 |
+
mov.b32 %r73, %f151;
|
220 |
+
div.full.f32 %r71, %r72, %r73;
|
221 |
+
mov.b32 %f75, %r71;
|
222 |
+
.loc 2 98 22
|
223 |
+
add.f32 %f160, %f160, %f72;
|
224 |
+
add.f32 %f161, %f161, %f73;
|
225 |
+
add.f32 %f162, %f162, %f74;
|
226 |
+
add.f32 %f163, %f163, %f75;
|
227 |
+
.loc 2 101 30
|
228 |
+
sub.f32 %f76, %f64, %f160;
|
229 |
+
sub.f32 %f77, %f65, %f161;
|
230 |
+
sub.f32 %f78, %f66, %f162;
|
231 |
+
sub.f32 %f79, %f67, %f163;
|
232 |
+
$L__tmp2:
|
233 |
+
.loc 1 50 50
|
234 |
+
fma.rn.f32 %f156, %f68, %f76, %f156;
|
235 |
+
fma.rn.f32 %f157, %f69, %f77, %f157;
|
236 |
+
fma.rn.f32 %f158, %f70, %f78, %f158;
|
237 |
+
fma.rn.f32 %f159, %f71, %f79, %f159;
|
238 |
+
.loc 1 31 36
|
239 |
+
add.s32 %r156, %r156, 8;
|
240 |
+
add.s64 %rd69, %rd69, 32;
|
241 |
+
add.s64 %rd68, %rd68, 32;
|
242 |
+
setp.lt.u32 %p22, %r156, 248;
|
243 |
+
@%p22 bra $L__BB0_1;
|
244 |
+
bra.uni $L__BB0_4;
|
245 |
+
$L__BB0_1:
|
246 |
+
.loc 1 40 40
|
247 |
+
setp.lt.u64 %p16, %rd1, 50257;
|
248 |
+
mov.b32 %r143, 0;
|
249 |
+
.loc 1 35 50
|
250 |
+
mov.u32 %r35, 0x0;
|
251 |
+
mov.u32 %r36, 0x0;
|
252 |
+
mov.u32 %r37, 0x0;
|
253 |
+
mov.u32 %r38, 0x0;
|
254 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd68 + 0 ];
|
255 |
+
@!%p1 mov.u32 %r35, %r143;
|
256 |
+
@!%p1 mov.u32 %r36, %r143;
|
257 |
+
@!%p1 mov.u32 %r37, %r143;
|
258 |
+
@!%p1 mov.u32 %r38, %r143;
|
259 |
+
.loc 1 36 34
|
260 |
+
add.s32 %r51, %r6, %r156;
|
261 |
+
add.s32 %r52, %r51, 8;
|
262 |
+
mul.wide.s32 %rd44, %r52, 2;
|
263 |
+
add.s64 %rd43, %rd17, %rd44;
|
264 |
+
.loc 1 36 50
|
265 |
+
mov.u32 %r43, 0x0;
|
266 |
+
mov.u32 %r44, 0x0;
|
267 |
+
@%p1 ld.global.L1::evict_last.v2.b32 { %r43, %r44 }, [ %rd43 + 0 ];
|
268 |
+
@!%p1 mov.u32 %r43, %r143;
|
269 |
+
@!%p1 mov.u32 %r44, %r143;
|
270 |
+
mov.b32 %r155, 883;
|
271 |
+
mov.u64 %rd67, 1;
|
272 |
+
.loc 1 40 55
|
273 |
+
@%p16 bra $L__BB0_3;
|
274 |
+
mov.u64 %rd45, assertMessage_0;
|
275 |
+
cvta.global.u64 %rd46, %rd45;
|
276 |
+
mov.u64 %rd47, assertFile_0;
|
277 |
+
cvta.global.u64 %rd48, %rd47;
|
278 |
+
mov.u64 %rd49, assertFunc_0;
|
279 |
+
cvta.global.u64 %rd50, %rd49;
|
280 |
+
{ // callseq 2, 0
|
281 |
+
.reg .b32 temp_param_reg;
|
282 |
+
.param .b64 param0;
|
283 |
+
st.param.b64 [param0+0], %rd46;
|
284 |
+
.param .b64 param1;
|
285 |
+
st.param.b64 [param1+0], %rd48;
|
286 |
+
.param .b32 param2;
|
287 |
+
st.param.b32 [param2+0], %r155;
|
288 |
+
.param .b64 param3;
|
289 |
+
st.param.b64 [param3+0], %rd50;
|
290 |
+
.param .b64 param4;
|
291 |
+
st.param.b64 [param4+0], %rd67;
|
292 |
+
call.uni
|
293 |
+
__assertfail,
|
294 |
+
(
|
295 |
+
param0,
|
296 |
+
param1,
|
297 |
+
param2,
|
298 |
+
param3,
|
299 |
+
param4
|
300 |
+
);
|
301 |
+
} // callseq 2
|
302 |
+
bra.uni $L__BB0_3;
|
303 |
+
$L__BB0_4:
|
304 |
+
.loc 1 31 36
|
305 |
+
shr.u32 %r99, %r1, 3;
|
306 |
+
or.b32 %r100, %r5, %r99;
|
307 |
+
mad.lo.s32 %r101, %r100, 12, %r4;
|
308 |
+
shl.b32 %r102, %r101, 2;
|
309 |
+
mov.u32 %r103, global_smem;
|
310 |
+
add.s32 %r104, %r103, %r102;
|
311 |
+
st.shared.f32 [%r104], %f152;
|
312 |
+
st.shared.f32 [%r104+768], %f153;
|
313 |
+
st.shared.f32 [%r104+1536], %f154;
|
314 |
+
st.shared.f32 [%r104+2304], %f155;
|
315 |
+
bar.sync 0;
|
316 |
+
mad.lo.s32 %r105, %r2, 12, %r3;
|
317 |
+
shl.b32 %r106, %r105, 2;
|
318 |
+
add.s32 %r107, %r103, %r106;
|
319 |
+
ld.shared.v4.f32 {%f80, %f81, %f82, %f83}, [%r107];
|
320 |
+
$L__tmp3:
|
321 |
+
.loc 2 108 21
|
322 |
+
sub.f32 %f84, %f161, %f160;
|
323 |
+
.loc 2 109 28
|
324 |
+
add.f32 %f85, %f80, %f81;
|
325 |
+
.loc 2 110 39
|
326 |
+
setp.eq.f32 %p23, %f85, 0f00000000;
|
327 |
+
.loc 2 110 60
|
328 |
+
mov.b32 %r75, %f81;
|
329 |
+
mov.b32 %r76, %f85;
|
330 |
+
div.full.f32 %r74, %r75, %r76;
|
331 |
+
mov.b32 %f86, %r74;
|
332 |
+
.loc 2 110 49
|
333 |
+
selp.f32 %f87, 0f00000000, %f86, %p23;
|
334 |
+
.loc 2 112 17
|
335 |
+
fma.rn.f32 %f88, %f84, %f87, %f160;
|
336 |
+
.loc 2 113 15
|
337 |
+
add.f32 %f89, %f156, %f157;
|
338 |
+
.loc 2 113 30
|
339 |
+
mul.f32 %f90, %f84, %f84;
|
340 |
+
.loc 2 113 38
|
341 |
+
mul.f32 %f91, %f90, %f80;
|
342 |
+
.loc 2 113 22
|
343 |
+
fma.rn.f32 %f92, %f91, %f87, %f89;
|
344 |
+
.loc 2 108 21
|
345 |
+
sub.f32 %f93, %f162, %f88;
|
346 |
+
.loc 2 109 28
|
347 |
+
add.f32 %f94, %f82, %f85;
|
348 |
+
.loc 2 110 39
|
349 |
+
setp.eq.f32 %p24, %f94, 0f00000000;
|
350 |
+
.loc 2 110 60
|
351 |
+
mov.b32 %r79, %f94;
|
352 |
+
mov.b32 %r78, %f82;
|
353 |
+
div.full.f32 %r77, %r78, %r79;
|
354 |
+
mov.b32 %f95, %r77;
|
355 |
+
.loc 2 110 49
|
356 |
+
selp.f32 %f96, 0f00000000, %f95, %p24;
|
357 |
+
.loc 2 112 17
|
358 |
+
fma.rn.f32 %f97, %f96, %f93, %f88;
|
359 |
+
.loc 2 113 15
|
360 |
+
add.f32 %f98, %f158, %f92;
|
361 |
+
.loc 2 113 30
|
362 |
+
mul.f32 %f99, %f93, %f93;
|
363 |
+
.loc 2 113 38
|
364 |
+
mul.f32 %f100, %f85, %f99;
|
365 |
+
.loc 2 113 22
|
366 |
+
fma.rn.f32 %f101, %f96, %f100, %f98;
|
367 |
+
.loc 2 108 21
|
368 |
+
sub.f32 %f102, %f163, %f97;
|
369 |
+
.loc 2 109 28
|
370 |
+
add.f32 %f103, %f83, %f94;
|
371 |
+
.loc 2 110 39
|
372 |
+
setp.eq.f32 %p25, %f103, 0f00000000;
|
373 |
+
.loc 2 110 60
|
374 |
+
mov.b32 %r82, %f103;
|
375 |
+
mov.b32 %r81, %f83;
|
376 |
+
div.full.f32 %r80, %r81, %r82;
|
377 |
+
mov.b32 %f104, %r80;
|
378 |
+
.loc 2 110 49
|
379 |
+
selp.f32 %f105, 0f00000000, %f104, %p25;
|
380 |
+
.loc 2 112 17
|
381 |
+
fma.rn.f32 %f106, %f105, %f102, %f97;
|
382 |
+
.loc 2 113 15
|
383 |
+
add.f32 %f107, %f159, %f101;
|
384 |
+
.loc 2 113 30
|
385 |
+
mul.f32 %f108, %f102, %f102;
|
386 |
+
.loc 2 113 38
|
387 |
+
mul.f32 %f109, %f94, %f108;
|
388 |
+
.loc 2 113 22
|
389 |
+
fma.rn.f32 %f110, %f105, %f109, %f107;
|
390 |
+
$L__tmp4:
|
391 |
+
.loc 2 120 46
|
392 |
+
mov.b32 %r108, %f106;
|
393 |
+
shfl.sync.bfly.b32 %r109, %r108, 1, 31, -1;
|
394 |
+
mov.b32 %f111, %r109;
|
395 |
+
mov.b32 %r110, %f110;
|
396 |
+
shfl.sync.bfly.b32 %r111, %r110, 1, 31, -1;
|
397 |
+
mov.b32 %f112, %r111;
|
398 |
+
shfl.sync.bfly.b32 %r84, %r82, 1, 31, -1;
|
399 |
+
mov.b32 %f113, %r84;
|
400 |
+
$L__tmp5:
|
401 |
+
.loc 2 108 21
|
402 |
+
sub.f32 %f114, %f111, %f106;
|
403 |
+
.loc 2 109 28
|
404 |
+
add.f32 %f115, %f103, %f113;
|
405 |
+
.loc 2 110 39
|
406 |
+
setp.eq.f32 %p26, %f115, 0f00000000;
|
407 |
+
.loc 2 110 60
|
408 |
+
mov.b32 %r85, %f115;
|
409 |
+
div.full.f32 %r83, %r84, %r85;
|
410 |
+
mov.b32 %f116, %r83;
|
411 |
+
.loc 2 110 49
|
412 |
+
selp.f32 %f117, 0f00000000, %f116, %p26;
|
413 |
+
.loc 2 112 17
|
414 |
+
fma.rn.f32 %f41, %f117, %f114, %f106;
|
415 |
+
.loc 2 113 15
|
416 |
+
add.f32 %f118, %f110, %f112;
|
417 |
+
.loc 2 113 30
|
418 |
+
mul.f32 %f119, %f114, %f114;
|
419 |
+
.loc 2 113 38
|
420 |
+
mul.f32 %f120, %f103, %f119;
|
421 |
+
.loc 2 113 22
|
422 |
+
fma.rn.f32 %f121, %f117, %f120, %f118;
|
423 |
+
$L__tmp6:
|
424 |
+
.loc 1 75 24
|
425 |
+
mov.b32 %r87, %f121;
|
426 |
+
mov.b32 %r88, 1132462080;
|
427 |
+
div.full.f32 %r86, %r87, %r88;
|
428 |
+
mov.b32 %f122, %r86;
|
429 |
+
.loc 1 77 24
|
430 |
+
add.f32 %f42, %f122, 0f3727C5AC;
|
431 |
+
.loc 1 58 36
|
432 |
+
add.s64 %rd71, %rd18, %rd2;
|
433 |
+
mov.b32 %r157, -8;
|
434 |
+
rsqrt.approx.ftz.f32 %f139, %f42;
|
435 |
+
bra.uni $L__BB0_5;
|
436 |
+
$L__BB0_7:
|
437 |
+
.loc 1 0 0
|
438 |
+
mov.b32 %f43, %r112;
|
439 |
+
mov.b32 %f44, %r113;
|
440 |
+
mov.b32 %f45, %r114;
|
441 |
+
mov.b32 %f46, %r115;
|
442 |
+
cvt.s64.s32 %rd13, %r137;
|
443 |
+
mov.b32 %f47, %r124;
|
444 |
+
mov.b32 %f48, %r125;
|
445 |
+
mov.b32 %f49, %r126;
|
446 |
+
mov.b32 %f50, %r127;
|
447 |
+
mov.b32 %f51, %r128;
|
448 |
+
mov.b32 %f52, %r129;
|
449 |
+
mov.b32 %f53, %r130;
|
450 |
+
mov.b32 %f54, %r131;
|
451 |
+
.loc 1 69 54
|
452 |
+
mov.u32 %r139, 0x0;
|
453 |
+
mov.u32 %r140, 0x0;
|
454 |
+
mov.u32 %r141, 0x0;
|
455 |
+
mov.u32 %r142, 0x0;
|
456 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r139, %r140, %r141, %r142 }, [ %rd72 + 0 ];
|
457 |
+
@!%p1 mov.u32 %r139, %r143;
|
458 |
+
@!%p1 mov.u32 %r140, %r143;
|
459 |
+
@!%p1 mov.u32 %r141, %r143;
|
460 |
+
@!%p1 mov.u32 %r142, %r143;
|
461 |
+
mov.b32 %f123, %r139;
|
462 |
+
mov.b32 %f124, %r140;
|
463 |
+
mov.b32 %f125, %r141;
|
464 |
+
mov.b32 %f126, %r142;
|
465 |
+
.loc 1 70 24
|
466 |
+
add.f32 %f127, %f43, %f123;
|
467 |
+
add.f32 %f128, %f44, %f124;
|
468 |
+
add.f32 %f129, %f45, %f125;
|
469 |
+
add.f32 %f130, %f46, %f126;
|
470 |
+
.loc 1 72 24
|
471 |
+
add.f32 %f131, %f47, %f127;
|
472 |
+
add.f32 %f132, %f48, %f128;
|
473 |
+
add.f32 %f133, %f49, %f129;
|
474 |
+
add.f32 %f134, %f50, %f130;
|
475 |
+
.loc 1 73 24
|
476 |
+
sub.f32 %f135, %f131, %f41;
|
477 |
+
sub.f32 %f136, %f132, %f41;
|
478 |
+
sub.f32 %f137, %f133, %f41;
|
479 |
+
sub.f32 %f138, %f134, %f41;
|
480 |
+
.loc 1 79 24
|
481 |
+
mul.f32 %f140, %f135, %f139;
|
482 |
+
mul.f32 %f141, %f136, %f139;
|
483 |
+
mul.f32 %f142, %f137, %f139;
|
484 |
+
mul.f32 %f143, %f138, %f139;
|
485 |
+
.loc 1 80 24
|
486 |
+
mul.f32 %f144, %f140, %f51;
|
487 |
+
mul.f32 %f145, %f141, %f52;
|
488 |
+
mul.f32 %f146, %f142, %f53;
|
489 |
+
mul.f32 %f147, %f143, %f54;
|
490 |
+
.loc 1 82 29
|
491 |
+
shl.b64 %rd66, %rd13, 1;
|
492 |
+
add.s64 %rd65, %rd19, %rd66;
|
493 |
+
.loc 1 82 52
|
494 |
+
mov.b32 %r147, %f144;
|
495 |
+
cvt.rn.bf16.f32 %rs9, %r147;
|
496 |
+
mov.b32 %r148, %f145;
|
497 |
+
cvt.rn.bf16.f32 %rs10, %r148;
|
498 |
+
mov.b32 %r149, %f146;
|
499 |
+
cvt.rn.bf16.f32 %rs11, %r149;
|
500 |
+
mov.b32 %r150, %f147;
|
501 |
+
cvt.rn.bf16.f32 %rs12, %r150;
|
502 |
+
mov.b32 %r153, {%rs9, %rs10};
|
503 |
+
mov.b32 %r154, {%rs11, %rs12};
|
504 |
+
@%p1 st.global.v2.b32 [ %rd65 + 0 ], { %r153, %r154 };
|
505 |
+
.loc 1 58 36
|
506 |
+
add.s32 %r157, %r157, 8;
|
507 |
+
add.s64 %rd72, %rd72, 32;
|
508 |
+
add.s64 %rd71, %rd71, 32;
|
509 |
+
add.s64 %rd70, %rd70, 32;
|
510 |
+
setp.lt.u32 %p47, %r157, 248;
|
511 |
+
@%p47 bra $L__BB0_5;
|
512 |
+
bra.uni $L__BB0_8;
|
513 |
+
$L__BB0_5:
|
514 |
+
.loc 1 62 51
|
515 |
+
mov.u32 %r112, 0x0;
|
516 |
+
mov.u32 %r113, 0x0;
|
517 |
+
mov.u32 %r114, 0x0;
|
518 |
+
mov.u32 %r115, 0x0;
|
519 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r112, %r113, %r114, %r115 }, [ %rd70 + 0 ];
|
520 |
+
@!%p1 mov.u32 %r112, %r143;
|
521 |
+
@!%p1 mov.u32 %r113, %r143;
|
522 |
+
@!%p1 mov.u32 %r114, %r143;
|
523 |
+
@!%p1 mov.u32 %r115, %r143;
|
524 |
+
.loc 1 63 35
|
525 |
+
add.s32 %r136, %r6, %r157;
|
526 |
+
add.s32 %r137, %r136, 8;
|
527 |
+
mul.wide.s32 %rd56, %r137, 2;
|
528 |
+
add.s64 %rd54, %rd17, %rd56;
|
529 |
+
.loc 1 63 51
|
530 |
+
mov.u32 %r120, 0x0;
|
531 |
+
mov.u32 %r121, 0x0;
|
532 |
+
@%p1 ld.global.L1::evict_first.v2.b32 { %r120, %r121 }, [ %rd54 + 0 ];
|
533 |
+
@!%p1 mov.u32 %r120, %r143;
|
534 |
+
@!%p1 mov.u32 %r121, %r143;
|
535 |
+
cvt.u16.u32 %rs5, %r120;
|
536 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r120; }
|
537 |
+
cvt.u16.u32 %rs7, %r121;
|
538 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r121; }
|
539 |
+
.loc 1 63 103
|
540 |
+
cvt.f32.bf16 %r124, %rs5;
|
541 |
+
cvt.f32.bf16 %r125, %rs6;
|
542 |
+
cvt.f32.bf16 %r126, %rs7;
|
543 |
+
cvt.f32.bf16 %r127, %rs8;
|
544 |
+
.loc 1 64 40
|
545 |
+
mov.u32 %r128, 0x0;
|
546 |
+
mov.u32 %r129, 0x0;
|
547 |
+
mov.u32 %r130, 0x0;
|
548 |
+
mov.u32 %r131, 0x0;
|
549 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r128, %r129, %r130, %r131 }, [ %rd71 + 0 ];
|
550 |
+
@!%p1 mov.u32 %r128, %r143;
|
551 |
+
@!%p1 mov.u32 %r129, %r143;
|
552 |
+
@!%p1 mov.u32 %r130, %r143;
|
553 |
+
@!%p1 mov.u32 %r131, %r143;
|
554 |
+
.loc 1 68 57
|
555 |
+
@%p16 bra $L__BB0_7;
|
556 |
+
mov.u64 %rd57, assertMessage_1;
|
557 |
+
cvta.global.u64 %rd58, %rd57;
|
558 |
+
mov.u64 %rd59, assertFile_1;
|
559 |
+
cvta.global.u64 %rd60, %rd59;
|
560 |
+
mov.u64 %rd61, assertFunc_1;
|
561 |
+
cvta.global.u64 %rd62, %rd61;
|
562 |
+
{ // callseq 3, 0
|
563 |
+
.reg .b32 temp_param_reg;
|
564 |
+
.param .b64 param0;
|
565 |
+
st.param.b64 [param0+0], %rd58;
|
566 |
+
.param .b64 param1;
|
567 |
+
st.param.b64 [param1+0], %rd60;
|
568 |
+
.param .b32 param2;
|
569 |
+
st.param.b32 [param2+0], %r155;
|
570 |
+
.param .b64 param3;
|
571 |
+
st.param.b64 [param3+0], %rd62;
|
572 |
+
.param .b64 param4;
|
573 |
+
st.param.b64 [param4+0], %rd67;
|
574 |
+
call.uni
|
575 |
+
__assertfail,
|
576 |
+
(
|
577 |
+
param0,
|
578 |
+
param1,
|
579 |
+
param2,
|
580 |
+
param3,
|
581 |
+
param4
|
582 |
+
);
|
583 |
+
} // callseq 3
|
584 |
+
bra.uni $L__BB0_7;
|
585 |
+
$L__BB0_8:
|
586 |
+
.loc 1 58 4
|
587 |
+
ret;
|
588 |
+
$L__tmp7:
|
589 |
+
$L__func_end0:
|
590 |
+
|
591 |
+
}
|
592 |
+
// .globl __nv_rsqrtf
|
593 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
594 |
+
.param .b32 __nv_rsqrtf_param_0
|
595 |
+
)
|
596 |
+
{
|
597 |
+
.reg .f32 %f<3>;
|
598 |
+
$L__func_begin1:
|
599 |
+
|
600 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
601 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
602 |
+
st.param.f32 [func_retval0+0], %f2;
|
603 |
+
ret;
|
604 |
+
$L__func_end1:
|
605 |
+
|
606 |
+
}
|
607 |
+
.file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py"
|
608 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
609 |
+
.section .debug_abbrev
|
610 |
+
{
|
611 |
+
.b8 1
|
612 |
+
.b8 17
|
613 |
+
.b8 1
|
614 |
+
.b8 37
|
615 |
+
.b8 8
|
616 |
+
.b8 19
|
617 |
+
.b8 5
|
618 |
+
.b8 3
|
619 |
+
.b8 8
|
620 |
+
.b8 16
|
621 |
+
.b8 6
|
622 |
+
.b8 27
|
623 |
+
.b8 8
|
624 |
+
.b8 180
|
625 |
+
.b8 66
|
626 |
+
.b8 12
|
627 |
+
.b8 17
|
628 |
+
.b8 1
|
629 |
+
.b8 18
|
630 |
+
.b8 1
|
631 |
+
.b8 0
|
632 |
+
.b8 0
|
633 |
+
.b8 2
|
634 |
+
.b8 46
|
635 |
+
.b8 0
|
636 |
+
.b8 135
|
637 |
+
.b8 64
|
638 |
+
.b8 8
|
639 |
+
.b8 3
|
640 |
+
.b8 8
|
641 |
+
.b8 58
|
642 |
+
.b8 11
|
643 |
+
.b8 59
|
644 |
+
.b8 11
|
645 |
+
.b8 63
|
646 |
+
.b8 12
|
647 |
+
.b8 32
|
648 |
+
.b8 11
|
649 |
+
.b8 0
|
650 |
+
.b8 0
|
651 |
+
.b8 3
|
652 |
+
.b8 46
|
653 |
+
.b8 1
|
654 |
+
.b8 17
|
655 |
+
.b8 1
|
656 |
+
.b8 18
|
657 |
+
.b8 1
|
658 |
+
.b8 64
|
659 |
+
.b8 10
|
660 |
+
.b8 49
|
661 |
+
.b8 19
|
662 |
+
.b8 0
|
663 |
+
.b8 0
|
664 |
+
.b8 4
|
665 |
+
.b8 29
|
666 |
+
.b8 0
|
667 |
+
.b8 49
|
668 |
+
.b8 19
|
669 |
+
.b8 17
|
670 |
+
.b8 1
|
671 |
+
.b8 18
|
672 |
+
.b8 1
|
673 |
+
.b8 88
|
674 |
+
.b8 11
|
675 |
+
.b8 89
|
676 |
+
.b8 11
|
677 |
+
.b8 87
|
678 |
+
.b8 11
|
679 |
+
.b8 0
|
680 |
+
.b8 0
|
681 |
+
.b8 5
|
682 |
+
.b8 29
|
683 |
+
.b8 1
|
684 |
+
.b8 49
|
685 |
+
.b8 19
|
686 |
+
.b8 17
|
687 |
+
.b8 1
|
688 |
+
.b8 18
|
689 |
+
.b8 1
|
690 |
+
.b8 88
|
691 |
+
.b8 11
|
692 |
+
.b8 89
|
693 |
+
.b8 11
|
694 |
+
.b8 87
|
695 |
+
.b8 11
|
696 |
+
.b8 0
|
697 |
+
.b8 0
|
698 |
+
.b8 0
|
699 |
+
}
|
700 |
+
.section .debug_info
|
701 |
+
{
|
702 |
+
.b32 302
|
703 |
+
.b8 2
|
704 |
+
.b8 0
|
705 |
+
.b32 .debug_abbrev
|
706 |
+
.b8 8
|
707 |
+
.b8 1
|
708 |
+
.b8 116
|
709 |
+
.b8 114
|
710 |
+
.b8 105
|
711 |
+
.b8 116
|
712 |
+
.b8 111
|
713 |
+
.b8 110
|
714 |
+
.b8 0
|
715 |
+
.b8 2
|
716 |
+
.b8 0
|
717 |
+
.b8 99
|
718 |
+
.b8 112
|
719 |
+
.b8 110
|
720 |
+
.b8 51
|
721 |
+
.b8 108
|
722 |
+
.b8 97
|
723 |
+
.b8 119
|
724 |
+
.b8 103
|
725 |
+
.b8 54
|
726 |
+
.b8 53
|
727 |
+
.b8 108
|
728 |
+
.b8 112
|
729 |
+
.b8 105
|
730 |
+
.b8 54
|
731 |
+
.b8 51
|
732 |
+
.b8 103
|
733 |
+
.b8 118
|
734 |
+
.b8 54
|
735 |
+
.b8 99
|
736 |
+
.b8 54
|
737 |
+
.b8 112
|
738 |
+
.b8 110
|
739 |
+
.b8 52
|
740 |
+
.b8 111
|
741 |
+
.b8 105
|
742 |
+
.b8 107
|
743 |
+
.b8 104
|
744 |
+
.b8 103
|
745 |
+
.b8 54
|
746 |
+
.b8 113
|
747 |
+
.b8 118
|
748 |
+
.b8 97
|
749 |
+
.b8 50
|
750 |
+
.b8 104
|
751 |
+
.b8 50
|
752 |
+
.b8 113
|
753 |
+
.b8 106
|
754 |
+
.b8 100
|
755 |
+
.b8 112
|
756 |
+
.b8 120
|
757 |
+
.b8 101
|
758 |
+
.b8 54
|
759 |
+
.b8 113
|
760 |
+
.b8 106
|
761 |
+
.b8 52
|
762 |
+
.b8 108
|
763 |
+
.b8 118
|
764 |
+
.b8 116
|
765 |
+
.b8 116
|
766 |
+
.b8 119
|
767 |
+
.b8 101
|
768 |
+
.b8 122
|
769 |
+
.b8 46
|
770 |
+
.b8 112
|
771 |
+
.b8 121
|
772 |
+
.b8 0
|
773 |
+
.b32 .debug_line
|
774 |
+
.b8 47
|
775 |
+
.b8 116
|
776 |
+
.b8 109
|
777 |
+
.b8 112
|
778 |
+
.b8 47
|
779 |
+
.b8 116
|
780 |
+
.b8 111
|
781 |
+
.b8 114
|
782 |
+
.b8 99
|
783 |
+
.b8 104
|
784 |
+
.b8 105
|
785 |
+
.b8 110
|
786 |
+
.b8 100
|
787 |
+
.b8 117
|
788 |
+
.b8 99
|
789 |
+
.b8 116
|
790 |
+
.b8 111
|
791 |
+
.b8 114
|
792 |
+
.b8 95
|
793 |
+
.b8 114
|
794 |
+
.b8 111
|
795 |
+
.b8 111
|
796 |
+
.b8 116
|
797 |
+
.b8 47
|
798 |
+
.b8 112
|
799 |
+
.b8 110
|
800 |
+
.b8 0
|
801 |
+
.b8 1
|
802 |
+
.b64 $L__func_begin0
|
803 |
+
.b64 $L__func_end0
|
804 |
+
.b8 2
|
805 |
+
.b8 116
|
806 |
+
.b8 114
|
807 |
+
.b8 105
|
808 |
+
.b8 116
|
809 |
+
.b8 111
|
810 |
+
.b8 110
|
811 |
+
.b8 95
|
812 |
+
.b8 95
|
813 |
+
.b8 48
|
814 |
+
.b8 100
|
815 |
+
.b8 49
|
816 |
+
.b8 100
|
817 |
+
.b8 50
|
818 |
+
.b8 100
|
819 |
+
.b8 51
|
820 |
+
.b8 100
|
821 |
+
.b8 52
|
822 |
+
.b8 100
|
823 |
+
.b8 53
|
824 |
+
.b8 100
|
825 |
+
.b8 54
|
826 |
+
.b8 100
|
827 |
+
.b8 101
|
828 |
+
.b8 55
|
829 |
+
.b8 100
|
830 |
+
.b8 101
|
831 |
+
.b8 0
|
832 |
+
.b8 116
|
833 |
+
.b8 114
|
834 |
+
.b8 105
|
835 |
+
.b8 116
|
836 |
+
.b8 111
|
837 |
+
.b8 110
|
838 |
+
.b8 95
|
839 |
+
.b8 95
|
840 |
+
.b8 48
|
841 |
+
.b8 100
|
842 |
+
.b8 49
|
843 |
+
.b8 100
|
844 |
+
.b8 50
|
845 |
+
.b8 100
|
846 |
+
.b8 51
|
847 |
+
.b8 100
|
848 |
+
.b8 52
|
849 |
+
.b8 100
|
850 |
+
.b8 53
|
851 |
+
.b8 100
|
852 |
+
.b8 54
|
853 |
+
.b8 100
|
854 |
+
.b8 101
|
855 |
+
.b8 55
|
856 |
+
.b8 100
|
857 |
+
.b8 101
|
858 |
+
.b8 0
|
859 |
+
.b8 1
|
860 |
+
.b8 18
|
861 |
+
.b8 1
|
862 |
+
.b8 1
|
863 |
+
.b8 3
|
864 |
+
.b64 $L__func_begin0
|
865 |
+
.b64 $L__func_end0
|
866 |
+
.b8 1
|
867 |
+
.b8 156
|
868 |
+
.b32 125
|
869 |
+
.b8 4
|
870 |
+
.b32 125
|
871 |
+
.b64 $L__tmp1
|
872 |
+
.b64 $L__tmp2
|
873 |
+
.b8 2
|
874 |
+
.b8 47
|
875 |
+
.b8 41
|
876 |
+
.b8 5
|
877 |
+
.b32 125
|
878 |
+
.b64 $L__tmp3
|
879 |
+
.b64 $L__tmp6
|
880 |
+
.b8 2
|
881 |
+
.b8 53
|
882 |
+
.b8 44
|
883 |
+
.b8 4
|
884 |
+
.b32 125
|
885 |
+
.b64 $L__tmp3
|
886 |
+
.b64 $L__tmp6
|
887 |
+
.b8 2
|
888 |
+
.b8 120
|
889 |
+
.b8 46
|
890 |
+
.b8 0
|
891 |
+
.b8 4
|
892 |
+
.b32 125
|
893 |
+
.b64 $L__tmp4
|
894 |
+
.b64 $L__tmp5
|
895 |
+
.b8 2
|
896 |
+
.b8 53
|
897 |
+
.b8 44
|
898 |
+
.b8 0
|
899 |
+
.b8 0
|
900 |
+
}
|
901 |
+
.section .debug_pubnames
|
902 |
+
{
|
903 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
904 |
+
$L__pubNames_start0:
|
905 |
+
.b8 2
|
906 |
+
.b8 0
|
907 |
+
.b32 .debug_info
|
908 |
+
.b32 306
|
909 |
+
.b32 125
|
910 |
+
.b8 116
|
911 |
+
.b8 114
|
912 |
+
.b8 105
|
913 |
+
.b8 116
|
914 |
+
.b8 111
|
915 |
+
.b8 110
|
916 |
+
.b8 95
|
917 |
+
.b8 95
|
918 |
+
.b8 48
|
919 |
+
.b8 100
|
920 |
+
.b8 49
|
921 |
+
.b8 100
|
922 |
+
.b8 50
|
923 |
+
.b8 100
|
924 |
+
.b8 51
|
925 |
+
.b8 100
|
926 |
+
.b8 52
|
927 |
+
.b8 100
|
928 |
+
.b8 53
|
929 |
+
.b8 100
|
930 |
+
.b8 54
|
931 |
+
.b8 100
|
932 |
+
.b8 101
|
933 |
+
.b8 55
|
934 |
+
.b8 100
|
935 |
+
.b8 101
|
936 |
+
.b8 0
|
937 |
+
.b32 0
|
938 |
+
$L__pubNames_end0:
|
939 |
+
}
|
940 |
+
.section .debug_pubtypes
|
941 |
+
{
|
942 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
943 |
+
$L__pubTypes_start0:
|
944 |
+
.b8 2
|
945 |
+
.b8 0
|
946 |
+
.b32 .debug_info
|
947 |
+
.b32 306
|
948 |
+
.b32 0
|
949 |
+
$L__pubTypes_end0:
|
950 |
+
}
|
951 |
+
.section .debug_loc { }
|
.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttir
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16>
|
4 |
+
%cst_0 = arith.constant 0.000000e+00 : f32
|
5 |
+
%cst_1 = arith.constant dense<1.000000e+00> : tensor<64x8xf32>
|
6 |
+
%c256_i32 = arith.constant 256 : i32
|
7 |
+
%c8_i32 = arith.constant 8 : i32
|
8 |
+
%c0_i32 = arith.constant 0 : i32
|
9 |
+
%cst_2 = arith.constant dense<256> : tensor<64x1xi64>
|
10 |
+
%cst_3 = arith.constant dense<0> : tensor<64x1xi64>
|
11 |
+
%cst_4 = arith.constant dense<50257> : tensor<64x1xi64>
|
12 |
+
%cst_5 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
|
13 |
+
%cst_6 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
|
14 |
+
%cst_7 = arith.constant dense<0.000000e+00> : tensor<1x8xf32>
|
15 |
+
%cst_8 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
|
16 |
+
%cst_9 = arith.constant dense<256> : tensor<64x1xi32>
|
17 |
+
%cst_10 = arith.constant dense<256> : tensor<1x8xi32>
|
18 |
+
%cst_11 = arith.constant dense<512> : tensor<64x1xi32>
|
19 |
+
%c64_i32 = arith.constant 64 : i32
|
20 |
+
%0 = tt.get_program_id x : i32
|
21 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
22 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
|
23 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
|
24 |
+
%4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
|
25 |
+
%5 = arith.addi %4, %3 : tensor<64x1xi32>
|
26 |
+
%6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
|
27 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
|
28 |
+
%8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
|
29 |
+
%9 = tt.addptr %8, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
|
30 |
+
%10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
|
31 |
+
%11 = arith.remsi %5, %cst_11 : tensor<64x1xi32>
|
32 |
+
%12 = arith.muli %11, %cst_9 : tensor<64x1xi32>
|
33 |
+
%13 = tt.broadcast %12 : (tensor<64x1xi32>) -> tensor<64x8xi32>
|
34 |
+
%14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
35 |
+
%15 = arith.muli %5, %cst_9 : tensor<64x1xi32>
|
36 |
+
%16 = tt.broadcast %15 : (tensor<64x1xi32>) -> tensor<64x8xi32>
|
37 |
+
%17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
|
38 |
+
%18 = arith.addi %10, %cst_4 : tensor<64x1xi64>
|
39 |
+
%19 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64>
|
40 |
+
%20 = arith.select %19, %18, %10 : tensor<64x1xi1>, tensor<64x1xi64>
|
41 |
+
%21 = arith.cmpi sge, %20, %cst_3 : tensor<64x1xi64>
|
42 |
+
%22 = arith.cmpi slt, %20, %cst_4 : tensor<64x1xi64>
|
43 |
+
%23 = arith.andi %21, %22 : tensor<64x1xi1>
|
44 |
+
%24 = arith.muli %20, %cst_2 : tensor<64x1xi64>
|
45 |
+
%25 = tt.broadcast %24 : (tensor<64x1xi64>) -> tensor<64x8xi64>
|
46 |
+
%26 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
47 |
+
%27:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<64x8xf32>, tensor<64x8xf32>, tensor<64x8xf32>) : i32 {
|
48 |
+
%51 = tt.splat %arg8 : (i32) -> tensor<1x8xi32>
|
49 |
+
%52 = arith.addi %51, %7 : tensor<1x8xi32>
|
50 |
+
%53 = arith.cmpi slt, %52, %cst_10 : tensor<1x8xi32>
|
51 |
+
%54 = tt.broadcast %52 : (tensor<1x8xi32>) -> tensor<64x8xi32>
|
52 |
+
%55 = arith.addi %54, %13 : tensor<64x8xi32>
|
53 |
+
%56 = tt.addptr %14, %55 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
|
54 |
+
%57 = tt.broadcast %53 : (tensor<1x8xi1>) -> tensor<64x8xi1>
|
55 |
+
%58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
|
56 |
+
%59 = arith.addi %54, %16 : tensor<64x8xi32>
|
57 |
+
%60 = tt.addptr %17, %59 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
|
58 |
+
%61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xbf16>
|
59 |
+
%62 = arith.extf %61 : tensor<64x8xbf16> to tensor<64x8xf32>
|
60 |
+
tt.assert %23, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
|
61 |
+
%63 = arith.extsi %52 : tensor<1x8xi32> to tensor<1x8xi64>
|
62 |
+
%64 = tt.broadcast %63 : (tensor<1x8xi64>) -> tensor<64x8xi64>
|
63 |
+
%65 = arith.addi %64, %25 : tensor<64x8xi64>
|
64 |
+
%66 = tt.addptr %26, %65 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi64>
|
65 |
+
%67 = tt.load %66, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
|
66 |
+
%68 = arith.addf %67, %58 : tensor<64x8xf32>
|
67 |
+
%69 = arith.addf %68, %62 : tensor<64x8xf32>
|
68 |
+
%70 = arith.subf %69, %arg9 : tensor<64x8xf32>
|
69 |
+
%71 = arith.addf %arg11, %cst_1 : tensor<64x8xf32>
|
70 |
+
%72 = arith.divf %70, %71 : tensor<64x8xf32>
|
71 |
+
%73 = arith.addf %arg9, %72 : tensor<64x8xf32>
|
72 |
+
%74 = arith.subf %69, %73 : tensor<64x8xf32>
|
73 |
+
%75 = arith.mulf %70, %74 : tensor<64x8xf32>
|
74 |
+
%76 = arith.addf %arg10, %75 : tensor<64x8xf32>
|
75 |
+
%77 = arith.select %57, %73, %arg9 : tensor<64x8xi1>, tensor<64x8xf32>
|
76 |
+
%78 = arith.select %57, %76, %arg10 : tensor<64x8xi1>, tensor<64x8xf32>
|
77 |
+
%79 = arith.select %57, %71, %arg11 : tensor<64x8xi1>, tensor<64x8xf32>
|
78 |
+
scf.yield %77, %78, %79 : tensor<64x8xf32>, tensor<64x8xf32>, tensor<64x8xf32>
|
79 |
+
}
|
80 |
+
%28:3 = "tt.reduce"(%27#0, %27#1, %27#2) <{axis = 1 : i32}> ({
|
81 |
+
^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
|
82 |
+
%51 = arith.subf %arg11, %arg8 : f32
|
83 |
+
%52 = arith.addf %arg10, %arg13 : f32
|
84 |
+
%53 = arith.cmpf oeq, %52, %cst_0 : f32
|
85 |
+
%54 = arith.divf %arg13, %52 : f32
|
86 |
+
%55 = arith.select %53, %cst_0, %54 : f32
|
87 |
+
%56 = arith.mulf %51, %55 : f32
|
88 |
+
%57 = arith.addf %arg8, %56 : f32
|
89 |
+
%58 = arith.addf %arg9, %arg12 : f32
|
90 |
+
%59 = arith.mulf %51, %51 : f32
|
91 |
+
%60 = arith.mulf %59, %arg10 : f32
|
92 |
+
%61 = arith.mulf %60, %55 : f32
|
93 |
+
%62 = arith.addf %58, %61 : f32
|
94 |
+
tt.reduce.return %57, %62, %52 : f32, f32, f32
|
95 |
+
}) : (tensor<64x8xf32>, tensor<64x8xf32>, tensor<64x8xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
|
96 |
+
%29 = tt.expand_dims %28#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
97 |
+
%30 = tt.expand_dims %28#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
98 |
+
%31 = arith.muli %11, %cst_9 : tensor<64x1xi32>
|
99 |
+
%32 = tt.broadcast %31 : (tensor<64x1xi32>) -> tensor<64x8xi32>
|
100 |
+
%33 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
101 |
+
%34 = arith.muli %5, %cst_9 : tensor<64x1xi32>
|
102 |
+
%35 = tt.broadcast %34 : (tensor<64x1xi32>) -> tensor<64x8xi32>
|
103 |
+
%36 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
|
104 |
+
%37 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>>
|
105 |
+
%38 = arith.addi %10, %cst_4 : tensor<64x1xi64>
|
106 |
+
%39 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64>
|
107 |
+
%40 = arith.select %39, %38, %10 : tensor<64x1xi1>, tensor<64x1xi64>
|
108 |
+
%41 = arith.cmpi sge, %40, %cst_3 : tensor<64x1xi64>
|
109 |
+
%42 = arith.cmpi slt, %40, %cst_4 : tensor<64x1xi64>
|
110 |
+
%43 = arith.andi %41, %42 : tensor<64x1xi1>
|
111 |
+
%44 = arith.muli %40, %cst_2 : tensor<64x1xi64>
|
112 |
+
%45 = tt.broadcast %44 : (tensor<64x1xi64>) -> tensor<64x8xi64>
|
113 |
+
%46 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
114 |
+
%47 = tt.broadcast %29 : (tensor<64x1xf32>) -> tensor<64x8xf32>
|
115 |
+
%48 = arith.divf %30, %cst_6 : tensor<64x1xf32>
|
116 |
+
%49 = arith.addf %48, %cst_5 : tensor<64x1xf32>
|
117 |
+
%50 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
|
118 |
+
scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 : i32 {
|
119 |
+
%51 = tt.splat %arg8 : (i32) -> tensor<1x8xi32>
|
120 |
+
%52 = arith.addi %51, %7 : tensor<1x8xi32>
|
121 |
+
%53 = arith.cmpi slt, %52, %cst_10 : tensor<1x8xi32>
|
122 |
+
%54 = tt.broadcast %52 : (tensor<1x8xi32>) -> tensor<64x8xi32>
|
123 |
+
%55 = arith.addi %54, %32 : tensor<64x8xi32>
|
124 |
+
%56 = tt.addptr %33, %55 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
|
125 |
+
%57 = tt.broadcast %53 : (tensor<1x8xi1>) -> tensor<64x8xi1>
|
126 |
+
%58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
|
127 |
+
%59 = arith.addi %54, %35 : tensor<64x8xi32>
|
128 |
+
%60 = tt.addptr %36, %59 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
|
129 |
+
%61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
|
130 |
+
%62 = arith.extf %61 : tensor<64x8xbf16> to tensor<64x8xf32>
|
131 |
+
%63 = tt.addptr %37, %52 : tensor<1x8x!tt.ptr<f32, 1>>, tensor<1x8xi32>
|
132 |
+
%64 = tt.load %63, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x8xf32>
|
133 |
+
tt.assert %43, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
|
134 |
+
%65 = arith.extsi %52 : tensor<1x8xi32> to tensor<1x8xi64>
|
135 |
+
%66 = tt.broadcast %65 : (tensor<1x8xi64>) -> tensor<64x8xi64>
|
136 |
+
%67 = arith.addi %66, %45 : tensor<64x8xi64>
|
137 |
+
%68 = tt.addptr %46, %67 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi64>
|
138 |
+
%69 = tt.load %68, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
|
139 |
+
%70 = arith.addf %69, %58 : tensor<64x8xf32>
|
140 |
+
%71 = arith.addf %70, %62 : tensor<64x8xf32>
|
141 |
+
%72 = arith.subf %71, %47 : tensor<64x8xf32>
|
142 |
+
%73 = tt.extern_elementwise %49 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
|
143 |
+
%74 = tt.broadcast %73 : (tensor<64x1xf32>) -> tensor<64x8xf32>
|
144 |
+
%75 = arith.mulf %72, %74 : tensor<64x8xf32>
|
145 |
+
%76 = tt.broadcast %64 : (tensor<1x8xf32>) -> tensor<64x8xf32>
|
146 |
+
%77 = arith.mulf %75, %76 : tensor<64x8xf32>
|
147 |
+
%78 = tt.addptr %50, %59 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
|
148 |
+
%79 = arith.truncf %77 : tensor<64x8xf32> to tensor<64x8xbf16>
|
149 |
+
tt.store %78, %79, %57 {cache = 1 : i32, evict = 1 : i32} : tensor<64x8xbf16>
|
150 |
+
}
|
151 |
+
tt.return
|
152 |
+
}
|
153 |
+
}
|
.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.ttgir
ADDED
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
4 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
5 |
+
tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
6 |
+
%cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
|
7 |
+
%cst_0 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
|
8 |
+
%cst_1 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
|
9 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
|
10 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x8xf32, #blocked>
|
11 |
+
%cst_4 = arith.constant dense<1.000000e+00> : tensor<64x8xf32, #blocked>
|
12 |
+
%cst_5 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
|
13 |
+
%cst_6 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
|
14 |
+
%cst_7 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
|
15 |
+
%cst_8 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
|
16 |
+
%cst_9 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
|
17 |
+
%c0_i32 = arith.constant 0 : i32
|
18 |
+
%c8_i32 = arith.constant 8 : i32
|
19 |
+
%c256_i32 = arith.constant 256 : i32
|
20 |
+
%cst_10 = arith.constant dense<1.000000e+00> : tensor<64x8xf32, #blocked2>
|
21 |
+
%cst_11 = arith.constant 0.000000e+00 : f32
|
22 |
+
%cst_12 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked2>
|
23 |
+
%cst_13 = arith.constant dense<256> : tensor<1x8xi32, #blocked2>
|
24 |
+
%cst_14 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
|
25 |
+
%cst_15 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
|
26 |
+
%c64_i32 = arith.constant 64 : i32
|
27 |
+
%0 = tt.get_program_id x : i32
|
28 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
29 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
30 |
+
%3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
31 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
|
32 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
|
33 |
+
%6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
|
34 |
+
%7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
|
35 |
+
%8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
|
36 |
+
%9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
|
37 |
+
%10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
38 |
+
%11 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
|
39 |
+
%12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
|
40 |
+
%13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x8xi32, #blocked2>
|
41 |
+
%14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
|
42 |
+
%15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
|
43 |
+
%16 = tt.addptr %14, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
|
44 |
+
%17 = tt.addptr %15, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
|
45 |
+
%18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
|
46 |
+
%19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
|
47 |
+
%20 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
|
48 |
+
%21 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked>
|
49 |
+
%22 = tt.broadcast %21 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
50 |
+
%23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
|
51 |
+
%24 = arith.addi %18, %cst_7 : tensor<64x1xi64, #blocked>
|
52 |
+
%25 = arith.addi %19, %cst_8 : tensor<64x1xi64, #blocked1>
|
53 |
+
%26 = arith.cmpi slt, %18, %cst_6 : tensor<64x1xi64, #blocked>
|
54 |
+
%27 = arith.cmpi slt, %19, %cst_9 : tensor<64x1xi64, #blocked1>
|
55 |
+
%28 = arith.select %26, %24, %18 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
|
56 |
+
%29 = arith.select %27, %25, %19 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
|
57 |
+
%30 = arith.cmpi sge, %29, %cst_9 : tensor<64x1xi64, #blocked1>
|
58 |
+
%31 = arith.cmpi slt, %29, %cst_8 : tensor<64x1xi64, #blocked1>
|
59 |
+
%32 = arith.andi %30, %31 : tensor<64x1xi1, #blocked1>
|
60 |
+
%33 = arith.muli %28, %cst_5 : tensor<64x1xi64, #blocked>
|
61 |
+
%34 = tt.broadcast %33 : (tensor<64x1xi64, #blocked>) -> tensor<64x8xi64, #blocked>
|
62 |
+
%35 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
|
63 |
+
%36:4 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c8_i32 iter_args(%arg8 = %cst_2, %arg9 = %cst_2, %arg10 = %cst_12, %arg11 = %cst_2) -> (tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked2>, tensor<64x8xf32, #blocked>) : i32 {
|
64 |
+
%48 = tt.splat %arg7 : (i32) -> tensor<1x8xi32, #blocked>
|
65 |
+
%49 = tt.splat %arg7 : (i32) -> tensor<1x8xi32, #blocked2>
|
66 |
+
%50 = arith.addi %48, %12 : tensor<1x8xi32, #blocked>
|
67 |
+
%51 = arith.addi %49, %13 : tensor<1x8xi32, #blocked2>
|
68 |
+
%52 = arith.cmpi slt, %50, %cst_0 : tensor<1x8xi32, #blocked>
|
69 |
+
%53 = arith.cmpi slt, %51, %cst_13 : tensor<1x8xi32, #blocked2>
|
70 |
+
%54 = tt.broadcast %50 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
71 |
+
%55 = arith.addi %54, %22 : tensor<64x8xi32, #blocked>
|
72 |
+
%56 = tt.addptr %23, %55 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
73 |
+
%57 = tt.broadcast %52 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
|
74 |
+
%58 = tt.broadcast %53 : (tensor<1x8xi1, #blocked2>) -> tensor<64x8xi1, #blocked2>
|
75 |
+
%59 = tt.load %56, %57, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
|
76 |
+
tt.assert %32, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
|
77 |
+
%60 = arith.extsi %50 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked>
|
78 |
+
%61 = tt.broadcast %60 : (tensor<1x8xi64, #blocked>) -> tensor<64x8xi64, #blocked>
|
79 |
+
%62 = arith.addi %61, %34 : tensor<64x8xi64, #blocked>
|
80 |
+
%63 = tt.addptr %35, %62 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi64, #blocked>
|
81 |
+
%64 = tt.load %63, %57, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
|
82 |
+
%65 = arith.addf %64, %59 : tensor<64x8xf32, #blocked>
|
83 |
+
%66 = arith.subf %65, %arg8 : tensor<64x8xf32, #blocked>
|
84 |
+
%67 = arith.addf %arg11, %cst_4 : tensor<64x8xf32, #blocked>
|
85 |
+
%68 = arith.addf %arg10, %cst_10 : tensor<64x8xf32, #blocked2>
|
86 |
+
%69 = arith.divf %66, %67 : tensor<64x8xf32, #blocked>
|
87 |
+
%70 = arith.addf %arg8, %69 : tensor<64x8xf32, #blocked>
|
88 |
+
%71 = arith.subf %65, %70 : tensor<64x8xf32, #blocked>
|
89 |
+
%72 = arith.mulf %66, %71 : tensor<64x8xf32, #blocked>
|
90 |
+
%73 = arith.addf %arg9, %72 : tensor<64x8xf32, #blocked>
|
91 |
+
%74 = arith.select %57, %70, %arg8 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
|
92 |
+
%75 = arith.select %57, %73, %arg9 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
|
93 |
+
%76 = arith.select %57, %67, %arg11 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
|
94 |
+
%77 = arith.select %58, %68, %arg10 : tensor<64x8xi1, #blocked2>, tensor<64x8xf32, #blocked2>
|
95 |
+
scf.yield %74, %75, %77, %76 : tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked2>, tensor<64x8xf32, #blocked>
|
96 |
+
}
|
97 |
+
%37 = triton_gpu.convert_layout %36#2 : (tensor<64x8xf32, #blocked2>) -> tensor<64x8xf32, #blocked>
|
98 |
+
%38:3 = "tt.reduce"(%36#0, %36#1, %37) <{axis = 1 : i32}> ({
|
99 |
+
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
|
100 |
+
%48 = arith.subf %arg10, %arg7 : f32
|
101 |
+
%49 = arith.addf %arg9, %arg12 : f32
|
102 |
+
%50 = arith.cmpf oeq, %49, %cst_11 : f32
|
103 |
+
%51 = arith.divf %arg12, %49 : f32
|
104 |
+
%52 = arith.select %50, %cst_11, %51 : f32
|
105 |
+
%53 = arith.mulf %48, %52 : f32
|
106 |
+
%54 = arith.addf %arg7, %53 : f32
|
107 |
+
%55 = arith.addf %arg8, %arg11 : f32
|
108 |
+
%56 = arith.mulf %48, %48 : f32
|
109 |
+
%57 = arith.mulf %56, %arg9 : f32
|
110 |
+
%58 = arith.mulf %57, %52 : f32
|
111 |
+
%59 = arith.addf %55, %58 : f32
|
112 |
+
tt.reduce.return %54, %59, %49 : f32, f32, f32
|
113 |
+
}) : (tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
|
114 |
+
%39 = tt.expand_dims %38#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
|
115 |
+
%40 = tt.expand_dims %38#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
|
116 |
+
%41 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>, #blocked>
|
117 |
+
%42 = tt.broadcast %39 : (tensor<64x1xf32, #blocked>) -> tensor<64x8xf32, #blocked>
|
118 |
+
%43 = arith.divf %40, %cst_15 : tensor<64x1xf32, #blocked>
|
119 |
+
%44 = arith.addf %43, %cst_14 : tensor<64x1xf32, #blocked>
|
120 |
+
%45 = arith.muli %8, %cst_1 : tensor<64x1xi32, #blocked>
|
121 |
+
%46 = tt.broadcast %45 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
122 |
+
%47 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
|
123 |
+
scf.for %arg7 = %c0_i32 to %c256_i32 step %c8_i32 : i32 {
|
124 |
+
%48 = tt.splat %arg7 : (i32) -> tensor<1x8xi32, #blocked>
|
125 |
+
%49 = arith.addi %48, %12 : tensor<1x8xi32, #blocked>
|
126 |
+
%50 = arith.cmpi slt, %49, %cst_0 : tensor<1x8xi32, #blocked>
|
127 |
+
%51 = tt.broadcast %49 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
128 |
+
%52 = arith.addi %51, %22 : tensor<64x8xi32, #blocked>
|
129 |
+
%53 = tt.addptr %23, %52 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
130 |
+
%54 = tt.broadcast %50 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
|
131 |
+
%55 = tt.load %53, %54, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
|
132 |
+
%56 = tt.addptr %41, %49 : tensor<1x8x!tt.ptr<f32, 1>, #blocked>, tensor<1x8xi32, #blocked>
|
133 |
+
%57 = tt.load %56, %50, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x8xf32, #blocked>
|
134 |
+
tt.assert %32, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
|
135 |
+
%58 = arith.extsi %49 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked>
|
136 |
+
%59 = tt.broadcast %58 : (tensor<1x8xi64, #blocked>) -> tensor<64x8xi64, #blocked>
|
137 |
+
%60 = arith.addi %59, %34 : tensor<64x8xi64, #blocked>
|
138 |
+
%61 = tt.addptr %35, %60 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi64, #blocked>
|
139 |
+
%62 = tt.load %61, %54, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
|
140 |
+
%63 = arith.addf %62, %55 : tensor<64x8xf32, #blocked>
|
141 |
+
%64 = arith.subf %63, %42 : tensor<64x8xf32, #blocked>
|
142 |
+
%65 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
|
143 |
+
%66 = tt.broadcast %65 : (tensor<64x1xf32, #blocked>) -> tensor<64x8xf32, #blocked>
|
144 |
+
%67 = arith.mulf %64, %66 : tensor<64x8xf32, #blocked>
|
145 |
+
%68 = tt.broadcast %57 : (tensor<1x8xf32, #blocked>) -> tensor<64x8xf32, #blocked>
|
146 |
+
%69 = arith.mulf %67, %68 : tensor<64x8xf32, #blocked>
|
147 |
+
%70 = arith.addi %51, %46 : tensor<64x8xi32, #blocked>
|
148 |
+
%71 = tt.addptr %47, %70 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
149 |
+
%72 = arith.truncf %69 : tensor<64x8xf32, #blocked> to tensor<64x8xbf16, #blocked>
|
150 |
+
tt.store %71, %72, %54 {cache = 1 : i32, evict = 1 : i32} : tensor<64x8xbf16, #blocked>
|
151 |
+
}
|
152 |
+
tt.return
|
153 |
+
}
|
154 |
+
}
|
.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ptx
ADDED
@@ -0,0 +1,723 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5de6de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
12 |
+
|
13 |
+
.visible .entry triton__0d1d2d3d4d5de6de(
|
14 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_0,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_1,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_2,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_3,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_4,
|
19 |
+
.param .u32 triton__0d1d2d3d4d5de6de_param_5,
|
20 |
+
.param .u32 triton__0d1d2d3d4d5de6de_param_6
|
21 |
+
)
|
22 |
+
.maxntid 64, 1, 1
|
23 |
+
{
|
24 |
+
.reg .pred %p<26>;
|
25 |
+
.reg .b16 %rs<13>;
|
26 |
+
.reg .b32 %r<92>;
|
27 |
+
.reg .f32 %f<78>;
|
28 |
+
.reg .b64 %rd<14>;
|
29 |
+
.loc 1 18 0
|
30 |
+
$L__func_begin0:
|
31 |
+
.loc 1 18 0
|
32 |
+
|
33 |
+
ld.param.u64 %rd6, [triton__0d1d2d3d4d5de6de_param_0];
|
34 |
+
ld.param.u64 %rd7, [triton__0d1d2d3d4d5de6de_param_1];
|
35 |
+
$L__tmp0:
|
36 |
+
.loc 1 26 26
|
37 |
+
mov.u32 %r58, %tid.x;
|
38 |
+
and.b32 %r59, %r58, 31;
|
39 |
+
ld.param.u64 %rd8, [triton__0d1d2d3d4d5de6de_param_2];
|
40 |
+
ld.param.u64 %rd9, [triton__0d1d2d3d4d5de6de_param_3];
|
41 |
+
ld.param.u64 %rd10, [triton__0d1d2d3d4d5de6de_param_4];
|
42 |
+
shl.b32 %r60, %r58, 2;
|
43 |
+
and.b32 %r61, %r60, 252;
|
44 |
+
.loc 1 23 28
|
45 |
+
mov.u32 %r1, %ctaid.x;
|
46 |
+
.loc 1 30 40
|
47 |
+
shl.b32 %r62, %r1, 8;
|
48 |
+
.loc 1 30 36
|
49 |
+
or.b32 %r63, %r62, %r61;
|
50 |
+
.loc 1 30 30
|
51 |
+
mul.wide.s32 %rd11, %r63, 4;
|
52 |
+
add.s64 %rd1, %rd6, %rd11;
|
53 |
+
mov.b32 %r6, 0;
|
54 |
+
mov.pred %p1, -1;
|
55 |
+
.loc 1 30 46
|
56 |
+
mov.u32 %r2, 0x0;
|
57 |
+
mov.u32 %r3, 0x0;
|
58 |
+
mov.u32 %r4, 0x0;
|
59 |
+
mov.u32 %r5, 0x0;
|
60 |
+
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
61 |
+
@!%p1 mov.u32 %r2, %r6;
|
62 |
+
@!%p1 mov.u32 %r3, %r6;
|
63 |
+
@!%p1 mov.u32 %r4, %r6;
|
64 |
+
@!%p1 mov.u32 %r5, %r6;
|
65 |
+
mov.b32 %f1, %r4;
|
66 |
+
mov.b32 %f2, %r5;
|
67 |
+
.loc 1 31 30
|
68 |
+
mul.wide.s32 %rd12, %r63, 2;
|
69 |
+
add.s64 %rd2, %rd7, %rd12;
|
70 |
+
.loc 1 31 46
|
71 |
+
mov.u32 %r10, 0x0;
|
72 |
+
mov.u32 %r11, 0x0;
|
73 |
+
@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
|
74 |
+
@!%p1 mov.u32 %r10, %r6;
|
75 |
+
@!%p1 mov.u32 %r11, %r6;
|
76 |
+
cvt.u16.u32 %rs1, %r10;
|
77 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
|
78 |
+
cvt.u16.u32 %rs3, %r11;
|
79 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
|
80 |
+
.loc 1 31 67
|
81 |
+
cvt.f32.bf16 %r14, %rs1;
|
82 |
+
mov.b32 %f3, %r14;
|
83 |
+
cvt.f32.bf16 %r15, %rs2;
|
84 |
+
mov.b32 %f4, %r15;
|
85 |
+
cvt.f32.bf16 %r16, %rs3;
|
86 |
+
mov.b32 %f5, %r16;
|
87 |
+
cvt.f32.bf16 %r17, %rs4;
|
88 |
+
mov.b32 %f6, %r17;
|
89 |
+
.loc 1 32 30
|
90 |
+
add.s64 %rd3, %rd8, %rd12;
|
91 |
+
.loc 1 32 46
|
92 |
+
mov.u32 %r18, 0x0;
|
93 |
+
mov.u32 %r19, 0x0;
|
94 |
+
@%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
|
95 |
+
@!%p1 mov.u32 %r18, %r6;
|
96 |
+
@!%p1 mov.u32 %r19, %r6;
|
97 |
+
cvt.u16.u32 %rs5, %r18;
|
98 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
|
99 |
+
cvt.u16.u32 %rs7, %r19;
|
100 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
|
101 |
+
.loc 1 32 67
|
102 |
+
cvt.f32.bf16 %r22, %rs5;
|
103 |
+
mov.b32 %f7, %r22;
|
104 |
+
cvt.f32.bf16 %r23, %rs6;
|
105 |
+
mov.b32 %f8, %r23;
|
106 |
+
cvt.f32.bf16 %r24, %rs7;
|
107 |
+
mov.b32 %f9, %r24;
|
108 |
+
cvt.f32.bf16 %r25, %rs8;
|
109 |
+
mov.b32 %f10, %r25;
|
110 |
+
.loc 1 33 31
|
111 |
+
mul.wide.u32 %rd13, %r61, 4;
|
112 |
+
add.s64 %rd4, %rd9, %rd13;
|
113 |
+
.loc 1 33 36
|
114 |
+
mov.u32 %r26, 0x0;
|
115 |
+
mov.u32 %r27, 0x0;
|
116 |
+
mov.u32 %r28, 0x0;
|
117 |
+
mov.u32 %r29, 0x0;
|
118 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ];
|
119 |
+
@!%p1 mov.u32 %r26, %r6;
|
120 |
+
@!%p1 mov.u32 %r27, %r6;
|
121 |
+
@!%p1 mov.u32 %r28, %r6;
|
122 |
+
@!%p1 mov.u32 %r29, %r6;
|
123 |
+
.loc 1 35 18
|
124 |
+
add.f32 %f11, %f5, %f1;
|
125 |
+
add.f32 %f12, %f6, %f2;
|
126 |
+
.loc 1 30 46
|
127 |
+
mov.b32 %f13, %r3;
|
128 |
+
mov.b32 %f14, %r2;
|
129 |
+
.loc 1 35 18
|
130 |
+
add.f32 %f15, %f3, %f14;
|
131 |
+
add.f32 %f16, %f4, %f13;
|
132 |
+
.loc 1 37 18
|
133 |
+
add.f32 %f17, %f16, %f8;
|
134 |
+
add.f32 %f18, %f15, %f7;
|
135 |
+
add.f32 %f19, %f11, %f9;
|
136 |
+
add.f32 %f20, %f12, %f10;
|
137 |
+
$L__tmp1:
|
138 |
+
.loc 2 233 15
|
139 |
+
add.f32 %f21, %f18, %f17;
|
140 |
+
add.f32 %f22, %f21, %f19;
|
141 |
+
add.f32 %f23, %f22, %f20;
|
142 |
+
$L__tmp2:
|
143 |
+
.loc 2 243 36
|
144 |
+
mov.b32 %r64, %f23;
|
145 |
+
shfl.sync.bfly.b32 %r65, %r64, 16, 31, -1;
|
146 |
+
mov.b32 %f24, %r65;
|
147 |
+
$L__tmp3:
|
148 |
+
.loc 2 233 15
|
149 |
+
add.f32 %f25, %f23, %f24;
|
150 |
+
$L__tmp4:
|
151 |
+
.loc 2 243 36
|
152 |
+
mov.b32 %r66, %f25;
|
153 |
+
shfl.sync.bfly.b32 %r67, %r66, 8, 31, -1;
|
154 |
+
mov.b32 %f26, %r67;
|
155 |
+
$L__tmp5:
|
156 |
+
.loc 2 233 15
|
157 |
+
add.f32 %f27, %f25, %f26;
|
158 |
+
$L__tmp6:
|
159 |
+
.loc 2 243 36
|
160 |
+
mov.b32 %r68, %f27;
|
161 |
+
shfl.sync.bfly.b32 %r69, %r68, 4, 31, -1;
|
162 |
+
mov.b32 %f28, %r69;
|
163 |
+
$L__tmp7:
|
164 |
+
.loc 2 233 15
|
165 |
+
add.f32 %f29, %f27, %f28;
|
166 |
+
$L__tmp8:
|
167 |
+
.loc 2 243 36
|
168 |
+
mov.b32 %r70, %f29;
|
169 |
+
shfl.sync.bfly.b32 %r71, %r70, 2, 31, -1;
|
170 |
+
mov.b32 %f30, %r71;
|
171 |
+
$L__tmp9:
|
172 |
+
.loc 2 233 15
|
173 |
+
add.f32 %f31, %f29, %f30;
|
174 |
+
$L__tmp10:
|
175 |
+
.loc 2 243 36
|
176 |
+
mov.b32 %r72, %f31;
|
177 |
+
shfl.sync.bfly.b32 %r73, %r72, 1, 31, -1;
|
178 |
+
mov.b32 %f32, %r73;
|
179 |
+
$L__tmp11:
|
180 |
+
.loc 2 233 15
|
181 |
+
add.f32 %f33, %f31, %f32;
|
182 |
+
$L__tmp12:
|
183 |
+
.loc 2 243 36
|
184 |
+
setp.eq.s32 %p17, %r59, 0;
|
185 |
+
shr.u32 %r74, %r58, 3;
|
186 |
+
and.b32 %r75, %r74, 4;
|
187 |
+
mov.u32 %r76, global_smem;
|
188 |
+
add.s32 %r34, %r76, %r75;
|
189 |
+
mov.b32 %r35, %f33;
|
190 |
+
@%p17 st.shared.b32 [ %r34 + 0 ], %r35;
|
191 |
+
bar.sync 0;
|
192 |
+
setp.lt.s32 %p18, %r58, 2;
|
193 |
+
add.s32 %r37, %r76, %r60;
|
194 |
+
@%p18 ld.shared.b32 %r36, [ %r37 + 0 ];
|
195 |
+
mov.b32 %f34, %r36;
|
196 |
+
shfl.sync.bfly.b32 %r77, %r36, 1, 31, -1;
|
197 |
+
mov.b32 %f35, %r77;
|
198 |
+
$L__tmp13:
|
199 |
+
.loc 2 233 15
|
200 |
+
add.f32 %f36, %f34, %f35;
|
201 |
+
$L__tmp14:
|
202 |
+
.loc 2 243 36
|
203 |
+
and.b32 %r78, %r58, 1;
|
204 |
+
setp.eq.b32 %p24, %r78, 1;
|
205 |
+
not.pred %p25, %p24;
|
206 |
+
and.pred %p19, %p18, %p25;
|
207 |
+
mov.b32 %r39, %f36;
|
208 |
+
@%p19 st.shared.b32 [ %r37 + 0 ], %r39;
|
209 |
+
bar.sync 0;
|
210 |
+
ld.shared.f32 %f37, [global_smem];
|
211 |
+
$L__tmp15:
|
212 |
+
.loc 3 8 15
|
213 |
+
add.f32 %f38, %f37, 0f00000000;
|
214 |
+
$L__tmp16:
|
215 |
+
.loc 1 45 20
|
216 |
+
mov.b32 %r41, %f38;
|
217 |
+
mov.b32 %r42, 1132462080;
|
218 |
+
div.full.f32 %r40, %r41, %r42;
|
219 |
+
mov.b32 %f39, %r40;
|
220 |
+
.loc 1 46 19
|
221 |
+
sub.f32 %f40, %f18, %f39;
|
222 |
+
sub.f32 %f41, %f17, %f39;
|
223 |
+
sub.f32 %f42, %f19, %f39;
|
224 |
+
sub.f32 %f43, %f20, %f39;
|
225 |
+
.loc 1 47 20
|
226 |
+
mul.f32 %f44, %f41, %f41;
|
227 |
+
$L__tmp17:
|
228 |
+
.loc 2 243 36
|
229 |
+
bar.sync 0;
|
230 |
+
$L__tmp18:
|
231 |
+
.loc 2 233 15
|
232 |
+
fma.rn.f32 %f45, %f40, %f40, %f44;
|
233 |
+
fma.rn.f32 %f46, %f42, %f42, %f45;
|
234 |
+
fma.rn.f32 %f47, %f43, %f43, %f46;
|
235 |
+
$L__tmp19:
|
236 |
+
.loc 2 243 36
|
237 |
+
mov.b32 %r79, %f47;
|
238 |
+
shfl.sync.bfly.b32 %r80, %r79, 16, 31, -1;
|
239 |
+
mov.b32 %f48, %r80;
|
240 |
+
$L__tmp20:
|
241 |
+
.loc 2 233 15
|
242 |
+
add.f32 %f49, %f47, %f48;
|
243 |
+
$L__tmp21:
|
244 |
+
.loc 2 243 36
|
245 |
+
mov.b32 %r81, %f49;
|
246 |
+
shfl.sync.bfly.b32 %r82, %r81, 8, 31, -1;
|
247 |
+
mov.b32 %f50, %r82;
|
248 |
+
$L__tmp22:
|
249 |
+
.loc 2 233 15
|
250 |
+
add.f32 %f51, %f49, %f50;
|
251 |
+
$L__tmp23:
|
252 |
+
.loc 2 243 36
|
253 |
+
mov.b32 %r83, %f51;
|
254 |
+
shfl.sync.bfly.b32 %r84, %r83, 4, 31, -1;
|
255 |
+
mov.b32 %f52, %r84;
|
256 |
+
$L__tmp24:
|
257 |
+
.loc 2 233 15
|
258 |
+
add.f32 %f53, %f51, %f52;
|
259 |
+
$L__tmp25:
|
260 |
+
.loc 2 243 36
|
261 |
+
mov.b32 %r85, %f53;
|
262 |
+
shfl.sync.bfly.b32 %r86, %r85, 2, 31, -1;
|
263 |
+
mov.b32 %f54, %r86;
|
264 |
+
$L__tmp26:
|
265 |
+
.loc 2 233 15
|
266 |
+
add.f32 %f55, %f53, %f54;
|
267 |
+
$L__tmp27:
|
268 |
+
.loc 2 243 36
|
269 |
+
mov.b32 %r87, %f55;
|
270 |
+
shfl.sync.bfly.b32 %r88, %r87, 1, 31, -1;
|
271 |
+
mov.b32 %f56, %r88;
|
272 |
+
$L__tmp28:
|
273 |
+
.loc 2 233 15
|
274 |
+
add.f32 %f57, %f55, %f56;
|
275 |
+
$L__tmp29:
|
276 |
+
.loc 2 243 36
|
277 |
+
mov.b32 %r44, %f57;
|
278 |
+
@%p17 st.shared.b32 [ %r34 + 0 ], %r44;
|
279 |
+
bar.sync 0;
|
280 |
+
@%p18 ld.shared.b32 %r45, [ %r37 + 0 ];
|
281 |
+
mov.b32 %f58, %r45;
|
282 |
+
shfl.sync.bfly.b32 %r89, %r45, 1, 31, -1;
|
283 |
+
mov.b32 %f59, %r89;
|
284 |
+
$L__tmp30:
|
285 |
+
.loc 2 233 15
|
286 |
+
add.f32 %f60, %f58, %f59;
|
287 |
+
$L__tmp31:
|
288 |
+
.loc 2 243 36
|
289 |
+
mov.b32 %r48, %f60;
|
290 |
+
@%p19 st.shared.b32 [ %r37 + 0 ], %r48;
|
291 |
+
bar.sync 0;
|
292 |
+
ld.shared.f32 %f61, [global_smem];
|
293 |
+
$L__tmp32:
|
294 |
+
.loc 3 8 15
|
295 |
+
add.f32 %f62, %f61, 0f00000000;
|
296 |
+
$L__tmp33:
|
297 |
+
.loc 1 53 20
|
298 |
+
mov.b32 %r50, %f62;
|
299 |
+
div.full.f32 %r49, %r50, %r42;
|
300 |
+
mov.b32 %f63, %r49;
|
301 |
+
.loc 1 55 20
|
302 |
+
add.f32 %f64, %f63, 0f3727C5AC;
|
303 |
+
.loc 1 56 26
|
304 |
+
rsqrt.approx.ftz.f32 %f65, %f64;
|
305 |
+
.loc 1 33 36
|
306 |
+
mov.b32 %f66, %r29;
|
307 |
+
mov.b32 %f67, %r28;
|
308 |
+
mov.b32 %f68, %r27;
|
309 |
+
mov.b32 %f69, %r26;
|
310 |
+
.loc 1 57 20
|
311 |
+
mul.f32 %f70, %f40, %f65;
|
312 |
+
mul.f32 %f71, %f41, %f65;
|
313 |
+
mul.f32 %f72, %f42, %f65;
|
314 |
+
mul.f32 %f73, %f43, %f65;
|
315 |
+
.loc 1 58 20
|
316 |
+
mul.f32 %f74, %f70, %f69;
|
317 |
+
mul.f32 %f75, %f71, %f68;
|
318 |
+
mul.f32 %f76, %f72, %f67;
|
319 |
+
mul.f32 %f77, %f73, %f66;
|
320 |
+
.loc 1 60 25
|
321 |
+
add.s64 %rd5, %rd10, %rd12;
|
322 |
+
.loc 1 60 48
|
323 |
+
mov.b32 %r52, %f74;
|
324 |
+
cvt.rn.bf16.f32 %rs9, %r52;
|
325 |
+
mov.b32 %r53, %f75;
|
326 |
+
cvt.rn.bf16.f32 %rs10, %r53;
|
327 |
+
mov.b32 %r54, %f76;
|
328 |
+
cvt.rn.bf16.f32 %rs11, %r54;
|
329 |
+
mov.b32 %r55, %f77;
|
330 |
+
cvt.rn.bf16.f32 %rs12, %r55;
|
331 |
+
mov.b32 %r90, {%rs9, %rs10};
|
332 |
+
mov.b32 %r91, {%rs11, %rs12};
|
333 |
+
@%p1 st.global.v2.b32 [ %rd5 + 0 ], { %r90, %r91 };
|
334 |
+
.loc 1 60 4
|
335 |
+
ret;
|
336 |
+
$L__tmp34:
|
337 |
+
$L__func_end0:
|
338 |
+
|
339 |
+
}
|
340 |
+
// .globl __nv_rsqrtf
// __nv_rsqrtf: device function that returns the approximate reciprocal
// square root of its single 32-bit argument.
//   In:  __nv_rsqrtf_param_0 (.param .b32), loaded as an f32 value.
//   Out: func_retval0        (.param .b32), written as an f32 value.
// The body is a single rsqrt.approx.ftz.f32 instruction, so the result
// carries that instruction's approximation and flush-to-zero behaviour.
// NOTE(review): the kernel earlier in this file issues the same
// rsqrt.approx.ftz.f32 inline; no `call` to this function appears in the
// portion of the dump that was reviewed.
// NOTE(review): the bare `|`, `NNN |` and `+` lines interleaved below look
// like diff-viewer residue rather than PTX; they are left exactly as found.
|
341 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
342 |
+
.param .b32 __nv_rsqrtf_param_0
|
343 |
+
)
|
344 |
+
{
|
345 |
+
// Declares %f0..%f2; only %f1 (input) and %f2 (result) are used below.
.reg .f32 %f<3>;
|
346 |
+
// Start-of-function label. The debug sections reviewed reference only the
// kernel's $L__func_begin0 / $L__func_end0, not this pair.
$L__func_begin1:
|
347 |
+
|
348 |
+
// %f1 = argument
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
349 |
+
// %f2 = approximate 1/sqrt(%f1), with the .ftz (flush-to-zero) modifier
rsqrt.approx.ftz.f32 %f2, %f1;
|
350 |
+
// Return %f2 through the function's return parameter.
st.param.f32 [func_retval0+0], %f2;
|
351 |
+
ret;
|
352 |
+
$L__func_end1:
|
353 |
+
|
354 |
+
}
|
355 |
+
.file 1 "/tmp/torchinductor_root/do/cdohrmmhfsykzlva6pepxaa7gf7klw7w5jzorpspyaldhfg3acr2.py"
|
356 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
357 |
+
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
358 |
+
.section .debug_abbrev
|
359 |
+
{
|
360 |
+
.b8 1
|
361 |
+
.b8 17
|
362 |
+
.b8 1
|
363 |
+
.b8 37
|
364 |
+
.b8 8
|
365 |
+
.b8 19
|
366 |
+
.b8 5
|
367 |
+
.b8 3
|
368 |
+
.b8 8
|
369 |
+
.b8 16
|
370 |
+
.b8 6
|
371 |
+
.b8 27
|
372 |
+
.b8 8
|
373 |
+
.b8 180
|
374 |
+
.b8 66
|
375 |
+
.b8 12
|
376 |
+
.b8 17
|
377 |
+
.b8 1
|
378 |
+
.b8 18
|
379 |
+
.b8 1
|
380 |
+
.b8 0
|
381 |
+
.b8 0
|
382 |
+
.b8 2
|
383 |
+
.b8 46
|
384 |
+
.b8 0
|
385 |
+
.b8 135
|
386 |
+
.b8 64
|
387 |
+
.b8 8
|
388 |
+
.b8 3
|
389 |
+
.b8 8
|
390 |
+
.b8 58
|
391 |
+
.b8 11
|
392 |
+
.b8 59
|
393 |
+
.b8 11
|
394 |
+
.b8 63
|
395 |
+
.b8 12
|
396 |
+
.b8 32
|
397 |
+
.b8 11
|
398 |
+
.b8 0
|
399 |
+
.b8 0
|
400 |
+
.b8 3
|
401 |
+
.b8 46
|
402 |
+
.b8 1
|
403 |
+
.b8 17
|
404 |
+
.b8 1
|
405 |
+
.b8 18
|
406 |
+
.b8 1
|
407 |
+
.b8 64
|
408 |
+
.b8 10
|
409 |
+
.b8 49
|
410 |
+
.b8 19
|
411 |
+
.b8 0
|
412 |
+
.b8 0
|
413 |
+
.b8 4
|
414 |
+
.b8 29
|
415 |
+
.b8 1
|
416 |
+
.b8 49
|
417 |
+
.b8 19
|
418 |
+
.b8 17
|
419 |
+
.b8 1
|
420 |
+
.b8 18
|
421 |
+
.b8 1
|
422 |
+
.b8 88
|
423 |
+
.b8 11
|
424 |
+
.b8 89
|
425 |
+
.b8 11
|
426 |
+
.b8 87
|
427 |
+
.b8 11
|
428 |
+
.b8 0
|
429 |
+
.b8 0
|
430 |
+
.b8 5
|
431 |
+
.b8 29
|
432 |
+
.b8 0
|
433 |
+
.b8 49
|
434 |
+
.b8 19
|
435 |
+
.b8 17
|
436 |
+
.b8 1
|
437 |
+
.b8 18
|
438 |
+
.b8 1
|
439 |
+
.b8 88
|
440 |
+
.b8 11
|
441 |
+
.b8 89
|
442 |
+
.b8 11
|
443 |
+
.b8 87
|
444 |
+
.b8 11
|
445 |
+
.b8 0
|
446 |
+
.b8 0
|
447 |
+
.b8 0
|
448 |
+
}
|
449 |
+
.section .debug_info
|
450 |
+
{
|
451 |
+
.b32 395
|
452 |
+
.b8 2
|
453 |
+
.b8 0
|
454 |
+
.b32 .debug_abbrev
|
455 |
+
.b8 8
|
456 |
+
.b8 1
|
457 |
+
.b8 116
|
458 |
+
.b8 114
|
459 |
+
.b8 105
|
460 |
+
.b8 116
|
461 |
+
.b8 111
|
462 |
+
.b8 110
|
463 |
+
.b8 0
|
464 |
+
.b8 2
|
465 |
+
.b8 0
|
466 |
+
.b8 99
|
467 |
+
.b8 100
|
468 |
+
.b8 111
|
469 |
+
.b8 104
|
470 |
+
.b8 114
|
471 |
+
.b8 109
|
472 |
+
.b8 109
|
473 |
+
.b8 104
|
474 |
+
.b8 102
|
475 |
+
.b8 115
|
476 |
+
.b8 121
|
477 |
+
.b8 107
|
478 |
+
.b8 122
|
479 |
+
.b8 108
|
480 |
+
.b8 118
|
481 |
+
.b8 97
|
482 |
+
.b8 54
|
483 |
+
.b8 112
|
484 |
+
.b8 101
|
485 |
+
.b8 112
|
486 |
+
.b8 120
|
487 |
+
.b8 97
|
488 |
+
.b8 97
|
489 |
+
.b8 55
|
490 |
+
.b8 103
|
491 |
+
.b8 102
|
492 |
+
.b8 55
|
493 |
+
.b8 107
|
494 |
+
.b8 108
|
495 |
+
.b8 119
|
496 |
+
.b8 55
|
497 |
+
.b8 119
|
498 |
+
.b8 53
|
499 |
+
.b8 106
|
500 |
+
.b8 122
|
501 |
+
.b8 111
|
502 |
+
.b8 114
|
503 |
+
.b8 112
|
504 |
+
.b8 115
|
505 |
+
.b8 112
|
506 |
+
.b8 121
|
507 |
+
.b8 97
|
508 |
+
.b8 108
|
509 |
+
.b8 100
|
510 |
+
.b8 104
|
511 |
+
.b8 102
|
512 |
+
.b8 103
|
513 |
+
.b8 51
|
514 |
+
.b8 97
|
515 |
+
.b8 99
|
516 |
+
.b8 114
|
517 |
+
.b8 50
|
518 |
+
.b8 46
|
519 |
+
.b8 112
|
520 |
+
.b8 121
|
521 |
+
.b8 0
|
522 |
+
.b32 .debug_line
|
523 |
+
.b8 47
|
524 |
+
.b8 116
|
525 |
+
.b8 109
|
526 |
+
.b8 112
|
527 |
+
.b8 47
|
528 |
+
.b8 116
|
529 |
+
.b8 111
|
530 |
+
.b8 114
|
531 |
+
.b8 99
|
532 |
+
.b8 104
|
533 |
+
.b8 105
|
534 |
+
.b8 110
|
535 |
+
.b8 100
|
536 |
+
.b8 117
|
537 |
+
.b8 99
|
538 |
+
.b8 116
|
539 |
+
.b8 111
|
540 |
+
.b8 114
|
541 |
+
.b8 95
|
542 |
+
.b8 114
|
543 |
+
.b8 111
|
544 |
+
.b8 111
|
545 |
+
.b8 116
|
546 |
+
.b8 47
|
547 |
+
.b8 100
|
548 |
+
.b8 111
|
549 |
+
.b8 0
|
550 |
+
.b8 1
|
551 |
+
.b64 $L__func_begin0
|
552 |
+
.b64 $L__func_end0
|
553 |
+
.b8 2
|
554 |
+
.b8 116
|
555 |
+
.b8 114
|
556 |
+
.b8 105
|
557 |
+
.b8 116
|
558 |
+
.b8 111
|
559 |
+
.b8 110
|
560 |
+
.b8 95
|
561 |
+
.b8 95
|
562 |
+
.b8 48
|
563 |
+
.b8 100
|
564 |
+
.b8 49
|
565 |
+
.b8 100
|
566 |
+
.b8 50
|
567 |
+
.b8 100
|
568 |
+
.b8 51
|
569 |
+
.b8 100
|
570 |
+
.b8 52
|
571 |
+
.b8 100
|
572 |
+
.b8 53
|
573 |
+
.b8 100
|
574 |
+
.b8 101
|
575 |
+
.b8 54
|
576 |
+
.b8 100
|
577 |
+
.b8 101
|
578 |
+
.b8 0
|
579 |
+
.b8 116
|
580 |
+
.b8 114
|
581 |
+
.b8 105
|
582 |
+
.b8 116
|
583 |
+
.b8 111
|
584 |
+
.b8 110
|
585 |
+
.b8 95
|
586 |
+
.b8 95
|
587 |
+
.b8 48
|
588 |
+
.b8 100
|
589 |
+
.b8 49
|
590 |
+
.b8 100
|
591 |
+
.b8 50
|
592 |
+
.b8 100
|
593 |
+
.b8 51
|
594 |
+
.b8 100
|
595 |
+
.b8 52
|
596 |
+
.b8 100
|
597 |
+
.b8 53
|
598 |
+
.b8 100
|
599 |
+
.b8 101
|
600 |
+
.b8 54
|
601 |
+
.b8 100
|
602 |
+
.b8 101
|
603 |
+
.b8 0
|
604 |
+
.b8 1
|
605 |
+
.b8 18
|
606 |
+
.b8 1
|
607 |
+
.b8 1
|
608 |
+
.b8 3
|
609 |
+
.b64 $L__func_begin0
|
610 |
+
.b64 $L__func_end0
|
611 |
+
.b8 1
|
612 |
+
.b8 156
|
613 |
+
.b32 125
|
614 |
+
.b8 4
|
615 |
+
.b32 125
|
616 |
+
.b64 $L__tmp1
|
617 |
+
.b64 $L__tmp14
|
618 |
+
.b8 2
|
619 |
+
.b8 42
|
620 |
+
.b8 59
|
621 |
+
.b8 5
|
622 |
+
.b32 125
|
623 |
+
.b64 $L__tmp1
|
624 |
+
.b64 $L__tmp14
|
625 |
+
.b8 2
|
626 |
+
.b8 243
|
627 |
+
.b8 36
|
628 |
+
.b8 0
|
629 |
+
.b8 5
|
630 |
+
.b32 125
|
631 |
+
.b64 $L__tmp2
|
632 |
+
.b64 $L__tmp15
|
633 |
+
.b8 2
|
634 |
+
.b8 42
|
635 |
+
.b8 59
|
636 |
+
.b8 5
|
637 |
+
.b32 125
|
638 |
+
.b64 $L__tmp15
|
639 |
+
.b64 $L__tmp16
|
640 |
+
.b8 3
|
641 |
+
.b8 42
|
642 |
+
.b8 45
|
643 |
+
.b8 5
|
644 |
+
.b32 125
|
645 |
+
.b64 $L__tmp17
|
646 |
+
.b64 $L__tmp32
|
647 |
+
.b8 2
|
648 |
+
.b8 50
|
649 |
+
.b8 59
|
650 |
+
.b8 4
|
651 |
+
.b32 125
|
652 |
+
.b64 $L__tmp18
|
653 |
+
.b64 $L__tmp31
|
654 |
+
.b8 2
|
655 |
+
.b8 50
|
656 |
+
.b8 59
|
657 |
+
.b8 5
|
658 |
+
.b32 125
|
659 |
+
.b64 $L__tmp18
|
660 |
+
.b64 $L__tmp31
|
661 |
+
.b8 2
|
662 |
+
.b8 243
|
663 |
+
.b8 36
|
664 |
+
.b8 0
|
665 |
+
.b8 5
|
666 |
+
.b32 125
|
667 |
+
.b64 $L__tmp32
|
668 |
+
.b64 $L__tmp33
|
669 |
+
.b8 3
|
670 |
+
.b8 50
|
671 |
+
.b8 45
|
672 |
+
.b8 0
|
673 |
+
.b8 0
|
674 |
+
}
|
675 |
+
// .debug_pubnames: name lookup table emitted as raw bytes/words.
// Contents as written below: a length word, two bytes 2 and 0, a reference
// to .debug_info, the words 399 and 125, a NUL-terminated byte string, and
// a closing zero word.
// NOTE(review): the field meanings given in the comments below follow the
// DWARF version 2 .debug_pubnames layout and are presumed, not proven by
// this file; confirm against the DWARF specification.
.section .debug_pubnames
|
676 |
+
{
|
677 |
+
// Length of everything between the start and end labels, computed from
// the two labels rather than hard-coded.
.b32 $L__pubNames_end0-$L__pubNames_start0
|
678 |
+
$L__pubNames_start0:
|
679 |
+
// Bytes 2, 0: presumably a little-endian 16-bit version field (= 2).
.b8 2
|
680 |
+
.b8 0
|
681 |
+
// Reference to the .debug_info section this table describes.
.b32 .debug_info
|
682 |
+
// 399 = 395 + 4, where 395 is the first word stored in .debug_info;
// presumably the total size of that unit including its length word.
.b32 399
|
683 |
+
// 125: presumably the offset inside .debug_info of the entry being named;
// the same value 125 recurs as a .b32 inside .debug_info.
.b32 125
|
684 |
+
// The next 25 bytes are ASCII for "triton__0d1d2d3d4d5de6de" followed by
// a NUL terminator, i.e. the kernel's symbol name.
.b8 116
|
685 |
+
.b8 114
|
686 |
+
.b8 105
|
687 |
+
.b8 116
|
688 |
+
.b8 111
|
689 |
+
.b8 110
|
690 |
+
.b8 95
|
691 |
+
.b8 95
|
692 |
+
.b8 48
|
693 |
+
.b8 100
|
694 |
+
.b8 49
|
695 |
+
.b8 100
|
696 |
+
.b8 50
|
697 |
+
.b8 100
|
698 |
+
.b8 51
|
699 |
+
.b8 100
|
700 |
+
.b8 52
|
701 |
+
.b8 100
|
702 |
+
.b8 53
|
703 |
+
.b8 100
|
704 |
+
.b8 101
|
705 |
+
.b8 54
|
706 |
+
.b8 100
|
707 |
+
.b8 101
|
708 |
+
.b8 0
|
709 |
+
// Zero word: presumably the end-of-table marker.
.b32 0
|
710 |
+
$L__pubNames_end0:
|
711 |
+
}
|
712 |
+
// .debug_pubtypes: type lookup table emitted as raw bytes/words.
// It holds only the header fields followed directly by a zero word, so no
// (offset, name) entries are listed between them.
// NOTE(review): the field meanings given below are presumed from the DWARF
// pubnames/pubtypes table layout; confirm against the DWARF specification.
.section .debug_pubtypes
|
713 |
+
{
|
714 |
+
// Length of everything between the start and end labels, computed from
// the two labels rather than hard-coded.
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
715 |
+
$L__pubTypes_start0:
|
716 |
+
// Bytes 2, 0: presumably a little-endian 16-bit version field (= 2).
.b8 2
|
717 |
+
.b8 0
|
718 |
+
// Reference to the .debug_info section this table describes.
.b32 .debug_info
|
719 |
+
// 399 = 395 + 4, where 395 is the first word stored in .debug_info;
// presumably the total size of that unit including its length word.
.b32 399
|
720 |
+
// Zero word immediately after the header: presumably the end-of-table
// marker, i.e. the table is empty.
.b32 0
|
721 |
+
$L__pubTypes_end0:
|
722 |
+
}
|
723 |
+
.section .debug_loc { }
|