Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.cubin +0 -0
- .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.llir +366 -0
- .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttgir +76 -0
- .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttir +75 -0
- .triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.cubin +0 -0
- .triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.llir +283 -0
- .triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ptx +687 -0
- .triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttgir +58 -0
- .triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttir +57 -0
- .triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.cubin +0 -0
- .triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ptx +1854 -0
- .triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttgir +134 -0
- .triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttir +113 -0
- .triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.llir +245 -0
- .triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ptx +651 -0
- .triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttir +53 -0
- .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.cubin +0 -0
- .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.llir +858 -0
- .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.llir +290 -0
- .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ptx +653 -0
- .triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.llir +162 -0
- .triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ptx +338 -0
- .triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttgir +24 -0
- .triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttir +18 -0
- .triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.cubin +0 -0
- .triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ptx +756 -0
- .triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttgir +141 -0
- .triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttir +139 -0
- .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.llir +235 -0
- .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ptx +572 -0
- .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttgir +63 -0
- .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.cubin +0 -0
- .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.llir +243 -0
- .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ptx +577 -0
- .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttgir +65 -0
- .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttir +58 -0
- .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ptx +456 -0
- .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttir +61 -0
- .triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.cubin +0 -0
- .triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ptx +809 -0
- .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ptx +717 -0
- .triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ttir +38 -0
- .triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ptx +495 -0
- .triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.llir +278 -0
- .triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ptx +1154 -0
- .triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ptx +1608 -0
- .triton/dump/93e5abc5363b9438178c618128714f73/triton_.llir +949 -0
- .triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttgir +38 -0
- .triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttir +37 -0
- .triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.llir +318 -0
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.cubin
ADDED
Binary file (16.9 kB). View file
|
|
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.llir
ADDED
@@ -0,0 +1,366 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
6 |
+
|
7 |
+
define void @triton__0d1d2d3d4d5d6d7d8de9de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, i32 %8, i32 %9) local_unnamed_addr !dbg !7 {
|
8 |
+
%11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
9 |
+
%12 = and i32 %11, 31, !dbg !10
|
10 |
+
%13 = lshr i32 %11, 5, !dbg !10
|
11 |
+
%14 = and i32 %13, 1, !dbg !10
|
12 |
+
%urem = shl i32 %11, 2, !dbg !10
|
13 |
+
%15 = and i32 %urem, 252, !dbg !10
|
14 |
+
%16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
|
15 |
+
%17 = shl i32 %16, 8, !dbg !12
|
16 |
+
%18 = or i32 %17, %15, !dbg !13
|
17 |
+
%19 = sext i32 %18 to i64, !dbg !14
|
18 |
+
%20 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !14
|
19 |
+
%21 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
|
20 |
+
%22 = extractvalue { i32, i32, i32, i32 } %21, 0, !dbg !15
|
21 |
+
%23 = extractvalue { i32, i32, i32, i32 } %21, 1, !dbg !15
|
22 |
+
%24 = extractvalue { i32, i32, i32, i32 } %21, 2, !dbg !15
|
23 |
+
%25 = extractvalue { i32, i32, i32, i32 } %21, 3, !dbg !15
|
24 |
+
%26 = bitcast i32 %22 to float, !dbg !15
|
25 |
+
%27 = bitcast i32 %23 to float, !dbg !15
|
26 |
+
%28 = bitcast i32 %24 to float, !dbg !15
|
27 |
+
%29 = bitcast i32 %25 to float, !dbg !15
|
28 |
+
%30 = getelementptr i16, ptr addrspace(1) %1, i64 %19, !dbg !16
|
29 |
+
%31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %30, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
|
30 |
+
%32 = extractvalue { i32, i32 } %31, 0, !dbg !17
|
31 |
+
%33 = extractvalue { i32, i32 } %31, 1, !dbg !17
|
32 |
+
%34 = trunc i32 %32 to i16, !dbg !17
|
33 |
+
%extelt.offset = lshr i32 %32, 16, !dbg !17
|
34 |
+
%35 = trunc i32 %extelt.offset to i16, !dbg !17
|
35 |
+
%36 = trunc i32 %33 to i16, !dbg !17
|
36 |
+
%extelt.offset1 = lshr i32 %33, 16, !dbg !17
|
37 |
+
%37 = trunc i32 %extelt.offset1 to i16, !dbg !17
|
38 |
+
%38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #6, !dbg !18
|
39 |
+
%39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #6, !dbg !18
|
40 |
+
%40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #6, !dbg !18
|
41 |
+
%41 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %37) #6, !dbg !18
|
42 |
+
%42 = getelementptr i16, ptr addrspace(1) %2, i64 %19, !dbg !19
|
43 |
+
%43 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %42, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
|
44 |
+
%44 = extractvalue { i32, i32 } %43, 0, !dbg !20
|
45 |
+
%45 = extractvalue { i32, i32 } %43, 1, !dbg !20
|
46 |
+
%46 = trunc i32 %44 to i16, !dbg !20
|
47 |
+
%extelt.offset2 = lshr i32 %44, 16, !dbg !20
|
48 |
+
%47 = trunc i32 %extelt.offset2 to i16, !dbg !20
|
49 |
+
%48 = trunc i32 %45 to i16, !dbg !20
|
50 |
+
%extelt.offset3 = lshr i32 %45, 16, !dbg !20
|
51 |
+
%49 = trunc i32 %extelt.offset3 to i16, !dbg !20
|
52 |
+
%50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #6, !dbg !21
|
53 |
+
%51 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #6, !dbg !21
|
54 |
+
%52 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %48) #6, !dbg !21
|
55 |
+
%53 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #6, !dbg !21
|
56 |
+
%54 = getelementptr i16, ptr addrspace(1) %3, i64 %19, !dbg !22
|
57 |
+
%55 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %54, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
|
58 |
+
%56 = extractvalue { i32, i32 } %55, 0, !dbg !23
|
59 |
+
%57 = extractvalue { i32, i32 } %55, 1, !dbg !23
|
60 |
+
%58 = trunc i32 %56 to i16, !dbg !23
|
61 |
+
%extelt.offset4 = lshr i32 %56, 16, !dbg !23
|
62 |
+
%59 = trunc i32 %extelt.offset4 to i16, !dbg !23
|
63 |
+
%60 = trunc i32 %57 to i16, !dbg !23
|
64 |
+
%extelt.offset5 = lshr i32 %57, 16, !dbg !23
|
65 |
+
%61 = trunc i32 %extelt.offset5 to i16, !dbg !23
|
66 |
+
%62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %58) #6, !dbg !24
|
67 |
+
%63 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #6, !dbg !24
|
68 |
+
%64 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %60) #6, !dbg !24
|
69 |
+
%65 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %61) #6, !dbg !24
|
70 |
+
%66 = getelementptr i16, ptr addrspace(1) %4, i64 %19, !dbg !25
|
71 |
+
%67 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %66, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !26
|
72 |
+
%68 = extractvalue { i32, i32 } %67, 0, !dbg !26
|
73 |
+
%69 = extractvalue { i32, i32 } %67, 1, !dbg !26
|
74 |
+
%70 = trunc i32 %68 to i16, !dbg !26
|
75 |
+
%extelt.offset6 = lshr i32 %68, 16, !dbg !26
|
76 |
+
%71 = trunc i32 %extelt.offset6 to i16, !dbg !26
|
77 |
+
%72 = trunc i32 %69 to i16, !dbg !26
|
78 |
+
%extelt.offset7 = lshr i32 %69, 16, !dbg !26
|
79 |
+
%73 = trunc i32 %extelt.offset7 to i16, !dbg !26
|
80 |
+
%74 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #6, !dbg !27
|
81 |
+
%75 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %71) #6, !dbg !27
|
82 |
+
%76 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %72) #6, !dbg !27
|
83 |
+
%77 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #6, !dbg !27
|
84 |
+
%78 = zext nneg i32 %15 to i64, !dbg !28
|
85 |
+
%79 = getelementptr float, ptr addrspace(1) %5, i64 %78, !dbg !28
|
86 |
+
%80 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %79, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
|
87 |
+
%81 = fadd float %38, %26, !dbg !30
|
88 |
+
%82 = fadd float %39, %27, !dbg !30
|
89 |
+
%83 = fadd float %40, %28, !dbg !30
|
90 |
+
%84 = fadd float %81, %50, !dbg !31
|
91 |
+
%85 = fadd float %82, %51, !dbg !31
|
92 |
+
%86 = fadd float %83, %52, !dbg !31
|
93 |
+
%87 = fadd float %85, %63, !dbg !32
|
94 |
+
%88 = fadd float %86, %64, !dbg !32
|
95 |
+
%89 = fadd float %87, %75, !dbg !33
|
96 |
+
%90 = fadd float %88, %76, !dbg !33
|
97 |
+
%91 = insertelement <2 x float> poison, float %84, i64 0, !dbg !32
|
98 |
+
%92 = insertelement <2 x float> %91, float %41, i64 1, !dbg !32
|
99 |
+
%93 = insertelement <2 x float> poison, float %62, i64 0, !dbg !32
|
100 |
+
%94 = insertelement <2 x float> %93, float %29, i64 1, !dbg !32
|
101 |
+
%95 = fadd <2 x float> %92, %94, !dbg !32
|
102 |
+
%96 = insertelement <2 x float> poison, float %74, i64 0, !dbg !33
|
103 |
+
%97 = insertelement <2 x float> %96, float %53, i64 1, !dbg !33
|
104 |
+
%98 = fadd <2 x float> %95, %97, !dbg !33
|
105 |
+
%99 = insertelement <2 x float> poison, float %89, i64 0, !dbg !34
|
106 |
+
%100 = insertelement <2 x float> %99, float %65, i64 1, !dbg !34
|
107 |
+
%101 = fadd <2 x float> %98, %100, !dbg !34
|
108 |
+
%102 = insertelement <2 x float> poison, float %90, i64 0, !dbg !34
|
109 |
+
%103 = insertelement <2 x float> %102, float %77, i64 1, !dbg !34
|
110 |
+
%104 = fadd <2 x float> %101, %103, !dbg !34
|
111 |
+
%105 = extractelement <2 x float> %104, i64 0, !dbg !34
|
112 |
+
%106 = extractelement <2 x float> %104, i64 1, !dbg !34
|
113 |
+
%107 = fadd float %105, %106, !dbg !34
|
114 |
+
%108 = bitcast float %107 to i32, !dbg !40
|
115 |
+
%109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 16, i32 31), !dbg !40
|
116 |
+
%110 = bitcast i32 %109 to float, !dbg !40
|
117 |
+
%111 = fadd float %107, %110, !dbg !34
|
118 |
+
%112 = bitcast float %111 to i32, !dbg !40
|
119 |
+
%113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 8, i32 31), !dbg !40
|
120 |
+
%114 = bitcast i32 %113 to float, !dbg !40
|
121 |
+
%115 = fadd float %111, %114, !dbg !34
|
122 |
+
%116 = bitcast float %115 to i32, !dbg !40
|
123 |
+
%117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 4, i32 31), !dbg !40
|
124 |
+
%118 = bitcast i32 %117 to float, !dbg !40
|
125 |
+
%119 = fadd float %115, %118, !dbg !34
|
126 |
+
%120 = bitcast float %119 to i32, !dbg !40
|
127 |
+
%121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 2, i32 31), !dbg !40
|
128 |
+
%122 = bitcast i32 %121 to float, !dbg !40
|
129 |
+
%123 = fadd float %119, %122, !dbg !34
|
130 |
+
%124 = bitcast float %123 to i32, !dbg !40
|
131 |
+
%125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 1, i32 31), !dbg !40
|
132 |
+
%126 = bitcast i32 %125 to float, !dbg !40
|
133 |
+
%127 = fadd float %123, %126, !dbg !34
|
134 |
+
%128 = icmp eq i32 %12, 0, !dbg !40
|
135 |
+
%129 = zext nneg i32 %14 to i64, !dbg !40
|
136 |
+
%130 = getelementptr float, ptr addrspace(3) @global_smem, i64 %129, !dbg !40
|
137 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %127, i1 %128) #6, !dbg !40
|
138 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !40
|
139 |
+
%131 = icmp slt i32 %11, 2, !dbg !40
|
140 |
+
%132 = sext i32 %11 to i64, !dbg !40
|
141 |
+
%133 = getelementptr float, ptr addrspace(3) @global_smem, i64 %132, !dbg !40
|
142 |
+
%134 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %133, i1 %131) #6, !dbg !40
|
143 |
+
%135 = bitcast float %134 to i32, !dbg !40
|
144 |
+
%136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 1, i32 31), !dbg !40
|
145 |
+
%137 = bitcast i32 %136 to float, !dbg !40
|
146 |
+
%138 = fadd float %134, %137, !dbg !34
|
147 |
+
%139 = and i32 %11, 1, !dbg !40
|
148 |
+
%140 = icmp eq i32 %139, 0, !dbg !40
|
149 |
+
%141 = and i1 %131, %140, !dbg !40
|
150 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %133, float %138, i1 %141) #6, !dbg !40
|
151 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !40
|
152 |
+
%142 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !40
|
153 |
+
%143 = fadd float %142, 0.000000e+00, !dbg !42
|
154 |
+
%144 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %143, float 2.560000e+02) #6, !dbg !46
|
155 |
+
%145 = extractelement <2 x float> %98, i64 0, !dbg !47
|
156 |
+
%146 = fsub float %145, %144, !dbg !47
|
157 |
+
%147 = fsub float %89, %144, !dbg !47
|
158 |
+
%148 = fsub float %90, %144, !dbg !47
|
159 |
+
%149 = fsub float %106, %144, !dbg !47
|
160 |
+
%150 = fmul float %146, %146, !dbg !48
|
161 |
+
%151 = fmul float %147, %147, !dbg !48
|
162 |
+
%152 = fmul float %148, %148, !dbg !48
|
163 |
+
%153 = fmul float %149, %149, !dbg !48
|
164 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !49
|
165 |
+
%154 = fadd float %150, %151, !dbg !51
|
166 |
+
%155 = fadd float %152, %154, !dbg !51
|
167 |
+
%156 = fadd float %153, %155, !dbg !51
|
168 |
+
%157 = bitcast float %156 to i32, !dbg !49
|
169 |
+
%158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 16, i32 31), !dbg !49
|
170 |
+
%159 = bitcast i32 %158 to float, !dbg !49
|
171 |
+
%160 = fadd float %156, %159, !dbg !51
|
172 |
+
%161 = bitcast float %160 to i32, !dbg !49
|
173 |
+
%162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 8, i32 31), !dbg !49
|
174 |
+
%163 = bitcast i32 %162 to float, !dbg !49
|
175 |
+
%164 = fadd float %160, %163, !dbg !51
|
176 |
+
%165 = bitcast float %164 to i32, !dbg !49
|
177 |
+
%166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 4, i32 31), !dbg !49
|
178 |
+
%167 = bitcast i32 %166 to float, !dbg !49
|
179 |
+
%168 = fadd float %164, %167, !dbg !51
|
180 |
+
%169 = bitcast float %168 to i32, !dbg !49
|
181 |
+
%170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 2, i32 31), !dbg !49
|
182 |
+
%171 = bitcast i32 %170 to float, !dbg !49
|
183 |
+
%172 = fadd float %168, %171, !dbg !51
|
184 |
+
%173 = bitcast float %172 to i32, !dbg !49
|
185 |
+
%174 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %173, i32 1, i32 31), !dbg !49
|
186 |
+
%175 = bitcast i32 %174 to float, !dbg !49
|
187 |
+
%176 = fadd float %172, %175, !dbg !51
|
188 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %176, i1 %128) #6, !dbg !49
|
189 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !49
|
190 |
+
%177 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %133, i1 %131) #6, !dbg !49
|
191 |
+
%178 = bitcast float %177 to i32, !dbg !49
|
192 |
+
%179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 1, i32 31), !dbg !49
|
193 |
+
%180 = bitcast i32 %179 to float, !dbg !49
|
194 |
+
%181 = fadd float %177, %180, !dbg !51
|
195 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %133, float %181, i1 %141) #6, !dbg !49
|
196 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !49
|
197 |
+
%182 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !49
|
198 |
+
%183 = fadd float %182, 0.000000e+00, !dbg !54
|
199 |
+
%184 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %183, float 2.560000e+02) #6, !dbg !56
|
200 |
+
%185 = fadd float %184, 0x3EE4F8B580000000, !dbg !57
|
201 |
+
%186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58
|
202 |
+
%.not.i = icmp eq i32 %186, 0, !dbg !58
|
203 |
+
br i1 %.not.i, label %189, label %187, !dbg !58
|
204 |
+
|
205 |
+
187: ; preds = %10
|
206 |
+
%188 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %185), !dbg !58
|
207 |
+
br label %__nv_rsqrtf.exit, !dbg !58
|
208 |
+
|
209 |
+
189: ; preds = %10
|
210 |
+
%190 = tail call float @llvm.nvvm.rsqrt.approx.f(float %185), !dbg !58
|
211 |
+
br label %__nv_rsqrtf.exit, !dbg !58
|
212 |
+
|
213 |
+
__nv_rsqrtf.exit: ; preds = %187, %189
|
214 |
+
%.0.i = phi float [ %188, %187 ], [ %190, %189 ], !dbg !58
|
215 |
+
%191 = extractvalue { i32, i32, i32, i32 } %80, 3, !dbg !29
|
216 |
+
%192 = bitcast i32 %191 to float, !dbg !29
|
217 |
+
%193 = extractvalue { i32, i32, i32, i32 } %80, 2, !dbg !29
|
218 |
+
%194 = bitcast i32 %193 to float, !dbg !29
|
219 |
+
%195 = extractvalue { i32, i32, i32, i32 } %80, 1, !dbg !29
|
220 |
+
%196 = bitcast i32 %195 to float, !dbg !29
|
221 |
+
%197 = extractvalue { i32, i32, i32, i32 } %80, 0, !dbg !29
|
222 |
+
%198 = bitcast i32 %197 to float, !dbg !29
|
223 |
+
%199 = fmul float %146, %.0.i, !dbg !59
|
224 |
+
%200 = fmul float %147, %.0.i, !dbg !59
|
225 |
+
%201 = fmul float %148, %.0.i, !dbg !59
|
226 |
+
%202 = fmul float %149, %.0.i, !dbg !59
|
227 |
+
%203 = fmul float %199, %198, !dbg !60
|
228 |
+
%204 = fmul float %200, %196, !dbg !60
|
229 |
+
%205 = fmul float %201, %194, !dbg !60
|
230 |
+
%206 = fmul float %202, %192, !dbg !60
|
231 |
+
%207 = getelementptr float, ptr addrspace(1) %6, i64 %19, !dbg !61
|
232 |
+
%208 = bitcast float %145 to i32, !dbg !62
|
233 |
+
%209 = bitcast float %89 to i32, !dbg !62
|
234 |
+
%210 = bitcast float %90 to i32, !dbg !62
|
235 |
+
%211 = bitcast float %106 to i32, !dbg !62
|
236 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %208, i32 %209, i32 %210, i32 %211, ptr addrspace(1) %207, i1 true) #6, !dbg !62
|
237 |
+
%212 = getelementptr i16, ptr addrspace(1) %7, i64 %19, !dbg !63
|
238 |
+
%213 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %203) #6, !dbg !64
|
239 |
+
%214 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %204) #6, !dbg !64
|
240 |
+
%215 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %205) #6, !dbg !64
|
241 |
+
%216 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %206) #6, !dbg !64
|
242 |
+
%217 = insertelement <2 x i16> undef, i16 %213, i64 0, !dbg !64
|
243 |
+
%218 = insertelement <2 x i16> %217, i16 %214, i64 1, !dbg !64
|
244 |
+
%219 = bitcast <2 x i16> %218 to i32, !dbg !64
|
245 |
+
%220 = insertelement <2 x i16> undef, i16 %215, i64 0, !dbg !64
|
246 |
+
%221 = insertelement <2 x i16> %220, i16 %216, i64 1, !dbg !64
|
247 |
+
%222 = bitcast <2 x i16> %221 to i32, !dbg !64
|
248 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %219, i32 %222, ptr addrspace(1) %212, i1 true) #6, !dbg !64
|
249 |
+
ret void, !dbg !65
|
250 |
+
}
|
251 |
+
|
252 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
253 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
254 |
+
|
255 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
256 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
257 |
+
|
258 |
+
; Function Attrs: convergent nocallback nounwind
|
259 |
+
declare void @llvm.nvvm.barrier0() #2
|
260 |
+
|
261 |
+
; Function Attrs: alwaysinline nounwind
|
262 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
263 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
264 |
+
%.not = icmp eq i32 %1, 0
|
265 |
+
br i1 %.not, label %4, label %2
|
266 |
+
|
267 |
+
2: ; preds = %0
|
268 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
269 |
+
br label %6
|
270 |
+
|
271 |
+
4: ; preds = %0
|
272 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
273 |
+
br label %6
|
274 |
+
|
275 |
+
6: ; preds = %4, %2
|
276 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
277 |
+
ret float %.0
|
278 |
+
}
|
279 |
+
|
280 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
281 |
+
|
282 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
283 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
284 |
+
|
285 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
286 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
287 |
+
|
288 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
289 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
290 |
+
attributes #2 = { convergent nocallback nounwind }
|
291 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
292 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
293 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
294 |
+
attributes #6 = { nounwind }
|
295 |
+
|
296 |
+
!llvm.module.flags = !{!0, !1}
|
297 |
+
!llvm.dbg.cu = !{!2}
|
298 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
299 |
+
!llvm.ident = !{!6}
|
300 |
+
|
301 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
302 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
303 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
304 |
+
!3 = !DIFile(filename: "cjbnqg5u4sj7a4xstjer3a6tdgnnigb2iymd27gcs6o7oduhxy2v.py", directory: "/tmp/torchinductor_root/jb")
|
305 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"kernel", i32 1}
|
306 |
+
!5 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"maxntidx", i32 64}
|
307 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
308 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8de9de", linkageName: "triton__0d1d2d3d4d5d6d7d8de9de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
309 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
310 |
+
!9 = !{}
|
311 |
+
!10 = !DILocation(line: 26, column: 26, scope: !7)
|
312 |
+
!11 = !DILocation(line: 23, column: 28, scope: !7)
|
313 |
+
!12 = !DILocation(line: 30, column: 40, scope: !7)
|
314 |
+
!13 = !DILocation(line: 30, column: 36, scope: !7)
|
315 |
+
!14 = !DILocation(line: 30, column: 30, scope: !7)
|
316 |
+
!15 = !DILocation(line: 30, column: 46, scope: !7)
|
317 |
+
!16 = !DILocation(line: 31, column: 30, scope: !7)
|
318 |
+
!17 = !DILocation(line: 31, column: 46, scope: !7)
|
319 |
+
!18 = !DILocation(line: 31, column: 67, scope: !7)
|
320 |
+
!19 = !DILocation(line: 32, column: 30, scope: !7)
|
321 |
+
!20 = !DILocation(line: 32, column: 46, scope: !7)
|
322 |
+
!21 = !DILocation(line: 32, column: 67, scope: !7)
|
323 |
+
!22 = !DILocation(line: 33, column: 30, scope: !7)
|
324 |
+
!23 = !DILocation(line: 33, column: 46, scope: !7)
|
325 |
+
!24 = !DILocation(line: 33, column: 67, scope: !7)
|
326 |
+
!25 = !DILocation(line: 34, column: 31, scope: !7)
|
327 |
+
!26 = !DILocation(line: 34, column: 47, scope: !7)
|
328 |
+
!27 = !DILocation(line: 34, column: 68, scope: !7)
|
329 |
+
!28 = !DILocation(line: 35, column: 31, scope: !7)
|
330 |
+
!29 = !DILocation(line: 35, column: 36, scope: !7)
|
331 |
+
!30 = !DILocation(line: 37, column: 18, scope: !7)
|
332 |
+
!31 = !DILocation(line: 39, column: 18, scope: !7)
|
333 |
+
!32 = !DILocation(line: 41, column: 18, scope: !7)
|
334 |
+
!33 = !DILocation(line: 43, column: 19, scope: !7)
|
335 |
+
!34 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !38)
|
336 |
+
!35 = distinct !DILexicalBlockFile(scope: !37, file: !36, discriminator: 0)
|
337 |
+
!36 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
338 |
+
!37 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
|
339 |
+
!38 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !39)
|
340 |
+
!39 = !DILocation(line: 48, column: 59, scope: !35)
|
341 |
+
!40 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !41)
|
342 |
+
!41 = !DILocation(line: 48, column: 59, scope: !37)
|
343 |
+
!42 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !45)
|
344 |
+
!43 = distinct !DILexicalBlockFile(scope: !7, file: !44, discriminator: 0)
|
345 |
+
!44 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
346 |
+
!45 = !DILocation(line: 48, column: 45, scope: !43)
|
347 |
+
!46 = !DILocation(line: 51, column: 20, scope: !7)
|
348 |
+
!47 = !DILocation(line: 52, column: 20, scope: !7)
|
349 |
+
!48 = !DILocation(line: 53, column: 20, scope: !7)
|
350 |
+
!49 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !50)
|
351 |
+
!50 = !DILocation(line: 56, column: 59, scope: !37)
|
352 |
+
!51 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !52)
|
353 |
+
!52 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !53)
|
354 |
+
!53 = !DILocation(line: 56, column: 59, scope: !35)
|
355 |
+
!54 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !55)
|
356 |
+
!55 = !DILocation(line: 56, column: 45, scope: !43)
|
357 |
+
!56 = !DILocation(line: 59, column: 20, scope: !7)
|
358 |
+
!57 = !DILocation(line: 61, column: 20, scope: !7)
|
359 |
+
!58 = !DILocation(line: 62, column: 26, scope: !7)
|
360 |
+
!59 = !DILocation(line: 63, column: 20, scope: !7)
|
361 |
+
!60 = !DILocation(line: 64, column: 20, scope: !7)
|
362 |
+
!61 = !DILocation(line: 66, column: 25, scope: !7)
|
363 |
+
!62 = !DILocation(line: 66, column: 48, scope: !7)
|
364 |
+
!63 = !DILocation(line: 67, column: 25, scope: !7)
|
365 |
+
!64 = !DILocation(line: 67, column: 48, scope: !7)
|
366 |
+
!65 = !DILocation(line: 67, column: 4, scope: !7)
|
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttgir
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
5 |
+
%cst_0 = arith.constant 9.99999974E-6 : f32
|
6 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
7 |
+
%cst_2 = arith.constant 0.000000e+00 : f32
|
8 |
+
%c256_i32 = arith.constant 256 : i32
|
9 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
10 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
11 |
+
%0 = tt.get_program_id x : i32
|
12 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
13 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
14 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
15 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
|
16 |
+
%5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
|
17 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
18 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
19 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
20 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
21 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
22 |
+
%11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
23 |
+
%12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
24 |
+
%13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
25 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
26 |
+
%15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
27 |
+
%16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
28 |
+
%17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
29 |
+
%18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
30 |
+
%19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
31 |
+
%20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
32 |
+
%21 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
33 |
+
%22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
34 |
+
%23 = tt.load %22, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
35 |
+
%24 = arith.extf %23 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
36 |
+
%25 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
37 |
+
%26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
38 |
+
%27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
39 |
+
%28 = arith.addf %8, %12 : tensor<256xf32, #blocked>
|
40 |
+
%29 = arith.addf %28, %16 : tensor<256xf32, #blocked>
|
41 |
+
%30 = arith.addf %29, %20 : tensor<256xf32, #blocked>
|
42 |
+
%31 = arith.addf %30, %24 : tensor<256xf32, #blocked>
|
43 |
+
%32 = arith.select %2, %31, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
44 |
+
%33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
|
45 |
+
^bb0(%arg10: f32, %arg11: f32):
|
46 |
+
%53 = arith.addf %arg10, %arg11 : f32
|
47 |
+
tt.reduce.return %53 : f32
|
48 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
49 |
+
%34 = arith.addf %33, %cst_2 : f32
|
50 |
+
%35 = arith.divf %34, %cst_1 : f32
|
51 |
+
%36 = tt.splat %35 : (f32) -> tensor<256xf32, #blocked>
|
52 |
+
%37 = arith.subf %31, %36 : tensor<256xf32, #blocked>
|
53 |
+
%38 = arith.mulf %37, %37 : tensor<256xf32, #blocked>
|
54 |
+
%39 = arith.select %2, %38, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
55 |
+
%40 = "tt.reduce"(%39) <{axis = 0 : i32}> ({
|
56 |
+
^bb0(%arg10: f32, %arg11: f32):
|
57 |
+
%53 = arith.addf %arg10, %arg11 : f32
|
58 |
+
tt.reduce.return %53 : f32
|
59 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
60 |
+
%41 = arith.addf %40, %cst_2 : f32
|
61 |
+
%42 = arith.divf %41, %cst_1 : f32
|
62 |
+
%43 = arith.addf %42, %cst_0 : f32
|
63 |
+
%44 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
64 |
+
%45 = tt.splat %44 : (f32) -> tensor<256xf32, #blocked>
|
65 |
+
%46 = arith.mulf %37, %45 : tensor<256xf32, #blocked>
|
66 |
+
%47 = arith.mulf %46, %27 : tensor<256xf32, #blocked>
|
67 |
+
%48 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
68 |
+
%49 = tt.addptr %48, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
69 |
+
tt.store %49, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
|
70 |
+
%50 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
71 |
+
%51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
72 |
+
%52 = arith.truncf %47 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
|
73 |
+
tt.store %51, %52, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
|
74 |
+
tt.return
|
75 |
+
}
|
76 |
+
}
|
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttir
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c256_i32 = arith.constant 256 : i32
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
|
5 |
+
%cst_0 = arith.constant 0.000000e+00 : f32
|
6 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
7 |
+
%cst_2 = arith.constant 9.99999974E-6 : f32
|
8 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
|
9 |
+
%cst_4 = arith.constant dense<256> : tensor<256xi32>
|
10 |
+
%0 = tt.get_program_id x : i32
|
11 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
12 |
+
%2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
|
13 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
14 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32>
|
15 |
+
%5 = arith.addi %1, %4 : tensor<256xi32>
|
16 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
17 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
18 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
19 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
20 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
21 |
+
%11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
22 |
+
%12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
|
23 |
+
%13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
24 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
25 |
+
%15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
26 |
+
%16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
|
27 |
+
%17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
28 |
+
%18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
29 |
+
%19 = tt.load %18, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
30 |
+
%20 = arith.extf %19 : tensor<256xbf16> to tensor<256xf32>
|
31 |
+
%21 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
32 |
+
%22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
33 |
+
%23 = tt.load %22, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
34 |
+
%24 = arith.extf %23 : tensor<256xbf16> to tensor<256xf32>
|
35 |
+
%25 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
36 |
+
%26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
37 |
+
%27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
|
38 |
+
%28 = arith.addf %8, %12 : tensor<256xf32>
|
39 |
+
%29 = arith.addf %28, %16 : tensor<256xf32>
|
40 |
+
%30 = arith.addf %29, %20 : tensor<256xf32>
|
41 |
+
%31 = arith.addf %30, %24 : tensor<256xf32>
|
42 |
+
%32 = arith.select %2, %31, %cst_3 : tensor<256xi1>, tensor<256xf32>
|
43 |
+
%33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
|
44 |
+
^bb0(%arg10: f32, %arg11: f32):
|
45 |
+
%53 = arith.addf %arg10, %arg11 : f32
|
46 |
+
tt.reduce.return %53 : f32
|
47 |
+
}) : (tensor<256xf32>) -> f32
|
48 |
+
%34 = arith.addf %33, %cst_0 : f32
|
49 |
+
%35 = arith.divf %34, %cst_1 : f32
|
50 |
+
%36 = tt.splat %35 : (f32) -> tensor<256xf32>
|
51 |
+
%37 = arith.subf %31, %36 : tensor<256xf32>
|
52 |
+
%38 = arith.mulf %37, %37 : tensor<256xf32>
|
53 |
+
%39 = arith.select %2, %38, %cst_3 : tensor<256xi1>, tensor<256xf32>
|
54 |
+
%40 = "tt.reduce"(%39) <{axis = 0 : i32}> ({
|
55 |
+
^bb0(%arg10: f32, %arg11: f32):
|
56 |
+
%53 = arith.addf %arg10, %arg11 : f32
|
57 |
+
tt.reduce.return %53 : f32
|
58 |
+
}) : (tensor<256xf32>) -> f32
|
59 |
+
%41 = arith.addf %40, %cst_0 : f32
|
60 |
+
%42 = arith.divf %41, %cst_1 : f32
|
61 |
+
%43 = arith.addf %42, %cst_2 : f32
|
62 |
+
%44 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
63 |
+
%45 = tt.splat %44 : (f32) -> tensor<256xf32>
|
64 |
+
%46 = arith.mulf %37, %45 : tensor<256xf32>
|
65 |
+
%47 = arith.mulf %46, %27 : tensor<256xf32>
|
66 |
+
%48 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
67 |
+
%49 = tt.addptr %48, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
68 |
+
tt.store %49, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
|
69 |
+
%50 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
70 |
+
%51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
71 |
+
%52 = arith.truncf %47 : tensor<256xf32> to tensor<256xbf16>
|
72 |
+
tt.store %51, %52, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
|
73 |
+
tt.return
|
74 |
+
}
|
75 |
+
}
|
.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.cubin
ADDED
Binary file (13.2 kB). View file
|
|
.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.llir
ADDED
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
6 |
+
|
7 |
+
define void @triton__0d1d2d3d4de5de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5) local_unnamed_addr !dbg !7 {
|
8 |
+
%7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
9 |
+
%8 = and i32 %7, 31, !dbg !10
|
10 |
+
%9 = lshr i32 %7, 5, !dbg !10
|
11 |
+
%10 = and i32 %9, 1, !dbg !10
|
12 |
+
%urem = shl i32 %7, 2, !dbg !10
|
13 |
+
%11 = and i32 %urem, 252, !dbg !10
|
14 |
+
%12 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
|
15 |
+
%13 = shl i32 %12, 8, !dbg !12
|
16 |
+
%14 = or i32 %13, %11, !dbg !13
|
17 |
+
%15 = sext i32 %14 to i64, !dbg !14
|
18 |
+
%16 = getelementptr float, ptr addrspace(1) %0, i64 %15, !dbg !14
|
19 |
+
%17 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %16, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
|
20 |
+
%18 = extractvalue { i32, i32, i32, i32 } %17, 0, !dbg !15
|
21 |
+
%19 = extractvalue { i32, i32, i32, i32 } %17, 1, !dbg !15
|
22 |
+
%20 = extractvalue { i32, i32, i32, i32 } %17, 2, !dbg !15
|
23 |
+
%21 = extractvalue { i32, i32, i32, i32 } %17, 3, !dbg !15
|
24 |
+
%22 = bitcast i32 %18 to float, !dbg !15
|
25 |
+
%23 = bitcast i32 %19 to float, !dbg !15
|
26 |
+
%24 = bitcast i32 %20 to float, !dbg !15
|
27 |
+
%25 = bitcast i32 %21 to float, !dbg !15
|
28 |
+
%26 = getelementptr i16, ptr addrspace(1) %1, i64 %15, !dbg !16
|
29 |
+
%27 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
|
30 |
+
%28 = extractvalue { i32, i32 } %27, 0, !dbg !17
|
31 |
+
%29 = extractvalue { i32, i32 } %27, 1, !dbg !17
|
32 |
+
%30 = trunc i32 %28 to i16, !dbg !17
|
33 |
+
%extelt.offset = lshr i32 %28, 16, !dbg !17
|
34 |
+
%31 = trunc i32 %extelt.offset to i16, !dbg !17
|
35 |
+
%32 = trunc i32 %29 to i16, !dbg !17
|
36 |
+
%extelt.offset1 = lshr i32 %29, 16, !dbg !17
|
37 |
+
%33 = trunc i32 %extelt.offset1 to i16, !dbg !17
|
38 |
+
%34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
|
39 |
+
%35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
|
40 |
+
%36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
|
41 |
+
%37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #6, !dbg !18
|
42 |
+
%38 = zext nneg i32 %11 to i64, !dbg !19
|
43 |
+
%39 = getelementptr float, ptr addrspace(1) %2, i64 %38, !dbg !19
|
44 |
+
%40 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %39, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
|
45 |
+
%41 = fadd float %34, %22, !dbg !21
|
46 |
+
%42 = fadd float %35, %23, !dbg !21
|
47 |
+
%43 = fadd float %36, %24, !dbg !21
|
48 |
+
%44 = fadd float %37, %25, !dbg !21
|
49 |
+
%45 = fadd float %41, %42, !dbg !22
|
50 |
+
%46 = fadd float %45, %43, !dbg !22
|
51 |
+
%47 = fadd float %46, %44, !dbg !22
|
52 |
+
%48 = bitcast float %47 to i32, !dbg !28
|
53 |
+
%49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 16, i32 31), !dbg !28
|
54 |
+
%50 = bitcast i32 %49 to float, !dbg !28
|
55 |
+
%51 = fadd float %47, %50, !dbg !22
|
56 |
+
%52 = bitcast float %51 to i32, !dbg !28
|
57 |
+
%53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 8, i32 31), !dbg !28
|
58 |
+
%54 = bitcast i32 %53 to float, !dbg !28
|
59 |
+
%55 = fadd float %51, %54, !dbg !22
|
60 |
+
%56 = bitcast float %55 to i32, !dbg !28
|
61 |
+
%57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 4, i32 31), !dbg !28
|
62 |
+
%58 = bitcast i32 %57 to float, !dbg !28
|
63 |
+
%59 = fadd float %55, %58, !dbg !22
|
64 |
+
%60 = bitcast float %59 to i32, !dbg !28
|
65 |
+
%61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 2, i32 31), !dbg !28
|
66 |
+
%62 = bitcast i32 %61 to float, !dbg !28
|
67 |
+
%63 = fadd float %59, %62, !dbg !22
|
68 |
+
%64 = bitcast float %63 to i32, !dbg !28
|
69 |
+
%65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 1, i32 31), !dbg !28
|
70 |
+
%66 = bitcast i32 %65 to float, !dbg !28
|
71 |
+
%67 = fadd float %63, %66, !dbg !22
|
72 |
+
%68 = icmp eq i32 %8, 0, !dbg !28
|
73 |
+
%69 = zext nneg i32 %10 to i64, !dbg !28
|
74 |
+
%70 = getelementptr float, ptr addrspace(3) @global_smem, i64 %69, !dbg !28
|
75 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, float %67, i1 %68) #6, !dbg !28
|
76 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !28
|
77 |
+
%71 = icmp slt i32 %7, 2, !dbg !28
|
78 |
+
%72 = sext i32 %7 to i64, !dbg !28
|
79 |
+
%73 = getelementptr float, ptr addrspace(3) @global_smem, i64 %72, !dbg !28
|
80 |
+
%74 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %73, i1 %71) #6, !dbg !28
|
81 |
+
%75 = bitcast float %74 to i32, !dbg !28
|
82 |
+
%76 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %75, i32 1, i32 31), !dbg !28
|
83 |
+
%77 = bitcast i32 %76 to float, !dbg !28
|
84 |
+
%78 = fadd float %74, %77, !dbg !22
|
85 |
+
%79 = and i32 %7, 1, !dbg !28
|
86 |
+
%80 = icmp eq i32 %79, 0, !dbg !28
|
87 |
+
%81 = and i1 %71, %80, !dbg !28
|
88 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %73, float %78, i1 %81) #6, !dbg !28
|
89 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !28
|
90 |
+
%82 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !28
|
91 |
+
%83 = fadd float %82, 0.000000e+00, !dbg !30
|
92 |
+
%84 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %83, float 2.560000e+02) #6, !dbg !34
|
93 |
+
%85 = fsub float %41, %84, !dbg !35
|
94 |
+
%86 = fsub float %42, %84, !dbg !35
|
95 |
+
%87 = fsub float %43, %84, !dbg !35
|
96 |
+
%88 = fsub float %44, %84, !dbg !35
|
97 |
+
%89 = fmul float %85, %85, !dbg !36
|
98 |
+
%90 = fmul float %86, %86, !dbg !36
|
99 |
+
%91 = fmul float %87, %87, !dbg !36
|
100 |
+
%92 = fmul float %88, %88, !dbg !36
|
101 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !37
|
102 |
+
%93 = fadd float %89, %90, !dbg !39
|
103 |
+
%94 = fadd float %91, %93, !dbg !39
|
104 |
+
%95 = fadd float %92, %94, !dbg !39
|
105 |
+
%96 = bitcast float %95 to i32, !dbg !37
|
106 |
+
%97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 16, i32 31), !dbg !37
|
107 |
+
%98 = bitcast i32 %97 to float, !dbg !37
|
108 |
+
%99 = fadd float %95, %98, !dbg !39
|
109 |
+
%100 = bitcast float %99 to i32, !dbg !37
|
110 |
+
%101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 8, i32 31), !dbg !37
|
111 |
+
%102 = bitcast i32 %101 to float, !dbg !37
|
112 |
+
%103 = fadd float %99, %102, !dbg !39
|
113 |
+
%104 = bitcast float %103 to i32, !dbg !37
|
114 |
+
%105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 4, i32 31), !dbg !37
|
115 |
+
%106 = bitcast i32 %105 to float, !dbg !37
|
116 |
+
%107 = fadd float %103, %106, !dbg !39
|
117 |
+
%108 = bitcast float %107 to i32, !dbg !37
|
118 |
+
%109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 2, i32 31), !dbg !37
|
119 |
+
%110 = bitcast i32 %109 to float, !dbg !37
|
120 |
+
%111 = fadd float %107, %110, !dbg !39
|
121 |
+
%112 = bitcast float %111 to i32, !dbg !37
|
122 |
+
%113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !37
|
123 |
+
%114 = bitcast i32 %113 to float, !dbg !37
|
124 |
+
%115 = fadd float %111, %114, !dbg !39
|
125 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, float %115, i1 %68) #6, !dbg !37
|
126 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !37
|
127 |
+
%116 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %73, i1 %71) #6, !dbg !37
|
128 |
+
%117 = bitcast float %116 to i32, !dbg !37
|
129 |
+
%118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 1, i32 31), !dbg !37
|
130 |
+
%119 = bitcast i32 %118 to float, !dbg !37
|
131 |
+
%120 = fadd float %116, %119, !dbg !39
|
132 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %73, float %120, i1 %81) #6, !dbg !37
|
133 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !37
|
134 |
+
%121 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !37
|
135 |
+
%122 = fadd float %121, 0.000000e+00, !dbg !42
|
136 |
+
%123 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %122, float 2.560000e+02) #6, !dbg !44
|
137 |
+
%124 = fadd float %123, 0x3EE4F8B580000000, !dbg !45
|
138 |
+
%125 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46
|
139 |
+
%.not.i = icmp eq i32 %125, 0, !dbg !46
|
140 |
+
br i1 %.not.i, label %128, label %126, !dbg !46
|
141 |
+
|
142 |
+
126: ; preds = %6
|
143 |
+
%127 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %124), !dbg !46
|
144 |
+
br label %__nv_rsqrtf.exit, !dbg !46
|
145 |
+
|
146 |
+
128: ; preds = %6
|
147 |
+
%129 = tail call float @llvm.nvvm.rsqrt.approx.f(float %124), !dbg !46
|
148 |
+
br label %__nv_rsqrtf.exit, !dbg !46
|
149 |
+
|
150 |
+
__nv_rsqrtf.exit: ; preds = %126, %128
|
151 |
+
%.0.i = phi float [ %127, %126 ], [ %129, %128 ], !dbg !46
|
152 |
+
%130 = extractvalue { i32, i32, i32, i32 } %40, 3, !dbg !20
|
153 |
+
%131 = bitcast i32 %130 to float, !dbg !20
|
154 |
+
%132 = extractvalue { i32, i32, i32, i32 } %40, 2, !dbg !20
|
155 |
+
%133 = bitcast i32 %132 to float, !dbg !20
|
156 |
+
%134 = extractvalue { i32, i32, i32, i32 } %40, 1, !dbg !20
|
157 |
+
%135 = bitcast i32 %134 to float, !dbg !20
|
158 |
+
%136 = extractvalue { i32, i32, i32, i32 } %40, 0, !dbg !20
|
159 |
+
%137 = bitcast i32 %136 to float, !dbg !20
|
160 |
+
%138 = fmul float %85, %.0.i, !dbg !47
|
161 |
+
%139 = fmul float %86, %.0.i, !dbg !47
|
162 |
+
%140 = fmul float %87, %.0.i, !dbg !47
|
163 |
+
%141 = fmul float %88, %.0.i, !dbg !47
|
164 |
+
%142 = fmul float %138, %137, !dbg !48
|
165 |
+
%143 = fmul float %139, %135, !dbg !48
|
166 |
+
%144 = fmul float %140, %133, !dbg !48
|
167 |
+
%145 = fmul float %141, %131, !dbg !48
|
168 |
+
%146 = getelementptr i16, ptr addrspace(1) %3, i64 %15, !dbg !49
|
169 |
+
%147 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %142) #6, !dbg !50
|
170 |
+
%148 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %143) #6, !dbg !50
|
171 |
+
%149 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %144) #6, !dbg !50
|
172 |
+
%150 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %145) #6, !dbg !50
|
173 |
+
%151 = insertelement <2 x i16> undef, i16 %147, i64 0, !dbg !50
|
174 |
+
%152 = insertelement <2 x i16> %151, i16 %148, i64 1, !dbg !50
|
175 |
+
%153 = bitcast <2 x i16> %152 to i32, !dbg !50
|
176 |
+
%154 = insertelement <2 x i16> undef, i16 %149, i64 0, !dbg !50
|
177 |
+
%155 = insertelement <2 x i16> %154, i16 %150, i64 1, !dbg !50
|
178 |
+
%156 = bitcast <2 x i16> %155 to i32, !dbg !50
|
179 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %153, i32 %156, ptr addrspace(1) %146, i1 true) #6, !dbg !50
|
180 |
+
ret void, !dbg !51
|
181 |
+
}
|
182 |
+
|
183 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
184 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
185 |
+
|
186 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
187 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
188 |
+
|
189 |
+
; Function Attrs: convergent nocallback nounwind
|
190 |
+
declare void @llvm.nvvm.barrier0() #2
|
191 |
+
|
192 |
+
; Function Attrs: alwaysinline nounwind
|
193 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
194 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
195 |
+
%.not = icmp eq i32 %1, 0
|
196 |
+
br i1 %.not, label %4, label %2
|
197 |
+
|
198 |
+
2: ; preds = %0
|
199 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
200 |
+
br label %6
|
201 |
+
|
202 |
+
4: ; preds = %0
|
203 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
204 |
+
br label %6
|
205 |
+
|
206 |
+
6: ; preds = %4, %2
|
207 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
208 |
+
ret float %.0
|
209 |
+
}
|
210 |
+
|
211 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
212 |
+
|
213 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
214 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
215 |
+
|
216 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
217 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
218 |
+
|
219 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
220 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
221 |
+
attributes #2 = { convergent nocallback nounwind }
|
222 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
223 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
224 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
225 |
+
attributes #6 = { nounwind }
|
226 |
+
|
227 |
+
!llvm.module.flags = !{!0, !1}
|
228 |
+
!llvm.dbg.cu = !{!2}
|
229 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
230 |
+
!llvm.ident = !{!6}
|
231 |
+
|
232 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
233 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
234 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
235 |
+
!3 = !DIFile(filename: "cqh2dj355iatjzvi5cmz4txvjd3ap52shgash4czifdcnafnkkam.py", directory: "/tmp/torchinductor_root/qh")
|
236 |
+
!4 = !{ptr @triton__0d1d2d3d4de5de, !"kernel", i32 1}
|
237 |
+
!5 = !{ptr @triton__0d1d2d3d4de5de, !"maxntidx", i32 64}
|
238 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
239 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4de5de", linkageName: "triton__0d1d2d3d4de5de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
240 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
241 |
+
!9 = !{}
|
242 |
+
!10 = !DILocation(line: 26, column: 26, scope: !7)
|
243 |
+
!11 = !DILocation(line: 23, column: 28, scope: !7)
|
244 |
+
!12 = !DILocation(line: 30, column: 40, scope: !7)
|
245 |
+
!13 = !DILocation(line: 30, column: 36, scope: !7)
|
246 |
+
!14 = !DILocation(line: 30, column: 30, scope: !7)
|
247 |
+
!15 = !DILocation(line: 30, column: 46, scope: !7)
|
248 |
+
!16 = !DILocation(line: 31, column: 30, scope: !7)
|
249 |
+
!17 = !DILocation(line: 31, column: 46, scope: !7)
|
250 |
+
!18 = !DILocation(line: 31, column: 67, scope: !7)
|
251 |
+
!19 = !DILocation(line: 32, column: 31, scope: !7)
|
252 |
+
!20 = !DILocation(line: 32, column: 36, scope: !7)
|
253 |
+
!21 = !DILocation(line: 34, column: 18, scope: !7)
|
254 |
+
!22 = !DILocation(line: 233, column: 15, scope: !23, inlinedAt: !26)
|
255 |
+
!23 = distinct !DILexicalBlockFile(scope: !25, file: !24, discriminator: 0)
|
256 |
+
!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
257 |
+
!25 = distinct !DILexicalBlockFile(scope: !7, file: !24, discriminator: 0)
|
258 |
+
!26 = !DILocation(line: 243, column: 36, scope: !23, inlinedAt: !27)
|
259 |
+
!27 = !DILocation(line: 39, column: 58, scope: !23)
|
260 |
+
!28 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !29)
|
261 |
+
!29 = !DILocation(line: 39, column: 58, scope: !25)
|
262 |
+
!30 = !DILocation(line: 8, column: 15, scope: !31, inlinedAt: !33)
|
263 |
+
!31 = distinct !DILexicalBlockFile(scope: !7, file: !32, discriminator: 0)
|
264 |
+
!32 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
265 |
+
!33 = !DILocation(line: 39, column: 45, scope: !31)
|
266 |
+
!34 = !DILocation(line: 42, column: 20, scope: !7)
|
267 |
+
!35 = !DILocation(line: 43, column: 19, scope: !7)
|
268 |
+
!36 = !DILocation(line: 44, column: 20, scope: !7)
|
269 |
+
!37 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !38)
|
270 |
+
!38 = !DILocation(line: 47, column: 59, scope: !25)
|
271 |
+
!39 = !DILocation(line: 233, column: 15, scope: !23, inlinedAt: !40)
|
272 |
+
!40 = !DILocation(line: 243, column: 36, scope: !23, inlinedAt: !41)
|
273 |
+
!41 = !DILocation(line: 47, column: 59, scope: !23)
|
274 |
+
!42 = !DILocation(line: 8, column: 15, scope: !31, inlinedAt: !43)
|
275 |
+
!43 = !DILocation(line: 47, column: 45, scope: !31)
|
276 |
+
!44 = !DILocation(line: 50, column: 20, scope: !7)
|
277 |
+
!45 = !DILocation(line: 52, column: 20, scope: !7)
|
278 |
+
!46 = !DILocation(line: 53, column: 26, scope: !7)
|
279 |
+
!47 = !DILocation(line: 54, column: 20, scope: !7)
|
280 |
+
!48 = !DILocation(line: 55, column: 20, scope: !7)
|
281 |
+
!49 = !DILocation(line: 57, column: 25, scope: !7)
|
282 |
+
!50 = !DILocation(line: 57, column: 48, scope: !7)
|
283 |
+
!51 = !DILocation(line: 57, column: 4, scope: !7)
|
.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ptx
ADDED
@@ -0,0 +1,687 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4de5de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
12 |
+
|
13 |
+
.visible .entry triton__0d1d2d3d4de5de(
|
14 |
+
.param .u64 triton__0d1d2d3d4de5de_param_0,
|
15 |
+
.param .u64 triton__0d1d2d3d4de5de_param_1,
|
16 |
+
.param .u64 triton__0d1d2d3d4de5de_param_2,
|
17 |
+
.param .u64 triton__0d1d2d3d4de5de_param_3,
|
18 |
+
.param .u32 triton__0d1d2d3d4de5de_param_4,
|
19 |
+
.param .u32 triton__0d1d2d3d4de5de_param_5
|
20 |
+
)
|
21 |
+
.maxntid 64, 1, 1
|
22 |
+
{
|
23 |
+
.reg .pred %p<23>;
|
24 |
+
.reg .b16 %rs<9>;
|
25 |
+
.reg .b32 %r<84>;
|
26 |
+
.reg .f32 %f<70>;
|
27 |
+
.reg .b64 %rd<12>;
|
28 |
+
.loc 1 18 0
|
29 |
+
$L__func_begin0:
|
30 |
+
.loc 1 18 0
|
31 |
+
|
32 |
+
ld.param.u64 %rd5, [triton__0d1d2d3d4de5de_param_0];
|
33 |
+
ld.param.u64 %rd6, [triton__0d1d2d3d4de5de_param_1];
|
34 |
+
$L__tmp0:
|
35 |
+
.loc 1 26 26
|
36 |
+
mov.u32 %r50, %tid.x;
|
37 |
+
and.b32 %r51, %r50, 31;
|
38 |
+
ld.param.u64 %rd7, [triton__0d1d2d3d4de5de_param_2];
|
39 |
+
ld.param.u64 %rd8, [triton__0d1d2d3d4de5de_param_3];
|
40 |
+
shl.b32 %r52, %r50, 2;
|
41 |
+
and.b32 %r53, %r52, 252;
|
42 |
+
.loc 1 23 28
|
43 |
+
mov.u32 %r1, %ctaid.x;
|
44 |
+
.loc 1 30 40
|
45 |
+
shl.b32 %r54, %r1, 8;
|
46 |
+
.loc 1 30 36
|
47 |
+
or.b32 %r55, %r54, %r53;
|
48 |
+
.loc 1 30 30
|
49 |
+
mul.wide.s32 %rd9, %r55, 4;
|
50 |
+
add.s64 %rd1, %rd5, %rd9;
|
51 |
+
mov.b32 %r6, 0;
|
52 |
+
mov.pred %p1, -1;
|
53 |
+
.loc 1 30 46
|
54 |
+
mov.u32 %r2, 0x0;
|
55 |
+
mov.u32 %r3, 0x0;
|
56 |
+
mov.u32 %r4, 0x0;
|
57 |
+
mov.u32 %r5, 0x0;
|
58 |
+
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
59 |
+
@!%p1 mov.u32 %r2, %r6;
|
60 |
+
@!%p1 mov.u32 %r3, %r6;
|
61 |
+
@!%p1 mov.u32 %r4, %r6;
|
62 |
+
@!%p1 mov.u32 %r5, %r6;
|
63 |
+
mov.b32 %f1, %r2;
|
64 |
+
mov.b32 %f2, %r3;
|
65 |
+
mov.b32 %f3, %r4;
|
66 |
+
mov.b32 %f4, %r5;
|
67 |
+
.loc 1 31 30
|
68 |
+
mul.wide.s32 %rd10, %r55, 2;
|
69 |
+
add.s64 %rd2, %rd6, %rd10;
|
70 |
+
.loc 1 31 46
|
71 |
+
mov.u32 %r10, 0x0;
|
72 |
+
mov.u32 %r11, 0x0;
|
73 |
+
@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
|
74 |
+
@!%p1 mov.u32 %r10, %r6;
|
75 |
+
@!%p1 mov.u32 %r11, %r6;
|
76 |
+
cvt.u16.u32 %rs1, %r10;
|
77 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
|
78 |
+
cvt.u16.u32 %rs3, %r11;
|
79 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
|
80 |
+
.loc 1 31 67
|
81 |
+
cvt.f32.bf16 %r14, %rs1;
|
82 |
+
mov.b32 %f5, %r14;
|
83 |
+
cvt.f32.bf16 %r15, %rs2;
|
84 |
+
mov.b32 %f6, %r15;
|
85 |
+
cvt.f32.bf16 %r16, %rs3;
|
86 |
+
mov.b32 %f7, %r16;
|
87 |
+
cvt.f32.bf16 %r17, %rs4;
|
88 |
+
mov.b32 %f8, %r17;
|
89 |
+
.loc 1 32 31
|
90 |
+
mul.wide.u32 %rd11, %r53, 4;
|
91 |
+
add.s64 %rd3, %rd7, %rd11;
|
92 |
+
.loc 1 32 36
|
93 |
+
mov.u32 %r18, 0x0;
|
94 |
+
mov.u32 %r19, 0x0;
|
95 |
+
mov.u32 %r20, 0x0;
|
96 |
+
mov.u32 %r21, 0x0;
|
97 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
|
98 |
+
@!%p1 mov.u32 %r18, %r6;
|
99 |
+
@!%p1 mov.u32 %r19, %r6;
|
100 |
+
@!%p1 mov.u32 %r20, %r6;
|
101 |
+
@!%p1 mov.u32 %r21, %r6;
|
102 |
+
.loc 1 34 18
|
103 |
+
add.f32 %f9, %f5, %f1;
|
104 |
+
add.f32 %f10, %f6, %f2;
|
105 |
+
add.f32 %f11, %f7, %f3;
|
106 |
+
add.f32 %f12, %f8, %f4;
|
107 |
+
$L__tmp1:
|
108 |
+
.loc 2 233 15
|
109 |
+
add.f32 %f13, %f9, %f10;
|
110 |
+
add.f32 %f14, %f13, %f11;
|
111 |
+
add.f32 %f15, %f14, %f12;
|
112 |
+
$L__tmp2:
|
113 |
+
.loc 2 243 36
|
114 |
+
mov.b32 %r56, %f15;
|
115 |
+
shfl.sync.bfly.b32 %r57, %r56, 16, 31, -1;
|
116 |
+
mov.b32 %f16, %r57;
|
117 |
+
$L__tmp3:
|
118 |
+
.loc 2 233 15
|
119 |
+
add.f32 %f17, %f15, %f16;
|
120 |
+
$L__tmp4:
|
121 |
+
.loc 2 243 36
|
122 |
+
mov.b32 %r58, %f17;
|
123 |
+
shfl.sync.bfly.b32 %r59, %r58, 8, 31, -1;
|
124 |
+
mov.b32 %f18, %r59;
|
125 |
+
$L__tmp5:
|
126 |
+
.loc 2 233 15
|
127 |
+
add.f32 %f19, %f17, %f18;
|
128 |
+
$L__tmp6:
|
129 |
+
.loc 2 243 36
|
130 |
+
mov.b32 %r60, %f19;
|
131 |
+
shfl.sync.bfly.b32 %r61, %r60, 4, 31, -1;
|
132 |
+
mov.b32 %f20, %r61;
|
133 |
+
$L__tmp7:
|
134 |
+
.loc 2 233 15
|
135 |
+
add.f32 %f21, %f19, %f20;
|
136 |
+
$L__tmp8:
|
137 |
+
.loc 2 243 36
|
138 |
+
mov.b32 %r62, %f21;
|
139 |
+
shfl.sync.bfly.b32 %r63, %r62, 2, 31, -1;
|
140 |
+
mov.b32 %f22, %r63;
|
141 |
+
$L__tmp9:
|
142 |
+
.loc 2 233 15
|
143 |
+
add.f32 %f23, %f21, %f22;
|
144 |
+
$L__tmp10:
|
145 |
+
.loc 2 243 36
|
146 |
+
mov.b32 %r64, %f23;
|
147 |
+
shfl.sync.bfly.b32 %r65, %r64, 1, 31, -1;
|
148 |
+
mov.b32 %f24, %r65;
|
149 |
+
$L__tmp11:
|
150 |
+
.loc 2 233 15
|
151 |
+
add.f32 %f25, %f23, %f24;
|
152 |
+
$L__tmp12:
|
153 |
+
.loc 2 243 36
|
154 |
+
setp.eq.s32 %p14, %r51, 0;
|
155 |
+
shr.u32 %r66, %r50, 3;
|
156 |
+
and.b32 %r67, %r66, 4;
|
157 |
+
mov.u32 %r68, global_smem;
|
158 |
+
add.s32 %r26, %r68, %r67;
|
159 |
+
mov.b32 %r27, %f25;
|
160 |
+
@%p14 st.shared.b32 [ %r26 + 0 ], %r27;
|
161 |
+
bar.sync 0;
|
162 |
+
setp.lt.s32 %p15, %r50, 2;
|
163 |
+
add.s32 %r29, %r68, %r52;
|
164 |
+
@%p15 ld.shared.b32 %r28, [ %r29 + 0 ];
|
165 |
+
mov.b32 %f26, %r28;
|
166 |
+
shfl.sync.bfly.b32 %r69, %r28, 1, 31, -1;
|
167 |
+
mov.b32 %f27, %r69;
|
168 |
+
$L__tmp13:
|
169 |
+
.loc 2 233 15
|
170 |
+
add.f32 %f28, %f26, %f27;
|
171 |
+
$L__tmp14:
|
172 |
+
.loc 2 243 36
|
173 |
+
and.b32 %r70, %r50, 1;
|
174 |
+
setp.eq.b32 %p21, %r70, 1;
|
175 |
+
not.pred %p22, %p21;
|
176 |
+
and.pred %p16, %p15, %p22;
|
177 |
+
mov.b32 %r31, %f28;
|
178 |
+
@%p16 st.shared.b32 [ %r29 + 0 ], %r31;
|
179 |
+
bar.sync 0;
|
180 |
+
ld.shared.f32 %f29, [global_smem];
|
181 |
+
$L__tmp15:
|
182 |
+
.loc 3 8 15
|
183 |
+
add.f32 %f30, %f29, 0f00000000;
|
184 |
+
$L__tmp16:
|
185 |
+
.loc 1 42 20
|
186 |
+
mov.b32 %r33, %f30;
|
187 |
+
mov.b32 %r34, 1132462080;
|
188 |
+
div.full.f32 %r32, %r33, %r34;
|
189 |
+
mov.b32 %f31, %r32;
|
190 |
+
.loc 1 43 19
|
191 |
+
sub.f32 %f32, %f9, %f31;
|
192 |
+
sub.f32 %f33, %f10, %f31;
|
193 |
+
sub.f32 %f34, %f11, %f31;
|
194 |
+
sub.f32 %f35, %f12, %f31;
|
195 |
+
.loc 1 44 20
|
196 |
+
mul.f32 %f36, %f33, %f33;
|
197 |
+
$L__tmp17:
|
198 |
+
.loc 2 243 36
|
199 |
+
bar.sync 0;
|
200 |
+
$L__tmp18:
|
201 |
+
.loc 2 233 15
|
202 |
+
fma.rn.f32 %f37, %f32, %f32, %f36;
|
203 |
+
fma.rn.f32 %f38, %f34, %f34, %f37;
|
204 |
+
fma.rn.f32 %f39, %f35, %f35, %f38;
|
205 |
+
$L__tmp19:
|
206 |
+
.loc 2 243 36
|
207 |
+
mov.b32 %r71, %f39;
|
208 |
+
shfl.sync.bfly.b32 %r72, %r71, 16, 31, -1;
|
209 |
+
mov.b32 %f40, %r72;
|
210 |
+
$L__tmp20:
|
211 |
+
.loc 2 233 15
|
212 |
+
add.f32 %f41, %f39, %f40;
|
213 |
+
$L__tmp21:
|
214 |
+
.loc 2 243 36
|
215 |
+
mov.b32 %r73, %f41;
|
216 |
+
shfl.sync.bfly.b32 %r74, %r73, 8, 31, -1;
|
217 |
+
mov.b32 %f42, %r74;
|
218 |
+
$L__tmp22:
|
219 |
+
.loc 2 233 15
|
220 |
+
add.f32 %f43, %f41, %f42;
|
221 |
+
$L__tmp23:
|
222 |
+
.loc 2 243 36
|
223 |
+
mov.b32 %r75, %f43;
|
224 |
+
shfl.sync.bfly.b32 %r76, %r75, 4, 31, -1;
|
225 |
+
mov.b32 %f44, %r76;
|
226 |
+
$L__tmp24:
|
227 |
+
.loc 2 233 15
|
228 |
+
add.f32 %f45, %f43, %f44;
|
229 |
+
$L__tmp25:
|
230 |
+
.loc 2 243 36
|
231 |
+
mov.b32 %r77, %f45;
|
232 |
+
shfl.sync.bfly.b32 %r78, %r77, 2, 31, -1;
|
233 |
+
mov.b32 %f46, %r78;
|
234 |
+
$L__tmp26:
|
235 |
+
.loc 2 233 15
|
236 |
+
add.f32 %f47, %f45, %f46;
|
237 |
+
$L__tmp27:
|
238 |
+
.loc 2 243 36
|
239 |
+
mov.b32 %r79, %f47;
|
240 |
+
shfl.sync.bfly.b32 %r80, %r79, 1, 31, -1;
|
241 |
+
mov.b32 %f48, %r80;
|
242 |
+
$L__tmp28:
|
243 |
+
.loc 2 233 15
|
244 |
+
add.f32 %f49, %f47, %f48;
|
245 |
+
$L__tmp29:
|
246 |
+
.loc 2 243 36
|
247 |
+
mov.b32 %r36, %f49;
|
248 |
+
@%p14 st.shared.b32 [ %r26 + 0 ], %r36;
|
249 |
+
bar.sync 0;
|
250 |
+
@%p15 ld.shared.b32 %r37, [ %r29 + 0 ];
|
251 |
+
mov.b32 %f50, %r37;
|
252 |
+
shfl.sync.bfly.b32 %r81, %r37, 1, 31, -1;
|
253 |
+
mov.b32 %f51, %r81;
|
254 |
+
$L__tmp30:
|
255 |
+
.loc 2 233 15
|
256 |
+
add.f32 %f52, %f50, %f51;
|
257 |
+
$L__tmp31:
|
258 |
+
.loc 2 243 36
|
259 |
+
mov.b32 %r40, %f52;
|
260 |
+
@%p16 st.shared.b32 [ %r29 + 0 ], %r40;
|
261 |
+
bar.sync 0;
|
262 |
+
ld.shared.f32 %f53, [global_smem];
|
263 |
+
$L__tmp32:
|
264 |
+
.loc 3 8 15
|
265 |
+
add.f32 %f54, %f53, 0f00000000;
|
266 |
+
$L__tmp33:
|
267 |
+
.loc 1 50 20
|
268 |
+
mov.b32 %r42, %f54;
|
269 |
+
div.full.f32 %r41, %r42, %r34;
|
270 |
+
mov.b32 %f55, %r41;
|
271 |
+
.loc 1 52 20
|
272 |
+
add.f32 %f56, %f55, 0f3727C5AC;
|
273 |
+
.loc 1 53 26
|
274 |
+
rsqrt.approx.ftz.f32 %f57, %f56;
|
275 |
+
.loc 1 32 36
|
276 |
+
mov.b32 %f58, %r21;
|
277 |
+
mov.b32 %f59, %r20;
|
278 |
+
mov.b32 %f60, %r19;
|
279 |
+
mov.b32 %f61, %r18;
|
280 |
+
.loc 1 54 20
|
281 |
+
mul.f32 %f62, %f32, %f57;
|
282 |
+
mul.f32 %f63, %f33, %f57;
|
283 |
+
mul.f32 %f64, %f34, %f57;
|
284 |
+
mul.f32 %f65, %f35, %f57;
|
285 |
+
.loc 1 55 20
|
286 |
+
mul.f32 %f66, %f62, %f61;
|
287 |
+
mul.f32 %f67, %f63, %f60;
|
288 |
+
mul.f32 %f68, %f64, %f59;
|
289 |
+
mul.f32 %f69, %f65, %f58;
|
290 |
+
.loc 1 57 25
|
291 |
+
add.s64 %rd4, %rd8, %rd10;
|
292 |
+
.loc 1 57 48
|
293 |
+
mov.b32 %r44, %f66;
|
294 |
+
cvt.rn.bf16.f32 %rs5, %r44;
|
295 |
+
mov.b32 %r45, %f67;
|
296 |
+
cvt.rn.bf16.f32 %rs6, %r45;
|
297 |
+
mov.b32 %r46, %f68;
|
298 |
+
cvt.rn.bf16.f32 %rs7, %r46;
|
299 |
+
mov.b32 %r47, %f69;
|
300 |
+
cvt.rn.bf16.f32 %rs8, %r47;
|
301 |
+
mov.b32 %r82, {%rs5, %rs6};
|
302 |
+
mov.b32 %r83, {%rs7, %rs8};
|
303 |
+
@%p1 st.global.v2.b32 [ %rd4 + 0 ], { %r82, %r83 };
|
304 |
+
.loc 1 57 4
|
305 |
+
ret;
|
306 |
+
$L__tmp34:
|
307 |
+
$L__func_end0:
|
308 |
+
|
309 |
+
}
|
310 |
+
// .globl __nv_rsqrtf
|
311 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
312 |
+
.param .b32 __nv_rsqrtf_param_0
|
313 |
+
)
|
314 |
+
{
|
315 |
+
.reg .f32 %f<3>;
|
316 |
+
$L__func_begin1:
|
317 |
+
|
318 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
319 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
320 |
+
st.param.f32 [func_retval0+0], %f2;
|
321 |
+
ret;
|
322 |
+
$L__func_end1:
|
323 |
+
|
324 |
+
}
|
325 |
+
.file 1 "/tmp/torchinductor_root/qh/cqh2dj355iatjzvi5cmz4txvjd3ap52shgash4czifdcnafnkkam.py"
|
326 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
327 |
+
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
328 |
+
.section .debug_abbrev
|
329 |
+
{
|
330 |
+
.b8 1
|
331 |
+
.b8 17
|
332 |
+
.b8 1
|
333 |
+
.b8 37
|
334 |
+
.b8 8
|
335 |
+
.b8 19
|
336 |
+
.b8 5
|
337 |
+
.b8 3
|
338 |
+
.b8 8
|
339 |
+
.b8 16
|
340 |
+
.b8 6
|
341 |
+
.b8 27
|
342 |
+
.b8 8
|
343 |
+
.b8 180
|
344 |
+
.b8 66
|
345 |
+
.b8 12
|
346 |
+
.b8 17
|
347 |
+
.b8 1
|
348 |
+
.b8 18
|
349 |
+
.b8 1
|
350 |
+
.b8 0
|
351 |
+
.b8 0
|
352 |
+
.b8 2
|
353 |
+
.b8 46
|
354 |
+
.b8 0
|
355 |
+
.b8 135
|
356 |
+
.b8 64
|
357 |
+
.b8 8
|
358 |
+
.b8 3
|
359 |
+
.b8 8
|
360 |
+
.b8 58
|
361 |
+
.b8 11
|
362 |
+
.b8 59
|
363 |
+
.b8 11
|
364 |
+
.b8 63
|
365 |
+
.b8 12
|
366 |
+
.b8 32
|
367 |
+
.b8 11
|
368 |
+
.b8 0
|
369 |
+
.b8 0
|
370 |
+
.b8 3
|
371 |
+
.b8 46
|
372 |
+
.b8 1
|
373 |
+
.b8 17
|
374 |
+
.b8 1
|
375 |
+
.b8 18
|
376 |
+
.b8 1
|
377 |
+
.b8 64
|
378 |
+
.b8 10
|
379 |
+
.b8 49
|
380 |
+
.b8 19
|
381 |
+
.b8 0
|
382 |
+
.b8 0
|
383 |
+
.b8 4
|
384 |
+
.b8 29
|
385 |
+
.b8 1
|
386 |
+
.b8 49
|
387 |
+
.b8 19
|
388 |
+
.b8 17
|
389 |
+
.b8 1
|
390 |
+
.b8 18
|
391 |
+
.b8 1
|
392 |
+
.b8 88
|
393 |
+
.b8 11
|
394 |
+
.b8 89
|
395 |
+
.b8 11
|
396 |
+
.b8 87
|
397 |
+
.b8 11
|
398 |
+
.b8 0
|
399 |
+
.b8 0
|
400 |
+
.b8 5
|
401 |
+
.b8 29
|
402 |
+
.b8 0
|
403 |
+
.b8 49
|
404 |
+
.b8 19
|
405 |
+
.b8 17
|
406 |
+
.b8 1
|
407 |
+
.b8 18
|
408 |
+
.b8 1
|
409 |
+
.b8 88
|
410 |
+
.b8 11
|
411 |
+
.b8 89
|
412 |
+
.b8 11
|
413 |
+
.b8 87
|
414 |
+
.b8 11
|
415 |
+
.b8 0
|
416 |
+
.b8 0
|
417 |
+
.b8 0
|
418 |
+
}
|
419 |
+
.section .debug_info
|
420 |
+
{
|
421 |
+
.b32 391
|
422 |
+
.b8 2
|
423 |
+
.b8 0
|
424 |
+
.b32 .debug_abbrev
|
425 |
+
.b8 8
|
426 |
+
.b8 1
|
427 |
+
.b8 116
|
428 |
+
.b8 114
|
429 |
+
.b8 105
|
430 |
+
.b8 116
|
431 |
+
.b8 111
|
432 |
+
.b8 110
|
433 |
+
.b8 0
|
434 |
+
.b8 2
|
435 |
+
.b8 0
|
436 |
+
.b8 99
|
437 |
+
.b8 113
|
438 |
+
.b8 104
|
439 |
+
.b8 50
|
440 |
+
.b8 100
|
441 |
+
.b8 106
|
442 |
+
.b8 51
|
443 |
+
.b8 53
|
444 |
+
.b8 53
|
445 |
+
.b8 105
|
446 |
+
.b8 97
|
447 |
+
.b8 116
|
448 |
+
.b8 106
|
449 |
+
.b8 122
|
450 |
+
.b8 118
|
451 |
+
.b8 105
|
452 |
+
.b8 53
|
453 |
+
.b8 99
|
454 |
+
.b8 109
|
455 |
+
.b8 122
|
456 |
+
.b8 52
|
457 |
+
.b8 116
|
458 |
+
.b8 120
|
459 |
+
.b8 118
|
460 |
+
.b8 106
|
461 |
+
.b8 100
|
462 |
+
.b8 51
|
463 |
+
.b8 97
|
464 |
+
.b8 112
|
465 |
+
.b8 53
|
466 |
+
.b8 50
|
467 |
+
.b8 115
|
468 |
+
.b8 104
|
469 |
+
.b8 103
|
470 |
+
.b8 97
|
471 |
+
.b8 115
|
472 |
+
.b8 104
|
473 |
+
.b8 52
|
474 |
+
.b8 99
|
475 |
+
.b8 122
|
476 |
+
.b8 105
|
477 |
+
.b8 102
|
478 |
+
.b8 100
|
479 |
+
.b8 99
|
480 |
+
.b8 110
|
481 |
+
.b8 97
|
482 |
+
.b8 102
|
483 |
+
.b8 110
|
484 |
+
.b8 107
|
485 |
+
.b8 107
|
486 |
+
.b8 97
|
487 |
+
.b8 109
|
488 |
+
.b8 46
|
489 |
+
.b8 112
|
490 |
+
.b8 121
|
491 |
+
.b8 0
|
492 |
+
.b32 .debug_line
|
493 |
+
.b8 47
|
494 |
+
.b8 116
|
495 |
+
.b8 109
|
496 |
+
.b8 112
|
497 |
+
.b8 47
|
498 |
+
.b8 116
|
499 |
+
.b8 111
|
500 |
+
.b8 114
|
501 |
+
.b8 99
|
502 |
+
.b8 104
|
503 |
+
.b8 105
|
504 |
+
.b8 110
|
505 |
+
.b8 100
|
506 |
+
.b8 117
|
507 |
+
.b8 99
|
508 |
+
.b8 116
|
509 |
+
.b8 111
|
510 |
+
.b8 114
|
511 |
+
.b8 95
|
512 |
+
.b8 114
|
513 |
+
.b8 111
|
514 |
+
.b8 111
|
515 |
+
.b8 116
|
516 |
+
.b8 47
|
517 |
+
.b8 113
|
518 |
+
.b8 104
|
519 |
+
.b8 0
|
520 |
+
.b8 1
|
521 |
+
.b64 $L__func_begin0
|
522 |
+
.b64 $L__func_end0
|
523 |
+
.b8 2
|
524 |
+
.b8 116
|
525 |
+
.b8 114
|
526 |
+
.b8 105
|
527 |
+
.b8 116
|
528 |
+
.b8 111
|
529 |
+
.b8 110
|
530 |
+
.b8 95
|
531 |
+
.b8 95
|
532 |
+
.b8 48
|
533 |
+
.b8 100
|
534 |
+
.b8 49
|
535 |
+
.b8 100
|
536 |
+
.b8 50
|
537 |
+
.b8 100
|
538 |
+
.b8 51
|
539 |
+
.b8 100
|
540 |
+
.b8 52
|
541 |
+
.b8 100
|
542 |
+
.b8 101
|
543 |
+
.b8 53
|
544 |
+
.b8 100
|
545 |
+
.b8 101
|
546 |
+
.b8 0
|
547 |
+
.b8 116
|
548 |
+
.b8 114
|
549 |
+
.b8 105
|
550 |
+
.b8 116
|
551 |
+
.b8 111
|
552 |
+
.b8 110
|
553 |
+
.b8 95
|
554 |
+
.b8 95
|
555 |
+
.b8 48
|
556 |
+
.b8 100
|
557 |
+
.b8 49
|
558 |
+
.b8 100
|
559 |
+
.b8 50
|
560 |
+
.b8 100
|
561 |
+
.b8 51
|
562 |
+
.b8 100
|
563 |
+
.b8 52
|
564 |
+
.b8 100
|
565 |
+
.b8 101
|
566 |
+
.b8 53
|
567 |
+
.b8 100
|
568 |
+
.b8 101
|
569 |
+
.b8 0
|
570 |
+
.b8 1
|
571 |
+
.b8 18
|
572 |
+
.b8 1
|
573 |
+
.b8 1
|
574 |
+
.b8 3
|
575 |
+
.b64 $L__func_begin0
|
576 |
+
.b64 $L__func_end0
|
577 |
+
.b8 1
|
578 |
+
.b8 156
|
579 |
+
.b32 125
|
580 |
+
.b8 4
|
581 |
+
.b32 125
|
582 |
+
.b64 $L__tmp1
|
583 |
+
.b64 $L__tmp14
|
584 |
+
.b8 2
|
585 |
+
.b8 39
|
586 |
+
.b8 58
|
587 |
+
.b8 5
|
588 |
+
.b32 125
|
589 |
+
.b64 $L__tmp1
|
590 |
+
.b64 $L__tmp14
|
591 |
+
.b8 2
|
592 |
+
.b8 243
|
593 |
+
.b8 36
|
594 |
+
.b8 0
|
595 |
+
.b8 5
|
596 |
+
.b32 125
|
597 |
+
.b64 $L__tmp2
|
598 |
+
.b64 $L__tmp15
|
599 |
+
.b8 2
|
600 |
+
.b8 39
|
601 |
+
.b8 58
|
602 |
+
.b8 5
|
603 |
+
.b32 125
|
604 |
+
.b64 $L__tmp15
|
605 |
+
.b64 $L__tmp16
|
606 |
+
.b8 3
|
607 |
+
.b8 39
|
608 |
+
.b8 45
|
609 |
+
.b8 5
|
610 |
+
.b32 125
|
611 |
+
.b64 $L__tmp17
|
612 |
+
.b64 $L__tmp32
|
613 |
+
.b8 2
|
614 |
+
.b8 47
|
615 |
+
.b8 59
|
616 |
+
.b8 4
|
617 |
+
.b32 125
|
618 |
+
.b64 $L__tmp18
|
619 |
+
.b64 $L__tmp31
|
620 |
+
.b8 2
|
621 |
+
.b8 47
|
622 |
+
.b8 59
|
623 |
+
.b8 5
|
624 |
+
.b32 125
|
625 |
+
.b64 $L__tmp18
|
626 |
+
.b64 $L__tmp31
|
627 |
+
.b8 2
|
628 |
+
.b8 243
|
629 |
+
.b8 36
|
630 |
+
.b8 0
|
631 |
+
.b8 5
|
632 |
+
.b32 125
|
633 |
+
.b64 $L__tmp32
|
634 |
+
.b64 $L__tmp33
|
635 |
+
.b8 3
|
636 |
+
.b8 47
|
637 |
+
.b8 45
|
638 |
+
.b8 0
|
639 |
+
.b8 0
|
640 |
+
}
|
641 |
+
.section .debug_pubnames
|
642 |
+
{
|
643 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
644 |
+
$L__pubNames_start0:
|
645 |
+
.b8 2
|
646 |
+
.b8 0
|
647 |
+
.b32 .debug_info
|
648 |
+
.b32 395
|
649 |
+
.b32 125
|
650 |
+
.b8 116
|
651 |
+
.b8 114
|
652 |
+
.b8 105
|
653 |
+
.b8 116
|
654 |
+
.b8 111
|
655 |
+
.b8 110
|
656 |
+
.b8 95
|
657 |
+
.b8 95
|
658 |
+
.b8 48
|
659 |
+
.b8 100
|
660 |
+
.b8 49
|
661 |
+
.b8 100
|
662 |
+
.b8 50
|
663 |
+
.b8 100
|
664 |
+
.b8 51
|
665 |
+
.b8 100
|
666 |
+
.b8 52
|
667 |
+
.b8 100
|
668 |
+
.b8 101
|
669 |
+
.b8 53
|
670 |
+
.b8 100
|
671 |
+
.b8 101
|
672 |
+
.b8 0
|
673 |
+
.b32 0
|
674 |
+
$L__pubNames_end0:
|
675 |
+
}
|
676 |
+
.section .debug_pubtypes
|
677 |
+
{
|
678 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
679 |
+
$L__pubTypes_start0:
|
680 |
+
.b8 2
|
681 |
+
.b8 0
|
682 |
+
.b32 .debug_info
|
683 |
+
.b32 395
|
684 |
+
.b32 0
|
685 |
+
$L__pubTypes_end0:
|
686 |
+
}
|
687 |
+
.section .debug_loc { }
|
.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttgir
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d3d4de5de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
5 |
+
%cst_0 = arith.constant 9.99999974E-6 : f32
|
6 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
7 |
+
%cst_2 = arith.constant 0.000000e+00 : f32
|
8 |
+
%c256_i32 = arith.constant 256 : i32
|
9 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
10 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
11 |
+
%0 = tt.get_program_id x : i32
|
12 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
13 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
14 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
15 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
|
16 |
+
%5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
|
17 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
18 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
19 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
20 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
21 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
22 |
+
%11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
23 |
+
%12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
24 |
+
%13 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
25 |
+
%14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
26 |
+
%15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
27 |
+
%16 = arith.addf %8, %12 : tensor<256xf32, #blocked>
|
28 |
+
%17 = arith.select %2, %16, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
29 |
+
%18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({
|
30 |
+
^bb0(%arg6: f32, %arg7: f32):
|
31 |
+
%36 = arith.addf %arg6, %arg7 : f32
|
32 |
+
tt.reduce.return %36 : f32
|
33 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
34 |
+
%19 = arith.addf %18, %cst_2 : f32
|
35 |
+
%20 = arith.divf %19, %cst_1 : f32
|
36 |
+
%21 = tt.splat %20 : (f32) -> tensor<256xf32, #blocked>
|
37 |
+
%22 = arith.subf %16, %21 : tensor<256xf32, #blocked>
|
38 |
+
%23 = arith.mulf %22, %22 : tensor<256xf32, #blocked>
|
39 |
+
%24 = arith.select %2, %23, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
40 |
+
%25 = "tt.reduce"(%24) <{axis = 0 : i32}> ({
|
41 |
+
^bb0(%arg6: f32, %arg7: f32):
|
42 |
+
%36 = arith.addf %arg6, %arg7 : f32
|
43 |
+
tt.reduce.return %36 : f32
|
44 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
45 |
+
%26 = arith.addf %25, %cst_2 : f32
|
46 |
+
%27 = arith.divf %26, %cst_1 : f32
|
47 |
+
%28 = arith.addf %27, %cst_0 : f32
|
48 |
+
%29 = tt.extern_elementwise %28 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
49 |
+
%30 = tt.splat %29 : (f32) -> tensor<256xf32, #blocked>
|
50 |
+
%31 = arith.mulf %22, %30 : tensor<256xf32, #blocked>
|
51 |
+
%32 = arith.mulf %31, %15 : tensor<256xf32, #blocked>
|
52 |
+
%33 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
53 |
+
%34 = tt.addptr %33, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
54 |
+
%35 = arith.truncf %32 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
|
55 |
+
tt.store %34, %35, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
|
56 |
+
tt.return
|
57 |
+
}
|
58 |
+
}
|
.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttir
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4de5de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c256_i32 = arith.constant 256 : i32
|
4 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
|
5 |
+
%cst_0 = arith.constant 0.000000e+00 : f32
|
6 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
7 |
+
%cst_2 = arith.constant 9.99999974E-6 : f32
|
8 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
|
9 |
+
%cst_4 = arith.constant dense<256> : tensor<256xi32>
|
10 |
+
%0 = tt.get_program_id x : i32
|
11 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
12 |
+
%2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
|
13 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
14 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32>
|
15 |
+
%5 = arith.addi %1, %4 : tensor<256xi32>
|
16 |
+
%6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
17 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
18 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
19 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
20 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
21 |
+
%11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
22 |
+
%12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
|
23 |
+
%13 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
24 |
+
%14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
25 |
+
%15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
|
26 |
+
%16 = arith.addf %8, %12 : tensor<256xf32>
|
27 |
+
%17 = arith.select %2, %16, %cst_3 : tensor<256xi1>, tensor<256xf32>
|
28 |
+
%18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({
|
29 |
+
^bb0(%arg6: f32, %arg7: f32):
|
30 |
+
%36 = arith.addf %arg6, %arg7 : f32
|
31 |
+
tt.reduce.return %36 : f32
|
32 |
+
}) : (tensor<256xf32>) -> f32
|
33 |
+
%19 = arith.addf %18, %cst_0 : f32
|
34 |
+
%20 = arith.divf %19, %cst_1 : f32
|
35 |
+
%21 = tt.splat %20 : (f32) -> tensor<256xf32>
|
36 |
+
%22 = arith.subf %16, %21 : tensor<256xf32>
|
37 |
+
%23 = arith.mulf %22, %22 : tensor<256xf32>
|
38 |
+
%24 = arith.select %2, %23, %cst_3 : tensor<256xi1>, tensor<256xf32>
|
39 |
+
%25 = "tt.reduce"(%24) <{axis = 0 : i32}> ({
|
40 |
+
^bb0(%arg6: f32, %arg7: f32):
|
41 |
+
%36 = arith.addf %arg6, %arg7 : f32
|
42 |
+
tt.reduce.return %36 : f32
|
43 |
+
}) : (tensor<256xf32>) -> f32
|
44 |
+
%26 = arith.addf %25, %cst_0 : f32
|
45 |
+
%27 = arith.divf %26, %cst_1 : f32
|
46 |
+
%28 = arith.addf %27, %cst_2 : f32
|
47 |
+
%29 = tt.extern_elementwise %28 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
48 |
+
%30 = tt.splat %29 : (f32) -> tensor<256xf32>
|
49 |
+
%31 = arith.mulf %22, %30 : tensor<256xf32>
|
50 |
+
%32 = arith.mulf %31, %15 : tensor<256xf32>
|
51 |
+
%33 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
52 |
+
%34 = tt.addptr %33, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
53 |
+
%35 = arith.truncf %32 : tensor<256xf32> to tensor<256xbf16>
|
54 |
+
tt.store %34, %35, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
|
55 |
+
tt.return
|
56 |
+
}
|
57 |
+
}
|
.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.cubin
ADDED
Binary file (60 kB). View file
|
|
.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ptx
ADDED
@@ -0,0 +1,1854 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6de7de
|
10 |
+
.extern .func __assertfail
|
11 |
+
(
|
12 |
+
.param .b64 __assertfail_param_0,
|
13 |
+
.param .b64 __assertfail_param_1,
|
14 |
+
.param .b32 __assertfail_param_2,
|
15 |
+
.param .b64 __assertfail_param_3,
|
16 |
+
.param .b64 __assertfail_param_4
|
17 |
+
)
|
18 |
+
;
|
19 |
+
.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
20 |
+
.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
21 |
+
.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
|
22 |
+
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
23 |
+
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
24 |
+
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
|
25 |
+
.extern .shared .align 1 .b8 global_smem[];
|
26 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
27 |
+
|
28 |
+
.visible .entry triton__0d1d2d3d4d5d6de7de(
|
29 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
|
30 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
|
31 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
|
32 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
|
33 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
|
34 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
|
35 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
|
36 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
|
37 |
+
)
|
38 |
+
.maxntid 256, 1, 1
|
39 |
+
{
|
40 |
+
.reg .pred %p<137>;
|
41 |
+
.reg .b16 %rs<49>;
|
42 |
+
.reg .b32 %r<439>;
|
43 |
+
.reg .f32 %f<487>;
|
44 |
+
.reg .b64 %rd<124>;
|
45 |
+
.loc 1 18 0
|
46 |
+
$L__func_begin0:
|
47 |
+
.loc 1 18 0
|
48 |
+
|
49 |
+
ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6de7de_param_4];
|
50 |
+
ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6de7de_param_1];
|
51 |
+
ld.param.u64 %rd59, [triton__0d1d2d3d4d5d6de7de_param_0];
|
52 |
+
$L__tmp0:
|
53 |
+
.loc 1 22 44
|
54 |
+
mov.u32 %r89, %tid.x;
|
55 |
+
ld.param.u64 %rd60, [triton__0d1d2d3d4d5d6de7de_param_2];
|
56 |
+
bfe.u32 %r90, %r89, 5, 3;
|
57 |
+
ld.param.u64 %rd61, [triton__0d1d2d3d4d5d6de7de_param_3];
|
58 |
+
and.b32 %r91, %r89, 15;
|
59 |
+
.loc 1 24 33
|
60 |
+
shl.b32 %r92, %r89, 3;
|
61 |
+
and.b32 %r1, %r92, 248;
|
62 |
+
and.b32 %r2, %r89, 255;
|
63 |
+
.loc 1 21 28
|
64 |
+
mov.u32 %r24, %ctaid.x;
|
65 |
+
.loc 1 21 33
|
66 |
+
shl.b32 %r93, %r24, 4;
|
67 |
+
.loc 1 22 23
|
68 |
+
or.b32 %r94, %r93, %r90;
|
69 |
+
or.b32 %r95, %r94, 8;
|
70 |
+
or.b32 %r96, %r93, %r91;
|
71 |
+
.loc 1 26 30
|
72 |
+
mul.wide.s32 %rd62, %r94, 8;
|
73 |
+
add.s64 %rd20, %rd59, %rd62;
|
74 |
+
add.s64 %rd36, %rd20, 64;
|
75 |
+
mul.wide.s32 %rd63, %r96, 8;
|
76 |
+
add.s64 %rd52, %rd59, %rd63;
|
77 |
+
mov.pred %p113, -1;
|
78 |
+
.loc 1 26 35
|
79 |
+
mov.u64 %rd19, 0x0;
|
80 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd20 + 0 ];
|
81 |
+
mov.u64 %rd21, 0x0;
|
82 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd20 + 0 ];
|
83 |
+
mov.u64 %rd23, 0x0;
|
84 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd20 + 0 ];
|
85 |
+
mov.u64 %rd25, 0x0;
|
86 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd20 + 0 ];
|
87 |
+
mov.u64 %rd27, 0x0;
|
88 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd20 + 0 ];
|
89 |
+
mov.u64 %rd29, 0x0;
|
90 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd29 }, [ %rd20 + 0 ];
|
91 |
+
mov.u64 %rd31, 0x0;
|
92 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd31 }, [ %rd20 + 0 ];
|
93 |
+
mov.u64 %rd33, 0x0;
|
94 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd33 }, [ %rd20 + 0 ];
|
95 |
+
mov.u64 %rd35, 0x0;
|
96 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd35 }, [ %rd36 + 0 ];
|
97 |
+
mov.u64 %rd37, 0x0;
|
98 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd36 + 0 ];
|
99 |
+
mov.u64 %rd39, 0x0;
|
100 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd36 + 0 ];
|
101 |
+
mov.u64 %rd41, 0x0;
|
102 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd41 }, [ %rd36 + 0 ];
|
103 |
+
mov.u64 %rd43, 0x0;
|
104 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd43 }, [ %rd36 + 0 ];
|
105 |
+
mov.u64 %rd45, 0x0;
|
106 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd45 }, [ %rd36 + 0 ];
|
107 |
+
mov.u64 %rd47, 0x0;
|
108 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd47 }, [ %rd36 + 0 ];
|
109 |
+
mov.u64 %rd49, 0x0;
|
110 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd49 }, [ %rd36 + 0 ];
|
111 |
+
mov.u64 %rd51, 0x0;
|
112 |
+
@%p113 ld.global.L1::evict_last.b64 { %rd51 }, [ %rd52 + 0 ];
|
113 |
+
.loc 1 27 18
|
114 |
+
bfe.s32 %r97, %r24, 27, 1;
|
115 |
+
shr.u32 %r98, %r97, 23;
|
116 |
+
add.s32 %r99, %r94, %r98;
|
117 |
+
and.b32 %r100, %r99, 16776704;
|
118 |
+
sub.s32 %r101, %r94, %r100;
|
119 |
+
add.s32 %r102, %r95, %r98;
|
120 |
+
and.b32 %r103, %r102, 16776704;
|
121 |
+
sub.s32 %r104, %r95, %r103;
|
122 |
+
.loc 1 35 44
|
123 |
+
shl.b32 %r105, %r101, 8;
|
124 |
+
shl.b32 %r106, %r104, 8;
|
125 |
+
.loc 1 35 40
|
126 |
+
or.b32 %r107, %r105, %r1;
|
127 |
+
or.b32 %r108, %r106, %r1;
|
128 |
+
.loc 1 35 34
|
129 |
+
mul.wide.s32 %rd64, %r107, 4;
|
130 |
+
add.s64 %rd89, %rd60, %rd64;
|
131 |
+
cvt.s64.s32 %rd65, %r105;
|
132 |
+
cvt.u64.u32 %rd66, %r1;
|
133 |
+
or.b64 %rd67, %rd65, %rd66;
|
134 |
+
shl.b64 %rd68, %rd67, 2;
|
135 |
+
add.s64 %rd69, %rd60, %rd68;
|
136 |
+
add.s64 %rd90, %rd69, 16;
|
137 |
+
mul.wide.s32 %rd70, %r108, 4;
|
138 |
+
add.s64 %rd91, %rd60, %rd70;
|
139 |
+
cvt.s64.s32 %rd71, %r106;
|
140 |
+
or.b64 %rd72, %rd71, %rd66;
|
141 |
+
shl.b64 %rd73, %rd72, 2;
|
142 |
+
add.s64 %rd74, %rd60, %rd73;
|
143 |
+
add.s64 %rd92, %rd74, 16;
|
144 |
+
mov.b32 %r325, 0;
|
145 |
+
.loc 1 35 50
|
146 |
+
mov.u32 %r25, 0x0;
|
147 |
+
mov.u32 %r26, 0x0;
|
148 |
+
mov.u32 %r27, 0x0;
|
149 |
+
mov.u32 %r28, 0x0;
|
150 |
+
@%p113 ld.global.L1::evict_last.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd89 + 0 ];
|
151 |
+
@!%p113 mov.u32 %r25, %r325;
|
152 |
+
@!%p113 mov.u32 %r26, %r325;
|
153 |
+
@!%p113 mov.u32 %r27, %r325;
|
154 |
+
@!%p113 mov.u32 %r28, %r325;
|
155 |
+
mov.b32 %f1, %r25;
|
156 |
+
mov.b32 %f2, %r26;
|
157 |
+
mov.b32 %f3, %r27;
|
158 |
+
mov.b32 %f4, %r28;
|
159 |
+
mov.u32 %r33, 0x0;
|
160 |
+
mov.u32 %r34, 0x0;
|
161 |
+
mov.u32 %r35, 0x0;
|
162 |
+
mov.u32 %r36, 0x0;
|
163 |
+
@%p113 ld.global.L1::evict_last.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd90 + 0 ];
|
164 |
+
@!%p113 mov.u32 %r33, %r325;
|
165 |
+
@!%p113 mov.u32 %r34, %r325;
|
166 |
+
@!%p113 mov.u32 %r35, %r325;
|
167 |
+
@!%p113 mov.u32 %r36, %r325;
|
168 |
+
mov.b32 %f5, %r33;
|
169 |
+
mov.b32 %f6, %r34;
|
170 |
+
mov.b32 %f7, %r35;
|
171 |
+
mov.b32 %f8, %r36;
|
172 |
+
mov.u32 %r41, 0x0;
|
173 |
+
mov.u32 %r42, 0x0;
|
174 |
+
mov.u32 %r43, 0x0;
|
175 |
+
mov.u32 %r44, 0x0;
|
176 |
+
@%p113 ld.global.L1::evict_last.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd91 + 0 ];
|
177 |
+
@!%p113 mov.u32 %r41, %r325;
|
178 |
+
@!%p113 mov.u32 %r42, %r325;
|
179 |
+
@!%p113 mov.u32 %r43, %r325;
|
180 |
+
@!%p113 mov.u32 %r44, %r325;
|
181 |
+
mov.b32 %f9, %r41;
|
182 |
+
mov.b32 %f10, %r42;
|
183 |
+
mov.b32 %f11, %r43;
|
184 |
+
mov.b32 %f12, %r44;
|
185 |
+
mov.u32 %r49, 0x0;
|
186 |
+
mov.u32 %r50, 0x0;
|
187 |
+
mov.u32 %r51, 0x0;
|
188 |
+
mov.u32 %r52, 0x0;
|
189 |
+
@%p113 ld.global.L1::evict_last.v4.b32 { %r49, %r50, %r51, %r52 }, [ %rd92 + 0 ];
|
190 |
+
@!%p113 mov.u32 %r49, %r325;
|
191 |
+
@!%p113 mov.u32 %r50, %r325;
|
192 |
+
@!%p113 mov.u32 %r51, %r325;
|
193 |
+
@!%p113 mov.u32 %r52, %r325;
|
194 |
+
mov.b32 %f13, %r49;
|
195 |
+
mov.b32 %f14, %r50;
|
196 |
+
mov.b32 %f15, %r51;
|
197 |
+
mov.b32 %f16, %r52;
|
198 |
+
.loc 1 36 44
|
199 |
+
shl.b32 %r109, %r94, 8;
|
200 |
+
shl.b32 %r110, %r95, 8;
|
201 |
+
.loc 1 36 40
|
202 |
+
or.b32 %r111, %r109, %r1;
|
203 |
+
or.b32 %r112, %r110, %r1;
|
204 |
+
.loc 1 36 34
|
205 |
+
mul.wide.s32 %rd75, %r111, 2;
|
206 |
+
add.s64 %rd93, %rd61, %rd75;
|
207 |
+
mul.wide.s32 %rd76, %r112, 2;
|
208 |
+
add.s64 %rd94, %rd61, %rd76;
|
209 |
+
.loc 1 36 50
|
210 |
+
mov.u32 %r57, 0x0;
|
211 |
+
mov.u32 %r58, 0x0;
|
212 |
+
mov.u32 %r59, 0x0;
|
213 |
+
mov.u32 %r60, 0x0;
|
214 |
+
@%p113 ld.global.L1::evict_last.v4.b32 { %r57, %r58, %r59, %r60 }, [ %rd93 + 0 ];
|
215 |
+
@!%p113 mov.u32 %r57, %r325;
|
216 |
+
@!%p113 mov.u32 %r58, %r325;
|
217 |
+
@!%p113 mov.u32 %r59, %r325;
|
218 |
+
@!%p113 mov.u32 %r60, %r325;
|
219 |
+
cvt.u16.u32 %rs1, %r57;
|
220 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r57; }
|
221 |
+
cvt.u16.u32 %rs3, %r58;
|
222 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r58; }
|
223 |
+
cvt.u16.u32 %rs5, %r59;
|
224 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r59; }
|
225 |
+
cvt.u16.u32 %rs7, %r60;
|
226 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r60; }
|
227 |
+
mov.u32 %r65, 0x0;
|
228 |
+
mov.u32 %r66, 0x0;
|
229 |
+
mov.u32 %r67, 0x0;
|
230 |
+
mov.u32 %r68, 0x0;
|
231 |
+
@%p113 ld.global.L1::evict_last.v4.b32 { %r65, %r66, %r67, %r68 }, [ %rd94 + 0 ];
|
232 |
+
@!%p113 mov.u32 %r65, %r325;
|
233 |
+
@!%p113 mov.u32 %r66, %r325;
|
234 |
+
@!%p113 mov.u32 %r67, %r325;
|
235 |
+
@!%p113 mov.u32 %r68, %r325;
|
236 |
+
cvt.u16.u32 %rs9, %r65;
|
237 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r65; }
|
238 |
+
cvt.u16.u32 %rs11, %r66;
|
239 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r66; }
|
240 |
+
cvt.u16.u32 %rs13, %r67;
|
241 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r67; }
|
242 |
+
cvt.u16.u32 %rs15, %r68;
|
243 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r68; }
|
244 |
+
.loc 1 36 101
|
245 |
+
cvt.f32.bf16 %r73, %rs1;
|
246 |
+
mov.b32 %f17, %r73;
|
247 |
+
cvt.f32.bf16 %r74, %rs2;
|
248 |
+
mov.b32 %f18, %r74;
|
249 |
+
cvt.f32.bf16 %r75, %rs3;
|
250 |
+
mov.b32 %f19, %r75;
|
251 |
+
cvt.f32.bf16 %r76, %rs4;
|
252 |
+
mov.b32 %f20, %r76;
|
253 |
+
cvt.f32.bf16 %r77, %rs5;
|
254 |
+
mov.b32 %f21, %r77;
|
255 |
+
cvt.f32.bf16 %r78, %rs6;
|
256 |
+
mov.b32 %f22, %r78;
|
257 |
+
cvt.f32.bf16 %r79, %rs7;
|
258 |
+
mov.b32 %f23, %r79;
|
259 |
+
cvt.f32.bf16 %r80, %rs8;
|
260 |
+
mov.b32 %f24, %r80;
|
261 |
+
cvt.f32.bf16 %r81, %rs9;
|
262 |
+
mov.b32 %f25, %r81;
|
263 |
+
cvt.f32.bf16 %r82, %rs10;
|
264 |
+
mov.b32 %f26, %r82;
|
265 |
+
cvt.f32.bf16 %r83, %rs11;
|
266 |
+
mov.b32 %f27, %r83;
|
267 |
+
cvt.f32.bf16 %r84, %rs12;
|
268 |
+
mov.b32 %f28, %r84;
|
269 |
+
cvt.f32.bf16 %r85, %rs13;
|
270 |
+
mov.b32 %f29, %r85;
|
271 |
+
cvt.f32.bf16 %r86, %rs14;
|
272 |
+
mov.b32 %f30, %r86;
|
273 |
+
cvt.f32.bf16 %r87, %rs15;
|
274 |
+
mov.b32 %f31, %r87;
|
275 |
+
cvt.f32.bf16 %r88, %rs16;
|
276 |
+
mov.b32 %f32, %r88;
|
277 |
+
.loc 1 37 22
|
278 |
+
add.s64 %rd77, %rd51, 50257;
|
279 |
+
.loc 1 38 22
|
280 |
+
setp.lt.s64 %p48, %rd51, 0;
|
281 |
+
.loc 1 39 36
|
282 |
+
selp.b64 %rd11, %rd77, %rd51, %p48;
|
283 |
+
.loc 1 40 40
|
284 |
+
setp.lt.u64 %p49, %rd11, 50257;
|
285 |
+
mov.b32 %r438, 883;
|
286 |
+
mov.u64 %rd123, 1;
|
287 |
+
.loc 1 40 55
|
288 |
+
@%p49 bra $L__BB0_2;
|
289 |
+
mov.u64 %rd78, assertMessage_0;
|
290 |
+
cvta.global.u64 %rd79, %rd78;
|
291 |
+
mov.u64 %rd80, assertFile_0;
|
292 |
+
cvta.global.u64 %rd81, %rd80;
|
293 |
+
mov.u64 %rd82, assertFunc_0;
|
294 |
+
cvta.global.u64 %rd83, %rd82;
|
295 |
+
{ // callseq 8, 0
|
296 |
+
.reg .b32 temp_param_reg;
|
297 |
+
.param .b64 param0;
|
298 |
+
st.param.b64 [param0+0], %rd79;
|
299 |
+
.param .b64 param1;
|
300 |
+
st.param.b64 [param1+0], %rd81;
|
301 |
+
.param .b32 param2;
|
302 |
+
st.param.b32 [param2+0], %r438;
|
303 |
+
.param .b64 param3;
|
304 |
+
st.param.b64 [param3+0], %rd83;
|
305 |
+
.param .b64 param4;
|
306 |
+
st.param.b64 [param4+0], %rd123;
|
307 |
+
call.uni
|
308 |
+
__assertfail,
|
309 |
+
(
|
310 |
+
param0,
|
311 |
+
param1,
|
312 |
+
param2,
|
313 |
+
param3,
|
314 |
+
param4
|
315 |
+
);
|
316 |
+
} // callseq 8
|
317 |
+
$L__BB0_2:
|
318 |
+
.loc 1 0 55
|
319 |
+
ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6de7de_param_5];
|
320 |
+
cvt.s64.s32 %rd7, %r111;
|
321 |
+
cvt.s64.s32 %rd9, %r112;
|
322 |
+
.loc 1 38 22
|
323 |
+
setp.lt.s64 %p103, %rd35, 0;
|
324 |
+
setp.lt.s64 %p104, %rd19, 0;
|
325 |
+
.loc 1 41 44
|
326 |
+
shl.b64 %rd96, %rd19, 8;
|
327 |
+
add.s64 %rd97, %rd96, 12865792;
|
328 |
+
selp.b64 %rd98, %rd97, %rd96, %p104;
|
329 |
+
shl.b64 %rd99, %rd35, 8;
|
330 |
+
add.s64 %rd100, %rd99, 12865792;
|
331 |
+
selp.b64 %rd101, %rd100, %rd99, %p103;
|
332 |
+
.loc 1 41 40
|
333 |
+
or.b64 %rd103, %rd98, %rd66;
|
334 |
+
or.b64 %rd104, %rd101, %rd66;
|
335 |
+
.loc 1 41 34
|
336 |
+
shl.b64 %rd105, %rd103, 2;
|
337 |
+
add.s64 %rd115, %rd16, %rd105;
|
338 |
+
add.s64 %rd116, %rd115, 16;
|
339 |
+
shl.b64 %rd106, %rd104, 2;
|
340 |
+
add.s64 %rd117, %rd16, %rd106;
|
341 |
+
add.s64 %rd118, %rd117, 16;
|
342 |
+
.loc 1 41 52
|
343 |
+
mov.u32 %r114, 0x0;
|
344 |
+
mov.u32 %r115, 0x0;
|
345 |
+
mov.u32 %r116, 0x0;
|
346 |
+
mov.u32 %r117, 0x0;
|
347 |
+
@%p113 ld.global.L1::evict_last.v4.b32 { %r114, %r115, %r116, %r117 }, [ %rd115 + 0 ];
|
348 |
+
@!%p113 mov.u32 %r114, %r325;
|
349 |
+
@!%p113 mov.u32 %r115, %r325;
|
350 |
+
@!%p113 mov.u32 %r116, %r325;
|
351 |
+
@!%p113 mov.u32 %r117, %r325;
|
352 |
+
mov.b32 %f59, %r114;
|
353 |
+
mov.b32 %f60, %r115;
|
354 |
+
mov.b32 %f61, %r116;
|
355 |
+
mov.b32 %f62, %r117;
|
356 |
+
mov.u32 %r122, 0x0;
|
357 |
+
mov.u32 %r123, 0x0;
|
358 |
+
mov.u32 %r124, 0x0;
|
359 |
+
mov.u32 %r125, 0x0;
|
360 |
+
@%p113 ld.global.L1::evict_last.v4.b32 { %r122, %r123, %r124, %r125 }, [ %rd116 + 0 ];
|
361 |
+
@!%p113 mov.u32 %r122, %r325;
|
362 |
+
@!%p113 mov.u32 %r123, %r325;
|
363 |
+
@!%p113 mov.u32 %r124, %r325;
|
364 |
+
@!%p113 mov.u32 %r125, %r325;
|
365 |
+
mov.b32 %f63, %r122;
|
366 |
+
mov.b32 %f64, %r123;
|
367 |
+
mov.b32 %f65, %r124;
|
368 |
+
mov.b32 %f66, %r125;
|
369 |
+
mov.u32 %r130, 0x0;
|
370 |
+
mov.u32 %r131, 0x0;
|
371 |
+
mov.u32 %r132, 0x0;
|
372 |
+
mov.u32 %r133, 0x0;
|
373 |
+
@%p113 ld.global.L1::evict_last.v4.b32 { %r130, %r131, %r132, %r133 }, [ %rd117 + 0 ];
|
374 |
+
@!%p113 mov.u32 %r130, %r325;
|
375 |
+
@!%p113 mov.u32 %r131, %r325;
|
376 |
+
@!%p113 mov.u32 %r132, %r325;
|
377 |
+
@!%p113 mov.u32 %r133, %r325;
|
378 |
+
mov.b32 %f67, %r130;
|
379 |
+
mov.b32 %f68, %r131;
|
380 |
+
mov.b32 %f69, %r132;
|
381 |
+
mov.b32 %f70, %r133;
|
382 |
+
mov.u32 %r138, 0x0;
|
383 |
+
mov.u32 %r139, 0x0;
|
384 |
+
mov.u32 %r140, 0x0;
|
385 |
+
mov.u32 %r141, 0x0;
|
386 |
+
@%p113 ld.global.L1::evict_last.v4.b32 { %r138, %r139, %r140, %r141 }, [ %rd118 + 0 ];
|
387 |
+
@!%p113 mov.u32 %r138, %r325;
|
388 |
+
@!%p113 mov.u32 %r139, %r325;
|
389 |
+
@!%p113 mov.u32 %r140, %r325;
|
390 |
+
@!%p113 mov.u32 %r141, %r325;
|
391 |
+
mov.b32 %f71, %r138;
|
392 |
+
mov.b32 %f72, %r139;
|
393 |
+
mov.b32 %f73, %r140;
|
394 |
+
mov.b32 %f74, %r141;
|
395 |
+
.loc 1 42 22
|
396 |
+
add.f32 %f75, %f1, %f59;
|
397 |
+
add.f32 %f76, %f2, %f60;
|
398 |
+
add.f32 %f77, %f3, %f61;
|
399 |
+
add.f32 %f78, %f4, %f62;
|
400 |
+
add.f32 %f79, %f5, %f63;
|
401 |
+
add.f32 %f80, %f6, %f64;
|
402 |
+
add.f32 %f81, %f7, %f65;
|
403 |
+
add.f32 %f82, %f8, %f66;
|
404 |
+
add.f32 %f83, %f9, %f67;
|
405 |
+
add.f32 %f84, %f10, %f68;
|
406 |
+
add.f32 %f85, %f11, %f69;
|
407 |
+
add.f32 %f86, %f12, %f70;
|
408 |
+
add.f32 %f87, %f13, %f71;
|
409 |
+
add.f32 %f88, %f14, %f72;
|
410 |
+
add.f32 %f89, %f15, %f73;
|
411 |
+
add.f32 %f90, %f16, %f74;
|
412 |
+
.loc 1 44 22
|
413 |
+
add.f32 %f91, %f17, %f75;
|
414 |
+
add.f32 %f92, %f18, %f76;
|
415 |
+
add.f32 %f93, %f19, %f77;
|
416 |
+
add.f32 %f94, %f20, %f78;
|
417 |
+
add.f32 %f95, %f21, %f79;
|
418 |
+
add.f32 %f96, %f22, %f80;
|
419 |
+
add.f32 %f97, %f23, %f81;
|
420 |
+
add.f32 %f98, %f24, %f82;
|
421 |
+
add.f32 %f99, %f25, %f83;
|
422 |
+
add.f32 %f100, %f26, %f84;
|
423 |
+
add.f32 %f101, %f27, %f85;
|
424 |
+
add.f32 %f102, %f28, %f86;
|
425 |
+
add.f32 %f103, %f29, %f87;
|
426 |
+
add.f32 %f104, %f30, %f88;
|
427 |
+
add.f32 %f105, %f31, %f89;
|
428 |
+
add.f32 %f106, %f32, %f90;
|
429 |
+
$L__tmp1:
|
430 |
+
.loc 2 98 22
|
431 |
+
add.f32 %f107, %f91, 0f00000000;
|
432 |
+
add.f32 %f108, %f92, 0f00000000;
|
433 |
+
add.f32 %f109, %f93, 0f00000000;
|
434 |
+
add.f32 %f110, %f94, 0f00000000;
|
435 |
+
add.f32 %f111, %f95, 0f00000000;
|
436 |
+
add.f32 %f112, %f96, 0f00000000;
|
437 |
+
add.f32 %f113, %f97, 0f00000000;
|
438 |
+
add.f32 %f114, %f98, 0f00000000;
|
439 |
+
add.f32 %f115, %f99, 0f00000000;
|
440 |
+
add.f32 %f116, %f100, 0f00000000;
|
441 |
+
add.f32 %f117, %f101, 0f00000000;
|
442 |
+
add.f32 %f118, %f102, 0f00000000;
|
443 |
+
add.f32 %f119, %f103, 0f00000000;
|
444 |
+
add.f32 %f120, %f104, 0f00000000;
|
445 |
+
add.f32 %f121, %f105, 0f00000000;
|
446 |
+
add.f32 %f122, %f106, 0f00000000;
|
447 |
+
.loc 2 101 30
|
448 |
+
sub.f32 %f123, %f91, %f107;
|
449 |
+
sub.f32 %f124, %f92, %f108;
|
450 |
+
sub.f32 %f125, %f93, %f109;
|
451 |
+
sub.f32 %f126, %f94, %f110;
|
452 |
+
sub.f32 %f127, %f95, %f111;
|
453 |
+
sub.f32 %f128, %f96, %f112;
|
454 |
+
sub.f32 %f129, %f97, %f113;
|
455 |
+
sub.f32 %f130, %f98, %f114;
|
456 |
+
sub.f32 %f131, %f99, %f115;
|
457 |
+
sub.f32 %f132, %f100, %f116;
|
458 |
+
sub.f32 %f133, %f101, %f117;
|
459 |
+
sub.f32 %f134, %f102, %f118;
|
460 |
+
sub.f32 %f135, %f103, %f119;
|
461 |
+
sub.f32 %f136, %f104, %f120;
|
462 |
+
sub.f32 %f137, %f105, %f121;
|
463 |
+
sub.f32 %f138, %f106, %f122;
|
464 |
+
.loc 2 101 13
|
465 |
+
fma.rn.f32 %f139, %f91, %f123, 0f00000000;
|
466 |
+
fma.rn.f32 %f140, %f92, %f124, 0f00000000;
|
467 |
+
fma.rn.f32 %f141, %f93, %f125, 0f00000000;
|
468 |
+
fma.rn.f32 %f142, %f94, %f126, 0f00000000;
|
469 |
+
fma.rn.f32 %f143, %f95, %f127, 0f00000000;
|
470 |
+
fma.rn.f32 %f144, %f96, %f128, 0f00000000;
|
471 |
+
fma.rn.f32 %f145, %f97, %f129, 0f00000000;
|
472 |
+
fma.rn.f32 %f146, %f98, %f130, 0f00000000;
|
473 |
+
fma.rn.f32 %f147, %f99, %f131, 0f00000000;
|
474 |
+
fma.rn.f32 %f148, %f100, %f132, 0f00000000;
|
475 |
+
fma.rn.f32 %f149, %f101, %f133, 0f00000000;
|
476 |
+
fma.rn.f32 %f150, %f102, %f134, 0f00000000;
|
477 |
+
fma.rn.f32 %f151, %f103, %f135, 0f00000000;
|
478 |
+
fma.rn.f32 %f152, %f104, %f136, 0f00000000;
|
479 |
+
fma.rn.f32 %f153, %f105, %f137, 0f00000000;
|
480 |
+
fma.rn.f32 %f154, %f106, %f138, 0f00000000;
|
481 |
+
$L__tmp2:
|
482 |
+
.loc 2 108 21
|
483 |
+
sub.f32 %f155, %f108, %f107;
|
484 |
+
mov.b32 %r147, 1065353216;
|
485 |
+
mov.b32 %r148, 1073741824;
|
486 |
+
.loc 2 110 60
|
487 |
+
div.full.f32 %r146, %r147, %r148;
|
488 |
+
mov.b32 %f156, %r146;
|
489 |
+
.loc 2 112 17
|
490 |
+
fma.rn.f32 %f157, %f156, %f155, %f107;
|
491 |
+
.loc 2 113 15
|
492 |
+
add.f32 %f158, %f139, %f140;
|
493 |
+
.loc 2 113 30
|
494 |
+
mul.f32 %f159, %f155, %f155;
|
495 |
+
.loc 2 113 22
|
496 |
+
fma.rn.f32 %f160, %f156, %f159, %f158;
|
497 |
+
.loc 2 108 21
|
498 |
+
sub.f32 %f161, %f109, %f157;
|
499 |
+
mov.b32 %r151, 1077936128;
|
500 |
+
.loc 2 110 60
|
501 |
+
div.full.f32 %r149, %r147, %r151;
|
502 |
+
mov.b32 %f162, %r149;
|
503 |
+
.loc 2 112 17
|
504 |
+
fma.rn.f32 %f163, %f162, %f161, %f157;
|
505 |
+
.loc 2 113 15
|
506 |
+
add.f32 %f164, %f141, %f160;
|
507 |
+
.loc 2 113 30
|
508 |
+
mul.f32 %f165, %f161, %f161;
|
509 |
+
.loc 2 113 38
|
510 |
+
fma.rn.f32 %f166, %f161, %f161, %f165;
|
511 |
+
.loc 2 113 22
|
512 |
+
fma.rn.f32 %f167, %f162, %f166, %f164;
|
513 |
+
.loc 2 108 21
|
514 |
+
sub.f32 %f168, %f110, %f163;
|
515 |
+
mov.b32 %r154, 1082130432;
|
516 |
+
.loc 2 110 60
|
517 |
+
div.full.f32 %r152, %r147, %r154;
|
518 |
+
mov.b32 %f169, %r152;
|
519 |
+
.loc 2 112 17
|
520 |
+
fma.rn.f32 %f170, %f169, %f168, %f163;
|
521 |
+
.loc 2 113 15
|
522 |
+
add.f32 %f171, %f142, %f167;
|
523 |
+
.loc 2 113 30
|
524 |
+
mul.f32 %f172, %f168, %f168;
|
525 |
+
.loc 2 113 38
|
526 |
+
mul.f32 %f173, %f172, 0f40400000;
|
527 |
+
.loc 2 113 22
|
528 |
+
fma.rn.f32 %f174, %f169, %f173, %f171;
|
529 |
+
.loc 2 108 21
|
530 |
+
sub.f32 %f175, %f111, %f170;
|
531 |
+
mov.b32 %r157, 1084227584;
|
532 |
+
.loc 2 110 60
|
533 |
+
div.full.f32 %r155, %r147, %r157;
|
534 |
+
mov.b32 %f176, %r155;
|
535 |
+
.loc 2 112 17
|
536 |
+
fma.rn.f32 %f177, %f176, %f175, %f170;
|
537 |
+
.loc 2 113 15
|
538 |
+
add.f32 %f178, %f143, %f174;
|
539 |
+
.loc 2 113 30
|
540 |
+
mul.f32 %f179, %f175, %f175;
|
541 |
+
.loc 2 113 38
|
542 |
+
mul.f32 %f180, %f179, 0f40800000;
|
543 |
+
.loc 2 113 22
|
544 |
+
fma.rn.f32 %f181, %f176, %f180, %f178;
|
545 |
+
.loc 2 108 21
|
546 |
+
sub.f32 %f182, %f112, %f177;
|
547 |
+
mov.b32 %r160, 1086324736;
|
548 |
+
.loc 2 110 60
|
549 |
+
div.full.f32 %r158, %r147, %r160;
|
550 |
+
mov.b32 %f183, %r158;
|
551 |
+
.loc 2 112 17
|
552 |
+
fma.rn.f32 %f184, %f183, %f182, %f177;
|
553 |
+
.loc 2 113 15
|
554 |
+
add.f32 %f185, %f144, %f181;
|
555 |
+
.loc 2 113 30
|
556 |
+
mul.f32 %f186, %f182, %f182;
|
557 |
+
.loc 2 113 38
|
558 |
+
mul.f32 %f187, %f186, 0f40A00000;
|
559 |
+
.loc 2 113 22
|
560 |
+
fma.rn.f32 %f188, %f183, %f187, %f185;
|
561 |
+
.loc 2 108 21
|
562 |
+
sub.f32 %f189, %f113, %f184;
|
563 |
+
mov.b32 %r163, 1088421888;
|
564 |
+
.loc 2 110 60
|
565 |
+
div.full.f32 %r161, %r147, %r163;
|
566 |
+
mov.b32 %f190, %r161;
|
567 |
+
.loc 2 112 17
|
568 |
+
fma.rn.f32 %f191, %f190, %f189, %f184;
|
569 |
+
.loc 2 113 15
|
570 |
+
add.f32 %f192, %f145, %f188;
|
571 |
+
.loc 2 113 30
|
572 |
+
mul.f32 %f193, %f189, %f189;
|
573 |
+
.loc 2 113 38
|
574 |
+
mul.f32 %f194, %f193, 0f40C00000;
|
575 |
+
.loc 2 113 22
|
576 |
+
fma.rn.f32 %f195, %f190, %f194, %f192;
|
577 |
+
.loc 2 108 21
|
578 |
+
sub.f32 %f196, %f114, %f191;
|
579 |
+
mov.b32 %r166, 1090519040;
|
580 |
+
.loc 2 110 60
|
581 |
+
div.full.f32 %r164, %r147, %r166;
|
582 |
+
mov.b32 %f197, %r164;
|
583 |
+
.loc 2 112 17
|
584 |
+
fma.rn.f32 %f198, %f197, %f196, %f191;
|
585 |
+
.loc 2 113 15
|
586 |
+
add.f32 %f199, %f146, %f195;
|
587 |
+
.loc 2 113 30
|
588 |
+
mul.f32 %f200, %f196, %f196;
|
589 |
+
.loc 2 113 38
|
590 |
+
mul.f32 %f201, %f200, 0f40E00000;
|
591 |
+
.loc 2 113 22
|
592 |
+
fma.rn.f32 %f202, %f197, %f201, %f199;
|
593 |
+
.loc 2 108 21
|
594 |
+
sub.f32 %f203, %f116, %f115;
|
595 |
+
.loc 2 110 60
|
596 |
+
div.full.f32 %r167, %r147, %r148;
|
597 |
+
mov.b32 %f204, %r167;
|
598 |
+
.loc 2 112 17
|
599 |
+
fma.rn.f32 %f205, %f203, %f204, %f115;
|
600 |
+
.loc 2 113 15
|
601 |
+
add.f32 %f206, %f147, %f148;
|
602 |
+
.loc 2 113 30
|
603 |
+
mul.f32 %f207, %f203, %f203;
|
604 |
+
.loc 2 113 22
|
605 |
+
fma.rn.f32 %f208, %f207, %f204, %f206;
|
606 |
+
.loc 2 108 21
|
607 |
+
sub.f32 %f209, %f117, %f205;
|
608 |
+
.loc 2 110 60
|
609 |
+
div.full.f32 %r170, %r147, %r151;
|
610 |
+
mov.b32 %f210, %r170;
|
611 |
+
.loc 2 112 17
|
612 |
+
fma.rn.f32 %f211, %f210, %f209, %f205;
|
613 |
+
.loc 2 113 15
|
614 |
+
add.f32 %f212, %f149, %f208;
|
615 |
+
.loc 2 113 30
|
616 |
+
mul.f32 %f213, %f209, %f209;
|
617 |
+
.loc 2 113 38
|
618 |
+
fma.rn.f32 %f214, %f209, %f209, %f213;
|
619 |
+
.loc 2 113 22
|
620 |
+
fma.rn.f32 %f215, %f210, %f214, %f212;
|
621 |
+
.loc 2 108 21
|
622 |
+
sub.f32 %f216, %f118, %f211;
|
623 |
+
.loc 2 110 60
|
624 |
+
div.full.f32 %r173, %r147, %r154;
|
625 |
+
mov.b32 %f217, %r173;
|
626 |
+
.loc 2 112 17
|
627 |
+
fma.rn.f32 %f218, %f217, %f216, %f211;
|
628 |
+
.loc 2 113 15
|
629 |
+
add.f32 %f219, %f150, %f215;
|
630 |
+
.loc 2 113 30
|
631 |
+
mul.f32 %f220, %f216, %f216;
|
632 |
+
.loc 2 113 38
|
633 |
+
mul.f32 %f221, %f220, 0f40400000;
|
634 |
+
.loc 2 113 22
|
635 |
+
fma.rn.f32 %f222, %f217, %f221, %f219;
|
636 |
+
.loc 2 108 21
|
637 |
+
sub.f32 %f223, %f119, %f218;
|
638 |
+
.loc 2 110 60
|
639 |
+
div.full.f32 %r176, %r147, %r157;
|
640 |
+
mov.b32 %f224, %r176;
|
641 |
+
.loc 2 112 17
|
642 |
+
fma.rn.f32 %f225, %f224, %f223, %f218;
|
643 |
+
.loc 2 113 15
|
644 |
+
add.f32 %f226, %f151, %f222;
|
645 |
+
.loc 2 113 30
|
646 |
+
mul.f32 %f227, %f223, %f223;
|
647 |
+
.loc 2 113 38
|
648 |
+
mul.f32 %f228, %f227, 0f40800000;
|
649 |
+
.loc 2 113 22
|
650 |
+
fma.rn.f32 %f229, %f224, %f228, %f226;
|
651 |
+
.loc 2 108 21
|
652 |
+
sub.f32 %f230, %f120, %f225;
|
653 |
+
.loc 2 110 60
|
654 |
+
div.full.f32 %r179, %r147, %r160;
|
655 |
+
mov.b32 %f231, %r179;
|
656 |
+
.loc 2 112 17
|
657 |
+
fma.rn.f32 %f232, %f231, %f230, %f225;
|
658 |
+
.loc 2 113 15
|
659 |
+
add.f32 %f233, %f152, %f229;
|
660 |
+
.loc 2 113 30
|
661 |
+
mul.f32 %f234, %f230, %f230;
|
662 |
+
.loc 2 113 38
|
663 |
+
mul.f32 %f235, %f234, 0f40A00000;
|
664 |
+
.loc 2 113 22
|
665 |
+
fma.rn.f32 %f236, %f231, %f235, %f233;
|
666 |
+
.loc 2 108 21
|
667 |
+
sub.f32 %f237, %f121, %f232;
|
668 |
+
.loc 2 110 60
|
669 |
+
div.full.f32 %r182, %r147, %r163;
|
670 |
+
mov.b32 %f238, %r182;
|
671 |
+
.loc 2 112 17
|
672 |
+
fma.rn.f32 %f239, %f238, %f237, %f232;
|
673 |
+
.loc 2 113 15
|
674 |
+
add.f32 %f240, %f153, %f236;
|
675 |
+
.loc 2 113 30
|
676 |
+
mul.f32 %f241, %f237, %f237;
|
677 |
+
.loc 2 113 38
|
678 |
+
mul.f32 %f242, %f241, 0f40C00000;
|
679 |
+
.loc 2 113 22
|
680 |
+
fma.rn.f32 %f243, %f238, %f242, %f240;
|
681 |
+
.loc 2 108 21
|
682 |
+
sub.f32 %f244, %f122, %f239;
|
683 |
+
.loc 2 110 60
|
684 |
+
div.full.f32 %r185, %r147, %r166;
|
685 |
+
mov.b32 %f245, %r185;
|
686 |
+
.loc 2 112 17
|
687 |
+
fma.rn.f32 %f246, %f245, %f244, %f239;
|
688 |
+
.loc 2 113 15
|
689 |
+
add.f32 %f247, %f154, %f243;
|
690 |
+
.loc 2 113 30
|
691 |
+
mul.f32 %f248, %f244, %f244;
|
692 |
+
.loc 2 113 38
|
693 |
+
mul.f32 %f249, %f248, 0f40E00000;
|
694 |
+
.loc 2 113 22
|
695 |
+
fma.rn.f32 %f250, %f245, %f249, %f247;
|
696 |
+
$L__tmp3:
|
697 |
+
.loc 2 120 46
|
698 |
+
mov.b32 %r284, %f198;
|
699 |
+
shfl.sync.bfly.b32 %r285, %r284, 16, 31, -1;
|
700 |
+
mov.b32 %f251, %r285;
|
701 |
+
mov.b32 %r286, %f202;
|
702 |
+
shfl.sync.bfly.b32 %r287, %r286, 16, 31, -1;
|
703 |
+
mov.b32 %f252, %r287;
|
704 |
+
shfl.sync.bfly.b32 %r189, %r166, 16, 31, -1;
|
705 |
+
mov.b32 %f253, %r189;
|
706 |
+
$L__tmp4:
|
707 |
+
.loc 2 108 21
|
708 |
+
sub.f32 %f254, %f251, %f198;
|
709 |
+
.loc 2 109 28
|
710 |
+
add.f32 %f255, %f253, 0f41000000;
|
711 |
+
.loc 2 110 39
|
712 |
+
setp.eq.f32 %p105, %f255, 0f00000000;
|
713 |
+
.loc 2 110 60
|
714 |
+
mov.b32 %r190, %f255;
|
715 |
+
div.full.f32 %r188, %r189, %r190;
|
716 |
+
mov.b32 %f256, %r188;
|
717 |
+
.loc 2 110 49
|
718 |
+
selp.f32 %f257, 0f00000000, %f256, %p105;
|
719 |
+
.loc 2 112 17
|
720 |
+
fma.rn.f32 %f258, %f257, %f254, %f198;
|
721 |
+
.loc 2 113 15
|
722 |
+
add.f32 %f259, %f202, %f252;
|
723 |
+
.loc 2 113 30
|
724 |
+
mul.f32 %f260, %f254, %f254;
|
725 |
+
.loc 2 113 38
|
726 |
+
mul.f32 %f261, %f260, 0f41000000;
|
727 |
+
.loc 2 113 22
|
728 |
+
fma.rn.f32 %f262, %f257, %f261, %f259;
|
729 |
+
$L__tmp5:
|
730 |
+
.loc 2 120 46
|
731 |
+
mov.b32 %r288, %f258;
|
732 |
+
shfl.sync.bfly.b32 %r289, %r288, 8, 31, -1;
|
733 |
+
mov.b32 %f263, %r289;
|
734 |
+
mov.b32 %r290, %f262;
|
735 |
+
shfl.sync.bfly.b32 %r291, %r290, 8, 31, -1;
|
736 |
+
mov.b32 %f264, %r291;
|
737 |
+
shfl.sync.bfly.b32 %r192, %r190, 8, 31, -1;
|
738 |
+
mov.b32 %f265, %r192;
|
739 |
+
$L__tmp6:
|
740 |
+
.loc 2 108 21
|
741 |
+
sub.f32 %f266, %f263, %f258;
|
742 |
+
.loc 2 109 28
|
743 |
+
add.f32 %f267, %f255, %f265;
|
744 |
+
.loc 2 110 39
|
745 |
+
setp.eq.f32 %p106, %f267, 0f00000000;
|
746 |
+
.loc 2 110 60
|
747 |
+
mov.b32 %r193, %f267;
|
748 |
+
div.full.f32 %r191, %r192, %r193;
|
749 |
+
mov.b32 %f268, %r191;
|
750 |
+
.loc 2 110 49
|
751 |
+
selp.f32 %f269, 0f00000000, %f268, %p106;
|
752 |
+
.loc 2 112 17
|
753 |
+
fma.rn.f32 %f270, %f269, %f266, %f258;
|
754 |
+
.loc 2 113 15
|
755 |
+
add.f32 %f271, %f262, %f264;
|
756 |
+
.loc 2 113 30
|
757 |
+
mul.f32 %f272, %f266, %f266;
|
758 |
+
.loc 2 113 38
|
759 |
+
mul.f32 %f273, %f255, %f272;
|
760 |
+
.loc 2 113 22
|
761 |
+
fma.rn.f32 %f274, %f269, %f273, %f271;
|
762 |
+
$L__tmp7:
|
763 |
+
.loc 2 120 46
|
764 |
+
mov.b32 %r292, %f270;
|
765 |
+
shfl.sync.bfly.b32 %r293, %r292, 4, 31, -1;
|
766 |
+
mov.b32 %f275, %r293;
|
767 |
+
mov.b32 %r294, %f274;
|
768 |
+
shfl.sync.bfly.b32 %r295, %r294, 4, 31, -1;
|
769 |
+
mov.b32 %f276, %r295;
|
770 |
+
shfl.sync.bfly.b32 %r195, %r193, 4, 31, -1;
|
771 |
+
mov.b32 %f277, %r195;
|
772 |
+
$L__tmp8:
|
773 |
+
.loc 2 108 21
|
774 |
+
sub.f32 %f278, %f275, %f270;
|
775 |
+
.loc 2 109 28
|
776 |
+
add.f32 %f279, %f267, %f277;
|
777 |
+
.loc 2 110 39
|
778 |
+
setp.eq.f32 %p107, %f279, 0f00000000;
|
779 |
+
.loc 2 110 60
|
780 |
+
mov.b32 %r196, %f279;
|
781 |
+
div.full.f32 %r194, %r195, %r196;
|
782 |
+
mov.b32 %f280, %r194;
|
783 |
+
.loc 2 110 49
|
784 |
+
selp.f32 %f281, 0f00000000, %f280, %p107;
|
785 |
+
.loc 2 112 17
|
786 |
+
fma.rn.f32 %f282, %f281, %f278, %f270;
|
787 |
+
.loc 2 113 15
|
788 |
+
add.f32 %f283, %f274, %f276;
|
789 |
+
.loc 2 113 30
|
790 |
+
mul.f32 %f284, %f278, %f278;
|
791 |
+
.loc 2 113 38
|
792 |
+
mul.f32 %f285, %f267, %f284;
|
793 |
+
.loc 2 113 22
|
794 |
+
fma.rn.f32 %f286, %f281, %f285, %f283;
|
795 |
+
$L__tmp9:
|
796 |
+
.loc 2 120 46
|
797 |
+
mov.b32 %r296, %f282;
|
798 |
+
shfl.sync.bfly.b32 %r297, %r296, 2, 31, -1;
|
799 |
+
mov.b32 %f287, %r297;
|
800 |
+
mov.b32 %r298, %f286;
|
801 |
+
shfl.sync.bfly.b32 %r299, %r298, 2, 31, -1;
|
802 |
+
mov.b32 %f288, %r299;
|
803 |
+
shfl.sync.bfly.b32 %r198, %r196, 2, 31, -1;
|
804 |
+
mov.b32 %f289, %r198;
|
805 |
+
$L__tmp10:
|
806 |
+
.loc 2 108 21
|
807 |
+
sub.f32 %f290, %f287, %f282;
|
808 |
+
.loc 2 109 28
|
809 |
+
add.f32 %f33, %f279, %f289;
|
810 |
+
.loc 2 110 39
|
811 |
+
setp.eq.f32 %p108, %f33, 0f00000000;
|
812 |
+
.loc 2 110 60
|
813 |
+
mov.b32 %r199, %f33;
|
814 |
+
div.full.f32 %r197, %r198, %r199;
|
815 |
+
mov.b32 %f291, %r197;
|
816 |
+
.loc 2 110 49
|
817 |
+
selp.f32 %f292, 0f00000000, %f291, %p108;
|
818 |
+
.loc 2 112 17
|
819 |
+
fma.rn.f32 %f34, %f290, %f292, %f282;
|
820 |
+
.loc 2 113 15
|
821 |
+
add.f32 %f293, %f286, %f288;
|
822 |
+
.loc 2 113 30
|
823 |
+
mul.f32 %f294, %f290, %f290;
|
824 |
+
.loc 2 113 38
|
825 |
+
mul.f32 %f295, %f279, %f294;
|
826 |
+
.loc 2 113 22
|
827 |
+
fma.rn.f32 %f35, %f292, %f295, %f293;
|
828 |
+
$L__tmp11:
|
829 |
+
.loc 2 120 46
|
830 |
+
mov.b32 %r300, %f34;
|
831 |
+
shfl.sync.bfly.b32 %r3, %r300, 1, 31, -1;
|
832 |
+
mov.b32 %r301, %f35;
|
833 |
+
shfl.sync.bfly.b32 %r4, %r301, 1, 31, -1;
|
834 |
+
shfl.sync.bfly.b32 %r201, %r199, 1, 31, -1;
|
835 |
+
mov.b32 %f296, %r201;
|
836 |
+
$L__tmp12:
|
837 |
+
.loc 2 109 28
|
838 |
+
add.f32 %f36, %f33, %f296;
|
839 |
+
.loc 2 110 60
|
840 |
+
mov.b32 %r202, %f36;
|
841 |
+
div.full.f32 %r200, %r201, %r202;
|
842 |
+
mov.b32 %f37, %r200;
|
843 |
+
$L__tmp13:
|
844 |
+
.loc 2 120 46
|
845 |
+
mov.b32 %r302, %f246;
|
846 |
+
shfl.sync.bfly.b32 %r303, %r302, 16, 31, -1;
|
847 |
+
mov.b32 %f297, %r303;
|
848 |
+
mov.b32 %r304, %f250;
|
849 |
+
shfl.sync.bfly.b32 %r305, %r304, 16, 31, -1;
|
850 |
+
mov.b32 %f298, %r305;
|
851 |
+
shfl.sync.bfly.b32 %r204, %r166, 16, 31, -1;
|
852 |
+
mov.b32 %f299, %r204;
|
853 |
+
$L__tmp14:
|
854 |
+
.loc 2 108 21
|
855 |
+
sub.f32 %f300, %f297, %f246;
|
856 |
+
.loc 2 109 28
|
857 |
+
add.f32 %f301, %f299, 0f41000000;
|
858 |
+
.loc 2 110 39
|
859 |
+
setp.eq.f32 %p109, %f301, 0f00000000;
|
860 |
+
.loc 2 110 60
|
861 |
+
mov.b32 %r205, %f301;
|
862 |
+
div.full.f32 %r203, %r204, %r205;
|
863 |
+
mov.b32 %f302, %r203;
|
864 |
+
.loc 2 110 49
|
865 |
+
selp.f32 %f303, 0f00000000, %f302, %p109;
|
866 |
+
.loc 2 112 17
|
867 |
+
fma.rn.f32 %f304, %f300, %f303, %f246;
|
868 |
+
.loc 2 113 15
|
869 |
+
add.f32 %f305, %f250, %f298;
|
870 |
+
.loc 2 113 30
|
871 |
+
mul.f32 %f306, %f300, %f300;
|
872 |
+
.loc 2 113 38
|
873 |
+
mul.f32 %f307, %f306, 0f41000000;
|
874 |
+
.loc 2 113 22
|
875 |
+
fma.rn.f32 %f308, %f307, %f303, %f305;
|
876 |
+
$L__tmp15:
|
877 |
+
.loc 2 120 46
|
878 |
+
mov.b32 %r306, %f304;
|
879 |
+
shfl.sync.bfly.b32 %r307, %r306, 8, 31, -1;
|
880 |
+
mov.b32 %f309, %r307;
|
881 |
+
mov.b32 %r308, %f308;
|
882 |
+
shfl.sync.bfly.b32 %r309, %r308, 8, 31, -1;
|
883 |
+
mov.b32 %f310, %r309;
|
884 |
+
shfl.sync.bfly.b32 %r207, %r205, 8, 31, -1;
|
885 |
+
mov.b32 %f311, %r207;
|
886 |
+
$L__tmp16:
|
887 |
+
.loc 2 108 21
|
888 |
+
sub.f32 %f312, %f309, %f304;
|
889 |
+
.loc 2 109 28
|
890 |
+
add.f32 %f313, %f301, %f311;
|
891 |
+
.loc 2 110 39
|
892 |
+
setp.eq.f32 %p110, %f313, 0f00000000;
|
893 |
+
.loc 2 110 60
|
894 |
+
mov.b32 %r208, %f313;
|
895 |
+
div.full.f32 %r206, %r207, %r208;
|
896 |
+
mov.b32 %f314, %r206;
|
897 |
+
.loc 2 110 49
|
898 |
+
selp.f32 %f315, 0f00000000, %f314, %p110;
|
899 |
+
.loc 2 112 17
|
900 |
+
fma.rn.f32 %f316, %f312, %f315, %f304;
|
901 |
+
.loc 2 113 15
|
902 |
+
add.f32 %f317, %f308, %f310;
|
903 |
+
.loc 2 113 30
|
904 |
+
mul.f32 %f318, %f312, %f312;
|
905 |
+
.loc 2 113 38
|
906 |
+
mul.f32 %f319, %f301, %f318;
|
907 |
+
.loc 2 113 22
|
908 |
+
fma.rn.f32 %f320, %f315, %f319, %f317;
|
909 |
+
$L__tmp17:
|
910 |
+
.loc 2 120 46
|
911 |
+
mov.b32 %r310, %f316;
|
912 |
+
shfl.sync.bfly.b32 %r311, %r310, 4, 31, -1;
|
913 |
+
mov.b32 %f321, %r311;
|
914 |
+
mov.b32 %r312, %f320;
|
915 |
+
shfl.sync.bfly.b32 %r313, %r312, 4, 31, -1;
|
916 |
+
mov.b32 %f322, %r313;
|
917 |
+
shfl.sync.bfly.b32 %r210, %r208, 4, 31, -1;
|
918 |
+
mov.b32 %f323, %r210;
|
919 |
+
$L__tmp18:
|
920 |
+
.loc 2 108 21
|
921 |
+
sub.f32 %f324, %f321, %f316;
|
922 |
+
.loc 2 109 28
|
923 |
+
add.f32 %f325, %f313, %f323;
|
924 |
+
.loc 2 110 39
|
925 |
+
setp.eq.f32 %p111, %f325, 0f00000000;
|
926 |
+
.loc 2 110 60
|
927 |
+
mov.b32 %r211, %f325;
|
928 |
+
div.full.f32 %r209, %r210, %r211;
|
929 |
+
mov.b32 %f326, %r209;
|
930 |
+
.loc 2 110 49
|
931 |
+
selp.f32 %f327, 0f00000000, %f326, %p111;
|
932 |
+
.loc 2 112 17
|
933 |
+
fma.rn.f32 %f328, %f324, %f327, %f316;
|
934 |
+
.loc 2 113 15
|
935 |
+
add.f32 %f329, %f320, %f322;
|
936 |
+
.loc 2 113 30
|
937 |
+
mul.f32 %f330, %f324, %f324;
|
938 |
+
.loc 2 113 38
|
939 |
+
mul.f32 %f331, %f313, %f330;
|
940 |
+
.loc 2 113 22
|
941 |
+
fma.rn.f32 %f332, %f327, %f331, %f329;
|
942 |
+
$L__tmp19:
|
943 |
+
.loc 2 120 46
|
944 |
+
mov.b32 %r314, %f328;
|
945 |
+
shfl.sync.bfly.b32 %r315, %r314, 2, 31, -1;
|
946 |
+
mov.b32 %f333, %r315;
|
947 |
+
mov.b32 %r316, %f332;
|
948 |
+
shfl.sync.bfly.b32 %r317, %r316, 2, 31, -1;
|
949 |
+
mov.b32 %f334, %r317;
|
950 |
+
shfl.sync.bfly.b32 %r213, %r211, 2, 31, -1;
|
951 |
+
mov.b32 %f335, %r213;
|
952 |
+
$L__tmp20:
|
953 |
+
.loc 2 108 21
|
954 |
+
sub.f32 %f336, %f333, %f328;
|
955 |
+
.loc 2 109 28
|
956 |
+
add.f32 %f38, %f325, %f335;
|
957 |
+
.loc 2 110 39
|
958 |
+
setp.eq.f32 %p112, %f38, 0f00000000;
|
959 |
+
.loc 2 110 60
|
960 |
+
mov.b32 %r214, %f38;
|
961 |
+
div.full.f32 %r212, %r213, %r214;
|
962 |
+
mov.b32 %f337, %r212;
|
963 |
+
.loc 2 110 49
|
964 |
+
selp.f32 %f338, 0f00000000, %f337, %p112;
|
965 |
+
.loc 2 112 17
|
966 |
+
fma.rn.f32 %f39, %f336, %f338, %f328;
|
967 |
+
.loc 2 113 15
|
968 |
+
add.f32 %f339, %f332, %f334;
|
969 |
+
.loc 2 113 30
|
970 |
+
mul.f32 %f340, %f336, %f336;
|
971 |
+
.loc 2 113 38
|
972 |
+
mul.f32 %f341, %f325, %f340;
|
973 |
+
.loc 2 113 22
|
974 |
+
fma.rn.f32 %f40, %f338, %f341, %f339;
|
975 |
+
$L__tmp21:
|
976 |
+
.loc 2 120 46
|
977 |
+
mov.b32 %r318, %f39;
|
978 |
+
shfl.sync.bfly.b32 %r5, %r318, 1, 31, -1;
|
979 |
+
mov.b32 %r319, %f40;
|
980 |
+
shfl.sync.bfly.b32 %r6, %r319, 1, 31, -1;
|
981 |
+
shfl.sync.bfly.b32 %r216, %r214, 1, 31, -1;
|
982 |
+
mov.b32 %f342, %r216;
|
983 |
+
$L__tmp22:
|
984 |
+
.loc 2 109 28
|
985 |
+
add.f32 %f41, %f38, %f342;
|
986 |
+
.loc 2 110 60
|
987 |
+
mov.b32 %r217, %f41;
|
988 |
+
div.full.f32 %r215, %r216, %r217;
|
989 |
+
mov.b32 %f42, %r215;
|
990 |
+
$L__tmp23:
|
991 |
+
.loc 1 62 51
|
992 |
+
mov.u32 %r218, 0x0;
|
993 |
+
mov.u32 %r219, 0x0;
|
994 |
+
mov.u32 %r220, 0x0;
|
995 |
+
mov.u32 %r221, 0x0;
|
996 |
+
@%p113 ld.global.L1::evict_last.v4.b32 { %r218, %r219, %r220, %r221 }, [ %rd89 + 0 ];
|
997 |
+
@!%p113 mov.u32 %r218, %r325;
|
998 |
+
@!%p113 mov.u32 %r219, %r325;
|
999 |
+
@!%p113 mov.u32 %r220, %r325;
|
1000 |
+
@!%p113 mov.u32 %r221, %r325;
|
1001 |
+
mov.u32 %r226, 0x0;
|
1002 |
+
mov.u32 %r227, 0x0;
|
1003 |
+
mov.u32 %r228, 0x0;
|
1004 |
+
mov.u32 %r229, 0x0;
|
1005 |
+
@%p113 ld.global.L1::evict_last.v4.b32 { %r226, %r227, %r228, %r229 }, [ %rd90 + 0 ];
|
1006 |
+
@!%p113 mov.u32 %r226, %r325;
|
1007 |
+
@!%p113 mov.u32 %r227, %r325;
|
1008 |
+
@!%p113 mov.u32 %r228, %r325;
|
1009 |
+
@!%p113 mov.u32 %r229, %r325;
|
1010 |
+
mov.u32 %r234, 0x0;
|
1011 |
+
mov.u32 %r235, 0x0;
|
1012 |
+
mov.u32 %r236, 0x0;
|
1013 |
+
mov.u32 %r237, 0x0;
|
1014 |
+
@%p113 ld.global.L1::evict_last.v4.b32 { %r234, %r235, %r236, %r237 }, [ %rd91 + 0 ];
|
1015 |
+
@!%p113 mov.u32 %r234, %r325;
|
1016 |
+
@!%p113 mov.u32 %r235, %r325;
|
1017 |
+
@!%p113 mov.u32 %r236, %r325;
|
1018 |
+
@!%p113 mov.u32 %r237, %r325;
|
1019 |
+
mov.u32 %r242, 0x0;
|
1020 |
+
mov.u32 %r243, 0x0;
|
1021 |
+
mov.u32 %r244, 0x0;
|
1022 |
+
mov.u32 %r245, 0x0;
|
1023 |
+
@%p113 ld.global.L1::evict_last.v4.b32 { %r242, %r243, %r244, %r245 }, [ %rd92 + 0 ];
|
1024 |
+
@!%p113 mov.u32 %r242, %r325;
|
1025 |
+
@!%p113 mov.u32 %r243, %r325;
|
1026 |
+
@!%p113 mov.u32 %r244, %r325;
|
1027 |
+
@!%p113 mov.u32 %r245, %r325;
|
1028 |
+
.loc 1 63 51
|
1029 |
+
mov.u32 %r250, 0x0;
|
1030 |
+
mov.u32 %r251, 0x0;
|
1031 |
+
mov.u32 %r252, 0x0;
|
1032 |
+
mov.u32 %r253, 0x0;
|
1033 |
+
@%p113 ld.global.L1::evict_first.v4.b32 { %r250, %r251, %r252, %r253 }, [ %rd93 + 0 ];
|
1034 |
+
@!%p113 mov.u32 %r250, %r325;
|
1035 |
+
@!%p113 mov.u32 %r251, %r325;
|
1036 |
+
@!%p113 mov.u32 %r252, %r325;
|
1037 |
+
@!%p113 mov.u32 %r253, %r325;
|
1038 |
+
cvt.u16.u32 %rs17, %r250;
|
1039 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r250; }
|
1040 |
+
cvt.u16.u32 %rs19, %r251;
|
1041 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r251; }
|
1042 |
+
cvt.u16.u32 %rs21, %r252;
|
1043 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r252; }
|
1044 |
+
cvt.u16.u32 %rs23, %r253;
|
1045 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r253; }
|
1046 |
+
mov.u32 %r258, 0x0;
|
1047 |
+
mov.u32 %r259, 0x0;
|
1048 |
+
mov.u32 %r260, 0x0;
|
1049 |
+
mov.u32 %r261, 0x0;
|
1050 |
+
@%p113 ld.global.L1::evict_first.v4.b32 { %r258, %r259, %r260, %r261 }, [ %rd94 + 0 ];
|
1051 |
+
@!%p113 mov.u32 %r258, %r325;
|
1052 |
+
@!%p113 mov.u32 %r259, %r325;
|
1053 |
+
@!%p113 mov.u32 %r260, %r325;
|
1054 |
+
@!%p113 mov.u32 %r261, %r325;
|
1055 |
+
cvt.u16.u32 %rs25, %r258;
|
1056 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs26}, %r258; }
|
1057 |
+
cvt.u16.u32 %rs27, %r259;
|
1058 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs28}, %r259; }
|
1059 |
+
cvt.u16.u32 %rs29, %r260;
|
1060 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs30}, %r260; }
|
1061 |
+
cvt.u16.u32 %rs31, %r261;
|
1062 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs32}, %r261; }
|
1063 |
+
.loc 1 63 103
|
1064 |
+
cvt.f32.bf16 %r266, %rs17;
|
1065 |
+
mov.b32 %f43, %r266;
|
1066 |
+
cvt.f32.bf16 %r267, %rs18;
|
1067 |
+
mov.b32 %f44, %r267;
|
1068 |
+
cvt.f32.bf16 %r268, %rs19;
|
1069 |
+
mov.b32 %f45, %r268;
|
1070 |
+
cvt.f32.bf16 %r269, %rs20;
|
1071 |
+
mov.b32 %f46, %r269;
|
1072 |
+
cvt.f32.bf16 %r270, %rs21;
|
1073 |
+
mov.b32 %f47, %r270;
|
1074 |
+
cvt.f32.bf16 %r271, %rs22;
|
1075 |
+
mov.b32 %f48, %r271;
|
1076 |
+
cvt.f32.bf16 %r272, %rs23;
|
1077 |
+
mov.b32 %f49, %r272;
|
1078 |
+
cvt.f32.bf16 %r273, %rs24;
|
1079 |
+
mov.b32 %f50, %r273;
|
1080 |
+
cvt.f32.bf16 %r274, %rs25;
|
1081 |
+
mov.b32 %f51, %r274;
|
1082 |
+
cvt.f32.bf16 %r275, %rs26;
|
1083 |
+
mov.b32 %f52, %r275;
|
1084 |
+
cvt.f32.bf16 %r276, %rs27;
|
1085 |
+
mov.b32 %f53, %r276;
|
1086 |
+
cvt.f32.bf16 %r277, %rs28;
|
1087 |
+
mov.b32 %f54, %r277;
|
1088 |
+
cvt.f32.bf16 %r278, %rs29;
|
1089 |
+
mov.b32 %f55, %r278;
|
1090 |
+
cvt.f32.bf16 %r279, %rs30;
|
1091 |
+
mov.b32 %f56, %r279;
|
1092 |
+
cvt.f32.bf16 %r280, %rs31;
|
1093 |
+
mov.b32 %f57, %r280;
|
1094 |
+
cvt.f32.bf16 %r281, %rs32;
|
1095 |
+
mov.b32 %f58, %r281;
|
1096 |
+
.loc 1 64 35
|
1097 |
+
mul.wide.u32 %rd107, %r2, 4;
|
1098 |
+
add.s64 %rd95, %rd17, %rd107;
|
1099 |
+
.loc 1 64 40
|
1100 |
+
mov.u32 %r282, 0x0;
|
1101 |
+
@%p113 ld.global.L1::evict_last.b32 { %r282 }, [ %rd95 + 0 ];
|
1102 |
+
@!%p113 mov.u32 %r282, %r325;
|
1103 |
+
.loc 1 68 57
|
1104 |
+
@%p49 bra $L__BB0_4;
|
1105 |
+
mov.u64 %rd108, assertMessage_1;
|
1106 |
+
cvta.global.u64 %rd109, %rd108;
|
1107 |
+
mov.u64 %rd110, assertFile_1;
|
1108 |
+
cvta.global.u64 %rd111, %rd110;
|
1109 |
+
mov.u64 %rd112, assertFunc_1;
|
1110 |
+
cvta.global.u64 %rd113, %rd112;
|
1111 |
+
{ // callseq 9, 0
|
1112 |
+
.reg .b32 temp_param_reg;
|
1113 |
+
.param .b64 param0;
|
1114 |
+
st.param.b64 [param0+0], %rd109;
|
1115 |
+
.param .b64 param1;
|
1116 |
+
st.param.b64 [param1+0], %rd111;
|
1117 |
+
.param .b32 param2;
|
1118 |
+
st.param.b32 [param2+0], %r438;
|
1119 |
+
.param .b64 param3;
|
1120 |
+
st.param.b64 [param3+0], %rd113;
|
1121 |
+
.param .b64 param4;
|
1122 |
+
st.param.b64 [param4+0], %rd123;
|
1123 |
+
call.uni
|
1124 |
+
__assertfail,
|
1125 |
+
(
|
1126 |
+
param0,
|
1127 |
+
param1,
|
1128 |
+
param2,
|
1129 |
+
param3,
|
1130 |
+
param4
|
1131 |
+
);
|
1132 |
+
} // callseq 9
|
1133 |
+
$L__BB0_4:
|
1134 |
+
$L__tmp24:
|
1135 |
+
.loc 2 120 46
|
1136 |
+
mov.b32 %f343, %r6;
|
1137 |
+
$L__tmp25:
|
1138 |
+
.loc 2 113 15
|
1139 |
+
add.f32 %f344, %f40, %f343;
|
1140 |
+
$L__tmp26:
|
1141 |
+
.loc 2 120 46
|
1142 |
+
mov.b32 %f345, %r5;
|
1143 |
+
$L__tmp27:
|
1144 |
+
.loc 2 108 21
|
1145 |
+
sub.f32 %f346, %f345, %f39;
|
1146 |
+
.loc 2 113 30
|
1147 |
+
mul.f32 %f347, %f346, %f346;
|
1148 |
+
.loc 2 113 38
|
1149 |
+
mul.f32 %f348, %f38, %f347;
|
1150 |
+
.loc 2 110 39
|
1151 |
+
setp.eq.f32 %p135, %f41, 0f00000000;
|
1152 |
+
.loc 2 110 49
|
1153 |
+
selp.f32 %f349, 0f00000000, %f42, %p135;
|
1154 |
+
.loc 2 113 22
|
1155 |
+
fma.rn.f32 %f350, %f349, %f348, %f344;
|
1156 |
+
$L__tmp28:
|
1157 |
+
.loc 2 120 46
|
1158 |
+
mov.b32 %f351, %r4;
|
1159 |
+
$L__tmp29:
|
1160 |
+
.loc 2 113 15
|
1161 |
+
add.f32 %f352, %f35, %f351;
|
1162 |
+
$L__tmp30:
|
1163 |
+
.loc 2 120 46
|
1164 |
+
mov.b32 %f353, %r3;
|
1165 |
+
$L__tmp31:
|
1166 |
+
.loc 2 108 21
|
1167 |
+
sub.f32 %f354, %f353, %f34;
|
1168 |
+
.loc 2 113 30
|
1169 |
+
mul.f32 %f355, %f354, %f354;
|
1170 |
+
.loc 2 113 38
|
1171 |
+
mul.f32 %f356, %f33, %f355;
|
1172 |
+
.loc 2 110 39
|
1173 |
+
setp.eq.f32 %p136, %f36, 0f00000000;
|
1174 |
+
.loc 2 110 49
|
1175 |
+
selp.f32 %f357, 0f00000000, %f37, %p136;
|
1176 |
+
.loc 2 113 22
|
1177 |
+
fma.rn.f32 %f358, %f357, %f356, %f352;
|
1178 |
+
$L__tmp32:
|
1179 |
+
.loc 1 69 54
|
1180 |
+
mov.u32 %r321, 0x0;
|
1181 |
+
mov.u32 %r322, 0x0;
|
1182 |
+
mov.u32 %r323, 0x0;
|
1183 |
+
mov.u32 %r324, 0x0;
|
1184 |
+
@%p113 ld.global.L1::evict_first.v4.b32 { %r321, %r322, %r323, %r324 }, [ %rd115 + 0 ];
|
1185 |
+
@!%p113 mov.u32 %r321, %r325;
|
1186 |
+
@!%p113 mov.u32 %r322, %r325;
|
1187 |
+
@!%p113 mov.u32 %r323, %r325;
|
1188 |
+
@!%p113 mov.u32 %r324, %r325;
|
1189 |
+
mov.u32 %r329, 0x0;
|
1190 |
+
mov.u32 %r330, 0x0;
|
1191 |
+
mov.u32 %r331, 0x0;
|
1192 |
+
mov.u32 %r332, 0x0;
|
1193 |
+
@%p113 ld.global.L1::evict_first.v4.b32 { %r329, %r330, %r331, %r332 }, [ %rd116 + 0 ];
|
1194 |
+
@!%p113 mov.u32 %r329, %r325;
|
1195 |
+
@!%p113 mov.u32 %r330, %r325;
|
1196 |
+
@!%p113 mov.u32 %r331, %r325;
|
1197 |
+
@!%p113 mov.u32 %r332, %r325;
|
1198 |
+
mov.u32 %r337, 0x0;
|
1199 |
+
mov.u32 %r338, 0x0;
|
1200 |
+
mov.u32 %r339, 0x0;
|
1201 |
+
mov.u32 %r340, 0x0;
|
1202 |
+
@%p113 ld.global.L1::evict_first.v4.b32 { %r337, %r338, %r339, %r340 }, [ %rd117 + 0 ];
|
1203 |
+
@!%p113 mov.u32 %r337, %r325;
|
1204 |
+
@!%p113 mov.u32 %r338, %r325;
|
1205 |
+
@!%p113 mov.u32 %r339, %r325;
|
1206 |
+
@!%p113 mov.u32 %r340, %r325;
|
1207 |
+
mov.u32 %r345, 0x0;
|
1208 |
+
mov.u32 %r346, 0x0;
|
1209 |
+
mov.u32 %r347, 0x0;
|
1210 |
+
mov.u32 %r348, 0x0;
|
1211 |
+
@%p113 ld.global.L1::evict_first.v4.b32 { %r345, %r346, %r347, %r348 }, [ %rd118 + 0 ];
|
1212 |
+
@!%p113 mov.u32 %r345, %r325;
|
1213 |
+
@!%p113 mov.u32 %r346, %r325;
|
1214 |
+
@!%p113 mov.u32 %r347, %r325;
|
1215 |
+
@!%p113 mov.u32 %r348, %r325;
|
1216 |
+
.loc 1 75 24
|
1217 |
+
mov.b32 %r354, %f358;
|
1218 |
+
mov.b32 %r355, 1132462080;
|
1219 |
+
div.full.f32 %r353, %r354, %r355;
|
1220 |
+
mov.b32 %f359, %r353;
|
1221 |
+
mov.b32 %r378, %f350;
|
1222 |
+
div.full.f32 %r377, %r378, %r355;
|
1223 |
+
mov.b32 %f360, %r377;
|
1224 |
+
.loc 1 77 24
|
1225 |
+
add.f32 %f361, %f359, 0f3727C5AC;
|
1226 |
+
add.f32 %f362, %f360, 0f3727C5AC;
|
1227 |
+
.loc 1 78 30
|
1228 |
+
rsqrt.approx.ftz.f32 %f363, %f361;
|
1229 |
+
rsqrt.approx.ftz.f32 %f364, %f362;
|
1230 |
+
.loc 1 69 54
|
1231 |
+
mov.b32 %f365, %r348;
|
1232 |
+
.loc 1 62 51
|
1233 |
+
mov.b32 %f366, %r245;
|
1234 |
+
.loc 1 70 24
|
1235 |
+
add.f32 %f367, %f366, %f365;
|
1236 |
+
.loc 1 72 24
|
1237 |
+
add.f32 %f368, %f58, %f367;
|
1238 |
+
$L__tmp33:
|
1239 |
+
.loc 2 112 17
|
1240 |
+
fma.rn.f32 %f369, %f346, %f349, %f39;
|
1241 |
+
$L__tmp34:
|
1242 |
+
.loc 1 73 24
|
1243 |
+
sub.f32 %f370, %f368, %f369;
|
1244 |
+
.loc 1 69 54
|
1245 |
+
mov.b32 %f371, %r347;
|
1246 |
+
.loc 1 62 51
|
1247 |
+
mov.b32 %f372, %r244;
|
1248 |
+
.loc 1 70 24
|
1249 |
+
add.f32 %f373, %f372, %f371;
|
1250 |
+
.loc 1 72 24
|
1251 |
+
add.f32 %f374, %f57, %f373;
|
1252 |
+
.loc 1 73 24
|
1253 |
+
sub.f32 %f375, %f374, %f369;
|
1254 |
+
.loc 1 69 54
|
1255 |
+
mov.b32 %f376, %r346;
|
1256 |
+
.loc 1 62 51
|
1257 |
+
mov.b32 %f377, %r243;
|
1258 |
+
.loc 1 70 24
|
1259 |
+
add.f32 %f378, %f377, %f376;
|
1260 |
+
.loc 1 72 24
|
1261 |
+
add.f32 %f379, %f56, %f378;
|
1262 |
+
.loc 1 73 24
|
1263 |
+
sub.f32 %f380, %f379, %f369;
|
1264 |
+
.loc 1 69 54
|
1265 |
+
mov.b32 %f381, %r345;
|
1266 |
+
.loc 1 62 51
|
1267 |
+
mov.b32 %f382, %r242;
|
1268 |
+
.loc 1 70 24
|
1269 |
+
add.f32 %f383, %f382, %f381;
|
1270 |
+
.loc 1 72 24
|
1271 |
+
add.f32 %f384, %f55, %f383;
|
1272 |
+
.loc 1 73 24
|
1273 |
+
sub.f32 %f385, %f384, %f369;
|
1274 |
+
.loc 1 69 54
|
1275 |
+
mov.b32 %f386, %r340;
|
1276 |
+
.loc 1 62 51
|
1277 |
+
mov.b32 %f387, %r237;
|
1278 |
+
.loc 1 70 24
|
1279 |
+
add.f32 %f388, %f387, %f386;
|
1280 |
+
.loc 1 72 24
|
1281 |
+
add.f32 %f389, %f54, %f388;
|
1282 |
+
.loc 1 73 24
|
1283 |
+
sub.f32 %f390, %f389, %f369;
|
1284 |
+
.loc 1 69 54
|
1285 |
+
mov.b32 %f391, %r339;
|
1286 |
+
.loc 1 62 51
|
1287 |
+
mov.b32 %f392, %r236;
|
1288 |
+
.loc 1 70 24
|
1289 |
+
add.f32 %f393, %f392, %f391;
|
1290 |
+
.loc 1 72 24
|
1291 |
+
add.f32 %f394, %f53, %f393;
|
1292 |
+
.loc 1 73 24
|
1293 |
+
sub.f32 %f395, %f394, %f369;
|
1294 |
+
.loc 1 69 54
|
1295 |
+
mov.b32 %f396, %r338;
|
1296 |
+
.loc 1 62 51
|
1297 |
+
mov.b32 %f397, %r235;
|
1298 |
+
.loc 1 70 24
|
1299 |
+
add.f32 %f398, %f397, %f396;
|
1300 |
+
.loc 1 72 24
|
1301 |
+
add.f32 %f399, %f52, %f398;
|
1302 |
+
.loc 1 73 24
|
1303 |
+
sub.f32 %f400, %f399, %f369;
|
1304 |
+
.loc 1 69 54
|
1305 |
+
mov.b32 %f401, %r337;
|
1306 |
+
.loc 1 62 51
|
1307 |
+
mov.b32 %f402, %r234;
|
1308 |
+
.loc 1 70 24
|
1309 |
+
add.f32 %f403, %f402, %f401;
|
1310 |
+
.loc 1 72 24
|
1311 |
+
add.f32 %f404, %f51, %f403;
|
1312 |
+
.loc 1 73 24
|
1313 |
+
sub.f32 %f405, %f404, %f369;
|
1314 |
+
.loc 1 69 54
|
1315 |
+
mov.b32 %f406, %r332;
|
1316 |
+
.loc 1 62 51
|
1317 |
+
mov.b32 %f407, %r229;
|
1318 |
+
.loc 1 70 24
|
1319 |
+
add.f32 %f408, %f407, %f406;
|
1320 |
+
.loc 1 72 24
|
1321 |
+
add.f32 %f409, %f50, %f408;
|
1322 |
+
$L__tmp35:
|
1323 |
+
.loc 2 112 17
|
1324 |
+
fma.rn.f32 %f410, %f354, %f357, %f34;
|
1325 |
+
$L__tmp36:
|
1326 |
+
.loc 1 73 24
|
1327 |
+
sub.f32 %f411, %f409, %f410;
|
1328 |
+
.loc 1 69 54
|
1329 |
+
mov.b32 %f412, %r331;
|
1330 |
+
.loc 1 62 51
|
1331 |
+
mov.b32 %f413, %r228;
|
1332 |
+
.loc 1 70 24
|
1333 |
+
add.f32 %f414, %f413, %f412;
|
1334 |
+
.loc 1 72 24
|
1335 |
+
add.f32 %f415, %f49, %f414;
|
1336 |
+
.loc 1 73 24
|
1337 |
+
sub.f32 %f416, %f415, %f410;
|
1338 |
+
.loc 1 69 54
|
1339 |
+
mov.b32 %f417, %r330;
|
1340 |
+
.loc 1 62 51
|
1341 |
+
mov.b32 %f418, %r227;
|
1342 |
+
.loc 1 70 24
|
1343 |
+
add.f32 %f419, %f418, %f417;
|
1344 |
+
.loc 1 72 24
|
1345 |
+
add.f32 %f420, %f48, %f419;
|
1346 |
+
.loc 1 73 24
|
1347 |
+
sub.f32 %f421, %f420, %f410;
|
1348 |
+
.loc 1 69 54
|
1349 |
+
mov.b32 %f422, %r329;
|
1350 |
+
.loc 1 62 51
|
1351 |
+
mov.b32 %f423, %r226;
|
1352 |
+
.loc 1 70 24
|
1353 |
+
add.f32 %f424, %f423, %f422;
|
1354 |
+
.loc 1 72 24
|
1355 |
+
add.f32 %f425, %f47, %f424;
|
1356 |
+
.loc 1 73 24
|
1357 |
+
sub.f32 %f426, %f425, %f410;
|
1358 |
+
.loc 1 69 54
|
1359 |
+
mov.b32 %f427, %r324;
|
1360 |
+
.loc 1 62 51
|
1361 |
+
mov.b32 %f428, %r221;
|
1362 |
+
.loc 1 70 24
|
1363 |
+
add.f32 %f429, %f428, %f427;
|
1364 |
+
.loc 1 72 24
|
1365 |
+
add.f32 %f430, %f46, %f429;
|
1366 |
+
.loc 1 73 24
|
1367 |
+
sub.f32 %f431, %f430, %f410;
|
1368 |
+
.loc 1 69 54
|
1369 |
+
mov.b32 %f432, %r323;
|
1370 |
+
.loc 1 62 51
|
1371 |
+
mov.b32 %f433, %r220;
|
1372 |
+
.loc 1 70 24
|
1373 |
+
add.f32 %f434, %f433, %f432;
|
1374 |
+
.loc 1 72 24
|
1375 |
+
add.f32 %f435, %f45, %f434;
|
1376 |
+
.loc 1 73 24
|
1377 |
+
sub.f32 %f436, %f435, %f410;
|
1378 |
+
.loc 1 69 54
|
1379 |
+
mov.b32 %f437, %r322;
|
1380 |
+
.loc 1 62 51
|
1381 |
+
mov.b32 %f438, %r219;
|
1382 |
+
.loc 1 70 24
|
1383 |
+
add.f32 %f439, %f438, %f437;
|
1384 |
+
.loc 1 72 24
|
1385 |
+
add.f32 %f440, %f44, %f439;
|
1386 |
+
.loc 1 73 24
|
1387 |
+
sub.f32 %f441, %f440, %f410;
|
1388 |
+
.loc 1 69 54
|
1389 |
+
mov.b32 %f442, %r321;
|
1390 |
+
.loc 1 62 51
|
1391 |
+
mov.b32 %f443, %r218;
|
1392 |
+
.loc 1 70 24
|
1393 |
+
add.f32 %f444, %f443, %f442;
|
1394 |
+
.loc 1 72 24
|
1395 |
+
add.f32 %f445, %f43, %f444;
|
1396 |
+
.loc 1 73 24
|
1397 |
+
sub.f32 %f446, %f445, %f410;
|
1398 |
+
.loc 1 79 24
|
1399 |
+
mul.f32 %f447, %f446, %f363;
|
1400 |
+
mul.f32 %f448, %f441, %f363;
|
1401 |
+
mul.f32 %f449, %f436, %f363;
|
1402 |
+
mul.f32 %f450, %f431, %f363;
|
1403 |
+
mul.f32 %f451, %f426, %f363;
|
1404 |
+
mul.f32 %f452, %f421, %f363;
|
1405 |
+
mul.f32 %f453, %f416, %f363;
|
1406 |
+
mul.f32 %f454, %f411, %f363;
|
1407 |
+
mul.f32 %f455, %f405, %f364;
|
1408 |
+
mul.f32 %f456, %f400, %f364;
|
1409 |
+
mul.f32 %f457, %f395, %f364;
|
1410 |
+
mul.f32 %f458, %f390, %f364;
|
1411 |
+
mul.f32 %f459, %f385, %f364;
|
1412 |
+
mul.f32 %f460, %f380, %f364;
|
1413 |
+
mul.f32 %f461, %f375, %f364;
|
1414 |
+
mul.f32 %f462, %f370, %f364;
|
1415 |
+
.loc 1 80 24
|
1416 |
+
shl.b32 %r425, %r2, 2;
|
1417 |
+
mov.u32 %r426, global_smem;
|
1418 |
+
add.s32 %r427, %r426, %r425;
|
1419 |
+
st.shared.u32 [%r427], %r282;
|
1420 |
+
bar.sync 0;
|
1421 |
+
shl.b32 %r428, %r1, 2;
|
1422 |
+
add.s32 %r429, %r426, %r428;
|
1423 |
+
ld.shared.v4.f32 {%f463, %f464, %f465, %f466}, [%r429];
|
1424 |
+
ld.shared.v4.f32 {%f467, %f468, %f469, %f470}, [%r429+16];
|
1425 |
+
mul.f32 %f471, %f447, %f463;
|
1426 |
+
mul.f32 %f472, %f448, %f464;
|
1427 |
+
mul.f32 %f473, %f449, %f465;
|
1428 |
+
mul.f32 %f474, %f450, %f466;
|
1429 |
+
mul.f32 %f475, %f451, %f467;
|
1430 |
+
mul.f32 %f476, %f452, %f468;
|
1431 |
+
mul.f32 %f477, %f453, %f469;
|
1432 |
+
mul.f32 %f478, %f454, %f470;
|
1433 |
+
mul.f32 %f479, %f455, %f463;
|
1434 |
+
mul.f32 %f480, %f456, %f464;
|
1435 |
+
mul.f32 %f481, %f457, %f465;
|
1436 |
+
mul.f32 %f482, %f458, %f466;
|
1437 |
+
mul.f32 %f483, %f459, %f467;
|
1438 |
+
mul.f32 %f484, %f460, %f468;
|
1439 |
+
mul.f32 %f485, %f461, %f469;
|
1440 |
+
mul.f32 %f486, %f462, %f470;
|
1441 |
+
.loc 1 82 29
|
1442 |
+
shl.b64 %rd121, %rd7, 1;
|
1443 |
+
add.s64 %rd119, %rd18, %rd121;
|
1444 |
+
shl.b64 %rd122, %rd9, 1;
|
1445 |
+
add.s64 %rd120, %rd18, %rd122;
|
1446 |
+
.loc 1 82 52
|
1447 |
+
mov.b32 %r401, %f471;
|
1448 |
+
cvt.rn.bf16.f32 %rs33, %r401;
|
1449 |
+
mov.b32 %r402, %f472;
|
1450 |
+
cvt.rn.bf16.f32 %rs34, %r402;
|
1451 |
+
mov.b32 %r403, %f473;
|
1452 |
+
cvt.rn.bf16.f32 %rs35, %r403;
|
1453 |
+
mov.b32 %r404, %f474;
|
1454 |
+
cvt.rn.bf16.f32 %rs36, %r404;
|
1455 |
+
mov.b32 %r405, %f475;
|
1456 |
+
cvt.rn.bf16.f32 %rs37, %r405;
|
1457 |
+
mov.b32 %r406, %f476;
|
1458 |
+
cvt.rn.bf16.f32 %rs38, %r406;
|
1459 |
+
mov.b32 %r407, %f477;
|
1460 |
+
cvt.rn.bf16.f32 %rs39, %r407;
|
1461 |
+
mov.b32 %r408, %f478;
|
1462 |
+
cvt.rn.bf16.f32 %rs40, %r408;
|
1463 |
+
mov.b32 %r409, %f479;
|
1464 |
+
cvt.rn.bf16.f32 %rs41, %r409;
|
1465 |
+
mov.b32 %r410, %f480;
|
1466 |
+
cvt.rn.bf16.f32 %rs42, %r410;
|
1467 |
+
mov.b32 %r411, %f481;
|
1468 |
+
cvt.rn.bf16.f32 %rs43, %r411;
|
1469 |
+
mov.b32 %r412, %f482;
|
1470 |
+
cvt.rn.bf16.f32 %rs44, %r412;
|
1471 |
+
mov.b32 %r413, %f483;
|
1472 |
+
cvt.rn.bf16.f32 %rs45, %r413;
|
1473 |
+
mov.b32 %r414, %f484;
|
1474 |
+
cvt.rn.bf16.f32 %rs46, %r414;
|
1475 |
+
mov.b32 %r415, %f485;
|
1476 |
+
cvt.rn.bf16.f32 %rs47, %r415;
|
1477 |
+
mov.b32 %r416, %f486;
|
1478 |
+
cvt.rn.bf16.f32 %rs48, %r416;
|
1479 |
+
mov.b32 %r430, {%rs33, %rs34};
|
1480 |
+
mov.b32 %r431, {%rs35, %rs36};
|
1481 |
+
mov.b32 %r432, {%rs37, %rs38};
|
1482 |
+
mov.b32 %r433, {%rs39, %rs40};
|
1483 |
+
@%p113 st.global.v4.b32 [ %rd119 + 0 ], { %r430, %r431, %r432, %r433 };
|
1484 |
+
mov.b32 %r434, {%rs41, %rs42};
|
1485 |
+
mov.b32 %r435, {%rs43, %rs44};
|
1486 |
+
mov.b32 %r436, {%rs45, %rs46};
|
1487 |
+
mov.b32 %r437, {%rs47, %rs48};
|
1488 |
+
@%p113 st.global.v4.b32 [ %rd120 + 0 ], { %r434, %r435, %r436, %r437 };
|
1489 |
+
.loc 1 58 4
|
1490 |
+
ret;
|
1491 |
+
$L__tmp37:
|
1492 |
+
$L__func_end0:
|
1493 |
+
|
1494 |
+
}
|
1495 |
+
// .globl __nv_rsqrtf
|
1496 |
+
// __nv_rsqrtf: device function that returns an approximate reciprocal
// square root of its single 32-bit floating-point argument.
//   param  __nv_rsqrtf_param_0 : 32-bit input, read as f32.
//   return func_retval0        : 32-bit result, written as f32.
// The whole body is one rsqrt.approx.ftz.f32 instruction; there are no
// branches, memory accesses other than the parameter/return slots, or calls.
// NOTE(review): the name suggests this is the libdevice rsqrtf helper that
// was linked into the module -- confirm against the Triton/libdevice build.
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
1497 |
+
.param .b32 __nv_rsqrtf_param_0
|
1498 |
+
)
|
1499 |
+
{
|
1500 |
+
.reg .f32 %f<3>; // declares the f32 registers used below (%f1, %f2)
|
1501 |
+
$L__func_begin1:
|
1502 |
+
|
1503 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0]; // load the input argument into %f1
|
1504 |
+
rsqrt.approx.ftz.f32 %f2, %f1; // %f2 = approximate 1/sqrt(%f1); .ftz flushes subnormals to zero
|
1505 |
+
st.param.f32 [func_retval0+0], %f2; // store the result into the return-value slot
|
1506 |
+
ret;
|
1507 |
+
$L__func_end1:
|
1508 |
+
|
1509 |
+
}
|
1510 |
+
.file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py"
|
1511 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
1512 |
+
.section .debug_abbrev
|
1513 |
+
{
|
1514 |
+
.b8 1
|
1515 |
+
.b8 17
|
1516 |
+
.b8 1
|
1517 |
+
.b8 37
|
1518 |
+
.b8 8
|
1519 |
+
.b8 19
|
1520 |
+
.b8 5
|
1521 |
+
.b8 3
|
1522 |
+
.b8 8
|
1523 |
+
.b8 16
|
1524 |
+
.b8 6
|
1525 |
+
.b8 27
|
1526 |
+
.b8 8
|
1527 |
+
.b8 180
|
1528 |
+
.b8 66
|
1529 |
+
.b8 12
|
1530 |
+
.b8 17
|
1531 |
+
.b8 1
|
1532 |
+
.b8 18
|
1533 |
+
.b8 1
|
1534 |
+
.b8 0
|
1535 |
+
.b8 0
|
1536 |
+
.b8 2
|
1537 |
+
.b8 46
|
1538 |
+
.b8 0
|
1539 |
+
.b8 135
|
1540 |
+
.b8 64
|
1541 |
+
.b8 8
|
1542 |
+
.b8 3
|
1543 |
+
.b8 8
|
1544 |
+
.b8 58
|
1545 |
+
.b8 11
|
1546 |
+
.b8 59
|
1547 |
+
.b8 11
|
1548 |
+
.b8 63
|
1549 |
+
.b8 12
|
1550 |
+
.b8 32
|
1551 |
+
.b8 11
|
1552 |
+
.b8 0
|
1553 |
+
.b8 0
|
1554 |
+
.b8 3
|
1555 |
+
.b8 46
|
1556 |
+
.b8 1
|
1557 |
+
.b8 17
|
1558 |
+
.b8 1
|
1559 |
+
.b8 18
|
1560 |
+
.b8 1
|
1561 |
+
.b8 64
|
1562 |
+
.b8 10
|
1563 |
+
.b8 49
|
1564 |
+
.b8 19
|
1565 |
+
.b8 0
|
1566 |
+
.b8 0
|
1567 |
+
.b8 4
|
1568 |
+
.b8 29
|
1569 |
+
.b8 0
|
1570 |
+
.b8 49
|
1571 |
+
.b8 19
|
1572 |
+
.b8 17
|
1573 |
+
.b8 1
|
1574 |
+
.b8 18
|
1575 |
+
.b8 1
|
1576 |
+
.b8 88
|
1577 |
+
.b8 11
|
1578 |
+
.b8 89
|
1579 |
+
.b8 11
|
1580 |
+
.b8 87
|
1581 |
+
.b8 11
|
1582 |
+
.b8 0
|
1583 |
+
.b8 0
|
1584 |
+
.b8 5
|
1585 |
+
.b8 29
|
1586 |
+
.b8 1
|
1587 |
+
.b8 49
|
1588 |
+
.b8 19
|
1589 |
+
.b8 17
|
1590 |
+
.b8 1
|
1591 |
+
.b8 18
|
1592 |
+
.b8 1
|
1593 |
+
.b8 88
|
1594 |
+
.b8 11
|
1595 |
+
.b8 89
|
1596 |
+
.b8 11
|
1597 |
+
.b8 87
|
1598 |
+
.b8 11
|
1599 |
+
.b8 0
|
1600 |
+
.b8 0
|
1601 |
+
.b8 0
|
1602 |
+
}
|
1603 |
+
.section .debug_info
|
1604 |
+
{
|
1605 |
+
.b32 302
|
1606 |
+
.b8 2
|
1607 |
+
.b8 0
|
1608 |
+
.b32 .debug_abbrev
|
1609 |
+
.b8 8
|
1610 |
+
.b8 1
|
1611 |
+
.b8 116
|
1612 |
+
.b8 114
|
1613 |
+
.b8 105
|
1614 |
+
.b8 116
|
1615 |
+
.b8 111
|
1616 |
+
.b8 110
|
1617 |
+
.b8 0
|
1618 |
+
.b8 2
|
1619 |
+
.b8 0
|
1620 |
+
.b8 99
|
1621 |
+
.b8 112
|
1622 |
+
.b8 110
|
1623 |
+
.b8 51
|
1624 |
+
.b8 108
|
1625 |
+
.b8 97
|
1626 |
+
.b8 119
|
1627 |
+
.b8 103
|
1628 |
+
.b8 54
|
1629 |
+
.b8 53
|
1630 |
+
.b8 108
|
1631 |
+
.b8 112
|
1632 |
+
.b8 105
|
1633 |
+
.b8 54
|
1634 |
+
.b8 51
|
1635 |
+
.b8 103
|
1636 |
+
.b8 118
|
1637 |
+
.b8 54
|
1638 |
+
.b8 99
|
1639 |
+
.b8 54
|
1640 |
+
.b8 112
|
1641 |
+
.b8 110
|
1642 |
+
.b8 52
|
1643 |
+
.b8 111
|
1644 |
+
.b8 105
|
1645 |
+
.b8 107
|
1646 |
+
.b8 104
|
1647 |
+
.b8 103
|
1648 |
+
.b8 54
|
1649 |
+
.b8 113
|
1650 |
+
.b8 118
|
1651 |
+
.b8 97
|
1652 |
+
.b8 50
|
1653 |
+
.b8 104
|
1654 |
+
.b8 50
|
1655 |
+
.b8 113
|
1656 |
+
.b8 106
|
1657 |
+
.b8 100
|
1658 |
+
.b8 112
|
1659 |
+
.b8 120
|
1660 |
+
.b8 101
|
1661 |
+
.b8 54
|
1662 |
+
.b8 113
|
1663 |
+
.b8 106
|
1664 |
+
.b8 52
|
1665 |
+
.b8 108
|
1666 |
+
.b8 118
|
1667 |
+
.b8 116
|
1668 |
+
.b8 116
|
1669 |
+
.b8 119
|
1670 |
+
.b8 101
|
1671 |
+
.b8 122
|
1672 |
+
.b8 46
|
1673 |
+
.b8 112
|
1674 |
+
.b8 121
|
1675 |
+
.b8 0
|
1676 |
+
.b32 .debug_line
|
1677 |
+
.b8 47
|
1678 |
+
.b8 116
|
1679 |
+
.b8 109
|
1680 |
+
.b8 112
|
1681 |
+
.b8 47
|
1682 |
+
.b8 116
|
1683 |
+
.b8 111
|
1684 |
+
.b8 114
|
1685 |
+
.b8 99
|
1686 |
+
.b8 104
|
1687 |
+
.b8 105
|
1688 |
+
.b8 110
|
1689 |
+
.b8 100
|
1690 |
+
.b8 117
|
1691 |
+
.b8 99
|
1692 |
+
.b8 116
|
1693 |
+
.b8 111
|
1694 |
+
.b8 114
|
1695 |
+
.b8 95
|
1696 |
+
.b8 114
|
1697 |
+
.b8 111
|
1698 |
+
.b8 111
|
1699 |
+
.b8 116
|
1700 |
+
.b8 47
|
1701 |
+
.b8 112
|
1702 |
+
.b8 110
|
1703 |
+
.b8 0
|
1704 |
+
.b8 1
|
1705 |
+
.b64 $L__func_begin0
|
1706 |
+
.b64 $L__func_end0
|
1707 |
+
.b8 2
|
1708 |
+
.b8 116
|
1709 |
+
.b8 114
|
1710 |
+
.b8 105
|
1711 |
+
.b8 116
|
1712 |
+
.b8 111
|
1713 |
+
.b8 110
|
1714 |
+
.b8 95
|
1715 |
+
.b8 95
|
1716 |
+
.b8 48
|
1717 |
+
.b8 100
|
1718 |
+
.b8 49
|
1719 |
+
.b8 100
|
1720 |
+
.b8 50
|
1721 |
+
.b8 100
|
1722 |
+
.b8 51
|
1723 |
+
.b8 100
|
1724 |
+
.b8 52
|
1725 |
+
.b8 100
|
1726 |
+
.b8 53
|
1727 |
+
.b8 100
|
1728 |
+
.b8 54
|
1729 |
+
.b8 100
|
1730 |
+
.b8 101
|
1731 |
+
.b8 55
|
1732 |
+
.b8 100
|
1733 |
+
.b8 101
|
1734 |
+
.b8 0
|
1735 |
+
.b8 116
|
1736 |
+
.b8 114
|
1737 |
+
.b8 105
|
1738 |
+
.b8 116
|
1739 |
+
.b8 111
|
1740 |
+
.b8 110
|
1741 |
+
.b8 95
|
1742 |
+
.b8 95
|
1743 |
+
.b8 48
|
1744 |
+
.b8 100
|
1745 |
+
.b8 49
|
1746 |
+
.b8 100
|
1747 |
+
.b8 50
|
1748 |
+
.b8 100
|
1749 |
+
.b8 51
|
1750 |
+
.b8 100
|
1751 |
+
.b8 52
|
1752 |
+
.b8 100
|
1753 |
+
.b8 53
|
1754 |
+
.b8 100
|
1755 |
+
.b8 54
|
1756 |
+
.b8 100
|
1757 |
+
.b8 101
|
1758 |
+
.b8 55
|
1759 |
+
.b8 100
|
1760 |
+
.b8 101
|
1761 |
+
.b8 0
|
1762 |
+
.b8 1
|
1763 |
+
.b8 18
|
1764 |
+
.b8 1
|
1765 |
+
.b8 1
|
1766 |
+
.b8 3
|
1767 |
+
.b64 $L__func_begin0
|
1768 |
+
.b64 $L__func_end0
|
1769 |
+
.b8 1
|
1770 |
+
.b8 156
|
1771 |
+
.b32 125
|
1772 |
+
.b8 4
|
1773 |
+
.b32 125
|
1774 |
+
.b64 $L__tmp1
|
1775 |
+
.b64 $L__tmp2
|
1776 |
+
.b8 2
|
1777 |
+
.b8 47
|
1778 |
+
.b8 41
|
1779 |
+
.b8 5
|
1780 |
+
.b32 125
|
1781 |
+
.b64 $L__tmp2
|
1782 |
+
.b64 $L__tmp36
|
1783 |
+
.b8 2
|
1784 |
+
.b8 53
|
1785 |
+
.b8 44
|
1786 |
+
.b8 4
|
1787 |
+
.b32 125
|
1788 |
+
.b64 $L__tmp2
|
1789 |
+
.b64 $L__tmp36
|
1790 |
+
.b8 2
|
1791 |
+
.b8 120
|
1792 |
+
.b8 46
|
1793 |
+
.b8 0
|
1794 |
+
.b8 4
|
1795 |
+
.b32 125
|
1796 |
+
.b64 $L__tmp3
|
1797 |
+
.b64 $L__tmp31
|
1798 |
+
.b8 2
|
1799 |
+
.b8 53
|
1800 |
+
.b8 44
|
1801 |
+
.b8 0
|
1802 |
+
.b8 0
|
1803 |
+
}
|
1804 |
+
.section .debug_pubnames
|
1805 |
+
{
|
1806 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
1807 |
+
$L__pubNames_start0:
|
1808 |
+
.b8 2
|
1809 |
+
.b8 0
|
1810 |
+
.b32 .debug_info
|
1811 |
+
.b32 306
|
1812 |
+
.b32 125
|
1813 |
+
.b8 116
|
1814 |
+
.b8 114
|
1815 |
+
.b8 105
|
1816 |
+
.b8 116
|
1817 |
+
.b8 111
|
1818 |
+
.b8 110
|
1819 |
+
.b8 95
|
1820 |
+
.b8 95
|
1821 |
+
.b8 48
|
1822 |
+
.b8 100
|
1823 |
+
.b8 49
|
1824 |
+
.b8 100
|
1825 |
+
.b8 50
|
1826 |
+
.b8 100
|
1827 |
+
.b8 51
|
1828 |
+
.b8 100
|
1829 |
+
.b8 52
|
1830 |
+
.b8 100
|
1831 |
+
.b8 53
|
1832 |
+
.b8 100
|
1833 |
+
.b8 54
|
1834 |
+
.b8 100
|
1835 |
+
.b8 101
|
1836 |
+
.b8 55
|
1837 |
+
.b8 100
|
1838 |
+
.b8 101
|
1839 |
+
.b8 0
|
1840 |
+
.b32 0
|
1841 |
+
$L__pubNames_end0:
|
1842 |
+
}
|
1843 |
+
.section .debug_pubtypes
|
1844 |
+
{
|
1845 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
1846 |
+
$L__pubTypes_start0:
|
1847 |
+
.b8 2
|
1848 |
+
.b8 0
|
1849 |
+
.b32 .debug_info
|
1850 |
+
.b32 306
|
1851 |
+
.b32 0
|
1852 |
+
$L__pubTypes_end0:
|
1853 |
+
}
|
1854 |
+
.section .debug_loc { }
|
.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttgir
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
4 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
5 |
+
tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
6 |
+
%cst = arith.constant dense<512> : tensor<16x1xi32, #blocked>
|
7 |
+
%cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked>
|
8 |
+
%cst_1 = arith.constant dense<256> : tensor<16x1xi32, #blocked>
|
9 |
+
%cst_2 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked>
|
10 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked>
|
11 |
+
%cst_4 = arith.constant dense<256> : tensor<16x1xi64, #blocked>
|
12 |
+
%cst_5 = arith.constant dense<50257> : tensor<16x1xi64, #blocked>
|
13 |
+
%cst_6 = arith.constant dense<0> : tensor<16x1xi64, #blocked>
|
14 |
+
%cst_7 = arith.constant dense<0> : tensor<16x1xi64, #blocked1>
|
15 |
+
%cst_8 = arith.constant dense<50257> : tensor<16x1xi64, #blocked1>
|
16 |
+
%cst_9 = arith.constant 0.000000e+00 : f32
|
17 |
+
%cst_10 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked2>
|
18 |
+
%cst_11 = arith.constant dense<256> : tensor<1x256xi32, #blocked2>
|
19 |
+
%cst_12 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32, #blocked>
|
20 |
+
%cst_13 = arith.constant dense<2.560000e+02> : tensor<16x1xf32, #blocked>
|
21 |
+
%cst_14 = arith.constant dense<0.000000e+00> : tensor<16x256xf32, #blocked>
|
22 |
+
%cst_15 = arith.constant dense<0.000000e+00> : tensor<16x256xbf16, #blocked>
|
23 |
+
%c16_i32 = arith.constant 16 : i32
|
24 |
+
%0 = tt.get_program_id x : i32
|
25 |
+
%1 = arith.muli %0, %c16_i32 : i32
|
26 |
+
%2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
27 |
+
%3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
28 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked>
|
29 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1>
|
30 |
+
%6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked>
|
31 |
+
%7 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked1>
|
32 |
+
%8 = arith.addi %6, %4 : tensor<16x1xi32, #blocked>
|
33 |
+
%9 = arith.addi %7, %5 : tensor<16x1xi32, #blocked1>
|
34 |
+
%10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
35 |
+
%11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
|
36 |
+
%12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
|
37 |
+
%13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x256xi32, #blocked2>
|
38 |
+
%14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked>
|
39 |
+
%15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked1>
|
40 |
+
%16 = tt.addptr %14, %8 : tensor<16x1x!tt.ptr<i64, 1>, #blocked>, tensor<16x1xi32, #blocked>
|
41 |
+
%17 = tt.addptr %15, %9 : tensor<16x1x!tt.ptr<i64, 1>, #blocked1>, tensor<16x1xi32, #blocked1>
|
42 |
+
%18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked>
|
43 |
+
%19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked1>
|
44 |
+
%20 = arith.remsi %8, %cst : tensor<16x1xi32, #blocked>
|
45 |
+
%21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked>
|
46 |
+
%22 = arith.cmpi slt, %13, %cst_11 : tensor<1x256xi32, #blocked2>
|
47 |
+
%23 = arith.muli %20, %cst_1 : tensor<16x1xi32, #blocked>
|
48 |
+
%24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<16x256xi32, #blocked>
|
49 |
+
%25 = tt.broadcast %23 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked>
|
50 |
+
%26 = arith.addi %24, %25 : tensor<16x256xi32, #blocked>
|
51 |
+
%27 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>, #blocked>
|
52 |
+
%28 = tt.addptr %27, %26 : tensor<16x256x!tt.ptr<f32, 1>, #blocked>, tensor<16x256xi32, #blocked>
|
53 |
+
%29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<16x256xi1, #blocked>
|
54 |
+
%30 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
|
55 |
+
%31 = arith.muli %8, %cst_1 : tensor<16x1xi32, #blocked>
|
56 |
+
%32 = tt.broadcast %31 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked>
|
57 |
+
%33 = arith.addi %24, %32 : tensor<16x256xi32, #blocked>
|
58 |
+
%34 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<16x256x!tt.ptr<bf16, 1>, #blocked>
|
59 |
+
%35 = tt.addptr %34, %33 : tensor<16x256x!tt.ptr<bf16, 1>, #blocked>, tensor<16x256xi32, #blocked>
|
60 |
+
%36 = tt.load %35, %29, %cst_15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xbf16, #blocked>
|
61 |
+
%37 = arith.extf %36 : tensor<16x256xbf16, #blocked> to tensor<16x256xf32, #blocked>
|
62 |
+
%38 = arith.addi %18, %cst_5 : tensor<16x1xi64, #blocked>
|
63 |
+
%39 = arith.addi %19, %cst_8 : tensor<16x1xi64, #blocked1>
|
64 |
+
%40 = arith.cmpi slt, %18, %cst_6 : tensor<16x1xi64, #blocked>
|
65 |
+
%41 = arith.cmpi slt, %19, %cst_7 : tensor<16x1xi64, #blocked1>
|
66 |
+
%42 = arith.select %40, %38, %18 : tensor<16x1xi1, #blocked>, tensor<16x1xi64, #blocked>
|
67 |
+
%43 = arith.select %41, %39, %19 : tensor<16x1xi1, #blocked1>, tensor<16x1xi64, #blocked1>
|
68 |
+
%44 = arith.cmpi sge, %43, %cst_7 : tensor<16x1xi64, #blocked1>
|
69 |
+
%45 = arith.cmpi slt, %43, %cst_8 : tensor<16x1xi64, #blocked1>
|
70 |
+
%46 = arith.andi %44, %45 : tensor<16x1xi1, #blocked1>
|
71 |
+
tt.assert %46, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1>
|
72 |
+
%47 = arith.muli %42, %cst_4 : tensor<16x1xi64, #blocked>
|
73 |
+
%48 = tt.broadcast %47 : (tensor<16x1xi64, #blocked>) -> tensor<16x256xi64, #blocked>
|
74 |
+
%49 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
|
75 |
+
%50 = tt.broadcast %49 : (tensor<1x256xi64, #blocked>) -> tensor<16x256xi64, #blocked>
|
76 |
+
%51 = arith.addi %50, %48 : tensor<16x256xi64, #blocked>
|
77 |
+
%52 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>, #blocked>
|
78 |
+
%53 = tt.addptr %52, %51 : tensor<16x256x!tt.ptr<f32, 1>, #blocked>, tensor<16x256xi64, #blocked>
|
79 |
+
%54 = tt.load %53, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
|
80 |
+
%55 = arith.addf %54, %30 : tensor<16x256xf32, #blocked>
|
81 |
+
%56 = arith.addf %55, %37 : tensor<16x256xf32, #blocked>
|
82 |
+
%57 = arith.addf %56, %cst_14 : tensor<16x256xf32, #blocked>
|
83 |
+
%58 = arith.subf %56, %57 : tensor<16x256xf32, #blocked>
|
84 |
+
%59 = arith.mulf %56, %58 : tensor<16x256xf32, #blocked>
|
85 |
+
%60 = arith.addf %59, %cst_14 : tensor<16x256xf32, #blocked>
|
86 |
+
%61 = arith.select %29, %57, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked>
|
87 |
+
%62 = arith.select %29, %60, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked>
|
88 |
+
%63 = arith.select %21, %cst_2, %cst_3 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked>
|
89 |
+
%64 = tt.broadcast %63 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked>
|
90 |
+
%65:3 = "tt.reduce"(%61, %62, %64) <{axis = 1 : i32}> ({
|
91 |
+
^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
|
92 |
+
%90 = arith.subf %arg11, %arg8 : f32
|
93 |
+
%91 = arith.addf %arg10, %arg13 : f32
|
94 |
+
%92 = arith.cmpf oeq, %91, %cst_9 : f32
|
95 |
+
%93 = arith.divf %arg13, %91 : f32
|
96 |
+
%94 = arith.select %92, %cst_9, %93 : f32
|
97 |
+
%95 = arith.mulf %90, %94 : f32
|
98 |
+
%96 = arith.addf %arg8, %95 : f32
|
99 |
+
%97 = arith.addf %arg9, %arg12 : f32
|
100 |
+
%98 = arith.mulf %90, %90 : f32
|
101 |
+
%99 = arith.mulf %98, %arg10 : f32
|
102 |
+
%100 = arith.mulf %99, %94 : f32
|
103 |
+
%101 = arith.addf %97, %100 : f32
|
104 |
+
tt.reduce.return %96, %101, %91 : f32, f32, f32
|
105 |
+
}) : (tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
|
106 |
+
%66 = tt.expand_dims %65#0 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked>
|
107 |
+
%67 = tt.expand_dims %65#1 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked>
|
108 |
+
%68 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
|
109 |
+
%69 = tt.load %35, %29, %cst_15 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xbf16, #blocked>
|
110 |
+
%70 = arith.extf %69 : tensor<16x256xbf16, #blocked> to tensor<16x256xf32, #blocked>
|
111 |
+
%71 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked2>
|
112 |
+
%72 = tt.addptr %71, %13 : tensor<1x256x!tt.ptr<f32, 1>, #blocked2>, tensor<1x256xi32, #blocked2>
|
113 |
+
%73 = tt.load %72, %22, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked2>
|
114 |
+
tt.assert %46, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1>
|
115 |
+
%74 = tt.load %53, %29, %cst_14 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
|
116 |
+
%75 = arith.addf %74, %68 : tensor<16x256xf32, #blocked>
|
117 |
+
%76 = arith.addf %75, %70 : tensor<16x256xf32, #blocked>
|
118 |
+
%77 = tt.broadcast %66 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked>
|
119 |
+
%78 = arith.subf %76, %77 : tensor<16x256xf32, #blocked>
|
120 |
+
%79 = arith.divf %67, %cst_13 : tensor<16x1xf32, #blocked>
|
121 |
+
%80 = arith.addf %79, %cst_12 : tensor<16x1xf32, #blocked>
|
122 |
+
%81 = tt.extern_elementwise %80 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32, #blocked>) -> tensor<16x1xf32, #blocked>
|
123 |
+
%82 = tt.broadcast %81 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked>
|
124 |
+
%83 = arith.mulf %78, %82 : tensor<16x256xf32, #blocked>
|
125 |
+
%84 = triton_gpu.convert_layout %73 : (tensor<1x256xf32, #blocked2>) -> tensor<1x256xf32, #blocked>
|
126 |
+
%85 = tt.broadcast %84 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked>
|
127 |
+
%86 = arith.mulf %83, %85 : tensor<16x256xf32, #blocked>
|
128 |
+
%87 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<16x256x!tt.ptr<bf16, 1>, #blocked>
|
129 |
+
%88 = tt.addptr %87, %33 : tensor<16x256x!tt.ptr<bf16, 1>, #blocked>, tensor<16x256xi32, #blocked>
|
130 |
+
%89 = arith.truncf %86 : tensor<16x256xf32, #blocked> to tensor<16x256xbf16, #blocked>
|
131 |
+
tt.store %88, %89, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<16x256xbf16, #blocked>
|
132 |
+
tt.return
|
133 |
+
}
|
134 |
+
}
|
.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttir
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<16x256xbf16>
|
4 |
+
%cst_0 = arith.constant dense<1.000000e+00> : tensor<1x256xf32>
|
5 |
+
%cst_1 = arith.constant dense<0.000000e+00> : tensor<1x256xf32>
|
6 |
+
%cst_2 = arith.constant 0.000000e+00 : f32
|
7 |
+
%cst_3 = arith.constant dense<256> : tensor<16x1xi64>
|
8 |
+
%cst_4 = arith.constant dense<50257> : tensor<16x1xi64>
|
9 |
+
%cst_5 = arith.constant dense<0> : tensor<16x1xi64>
|
10 |
+
%cst_6 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32>
|
11 |
+
%cst_7 = arith.constant dense<2.560000e+02> : tensor<16x1xf32>
|
12 |
+
%cst_8 = arith.constant dense<0.000000e+00> : tensor<16x256xf32>
|
13 |
+
%cst_9 = arith.constant dense<256> : tensor<16x1xi32>
|
14 |
+
%cst_10 = arith.constant dense<256> : tensor<1x256xi32>
|
15 |
+
%cst_11 = arith.constant dense<512> : tensor<16x1xi32>
|
16 |
+
%c16_i32 = arith.constant 16 : i32
|
17 |
+
%0 = tt.get_program_id x : i32
|
18 |
+
%1 = arith.muli %0, %c16_i32 : i32
|
19 |
+
%2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
|
20 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32>
|
21 |
+
%4 = tt.splat %1 : (i32) -> tensor<16x1xi32>
|
22 |
+
%5 = arith.addi %4, %3 : tensor<16x1xi32>
|
23 |
+
%6 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
24 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32>
|
25 |
+
%8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>>
|
26 |
+
%9 = tt.addptr %8, %5 : tensor<16x1x!tt.ptr<i64, 1>>, tensor<16x1xi32>
|
27 |
+
%10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64>
|
28 |
+
%11 = arith.remsi %5, %cst_11 : tensor<16x1xi32>
|
29 |
+
%12 = arith.cmpi slt, %7, %cst_10 : tensor<1x256xi32>
|
30 |
+
%13 = arith.muli %11, %cst_9 : tensor<16x1xi32>
|
31 |
+
%14 = tt.broadcast %7 : (tensor<1x256xi32>) -> tensor<16x256xi32>
|
32 |
+
%15 = tt.broadcast %13 : (tensor<16x1xi32>) -> tensor<16x256xi32>
|
33 |
+
%16 = arith.addi %14, %15 : tensor<16x256xi32>
|
34 |
+
%17 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>>
|
35 |
+
%18 = tt.addptr %17, %16 : tensor<16x256x!tt.ptr<f32, 1>>, tensor<16x256xi32>
|
36 |
+
%19 = tt.broadcast %12 : (tensor<1x256xi1>) -> tensor<16x256xi1>
|
37 |
+
%20 = tt.load %18, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32>
|
38 |
+
%21 = arith.muli %5, %cst_9 : tensor<16x1xi32>
|
39 |
+
%22 = tt.broadcast %21 : (tensor<16x1xi32>) -> tensor<16x256xi32>
|
40 |
+
%23 = arith.addi %14, %22 : tensor<16x256xi32>
|
41 |
+
%24 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<16x256x!tt.ptr<bf16, 1>>
|
42 |
+
%25 = tt.addptr %24, %23 : tensor<16x256x!tt.ptr<bf16, 1>>, tensor<16x256xi32>
|
43 |
+
%26 = tt.load %25, %19, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xbf16>
|
44 |
+
%27 = arith.extf %26 : tensor<16x256xbf16> to tensor<16x256xf32>
|
45 |
+
%28 = arith.addi %10, %cst_4 : tensor<16x1xi64>
|
46 |
+
%29 = arith.cmpi slt, %10, %cst_5 : tensor<16x1xi64>
|
47 |
+
%30 = arith.select %29, %28, %10 : tensor<16x1xi1>, tensor<16x1xi64>
|
48 |
+
%31 = arith.cmpi sge, %30, %cst_5 : tensor<16x1xi64>
|
49 |
+
%32 = arith.cmpi slt, %30, %cst_4 : tensor<16x1xi64>
|
50 |
+
%33 = arith.andi %31, %32 : tensor<16x1xi1>
|
51 |
+
tt.assert %33, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1>
|
52 |
+
%34 = arith.muli %30, %cst_3 : tensor<16x1xi64>
|
53 |
+
%35 = tt.broadcast %34 : (tensor<16x1xi64>) -> tensor<16x256xi64>
|
54 |
+
%36 = arith.extsi %7 : tensor<1x256xi32> to tensor<1x256xi64>
|
55 |
+
%37 = tt.broadcast %36 : (tensor<1x256xi64>) -> tensor<16x256xi64>
|
56 |
+
%38 = arith.addi %37, %35 : tensor<16x256xi64>
|
57 |
+
%39 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>>
|
58 |
+
%40 = tt.addptr %39, %38 : tensor<16x256x!tt.ptr<f32, 1>>, tensor<16x256xi64>
|
59 |
+
%41 = tt.load %40, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32>
|
60 |
+
%42 = arith.addf %41, %20 : tensor<16x256xf32>
|
61 |
+
%43 = arith.addf %42, %27 : tensor<16x256xf32>
|
62 |
+
%44 = arith.addf %43, %cst_8 : tensor<16x256xf32>
|
63 |
+
%45 = arith.subf %43, %44 : tensor<16x256xf32>
|
64 |
+
%46 = arith.mulf %43, %45 : tensor<16x256xf32>
|
65 |
+
%47 = arith.addf %46, %cst_8 : tensor<16x256xf32>
|
66 |
+
%48 = arith.select %19, %44, %cst_8 : tensor<16x256xi1>, tensor<16x256xf32>
|
67 |
+
%49 = arith.select %19, %47, %cst_8 : tensor<16x256xi1>, tensor<16x256xf32>
|
68 |
+
%50 = arith.select %12, %cst_0, %cst_1 : tensor<1x256xi1>, tensor<1x256xf32>
|
69 |
+
%51 = tt.broadcast %50 : (tensor<1x256xf32>) -> tensor<16x256xf32>
|
70 |
+
%52:3 = "tt.reduce"(%48, %49, %51) <{axis = 1 : i32}> ({
|
71 |
+
^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
|
72 |
+
%76 = arith.subf %arg11, %arg8 : f32
|
73 |
+
%77 = arith.addf %arg10, %arg13 : f32
|
74 |
+
%78 = arith.cmpf oeq, %77, %cst_2 : f32
|
75 |
+
%79 = arith.divf %arg13, %77 : f32
|
76 |
+
%80 = arith.select %78, %cst_2, %79 : f32
|
77 |
+
%81 = arith.mulf %76, %80 : f32
|
78 |
+
%82 = arith.addf %arg8, %81 : f32
|
79 |
+
%83 = arith.addf %arg9, %arg12 : f32
|
80 |
+
%84 = arith.mulf %76, %76 : f32
|
81 |
+
%85 = arith.mulf %84, %arg10 : f32
|
82 |
+
%86 = arith.mulf %85, %80 : f32
|
83 |
+
%87 = arith.addf %83, %86 : f32
|
84 |
+
tt.reduce.return %82, %87, %77 : f32, f32, f32
|
85 |
+
}) : (tensor<16x256xf32>, tensor<16x256xf32>, tensor<16x256xf32>) -> (tensor<16xf32>, tensor<16xf32>, tensor<16xf32>)
|
86 |
+
%53 = tt.expand_dims %52#0 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32>
|
87 |
+
%54 = tt.expand_dims %52#1 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32>
|
88 |
+
%55 = tt.load %18, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32>
|
89 |
+
%56 = tt.load %25, %19, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xbf16>
|
90 |
+
%57 = arith.extf %56 : tensor<16x256xbf16> to tensor<16x256xf32>
|
91 |
+
%58 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
|
92 |
+
%59 = tt.addptr %58, %7 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
|
93 |
+
%60 = tt.load %59, %12, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32>
|
94 |
+
tt.assert %33, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1>
|
95 |
+
%61 = tt.load %40, %19, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xf32>
|
96 |
+
%62 = arith.addf %61, %55 : tensor<16x256xf32>
|
97 |
+
%63 = arith.addf %62, %57 : tensor<16x256xf32>
|
98 |
+
%64 = tt.broadcast %53 : (tensor<16x1xf32>) -> tensor<16x256xf32>
|
99 |
+
%65 = arith.subf %63, %64 : tensor<16x256xf32>
|
100 |
+
%66 = arith.divf %54, %cst_7 : tensor<16x1xf32>
|
101 |
+
%67 = arith.addf %66, %cst_6 : tensor<16x1xf32>
|
102 |
+
%68 = tt.extern_elementwise %67 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32>) -> tensor<16x1xf32>
|
103 |
+
%69 = tt.broadcast %68 : (tensor<16x1xf32>) -> tensor<16x256xf32>
|
104 |
+
%70 = arith.mulf %65, %69 : tensor<16x256xf32>
|
105 |
+
%71 = tt.broadcast %60 : (tensor<1x256xf32>) -> tensor<16x256xf32>
|
106 |
+
%72 = arith.mulf %70, %71 : tensor<16x256xf32>
|
107 |
+
%73 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<16x256x!tt.ptr<bf16, 1>>
|
108 |
+
%74 = tt.addptr %73, %23 : tensor<16x256x!tt.ptr<bf16, 1>>, tensor<16x256xi32>
|
109 |
+
%75 = arith.truncf %72 : tensor<16x256xf32> to tensor<16x256xbf16>
|
110 |
+
tt.store %74, %75, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<16x256xbf16>
|
111 |
+
tt.return
|
112 |
+
}
|
113 |
+
}
|
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.llir
ADDED
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
|
7 |
+
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%7 = and i32 %6, 31, !dbg !8
|
9 |
+
%8 = lshr i32 %6, 5, !dbg !8
|
10 |
+
%9 = and i32 %6, 3, !dbg !8
|
11 |
+
%10 = and i32 %8, 3, !dbg !9
|
12 |
+
%urem = and i32 %6, 127, !dbg !9
|
13 |
+
%11 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
|
14 |
+
%12 = shl i32 %11, 2, !dbg !11
|
15 |
+
%13 = or i32 %12, %9, !dbg !12
|
16 |
+
%14 = icmp ult i32 %urem, 120, !dbg !13
|
17 |
+
%15 = shl nuw nsw i32 %urem, 17, !dbg !14
|
18 |
+
%16 = add i32 %12, %15, !dbg !15
|
19 |
+
%17 = sext i32 %16 to i64, !dbg !16
|
20 |
+
%18 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !16
|
21 |
+
%19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %18, i1 %14, i32 0, i1 %14, i32 0, i1 %14, i32 0, i1 %14, i32 0, i1 %14) #3, !dbg !17
|
22 |
+
%20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !17
|
23 |
+
%21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !17
|
24 |
+
%22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !17
|
25 |
+
%23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !17
|
26 |
+
%24 = bitcast i32 %20 to float, !dbg !17
|
27 |
+
%25 = bitcast i32 %21 to float, !dbg !17
|
28 |
+
%26 = bitcast i32 %22 to float, !dbg !17
|
29 |
+
%27 = bitcast i32 %23 to float, !dbg !17
|
30 |
+
%28 = fadd float %24, 0.000000e+00, !dbg !18
|
31 |
+
%29 = fadd float %25, 0.000000e+00, !dbg !18
|
32 |
+
%30 = fadd float %26, 0.000000e+00, !dbg !18
|
33 |
+
%31 = fadd float %27, 0.000000e+00, !dbg !18
|
34 |
+
%32 = select i1 %14, float %28, float 0.000000e+00, !dbg !19
|
35 |
+
%33 = select i1 %14, float %29, float 0.000000e+00, !dbg !19
|
36 |
+
%34 = select i1 %14, float %30, float 0.000000e+00, !dbg !19
|
37 |
+
%35 = select i1 %14, float %31, float 0.000000e+00, !dbg !19
|
38 |
+
%36 = bitcast float %32 to i32, !dbg !20
|
39 |
+
%37 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 16, i32 31), !dbg !20
|
40 |
+
%38 = bitcast i32 %37 to float, !dbg !20
|
41 |
+
%39 = fadd float %32, %38, !dbg !24
|
42 |
+
%40 = bitcast float %39 to i32, !dbg !20
|
43 |
+
%41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 8, i32 31), !dbg !20
|
44 |
+
%42 = bitcast i32 %41 to float, !dbg !20
|
45 |
+
%43 = fadd float %39, %42, !dbg !24
|
46 |
+
%44 = bitcast float %43 to i32, !dbg !20
|
47 |
+
%45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 4, i32 31), !dbg !20
|
48 |
+
%46 = bitcast i32 %45 to float, !dbg !20
|
49 |
+
%47 = fadd float %43, %46, !dbg !24
|
50 |
+
%48 = bitcast float %47 to i32, !dbg !20
|
51 |
+
%49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 2, i32 31), !dbg !20
|
52 |
+
%50 = bitcast i32 %49 to float, !dbg !20
|
53 |
+
%51 = fadd float %47, %50, !dbg !24
|
54 |
+
%52 = bitcast float %51 to i32, !dbg !20
|
55 |
+
%53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 1, i32 31), !dbg !20
|
56 |
+
%54 = bitcast i32 %53 to float, !dbg !20
|
57 |
+
%55 = fadd float %51, %54, !dbg !24
|
58 |
+
%56 = bitcast float %33 to i32, !dbg !20
|
59 |
+
%57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 16, i32 31), !dbg !20
|
60 |
+
%58 = bitcast i32 %57 to float, !dbg !20
|
61 |
+
%59 = fadd float %33, %58, !dbg !24
|
62 |
+
%60 = bitcast float %59 to i32, !dbg !20
|
63 |
+
%61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 8, i32 31), !dbg !20
|
64 |
+
%62 = bitcast i32 %61 to float, !dbg !20
|
65 |
+
%63 = fadd float %59, %62, !dbg !24
|
66 |
+
%64 = bitcast float %63 to i32, !dbg !20
|
67 |
+
%65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 4, i32 31), !dbg !20
|
68 |
+
%66 = bitcast i32 %65 to float, !dbg !20
|
69 |
+
%67 = fadd float %63, %66, !dbg !24
|
70 |
+
%68 = bitcast float %67 to i32, !dbg !20
|
71 |
+
%69 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %68, i32 2, i32 31), !dbg !20
|
72 |
+
%70 = bitcast i32 %69 to float, !dbg !20
|
73 |
+
%71 = fadd float %67, %70, !dbg !24
|
74 |
+
%72 = bitcast float %71 to i32, !dbg !20
|
75 |
+
%73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !20
|
76 |
+
%74 = bitcast i32 %73 to float, !dbg !20
|
77 |
+
%75 = fadd float %71, %74, !dbg !24
|
78 |
+
%76 = bitcast float %34 to i32, !dbg !20
|
79 |
+
%77 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %76, i32 16, i32 31), !dbg !20
|
80 |
+
%78 = bitcast i32 %77 to float, !dbg !20
|
81 |
+
%79 = fadd float %34, %78, !dbg !24
|
82 |
+
%80 = bitcast float %79 to i32, !dbg !20
|
83 |
+
%81 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %80, i32 8, i32 31), !dbg !20
|
84 |
+
%82 = bitcast i32 %81 to float, !dbg !20
|
85 |
+
%83 = fadd float %79, %82, !dbg !24
|
86 |
+
%84 = bitcast float %83 to i32, !dbg !20
|
87 |
+
%85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 4, i32 31), !dbg !20
|
88 |
+
%86 = bitcast i32 %85 to float, !dbg !20
|
89 |
+
%87 = fadd float %83, %86, !dbg !24
|
90 |
+
%88 = bitcast float %87 to i32, !dbg !20
|
91 |
+
%89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 2, i32 31), !dbg !20
|
92 |
+
%90 = bitcast i32 %89 to float, !dbg !20
|
93 |
+
%91 = fadd float %87, %90, !dbg !24
|
94 |
+
%92 = bitcast float %91 to i32, !dbg !20
|
95 |
+
%93 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 1, i32 31), !dbg !20
|
96 |
+
%94 = bitcast i32 %93 to float, !dbg !20
|
97 |
+
%95 = fadd float %91, %94, !dbg !24
|
98 |
+
%96 = bitcast float %35 to i32, !dbg !20
|
99 |
+
%97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 16, i32 31), !dbg !20
|
100 |
+
%98 = bitcast i32 %97 to float, !dbg !20
|
101 |
+
%99 = fadd float %35, %98, !dbg !24
|
102 |
+
%100 = bitcast float %99 to i32, !dbg !20
|
103 |
+
%101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 8, i32 31), !dbg !20
|
104 |
+
%102 = bitcast i32 %101 to float, !dbg !20
|
105 |
+
%103 = fadd float %99, %102, !dbg !24
|
106 |
+
%104 = bitcast float %103 to i32, !dbg !20
|
107 |
+
%105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 4, i32 31), !dbg !20
|
108 |
+
%106 = bitcast i32 %105 to float, !dbg !20
|
109 |
+
%107 = fadd float %103, %106, !dbg !24
|
110 |
+
%108 = bitcast float %107 to i32, !dbg !20
|
111 |
+
%109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 2, i32 31), !dbg !20
|
112 |
+
%110 = bitcast i32 %109 to float, !dbg !20
|
113 |
+
%111 = fadd float %107, %110, !dbg !24
|
114 |
+
%112 = bitcast float %111 to i32, !dbg !20
|
115 |
+
%113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !20
|
116 |
+
%114 = bitcast i32 %113 to float, !dbg !20
|
117 |
+
%115 = fadd float %111, %114, !dbg !24
|
118 |
+
%116 = icmp eq i32 %7, 0, !dbg !20
|
119 |
+
%117 = zext nneg i32 %10 to i64, !dbg !20
|
120 |
+
%118 = getelementptr float, ptr addrspace(3) @global_smem, i64 %117, !dbg !20
|
121 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %118, float %55, i1 %116) #3, !dbg !20
|
122 |
+
%119 = or i32 %10, 4, !dbg !20
|
123 |
+
%120 = zext nneg i32 %119 to i64, !dbg !20
|
124 |
+
%121 = getelementptr float, ptr addrspace(3) @global_smem, i64 %120, !dbg !20
|
125 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %121, float %75, i1 %116) #3, !dbg !20
|
126 |
+
%122 = or i32 %10, 8, !dbg !20
|
127 |
+
%123 = zext nneg i32 %122 to i64, !dbg !20
|
128 |
+
%124 = getelementptr float, ptr addrspace(3) @global_smem, i64 %123, !dbg !20
|
129 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %124, float %95, i1 %116) #3, !dbg !20
|
130 |
+
%125 = or i32 %10, 12, !dbg !20
|
131 |
+
%126 = zext nneg i32 %125 to i64, !dbg !20
|
132 |
+
%127 = getelementptr float, ptr addrspace(3) @global_smem, i64 %126, !dbg !20
|
133 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %127, float %115, i1 %116) #3, !dbg !20
|
134 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !20
|
135 |
+
%128 = icmp slt i32 %6, 16, !dbg !20
|
136 |
+
%129 = sext i32 %6 to i64, !dbg !20
|
137 |
+
%130 = getelementptr float, ptr addrspace(3) @global_smem, i64 %129, !dbg !20
|
138 |
+
%131 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %130, i1 %128) #3, !dbg !20
|
139 |
+
%132 = bitcast float %131 to i32, !dbg !20
|
140 |
+
%133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 2, i32 31), !dbg !20
|
141 |
+
%134 = bitcast i32 %133 to float, !dbg !20
|
142 |
+
%135 = fadd float %131, %134, !dbg !24
|
143 |
+
%136 = bitcast float %135 to i32, !dbg !20
|
144 |
+
%137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 1, i32 31), !dbg !20
|
145 |
+
%138 = bitcast i32 %137 to float, !dbg !20
|
146 |
+
%139 = fadd float %135, %138, !dbg !24
|
147 |
+
%140 = icmp eq i32 %9, 0, !dbg !20
|
148 |
+
%141 = and i1 %128, %140, !dbg !20
|
149 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %139, i1 %141) #3, !dbg !20
|
150 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !20
|
151 |
+
%142 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !20
|
152 |
+
%143 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), align 4, !dbg !20
|
153 |
+
%144 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), align 4, !dbg !20
|
154 |
+
%145 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 48), align 4, !dbg !20
|
155 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !28
|
156 |
+
%146 = insertelement <1 x float> undef, float %142, i64 0, !dbg !28
|
157 |
+
store <1 x float> %146, ptr addrspace(3) @global_smem, align 4, !dbg !28
|
158 |
+
%147 = insertelement <1 x float> undef, float %143, i64 0, !dbg !28
|
159 |
+
store <1 x float> %147, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 4), align 4, !dbg !28
|
160 |
+
%148 = insertelement <1 x float> undef, float %144, i64 0, !dbg !28
|
161 |
+
store <1 x float> %148, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), align 4, !dbg !28
|
162 |
+
%149 = insertelement <1 x float> undef, float %145, i64 0, !dbg !28
|
163 |
+
store <1 x float> %149, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 12), align 4, !dbg !28
|
164 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !28
|
165 |
+
%150 = zext nneg i32 %9 to i64, !dbg !28
|
166 |
+
%151 = getelementptr float, ptr addrspace(3) @global_smem, i64 %150, !dbg !28
|
167 |
+
%152 = load <1 x float>, ptr addrspace(3) %151, align 4, !dbg !28
|
168 |
+
%.frozen = freeze i32 %13
|
169 |
+
%153 = sdiv i32 %.frozen, 256, !dbg !29
|
170 |
+
%154 = mul i32 %153, 256
|
171 |
+
%.decomposed = sub i32 %.frozen, %154
|
172 |
+
%155 = sext i32 %153 to i64, !dbg !30
|
173 |
+
%156 = getelementptr i64, ptr addrspace(1) %1, i64 %155, !dbg !30
|
174 |
+
%157 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %156, i1 true) #3, !dbg !31
|
175 |
+
%158 = lshr i64 %157, 54, !dbg !32
|
176 |
+
%159 = and i64 %158, 512, !dbg !32
|
177 |
+
%160 = add i64 %159, %157, !dbg !32
|
178 |
+
%161 = shl i64 %160, 8, !dbg !33
|
179 |
+
%162 = sext i32 %.decomposed to i64, !dbg !34
|
180 |
+
%163 = getelementptr float, ptr addrspace(1) %2, i64 %161, !dbg !35
|
181 |
+
%164 = getelementptr float, ptr addrspace(1) %163, i64 %162, !dbg !35
|
182 |
+
%165 = lshr i32 %7, 2, !dbg !36
|
183 |
+
%166 = shl nuw nsw i32 %10, 3, !dbg !36
|
184 |
+
%167 = or i32 %166, %165, !dbg !36
|
185 |
+
%168 = icmp eq i32 %167, 0, !dbg !36
|
186 |
+
%169 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %164, <1 x float> %152, i1 %168) #3, !dbg !36
|
187 |
+
ret void, !dbg !37
|
188 |
+
}
|
189 |
+
|
190 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
191 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
192 |
+
|
193 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
194 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
195 |
+
|
196 |
+
; Function Attrs: convergent nocallback nounwind
|
197 |
+
declare void @llvm.nvvm.barrier0() #2
|
198 |
+
|
199 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
200 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
201 |
+
attributes #2 = { convergent nocallback nounwind }
|
202 |
+
attributes #3 = { nounwind }
|
203 |
+
|
204 |
+
!llvm.module.flags = !{!0}
|
205 |
+
!llvm.dbg.cu = !{!1}
|
206 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
207 |
+
|
208 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
209 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
210 |
+
!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
|
211 |
+
!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
|
212 |
+
!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 128}
|
213 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
214 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
215 |
+
!7 = !{}
|
216 |
+
!8 = !DILocation(line: 22, column: 44, scope: !5)
|
217 |
+
!9 = !DILocation(line: 24, column: 33, scope: !5)
|
218 |
+
!10 = !DILocation(line: 21, column: 28, scope: !5)
|
219 |
+
!11 = !DILocation(line: 21, column: 33, scope: !5)
|
220 |
+
!12 = !DILocation(line: 22, column: 23, scope: !5)
|
221 |
+
!13 = !DILocation(line: 29, column: 25, scope: !5)
|
222 |
+
!14 = !DILocation(line: 31, column: 47, scope: !5)
|
223 |
+
!15 = !DILocation(line: 31, column: 40, scope: !5)
|
224 |
+
!16 = !DILocation(line: 31, column: 34, scope: !5)
|
225 |
+
!17 = !DILocation(line: 31, column: 53, scope: !5)
|
226 |
+
!18 = !DILocation(line: 33, column: 23, scope: !5)
|
227 |
+
!19 = !DILocation(line: 34, column: 38, scope: !5)
|
228 |
+
!20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23)
|
229 |
+
!21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0)
|
230 |
+
!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
231 |
+
!23 = !DILocation(line: 35, column: 25, scope: !21)
|
232 |
+
!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
|
233 |
+
!25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0)
|
234 |
+
!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
|
235 |
+
!27 = !DILocation(line: 35, column: 25, scope: !25)
|
236 |
+
!28 = !DILocation(line: 35, column: 28, scope: !5)
|
237 |
+
!29 = !DILocation(line: 36, column: 20, scope: !5)
|
238 |
+
!30 = !DILocation(line: 38, column: 30, scope: !5)
|
239 |
+
!31 = !DILocation(line: 38, column: 35, scope: !5)
|
240 |
+
!32 = !DILocation(line: 41, column: 32, scope: !5)
|
241 |
+
!33 = !DILocation(line: 45, column: 40, scope: !5)
|
242 |
+
!34 = !DILocation(line: 45, column: 36, scope: !5)
|
243 |
+
!35 = !DILocation(line: 45, column: 30, scope: !5)
|
244 |
+
!36 = !DILocation(line: 45, column: 55, scope: !5)
|
245 |
+
!37 = !DILocation(line: 45, column: 4, scope: !5)
|
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ptx
ADDED
@@ -0,0 +1,651 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3de4e
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3de4e(
|
13 |
+
.param .u64 triton__0d1d2d3de4e_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3de4e_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3de4e_param_2,
|
16 |
+
.param .u32 triton__0d1d2d3de4e_param_3,
|
17 |
+
.param .u32 triton__0d1d2d3de4e_param_4
|
18 |
+
)
|
19 |
+
.maxntid 128, 1, 1
|
20 |
+
{
|
21 |
+
.reg .pred %p<15>;
|
22 |
+
.reg .b32 %r<91>;
|
23 |
+
.reg .f32 %f<62>;
|
24 |
+
.reg .b64 %rd<16>;
|
25 |
+
.loc 1 18 0
|
26 |
+
$L__func_begin0:
|
27 |
+
.loc 1 18 0
|
28 |
+
|
29 |
+
ld.param.u64 %rd5, [triton__0d1d2d3de4e_param_0];
|
30 |
+
ld.param.u64 %rd6, [triton__0d1d2d3de4e_param_1];
|
31 |
+
$L__tmp0:
|
32 |
+
.loc 1 22 44
|
33 |
+
mov.u32 %r24, %tid.x;
|
34 |
+
and.b32 %r25, %r24, 31;
|
35 |
+
ld.param.u64 %rd7, [triton__0d1d2d3de4e_param_2];
|
36 |
+
and.b32 %r26, %r24, 3;
|
37 |
+
.loc 1 24 33
|
38 |
+
bfe.u32 %r27, %r24, 5, 2;
|
39 |
+
and.b32 %r28, %r24, 127;
|
40 |
+
.loc 1 21 28
|
41 |
+
mov.u32 %r1, %ctaid.x;
|
42 |
+
.loc 1 21 33
|
43 |
+
shl.b32 %r29, %r1, 2;
|
44 |
+
.loc 1 22 23
|
45 |
+
or.b32 %r30, %r29, %r26;
|
46 |
+
.loc 1 29 25
|
47 |
+
setp.lt.u32 %p1, %r28, 120;
|
48 |
+
.loc 1 31 47
|
49 |
+
shl.b32 %r31, %r28, 17;
|
50 |
+
.loc 1 31 40
|
51 |
+
add.s32 %r32, %r29, %r31;
|
52 |
+
.loc 1 31 34
|
53 |
+
mul.wide.s32 %rd8, %r32, 4;
|
54 |
+
add.s64 %rd1, %rd5, %rd8;
|
55 |
+
mov.b32 %r6, 0;
|
56 |
+
.loc 1 31 53
|
57 |
+
mov.u32 %r2, 0x0;
|
58 |
+
mov.u32 %r3, 0x0;
|
59 |
+
mov.u32 %r4, 0x0;
|
60 |
+
mov.u32 %r5, 0x0;
|
61 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
62 |
+
@!%p1 mov.u32 %r2, %r6;
|
63 |
+
@!%p1 mov.u32 %r3, %r6;
|
64 |
+
@!%p1 mov.u32 %r4, %r6;
|
65 |
+
@!%p1 mov.u32 %r5, %r6;
|
66 |
+
mov.b32 %f1, %r2;
|
67 |
+
mov.b32 %f2, %r3;
|
68 |
+
mov.b32 %f3, %r4;
|
69 |
+
mov.b32 %f4, %r5;
|
70 |
+
.loc 1 33 23
|
71 |
+
add.f32 %f5, %f1, 0f00000000;
|
72 |
+
add.f32 %f6, %f2, 0f00000000;
|
73 |
+
add.f32 %f7, %f3, 0f00000000;
|
74 |
+
add.f32 %f8, %f4, 0f00000000;
|
75 |
+
.loc 1 34 38
|
76 |
+
selp.f32 %f9, %f5, 0f00000000, %p1;
|
77 |
+
selp.f32 %f10, %f6, 0f00000000, %p1;
|
78 |
+
selp.f32 %f11, %f7, 0f00000000, %p1;
|
79 |
+
selp.f32 %f12, %f8, 0f00000000, %p1;
|
80 |
+
$L__tmp1:
|
81 |
+
.loc 2 243 36
|
82 |
+
mov.b32 %r33, %f9;
|
83 |
+
shfl.sync.bfly.b32 %r34, %r33, 16, 31, -1;
|
84 |
+
mov.b32 %f13, %r34;
|
85 |
+
$L__tmp2:
|
86 |
+
.loc 2 233 15
|
87 |
+
add.f32 %f14, %f9, %f13;
|
88 |
+
$L__tmp3:
|
89 |
+
.loc 2 243 36
|
90 |
+
mov.b32 %r35, %f14;
|
91 |
+
shfl.sync.bfly.b32 %r36, %r35, 8, 31, -1;
|
92 |
+
mov.b32 %f15, %r36;
|
93 |
+
$L__tmp4:
|
94 |
+
.loc 2 233 15
|
95 |
+
add.f32 %f16, %f14, %f15;
|
96 |
+
$L__tmp5:
|
97 |
+
.loc 2 243 36
|
98 |
+
mov.b32 %r37, %f16;
|
99 |
+
shfl.sync.bfly.b32 %r38, %r37, 4, 31, -1;
|
100 |
+
mov.b32 %f17, %r38;
|
101 |
+
$L__tmp6:
|
102 |
+
.loc 2 233 15
|
103 |
+
add.f32 %f18, %f16, %f17;
|
104 |
+
$L__tmp7:
|
105 |
+
.loc 2 243 36
|
106 |
+
mov.b32 %r39, %f18;
|
107 |
+
shfl.sync.bfly.b32 %r40, %r39, 2, 31, -1;
|
108 |
+
mov.b32 %f19, %r40;
|
109 |
+
$L__tmp8:
|
110 |
+
.loc 2 233 15
|
111 |
+
add.f32 %f20, %f18, %f19;
|
112 |
+
$L__tmp9:
|
113 |
+
.loc 2 243 36
|
114 |
+
mov.b32 %r41, %f20;
|
115 |
+
shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1;
|
116 |
+
mov.b32 %f21, %r42;
|
117 |
+
$L__tmp10:
|
118 |
+
.loc 2 233 15
|
119 |
+
add.f32 %f22, %f20, %f21;
|
120 |
+
$L__tmp11:
|
121 |
+
.loc 2 243 36
|
122 |
+
mov.b32 %r43, %f10;
|
123 |
+
shfl.sync.bfly.b32 %r44, %r43, 16, 31, -1;
|
124 |
+
mov.b32 %f23, %r44;
|
125 |
+
$L__tmp12:
|
126 |
+
.loc 2 233 15
|
127 |
+
add.f32 %f24, %f10, %f23;
|
128 |
+
$L__tmp13:
|
129 |
+
.loc 2 243 36
|
130 |
+
mov.b32 %r45, %f24;
|
131 |
+
shfl.sync.bfly.b32 %r46, %r45, 8, 31, -1;
|
132 |
+
mov.b32 %f25, %r46;
|
133 |
+
$L__tmp14:
|
134 |
+
.loc 2 233 15
|
135 |
+
add.f32 %f26, %f24, %f25;
|
136 |
+
$L__tmp15:
|
137 |
+
.loc 2 243 36
|
138 |
+
mov.b32 %r47, %f26;
|
139 |
+
shfl.sync.bfly.b32 %r48, %r47, 4, 31, -1;
|
140 |
+
mov.b32 %f27, %r48;
|
141 |
+
$L__tmp16:
|
142 |
+
.loc 2 233 15
|
143 |
+
add.f32 %f28, %f26, %f27;
|
144 |
+
$L__tmp17:
|
145 |
+
.loc 2 243 36
|
146 |
+
mov.b32 %r49, %f28;
|
147 |
+
shfl.sync.bfly.b32 %r50, %r49, 2, 31, -1;
|
148 |
+
mov.b32 %f29, %r50;
|
149 |
+
$L__tmp18:
|
150 |
+
.loc 2 233 15
|
151 |
+
add.f32 %f30, %f28, %f29;
|
152 |
+
$L__tmp19:
|
153 |
+
.loc 2 243 36
|
154 |
+
mov.b32 %r51, %f30;
|
155 |
+
shfl.sync.bfly.b32 %r52, %r51, 1, 31, -1;
|
156 |
+
mov.b32 %f31, %r52;
|
157 |
+
$L__tmp20:
|
158 |
+
.loc 2 233 15
|
159 |
+
add.f32 %f32, %f30, %f31;
|
160 |
+
$L__tmp21:
|
161 |
+
.loc 2 243 36
|
162 |
+
mov.b32 %r53, %f11;
|
163 |
+
shfl.sync.bfly.b32 %r54, %r53, 16, 31, -1;
|
164 |
+
mov.b32 %f33, %r54;
|
165 |
+
$L__tmp22:
|
166 |
+
.loc 2 233 15
|
167 |
+
add.f32 %f34, %f11, %f33;
|
168 |
+
$L__tmp23:
|
169 |
+
.loc 2 243 36
|
170 |
+
mov.b32 %r55, %f34;
|
171 |
+
shfl.sync.bfly.b32 %r56, %r55, 8, 31, -1;
|
172 |
+
mov.b32 %f35, %r56;
|
173 |
+
$L__tmp24:
|
174 |
+
.loc 2 233 15
|
175 |
+
add.f32 %f36, %f34, %f35;
|
176 |
+
$L__tmp25:
|
177 |
+
.loc 2 243 36
|
178 |
+
mov.b32 %r57, %f36;
|
179 |
+
shfl.sync.bfly.b32 %r58, %r57, 4, 31, -1;
|
180 |
+
mov.b32 %f37, %r58;
|
181 |
+
$L__tmp26:
|
182 |
+
.loc 2 233 15
|
183 |
+
add.f32 %f38, %f36, %f37;
|
184 |
+
$L__tmp27:
|
185 |
+
.loc 2 243 36
|
186 |
+
mov.b32 %r59, %f38;
|
187 |
+
shfl.sync.bfly.b32 %r60, %r59, 2, 31, -1;
|
188 |
+
mov.b32 %f39, %r60;
|
189 |
+
$L__tmp28:
|
190 |
+
.loc 2 233 15
|
191 |
+
add.f32 %f40, %f38, %f39;
|
192 |
+
$L__tmp29:
|
193 |
+
.loc 2 243 36
|
194 |
+
mov.b32 %r61, %f40;
|
195 |
+
shfl.sync.bfly.b32 %r62, %r61, 1, 31, -1;
|
196 |
+
mov.b32 %f41, %r62;
|
197 |
+
$L__tmp30:
|
198 |
+
.loc 2 233 15
|
199 |
+
add.f32 %f42, %f40, %f41;
|
200 |
+
$L__tmp31:
|
201 |
+
.loc 2 243 36
|
202 |
+
mov.b32 %r63, %f12;
|
203 |
+
shfl.sync.bfly.b32 %r64, %r63, 16, 31, -1;
|
204 |
+
mov.b32 %f43, %r64;
|
205 |
+
$L__tmp32:
|
206 |
+
.loc 2 233 15
|
207 |
+
add.f32 %f44, %f12, %f43;
|
208 |
+
$L__tmp33:
|
209 |
+
.loc 2 243 36
|
210 |
+
mov.b32 %r65, %f44;
|
211 |
+
shfl.sync.bfly.b32 %r66, %r65, 8, 31, -1;
|
212 |
+
mov.b32 %f45, %r66;
|
213 |
+
$L__tmp34:
|
214 |
+
.loc 2 233 15
|
215 |
+
add.f32 %f46, %f44, %f45;
|
216 |
+
$L__tmp35:
|
217 |
+
.loc 2 243 36
|
218 |
+
mov.b32 %r67, %f46;
|
219 |
+
shfl.sync.bfly.b32 %r68, %r67, 4, 31, -1;
|
220 |
+
mov.b32 %f47, %r68;
|
221 |
+
$L__tmp36:
|
222 |
+
.loc 2 233 15
|
223 |
+
add.f32 %f48, %f46, %f47;
|
224 |
+
$L__tmp37:
|
225 |
+
.loc 2 243 36
|
226 |
+
mov.b32 %r69, %f48;
|
227 |
+
shfl.sync.bfly.b32 %r70, %r69, 2, 31, -1;
|
228 |
+
mov.b32 %f49, %r70;
|
229 |
+
$L__tmp38:
|
230 |
+
.loc 2 233 15
|
231 |
+
add.f32 %f50, %f48, %f49;
|
232 |
+
$L__tmp39:
|
233 |
+
.loc 2 243 36
|
234 |
+
mov.b32 %r71, %f50;
|
235 |
+
shfl.sync.bfly.b32 %r72, %r71, 1, 31, -1;
|
236 |
+
mov.b32 %f51, %r72;
|
237 |
+
$L__tmp40:
|
238 |
+
.loc 2 233 15
|
239 |
+
add.f32 %f52, %f50, %f51;
|
240 |
+
$L__tmp41:
|
241 |
+
.loc 2 243 36
|
242 |
+
setp.eq.s32 %p6, %r25, 0;
|
243 |
+
shl.b32 %r73, %r27, 2;
|
244 |
+
mov.u32 %r74, global_smem;
|
245 |
+
add.s32 %r10, %r74, %r73;
|
246 |
+
mov.b32 %r11, %f22;
|
247 |
+
@%p6 st.shared.b32 [ %r10 + 0 ], %r11;
|
248 |
+
add.s32 %r12, %r10, 16;
|
249 |
+
mov.b32 %r13, %f32;
|
250 |
+
@%p6 st.shared.b32 [ %r12 + 0 ], %r13;
|
251 |
+
add.s32 %r14, %r10, 32;
|
252 |
+
mov.b32 %r15, %f42;
|
253 |
+
@%p6 st.shared.b32 [ %r14 + 0 ], %r15;
|
254 |
+
add.s32 %r16, %r10, 48;
|
255 |
+
mov.b32 %r17, %f52;
|
256 |
+
@%p6 st.shared.b32 [ %r16 + 0 ], %r17;
|
257 |
+
bar.sync 0;
|
258 |
+
setp.lt.s32 %p10, %r24, 16;
|
259 |
+
shl.b32 %r75, %r24, 2;
|
260 |
+
add.s32 %r19, %r74, %r75;
|
261 |
+
@%p10 ld.shared.b32 %r18, [ %r19 + 0 ];
|
262 |
+
mov.b32 %f53, %r18;
|
263 |
+
shfl.sync.bfly.b32 %r76, %r18, 2, 31, -1;
|
264 |
+
mov.b32 %f54, %r76;
|
265 |
+
$L__tmp42:
|
266 |
+
.loc 2 233 15
|
267 |
+
add.f32 %f55, %f53, %f54;
|
268 |
+
$L__tmp43:
|
269 |
+
.loc 2 243 36
|
270 |
+
mov.b32 %r77, %f55;
|
271 |
+
shfl.sync.bfly.b32 %r78, %r77, 1, 31, -1;
|
272 |
+
mov.b32 %f56, %r78;
|
273 |
+
$L__tmp44:
|
274 |
+
.loc 2 233 15
|
275 |
+
add.f32 %f57, %f55, %f56;
|
276 |
+
$L__tmp45:
|
277 |
+
.loc 2 243 36
|
278 |
+
setp.eq.s32 %p14, %r26, 0;
|
279 |
+
and.pred %p11, %p10, %p14;
|
280 |
+
mov.b32 %r21, %f57;
|
281 |
+
@%p11 st.shared.b32 [ %r19 + 0 ], %r21;
|
282 |
+
bar.sync 0;
|
283 |
+
ld.shared.f32 %f58, [global_smem];
|
284 |
+
ld.shared.f32 %f59, [global_smem+16];
|
285 |
+
ld.shared.f32 %f60, [global_smem+32];
|
286 |
+
ld.shared.f32 %f61, [global_smem+48];
|
287 |
+
$L__tmp46:
|
288 |
+
.loc 1 35 28
|
289 |
+
bar.sync 0;
|
290 |
+
st.shared.f32 [global_smem], %f58;
|
291 |
+
st.shared.f32 [global_smem+4], %f59;
|
292 |
+
st.shared.f32 [global_smem+8], %f60;
|
293 |
+
st.shared.f32 [global_smem+12], %f61;
|
294 |
+
bar.sync 0;
|
295 |
+
shl.b32 %r79, %r26, 2;
|
296 |
+
add.s32 %r80, %r74, %r79;
|
297 |
+
.loc 1 36 20
|
298 |
+
shr.s32 %r82, %r30, 31;
|
299 |
+
shr.u32 %r83, %r82, 24;
|
300 |
+
add.s32 %r84, %r30, %r83;
|
301 |
+
shr.s32 %r85, %r84, 8;
|
302 |
+
and.b32 %r86, %r84, -256;
|
303 |
+
sub.s32 %r87, %r30, %r86;
|
304 |
+
.loc 1 38 30
|
305 |
+
mul.wide.s32 %rd9, %r85, 8;
|
306 |
+
add.s64 %rd3, %rd6, %rd9;
|
307 |
+
.loc 1 45 55
|
308 |
+
ld.shared.u32 %r23, [%r80];
|
309 |
+
mov.pred %p12, -1;
|
310 |
+
.loc 1 38 35
|
311 |
+
mov.u64 %rd2, 0x0;
|
312 |
+
@%p12 ld.global.L1::evict_last.b64 { %rd2 }, [ %rd3 + 0 ];
|
313 |
+
.loc 1 41 32
|
314 |
+
shr.u64 %rd10, %rd2, 54;
|
315 |
+
and.b64 %rd11, %rd10, 512;
|
316 |
+
add.s64 %rd12, %rd11, %rd2;
|
317 |
+
.loc 1 45 30
|
318 |
+
shl.b64 %rd13, %rd12, 10;
|
319 |
+
add.s64 %rd14, %rd7, %rd13;
|
320 |
+
mul.wide.s32 %rd15, %r87, 4;
|
321 |
+
add.s64 %rd4, %rd14, %rd15;
|
322 |
+
.loc 1 45 55
|
323 |
+
bfe.u32 %r88, %r24, 2, 3;
|
324 |
+
shl.b32 %r89, %r27, 3;
|
325 |
+
or.b32 %r90, %r89, %r88;
|
326 |
+
setp.eq.s32 %p13, %r90, 0;
|
327 |
+
mov.u32 %r22, 0x0;
|
328 |
+
@%p13 atom.global.gpu.acq_rel.add.f32 %r22, [ %rd4 + 0 ], %r23;
|
329 |
+
.loc 1 45 4
|
330 |
+
ret;
|
331 |
+
$L__tmp47:
|
332 |
+
$L__func_end0:
|
333 |
+
|
334 |
+
}
|
335 |
+
.file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
|
336 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
337 |
+
.section .debug_abbrev
|
338 |
+
{
|
339 |
+
.b8 1
|
340 |
+
.b8 17
|
341 |
+
.b8 1
|
342 |
+
.b8 37
|
343 |
+
.b8 8
|
344 |
+
.b8 19
|
345 |
+
.b8 5
|
346 |
+
.b8 3
|
347 |
+
.b8 8
|
348 |
+
.b8 16
|
349 |
+
.b8 6
|
350 |
+
.b8 27
|
351 |
+
.b8 8
|
352 |
+
.b8 180
|
353 |
+
.b8 66
|
354 |
+
.b8 12
|
355 |
+
.b8 17
|
356 |
+
.b8 1
|
357 |
+
.b8 18
|
358 |
+
.b8 1
|
359 |
+
.b8 0
|
360 |
+
.b8 0
|
361 |
+
.b8 2
|
362 |
+
.b8 46
|
363 |
+
.b8 0
|
364 |
+
.b8 135
|
365 |
+
.b8 64
|
366 |
+
.b8 8
|
367 |
+
.b8 3
|
368 |
+
.b8 8
|
369 |
+
.b8 58
|
370 |
+
.b8 11
|
371 |
+
.b8 59
|
372 |
+
.b8 11
|
373 |
+
.b8 63
|
374 |
+
.b8 12
|
375 |
+
.b8 32
|
376 |
+
.b8 11
|
377 |
+
.b8 0
|
378 |
+
.b8 0
|
379 |
+
.b8 3
|
380 |
+
.b8 46
|
381 |
+
.b8 1
|
382 |
+
.b8 17
|
383 |
+
.b8 1
|
384 |
+
.b8 18
|
385 |
+
.b8 1
|
386 |
+
.b8 64
|
387 |
+
.b8 10
|
388 |
+
.b8 49
|
389 |
+
.b8 19
|
390 |
+
.b8 0
|
391 |
+
.b8 0
|
392 |
+
.b8 4
|
393 |
+
.b8 29
|
394 |
+
.b8 0
|
395 |
+
.b8 49
|
396 |
+
.b8 19
|
397 |
+
.b8 17
|
398 |
+
.b8 1
|
399 |
+
.b8 18
|
400 |
+
.b8 1
|
401 |
+
.b8 88
|
402 |
+
.b8 11
|
403 |
+
.b8 89
|
404 |
+
.b8 11
|
405 |
+
.b8 87
|
406 |
+
.b8 11
|
407 |
+
.b8 0
|
408 |
+
.b8 0
|
409 |
+
.b8 5
|
410 |
+
.b8 29
|
411 |
+
.b8 1
|
412 |
+
.b8 49
|
413 |
+
.b8 19
|
414 |
+
.b8 17
|
415 |
+
.b8 1
|
416 |
+
.b8 18
|
417 |
+
.b8 1
|
418 |
+
.b8 88
|
419 |
+
.b8 11
|
420 |
+
.b8 89
|
421 |
+
.b8 11
|
422 |
+
.b8 87
|
423 |
+
.b8 11
|
424 |
+
.b8 0
|
425 |
+
.b8 0
|
426 |
+
.b8 0
|
427 |
+
}
|
428 |
+
.section .debug_info
|
429 |
+
{
|
430 |
+
.b32 264
|
431 |
+
.b8 2
|
432 |
+
.b8 0
|
433 |
+
.b32 .debug_abbrev
|
434 |
+
.b8 8
|
435 |
+
.b8 1
|
436 |
+
.b8 116
|
437 |
+
.b8 114
|
438 |
+
.b8 105
|
439 |
+
.b8 116
|
440 |
+
.b8 111
|
441 |
+
.b8 110
|
442 |
+
.b8 0
|
443 |
+
.b8 2
|
444 |
+
.b8 0
|
445 |
+
.b8 99
|
446 |
+
.b8 54
|
447 |
+
.b8 105
|
448 |
+
.b8 107
|
449 |
+
.b8 53
|
450 |
+
.b8 118
|
451 |
+
.b8 120
|
452 |
+
.b8 55
|
453 |
+
.b8 112
|
454 |
+
.b8 50
|
455 |
+
.b8 50
|
456 |
+
.b8 102
|
457 |
+
.b8 112
|
458 |
+
.b8 107
|
459 |
+
.b8 52
|
460 |
+
.b8 100
|
461 |
+
.b8 99
|
462 |
+
.b8 118
|
463 |
+
.b8 104
|
464 |
+
.b8 53
|
465 |
+
.b8 53
|
466 |
+
.b8 122
|
467 |
+
.b8 105
|
468 |
+
.b8 109
|
469 |
+
.b8 119
|
470 |
+
.b8 52
|
471 |
+
.b8 116
|
472 |
+
.b8 53
|
473 |
+
.b8 110
|
474 |
+
.b8 114
|
475 |
+
.b8 53
|
476 |
+
.b8 122
|
477 |
+
.b8 110
|
478 |
+
.b8 50
|
479 |
+
.b8 98
|
480 |
+
.b8 55
|
481 |
+
.b8 105
|
482 |
+
.b8 110
|
483 |
+
.b8 117
|
484 |
+
.b8 106
|
485 |
+
.b8 120
|
486 |
+
.b8 106
|
487 |
+
.b8 97
|
488 |
+
.b8 117
|
489 |
+
.b8 120
|
490 |
+
.b8 115
|
491 |
+
.b8 104
|
492 |
+
.b8 108
|
493 |
+
.b8 106
|
494 |
+
.b8 117
|
495 |
+
.b8 109
|
496 |
+
.b8 109
|
497 |
+
.b8 46
|
498 |
+
.b8 112
|
499 |
+
.b8 121
|
500 |
+
.b8 0
|
501 |
+
.b32 .debug_line
|
502 |
+
.b8 47
|
503 |
+
.b8 116
|
504 |
+
.b8 109
|
505 |
+
.b8 112
|
506 |
+
.b8 47
|
507 |
+
.b8 116
|
508 |
+
.b8 111
|
509 |
+
.b8 114
|
510 |
+
.b8 99
|
511 |
+
.b8 104
|
512 |
+
.b8 105
|
513 |
+
.b8 110
|
514 |
+
.b8 100
|
515 |
+
.b8 117
|
516 |
+
.b8 99
|
517 |
+
.b8 116
|
518 |
+
.b8 111
|
519 |
+
.b8 114
|
520 |
+
.b8 95
|
521 |
+
.b8 114
|
522 |
+
.b8 111
|
523 |
+
.b8 111
|
524 |
+
.b8 116
|
525 |
+
.b8 47
|
526 |
+
.b8 54
|
527 |
+
.b8 105
|
528 |
+
.b8 0
|
529 |
+
.b8 1
|
530 |
+
.b64 $L__func_begin0
|
531 |
+
.b64 $L__func_end0
|
532 |
+
.b8 2
|
533 |
+
.b8 116
|
534 |
+
.b8 114
|
535 |
+
.b8 105
|
536 |
+
.b8 116
|
537 |
+
.b8 111
|
538 |
+
.b8 110
|
539 |
+
.b8 95
|
540 |
+
.b8 95
|
541 |
+
.b8 48
|
542 |
+
.b8 100
|
543 |
+
.b8 49
|
544 |
+
.b8 100
|
545 |
+
.b8 50
|
546 |
+
.b8 100
|
547 |
+
.b8 51
|
548 |
+
.b8 100
|
549 |
+
.b8 101
|
550 |
+
.b8 52
|
551 |
+
.b8 101
|
552 |
+
.b8 0
|
553 |
+
.b8 116
|
554 |
+
.b8 114
|
555 |
+
.b8 105
|
556 |
+
.b8 116
|
557 |
+
.b8 111
|
558 |
+
.b8 110
|
559 |
+
.b8 95
|
560 |
+
.b8 95
|
561 |
+
.b8 48
|
562 |
+
.b8 100
|
563 |
+
.b8 49
|
564 |
+
.b8 100
|
565 |
+
.b8 50
|
566 |
+
.b8 100
|
567 |
+
.b8 51
|
568 |
+
.b8 100
|
569 |
+
.b8 101
|
570 |
+
.b8 52
|
571 |
+
.b8 101
|
572 |
+
.b8 0
|
573 |
+
.b8 1
|
574 |
+
.b8 18
|
575 |
+
.b8 1
|
576 |
+
.b8 1
|
577 |
+
.b8 3
|
578 |
+
.b64 $L__func_begin0
|
579 |
+
.b64 $L__func_end0
|
580 |
+
.b8 1
|
581 |
+
.b8 156
|
582 |
+
.b32 125
|
583 |
+
.b8 4
|
584 |
+
.b32 125
|
585 |
+
.b64 $L__tmp1
|
586 |
+
.b64 $L__tmp46
|
587 |
+
.b8 2
|
588 |
+
.b8 35
|
589 |
+
.b8 25
|
590 |
+
.b8 5
|
591 |
+
.b32 125
|
592 |
+
.b64 $L__tmp2
|
593 |
+
.b64 $L__tmp45
|
594 |
+
.b8 2
|
595 |
+
.b8 35
|
596 |
+
.b8 25
|
597 |
+
.b8 4
|
598 |
+
.b32 125
|
599 |
+
.b64 $L__tmp2
|
600 |
+
.b64 $L__tmp45
|
601 |
+
.b8 2
|
602 |
+
.b8 243
|
603 |
+
.b8 36
|
604 |
+
.b8 0
|
605 |
+
.b8 0
|
606 |
+
.b8 0
|
607 |
+
}
|
608 |
+
.section .debug_pubnames
|
609 |
+
{
|
610 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
611 |
+
$L__pubNames_start0:
|
612 |
+
.b8 2
|
613 |
+
.b8 0
|
614 |
+
.b32 .debug_info
|
615 |
+
.b32 268
|
616 |
+
.b32 125
|
617 |
+
.b8 116
|
618 |
+
.b8 114
|
619 |
+
.b8 105
|
620 |
+
.b8 116
|
621 |
+
.b8 111
|
622 |
+
.b8 110
|
623 |
+
.b8 95
|
624 |
+
.b8 95
|
625 |
+
.b8 48
|
626 |
+
.b8 100
|
627 |
+
.b8 49
|
628 |
+
.b8 100
|
629 |
+
.b8 50
|
630 |
+
.b8 100
|
631 |
+
.b8 51
|
632 |
+
.b8 100
|
633 |
+
.b8 101
|
634 |
+
.b8 52
|
635 |
+
.b8 101
|
636 |
+
.b8 0
|
637 |
+
.b32 0
|
638 |
+
$L__pubNames_end0:
|
639 |
+
}
|
640 |
+
.section .debug_pubtypes
|
641 |
+
{
|
642 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
643 |
+
$L__pubTypes_start0:
|
644 |
+
.b8 2
|
645 |
+
.b8 0
|
646 |
+
.b32 .debug_info
|
647 |
+
.b32 268
|
648 |
+
.b32 0
|
649 |
+
$L__pubTypes_end0:
|
650 |
+
}
|
651 |
+
.section .debug_loc { }
|
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttir
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<256> : tensor<4x1xi64>
|
4 |
+
%cst_0 = arith.constant dense<0> : tensor<4x1xi64>
|
5 |
+
%cst_1 = arith.constant dense<512> : tensor<4x1xi64>
|
6 |
+
%cst_2 = arith.constant dense<true> : tensor<4x1xi1>
|
7 |
+
%cst_3 = arith.constant dense<256> : tensor<4x1xi32>
|
8 |
+
%cst_4 = arith.constant dense<131072> : tensor<1x128xi32>
|
9 |
+
%cst_5 = arith.constant dense<120> : tensor<1x128xi32>
|
10 |
+
%cst_6 = arith.constant dense<0.000000e+00> : tensor<4x128xf32>
|
11 |
+
%c4_i32 = arith.constant 4 : i32
|
12 |
+
%0 = tt.get_program_id x : i32
|
13 |
+
%1 = arith.muli %0, %c4_i32 : i32
|
14 |
+
%2 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
|
15 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<4xi32>) -> tensor<4x1xi32>
|
16 |
+
%4 = tt.splat %1 : (i32) -> tensor<4x1xi32>
|
17 |
+
%5 = arith.addi %4, %3 : tensor<4x1xi32>
|
18 |
+
%6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
|
19 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32>
|
20 |
+
%8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32>
|
21 |
+
%9 = arith.muli %7, %cst_4 : tensor<1x128xi32>
|
22 |
+
%10 = tt.broadcast %5 : (tensor<4x1xi32>) -> tensor<4x128xi32>
|
23 |
+
%11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<4x128xi32>
|
24 |
+
%12 = arith.addi %10, %11 : tensor<4x128xi32>
|
25 |
+
%13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<4x128x!tt.ptr<f32, 1>>
|
26 |
+
%14 = tt.addptr %13, %12 : tensor<4x128x!tt.ptr<f32, 1>>, tensor<4x128xi32>
|
27 |
+
%15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<4x128xi1>
|
28 |
+
%16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<4x128xf32>
|
29 |
+
%17 = arith.addf %16, %cst_6 : tensor<4x128xf32>
|
30 |
+
%18 = arith.select %15, %17, %cst_6 : tensor<4x128xi1>, tensor<4x128xf32>
|
31 |
+
%19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({
|
32 |
+
^bb0(%arg5: f32, %arg6: f32):
|
33 |
+
%35 = arith.addf %arg5, %arg6 : f32
|
34 |
+
tt.reduce.return %35 : f32
|
35 |
+
}) : (tensor<4x128xf32>) -> tensor<4xf32>
|
36 |
+
%20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<4xf32>) -> tensor<4x1xf32>
|
37 |
+
%21 = arith.divsi %5, %cst_3 : tensor<4x1xi32>
|
38 |
+
%22 = arith.remsi %5, %cst_3 : tensor<4x1xi32>
|
39 |
+
%23 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<4x1x!tt.ptr<i64, 1>>
|
40 |
+
%24 = tt.addptr %23, %21 : tensor<4x1x!tt.ptr<i64, 1>>, tensor<4x1xi32>
|
41 |
+
%25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<4x1xi64>
|
42 |
+
%26 = arith.addi %25, %cst_1 : tensor<4x1xi64>
|
43 |
+
%27 = arith.cmpi slt, %25, %cst_0 : tensor<4x1xi64>
|
44 |
+
%28 = arith.select %27, %26, %25 : tensor<4x1xi1>, tensor<4x1xi64>
|
45 |
+
%29 = arith.muli %28, %cst : tensor<4x1xi64>
|
46 |
+
%30 = arith.extsi %22 : tensor<4x1xi32> to tensor<4x1xi64>
|
47 |
+
%31 = arith.addi %30, %29 : tensor<4x1xi64>
|
48 |
+
%32 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<4x1x!tt.ptr<f32, 1>>
|
49 |
+
%33 = tt.addptr %32, %31 : tensor<4x1x!tt.ptr<f32, 1>>, tensor<4x1xi64>
|
50 |
+
%34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<4x1x!tt.ptr<f32, 1>>, tensor<4x1xf32>, tensor<4x1xi1>) -> tensor<4x1xf32>
|
51 |
+
tt.return
|
52 |
+
}
|
53 |
+
}
|
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.cubin
ADDED
Binary file (23.9 kB). View file
|
|
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.llir
ADDED
@@ -0,0 +1,858 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
5 |
+
|
6 |
+
define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !7 {
|
7 |
+
%3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
8 |
+
%4 = shl i32 %3, 3, !dbg !10
|
9 |
+
%5 = and i32 %4, 1016, !dbg !10
|
10 |
+
%6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
|
11 |
+
%7 = shl i32 %6, 10, !dbg !12
|
12 |
+
%8 = or i32 %7, %5, !dbg !13
|
13 |
+
%9 = sext i32 %8 to i64, !dbg !14
|
14 |
+
%10 = getelementptr i16, ptr addrspace(1) %0, i64 %9, !dbg !14
|
15 |
+
%11 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %10, i1 true) #4, !dbg !15
|
16 |
+
%12 = extractvalue { i32, i32, i32, i32 } %11, 0, !dbg !15
|
17 |
+
%13 = extractvalue { i32, i32, i32, i32 } %11, 1, !dbg !15
|
18 |
+
%14 = extractvalue { i32, i32, i32, i32 } %11, 2, !dbg !15
|
19 |
+
%15 = extractvalue { i32, i32, i32, i32 } %11, 3, !dbg !15
|
20 |
+
%16 = trunc i32 %12 to i16, !dbg !15
|
21 |
+
%extelt.offset = lshr i32 %12, 16, !dbg !15
|
22 |
+
%17 = trunc i32 %extelt.offset to i16, !dbg !15
|
23 |
+
%18 = trunc i32 %13 to i16, !dbg !15
|
24 |
+
%extelt.offset1 = lshr i32 %13, 16, !dbg !15
|
25 |
+
%19 = trunc i32 %extelt.offset1 to i16, !dbg !15
|
26 |
+
%20 = trunc i32 %14 to i16, !dbg !15
|
27 |
+
%extelt.offset2 = lshr i32 %14, 16, !dbg !15
|
28 |
+
%21 = trunc i32 %extelt.offset2 to i16, !dbg !15
|
29 |
+
%22 = trunc i32 %15 to i16, !dbg !15
|
30 |
+
%extelt.offset3 = lshr i32 %15, 16, !dbg !15
|
31 |
+
%23 = trunc i32 %extelt.offset3 to i16, !dbg !15
|
32 |
+
%24 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %16) #4, !dbg !16
|
33 |
+
%25 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %17) #4, !dbg !16
|
34 |
+
%26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %18) #4, !dbg !16
|
35 |
+
%27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !16
|
36 |
+
%28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !16
|
37 |
+
%29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %21) #4, !dbg !16
|
38 |
+
%30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #4, !dbg !16
|
39 |
+
%31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #4, !dbg !16
|
40 |
+
%32 = fmul float %24, 0x3FE6A09E60000000, !dbg !17
|
41 |
+
%33 = fmul float %25, 0x3FE6A09E60000000, !dbg !17
|
42 |
+
%34 = fmul float %26, 0x3FE6A09E60000000, !dbg !17
|
43 |
+
%35 = fmul float %27, 0x3FE6A09E60000000, !dbg !17
|
44 |
+
%36 = fmul float %28, 0x3FE6A09E60000000, !dbg !17
|
45 |
+
%37 = fmul float %29, 0x3FE6A09E60000000, !dbg !17
|
46 |
+
%38 = fmul float %30, 0x3FE6A09E60000000, !dbg !17
|
47 |
+
%39 = fmul float %31, 0x3FE6A09E60000000, !dbg !17
|
48 |
+
%40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
49 |
+
%.not.i = icmp eq i32 %40, 0, !dbg !18
|
50 |
+
%41 = tail call float @llvm.nvvm.fabs.ftz.f(float %32) #4, !dbg !18
|
51 |
+
%42 = tail call float @llvm.nvvm.fabs.f(float %32) #4, !dbg !18
|
52 |
+
%.0.i = select i1 %.not.i, float %42, float %41, !dbg !18
|
53 |
+
%43 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !18
|
54 |
+
br i1 %43, label %__nv_fabsf.exit1.i, label %45, !dbg !18
|
55 |
+
|
56 |
+
__nv_fabsf.exit1.i: ; preds = %2
|
57 |
+
%44 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
58 |
+
%.not1.i = icmp eq i32 %44, 0, !dbg !18
|
59 |
+
%.01.i = select i1 %.not1.i, float %42, float %41, !dbg !18
|
60 |
+
br label %__internal_fmad.exit.i, !dbg !18
|
61 |
+
|
62 |
+
45: ; preds = %2
|
63 |
+
%46 = fmul float %32, %32, !dbg !18
|
64 |
+
br label %__internal_fmad.exit.i, !dbg !18
|
65 |
+
|
66 |
+
__internal_fmad.exit.i: ; preds = %45, %__nv_fabsf.exit1.i
|
67 |
+
%47 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %45 ], !dbg !18
|
68 |
+
%48 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %45 ], !dbg !18
|
69 |
+
%49 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %45 ], !dbg !18
|
70 |
+
%50 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %45 ], !dbg !18
|
71 |
+
%51 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %45 ], !dbg !18
|
72 |
+
%52 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %45 ], !dbg !18
|
73 |
+
%53 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %45 ], !dbg !18
|
74 |
+
%54 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %46, %45 ], !dbg !18
|
75 |
+
%55 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
76 |
+
%.not2.i = icmp eq i32 %55, 0, !dbg !18
|
77 |
+
%56 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %53, float %54, float %52) #4, !dbg !18
|
78 |
+
%57 = tail call float @llvm.nvvm.fma.rn.f(float %53, float %54, float %52) #4, !dbg !18
|
79 |
+
%.02.i = select i1 %.not2.i, float %57, float %56, !dbg !18
|
80 |
+
%58 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
81 |
+
%.not3.i = icmp eq i32 %58, 0, !dbg !18
|
82 |
+
%59 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %54, float %51) #4, !dbg !18
|
83 |
+
%60 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %54, float %51) #4, !dbg !18
|
84 |
+
%.03.i = select i1 %.not3.i, float %60, float %59, !dbg !18
|
85 |
+
%61 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
86 |
+
%.not4.i = icmp eq i32 %61, 0, !dbg !18
|
87 |
+
%62 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %54, float %50) #4, !dbg !18
|
88 |
+
%63 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %54, float %50) #4, !dbg !18
|
89 |
+
%.04.i = select i1 %.not4.i, float %63, float %62, !dbg !18
|
90 |
+
%64 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
91 |
+
%.not5.i = icmp eq i32 %64, 0, !dbg !18
|
92 |
+
%65 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %54, float %49) #4, !dbg !18
|
93 |
+
%66 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %54, float %49) #4, !dbg !18
|
94 |
+
%.05.i = select i1 %.not5.i, float %66, float %65, !dbg !18
|
95 |
+
%67 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
96 |
+
%.not6.i = icmp eq i32 %67, 0, !dbg !18
|
97 |
+
%68 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %54, float %48) #4, !dbg !18
|
98 |
+
%69 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %54, float %48) #4, !dbg !18
|
99 |
+
%.06.i = select i1 %.not6.i, float %69, float %68, !dbg !18
|
100 |
+
%70 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
101 |
+
%.not7.i = icmp eq i32 %70, 0, !dbg !18
|
102 |
+
%71 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %54, float %47) #4, !dbg !18
|
103 |
+
%72 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %54, float %47) #4, !dbg !18
|
104 |
+
%.07.i = select i1 %.not7.i, float %72, float %71, !dbg !18
|
105 |
+
%73 = fneg float %54, !dbg !18
|
106 |
+
%74 = select i1 %43, float %73, float %32, !dbg !18
|
107 |
+
%75 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
108 |
+
%.not8.i = icmp eq i32 %75, 0, !dbg !18
|
109 |
+
%76 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %74, float %74) #4, !dbg !18
|
110 |
+
%77 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %74, float %74) #4, !dbg !18
|
111 |
+
%.08.i = select i1 %.not8.i, float %77, float %76, !dbg !18
|
112 |
+
br i1 %43, label %78, label %__nv_erff.exit, !dbg !18
|
113 |
+
|
114 |
+
78: ; preds = %__internal_fmad.exit.i
|
115 |
+
%79 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !18
|
116 |
+
%80 = fsub float 1.000000e+00, %79, !dbg !18
|
117 |
+
%81 = bitcast float %80 to i32, !dbg !18
|
118 |
+
%82 = bitcast float %32 to i32, !dbg !18
|
119 |
+
%83 = and i32 %82, -2147483648, !dbg !18
|
120 |
+
%84 = or i32 %83, %81, !dbg !18
|
121 |
+
%85 = bitcast i32 %84 to float, !dbg !18
|
122 |
+
br label %__nv_erff.exit, !dbg !18
|
123 |
+
|
124 |
+
__nv_erff.exit: ; preds = %__internal_fmad.exit.i, %78
|
125 |
+
%r.0.i = phi float [ %85, %78 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !18
|
126 |
+
%86 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
127 |
+
%.not.i4 = icmp eq i32 %86, 0, !dbg !18
|
128 |
+
%87 = tail call float @llvm.nvvm.fabs.ftz.f(float %33) #4, !dbg !18
|
129 |
+
%88 = tail call float @llvm.nvvm.fabs.f(float %33) #4, !dbg !18
|
130 |
+
%.0.i5 = select i1 %.not.i4, float %88, float %87, !dbg !18
|
131 |
+
%89 = fcmp oge float %.0.i5, 0x3FF00C1FC0000000, !dbg !18
|
132 |
+
br i1 %89, label %__nv_fabsf.exit1.i22, label %91, !dbg !18
|
133 |
+
|
134 |
+
__nv_fabsf.exit1.i22: ; preds = %__nv_erff.exit
|
135 |
+
%90 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
136 |
+
%.not1.i23 = icmp eq i32 %90, 0, !dbg !18
|
137 |
+
%.01.i24 = select i1 %.not1.i23, float %88, float %87, !dbg !18
|
138 |
+
br label %__internal_fmad.exit.i6, !dbg !18
|
139 |
+
|
140 |
+
91: ; preds = %__nv_erff.exit
|
141 |
+
%92 = fmul float %33, %33, !dbg !18
|
142 |
+
br label %__internal_fmad.exit.i6, !dbg !18
|
143 |
+
|
144 |
+
__internal_fmad.exit.i6: ; preds = %91, %__nv_fabsf.exit1.i22
|
145 |
+
%93 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i22 ], [ 0x3FC06EBA60000000, %91 ], !dbg !18
|
146 |
+
%94 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i22 ], [ 0xBFD8127580000000, %91 ], !dbg !18
|
147 |
+
%95 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i22 ], [ 0x3FBCE315E0000000, %91 ], !dbg !18
|
148 |
+
%96 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i22 ], [ 0xBF9B837CE0000000, %91 ], !dbg !18
|
149 |
+
%97 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i22 ], [ 0x3F755ABD40000000, %91 ], !dbg !18
|
150 |
+
%98 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i22 ], [ 0xBF4AE9A400000000, %91 ], !dbg !18
|
151 |
+
%99 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i22 ], [ 0x3F163D2D40000000, %91 ], !dbg !18
|
152 |
+
%100 = phi float [ %.01.i24, %__nv_fabsf.exit1.i22 ], [ %92, %91 ], !dbg !18
|
153 |
+
%101 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
154 |
+
%.not2.i7 = icmp eq i32 %101, 0, !dbg !18
|
155 |
+
%102 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %99, float %100, float %98) #4, !dbg !18
|
156 |
+
%103 = tail call float @llvm.nvvm.fma.rn.f(float %99, float %100, float %98) #4, !dbg !18
|
157 |
+
%.02.i8 = select i1 %.not2.i7, float %103, float %102, !dbg !18
|
158 |
+
%104 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
159 |
+
%.not3.i9 = icmp eq i32 %104, 0, !dbg !18
|
160 |
+
%105 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i8, float %100, float %97) #4, !dbg !18
|
161 |
+
%106 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i8, float %100, float %97) #4, !dbg !18
|
162 |
+
%.03.i10 = select i1 %.not3.i9, float %106, float %105, !dbg !18
|
163 |
+
%107 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
164 |
+
%.not4.i11 = icmp eq i32 %107, 0, !dbg !18
|
165 |
+
%108 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i10, float %100, float %96) #4, !dbg !18
|
166 |
+
%109 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i10, float %100, float %96) #4, !dbg !18
|
167 |
+
%.04.i12 = select i1 %.not4.i11, float %109, float %108, !dbg !18
|
168 |
+
%110 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
169 |
+
%.not5.i13 = icmp eq i32 %110, 0, !dbg !18
|
170 |
+
%111 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i12, float %100, float %95) #4, !dbg !18
|
171 |
+
%112 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i12, float %100, float %95) #4, !dbg !18
|
172 |
+
%.05.i14 = select i1 %.not5.i13, float %112, float %111, !dbg !18
|
173 |
+
%113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
174 |
+
%.not6.i15 = icmp eq i32 %113, 0, !dbg !18
|
175 |
+
%114 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i14, float %100, float %94) #4, !dbg !18
|
176 |
+
%115 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i14, float %100, float %94) #4, !dbg !18
|
177 |
+
%.06.i16 = select i1 %.not6.i15, float %115, float %114, !dbg !18
|
178 |
+
%116 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
179 |
+
%.not7.i17 = icmp eq i32 %116, 0, !dbg !18
|
180 |
+
%117 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i16, float %100, float %93) #4, !dbg !18
|
181 |
+
%118 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i16, float %100, float %93) #4, !dbg !18
|
182 |
+
%.07.i18 = select i1 %.not7.i17, float %118, float %117, !dbg !18
|
183 |
+
%119 = fneg float %100, !dbg !18
|
184 |
+
%120 = select i1 %89, float %119, float %33, !dbg !18
|
185 |
+
%121 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
186 |
+
%.not8.i19 = icmp eq i32 %121, 0, !dbg !18
|
187 |
+
%122 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i18, float %120, float %120) #4, !dbg !18
|
188 |
+
%123 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i18, float %120, float %120) #4, !dbg !18
|
189 |
+
%.08.i20 = select i1 %.not8.i19, float %123, float %122, !dbg !18
|
190 |
+
br i1 %89, label %124, label %__nv_erff.exit25, !dbg !18
|
191 |
+
|
192 |
+
124: ; preds = %__internal_fmad.exit.i6
|
193 |
+
%125 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i20) #4, !dbg !18
|
194 |
+
%126 = fsub float 1.000000e+00, %125, !dbg !18
|
195 |
+
%127 = bitcast float %126 to i32, !dbg !18
|
196 |
+
%128 = bitcast float %33 to i32, !dbg !18
|
197 |
+
%129 = and i32 %128, -2147483648, !dbg !18
|
198 |
+
%130 = or i32 %129, %127, !dbg !18
|
199 |
+
%131 = bitcast i32 %130 to float, !dbg !18
|
200 |
+
br label %__nv_erff.exit25, !dbg !18
|
201 |
+
|
202 |
+
__nv_erff.exit25: ; preds = %__internal_fmad.exit.i6, %124
|
203 |
+
%r.0.i21 = phi float [ %131, %124 ], [ %.08.i20, %__internal_fmad.exit.i6 ], !dbg !18
|
204 |
+
%132 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
205 |
+
%.not.i26 = icmp eq i32 %132, 0, !dbg !18
|
206 |
+
%133 = tail call float @llvm.nvvm.fabs.ftz.f(float %34) #4, !dbg !18
|
207 |
+
%134 = tail call float @llvm.nvvm.fabs.f(float %34) #4, !dbg !18
|
208 |
+
%.0.i27 = select i1 %.not.i26, float %134, float %133, !dbg !18
|
209 |
+
%135 = fcmp oge float %.0.i27, 0x3FF00C1FC0000000, !dbg !18
|
210 |
+
br i1 %135, label %__nv_fabsf.exit1.i44, label %137, !dbg !18
|
211 |
+
|
212 |
+
__nv_fabsf.exit1.i44: ; preds = %__nv_erff.exit25
|
213 |
+
%136 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
214 |
+
%.not1.i45 = icmp eq i32 %136, 0, !dbg !18
|
215 |
+
%.01.i46 = select i1 %.not1.i45, float %134, float %133, !dbg !18
|
216 |
+
br label %__internal_fmad.exit.i28, !dbg !18
|
217 |
+
|
218 |
+
137: ; preds = %__nv_erff.exit25
|
219 |
+
%138 = fmul float %34, %34, !dbg !18
|
220 |
+
br label %__internal_fmad.exit.i28, !dbg !18
|
221 |
+
|
222 |
+
__internal_fmad.exit.i28: ; preds = %137, %__nv_fabsf.exit1.i44
|
223 |
+
%139 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i44 ], [ 0x3FC06EBA60000000, %137 ], !dbg !18
|
224 |
+
%140 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i44 ], [ 0xBFD8127580000000, %137 ], !dbg !18
|
225 |
+
%141 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i44 ], [ 0x3FBCE315E0000000, %137 ], !dbg !18
|
226 |
+
%142 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i44 ], [ 0xBF9B837CE0000000, %137 ], !dbg !18
|
227 |
+
%143 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i44 ], [ 0x3F755ABD40000000, %137 ], !dbg !18
|
228 |
+
%144 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i44 ], [ 0xBF4AE9A400000000, %137 ], !dbg !18
|
229 |
+
%145 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i44 ], [ 0x3F163D2D40000000, %137 ], !dbg !18
|
230 |
+
%146 = phi float [ %.01.i46, %__nv_fabsf.exit1.i44 ], [ %138, %137 ], !dbg !18
|
231 |
+
%147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
232 |
+
%.not2.i29 = icmp eq i32 %147, 0, !dbg !18
|
233 |
+
%148 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %145, float %146, float %144) #4, !dbg !18
|
234 |
+
%149 = tail call float @llvm.nvvm.fma.rn.f(float %145, float %146, float %144) #4, !dbg !18
|
235 |
+
%.02.i30 = select i1 %.not2.i29, float %149, float %148, !dbg !18
|
236 |
+
%150 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
237 |
+
%.not3.i31 = icmp eq i32 %150, 0, !dbg !18
|
238 |
+
%151 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i30, float %146, float %143) #4, !dbg !18
|
239 |
+
%152 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i30, float %146, float %143) #4, !dbg !18
|
240 |
+
%.03.i32 = select i1 %.not3.i31, float %152, float %151, !dbg !18
|
241 |
+
%153 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
242 |
+
%.not4.i33 = icmp eq i32 %153, 0, !dbg !18
|
243 |
+
%154 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i32, float %146, float %142) #4, !dbg !18
|
244 |
+
%155 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i32, float %146, float %142) #4, !dbg !18
|
245 |
+
%.04.i34 = select i1 %.not4.i33, float %155, float %154, !dbg !18
|
246 |
+
%156 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
247 |
+
%.not5.i35 = icmp eq i32 %156, 0, !dbg !18
|
248 |
+
%157 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i34, float %146, float %141) #4, !dbg !18
|
249 |
+
%158 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i34, float %146, float %141) #4, !dbg !18
|
250 |
+
%.05.i36 = select i1 %.not5.i35, float %158, float %157, !dbg !18
|
251 |
+
%159 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
252 |
+
%.not6.i37 = icmp eq i32 %159, 0, !dbg !18
|
253 |
+
%160 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i36, float %146, float %140) #4, !dbg !18
|
254 |
+
%161 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i36, float %146, float %140) #4, !dbg !18
|
255 |
+
%.06.i38 = select i1 %.not6.i37, float %161, float %160, !dbg !18
|
256 |
+
%162 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
257 |
+
%.not7.i39 = icmp eq i32 %162, 0, !dbg !18
|
258 |
+
%163 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i38, float %146, float %139) #4, !dbg !18
|
259 |
+
%164 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i38, float %146, float %139) #4, !dbg !18
|
260 |
+
%.07.i40 = select i1 %.not7.i39, float %164, float %163, !dbg !18
|
261 |
+
%165 = fneg float %146, !dbg !18
|
262 |
+
%166 = select i1 %135, float %165, float %34, !dbg !18
|
263 |
+
%167 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
264 |
+
%.not8.i41 = icmp eq i32 %167, 0, !dbg !18
|
265 |
+
%168 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i40, float %166, float %166) #4, !dbg !18
|
266 |
+
%169 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i40, float %166, float %166) #4, !dbg !18
|
267 |
+
%.08.i42 = select i1 %.not8.i41, float %169, float %168, !dbg !18
|
268 |
+
br i1 %135, label %170, label %__nv_erff.exit47, !dbg !18
|
269 |
+
|
270 |
+
170: ; preds = %__internal_fmad.exit.i28
|
271 |
+
%171 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i42) #4, !dbg !18
|
272 |
+
%172 = fsub float 1.000000e+00, %171, !dbg !18
|
273 |
+
%173 = bitcast float %172 to i32, !dbg !18
|
274 |
+
%174 = bitcast float %34 to i32, !dbg !18
|
275 |
+
%175 = and i32 %174, -2147483648, !dbg !18
|
276 |
+
%176 = or i32 %175, %173, !dbg !18
|
277 |
+
%177 = bitcast i32 %176 to float, !dbg !18
|
278 |
+
br label %__nv_erff.exit47, !dbg !18
|
279 |
+
|
280 |
+
__nv_erff.exit47: ; preds = %__internal_fmad.exit.i28, %170
|
281 |
+
%r.0.i43 = phi float [ %177, %170 ], [ %.08.i42, %__internal_fmad.exit.i28 ], !dbg !18
|
282 |
+
%178 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
283 |
+
%.not.i48 = icmp eq i32 %178, 0, !dbg !18
|
284 |
+
%179 = tail call float @llvm.nvvm.fabs.ftz.f(float %35) #4, !dbg !18
|
285 |
+
%180 = tail call float @llvm.nvvm.fabs.f(float %35) #4, !dbg !18
|
286 |
+
%.0.i49 = select i1 %.not.i48, float %180, float %179, !dbg !18
|
287 |
+
%181 = fcmp oge float %.0.i49, 0x3FF00C1FC0000000, !dbg !18
|
288 |
+
br i1 %181, label %__nv_fabsf.exit1.i66, label %183, !dbg !18
|
289 |
+
|
290 |
+
__nv_fabsf.exit1.i66: ; preds = %__nv_erff.exit47
|
291 |
+
%182 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
292 |
+
%.not1.i67 = icmp eq i32 %182, 0, !dbg !18
|
293 |
+
%.01.i68 = select i1 %.not1.i67, float %180, float %179, !dbg !18
|
294 |
+
br label %__internal_fmad.exit.i50, !dbg !18
|
295 |
+
|
296 |
+
183: ; preds = %__nv_erff.exit47
|
297 |
+
%184 = fmul float %35, %35, !dbg !18
|
298 |
+
br label %__internal_fmad.exit.i50, !dbg !18
|
299 |
+
|
300 |
+
__internal_fmad.exit.i50: ; preds = %183, %__nv_fabsf.exit1.i66
|
301 |
+
%185 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i66 ], [ 0x3FC06EBA60000000, %183 ], !dbg !18
|
302 |
+
%186 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i66 ], [ 0xBFD8127580000000, %183 ], !dbg !18
|
303 |
+
%187 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i66 ], [ 0x3FBCE315E0000000, %183 ], !dbg !18
|
304 |
+
%188 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i66 ], [ 0xBF9B837CE0000000, %183 ], !dbg !18
|
305 |
+
%189 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i66 ], [ 0x3F755ABD40000000, %183 ], !dbg !18
|
306 |
+
%190 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i66 ], [ 0xBF4AE9A400000000, %183 ], !dbg !18
|
307 |
+
%191 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i66 ], [ 0x3F163D2D40000000, %183 ], !dbg !18
|
308 |
+
%192 = phi float [ %.01.i68, %__nv_fabsf.exit1.i66 ], [ %184, %183 ], !dbg !18
|
309 |
+
%193 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
310 |
+
%.not2.i51 = icmp eq i32 %193, 0, !dbg !18
|
311 |
+
%194 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %191, float %192, float %190) #4, !dbg !18
|
312 |
+
%195 = tail call float @llvm.nvvm.fma.rn.f(float %191, float %192, float %190) #4, !dbg !18
|
313 |
+
%.02.i52 = select i1 %.not2.i51, float %195, float %194, !dbg !18
|
314 |
+
%196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
315 |
+
%.not3.i53 = icmp eq i32 %196, 0, !dbg !18
|
316 |
+
%197 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i52, float %192, float %189) #4, !dbg !18
|
317 |
+
%198 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i52, float %192, float %189) #4, !dbg !18
|
318 |
+
%.03.i54 = select i1 %.not3.i53, float %198, float %197, !dbg !18
|
319 |
+
%199 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
320 |
+
%.not4.i55 = icmp eq i32 %199, 0, !dbg !18
|
321 |
+
%200 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i54, float %192, float %188) #4, !dbg !18
|
322 |
+
%201 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i54, float %192, float %188) #4, !dbg !18
|
323 |
+
%.04.i56 = select i1 %.not4.i55, float %201, float %200, !dbg !18
|
324 |
+
%202 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
325 |
+
%.not5.i57 = icmp eq i32 %202, 0, !dbg !18
|
326 |
+
%203 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i56, float %192, float %187) #4, !dbg !18
|
327 |
+
%204 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i56, float %192, float %187) #4, !dbg !18
|
328 |
+
%.05.i58 = select i1 %.not5.i57, float %204, float %203, !dbg !18
|
329 |
+
%205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
330 |
+
%.not6.i59 = icmp eq i32 %205, 0, !dbg !18
|
331 |
+
%206 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i58, float %192, float %186) #4, !dbg !18
|
332 |
+
%207 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i58, float %192, float %186) #4, !dbg !18
|
333 |
+
%.06.i60 = select i1 %.not6.i59, float %207, float %206, !dbg !18
|
334 |
+
%208 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
335 |
+
%.not7.i61 = icmp eq i32 %208, 0, !dbg !18
|
336 |
+
%209 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i60, float %192, float %185) #4, !dbg !18
|
337 |
+
%210 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i60, float %192, float %185) #4, !dbg !18
|
338 |
+
%.07.i62 = select i1 %.not7.i61, float %210, float %209, !dbg !18
|
339 |
+
%211 = fneg float %192, !dbg !18
|
340 |
+
%212 = select i1 %181, float %211, float %35, !dbg !18
|
341 |
+
%213 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
342 |
+
%.not8.i63 = icmp eq i32 %213, 0, !dbg !18
|
343 |
+
%214 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i62, float %212, float %212) #4, !dbg !18
|
344 |
+
%215 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i62, float %212, float %212) #4, !dbg !18
|
345 |
+
%.08.i64 = select i1 %.not8.i63, float %215, float %214, !dbg !18
|
346 |
+
br i1 %181, label %216, label %__nv_erff.exit69, !dbg !18
|
347 |
+
|
348 |
+
216: ; preds = %__internal_fmad.exit.i50
|
349 |
+
%217 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i64) #4, !dbg !18
|
350 |
+
%218 = fsub float 1.000000e+00, %217, !dbg !18
|
351 |
+
%219 = bitcast float %218 to i32, !dbg !18
|
352 |
+
%220 = bitcast float %35 to i32, !dbg !18
|
353 |
+
%221 = and i32 %220, -2147483648, !dbg !18
|
354 |
+
%222 = or i32 %221, %219, !dbg !18
|
355 |
+
%223 = bitcast i32 %222 to float, !dbg !18
|
356 |
+
br label %__nv_erff.exit69, !dbg !18
|
357 |
+
|
358 |
+
__nv_erff.exit69: ; preds = %__internal_fmad.exit.i50, %216
|
359 |
+
%r.0.i65 = phi float [ %223, %216 ], [ %.08.i64, %__internal_fmad.exit.i50 ], !dbg !18
|
360 |
+
%224 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
361 |
+
%.not.i70 = icmp eq i32 %224, 0, !dbg !18
|
362 |
+
%225 = tail call float @llvm.nvvm.fabs.ftz.f(float %36) #4, !dbg !18
|
363 |
+
%226 = tail call float @llvm.nvvm.fabs.f(float %36) #4, !dbg !18
|
364 |
+
%.0.i71 = select i1 %.not.i70, float %226, float %225, !dbg !18
|
365 |
+
%227 = fcmp oge float %.0.i71, 0x3FF00C1FC0000000, !dbg !18
|
366 |
+
br i1 %227, label %__nv_fabsf.exit1.i88, label %229, !dbg !18
|
367 |
+
|
368 |
+
__nv_fabsf.exit1.i88: ; preds = %__nv_erff.exit69
|
369 |
+
%228 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
370 |
+
%.not1.i89 = icmp eq i32 %228, 0, !dbg !18
|
371 |
+
%.01.i90 = select i1 %.not1.i89, float %226, float %225, !dbg !18
|
372 |
+
br label %__internal_fmad.exit.i72, !dbg !18
|
373 |
+
|
374 |
+
229: ; preds = %__nv_erff.exit69
|
375 |
+
%230 = fmul float %36, %36, !dbg !18
|
376 |
+
br label %__internal_fmad.exit.i72, !dbg !18
|
377 |
+
|
378 |
+
__internal_fmad.exit.i72: ; preds = %229, %__nv_fabsf.exit1.i88
|
379 |
+
%231 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i88 ], [ 0x3FC06EBA60000000, %229 ], !dbg !18
|
380 |
+
%232 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i88 ], [ 0xBFD8127580000000, %229 ], !dbg !18
|
381 |
+
%233 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i88 ], [ 0x3FBCE315E0000000, %229 ], !dbg !18
|
382 |
+
%234 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i88 ], [ 0xBF9B837CE0000000, %229 ], !dbg !18
|
383 |
+
%235 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i88 ], [ 0x3F755ABD40000000, %229 ], !dbg !18
|
384 |
+
%236 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i88 ], [ 0xBF4AE9A400000000, %229 ], !dbg !18
|
385 |
+
%237 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i88 ], [ 0x3F163D2D40000000, %229 ], !dbg !18
|
386 |
+
%238 = phi float [ %.01.i90, %__nv_fabsf.exit1.i88 ], [ %230, %229 ], !dbg !18
|
387 |
+
%239 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
388 |
+
%.not2.i73 = icmp eq i32 %239, 0, !dbg !18
|
389 |
+
%240 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %237, float %238, float %236) #4, !dbg !18
|
390 |
+
%241 = tail call float @llvm.nvvm.fma.rn.f(float %237, float %238, float %236) #4, !dbg !18
|
391 |
+
%.02.i74 = select i1 %.not2.i73, float %241, float %240, !dbg !18
|
392 |
+
%242 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
393 |
+
%.not3.i75 = icmp eq i32 %242, 0, !dbg !18
|
394 |
+
%243 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i74, float %238, float %235) #4, !dbg !18
|
395 |
+
%244 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i74, float %238, float %235) #4, !dbg !18
|
396 |
+
%.03.i76 = select i1 %.not3.i75, float %244, float %243, !dbg !18
|
397 |
+
%245 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
398 |
+
%.not4.i77 = icmp eq i32 %245, 0, !dbg !18
|
399 |
+
%246 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i76, float %238, float %234) #4, !dbg !18
|
400 |
+
%247 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i76, float %238, float %234) #4, !dbg !18
|
401 |
+
%.04.i78 = select i1 %.not4.i77, float %247, float %246, !dbg !18
|
402 |
+
%248 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
403 |
+
%.not5.i79 = icmp eq i32 %248, 0, !dbg !18
|
404 |
+
%249 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i78, float %238, float %233) #4, !dbg !18
|
405 |
+
%250 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i78, float %238, float %233) #4, !dbg !18
|
406 |
+
%.05.i80 = select i1 %.not5.i79, float %250, float %249, !dbg !18
|
407 |
+
%251 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
408 |
+
%.not6.i81 = icmp eq i32 %251, 0, !dbg !18
|
409 |
+
%252 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i80, float %238, float %232) #4, !dbg !18
|
410 |
+
%253 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i80, float %238, float %232) #4, !dbg !18
|
411 |
+
%.06.i82 = select i1 %.not6.i81, float %253, float %252, !dbg !18
|
412 |
+
%254 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
413 |
+
%.not7.i83 = icmp eq i32 %254, 0, !dbg !18
|
414 |
+
%255 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i82, float %238, float %231) #4, !dbg !18
|
415 |
+
%256 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i82, float %238, float %231) #4, !dbg !18
|
416 |
+
%.07.i84 = select i1 %.not7.i83, float %256, float %255, !dbg !18
|
417 |
+
%257 = fneg float %238, !dbg !18
|
418 |
+
%258 = select i1 %227, float %257, float %36, !dbg !18
|
419 |
+
%259 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
420 |
+
%.not8.i85 = icmp eq i32 %259, 0, !dbg !18
|
421 |
+
%260 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i84, float %258, float %258) #4, !dbg !18
|
422 |
+
%261 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i84, float %258, float %258) #4, !dbg !18
|
423 |
+
%.08.i86 = select i1 %.not8.i85, float %261, float %260, !dbg !18
|
424 |
+
br i1 %227, label %262, label %__nv_erff.exit91, !dbg !18
|
425 |
+
|
426 |
+
262: ; preds = %__internal_fmad.exit.i72
|
427 |
+
%263 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i86) #4, !dbg !18
|
428 |
+
%264 = fsub float 1.000000e+00, %263, !dbg !18
|
429 |
+
%265 = bitcast float %264 to i32, !dbg !18
|
430 |
+
%266 = bitcast float %36 to i32, !dbg !18
|
431 |
+
%267 = and i32 %266, -2147483648, !dbg !18
|
432 |
+
%268 = or i32 %267, %265, !dbg !18
|
433 |
+
%269 = bitcast i32 %268 to float, !dbg !18
|
434 |
+
br label %__nv_erff.exit91, !dbg !18
|
435 |
+
|
436 |
+
__nv_erff.exit91: ; preds = %__internal_fmad.exit.i72, %262
|
437 |
+
%r.0.i87 = phi float [ %269, %262 ], [ %.08.i86, %__internal_fmad.exit.i72 ], !dbg !18
|
438 |
+
%270 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
439 |
+
%.not.i92 = icmp eq i32 %270, 0, !dbg !18
|
440 |
+
%271 = tail call float @llvm.nvvm.fabs.ftz.f(float %37) #4, !dbg !18
|
441 |
+
%272 = tail call float @llvm.nvvm.fabs.f(float %37) #4, !dbg !18
|
442 |
+
%.0.i93 = select i1 %.not.i92, float %272, float %271, !dbg !18
|
443 |
+
%273 = fcmp oge float %.0.i93, 0x3FF00C1FC0000000, !dbg !18
|
444 |
+
br i1 %273, label %__nv_fabsf.exit1.i110, label %275, !dbg !18
|
445 |
+
|
446 |
+
__nv_fabsf.exit1.i110: ; preds = %__nv_erff.exit91
|
447 |
+
%274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
448 |
+
%.not1.i111 = icmp eq i32 %274, 0, !dbg !18
|
449 |
+
%.01.i112 = select i1 %.not1.i111, float %272, float %271, !dbg !18
|
450 |
+
br label %__internal_fmad.exit.i94, !dbg !18
|
451 |
+
|
452 |
+
275: ; preds = %__nv_erff.exit91
|
453 |
+
%276 = fmul float %37, %37, !dbg !18
|
454 |
+
br label %__internal_fmad.exit.i94, !dbg !18
|
455 |
+
|
456 |
+
__internal_fmad.exit.i94: ; preds = %275, %__nv_fabsf.exit1.i110
|
457 |
+
%277 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i110 ], [ 0x3FC06EBA60000000, %275 ], !dbg !18
|
458 |
+
%278 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i110 ], [ 0xBFD8127580000000, %275 ], !dbg !18
|
459 |
+
%279 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i110 ], [ 0x3FBCE315E0000000, %275 ], !dbg !18
|
460 |
+
%280 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i110 ], [ 0xBF9B837CE0000000, %275 ], !dbg !18
|
461 |
+
%281 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i110 ], [ 0x3F755ABD40000000, %275 ], !dbg !18
|
462 |
+
%282 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i110 ], [ 0xBF4AE9A400000000, %275 ], !dbg !18
|
463 |
+
%283 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i110 ], [ 0x3F163D2D40000000, %275 ], !dbg !18
|
464 |
+
%284 = phi float [ %.01.i112, %__nv_fabsf.exit1.i110 ], [ %276, %275 ], !dbg !18
|
465 |
+
%285 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
466 |
+
%.not2.i95 = icmp eq i32 %285, 0, !dbg !18
|
467 |
+
%286 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %283, float %284, float %282) #4, !dbg !18
|
468 |
+
%287 = tail call float @llvm.nvvm.fma.rn.f(float %283, float %284, float %282) #4, !dbg !18
|
469 |
+
%.02.i96 = select i1 %.not2.i95, float %287, float %286, !dbg !18
|
470 |
+
%288 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
471 |
+
%.not3.i97 = icmp eq i32 %288, 0, !dbg !18
|
472 |
+
%289 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i96, float %284, float %281) #4, !dbg !18
|
473 |
+
%290 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i96, float %284, float %281) #4, !dbg !18
|
474 |
+
%.03.i98 = select i1 %.not3.i97, float %290, float %289, !dbg !18
|
475 |
+
%291 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
476 |
+
%.not4.i99 = icmp eq i32 %291, 0, !dbg !18
|
477 |
+
%292 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i98, float %284, float %280) #4, !dbg !18
|
478 |
+
%293 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i98, float %284, float %280) #4, !dbg !18
|
479 |
+
%.04.i100 = select i1 %.not4.i99, float %293, float %292, !dbg !18
|
480 |
+
%294 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
481 |
+
%.not5.i101 = icmp eq i32 %294, 0, !dbg !18
|
482 |
+
%295 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i100, float %284, float %279) #4, !dbg !18
|
483 |
+
%296 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i100, float %284, float %279) #4, !dbg !18
|
484 |
+
%.05.i102 = select i1 %.not5.i101, float %296, float %295, !dbg !18
|
485 |
+
%297 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
486 |
+
%.not6.i103 = icmp eq i32 %297, 0, !dbg !18
|
487 |
+
%298 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i102, float %284, float %278) #4, !dbg !18
|
488 |
+
%299 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i102, float %284, float %278) #4, !dbg !18
|
489 |
+
%.06.i104 = select i1 %.not6.i103, float %299, float %298, !dbg !18
|
490 |
+
%300 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
491 |
+
%.not7.i105 = icmp eq i32 %300, 0, !dbg !18
|
492 |
+
%301 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i104, float %284, float %277) #4, !dbg !18
|
493 |
+
%302 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i104, float %284, float %277) #4, !dbg !18
|
494 |
+
%.07.i106 = select i1 %.not7.i105, float %302, float %301, !dbg !18
|
495 |
+
%303 = fneg float %284, !dbg !18
|
496 |
+
%304 = select i1 %273, float %303, float %37, !dbg !18
|
497 |
+
%305 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
498 |
+
%.not8.i107 = icmp eq i32 %305, 0, !dbg !18
|
499 |
+
%306 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i106, float %304, float %304) #4, !dbg !18
|
500 |
+
%307 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i106, float %304, float %304) #4, !dbg !18
|
501 |
+
%.08.i108 = select i1 %.not8.i107, float %307, float %306, !dbg !18
|
502 |
+
br i1 %273, label %308, label %__nv_erff.exit113, !dbg !18
|
503 |
+
|
504 |
+
308: ; preds = %__internal_fmad.exit.i94
|
505 |
+
%309 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i108) #4, !dbg !18
|
506 |
+
%310 = fsub float 1.000000e+00, %309, !dbg !18
|
507 |
+
%311 = bitcast float %310 to i32, !dbg !18
|
508 |
+
%312 = bitcast float %37 to i32, !dbg !18
|
509 |
+
%313 = and i32 %312, -2147483648, !dbg !18
|
510 |
+
%314 = or i32 %313, %311, !dbg !18
|
511 |
+
%315 = bitcast i32 %314 to float, !dbg !18
|
512 |
+
br label %__nv_erff.exit113, !dbg !18
|
513 |
+
|
514 |
+
__nv_erff.exit113: ; preds = %__internal_fmad.exit.i94, %308
|
515 |
+
%r.0.i109 = phi float [ %315, %308 ], [ %.08.i108, %__internal_fmad.exit.i94 ], !dbg !18
|
516 |
+
%316 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
517 |
+
%.not.i114 = icmp eq i32 %316, 0, !dbg !18
|
518 |
+
%317 = tail call float @llvm.nvvm.fabs.ftz.f(float %38) #4, !dbg !18
|
519 |
+
%318 = tail call float @llvm.nvvm.fabs.f(float %38) #4, !dbg !18
|
520 |
+
%.0.i115 = select i1 %.not.i114, float %318, float %317, !dbg !18
|
521 |
+
%319 = fcmp oge float %.0.i115, 0x3FF00C1FC0000000, !dbg !18
|
522 |
+
br i1 %319, label %__nv_fabsf.exit1.i132, label %321, !dbg !18
|
523 |
+
|
524 |
+
__nv_fabsf.exit1.i132: ; preds = %__nv_erff.exit113
|
525 |
+
%320 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
526 |
+
%.not1.i133 = icmp eq i32 %320, 0, !dbg !18
|
527 |
+
%.01.i134 = select i1 %.not1.i133, float %318, float %317, !dbg !18
|
528 |
+
br label %__internal_fmad.exit.i116, !dbg !18
|
529 |
+
|
530 |
+
321: ; preds = %__nv_erff.exit113
|
531 |
+
%322 = fmul float %38, %38, !dbg !18
|
532 |
+
br label %__internal_fmad.exit.i116, !dbg !18
|
533 |
+
|
534 |
+
__internal_fmad.exit.i116: ; preds = %321, %__nv_fabsf.exit1.i132
|
535 |
+
%323 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i132 ], [ 0x3FC06EBA60000000, %321 ], !dbg !18
|
536 |
+
%324 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i132 ], [ 0xBFD8127580000000, %321 ], !dbg !18
|
537 |
+
%325 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i132 ], [ 0x3FBCE315E0000000, %321 ], !dbg !18
|
538 |
+
%326 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i132 ], [ 0xBF9B837CE0000000, %321 ], !dbg !18
|
539 |
+
%327 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i132 ], [ 0x3F755ABD40000000, %321 ], !dbg !18
|
540 |
+
%328 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i132 ], [ 0xBF4AE9A400000000, %321 ], !dbg !18
|
541 |
+
%329 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i132 ], [ 0x3F163D2D40000000, %321 ], !dbg !18
|
542 |
+
%330 = phi float [ %.01.i134, %__nv_fabsf.exit1.i132 ], [ %322, %321 ], !dbg !18
|
543 |
+
%331 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
544 |
+
%.not2.i117 = icmp eq i32 %331, 0, !dbg !18
|
545 |
+
%332 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %329, float %330, float %328) #4, !dbg !18
|
546 |
+
%333 = tail call float @llvm.nvvm.fma.rn.f(float %329, float %330, float %328) #4, !dbg !18
|
547 |
+
%.02.i118 = select i1 %.not2.i117, float %333, float %332, !dbg !18
|
548 |
+
%334 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
549 |
+
%.not3.i119 = icmp eq i32 %334, 0, !dbg !18
|
550 |
+
%335 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i118, float %330, float %327) #4, !dbg !18
|
551 |
+
%336 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i118, float %330, float %327) #4, !dbg !18
|
552 |
+
%.03.i120 = select i1 %.not3.i119, float %336, float %335, !dbg !18
|
553 |
+
%337 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
554 |
+
%.not4.i121 = icmp eq i32 %337, 0, !dbg !18
|
555 |
+
%338 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i120, float %330, float %326) #4, !dbg !18
|
556 |
+
%339 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i120, float %330, float %326) #4, !dbg !18
|
557 |
+
%.04.i122 = select i1 %.not4.i121, float %339, float %338, !dbg !18
|
558 |
+
%340 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
559 |
+
%.not5.i123 = icmp eq i32 %340, 0, !dbg !18
|
560 |
+
%341 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i122, float %330, float %325) #4, !dbg !18
|
561 |
+
%342 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i122, float %330, float %325) #4, !dbg !18
|
562 |
+
%.05.i124 = select i1 %.not5.i123, float %342, float %341, !dbg !18
|
563 |
+
%343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
564 |
+
%.not6.i125 = icmp eq i32 %343, 0, !dbg !18
|
565 |
+
%344 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i124, float %330, float %324) #4, !dbg !18
|
566 |
+
%345 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i124, float %330, float %324) #4, !dbg !18
|
567 |
+
%.06.i126 = select i1 %.not6.i125, float %345, float %344, !dbg !18
|
568 |
+
%346 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
569 |
+
%.not7.i127 = icmp eq i32 %346, 0, !dbg !18
|
570 |
+
%347 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i126, float %330, float %323) #4, !dbg !18
|
571 |
+
%348 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i126, float %330, float %323) #4, !dbg !18
|
572 |
+
%.07.i128 = select i1 %.not7.i127, float %348, float %347, !dbg !18
|
573 |
+
%349 = fneg float %330, !dbg !18
|
574 |
+
%350 = select i1 %319, float %349, float %38, !dbg !18
|
575 |
+
%351 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
576 |
+
%.not8.i129 = icmp eq i32 %351, 0, !dbg !18
|
577 |
+
%352 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i128, float %350, float %350) #4, !dbg !18
|
578 |
+
%353 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i128, float %350, float %350) #4, !dbg !18
|
579 |
+
%.08.i130 = select i1 %.not8.i129, float %353, float %352, !dbg !18
|
580 |
+
br i1 %319, label %354, label %__nv_erff.exit135, !dbg !18
|
581 |
+
|
582 |
+
354: ; preds = %__internal_fmad.exit.i116
|
583 |
+
%355 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i130) #4, !dbg !18
|
584 |
+
%356 = fsub float 1.000000e+00, %355, !dbg !18
|
585 |
+
%357 = bitcast float %356 to i32, !dbg !18
|
586 |
+
%358 = bitcast float %38 to i32, !dbg !18
|
587 |
+
%359 = and i32 %358, -2147483648, !dbg !18
|
588 |
+
%360 = or i32 %359, %357, !dbg !18
|
589 |
+
%361 = bitcast i32 %360 to float, !dbg !18
|
590 |
+
br label %__nv_erff.exit135, !dbg !18
|
591 |
+
|
592 |
+
__nv_erff.exit135: ; preds = %__internal_fmad.exit.i116, %354
|
593 |
+
%r.0.i131 = phi float [ %361, %354 ], [ %.08.i130, %__internal_fmad.exit.i116 ], !dbg !18
|
594 |
+
%362 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
595 |
+
%.not.i136 = icmp eq i32 %362, 0, !dbg !18
|
596 |
+
%363 = tail call float @llvm.nvvm.fabs.ftz.f(float %39) #4, !dbg !18
|
597 |
+
%364 = tail call float @llvm.nvvm.fabs.f(float %39) #4, !dbg !18
|
598 |
+
%.0.i137 = select i1 %.not.i136, float %364, float %363, !dbg !18
|
599 |
+
%365 = fcmp oge float %.0.i137, 0x3FF00C1FC0000000, !dbg !18
|
600 |
+
br i1 %365, label %__nv_fabsf.exit1.i154, label %367, !dbg !18
|
601 |
+
|
602 |
+
__nv_fabsf.exit1.i154: ; preds = %__nv_erff.exit135
|
603 |
+
%366 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
604 |
+
%.not1.i155 = icmp eq i32 %366, 0, !dbg !18
|
605 |
+
%.01.i156 = select i1 %.not1.i155, float %364, float %363, !dbg !18
|
606 |
+
br label %__internal_fmad.exit.i138, !dbg !18
|
607 |
+
|
608 |
+
367: ; preds = %__nv_erff.exit135
|
609 |
+
%368 = fmul float %39, %39, !dbg !18
|
610 |
+
br label %__internal_fmad.exit.i138, !dbg !18
|
611 |
+
|
612 |
+
__internal_fmad.exit.i138: ; preds = %367, %__nv_fabsf.exit1.i154
|
613 |
+
%369 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i154 ], [ 0x3FC06EBA60000000, %367 ], !dbg !18
|
614 |
+
%370 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i154 ], [ 0xBFD8127580000000, %367 ], !dbg !18
|
615 |
+
%371 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i154 ], [ 0x3FBCE315E0000000, %367 ], !dbg !18
|
616 |
+
%372 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i154 ], [ 0xBF9B837CE0000000, %367 ], !dbg !18
|
617 |
+
%373 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i154 ], [ 0x3F755ABD40000000, %367 ], !dbg !18
|
618 |
+
%374 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i154 ], [ 0xBF4AE9A400000000, %367 ], !dbg !18
|
619 |
+
%375 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i154 ], [ 0x3F163D2D40000000, %367 ], !dbg !18
|
620 |
+
%376 = phi float [ %.01.i156, %__nv_fabsf.exit1.i154 ], [ %368, %367 ], !dbg !18
|
621 |
+
%377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
622 |
+
%.not2.i139 = icmp eq i32 %377, 0, !dbg !18
|
623 |
+
%378 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %375, float %376, float %374) #4, !dbg !18
|
624 |
+
%379 = tail call float @llvm.nvvm.fma.rn.f(float %375, float %376, float %374) #4, !dbg !18
|
625 |
+
%.02.i140 = select i1 %.not2.i139, float %379, float %378, !dbg !18
|
626 |
+
%380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
627 |
+
%.not3.i141 = icmp eq i32 %380, 0, !dbg !18
|
628 |
+
%381 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i140, float %376, float %373) #4, !dbg !18
|
629 |
+
%382 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i140, float %376, float %373) #4, !dbg !18
|
630 |
+
%.03.i142 = select i1 %.not3.i141, float %382, float %381, !dbg !18
|
631 |
+
%383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
632 |
+
%.not4.i143 = icmp eq i32 %383, 0, !dbg !18
|
633 |
+
%384 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i142, float %376, float %372) #4, !dbg !18
|
634 |
+
%385 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i142, float %376, float %372) #4, !dbg !18
|
635 |
+
%.04.i144 = select i1 %.not4.i143, float %385, float %384, !dbg !18
|
636 |
+
%386 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
637 |
+
%.not5.i145 = icmp eq i32 %386, 0, !dbg !18
|
638 |
+
%387 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i144, float %376, float %371) #4, !dbg !18
|
639 |
+
%388 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i144, float %376, float %371) #4, !dbg !18
|
640 |
+
%.05.i146 = select i1 %.not5.i145, float %388, float %387, !dbg !18
|
641 |
+
%389 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
642 |
+
%.not6.i147 = icmp eq i32 %389, 0, !dbg !18
|
643 |
+
%390 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i146, float %376, float %370) #4, !dbg !18
|
644 |
+
%391 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i146, float %376, float %370) #4, !dbg !18
|
645 |
+
%.06.i148 = select i1 %.not6.i147, float %391, float %390, !dbg !18
|
646 |
+
%392 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
647 |
+
%.not7.i149 = icmp eq i32 %392, 0, !dbg !18
|
648 |
+
%393 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i148, float %376, float %369) #4, !dbg !18
|
649 |
+
%394 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i148, float %376, float %369) #4, !dbg !18
|
650 |
+
%.07.i150 = select i1 %.not7.i149, float %394, float %393, !dbg !18
|
651 |
+
%395 = fneg float %376, !dbg !18
|
652 |
+
%396 = select i1 %365, float %395, float %39, !dbg !18
|
653 |
+
%397 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
|
654 |
+
%.not8.i151 = icmp eq i32 %397, 0, !dbg !18
|
655 |
+
%398 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i150, float %396, float %396) #4, !dbg !18
|
656 |
+
%399 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i150, float %396, float %396) #4, !dbg !18
|
657 |
+
%.08.i152 = select i1 %.not8.i151, float %399, float %398, !dbg !18
|
658 |
+
br i1 %365, label %400, label %__nv_erff.exit157, !dbg !18
|
659 |
+
|
660 |
+
400: ; preds = %__internal_fmad.exit.i138
|
661 |
+
%401 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i152) #4, !dbg !18
|
662 |
+
%402 = fsub float 1.000000e+00, %401, !dbg !18
|
663 |
+
%403 = bitcast float %402 to i32, !dbg !18
|
664 |
+
%404 = bitcast float %39 to i32, !dbg !18
|
665 |
+
%405 = and i32 %404, -2147483648, !dbg !18
|
666 |
+
%406 = or i32 %405, %403, !dbg !18
|
667 |
+
%407 = bitcast i32 %406 to float, !dbg !18
|
668 |
+
br label %__nv_erff.exit157, !dbg !18
|
669 |
+
|
670 |
+
__nv_erff.exit157: ; preds = %__internal_fmad.exit.i138, %400
|
671 |
+
%r.0.i153 = phi float [ %407, %400 ], [ %.08.i152, %__internal_fmad.exit.i138 ], !dbg !18
|
672 |
+
%408 = fmul float %31, 5.000000e-01, !dbg !19
|
673 |
+
%409 = fmul float %30, 5.000000e-01, !dbg !19
|
674 |
+
%410 = fmul float %29, 5.000000e-01, !dbg !19
|
675 |
+
%411 = fmul float %28, 5.000000e-01, !dbg !19
|
676 |
+
%412 = fmul float %27, 5.000000e-01, !dbg !19
|
677 |
+
%413 = fmul float %26, 5.000000e-01, !dbg !19
|
678 |
+
%414 = fmul float %25, 5.000000e-01, !dbg !19
|
679 |
+
%415 = fmul float %24, 5.000000e-01, !dbg !19
|
680 |
+
%416 = fadd float %r.0.i, 1.000000e+00, !dbg !20
|
681 |
+
%417 = fadd float %r.0.i21, 1.000000e+00, !dbg !20
|
682 |
+
%418 = fadd float %r.0.i43, 1.000000e+00, !dbg !20
|
683 |
+
%419 = fadd float %r.0.i65, 1.000000e+00, !dbg !20
|
684 |
+
%420 = fadd float %r.0.i87, 1.000000e+00, !dbg !20
|
685 |
+
%421 = fadd float %r.0.i109, 1.000000e+00, !dbg !20
|
686 |
+
%422 = fadd float %r.0.i131, 1.000000e+00, !dbg !20
|
687 |
+
%423 = fadd float %r.0.i153, 1.000000e+00, !dbg !20
|
688 |
+
%424 = fmul float %415, %416, !dbg !21
|
689 |
+
%425 = fmul float %414, %417, !dbg !21
|
690 |
+
%426 = fmul float %413, %418, !dbg !21
|
691 |
+
%427 = fmul float %412, %419, !dbg !21
|
692 |
+
%428 = fmul float %411, %420, !dbg !21
|
693 |
+
%429 = fmul float %410, %421, !dbg !21
|
694 |
+
%430 = fmul float %409, %422, !dbg !21
|
695 |
+
%431 = fmul float %408, %423, !dbg !21
|
696 |
+
%432 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %424) #4, !dbg !22
|
697 |
+
%433 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %425) #4, !dbg !22
|
698 |
+
%434 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %426) #4, !dbg !22
|
699 |
+
%435 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %427) #4, !dbg !22
|
700 |
+
%436 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %428) #4, !dbg !22
|
701 |
+
%437 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %429) #4, !dbg !22
|
702 |
+
%438 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %430) #4, !dbg !22
|
703 |
+
%439 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %431) #4, !dbg !22
|
704 |
+
%440 = insertelement <2 x i16> undef, i16 %432, i64 0, !dbg !22
|
705 |
+
%441 = insertelement <2 x i16> %440, i16 %433, i64 1, !dbg !22
|
706 |
+
%442 = bitcast <2 x i16> %441 to i32, !dbg !22
|
707 |
+
%443 = insertelement <2 x i16> undef, i16 %434, i64 0, !dbg !22
|
708 |
+
%444 = insertelement <2 x i16> %443, i16 %435, i64 1, !dbg !22
|
709 |
+
%445 = bitcast <2 x i16> %444 to i32, !dbg !22
|
710 |
+
%446 = insertelement <2 x i16> undef, i16 %436, i64 0, !dbg !22
|
711 |
+
%447 = insertelement <2 x i16> %446, i16 %437, i64 1, !dbg !22
|
712 |
+
%448 = bitcast <2 x i16> %447 to i32, !dbg !22
|
713 |
+
%449 = insertelement <2 x i16> undef, i16 %438, i64 0, !dbg !22
|
714 |
+
%450 = insertelement <2 x i16> %449, i16 %439, i64 1, !dbg !22
|
715 |
+
%451 = bitcast <2 x i16> %450 to i32, !dbg !22
|
716 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %442, i32 %445, i32 %448, i32 %451, ptr addrspace(1) %10, i1 true) #4, !dbg !22
|
717 |
+
ret void, !dbg !23
|
718 |
+
}
|
719 |
+
|
720 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
721 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
722 |
+
|
723 |
+
; Function Attrs: alwaysinline nounwind
|
724 |
+
define float @__nv_erff(float %a) local_unnamed_addr #1 {
|
725 |
+
__nv_fabsf.exit:
|
726 |
+
%0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
727 |
+
%.not = icmp eq i32 %0, 0
|
728 |
+
%1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
|
729 |
+
%2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
|
730 |
+
%.0 = select i1 %.not, float %2, float %1
|
731 |
+
%3 = fcmp oge float %.0, 0x3FF00C1FC0000000
|
732 |
+
br i1 %3, label %__nv_fabsf.exit1, label %5
|
733 |
+
|
734 |
+
__nv_fabsf.exit1: ; preds = %__nv_fabsf.exit
|
735 |
+
%4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
736 |
+
%.not1 = icmp eq i32 %4, 0
|
737 |
+
%.01 = select i1 %.not1, float %2, float %1
|
738 |
+
br label %__internal_fmad.exit
|
739 |
+
|
740 |
+
5: ; preds = %__nv_fabsf.exit
|
741 |
+
%6 = fmul float %a, %a
|
742 |
+
br label %__internal_fmad.exit
|
743 |
+
|
744 |
+
__internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1
|
745 |
+
%7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
|
746 |
+
%8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
|
747 |
+
%9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
|
748 |
+
%10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
|
749 |
+
%11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
|
750 |
+
%12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
|
751 |
+
%13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
|
752 |
+
%14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
|
753 |
+
%15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
754 |
+
%.not2 = icmp eq i32 %15, 0
|
755 |
+
%16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
|
756 |
+
%17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
|
757 |
+
%.02 = select i1 %.not2, float %17, float %16
|
758 |
+
%18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
759 |
+
%.not3 = icmp eq i32 %18, 0
|
760 |
+
%19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
|
761 |
+
%20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
|
762 |
+
%.03 = select i1 %.not3, float %20, float %19
|
763 |
+
%21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
764 |
+
%.not4 = icmp eq i32 %21, 0
|
765 |
+
%22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
|
766 |
+
%23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
|
767 |
+
%.04 = select i1 %.not4, float %23, float %22
|
768 |
+
%24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
769 |
+
%.not5 = icmp eq i32 %24, 0
|
770 |
+
%25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
|
771 |
+
%26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
|
772 |
+
%.05 = select i1 %.not5, float %26, float %25
|
773 |
+
%27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
774 |
+
%.not6 = icmp eq i32 %27, 0
|
775 |
+
%28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
|
776 |
+
%29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
|
777 |
+
%.06 = select i1 %.not6, float %29, float %28
|
778 |
+
%30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
779 |
+
%.not7 = icmp eq i32 %30, 0
|
780 |
+
%31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
|
781 |
+
%32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
|
782 |
+
%.07 = select i1 %.not7, float %32, float %31
|
783 |
+
%33 = fneg float %14
|
784 |
+
%34 = select i1 %3, float %33, float %a
|
785 |
+
%35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
786 |
+
%.not8 = icmp eq i32 %35, 0
|
787 |
+
%36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
|
788 |
+
%37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
|
789 |
+
%.08 = select i1 %.not8, float %37, float %36
|
790 |
+
br i1 %3, label %38, label %46
|
791 |
+
|
792 |
+
38: ; preds = %__internal_fmad.exit
|
793 |
+
%39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
|
794 |
+
%40 = fsub float 1.000000e+00, %39
|
795 |
+
%41 = bitcast float %40 to i32
|
796 |
+
%42 = bitcast float %a to i32
|
797 |
+
%43 = and i32 %42, -2147483648
|
798 |
+
%44 = or i32 %43, %41
|
799 |
+
%45 = bitcast i32 %44 to float
|
800 |
+
br label %46
|
801 |
+
|
802 |
+
46: ; preds = %38, %__internal_fmad.exit
|
803 |
+
%r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
|
804 |
+
ret float %r.0
|
805 |
+
}
|
806 |
+
|
807 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
|
808 |
+
|
809 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
810 |
+
declare float @llvm.nvvm.fabs.ftz.f(float) #0
|
811 |
+
|
812 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
813 |
+
declare float @llvm.nvvm.fabs.f(float) #0
|
814 |
+
|
815 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
816 |
+
declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
|
817 |
+
|
818 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
819 |
+
declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
|
820 |
+
|
821 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
822 |
+
declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
|
823 |
+
|
824 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
825 |
+
attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
826 |
+
attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
827 |
+
attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
828 |
+
attributes #4 = { nounwind }
|
829 |
+
|
830 |
+
!llvm.module.flags = !{!0, !1}
|
831 |
+
!llvm.dbg.cu = !{!2}
|
832 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
833 |
+
!llvm.ident = !{!6}
|
834 |
+
|
835 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
836 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
837 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
838 |
+
!3 = !DIFile(filename: "ckphrtdpgsxl7sfarkkzylhv4st3uhmzvg3u6z5excfp6ydybq74.py", directory: "/tmp/torchinductor_root/kp")
|
839 |
+
!4 = !{ptr @triton__0d1de, !"kernel", i32 1}
|
840 |
+
!5 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
|
841 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
842 |
+
!7 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
843 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
844 |
+
!9 = !{}
|
845 |
+
!10 = !DILocation(line: 21, column: 36, scope: !7)
|
846 |
+
!11 = !DILocation(line: 20, column: 28, scope: !7)
|
847 |
+
!12 = !DILocation(line: 20, column: 33, scope: !7)
|
848 |
+
!13 = !DILocation(line: 21, column: 23, scope: !7)
|
849 |
+
!14 = !DILocation(line: 24, column: 34, scope: !7)
|
850 |
+
!15 = !DILocation(line: 24, column: 39, scope: !7)
|
851 |
+
!16 = !DILocation(line: 24, column: 48, scope: !7)
|
852 |
+
!17 = !DILocation(line: 29, column: 18, scope: !7)
|
853 |
+
!18 = !DILocation(line: 30, column: 23, scope: !7)
|
854 |
+
!19 = !DILocation(line: 27, column: 18, scope: !7)
|
855 |
+
!20 = !DILocation(line: 32, column: 18, scope: !7)
|
856 |
+
!21 = !DILocation(line: 33, column: 18, scope: !7)
|
857 |
+
!22 = !DILocation(line: 35, column: 40, scope: !7)
|
858 |
+
!23 = !DILocation(line: 35, column: 4, scope: !7)
|
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.llir
ADDED
@@ -0,0 +1,290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
|
7 |
+
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%7 = and i32 %6, 31, !dbg !8
|
9 |
+
%8 = lshr i32 %6, 5, !dbg !8
|
10 |
+
%9 = and i32 %6, 7, !dbg !8
|
11 |
+
%10 = shl nuw nsw i32 %9, 2, !dbg !8
|
12 |
+
%11 = and i32 %8, 7, !dbg !9
|
13 |
+
%12 = lshr i32 %7, 3, !dbg !9
|
14 |
+
%13 = shl nuw nsw i32 %11, 2, !dbg !9
|
15 |
+
%14 = or i32 %13, %12, !dbg !9
|
16 |
+
%15 = or i32 %14, 96, !dbg !9
|
17 |
+
%16 = or i32 %10, 1, !dbg !10
|
18 |
+
%17 = or i32 %10, 2, !dbg !10
|
19 |
+
%18 = or i32 %10, 3, !dbg !10
|
20 |
+
%19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !14
|
21 |
+
%20 = shl i32 %19, 5, !dbg !15
|
22 |
+
%21 = or i32 %20, %10, !dbg !16
|
23 |
+
%22 = or i32 %20, %7, !dbg !16
|
24 |
+
%23 = icmp ult i32 %15, 120, !dbg !17
|
25 |
+
%24 = shl nuw nsw i32 %14, 17, !dbg !18
|
26 |
+
%25 = or i32 %24, 4194304, !dbg !18
|
27 |
+
%26 = or i32 %24, 8388608, !dbg !18
|
28 |
+
%27 = shl nuw nsw i32 %15, 17, !dbg !18
|
29 |
+
%28 = add i32 %21, %24, !dbg !19
|
30 |
+
%29 = add i32 %25, %21, !dbg !19
|
31 |
+
%30 = add i32 %26, %21, !dbg !19
|
32 |
+
%31 = add i32 %21, %27, !dbg !19
|
33 |
+
%32 = sext i32 %28 to i64, !dbg !20
|
34 |
+
%33 = getelementptr float, ptr addrspace(1) %0, i64 %32, !dbg !20
|
35 |
+
%34 = sext i32 %29 to i64, !dbg !20
|
36 |
+
%35 = getelementptr float, ptr addrspace(1) %0, i64 %34, !dbg !20
|
37 |
+
%36 = sext i32 %30 to i64, !dbg !20
|
38 |
+
%37 = getelementptr float, ptr addrspace(1) %0, i64 %36, !dbg !20
|
39 |
+
%38 = sext i32 %31 to i64, !dbg !20
|
40 |
+
%39 = getelementptr float, ptr addrspace(1) %0, i64 %38, !dbg !20
|
41 |
+
%40 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %33, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
|
42 |
+
%41 = extractvalue { i32, i32, i32, i32 } %40, 0, !dbg !21
|
43 |
+
%42 = extractvalue { i32, i32, i32, i32 } %40, 1, !dbg !21
|
44 |
+
%43 = extractvalue { i32, i32, i32, i32 } %40, 2, !dbg !21
|
45 |
+
%44 = extractvalue { i32, i32, i32, i32 } %40, 3, !dbg !21
|
46 |
+
%45 = bitcast i32 %41 to float, !dbg !21
|
47 |
+
%46 = bitcast i32 %42 to float, !dbg !21
|
48 |
+
%47 = bitcast i32 %43 to float, !dbg !21
|
49 |
+
%48 = bitcast i32 %44 to float, !dbg !21
|
50 |
+
%49 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
|
51 |
+
%50 = extractvalue { i32, i32, i32, i32 } %49, 0, !dbg !21
|
52 |
+
%51 = extractvalue { i32, i32, i32, i32 } %49, 1, !dbg !21
|
53 |
+
%52 = extractvalue { i32, i32, i32, i32 } %49, 2, !dbg !21
|
54 |
+
%53 = extractvalue { i32, i32, i32, i32 } %49, 3, !dbg !21
|
55 |
+
%54 = bitcast i32 %50 to float, !dbg !21
|
56 |
+
%55 = bitcast i32 %51 to float, !dbg !21
|
57 |
+
%56 = bitcast i32 %52 to float, !dbg !21
|
58 |
+
%57 = bitcast i32 %53 to float, !dbg !21
|
59 |
+
%58 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %37, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
|
60 |
+
%59 = extractvalue { i32, i32, i32, i32 } %58, 0, !dbg !21
|
61 |
+
%60 = extractvalue { i32, i32, i32, i32 } %58, 1, !dbg !21
|
62 |
+
%61 = extractvalue { i32, i32, i32, i32 } %58, 2, !dbg !21
|
63 |
+
%62 = extractvalue { i32, i32, i32, i32 } %58, 3, !dbg !21
|
64 |
+
%63 = bitcast i32 %59 to float, !dbg !21
|
65 |
+
%64 = bitcast i32 %60 to float, !dbg !21
|
66 |
+
%65 = bitcast i32 %61 to float, !dbg !21
|
67 |
+
%66 = bitcast i32 %62 to float, !dbg !21
|
68 |
+
%67 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %39, i1 %23, i32 0, i1 %23, i32 0, i1 %23, i32 0, i1 %23, i32 0, i1 %23) #3, !dbg !21
|
69 |
+
%68 = extractvalue { i32, i32, i32, i32 } %67, 0, !dbg !21
|
70 |
+
%69 = extractvalue { i32, i32, i32, i32 } %67, 1, !dbg !21
|
71 |
+
%70 = extractvalue { i32, i32, i32, i32 } %67, 2, !dbg !21
|
72 |
+
%71 = extractvalue { i32, i32, i32, i32 } %67, 3, !dbg !21
|
73 |
+
%72 = bitcast i32 %68 to float, !dbg !21
|
74 |
+
%73 = bitcast i32 %69 to float, !dbg !21
|
75 |
+
%74 = bitcast i32 %70 to float, !dbg !21
|
76 |
+
%75 = bitcast i32 %71 to float, !dbg !21
|
77 |
+
%76 = fadd float %45, 0.000000e+00, !dbg !22
|
78 |
+
%77 = fadd float %46, 0.000000e+00, !dbg !22
|
79 |
+
%78 = fadd float %47, 0.000000e+00, !dbg !22
|
80 |
+
%79 = fadd float %48, 0.000000e+00, !dbg !22
|
81 |
+
%80 = fadd float %54, 0.000000e+00, !dbg !22
|
82 |
+
%81 = fadd float %55, 0.000000e+00, !dbg !22
|
83 |
+
%82 = fadd float %56, 0.000000e+00, !dbg !22
|
84 |
+
%83 = fadd float %57, 0.000000e+00, !dbg !22
|
85 |
+
%84 = fadd float %63, 0.000000e+00, !dbg !22
|
86 |
+
%85 = fadd float %64, 0.000000e+00, !dbg !22
|
87 |
+
%86 = fadd float %65, 0.000000e+00, !dbg !22
|
88 |
+
%87 = fadd float %66, 0.000000e+00, !dbg !22
|
89 |
+
%88 = fadd float %72, 0.000000e+00, !dbg !22
|
90 |
+
%89 = fadd float %73, 0.000000e+00, !dbg !22
|
91 |
+
%90 = fadd float %74, 0.000000e+00, !dbg !22
|
92 |
+
%91 = fadd float %75, 0.000000e+00, !dbg !22
|
93 |
+
%92 = select i1 %23, float %88, float 0.000000e+00, !dbg !23
|
94 |
+
%93 = select i1 %23, float %89, float 0.000000e+00, !dbg !23
|
95 |
+
%94 = select i1 %23, float %90, float 0.000000e+00, !dbg !23
|
96 |
+
%95 = select i1 %23, float %91, float 0.000000e+00, !dbg !23
|
97 |
+
%96 = fadd float %76, %80, !dbg !24
|
98 |
+
%97 = fadd float %77, %81, !dbg !24
|
99 |
+
%98 = fadd float %78, %82, !dbg !24
|
100 |
+
%99 = fadd float %79, %83, !dbg !24
|
101 |
+
%100 = fadd float %96, %84, !dbg !24
|
102 |
+
%101 = fadd float %97, %85, !dbg !24
|
103 |
+
%102 = fadd float %98, %86, !dbg !24
|
104 |
+
%103 = fadd float %99, %87, !dbg !24
|
105 |
+
%104 = fadd float %100, %92, !dbg !24
|
106 |
+
%105 = fadd float %101, %93, !dbg !24
|
107 |
+
%106 = fadd float %102, %94, !dbg !24
|
108 |
+
%107 = fadd float %103, %95, !dbg !24
|
109 |
+
%108 = bitcast float %104 to i32, !dbg !10
|
110 |
+
%109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 16, i32 31), !dbg !10
|
111 |
+
%110 = bitcast i32 %109 to float, !dbg !10
|
112 |
+
%111 = fadd float %104, %110, !dbg !24
|
113 |
+
%112 = bitcast float %111 to i32, !dbg !10
|
114 |
+
%113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 8, i32 31), !dbg !10
|
115 |
+
%114 = bitcast i32 %113 to float, !dbg !10
|
116 |
+
%115 = fadd float %111, %114, !dbg !24
|
117 |
+
%116 = bitcast float %105 to i32, !dbg !10
|
118 |
+
%117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 16, i32 31), !dbg !10
|
119 |
+
%118 = bitcast i32 %117 to float, !dbg !10
|
120 |
+
%119 = fadd float %105, %118, !dbg !24
|
121 |
+
%120 = bitcast float %119 to i32, !dbg !10
|
122 |
+
%121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 8, i32 31), !dbg !10
|
123 |
+
%122 = bitcast i32 %121 to float, !dbg !10
|
124 |
+
%123 = fadd float %119, %122, !dbg !24
|
125 |
+
%124 = bitcast float %106 to i32, !dbg !10
|
126 |
+
%125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 16, i32 31), !dbg !10
|
127 |
+
%126 = bitcast i32 %125 to float, !dbg !10
|
128 |
+
%127 = fadd float %106, %126, !dbg !24
|
129 |
+
%128 = bitcast float %127 to i32, !dbg !10
|
130 |
+
%129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 8, i32 31), !dbg !10
|
131 |
+
%130 = bitcast i32 %129 to float, !dbg !10
|
132 |
+
%131 = fadd float %127, %130, !dbg !24
|
133 |
+
%132 = bitcast float %107 to i32, !dbg !10
|
134 |
+
%133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !10
|
135 |
+
%134 = bitcast i32 %133 to float, !dbg !10
|
136 |
+
%135 = fadd float %107, %134, !dbg !24
|
137 |
+
%136 = bitcast float %135 to i32, !dbg !10
|
138 |
+
%137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !10
|
139 |
+
%138 = bitcast i32 %137 to float, !dbg !10
|
140 |
+
%139 = fadd float %135, %138, !dbg !24
|
141 |
+
%140 = icmp ult i32 %7, 8, !dbg !10
|
142 |
+
%141 = shl nuw nsw i32 %9, 5, !dbg !10
|
143 |
+
%142 = or i32 %141, %11, !dbg !10
|
144 |
+
%143 = zext nneg i32 %142 to i64, !dbg !10
|
145 |
+
%144 = getelementptr float, ptr addrspace(3) @global_smem, i64 %143, !dbg !10
|
146 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %144, float %115, i1 %140) #3, !dbg !10
|
147 |
+
%145 = shl nuw nsw i32 %16, 3, !dbg !10
|
148 |
+
%146 = or i32 %145, %11, !dbg !10
|
149 |
+
%147 = zext nneg i32 %146 to i64, !dbg !10
|
150 |
+
%148 = getelementptr float, ptr addrspace(3) @global_smem, i64 %147, !dbg !10
|
151 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %148, float %123, i1 %140) #3, !dbg !10
|
152 |
+
%149 = shl nuw nsw i32 %17, 3, !dbg !10
|
153 |
+
%150 = or i32 %149, %11, !dbg !10
|
154 |
+
%151 = zext nneg i32 %150 to i64, !dbg !10
|
155 |
+
%152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !10
|
156 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %152, float %131, i1 %140) #3, !dbg !10
|
157 |
+
%153 = shl nuw nsw i32 %18, 3, !dbg !10
|
158 |
+
%154 = or i32 %153, %11, !dbg !10
|
159 |
+
%155 = zext nneg i32 %154 to i64, !dbg !10
|
160 |
+
%156 = getelementptr float, ptr addrspace(3) @global_smem, i64 %155, !dbg !10
|
161 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %156, float %139, i1 %140) #3, !dbg !10
|
162 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !10
|
163 |
+
%157 = icmp slt i32 %6, 256, !dbg !10
|
164 |
+
%158 = sext i32 %6 to i64, !dbg !10
|
165 |
+
%159 = getelementptr float, ptr addrspace(3) @global_smem, i64 %158, !dbg !10
|
166 |
+
%160 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %159, i1 %157) #3, !dbg !10
|
167 |
+
%161 = bitcast float %160 to i32, !dbg !10
|
168 |
+
%162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 4, i32 31), !dbg !10
|
169 |
+
%163 = bitcast i32 %162 to float, !dbg !10
|
170 |
+
%164 = fadd float %160, %163, !dbg !24
|
171 |
+
%165 = bitcast float %164 to i32, !dbg !10
|
172 |
+
%166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 2, i32 31), !dbg !10
|
173 |
+
%167 = bitcast i32 %166 to float, !dbg !10
|
174 |
+
%168 = fadd float %164, %167, !dbg !24
|
175 |
+
%169 = bitcast float %168 to i32, !dbg !10
|
176 |
+
%170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 1, i32 31), !dbg !10
|
177 |
+
%171 = bitcast i32 %170 to float, !dbg !10
|
178 |
+
%172 = fadd float %168, %171, !dbg !24
|
179 |
+
%173 = icmp eq i32 %9, 0, !dbg !10
|
180 |
+
%174 = and i1 %157, %173, !dbg !10
|
181 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %159, float %172, i1 %174) #3, !dbg !10
|
182 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !10
|
183 |
+
%175 = zext nneg i32 %141 to i64, !dbg !10
|
184 |
+
%176 = getelementptr float, ptr addrspace(3) @global_smem, i64 %175, !dbg !10
|
185 |
+
%177 = load float, ptr addrspace(3) %176, align 4, !dbg !10
|
186 |
+
%178 = zext nneg i32 %145 to i64, !dbg !10
|
187 |
+
%179 = getelementptr float, ptr addrspace(3) @global_smem, i64 %178, !dbg !10
|
188 |
+
%180 = load float, ptr addrspace(3) %179, align 4, !dbg !10
|
189 |
+
%181 = zext nneg i32 %149 to i64, !dbg !10
|
190 |
+
%182 = getelementptr float, ptr addrspace(3) @global_smem, i64 %181, !dbg !10
|
191 |
+
%183 = load float, ptr addrspace(3) %182, align 4, !dbg !10
|
192 |
+
%184 = zext nneg i32 %153 to i64, !dbg !10
|
193 |
+
%185 = getelementptr float, ptr addrspace(3) @global_smem, i64 %184, !dbg !10
|
194 |
+
%186 = load float, ptr addrspace(3) %185, align 4, !dbg !10
|
195 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !28
|
196 |
+
%187 = zext nneg i32 %10 to i64, !dbg !28
|
197 |
+
%188 = getelementptr float, ptr addrspace(3) @global_smem, i64 %187, !dbg !28
|
198 |
+
%189 = insertelement <1 x float> undef, float %177, i64 0, !dbg !28
|
199 |
+
store <1 x float> %189, ptr addrspace(3) %188, align 4, !dbg !28
|
200 |
+
%190 = zext nneg i32 %16 to i64, !dbg !28
|
201 |
+
%191 = getelementptr float, ptr addrspace(3) @global_smem, i64 %190, !dbg !28
|
202 |
+
%192 = insertelement <1 x float> undef, float %180, i64 0, !dbg !28
|
203 |
+
store <1 x float> %192, ptr addrspace(3) %191, align 4, !dbg !28
|
204 |
+
%193 = zext nneg i32 %17 to i64, !dbg !28
|
205 |
+
%194 = getelementptr float, ptr addrspace(3) @global_smem, i64 %193, !dbg !28
|
206 |
+
%195 = insertelement <1 x float> undef, float %183, i64 0, !dbg !28
|
207 |
+
store <1 x float> %195, ptr addrspace(3) %194, align 4, !dbg !28
|
208 |
+
%196 = zext nneg i32 %18 to i64, !dbg !28
|
209 |
+
%197 = getelementptr float, ptr addrspace(3) @global_smem, i64 %196, !dbg !28
|
210 |
+
%198 = insertelement <1 x float> undef, float %186, i64 0, !dbg !28
|
211 |
+
store <1 x float> %198, ptr addrspace(3) %197, align 4, !dbg !28
|
212 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !28
|
213 |
+
%199 = zext nneg i32 %7 to i64, !dbg !28
|
214 |
+
%200 = getelementptr float, ptr addrspace(3) @global_smem, i64 %199, !dbg !28
|
215 |
+
%201 = load <1 x float>, ptr addrspace(3) %200, align 4, !dbg !28
|
216 |
+
%.frozen = freeze i32 %22
|
217 |
+
%202 = sdiv i32 %.frozen, 256, !dbg !29
|
218 |
+
%203 = mul i32 %202, 256
|
219 |
+
%.decomposed = sub i32 %.frozen, %203
|
220 |
+
%204 = sext i32 %202 to i64, !dbg !30
|
221 |
+
%205 = getelementptr i64, ptr addrspace(1) %1, i64 %204, !dbg !30
|
222 |
+
%206 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %205, i1 true) #3, !dbg !31
|
223 |
+
%207 = lshr i64 %206, 54, !dbg !32
|
224 |
+
%208 = and i64 %207, 512, !dbg !32
|
225 |
+
%209 = add i64 %208, %206, !dbg !32
|
226 |
+
%210 = shl i64 %209, 8, !dbg !33
|
227 |
+
%211 = sext i32 %.decomposed to i64, !dbg !34
|
228 |
+
%212 = getelementptr float, ptr addrspace(1) %2, i64 %210, !dbg !35
|
229 |
+
%213 = getelementptr float, ptr addrspace(1) %212, i64 %211, !dbg !35
|
230 |
+
%214 = icmp eq i32 %11, 0, !dbg !36
|
231 |
+
%215 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %213, <1 x float> %201, i1 %214) #3, !dbg !36
|
232 |
+
ret void, !dbg !37
|
233 |
+
}
|
234 |
+
|
235 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
236 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
237 |
+
|
238 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
239 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
240 |
+
|
241 |
+
; Function Attrs: convergent nocallback nounwind
|
242 |
+
declare void @llvm.nvvm.barrier0() #2
|
243 |
+
|
244 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
245 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
246 |
+
attributes #2 = { convergent nocallback nounwind }
|
247 |
+
attributes #3 = { nounwind }
|
248 |
+
|
249 |
+
!llvm.module.flags = !{!0}
|
250 |
+
!llvm.dbg.cu = !{!1}
|
251 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
252 |
+
|
253 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
254 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
255 |
+
!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
|
256 |
+
!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
|
257 |
+
!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256}
|
258 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
259 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
260 |
+
!7 = !{}
|
261 |
+
!8 = !DILocation(line: 22, column: 44, scope: !5)
|
262 |
+
!9 = !DILocation(line: 24, column: 33, scope: !5)
|
263 |
+
!10 = !DILocation(line: 243, column: 36, scope: !11, inlinedAt: !13)
|
264 |
+
!11 = distinct !DILexicalBlockFile(scope: !5, file: !12, discriminator: 0)
|
265 |
+
!12 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
266 |
+
!13 = !DILocation(line: 35, column: 25, scope: !11)
|
267 |
+
!14 = !DILocation(line: 21, column: 28, scope: !5)
|
268 |
+
!15 = !DILocation(line: 21, column: 33, scope: !5)
|
269 |
+
!16 = !DILocation(line: 22, column: 23, scope: !5)
|
270 |
+
!17 = !DILocation(line: 29, column: 25, scope: !5)
|
271 |
+
!18 = !DILocation(line: 31, column: 47, scope: !5)
|
272 |
+
!19 = !DILocation(line: 31, column: 40, scope: !5)
|
273 |
+
!20 = !DILocation(line: 31, column: 34, scope: !5)
|
274 |
+
!21 = !DILocation(line: 31, column: 53, scope: !5)
|
275 |
+
!22 = !DILocation(line: 33, column: 23, scope: !5)
|
276 |
+
!23 = !DILocation(line: 34, column: 38, scope: !5)
|
277 |
+
!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
|
278 |
+
!25 = distinct !DILexicalBlockFile(scope: !11, file: !12, discriminator: 0)
|
279 |
+
!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
|
280 |
+
!27 = !DILocation(line: 35, column: 25, scope: !25)
|
281 |
+
!28 = !DILocation(line: 35, column: 28, scope: !5)
|
282 |
+
!29 = !DILocation(line: 36, column: 20, scope: !5)
|
283 |
+
!30 = !DILocation(line: 38, column: 30, scope: !5)
|
284 |
+
!31 = !DILocation(line: 38, column: 35, scope: !5)
|
285 |
+
!32 = !DILocation(line: 41, column: 32, scope: !5)
|
286 |
+
!33 = !DILocation(line: 45, column: 40, scope: !5)
|
287 |
+
!34 = !DILocation(line: 45, column: 36, scope: !5)
|
288 |
+
!35 = !DILocation(line: 45, column: 30, scope: !5)
|
289 |
+
!36 = !DILocation(line: 45, column: 55, scope: !5)
|
290 |
+
!37 = !DILocation(line: 45, column: 4, scope: !5)
|
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ptx
ADDED
@@ -0,0 +1,653 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3de4e
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3de4e(
|
13 |
+
.param .u64 triton__0d1d2d3de4e_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3de4e_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3de4e_param_2,
|
16 |
+
.param .u32 triton__0d1d2d3de4e_param_3,
|
17 |
+
.param .u32 triton__0d1d2d3de4e_param_4
|
18 |
+
)
|
19 |
+
.maxntid 256, 1, 1
|
20 |
+
{
|
21 |
+
.reg .pred %p<30>;
|
22 |
+
.reg .b32 %r<112>;
|
23 |
+
.reg .f32 %f<76>;
|
24 |
+
.reg .b64 %rd<22>;
|
25 |
+
.loc 1 18 0
|
26 |
+
$L__func_begin0:
|
27 |
+
.loc 1 18 0
|
28 |
+
|
29 |
+
ld.param.u64 %rd8, [triton__0d1d2d3de4e_param_0];
|
30 |
+
ld.param.u64 %rd9, [triton__0d1d2d3de4e_param_1];
|
31 |
+
$L__tmp0:
|
32 |
+
.loc 1 22 44
|
33 |
+
mov.u32 %r48, %tid.x;
|
34 |
+
and.b32 %r49, %r48, 31;
|
35 |
+
ld.param.u64 %rd10, [triton__0d1d2d3de4e_param_2];
|
36 |
+
and.b32 %r50, %r48, 7;
|
37 |
+
shl.b32 %r51, %r50, 2;
|
38 |
+
.loc 1 24 33
|
39 |
+
bfe.u32 %r52, %r48, 5, 3;
|
40 |
+
bfe.u32 %r53, %r48, 3, 2;
|
41 |
+
shl.b32 %r54, %r52, 2;
|
42 |
+
or.b32 %r55, %r54, %r53;
|
43 |
+
or.b32 %r56, %r55, 96;
|
44 |
+
.loc 1 21 28
|
45 |
+
mov.u32 %r1, %ctaid.x;
|
46 |
+
.loc 1 21 33
|
47 |
+
shl.b32 %r57, %r1, 5;
|
48 |
+
.loc 1 22 23
|
49 |
+
or.b32 %r58, %r57, %r51;
|
50 |
+
or.b32 %r59, %r57, %r49;
|
51 |
+
.loc 1 29 25
|
52 |
+
setp.lt.u32 %p16, %r56, 120;
|
53 |
+
.loc 1 31 47
|
54 |
+
shl.b32 %r60, %r55, 17;
|
55 |
+
shl.b32 %r61, %r56, 17;
|
56 |
+
.loc 1 31 40
|
57 |
+
add.s32 %r62, %r58, %r60;
|
58 |
+
add.s32 %r63, %r62, 4194304;
|
59 |
+
add.s32 %r64, %r62, 8388608;
|
60 |
+
add.s32 %r65, %r58, %r61;
|
61 |
+
.loc 1 31 34
|
62 |
+
mul.wide.s32 %rd11, %r62, 4;
|
63 |
+
add.s64 %rd1, %rd8, %rd11;
|
64 |
+
mul.wide.s32 %rd12, %r63, 4;
|
65 |
+
add.s64 %rd2, %rd8, %rd12;
|
66 |
+
mul.wide.s32 %rd13, %r64, 4;
|
67 |
+
add.s64 %rd3, %rd8, %rd13;
|
68 |
+
mul.wide.s32 %rd14, %r65, 4;
|
69 |
+
add.s64 %rd4, %rd8, %rd14;
|
70 |
+
mov.b32 %r6, 0;
|
71 |
+
mov.pred %p1, -1;
|
72 |
+
.loc 1 31 53
|
73 |
+
mov.u32 %r2, 0x0;
|
74 |
+
mov.u32 %r3, 0x0;
|
75 |
+
mov.u32 %r4, 0x0;
|
76 |
+
mov.u32 %r5, 0x0;
|
77 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
78 |
+
@!%p1 mov.u32 %r2, %r6;
|
79 |
+
@!%p1 mov.u32 %r3, %r6;
|
80 |
+
@!%p1 mov.u32 %r4, %r6;
|
81 |
+
@!%p1 mov.u32 %r5, %r6;
|
82 |
+
mov.b32 %f1, %r2;
|
83 |
+
mov.b32 %f2, %r3;
|
84 |
+
mov.b32 %f3, %r4;
|
85 |
+
mov.b32 %f4, %r5;
|
86 |
+
mov.u32 %r10, 0x0;
|
87 |
+
mov.u32 %r11, 0x0;
|
88 |
+
mov.u32 %r12, 0x0;
|
89 |
+
mov.u32 %r13, 0x0;
|
90 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
|
91 |
+
@!%p1 mov.u32 %r10, %r6;
|
92 |
+
@!%p1 mov.u32 %r11, %r6;
|
93 |
+
@!%p1 mov.u32 %r12, %r6;
|
94 |
+
@!%p1 mov.u32 %r13, %r6;
|
95 |
+
mov.b32 %f5, %r10;
|
96 |
+
mov.b32 %f6, %r11;
|
97 |
+
mov.b32 %f7, %r12;
|
98 |
+
mov.b32 %f8, %r13;
|
99 |
+
mov.u32 %r18, 0x0;
|
100 |
+
mov.u32 %r19, 0x0;
|
101 |
+
mov.u32 %r20, 0x0;
|
102 |
+
mov.u32 %r21, 0x0;
|
103 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
|
104 |
+
@!%p1 mov.u32 %r18, %r6;
|
105 |
+
@!%p1 mov.u32 %r19, %r6;
|
106 |
+
@!%p1 mov.u32 %r20, %r6;
|
107 |
+
@!%p1 mov.u32 %r21, %r6;
|
108 |
+
mov.b32 %f9, %r18;
|
109 |
+
mov.b32 %f10, %r19;
|
110 |
+
mov.b32 %f11, %r20;
|
111 |
+
mov.b32 %f12, %r21;
|
112 |
+
mov.u32 %r26, 0x0;
|
113 |
+
mov.u32 %r27, 0x0;
|
114 |
+
mov.u32 %r28, 0x0;
|
115 |
+
mov.u32 %r29, 0x0;
|
116 |
+
@%p16 ld.global.L1::evict_first.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ];
|
117 |
+
@!%p16 mov.u32 %r26, %r6;
|
118 |
+
@!%p16 mov.u32 %r27, %r6;
|
119 |
+
@!%p16 mov.u32 %r28, %r6;
|
120 |
+
@!%p16 mov.u32 %r29, %r6;
|
121 |
+
mov.b32 %f13, %r26;
|
122 |
+
mov.b32 %f14, %r27;
|
123 |
+
mov.b32 %f15, %r28;
|
124 |
+
mov.b32 %f16, %r29;
|
125 |
+
.loc 1 33 23
|
126 |
+
add.f32 %f17, %f1, 0f00000000;
|
127 |
+
add.f32 %f18, %f2, 0f00000000;
|
128 |
+
add.f32 %f19, %f3, 0f00000000;
|
129 |
+
add.f32 %f20, %f4, 0f00000000;
|
130 |
+
add.f32 %f21, %f5, 0f00000000;
|
131 |
+
add.f32 %f22, %f6, 0f00000000;
|
132 |
+
add.f32 %f23, %f7, 0f00000000;
|
133 |
+
add.f32 %f24, %f8, 0f00000000;
|
134 |
+
add.f32 %f25, %f9, 0f00000000;
|
135 |
+
add.f32 %f26, %f10, 0f00000000;
|
136 |
+
add.f32 %f27, %f11, 0f00000000;
|
137 |
+
add.f32 %f28, %f12, 0f00000000;
|
138 |
+
add.f32 %f29, %f13, 0f00000000;
|
139 |
+
add.f32 %f30, %f14, 0f00000000;
|
140 |
+
add.f32 %f31, %f15, 0f00000000;
|
141 |
+
add.f32 %f32, %f16, 0f00000000;
|
142 |
+
.loc 1 34 38
|
143 |
+
selp.f32 %f33, %f29, 0f00000000, %p16;
|
144 |
+
selp.f32 %f34, %f30, 0f00000000, %p16;
|
145 |
+
selp.f32 %f35, %f31, 0f00000000, %p16;
|
146 |
+
selp.f32 %f36, %f32, 0f00000000, %p16;
|
147 |
+
$L__tmp1:
|
148 |
+
.loc 2 233 15
|
149 |
+
add.f32 %f37, %f17, %f21;
|
150 |
+
add.f32 %f38, %f18, %f22;
|
151 |
+
add.f32 %f39, %f19, %f23;
|
152 |
+
add.f32 %f40, %f20, %f24;
|
153 |
+
add.f32 %f41, %f37, %f25;
|
154 |
+
add.f32 %f42, %f38, %f26;
|
155 |
+
add.f32 %f43, %f39, %f27;
|
156 |
+
add.f32 %f44, %f40, %f28;
|
157 |
+
add.f32 %f45, %f41, %f33;
|
158 |
+
add.f32 %f46, %f42, %f34;
|
159 |
+
add.f32 %f47, %f43, %f35;
|
160 |
+
add.f32 %f48, %f44, %f36;
|
161 |
+
$L__tmp2:
|
162 |
+
.loc 2 243 36
|
163 |
+
mov.b32 %r66, %f45;
|
164 |
+
shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1;
|
165 |
+
mov.b32 %f49, %r67;
|
166 |
+
$L__tmp3:
|
167 |
+
.loc 2 233 15
|
168 |
+
add.f32 %f50, %f45, %f49;
|
169 |
+
$L__tmp4:
|
170 |
+
.loc 2 243 36
|
171 |
+
mov.b32 %r68, %f50;
|
172 |
+
shfl.sync.bfly.b32 %r69, %r68, 8, 31, -1;
|
173 |
+
mov.b32 %f51, %r69;
|
174 |
+
$L__tmp5:
|
175 |
+
.loc 2 233 15
|
176 |
+
add.f32 %f52, %f50, %f51;
|
177 |
+
$L__tmp6:
|
178 |
+
.loc 2 243 36
|
179 |
+
mov.b32 %r70, %f46;
|
180 |
+
shfl.sync.bfly.b32 %r71, %r70, 16, 31, -1;
|
181 |
+
mov.b32 %f53, %r71;
|
182 |
+
$L__tmp7:
|
183 |
+
.loc 2 233 15
|
184 |
+
add.f32 %f54, %f46, %f53;
|
185 |
+
$L__tmp8:
|
186 |
+
.loc 2 243 36
|
187 |
+
mov.b32 %r72, %f54;
|
188 |
+
shfl.sync.bfly.b32 %r73, %r72, 8, 31, -1;
|
189 |
+
mov.b32 %f55, %r73;
|
190 |
+
$L__tmp9:
|
191 |
+
.loc 2 233 15
|
192 |
+
add.f32 %f56, %f54, %f55;
|
193 |
+
$L__tmp10:
|
194 |
+
.loc 2 243 36
|
195 |
+
mov.b32 %r74, %f47;
|
196 |
+
shfl.sync.bfly.b32 %r75, %r74, 16, 31, -1;
|
197 |
+
mov.b32 %f57, %r75;
|
198 |
+
$L__tmp11:
|
199 |
+
.loc 2 233 15
|
200 |
+
add.f32 %f58, %f47, %f57;
|
201 |
+
$L__tmp12:
|
202 |
+
.loc 2 243 36
|
203 |
+
mov.b32 %r76, %f58;
|
204 |
+
shfl.sync.bfly.b32 %r77, %r76, 8, 31, -1;
|
205 |
+
mov.b32 %f59, %r77;
|
206 |
+
$L__tmp13:
|
207 |
+
.loc 2 233 15
|
208 |
+
add.f32 %f60, %f58, %f59;
|
209 |
+
$L__tmp14:
|
210 |
+
.loc 2 243 36
|
211 |
+
mov.b32 %r78, %f48;
|
212 |
+
shfl.sync.bfly.b32 %r79, %r78, 16, 31, -1;
|
213 |
+
mov.b32 %f61, %r79;
|
214 |
+
$L__tmp15:
|
215 |
+
.loc 2 233 15
|
216 |
+
add.f32 %f62, %f48, %f61;
|
217 |
+
$L__tmp16:
|
218 |
+
.loc 2 243 36
|
219 |
+
mov.b32 %r80, %f62;
|
220 |
+
shfl.sync.bfly.b32 %r81, %r80, 8, 31, -1;
|
221 |
+
mov.b32 %f63, %r81;
|
222 |
+
$L__tmp17:
|
223 |
+
.loc 2 233 15
|
224 |
+
add.f32 %f64, %f62, %f63;
|
225 |
+
$L__tmp18:
|
226 |
+
.loc 2 243 36
|
227 |
+
setp.lt.u32 %p21, %r49, 8;
|
228 |
+
shl.b32 %r82, %r50, 7;
|
229 |
+
or.b32 %r83, %r82, %r54;
|
230 |
+
mov.u32 %r84, global_smem;
|
231 |
+
add.s32 %r34, %r84, %r83;
|
232 |
+
mov.b32 %r35, %f52;
|
233 |
+
@%p21 st.shared.b32 [ %r34 + 0 ], %r35;
|
234 |
+
or.b32 %r85, %r82, 32;
|
235 |
+
or.b32 %r86, %r85, %r54;
|
236 |
+
add.s32 %r36, %r84, %r86;
|
237 |
+
mov.b32 %r37, %f56;
|
238 |
+
@%p21 st.shared.b32 [ %r36 + 0 ], %r37;
|
239 |
+
or.b32 %r87, %r82, 64;
|
240 |
+
or.b32 %r88, %r87, %r54;
|
241 |
+
add.s32 %r38, %r84, %r88;
|
242 |
+
mov.b32 %r39, %f60;
|
243 |
+
@%p21 st.shared.b32 [ %r38 + 0 ], %r39;
|
244 |
+
or.b32 %r89, %r82, 96;
|
245 |
+
or.b32 %r90, %r89, %r54;
|
246 |
+
add.s32 %r40, %r84, %r90;
|
247 |
+
mov.b32 %r41, %f64;
|
248 |
+
@%p21 st.shared.b32 [ %r40 + 0 ], %r41;
|
249 |
+
bar.sync 0;
|
250 |
+
setp.lt.s32 %p25, %r48, 256;
|
251 |
+
shl.b32 %r91, %r48, 2;
|
252 |
+
add.s32 %r43, %r84, %r91;
|
253 |
+
@%p25 ld.shared.b32 %r42, [ %r43 + 0 ];
|
254 |
+
mov.b32 %f65, %r42;
|
255 |
+
shfl.sync.bfly.b32 %r92, %r42, 4, 31, -1;
|
256 |
+
mov.b32 %f66, %r92;
|
257 |
+
$L__tmp19:
|
258 |
+
.loc 2 233 15
|
259 |
+
add.f32 %f67, %f65, %f66;
|
260 |
+
$L__tmp20:
|
261 |
+
.loc 2 243 36
|
262 |
+
mov.b32 %r93, %f67;
|
263 |
+
shfl.sync.bfly.b32 %r94, %r93, 2, 31, -1;
|
264 |
+
mov.b32 %f68, %r94;
|
265 |
+
$L__tmp21:
|
266 |
+
.loc 2 233 15
|
267 |
+
add.f32 %f69, %f67, %f68;
|
268 |
+
$L__tmp22:
|
269 |
+
.loc 2 243 36
|
270 |
+
mov.b32 %r95, %f69;
|
271 |
+
shfl.sync.bfly.b32 %r96, %r95, 1, 31, -1;
|
272 |
+
mov.b32 %f70, %r96;
|
273 |
+
$L__tmp23:
|
274 |
+
.loc 2 233 15
|
275 |
+
add.f32 %f71, %f69, %f70;
|
276 |
+
$L__tmp24:
|
277 |
+
.loc 2 243 36
|
278 |
+
setp.eq.s32 %p29, %r50, 0;
|
279 |
+
and.pred %p26, %p25, %p29;
|
280 |
+
mov.b32 %r45, %f71;
|
281 |
+
@%p26 st.shared.b32 [ %r43 + 0 ], %r45;
|
282 |
+
bar.sync 0;
|
283 |
+
add.s32 %r97, %r84, %r82;
|
284 |
+
ld.shared.f32 %f72, [%r97];
|
285 |
+
add.s32 %r98, %r84, %r85;
|
286 |
+
ld.shared.f32 %f73, [%r98];
|
287 |
+
add.s32 %r99, %r84, %r87;
|
288 |
+
ld.shared.f32 %f74, [%r99];
|
289 |
+
add.s32 %r100, %r84, %r89;
|
290 |
+
ld.shared.f32 %f75, [%r100];
|
291 |
+
$L__tmp25:
|
292 |
+
.loc 1 35 28
|
293 |
+
bar.sync 0;
|
294 |
+
shl.b32 %r101, %r50, 4;
|
295 |
+
add.s32 %r102, %r84, %r101;
|
296 |
+
st.shared.f32 [%r102], %f72;
|
297 |
+
st.shared.f32 [%r102+4], %f73;
|
298 |
+
st.shared.f32 [%r102+8], %f74;
|
299 |
+
st.shared.f32 [%r102+12], %f75;
|
300 |
+
bar.sync 0;
|
301 |
+
shl.b32 %r103, %r49, 2;
|
302 |
+
add.s32 %r104, %r84, %r103;
|
303 |
+
.loc 1 36 20
|
304 |
+
shr.s32 %r106, %r59, 31;
|
305 |
+
shr.u32 %r107, %r106, 24;
|
306 |
+
add.s32 %r108, %r59, %r107;
|
307 |
+
shr.s32 %r109, %r108, 8;
|
308 |
+
and.b32 %r110, %r108, -256;
|
309 |
+
sub.s32 %r111, %r59, %r110;
|
310 |
+
.loc 1 38 30
|
311 |
+
mul.wide.s32 %rd15, %r109, 8;
|
312 |
+
add.s64 %rd6, %rd9, %rd15;
|
313 |
+
.loc 1 45 55
|
314 |
+
ld.shared.u32 %r47, [%r104];
|
315 |
+
.loc 1 38 35
|
316 |
+
mov.u64 %rd5, 0x0;
|
317 |
+
@%p1 ld.global.L1::evict_last.b64 { %rd5 }, [ %rd6 + 0 ];
|
318 |
+
.loc 1 41 32
|
319 |
+
shr.u64 %rd16, %rd5, 54;
|
320 |
+
and.b64 %rd17, %rd16, 512;
|
321 |
+
add.s64 %rd18, %rd17, %rd5;
|
322 |
+
.loc 1 45 30
|
323 |
+
shl.b64 %rd19, %rd18, 10;
|
324 |
+
add.s64 %rd20, %rd10, %rd19;
|
325 |
+
mul.wide.s32 %rd21, %r111, 4;
|
326 |
+
add.s64 %rd7, %rd20, %rd21;
|
327 |
+
.loc 1 45 55
|
328 |
+
setp.eq.s32 %p28, %r52, 0;
|
329 |
+
mov.u32 %r46, 0x0;
|
330 |
+
@%p28 atom.global.gpu.acq_rel.add.f32 %r46, [ %rd7 + 0 ], %r47;
|
331 |
+
.loc 1 45 4
|
332 |
+
ret;
|
333 |
+
$L__tmp26:
|
334 |
+
$L__func_end0:
|
335 |
+
|
336 |
+
}
|
337 |
+
.file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
|
338 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
339 |
+
.section .debug_abbrev
|
340 |
+
{
|
341 |
+
.b8 1
|
342 |
+
.b8 17
|
343 |
+
.b8 1
|
344 |
+
.b8 37
|
345 |
+
.b8 8
|
346 |
+
.b8 19
|
347 |
+
.b8 5
|
348 |
+
.b8 3
|
349 |
+
.b8 8
|
350 |
+
.b8 16
|
351 |
+
.b8 6
|
352 |
+
.b8 27
|
353 |
+
.b8 8
|
354 |
+
.b8 180
|
355 |
+
.b8 66
|
356 |
+
.b8 12
|
357 |
+
.b8 17
|
358 |
+
.b8 1
|
359 |
+
.b8 18
|
360 |
+
.b8 1
|
361 |
+
.b8 0
|
362 |
+
.b8 0
|
363 |
+
.b8 2
|
364 |
+
.b8 46
|
365 |
+
.b8 0
|
366 |
+
.b8 135
|
367 |
+
.b8 64
|
368 |
+
.b8 8
|
369 |
+
.b8 3
|
370 |
+
.b8 8
|
371 |
+
.b8 58
|
372 |
+
.b8 11
|
373 |
+
.b8 59
|
374 |
+
.b8 11
|
375 |
+
.b8 63
|
376 |
+
.b8 12
|
377 |
+
.b8 32
|
378 |
+
.b8 11
|
379 |
+
.b8 0
|
380 |
+
.b8 0
|
381 |
+
.b8 3
|
382 |
+
.b8 46
|
383 |
+
.b8 1
|
384 |
+
.b8 17
|
385 |
+
.b8 1
|
386 |
+
.b8 18
|
387 |
+
.b8 1
|
388 |
+
.b8 64
|
389 |
+
.b8 10
|
390 |
+
.b8 49
|
391 |
+
.b8 19
|
392 |
+
.b8 0
|
393 |
+
.b8 0
|
394 |
+
.b8 4
|
395 |
+
.b8 29
|
396 |
+
.b8 1
|
397 |
+
.b8 49
|
398 |
+
.b8 19
|
399 |
+
.b8 17
|
400 |
+
.b8 1
|
401 |
+
.b8 18
|
402 |
+
.b8 1
|
403 |
+
.b8 88
|
404 |
+
.b8 11
|
405 |
+
.b8 89
|
406 |
+
.b8 11
|
407 |
+
.b8 87
|
408 |
+
.b8 11
|
409 |
+
.b8 0
|
410 |
+
.b8 0
|
411 |
+
.b8 5
|
412 |
+
.b8 29
|
413 |
+
.b8 0
|
414 |
+
.b8 49
|
415 |
+
.b8 19
|
416 |
+
.b8 17
|
417 |
+
.b8 1
|
418 |
+
.b8 18
|
419 |
+
.b8 1
|
420 |
+
.b8 88
|
421 |
+
.b8 11
|
422 |
+
.b8 89
|
423 |
+
.b8 11
|
424 |
+
.b8 87
|
425 |
+
.b8 11
|
426 |
+
.b8 0
|
427 |
+
.b8 0
|
428 |
+
.b8 0
|
429 |
+
}
|
430 |
+
.section .debug_info
|
431 |
+
{
|
432 |
+
.b32 264
|
433 |
+
.b8 2
|
434 |
+
.b8 0
|
435 |
+
.b32 .debug_abbrev
|
436 |
+
.b8 8
|
437 |
+
.b8 1
|
438 |
+
.b8 116
|
439 |
+
.b8 114
|
440 |
+
.b8 105
|
441 |
+
.b8 116
|
442 |
+
.b8 111
|
443 |
+
.b8 110
|
444 |
+
.b8 0
|
445 |
+
.b8 2
|
446 |
+
.b8 0
|
447 |
+
.b8 99
|
448 |
+
.b8 54
|
449 |
+
.b8 105
|
450 |
+
.b8 107
|
451 |
+
.b8 53
|
452 |
+
.b8 118
|
453 |
+
.b8 120
|
454 |
+
.b8 55
|
455 |
+
.b8 112
|
456 |
+
.b8 50
|
457 |
+
.b8 50
|
458 |
+
.b8 102
|
459 |
+
.b8 112
|
460 |
+
.b8 107
|
461 |
+
.b8 52
|
462 |
+
.b8 100
|
463 |
+
.b8 99
|
464 |
+
.b8 118
|
465 |
+
.b8 104
|
466 |
+
.b8 53
|
467 |
+
.b8 53
|
468 |
+
.b8 122
|
469 |
+
.b8 105
|
470 |
+
.b8 109
|
471 |
+
.b8 119
|
472 |
+
.b8 52
|
473 |
+
.b8 116
|
474 |
+
.b8 53
|
475 |
+
.b8 110
|
476 |
+
.b8 114
|
477 |
+
.b8 53
|
478 |
+
.b8 122
|
479 |
+
.b8 110
|
480 |
+
.b8 50
|
481 |
+
.b8 98
|
482 |
+
.b8 55
|
483 |
+
.b8 105
|
484 |
+
.b8 110
|
485 |
+
.b8 117
|
486 |
+
.b8 106
|
487 |
+
.b8 120
|
488 |
+
.b8 106
|
489 |
+
.b8 97
|
490 |
+
.b8 117
|
491 |
+
.b8 120
|
492 |
+
.b8 115
|
493 |
+
.b8 104
|
494 |
+
.b8 108
|
495 |
+
.b8 106
|
496 |
+
.b8 117
|
497 |
+
.b8 109
|
498 |
+
.b8 109
|
499 |
+
.b8 46
|
500 |
+
.b8 112
|
501 |
+
.b8 121
|
502 |
+
.b8 0
|
503 |
+
.b32 .debug_line
|
504 |
+
.b8 47
|
505 |
+
.b8 116
|
506 |
+
.b8 109
|
507 |
+
.b8 112
|
508 |
+
.b8 47
|
509 |
+
.b8 116
|
510 |
+
.b8 111
|
511 |
+
.b8 114
|
512 |
+
.b8 99
|
513 |
+
.b8 104
|
514 |
+
.b8 105
|
515 |
+
.b8 110
|
516 |
+
.b8 100
|
517 |
+
.b8 117
|
518 |
+
.b8 99
|
519 |
+
.b8 116
|
520 |
+
.b8 111
|
521 |
+
.b8 114
|
522 |
+
.b8 95
|
523 |
+
.b8 114
|
524 |
+
.b8 111
|
525 |
+
.b8 111
|
526 |
+
.b8 116
|
527 |
+
.b8 47
|
528 |
+
.b8 54
|
529 |
+
.b8 105
|
530 |
+
.b8 0
|
531 |
+
.b8 1
|
532 |
+
.b64 $L__func_begin0
|
533 |
+
.b64 $L__func_end0
|
534 |
+
.b8 2
|
535 |
+
.b8 116
|
536 |
+
.b8 114
|
537 |
+
.b8 105
|
538 |
+
.b8 116
|
539 |
+
.b8 111
|
540 |
+
.b8 110
|
541 |
+
.b8 95
|
542 |
+
.b8 95
|
543 |
+
.b8 48
|
544 |
+
.b8 100
|
545 |
+
.b8 49
|
546 |
+
.b8 100
|
547 |
+
.b8 50
|
548 |
+
.b8 100
|
549 |
+
.b8 51
|
550 |
+
.b8 100
|
551 |
+
.b8 101
|
552 |
+
.b8 52
|
553 |
+
.b8 101
|
554 |
+
.b8 0
|
555 |
+
.b8 116
|
556 |
+
.b8 114
|
557 |
+
.b8 105
|
558 |
+
.b8 116
|
559 |
+
.b8 111
|
560 |
+
.b8 110
|
561 |
+
.b8 95
|
562 |
+
.b8 95
|
563 |
+
.b8 48
|
564 |
+
.b8 100
|
565 |
+
.b8 49
|
566 |
+
.b8 100
|
567 |
+
.b8 50
|
568 |
+
.b8 100
|
569 |
+
.b8 51
|
570 |
+
.b8 100
|
571 |
+
.b8 101
|
572 |
+
.b8 52
|
573 |
+
.b8 101
|
574 |
+
.b8 0
|
575 |
+
.b8 1
|
576 |
+
.b8 18
|
577 |
+
.b8 1
|
578 |
+
.b8 1
|
579 |
+
.b8 3
|
580 |
+
.b64 $L__func_begin0
|
581 |
+
.b64 $L__func_end0
|
582 |
+
.b8 1
|
583 |
+
.b8 156
|
584 |
+
.b32 125
|
585 |
+
.b8 4
|
586 |
+
.b32 125
|
587 |
+
.b64 $L__tmp1
|
588 |
+
.b64 $L__tmp24
|
589 |
+
.b8 2
|
590 |
+
.b8 35
|
591 |
+
.b8 25
|
592 |
+
.b8 5
|
593 |
+
.b32 125
|
594 |
+
.b64 $L__tmp1
|
595 |
+
.b64 $L__tmp24
|
596 |
+
.b8 2
|
597 |
+
.b8 243
|
598 |
+
.b8 36
|
599 |
+
.b8 0
|
600 |
+
.b8 5
|
601 |
+
.b32 125
|
602 |
+
.b64 $L__tmp2
|
603 |
+
.b64 $L__tmp25
|
604 |
+
.b8 2
|
605 |
+
.b8 35
|
606 |
+
.b8 25
|
607 |
+
.b8 0
|
608 |
+
.b8 0
|
609 |
+
}
|
610 |
+
.section .debug_pubnames
|
611 |
+
{
|
612 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
613 |
+
$L__pubNames_start0:
|
614 |
+
.b8 2
|
615 |
+
.b8 0
|
616 |
+
.b32 .debug_info
|
617 |
+
.b32 268
|
618 |
+
.b32 125
|
619 |
+
.b8 116
|
620 |
+
.b8 114
|
621 |
+
.b8 105
|
622 |
+
.b8 116
|
623 |
+
.b8 111
|
624 |
+
.b8 110
|
625 |
+
.b8 95
|
626 |
+
.b8 95
|
627 |
+
.b8 48
|
628 |
+
.b8 100
|
629 |
+
.b8 49
|
630 |
+
.b8 100
|
631 |
+
.b8 50
|
632 |
+
.b8 100
|
633 |
+
.b8 51
|
634 |
+
.b8 100
|
635 |
+
.b8 101
|
636 |
+
.b8 52
|
637 |
+
.b8 101
|
638 |
+
.b8 0
|
639 |
+
.b32 0
|
640 |
+
$L__pubNames_end0:
|
641 |
+
}
|
642 |
+
.section .debug_pubtypes
|
643 |
+
{
|
644 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
645 |
+
$L__pubTypes_start0:
|
646 |
+
.b8 2
|
647 |
+
.b8 0
|
648 |
+
.b32 .debug_info
|
649 |
+
.b32 268
|
650 |
+
.b32 0
|
651 |
+
$L__pubTypes_end0:
|
652 |
+
}
|
653 |
+
.section .debug_loc { }
|
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.llir
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
|
7 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%5 = and i32 %4, 127, !dbg !8
|
9 |
+
%6 = shl nuw nsw i32 %5, 3, !dbg !8
|
10 |
+
%7 = shl nuw nsw i32 %5, 2, !dbg !8
|
11 |
+
%8 = or i32 %7, 512, !dbg !8
|
12 |
+
%9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !9
|
13 |
+
%10 = shl i32 %9, 10, !dbg !10
|
14 |
+
%11 = or i32 %10, %6, !dbg !11
|
15 |
+
%12 = or i32 %10, %7, !dbg !11
|
16 |
+
%13 = or i32 %10, %8, !dbg !11
|
17 |
+
%14 = sext i32 %11 to i64, !dbg !12
|
18 |
+
%15 = getelementptr i16, ptr addrspace(1) %0, i64 %14, !dbg !12
|
19 |
+
%16 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %15, i1 true) #2, !dbg !13
|
20 |
+
%17 = extractvalue { i32, i32, i32, i32 } %16, 0, !dbg !13
|
21 |
+
%18 = extractvalue { i32, i32, i32, i32 } %16, 1, !dbg !13
|
22 |
+
%19 = extractvalue { i32, i32, i32, i32 } %16, 2, !dbg !13
|
23 |
+
%20 = extractvalue { i32, i32, i32, i32 } %16, 3, !dbg !13
|
24 |
+
%21 = trunc i32 %17 to i16, !dbg !13
|
25 |
+
%extelt.offset = lshr i32 %17, 16, !dbg !13
|
26 |
+
%22 = trunc i32 %extelt.offset to i16, !dbg !13
|
27 |
+
%23 = trunc i32 %18 to i16, !dbg !13
|
28 |
+
%extelt.offset1 = lshr i32 %18, 16, !dbg !13
|
29 |
+
%24 = trunc i32 %extelt.offset1 to i16, !dbg !13
|
30 |
+
%25 = trunc i32 %19 to i16, !dbg !13
|
31 |
+
%extelt.offset2 = lshr i32 %19, 16, !dbg !13
|
32 |
+
%26 = trunc i32 %extelt.offset2 to i16, !dbg !13
|
33 |
+
%27 = trunc i32 %20 to i16, !dbg !13
|
34 |
+
%extelt.offset3 = lshr i32 %20, 16, !dbg !13
|
35 |
+
%28 = trunc i32 %extelt.offset3 to i16, !dbg !13
|
36 |
+
%29 = zext nneg i32 %6 to i64, !dbg !14
|
37 |
+
%30 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %29, !dbg !14
|
38 |
+
%31 = insertelement <1 x i16> undef, i16 %21, i64 0, !dbg !14
|
39 |
+
store <1 x i16> %31, ptr addrspace(3) %30, align 2, !dbg !14
|
40 |
+
%32 = or i32 %6, 1, !dbg !14
|
41 |
+
%33 = zext nneg i32 %32 to i64, !dbg !14
|
42 |
+
%34 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %33, !dbg !14
|
43 |
+
%35 = insertelement <1 x i16> undef, i16 %22, i64 0, !dbg !14
|
44 |
+
store <1 x i16> %35, ptr addrspace(3) %34, align 2, !dbg !14
|
45 |
+
%36 = or i32 %6, 2, !dbg !14
|
46 |
+
%37 = zext nneg i32 %36 to i64, !dbg !14
|
47 |
+
%38 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %37, !dbg !14
|
48 |
+
%39 = insertelement <1 x i16> undef, i16 %23, i64 0, !dbg !14
|
49 |
+
store <1 x i16> %39, ptr addrspace(3) %38, align 2, !dbg !14
|
50 |
+
%40 = or i32 %6, 3, !dbg !14
|
51 |
+
%41 = zext nneg i32 %40 to i64, !dbg !14
|
52 |
+
%42 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %41, !dbg !14
|
53 |
+
%43 = insertelement <1 x i16> undef, i16 %24, i64 0, !dbg !14
|
54 |
+
store <1 x i16> %43, ptr addrspace(3) %42, align 2, !dbg !14
|
55 |
+
%44 = or i32 %6, 4, !dbg !14
|
56 |
+
%45 = zext nneg i32 %44 to i64, !dbg !14
|
57 |
+
%46 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %45, !dbg !14
|
58 |
+
%47 = insertelement <1 x i16> undef, i16 %25, i64 0, !dbg !14
|
59 |
+
store <1 x i16> %47, ptr addrspace(3) %46, align 2, !dbg !14
|
60 |
+
%48 = or i32 %6, 5, !dbg !14
|
61 |
+
%49 = zext nneg i32 %48 to i64, !dbg !14
|
62 |
+
%50 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %49, !dbg !14
|
63 |
+
%51 = insertelement <1 x i16> undef, i16 %26, i64 0, !dbg !14
|
64 |
+
store <1 x i16> %51, ptr addrspace(3) %50, align 2, !dbg !14
|
65 |
+
%52 = or i32 %6, 6, !dbg !14
|
66 |
+
%53 = zext nneg i32 %52 to i64, !dbg !14
|
67 |
+
%54 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %53, !dbg !14
|
68 |
+
%55 = insertelement <1 x i16> undef, i16 %27, i64 0, !dbg !14
|
69 |
+
store <1 x i16> %55, ptr addrspace(3) %54, align 2, !dbg !14
|
70 |
+
%56 = or i32 %6, 7, !dbg !14
|
71 |
+
%57 = zext nneg i32 %56 to i64, !dbg !14
|
72 |
+
%58 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %57, !dbg !14
|
73 |
+
%59 = insertelement <1 x i16> undef, i16 %28, i64 0, !dbg !14
|
74 |
+
store <1 x i16> %59, ptr addrspace(3) %58, align 2, !dbg !14
|
75 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !14
|
76 |
+
%60 = zext nneg i32 %7 to i64, !dbg !14
|
77 |
+
%61 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %60, !dbg !14
|
78 |
+
%62 = load i16, ptr addrspace(3) %61, align 2, !dbg !14
|
79 |
+
%63 = or i32 %7, 1, !dbg !14
|
80 |
+
%64 = zext nneg i32 %63 to i64, !dbg !14
|
81 |
+
%65 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %64, !dbg !14
|
82 |
+
%66 = load i16, ptr addrspace(3) %65, align 2, !dbg !14
|
83 |
+
%67 = or i32 %7, 2, !dbg !14
|
84 |
+
%68 = zext nneg i32 %67 to i64, !dbg !14
|
85 |
+
%69 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %68, !dbg !14
|
86 |
+
%70 = load i16, ptr addrspace(3) %69, align 2, !dbg !14
|
87 |
+
%71 = or i32 %7, 3, !dbg !14
|
88 |
+
%72 = zext nneg i32 %71 to i64, !dbg !14
|
89 |
+
%73 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %72, !dbg !14
|
90 |
+
%74 = load i16, ptr addrspace(3) %73, align 2, !dbg !14
|
91 |
+
%75 = zext nneg i32 %8 to i64, !dbg !14
|
92 |
+
%76 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %75, !dbg !14
|
93 |
+
%77 = load i16, ptr addrspace(3) %76, align 2, !dbg !14
|
94 |
+
%78 = or i32 %7, 513, !dbg !14
|
95 |
+
%79 = zext nneg i32 %78 to i64, !dbg !14
|
96 |
+
%80 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %79, !dbg !14
|
97 |
+
%81 = load i16, ptr addrspace(3) %80, align 2, !dbg !14
|
98 |
+
%82 = or i32 %7, 514, !dbg !14
|
99 |
+
%83 = zext nneg i32 %82 to i64, !dbg !14
|
100 |
+
%84 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %83, !dbg !14
|
101 |
+
%85 = load i16, ptr addrspace(3) %84, align 2, !dbg !14
|
102 |
+
%86 = or i32 %7, 515, !dbg !14
|
103 |
+
%87 = zext nneg i32 %86 to i64, !dbg !14
|
104 |
+
%88 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %87, !dbg !14
|
105 |
+
%89 = load i16, ptr addrspace(3) %88, align 2, !dbg !14
|
106 |
+
%90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %62) #2, !dbg !14
|
107 |
+
%91 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %66) #2, !dbg !14
|
108 |
+
%92 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #2, !dbg !14
|
109 |
+
%93 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %74) #2, !dbg !14
|
110 |
+
%94 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %77) #2, !dbg !14
|
111 |
+
%95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %81) #2, !dbg !14
|
112 |
+
%96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #2, !dbg !14
|
113 |
+
%97 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %89) #2, !dbg !14
|
114 |
+
%98 = sext i32 %12 to i64, !dbg !15
|
115 |
+
%99 = getelementptr float, ptr addrspace(1) %1, i64 %98, !dbg !15
|
116 |
+
%100 = sext i32 %13 to i64, !dbg !15
|
117 |
+
%101 = getelementptr float, ptr addrspace(1) %1, i64 %100, !dbg !15
|
118 |
+
%102 = bitcast float %90 to i32, !dbg !16
|
119 |
+
%103 = bitcast float %91 to i32, !dbg !16
|
120 |
+
%104 = bitcast float %92 to i32, !dbg !16
|
121 |
+
%105 = bitcast float %93 to i32, !dbg !16
|
122 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %102, i32 %103, i32 %104, i32 %105, ptr addrspace(1) %99, i1 true) #2, !dbg !16
|
123 |
+
%106 = bitcast float %94 to i32, !dbg !16
|
124 |
+
%107 = bitcast float %95 to i32, !dbg !16
|
125 |
+
%108 = bitcast float %96 to i32, !dbg !16
|
126 |
+
%109 = bitcast float %97 to i32, !dbg !16
|
127 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %106, i32 %107, i32 %108, i32 %109, ptr addrspace(1) %101, i1 true) #2, !dbg !16
|
128 |
+
ret void, !dbg !17
|
129 |
+
}
|
130 |
+
|
131 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
132 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
133 |
+
|
134 |
+
; Function Attrs: convergent nocallback nounwind
|
135 |
+
declare void @llvm.nvvm.barrier0() #1
|
136 |
+
|
137 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
138 |
+
attributes #1 = { convergent nocallback nounwind }
|
139 |
+
attributes #2 = { nounwind }
|
140 |
+
|
141 |
+
!llvm.module.flags = !{!0}
|
142 |
+
!llvm.dbg.cu = !{!1}
|
143 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
144 |
+
|
145 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
146 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
147 |
+
!2 = !DIFile(filename: "cotbhet37v6mh5samql7uxre3hprpnbhuvim3fmrjpq5fgg6lwbi.py", directory: "/tmp/torchinductor_root/ot")
|
148 |
+
!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
149 |
+
!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
|
150 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
151 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
152 |
+
!7 = !{}
|
153 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
154 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
155 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
156 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
157 |
+
!12 = !DILocation(line: 24, column: 30, scope: !5)
|
158 |
+
!13 = !DILocation(line: 24, column: 35, scope: !5)
|
159 |
+
!14 = !DILocation(line: 24, column: 44, scope: !5)
|
160 |
+
!15 = !DILocation(line: 26, column: 25, scope: !5)
|
161 |
+
!16 = !DILocation(line: 26, column: 36, scope: !5)
|
162 |
+
!17 = !DILocation(line: 26, column: 4, scope: !5)
|
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ptx
ADDED
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2de(
|
13 |
+
.param .u64 triton__0d1d2de_param_0,
|
14 |
+
.param .u64 triton__0d1d2de_param_1,
|
15 |
+
.param .u32 triton__0d1d2de_param_2
|
16 |
+
)
|
17 |
+
.maxntid 128, 1, 1
|
18 |
+
{
|
19 |
+
.reg .pred %p<4>;
|
20 |
+
.reg .b16 %rs<9>;
|
21 |
+
.reg .b32 %r<37>;
|
22 |
+
.reg .b64 %rd<13>;
|
23 |
+
.loc 1 18 0
|
24 |
+
$L__func_begin0:
|
25 |
+
.loc 1 18 0
|
26 |
+
|
27 |
+
ld.param.u64 %rd4, [triton__0d1d2de_param_0];
|
28 |
+
ld.param.u64 %rd5, [triton__0d1d2de_param_1];
|
29 |
+
$L__tmp0:
|
30 |
+
.loc 1 21 36
|
31 |
+
mov.u32 %r22, %tid.x;
|
32 |
+
and.b32 %r23, %r22, 127;
|
33 |
+
shl.b32 %r24, %r23, 3;
|
34 |
+
shl.b32 %r25, %r23, 2;
|
35 |
+
.loc 1 20 28
|
36 |
+
mov.u32 %r1, %ctaid.x;
|
37 |
+
.loc 1 20 33
|
38 |
+
shl.b32 %r26, %r1, 10;
|
39 |
+
.loc 1 21 23
|
40 |
+
or.b32 %r27, %r26, %r24;
|
41 |
+
or.b32 %r28, %r26, %r25;
|
42 |
+
.loc 1 24 30
|
43 |
+
mul.wide.s32 %rd6, %r27, 2;
|
44 |
+
add.s64 %rd1, %rd4, %rd6;
|
45 |
+
mov.pred %p1, -1;
|
46 |
+
.loc 1 24 35
|
47 |
+
mov.u32 %r2, 0x0;
|
48 |
+
mov.u32 %r3, 0x0;
|
49 |
+
mov.u32 %r4, 0x0;
|
50 |
+
mov.u32 %r5, 0x0;
|
51 |
+
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
52 |
+
shr.u32 %r29, %r2, 16;
|
53 |
+
shr.u32 %r30, %r3, 16;
|
54 |
+
shr.u32 %r31, %r4, 16;
|
55 |
+
shr.u32 %r32, %r5, 16;
|
56 |
+
.loc 1 24 44
|
57 |
+
shl.b32 %r33, %r23, 4;
|
58 |
+
mov.u32 %r34, global_smem;
|
59 |
+
add.s32 %r35, %r34, %r33;
|
60 |
+
st.shared.u16 [%r35], %r2;
|
61 |
+
st.shared.u16 [%r35+2], %r29;
|
62 |
+
st.shared.u16 [%r35+4], %r3;
|
63 |
+
st.shared.u16 [%r35+6], %r30;
|
64 |
+
st.shared.u16 [%r35+8], %r4;
|
65 |
+
st.shared.u16 [%r35+10], %r31;
|
66 |
+
st.shared.u16 [%r35+12], %r5;
|
67 |
+
st.shared.u16 [%r35+14], %r32;
|
68 |
+
bar.sync 0;
|
69 |
+
add.s32 %r36, %r34, %r24;
|
70 |
+
ld.shared.u16 %rs1, [%r36];
|
71 |
+
ld.shared.u16 %rs2, [%r36+2];
|
72 |
+
ld.shared.u16 %rs3, [%r36+4];
|
73 |
+
ld.shared.u16 %rs4, [%r36+6];
|
74 |
+
ld.shared.u16 %rs5, [%r36+1024];
|
75 |
+
ld.shared.u16 %rs6, [%r36+1026];
|
76 |
+
ld.shared.u16 %rs7, [%r36+1028];
|
77 |
+
ld.shared.u16 %rs8, [%r36+1030];
|
78 |
+
cvt.f32.bf16 %r14, %rs1;
|
79 |
+
cvt.f32.bf16 %r15, %rs2;
|
80 |
+
cvt.f32.bf16 %r16, %rs3;
|
81 |
+
cvt.f32.bf16 %r17, %rs4;
|
82 |
+
cvt.f32.bf16 %r18, %rs5;
|
83 |
+
cvt.f32.bf16 %r19, %rs6;
|
84 |
+
cvt.f32.bf16 %r20, %rs7;
|
85 |
+
cvt.f32.bf16 %r21, %rs8;
|
86 |
+
.loc 1 26 25
|
87 |
+
mul.wide.s32 %rd7, %r28, 4;
|
88 |
+
add.s64 %rd2, %rd5, %rd7;
|
89 |
+
cvt.s64.s32 %rd8, %r26;
|
90 |
+
cvt.u64.u32 %rd9, %r25;
|
91 |
+
or.b64 %rd10, %rd8, %rd9;
|
92 |
+
shl.b64 %rd11, %rd10, 2;
|
93 |
+
add.s64 %rd12, %rd5, %rd11;
|
94 |
+
add.s64 %rd3, %rd12, 2048;
|
95 |
+
.loc 1 26 36
|
96 |
+
@%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r14, %r15, %r16, %r17 };
|
97 |
+
@%p1 st.global.v4.b32 [ %rd3 + 0 ], { %r18, %r19, %r20, %r21 };
|
98 |
+
.loc 1 26 4
|
99 |
+
ret;
|
100 |
+
$L__tmp1:
|
101 |
+
$L__func_end0:
|
102 |
+
|
103 |
+
}
|
104 |
+
.file 1 "/tmp/torchinductor_root/ot/cotbhet37v6mh5samql7uxre3hprpnbhuvim3fmrjpq5fgg6lwbi.py"
|
105 |
+
.section .debug_abbrev
|
106 |
+
{
|
107 |
+
.b8 1
|
108 |
+
.b8 17
|
109 |
+
.b8 1
|
110 |
+
.b8 37
|
111 |
+
.b8 8
|
112 |
+
.b8 19
|
113 |
+
.b8 5
|
114 |
+
.b8 3
|
115 |
+
.b8 8
|
116 |
+
.b8 16
|
117 |
+
.b8 6
|
118 |
+
.b8 27
|
119 |
+
.b8 8
|
120 |
+
.b8 180
|
121 |
+
.b8 66
|
122 |
+
.b8 12
|
123 |
+
.b8 17
|
124 |
+
.b8 1
|
125 |
+
.b8 18
|
126 |
+
.b8 1
|
127 |
+
.b8 0
|
128 |
+
.b8 0
|
129 |
+
.b8 2
|
130 |
+
.b8 46
|
131 |
+
.b8 0
|
132 |
+
.b8 17
|
133 |
+
.b8 1
|
134 |
+
.b8 18
|
135 |
+
.b8 1
|
136 |
+
.b8 64
|
137 |
+
.b8 10
|
138 |
+
.b8 135
|
139 |
+
.b8 64
|
140 |
+
.b8 8
|
141 |
+
.b8 3
|
142 |
+
.b8 8
|
143 |
+
.b8 58
|
144 |
+
.b8 11
|
145 |
+
.b8 59
|
146 |
+
.b8 11
|
147 |
+
.b8 63
|
148 |
+
.b8 12
|
149 |
+
.b8 0
|
150 |
+
.b8 0
|
151 |
+
.b8 0
|
152 |
+
}
|
153 |
+
.section .debug_info
|
154 |
+
{
|
155 |
+
.b32 176
|
156 |
+
.b8 2
|
157 |
+
.b8 0
|
158 |
+
.b32 .debug_abbrev
|
159 |
+
.b8 8
|
160 |
+
.b8 1
|
161 |
+
.b8 116
|
162 |
+
.b8 114
|
163 |
+
.b8 105
|
164 |
+
.b8 116
|
165 |
+
.b8 111
|
166 |
+
.b8 110
|
167 |
+
.b8 0
|
168 |
+
.b8 2
|
169 |
+
.b8 0
|
170 |
+
.b8 99
|
171 |
+
.b8 111
|
172 |
+
.b8 116
|
173 |
+
.b8 98
|
174 |
+
.b8 104
|
175 |
+
.b8 101
|
176 |
+
.b8 116
|
177 |
+
.b8 51
|
178 |
+
.b8 55
|
179 |
+
.b8 118
|
180 |
+
.b8 54
|
181 |
+
.b8 109
|
182 |
+
.b8 104
|
183 |
+
.b8 53
|
184 |
+
.b8 115
|
185 |
+
.b8 97
|
186 |
+
.b8 109
|
187 |
+
.b8 113
|
188 |
+
.b8 108
|
189 |
+
.b8 55
|
190 |
+
.b8 117
|
191 |
+
.b8 120
|
192 |
+
.b8 114
|
193 |
+
.b8 101
|
194 |
+
.b8 51
|
195 |
+
.b8 104
|
196 |
+
.b8 112
|
197 |
+
.b8 114
|
198 |
+
.b8 112
|
199 |
+
.b8 110
|
200 |
+
.b8 98
|
201 |
+
.b8 104
|
202 |
+
.b8 117
|
203 |
+
.b8 118
|
204 |
+
.b8 105
|
205 |
+
.b8 109
|
206 |
+
.b8 51
|
207 |
+
.b8 102
|
208 |
+
.b8 109
|
209 |
+
.b8 114
|
210 |
+
.b8 106
|
211 |
+
.b8 112
|
212 |
+
.b8 113
|
213 |
+
.b8 53
|
214 |
+
.b8 102
|
215 |
+
.b8 103
|
216 |
+
.b8 103
|
217 |
+
.b8 54
|
218 |
+
.b8 108
|
219 |
+
.b8 119
|
220 |
+
.b8 98
|
221 |
+
.b8 105
|
222 |
+
.b8 46
|
223 |
+
.b8 112
|
224 |
+
.b8 121
|
225 |
+
.b8 0
|
226 |
+
.b32 .debug_line
|
227 |
+
.b8 47
|
228 |
+
.b8 116
|
229 |
+
.b8 109
|
230 |
+
.b8 112
|
231 |
+
.b8 47
|
232 |
+
.b8 116
|
233 |
+
.b8 111
|
234 |
+
.b8 114
|
235 |
+
.b8 99
|
236 |
+
.b8 104
|
237 |
+
.b8 105
|
238 |
+
.b8 110
|
239 |
+
.b8 100
|
240 |
+
.b8 117
|
241 |
+
.b8 99
|
242 |
+
.b8 116
|
243 |
+
.b8 111
|
244 |
+
.b8 114
|
245 |
+
.b8 95
|
246 |
+
.b8 114
|
247 |
+
.b8 111
|
248 |
+
.b8 111
|
249 |
+
.b8 116
|
250 |
+
.b8 47
|
251 |
+
.b8 111
|
252 |
+
.b8 116
|
253 |
+
.b8 0
|
254 |
+
.b8 1
|
255 |
+
.b64 $L__func_begin0
|
256 |
+
.b64 $L__func_end0
|
257 |
+
.b8 2
|
258 |
+
.b64 $L__func_begin0
|
259 |
+
.b64 $L__func_end0
|
260 |
+
.b8 1
|
261 |
+
.b8 156
|
262 |
+
.b8 116
|
263 |
+
.b8 114
|
264 |
+
.b8 105
|
265 |
+
.b8 116
|
266 |
+
.b8 111
|
267 |
+
.b8 110
|
268 |
+
.b8 95
|
269 |
+
.b8 95
|
270 |
+
.b8 48
|
271 |
+
.b8 100
|
272 |
+
.b8 49
|
273 |
+
.b8 100
|
274 |
+
.b8 50
|
275 |
+
.b8 100
|
276 |
+
.b8 101
|
277 |
+
.b8 0
|
278 |
+
.b8 116
|
279 |
+
.b8 114
|
280 |
+
.b8 105
|
281 |
+
.b8 116
|
282 |
+
.b8 111
|
283 |
+
.b8 110
|
284 |
+
.b8 95
|
285 |
+
.b8 95
|
286 |
+
.b8 48
|
287 |
+
.b8 100
|
288 |
+
.b8 49
|
289 |
+
.b8 100
|
290 |
+
.b8 50
|
291 |
+
.b8 100
|
292 |
+
.b8 101
|
293 |
+
.b8 0
|
294 |
+
.b8 1
|
295 |
+
.b8 18
|
296 |
+
.b8 1
|
297 |
+
.b8 0
|
298 |
+
}
|
299 |
+
.section .debug_pubnames
|
300 |
+
{
|
301 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
302 |
+
$L__pubNames_start0:
|
303 |
+
.b8 2
|
304 |
+
.b8 0
|
305 |
+
.b32 .debug_info
|
306 |
+
.b32 180
|
307 |
+
.b32 125
|
308 |
+
.b8 116
|
309 |
+
.b8 114
|
310 |
+
.b8 105
|
311 |
+
.b8 116
|
312 |
+
.b8 111
|
313 |
+
.b8 110
|
314 |
+
.b8 95
|
315 |
+
.b8 95
|
316 |
+
.b8 48
|
317 |
+
.b8 100
|
318 |
+
.b8 49
|
319 |
+
.b8 100
|
320 |
+
.b8 50
|
321 |
+
.b8 100
|
322 |
+
.b8 101
|
323 |
+
.b8 0
|
324 |
+
.b32 0
|
325 |
+
$L__pubNames_end0:
|
326 |
+
}
|
327 |
+
.section .debug_pubtypes
|
328 |
+
{
|
329 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
330 |
+
$L__pubTypes_start0:
|
331 |
+
.b8 2
|
332 |
+
.b8 0
|
333 |
+
.b32 .debug_info
|
334 |
+
.b32 180
|
335 |
+
.b32 0
|
336 |
+
$L__pubTypes_end0:
|
337 |
+
}
|
338 |
+
.section .debug_loc { }
|
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttgir
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%c1024_i32 = arith.constant 1024 : i32
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
8 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
9 |
+
%3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1>
|
10 |
+
%4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
|
11 |
+
%5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1>
|
12 |
+
%6 = arith.addi %4, %2 : tensor<1024xi32, #blocked>
|
13 |
+
%7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1>
|
14 |
+
%8 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
|
15 |
+
%9 = tt.addptr %8, %6 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
|
16 |
+
%10 = tt.load %9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
|
17 |
+
%11 = triton_gpu.convert_layout %10 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1>
|
18 |
+
%12 = arith.extf %11 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1>
|
19 |
+
%13 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked1>
|
20 |
+
%14 = tt.addptr %13, %7 : tensor<1024x!tt.ptr<f32, 1>, #blocked1>, tensor<1024xi32, #blocked1>
|
21 |
+
tt.store %14, %12 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1>
|
22 |
+
tt.return
|
23 |
+
}
|
24 |
+
}
|
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c1024_i32 = arith.constant 1024 : i32
|
4 |
+
%0 = tt.get_program_id x : i32
|
5 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
6 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
7 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32>
|
8 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32>
|
9 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
|
10 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
|
11 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
|
12 |
+
%8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
|
13 |
+
%9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
|
14 |
+
%10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
|
15 |
+
tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.cubin
ADDED
Binary file (32 kB). View file
|
|
.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ptx
ADDED
@@ -0,0 +1,756 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5de6de
|
10 |
+
.extern .func __assertfail
|
11 |
+
(
|
12 |
+
.param .b64 __assertfail_param_0,
|
13 |
+
.param .b64 __assertfail_param_1,
|
14 |
+
.param .b32 __assertfail_param_2,
|
15 |
+
.param .b64 __assertfail_param_3,
|
16 |
+
.param .b64 __assertfail_param_4
|
17 |
+
)
|
18 |
+
;
|
19 |
+
.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
20 |
+
.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
21 |
+
.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55};
|
22 |
+
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
23 |
+
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
24 |
+
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
|
25 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
26 |
+
|
27 |
+
.visible .entry triton__0d1d2d3d4d5de6de(
|
28 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_0,
|
29 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_1,
|
30 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_2,
|
31 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_3,
|
32 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_4,
|
33 |
+
.param .u32 triton__0d1d2d3d4d5de6de_param_5,
|
34 |
+
.param .u32 triton__0d1d2d3d4d5de6de_param_6
|
35 |
+
)
|
36 |
+
.maxntid 256, 1, 1
|
37 |
+
{
|
38 |
+
.reg .pred %p<27>;
|
39 |
+
.reg .b16 %rs<3>;
|
40 |
+
.reg .b32 %r<81>;
|
41 |
+
.reg .f32 %f<73>;
|
42 |
+
.reg .b64 %rd<84>;
|
43 |
+
.loc 1 18 0
|
44 |
+
$L__func_begin0:
|
45 |
+
.loc 1 18 0
|
46 |
+
|
47 |
+
ld.param.u64 %rd35, [triton__0d1d2d3d4d5de6de_param_3];
|
48 |
+
ld.param.u64 %rd34, [triton__0d1d2d3d4d5de6de_param_2];
|
49 |
+
ld.param.u64 %rd33, [triton__0d1d2d3d4d5de6de_param_1];
|
50 |
+
ld.param.u64 %rd41, [triton__0d1d2d3d4d5de6de_param_0];
|
51 |
+
$L__tmp0:
|
52 |
+
.loc 1 22 44
|
53 |
+
mov.u32 %r1, %tid.x;
|
54 |
+
bfe.u32 %r2, %r1, 2, 6;
|
55 |
+
and.b32 %r14, %r1, 63;
|
56 |
+
.loc 1 24 33
|
57 |
+
and.b32 %r3, %r1, 3;
|
58 |
+
.loc 1 21 28
|
59 |
+
mov.u32 %r13, %ctaid.x;
|
60 |
+
.loc 1 21 33
|
61 |
+
shl.b32 %r15, %r13, 6;
|
62 |
+
.loc 1 22 23
|
63 |
+
or.b32 %r16, %r15, %r2;
|
64 |
+
or.b32 %r17, %r15, %r14;
|
65 |
+
.loc 1 26 30
|
66 |
+
mul.wide.s32 %rd42, %r16, 8;
|
67 |
+
add.s64 %rd38, %rd41, %rd42;
|
68 |
+
mul.wide.s32 %rd43, %r17, 8;
|
69 |
+
add.s64 %rd40, %rd41, %rd43;
|
70 |
+
mov.pred %p11, -1;
|
71 |
+
.loc 1 26 35
|
72 |
+
mov.u64 %rd37, 0x0;
|
73 |
+
@%p11 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd38 + 0 ];
|
74 |
+
mov.u64 %rd39, 0x0;
|
75 |
+
@%p11 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd40 + 0 ];
|
76 |
+
.loc 1 27 18
|
77 |
+
bfe.s32 %r18, %r13, 25, 1;
|
78 |
+
shr.u32 %r19, %r18, 23;
|
79 |
+
add.s32 %r20, %r16, %r19;
|
80 |
+
and.b32 %r21, %r20, 16776704;
|
81 |
+
sub.s32 %r22, %r16, %r21;
|
82 |
+
.loc 1 35 44
|
83 |
+
shl.b32 %r5, %r22, 8;
|
84 |
+
.loc 1 36 22
|
85 |
+
add.s64 %rd44, %rd39, 50257;
|
86 |
+
.loc 1 37 22
|
87 |
+
setp.lt.s64 %p3, %rd37, 0;
|
88 |
+
setp.lt.s64 %p4, %rd39, 0;
|
89 |
+
.loc 1 38 36
|
90 |
+
selp.b64 %rd45, %rd44, %rd39, %p4;
|
91 |
+
.loc 1 39 40
|
92 |
+
setp.gt.u64 %p5, %rd45, 50256;
|
93 |
+
.loc 1 40 44
|
94 |
+
shl.b64 %rd46, %rd37, 8;
|
95 |
+
add.s64 %rd47, %rd46, 12865792;
|
96 |
+
selp.b64 %rd2, %rd47, %rd46, %p3;
|
97 |
+
mov.b32 %r67, 0;
|
98 |
+
mov.b32 %r77, 883;
|
99 |
+
mov.u64 %rd73, 1;
|
100 |
+
.loc 1 39 55
|
101 |
+
@%p5 bra $L__BB0_3;
|
102 |
+
bra.uni $L__BB0_1;
|
103 |
+
$L__BB0_3:
|
104 |
+
.loc 1 31 36
|
105 |
+
shl.b64 %rd51, %rd2, 2;
|
106 |
+
mul.wide.u32 %rd80, %r3, 4;
|
107 |
+
add.s64 %rd79, %rd51, %rd80;
|
108 |
+
add.s64 %rd75, %rd33, %rd79;
|
109 |
+
add.s32 %r35, %r5, %r3;
|
110 |
+
mul.wide.s32 %rd78, %r35, 4;
|
111 |
+
add.s64 %rd74, %rd34, %rd78;
|
112 |
+
mov.f32 %f72, 0f00000000;
|
113 |
+
mov.b32 %r78, -4;
|
114 |
+
mov.f32 %f71, %f72;
|
115 |
+
mov.f32 %f70, %f72;
|
116 |
+
$L__BB0_4:
|
117 |
+
.loc 1 35 50
|
118 |
+
mov.u32 %r36, 0x0;
|
119 |
+
@%p11 ld.global.L1::evict_last.b32 { %r36 }, [ %rd74 + 0 ];
|
120 |
+
@!%p11 mov.u32 %r36, %r67;
|
121 |
+
mov.b32 %f28, %r36;
|
122 |
+
.loc 1 39 55
|
123 |
+
mov.u64 %rd54, assertMessage_0;
|
124 |
+
cvta.global.u64 %rd55, %rd54;
|
125 |
+
mov.u64 %rd56, assertFile_0;
|
126 |
+
cvta.global.u64 %rd57, %rd56;
|
127 |
+
mov.u64 %rd58, assertFunc_0;
|
128 |
+
cvta.global.u64 %rd59, %rd58;
|
129 |
+
{ // callseq 10, 0
|
130 |
+
.reg .b32 temp_param_reg;
|
131 |
+
.param .b64 param0;
|
132 |
+
st.param.b64 [param0+0], %rd55;
|
133 |
+
.param .b64 param1;
|
134 |
+
st.param.b64 [param1+0], %rd57;
|
135 |
+
.param .b32 param2;
|
136 |
+
st.param.b32 [param2+0], %r77;
|
137 |
+
.param .b64 param3;
|
138 |
+
st.param.b64 [param3+0], %rd59;
|
139 |
+
.param .b64 param4;
|
140 |
+
st.param.b64 [param4+0], %rd73;
|
141 |
+
call.uni
|
142 |
+
__assertfail,
|
143 |
+
(
|
144 |
+
param0,
|
145 |
+
param1,
|
146 |
+
param2,
|
147 |
+
param3,
|
148 |
+
param4
|
149 |
+
);
|
150 |
+
} // callseq 10
|
151 |
+
.loc 1 40 52
|
152 |
+
mov.u32 %r38, 0x0;
|
153 |
+
@%p11 ld.global.L1::evict_last.b32 { %r38 }, [ %rd75 + 0 ];
|
154 |
+
@!%p11 mov.u32 %r38, %r67;
|
155 |
+
mov.b32 %f29, %r38;
|
156 |
+
.loc 1 41 22
|
157 |
+
add.f32 %f30, %f28, %f29;
|
158 |
+
$L__tmp1:
|
159 |
+
.loc 2 96 20
|
160 |
+
sub.f32 %f31, %f30, %f70;
|
161 |
+
.loc 2 97 26
|
162 |
+
add.f32 %f72, %f72, 0f3F800000;
|
163 |
+
.loc 2 98 30
|
164 |
+
mov.b32 %r41, %f31;
|
165 |
+
mov.b32 %r42, %f72;
|
166 |
+
div.full.f32 %r40, %r41, %r42;
|
167 |
+
mov.b32 %f32, %r40;
|
168 |
+
.loc 2 98 22
|
169 |
+
add.f32 %f70, %f70, %f32;
|
170 |
+
.loc 2 101 30
|
171 |
+
sub.f32 %f33, %f30, %f70;
|
172 |
+
$L__tmp2:
|
173 |
+
.loc 1 47 48
|
174 |
+
fma.rn.f32 %f71, %f31, %f33, %f71;
|
175 |
+
.loc 1 31 36
|
176 |
+
add.s32 %r78, %r78, 4;
|
177 |
+
add.s64 %rd75, %rd75, 16;
|
178 |
+
add.s64 %rd74, %rd74, 16;
|
179 |
+
setp.lt.u32 %p15, %r78, 252;
|
180 |
+
@%p15 bra $L__BB0_4;
|
181 |
+
bra.uni $L__BB0_5;
|
182 |
+
$L__BB0_1:
|
183 |
+
.loc 1 0 36
|
184 |
+
mov.b32 %r79, -4;
|
185 |
+
.loc 1 31 36
|
186 |
+
shl.b64 %rd48, %rd2, 2;
|
187 |
+
mul.wide.u32 %rd80, %r3, 4;
|
188 |
+
add.s64 %rd79, %rd48, %rd80;
|
189 |
+
add.s64 %rd77, %rd33, %rd79;
|
190 |
+
add.s32 %r25, %r5, %r3;
|
191 |
+
mul.wide.s32 %rd78, %r25, 4;
|
192 |
+
add.s64 %rd76, %rd34, %rd78;
|
193 |
+
mov.f32 %f72, 0f00000000;
|
194 |
+
mov.f32 %f71, %f72;
|
195 |
+
mov.f32 %f70, %f72;
|
196 |
+
$L__BB0_2:
|
197 |
+
.loc 1 35 50
|
198 |
+
mov.u32 %r26, 0x0;
|
199 |
+
@%p11 ld.global.L1::evict_last.b32 { %r26 }, [ %rd76 + 0 ];
|
200 |
+
@!%p11 mov.u32 %r26, %r67;
|
201 |
+
mov.b32 %f21, %r26;
|
202 |
+
.loc 1 40 52
|
203 |
+
mov.u32 %r28, 0x0;
|
204 |
+
@%p11 ld.global.L1::evict_last.b32 { %r28 }, [ %rd77 + 0 ];
|
205 |
+
@!%p11 mov.u32 %r28, %r67;
|
206 |
+
mov.b32 %f22, %r28;
|
207 |
+
.loc 1 41 22
|
208 |
+
add.f32 %f23, %f21, %f22;
|
209 |
+
$L__tmp3:
|
210 |
+
.loc 2 96 20
|
211 |
+
sub.f32 %f24, %f23, %f70;
|
212 |
+
.loc 2 97 26
|
213 |
+
add.f32 %f72, %f72, 0f3F800000;
|
214 |
+
.loc 2 98 30
|
215 |
+
mov.b32 %r31, %f24;
|
216 |
+
mov.b32 %r32, %f72;
|
217 |
+
div.full.f32 %r30, %r31, %r32;
|
218 |
+
mov.b32 %f25, %r30;
|
219 |
+
.loc 2 98 22
|
220 |
+
add.f32 %f70, %f70, %f25;
|
221 |
+
.loc 2 101 30
|
222 |
+
sub.f32 %f26, %f23, %f70;
|
223 |
+
$L__tmp4:
|
224 |
+
.loc 1 47 48
|
225 |
+
fma.rn.f32 %f71, %f24, %f26, %f71;
|
226 |
+
.loc 1 31 36
|
227 |
+
add.s32 %r79, %r79, 4;
|
228 |
+
add.s64 %rd77, %rd77, 16;
|
229 |
+
add.s64 %rd76, %rd76, 16;
|
230 |
+
setp.lt.u32 %p10, %r79, 252;
|
231 |
+
@%p10 bra $L__BB0_2;
|
232 |
+
$L__BB0_5:
|
233 |
+
.loc 1 0 36
|
234 |
+
ld.param.u64 %rd36, [triton__0d1d2d3d4d5de6de_param_4];
|
235 |
+
$L__tmp5:
|
236 |
+
.loc 2 120 46
|
237 |
+
mov.b32 %r54, %f70;
|
238 |
+
shfl.sync.bfly.b32 %r55, %r54, 2, 31, -1;
|
239 |
+
mov.b32 %f34, %r55;
|
240 |
+
mov.b32 %r56, %f71;
|
241 |
+
shfl.sync.bfly.b32 %r57, %r56, 2, 31, -1;
|
242 |
+
mov.b32 %f35, %r57;
|
243 |
+
mov.b32 %r58, %f72;
|
244 |
+
shfl.sync.bfly.b32 %r45, %r58, 2, 31, -1;
|
245 |
+
mov.b32 %f36, %r45;
|
246 |
+
$L__tmp6:
|
247 |
+
.loc 2 108 21
|
248 |
+
sub.f32 %f37, %f34, %f70;
|
249 |
+
.loc 2 109 28
|
250 |
+
add.f32 %f38, %f72, %f36;
|
251 |
+
.loc 2 110 39
|
252 |
+
setp.eq.f32 %p16, %f38, 0f00000000;
|
253 |
+
.loc 2 110 60
|
254 |
+
mov.b32 %r46, %f38;
|
255 |
+
div.full.f32 %r44, %r45, %r46;
|
256 |
+
mov.b32 %f39, %r44;
|
257 |
+
.loc 2 110 49
|
258 |
+
selp.f32 %f40, 0f00000000, %f39, %p16;
|
259 |
+
.loc 2 112 17
|
260 |
+
fma.rn.f32 %f41, %f37, %f40, %f70;
|
261 |
+
.loc 2 113 15
|
262 |
+
add.f32 %f42, %f71, %f35;
|
263 |
+
.loc 2 113 30
|
264 |
+
mul.f32 %f43, %f37, %f37;
|
265 |
+
.loc 2 113 38
|
266 |
+
mul.f32 %f44, %f72, %f43;
|
267 |
+
.loc 2 113 22
|
268 |
+
fma.rn.f32 %f45, %f44, %f40, %f42;
|
269 |
+
$L__tmp7:
|
270 |
+
.loc 2 120 46
|
271 |
+
mov.b32 %r59, %f41;
|
272 |
+
shfl.sync.bfly.b32 %r60, %r59, 1, 31, -1;
|
273 |
+
mov.b32 %f46, %r60;
|
274 |
+
mov.b32 %r61, %f45;
|
275 |
+
shfl.sync.bfly.b32 %r62, %r61, 1, 31, -1;
|
276 |
+
mov.b32 %f47, %r62;
|
277 |
+
shfl.sync.bfly.b32 %r48, %r46, 1, 31, -1;
|
278 |
+
mov.b32 %f48, %r48;
|
279 |
+
$L__tmp8:
|
280 |
+
.loc 2 108 21
|
281 |
+
sub.f32 %f49, %f46, %f41;
|
282 |
+
.loc 2 109 28
|
283 |
+
add.f32 %f50, %f38, %f48;
|
284 |
+
.loc 2 110 39
|
285 |
+
setp.eq.f32 %p17, %f50, 0f00000000;
|
286 |
+
.loc 2 110 60
|
287 |
+
mov.b32 %r49, %f50;
|
288 |
+
div.full.f32 %r47, %r48, %r49;
|
289 |
+
mov.b32 %f51, %r47;
|
290 |
+
.loc 2 110 49
|
291 |
+
selp.f32 %f52, 0f00000000, %f51, %p17;
|
292 |
+
.loc 2 112 17
|
293 |
+
fma.rn.f32 %f16, %f49, %f52, %f41;
|
294 |
+
.loc 2 113 15
|
295 |
+
add.f32 %f53, %f45, %f47;
|
296 |
+
.loc 2 113 30
|
297 |
+
mul.f32 %f54, %f49, %f49;
|
298 |
+
.loc 2 113 38
|
299 |
+
mul.f32 %f55, %f38, %f54;
|
300 |
+
.loc 2 113 22
|
301 |
+
fma.rn.f32 %f56, %f52, %f55, %f53;
|
302 |
+
$L__tmp9:
|
303 |
+
.loc 1 69 23
|
304 |
+
mov.b32 %r51, %f56;
|
305 |
+
mov.b32 %r52, 1132462080;
|
306 |
+
div.full.f32 %r50, %r51, %r52;
|
307 |
+
mov.b32 %f57, %r50;
|
308 |
+
.loc 1 71 24
|
309 |
+
add.f32 %f17, %f57, 0f3727C5AC;
|
310 |
+
.loc 1 55 36
|
311 |
+
shl.b32 %r63, %r13, 14;
|
312 |
+
shl.b32 %r64, %r2, 8;
|
313 |
+
or.b32 %r65, %r63, %r64;
|
314 |
+
or.b32 %r10, %r65, %r3;
|
315 |
+
add.s64 %rd83, %rd33, %rd79;
|
316 |
+
add.s64 %rd82, %rd35, %rd80;
|
317 |
+
add.s64 %rd81, %rd34, %rd78;
|
318 |
+
mov.b32 %r80, -4;
|
319 |
+
setp.lt.u64 %p22, %rd45, 50257;
|
320 |
+
rsqrt.approx.ftz.f32 %f61, %f17;
|
321 |
+
bra.uni $L__BB0_6;
|
322 |
+
$L__BB0_8:
|
323 |
+
.loc 1 0 0
|
324 |
+
mov.b32 %f18, %r66;
|
325 |
+
mov.b32 %f19, %r68;
|
326 |
+
.loc 1 65 54
|
327 |
+
mov.u32 %r71, 0x0;
|
328 |
+
@%p11 ld.global.L1::evict_first.b32 { %r71 }, [ %rd83 + 0 ];
|
329 |
+
@!%p11 mov.u32 %r71, %r67;
|
330 |
+
mov.b32 %f58, %r71;
|
331 |
+
.loc 1 66 24
|
332 |
+
add.f32 %f59, %f18, %f58;
|
333 |
+
.loc 1 67 24
|
334 |
+
sub.f32 %f60, %f59, %f16;
|
335 |
+
.loc 1 73 24
|
336 |
+
mul.f32 %f62, %f60, %f61;
|
337 |
+
.loc 1 74 24
|
338 |
+
mul.f32 %f63, %f62, %f19;
|
339 |
+
.loc 1 55 36
|
340 |
+
add.s32 %r80, %r80, 4;
|
341 |
+
.loc 1 76 29
|
342 |
+
add.s32 %r74, %r80, %r10;
|
343 |
+
mul.wide.s32 %rd72, %r74, 2;
|
344 |
+
add.s64 %rd71, %rd36, %rd72;
|
345 |
+
.loc 1 76 52
|
346 |
+
mov.b32 %r73, %f63;
|
347 |
+
cvt.rn.bf16.f32 %rs1, %r73;
|
348 |
+
@%p11 st.global.b16 [ %rd71 + 0 ], { %rs1 };
|
349 |
+
.loc 1 55 36
|
350 |
+
add.s64 %rd83, %rd83, 16;
|
351 |
+
add.s64 %rd82, %rd82, 16;
|
352 |
+
add.s64 %rd81, %rd81, 16;
|
353 |
+
setp.lt.u32 %p26, %r80, 252;
|
354 |
+
@%p26 bra $L__BB0_6;
|
355 |
+
bra.uni $L__BB0_9;
|
356 |
+
$L__BB0_6:
|
357 |
+
.loc 1 59 51
|
358 |
+
mov.u32 %r66, 0x0;
|
359 |
+
@%p11 ld.global.L1::evict_last.b32 { %r66 }, [ %rd81 + 0 ];
|
360 |
+
@!%p11 mov.u32 %r66, %r67;
|
361 |
+
.loc 1 60 40
|
362 |
+
mov.u32 %r68, 0x0;
|
363 |
+
@%p11 ld.global.L1::evict_last.b32 { %r68 }, [ %rd82 + 0 ];
|
364 |
+
@!%p11 mov.u32 %r68, %r67;
|
365 |
+
.loc 1 64 57
|
366 |
+
@%p22 bra $L__BB0_8;
|
367 |
+
mov.u64 %rd63, assertMessage_1;
|
368 |
+
cvta.global.u64 %rd64, %rd63;
|
369 |
+
mov.u64 %rd65, assertFile_1;
|
370 |
+
cvta.global.u64 %rd66, %rd65;
|
371 |
+
mov.u64 %rd67, assertFunc_1;
|
372 |
+
cvta.global.u64 %rd68, %rd67;
|
373 |
+
{ // callseq 11, 0
|
374 |
+
.reg .b32 temp_param_reg;
|
375 |
+
.param .b64 param0;
|
376 |
+
st.param.b64 [param0+0], %rd64;
|
377 |
+
.param .b64 param1;
|
378 |
+
st.param.b64 [param1+0], %rd66;
|
379 |
+
.param .b32 param2;
|
380 |
+
st.param.b32 [param2+0], %r77;
|
381 |
+
.param .b64 param3;
|
382 |
+
st.param.b64 [param3+0], %rd68;
|
383 |
+
.param .b64 param4;
|
384 |
+
st.param.b64 [param4+0], %rd73;
|
385 |
+
call.uni
|
386 |
+
__assertfail,
|
387 |
+
(
|
388 |
+
param0,
|
389 |
+
param1,
|
390 |
+
param2,
|
391 |
+
param3,
|
392 |
+
param4
|
393 |
+
);
|
394 |
+
} // callseq 11
|
395 |
+
bra.uni $L__BB0_8;
|
396 |
+
$L__BB0_9:
|
397 |
+
.loc 1 55 4
|
398 |
+
ret;
|
399 |
+
$L__tmp10:
|
400 |
+
$L__func_end0:
|
401 |
+
|
402 |
+
}
|
403 |
+
// .globl __nv_rsqrtf
|
404 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
405 |
+
.param .b32 __nv_rsqrtf_param_0
|
406 |
+
)
|
407 |
+
{
|
408 |
+
.reg .f32 %f<3>;
|
409 |
+
$L__func_begin1:
|
410 |
+
|
411 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
412 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
413 |
+
st.param.f32 [func_retval0+0], %f2;
|
414 |
+
ret;
|
415 |
+
$L__func_end1:
|
416 |
+
|
417 |
+
}
|
418 |
+
.file 1 "/tmp/torchinductor_root/gx/cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py"
|
419 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
420 |
+
.section .debug_abbrev
|
421 |
+
{
|
422 |
+
.b8 1
|
423 |
+
.b8 17
|
424 |
+
.b8 1
|
425 |
+
.b8 37
|
426 |
+
.b8 8
|
427 |
+
.b8 19
|
428 |
+
.b8 5
|
429 |
+
.b8 3
|
430 |
+
.b8 8
|
431 |
+
.b8 16
|
432 |
+
.b8 6
|
433 |
+
.b8 27
|
434 |
+
.b8 8
|
435 |
+
.b8 180
|
436 |
+
.b8 66
|
437 |
+
.b8 12
|
438 |
+
.b8 17
|
439 |
+
.b8 1
|
440 |
+
.b8 18
|
441 |
+
.b8 1
|
442 |
+
.b8 0
|
443 |
+
.b8 0
|
444 |
+
.b8 2
|
445 |
+
.b8 46
|
446 |
+
.b8 0
|
447 |
+
.b8 135
|
448 |
+
.b8 64
|
449 |
+
.b8 8
|
450 |
+
.b8 3
|
451 |
+
.b8 8
|
452 |
+
.b8 58
|
453 |
+
.b8 11
|
454 |
+
.b8 59
|
455 |
+
.b8 11
|
456 |
+
.b8 63
|
457 |
+
.b8 12
|
458 |
+
.b8 32
|
459 |
+
.b8 11
|
460 |
+
.b8 0
|
461 |
+
.b8 0
|
462 |
+
.b8 3
|
463 |
+
.b8 46
|
464 |
+
.b8 1
|
465 |
+
.b8 17
|
466 |
+
.b8 1
|
467 |
+
.b8 18
|
468 |
+
.b8 1
|
469 |
+
.b8 64
|
470 |
+
.b8 10
|
471 |
+
.b8 49
|
472 |
+
.b8 19
|
473 |
+
.b8 0
|
474 |
+
.b8 0
|
475 |
+
.b8 4
|
476 |
+
.b8 29
|
477 |
+
.b8 0
|
478 |
+
.b8 49
|
479 |
+
.b8 19
|
480 |
+
.b8 17
|
481 |
+
.b8 1
|
482 |
+
.b8 18
|
483 |
+
.b8 1
|
484 |
+
.b8 88
|
485 |
+
.b8 11
|
486 |
+
.b8 89
|
487 |
+
.b8 11
|
488 |
+
.b8 87
|
489 |
+
.b8 11
|
490 |
+
.b8 0
|
491 |
+
.b8 0
|
492 |
+
.b8 5
|
493 |
+
.b8 29
|
494 |
+
.b8 1
|
495 |
+
.b8 49
|
496 |
+
.b8 19
|
497 |
+
.b8 17
|
498 |
+
.b8 1
|
499 |
+
.b8 18
|
500 |
+
.b8 1
|
501 |
+
.b8 88
|
502 |
+
.b8 11
|
503 |
+
.b8 89
|
504 |
+
.b8 11
|
505 |
+
.b8 87
|
506 |
+
.b8 11
|
507 |
+
.b8 0
|
508 |
+
.b8 0
|
509 |
+
.b8 0
|
510 |
+
}
|
511 |
+
.section .debug_info
|
512 |
+
{
|
513 |
+
.b32 298
|
514 |
+
.b8 2
|
515 |
+
.b8 0
|
516 |
+
.b32 .debug_abbrev
|
517 |
+
.b8 8
|
518 |
+
.b8 1
|
519 |
+
.b8 116
|
520 |
+
.b8 114
|
521 |
+
.b8 105
|
522 |
+
.b8 116
|
523 |
+
.b8 111
|
524 |
+
.b8 110
|
525 |
+
.b8 0
|
526 |
+
.b8 2
|
527 |
+
.b8 0
|
528 |
+
.b8 99
|
529 |
+
.b8 103
|
530 |
+
.b8 120
|
531 |
+
.b8 53
|
532 |
+
.b8 108
|
533 |
+
.b8 120
|
534 |
+
.b8 112
|
535 |
+
.b8 117
|
536 |
+
.b8 101
|
537 |
+
.b8 120
|
538 |
+
.b8 112
|
539 |
+
.b8 105
|
540 |
+
.b8 110
|
541 |
+
.b8 100
|
542 |
+
.b8 106
|
543 |
+
.b8 52
|
544 |
+
.b8 100
|
545 |
+
.b8 115
|
546 |
+
.b8 109
|
547 |
+
.b8 106
|
548 |
+
.b8 122
|
549 |
+
.b8 53
|
550 |
+
.b8 120
|
551 |
+
.b8 52
|
552 |
+
.b8 50
|
553 |
+
.b8 117
|
554 |
+
.b8 104
|
555 |
+
.b8 121
|
556 |
+
.b8 121
|
557 |
+
.b8 55
|
558 |
+
.b8 105
|
559 |
+
.b8 115
|
560 |
+
.b8 107
|
561 |
+
.b8 101
|
562 |
+
.b8 118
|
563 |
+
.b8 113
|
564 |
+
.b8 55
|
565 |
+
.b8 111
|
566 |
+
.b8 118
|
567 |
+
.b8 122
|
568 |
+
.b8 112
|
569 |
+
.b8 119
|
570 |
+
.b8 97
|
571 |
+
.b8 103
|
572 |
+
.b8 98
|
573 |
+
.b8 51
|
574 |
+
.b8 116
|
575 |
+
.b8 53
|
576 |
+
.b8 112
|
577 |
+
.b8 111
|
578 |
+
.b8 119
|
579 |
+
.b8 106
|
580 |
+
.b8 46
|
581 |
+
.b8 112
|
582 |
+
.b8 121
|
583 |
+
.b8 0
|
584 |
+
.b32 .debug_line
|
585 |
+
.b8 47
|
586 |
+
.b8 116
|
587 |
+
.b8 109
|
588 |
+
.b8 112
|
589 |
+
.b8 47
|
590 |
+
.b8 116
|
591 |
+
.b8 111
|
592 |
+
.b8 114
|
593 |
+
.b8 99
|
594 |
+
.b8 104
|
595 |
+
.b8 105
|
596 |
+
.b8 110
|
597 |
+
.b8 100
|
598 |
+
.b8 117
|
599 |
+
.b8 99
|
600 |
+
.b8 116
|
601 |
+
.b8 111
|
602 |
+
.b8 114
|
603 |
+
.b8 95
|
604 |
+
.b8 114
|
605 |
+
.b8 111
|
606 |
+
.b8 111
|
607 |
+
.b8 116
|
608 |
+
.b8 47
|
609 |
+
.b8 103
|
610 |
+
.b8 120
|
611 |
+
.b8 0
|
612 |
+
.b8 1
|
613 |
+
.b64 $L__func_begin0
|
614 |
+
.b64 $L__func_end0
|
615 |
+
.b8 2
|
616 |
+
.b8 116
|
617 |
+
.b8 114
|
618 |
+
.b8 105
|
619 |
+
.b8 116
|
620 |
+
.b8 111
|
621 |
+
.b8 110
|
622 |
+
.b8 95
|
623 |
+
.b8 95
|
624 |
+
.b8 48
|
625 |
+
.b8 100
|
626 |
+
.b8 49
|
627 |
+
.b8 100
|
628 |
+
.b8 50
|
629 |
+
.b8 100
|
630 |
+
.b8 51
|
631 |
+
.b8 100
|
632 |
+
.b8 52
|
633 |
+
.b8 100
|
634 |
+
.b8 53
|
635 |
+
.b8 100
|
636 |
+
.b8 101
|
637 |
+
.b8 54
|
638 |
+
.b8 100
|
639 |
+
.b8 101
|
640 |
+
.b8 0
|
641 |
+
.b8 116
|
642 |
+
.b8 114
|
643 |
+
.b8 105
|
644 |
+
.b8 116
|
645 |
+
.b8 111
|
646 |
+
.b8 110
|
647 |
+
.b8 95
|
648 |
+
.b8 95
|
649 |
+
.b8 48
|
650 |
+
.b8 100
|
651 |
+
.b8 49
|
652 |
+
.b8 100
|
653 |
+
.b8 50
|
654 |
+
.b8 100
|
655 |
+
.b8 51
|
656 |
+
.b8 100
|
657 |
+
.b8 52
|
658 |
+
.b8 100
|
659 |
+
.b8 53
|
660 |
+
.b8 100
|
661 |
+
.b8 101
|
662 |
+
.b8 54
|
663 |
+
.b8 100
|
664 |
+
.b8 101
|
665 |
+
.b8 0
|
666 |
+
.b8 1
|
667 |
+
.b8 18
|
668 |
+
.b8 1
|
669 |
+
.b8 1
|
670 |
+
.b8 3
|
671 |
+
.b64 $L__func_begin0
|
672 |
+
.b64 $L__func_end0
|
673 |
+
.b8 1
|
674 |
+
.b8 156
|
675 |
+
.b32 125
|
676 |
+
.b8 4
|
677 |
+
.b32 125
|
678 |
+
.b64 $L__tmp1
|
679 |
+
.b64 $L__tmp4
|
680 |
+
.b8 2
|
681 |
+
.b8 44
|
682 |
+
.b8 38
|
683 |
+
.b8 4
|
684 |
+
.b32 125
|
685 |
+
.b64 $L__tmp5
|
686 |
+
.b64 $L__tmp8
|
687 |
+
.b8 2
|
688 |
+
.b8 50
|
689 |
+
.b8 41
|
690 |
+
.b8 5
|
691 |
+
.b32 125
|
692 |
+
.b64 $L__tmp6
|
693 |
+
.b64 $L__tmp9
|
694 |
+
.b8 2
|
695 |
+
.b8 50
|
696 |
+
.b8 41
|
697 |
+
.b8 4
|
698 |
+
.b32 125
|
699 |
+
.b64 $L__tmp6
|
700 |
+
.b64 $L__tmp9
|
701 |
+
.b8 2
|
702 |
+
.b8 120
|
703 |
+
.b8 46
|
704 |
+
.b8 0
|
705 |
+
.b8 0
|
706 |
+
.b8 0
|
707 |
+
}
|
708 |
+
.section .debug_pubnames
|
709 |
+
{
|
710 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
711 |
+
$L__pubNames_start0:
|
712 |
+
.b8 2
|
713 |
+
.b8 0
|
714 |
+
.b32 .debug_info
|
715 |
+
.b32 302
|
716 |
+
.b32 125
|
717 |
+
.b8 116
|
718 |
+
.b8 114
|
719 |
+
.b8 105
|
720 |
+
.b8 116
|
721 |
+
.b8 111
|
722 |
+
.b8 110
|
723 |
+
.b8 95
|
724 |
+
.b8 95
|
725 |
+
.b8 48
|
726 |
+
.b8 100
|
727 |
+
.b8 49
|
728 |
+
.b8 100
|
729 |
+
.b8 50
|
730 |
+
.b8 100
|
731 |
+
.b8 51
|
732 |
+
.b8 100
|
733 |
+
.b8 52
|
734 |
+
.b8 100
|
735 |
+
.b8 53
|
736 |
+
.b8 100
|
737 |
+
.b8 101
|
738 |
+
.b8 54
|
739 |
+
.b8 100
|
740 |
+
.b8 101
|
741 |
+
.b8 0
|
742 |
+
.b32 0
|
743 |
+
$L__pubNames_end0:
|
744 |
+
}
|
745 |
+
.section .debug_pubtypes
|
746 |
+
{
|
747 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
748 |
+
$L__pubTypes_start0:
|
749 |
+
.b8 2
|
750 |
+
.b8 0
|
751 |
+
.b32 .debug_info
|
752 |
+
.b32 302
|
753 |
+
.b32 0
|
754 |
+
$L__pubTypes_end0:
|
755 |
+
}
|
756 |
+
.section .debug_loc { }
|
.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttgir
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
|
7 |
+
%cst_1 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
|
8 |
+
%cst_2 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
|
9 |
+
%cst_3 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
|
10 |
+
%cst_4 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
|
11 |
+
%cst_5 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
|
12 |
+
%c0_i32 = arith.constant 0 : i32
|
13 |
+
%c4_i32 = arith.constant 4 : i32
|
14 |
+
%c256_i32 = arith.constant 256 : i32
|
15 |
+
%cst_6 = arith.constant dense<1.000000e+00> : tensor<64x4xf32, #blocked>
|
16 |
+
%cst_7 = arith.constant 0.000000e+00 : f32
|
17 |
+
%cst_8 = arith.constant dense<0.000000e+00> : tensor<1x4xf32, #blocked>
|
18 |
+
%cst_9 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked>
|
19 |
+
%cst_10 = arith.constant dense<256> : tensor<1x4xi32, #blocked>
|
20 |
+
%cst_11 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
|
21 |
+
%cst_12 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
|
22 |
+
%c64_i32 = arith.constant 64 : i32
|
23 |
+
%0 = tt.get_program_id x : i32
|
24 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
25 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
26 |
+
%3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
27 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
|
28 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
|
29 |
+
%6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
|
30 |
+
%7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
|
31 |
+
%8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
|
32 |
+
%9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
|
33 |
+
%10 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
34 |
+
%11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x4xi32, #blocked>
|
35 |
+
%12 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
|
36 |
+
%13 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
|
37 |
+
%14 = tt.addptr %12, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
|
38 |
+
%15 = tt.addptr %13, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
|
39 |
+
%16 = tt.load %14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
|
40 |
+
%17 = tt.load %15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
|
41 |
+
%18 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
|
42 |
+
%19 = arith.muli %18, %cst_0 : tensor<64x1xi32, #blocked>
|
43 |
+
%20 = tt.broadcast %19 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked>
|
44 |
+
%21 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
|
45 |
+
%22 = arith.addi %16, %cst_3 : tensor<64x1xi64, #blocked>
|
46 |
+
%23 = arith.addi %17, %cst_4 : tensor<64x1xi64, #blocked1>
|
47 |
+
%24 = arith.cmpi slt, %16, %cst_2 : tensor<64x1xi64, #blocked>
|
48 |
+
%25 = arith.cmpi slt, %17, %cst_5 : tensor<64x1xi64, #blocked1>
|
49 |
+
%26 = arith.select %24, %22, %16 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
|
50 |
+
%27 = arith.select %25, %23, %17 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
|
51 |
+
%28 = arith.cmpi sge, %27, %cst_5 : tensor<64x1xi64, #blocked1>
|
52 |
+
%29 = arith.cmpi slt, %27, %cst_4 : tensor<64x1xi64, #blocked1>
|
53 |
+
%30 = arith.andi %28, %29 : tensor<64x1xi1, #blocked1>
|
54 |
+
%31 = arith.muli %26, %cst_1 : tensor<64x1xi64, #blocked>
|
55 |
+
%32 = tt.broadcast %31 : (tensor<64x1xi64, #blocked>) -> tensor<64x4xi64, #blocked>
|
56 |
+
%33 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
|
57 |
+
%34:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg8 = %cst_9, %arg9 = %cst_9, %arg10 = %cst_9) -> (tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>) : i32 {
|
58 |
+
%45 = tt.splat %arg7 : (i32) -> tensor<1x4xi32, #blocked>
|
59 |
+
%46 = arith.addi %45, %11 : tensor<1x4xi32, #blocked>
|
60 |
+
%47 = arith.cmpi slt, %46, %cst_10 : tensor<1x4xi32, #blocked>
|
61 |
+
%48 = tt.broadcast %46 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked>
|
62 |
+
%49 = arith.addi %48, %20 : tensor<64x4xi32, #blocked>
|
63 |
+
%50 = tt.addptr %21, %49 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi32, #blocked>
|
64 |
+
%51 = tt.broadcast %47 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
|
65 |
+
%52 = tt.load %50, %51, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
|
66 |
+
tt.assert %30, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
|
67 |
+
%53 = arith.extsi %46 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked>
|
68 |
+
%54 = tt.broadcast %53 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
|
69 |
+
%55 = arith.addi %54, %32 : tensor<64x4xi64, #blocked>
|
70 |
+
%56 = tt.addptr %33, %55 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi64, #blocked>
|
71 |
+
%57 = tt.load %56, %51, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
|
72 |
+
%58 = arith.addf %57, %52 : tensor<64x4xf32, #blocked>
|
73 |
+
%59 = arith.subf %58, %arg8 : tensor<64x4xf32, #blocked>
|
74 |
+
%60 = arith.addf %arg10, %cst_6 : tensor<64x4xf32, #blocked>
|
75 |
+
%61 = arith.divf %59, %60 : tensor<64x4xf32, #blocked>
|
76 |
+
%62 = arith.addf %arg8, %61 : tensor<64x4xf32, #blocked>
|
77 |
+
%63 = arith.subf %58, %62 : tensor<64x4xf32, #blocked>
|
78 |
+
%64 = arith.mulf %59, %63 : tensor<64x4xf32, #blocked>
|
79 |
+
%65 = arith.addf %arg9, %64 : tensor<64x4xf32, #blocked>
|
80 |
+
%66 = arith.select %51, %62, %arg8 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
|
81 |
+
%67 = arith.select %51, %65, %arg9 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
|
82 |
+
%68 = arith.select %51, %60, %arg10 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
|
83 |
+
scf.yield %66, %67, %68 : tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>
|
84 |
+
}
|
85 |
+
%35:3 = "tt.reduce"(%34#0, %34#1, %34#2) <{axis = 1 : i32}> ({
|
86 |
+
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
|
87 |
+
%45 = arith.subf %arg10, %arg7 : f32
|
88 |
+
%46 = arith.addf %arg9, %arg12 : f32
|
89 |
+
%47 = arith.cmpf oeq, %46, %cst_7 : f32
|
90 |
+
%48 = arith.divf %arg12, %46 : f32
|
91 |
+
%49 = arith.select %47, %cst_7, %48 : f32
|
92 |
+
%50 = arith.mulf %45, %49 : f32
|
93 |
+
%51 = arith.addf %arg7, %50 : f32
|
94 |
+
%52 = arith.addf %arg8, %arg11 : f32
|
95 |
+
%53 = arith.mulf %45, %45 : f32
|
96 |
+
%54 = arith.mulf %53, %arg9 : f32
|
97 |
+
%55 = arith.mulf %54, %49 : f32
|
98 |
+
%56 = arith.addf %52, %55 : f32
|
99 |
+
tt.reduce.return %51, %56, %46 : f32, f32, f32
|
100 |
+
}) : (tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
|
101 |
+
%36 = tt.expand_dims %35#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
|
102 |
+
%37 = tt.expand_dims %35#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
|
103 |
+
%38 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x4x!tt.ptr<f32, 1>, #blocked>
|
104 |
+
%39 = tt.broadcast %36 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
|
105 |
+
%40 = arith.divf %37, %cst_12 : tensor<64x1xf32, #blocked>
|
106 |
+
%41 = arith.addf %40, %cst_11 : tensor<64x1xf32, #blocked>
|
107 |
+
%42 = arith.muli %8, %cst_0 : tensor<64x1xi32, #blocked>
|
108 |
+
%43 = tt.broadcast %42 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked>
|
109 |
+
%44 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>, #blocked>
|
110 |
+
scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 : i32 {
|
111 |
+
%45 = tt.splat %arg7 : (i32) -> tensor<1x4xi32, #blocked>
|
112 |
+
%46 = arith.addi %45, %11 : tensor<1x4xi32, #blocked>
|
113 |
+
%47 = arith.cmpi slt, %46, %cst_10 : tensor<1x4xi32, #blocked>
|
114 |
+
%48 = tt.broadcast %46 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked>
|
115 |
+
%49 = arith.addi %48, %20 : tensor<64x4xi32, #blocked>
|
116 |
+
%50 = tt.addptr %21, %49 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi32, #blocked>
|
117 |
+
%51 = tt.broadcast %47 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
|
118 |
+
%52 = tt.load %50, %51, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
|
119 |
+
%53 = tt.addptr %38, %46 : tensor<1x4x!tt.ptr<f32, 1>, #blocked>, tensor<1x4xi32, #blocked>
|
120 |
+
%54 = tt.load %53, %47, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32, #blocked>
|
121 |
+
tt.assert %30, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
|
122 |
+
%55 = arith.extsi %46 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked>
|
123 |
+
%56 = tt.broadcast %55 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
|
124 |
+
%57 = arith.addi %56, %32 : tensor<64x4xi64, #blocked>
|
125 |
+
%58 = tt.addptr %33, %57 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi64, #blocked>
|
126 |
+
%59 = tt.load %58, %51, %cst_9 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
|
127 |
+
%60 = arith.addf %59, %52 : tensor<64x4xf32, #blocked>
|
128 |
+
%61 = arith.subf %60, %39 : tensor<64x4xf32, #blocked>
|
129 |
+
%62 = tt.extern_elementwise %41 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
|
130 |
+
%63 = tt.broadcast %62 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
|
131 |
+
%64 = arith.mulf %61, %63 : tensor<64x4xf32, #blocked>
|
132 |
+
%65 = tt.broadcast %54 : (tensor<1x4xf32, #blocked>) -> tensor<64x4xf32, #blocked>
|
133 |
+
%66 = arith.mulf %64, %65 : tensor<64x4xf32, #blocked>
|
134 |
+
%67 = arith.addi %48, %43 : tensor<64x4xi32, #blocked>
|
135 |
+
%68 = tt.addptr %44, %67 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi32, #blocked>
|
136 |
+
%69 = arith.truncf %66 : tensor<64x4xf32, #blocked> to tensor<64x4xbf16, #blocked>
|
137 |
+
tt.store %68, %69, %51 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16, #blocked>
|
138 |
+
}
|
139 |
+
tt.return
|
140 |
+
}
|
141 |
+
}
|
.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttir
ADDED
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant 0.000000e+00 : f32
|
4 |
+
%cst_0 = arith.constant dense<1.000000e+00> : tensor<64x4xf32>
|
5 |
+
%c256_i32 = arith.constant 256 : i32
|
6 |
+
%c4_i32 = arith.constant 4 : i32
|
7 |
+
%c0_i32 = arith.constant 0 : i32
|
8 |
+
%cst_1 = arith.constant dense<256> : tensor<64x1xi64>
|
9 |
+
%cst_2 = arith.constant dense<0> : tensor<64x1xi64>
|
10 |
+
%cst_3 = arith.constant dense<50257> : tensor<64x1xi64>
|
11 |
+
%cst_4 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
|
12 |
+
%cst_5 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
|
13 |
+
%cst_6 = arith.constant dense<0.000000e+00> : tensor<1x4xf32>
|
14 |
+
%cst_7 = arith.constant dense<0.000000e+00> : tensor<64x4xf32>
|
15 |
+
%cst_8 = arith.constant dense<256> : tensor<64x1xi32>
|
16 |
+
%cst_9 = arith.constant dense<256> : tensor<1x4xi32>
|
17 |
+
%cst_10 = arith.constant dense<512> : tensor<64x1xi32>
|
18 |
+
%c64_i32 = arith.constant 64 : i32
|
19 |
+
%0 = tt.get_program_id x : i32
|
20 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
21 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
|
22 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
|
23 |
+
%4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
|
24 |
+
%5 = arith.addi %4, %3 : tensor<64x1xi32>
|
25 |
+
%6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
|
26 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32>
|
27 |
+
%8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
|
28 |
+
%9 = tt.addptr %8, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
|
29 |
+
%10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
|
30 |
+
%11 = arith.remsi %5, %cst_10 : tensor<64x1xi32>
|
31 |
+
%12 = arith.muli %11, %cst_8 : tensor<64x1xi32>
|
32 |
+
%13 = tt.broadcast %12 : (tensor<64x1xi32>) -> tensor<64x4xi32>
|
33 |
+
%14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
|
34 |
+
%15 = arith.addi %10, %cst_3 : tensor<64x1xi64>
|
35 |
+
%16 = arith.cmpi slt, %10, %cst_2 : tensor<64x1xi64>
|
36 |
+
%17 = arith.select %16, %15, %10 : tensor<64x1xi1>, tensor<64x1xi64>
|
37 |
+
%18 = arith.cmpi sge, %17, %cst_2 : tensor<64x1xi64>
|
38 |
+
%19 = arith.cmpi slt, %17, %cst_3 : tensor<64x1xi64>
|
39 |
+
%20 = arith.andi %18, %19 : tensor<64x1xi1>
|
40 |
+
%21 = arith.muli %17, %cst_1 : tensor<64x1xi64>
|
41 |
+
%22 = tt.broadcast %21 : (tensor<64x1xi64>) -> tensor<64x4xi64>
|
42 |
+
%23 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
|
43 |
+
%24:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg8 = %cst_7, %arg9 = %cst_7, %arg10 = %cst_7) -> (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) : i32 {
|
44 |
+
%47 = tt.splat %arg7 : (i32) -> tensor<1x4xi32>
|
45 |
+
%48 = arith.addi %47, %7 : tensor<1x4xi32>
|
46 |
+
%49 = arith.cmpi slt, %48, %cst_9 : tensor<1x4xi32>
|
47 |
+
%50 = tt.broadcast %48 : (tensor<1x4xi32>) -> tensor<64x4xi32>
|
48 |
+
%51 = arith.addi %50, %13 : tensor<64x4xi32>
|
49 |
+
%52 = tt.addptr %14, %51 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
|
50 |
+
%53 = tt.broadcast %49 : (tensor<1x4xi1>) -> tensor<64x4xi1>
|
51 |
+
%54 = tt.load %52, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
|
52 |
+
tt.assert %20, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
|
53 |
+
%55 = arith.extsi %48 : tensor<1x4xi32> to tensor<1x4xi64>
|
54 |
+
%56 = tt.broadcast %55 : (tensor<1x4xi64>) -> tensor<64x4xi64>
|
55 |
+
%57 = arith.addi %56, %22 : tensor<64x4xi64>
|
56 |
+
%58 = tt.addptr %23, %57 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
|
57 |
+
%59 = tt.load %58, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
|
58 |
+
%60 = arith.addf %59, %54 : tensor<64x4xf32>
|
59 |
+
%61 = arith.subf %60, %arg8 : tensor<64x4xf32>
|
60 |
+
%62 = arith.addf %arg10, %cst_0 : tensor<64x4xf32>
|
61 |
+
%63 = arith.divf %61, %62 : tensor<64x4xf32>
|
62 |
+
%64 = arith.addf %arg8, %63 : tensor<64x4xf32>
|
63 |
+
%65 = arith.subf %60, %64 : tensor<64x4xf32>
|
64 |
+
%66 = arith.mulf %61, %65 : tensor<64x4xf32>
|
65 |
+
%67 = arith.addf %arg9, %66 : tensor<64x4xf32>
|
66 |
+
%68 = arith.select %53, %64, %arg8 : tensor<64x4xi1>, tensor<64x4xf32>
|
67 |
+
%69 = arith.select %53, %67, %arg9 : tensor<64x4xi1>, tensor<64x4xf32>
|
68 |
+
%70 = arith.select %53, %62, %arg10 : tensor<64x4xi1>, tensor<64x4xf32>
|
69 |
+
scf.yield %68, %69, %70 : tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>
|
70 |
+
}
|
71 |
+
%25:3 = "tt.reduce"(%24#0, %24#1, %24#2) <{axis = 1 : i32}> ({
|
72 |
+
^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
|
73 |
+
%47 = arith.subf %arg10, %arg7 : f32
|
74 |
+
%48 = arith.addf %arg9, %arg12 : f32
|
75 |
+
%49 = arith.cmpf oeq, %48, %cst : f32
|
76 |
+
%50 = arith.divf %arg12, %48 : f32
|
77 |
+
%51 = arith.select %49, %cst, %50 : f32
|
78 |
+
%52 = arith.mulf %47, %51 : f32
|
79 |
+
%53 = arith.addf %arg7, %52 : f32
|
80 |
+
%54 = arith.addf %arg8, %arg11 : f32
|
81 |
+
%55 = arith.mulf %47, %47 : f32
|
82 |
+
%56 = arith.mulf %55, %arg9 : f32
|
83 |
+
%57 = arith.mulf %56, %51 : f32
|
84 |
+
%58 = arith.addf %54, %57 : f32
|
85 |
+
tt.reduce.return %53, %58, %48 : f32, f32, f32
|
86 |
+
}) : (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
|
87 |
+
%26 = tt.expand_dims %25#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
88 |
+
%27 = tt.expand_dims %25#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
89 |
+
%28 = arith.muli %11, %cst_8 : tensor<64x1xi32>
|
90 |
+
%29 = tt.broadcast %28 : (tensor<64x1xi32>) -> tensor<64x4xi32>
|
91 |
+
%30 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
|
92 |
+
%31 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x4x!tt.ptr<f32, 1>>
|
93 |
+
%32 = arith.addi %10, %cst_3 : tensor<64x1xi64>
|
94 |
+
%33 = arith.cmpi slt, %10, %cst_2 : tensor<64x1xi64>
|
95 |
+
%34 = arith.select %33, %32, %10 : tensor<64x1xi1>, tensor<64x1xi64>
|
96 |
+
%35 = arith.cmpi sge, %34, %cst_2 : tensor<64x1xi64>
|
97 |
+
%36 = arith.cmpi slt, %34, %cst_3 : tensor<64x1xi64>
|
98 |
+
%37 = arith.andi %35, %36 : tensor<64x1xi1>
|
99 |
+
%38 = arith.muli %34, %cst_1 : tensor<64x1xi64>
|
100 |
+
%39 = tt.broadcast %38 : (tensor<64x1xi64>) -> tensor<64x4xi64>
|
101 |
+
%40 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
|
102 |
+
%41 = tt.broadcast %26 : (tensor<64x1xf32>) -> tensor<64x4xf32>
|
103 |
+
%42 = arith.divf %27, %cst_5 : tensor<64x1xf32>
|
104 |
+
%43 = arith.addf %42, %cst_4 : tensor<64x1xf32>
|
105 |
+
%44 = arith.muli %5, %cst_8 : tensor<64x1xi32>
|
106 |
+
%45 = tt.broadcast %44 : (tensor<64x1xi32>) -> tensor<64x4xi32>
|
107 |
+
%46 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
|
108 |
+
scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 : i32 {
|
109 |
+
%47 = tt.splat %arg7 : (i32) -> tensor<1x4xi32>
|
110 |
+
%48 = arith.addi %47, %7 : tensor<1x4xi32>
|
111 |
+
%49 = arith.cmpi slt, %48, %cst_9 : tensor<1x4xi32>
|
112 |
+
%50 = tt.broadcast %48 : (tensor<1x4xi32>) -> tensor<64x4xi32>
|
113 |
+
%51 = arith.addi %50, %29 : tensor<64x4xi32>
|
114 |
+
%52 = tt.addptr %30, %51 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
|
115 |
+
%53 = tt.broadcast %49 : (tensor<1x4xi1>) -> tensor<64x4xi1>
|
116 |
+
%54 = tt.load %52, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
|
117 |
+
%55 = tt.addptr %31, %48 : tensor<1x4x!tt.ptr<f32, 1>>, tensor<1x4xi32>
|
118 |
+
%56 = tt.load %55, %49, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32>
|
119 |
+
tt.assert %37, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
|
120 |
+
%57 = arith.extsi %48 : tensor<1x4xi32> to tensor<1x4xi64>
|
121 |
+
%58 = tt.broadcast %57 : (tensor<1x4xi64>) -> tensor<64x4xi64>
|
122 |
+
%59 = arith.addi %58, %39 : tensor<64x4xi64>
|
123 |
+
%60 = tt.addptr %40, %59 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
|
124 |
+
%61 = tt.load %60, %53, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32>
|
125 |
+
%62 = arith.addf %61, %54 : tensor<64x4xf32>
|
126 |
+
%63 = arith.subf %62, %41 : tensor<64x4xf32>
|
127 |
+
%64 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
|
128 |
+
%65 = tt.broadcast %64 : (tensor<64x1xf32>) -> tensor<64x4xf32>
|
129 |
+
%66 = arith.mulf %63, %65 : tensor<64x4xf32>
|
130 |
+
%67 = tt.broadcast %56 : (tensor<1x4xf32>) -> tensor<64x4xf32>
|
131 |
+
%68 = arith.mulf %66, %67 : tensor<64x4xf32>
|
132 |
+
%69 = arith.addi %50, %45 : tensor<64x4xi32>
|
133 |
+
%70 = tt.addptr %46, %69 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi32>
|
134 |
+
%71 = arith.truncf %68 : tensor<64x4xf32> to tensor<64x4xbf16>
|
135 |
+
tt.store %70, %71, %53 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16>
|
136 |
+
}
|
137 |
+
tt.return
|
138 |
+
}
|
139 |
+
}
|
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.llir
ADDED
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3de4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
|
7 |
+
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%7 = and i32 %6, 31, !dbg !8
|
9 |
+
%8 = lshr i32 %6, 5, !dbg !8
|
10 |
+
%9 = shl i32 %6, 2, !dbg !8
|
11 |
+
%10 = and i32 %9, 60, !dbg !8
|
12 |
+
%11 = and i32 %8, 3, !dbg !9
|
13 |
+
%12 = lshr i32 %7, 4, !dbg !9
|
14 |
+
%13 = shl nuw nsw i32 %11, 1, !dbg !9
|
15 |
+
%14 = or i32 %13, %12, !dbg !9
|
16 |
+
%15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
|
17 |
+
%16 = shl i32 %15, 6, !dbg !11
|
18 |
+
%17 = or i32 %16, %10, !dbg !12
|
19 |
+
%.frozen = freeze i32 %17
|
20 |
+
%18 = sdiv i32 %.frozen, 256, !dbg !13
|
21 |
+
%19 = mul i32 %18, 256
|
22 |
+
%.decomposed = sub i32 %.frozen, %19
|
23 |
+
%20 = shl i32 %18, 15, !dbg !14
|
24 |
+
%21 = add i32 %20, %.decomposed
|
25 |
+
br label %22, !dbg !15
|
26 |
+
|
27 |
+
22: ; preds = %5, %22
|
28 |
+
%23 = phi i32 [ 0, %5 ], [ %53, %22 ]
|
29 |
+
%24 = phi <4 x float> [ zeroinitializer, %5 ], [ %52, %22 ]
|
30 |
+
%25 = or i32 %23, %14, !dbg !16
|
31 |
+
%26 = shl i32 %25, 8, !dbg !17
|
32 |
+
%27 = add i32 %21, %26, !dbg !18
|
33 |
+
%28 = sext i32 %27 to i64, !dbg !19
|
34 |
+
%29 = getelementptr float, ptr addrspace(1) %0, i64 %28, !dbg !19
|
35 |
+
%30 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %29, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20
|
36 |
+
%31 = extractvalue { i32, i32, i32, i32 } %30, 0, !dbg !20
|
37 |
+
%32 = extractvalue { i32, i32, i32, i32 } %30, 1, !dbg !20
|
38 |
+
%33 = extractvalue { i32, i32, i32, i32 } %30, 2, !dbg !20
|
39 |
+
%34 = extractvalue { i32, i32, i32, i32 } %30, 3, !dbg !20
|
40 |
+
%35 = getelementptr float, ptr addrspace(1) %1, i64 %28, !dbg !21
|
41 |
+
%36 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !22
|
42 |
+
%37 = extractvalue { i32, i32, i32, i32 } %36, 0, !dbg !22
|
43 |
+
%38 = extractvalue { i32, i32, i32, i32 } %36, 1, !dbg !22
|
44 |
+
%39 = extractvalue { i32, i32, i32, i32 } %36, 2, !dbg !22
|
45 |
+
%40 = extractvalue { i32, i32, i32, i32 } %36, 3, !dbg !22
|
46 |
+
%41 = insertelement <4 x i32> poison, i32 %31, i64 0, !dbg !20
|
47 |
+
%42 = insertelement <4 x i32> %41, i32 %32, i64 1, !dbg !20
|
48 |
+
%43 = insertelement <4 x i32> %42, i32 %33, i64 2, !dbg !20
|
49 |
+
%44 = insertelement <4 x i32> %43, i32 %34, i64 3, !dbg !20
|
50 |
+
%45 = bitcast <4 x i32> %44 to <4 x float>, !dbg !20
|
51 |
+
%46 = insertelement <4 x i32> poison, i32 %37, i64 0, !dbg !22
|
52 |
+
%47 = insertelement <4 x i32> %46, i32 %38, i64 1, !dbg !22
|
53 |
+
%48 = insertelement <4 x i32> %47, i32 %39, i64 2, !dbg !22
|
54 |
+
%49 = insertelement <4 x i32> %48, i32 %40, i64 3, !dbg !22
|
55 |
+
%50 = bitcast <4 x i32> %49 to <4 x float>, !dbg !22
|
56 |
+
%51 = fmul <4 x float> %45, %50, !dbg !23
|
57 |
+
%52 = fadd <4 x float> %24, %51, !dbg !24
|
58 |
+
%53 = add nuw nsw i32 %23, 8, !dbg !15
|
59 |
+
%54 = icmp ult i32 %23, 120, !dbg !15
|
60 |
+
br i1 %54, label %22, label %55, !dbg !15
|
61 |
+
|
62 |
+
55: ; preds = %22
|
63 |
+
%56 = and i32 %6, 63, !dbg !8
|
64 |
+
%57 = or i32 %16, %56, !dbg !12
|
65 |
+
%58 = or i32 %10, 3, !dbg !25
|
66 |
+
%59 = or i32 %10, 2, !dbg !25
|
67 |
+
%60 = or i32 %10, 1, !dbg !25
|
68 |
+
%61 = extractelement <4 x float> %52, i64 0, !dbg !25
|
69 |
+
%62 = bitcast float %61 to i32, !dbg !25
|
70 |
+
%63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 16, i32 31), !dbg !25
|
71 |
+
%64 = bitcast i32 %63 to float, !dbg !25
|
72 |
+
%65 = fadd float %61, %64, !dbg !29
|
73 |
+
%66 = extractelement <4 x float> %52, i64 1, !dbg !25
|
74 |
+
%67 = bitcast float %66 to i32, !dbg !25
|
75 |
+
%68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 16, i32 31), !dbg !25
|
76 |
+
%69 = bitcast i32 %68 to float, !dbg !25
|
77 |
+
%70 = fadd float %66, %69, !dbg !29
|
78 |
+
%71 = extractelement <4 x float> %52, i64 2, !dbg !25
|
79 |
+
%72 = bitcast float %71 to i32, !dbg !25
|
80 |
+
%73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 16, i32 31), !dbg !25
|
81 |
+
%74 = bitcast i32 %73 to float, !dbg !25
|
82 |
+
%75 = fadd float %71, %74, !dbg !29
|
83 |
+
%76 = extractelement <4 x float> %52, i64 3, !dbg !25
|
84 |
+
%77 = bitcast float %76 to i32, !dbg !25
|
85 |
+
%78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 16, i32 31), !dbg !25
|
86 |
+
%79 = bitcast i32 %78 to float, !dbg !25
|
87 |
+
%80 = fadd float %76, %79, !dbg !29
|
88 |
+
%81 = icmp ult i32 %7, 16, !dbg !25
|
89 |
+
%82 = shl nuw nsw i32 %10, 2, !dbg !25
|
90 |
+
%83 = or i32 %82, %11, !dbg !25
|
91 |
+
%84 = zext nneg i32 %83 to i64, !dbg !25
|
92 |
+
%85 = getelementptr float, ptr addrspace(3) @global_smem, i64 %84, !dbg !25
|
93 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, float %65, i1 %81) #3, !dbg !25
|
94 |
+
%86 = shl nuw nsw i32 %60, 2, !dbg !25
|
95 |
+
%87 = or i32 %86, %11, !dbg !25
|
96 |
+
%88 = zext nneg i32 %87 to i64, !dbg !25
|
97 |
+
%89 = getelementptr float, ptr addrspace(3) @global_smem, i64 %88, !dbg !25
|
98 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %89, float %70, i1 %81) #3, !dbg !25
|
99 |
+
%90 = shl nuw nsw i32 %59, 2, !dbg !25
|
100 |
+
%91 = or i32 %90, %11, !dbg !25
|
101 |
+
%92 = zext nneg i32 %91 to i64, !dbg !25
|
102 |
+
%93 = getelementptr float, ptr addrspace(3) @global_smem, i64 %92, !dbg !25
|
103 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %93, float %75, i1 %81) #3, !dbg !25
|
104 |
+
%94 = shl nuw nsw i32 %58, 2, !dbg !25
|
105 |
+
%95 = or i32 %94, %11, !dbg !25
|
106 |
+
%96 = zext nneg i32 %95 to i64, !dbg !25
|
107 |
+
%97 = getelementptr float, ptr addrspace(3) @global_smem, i64 %96, !dbg !25
|
108 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %97, float %80, i1 %81) #3, !dbg !25
|
109 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !25
|
110 |
+
%98 = icmp slt i32 %6, 256, !dbg !25
|
111 |
+
%99 = sext i32 %6 to i64, !dbg !25
|
112 |
+
%100 = getelementptr float, ptr addrspace(3) @global_smem, i64 %99, !dbg !25
|
113 |
+
%101 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %100, i1 %98) #3, !dbg !25
|
114 |
+
%102 = bitcast float %101 to i32, !dbg !25
|
115 |
+
%103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 2, i32 31), !dbg !25
|
116 |
+
%104 = bitcast i32 %103 to float, !dbg !25
|
117 |
+
%105 = fadd float %101, %104, !dbg !29
|
118 |
+
%106 = bitcast float %105 to i32, !dbg !25
|
119 |
+
%107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 1, i32 31), !dbg !25
|
120 |
+
%108 = bitcast i32 %107 to float, !dbg !25
|
121 |
+
%109 = fadd float %105, %108, !dbg !29
|
122 |
+
%110 = and i32 %6, 3, !dbg !25
|
123 |
+
%111 = icmp eq i32 %110, 0, !dbg !25
|
124 |
+
%112 = and i1 %98, %111, !dbg !25
|
125 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %109, i1 %112) #3, !dbg !25
|
126 |
+
%113 = add i32 %6, 128, !dbg !25
|
127 |
+
%114 = sext i32 %113 to i64, !dbg !25
|
128 |
+
%115 = getelementptr float, ptr addrspace(3) @global_smem, i64 %114, !dbg !25
|
129 |
+
%116 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %115, i1 %98) #3, !dbg !25
|
130 |
+
%117 = bitcast float %116 to i32, !dbg !25
|
131 |
+
%118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 2, i32 31), !dbg !25
|
132 |
+
%119 = bitcast i32 %118 to float, !dbg !25
|
133 |
+
%120 = fadd float %116, %119, !dbg !29
|
134 |
+
%121 = bitcast float %120 to i32, !dbg !25
|
135 |
+
%122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 1, i32 31), !dbg !25
|
136 |
+
%123 = bitcast i32 %122 to float, !dbg !25
|
137 |
+
%124 = fadd float %120, %123, !dbg !29
|
138 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %115, float %124, i1 %112) #3, !dbg !25
|
139 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !25
|
140 |
+
%125 = zext nneg i32 %82 to i64, !dbg !25
|
141 |
+
%126 = getelementptr float, ptr addrspace(3) @global_smem, i64 %125, !dbg !25
|
142 |
+
%127 = load float, ptr addrspace(3) %126, align 4, !dbg !25
|
143 |
+
%128 = zext nneg i32 %86 to i64, !dbg !25
|
144 |
+
%129 = getelementptr float, ptr addrspace(3) @global_smem, i64 %128, !dbg !25
|
145 |
+
%130 = load float, ptr addrspace(3) %129, align 4, !dbg !25
|
146 |
+
%131 = zext nneg i32 %90 to i64, !dbg !25
|
147 |
+
%132 = getelementptr float, ptr addrspace(3) @global_smem, i64 %131, !dbg !25
|
148 |
+
%133 = load float, ptr addrspace(3) %132, align 4, !dbg !25
|
149 |
+
%134 = zext nneg i32 %94 to i64, !dbg !25
|
150 |
+
%135 = getelementptr float, ptr addrspace(3) @global_smem, i64 %134, !dbg !25
|
151 |
+
%136 = load float, ptr addrspace(3) %135, align 4, !dbg !25
|
152 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !33
|
153 |
+
%137 = zext nneg i32 %10 to i64, !dbg !33
|
154 |
+
%138 = getelementptr float, ptr addrspace(3) @global_smem, i64 %137, !dbg !33
|
155 |
+
%139 = insertelement <1 x float> undef, float %127, i64 0, !dbg !33
|
156 |
+
store <1 x float> %139, ptr addrspace(3) %138, align 4, !dbg !33
|
157 |
+
%140 = zext nneg i32 %60 to i64, !dbg !33
|
158 |
+
%141 = getelementptr float, ptr addrspace(3) @global_smem, i64 %140, !dbg !33
|
159 |
+
%142 = insertelement <1 x float> undef, float %130, i64 0, !dbg !33
|
160 |
+
store <1 x float> %142, ptr addrspace(3) %141, align 4, !dbg !33
|
161 |
+
%143 = zext nneg i32 %59 to i64, !dbg !33
|
162 |
+
%144 = getelementptr float, ptr addrspace(3) @global_smem, i64 %143, !dbg !33
|
163 |
+
%145 = insertelement <1 x float> undef, float %133, i64 0, !dbg !33
|
164 |
+
store <1 x float> %145, ptr addrspace(3) %144, align 4, !dbg !33
|
165 |
+
%146 = zext nneg i32 %58 to i64, !dbg !33
|
166 |
+
%147 = getelementptr float, ptr addrspace(3) @global_smem, i64 %146, !dbg !33
|
167 |
+
%148 = insertelement <1 x float> undef, float %136, i64 0, !dbg !33
|
168 |
+
store <1 x float> %148, ptr addrspace(3) %147, align 4, !dbg !33
|
169 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !33
|
170 |
+
%149 = zext nneg i32 %56 to i64, !dbg !33
|
171 |
+
%150 = getelementptr float, ptr addrspace(3) @global_smem, i64 %149, !dbg !33
|
172 |
+
%151 = load i32, ptr addrspace(3) %150, align 4, !dbg !33
|
173 |
+
%152 = sext i32 %57 to i64, !dbg !34
|
174 |
+
%153 = getelementptr float, ptr addrspace(1) %2, i64 %152, !dbg !34
|
175 |
+
%154 = and i32 %6, 64, !dbg !35
|
176 |
+
%155 = icmp eq i32 %154, 0, !dbg !35
|
177 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %151, ptr addrspace(1) %153, i1 %155) #3, !dbg !35
|
178 |
+
ret void, !dbg !36
|
179 |
+
}
|
180 |
+
|
181 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
182 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
183 |
+
|
184 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
185 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
186 |
+
|
187 |
+
; Function Attrs: convergent nocallback nounwind
|
188 |
+
declare void @llvm.nvvm.barrier0() #2
|
189 |
+
|
190 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
191 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
192 |
+
attributes #2 = { convergent nocallback nounwind }
|
193 |
+
attributes #3 = { nounwind }
|
194 |
+
|
195 |
+
!llvm.module.flags = !{!0}
|
196 |
+
!llvm.dbg.cu = !{!1}
|
197 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
198 |
+
|
199 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
200 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
201 |
+
!2 = !DIFile(filename: "cqdvltndxc7vwj5j5dnsb73tk763gajftjwvmbfq7i6sitk5gwoy.py", directory: "/tmp/torchinductor_root/qd")
|
202 |
+
!3 = !{ptr @triton__0d1d2d3de4de, !"kernel", i32 1}
|
203 |
+
!4 = !{ptr @triton__0d1d2d3de4de, !"maxntidx", i32 128}
|
204 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4de", linkageName: "triton__0d1d2d3de4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
205 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
206 |
+
!7 = !{}
|
207 |
+
!8 = !DILocation(line: 22, column: 44, scope: !5)
|
208 |
+
!9 = !DILocation(line: 24, column: 33, scope: !5)
|
209 |
+
!10 = !DILocation(line: 21, column: 28, scope: !5)
|
210 |
+
!11 = !DILocation(line: 21, column: 33, scope: !5)
|
211 |
+
!12 = !DILocation(line: 22, column: 23, scope: !5)
|
212 |
+
!13 = !DILocation(line: 26, column: 20, scope: !5)
|
213 |
+
!14 = !DILocation(line: 33, column: 57, scope: !5)
|
214 |
+
!15 = !DILocation(line: 29, column: 36, scope: !5)
|
215 |
+
!16 = !DILocation(line: 30, column: 27, scope: !5)
|
216 |
+
!17 = !DILocation(line: 33, column: 44, scope: !5)
|
217 |
+
!18 = !DILocation(line: 33, column: 51, scope: !5)
|
218 |
+
!19 = !DILocation(line: 33, column: 34, scope: !5)
|
219 |
+
!20 = !DILocation(line: 33, column: 63, scope: !5)
|
220 |
+
!21 = !DILocation(line: 34, column: 34, scope: !5)
|
221 |
+
!22 = !DILocation(line: 34, column: 63, scope: !5)
|
222 |
+
!23 = !DILocation(line: 35, column: 22, scope: !5)
|
223 |
+
!24 = !DILocation(line: 38, column: 38, scope: !5)
|
224 |
+
!25 = !DILocation(line: 243, column: 36, scope: !26, inlinedAt: !28)
|
225 |
+
!26 = distinct !DILexicalBlockFile(scope: !5, file: !27, discriminator: 0)
|
226 |
+
!27 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
227 |
+
!28 = !DILocation(line: 39, column: 25, scope: !26)
|
228 |
+
!29 = !DILocation(line: 233, column: 15, scope: !30, inlinedAt: !31)
|
229 |
+
!30 = distinct !DILexicalBlockFile(scope: !26, file: !27, discriminator: 0)
|
230 |
+
!31 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !32)
|
231 |
+
!32 = !DILocation(line: 39, column: 25, scope: !30)
|
232 |
+
!33 = !DILocation(line: 39, column: 28, scope: !5)
|
233 |
+
!34 = !DILocation(line: 40, column: 25, scope: !5)
|
234 |
+
!35 = !DILocation(line: 40, column: 36, scope: !5)
|
235 |
+
!36 = !DILocation(line: 40, column: 4, scope: !5)
|
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ptx
ADDED
@@ -0,0 +1,572 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3de4de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3de4de(
|
13 |
+
.param .u64 triton__0d1d2d3de4de_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3de4de_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3de4de_param_2,
|
16 |
+
.param .u32 triton__0d1d2d3de4de_param_3,
|
17 |
+
.param .u32 triton__0d1d2d3de4de_param_4
|
18 |
+
)
|
19 |
+
.maxntid 128, 1, 1
|
20 |
+
{
|
21 |
+
.reg .pred %p<22>;
|
22 |
+
.reg .b32 %r<98>;
|
23 |
+
.reg .f32 %f<47>;
|
24 |
+
.reg .b64 %rd<9>;
|
25 |
+
.loc 1 18 0
|
26 |
+
$L__func_begin0:
|
27 |
+
.loc 1 18 0
|
28 |
+
|
29 |
+
ld.param.u64 %rd3, [triton__0d1d2d3de4de_param_2];
|
30 |
+
ld.param.u64 %rd2, [triton__0d1d2d3de4de_param_1];
|
31 |
+
ld.param.u64 %rd1, [triton__0d1d2d3de4de_param_0];
|
32 |
+
$L__tmp0:
|
33 |
+
.loc 1 22 44
|
34 |
+
mov.u32 %r1, %tid.x;
|
35 |
+
and.b32 %r2, %r1, 31;
|
36 |
+
shl.b32 %r13, %r1, 2;
|
37 |
+
and.b32 %r3, %r13, 60;
|
38 |
+
.loc 1 24 33
|
39 |
+
bfe.u32 %r4, %r1, 5, 2;
|
40 |
+
.loc 1 21 28
|
41 |
+
mov.u32 %r11, %ctaid.x;
|
42 |
+
.loc 1 21 33
|
43 |
+
shl.b32 %r5, %r11, 6;
|
44 |
+
.loc 1 22 23
|
45 |
+
or.b32 %r14, %r5, %r3;
|
46 |
+
.loc 1 26 20
|
47 |
+
shr.s32 %r16, %r14, 31;
|
48 |
+
shr.u32 %r17, %r16, 24;
|
49 |
+
add.s32 %r18, %r14, %r17;
|
50 |
+
shr.s32 %r19, %r18, 8;
|
51 |
+
.loc 1 29 36
|
52 |
+
mad.lo.s32 %r20, %r19, 32512, %r14;
|
53 |
+
shl.b32 %r21, %r4, 9;
|
54 |
+
add.s32 %r22, %r20, %r21;
|
55 |
+
shl.b32 %r23, %r1, 4;
|
56 |
+
and.b32 %r24, %r23, 256;
|
57 |
+
add.s32 %r96, %r22, %r24;
|
58 |
+
mov.f32 %f43, 0f00000000;
|
59 |
+
mov.b32 %r97, -8;
|
60 |
+
mov.pred %p1, -1;
|
61 |
+
mov.f32 %f44, %f43;
|
62 |
+
mov.f32 %f45, %f43;
|
63 |
+
mov.f32 %f46, %f43;
|
64 |
+
$L__BB0_1:
|
65 |
+
.loc 1 33 34
|
66 |
+
mul.wide.s32 %rd6, %r96, 4;
|
67 |
+
add.s64 %rd4, %rd1, %rd6;
|
68 |
+
mov.b32 %r29, 0;
|
69 |
+
.loc 1 33 63
|
70 |
+
mov.u32 %r25, 0x0;
|
71 |
+
mov.u32 %r26, 0x0;
|
72 |
+
mov.u32 %r27, 0x0;
|
73 |
+
mov.u32 %r28, 0x0;
|
74 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd4 + 0 ];
|
75 |
+
@!%p1 mov.u32 %r25, %r29;
|
76 |
+
@!%p1 mov.u32 %r26, %r29;
|
77 |
+
@!%p1 mov.u32 %r27, %r29;
|
78 |
+
@!%p1 mov.u32 %r28, %r29;
|
79 |
+
.loc 1 34 34
|
80 |
+
add.s64 %rd5, %rd2, %rd6;
|
81 |
+
.loc 1 34 63
|
82 |
+
mov.u32 %r33, 0x0;
|
83 |
+
mov.u32 %r34, 0x0;
|
84 |
+
mov.u32 %r35, 0x0;
|
85 |
+
mov.u32 %r36, 0x0;
|
86 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd5 + 0 ];
|
87 |
+
@!%p1 mov.u32 %r33, %r29;
|
88 |
+
@!%p1 mov.u32 %r34, %r29;
|
89 |
+
@!%p1 mov.u32 %r35, %r29;
|
90 |
+
@!%p1 mov.u32 %r36, %r29;
|
91 |
+
.loc 1 33 63
|
92 |
+
mov.b32 %f13, %r25;
|
93 |
+
mov.b32 %f14, %r26;
|
94 |
+
mov.b32 %f15, %r27;
|
95 |
+
mov.b32 %f16, %r28;
|
96 |
+
.loc 1 34 63
|
97 |
+
mov.b32 %f17, %r33;
|
98 |
+
mov.b32 %f18, %r34;
|
99 |
+
mov.b32 %f19, %r35;
|
100 |
+
mov.b32 %f20, %r36;
|
101 |
+
.loc 1 38 38
|
102 |
+
fma.rn.f32 %f46, %f16, %f20, %f46;
|
103 |
+
fma.rn.f32 %f45, %f15, %f19, %f45;
|
104 |
+
fma.rn.f32 %f44, %f14, %f18, %f44;
|
105 |
+
fma.rn.f32 %f43, %f13, %f17, %f43;
|
106 |
+
.loc 1 29 36
|
107 |
+
add.s32 %r97, %r97, 8;
|
108 |
+
add.s32 %r96, %r96, 2048;
|
109 |
+
setp.lt.u32 %p11, %r97, 120;
|
110 |
+
@%p11 bra $L__BB0_1;
|
111 |
+
.loc 1 22 44
|
112 |
+
and.b32 %r58, %r1, 63;
|
113 |
+
.loc 1 22 23
|
114 |
+
or.b32 %r59, %r5, %r58;
|
115 |
+
$L__tmp1:
|
116 |
+
.loc 2 243 36
|
117 |
+
mov.b32 %r60, %f43;
|
118 |
+
shfl.sync.bfly.b32 %r61, %r60, 16, 31, -1;
|
119 |
+
mov.b32 %f21, %r61;
|
120 |
+
$L__tmp2:
|
121 |
+
.loc 2 233 15
|
122 |
+
add.f32 %f22, %f43, %f21;
|
123 |
+
$L__tmp3:
|
124 |
+
.loc 2 243 36
|
125 |
+
mov.b32 %r62, %f44;
|
126 |
+
shfl.sync.bfly.b32 %r63, %r62, 16, 31, -1;
|
127 |
+
mov.b32 %f23, %r63;
|
128 |
+
$L__tmp4:
|
129 |
+
.loc 2 233 15
|
130 |
+
add.f32 %f24, %f44, %f23;
|
131 |
+
$L__tmp5:
|
132 |
+
.loc 2 243 36
|
133 |
+
mov.b32 %r64, %f45;
|
134 |
+
shfl.sync.bfly.b32 %r65, %r64, 16, 31, -1;
|
135 |
+
mov.b32 %f25, %r65;
|
136 |
+
$L__tmp6:
|
137 |
+
.loc 2 233 15
|
138 |
+
add.f32 %f26, %f45, %f25;
|
139 |
+
$L__tmp7:
|
140 |
+
.loc 2 243 36
|
141 |
+
mov.b32 %r66, %f46;
|
142 |
+
shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1;
|
143 |
+
mov.b32 %f27, %r67;
|
144 |
+
$L__tmp8:
|
145 |
+
.loc 2 233 15
|
146 |
+
add.f32 %f28, %f46, %f27;
|
147 |
+
$L__tmp9:
|
148 |
+
.loc 2 243 36
|
149 |
+
setp.lt.u32 %p12, %r2, 16;
|
150 |
+
shl.b32 %r68, %r3, 2;
|
151 |
+
or.b32 %r69, %r68, %r4;
|
152 |
+
shl.b32 %r70, %r69, 2;
|
153 |
+
mov.u32 %r71, global_smem;
|
154 |
+
add.s32 %r41, %r71, %r70;
|
155 |
+
mov.b32 %r42, %f22;
|
156 |
+
@%p12 st.shared.b32 [ %r41 + 0 ], %r42;
|
157 |
+
shl.b32 %r72, %r4, 2;
|
158 |
+
shl.b32 %r73, %r3, 4;
|
159 |
+
or.b32 %r74, %r73, 16;
|
160 |
+
or.b32 %r75, %r74, %r72;
|
161 |
+
add.s32 %r43, %r71, %r75;
|
162 |
+
mov.b32 %r44, %f24;
|
163 |
+
@%p12 st.shared.b32 [ %r43 + 0 ], %r44;
|
164 |
+
or.b32 %r76, %r73, 32;
|
165 |
+
or.b32 %r77, %r76, %r72;
|
166 |
+
add.s32 %r45, %r71, %r77;
|
167 |
+
mov.b32 %r46, %f26;
|
168 |
+
@%p12 st.shared.b32 [ %r45 + 0 ], %r46;
|
169 |
+
or.b32 %r78, %r73, 48;
|
170 |
+
or.b32 %r79, %r78, %r72;
|
171 |
+
add.s32 %r47, %r71, %r79;
|
172 |
+
mov.b32 %r48, %f28;
|
173 |
+
@%p12 st.shared.b32 [ %r47 + 0 ], %r48;
|
174 |
+
bar.sync 0;
|
175 |
+
setp.lt.s32 %p16, %r1, 256;
|
176 |
+
add.s32 %r50, %r71, %r13;
|
177 |
+
@%p16 ld.shared.b32 %r49, [ %r50 + 0 ];
|
178 |
+
mov.b32 %f29, %r49;
|
179 |
+
shfl.sync.bfly.b32 %r81, %r49, 2, 31, -1;
|
180 |
+
mov.b32 %f30, %r81;
|
181 |
+
$L__tmp10:
|
182 |
+
.loc 2 233 15
|
183 |
+
add.f32 %f31, %f29, %f30;
|
184 |
+
$L__tmp11:
|
185 |
+
.loc 2 243 36
|
186 |
+
mov.b32 %r82, %f31;
|
187 |
+
shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1;
|
188 |
+
mov.b32 %f32, %r83;
|
189 |
+
$L__tmp12:
|
190 |
+
.loc 2 233 15
|
191 |
+
add.f32 %f33, %f31, %f32;
|
192 |
+
$L__tmp13:
|
193 |
+
.loc 2 243 36
|
194 |
+
and.b32 %r84, %r1, 3;
|
195 |
+
setp.eq.s32 %p21, %r84, 0;
|
196 |
+
and.pred %p17, %p16, %p21;
|
197 |
+
mov.b32 %r52, %f33;
|
198 |
+
@%p17 st.shared.b32 [ %r50 + 0 ], %r52;
|
199 |
+
add.s32 %r54, %r50, 512;
|
200 |
+
@%p16 ld.shared.b32 %r53, [ %r54 + 0 ];
|
201 |
+
mov.b32 %f34, %r53;
|
202 |
+
shfl.sync.bfly.b32 %r85, %r53, 2, 31, -1;
|
203 |
+
mov.b32 %f35, %r85;
|
204 |
+
$L__tmp14:
|
205 |
+
.loc 2 233 15
|
206 |
+
add.f32 %f36, %f34, %f35;
|
207 |
+
$L__tmp15:
|
208 |
+
.loc 2 243 36
|
209 |
+
mov.b32 %r86, %f36;
|
210 |
+
shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1;
|
211 |
+
mov.b32 %f37, %r87;
|
212 |
+
$L__tmp16:
|
213 |
+
.loc 2 233 15
|
214 |
+
add.f32 %f38, %f36, %f37;
|
215 |
+
$L__tmp17:
|
216 |
+
.loc 2 243 36
|
217 |
+
mov.b32 %r56, %f38;
|
218 |
+
@%p17 st.shared.b32 [ %r54 + 0 ], %r56;
|
219 |
+
bar.sync 0;
|
220 |
+
add.s32 %r88, %r71, %r73;
|
221 |
+
ld.shared.f32 %f39, [%r88];
|
222 |
+
add.s32 %r89, %r71, %r74;
|
223 |
+
ld.shared.f32 %f40, [%r89];
|
224 |
+
add.s32 %r90, %r71, %r76;
|
225 |
+
ld.shared.f32 %f41, [%r90];
|
226 |
+
add.s32 %r91, %r71, %r78;
|
227 |
+
ld.shared.f32 %f42, [%r91];
|
228 |
+
$L__tmp18:
|
229 |
+
.loc 1 39 28
|
230 |
+
bar.sync 0;
|
231 |
+
add.s32 %r92, %r71, %r68;
|
232 |
+
st.shared.f32 [%r92], %f39;
|
233 |
+
st.shared.f32 [%r92+4], %f40;
|
234 |
+
st.shared.f32 [%r92+8], %f41;
|
235 |
+
st.shared.f32 [%r92+12], %f42;
|
236 |
+
bar.sync 0;
|
237 |
+
shl.b32 %r93, %r58, 2;
|
238 |
+
add.s32 %r94, %r71, %r93;
|
239 |
+
ld.shared.u32 %r57, [%r94];
|
240 |
+
.loc 1 40 25
|
241 |
+
mul.wide.s32 %rd8, %r59, 4;
|
242 |
+
add.s64 %rd7, %rd3, %rd8;
|
243 |
+
.loc 1 40 36
|
244 |
+
and.b32 %r95, %r1, 64;
|
245 |
+
setp.eq.s32 %p20, %r95, 0;
|
246 |
+
@%p20 st.global.b32 [ %rd7 + 0 ], { %r57 };
|
247 |
+
.loc 1 40 4
|
248 |
+
ret;
|
249 |
+
$L__tmp19:
|
250 |
+
$L__func_end0:
|
251 |
+
|
252 |
+
}
|
253 |
+
.file 1 "/tmp/torchinductor_root/qd/cqdvltndxc7vwj5j5dnsb73tk763gajftjwvmbfq7i6sitk5gwoy.py"
|
254 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
255 |
+
.section .debug_abbrev
|
256 |
+
{
|
257 |
+
.b8 1
|
258 |
+
.b8 17
|
259 |
+
.b8 1
|
260 |
+
.b8 37
|
261 |
+
.b8 8
|
262 |
+
.b8 19
|
263 |
+
.b8 5
|
264 |
+
.b8 3
|
265 |
+
.b8 8
|
266 |
+
.b8 16
|
267 |
+
.b8 6
|
268 |
+
.b8 27
|
269 |
+
.b8 8
|
270 |
+
.b8 180
|
271 |
+
.b8 66
|
272 |
+
.b8 12
|
273 |
+
.b8 17
|
274 |
+
.b8 1
|
275 |
+
.b8 18
|
276 |
+
.b8 1
|
277 |
+
.b8 0
|
278 |
+
.b8 0
|
279 |
+
.b8 2
|
280 |
+
.b8 46
|
281 |
+
.b8 0
|
282 |
+
.b8 135
|
283 |
+
.b8 64
|
284 |
+
.b8 8
|
285 |
+
.b8 3
|
286 |
+
.b8 8
|
287 |
+
.b8 58
|
288 |
+
.b8 11
|
289 |
+
.b8 59
|
290 |
+
.b8 11
|
291 |
+
.b8 63
|
292 |
+
.b8 12
|
293 |
+
.b8 32
|
294 |
+
.b8 11
|
295 |
+
.b8 0
|
296 |
+
.b8 0
|
297 |
+
.b8 3
|
298 |
+
.b8 46
|
299 |
+
.b8 1
|
300 |
+
.b8 17
|
301 |
+
.b8 1
|
302 |
+
.b8 18
|
303 |
+
.b8 1
|
304 |
+
.b8 64
|
305 |
+
.b8 10
|
306 |
+
.b8 49
|
307 |
+
.b8 19
|
308 |
+
.b8 0
|
309 |
+
.b8 0
|
310 |
+
.b8 4
|
311 |
+
.b8 29
|
312 |
+
.b8 0
|
313 |
+
.b8 49
|
314 |
+
.b8 19
|
315 |
+
.b8 17
|
316 |
+
.b8 1
|
317 |
+
.b8 18
|
318 |
+
.b8 1
|
319 |
+
.b8 88
|
320 |
+
.b8 11
|
321 |
+
.b8 89
|
322 |
+
.b8 11
|
323 |
+
.b8 87
|
324 |
+
.b8 11
|
325 |
+
.b8 0
|
326 |
+
.b8 0
|
327 |
+
.b8 5
|
328 |
+
.b8 29
|
329 |
+
.b8 1
|
330 |
+
.b8 49
|
331 |
+
.b8 19
|
332 |
+
.b8 17
|
333 |
+
.b8 1
|
334 |
+
.b8 18
|
335 |
+
.b8 1
|
336 |
+
.b8 88
|
337 |
+
.b8 11
|
338 |
+
.b8 89
|
339 |
+
.b8 11
|
340 |
+
.b8 87
|
341 |
+
.b8 11
|
342 |
+
.b8 0
|
343 |
+
.b8 0
|
344 |
+
.b8 0
|
345 |
+
}
|
346 |
+
.section .debug_info
|
347 |
+
{
|
348 |
+
.b32 266
|
349 |
+
.b8 2
|
350 |
+
.b8 0
|
351 |
+
.b32 .debug_abbrev
|
352 |
+
.b8 8
|
353 |
+
.b8 1
|
354 |
+
.b8 116
|
355 |
+
.b8 114
|
356 |
+
.b8 105
|
357 |
+
.b8 116
|
358 |
+
.b8 111
|
359 |
+
.b8 110
|
360 |
+
.b8 0
|
361 |
+
.b8 2
|
362 |
+
.b8 0
|
363 |
+
.b8 99
|
364 |
+
.b8 113
|
365 |
+
.b8 100
|
366 |
+
.b8 118
|
367 |
+
.b8 108
|
368 |
+
.b8 116
|
369 |
+
.b8 110
|
370 |
+
.b8 100
|
371 |
+
.b8 120
|
372 |
+
.b8 99
|
373 |
+
.b8 55
|
374 |
+
.b8 118
|
375 |
+
.b8 119
|
376 |
+
.b8 106
|
377 |
+
.b8 53
|
378 |
+
.b8 106
|
379 |
+
.b8 53
|
380 |
+
.b8 100
|
381 |
+
.b8 110
|
382 |
+
.b8 115
|
383 |
+
.b8 98
|
384 |
+
.b8 55
|
385 |
+
.b8 51
|
386 |
+
.b8 116
|
387 |
+
.b8 107
|
388 |
+
.b8 55
|
389 |
+
.b8 54
|
390 |
+
.b8 51
|
391 |
+
.b8 103
|
392 |
+
.b8 97
|
393 |
+
.b8 106
|
394 |
+
.b8 102
|
395 |
+
.b8 116
|
396 |
+
.b8 106
|
397 |
+
.b8 119
|
398 |
+
.b8 118
|
399 |
+
.b8 109
|
400 |
+
.b8 98
|
401 |
+
.b8 102
|
402 |
+
.b8 113
|
403 |
+
.b8 55
|
404 |
+
.b8 105
|
405 |
+
.b8 54
|
406 |
+
.b8 115
|
407 |
+
.b8 105
|
408 |
+
.b8 116
|
409 |
+
.b8 107
|
410 |
+
.b8 53
|
411 |
+
.b8 103
|
412 |
+
.b8 119
|
413 |
+
.b8 111
|
414 |
+
.b8 121
|
415 |
+
.b8 46
|
416 |
+
.b8 112
|
417 |
+
.b8 121
|
418 |
+
.b8 0
|
419 |
+
.b32 .debug_line
|
420 |
+
.b8 47
|
421 |
+
.b8 116
|
422 |
+
.b8 109
|
423 |
+
.b8 112
|
424 |
+
.b8 47
|
425 |
+
.b8 116
|
426 |
+
.b8 111
|
427 |
+
.b8 114
|
428 |
+
.b8 99
|
429 |
+
.b8 104
|
430 |
+
.b8 105
|
431 |
+
.b8 110
|
432 |
+
.b8 100
|
433 |
+
.b8 117
|
434 |
+
.b8 99
|
435 |
+
.b8 116
|
436 |
+
.b8 111
|
437 |
+
.b8 114
|
438 |
+
.b8 95
|
439 |
+
.b8 114
|
440 |
+
.b8 111
|
441 |
+
.b8 111
|
442 |
+
.b8 116
|
443 |
+
.b8 47
|
444 |
+
.b8 113
|
445 |
+
.b8 100
|
446 |
+
.b8 0
|
447 |
+
.b8 1
|
448 |
+
.b64 $L__func_begin0
|
449 |
+
.b64 $L__func_end0
|
450 |
+
.b8 2
|
451 |
+
.b8 116
|
452 |
+
.b8 114
|
453 |
+
.b8 105
|
454 |
+
.b8 116
|
455 |
+
.b8 111
|
456 |
+
.b8 110
|
457 |
+
.b8 95
|
458 |
+
.b8 95
|
459 |
+
.b8 48
|
460 |
+
.b8 100
|
461 |
+
.b8 49
|
462 |
+
.b8 100
|
463 |
+
.b8 50
|
464 |
+
.b8 100
|
465 |
+
.b8 51
|
466 |
+
.b8 100
|
467 |
+
.b8 101
|
468 |
+
.b8 52
|
469 |
+
.b8 100
|
470 |
+
.b8 101
|
471 |
+
.b8 0
|
472 |
+
.b8 116
|
473 |
+
.b8 114
|
474 |
+
.b8 105
|
475 |
+
.b8 116
|
476 |
+
.b8 111
|
477 |
+
.b8 110
|
478 |
+
.b8 95
|
479 |
+
.b8 95
|
480 |
+
.b8 48
|
481 |
+
.b8 100
|
482 |
+
.b8 49
|
483 |
+
.b8 100
|
484 |
+
.b8 50
|
485 |
+
.b8 100
|
486 |
+
.b8 51
|
487 |
+
.b8 100
|
488 |
+
.b8 101
|
489 |
+
.b8 52
|
490 |
+
.b8 100
|
491 |
+
.b8 101
|
492 |
+
.b8 0
|
493 |
+
.b8 1
|
494 |
+
.b8 18
|
495 |
+
.b8 1
|
496 |
+
.b8 1
|
497 |
+
.b8 3
|
498 |
+
.b64 $L__func_begin0
|
499 |
+
.b64 $L__func_end0
|
500 |
+
.b8 1
|
501 |
+
.b8 156
|
502 |
+
.b32 125
|
503 |
+
.b8 4
|
504 |
+
.b32 125
|
505 |
+
.b64 $L__tmp1
|
506 |
+
.b64 $L__tmp18
|
507 |
+
.b8 2
|
508 |
+
.b8 39
|
509 |
+
.b8 25
|
510 |
+
.b8 5
|
511 |
+
.b32 125
|
512 |
+
.b64 $L__tmp2
|
513 |
+
.b64 $L__tmp17
|
514 |
+
.b8 2
|
515 |
+
.b8 39
|
516 |
+
.b8 25
|
517 |
+
.b8 4
|
518 |
+
.b32 125
|
519 |
+
.b64 $L__tmp2
|
520 |
+
.b64 $L__tmp17
|
521 |
+
.b8 2
|
522 |
+
.b8 243
|
523 |
+
.b8 36
|
524 |
+
.b8 0
|
525 |
+
.b8 0
|
526 |
+
.b8 0
|
527 |
+
}
|
528 |
+
.section .debug_pubnames
|
529 |
+
{
|
530 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
531 |
+
$L__pubNames_start0:
|
532 |
+
.b8 2
|
533 |
+
.b8 0
|
534 |
+
.b32 .debug_info
|
535 |
+
.b32 270
|
536 |
+
.b32 125
|
537 |
+
.b8 116
|
538 |
+
.b8 114
|
539 |
+
.b8 105
|
540 |
+
.b8 116
|
541 |
+
.b8 111
|
542 |
+
.b8 110
|
543 |
+
.b8 95
|
544 |
+
.b8 95
|
545 |
+
.b8 48
|
546 |
+
.b8 100
|
547 |
+
.b8 49
|
548 |
+
.b8 100
|
549 |
+
.b8 50
|
550 |
+
.b8 100
|
551 |
+
.b8 51
|
552 |
+
.b8 100
|
553 |
+
.b8 101
|
554 |
+
.b8 52
|
555 |
+
.b8 100
|
556 |
+
.b8 101
|
557 |
+
.b8 0
|
558 |
+
.b32 0
|
559 |
+
$L__pubNames_end0:
|
560 |
+
}
|
561 |
+
.section .debug_pubtypes
|
562 |
+
{
|
563 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
564 |
+
$L__pubTypes_start0:
|
565 |
+
.b8 2
|
566 |
+
.b8 0
|
567 |
+
.b32 .debug_info
|
568 |
+
.b32 270
|
569 |
+
.b32 0
|
570 |
+
$L__pubTypes_end0:
|
571 |
+
}
|
572 |
+
.section .debug_loc { }
|
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttgir
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<256> : tensor<64x1xi32, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<32768> : tensor<64x1xi32, #blocked>
|
7 |
+
%cst_1 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
|
8 |
+
%cst_2 = arith.constant dense<128> : tensor<1x8xi32, #blocked>
|
9 |
+
%c0_i32 = arith.constant 0 : i32
|
10 |
+
%c128_i32 = arith.constant 128 : i32
|
11 |
+
%c8_i32 = arith.constant 8 : i32
|
12 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
|
13 |
+
%c64_i32 = arith.constant 64 : i32
|
14 |
+
%0 = tt.get_program_id x : i32
|
15 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
16 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
17 |
+
%3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
18 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
|
19 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
|
20 |
+
%6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
|
21 |
+
%7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
|
22 |
+
%8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
|
23 |
+
%9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
|
24 |
+
%10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
25 |
+
%11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
|
26 |
+
%12 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
|
27 |
+
%13 = arith.divsi %8, %cst : tensor<64x1xi32, #blocked>
|
28 |
+
%14 = tt.broadcast %12 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
29 |
+
%15 = arith.muli %13, %cst_0 : tensor<64x1xi32, #blocked>
|
30 |
+
%16 = tt.broadcast %15 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
31 |
+
%17 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
|
32 |
+
%18 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
|
33 |
+
%19 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32, #blocked>) : i32 {
|
34 |
+
%25 = tt.splat %arg5 : (i32) -> tensor<1x8xi32, #blocked>
|
35 |
+
%26 = arith.addi %25, %11 : tensor<1x8xi32, #blocked>
|
36 |
+
%27 = arith.cmpi slt, %26, %cst_2 : tensor<1x8xi32, #blocked>
|
37 |
+
%28 = arith.muli %26, %cst_1 : tensor<1x8xi32, #blocked>
|
38 |
+
%29 = tt.broadcast %28 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
39 |
+
%30 = arith.addi %14, %29 : tensor<64x8xi32, #blocked>
|
40 |
+
%31 = arith.addi %30, %16 : tensor<64x8xi32, #blocked>
|
41 |
+
%32 = tt.addptr %17, %31 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
42 |
+
%33 = tt.broadcast %27 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
|
43 |
+
%34 = tt.load %32, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
|
44 |
+
%35 = tt.addptr %18, %31 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
45 |
+
%36 = tt.load %35, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
|
46 |
+
%37 = arith.mulf %34, %36 : tensor<64x8xf32, #blocked>
|
47 |
+
%38 = arith.addf %arg6, %37 : tensor<64x8xf32, #blocked>
|
48 |
+
%39 = arith.select %33, %38, %arg6 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
|
49 |
+
scf.yield %39 : tensor<64x8xf32, #blocked>
|
50 |
+
}
|
51 |
+
%20 = "tt.reduce"(%19) <{axis = 1 : i32}> ({
|
52 |
+
^bb0(%arg5: f32, %arg6: f32):
|
53 |
+
%25 = arith.addf %arg5, %arg6 : f32
|
54 |
+
tt.reduce.return %25 : f32
|
55 |
+
}) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
56 |
+
%21 = triton_gpu.convert_layout %20 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
57 |
+
%22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xf32, #blocked1>
|
58 |
+
%23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked1>
|
59 |
+
%24 = tt.addptr %23, %9 : tensor<64x1x!tt.ptr<f32, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
|
60 |
+
tt.store %24, %22 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked1>
|
61 |
+
tt.return
|
62 |
+
}
|
63 |
+
}
|
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.cubin
ADDED
Binary file (16.5 kB). View file
|
|
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.llir
ADDED
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3de4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
|
7 |
+
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%7 = and i32 %6, 31, !dbg !8
|
9 |
+
%8 = lshr i32 %6, 5, !dbg !8
|
10 |
+
%9 = shl i32 %6, 2, !dbg !8
|
11 |
+
%10 = and i32 %9, 60, !dbg !8
|
12 |
+
%11 = and i32 %8, 3, !dbg !9
|
13 |
+
%12 = lshr i32 %7, 4, !dbg !9
|
14 |
+
%13 = shl nuw nsw i32 %11, 1, !dbg !9
|
15 |
+
%14 = or i32 %13, %12, !dbg !9
|
16 |
+
%15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
|
17 |
+
%16 = shl i32 %15, 6, !dbg !11
|
18 |
+
%17 = or i32 %16, %10, !dbg !12
|
19 |
+
%.frozen = freeze i32 %17
|
20 |
+
%18 = sdiv i32 %.frozen, 256, !dbg !13
|
21 |
+
%19 = mul i32 %18, 256
|
22 |
+
%.decomposed = sub i32 %.frozen, %19
|
23 |
+
%20 = shl i32 %18, 15, !dbg !14
|
24 |
+
%21 = add i32 %20, %.decomposed
|
25 |
+
br label %22, !dbg !15
|
26 |
+
|
27 |
+
22: ; preds = %5, %22
|
28 |
+
%23 = phi i32 [ 0, %5 ], [ %58, %22 ]
|
29 |
+
%24 = phi <4 x float> [ zeroinitializer, %5 ], [ %57, %22 ]
|
30 |
+
%25 = or i32 %23, %14, !dbg !16
|
31 |
+
%26 = shl i32 %25, 8, !dbg !17
|
32 |
+
%27 = add i32 %21, %26, !dbg !18
|
33 |
+
%28 = sext i32 %27 to i64, !dbg !19
|
34 |
+
%29 = getelementptr i16, ptr addrspace(1) %0, i64 %28, !dbg !19
|
35 |
+
%30 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %29, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20
|
36 |
+
%31 = extractvalue { i32, i32 } %30, 0, !dbg !20
|
37 |
+
%32 = extractvalue { i32, i32 } %30, 1, !dbg !20
|
38 |
+
%33 = trunc i32 %31 to i16, !dbg !20
|
39 |
+
%extelt.offset = lshr i32 %31, 16, !dbg !20
|
40 |
+
%34 = trunc i32 %extelt.offset to i16, !dbg !20
|
41 |
+
%35 = trunc i32 %32 to i16, !dbg !20
|
42 |
+
%extelt.offset1 = lshr i32 %32, 16, !dbg !20
|
43 |
+
%36 = trunc i32 %extelt.offset1 to i16, !dbg !20
|
44 |
+
%37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #3, !dbg !21
|
45 |
+
%38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #3, !dbg !21
|
46 |
+
%39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #3, !dbg !21
|
47 |
+
%40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #3, !dbg !21
|
48 |
+
%41 = getelementptr float, ptr addrspace(1) %1, i64 %28, !dbg !22
|
49 |
+
%42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !23
|
50 |
+
%43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !23
|
51 |
+
%44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !23
|
52 |
+
%45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !23
|
53 |
+
%46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !23
|
54 |
+
%47 = insertelement <4 x i32> poison, i32 %43, i64 0, !dbg !23
|
55 |
+
%48 = insertelement <4 x i32> %47, i32 %44, i64 1, !dbg !23
|
56 |
+
%49 = insertelement <4 x i32> %48, i32 %45, i64 2, !dbg !23
|
57 |
+
%50 = insertelement <4 x i32> %49, i32 %46, i64 3, !dbg !23
|
58 |
+
%51 = bitcast <4 x i32> %50 to <4 x float>, !dbg !23
|
59 |
+
%52 = insertelement <4 x float> poison, float %37, i64 0, !dbg !24
|
60 |
+
%53 = insertelement <4 x float> %52, float %38, i64 1, !dbg !24
|
61 |
+
%54 = insertelement <4 x float> %53, float %39, i64 2, !dbg !24
|
62 |
+
%55 = insertelement <4 x float> %54, float %40, i64 3, !dbg !24
|
63 |
+
%56 = fmul <4 x float> %55, %51, !dbg !24
|
64 |
+
%57 = fadd <4 x float> %24, %56, !dbg !25
|
65 |
+
%58 = add nuw nsw i32 %23, 8, !dbg !15
|
66 |
+
%59 = icmp ult i32 %23, 120, !dbg !15
|
67 |
+
br i1 %59, label %22, label %60, !dbg !15
|
68 |
+
|
69 |
+
60: ; preds = %22
|
70 |
+
%61 = and i32 %6, 63, !dbg !8
|
71 |
+
%62 = or i32 %16, %61, !dbg !12
|
72 |
+
%63 = or i32 %10, 3, !dbg !26
|
73 |
+
%64 = or i32 %10, 2, !dbg !26
|
74 |
+
%65 = or i32 %10, 1, !dbg !26
|
75 |
+
%66 = extractelement <4 x float> %57, i64 0, !dbg !26
|
76 |
+
%67 = bitcast float %66 to i32, !dbg !26
|
77 |
+
%68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 16, i32 31), !dbg !26
|
78 |
+
%69 = bitcast i32 %68 to float, !dbg !26
|
79 |
+
%70 = fadd float %66, %69, !dbg !30
|
80 |
+
%71 = extractelement <4 x float> %57, i64 1, !dbg !26
|
81 |
+
%72 = bitcast float %71 to i32, !dbg !26
|
82 |
+
%73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 16, i32 31), !dbg !26
|
83 |
+
%74 = bitcast i32 %73 to float, !dbg !26
|
84 |
+
%75 = fadd float %71, %74, !dbg !30
|
85 |
+
%76 = extractelement <4 x float> %57, i64 2, !dbg !26
|
86 |
+
%77 = bitcast float %76 to i32, !dbg !26
|
87 |
+
%78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 16, i32 31), !dbg !26
|
88 |
+
%79 = bitcast i32 %78 to float, !dbg !26
|
89 |
+
%80 = fadd float %76, %79, !dbg !30
|
90 |
+
%81 = extractelement <4 x float> %57, i64 3, !dbg !26
|
91 |
+
%82 = bitcast float %81 to i32, !dbg !26
|
92 |
+
%83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 16, i32 31), !dbg !26
|
93 |
+
%84 = bitcast i32 %83 to float, !dbg !26
|
94 |
+
%85 = fadd float %81, %84, !dbg !30
|
95 |
+
%86 = icmp ult i32 %7, 16, !dbg !26
|
96 |
+
%87 = shl nuw nsw i32 %10, 2, !dbg !26
|
97 |
+
%88 = or i32 %87, %11, !dbg !26
|
98 |
+
%89 = zext nneg i32 %88 to i64, !dbg !26
|
99 |
+
%90 = getelementptr float, ptr addrspace(3) @global_smem, i64 %89, !dbg !26
|
100 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %90, float %70, i1 %86) #3, !dbg !26
|
101 |
+
%91 = shl nuw nsw i32 %65, 2, !dbg !26
|
102 |
+
%92 = or i32 %91, %11, !dbg !26
|
103 |
+
%93 = zext nneg i32 %92 to i64, !dbg !26
|
104 |
+
%94 = getelementptr float, ptr addrspace(3) @global_smem, i64 %93, !dbg !26
|
105 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %94, float %75, i1 %86) #3, !dbg !26
|
106 |
+
%95 = shl nuw nsw i32 %64, 2, !dbg !26
|
107 |
+
%96 = or i32 %95, %11, !dbg !26
|
108 |
+
%97 = zext nneg i32 %96 to i64, !dbg !26
|
109 |
+
%98 = getelementptr float, ptr addrspace(3) @global_smem, i64 %97, !dbg !26
|
110 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, float %80, i1 %86) #3, !dbg !26
|
111 |
+
%99 = shl nuw nsw i32 %63, 2, !dbg !26
|
112 |
+
%100 = or i32 %99, %11, !dbg !26
|
113 |
+
%101 = zext nneg i32 %100 to i64, !dbg !26
|
114 |
+
%102 = getelementptr float, ptr addrspace(3) @global_smem, i64 %101, !dbg !26
|
115 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %102, float %85, i1 %86) #3, !dbg !26
|
116 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !26
|
117 |
+
%103 = icmp slt i32 %6, 256, !dbg !26
|
118 |
+
%104 = sext i32 %6 to i64, !dbg !26
|
119 |
+
%105 = getelementptr float, ptr addrspace(3) @global_smem, i64 %104, !dbg !26
|
120 |
+
%106 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %105, i1 %103) #3, !dbg !26
|
121 |
+
%107 = bitcast float %106 to i32, !dbg !26
|
122 |
+
%108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 2, i32 31), !dbg !26
|
123 |
+
%109 = bitcast i32 %108 to float, !dbg !26
|
124 |
+
%110 = fadd float %106, %109, !dbg !30
|
125 |
+
%111 = bitcast float %110 to i32, !dbg !26
|
126 |
+
%112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %111, i32 1, i32 31), !dbg !26
|
127 |
+
%113 = bitcast i32 %112 to float, !dbg !26
|
128 |
+
%114 = fadd float %110, %113, !dbg !30
|
129 |
+
%115 = and i32 %6, 3, !dbg !26
|
130 |
+
%116 = icmp eq i32 %115, 0, !dbg !26
|
131 |
+
%117 = and i1 %103, %116, !dbg !26
|
132 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, float %114, i1 %117) #3, !dbg !26
|
133 |
+
%118 = add i32 %6, 128, !dbg !26
|
134 |
+
%119 = sext i32 %118 to i64, !dbg !26
|
135 |
+
%120 = getelementptr float, ptr addrspace(3) @global_smem, i64 %119, !dbg !26
|
136 |
+
%121 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %120, i1 %103) #3, !dbg !26
|
137 |
+
%122 = bitcast float %121 to i32, !dbg !26
|
138 |
+
%123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 2, i32 31), !dbg !26
|
139 |
+
%124 = bitcast i32 %123 to float, !dbg !26
|
140 |
+
%125 = fadd float %121, %124, !dbg !30
|
141 |
+
%126 = bitcast float %125 to i32, !dbg !26
|
142 |
+
%127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 1, i32 31), !dbg !26
|
143 |
+
%128 = bitcast i32 %127 to float, !dbg !26
|
144 |
+
%129 = fadd float %125, %128, !dbg !30
|
145 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %129, i1 %117) #3, !dbg !26
|
146 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !26
|
147 |
+
%130 = zext nneg i32 %87 to i64, !dbg !26
|
148 |
+
%131 = getelementptr float, ptr addrspace(3) @global_smem, i64 %130, !dbg !26
|
149 |
+
%132 = load float, ptr addrspace(3) %131, align 4, !dbg !26
|
150 |
+
%133 = zext nneg i32 %91 to i64, !dbg !26
|
151 |
+
%134 = getelementptr float, ptr addrspace(3) @global_smem, i64 %133, !dbg !26
|
152 |
+
%135 = load float, ptr addrspace(3) %134, align 4, !dbg !26
|
153 |
+
%136 = zext nneg i32 %95 to i64, !dbg !26
|
154 |
+
%137 = getelementptr float, ptr addrspace(3) @global_smem, i64 %136, !dbg !26
|
155 |
+
%138 = load float, ptr addrspace(3) %137, align 4, !dbg !26
|
156 |
+
%139 = zext nneg i32 %99 to i64, !dbg !26
|
157 |
+
%140 = getelementptr float, ptr addrspace(3) @global_smem, i64 %139, !dbg !26
|
158 |
+
%141 = load float, ptr addrspace(3) %140, align 4, !dbg !26
|
159 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !34
|
160 |
+
%142 = zext nneg i32 %10 to i64, !dbg !34
|
161 |
+
%143 = getelementptr float, ptr addrspace(3) @global_smem, i64 %142, !dbg !34
|
162 |
+
%144 = insertelement <1 x float> undef, float %132, i64 0, !dbg !34
|
163 |
+
store <1 x float> %144, ptr addrspace(3) %143, align 4, !dbg !34
|
164 |
+
%145 = zext nneg i32 %65 to i64, !dbg !34
|
165 |
+
%146 = getelementptr float, ptr addrspace(3) @global_smem, i64 %145, !dbg !34
|
166 |
+
%147 = insertelement <1 x float> undef, float %135, i64 0, !dbg !34
|
167 |
+
store <1 x float> %147, ptr addrspace(3) %146, align 4, !dbg !34
|
168 |
+
%148 = zext nneg i32 %64 to i64, !dbg !34
|
169 |
+
%149 = getelementptr float, ptr addrspace(3) @global_smem, i64 %148, !dbg !34
|
170 |
+
%150 = insertelement <1 x float> undef, float %138, i64 0, !dbg !34
|
171 |
+
store <1 x float> %150, ptr addrspace(3) %149, align 4, !dbg !34
|
172 |
+
%151 = zext nneg i32 %63 to i64, !dbg !34
|
173 |
+
%152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !34
|
174 |
+
%153 = insertelement <1 x float> undef, float %141, i64 0, !dbg !34
|
175 |
+
store <1 x float> %153, ptr addrspace(3) %152, align 4, !dbg !34
|
176 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !34
|
177 |
+
%154 = zext nneg i32 %61 to i64, !dbg !34
|
178 |
+
%155 = getelementptr float, ptr addrspace(3) @global_smem, i64 %154, !dbg !34
|
179 |
+
%156 = load i32, ptr addrspace(3) %155, align 4, !dbg !34
|
180 |
+
%157 = sext i32 %62 to i64, !dbg !35
|
181 |
+
%158 = getelementptr float, ptr addrspace(1) %2, i64 %157, !dbg !35
|
182 |
+
%159 = and i32 %6, 64, !dbg !36
|
183 |
+
%160 = icmp eq i32 %159, 0, !dbg !36
|
184 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %156, ptr addrspace(1) %158, i1 %160) #3, !dbg !36
|
185 |
+
ret void, !dbg !37
|
186 |
+
}
|
187 |
+
|
188 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
189 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
190 |
+
|
191 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
192 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
193 |
+
|
194 |
+
; Function Attrs: convergent nocallback nounwind
|
195 |
+
declare void @llvm.nvvm.barrier0() #2
|
196 |
+
|
197 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
198 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
199 |
+
attributes #2 = { convergent nocallback nounwind }
|
200 |
+
attributes #3 = { nounwind }
|
201 |
+
|
202 |
+
!llvm.module.flags = !{!0}
|
203 |
+
!llvm.dbg.cu = !{!1}
|
204 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
205 |
+
|
206 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
207 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
208 |
+
!2 = !DIFile(filename: "csjd7mlrjujd4uwze5tkg7ptteagpihgt5ztatfqchprcrax22ls.py", directory: "/tmp/torchinductor_root/sj")
|
209 |
+
!3 = !{ptr @triton__0d1d2d3de4de, !"kernel", i32 1}
|
210 |
+
!4 = !{ptr @triton__0d1d2d3de4de, !"maxntidx", i32 128}
|
211 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4de", linkageName: "triton__0d1d2d3de4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
212 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
213 |
+
!7 = !{}
|
214 |
+
!8 = !DILocation(line: 22, column: 44, scope: !5)
|
215 |
+
!9 = !DILocation(line: 24, column: 33, scope: !5)
|
216 |
+
!10 = !DILocation(line: 21, column: 28, scope: !5)
|
217 |
+
!11 = !DILocation(line: 21, column: 33, scope: !5)
|
218 |
+
!12 = !DILocation(line: 22, column: 23, scope: !5)
|
219 |
+
!13 = !DILocation(line: 26, column: 20, scope: !5)
|
220 |
+
!14 = !DILocation(line: 33, column: 57, scope: !5)
|
221 |
+
!15 = !DILocation(line: 29, column: 36, scope: !5)
|
222 |
+
!16 = !DILocation(line: 30, column: 27, scope: !5)
|
223 |
+
!17 = !DILocation(line: 33, column: 44, scope: !5)
|
224 |
+
!18 = !DILocation(line: 33, column: 51, scope: !5)
|
225 |
+
!19 = !DILocation(line: 33, column: 34, scope: !5)
|
226 |
+
!20 = !DILocation(line: 33, column: 63, scope: !5)
|
227 |
+
!21 = !DILocation(line: 33, column: 115, scope: !5)
|
228 |
+
!22 = !DILocation(line: 34, column: 34, scope: !5)
|
229 |
+
!23 = !DILocation(line: 34, column: 63, scope: !5)
|
230 |
+
!24 = !DILocation(line: 36, column: 22, scope: !5)
|
231 |
+
!25 = !DILocation(line: 39, column: 38, scope: !5)
|
232 |
+
!26 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !29)
|
233 |
+
!27 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0)
|
234 |
+
!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
235 |
+
!29 = !DILocation(line: 40, column: 25, scope: !27)
|
236 |
+
!30 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !32)
|
237 |
+
!31 = distinct !DILexicalBlockFile(scope: !27, file: !28, discriminator: 0)
|
238 |
+
!32 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !33)
|
239 |
+
!33 = !DILocation(line: 40, column: 25, scope: !31)
|
240 |
+
!34 = !DILocation(line: 40, column: 28, scope: !5)
|
241 |
+
!35 = !DILocation(line: 41, column: 25, scope: !5)
|
242 |
+
!36 = !DILocation(line: 41, column: 36, scope: !5)
|
243 |
+
!37 = !DILocation(line: 41, column: 4, scope: !5)
|
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ptx
ADDED
@@ -0,0 +1,577 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3de4de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3de4de(
|
13 |
+
.param .u64 triton__0d1d2d3de4de_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3de4de_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3de4de_param_2,
|
16 |
+
.param .u32 triton__0d1d2d3de4de_param_3,
|
17 |
+
.param .u32 triton__0d1d2d3de4de_param_4
|
18 |
+
)
|
19 |
+
.maxntid 128, 1, 1
|
20 |
+
{
|
21 |
+
.reg .pred %p<20>;
|
22 |
+
.reg .b16 %rs<5>;
|
23 |
+
.reg .b32 %r<98>;
|
24 |
+
.reg .f32 %f<47>;
|
25 |
+
.reg .b64 %rd<10>;
|
26 |
+
.loc 1 18 0
|
27 |
+
$L__func_begin0:
|
28 |
+
.loc 1 18 0
|
29 |
+
|
30 |
+
ld.param.u64 %rd3, [triton__0d1d2d3de4de_param_2];
|
31 |
+
ld.param.u64 %rd2, [triton__0d1d2d3de4de_param_1];
|
32 |
+
ld.param.u64 %rd1, [triton__0d1d2d3de4de_param_0];
|
33 |
+
$L__tmp0:
|
34 |
+
.loc 1 22 44
|
35 |
+
mov.u32 %r1, %tid.x;
|
36 |
+
and.b32 %r2, %r1, 31;
|
37 |
+
shl.b32 %r13, %r1, 2;
|
38 |
+
and.b32 %r3, %r13, 60;
|
39 |
+
.loc 1 24 33
|
40 |
+
bfe.u32 %r4, %r1, 5, 2;
|
41 |
+
.loc 1 21 28
|
42 |
+
mov.u32 %r11, %ctaid.x;
|
43 |
+
.loc 1 21 33
|
44 |
+
shl.b32 %r5, %r11, 6;
|
45 |
+
.loc 1 22 23
|
46 |
+
or.b32 %r14, %r5, %r3;
|
47 |
+
.loc 1 26 20
|
48 |
+
shr.s32 %r16, %r14, 31;
|
49 |
+
shr.u32 %r17, %r16, 24;
|
50 |
+
add.s32 %r18, %r14, %r17;
|
51 |
+
shr.s32 %r19, %r18, 8;
|
52 |
+
.loc 1 29 36
|
53 |
+
mad.lo.s32 %r20, %r19, 32512, %r14;
|
54 |
+
shl.b32 %r21, %r4, 9;
|
55 |
+
add.s32 %r22, %r20, %r21;
|
56 |
+
shl.b32 %r23, %r1, 4;
|
57 |
+
and.b32 %r24, %r23, 256;
|
58 |
+
add.s32 %r96, %r22, %r24;
|
59 |
+
mov.f32 %f43, 0f00000000;
|
60 |
+
mov.b32 %r97, -8;
|
61 |
+
mov.pred %p1, -1;
|
62 |
+
mov.f32 %f44, %f43;
|
63 |
+
mov.f32 %f45, %f43;
|
64 |
+
mov.f32 %f46, %f43;
|
65 |
+
$L__BB0_1:
|
66 |
+
.loc 1 33 34
|
67 |
+
mul.wide.s32 %rd6, %r96, 2;
|
68 |
+
add.s64 %rd4, %rd1, %rd6;
|
69 |
+
mov.b32 %r27, 0;
|
70 |
+
.loc 1 33 63
|
71 |
+
mov.u32 %r25, 0x0;
|
72 |
+
mov.u32 %r26, 0x0;
|
73 |
+
@%p1 ld.global.L1::evict_first.v2.b32 { %r25, %r26 }, [ %rd4 + 0 ];
|
74 |
+
@!%p1 mov.u32 %r25, %r27;
|
75 |
+
@!%p1 mov.u32 %r26, %r27;
|
76 |
+
cvt.u16.u32 %rs1, %r25;
|
77 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r25; }
|
78 |
+
cvt.u16.u32 %rs3, %r26;
|
79 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r26; }
|
80 |
+
.loc 1 33 115
|
81 |
+
cvt.f32.bf16 %r29, %rs1;
|
82 |
+
mov.b32 %f13, %r29;
|
83 |
+
cvt.f32.bf16 %r30, %rs2;
|
84 |
+
mov.b32 %f14, %r30;
|
85 |
+
cvt.f32.bf16 %r31, %rs3;
|
86 |
+
mov.b32 %f15, %r31;
|
87 |
+
cvt.f32.bf16 %r32, %rs4;
|
88 |
+
mov.b32 %f16, %r32;
|
89 |
+
.loc 1 34 34
|
90 |
+
mul.wide.s32 %rd7, %r96, 4;
|
91 |
+
add.s64 %rd5, %rd2, %rd7;
|
92 |
+
.loc 1 34 63
|
93 |
+
mov.u32 %r33, 0x0;
|
94 |
+
mov.u32 %r34, 0x0;
|
95 |
+
mov.u32 %r35, 0x0;
|
96 |
+
mov.u32 %r36, 0x0;
|
97 |
+
@%p1 ld.global.L1::evict_first.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd5 + 0 ];
|
98 |
+
@!%p1 mov.u32 %r33, %r27;
|
99 |
+
@!%p1 mov.u32 %r34, %r27;
|
100 |
+
@!%p1 mov.u32 %r35, %r27;
|
101 |
+
@!%p1 mov.u32 %r36, %r27;
|
102 |
+
mov.b32 %f17, %r33;
|
103 |
+
mov.b32 %f18, %r34;
|
104 |
+
mov.b32 %f19, %r35;
|
105 |
+
mov.b32 %f20, %r36;
|
106 |
+
.loc 1 39 38
|
107 |
+
fma.rn.f32 %f46, %f16, %f20, %f46;
|
108 |
+
fma.rn.f32 %f45, %f15, %f19, %f45;
|
109 |
+
fma.rn.f32 %f44, %f14, %f18, %f44;
|
110 |
+
fma.rn.f32 %f43, %f13, %f17, %f43;
|
111 |
+
.loc 1 29 36
|
112 |
+
add.s32 %r97, %r97, 8;
|
113 |
+
add.s32 %r96, %r96, 2048;
|
114 |
+
setp.lt.u32 %p9, %r97, 120;
|
115 |
+
@%p9 bra $L__BB0_1;
|
116 |
+
.loc 1 22 44
|
117 |
+
and.b32 %r58, %r1, 63;
|
118 |
+
.loc 1 22 23
|
119 |
+
or.b32 %r59, %r5, %r58;
|
120 |
+
$L__tmp1:
|
121 |
+
.loc 2 243 36
|
122 |
+
mov.b32 %r60, %f43;
|
123 |
+
shfl.sync.bfly.b32 %r61, %r60, 16, 31, -1;
|
124 |
+
mov.b32 %f21, %r61;
|
125 |
+
$L__tmp2:
|
126 |
+
.loc 2 233 15
|
127 |
+
add.f32 %f22, %f43, %f21;
|
128 |
+
$L__tmp3:
|
129 |
+
.loc 2 243 36
|
130 |
+
mov.b32 %r62, %f44;
|
131 |
+
shfl.sync.bfly.b32 %r63, %r62, 16, 31, -1;
|
132 |
+
mov.b32 %f23, %r63;
|
133 |
+
$L__tmp4:
|
134 |
+
.loc 2 233 15
|
135 |
+
add.f32 %f24, %f44, %f23;
|
136 |
+
$L__tmp5:
|
137 |
+
.loc 2 243 36
|
138 |
+
mov.b32 %r64, %f45;
|
139 |
+
shfl.sync.bfly.b32 %r65, %r64, 16, 31, -1;
|
140 |
+
mov.b32 %f25, %r65;
|
141 |
+
$L__tmp6:
|
142 |
+
.loc 2 233 15
|
143 |
+
add.f32 %f26, %f45, %f25;
|
144 |
+
$L__tmp7:
|
145 |
+
.loc 2 243 36
|
146 |
+
mov.b32 %r66, %f46;
|
147 |
+
shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1;
|
148 |
+
mov.b32 %f27, %r67;
|
149 |
+
$L__tmp8:
|
150 |
+
.loc 2 233 15
|
151 |
+
add.f32 %f28, %f46, %f27;
|
152 |
+
$L__tmp9:
|
153 |
+
.loc 2 243 36
|
154 |
+
setp.lt.u32 %p10, %r2, 16;
|
155 |
+
shl.b32 %r68, %r3, 2;
|
156 |
+
or.b32 %r69, %r68, %r4;
|
157 |
+
shl.b32 %r70, %r69, 2;
|
158 |
+
mov.u32 %r71, global_smem;
|
159 |
+
add.s32 %r41, %r71, %r70;
|
160 |
+
mov.b32 %r42, %f22;
|
161 |
+
@%p10 st.shared.b32 [ %r41 + 0 ], %r42;
|
162 |
+
shl.b32 %r72, %r4, 2;
|
163 |
+
shl.b32 %r73, %r3, 4;
|
164 |
+
or.b32 %r74, %r73, 16;
|
165 |
+
or.b32 %r75, %r74, %r72;
|
166 |
+
add.s32 %r43, %r71, %r75;
|
167 |
+
mov.b32 %r44, %f24;
|
168 |
+
@%p10 st.shared.b32 [ %r43 + 0 ], %r44;
|
169 |
+
or.b32 %r76, %r73, 32;
|
170 |
+
or.b32 %r77, %r76, %r72;
|
171 |
+
add.s32 %r45, %r71, %r77;
|
172 |
+
mov.b32 %r46, %f26;
|
173 |
+
@%p10 st.shared.b32 [ %r45 + 0 ], %r46;
|
174 |
+
or.b32 %r78, %r73, 48;
|
175 |
+
or.b32 %r79, %r78, %r72;
|
176 |
+
add.s32 %r47, %r71, %r79;
|
177 |
+
mov.b32 %r48, %f28;
|
178 |
+
@%p10 st.shared.b32 [ %r47 + 0 ], %r48;
|
179 |
+
bar.sync 0;
|
180 |
+
setp.lt.s32 %p14, %r1, 256;
|
181 |
+
add.s32 %r50, %r71, %r13;
|
182 |
+
@%p14 ld.shared.b32 %r49, [ %r50 + 0 ];
|
183 |
+
mov.b32 %f29, %r49;
|
184 |
+
shfl.sync.bfly.b32 %r81, %r49, 2, 31, -1;
|
185 |
+
mov.b32 %f30, %r81;
|
186 |
+
$L__tmp10:
|
187 |
+
.loc 2 233 15
|
188 |
+
add.f32 %f31, %f29, %f30;
|
189 |
+
$L__tmp11:
|
190 |
+
.loc 2 243 36
|
191 |
+
mov.b32 %r82, %f31;
|
192 |
+
shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1;
|
193 |
+
mov.b32 %f32, %r83;
|
194 |
+
$L__tmp12:
|
195 |
+
.loc 2 233 15
|
196 |
+
add.f32 %f33, %f31, %f32;
|
197 |
+
$L__tmp13:
|
198 |
+
.loc 2 243 36
|
199 |
+
and.b32 %r84, %r1, 3;
|
200 |
+
setp.eq.s32 %p19, %r84, 0;
|
201 |
+
and.pred %p15, %p14, %p19;
|
202 |
+
mov.b32 %r52, %f33;
|
203 |
+
@%p15 st.shared.b32 [ %r50 + 0 ], %r52;
|
204 |
+
add.s32 %r54, %r50, 512;
|
205 |
+
@%p14 ld.shared.b32 %r53, [ %r54 + 0 ];
|
206 |
+
mov.b32 %f34, %r53;
|
207 |
+
shfl.sync.bfly.b32 %r85, %r53, 2, 31, -1;
|
208 |
+
mov.b32 %f35, %r85;
|
209 |
+
$L__tmp14:
|
210 |
+
.loc 2 233 15
|
211 |
+
add.f32 %f36, %f34, %f35;
|
212 |
+
$L__tmp15:
|
213 |
+
.loc 2 243 36
|
214 |
+
mov.b32 %r86, %f36;
|
215 |
+
shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1;
|
216 |
+
mov.b32 %f37, %r87;
|
217 |
+
$L__tmp16:
|
218 |
+
.loc 2 233 15
|
219 |
+
add.f32 %f38, %f36, %f37;
|
220 |
+
$L__tmp17:
|
221 |
+
.loc 2 243 36
|
222 |
+
mov.b32 %r56, %f38;
|
223 |
+
@%p15 st.shared.b32 [ %r54 + 0 ], %r56;
|
224 |
+
bar.sync 0;
|
225 |
+
add.s32 %r88, %r71, %r73;
|
226 |
+
ld.shared.f32 %f39, [%r88];
|
227 |
+
add.s32 %r89, %r71, %r74;
|
228 |
+
ld.shared.f32 %f40, [%r89];
|
229 |
+
add.s32 %r90, %r71, %r76;
|
230 |
+
ld.shared.f32 %f41, [%r90];
|
231 |
+
add.s32 %r91, %r71, %r78;
|
232 |
+
ld.shared.f32 %f42, [%r91];
|
233 |
+
$L__tmp18:
|
234 |
+
.loc 1 40 28
|
235 |
+
bar.sync 0;
|
236 |
+
add.s32 %r92, %r71, %r68;
|
237 |
+
st.shared.f32 [%r92], %f39;
|
238 |
+
st.shared.f32 [%r92+4], %f40;
|
239 |
+
st.shared.f32 [%r92+8], %f41;
|
240 |
+
st.shared.f32 [%r92+12], %f42;
|
241 |
+
bar.sync 0;
|
242 |
+
shl.b32 %r93, %r58, 2;
|
243 |
+
add.s32 %r94, %r71, %r93;
|
244 |
+
ld.shared.u32 %r57, [%r94];
|
245 |
+
.loc 1 41 25
|
246 |
+
mul.wide.s32 %rd9, %r59, 4;
|
247 |
+
add.s64 %rd8, %rd3, %rd9;
|
248 |
+
.loc 1 41 36
|
249 |
+
and.b32 %r95, %r1, 64;
|
250 |
+
setp.eq.s32 %p18, %r95, 0;
|
251 |
+
@%p18 st.global.b32 [ %rd8 + 0 ], { %r57 };
|
252 |
+
.loc 1 41 4
|
253 |
+
ret;
|
254 |
+
$L__tmp19:
|
255 |
+
$L__func_end0:
|
256 |
+
|
257 |
+
}
|
258 |
+
.file 1 "/tmp/torchinductor_root/sj/csjd7mlrjujd4uwze5tkg7ptteagpihgt5ztatfqchprcrax22ls.py"
|
259 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
260 |
+
.section .debug_abbrev
|
261 |
+
{
|
262 |
+
.b8 1
|
263 |
+
.b8 17
|
264 |
+
.b8 1
|
265 |
+
.b8 37
|
266 |
+
.b8 8
|
267 |
+
.b8 19
|
268 |
+
.b8 5
|
269 |
+
.b8 3
|
270 |
+
.b8 8
|
271 |
+
.b8 16
|
272 |
+
.b8 6
|
273 |
+
.b8 27
|
274 |
+
.b8 8
|
275 |
+
.b8 180
|
276 |
+
.b8 66
|
277 |
+
.b8 12
|
278 |
+
.b8 17
|
279 |
+
.b8 1
|
280 |
+
.b8 18
|
281 |
+
.b8 1
|
282 |
+
.b8 0
|
283 |
+
.b8 0
|
284 |
+
.b8 2
|
285 |
+
.b8 46
|
286 |
+
.b8 0
|
287 |
+
.b8 135
|
288 |
+
.b8 64
|
289 |
+
.b8 8
|
290 |
+
.b8 3
|
291 |
+
.b8 8
|
292 |
+
.b8 58
|
293 |
+
.b8 11
|
294 |
+
.b8 59
|
295 |
+
.b8 11
|
296 |
+
.b8 63
|
297 |
+
.b8 12
|
298 |
+
.b8 32
|
299 |
+
.b8 11
|
300 |
+
.b8 0
|
301 |
+
.b8 0
|
302 |
+
.b8 3
|
303 |
+
.b8 46
|
304 |
+
.b8 1
|
305 |
+
.b8 17
|
306 |
+
.b8 1
|
307 |
+
.b8 18
|
308 |
+
.b8 1
|
309 |
+
.b8 64
|
310 |
+
.b8 10
|
311 |
+
.b8 49
|
312 |
+
.b8 19
|
313 |
+
.b8 0
|
314 |
+
.b8 0
|
315 |
+
.b8 4
|
316 |
+
.b8 29
|
317 |
+
.b8 0
|
318 |
+
.b8 49
|
319 |
+
.b8 19
|
320 |
+
.b8 17
|
321 |
+
.b8 1
|
322 |
+
.b8 18
|
323 |
+
.b8 1
|
324 |
+
.b8 88
|
325 |
+
.b8 11
|
326 |
+
.b8 89
|
327 |
+
.b8 11
|
328 |
+
.b8 87
|
329 |
+
.b8 11
|
330 |
+
.b8 0
|
331 |
+
.b8 0
|
332 |
+
.b8 5
|
333 |
+
.b8 29
|
334 |
+
.b8 1
|
335 |
+
.b8 49
|
336 |
+
.b8 19
|
337 |
+
.b8 17
|
338 |
+
.b8 1
|
339 |
+
.b8 18
|
340 |
+
.b8 1
|
341 |
+
.b8 88
|
342 |
+
.b8 11
|
343 |
+
.b8 89
|
344 |
+
.b8 11
|
345 |
+
.b8 87
|
346 |
+
.b8 11
|
347 |
+
.b8 0
|
348 |
+
.b8 0
|
349 |
+
.b8 0
|
350 |
+
}
|
351 |
+
.section .debug_info
|
352 |
+
{
|
353 |
+
.b32 266
|
354 |
+
.b8 2
|
355 |
+
.b8 0
|
356 |
+
.b32 .debug_abbrev
|
357 |
+
.b8 8
|
358 |
+
.b8 1
|
359 |
+
.b8 116
|
360 |
+
.b8 114
|
361 |
+
.b8 105
|
362 |
+
.b8 116
|
363 |
+
.b8 111
|
364 |
+
.b8 110
|
365 |
+
.b8 0
|
366 |
+
.b8 2
|
367 |
+
.b8 0
|
368 |
+
.b8 99
|
369 |
+
.b8 115
|
370 |
+
.b8 106
|
371 |
+
.b8 100
|
372 |
+
.b8 55
|
373 |
+
.b8 109
|
374 |
+
.b8 108
|
375 |
+
.b8 114
|
376 |
+
.b8 106
|
377 |
+
.b8 117
|
378 |
+
.b8 106
|
379 |
+
.b8 100
|
380 |
+
.b8 52
|
381 |
+
.b8 117
|
382 |
+
.b8 119
|
383 |
+
.b8 122
|
384 |
+
.b8 101
|
385 |
+
.b8 53
|
386 |
+
.b8 116
|
387 |
+
.b8 107
|
388 |
+
.b8 103
|
389 |
+
.b8 55
|
390 |
+
.b8 112
|
391 |
+
.b8 116
|
392 |
+
.b8 116
|
393 |
+
.b8 101
|
394 |
+
.b8 97
|
395 |
+
.b8 103
|
396 |
+
.b8 112
|
397 |
+
.b8 105
|
398 |
+
.b8 104
|
399 |
+
.b8 103
|
400 |
+
.b8 116
|
401 |
+
.b8 53
|
402 |
+
.b8 122
|
403 |
+
.b8 116
|
404 |
+
.b8 97
|
405 |
+
.b8 116
|
406 |
+
.b8 102
|
407 |
+
.b8 113
|
408 |
+
.b8 99
|
409 |
+
.b8 104
|
410 |
+
.b8 112
|
411 |
+
.b8 114
|
412 |
+
.b8 99
|
413 |
+
.b8 114
|
414 |
+
.b8 97
|
415 |
+
.b8 120
|
416 |
+
.b8 50
|
417 |
+
.b8 50
|
418 |
+
.b8 108
|
419 |
+
.b8 115
|
420 |
+
.b8 46
|
421 |
+
.b8 112
|
422 |
+
.b8 121
|
423 |
+
.b8 0
|
424 |
+
.b32 .debug_line
|
425 |
+
.b8 47
|
426 |
+
.b8 116
|
427 |
+
.b8 109
|
428 |
+
.b8 112
|
429 |
+
.b8 47
|
430 |
+
.b8 116
|
431 |
+
.b8 111
|
432 |
+
.b8 114
|
433 |
+
.b8 99
|
434 |
+
.b8 104
|
435 |
+
.b8 105
|
436 |
+
.b8 110
|
437 |
+
.b8 100
|
438 |
+
.b8 117
|
439 |
+
.b8 99
|
440 |
+
.b8 116
|
441 |
+
.b8 111
|
442 |
+
.b8 114
|
443 |
+
.b8 95
|
444 |
+
.b8 114
|
445 |
+
.b8 111
|
446 |
+
.b8 111
|
447 |
+
.b8 116
|
448 |
+
.b8 47
|
449 |
+
.b8 115
|
450 |
+
.b8 106
|
451 |
+
.b8 0
|
452 |
+
.b8 1
|
453 |
+
.b64 $L__func_begin0
|
454 |
+
.b64 $L__func_end0
|
455 |
+
.b8 2
|
456 |
+
.b8 116
|
457 |
+
.b8 114
|
458 |
+
.b8 105
|
459 |
+
.b8 116
|
460 |
+
.b8 111
|
461 |
+
.b8 110
|
462 |
+
.b8 95
|
463 |
+
.b8 95
|
464 |
+
.b8 48
|
465 |
+
.b8 100
|
466 |
+
.b8 49
|
467 |
+
.b8 100
|
468 |
+
.b8 50
|
469 |
+
.b8 100
|
470 |
+
.b8 51
|
471 |
+
.b8 100
|
472 |
+
.b8 101
|
473 |
+
.b8 52
|
474 |
+
.b8 100
|
475 |
+
.b8 101
|
476 |
+
.b8 0
|
477 |
+
.b8 116
|
478 |
+
.b8 114
|
479 |
+
.b8 105
|
480 |
+
.b8 116
|
481 |
+
.b8 111
|
482 |
+
.b8 110
|
483 |
+
.b8 95
|
484 |
+
.b8 95
|
485 |
+
.b8 48
|
486 |
+
.b8 100
|
487 |
+
.b8 49
|
488 |
+
.b8 100
|
489 |
+
.b8 50
|
490 |
+
.b8 100
|
491 |
+
.b8 51
|
492 |
+
.b8 100
|
493 |
+
.b8 101
|
494 |
+
.b8 52
|
495 |
+
.b8 100
|
496 |
+
.b8 101
|
497 |
+
.b8 0
|
498 |
+
.b8 1
|
499 |
+
.b8 18
|
500 |
+
.b8 1
|
501 |
+
.b8 1
|
502 |
+
.b8 3
|
503 |
+
.b64 $L__func_begin0
|
504 |
+
.b64 $L__func_end0
|
505 |
+
.b8 1
|
506 |
+
.b8 156
|
507 |
+
.b32 125
|
508 |
+
.b8 4
|
509 |
+
.b32 125
|
510 |
+
.b64 $L__tmp1
|
511 |
+
.b64 $L__tmp18
|
512 |
+
.b8 2
|
513 |
+
.b8 40
|
514 |
+
.b8 25
|
515 |
+
.b8 5
|
516 |
+
.b32 125
|
517 |
+
.b64 $L__tmp2
|
518 |
+
.b64 $L__tmp17
|
519 |
+
.b8 2
|
520 |
+
.b8 40
|
521 |
+
.b8 25
|
522 |
+
.b8 4
|
523 |
+
.b32 125
|
524 |
+
.b64 $L__tmp2
|
525 |
+
.b64 $L__tmp17
|
526 |
+
.b8 2
|
527 |
+
.b8 243
|
528 |
+
.b8 36
|
529 |
+
.b8 0
|
530 |
+
.b8 0
|
531 |
+
.b8 0
|
532 |
+
}
|
533 |
+
.section .debug_pubnames
|
534 |
+
{
|
535 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
536 |
+
$L__pubNames_start0:
|
537 |
+
.b8 2
|
538 |
+
.b8 0
|
539 |
+
.b32 .debug_info
|
540 |
+
.b32 270
|
541 |
+
.b32 125
|
542 |
+
.b8 116
|
543 |
+
.b8 114
|
544 |
+
.b8 105
|
545 |
+
.b8 116
|
546 |
+
.b8 111
|
547 |
+
.b8 110
|
548 |
+
.b8 95
|
549 |
+
.b8 95
|
550 |
+
.b8 48
|
551 |
+
.b8 100
|
552 |
+
.b8 49
|
553 |
+
.b8 100
|
554 |
+
.b8 50
|
555 |
+
.b8 100
|
556 |
+
.b8 51
|
557 |
+
.b8 100
|
558 |
+
.b8 101
|
559 |
+
.b8 52
|
560 |
+
.b8 100
|
561 |
+
.b8 101
|
562 |
+
.b8 0
|
563 |
+
.b32 0
|
564 |
+
$L__pubNames_end0:
|
565 |
+
}
|
566 |
+
.section .debug_pubtypes
|
567 |
+
{
|
568 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
569 |
+
$L__pubTypes_start0:
|
570 |
+
.b8 2
|
571 |
+
.b8 0
|
572 |
+
.b32 .debug_info
|
573 |
+
.b32 270
|
574 |
+
.b32 0
|
575 |
+
$L__pubTypes_end0:
|
576 |
+
}
|
577 |
+
.section .debug_loc { }
|
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttgir
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<256> : tensor<64x1xi32, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<32768> : tensor<64x1xi32, #blocked>
|
7 |
+
%cst_1 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
|
8 |
+
%cst_2 = arith.constant dense<128> : tensor<1x8xi32, #blocked>
|
9 |
+
%c0_i32 = arith.constant 0 : i32
|
10 |
+
%c128_i32 = arith.constant 128 : i32
|
11 |
+
%c8_i32 = arith.constant 8 : i32
|
12 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
|
13 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked>
|
14 |
+
%c64_i32 = arith.constant 64 : i32
|
15 |
+
%0 = tt.get_program_id x : i32
|
16 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
17 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
18 |
+
%3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
19 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
|
20 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
|
21 |
+
%6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
|
22 |
+
%7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
|
23 |
+
%8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
|
24 |
+
%9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
|
25 |
+
%10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
26 |
+
%11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
|
27 |
+
%12 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
|
28 |
+
%13 = arith.divsi %8, %cst : tensor<64x1xi32, #blocked>
|
29 |
+
%14 = tt.broadcast %12 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
30 |
+
%15 = arith.muli %13, %cst_0 : tensor<64x1xi32, #blocked>
|
31 |
+
%16 = tt.broadcast %15 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
32 |
+
%17 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
|
33 |
+
%18 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
|
34 |
+
%19 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32, #blocked>) : i32 {
|
35 |
+
%25 = tt.splat %arg5 : (i32) -> tensor<1x8xi32, #blocked>
|
36 |
+
%26 = arith.addi %25, %11 : tensor<1x8xi32, #blocked>
|
37 |
+
%27 = arith.cmpi slt, %26, %cst_2 : tensor<1x8xi32, #blocked>
|
38 |
+
%28 = arith.muli %26, %cst_1 : tensor<1x8xi32, #blocked>
|
39 |
+
%29 = tt.broadcast %28 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
|
40 |
+
%30 = arith.addi %14, %29 : tensor<64x8xi32, #blocked>
|
41 |
+
%31 = arith.addi %30, %16 : tensor<64x8xi32, #blocked>
|
42 |
+
%32 = tt.addptr %17, %31 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
43 |
+
%33 = tt.broadcast %27 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
|
44 |
+
%34 = tt.load %32, %33, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
|
45 |
+
%35 = arith.extf %34 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
|
46 |
+
%36 = tt.addptr %18, %31 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
|
47 |
+
%37 = tt.load %36, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
|
48 |
+
%38 = arith.mulf %35, %37 : tensor<64x8xf32, #blocked>
|
49 |
+
%39 = arith.addf %arg6, %38 : tensor<64x8xf32, #blocked>
|
50 |
+
%40 = arith.select %33, %39, %arg6 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
|
51 |
+
scf.yield %40 : tensor<64x8xf32, #blocked>
|
52 |
+
}
|
53 |
+
%20 = "tt.reduce"(%19) <{axis = 1 : i32}> ({
|
54 |
+
^bb0(%arg5: f32, %arg6: f32):
|
55 |
+
%25 = arith.addf %arg5, %arg6 : f32
|
56 |
+
tt.reduce.return %25 : f32
|
57 |
+
}) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
58 |
+
%21 = triton_gpu.convert_layout %20 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
59 |
+
%22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xf32, #blocked1>
|
60 |
+
%23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked1>
|
61 |
+
%24 = tt.addptr %23, %9 : tensor<64x1x!tt.ptr<f32, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
|
62 |
+
tt.store %24, %22 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked1>
|
63 |
+
tt.return
|
64 |
+
}
|
65 |
+
}
|
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttir
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16>
|
4 |
+
%c8_i32 = arith.constant 8 : i32
|
5 |
+
%c128_i32 = arith.constant 128 : i32
|
6 |
+
%c0_i32 = arith.constant 0 : i32
|
7 |
+
%cst_0 = arith.constant dense<32768> : tensor<64x1xi32>
|
8 |
+
%cst_1 = arith.constant dense<256> : tensor<1x8xi32>
|
9 |
+
%cst_2 = arith.constant dense<128> : tensor<1x8xi32>
|
10 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
|
11 |
+
%cst_4 = arith.constant dense<256> : tensor<64x1xi32>
|
12 |
+
%c64_i32 = arith.constant 64 : i32
|
13 |
+
%0 = tt.get_program_id x : i32
|
14 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
15 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
|
16 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
|
17 |
+
%4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
|
18 |
+
%5 = arith.addi %4, %3 : tensor<64x1xi32>
|
19 |
+
%6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
|
20 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
|
21 |
+
%8 = arith.remsi %5, %cst_4 : tensor<64x1xi32>
|
22 |
+
%9 = arith.divsi %5, %cst_4 : tensor<64x1xi32>
|
23 |
+
%10 = tt.broadcast %8 : (tensor<64x1xi32>) -> tensor<64x8xi32>
|
24 |
+
%11 = arith.muli %9, %cst_0 : tensor<64x1xi32>
|
25 |
+
%12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x8xi32>
|
26 |
+
%13 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
|
27 |
+
%14 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
28 |
+
%15 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32>) : i32 {
|
29 |
+
%20 = tt.splat %arg5 : (i32) -> tensor<1x8xi32>
|
30 |
+
%21 = arith.addi %20, %7 : tensor<1x8xi32>
|
31 |
+
%22 = arith.cmpi slt, %21, %cst_2 : tensor<1x8xi32>
|
32 |
+
%23 = arith.muli %21, %cst_1 : tensor<1x8xi32>
|
33 |
+
%24 = tt.broadcast %23 : (tensor<1x8xi32>) -> tensor<64x8xi32>
|
34 |
+
%25 = arith.addi %10, %24 : tensor<64x8xi32>
|
35 |
+
%26 = arith.addi %25, %12 : tensor<64x8xi32>
|
36 |
+
%27 = tt.addptr %13, %26 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
|
37 |
+
%28 = tt.broadcast %22 : (tensor<1x8xi1>) -> tensor<64x8xi1>
|
38 |
+
%29 = tt.load %27, %28, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
|
39 |
+
%30 = arith.extf %29 : tensor<64x8xbf16> to tensor<64x8xf32>
|
40 |
+
%31 = tt.addptr %14, %26 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
|
41 |
+
%32 = tt.load %31, %28, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
|
42 |
+
%33 = arith.mulf %30, %32 : tensor<64x8xf32>
|
43 |
+
%34 = arith.addf %arg6, %33 : tensor<64x8xf32>
|
44 |
+
%35 = arith.select %28, %34, %arg6 : tensor<64x8xi1>, tensor<64x8xf32>
|
45 |
+
scf.yield %35 : tensor<64x8xf32>
|
46 |
+
}
|
47 |
+
%16 = "tt.reduce"(%15) <{axis = 1 : i32}> ({
|
48 |
+
^bb0(%arg5: f32, %arg6: f32):
|
49 |
+
%20 = arith.addf %arg5, %arg6 : f32
|
50 |
+
tt.reduce.return %20 : f32
|
51 |
+
}) : (tensor<64x8xf32>) -> tensor<64xf32>
|
52 |
+
%17 = tt.expand_dims %16 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
53 |
+
%18 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
|
54 |
+
%19 = tt.addptr %18, %5 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi32>
|
55 |
+
tt.store %19, %17 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32>
|
56 |
+
tt.return
|
57 |
+
}
|
58 |
+
}
|
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ptx
ADDED
@@ -0,0 +1,456 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3de4e
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2d3de4e(
|
13 |
+
.param .u64 triton__0d1d2d3de4e_param_0,
|
14 |
+
.param .u64 triton__0d1d2d3de4e_param_1,
|
15 |
+
.param .u64 triton__0d1d2d3de4e_param_2,
|
16 |
+
.param .u32 triton__0d1d2d3de4e_param_3,
|
17 |
+
.param .u32 triton__0d1d2d3de4e_param_4
|
18 |
+
)
|
19 |
+
.maxntid 256, 1, 1
|
20 |
+
{
|
21 |
+
.reg .pred %p<10>;
|
22 |
+
.reg .b32 %r<44>;
|
23 |
+
.reg .f32 %f<11>;
|
24 |
+
.reg .b64 %rd<16>;
|
25 |
+
.loc 1 18 0
|
26 |
+
$L__func_begin0:
|
27 |
+
.loc 1 18 0
|
28 |
+
|
29 |
+
ld.param.u64 %rd3, [triton__0d1d2d3de4e_param_2];
|
30 |
+
ld.param.u64 %rd2, [triton__0d1d2d3de4e_param_1];
|
31 |
+
ld.param.u64 %rd1, [triton__0d1d2d3de4e_param_0];
|
32 |
+
$L__tmp0:
|
33 |
+
.loc 1 22 44
|
34 |
+
mov.u32 %r1, %tid.x;
|
35 |
+
and.b32 %r2, %r1, 63;
|
36 |
+
.loc 1 24 33
|
37 |
+
bfe.u32 %r3, %r1, 6, 2;
|
38 |
+
.loc 1 21 28
|
39 |
+
mov.u32 %r10, %ctaid.x;
|
40 |
+
.loc 1 21 33
|
41 |
+
shl.b32 %r12, %r10, 6;
|
42 |
+
.loc 1 22 23
|
43 |
+
or.b32 %r4, %r12, %r2;
|
44 |
+
.loc 1 27 36
|
45 |
+
shl.b32 %r13, %r3, 17;
|
46 |
+
add.s32 %r14, %r13, %r12;
|
47 |
+
or.b32 %r42, %r14, %r2;
|
48 |
+
mov.f32 %f10, 0f00000000;
|
49 |
+
mov.b32 %r43, -4;
|
50 |
+
mov.pred %p4, -1;
|
51 |
+
$L__BB0_1:
|
52 |
+
.loc 1 31 34
|
53 |
+
mul.wide.s32 %rd5, %r42, 4;
|
54 |
+
add.s64 %rd4, %rd1, %rd5;
|
55 |
+
mov.b32 %r16, 0;
|
56 |
+
.loc 1 31 53
|
57 |
+
mov.u32 %r15, 0x0;
|
58 |
+
@%p4 ld.global.L1::evict_first.b32 { %r15 }, [ %rd4 + 0 ];
|
59 |
+
@!%p4 mov.u32 %r15, %r16;
|
60 |
+
mov.b32 %f4, %r15;
|
61 |
+
.loc 1 34 38
|
62 |
+
add.f32 %f10, %f10, %f4;
|
63 |
+
.loc 1 27 36
|
64 |
+
add.s32 %r43, %r43, 4;
|
65 |
+
add.s32 %r42, %r42, 524288;
|
66 |
+
setp.lt.u32 %p3, %r43, 116;
|
67 |
+
@%p3 bra $L__BB0_1;
|
68 |
+
$L__tmp1:
|
69 |
+
.loc 2 243 36
|
70 |
+
shl.b32 %r25, %r3, 2;
|
71 |
+
shl.b32 %r26, %r2, 4;
|
72 |
+
or.b32 %r27, %r26, %r25;
|
73 |
+
mov.u32 %r28, global_smem;
|
74 |
+
add.s32 %r17, %r28, %r27;
|
75 |
+
mov.b32 %r18, %f10;
|
76 |
+
@%p4 st.shared.b32 [ %r17 + 0 ], %r18;
|
77 |
+
bar.sync 0;
|
78 |
+
setp.lt.s32 %p5, %r1, 256;
|
79 |
+
shl.b32 %r29, %r1, 2;
|
80 |
+
add.s32 %r20, %r28, %r29;
|
81 |
+
@%p5 ld.shared.b32 %r19, [ %r20 + 0 ];
|
82 |
+
mov.b32 %f5, %r19;
|
83 |
+
shfl.sync.bfly.b32 %r30, %r19, 2, 31, -1;
|
84 |
+
mov.b32 %f6, %r30;
|
85 |
+
$L__tmp2:
|
86 |
+
.loc 2 233 15
|
87 |
+
add.f32 %f7, %f5, %f6;
|
88 |
+
$L__tmp3:
|
89 |
+
.loc 2 243 36
|
90 |
+
mov.b32 %r31, %f7;
|
91 |
+
shfl.sync.bfly.b32 %r32, %r31, 1, 31, -1;
|
92 |
+
mov.b32 %f8, %r32;
|
93 |
+
$L__tmp4:
|
94 |
+
.loc 2 233 15
|
95 |
+
add.f32 %f9, %f7, %f8;
|
96 |
+
$L__tmp5:
|
97 |
+
.loc 2 243 36
|
98 |
+
and.b32 %r33, %r1, 3;
|
99 |
+
setp.eq.s32 %p9, %r33, 0;
|
100 |
+
and.pred %p6, %p5, %p9;
|
101 |
+
mov.b32 %r22, %f9;
|
102 |
+
@%p6 st.shared.b32 [ %r20 + 0 ], %r22;
|
103 |
+
bar.sync 0;
|
104 |
+
add.s32 %r34, %r28, %r26;
|
105 |
+
$L__tmp6:
|
106 |
+
.loc 1 36 20
|
107 |
+
shr.s32 %r36, %r4, 31;
|
108 |
+
shr.u32 %r37, %r36, 24;
|
109 |
+
add.s32 %r38, %r4, %r37;
|
110 |
+
shr.s32 %r39, %r38, 8;
|
111 |
+
and.b32 %r40, %r38, -256;
|
112 |
+
sub.s32 %r41, %r4, %r40;
|
113 |
+
.loc 1 38 30
|
114 |
+
mul.wide.s32 %rd9, %r39, 8;
|
115 |
+
add.s64 %rd7, %rd2, %rd9;
|
116 |
+
.loc 1 45 55
|
117 |
+
ld.shared.u32 %r24, [%r34];
|
118 |
+
.loc 1 38 35
|
119 |
+
mov.u64 %rd6, 0x0;
|
120 |
+
@%p4 ld.global.L1::evict_last.b64 { %rd6 }, [ %rd7 + 0 ];
|
121 |
+
.loc 1 41 32
|
122 |
+
shr.u64 %rd10, %rd6, 54;
|
123 |
+
and.b64 %rd11, %rd10, 512;
|
124 |
+
add.s64 %rd12, %rd11, %rd6;
|
125 |
+
.loc 1 45 30
|
126 |
+
shl.b64 %rd13, %rd12, 10;
|
127 |
+
add.s64 %rd14, %rd3, %rd13;
|
128 |
+
mul.wide.s32 %rd15, %r41, 4;
|
129 |
+
add.s64 %rd8, %rd14, %rd15;
|
130 |
+
.loc 1 45 55
|
131 |
+
setp.eq.s32 %p8, %r3, 0;
|
132 |
+
mov.u32 %r23, 0x0;
|
133 |
+
@%p8 atom.global.gpu.acq_rel.add.f32 %r23, [ %rd8 + 0 ], %r24;
|
134 |
+
.loc 1 45 4
|
135 |
+
ret;
|
136 |
+
$L__tmp7:
|
137 |
+
$L__func_end0:
|
138 |
+
|
139 |
+
}
|
140 |
+
.file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
|
141 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
142 |
+
.section .debug_abbrev
|
143 |
+
{
|
144 |
+
.b8 1
|
145 |
+
.b8 17
|
146 |
+
.b8 1
|
147 |
+
.b8 37
|
148 |
+
.b8 8
|
149 |
+
.b8 19
|
150 |
+
.b8 5
|
151 |
+
.b8 3
|
152 |
+
.b8 8
|
153 |
+
.b8 16
|
154 |
+
.b8 6
|
155 |
+
.b8 27
|
156 |
+
.b8 8
|
157 |
+
.b8 180
|
158 |
+
.b8 66
|
159 |
+
.b8 12
|
160 |
+
.b8 17
|
161 |
+
.b8 1
|
162 |
+
.b8 18
|
163 |
+
.b8 1
|
164 |
+
.b8 0
|
165 |
+
.b8 0
|
166 |
+
.b8 2
|
167 |
+
.b8 46
|
168 |
+
.b8 0
|
169 |
+
.b8 135
|
170 |
+
.b8 64
|
171 |
+
.b8 8
|
172 |
+
.b8 3
|
173 |
+
.b8 8
|
174 |
+
.b8 58
|
175 |
+
.b8 11
|
176 |
+
.b8 59
|
177 |
+
.b8 11
|
178 |
+
.b8 63
|
179 |
+
.b8 12
|
180 |
+
.b8 32
|
181 |
+
.b8 11
|
182 |
+
.b8 0
|
183 |
+
.b8 0
|
184 |
+
.b8 3
|
185 |
+
.b8 46
|
186 |
+
.b8 1
|
187 |
+
.b8 17
|
188 |
+
.b8 1
|
189 |
+
.b8 18
|
190 |
+
.b8 1
|
191 |
+
.b8 64
|
192 |
+
.b8 10
|
193 |
+
.b8 49
|
194 |
+
.b8 19
|
195 |
+
.b8 0
|
196 |
+
.b8 0
|
197 |
+
.b8 4
|
198 |
+
.b8 29
|
199 |
+
.b8 0
|
200 |
+
.b8 49
|
201 |
+
.b8 19
|
202 |
+
.b8 17
|
203 |
+
.b8 1
|
204 |
+
.b8 18
|
205 |
+
.b8 1
|
206 |
+
.b8 88
|
207 |
+
.b8 11
|
208 |
+
.b8 89
|
209 |
+
.b8 11
|
210 |
+
.b8 87
|
211 |
+
.b8 11
|
212 |
+
.b8 0
|
213 |
+
.b8 0
|
214 |
+
.b8 5
|
215 |
+
.b8 29
|
216 |
+
.b8 1
|
217 |
+
.b8 49
|
218 |
+
.b8 19
|
219 |
+
.b8 17
|
220 |
+
.b8 1
|
221 |
+
.b8 18
|
222 |
+
.b8 1
|
223 |
+
.b8 88
|
224 |
+
.b8 11
|
225 |
+
.b8 89
|
226 |
+
.b8 11
|
227 |
+
.b8 87
|
228 |
+
.b8 11
|
229 |
+
.b8 0
|
230 |
+
.b8 0
|
231 |
+
.b8 0
|
232 |
+
}
|
233 |
+
.section .debug_info
|
234 |
+
{
|
235 |
+
.b32 264
|
236 |
+
.b8 2
|
237 |
+
.b8 0
|
238 |
+
.b32 .debug_abbrev
|
239 |
+
.b8 8
|
240 |
+
.b8 1
|
241 |
+
.b8 116
|
242 |
+
.b8 114
|
243 |
+
.b8 105
|
244 |
+
.b8 116
|
245 |
+
.b8 111
|
246 |
+
.b8 110
|
247 |
+
.b8 0
|
248 |
+
.b8 2
|
249 |
+
.b8 0
|
250 |
+
.b8 99
|
251 |
+
.b8 54
|
252 |
+
.b8 105
|
253 |
+
.b8 107
|
254 |
+
.b8 53
|
255 |
+
.b8 118
|
256 |
+
.b8 120
|
257 |
+
.b8 55
|
258 |
+
.b8 112
|
259 |
+
.b8 50
|
260 |
+
.b8 50
|
261 |
+
.b8 102
|
262 |
+
.b8 112
|
263 |
+
.b8 107
|
264 |
+
.b8 52
|
265 |
+
.b8 100
|
266 |
+
.b8 99
|
267 |
+
.b8 118
|
268 |
+
.b8 104
|
269 |
+
.b8 53
|
270 |
+
.b8 53
|
271 |
+
.b8 122
|
272 |
+
.b8 105
|
273 |
+
.b8 109
|
274 |
+
.b8 119
|
275 |
+
.b8 52
|
276 |
+
.b8 116
|
277 |
+
.b8 53
|
278 |
+
.b8 110
|
279 |
+
.b8 114
|
280 |
+
.b8 53
|
281 |
+
.b8 122
|
282 |
+
.b8 110
|
283 |
+
.b8 50
|
284 |
+
.b8 98
|
285 |
+
.b8 55
|
286 |
+
.b8 105
|
287 |
+
.b8 110
|
288 |
+
.b8 117
|
289 |
+
.b8 106
|
290 |
+
.b8 120
|
291 |
+
.b8 106
|
292 |
+
.b8 97
|
293 |
+
.b8 117
|
294 |
+
.b8 120
|
295 |
+
.b8 115
|
296 |
+
.b8 104
|
297 |
+
.b8 108
|
298 |
+
.b8 106
|
299 |
+
.b8 117
|
300 |
+
.b8 109
|
301 |
+
.b8 109
|
302 |
+
.b8 46
|
303 |
+
.b8 112
|
304 |
+
.b8 121
|
305 |
+
.b8 0
|
306 |
+
.b32 .debug_line
|
307 |
+
.b8 47
|
308 |
+
.b8 116
|
309 |
+
.b8 109
|
310 |
+
.b8 112
|
311 |
+
.b8 47
|
312 |
+
.b8 116
|
313 |
+
.b8 111
|
314 |
+
.b8 114
|
315 |
+
.b8 99
|
316 |
+
.b8 104
|
317 |
+
.b8 105
|
318 |
+
.b8 110
|
319 |
+
.b8 100
|
320 |
+
.b8 117
|
321 |
+
.b8 99
|
322 |
+
.b8 116
|
323 |
+
.b8 111
|
324 |
+
.b8 114
|
325 |
+
.b8 95
|
326 |
+
.b8 114
|
327 |
+
.b8 111
|
328 |
+
.b8 111
|
329 |
+
.b8 116
|
330 |
+
.b8 47
|
331 |
+
.b8 54
|
332 |
+
.b8 105
|
333 |
+
.b8 0
|
334 |
+
.b8 1
|
335 |
+
.b64 $L__func_begin0
|
336 |
+
.b64 $L__func_end0
|
337 |
+
.b8 2
|
338 |
+
.b8 116
|
339 |
+
.b8 114
|
340 |
+
.b8 105
|
341 |
+
.b8 116
|
342 |
+
.b8 111
|
343 |
+
.b8 110
|
344 |
+
.b8 95
|
345 |
+
.b8 95
|
346 |
+
.b8 48
|
347 |
+
.b8 100
|
348 |
+
.b8 49
|
349 |
+
.b8 100
|
350 |
+
.b8 50
|
351 |
+
.b8 100
|
352 |
+
.b8 51
|
353 |
+
.b8 100
|
354 |
+
.b8 101
|
355 |
+
.b8 52
|
356 |
+
.b8 101
|
357 |
+
.b8 0
|
358 |
+
.b8 116
|
359 |
+
.b8 114
|
360 |
+
.b8 105
|
361 |
+
.b8 116
|
362 |
+
.b8 111
|
363 |
+
.b8 110
|
364 |
+
.b8 95
|
365 |
+
.b8 95
|
366 |
+
.b8 48
|
367 |
+
.b8 100
|
368 |
+
.b8 49
|
369 |
+
.b8 100
|
370 |
+
.b8 50
|
371 |
+
.b8 100
|
372 |
+
.b8 51
|
373 |
+
.b8 100
|
374 |
+
.b8 101
|
375 |
+
.b8 52
|
376 |
+
.b8 101
|
377 |
+
.b8 0
|
378 |
+
.b8 1
|
379 |
+
.b8 18
|
380 |
+
.b8 1
|
381 |
+
.b8 1
|
382 |
+
.b8 3
|
383 |
+
.b64 $L__func_begin0
|
384 |
+
.b64 $L__func_end0
|
385 |
+
.b8 1
|
386 |
+
.b8 156
|
387 |
+
.b32 125
|
388 |
+
.b8 4
|
389 |
+
.b32 125
|
390 |
+
.b64 $L__tmp1
|
391 |
+
.b64 $L__tmp6
|
392 |
+
.b8 2
|
393 |
+
.b8 35
|
394 |
+
.b8 25
|
395 |
+
.b8 5
|
396 |
+
.b32 125
|
397 |
+
.b64 $L__tmp2
|
398 |
+
.b64 $L__tmp5
|
399 |
+
.b8 2
|
400 |
+
.b8 35
|
401 |
+
.b8 25
|
402 |
+
.b8 4
|
403 |
+
.b32 125
|
404 |
+
.b64 $L__tmp2
|
405 |
+
.b64 $L__tmp5
|
406 |
+
.b8 2
|
407 |
+
.b8 243
|
408 |
+
.b8 36
|
409 |
+
.b8 0
|
410 |
+
.b8 0
|
411 |
+
.b8 0
|
412 |
+
}
|
413 |
+
.section .debug_pubnames
|
414 |
+
{
|
415 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
416 |
+
$L__pubNames_start0:
|
417 |
+
.b8 2
|
418 |
+
.b8 0
|
419 |
+
.b32 .debug_info
|
420 |
+
.b32 268
|
421 |
+
.b32 125
|
422 |
+
.b8 116
|
423 |
+
.b8 114
|
424 |
+
.b8 105
|
425 |
+
.b8 116
|
426 |
+
.b8 111
|
427 |
+
.b8 110
|
428 |
+
.b8 95
|
429 |
+
.b8 95
|
430 |
+
.b8 48
|
431 |
+
.b8 100
|
432 |
+
.b8 49
|
433 |
+
.b8 100
|
434 |
+
.b8 50
|
435 |
+
.b8 100
|
436 |
+
.b8 51
|
437 |
+
.b8 100
|
438 |
+
.b8 101
|
439 |
+
.b8 52
|
440 |
+
.b8 101
|
441 |
+
.b8 0
|
442 |
+
.b32 0
|
443 |
+
$L__pubNames_end0:
|
444 |
+
}
|
445 |
+
.section .debug_pubtypes
|
446 |
+
{
|
447 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
448 |
+
$L__pubTypes_start0:
|
449 |
+
.b8 2
|
450 |
+
.b8 0
|
451 |
+
.b32 .debug_info
|
452 |
+
.b32 268
|
453 |
+
.b32 0
|
454 |
+
$L__pubTypes_end0:
|
455 |
+
}
|
456 |
+
.section .debug_loc { }
|
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttir
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<256> : tensor<64x1xi64>
|
4 |
+
%cst_0 = arith.constant dense<0> : tensor<64x1xi64>
|
5 |
+
%cst_1 = arith.constant dense<512> : tensor<64x1xi64>
|
6 |
+
%c4_i32 = arith.constant 4 : i32
|
7 |
+
%c120_i32 = arith.constant 120 : i32
|
8 |
+
%c0_i32 = arith.constant 0 : i32
|
9 |
+
%cst_2 = arith.constant dense<true> : tensor<64x1xi1>
|
10 |
+
%cst_3 = arith.constant dense<256> : tensor<64x1xi32>
|
11 |
+
%cst_4 = arith.constant dense<131072> : tensor<1x4xi32>
|
12 |
+
%cst_5 = arith.constant dense<120> : tensor<1x4xi32>
|
13 |
+
%cst_6 = arith.constant dense<0.000000e+00> : tensor<64x4xf32>
|
14 |
+
%c64_i32 = arith.constant 64 : i32
|
15 |
+
%0 = tt.get_program_id x : i32
|
16 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
17 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
|
18 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
|
19 |
+
%4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
|
20 |
+
%5 = arith.addi %4, %3 : tensor<64x1xi32>
|
21 |
+
%6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
|
22 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32>
|
23 |
+
%8 = tt.broadcast %5 : (tensor<64x1xi32>) -> tensor<64x4xi32>
|
24 |
+
%9 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
|
25 |
+
%10 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c4_i32 iter_args(%arg6 = %cst_6) -> (tensor<64x4xf32>) : i32 {
|
26 |
+
%27 = tt.splat %arg5 : (i32) -> tensor<1x4xi32>
|
27 |
+
%28 = arith.addi %27, %7 : tensor<1x4xi32>
|
28 |
+
%29 = arith.cmpi slt, %28, %cst_5 : tensor<1x4xi32>
|
29 |
+
%30 = arith.muli %28, %cst_4 : tensor<1x4xi32>
|
30 |
+
%31 = tt.broadcast %30 : (tensor<1x4xi32>) -> tensor<64x4xi32>
|
31 |
+
%32 = arith.addi %8, %31 : tensor<64x4xi32>
|
32 |
+
%33 = tt.addptr %9, %32 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
|
33 |
+
%34 = tt.broadcast %29 : (tensor<1x4xi1>) -> tensor<64x4xi1>
|
34 |
+
%35 = tt.load %33, %34, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32>
|
35 |
+
%36 = arith.addf %arg6, %35 : tensor<64x4xf32>
|
36 |
+
%37 = arith.select %34, %36, %arg6 : tensor<64x4xi1>, tensor<64x4xf32>
|
37 |
+
scf.yield %37 : tensor<64x4xf32>
|
38 |
+
}
|
39 |
+
%11 = "tt.reduce"(%10) <{axis = 1 : i32}> ({
|
40 |
+
^bb0(%arg5: f32, %arg6: f32):
|
41 |
+
%27 = arith.addf %arg5, %arg6 : f32
|
42 |
+
tt.reduce.return %27 : f32
|
43 |
+
}) : (tensor<64x4xf32>) -> tensor<64xf32>
|
44 |
+
%12 = tt.expand_dims %11 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
45 |
+
%13 = arith.divsi %5, %cst_3 : tensor<64x1xi32>
|
46 |
+
%14 = arith.remsi %5, %cst_3 : tensor<64x1xi32>
|
47 |
+
%15 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
|
48 |
+
%16 = tt.addptr %15, %13 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
|
49 |
+
%17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
|
50 |
+
%18 = arith.addi %17, %cst_1 : tensor<64x1xi64>
|
51 |
+
%19 = arith.cmpi slt, %17, %cst_0 : tensor<64x1xi64>
|
52 |
+
%20 = arith.select %19, %18, %17 : tensor<64x1xi1>, tensor<64x1xi64>
|
53 |
+
%21 = arith.muli %20, %cst : tensor<64x1xi64>
|
54 |
+
%22 = arith.extsi %14 : tensor<64x1xi32> to tensor<64x1xi64>
|
55 |
+
%23 = arith.addi %22, %21 : tensor<64x1xi64>
|
56 |
+
%24 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
|
57 |
+
%25 = tt.addptr %24, %23 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi64>
|
58 |
+
%26 = "tt.atomic_rmw"(%25, %12, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xf32>, tensor<64x1xi1>) -> tensor<64x1xf32>
|
59 |
+
tt.return
|
60 |
+
}
|
61 |
+
}
|
.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.cubin
ADDED
Binary file (40.4 kB). View file
|
|
.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ptx
ADDED
@@ -0,0 +1,809 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6de7de
|
10 |
+
.extern .func __assertfail
|
11 |
+
(
|
12 |
+
.param .b64 __assertfail_param_0,
|
13 |
+
.param .b64 __assertfail_param_1,
|
14 |
+
.param .b32 __assertfail_param_2,
|
15 |
+
.param .b64 __assertfail_param_3,
|
16 |
+
.param .b64 __assertfail_param_4
|
17 |
+
)
|
18 |
+
;
|
19 |
+
.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
20 |
+
.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
21 |
+
.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
|
22 |
+
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
23 |
+
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
24 |
+
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
|
25 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
26 |
+
|
27 |
+
.visible .entry triton__0d1d2d3d4d5d6de7de(
|
28 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
|
29 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
|
30 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
|
31 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
|
32 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
|
33 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
|
34 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
|
35 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
|
36 |
+
)
|
37 |
+
.maxntid 256, 1, 1
|
38 |
+
{
|
39 |
+
.reg .pred %p<33>;
|
40 |
+
.reg .b16 %rs<13>;
|
41 |
+
.reg .b32 %r<93>;
|
42 |
+
.reg .f32 %f<79>;
|
43 |
+
.reg .b64 %rd<92>;
|
44 |
+
.loc 1 18 0
|
45 |
+
$L__func_begin0:
|
46 |
+
.loc 1 18 0
|
47 |
+
|
48 |
+
ld.param.u64 %rd37, [triton__0d1d2d3d4d5d6de7de_param_4];
|
49 |
+
ld.param.u64 %rd36, [triton__0d1d2d3d4d5d6de7de_param_3];
|
50 |
+
ld.param.u64 %rd35, [triton__0d1d2d3d4d5d6de7de_param_2];
|
51 |
+
ld.param.u64 %rd34, [triton__0d1d2d3d4d5d6de7de_param_1];
|
52 |
+
ld.param.u64 %rd43, [triton__0d1d2d3d4d5d6de7de_param_0];
|
53 |
+
$L__tmp0:
|
54 |
+
.loc 1 22 44
|
55 |
+
mov.u32 %r1, %tid.x;
|
56 |
+
bfe.u32 %r2, %r1, 2, 6;
|
57 |
+
and.b32 %r16, %r1, 63;
|
58 |
+
.loc 1 24 33
|
59 |
+
and.b32 %r3, %r1, 3;
|
60 |
+
.loc 1 21 28
|
61 |
+
mov.u32 %r15, %ctaid.x;
|
62 |
+
.loc 1 21 33
|
63 |
+
shl.b32 %r17, %r15, 6;
|
64 |
+
.loc 1 22 23
|
65 |
+
or.b32 %r18, %r17, %r2;
|
66 |
+
or.b32 %r19, %r17, %r16;
|
67 |
+
.loc 1 26 30
|
68 |
+
mul.wide.s32 %rd44, %r18, 8;
|
69 |
+
add.s64 %rd40, %rd43, %rd44;
|
70 |
+
mul.wide.s32 %rd45, %r19, 8;
|
71 |
+
add.s64 %rd42, %rd43, %rd45;
|
72 |
+
mov.pred %p13, -1;
|
73 |
+
.loc 1 26 35
|
74 |
+
mov.u64 %rd39, 0x0;
|
75 |
+
@%p13 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd40 + 0 ];
|
76 |
+
mov.u64 %rd41, 0x0;
|
77 |
+
@%p13 ld.global.L1::evict_last.b64 { %rd41 }, [ %rd42 + 0 ];
|
78 |
+
.loc 1 27 18
|
79 |
+
bfe.s32 %r20, %r15, 25, 1;
|
80 |
+
shr.u32 %r21, %r20, 23;
|
81 |
+
add.s32 %r22, %r18, %r21;
|
82 |
+
and.b32 %r23, %r22, 16776704;
|
83 |
+
sub.s32 %r24, %r18, %r23;
|
84 |
+
.loc 1 35 44
|
85 |
+
shl.b32 %r5, %r24, 8;
|
86 |
+
.loc 1 37 22
|
87 |
+
add.s64 %rd46, %rd41, 50257;
|
88 |
+
.loc 1 38 22
|
89 |
+
setp.lt.s64 %p3, %rd39, 0;
|
90 |
+
setp.lt.s64 %p4, %rd41, 0;
|
91 |
+
.loc 1 39 36
|
92 |
+
selp.b64 %rd47, %rd46, %rd41, %p4;
|
93 |
+
.loc 1 40 40
|
94 |
+
setp.gt.u64 %p5, %rd47, 50256;
|
95 |
+
.loc 1 41 44
|
96 |
+
shl.b64 %rd48, %rd39, 8;
|
97 |
+
add.s64 %rd49, %rd48, 12865792;
|
98 |
+
selp.b64 %rd2, %rd49, %rd48, %p3;
|
99 |
+
mov.u16 %rs12, 0;
|
100 |
+
mov.b32 %r76, 0;
|
101 |
+
mov.b32 %r88, 883;
|
102 |
+
mov.u64 %rd81, 1;
|
103 |
+
.loc 1 40 55
|
104 |
+
@%p5 bra $L__BB0_3;
|
105 |
+
bra.uni $L__BB0_1;
|
106 |
+
$L__BB0_3:
|
107 |
+
.loc 1 31 36
|
108 |
+
shl.b64 %rd55, %rd2, 2;
|
109 |
+
mul.wide.u32 %rd88, %r3, 4;
|
110 |
+
add.s64 %rd87, %rd55, %rd88;
|
111 |
+
add.s64 %rd83, %rd34, %rd87;
|
112 |
+
shl.b32 %r42, %r15, 14;
|
113 |
+
shl.b32 %r43, %r2, 8;
|
114 |
+
or.b32 %r44, %r42, %r43;
|
115 |
+
or.b32 %r91, %r44, %r3;
|
116 |
+
add.s32 %r45, %r5, %r3;
|
117 |
+
mul.wide.s32 %rd86, %r45, 4;
|
118 |
+
add.s64 %rd82, %rd35, %rd86;
|
119 |
+
mov.f32 %f78, 0f00000000;
|
120 |
+
mov.b32 %r89, -4;
|
121 |
+
mov.f32 %f77, %f78;
|
122 |
+
mov.f32 %f76, %f78;
|
123 |
+
$L__BB0_4:
|
124 |
+
.loc 1 35 50
|
125 |
+
mov.u32 %r46, 0x0;
|
126 |
+
@%p13 ld.global.L1::evict_last.b32 { %r46 }, [ %rd82 + 0 ];
|
127 |
+
@!%p13 mov.u32 %r46, %r76;
|
128 |
+
mov.b32 %f31, %r46;
|
129 |
+
.loc 1 31 36
|
130 |
+
add.s32 %r89, %r89, 4;
|
131 |
+
.loc 1 36 34
|
132 |
+
add.s32 %r54, %r89, %r91;
|
133 |
+
mul.wide.s32 %rd59, %r54, 2;
|
134 |
+
add.s64 %rd57, %rd36, %rd59;
|
135 |
+
.loc 1 36 50
|
136 |
+
mov.u16 %rs4, 0x0;
|
137 |
+
@%p13 ld.global.L1::evict_last.b16 { %rs4 }, [ %rd57 + 0 ];
|
138 |
+
@!%p13 mov.u16 %rs4, %rs12;
|
139 |
+
.loc 1 36 101
|
140 |
+
cvt.f32.bf16 %r48, %rs4;
|
141 |
+
mov.b32 %f32, %r48;
|
142 |
+
.loc 1 40 55
|
143 |
+
mov.u64 %rd60, assertMessage_0;
|
144 |
+
cvta.global.u64 %rd61, %rd60;
|
145 |
+
mov.u64 %rd62, assertFile_0;
|
146 |
+
cvta.global.u64 %rd63, %rd62;
|
147 |
+
mov.u64 %rd64, assertFunc_0;
|
148 |
+
cvta.global.u64 %rd65, %rd64;
|
149 |
+
{ // callseq 10, 0
|
150 |
+
.reg .b32 temp_param_reg;
|
151 |
+
.param .b64 param0;
|
152 |
+
st.param.b64 [param0+0], %rd61;
|
153 |
+
.param .b64 param1;
|
154 |
+
st.param.b64 [param1+0], %rd63;
|
155 |
+
.param .b32 param2;
|
156 |
+
st.param.b32 [param2+0], %r88;
|
157 |
+
.param .b64 param3;
|
158 |
+
st.param.b64 [param3+0], %rd65;
|
159 |
+
.param .b64 param4;
|
160 |
+
st.param.b64 [param4+0], %rd81;
|
161 |
+
call.uni
|
162 |
+
__assertfail,
|
163 |
+
(
|
164 |
+
param0,
|
165 |
+
param1,
|
166 |
+
param2,
|
167 |
+
param3,
|
168 |
+
param4
|
169 |
+
);
|
170 |
+
} // callseq 10
|
171 |
+
.loc 1 41 52
|
172 |
+
mov.u32 %r49, 0x0;
|
173 |
+
@%p13 ld.global.L1::evict_last.b32 { %r49 }, [ %rd83 + 0 ];
|
174 |
+
@!%p13 mov.u32 %r49, %r76;
|
175 |
+
mov.b32 %f33, %r49;
|
176 |
+
.loc 1 42 22
|
177 |
+
add.f32 %f34, %f31, %f33;
|
178 |
+
.loc 1 44 22
|
179 |
+
add.f32 %f35, %f32, %f34;
|
180 |
+
$L__tmp1:
|
181 |
+
.loc 2 96 20
|
182 |
+
sub.f32 %f36, %f35, %f76;
|
183 |
+
.loc 2 97 26
|
184 |
+
add.f32 %f78, %f78, 0f3F800000;
|
185 |
+
.loc 2 98 30
|
186 |
+
mov.b32 %r52, %f36;
|
187 |
+
mov.b32 %r53, %f78;
|
188 |
+
div.full.f32 %r51, %r52, %r53;
|
189 |
+
mov.b32 %f37, %r51;
|
190 |
+
.loc 2 98 22
|
191 |
+
add.f32 %f76, %f76, %f37;
|
192 |
+
.loc 2 101 30
|
193 |
+
sub.f32 %f38, %f35, %f76;
|
194 |
+
$L__tmp2:
|
195 |
+
.loc 1 50 50
|
196 |
+
fma.rn.f32 %f77, %f36, %f38, %f77;
|
197 |
+
.loc 1 31 36
|
198 |
+
add.s64 %rd83, %rd83, 16;
|
199 |
+
add.s64 %rd82, %rd82, 16;
|
200 |
+
setp.lt.u32 %p19, %r89, 252;
|
201 |
+
@%p19 bra $L__BB0_4;
|
202 |
+
bra.uni $L__BB0_5;
|
203 |
+
$L__BB0_1:
|
204 |
+
.loc 1 0 36
|
205 |
+
mov.b32 %r90, -4;
|
206 |
+
.loc 1 31 36
|
207 |
+
shl.b64 %rd50, %rd2, 2;
|
208 |
+
mul.wide.u32 %rd88, %r3, 4;
|
209 |
+
add.s64 %rd87, %rd50, %rd88;
|
210 |
+
add.s64 %rd85, %rd34, %rd87;
|
211 |
+
shl.b32 %r27, %r15, 14;
|
212 |
+
shl.b32 %r28, %r2, 8;
|
213 |
+
or.b32 %r29, %r27, %r28;
|
214 |
+
or.b32 %r91, %r29, %r3;
|
215 |
+
add.s32 %r30, %r5, %r3;
|
216 |
+
mul.wide.s32 %rd86, %r30, 4;
|
217 |
+
add.s64 %rd84, %rd35, %rd86;
|
218 |
+
mov.f32 %f78, 0f00000000;
|
219 |
+
mov.f32 %f77, %f78;
|
220 |
+
mov.f32 %f76, %f78;
|
221 |
+
$L__BB0_2:
|
222 |
+
.loc 1 35 50
|
223 |
+
mov.u32 %r31, 0x0;
|
224 |
+
@%p13 ld.global.L1::evict_last.b32 { %r31 }, [ %rd84 + 0 ];
|
225 |
+
@!%p13 mov.u32 %r31, %r76;
|
226 |
+
mov.b32 %f22, %r31;
|
227 |
+
.loc 1 31 36
|
228 |
+
add.s32 %r90, %r90, 4;
|
229 |
+
.loc 1 36 34
|
230 |
+
add.s32 %r39, %r90, %r91;
|
231 |
+
mul.wide.s32 %rd54, %r39, 2;
|
232 |
+
add.s64 %rd52, %rd36, %rd54;
|
233 |
+
.loc 1 36 50
|
234 |
+
mov.u16 %rs1, 0x0;
|
235 |
+
@%p13 ld.global.L1::evict_last.b16 { %rs1 }, [ %rd52 + 0 ];
|
236 |
+
@!%p13 mov.u16 %rs1, %rs12;
|
237 |
+
.loc 1 36 101
|
238 |
+
cvt.f32.bf16 %r33, %rs1;
|
239 |
+
mov.b32 %f23, %r33;
|
240 |
+
.loc 1 41 52
|
241 |
+
mov.u32 %r34, 0x0;
|
242 |
+
@%p13 ld.global.L1::evict_last.b32 { %r34 }, [ %rd85 + 0 ];
|
243 |
+
@!%p13 mov.u32 %r34, %r76;
|
244 |
+
mov.b32 %f24, %r34;
|
245 |
+
.loc 1 42 22
|
246 |
+
add.f32 %f25, %f22, %f24;
|
247 |
+
.loc 1 44 22
|
248 |
+
add.f32 %f26, %f23, %f25;
|
249 |
+
$L__tmp3:
|
250 |
+
.loc 2 96 20
|
251 |
+
sub.f32 %f27, %f26, %f76;
|
252 |
+
.loc 2 97 26
|
253 |
+
add.f32 %f78, %f78, 0f3F800000;
|
254 |
+
.loc 2 98 30
|
255 |
+
mov.b32 %r37, %f27;
|
256 |
+
mov.b32 %r38, %f78;
|
257 |
+
div.full.f32 %r36, %r37, %r38;
|
258 |
+
mov.b32 %f28, %r36;
|
259 |
+
.loc 2 98 22
|
260 |
+
add.f32 %f76, %f76, %f28;
|
261 |
+
.loc 2 101 30
|
262 |
+
sub.f32 %f29, %f26, %f76;
|
263 |
+
$L__tmp4:
|
264 |
+
.loc 1 50 50
|
265 |
+
fma.rn.f32 %f77, %f27, %f29, %f77;
|
266 |
+
.loc 1 31 36
|
267 |
+
add.s64 %rd85, %rd85, 16;
|
268 |
+
add.s64 %rd84, %rd84, 16;
|
269 |
+
setp.lt.u32 %p12, %r90, 252;
|
270 |
+
@%p12 bra $L__BB0_2;
|
271 |
+
$L__BB0_5:
|
272 |
+
.loc 1 0 36
|
273 |
+
ld.param.u64 %rd38, [triton__0d1d2d3d4d5d6de7de_param_5];
|
274 |
+
$L__tmp5:
|
275 |
+
.loc 2 120 46
|
276 |
+
mov.b32 %r66, %f76;
|
277 |
+
shfl.sync.bfly.b32 %r67, %r66, 2, 31, -1;
|
278 |
+
mov.b32 %f39, %r67;
|
279 |
+
mov.b32 %r68, %f77;
|
280 |
+
shfl.sync.bfly.b32 %r69, %r68, 2, 31, -1;
|
281 |
+
mov.b32 %f40, %r69;
|
282 |
+
mov.b32 %r70, %f78;
|
283 |
+
shfl.sync.bfly.b32 %r57, %r70, 2, 31, -1;
|
284 |
+
mov.b32 %f41, %r57;
|
285 |
+
$L__tmp6:
|
286 |
+
.loc 2 108 21
|
287 |
+
sub.f32 %f42, %f39, %f76;
|
288 |
+
.loc 2 109 28
|
289 |
+
add.f32 %f43, %f78, %f41;
|
290 |
+
.loc 2 110 39
|
291 |
+
setp.eq.f32 %p20, %f43, 0f00000000;
|
292 |
+
.loc 2 110 60
|
293 |
+
mov.b32 %r58, %f43;
|
294 |
+
div.full.f32 %r56, %r57, %r58;
|
295 |
+
mov.b32 %f44, %r56;
|
296 |
+
.loc 2 110 49
|
297 |
+
selp.f32 %f45, 0f00000000, %f44, %p20;
|
298 |
+
.loc 2 112 17
|
299 |
+
fma.rn.f32 %f46, %f42, %f45, %f76;
|
300 |
+
.loc 2 113 15
|
301 |
+
add.f32 %f47, %f77, %f40;
|
302 |
+
.loc 2 113 30
|
303 |
+
mul.f32 %f48, %f42, %f42;
|
304 |
+
.loc 2 113 38
|
305 |
+
mul.f32 %f49, %f78, %f48;
|
306 |
+
.loc 2 113 22
|
307 |
+
fma.rn.f32 %f50, %f49, %f45, %f47;
|
308 |
+
$L__tmp7:
|
309 |
+
.loc 2 120 46
|
310 |
+
mov.b32 %r71, %f46;
|
311 |
+
shfl.sync.bfly.b32 %r72, %r71, 1, 31, -1;
|
312 |
+
mov.b32 %f51, %r72;
|
313 |
+
mov.b32 %r73, %f50;
|
314 |
+
shfl.sync.bfly.b32 %r74, %r73, 1, 31, -1;
|
315 |
+
mov.b32 %f52, %r74;
|
316 |
+
shfl.sync.bfly.b32 %r60, %r58, 1, 31, -1;
|
317 |
+
mov.b32 %f53, %r60;
|
318 |
+
$L__tmp8:
|
319 |
+
.loc 2 108 21
|
320 |
+
sub.f32 %f54, %f51, %f46;
|
321 |
+
.loc 2 109 28
|
322 |
+
add.f32 %f55, %f43, %f53;
|
323 |
+
.loc 2 110 39
|
324 |
+
setp.eq.f32 %p21, %f55, 0f00000000;
|
325 |
+
.loc 2 110 60
|
326 |
+
mov.b32 %r61, %f55;
|
327 |
+
div.full.f32 %r59, %r60, %r61;
|
328 |
+
mov.b32 %f56, %r59;
|
329 |
+
.loc 2 110 49
|
330 |
+
selp.f32 %f57, 0f00000000, %f56, %p21;
|
331 |
+
.loc 2 112 17
|
332 |
+
fma.rn.f32 %f16, %f54, %f57, %f46;
|
333 |
+
.loc 2 113 15
|
334 |
+
add.f32 %f58, %f50, %f52;
|
335 |
+
.loc 2 113 30
|
336 |
+
mul.f32 %f59, %f54, %f54;
|
337 |
+
.loc 2 113 38
|
338 |
+
mul.f32 %f60, %f43, %f59;
|
339 |
+
.loc 2 113 22
|
340 |
+
fma.rn.f32 %f61, %f57, %f60, %f58;
|
341 |
+
$L__tmp9:
|
342 |
+
.loc 1 75 24
|
343 |
+
mov.b32 %r63, %f61;
|
344 |
+
mov.b32 %r64, 1132462080;
|
345 |
+
div.full.f32 %r62, %r63, %r64;
|
346 |
+
mov.b32 %f62, %r62;
|
347 |
+
.loc 1 77 24
|
348 |
+
add.f32 %f17, %f62, 0f3727C5AC;
|
349 |
+
.loc 1 58 36
|
350 |
+
add.s64 %rd91, %rd34, %rd87;
|
351 |
+
add.s64 %rd90, %rd37, %rd88;
|
352 |
+
add.s64 %rd89, %rd35, %rd86;
|
353 |
+
mov.b32 %r92, -4;
|
354 |
+
setp.lt.u64 %p28, %rd47, 50257;
|
355 |
+
rsqrt.approx.ftz.f32 %f67, %f17;
|
356 |
+
bra.uni $L__BB0_6;
|
357 |
+
$L__BB0_8:
|
358 |
+
.loc 1 0 0
|
359 |
+
mov.b32 %f18, %r75;
|
360 |
+
cvt.s64.s32 %rd30, %r81;
|
361 |
+
cvt.f32.bf16 %r77, %rs7;
|
362 |
+
mov.b32 %f19, %r77;
|
363 |
+
mov.b32 %f20, %r78;
|
364 |
+
.loc 1 69 54
|
365 |
+
mov.u32 %r83, 0x0;
|
366 |
+
@%p13 ld.global.L1::evict_first.b32 { %r83 }, [ %rd91 + 0 ];
|
367 |
+
@!%p13 mov.u32 %r83, %r76;
|
368 |
+
mov.b32 %f63, %r83;
|
369 |
+
.loc 1 70 24
|
370 |
+
add.f32 %f64, %f18, %f63;
|
371 |
+
.loc 1 72 24
|
372 |
+
add.f32 %f65, %f19, %f64;
|
373 |
+
.loc 1 73 24
|
374 |
+
sub.f32 %f66, %f65, %f16;
|
375 |
+
.loc 1 79 24
|
376 |
+
mul.f32 %f68, %f66, %f67;
|
377 |
+
.loc 1 80 24
|
378 |
+
mul.f32 %f69, %f68, %f20;
|
379 |
+
.loc 1 82 29
|
380 |
+
shl.b64 %rd80, %rd30, 1;
|
381 |
+
add.s64 %rd79, %rd38, %rd80;
|
382 |
+
.loc 1 82 52
|
383 |
+
mov.b32 %r85, %f69;
|
384 |
+
cvt.rn.bf16.f32 %rs10, %r85;
|
385 |
+
@%p13 st.global.b16 [ %rd79 + 0 ], { %rs10 };
|
386 |
+
.loc 1 58 36
|
387 |
+
add.s32 %r92, %r92, 4;
|
388 |
+
add.s64 %rd91, %rd91, 16;
|
389 |
+
add.s64 %rd90, %rd90, 16;
|
390 |
+
add.s64 %rd89, %rd89, 16;
|
391 |
+
setp.lt.u32 %p32, %r92, 252;
|
392 |
+
@%p32 bra $L__BB0_6;
|
393 |
+
bra.uni $L__BB0_9;
|
394 |
+
$L__BB0_6:
|
395 |
+
.loc 1 62 51
|
396 |
+
mov.u32 %r75, 0x0;
|
397 |
+
@%p13 ld.global.L1::evict_last.b32 { %r75 }, [ %rd89 + 0 ];
|
398 |
+
@!%p13 mov.u32 %r75, %r76;
|
399 |
+
.loc 1 63 35
|
400 |
+
add.s32 %r80, %r91, %r92;
|
401 |
+
add.s32 %r81, %r80, 4;
|
402 |
+
mul.wide.s32 %rd70, %r81, 2;
|
403 |
+
add.s64 %rd68, %rd36, %rd70;
|
404 |
+
.loc 1 63 51
|
405 |
+
mov.u16 %rs7, 0x0;
|
406 |
+
@%p13 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd68 + 0 ];
|
407 |
+
@!%p13 mov.u16 %rs7, %rs12;
|
408 |
+
.loc 1 64 40
|
409 |
+
mov.u32 %r78, 0x0;
|
410 |
+
@%p13 ld.global.L1::evict_last.b32 { %r78 }, [ %rd90 + 0 ];
|
411 |
+
@!%p13 mov.u32 %r78, %r76;
|
412 |
+
.loc 1 68 57
|
413 |
+
@%p28 bra $L__BB0_8;
|
414 |
+
mov.u64 %rd71, assertMessage_1;
|
415 |
+
cvta.global.u64 %rd72, %rd71;
|
416 |
+
mov.u64 %rd73, assertFile_1;
|
417 |
+
cvta.global.u64 %rd74, %rd73;
|
418 |
+
mov.u64 %rd75, assertFunc_1;
|
419 |
+
cvta.global.u64 %rd76, %rd75;
|
420 |
+
{ // callseq 11, 0
|
421 |
+
.reg .b32 temp_param_reg;
|
422 |
+
.param .b64 param0;
|
423 |
+
st.param.b64 [param0+0], %rd72;
|
424 |
+
.param .b64 param1;
|
425 |
+
st.param.b64 [param1+0], %rd74;
|
426 |
+
.param .b32 param2;
|
427 |
+
st.param.b32 [param2+0], %r88;
|
428 |
+
.param .b64 param3;
|
429 |
+
st.param.b64 [param3+0], %rd76;
|
430 |
+
.param .b64 param4;
|
431 |
+
st.param.b64 [param4+0], %rd81;
|
432 |
+
call.uni
|
433 |
+
__assertfail,
|
434 |
+
(
|
435 |
+
param0,
|
436 |
+
param1,
|
437 |
+
param2,
|
438 |
+
param3,
|
439 |
+
param4
|
440 |
+
);
|
441 |
+
} // callseq 11
|
442 |
+
bra.uni $L__BB0_8;
|
443 |
+
$L__BB0_9:
|
444 |
+
.loc 1 58 4
|
445 |
+
ret;
|
446 |
+
$L__tmp10:
|
447 |
+
$L__func_end0:
|
448 |
+
|
449 |
+
}
|
450 |
+
// .globl __nv_rsqrtf
|
451 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
452 |
+
.param .b32 __nv_rsqrtf_param_0
|
453 |
+
)
|
454 |
+
{
|
455 |
+
.reg .f32 %f<3>;
|
456 |
+
$L__func_begin1:
|
457 |
+
|
458 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
459 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
460 |
+
st.param.f32 [func_retval0+0], %f2;
|
461 |
+
ret;
|
462 |
+
$L__func_end1:
|
463 |
+
|
464 |
+
}
|
465 |
+
.file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py"
|
466 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
467 |
+
.section .debug_abbrev
|
468 |
+
{
|
469 |
+
.b8 1
|
470 |
+
.b8 17
|
471 |
+
.b8 1
|
472 |
+
.b8 37
|
473 |
+
.b8 8
|
474 |
+
.b8 19
|
475 |
+
.b8 5
|
476 |
+
.b8 3
|
477 |
+
.b8 8
|
478 |
+
.b8 16
|
479 |
+
.b8 6
|
480 |
+
.b8 27
|
481 |
+
.b8 8
|
482 |
+
.b8 180
|
483 |
+
.b8 66
|
484 |
+
.b8 12
|
485 |
+
.b8 17
|
486 |
+
.b8 1
|
487 |
+
.b8 18
|
488 |
+
.b8 1
|
489 |
+
.b8 0
|
490 |
+
.b8 0
|
491 |
+
.b8 2
|
492 |
+
.b8 46
|
493 |
+
.b8 0
|
494 |
+
.b8 135
|
495 |
+
.b8 64
|
496 |
+
.b8 8
|
497 |
+
.b8 3
|
498 |
+
.b8 8
|
499 |
+
.b8 58
|
500 |
+
.b8 11
|
501 |
+
.b8 59
|
502 |
+
.b8 11
|
503 |
+
.b8 63
|
504 |
+
.b8 12
|
505 |
+
.b8 32
|
506 |
+
.b8 11
|
507 |
+
.b8 0
|
508 |
+
.b8 0
|
509 |
+
.b8 3
|
510 |
+
.b8 46
|
511 |
+
.b8 1
|
512 |
+
.b8 17
|
513 |
+
.b8 1
|
514 |
+
.b8 18
|
515 |
+
.b8 1
|
516 |
+
.b8 64
|
517 |
+
.b8 10
|
518 |
+
.b8 49
|
519 |
+
.b8 19
|
520 |
+
.b8 0
|
521 |
+
.b8 0
|
522 |
+
.b8 4
|
523 |
+
.b8 29
|
524 |
+
.b8 0
|
525 |
+
.b8 49
|
526 |
+
.b8 19
|
527 |
+
.b8 17
|
528 |
+
.b8 1
|
529 |
+
.b8 18
|
530 |
+
.b8 1
|
531 |
+
.b8 88
|
532 |
+
.b8 11
|
533 |
+
.b8 89
|
534 |
+
.b8 11
|
535 |
+
.b8 87
|
536 |
+
.b8 11
|
537 |
+
.b8 0
|
538 |
+
.b8 0
|
539 |
+
.b8 5
|
540 |
+
.b8 29
|
541 |
+
.b8 1
|
542 |
+
.b8 49
|
543 |
+
.b8 19
|
544 |
+
.b8 17
|
545 |
+
.b8 1
|
546 |
+
.b8 18
|
547 |
+
.b8 1
|
548 |
+
.b8 88
|
549 |
+
.b8 11
|
550 |
+
.b8 89
|
551 |
+
.b8 11
|
552 |
+
.b8 87
|
553 |
+
.b8 11
|
554 |
+
.b8 0
|
555 |
+
.b8 0
|
556 |
+
.b8 0
|
557 |
+
}
|
558 |
+
.section .debug_info
|
559 |
+
{
|
560 |
+
.b32 302
|
561 |
+
.b8 2
|
562 |
+
.b8 0
|
563 |
+
.b32 .debug_abbrev
|
564 |
+
.b8 8
|
565 |
+
.b8 1
|
566 |
+
.b8 116
|
567 |
+
.b8 114
|
568 |
+
.b8 105
|
569 |
+
.b8 116
|
570 |
+
.b8 111
|
571 |
+
.b8 110
|
572 |
+
.b8 0
|
573 |
+
.b8 2
|
574 |
+
.b8 0
|
575 |
+
.b8 99
|
576 |
+
.b8 112
|
577 |
+
.b8 110
|
578 |
+
.b8 51
|
579 |
+
.b8 108
|
580 |
+
.b8 97
|
581 |
+
.b8 119
|
582 |
+
.b8 103
|
583 |
+
.b8 54
|
584 |
+
.b8 53
|
585 |
+
.b8 108
|
586 |
+
.b8 112
|
587 |
+
.b8 105
|
588 |
+
.b8 54
|
589 |
+
.b8 51
|
590 |
+
.b8 103
|
591 |
+
.b8 118
|
592 |
+
.b8 54
|
593 |
+
.b8 99
|
594 |
+
.b8 54
|
595 |
+
.b8 112
|
596 |
+
.b8 110
|
597 |
+
.b8 52
|
598 |
+
.b8 111
|
599 |
+
.b8 105
|
600 |
+
.b8 107
|
601 |
+
.b8 104
|
602 |
+
.b8 103
|
603 |
+
.b8 54
|
604 |
+
.b8 113
|
605 |
+
.b8 118
|
606 |
+
.b8 97
|
607 |
+
.b8 50
|
608 |
+
.b8 104
|
609 |
+
.b8 50
|
610 |
+
.b8 113
|
611 |
+
.b8 106
|
612 |
+
.b8 100
|
613 |
+
.b8 112
|
614 |
+
.b8 120
|
615 |
+
.b8 101
|
616 |
+
.b8 54
|
617 |
+
.b8 113
|
618 |
+
.b8 106
|
619 |
+
.b8 52
|
620 |
+
.b8 108
|
621 |
+
.b8 118
|
622 |
+
.b8 116
|
623 |
+
.b8 116
|
624 |
+
.b8 119
|
625 |
+
.b8 101
|
626 |
+
.b8 122
|
627 |
+
.b8 46
|
628 |
+
.b8 112
|
629 |
+
.b8 121
|
630 |
+
.b8 0
|
631 |
+
.b32 .debug_line
|
632 |
+
.b8 47
|
633 |
+
.b8 116
|
634 |
+
.b8 109
|
635 |
+
.b8 112
|
636 |
+
.b8 47
|
637 |
+
.b8 116
|
638 |
+
.b8 111
|
639 |
+
.b8 114
|
640 |
+
.b8 99
|
641 |
+
.b8 104
|
642 |
+
.b8 105
|
643 |
+
.b8 110
|
644 |
+
.b8 100
|
645 |
+
.b8 117
|
646 |
+
.b8 99
|
647 |
+
.b8 116
|
648 |
+
.b8 111
|
649 |
+
.b8 114
|
650 |
+
.b8 95
|
651 |
+
.b8 114
|
652 |
+
.b8 111
|
653 |
+
.b8 111
|
654 |
+
.b8 116
|
655 |
+
.b8 47
|
656 |
+
.b8 112
|
657 |
+
.b8 110
|
658 |
+
.b8 0
|
659 |
+
.b8 1
|
660 |
+
.b64 $L__func_begin0
|
661 |
+
.b64 $L__func_end0
|
662 |
+
.b8 2
|
663 |
+
.b8 116
|
664 |
+
.b8 114
|
665 |
+
.b8 105
|
666 |
+
.b8 116
|
667 |
+
.b8 111
|
668 |
+
.b8 110
|
669 |
+
.b8 95
|
670 |
+
.b8 95
|
671 |
+
.b8 48
|
672 |
+
.b8 100
|
673 |
+
.b8 49
|
674 |
+
.b8 100
|
675 |
+
.b8 50
|
676 |
+
.b8 100
|
677 |
+
.b8 51
|
678 |
+
.b8 100
|
679 |
+
.b8 52
|
680 |
+
.b8 100
|
681 |
+
.b8 53
|
682 |
+
.b8 100
|
683 |
+
.b8 54
|
684 |
+
.b8 100
|
685 |
+
.b8 101
|
686 |
+
.b8 55
|
687 |
+
.b8 100
|
688 |
+
.b8 101
|
689 |
+
.b8 0
|
690 |
+
.b8 116
|
691 |
+
.b8 114
|
692 |
+
.b8 105
|
693 |
+
.b8 116
|
694 |
+
.b8 111
|
695 |
+
.b8 110
|
696 |
+
.b8 95
|
697 |
+
.b8 95
|
698 |
+
.b8 48
|
699 |
+
.b8 100
|
700 |
+
.b8 49
|
701 |
+
.b8 100
|
702 |
+
.b8 50
|
703 |
+
.b8 100
|
704 |
+
.b8 51
|
705 |
+
.b8 100
|
706 |
+
.b8 52
|
707 |
+
.b8 100
|
708 |
+
.b8 53
|
709 |
+
.b8 100
|
710 |
+
.b8 54
|
711 |
+
.b8 100
|
712 |
+
.b8 101
|
713 |
+
.b8 55
|
714 |
+
.b8 100
|
715 |
+
.b8 101
|
716 |
+
.b8 0
|
717 |
+
.b8 1
|
718 |
+
.b8 18
|
719 |
+
.b8 1
|
720 |
+
.b8 1
|
721 |
+
.b8 3
|
722 |
+
.b64 $L__func_begin0
|
723 |
+
.b64 $L__func_end0
|
724 |
+
.b8 1
|
725 |
+
.b8 156
|
726 |
+
.b32 125
|
727 |
+
.b8 4
|
728 |
+
.b32 125
|
729 |
+
.b64 $L__tmp1
|
730 |
+
.b64 $L__tmp4
|
731 |
+
.b8 2
|
732 |
+
.b8 47
|
733 |
+
.b8 41
|
734 |
+
.b8 4
|
735 |
+
.b32 125
|
736 |
+
.b64 $L__tmp5
|
737 |
+
.b64 $L__tmp8
|
738 |
+
.b8 2
|
739 |
+
.b8 53
|
740 |
+
.b8 44
|
741 |
+
.b8 5
|
742 |
+
.b32 125
|
743 |
+
.b64 $L__tmp6
|
744 |
+
.b64 $L__tmp9
|
745 |
+
.b8 2
|
746 |
+
.b8 53
|
747 |
+
.b8 44
|
748 |
+
.b8 4
|
749 |
+
.b32 125
|
750 |
+
.b64 $L__tmp6
|
751 |
+
.b64 $L__tmp9
|
752 |
+
.b8 2
|
753 |
+
.b8 120
|
754 |
+
.b8 46
|
755 |
+
.b8 0
|
756 |
+
.b8 0
|
757 |
+
.b8 0
|
758 |
+
}
|
759 |
+
.section .debug_pubnames
|
760 |
+
{
|
761 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
762 |
+
$L__pubNames_start0:
|
763 |
+
.b8 2
|
764 |
+
.b8 0
|
765 |
+
.b32 .debug_info
|
766 |
+
.b32 306
|
767 |
+
.b32 125
|
768 |
+
.b8 116
|
769 |
+
.b8 114
|
770 |
+
.b8 105
|
771 |
+
.b8 116
|
772 |
+
.b8 111
|
773 |
+
.b8 110
|
774 |
+
.b8 95
|
775 |
+
.b8 95
|
776 |
+
.b8 48
|
777 |
+
.b8 100
|
778 |
+
.b8 49
|
779 |
+
.b8 100
|
780 |
+
.b8 50
|
781 |
+
.b8 100
|
782 |
+
.b8 51
|
783 |
+
.b8 100
|
784 |
+
.b8 52
|
785 |
+
.b8 100
|
786 |
+
.b8 53
|
787 |
+
.b8 100
|
788 |
+
.b8 54
|
789 |
+
.b8 100
|
790 |
+
.b8 101
|
791 |
+
.b8 55
|
792 |
+
.b8 100
|
793 |
+
.b8 101
|
794 |
+
.b8 0
|
795 |
+
.b32 0
|
796 |
+
$L__pubNames_end0:
|
797 |
+
}
|
798 |
+
.section .debug_pubtypes
|
799 |
+
{
|
800 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
801 |
+
$L__pubTypes_start0:
|
802 |
+
.b8 2
|
803 |
+
.b8 0
|
804 |
+
.b32 .debug_info
|
805 |
+
.b32 306
|
806 |
+
.b32 0
|
807 |
+
$L__pubTypes_end0:
|
808 |
+
}
|
809 |
+
.section .debug_loc { }
|
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ptx
ADDED
@@ -0,0 +1,717 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5de6de
|
10 |
+
.extern .shared .align 1 .b8 global_smem[];
|
11 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
12 |
+
|
13 |
+
.visible .entry triton__0d1d2d3d4d5de6de(
|
14 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_0,
|
15 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_1,
|
16 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_2,
|
17 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_3,
|
18 |
+
.param .u64 triton__0d1d2d3d4d5de6de_param_4,
|
19 |
+
.param .u32 triton__0d1d2d3d4d5de6de_param_5,
|
20 |
+
.param .u32 triton__0d1d2d3d4d5de6de_param_6
|
21 |
+
)
|
22 |
+
.maxntid 64, 1, 1
|
23 |
+
{
|
24 |
+
.reg .pred %p<26>;
|
25 |
+
.reg .b16 %rs<9>;
|
26 |
+
.reg .b32 %r<88>;
|
27 |
+
.reg .f32 %f<78>;
|
28 |
+
.reg .b64 %rd<14>;
|
29 |
+
.loc 1 18 0
|
30 |
+
$L__func_begin0:
|
31 |
+
.loc 1 18 0
|
32 |
+
|
33 |
+
ld.param.u64 %rd6, [triton__0d1d2d3d4d5de6de_param_0];
|
34 |
+
ld.param.u64 %rd7, [triton__0d1d2d3d4d5de6de_param_1];
|
35 |
+
$L__tmp0:
|
36 |
+
.loc 1 26 26
|
37 |
+
mov.u32 %r56, %tid.x;
|
38 |
+
and.b32 %r57, %r56, 31;
|
39 |
+
ld.param.u64 %rd8, [triton__0d1d2d3d4d5de6de_param_2];
|
40 |
+
ld.param.u64 %rd9, [triton__0d1d2d3d4d5de6de_param_3];
|
41 |
+
ld.param.u64 %rd10, [triton__0d1d2d3d4d5de6de_param_4];
|
42 |
+
shl.b32 %r58, %r56, 2;
|
43 |
+
and.b32 %r59, %r58, 252;
|
44 |
+
.loc 1 23 28
|
45 |
+
mov.u32 %r1, %ctaid.x;
|
46 |
+
.loc 1 30 40
|
47 |
+
shl.b32 %r60, %r1, 8;
|
48 |
+
.loc 1 30 36
|
49 |
+
or.b32 %r61, %r60, %r59;
|
50 |
+
.loc 1 30 30
|
51 |
+
mul.wide.s32 %rd11, %r61, 4;
|
52 |
+
add.s64 %rd1, %rd6, %rd11;
|
53 |
+
mov.b32 %r6, 0;
|
54 |
+
mov.pred %p1, -1;
|
55 |
+
.loc 1 30 46
|
56 |
+
mov.u32 %r2, 0x0;
|
57 |
+
mov.u32 %r3, 0x0;
|
58 |
+
mov.u32 %r4, 0x0;
|
59 |
+
mov.u32 %r5, 0x0;
|
60 |
+
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
61 |
+
@!%p1 mov.u32 %r2, %r6;
|
62 |
+
@!%p1 mov.u32 %r3, %r6;
|
63 |
+
@!%p1 mov.u32 %r4, %r6;
|
64 |
+
@!%p1 mov.u32 %r5, %r6;
|
65 |
+
mov.b32 %f1, %r4;
|
66 |
+
mov.b32 %f2, %r5;
|
67 |
+
.loc 1 31 30
|
68 |
+
mul.wide.s32 %rd12, %r61, 2;
|
69 |
+
add.s64 %rd2, %rd7, %rd12;
|
70 |
+
.loc 1 31 46
|
71 |
+
mov.u32 %r10, 0x0;
|
72 |
+
mov.u32 %r11, 0x0;
|
73 |
+
@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
|
74 |
+
@!%p1 mov.u32 %r10, %r6;
|
75 |
+
@!%p1 mov.u32 %r11, %r6;
|
76 |
+
cvt.u16.u32 %rs1, %r10;
|
77 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
|
78 |
+
cvt.u16.u32 %rs3, %r11;
|
79 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
|
80 |
+
.loc 1 31 67
|
81 |
+
cvt.f32.bf16 %r14, %rs1;
|
82 |
+
mov.b32 %f3, %r14;
|
83 |
+
cvt.f32.bf16 %r15, %rs2;
|
84 |
+
mov.b32 %f4, %r15;
|
85 |
+
cvt.f32.bf16 %r16, %rs3;
|
86 |
+
mov.b32 %f5, %r16;
|
87 |
+
cvt.f32.bf16 %r17, %rs4;
|
88 |
+
mov.b32 %f6, %r17;
|
89 |
+
.loc 1 32 30
|
90 |
+
add.s64 %rd3, %rd8, %rd12;
|
91 |
+
.loc 1 32 46
|
92 |
+
mov.u32 %r18, 0x0;
|
93 |
+
mov.u32 %r19, 0x0;
|
94 |
+
@%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
|
95 |
+
@!%p1 mov.u32 %r18, %r6;
|
96 |
+
@!%p1 mov.u32 %r19, %r6;
|
97 |
+
cvt.u16.u32 %rs5, %r18;
|
98 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
|
99 |
+
cvt.u16.u32 %rs7, %r19;
|
100 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
|
101 |
+
.loc 1 32 67
|
102 |
+
cvt.f32.bf16 %r22, %rs5;
|
103 |
+
mov.b32 %f7, %r22;
|
104 |
+
cvt.f32.bf16 %r23, %rs6;
|
105 |
+
mov.b32 %f8, %r23;
|
106 |
+
cvt.f32.bf16 %r24, %rs7;
|
107 |
+
mov.b32 %f9, %r24;
|
108 |
+
cvt.f32.bf16 %r25, %rs8;
|
109 |
+
mov.b32 %f10, %r25;
|
110 |
+
.loc 1 33 31
|
111 |
+
mul.wide.u32 %rd13, %r59, 4;
|
112 |
+
add.s64 %rd4, %rd9, %rd13;
|
113 |
+
.loc 1 33 36
|
114 |
+
mov.u32 %r26, 0x0;
|
115 |
+
mov.u32 %r27, 0x0;
|
116 |
+
mov.u32 %r28, 0x0;
|
117 |
+
mov.u32 %r29, 0x0;
|
118 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ];
|
119 |
+
@!%p1 mov.u32 %r26, %r6;
|
120 |
+
@!%p1 mov.u32 %r27, %r6;
|
121 |
+
@!%p1 mov.u32 %r28, %r6;
|
122 |
+
@!%p1 mov.u32 %r29, %r6;
|
123 |
+
.loc 1 35 18
|
124 |
+
add.f32 %f11, %f5, %f1;
|
125 |
+
add.f32 %f12, %f6, %f2;
|
126 |
+
.loc 1 30 46
|
127 |
+
mov.b32 %f13, %r3;
|
128 |
+
mov.b32 %f14, %r2;
|
129 |
+
.loc 1 35 18
|
130 |
+
add.f32 %f15, %f3, %f14;
|
131 |
+
add.f32 %f16, %f4, %f13;
|
132 |
+
.loc 1 37 18
|
133 |
+
add.f32 %f17, %f16, %f8;
|
134 |
+
add.f32 %f18, %f15, %f7;
|
135 |
+
add.f32 %f19, %f11, %f9;
|
136 |
+
add.f32 %f20, %f12, %f10;
|
137 |
+
$L__tmp1:
|
138 |
+
.loc 2 233 15
|
139 |
+
add.f32 %f21, %f18, %f17;
|
140 |
+
add.f32 %f22, %f21, %f19;
|
141 |
+
add.f32 %f23, %f22, %f20;
|
142 |
+
$L__tmp2:
|
143 |
+
.loc 2 243 36
|
144 |
+
mov.b32 %r62, %f23;
|
145 |
+
shfl.sync.bfly.b32 %r63, %r62, 16, 31, -1;
|
146 |
+
mov.b32 %f24, %r63;
|
147 |
+
$L__tmp3:
|
148 |
+
.loc 2 233 15
|
149 |
+
add.f32 %f25, %f23, %f24;
|
150 |
+
$L__tmp4:
|
151 |
+
.loc 2 243 36
|
152 |
+
mov.b32 %r64, %f25;
|
153 |
+
shfl.sync.bfly.b32 %r65, %r64, 8, 31, -1;
|
154 |
+
mov.b32 %f26, %r65;
|
155 |
+
$L__tmp5:
|
156 |
+
.loc 2 233 15
|
157 |
+
add.f32 %f27, %f25, %f26;
|
158 |
+
$L__tmp6:
|
159 |
+
.loc 2 243 36
|
160 |
+
mov.b32 %r66, %f27;
|
161 |
+
shfl.sync.bfly.b32 %r67, %r66, 4, 31, -1;
|
162 |
+
mov.b32 %f28, %r67;
|
163 |
+
$L__tmp7:
|
164 |
+
.loc 2 233 15
|
165 |
+
add.f32 %f29, %f27, %f28;
|
166 |
+
$L__tmp8:
|
167 |
+
.loc 2 243 36
|
168 |
+
mov.b32 %r68, %f29;
|
169 |
+
shfl.sync.bfly.b32 %r69, %r68, 2, 31, -1;
|
170 |
+
mov.b32 %f30, %r69;
|
171 |
+
$L__tmp9:
|
172 |
+
.loc 2 233 15
|
173 |
+
add.f32 %f31, %f29, %f30;
|
174 |
+
$L__tmp10:
|
175 |
+
.loc 2 243 36
|
176 |
+
mov.b32 %r70, %f31;
|
177 |
+
shfl.sync.bfly.b32 %r71, %r70, 1, 31, -1;
|
178 |
+
mov.b32 %f32, %r71;
|
179 |
+
$L__tmp11:
|
180 |
+
.loc 2 233 15
|
181 |
+
add.f32 %f33, %f31, %f32;
|
182 |
+
$L__tmp12:
|
183 |
+
.loc 2 243 36
|
184 |
+
setp.eq.s32 %p17, %r57, 0;
|
185 |
+
shr.u32 %r72, %r56, 3;
|
186 |
+
and.b32 %r73, %r72, 4;
|
187 |
+
mov.u32 %r74, global_smem;
|
188 |
+
add.s32 %r34, %r74, %r73;
|
189 |
+
mov.b32 %r35, %f33;
|
190 |
+
@%p17 st.shared.b32 [ %r34 + 0 ], %r35;
|
191 |
+
bar.sync 0;
|
192 |
+
setp.lt.s32 %p18, %r56, 2;
|
193 |
+
add.s32 %r37, %r74, %r58;
|
194 |
+
@%p18 ld.shared.b32 %r36, [ %r37 + 0 ];
|
195 |
+
mov.b32 %f34, %r36;
|
196 |
+
shfl.sync.bfly.b32 %r75, %r36, 1, 31, -1;
|
197 |
+
mov.b32 %f35, %r75;
|
198 |
+
$L__tmp13:
|
199 |
+
.loc 2 233 15
|
200 |
+
add.f32 %f36, %f34, %f35;
|
201 |
+
$L__tmp14:
|
202 |
+
.loc 2 243 36
|
203 |
+
and.b32 %r76, %r56, 1;
|
204 |
+
setp.eq.b32 %p24, %r76, 1;
|
205 |
+
not.pred %p25, %p24;
|
206 |
+
and.pred %p19, %p18, %p25;
|
207 |
+
mov.b32 %r39, %f36;
|
208 |
+
@%p19 st.shared.b32 [ %r37 + 0 ], %r39;
|
209 |
+
bar.sync 0;
|
210 |
+
ld.shared.f32 %f37, [global_smem];
|
211 |
+
$L__tmp15:
|
212 |
+
.loc 3 8 15
|
213 |
+
add.f32 %f38, %f37, 0f00000000;
|
214 |
+
$L__tmp16:
|
215 |
+
.loc 1 45 20
|
216 |
+
mov.b32 %r41, %f38;
|
217 |
+
mov.b32 %r42, 1132462080;
|
218 |
+
div.full.f32 %r40, %r41, %r42;
|
219 |
+
mov.b32 %f39, %r40;
|
220 |
+
.loc 1 46 19
|
221 |
+
sub.f32 %f40, %f18, %f39;
|
222 |
+
sub.f32 %f41, %f17, %f39;
|
223 |
+
sub.f32 %f42, %f19, %f39;
|
224 |
+
sub.f32 %f43, %f20, %f39;
|
225 |
+
.loc 1 47 20
|
226 |
+
mul.f32 %f44, %f41, %f41;
|
227 |
+
$L__tmp17:
|
228 |
+
.loc 2 243 36
|
229 |
+
bar.sync 0;
|
230 |
+
$L__tmp18:
|
231 |
+
.loc 2 233 15
|
232 |
+
fma.rn.f32 %f45, %f40, %f40, %f44;
|
233 |
+
fma.rn.f32 %f46, %f42, %f42, %f45;
|
234 |
+
fma.rn.f32 %f47, %f43, %f43, %f46;
|
235 |
+
$L__tmp19:
|
236 |
+
.loc 2 243 36
|
237 |
+
mov.b32 %r77, %f47;
|
238 |
+
shfl.sync.bfly.b32 %r78, %r77, 16, 31, -1;
|
239 |
+
mov.b32 %f48, %r78;
|
240 |
+
$L__tmp20:
|
241 |
+
.loc 2 233 15
|
242 |
+
add.f32 %f49, %f47, %f48;
|
243 |
+
$L__tmp21:
|
244 |
+
.loc 2 243 36
|
245 |
+
mov.b32 %r79, %f49;
|
246 |
+
shfl.sync.bfly.b32 %r80, %r79, 8, 31, -1;
|
247 |
+
mov.b32 %f50, %r80;
|
248 |
+
$L__tmp22:
|
249 |
+
.loc 2 233 15
|
250 |
+
add.f32 %f51, %f49, %f50;
|
251 |
+
$L__tmp23:
|
252 |
+
.loc 2 243 36
|
253 |
+
mov.b32 %r81, %f51;
|
254 |
+
shfl.sync.bfly.b32 %r82, %r81, 4, 31, -1;
|
255 |
+
mov.b32 %f52, %r82;
|
256 |
+
$L__tmp24:
|
257 |
+
.loc 2 233 15
|
258 |
+
add.f32 %f53, %f51, %f52;
|
259 |
+
$L__tmp25:
|
260 |
+
.loc 2 243 36
|
261 |
+
mov.b32 %r83, %f53;
|
262 |
+
shfl.sync.bfly.b32 %r84, %r83, 2, 31, -1;
|
263 |
+
mov.b32 %f54, %r84;
|
264 |
+
$L__tmp26:
|
265 |
+
.loc 2 233 15
|
266 |
+
add.f32 %f55, %f53, %f54;
|
267 |
+
$L__tmp27:
|
268 |
+
.loc 2 243 36
|
269 |
+
mov.b32 %r85, %f55;
|
270 |
+
shfl.sync.bfly.b32 %r86, %r85, 1, 31, -1;
|
271 |
+
mov.b32 %f56, %r86;
|
272 |
+
$L__tmp28:
|
273 |
+
.loc 2 233 15
|
274 |
+
add.f32 %f57, %f55, %f56;
|
275 |
+
$L__tmp29:
|
276 |
+
.loc 2 243 36
|
277 |
+
mov.b32 %r44, %f57;
|
278 |
+
@%p17 st.shared.b32 [ %r34 + 0 ], %r44;
|
279 |
+
bar.sync 0;
|
280 |
+
@%p18 ld.shared.b32 %r45, [ %r37 + 0 ];
|
281 |
+
mov.b32 %f58, %r45;
|
282 |
+
shfl.sync.bfly.b32 %r87, %r45, 1, 31, -1;
|
283 |
+
mov.b32 %f59, %r87;
|
284 |
+
$L__tmp30:
|
285 |
+
.loc 2 233 15
|
286 |
+
add.f32 %f60, %f58, %f59;
|
287 |
+
$L__tmp31:
|
288 |
+
.loc 2 243 36
|
289 |
+
mov.b32 %r48, %f60;
|
290 |
+
@%p19 st.shared.b32 [ %r37 + 0 ], %r48;
|
291 |
+
bar.sync 0;
|
292 |
+
ld.shared.f32 %f61, [global_smem];
|
293 |
+
$L__tmp32:
|
294 |
+
.loc 3 8 15
|
295 |
+
add.f32 %f62, %f61, 0f00000000;
|
296 |
+
$L__tmp33:
|
297 |
+
.loc 1 53 20
|
298 |
+
mov.b32 %r50, %f62;
|
299 |
+
div.full.f32 %r49, %r50, %r42;
|
300 |
+
mov.b32 %f63, %r49;
|
301 |
+
.loc 1 55 20
|
302 |
+
add.f32 %f64, %f63, 0f3727C5AC;
|
303 |
+
.loc 1 56 26
|
304 |
+
rsqrt.approx.ftz.f32 %f65, %f64;
|
305 |
+
.loc 1 33 36
|
306 |
+
mov.b32 %f66, %r29;
|
307 |
+
mov.b32 %f67, %r28;
|
308 |
+
mov.b32 %f68, %r27;
|
309 |
+
mov.b32 %f69, %r26;
|
310 |
+
.loc 1 57 20
|
311 |
+
mul.f32 %f70, %f40, %f65;
|
312 |
+
mul.f32 %f71, %f41, %f65;
|
313 |
+
mul.f32 %f72, %f42, %f65;
|
314 |
+
mul.f32 %f73, %f43, %f65;
|
315 |
+
.loc 1 58 20
|
316 |
+
mul.f32 %f74, %f70, %f69;
|
317 |
+
mul.f32 %f75, %f71, %f68;
|
318 |
+
mul.f32 %f76, %f72, %f67;
|
319 |
+
mul.f32 %f77, %f73, %f66;
|
320 |
+
.loc 1 59 25
|
321 |
+
add.s64 %rd5, %rd10, %rd11;
|
322 |
+
.loc 1 59 48
|
323 |
+
mov.b32 %r52, %f74;
|
324 |
+
mov.b32 %r53, %f75;
|
325 |
+
mov.b32 %r54, %f76;
|
326 |
+
mov.b32 %r55, %f77;
|
327 |
+
@%p1 st.global.v4.b32 [ %rd5 + 0 ], { %r52, %r53, %r54, %r55 };
|
328 |
+
.loc 1 59 4
|
329 |
+
ret;
|
330 |
+
$L__tmp34:
|
331 |
+
$L__func_end0:
|
332 |
+
|
333 |
+
}
|
334 |
+
// .globl __nv_rsqrtf
|
335 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
336 |
+
.param .b32 __nv_rsqrtf_param_0
|
337 |
+
)
|
338 |
+
{
|
339 |
+
.reg .f32 %f<3>;
|
340 |
+
$L__func_begin1:
|
341 |
+
|
342 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
343 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
344 |
+
st.param.f32 [func_retval0+0], %f2;
|
345 |
+
ret;
|
346 |
+
$L__func_end1:
|
347 |
+
|
348 |
+
}
|
349 |
+
.file 1 "/tmp/torchinductor_root/tv/ctvr3xs46luhhbr7xomihgyropjaatss7yata4igaw6kvgwas7g2.py"
|
350 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
351 |
+
.file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
352 |
+
.section .debug_abbrev
|
353 |
+
{
|
354 |
+
.b8 1
|
355 |
+
.b8 17
|
356 |
+
.b8 1
|
357 |
+
.b8 37
|
358 |
+
.b8 8
|
359 |
+
.b8 19
|
360 |
+
.b8 5
|
361 |
+
.b8 3
|
362 |
+
.b8 8
|
363 |
+
.b8 16
|
364 |
+
.b8 6
|
365 |
+
.b8 27
|
366 |
+
.b8 8
|
367 |
+
.b8 180
|
368 |
+
.b8 66
|
369 |
+
.b8 12
|
370 |
+
.b8 17
|
371 |
+
.b8 1
|
372 |
+
.b8 18
|
373 |
+
.b8 1
|
374 |
+
.b8 0
|
375 |
+
.b8 0
|
376 |
+
.b8 2
|
377 |
+
.b8 46
|
378 |
+
.b8 0
|
379 |
+
.b8 135
|
380 |
+
.b8 64
|
381 |
+
.b8 8
|
382 |
+
.b8 3
|
383 |
+
.b8 8
|
384 |
+
.b8 58
|
385 |
+
.b8 11
|
386 |
+
.b8 59
|
387 |
+
.b8 11
|
388 |
+
.b8 63
|
389 |
+
.b8 12
|
390 |
+
.b8 32
|
391 |
+
.b8 11
|
392 |
+
.b8 0
|
393 |
+
.b8 0
|
394 |
+
.b8 3
|
395 |
+
.b8 46
|
396 |
+
.b8 1
|
397 |
+
.b8 17
|
398 |
+
.b8 1
|
399 |
+
.b8 18
|
400 |
+
.b8 1
|
401 |
+
.b8 64
|
402 |
+
.b8 10
|
403 |
+
.b8 49
|
404 |
+
.b8 19
|
405 |
+
.b8 0
|
406 |
+
.b8 0
|
407 |
+
.b8 4
|
408 |
+
.b8 29
|
409 |
+
.b8 1
|
410 |
+
.b8 49
|
411 |
+
.b8 19
|
412 |
+
.b8 17
|
413 |
+
.b8 1
|
414 |
+
.b8 18
|
415 |
+
.b8 1
|
416 |
+
.b8 88
|
417 |
+
.b8 11
|
418 |
+
.b8 89
|
419 |
+
.b8 11
|
420 |
+
.b8 87
|
421 |
+
.b8 11
|
422 |
+
.b8 0
|
423 |
+
.b8 0
|
424 |
+
.b8 5
|
425 |
+
.b8 29
|
426 |
+
.b8 0
|
427 |
+
.b8 49
|
428 |
+
.b8 19
|
429 |
+
.b8 17
|
430 |
+
.b8 1
|
431 |
+
.b8 18
|
432 |
+
.b8 1
|
433 |
+
.b8 88
|
434 |
+
.b8 11
|
435 |
+
.b8 89
|
436 |
+
.b8 11
|
437 |
+
.b8 87
|
438 |
+
.b8 11
|
439 |
+
.b8 0
|
440 |
+
.b8 0
|
441 |
+
.b8 0
|
442 |
+
}
|
443 |
+
.section .debug_info
|
444 |
+
{
|
445 |
+
.b32 395
|
446 |
+
.b8 2
|
447 |
+
.b8 0
|
448 |
+
.b32 .debug_abbrev
|
449 |
+
.b8 8
|
450 |
+
.b8 1
|
451 |
+
.b8 116
|
452 |
+
.b8 114
|
453 |
+
.b8 105
|
454 |
+
.b8 116
|
455 |
+
.b8 111
|
456 |
+
.b8 110
|
457 |
+
.b8 0
|
458 |
+
.b8 2
|
459 |
+
.b8 0
|
460 |
+
.b8 99
|
461 |
+
.b8 116
|
462 |
+
.b8 118
|
463 |
+
.b8 114
|
464 |
+
.b8 51
|
465 |
+
.b8 120
|
466 |
+
.b8 115
|
467 |
+
.b8 52
|
468 |
+
.b8 54
|
469 |
+
.b8 108
|
470 |
+
.b8 117
|
471 |
+
.b8 104
|
472 |
+
.b8 104
|
473 |
+
.b8 98
|
474 |
+
.b8 114
|
475 |
+
.b8 55
|
476 |
+
.b8 120
|
477 |
+
.b8 111
|
478 |
+
.b8 109
|
479 |
+
.b8 105
|
480 |
+
.b8 104
|
481 |
+
.b8 103
|
482 |
+
.b8 121
|
483 |
+
.b8 114
|
484 |
+
.b8 111
|
485 |
+
.b8 112
|
486 |
+
.b8 106
|
487 |
+
.b8 97
|
488 |
+
.b8 97
|
489 |
+
.b8 116
|
490 |
+
.b8 115
|
491 |
+
.b8 115
|
492 |
+
.b8 55
|
493 |
+
.b8 121
|
494 |
+
.b8 97
|
495 |
+
.b8 116
|
496 |
+
.b8 97
|
497 |
+
.b8 52
|
498 |
+
.b8 105
|
499 |
+
.b8 103
|
500 |
+
.b8 97
|
501 |
+
.b8 119
|
502 |
+
.b8 54
|
503 |
+
.b8 107
|
504 |
+
.b8 118
|
505 |
+
.b8 103
|
506 |
+
.b8 119
|
507 |
+
.b8 97
|
508 |
+
.b8 115
|
509 |
+
.b8 55
|
510 |
+
.b8 103
|
511 |
+
.b8 50
|
512 |
+
.b8 46
|
513 |
+
.b8 112
|
514 |
+
.b8 121
|
515 |
+
.b8 0
|
516 |
+
.b32 .debug_line
|
517 |
+
.b8 47
|
518 |
+
.b8 116
|
519 |
+
.b8 109
|
520 |
+
.b8 112
|
521 |
+
.b8 47
|
522 |
+
.b8 116
|
523 |
+
.b8 111
|
524 |
+
.b8 114
|
525 |
+
.b8 99
|
526 |
+
.b8 104
|
527 |
+
.b8 105
|
528 |
+
.b8 110
|
529 |
+
.b8 100
|
530 |
+
.b8 117
|
531 |
+
.b8 99
|
532 |
+
.b8 116
|
533 |
+
.b8 111
|
534 |
+
.b8 114
|
535 |
+
.b8 95
|
536 |
+
.b8 114
|
537 |
+
.b8 111
|
538 |
+
.b8 111
|
539 |
+
.b8 116
|
540 |
+
.b8 47
|
541 |
+
.b8 116
|
542 |
+
.b8 118
|
543 |
+
.b8 0
|
544 |
+
.b8 1
|
545 |
+
.b64 $L__func_begin0
|
546 |
+
.b64 $L__func_end0
|
547 |
+
.b8 2
|
548 |
+
.b8 116
|
549 |
+
.b8 114
|
550 |
+
.b8 105
|
551 |
+
.b8 116
|
552 |
+
.b8 111
|
553 |
+
.b8 110
|
554 |
+
.b8 95
|
555 |
+
.b8 95
|
556 |
+
.b8 48
|
557 |
+
.b8 100
|
558 |
+
.b8 49
|
559 |
+
.b8 100
|
560 |
+
.b8 50
|
561 |
+
.b8 100
|
562 |
+
.b8 51
|
563 |
+
.b8 100
|
564 |
+
.b8 52
|
565 |
+
.b8 100
|
566 |
+
.b8 53
|
567 |
+
.b8 100
|
568 |
+
.b8 101
|
569 |
+
.b8 54
|
570 |
+
.b8 100
|
571 |
+
.b8 101
|
572 |
+
.b8 0
|
573 |
+
.b8 116
|
574 |
+
.b8 114
|
575 |
+
.b8 105
|
576 |
+
.b8 116
|
577 |
+
.b8 111
|
578 |
+
.b8 110
|
579 |
+
.b8 95
|
580 |
+
.b8 95
|
581 |
+
.b8 48
|
582 |
+
.b8 100
|
583 |
+
.b8 49
|
584 |
+
.b8 100
|
585 |
+
.b8 50
|
586 |
+
.b8 100
|
587 |
+
.b8 51
|
588 |
+
.b8 100
|
589 |
+
.b8 52
|
590 |
+
.b8 100
|
591 |
+
.b8 53
|
592 |
+
.b8 100
|
593 |
+
.b8 101
|
594 |
+
.b8 54
|
595 |
+
.b8 100
|
596 |
+
.b8 101
|
597 |
+
.b8 0
|
598 |
+
.b8 1
|
599 |
+
.b8 18
|
600 |
+
.b8 1
|
601 |
+
.b8 1
|
602 |
+
.b8 3
|
603 |
+
.b64 $L__func_begin0
|
604 |
+
.b64 $L__func_end0
|
605 |
+
.b8 1
|
606 |
+
.b8 156
|
607 |
+
.b32 125
|
608 |
+
.b8 4
|
609 |
+
.b32 125
|
610 |
+
.b64 $L__tmp1
|
611 |
+
.b64 $L__tmp14
|
612 |
+
.b8 2
|
613 |
+
.b8 42
|
614 |
+
.b8 59
|
615 |
+
.b8 5
|
616 |
+
.b32 125
|
617 |
+
.b64 $L__tmp1
|
618 |
+
.b64 $L__tmp14
|
619 |
+
.b8 2
|
620 |
+
.b8 243
|
621 |
+
.b8 36
|
622 |
+
.b8 0
|
623 |
+
.b8 5
|
624 |
+
.b32 125
|
625 |
+
.b64 $L__tmp2
|
626 |
+
.b64 $L__tmp15
|
627 |
+
.b8 2
|
628 |
+
.b8 42
|
629 |
+
.b8 59
|
630 |
+
.b8 5
|
631 |
+
.b32 125
|
632 |
+
.b64 $L__tmp15
|
633 |
+
.b64 $L__tmp16
|
634 |
+
.b8 3
|
635 |
+
.b8 42
|
636 |
+
.b8 45
|
637 |
+
.b8 5
|
638 |
+
.b32 125
|
639 |
+
.b64 $L__tmp17
|
640 |
+
.b64 $L__tmp32
|
641 |
+
.b8 2
|
642 |
+
.b8 50
|
643 |
+
.b8 59
|
644 |
+
.b8 4
|
645 |
+
.b32 125
|
646 |
+
.b64 $L__tmp18
|
647 |
+
.b64 $L__tmp31
|
648 |
+
.b8 2
|
649 |
+
.b8 50
|
650 |
+
.b8 59
|
651 |
+
.b8 5
|
652 |
+
.b32 125
|
653 |
+
.b64 $L__tmp18
|
654 |
+
.b64 $L__tmp31
|
655 |
+
.b8 2
|
656 |
+
.b8 243
|
657 |
+
.b8 36
|
658 |
+
.b8 0
|
659 |
+
.b8 5
|
660 |
+
.b32 125
|
661 |
+
.b64 $L__tmp32
|
662 |
+
.b64 $L__tmp33
|
663 |
+
.b8 3
|
664 |
+
.b8 50
|
665 |
+
.b8 45
|
666 |
+
.b8 0
|
667 |
+
.b8 0
|
668 |
+
}
|
669 |
+
.section .debug_pubnames
|
670 |
+
{
|
671 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
672 |
+
$L__pubNames_start0:
|
673 |
+
.b8 2
|
674 |
+
.b8 0
|
675 |
+
.b32 .debug_info
|
676 |
+
.b32 399
|
677 |
+
.b32 125
|
678 |
+
.b8 116
|
679 |
+
.b8 114
|
680 |
+
.b8 105
|
681 |
+
.b8 116
|
682 |
+
.b8 111
|
683 |
+
.b8 110
|
684 |
+
.b8 95
|
685 |
+
.b8 95
|
686 |
+
.b8 48
|
687 |
+
.b8 100
|
688 |
+
.b8 49
|
689 |
+
.b8 100
|
690 |
+
.b8 50
|
691 |
+
.b8 100
|
692 |
+
.b8 51
|
693 |
+
.b8 100
|
694 |
+
.b8 52
|
695 |
+
.b8 100
|
696 |
+
.b8 53
|
697 |
+
.b8 100
|
698 |
+
.b8 101
|
699 |
+
.b8 54
|
700 |
+
.b8 100
|
701 |
+
.b8 101
|
702 |
+
.b8 0
|
703 |
+
.b32 0
|
704 |
+
$L__pubNames_end0:
|
705 |
+
}
|
706 |
+
.section .debug_pubtypes
|
707 |
+
{
|
708 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
709 |
+
$L__pubTypes_start0:
|
710 |
+
.b8 2
|
711 |
+
.b8 0
|
712 |
+
.b32 .debug_info
|
713 |
+
.b32 399
|
714 |
+
.b32 0
|
715 |
+
$L__pubTypes_end0:
|
716 |
+
}
|
717 |
+
.section .debug_loc { }
|
.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ttir
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d34e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg3: i32, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
3 |
+
%c0_i32 = arith.constant 0 : i32
|
4 |
+
%cst = arith.constant dense<0> : tensor<1x8xi64>
|
5 |
+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<1x8xf32>
|
6 |
+
%cst_1 = arith.constant dense<8> : tensor<1x8xi32>
|
7 |
+
%0 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
|
8 |
+
%1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
|
9 |
+
%2 = arith.cmpi slt, %1, %cst_1 : tensor<1x8xi32>
|
10 |
+
%3 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>>
|
11 |
+
%4 = tt.addptr %3, %1 : tensor<1x8x!tt.ptr<f32, 1>>, tensor<1x8xi32>
|
12 |
+
%5 = tt.load %4, %2, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x8xf32>
|
13 |
+
%6 = tt.splat %arg2 : (!tt.ptr<i64, 1>) -> tensor<1x8x!tt.ptr<i64, 1>>
|
14 |
+
%7 = tt.addptr %6, %1 : tensor<1x8x!tt.ptr<i64, 1>>, tensor<1x8xi32>
|
15 |
+
%8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x8xi64>
|
16 |
+
%9 = arith.select %2, %5, %cst_0 : tensor<1x8xi1>, tensor<1x8xf32>
|
17 |
+
%10 = "tt.reduce"(%9) <{axis = 1 : i32}> ({
|
18 |
+
^bb0(%arg5: f32, %arg6: f32):
|
19 |
+
%19 = arith.addf %arg5, %arg6 : f32
|
20 |
+
tt.reduce.return %19 : f32
|
21 |
+
}) : (tensor<1x8xf32>) -> tensor<1xf32>
|
22 |
+
%11 = tt.expand_dims %10 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32>
|
23 |
+
%12 = arith.select %2, %8, %cst : tensor<1x8xi1>, tensor<1x8xi64>
|
24 |
+
%13 = "tt.reduce"(%12) <{axis = 1 : i32}> ({
|
25 |
+
^bb0(%arg5: i64, %arg6: i64):
|
26 |
+
%19 = arith.addi %arg5, %arg6 : i64
|
27 |
+
tt.reduce.return %19 : i64
|
28 |
+
}) : (tensor<1x8xi64>) -> tensor<1xi64>
|
29 |
+
%14 = tt.expand_dims %13 {axis = 1 : i32} : (tensor<1xi64>) -> tensor<1x1xi64>
|
30 |
+
%15 = arith.sitofp %14 : tensor<1x1xi64> to tensor<1x1xf32>
|
31 |
+
%16 = arith.divf %11, %15 : tensor<1x1xf32>
|
32 |
+
gpu.barrier
|
33 |
+
%17 = tt.addptr %arg0, %c0_i32 : !tt.ptr<f32, 1>, i32
|
34 |
+
%18 = tt.splat %17 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>>
|
35 |
+
tt.store %18, %16 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32>
|
36 |
+
tt.return
|
37 |
+
}
|
38 |
+
}
|
.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ptx
ADDED
@@ -0,0 +1,495 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4de
|
10 |
+
|
11 |
+
.visible .entry triton__0d1d2d3d4de(
|
12 |
+
.param .u64 triton__0d1d2d3d4de_param_0,
|
13 |
+
.param .u64 triton__0d1d2d3d4de_param_1,
|
14 |
+
.param .u64 triton__0d1d2d3d4de_param_2,
|
15 |
+
.param .u64 triton__0d1d2d3d4de_param_3,
|
16 |
+
.param .u32 triton__0d1d2d3d4de_param_4
|
17 |
+
)
|
18 |
+
.maxntid 128, 1, 1
|
19 |
+
{
|
20 |
+
.reg .pred %p<8>;
|
21 |
+
.reg .b16 %rs<33>;
|
22 |
+
.reg .b32 %r<77>;
|
23 |
+
.reg .f32 %f<65>;
|
24 |
+
.reg .b64 %rd<11>;
|
25 |
+
.loc 1 18 0
|
26 |
+
$L__func_begin0:
|
27 |
+
.loc 1 18 0
|
28 |
+
|
29 |
+
ld.param.u64 %rd5, [triton__0d1d2d3d4de_param_0];
|
30 |
+
ld.param.u64 %rd6, [triton__0d1d2d3d4de_param_1];
|
31 |
+
$L__tmp0:
|
32 |
+
.loc 1 21 36
|
33 |
+
mov.u32 %r50, %tid.x;
|
34 |
+
shl.b32 %r51, %r50, 3;
|
35 |
+
ld.param.u64 %rd7, [triton__0d1d2d3d4de_param_2];
|
36 |
+
and.b32 %r52, %r51, 1016;
|
37 |
+
ld.param.u64 %rd8, [triton__0d1d2d3d4de_param_3];
|
38 |
+
.loc 1 20 28
|
39 |
+
mov.u32 %r1, %ctaid.x;
|
40 |
+
.loc 1 20 33
|
41 |
+
shl.b32 %r53, %r1, 10;
|
42 |
+
.loc 1 21 23
|
43 |
+
or.b32 %r54, %r53, %r52;
|
44 |
+
.loc 1 23 20
|
45 |
+
shr.s32 %r56, %r54, 31;
|
46 |
+
shr.u32 %r57, %r56, 24;
|
47 |
+
add.s32 %r58, %r54, %r57;
|
48 |
+
shr.s32 %r59, %r58, 8;
|
49 |
+
.loc 1 23 27
|
50 |
+
mul.hi.s32 %r60, %r59, 1431655766;
|
51 |
+
shr.u32 %r61, %r60, 31;
|
52 |
+
add.s32 %r62, %r60, %r61;
|
53 |
+
mul.lo.s32 %r63, %r62, 3;
|
54 |
+
sub.s32 %r64, %r59, %r63;
|
55 |
+
and.b32 %r65, %r58, -256;
|
56 |
+
sub.s32 %r66, %r54, %r65;
|
57 |
+
.loc 1 25 20
|
58 |
+
mul.hi.s32 %r67, %r54, 715827883;
|
59 |
+
shr.u32 %r68, %r67, 31;
|
60 |
+
shr.u32 %r69, %r67, 7;
|
61 |
+
add.s32 %r70, %r69, %r68;
|
62 |
+
.loc 1 27 40
|
63 |
+
shl.b32 %r71, %r70, 8;
|
64 |
+
.loc 1 27 36
|
65 |
+
add.s32 %r72, %r71, %r66;
|
66 |
+
.loc 1 27 30
|
67 |
+
mul.wide.s32 %rd9, %r72, 2;
|
68 |
+
add.s64 %rd1, %rd5, %rd9;
|
69 |
+
mov.pred %p1, -1;
|
70 |
+
.loc 1 27 46
|
71 |
+
mov.u32 %r2, 0x0;
|
72 |
+
mov.u32 %r3, 0x0;
|
73 |
+
mov.u32 %r4, 0x0;
|
74 |
+
mov.u32 %r5, 0x0;
|
75 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
|
76 |
+
cvt.u16.u32 %rs1, %r2;
|
77 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
78 |
+
cvt.u16.u32 %rs3, %r3;
|
79 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
|
80 |
+
cvt.u16.u32 %rs5, %r4;
|
81 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
|
82 |
+
cvt.u16.u32 %rs7, %r5;
|
83 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
|
84 |
+
.loc 1 27 85
|
85 |
+
cvt.f32.bf16 %r6, %rs1;
|
86 |
+
mov.b32 %f1, %r6;
|
87 |
+
cvt.f32.bf16 %r7, %rs2;
|
88 |
+
mov.b32 %f2, %r7;
|
89 |
+
cvt.f32.bf16 %r8, %rs3;
|
90 |
+
mov.b32 %f3, %r8;
|
91 |
+
cvt.f32.bf16 %r9, %rs4;
|
92 |
+
mov.b32 %f4, %r9;
|
93 |
+
cvt.f32.bf16 %r10, %rs5;
|
94 |
+
mov.b32 %f5, %r10;
|
95 |
+
cvt.f32.bf16 %r11, %rs6;
|
96 |
+
mov.b32 %f6, %r11;
|
97 |
+
cvt.f32.bf16 %r12, %rs7;
|
98 |
+
mov.b32 %f7, %r12;
|
99 |
+
cvt.f32.bf16 %r13, %rs8;
|
100 |
+
mov.b32 %f8, %r13;
|
101 |
+
.loc 1 28 30
|
102 |
+
add.s64 %rd2, %rd6, %rd9;
|
103 |
+
.loc 1 28 46
|
104 |
+
mov.u32 %r14, 0x0;
|
105 |
+
mov.u32 %r15, 0x0;
|
106 |
+
mov.u32 %r16, 0x0;
|
107 |
+
mov.u32 %r17, 0x0;
|
108 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd2 + 0 ];
|
109 |
+
cvt.u16.u32 %rs9, %r14;
|
110 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r14; }
|
111 |
+
cvt.u16.u32 %rs11, %r15;
|
112 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r15; }
|
113 |
+
cvt.u16.u32 %rs13, %r16;
|
114 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r16; }
|
115 |
+
cvt.u16.u32 %rs15, %r17;
|
116 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r17; }
|
117 |
+
.loc 1 28 85
|
118 |
+
cvt.f32.bf16 %r18, %rs9;
|
119 |
+
mov.b32 %f9, %r18;
|
120 |
+
cvt.f32.bf16 %r19, %rs10;
|
121 |
+
mov.b32 %f10, %r19;
|
122 |
+
cvt.f32.bf16 %r20, %rs11;
|
123 |
+
mov.b32 %f11, %r20;
|
124 |
+
cvt.f32.bf16 %r21, %rs12;
|
125 |
+
mov.b32 %f12, %r21;
|
126 |
+
cvt.f32.bf16 %r22, %rs13;
|
127 |
+
mov.b32 %f13, %r22;
|
128 |
+
cvt.f32.bf16 %r23, %rs14;
|
129 |
+
mov.b32 %f14, %r23;
|
130 |
+
cvt.f32.bf16 %r24, %rs15;
|
131 |
+
mov.b32 %f15, %r24;
|
132 |
+
cvt.f32.bf16 %r25, %rs16;
|
133 |
+
mov.b32 %f16, %r25;
|
134 |
+
.loc 1 29 31
|
135 |
+
add.s64 %rd3, %rd7, %rd9;
|
136 |
+
.loc 1 29 47
|
137 |
+
mov.u32 %r26, 0x0;
|
138 |
+
mov.u32 %r27, 0x0;
|
139 |
+
mov.u32 %r28, 0x0;
|
140 |
+
mov.u32 %r29, 0x0;
|
141 |
+
@%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd3 + 0 ];
|
142 |
+
cvt.u16.u32 %rs17, %r26;
|
143 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r26; }
|
144 |
+
cvt.u16.u32 %rs19, %r27;
|
145 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r27; }
|
146 |
+
cvt.u16.u32 %rs21, %r28;
|
147 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r28; }
|
148 |
+
cvt.u16.u32 %rs23, %r29;
|
149 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r29; }
|
150 |
+
.loc 1 29 86
|
151 |
+
cvt.f32.bf16 %r30, %rs17;
|
152 |
+
mov.b32 %f17, %r30;
|
153 |
+
cvt.f32.bf16 %r31, %rs18;
|
154 |
+
mov.b32 %f18, %r31;
|
155 |
+
cvt.f32.bf16 %r32, %rs19;
|
156 |
+
mov.b32 %f19, %r32;
|
157 |
+
cvt.f32.bf16 %r33, %rs20;
|
158 |
+
mov.b32 %f20, %r33;
|
159 |
+
cvt.f32.bf16 %r34, %rs21;
|
160 |
+
mov.b32 %f21, %r34;
|
161 |
+
cvt.f32.bf16 %r35, %rs22;
|
162 |
+
mov.b32 %f22, %r35;
|
163 |
+
cvt.f32.bf16 %r36, %rs23;
|
164 |
+
mov.b32 %f23, %r36;
|
165 |
+
cvt.f32.bf16 %r37, %rs24;
|
166 |
+
mov.b32 %f24, %r37;
|
167 |
+
.loc 1 32 19
|
168 |
+
setp.eq.s32 %p5, %r64, 2;
|
169 |
+
.loc 1 34 32
|
170 |
+
selp.f32 %f25, %f1, 0f00000000, %p5;
|
171 |
+
selp.f32 %f26, %f2, 0f00000000, %p5;
|
172 |
+
selp.f32 %f27, %f3, 0f00000000, %p5;
|
173 |
+
selp.f32 %f28, %f4, 0f00000000, %p5;
|
174 |
+
selp.f32 %f29, %f5, 0f00000000, %p5;
|
175 |
+
selp.f32 %f30, %f6, 0f00000000, %p5;
|
176 |
+
selp.f32 %f31, %f7, 0f00000000, %p5;
|
177 |
+
selp.f32 %f32, %f8, 0f00000000, %p5;
|
178 |
+
.loc 1 36 19
|
179 |
+
setp.eq.s32 %p6, %r64, 1;
|
180 |
+
.loc 1 37 32
|
181 |
+
selp.f32 %f33, %f9, 0f00000000, %p6;
|
182 |
+
selp.f32 %f34, %f10, 0f00000000, %p6;
|
183 |
+
selp.f32 %f35, %f11, 0f00000000, %p6;
|
184 |
+
selp.f32 %f36, %f12, 0f00000000, %p6;
|
185 |
+
selp.f32 %f37, %f13, 0f00000000, %p6;
|
186 |
+
selp.f32 %f38, %f14, 0f00000000, %p6;
|
187 |
+
selp.f32 %f39, %f15, 0f00000000, %p6;
|
188 |
+
selp.f32 %f40, %f16, 0f00000000, %p6;
|
189 |
+
.loc 1 38 19
|
190 |
+
add.f32 %f41, %f25, %f33;
|
191 |
+
add.f32 %f42, %f26, %f34;
|
192 |
+
add.f32 %f43, %f27, %f35;
|
193 |
+
add.f32 %f44, %f28, %f36;
|
194 |
+
add.f32 %f45, %f29, %f37;
|
195 |
+
add.f32 %f46, %f30, %f38;
|
196 |
+
add.f32 %f47, %f31, %f39;
|
197 |
+
add.f32 %f48, %f32, %f40;
|
198 |
+
.loc 1 40 20
|
199 |
+
setp.eq.s32 %p7, %r64, 0;
|
200 |
+
.loc 1 41 35
|
201 |
+
selp.f32 %f49, %f17, 0f00000000, %p7;
|
202 |
+
selp.f32 %f50, %f18, 0f00000000, %p7;
|
203 |
+
selp.f32 %f51, %f19, 0f00000000, %p7;
|
204 |
+
selp.f32 %f52, %f20, 0f00000000, %p7;
|
205 |
+
selp.f32 %f53, %f21, 0f00000000, %p7;
|
206 |
+
selp.f32 %f54, %f22, 0f00000000, %p7;
|
207 |
+
selp.f32 %f55, %f23, 0f00000000, %p7;
|
208 |
+
selp.f32 %f56, %f24, 0f00000000, %p7;
|
209 |
+
.loc 1 42 20
|
210 |
+
add.f32 %f57, %f41, %f49;
|
211 |
+
add.f32 %f58, %f42, %f50;
|
212 |
+
add.f32 %f59, %f43, %f51;
|
213 |
+
add.f32 %f60, %f44, %f52;
|
214 |
+
add.f32 %f61, %f45, %f53;
|
215 |
+
add.f32 %f62, %f46, %f54;
|
216 |
+
add.f32 %f63, %f47, %f55;
|
217 |
+
add.f32 %f64, %f48, %f56;
|
218 |
+
.loc 1 43 25
|
219 |
+
mul.wide.s32 %rd10, %r54, 2;
|
220 |
+
add.s64 %rd4, %rd8, %rd10;
|
221 |
+
.loc 1 43 37
|
222 |
+
mov.b32 %r38, %f57;
|
223 |
+
cvt.rn.bf16.f32 %rs25, %r38;
|
224 |
+
mov.b32 %r39, %f58;
|
225 |
+
cvt.rn.bf16.f32 %rs26, %r39;
|
226 |
+
mov.b32 %r40, %f59;
|
227 |
+
cvt.rn.bf16.f32 %rs27, %r40;
|
228 |
+
mov.b32 %r41, %f60;
|
229 |
+
cvt.rn.bf16.f32 %rs28, %r41;
|
230 |
+
mov.b32 %r42, %f61;
|
231 |
+
cvt.rn.bf16.f32 %rs29, %r42;
|
232 |
+
mov.b32 %r43, %f62;
|
233 |
+
cvt.rn.bf16.f32 %rs30, %r43;
|
234 |
+
mov.b32 %r44, %f63;
|
235 |
+
cvt.rn.bf16.f32 %rs31, %r44;
|
236 |
+
mov.b32 %r45, %f64;
|
237 |
+
cvt.rn.bf16.f32 %rs32, %r45;
|
238 |
+
mov.b32 %r73, {%rs25, %rs26};
|
239 |
+
mov.b32 %r74, {%rs27, %rs28};
|
240 |
+
mov.b32 %r75, {%rs29, %rs30};
|
241 |
+
mov.b32 %r76, {%rs31, %rs32};
|
242 |
+
@%p1 st.global.v4.b32 [ %rd4 + 0 ], { %r73, %r74, %r75, %r76 };
|
243 |
+
.loc 1 43 4
|
244 |
+
ret;
|
245 |
+
$L__tmp1:
|
246 |
+
$L__func_end0:
|
247 |
+
|
248 |
+
}
|
249 |
+
.file 1 "/tmp/torchinductor_root/63/c63r7iurwk5ydlswh7rvhcmlx2cfretlrewgw6tljlursshgtfpp.py"
|
250 |
+
.section .debug_abbrev
|
251 |
+
{
|
252 |
+
.b8 1
|
253 |
+
.b8 17
|
254 |
+
.b8 1
|
255 |
+
.b8 37
|
256 |
+
.b8 8
|
257 |
+
.b8 19
|
258 |
+
.b8 5
|
259 |
+
.b8 3
|
260 |
+
.b8 8
|
261 |
+
.b8 16
|
262 |
+
.b8 6
|
263 |
+
.b8 27
|
264 |
+
.b8 8
|
265 |
+
.b8 180
|
266 |
+
.b8 66
|
267 |
+
.b8 12
|
268 |
+
.b8 17
|
269 |
+
.b8 1
|
270 |
+
.b8 18
|
271 |
+
.b8 1
|
272 |
+
.b8 0
|
273 |
+
.b8 0
|
274 |
+
.b8 2
|
275 |
+
.b8 46
|
276 |
+
.b8 0
|
277 |
+
.b8 17
|
278 |
+
.b8 1
|
279 |
+
.b8 18
|
280 |
+
.b8 1
|
281 |
+
.b8 64
|
282 |
+
.b8 10
|
283 |
+
.b8 135
|
284 |
+
.b8 64
|
285 |
+
.b8 8
|
286 |
+
.b8 3
|
287 |
+
.b8 8
|
288 |
+
.b8 58
|
289 |
+
.b8 11
|
290 |
+
.b8 59
|
291 |
+
.b8 11
|
292 |
+
.b8 63
|
293 |
+
.b8 12
|
294 |
+
.b8 0
|
295 |
+
.b8 0
|
296 |
+
.b8 0
|
297 |
+
}
|
298 |
+
.section .debug_info
|
299 |
+
{
|
300 |
+
.b32 184
|
301 |
+
.b8 2
|
302 |
+
.b8 0
|
303 |
+
.b32 .debug_abbrev
|
304 |
+
.b8 8
|
305 |
+
.b8 1
|
306 |
+
.b8 116
|
307 |
+
.b8 114
|
308 |
+
.b8 105
|
309 |
+
.b8 116
|
310 |
+
.b8 111
|
311 |
+
.b8 110
|
312 |
+
.b8 0
|
313 |
+
.b8 2
|
314 |
+
.b8 0
|
315 |
+
.b8 99
|
316 |
+
.b8 54
|
317 |
+
.b8 51
|
318 |
+
.b8 114
|
319 |
+
.b8 55
|
320 |
+
.b8 105
|
321 |
+
.b8 117
|
322 |
+
.b8 114
|
323 |
+
.b8 119
|
324 |
+
.b8 107
|
325 |
+
.b8 53
|
326 |
+
.b8 121
|
327 |
+
.b8 100
|
328 |
+
.b8 108
|
329 |
+
.b8 115
|
330 |
+
.b8 119
|
331 |
+
.b8 104
|
332 |
+
.b8 55
|
333 |
+
.b8 114
|
334 |
+
.b8 118
|
335 |
+
.b8 104
|
336 |
+
.b8 99
|
337 |
+
.b8 109
|
338 |
+
.b8 108
|
339 |
+
.b8 120
|
340 |
+
.b8 50
|
341 |
+
.b8 99
|
342 |
+
.b8 102
|
343 |
+
.b8 114
|
344 |
+
.b8 101
|
345 |
+
.b8 116
|
346 |
+
.b8 108
|
347 |
+
.b8 114
|
348 |
+
.b8 101
|
349 |
+
.b8 119
|
350 |
+
.b8 103
|
351 |
+
.b8 119
|
352 |
+
.b8 54
|
353 |
+
.b8 116
|
354 |
+
.b8 108
|
355 |
+
.b8 106
|
356 |
+
.b8 108
|
357 |
+
.b8 117
|
358 |
+
.b8 114
|
359 |
+
.b8 115
|
360 |
+
.b8 115
|
361 |
+
.b8 104
|
362 |
+
.b8 103
|
363 |
+
.b8 116
|
364 |
+
.b8 102
|
365 |
+
.b8 112
|
366 |
+
.b8 112
|
367 |
+
.b8 46
|
368 |
+
.b8 112
|
369 |
+
.b8 121
|
370 |
+
.b8 0
|
371 |
+
.b32 .debug_line
|
372 |
+
.b8 47
|
373 |
+
.b8 116
|
374 |
+
.b8 109
|
375 |
+
.b8 112
|
376 |
+
.b8 47
|
377 |
+
.b8 116
|
378 |
+
.b8 111
|
379 |
+
.b8 114
|
380 |
+
.b8 99
|
381 |
+
.b8 104
|
382 |
+
.b8 105
|
383 |
+
.b8 110
|
384 |
+
.b8 100
|
385 |
+
.b8 117
|
386 |
+
.b8 99
|
387 |
+
.b8 116
|
388 |
+
.b8 111
|
389 |
+
.b8 114
|
390 |
+
.b8 95
|
391 |
+
.b8 114
|
392 |
+
.b8 111
|
393 |
+
.b8 111
|
394 |
+
.b8 116
|
395 |
+
.b8 47
|
396 |
+
.b8 54
|
397 |
+
.b8 51
|
398 |
+
.b8 0
|
399 |
+
.b8 1
|
400 |
+
.b64 $L__func_begin0
|
401 |
+
.b64 $L__func_end0
|
402 |
+
.b8 2
|
403 |
+
.b64 $L__func_begin0
|
404 |
+
.b64 $L__func_end0
|
405 |
+
.b8 1
|
406 |
+
.b8 156
|
407 |
+
.b8 116
|
408 |
+
.b8 114
|
409 |
+
.b8 105
|
410 |
+
.b8 116
|
411 |
+
.b8 111
|
412 |
+
.b8 110
|
413 |
+
.b8 95
|
414 |
+
.b8 95
|
415 |
+
.b8 48
|
416 |
+
.b8 100
|
417 |
+
.b8 49
|
418 |
+
.b8 100
|
419 |
+
.b8 50
|
420 |
+
.b8 100
|
421 |
+
.b8 51
|
422 |
+
.b8 100
|
423 |
+
.b8 52
|
424 |
+
.b8 100
|
425 |
+
.b8 101
|
426 |
+
.b8 0
|
427 |
+
.b8 116
|
428 |
+
.b8 114
|
429 |
+
.b8 105
|
430 |
+
.b8 116
|
431 |
+
.b8 111
|
432 |
+
.b8 110
|
433 |
+
.b8 95
|
434 |
+
.b8 95
|
435 |
+
.b8 48
|
436 |
+
.b8 100
|
437 |
+
.b8 49
|
438 |
+
.b8 100
|
439 |
+
.b8 50
|
440 |
+
.b8 100
|
441 |
+
.b8 51
|
442 |
+
.b8 100
|
443 |
+
.b8 52
|
444 |
+
.b8 100
|
445 |
+
.b8 101
|
446 |
+
.b8 0
|
447 |
+
.b8 1
|
448 |
+
.b8 18
|
449 |
+
.b8 1
|
450 |
+
.b8 0
|
451 |
+
}
|
452 |
+
.section .debug_pubnames
|
453 |
+
{
|
454 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
455 |
+
$L__pubNames_start0:
|
456 |
+
.b8 2
|
457 |
+
.b8 0
|
458 |
+
.b32 .debug_info
|
459 |
+
.b32 188
|
460 |
+
.b32 125
|
461 |
+
.b8 116
|
462 |
+
.b8 114
|
463 |
+
.b8 105
|
464 |
+
.b8 116
|
465 |
+
.b8 111
|
466 |
+
.b8 110
|
467 |
+
.b8 95
|
468 |
+
.b8 95
|
469 |
+
.b8 48
|
470 |
+
.b8 100
|
471 |
+
.b8 49
|
472 |
+
.b8 100
|
473 |
+
.b8 50
|
474 |
+
.b8 100
|
475 |
+
.b8 51
|
476 |
+
.b8 100
|
477 |
+
.b8 52
|
478 |
+
.b8 100
|
479 |
+
.b8 101
|
480 |
+
.b8 0
|
481 |
+
.b32 0
|
482 |
+
$L__pubNames_end0:
|
483 |
+
}
|
484 |
+
.section .debug_pubtypes
|
485 |
+
{
|
486 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
487 |
+
$L__pubTypes_start0:
|
488 |
+
.b8 2
|
489 |
+
.b8 0
|
490 |
+
.b32 .debug_info
|
491 |
+
.b32 188
|
492 |
+
.b32 0
|
493 |
+
$L__pubTypes_end0:
|
494 |
+
}
|
495 |
+
.section .debug_loc { }
|
.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.llir
ADDED
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !5 {
|
7 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%10 = and i32 %9, 31, !dbg !8
|
9 |
+
%11 = lshr i32 %9, 5, !dbg !8
|
10 |
+
%12 = and i32 %11, 1, !dbg !8
|
11 |
+
%urem = shl i32 %9, 2, !dbg !8
|
12 |
+
%13 = and i32 %urem, 252, !dbg !8
|
13 |
+
%14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
|
14 |
+
%15 = shl i32 %14, 8, !dbg !10
|
15 |
+
%16 = or i32 %15, %13, !dbg !11
|
16 |
+
%17 = sext i32 %16 to i64, !dbg !12
|
17 |
+
%18 = getelementptr i16, ptr addrspace(1) %1, i64 %17, !dbg !12
|
18 |
+
%19 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %18, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13
|
19 |
+
%20 = extractvalue { i32, i32 } %19, 0, !dbg !13
|
20 |
+
%21 = extractvalue { i32, i32 } %19, 1, !dbg !13
|
21 |
+
%22 = trunc i32 %20 to i16, !dbg !13
|
22 |
+
%extelt.offset = lshr i32 %20, 16, !dbg !13
|
23 |
+
%23 = trunc i32 %extelt.offset to i16, !dbg !13
|
24 |
+
%24 = trunc i32 %21 to i16, !dbg !13
|
25 |
+
%extelt.offset1 = lshr i32 %21, 16, !dbg !13
|
26 |
+
%25 = trunc i32 %extelt.offset1 to i16, !dbg !13
|
27 |
+
%26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #3, !dbg !14
|
28 |
+
%27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #3, !dbg !14
|
29 |
+
%28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #3, !dbg !14
|
30 |
+
%29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #3, !dbg !14
|
31 |
+
%30 = zext nneg i32 %13 to i64, !dbg !15
|
32 |
+
%31 = getelementptr float, ptr addrspace(1) %2, i64 %30, !dbg !15
|
33 |
+
%32 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %31, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16
|
34 |
+
%33 = extractvalue { i32, i32, i32, i32 } %32, 0, !dbg !16
|
35 |
+
%34 = extractvalue { i32, i32, i32, i32 } %32, 1, !dbg !16
|
36 |
+
%35 = extractvalue { i32, i32, i32, i32 } %32, 2, !dbg !16
|
37 |
+
%36 = extractvalue { i32, i32, i32, i32 } %32, 3, !dbg !16
|
38 |
+
%37 = bitcast i32 %33 to float, !dbg !16
|
39 |
+
%38 = bitcast i32 %34 to float, !dbg !16
|
40 |
+
%39 = bitcast i32 %35 to float, !dbg !16
|
41 |
+
%40 = bitcast i32 %36 to float, !dbg !16
|
42 |
+
%41 = getelementptr float, ptr addrspace(1) %3, i64 %17, !dbg !17
|
43 |
+
%42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18
|
44 |
+
%43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !18
|
45 |
+
%44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !18
|
46 |
+
%45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !18
|
47 |
+
%46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !18
|
48 |
+
%47 = bitcast i32 %43 to float, !dbg !18
|
49 |
+
%48 = bitcast i32 %44 to float, !dbg !18
|
50 |
+
%49 = bitcast i32 %45 to float, !dbg !18
|
51 |
+
%50 = bitcast i32 %46 to float, !dbg !18
|
52 |
+
%51 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !19
|
53 |
+
%52 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %51, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20
|
54 |
+
%53 = extractvalue { i32, i32, i32, i32 } %52, 0, !dbg !20
|
55 |
+
%54 = extractvalue { i32, i32, i32, i32 } %52, 1, !dbg !20
|
56 |
+
%55 = extractvalue { i32, i32, i32, i32 } %52, 2, !dbg !20
|
57 |
+
%56 = extractvalue { i32, i32, i32, i32 } %52, 3, !dbg !20
|
58 |
+
%57 = bitcast i32 %53 to float, !dbg !20
|
59 |
+
%58 = bitcast i32 %54 to float, !dbg !20
|
60 |
+
%59 = bitcast i32 %55 to float, !dbg !20
|
61 |
+
%60 = bitcast i32 %56 to float, !dbg !20
|
62 |
+
%61 = sext i32 %14 to i64, !dbg !21
|
63 |
+
%62 = getelementptr float, ptr addrspace(1) %4, i64 %61, !dbg !21
|
64 |
+
%63 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
|
65 |
+
%64 = bitcast i32 %63 to float, !dbg !22
|
66 |
+
%65 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
|
67 |
+
%66 = bitcast i32 %65 to float, !dbg !22
|
68 |
+
%67 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
|
69 |
+
%68 = bitcast i32 %67 to float, !dbg !22
|
70 |
+
%69 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
|
71 |
+
%70 = bitcast i32 %69 to float, !dbg !22
|
72 |
+
%71 = fmul float %26, %37, !dbg !23
|
73 |
+
%72 = fmul float %27, %38, !dbg !23
|
74 |
+
%73 = fmul float %28, %39, !dbg !23
|
75 |
+
%74 = fmul float %29, %40, !dbg !23
|
76 |
+
%75 = fadd float %71, %72, !dbg !24
|
77 |
+
%76 = fadd float %73, %75, !dbg !24
|
78 |
+
%77 = fadd float %74, %76, !dbg !24
|
79 |
+
%78 = bitcast float %77 to i32, !dbg !30
|
80 |
+
%79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 16, i32 31), !dbg !30
|
81 |
+
%80 = bitcast i32 %79 to float, !dbg !30
|
82 |
+
%81 = fadd float %77, %80, !dbg !24
|
83 |
+
%82 = bitcast float %81 to i32, !dbg !30
|
84 |
+
%83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 8, i32 31), !dbg !30
|
85 |
+
%84 = bitcast i32 %83 to float, !dbg !30
|
86 |
+
%85 = fadd float %81, %84, !dbg !24
|
87 |
+
%86 = bitcast float %85 to i32, !dbg !30
|
88 |
+
%87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 4, i32 31), !dbg !30
|
89 |
+
%88 = bitcast i32 %87 to float, !dbg !30
|
90 |
+
%89 = fadd float %85, %88, !dbg !24
|
91 |
+
%90 = bitcast float %89 to i32, !dbg !30
|
92 |
+
%91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 2, i32 31), !dbg !30
|
93 |
+
%92 = bitcast i32 %91 to float, !dbg !30
|
94 |
+
%93 = fadd float %89, %92, !dbg !24
|
95 |
+
%94 = bitcast float %93 to i32, !dbg !30
|
96 |
+
%95 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 1, i32 31), !dbg !30
|
97 |
+
%96 = bitcast i32 %95 to float, !dbg !30
|
98 |
+
%97 = fadd float %93, %96, !dbg !24
|
99 |
+
%98 = icmp eq i32 %10, 0, !dbg !30
|
100 |
+
%99 = zext nneg i32 %12 to i64, !dbg !30
|
101 |
+
%100 = getelementptr float, ptr addrspace(3) @global_smem, i64 %99, !dbg !30
|
102 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %97, i1 %98) #3, !dbg !30
|
103 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !30
|
104 |
+
%101 = icmp slt i32 %9, 2, !dbg !30
|
105 |
+
%102 = sext i32 %9 to i64, !dbg !30
|
106 |
+
%103 = getelementptr float, ptr addrspace(3) @global_smem, i64 %102, !dbg !30
|
107 |
+
%104 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %101) #3, !dbg !30
|
108 |
+
%105 = bitcast float %104 to i32, !dbg !30
|
109 |
+
%106 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %105, i32 1, i32 31), !dbg !30
|
110 |
+
%107 = bitcast i32 %106 to float, !dbg !30
|
111 |
+
%108 = fadd float %104, %107, !dbg !24
|
112 |
+
%109 = and i32 %9, 1, !dbg !30
|
113 |
+
%110 = icmp eq i32 %109, 0, !dbg !30
|
114 |
+
%111 = and i1 %101, %110, !dbg !30
|
115 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, float %108, i1 %111) #3, !dbg !30
|
116 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !30
|
117 |
+
%112 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !30
|
118 |
+
%113 = fadd float %112, 0.000000e+00, !dbg !32
|
119 |
+
%114 = fmul float %71, %47, !dbg !36
|
120 |
+
%115 = fmul float %72, %48, !dbg !36
|
121 |
+
%116 = fmul float %73, %49, !dbg !36
|
122 |
+
%117 = fmul float %74, %50, !dbg !36
|
123 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !37
|
124 |
+
%118 = fadd float %114, %115, !dbg !39
|
125 |
+
%119 = fadd float %116, %118, !dbg !39
|
126 |
+
%120 = fadd float %117, %119, !dbg !39
|
127 |
+
%121 = bitcast float %120 to i32, !dbg !37
|
128 |
+
%122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 16, i32 31), !dbg !37
|
129 |
+
%123 = bitcast i32 %122 to float, !dbg !37
|
130 |
+
%124 = fadd float %120, %123, !dbg !39
|
131 |
+
%125 = bitcast float %124 to i32, !dbg !37
|
132 |
+
%126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 8, i32 31), !dbg !37
|
133 |
+
%127 = bitcast i32 %126 to float, !dbg !37
|
134 |
+
%128 = fadd float %124, %127, !dbg !39
|
135 |
+
%129 = bitcast float %128 to i32, !dbg !37
|
136 |
+
%130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 4, i32 31), !dbg !37
|
137 |
+
%131 = bitcast i32 %130 to float, !dbg !37
|
138 |
+
%132 = fadd float %128, %131, !dbg !39
|
139 |
+
%133 = bitcast float %132 to i32, !dbg !37
|
140 |
+
%134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 2, i32 31), !dbg !37
|
141 |
+
%135 = bitcast i32 %134 to float, !dbg !37
|
142 |
+
%136 = fadd float %132, %135, !dbg !39
|
143 |
+
%137 = bitcast float %136 to i32, !dbg !37
|
144 |
+
%138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 1, i32 31), !dbg !37
|
145 |
+
%139 = bitcast i32 %138 to float, !dbg !37
|
146 |
+
%140 = fadd float %136, %139, !dbg !39
|
147 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %140, i1 %98) #3, !dbg !37
|
148 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !37
|
149 |
+
%141 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %101) #3, !dbg !37
|
150 |
+
%142 = bitcast float %141 to i32, !dbg !37
|
151 |
+
%143 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 1, i32 31), !dbg !37
|
152 |
+
%144 = bitcast i32 %143 to float, !dbg !37
|
153 |
+
%145 = fadd float %141, %144, !dbg !39
|
154 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, float %145, i1 %111) #3, !dbg !37
|
155 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !37
|
156 |
+
%146 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !37
|
157 |
+
%147 = fadd float %146, 0.000000e+00, !dbg !42
|
158 |
+
%148 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %64, float 2.560000e+02) #3, !dbg !44
|
159 |
+
%149 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %66, float 2.560000e+02) #3, !dbg !44
|
160 |
+
%150 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %68, float 2.560000e+02) #3, !dbg !44
|
161 |
+
%151 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %70, float 2.560000e+02) #3, !dbg !44
|
162 |
+
%152 = fmul float %71, 2.560000e+02, !dbg !45
|
163 |
+
%153 = fmul float %72, 2.560000e+02, !dbg !45
|
164 |
+
%154 = fmul float %73, 2.560000e+02, !dbg !45
|
165 |
+
%155 = fmul float %74, 2.560000e+02, !dbg !45
|
166 |
+
%156 = fsub float %152, %113, !dbg !46
|
167 |
+
%157 = fsub float %153, %113, !dbg !46
|
168 |
+
%158 = fsub float %154, %113, !dbg !46
|
169 |
+
%159 = fsub float %155, %113, !dbg !46
|
170 |
+
%160 = fmul float %147, %47, !dbg !47
|
171 |
+
%161 = fmul float %147, %48, !dbg !47
|
172 |
+
%162 = fmul float %147, %49, !dbg !47
|
173 |
+
%163 = fmul float %147, %50, !dbg !47
|
174 |
+
%164 = fsub float %156, %160, !dbg !48
|
175 |
+
%165 = fsub float %157, %161, !dbg !48
|
176 |
+
%166 = fsub float %158, %162, !dbg !48
|
177 |
+
%167 = fsub float %159, %163, !dbg !48
|
178 |
+
%168 = fmul float %148, %164, !dbg !49
|
179 |
+
%169 = fmul float %148, %165, !dbg !49
|
180 |
+
%170 = fmul float %148, %166, !dbg !49
|
181 |
+
%171 = fmul float %148, %167, !dbg !49
|
182 |
+
%172 = fadd float %168, %57, !dbg !50
|
183 |
+
%173 = fadd float %169, %58, !dbg !50
|
184 |
+
%174 = fadd float %170, %59, !dbg !50
|
185 |
+
%175 = fadd float %171, %60, !dbg !50
|
186 |
+
%176 = bitcast float %172 to i32, !dbg !51
|
187 |
+
%177 = bitcast float %173 to i32, !dbg !51
|
188 |
+
%178 = bitcast float %174 to i32, !dbg !51
|
189 |
+
%179 = bitcast float %175 to i32, !dbg !51
|
190 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %176, i32 %177, i32 %178, i32 %179, ptr addrspace(1) %51, i1 true) #3, !dbg !51
|
191 |
+
%180 = getelementptr i16, ptr addrspace(1) %5, i64 %17, !dbg !52
|
192 |
+
%181 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %172) #3, !dbg !53
|
193 |
+
%182 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %173) #3, !dbg !53
|
194 |
+
%183 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %174) #3, !dbg !53
|
195 |
+
%184 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %175) #3, !dbg !53
|
196 |
+
%185 = insertelement <2 x i16> undef, i16 %181, i64 0, !dbg !53
|
197 |
+
%186 = insertelement <2 x i16> %185, i16 %182, i64 1, !dbg !53
|
198 |
+
%187 = bitcast <2 x i16> %186 to i32, !dbg !53
|
199 |
+
%188 = insertelement <2 x i16> undef, i16 %183, i64 0, !dbg !53
|
200 |
+
%189 = insertelement <2 x i16> %188, i16 %184, i64 1, !dbg !53
|
201 |
+
%190 = bitcast <2 x i16> %189 to i32, !dbg !53
|
202 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %187, i32 %190, ptr addrspace(1) %180, i1 true) #3, !dbg !53
|
203 |
+
ret void, !dbg !54
|
204 |
+
}
|
205 |
+
|
206 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
207 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
208 |
+
|
209 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
210 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
211 |
+
|
212 |
+
; Function Attrs: convergent nocallback nounwind
|
213 |
+
declare void @llvm.nvvm.barrier0() #2
|
214 |
+
|
215 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
216 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
217 |
+
attributes #2 = { convergent nocallback nounwind }
|
218 |
+
attributes #3 = { nounwind }
|
219 |
+
|
220 |
+
!llvm.module.flags = !{!0}
|
221 |
+
!llvm.dbg.cu = !{!1}
|
222 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
223 |
+
|
224 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
225 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
226 |
+
!2 = !DIFile(filename: "crnynbmsd2yell2lpjymb46rttfaea2xjwsbxr75j54gctfgi457.py", directory: "/tmp/torchinductor_root/rn")
|
227 |
+
!3 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
|
228 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64}
|
229 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
230 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
231 |
+
!7 = !{}
|
232 |
+
!8 = !DILocation(line: 26, column: 26, scope: !5)
|
233 |
+
!9 = !DILocation(line: 23, column: 28, scope: !5)
|
234 |
+
!10 = !DILocation(line: 30, column: 40, scope: !5)
|
235 |
+
!11 = !DILocation(line: 30, column: 36, scope: !5)
|
236 |
+
!12 = !DILocation(line: 30, column: 30, scope: !5)
|
237 |
+
!13 = !DILocation(line: 30, column: 46, scope: !5)
|
238 |
+
!14 = !DILocation(line: 30, column: 67, scope: !5)
|
239 |
+
!15 = !DILocation(line: 31, column: 30, scope: !5)
|
240 |
+
!16 = !DILocation(line: 31, column: 35, scope: !5)
|
241 |
+
!17 = !DILocation(line: 32, column: 30, scope: !5)
|
242 |
+
!18 = !DILocation(line: 32, column: 46, scope: !5)
|
243 |
+
!19 = !DILocation(line: 33, column: 35, scope: !5)
|
244 |
+
!20 = !DILocation(line: 33, column: 51, scope: !5)
|
245 |
+
!21 = !DILocation(line: 34, column: 31, scope: !5)
|
246 |
+
!22 = !DILocation(line: 34, column: 36, scope: !5)
|
247 |
+
!23 = !DILocation(line: 36, column: 18, scope: !5)
|
248 |
+
!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !28)
|
249 |
+
!25 = distinct !DILexicalBlockFile(scope: !27, file: !26, discriminator: 0)
|
250 |
+
!26 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
251 |
+
!27 = distinct !DILexicalBlockFile(scope: !5, file: !26, discriminator: 0)
|
252 |
+
!28 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !29)
|
253 |
+
!29 = !DILocation(line: 39, column: 57, scope: !25)
|
254 |
+
!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
|
255 |
+
!31 = !DILocation(line: 39, column: 57, scope: !27)
|
256 |
+
!32 = !DILocation(line: 8, column: 15, scope: !33, inlinedAt: !35)
|
257 |
+
!33 = distinct !DILexicalBlockFile(scope: !5, file: !34, discriminator: 0)
|
258 |
+
!34 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
259 |
+
!35 = !DILocation(line: 39, column: 44, scope: !33)
|
260 |
+
!36 = !DILocation(line: 40, column: 18, scope: !5)
|
261 |
+
!37 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !38)
|
262 |
+
!38 = !DILocation(line: 43, column: 59, scope: !27)
|
263 |
+
!39 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !40)
|
264 |
+
!40 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !41)
|
265 |
+
!41 = !DILocation(line: 43, column: 59, scope: !25)
|
266 |
+
!42 = !DILocation(line: 8, column: 15, scope: !33, inlinedAt: !43)
|
267 |
+
!43 = !DILocation(line: 43, column: 45, scope: !33)
|
268 |
+
!44 = !DILocation(line: 45, column: 20, scope: !5)
|
269 |
+
!45 = !DILocation(line: 46, column: 19, scope: !5)
|
270 |
+
!46 = !DILocation(line: 47, column: 20, scope: !5)
|
271 |
+
!47 = !DILocation(line: 48, column: 19, scope: !5)
|
272 |
+
!48 = !DILocation(line: 49, column: 20, scope: !5)
|
273 |
+
!49 = !DILocation(line: 50, column: 20, scope: !5)
|
274 |
+
!50 = !DILocation(line: 51, column: 20, scope: !5)
|
275 |
+
!51 = !DILocation(line: 53, column: 51, scope: !5)
|
276 |
+
!52 = !DILocation(line: 54, column: 25, scope: !5)
|
277 |
+
!53 = !DILocation(line: 54, column: 48, scope: !5)
|
278 |
+
!54 = !DILocation(line: 54, column: 4, scope: !5)
|
.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ptx
ADDED
@@ -0,0 +1,1154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6de7de
|
10 |
+
.extern .func __assertfail
|
11 |
+
(
|
12 |
+
.param .b64 __assertfail_param_0,
|
13 |
+
.param .b64 __assertfail_param_1,
|
14 |
+
.param .b32 __assertfail_param_2,
|
15 |
+
.param .b64 __assertfail_param_3,
|
16 |
+
.param .b64 __assertfail_param_4
|
17 |
+
)
|
18 |
+
;
|
19 |
+
.global .align 1 .b8 assertFunc_1[8] = {60, 109, 111, 100, 117, 108, 101, 62};
|
20 |
+
.global .align 1 .b8 assertFile_1[68] = {47, 117, 115, 114, 47, 108, 111, 99, 97, 108, 47, 108, 105, 98, 47, 112, 121, 116, 104, 111, 110, 51, 46, 49, 48, 47, 100, 105, 115, 116, 45, 112, 97, 99, 107, 97, 103, 101, 115, 47, 116, 111, 114, 99, 104, 47, 95, 105, 110, 100, 117, 99, 116, 111, 114, 47, 99, 111, 100, 101, 99, 97, 99, 104, 101, 46, 112, 121};
|
21 |
+
.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
|
22 |
+
.global .align 1 .b8 assertFunc_0[8] = {60, 109, 111, 100, 117, 108, 101, 62};
|
23 |
+
.global .align 1 .b8 assertFile_0[68] = {47, 117, 115, 114, 47, 108, 111, 99, 97, 108, 47, 108, 105, 98, 47, 112, 121, 116, 104, 111, 110, 51, 46, 49, 48, 47, 100, 105, 115, 116, 45, 112, 97, 99, 107, 97, 103, 101, 115, 47, 116, 111, 114, 99, 104, 47, 95, 105, 110, 100, 117, 99, 116, 111, 114, 47, 99, 111, 100, 101, 99, 97, 99, 104, 101, 46, 112, 121};
|
24 |
+
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
|
25 |
+
.extern .shared .align 1 .b8 global_smem[];
|
26 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
27 |
+
|
28 |
+
.visible .entry triton__0d1d2d3d4d5d6de7de(
|
29 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
|
30 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
|
31 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
|
32 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
|
33 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
|
34 |
+
.param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
|
35 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
|
36 |
+
.param .u32 triton__0d1d2d3d4d5d6de7de_param_7
|
37 |
+
)
|
38 |
+
.maxntid 128, 1, 1
|
39 |
+
{
|
40 |
+
.reg .pred %p<65>;
|
41 |
+
.reg .b16 %rs<13>;
|
42 |
+
.reg .b32 %r<188>;
|
43 |
+
.reg .f32 %f<166>;
|
44 |
+
.reg .b64 %rd<99>;
|
45 |
+
.loc 1 18 0
|
46 |
+
$L__func_begin0:
|
47 |
+
.loc 1 18 0
|
48 |
+
|
49 |
+
ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6de7de_param_3];
|
50 |
+
ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6de7de_param_2];
|
51 |
+
ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6de7de_param_0];
|
52 |
+
$L__tmp0:
|
53 |
+
.loc 1 22 44
|
54 |
+
mov.u32 %r1, %tid.x;
|
55 |
+
ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6de7de_param_1];
|
56 |
+
bfe.u32 %r3, %r1, 6, 1;
|
57 |
+
and.b32 %r4, %r1, 1;
|
58 |
+
.loc 1 24 33
|
59 |
+
shl.b32 %r23, %r1, 1;
|
60 |
+
and.b32 %r5, %r23, 126;
|
61 |
+
.loc 1 21 28
|
62 |
+
mov.u32 %r14, %ctaid.x;
|
63 |
+
.loc 1 21 33
|
64 |
+
shl.b32 %r24, %r14, 1;
|
65 |
+
.loc 1 22 23
|
66 |
+
or.b32 %r25, %r24, %r3;
|
67 |
+
or.b32 %r26, %r24, %r4;
|
68 |
+
.loc 1 26 30
|
69 |
+
mul.wide.s32 %rd26, %r25, 8;
|
70 |
+
add.s64 %rd17, %rd24, %rd26;
|
71 |
+
mul.wide.s32 %rd27, %r26, 8;
|
72 |
+
add.s64 %rd21, %rd24, %rd27;
|
73 |
+
mov.pred %p61, -1;
|
74 |
+
.loc 1 26 35
|
75 |
+
mov.u64 %rd16, 0x0;
|
76 |
+
@%p61 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd17 + 0 ];
|
77 |
+
mov.u64 %rd18, 0x0;
|
78 |
+
@%p61 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd17 + 0 ];
|
79 |
+
mov.u64 %rd20, 0x0;
|
80 |
+
@%p61 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ];
|
81 |
+
.loc 1 27 18
|
82 |
+
bfe.s32 %r27, %r14, 30, 1;
|
83 |
+
shr.u32 %r28, %r27, 23;
|
84 |
+
add.s32 %r29, %r25, %r28;
|
85 |
+
and.b32 %r30, %r29, 16776704;
|
86 |
+
sub.s32 %r31, %r25, %r30;
|
87 |
+
.loc 1 35 44
|
88 |
+
shl.b32 %r6, %r31, 8;
|
89 |
+
.loc 1 36 44
|
90 |
+
shl.b32 %r7, %r25, 8;
|
91 |
+
.loc 1 37 22
|
92 |
+
add.s64 %rd28, %rd20, 50257;
|
93 |
+
.loc 1 38 22
|
94 |
+
setp.lt.s64 %p9, %rd16, 0;
|
95 |
+
setp.lt.s64 %p10, %rd20, 0;
|
96 |
+
.loc 1 39 36
|
97 |
+
selp.b64 %rd1, %rd28, %rd20, %p10;
|
98 |
+
.loc 1 40 40
|
99 |
+
setp.lt.u64 %p11, %rd1, 50257;
|
100 |
+
.loc 1 41 44
|
101 |
+
shl.b64 %rd29, %rd16, 8;
|
102 |
+
add.s64 %rd30, %rd29, 12865792;
|
103 |
+
selp.b64 %rd31, %rd30, %rd29, %p9;
|
104 |
+
shl.b64 %rd32, %rd31, 2;
|
105 |
+
add.s64 %rd2, %rd25, %rd32;
|
106 |
+
.loc 1 35 40
|
107 |
+
or.b32 %r32, %r5, %r6;
|
108 |
+
.loc 1 35 34
|
109 |
+
mul.wide.s32 %rd33, %r32, 4;
|
110 |
+
add.s64 %rd62, %rd12, %rd33;
|
111 |
+
mov.b32 %r179, 0;
|
112 |
+
.loc 1 35 50
|
113 |
+
mov.u32 %r15, 0x0;
|
114 |
+
mov.u32 %r16, 0x0;
|
115 |
+
@%p61 ld.global.L1::evict_last.v2.b32 { %r15, %r16 }, [ %rd62 + 0 ];
|
116 |
+
@!%p61 mov.u32 %r15, %r179;
|
117 |
+
@!%p61 mov.u32 %r16, %r179;
|
118 |
+
mov.b32 %f2, %r16;
|
119 |
+
mov.b32 %f1, %r15;
|
120 |
+
.loc 1 36 40
|
121 |
+
or.b32 %r33, %r5, %r7;
|
122 |
+
.loc 1 36 34
|
123 |
+
mul.wide.s32 %rd34, %r33, 2;
|
124 |
+
add.s64 %rd63, %rd13, %rd34;
|
125 |
+
.loc 1 36 50
|
126 |
+
mov.u32 %r19, 0x0;
|
127 |
+
@%p61 ld.global.L1::evict_last.b32 { %r19 }, [ %rd63 + 0 ];
|
128 |
+
@!%p61 mov.u32 %r19, %r179;
|
129 |
+
cvt.u16.u32 %rs1, %r19;
|
130 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r19; }
|
131 |
+
.loc 1 36 101
|
132 |
+
cvt.f32.bf16 %r21, %rs1;
|
133 |
+
mov.b32 %f3, %r21;
|
134 |
+
cvt.f32.bf16 %r22, %rs2;
|
135 |
+
mov.b32 %f4, %r22;
|
136 |
+
mov.u64 %rd95, assertMessage_0;
|
137 |
+
mov.u64 %rd96, assertFile_0;
|
138 |
+
mov.u64 %rd97, assertFunc_0;
|
139 |
+
mov.b32 %r187, 1892;
|
140 |
+
mov.u64 %rd98, 1;
|
141 |
+
.loc 1 40 55
|
142 |
+
@%p11 bra $L__BB0_2;
|
143 |
+
cvta.global.u64 %rd36, %rd95;
|
144 |
+
cvta.global.u64 %rd38, %rd96;
|
145 |
+
cvta.global.u64 %rd40, %rd97;
|
146 |
+
{ // callseq 2, 0
|
147 |
+
.reg .b32 temp_param_reg;
|
148 |
+
.param .b64 param0;
|
149 |
+
st.param.b64 [param0+0], %rd36;
|
150 |
+
.param .b64 param1;
|
151 |
+
st.param.b64 [param1+0], %rd38;
|
152 |
+
.param .b32 param2;
|
153 |
+
st.param.b32 [param2+0], %r187;
|
154 |
+
.param .b64 param3;
|
155 |
+
st.param.b64 [param3+0], %rd40;
|
156 |
+
.param .b64 param4;
|
157 |
+
st.param.b64 [param4+0], %rd98;
|
158 |
+
call.uni
|
159 |
+
__assertfail,
|
160 |
+
(
|
161 |
+
param0,
|
162 |
+
param1,
|
163 |
+
param2,
|
164 |
+
param3,
|
165 |
+
param4
|
166 |
+
);
|
167 |
+
} // callseq 2
|
168 |
+
$L__BB0_2:
|
169 |
+
.loc 1 0 55
|
170 |
+
ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6de7de_param_4];
|
171 |
+
and.b32 %r2, %r1, 31;
|
172 |
+
.loc 1 41 40
|
173 |
+
cvt.u64.u32 %rd45, %r5;
|
174 |
+
.loc 1 41 34
|
175 |
+
mul.wide.u32 %rd46, %r5, 4;
|
176 |
+
add.s64 %rd73, %rd2, %rd46;
|
177 |
+
.loc 1 41 52
|
178 |
+
mov.u32 %r35, 0x0;
|
179 |
+
mov.u32 %r36, 0x0;
|
180 |
+
@%p61 ld.global.L1::evict_last.v2.b32 { %r35, %r36 }, [ %rd73 + 0 ];
|
181 |
+
@!%p61 mov.u32 %r35, %r179;
|
182 |
+
@!%p61 mov.u32 %r36, %r179;
|
183 |
+
mov.b32 %f21, %r36;
|
184 |
+
mov.b32 %f22, %r35;
|
185 |
+
.loc 1 42 22
|
186 |
+
add.f32 %f23, %f1, %f22;
|
187 |
+
add.f32 %f24, %f2, %f21;
|
188 |
+
.loc 1 44 22
|
189 |
+
add.f32 %f25, %f4, %f24;
|
190 |
+
mov.b32 %r43, %f25;
|
191 |
+
add.f32 %f26, %f3, %f23;
|
192 |
+
mov.b32 %r40, %f26;
|
193 |
+
mov.b32 %r41, 1065353216;
|
194 |
+
$L__tmp1:
|
195 |
+
.loc 2 98 30
|
196 |
+
div.full.f32 %r39, %r40, %r41;
|
197 |
+
mov.b32 %f27, %r39;
|
198 |
+
div.full.f32 %r42, %r43, %r41;
|
199 |
+
mov.b32 %f28, %r42;
|
200 |
+
.loc 2 98 22
|
201 |
+
add.f32 %f6, %f28, 0f00000000;
|
202 |
+
add.f32 %f5, %f27, 0f00000000;
|
203 |
+
.loc 2 101 30
|
204 |
+
sub.f32 %f29, %f26, %f5;
|
205 |
+
sub.f32 %f30, %f25, %f6;
|
206 |
+
$L__tmp2:
|
207 |
+
.loc 1 50 50
|
208 |
+
fma.rn.f32 %f8, %f25, %f30, 0f00000000;
|
209 |
+
fma.rn.f32 %f7, %f26, %f29, 0f00000000;
|
210 |
+
.loc 1 35 34
|
211 |
+
cvt.s64.s32 %rd47, %r6;
|
212 |
+
add.s64 %rd48, %rd45, %rd47;
|
213 |
+
shl.b64 %rd49, %rd48, 2;
|
214 |
+
add.s64 %rd50, %rd12, %rd49;
|
215 |
+
add.s64 %rd75, %rd50, 512;
|
216 |
+
.loc 1 35 50
|
217 |
+
mov.u32 %r45, 0x0;
|
218 |
+
mov.u32 %r46, 0x0;
|
219 |
+
@%p61 ld.global.L1::evict_last.v2.b32 { %r45, %r46 }, [ %rd75 + 0 ];
|
220 |
+
@!%p61 mov.u32 %r45, %r179;
|
221 |
+
@!%p61 mov.u32 %r46, %r179;
|
222 |
+
mov.b32 %f10, %r46;
|
223 |
+
mov.b32 %f9, %r45;
|
224 |
+
.loc 1 36 34
|
225 |
+
cvt.s64.s32 %rd51, %r7;
|
226 |
+
add.s64 %rd8, %rd45, %rd51;
|
227 |
+
shl.b64 %rd52, %rd8, 1;
|
228 |
+
add.s64 %rd53, %rd13, %rd52;
|
229 |
+
add.s64 %rd76, %rd53, 256;
|
230 |
+
.loc 1 36 50
|
231 |
+
mov.u32 %r49, 0x0;
|
232 |
+
@%p61 ld.global.L1::evict_last.b32 { %r49 }, [ %rd76 + 0 ];
|
233 |
+
@!%p61 mov.u32 %r49, %r179;
|
234 |
+
cvt.u16.u32 %rs3, %r49;
|
235 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r49; }
|
236 |
+
.loc 1 36 101
|
237 |
+
cvt.f32.bf16 %r51, %rs3;
|
238 |
+
mov.b32 %f11, %r51;
|
239 |
+
cvt.f32.bf16 %r52, %rs4;
|
240 |
+
mov.b32 %f12, %r52;
|
241 |
+
.loc 1 40 55
|
242 |
+
@%p11 bra $L__BB0_4;
|
243 |
+
cvta.global.u64 %rd55, %rd95;
|
244 |
+
cvta.global.u64 %rd57, %rd96;
|
245 |
+
cvta.global.u64 %rd59, %rd97;
|
246 |
+
{ // callseq 3, 0
|
247 |
+
.reg .b32 temp_param_reg;
|
248 |
+
.param .b64 param0;
|
249 |
+
st.param.b64 [param0+0], %rd55;
|
250 |
+
.param .b64 param1;
|
251 |
+
st.param.b64 [param1+0], %rd57;
|
252 |
+
.param .b32 param2;
|
253 |
+
st.param.b32 [param2+0], %r187;
|
254 |
+
.param .b64 param3;
|
255 |
+
st.param.b64 [param3+0], %rd59;
|
256 |
+
.param .b64 param4;
|
257 |
+
st.param.b64 [param4+0], %rd98;
|
258 |
+
call.uni
|
259 |
+
__assertfail,
|
260 |
+
(
|
261 |
+
param0,
|
262 |
+
param1,
|
263 |
+
param2,
|
264 |
+
param3,
|
265 |
+
param4
|
266 |
+
);
|
267 |
+
} // callseq 3
|
268 |
+
$L__BB0_4:
|
269 |
+
.loc 1 0 55
|
270 |
+
ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6de7de_param_5];
|
271 |
+
cvt.s64.s32 %rd4, %r33;
|
272 |
+
.loc 1 41 34
|
273 |
+
add.s64 %rd86, %rd73, 512;
|
274 |
+
.loc 1 41 52
|
275 |
+
mov.u32 %r54, 0x0;
|
276 |
+
mov.u32 %r55, 0x0;
|
277 |
+
@%p61 ld.global.L1::evict_last.v2.b32 { %r54, %r55 }, [ %rd86 + 0 ];
|
278 |
+
@!%p61 mov.u32 %r54, %r179;
|
279 |
+
@!%p61 mov.u32 %r55, %r179;
|
280 |
+
mov.b32 %f31, %r54;
|
281 |
+
mov.b32 %f32, %r55;
|
282 |
+
.loc 1 42 22
|
283 |
+
add.f32 %f33, %f10, %f32;
|
284 |
+
add.f32 %f34, %f9, %f31;
|
285 |
+
.loc 1 44 22
|
286 |
+
add.f32 %f35, %f11, %f34;
|
287 |
+
add.f32 %f36, %f12, %f33;
|
288 |
+
$L__tmp3:
|
289 |
+
.loc 2 96 20
|
290 |
+
sub.f32 %f37, %f36, %f6;
|
291 |
+
mov.b32 %r62, %f37;
|
292 |
+
sub.f32 %f38, %f35, %f5;
|
293 |
+
mov.b32 %r59, %f38;
|
294 |
+
mov.b32 %r60, 1073741824;
|
295 |
+
.loc 2 98 30
|
296 |
+
div.full.f32 %r58, %r59, %r60;
|
297 |
+
mov.b32 %f39, %r58;
|
298 |
+
div.full.f32 %r61, %r62, %r60;
|
299 |
+
mov.b32 %f40, %r61;
|
300 |
+
.loc 2 98 22
|
301 |
+
add.f32 %f41, %f6, %f40;
|
302 |
+
add.f32 %f42, %f5, %f39;
|
303 |
+
.loc 2 101 30
|
304 |
+
sub.f32 %f43, %f35, %f42;
|
305 |
+
sub.f32 %f44, %f36, %f41;
|
306 |
+
$L__tmp4:
|
307 |
+
.loc 1 50 50
|
308 |
+
fma.rn.f32 %f45, %f37, %f44, %f8;
|
309 |
+
fma.rn.f32 %f46, %f38, %f43, %f7;
|
310 |
+
.loc 1 24 33
|
311 |
+
and.b32 %r119, %r1, 127;
|
312 |
+
.loc 1 31 36
|
313 |
+
shl.b32 %r120, %r119, 2;
|
314 |
+
mov.u32 %r121, global_smem;
|
315 |
+
add.s32 %r8, %r121, %r120;
|
316 |
+
st.shared.u32 [%r8], %r60;
|
317 |
+
st.shared.u32 [%r8+520], %r60;
|
318 |
+
bar.sync 0;
|
319 |
+
mad.lo.s32 %r122, %r3, 130, %r5;
|
320 |
+
shl.b32 %r123, %r122, 2;
|
321 |
+
add.s32 %r124, %r121, %r123;
|
322 |
+
ld.shared.v2.f32 {%f47, %f48}, [%r124];
|
323 |
+
$L__tmp5:
|
324 |
+
.loc 2 120 46
|
325 |
+
bar.sync 0;
|
326 |
+
$L__tmp6:
|
327 |
+
.loc 2 108 21
|
328 |
+
sub.f32 %f49, %f41, %f42;
|
329 |
+
.loc 2 109 28
|
330 |
+
add.f32 %f50, %f47, %f48;
|
331 |
+
.loc 2 110 39
|
332 |
+
setp.eq.f32 %p41, %f50, 0f00000000;
|
333 |
+
.loc 2 110 60
|
334 |
+
mov.b32 %r65, %f48;
|
335 |
+
mov.b32 %r66, %f50;
|
336 |
+
div.full.f32 %r64, %r65, %r66;
|
337 |
+
mov.b32 %f51, %r64;
|
338 |
+
.loc 2 110 49
|
339 |
+
selp.f32 %f52, 0f00000000, %f51, %p41;
|
340 |
+
.loc 2 112 17
|
341 |
+
fma.rn.f32 %f53, %f49, %f52, %f42;
|
342 |
+
.loc 2 113 15
|
343 |
+
add.f32 %f54, %f46, %f45;
|
344 |
+
.loc 2 113 30
|
345 |
+
mul.f32 %f55, %f49, %f49;
|
346 |
+
.loc 2 113 38
|
347 |
+
mul.f32 %f56, %f55, %f47;
|
348 |
+
.loc 2 113 22
|
349 |
+
fma.rn.f32 %f57, %f56, %f52, %f54;
|
350 |
+
$L__tmp7:
|
351 |
+
.loc 2 120 46
|
352 |
+
mov.b32 %r125, %f53;
|
353 |
+
shfl.sync.bfly.b32 %r126, %r125, 16, 31, -1;
|
354 |
+
mov.b32 %f58, %r126;
|
355 |
+
mov.b32 %r127, %f57;
|
356 |
+
shfl.sync.bfly.b32 %r128, %r127, 16, 31, -1;
|
357 |
+
mov.b32 %f59, %r128;
|
358 |
+
shfl.sync.bfly.b32 %r68, %r66, 16, 31, -1;
|
359 |
+
mov.b32 %f60, %r68;
|
360 |
+
$L__tmp8:
|
361 |
+
.loc 2 108 21
|
362 |
+
sub.f32 %f61, %f58, %f53;
|
363 |
+
.loc 2 109 28
|
364 |
+
add.f32 %f62, %f50, %f60;
|
365 |
+
.loc 2 110 39
|
366 |
+
setp.eq.f32 %p42, %f62, 0f00000000;
|
367 |
+
.loc 2 110 60
|
368 |
+
mov.b32 %r69, %f62;
|
369 |
+
div.full.f32 %r67, %r68, %r69;
|
370 |
+
mov.b32 %f63, %r67;
|
371 |
+
.loc 2 110 49
|
372 |
+
selp.f32 %f64, 0f00000000, %f63, %p42;
|
373 |
+
.loc 2 112 17
|
374 |
+
fma.rn.f32 %f65, %f61, %f64, %f53;
|
375 |
+
.loc 2 113 15
|
376 |
+
add.f32 %f66, %f57, %f59;
|
377 |
+
.loc 2 113 30
|
378 |
+
mul.f32 %f67, %f61, %f61;
|
379 |
+
.loc 2 113 38
|
380 |
+
mul.f32 %f68, %f50, %f67;
|
381 |
+
.loc 2 113 22
|
382 |
+
fma.rn.f32 %f69, %f68, %f64, %f66;
|
383 |
+
$L__tmp9:
|
384 |
+
.loc 2 120 46
|
385 |
+
mov.b32 %r129, %f65;
|
386 |
+
shfl.sync.bfly.b32 %r130, %r129, 8, 31, -1;
|
387 |
+
mov.b32 %f70, %r130;
|
388 |
+
mov.b32 %r131, %f69;
|
389 |
+
shfl.sync.bfly.b32 %r132, %r131, 8, 31, -1;
|
390 |
+
mov.b32 %f71, %r132;
|
391 |
+
shfl.sync.bfly.b32 %r71, %r69, 8, 31, -1;
|
392 |
+
mov.b32 %f72, %r71;
|
393 |
+
$L__tmp10:
|
394 |
+
.loc 2 108 21
|
395 |
+
sub.f32 %f73, %f70, %f65;
|
396 |
+
.loc 2 109 28
|
397 |
+
add.f32 %f74, %f62, %f72;
|
398 |
+
.loc 2 110 39
|
399 |
+
setp.eq.f32 %p43, %f74, 0f00000000;
|
400 |
+
.loc 2 110 60
|
401 |
+
mov.b32 %r72, %f74;
|
402 |
+
div.full.f32 %r70, %r71, %r72;
|
403 |
+
mov.b32 %f75, %r70;
|
404 |
+
.loc 2 110 49
|
405 |
+
selp.f32 %f76, 0f00000000, %f75, %p43;
|
406 |
+
.loc 2 112 17
|
407 |
+
fma.rn.f32 %f77, %f73, %f76, %f65;
|
408 |
+
.loc 2 113 15
|
409 |
+
add.f32 %f78, %f69, %f71;
|
410 |
+
.loc 2 113 30
|
411 |
+
mul.f32 %f79, %f73, %f73;
|
412 |
+
.loc 2 113 38
|
413 |
+
mul.f32 %f80, %f62, %f79;
|
414 |
+
.loc 2 113 22
|
415 |
+
fma.rn.f32 %f81, %f76, %f80, %f78;
|
416 |
+
$L__tmp11:
|
417 |
+
.loc 2 120 46
|
418 |
+
mov.b32 %r133, %f77;
|
419 |
+
shfl.sync.bfly.b32 %r134, %r133, 4, 31, -1;
|
420 |
+
mov.b32 %f82, %r134;
|
421 |
+
mov.b32 %r135, %f81;
|
422 |
+
shfl.sync.bfly.b32 %r136, %r135, 4, 31, -1;
|
423 |
+
mov.b32 %f83, %r136;
|
424 |
+
shfl.sync.bfly.b32 %r74, %r72, 4, 31, -1;
|
425 |
+
mov.b32 %f84, %r74;
|
426 |
+
$L__tmp12:
|
427 |
+
.loc 2 108 21
|
428 |
+
sub.f32 %f85, %f82, %f77;
|
429 |
+
.loc 2 109 28
|
430 |
+
add.f32 %f86, %f74, %f84;
|
431 |
+
.loc 2 110 39
|
432 |
+
setp.eq.f32 %p44, %f86, 0f00000000;
|
433 |
+
.loc 2 110 60
|
434 |
+
mov.b32 %r75, %f86;
|
435 |
+
div.full.f32 %r73, %r74, %r75;
|
436 |
+
mov.b32 %f87, %r73;
|
437 |
+
.loc 2 110 49
|
438 |
+
selp.f32 %f88, 0f00000000, %f87, %p44;
|
439 |
+
.loc 2 112 17
|
440 |
+
fma.rn.f32 %f89, %f85, %f88, %f77;
|
441 |
+
.loc 2 113 15
|
442 |
+
add.f32 %f90, %f81, %f83;
|
443 |
+
.loc 2 113 30
|
444 |
+
mul.f32 %f91, %f85, %f85;
|
445 |
+
.loc 2 113 38
|
446 |
+
mul.f32 %f92, %f74, %f91;
|
447 |
+
.loc 2 113 22
|
448 |
+
fma.rn.f32 %f93, %f88, %f92, %f90;
|
449 |
+
$L__tmp13:
|
450 |
+
.loc 2 120 46
|
451 |
+
mov.b32 %r137, %f89;
|
452 |
+
shfl.sync.bfly.b32 %r138, %r137, 2, 31, -1;
|
453 |
+
mov.b32 %f94, %r138;
|
454 |
+
mov.b32 %r139, %f93;
|
455 |
+
shfl.sync.bfly.b32 %r140, %r139, 2, 31, -1;
|
456 |
+
mov.b32 %f95, %r140;
|
457 |
+
shfl.sync.bfly.b32 %r77, %r75, 2, 31, -1;
|
458 |
+
mov.b32 %f96, %r77;
|
459 |
+
$L__tmp14:
|
460 |
+
.loc 2 108 21
|
461 |
+
sub.f32 %f97, %f94, %f89;
|
462 |
+
.loc 2 109 28
|
463 |
+
add.f32 %f98, %f86, %f96;
|
464 |
+
.loc 2 110 39
|
465 |
+
setp.eq.f32 %p45, %f98, 0f00000000;
|
466 |
+
.loc 2 110 60
|
467 |
+
mov.b32 %r78, %f98;
|
468 |
+
div.full.f32 %r76, %r77, %r78;
|
469 |
+
mov.b32 %f99, %r76;
|
470 |
+
.loc 2 110 49
|
471 |
+
selp.f32 %f100, 0f00000000, %f99, %p45;
|
472 |
+
.loc 2 112 17
|
473 |
+
fma.rn.f32 %f101, %f97, %f100, %f89;
|
474 |
+
.loc 2 113 15
|
475 |
+
add.f32 %f102, %f93, %f95;
|
476 |
+
.loc 2 113 30
|
477 |
+
mul.f32 %f103, %f97, %f97;
|
478 |
+
.loc 2 113 38
|
479 |
+
mul.f32 %f104, %f86, %f103;
|
480 |
+
.loc 2 113 22
|
481 |
+
fma.rn.f32 %f105, %f100, %f104, %f102;
|
482 |
+
$L__tmp15:
|
483 |
+
.loc 2 120 46
|
484 |
+
mov.b32 %r141, %f101;
|
485 |
+
shfl.sync.bfly.b32 %r142, %r141, 1, 31, -1;
|
486 |
+
mov.b32 %f106, %r142;
|
487 |
+
mov.b32 %r143, %f105;
|
488 |
+
shfl.sync.bfly.b32 %r144, %r143, 1, 31, -1;
|
489 |
+
mov.b32 %f107, %r144;
|
490 |
+
shfl.sync.bfly.b32 %r80, %r78, 1, 31, -1;
|
491 |
+
mov.b32 %f108, %r80;
|
492 |
+
$L__tmp16:
|
493 |
+
.loc 2 108 21
|
494 |
+
sub.f32 %f109, %f106, %f101;
|
495 |
+
.loc 2 109 28
|
496 |
+
add.f32 %f110, %f98, %f108;
|
497 |
+
.loc 2 110 39
|
498 |
+
setp.eq.f32 %p46, %f110, 0f00000000;
|
499 |
+
.loc 2 110 60
|
500 |
+
mov.b32 %r81, %f110;
|
501 |
+
div.full.f32 %r79, %r80, %r81;
|
502 |
+
mov.b32 %f111, %r79;
|
503 |
+
.loc 2 110 49
|
504 |
+
selp.f32 %f112, 0f00000000, %f111, %p46;
|
505 |
+
.loc 2 112 17
|
506 |
+
fma.rn.f32 %f113, %f109, %f112, %f101;
|
507 |
+
.loc 2 113 15
|
508 |
+
add.f32 %f114, %f105, %f107;
|
509 |
+
.loc 2 113 30
|
510 |
+
mul.f32 %f115, %f109, %f109;
|
511 |
+
.loc 2 113 38
|
512 |
+
mul.f32 %f116, %f98, %f115;
|
513 |
+
.loc 2 113 22
|
514 |
+
fma.rn.f32 %f117, %f112, %f116, %f114;
|
515 |
+
$L__tmp17:
|
516 |
+
.loc 2 120 46
|
517 |
+
setp.eq.s32 %p24, %r2, 0;
|
518 |
+
shr.u32 %r145, %r1, 3;
|
519 |
+
and.b32 %r146, %r145, 4;
|
520 |
+
shl.b32 %r147, %r3, 3;
|
521 |
+
or.b32 %r148, %r147, %r146;
|
522 |
+
add.s32 %r82, %r121, %r148;
|
523 |
+
mov.b32 %r83, %f113;
|
524 |
+
@%p24 st.shared.b32 [ %r82 + 0 ], %r83;
|
525 |
+
add.s32 %r149, %r121, 16;
|
526 |
+
add.s32 %r84, %r149, %r148;
|
527 |
+
mov.b32 %r85, %f117;
|
528 |
+
@%p24 st.shared.b32 [ %r84 + 0 ], %r85;
|
529 |
+
add.s32 %r150, %r121, 32;
|
530 |
+
add.s32 %r86, %r150, %r148;
|
531 |
+
@%p24 st.shared.b32 [ %r86 + 0 ], %r81;
|
532 |
+
bar.sync 0;
|
533 |
+
setp.lt.s32 %p27, %r1, 4;
|
534 |
+
shl.b32 %r151, %r1, 2;
|
535 |
+
add.s32 %r89, %r121, %r151;
|
536 |
+
@%p27 ld.shared.b32 %r88, [ %r89 + 0 ];
|
537 |
+
mov.b32 %f118, %r88;
|
538 |
+
add.s32 %r91, %r149, %r151;
|
539 |
+
@%p27 ld.shared.b32 %r90, [ %r91 + 0 ];
|
540 |
+
mov.b32 %f119, %r90;
|
541 |
+
add.s32 %r93, %r150, %r151;
|
542 |
+
@%p27 ld.shared.b32 %r92, [ %r93 + 0 ];
|
543 |
+
mov.b32 %f120, %r92;
|
544 |
+
shfl.sync.bfly.b32 %r152, %r88, 1, 31, -1;
|
545 |
+
mov.b32 %f121, %r152;
|
546 |
+
shfl.sync.bfly.b32 %r153, %r90, 1, 31, -1;
|
547 |
+
mov.b32 %f122, %r153;
|
548 |
+
shfl.sync.bfly.b32 %r95, %r92, 1, 31, -1;
|
549 |
+
mov.b32 %f123, %r95;
|
550 |
+
$L__tmp18:
|
551 |
+
.loc 2 108 21
|
552 |
+
sub.f32 %f124, %f121, %f118;
|
553 |
+
.loc 2 109 28
|
554 |
+
add.f32 %f125, %f120, %f123;
|
555 |
+
.loc 2 110 39
|
556 |
+
setp.eq.f32 %p47, %f125, 0f00000000;
|
557 |
+
.loc 2 110 60
|
558 |
+
mov.b32 %r96, %f125;
|
559 |
+
div.full.f32 %r94, %r95, %r96;
|
560 |
+
mov.b32 %f126, %r94;
|
561 |
+
.loc 2 110 49
|
562 |
+
selp.f32 %f127, 0f00000000, %f126, %p47;
|
563 |
+
.loc 2 112 17
|
564 |
+
fma.rn.f32 %f128, %f124, %f127, %f118;
|
565 |
+
.loc 2 113 15
|
566 |
+
add.f32 %f129, %f119, %f122;
|
567 |
+
.loc 2 113 30
|
568 |
+
mul.f32 %f130, %f124, %f124;
|
569 |
+
.loc 2 113 38
|
570 |
+
mul.f32 %f131, %f120, %f130;
|
571 |
+
.loc 2 113 22
|
572 |
+
fma.rn.f32 %f132, %f131, %f127, %f129;
|
573 |
+
$L__tmp19:
|
574 |
+
.loc 2 120 46
|
575 |
+
setp.eq.s32 %p48, %r4, 0;
|
576 |
+
and.pred %p30, %p27, %p48;
|
577 |
+
mov.b32 %r98, %f128;
|
578 |
+
@%p30 st.shared.b32 [ %r89 + 0 ], %r98;
|
579 |
+
mov.b32 %r100, %f132;
|
580 |
+
@%p30 st.shared.b32 [ %r91 + 0 ], %r100;
|
581 |
+
@%p30 st.shared.b32 [ %r93 + 0 ], %r96;
|
582 |
+
bar.sync 0;
|
583 |
+
add.s32 %r154, %r121, %r147;
|
584 |
+
ld.shared.f32 %f13, [%r154];
|
585 |
+
add.s32 %r155, %r149, %r147;
|
586 |
+
$L__tmp20:
|
587 |
+
.loc 1 75 24
|
588 |
+
ld.shared.u32 %r104, [%r155];
|
589 |
+
mov.b32 %r105, 1132462080;
|
590 |
+
div.full.f32 %r103, %r104, %r105;
|
591 |
+
mov.b32 %f133, %r103;
|
592 |
+
.loc 1 77 24
|
593 |
+
add.f32 %f14, %f133, 0f3727C5AC;
|
594 |
+
shl.b32 %r156, %r5, 2;
|
595 |
+
add.s32 %r9, %r121, %r156;
|
596 |
+
.loc 1 62 51
|
597 |
+
mov.u32 %r109, 0x0;
|
598 |
+
mov.u32 %r110, 0x0;
|
599 |
+
@%p61 ld.global.L1::evict_last.v2.b32 { %r109, %r110 }, [ %rd62 + 0 ];
|
600 |
+
@!%p61 mov.u32 %r109, %r179;
|
601 |
+
@!%p61 mov.u32 %r110, %r179;
|
602 |
+
mov.b32 %f15, %r109;
|
603 |
+
mov.b32 %f16, %r110;
|
604 |
+
.loc 1 63 51
|
605 |
+
mov.u32 %r113, 0x0;
|
606 |
+
@%p61 ld.global.L1::evict_first.b32 { %r113 }, [ %rd63 + 0 ];
|
607 |
+
@!%p61 mov.u32 %r113, %r179;
|
608 |
+
cvt.u16.u32 %rs5, %r113;
|
609 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r113; }
|
610 |
+
.loc 1 63 103
|
611 |
+
cvt.f32.bf16 %r115, %rs5;
|
612 |
+
mov.b32 %f17, %r115;
|
613 |
+
cvt.f32.bf16 %r116, %rs6;
|
614 |
+
mov.b32 %f18, %r116;
|
615 |
+
.loc 1 64 35
|
616 |
+
mul.wide.u32 %rd65, %r119, 4;
|
617 |
+
add.s64 %rd64, %rd14, %rd65;
|
618 |
+
.loc 1 64 40
|
619 |
+
mov.u32 %r117, 0x0;
|
620 |
+
@%p61 ld.global.L1::evict_last.b32 { %r117 }, [ %rd64 + 0 ];
|
621 |
+
@!%p61 mov.u32 %r117, %r179;
|
622 |
+
mov.u64 %rd90, assertMessage_1;
|
623 |
+
mov.u64 %rd91, assertFile_1;
|
624 |
+
mov.u64 %rd92, assertFunc_1;
|
625 |
+
.loc 1 68 57
|
626 |
+
@%p11 bra $L__BB0_6;
|
627 |
+
cvta.global.u64 %rd67, %rd90;
|
628 |
+
cvta.global.u64 %rd69, %rd91;
|
629 |
+
cvta.global.u64 %rd71, %rd92;
|
630 |
+
{ // callseq 4, 0
|
631 |
+
.reg .b32 temp_param_reg;
|
632 |
+
.param .b64 param0;
|
633 |
+
st.param.b64 [param0+0], %rd67;
|
634 |
+
.param .b64 param1;
|
635 |
+
st.param.b64 [param1+0], %rd69;
|
636 |
+
.param .b32 param2;
|
637 |
+
st.param.b32 [param2+0], %r187;
|
638 |
+
.param .b64 param3;
|
639 |
+
st.param.b64 [param3+0], %rd71;
|
640 |
+
.param .b64 param4;
|
641 |
+
st.param.b64 [param4+0], %rd98;
|
642 |
+
call.uni
|
643 |
+
__assertfail,
|
644 |
+
(
|
645 |
+
param0,
|
646 |
+
param1,
|
647 |
+
param2,
|
648 |
+
param3,
|
649 |
+
param4
|
650 |
+
);
|
651 |
+
} // callseq 4
|
652 |
+
$L__BB0_6:
|
653 |
+
.loc 1 69 54
|
654 |
+
mov.u32 %r158, 0x0;
|
655 |
+
mov.u32 %r159, 0x0;
|
656 |
+
@%p61 ld.global.L1::evict_first.v2.b32 { %r158, %r159 }, [ %rd73 + 0 ];
|
657 |
+
@!%p61 mov.u32 %r158, %r179;
|
658 |
+
@!%p61 mov.u32 %r159, %r179;
|
659 |
+
mov.b32 %f134, %r158;
|
660 |
+
mov.b32 %f135, %r159;
|
661 |
+
.loc 1 70 24
|
662 |
+
add.f32 %f136, %f15, %f134;
|
663 |
+
add.f32 %f137, %f16, %f135;
|
664 |
+
.loc 1 72 24
|
665 |
+
add.f32 %f138, %f17, %f136;
|
666 |
+
add.f32 %f139, %f18, %f137;
|
667 |
+
.loc 1 73 24
|
668 |
+
sub.f32 %f140, %f138, %f13;
|
669 |
+
sub.f32 %f141, %f139, %f13;
|
670 |
+
.loc 1 78 30
|
671 |
+
rsqrt.approx.ftz.f32 %f142, %f14;
|
672 |
+
.loc 1 79 24
|
673 |
+
mul.f32 %f143, %f140, %f142;
|
674 |
+
mul.f32 %f144, %f141, %f142;
|
675 |
+
.loc 1 80 24
|
676 |
+
bar.sync 0;
|
677 |
+
st.shared.u32 [%r8], %r117;
|
678 |
+
bar.sync 0;
|
679 |
+
ld.shared.v2.f32 {%f145, %f146}, [%r9];
|
680 |
+
mul.f32 %f147, %f143, %f145;
|
681 |
+
mul.f32 %f148, %f144, %f146;
|
682 |
+
.loc 1 82 29
|
683 |
+
shl.b64 %rd78, %rd4, 1;
|
684 |
+
add.s64 %rd74, %rd15, %rd78;
|
685 |
+
.loc 1 82 52
|
686 |
+
mov.b32 %r162, %f147;
|
687 |
+
cvt.rn.bf16.f32 %rs7, %r162;
|
688 |
+
mov.b32 %r163, %f148;
|
689 |
+
cvt.rn.bf16.f32 %rs8, %r163;
|
690 |
+
mov.b32 %r175, {%rs7, %rs8};
|
691 |
+
@%p61 st.global.b32 [ %rd74 + 0 ], { %r175 };
|
692 |
+
.loc 1 62 51
|
693 |
+
mov.u32 %r165, 0x0;
|
694 |
+
mov.u32 %r166, 0x0;
|
695 |
+
@%p61 ld.global.L1::evict_last.v2.b32 { %r165, %r166 }, [ %rd75 + 0 ];
|
696 |
+
@!%p61 mov.u32 %r165, %r179;
|
697 |
+
@!%p61 mov.u32 %r166, %r179;
|
698 |
+
.loc 1 63 51
|
699 |
+
mov.u32 %r169, 0x0;
|
700 |
+
@%p61 ld.global.L1::evict_first.b32 { %r169 }, [ %rd76 + 0 ];
|
701 |
+
@!%p61 mov.u32 %r169, %r179;
|
702 |
+
cvt.u16.u32 %rs9, %r169;
|
703 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r169; }
|
704 |
+
.loc 1 63 103
|
705 |
+
cvt.f32.bf16 %r171, %rs9;
|
706 |
+
mov.b32 %f19, %r171;
|
707 |
+
cvt.f32.bf16 %r172, %rs10;
|
708 |
+
mov.b32 %f20, %r172;
|
709 |
+
.loc 1 64 35
|
710 |
+
add.s64 %rd77, %rd64, 512;
|
711 |
+
.loc 1 64 40
|
712 |
+
mov.u32 %r173, 0x0;
|
713 |
+
@%p61 ld.global.L1::evict_last.b32 { %r173 }, [ %rd77 + 0 ];
|
714 |
+
@!%p61 mov.u32 %r173, %r179;
|
715 |
+
.loc 1 68 57
|
716 |
+
@%p11 bra $L__BB0_8;
|
717 |
+
cvta.global.u64 %rd80, %rd90;
|
718 |
+
cvta.global.u64 %rd82, %rd91;
|
719 |
+
cvta.global.u64 %rd84, %rd92;
|
720 |
+
{ // callseq 5, 0
|
721 |
+
.reg .b32 temp_param_reg;
|
722 |
+
.param .b64 param0;
|
723 |
+
st.param.b64 [param0+0], %rd80;
|
724 |
+
.param .b64 param1;
|
725 |
+
st.param.b64 [param1+0], %rd82;
|
726 |
+
.param .b32 param2;
|
727 |
+
st.param.b32 [param2+0], %r187;
|
728 |
+
.param .b64 param3;
|
729 |
+
st.param.b64 [param3+0], %rd84;
|
730 |
+
.param .b64 param4;
|
731 |
+
st.param.b64 [param4+0], %rd98;
|
732 |
+
call.uni
|
733 |
+
__assertfail,
|
734 |
+
(
|
735 |
+
param0,
|
736 |
+
param1,
|
737 |
+
param2,
|
738 |
+
param3,
|
739 |
+
param4
|
740 |
+
);
|
741 |
+
} // callseq 5
|
742 |
+
$L__BB0_8:
|
743 |
+
.loc 1 69 54
|
744 |
+
mov.u32 %r177, 0x0;
|
745 |
+
mov.u32 %r178, 0x0;
|
746 |
+
@%p61 ld.global.L1::evict_first.v2.b32 { %r177, %r178 }, [ %rd86 + 0 ];
|
747 |
+
@!%p61 mov.u32 %r177, %r179;
|
748 |
+
@!%p61 mov.u32 %r178, %r179;
|
749 |
+
.loc 1 62 51
|
750 |
+
mov.b32 %f150, %r166;
|
751 |
+
.loc 1 69 54
|
752 |
+
mov.b32 %f151, %r178;
|
753 |
+
.loc 1 70 24
|
754 |
+
add.f32 %f152, %f150, %f151;
|
755 |
+
.loc 1 72 24
|
756 |
+
add.f32 %f153, %f20, %f152;
|
757 |
+
.loc 1 73 24
|
758 |
+
sub.f32 %f154, %f153, %f13;
|
759 |
+
.loc 1 62 51
|
760 |
+
mov.b32 %f155, %r165;
|
761 |
+
.loc 1 69 54
|
762 |
+
mov.b32 %f156, %r177;
|
763 |
+
.loc 1 70 24
|
764 |
+
add.f32 %f157, %f155, %f156;
|
765 |
+
.loc 1 72 24
|
766 |
+
add.f32 %f158, %f19, %f157;
|
767 |
+
.loc 1 73 24
|
768 |
+
sub.f32 %f159, %f158, %f13;
|
769 |
+
.loc 1 79 24
|
770 |
+
mul.f32 %f160, %f159, %f142;
|
771 |
+
mul.f32 %f161, %f154, %f142;
|
772 |
+
.loc 1 80 24
|
773 |
+
bar.sync 0;
|
774 |
+
st.shared.u32 [%r8], %r173;
|
775 |
+
bar.sync 0;
|
776 |
+
ld.shared.v2.f32 {%f162, %f163}, [%r9];
|
777 |
+
mul.f32 %f164, %f160, %f162;
|
778 |
+
mul.f32 %f165, %f161, %f163;
|
779 |
+
.loc 1 82 29
|
780 |
+
add.s64 %rd89, %rd15, %rd52;
|
781 |
+
add.s64 %rd87, %rd89, 256;
|
782 |
+
.loc 1 82 52
|
783 |
+
mov.b32 %r181, %f164;
|
784 |
+
cvt.rn.bf16.f32 %rs11, %r181;
|
785 |
+
mov.b32 %r182, %f165;
|
786 |
+
cvt.rn.bf16.f32 %rs12, %r182;
|
787 |
+
mov.b32 %r184, {%rs11, %rs12};
|
788 |
+
@%p61 st.global.b32 [ %rd87 + 0 ], { %r184 };
|
789 |
+
.loc 1 58 4
|
790 |
+
ret;
|
791 |
+
$L__tmp21:
|
792 |
+
$L__func_end0:
|
793 |
+
|
794 |
+
}
|
795 |
+
// .globl __nv_rsqrtf
|
796 |
+
.visible .func (.param .b32 func_retval0) __nv_rsqrtf(
|
797 |
+
.param .b32 __nv_rsqrtf_param_0
|
798 |
+
)
|
799 |
+
{
|
800 |
+
.reg .f32 %f<3>;
|
801 |
+
$L__func_begin1:
|
802 |
+
|
803 |
+
ld.param.f32 %f1, [__nv_rsqrtf_param_0];
|
804 |
+
rsqrt.approx.ftz.f32 %f2, %f1;
|
805 |
+
st.param.f32 [func_retval0+0], %f2;
|
806 |
+
ret;
|
807 |
+
$L__func_end1:
|
808 |
+
|
809 |
+
}
|
810 |
+
.file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py"
|
811 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
|
812 |
+
.section .debug_abbrev
|
813 |
+
{
|
814 |
+
.b8 1
|
815 |
+
.b8 17
|
816 |
+
.b8 1
|
817 |
+
.b8 37
|
818 |
+
.b8 8
|
819 |
+
.b8 19
|
820 |
+
.b8 5
|
821 |
+
.b8 3
|
822 |
+
.b8 8
|
823 |
+
.b8 16
|
824 |
+
.b8 6
|
825 |
+
.b8 27
|
826 |
+
.b8 8
|
827 |
+
.b8 180
|
828 |
+
.b8 66
|
829 |
+
.b8 12
|
830 |
+
.b8 17
|
831 |
+
.b8 1
|
832 |
+
.b8 18
|
833 |
+
.b8 1
|
834 |
+
.b8 0
|
835 |
+
.b8 0
|
836 |
+
.b8 2
|
837 |
+
.b8 46
|
838 |
+
.b8 0
|
839 |
+
.b8 135
|
840 |
+
.b8 64
|
841 |
+
.b8 8
|
842 |
+
.b8 3
|
843 |
+
.b8 8
|
844 |
+
.b8 58
|
845 |
+
.b8 11
|
846 |
+
.b8 59
|
847 |
+
.b8 11
|
848 |
+
.b8 63
|
849 |
+
.b8 12
|
850 |
+
.b8 32
|
851 |
+
.b8 11
|
852 |
+
.b8 0
|
853 |
+
.b8 0
|
854 |
+
.b8 3
|
855 |
+
.b8 46
|
856 |
+
.b8 1
|
857 |
+
.b8 17
|
858 |
+
.b8 1
|
859 |
+
.b8 18
|
860 |
+
.b8 1
|
861 |
+
.b8 64
|
862 |
+
.b8 10
|
863 |
+
.b8 49
|
864 |
+
.b8 19
|
865 |
+
.b8 0
|
866 |
+
.b8 0
|
867 |
+
.b8 4
|
868 |
+
.b8 29
|
869 |
+
.b8 0
|
870 |
+
.b8 49
|
871 |
+
.b8 19
|
872 |
+
.b8 17
|
873 |
+
.b8 1
|
874 |
+
.b8 18
|
875 |
+
.b8 1
|
876 |
+
.b8 88
|
877 |
+
.b8 11
|
878 |
+
.b8 89
|
879 |
+
.b8 11
|
880 |
+
.b8 87
|
881 |
+
.b8 11
|
882 |
+
.b8 0
|
883 |
+
.b8 0
|
884 |
+
.b8 5
|
885 |
+
.b8 29
|
886 |
+
.b8 1
|
887 |
+
.b8 49
|
888 |
+
.b8 19
|
889 |
+
.b8 17
|
890 |
+
.b8 1
|
891 |
+
.b8 18
|
892 |
+
.b8 1
|
893 |
+
.b8 88
|
894 |
+
.b8 11
|
895 |
+
.b8 89
|
896 |
+
.b8 11
|
897 |
+
.b8 87
|
898 |
+
.b8 11
|
899 |
+
.b8 0
|
900 |
+
.b8 0
|
901 |
+
.b8 0
|
902 |
+
}
|
903 |
+
.section .debug_info
|
904 |
+
{
|
905 |
+
.b32 302
|
906 |
+
.b8 2
|
907 |
+
.b8 0
|
908 |
+
.b32 .debug_abbrev
|
909 |
+
.b8 8
|
910 |
+
.b8 1
|
911 |
+
.b8 116
|
912 |
+
.b8 114
|
913 |
+
.b8 105
|
914 |
+
.b8 116
|
915 |
+
.b8 111
|
916 |
+
.b8 110
|
917 |
+
.b8 0
|
918 |
+
.b8 2
|
919 |
+
.b8 0
|
920 |
+
.b8 99
|
921 |
+
.b8 112
|
922 |
+
.b8 110
|
923 |
+
.b8 51
|
924 |
+
.b8 108
|
925 |
+
.b8 97
|
926 |
+
.b8 119
|
927 |
+
.b8 103
|
928 |
+
.b8 54
|
929 |
+
.b8 53
|
930 |
+
.b8 108
|
931 |
+
.b8 112
|
932 |
+
.b8 105
|
933 |
+
.b8 54
|
934 |
+
.b8 51
|
935 |
+
.b8 103
|
936 |
+
.b8 118
|
937 |
+
.b8 54
|
938 |
+
.b8 99
|
939 |
+
.b8 54
|
940 |
+
.b8 112
|
941 |
+
.b8 110
|
942 |
+
.b8 52
|
943 |
+
.b8 111
|
944 |
+
.b8 105
|
945 |
+
.b8 107
|
946 |
+
.b8 104
|
947 |
+
.b8 103
|
948 |
+
.b8 54
|
949 |
+
.b8 113
|
950 |
+
.b8 118
|
951 |
+
.b8 97
|
952 |
+
.b8 50
|
953 |
+
.b8 104
|
954 |
+
.b8 50
|
955 |
+
.b8 113
|
956 |
+
.b8 106
|
957 |
+
.b8 100
|
958 |
+
.b8 112
|
959 |
+
.b8 120
|
960 |
+
.b8 101
|
961 |
+
.b8 54
|
962 |
+
.b8 113
|
963 |
+
.b8 106
|
964 |
+
.b8 52
|
965 |
+
.b8 108
|
966 |
+
.b8 118
|
967 |
+
.b8 116
|
968 |
+
.b8 116
|
969 |
+
.b8 119
|
970 |
+
.b8 101
|
971 |
+
.b8 122
|
972 |
+
.b8 46
|
973 |
+
.b8 112
|
974 |
+
.b8 121
|
975 |
+
.b8 0
|
976 |
+
.b32 .debug_line
|
977 |
+
.b8 47
|
978 |
+
.b8 116
|
979 |
+
.b8 109
|
980 |
+
.b8 112
|
981 |
+
.b8 47
|
982 |
+
.b8 116
|
983 |
+
.b8 111
|
984 |
+
.b8 114
|
985 |
+
.b8 99
|
986 |
+
.b8 104
|
987 |
+
.b8 105
|
988 |
+
.b8 110
|
989 |
+
.b8 100
|
990 |
+
.b8 117
|
991 |
+
.b8 99
|
992 |
+
.b8 116
|
993 |
+
.b8 111
|
994 |
+
.b8 114
|
995 |
+
.b8 95
|
996 |
+
.b8 114
|
997 |
+
.b8 111
|
998 |
+
.b8 111
|
999 |
+
.b8 116
|
1000 |
+
.b8 47
|
1001 |
+
.b8 112
|
1002 |
+
.b8 110
|
1003 |
+
.b8 0
|
1004 |
+
.b8 1
|
1005 |
+
.b64 $L__func_begin0
|
1006 |
+
.b64 $L__func_end0
|
1007 |
+
.b8 2
|
1008 |
+
.b8 116
|
1009 |
+
.b8 114
|
1010 |
+
.b8 105
|
1011 |
+
.b8 116
|
1012 |
+
.b8 111
|
1013 |
+
.b8 110
|
1014 |
+
.b8 95
|
1015 |
+
.b8 95
|
1016 |
+
.b8 48
|
1017 |
+
.b8 100
|
1018 |
+
.b8 49
|
1019 |
+
.b8 100
|
1020 |
+
.b8 50
|
1021 |
+
.b8 100
|
1022 |
+
.b8 51
|
1023 |
+
.b8 100
|
1024 |
+
.b8 52
|
1025 |
+
.b8 100
|
1026 |
+
.b8 53
|
1027 |
+
.b8 100
|
1028 |
+
.b8 54
|
1029 |
+
.b8 100
|
1030 |
+
.b8 101
|
1031 |
+
.b8 55
|
1032 |
+
.b8 100
|
1033 |
+
.b8 101
|
1034 |
+
.b8 0
|
1035 |
+
.b8 116
|
1036 |
+
.b8 114
|
1037 |
+
.b8 105
|
1038 |
+
.b8 116
|
1039 |
+
.b8 111
|
1040 |
+
.b8 110
|
1041 |
+
.b8 95
|
1042 |
+
.b8 95
|
1043 |
+
.b8 48
|
1044 |
+
.b8 100
|
1045 |
+
.b8 49
|
1046 |
+
.b8 100
|
1047 |
+
.b8 50
|
1048 |
+
.b8 100
|
1049 |
+
.b8 51
|
1050 |
+
.b8 100
|
1051 |
+
.b8 52
|
1052 |
+
.b8 100
|
1053 |
+
.b8 53
|
1054 |
+
.b8 100
|
1055 |
+
.b8 54
|
1056 |
+
.b8 100
|
1057 |
+
.b8 101
|
1058 |
+
.b8 55
|
1059 |
+
.b8 100
|
1060 |
+
.b8 101
|
1061 |
+
.b8 0
|
1062 |
+
.b8 1
|
1063 |
+
.b8 18
|
1064 |
+
.b8 1
|
1065 |
+
.b8 1
|
1066 |
+
.b8 3
|
1067 |
+
.b64 $L__func_begin0
|
1068 |
+
.b64 $L__func_end0
|
1069 |
+
.b8 1
|
1070 |
+
.b8 156
|
1071 |
+
.b32 125
|
1072 |
+
.b8 4
|
1073 |
+
.b32 125
|
1074 |
+
.b64 $L__tmp1
|
1075 |
+
.b64 $L__tmp4
|
1076 |
+
.b8 2
|
1077 |
+
.b8 47
|
1078 |
+
.b8 41
|
1079 |
+
.b8 4
|
1080 |
+
.b32 125
|
1081 |
+
.b64 $L__tmp5
|
1082 |
+
.b64 $L__tmp20
|
1083 |
+
.b8 2
|
1084 |
+
.b8 53
|
1085 |
+
.b8 44
|
1086 |
+
.b8 5
|
1087 |
+
.b32 125
|
1088 |
+
.b64 $L__tmp6
|
1089 |
+
.b64 $L__tmp19
|
1090 |
+
.b8 2
|
1091 |
+
.b8 53
|
1092 |
+
.b8 44
|
1093 |
+
.b8 4
|
1094 |
+
.b32 125
|
1095 |
+
.b64 $L__tmp6
|
1096 |
+
.b64 $L__tmp19
|
1097 |
+
.b8 2
|
1098 |
+
.b8 120
|
1099 |
+
.b8 46
|
1100 |
+
.b8 0
|
1101 |
+
.b8 0
|
1102 |
+
.b8 0
|
1103 |
+
}
|
1104 |
+
.section .debug_pubnames
|
1105 |
+
{
|
1106 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
1107 |
+
$L__pubNames_start0:
|
1108 |
+
.b8 2
|
1109 |
+
.b8 0
|
1110 |
+
.b32 .debug_info
|
1111 |
+
.b32 306
|
1112 |
+
.b32 125
|
1113 |
+
.b8 116
|
1114 |
+
.b8 114
|
1115 |
+
.b8 105
|
1116 |
+
.b8 116
|
1117 |
+
.b8 111
|
1118 |
+
.b8 110
|
1119 |
+
.b8 95
|
1120 |
+
.b8 95
|
1121 |
+
.b8 48
|
1122 |
+
.b8 100
|
1123 |
+
.b8 49
|
1124 |
+
.b8 100
|
1125 |
+
.b8 50
|
1126 |
+
.b8 100
|
1127 |
+
.b8 51
|
1128 |
+
.b8 100
|
1129 |
+
.b8 52
|
1130 |
+
.b8 100
|
1131 |
+
.b8 53
|
1132 |
+
.b8 100
|
1133 |
+
.b8 54
|
1134 |
+
.b8 100
|
1135 |
+
.b8 101
|
1136 |
+
.b8 55
|
1137 |
+
.b8 100
|
1138 |
+
.b8 101
|
1139 |
+
.b8 0
|
1140 |
+
.b32 0
|
1141 |
+
$L__pubNames_end0:
|
1142 |
+
}
|
1143 |
+
.section .debug_pubtypes
|
1144 |
+
{
|
1145 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
1146 |
+
$L__pubTypes_start0:
|
1147 |
+
.b8 2
|
1148 |
+
.b8 0
|
1149 |
+
.b32 .debug_info
|
1150 |
+
.b32 306
|
1151 |
+
.b32 0
|
1152 |
+
$L__pubTypes_end0:
|
1153 |
+
}
|
1154 |
+
.section .debug_loc { }
|
.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ptx
ADDED
@@ -0,0 +1,1608 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2d3d4d5d6e7de
|
10 |
+
.extern .func __assertfail
|
11 |
+
(
|
12 |
+
.param .b64 __assertfail_param_0,
|
13 |
+
.param .b64 __assertfail_param_1,
|
14 |
+
.param .b32 __assertfail_param_2,
|
15 |
+
.param .b64 __assertfail_param_3,
|
16 |
+
.param .b64 __assertfail_param_4
|
17 |
+
)
|
18 |
+
;
|
19 |
+
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
20 |
+
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
21 |
+
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 55, 32, 60, 32, 53, 48, 50, 53, 55};
|
22 |
+
.extern .shared .align 1 .b8 global_smem[];
|
23 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
24 |
+
|
25 |
+
.visible .entry triton__0d1d2d3d4d5d6e7de(
|
26 |
+
.param .u64 triton__0d1d2d3d4d5d6e7de_param_0,
|
27 |
+
.param .u64 triton__0d1d2d3d4d5d6e7de_param_1,
|
28 |
+
.param .u64 triton__0d1d2d3d4d5d6e7de_param_2,
|
29 |
+
.param .u64 triton__0d1d2d3d4d5d6e7de_param_3,
|
30 |
+
.param .u64 triton__0d1d2d3d4d5d6e7de_param_4,
|
31 |
+
.param .u64 triton__0d1d2d3d4d5d6e7de_param_5,
|
32 |
+
.param .u64 triton__0d1d2d3d4d5d6e7de_param_6,
|
33 |
+
.param .u64 triton__0d1d2d3d4d5d6e7de_param_7
|
34 |
+
)
|
35 |
+
.maxntid 256, 1, 1
|
36 |
+
{
|
37 |
+
.reg .pred %p<154>;
|
38 |
+
.reg .b16 %rs<83>;
|
39 |
+
.reg .b32 %r<247>;
|
40 |
+
.reg .f32 %f<401>;
|
41 |
+
.reg .b64 %rd<217>;
|
42 |
+
.loc 1 18 0
|
43 |
+
$L__func_begin0:
|
44 |
+
.loc 1 18 0
|
45 |
+
|
46 |
+
ld.param.u64 %rd48, [triton__0d1d2d3d4d5d6e7de_param_5];
|
47 |
+
ld.param.u64 %rd47, [triton__0d1d2d3d4d5d6e7de_param_4];
|
48 |
+
ld.param.u64 %rd57, [triton__0d1d2d3d4d5d6e7de_param_0];
|
49 |
+
ld.param.u64 %rd58, [triton__0d1d2d3d4d5d6e7de_param_1];
|
50 |
+
$L__tmp0:
|
51 |
+
.loc 1 24 33
|
52 |
+
mov.u32 %r1, %tid.x;
|
53 |
+
ld.param.u64 %rd59, [triton__0d1d2d3d4d5d6e7de_param_2];
|
54 |
+
and.b32 %r2, %r1, 255;
|
55 |
+
ld.param.u64 %rd60, [triton__0d1d2d3d4d5d6e7de_param_3];
|
56 |
+
shl.b32 %r3, %r2, 2;
|
57 |
+
or.b32 %r36, %r3, 1;
|
58 |
+
or.b32 %r37, %r3, 2;
|
59 |
+
or.b32 %r38, %r3, 3;
|
60 |
+
or.b32 %r39, %r2, 256;
|
61 |
+
or.b32 %r40, %r2, 512;
|
62 |
+
or.b32 %r41, %r2, 768;
|
63 |
+
.loc 1 21 28
|
64 |
+
mov.u32 %r34, %ctaid.x;
|
65 |
+
.loc 1 21 34
|
66 |
+
cvt.s64.s32 %rd1, %r34;
|
67 |
+
.loc 1 23 21
|
68 |
+
setp.lt.s32 %p1, %r34, 8;
|
69 |
+
shl.b32 %r42, %r2, 3;
|
70 |
+
shl.b32 %r43, %r2, 5;
|
71 |
+
mov.u32 %r44, global_smem;
|
72 |
+
add.s32 %r4, %r44, %r43;
|
73 |
+
shl.b32 %r45, %r36, 1;
|
74 |
+
shl.b32 %r46, %r36, 3;
|
75 |
+
add.s32 %r5, %r44, %r46;
|
76 |
+
shl.b32 %r47, %r37, 1;
|
77 |
+
shl.b32 %r48, %r37, 3;
|
78 |
+
add.s32 %r6, %r44, %r48;
|
79 |
+
shl.b32 %r49, %r38, 1;
|
80 |
+
shl.b32 %r50, %r38, 3;
|
81 |
+
add.s32 %r7, %r44, %r50;
|
82 |
+
shl.b32 %r51, %r2, 1;
|
83 |
+
add.s32 %r12, %r44, %r42;
|
84 |
+
shl.b32 %r52, %r39, 1;
|
85 |
+
shl.b32 %r53, %r39, 3;
|
86 |
+
add.s32 %r9, %r44, %r53;
|
87 |
+
shl.b32 %r54, %r40, 1;
|
88 |
+
shl.b32 %r55, %r40, 3;
|
89 |
+
add.s32 %r10, %r44, %r55;
|
90 |
+
shl.b32 %r56, %r41, 1;
|
91 |
+
shl.b32 %r57, %r41, 3;
|
92 |
+
add.s32 %r11, %r44, %r57;
|
93 |
+
add.s32 %r13, %r44, %r45;
|
94 |
+
add.s32 %r14, %r44, %r47;
|
95 |
+
add.s32 %r15, %r44, %r49;
|
96 |
+
add.s32 %r16, %r44, %r51;
|
97 |
+
add.s32 %r17, %r44, %r52;
|
98 |
+
add.s32 %r18, %r44, %r54;
|
99 |
+
add.s32 %r19, %r44, %r56;
|
100 |
+
add.s32 %r20, %r44, %r3;
|
101 |
+
add.s32 %r21, %r44, %r2;
|
102 |
+
shl.b32 %r58, %r2, 6;
|
103 |
+
add.s32 %r22, %r44, %r58;
|
104 |
+
shl.b32 %r59, %r36, 4;
|
105 |
+
add.s32 %r23, %r44, %r59;
|
106 |
+
shl.b32 %r60, %r37, 4;
|
107 |
+
add.s32 %r24, %r44, %r60;
|
108 |
+
shl.b32 %r61, %r38, 4;
|
109 |
+
add.s32 %r25, %r44, %r61;
|
110 |
+
shl.b32 %r62, %r2, 4;
|
111 |
+
add.s32 %r26, %r44, %r62;
|
112 |
+
shl.b32 %r63, %r39, 4;
|
113 |
+
add.s32 %r27, %r44, %r63;
|
114 |
+
shl.b32 %r64, %r40, 4;
|
115 |
+
add.s32 %r28, %r44, %r64;
|
116 |
+
shl.b32 %r65, %r41, 4;
|
117 |
+
add.s32 %r29, %r44, %r65;
|
118 |
+
.loc 1 28 36
|
119 |
+
mul.wide.s32 %rd61, %r34, 61440;
|
120 |
+
mul.wide.u32 %rd62, %r2, 32;
|
121 |
+
add.s64 %rd63, %rd61, %rd62;
|
122 |
+
add.s64 %rd64, %rd63, %rd57;
|
123 |
+
add.s64 %rd208, %rd64, 8208;
|
124 |
+
mul.wide.s32 %rd65, %r34, 771947520;
|
125 |
+
add.s64 %rd66, %rd58, %rd65;
|
126 |
+
mul.wide.u32 %rd67, %r2, 402056;
|
127 |
+
add.s64 %rd68, %rd66, %rd67;
|
128 |
+
add.s64 %rd207, %rd68, 103227878;
|
129 |
+
mul.wide.u32 %rd4, %r2, 16;
|
130 |
+
mul.wide.s32 %rd69, %r34, 30720;
|
131 |
+
add.s64 %rd206, %rd60, %rd69;
|
132 |
+
add.s64 %rd205, %rd59, %rd69;
|
133 |
+
mov.u64 %rd209, 0;
|
134 |
+
mov.f32 %f385, 0f00000000;
|
135 |
+
mov.b32 %r246, -2048;
|
136 |
+
mov.u16 %rs44, 0;
|
137 |
+
mov.f32 %f386, %f385;
|
138 |
+
mov.f32 %f387, %f385;
|
139 |
+
mov.f32 %f388, %f385;
|
140 |
+
mov.f32 %f389, %f385;
|
141 |
+
mov.f32 %f390, %f385;
|
142 |
+
mov.f32 %f391, %f385;
|
143 |
+
mov.f32 %f392, %f385;
|
144 |
+
mov.u64 %rd210, %rd209;
|
145 |
+
mov.u64 %rd211, %rd209;
|
146 |
+
mov.u64 %rd212, %rd209;
|
147 |
+
mov.u64 %rd213, %rd209;
|
148 |
+
mov.u64 %rd214, %rd209;
|
149 |
+
mov.u64 %rd215, %rd209;
|
150 |
+
mov.u64 %rd216, %rd209;
|
151 |
+
bra.uni $L__BB0_1;
|
152 |
+
$L__BB0_19:
|
153 |
+
.loc 1 36 23
|
154 |
+
bfe.s32 %r172, %r115, 0, 8;
|
155 |
+
cvt.u16.u32 %rs67, %r172;
|
156 |
+
and.b16 %rs68, %rs67, 255;
|
157 |
+
setp.eq.s16 %p117, %rs68, 0;
|
158 |
+
bfe.s32 %r173, %r115, 8, 8;
|
159 |
+
cvt.u16.u32 %rs69, %r173;
|
160 |
+
and.b16 %rs70, %rs69, 255;
|
161 |
+
setp.eq.s16 %p118, %rs70, 0;
|
162 |
+
bfe.s32 %r174, %r115, 16, 8;
|
163 |
+
cvt.u16.u32 %rs71, %r174;
|
164 |
+
and.b16 %rs72, %rs71, 255;
|
165 |
+
setp.eq.s16 %p119, %rs72, 0;
|
166 |
+
bfe.s32 %r175, %r115, 24, 8;
|
167 |
+
cvt.u16.u32 %rs73, %r175;
|
168 |
+
and.b16 %rs74, %rs73, 255;
|
169 |
+
setp.eq.s16 %p120, %rs74, 0;
|
170 |
+
bfe.s32 %r176, %r108, 0, 8;
|
171 |
+
cvt.u16.u32 %rs75, %r176;
|
172 |
+
and.b16 %rs76, %rs75, 255;
|
173 |
+
setp.eq.s16 %p121, %rs76, 0;
|
174 |
+
bfe.s32 %r177, %r108, 8, 8;
|
175 |
+
cvt.u16.u32 %rs77, %r177;
|
176 |
+
and.b16 %rs78, %rs77, 255;
|
177 |
+
setp.eq.s16 %p122, %rs78, 0;
|
178 |
+
bfe.s32 %r178, %r108, 16, 8;
|
179 |
+
cvt.u16.u32 %rs79, %r178;
|
180 |
+
and.b16 %rs80, %rs79, 255;
|
181 |
+
setp.eq.s16 %p123, %rs80, 0;
|
182 |
+
bfe.s32 %r179, %r108, 24, 8;
|
183 |
+
cvt.u16.u32 %rs81, %r179;
|
184 |
+
and.b16 %rs82, %rs81, 255;
|
185 |
+
setp.eq.s16 %p124, %rs82, 0;
|
186 |
+
.loc 1 46 23
|
187 |
+
setp.eq.f32 %p133, %f68, 0f00000000;
|
188 |
+
selp.f32 %f320, 0fFF800000, %f400, %p133;
|
189 |
+
bar.sync 0;
|
190 |
+
st.shared.f32 [%r4], %f37;
|
191 |
+
st.shared.f32 [%r5], %f42;
|
192 |
+
st.shared.f32 [%r6], %f47;
|
193 |
+
st.shared.f32 [%r7], %f52;
|
194 |
+
bar.sync 0;
|
195 |
+
ld.shared.f32 %f321, [%r12];
|
196 |
+
ld.shared.f32 %f322, [%r9];
|
197 |
+
ld.shared.f32 %f323, [%r10];
|
198 |
+
ld.shared.f32 %f324, [%r11];
|
199 |
+
bar.sync 0;
|
200 |
+
st.shared.f32 [%r4], %f57;
|
201 |
+
st.shared.f32 [%r5], %f62;
|
202 |
+
st.shared.f32 [%r6], %f67;
|
203 |
+
st.shared.f32 [%r7], %f320;
|
204 |
+
bar.sync 0;
|
205 |
+
ld.shared.f32 %f325, [%r12];
|
206 |
+
ld.shared.f32 %f326, [%r9];
|
207 |
+
ld.shared.f32 %f327, [%r10];
|
208 |
+
ld.shared.f32 %f328, [%r11];
|
209 |
+
.loc 1 48 17
|
210 |
+
sub.f32 %f329, %f324, %f28;
|
211 |
+
sub.f32 %f330, %f323, %f27;
|
212 |
+
sub.f32 %f331, %f322, %f26;
|
213 |
+
sub.f32 %f332, %f321, %f25;
|
214 |
+
sub.f32 %f333, %f328, %f32;
|
215 |
+
sub.f32 %f334, %f327, %f31;
|
216 |
+
sub.f32 %f335, %f326, %f30;
|
217 |
+
sub.f32 %f336, %f325, %f29;
|
218 |
+
add.f32 %f337, %f336, 0f00000000;
|
219 |
+
add.f32 %f338, %f335, 0f00000000;
|
220 |
+
add.f32 %f339, %f334, 0f00000000;
|
221 |
+
add.f32 %f340, %f333, 0f00000000;
|
222 |
+
add.f32 %f341, %f332, 0f00000000;
|
223 |
+
add.f32 %f342, %f331, 0f00000000;
|
224 |
+
add.f32 %f343, %f330, 0f00000000;
|
225 |
+
add.f32 %f344, %f329, 0f00000000;
|
226 |
+
.loc 1 50 38
|
227 |
+
selp.f32 %f345, 0f00000000, %f344, %p124;
|
228 |
+
selp.f32 %f346, 0f00000000, %f343, %p123;
|
229 |
+
selp.f32 %f347, 0f00000000, %f342, %p122;
|
230 |
+
selp.f32 %f348, 0f00000000, %f341, %p121;
|
231 |
+
selp.f32 %f349, 0f00000000, %f340, %p120;
|
232 |
+
selp.f32 %f350, 0f00000000, %f339, %p119;
|
233 |
+
selp.f32 %f351, 0f00000000, %f338, %p118;
|
234 |
+
selp.f32 %f352, 0f00000000, %f337, %p117;
|
235 |
+
.loc 1 53 48
|
236 |
+
selp.f32 %f353, %f352, 0f80000000, %p1;
|
237 |
+
selp.f32 %f354, %f351, 0f80000000, %p1;
|
238 |
+
selp.f32 %f355, %f350, 0f80000000, %p90;
|
239 |
+
selp.f32 %f356, %f349, 0f80000000, %p90;
|
240 |
+
selp.f32 %f357, %f348, 0f80000000, %p1;
|
241 |
+
selp.f32 %f358, %f347, 0f80000000, %p1;
|
242 |
+
selp.f32 %f359, %f346, 0f80000000, %p1;
|
243 |
+
selp.f32 %f360, %f345, 0f80000000, %p1;
|
244 |
+
add.f32 %f388, %f388, %f360;
|
245 |
+
add.f32 %f387, %f387, %f359;
|
246 |
+
add.f32 %f386, %f386, %f358;
|
247 |
+
add.f32 %f385, %f385, %f357;
|
248 |
+
add.f32 %f392, %f392, %f356;
|
249 |
+
add.f32 %f391, %f391, %f355;
|
250 |
+
add.f32 %f390, %f390, %f354;
|
251 |
+
add.f32 %f389, %f389, %f353;
|
252 |
+
.loc 1 57 48
|
253 |
+
and.pred %p134, %p1, %p52;
|
254 |
+
and.pred %p135, %p1, %p51;
|
255 |
+
and.pred %p136, %p1, %p50;
|
256 |
+
and.pred %p137, %p1, %p49;
|
257 |
+
and.pred %p138, %p17, %p48;
|
258 |
+
and.pred %p139, %p17, %p47;
|
259 |
+
and.pred %p140, %p17, %p46;
|
260 |
+
and.pred %p141, %p17, %p45;
|
261 |
+
selp.u64 %rd140, 1, 0, %p141;
|
262 |
+
selp.u64 %rd141, 1, 0, %p140;
|
263 |
+
selp.u64 %rd142, 1, 0, %p139;
|
264 |
+
selp.u64 %rd143, 1, 0, %p138;
|
265 |
+
selp.u64 %rd144, 1, 0, %p137;
|
266 |
+
selp.u64 %rd145, 1, 0, %p136;
|
267 |
+
selp.u64 %rd146, 1, 0, %p135;
|
268 |
+
selp.u64 %rd147, 1, 0, %p134;
|
269 |
+
add.s64 %rd209, %rd209, %rd147;
|
270 |
+
add.s64 %rd210, %rd210, %rd146;
|
271 |
+
add.s64 %rd211, %rd211, %rd145;
|
272 |
+
add.s64 %rd212, %rd212, %rd144;
|
273 |
+
add.s64 %rd213, %rd213, %rd143;
|
274 |
+
add.s64 %rd214, %rd214, %rd142;
|
275 |
+
add.s64 %rd215, %rd215, %rd141;
|
276 |
+
add.s64 %rd216, %rd216, %rd140;
|
277 |
+
.loc 1 28 36
|
278 |
+
add.s64 %rd208, %rd208, 16384;
|
279 |
+
add.s32 %r246, %r246, 2048;
|
280 |
+
add.s64 %rd207, %rd207, 205852672;
|
281 |
+
add.s64 %rd206, %rd206, 8192;
|
282 |
+
add.s64 %rd205, %rd205, 8192;
|
283 |
+
setp.lt.u32 %p142, %r246, 5632;
|
284 |
+
@%p142 bra $L__BB0_1;
|
285 |
+
bra.uni $L__BB0_20;
|
286 |
+
$L__BB0_1:
|
287 |
+
.loc 1 0 36
|
288 |
+
cvt.u32.u64 %r98, %rd1;
|
289 |
+
.loc 1 23 21
|
290 |
+
setp.lt.s32 %p78, %r98, 8;
|
291 |
+
.loc 1 29 27
|
292 |
+
add.s32 %r99, %r3, %r246;
|
293 |
+
add.s32 %r100, %r99, 3072;
|
294 |
+
.loc 1 30 25
|
295 |
+
add.s32 %r101, %r246, 3584;
|
296 |
+
setp.lt.u32 %p43, %r100, 7680;
|
297 |
+
setp.lt.u32 %p44, %r101, 7680;
|
298 |
+
.loc 1 29 27
|
299 |
+
add.s64 %rd72, %rd208, -8208;
|
300 |
+
.loc 1 32 34
|
301 |
+
add.s64 %rd75, %rd208, -8192;
|
302 |
+
add.s64 %rd78, %rd208, -16;
|
303 |
+
.loc 1 32 59
|
304 |
+
and.pred %p17, %p78, %p43;
|
305 |
+
and.pred %p90, %p78, %p44;
|
306 |
+
.loc 1 32 51
|
307 |
+
mov.u64 %rd70, 0x0;
|
308 |
+
mov.u64 %rd71, 0x0;
|
309 |
+
@%p78 ld.global.L1::evict_first.v2.b64 { %rd70, %rd71 }, [ %rd72 + 0 ];
|
310 |
+
@!%p78 mov.u64 %rd70, 0x0;
|
311 |
+
@!%p78 mov.u64 %rd71, 0x0;
|
312 |
+
mov.u64 %rd73, 0x0;
|
313 |
+
mov.u64 %rd74, 0x0;
|
314 |
+
@%p78 ld.global.L1::evict_first.v2.b64 { %rd73, %rd74 }, [ %rd75 + 0 ];
|
315 |
+
@!%p78 mov.u64 %rd73, 0x0;
|
316 |
+
@!%p78 mov.u64 %rd74, 0x0;
|
317 |
+
mov.u64 %rd76, 0x0;
|
318 |
+
mov.u64 %rd77, 0x0;
|
319 |
+
@%p17 ld.global.L1::evict_first.v2.b64 { %rd76, %rd77 }, [ %rd78 + 0 ];
|
320 |
+
@!%p17 mov.u64 %rd76, 0x0;
|
321 |
+
@!%p17 mov.u64 %rd77, 0x0;
|
322 |
+
mov.u64 %rd79, 0x0;
|
323 |
+
mov.u64 %rd80, 0x0;
|
324 |
+
@%p17 ld.global.L1::evict_first.v2.b64 { %rd79, %rd80 }, [ %rd208 + 0 ];
|
325 |
+
@!%p17 mov.u64 %rd79, 0x0;
|
326 |
+
@!%p17 mov.u64 %rd80, 0x0;
|
327 |
+
.loc 1 33 35
|
328 |
+
add.s64 %rd82, %rd205, %rd4;
|
329 |
+
.loc 1 33 52
|
330 |
+
add.s64 %rd83, %rd82, 4096;
|
331 |
+
mov.b32 %r70, 0;
|
332 |
+
mov.u32 %r66, 0x0;
|
333 |
+
mov.u32 %r67, 0x0;
|
334 |
+
mov.u32 %r68, 0x0;
|
335 |
+
mov.u32 %r69, 0x0;
|
336 |
+
@%p78 ld.global.L1::evict_first.v4.b32 { %r66, %r67, %r68, %r69 }, [ %rd82 + 0 ];
|
337 |
+
@!%p78 mov.u32 %r66, %r70;
|
338 |
+
@!%p78 mov.u32 %r67, %r70;
|
339 |
+
@!%p78 mov.u32 %r68, %r70;
|
340 |
+
@!%p78 mov.u32 %r69, %r70;
|
341 |
+
mov.u32 %r74, 0x0;
|
342 |
+
mov.u32 %r75, 0x0;
|
343 |
+
mov.u32 %r76, 0x0;
|
344 |
+
mov.u32 %r77, 0x0;
|
345 |
+
@%p17 ld.global.L1::evict_first.v4.b32 { %r74, %r75, %r76, %r77 }, [ %rd83 + 0 ];
|
346 |
+
@!%p17 mov.u32 %r74, %r70;
|
347 |
+
@!%p17 mov.u32 %r75, %r70;
|
348 |
+
@!%p17 mov.u32 %r76, %r70;
|
349 |
+
@!%p17 mov.u32 %r77, %r70;
|
350 |
+
bar.sync 0;
|
351 |
+
st.shared.u32 [%r4], %r66;
|
352 |
+
st.shared.u32 [%r5], %r67;
|
353 |
+
st.shared.u32 [%r6], %r68;
|
354 |
+
st.shared.u32 [%r7], %r69;
|
355 |
+
bar.sync 0;
|
356 |
+
ld.shared.f32 %f9, [%r12];
|
357 |
+
ld.shared.f32 %f10, [%r9];
|
358 |
+
ld.shared.f32 %f11, [%r10];
|
359 |
+
ld.shared.f32 %f12, [%r11];
|
360 |
+
bar.sync 0;
|
361 |
+
st.shared.u32 [%r4], %r74;
|
362 |
+
st.shared.u32 [%r5], %r75;
|
363 |
+
st.shared.u32 [%r6], %r76;
|
364 |
+
st.shared.u32 [%r7], %r77;
|
365 |
+
bar.sync 0;
|
366 |
+
ld.shared.f32 %f13, [%r12];
|
367 |
+
ld.shared.f32 %f14, [%r9];
|
368 |
+
ld.shared.f32 %f15, [%r10];
|
369 |
+
ld.shared.f32 %f16, [%r11];
|
370 |
+
.loc 1 34 35
|
371 |
+
add.s64 %rd84, %rd206, %rd4;
|
372 |
+
.loc 1 34 52
|
373 |
+
add.s64 %rd85, %rd84, 4096;
|
374 |
+
mov.u32 %r82, 0x0;
|
375 |
+
mov.u32 %r83, 0x0;
|
376 |
+
mov.u32 %r84, 0x0;
|
377 |
+
mov.u32 %r85, 0x0;
|
378 |
+
@%p78 ld.global.L1::evict_first.v4.b32 { %r82, %r83, %r84, %r85 }, [ %rd84 + 0 ];
|
379 |
+
@!%p78 mov.u32 %r82, %r70;
|
380 |
+
@!%p78 mov.u32 %r83, %r70;
|
381 |
+
@!%p78 mov.u32 %r84, %r70;
|
382 |
+
@!%p78 mov.u32 %r85, %r70;
|
383 |
+
mov.b32 %f17, %r82;
|
384 |
+
mov.u32 %r90, 0x0;
|
385 |
+
mov.u32 %r91, 0x0;
|
386 |
+
mov.u32 %r92, 0x0;
|
387 |
+
mov.u32 %r93, 0x0;
|
388 |
+
@%p17 ld.global.L1::evict_first.v4.b32 { %r90, %r91, %r92, %r93 }, [ %rd85 + 0 ];
|
389 |
+
@!%p17 mov.u32 %r90, %r70;
|
390 |
+
@!%p17 mov.u32 %r91, %r70;
|
391 |
+
@!%p17 mov.u32 %r92, %r70;
|
392 |
+
@!%p17 mov.u32 %r93, %r70;
|
393 |
+
.loc 1 36 23
|
394 |
+
setp.ne.s64 %p45, %rd80, -1;
|
395 |
+
setp.ne.s64 %p46, %rd79, -1;
|
396 |
+
setp.ne.s64 %p47, %rd77, -1;
|
397 |
+
setp.ne.s64 %p48, %rd76, -1;
|
398 |
+
setp.ne.s64 %p49, %rd74, -1;
|
399 |
+
setp.ne.s64 %p50, %rd73, -1;
|
400 |
+
setp.ne.s64 %p51, %rd71, -1;
|
401 |
+
setp.ne.s64 %p52, %rd70, -1;
|
402 |
+
bar.sync 0;
|
403 |
+
selp.u16 %rs1, 1, 0, %p52;
|
404 |
+
st.shared.u8 [%r12], %rs1;
|
405 |
+
selp.u16 %rs2, 1, 0, %p51;
|
406 |
+
st.shared.u8 [%r13], %rs2;
|
407 |
+
selp.u16 %rs3, 1, 0, %p50;
|
408 |
+
st.shared.u8 [%r14], %rs3;
|
409 |
+
selp.u16 %rs4, 1, 0, %p49;
|
410 |
+
st.shared.u8 [%r15], %rs4;
|
411 |
+
bar.sync 0;
|
412 |
+
ld.shared.u8 %r102, [%r19];
|
413 |
+
ld.shared.u8 %r103, [%r18];
|
414 |
+
ld.shared.u8 %r104, [%r17];
|
415 |
+
ld.shared.u8 %r105, [%r16];
|
416 |
+
bar.sync 0;
|
417 |
+
selp.u16 %rs5, 1, 0, %p48;
|
418 |
+
st.shared.u8 [%r12], %rs5;
|
419 |
+
selp.u16 %rs6, 1, 0, %p47;
|
420 |
+
st.shared.u8 [%r13], %rs6;
|
421 |
+
selp.u16 %rs7, 1, 0, %p46;
|
422 |
+
st.shared.u8 [%r14], %rs7;
|
423 |
+
selp.u16 %rs8, 1, 0, %p45;
|
424 |
+
st.shared.u8 [%r15], %rs8;
|
425 |
+
bar.sync 0;
|
426 |
+
bfi.b32 %r106, %r104, %r105, 8, 8;
|
427 |
+
bfi.b32 %r107, %r103, %r106, 16, 8;
|
428 |
+
bfi.b32 %r108, %r102, %r107, 24, 8;
|
429 |
+
ld.shared.u8 %r109, [%r16];
|
430 |
+
ld.shared.u8 %r110, [%r17];
|
431 |
+
bfi.b32 %r111, %r110, %r109, 8, 8;
|
432 |
+
ld.shared.u8 %r112, [%r18];
|
433 |
+
bfi.b32 %r113, %r112, %r111, 16, 8;
|
434 |
+
ld.shared.u8 %r114, [%r19];
|
435 |
+
bfi.b32 %r115, %r114, %r113, 24, 8;
|
436 |
+
.loc 1 42 40
|
437 |
+
bar.sync 0;
|
438 |
+
.loc 1 38 36
|
439 |
+
selp.b64 %rd86, %rd70, 0, %p52;
|
440 |
+
selp.b64 %rd87, %rd71, 0, %p51;
|
441 |
+
selp.b64 %rd88, %rd73, 0, %p50;
|
442 |
+
selp.b64 %rd89, %rd74, 0, %p49;
|
443 |
+
.loc 1 39 22
|
444 |
+
add.s64 %rd90, %rd89, 50257;
|
445 |
+
add.s64 %rd91, %rd88, 50257;
|
446 |
+
add.s64 %rd92, %rd87, 50257;
|
447 |
+
add.s64 %rd93, %rd86, 50257;
|
448 |
+
.loc 1 40 22
|
449 |
+
setp.lt.s64 %p53, %rd89, 0;
|
450 |
+
setp.lt.s64 %p54, %rd88, 0;
|
451 |
+
setp.lt.s64 %p55, %rd87, 0;
|
452 |
+
setp.lt.s64 %p56, %rd86, 0;
|
453 |
+
.loc 1 41 36
|
454 |
+
selp.b64 %rd27, %rd93, %rd86, %p56;
|
455 |
+
selp.b64 %rd28, %rd92, %rd87, %p55;
|
456 |
+
selp.b64 %rd29, %rd91, %rd88, %p54;
|
457 |
+
selp.b64 %rd30, %rd90, %rd89, %p53;
|
458 |
+
.loc 1 42 40
|
459 |
+
setp.lt.u64 %p57, %rd30, 50257;
|
460 |
+
setp.lt.u64 %p58, %rd29, 50257;
|
461 |
+
setp.lt.u64 %p59, %rd28, 50257;
|
462 |
+
setp.lt.u64 %p60, %rd27, 50257;
|
463 |
+
selp.u32 %r116, 1, 0, %p60;
|
464 |
+
selp.u32 %r117, 1, 0, %p59;
|
465 |
+
bfi.b32 %r118, %r117, %r116, 8, 8;
|
466 |
+
selp.u32 %r119, 1, 0, %p58;
|
467 |
+
bfi.b32 %r120, %r119, %r118, 16, 8;
|
468 |
+
selp.u32 %r121, 1, 0, %p57;
|
469 |
+
bfi.b32 %r122, %r121, %r120, 24, 8;
|
470 |
+
st.shared.u32 [%r20], %r122;
|
471 |
+
bar.sync 0;
|
472 |
+
ld.shared.u8 %rs9, [%r21];
|
473 |
+
ld.shared.u8 %rs10, [%r21+256];
|
474 |
+
ld.shared.u8 %rs11, [%r21+512];
|
475 |
+
ld.shared.u8 %rs12, [%r21+768];
|
476 |
+
bar.sync 0;
|
477 |
+
.loc 1 38 36
|
478 |
+
selp.b64 %rd94, %rd76, 0, %p48;
|
479 |
+
selp.b64 %rd95, %rd77, 0, %p47;
|
480 |
+
selp.b64 %rd96, %rd79, 0, %p46;
|
481 |
+
selp.b64 %rd97, %rd80, 0, %p45;
|
482 |
+
.loc 1 39 22
|
483 |
+
add.s64 %rd98, %rd97, 50257;
|
484 |
+
add.s64 %rd99, %rd96, 50257;
|
485 |
+
add.s64 %rd100, %rd95, 50257;
|
486 |
+
add.s64 %rd101, %rd94, 50257;
|
487 |
+
.loc 1 40 22
|
488 |
+
setp.lt.s64 %p61, %rd97, 0;
|
489 |
+
setp.lt.s64 %p62, %rd96, 0;
|
490 |
+
setp.lt.s64 %p63, %rd95, 0;
|
491 |
+
setp.lt.s64 %p64, %rd94, 0;
|
492 |
+
.loc 1 41 36
|
493 |
+
selp.b64 %rd31, %rd101, %rd94, %p64;
|
494 |
+
selp.b64 %rd32, %rd100, %rd95, %p63;
|
495 |
+
selp.b64 %rd33, %rd99, %rd96, %p62;
|
496 |
+
selp.b64 %rd34, %rd98, %rd97, %p61;
|
497 |
+
.loc 1 42 40
|
498 |
+
setp.lt.u64 %p65, %rd34, 50257;
|
499 |
+
setp.lt.u64 %p66, %rd33, 50257;
|
500 |
+
setp.lt.u64 %p67, %rd32, 50257;
|
501 |
+
setp.lt.u64 %p68, %rd31, 50257;
|
502 |
+
selp.u32 %r123, 1, 0, %p68;
|
503 |
+
selp.u32 %r124, 1, 0, %p67;
|
504 |
+
bfi.b32 %r125, %r124, %r123, 8, 8;
|
505 |
+
selp.u32 %r126, 1, 0, %p66;
|
506 |
+
bfi.b32 %r127, %r126, %r125, 16, 8;
|
507 |
+
selp.u32 %r128, 1, 0, %p65;
|
508 |
+
bfi.b32 %r129, %r128, %r127, 24, 8;
|
509 |
+
st.shared.u32 [%r20], %r129;
|
510 |
+
bar.sync 0;
|
511 |
+
ld.shared.u8 %rs13, [%r21];
|
512 |
+
ld.shared.u8 %rs14, [%r21+256];
|
513 |
+
ld.shared.u8 %rs15, [%r21+512];
|
514 |
+
ld.shared.u8 %rs16, [%r21+768];
|
515 |
+
setp.eq.s16 %p69, %rs11, 0;
|
516 |
+
selp.u16 %rs17, 1, 0, %p69;
|
517 |
+
shl.b16 %rs18, %rs17, 2;
|
518 |
+
setp.eq.s16 %p70, %rs12, 0;
|
519 |
+
selp.u16 %rs19, -1, 0, %p70;
|
520 |
+
shl.b16 %rs20, %rs19, 3;
|
521 |
+
or.b16 %rs21, %rs20, %rs18;
|
522 |
+
setp.eq.s16 %p71, %rs10, 0;
|
523 |
+
selp.u16 %rs22, 1, 0, %p71;
|
524 |
+
setp.eq.s16 %p72, %rs9, 0;
|
525 |
+
selp.u16 %rs23, -1, 0, %p72;
|
526 |
+
shl.b16 %rs24, %rs23, 1;
|
527 |
+
or.b16 %rs25, %rs22, %rs24;
|
528 |
+
and.b16 %rs26, %rs25, 3;
|
529 |
+
or.b16 %rs27, %rs26, %rs21;
|
530 |
+
and.b16 %rs28, %rs27, 15;
|
531 |
+
setp.eq.s16 %p73, %rs15, 0;
|
532 |
+
selp.u16 %rs29, 1, 0, %p73;
|
533 |
+
shl.b16 %rs30, %rs29, 2;
|
534 |
+
setp.eq.s16 %p74, %rs16, 0;
|
535 |
+
selp.u16 %rs31, -1, 0, %p74;
|
536 |
+
shl.b16 %rs32, %rs31, 3;
|
537 |
+
or.b16 %rs33, %rs32, %rs30;
|
538 |
+
setp.eq.s16 %p75, %rs13, 0;
|
539 |
+
selp.u16 %rs34, 1, 0, %p75;
|
540 |
+
setp.eq.s16 %p76, %rs14, 0;
|
541 |
+
selp.u16 %rs35, -1, 0, %p76;
|
542 |
+
shl.b16 %rs36, %rs35, 1;
|
543 |
+
or.b16 %rs37, %rs34, %rs36;
|
544 |
+
and.b16 %rs38, %rs37, 3;
|
545 |
+
or.b16 %rs39, %rs38, %rs33;
|
546 |
+
shl.b16 %rs40, %rs39, 4;
|
547 |
+
or.b16 %rs41, %rs28, %rs40;
|
548 |
+
.loc 1 42 55
|
549 |
+
and.b16 %rs42, %rs41, 255;
|
550 |
+
setp.eq.s16 %p77, %rs42, 0;
|
551 |
+
@%p77 bra $L__BB0_3;
|
552 |
+
mov.u64 %rd102, assertMessage_0;
|
553 |
+
cvta.global.u64 %rd103, %rd102;
|
554 |
+
mov.u64 %rd104, assertFile_0;
|
555 |
+
cvta.global.u64 %rd105, %rd104;
|
556 |
+
mov.u64 %rd106, assertFunc_0;
|
557 |
+
cvta.global.u64 %rd107, %rd106;
|
558 |
+
mov.b32 %r130, 883;
|
559 |
+
mov.u64 %rd108, 1;
|
560 |
+
{ // callseq 0, 0
|
561 |
+
.reg .b32 temp_param_reg;
|
562 |
+
.param .b64 param0;
|
563 |
+
st.param.b64 [param0+0], %rd103;
|
564 |
+
.param .b64 param1;
|
565 |
+
st.param.b64 [param1+0], %rd105;
|
566 |
+
.param .b32 param2;
|
567 |
+
st.param.b32 [param2+0], %r130;
|
568 |
+
.param .b64 param3;
|
569 |
+
st.param.b64 [param3+0], %rd107;
|
570 |
+
.param .b64 param4;
|
571 |
+
st.param.b64 [param4+0], %rd108;
|
572 |
+
call.uni
|
573 |
+
__assertfail,
|
574 |
+
(
|
575 |
+
param0,
|
576 |
+
param1,
|
577 |
+
param2,
|
578 |
+
param3,
|
579 |
+
param4
|
580 |
+
);
|
581 |
+
} // callseq 0
|
582 |
+
$L__BB0_3:
|
583 |
+
.loc 1 43 71
|
584 |
+
bar.sync 0;
|
585 |
+
shl.b64 %rd117, %rd27, 1;
|
586 |
+
add.s64 %rd118, %rd207, %rd117;
|
587 |
+
add.s64 %rd119, %rd118, -103227878;
|
588 |
+
st.shared.u64 [%r22], %rd119;
|
589 |
+
shl.b64 %rd120, %rd28, 1;
|
590 |
+
add.s64 %rd121, %rd207, %rd120;
|
591 |
+
add.s64 %rd122, %rd121, -103127364;
|
592 |
+
st.shared.u64 [%r23], %rd122;
|
593 |
+
shl.b64 %rd123, %rd29, 1;
|
594 |
+
add.s64 %rd124, %rd207, %rd123;
|
595 |
+
add.s64 %rd125, %rd124, -103026850;
|
596 |
+
st.shared.u64 [%r24], %rd125;
|
597 |
+
shl.b64 %rd126, %rd30, 1;
|
598 |
+
add.s64 %rd127, %rd207, %rd126;
|
599 |
+
add.s64 %rd128, %rd127, -102926336;
|
600 |
+
st.shared.u64 [%r25], %rd128;
|
601 |
+
bar.sync 0;
|
602 |
+
ld.shared.u64 %rd109, [%r26];
|
603 |
+
ld.shared.u64 %rd110, [%r27];
|
604 |
+
ld.shared.u64 %rd111, [%r28];
|
605 |
+
ld.shared.u64 %rd112, [%r29];
|
606 |
+
bar.sync 0;
|
607 |
+
shl.b64 %rd129, %rd31, 1;
|
608 |
+
add.s64 %rd130, %rd207, %rd129;
|
609 |
+
add.s64 %rd131, %rd130, -301542;
|
610 |
+
st.shared.u64 [%r22], %rd131;
|
611 |
+
shl.b64 %rd132, %rd32, 1;
|
612 |
+
add.s64 %rd133, %rd207, %rd132;
|
613 |
+
add.s64 %rd134, %rd133, -201028;
|
614 |
+
st.shared.u64 [%r23], %rd134;
|
615 |
+
shl.b64 %rd135, %rd33, 1;
|
616 |
+
add.s64 %rd136, %rd207, %rd135;
|
617 |
+
add.s64 %rd137, %rd136, -100514;
|
618 |
+
st.shared.u64 [%r24], %rd137;
|
619 |
+
shl.b64 %rd138, %rd34, 1;
|
620 |
+
add.s64 %rd139, %rd207, %rd138;
|
621 |
+
st.shared.u64 [%r25], %rd139;
|
622 |
+
bar.sync 0;
|
623 |
+
ld.shared.u64 %rd113, [%r26];
|
624 |
+
ld.shared.u64 %rd114, [%r27];
|
625 |
+
ld.shared.u64 %rd115, [%r28];
|
626 |
+
ld.shared.u64 %rd116, [%r29];
|
627 |
+
mov.u16 %rs43, 0x0;
|
628 |
+
@%p78 ld.global.L1::evict_last.b16 { %rs43 }, [ %rd109 + 0 ];
|
629 |
+
@!%p78 mov.u16 %rs43, %rs44;
|
630 |
+
mov.u16 %rs45, 0x0;
|
631 |
+
@%p78 ld.global.L1::evict_last.b16 { %rs45 }, [ %rd110 + 0 ];
|
632 |
+
@!%p78 mov.u16 %rs45, %rs44;
|
633 |
+
mov.u16 %rs47, 0x0;
|
634 |
+
@%p78 ld.global.L1::evict_last.b16 { %rs47 }, [ %rd111 + 0 ];
|
635 |
+
@!%p78 mov.u16 %rs47, %rs44;
|
636 |
+
mov.u16 %rs49, 0x0;
|
637 |
+
@%p78 ld.global.L1::evict_last.b16 { %rs49 }, [ %rd112 + 0 ];
|
638 |
+
@!%p78 mov.u16 %rs49, %rs44;
|
639 |
+
mov.u16 %rs51, 0x0;
|
640 |
+
@%p78 ld.global.L1::evict_last.b16 { %rs51 }, [ %rd113 + 0 ];
|
641 |
+
@!%p78 mov.u16 %rs51, %rs44;
|
642 |
+
mov.u16 %rs53, 0x0;
|
643 |
+
@%p78 ld.global.L1::evict_last.b16 { %rs53 }, [ %rd114 + 0 ];
|
644 |
+
@!%p78 mov.u16 %rs53, %rs44;
|
645 |
+
mov.u16 %rs55, 0x0;
|
646 |
+
@%p90 ld.global.L1::evict_last.b16 { %rs55 }, [ %rd115 + 0 ];
|
647 |
+
@!%p90 mov.u16 %rs55, %rs44;
|
648 |
+
mov.u16 %rs57, 0x0;
|
649 |
+
@%p90 ld.global.L1::evict_last.b16 { %rs57 }, [ %rd116 + 0 ];
|
650 |
+
@!%p90 mov.u16 %rs57, %rs44;
|
651 |
+
.loc 1 46 23
|
652 |
+
setp.lt.f32 %p94, %f17, 0f00800000;
|
653 |
+
mul.f32 %f96, %f17, 0f4B000000;
|
654 |
+
selp.f32 %f33, %f96, %f17, %p94;
|
655 |
+
selp.f32 %f97, 0fC1B80000, 0f00000000, %p94;
|
656 |
+
mov.b32 %r140, %f33;
|
657 |
+
add.s32 %r141, %r140, -1059760811;
|
658 |
+
and.b32 %r142, %r141, -8388608;
|
659 |
+
sub.s32 %r143, %r140, %r142;
|
660 |
+
mov.b32 %f98, %r143;
|
661 |
+
cvt.rn.f32.s32 %f99, %r142;
|
662 |
+
mov.f32 %f100, 0f34000000;
|
663 |
+
fma.rn.ftz.f32 %f101, %f99, %f100, %f97;
|
664 |
+
add.f32 %f102, %f98, 0fBF800000;
|
665 |
+
mov.f32 %f103, 0f3E1039F6;
|
666 |
+
mov.f32 %f104, 0fBE055027;
|
667 |
+
fma.rn.ftz.f32 %f105, %f104, %f102, %f103;
|
668 |
+
mov.f32 %f106, 0fBDF8CDCC;
|
669 |
+
fma.rn.ftz.f32 %f107, %f105, %f102, %f106;
|
670 |
+
mov.f32 %f108, 0f3E0F2955;
|
671 |
+
fma.rn.ftz.f32 %f109, %f107, %f102, %f108;
|
672 |
+
mov.f32 %f110, 0fBE2AD8B9;
|
673 |
+
fma.rn.ftz.f32 %f111, %f109, %f102, %f110;
|
674 |
+
mov.f32 %f112, 0f3E4CED0B;
|
675 |
+
fma.rn.ftz.f32 %f113, %f111, %f102, %f112;
|
676 |
+
mov.f32 %f114, 0fBE7FFF22;
|
677 |
+
fma.rn.ftz.f32 %f115, %f113, %f102, %f114;
|
678 |
+
mov.f32 %f116, 0f3EAAAA78;
|
679 |
+
fma.rn.ftz.f32 %f117, %f115, %f102, %f116;
|
680 |
+
mov.f32 %f118, 0fBF000000;
|
681 |
+
fma.rn.ftz.f32 %f119, %f117, %f102, %f118;
|
682 |
+
mul.f32 %f120, %f102, %f119;
|
683 |
+
fma.rn.ftz.f32 %f121, %f120, %f102, %f102;
|
684 |
+
mov.f32 %f122, 0f3F317218;
|
685 |
+
fma.rn.ftz.f32 %f393, %f101, %f122, %f121;
|
686 |
+
setp.lt.u32 %p95, %r140, 2139095040;
|
687 |
+
mov.f32 %f123, 0f7F800000;
|
688 |
+
@%p95 bra $L__BB0_5;
|
689 |
+
.loc 1 0 23
|
690 |
+
fma.rn.ftz.f32 %f393, %f33, %f123, %f123;
|
691 |
+
$L__BB0_5:
|
692 |
+
mov.b32 %f18, %r83;
|
693 |
+
.loc 1 46 23
|
694 |
+
setp.lt.f32 %p97, %f18, 0f00800000;
|
695 |
+
mul.f32 %f124, %f18, 0f4B000000;
|
696 |
+
selp.f32 %f38, %f124, %f18, %p97;
|
697 |
+
selp.f32 %f125, 0fC1B80000, 0f00000000, %p97;
|
698 |
+
mov.b32 %r144, %f38;
|
699 |
+
add.s32 %r145, %r144, -1059760811;
|
700 |
+
and.b32 %r146, %r145, -8388608;
|
701 |
+
sub.s32 %r147, %r144, %r146;
|
702 |
+
mov.b32 %f126, %r147;
|
703 |
+
cvt.rn.f32.s32 %f127, %r146;
|
704 |
+
fma.rn.ftz.f32 %f129, %f127, %f100, %f125;
|
705 |
+
add.f32 %f130, %f126, 0fBF800000;
|
706 |
+
fma.rn.ftz.f32 %f133, %f104, %f130, %f103;
|
707 |
+
fma.rn.ftz.f32 %f135, %f133, %f130, %f106;
|
708 |
+
fma.rn.ftz.f32 %f137, %f135, %f130, %f108;
|
709 |
+
fma.rn.ftz.f32 %f139, %f137, %f130, %f110;
|
710 |
+
fma.rn.ftz.f32 %f141, %f139, %f130, %f112;
|
711 |
+
fma.rn.ftz.f32 %f143, %f141, %f130, %f114;
|
712 |
+
fma.rn.ftz.f32 %f145, %f143, %f130, %f116;
|
713 |
+
fma.rn.ftz.f32 %f147, %f145, %f130, %f118;
|
714 |
+
mul.f32 %f148, %f130, %f147;
|
715 |
+
fma.rn.ftz.f32 %f149, %f148, %f130, %f130;
|
716 |
+
fma.rn.ftz.f32 %f394, %f129, %f122, %f149;
|
717 |
+
setp.lt.u32 %p98, %r144, 2139095040;
|
718 |
+
@%p98 bra $L__BB0_7;
|
719 |
+
.loc 1 0 23
|
720 |
+
fma.rn.ftz.f32 %f394, %f38, %f123, %f123;
|
721 |
+
$L__BB0_7:
|
722 |
+
mov.b32 %f19, %r84;
|
723 |
+
.loc 1 46 23
|
724 |
+
setp.lt.f32 %p100, %f19, 0f00800000;
|
725 |
+
mul.f32 %f152, %f19, 0f4B000000;
|
726 |
+
selp.f32 %f43, %f152, %f19, %p100;
|
727 |
+
selp.f32 %f153, 0fC1B80000, 0f00000000, %p100;
|
728 |
+
mov.b32 %r148, %f43;
|
729 |
+
add.s32 %r149, %r148, -1059760811;
|
730 |
+
and.b32 %r150, %r149, -8388608;
|
731 |
+
sub.s32 %r151, %r148, %r150;
|
732 |
+
mov.b32 %f154, %r151;
|
733 |
+
cvt.rn.f32.s32 %f155, %r150;
|
734 |
+
fma.rn.ftz.f32 %f157, %f155, %f100, %f153;
|
735 |
+
add.f32 %f158, %f154, 0fBF800000;
|
736 |
+
fma.rn.ftz.f32 %f161, %f104, %f158, %f103;
|
737 |
+
fma.rn.ftz.f32 %f163, %f161, %f158, %f106;
|
738 |
+
fma.rn.ftz.f32 %f165, %f163, %f158, %f108;
|
739 |
+
fma.rn.ftz.f32 %f167, %f165, %f158, %f110;
|
740 |
+
fma.rn.ftz.f32 %f169, %f167, %f158, %f112;
|
741 |
+
fma.rn.ftz.f32 %f171, %f169, %f158, %f114;
|
742 |
+
fma.rn.ftz.f32 %f173, %f171, %f158, %f116;
|
743 |
+
fma.rn.ftz.f32 %f175, %f173, %f158, %f118;
|
744 |
+
mul.f32 %f176, %f158, %f175;
|
745 |
+
fma.rn.ftz.f32 %f177, %f176, %f158, %f158;
|
746 |
+
fma.rn.ftz.f32 %f395, %f157, %f122, %f177;
|
747 |
+
setp.lt.u32 %p101, %r148, 2139095040;
|
748 |
+
@%p101 bra $L__BB0_9;
|
749 |
+
.loc 1 0 23
|
750 |
+
fma.rn.ftz.f32 %f395, %f43, %f123, %f123;
|
751 |
+
$L__BB0_9:
|
752 |
+
mov.b32 %f20, %r85;
|
753 |
+
.loc 1 46 23
|
754 |
+
setp.lt.f32 %p103, %f20, 0f00800000;
|
755 |
+
mul.f32 %f180, %f20, 0f4B000000;
|
756 |
+
selp.f32 %f48, %f180, %f20, %p103;
|
757 |
+
selp.f32 %f181, 0fC1B80000, 0f00000000, %p103;
|
758 |
+
mov.b32 %r152, %f48;
|
759 |
+
add.s32 %r153, %r152, -1059760811;
|
760 |
+
and.b32 %r154, %r153, -8388608;
|
761 |
+
sub.s32 %r155, %r152, %r154;
|
762 |
+
mov.b32 %f182, %r155;
|
763 |
+
cvt.rn.f32.s32 %f183, %r154;
|
764 |
+
fma.rn.ftz.f32 %f185, %f183, %f100, %f181;
|
765 |
+
add.f32 %f186, %f182, 0fBF800000;
|
766 |
+
fma.rn.ftz.f32 %f189, %f104, %f186, %f103;
|
767 |
+
fma.rn.ftz.f32 %f191, %f189, %f186, %f106;
|
768 |
+
fma.rn.ftz.f32 %f193, %f191, %f186, %f108;
|
769 |
+
fma.rn.ftz.f32 %f195, %f193, %f186, %f110;
|
770 |
+
fma.rn.ftz.f32 %f197, %f195, %f186, %f112;
|
771 |
+
fma.rn.ftz.f32 %f199, %f197, %f186, %f114;
|
772 |
+
fma.rn.ftz.f32 %f201, %f199, %f186, %f116;
|
773 |
+
fma.rn.ftz.f32 %f203, %f201, %f186, %f118;
|
774 |
+
mul.f32 %f204, %f186, %f203;
|
775 |
+
fma.rn.ftz.f32 %f205, %f204, %f186, %f186;
|
776 |
+
fma.rn.ftz.f32 %f396, %f185, %f122, %f205;
|
777 |
+
setp.lt.u32 %p104, %r152, 2139095040;
|
778 |
+
@%p104 bra $L__BB0_11;
|
779 |
+
.loc 1 0 23
|
780 |
+
fma.rn.ftz.f32 %f396, %f48, %f123, %f123;
|
781 |
+
$L__BB0_11:
|
782 |
+
mov.b32 %f21, %r90;
|
783 |
+
.loc 1 46 23
|
784 |
+
setp.lt.f32 %p106, %f21, 0f00800000;
|
785 |
+
mul.f32 %f208, %f21, 0f4B000000;
|
786 |
+
selp.f32 %f53, %f208, %f21, %p106;
|
787 |
+
selp.f32 %f209, 0fC1B80000, 0f00000000, %p106;
|
788 |
+
mov.b32 %r156, %f53;
|
789 |
+
add.s32 %r157, %r156, -1059760811;
|
790 |
+
and.b32 %r158, %r157, -8388608;
|
791 |
+
sub.s32 %r159, %r156, %r158;
|
792 |
+
mov.b32 %f210, %r159;
|
793 |
+
cvt.rn.f32.s32 %f211, %r158;
|
794 |
+
fma.rn.ftz.f32 %f213, %f211, %f100, %f209;
|
795 |
+
add.f32 %f214, %f210, 0fBF800000;
|
796 |
+
fma.rn.ftz.f32 %f217, %f104, %f214, %f103;
|
797 |
+
fma.rn.ftz.f32 %f219, %f217, %f214, %f106;
|
798 |
+
fma.rn.ftz.f32 %f221, %f219, %f214, %f108;
|
799 |
+
fma.rn.ftz.f32 %f223, %f221, %f214, %f110;
|
800 |
+
fma.rn.ftz.f32 %f225, %f223, %f214, %f112;
|
801 |
+
fma.rn.ftz.f32 %f227, %f225, %f214, %f114;
|
802 |
+
fma.rn.ftz.f32 %f229, %f227, %f214, %f116;
|
803 |
+
fma.rn.ftz.f32 %f231, %f229, %f214, %f118;
|
804 |
+
mul.f32 %f232, %f214, %f231;
|
805 |
+
fma.rn.ftz.f32 %f233, %f232, %f214, %f214;
|
806 |
+
fma.rn.ftz.f32 %f397, %f213, %f122, %f233;
|
807 |
+
setp.lt.u32 %p107, %r156, 2139095040;
|
808 |
+
@%p107 bra $L__BB0_13;
|
809 |
+
.loc 1 0 23
|
810 |
+
fma.rn.ftz.f32 %f397, %f53, %f123, %f123;
|
811 |
+
$L__BB0_13:
|
812 |
+
mov.b32 %f22, %r91;
|
813 |
+
.loc 1 46 23
|
814 |
+
setp.lt.f32 %p109, %f22, 0f00800000;
|
815 |
+
mul.f32 %f236, %f22, 0f4B000000;
|
816 |
+
selp.f32 %f58, %f236, %f22, %p109;
|
817 |
+
selp.f32 %f237, 0fC1B80000, 0f00000000, %p109;
|
818 |
+
mov.b32 %r160, %f58;
|
819 |
+
add.s32 %r161, %r160, -1059760811;
|
820 |
+
and.b32 %r162, %r161, -8388608;
|
821 |
+
sub.s32 %r163, %r160, %r162;
|
822 |
+
mov.b32 %f238, %r163;
|
823 |
+
cvt.rn.f32.s32 %f239, %r162;
|
824 |
+
fma.rn.ftz.f32 %f241, %f239, %f100, %f237;
|
825 |
+
add.f32 %f242, %f238, 0fBF800000;
|
826 |
+
fma.rn.ftz.f32 %f245, %f104, %f242, %f103;
|
827 |
+
fma.rn.ftz.f32 %f247, %f245, %f242, %f106;
|
828 |
+
fma.rn.ftz.f32 %f249, %f247, %f242, %f108;
|
829 |
+
fma.rn.ftz.f32 %f251, %f249, %f242, %f110;
|
830 |
+
fma.rn.ftz.f32 %f253, %f251, %f242, %f112;
|
831 |
+
fma.rn.ftz.f32 %f255, %f253, %f242, %f114;
|
832 |
+
fma.rn.ftz.f32 %f257, %f255, %f242, %f116;
|
833 |
+
fma.rn.ftz.f32 %f259, %f257, %f242, %f118;
|
834 |
+
mul.f32 %f260, %f242, %f259;
|
835 |
+
fma.rn.ftz.f32 %f261, %f260, %f242, %f242;
|
836 |
+
fma.rn.ftz.f32 %f398, %f241, %f122, %f261;
|
837 |
+
setp.lt.u32 %p110, %r160, 2139095040;
|
838 |
+
@%p110 bra $L__BB0_15;
|
839 |
+
.loc 1 0 23
|
840 |
+
fma.rn.ftz.f32 %f398, %f58, %f123, %f123;
|
841 |
+
$L__BB0_15:
|
842 |
+
setp.eq.f32 %p96, %f33, 0f00000000;
|
843 |
+
setp.eq.f32 %p99, %f38, 0f00000000;
|
844 |
+
setp.eq.f32 %p102, %f43, 0f00000000;
|
845 |
+
setp.eq.f32 %p105, %f48, 0f00000000;
|
846 |
+
setp.eq.f32 %p108, %f53, 0f00000000;
|
847 |
+
mov.b32 %f23, %r92;
|
848 |
+
.loc 1 46 23
|
849 |
+
setp.eq.f32 %p111, %f58, 0f00000000;
|
850 |
+
setp.lt.f32 %p112, %f23, 0f00800000;
|
851 |
+
mul.f32 %f264, %f23, 0f4B000000;
|
852 |
+
selp.f32 %f63, %f264, %f23, %p112;
|
853 |
+
selp.f32 %f265, 0fC1B80000, 0f00000000, %p112;
|
854 |
+
mov.b32 %r164, %f63;
|
855 |
+
add.s32 %r165, %r164, -1059760811;
|
856 |
+
and.b32 %r166, %r165, -8388608;
|
857 |
+
sub.s32 %r167, %r164, %r166;
|
858 |
+
mov.b32 %f266, %r167;
|
859 |
+
cvt.rn.f32.s32 %f267, %r166;
|
860 |
+
fma.rn.ftz.f32 %f269, %f267, %f100, %f265;
|
861 |
+
add.f32 %f270, %f266, 0fBF800000;
|
862 |
+
fma.rn.ftz.f32 %f273, %f104, %f270, %f103;
|
863 |
+
fma.rn.ftz.f32 %f275, %f273, %f270, %f106;
|
864 |
+
fma.rn.ftz.f32 %f277, %f275, %f270, %f108;
|
865 |
+
fma.rn.ftz.f32 %f279, %f277, %f270, %f110;
|
866 |
+
fma.rn.ftz.f32 %f281, %f279, %f270, %f112;
|
867 |
+
fma.rn.ftz.f32 %f283, %f281, %f270, %f114;
|
868 |
+
fma.rn.ftz.f32 %f285, %f283, %f270, %f116;
|
869 |
+
fma.rn.ftz.f32 %f287, %f285, %f270, %f118;
|
870 |
+
mul.f32 %f288, %f270, %f287;
|
871 |
+
fma.rn.ftz.f32 %f289, %f288, %f270, %f270;
|
872 |
+
fma.rn.ftz.f32 %f399, %f269, %f122, %f289;
|
873 |
+
setp.lt.u32 %p113, %r164, 2139095040;
|
874 |
+
@%p113 bra $L__BB0_17;
|
875 |
+
.loc 1 0 23
|
876 |
+
fma.rn.ftz.f32 %f399, %f63, %f123, %f123;
|
877 |
+
$L__BB0_17:
|
878 |
+
mov.b32 %f24, %r93;
|
879 |
+
cvt.f32.bf16 %r131, %rs43;
|
880 |
+
mov.b32 %f88, %r131;
|
881 |
+
cvt.f32.bf16 %r132, %rs45;
|
882 |
+
mov.b32 %f89, %r132;
|
883 |
+
cvt.f32.bf16 %r133, %rs47;
|
884 |
+
mov.b32 %f90, %r133;
|
885 |
+
cvt.f32.bf16 %r134, %rs49;
|
886 |
+
mov.b32 %f91, %r134;
|
887 |
+
cvt.f32.bf16 %r135, %rs51;
|
888 |
+
mov.b32 %f92, %r135;
|
889 |
+
cvt.f32.bf16 %r136, %rs53;
|
890 |
+
mov.b32 %f93, %r136;
|
891 |
+
cvt.f32.bf16 %r137, %rs55;
|
892 |
+
mov.b32 %f94, %r137;
|
893 |
+
cvt.f32.bf16 %r138, %rs57;
|
894 |
+
mov.b32 %f95, %r138;
|
895 |
+
sub.f32 %f32, %f95, %f16;
|
896 |
+
sub.f32 %f31, %f94, %f15;
|
897 |
+
sub.f32 %f30, %f93, %f14;
|
898 |
+
sub.f32 %f29, %f92, %f13;
|
899 |
+
sub.f32 %f28, %f91, %f12;
|
900 |
+
sub.f32 %f27, %f90, %f11;
|
901 |
+
sub.f32 %f26, %f89, %f10;
|
902 |
+
sub.f32 %f25, %f88, %f9;
|
903 |
+
.loc 1 46 23
|
904 |
+
selp.f32 %f37, 0fFF800000, %f393, %p96;
|
905 |
+
selp.f32 %f42, 0fFF800000, %f394, %p99;
|
906 |
+
selp.f32 %f47, 0fFF800000, %f395, %p102;
|
907 |
+
selp.f32 %f52, 0fFF800000, %f396, %p105;
|
908 |
+
selp.f32 %f57, 0fFF800000, %f397, %p108;
|
909 |
+
selp.f32 %f62, 0fFF800000, %f398, %p111;
|
910 |
+
setp.eq.f32 %p114, %f63, 0f00000000;
|
911 |
+
selp.f32 %f67, 0fFF800000, %f399, %p114;
|
912 |
+
setp.lt.f32 %p115, %f24, 0f00800000;
|
913 |
+
mul.f32 %f292, %f24, 0f4B000000;
|
914 |
+
selp.f32 %f68, %f292, %f24, %p115;
|
915 |
+
selp.f32 %f293, 0fC1B80000, 0f00000000, %p115;
|
916 |
+
mov.b32 %r168, %f68;
|
917 |
+
add.s32 %r169, %r168, -1059760811;
|
918 |
+
and.b32 %r170, %r169, -8388608;
|
919 |
+
sub.s32 %r171, %r168, %r170;
|
920 |
+
mov.b32 %f294, %r171;
|
921 |
+
cvt.rn.f32.s32 %f295, %r170;
|
922 |
+
fma.rn.ftz.f32 %f297, %f295, %f100, %f293;
|
923 |
+
add.f32 %f298, %f294, 0fBF800000;
|
924 |
+
fma.rn.ftz.f32 %f301, %f104, %f298, %f103;
|
925 |
+
fma.rn.ftz.f32 %f303, %f301, %f298, %f106;
|
926 |
+
fma.rn.ftz.f32 %f305, %f303, %f298, %f108;
|
927 |
+
fma.rn.ftz.f32 %f307, %f305, %f298, %f110;
|
928 |
+
fma.rn.ftz.f32 %f309, %f307, %f298, %f112;
|
929 |
+
fma.rn.ftz.f32 %f311, %f309, %f298, %f114;
|
930 |
+
fma.rn.ftz.f32 %f313, %f311, %f298, %f116;
|
931 |
+
fma.rn.ftz.f32 %f315, %f313, %f298, %f118;
|
932 |
+
mul.f32 %f316, %f298, %f315;
|
933 |
+
fma.rn.ftz.f32 %f317, %f316, %f298, %f298;
|
934 |
+
fma.rn.ftz.f32 %f400, %f297, %f122, %f317;
|
935 |
+
setp.lt.u32 %p116, %r168, 2139095040;
|
936 |
+
@%p116 bra $L__BB0_19;
|
937 |
+
.loc 1 0 23
|
938 |
+
fma.rn.ftz.f32 %f400, %f68, %f123, %f123;
|
939 |
+
bra.uni $L__BB0_19;
|
940 |
+
$L__BB0_20:
|
941 |
+
.loc 1 24 33
|
942 |
+
bfe.u32 %r191, %r1, 5, 3;
|
943 |
+
and.b32 %r192, %r1, 31;
|
944 |
+
$L__tmp1:
|
945 |
+
.loc 2 243 36
|
946 |
+
bar.sync 0;
|
947 |
+
$L__tmp2:
|
948 |
+
.loc 2 233 15
|
949 |
+
add.f32 %f361, %f385, %f386;
|
950 |
+
add.f32 %f362, %f387, %f361;
|
951 |
+
add.f32 %f363, %f388, %f362;
|
952 |
+
add.f32 %f364, %f389, %f363;
|
953 |
+
add.f32 %f365, %f390, %f364;
|
954 |
+
add.f32 %f366, %f391, %f365;
|
955 |
+
add.f32 %f367, %f392, %f366;
|
956 |
+
$L__tmp3:
|
957 |
+
.loc 2 243 36
|
958 |
+
mov.b32 %r193, %f367;
|
959 |
+
shfl.sync.bfly.b32 %r194, %r193, 16, 31, -1;
|
960 |
+
mov.b32 %f368, %r194;
|
961 |
+
$L__tmp4:
|
962 |
+
.loc 2 233 15
|
963 |
+
add.f32 %f369, %f367, %f368;
|
964 |
+
$L__tmp5:
|
965 |
+
.loc 2 243 36
|
966 |
+
mov.b32 %r195, %f369;
|
967 |
+
shfl.sync.bfly.b32 %r196, %r195, 8, 31, -1;
|
968 |
+
mov.b32 %f370, %r196;
|
969 |
+
$L__tmp6:
|
970 |
+
.loc 2 233 15
|
971 |
+
add.f32 %f371, %f369, %f370;
|
972 |
+
$L__tmp7:
|
973 |
+
.loc 2 243 36
|
974 |
+
mov.b32 %r197, %f371;
|
975 |
+
shfl.sync.bfly.b32 %r198, %r197, 4, 31, -1;
|
976 |
+
mov.b32 %f372, %r198;
|
977 |
+
$L__tmp8:
|
978 |
+
.loc 2 233 15
|
979 |
+
add.f32 %f373, %f371, %f372;
|
980 |
+
$L__tmp9:
|
981 |
+
.loc 2 243 36
|
982 |
+
mov.b32 %r199, %f373;
|
983 |
+
shfl.sync.bfly.b32 %r200, %r199, 2, 31, -1;
|
984 |
+
mov.b32 %f374, %r200;
|
985 |
+
$L__tmp10:
|
986 |
+
.loc 2 233 15
|
987 |
+
add.f32 %f375, %f373, %f374;
|
988 |
+
$L__tmp11:
|
989 |
+
.loc 2 243 36
|
990 |
+
mov.b32 %r201, %f375;
|
991 |
+
shfl.sync.bfly.b32 %r202, %r201, 1, 31, -1;
|
992 |
+
mov.b32 %f376, %r202;
|
993 |
+
$L__tmp12:
|
994 |
+
.loc 2 233 15
|
995 |
+
add.f32 %f377, %f375, %f376;
|
996 |
+
$L__tmp13:
|
997 |
+
.loc 2 243 36
|
998 |
+
setp.eq.s32 %p143, %r192, 0;
|
999 |
+
shl.b32 %r203, %r191, 2;
|
1000 |
+
add.s32 %r180, %r44, %r203;
|
1001 |
+
mov.b32 %r181, %f377;
|
1002 |
+
@%p143 st.shared.b32 [ %r180 + 0 ], %r181;
|
1003 |
+
bar.sync 0;
|
1004 |
+
setp.lt.s32 %p144, %r1, 8;
|
1005 |
+
shl.b32 %r205, %r1, 2;
|
1006 |
+
add.s32 %r183, %r44, %r205;
|
1007 |
+
@%p144 ld.shared.b32 %r182, [ %r183 + 0 ];
|
1008 |
+
mov.b32 %f378, %r182;
|
1009 |
+
shfl.sync.bfly.b32 %r206, %r182, 4, 31, -1;
|
1010 |
+
mov.b32 %f379, %r206;
|
1011 |
+
$L__tmp14:
|
1012 |
+
.loc 2 233 15
|
1013 |
+
add.f32 %f380, %f378, %f379;
|
1014 |
+
$L__tmp15:
|
1015 |
+
.loc 2 243 36
|
1016 |
+
mov.b32 %r207, %f380;
|
1017 |
+
shfl.sync.bfly.b32 %r208, %r207, 2, 31, -1;
|
1018 |
+
mov.b32 %f381, %r208;
|
1019 |
+
$L__tmp16:
|
1020 |
+
.loc 2 233 15
|
1021 |
+
add.f32 %f382, %f380, %f381;
|
1022 |
+
$L__tmp17:
|
1023 |
+
.loc 2 243 36
|
1024 |
+
mov.b32 %r209, %f382;
|
1025 |
+
shfl.sync.bfly.b32 %r210, %r209, 1, 31, -1;
|
1026 |
+
mov.b32 %f383, %r210;
|
1027 |
+
$L__tmp18:
|
1028 |
+
.loc 2 233 15
|
1029 |
+
add.f32 %f384, %f382, %f383;
|
1030 |
+
$L__tmp19:
|
1031 |
+
.loc 2 243 36
|
1032 |
+
and.b32 %r211, %r1, 7;
|
1033 |
+
setp.eq.s32 %p152, %r211, 0;
|
1034 |
+
and.pred %p145, %p144, %p152;
|
1035 |
+
mov.b32 %r185, %f384;
|
1036 |
+
@%p145 st.shared.b32 [ %r183 + 0 ], %r185;
|
1037 |
+
bar.sync 0;
|
1038 |
+
ld.shared.u32 %r186, [global_smem];
|
1039 |
+
$L__tmp20:
|
1040 |
+
.loc 1 59 25
|
1041 |
+
shl.b64 %rd154, %rd1, 2;
|
1042 |
+
add.s64 %rd148, %rd47, %rd154;
|
1043 |
+
.loc 1 59 37
|
1044 |
+
setp.eq.s32 %p153, %r2, 0;
|
1045 |
+
and.pred %p146, %p153, %p78;
|
1046 |
+
@%p146 st.global.b32 [ %rd148 + 0 ], { %r186 };
|
1047 |
+
$L__tmp21:
|
1048 |
+
.loc 2 243 36
|
1049 |
+
bar.sync 0;
|
1050 |
+
$L__tmp22:
|
1051 |
+
.loc 2 233 15
|
1052 |
+
add.s64 %rd155, %rd209, %rd210;
|
1053 |
+
add.s64 %rd156, %rd155, %rd211;
|
1054 |
+
add.s64 %rd157, %rd156, %rd212;
|
1055 |
+
add.s64 %rd158, %rd157, %rd213;
|
1056 |
+
add.s64 %rd159, %rd158, %rd214;
|
1057 |
+
add.s64 %rd160, %rd159, %rd215;
|
1058 |
+
add.s64 %rd161, %rd160, %rd216;
|
1059 |
+
$L__tmp23:
|
1060 |
+
.loc 2 243 36
|
1061 |
+
cvt.u32.u64 %r212, %rd161;
|
1062 |
+
shfl.sync.bfly.b32 %r213, %r212, 16, 31, -1;
|
1063 |
+
{ .reg .b32 tmp; mov.b64 {tmp, %r214}, %rd161; }
|
1064 |
+
shfl.sync.bfly.b32 %r215, %r214, 16, 31, -1;
|
1065 |
+
cvt.u64.u32 %rd162, %r213;
|
1066 |
+
cvt.u64.u32 %rd163, %r215;
|
1067 |
+
shl.b64 %rd164, %rd163, 32;
|
1068 |
+
or.b64 %rd165, %rd162, %rd164;
|
1069 |
+
$L__tmp24:
|
1070 |
+
.loc 2 233 15
|
1071 |
+
add.s64 %rd166, %rd161, %rd165;
|
1072 |
+
$L__tmp25:
|
1073 |
+
.loc 2 243 36
|
1074 |
+
cvt.u32.u64 %r216, %rd166;
|
1075 |
+
shfl.sync.bfly.b32 %r217, %r216, 8, 31, -1;
|
1076 |
+
{ .reg .b32 tmp; mov.b64 {tmp, %r218}, %rd166; }
|
1077 |
+
shfl.sync.bfly.b32 %r219, %r218, 8, 31, -1;
|
1078 |
+
cvt.u64.u32 %rd167, %r217;
|
1079 |
+
cvt.u64.u32 %rd168, %r219;
|
1080 |
+
shl.b64 %rd169, %rd168, 32;
|
1081 |
+
or.b64 %rd170, %rd167, %rd169;
|
1082 |
+
$L__tmp26:
|
1083 |
+
.loc 2 233 15
|
1084 |
+
add.s64 %rd171, %rd166, %rd170;
|
1085 |
+
$L__tmp27:
|
1086 |
+
.loc 2 243 36
|
1087 |
+
cvt.u32.u64 %r220, %rd171;
|
1088 |
+
shfl.sync.bfly.b32 %r221, %r220, 4, 31, -1;
|
1089 |
+
{ .reg .b32 tmp; mov.b64 {tmp, %r222}, %rd171; }
|
1090 |
+
shfl.sync.bfly.b32 %r223, %r222, 4, 31, -1;
|
1091 |
+
cvt.u64.u32 %rd172, %r221;
|
1092 |
+
cvt.u64.u32 %rd173, %r223;
|
1093 |
+
shl.b64 %rd174, %rd173, 32;
|
1094 |
+
or.b64 %rd175, %rd172, %rd174;
|
1095 |
+
$L__tmp28:
|
1096 |
+
.loc 2 233 15
|
1097 |
+
add.s64 %rd176, %rd171, %rd175;
|
1098 |
+
$L__tmp29:
|
1099 |
+
.loc 2 243 36
|
1100 |
+
cvt.u32.u64 %r224, %rd176;
|
1101 |
+
shfl.sync.bfly.b32 %r225, %r224, 2, 31, -1;
|
1102 |
+
{ .reg .b32 tmp; mov.b64 {tmp, %r226}, %rd176; }
|
1103 |
+
shfl.sync.bfly.b32 %r227, %r226, 2, 31, -1;
|
1104 |
+
cvt.u64.u32 %rd177, %r225;
|
1105 |
+
cvt.u64.u32 %rd178, %r227;
|
1106 |
+
shl.b64 %rd179, %rd178, 32;
|
1107 |
+
or.b64 %rd180, %rd177, %rd179;
|
1108 |
+
$L__tmp30:
|
1109 |
+
.loc 2 233 15
|
1110 |
+
add.s64 %rd181, %rd176, %rd180;
|
1111 |
+
$L__tmp31:
|
1112 |
+
.loc 2 243 36
|
1113 |
+
cvt.u32.u64 %r228, %rd181;
|
1114 |
+
shfl.sync.bfly.b32 %r229, %r228, 1, 31, -1;
|
1115 |
+
{ .reg .b32 tmp; mov.b64 {tmp, %r230}, %rd181; }
|
1116 |
+
shfl.sync.bfly.b32 %r231, %r230, 1, 31, -1;
|
1117 |
+
cvt.u64.u32 %rd182, %r229;
|
1118 |
+
cvt.u64.u32 %rd183, %r231;
|
1119 |
+
shl.b64 %rd184, %rd183, 32;
|
1120 |
+
or.b64 %rd185, %rd182, %rd184;
|
1121 |
+
$L__tmp32:
|
1122 |
+
.loc 2 233 15
|
1123 |
+
add.s64 %rd149, %rd181, %rd185;
|
1124 |
+
$L__tmp33:
|
1125 |
+
.loc 2 243 36
|
1126 |
+
shl.b32 %r232, %r191, 3;
|
1127 |
+
add.s32 %r187, %r44, %r232;
|
1128 |
+
@%p143 st.shared.b64 [ %r187 + 0 ], %rd149;
|
1129 |
+
bar.sync 0;
|
1130 |
+
shl.b32 %r233, %r1, 3;
|
1131 |
+
add.s32 %r188, %r44, %r233;
|
1132 |
+
@%p144 ld.shared.b64 %rd150, [ %r188 + 0 ];
|
1133 |
+
cvt.u32.u64 %r234, %rd150;
|
1134 |
+
shfl.sync.bfly.b32 %r235, %r234, 4, 31, -1;
|
1135 |
+
{ .reg .b32 tmp; mov.b64 {tmp, %r236}, %rd150; }
|
1136 |
+
shfl.sync.bfly.b32 %r237, %r236, 4, 31, -1;
|
1137 |
+
cvt.u64.u32 %rd186, %r235;
|
1138 |
+
cvt.u64.u32 %rd187, %r237;
|
1139 |
+
shl.b64 %rd188, %rd187, 32;
|
1140 |
+
or.b64 %rd189, %rd186, %rd188;
|
1141 |
+
$L__tmp34:
|
1142 |
+
.loc 2 233 15
|
1143 |
+
add.s64 %rd190, %rd150, %rd189;
|
1144 |
+
$L__tmp35:
|
1145 |
+
.loc 2 243 36
|
1146 |
+
cvt.u32.u64 %r238, %rd190;
|
1147 |
+
shfl.sync.bfly.b32 %r239, %r238, 2, 31, -1;
|
1148 |
+
{ .reg .b32 tmp; mov.b64 {tmp, %r240}, %rd190; }
|
1149 |
+
shfl.sync.bfly.b32 %r241, %r240, 2, 31, -1;
|
1150 |
+
cvt.u64.u32 %rd191, %r239;
|
1151 |
+
cvt.u64.u32 %rd192, %r241;
|
1152 |
+
shl.b64 %rd193, %rd192, 32;
|
1153 |
+
or.b64 %rd194, %rd191, %rd193;
|
1154 |
+
$L__tmp36:
|
1155 |
+
.loc 2 233 15
|
1156 |
+
add.s64 %rd195, %rd190, %rd194;
|
1157 |
+
$L__tmp37:
|
1158 |
+
.loc 2 243 36
|
1159 |
+
cvt.u32.u64 %r242, %rd195;
|
1160 |
+
shfl.sync.bfly.b32 %r243, %r242, 1, 31, -1;
|
1161 |
+
{ .reg .b32 tmp; mov.b64 {tmp, %r244}, %rd195; }
|
1162 |
+
shfl.sync.bfly.b32 %r245, %r244, 1, 31, -1;
|
1163 |
+
cvt.u64.u32 %rd196, %r243;
|
1164 |
+
cvt.u64.u32 %rd197, %r245;
|
1165 |
+
shl.b64 %rd198, %rd197, 32;
|
1166 |
+
or.b64 %rd199, %rd196, %rd198;
|
1167 |
+
$L__tmp38:
|
1168 |
+
.loc 2 233 15
|
1169 |
+
add.s64 %rd151, %rd195, %rd199;
|
1170 |
+
$L__tmp39:
|
1171 |
+
.loc 2 243 36
|
1172 |
+
@%p145 st.shared.b64 [ %r188 + 0 ], %rd151;
|
1173 |
+
bar.sync 0;
|
1174 |
+
ld.shared.u32 %rd200, [global_smem+4];
|
1175 |
+
shl.b64 %rd201, %rd200, 32;
|
1176 |
+
ld.shared.u32 %rd202, [global_smem];
|
1177 |
+
or.b64 %rd203, %rd201, %rd202;
|
1178 |
+
$L__tmp40:
|
1179 |
+
.loc 1 60 30
|
1180 |
+
bar.sync 0;
|
1181 |
+
st.shared.u64 [global_smem], %rd203;
|
1182 |
+
bar.sync 0;
|
1183 |
+
ld.shared.u64 %rd152, [global_smem];
|
1184 |
+
.loc 1 61 25
|
1185 |
+
shl.b64 %rd204, %rd1, 3;
|
1186 |
+
add.s64 %rd153, %rd48, %rd204;
|
1187 |
+
.loc 1 61 37
|
1188 |
+
@%p146 st.global.b64 [ %rd153 + 0 ], { %rd152 };
|
1189 |
+
.loc 1 61 4
|
1190 |
+
ret;
|
1191 |
+
$L__tmp41:
|
1192 |
+
$L__func_end0:
|
1193 |
+
|
1194 |
+
}
|
1195 |
+
// .globl __nv_logf
|
1196 |
+
.visible .func (.param .b32 func_retval0) __nv_logf(
|
1197 |
+
.param .b32 __nv_logf_param_0
|
1198 |
+
)
|
1199 |
+
{
|
1200 |
+
.reg .pred %p<4>;
|
1201 |
+
.reg .b32 %r<5>;
|
1202 |
+
.reg .f32 %f<36>;
|
1203 |
+
$L__func_begin1:
|
1204 |
+
|
1205 |
+
ld.param.f32 %f5, [__nv_logf_param_0];
|
1206 |
+
setp.lt.f32 %p1, %f5, 0f00800000;
|
1207 |
+
mul.f32 %f6, %f5, 0f4B000000;
|
1208 |
+
selp.f32 %f1, %f6, %f5, %p1;
|
1209 |
+
selp.f32 %f7, 0fC1B80000, 0f00000000, %p1;
|
1210 |
+
mov.b32 %r1, %f1;
|
1211 |
+
add.s32 %r2, %r1, -1059760811;
|
1212 |
+
and.b32 %r3, %r2, -8388608;
|
1213 |
+
sub.s32 %r4, %r1, %r3;
|
1214 |
+
mov.b32 %f8, %r4;
|
1215 |
+
cvt.rn.f32.s32 %f9, %r3;
|
1216 |
+
mov.f32 %f10, 0f34000000;
|
1217 |
+
fma.rn.ftz.f32 %f11, %f9, %f10, %f7;
|
1218 |
+
add.f32 %f12, %f8, 0fBF800000;
|
1219 |
+
mov.f32 %f13, 0f3E1039F6;
|
1220 |
+
mov.f32 %f14, 0fBE055027;
|
1221 |
+
fma.rn.ftz.f32 %f15, %f14, %f12, %f13;
|
1222 |
+
mov.f32 %f16, 0fBDF8CDCC;
|
1223 |
+
fma.rn.ftz.f32 %f17, %f15, %f12, %f16;
|
1224 |
+
mov.f32 %f18, 0f3E0F2955;
|
1225 |
+
fma.rn.ftz.f32 %f19, %f17, %f12, %f18;
|
1226 |
+
mov.f32 %f20, 0fBE2AD8B9;
|
1227 |
+
fma.rn.ftz.f32 %f21, %f19, %f12, %f20;
|
1228 |
+
mov.f32 %f22, 0f3E4CED0B;
|
1229 |
+
fma.rn.ftz.f32 %f23, %f21, %f12, %f22;
|
1230 |
+
mov.f32 %f24, 0fBE7FFF22;
|
1231 |
+
fma.rn.ftz.f32 %f25, %f23, %f12, %f24;
|
1232 |
+
mov.f32 %f26, 0f3EAAAA78;
|
1233 |
+
fma.rn.ftz.f32 %f27, %f25, %f12, %f26;
|
1234 |
+
mov.f32 %f28, 0fBF000000;
|
1235 |
+
fma.rn.ftz.f32 %f29, %f27, %f12, %f28;
|
1236 |
+
mul.f32 %f30, %f12, %f29;
|
1237 |
+
fma.rn.ftz.f32 %f31, %f30, %f12, %f12;
|
1238 |
+
mov.f32 %f32, 0f3F317218;
|
1239 |
+
fma.rn.ftz.f32 %f35, %f11, %f32, %f31;
|
1240 |
+
setp.lt.u32 %p2, %r1, 2139095040;
|
1241 |
+
@%p2 bra $L__BB1_2;
|
1242 |
+
mov.f32 %f33, 0f7F800000;
|
1243 |
+
fma.rn.ftz.f32 %f35, %f1, %f33, %f33;
|
1244 |
+
$L__BB1_2:
|
1245 |
+
setp.eq.f32 %p3, %f1, 0f00000000;
|
1246 |
+
selp.f32 %f34, 0fFF800000, %f35, %p3;
|
1247 |
+
st.param.f32 [func_retval0+0], %f34;
|
1248 |
+
ret;
|
1249 |
+
$L__func_end1:
|
1250 |
+
|
1251 |
+
}
|
1252 |
+
.file 1 "/tmp/torchinductor_root/ns/cnshxlw3p7kytog7ihat33cfh5n4z4tq3l77zyi5jxajo5uonq7m.py"
|
1253 |
+
.file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
|
1254 |
+
.section .debug_abbrev
|
1255 |
+
{
|
1256 |
+
.b8 1
|
1257 |
+
.b8 17
|
1258 |
+
.b8 1
|
1259 |
+
.b8 37
|
1260 |
+
.b8 8
|
1261 |
+
.b8 19
|
1262 |
+
.b8 5
|
1263 |
+
.b8 3
|
1264 |
+
.b8 8
|
1265 |
+
.b8 16
|
1266 |
+
.b8 6
|
1267 |
+
.b8 27
|
1268 |
+
.b8 8
|
1269 |
+
.b8 180
|
1270 |
+
.b8 66
|
1271 |
+
.b8 12
|
1272 |
+
.b8 17
|
1273 |
+
.b8 1
|
1274 |
+
.b8 18
|
1275 |
+
.b8 1
|
1276 |
+
.b8 0
|
1277 |
+
.b8 0
|
1278 |
+
.b8 2
|
1279 |
+
.b8 46
|
1280 |
+
.b8 0
|
1281 |
+
.b8 135
|
1282 |
+
.b8 64
|
1283 |
+
.b8 8
|
1284 |
+
.b8 3
|
1285 |
+
.b8 8
|
1286 |
+
.b8 58
|
1287 |
+
.b8 11
|
1288 |
+
.b8 59
|
1289 |
+
.b8 11
|
1290 |
+
.b8 63
|
1291 |
+
.b8 12
|
1292 |
+
.b8 32
|
1293 |
+
.b8 11
|
1294 |
+
.b8 0
|
1295 |
+
.b8 0
|
1296 |
+
.b8 3
|
1297 |
+
.b8 46
|
1298 |
+
.b8 1
|
1299 |
+
.b8 17
|
1300 |
+
.b8 1
|
1301 |
+
.b8 18
|
1302 |
+
.b8 1
|
1303 |
+
.b8 64
|
1304 |
+
.b8 10
|
1305 |
+
.b8 49
|
1306 |
+
.b8 19
|
1307 |
+
.b8 0
|
1308 |
+
.b8 0
|
1309 |
+
.b8 4
|
1310 |
+
.b8 29
|
1311 |
+
.b8 0
|
1312 |
+
.b8 49
|
1313 |
+
.b8 19
|
1314 |
+
.b8 17
|
1315 |
+
.b8 1
|
1316 |
+
.b8 18
|
1317 |
+
.b8 1
|
1318 |
+
.b8 88
|
1319 |
+
.b8 11
|
1320 |
+
.b8 89
|
1321 |
+
.b8 11
|
1322 |
+
.b8 87
|
1323 |
+
.b8 11
|
1324 |
+
.b8 0
|
1325 |
+
.b8 0
|
1326 |
+
.b8 5
|
1327 |
+
.b8 29
|
1328 |
+
.b8 1
|
1329 |
+
.b8 49
|
1330 |
+
.b8 19
|
1331 |
+
.b8 17
|
1332 |
+
.b8 1
|
1333 |
+
.b8 18
|
1334 |
+
.b8 1
|
1335 |
+
.b8 88
|
1336 |
+
.b8 11
|
1337 |
+
.b8 89
|
1338 |
+
.b8 11
|
1339 |
+
.b8 87
|
1340 |
+
.b8 11
|
1341 |
+
.b8 0
|
1342 |
+
.b8 0
|
1343 |
+
.b8 0
|
1344 |
+
}
|
1345 |
+
.section .debug_info
|
1346 |
+
{
|
1347 |
+
.b32 349
|
1348 |
+
.b8 2
|
1349 |
+
.b8 0
|
1350 |
+
.b32 .debug_abbrev
|
1351 |
+
.b8 8
|
1352 |
+
.b8 1
|
1353 |
+
.b8 116
|
1354 |
+
.b8 114
|
1355 |
+
.b8 105
|
1356 |
+
.b8 116
|
1357 |
+
.b8 111
|
1358 |
+
.b8 110
|
1359 |
+
.b8 0
|
1360 |
+
.b8 2
|
1361 |
+
.b8 0
|
1362 |
+
.b8 99
|
1363 |
+
.b8 110
|
1364 |
+
.b8 115
|
1365 |
+
.b8 104
|
1366 |
+
.b8 120
|
1367 |
+
.b8 108
|
1368 |
+
.b8 119
|
1369 |
+
.b8 51
|
1370 |
+
.b8 112
|
1371 |
+
.b8 55
|
1372 |
+
.b8 107
|
1373 |
+
.b8 121
|
1374 |
+
.b8 116
|
1375 |
+
.b8 111
|
1376 |
+
.b8 103
|
1377 |
+
.b8 55
|
1378 |
+
.b8 105
|
1379 |
+
.b8 104
|
1380 |
+
.b8 97
|
1381 |
+
.b8 116
|
1382 |
+
.b8 51
|
1383 |
+
.b8 51
|
1384 |
+
.b8 99
|
1385 |
+
.b8 102
|
1386 |
+
.b8 104
|
1387 |
+
.b8 53
|
1388 |
+
.b8 110
|
1389 |
+
.b8 52
|
1390 |
+
.b8 122
|
1391 |
+
.b8 52
|
1392 |
+
.b8 116
|
1393 |
+
.b8 113
|
1394 |
+
.b8 51
|
1395 |
+
.b8 108
|
1396 |
+
.b8 55
|
1397 |
+
.b8 55
|
1398 |
+
.b8 122
|
1399 |
+
.b8 121
|
1400 |
+
.b8 105
|
1401 |
+
.b8 53
|
1402 |
+
.b8 106
|
1403 |
+
.b8 120
|
1404 |
+
.b8 97
|
1405 |
+
.b8 106
|
1406 |
+
.b8 111
|
1407 |
+
.b8 53
|
1408 |
+
.b8 117
|
1409 |
+
.b8 111
|
1410 |
+
.b8 110
|
1411 |
+
.b8 113
|
1412 |
+
.b8 55
|
1413 |
+
.b8 109
|
1414 |
+
.b8 46
|
1415 |
+
.b8 112
|
1416 |
+
.b8 121
|
1417 |
+
.b8 0
|
1418 |
+
.b32 .debug_line
|
1419 |
+
.b8 47
|
1420 |
+
.b8 116
|
1421 |
+
.b8 109
|
1422 |
+
.b8 112
|
1423 |
+
.b8 47
|
1424 |
+
.b8 116
|
1425 |
+
.b8 111
|
1426 |
+
.b8 114
|
1427 |
+
.b8 99
|
1428 |
+
.b8 104
|
1429 |
+
.b8 105
|
1430 |
+
.b8 110
|
1431 |
+
.b8 100
|
1432 |
+
.b8 117
|
1433 |
+
.b8 99
|
1434 |
+
.b8 116
|
1435 |
+
.b8 111
|
1436 |
+
.b8 114
|
1437 |
+
.b8 95
|
1438 |
+
.b8 114
|
1439 |
+
.b8 111
|
1440 |
+
.b8 111
|
1441 |
+
.b8 116
|
1442 |
+
.b8 47
|
1443 |
+
.b8 110
|
1444 |
+
.b8 115
|
1445 |
+
.b8 0
|
1446 |
+
.b8 1
|
1447 |
+
.b64 $L__func_begin0
|
1448 |
+
.b64 $L__func_end0
|
1449 |
+
.b8 2
|
1450 |
+
.b8 116
|
1451 |
+
.b8 114
|
1452 |
+
.b8 105
|
1453 |
+
.b8 116
|
1454 |
+
.b8 111
|
1455 |
+
.b8 110
|
1456 |
+
.b8 95
|
1457 |
+
.b8 95
|
1458 |
+
.b8 48
|
1459 |
+
.b8 100
|
1460 |
+
.b8 49
|
1461 |
+
.b8 100
|
1462 |
+
.b8 50
|
1463 |
+
.b8 100
|
1464 |
+
.b8 51
|
1465 |
+
.b8 100
|
1466 |
+
.b8 52
|
1467 |
+
.b8 100
|
1468 |
+
.b8 53
|
1469 |
+
.b8 100
|
1470 |
+
.b8 54
|
1471 |
+
.b8 101
|
1472 |
+
.b8 55
|
1473 |
+
.b8 100
|
1474 |
+
.b8 101
|
1475 |
+
.b8 0
|
1476 |
+
.b8 116
|
1477 |
+
.b8 114
|
1478 |
+
.b8 105
|
1479 |
+
.b8 116
|
1480 |
+
.b8 111
|
1481 |
+
.b8 110
|
1482 |
+
.b8 95
|
1483 |
+
.b8 95
|
1484 |
+
.b8 48
|
1485 |
+
.b8 100
|
1486 |
+
.b8 49
|
1487 |
+
.b8 100
|
1488 |
+
.b8 50
|
1489 |
+
.b8 100
|
1490 |
+
.b8 51
|
1491 |
+
.b8 100
|
1492 |
+
.b8 52
|
1493 |
+
.b8 100
|
1494 |
+
.b8 53
|
1495 |
+
.b8 100
|
1496 |
+
.b8 54
|
1497 |
+
.b8 101
|
1498 |
+
.b8 55
|
1499 |
+
.b8 100
|
1500 |
+
.b8 101
|
1501 |
+
.b8 0
|
1502 |
+
.b8 1
|
1503 |
+
.b8 18
|
1504 |
+
.b8 1
|
1505 |
+
.b8 1
|
1506 |
+
.b8 3
|
1507 |
+
.b64 $L__func_begin0
|
1508 |
+
.b64 $L__func_end0
|
1509 |
+
.b8 1
|
1510 |
+
.b8 156
|
1511 |
+
.b32 125
|
1512 |
+
.b8 4
|
1513 |
+
.b32 125
|
1514 |
+
.b64 $L__tmp1
|
1515 |
+
.b64 $L__tmp20
|
1516 |
+
.b8 2
|
1517 |
+
.b8 58
|
1518 |
+
.b8 27
|
1519 |
+
.b8 5
|
1520 |
+
.b32 125
|
1521 |
+
.b64 $L__tmp2
|
1522 |
+
.b64 $L__tmp19
|
1523 |
+
.b8 2
|
1524 |
+
.b8 58
|
1525 |
+
.b8 27
|
1526 |
+
.b8 4
|
1527 |
+
.b32 125
|
1528 |
+
.b64 $L__tmp2
|
1529 |
+
.b64 $L__tmp19
|
1530 |
+
.b8 2
|
1531 |
+
.b8 243
|
1532 |
+
.b8 36
|
1533 |
+
.b8 0
|
1534 |
+
.b8 4
|
1535 |
+
.b32 125
|
1536 |
+
.b64 $L__tmp21
|
1537 |
+
.b64 $L__tmp40
|
1538 |
+
.b8 2
|
1539 |
+
.b8 60
|
1540 |
+
.b8 27
|
1541 |
+
.b8 5
|
1542 |
+
.b32 125
|
1543 |
+
.b64 $L__tmp22
|
1544 |
+
.b64 $L__tmp39
|
1545 |
+
.b8 2
|
1546 |
+
.b8 60
|
1547 |
+
.b8 27
|
1548 |
+
.b8 4
|
1549 |
+
.b32 125
|
1550 |
+
.b64 $L__tmp22
|
1551 |
+
.b64 $L__tmp39
|
1552 |
+
.b8 2
|
1553 |
+
.b8 243
|
1554 |
+
.b8 36
|
1555 |
+
.b8 0
|
1556 |
+
.b8 0
|
1557 |
+
.b8 0
|
1558 |
+
}
|
1559 |
+
.section .debug_pubnames
|
1560 |
+
{
|
1561 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
1562 |
+
$L__pubNames_start0:
|
1563 |
+
.b8 2
|
1564 |
+
.b8 0
|
1565 |
+
.b32 .debug_info
|
1566 |
+
.b32 353
|
1567 |
+
.b32 125
|
1568 |
+
.b8 116
|
1569 |
+
.b8 114
|
1570 |
+
.b8 105
|
1571 |
+
.b8 116
|
1572 |
+
.b8 111
|
1573 |
+
.b8 110
|
1574 |
+
.b8 95
|
1575 |
+
.b8 95
|
1576 |
+
.b8 48
|
1577 |
+
.b8 100
|
1578 |
+
.b8 49
|
1579 |
+
.b8 100
|
1580 |
+
.b8 50
|
1581 |
+
.b8 100
|
1582 |
+
.b8 51
|
1583 |
+
.b8 100
|
1584 |
+
.b8 52
|
1585 |
+
.b8 100
|
1586 |
+
.b8 53
|
1587 |
+
.b8 100
|
1588 |
+
.b8 54
|
1589 |
+
.b8 101
|
1590 |
+
.b8 55
|
1591 |
+
.b8 100
|
1592 |
+
.b8 101
|
1593 |
+
.b8 0
|
1594 |
+
.b32 0
|
1595 |
+
$L__pubNames_end0:
|
1596 |
+
}
|
1597 |
+
.section .debug_pubtypes
|
1598 |
+
{
|
1599 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
1600 |
+
$L__pubTypes_start0:
|
1601 |
+
.b8 2
|
1602 |
+
.b8 0
|
1603 |
+
.b32 .debug_info
|
1604 |
+
.b32 353
|
1605 |
+
.b32 0
|
1606 |
+
$L__pubTypes_end0:
|
1607 |
+
}
|
1608 |
+
.section .debug_loc { }
|
.triton/dump/93e5abc5363b9438178c618128714f73/triton_.llir
ADDED
@@ -0,0 +1,949 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
5 |
+
|
6 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !7 {
|
7 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
8 |
+
%5 = shl i32 %4, 3, !dbg !10
|
9 |
+
%6 = and i32 %5, 1016, !dbg !10
|
10 |
+
%7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
|
11 |
+
%8 = shl i32 %7, 10, !dbg !12
|
12 |
+
%9 = or i32 %8, %6, !dbg !13
|
13 |
+
%10 = sext i32 %9 to i64, !dbg !14
|
14 |
+
%11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !14
|
15 |
+
%12 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #4, !dbg !15
|
16 |
+
%13 = extractvalue { i32, i32, i32, i32 } %12, 0, !dbg !15
|
17 |
+
%14 = extractvalue { i32, i32, i32, i32 } %12, 1, !dbg !15
|
18 |
+
%15 = extractvalue { i32, i32, i32, i32 } %12, 2, !dbg !15
|
19 |
+
%16 = extractvalue { i32, i32, i32, i32 } %12, 3, !dbg !15
|
20 |
+
%17 = trunc i32 %13 to i16, !dbg !15
|
21 |
+
%extelt.offset = lshr i32 %13, 16, !dbg !15
|
22 |
+
%18 = trunc i32 %extelt.offset to i16, !dbg !15
|
23 |
+
%19 = trunc i32 %14 to i16, !dbg !15
|
24 |
+
%extelt.offset1 = lshr i32 %14, 16, !dbg !15
|
25 |
+
%20 = trunc i32 %extelt.offset1 to i16, !dbg !15
|
26 |
+
%21 = trunc i32 %15 to i16, !dbg !15
|
27 |
+
%extelt.offset2 = lshr i32 %15, 16, !dbg !15
|
28 |
+
%22 = trunc i32 %extelt.offset2 to i16, !dbg !15
|
29 |
+
%23 = trunc i32 %16 to i16, !dbg !15
|
30 |
+
%extelt.offset3 = lshr i32 %16, 16, !dbg !15
|
31 |
+
%24 = trunc i32 %extelt.offset3 to i16, !dbg !15
|
32 |
+
%25 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %17) #4, !dbg !16
|
33 |
+
%26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %18) #4, !dbg !16
|
34 |
+
%27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !16
|
35 |
+
%28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !16
|
36 |
+
%29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %21) #4, !dbg !16
|
37 |
+
%30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #4, !dbg !16
|
38 |
+
%31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #4, !dbg !16
|
39 |
+
%32 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #4, !dbg !16
|
40 |
+
%33 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !17
|
41 |
+
%34 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %33, i1 true) #4, !dbg !18
|
42 |
+
%35 = extractvalue { i32, i32, i32, i32 } %34, 0, !dbg !18
|
43 |
+
%36 = extractvalue { i32, i32, i32, i32 } %34, 1, !dbg !18
|
44 |
+
%37 = extractvalue { i32, i32, i32, i32 } %34, 2, !dbg !18
|
45 |
+
%38 = extractvalue { i32, i32, i32, i32 } %34, 3, !dbg !18
|
46 |
+
%39 = trunc i32 %35 to i16, !dbg !18
|
47 |
+
%extelt.offset4 = lshr i32 %35, 16, !dbg !18
|
48 |
+
%40 = trunc i32 %extelt.offset4 to i16, !dbg !18
|
49 |
+
%41 = trunc i32 %36 to i16, !dbg !18
|
50 |
+
%extelt.offset5 = lshr i32 %36, 16, !dbg !18
|
51 |
+
%42 = trunc i32 %extelt.offset5 to i16, !dbg !18
|
52 |
+
%43 = trunc i32 %37 to i16, !dbg !18
|
53 |
+
%extelt.offset6 = lshr i32 %37, 16, !dbg !18
|
54 |
+
%44 = trunc i32 %extelt.offset6 to i16, !dbg !18
|
55 |
+
%45 = trunc i32 %38 to i16, !dbg !18
|
56 |
+
%extelt.offset7 = lshr i32 %38, 16, !dbg !18
|
57 |
+
%46 = trunc i32 %extelt.offset7 to i16, !dbg !18
|
58 |
+
%47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %39) #4, !dbg !19
|
59 |
+
%48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %40) #4, !dbg !19
|
60 |
+
%49 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %41) #4, !dbg !19
|
61 |
+
%50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #4, !dbg !19
|
62 |
+
%51 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #4, !dbg !19
|
63 |
+
%52 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #4, !dbg !19
|
64 |
+
%53 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #4, !dbg !19
|
65 |
+
%54 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #4, !dbg !19
|
66 |
+
%55 = fmul float %47, 0x3FE6A09E60000000, !dbg !20
|
67 |
+
%56 = fmul float %48, 0x3FE6A09E60000000, !dbg !20
|
68 |
+
%57 = fmul float %49, 0x3FE6A09E60000000, !dbg !20
|
69 |
+
%58 = fmul float %50, 0x3FE6A09E60000000, !dbg !20
|
70 |
+
%59 = fmul float %51, 0x3FE6A09E60000000, !dbg !20
|
71 |
+
%60 = fmul float %52, 0x3FE6A09E60000000, !dbg !20
|
72 |
+
%61 = fmul float %53, 0x3FE6A09E60000000, !dbg !20
|
73 |
+
%62 = fmul float %54, 0x3FE6A09E60000000, !dbg !20
|
74 |
+
%63 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
75 |
+
%.not.i = icmp eq i32 %63, 0, !dbg !21
|
76 |
+
%64 = tail call float @llvm.nvvm.fabs.ftz.f(float %55) #4, !dbg !21
|
77 |
+
%65 = tail call float @llvm.nvvm.fabs.f(float %55) #4, !dbg !21
|
78 |
+
%.0.i = select i1 %.not.i, float %65, float %64, !dbg !21
|
79 |
+
%66 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !21
|
80 |
+
br i1 %66, label %__nv_fabsf.exit1.i, label %68, !dbg !21
|
81 |
+
|
82 |
+
__nv_fabsf.exit1.i: ; preds = %3
|
83 |
+
%67 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
84 |
+
%.not1.i = icmp eq i32 %67, 0, !dbg !21
|
85 |
+
%.01.i = select i1 %.not1.i, float %65, float %64, !dbg !21
|
86 |
+
br label %__internal_fmad.exit.i, !dbg !21
|
87 |
+
|
88 |
+
68: ; preds = %3
|
89 |
+
%69 = fmul float %55, %55, !dbg !21
|
90 |
+
br label %__internal_fmad.exit.i, !dbg !21
|
91 |
+
|
92 |
+
__internal_fmad.exit.i: ; preds = %68, %__nv_fabsf.exit1.i
|
93 |
+
%70 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %68 ], !dbg !21
|
94 |
+
%71 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %68 ], !dbg !21
|
95 |
+
%72 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %68 ], !dbg !21
|
96 |
+
%73 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %68 ], !dbg !21
|
97 |
+
%74 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %68 ], !dbg !21
|
98 |
+
%75 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %68 ], !dbg !21
|
99 |
+
%76 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %68 ], !dbg !21
|
100 |
+
%77 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %69, %68 ], !dbg !21
|
101 |
+
%78 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
102 |
+
%.not2.i = icmp eq i32 %78, 0, !dbg !21
|
103 |
+
%79 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %76, float %77, float %75) #4, !dbg !21
|
104 |
+
%80 = tail call float @llvm.nvvm.fma.rn.f(float %76, float %77, float %75) #4, !dbg !21
|
105 |
+
%.02.i = select i1 %.not2.i, float %80, float %79, !dbg !21
|
106 |
+
%81 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
107 |
+
%.not3.i = icmp eq i32 %81, 0, !dbg !21
|
108 |
+
%82 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %77, float %74) #4, !dbg !21
|
109 |
+
%83 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %77, float %74) #4, !dbg !21
|
110 |
+
%.03.i = select i1 %.not3.i, float %83, float %82, !dbg !21
|
111 |
+
%84 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
112 |
+
%.not4.i = icmp eq i32 %84, 0, !dbg !21
|
113 |
+
%85 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %77, float %73) #4, !dbg !21
|
114 |
+
%86 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %77, float %73) #4, !dbg !21
|
115 |
+
%.04.i = select i1 %.not4.i, float %86, float %85, !dbg !21
|
116 |
+
%87 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
117 |
+
%.not5.i = icmp eq i32 %87, 0, !dbg !21
|
118 |
+
%88 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %77, float %72) #4, !dbg !21
|
119 |
+
%89 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %77, float %72) #4, !dbg !21
|
120 |
+
%.05.i = select i1 %.not5.i, float %89, float %88, !dbg !21
|
121 |
+
%90 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
122 |
+
%.not6.i = icmp eq i32 %90, 0, !dbg !21
|
123 |
+
%91 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %77, float %71) #4, !dbg !21
|
124 |
+
%92 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %77, float %71) #4, !dbg !21
|
125 |
+
%.06.i = select i1 %.not6.i, float %92, float %91, !dbg !21
|
126 |
+
%93 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
127 |
+
%.not7.i = icmp eq i32 %93, 0, !dbg !21
|
128 |
+
%94 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %77, float %70) #4, !dbg !21
|
129 |
+
%95 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %77, float %70) #4, !dbg !21
|
130 |
+
%.07.i = select i1 %.not7.i, float %95, float %94, !dbg !21
|
131 |
+
%96 = fneg float %77, !dbg !21
|
132 |
+
%97 = select i1 %66, float %96, float %55, !dbg !21
|
133 |
+
%98 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
134 |
+
%.not8.i = icmp eq i32 %98, 0, !dbg !21
|
135 |
+
%99 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %97, float %97) #4, !dbg !21
|
136 |
+
%100 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %97, float %97) #4, !dbg !21
|
137 |
+
%.08.i = select i1 %.not8.i, float %100, float %99, !dbg !21
|
138 |
+
br i1 %66, label %101, label %__nv_erff.exit, !dbg !21
|
139 |
+
|
140 |
+
101: ; preds = %__internal_fmad.exit.i
|
141 |
+
%102 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !21
|
142 |
+
%103 = fsub float 1.000000e+00, %102, !dbg !21
|
143 |
+
%104 = bitcast float %103 to i32, !dbg !21
|
144 |
+
%105 = bitcast float %55 to i32, !dbg !21
|
145 |
+
%106 = and i32 %105, -2147483648, !dbg !21
|
146 |
+
%107 = or i32 %106, %104, !dbg !21
|
147 |
+
%108 = bitcast i32 %107 to float, !dbg !21
|
148 |
+
br label %__nv_erff.exit, !dbg !21
|
149 |
+
|
150 |
+
__nv_erff.exit: ; preds = %__internal_fmad.exit.i, %101
|
151 |
+
%r.0.i = phi float [ %108, %101 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !21
|
152 |
+
%109 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
153 |
+
%.not.i8 = icmp eq i32 %109, 0, !dbg !21
|
154 |
+
%110 = tail call float @llvm.nvvm.fabs.ftz.f(float %56) #4, !dbg !21
|
155 |
+
%111 = tail call float @llvm.nvvm.fabs.f(float %56) #4, !dbg !21
|
156 |
+
%.0.i9 = select i1 %.not.i8, float %111, float %110, !dbg !21
|
157 |
+
%112 = fcmp oge float %.0.i9, 0x3FF00C1FC0000000, !dbg !21
|
158 |
+
br i1 %112, label %__nv_fabsf.exit1.i26, label %114, !dbg !21
|
159 |
+
|
160 |
+
__nv_fabsf.exit1.i26: ; preds = %__nv_erff.exit
|
161 |
+
%113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
162 |
+
%.not1.i27 = icmp eq i32 %113, 0, !dbg !21
|
163 |
+
%.01.i28 = select i1 %.not1.i27, float %111, float %110, !dbg !21
|
164 |
+
br label %__internal_fmad.exit.i10, !dbg !21
|
165 |
+
|
166 |
+
114: ; preds = %__nv_erff.exit
|
167 |
+
%115 = fmul float %56, %56, !dbg !21
|
168 |
+
br label %__internal_fmad.exit.i10, !dbg !21
|
169 |
+
|
170 |
+
__internal_fmad.exit.i10: ; preds = %114, %__nv_fabsf.exit1.i26
|
171 |
+
%116 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i26 ], [ 0x3FC06EBA60000000, %114 ], !dbg !21
|
172 |
+
%117 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i26 ], [ 0xBFD8127580000000, %114 ], !dbg !21
|
173 |
+
%118 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i26 ], [ 0x3FBCE315E0000000, %114 ], !dbg !21
|
174 |
+
%119 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i26 ], [ 0xBF9B837CE0000000, %114 ], !dbg !21
|
175 |
+
%120 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i26 ], [ 0x3F755ABD40000000, %114 ], !dbg !21
|
176 |
+
%121 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i26 ], [ 0xBF4AE9A400000000, %114 ], !dbg !21
|
177 |
+
%122 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i26 ], [ 0x3F163D2D40000000, %114 ], !dbg !21
|
178 |
+
%123 = phi float [ %.01.i28, %__nv_fabsf.exit1.i26 ], [ %115, %114 ], !dbg !21
|
179 |
+
%124 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
180 |
+
%.not2.i11 = icmp eq i32 %124, 0, !dbg !21
|
181 |
+
%125 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %122, float %123, float %121) #4, !dbg !21
|
182 |
+
%126 = tail call float @llvm.nvvm.fma.rn.f(float %122, float %123, float %121) #4, !dbg !21
|
183 |
+
%.02.i12 = select i1 %.not2.i11, float %126, float %125, !dbg !21
|
184 |
+
%127 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
185 |
+
%.not3.i13 = icmp eq i32 %127, 0, !dbg !21
|
186 |
+
%128 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i12, float %123, float %120) #4, !dbg !21
|
187 |
+
%129 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i12, float %123, float %120) #4, !dbg !21
|
188 |
+
%.03.i14 = select i1 %.not3.i13, float %129, float %128, !dbg !21
|
189 |
+
%130 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
190 |
+
%.not4.i15 = icmp eq i32 %130, 0, !dbg !21
|
191 |
+
%131 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i14, float %123, float %119) #4, !dbg !21
|
192 |
+
%132 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i14, float %123, float %119) #4, !dbg !21
|
193 |
+
%.04.i16 = select i1 %.not4.i15, float %132, float %131, !dbg !21
|
194 |
+
%133 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
195 |
+
%.not5.i17 = icmp eq i32 %133, 0, !dbg !21
|
196 |
+
%134 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i16, float %123, float %118) #4, !dbg !21
|
197 |
+
%135 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i16, float %123, float %118) #4, !dbg !21
|
198 |
+
%.05.i18 = select i1 %.not5.i17, float %135, float %134, !dbg !21
|
199 |
+
%136 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
200 |
+
%.not6.i19 = icmp eq i32 %136, 0, !dbg !21
|
201 |
+
%137 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i18, float %123, float %117) #4, !dbg !21
|
202 |
+
%138 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i18, float %123, float %117) #4, !dbg !21
|
203 |
+
%.06.i20 = select i1 %.not6.i19, float %138, float %137, !dbg !21
|
204 |
+
%139 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
205 |
+
%.not7.i21 = icmp eq i32 %139, 0, !dbg !21
|
206 |
+
%140 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i20, float %123, float %116) #4, !dbg !21
|
207 |
+
%141 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i20, float %123, float %116) #4, !dbg !21
|
208 |
+
%.07.i22 = select i1 %.not7.i21, float %141, float %140, !dbg !21
|
209 |
+
%142 = fneg float %123, !dbg !21
|
210 |
+
%143 = select i1 %112, float %142, float %56, !dbg !21
|
211 |
+
%144 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
212 |
+
%.not8.i23 = icmp eq i32 %144, 0, !dbg !21
|
213 |
+
%145 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i22, float %143, float %143) #4, !dbg !21
|
214 |
+
%146 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i22, float %143, float %143) #4, !dbg !21
|
215 |
+
%.08.i24 = select i1 %.not8.i23, float %146, float %145, !dbg !21
|
216 |
+
br i1 %112, label %147, label %__nv_erff.exit29, !dbg !21
|
217 |
+
|
218 |
+
147: ; preds = %__internal_fmad.exit.i10
|
219 |
+
%148 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i24) #4, !dbg !21
|
220 |
+
%149 = fsub float 1.000000e+00, %148, !dbg !21
|
221 |
+
%150 = bitcast float %149 to i32, !dbg !21
|
222 |
+
%151 = bitcast float %56 to i32, !dbg !21
|
223 |
+
%152 = and i32 %151, -2147483648, !dbg !21
|
224 |
+
%153 = or i32 %152, %150, !dbg !21
|
225 |
+
%154 = bitcast i32 %153 to float, !dbg !21
|
226 |
+
br label %__nv_erff.exit29, !dbg !21
|
227 |
+
|
228 |
+
__nv_erff.exit29: ; preds = %__internal_fmad.exit.i10, %147
|
229 |
+
%r.0.i25 = phi float [ %154, %147 ], [ %.08.i24, %__internal_fmad.exit.i10 ], !dbg !21
|
230 |
+
%155 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
231 |
+
%.not.i30 = icmp eq i32 %155, 0, !dbg !21
|
232 |
+
%156 = tail call float @llvm.nvvm.fabs.ftz.f(float %57) #4, !dbg !21
|
233 |
+
%157 = tail call float @llvm.nvvm.fabs.f(float %57) #4, !dbg !21
|
234 |
+
%.0.i31 = select i1 %.not.i30, float %157, float %156, !dbg !21
|
235 |
+
%158 = fcmp oge float %.0.i31, 0x3FF00C1FC0000000, !dbg !21
|
236 |
+
br i1 %158, label %__nv_fabsf.exit1.i48, label %160, !dbg !21
|
237 |
+
|
238 |
+
__nv_fabsf.exit1.i48: ; preds = %__nv_erff.exit29
|
239 |
+
%159 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
240 |
+
%.not1.i49 = icmp eq i32 %159, 0, !dbg !21
|
241 |
+
%.01.i50 = select i1 %.not1.i49, float %157, float %156, !dbg !21
|
242 |
+
br label %__internal_fmad.exit.i32, !dbg !21
|
243 |
+
|
244 |
+
160: ; preds = %__nv_erff.exit29
|
245 |
+
%161 = fmul float %57, %57, !dbg !21
|
246 |
+
br label %__internal_fmad.exit.i32, !dbg !21
|
247 |
+
|
248 |
+
__internal_fmad.exit.i32: ; preds = %160, %__nv_fabsf.exit1.i48
|
249 |
+
%162 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i48 ], [ 0x3FC06EBA60000000, %160 ], !dbg !21
|
250 |
+
%163 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i48 ], [ 0xBFD8127580000000, %160 ], !dbg !21
|
251 |
+
%164 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i48 ], [ 0x3FBCE315E0000000, %160 ], !dbg !21
|
252 |
+
%165 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i48 ], [ 0xBF9B837CE0000000, %160 ], !dbg !21
|
253 |
+
%166 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i48 ], [ 0x3F755ABD40000000, %160 ], !dbg !21
|
254 |
+
%167 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i48 ], [ 0xBF4AE9A400000000, %160 ], !dbg !21
|
255 |
+
%168 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i48 ], [ 0x3F163D2D40000000, %160 ], !dbg !21
|
256 |
+
%169 = phi float [ %.01.i50, %__nv_fabsf.exit1.i48 ], [ %161, %160 ], !dbg !21
|
257 |
+
%170 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
258 |
+
%.not2.i33 = icmp eq i32 %170, 0, !dbg !21
|
259 |
+
%171 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %168, float %169, float %167) #4, !dbg !21
|
260 |
+
%172 = tail call float @llvm.nvvm.fma.rn.f(float %168, float %169, float %167) #4, !dbg !21
|
261 |
+
%.02.i34 = select i1 %.not2.i33, float %172, float %171, !dbg !21
|
262 |
+
%173 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
263 |
+
%.not3.i35 = icmp eq i32 %173, 0, !dbg !21
|
264 |
+
%174 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i34, float %169, float %166) #4, !dbg !21
|
265 |
+
%175 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i34, float %169, float %166) #4, !dbg !21
|
266 |
+
%.03.i36 = select i1 %.not3.i35, float %175, float %174, !dbg !21
|
267 |
+
%176 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
268 |
+
%.not4.i37 = icmp eq i32 %176, 0, !dbg !21
|
269 |
+
%177 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i36, float %169, float %165) #4, !dbg !21
|
270 |
+
%178 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i36, float %169, float %165) #4, !dbg !21
|
271 |
+
%.04.i38 = select i1 %.not4.i37, float %178, float %177, !dbg !21
|
272 |
+
%179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
273 |
+
%.not5.i39 = icmp eq i32 %179, 0, !dbg !21
|
274 |
+
%180 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i38, float %169, float %164) #4, !dbg !21
|
275 |
+
%181 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i38, float %169, float %164) #4, !dbg !21
|
276 |
+
%.05.i40 = select i1 %.not5.i39, float %181, float %180, !dbg !21
|
277 |
+
%182 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
278 |
+
%.not6.i41 = icmp eq i32 %182, 0, !dbg !21
|
279 |
+
%183 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i40, float %169, float %163) #4, !dbg !21
|
280 |
+
%184 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i40, float %169, float %163) #4, !dbg !21
|
281 |
+
%.06.i42 = select i1 %.not6.i41, float %184, float %183, !dbg !21
|
282 |
+
%185 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
283 |
+
%.not7.i43 = icmp eq i32 %185, 0, !dbg !21
|
284 |
+
%186 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i42, float %169, float %162) #4, !dbg !21
|
285 |
+
%187 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i42, float %169, float %162) #4, !dbg !21
|
286 |
+
%.07.i44 = select i1 %.not7.i43, float %187, float %186, !dbg !21
|
287 |
+
%188 = fneg float %169, !dbg !21
|
288 |
+
%189 = select i1 %158, float %188, float %57, !dbg !21
|
289 |
+
%190 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
290 |
+
%.not8.i45 = icmp eq i32 %190, 0, !dbg !21
|
291 |
+
%191 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i44, float %189, float %189) #4, !dbg !21
|
292 |
+
%192 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i44, float %189, float %189) #4, !dbg !21
|
293 |
+
%.08.i46 = select i1 %.not8.i45, float %192, float %191, !dbg !21
|
294 |
+
br i1 %158, label %193, label %__nv_erff.exit51, !dbg !21
|
295 |
+
|
296 |
+
193: ; preds = %__internal_fmad.exit.i32
|
297 |
+
%194 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i46) #4, !dbg !21
|
298 |
+
%195 = fsub float 1.000000e+00, %194, !dbg !21
|
299 |
+
%196 = bitcast float %195 to i32, !dbg !21
|
300 |
+
%197 = bitcast float %57 to i32, !dbg !21
|
301 |
+
%198 = and i32 %197, -2147483648, !dbg !21
|
302 |
+
%199 = or i32 %198, %196, !dbg !21
|
303 |
+
%200 = bitcast i32 %199 to float, !dbg !21
|
304 |
+
br label %__nv_erff.exit51, !dbg !21
|
305 |
+
|
306 |
+
__nv_erff.exit51: ; preds = %__internal_fmad.exit.i32, %193
|
307 |
+
%r.0.i47 = phi float [ %200, %193 ], [ %.08.i46, %__internal_fmad.exit.i32 ], !dbg !21
|
308 |
+
%201 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
309 |
+
%.not.i52 = icmp eq i32 %201, 0, !dbg !21
|
310 |
+
%202 = tail call float @llvm.nvvm.fabs.ftz.f(float %58) #4, !dbg !21
|
311 |
+
%203 = tail call float @llvm.nvvm.fabs.f(float %58) #4, !dbg !21
|
312 |
+
%.0.i53 = select i1 %.not.i52, float %203, float %202, !dbg !21
|
313 |
+
%204 = fcmp oge float %.0.i53, 0x3FF00C1FC0000000, !dbg !21
|
314 |
+
br i1 %204, label %__nv_fabsf.exit1.i70, label %206, !dbg !21
|
315 |
+
|
316 |
+
__nv_fabsf.exit1.i70: ; preds = %__nv_erff.exit51
|
317 |
+
%205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
318 |
+
%.not1.i71 = icmp eq i32 %205, 0, !dbg !21
|
319 |
+
%.01.i72 = select i1 %.not1.i71, float %203, float %202, !dbg !21
|
320 |
+
br label %__internal_fmad.exit.i54, !dbg !21
|
321 |
+
|
322 |
+
206: ; preds = %__nv_erff.exit51
|
323 |
+
%207 = fmul float %58, %58, !dbg !21
|
324 |
+
br label %__internal_fmad.exit.i54, !dbg !21
|
325 |
+
|
326 |
+
__internal_fmad.exit.i54: ; preds = %206, %__nv_fabsf.exit1.i70
|
327 |
+
%208 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i70 ], [ 0x3FC06EBA60000000, %206 ], !dbg !21
|
328 |
+
%209 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i70 ], [ 0xBFD8127580000000, %206 ], !dbg !21
|
329 |
+
%210 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i70 ], [ 0x3FBCE315E0000000, %206 ], !dbg !21
|
330 |
+
%211 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i70 ], [ 0xBF9B837CE0000000, %206 ], !dbg !21
|
331 |
+
%212 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i70 ], [ 0x3F755ABD40000000, %206 ], !dbg !21
|
332 |
+
%213 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i70 ], [ 0xBF4AE9A400000000, %206 ], !dbg !21
|
333 |
+
%214 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i70 ], [ 0x3F163D2D40000000, %206 ], !dbg !21
|
334 |
+
%215 = phi float [ %.01.i72, %__nv_fabsf.exit1.i70 ], [ %207, %206 ], !dbg !21
|
335 |
+
%216 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
336 |
+
%.not2.i55 = icmp eq i32 %216, 0, !dbg !21
|
337 |
+
%217 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %214, float %215, float %213) #4, !dbg !21
|
338 |
+
%218 = tail call float @llvm.nvvm.fma.rn.f(float %214, float %215, float %213) #4, !dbg !21
|
339 |
+
%.02.i56 = select i1 %.not2.i55, float %218, float %217, !dbg !21
|
340 |
+
%219 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
341 |
+
%.not3.i57 = icmp eq i32 %219, 0, !dbg !21
|
342 |
+
%220 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i56, float %215, float %212) #4, !dbg !21
|
343 |
+
%221 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i56, float %215, float %212) #4, !dbg !21
|
344 |
+
%.03.i58 = select i1 %.not3.i57, float %221, float %220, !dbg !21
|
345 |
+
%222 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
346 |
+
%.not4.i59 = icmp eq i32 %222, 0, !dbg !21
|
347 |
+
%223 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i58, float %215, float %211) #4, !dbg !21
|
348 |
+
%224 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i58, float %215, float %211) #4, !dbg !21
|
349 |
+
%.04.i60 = select i1 %.not4.i59, float %224, float %223, !dbg !21
|
350 |
+
%225 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
351 |
+
%.not5.i61 = icmp eq i32 %225, 0, !dbg !21
|
352 |
+
%226 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i60, float %215, float %210) #4, !dbg !21
|
353 |
+
%227 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i60, float %215, float %210) #4, !dbg !21
|
354 |
+
%.05.i62 = select i1 %.not5.i61, float %227, float %226, !dbg !21
|
355 |
+
%228 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
356 |
+
%.not6.i63 = icmp eq i32 %228, 0, !dbg !21
|
357 |
+
%229 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i62, float %215, float %209) #4, !dbg !21
|
358 |
+
%230 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i62, float %215, float %209) #4, !dbg !21
|
359 |
+
%.06.i64 = select i1 %.not6.i63, float %230, float %229, !dbg !21
|
360 |
+
%231 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
361 |
+
%.not7.i65 = icmp eq i32 %231, 0, !dbg !21
|
362 |
+
%232 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i64, float %215, float %208) #4, !dbg !21
|
363 |
+
%233 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i64, float %215, float %208) #4, !dbg !21
|
364 |
+
%.07.i66 = select i1 %.not7.i65, float %233, float %232, !dbg !21
|
365 |
+
%234 = fneg float %215, !dbg !21
|
366 |
+
%235 = select i1 %204, float %234, float %58, !dbg !21
|
367 |
+
%236 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
368 |
+
%.not8.i67 = icmp eq i32 %236, 0, !dbg !21
|
369 |
+
%237 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i66, float %235, float %235) #4, !dbg !21
|
370 |
+
%238 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i66, float %235, float %235) #4, !dbg !21
|
371 |
+
%.08.i68 = select i1 %.not8.i67, float %238, float %237, !dbg !21
|
372 |
+
br i1 %204, label %239, label %__nv_erff.exit73, !dbg !21
|
373 |
+
|
374 |
+
239: ; preds = %__internal_fmad.exit.i54
|
375 |
+
%240 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i68) #4, !dbg !21
|
376 |
+
%241 = fsub float 1.000000e+00, %240, !dbg !21
|
377 |
+
%242 = bitcast float %241 to i32, !dbg !21
|
378 |
+
%243 = bitcast float %58 to i32, !dbg !21
|
379 |
+
%244 = and i32 %243, -2147483648, !dbg !21
|
380 |
+
%245 = or i32 %244, %242, !dbg !21
|
381 |
+
%246 = bitcast i32 %245 to float, !dbg !21
|
382 |
+
br label %__nv_erff.exit73, !dbg !21
|
383 |
+
|
384 |
+
__nv_erff.exit73: ; preds = %__internal_fmad.exit.i54, %239
|
385 |
+
%r.0.i69 = phi float [ %246, %239 ], [ %.08.i68, %__internal_fmad.exit.i54 ], !dbg !21
|
386 |
+
%247 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
387 |
+
%.not.i74 = icmp eq i32 %247, 0, !dbg !21
|
388 |
+
%248 = tail call float @llvm.nvvm.fabs.ftz.f(float %59) #4, !dbg !21
|
389 |
+
%249 = tail call float @llvm.nvvm.fabs.f(float %59) #4, !dbg !21
|
390 |
+
%.0.i75 = select i1 %.not.i74, float %249, float %248, !dbg !21
|
391 |
+
%250 = fcmp oge float %.0.i75, 0x3FF00C1FC0000000, !dbg !21
|
392 |
+
br i1 %250, label %__nv_fabsf.exit1.i92, label %252, !dbg !21
|
393 |
+
|
394 |
+
__nv_fabsf.exit1.i92: ; preds = %__nv_erff.exit73
|
395 |
+
%251 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
396 |
+
%.not1.i93 = icmp eq i32 %251, 0, !dbg !21
|
397 |
+
%.01.i94 = select i1 %.not1.i93, float %249, float %248, !dbg !21
|
398 |
+
br label %__internal_fmad.exit.i76, !dbg !21
|
399 |
+
|
400 |
+
252: ; preds = %__nv_erff.exit73
|
401 |
+
%253 = fmul float %59, %59, !dbg !21
|
402 |
+
br label %__internal_fmad.exit.i76, !dbg !21
|
403 |
+
|
404 |
+
__internal_fmad.exit.i76: ; preds = %252, %__nv_fabsf.exit1.i92
|
405 |
+
%254 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i92 ], [ 0x3FC06EBA60000000, %252 ], !dbg !21
|
406 |
+
%255 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i92 ], [ 0xBFD8127580000000, %252 ], !dbg !21
|
407 |
+
%256 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i92 ], [ 0x3FBCE315E0000000, %252 ], !dbg !21
|
408 |
+
%257 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i92 ], [ 0xBF9B837CE0000000, %252 ], !dbg !21
|
409 |
+
%258 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i92 ], [ 0x3F755ABD40000000, %252 ], !dbg !21
|
410 |
+
%259 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i92 ], [ 0xBF4AE9A400000000, %252 ], !dbg !21
|
411 |
+
%260 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i92 ], [ 0x3F163D2D40000000, %252 ], !dbg !21
|
412 |
+
%261 = phi float [ %.01.i94, %__nv_fabsf.exit1.i92 ], [ %253, %252 ], !dbg !21
|
413 |
+
%262 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
414 |
+
%.not2.i77 = icmp eq i32 %262, 0, !dbg !21
|
415 |
+
%263 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %260, float %261, float %259) #4, !dbg !21
|
416 |
+
%264 = tail call float @llvm.nvvm.fma.rn.f(float %260, float %261, float %259) #4, !dbg !21
|
417 |
+
%.02.i78 = select i1 %.not2.i77, float %264, float %263, !dbg !21
|
418 |
+
%265 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
419 |
+
%.not3.i79 = icmp eq i32 %265, 0, !dbg !21
|
420 |
+
%266 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i78, float %261, float %258) #4, !dbg !21
|
421 |
+
%267 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i78, float %261, float %258) #4, !dbg !21
|
422 |
+
%.03.i80 = select i1 %.not3.i79, float %267, float %266, !dbg !21
|
423 |
+
%268 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
424 |
+
%.not4.i81 = icmp eq i32 %268, 0, !dbg !21
|
425 |
+
%269 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i80, float %261, float %257) #4, !dbg !21
|
426 |
+
%270 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i80, float %261, float %257) #4, !dbg !21
|
427 |
+
%.04.i82 = select i1 %.not4.i81, float %270, float %269, !dbg !21
|
428 |
+
%271 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
429 |
+
%.not5.i83 = icmp eq i32 %271, 0, !dbg !21
|
430 |
+
%272 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i82, float %261, float %256) #4, !dbg !21
|
431 |
+
%273 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i82, float %261, float %256) #4, !dbg !21
|
432 |
+
%.05.i84 = select i1 %.not5.i83, float %273, float %272, !dbg !21
|
433 |
+
%274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
434 |
+
%.not6.i85 = icmp eq i32 %274, 0, !dbg !21
|
435 |
+
%275 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i84, float %261, float %255) #4, !dbg !21
|
436 |
+
%276 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i84, float %261, float %255) #4, !dbg !21
|
437 |
+
%.06.i86 = select i1 %.not6.i85, float %276, float %275, !dbg !21
|
438 |
+
%277 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
439 |
+
%.not7.i87 = icmp eq i32 %277, 0, !dbg !21
|
440 |
+
%278 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i86, float %261, float %254) #4, !dbg !21
|
441 |
+
%279 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i86, float %261, float %254) #4, !dbg !21
|
442 |
+
%.07.i88 = select i1 %.not7.i87, float %279, float %278, !dbg !21
|
443 |
+
%280 = fneg float %261, !dbg !21
|
444 |
+
%281 = select i1 %250, float %280, float %59, !dbg !21
|
445 |
+
%282 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
446 |
+
%.not8.i89 = icmp eq i32 %282, 0, !dbg !21
|
447 |
+
%283 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i88, float %281, float %281) #4, !dbg !21
|
448 |
+
%284 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i88, float %281, float %281) #4, !dbg !21
|
449 |
+
%.08.i90 = select i1 %.not8.i89, float %284, float %283, !dbg !21
|
450 |
+
br i1 %250, label %285, label %__nv_erff.exit95, !dbg !21
|
451 |
+
|
452 |
+
285: ; preds = %__internal_fmad.exit.i76
|
453 |
+
%286 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i90) #4, !dbg !21
|
454 |
+
%287 = fsub float 1.000000e+00, %286, !dbg !21
|
455 |
+
%288 = bitcast float %287 to i32, !dbg !21
|
456 |
+
%289 = bitcast float %59 to i32, !dbg !21
|
457 |
+
%290 = and i32 %289, -2147483648, !dbg !21
|
458 |
+
%291 = or i32 %290, %288, !dbg !21
|
459 |
+
%292 = bitcast i32 %291 to float, !dbg !21
|
460 |
+
br label %__nv_erff.exit95, !dbg !21
|
461 |
+
|
462 |
+
__nv_erff.exit95: ; preds = %__internal_fmad.exit.i76, %285
|
463 |
+
%r.0.i91 = phi float [ %292, %285 ], [ %.08.i90, %__internal_fmad.exit.i76 ], !dbg !21
|
464 |
+
%293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
465 |
+
%.not.i96 = icmp eq i32 %293, 0, !dbg !21
|
466 |
+
%294 = tail call float @llvm.nvvm.fabs.ftz.f(float %60) #4, !dbg !21
|
467 |
+
%295 = tail call float @llvm.nvvm.fabs.f(float %60) #4, !dbg !21
|
468 |
+
%.0.i97 = select i1 %.not.i96, float %295, float %294, !dbg !21
|
469 |
+
%296 = fcmp oge float %.0.i97, 0x3FF00C1FC0000000, !dbg !21
|
470 |
+
br i1 %296, label %__nv_fabsf.exit1.i114, label %298, !dbg !21
|
471 |
+
|
472 |
+
__nv_fabsf.exit1.i114: ; preds = %__nv_erff.exit95
|
473 |
+
%297 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
474 |
+
%.not1.i115 = icmp eq i32 %297, 0, !dbg !21
|
475 |
+
%.01.i116 = select i1 %.not1.i115, float %295, float %294, !dbg !21
|
476 |
+
br label %__internal_fmad.exit.i98, !dbg !21
|
477 |
+
|
478 |
+
298: ; preds = %__nv_erff.exit95
|
479 |
+
%299 = fmul float %60, %60, !dbg !21
|
480 |
+
br label %__internal_fmad.exit.i98, !dbg !21
|
481 |
+
|
482 |
+
__internal_fmad.exit.i98: ; preds = %298, %__nv_fabsf.exit1.i114
|
483 |
+
%300 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i114 ], [ 0x3FC06EBA60000000, %298 ], !dbg !21
|
484 |
+
%301 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i114 ], [ 0xBFD8127580000000, %298 ], !dbg !21
|
485 |
+
%302 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i114 ], [ 0x3FBCE315E0000000, %298 ], !dbg !21
|
486 |
+
%303 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i114 ], [ 0xBF9B837CE0000000, %298 ], !dbg !21
|
487 |
+
%304 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i114 ], [ 0x3F755ABD40000000, %298 ], !dbg !21
|
488 |
+
%305 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i114 ], [ 0xBF4AE9A400000000, %298 ], !dbg !21
|
489 |
+
%306 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i114 ], [ 0x3F163D2D40000000, %298 ], !dbg !21
|
490 |
+
%307 = phi float [ %.01.i116, %__nv_fabsf.exit1.i114 ], [ %299, %298 ], !dbg !21
|
491 |
+
%308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
492 |
+
%.not2.i99 = icmp eq i32 %308, 0, !dbg !21
|
493 |
+
%309 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %306, float %307, float %305) #4, !dbg !21
|
494 |
+
%310 = tail call float @llvm.nvvm.fma.rn.f(float %306, float %307, float %305) #4, !dbg !21
|
495 |
+
%.02.i100 = select i1 %.not2.i99, float %310, float %309, !dbg !21
|
496 |
+
%311 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
497 |
+
%.not3.i101 = icmp eq i32 %311, 0, !dbg !21
|
498 |
+
%312 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i100, float %307, float %304) #4, !dbg !21
|
499 |
+
%313 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i100, float %307, float %304) #4, !dbg !21
|
500 |
+
%.03.i102 = select i1 %.not3.i101, float %313, float %312, !dbg !21
|
501 |
+
%314 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
502 |
+
%.not4.i103 = icmp eq i32 %314, 0, !dbg !21
|
503 |
+
%315 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i102, float %307, float %303) #4, !dbg !21
|
504 |
+
%316 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i102, float %307, float %303) #4, !dbg !21
|
505 |
+
%.04.i104 = select i1 %.not4.i103, float %316, float %315, !dbg !21
|
506 |
+
%317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
507 |
+
%.not5.i105 = icmp eq i32 %317, 0, !dbg !21
|
508 |
+
%318 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i104, float %307, float %302) #4, !dbg !21
|
509 |
+
%319 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i104, float %307, float %302) #4, !dbg !21
|
510 |
+
%.05.i106 = select i1 %.not5.i105, float %319, float %318, !dbg !21
|
511 |
+
%320 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
512 |
+
%.not6.i107 = icmp eq i32 %320, 0, !dbg !21
|
513 |
+
%321 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i106, float %307, float %301) #4, !dbg !21
|
514 |
+
%322 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i106, float %307, float %301) #4, !dbg !21
|
515 |
+
%.06.i108 = select i1 %.not6.i107, float %322, float %321, !dbg !21
|
516 |
+
%323 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
517 |
+
%.not7.i109 = icmp eq i32 %323, 0, !dbg !21
|
518 |
+
%324 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i108, float %307, float %300) #4, !dbg !21
|
519 |
+
%325 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i108, float %307, float %300) #4, !dbg !21
|
520 |
+
%.07.i110 = select i1 %.not7.i109, float %325, float %324, !dbg !21
|
521 |
+
%326 = fneg float %307, !dbg !21
|
522 |
+
%327 = select i1 %296, float %326, float %60, !dbg !21
|
523 |
+
%328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
524 |
+
%.not8.i111 = icmp eq i32 %328, 0, !dbg !21
|
525 |
+
%329 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i110, float %327, float %327) #4, !dbg !21
|
526 |
+
%330 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i110, float %327, float %327) #4, !dbg !21
|
527 |
+
%.08.i112 = select i1 %.not8.i111, float %330, float %329, !dbg !21
|
528 |
+
br i1 %296, label %331, label %__nv_erff.exit117, !dbg !21
|
529 |
+
|
530 |
+
331: ; preds = %__internal_fmad.exit.i98
|
531 |
+
%332 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i112) #4, !dbg !21
|
532 |
+
%333 = fsub float 1.000000e+00, %332, !dbg !21
|
533 |
+
%334 = bitcast float %333 to i32, !dbg !21
|
534 |
+
%335 = bitcast float %60 to i32, !dbg !21
|
535 |
+
%336 = and i32 %335, -2147483648, !dbg !21
|
536 |
+
%337 = or i32 %336, %334, !dbg !21
|
537 |
+
%338 = bitcast i32 %337 to float, !dbg !21
|
538 |
+
br label %__nv_erff.exit117, !dbg !21
|
539 |
+
|
540 |
+
__nv_erff.exit117: ; preds = %__internal_fmad.exit.i98, %331
|
541 |
+
%r.0.i113 = phi float [ %338, %331 ], [ %.08.i112, %__internal_fmad.exit.i98 ], !dbg !21
|
542 |
+
%339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
543 |
+
%.not.i118 = icmp eq i32 %339, 0, !dbg !21
|
544 |
+
%340 = tail call float @llvm.nvvm.fabs.ftz.f(float %61) #4, !dbg !21
|
545 |
+
%341 = tail call float @llvm.nvvm.fabs.f(float %61) #4, !dbg !21
|
546 |
+
%.0.i119 = select i1 %.not.i118, float %341, float %340, !dbg !21
|
547 |
+
%342 = fcmp oge float %.0.i119, 0x3FF00C1FC0000000, !dbg !21
|
548 |
+
br i1 %342, label %__nv_fabsf.exit1.i136, label %344, !dbg !21
|
549 |
+
|
550 |
+
__nv_fabsf.exit1.i136: ; preds = %__nv_erff.exit117
|
551 |
+
%343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
552 |
+
%.not1.i137 = icmp eq i32 %343, 0, !dbg !21
|
553 |
+
%.01.i138 = select i1 %.not1.i137, float %341, float %340, !dbg !21
|
554 |
+
br label %__internal_fmad.exit.i120, !dbg !21
|
555 |
+
|
556 |
+
344: ; preds = %__nv_erff.exit117
|
557 |
+
%345 = fmul float %61, %61, !dbg !21
|
558 |
+
br label %__internal_fmad.exit.i120, !dbg !21
|
559 |
+
|
560 |
+
__internal_fmad.exit.i120: ; preds = %344, %__nv_fabsf.exit1.i136
|
561 |
+
%346 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i136 ], [ 0x3FC06EBA60000000, %344 ], !dbg !21
|
562 |
+
%347 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i136 ], [ 0xBFD8127580000000, %344 ], !dbg !21
|
563 |
+
%348 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i136 ], [ 0x3FBCE315E0000000, %344 ], !dbg !21
|
564 |
+
%349 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i136 ], [ 0xBF9B837CE0000000, %344 ], !dbg !21
|
565 |
+
%350 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i136 ], [ 0x3F755ABD40000000, %344 ], !dbg !21
|
566 |
+
%351 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i136 ], [ 0xBF4AE9A400000000, %344 ], !dbg !21
|
567 |
+
%352 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i136 ], [ 0x3F163D2D40000000, %344 ], !dbg !21
|
568 |
+
%353 = phi float [ %.01.i138, %__nv_fabsf.exit1.i136 ], [ %345, %344 ], !dbg !21
|
569 |
+
%354 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
570 |
+
%.not2.i121 = icmp eq i32 %354, 0, !dbg !21
|
571 |
+
%355 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %352, float %353, float %351) #4, !dbg !21
|
572 |
+
%356 = tail call float @llvm.nvvm.fma.rn.f(float %352, float %353, float %351) #4, !dbg !21
|
573 |
+
%.02.i122 = select i1 %.not2.i121, float %356, float %355, !dbg !21
|
574 |
+
%357 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
575 |
+
%.not3.i123 = icmp eq i32 %357, 0, !dbg !21
|
576 |
+
%358 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i122, float %353, float %350) #4, !dbg !21
|
577 |
+
%359 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i122, float %353, float %350) #4, !dbg !21
|
578 |
+
%.03.i124 = select i1 %.not3.i123, float %359, float %358, !dbg !21
|
579 |
+
%360 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
580 |
+
%.not4.i125 = icmp eq i32 %360, 0, !dbg !21
|
581 |
+
%361 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i124, float %353, float %349) #4, !dbg !21
|
582 |
+
%362 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i124, float %353, float %349) #4, !dbg !21
|
583 |
+
%.04.i126 = select i1 %.not4.i125, float %362, float %361, !dbg !21
|
584 |
+
%363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
585 |
+
%.not5.i127 = icmp eq i32 %363, 0, !dbg !21
|
586 |
+
%364 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i126, float %353, float %348) #4, !dbg !21
|
587 |
+
%365 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i126, float %353, float %348) #4, !dbg !21
|
588 |
+
%.05.i128 = select i1 %.not5.i127, float %365, float %364, !dbg !21
|
589 |
+
%366 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
590 |
+
%.not6.i129 = icmp eq i32 %366, 0, !dbg !21
|
591 |
+
%367 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i128, float %353, float %347) #4, !dbg !21
|
592 |
+
%368 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i128, float %353, float %347) #4, !dbg !21
|
593 |
+
%.06.i130 = select i1 %.not6.i129, float %368, float %367, !dbg !21
|
594 |
+
%369 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
595 |
+
%.not7.i131 = icmp eq i32 %369, 0, !dbg !21
|
596 |
+
%370 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i130, float %353, float %346) #4, !dbg !21
|
597 |
+
%371 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i130, float %353, float %346) #4, !dbg !21
|
598 |
+
%.07.i132 = select i1 %.not7.i131, float %371, float %370, !dbg !21
|
599 |
+
%372 = fneg float %353, !dbg !21
|
600 |
+
%373 = select i1 %342, float %372, float %61, !dbg !21
|
601 |
+
%374 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
602 |
+
%.not8.i133 = icmp eq i32 %374, 0, !dbg !21
|
603 |
+
%375 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i132, float %373, float %373) #4, !dbg !21
|
604 |
+
%376 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i132, float %373, float %373) #4, !dbg !21
|
605 |
+
%.08.i134 = select i1 %.not8.i133, float %376, float %375, !dbg !21
|
606 |
+
br i1 %342, label %377, label %__nv_erff.exit139, !dbg !21
|
607 |
+
|
608 |
+
377: ; preds = %__internal_fmad.exit.i120
|
609 |
+
%378 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i134) #4, !dbg !21
|
610 |
+
%379 = fsub float 1.000000e+00, %378, !dbg !21
|
611 |
+
%380 = bitcast float %379 to i32, !dbg !21
|
612 |
+
%381 = bitcast float %61 to i32, !dbg !21
|
613 |
+
%382 = and i32 %381, -2147483648, !dbg !21
|
614 |
+
%383 = or i32 %382, %380, !dbg !21
|
615 |
+
%384 = bitcast i32 %383 to float, !dbg !21
|
616 |
+
br label %__nv_erff.exit139, !dbg !21
|
617 |
+
|
618 |
+
__nv_erff.exit139: ; preds = %__internal_fmad.exit.i120, %377
|
619 |
+
%r.0.i135 = phi float [ %384, %377 ], [ %.08.i134, %__internal_fmad.exit.i120 ], !dbg !21
|
620 |
+
%385 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
621 |
+
%.not.i140 = icmp eq i32 %385, 0, !dbg !21
|
622 |
+
%386 = tail call float @llvm.nvvm.fabs.ftz.f(float %62) #4, !dbg !21
|
623 |
+
%387 = tail call float @llvm.nvvm.fabs.f(float %62) #4, !dbg !21
|
624 |
+
%.0.i141 = select i1 %.not.i140, float %387, float %386, !dbg !21
|
625 |
+
%388 = fcmp oge float %.0.i141, 0x3FF00C1FC0000000, !dbg !21
|
626 |
+
br i1 %388, label %__nv_fabsf.exit1.i158, label %390, !dbg !21
|
627 |
+
|
628 |
+
__nv_fabsf.exit1.i158: ; preds = %__nv_erff.exit139
|
629 |
+
%389 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
630 |
+
%.not1.i159 = icmp eq i32 %389, 0, !dbg !21
|
631 |
+
%.01.i160 = select i1 %.not1.i159, float %387, float %386, !dbg !21
|
632 |
+
br label %__internal_fmad.exit.i142, !dbg !21
|
633 |
+
|
634 |
+
390: ; preds = %__nv_erff.exit139
|
635 |
+
%391 = fmul float %62, %62, !dbg !21
|
636 |
+
br label %__internal_fmad.exit.i142, !dbg !21
|
637 |
+
|
638 |
+
__internal_fmad.exit.i142: ; preds = %390, %__nv_fabsf.exit1.i158
|
639 |
+
%392 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i158 ], [ 0x3FC06EBA60000000, %390 ], !dbg !21
|
640 |
+
%393 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i158 ], [ 0xBFD8127580000000, %390 ], !dbg !21
|
641 |
+
%394 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i158 ], [ 0x3FBCE315E0000000, %390 ], !dbg !21
|
642 |
+
%395 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i158 ], [ 0xBF9B837CE0000000, %390 ], !dbg !21
|
643 |
+
%396 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i158 ], [ 0x3F755ABD40000000, %390 ], !dbg !21
|
644 |
+
%397 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i158 ], [ 0xBF4AE9A400000000, %390 ], !dbg !21
|
645 |
+
%398 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i158 ], [ 0x3F163D2D40000000, %390 ], !dbg !21
|
646 |
+
%399 = phi float [ %.01.i160, %__nv_fabsf.exit1.i158 ], [ %391, %390 ], !dbg !21
|
647 |
+
%400 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
648 |
+
%.not2.i143 = icmp eq i32 %400, 0, !dbg !21
|
649 |
+
%401 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %398, float %399, float %397) #4, !dbg !21
|
650 |
+
%402 = tail call float @llvm.nvvm.fma.rn.f(float %398, float %399, float %397) #4, !dbg !21
|
651 |
+
%.02.i144 = select i1 %.not2.i143, float %402, float %401, !dbg !21
|
652 |
+
%403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
653 |
+
%.not3.i145 = icmp eq i32 %403, 0, !dbg !21
|
654 |
+
%404 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i144, float %399, float %396) #4, !dbg !21
|
655 |
+
%405 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i144, float %399, float %396) #4, !dbg !21
|
656 |
+
%.03.i146 = select i1 %.not3.i145, float %405, float %404, !dbg !21
|
657 |
+
%406 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
658 |
+
%.not4.i147 = icmp eq i32 %406, 0, !dbg !21
|
659 |
+
%407 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i146, float %399, float %395) #4, !dbg !21
|
660 |
+
%408 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i146, float %399, float %395) #4, !dbg !21
|
661 |
+
%.04.i148 = select i1 %.not4.i147, float %408, float %407, !dbg !21
|
662 |
+
%409 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
663 |
+
%.not5.i149 = icmp eq i32 %409, 0, !dbg !21
|
664 |
+
%410 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i148, float %399, float %394) #4, !dbg !21
|
665 |
+
%411 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i148, float %399, float %394) #4, !dbg !21
|
666 |
+
%.05.i150 = select i1 %.not5.i149, float %411, float %410, !dbg !21
|
667 |
+
%412 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
668 |
+
%.not6.i151 = icmp eq i32 %412, 0, !dbg !21
|
669 |
+
%413 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i150, float %399, float %393) #4, !dbg !21
|
670 |
+
%414 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i150, float %399, float %393) #4, !dbg !21
|
671 |
+
%.06.i152 = select i1 %.not6.i151, float %414, float %413, !dbg !21
|
672 |
+
%415 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
673 |
+
%.not7.i153 = icmp eq i32 %415, 0, !dbg !21
|
674 |
+
%416 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i152, float %399, float %392) #4, !dbg !21
|
675 |
+
%417 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i152, float %399, float %392) #4, !dbg !21
|
676 |
+
%.07.i154 = select i1 %.not7.i153, float %417, float %416, !dbg !21
|
677 |
+
%418 = fneg float %399, !dbg !21
|
678 |
+
%419 = select i1 %388, float %418, float %62, !dbg !21
|
679 |
+
%420 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
|
680 |
+
%.not8.i155 = icmp eq i32 %420, 0, !dbg !21
|
681 |
+
%421 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i154, float %419, float %419) #4, !dbg !21
|
682 |
+
%422 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i154, float %419, float %419) #4, !dbg !21
|
683 |
+
%.08.i156 = select i1 %.not8.i155, float %422, float %421, !dbg !21
|
684 |
+
br i1 %388, label %423, label %__nv_erff.exit161, !dbg !21
|
685 |
+
|
686 |
+
423: ; preds = %__internal_fmad.exit.i142
|
687 |
+
%424 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i156) #4, !dbg !21
|
688 |
+
%425 = fsub float 1.000000e+00, %424, !dbg !21
|
689 |
+
%426 = bitcast float %425 to i32, !dbg !21
|
690 |
+
%427 = bitcast float %62 to i32, !dbg !21
|
691 |
+
%428 = and i32 %427, -2147483648, !dbg !21
|
692 |
+
%429 = or i32 %428, %426, !dbg !21
|
693 |
+
%430 = bitcast i32 %429 to float, !dbg !21
|
694 |
+
br label %__nv_erff.exit161, !dbg !21
|
695 |
+
|
696 |
+
__nv_erff.exit161: ; preds = %__internal_fmad.exit.i142, %423
|
697 |
+
%r.0.i157 = phi float [ %430, %423 ], [ %.08.i156, %__internal_fmad.exit.i142 ], !dbg !21
|
698 |
+
%431 = fadd float %r.0.i, 1.000000e+00, !dbg !22
|
699 |
+
%432 = fadd float %r.0.i25, 1.000000e+00, !dbg !22
|
700 |
+
%433 = fadd float %r.0.i47, 1.000000e+00, !dbg !22
|
701 |
+
%434 = fadd float %r.0.i69, 1.000000e+00, !dbg !22
|
702 |
+
%435 = fadd float %r.0.i91, 1.000000e+00, !dbg !22
|
703 |
+
%436 = fadd float %r.0.i113, 1.000000e+00, !dbg !22
|
704 |
+
%437 = fadd float %r.0.i135, 1.000000e+00, !dbg !22
|
705 |
+
%438 = fadd float %r.0.i157, 1.000000e+00, !dbg !22
|
706 |
+
%439 = fmul float %431, 5.000000e-01, !dbg !23
|
707 |
+
%440 = fmul float %432, 5.000000e-01, !dbg !23
|
708 |
+
%441 = fmul float %433, 5.000000e-01, !dbg !23
|
709 |
+
%442 = fmul float %434, 5.000000e-01, !dbg !23
|
710 |
+
%443 = fmul float %435, 5.000000e-01, !dbg !23
|
711 |
+
%444 = fmul float %436, 5.000000e-01, !dbg !23
|
712 |
+
%445 = fmul float %437, 5.000000e-01, !dbg !23
|
713 |
+
%446 = fmul float %438, 5.000000e-01, !dbg !23
|
714 |
+
%447 = fmul float %47, %47, !dbg !24
|
715 |
+
%448 = fmul float %48, %48, !dbg !24
|
716 |
+
%449 = fmul float %49, %49, !dbg !24
|
717 |
+
%450 = fmul float %50, %50, !dbg !24
|
718 |
+
%451 = fmul float %51, %51, !dbg !24
|
719 |
+
%452 = fmul float %52, %52, !dbg !24
|
720 |
+
%453 = fmul float %53, %53, !dbg !24
|
721 |
+
%454 = fmul float %54, %54, !dbg !24
|
722 |
+
%455 = fmul float %447, -5.000000e-01, !dbg !25
|
723 |
+
%456 = fmul float %448, -5.000000e-01, !dbg !25
|
724 |
+
%457 = fmul float %449, -5.000000e-01, !dbg !25
|
725 |
+
%458 = fmul float %450, -5.000000e-01, !dbg !25
|
726 |
+
%459 = fmul float %451, -5.000000e-01, !dbg !25
|
727 |
+
%460 = fmul float %452, -5.000000e-01, !dbg !25
|
728 |
+
%461 = fmul float %453, -5.000000e-01, !dbg !25
|
729 |
+
%462 = fmul float %454, -5.000000e-01, !dbg !25
|
730 |
+
%463 = fmul float %455, 0x3FF7154760000000, !dbg !26
|
731 |
+
%464 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %463) #4, !dbg !26
|
732 |
+
%465 = fmul float %456, 0x3FF7154760000000, !dbg !26
|
733 |
+
%466 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %465) #4, !dbg !26
|
734 |
+
%467 = fmul float %457, 0x3FF7154760000000, !dbg !26
|
735 |
+
%468 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %467) #4, !dbg !26
|
736 |
+
%469 = fmul float %458, 0x3FF7154760000000, !dbg !26
|
737 |
+
%470 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %469) #4, !dbg !26
|
738 |
+
%471 = fmul float %459, 0x3FF7154760000000, !dbg !26
|
739 |
+
%472 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %471) #4, !dbg !26
|
740 |
+
%473 = fmul float %460, 0x3FF7154760000000, !dbg !26
|
741 |
+
%474 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %473) #4, !dbg !26
|
742 |
+
%475 = fmul float %461, 0x3FF7154760000000, !dbg !26
|
743 |
+
%476 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %475) #4, !dbg !26
|
744 |
+
%477 = fmul float %462, 0x3FF7154760000000, !dbg !26
|
745 |
+
%478 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %477) #4, !dbg !26
|
746 |
+
%479 = fmul float %464, 0x3FD9884540000000, !dbg !27
|
747 |
+
%480 = fmul float %466, 0x3FD9884540000000, !dbg !27
|
748 |
+
%481 = fmul float %468, 0x3FD9884540000000, !dbg !27
|
749 |
+
%482 = fmul float %470, 0x3FD9884540000000, !dbg !27
|
750 |
+
%483 = fmul float %472, 0x3FD9884540000000, !dbg !27
|
751 |
+
%484 = fmul float %474, 0x3FD9884540000000, !dbg !27
|
752 |
+
%485 = fmul float %476, 0x3FD9884540000000, !dbg !27
|
753 |
+
%486 = fmul float %478, 0x3FD9884540000000, !dbg !27
|
754 |
+
%487 = fmul float %47, %479, !dbg !28
|
755 |
+
%488 = fmul float %48, %480, !dbg !28
|
756 |
+
%489 = fmul float %49, %481, !dbg !28
|
757 |
+
%490 = fmul float %50, %482, !dbg !28
|
758 |
+
%491 = fmul float %51, %483, !dbg !28
|
759 |
+
%492 = fmul float %52, %484, !dbg !28
|
760 |
+
%493 = fmul float %53, %485, !dbg !28
|
761 |
+
%494 = fmul float %54, %486, !dbg !28
|
762 |
+
%495 = fadd float %439, %487, !dbg !29
|
763 |
+
%496 = fadd float %440, %488, !dbg !29
|
764 |
+
%497 = fadd float %441, %489, !dbg !29
|
765 |
+
%498 = fadd float %442, %490, !dbg !29
|
766 |
+
%499 = fadd float %443, %491, !dbg !29
|
767 |
+
%500 = fadd float %444, %492, !dbg !29
|
768 |
+
%501 = fadd float %445, %493, !dbg !29
|
769 |
+
%502 = fadd float %446, %494, !dbg !29
|
770 |
+
%503 = fmul float %25, %495, !dbg !30
|
771 |
+
%504 = fmul float %26, %496, !dbg !30
|
772 |
+
%505 = fmul float %27, %497, !dbg !30
|
773 |
+
%506 = fmul float %28, %498, !dbg !30
|
774 |
+
%507 = fmul float %29, %499, !dbg !30
|
775 |
+
%508 = fmul float %30, %500, !dbg !30
|
776 |
+
%509 = fmul float %31, %501, !dbg !30
|
777 |
+
%510 = fmul float %32, %502, !dbg !30
|
778 |
+
%511 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %503) #4, !dbg !31
|
779 |
+
%512 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %504) #4, !dbg !31
|
780 |
+
%513 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %505) #4, !dbg !31
|
781 |
+
%514 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %506) #4, !dbg !31
|
782 |
+
%515 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %507) #4, !dbg !31
|
783 |
+
%516 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %508) #4, !dbg !31
|
784 |
+
%517 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %509) #4, !dbg !31
|
785 |
+
%518 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %510) #4, !dbg !31
|
786 |
+
%519 = insertelement <2 x i16> undef, i16 %511, i64 0, !dbg !31
|
787 |
+
%520 = insertelement <2 x i16> %519, i16 %512, i64 1, !dbg !31
|
788 |
+
%521 = bitcast <2 x i16> %520 to i32, !dbg !31
|
789 |
+
%522 = insertelement <2 x i16> undef, i16 %513, i64 0, !dbg !31
|
790 |
+
%523 = insertelement <2 x i16> %522, i16 %514, i64 1, !dbg !31
|
791 |
+
%524 = bitcast <2 x i16> %523 to i32, !dbg !31
|
792 |
+
%525 = insertelement <2 x i16> undef, i16 %515, i64 0, !dbg !31
|
793 |
+
%526 = insertelement <2 x i16> %525, i16 %516, i64 1, !dbg !31
|
794 |
+
%527 = bitcast <2 x i16> %526 to i32, !dbg !31
|
795 |
+
%528 = insertelement <2 x i16> undef, i16 %517, i64 0, !dbg !31
|
796 |
+
%529 = insertelement <2 x i16> %528, i16 %518, i64 1, !dbg !31
|
797 |
+
%530 = bitcast <2 x i16> %529 to i32, !dbg !31
|
798 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %521, i32 %524, i32 %527, i32 %530, ptr addrspace(1) %11, i1 true) #4, !dbg !31
|
799 |
+
ret void, !dbg !32
|
800 |
+
}
|
801 |
+
|
802 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
803 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
804 |
+
|
805 |
+
; Function Attrs: alwaysinline nounwind
|
806 |
+
define float @__nv_erff(float %a) local_unnamed_addr #1 {
|
807 |
+
__nv_fabsf.exit:
|
808 |
+
%0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
809 |
+
%.not = icmp eq i32 %0, 0
|
810 |
+
%1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
|
811 |
+
%2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
|
812 |
+
%.0 = select i1 %.not, float %2, float %1
|
813 |
+
%3 = fcmp oge float %.0, 0x3FF00C1FC0000000
|
814 |
+
br i1 %3, label %__nv_fabsf.exit1, label %5
|
815 |
+
|
816 |
+
__nv_fabsf.exit1: ; preds = %__nv_fabsf.exit
|
817 |
+
%4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
818 |
+
%.not1 = icmp eq i32 %4, 0
|
819 |
+
%.01 = select i1 %.not1, float %2, float %1
|
820 |
+
br label %__internal_fmad.exit
|
821 |
+
|
822 |
+
5: ; preds = %__nv_fabsf.exit
|
823 |
+
%6 = fmul float %a, %a
|
824 |
+
br label %__internal_fmad.exit
|
825 |
+
|
826 |
+
__internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1
|
827 |
+
%7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
|
828 |
+
%8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
|
829 |
+
%9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
|
830 |
+
%10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
|
831 |
+
%11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
|
832 |
+
%12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
|
833 |
+
%13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
|
834 |
+
%14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
|
835 |
+
%15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
836 |
+
%.not2 = icmp eq i32 %15, 0
|
837 |
+
%16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
|
838 |
+
%17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
|
839 |
+
%.02 = select i1 %.not2, float %17, float %16
|
840 |
+
%18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
841 |
+
%.not3 = icmp eq i32 %18, 0
|
842 |
+
%19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
|
843 |
+
%20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
|
844 |
+
%.03 = select i1 %.not3, float %20, float %19
|
845 |
+
%21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
846 |
+
%.not4 = icmp eq i32 %21, 0
|
847 |
+
%22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
|
848 |
+
%23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
|
849 |
+
%.04 = select i1 %.not4, float %23, float %22
|
850 |
+
%24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
851 |
+
%.not5 = icmp eq i32 %24, 0
|
852 |
+
%25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
|
853 |
+
%26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
|
854 |
+
%.05 = select i1 %.not5, float %26, float %25
|
855 |
+
%27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
856 |
+
%.not6 = icmp eq i32 %27, 0
|
857 |
+
%28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
|
858 |
+
%29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
|
859 |
+
%.06 = select i1 %.not6, float %29, float %28
|
860 |
+
%30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
861 |
+
%.not7 = icmp eq i32 %30, 0
|
862 |
+
%31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
|
863 |
+
%32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
|
864 |
+
%.07 = select i1 %.not7, float %32, float %31
|
865 |
+
%33 = fneg float %14
|
866 |
+
%34 = select i1 %3, float %33, float %a
|
867 |
+
%35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
|
868 |
+
%.not8 = icmp eq i32 %35, 0
|
869 |
+
%36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
|
870 |
+
%37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
|
871 |
+
%.08 = select i1 %.not8, float %37, float %36
|
872 |
+
br i1 %3, label %38, label %46
|
873 |
+
|
874 |
+
38: ; preds = %__internal_fmad.exit
|
875 |
+
%39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
|
876 |
+
%40 = fsub float 1.000000e+00, %39
|
877 |
+
%41 = bitcast float %40 to i32
|
878 |
+
%42 = bitcast float %a to i32
|
879 |
+
%43 = and i32 %42, -2147483648
|
880 |
+
%44 = or i32 %43, %41
|
881 |
+
%45 = bitcast i32 %44 to float
|
882 |
+
br label %46
|
883 |
+
|
884 |
+
46: ; preds = %38, %__internal_fmad.exit
|
885 |
+
%r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
|
886 |
+
ret float %r.0
|
887 |
+
}
|
888 |
+
|
889 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
|
890 |
+
|
891 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
892 |
+
declare float @llvm.nvvm.fabs.ftz.f(float) #0
|
893 |
+
|
894 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
895 |
+
declare float @llvm.nvvm.fabs.f(float) #0
|
896 |
+
|
897 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
898 |
+
declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
|
899 |
+
|
900 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
901 |
+
declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
|
902 |
+
|
903 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
904 |
+
declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
|
905 |
+
|
906 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
907 |
+
attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
908 |
+
attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
909 |
+
attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
910 |
+
attributes #4 = { nounwind }
|
911 |
+
|
912 |
+
!llvm.module.flags = !{!0, !1}
|
913 |
+
!llvm.dbg.cu = !{!2}
|
914 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
915 |
+
!llvm.ident = !{!6}
|
916 |
+
|
917 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
918 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
919 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
920 |
+
!3 = !DIFile(filename: "c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py", directory: "/tmp/torchinductor_root/5j")
|
921 |
+
!4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
922 |
+
!5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
|
923 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
924 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
925 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
926 |
+
!9 = !{}
|
927 |
+
!10 = !DILocation(line: 21, column: 36, scope: !7)
|
928 |
+
!11 = !DILocation(line: 20, column: 28, scope: !7)
|
929 |
+
!12 = !DILocation(line: 20, column: 33, scope: !7)
|
930 |
+
!13 = !DILocation(line: 21, column: 23, scope: !7)
|
931 |
+
!14 = !DILocation(line: 24, column: 34, scope: !7)
|
932 |
+
!15 = !DILocation(line: 24, column: 39, scope: !7)
|
933 |
+
!16 = !DILocation(line: 24, column: 48, scope: !7)
|
934 |
+
!17 = !DILocation(line: 25, column: 30, scope: !7)
|
935 |
+
!18 = !DILocation(line: 25, column: 35, scope: !7)
|
936 |
+
!19 = !DILocation(line: 25, column: 44, scope: !7)
|
937 |
+
!20 = !DILocation(line: 29, column: 18, scope: !7)
|
938 |
+
!21 = !DILocation(line: 30, column: 23, scope: !7)
|
939 |
+
!22 = !DILocation(line: 32, column: 18, scope: !7)
|
940 |
+
!23 = !DILocation(line: 34, column: 19, scope: !7)
|
941 |
+
!24 = !DILocation(line: 35, column: 19, scope: !7)
|
942 |
+
!25 = !DILocation(line: 37, column: 20, scope: !7)
|
943 |
+
!26 = !DILocation(line: 38, column: 19, scope: !7)
|
944 |
+
!27 = !DILocation(line: 40, column: 20, scope: !7)
|
945 |
+
!28 = !DILocation(line: 41, column: 19, scope: !7)
|
946 |
+
!29 = !DILocation(line: 42, column: 20, scope: !7)
|
947 |
+
!30 = !DILocation(line: 43, column: 19, scope: !7)
|
948 |
+
!31 = !DILocation(line: 45, column: 40, scope: !7)
|
949 |
+
!32 = !DILocation(line: 45, column: 4, scope: !7)
|
.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttgir
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<0.398942292> : tensor<1024xf32, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<-5.000000e-01> : tensor<1024xf32, #blocked>
|
6 |
+
%cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32, #blocked>
|
7 |
+
%cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked>
|
8 |
+
%cst_3 = arith.constant dense<0.707106769> : tensor<1024xf32, #blocked>
|
9 |
+
%c1024_i32 = arith.constant 1024 : i32
|
10 |
+
%0 = tt.get_program_id x : i32
|
11 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
12 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
13 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
|
14 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
|
15 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
|
16 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
|
17 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
|
18 |
+
%8 = arith.extf %7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
|
19 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
|
20 |
+
%10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
|
21 |
+
%11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
|
22 |
+
%12 = arith.extf %11 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
|
23 |
+
%13 = arith.mulf %12, %cst_3 : tensor<1024xf32, #blocked>
|
24 |
+
%14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked>
|
25 |
+
%15 = arith.addf %14, %cst_2 : tensor<1024xf32, #blocked>
|
26 |
+
%16 = arith.mulf %15, %cst_1 : tensor<1024xf32, #blocked>
|
27 |
+
%17 = arith.mulf %12, %12 : tensor<1024xf32, #blocked>
|
28 |
+
%18 = arith.mulf %17, %cst_0 : tensor<1024xf32, #blocked>
|
29 |
+
%19 = math.exp %18 : tensor<1024xf32, #blocked>
|
30 |
+
%20 = arith.mulf %19, %cst : tensor<1024xf32, #blocked>
|
31 |
+
%21 = arith.mulf %12, %20 : tensor<1024xf32, #blocked>
|
32 |
+
%22 = arith.addf %16, %21 : tensor<1024xf32, #blocked>
|
33 |
+
%23 = arith.mulf %8, %22 : tensor<1024xf32, #blocked>
|
34 |
+
%24 = arith.truncf %23 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
|
35 |
+
tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
|
36 |
+
tt.return
|
37 |
+
}
|
38 |
+
}
|
.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttir
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.398942292> : tensor<1024xf32>
|
4 |
+
%cst_0 = arith.constant dense<-5.000000e-01> : tensor<1024xf32>
|
5 |
+
%cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32>
|
6 |
+
%cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32>
|
7 |
+
%cst_3 = arith.constant dense<0.707106769> : tensor<1024xf32>
|
8 |
+
%c1024_i32 = arith.constant 1024 : i32
|
9 |
+
%0 = tt.get_program_id x : i32
|
10 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
11 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
12 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32>
|
13 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32>
|
14 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
|
15 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
|
16 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
|
17 |
+
%8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
|
18 |
+
%9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
|
19 |
+
%10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
|
20 |
+
%11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
|
21 |
+
%12 = arith.extf %11 : tensor<1024xbf16> to tensor<1024xf32>
|
22 |
+
%13 = arith.mulf %12, %cst_3 : tensor<1024xf32>
|
23 |
+
%14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32>
|
24 |
+
%15 = arith.addf %14, %cst_2 : tensor<1024xf32>
|
25 |
+
%16 = arith.mulf %15, %cst_1 : tensor<1024xf32>
|
26 |
+
%17 = arith.mulf %12, %12 : tensor<1024xf32>
|
27 |
+
%18 = arith.mulf %17, %cst_0 : tensor<1024xf32>
|
28 |
+
%19 = math.exp %18 : tensor<1024xf32>
|
29 |
+
%20 = arith.mulf %19, %cst : tensor<1024xf32>
|
30 |
+
%21 = arith.mulf %12, %20 : tensor<1024xf32>
|
31 |
+
%22 = arith.addf %16, %21 : tensor<1024xf32>
|
32 |
+
%23 = arith.mulf %8, %22 : tensor<1024xf32>
|
33 |
+
%24 = arith.truncf %23 : tensor<1024xf32> to tensor<1024xbf16>
|
34 |
+
tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
|
35 |
+
tt.return
|
36 |
+
}
|
37 |
+
}
|
.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.llir
ADDED
@@ -0,0 +1,318 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
5 |
+
|
6 |
+
define void @triton__0d1d2d3d4d5d6d7d8de9de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, i32 %8, i32 %9) local_unnamed_addr !dbg !5 {
|
7 |
+
%11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
8 |
+
%12 = and i32 %11, 31, !dbg !8
|
9 |
+
%13 = lshr i32 %11, 5, !dbg !8
|
10 |
+
%14 = and i32 %13, 1, !dbg !8
|
11 |
+
%urem = shl i32 %11, 2, !dbg !8
|
12 |
+
%15 = and i32 %urem, 252, !dbg !8
|
13 |
+
%16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
|
14 |
+
%17 = shl i32 %16, 8, !dbg !10
|
15 |
+
%18 = or i32 %17, %15, !dbg !11
|
16 |
+
%19 = sext i32 %18 to i64, !dbg !12
|
17 |
+
%20 = getelementptr i16, ptr addrspace(1) %1, i64 %19, !dbg !12
|
18 |
+
%21 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13
|
19 |
+
%22 = extractvalue { i32, i32 } %21, 0, !dbg !13
|
20 |
+
%23 = extractvalue { i32, i32 } %21, 1, !dbg !13
|
21 |
+
%24 = trunc i32 %22 to i16, !dbg !13
|
22 |
+
%extelt.offset = lshr i32 %22, 16, !dbg !13
|
23 |
+
%25 = trunc i32 %extelt.offset to i16, !dbg !13
|
24 |
+
%26 = trunc i32 %23 to i16, !dbg !13
|
25 |
+
%extelt.offset1 = lshr i32 %23, 16, !dbg !13
|
26 |
+
%27 = trunc i32 %extelt.offset1 to i16, !dbg !13
|
27 |
+
%28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #3, !dbg !14
|
28 |
+
%29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #3, !dbg !14
|
29 |
+
%30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %26) #3, !dbg !14
|
30 |
+
%31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %27) #3, !dbg !14
|
31 |
+
%32 = zext nneg i32 %15 to i64, !dbg !15
|
32 |
+
%33 = getelementptr float, ptr addrspace(1) %2, i64 %32, !dbg !15
|
33 |
+
%34 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %33, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16
|
34 |
+
%35 = extractvalue { i32, i32, i32, i32 } %34, 0, !dbg !16
|
35 |
+
%36 = extractvalue { i32, i32, i32, i32 } %34, 1, !dbg !16
|
36 |
+
%37 = extractvalue { i32, i32, i32, i32 } %34, 2, !dbg !16
|
37 |
+
%38 = extractvalue { i32, i32, i32, i32 } %34, 3, !dbg !16
|
38 |
+
%39 = bitcast i32 %35 to float, !dbg !16
|
39 |
+
%40 = bitcast i32 %36 to float, !dbg !16
|
40 |
+
%41 = bitcast i32 %37 to float, !dbg !16
|
41 |
+
%42 = bitcast i32 %38 to float, !dbg !16
|
42 |
+
%43 = getelementptr float, ptr addrspace(1) %3, i64 %19, !dbg !17
|
43 |
+
%44 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %43, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18
|
44 |
+
%45 = extractvalue { i32, i32, i32, i32 } %44, 0, !dbg !18
|
45 |
+
%46 = extractvalue { i32, i32, i32, i32 } %44, 1, !dbg !18
|
46 |
+
%47 = extractvalue { i32, i32, i32, i32 } %44, 2, !dbg !18
|
47 |
+
%48 = extractvalue { i32, i32, i32, i32 } %44, 3, !dbg !18
|
48 |
+
%49 = bitcast i32 %45 to float, !dbg !18
|
49 |
+
%50 = bitcast i32 %46 to float, !dbg !18
|
50 |
+
%51 = bitcast i32 %47 to float, !dbg !18
|
51 |
+
%52 = bitcast i32 %48 to float, !dbg !18
|
52 |
+
%53 = getelementptr i16, ptr addrspace(1) %4, i64 %19, !dbg !19
|
53 |
+
%54 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %53, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20
|
54 |
+
%55 = extractvalue { i32, i32 } %54, 0, !dbg !20
|
55 |
+
%56 = extractvalue { i32, i32 } %54, 1, !dbg !20
|
56 |
+
%57 = trunc i32 %55 to i16, !dbg !20
|
57 |
+
%extelt.offset2 = lshr i32 %55, 16, !dbg !20
|
58 |
+
%58 = trunc i32 %extelt.offset2 to i16, !dbg !20
|
59 |
+
%59 = trunc i32 %56 to i16, !dbg !20
|
60 |
+
%extelt.offset3 = lshr i32 %56, 16, !dbg !20
|
61 |
+
%60 = trunc i32 %extelt.offset3 to i16, !dbg !20
|
62 |
+
%61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #3, !dbg !21
|
63 |
+
%62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %58) #3, !dbg !21
|
64 |
+
%63 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #3, !dbg !21
|
65 |
+
%64 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %60) #3, !dbg !21
|
66 |
+
%65 = sext i32 %16 to i64, !dbg !22
|
67 |
+
%66 = getelementptr float, ptr addrspace(1) %5, i64 %65, !dbg !22
|
68 |
+
%67 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %66, i1 true) #3, !dbg !23
|
69 |
+
%68 = bitcast i32 %67 to float, !dbg !23
|
70 |
+
%69 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %66, i1 true) #3, !dbg !23
|
71 |
+
%70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %66, i1 true) #3, !dbg !23
|
72 |
+
%71 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %66, i1 true) #3, !dbg !23
|
73 |
+
%72 = getelementptr float, ptr addrspace(1) %6, i64 %65, !dbg !24
|
74 |
+
%73 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !25
|
75 |
+
%74 = bitcast i32 %73 to float, !dbg !25
|
76 |
+
%75 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !25
|
77 |
+
%76 = bitcast i32 %75 to float, !dbg !25
|
78 |
+
%77 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !25
|
79 |
+
%78 = bitcast i32 %77 to float, !dbg !25
|
80 |
+
%79 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !25
|
81 |
+
%80 = bitcast i32 %79 to float, !dbg !25
|
82 |
+
%81 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !26
|
83 |
+
%82 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %81, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !27
|
84 |
+
%83 = extractvalue { i32, i32, i32, i32 } %82, 0, !dbg !27
|
85 |
+
%84 = extractvalue { i32, i32, i32, i32 } %82, 1, !dbg !27
|
86 |
+
%85 = extractvalue { i32, i32, i32, i32 } %82, 2, !dbg !27
|
87 |
+
%86 = extractvalue { i32, i32, i32, i32 } %82, 3, !dbg !27
|
88 |
+
%87 = bitcast i32 %83 to float, !dbg !27
|
89 |
+
%88 = bitcast i32 %84 to float, !dbg !27
|
90 |
+
%89 = bitcast i32 %85 to float, !dbg !27
|
91 |
+
%90 = bitcast i32 %86 to float, !dbg !27
|
92 |
+
%91 = fmul float %28, %39, !dbg !28
|
93 |
+
%92 = fmul float %29, %40, !dbg !28
|
94 |
+
%93 = fmul float %30, %41, !dbg !28
|
95 |
+
%94 = fmul float %31, %42, !dbg !28
|
96 |
+
%95 = fadd float %91, %92, !dbg !29
|
97 |
+
%96 = fadd float %93, %95, !dbg !29
|
98 |
+
%97 = fadd float %94, %96, !dbg !29
|
99 |
+
%98 = bitcast float %97 to i32, !dbg !35
|
100 |
+
%99 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %98, i32 16, i32 31), !dbg !35
|
101 |
+
%100 = bitcast i32 %99 to float, !dbg !35
|
102 |
+
%101 = fadd float %97, %100, !dbg !29
|
103 |
+
%102 = bitcast float %101 to i32, !dbg !35
|
104 |
+
%103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 8, i32 31), !dbg !35
|
105 |
+
%104 = bitcast i32 %103 to float, !dbg !35
|
106 |
+
%105 = fadd float %101, %104, !dbg !29
|
107 |
+
%106 = bitcast float %105 to i32, !dbg !35
|
108 |
+
%107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 4, i32 31), !dbg !35
|
109 |
+
%108 = bitcast i32 %107 to float, !dbg !35
|
110 |
+
%109 = fadd float %105, %108, !dbg !29
|
111 |
+
%110 = bitcast float %109 to i32, !dbg !35
|
112 |
+
%111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 2, i32 31), !dbg !35
|
113 |
+
%112 = bitcast i32 %111 to float, !dbg !35
|
114 |
+
%113 = fadd float %109, %112, !dbg !29
|
115 |
+
%114 = bitcast float %113 to i32, !dbg !35
|
116 |
+
%115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 1, i32 31), !dbg !35
|
117 |
+
%116 = bitcast i32 %115 to float, !dbg !35
|
118 |
+
%117 = fadd float %113, %116, !dbg !29
|
119 |
+
%118 = icmp eq i32 %12, 0, !dbg !35
|
120 |
+
%119 = zext nneg i32 %14 to i64, !dbg !35
|
121 |
+
%120 = getelementptr float, ptr addrspace(3) @global_smem, i64 %119, !dbg !35
|
122 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %117, i1 %118) #3, !dbg !35
|
123 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !35
|
124 |
+
%121 = icmp slt i32 %11, 2, !dbg !35
|
125 |
+
%122 = sext i32 %11 to i64, !dbg !35
|
126 |
+
%123 = getelementptr float, ptr addrspace(3) @global_smem, i64 %122, !dbg !35
|
127 |
+
%124 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %123, i1 %121) #3, !dbg !35
|
128 |
+
%125 = bitcast float %124 to i32, !dbg !35
|
129 |
+
%126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 1, i32 31), !dbg !35
|
130 |
+
%127 = bitcast i32 %126 to float, !dbg !35
|
131 |
+
%128 = fadd float %124, %127, !dbg !29
|
132 |
+
%129 = and i32 %11, 1, !dbg !35
|
133 |
+
%130 = icmp eq i32 %129, 0, !dbg !35
|
134 |
+
%131 = and i1 %121, %130, !dbg !35
|
135 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %123, float %128, i1 %131) #3, !dbg !35
|
136 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !35
|
137 |
+
%132 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !35
|
138 |
+
%133 = fadd float %132, 0.000000e+00, !dbg !37
|
139 |
+
%134 = fadd float %61, %49, !dbg !41
|
140 |
+
%135 = fadd float %62, %50, !dbg !41
|
141 |
+
%136 = fadd float %63, %51, !dbg !41
|
142 |
+
%137 = fadd float %64, %52, !dbg !41
|
143 |
+
%138 = fsub float %134, %68, !dbg !42
|
144 |
+
%139 = fsub float %135, %68, !dbg !42
|
145 |
+
%140 = fsub float %136, %68, !dbg !42
|
146 |
+
%141 = fsub float %137, %68, !dbg !42
|
147 |
+
%142 = fmul float %138, %74, !dbg !43
|
148 |
+
%143 = fmul float %139, %74, !dbg !43
|
149 |
+
%144 = fmul float %140, %74, !dbg !43
|
150 |
+
%145 = fmul float %141, %74, !dbg !43
|
151 |
+
%146 = fmul float %91, %142, !dbg !44
|
152 |
+
%147 = fmul float %92, %143, !dbg !44
|
153 |
+
%148 = fmul float %93, %144, !dbg !44
|
154 |
+
%149 = fmul float %94, %145, !dbg !44
|
155 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !45
|
156 |
+
%150 = fadd float %146, %147, !dbg !47
|
157 |
+
%151 = fadd float %148, %150, !dbg !47
|
158 |
+
%152 = fadd float %149, %151, !dbg !47
|
159 |
+
%153 = bitcast float %152 to i32, !dbg !45
|
160 |
+
%154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 16, i32 31), !dbg !45
|
161 |
+
%155 = bitcast i32 %154 to float, !dbg !45
|
162 |
+
%156 = fadd float %152, %155, !dbg !47
|
163 |
+
%157 = bitcast float %156 to i32, !dbg !45
|
164 |
+
%158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 8, i32 31), !dbg !45
|
165 |
+
%159 = bitcast i32 %158 to float, !dbg !45
|
166 |
+
%160 = fadd float %156, %159, !dbg !47
|
167 |
+
%161 = bitcast float %160 to i32, !dbg !45
|
168 |
+
%162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 4, i32 31), !dbg !45
|
169 |
+
%163 = bitcast i32 %162 to float, !dbg !45
|
170 |
+
%164 = fadd float %160, %163, !dbg !47
|
171 |
+
%165 = bitcast float %164 to i32, !dbg !45
|
172 |
+
%166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 2, i32 31), !dbg !45
|
173 |
+
%167 = bitcast i32 %166 to float, !dbg !45
|
174 |
+
%168 = fadd float %164, %167, !dbg !47
|
175 |
+
%169 = bitcast float %168 to i32, !dbg !45
|
176 |
+
%170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 1, i32 31), !dbg !45
|
177 |
+
%171 = bitcast i32 %170 to float, !dbg !45
|
178 |
+
%172 = fadd float %168, %171, !dbg !47
|
179 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %172, i1 %118) #3, !dbg !45
|
180 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !45
|
181 |
+
%173 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %123, i1 %121) #3, !dbg !45
|
182 |
+
%174 = bitcast float %173 to i32, !dbg !45
|
183 |
+
%175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 1, i32 31), !dbg !45
|
184 |
+
%176 = bitcast i32 %175 to float, !dbg !45
|
185 |
+
%177 = fadd float %173, %176, !dbg !47
|
186 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %123, float %177, i1 %131) #3, !dbg !45
|
187 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !45
|
188 |
+
%178 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !45
|
189 |
+
%179 = fadd float %178, 0.000000e+00, !dbg !50
|
190 |
+
%180 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %74, float 2.560000e+02) #3, !dbg !52
|
191 |
+
%181 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %76, float 2.560000e+02) #3, !dbg !52
|
192 |
+
%182 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %78, float 2.560000e+02) #3, !dbg !52
|
193 |
+
%183 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %80, float 2.560000e+02) #3, !dbg !52
|
194 |
+
%184 = fmul float %91, 2.560000e+02, !dbg !53
|
195 |
+
%185 = fmul float %92, 2.560000e+02, !dbg !53
|
196 |
+
%186 = fmul float %93, 2.560000e+02, !dbg !53
|
197 |
+
%187 = fmul float %94, 2.560000e+02, !dbg !53
|
198 |
+
%188 = fsub float %184, %133, !dbg !54
|
199 |
+
%189 = fsub float %185, %133, !dbg !54
|
200 |
+
%190 = fsub float %186, %133, !dbg !54
|
201 |
+
%191 = fsub float %187, %133, !dbg !54
|
202 |
+
%192 = fmul float %142, %179, !dbg !55
|
203 |
+
%193 = fmul float %143, %179, !dbg !55
|
204 |
+
%194 = fmul float %144, %179, !dbg !55
|
205 |
+
%195 = fmul float %145, %179, !dbg !55
|
206 |
+
%196 = fsub float %188, %192, !dbg !56
|
207 |
+
%197 = fsub float %189, %193, !dbg !56
|
208 |
+
%198 = fsub float %190, %194, !dbg !56
|
209 |
+
%199 = fsub float %191, %195, !dbg !56
|
210 |
+
%200 = fmul float %180, %196, !dbg !57
|
211 |
+
%201 = fmul float %180, %197, !dbg !57
|
212 |
+
%202 = fmul float %180, %198, !dbg !57
|
213 |
+
%203 = fmul float %180, %199, !dbg !57
|
214 |
+
%204 = fadd float %200, %87, !dbg !58
|
215 |
+
%205 = fadd float %201, %88, !dbg !58
|
216 |
+
%206 = fadd float %202, %89, !dbg !58
|
217 |
+
%207 = fadd float %203, %90, !dbg !58
|
218 |
+
%208 = bitcast float %204 to i32, !dbg !59
|
219 |
+
%209 = bitcast float %205 to i32, !dbg !59
|
220 |
+
%210 = bitcast float %206 to i32, !dbg !59
|
221 |
+
%211 = bitcast float %207 to i32, !dbg !59
|
222 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %208, i32 %209, i32 %210, i32 %211, ptr addrspace(1) %81, i1 true) #3, !dbg !59
|
223 |
+
%212 = getelementptr i16, ptr addrspace(1) %7, i64 %19, !dbg !60
|
224 |
+
%213 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %204) #3, !dbg !61
|
225 |
+
%214 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %205) #3, !dbg !61
|
226 |
+
%215 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %206) #3, !dbg !61
|
227 |
+
%216 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %207) #3, !dbg !61
|
228 |
+
%217 = insertelement <2 x i16> undef, i16 %213, i64 0, !dbg !61
|
229 |
+
%218 = insertelement <2 x i16> %217, i16 %214, i64 1, !dbg !61
|
230 |
+
%219 = bitcast <2 x i16> %218 to i32, !dbg !61
|
231 |
+
%220 = insertelement <2 x i16> undef, i16 %215, i64 0, !dbg !61
|
232 |
+
%221 = insertelement <2 x i16> %220, i16 %216, i64 1, !dbg !61
|
233 |
+
%222 = bitcast <2 x i16> %221 to i32, !dbg !61
|
234 |
+
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %219, i32 %222, ptr addrspace(1) %212, i1 true) #3, !dbg !61
|
235 |
+
ret void, !dbg !62
|
236 |
+
}
|
237 |
+
|
238 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
239 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
240 |
+
|
241 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
242 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
243 |
+
|
244 |
+
; Function Attrs: convergent nocallback nounwind
|
245 |
+
declare void @llvm.nvvm.barrier0() #2
|
246 |
+
|
247 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
248 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
249 |
+
attributes #2 = { convergent nocallback nounwind }
|
250 |
+
attributes #3 = { nounwind }
|
251 |
+
|
252 |
+
!llvm.module.flags = !{!0}
|
253 |
+
!llvm.dbg.cu = !{!1}
|
254 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
255 |
+
|
256 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
257 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
258 |
+
!2 = !DIFile(filename: "cfhjzwujbd4bpel57x4hxw7d3m3qqfwrjg6bfe6e4wk2cyh77u45.py", directory: "/tmp/torchinductor_root/fh")
|
259 |
+
!3 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"kernel", i32 1}
|
260 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"maxntidx", i32 64}
|
261 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8de9de", linkageName: "triton__0d1d2d3d4d5d6d7d8de9de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
262 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
263 |
+
!7 = !{}
|
264 |
+
!8 = !DILocation(line: 26, column: 26, scope: !5)
|
265 |
+
!9 = !DILocation(line: 23, column: 28, scope: !5)
|
266 |
+
!10 = !DILocation(line: 30, column: 40, scope: !5)
|
267 |
+
!11 = !DILocation(line: 30, column: 36, scope: !5)
|
268 |
+
!12 = !DILocation(line: 30, column: 30, scope: !5)
|
269 |
+
!13 = !DILocation(line: 30, column: 46, scope: !5)
|
270 |
+
!14 = !DILocation(line: 30, column: 67, scope: !5)
|
271 |
+
!15 = !DILocation(line: 31, column: 30, scope: !5)
|
272 |
+
!16 = !DILocation(line: 31, column: 35, scope: !5)
|
273 |
+
!17 = !DILocation(line: 32, column: 30, scope: !5)
|
274 |
+
!18 = !DILocation(line: 32, column: 46, scope: !5)
|
275 |
+
!19 = !DILocation(line: 33, column: 30, scope: !5)
|
276 |
+
!20 = !DILocation(line: 33, column: 46, scope: !5)
|
277 |
+
!21 = !DILocation(line: 33, column: 67, scope: !5)
|
278 |
+
!22 = !DILocation(line: 34, column: 31, scope: !5)
|
279 |
+
!23 = !DILocation(line: 34, column: 36, scope: !5)
|
280 |
+
!24 = !DILocation(line: 35, column: 31, scope: !5)
|
281 |
+
!25 = !DILocation(line: 35, column: 36, scope: !5)
|
282 |
+
!26 = !DILocation(line: 36, column: 35, scope: !5)
|
283 |
+
!27 = !DILocation(line: 36, column: 51, scope: !5)
|
284 |
+
!28 = !DILocation(line: 38, column: 18, scope: !5)
|
285 |
+
!29 = !DILocation(line: 233, column: 15, scope: !30, inlinedAt: !33)
|
286 |
+
!30 = distinct !DILexicalBlockFile(scope: !32, file: !31, discriminator: 0)
|
287 |
+
!31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
288 |
+
!32 = distinct !DILexicalBlockFile(scope: !5, file: !31, discriminator: 0)
|
289 |
+
!33 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !34)
|
290 |
+
!34 = !DILocation(line: 41, column: 57, scope: !30)
|
291 |
+
!35 = !DILocation(line: 243, column: 36, scope: !32, inlinedAt: !36)
|
292 |
+
!36 = !DILocation(line: 41, column: 57, scope: !32)
|
293 |
+
!37 = !DILocation(line: 8, column: 15, scope: !38, inlinedAt: !40)
|
294 |
+
!38 = distinct !DILexicalBlockFile(scope: !5, file: !39, discriminator: 0)
|
295 |
+
!39 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
296 |
+
!40 = !DILocation(line: 41, column: 44, scope: !38)
|
297 |
+
!41 = !DILocation(line: 43, column: 19, scope: !5)
|
298 |
+
!42 = !DILocation(line: 44, column: 20, scope: !5)
|
299 |
+
!43 = !DILocation(line: 45, column: 20, scope: !5)
|
300 |
+
!44 = !DILocation(line: 46, column: 19, scope: !5)
|
301 |
+
!45 = !DILocation(line: 243, column: 36, scope: !32, inlinedAt: !46)
|
302 |
+
!46 = !DILocation(line: 49, column: 59, scope: !32)
|
303 |
+
!47 = !DILocation(line: 233, column: 15, scope: !30, inlinedAt: !48)
|
304 |
+
!48 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !49)
|
305 |
+
!49 = !DILocation(line: 49, column: 59, scope: !30)
|
306 |
+
!50 = !DILocation(line: 8, column: 15, scope: !38, inlinedAt: !51)
|
307 |
+
!51 = !DILocation(line: 49, column: 45, scope: !38)
|
308 |
+
!52 = !DILocation(line: 51, column: 20, scope: !5)
|
309 |
+
!53 = !DILocation(line: 52, column: 19, scope: !5)
|
310 |
+
!54 = !DILocation(line: 53, column: 20, scope: !5)
|
311 |
+
!55 = !DILocation(line: 54, column: 20, scope: !5)
|
312 |
+
!56 = !DILocation(line: 55, column: 20, scope: !5)
|
313 |
+
!57 = !DILocation(line: 56, column: 20, scope: !5)
|
314 |
+
!58 = !DILocation(line: 57, column: 20, scope: !5)
|
315 |
+
!59 = !DILocation(line: 59, column: 51, scope: !5)
|
316 |
+
!60 = !DILocation(line: 60, column: 25, scope: !5)
|
317 |
+
!61 = !DILocation(line: 60, column: 48, scope: !5)
|
318 |
+
!62 = !DILocation(line: 60, column: 4, scope: !5)
|