0-hero commited on
Commit
00602c7
·
verified ·
1 Parent(s): 6f0bac9

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.cubin +0 -0
  2. .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.llir +366 -0
  3. .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttgir +76 -0
  4. .triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttir +75 -0
  5. .triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.cubin +0 -0
  6. .triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.llir +283 -0
  7. .triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ptx +687 -0
  8. .triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttgir +58 -0
  9. .triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttir +57 -0
  10. .triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.cubin +0 -0
  11. .triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ptx +1854 -0
  12. .triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttgir +134 -0
  13. .triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttir +113 -0
  14. .triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.llir +245 -0
  15. .triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ptx +651 -0
  16. .triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttir +53 -0
  17. .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.cubin +0 -0
  18. .triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.llir +858 -0
  19. .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.llir +290 -0
  20. .triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ptx +653 -0
  21. .triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.llir +162 -0
  22. .triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ptx +338 -0
  23. .triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttgir +24 -0
  24. .triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttir +18 -0
  25. .triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.cubin +0 -0
  26. .triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ptx +756 -0
  27. .triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttgir +141 -0
  28. .triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttir +139 -0
  29. .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.llir +235 -0
  30. .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ptx +572 -0
  31. .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttgir +63 -0
  32. .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.cubin +0 -0
  33. .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.llir +243 -0
  34. .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ptx +577 -0
  35. .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttgir +65 -0
  36. .triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttir +58 -0
  37. .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ptx +456 -0
  38. .triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttir +61 -0
  39. .triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.cubin +0 -0
  40. .triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ptx +809 -0
  41. .triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ptx +717 -0
  42. .triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ttir +38 -0
  43. .triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ptx +495 -0
  44. .triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.llir +278 -0
  45. .triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ptx +1154 -0
  46. .triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ptx +1608 -0
  47. .triton/dump/93e5abc5363b9438178c618128714f73/triton_.llir +949 -0
  48. .triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttgir +38 -0
  49. .triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttir +37 -0
  50. .triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.llir +318 -0
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.cubin ADDED
Binary file (16.9 kB). View file
 
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.llir ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
6
+
7
+ define void @triton__0d1d2d3d4d5d6d7d8de9de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, i32 %8, i32 %9) local_unnamed_addr !dbg !7 {
8
+ %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
9
+ %12 = and i32 %11, 31, !dbg !10
10
+ %13 = lshr i32 %11, 5, !dbg !10
11
+ %14 = and i32 %13, 1, !dbg !10
12
+ %urem = shl i32 %11, 2, !dbg !10
13
+ %15 = and i32 %urem, 252, !dbg !10
14
+ %16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
15
+ %17 = shl i32 %16, 8, !dbg !12
16
+ %18 = or i32 %17, %15, !dbg !13
17
+ %19 = sext i32 %18 to i64, !dbg !14
18
+ %20 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !14
19
+ %21 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
20
+ %22 = extractvalue { i32, i32, i32, i32 } %21, 0, !dbg !15
21
+ %23 = extractvalue { i32, i32, i32, i32 } %21, 1, !dbg !15
22
+ %24 = extractvalue { i32, i32, i32, i32 } %21, 2, !dbg !15
23
+ %25 = extractvalue { i32, i32, i32, i32 } %21, 3, !dbg !15
24
+ %26 = bitcast i32 %22 to float, !dbg !15
25
+ %27 = bitcast i32 %23 to float, !dbg !15
26
+ %28 = bitcast i32 %24 to float, !dbg !15
27
+ %29 = bitcast i32 %25 to float, !dbg !15
28
+ %30 = getelementptr i16, ptr addrspace(1) %1, i64 %19, !dbg !16
29
+ %31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %30, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
30
+ %32 = extractvalue { i32, i32 } %31, 0, !dbg !17
31
+ %33 = extractvalue { i32, i32 } %31, 1, !dbg !17
32
+ %34 = trunc i32 %32 to i16, !dbg !17
33
+ %extelt.offset = lshr i32 %32, 16, !dbg !17
34
+ %35 = trunc i32 %extelt.offset to i16, !dbg !17
35
+ %36 = trunc i32 %33 to i16, !dbg !17
36
+ %extelt.offset1 = lshr i32 %33, 16, !dbg !17
37
+ %37 = trunc i32 %extelt.offset1 to i16, !dbg !17
38
+ %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #6, !dbg !18
39
+ %39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #6, !dbg !18
40
+ %40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #6, !dbg !18
41
+ %41 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %37) #6, !dbg !18
42
+ %42 = getelementptr i16, ptr addrspace(1) %2, i64 %19, !dbg !19
43
+ %43 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %42, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
44
+ %44 = extractvalue { i32, i32 } %43, 0, !dbg !20
45
+ %45 = extractvalue { i32, i32 } %43, 1, !dbg !20
46
+ %46 = trunc i32 %44 to i16, !dbg !20
47
+ %extelt.offset2 = lshr i32 %44, 16, !dbg !20
48
+ %47 = trunc i32 %extelt.offset2 to i16, !dbg !20
49
+ %48 = trunc i32 %45 to i16, !dbg !20
50
+ %extelt.offset3 = lshr i32 %45, 16, !dbg !20
51
+ %49 = trunc i32 %extelt.offset3 to i16, !dbg !20
52
+ %50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #6, !dbg !21
53
+ %51 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #6, !dbg !21
54
+ %52 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %48) #6, !dbg !21
55
+ %53 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #6, !dbg !21
56
+ %54 = getelementptr i16, ptr addrspace(1) %3, i64 %19, !dbg !22
57
+ %55 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %54, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
58
+ %56 = extractvalue { i32, i32 } %55, 0, !dbg !23
59
+ %57 = extractvalue { i32, i32 } %55, 1, !dbg !23
60
+ %58 = trunc i32 %56 to i16, !dbg !23
61
+ %extelt.offset4 = lshr i32 %56, 16, !dbg !23
62
+ %59 = trunc i32 %extelt.offset4 to i16, !dbg !23
63
+ %60 = trunc i32 %57 to i16, !dbg !23
64
+ %extelt.offset5 = lshr i32 %57, 16, !dbg !23
65
+ %61 = trunc i32 %extelt.offset5 to i16, !dbg !23
66
+ %62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %58) #6, !dbg !24
67
+ %63 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #6, !dbg !24
68
+ %64 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %60) #6, !dbg !24
69
+ %65 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %61) #6, !dbg !24
70
+ %66 = getelementptr i16, ptr addrspace(1) %4, i64 %19, !dbg !25
71
+ %67 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %66, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !26
72
+ %68 = extractvalue { i32, i32 } %67, 0, !dbg !26
73
+ %69 = extractvalue { i32, i32 } %67, 1, !dbg !26
74
+ %70 = trunc i32 %68 to i16, !dbg !26
75
+ %extelt.offset6 = lshr i32 %68, 16, !dbg !26
76
+ %71 = trunc i32 %extelt.offset6 to i16, !dbg !26
77
+ %72 = trunc i32 %69 to i16, !dbg !26
78
+ %extelt.offset7 = lshr i32 %69, 16, !dbg !26
79
+ %73 = trunc i32 %extelt.offset7 to i16, !dbg !26
80
+ %74 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #6, !dbg !27
81
+ %75 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %71) #6, !dbg !27
82
+ %76 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %72) #6, !dbg !27
83
+ %77 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #6, !dbg !27
84
+ %78 = zext nneg i32 %15 to i64, !dbg !28
85
+ %79 = getelementptr float, ptr addrspace(1) %5, i64 %78, !dbg !28
86
+ %80 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %79, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
87
+ %81 = fadd float %38, %26, !dbg !30
88
+ %82 = fadd float %39, %27, !dbg !30
89
+ %83 = fadd float %40, %28, !dbg !30
90
+ %84 = fadd float %81, %50, !dbg !31
91
+ %85 = fadd float %82, %51, !dbg !31
92
+ %86 = fadd float %83, %52, !dbg !31
93
+ %87 = fadd float %85, %63, !dbg !32
94
+ %88 = fadd float %86, %64, !dbg !32
95
+ %89 = fadd float %87, %75, !dbg !33
96
+ %90 = fadd float %88, %76, !dbg !33
97
+ %91 = insertelement <2 x float> poison, float %84, i64 0, !dbg !32
98
+ %92 = insertelement <2 x float> %91, float %41, i64 1, !dbg !32
99
+ %93 = insertelement <2 x float> poison, float %62, i64 0, !dbg !32
100
+ %94 = insertelement <2 x float> %93, float %29, i64 1, !dbg !32
101
+ %95 = fadd <2 x float> %92, %94, !dbg !32
102
+ %96 = insertelement <2 x float> poison, float %74, i64 0, !dbg !33
103
+ %97 = insertelement <2 x float> %96, float %53, i64 1, !dbg !33
104
+ %98 = fadd <2 x float> %95, %97, !dbg !33
105
+ %99 = insertelement <2 x float> poison, float %89, i64 0, !dbg !34
106
+ %100 = insertelement <2 x float> %99, float %65, i64 1, !dbg !34
107
+ %101 = fadd <2 x float> %98, %100, !dbg !34
108
+ %102 = insertelement <2 x float> poison, float %90, i64 0, !dbg !34
109
+ %103 = insertelement <2 x float> %102, float %77, i64 1, !dbg !34
110
+ %104 = fadd <2 x float> %101, %103, !dbg !34
111
+ %105 = extractelement <2 x float> %104, i64 0, !dbg !34
112
+ %106 = extractelement <2 x float> %104, i64 1, !dbg !34
113
+ %107 = fadd float %105, %106, !dbg !34
114
+ %108 = bitcast float %107 to i32, !dbg !40
115
+ %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 16, i32 31), !dbg !40
116
+ %110 = bitcast i32 %109 to float, !dbg !40
117
+ %111 = fadd float %107, %110, !dbg !34
118
+ %112 = bitcast float %111 to i32, !dbg !40
119
+ %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 8, i32 31), !dbg !40
120
+ %114 = bitcast i32 %113 to float, !dbg !40
121
+ %115 = fadd float %111, %114, !dbg !34
122
+ %116 = bitcast float %115 to i32, !dbg !40
123
+ %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 4, i32 31), !dbg !40
124
+ %118 = bitcast i32 %117 to float, !dbg !40
125
+ %119 = fadd float %115, %118, !dbg !34
126
+ %120 = bitcast float %119 to i32, !dbg !40
127
+ %121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 2, i32 31), !dbg !40
128
+ %122 = bitcast i32 %121 to float, !dbg !40
129
+ %123 = fadd float %119, %122, !dbg !34
130
+ %124 = bitcast float %123 to i32, !dbg !40
131
+ %125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 1, i32 31), !dbg !40
132
+ %126 = bitcast i32 %125 to float, !dbg !40
133
+ %127 = fadd float %123, %126, !dbg !34
134
+ %128 = icmp eq i32 %12, 0, !dbg !40
135
+ %129 = zext nneg i32 %14 to i64, !dbg !40
136
+ %130 = getelementptr float, ptr addrspace(3) @global_smem, i64 %129, !dbg !40
137
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %127, i1 %128) #6, !dbg !40
138
+ tail call void @llvm.nvvm.barrier0(), !dbg !40
139
+ %131 = icmp slt i32 %11, 2, !dbg !40
140
+ %132 = sext i32 %11 to i64, !dbg !40
141
+ %133 = getelementptr float, ptr addrspace(3) @global_smem, i64 %132, !dbg !40
142
+ %134 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %133, i1 %131) #6, !dbg !40
143
+ %135 = bitcast float %134 to i32, !dbg !40
144
+ %136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 1, i32 31), !dbg !40
145
+ %137 = bitcast i32 %136 to float, !dbg !40
146
+ %138 = fadd float %134, %137, !dbg !34
147
+ %139 = and i32 %11, 1, !dbg !40
148
+ %140 = icmp eq i32 %139, 0, !dbg !40
149
+ %141 = and i1 %131, %140, !dbg !40
150
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %133, float %138, i1 %141) #6, !dbg !40
151
+ tail call void @llvm.nvvm.barrier0(), !dbg !40
152
+ %142 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !40
153
+ %143 = fadd float %142, 0.000000e+00, !dbg !42
154
+ %144 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %143, float 2.560000e+02) #6, !dbg !46
155
+ %145 = extractelement <2 x float> %98, i64 0, !dbg !47
156
+ %146 = fsub float %145, %144, !dbg !47
157
+ %147 = fsub float %89, %144, !dbg !47
158
+ %148 = fsub float %90, %144, !dbg !47
159
+ %149 = fsub float %106, %144, !dbg !47
160
+ %150 = fmul float %146, %146, !dbg !48
161
+ %151 = fmul float %147, %147, !dbg !48
162
+ %152 = fmul float %148, %148, !dbg !48
163
+ %153 = fmul float %149, %149, !dbg !48
164
+ tail call void @llvm.nvvm.barrier0(), !dbg !49
165
+ %154 = fadd float %150, %151, !dbg !51
166
+ %155 = fadd float %152, %154, !dbg !51
167
+ %156 = fadd float %153, %155, !dbg !51
168
+ %157 = bitcast float %156 to i32, !dbg !49
169
+ %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 16, i32 31), !dbg !49
170
+ %159 = bitcast i32 %158 to float, !dbg !49
171
+ %160 = fadd float %156, %159, !dbg !51
172
+ %161 = bitcast float %160 to i32, !dbg !49
173
+ %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 8, i32 31), !dbg !49
174
+ %163 = bitcast i32 %162 to float, !dbg !49
175
+ %164 = fadd float %160, %163, !dbg !51
176
+ %165 = bitcast float %164 to i32, !dbg !49
177
+ %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 4, i32 31), !dbg !49
178
+ %167 = bitcast i32 %166 to float, !dbg !49
179
+ %168 = fadd float %164, %167, !dbg !51
180
+ %169 = bitcast float %168 to i32, !dbg !49
181
+ %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 2, i32 31), !dbg !49
182
+ %171 = bitcast i32 %170 to float, !dbg !49
183
+ %172 = fadd float %168, %171, !dbg !51
184
+ %173 = bitcast float %172 to i32, !dbg !49
185
+ %174 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %173, i32 1, i32 31), !dbg !49
186
+ %175 = bitcast i32 %174 to float, !dbg !49
187
+ %176 = fadd float %172, %175, !dbg !51
188
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %176, i1 %128) #6, !dbg !49
189
+ tail call void @llvm.nvvm.barrier0(), !dbg !49
190
+ %177 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %133, i1 %131) #6, !dbg !49
191
+ %178 = bitcast float %177 to i32, !dbg !49
192
+ %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 1, i32 31), !dbg !49
193
+ %180 = bitcast i32 %179 to float, !dbg !49
194
+ %181 = fadd float %177, %180, !dbg !51
195
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %133, float %181, i1 %141) #6, !dbg !49
196
+ tail call void @llvm.nvvm.barrier0(), !dbg !49
197
+ %182 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !49
198
+ %183 = fadd float %182, 0.000000e+00, !dbg !54
199
+ %184 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %183, float 2.560000e+02) #6, !dbg !56
200
+ %185 = fadd float %184, 0x3EE4F8B580000000, !dbg !57
201
+ %186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58
202
+ %.not.i = icmp eq i32 %186, 0, !dbg !58
203
+ br i1 %.not.i, label %189, label %187, !dbg !58
204
+
205
+ 187: ; preds = %10
206
+ %188 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %185), !dbg !58
207
+ br label %__nv_rsqrtf.exit, !dbg !58
208
+
209
+ 189: ; preds = %10
210
+ %190 = tail call float @llvm.nvvm.rsqrt.approx.f(float %185), !dbg !58
211
+ br label %__nv_rsqrtf.exit, !dbg !58
212
+
213
+ __nv_rsqrtf.exit: ; preds = %187, %189
214
+ %.0.i = phi float [ %188, %187 ], [ %190, %189 ], !dbg !58
215
+ %191 = extractvalue { i32, i32, i32, i32 } %80, 3, !dbg !29
216
+ %192 = bitcast i32 %191 to float, !dbg !29
217
+ %193 = extractvalue { i32, i32, i32, i32 } %80, 2, !dbg !29
218
+ %194 = bitcast i32 %193 to float, !dbg !29
219
+ %195 = extractvalue { i32, i32, i32, i32 } %80, 1, !dbg !29
220
+ %196 = bitcast i32 %195 to float, !dbg !29
221
+ %197 = extractvalue { i32, i32, i32, i32 } %80, 0, !dbg !29
222
+ %198 = bitcast i32 %197 to float, !dbg !29
223
+ %199 = fmul float %146, %.0.i, !dbg !59
224
+ %200 = fmul float %147, %.0.i, !dbg !59
225
+ %201 = fmul float %148, %.0.i, !dbg !59
226
+ %202 = fmul float %149, %.0.i, !dbg !59
227
+ %203 = fmul float %199, %198, !dbg !60
228
+ %204 = fmul float %200, %196, !dbg !60
229
+ %205 = fmul float %201, %194, !dbg !60
230
+ %206 = fmul float %202, %192, !dbg !60
231
+ %207 = getelementptr float, ptr addrspace(1) %6, i64 %19, !dbg !61
232
+ %208 = bitcast float %145 to i32, !dbg !62
233
+ %209 = bitcast float %89 to i32, !dbg !62
234
+ %210 = bitcast float %90 to i32, !dbg !62
235
+ %211 = bitcast float %106 to i32, !dbg !62
236
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %208, i32 %209, i32 %210, i32 %211, ptr addrspace(1) %207, i1 true) #6, !dbg !62
237
+ %212 = getelementptr i16, ptr addrspace(1) %7, i64 %19, !dbg !63
238
+ %213 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %203) #6, !dbg !64
239
+ %214 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %204) #6, !dbg !64
240
+ %215 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %205) #6, !dbg !64
241
+ %216 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %206) #6, !dbg !64
242
+ %217 = insertelement <2 x i16> undef, i16 %213, i64 0, !dbg !64
243
+ %218 = insertelement <2 x i16> %217, i16 %214, i64 1, !dbg !64
244
+ %219 = bitcast <2 x i16> %218 to i32, !dbg !64
245
+ %220 = insertelement <2 x i16> undef, i16 %215, i64 0, !dbg !64
246
+ %221 = insertelement <2 x i16> %220, i16 %216, i64 1, !dbg !64
247
+ %222 = bitcast <2 x i16> %221 to i32, !dbg !64
248
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %219, i32 %222, ptr addrspace(1) %212, i1 true) #6, !dbg !64
249
+ ret void, !dbg !65
250
+ }
251
+
252
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
253
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
254
+
255
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
256
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
257
+
258
+ ; Function Attrs: convergent nocallback nounwind
259
+ declare void @llvm.nvvm.barrier0() #2
260
+
261
+ ; Function Attrs: alwaysinline nounwind
262
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
263
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
264
+ %.not = icmp eq i32 %1, 0
265
+ br i1 %.not, label %4, label %2
266
+
267
+ 2: ; preds = %0
268
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
269
+ br label %6
270
+
271
+ 4: ; preds = %0
272
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
273
+ br label %6
274
+
275
+ 6: ; preds = %4, %2
276
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
277
+ ret float %.0
278
+ }
279
+
280
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
281
+
282
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
283
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
284
+
285
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
286
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
287
+
288
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
289
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
290
+ attributes #2 = { convergent nocallback nounwind }
291
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
292
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
293
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
294
+ attributes #6 = { nounwind }
295
+
296
+ !llvm.module.flags = !{!0, !1}
297
+ !llvm.dbg.cu = !{!2}
298
+ !nvvm.annotations = !{!4, !5, !5, !4}
299
+ !llvm.ident = !{!6}
300
+
301
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
302
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
303
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
304
+ !3 = !DIFile(filename: "cjbnqg5u4sj7a4xstjer3a6tdgnnigb2iymd27gcs6o7oduhxy2v.py", directory: "/tmp/torchinductor_root/jb")
305
+ !4 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"kernel", i32 1}
306
+ !5 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"maxntidx", i32 64}
307
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
308
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8de9de", linkageName: "triton__0d1d2d3d4d5d6d7d8de9de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
309
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
310
+ !9 = !{}
311
+ !10 = !DILocation(line: 26, column: 26, scope: !7)
312
+ !11 = !DILocation(line: 23, column: 28, scope: !7)
313
+ !12 = !DILocation(line: 30, column: 40, scope: !7)
314
+ !13 = !DILocation(line: 30, column: 36, scope: !7)
315
+ !14 = !DILocation(line: 30, column: 30, scope: !7)
316
+ !15 = !DILocation(line: 30, column: 46, scope: !7)
317
+ !16 = !DILocation(line: 31, column: 30, scope: !7)
318
+ !17 = !DILocation(line: 31, column: 46, scope: !7)
319
+ !18 = !DILocation(line: 31, column: 67, scope: !7)
320
+ !19 = !DILocation(line: 32, column: 30, scope: !7)
321
+ !20 = !DILocation(line: 32, column: 46, scope: !7)
322
+ !21 = !DILocation(line: 32, column: 67, scope: !7)
323
+ !22 = !DILocation(line: 33, column: 30, scope: !7)
324
+ !23 = !DILocation(line: 33, column: 46, scope: !7)
325
+ !24 = !DILocation(line: 33, column: 67, scope: !7)
326
+ !25 = !DILocation(line: 34, column: 31, scope: !7)
327
+ !26 = !DILocation(line: 34, column: 47, scope: !7)
328
+ !27 = !DILocation(line: 34, column: 68, scope: !7)
329
+ !28 = !DILocation(line: 35, column: 31, scope: !7)
330
+ !29 = !DILocation(line: 35, column: 36, scope: !7)
331
+ !30 = !DILocation(line: 37, column: 18, scope: !7)
332
+ !31 = !DILocation(line: 39, column: 18, scope: !7)
333
+ !32 = !DILocation(line: 41, column: 18, scope: !7)
334
+ !33 = !DILocation(line: 43, column: 19, scope: !7)
335
+ !34 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !38)
336
+ !35 = distinct !DILexicalBlockFile(scope: !37, file: !36, discriminator: 0)
337
+ !36 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
338
+ !37 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
339
+ !38 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !39)
340
+ !39 = !DILocation(line: 48, column: 59, scope: !35)
341
+ !40 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !41)
342
+ !41 = !DILocation(line: 48, column: 59, scope: !37)
343
+ !42 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !45)
344
+ !43 = distinct !DILexicalBlockFile(scope: !7, file: !44, discriminator: 0)
345
+ !44 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
346
+ !45 = !DILocation(line: 48, column: 45, scope: !43)
347
+ !46 = !DILocation(line: 51, column: 20, scope: !7)
348
+ !47 = !DILocation(line: 52, column: 20, scope: !7)
349
+ !48 = !DILocation(line: 53, column: 20, scope: !7)
350
+ !49 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !50)
351
+ !50 = !DILocation(line: 56, column: 59, scope: !37)
352
+ !51 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !52)
353
+ !52 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !53)
354
+ !53 = !DILocation(line: 56, column: 59, scope: !35)
355
+ !54 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !55)
356
+ !55 = !DILocation(line: 56, column: 45, scope: !43)
357
+ !56 = !DILocation(line: 59, column: 20, scope: !7)
358
+ !57 = !DILocation(line: 61, column: 20, scope: !7)
359
+ !58 = !DILocation(line: 62, column: 26, scope: !7)
360
+ !59 = !DILocation(line: 63, column: 20, scope: !7)
361
+ !60 = !DILocation(line: 64, column: 20, scope: !7)
362
+ !61 = !DILocation(line: 66, column: 25, scope: !7)
363
+ !62 = !DILocation(line: 66, column: 48, scope: !7)
364
+ !63 = !DILocation(line: 67, column: 25, scope: !7)
365
+ !64 = !DILocation(line: 67, column: 48, scope: !7)
366
+ !65 = !DILocation(line: 67, column: 4, scope: !7)
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttgir ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
5
+ %cst_0 = arith.constant 9.99999974E-6 : f32
6
+ %cst_1 = arith.constant 2.560000e+02 : f32
7
+ %cst_2 = arith.constant 0.000000e+00 : f32
8
+ %c256_i32 = arith.constant 256 : i32
9
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
10
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
11
+ %0 = tt.get_program_id x : i32
12
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
13
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
14
+ %3 = arith.muli %0, %c256_i32 : i32
15
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
16
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
17
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
18
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
19
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
20
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
21
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
22
+ %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
23
+ %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
24
+ %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
25
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
26
+ %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
27
+ %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
28
+ %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
29
+ %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
30
+ %19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
31
+ %20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
32
+ %21 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
33
+ %22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
34
+ %23 = tt.load %22, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
35
+ %24 = arith.extf %23 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
36
+ %25 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
37
+ %26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
38
+ %27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
39
+ %28 = arith.addf %8, %12 : tensor<256xf32, #blocked>
40
+ %29 = arith.addf %28, %16 : tensor<256xf32, #blocked>
41
+ %30 = arith.addf %29, %20 : tensor<256xf32, #blocked>
42
+ %31 = arith.addf %30, %24 : tensor<256xf32, #blocked>
43
+ %32 = arith.select %2, %31, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
44
+ %33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
45
+ ^bb0(%arg10: f32, %arg11: f32):
46
+ %53 = arith.addf %arg10, %arg11 : f32
47
+ tt.reduce.return %53 : f32
48
+ }) : (tensor<256xf32, #blocked>) -> f32
49
+ %34 = arith.addf %33, %cst_2 : f32
50
+ %35 = arith.divf %34, %cst_1 : f32
51
+ %36 = tt.splat %35 : (f32) -> tensor<256xf32, #blocked>
52
+ %37 = arith.subf %31, %36 : tensor<256xf32, #blocked>
53
+ %38 = arith.mulf %37, %37 : tensor<256xf32, #blocked>
54
+ %39 = arith.select %2, %38, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
55
+ %40 = "tt.reduce"(%39) <{axis = 0 : i32}> ({
56
+ ^bb0(%arg10: f32, %arg11: f32):
57
+ %53 = arith.addf %arg10, %arg11 : f32
58
+ tt.reduce.return %53 : f32
59
+ }) : (tensor<256xf32, #blocked>) -> f32
60
+ %41 = arith.addf %40, %cst_2 : f32
61
+ %42 = arith.divf %41, %cst_1 : f32
62
+ %43 = arith.addf %42, %cst_0 : f32
63
+ %44 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
64
+ %45 = tt.splat %44 : (f32) -> tensor<256xf32, #blocked>
65
+ %46 = arith.mulf %37, %45 : tensor<256xf32, #blocked>
66
+ %47 = arith.mulf %46, %27 : tensor<256xf32, #blocked>
67
+ %48 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
68
+ %49 = tt.addptr %48, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
69
+ tt.store %49, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
70
+ %50 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
71
+ %51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
72
+ %52 = arith.truncf %47 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
73
+ tt.store %51, %52, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
74
+ tt.return
75
+ }
76
+ }
.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttir ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c256_i32 = arith.constant 256 : i32
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
5
+ %cst_0 = arith.constant 0.000000e+00 : f32
6
+ %cst_1 = arith.constant 2.560000e+02 : f32
7
+ %cst_2 = arith.constant 9.99999974E-6 : f32
8
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
9
+ %cst_4 = arith.constant dense<256> : tensor<256xi32>
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
12
+ %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
13
+ %3 = arith.muli %0, %c256_i32 : i32
14
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32>
15
+ %5 = arith.addi %1, %4 : tensor<256xi32>
16
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
17
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
18
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
19
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
20
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
21
+ %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
22
+ %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
23
+ %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
24
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
25
+ %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
26
+ %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
27
+ %17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
28
+ %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
29
+ %19 = tt.load %18, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
30
+ %20 = arith.extf %19 : tensor<256xbf16> to tensor<256xf32>
31
+ %21 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
32
+ %22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
33
+ %23 = tt.load %22, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
34
+ %24 = arith.extf %23 : tensor<256xbf16> to tensor<256xf32>
35
+ %25 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
36
+ %26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
37
+ %27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
38
+ %28 = arith.addf %8, %12 : tensor<256xf32>
39
+ %29 = arith.addf %28, %16 : tensor<256xf32>
40
+ %30 = arith.addf %29, %20 : tensor<256xf32>
41
+ %31 = arith.addf %30, %24 : tensor<256xf32>
42
+ %32 = arith.select %2, %31, %cst_3 : tensor<256xi1>, tensor<256xf32>
43
+ %33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
44
+ ^bb0(%arg10: f32, %arg11: f32):
45
+ %53 = arith.addf %arg10, %arg11 : f32
46
+ tt.reduce.return %53 : f32
47
+ }) : (tensor<256xf32>) -> f32
48
+ %34 = arith.addf %33, %cst_0 : f32
49
+ %35 = arith.divf %34, %cst_1 : f32
50
+ %36 = tt.splat %35 : (f32) -> tensor<256xf32>
51
+ %37 = arith.subf %31, %36 : tensor<256xf32>
52
+ %38 = arith.mulf %37, %37 : tensor<256xf32>
53
+ %39 = arith.select %2, %38, %cst_3 : tensor<256xi1>, tensor<256xf32>
54
+ %40 = "tt.reduce"(%39) <{axis = 0 : i32}> ({
55
+ ^bb0(%arg10: f32, %arg11: f32):
56
+ %53 = arith.addf %arg10, %arg11 : f32
57
+ tt.reduce.return %53 : f32
58
+ }) : (tensor<256xf32>) -> f32
59
+ %41 = arith.addf %40, %cst_0 : f32
60
+ %42 = arith.divf %41, %cst_1 : f32
61
+ %43 = arith.addf %42, %cst_2 : f32
62
+ %44 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
63
+ %45 = tt.splat %44 : (f32) -> tensor<256xf32>
64
+ %46 = arith.mulf %37, %45 : tensor<256xf32>
65
+ %47 = arith.mulf %46, %27 : tensor<256xf32>
66
+ %48 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
67
+ %49 = tt.addptr %48, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
68
+ tt.store %49, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
69
+ %50 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
70
+ %51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
71
+ %52 = arith.truncf %47 : tensor<256xf32> to tensor<256xbf16>
72
+ tt.store %51, %52, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
73
+ tt.return
74
+ }
75
+ }
.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.cubin ADDED
Binary file (13.2 kB). View file
 
.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.llir ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
6
+
7
+ define void @triton__0d1d2d3d4de5de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5) local_unnamed_addr !dbg !7 {
8
+ %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
9
+ %8 = and i32 %7, 31, !dbg !10
10
+ %9 = lshr i32 %7, 5, !dbg !10
11
+ %10 = and i32 %9, 1, !dbg !10
12
+ %urem = shl i32 %7, 2, !dbg !10
13
+ %11 = and i32 %urem, 252, !dbg !10
14
+ %12 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
15
+ %13 = shl i32 %12, 8, !dbg !12
16
+ %14 = or i32 %13, %11, !dbg !13
17
+ %15 = sext i32 %14 to i64, !dbg !14
18
+ %16 = getelementptr float, ptr addrspace(1) %0, i64 %15, !dbg !14
19
+ %17 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %16, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
20
+ %18 = extractvalue { i32, i32, i32, i32 } %17, 0, !dbg !15
21
+ %19 = extractvalue { i32, i32, i32, i32 } %17, 1, !dbg !15
22
+ %20 = extractvalue { i32, i32, i32, i32 } %17, 2, !dbg !15
23
+ %21 = extractvalue { i32, i32, i32, i32 } %17, 3, !dbg !15
24
+ %22 = bitcast i32 %18 to float, !dbg !15
25
+ %23 = bitcast i32 %19 to float, !dbg !15
26
+ %24 = bitcast i32 %20 to float, !dbg !15
27
+ %25 = bitcast i32 %21 to float, !dbg !15
28
+ %26 = getelementptr i16, ptr addrspace(1) %1, i64 %15, !dbg !16
29
+ %27 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
30
+ %28 = extractvalue { i32, i32 } %27, 0, !dbg !17
31
+ %29 = extractvalue { i32, i32 } %27, 1, !dbg !17
32
+ %30 = trunc i32 %28 to i16, !dbg !17
33
+ %extelt.offset = lshr i32 %28, 16, !dbg !17
34
+ %31 = trunc i32 %extelt.offset to i16, !dbg !17
35
+ %32 = trunc i32 %29 to i16, !dbg !17
36
+ %extelt.offset1 = lshr i32 %29, 16, !dbg !17
37
+ %33 = trunc i32 %extelt.offset1 to i16, !dbg !17
38
+ %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18
39
+ %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
40
+ %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
41
+ %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #6, !dbg !18
42
+ %38 = zext nneg i32 %11 to i64, !dbg !19
43
+ %39 = getelementptr float, ptr addrspace(1) %2, i64 %38, !dbg !19
44
+ %40 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %39, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
45
+ %41 = fadd float %34, %22, !dbg !21
46
+ %42 = fadd float %35, %23, !dbg !21
47
+ %43 = fadd float %36, %24, !dbg !21
48
+ %44 = fadd float %37, %25, !dbg !21
49
+ %45 = fadd float %41, %42, !dbg !22
50
+ %46 = fadd float %45, %43, !dbg !22
51
+ %47 = fadd float %46, %44, !dbg !22
52
+ %48 = bitcast float %47 to i32, !dbg !28
53
+ %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 16, i32 31), !dbg !28
54
+ %50 = bitcast i32 %49 to float, !dbg !28
55
+ %51 = fadd float %47, %50, !dbg !22
56
+ %52 = bitcast float %51 to i32, !dbg !28
57
+ %53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 8, i32 31), !dbg !28
58
+ %54 = bitcast i32 %53 to float, !dbg !28
59
+ %55 = fadd float %51, %54, !dbg !22
60
+ %56 = bitcast float %55 to i32, !dbg !28
61
+ %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 4, i32 31), !dbg !28
62
+ %58 = bitcast i32 %57 to float, !dbg !28
63
+ %59 = fadd float %55, %58, !dbg !22
64
+ %60 = bitcast float %59 to i32, !dbg !28
65
+ %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 2, i32 31), !dbg !28
66
+ %62 = bitcast i32 %61 to float, !dbg !28
67
+ %63 = fadd float %59, %62, !dbg !22
68
+ %64 = bitcast float %63 to i32, !dbg !28
69
+ %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 1, i32 31), !dbg !28
70
+ %66 = bitcast i32 %65 to float, !dbg !28
71
+ %67 = fadd float %63, %66, !dbg !22
72
+ %68 = icmp eq i32 %8, 0, !dbg !28
73
+ %69 = zext nneg i32 %10 to i64, !dbg !28
74
+ %70 = getelementptr float, ptr addrspace(3) @global_smem, i64 %69, !dbg !28
75
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, float %67, i1 %68) #6, !dbg !28
76
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
77
+ %71 = icmp slt i32 %7, 2, !dbg !28
78
+ %72 = sext i32 %7 to i64, !dbg !28
79
+ %73 = getelementptr float, ptr addrspace(3) @global_smem, i64 %72, !dbg !28
80
+ %74 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %73, i1 %71) #6, !dbg !28
81
+ %75 = bitcast float %74 to i32, !dbg !28
82
+ %76 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %75, i32 1, i32 31), !dbg !28
83
+ %77 = bitcast i32 %76 to float, !dbg !28
84
+ %78 = fadd float %74, %77, !dbg !22
85
+ %79 = and i32 %7, 1, !dbg !28
86
+ %80 = icmp eq i32 %79, 0, !dbg !28
87
+ %81 = and i1 %71, %80, !dbg !28
88
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %73, float %78, i1 %81) #6, !dbg !28
89
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
90
+ %82 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !28
91
+ %83 = fadd float %82, 0.000000e+00, !dbg !30
92
+ %84 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %83, float 2.560000e+02) #6, !dbg !34
93
+ %85 = fsub float %41, %84, !dbg !35
94
+ %86 = fsub float %42, %84, !dbg !35
95
+ %87 = fsub float %43, %84, !dbg !35
96
+ %88 = fsub float %44, %84, !dbg !35
97
+ %89 = fmul float %85, %85, !dbg !36
98
+ %90 = fmul float %86, %86, !dbg !36
99
+ %91 = fmul float %87, %87, !dbg !36
100
+ %92 = fmul float %88, %88, !dbg !36
101
+ tail call void @llvm.nvvm.barrier0(), !dbg !37
102
+ %93 = fadd float %89, %90, !dbg !39
103
+ %94 = fadd float %91, %93, !dbg !39
104
+ %95 = fadd float %92, %94, !dbg !39
105
+ %96 = bitcast float %95 to i32, !dbg !37
106
+ %97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 16, i32 31), !dbg !37
107
+ %98 = bitcast i32 %97 to float, !dbg !37
108
+ %99 = fadd float %95, %98, !dbg !39
109
+ %100 = bitcast float %99 to i32, !dbg !37
110
+ %101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 8, i32 31), !dbg !37
111
+ %102 = bitcast i32 %101 to float, !dbg !37
112
+ %103 = fadd float %99, %102, !dbg !39
113
+ %104 = bitcast float %103 to i32, !dbg !37
114
+ %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 4, i32 31), !dbg !37
115
+ %106 = bitcast i32 %105 to float, !dbg !37
116
+ %107 = fadd float %103, %106, !dbg !39
117
+ %108 = bitcast float %107 to i32, !dbg !37
118
+ %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 2, i32 31), !dbg !37
119
+ %110 = bitcast i32 %109 to float, !dbg !37
120
+ %111 = fadd float %107, %110, !dbg !39
121
+ %112 = bitcast float %111 to i32, !dbg !37
122
+ %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !37
123
+ %114 = bitcast i32 %113 to float, !dbg !37
124
+ %115 = fadd float %111, %114, !dbg !39
125
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, float %115, i1 %68) #6, !dbg !37
126
+ tail call void @llvm.nvvm.barrier0(), !dbg !37
127
+ %116 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %73, i1 %71) #6, !dbg !37
128
+ %117 = bitcast float %116 to i32, !dbg !37
129
+ %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 1, i32 31), !dbg !37
130
+ %119 = bitcast i32 %118 to float, !dbg !37
131
+ %120 = fadd float %116, %119, !dbg !39
132
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %73, float %120, i1 %81) #6, !dbg !37
133
+ tail call void @llvm.nvvm.barrier0(), !dbg !37
134
+ %121 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !37
135
+ %122 = fadd float %121, 0.000000e+00, !dbg !42
136
+ %123 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %122, float 2.560000e+02) #6, !dbg !44
137
+ %124 = fadd float %123, 0x3EE4F8B580000000, !dbg !45
138
+ %125 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46
139
+ %.not.i = icmp eq i32 %125, 0, !dbg !46
140
+ br i1 %.not.i, label %128, label %126, !dbg !46
141
+
142
+ 126: ; preds = %6
143
+ %127 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %124), !dbg !46
144
+ br label %__nv_rsqrtf.exit, !dbg !46
145
+
146
+ 128: ; preds = %6
147
+ %129 = tail call float @llvm.nvvm.rsqrt.approx.f(float %124), !dbg !46
148
+ br label %__nv_rsqrtf.exit, !dbg !46
149
+
150
+ __nv_rsqrtf.exit: ; preds = %126, %128
151
+ %.0.i = phi float [ %127, %126 ], [ %129, %128 ], !dbg !46
152
+ %130 = extractvalue { i32, i32, i32, i32 } %40, 3, !dbg !20
153
+ %131 = bitcast i32 %130 to float, !dbg !20
154
+ %132 = extractvalue { i32, i32, i32, i32 } %40, 2, !dbg !20
155
+ %133 = bitcast i32 %132 to float, !dbg !20
156
+ %134 = extractvalue { i32, i32, i32, i32 } %40, 1, !dbg !20
157
+ %135 = bitcast i32 %134 to float, !dbg !20
158
+ %136 = extractvalue { i32, i32, i32, i32 } %40, 0, !dbg !20
159
+ %137 = bitcast i32 %136 to float, !dbg !20
160
+ %138 = fmul float %85, %.0.i, !dbg !47
161
+ %139 = fmul float %86, %.0.i, !dbg !47
162
+ %140 = fmul float %87, %.0.i, !dbg !47
163
+ %141 = fmul float %88, %.0.i, !dbg !47
164
+ %142 = fmul float %138, %137, !dbg !48
165
+ %143 = fmul float %139, %135, !dbg !48
166
+ %144 = fmul float %140, %133, !dbg !48
167
+ %145 = fmul float %141, %131, !dbg !48
168
+ %146 = getelementptr i16, ptr addrspace(1) %3, i64 %15, !dbg !49
169
+ %147 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %142) #6, !dbg !50
170
+ %148 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %143) #6, !dbg !50
171
+ %149 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %144) #6, !dbg !50
172
+ %150 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %145) #6, !dbg !50
173
+ %151 = insertelement <2 x i16> undef, i16 %147, i64 0, !dbg !50
174
+ %152 = insertelement <2 x i16> %151, i16 %148, i64 1, !dbg !50
175
+ %153 = bitcast <2 x i16> %152 to i32, !dbg !50
176
+ %154 = insertelement <2 x i16> undef, i16 %149, i64 0, !dbg !50
177
+ %155 = insertelement <2 x i16> %154, i16 %150, i64 1, !dbg !50
178
+ %156 = bitcast <2 x i16> %155 to i32, !dbg !50
179
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %153, i32 %156, ptr addrspace(1) %146, i1 true) #6, !dbg !50
180
+ ret void, !dbg !51
181
+ }
182
+
183
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
184
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
185
+
186
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
187
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
188
+
189
+ ; Function Attrs: convergent nocallback nounwind
190
+ declare void @llvm.nvvm.barrier0() #2
191
+
192
+ ; Function Attrs: alwaysinline nounwind
193
+ define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
194
+ %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
195
+ %.not = icmp eq i32 %1, 0
196
+ br i1 %.not, label %4, label %2
197
+
198
+ 2: ; preds = %0
199
+ %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
200
+ br label %6
201
+
202
+ 4: ; preds = %0
203
+ %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
204
+ br label %6
205
+
206
+ 6: ; preds = %4, %2
207
+ %.0 = phi float [ %3, %2 ], [ %5, %4 ]
208
+ ret float %.0
209
+ }
210
+
211
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
212
+
213
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
214
+ declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
215
+
216
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
217
+ declare float @llvm.nvvm.rsqrt.approx.f(float) #5
218
+
219
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
220
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
221
+ attributes #2 = { convergent nocallback nounwind }
222
+ attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
223
+ attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
224
+ attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
225
+ attributes #6 = { nounwind }
226
+
227
+ !llvm.module.flags = !{!0, !1}
228
+ !llvm.dbg.cu = !{!2}
229
+ !nvvm.annotations = !{!4, !5, !5, !4}
230
+ !llvm.ident = !{!6}
231
+
232
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
233
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
234
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
235
+ !3 = !DIFile(filename: "cqh2dj355iatjzvi5cmz4txvjd3ap52shgash4czifdcnafnkkam.py", directory: "/tmp/torchinductor_root/qh")
236
+ !4 = !{ptr @triton__0d1d2d3d4de5de, !"kernel", i32 1}
237
+ !5 = !{ptr @triton__0d1d2d3d4de5de, !"maxntidx", i32 64}
238
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
239
+ !7 = distinct !DISubprogram(name: "triton__0d1d2d3d4de5de", linkageName: "triton__0d1d2d3d4de5de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
240
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
241
+ !9 = !{}
242
+ !10 = !DILocation(line: 26, column: 26, scope: !7)
243
+ !11 = !DILocation(line: 23, column: 28, scope: !7)
244
+ !12 = !DILocation(line: 30, column: 40, scope: !7)
245
+ !13 = !DILocation(line: 30, column: 36, scope: !7)
246
+ !14 = !DILocation(line: 30, column: 30, scope: !7)
247
+ !15 = !DILocation(line: 30, column: 46, scope: !7)
248
+ !16 = !DILocation(line: 31, column: 30, scope: !7)
249
+ !17 = !DILocation(line: 31, column: 46, scope: !7)
250
+ !18 = !DILocation(line: 31, column: 67, scope: !7)
251
+ !19 = !DILocation(line: 32, column: 31, scope: !7)
252
+ !20 = !DILocation(line: 32, column: 36, scope: !7)
253
+ !21 = !DILocation(line: 34, column: 18, scope: !7)
254
+ !22 = !DILocation(line: 233, column: 15, scope: !23, inlinedAt: !26)
255
+ !23 = distinct !DILexicalBlockFile(scope: !25, file: !24, discriminator: 0)
256
+ !24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
257
+ !25 = distinct !DILexicalBlockFile(scope: !7, file: !24, discriminator: 0)
258
+ !26 = !DILocation(line: 243, column: 36, scope: !23, inlinedAt: !27)
259
+ !27 = !DILocation(line: 39, column: 58, scope: !23)
260
+ !28 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !29)
261
+ !29 = !DILocation(line: 39, column: 58, scope: !25)
262
+ !30 = !DILocation(line: 8, column: 15, scope: !31, inlinedAt: !33)
263
+ !31 = distinct !DILexicalBlockFile(scope: !7, file: !32, discriminator: 0)
264
+ !32 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
265
+ !33 = !DILocation(line: 39, column: 45, scope: !31)
266
+ !34 = !DILocation(line: 42, column: 20, scope: !7)
267
+ !35 = !DILocation(line: 43, column: 19, scope: !7)
268
+ !36 = !DILocation(line: 44, column: 20, scope: !7)
269
+ !37 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !38)
270
+ !38 = !DILocation(line: 47, column: 59, scope: !25)
271
+ !39 = !DILocation(line: 233, column: 15, scope: !23, inlinedAt: !40)
272
+ !40 = !DILocation(line: 243, column: 36, scope: !23, inlinedAt: !41)
273
+ !41 = !DILocation(line: 47, column: 59, scope: !23)
274
+ !42 = !DILocation(line: 8, column: 15, scope: !31, inlinedAt: !43)
275
+ !43 = !DILocation(line: 47, column: 45, scope: !31)
276
+ !44 = !DILocation(line: 50, column: 20, scope: !7)
277
+ !45 = !DILocation(line: 52, column: 20, scope: !7)
278
+ !46 = !DILocation(line: 53, column: 26, scope: !7)
279
+ !47 = !DILocation(line: 54, column: 20, scope: !7)
280
+ !48 = !DILocation(line: 55, column: 20, scope: !7)
281
+ !49 = !DILocation(line: 57, column: 25, scope: !7)
282
+ !50 = !DILocation(line: 57, column: 48, scope: !7)
283
+ !51 = !DILocation(line: 57, column: 4, scope: !7)
.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ptx ADDED
@@ -0,0 +1,687 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4de5de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
12
+
13
+ .visible .entry triton__0d1d2d3d4de5de(
14
+ .param .u64 triton__0d1d2d3d4de5de_param_0,
15
+ .param .u64 triton__0d1d2d3d4de5de_param_1,
16
+ .param .u64 triton__0d1d2d3d4de5de_param_2,
17
+ .param .u64 triton__0d1d2d3d4de5de_param_3,
18
+ .param .u32 triton__0d1d2d3d4de5de_param_4,
19
+ .param .u32 triton__0d1d2d3d4de5de_param_5
20
+ )
21
+ .maxntid 64, 1, 1
22
+ {
23
+ .reg .pred %p<23>;
24
+ .reg .b16 %rs<9>;
25
+ .reg .b32 %r<84>;
26
+ .reg .f32 %f<70>;
27
+ .reg .b64 %rd<12>;
28
+ .loc 1 18 0
29
+ $L__func_begin0:
30
+ .loc 1 18 0
31
+
32
+ ld.param.u64 %rd5, [triton__0d1d2d3d4de5de_param_0];
33
+ ld.param.u64 %rd6, [triton__0d1d2d3d4de5de_param_1];
34
+ $L__tmp0:
35
+ .loc 1 26 26
36
+ mov.u32 %r50, %tid.x;
37
+ and.b32 %r51, %r50, 31;
38
+ ld.param.u64 %rd7, [triton__0d1d2d3d4de5de_param_2];
39
+ ld.param.u64 %rd8, [triton__0d1d2d3d4de5de_param_3];
40
+ shl.b32 %r52, %r50, 2;
41
+ and.b32 %r53, %r52, 252;
42
+ .loc 1 23 28
43
+ mov.u32 %r1, %ctaid.x;
44
+ .loc 1 30 40
45
+ shl.b32 %r54, %r1, 8;
46
+ .loc 1 30 36
47
+ or.b32 %r55, %r54, %r53;
48
+ .loc 1 30 30
49
+ mul.wide.s32 %rd9, %r55, 4;
50
+ add.s64 %rd1, %rd5, %rd9;
51
+ mov.b32 %r6, 0;
52
+ mov.pred %p1, -1;
53
+ .loc 1 30 46
54
+ mov.u32 %r2, 0x0;
55
+ mov.u32 %r3, 0x0;
56
+ mov.u32 %r4, 0x0;
57
+ mov.u32 %r5, 0x0;
58
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
59
+ @!%p1 mov.u32 %r2, %r6;
60
+ @!%p1 mov.u32 %r3, %r6;
61
+ @!%p1 mov.u32 %r4, %r6;
62
+ @!%p1 mov.u32 %r5, %r6;
63
+ mov.b32 %f1, %r2;
64
+ mov.b32 %f2, %r3;
65
+ mov.b32 %f3, %r4;
66
+ mov.b32 %f4, %r5;
67
+ .loc 1 31 30
68
+ mul.wide.s32 %rd10, %r55, 2;
69
+ add.s64 %rd2, %rd6, %rd10;
70
+ .loc 1 31 46
71
+ mov.u32 %r10, 0x0;
72
+ mov.u32 %r11, 0x0;
73
+ @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
74
+ @!%p1 mov.u32 %r10, %r6;
75
+ @!%p1 mov.u32 %r11, %r6;
76
+ cvt.u16.u32 %rs1, %r10;
77
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
78
+ cvt.u16.u32 %rs3, %r11;
79
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
80
+ .loc 1 31 67
81
+ cvt.f32.bf16 %r14, %rs1;
82
+ mov.b32 %f5, %r14;
83
+ cvt.f32.bf16 %r15, %rs2;
84
+ mov.b32 %f6, %r15;
85
+ cvt.f32.bf16 %r16, %rs3;
86
+ mov.b32 %f7, %r16;
87
+ cvt.f32.bf16 %r17, %rs4;
88
+ mov.b32 %f8, %r17;
89
+ .loc 1 32 31
90
+ mul.wide.u32 %rd11, %r53, 4;
91
+ add.s64 %rd3, %rd7, %rd11;
92
+ .loc 1 32 36
93
+ mov.u32 %r18, 0x0;
94
+ mov.u32 %r19, 0x0;
95
+ mov.u32 %r20, 0x0;
96
+ mov.u32 %r21, 0x0;
97
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
98
+ @!%p1 mov.u32 %r18, %r6;
99
+ @!%p1 mov.u32 %r19, %r6;
100
+ @!%p1 mov.u32 %r20, %r6;
101
+ @!%p1 mov.u32 %r21, %r6;
102
+ .loc 1 34 18
103
+ add.f32 %f9, %f5, %f1;
104
+ add.f32 %f10, %f6, %f2;
105
+ add.f32 %f11, %f7, %f3;
106
+ add.f32 %f12, %f8, %f4;
107
+ $L__tmp1:
108
+ .loc 2 233 15
109
+ add.f32 %f13, %f9, %f10;
110
+ add.f32 %f14, %f13, %f11;
111
+ add.f32 %f15, %f14, %f12;
112
+ $L__tmp2:
113
+ .loc 2 243 36
114
+ mov.b32 %r56, %f15;
115
+ shfl.sync.bfly.b32 %r57, %r56, 16, 31, -1;
116
+ mov.b32 %f16, %r57;
117
+ $L__tmp3:
118
+ .loc 2 233 15
119
+ add.f32 %f17, %f15, %f16;
120
+ $L__tmp4:
121
+ .loc 2 243 36
122
+ mov.b32 %r58, %f17;
123
+ shfl.sync.bfly.b32 %r59, %r58, 8, 31, -1;
124
+ mov.b32 %f18, %r59;
125
+ $L__tmp5:
126
+ .loc 2 233 15
127
+ add.f32 %f19, %f17, %f18;
128
+ $L__tmp6:
129
+ .loc 2 243 36
130
+ mov.b32 %r60, %f19;
131
+ shfl.sync.bfly.b32 %r61, %r60, 4, 31, -1;
132
+ mov.b32 %f20, %r61;
133
+ $L__tmp7:
134
+ .loc 2 233 15
135
+ add.f32 %f21, %f19, %f20;
136
+ $L__tmp8:
137
+ .loc 2 243 36
138
+ mov.b32 %r62, %f21;
139
+ shfl.sync.bfly.b32 %r63, %r62, 2, 31, -1;
140
+ mov.b32 %f22, %r63;
141
+ $L__tmp9:
142
+ .loc 2 233 15
143
+ add.f32 %f23, %f21, %f22;
144
+ $L__tmp10:
145
+ .loc 2 243 36
146
+ mov.b32 %r64, %f23;
147
+ shfl.sync.bfly.b32 %r65, %r64, 1, 31, -1;
148
+ mov.b32 %f24, %r65;
149
+ $L__tmp11:
150
+ .loc 2 233 15
151
+ add.f32 %f25, %f23, %f24;
152
+ $L__tmp12:
153
+ .loc 2 243 36
154
+ setp.eq.s32 %p14, %r51, 0;
155
+ shr.u32 %r66, %r50, 3;
156
+ and.b32 %r67, %r66, 4;
157
+ mov.u32 %r68, global_smem;
158
+ add.s32 %r26, %r68, %r67;
159
+ mov.b32 %r27, %f25;
160
+ @%p14 st.shared.b32 [ %r26 + 0 ], %r27;
161
+ bar.sync 0;
162
+ setp.lt.s32 %p15, %r50, 2;
163
+ add.s32 %r29, %r68, %r52;
164
+ @%p15 ld.shared.b32 %r28, [ %r29 + 0 ];
165
+ mov.b32 %f26, %r28;
166
+ shfl.sync.bfly.b32 %r69, %r28, 1, 31, -1;
167
+ mov.b32 %f27, %r69;
168
+ $L__tmp13:
169
+ .loc 2 233 15
170
+ add.f32 %f28, %f26, %f27;
171
+ $L__tmp14:
172
+ .loc 2 243 36
173
+ and.b32 %r70, %r50, 1;
174
+ setp.eq.b32 %p21, %r70, 1;
175
+ not.pred %p22, %p21;
176
+ and.pred %p16, %p15, %p22;
177
+ mov.b32 %r31, %f28;
178
+ @%p16 st.shared.b32 [ %r29 + 0 ], %r31;
179
+ bar.sync 0;
180
+ ld.shared.f32 %f29, [global_smem];
181
+ $L__tmp15:
182
+ .loc 3 8 15
183
+ add.f32 %f30, %f29, 0f00000000;
184
+ $L__tmp16:
185
+ .loc 1 42 20
186
+ mov.b32 %r33, %f30;
187
+ mov.b32 %r34, 1132462080;
188
+ div.full.f32 %r32, %r33, %r34;
189
+ mov.b32 %f31, %r32;
190
+ .loc 1 43 19
191
+ sub.f32 %f32, %f9, %f31;
192
+ sub.f32 %f33, %f10, %f31;
193
+ sub.f32 %f34, %f11, %f31;
194
+ sub.f32 %f35, %f12, %f31;
195
+ .loc 1 44 20
196
+ mul.f32 %f36, %f33, %f33;
197
+ $L__tmp17:
198
+ .loc 2 243 36
199
+ bar.sync 0;
200
+ $L__tmp18:
201
+ .loc 2 233 15
202
+ fma.rn.f32 %f37, %f32, %f32, %f36;
203
+ fma.rn.f32 %f38, %f34, %f34, %f37;
204
+ fma.rn.f32 %f39, %f35, %f35, %f38;
205
+ $L__tmp19:
206
+ .loc 2 243 36
207
+ mov.b32 %r71, %f39;
208
+ shfl.sync.bfly.b32 %r72, %r71, 16, 31, -1;
209
+ mov.b32 %f40, %r72;
210
+ $L__tmp20:
211
+ .loc 2 233 15
212
+ add.f32 %f41, %f39, %f40;
213
+ $L__tmp21:
214
+ .loc 2 243 36
215
+ mov.b32 %r73, %f41;
216
+ shfl.sync.bfly.b32 %r74, %r73, 8, 31, -1;
217
+ mov.b32 %f42, %r74;
218
+ $L__tmp22:
219
+ .loc 2 233 15
220
+ add.f32 %f43, %f41, %f42;
221
+ $L__tmp23:
222
+ .loc 2 243 36
223
+ mov.b32 %r75, %f43;
224
+ shfl.sync.bfly.b32 %r76, %r75, 4, 31, -1;
225
+ mov.b32 %f44, %r76;
226
+ $L__tmp24:
227
+ .loc 2 233 15
228
+ add.f32 %f45, %f43, %f44;
229
+ $L__tmp25:
230
+ .loc 2 243 36
231
+ mov.b32 %r77, %f45;
232
+ shfl.sync.bfly.b32 %r78, %r77, 2, 31, -1;
233
+ mov.b32 %f46, %r78;
234
+ $L__tmp26:
235
+ .loc 2 233 15
236
+ add.f32 %f47, %f45, %f46;
237
+ $L__tmp27:
238
+ .loc 2 243 36
239
+ mov.b32 %r79, %f47;
240
+ shfl.sync.bfly.b32 %r80, %r79, 1, 31, -1;
241
+ mov.b32 %f48, %r80;
242
+ $L__tmp28:
243
+ .loc 2 233 15
244
+ add.f32 %f49, %f47, %f48;
245
+ $L__tmp29:
246
+ .loc 2 243 36
247
+ mov.b32 %r36, %f49;
248
+ @%p14 st.shared.b32 [ %r26 + 0 ], %r36;
249
+ bar.sync 0;
250
+ @%p15 ld.shared.b32 %r37, [ %r29 + 0 ];
251
+ mov.b32 %f50, %r37;
252
+ shfl.sync.bfly.b32 %r81, %r37, 1, 31, -1;
253
+ mov.b32 %f51, %r81;
254
+ $L__tmp30:
255
+ .loc 2 233 15
256
+ add.f32 %f52, %f50, %f51;
257
+ $L__tmp31:
258
+ .loc 2 243 36
259
+ mov.b32 %r40, %f52;
260
+ @%p16 st.shared.b32 [ %r29 + 0 ], %r40;
261
+ bar.sync 0;
262
+ ld.shared.f32 %f53, [global_smem];
263
+ $L__tmp32:
264
+ .loc 3 8 15
265
+ add.f32 %f54, %f53, 0f00000000;
266
+ $L__tmp33:
267
+ .loc 1 50 20
268
+ mov.b32 %r42, %f54;
269
+ div.full.f32 %r41, %r42, %r34;
270
+ mov.b32 %f55, %r41;
271
+ .loc 1 52 20
272
+ add.f32 %f56, %f55, 0f3727C5AC;
273
+ .loc 1 53 26
274
+ rsqrt.approx.ftz.f32 %f57, %f56;
275
+ .loc 1 32 36
276
+ mov.b32 %f58, %r21;
277
+ mov.b32 %f59, %r20;
278
+ mov.b32 %f60, %r19;
279
+ mov.b32 %f61, %r18;
280
+ .loc 1 54 20
281
+ mul.f32 %f62, %f32, %f57;
282
+ mul.f32 %f63, %f33, %f57;
283
+ mul.f32 %f64, %f34, %f57;
284
+ mul.f32 %f65, %f35, %f57;
285
+ .loc 1 55 20
286
+ mul.f32 %f66, %f62, %f61;
287
+ mul.f32 %f67, %f63, %f60;
288
+ mul.f32 %f68, %f64, %f59;
289
+ mul.f32 %f69, %f65, %f58;
290
+ .loc 1 57 25
291
+ add.s64 %rd4, %rd8, %rd10;
292
+ .loc 1 57 48
293
+ mov.b32 %r44, %f66;
294
+ cvt.rn.bf16.f32 %rs5, %r44;
295
+ mov.b32 %r45, %f67;
296
+ cvt.rn.bf16.f32 %rs6, %r45;
297
+ mov.b32 %r46, %f68;
298
+ cvt.rn.bf16.f32 %rs7, %r46;
299
+ mov.b32 %r47, %f69;
300
+ cvt.rn.bf16.f32 %rs8, %r47;
301
+ mov.b32 %r82, {%rs5, %rs6};
302
+ mov.b32 %r83, {%rs7, %rs8};
303
+ @%p1 st.global.v2.b32 [ %rd4 + 0 ], { %r82, %r83 };
304
+ .loc 1 57 4
305
+ ret;
306
+ $L__tmp34:
307
+ $L__func_end0:
308
+
309
+ }
310
+ // .globl __nv_rsqrtf -- device function: takes one f32, returns an approximate 1/sqrt of it
311
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf( // result is returned through the 32-bit func_retval0 parameter
312
+ .param .b32 __nv_rsqrtf_param_0 // the single 32-bit argument; read below as f32
313
+ )
314
+ {
315
+ .reg .f32 %f<3>; // declares %f0..%f2; only %f1 and %f2 are used
316
+ $L__func_begin1:
317
+
318
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0]; // %f1 = x
319
+ rsqrt.approx.ftz.f32 %f2, %f1; // %f2 ~= 1/sqrt(x); .approx is the approximate form, .ftz flushes subnormals to zero
320
+ st.param.f32 [func_retval0+0], %f2; // write the result into the return parameter
321
+ ret;
322
+ $L__func_end1:
323
+
324
+ }
325
+ .file 1 "/tmp/torchinductor_root/qh/cqh2dj355iatjzvi5cmz4txvjd3ap52shgash4czifdcnafnkkam.py"
326
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
327
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
328
+ .section .debug_abbrev
329
+ {
330
+ .b8 1
331
+ .b8 17
332
+ .b8 1
333
+ .b8 37
334
+ .b8 8
335
+ .b8 19
336
+ .b8 5
337
+ .b8 3
338
+ .b8 8
339
+ .b8 16
340
+ .b8 6
341
+ .b8 27
342
+ .b8 8
343
+ .b8 180
344
+ .b8 66
345
+ .b8 12
346
+ .b8 17
347
+ .b8 1
348
+ .b8 18
349
+ .b8 1
350
+ .b8 0
351
+ .b8 0
352
+ .b8 2
353
+ .b8 46
354
+ .b8 0
355
+ .b8 135
356
+ .b8 64
357
+ .b8 8
358
+ .b8 3
359
+ .b8 8
360
+ .b8 58
361
+ .b8 11
362
+ .b8 59
363
+ .b8 11
364
+ .b8 63
365
+ .b8 12
366
+ .b8 32
367
+ .b8 11
368
+ .b8 0
369
+ .b8 0
370
+ .b8 3
371
+ .b8 46
372
+ .b8 1
373
+ .b8 17
374
+ .b8 1
375
+ .b8 18
376
+ .b8 1
377
+ .b8 64
378
+ .b8 10
379
+ .b8 49
380
+ .b8 19
381
+ .b8 0
382
+ .b8 0
383
+ .b8 4
384
+ .b8 29
385
+ .b8 1
386
+ .b8 49
387
+ .b8 19
388
+ .b8 17
389
+ .b8 1
390
+ .b8 18
391
+ .b8 1
392
+ .b8 88
393
+ .b8 11
394
+ .b8 89
395
+ .b8 11
396
+ .b8 87
397
+ .b8 11
398
+ .b8 0
399
+ .b8 0
400
+ .b8 5
401
+ .b8 29
402
+ .b8 0
403
+ .b8 49
404
+ .b8 19
405
+ .b8 17
406
+ .b8 1
407
+ .b8 18
408
+ .b8 1
409
+ .b8 88
410
+ .b8 11
411
+ .b8 89
412
+ .b8 11
413
+ .b8 87
414
+ .b8 11
415
+ .b8 0
416
+ .b8 0
417
+ .b8 0
418
+ }
419
+ .section .debug_info
420
+ {
421
+ .b32 391
422
+ .b8 2
423
+ .b8 0
424
+ .b32 .debug_abbrev
425
+ .b8 8
426
+ .b8 1
427
+ .b8 116
428
+ .b8 114
429
+ .b8 105
430
+ .b8 116
431
+ .b8 111
432
+ .b8 110
433
+ .b8 0
434
+ .b8 2
435
+ .b8 0
436
+ .b8 99
437
+ .b8 113
438
+ .b8 104
439
+ .b8 50
440
+ .b8 100
441
+ .b8 106
442
+ .b8 51
443
+ .b8 53
444
+ .b8 53
445
+ .b8 105
446
+ .b8 97
447
+ .b8 116
448
+ .b8 106
449
+ .b8 122
450
+ .b8 118
451
+ .b8 105
452
+ .b8 53
453
+ .b8 99
454
+ .b8 109
455
+ .b8 122
456
+ .b8 52
457
+ .b8 116
458
+ .b8 120
459
+ .b8 118
460
+ .b8 106
461
+ .b8 100
462
+ .b8 51
463
+ .b8 97
464
+ .b8 112
465
+ .b8 53
466
+ .b8 50
467
+ .b8 115
468
+ .b8 104
469
+ .b8 103
470
+ .b8 97
471
+ .b8 115
472
+ .b8 104
473
+ .b8 52
474
+ .b8 99
475
+ .b8 122
476
+ .b8 105
477
+ .b8 102
478
+ .b8 100
479
+ .b8 99
480
+ .b8 110
481
+ .b8 97
482
+ .b8 102
483
+ .b8 110
484
+ .b8 107
485
+ .b8 107
486
+ .b8 97
487
+ .b8 109
488
+ .b8 46
489
+ .b8 112
490
+ .b8 121
491
+ .b8 0
492
+ .b32 .debug_line
493
+ .b8 47
494
+ .b8 116
495
+ .b8 109
496
+ .b8 112
497
+ .b8 47
498
+ .b8 116
499
+ .b8 111
500
+ .b8 114
501
+ .b8 99
502
+ .b8 104
503
+ .b8 105
504
+ .b8 110
505
+ .b8 100
506
+ .b8 117
507
+ .b8 99
508
+ .b8 116
509
+ .b8 111
510
+ .b8 114
511
+ .b8 95
512
+ .b8 114
513
+ .b8 111
514
+ .b8 111
515
+ .b8 116
516
+ .b8 47
517
+ .b8 113
518
+ .b8 104
519
+ .b8 0
520
+ .b8 1
521
+ .b64 $L__func_begin0
522
+ .b64 $L__func_end0
523
+ .b8 2
524
+ .b8 116
525
+ .b8 114
526
+ .b8 105
527
+ .b8 116
528
+ .b8 111
529
+ .b8 110
530
+ .b8 95
531
+ .b8 95
532
+ .b8 48
533
+ .b8 100
534
+ .b8 49
535
+ .b8 100
536
+ .b8 50
537
+ .b8 100
538
+ .b8 51
539
+ .b8 100
540
+ .b8 52
541
+ .b8 100
542
+ .b8 101
543
+ .b8 53
544
+ .b8 100
545
+ .b8 101
546
+ .b8 0
547
+ .b8 116
548
+ .b8 114
549
+ .b8 105
550
+ .b8 116
551
+ .b8 111
552
+ .b8 110
553
+ .b8 95
554
+ .b8 95
555
+ .b8 48
556
+ .b8 100
557
+ .b8 49
558
+ .b8 100
559
+ .b8 50
560
+ .b8 100
561
+ .b8 51
562
+ .b8 100
563
+ .b8 52
564
+ .b8 100
565
+ .b8 101
566
+ .b8 53
567
+ .b8 100
568
+ .b8 101
569
+ .b8 0
570
+ .b8 1
571
+ .b8 18
572
+ .b8 1
573
+ .b8 1
574
+ .b8 3
575
+ .b64 $L__func_begin0
576
+ .b64 $L__func_end0
577
+ .b8 1
578
+ .b8 156
579
+ .b32 125
580
+ .b8 4
581
+ .b32 125
582
+ .b64 $L__tmp1
583
+ .b64 $L__tmp14
584
+ .b8 2
585
+ .b8 39
586
+ .b8 58
587
+ .b8 5
588
+ .b32 125
589
+ .b64 $L__tmp1
590
+ .b64 $L__tmp14
591
+ .b8 2
592
+ .b8 243
593
+ .b8 36
594
+ .b8 0
595
+ .b8 5
596
+ .b32 125
597
+ .b64 $L__tmp2
598
+ .b64 $L__tmp15
599
+ .b8 2
600
+ .b8 39
601
+ .b8 58
602
+ .b8 5
603
+ .b32 125
604
+ .b64 $L__tmp15
605
+ .b64 $L__tmp16
606
+ .b8 3
607
+ .b8 39
608
+ .b8 45
609
+ .b8 5
610
+ .b32 125
611
+ .b64 $L__tmp17
612
+ .b64 $L__tmp32
613
+ .b8 2
614
+ .b8 47
615
+ .b8 59
616
+ .b8 4
617
+ .b32 125
618
+ .b64 $L__tmp18
619
+ .b64 $L__tmp31
620
+ .b8 2
621
+ .b8 47
622
+ .b8 59
623
+ .b8 5
624
+ .b32 125
625
+ .b64 $L__tmp18
626
+ .b64 $L__tmp31
627
+ .b8 2
628
+ .b8 243
629
+ .b8 36
630
+ .b8 0
631
+ .b8 5
632
+ .b32 125
633
+ .b64 $L__tmp32
634
+ .b64 $L__tmp33
635
+ .b8 3
636
+ .b8 47
637
+ .b8 45
638
+ .b8 0
639
+ .b8 0
640
+ }
641
+ .section .debug_pubnames
642
+ {
643
+ .b32 $L__pubNames_end0-$L__pubNames_start0
644
+ $L__pubNames_start0:
645
+ .b8 2
646
+ .b8 0
647
+ .b32 .debug_info
648
+ .b32 395
649
+ .b32 125
650
+ .b8 116
651
+ .b8 114
652
+ .b8 105
653
+ .b8 116
654
+ .b8 111
655
+ .b8 110
656
+ .b8 95
657
+ .b8 95
658
+ .b8 48
659
+ .b8 100
660
+ .b8 49
661
+ .b8 100
662
+ .b8 50
663
+ .b8 100
664
+ .b8 51
665
+ .b8 100
666
+ .b8 52
667
+ .b8 100
668
+ .b8 101
669
+ .b8 53
670
+ .b8 100
671
+ .b8 101
672
+ .b8 0
673
+ .b32 0
674
+ $L__pubNames_end0:
675
+ }
676
+ .section .debug_pubtypes
677
+ {
678
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
679
+ $L__pubTypes_start0:
680
+ .b8 2
681
+ .b8 0
682
+ .b32 .debug_info
683
+ .b32 395
684
+ .b32 0
685
+ $L__pubTypes_end0:
686
+ }
687
+ .section .debug_loc { }
.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttgir ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> // 4 elements/thread * 32 threads/warp * 2 warps = 256 elements, the size of every tensor below
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { // 2 warps of 32 threads, matching the #blocked layout
3
+ tt.func public @triton__0d1d2d3d4de5de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // per program: x = arg0 + f32(arg1); stores bf16((x - mean(x)) * rsqrt(var(x) + eps) * arg2) to arg3; arg4/arg5 are not referenced in the body
4
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked> // upper bound compared against the offsets to build the mask %2
5
+ %cst_0 = arith.constant 9.99999974E-6 : f32 // epsilon (about 1e-5) added to the variance before rsqrt
6
+ %cst_1 = arith.constant 2.560000e+02 : f32 // divisor for both reductions (the element count, 256)
7
+ %cst_2 = arith.constant 0.000000e+00 : f32 // scalar zero added to each reduction result
8
+ %c256_i32 = arith.constant 256 : i32 // number of elements handled per program
9
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> // fallback value for masked f32 loads and selects
10
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> // fallback value for the masked bf16 load
11
+ %0 = tt.get_program_id x : i32 // program index along x
12
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> // offsets 0..255 inside this program's block
13
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> // mask = offset < 256; true for every offset produced above
14
+ %3 = arith.muli %0, %c256_i32 : i32 // first element index of this program's block
15
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
16
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked> // global element index = block start + offset
17
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
18
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
19
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> // f32 input from arg0 at the global index
20
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
21
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
22
+ %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> // bf16 input from arg1 at the same global index
23
+ %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> // widen to f32 before the add
24
+ %13 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
25
+ %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked> // indexed by the in-block offset only, so every program reads the same 256 values
26
+ %15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> // scale vector from arg2; note evict = 3 here versus 1 on the other loads
27
+ %16 = arith.addf %8, %12 : tensor<256xf32, #blocked> // x = arg0 value + arg1 value
28
+ %17 = arith.select %2, %16, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> // masked-off lanes would contribute 0 to the sum
29
+ %18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({ // sum of x over the 256 elements
30
+ ^bb0(%arg6: f32, %arg7: f32):
31
+ %36 = arith.addf %arg6, %arg7 : f32 // combine step: plain addition
32
+ tt.reduce.return %36 : f32
33
+ }) : (tensor<256xf32, #blocked>) -> f32
34
+ %19 = arith.addf %18, %cst_2 : f32 // sum + 0.0
35
+ %20 = arith.divf %19, %cst_1 : f32 // mean = sum / 256
36
+ %21 = tt.splat %20 : (f32) -> tensor<256xf32, #blocked>
37
+ %22 = arith.subf %16, %21 : tensor<256xf32, #blocked> // centered values: x - mean
38
+ %23 = arith.mulf %22, %22 : tensor<256xf32, #blocked> // squared deviations
39
+ %24 = arith.select %2, %23, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
40
+ %25 = "tt.reduce"(%24) <{axis = 0 : i32}> ({ // sum of squared deviations
41
+ ^bb0(%arg6: f32, %arg7: f32):
42
+ %36 = arith.addf %arg6, %arg7 : f32 // combine step: plain addition
43
+ tt.reduce.return %36 : f32
44
+ }) : (tensor<256xf32, #blocked>) -> f32
45
+ %26 = arith.addf %25, %cst_2 : f32 // sum + 0.0
46
+ %27 = arith.divf %26, %cst_1 : f32 // variance = sum of squared deviations / 256 (divides by N, not N-1)
47
+ %28 = arith.addf %27, %cst_0 : f32 // variance + epsilon
48
+ %29 = tt.extern_elementwise %28 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 // reciprocal square root via the libdevice symbol __nv_rsqrtf
49
+ %30 = tt.splat %29 : (f32) -> tensor<256xf32, #blocked>
50
+ %31 = arith.mulf %22, %30 : tensor<256xf32, #blocked> // normalized = (x - mean) * rsqrt(variance + epsilon)
51
+ %32 = arith.mulf %31, %15 : tensor<256xf32, #blocked> // scale by the arg2 vector; no additive term is applied
52
+ %33 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
53
+ %34 = tt.addptr %33, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked> // output address uses the same global index as the inputs
54
+ %35 = arith.truncf %32 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked> // narrow the f32 result to bf16
55
+ tt.store %34, %35, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked> // masked store to arg3
56
+ tt.return
57
+ }
58
+ }
.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttir ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module { // layout-free Triton IR: tensors carry no #blocked encoding here
2
+ tt.func public @triton__0d1d2d3d4de5de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // per program: x = arg0 + f32(arg1); stores bf16((x - mean(x)) * rsqrt(var(x) + eps) * arg2) to arg3; arg4/arg5 are not referenced in the body
3
+ %c256_i32 = arith.constant 256 : i32 // number of elements handled per program
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16> // fallback value for the masked bf16 load
5
+ %cst_0 = arith.constant 0.000000e+00 : f32 // scalar zero added to each reduction result
6
+ %cst_1 = arith.constant 2.560000e+02 : f32 // divisor for both reductions (the element count, 256)
7
+ %cst_2 = arith.constant 9.99999974E-6 : f32 // epsilon (about 1e-5) added to the variance before rsqrt
8
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> // fallback value for masked f32 loads and selects
9
+ %cst_4 = arith.constant dense<256> : tensor<256xi32> // upper bound compared against the offsets to build the mask %2
10
+ %0 = tt.get_program_id x : i32 // program index along x
11
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> // offsets 0..255 inside this program's block
12
+ %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32> // mask = offset < 256; true for every offset produced above
13
+ %3 = arith.muli %0, %c256_i32 : i32 // first element index of this program's block
14
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32>
15
+ %5 = arith.addi %1, %4 : tensor<256xi32> // global element index = block start + offset
16
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
17
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
18
+ %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> // f32 input from arg0 at the global index
19
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
20
+ %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
21
+ %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> // bf16 input from arg1 at the same global index
22
+ %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32> // widen to f32 before the add
23
+ %13 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
24
+ %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32> // indexed by the in-block offset only, so every program reads the same 256 values
25
+ %15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> // scale vector from arg2; note evict = 3 here versus 1 on the other loads
26
+ %16 = arith.addf %8, %12 : tensor<256xf32> // x = arg0 value + arg1 value
27
+ %17 = arith.select %2, %16, %cst_3 : tensor<256xi1>, tensor<256xf32> // masked-off lanes would contribute 0 to the sum
28
+ %18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({ // sum of x over the 256 elements
29
+ ^bb0(%arg6: f32, %arg7: f32):
30
+ %36 = arith.addf %arg6, %arg7 : f32 // combine step: plain addition
31
+ tt.reduce.return %36 : f32
32
+ }) : (tensor<256xf32>) -> f32
33
+ %19 = arith.addf %18, %cst_0 : f32 // sum + 0.0
34
+ %20 = arith.divf %19, %cst_1 : f32 // mean = sum / 256
35
+ %21 = tt.splat %20 : (f32) -> tensor<256xf32>
36
+ %22 = arith.subf %16, %21 : tensor<256xf32> // centered values: x - mean
37
+ %23 = arith.mulf %22, %22 : tensor<256xf32> // squared deviations
38
+ %24 = arith.select %2, %23, %cst_3 : tensor<256xi1>, tensor<256xf32>
39
+ %25 = "tt.reduce"(%24) <{axis = 0 : i32}> ({ // sum of squared deviations
40
+ ^bb0(%arg6: f32, %arg7: f32):
41
+ %36 = arith.addf %arg6, %arg7 : f32 // combine step: plain addition
42
+ tt.reduce.return %36 : f32
43
+ }) : (tensor<256xf32>) -> f32
44
+ %26 = arith.addf %25, %cst_0 : f32 // sum + 0.0
45
+ %27 = arith.divf %26, %cst_1 : f32 // variance = sum of squared deviations / 256 (divides by N, not N-1)
46
+ %28 = arith.addf %27, %cst_2 : f32 // variance + epsilon
47
+ %29 = tt.extern_elementwise %28 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 // reciprocal square root via the libdevice symbol __nv_rsqrtf
48
+ %30 = tt.splat %29 : (f32) -> tensor<256xf32>
49
+ %31 = arith.mulf %22, %30 : tensor<256xf32> // normalized = (x - mean) * rsqrt(variance + epsilon)
50
+ %32 = arith.mulf %31, %15 : tensor<256xf32> // scale by the arg2 vector; no additive term is applied
51
+ %33 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
52
+ %34 = tt.addptr %33, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32> // output address uses the same global index as the inputs
53
+ %35 = arith.truncf %32 : tensor<256xf32> to tensor<256xbf16> // narrow the f32 result to bf16
54
+ tt.store %34, %35, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16> // masked store to arg3
55
+ tt.return
56
+ }
57
+ }
.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.cubin ADDED
Binary file (60 kB). View file
 
.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ptx ADDED
@@ -0,0 +1,1854 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6de7de
10
+ .extern .func __assertfail // declaration only; the kernel below calls it when its index bounds check fails
11
+ (
12
+ .param .b64 __assertfail_param_0, // call site passes the generic address of the assertion message bytes
13
+ .param .b64 __assertfail_param_1, // call site passes the generic address of the file-name bytes
14
+ .param .b32 __assertfail_param_2, // call site passes 883 -- presumably the source line number; confirm against the runtime's signature
15
+ .param .b64 __assertfail_param_3, // call site passes the generic address of the function-name bytes
16
+ .param .b64 __assertfail_param_4 // call site passes 1 -- presumably the character size in bytes; confirm against the runtime's signature
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; // ASCII "_call_with_frames_removed" (25 bytes, no NUL terminator)
20
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; // ASCII "<frozen importlib._bootstrap_external>" (38 bytes, no NUL terminator)
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55}; // ASCII "index out of bounds: 0 <= tmp16 < 50257"
22
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; // ASCII "_call_with_frames_removed" (same bytes as assertFunc_1)
23
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; // ASCII "<frozen importlib._bootstrap_external>" (same bytes as assertFile_1)
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; // ASCII "index out of bounds: 0 <= tmp3 < 50257"
25
+ .extern .shared .align 1 .b8 global_smem[]; // shared-memory byte array declared without a size here
26
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; // ASCII "__CUDA_FTZ" followed by a NUL byte
27
+
28
+ .visible .entry triton__0d1d2d3d4d5d6de7de(
29
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
30
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
31
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
32
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
33
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
34
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
35
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
36
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
37
+ )
38
+ .maxntid 256, 1, 1
39
+ {
40
+ .reg .pred %p<137>;
41
+ .reg .b16 %rs<49>;
42
+ .reg .b32 %r<439>;
43
+ .reg .f32 %f<487>;
44
+ .reg .b64 %rd<124>;
45
+ .loc 1 18 0
46
+ $L__func_begin0:
47
+ .loc 1 18 0
48
+
49
+ ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6de7de_param_4];
50
+ ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6de7de_param_1];
51
+ ld.param.u64 %rd59, [triton__0d1d2d3d4d5d6de7de_param_0];
52
+ $L__tmp0:
53
+ .loc 1 22 44
54
+ mov.u32 %r89, %tid.x;
55
+ ld.param.u64 %rd60, [triton__0d1d2d3d4d5d6de7de_param_2];
56
+ bfe.u32 %r90, %r89, 5, 3;
57
+ ld.param.u64 %rd61, [triton__0d1d2d3d4d5d6de7de_param_3];
58
+ and.b32 %r91, %r89, 15;
59
+ .loc 1 24 33
60
+ shl.b32 %r92, %r89, 3;
61
+ and.b32 %r1, %r92, 248;
62
+ and.b32 %r2, %r89, 255;
63
+ .loc 1 21 28
64
+ mov.u32 %r24, %ctaid.x;
65
+ .loc 1 21 33
66
+ shl.b32 %r93, %r24, 4;
67
+ .loc 1 22 23
68
+ or.b32 %r94, %r93, %r90;
69
+ or.b32 %r95, %r94, 8;
70
+ or.b32 %r96, %r93, %r91;
71
+ .loc 1 26 30
72
+ mul.wide.s32 %rd62, %r94, 8;
73
+ add.s64 %rd20, %rd59, %rd62;
74
+ add.s64 %rd36, %rd20, 64;
75
+ mul.wide.s32 %rd63, %r96, 8;
76
+ add.s64 %rd52, %rd59, %rd63;
77
+ mov.pred %p113, -1;
78
+ .loc 1 26 35
79
+ mov.u64 %rd19, 0x0;
80
+ @%p113 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd20 + 0 ];
81
+ mov.u64 %rd21, 0x0;
82
+ @%p113 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd20 + 0 ];
83
+ mov.u64 %rd23, 0x0;
84
+ @%p113 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd20 + 0 ];
85
+ mov.u64 %rd25, 0x0;
86
+ @%p113 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd20 + 0 ];
87
+ mov.u64 %rd27, 0x0;
88
+ @%p113 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd20 + 0 ];
89
+ mov.u64 %rd29, 0x0;
90
+ @%p113 ld.global.L1::evict_last.b64 { %rd29 }, [ %rd20 + 0 ];
91
+ mov.u64 %rd31, 0x0;
92
+ @%p113 ld.global.L1::evict_last.b64 { %rd31 }, [ %rd20 + 0 ];
93
+ mov.u64 %rd33, 0x0;
94
+ @%p113 ld.global.L1::evict_last.b64 { %rd33 }, [ %rd20 + 0 ];
95
+ mov.u64 %rd35, 0x0;
96
+ @%p113 ld.global.L1::evict_last.b64 { %rd35 }, [ %rd36 + 0 ];
97
+ mov.u64 %rd37, 0x0;
98
+ @%p113 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd36 + 0 ];
99
+ mov.u64 %rd39, 0x0;
100
+ @%p113 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd36 + 0 ];
101
+ mov.u64 %rd41, 0x0;
102
+ @%p113 ld.global.L1::evict_last.b64 { %rd41 }, [ %rd36 + 0 ];
103
+ mov.u64 %rd43, 0x0;
104
+ @%p113 ld.global.L1::evict_last.b64 { %rd43 }, [ %rd36 + 0 ];
105
+ mov.u64 %rd45, 0x0;
106
+ @%p113 ld.global.L1::evict_last.b64 { %rd45 }, [ %rd36 + 0 ];
107
+ mov.u64 %rd47, 0x0;
108
+ @%p113 ld.global.L1::evict_last.b64 { %rd47 }, [ %rd36 + 0 ];
109
+ mov.u64 %rd49, 0x0;
110
+ @%p113 ld.global.L1::evict_last.b64 { %rd49 }, [ %rd36 + 0 ];
111
+ mov.u64 %rd51, 0x0;
112
+ @%p113 ld.global.L1::evict_last.b64 { %rd51 }, [ %rd52 + 0 ];
113
+ .loc 1 27 18
114
+ bfe.s32 %r97, %r24, 27, 1;
115
+ shr.u32 %r98, %r97, 23;
116
+ add.s32 %r99, %r94, %r98;
117
+ and.b32 %r100, %r99, 16776704;
118
+ sub.s32 %r101, %r94, %r100;
119
+ add.s32 %r102, %r95, %r98;
120
+ and.b32 %r103, %r102, 16776704;
121
+ sub.s32 %r104, %r95, %r103;
122
+ .loc 1 35 44
123
+ shl.b32 %r105, %r101, 8;
124
+ shl.b32 %r106, %r104, 8;
125
+ .loc 1 35 40
126
+ or.b32 %r107, %r105, %r1;
127
+ or.b32 %r108, %r106, %r1;
128
+ .loc 1 35 34
129
+ mul.wide.s32 %rd64, %r107, 4;
130
+ add.s64 %rd89, %rd60, %rd64;
131
+ cvt.s64.s32 %rd65, %r105;
132
+ cvt.u64.u32 %rd66, %r1;
133
+ or.b64 %rd67, %rd65, %rd66;
134
+ shl.b64 %rd68, %rd67, 2;
135
+ add.s64 %rd69, %rd60, %rd68;
136
+ add.s64 %rd90, %rd69, 16;
137
+ mul.wide.s32 %rd70, %r108, 4;
138
+ add.s64 %rd91, %rd60, %rd70;
139
+ cvt.s64.s32 %rd71, %r106;
140
+ or.b64 %rd72, %rd71, %rd66;
141
+ shl.b64 %rd73, %rd72, 2;
142
+ add.s64 %rd74, %rd60, %rd73;
143
+ add.s64 %rd92, %rd74, 16;
144
+ mov.b32 %r325, 0;
145
+ .loc 1 35 50
146
+ mov.u32 %r25, 0x0;
147
+ mov.u32 %r26, 0x0;
148
+ mov.u32 %r27, 0x0;
149
+ mov.u32 %r28, 0x0;
150
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd89 + 0 ];
151
+ @!%p113 mov.u32 %r25, %r325;
152
+ @!%p113 mov.u32 %r26, %r325;
153
+ @!%p113 mov.u32 %r27, %r325;
154
+ @!%p113 mov.u32 %r28, %r325;
155
+ mov.b32 %f1, %r25;
156
+ mov.b32 %f2, %r26;
157
+ mov.b32 %f3, %r27;
158
+ mov.b32 %f4, %r28;
159
+ mov.u32 %r33, 0x0;
160
+ mov.u32 %r34, 0x0;
161
+ mov.u32 %r35, 0x0;
162
+ mov.u32 %r36, 0x0;
163
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd90 + 0 ];
164
+ @!%p113 mov.u32 %r33, %r325;
165
+ @!%p113 mov.u32 %r34, %r325;
166
+ @!%p113 mov.u32 %r35, %r325;
167
+ @!%p113 mov.u32 %r36, %r325;
168
+ mov.b32 %f5, %r33;
169
+ mov.b32 %f6, %r34;
170
+ mov.b32 %f7, %r35;
171
+ mov.b32 %f8, %r36;
172
+ mov.u32 %r41, 0x0;
173
+ mov.u32 %r42, 0x0;
174
+ mov.u32 %r43, 0x0;
175
+ mov.u32 %r44, 0x0;
176
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd91 + 0 ];
177
+ @!%p113 mov.u32 %r41, %r325;
178
+ @!%p113 mov.u32 %r42, %r325;
179
+ @!%p113 mov.u32 %r43, %r325;
180
+ @!%p113 mov.u32 %r44, %r325;
181
+ mov.b32 %f9, %r41;
182
+ mov.b32 %f10, %r42;
183
+ mov.b32 %f11, %r43;
184
+ mov.b32 %f12, %r44;
185
+ mov.u32 %r49, 0x0;
186
+ mov.u32 %r50, 0x0;
187
+ mov.u32 %r51, 0x0;
188
+ mov.u32 %r52, 0x0;
189
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r49, %r50, %r51, %r52 }, [ %rd92 + 0 ];
190
+ @!%p113 mov.u32 %r49, %r325;
191
+ @!%p113 mov.u32 %r50, %r325;
192
+ @!%p113 mov.u32 %r51, %r325;
193
+ @!%p113 mov.u32 %r52, %r325;
194
+ mov.b32 %f13, %r49;
195
+ mov.b32 %f14, %r50;
196
+ mov.b32 %f15, %r51;
197
+ mov.b32 %f16, %r52;
198
+ .loc 1 36 44
199
+ shl.b32 %r109, %r94, 8;
200
+ shl.b32 %r110, %r95, 8;
201
+ .loc 1 36 40
202
+ or.b32 %r111, %r109, %r1;
203
+ or.b32 %r112, %r110, %r1;
204
+ .loc 1 36 34
205
+ mul.wide.s32 %rd75, %r111, 2;
206
+ add.s64 %rd93, %rd61, %rd75;
207
+ mul.wide.s32 %rd76, %r112, 2;
208
+ add.s64 %rd94, %rd61, %rd76;
209
+ .loc 1 36 50
210
+ mov.u32 %r57, 0x0;
211
+ mov.u32 %r58, 0x0;
212
+ mov.u32 %r59, 0x0;
213
+ mov.u32 %r60, 0x0;
214
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r57, %r58, %r59, %r60 }, [ %rd93 + 0 ];
215
+ @!%p113 mov.u32 %r57, %r325;
216
+ @!%p113 mov.u32 %r58, %r325;
217
+ @!%p113 mov.u32 %r59, %r325;
218
+ @!%p113 mov.u32 %r60, %r325;
219
+ cvt.u16.u32 %rs1, %r57;
220
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r57; }
221
+ cvt.u16.u32 %rs3, %r58;
222
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r58; }
223
+ cvt.u16.u32 %rs5, %r59;
224
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r59; }
225
+ cvt.u16.u32 %rs7, %r60;
226
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r60; }
227
+ mov.u32 %r65, 0x0;
228
+ mov.u32 %r66, 0x0;
229
+ mov.u32 %r67, 0x0;
230
+ mov.u32 %r68, 0x0;
231
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r65, %r66, %r67, %r68 }, [ %rd94 + 0 ];
232
+ @!%p113 mov.u32 %r65, %r325;
233
+ @!%p113 mov.u32 %r66, %r325;
234
+ @!%p113 mov.u32 %r67, %r325;
235
+ @!%p113 mov.u32 %r68, %r325;
236
+ cvt.u16.u32 %rs9, %r65;
237
+ { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r65; }
238
+ cvt.u16.u32 %rs11, %r66;
239
+ { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r66; }
240
+ cvt.u16.u32 %rs13, %r67;
241
+ { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r67; }
242
+ cvt.u16.u32 %rs15, %r68;
243
+ { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r68; }
244
+ .loc 1 36 101
245
+ cvt.f32.bf16 %r73, %rs1;
246
+ mov.b32 %f17, %r73;
247
+ cvt.f32.bf16 %r74, %rs2;
248
+ mov.b32 %f18, %r74;
249
+ cvt.f32.bf16 %r75, %rs3;
250
+ mov.b32 %f19, %r75;
251
+ cvt.f32.bf16 %r76, %rs4;
252
+ mov.b32 %f20, %r76;
253
+ cvt.f32.bf16 %r77, %rs5;
254
+ mov.b32 %f21, %r77;
255
+ cvt.f32.bf16 %r78, %rs6;
256
+ mov.b32 %f22, %r78;
257
+ cvt.f32.bf16 %r79, %rs7;
258
+ mov.b32 %f23, %r79;
259
+ cvt.f32.bf16 %r80, %rs8;
260
+ mov.b32 %f24, %r80;
261
+ cvt.f32.bf16 %r81, %rs9;
262
+ mov.b32 %f25, %r81;
263
+ cvt.f32.bf16 %r82, %rs10;
264
+ mov.b32 %f26, %r82;
265
+ cvt.f32.bf16 %r83, %rs11;
266
+ mov.b32 %f27, %r83;
267
+ cvt.f32.bf16 %r84, %rs12;
268
+ mov.b32 %f28, %r84;
269
+ cvt.f32.bf16 %r85, %rs13;
270
+ mov.b32 %f29, %r85;
271
+ cvt.f32.bf16 %r86, %rs14;
272
+ mov.b32 %f30, %r86;
273
+ cvt.f32.bf16 %r87, %rs15;
274
+ mov.b32 %f31, %r87;
275
+ cvt.f32.bf16 %r88, %rs16;
276
+ mov.b32 %f32, %r88;
277
+ .loc 1 37 22
278
+ add.s64 %rd77, %rd51, 50257;
279
+ .loc 1 38 22
280
+ setp.lt.s64 %p48, %rd51, 0;
281
+ .loc 1 39 36
282
+ selp.b64 %rd11, %rd77, %rd51, %p48;
283
+ .loc 1 40 40
284
+ setp.lt.u64 %p49, %rd11, 50257;
285
+ mov.b32 %r438, 883;
286
+ mov.u64 %rd123, 1;
287
+ .loc 1 40 55
288
+ @%p49 bra $L__BB0_2;
289
+ mov.u64 %rd78, assertMessage_0;
290
+ cvta.global.u64 %rd79, %rd78;
291
+ mov.u64 %rd80, assertFile_0;
292
+ cvta.global.u64 %rd81, %rd80;
293
+ mov.u64 %rd82, assertFunc_0;
294
+ cvta.global.u64 %rd83, %rd82;
295
+ { // callseq 8, 0
296
+ .reg .b32 temp_param_reg;
297
+ .param .b64 param0;
298
+ st.param.b64 [param0+0], %rd79;
299
+ .param .b64 param1;
300
+ st.param.b64 [param1+0], %rd81;
301
+ .param .b32 param2;
302
+ st.param.b32 [param2+0], %r438;
303
+ .param .b64 param3;
304
+ st.param.b64 [param3+0], %rd83;
305
+ .param .b64 param4;
306
+ st.param.b64 [param4+0], %rd123;
307
+ call.uni
308
+ __assertfail,
309
+ (
310
+ param0,
311
+ param1,
312
+ param2,
313
+ param3,
314
+ param4
315
+ );
316
+ } // callseq 8
317
+ $L__BB0_2:
318
+ .loc 1 0 55
319
+ ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6de7de_param_5];
320
+ cvt.s64.s32 %rd7, %r111;
321
+ cvt.s64.s32 %rd9, %r112;
322
+ .loc 1 38 22
323
+ setp.lt.s64 %p103, %rd35, 0;
324
+ setp.lt.s64 %p104, %rd19, 0;
325
+ .loc 1 41 44
326
+ shl.b64 %rd96, %rd19, 8;
327
+ add.s64 %rd97, %rd96, 12865792;
328
+ selp.b64 %rd98, %rd97, %rd96, %p104;
329
+ shl.b64 %rd99, %rd35, 8;
330
+ add.s64 %rd100, %rd99, 12865792;
331
+ selp.b64 %rd101, %rd100, %rd99, %p103;
332
+ .loc 1 41 40
333
+ or.b64 %rd103, %rd98, %rd66;
334
+ or.b64 %rd104, %rd101, %rd66;
335
+ .loc 1 41 34
336
+ shl.b64 %rd105, %rd103, 2;
337
+ add.s64 %rd115, %rd16, %rd105;
338
+ add.s64 %rd116, %rd115, 16;
339
+ shl.b64 %rd106, %rd104, 2;
340
+ add.s64 %rd117, %rd16, %rd106;
341
+ add.s64 %rd118, %rd117, 16;
342
+ .loc 1 41 52
343
+ mov.u32 %r114, 0x0;
344
+ mov.u32 %r115, 0x0;
345
+ mov.u32 %r116, 0x0;
346
+ mov.u32 %r117, 0x0;
347
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r114, %r115, %r116, %r117 }, [ %rd115 + 0 ];
348
+ @!%p113 mov.u32 %r114, %r325;
349
+ @!%p113 mov.u32 %r115, %r325;
350
+ @!%p113 mov.u32 %r116, %r325;
351
+ @!%p113 mov.u32 %r117, %r325;
352
+ mov.b32 %f59, %r114;
353
+ mov.b32 %f60, %r115;
354
+ mov.b32 %f61, %r116;
355
+ mov.b32 %f62, %r117;
356
+ mov.u32 %r122, 0x0;
357
+ mov.u32 %r123, 0x0;
358
+ mov.u32 %r124, 0x0;
359
+ mov.u32 %r125, 0x0;
360
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r122, %r123, %r124, %r125 }, [ %rd116 + 0 ];
361
+ @!%p113 mov.u32 %r122, %r325;
362
+ @!%p113 mov.u32 %r123, %r325;
363
+ @!%p113 mov.u32 %r124, %r325;
364
+ @!%p113 mov.u32 %r125, %r325;
365
+ mov.b32 %f63, %r122;
366
+ mov.b32 %f64, %r123;
367
+ mov.b32 %f65, %r124;
368
+ mov.b32 %f66, %r125;
369
+ mov.u32 %r130, 0x0;
370
+ mov.u32 %r131, 0x0;
371
+ mov.u32 %r132, 0x0;
372
+ mov.u32 %r133, 0x0;
373
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r130, %r131, %r132, %r133 }, [ %rd117 + 0 ];
374
+ @!%p113 mov.u32 %r130, %r325;
375
+ @!%p113 mov.u32 %r131, %r325;
376
+ @!%p113 mov.u32 %r132, %r325;
377
+ @!%p113 mov.u32 %r133, %r325;
378
+ mov.b32 %f67, %r130;
379
+ mov.b32 %f68, %r131;
380
+ mov.b32 %f69, %r132;
381
+ mov.b32 %f70, %r133;
382
+ mov.u32 %r138, 0x0;
383
+ mov.u32 %r139, 0x0;
384
+ mov.u32 %r140, 0x0;
385
+ mov.u32 %r141, 0x0;
386
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r138, %r139, %r140, %r141 }, [ %rd118 + 0 ];
387
+ @!%p113 mov.u32 %r138, %r325;
388
+ @!%p113 mov.u32 %r139, %r325;
389
+ @!%p113 mov.u32 %r140, %r325;
390
+ @!%p113 mov.u32 %r141, %r325;
391
+ mov.b32 %f71, %r138;
392
+ mov.b32 %f72, %r139;
393
+ mov.b32 %f73, %r140;
394
+ mov.b32 %f74, %r141;
395
+ .loc 1 42 22
396
+ add.f32 %f75, %f1, %f59;
397
+ add.f32 %f76, %f2, %f60;
398
+ add.f32 %f77, %f3, %f61;
399
+ add.f32 %f78, %f4, %f62;
400
+ add.f32 %f79, %f5, %f63;
401
+ add.f32 %f80, %f6, %f64;
402
+ add.f32 %f81, %f7, %f65;
403
+ add.f32 %f82, %f8, %f66;
404
+ add.f32 %f83, %f9, %f67;
405
+ add.f32 %f84, %f10, %f68;
406
+ add.f32 %f85, %f11, %f69;
407
+ add.f32 %f86, %f12, %f70;
408
+ add.f32 %f87, %f13, %f71;
409
+ add.f32 %f88, %f14, %f72;
410
+ add.f32 %f89, %f15, %f73;
411
+ add.f32 %f90, %f16, %f74;
412
+ .loc 1 44 22
413
+ add.f32 %f91, %f17, %f75;
414
+ add.f32 %f92, %f18, %f76;
415
+ add.f32 %f93, %f19, %f77;
416
+ add.f32 %f94, %f20, %f78;
417
+ add.f32 %f95, %f21, %f79;
418
+ add.f32 %f96, %f22, %f80;
419
+ add.f32 %f97, %f23, %f81;
420
+ add.f32 %f98, %f24, %f82;
421
+ add.f32 %f99, %f25, %f83;
422
+ add.f32 %f100, %f26, %f84;
423
+ add.f32 %f101, %f27, %f85;
424
+ add.f32 %f102, %f28, %f86;
425
+ add.f32 %f103, %f29, %f87;
426
+ add.f32 %f104, %f30, %f88;
427
+ add.f32 %f105, %f31, %f89;
428
+ add.f32 %f106, %f32, %f90;
429
+ $L__tmp1:
430
+ .loc 2 98 22
431
+ add.f32 %f107, %f91, 0f00000000;
432
+ add.f32 %f108, %f92, 0f00000000;
433
+ add.f32 %f109, %f93, 0f00000000;
434
+ add.f32 %f110, %f94, 0f00000000;
435
+ add.f32 %f111, %f95, 0f00000000;
436
+ add.f32 %f112, %f96, 0f00000000;
437
+ add.f32 %f113, %f97, 0f00000000;
438
+ add.f32 %f114, %f98, 0f00000000;
439
+ add.f32 %f115, %f99, 0f00000000;
440
+ add.f32 %f116, %f100, 0f00000000;
441
+ add.f32 %f117, %f101, 0f00000000;
442
+ add.f32 %f118, %f102, 0f00000000;
443
+ add.f32 %f119, %f103, 0f00000000;
444
+ add.f32 %f120, %f104, 0f00000000;
445
+ add.f32 %f121, %f105, 0f00000000;
446
+ add.f32 %f122, %f106, 0f00000000;
447
+ .loc 2 101 30
448
+ sub.f32 %f123, %f91, %f107;
449
+ sub.f32 %f124, %f92, %f108;
450
+ sub.f32 %f125, %f93, %f109;
451
+ sub.f32 %f126, %f94, %f110;
452
+ sub.f32 %f127, %f95, %f111;
453
+ sub.f32 %f128, %f96, %f112;
454
+ sub.f32 %f129, %f97, %f113;
455
+ sub.f32 %f130, %f98, %f114;
456
+ sub.f32 %f131, %f99, %f115;
457
+ sub.f32 %f132, %f100, %f116;
458
+ sub.f32 %f133, %f101, %f117;
459
+ sub.f32 %f134, %f102, %f118;
460
+ sub.f32 %f135, %f103, %f119;
461
+ sub.f32 %f136, %f104, %f120;
462
+ sub.f32 %f137, %f105, %f121;
463
+ sub.f32 %f138, %f106, %f122;
464
+ .loc 2 101 13
465
+ fma.rn.f32 %f139, %f91, %f123, 0f00000000;
466
+ fma.rn.f32 %f140, %f92, %f124, 0f00000000;
467
+ fma.rn.f32 %f141, %f93, %f125, 0f00000000;
468
+ fma.rn.f32 %f142, %f94, %f126, 0f00000000;
469
+ fma.rn.f32 %f143, %f95, %f127, 0f00000000;
470
+ fma.rn.f32 %f144, %f96, %f128, 0f00000000;
471
+ fma.rn.f32 %f145, %f97, %f129, 0f00000000;
472
+ fma.rn.f32 %f146, %f98, %f130, 0f00000000;
473
+ fma.rn.f32 %f147, %f99, %f131, 0f00000000;
474
+ fma.rn.f32 %f148, %f100, %f132, 0f00000000;
475
+ fma.rn.f32 %f149, %f101, %f133, 0f00000000;
476
+ fma.rn.f32 %f150, %f102, %f134, 0f00000000;
477
+ fma.rn.f32 %f151, %f103, %f135, 0f00000000;
478
+ fma.rn.f32 %f152, %f104, %f136, 0f00000000;
479
+ fma.rn.f32 %f153, %f105, %f137, 0f00000000;
480
+ fma.rn.f32 %f154, %f106, %f138, 0f00000000;
481
+ $L__tmp2:
482
+ .loc 2 108 21
483
+ sub.f32 %f155, %f108, %f107;
484
+ mov.b32 %r147, 1065353216;
485
+ mov.b32 %r148, 1073741824;
486
+ .loc 2 110 60
487
+ div.full.f32 %r146, %r147, %r148;
488
+ mov.b32 %f156, %r146;
489
+ .loc 2 112 17
490
+ fma.rn.f32 %f157, %f156, %f155, %f107;
491
+ .loc 2 113 15
492
+ add.f32 %f158, %f139, %f140;
493
+ .loc 2 113 30
494
+ mul.f32 %f159, %f155, %f155;
495
+ .loc 2 113 22
496
+ fma.rn.f32 %f160, %f156, %f159, %f158;
497
+ .loc 2 108 21
498
+ sub.f32 %f161, %f109, %f157;
499
+ mov.b32 %r151, 1077936128;
500
+ .loc 2 110 60
501
+ div.full.f32 %r149, %r147, %r151;
502
+ mov.b32 %f162, %r149;
503
+ .loc 2 112 17
504
+ fma.rn.f32 %f163, %f162, %f161, %f157;
505
+ .loc 2 113 15
506
+ add.f32 %f164, %f141, %f160;
507
+ .loc 2 113 30
508
+ mul.f32 %f165, %f161, %f161;
509
+ .loc 2 113 38
510
+ fma.rn.f32 %f166, %f161, %f161, %f165;
511
+ .loc 2 113 22
512
+ fma.rn.f32 %f167, %f162, %f166, %f164;
513
+ .loc 2 108 21
514
+ sub.f32 %f168, %f110, %f163;
515
+ mov.b32 %r154, 1082130432;
516
+ .loc 2 110 60
517
+ div.full.f32 %r152, %r147, %r154;
518
+ mov.b32 %f169, %r152;
519
+ .loc 2 112 17
520
+ fma.rn.f32 %f170, %f169, %f168, %f163;
521
+ .loc 2 113 15
522
+ add.f32 %f171, %f142, %f167;
523
+ .loc 2 113 30
524
+ mul.f32 %f172, %f168, %f168;
525
+ .loc 2 113 38
526
+ mul.f32 %f173, %f172, 0f40400000;
527
+ .loc 2 113 22
528
+ fma.rn.f32 %f174, %f169, %f173, %f171;
529
+ .loc 2 108 21
530
+ sub.f32 %f175, %f111, %f170;
531
+ mov.b32 %r157, 1084227584;
532
+ .loc 2 110 60
533
+ div.full.f32 %r155, %r147, %r157;
534
+ mov.b32 %f176, %r155;
535
+ .loc 2 112 17
536
+ fma.rn.f32 %f177, %f176, %f175, %f170;
537
+ .loc 2 113 15
538
+ add.f32 %f178, %f143, %f174;
539
+ .loc 2 113 30
540
+ mul.f32 %f179, %f175, %f175;
541
+ .loc 2 113 38
542
+ mul.f32 %f180, %f179, 0f40800000;
543
+ .loc 2 113 22
544
+ fma.rn.f32 %f181, %f176, %f180, %f178;
545
+ .loc 2 108 21
546
+ sub.f32 %f182, %f112, %f177;
547
+ mov.b32 %r160, 1086324736;
548
+ .loc 2 110 60
549
+ div.full.f32 %r158, %r147, %r160;
550
+ mov.b32 %f183, %r158;
551
+ .loc 2 112 17
552
+ fma.rn.f32 %f184, %f183, %f182, %f177;
553
+ .loc 2 113 15
554
+ add.f32 %f185, %f144, %f181;
555
+ .loc 2 113 30
556
+ mul.f32 %f186, %f182, %f182;
557
+ .loc 2 113 38
558
+ mul.f32 %f187, %f186, 0f40A00000;
559
+ .loc 2 113 22
560
+ fma.rn.f32 %f188, %f183, %f187, %f185;
561
+ .loc 2 108 21
562
+ sub.f32 %f189, %f113, %f184;
563
+ mov.b32 %r163, 1088421888;
564
+ .loc 2 110 60
565
+ div.full.f32 %r161, %r147, %r163;
566
+ mov.b32 %f190, %r161;
567
+ .loc 2 112 17
568
+ fma.rn.f32 %f191, %f190, %f189, %f184;
569
+ .loc 2 113 15
570
+ add.f32 %f192, %f145, %f188;
571
+ .loc 2 113 30
572
+ mul.f32 %f193, %f189, %f189;
573
+ .loc 2 113 38
574
+ mul.f32 %f194, %f193, 0f40C00000;
575
+ .loc 2 113 22
576
+ fma.rn.f32 %f195, %f190, %f194, %f192;
577
+ .loc 2 108 21
578
+ sub.f32 %f196, %f114, %f191;
579
+ mov.b32 %r166, 1090519040;
580
+ .loc 2 110 60
581
+ div.full.f32 %r164, %r147, %r166;
582
+ mov.b32 %f197, %r164;
583
+ .loc 2 112 17
584
+ fma.rn.f32 %f198, %f197, %f196, %f191;
585
+ .loc 2 113 15
586
+ add.f32 %f199, %f146, %f195;
587
+ .loc 2 113 30
588
+ mul.f32 %f200, %f196, %f196;
589
+ .loc 2 113 38
590
+ mul.f32 %f201, %f200, 0f40E00000;
591
+ .loc 2 113 22
592
+ fma.rn.f32 %f202, %f197, %f201, %f199;
593
+ .loc 2 108 21
594
+ sub.f32 %f203, %f116, %f115;
595
+ .loc 2 110 60
596
+ div.full.f32 %r167, %r147, %r148;
597
+ mov.b32 %f204, %r167;
598
+ .loc 2 112 17
599
+ fma.rn.f32 %f205, %f203, %f204, %f115;
600
+ .loc 2 113 15
601
+ add.f32 %f206, %f147, %f148;
602
+ .loc 2 113 30
603
+ mul.f32 %f207, %f203, %f203;
604
+ .loc 2 113 22
605
+ fma.rn.f32 %f208, %f207, %f204, %f206;
606
+ .loc 2 108 21
607
+ sub.f32 %f209, %f117, %f205;
608
+ .loc 2 110 60
609
+ div.full.f32 %r170, %r147, %r151;
610
+ mov.b32 %f210, %r170;
611
+ .loc 2 112 17
612
+ fma.rn.f32 %f211, %f210, %f209, %f205;
613
+ .loc 2 113 15
614
+ add.f32 %f212, %f149, %f208;
615
+ .loc 2 113 30
616
+ mul.f32 %f213, %f209, %f209;
617
+ .loc 2 113 38
618
+ fma.rn.f32 %f214, %f209, %f209, %f213;
619
+ .loc 2 113 22
620
+ fma.rn.f32 %f215, %f210, %f214, %f212;
621
+ .loc 2 108 21
622
+ sub.f32 %f216, %f118, %f211;
623
+ .loc 2 110 60
624
+ div.full.f32 %r173, %r147, %r154;
625
+ mov.b32 %f217, %r173;
626
+ .loc 2 112 17
627
+ fma.rn.f32 %f218, %f217, %f216, %f211;
628
+ .loc 2 113 15
629
+ add.f32 %f219, %f150, %f215;
630
+ .loc 2 113 30
631
+ mul.f32 %f220, %f216, %f216;
632
+ .loc 2 113 38
633
+ mul.f32 %f221, %f220, 0f40400000;
634
+ .loc 2 113 22
635
+ fma.rn.f32 %f222, %f217, %f221, %f219;
636
+ .loc 2 108 21
637
+ sub.f32 %f223, %f119, %f218;
638
+ .loc 2 110 60
639
+ div.full.f32 %r176, %r147, %r157;
640
+ mov.b32 %f224, %r176;
641
+ .loc 2 112 17
642
+ fma.rn.f32 %f225, %f224, %f223, %f218;
643
+ .loc 2 113 15
644
+ add.f32 %f226, %f151, %f222;
645
+ .loc 2 113 30
646
+ mul.f32 %f227, %f223, %f223;
647
+ .loc 2 113 38
648
+ mul.f32 %f228, %f227, 0f40800000;
649
+ .loc 2 113 22
650
+ fma.rn.f32 %f229, %f224, %f228, %f226;
651
+ .loc 2 108 21
652
+ sub.f32 %f230, %f120, %f225;
653
+ .loc 2 110 60
654
+ div.full.f32 %r179, %r147, %r160;
655
+ mov.b32 %f231, %r179;
656
+ .loc 2 112 17
657
+ fma.rn.f32 %f232, %f231, %f230, %f225;
658
+ .loc 2 113 15
659
+ add.f32 %f233, %f152, %f229;
660
+ .loc 2 113 30
661
+ mul.f32 %f234, %f230, %f230;
662
+ .loc 2 113 38
663
+ mul.f32 %f235, %f234, 0f40A00000;
664
+ .loc 2 113 22
665
+ fma.rn.f32 %f236, %f231, %f235, %f233;
666
+ .loc 2 108 21
667
+ sub.f32 %f237, %f121, %f232;
668
+ .loc 2 110 60
669
+ div.full.f32 %r182, %r147, %r163;
670
+ mov.b32 %f238, %r182;
671
+ .loc 2 112 17
672
+ fma.rn.f32 %f239, %f238, %f237, %f232;
673
+ .loc 2 113 15
674
+ add.f32 %f240, %f153, %f236;
675
+ .loc 2 113 30
676
+ mul.f32 %f241, %f237, %f237;
677
+ .loc 2 113 38
678
+ mul.f32 %f242, %f241, 0f40C00000;
679
+ .loc 2 113 22
680
+ fma.rn.f32 %f243, %f238, %f242, %f240;
681
+ .loc 2 108 21
682
+ sub.f32 %f244, %f122, %f239;
683
+ .loc 2 110 60
684
+ div.full.f32 %r185, %r147, %r166;
685
+ mov.b32 %f245, %r185;
686
+ .loc 2 112 17
687
+ fma.rn.f32 %f246, %f245, %f244, %f239;
688
+ .loc 2 113 15
689
+ add.f32 %f247, %f154, %f243;
690
+ .loc 2 113 30
691
+ mul.f32 %f248, %f244, %f244;
692
+ .loc 2 113 38
693
+ mul.f32 %f249, %f248, 0f40E00000;
694
+ .loc 2 113 22
695
+ fma.rn.f32 %f250, %f245, %f249, %f247;
696
+ $L__tmp3:
697
+ .loc 2 120 46
698
+ mov.b32 %r284, %f198;
699
+ shfl.sync.bfly.b32 %r285, %r284, 16, 31, -1;
700
+ mov.b32 %f251, %r285;
701
+ mov.b32 %r286, %f202;
702
+ shfl.sync.bfly.b32 %r287, %r286, 16, 31, -1;
703
+ mov.b32 %f252, %r287;
704
+ shfl.sync.bfly.b32 %r189, %r166, 16, 31, -1;
705
+ mov.b32 %f253, %r189;
706
+ $L__tmp4:
707
+ .loc 2 108 21
708
+ sub.f32 %f254, %f251, %f198;
709
+ .loc 2 109 28
710
+ add.f32 %f255, %f253, 0f41000000;
711
+ .loc 2 110 39
712
+ setp.eq.f32 %p105, %f255, 0f00000000;
713
+ .loc 2 110 60
714
+ mov.b32 %r190, %f255;
715
+ div.full.f32 %r188, %r189, %r190;
716
+ mov.b32 %f256, %r188;
717
+ .loc 2 110 49
718
+ selp.f32 %f257, 0f00000000, %f256, %p105;
719
+ .loc 2 112 17
720
+ fma.rn.f32 %f258, %f257, %f254, %f198;
721
+ .loc 2 113 15
722
+ add.f32 %f259, %f202, %f252;
723
+ .loc 2 113 30
724
+ mul.f32 %f260, %f254, %f254;
725
+ .loc 2 113 38
726
+ mul.f32 %f261, %f260, 0f41000000;
727
+ .loc 2 113 22
728
+ fma.rn.f32 %f262, %f257, %f261, %f259;
729
+ $L__tmp5:
730
+ .loc 2 120 46
731
+ mov.b32 %r288, %f258;
732
+ shfl.sync.bfly.b32 %r289, %r288, 8, 31, -1;
733
+ mov.b32 %f263, %r289;
734
+ mov.b32 %r290, %f262;
735
+ shfl.sync.bfly.b32 %r291, %r290, 8, 31, -1;
736
+ mov.b32 %f264, %r291;
737
+ shfl.sync.bfly.b32 %r192, %r190, 8, 31, -1;
738
+ mov.b32 %f265, %r192;
739
+ $L__tmp6:
740
+ .loc 2 108 21
741
+ sub.f32 %f266, %f263, %f258;
742
+ .loc 2 109 28
743
+ add.f32 %f267, %f255, %f265;
744
+ .loc 2 110 39
745
+ setp.eq.f32 %p106, %f267, 0f00000000;
746
+ .loc 2 110 60
747
+ mov.b32 %r193, %f267;
748
+ div.full.f32 %r191, %r192, %r193;
749
+ mov.b32 %f268, %r191;
750
+ .loc 2 110 49
751
+ selp.f32 %f269, 0f00000000, %f268, %p106;
752
+ .loc 2 112 17
753
+ fma.rn.f32 %f270, %f269, %f266, %f258;
754
+ .loc 2 113 15
755
+ add.f32 %f271, %f262, %f264;
756
+ .loc 2 113 30
757
+ mul.f32 %f272, %f266, %f266;
758
+ .loc 2 113 38
759
+ mul.f32 %f273, %f255, %f272;
760
+ .loc 2 113 22
761
+ fma.rn.f32 %f274, %f269, %f273, %f271;
762
+ $L__tmp7:
763
+ .loc 2 120 46
764
+ mov.b32 %r292, %f270;
765
+ shfl.sync.bfly.b32 %r293, %r292, 4, 31, -1;
766
+ mov.b32 %f275, %r293;
767
+ mov.b32 %r294, %f274;
768
+ shfl.sync.bfly.b32 %r295, %r294, 4, 31, -1;
769
+ mov.b32 %f276, %r295;
770
+ shfl.sync.bfly.b32 %r195, %r193, 4, 31, -1;
771
+ mov.b32 %f277, %r195;
772
+ $L__tmp8:
773
+ .loc 2 108 21
774
+ sub.f32 %f278, %f275, %f270;
775
+ .loc 2 109 28
776
+ add.f32 %f279, %f267, %f277;
777
+ .loc 2 110 39
778
+ setp.eq.f32 %p107, %f279, 0f00000000;
779
+ .loc 2 110 60
780
+ mov.b32 %r196, %f279;
781
+ div.full.f32 %r194, %r195, %r196;
782
+ mov.b32 %f280, %r194;
783
+ .loc 2 110 49
784
+ selp.f32 %f281, 0f00000000, %f280, %p107;
785
+ .loc 2 112 17
786
+ fma.rn.f32 %f282, %f281, %f278, %f270;
787
+ .loc 2 113 15
788
+ add.f32 %f283, %f274, %f276;
789
+ .loc 2 113 30
790
+ mul.f32 %f284, %f278, %f278;
791
+ .loc 2 113 38
792
+ mul.f32 %f285, %f267, %f284;
793
+ .loc 2 113 22
794
+ fma.rn.f32 %f286, %f281, %f285, %f283;
795
+ $L__tmp9:
796
+ .loc 2 120 46
797
+ mov.b32 %r296, %f282;
798
+ shfl.sync.bfly.b32 %r297, %r296, 2, 31, -1;
799
+ mov.b32 %f287, %r297;
800
+ mov.b32 %r298, %f286;
801
+ shfl.sync.bfly.b32 %r299, %r298, 2, 31, -1;
802
+ mov.b32 %f288, %r299;
803
+ shfl.sync.bfly.b32 %r198, %r196, 2, 31, -1;
804
+ mov.b32 %f289, %r198;
805
+ $L__tmp10:
806
+ .loc 2 108 21
807
+ sub.f32 %f290, %f287, %f282;
808
+ .loc 2 109 28
809
+ add.f32 %f33, %f279, %f289;
810
+ .loc 2 110 39
811
+ setp.eq.f32 %p108, %f33, 0f00000000;
812
+ .loc 2 110 60
813
+ mov.b32 %r199, %f33;
814
+ div.full.f32 %r197, %r198, %r199;
815
+ mov.b32 %f291, %r197;
816
+ .loc 2 110 49
817
+ selp.f32 %f292, 0f00000000, %f291, %p108;
818
+ .loc 2 112 17
819
+ fma.rn.f32 %f34, %f290, %f292, %f282;
820
+ .loc 2 113 15
821
+ add.f32 %f293, %f286, %f288;
822
+ .loc 2 113 30
823
+ mul.f32 %f294, %f290, %f290;
824
+ .loc 2 113 38
825
+ mul.f32 %f295, %f279, %f294;
826
+ .loc 2 113 22
827
+ fma.rn.f32 %f35, %f292, %f295, %f293;
828
+ $L__tmp11:
829
+ .loc 2 120 46
830
+ mov.b32 %r300, %f34;
831
+ shfl.sync.bfly.b32 %r3, %r300, 1, 31, -1;
832
+ mov.b32 %r301, %f35;
833
+ shfl.sync.bfly.b32 %r4, %r301, 1, 31, -1;
834
+ shfl.sync.bfly.b32 %r201, %r199, 1, 31, -1;
835
+ mov.b32 %f296, %r201;
836
+ $L__tmp12:
837
+ .loc 2 109 28
838
+ add.f32 %f36, %f33, %f296;
839
+ .loc 2 110 60
840
+ mov.b32 %r202, %f36;
841
+ div.full.f32 %r200, %r201, %r202;
842
+ mov.b32 %f37, %r200;
843
+ $L__tmp13:
844
+ .loc 2 120 46
845
+ mov.b32 %r302, %f246;
846
+ shfl.sync.bfly.b32 %r303, %r302, 16, 31, -1;
847
+ mov.b32 %f297, %r303;
848
+ mov.b32 %r304, %f250;
849
+ shfl.sync.bfly.b32 %r305, %r304, 16, 31, -1;
850
+ mov.b32 %f298, %r305;
851
+ shfl.sync.bfly.b32 %r204, %r166, 16, 31, -1;
852
+ mov.b32 %f299, %r204;
853
+ $L__tmp14:
854
+ .loc 2 108 21
855
+ sub.f32 %f300, %f297, %f246;
856
+ .loc 2 109 28
857
+ add.f32 %f301, %f299, 0f41000000;
858
+ .loc 2 110 39
859
+ setp.eq.f32 %p109, %f301, 0f00000000;
860
+ .loc 2 110 60
861
+ mov.b32 %r205, %f301;
862
+ div.full.f32 %r203, %r204, %r205;
863
+ mov.b32 %f302, %r203;
864
+ .loc 2 110 49
865
+ selp.f32 %f303, 0f00000000, %f302, %p109;
866
+ .loc 2 112 17
867
+ fma.rn.f32 %f304, %f300, %f303, %f246;
868
+ .loc 2 113 15
869
+ add.f32 %f305, %f250, %f298;
870
+ .loc 2 113 30
871
+ mul.f32 %f306, %f300, %f300;
872
+ .loc 2 113 38
873
+ mul.f32 %f307, %f306, 0f41000000;
874
+ .loc 2 113 22
875
+ fma.rn.f32 %f308, %f307, %f303, %f305;
876
+ $L__tmp15:
877
+ .loc 2 120 46
878
+ mov.b32 %r306, %f304;
879
+ shfl.sync.bfly.b32 %r307, %r306, 8, 31, -1;
880
+ mov.b32 %f309, %r307;
881
+ mov.b32 %r308, %f308;
882
+ shfl.sync.bfly.b32 %r309, %r308, 8, 31, -1;
883
+ mov.b32 %f310, %r309;
884
+ shfl.sync.bfly.b32 %r207, %r205, 8, 31, -1;
885
+ mov.b32 %f311, %r207;
886
+ $L__tmp16:
887
+ .loc 2 108 21
888
+ sub.f32 %f312, %f309, %f304;
889
+ .loc 2 109 28
890
+ add.f32 %f313, %f301, %f311;
891
+ .loc 2 110 39
892
+ setp.eq.f32 %p110, %f313, 0f00000000;
893
+ .loc 2 110 60
894
+ mov.b32 %r208, %f313;
895
+ div.full.f32 %r206, %r207, %r208;
896
+ mov.b32 %f314, %r206;
897
+ .loc 2 110 49
898
+ selp.f32 %f315, 0f00000000, %f314, %p110;
899
+ .loc 2 112 17
900
+ fma.rn.f32 %f316, %f312, %f315, %f304;
901
+ .loc 2 113 15
902
+ add.f32 %f317, %f308, %f310;
903
+ .loc 2 113 30
904
+ mul.f32 %f318, %f312, %f312;
905
+ .loc 2 113 38
906
+ mul.f32 %f319, %f301, %f318;
907
+ .loc 2 113 22
908
+ fma.rn.f32 %f320, %f315, %f319, %f317;
909
+ $L__tmp17:
910
+ .loc 2 120 46
911
+ mov.b32 %r310, %f316;
912
+ shfl.sync.bfly.b32 %r311, %r310, 4, 31, -1;
913
+ mov.b32 %f321, %r311;
914
+ mov.b32 %r312, %f320;
915
+ shfl.sync.bfly.b32 %r313, %r312, 4, 31, -1;
916
+ mov.b32 %f322, %r313;
917
+ shfl.sync.bfly.b32 %r210, %r208, 4, 31, -1;
918
+ mov.b32 %f323, %r210;
919
+ $L__tmp18:
920
+ .loc 2 108 21
921
+ sub.f32 %f324, %f321, %f316;
922
+ .loc 2 109 28
923
+ add.f32 %f325, %f313, %f323;
924
+ .loc 2 110 39
925
+ setp.eq.f32 %p111, %f325, 0f00000000;
926
+ .loc 2 110 60
927
+ mov.b32 %r211, %f325;
928
+ div.full.f32 %r209, %r210, %r211;
929
+ mov.b32 %f326, %r209;
930
+ .loc 2 110 49
931
+ selp.f32 %f327, 0f00000000, %f326, %p111;
932
+ .loc 2 112 17
933
+ fma.rn.f32 %f328, %f324, %f327, %f316;
934
+ .loc 2 113 15
935
+ add.f32 %f329, %f320, %f322;
936
+ .loc 2 113 30
937
+ mul.f32 %f330, %f324, %f324;
938
+ .loc 2 113 38
939
+ mul.f32 %f331, %f313, %f330;
940
+ .loc 2 113 22
941
+ fma.rn.f32 %f332, %f327, %f331, %f329;
942
+ $L__tmp19:
943
+ .loc 2 120 46
944
+ mov.b32 %r314, %f328;
945
+ shfl.sync.bfly.b32 %r315, %r314, 2, 31, -1;
946
+ mov.b32 %f333, %r315;
947
+ mov.b32 %r316, %f332;
948
+ shfl.sync.bfly.b32 %r317, %r316, 2, 31, -1;
949
+ mov.b32 %f334, %r317;
950
+ shfl.sync.bfly.b32 %r213, %r211, 2, 31, -1;
951
+ mov.b32 %f335, %r213;
952
+ $L__tmp20:
953
+ .loc 2 108 21
954
+ sub.f32 %f336, %f333, %f328;
955
+ .loc 2 109 28
956
+ add.f32 %f38, %f325, %f335;
957
+ .loc 2 110 39
958
+ setp.eq.f32 %p112, %f38, 0f00000000;
959
+ .loc 2 110 60
960
+ mov.b32 %r214, %f38;
961
+ div.full.f32 %r212, %r213, %r214;
962
+ mov.b32 %f337, %r212;
963
+ .loc 2 110 49
964
+ selp.f32 %f338, 0f00000000, %f337, %p112;
965
+ .loc 2 112 17
966
+ fma.rn.f32 %f39, %f336, %f338, %f328;
967
+ .loc 2 113 15
968
+ add.f32 %f339, %f332, %f334;
969
+ .loc 2 113 30
970
+ mul.f32 %f340, %f336, %f336;
971
+ .loc 2 113 38
972
+ mul.f32 %f341, %f325, %f340;
973
+ .loc 2 113 22
974
+ fma.rn.f32 %f40, %f338, %f341, %f339;
975
+ $L__tmp21:
976
+ .loc 2 120 46
977
+ mov.b32 %r318, %f39;
978
+ shfl.sync.bfly.b32 %r5, %r318, 1, 31, -1;
979
+ mov.b32 %r319, %f40;
980
+ shfl.sync.bfly.b32 %r6, %r319, 1, 31, -1;
981
+ shfl.sync.bfly.b32 %r216, %r214, 1, 31, -1;
982
+ mov.b32 %f342, %r216;
983
+ $L__tmp22:
984
+ .loc 2 109 28
985
+ add.f32 %f41, %f38, %f342;
986
+ .loc 2 110 60
987
+ mov.b32 %r217, %f41;
988
+ div.full.f32 %r215, %r216, %r217;
989
+ mov.b32 %f42, %r215;
990
+ $L__tmp23:
991
+ .loc 1 62 51
992
+ mov.u32 %r218, 0x0;
993
+ mov.u32 %r219, 0x0;
994
+ mov.u32 %r220, 0x0;
995
+ mov.u32 %r221, 0x0;
996
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r218, %r219, %r220, %r221 }, [ %rd89 + 0 ];
997
+ @!%p113 mov.u32 %r218, %r325;
998
+ @!%p113 mov.u32 %r219, %r325;
999
+ @!%p113 mov.u32 %r220, %r325;
1000
+ @!%p113 mov.u32 %r221, %r325;
1001
+ mov.u32 %r226, 0x0;
1002
+ mov.u32 %r227, 0x0;
1003
+ mov.u32 %r228, 0x0;
1004
+ mov.u32 %r229, 0x0;
1005
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r226, %r227, %r228, %r229 }, [ %rd90 + 0 ];
1006
+ @!%p113 mov.u32 %r226, %r325;
1007
+ @!%p113 mov.u32 %r227, %r325;
1008
+ @!%p113 mov.u32 %r228, %r325;
1009
+ @!%p113 mov.u32 %r229, %r325;
1010
+ mov.u32 %r234, 0x0;
1011
+ mov.u32 %r235, 0x0;
1012
+ mov.u32 %r236, 0x0;
1013
+ mov.u32 %r237, 0x0;
1014
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r234, %r235, %r236, %r237 }, [ %rd91 + 0 ];
1015
+ @!%p113 mov.u32 %r234, %r325;
1016
+ @!%p113 mov.u32 %r235, %r325;
1017
+ @!%p113 mov.u32 %r236, %r325;
1018
+ @!%p113 mov.u32 %r237, %r325;
1019
+ mov.u32 %r242, 0x0;
1020
+ mov.u32 %r243, 0x0;
1021
+ mov.u32 %r244, 0x0;
1022
+ mov.u32 %r245, 0x0;
1023
+ @%p113 ld.global.L1::evict_last.v4.b32 { %r242, %r243, %r244, %r245 }, [ %rd92 + 0 ];
1024
+ @!%p113 mov.u32 %r242, %r325;
1025
+ @!%p113 mov.u32 %r243, %r325;
1026
+ @!%p113 mov.u32 %r244, %r325;
1027
+ @!%p113 mov.u32 %r245, %r325;
1028
+ .loc 1 63 51
1029
+ mov.u32 %r250, 0x0;
1030
+ mov.u32 %r251, 0x0;
1031
+ mov.u32 %r252, 0x0;
1032
+ mov.u32 %r253, 0x0;
1033
+ @%p113 ld.global.L1::evict_first.v4.b32 { %r250, %r251, %r252, %r253 }, [ %rd93 + 0 ];
1034
+ @!%p113 mov.u32 %r250, %r325;
1035
+ @!%p113 mov.u32 %r251, %r325;
1036
+ @!%p113 mov.u32 %r252, %r325;
1037
+ @!%p113 mov.u32 %r253, %r325;
1038
+ cvt.u16.u32 %rs17, %r250;
1039
+ { .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r250; }
1040
+ cvt.u16.u32 %rs19, %r251;
1041
+ { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r251; }
1042
+ cvt.u16.u32 %rs21, %r252;
1043
+ { .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r252; }
1044
+ cvt.u16.u32 %rs23, %r253;
1045
+ { .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r253; }
1046
+ mov.u32 %r258, 0x0;
1047
+ mov.u32 %r259, 0x0;
1048
+ mov.u32 %r260, 0x0;
1049
+ mov.u32 %r261, 0x0;
1050
+ @%p113 ld.global.L1::evict_first.v4.b32 { %r258, %r259, %r260, %r261 }, [ %rd94 + 0 ];
1051
+ @!%p113 mov.u32 %r258, %r325;
1052
+ @!%p113 mov.u32 %r259, %r325;
1053
+ @!%p113 mov.u32 %r260, %r325;
1054
+ @!%p113 mov.u32 %r261, %r325;
1055
+ cvt.u16.u32 %rs25, %r258;
1056
+ { .reg .b16 tmp; mov.b32 {tmp, %rs26}, %r258; }
1057
+ cvt.u16.u32 %rs27, %r259;
1058
+ { .reg .b16 tmp; mov.b32 {tmp, %rs28}, %r259; }
1059
+ cvt.u16.u32 %rs29, %r260;
1060
+ { .reg .b16 tmp; mov.b32 {tmp, %rs30}, %r260; }
1061
+ cvt.u16.u32 %rs31, %r261;
1062
+ { .reg .b16 tmp; mov.b32 {tmp, %rs32}, %r261; }
1063
+ .loc 1 63 103
1064
+ cvt.f32.bf16 %r266, %rs17;
1065
+ mov.b32 %f43, %r266;
1066
+ cvt.f32.bf16 %r267, %rs18;
1067
+ mov.b32 %f44, %r267;
1068
+ cvt.f32.bf16 %r268, %rs19;
1069
+ mov.b32 %f45, %r268;
1070
+ cvt.f32.bf16 %r269, %rs20;
1071
+ mov.b32 %f46, %r269;
1072
+ cvt.f32.bf16 %r270, %rs21;
1073
+ mov.b32 %f47, %r270;
1074
+ cvt.f32.bf16 %r271, %rs22;
1075
+ mov.b32 %f48, %r271;
1076
+ cvt.f32.bf16 %r272, %rs23;
1077
+ mov.b32 %f49, %r272;
1078
+ cvt.f32.bf16 %r273, %rs24;
1079
+ mov.b32 %f50, %r273;
1080
+ cvt.f32.bf16 %r274, %rs25;
1081
+ mov.b32 %f51, %r274;
1082
+ cvt.f32.bf16 %r275, %rs26;
1083
+ mov.b32 %f52, %r275;
1084
+ cvt.f32.bf16 %r276, %rs27;
1085
+ mov.b32 %f53, %r276;
1086
+ cvt.f32.bf16 %r277, %rs28;
1087
+ mov.b32 %f54, %r277;
1088
+ cvt.f32.bf16 %r278, %rs29;
1089
+ mov.b32 %f55, %r278;
1090
+ cvt.f32.bf16 %r279, %rs30;
1091
+ mov.b32 %f56, %r279;
1092
+ cvt.f32.bf16 %r280, %rs31;
1093
+ mov.b32 %f57, %r280;
1094
+ cvt.f32.bf16 %r281, %rs32;
1095
+ mov.b32 %f58, %r281;
1096
+ .loc 1 64 35
1097
+ mul.wide.u32 %rd107, %r2, 4;
1098
+ add.s64 %rd95, %rd17, %rd107;
1099
+ .loc 1 64 40
1100
+ mov.u32 %r282, 0x0;
1101
+ @%p113 ld.global.L1::evict_last.b32 { %r282 }, [ %rd95 + 0 ];
1102
+ @!%p113 mov.u32 %r282, %r325;
1103
+ .loc 1 68 57
1104
+ @%p49 bra $L__BB0_4;
1105
+ mov.u64 %rd108, assertMessage_1;
1106
+ cvta.global.u64 %rd109, %rd108;
1107
+ mov.u64 %rd110, assertFile_1;
1108
+ cvta.global.u64 %rd111, %rd110;
1109
+ mov.u64 %rd112, assertFunc_1;
1110
+ cvta.global.u64 %rd113, %rd112;
1111
+ { // callseq 9, 0
1112
+ .reg .b32 temp_param_reg;
1113
+ .param .b64 param0;
1114
+ st.param.b64 [param0+0], %rd109;
1115
+ .param .b64 param1;
1116
+ st.param.b64 [param1+0], %rd111;
1117
+ .param .b32 param2;
1118
+ st.param.b32 [param2+0], %r438;
1119
+ .param .b64 param3;
1120
+ st.param.b64 [param3+0], %rd113;
1121
+ .param .b64 param4;
1122
+ st.param.b64 [param4+0], %rd123;
1123
+ call.uni
1124
+ __assertfail,
1125
+ (
1126
+ param0,
1127
+ param1,
1128
+ param2,
1129
+ param3,
1130
+ param4
1131
+ );
1132
+ } // callseq 9
1133
+ $L__BB0_4:
1134
+ $L__tmp24:
1135
+ .loc 2 120 46
1136
+ mov.b32 %f343, %r6;
1137
+ $L__tmp25:
1138
+ .loc 2 113 15
1139
+ add.f32 %f344, %f40, %f343;
1140
+ $L__tmp26:
1141
+ .loc 2 120 46
1142
+ mov.b32 %f345, %r5;
1143
+ $L__tmp27:
1144
+ .loc 2 108 21
1145
+ sub.f32 %f346, %f345, %f39;
1146
+ .loc 2 113 30
1147
+ mul.f32 %f347, %f346, %f346;
1148
+ .loc 2 113 38
1149
+ mul.f32 %f348, %f38, %f347;
1150
+ .loc 2 110 39
1151
+ setp.eq.f32 %p135, %f41, 0f00000000;
1152
+ .loc 2 110 49
1153
+ selp.f32 %f349, 0f00000000, %f42, %p135;
1154
+ .loc 2 113 22
1155
+ fma.rn.f32 %f350, %f349, %f348, %f344;
1156
+ $L__tmp28:
1157
+ .loc 2 120 46
1158
+ mov.b32 %f351, %r4;
1159
+ $L__tmp29:
1160
+ .loc 2 113 15
1161
+ add.f32 %f352, %f35, %f351;
1162
+ $L__tmp30:
1163
+ .loc 2 120 46
1164
+ mov.b32 %f353, %r3;
1165
+ $L__tmp31:
1166
+ .loc 2 108 21
1167
+ sub.f32 %f354, %f353, %f34;
1168
+ .loc 2 113 30
1169
+ mul.f32 %f355, %f354, %f354;
1170
+ .loc 2 113 38
1171
+ mul.f32 %f356, %f33, %f355;
1172
+ .loc 2 110 39
1173
+ setp.eq.f32 %p136, %f36, 0f00000000;
1174
+ .loc 2 110 49
1175
+ selp.f32 %f357, 0f00000000, %f37, %p136;
1176
+ .loc 2 113 22
1177
+ fma.rn.f32 %f358, %f357, %f356, %f352;
1178
+ $L__tmp32:
1179
+ .loc 1 69 54
1180
+ mov.u32 %r321, 0x0;
1181
+ mov.u32 %r322, 0x0;
1182
+ mov.u32 %r323, 0x0;
1183
+ mov.u32 %r324, 0x0;
1184
+ @%p113 ld.global.L1::evict_first.v4.b32 { %r321, %r322, %r323, %r324 }, [ %rd115 + 0 ];
1185
+ @!%p113 mov.u32 %r321, %r325;
1186
+ @!%p113 mov.u32 %r322, %r325;
1187
+ @!%p113 mov.u32 %r323, %r325;
1188
+ @!%p113 mov.u32 %r324, %r325;
1189
+ mov.u32 %r329, 0x0;
1190
+ mov.u32 %r330, 0x0;
1191
+ mov.u32 %r331, 0x0;
1192
+ mov.u32 %r332, 0x0;
1193
+ @%p113 ld.global.L1::evict_first.v4.b32 { %r329, %r330, %r331, %r332 }, [ %rd116 + 0 ];
1194
+ @!%p113 mov.u32 %r329, %r325;
1195
+ @!%p113 mov.u32 %r330, %r325;
1196
+ @!%p113 mov.u32 %r331, %r325;
1197
+ @!%p113 mov.u32 %r332, %r325;
1198
+ mov.u32 %r337, 0x0;
1199
+ mov.u32 %r338, 0x0;
1200
+ mov.u32 %r339, 0x0;
1201
+ mov.u32 %r340, 0x0;
1202
+ @%p113 ld.global.L1::evict_first.v4.b32 { %r337, %r338, %r339, %r340 }, [ %rd117 + 0 ];
1203
+ @!%p113 mov.u32 %r337, %r325;
1204
+ @!%p113 mov.u32 %r338, %r325;
1205
+ @!%p113 mov.u32 %r339, %r325;
1206
+ @!%p113 mov.u32 %r340, %r325;
1207
+ mov.u32 %r345, 0x0;
1208
+ mov.u32 %r346, 0x0;
1209
+ mov.u32 %r347, 0x0;
1210
+ mov.u32 %r348, 0x0;
1211
+ @%p113 ld.global.L1::evict_first.v4.b32 { %r345, %r346, %r347, %r348 }, [ %rd118 + 0 ];
1212
+ @!%p113 mov.u32 %r345, %r325;
1213
+ @!%p113 mov.u32 %r346, %r325;
1214
+ @!%p113 mov.u32 %r347, %r325;
1215
+ @!%p113 mov.u32 %r348, %r325;
1216
+ .loc 1 75 24
1217
+ mov.b32 %r354, %f358;
1218
+ mov.b32 %r355, 1132462080;
1219
+ div.full.f32 %r353, %r354, %r355;
1220
+ mov.b32 %f359, %r353;
1221
+ mov.b32 %r378, %f350;
1222
+ div.full.f32 %r377, %r378, %r355;
1223
+ mov.b32 %f360, %r377;
1224
+ .loc 1 77 24
1225
+ add.f32 %f361, %f359, 0f3727C5AC;
1226
+ add.f32 %f362, %f360, 0f3727C5AC;
1227
+ .loc 1 78 30
1228
+ rsqrt.approx.ftz.f32 %f363, %f361;
1229
+ rsqrt.approx.ftz.f32 %f364, %f362;
1230
+ .loc 1 69 54
1231
+ mov.b32 %f365, %r348;
1232
+ .loc 1 62 51
1233
+ mov.b32 %f366, %r245;
1234
+ .loc 1 70 24
1235
+ add.f32 %f367, %f366, %f365;
1236
+ .loc 1 72 24
1237
+ add.f32 %f368, %f58, %f367;
1238
+ $L__tmp33:
1239
+ .loc 2 112 17
1240
+ fma.rn.f32 %f369, %f346, %f349, %f39;
1241
+ $L__tmp34:
1242
+ .loc 1 73 24
1243
+ sub.f32 %f370, %f368, %f369;
1244
+ .loc 1 69 54
1245
+ mov.b32 %f371, %r347;
1246
+ .loc 1 62 51
1247
+ mov.b32 %f372, %r244;
1248
+ .loc 1 70 24
1249
+ add.f32 %f373, %f372, %f371;
1250
+ .loc 1 72 24
1251
+ add.f32 %f374, %f57, %f373;
1252
+ .loc 1 73 24
1253
+ sub.f32 %f375, %f374, %f369;
1254
+ .loc 1 69 54
1255
+ mov.b32 %f376, %r346;
1256
+ .loc 1 62 51
1257
+ mov.b32 %f377, %r243;
1258
+ .loc 1 70 24
1259
+ add.f32 %f378, %f377, %f376;
1260
+ .loc 1 72 24
1261
+ add.f32 %f379, %f56, %f378;
1262
+ .loc 1 73 24
1263
+ sub.f32 %f380, %f379, %f369;
1264
+ .loc 1 69 54
1265
+ mov.b32 %f381, %r345;
1266
+ .loc 1 62 51
1267
+ mov.b32 %f382, %r242;
1268
+ .loc 1 70 24
1269
+ add.f32 %f383, %f382, %f381;
1270
+ .loc 1 72 24
1271
+ add.f32 %f384, %f55, %f383;
1272
+ .loc 1 73 24
1273
+ sub.f32 %f385, %f384, %f369;
1274
+ .loc 1 69 54
1275
+ mov.b32 %f386, %r340;
1276
+ .loc 1 62 51
1277
+ mov.b32 %f387, %r237;
1278
+ .loc 1 70 24
1279
+ add.f32 %f388, %f387, %f386;
1280
+ .loc 1 72 24
1281
+ add.f32 %f389, %f54, %f388;
1282
+ .loc 1 73 24
1283
+ sub.f32 %f390, %f389, %f369;
1284
+ .loc 1 69 54
1285
+ mov.b32 %f391, %r339;
1286
+ .loc 1 62 51
1287
+ mov.b32 %f392, %r236;
1288
+ .loc 1 70 24
1289
+ add.f32 %f393, %f392, %f391;
1290
+ .loc 1 72 24
1291
+ add.f32 %f394, %f53, %f393;
1292
+ .loc 1 73 24
1293
+ sub.f32 %f395, %f394, %f369;
1294
+ .loc 1 69 54
1295
+ mov.b32 %f396, %r338;
1296
+ .loc 1 62 51
1297
+ mov.b32 %f397, %r235;
1298
+ .loc 1 70 24
1299
+ add.f32 %f398, %f397, %f396;
1300
+ .loc 1 72 24
1301
+ add.f32 %f399, %f52, %f398;
1302
+ .loc 1 73 24
1303
+ sub.f32 %f400, %f399, %f369;
1304
+ .loc 1 69 54
1305
+ mov.b32 %f401, %r337;
1306
+ .loc 1 62 51
1307
+ mov.b32 %f402, %r234;
1308
+ .loc 1 70 24
1309
+ add.f32 %f403, %f402, %f401;
1310
+ .loc 1 72 24
1311
+ add.f32 %f404, %f51, %f403;
1312
+ .loc 1 73 24
1313
+ sub.f32 %f405, %f404, %f369;
1314
+ .loc 1 69 54
1315
+ mov.b32 %f406, %r332;
1316
+ .loc 1 62 51
1317
+ mov.b32 %f407, %r229;
1318
+ .loc 1 70 24
1319
+ add.f32 %f408, %f407, %f406;
1320
+ .loc 1 72 24
1321
+ add.f32 %f409, %f50, %f408;
1322
+ $L__tmp35:
1323
+ .loc 2 112 17
1324
+ fma.rn.f32 %f410, %f354, %f357, %f34;
1325
+ $L__tmp36:
1326
+ .loc 1 73 24
1327
+ sub.f32 %f411, %f409, %f410;
1328
+ .loc 1 69 54
1329
+ mov.b32 %f412, %r331;
1330
+ .loc 1 62 51
1331
+ mov.b32 %f413, %r228;
1332
+ .loc 1 70 24
1333
+ add.f32 %f414, %f413, %f412;
1334
+ .loc 1 72 24
1335
+ add.f32 %f415, %f49, %f414;
1336
+ .loc 1 73 24
1337
+ sub.f32 %f416, %f415, %f410;
1338
+ .loc 1 69 54
1339
+ mov.b32 %f417, %r330;
1340
+ .loc 1 62 51
1341
+ mov.b32 %f418, %r227;
1342
+ .loc 1 70 24
1343
+ add.f32 %f419, %f418, %f417;
1344
+ .loc 1 72 24
1345
+ add.f32 %f420, %f48, %f419;
1346
+ .loc 1 73 24
1347
+ sub.f32 %f421, %f420, %f410;
1348
+ .loc 1 69 54
1349
+ mov.b32 %f422, %r329;
1350
+ .loc 1 62 51
1351
+ mov.b32 %f423, %r226;
1352
+ .loc 1 70 24
1353
+ add.f32 %f424, %f423, %f422;
1354
+ .loc 1 72 24
1355
+ add.f32 %f425, %f47, %f424;
1356
+ .loc 1 73 24
1357
+ sub.f32 %f426, %f425, %f410;
1358
+ .loc 1 69 54
1359
+ mov.b32 %f427, %r324;
1360
+ .loc 1 62 51
1361
+ mov.b32 %f428, %r221;
1362
+ .loc 1 70 24
1363
+ add.f32 %f429, %f428, %f427;
1364
+ .loc 1 72 24
1365
+ add.f32 %f430, %f46, %f429;
1366
+ .loc 1 73 24
1367
+ sub.f32 %f431, %f430, %f410;
1368
+ .loc 1 69 54
1369
+ mov.b32 %f432, %r323;
1370
+ .loc 1 62 51
1371
+ mov.b32 %f433, %r220;
1372
+ .loc 1 70 24
1373
+ add.f32 %f434, %f433, %f432;
1374
+ .loc 1 72 24
1375
+ add.f32 %f435, %f45, %f434;
1376
+ .loc 1 73 24
1377
+ sub.f32 %f436, %f435, %f410;
1378
+ .loc 1 69 54
1379
+ mov.b32 %f437, %r322;
1380
+ .loc 1 62 51
1381
+ mov.b32 %f438, %r219;
1382
+ .loc 1 70 24
1383
+ add.f32 %f439, %f438, %f437;
1384
+ .loc 1 72 24
1385
+ add.f32 %f440, %f44, %f439;
1386
+ .loc 1 73 24
1387
+ sub.f32 %f441, %f440, %f410;
1388
+ .loc 1 69 54
1389
+ mov.b32 %f442, %r321;
1390
+ .loc 1 62 51
1391
+ mov.b32 %f443, %r218;
1392
+ .loc 1 70 24
1393
+ add.f32 %f444, %f443, %f442;
1394
+ .loc 1 72 24
1395
+ add.f32 %f445, %f43, %f444;
1396
+ .loc 1 73 24
1397
+ sub.f32 %f446, %f445, %f410;
1398
+ .loc 1 79 24
1399
+ mul.f32 %f447, %f446, %f363;
1400
+ mul.f32 %f448, %f441, %f363;
1401
+ mul.f32 %f449, %f436, %f363;
1402
+ mul.f32 %f450, %f431, %f363;
1403
+ mul.f32 %f451, %f426, %f363;
1404
+ mul.f32 %f452, %f421, %f363;
1405
+ mul.f32 %f453, %f416, %f363;
1406
+ mul.f32 %f454, %f411, %f363;
1407
+ mul.f32 %f455, %f405, %f364;
1408
+ mul.f32 %f456, %f400, %f364;
1409
+ mul.f32 %f457, %f395, %f364;
1410
+ mul.f32 %f458, %f390, %f364;
1411
+ mul.f32 %f459, %f385, %f364;
1412
+ mul.f32 %f460, %f380, %f364;
1413
+ mul.f32 %f461, %f375, %f364;
1414
+ mul.f32 %f462, %f370, %f364;
1415
+ .loc 1 80 24
1416
+ shl.b32 %r425, %r2, 2;
1417
+ mov.u32 %r426, global_smem;
1418
+ add.s32 %r427, %r426, %r425;
1419
+ st.shared.u32 [%r427], %r282;
1420
+ bar.sync 0;
1421
+ shl.b32 %r428, %r1, 2;
1422
+ add.s32 %r429, %r426, %r428;
1423
+ ld.shared.v4.f32 {%f463, %f464, %f465, %f466}, [%r429];
1424
+ ld.shared.v4.f32 {%f467, %f468, %f469, %f470}, [%r429+16];
1425
+ mul.f32 %f471, %f447, %f463;
1426
+ mul.f32 %f472, %f448, %f464;
1427
+ mul.f32 %f473, %f449, %f465;
1428
+ mul.f32 %f474, %f450, %f466;
1429
+ mul.f32 %f475, %f451, %f467;
1430
+ mul.f32 %f476, %f452, %f468;
1431
+ mul.f32 %f477, %f453, %f469;
1432
+ mul.f32 %f478, %f454, %f470;
1433
+ mul.f32 %f479, %f455, %f463;
1434
+ mul.f32 %f480, %f456, %f464;
1435
+ mul.f32 %f481, %f457, %f465;
1436
+ mul.f32 %f482, %f458, %f466;
1437
+ mul.f32 %f483, %f459, %f467;
1438
+ mul.f32 %f484, %f460, %f468;
1439
+ mul.f32 %f485, %f461, %f469;
1440
+ mul.f32 %f486, %f462, %f470;
1441
+ .loc 1 82 29
1442
+ shl.b64 %rd121, %rd7, 1;
1443
+ add.s64 %rd119, %rd18, %rd121;
1444
+ shl.b64 %rd122, %rd9, 1;
1445
+ add.s64 %rd120, %rd18, %rd122;
1446
+ .loc 1 82 52
1447
+ mov.b32 %r401, %f471;
1448
+ cvt.rn.bf16.f32 %rs33, %r401;
1449
+ mov.b32 %r402, %f472;
1450
+ cvt.rn.bf16.f32 %rs34, %r402;
1451
+ mov.b32 %r403, %f473;
1452
+ cvt.rn.bf16.f32 %rs35, %r403;
1453
+ mov.b32 %r404, %f474;
1454
+ cvt.rn.bf16.f32 %rs36, %r404;
1455
+ mov.b32 %r405, %f475;
1456
+ cvt.rn.bf16.f32 %rs37, %r405;
1457
+ mov.b32 %r406, %f476;
1458
+ cvt.rn.bf16.f32 %rs38, %r406;
1459
+ mov.b32 %r407, %f477;
1460
+ cvt.rn.bf16.f32 %rs39, %r407;
1461
+ mov.b32 %r408, %f478;
1462
+ cvt.rn.bf16.f32 %rs40, %r408;
1463
+ mov.b32 %r409, %f479;
1464
+ cvt.rn.bf16.f32 %rs41, %r409;
1465
+ mov.b32 %r410, %f480;
1466
+ cvt.rn.bf16.f32 %rs42, %r410;
1467
+ mov.b32 %r411, %f481;
1468
+ cvt.rn.bf16.f32 %rs43, %r411;
1469
+ mov.b32 %r412, %f482;
1470
+ cvt.rn.bf16.f32 %rs44, %r412;
1471
+ mov.b32 %r413, %f483;
1472
+ cvt.rn.bf16.f32 %rs45, %r413;
1473
+ mov.b32 %r414, %f484;
1474
+ cvt.rn.bf16.f32 %rs46, %r414;
1475
+ mov.b32 %r415, %f485;
1476
+ cvt.rn.bf16.f32 %rs47, %r415;
1477
+ mov.b32 %r416, %f486;
1478
+ cvt.rn.bf16.f32 %rs48, %r416;
1479
+ mov.b32 %r430, {%rs33, %rs34};
1480
+ mov.b32 %r431, {%rs35, %rs36};
1481
+ mov.b32 %r432, {%rs37, %rs38};
1482
+ mov.b32 %r433, {%rs39, %rs40};
1483
+ @%p113 st.global.v4.b32 [ %rd119 + 0 ], { %r430, %r431, %r432, %r433 };
1484
+ mov.b32 %r434, {%rs41, %rs42};
1485
+ mov.b32 %r435, {%rs43, %rs44};
1486
+ mov.b32 %r436, {%rs45, %rs46};
1487
+ mov.b32 %r437, {%rs47, %rs48};
1488
+ @%p113 st.global.v4.b32 [ %rd120 + 0 ], { %r434, %r435, %r436, %r437 };
1489
+ .loc 1 58 4
1490
+ ret;
1491
+ $L__tmp37:
1492
+ $L__func_end0:
1493
+
1494
+ }
1495
+ // .globl __nv_rsqrtf -- device function: takes one f32 parameter and returns its approximate reciprocal square root
1496
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
1497
+ .param .b32 __nv_rsqrtf_param_0
1498
+ )
1499
+ {
1500
+ .reg .f32 %f<3>; // %f1 = input, %f2 = result (%f0 is declared by the <3> range but not used)
1501
+ $L__func_begin1:
1502
+
1503
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0]; // read the single argument
1504
+ rsqrt.approx.ftz.f32 %f2, %f1; // approximate 1/sqrt(x); .ftz is the flush-to-zero variant of the instruction
1505
+ st.param.f32 [func_retval0+0], %f2; // write the return value slot
1506
+ ret;
1507
+ $L__func_end1:
1508
+
1509
+ }
1510
+ .file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py"
1511
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
1512
+ .section .debug_abbrev
1513
+ {
1514
+ .b8 1
1515
+ .b8 17
1516
+ .b8 1
1517
+ .b8 37
1518
+ .b8 8
1519
+ .b8 19
1520
+ .b8 5
1521
+ .b8 3
1522
+ .b8 8
1523
+ .b8 16
1524
+ .b8 6
1525
+ .b8 27
1526
+ .b8 8
1527
+ .b8 180
1528
+ .b8 66
1529
+ .b8 12
1530
+ .b8 17
1531
+ .b8 1
1532
+ .b8 18
1533
+ .b8 1
1534
+ .b8 0
1535
+ .b8 0
1536
+ .b8 2
1537
+ .b8 46
1538
+ .b8 0
1539
+ .b8 135
1540
+ .b8 64
1541
+ .b8 8
1542
+ .b8 3
1543
+ .b8 8
1544
+ .b8 58
1545
+ .b8 11
1546
+ .b8 59
1547
+ .b8 11
1548
+ .b8 63
1549
+ .b8 12
1550
+ .b8 32
1551
+ .b8 11
1552
+ .b8 0
1553
+ .b8 0
1554
+ .b8 3
1555
+ .b8 46
1556
+ .b8 1
1557
+ .b8 17
1558
+ .b8 1
1559
+ .b8 18
1560
+ .b8 1
1561
+ .b8 64
1562
+ .b8 10
1563
+ .b8 49
1564
+ .b8 19
1565
+ .b8 0
1566
+ .b8 0
1567
+ .b8 4
1568
+ .b8 29
1569
+ .b8 0
1570
+ .b8 49
1571
+ .b8 19
1572
+ .b8 17
1573
+ .b8 1
1574
+ .b8 18
1575
+ .b8 1
1576
+ .b8 88
1577
+ .b8 11
1578
+ .b8 89
1579
+ .b8 11
1580
+ .b8 87
1581
+ .b8 11
1582
+ .b8 0
1583
+ .b8 0
1584
+ .b8 5
1585
+ .b8 29
1586
+ .b8 1
1587
+ .b8 49
1588
+ .b8 19
1589
+ .b8 17
1590
+ .b8 1
1591
+ .b8 18
1592
+ .b8 1
1593
+ .b8 88
1594
+ .b8 11
1595
+ .b8 89
1596
+ .b8 11
1597
+ .b8 87
1598
+ .b8 11
1599
+ .b8 0
1600
+ .b8 0
1601
+ .b8 0
1602
+ }
1603
+ .section .debug_info
1604
+ {
1605
+ .b32 302
1606
+ .b8 2
1607
+ .b8 0
1608
+ .b32 .debug_abbrev
1609
+ .b8 8
1610
+ .b8 1
1611
+ .b8 116
1612
+ .b8 114
1613
+ .b8 105
1614
+ .b8 116
1615
+ .b8 111
1616
+ .b8 110
1617
+ .b8 0
1618
+ .b8 2
1619
+ .b8 0
1620
+ .b8 99
1621
+ .b8 112
1622
+ .b8 110
1623
+ .b8 51
1624
+ .b8 108
1625
+ .b8 97
1626
+ .b8 119
1627
+ .b8 103
1628
+ .b8 54
1629
+ .b8 53
1630
+ .b8 108
1631
+ .b8 112
1632
+ .b8 105
1633
+ .b8 54
1634
+ .b8 51
1635
+ .b8 103
1636
+ .b8 118
1637
+ .b8 54
1638
+ .b8 99
1639
+ .b8 54
1640
+ .b8 112
1641
+ .b8 110
1642
+ .b8 52
1643
+ .b8 111
1644
+ .b8 105
1645
+ .b8 107
1646
+ .b8 104
1647
+ .b8 103
1648
+ .b8 54
1649
+ .b8 113
1650
+ .b8 118
1651
+ .b8 97
1652
+ .b8 50
1653
+ .b8 104
1654
+ .b8 50
1655
+ .b8 113
1656
+ .b8 106
1657
+ .b8 100
1658
+ .b8 112
1659
+ .b8 120
1660
+ .b8 101
1661
+ .b8 54
1662
+ .b8 113
1663
+ .b8 106
1664
+ .b8 52
1665
+ .b8 108
1666
+ .b8 118
1667
+ .b8 116
1668
+ .b8 116
1669
+ .b8 119
1670
+ .b8 101
1671
+ .b8 122
1672
+ .b8 46
1673
+ .b8 112
1674
+ .b8 121
1675
+ .b8 0
1676
+ .b32 .debug_line
1677
+ .b8 47
1678
+ .b8 116
1679
+ .b8 109
1680
+ .b8 112
1681
+ .b8 47
1682
+ .b8 116
1683
+ .b8 111
1684
+ .b8 114
1685
+ .b8 99
1686
+ .b8 104
1687
+ .b8 105
1688
+ .b8 110
1689
+ .b8 100
1690
+ .b8 117
1691
+ .b8 99
1692
+ .b8 116
1693
+ .b8 111
1694
+ .b8 114
1695
+ .b8 95
1696
+ .b8 114
1697
+ .b8 111
1698
+ .b8 111
1699
+ .b8 116
1700
+ .b8 47
1701
+ .b8 112
1702
+ .b8 110
1703
+ .b8 0
1704
+ .b8 1
1705
+ .b64 $L__func_begin0
1706
+ .b64 $L__func_end0
1707
+ .b8 2
1708
+ .b8 116
1709
+ .b8 114
1710
+ .b8 105
1711
+ .b8 116
1712
+ .b8 111
1713
+ .b8 110
1714
+ .b8 95
1715
+ .b8 95
1716
+ .b8 48
1717
+ .b8 100
1718
+ .b8 49
1719
+ .b8 100
1720
+ .b8 50
1721
+ .b8 100
1722
+ .b8 51
1723
+ .b8 100
1724
+ .b8 52
1725
+ .b8 100
1726
+ .b8 53
1727
+ .b8 100
1728
+ .b8 54
1729
+ .b8 100
1730
+ .b8 101
1731
+ .b8 55
1732
+ .b8 100
1733
+ .b8 101
1734
+ .b8 0
1735
+ .b8 116
1736
+ .b8 114
1737
+ .b8 105
1738
+ .b8 116
1739
+ .b8 111
1740
+ .b8 110
1741
+ .b8 95
1742
+ .b8 95
1743
+ .b8 48
1744
+ .b8 100
1745
+ .b8 49
1746
+ .b8 100
1747
+ .b8 50
1748
+ .b8 100
1749
+ .b8 51
1750
+ .b8 100
1751
+ .b8 52
1752
+ .b8 100
1753
+ .b8 53
1754
+ .b8 100
1755
+ .b8 54
1756
+ .b8 100
1757
+ .b8 101
1758
+ .b8 55
1759
+ .b8 100
1760
+ .b8 101
1761
+ .b8 0
1762
+ .b8 1
1763
+ .b8 18
1764
+ .b8 1
1765
+ .b8 1
1766
+ .b8 3
1767
+ .b64 $L__func_begin0
1768
+ .b64 $L__func_end0
1769
+ .b8 1
1770
+ .b8 156
1771
+ .b32 125
1772
+ .b8 4
1773
+ .b32 125
1774
+ .b64 $L__tmp1
1775
+ .b64 $L__tmp2
1776
+ .b8 2
1777
+ .b8 47
1778
+ .b8 41
1779
+ .b8 5
1780
+ .b32 125
1781
+ .b64 $L__tmp2
1782
+ .b64 $L__tmp36
1783
+ .b8 2
1784
+ .b8 53
1785
+ .b8 44
1786
+ .b8 4
1787
+ .b32 125
1788
+ .b64 $L__tmp2
1789
+ .b64 $L__tmp36
1790
+ .b8 2
1791
+ .b8 120
1792
+ .b8 46
1793
+ .b8 0
1794
+ .b8 4
1795
+ .b32 125
1796
+ .b64 $L__tmp3
1797
+ .b64 $L__tmp31
1798
+ .b8 2
1799
+ .b8 53
1800
+ .b8 44
1801
+ .b8 0
1802
+ .b8 0
1803
+ }
1804
+ .section .debug_pubnames
1805
+ {
1806
+ .b32 $L__pubNames_end0-$L__pubNames_start0
1807
+ $L__pubNames_start0:
1808
+ .b8 2
1809
+ .b8 0
1810
+ .b32 .debug_info
1811
+ .b32 306
1812
+ .b32 125
1813
+ .b8 116
1814
+ .b8 114
1815
+ .b8 105
1816
+ .b8 116
1817
+ .b8 111
1818
+ .b8 110
1819
+ .b8 95
1820
+ .b8 95
1821
+ .b8 48
1822
+ .b8 100
1823
+ .b8 49
1824
+ .b8 100
1825
+ .b8 50
1826
+ .b8 100
1827
+ .b8 51
1828
+ .b8 100
1829
+ .b8 52
1830
+ .b8 100
1831
+ .b8 53
1832
+ .b8 100
1833
+ .b8 54
1834
+ .b8 100
1835
+ .b8 101
1836
+ .b8 55
1837
+ .b8 100
1838
+ .b8 101
1839
+ .b8 0
1840
+ .b32 0
1841
+ $L__pubNames_end0:
1842
+ }
1843
+ .section .debug_pubtypes
1844
+ {
1845
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1846
+ $L__pubTypes_start0:
1847
+ .b8 2
1848
+ .b8 0
1849
+ .b32 .debug_info
1850
+ .b32 306
1851
+ .b32 0
1852
+ $L__pubTypes_end0:
1853
+ }
1854
+ .section .debug_loc { }
.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttgir ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> // layout of the 16x256 tiles: 8 elements per thread along dim 1
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> // layout of the 16x1 index copy that feeds the tt.assert ops
3
+ #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> // layout of the 1x256 load from %arg4
4
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
5
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // %arg0: i64 row indices; %arg1, %arg2, %arg4: f32 inputs; %arg3: bf16 input; %arg5: bf16 output; %arg6 and %arg7 are not referenced in the body. NOTE(review): the overall shape resembles a gather + add + layer-norm-style normalisation -- confirm against the generating Python source
6
+ %cst = arith.constant dense<512> : tensor<16x1xi32, #blocked>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked>
8
+ %cst_1 = arith.constant dense<256> : tensor<16x1xi32, #blocked>
9
+ %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked>
10
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked>
11
+ %cst_4 = arith.constant dense<256> : tensor<16x1xi64, #blocked>
12
+ %cst_5 = arith.constant dense<50257> : tensor<16x1xi64, #blocked>
13
+ %cst_6 = arith.constant dense<0> : tensor<16x1xi64, #blocked>
14
+ %cst_7 = arith.constant dense<0> : tensor<16x1xi64, #blocked1>
15
+ %cst_8 = arith.constant dense<50257> : tensor<16x1xi64, #blocked1>
16
+ %cst_9 = arith.constant 0.000000e+00 : f32
17
+ %cst_10 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked2>
18
+ %cst_11 = arith.constant dense<256> : tensor<1x256xi32, #blocked2>
19
+ %cst_12 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32, #blocked>
20
+ %cst_13 = arith.constant dense<2.560000e+02> : tensor<16x1xf32, #blocked>
21
+ %cst_14 = arith.constant dense<0.000000e+00> : tensor<16x256xf32, #blocked>
22
+ %cst_15 = arith.constant dense<0.000000e+00> : tensor<16x256xbf16, #blocked>
23
+ %c16_i32 = arith.constant 16 : i32
24
+ %0 = tt.get_program_id x : i32
25
+ %1 = arith.muli %0, %c16_i32 : i32 // first row of this program: program_id * 16
26
+ %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
27
+ %3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
28
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked>
29
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1>
30
+ %6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked>
31
+ %7 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked1>
32
+ %8 = arith.addi %6, %4 : tensor<16x1xi32, #blocked> // row index of each of the 16 tile rows
33
+ %9 = arith.addi %7, %5 : tensor<16x1xi32, #blocked1> // same row indices in the #blocked1 layout
34
+ %10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
35
+ %11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
36
+ %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked>
37
+ %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x256xi32, #blocked2>
38
+ %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked>
39
+ %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>, #blocked1>
40
+ %16 = tt.addptr %14, %8 : tensor<16x1x!tt.ptr<i64, 1>, #blocked>, tensor<16x1xi32, #blocked>
41
+ %17 = tt.addptr %15, %9 : tensor<16x1x!tt.ptr<i64, 1>, #blocked1>, tensor<16x1xi32, #blocked1>
42
+ %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked> // one i64 index per row, read from %arg0 + row
43
+ %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked1> // same addresses loaded again in #blocked1; this copy only feeds the bounds assert
44
+ %20 = arith.remsi %8, %cst : tensor<16x1xi32, #blocked> // row % 512
45
+ %21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked> // column mask: col < 256
46
+ %22 = arith.cmpi slt, %13, %cst_11 : tensor<1x256xi32, #blocked2> // same mask in the #blocked2 layout
47
+ %23 = arith.muli %20, %cst_1 : tensor<16x1xi32, #blocked>
48
+ %24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<16x256xi32, #blocked>
49
+ %25 = tt.broadcast %23 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked>
50
+ %26 = arith.addi %24, %25 : tensor<16x256xi32, #blocked>
51
+ %27 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>, #blocked>
52
+ %28 = tt.addptr %27, %26 : tensor<16x256x!tt.ptr<f32, 1>, #blocked>, tensor<16x256xi32, #blocked> // %arg2 + (row % 512) * 256 + col
53
+ %29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<16x256xi1, #blocked>
54
+ %30 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
55
+ %31 = arith.muli %8, %cst_1 : tensor<16x1xi32, #blocked>
56
+ %32 = tt.broadcast %31 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked>
57
+ %33 = arith.addi %24, %32 : tensor<16x256xi32, #blocked> // row * 256 + col; reused below for the output address
58
+ %34 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<16x256x!tt.ptr<bf16, 1>, #blocked>
59
+ %35 = tt.addptr %34, %33 : tensor<16x256x!tt.ptr<bf16, 1>, #blocked>, tensor<16x256xi32, #blocked> // %arg3 + row * 256 + col
60
+ %36 = tt.load %35, %29, %cst_15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xbf16, #blocked>
61
+ %37 = arith.extf %36 : tensor<16x256xbf16, #blocked> to tensor<16x256xf32, #blocked>
62
+ %38 = arith.addi %18, %cst_5 : tensor<16x1xi64, #blocked>
63
+ %39 = arith.addi %19, %cst_8 : tensor<16x1xi64, #blocked1>
64
+ %40 = arith.cmpi slt, %18, %cst_6 : tensor<16x1xi64, #blocked>
65
+ %41 = arith.cmpi slt, %19, %cst_7 : tensor<16x1xi64, #blocked1>
66
+ %42 = arith.select %40, %38, %18 : tensor<16x1xi1, #blocked>, tensor<16x1xi64, #blocked> // negative indices are wrapped by adding 50257
67
+ %43 = arith.select %41, %39, %19 : tensor<16x1xi1, #blocked1>, tensor<16x1xi64, #blocked1>
68
+ %44 = arith.cmpi sge, %43, %cst_7 : tensor<16x1xi64, #blocked1>
69
+ %45 = arith.cmpi slt, %43, %cst_8 : tensor<16x1xi64, #blocked1>
70
+ %46 = arith.andi %44, %45 : tensor<16x1xi1, #blocked1> // 0 <= wrapped index < 50257
71
+ tt.assert %46, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1>
72
+ %47 = arith.muli %42, %cst_4 : tensor<16x1xi64, #blocked>
73
+ %48 = tt.broadcast %47 : (tensor<16x1xi64, #blocked>) -> tensor<16x256xi64, #blocked>
74
+ %49 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked>
75
+ %50 = tt.broadcast %49 : (tensor<1x256xi64, #blocked>) -> tensor<16x256xi64, #blocked>
76
+ %51 = arith.addi %50, %48 : tensor<16x256xi64, #blocked>
77
+ %52 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>, #blocked>
78
+ %53 = tt.addptr %52, %51 : tensor<16x256x!tt.ptr<f32, 1>, #blocked>, tensor<16x256xi64, #blocked> // %arg1 + wrapped index * 256 + col (64-bit offset)
79
+ %54 = tt.load %53, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked>
80
+ %55 = arith.addf %54, %30 : tensor<16x256xf32, #blocked>
81
+ %56 = arith.addf %55, %37 : tensor<16x256xf32, #blocked> // x = %arg1 value + %arg2 value + %arg3 value
82
+ %57 = arith.addf %56, %cst_14 : tensor<16x256xf32, #blocked> // x + 0
83
+ %58 = arith.subf %56, %57 : tensor<16x256xf32, #blocked> // x - (x + 0)
84
+ %59 = arith.mulf %56, %58 : tensor<16x256xf32, #blocked>
85
+ %60 = arith.addf %59, %cst_14 : tensor<16x256xf32, #blocked> // NOTE(review): %57..%60 look like one running mean / sum-of-squares update step starting from zero -- confirm against the Python source
86
+ %61 = arith.select %29, %57, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked>
87
+ %62 = arith.select %29, %60, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked>
88
+ %63 = arith.select %21, %cst_2, %cst_3 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked> // per-element weight: 1.0 where the column mask is set, else 0.0
89
+ %64 = tt.broadcast %63 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked>
90
+ %65:3 = "tt.reduce"(%61, %62, %64) <{axis = 1 : i32}> ({ // reduces three values per element along axis 1, producing three results per row
91
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): // (%arg8, %arg9, %arg10) and (%arg11, %arg12, %arg13) are the two triples being combined
92
+ %90 = arith.subf %arg11, %arg8 : f32 // delta between the two first components
93
+ %91 = arith.addf %arg10, %arg13 : f32 // combined weight
94
+ %92 = arith.cmpf oeq, %91, %cst_9 : f32
95
+ %93 = arith.divf %arg13, %91 : f32
96
+ %94 = arith.select %92, %cst_9, %93 : f32 // ratio = %arg13 / combined weight, forced to 0 when the combined weight is 0
97
+ %95 = arith.mulf %90, %94 : f32
98
+ %96 = arith.addf %arg8, %95 : f32 // new first component = %arg8 + delta * ratio
99
+ %97 = arith.addf %arg9, %arg12 : f32
100
+ %98 = arith.mulf %90, %90 : f32
101
+ %99 = arith.mulf %98, %arg10 : f32
102
+ %100 = arith.mulf %99, %94 : f32
103
+ %101 = arith.addf %97, %100 : f32 // new second component = %arg9 + %arg12 + delta^2 * %arg10 * ratio
104
+ tt.reduce.return %96, %101, %91 : f32, f32, f32
105
+ }) : (tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
106
+ %66 = tt.expand_dims %65#0 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked> // first reduce result per row (subtracted from x below)
107
+ %67 = tt.expand_dims %65#1 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked> // second reduce result per row (divided by 256 below)
108
+ %68 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked> // second load from the same addresses as %30
109
+ %69 = tt.load %35, %29, %cst_15 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xbf16, #blocked> // second load from the same addresses as %36 (evict = 2 here)
110
+ %70 = arith.extf %69 : tensor<16x256xbf16, #blocked> to tensor<16x256xf32, #blocked>
111
+ %71 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>, #blocked2>
112
+ %72 = tt.addptr %71, %13 : tensor<1x256x!tt.ptr<f32, 1>, #blocked2>, tensor<1x256xi32, #blocked2>
113
+ %73 = tt.load %72, %22, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked2> // one f32 per column from %arg4 + col
114
+ tt.assert %46, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1>
115
+ %74 = tt.load %53, %29, %cst_14 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xf32, #blocked> // second load from the same addresses as %54 (evict = 2 here)
116
+ %75 = arith.addf %74, %68 : tensor<16x256xf32, #blocked>
117
+ %76 = arith.addf %75, %70 : tensor<16x256xf32, #blocked> // x recomputed from the second set of loads
118
+ %77 = tt.broadcast %66 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked>
119
+ %78 = arith.subf %76, %77 : tensor<16x256xf32, #blocked> // x - first reduce result
120
+ %79 = arith.divf %67, %cst_13 : tensor<16x1xf32, #blocked> // second reduce result / 256
121
+ %80 = arith.addf %79, %cst_12 : tensor<16x1xf32, #blocked> // + 9.99999974E-6
122
+ %81 = tt.extern_elementwise %80 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32, #blocked>) -> tensor<16x1xf32, #blocked> // per-row call to the libdevice symbol __nv_rsqrtf
123
+ %82 = tt.broadcast %81 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked>
124
+ %83 = arith.mulf %78, %82 : tensor<16x256xf32, #blocked>
125
+ %84 = triton_gpu.convert_layout %73 : (tensor<1x256xf32, #blocked2>) -> tensor<1x256xf32, #blocked> // move the %arg4 row from #blocked2 into #blocked before broadcasting
126
+ %85 = tt.broadcast %84 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked>
127
+ %86 = arith.mulf %83, %85 : tensor<16x256xf32, #blocked> // multiply by the per-column %arg4 value
128
+ %87 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<16x256x!tt.ptr<bf16, 1>, #blocked>
129
+ %88 = tt.addptr %87, %33 : tensor<16x256x!tt.ptr<bf16, 1>, #blocked>, tensor<16x256xi32, #blocked> // %arg5 + row * 256 + col
130
+ %89 = arith.truncf %86 : tensor<16x256xf32, #blocked> to tensor<16x256xbf16, #blocked>
131
+ tt.store %88, %89, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<16x256xbf16, #blocked> // masked bf16 store of the result
132
+ tt.return
133
+ }
134
+ }
.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttir ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // %arg0: i64 row indices; %arg1, %arg2, %arg4: f32 inputs; %arg3: bf16 input; %arg5: bf16 output; %arg6 and %arg7 are not referenced in the body. NOTE(review): the overall shape resembles a gather + add + layer-norm-style normalisation -- confirm against the generating Python source
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<16x256xbf16>
4
+ %cst_0 = arith.constant dense<1.000000e+00> : tensor<1x256xf32>
5
+ %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x256xf32>
6
+ %cst_2 = arith.constant 0.000000e+00 : f32
7
+ %cst_3 = arith.constant dense<256> : tensor<16x1xi64>
8
+ %cst_4 = arith.constant dense<50257> : tensor<16x1xi64>
9
+ %cst_5 = arith.constant dense<0> : tensor<16x1xi64>
10
+ %cst_6 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32>
11
+ %cst_7 = arith.constant dense<2.560000e+02> : tensor<16x1xf32>
12
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<16x256xf32>
13
+ %cst_9 = arith.constant dense<256> : tensor<16x1xi32>
14
+ %cst_10 = arith.constant dense<256> : tensor<1x256xi32>
15
+ %cst_11 = arith.constant dense<512> : tensor<16x1xi32>
16
+ %c16_i32 = arith.constant 16 : i32
17
+ %0 = tt.get_program_id x : i32
18
+ %1 = arith.muli %0, %c16_i32 : i32 // first row of this program: program_id * 16
19
+ %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32>
20
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32>
21
+ %4 = tt.splat %1 : (i32) -> tensor<16x1xi32>
22
+ %5 = arith.addi %4, %3 : tensor<16x1xi32> // row index of each of the 16 tile rows
23
+ %6 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
24
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32> // column index 0..255
25
+ %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<16x1x!tt.ptr<i64, 1>>
26
+ %9 = tt.addptr %8, %5 : tensor<16x1x!tt.ptr<i64, 1>>, tensor<16x1xi32>
27
+ %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64> // one i64 index per row, read from %arg0 + row
28
+ %11 = arith.remsi %5, %cst_11 : tensor<16x1xi32> // row % 512
29
+ %12 = arith.cmpi slt, %7, %cst_10 : tensor<1x256xi32> // column mask: col < 256
30
+ %13 = arith.muli %11, %cst_9 : tensor<16x1xi32>
31
+ %14 = tt.broadcast %7 : (tensor<1x256xi32>) -> tensor<16x256xi32>
32
+ %15 = tt.broadcast %13 : (tensor<16x1xi32>) -> tensor<16x256xi32>
33
+ %16 = arith.addi %14, %15 : tensor<16x256xi32>
34
+ %17 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>>
35
+ %18 = tt.addptr %17, %16 : tensor<16x256x!tt.ptr<f32, 1>>, tensor<16x256xi32> // %arg2 + (row % 512) * 256 + col
36
+ %19 = tt.broadcast %12 : (tensor<1x256xi1>) -> tensor<16x256xi1>
37
+ %20 = tt.load %18, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32>
38
+ %21 = arith.muli %5, %cst_9 : tensor<16x1xi32>
39
+ %22 = tt.broadcast %21 : (tensor<16x1xi32>) -> tensor<16x256xi32>
40
+ %23 = arith.addi %14, %22 : tensor<16x256xi32> // row * 256 + col; reused below for the output address
41
+ %24 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<16x256x!tt.ptr<bf16, 1>>
42
+ %25 = tt.addptr %24, %23 : tensor<16x256x!tt.ptr<bf16, 1>>, tensor<16x256xi32> // %arg3 + row * 256 + col
43
+ %26 = tt.load %25, %19, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xbf16>
44
+ %27 = arith.extf %26 : tensor<16x256xbf16> to tensor<16x256xf32>
45
+ %28 = arith.addi %10, %cst_4 : tensor<16x1xi64>
46
+ %29 = arith.cmpi slt, %10, %cst_5 : tensor<16x1xi64>
47
+ %30 = arith.select %29, %28, %10 : tensor<16x1xi1>, tensor<16x1xi64> // negative indices are wrapped by adding 50257
48
+ %31 = arith.cmpi sge, %30, %cst_5 : tensor<16x1xi64>
49
+ %32 = arith.cmpi slt, %30, %cst_4 : tensor<16x1xi64>
50
+ %33 = arith.andi %31, %32 : tensor<16x1xi1> // 0 <= wrapped index < 50257
51
+ tt.assert %33, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1>
52
+ %34 = arith.muli %30, %cst_3 : tensor<16x1xi64>
53
+ %35 = tt.broadcast %34 : (tensor<16x1xi64>) -> tensor<16x256xi64>
54
+ %36 = arith.extsi %7 : tensor<1x256xi32> to tensor<1x256xi64>
55
+ %37 = tt.broadcast %36 : (tensor<1x256xi64>) -> tensor<16x256xi64>
56
+ %38 = arith.addi %37, %35 : tensor<16x256xi64>
57
+ %39 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<16x256x!tt.ptr<f32, 1>>
58
+ %40 = tt.addptr %39, %38 : tensor<16x256x!tt.ptr<f32, 1>>, tensor<16x256xi64> // %arg1 + wrapped index * 256 + col (64-bit offset)
59
+ %41 = tt.load %40, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32>
60
+ %42 = arith.addf %41, %20 : tensor<16x256xf32>
61
+ %43 = arith.addf %42, %27 : tensor<16x256xf32> // x = %arg1 value + %arg2 value + %arg3 value
62
+ %44 = arith.addf %43, %cst_8 : tensor<16x256xf32> // x + 0
63
+ %45 = arith.subf %43, %44 : tensor<16x256xf32> // x - (x + 0)
64
+ %46 = arith.mulf %43, %45 : tensor<16x256xf32>
65
+ %47 = arith.addf %46, %cst_8 : tensor<16x256xf32> // NOTE(review): %44..%47 look like one running mean / sum-of-squares update step starting from zero -- confirm against the Python source
66
+ %48 = arith.select %19, %44, %cst_8 : tensor<16x256xi1>, tensor<16x256xf32>
67
+ %49 = arith.select %19, %47, %cst_8 : tensor<16x256xi1>, tensor<16x256xf32>
68
+ %50 = arith.select %12, %cst_0, %cst_1 : tensor<1x256xi1>, tensor<1x256xf32> // per-element weight: 1.0 where the column mask is set, else 0.0
69
+ %51 = tt.broadcast %50 : (tensor<1x256xf32>) -> tensor<16x256xf32>
70
+ %52:3 = "tt.reduce"(%48, %49, %51) <{axis = 1 : i32}> ({ // reduces three values per element along axis 1, producing three results per row
71
+ ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): // (%arg8, %arg9, %arg10) and (%arg11, %arg12, %arg13) are the two triples being combined
72
+ %76 = arith.subf %arg11, %arg8 : f32 // delta between the two first components
73
+ %77 = arith.addf %arg10, %arg13 : f32 // combined weight
74
+ %78 = arith.cmpf oeq, %77, %cst_2 : f32
75
+ %79 = arith.divf %arg13, %77 : f32
76
+ %80 = arith.select %78, %cst_2, %79 : f32 // ratio = %arg13 / combined weight, forced to 0 when the combined weight is 0
77
+ %81 = arith.mulf %76, %80 : f32
78
+ %82 = arith.addf %arg8, %81 : f32 // new first component = %arg8 + delta * ratio
79
+ %83 = arith.addf %arg9, %arg12 : f32
80
+ %84 = arith.mulf %76, %76 : f32
81
+ %85 = arith.mulf %84, %arg10 : f32
82
+ %86 = arith.mulf %85, %80 : f32
83
+ %87 = arith.addf %83, %86 : f32 // new second component = %arg9 + %arg12 + delta^2 * %arg10 * ratio
84
+ tt.reduce.return %82, %87, %77 : f32, f32, f32
85
+ }) : (tensor<16x256xf32>, tensor<16x256xf32>, tensor<16x256xf32>) -> (tensor<16xf32>, tensor<16xf32>, tensor<16xf32>)
86
+ %53 = tt.expand_dims %52#0 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32> // first reduce result per row (subtracted from x below)
87
+ %54 = tt.expand_dims %52#1 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32> // second reduce result per row (divided by 256 below)
88
+ %55 = tt.load %18, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32> // second load from the same addresses as %20
89
+ %56 = tt.load %25, %19, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xbf16> // second load from the same addresses as %26 (evict = 2 here)
90
+ %57 = arith.extf %56 : tensor<16x256xbf16> to tensor<16x256xf32>
91
+ %58 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x256x!tt.ptr<f32, 1>>
92
+ %59 = tt.addptr %58, %7 : tensor<1x256x!tt.ptr<f32, 1>>, tensor<1x256xi32>
93
+ %60 = tt.load %59, %12, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32> // one f32 per column from %arg4 + col
94
+ tt.assert %33, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<16x1xi1>
95
+ %61 = tt.load %40, %19, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xf32> // second load from the same addresses as %41 (evict = 2 here)
96
+ %62 = arith.addf %61, %55 : tensor<16x256xf32>
97
+ %63 = arith.addf %62, %57 : tensor<16x256xf32> // x recomputed from the second set of loads
98
+ %64 = tt.broadcast %53 : (tensor<16x1xf32>) -> tensor<16x256xf32>
99
+ %65 = arith.subf %63, %64 : tensor<16x256xf32> // x - first reduce result
100
+ %66 = arith.divf %54, %cst_7 : tensor<16x1xf32> // second reduce result / 256
101
+ %67 = arith.addf %66, %cst_6 : tensor<16x1xf32> // + 9.99999974E-6
102
+ %68 = tt.extern_elementwise %67 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32>) -> tensor<16x1xf32> // per-row call to the libdevice symbol __nv_rsqrtf
103
+ %69 = tt.broadcast %68 : (tensor<16x1xf32>) -> tensor<16x256xf32>
104
+ %70 = arith.mulf %65, %69 : tensor<16x256xf32>
105
+ %71 = tt.broadcast %60 : (tensor<1x256xf32>) -> tensor<16x256xf32>
106
+ %72 = arith.mulf %70, %71 : tensor<16x256xf32> // multiply by the per-column %arg4 value
107
+ %73 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<16x256x!tt.ptr<bf16, 1>>
108
+ %74 = tt.addptr %73, %23 : tensor<16x256x!tt.ptr<bf16, 1>>, tensor<16x256xi32> // %arg5 + row * 256 + col
109
+ %75 = arith.truncf %72 : tensor<16x256xf32> to tensor<16x256xbf16>
110
+ tt.store %74, %75, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<16x256xbf16> // masked bf16 store of the result
111
+ tt.return
112
+ }
113
+ }
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.llir ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 { ; Kernel: each thread loads 4 floats from %0 (masked), the 4 values are summed across the whole thread block, and the 4 totals are atomically added into %2 at rows chosen by i64 indices read from %1. %3 and %4 are not referenced in this body.
7
+ %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 ; %6 = tid.x
8
+ %7 = and i32 %6, 31, !dbg !8 ; %7 = tid.x & 31 (position inside the 32-lane shuffle group)
9
+ %8 = lshr i32 %6, 5, !dbg !8
10
+ %9 = and i32 %6, 3, !dbg !8 ; %9 = tid.x & 3, selects one of the 4 output values
11
+ %10 = and i32 %8, 3, !dbg !9 ; %10 = (tid.x >> 5) & 3, i.e. which group of 32 threads (0..3)
12
+ %urem = and i32 %6, 127, !dbg !9 ; reduction index 0..127 (one per thread; maxntidx is 128, see !4)
13
+ %11 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10 ; block index
14
+ %12 = shl i32 %11, 2, !dbg !11 ; first output index of this block = ctaid.x * 4
15
+ %13 = or i32 %12, %9, !dbg !12 ; output index handled by this thread at the end = ctaid.x*4 + (tid.x & 3)
16
+ %14 = icmp ult i32 %urem, 120, !dbg !13 ; load mask: only reduction indices < 120 are valid
17
+ %15 = shl nuw nsw i32 %urem, 17, !dbg !14 ; reduction index * 131072 (element stride)
18
+ %16 = add i32 %12, %15, !dbg !15
19
+ %17 = sext i32 %16 to i64, !dbg !16
20
+ %18 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !16 ; &%0[ctaid.x*4 + r*131072]
21
+ %19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %18, i1 %14, i32 0, i1 %14, i32 0, i1 %14, i32 0, i1 %14, i32 0, i1 %14) #3, !dbg !17 ; predicated 4 x 32-bit vector load; all four results are 0 when %14 is false
22
+ %20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !17
23
+ %21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !17
24
+ %22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !17
25
+ %23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !17
26
+ %24 = bitcast i32 %20 to float, !dbg !17
27
+ %25 = bitcast i32 %21 to float, !dbg !17
28
+ %26 = bitcast i32 %22 to float, !dbg !17
29
+ %27 = bitcast i32 %23 to float, !dbg !17
30
+ %28 = fadd float %24, 0.000000e+00, !dbg !18 ; each loaded value + 0.0
31
+ %29 = fadd float %25, 0.000000e+00, !dbg !18
32
+ %30 = fadd float %26, 0.000000e+00, !dbg !18
33
+ %31 = fadd float %27, 0.000000e+00, !dbg !18
34
+ %32 = select i1 %14, float %28, float 0.000000e+00, !dbg !19 ; masked-off threads contribute 0.0 to the sums
35
+ %33 = select i1 %14, float %29, float 0.000000e+00, !dbg !19
36
+ %34 = select i1 %14, float %30, float 0.000000e+00, !dbg !19
37
+ %35 = select i1 %14, float %31, float 0.000000e+00, !dbg !19
38
+ %36 = bitcast float %32 to i32, !dbg !20 ; --- value 0: butterfly sum over 32 lanes (xor distances 16, 8, 4, 2, 1) ---
39
+ %37 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 16, i32 31), !dbg !20
40
+ %38 = bitcast i32 %37 to float, !dbg !20
41
+ %39 = fadd float %32, %38, !dbg !24
42
+ %40 = bitcast float %39 to i32, !dbg !20
43
+ %41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 8, i32 31), !dbg !20
44
+ %42 = bitcast i32 %41 to float, !dbg !20
45
+ %43 = fadd float %39, %42, !dbg !24
46
+ %44 = bitcast float %43 to i32, !dbg !20
47
+ %45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 4, i32 31), !dbg !20
48
+ %46 = bitcast i32 %45 to float, !dbg !20
49
+ %47 = fadd float %43, %46, !dbg !24
50
+ %48 = bitcast float %47 to i32, !dbg !20
51
+ %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 2, i32 31), !dbg !20
52
+ %50 = bitcast i32 %49 to float, !dbg !20
53
+ %51 = fadd float %47, %50, !dbg !24
54
+ %52 = bitcast float %51 to i32, !dbg !20
55
+ %53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 1, i32 31), !dbg !20
56
+ %54 = bitcast i32 %53 to float, !dbg !20
57
+ %55 = fadd float %51, %54, !dbg !24 ; %55 = 32-lane partial sum of value 0
58
+ %56 = bitcast float %33 to i32, !dbg !20 ; --- value 1: same butterfly sum ---
59
+ %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 16, i32 31), !dbg !20
60
+ %58 = bitcast i32 %57 to float, !dbg !20
61
+ %59 = fadd float %33, %58, !dbg !24
62
+ %60 = bitcast float %59 to i32, !dbg !20
63
+ %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 8, i32 31), !dbg !20
64
+ %62 = bitcast i32 %61 to float, !dbg !20
65
+ %63 = fadd float %59, %62, !dbg !24
66
+ %64 = bitcast float %63 to i32, !dbg !20
67
+ %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 4, i32 31), !dbg !20
68
+ %66 = bitcast i32 %65 to float, !dbg !20
69
+ %67 = fadd float %63, %66, !dbg !24
70
+ %68 = bitcast float %67 to i32, !dbg !20
71
+ %69 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %68, i32 2, i32 31), !dbg !20
72
+ %70 = bitcast i32 %69 to float, !dbg !20
73
+ %71 = fadd float %67, %70, !dbg !24
74
+ %72 = bitcast float %71 to i32, !dbg !20
75
+ %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !20
76
+ %74 = bitcast i32 %73 to float, !dbg !20
77
+ %75 = fadd float %71, %74, !dbg !24 ; %75 = 32-lane partial sum of value 1
78
+ %76 = bitcast float %34 to i32, !dbg !20 ; --- value 2: same butterfly sum ---
79
+ %77 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %76, i32 16, i32 31), !dbg !20
80
+ %78 = bitcast i32 %77 to float, !dbg !20
81
+ %79 = fadd float %34, %78, !dbg !24
82
+ %80 = bitcast float %79 to i32, !dbg !20
83
+ %81 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %80, i32 8, i32 31), !dbg !20
84
+ %82 = bitcast i32 %81 to float, !dbg !20
85
+ %83 = fadd float %79, %82, !dbg !24
86
+ %84 = bitcast float %83 to i32, !dbg !20
87
+ %85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 4, i32 31), !dbg !20
88
+ %86 = bitcast i32 %85 to float, !dbg !20
89
+ %87 = fadd float %83, %86, !dbg !24
90
+ %88 = bitcast float %87 to i32, !dbg !20
91
+ %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 2, i32 31), !dbg !20
92
+ %90 = bitcast i32 %89 to float, !dbg !20
93
+ %91 = fadd float %87, %90, !dbg !24
94
+ %92 = bitcast float %91 to i32, !dbg !20
95
+ %93 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 1, i32 31), !dbg !20
96
+ %94 = bitcast i32 %93 to float, !dbg !20
97
+ %95 = fadd float %91, %94, !dbg !24 ; %95 = 32-lane partial sum of value 2
98
+ %96 = bitcast float %35 to i32, !dbg !20 ; --- value 3: same butterfly sum ---
99
+ %97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 16, i32 31), !dbg !20
100
+ %98 = bitcast i32 %97 to float, !dbg !20
101
+ %99 = fadd float %35, %98, !dbg !24
102
+ %100 = bitcast float %99 to i32, !dbg !20
103
+ %101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 8, i32 31), !dbg !20
104
+ %102 = bitcast i32 %101 to float, !dbg !20
105
+ %103 = fadd float %99, %102, !dbg !24
106
+ %104 = bitcast float %103 to i32, !dbg !20
107
+ %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 4, i32 31), !dbg !20
108
+ %106 = bitcast i32 %105 to float, !dbg !20
109
+ %107 = fadd float %103, %106, !dbg !24
110
+ %108 = bitcast float %107 to i32, !dbg !20
111
+ %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 2, i32 31), !dbg !20
112
+ %110 = bitcast i32 %109 to float, !dbg !20
113
+ %111 = fadd float %107, %110, !dbg !24
114
+ %112 = bitcast float %111 to i32, !dbg !20
115
+ %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !20
116
+ %114 = bitcast i32 %113 to float, !dbg !20
117
+ %115 = fadd float %111, %114, !dbg !24 ; %115 = 32-lane partial sum of value 3
118
+ %116 = icmp eq i32 %7, 0, !dbg !20 ; only lane 0 of each 32-thread group publishes its partials
119
+ %117 = zext nneg i32 %10 to i64, !dbg !20
120
+ %118 = getelementptr float, ptr addrspace(3) @global_smem, i64 %117, !dbg !20 ; smem float slot = group index (value 0)
121
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %118, float %55, i1 %116) #3, !dbg !20
122
+ %119 = or i32 %10, 4, !dbg !20 ; value 1 goes to float slots 4..7
123
+ %120 = zext nneg i32 %119 to i64, !dbg !20
124
+ %121 = getelementptr float, ptr addrspace(3) @global_smem, i64 %120, !dbg !20
125
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %121, float %75, i1 %116) #3, !dbg !20
126
+ %122 = or i32 %10, 8, !dbg !20 ; value 2 goes to float slots 8..11
127
+ %123 = zext nneg i32 %122 to i64, !dbg !20
128
+ %124 = getelementptr float, ptr addrspace(3) @global_smem, i64 %123, !dbg !20
129
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %124, float %95, i1 %116) #3, !dbg !20
130
+ %125 = or i32 %10, 12, !dbg !20 ; value 3 goes to float slots 12..15
131
+ %126 = zext nneg i32 %125 to i64, !dbg !20
132
+ %127 = getelementptr float, ptr addrspace(3) @global_smem, i64 %126, !dbg !20
133
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %127, float %115, i1 %116) #3, !dbg !20
134
+ tail call void @llvm.nvvm.barrier0(), !dbg !20 ; all 16 partials are in shared memory after this barrier
135
+ %128 = icmp slt i32 %6, 16, !dbg !20 ; second stage: threads 0..15 each pick up one partial
136
+ %129 = sext i32 %6 to i64, !dbg !20
137
+ %130 = getelementptr float, ptr addrspace(3) @global_smem, i64 %129, !dbg !20
138
+ %131 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %130, i1 %128) #3, !dbg !20 ; the asm does not write $0 when %128 is false; such results are never stored (store below is gated by %141)
139
+ %132 = bitcast float %131 to i32, !dbg !20
140
+ %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 2, i32 31), !dbg !20 ; xor distances 2 and 1: sum each run of 4 consecutive slots (the 4 groups' partials of one value)
141
+ %134 = bitcast i32 %133 to float, !dbg !20
142
+ %135 = fadd float %131, %134, !dbg !24
143
+ %136 = bitcast float %135 to i32, !dbg !20
144
+ %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 1, i32 31), !dbg !20
145
+ %138 = bitcast i32 %137 to float, !dbg !20
146
+ %139 = fadd float %135, %138, !dbg !24
147
+ %140 = icmp eq i32 %9, 0, !dbg !20
148
+ %141 = and i1 %128, %140, !dbg !20 ; threads 0, 4, 8, 12 write the block-wide totals back to their own slot
149
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %139, i1 %141) #3, !dbg !20
150
+ tail call void @llvm.nvvm.barrier0(), !dbg !20
151
+ %142 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !20 ; totals live at byte offsets 0, 16, 32, 48 (float slots 0, 4, 8, 12)
152
+ %143 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), align 4, !dbg !20
153
+ %144 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), align 4, !dbg !20
154
+ %145 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 48), align 4, !dbg !20
155
+ tail call void @llvm.nvvm.barrier0(), !dbg !28 ; barrier before the same shared buffer is overwritten
156
+ %146 = insertelement <1 x float> undef, float %142, i64 0, !dbg !28 ; repack the 4 totals contiguously at byte offsets 0, 4, 8, 12
157
+ store <1 x float> %146, ptr addrspace(3) @global_smem, align 4, !dbg !28
158
+ %147 = insertelement <1 x float> undef, float %143, i64 0, !dbg !28
159
+ store <1 x float> %147, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 4), align 4, !dbg !28
160
+ %148 = insertelement <1 x float> undef, float %144, i64 0, !dbg !28
161
+ store <1 x float> %148, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), align 4, !dbg !28
162
+ %149 = insertelement <1 x float> undef, float %145, i64 0, !dbg !28
163
+ store <1 x float> %149, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 12), align 4, !dbg !28
164
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
165
+ %150 = zext nneg i32 %9 to i64, !dbg !28
166
+ %151 = getelementptr float, ptr addrspace(3) @global_smem, i64 %150, !dbg !28
167
+ %152 = load <1 x float>, ptr addrspace(3) %151, align 4, !dbg !28 ; each thread reads the total for value (tid.x & 3)
168
+ %.frozen = freeze i32 %13
169
+ %153 = sdiv i32 %.frozen, 256, !dbg !29 ; quotient of the output index by 256
170
+ %154 = mul i32 %153, 256
171
+ %.decomposed = sub i32 %.frozen, %154 ; remainder of the output index by 256
172
+ %155 = sext i32 %153 to i64, !dbg !30
173
+ %156 = getelementptr i64, ptr addrspace(1) %1, i64 %155, !dbg !30
174
+ %157 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %156, i1 true) #3, !dbg !31 ; unconditional (predicate is constant true) i64 index load from %1[quotient]
175
+ %158 = lshr i64 %157, 54, !dbg !32 ; moves the sign bit (bit 63) down to bit 9
176
+ %159 = and i64 %158, 512, !dbg !32 ; 512 if the loaded index is negative, else 0
177
+ %160 = add i64 %159, %157, !dbg !32 ; negative indices are shifted up by 512
178
+ %161 = shl i64 %160, 8, !dbg !33 ; row offset = index * 256 elements
179
+ %162 = sext i32 %.decomposed to i64, !dbg !34
180
+ %163 = getelementptr float, ptr addrspace(1) %2, i64 %161, !dbg !35
181
+ %164 = getelementptr float, ptr addrspace(1) %163, i64 %162, !dbg !35 ; &%2[index*256 + remainder]
182
+ %165 = lshr i32 %7, 2, !dbg !36
183
+ %166 = shl nuw nsw i32 %10, 3, !dbg !36
184
+ %167 = or i32 %166, %165, !dbg !36
185
+ %168 = icmp eq i32 %167, 0, !dbg !36 ; true only for tid.x 0..3, so each of the 4 totals is added exactly once per block
186
+ %169 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %164, <1 x float> %152, i1 %168) #3, !dbg !36 ; predicated f32 atomic add; the returned old value (%169) is unused
187
+ ret void, !dbg !37
188
+ }
189
+
190
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
191
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 ; reads the %tid.x special register (appears as `mov.u32 %r24, %tid.x;` in the PTX dump of this kernel)
192
+
193
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
194
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 ; butterfly lane shuffle; every call site in this module passes (-1, value, xor distance, 31), emitted in PTX as `shfl.sync.bfly.b32 d, value, distance, 31, -1;`
195
+
196
+ ; Function Attrs: convergent nocallback nounwind
197
+ declare void @llvm.nvvm.barrier0() #2 ; block-wide barrier (appears as `bar.sync 0;` in the PTX dump of this kernel)
198
+
199
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
200
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
201
+ attributes #2 = { convergent nocallback nounwind }
202
+ attributes #3 = { nounwind }
203
+
204
+ !llvm.module.flags = !{!0}
205
+ !llvm.dbg.cu = !{!1}
206
+ !nvvm.annotations = !{!3, !4, !4, !3}
207
+
208
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
209
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
210
+ !2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
211
+ !3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
212
+ !4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 128}
213
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
214
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
215
+ !7 = !{}
216
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
217
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
218
+ !10 = !DILocation(line: 21, column: 28, scope: !5)
219
+ !11 = !DILocation(line: 21, column: 33, scope: !5)
220
+ !12 = !DILocation(line: 22, column: 23, scope: !5)
221
+ !13 = !DILocation(line: 29, column: 25, scope: !5)
222
+ !14 = !DILocation(line: 31, column: 47, scope: !5)
223
+ !15 = !DILocation(line: 31, column: 40, scope: !5)
224
+ !16 = !DILocation(line: 31, column: 34, scope: !5)
225
+ !17 = !DILocation(line: 31, column: 53, scope: !5)
226
+ !18 = !DILocation(line: 33, column: 23, scope: !5)
227
+ !19 = !DILocation(line: 34, column: 38, scope: !5)
228
+ !20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23)
229
+ !21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0)
230
+ !22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
231
+ !23 = !DILocation(line: 35, column: 25, scope: !21)
232
+ !24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
233
+ !25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0)
234
+ !26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
235
+ !27 = !DILocation(line: 35, column: 25, scope: !25)
236
+ !28 = !DILocation(line: 35, column: 28, scope: !5)
237
+ !29 = !DILocation(line: 36, column: 20, scope: !5)
238
+ !30 = !DILocation(line: 38, column: 30, scope: !5)
239
+ !31 = !DILocation(line: 38, column: 35, scope: !5)
240
+ !32 = !DILocation(line: 41, column: 32, scope: !5)
241
+ !33 = !DILocation(line: 45, column: 40, scope: !5)
242
+ !34 = !DILocation(line: 45, column: 36, scope: !5)
243
+ !35 = !DILocation(line: 45, column: 30, scope: !5)
244
+ !36 = !DILocation(line: 45, column: 55, scope: !5)
245
+ !37 = !DILocation(line: 45, column: 4, scope: !5)
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ptx ADDED
@@ -0,0 +1,651 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3de4e
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3de4e( // Kernel: masked load of 4 floats per thread, block-wide sum of each of the 4 values, then predicated atomic add of the totals into param_2 at rows selected by 64-bit indices read from param_1
13
+ .param .u64 triton__0d1d2d3de4e_param_0, // input float buffer (loaded with ld.global ... b32 and summed as f32)
14
+ .param .u64 triton__0d1d2d3de4e_param_1, // buffer of 64-bit indices
15
+ .param .u64 triton__0d1d2d3de4e_param_2, // float buffer updated by atom.global...add.f32
16
+ .param .u32 triton__0d1d2d3de4e_param_3, // never loaded in this body
17
+ .param .u32 triton__0d1d2d3de4e_param_4 // never loaded in this body
18
+ )
19
+ .maxntid 128, 1, 1
20
+ {
21
+ .reg .pred %p<15>;
22
+ .reg .b32 %r<91>;
23
+ .reg .f32 %f<62>;
24
+ .reg .b64 %rd<16>;
25
+ .loc 1 18 0
26
+ $L__func_begin0:
27
+ .loc 1 18 0
28
+
29
+ ld.param.u64 %rd5, [triton__0d1d2d3de4e_param_0];
30
+ ld.param.u64 %rd6, [triton__0d1d2d3de4e_param_1];
31
+ $L__tmp0:
32
+ .loc 1 22 44
33
+ mov.u32 %r24, %tid.x; // %r24 = tid.x
34
+ and.b32 %r25, %r24, 31; // %r25 = tid.x & 31 (position inside the 32-lane shuffle group)
35
+ ld.param.u64 %rd7, [triton__0d1d2d3de4e_param_2];
36
+ and.b32 %r26, %r24, 3; // %r26 = tid.x & 3, selects one of the 4 output values
37
+ .loc 1 24 33
38
+ bfe.u32 %r27, %r24, 5, 2; // %r27 = (tid.x >> 5) & 3, which group of 32 threads
39
+ and.b32 %r28, %r24, 127; // reduction index 0..127
40
+ .loc 1 21 28
41
+ mov.u32 %r1, %ctaid.x;
42
+ .loc 1 21 33
43
+ shl.b32 %r29, %r1, 2; // first output index of this block = ctaid.x * 4
44
+ .loc 1 22 23
45
+ or.b32 %r30, %r29, %r26; // output index used for the final atomic add
46
+ .loc 1 29 25
47
+ setp.lt.u32 %p1, %r28, 120; // load mask: reduction index < 120
48
+ .loc 1 31 47
49
+ shl.b32 %r31, %r28, 17; // reduction index * 131072
50
+ .loc 1 31 40
51
+ add.s32 %r32, %r29, %r31;
52
+ .loc 1 31 34
53
+ mul.wide.s32 %rd8, %r32, 4; // element offset -> byte offset (4-byte elements)
54
+ add.s64 %rd1, %rd5, %rd8;
55
+ mov.b32 %r6, 0; // fallback value for masked-off threads
56
+ .loc 1 31 53
57
+ mov.u32 %r2, 0x0;
58
+ mov.u32 %r3, 0x0;
59
+ mov.u32 %r4, 0x0;
60
+ mov.u32 %r5, 0x0;
61
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; // predicated load of 4 consecutive 32-bit words
62
+ @!%p1 mov.u32 %r2, %r6;
63
+ @!%p1 mov.u32 %r3, %r6;
64
+ @!%p1 mov.u32 %r4, %r6;
65
+ @!%p1 mov.u32 %r5, %r6;
66
+ mov.b32 %f1, %r2;
67
+ mov.b32 %f2, %r3;
68
+ mov.b32 %f3, %r4;
69
+ mov.b32 %f4, %r5;
70
+ .loc 1 33 23
71
+ add.f32 %f5, %f1, 0f00000000; // each loaded value + 0.0
72
+ add.f32 %f6, %f2, 0f00000000;
73
+ add.f32 %f7, %f3, 0f00000000;
74
+ add.f32 %f8, %f4, 0f00000000;
75
+ .loc 1 34 38
76
+ selp.f32 %f9, %f5, 0f00000000, %p1; // masked-off threads contribute 0.0
77
+ selp.f32 %f10, %f6, 0f00000000, %p1;
78
+ selp.f32 %f11, %f7, 0f00000000, %p1;
79
+ selp.f32 %f12, %f8, 0f00000000, %p1;
80
+ $L__tmp1:
81
+ .loc 2 243 36
82
+ mov.b32 %r33, %f9; // --- value 0: butterfly sum over 32 lanes (xor distances 16, 8, 4, 2, 1) ---
83
+ shfl.sync.bfly.b32 %r34, %r33, 16, 31, -1;
84
+ mov.b32 %f13, %r34;
85
+ $L__tmp2:
86
+ .loc 2 233 15
87
+ add.f32 %f14, %f9, %f13;
88
+ $L__tmp3:
89
+ .loc 2 243 36
90
+ mov.b32 %r35, %f14;
91
+ shfl.sync.bfly.b32 %r36, %r35, 8, 31, -1;
92
+ mov.b32 %f15, %r36;
93
+ $L__tmp4:
94
+ .loc 2 233 15
95
+ add.f32 %f16, %f14, %f15;
96
+ $L__tmp5:
97
+ .loc 2 243 36
98
+ mov.b32 %r37, %f16;
99
+ shfl.sync.bfly.b32 %r38, %r37, 4, 31, -1;
100
+ mov.b32 %f17, %r38;
101
+ $L__tmp6:
102
+ .loc 2 233 15
103
+ add.f32 %f18, %f16, %f17;
104
+ $L__tmp7:
105
+ .loc 2 243 36
106
+ mov.b32 %r39, %f18;
107
+ shfl.sync.bfly.b32 %r40, %r39, 2, 31, -1;
108
+ mov.b32 %f19, %r40;
109
+ $L__tmp8:
110
+ .loc 2 233 15
111
+ add.f32 %f20, %f18, %f19;
112
+ $L__tmp9:
113
+ .loc 2 243 36
114
+ mov.b32 %r41, %f20;
115
+ shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1;
116
+ mov.b32 %f21, %r42;
117
+ $L__tmp10:
118
+ .loc 2 233 15
119
+ add.f32 %f22, %f20, %f21; // %f22 = 32-lane partial sum of value 0
120
+ $L__tmp11:
121
+ .loc 2 243 36
122
+ mov.b32 %r43, %f10; // --- value 1: same butterfly sum ---
123
+ shfl.sync.bfly.b32 %r44, %r43, 16, 31, -1;
124
+ mov.b32 %f23, %r44;
125
+ $L__tmp12:
126
+ .loc 2 233 15
127
+ add.f32 %f24, %f10, %f23;
128
+ $L__tmp13:
129
+ .loc 2 243 36
130
+ mov.b32 %r45, %f24;
131
+ shfl.sync.bfly.b32 %r46, %r45, 8, 31, -1;
132
+ mov.b32 %f25, %r46;
133
+ $L__tmp14:
134
+ .loc 2 233 15
135
+ add.f32 %f26, %f24, %f25;
136
+ $L__tmp15:
137
+ .loc 2 243 36
138
+ mov.b32 %r47, %f26;
139
+ shfl.sync.bfly.b32 %r48, %r47, 4, 31, -1;
140
+ mov.b32 %f27, %r48;
141
+ $L__tmp16:
142
+ .loc 2 233 15
143
+ add.f32 %f28, %f26, %f27;
144
+ $L__tmp17:
145
+ .loc 2 243 36
146
+ mov.b32 %r49, %f28;
147
+ shfl.sync.bfly.b32 %r50, %r49, 2, 31, -1;
148
+ mov.b32 %f29, %r50;
149
+ $L__tmp18:
150
+ .loc 2 233 15
151
+ add.f32 %f30, %f28, %f29;
152
+ $L__tmp19:
153
+ .loc 2 243 36
154
+ mov.b32 %r51, %f30;
155
+ shfl.sync.bfly.b32 %r52, %r51, 1, 31, -1;
156
+ mov.b32 %f31, %r52;
157
+ $L__tmp20:
158
+ .loc 2 233 15
159
+ add.f32 %f32, %f30, %f31; // %f32 = 32-lane partial sum of value 1
160
+ $L__tmp21:
161
+ .loc 2 243 36
162
+ mov.b32 %r53, %f11; // --- value 2: same butterfly sum ---
163
+ shfl.sync.bfly.b32 %r54, %r53, 16, 31, -1;
164
+ mov.b32 %f33, %r54;
165
+ $L__tmp22:
166
+ .loc 2 233 15
167
+ add.f32 %f34, %f11, %f33;
168
+ $L__tmp23:
169
+ .loc 2 243 36
170
+ mov.b32 %r55, %f34;
171
+ shfl.sync.bfly.b32 %r56, %r55, 8, 31, -1;
172
+ mov.b32 %f35, %r56;
173
+ $L__tmp24:
174
+ .loc 2 233 15
175
+ add.f32 %f36, %f34, %f35;
176
+ $L__tmp25:
177
+ .loc 2 243 36
178
+ mov.b32 %r57, %f36;
179
+ shfl.sync.bfly.b32 %r58, %r57, 4, 31, -1;
180
+ mov.b32 %f37, %r58;
181
+ $L__tmp26:
182
+ .loc 2 233 15
183
+ add.f32 %f38, %f36, %f37;
184
+ $L__tmp27:
185
+ .loc 2 243 36
186
+ mov.b32 %r59, %f38;
187
+ shfl.sync.bfly.b32 %r60, %r59, 2, 31, -1;
188
+ mov.b32 %f39, %r60;
189
+ $L__tmp28:
190
+ .loc 2 233 15
191
+ add.f32 %f40, %f38, %f39;
192
+ $L__tmp29:
193
+ .loc 2 243 36
194
+ mov.b32 %r61, %f40;
195
+ shfl.sync.bfly.b32 %r62, %r61, 1, 31, -1;
196
+ mov.b32 %f41, %r62;
197
+ $L__tmp30:
198
+ .loc 2 233 15
199
+ add.f32 %f42, %f40, %f41; // %f42 = 32-lane partial sum of value 2
200
+ $L__tmp31:
201
+ .loc 2 243 36
202
+ mov.b32 %r63, %f12; // --- value 3: same butterfly sum ---
203
+ shfl.sync.bfly.b32 %r64, %r63, 16, 31, -1;
204
+ mov.b32 %f43, %r64;
205
+ $L__tmp32:
206
+ .loc 2 233 15
207
+ add.f32 %f44, %f12, %f43;
208
+ $L__tmp33:
209
+ .loc 2 243 36
210
+ mov.b32 %r65, %f44;
211
+ shfl.sync.bfly.b32 %r66, %r65, 8, 31, -1;
212
+ mov.b32 %f45, %r66;
213
+ $L__tmp34:
214
+ .loc 2 233 15
215
+ add.f32 %f46, %f44, %f45;
216
+ $L__tmp35:
217
+ .loc 2 243 36
218
+ mov.b32 %r67, %f46;
219
+ shfl.sync.bfly.b32 %r68, %r67, 4, 31, -1;
220
+ mov.b32 %f47, %r68;
221
+ $L__tmp36:
222
+ .loc 2 233 15
223
+ add.f32 %f48, %f46, %f47;
224
+ $L__tmp37:
225
+ .loc 2 243 36
226
+ mov.b32 %r69, %f48;
227
+ shfl.sync.bfly.b32 %r70, %r69, 2, 31, -1;
228
+ mov.b32 %f49, %r70;
229
+ $L__tmp38:
230
+ .loc 2 233 15
231
+ add.f32 %f50, %f48, %f49;
232
+ $L__tmp39:
233
+ .loc 2 243 36
234
+ mov.b32 %r71, %f50;
235
+ shfl.sync.bfly.b32 %r72, %r71, 1, 31, -1;
236
+ mov.b32 %f51, %r72;
237
+ $L__tmp40:
238
+ .loc 2 233 15
239
+ add.f32 %f52, %f50, %f51; // %f52 = 32-lane partial sum of value 3
240
+ $L__tmp41:
241
+ .loc 2 243 36
242
+ setp.eq.s32 %p6, %r25, 0; // only lane 0 of each 32-thread group publishes its partials
243
+ shl.b32 %r73, %r27, 2; // group index * 4 bytes
244
+ mov.u32 %r74, global_smem;
245
+ add.s32 %r10, %r74, %r73; // value 0 slot for this group
246
+ mov.b32 %r11, %f22;
247
+ @%p6 st.shared.b32 [ %r10 + 0 ], %r11;
248
+ add.s32 %r12, %r10, 16; // value 1 partials start 16 bytes in
249
+ mov.b32 %r13, %f32;
250
+ @%p6 st.shared.b32 [ %r12 + 0 ], %r13;
251
+ add.s32 %r14, %r10, 32; // value 2 partials start 32 bytes in
252
+ mov.b32 %r15, %f42;
253
+ @%p6 st.shared.b32 [ %r14 + 0 ], %r15;
254
+ add.s32 %r16, %r10, 48; // value 3 partials start 48 bytes in
255
+ mov.b32 %r17, %f52;
256
+ @%p6 st.shared.b32 [ %r16 + 0 ], %r17;
257
+ bar.sync 0; // all 16 partials are in shared memory after this barrier
258
+ setp.lt.s32 %p10, %r24, 16; // second stage: threads 0..15 each pick up one partial
259
+ shl.b32 %r75, %r24, 2;
260
+ add.s32 %r19, %r74, %r75;
261
+ @%p10 ld.shared.b32 %r18, [ %r19 + 0 ]; // %r18 is not written for threads >= 16; their results are never stored (store below is gated by %p11)
262
+ mov.b32 %f53, %r18;
263
+ shfl.sync.bfly.b32 %r76, %r18, 2, 31, -1; // xor distances 2 and 1: sum each run of 4 consecutive slots
264
+ mov.b32 %f54, %r76;
265
+ $L__tmp42:
266
+ .loc 2 233 15
267
+ add.f32 %f55, %f53, %f54;
268
+ $L__tmp43:
269
+ .loc 2 243 36
270
+ mov.b32 %r77, %f55;
271
+ shfl.sync.bfly.b32 %r78, %r77, 1, 31, -1;
272
+ mov.b32 %f56, %r78;
273
+ $L__tmp44:
274
+ .loc 2 233 15
275
+ add.f32 %f57, %f55, %f56;
276
+ $L__tmp45:
277
+ .loc 2 243 36
278
+ setp.eq.s32 %p14, %r26, 0;
279
+ and.pred %p11, %p10, %p14; // threads 0, 4, 8, 12 write the block-wide totals back to their own slot
280
+ mov.b32 %r21, %f57;
281
+ @%p11 st.shared.b32 [ %r19 + 0 ], %r21;
282
+ bar.sync 0;
283
+ ld.shared.f32 %f58, [global_smem]; // totals live at byte offsets 0, 16, 32, 48
284
+ ld.shared.f32 %f59, [global_smem+16];
285
+ ld.shared.f32 %f60, [global_smem+32];
286
+ ld.shared.f32 %f61, [global_smem+48];
287
+ $L__tmp46:
288
+ .loc 1 35 28
289
+ bar.sync 0; // barrier before the same shared buffer is overwritten
290
+ st.shared.f32 [global_smem], %f58; // repack the 4 totals contiguously at byte offsets 0, 4, 8, 12
291
+ st.shared.f32 [global_smem+4], %f59;
292
+ st.shared.f32 [global_smem+8], %f60;
293
+ st.shared.f32 [global_smem+12], %f61;
294
+ bar.sync 0;
295
+ shl.b32 %r79, %r26, 2;
296
+ add.s32 %r80, %r74, %r79; // address of the total for value (tid.x & 3)
297
+ .loc 1 36 20
298
+ shr.s32 %r82, %r30, 31; // signed divide of the output index by 256, rounding toward zero:
299
+ shr.u32 %r83, %r82, 24; // bias of 255 for negative inputs, 0 otherwise
300
+ add.s32 %r84, %r30, %r83;
301
+ shr.s32 %r85, %r84, 8; // %r85 = quotient
302
+ and.b32 %r86, %r84, -256;
303
+ sub.s32 %r87, %r30, %r86; // %r87 = remainder
304
+ .loc 1 38 30
305
+ mul.wide.s32 %rd9, %r85, 8; // 8-byte index elements
306
+ add.s64 %rd3, %rd6, %rd9;
307
+ .loc 1 45 55
308
+ ld.shared.u32 %r23, [%r80]; // value to be added atomically below
309
+ mov.pred %p12, -1; // always-true predicate: the index load is unconditional
310
+ .loc 1 38 35
311
+ mov.u64 %rd2, 0x0;
312
+ @%p12 ld.global.L1::evict_last.b64 { %rd2 }, [ %rd3 + 0 ];
313
+ .loc 1 41 32
314
+ shr.u64 %rd10, %rd2, 54; // moves the sign bit (bit 63) down to bit 9
315
+ and.b64 %rd11, %rd10, 512; // 512 if the loaded index is negative, else 0
316
+ add.s64 %rd12, %rd11, %rd2; // negative indices are shifted up by 512
317
+ .loc 1 45 30
318
+ shl.b64 %rd13, %rd12, 10; // index * 1024 bytes (256 elements of 4 bytes)
319
+ add.s64 %rd14, %rd7, %rd13;
320
+ mul.wide.s32 %rd15, %r87, 4;
321
+ add.s64 %rd4, %rd14, %rd15; // destination address in param_2
322
+ .loc 1 45 55
323
+ bfe.u32 %r88, %r24, 2, 3;
324
+ shl.b32 %r89, %r27, 3;
325
+ or.b32 %r90, %r89, %r88;
326
+ setp.eq.s32 %p13, %r90, 0; // true only for tid.x 0..3, so each total is added once per block
327
+ mov.u32 %r22, 0x0;
328
+ @%p13 atom.global.gpu.acq_rel.add.f32 %r22, [ %rd4 + 0 ], %r23; // predicated f32 atomic add; the returned old value (%r22) is unused
329
+ .loc 1 45 4
330
+ ret;
331
+ $L__tmp47:
332
+ $L__func_end0:
333
+
334
+ }
335
+ .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
336
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
337
+ .section .debug_abbrev
338
+ {
339
+ .b8 1
340
+ .b8 17
341
+ .b8 1
342
+ .b8 37
343
+ .b8 8
344
+ .b8 19
345
+ .b8 5
346
+ .b8 3
347
+ .b8 8
348
+ .b8 16
349
+ .b8 6
350
+ .b8 27
351
+ .b8 8
352
+ .b8 180
353
+ .b8 66
354
+ .b8 12
355
+ .b8 17
356
+ .b8 1
357
+ .b8 18
358
+ .b8 1
359
+ .b8 0
360
+ .b8 0
361
+ .b8 2
362
+ .b8 46
363
+ .b8 0
364
+ .b8 135
365
+ .b8 64
366
+ .b8 8
367
+ .b8 3
368
+ .b8 8
369
+ .b8 58
370
+ .b8 11
371
+ .b8 59
372
+ .b8 11
373
+ .b8 63
374
+ .b8 12
375
+ .b8 32
376
+ .b8 11
377
+ .b8 0
378
+ .b8 0
379
+ .b8 3
380
+ .b8 46
381
+ .b8 1
382
+ .b8 17
383
+ .b8 1
384
+ .b8 18
385
+ .b8 1
386
+ .b8 64
387
+ .b8 10
388
+ .b8 49
389
+ .b8 19
390
+ .b8 0
391
+ .b8 0
392
+ .b8 4
393
+ .b8 29
394
+ .b8 0
395
+ .b8 49
396
+ .b8 19
397
+ .b8 17
398
+ .b8 1
399
+ .b8 18
400
+ .b8 1
401
+ .b8 88
402
+ .b8 11
403
+ .b8 89
404
+ .b8 11
405
+ .b8 87
406
+ .b8 11
407
+ .b8 0
408
+ .b8 0
409
+ .b8 5
410
+ .b8 29
411
+ .b8 1
412
+ .b8 49
413
+ .b8 19
414
+ .b8 17
415
+ .b8 1
416
+ .b8 18
417
+ .b8 1
418
+ .b8 88
419
+ .b8 11
420
+ .b8 89
421
+ .b8 11
422
+ .b8 87
423
+ .b8 11
424
+ .b8 0
425
+ .b8 0
426
+ .b8 0
427
+ }
428
+ .section .debug_info
429
+ {
430
+ .b32 264
431
+ .b8 2
432
+ .b8 0
433
+ .b32 .debug_abbrev
434
+ .b8 8
435
+ .b8 1
436
+ .b8 116
437
+ .b8 114
438
+ .b8 105
439
+ .b8 116
440
+ .b8 111
441
+ .b8 110
442
+ .b8 0
443
+ .b8 2
444
+ .b8 0
445
+ .b8 99
446
+ .b8 54
447
+ .b8 105
448
+ .b8 107
449
+ .b8 53
450
+ .b8 118
451
+ .b8 120
452
+ .b8 55
453
+ .b8 112
454
+ .b8 50
455
+ .b8 50
456
+ .b8 102
457
+ .b8 112
458
+ .b8 107
459
+ .b8 52
460
+ .b8 100
461
+ .b8 99
462
+ .b8 118
463
+ .b8 104
464
+ .b8 53
465
+ .b8 53
466
+ .b8 122
467
+ .b8 105
468
+ .b8 109
469
+ .b8 119
470
+ .b8 52
471
+ .b8 116
472
+ .b8 53
473
+ .b8 110
474
+ .b8 114
475
+ .b8 53
476
+ .b8 122
477
+ .b8 110
478
+ .b8 50
479
+ .b8 98
480
+ .b8 55
481
+ .b8 105
482
+ .b8 110
483
+ .b8 117
484
+ .b8 106
485
+ .b8 120
486
+ .b8 106
487
+ .b8 97
488
+ .b8 117
489
+ .b8 120
490
+ .b8 115
491
+ .b8 104
492
+ .b8 108
493
+ .b8 106
494
+ .b8 117
495
+ .b8 109
496
+ .b8 109
497
+ .b8 46
498
+ .b8 112
499
+ .b8 121
500
+ .b8 0
501
+ .b32 .debug_line
502
+ .b8 47
503
+ .b8 116
504
+ .b8 109
505
+ .b8 112
506
+ .b8 47
507
+ .b8 116
508
+ .b8 111
509
+ .b8 114
510
+ .b8 99
511
+ .b8 104
512
+ .b8 105
513
+ .b8 110
514
+ .b8 100
515
+ .b8 117
516
+ .b8 99
517
+ .b8 116
518
+ .b8 111
519
+ .b8 114
520
+ .b8 95
521
+ .b8 114
522
+ .b8 111
523
+ .b8 111
524
+ .b8 116
525
+ .b8 47
526
+ .b8 54
527
+ .b8 105
528
+ .b8 0
529
+ .b8 1
530
+ .b64 $L__func_begin0
531
+ .b64 $L__func_end0
532
+ .b8 2
533
+ .b8 116
534
+ .b8 114
535
+ .b8 105
536
+ .b8 116
537
+ .b8 111
538
+ .b8 110
539
+ .b8 95
540
+ .b8 95
541
+ .b8 48
542
+ .b8 100
543
+ .b8 49
544
+ .b8 100
545
+ .b8 50
546
+ .b8 100
547
+ .b8 51
548
+ .b8 100
549
+ .b8 101
550
+ .b8 52
551
+ .b8 101
552
+ .b8 0
553
+ .b8 116
554
+ .b8 114
555
+ .b8 105
556
+ .b8 116
557
+ .b8 111
558
+ .b8 110
559
+ .b8 95
560
+ .b8 95
561
+ .b8 48
562
+ .b8 100
563
+ .b8 49
564
+ .b8 100
565
+ .b8 50
566
+ .b8 100
567
+ .b8 51
568
+ .b8 100
569
+ .b8 101
570
+ .b8 52
571
+ .b8 101
572
+ .b8 0
573
+ .b8 1
574
+ .b8 18
575
+ .b8 1
576
+ .b8 1
577
+ .b8 3
578
+ .b64 $L__func_begin0
579
+ .b64 $L__func_end0
580
+ .b8 1
581
+ .b8 156
582
+ .b32 125
583
+ .b8 4
584
+ .b32 125
585
+ .b64 $L__tmp1
586
+ .b64 $L__tmp46
587
+ .b8 2
588
+ .b8 35
589
+ .b8 25
590
+ .b8 5
591
+ .b32 125
592
+ .b64 $L__tmp2
593
+ .b64 $L__tmp45
594
+ .b8 2
595
+ .b8 35
596
+ .b8 25
597
+ .b8 4
598
+ .b32 125
599
+ .b64 $L__tmp2
600
+ .b64 $L__tmp45
601
+ .b8 2
602
+ .b8 243
603
+ .b8 36
604
+ .b8 0
605
+ .b8 0
606
+ .b8 0
607
+ }
608
+ .section .debug_pubnames
609
+ {
610
+ .b32 $L__pubNames_end0-$L__pubNames_start0
611
+ $L__pubNames_start0:
612
+ .b8 2
613
+ .b8 0
614
+ .b32 .debug_info
615
+ .b32 268
616
+ .b32 125
617
+ .b8 116
618
+ .b8 114
619
+ .b8 105
620
+ .b8 116
621
+ .b8 111
622
+ .b8 110
623
+ .b8 95
624
+ .b8 95
625
+ .b8 48
626
+ .b8 100
627
+ .b8 49
628
+ .b8 100
629
+ .b8 50
630
+ .b8 100
631
+ .b8 51
632
+ .b8 100
633
+ .b8 101
634
+ .b8 52
635
+ .b8 101
636
+ .b8 0
637
+ .b32 0
638
+ $L__pubNames_end0:
639
+ }
640
+ .section .debug_pubtypes
641
+ {
642
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
643
+ $L__pubTypes_start0:
644
+ .b8 2
645
+ .b8 0
646
+ .b32 .debug_info
647
+ .b32 268
648
+ .b32 0
649
+ $L__pubTypes_end0:
650
+ }
651
+ .section .debug_loc { }
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttir ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Triton IR for kernel @triton__0d1d2d3de4e.
// Each program instance covers 4 consecutive indices r = program_id(x) * 4 + [0, 4).
// For every r it sums the f32 values %arg0[r + c * 131072] for c in [0, 120), then
// applies an atomic read-modify-write of that sum to %arg2 at an offset derived
// from an i64 value loaded from %arg1[r / 256].
// %arg3 and %arg4 are not referenced anywhere in the body.
+ module {
2
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
3
// Splat constants used below: 256 / 0 / 512 (i64), an all-true mask, 256 (i32),
// the column stride 131072, the column bound 120, a 4x128 zero tile, and scalar 4.
+ %cst = arith.constant dense<256> : tensor<4x1xi64>
4
+ %cst_0 = arith.constant dense<0> : tensor<4x1xi64>
5
+ %cst_1 = arith.constant dense<512> : tensor<4x1xi64>
6
+ %cst_2 = arith.constant dense<true> : tensor<4x1xi1>
7
+ %cst_3 = arith.constant dense<256> : tensor<4x1xi32>
8
+ %cst_4 = arith.constant dense<131072> : tensor<1x128xi32>
9
+ %cst_5 = arith.constant dense<120> : tensor<1x128xi32>
10
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<4x128xf32>
11
+ %c4_i32 = arith.constant 4 : i32
12
// %5 (4x1) = program_id(x) * 4 + [0, 1, 2, 3]: the indices handled by this program.
+ %0 = tt.get_program_id x : i32
13
+ %1 = arith.muli %0, %c4_i32 : i32
14
+ %2 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
15
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<4xi32>) -> tensor<4x1xi32>
16
+ %4 = tt.splat %1 : (i32) -> tensor<4x1xi32>
17
+ %5 = arith.addi %4, %3 : tensor<4x1xi32>
18
// %7 (1x128) = column ids 0..127; %8 masks columns < 120; %9 scales columns by 131072.
+ %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
19
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32>
20
+ %8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32>
21
+ %9 = arith.muli %7, %cst_4 : tensor<1x128xi32>
22
// %12 (4x128) = %5 + column * 131072: element offsets into %arg0.
+ %10 = tt.broadcast %5 : (tensor<4x1xi32>) -> tensor<4x128xi32>
23
+ %11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<4x128xi32>
24
+ %12 = arith.addi %10, %11 : tensor<4x128xi32>
25
+ %13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<4x128x!tt.ptr<f32, 1>>
26
+ %14 = tt.addptr %13, %12 : tensor<4x128x!tt.ptr<f32, 1>>, tensor<4x128xi32>
27
// Masked load: lanes with column >= 120 receive the zero tile %cst_6 instead of memory.
+ %15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<4x128xi1>
28
+ %16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<4x128xf32>
29
// Adds the zero tile to the loaded values, then re-applies the mask so masked lanes stay 0.
+ %17 = arith.addf %16, %cst_6 : tensor<4x128xf32>
30
+ %18 = arith.select %15, %17, %cst_6 : tensor<4x128xi1>, tensor<4x128xf32>
31
// Sum-reduce along axis 1 (the 128-wide axis), giving one f32 per index: tensor<4xf32>.
+ %19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({
32
+ ^bb0(%arg5: f32, %arg6: f32):
33
+ %35 = arith.addf %arg5, %arg6 : f32
34
+ tt.reduce.return %35 : f32
35
+ }) : (tensor<4x128xf32>) -> tensor<4xf32>
36
+ %20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<4xf32>) -> tensor<4x1xf32>
37
// Split each index into a signed quotient and remainder by 256.
+ %21 = arith.divsi %5, %cst_3 : tensor<4x1xi32>
38
+ %22 = arith.remsi %5, %cst_3 : tensor<4x1xi32>
39
// Unmasked load of an i64 from %arg1 at the quotient.
// NOTE(review): presumably an index into a 512-entry dimension, given the +512 wrap below -- confirm.
+ %23 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<4x1x!tt.ptr<i64, 1>>
40
+ %24 = tt.addptr %23, %21 : tensor<4x1x!tt.ptr<i64, 1>>, tensor<4x1xi32>
41
+ %25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<4x1xi64>
42
// If the loaded value is negative, use value + 512 instead; otherwise keep it.
+ %26 = arith.addi %25, %cst_1 : tensor<4x1xi64>
43
+ %27 = arith.cmpi slt, %25, %cst_0 : tensor<4x1xi64>
44
+ %28 = arith.select %27, %26, %25 : tensor<4x1xi1>, tensor<4x1xi64>
45
// Destination offset %31 = (index % 256) + adjusted value * 256, computed in i64.
+ %29 = arith.muli %28, %cst : tensor<4x1xi64>
46
+ %30 = arith.extsi %22 : tensor<4x1xi32> to tensor<4x1xi64>
47
+ %31 = arith.addi %30, %29 : tensor<4x1xi64>
48
+ %32 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<4x1x!tt.ptr<f32, 1>>
49
+ %33 = tt.addptr %32, %31 : tensor<4x1x!tt.ptr<f32, 1>>, tensor<4x1xi64>
50
// Atomic read-modify-write of the sums %20 into %arg2 under the all-true mask %cst_2.
// NOTE(review): atomic_rmw_op = 5 is presumably floating-point add in Triton's RMWOp enum -- confirm.
// The returned old values (%34) are unused.
+ %34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<4x1x!tt.ptr<f32, 1>>, tensor<4x1xf32>, tensor<4x1xi1>) -> tensor<4x1xf32>
51
+ tt.return
52
+ }
53
+ }
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.cubin ADDED
Binary file (23.9 kB). View file
 
.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.llir ADDED
@@ -0,0 +1,858 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
; NUL-terminated "__CUDA_FTZ" key string (11 bytes including the terminator).
; The kernel below passes @.str to @__nvvm_reflect and, when the call returns 0,
; selects the non-.ftz intrinsic result; otherwise it selects the .ftz result.
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
5
+
6
+ define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !7 {
7
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
8
+ %4 = shl i32 %3, 3, !dbg !10
9
+ %5 = and i32 %4, 1016, !dbg !10
10
+ %6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
11
+ %7 = shl i32 %6, 10, !dbg !12
12
+ %8 = or i32 %7, %5, !dbg !13
13
+ %9 = sext i32 %8 to i64, !dbg !14
14
+ %10 = getelementptr i16, ptr addrspace(1) %0, i64 %9, !dbg !14
15
+ %11 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %10, i1 true) #4, !dbg !15
16
+ %12 = extractvalue { i32, i32, i32, i32 } %11, 0, !dbg !15
17
+ %13 = extractvalue { i32, i32, i32, i32 } %11, 1, !dbg !15
18
+ %14 = extractvalue { i32, i32, i32, i32 } %11, 2, !dbg !15
19
+ %15 = extractvalue { i32, i32, i32, i32 } %11, 3, !dbg !15
20
+ %16 = trunc i32 %12 to i16, !dbg !15
21
+ %extelt.offset = lshr i32 %12, 16, !dbg !15
22
+ %17 = trunc i32 %extelt.offset to i16, !dbg !15
23
+ %18 = trunc i32 %13 to i16, !dbg !15
24
+ %extelt.offset1 = lshr i32 %13, 16, !dbg !15
25
+ %19 = trunc i32 %extelt.offset1 to i16, !dbg !15
26
+ %20 = trunc i32 %14 to i16, !dbg !15
27
+ %extelt.offset2 = lshr i32 %14, 16, !dbg !15
28
+ %21 = trunc i32 %extelt.offset2 to i16, !dbg !15
29
+ %22 = trunc i32 %15 to i16, !dbg !15
30
+ %extelt.offset3 = lshr i32 %15, 16, !dbg !15
31
+ %23 = trunc i32 %extelt.offset3 to i16, !dbg !15
32
+ %24 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %16) #4, !dbg !16
33
+ %25 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %17) #4, !dbg !16
34
+ %26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %18) #4, !dbg !16
35
+ %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !16
36
+ %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !16
37
+ %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %21) #4, !dbg !16
38
+ %30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #4, !dbg !16
39
+ %31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #4, !dbg !16
40
+ %32 = fmul float %24, 0x3FE6A09E60000000, !dbg !17
41
+ %33 = fmul float %25, 0x3FE6A09E60000000, !dbg !17
42
+ %34 = fmul float %26, 0x3FE6A09E60000000, !dbg !17
43
+ %35 = fmul float %27, 0x3FE6A09E60000000, !dbg !17
44
+ %36 = fmul float %28, 0x3FE6A09E60000000, !dbg !17
45
+ %37 = fmul float %29, 0x3FE6A09E60000000, !dbg !17
46
+ %38 = fmul float %30, 0x3FE6A09E60000000, !dbg !17
47
+ %39 = fmul float %31, 0x3FE6A09E60000000, !dbg !17
48
+ %40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
49
+ %.not.i = icmp eq i32 %40, 0, !dbg !18
50
+ %41 = tail call float @llvm.nvvm.fabs.ftz.f(float %32) #4, !dbg !18
51
+ %42 = tail call float @llvm.nvvm.fabs.f(float %32) #4, !dbg !18
52
+ %.0.i = select i1 %.not.i, float %42, float %41, !dbg !18
53
+ %43 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !18
54
+ br i1 %43, label %__nv_fabsf.exit1.i, label %45, !dbg !18
55
+
56
+ __nv_fabsf.exit1.i: ; preds = %2
57
+ %44 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
58
+ %.not1.i = icmp eq i32 %44, 0, !dbg !18
59
+ %.01.i = select i1 %.not1.i, float %42, float %41, !dbg !18
60
+ br label %__internal_fmad.exit.i, !dbg !18
61
+
62
+ 45: ; preds = %2
63
+ %46 = fmul float %32, %32, !dbg !18
64
+ br label %__internal_fmad.exit.i, !dbg !18
65
+
66
+ __internal_fmad.exit.i: ; preds = %45, %__nv_fabsf.exit1.i
67
+ %47 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %45 ], !dbg !18
68
+ %48 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %45 ], !dbg !18
69
+ %49 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %45 ], !dbg !18
70
+ %50 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %45 ], !dbg !18
71
+ %51 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %45 ], !dbg !18
72
+ %52 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %45 ], !dbg !18
73
+ %53 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %45 ], !dbg !18
74
+ %54 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %46, %45 ], !dbg !18
75
+ %55 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
76
+ %.not2.i = icmp eq i32 %55, 0, !dbg !18
77
+ %56 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %53, float %54, float %52) #4, !dbg !18
78
+ %57 = tail call float @llvm.nvvm.fma.rn.f(float %53, float %54, float %52) #4, !dbg !18
79
+ %.02.i = select i1 %.not2.i, float %57, float %56, !dbg !18
80
+ %58 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
81
+ %.not3.i = icmp eq i32 %58, 0, !dbg !18
82
+ %59 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %54, float %51) #4, !dbg !18
83
+ %60 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %54, float %51) #4, !dbg !18
84
+ %.03.i = select i1 %.not3.i, float %60, float %59, !dbg !18
85
+ %61 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
86
+ %.not4.i = icmp eq i32 %61, 0, !dbg !18
87
+ %62 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %54, float %50) #4, !dbg !18
88
+ %63 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %54, float %50) #4, !dbg !18
89
+ %.04.i = select i1 %.not4.i, float %63, float %62, !dbg !18
90
+ %64 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
91
+ %.not5.i = icmp eq i32 %64, 0, !dbg !18
92
+ %65 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %54, float %49) #4, !dbg !18
93
+ %66 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %54, float %49) #4, !dbg !18
94
+ %.05.i = select i1 %.not5.i, float %66, float %65, !dbg !18
95
+ %67 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
96
+ %.not6.i = icmp eq i32 %67, 0, !dbg !18
97
+ %68 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %54, float %48) #4, !dbg !18
98
+ %69 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %54, float %48) #4, !dbg !18
99
+ %.06.i = select i1 %.not6.i, float %69, float %68, !dbg !18
100
+ %70 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
101
+ %.not7.i = icmp eq i32 %70, 0, !dbg !18
102
+ %71 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %54, float %47) #4, !dbg !18
103
+ %72 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %54, float %47) #4, !dbg !18
104
+ %.07.i = select i1 %.not7.i, float %72, float %71, !dbg !18
105
+ %73 = fneg float %54, !dbg !18
106
+ %74 = select i1 %43, float %73, float %32, !dbg !18
107
+ %75 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
108
+ %.not8.i = icmp eq i32 %75, 0, !dbg !18
109
+ %76 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %74, float %74) #4, !dbg !18
110
+ %77 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %74, float %74) #4, !dbg !18
111
+ %.08.i = select i1 %.not8.i, float %77, float %76, !dbg !18
112
+ br i1 %43, label %78, label %__nv_erff.exit, !dbg !18
113
+
114
+ 78: ; preds = %__internal_fmad.exit.i
115
+ %79 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !18
116
+ %80 = fsub float 1.000000e+00, %79, !dbg !18
117
+ %81 = bitcast float %80 to i32, !dbg !18
118
+ %82 = bitcast float %32 to i32, !dbg !18
119
+ %83 = and i32 %82, -2147483648, !dbg !18
120
+ %84 = or i32 %83, %81, !dbg !18
121
+ %85 = bitcast i32 %84 to float, !dbg !18
122
+ br label %__nv_erff.exit, !dbg !18
123
+
124
+ __nv_erff.exit: ; preds = %__internal_fmad.exit.i, %78
125
+ %r.0.i = phi float [ %85, %78 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !18
126
+ %86 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
127
+ %.not.i4 = icmp eq i32 %86, 0, !dbg !18
128
+ %87 = tail call float @llvm.nvvm.fabs.ftz.f(float %33) #4, !dbg !18
129
+ %88 = tail call float @llvm.nvvm.fabs.f(float %33) #4, !dbg !18
130
+ %.0.i5 = select i1 %.not.i4, float %88, float %87, !dbg !18
131
+ %89 = fcmp oge float %.0.i5, 0x3FF00C1FC0000000, !dbg !18
132
+ br i1 %89, label %__nv_fabsf.exit1.i22, label %91, !dbg !18
133
+
134
+ __nv_fabsf.exit1.i22: ; preds = %__nv_erff.exit
135
+ %90 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
136
+ %.not1.i23 = icmp eq i32 %90, 0, !dbg !18
137
+ %.01.i24 = select i1 %.not1.i23, float %88, float %87, !dbg !18
138
+ br label %__internal_fmad.exit.i6, !dbg !18
139
+
140
+ 91: ; preds = %__nv_erff.exit
141
+ %92 = fmul float %33, %33, !dbg !18
142
+ br label %__internal_fmad.exit.i6, !dbg !18
143
+
144
+ __internal_fmad.exit.i6: ; preds = %91, %__nv_fabsf.exit1.i22
145
+ %93 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i22 ], [ 0x3FC06EBA60000000, %91 ], !dbg !18
146
+ %94 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i22 ], [ 0xBFD8127580000000, %91 ], !dbg !18
147
+ %95 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i22 ], [ 0x3FBCE315E0000000, %91 ], !dbg !18
148
+ %96 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i22 ], [ 0xBF9B837CE0000000, %91 ], !dbg !18
149
+ %97 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i22 ], [ 0x3F755ABD40000000, %91 ], !dbg !18
150
+ %98 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i22 ], [ 0xBF4AE9A400000000, %91 ], !dbg !18
151
+ %99 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i22 ], [ 0x3F163D2D40000000, %91 ], !dbg !18
152
+ %100 = phi float [ %.01.i24, %__nv_fabsf.exit1.i22 ], [ %92, %91 ], !dbg !18
153
+ %101 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
154
+ %.not2.i7 = icmp eq i32 %101, 0, !dbg !18
155
+ %102 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %99, float %100, float %98) #4, !dbg !18
156
+ %103 = tail call float @llvm.nvvm.fma.rn.f(float %99, float %100, float %98) #4, !dbg !18
157
+ %.02.i8 = select i1 %.not2.i7, float %103, float %102, !dbg !18
158
+ %104 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
159
+ %.not3.i9 = icmp eq i32 %104, 0, !dbg !18
160
+ %105 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i8, float %100, float %97) #4, !dbg !18
161
+ %106 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i8, float %100, float %97) #4, !dbg !18
162
+ %.03.i10 = select i1 %.not3.i9, float %106, float %105, !dbg !18
163
+ %107 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
164
+ %.not4.i11 = icmp eq i32 %107, 0, !dbg !18
165
+ %108 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i10, float %100, float %96) #4, !dbg !18
166
+ %109 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i10, float %100, float %96) #4, !dbg !18
167
+ %.04.i12 = select i1 %.not4.i11, float %109, float %108, !dbg !18
168
+ %110 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
169
+ %.not5.i13 = icmp eq i32 %110, 0, !dbg !18
170
+ %111 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i12, float %100, float %95) #4, !dbg !18
171
+ %112 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i12, float %100, float %95) #4, !dbg !18
172
+ %.05.i14 = select i1 %.not5.i13, float %112, float %111, !dbg !18
173
+ %113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
174
+ %.not6.i15 = icmp eq i32 %113, 0, !dbg !18
175
+ %114 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i14, float %100, float %94) #4, !dbg !18
176
+ %115 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i14, float %100, float %94) #4, !dbg !18
177
+ %.06.i16 = select i1 %.not6.i15, float %115, float %114, !dbg !18
178
+ %116 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
179
+ %.not7.i17 = icmp eq i32 %116, 0, !dbg !18
180
+ %117 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i16, float %100, float %93) #4, !dbg !18
181
+ %118 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i16, float %100, float %93) #4, !dbg !18
182
+ %.07.i18 = select i1 %.not7.i17, float %118, float %117, !dbg !18
183
+ %119 = fneg float %100, !dbg !18
184
+ %120 = select i1 %89, float %119, float %33, !dbg !18
185
+ %121 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
186
+ %.not8.i19 = icmp eq i32 %121, 0, !dbg !18
187
+ %122 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i18, float %120, float %120) #4, !dbg !18
188
+ %123 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i18, float %120, float %120) #4, !dbg !18
189
+ %.08.i20 = select i1 %.not8.i19, float %123, float %122, !dbg !18
190
+ br i1 %89, label %124, label %__nv_erff.exit25, !dbg !18
191
+
192
+ 124: ; preds = %__internal_fmad.exit.i6
193
+ %125 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i20) #4, !dbg !18
194
+ %126 = fsub float 1.000000e+00, %125, !dbg !18
195
+ %127 = bitcast float %126 to i32, !dbg !18
196
+ %128 = bitcast float %33 to i32, !dbg !18
197
+ %129 = and i32 %128, -2147483648, !dbg !18
198
+ %130 = or i32 %129, %127, !dbg !18
199
+ %131 = bitcast i32 %130 to float, !dbg !18
200
+ br label %__nv_erff.exit25, !dbg !18
201
+
202
+ __nv_erff.exit25: ; preds = %__internal_fmad.exit.i6, %124
203
+ %r.0.i21 = phi float [ %131, %124 ], [ %.08.i20, %__internal_fmad.exit.i6 ], !dbg !18
204
+ %132 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
205
+ %.not.i26 = icmp eq i32 %132, 0, !dbg !18
206
+ %133 = tail call float @llvm.nvvm.fabs.ftz.f(float %34) #4, !dbg !18
207
+ %134 = tail call float @llvm.nvvm.fabs.f(float %34) #4, !dbg !18
208
+ %.0.i27 = select i1 %.not.i26, float %134, float %133, !dbg !18
209
+ %135 = fcmp oge float %.0.i27, 0x3FF00C1FC0000000, !dbg !18
210
+ br i1 %135, label %__nv_fabsf.exit1.i44, label %137, !dbg !18
211
+
212
+ __nv_fabsf.exit1.i44: ; preds = %__nv_erff.exit25
213
+ %136 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
214
+ %.not1.i45 = icmp eq i32 %136, 0, !dbg !18
215
+ %.01.i46 = select i1 %.not1.i45, float %134, float %133, !dbg !18
216
+ br label %__internal_fmad.exit.i28, !dbg !18
217
+
218
+ 137: ; preds = %__nv_erff.exit25
219
+ %138 = fmul float %34, %34, !dbg !18
220
+ br label %__internal_fmad.exit.i28, !dbg !18
221
+
222
+ __internal_fmad.exit.i28: ; preds = %137, %__nv_fabsf.exit1.i44
223
+ %139 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i44 ], [ 0x3FC06EBA60000000, %137 ], !dbg !18
224
+ %140 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i44 ], [ 0xBFD8127580000000, %137 ], !dbg !18
225
+ %141 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i44 ], [ 0x3FBCE315E0000000, %137 ], !dbg !18
226
+ %142 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i44 ], [ 0xBF9B837CE0000000, %137 ], !dbg !18
227
+ %143 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i44 ], [ 0x3F755ABD40000000, %137 ], !dbg !18
228
+ %144 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i44 ], [ 0xBF4AE9A400000000, %137 ], !dbg !18
229
+ %145 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i44 ], [ 0x3F163D2D40000000, %137 ], !dbg !18
230
+ %146 = phi float [ %.01.i46, %__nv_fabsf.exit1.i44 ], [ %138, %137 ], !dbg !18
231
+ %147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
232
+ %.not2.i29 = icmp eq i32 %147, 0, !dbg !18
233
+ %148 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %145, float %146, float %144) #4, !dbg !18
234
+ %149 = tail call float @llvm.nvvm.fma.rn.f(float %145, float %146, float %144) #4, !dbg !18
235
+ %.02.i30 = select i1 %.not2.i29, float %149, float %148, !dbg !18
236
+ %150 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
237
+ %.not3.i31 = icmp eq i32 %150, 0, !dbg !18
238
+ %151 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i30, float %146, float %143) #4, !dbg !18
239
+ %152 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i30, float %146, float %143) #4, !dbg !18
240
+ %.03.i32 = select i1 %.not3.i31, float %152, float %151, !dbg !18
241
+ %153 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
242
+ %.not4.i33 = icmp eq i32 %153, 0, !dbg !18
243
+ %154 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i32, float %146, float %142) #4, !dbg !18
244
+ %155 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i32, float %146, float %142) #4, !dbg !18
245
+ %.04.i34 = select i1 %.not4.i33, float %155, float %154, !dbg !18
246
+ %156 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
247
+ %.not5.i35 = icmp eq i32 %156, 0, !dbg !18
248
+ %157 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i34, float %146, float %141) #4, !dbg !18
249
+ %158 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i34, float %146, float %141) #4, !dbg !18
250
+ %.05.i36 = select i1 %.not5.i35, float %158, float %157, !dbg !18
251
+ %159 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
252
+ %.not6.i37 = icmp eq i32 %159, 0, !dbg !18
253
+ %160 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i36, float %146, float %140) #4, !dbg !18
254
+ %161 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i36, float %146, float %140) #4, !dbg !18
255
+ %.06.i38 = select i1 %.not6.i37, float %161, float %160, !dbg !18
256
+ %162 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
257
+ %.not7.i39 = icmp eq i32 %162, 0, !dbg !18
258
+ %163 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i38, float %146, float %139) #4, !dbg !18
259
+ %164 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i38, float %146, float %139) #4, !dbg !18
260
+ %.07.i40 = select i1 %.not7.i39, float %164, float %163, !dbg !18
261
+ %165 = fneg float %146, !dbg !18
262
+ %166 = select i1 %135, float %165, float %34, !dbg !18
263
+ %167 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
264
+ %.not8.i41 = icmp eq i32 %167, 0, !dbg !18
265
+ %168 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i40, float %166, float %166) #4, !dbg !18
266
+ %169 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i40, float %166, float %166) #4, !dbg !18
267
+ %.08.i42 = select i1 %.not8.i41, float %169, float %168, !dbg !18
268
+ br i1 %135, label %170, label %__nv_erff.exit47, !dbg !18
269
+
270
+ 170: ; preds = %__internal_fmad.exit.i28
271
+ %171 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i42) #4, !dbg !18
272
+ %172 = fsub float 1.000000e+00, %171, !dbg !18
273
+ %173 = bitcast float %172 to i32, !dbg !18
274
+ %174 = bitcast float %34 to i32, !dbg !18
275
+ %175 = and i32 %174, -2147483648, !dbg !18
276
+ %176 = or i32 %175, %173, !dbg !18
277
+ %177 = bitcast i32 %176 to float, !dbg !18
278
+ br label %__nv_erff.exit47, !dbg !18
279
+
280
+ __nv_erff.exit47: ; preds = %__internal_fmad.exit.i28, %170
281
+ %r.0.i43 = phi float [ %177, %170 ], [ %.08.i42, %__internal_fmad.exit.i28 ], !dbg !18
282
+ %178 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
283
+ %.not.i48 = icmp eq i32 %178, 0, !dbg !18
284
+ %179 = tail call float @llvm.nvvm.fabs.ftz.f(float %35) #4, !dbg !18
285
+ %180 = tail call float @llvm.nvvm.fabs.f(float %35) #4, !dbg !18
286
+ %.0.i49 = select i1 %.not.i48, float %180, float %179, !dbg !18
287
+ %181 = fcmp oge float %.0.i49, 0x3FF00C1FC0000000, !dbg !18
288
+ br i1 %181, label %__nv_fabsf.exit1.i66, label %183, !dbg !18
289
+
290
+ __nv_fabsf.exit1.i66: ; preds = %__nv_erff.exit47
291
+ %182 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
292
+ %.not1.i67 = icmp eq i32 %182, 0, !dbg !18
293
+ %.01.i68 = select i1 %.not1.i67, float %180, float %179, !dbg !18
294
+ br label %__internal_fmad.exit.i50, !dbg !18
295
+
296
+ 183: ; preds = %__nv_erff.exit47
297
+ %184 = fmul float %35, %35, !dbg !18
298
+ br label %__internal_fmad.exit.i50, !dbg !18
299
+
300
+ __internal_fmad.exit.i50: ; preds = %183, %__nv_fabsf.exit1.i66
301
+ %185 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i66 ], [ 0x3FC06EBA60000000, %183 ], !dbg !18
302
+ %186 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i66 ], [ 0xBFD8127580000000, %183 ], !dbg !18
303
+ %187 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i66 ], [ 0x3FBCE315E0000000, %183 ], !dbg !18
304
+ %188 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i66 ], [ 0xBF9B837CE0000000, %183 ], !dbg !18
305
+ %189 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i66 ], [ 0x3F755ABD40000000, %183 ], !dbg !18
306
+ %190 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i66 ], [ 0xBF4AE9A400000000, %183 ], !dbg !18
307
+ %191 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i66 ], [ 0x3F163D2D40000000, %183 ], !dbg !18
308
+ %192 = phi float [ %.01.i68, %__nv_fabsf.exit1.i66 ], [ %184, %183 ], !dbg !18
309
+ %193 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
310
+ %.not2.i51 = icmp eq i32 %193, 0, !dbg !18
311
+ %194 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %191, float %192, float %190) #4, !dbg !18
312
+ %195 = tail call float @llvm.nvvm.fma.rn.f(float %191, float %192, float %190) #4, !dbg !18
313
+ %.02.i52 = select i1 %.not2.i51, float %195, float %194, !dbg !18
314
+ %196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
315
+ %.not3.i53 = icmp eq i32 %196, 0, !dbg !18
316
+ %197 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i52, float %192, float %189) #4, !dbg !18
317
+ %198 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i52, float %192, float %189) #4, !dbg !18
318
+ %.03.i54 = select i1 %.not3.i53, float %198, float %197, !dbg !18
319
+ %199 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
320
+ %.not4.i55 = icmp eq i32 %199, 0, !dbg !18
321
+ %200 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i54, float %192, float %188) #4, !dbg !18
322
+ %201 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i54, float %192, float %188) #4, !dbg !18
323
+ %.04.i56 = select i1 %.not4.i55, float %201, float %200, !dbg !18
324
+ %202 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
325
+ %.not5.i57 = icmp eq i32 %202, 0, !dbg !18
326
+ %203 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i56, float %192, float %187) #4, !dbg !18
327
+ %204 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i56, float %192, float %187) #4, !dbg !18
328
+ %.05.i58 = select i1 %.not5.i57, float %204, float %203, !dbg !18
329
+ %205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
330
+ %.not6.i59 = icmp eq i32 %205, 0, !dbg !18
331
+ %206 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i58, float %192, float %186) #4, !dbg !18
332
+ %207 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i58, float %192, float %186) #4, !dbg !18
333
+ %.06.i60 = select i1 %.not6.i59, float %207, float %206, !dbg !18
334
+ %208 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
335
+ %.not7.i61 = icmp eq i32 %208, 0, !dbg !18
336
+ %209 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i60, float %192, float %185) #4, !dbg !18
337
+ %210 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i60, float %192, float %185) #4, !dbg !18
338
+ %.07.i62 = select i1 %.not7.i61, float %210, float %209, !dbg !18
339
+ %211 = fneg float %192, !dbg !18
340
+ %212 = select i1 %181, float %211, float %35, !dbg !18
341
+ %213 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
342
+ %.not8.i63 = icmp eq i32 %213, 0, !dbg !18
343
+ %214 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i62, float %212, float %212) #4, !dbg !18
344
+ %215 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i62, float %212, float %212) #4, !dbg !18
345
+ %.08.i64 = select i1 %.not8.i63, float %215, float %214, !dbg !18
346
+ br i1 %181, label %216, label %__nv_erff.exit69, !dbg !18
347
+
348
+ 216: ; preds = %__internal_fmad.exit.i50
349
+ %217 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i64) #4, !dbg !18
350
+ %218 = fsub float 1.000000e+00, %217, !dbg !18
351
+ %219 = bitcast float %218 to i32, !dbg !18
352
+ %220 = bitcast float %35 to i32, !dbg !18
353
+ %221 = and i32 %220, -2147483648, !dbg !18
354
+ %222 = or i32 %221, %219, !dbg !18
355
+ %223 = bitcast i32 %222 to float, !dbg !18
356
+ br label %__nv_erff.exit69, !dbg !18
357
+
358
+ __nv_erff.exit69: ; preds = %__internal_fmad.exit.i50, %216
359
+ %r.0.i65 = phi float [ %223, %216 ], [ %.08.i64, %__internal_fmad.exit.i50 ], !dbg !18
360
+ %224 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
361
+ %.not.i70 = icmp eq i32 %224, 0, !dbg !18
362
+ %225 = tail call float @llvm.nvvm.fabs.ftz.f(float %36) #4, !dbg !18
363
+ %226 = tail call float @llvm.nvvm.fabs.f(float %36) #4, !dbg !18
364
+ %.0.i71 = select i1 %.not.i70, float %226, float %225, !dbg !18
365
+ %227 = fcmp oge float %.0.i71, 0x3FF00C1FC0000000, !dbg !18
366
+ br i1 %227, label %__nv_fabsf.exit1.i88, label %229, !dbg !18
367
+
368
+ __nv_fabsf.exit1.i88: ; preds = %__nv_erff.exit69
369
+ %228 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
370
+ %.not1.i89 = icmp eq i32 %228, 0, !dbg !18
371
+ %.01.i90 = select i1 %.not1.i89, float %226, float %225, !dbg !18
372
+ br label %__internal_fmad.exit.i72, !dbg !18
373
+
374
+ 229: ; preds = %__nv_erff.exit69
375
+ %230 = fmul float %36, %36, !dbg !18
376
+ br label %__internal_fmad.exit.i72, !dbg !18
377
+
378
+ __internal_fmad.exit.i72: ; preds = %229, %__nv_fabsf.exit1.i88
379
+ %231 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i88 ], [ 0x3FC06EBA60000000, %229 ], !dbg !18
380
+ %232 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i88 ], [ 0xBFD8127580000000, %229 ], !dbg !18
381
+ %233 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i88 ], [ 0x3FBCE315E0000000, %229 ], !dbg !18
382
+ %234 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i88 ], [ 0xBF9B837CE0000000, %229 ], !dbg !18
383
+ %235 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i88 ], [ 0x3F755ABD40000000, %229 ], !dbg !18
384
+ %236 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i88 ], [ 0xBF4AE9A400000000, %229 ], !dbg !18
385
+ %237 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i88 ], [ 0x3F163D2D40000000, %229 ], !dbg !18
386
+ %238 = phi float [ %.01.i90, %__nv_fabsf.exit1.i88 ], [ %230, %229 ], !dbg !18
387
+ %239 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
388
+ %.not2.i73 = icmp eq i32 %239, 0, !dbg !18
389
+ %240 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %237, float %238, float %236) #4, !dbg !18
390
+ %241 = tail call float @llvm.nvvm.fma.rn.f(float %237, float %238, float %236) #4, !dbg !18
391
+ %.02.i74 = select i1 %.not2.i73, float %241, float %240, !dbg !18
392
+ %242 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
393
+ %.not3.i75 = icmp eq i32 %242, 0, !dbg !18
394
+ %243 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i74, float %238, float %235) #4, !dbg !18
395
+ %244 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i74, float %238, float %235) #4, !dbg !18
396
+ %.03.i76 = select i1 %.not3.i75, float %244, float %243, !dbg !18
397
+ %245 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
398
+ %.not4.i77 = icmp eq i32 %245, 0, !dbg !18
399
+ %246 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i76, float %238, float %234) #4, !dbg !18
400
+ %247 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i76, float %238, float %234) #4, !dbg !18
401
+ %.04.i78 = select i1 %.not4.i77, float %247, float %246, !dbg !18
402
+ %248 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
403
+ %.not5.i79 = icmp eq i32 %248, 0, !dbg !18
404
+ %249 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i78, float %238, float %233) #4, !dbg !18
405
+ %250 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i78, float %238, float %233) #4, !dbg !18
406
+ %.05.i80 = select i1 %.not5.i79, float %250, float %249, !dbg !18
407
+ %251 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
408
+ %.not6.i81 = icmp eq i32 %251, 0, !dbg !18
409
+ %252 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i80, float %238, float %232) #4, !dbg !18
410
+ %253 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i80, float %238, float %232) #4, !dbg !18
411
+ %.06.i82 = select i1 %.not6.i81, float %253, float %252, !dbg !18
412
+ %254 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
413
+ %.not7.i83 = icmp eq i32 %254, 0, !dbg !18
414
+ %255 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i82, float %238, float %231) #4, !dbg !18
415
+ %256 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i82, float %238, float %231) #4, !dbg !18
416
+ %.07.i84 = select i1 %.not7.i83, float %256, float %255, !dbg !18
417
+ %257 = fneg float %238, !dbg !18
418
+ %258 = select i1 %227, float %257, float %36, !dbg !18
419
+ %259 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
420
+ %.not8.i85 = icmp eq i32 %259, 0, !dbg !18
421
+ %260 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i84, float %258, float %258) #4, !dbg !18
422
+ %261 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i84, float %258, float %258) #4, !dbg !18
423
+ %.08.i86 = select i1 %.not8.i85, float %261, float %260, !dbg !18
424
+ br i1 %227, label %262, label %__nv_erff.exit91, !dbg !18
425
+
426
+ 262: ; preds = %__internal_fmad.exit.i72
427
+ %263 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i86) #4, !dbg !18
428
+ %264 = fsub float 1.000000e+00, %263, !dbg !18
429
+ %265 = bitcast float %264 to i32, !dbg !18
430
+ %266 = bitcast float %36 to i32, !dbg !18
431
+ %267 = and i32 %266, -2147483648, !dbg !18
432
+ %268 = or i32 %267, %265, !dbg !18
433
+ %269 = bitcast i32 %268 to float, !dbg !18
434
+ br label %__nv_erff.exit91, !dbg !18
435
+
436
+ __nv_erff.exit91: ; preds = %__internal_fmad.exit.i72, %262
437
+ %r.0.i87 = phi float [ %269, %262 ], [ %.08.i86, %__internal_fmad.exit.i72 ], !dbg !18
438
+ %270 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
439
+ %.not.i92 = icmp eq i32 %270, 0, !dbg !18
440
+ %271 = tail call float @llvm.nvvm.fabs.ftz.f(float %37) #4, !dbg !18
441
+ %272 = tail call float @llvm.nvvm.fabs.f(float %37) #4, !dbg !18
442
+ %.0.i93 = select i1 %.not.i92, float %272, float %271, !dbg !18
443
+ %273 = fcmp oge float %.0.i93, 0x3FF00C1FC0000000, !dbg !18
444
+ br i1 %273, label %__nv_fabsf.exit1.i110, label %275, !dbg !18
445
+
446
+ __nv_fabsf.exit1.i110: ; preds = %__nv_erff.exit91
447
+ %274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
448
+ %.not1.i111 = icmp eq i32 %274, 0, !dbg !18
449
+ %.01.i112 = select i1 %.not1.i111, float %272, float %271, !dbg !18
450
+ br label %__internal_fmad.exit.i94, !dbg !18
451
+
452
+ 275: ; preds = %__nv_erff.exit91
453
+ %276 = fmul float %37, %37, !dbg !18
454
+ br label %__internal_fmad.exit.i94, !dbg !18
455
+
456
+ __internal_fmad.exit.i94: ; preds = %275, %__nv_fabsf.exit1.i110
457
+ %277 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i110 ], [ 0x3FC06EBA60000000, %275 ], !dbg !18
458
+ %278 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i110 ], [ 0xBFD8127580000000, %275 ], !dbg !18
459
+ %279 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i110 ], [ 0x3FBCE315E0000000, %275 ], !dbg !18
460
+ %280 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i110 ], [ 0xBF9B837CE0000000, %275 ], !dbg !18
461
+ %281 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i110 ], [ 0x3F755ABD40000000, %275 ], !dbg !18
462
+ %282 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i110 ], [ 0xBF4AE9A400000000, %275 ], !dbg !18
463
+ %283 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i110 ], [ 0x3F163D2D40000000, %275 ], !dbg !18
464
+ %284 = phi float [ %.01.i112, %__nv_fabsf.exit1.i110 ], [ %276, %275 ], !dbg !18
465
+ %285 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
466
+ %.not2.i95 = icmp eq i32 %285, 0, !dbg !18
467
+ %286 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %283, float %284, float %282) #4, !dbg !18
468
+ %287 = tail call float @llvm.nvvm.fma.rn.f(float %283, float %284, float %282) #4, !dbg !18
469
+ %.02.i96 = select i1 %.not2.i95, float %287, float %286, !dbg !18
470
+ %288 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
471
+ %.not3.i97 = icmp eq i32 %288, 0, !dbg !18
472
+ %289 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i96, float %284, float %281) #4, !dbg !18
473
+ %290 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i96, float %284, float %281) #4, !dbg !18
474
+ %.03.i98 = select i1 %.not3.i97, float %290, float %289, !dbg !18
475
+ %291 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
476
+ %.not4.i99 = icmp eq i32 %291, 0, !dbg !18
477
+ %292 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i98, float %284, float %280) #4, !dbg !18
478
+ %293 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i98, float %284, float %280) #4, !dbg !18
479
+ %.04.i100 = select i1 %.not4.i99, float %293, float %292, !dbg !18
480
+ %294 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
481
+ %.not5.i101 = icmp eq i32 %294, 0, !dbg !18
482
+ %295 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i100, float %284, float %279) #4, !dbg !18
483
+ %296 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i100, float %284, float %279) #4, !dbg !18
484
+ %.05.i102 = select i1 %.not5.i101, float %296, float %295, !dbg !18
485
+ %297 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
486
+ %.not6.i103 = icmp eq i32 %297, 0, !dbg !18
487
+ %298 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i102, float %284, float %278) #4, !dbg !18
488
+ %299 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i102, float %284, float %278) #4, !dbg !18
489
+ %.06.i104 = select i1 %.not6.i103, float %299, float %298, !dbg !18
490
+ %300 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
491
+ %.not7.i105 = icmp eq i32 %300, 0, !dbg !18
492
+ %301 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i104, float %284, float %277) #4, !dbg !18
493
+ %302 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i104, float %284, float %277) #4, !dbg !18
494
+ %.07.i106 = select i1 %.not7.i105, float %302, float %301, !dbg !18
495
+ %303 = fneg float %284, !dbg !18
496
+ %304 = select i1 %273, float %303, float %37, !dbg !18
497
+ %305 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
498
+ %.not8.i107 = icmp eq i32 %305, 0, !dbg !18
499
+ %306 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i106, float %304, float %304) #4, !dbg !18
500
+ %307 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i106, float %304, float %304) #4, !dbg !18
501
+ %.08.i108 = select i1 %.not8.i107, float %307, float %306, !dbg !18
502
+ br i1 %273, label %308, label %__nv_erff.exit113, !dbg !18
503
+
504
+ 308: ; preds = %__internal_fmad.exit.i94
505
+ %309 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i108) #4, !dbg !18
506
+ %310 = fsub float 1.000000e+00, %309, !dbg !18
507
+ %311 = bitcast float %310 to i32, !dbg !18
508
+ %312 = bitcast float %37 to i32, !dbg !18
509
+ %313 = and i32 %312, -2147483648, !dbg !18
510
+ %314 = or i32 %313, %311, !dbg !18
511
+ %315 = bitcast i32 %314 to float, !dbg !18
512
+ br label %__nv_erff.exit113, !dbg !18
513
+
514
+ __nv_erff.exit113: ; preds = %__internal_fmad.exit.i94, %308
515
+ %r.0.i109 = phi float [ %315, %308 ], [ %.08.i108, %__internal_fmad.exit.i94 ], !dbg !18
516
+ %316 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
517
+ %.not.i114 = icmp eq i32 %316, 0, !dbg !18
518
+ %317 = tail call float @llvm.nvvm.fabs.ftz.f(float %38) #4, !dbg !18
519
+ %318 = tail call float @llvm.nvvm.fabs.f(float %38) #4, !dbg !18
520
+ %.0.i115 = select i1 %.not.i114, float %318, float %317, !dbg !18
521
+ %319 = fcmp oge float %.0.i115, 0x3FF00C1FC0000000, !dbg !18
522
+ br i1 %319, label %__nv_fabsf.exit1.i132, label %321, !dbg !18
523
+
524
+ __nv_fabsf.exit1.i132: ; preds = %__nv_erff.exit113
525
+ %320 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
526
+ %.not1.i133 = icmp eq i32 %320, 0, !dbg !18
527
+ %.01.i134 = select i1 %.not1.i133, float %318, float %317, !dbg !18
528
+ br label %__internal_fmad.exit.i116, !dbg !18
529
+
530
+ 321: ; preds = %__nv_erff.exit113
531
+ %322 = fmul float %38, %38, !dbg !18
532
+ br label %__internal_fmad.exit.i116, !dbg !18
533
+
534
+ __internal_fmad.exit.i116: ; preds = %321, %__nv_fabsf.exit1.i132
535
+ %323 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i132 ], [ 0x3FC06EBA60000000, %321 ], !dbg !18
536
+ %324 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i132 ], [ 0xBFD8127580000000, %321 ], !dbg !18
537
+ %325 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i132 ], [ 0x3FBCE315E0000000, %321 ], !dbg !18
538
+ %326 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i132 ], [ 0xBF9B837CE0000000, %321 ], !dbg !18
539
+ %327 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i132 ], [ 0x3F755ABD40000000, %321 ], !dbg !18
540
+ %328 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i132 ], [ 0xBF4AE9A400000000, %321 ], !dbg !18
541
+ %329 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i132 ], [ 0x3F163D2D40000000, %321 ], !dbg !18
542
+ %330 = phi float [ %.01.i134, %__nv_fabsf.exit1.i132 ], [ %322, %321 ], !dbg !18
543
+ %331 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
544
+ %.not2.i117 = icmp eq i32 %331, 0, !dbg !18
545
+ %332 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %329, float %330, float %328) #4, !dbg !18
546
+ %333 = tail call float @llvm.nvvm.fma.rn.f(float %329, float %330, float %328) #4, !dbg !18
547
+ %.02.i118 = select i1 %.not2.i117, float %333, float %332, !dbg !18
548
+ %334 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
549
+ %.not3.i119 = icmp eq i32 %334, 0, !dbg !18
550
+ %335 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i118, float %330, float %327) #4, !dbg !18
551
+ %336 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i118, float %330, float %327) #4, !dbg !18
552
+ %.03.i120 = select i1 %.not3.i119, float %336, float %335, !dbg !18
553
+ %337 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
554
+ %.not4.i121 = icmp eq i32 %337, 0, !dbg !18
555
+ %338 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i120, float %330, float %326) #4, !dbg !18
556
+ %339 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i120, float %330, float %326) #4, !dbg !18
557
+ %.04.i122 = select i1 %.not4.i121, float %339, float %338, !dbg !18
558
+ %340 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
559
+ %.not5.i123 = icmp eq i32 %340, 0, !dbg !18
560
+ %341 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i122, float %330, float %325) #4, !dbg !18
561
+ %342 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i122, float %330, float %325) #4, !dbg !18
562
+ %.05.i124 = select i1 %.not5.i123, float %342, float %341, !dbg !18
563
+ %343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
564
+ %.not6.i125 = icmp eq i32 %343, 0, !dbg !18
565
+ %344 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i124, float %330, float %324) #4, !dbg !18
566
+ %345 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i124, float %330, float %324) #4, !dbg !18
567
+ %.06.i126 = select i1 %.not6.i125, float %345, float %344, !dbg !18
568
+ %346 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
569
+ %.not7.i127 = icmp eq i32 %346, 0, !dbg !18
570
+ %347 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i126, float %330, float %323) #4, !dbg !18
571
+ %348 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i126, float %330, float %323) #4, !dbg !18
572
+ %.07.i128 = select i1 %.not7.i127, float %348, float %347, !dbg !18
573
+ %349 = fneg float %330, !dbg !18
574
+ %350 = select i1 %319, float %349, float %38, !dbg !18
575
+ %351 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
576
+ %.not8.i129 = icmp eq i32 %351, 0, !dbg !18
577
+ %352 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i128, float %350, float %350) #4, !dbg !18
578
+ %353 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i128, float %350, float %350) #4, !dbg !18
579
+ %.08.i130 = select i1 %.not8.i129, float %353, float %352, !dbg !18
580
+ br i1 %319, label %354, label %__nv_erff.exit135, !dbg !18
581
+
582
+ 354: ; preds = %__internal_fmad.exit.i116
583
+ %355 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i130) #4, !dbg !18
584
+ %356 = fsub float 1.000000e+00, %355, !dbg !18
585
+ %357 = bitcast float %356 to i32, !dbg !18
586
+ %358 = bitcast float %38 to i32, !dbg !18
587
+ %359 = and i32 %358, -2147483648, !dbg !18
588
+ %360 = or i32 %359, %357, !dbg !18
589
+ %361 = bitcast i32 %360 to float, !dbg !18
590
+ br label %__nv_erff.exit135, !dbg !18
591
+
592
+ __nv_erff.exit135: ; preds = %__internal_fmad.exit.i116, %354
593
+ %r.0.i131 = phi float [ %361, %354 ], [ %.08.i130, %__internal_fmad.exit.i116 ], !dbg !18
594
+ %362 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
595
+ %.not.i136 = icmp eq i32 %362, 0, !dbg !18
596
+ %363 = tail call float @llvm.nvvm.fabs.ftz.f(float %39) #4, !dbg !18
597
+ %364 = tail call float @llvm.nvvm.fabs.f(float %39) #4, !dbg !18
598
+ %.0.i137 = select i1 %.not.i136, float %364, float %363, !dbg !18
599
+ %365 = fcmp oge float %.0.i137, 0x3FF00C1FC0000000, !dbg !18
600
+ br i1 %365, label %__nv_fabsf.exit1.i154, label %367, !dbg !18
601
+
602
+ __nv_fabsf.exit1.i154: ; preds = %__nv_erff.exit135
603
+ %366 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
604
+ %.not1.i155 = icmp eq i32 %366, 0, !dbg !18
605
+ %.01.i156 = select i1 %.not1.i155, float %364, float %363, !dbg !18
606
+ br label %__internal_fmad.exit.i138, !dbg !18
607
+
608
+ 367: ; preds = %__nv_erff.exit135
609
+ %368 = fmul float %39, %39, !dbg !18
610
+ br label %__internal_fmad.exit.i138, !dbg !18
611
+
612
+ __internal_fmad.exit.i138: ; preds = %367, %__nv_fabsf.exit1.i154
613
+ %369 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i154 ], [ 0x3FC06EBA60000000, %367 ], !dbg !18
614
+ %370 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i154 ], [ 0xBFD8127580000000, %367 ], !dbg !18
615
+ %371 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i154 ], [ 0x3FBCE315E0000000, %367 ], !dbg !18
616
+ %372 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i154 ], [ 0xBF9B837CE0000000, %367 ], !dbg !18
617
+ %373 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i154 ], [ 0x3F755ABD40000000, %367 ], !dbg !18
618
+ %374 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i154 ], [ 0xBF4AE9A400000000, %367 ], !dbg !18
619
+ %375 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i154 ], [ 0x3F163D2D40000000, %367 ], !dbg !18
620
+ %376 = phi float [ %.01.i156, %__nv_fabsf.exit1.i154 ], [ %368, %367 ], !dbg !18
621
+ %377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
622
+ %.not2.i139 = icmp eq i32 %377, 0, !dbg !18
623
+ %378 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %375, float %376, float %374) #4, !dbg !18
624
+ %379 = tail call float @llvm.nvvm.fma.rn.f(float %375, float %376, float %374) #4, !dbg !18
625
+ %.02.i140 = select i1 %.not2.i139, float %379, float %378, !dbg !18
626
+ %380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
627
+ %.not3.i141 = icmp eq i32 %380, 0, !dbg !18
628
+ %381 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i140, float %376, float %373) #4, !dbg !18
629
+ %382 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i140, float %376, float %373) #4, !dbg !18
630
+ %.03.i142 = select i1 %.not3.i141, float %382, float %381, !dbg !18
631
+ %383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
632
+ %.not4.i143 = icmp eq i32 %383, 0, !dbg !18
633
+ %384 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i142, float %376, float %372) #4, !dbg !18
634
+ %385 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i142, float %376, float %372) #4, !dbg !18
635
+ %.04.i144 = select i1 %.not4.i143, float %385, float %384, !dbg !18
636
+ %386 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
637
+ %.not5.i145 = icmp eq i32 %386, 0, !dbg !18
638
+ %387 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i144, float %376, float %371) #4, !dbg !18
639
+ %388 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i144, float %376, float %371) #4, !dbg !18
640
+ %.05.i146 = select i1 %.not5.i145, float %388, float %387, !dbg !18
641
+ %389 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
642
+ %.not6.i147 = icmp eq i32 %389, 0, !dbg !18
643
+ %390 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i146, float %376, float %370) #4, !dbg !18
644
+ %391 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i146, float %376, float %370) #4, !dbg !18
645
+ %.06.i148 = select i1 %.not6.i147, float %391, float %390, !dbg !18
646
+ %392 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
647
+ %.not7.i149 = icmp eq i32 %392, 0, !dbg !18
648
+ %393 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i148, float %376, float %369) #4, !dbg !18
649
+ %394 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i148, float %376, float %369) #4, !dbg !18
650
+ %.07.i150 = select i1 %.not7.i149, float %394, float %393, !dbg !18
651
+ %395 = fneg float %376, !dbg !18
652
+ %396 = select i1 %365, float %395, float %39, !dbg !18
653
+ %397 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
654
+ %.not8.i151 = icmp eq i32 %397, 0, !dbg !18
655
+ %398 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i150, float %396, float %396) #4, !dbg !18
656
+ %399 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i150, float %396, float %396) #4, !dbg !18
657
+ %.08.i152 = select i1 %.not8.i151, float %399, float %398, !dbg !18
658
+ br i1 %365, label %400, label %__nv_erff.exit157, !dbg !18
659
+
660
+ 400: ; preds = %__internal_fmad.exit.i138
661
+ %401 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i152) #4, !dbg !18
662
+ %402 = fsub float 1.000000e+00, %401, !dbg !18
663
+ %403 = bitcast float %402 to i32, !dbg !18
664
+ %404 = bitcast float %39 to i32, !dbg !18
665
+ %405 = and i32 %404, -2147483648, !dbg !18
666
+ %406 = or i32 %405, %403, !dbg !18
667
+ %407 = bitcast i32 %406 to float, !dbg !18
668
+ br label %__nv_erff.exit157, !dbg !18
669
+
670
+ __nv_erff.exit157: ; preds = %__internal_fmad.exit.i138, %400
671
+ %r.0.i153 = phi float [ %407, %400 ], [ %.08.i152, %__internal_fmad.exit.i138 ], !dbg !18
672
+ %408 = fmul float %31, 5.000000e-01, !dbg !19
673
+ %409 = fmul float %30, 5.000000e-01, !dbg !19
674
+ %410 = fmul float %29, 5.000000e-01, !dbg !19
675
+ %411 = fmul float %28, 5.000000e-01, !dbg !19
676
+ %412 = fmul float %27, 5.000000e-01, !dbg !19
677
+ %413 = fmul float %26, 5.000000e-01, !dbg !19
678
+ %414 = fmul float %25, 5.000000e-01, !dbg !19
679
+ %415 = fmul float %24, 5.000000e-01, !dbg !19
680
+ %416 = fadd float %r.0.i, 1.000000e+00, !dbg !20
681
+ %417 = fadd float %r.0.i21, 1.000000e+00, !dbg !20
682
+ %418 = fadd float %r.0.i43, 1.000000e+00, !dbg !20
683
+ %419 = fadd float %r.0.i65, 1.000000e+00, !dbg !20
684
+ %420 = fadd float %r.0.i87, 1.000000e+00, !dbg !20
685
+ %421 = fadd float %r.0.i109, 1.000000e+00, !dbg !20
686
+ %422 = fadd float %r.0.i131, 1.000000e+00, !dbg !20
687
+ %423 = fadd float %r.0.i153, 1.000000e+00, !dbg !20
688
+ %424 = fmul float %415, %416, !dbg !21
689
+ %425 = fmul float %414, %417, !dbg !21
690
+ %426 = fmul float %413, %418, !dbg !21
691
+ %427 = fmul float %412, %419, !dbg !21
692
+ %428 = fmul float %411, %420, !dbg !21
693
+ %429 = fmul float %410, %421, !dbg !21
694
+ %430 = fmul float %409, %422, !dbg !21
695
+ %431 = fmul float %408, %423, !dbg !21
696
+ %432 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %424) #4, !dbg !22
697
+ %433 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %425) #4, !dbg !22
698
+ %434 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %426) #4, !dbg !22
699
+ %435 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %427) #4, !dbg !22
700
+ %436 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %428) #4, !dbg !22
701
+ %437 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %429) #4, !dbg !22
702
+ %438 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %430) #4, !dbg !22
703
+ %439 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %431) #4, !dbg !22
704
+ %440 = insertelement <2 x i16> undef, i16 %432, i64 0, !dbg !22
705
+ %441 = insertelement <2 x i16> %440, i16 %433, i64 1, !dbg !22
706
+ %442 = bitcast <2 x i16> %441 to i32, !dbg !22
707
+ %443 = insertelement <2 x i16> undef, i16 %434, i64 0, !dbg !22
708
+ %444 = insertelement <2 x i16> %443, i16 %435, i64 1, !dbg !22
709
+ %445 = bitcast <2 x i16> %444 to i32, !dbg !22
710
+ %446 = insertelement <2 x i16> undef, i16 %436, i64 0, !dbg !22
711
+ %447 = insertelement <2 x i16> %446, i16 %437, i64 1, !dbg !22
712
+ %448 = bitcast <2 x i16> %447 to i32, !dbg !22
713
+ %449 = insertelement <2 x i16> undef, i16 %438, i64 0, !dbg !22
714
+ %450 = insertelement <2 x i16> %449, i16 %439, i64 1, !dbg !22
715
+ %451 = bitcast <2 x i16> %450 to i32, !dbg !22
716
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %442, i32 %445, i32 %448, i32 %451, ptr addrspace(1) %10, i1 true) #4, !dbg !22
717
+ ret void, !dbg !23
718
+ }
719
+
720
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
721
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
722
+
723
+ ; Function Attrs: alwaysinline nounwind
724
+ define float @__nv_erff(float %a) local_unnamed_addr #1 { ; erf-style helper (name suggests libdevice erff -- confirm): small |a| returns a + a*P(a*a); otherwise returns 1 - 2^(-|a|*(1 + P(|a|))) with a's sign bit OR-ed in; P uses branch-specific coefficients
725
+ __nv_fabsf.exit: ; entry block: compute |a| and choose the approximation range
726
+ %0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 ; reflect query; @.str is defined outside this view (presumably the FTZ key -- confirm)
727
+ %.not = icmp eq i32 %0, 0 ; true when the reflect query returned 0
728
+ %1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4 ; |a| via the ftz intrinsic
729
+ %2 = tail call float @llvm.nvvm.fabs.f(float %a) #4 ; |a| via the non-ftz intrinsic
730
+ %.0 = select i1 %.not, float %2, float %1 ; non-ftz value when reflect == 0, otherwise the ftz value
731
+ %3 = fcmp oge float %.0, 0x3FF00C1FC0000000 ; %3 = |a| >= threshold (roughly 1.003); ordered compare, so a NaN input yields false
732
+ br i1 %3, label %__nv_fabsf.exit1, label %5 ; true: large-|a| path, false: small-|a| path
733
+
734
+ __nv_fabsf.exit1: ; preds = %__nv_fabsf.exit -- large-|a| path: the polynomial variable is |a|
735
+ %4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 ; same reflect query as %0
736
+ %.not1 = icmp eq i32 %4, 0
737
+ %.01 = select i1 %.not1, float %2, float %1 ; |a| again, reusing %1/%2 with the same ftz/non-ftz selection
738
+ br label %__internal_fmad.exit
739
+
740
+ 5: ; preds = %__nv_fabsf.exit -- small-|a| path: the polynomial variable is a*a
741
+ %6 = fmul float %a, %a
742
+ br label %__internal_fmad.exit
743
+
744
+ __internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1 -- shared Horner evaluation; each phi lists the large-path value first, the small-path value second
745
+ %7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ] ; coefficient added in the last Horner step
746
+ %8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ] ; coefficient for Horner step 5
747
+ %9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ] ; coefficient for Horner step 4
748
+ %10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ] ; coefficient for Horner step 3
749
+ %11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ] ; coefficient for Horner step 2
750
+ %12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ] ; coefficient for Horner step 1
751
+ %13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ] ; leading coefficient (multiplied first)
752
+ %14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ] ; polynomial variable x: |a| on the large path, a*a on the small path
753
+ %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 ; every fma step below repeats the reflect query, emits both fma variants, and selects one
754
+ %.not2 = icmp eq i32 %15, 0
755
+ %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
756
+ %17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
757
+ %.02 = select i1 %.not2, float %17, float %16 ; step 1: %13*x + %12
758
+ %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
759
+ %.not3 = icmp eq i32 %18, 0
760
+ %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
761
+ %20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
762
+ %.03 = select i1 %.not3, float %20, float %19 ; step 2: previous*x + %11
763
+ %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
764
+ %.not4 = icmp eq i32 %21, 0
765
+ %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
766
+ %23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
767
+ %.04 = select i1 %.not4, float %23, float %22 ; step 3: previous*x + %10
768
+ %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
769
+ %.not5 = icmp eq i32 %24, 0
770
+ %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
771
+ %26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
772
+ %.05 = select i1 %.not5, float %26, float %25 ; step 4: previous*x + %9
773
+ %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
774
+ %.not6 = icmp eq i32 %27, 0
775
+ %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
776
+ %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
777
+ %.06 = select i1 %.not6, float %29, float %28 ; step 5: previous*x + %8
778
+ %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
779
+ %.not7 = icmp eq i32 %30, 0
780
+ %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
781
+ %32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
782
+ %.07 = select i1 %.not7, float %32, float %31 ; step 6: previous*x + %7; this is the polynomial value P
783
+ %33 = fneg float %14 ; -x; only selected on the large path, where x = |a|
784
+ %34 = select i1 %3, float %33, float %a ; t = -|a| on the large path, a on the small path
785
+ %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
786
+ %.not8 = icmp eq i32 %35, 0
787
+ %36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
788
+ %37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
789
+ %.08 = select i1 %.not8, float %37, float %36 ; P*t + t: the final result on the small path, the exponent on the large path
790
+ br i1 %3, label %38, label %46 ; only the large path needs the exponential fix-up in block 38
791
+
792
+ 38: ; preds = %__internal_fmad.exit -- large-|a| fix-up
793
+ %39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4 ; approximate base-2 exponential of %.08
794
+ %40 = fsub float 1.000000e+00, %39 ; 1 - 2^(%.08)
795
+ %41 = bitcast float %40 to i32
796
+ %42 = bitcast float %a to i32
797
+ %43 = and i32 %42, -2147483648 ; -2147483648 is 0x80000000: keep only the sign bit of a
798
+ %44 = or i32 %43, %41 ; OR a's sign bit into the magnitude computed above
799
+ %45 = bitcast i32 %44 to float
800
+ br label %46
801
+
802
+ 46: ; preds = %38, %__internal_fmad.exit -- merge point for both paths
803
+ %r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ] ; signed large-path value, or the small-path polynomial result
804
+ ret float %r.0
805
+ }
806
+
807
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2 ; external query function: takes a pointer (callers in this module pass @.str) and returns an i32 that callers compare against 0
808
+
809
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
810
+ declare float @llvm.nvvm.fabs.ftz.f(float) #0 ; absolute value, ftz variant
811
+
812
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
813
+ declare float @llvm.nvvm.fabs.f(float) #0 ; absolute value, non-ftz variant
814
+
815
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
816
+ declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0 ; fused multiply-add, ftz variant
817
+
818
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
819
+ declare float @llvm.nvvm.fma.rn.f(float, float, float) #0 ; fused multiply-add, non-ftz variant
820
+
821
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
822
+ declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3 ; approximate base-2 exponential; uses group #3 rather than #0
823
+
824
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; used by the fabs/fma intrinsic declarations and by the tid.x declaration
825
+ attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } ; used by the @__nv_erff definition; alwaysinline requests inlining at every call site
826
+ attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } ; used by the @__nvvm_reflect declaration; same string attributes as #1 without alwaysinline/nounwind
827
+ attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } ; identical to #0 except that speculatable is absent
828
+ attributes #4 = { nounwind } ; call-site attribute group used on the calls and inline asm in this module
829
+
830
+ !llvm.module.flags = !{!0, !1} ; module-level flags, defined in !0 and !1
831
+ !llvm.dbg.cu = !{!2} ; the single debug-info compile unit
832
+ !nvvm.annotations = !{!4, !5, !5, !4} ; NVVM annotations for the kernel; !4 and !5 are each listed twice
833
+ !llvm.ident = !{!6}
834
+
835
+ !0 = !{i32 2, !"Debug Info Version", i32 3} ; merge behavior 2 (warning, per LLVM LangRef); debug-info version 3
836
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} ; merge behavior 4 (override); sets the nvvm-reflect-ftz flag to 1
837
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) ; compile unit: producer "triton", source file !3
838
+ !3 = !DIFile(filename: "ckphrtdpgsxl7sfarkkzylhv4st3uhmzvg3u6z5excfp6ydybq74.py", directory: "/tmp/torchinductor_root/kp") ; source file that the !DILocation line/column pairs below refer to
839
+ !4 = !{ptr @triton__0d1de, !"kernel", i32 1} ; marks @triton__0d1de as a kernel entry point
840
+ !5 = !{ptr @triton__0d1de, !"maxntidx", i32 128} ; maxntidx = 128 for @triton__0d1de (NVVM: upper bound on block size in x)
841
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} ; NOTE(review): presumably inherited from a linked bitcode library rather than produced by triton -- confirm
842
+ !7 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) ; debug scope for the kernel, declared at line 18 of !3
843
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9) ; subroutine type whose type list is !9
844
+ !9 = !{} ; empty list: no parameter/return types are described
845
+ !10 = !DILocation(line: 21, column: 36, scope: !7)
846
+ !11 = !DILocation(line: 20, column: 28, scope: !7)
847
+ !12 = !DILocation(line: 20, column: 33, scope: !7)
848
+ !13 = !DILocation(line: 21, column: 23, scope: !7)
849
+ !14 = !DILocation(line: 24, column: 34, scope: !7)
850
+ !15 = !DILocation(line: 24, column: 39, scope: !7)
851
+ !16 = !DILocation(line: 24, column: 48, scope: !7)
852
+ !17 = !DILocation(line: 29, column: 18, scope: !7)
853
+ !18 = !DILocation(line: 30, column: 23, scope: !7) ; attached to the inlined __nv_erff code in the kernel
854
+ !19 = !DILocation(line: 27, column: 18, scope: !7) ; attached to the multiplications by 0.5 in the kernel
855
+ !20 = !DILocation(line: 32, column: 18, scope: !7) ; attached to the additions of 1.0 to the erf results
856
+ !21 = !DILocation(line: 33, column: 18, scope: !7) ; attached to the final products of the two terms above
857
+ !22 = !DILocation(line: 35, column: 40, scope: !7) ; attached to the bf16 conversions, packing and the vector store
858
+ !23 = !DILocation(line: 35, column: 4, scope: !7) ; attached to the kernel's ret
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.llir ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
7
+ %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %7 = and i32 %6, 31, !dbg !8
9
+ %8 = lshr i32 %6, 5, !dbg !8
10
+ %9 = and i32 %6, 7, !dbg !8
11
+ %10 = shl nuw nsw i32 %9, 2, !dbg !8
12
+ %11 = and i32 %8, 7, !dbg !9
13
+ %12 = lshr i32 %7, 3, !dbg !9
14
+ %13 = shl nuw nsw i32 %11, 2, !dbg !9
15
+ %14 = or i32 %13, %12, !dbg !9
16
+ %15 = or i32 %14, 96, !dbg !9
17
+ %16 = or i32 %10, 1, !dbg !10
18
+ %17 = or i32 %10, 2, !dbg !10
19
+ %18 = or i32 %10, 3, !dbg !10
20
+ %19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !14
21
+ %20 = shl i32 %19, 5, !dbg !15
22
+ %21 = or i32 %20, %10, !dbg !16
23
+ %22 = or i32 %20, %7, !dbg !16
24
+ %23 = icmp ult i32 %15, 120, !dbg !17
25
+ %24 = shl nuw nsw i32 %14, 17, !dbg !18
26
+ %25 = or i32 %24, 4194304, !dbg !18
27
+ %26 = or i32 %24, 8388608, !dbg !18
28
+ %27 = shl nuw nsw i32 %15, 17, !dbg !18
29
+ %28 = add i32 %21, %24, !dbg !19
30
+ %29 = add i32 %25, %21, !dbg !19
31
+ %30 = add i32 %26, %21, !dbg !19
32
+ %31 = add i32 %21, %27, !dbg !19
33
+ %32 = sext i32 %28 to i64, !dbg !20
34
+ %33 = getelementptr float, ptr addrspace(1) %0, i64 %32, !dbg !20
35
+ %34 = sext i32 %29 to i64, !dbg !20
36
+ %35 = getelementptr float, ptr addrspace(1) %0, i64 %34, !dbg !20
37
+ %36 = sext i32 %30 to i64, !dbg !20
38
+ %37 = getelementptr float, ptr addrspace(1) %0, i64 %36, !dbg !20
39
+ %38 = sext i32 %31 to i64, !dbg !20
40
+ %39 = getelementptr float, ptr addrspace(1) %0, i64 %38, !dbg !20
41
+ %40 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %33, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
42
+ %41 = extractvalue { i32, i32, i32, i32 } %40, 0, !dbg !21
43
+ %42 = extractvalue { i32, i32, i32, i32 } %40, 1, !dbg !21
44
+ %43 = extractvalue { i32, i32, i32, i32 } %40, 2, !dbg !21
45
+ %44 = extractvalue { i32, i32, i32, i32 } %40, 3, !dbg !21
46
+ %45 = bitcast i32 %41 to float, !dbg !21
47
+ %46 = bitcast i32 %42 to float, !dbg !21
48
+ %47 = bitcast i32 %43 to float, !dbg !21
49
+ %48 = bitcast i32 %44 to float, !dbg !21
50
+ %49 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
51
+ %50 = extractvalue { i32, i32, i32, i32 } %49, 0, !dbg !21
52
+ %51 = extractvalue { i32, i32, i32, i32 } %49, 1, !dbg !21
53
+ %52 = extractvalue { i32, i32, i32, i32 } %49, 2, !dbg !21
54
+ %53 = extractvalue { i32, i32, i32, i32 } %49, 3, !dbg !21
55
+ %54 = bitcast i32 %50 to float, !dbg !21
56
+ %55 = bitcast i32 %51 to float, !dbg !21
57
+ %56 = bitcast i32 %52 to float, !dbg !21
58
+ %57 = bitcast i32 %53 to float, !dbg !21
59
+ %58 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %37, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21
60
+ %59 = extractvalue { i32, i32, i32, i32 } %58, 0, !dbg !21
61
+ %60 = extractvalue { i32, i32, i32, i32 } %58, 1, !dbg !21
62
+ %61 = extractvalue { i32, i32, i32, i32 } %58, 2, !dbg !21
63
+ %62 = extractvalue { i32, i32, i32, i32 } %58, 3, !dbg !21
64
+ %63 = bitcast i32 %59 to float, !dbg !21
65
+ %64 = bitcast i32 %60 to float, !dbg !21
66
+ %65 = bitcast i32 %61 to float, !dbg !21
67
+ %66 = bitcast i32 %62 to float, !dbg !21
68
+ %67 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %39, i1 %23, i32 0, i1 %23, i32 0, i1 %23, i32 0, i1 %23, i32 0, i1 %23) #3, !dbg !21
69
+ %68 = extractvalue { i32, i32, i32, i32 } %67, 0, !dbg !21
70
+ %69 = extractvalue { i32, i32, i32, i32 } %67, 1, !dbg !21
71
+ %70 = extractvalue { i32, i32, i32, i32 } %67, 2, !dbg !21
72
+ %71 = extractvalue { i32, i32, i32, i32 } %67, 3, !dbg !21
73
+ %72 = bitcast i32 %68 to float, !dbg !21
74
+ %73 = bitcast i32 %69 to float, !dbg !21
75
+ %74 = bitcast i32 %70 to float, !dbg !21
76
+ %75 = bitcast i32 %71 to float, !dbg !21
77
+ %76 = fadd float %45, 0.000000e+00, !dbg !22
78
+ %77 = fadd float %46, 0.000000e+00, !dbg !22
79
+ %78 = fadd float %47, 0.000000e+00, !dbg !22
80
+ %79 = fadd float %48, 0.000000e+00, !dbg !22
81
+ %80 = fadd float %54, 0.000000e+00, !dbg !22
82
+ %81 = fadd float %55, 0.000000e+00, !dbg !22
83
+ %82 = fadd float %56, 0.000000e+00, !dbg !22
84
+ %83 = fadd float %57, 0.000000e+00, !dbg !22
85
+ %84 = fadd float %63, 0.000000e+00, !dbg !22
86
+ %85 = fadd float %64, 0.000000e+00, !dbg !22
87
+ %86 = fadd float %65, 0.000000e+00, !dbg !22
88
+ %87 = fadd float %66, 0.000000e+00, !dbg !22
89
+ %88 = fadd float %72, 0.000000e+00, !dbg !22
90
+ %89 = fadd float %73, 0.000000e+00, !dbg !22
91
+ %90 = fadd float %74, 0.000000e+00, !dbg !22
92
+ %91 = fadd float %75, 0.000000e+00, !dbg !22
93
+ %92 = select i1 %23, float %88, float 0.000000e+00, !dbg !23
94
+ %93 = select i1 %23, float %89, float 0.000000e+00, !dbg !23
95
+ %94 = select i1 %23, float %90, float 0.000000e+00, !dbg !23
96
+ %95 = select i1 %23, float %91, float 0.000000e+00, !dbg !23
97
+ %96 = fadd float %76, %80, !dbg !24
98
+ %97 = fadd float %77, %81, !dbg !24
99
+ %98 = fadd float %78, %82, !dbg !24
100
+ %99 = fadd float %79, %83, !dbg !24
101
+ %100 = fadd float %96, %84, !dbg !24
102
+ %101 = fadd float %97, %85, !dbg !24
103
+ %102 = fadd float %98, %86, !dbg !24
104
+ %103 = fadd float %99, %87, !dbg !24
105
+ %104 = fadd float %100, %92, !dbg !24
106
+ %105 = fadd float %101, %93, !dbg !24
107
+ %106 = fadd float %102, %94, !dbg !24
108
+ %107 = fadd float %103, %95, !dbg !24
109
+ %108 = bitcast float %104 to i32, !dbg !10
110
+ %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 16, i32 31), !dbg !10
111
+ %110 = bitcast i32 %109 to float, !dbg !10
112
+ %111 = fadd float %104, %110, !dbg !24
113
+ %112 = bitcast float %111 to i32, !dbg !10
114
+ %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 8, i32 31), !dbg !10
115
+ %114 = bitcast i32 %113 to float, !dbg !10
116
+ %115 = fadd float %111, %114, !dbg !24
117
+ %116 = bitcast float %105 to i32, !dbg !10
118
+ %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 16, i32 31), !dbg !10
119
+ %118 = bitcast i32 %117 to float, !dbg !10
120
+ %119 = fadd float %105, %118, !dbg !24
121
+ %120 = bitcast float %119 to i32, !dbg !10
122
+ %121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 8, i32 31), !dbg !10
123
+ %122 = bitcast i32 %121 to float, !dbg !10
124
+ %123 = fadd float %119, %122, !dbg !24
125
+ %124 = bitcast float %106 to i32, !dbg !10
126
+ %125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 16, i32 31), !dbg !10
127
+ %126 = bitcast i32 %125 to float, !dbg !10
128
+ %127 = fadd float %106, %126, !dbg !24
129
+ %128 = bitcast float %127 to i32, !dbg !10
130
+ %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 8, i32 31), !dbg !10
131
+ %130 = bitcast i32 %129 to float, !dbg !10
132
+ %131 = fadd float %127, %130, !dbg !24
133
+ %132 = bitcast float %107 to i32, !dbg !10
134
+ %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !10
135
+ %134 = bitcast i32 %133 to float, !dbg !10
136
+ %135 = fadd float %107, %134, !dbg !24
137
+ %136 = bitcast float %135 to i32, !dbg !10
138
+ %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !10
139
+ %138 = bitcast i32 %137 to float, !dbg !10
140
+ %139 = fadd float %135, %138, !dbg !24
141
+ %140 = icmp ult i32 %7, 8, !dbg !10
142
+ %141 = shl nuw nsw i32 %9, 5, !dbg !10
143
+ %142 = or i32 %141, %11, !dbg !10
144
+ %143 = zext nneg i32 %142 to i64, !dbg !10
145
+ %144 = getelementptr float, ptr addrspace(3) @global_smem, i64 %143, !dbg !10
146
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %144, float %115, i1 %140) #3, !dbg !10
147
+ %145 = shl nuw nsw i32 %16, 3, !dbg !10
148
+ %146 = or i32 %145, %11, !dbg !10
149
+ %147 = zext nneg i32 %146 to i64, !dbg !10
150
+ %148 = getelementptr float, ptr addrspace(3) @global_smem, i64 %147, !dbg !10
151
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %148, float %123, i1 %140) #3, !dbg !10
152
+ %149 = shl nuw nsw i32 %17, 3, !dbg !10
153
+ %150 = or i32 %149, %11, !dbg !10
154
+ %151 = zext nneg i32 %150 to i64, !dbg !10
155
+ %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !10
156
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %152, float %131, i1 %140) #3, !dbg !10
157
+ %153 = shl nuw nsw i32 %18, 3, !dbg !10
158
+ %154 = or i32 %153, %11, !dbg !10
159
+ %155 = zext nneg i32 %154 to i64, !dbg !10
160
+ %156 = getelementptr float, ptr addrspace(3) @global_smem, i64 %155, !dbg !10
161
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %156, float %139, i1 %140) #3, !dbg !10
162
+ tail call void @llvm.nvvm.barrier0(), !dbg !10
163
+ %157 = icmp slt i32 %6, 256, !dbg !10
164
+ %158 = sext i32 %6 to i64, !dbg !10
165
+ %159 = getelementptr float, ptr addrspace(3) @global_smem, i64 %158, !dbg !10
166
+ %160 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %159, i1 %157) #3, !dbg !10
167
+ %161 = bitcast float %160 to i32, !dbg !10
168
+ %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 4, i32 31), !dbg !10
169
+ %163 = bitcast i32 %162 to float, !dbg !10
170
+ %164 = fadd float %160, %163, !dbg !24
171
+ %165 = bitcast float %164 to i32, !dbg !10
172
+ %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 2, i32 31), !dbg !10
173
+ %167 = bitcast i32 %166 to float, !dbg !10
174
+ %168 = fadd float %164, %167, !dbg !24
175
+ %169 = bitcast float %168 to i32, !dbg !10
176
+ %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 1, i32 31), !dbg !10
177
+ %171 = bitcast i32 %170 to float, !dbg !10
178
+ %172 = fadd float %168, %171, !dbg !24
179
+ %173 = icmp eq i32 %9, 0, !dbg !10
180
+ %174 = and i1 %157, %173, !dbg !10
181
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %159, float %172, i1 %174) #3, !dbg !10
182
+ tail call void @llvm.nvvm.barrier0(), !dbg !10
183
+ %175 = zext nneg i32 %141 to i64, !dbg !10
184
+ %176 = getelementptr float, ptr addrspace(3) @global_smem, i64 %175, !dbg !10
185
+ %177 = load float, ptr addrspace(3) %176, align 4, !dbg !10
186
+ %178 = zext nneg i32 %145 to i64, !dbg !10
187
+ %179 = getelementptr float, ptr addrspace(3) @global_smem, i64 %178, !dbg !10
188
+ %180 = load float, ptr addrspace(3) %179, align 4, !dbg !10
189
+ %181 = zext nneg i32 %149 to i64, !dbg !10
190
+ %182 = getelementptr float, ptr addrspace(3) @global_smem, i64 %181, !dbg !10
191
+ %183 = load float, ptr addrspace(3) %182, align 4, !dbg !10
192
+ %184 = zext nneg i32 %153 to i64, !dbg !10
193
+ %185 = getelementptr float, ptr addrspace(3) @global_smem, i64 %184, !dbg !10
194
+ %186 = load float, ptr addrspace(3) %185, align 4, !dbg !10
195
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
196
+ %187 = zext nneg i32 %10 to i64, !dbg !28
197
+ %188 = getelementptr float, ptr addrspace(3) @global_smem, i64 %187, !dbg !28
198
+ %189 = insertelement <1 x float> undef, float %177, i64 0, !dbg !28
199
+ store <1 x float> %189, ptr addrspace(3) %188, align 4, !dbg !28
200
+ %190 = zext nneg i32 %16 to i64, !dbg !28
201
+ %191 = getelementptr float, ptr addrspace(3) @global_smem, i64 %190, !dbg !28
202
+ %192 = insertelement <1 x float> undef, float %180, i64 0, !dbg !28
203
+ store <1 x float> %192, ptr addrspace(3) %191, align 4, !dbg !28
204
+ %193 = zext nneg i32 %17 to i64, !dbg !28
205
+ %194 = getelementptr float, ptr addrspace(3) @global_smem, i64 %193, !dbg !28
206
+ %195 = insertelement <1 x float> undef, float %183, i64 0, !dbg !28
207
+ store <1 x float> %195, ptr addrspace(3) %194, align 4, !dbg !28
208
+ %196 = zext nneg i32 %18 to i64, !dbg !28
209
+ %197 = getelementptr float, ptr addrspace(3) @global_smem, i64 %196, !dbg !28
210
+ %198 = insertelement <1 x float> undef, float %186, i64 0, !dbg !28
211
+ store <1 x float> %198, ptr addrspace(3) %197, align 4, !dbg !28
212
+ tail call void @llvm.nvvm.barrier0(), !dbg !28
213
+ %199 = zext nneg i32 %7 to i64, !dbg !28
214
+ %200 = getelementptr float, ptr addrspace(3) @global_smem, i64 %199, !dbg !28
215
+ %201 = load <1 x float>, ptr addrspace(3) %200, align 4, !dbg !28
216
+ %.frozen = freeze i32 %22
217
+ %202 = sdiv i32 %.frozen, 256, !dbg !29
218
+ %203 = mul i32 %202, 256
219
+ %.decomposed = sub i32 %.frozen, %203
220
+ %204 = sext i32 %202 to i64, !dbg !30
221
+ %205 = getelementptr i64, ptr addrspace(1) %1, i64 %204, !dbg !30
222
+ %206 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %205, i1 true) #3, !dbg !31
223
+ %207 = lshr i64 %206, 54, !dbg !32
224
+ %208 = and i64 %207, 512, !dbg !32
225
+ %209 = add i64 %208, %206, !dbg !32
226
+ %210 = shl i64 %209, 8, !dbg !33
227
+ %211 = sext i32 %.decomposed to i64, !dbg !34
228
+ %212 = getelementptr float, ptr addrspace(1) %2, i64 %210, !dbg !35
229
+ %213 = getelementptr float, ptr addrspace(1) %212, i64 %211, !dbg !35
230
+ %214 = icmp eq i32 %11, 0, !dbg !36
231
+ %215 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %213, <1 x float> %201, i1 %214) #3, !dbg !36
232
+ ret void, !dbg !37
233
+ }
234
+
235
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
236
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
237
+
238
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
239
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
240
+
241
+ ; Function Attrs: convergent nocallback nounwind
242
+ declare void @llvm.nvvm.barrier0() #2
243
+
244
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
245
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
246
+ attributes #2 = { convergent nocallback nounwind }
247
+ attributes #3 = { nounwind }
248
+
249
+ !llvm.module.flags = !{!0}
250
+ !llvm.dbg.cu = !{!1}
251
+ !nvvm.annotations = !{!3, !4, !4, !3}
252
+
253
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
254
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
255
+ !2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
256
+ !3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}
257
+ !4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256}
258
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
259
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
260
+ !7 = !{}
261
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
262
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
263
+ !10 = !DILocation(line: 243, column: 36, scope: !11, inlinedAt: !13)
264
+ !11 = distinct !DILexicalBlockFile(scope: !5, file: !12, discriminator: 0)
265
+ !12 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
266
+ !13 = !DILocation(line: 35, column: 25, scope: !11)
267
+ !14 = !DILocation(line: 21, column: 28, scope: !5)
268
+ !15 = !DILocation(line: 21, column: 33, scope: !5)
269
+ !16 = !DILocation(line: 22, column: 23, scope: !5)
270
+ !17 = !DILocation(line: 29, column: 25, scope: !5)
271
+ !18 = !DILocation(line: 31, column: 47, scope: !5)
272
+ !19 = !DILocation(line: 31, column: 40, scope: !5)
273
+ !20 = !DILocation(line: 31, column: 34, scope: !5)
274
+ !21 = !DILocation(line: 31, column: 53, scope: !5)
275
+ !22 = !DILocation(line: 33, column: 23, scope: !5)
276
+ !23 = !DILocation(line: 34, column: 38, scope: !5)
277
+ !24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)
278
+ !25 = distinct !DILexicalBlockFile(scope: !11, file: !12, discriminator: 0)
279
+ !26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
280
+ !27 = !DILocation(line: 35, column: 25, scope: !25)
281
+ !28 = !DILocation(line: 35, column: 28, scope: !5)
282
+ !29 = !DILocation(line: 36, column: 20, scope: !5)
283
+ !30 = !DILocation(line: 38, column: 30, scope: !5)
284
+ !31 = !DILocation(line: 38, column: 35, scope: !5)
285
+ !32 = !DILocation(line: 41, column: 32, scope: !5)
286
+ !33 = !DILocation(line: 45, column: 40, scope: !5)
287
+ !34 = !DILocation(line: 45, column: 36, scope: !5)
288
+ !35 = !DILocation(line: 45, column: 30, scope: !5)
289
+ !36 = !DILocation(line: 45, column: 55, scope: !5)
290
+ !37 = !DILocation(line: 45, column: 4, scope: !5)
.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ptx ADDED
@@ -0,0 +1,653 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3de4e
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3de4e(
13
+ .param .u64 triton__0d1d2d3de4e_param_0,
14
+ .param .u64 triton__0d1d2d3de4e_param_1,
15
+ .param .u64 triton__0d1d2d3de4e_param_2,
16
+ .param .u32 triton__0d1d2d3de4e_param_3,
17
+ .param .u32 triton__0d1d2d3de4e_param_4
18
+ )
19
+ .maxntid 256, 1, 1
20
+ {
21
+ .reg .pred %p<30>;
22
+ .reg .b32 %r<112>;
23
+ .reg .f32 %f<76>;
24
+ .reg .b64 %rd<22>;
25
+ .loc 1 18 0
26
+ $L__func_begin0:
27
+ .loc 1 18 0
28
+
29
+ ld.param.u64 %rd8, [triton__0d1d2d3de4e_param_0];
30
+ ld.param.u64 %rd9, [triton__0d1d2d3de4e_param_1];
31
+ $L__tmp0:
32
+ .loc 1 22 44
33
+ mov.u32 %r48, %tid.x;
34
+ and.b32 %r49, %r48, 31;
35
+ ld.param.u64 %rd10, [triton__0d1d2d3de4e_param_2];
36
+ and.b32 %r50, %r48, 7;
37
+ shl.b32 %r51, %r50, 2;
38
+ .loc 1 24 33
39
+ bfe.u32 %r52, %r48, 5, 3;
40
+ bfe.u32 %r53, %r48, 3, 2;
41
+ shl.b32 %r54, %r52, 2;
42
+ or.b32 %r55, %r54, %r53;
43
+ or.b32 %r56, %r55, 96;
44
+ .loc 1 21 28
45
+ mov.u32 %r1, %ctaid.x;
46
+ .loc 1 21 33
47
+ shl.b32 %r57, %r1, 5;
48
+ .loc 1 22 23
49
+ or.b32 %r58, %r57, %r51;
50
+ or.b32 %r59, %r57, %r49;
51
+ .loc 1 29 25
52
+ setp.lt.u32 %p16, %r56, 120;
53
+ .loc 1 31 47
54
+ shl.b32 %r60, %r55, 17;
55
+ shl.b32 %r61, %r56, 17;
56
+ .loc 1 31 40
57
+ add.s32 %r62, %r58, %r60;
58
+ add.s32 %r63, %r62, 4194304;
59
+ add.s32 %r64, %r62, 8388608;
60
+ add.s32 %r65, %r58, %r61;
61
+ .loc 1 31 34
62
+ mul.wide.s32 %rd11, %r62, 4;
63
+ add.s64 %rd1, %rd8, %rd11;
64
+ mul.wide.s32 %rd12, %r63, 4;
65
+ add.s64 %rd2, %rd8, %rd12;
66
+ mul.wide.s32 %rd13, %r64, 4;
67
+ add.s64 %rd3, %rd8, %rd13;
68
+ mul.wide.s32 %rd14, %r65, 4;
69
+ add.s64 %rd4, %rd8, %rd14;
70
+ mov.b32 %r6, 0;
71
+ mov.pred %p1, -1;
72
+ .loc 1 31 53
73
+ mov.u32 %r2, 0x0;
74
+ mov.u32 %r3, 0x0;
75
+ mov.u32 %r4, 0x0;
76
+ mov.u32 %r5, 0x0;
77
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
78
+ @!%p1 mov.u32 %r2, %r6;
79
+ @!%p1 mov.u32 %r3, %r6;
80
+ @!%p1 mov.u32 %r4, %r6;
81
+ @!%p1 mov.u32 %r5, %r6;
82
+ mov.b32 %f1, %r2;
83
+ mov.b32 %f2, %r3;
84
+ mov.b32 %f3, %r4;
85
+ mov.b32 %f4, %r5;
86
+ mov.u32 %r10, 0x0;
87
+ mov.u32 %r11, 0x0;
88
+ mov.u32 %r12, 0x0;
89
+ mov.u32 %r13, 0x0;
90
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ];
91
+ @!%p1 mov.u32 %r10, %r6;
92
+ @!%p1 mov.u32 %r11, %r6;
93
+ @!%p1 mov.u32 %r12, %r6;
94
+ @!%p1 mov.u32 %r13, %r6;
95
+ mov.b32 %f5, %r10;
96
+ mov.b32 %f6, %r11;
97
+ mov.b32 %f7, %r12;
98
+ mov.b32 %f8, %r13;
99
+ mov.u32 %r18, 0x0;
100
+ mov.u32 %r19, 0x0;
101
+ mov.u32 %r20, 0x0;
102
+ mov.u32 %r21, 0x0;
103
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ];
104
+ @!%p1 mov.u32 %r18, %r6;
105
+ @!%p1 mov.u32 %r19, %r6;
106
+ @!%p1 mov.u32 %r20, %r6;
107
+ @!%p1 mov.u32 %r21, %r6;
108
+ mov.b32 %f9, %r18;
109
+ mov.b32 %f10, %r19;
110
+ mov.b32 %f11, %r20;
111
+ mov.b32 %f12, %r21;
112
+ mov.u32 %r26, 0x0;
113
+ mov.u32 %r27, 0x0;
114
+ mov.u32 %r28, 0x0;
115
+ mov.u32 %r29, 0x0;
116
+ @%p16 ld.global.L1::evict_first.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ];
117
+ @!%p16 mov.u32 %r26, %r6;
118
+ @!%p16 mov.u32 %r27, %r6;
119
+ @!%p16 mov.u32 %r28, %r6;
120
+ @!%p16 mov.u32 %r29, %r6;
121
+ mov.b32 %f13, %r26;
122
+ mov.b32 %f14, %r27;
123
+ mov.b32 %f15, %r28;
124
+ mov.b32 %f16, %r29;
125
+ .loc 1 33 23
126
+ add.f32 %f17, %f1, 0f00000000;
127
+ add.f32 %f18, %f2, 0f00000000;
128
+ add.f32 %f19, %f3, 0f00000000;
129
+ add.f32 %f20, %f4, 0f00000000;
130
+ add.f32 %f21, %f5, 0f00000000;
131
+ add.f32 %f22, %f6, 0f00000000;
132
+ add.f32 %f23, %f7, 0f00000000;
133
+ add.f32 %f24, %f8, 0f00000000;
134
+ add.f32 %f25, %f9, 0f00000000;
135
+ add.f32 %f26, %f10, 0f00000000;
136
+ add.f32 %f27, %f11, 0f00000000;
137
+ add.f32 %f28, %f12, 0f00000000;
138
+ add.f32 %f29, %f13, 0f00000000;
139
+ add.f32 %f30, %f14, 0f00000000;
140
+ add.f32 %f31, %f15, 0f00000000;
141
+ add.f32 %f32, %f16, 0f00000000;
142
+ .loc 1 34 38
143
+ selp.f32 %f33, %f29, 0f00000000, %p16;
144
+ selp.f32 %f34, %f30, 0f00000000, %p16;
145
+ selp.f32 %f35, %f31, 0f00000000, %p16;
146
+ selp.f32 %f36, %f32, 0f00000000, %p16;
147
+ $L__tmp1:
148
+ .loc 2 233 15
149
+ add.f32 %f37, %f17, %f21;
150
+ add.f32 %f38, %f18, %f22;
151
+ add.f32 %f39, %f19, %f23;
152
+ add.f32 %f40, %f20, %f24;
153
+ add.f32 %f41, %f37, %f25;
154
+ add.f32 %f42, %f38, %f26;
155
+ add.f32 %f43, %f39, %f27;
156
+ add.f32 %f44, %f40, %f28;
157
+ add.f32 %f45, %f41, %f33;
158
+ add.f32 %f46, %f42, %f34;
159
+ add.f32 %f47, %f43, %f35;
160
+ add.f32 %f48, %f44, %f36;
161
+ $L__tmp2:
162
+ .loc 2 243 36
163
+ mov.b32 %r66, %f45;
164
+ shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1;
165
+ mov.b32 %f49, %r67;
166
+ $L__tmp3:
167
+ .loc 2 233 15
168
+ add.f32 %f50, %f45, %f49;
169
+ $L__tmp4:
170
+ .loc 2 243 36
171
+ mov.b32 %r68, %f50;
172
+ shfl.sync.bfly.b32 %r69, %r68, 8, 31, -1;
173
+ mov.b32 %f51, %r69;
174
+ $L__tmp5:
175
+ .loc 2 233 15
176
+ add.f32 %f52, %f50, %f51;
177
+ $L__tmp6:
178
+ .loc 2 243 36
179
+ mov.b32 %r70, %f46;
180
+ shfl.sync.bfly.b32 %r71, %r70, 16, 31, -1;
181
+ mov.b32 %f53, %r71;
182
+ $L__tmp7:
183
+ .loc 2 233 15
184
+ add.f32 %f54, %f46, %f53;
185
+ $L__tmp8:
186
+ .loc 2 243 36
187
+ mov.b32 %r72, %f54;
188
+ shfl.sync.bfly.b32 %r73, %r72, 8, 31, -1;
189
+ mov.b32 %f55, %r73;
190
+ $L__tmp9:
191
+ .loc 2 233 15
192
+ add.f32 %f56, %f54, %f55;
193
+ $L__tmp10:
194
+ .loc 2 243 36
195
+ mov.b32 %r74, %f47;
196
+ shfl.sync.bfly.b32 %r75, %r74, 16, 31, -1;
197
+ mov.b32 %f57, %r75;
198
+ $L__tmp11:
199
+ .loc 2 233 15
200
+ add.f32 %f58, %f47, %f57;
201
+ $L__tmp12:
202
+ .loc 2 243 36
203
+ mov.b32 %r76, %f58;
204
+ shfl.sync.bfly.b32 %r77, %r76, 8, 31, -1;
205
+ mov.b32 %f59, %r77;
206
+ $L__tmp13:
207
+ .loc 2 233 15
208
+ add.f32 %f60, %f58, %f59;
209
+ $L__tmp14:
210
+ .loc 2 243 36
211
+ mov.b32 %r78, %f48;
212
+ shfl.sync.bfly.b32 %r79, %r78, 16, 31, -1;
213
+ mov.b32 %f61, %r79;
214
+ $L__tmp15:
215
+ .loc 2 233 15
216
+ add.f32 %f62, %f48, %f61;
217
+ $L__tmp16:
218
+ .loc 2 243 36
219
+ mov.b32 %r80, %f62;
220
+ shfl.sync.bfly.b32 %r81, %r80, 8, 31, -1;
221
+ mov.b32 %f63, %r81;
222
+ $L__tmp17:
223
+ .loc 2 233 15
224
+ add.f32 %f64, %f62, %f63;
225
+ $L__tmp18:
226
+ .loc 2 243 36
227
+ setp.lt.u32 %p21, %r49, 8;
228
+ shl.b32 %r82, %r50, 7;
229
+ or.b32 %r83, %r82, %r54;
230
+ mov.u32 %r84, global_smem;
231
+ add.s32 %r34, %r84, %r83;
232
+ mov.b32 %r35, %f52;
233
+ @%p21 st.shared.b32 [ %r34 + 0 ], %r35;
234
+ or.b32 %r85, %r82, 32;
235
+ or.b32 %r86, %r85, %r54;
236
+ add.s32 %r36, %r84, %r86;
237
+ mov.b32 %r37, %f56;
238
+ @%p21 st.shared.b32 [ %r36 + 0 ], %r37;
239
+ or.b32 %r87, %r82, 64;
240
+ or.b32 %r88, %r87, %r54;
241
+ add.s32 %r38, %r84, %r88;
242
+ mov.b32 %r39, %f60;
243
+ @%p21 st.shared.b32 [ %r38 + 0 ], %r39;
244
+ or.b32 %r89, %r82, 96;
245
+ or.b32 %r90, %r89, %r54;
246
+ add.s32 %r40, %r84, %r90;
247
+ mov.b32 %r41, %f64;
248
+ @%p21 st.shared.b32 [ %r40 + 0 ], %r41;
249
+ bar.sync 0;
250
+ setp.lt.s32 %p25, %r48, 256;
251
+ shl.b32 %r91, %r48, 2;
252
+ add.s32 %r43, %r84, %r91;
253
+ @%p25 ld.shared.b32 %r42, [ %r43 + 0 ];
254
+ mov.b32 %f65, %r42;
255
+ shfl.sync.bfly.b32 %r92, %r42, 4, 31, -1;
256
+ mov.b32 %f66, %r92;
257
+ $L__tmp19:
258
+ .loc 2 233 15
259
+ add.f32 %f67, %f65, %f66;
260
+ $L__tmp20:
261
+ .loc 2 243 36
262
+ mov.b32 %r93, %f67;
263
+ shfl.sync.bfly.b32 %r94, %r93, 2, 31, -1;
264
+ mov.b32 %f68, %r94;
265
+ $L__tmp21:
266
+ .loc 2 233 15
267
+ add.f32 %f69, %f67, %f68;
268
+ $L__tmp22:
269
+ .loc 2 243 36
270
+ mov.b32 %r95, %f69;
271
+ shfl.sync.bfly.b32 %r96, %r95, 1, 31, -1;
272
+ mov.b32 %f70, %r96;
273
+ $L__tmp23:
274
+ .loc 2 233 15
275
+ add.f32 %f71, %f69, %f70;
276
+ $L__tmp24:
277
+ .loc 2 243 36
278
+ setp.eq.s32 %p29, %r50, 0;
279
+ and.pred %p26, %p25, %p29;
280
+ mov.b32 %r45, %f71;
281
+ @%p26 st.shared.b32 [ %r43 + 0 ], %r45;
282
+ bar.sync 0;
283
+ add.s32 %r97, %r84, %r82;
284
+ ld.shared.f32 %f72, [%r97];
285
+ add.s32 %r98, %r84, %r85;
286
+ ld.shared.f32 %f73, [%r98];
287
+ add.s32 %r99, %r84, %r87;
288
+ ld.shared.f32 %f74, [%r99];
289
+ add.s32 %r100, %r84, %r89;
290
+ ld.shared.f32 %f75, [%r100];
291
+ $L__tmp25:
292
+ .loc 1 35 28
293
+ bar.sync 0;
294
+ shl.b32 %r101, %r50, 4;
295
+ add.s32 %r102, %r84, %r101;
296
+ st.shared.f32 [%r102], %f72;
297
+ st.shared.f32 [%r102+4], %f73;
298
+ st.shared.f32 [%r102+8], %f74;
299
+ st.shared.f32 [%r102+12], %f75;
300
+ bar.sync 0;
301
+ shl.b32 %r103, %r49, 2;
302
+ add.s32 %r104, %r84, %r103;
303
+ .loc 1 36 20
304
+ shr.s32 %r106, %r59, 31;
305
+ shr.u32 %r107, %r106, 24;
306
+ add.s32 %r108, %r59, %r107;
307
+ shr.s32 %r109, %r108, 8;
308
+ and.b32 %r110, %r108, -256;
309
+ sub.s32 %r111, %r59, %r110;
310
+ .loc 1 38 30
311
+ mul.wide.s32 %rd15, %r109, 8;
312
+ add.s64 %rd6, %rd9, %rd15;
313
+ .loc 1 45 55
314
+ ld.shared.u32 %r47, [%r104];
315
+ .loc 1 38 35
316
+ mov.u64 %rd5, 0x0;
317
+ @%p1 ld.global.L1::evict_last.b64 { %rd5 }, [ %rd6 + 0 ];
318
+ .loc 1 41 32
319
+ shr.u64 %rd16, %rd5, 54;
320
+ and.b64 %rd17, %rd16, 512;
321
+ add.s64 %rd18, %rd17, %rd5;
322
+ .loc 1 45 30
323
+ shl.b64 %rd19, %rd18, 10;
324
+ add.s64 %rd20, %rd10, %rd19;
325
+ mul.wide.s32 %rd21, %r111, 4;
326
+ add.s64 %rd7, %rd20, %rd21;
327
+ .loc 1 45 55
328
+ setp.eq.s32 %p28, %r52, 0;
329
+ mov.u32 %r46, 0x0;
330
+ @%p28 atom.global.gpu.acq_rel.add.f32 %r46, [ %rd7 + 0 ], %r47;
331
+ .loc 1 45 4
332
+ ret;
333
+ $L__tmp26:
334
+ $L__func_end0:
335
+
336
+ }
337
+ .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
338
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
339
+ .section .debug_abbrev
340
+ {
341
+ .b8 1
342
+ .b8 17
343
+ .b8 1
344
+ .b8 37
345
+ .b8 8
346
+ .b8 19
347
+ .b8 5
348
+ .b8 3
349
+ .b8 8
350
+ .b8 16
351
+ .b8 6
352
+ .b8 27
353
+ .b8 8
354
+ .b8 180
355
+ .b8 66
356
+ .b8 12
357
+ .b8 17
358
+ .b8 1
359
+ .b8 18
360
+ .b8 1
361
+ .b8 0
362
+ .b8 0
363
+ .b8 2
364
+ .b8 46
365
+ .b8 0
366
+ .b8 135
367
+ .b8 64
368
+ .b8 8
369
+ .b8 3
370
+ .b8 8
371
+ .b8 58
372
+ .b8 11
373
+ .b8 59
374
+ .b8 11
375
+ .b8 63
376
+ .b8 12
377
+ .b8 32
378
+ .b8 11
379
+ .b8 0
380
+ .b8 0
381
+ .b8 3
382
+ .b8 46
383
+ .b8 1
384
+ .b8 17
385
+ .b8 1
386
+ .b8 18
387
+ .b8 1
388
+ .b8 64
389
+ .b8 10
390
+ .b8 49
391
+ .b8 19
392
+ .b8 0
393
+ .b8 0
394
+ .b8 4
395
+ .b8 29
396
+ .b8 1
397
+ .b8 49
398
+ .b8 19
399
+ .b8 17
400
+ .b8 1
401
+ .b8 18
402
+ .b8 1
403
+ .b8 88
404
+ .b8 11
405
+ .b8 89
406
+ .b8 11
407
+ .b8 87
408
+ .b8 11
409
+ .b8 0
410
+ .b8 0
411
+ .b8 5
412
+ .b8 29
413
+ .b8 0
414
+ .b8 49
415
+ .b8 19
416
+ .b8 17
417
+ .b8 1
418
+ .b8 18
419
+ .b8 1
420
+ .b8 88
421
+ .b8 11
422
+ .b8 89
423
+ .b8 11
424
+ .b8 87
425
+ .b8 11
426
+ .b8 0
427
+ .b8 0
428
+ .b8 0
429
+ }
430
+ .section .debug_info
431
+ {
432
+ .b32 264
433
+ .b8 2
434
+ .b8 0
435
+ .b32 .debug_abbrev
436
+ .b8 8
437
+ .b8 1
438
+ .b8 116
439
+ .b8 114
440
+ .b8 105
441
+ .b8 116
442
+ .b8 111
443
+ .b8 110
444
+ .b8 0
445
+ .b8 2
446
+ .b8 0
447
+ .b8 99
448
+ .b8 54
449
+ .b8 105
450
+ .b8 107
451
+ .b8 53
452
+ .b8 118
453
+ .b8 120
454
+ .b8 55
455
+ .b8 112
456
+ .b8 50
457
+ .b8 50
458
+ .b8 102
459
+ .b8 112
460
+ .b8 107
461
+ .b8 52
462
+ .b8 100
463
+ .b8 99
464
+ .b8 118
465
+ .b8 104
466
+ .b8 53
467
+ .b8 53
468
+ .b8 122
469
+ .b8 105
470
+ .b8 109
471
+ .b8 119
472
+ .b8 52
473
+ .b8 116
474
+ .b8 53
475
+ .b8 110
476
+ .b8 114
477
+ .b8 53
478
+ .b8 122
479
+ .b8 110
480
+ .b8 50
481
+ .b8 98
482
+ .b8 55
483
+ .b8 105
484
+ .b8 110
485
+ .b8 117
486
+ .b8 106
487
+ .b8 120
488
+ .b8 106
489
+ .b8 97
490
+ .b8 117
491
+ .b8 120
492
+ .b8 115
493
+ .b8 104
494
+ .b8 108
495
+ .b8 106
496
+ .b8 117
497
+ .b8 109
498
+ .b8 109
499
+ .b8 46
500
+ .b8 112
501
+ .b8 121
502
+ .b8 0
503
+ .b32 .debug_line
504
+ .b8 47
505
+ .b8 116
506
+ .b8 109
507
+ .b8 112
508
+ .b8 47
509
+ .b8 116
510
+ .b8 111
511
+ .b8 114
512
+ .b8 99
513
+ .b8 104
514
+ .b8 105
515
+ .b8 110
516
+ .b8 100
517
+ .b8 117
518
+ .b8 99
519
+ .b8 116
520
+ .b8 111
521
+ .b8 114
522
+ .b8 95
523
+ .b8 114
524
+ .b8 111
525
+ .b8 111
526
+ .b8 116
527
+ .b8 47
528
+ .b8 54
529
+ .b8 105
530
+ .b8 0
531
+ .b8 1
532
+ .b64 $L__func_begin0
533
+ .b64 $L__func_end0
534
+ .b8 2
535
+ .b8 116
536
+ .b8 114
537
+ .b8 105
538
+ .b8 116
539
+ .b8 111
540
+ .b8 110
541
+ .b8 95
542
+ .b8 95
543
+ .b8 48
544
+ .b8 100
545
+ .b8 49
546
+ .b8 100
547
+ .b8 50
548
+ .b8 100
549
+ .b8 51
550
+ .b8 100
551
+ .b8 101
552
+ .b8 52
553
+ .b8 101
554
+ .b8 0
555
+ .b8 116
556
+ .b8 114
557
+ .b8 105
558
+ .b8 116
559
+ .b8 111
560
+ .b8 110
561
+ .b8 95
562
+ .b8 95
563
+ .b8 48
564
+ .b8 100
565
+ .b8 49
566
+ .b8 100
567
+ .b8 50
568
+ .b8 100
569
+ .b8 51
570
+ .b8 100
571
+ .b8 101
572
+ .b8 52
573
+ .b8 101
574
+ .b8 0
575
+ .b8 1
576
+ .b8 18
577
+ .b8 1
578
+ .b8 1
579
+ .b8 3
580
+ .b64 $L__func_begin0
581
+ .b64 $L__func_end0
582
+ .b8 1
583
+ .b8 156
584
+ .b32 125
585
+ .b8 4
586
+ .b32 125
587
+ .b64 $L__tmp1
588
+ .b64 $L__tmp24
589
+ .b8 2
590
+ .b8 35
591
+ .b8 25
592
+ .b8 5
593
+ .b32 125
594
+ .b64 $L__tmp1
595
+ .b64 $L__tmp24
596
+ .b8 2
597
+ .b8 243
598
+ .b8 36
599
+ .b8 0
600
+ .b8 5
601
+ .b32 125
602
+ .b64 $L__tmp2
603
+ .b64 $L__tmp25
604
+ .b8 2
605
+ .b8 35
606
+ .b8 25
607
+ .b8 0
608
+ .b8 0
609
+ }
610
+ .section .debug_pubnames
611
+ {
612
+ .b32 $L__pubNames_end0-$L__pubNames_start0
613
+ $L__pubNames_start0:
614
+ .b8 2
615
+ .b8 0
616
+ .b32 .debug_info
617
+ .b32 268
618
+ .b32 125
619
+ .b8 116
620
+ .b8 114
621
+ .b8 105
622
+ .b8 116
623
+ .b8 111
624
+ .b8 110
625
+ .b8 95
626
+ .b8 95
627
+ .b8 48
628
+ .b8 100
629
+ .b8 49
630
+ .b8 100
631
+ .b8 50
632
+ .b8 100
633
+ .b8 51
634
+ .b8 100
635
+ .b8 101
636
+ .b8 52
637
+ .b8 101
638
+ .b8 0
639
+ .b32 0
640
+ $L__pubNames_end0:
641
+ }
642
+ .section .debug_pubtypes
643
+ {
644
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
645
+ $L__pubTypes_start0:
646
+ .b8 2
647
+ .b8 0
648
+ .b32 .debug_info
649
+ .b32 268
650
+ .b32 0
651
+ $L__pubTypes_end0:
652
+ }
653
+ .section .debug_loc { }
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.llir ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
7
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %5 = and i32 %4, 127, !dbg !8
9
+ %6 = shl nuw nsw i32 %5, 3, !dbg !8
10
+ %7 = shl nuw nsw i32 %5, 2, !dbg !8
11
+ %8 = or i32 %7, 512, !dbg !8
12
+ %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !9
13
+ %10 = shl i32 %9, 10, !dbg !10
14
+ %11 = or i32 %10, %6, !dbg !11
15
+ %12 = or i32 %10, %7, !dbg !11
16
+ %13 = or i32 %10, %8, !dbg !11
17
+ %14 = sext i32 %11 to i64, !dbg !12
18
+ %15 = getelementptr i16, ptr addrspace(1) %0, i64 %14, !dbg !12
19
+ %16 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %15, i1 true) #2, !dbg !13
20
+ %17 = extractvalue { i32, i32, i32, i32 } %16, 0, !dbg !13
21
+ %18 = extractvalue { i32, i32, i32, i32 } %16, 1, !dbg !13
22
+ %19 = extractvalue { i32, i32, i32, i32 } %16, 2, !dbg !13
23
+ %20 = extractvalue { i32, i32, i32, i32 } %16, 3, !dbg !13
24
+ %21 = trunc i32 %17 to i16, !dbg !13
25
+ %extelt.offset = lshr i32 %17, 16, !dbg !13
26
+ %22 = trunc i32 %extelt.offset to i16, !dbg !13
27
+ %23 = trunc i32 %18 to i16, !dbg !13
28
+ %extelt.offset1 = lshr i32 %18, 16, !dbg !13
29
+ %24 = trunc i32 %extelt.offset1 to i16, !dbg !13
30
+ %25 = trunc i32 %19 to i16, !dbg !13
31
+ %extelt.offset2 = lshr i32 %19, 16, !dbg !13
32
+ %26 = trunc i32 %extelt.offset2 to i16, !dbg !13
33
+ %27 = trunc i32 %20 to i16, !dbg !13
34
+ %extelt.offset3 = lshr i32 %20, 16, !dbg !13
35
+ %28 = trunc i32 %extelt.offset3 to i16, !dbg !13
36
+ %29 = zext nneg i32 %6 to i64, !dbg !14
37
+ %30 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %29, !dbg !14
38
+ %31 = insertelement <1 x i16> undef, i16 %21, i64 0, !dbg !14
39
+ store <1 x i16> %31, ptr addrspace(3) %30, align 2, !dbg !14
40
+ %32 = or i32 %6, 1, !dbg !14
41
+ %33 = zext nneg i32 %32 to i64, !dbg !14
42
+ %34 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %33, !dbg !14
43
+ %35 = insertelement <1 x i16> undef, i16 %22, i64 0, !dbg !14
44
+ store <1 x i16> %35, ptr addrspace(3) %34, align 2, !dbg !14
45
+ %36 = or i32 %6, 2, !dbg !14
46
+ %37 = zext nneg i32 %36 to i64, !dbg !14
47
+ %38 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %37, !dbg !14
48
+ %39 = insertelement <1 x i16> undef, i16 %23, i64 0, !dbg !14
49
+ store <1 x i16> %39, ptr addrspace(3) %38, align 2, !dbg !14
50
+ %40 = or i32 %6, 3, !dbg !14
51
+ %41 = zext nneg i32 %40 to i64, !dbg !14
52
+ %42 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %41, !dbg !14
53
+ %43 = insertelement <1 x i16> undef, i16 %24, i64 0, !dbg !14
54
+ store <1 x i16> %43, ptr addrspace(3) %42, align 2, !dbg !14
55
+ %44 = or i32 %6, 4, !dbg !14
56
+ %45 = zext nneg i32 %44 to i64, !dbg !14
57
+ %46 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %45, !dbg !14
58
+ %47 = insertelement <1 x i16> undef, i16 %25, i64 0, !dbg !14
59
+ store <1 x i16> %47, ptr addrspace(3) %46, align 2, !dbg !14
60
+ %48 = or i32 %6, 5, !dbg !14
61
+ %49 = zext nneg i32 %48 to i64, !dbg !14
62
+ %50 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %49, !dbg !14
63
+ %51 = insertelement <1 x i16> undef, i16 %26, i64 0, !dbg !14
64
+ store <1 x i16> %51, ptr addrspace(3) %50, align 2, !dbg !14
65
+ %52 = or i32 %6, 6, !dbg !14
66
+ %53 = zext nneg i32 %52 to i64, !dbg !14
67
+ %54 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %53, !dbg !14
68
+ %55 = insertelement <1 x i16> undef, i16 %27, i64 0, !dbg !14
69
+ store <1 x i16> %55, ptr addrspace(3) %54, align 2, !dbg !14
70
+ %56 = or i32 %6, 7, !dbg !14
71
+ %57 = zext nneg i32 %56 to i64, !dbg !14
72
+ %58 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %57, !dbg !14
73
+ %59 = insertelement <1 x i16> undef, i16 %28, i64 0, !dbg !14
74
+ store <1 x i16> %59, ptr addrspace(3) %58, align 2, !dbg !14
75
+ tail call void @llvm.nvvm.barrier0(), !dbg !14
76
+ %60 = zext nneg i32 %7 to i64, !dbg !14
77
+ %61 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %60, !dbg !14
78
+ %62 = load i16, ptr addrspace(3) %61, align 2, !dbg !14
79
+ %63 = or i32 %7, 1, !dbg !14
80
+ %64 = zext nneg i32 %63 to i64, !dbg !14
81
+ %65 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %64, !dbg !14
82
+ %66 = load i16, ptr addrspace(3) %65, align 2, !dbg !14
83
+ %67 = or i32 %7, 2, !dbg !14
84
+ %68 = zext nneg i32 %67 to i64, !dbg !14
85
+ %69 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %68, !dbg !14
86
+ %70 = load i16, ptr addrspace(3) %69, align 2, !dbg !14
87
+ %71 = or i32 %7, 3, !dbg !14
88
+ %72 = zext nneg i32 %71 to i64, !dbg !14
89
+ %73 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %72, !dbg !14
90
+ %74 = load i16, ptr addrspace(3) %73, align 2, !dbg !14
91
+ %75 = zext nneg i32 %8 to i64, !dbg !14
92
+ %76 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %75, !dbg !14
93
+ %77 = load i16, ptr addrspace(3) %76, align 2, !dbg !14
94
+ %78 = or i32 %7, 513, !dbg !14
95
+ %79 = zext nneg i32 %78 to i64, !dbg !14
96
+ %80 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %79, !dbg !14
97
+ %81 = load i16, ptr addrspace(3) %80, align 2, !dbg !14
98
+ %82 = or i32 %7, 514, !dbg !14
99
+ %83 = zext nneg i32 %82 to i64, !dbg !14
100
+ %84 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %83, !dbg !14
101
+ %85 = load i16, ptr addrspace(3) %84, align 2, !dbg !14
102
+ %86 = or i32 %7, 515, !dbg !14
103
+ %87 = zext nneg i32 %86 to i64, !dbg !14
104
+ %88 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %87, !dbg !14
105
+ %89 = load i16, ptr addrspace(3) %88, align 2, !dbg !14
106
+ %90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %62) #2, !dbg !14
107
+ %91 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %66) #2, !dbg !14
108
+ %92 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #2, !dbg !14
109
+ %93 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %74) #2, !dbg !14
110
+ %94 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %77) #2, !dbg !14
111
+ %95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %81) #2, !dbg !14
112
+ %96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #2, !dbg !14
113
+ %97 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %89) #2, !dbg !14
114
+ %98 = sext i32 %12 to i64, !dbg !15
115
+ %99 = getelementptr float, ptr addrspace(1) %1, i64 %98, !dbg !15
116
+ %100 = sext i32 %13 to i64, !dbg !15
117
+ %101 = getelementptr float, ptr addrspace(1) %1, i64 %100, !dbg !15
118
+ %102 = bitcast float %90 to i32, !dbg !16
119
+ %103 = bitcast float %91 to i32, !dbg !16
120
+ %104 = bitcast float %92 to i32, !dbg !16
121
+ %105 = bitcast float %93 to i32, !dbg !16
122
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %102, i32 %103, i32 %104, i32 %105, ptr addrspace(1) %99, i1 true) #2, !dbg !16
123
+ %106 = bitcast float %94 to i32, !dbg !16
124
+ %107 = bitcast float %95 to i32, !dbg !16
125
+ %108 = bitcast float %96 to i32, !dbg !16
126
+ %109 = bitcast float %97 to i32, !dbg !16
127
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %106, i32 %107, i32 %108, i32 %109, ptr addrspace(1) %101, i1 true) #2, !dbg !16
128
+ ret void, !dbg !17
129
+ }
130
+
131
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
132
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
133
+
134
+ ; Function Attrs: convergent nocallback nounwind
135
+ declare void @llvm.nvvm.barrier0() #1
136
+
137
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
138
+ attributes #1 = { convergent nocallback nounwind }
139
+ attributes #2 = { nounwind }
140
+
141
+ !llvm.module.flags = !{!0}
142
+ !llvm.dbg.cu = !{!1}
143
+ !nvvm.annotations = !{!3, !4, !4, !3}
144
+
145
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
146
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
147
+ !2 = !DIFile(filename: "cotbhet37v6mh5samql7uxre3hprpnbhuvim3fmrjpq5fgg6lwbi.py", directory: "/tmp/torchinductor_root/ot")
148
+ !3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
149
+ !4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
150
+ !5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
151
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
152
+ !7 = !{}
153
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
154
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
155
+ !10 = !DILocation(line: 20, column: 33, scope: !5)
156
+ !11 = !DILocation(line: 21, column: 23, scope: !5)
157
+ !12 = !DILocation(line: 24, column: 30, scope: !5)
158
+ !13 = !DILocation(line: 24, column: 35, scope: !5)
159
+ !14 = !DILocation(line: 24, column: 44, scope: !5)
160
+ !15 = !DILocation(line: 26, column: 25, scope: !5)
161
+ !16 = !DILocation(line: 26, column: 36, scope: !5)
162
+ !17 = !DILocation(line: 26, column: 4, scope: !5)
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ptx ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2de(
13
+ .param .u64 triton__0d1d2de_param_0,
14
+ .param .u64 triton__0d1d2de_param_1,
15
+ .param .u32 triton__0d1d2de_param_2
16
+ )
17
+ .maxntid 128, 1, 1
18
+ {
19
+ .reg .pred %p<4>;
20
+ .reg .b16 %rs<9>;
21
+ .reg .b32 %r<37>;
22
+ .reg .b64 %rd<13>;
23
+ .loc 1 18 0
24
+ $L__func_begin0:
25
+ .loc 1 18 0
26
+
27
+ ld.param.u64 %rd4, [triton__0d1d2de_param_0];
28
+ ld.param.u64 %rd5, [triton__0d1d2de_param_1];
29
+ $L__tmp0:
30
+ .loc 1 21 36
31
+ mov.u32 %r22, %tid.x;
32
+ and.b32 %r23, %r22, 127;
33
+ shl.b32 %r24, %r23, 3;
34
+ shl.b32 %r25, %r23, 2;
35
+ .loc 1 20 28
36
+ mov.u32 %r1, %ctaid.x;
37
+ .loc 1 20 33
38
+ shl.b32 %r26, %r1, 10;
39
+ .loc 1 21 23
40
+ or.b32 %r27, %r26, %r24;
41
+ or.b32 %r28, %r26, %r25;
42
+ .loc 1 24 30
43
+ mul.wide.s32 %rd6, %r27, 2;
44
+ add.s64 %rd1, %rd4, %rd6;
45
+ mov.pred %p1, -1;
46
+ .loc 1 24 35
47
+ mov.u32 %r2, 0x0;
48
+ mov.u32 %r3, 0x0;
49
+ mov.u32 %r4, 0x0;
50
+ mov.u32 %r5, 0x0;
51
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
52
+ shr.u32 %r29, %r2, 16;
53
+ shr.u32 %r30, %r3, 16;
54
+ shr.u32 %r31, %r4, 16;
55
+ shr.u32 %r32, %r5, 16;
56
+ .loc 1 24 44
57
+ shl.b32 %r33, %r23, 4;
58
+ mov.u32 %r34, global_smem;
59
+ add.s32 %r35, %r34, %r33;
60
+ st.shared.u16 [%r35], %r2;
61
+ st.shared.u16 [%r35+2], %r29;
62
+ st.shared.u16 [%r35+4], %r3;
63
+ st.shared.u16 [%r35+6], %r30;
64
+ st.shared.u16 [%r35+8], %r4;
65
+ st.shared.u16 [%r35+10], %r31;
66
+ st.shared.u16 [%r35+12], %r5;
67
+ st.shared.u16 [%r35+14], %r32;
68
+ bar.sync 0;
69
+ add.s32 %r36, %r34, %r24;
70
+ ld.shared.u16 %rs1, [%r36];
71
+ ld.shared.u16 %rs2, [%r36+2];
72
+ ld.shared.u16 %rs3, [%r36+4];
73
+ ld.shared.u16 %rs4, [%r36+6];
74
+ ld.shared.u16 %rs5, [%r36+1024];
75
+ ld.shared.u16 %rs6, [%r36+1026];
76
+ ld.shared.u16 %rs7, [%r36+1028];
77
+ ld.shared.u16 %rs8, [%r36+1030];
78
+ cvt.f32.bf16 %r14, %rs1;
79
+ cvt.f32.bf16 %r15, %rs2;
80
+ cvt.f32.bf16 %r16, %rs3;
81
+ cvt.f32.bf16 %r17, %rs4;
82
+ cvt.f32.bf16 %r18, %rs5;
83
+ cvt.f32.bf16 %r19, %rs6;
84
+ cvt.f32.bf16 %r20, %rs7;
85
+ cvt.f32.bf16 %r21, %rs8;
86
+ .loc 1 26 25
87
+ mul.wide.s32 %rd7, %r28, 4;
88
+ add.s64 %rd2, %rd5, %rd7;
89
+ cvt.s64.s32 %rd8, %r26;
90
+ cvt.u64.u32 %rd9, %r25;
91
+ or.b64 %rd10, %rd8, %rd9;
92
+ shl.b64 %rd11, %rd10, 2;
93
+ add.s64 %rd12, %rd5, %rd11;
94
+ add.s64 %rd3, %rd12, 2048;
95
+ .loc 1 26 36
96
+ @%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r14, %r15, %r16, %r17 };
97
+ @%p1 st.global.v4.b32 [ %rd3 + 0 ], { %r18, %r19, %r20, %r21 };
98
+ .loc 1 26 4
99
+ ret;
100
+ $L__tmp1:
101
+ $L__func_end0:
102
+
103
+ }
104
+ .file 1 "/tmp/torchinductor_root/ot/cotbhet37v6mh5samql7uxre3hprpnbhuvim3fmrjpq5fgg6lwbi.py"
105
+ .section .debug_abbrev
106
+ {
107
+ .b8 1
108
+ .b8 17
109
+ .b8 1
110
+ .b8 37
111
+ .b8 8
112
+ .b8 19
113
+ .b8 5
114
+ .b8 3
115
+ .b8 8
116
+ .b8 16
117
+ .b8 6
118
+ .b8 27
119
+ .b8 8
120
+ .b8 180
121
+ .b8 66
122
+ .b8 12
123
+ .b8 17
124
+ .b8 1
125
+ .b8 18
126
+ .b8 1
127
+ .b8 0
128
+ .b8 0
129
+ .b8 2
130
+ .b8 46
131
+ .b8 0
132
+ .b8 17
133
+ .b8 1
134
+ .b8 18
135
+ .b8 1
136
+ .b8 64
137
+ .b8 10
138
+ .b8 135
139
+ .b8 64
140
+ .b8 8
141
+ .b8 3
142
+ .b8 8
143
+ .b8 58
144
+ .b8 11
145
+ .b8 59
146
+ .b8 11
147
+ .b8 63
148
+ .b8 12
149
+ .b8 0
150
+ .b8 0
151
+ .b8 0
152
+ }
153
+ .section .debug_info
154
+ {
155
+ .b32 176
156
+ .b8 2
157
+ .b8 0
158
+ .b32 .debug_abbrev
159
+ .b8 8
160
+ .b8 1
161
+ .b8 116
162
+ .b8 114
163
+ .b8 105
164
+ .b8 116
165
+ .b8 111
166
+ .b8 110
167
+ .b8 0
168
+ .b8 2
169
+ .b8 0
170
+ .b8 99
171
+ .b8 111
172
+ .b8 116
173
+ .b8 98
174
+ .b8 104
175
+ .b8 101
176
+ .b8 116
177
+ .b8 51
178
+ .b8 55
179
+ .b8 118
180
+ .b8 54
181
+ .b8 109
182
+ .b8 104
183
+ .b8 53
184
+ .b8 115
185
+ .b8 97
186
+ .b8 109
187
+ .b8 113
188
+ .b8 108
189
+ .b8 55
190
+ .b8 117
191
+ .b8 120
192
+ .b8 114
193
+ .b8 101
194
+ .b8 51
195
+ .b8 104
196
+ .b8 112
197
+ .b8 114
198
+ .b8 112
199
+ .b8 110
200
+ .b8 98
201
+ .b8 104
202
+ .b8 117
203
+ .b8 118
204
+ .b8 105
205
+ .b8 109
206
+ .b8 51
207
+ .b8 102
208
+ .b8 109
209
+ .b8 114
210
+ .b8 106
211
+ .b8 112
212
+ .b8 113
213
+ .b8 53
214
+ .b8 102
215
+ .b8 103
216
+ .b8 103
217
+ .b8 54
218
+ .b8 108
219
+ .b8 119
220
+ .b8 98
221
+ .b8 105
222
+ .b8 46
223
+ .b8 112
224
+ .b8 121
225
+ .b8 0
226
+ .b32 .debug_line
227
+ .b8 47
228
+ .b8 116
229
+ .b8 109
230
+ .b8 112
231
+ .b8 47
232
+ .b8 116
233
+ .b8 111
234
+ .b8 114
235
+ .b8 99
236
+ .b8 104
237
+ .b8 105
238
+ .b8 110
239
+ .b8 100
240
+ .b8 117
241
+ .b8 99
242
+ .b8 116
243
+ .b8 111
244
+ .b8 114
245
+ .b8 95
246
+ .b8 114
247
+ .b8 111
248
+ .b8 111
249
+ .b8 116
250
+ .b8 47
251
+ .b8 111
252
+ .b8 116
253
+ .b8 0
254
+ .b8 1
255
+ .b64 $L__func_begin0
256
+ .b64 $L__func_end0
257
+ .b8 2
258
+ .b64 $L__func_begin0
259
+ .b64 $L__func_end0
260
+ .b8 1
261
+ .b8 156
262
+ .b8 116
263
+ .b8 114
264
+ .b8 105
265
+ .b8 116
266
+ .b8 111
267
+ .b8 110
268
+ .b8 95
269
+ .b8 95
270
+ .b8 48
271
+ .b8 100
272
+ .b8 49
273
+ .b8 100
274
+ .b8 50
275
+ .b8 100
276
+ .b8 101
277
+ .b8 0
278
+ .b8 116
279
+ .b8 114
280
+ .b8 105
281
+ .b8 116
282
+ .b8 111
283
+ .b8 110
284
+ .b8 95
285
+ .b8 95
286
+ .b8 48
287
+ .b8 100
288
+ .b8 49
289
+ .b8 100
290
+ .b8 50
291
+ .b8 100
292
+ .b8 101
293
+ .b8 0
294
+ .b8 1
295
+ .b8 18
296
+ .b8 1
297
+ .b8 0
298
+ }
299
+ .section .debug_pubnames
300
+ {
301
+ .b32 $L__pubNames_end0-$L__pubNames_start0
302
+ $L__pubNames_start0:
303
+ .b8 2
304
+ .b8 0
305
+ .b32 .debug_info
306
+ .b32 180
307
+ .b32 125
308
+ .b8 116
309
+ .b8 114
310
+ .b8 105
311
+ .b8 116
312
+ .b8 111
313
+ .b8 110
314
+ .b8 95
315
+ .b8 95
316
+ .b8 48
317
+ .b8 100
318
+ .b8 49
319
+ .b8 100
320
+ .b8 50
321
+ .b8 100
322
+ .b8 101
323
+ .b8 0
324
+ .b32 0
325
+ $L__pubNames_end0:
326
+ }
327
+ .section .debug_pubtypes
328
+ {
329
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
330
+ $L__pubTypes_start0:
331
+ .b8 2
332
+ .b8 0
333
+ .b32 .debug_info
334
+ .b32 180
335
+ .b32 0
336
+ $L__pubTypes_end0:
337
+ }
338
+ .section .debug_loc { }
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttgir ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %c1024_i32 = arith.constant 1024 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c1024_i32 : i32
8
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
9
+ %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1>
10
+ %4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
11
+ %5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1>
12
+ %6 = arith.addi %4, %2 : tensor<1024xi32, #blocked>
13
+ %7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1>
14
+ %8 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
15
+ %9 = tt.addptr %8, %6 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
16
+ %10 = tt.load %9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
17
+ %11 = triton_gpu.convert_layout %10 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1>
18
+ %12 = arith.extf %11 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1>
19
+ %13 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked1>
20
+ %14 = tt.addptr %13, %7 : tensor<1024x!tt.ptr<f32, 1>, #blocked1>, tensor<1024xi32, #blocked1>
21
+ tt.store %14, %12 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1>
22
+ tt.return
23
+ }
24
+ }
.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c1024_i32 = arith.constant 1024 : i32
4
+ %0 = tt.get_program_id x : i32
5
+ %1 = arith.muli %0, %c1024_i32 : i32
6
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
7
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
8
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
9
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
10
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
11
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
12
+ %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
13
+ %9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
14
+ %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
15
+ tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.cubin ADDED
Binary file (32 kB). View file
 
.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ptx ADDED
@@ -0,0 +1,756 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5de6de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
23
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
26
+
27
+ .visible .entry triton__0d1d2d3d4d5de6de(
28
+ .param .u64 triton__0d1d2d3d4d5de6de_param_0,
29
+ .param .u64 triton__0d1d2d3d4d5de6de_param_1,
30
+ .param .u64 triton__0d1d2d3d4d5de6de_param_2,
31
+ .param .u64 triton__0d1d2d3d4d5de6de_param_3,
32
+ .param .u64 triton__0d1d2d3d4d5de6de_param_4,
33
+ .param .u32 triton__0d1d2d3d4d5de6de_param_5,
34
+ .param .u32 triton__0d1d2d3d4d5de6de_param_6
35
+ )
36
+ .maxntid 256, 1, 1
37
+ {
38
+ .reg .pred %p<27>;
39
+ .reg .b16 %rs<3>;
40
+ .reg .b32 %r<81>;
41
+ .reg .f32 %f<73>;
42
+ .reg .b64 %rd<84>;
43
+ .loc 1 18 0
44
+ $L__func_begin0:
45
+ .loc 1 18 0
46
+
47
+ ld.param.u64 %rd35, [triton__0d1d2d3d4d5de6de_param_3];
48
+ ld.param.u64 %rd34, [triton__0d1d2d3d4d5de6de_param_2];
49
+ ld.param.u64 %rd33, [triton__0d1d2d3d4d5de6de_param_1];
50
+ ld.param.u64 %rd41, [triton__0d1d2d3d4d5de6de_param_0];
51
+ $L__tmp0:
52
+ .loc 1 22 44
53
+ mov.u32 %r1, %tid.x;
54
+ bfe.u32 %r2, %r1, 2, 6;
55
+ and.b32 %r14, %r1, 63;
56
+ .loc 1 24 33
57
+ and.b32 %r3, %r1, 3;
58
+ .loc 1 21 28
59
+ mov.u32 %r13, %ctaid.x;
60
+ .loc 1 21 33
61
+ shl.b32 %r15, %r13, 6;
62
+ .loc 1 22 23
63
+ or.b32 %r16, %r15, %r2;
64
+ or.b32 %r17, %r15, %r14;
65
+ .loc 1 26 30
66
+ mul.wide.s32 %rd42, %r16, 8;
67
+ add.s64 %rd38, %rd41, %rd42;
68
+ mul.wide.s32 %rd43, %r17, 8;
69
+ add.s64 %rd40, %rd41, %rd43;
70
+ mov.pred %p11, -1;
71
+ .loc 1 26 35
72
+ mov.u64 %rd37, 0x0;
73
+ @%p11 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd38 + 0 ];
74
+ mov.u64 %rd39, 0x0;
75
+ @%p11 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd40 + 0 ];
76
+ .loc 1 27 18
77
+ bfe.s32 %r18, %r13, 25, 1;
78
+ shr.u32 %r19, %r18, 23;
79
+ add.s32 %r20, %r16, %r19;
80
+ and.b32 %r21, %r20, 16776704;
81
+ sub.s32 %r22, %r16, %r21;
82
+ .loc 1 35 44
83
+ shl.b32 %r5, %r22, 8;
84
+ .loc 1 36 22
85
+ add.s64 %rd44, %rd39, 50257;
86
+ .loc 1 37 22
87
+ setp.lt.s64 %p3, %rd37, 0;
88
+ setp.lt.s64 %p4, %rd39, 0;
89
+ .loc 1 38 36
90
+ selp.b64 %rd45, %rd44, %rd39, %p4;
91
+ .loc 1 39 40
92
+ setp.gt.u64 %p5, %rd45, 50256;
93
+ .loc 1 40 44
94
+ shl.b64 %rd46, %rd37, 8;
95
+ add.s64 %rd47, %rd46, 12865792;
96
+ selp.b64 %rd2, %rd47, %rd46, %p3;
97
+ mov.b32 %r67, 0;
98
+ mov.b32 %r77, 883;
99
+ mov.u64 %rd73, 1;
100
+ .loc 1 39 55
101
+ @%p5 bra $L__BB0_3;
102
+ bra.uni $L__BB0_1;
103
+ $L__BB0_3:
104
+ .loc 1 31 36
105
+ shl.b64 %rd51, %rd2, 2;
106
+ mul.wide.u32 %rd80, %r3, 4;
107
+ add.s64 %rd79, %rd51, %rd80;
108
+ add.s64 %rd75, %rd33, %rd79;
109
+ add.s32 %r35, %r5, %r3;
110
+ mul.wide.s32 %rd78, %r35, 4;
111
+ add.s64 %rd74, %rd34, %rd78;
112
+ mov.f32 %f72, 0f00000000;
113
+ mov.b32 %r78, -4;
114
+ mov.f32 %f71, %f72;
115
+ mov.f32 %f70, %f72;
116
+ $L__BB0_4:
117
+ .loc 1 35 50
118
+ mov.u32 %r36, 0x0;
119
+ @%p11 ld.global.L1::evict_last.b32 { %r36 }, [ %rd74 + 0 ];
120
+ @!%p11 mov.u32 %r36, %r67;
121
+ mov.b32 %f28, %r36;
122
+ .loc 1 39 55
123
+ mov.u64 %rd54, assertMessage_0;
124
+ cvta.global.u64 %rd55, %rd54;
125
+ mov.u64 %rd56, assertFile_0;
126
+ cvta.global.u64 %rd57, %rd56;
127
+ mov.u64 %rd58, assertFunc_0;
128
+ cvta.global.u64 %rd59, %rd58;
129
+ { // callseq 10, 0
130
+ .reg .b32 temp_param_reg;
131
+ .param .b64 param0;
132
+ st.param.b64 [param0+0], %rd55;
133
+ .param .b64 param1;
134
+ st.param.b64 [param1+0], %rd57;
135
+ .param .b32 param2;
136
+ st.param.b32 [param2+0], %r77;
137
+ .param .b64 param3;
138
+ st.param.b64 [param3+0], %rd59;
139
+ .param .b64 param4;
140
+ st.param.b64 [param4+0], %rd73;
141
+ call.uni
142
+ __assertfail,
143
+ (
144
+ param0,
145
+ param1,
146
+ param2,
147
+ param3,
148
+ param4
149
+ );
150
+ } // callseq 10
151
+ .loc 1 40 52
152
+ mov.u32 %r38, 0x0;
153
+ @%p11 ld.global.L1::evict_last.b32 { %r38 }, [ %rd75 + 0 ];
154
+ @!%p11 mov.u32 %r38, %r67;
155
+ mov.b32 %f29, %r38;
156
+ .loc 1 41 22
157
+ add.f32 %f30, %f28, %f29;
158
+ $L__tmp1:
159
+ .loc 2 96 20
160
+ sub.f32 %f31, %f30, %f70;
161
+ .loc 2 97 26
162
+ add.f32 %f72, %f72, 0f3F800000;
163
+ .loc 2 98 30
164
+ mov.b32 %r41, %f31;
165
+ mov.b32 %r42, %f72;
166
+ div.full.f32 %r40, %r41, %r42;
167
+ mov.b32 %f32, %r40;
168
+ .loc 2 98 22
169
+ add.f32 %f70, %f70, %f32;
170
+ .loc 2 101 30
171
+ sub.f32 %f33, %f30, %f70;
172
+ $L__tmp2:
173
+ .loc 1 47 48
174
+ fma.rn.f32 %f71, %f31, %f33, %f71;
175
+ .loc 1 31 36
176
+ add.s32 %r78, %r78, 4;
177
+ add.s64 %rd75, %rd75, 16;
178
+ add.s64 %rd74, %rd74, 16;
179
+ setp.lt.u32 %p15, %r78, 252;
180
+ @%p15 bra $L__BB0_4;
181
+ bra.uni $L__BB0_5;
182
+ $L__BB0_1:
183
+ .loc 1 0 36
184
+ mov.b32 %r79, -4;
185
+ .loc 1 31 36
186
+ shl.b64 %rd48, %rd2, 2;
187
+ mul.wide.u32 %rd80, %r3, 4;
188
+ add.s64 %rd79, %rd48, %rd80;
189
+ add.s64 %rd77, %rd33, %rd79;
190
+ add.s32 %r25, %r5, %r3;
191
+ mul.wide.s32 %rd78, %r25, 4;
192
+ add.s64 %rd76, %rd34, %rd78;
193
+ mov.f32 %f72, 0f00000000;
194
+ mov.f32 %f71, %f72;
195
+ mov.f32 %f70, %f72;
196
+ $L__BB0_2:
197
+ .loc 1 35 50
198
+ mov.u32 %r26, 0x0;
199
+ @%p11 ld.global.L1::evict_last.b32 { %r26 }, [ %rd76 + 0 ];
200
+ @!%p11 mov.u32 %r26, %r67;
201
+ mov.b32 %f21, %r26;
202
+ .loc 1 40 52
203
+ mov.u32 %r28, 0x0;
204
+ @%p11 ld.global.L1::evict_last.b32 { %r28 }, [ %rd77 + 0 ];
205
+ @!%p11 mov.u32 %r28, %r67;
206
+ mov.b32 %f22, %r28;
207
+ .loc 1 41 22
208
+ add.f32 %f23, %f21, %f22;
209
+ $L__tmp3:
210
+ .loc 2 96 20
211
+ sub.f32 %f24, %f23, %f70;
212
+ .loc 2 97 26
213
+ add.f32 %f72, %f72, 0f3F800000;
214
+ .loc 2 98 30
215
+ mov.b32 %r31, %f24;
216
+ mov.b32 %r32, %f72;
217
+ div.full.f32 %r30, %r31, %r32;
218
+ mov.b32 %f25, %r30;
219
+ .loc 2 98 22
220
+ add.f32 %f70, %f70, %f25;
221
+ .loc 2 101 30
222
+ sub.f32 %f26, %f23, %f70;
223
+ $L__tmp4:
224
+ .loc 1 47 48
225
+ fma.rn.f32 %f71, %f24, %f26, %f71;
226
+ .loc 1 31 36
227
+ add.s32 %r79, %r79, 4;
228
+ add.s64 %rd77, %rd77, 16;
229
+ add.s64 %rd76, %rd76, 16;
230
+ setp.lt.u32 %p10, %r79, 252;
231
+ @%p10 bra $L__BB0_2;
232
+ $L__BB0_5:
233
+ .loc 1 0 36
234
+ ld.param.u64 %rd36, [triton__0d1d2d3d4d5de6de_param_4];
235
+ $L__tmp5:
236
+ .loc 2 120 46
237
+ mov.b32 %r54, %f70;
238
+ shfl.sync.bfly.b32 %r55, %r54, 2, 31, -1;
239
+ mov.b32 %f34, %r55;
240
+ mov.b32 %r56, %f71;
241
+ shfl.sync.bfly.b32 %r57, %r56, 2, 31, -1;
242
+ mov.b32 %f35, %r57;
243
+ mov.b32 %r58, %f72;
244
+ shfl.sync.bfly.b32 %r45, %r58, 2, 31, -1;
245
+ mov.b32 %f36, %r45;
246
+ $L__tmp6:
247
+ .loc 2 108 21
248
+ sub.f32 %f37, %f34, %f70;
249
+ .loc 2 109 28
250
+ add.f32 %f38, %f72, %f36;
251
+ .loc 2 110 39
252
+ setp.eq.f32 %p16, %f38, 0f00000000;
253
+ .loc 2 110 60
254
+ mov.b32 %r46, %f38;
255
+ div.full.f32 %r44, %r45, %r46;
256
+ mov.b32 %f39, %r44;
257
+ .loc 2 110 49
258
+ selp.f32 %f40, 0f00000000, %f39, %p16;
259
+ .loc 2 112 17
260
+ fma.rn.f32 %f41, %f37, %f40, %f70;
261
+ .loc 2 113 15
262
+ add.f32 %f42, %f71, %f35;
263
+ .loc 2 113 30
264
+ mul.f32 %f43, %f37, %f37;
265
+ .loc 2 113 38
266
+ mul.f32 %f44, %f72, %f43;
267
+ .loc 2 113 22
268
+ fma.rn.f32 %f45, %f44, %f40, %f42;
269
+ $L__tmp7:
270
+ .loc 2 120 46
271
+ mov.b32 %r59, %f41;
272
+ shfl.sync.bfly.b32 %r60, %r59, 1, 31, -1;
273
+ mov.b32 %f46, %r60;
274
+ mov.b32 %r61, %f45;
275
+ shfl.sync.bfly.b32 %r62, %r61, 1, 31, -1;
276
+ mov.b32 %f47, %r62;
277
+ shfl.sync.bfly.b32 %r48, %r46, 1, 31, -1;
278
+ mov.b32 %f48, %r48;
279
+ $L__tmp8:
280
+ .loc 2 108 21
281
+ sub.f32 %f49, %f46, %f41;
282
+ .loc 2 109 28
283
+ add.f32 %f50, %f38, %f48;
284
+ .loc 2 110 39
285
+ setp.eq.f32 %p17, %f50, 0f00000000;
286
+ .loc 2 110 60
287
+ mov.b32 %r49, %f50;
288
+ div.full.f32 %r47, %r48, %r49;
289
+ mov.b32 %f51, %r47;
290
+ .loc 2 110 49
291
+ selp.f32 %f52, 0f00000000, %f51, %p17;
292
+ .loc 2 112 17
293
+ fma.rn.f32 %f16, %f49, %f52, %f41;
294
+ .loc 2 113 15
295
+ add.f32 %f53, %f45, %f47;
296
+ .loc 2 113 30
297
+ mul.f32 %f54, %f49, %f49;
298
+ .loc 2 113 38
299
+ mul.f32 %f55, %f38, %f54;
300
+ .loc 2 113 22
301
+ fma.rn.f32 %f56, %f52, %f55, %f53;
302
+ $L__tmp9:
303
+ .loc 1 69 23
304
+ mov.b32 %r51, %f56;
305
+ mov.b32 %r52, 1132462080;
306
+ div.full.f32 %r50, %r51, %r52;
307
+ mov.b32 %f57, %r50;
308
+ .loc 1 71 24
309
+ add.f32 %f17, %f57, 0f3727C5AC;
310
+ .loc 1 55 36
311
+ shl.b32 %r63, %r13, 14;
312
+ shl.b32 %r64, %r2, 8;
313
+ or.b32 %r65, %r63, %r64;
314
+ or.b32 %r10, %r65, %r3;
315
+ add.s64 %rd83, %rd33, %rd79;
316
+ add.s64 %rd82, %rd35, %rd80;
317
+ add.s64 %rd81, %rd34, %rd78;
318
+ mov.b32 %r80, -4;
319
+ setp.lt.u64 %p22, %rd45, 50257;
320
+ rsqrt.approx.ftz.f32 %f61, %f17;
321
+ bra.uni $L__BB0_6;
322
+ $L__BB0_8:
323
+ .loc 1 0 0
324
+ mov.b32 %f18, %r66;
325
+ mov.b32 %f19, %r68;
326
+ .loc 1 65 54
327
+ mov.u32 %r71, 0x0;
328
+ @%p11 ld.global.L1::evict_first.b32 { %r71 }, [ %rd83 + 0 ];
329
+ @!%p11 mov.u32 %r71, %r67;
330
+ mov.b32 %f58, %r71;
331
+ .loc 1 66 24
332
+ add.f32 %f59, %f18, %f58;
333
+ .loc 1 67 24
334
+ sub.f32 %f60, %f59, %f16;
335
+ .loc 1 73 24
336
+ mul.f32 %f62, %f60, %f61;
337
+ .loc 1 74 24
338
+ mul.f32 %f63, %f62, %f19;
339
+ .loc 1 55 36
340
+ add.s32 %r80, %r80, 4;
341
+ .loc 1 76 29
342
+ add.s32 %r74, %r80, %r10;
343
+ mul.wide.s32 %rd72, %r74, 2;
344
+ add.s64 %rd71, %rd36, %rd72;
345
+ .loc 1 76 52
346
+ mov.b32 %r73, %f63;
347
+ cvt.rn.bf16.f32 %rs1, %r73;
348
+ @%p11 st.global.b16 [ %rd71 + 0 ], { %rs1 };
349
+ .loc 1 55 36
350
+ add.s64 %rd83, %rd83, 16;
351
+ add.s64 %rd82, %rd82, 16;
352
+ add.s64 %rd81, %rd81, 16;
353
+ setp.lt.u32 %p26, %r80, 252;
354
+ @%p26 bra $L__BB0_6;
355
+ bra.uni $L__BB0_9;
356
+ $L__BB0_6:
357
+ .loc 1 59 51
358
+ mov.u32 %r66, 0x0;
359
+ @%p11 ld.global.L1::evict_last.b32 { %r66 }, [ %rd81 + 0 ];
360
+ @!%p11 mov.u32 %r66, %r67;
361
+ .loc 1 60 40
362
+ mov.u32 %r68, 0x0;
363
+ @%p11 ld.global.L1::evict_last.b32 { %r68 }, [ %rd82 + 0 ];
364
+ @!%p11 mov.u32 %r68, %r67;
365
+ .loc 1 64 57
366
+ @%p22 bra $L__BB0_8;
367
+ mov.u64 %rd63, assertMessage_1;
368
+ cvta.global.u64 %rd64, %rd63;
369
+ mov.u64 %rd65, assertFile_1;
370
+ cvta.global.u64 %rd66, %rd65;
371
+ mov.u64 %rd67, assertFunc_1;
372
+ cvta.global.u64 %rd68, %rd67;
373
+ { // callseq 11, 0
374
+ .reg .b32 temp_param_reg;
375
+ .param .b64 param0;
376
+ st.param.b64 [param0+0], %rd64;
377
+ .param .b64 param1;
378
+ st.param.b64 [param1+0], %rd66;
379
+ .param .b32 param2;
380
+ st.param.b32 [param2+0], %r77;
381
+ .param .b64 param3;
382
+ st.param.b64 [param3+0], %rd68;
383
+ .param .b64 param4;
384
+ st.param.b64 [param4+0], %rd73;
385
+ call.uni
386
+ __assertfail,
387
+ (
388
+ param0,
389
+ param1,
390
+ param2,
391
+ param3,
392
+ param4
393
+ );
394
+ } // callseq 11
395
+ bra.uni $L__BB0_8;
396
+ $L__BB0_9:
397
+ .loc 1 55 4
398
+ ret;
399
+ $L__tmp10:
400
+ $L__func_end0:
401
+
402
+ }
403
+ // .globl __nv_rsqrtf
404
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
405
+ .param .b32 __nv_rsqrtf_param_0
406
+ )
407
+ {
408
+ .reg .f32 %f<3>;
409
+ $L__func_begin1:
410
+
411
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
412
+ rsqrt.approx.ftz.f32 %f2, %f1;
413
+ st.param.f32 [func_retval0+0], %f2;
414
+ ret;
415
+ $L__func_end1:
416
+
417
+ }
418
+ .file 1 "/tmp/torchinductor_root/gx/cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py"
419
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
420
+ .section .debug_abbrev
421
+ {
422
+ .b8 1
423
+ .b8 17
424
+ .b8 1
425
+ .b8 37
426
+ .b8 8
427
+ .b8 19
428
+ .b8 5
429
+ .b8 3
430
+ .b8 8
431
+ .b8 16
432
+ .b8 6
433
+ .b8 27
434
+ .b8 8
435
+ .b8 180
436
+ .b8 66
437
+ .b8 12
438
+ .b8 17
439
+ .b8 1
440
+ .b8 18
441
+ .b8 1
442
+ .b8 0
443
+ .b8 0
444
+ .b8 2
445
+ .b8 46
446
+ .b8 0
447
+ .b8 135
448
+ .b8 64
449
+ .b8 8
450
+ .b8 3
451
+ .b8 8
452
+ .b8 58
453
+ .b8 11
454
+ .b8 59
455
+ .b8 11
456
+ .b8 63
457
+ .b8 12
458
+ .b8 32
459
+ .b8 11
460
+ .b8 0
461
+ .b8 0
462
+ .b8 3
463
+ .b8 46
464
+ .b8 1
465
+ .b8 17
466
+ .b8 1
467
+ .b8 18
468
+ .b8 1
469
+ .b8 64
470
+ .b8 10
471
+ .b8 49
472
+ .b8 19
473
+ .b8 0
474
+ .b8 0
475
+ .b8 4
476
+ .b8 29
477
+ .b8 0
478
+ .b8 49
479
+ .b8 19
480
+ .b8 17
481
+ .b8 1
482
+ .b8 18
483
+ .b8 1
484
+ .b8 88
485
+ .b8 11
486
+ .b8 89
487
+ .b8 11
488
+ .b8 87
489
+ .b8 11
490
+ .b8 0
491
+ .b8 0
492
+ .b8 5
493
+ .b8 29
494
+ .b8 1
495
+ .b8 49
496
+ .b8 19
497
+ .b8 17
498
+ .b8 1
499
+ .b8 18
500
+ .b8 1
501
+ .b8 88
502
+ .b8 11
503
+ .b8 89
504
+ .b8 11
505
+ .b8 87
506
+ .b8 11
507
+ .b8 0
508
+ .b8 0
509
+ .b8 0
510
+ }
511
+ .section .debug_info
512
+ {
513
+ .b32 298
514
+ .b8 2
515
+ .b8 0
516
+ .b32 .debug_abbrev
517
+ .b8 8
518
+ .b8 1
519
+ .b8 116
520
+ .b8 114
521
+ .b8 105
522
+ .b8 116
523
+ .b8 111
524
+ .b8 110
525
+ .b8 0
526
+ .b8 2
527
+ .b8 0
528
+ .b8 99
529
+ .b8 103
530
+ .b8 120
531
+ .b8 53
532
+ .b8 108
533
+ .b8 120
534
+ .b8 112
535
+ .b8 117
536
+ .b8 101
537
+ .b8 120
538
+ .b8 112
539
+ .b8 105
540
+ .b8 110
541
+ .b8 100
542
+ .b8 106
543
+ .b8 52
544
+ .b8 100
545
+ .b8 115
546
+ .b8 109
547
+ .b8 106
548
+ .b8 122
549
+ .b8 53
550
+ .b8 120
551
+ .b8 52
552
+ .b8 50
553
+ .b8 117
554
+ .b8 104
555
+ .b8 121
556
+ .b8 121
557
+ .b8 55
558
+ .b8 105
559
+ .b8 115
560
+ .b8 107
561
+ .b8 101
562
+ .b8 118
563
+ .b8 113
564
+ .b8 55
565
+ .b8 111
566
+ .b8 118
567
+ .b8 122
568
+ .b8 112
569
+ .b8 119
570
+ .b8 97
571
+ .b8 103
572
+ .b8 98
573
+ .b8 51
574
+ .b8 116
575
+ .b8 53
576
+ .b8 112
577
+ .b8 111
578
+ .b8 119
579
+ .b8 106
580
+ .b8 46
581
+ .b8 112
582
+ .b8 121
583
+ .b8 0
584
+ .b32 .debug_line
585
+ .b8 47
586
+ .b8 116
587
+ .b8 109
588
+ .b8 112
589
+ .b8 47
590
+ .b8 116
591
+ .b8 111
592
+ .b8 114
593
+ .b8 99
594
+ .b8 104
595
+ .b8 105
596
+ .b8 110
597
+ .b8 100
598
+ .b8 117
599
+ .b8 99
600
+ .b8 116
601
+ .b8 111
602
+ .b8 114
603
+ .b8 95
604
+ .b8 114
605
+ .b8 111
606
+ .b8 111
607
+ .b8 116
608
+ .b8 47
609
+ .b8 103
610
+ .b8 120
611
+ .b8 0
612
+ .b8 1
613
+ .b64 $L__func_begin0
614
+ .b64 $L__func_end0
615
+ .b8 2
616
+ .b8 116
617
+ .b8 114
618
+ .b8 105
619
+ .b8 116
620
+ .b8 111
621
+ .b8 110
622
+ .b8 95
623
+ .b8 95
624
+ .b8 48
625
+ .b8 100
626
+ .b8 49
627
+ .b8 100
628
+ .b8 50
629
+ .b8 100
630
+ .b8 51
631
+ .b8 100
632
+ .b8 52
633
+ .b8 100
634
+ .b8 53
635
+ .b8 100
636
+ .b8 101
637
+ .b8 54
638
+ .b8 100
639
+ .b8 101
640
+ .b8 0
641
+ .b8 116
642
+ .b8 114
643
+ .b8 105
644
+ .b8 116
645
+ .b8 111
646
+ .b8 110
647
+ .b8 95
648
+ .b8 95
649
+ .b8 48
650
+ .b8 100
651
+ .b8 49
652
+ .b8 100
653
+ .b8 50
654
+ .b8 100
655
+ .b8 51
656
+ .b8 100
657
+ .b8 52
658
+ .b8 100
659
+ .b8 53
660
+ .b8 100
661
+ .b8 101
662
+ .b8 54
663
+ .b8 100
664
+ .b8 101
665
+ .b8 0
666
+ .b8 1
667
+ .b8 18
668
+ .b8 1
669
+ .b8 1
670
+ .b8 3
671
+ .b64 $L__func_begin0
672
+ .b64 $L__func_end0
673
+ .b8 1
674
+ .b8 156
675
+ .b32 125
676
+ .b8 4
677
+ .b32 125
678
+ .b64 $L__tmp1
679
+ .b64 $L__tmp4
680
+ .b8 2
681
+ .b8 44
682
+ .b8 38
683
+ .b8 4
684
+ .b32 125
685
+ .b64 $L__tmp5
686
+ .b64 $L__tmp8
687
+ .b8 2
688
+ .b8 50
689
+ .b8 41
690
+ .b8 5
691
+ .b32 125
692
+ .b64 $L__tmp6
693
+ .b64 $L__tmp9
694
+ .b8 2
695
+ .b8 50
696
+ .b8 41
697
+ .b8 4
698
+ .b32 125
699
+ .b64 $L__tmp6
700
+ .b64 $L__tmp9
701
+ .b8 2
702
+ .b8 120
703
+ .b8 46
704
+ .b8 0
705
+ .b8 0
706
+ .b8 0
707
+ }
708
+ .section .debug_pubnames
709
+ {
710
+ .b32 $L__pubNames_end0-$L__pubNames_start0
711
+ $L__pubNames_start0:
712
+ .b8 2
713
+ .b8 0
714
+ .b32 .debug_info
715
+ .b32 302
716
+ .b32 125
717
+ .b8 116
718
+ .b8 114
719
+ .b8 105
720
+ .b8 116
721
+ .b8 111
722
+ .b8 110
723
+ .b8 95
724
+ .b8 95
725
+ .b8 48
726
+ .b8 100
727
+ .b8 49
728
+ .b8 100
729
+ .b8 50
730
+ .b8 100
731
+ .b8 51
732
+ .b8 100
733
+ .b8 52
734
+ .b8 100
735
+ .b8 53
736
+ .b8 100
737
+ .b8 101
738
+ .b8 54
739
+ .b8 100
740
+ .b8 101
741
+ .b8 0
742
+ .b32 0
743
+ $L__pubNames_end0:
744
+ }
745
+ .section .debug_pubtypes
746
+ {
747
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
748
+ $L__pubTypes_start0:
749
+ .b8 2
750
+ .b8 0
751
+ .b32 .debug_info
752
+ .b32 302
753
+ .b32 0
754
+ $L__pubTypes_end0:
755
+ }
756
+ .section .debug_loc { }
.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttgir ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
6
+ %cst_0 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
7
+ %cst_1 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
8
+ %cst_2 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
9
+ %cst_3 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
10
+ %cst_4 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
11
+ %cst_5 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
12
+ %c0_i32 = arith.constant 0 : i32
13
+ %c4_i32 = arith.constant 4 : i32
14
+ %c256_i32 = arith.constant 256 : i32
15
+ %cst_6 = arith.constant dense<1.000000e+00> : tensor<64x4xf32, #blocked>
16
+ %cst_7 = arith.constant 0.000000e+00 : f32
17
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<1x4xf32, #blocked>
18
+ %cst_9 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked>
19
+ %cst_10 = arith.constant dense<256> : tensor<1x4xi32, #blocked>
20
+ %cst_11 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
21
+ %cst_12 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
22
+ %c64_i32 = arith.constant 64 : i32
23
+ %0 = tt.get_program_id x : i32
24
+ %1 = arith.muli %0, %c64_i32 : i32
25
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
26
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
27
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
28
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
29
+ %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
30
+ %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
31
+ %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
32
+ %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
33
+ %10 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
34
+ %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x4xi32, #blocked>
35
+ %12 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
36
+ %13 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
37
+ %14 = tt.addptr %12, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
38
+ %15 = tt.addptr %13, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
39
+ %16 = tt.load %14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
40
+ %17 = tt.load %15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
41
+ %18 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
42
+ %19 = arith.muli %18, %cst_0 : tensor<64x1xi32, #blocked>
43
+ %20 = tt.broadcast %19 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked>
44
+ %21 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
45
+ %22 = arith.addi %16, %cst_3 : tensor<64x1xi64, #blocked>
46
+ %23 = arith.addi %17, %cst_4 : tensor<64x1xi64, #blocked1>
47
+ %24 = arith.cmpi slt, %16, %cst_2 : tensor<64x1xi64, #blocked>
48
+ %25 = arith.cmpi slt, %17, %cst_5 : tensor<64x1xi64, #blocked1>
49
+ %26 = arith.select %24, %22, %16 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
50
+ %27 = arith.select %25, %23, %17 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
51
+ %28 = arith.cmpi sge, %27, %cst_5 : tensor<64x1xi64, #blocked1>
52
+ %29 = arith.cmpi slt, %27, %cst_4 : tensor<64x1xi64, #blocked1>
53
+ %30 = arith.andi %28, %29 : tensor<64x1xi1, #blocked1>
54
+ %31 = arith.muli %26, %cst_1 : tensor<64x1xi64, #blocked>
55
+ %32 = tt.broadcast %31 : (tensor<64x1xi64, #blocked>) -> tensor<64x4xi64, #blocked>
56
+ %33 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
57
+ %34:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg8 = %cst_9, %arg9 = %cst_9, %arg10 = %cst_9) -> (tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>) : i32 {
58
+ %45 = tt.splat %arg7 : (i32) -> tensor<1x4xi32, #blocked>
59
+ %46 = arith.addi %45, %11 : tensor<1x4xi32, #blocked>
60
+ %47 = arith.cmpi slt, %46, %cst_10 : tensor<1x4xi32, #blocked>
61
+ %48 = tt.broadcast %46 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked>
62
+ %49 = arith.addi %48, %20 : tensor<64x4xi32, #blocked>
63
+ %50 = tt.addptr %21, %49 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi32, #blocked>
64
+ %51 = tt.broadcast %47 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
65
+ %52 = tt.load %50, %51, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
66
+ tt.assert %30, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
67
+ %53 = arith.extsi %46 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked>
68
+ %54 = tt.broadcast %53 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
69
+ %55 = arith.addi %54, %32 : tensor<64x4xi64, #blocked>
70
+ %56 = tt.addptr %33, %55 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi64, #blocked>
71
+ %57 = tt.load %56, %51, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
72
+ %58 = arith.addf %57, %52 : tensor<64x4xf32, #blocked>
73
+ %59 = arith.subf %58, %arg8 : tensor<64x4xf32, #blocked>
74
+ %60 = arith.addf %arg10, %cst_6 : tensor<64x4xf32, #blocked>
75
+ %61 = arith.divf %59, %60 : tensor<64x4xf32, #blocked>
76
+ %62 = arith.addf %arg8, %61 : tensor<64x4xf32, #blocked>
77
+ %63 = arith.subf %58, %62 : tensor<64x4xf32, #blocked>
78
+ %64 = arith.mulf %59, %63 : tensor<64x4xf32, #blocked>
79
+ %65 = arith.addf %arg9, %64 : tensor<64x4xf32, #blocked>
80
+ %66 = arith.select %51, %62, %arg8 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
81
+ %67 = arith.select %51, %65, %arg9 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
82
+ %68 = arith.select %51, %60, %arg10 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
83
+ scf.yield %66, %67, %68 : tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>
84
+ }
85
+ %35:3 = "tt.reduce"(%34#0, %34#1, %34#2) <{axis = 1 : i32}> ({
86
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
87
+ %45 = arith.subf %arg10, %arg7 : f32
88
+ %46 = arith.addf %arg9, %arg12 : f32
89
+ %47 = arith.cmpf oeq, %46, %cst_7 : f32
90
+ %48 = arith.divf %arg12, %46 : f32
91
+ %49 = arith.select %47, %cst_7, %48 : f32
92
+ %50 = arith.mulf %45, %49 : f32
93
+ %51 = arith.addf %arg7, %50 : f32
94
+ %52 = arith.addf %arg8, %arg11 : f32
95
+ %53 = arith.mulf %45, %45 : f32
96
+ %54 = arith.mulf %53, %arg9 : f32
97
+ %55 = arith.mulf %54, %49 : f32
98
+ %56 = arith.addf %52, %55 : f32
99
+ tt.reduce.return %51, %56, %46 : f32, f32, f32
100
+ }) : (tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
101
+ %36 = tt.expand_dims %35#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
102
+ %37 = tt.expand_dims %35#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
103
+ %38 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x4x!tt.ptr<f32, 1>, #blocked>
104
+ %39 = tt.broadcast %36 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
105
+ %40 = arith.divf %37, %cst_12 : tensor<64x1xf32, #blocked>
106
+ %41 = arith.addf %40, %cst_11 : tensor<64x1xf32, #blocked>
107
+ %42 = arith.muli %8, %cst_0 : tensor<64x1xi32, #blocked>
108
+ %43 = tt.broadcast %42 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked>
109
+ %44 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>, #blocked>
110
+ scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 : i32 {
111
+ %45 = tt.splat %arg7 : (i32) -> tensor<1x4xi32, #blocked>
112
+ %46 = arith.addi %45, %11 : tensor<1x4xi32, #blocked>
113
+ %47 = arith.cmpi slt, %46, %cst_10 : tensor<1x4xi32, #blocked>
114
+ %48 = tt.broadcast %46 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked>
115
+ %49 = arith.addi %48, %20 : tensor<64x4xi32, #blocked>
116
+ %50 = tt.addptr %21, %49 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi32, #blocked>
117
+ %51 = tt.broadcast %47 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
118
+ %52 = tt.load %50, %51, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
119
+ %53 = tt.addptr %38, %46 : tensor<1x4x!tt.ptr<f32, 1>, #blocked>, tensor<1x4xi32, #blocked>
120
+ %54 = tt.load %53, %47, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32, #blocked>
121
+ tt.assert %30, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
122
+ %55 = arith.extsi %46 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked>
123
+ %56 = tt.broadcast %55 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
124
+ %57 = arith.addi %56, %32 : tensor<64x4xi64, #blocked>
125
+ %58 = tt.addptr %33, %57 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi64, #blocked>
126
+ %59 = tt.load %58, %51, %cst_9 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
127
+ %60 = arith.addf %59, %52 : tensor<64x4xf32, #blocked>
128
+ %61 = arith.subf %60, %39 : tensor<64x4xf32, #blocked>
129
+ %62 = tt.extern_elementwise %41 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
130
+ %63 = tt.broadcast %62 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
131
+ %64 = arith.mulf %61, %63 : tensor<64x4xf32, #blocked>
132
+ %65 = tt.broadcast %54 : (tensor<1x4xf32, #blocked>) -> tensor<64x4xf32, #blocked>
133
+ %66 = arith.mulf %64, %65 : tensor<64x4xf32, #blocked>
134
+ %67 = arith.addi %48, %43 : tensor<64x4xi32, #blocked>
135
+ %68 = tt.addptr %44, %67 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi32, #blocked>
136
+ %69 = arith.truncf %66 : tensor<64x4xf32, #blocked> to tensor<64x4xbf16, #blocked>
137
+ tt.store %68, %69, %51 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16, #blocked>
138
+ }
139
+ tt.return
140
+ }
141
+ }
.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttir ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant 0.000000e+00 : f32
4
+ %cst_0 = arith.constant dense<1.000000e+00> : tensor<64x4xf32>
5
+ %c256_i32 = arith.constant 256 : i32
6
+ %c4_i32 = arith.constant 4 : i32
7
+ %c0_i32 = arith.constant 0 : i32
8
+ %cst_1 = arith.constant dense<256> : tensor<64x1xi64>
9
+ %cst_2 = arith.constant dense<0> : tensor<64x1xi64>
10
+ %cst_3 = arith.constant dense<50257> : tensor<64x1xi64>
11
+ %cst_4 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
12
+ %cst_5 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
13
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x4xf32>
14
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x4xf32>
15
+ %cst_8 = arith.constant dense<256> : tensor<64x1xi32>
16
+ %cst_9 = arith.constant dense<256> : tensor<1x4xi32>
17
+ %cst_10 = arith.constant dense<512> : tensor<64x1xi32>
18
+ %c64_i32 = arith.constant 64 : i32
19
+ %0 = tt.get_program_id x : i32
20
+ %1 = arith.muli %0, %c64_i32 : i32
21
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
22
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
23
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
24
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
25
+ %6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
26
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32>
27
+ %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
28
+ %9 = tt.addptr %8, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
29
+ %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
30
+ %11 = arith.remsi %5, %cst_10 : tensor<64x1xi32>
31
+ %12 = arith.muli %11, %cst_8 : tensor<64x1xi32>
32
+ %13 = tt.broadcast %12 : (tensor<64x1xi32>) -> tensor<64x4xi32>
33
+ %14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
34
+ %15 = arith.addi %10, %cst_3 : tensor<64x1xi64>
35
+ %16 = arith.cmpi slt, %10, %cst_2 : tensor<64x1xi64>
36
+ %17 = arith.select %16, %15, %10 : tensor<64x1xi1>, tensor<64x1xi64>
37
+ %18 = arith.cmpi sge, %17, %cst_2 : tensor<64x1xi64>
38
+ %19 = arith.cmpi slt, %17, %cst_3 : tensor<64x1xi64>
39
+ %20 = arith.andi %18, %19 : tensor<64x1xi1>
40
+ %21 = arith.muli %17, %cst_1 : tensor<64x1xi64>
41
+ %22 = tt.broadcast %21 : (tensor<64x1xi64>) -> tensor<64x4xi64>
42
+ %23 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
43
+ %24:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg8 = %cst_7, %arg9 = %cst_7, %arg10 = %cst_7) -> (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) : i32 {
44
+ %47 = tt.splat %arg7 : (i32) -> tensor<1x4xi32>
45
+ %48 = arith.addi %47, %7 : tensor<1x4xi32>
46
+ %49 = arith.cmpi slt, %48, %cst_9 : tensor<1x4xi32>
47
+ %50 = tt.broadcast %48 : (tensor<1x4xi32>) -> tensor<64x4xi32>
48
+ %51 = arith.addi %50, %13 : tensor<64x4xi32>
49
+ %52 = tt.addptr %14, %51 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
50
+ %53 = tt.broadcast %49 : (tensor<1x4xi1>) -> tensor<64x4xi1>
51
+ %54 = tt.load %52, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
52
+ tt.assert %20, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
53
+ %55 = arith.extsi %48 : tensor<1x4xi32> to tensor<1x4xi64>
54
+ %56 = tt.broadcast %55 : (tensor<1x4xi64>) -> tensor<64x4xi64>
55
+ %57 = arith.addi %56, %22 : tensor<64x4xi64>
56
+ %58 = tt.addptr %23, %57 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
57
+ %59 = tt.load %58, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
58
+ %60 = arith.addf %59, %54 : tensor<64x4xf32>
59
+ %61 = arith.subf %60, %arg8 : tensor<64x4xf32>
60
+ %62 = arith.addf %arg10, %cst_0 : tensor<64x4xf32>
61
+ %63 = arith.divf %61, %62 : tensor<64x4xf32>
62
+ %64 = arith.addf %arg8, %63 : tensor<64x4xf32>
63
+ %65 = arith.subf %60, %64 : tensor<64x4xf32>
64
+ %66 = arith.mulf %61, %65 : tensor<64x4xf32>
65
+ %67 = arith.addf %arg9, %66 : tensor<64x4xf32>
66
+ %68 = arith.select %53, %64, %arg8 : tensor<64x4xi1>, tensor<64x4xf32>
67
+ %69 = arith.select %53, %67, %arg9 : tensor<64x4xi1>, tensor<64x4xf32>
68
+ %70 = arith.select %53, %62, %arg10 : tensor<64x4xi1>, tensor<64x4xf32>
69
+ scf.yield %68, %69, %70 : tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>
70
+ }
71
+ %25:3 = "tt.reduce"(%24#0, %24#1, %24#2) <{axis = 1 : i32}> ({
72
+ ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
73
+ %47 = arith.subf %arg10, %arg7 : f32
74
+ %48 = arith.addf %arg9, %arg12 : f32
75
+ %49 = arith.cmpf oeq, %48, %cst : f32
76
+ %50 = arith.divf %arg12, %48 : f32
77
+ %51 = arith.select %49, %cst, %50 : f32
78
+ %52 = arith.mulf %47, %51 : f32
79
+ %53 = arith.addf %arg7, %52 : f32
80
+ %54 = arith.addf %arg8, %arg11 : f32
81
+ %55 = arith.mulf %47, %47 : f32
82
+ %56 = arith.mulf %55, %arg9 : f32
83
+ %57 = arith.mulf %56, %51 : f32
84
+ %58 = arith.addf %54, %57 : f32
85
+ tt.reduce.return %53, %58, %48 : f32, f32, f32
86
+ }) : (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
87
+ %26 = tt.expand_dims %25#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
88
+ %27 = tt.expand_dims %25#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
89
+ %28 = arith.muli %11, %cst_8 : tensor<64x1xi32>
90
+ %29 = tt.broadcast %28 : (tensor<64x1xi32>) -> tensor<64x4xi32>
91
+ %30 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
92
+ %31 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x4x!tt.ptr<f32, 1>>
93
+ %32 = arith.addi %10, %cst_3 : tensor<64x1xi64>
94
+ %33 = arith.cmpi slt, %10, %cst_2 : tensor<64x1xi64>
95
+ %34 = arith.select %33, %32, %10 : tensor<64x1xi1>, tensor<64x1xi64>
96
+ %35 = arith.cmpi sge, %34, %cst_2 : tensor<64x1xi64>
97
+ %36 = arith.cmpi slt, %34, %cst_3 : tensor<64x1xi64>
98
+ %37 = arith.andi %35, %36 : tensor<64x1xi1>
99
+ %38 = arith.muli %34, %cst_1 : tensor<64x1xi64>
100
+ %39 = tt.broadcast %38 : (tensor<64x1xi64>) -> tensor<64x4xi64>
101
+ %40 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
102
+ %41 = tt.broadcast %26 : (tensor<64x1xf32>) -> tensor<64x4xf32>
103
+ %42 = arith.divf %27, %cst_5 : tensor<64x1xf32>
104
+ %43 = arith.addf %42, %cst_4 : tensor<64x1xf32>
105
+ %44 = arith.muli %5, %cst_8 : tensor<64x1xi32>
106
+ %45 = tt.broadcast %44 : (tensor<64x1xi32>) -> tensor<64x4xi32>
107
+ %46 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
108
+ scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 : i32 {
109
+ %47 = tt.splat %arg7 : (i32) -> tensor<1x4xi32>
110
+ %48 = arith.addi %47, %7 : tensor<1x4xi32>
111
+ %49 = arith.cmpi slt, %48, %cst_9 : tensor<1x4xi32>
112
+ %50 = tt.broadcast %48 : (tensor<1x4xi32>) -> tensor<64x4xi32>
113
+ %51 = arith.addi %50, %29 : tensor<64x4xi32>
114
+ %52 = tt.addptr %30, %51 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
115
+ %53 = tt.broadcast %49 : (tensor<1x4xi1>) -> tensor<64x4xi1>
116
+ %54 = tt.load %52, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
117
+ %55 = tt.addptr %31, %48 : tensor<1x4x!tt.ptr<f32, 1>>, tensor<1x4xi32>
118
+ %56 = tt.load %55, %49, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32>
119
+ tt.assert %37, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
120
+ %57 = arith.extsi %48 : tensor<1x4xi32> to tensor<1x4xi64>
121
+ %58 = tt.broadcast %57 : (tensor<1x4xi64>) -> tensor<64x4xi64>
122
+ %59 = arith.addi %58, %39 : tensor<64x4xi64>
123
+ %60 = tt.addptr %40, %59 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
124
+ %61 = tt.load %60, %53, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32>
125
+ %62 = arith.addf %61, %54 : tensor<64x4xf32>
126
+ %63 = arith.subf %62, %41 : tensor<64x4xf32>
127
+ %64 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
128
+ %65 = tt.broadcast %64 : (tensor<64x1xf32>) -> tensor<64x4xf32>
129
+ %66 = arith.mulf %63, %65 : tensor<64x4xf32>
130
+ %67 = tt.broadcast %56 : (tensor<1x4xf32>) -> tensor<64x4xf32>
131
+ %68 = arith.mulf %66, %67 : tensor<64x4xf32>
132
+ %69 = arith.addi %50, %45 : tensor<64x4xi32>
133
+ %70 = tt.addptr %46, %69 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi32>
134
+ %71 = arith.truncf %68 : tensor<64x4xf32> to tensor<64x4xbf16>
135
+ tt.store %70, %71, %53 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16>
136
+ }
137
+ tt.return
138
+ }
139
+ }
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.llir ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3de4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
7
+ %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %7 = and i32 %6, 31, !dbg !8
9
+ %8 = lshr i32 %6, 5, !dbg !8
10
+ %9 = shl i32 %6, 2, !dbg !8
11
+ %10 = and i32 %9, 60, !dbg !8
12
+ %11 = and i32 %8, 3, !dbg !9
13
+ %12 = lshr i32 %7, 4, !dbg !9
14
+ %13 = shl nuw nsw i32 %11, 1, !dbg !9
15
+ %14 = or i32 %13, %12, !dbg !9
16
+ %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
17
+ %16 = shl i32 %15, 6, !dbg !11
18
+ %17 = or i32 %16, %10, !dbg !12
19
+ %.frozen = freeze i32 %17
20
+ %18 = sdiv i32 %.frozen, 256, !dbg !13
21
+ %19 = mul i32 %18, 256
22
+ %.decomposed = sub i32 %.frozen, %19
23
+ %20 = shl i32 %18, 15, !dbg !14
24
+ %21 = add i32 %20, %.decomposed
25
+ br label %22, !dbg !15
26
+
27
+ 22: ; preds = %5, %22
28
+ %23 = phi i32 [ 0, %5 ], [ %53, %22 ]
29
+ %24 = phi <4 x float> [ zeroinitializer, %5 ], [ %52, %22 ]
30
+ %25 = or i32 %23, %14, !dbg !16
31
+ %26 = shl i32 %25, 8, !dbg !17
32
+ %27 = add i32 %21, %26, !dbg !18
33
+ %28 = sext i32 %27 to i64, !dbg !19
34
+ %29 = getelementptr float, ptr addrspace(1) %0, i64 %28, !dbg !19
35
+ %30 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %29, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20
36
+ %31 = extractvalue { i32, i32, i32, i32 } %30, 0, !dbg !20
37
+ %32 = extractvalue { i32, i32, i32, i32 } %30, 1, !dbg !20
38
+ %33 = extractvalue { i32, i32, i32, i32 } %30, 2, !dbg !20
39
+ %34 = extractvalue { i32, i32, i32, i32 } %30, 3, !dbg !20
40
+ %35 = getelementptr float, ptr addrspace(1) %1, i64 %28, !dbg !21
41
+ %36 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !22
42
+ %37 = extractvalue { i32, i32, i32, i32 } %36, 0, !dbg !22
43
+ %38 = extractvalue { i32, i32, i32, i32 } %36, 1, !dbg !22
44
+ %39 = extractvalue { i32, i32, i32, i32 } %36, 2, !dbg !22
45
+ %40 = extractvalue { i32, i32, i32, i32 } %36, 3, !dbg !22
46
+ %41 = insertelement <4 x i32> poison, i32 %31, i64 0, !dbg !20
47
+ %42 = insertelement <4 x i32> %41, i32 %32, i64 1, !dbg !20
48
+ %43 = insertelement <4 x i32> %42, i32 %33, i64 2, !dbg !20
49
+ %44 = insertelement <4 x i32> %43, i32 %34, i64 3, !dbg !20
50
+ %45 = bitcast <4 x i32> %44 to <4 x float>, !dbg !20
51
+ %46 = insertelement <4 x i32> poison, i32 %37, i64 0, !dbg !22
52
+ %47 = insertelement <4 x i32> %46, i32 %38, i64 1, !dbg !22
53
+ %48 = insertelement <4 x i32> %47, i32 %39, i64 2, !dbg !22
54
+ %49 = insertelement <4 x i32> %48, i32 %40, i64 3, !dbg !22
55
+ %50 = bitcast <4 x i32> %49 to <4 x float>, !dbg !22
56
+ %51 = fmul <4 x float> %45, %50, !dbg !23
57
+ %52 = fadd <4 x float> %24, %51, !dbg !24
58
+ %53 = add nuw nsw i32 %23, 8, !dbg !15
59
+ %54 = icmp ult i32 %23, 120, !dbg !15
60
+ br i1 %54, label %22, label %55, !dbg !15
61
+
62
+ 55: ; preds = %22
63
+ %56 = and i32 %6, 63, !dbg !8
64
+ %57 = or i32 %16, %56, !dbg !12
65
+ %58 = or i32 %10, 3, !dbg !25
66
+ %59 = or i32 %10, 2, !dbg !25
67
+ %60 = or i32 %10, 1, !dbg !25
68
+ %61 = extractelement <4 x float> %52, i64 0, !dbg !25
69
+ %62 = bitcast float %61 to i32, !dbg !25
70
+ %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 16, i32 31), !dbg !25
71
+ %64 = bitcast i32 %63 to float, !dbg !25
72
+ %65 = fadd float %61, %64, !dbg !29
73
+ %66 = extractelement <4 x float> %52, i64 1, !dbg !25
74
+ %67 = bitcast float %66 to i32, !dbg !25
75
+ %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 16, i32 31), !dbg !25
76
+ %69 = bitcast i32 %68 to float, !dbg !25
77
+ %70 = fadd float %66, %69, !dbg !29
78
+ %71 = extractelement <4 x float> %52, i64 2, !dbg !25
79
+ %72 = bitcast float %71 to i32, !dbg !25
80
+ %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 16, i32 31), !dbg !25
81
+ %74 = bitcast i32 %73 to float, !dbg !25
82
+ %75 = fadd float %71, %74, !dbg !29
83
+ %76 = extractelement <4 x float> %52, i64 3, !dbg !25
84
+ %77 = bitcast float %76 to i32, !dbg !25
85
+ %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 16, i32 31), !dbg !25
86
+ %79 = bitcast i32 %78 to float, !dbg !25
87
+ %80 = fadd float %76, %79, !dbg !29
88
+ %81 = icmp ult i32 %7, 16, !dbg !25
89
+ %82 = shl nuw nsw i32 %10, 2, !dbg !25
90
+ %83 = or i32 %82, %11, !dbg !25
91
+ %84 = zext nneg i32 %83 to i64, !dbg !25
92
+ %85 = getelementptr float, ptr addrspace(3) @global_smem, i64 %84, !dbg !25
93
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, float %65, i1 %81) #3, !dbg !25
94
+ %86 = shl nuw nsw i32 %60, 2, !dbg !25
95
+ %87 = or i32 %86, %11, !dbg !25
96
+ %88 = zext nneg i32 %87 to i64, !dbg !25
97
+ %89 = getelementptr float, ptr addrspace(3) @global_smem, i64 %88, !dbg !25
98
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %89, float %70, i1 %81) #3, !dbg !25
99
+ %90 = shl nuw nsw i32 %59, 2, !dbg !25
100
+ %91 = or i32 %90, %11, !dbg !25
101
+ %92 = zext nneg i32 %91 to i64, !dbg !25
102
+ %93 = getelementptr float, ptr addrspace(3) @global_smem, i64 %92, !dbg !25
103
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %93, float %75, i1 %81) #3, !dbg !25
104
+ %94 = shl nuw nsw i32 %58, 2, !dbg !25
105
+ %95 = or i32 %94, %11, !dbg !25
106
+ %96 = zext nneg i32 %95 to i64, !dbg !25
107
+ %97 = getelementptr float, ptr addrspace(3) @global_smem, i64 %96, !dbg !25
108
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %97, float %80, i1 %81) #3, !dbg !25
109
+ tail call void @llvm.nvvm.barrier0(), !dbg !25
110
+ %98 = icmp slt i32 %6, 256, !dbg !25
111
+ %99 = sext i32 %6 to i64, !dbg !25
112
+ %100 = getelementptr float, ptr addrspace(3) @global_smem, i64 %99, !dbg !25
113
+ %101 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %100, i1 %98) #3, !dbg !25
114
+ %102 = bitcast float %101 to i32, !dbg !25
115
+ %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 2, i32 31), !dbg !25
116
+ %104 = bitcast i32 %103 to float, !dbg !25
117
+ %105 = fadd float %101, %104, !dbg !29
118
+ %106 = bitcast float %105 to i32, !dbg !25
119
+ %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 1, i32 31), !dbg !25
120
+ %108 = bitcast i32 %107 to float, !dbg !25
121
+ %109 = fadd float %105, %108, !dbg !29
122
+ %110 = and i32 %6, 3, !dbg !25
123
+ %111 = icmp eq i32 %110, 0, !dbg !25
124
+ %112 = and i1 %98, %111, !dbg !25
125
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %109, i1 %112) #3, !dbg !25
126
+ %113 = add i32 %6, 128, !dbg !25
127
+ %114 = sext i32 %113 to i64, !dbg !25
128
+ %115 = getelementptr float, ptr addrspace(3) @global_smem, i64 %114, !dbg !25
129
+ %116 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %115, i1 %98) #3, !dbg !25
130
+ %117 = bitcast float %116 to i32, !dbg !25
131
+ %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 2, i32 31), !dbg !25
132
+ %119 = bitcast i32 %118 to float, !dbg !25
133
+ %120 = fadd float %116, %119, !dbg !29
134
+ %121 = bitcast float %120 to i32, !dbg !25
135
+ %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 1, i32 31), !dbg !25
136
+ %123 = bitcast i32 %122 to float, !dbg !25
137
+ %124 = fadd float %120, %123, !dbg !29
138
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %115, float %124, i1 %112) #3, !dbg !25
139
+ tail call void @llvm.nvvm.barrier0(), !dbg !25
140
+ %125 = zext nneg i32 %82 to i64, !dbg !25
141
+ %126 = getelementptr float, ptr addrspace(3) @global_smem, i64 %125, !dbg !25
142
+ %127 = load float, ptr addrspace(3) %126, align 4, !dbg !25
143
+ %128 = zext nneg i32 %86 to i64, !dbg !25
144
+ %129 = getelementptr float, ptr addrspace(3) @global_smem, i64 %128, !dbg !25
145
+ %130 = load float, ptr addrspace(3) %129, align 4, !dbg !25
146
+ %131 = zext nneg i32 %90 to i64, !dbg !25
147
+ %132 = getelementptr float, ptr addrspace(3) @global_smem, i64 %131, !dbg !25
148
+ %133 = load float, ptr addrspace(3) %132, align 4, !dbg !25
149
+ %134 = zext nneg i32 %94 to i64, !dbg !25
150
+ %135 = getelementptr float, ptr addrspace(3) @global_smem, i64 %134, !dbg !25
151
+ %136 = load float, ptr addrspace(3) %135, align 4, !dbg !25
152
+ tail call void @llvm.nvvm.barrier0(), !dbg !33
153
+ %137 = zext nneg i32 %10 to i64, !dbg !33
154
+ %138 = getelementptr float, ptr addrspace(3) @global_smem, i64 %137, !dbg !33
155
+ %139 = insertelement <1 x float> undef, float %127, i64 0, !dbg !33
156
+ store <1 x float> %139, ptr addrspace(3) %138, align 4, !dbg !33
157
+ %140 = zext nneg i32 %60 to i64, !dbg !33
158
+ %141 = getelementptr float, ptr addrspace(3) @global_smem, i64 %140, !dbg !33
159
+ %142 = insertelement <1 x float> undef, float %130, i64 0, !dbg !33
160
+ store <1 x float> %142, ptr addrspace(3) %141, align 4, !dbg !33
161
+ %143 = zext nneg i32 %59 to i64, !dbg !33
162
+ %144 = getelementptr float, ptr addrspace(3) @global_smem, i64 %143, !dbg !33
163
+ %145 = insertelement <1 x float> undef, float %133, i64 0, !dbg !33
164
+ store <1 x float> %145, ptr addrspace(3) %144, align 4, !dbg !33
165
+ %146 = zext nneg i32 %58 to i64, !dbg !33
166
+ %147 = getelementptr float, ptr addrspace(3) @global_smem, i64 %146, !dbg !33
167
+ %148 = insertelement <1 x float> undef, float %136, i64 0, !dbg !33
168
+ store <1 x float> %148, ptr addrspace(3) %147, align 4, !dbg !33
169
+ tail call void @llvm.nvvm.barrier0(), !dbg !33
170
+ %149 = zext nneg i32 %56 to i64, !dbg !33
171
+ %150 = getelementptr float, ptr addrspace(3) @global_smem, i64 %149, !dbg !33
172
+ %151 = load i32, ptr addrspace(3) %150, align 4, !dbg !33
173
+ %152 = sext i32 %57 to i64, !dbg !34
174
+ %153 = getelementptr float, ptr addrspace(1) %2, i64 %152, !dbg !34
175
+ %154 = and i32 %6, 64, !dbg !35
176
+ %155 = icmp eq i32 %154, 0, !dbg !35
177
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %151, ptr addrspace(1) %153, i1 %155) #3, !dbg !35
178
+ ret void, !dbg !36
179
+ }
180
+
181
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
182
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
183
+
184
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
185
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
186
+
187
+ ; Function Attrs: convergent nocallback nounwind
188
+ declare void @llvm.nvvm.barrier0() #2
189
+
190
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
191
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
192
+ attributes #2 = { convergent nocallback nounwind }
193
+ attributes #3 = { nounwind }
194
+
195
+ !llvm.module.flags = !{!0}
196
+ !llvm.dbg.cu = !{!1}
197
+ !nvvm.annotations = !{!3, !4, !4, !3}
198
+
199
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
200
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
201
+ !2 = !DIFile(filename: "cqdvltndxc7vwj5j5dnsb73tk763gajftjwvmbfq7i6sitk5gwoy.py", directory: "/tmp/torchinductor_root/qd")
202
+ !3 = !{ptr @triton__0d1d2d3de4de, !"kernel", i32 1}
203
+ !4 = !{ptr @triton__0d1d2d3de4de, !"maxntidx", i32 128}
204
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3de4de", linkageName: "triton__0d1d2d3de4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
205
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
206
+ !7 = !{}
207
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
208
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
209
+ !10 = !DILocation(line: 21, column: 28, scope: !5)
210
+ !11 = !DILocation(line: 21, column: 33, scope: !5)
211
+ !12 = !DILocation(line: 22, column: 23, scope: !5)
212
+ !13 = !DILocation(line: 26, column: 20, scope: !5)
213
+ !14 = !DILocation(line: 33, column: 57, scope: !5)
214
+ !15 = !DILocation(line: 29, column: 36, scope: !5)
215
+ !16 = !DILocation(line: 30, column: 27, scope: !5)
216
+ !17 = !DILocation(line: 33, column: 44, scope: !5)
217
+ !18 = !DILocation(line: 33, column: 51, scope: !5)
218
+ !19 = !DILocation(line: 33, column: 34, scope: !5)
219
+ !20 = !DILocation(line: 33, column: 63, scope: !5)
220
+ !21 = !DILocation(line: 34, column: 34, scope: !5)
221
+ !22 = !DILocation(line: 34, column: 63, scope: !5)
222
+ !23 = !DILocation(line: 35, column: 22, scope: !5)
223
+ !24 = !DILocation(line: 38, column: 38, scope: !5)
224
+ !25 = !DILocation(line: 243, column: 36, scope: !26, inlinedAt: !28)
225
+ !26 = distinct !DILexicalBlockFile(scope: !5, file: !27, discriminator: 0)
226
+ !27 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
227
+ !28 = !DILocation(line: 39, column: 25, scope: !26)
228
+ !29 = !DILocation(line: 233, column: 15, scope: !30, inlinedAt: !31)
229
+ !30 = distinct !DILexicalBlockFile(scope: !26, file: !27, discriminator: 0)
230
+ !31 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !32)
231
+ !32 = !DILocation(line: 39, column: 25, scope: !30)
232
+ !33 = !DILocation(line: 39, column: 28, scope: !5)
233
+ !34 = !DILocation(line: 40, column: 25, scope: !5)
234
+ !35 = !DILocation(line: 40, column: 36, scope: !5)
235
+ !36 = !DILocation(line: 40, column: 4, scope: !5)
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ptx ADDED
@@ -0,0 +1,572 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3de4de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3de4de(
13
+ .param .u64 triton__0d1d2d3de4de_param_0,
14
+ .param .u64 triton__0d1d2d3de4de_param_1,
15
+ .param .u64 triton__0d1d2d3de4de_param_2,
16
+ .param .u32 triton__0d1d2d3de4de_param_3,
17
+ .param .u32 triton__0d1d2d3de4de_param_4
18
+ )
19
+ .maxntid 128, 1, 1
20
+ {
21
+ .reg .pred %p<22>;
22
+ .reg .b32 %r<98>;
23
+ .reg .f32 %f<47>;
24
+ .reg .b64 %rd<9>;
25
+ .loc 1 18 0
26
+ $L__func_begin0:
27
+ .loc 1 18 0
28
+
29
+ ld.param.u64 %rd3, [triton__0d1d2d3de4de_param_2];
30
+ ld.param.u64 %rd2, [triton__0d1d2d3de4de_param_1];
31
+ ld.param.u64 %rd1, [triton__0d1d2d3de4de_param_0];
32
+ $L__tmp0:
33
+ .loc 1 22 44
34
+ mov.u32 %r1, %tid.x;
35
+ and.b32 %r2, %r1, 31;
36
+ shl.b32 %r13, %r1, 2;
37
+ and.b32 %r3, %r13, 60;
38
+ .loc 1 24 33
39
+ bfe.u32 %r4, %r1, 5, 2;
40
+ .loc 1 21 28
41
+ mov.u32 %r11, %ctaid.x;
42
+ .loc 1 21 33
43
+ shl.b32 %r5, %r11, 6;
44
+ .loc 1 22 23
45
+ or.b32 %r14, %r5, %r3;
46
+ .loc 1 26 20
47
+ shr.s32 %r16, %r14, 31;
48
+ shr.u32 %r17, %r16, 24;
49
+ add.s32 %r18, %r14, %r17;
50
+ shr.s32 %r19, %r18, 8;
51
+ .loc 1 29 36
52
+ mad.lo.s32 %r20, %r19, 32512, %r14;
53
+ shl.b32 %r21, %r4, 9;
54
+ add.s32 %r22, %r20, %r21;
55
+ shl.b32 %r23, %r1, 4;
56
+ and.b32 %r24, %r23, 256;
57
+ add.s32 %r96, %r22, %r24;
58
+ mov.f32 %f43, 0f00000000;
59
+ mov.b32 %r97, -8;
60
+ mov.pred %p1, -1;
61
+ mov.f32 %f44, %f43;
62
+ mov.f32 %f45, %f43;
63
+ mov.f32 %f46, %f43;
64
+ $L__BB0_1:
65
+ .loc 1 33 34
66
+ mul.wide.s32 %rd6, %r96, 4;
67
+ add.s64 %rd4, %rd1, %rd6;
68
+ mov.b32 %r29, 0;
69
+ .loc 1 33 63
70
+ mov.u32 %r25, 0x0;
71
+ mov.u32 %r26, 0x0;
72
+ mov.u32 %r27, 0x0;
73
+ mov.u32 %r28, 0x0;
74
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd4 + 0 ];
75
+ @!%p1 mov.u32 %r25, %r29;
76
+ @!%p1 mov.u32 %r26, %r29;
77
+ @!%p1 mov.u32 %r27, %r29;
78
+ @!%p1 mov.u32 %r28, %r29;
79
+ .loc 1 34 34
80
+ add.s64 %rd5, %rd2, %rd6;
81
+ .loc 1 34 63
82
+ mov.u32 %r33, 0x0;
83
+ mov.u32 %r34, 0x0;
84
+ mov.u32 %r35, 0x0;
85
+ mov.u32 %r36, 0x0;
86
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd5 + 0 ];
87
+ @!%p1 mov.u32 %r33, %r29;
88
+ @!%p1 mov.u32 %r34, %r29;
89
+ @!%p1 mov.u32 %r35, %r29;
90
+ @!%p1 mov.u32 %r36, %r29;
91
+ .loc 1 33 63
92
+ mov.b32 %f13, %r25;
93
+ mov.b32 %f14, %r26;
94
+ mov.b32 %f15, %r27;
95
+ mov.b32 %f16, %r28;
96
+ .loc 1 34 63
97
+ mov.b32 %f17, %r33;
98
+ mov.b32 %f18, %r34;
99
+ mov.b32 %f19, %r35;
100
+ mov.b32 %f20, %r36;
101
+ .loc 1 38 38
102
+ fma.rn.f32 %f46, %f16, %f20, %f46;
103
+ fma.rn.f32 %f45, %f15, %f19, %f45;
104
+ fma.rn.f32 %f44, %f14, %f18, %f44;
105
+ fma.rn.f32 %f43, %f13, %f17, %f43;
106
+ .loc 1 29 36
107
+ add.s32 %r97, %r97, 8;
108
+ add.s32 %r96, %r96, 2048;
109
+ setp.lt.u32 %p11, %r97, 120;
110
+ @%p11 bra $L__BB0_1;
111
+ .loc 1 22 44
112
+ and.b32 %r58, %r1, 63;
113
+ .loc 1 22 23
114
+ or.b32 %r59, %r5, %r58;
115
+ $L__tmp1:
116
+ .loc 2 243 36
117
+ mov.b32 %r60, %f43;
118
+ shfl.sync.bfly.b32 %r61, %r60, 16, 31, -1;
119
+ mov.b32 %f21, %r61;
120
+ $L__tmp2:
121
+ .loc 2 233 15
122
+ add.f32 %f22, %f43, %f21;
123
+ $L__tmp3:
124
+ .loc 2 243 36
125
+ mov.b32 %r62, %f44;
126
+ shfl.sync.bfly.b32 %r63, %r62, 16, 31, -1;
127
+ mov.b32 %f23, %r63;
128
+ $L__tmp4:
129
+ .loc 2 233 15
130
+ add.f32 %f24, %f44, %f23;
131
+ $L__tmp5:
132
+ .loc 2 243 36
133
+ mov.b32 %r64, %f45;
134
+ shfl.sync.bfly.b32 %r65, %r64, 16, 31, -1;
135
+ mov.b32 %f25, %r65;
136
+ $L__tmp6:
137
+ .loc 2 233 15
138
+ add.f32 %f26, %f45, %f25;
139
+ $L__tmp7:
140
+ .loc 2 243 36
141
+ mov.b32 %r66, %f46;
142
+ shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1;
143
+ mov.b32 %f27, %r67;
144
+ $L__tmp8:
145
+ .loc 2 233 15
146
+ add.f32 %f28, %f46, %f27;
147
+ $L__tmp9:
148
+ .loc 2 243 36
149
+ setp.lt.u32 %p12, %r2, 16;
150
+ shl.b32 %r68, %r3, 2;
151
+ or.b32 %r69, %r68, %r4;
152
+ shl.b32 %r70, %r69, 2;
153
+ mov.u32 %r71, global_smem;
154
+ add.s32 %r41, %r71, %r70;
155
+ mov.b32 %r42, %f22;
156
+ @%p12 st.shared.b32 [ %r41 + 0 ], %r42;
157
+ shl.b32 %r72, %r4, 2;
158
+ shl.b32 %r73, %r3, 4;
159
+ or.b32 %r74, %r73, 16;
160
+ or.b32 %r75, %r74, %r72;
161
+ add.s32 %r43, %r71, %r75;
162
+ mov.b32 %r44, %f24;
163
+ @%p12 st.shared.b32 [ %r43 + 0 ], %r44;
164
+ or.b32 %r76, %r73, 32;
165
+ or.b32 %r77, %r76, %r72;
166
+ add.s32 %r45, %r71, %r77;
167
+ mov.b32 %r46, %f26;
168
+ @%p12 st.shared.b32 [ %r45 + 0 ], %r46;
169
+ or.b32 %r78, %r73, 48;
170
+ or.b32 %r79, %r78, %r72;
171
+ add.s32 %r47, %r71, %r79;
172
+ mov.b32 %r48, %f28;
173
+ @%p12 st.shared.b32 [ %r47 + 0 ], %r48;
174
+ bar.sync 0;
175
+ setp.lt.s32 %p16, %r1, 256;
176
+ add.s32 %r50, %r71, %r13;
177
+ @%p16 ld.shared.b32 %r49, [ %r50 + 0 ];
178
+ mov.b32 %f29, %r49;
179
+ shfl.sync.bfly.b32 %r81, %r49, 2, 31, -1;
180
+ mov.b32 %f30, %r81;
181
+ $L__tmp10:
182
+ .loc 2 233 15
183
+ add.f32 %f31, %f29, %f30;
184
+ $L__tmp11:
185
+ .loc 2 243 36
186
+ mov.b32 %r82, %f31;
187
+ shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1;
188
+ mov.b32 %f32, %r83;
189
+ $L__tmp12:
190
+ .loc 2 233 15
191
+ add.f32 %f33, %f31, %f32;
192
+ $L__tmp13:
193
+ .loc 2 243 36
194
+ and.b32 %r84, %r1, 3;
195
+ setp.eq.s32 %p21, %r84, 0;
196
+ and.pred %p17, %p16, %p21;
197
+ mov.b32 %r52, %f33;
198
+ @%p17 st.shared.b32 [ %r50 + 0 ], %r52;
199
+ add.s32 %r54, %r50, 512;
200
+ @%p16 ld.shared.b32 %r53, [ %r54 + 0 ];
201
+ mov.b32 %f34, %r53;
202
+ shfl.sync.bfly.b32 %r85, %r53, 2, 31, -1;
203
+ mov.b32 %f35, %r85;
204
+ $L__tmp14:
205
+ .loc 2 233 15
206
+ add.f32 %f36, %f34, %f35;
207
+ $L__tmp15:
208
+ .loc 2 243 36
209
+ mov.b32 %r86, %f36;
210
+ shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1;
211
+ mov.b32 %f37, %r87;
212
+ $L__tmp16:
213
+ .loc 2 233 15
214
+ add.f32 %f38, %f36, %f37;
215
+ $L__tmp17:
216
+ .loc 2 243 36
217
+ mov.b32 %r56, %f38;
218
+ @%p17 st.shared.b32 [ %r54 + 0 ], %r56;
219
+ bar.sync 0;
220
+ add.s32 %r88, %r71, %r73;
221
+ ld.shared.f32 %f39, [%r88];
222
+ add.s32 %r89, %r71, %r74;
223
+ ld.shared.f32 %f40, [%r89];
224
+ add.s32 %r90, %r71, %r76;
225
+ ld.shared.f32 %f41, [%r90];
226
+ add.s32 %r91, %r71, %r78;
227
+ ld.shared.f32 %f42, [%r91];
228
+ $L__tmp18:
229
+ .loc 1 39 28
230
+ bar.sync 0;
231
+ add.s32 %r92, %r71, %r68;
232
+ st.shared.f32 [%r92], %f39;
233
+ st.shared.f32 [%r92+4], %f40;
234
+ st.shared.f32 [%r92+8], %f41;
235
+ st.shared.f32 [%r92+12], %f42;
236
+ bar.sync 0;
237
+ shl.b32 %r93, %r58, 2;
238
+ add.s32 %r94, %r71, %r93;
239
+ ld.shared.u32 %r57, [%r94];
240
+ .loc 1 40 25
241
+ mul.wide.s32 %rd8, %r59, 4;
242
+ add.s64 %rd7, %rd3, %rd8;
243
+ .loc 1 40 36
244
+ and.b32 %r95, %r1, 64;
245
+ setp.eq.s32 %p20, %r95, 0;
246
+ @%p20 st.global.b32 [ %rd7 + 0 ], { %r57 };
247
+ .loc 1 40 4
248
+ ret;
249
+ $L__tmp19:
250
+ $L__func_end0:
251
+
252
+ }
253
+ .file 1 "/tmp/torchinductor_root/qd/cqdvltndxc7vwj5j5dnsb73tk763gajftjwvmbfq7i6sitk5gwoy.py"
254
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
255
+ .section .debug_abbrev
256
+ {
257
+ .b8 1
258
+ .b8 17
259
+ .b8 1
260
+ .b8 37
261
+ .b8 8
262
+ .b8 19
263
+ .b8 5
264
+ .b8 3
265
+ .b8 8
266
+ .b8 16
267
+ .b8 6
268
+ .b8 27
269
+ .b8 8
270
+ .b8 180
271
+ .b8 66
272
+ .b8 12
273
+ .b8 17
274
+ .b8 1
275
+ .b8 18
276
+ .b8 1
277
+ .b8 0
278
+ .b8 0
279
+ .b8 2
280
+ .b8 46
281
+ .b8 0
282
+ .b8 135
283
+ .b8 64
284
+ .b8 8
285
+ .b8 3
286
+ .b8 8
287
+ .b8 58
288
+ .b8 11
289
+ .b8 59
290
+ .b8 11
291
+ .b8 63
292
+ .b8 12
293
+ .b8 32
294
+ .b8 11
295
+ .b8 0
296
+ .b8 0
297
+ .b8 3
298
+ .b8 46
299
+ .b8 1
300
+ .b8 17
301
+ .b8 1
302
+ .b8 18
303
+ .b8 1
304
+ .b8 64
305
+ .b8 10
306
+ .b8 49
307
+ .b8 19
308
+ .b8 0
309
+ .b8 0
310
+ .b8 4
311
+ .b8 29
312
+ .b8 0
313
+ .b8 49
314
+ .b8 19
315
+ .b8 17
316
+ .b8 1
317
+ .b8 18
318
+ .b8 1
319
+ .b8 88
320
+ .b8 11
321
+ .b8 89
322
+ .b8 11
323
+ .b8 87
324
+ .b8 11
325
+ .b8 0
326
+ .b8 0
327
+ .b8 5
328
+ .b8 29
329
+ .b8 1
330
+ .b8 49
331
+ .b8 19
332
+ .b8 17
333
+ .b8 1
334
+ .b8 18
335
+ .b8 1
336
+ .b8 88
337
+ .b8 11
338
+ .b8 89
339
+ .b8 11
340
+ .b8 87
341
+ .b8 11
342
+ .b8 0
343
+ .b8 0
344
+ .b8 0
345
+ }
346
+ .section .debug_info
347
+ {
348
+ .b32 266
349
+ .b8 2
350
+ .b8 0
351
+ .b32 .debug_abbrev
352
+ .b8 8
353
+ .b8 1
354
+ .b8 116
355
+ .b8 114
356
+ .b8 105
357
+ .b8 116
358
+ .b8 111
359
+ .b8 110
360
+ .b8 0
361
+ .b8 2
362
+ .b8 0
363
+ .b8 99
364
+ .b8 113
365
+ .b8 100
366
+ .b8 118
367
+ .b8 108
368
+ .b8 116
369
+ .b8 110
370
+ .b8 100
371
+ .b8 120
372
+ .b8 99
373
+ .b8 55
374
+ .b8 118
375
+ .b8 119
376
+ .b8 106
377
+ .b8 53
378
+ .b8 106
379
+ .b8 53
380
+ .b8 100
381
+ .b8 110
382
+ .b8 115
383
+ .b8 98
384
+ .b8 55
385
+ .b8 51
386
+ .b8 116
387
+ .b8 107
388
+ .b8 55
389
+ .b8 54
390
+ .b8 51
391
+ .b8 103
392
+ .b8 97
393
+ .b8 106
394
+ .b8 102
395
+ .b8 116
396
+ .b8 106
397
+ .b8 119
398
+ .b8 118
399
+ .b8 109
400
+ .b8 98
401
+ .b8 102
402
+ .b8 113
403
+ .b8 55
404
+ .b8 105
405
+ .b8 54
406
+ .b8 115
407
+ .b8 105
408
+ .b8 116
409
+ .b8 107
410
+ .b8 53
411
+ .b8 103
412
+ .b8 119
413
+ .b8 111
414
+ .b8 121
415
+ .b8 46
416
+ .b8 112
417
+ .b8 121
418
+ .b8 0
419
+ .b32 .debug_line
420
+ .b8 47
421
+ .b8 116
422
+ .b8 109
423
+ .b8 112
424
+ .b8 47
425
+ .b8 116
426
+ .b8 111
427
+ .b8 114
428
+ .b8 99
429
+ .b8 104
430
+ .b8 105
431
+ .b8 110
432
+ .b8 100
433
+ .b8 117
434
+ .b8 99
435
+ .b8 116
436
+ .b8 111
437
+ .b8 114
438
+ .b8 95
439
+ .b8 114
440
+ .b8 111
441
+ .b8 111
442
+ .b8 116
443
+ .b8 47
444
+ .b8 113
445
+ .b8 100
446
+ .b8 0
447
+ .b8 1
448
+ .b64 $L__func_begin0
449
+ .b64 $L__func_end0
450
+ .b8 2
451
+ .b8 116
452
+ .b8 114
453
+ .b8 105
454
+ .b8 116
455
+ .b8 111
456
+ .b8 110
457
+ .b8 95
458
+ .b8 95
459
+ .b8 48
460
+ .b8 100
461
+ .b8 49
462
+ .b8 100
463
+ .b8 50
464
+ .b8 100
465
+ .b8 51
466
+ .b8 100
467
+ .b8 101
468
+ .b8 52
469
+ .b8 100
470
+ .b8 101
471
+ .b8 0
472
+ .b8 116
473
+ .b8 114
474
+ .b8 105
475
+ .b8 116
476
+ .b8 111
477
+ .b8 110
478
+ .b8 95
479
+ .b8 95
480
+ .b8 48
481
+ .b8 100
482
+ .b8 49
483
+ .b8 100
484
+ .b8 50
485
+ .b8 100
486
+ .b8 51
487
+ .b8 100
488
+ .b8 101
489
+ .b8 52
490
+ .b8 100
491
+ .b8 101
492
+ .b8 0
493
+ .b8 1
494
+ .b8 18
495
+ .b8 1
496
+ .b8 1
497
+ .b8 3
498
+ .b64 $L__func_begin0
499
+ .b64 $L__func_end0
500
+ .b8 1
501
+ .b8 156
502
+ .b32 125
503
+ .b8 4
504
+ .b32 125
505
+ .b64 $L__tmp1
506
+ .b64 $L__tmp18
507
+ .b8 2
508
+ .b8 39
509
+ .b8 25
510
+ .b8 5
511
+ .b32 125
512
+ .b64 $L__tmp2
513
+ .b64 $L__tmp17
514
+ .b8 2
515
+ .b8 39
516
+ .b8 25
517
+ .b8 4
518
+ .b32 125
519
+ .b64 $L__tmp2
520
+ .b64 $L__tmp17
521
+ .b8 2
522
+ .b8 243
523
+ .b8 36
524
+ .b8 0
525
+ .b8 0
526
+ .b8 0
527
+ }
528
+ .section .debug_pubnames
529
+ {
530
+ .b32 $L__pubNames_end0-$L__pubNames_start0
531
+ $L__pubNames_start0:
532
+ .b8 2
533
+ .b8 0
534
+ .b32 .debug_info
535
+ .b32 270
536
+ .b32 125
537
+ .b8 116
538
+ .b8 114
539
+ .b8 105
540
+ .b8 116
541
+ .b8 111
542
+ .b8 110
543
+ .b8 95
544
+ .b8 95
545
+ .b8 48
546
+ .b8 100
547
+ .b8 49
548
+ .b8 100
549
+ .b8 50
550
+ .b8 100
551
+ .b8 51
552
+ .b8 100
553
+ .b8 101
554
+ .b8 52
555
+ .b8 100
556
+ .b8 101
557
+ .b8 0
558
+ .b32 0
559
+ $L__pubNames_end0:
560
+ }
561
+ .section .debug_pubtypes
562
+ {
563
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
564
+ $L__pubTypes_start0:
565
+ .b8 2
566
+ .b8 0
567
+ .b32 .debug_info
568
+ .b32 270
569
+ .b32 0
570
+ $L__pubTypes_end0:
571
+ }
572
+ .section .debug_loc { }
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttgir ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<256> : tensor<64x1xi32, #blocked>
6
+ %cst_0 = arith.constant dense<32768> : tensor<64x1xi32, #blocked>
7
+ %cst_1 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
8
+ %cst_2 = arith.constant dense<128> : tensor<1x8xi32, #blocked>
9
+ %c0_i32 = arith.constant 0 : i32
10
+ %c128_i32 = arith.constant 128 : i32
11
+ %c8_i32 = arith.constant 8 : i32
12
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
13
+ %c64_i32 = arith.constant 64 : i32
14
+ %0 = tt.get_program_id x : i32
15
+ %1 = arith.muli %0, %c64_i32 : i32
16
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
17
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
18
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
19
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
20
+ %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
21
+ %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
22
+ %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
23
+ %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
24
+ %10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
25
+ %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
26
+ %12 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
27
+ %13 = arith.divsi %8, %cst : tensor<64x1xi32, #blocked>
28
+ %14 = tt.broadcast %12 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
29
+ %15 = arith.muli %13, %cst_0 : tensor<64x1xi32, #blocked>
30
+ %16 = tt.broadcast %15 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
31
+ %17 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
32
+ %18 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
33
+ %19 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32, #blocked>) : i32 {
34
+ %25 = tt.splat %arg5 : (i32) -> tensor<1x8xi32, #blocked>
35
+ %26 = arith.addi %25, %11 : tensor<1x8xi32, #blocked>
36
+ %27 = arith.cmpi slt, %26, %cst_2 : tensor<1x8xi32, #blocked>
37
+ %28 = arith.muli %26, %cst_1 : tensor<1x8xi32, #blocked>
38
+ %29 = tt.broadcast %28 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
39
+ %30 = arith.addi %14, %29 : tensor<64x8xi32, #blocked>
40
+ %31 = arith.addi %30, %16 : tensor<64x8xi32, #blocked>
41
+ %32 = tt.addptr %17, %31 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
42
+ %33 = tt.broadcast %27 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
43
+ %34 = tt.load %32, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
44
+ %35 = tt.addptr %18, %31 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
45
+ %36 = tt.load %35, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
46
+ %37 = arith.mulf %34, %36 : tensor<64x8xf32, #blocked>
47
+ %38 = arith.addf %arg6, %37 : tensor<64x8xf32, #blocked>
48
+ %39 = arith.select %33, %38, %arg6 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
49
+ scf.yield %39 : tensor<64x8xf32, #blocked>
50
+ }
51
+ %20 = "tt.reduce"(%19) <{axis = 1 : i32}> ({
52
+ ^bb0(%arg5: f32, %arg6: f32):
53
+ %25 = arith.addf %arg5, %arg6 : f32
54
+ tt.reduce.return %25 : f32
55
+ }) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
56
+ %21 = triton_gpu.convert_layout %20 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
57
+ %22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xf32, #blocked1>
58
+ %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked1>
59
+ %24 = tt.addptr %23, %9 : tensor<64x1x!tt.ptr<f32, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
60
+ tt.store %24, %22 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked1>
61
+ tt.return
62
+ }
63
+ }
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.cubin ADDED
Binary file (16.5 kB). View file
 
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.llir ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3de4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
7
+ %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %7 = and i32 %6, 31, !dbg !8
9
+ %8 = lshr i32 %6, 5, !dbg !8
10
+ %9 = shl i32 %6, 2, !dbg !8
11
+ %10 = and i32 %9, 60, !dbg !8
12
+ %11 = and i32 %8, 3, !dbg !9
13
+ %12 = lshr i32 %7, 4, !dbg !9
14
+ %13 = shl nuw nsw i32 %11, 1, !dbg !9
15
+ %14 = or i32 %13, %12, !dbg !9
16
+ %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
17
+ %16 = shl i32 %15, 6, !dbg !11
18
+ %17 = or i32 %16, %10, !dbg !12
19
+ %.frozen = freeze i32 %17
20
+ %18 = sdiv i32 %.frozen, 256, !dbg !13
21
+ %19 = mul i32 %18, 256
22
+ %.decomposed = sub i32 %.frozen, %19
23
+ %20 = shl i32 %18, 15, !dbg !14
24
+ %21 = add i32 %20, %.decomposed
25
+ br label %22, !dbg !15
26
+
27
+ 22: ; preds = %5, %22
28
+ %23 = phi i32 [ 0, %5 ], [ %58, %22 ]
29
+ %24 = phi <4 x float> [ zeroinitializer, %5 ], [ %57, %22 ]
30
+ %25 = or i32 %23, %14, !dbg !16
31
+ %26 = shl i32 %25, 8, !dbg !17
32
+ %27 = add i32 %21, %26, !dbg !18
33
+ %28 = sext i32 %27 to i64, !dbg !19
34
+ %29 = getelementptr i16, ptr addrspace(1) %0, i64 %28, !dbg !19
35
+ %30 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %29, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20
36
+ %31 = extractvalue { i32, i32 } %30, 0, !dbg !20
37
+ %32 = extractvalue { i32, i32 } %30, 1, !dbg !20
38
+ %33 = trunc i32 %31 to i16, !dbg !20
39
+ %extelt.offset = lshr i32 %31, 16, !dbg !20
40
+ %34 = trunc i32 %extelt.offset to i16, !dbg !20
41
+ %35 = trunc i32 %32 to i16, !dbg !20
42
+ %extelt.offset1 = lshr i32 %32, 16, !dbg !20
43
+ %36 = trunc i32 %extelt.offset1 to i16, !dbg !20
44
+ %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #3, !dbg !21
45
+ %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #3, !dbg !21
46
+ %39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #3, !dbg !21
47
+ %40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #3, !dbg !21
48
+ %41 = getelementptr float, ptr addrspace(1) %1, i64 %28, !dbg !22
49
+ %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !23
50
+ %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !23
51
+ %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !23
52
+ %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !23
53
+ %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !23
54
+ %47 = insertelement <4 x i32> poison, i32 %43, i64 0, !dbg !23
55
+ %48 = insertelement <4 x i32> %47, i32 %44, i64 1, !dbg !23
56
+ %49 = insertelement <4 x i32> %48, i32 %45, i64 2, !dbg !23
57
+ %50 = insertelement <4 x i32> %49, i32 %46, i64 3, !dbg !23
58
+ %51 = bitcast <4 x i32> %50 to <4 x float>, !dbg !23
59
+ %52 = insertelement <4 x float> poison, float %37, i64 0, !dbg !24
60
+ %53 = insertelement <4 x float> %52, float %38, i64 1, !dbg !24
61
+ %54 = insertelement <4 x float> %53, float %39, i64 2, !dbg !24
62
+ %55 = insertelement <4 x float> %54, float %40, i64 3, !dbg !24
63
+ %56 = fmul <4 x float> %55, %51, !dbg !24
64
+ %57 = fadd <4 x float> %24, %56, !dbg !25
65
+ %58 = add nuw nsw i32 %23, 8, !dbg !15
66
+ %59 = icmp ult i32 %23, 120, !dbg !15
67
+ br i1 %59, label %22, label %60, !dbg !15
68
+
69
+ 60: ; preds = %22
70
+ %61 = and i32 %6, 63, !dbg !8
71
+ %62 = or i32 %16, %61, !dbg !12
72
+ %63 = or i32 %10, 3, !dbg !26
73
+ %64 = or i32 %10, 2, !dbg !26
74
+ %65 = or i32 %10, 1, !dbg !26
75
+ %66 = extractelement <4 x float> %57, i64 0, !dbg !26
76
+ %67 = bitcast float %66 to i32, !dbg !26
77
+ %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 16, i32 31), !dbg !26
78
+ %69 = bitcast i32 %68 to float, !dbg !26
79
+ %70 = fadd float %66, %69, !dbg !30
80
+ %71 = extractelement <4 x float> %57, i64 1, !dbg !26
81
+ %72 = bitcast float %71 to i32, !dbg !26
82
+ %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 16, i32 31), !dbg !26
83
+ %74 = bitcast i32 %73 to float, !dbg !26
84
+ %75 = fadd float %71, %74, !dbg !30
85
+ %76 = extractelement <4 x float> %57, i64 2, !dbg !26
86
+ %77 = bitcast float %76 to i32, !dbg !26
87
+ %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 16, i32 31), !dbg !26
88
+ %79 = bitcast i32 %78 to float, !dbg !26
89
+ %80 = fadd float %76, %79, !dbg !30
90
+ %81 = extractelement <4 x float> %57, i64 3, !dbg !26
91
+ %82 = bitcast float %81 to i32, !dbg !26
92
+ %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 16, i32 31), !dbg !26
93
+ %84 = bitcast i32 %83 to float, !dbg !26
94
+ %85 = fadd float %81, %84, !dbg !30
95
+ %86 = icmp ult i32 %7, 16, !dbg !26
96
+ %87 = shl nuw nsw i32 %10, 2, !dbg !26
97
+ %88 = or i32 %87, %11, !dbg !26
98
+ %89 = zext nneg i32 %88 to i64, !dbg !26
99
+ %90 = getelementptr float, ptr addrspace(3) @global_smem, i64 %89, !dbg !26
100
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %90, float %70, i1 %86) #3, !dbg !26
101
+ %91 = shl nuw nsw i32 %65, 2, !dbg !26
102
+ %92 = or i32 %91, %11, !dbg !26
103
+ %93 = zext nneg i32 %92 to i64, !dbg !26
104
+ %94 = getelementptr float, ptr addrspace(3) @global_smem, i64 %93, !dbg !26
105
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %94, float %75, i1 %86) #3, !dbg !26
106
+ %95 = shl nuw nsw i32 %64, 2, !dbg !26
107
+ %96 = or i32 %95, %11, !dbg !26
108
+ %97 = zext nneg i32 %96 to i64, !dbg !26
109
+ %98 = getelementptr float, ptr addrspace(3) @global_smem, i64 %97, !dbg !26
110
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, float %80, i1 %86) #3, !dbg !26
111
+ %99 = shl nuw nsw i32 %63, 2, !dbg !26
112
+ %100 = or i32 %99, %11, !dbg !26
113
+ %101 = zext nneg i32 %100 to i64, !dbg !26
114
+ %102 = getelementptr float, ptr addrspace(3) @global_smem, i64 %101, !dbg !26
115
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %102, float %85, i1 %86) #3, !dbg !26
116
+ tail call void @llvm.nvvm.barrier0(), !dbg !26
117
+ %103 = icmp slt i32 %6, 256, !dbg !26
118
+ %104 = sext i32 %6 to i64, !dbg !26
119
+ %105 = getelementptr float, ptr addrspace(3) @global_smem, i64 %104, !dbg !26
120
+ %106 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %105, i1 %103) #3, !dbg !26
121
+ %107 = bitcast float %106 to i32, !dbg !26
122
+ %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 2, i32 31), !dbg !26
123
+ %109 = bitcast i32 %108 to float, !dbg !26
124
+ %110 = fadd float %106, %109, !dbg !30
125
+ %111 = bitcast float %110 to i32, !dbg !26
126
+ %112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %111, i32 1, i32 31), !dbg !26
127
+ %113 = bitcast i32 %112 to float, !dbg !26
128
+ %114 = fadd float %110, %113, !dbg !30
129
+ %115 = and i32 %6, 3, !dbg !26
130
+ %116 = icmp eq i32 %115, 0, !dbg !26
131
+ %117 = and i1 %103, %116, !dbg !26
132
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, float %114, i1 %117) #3, !dbg !26
133
+ %118 = add i32 %6, 128, !dbg !26
134
+ %119 = sext i32 %118 to i64, !dbg !26
135
+ %120 = getelementptr float, ptr addrspace(3) @global_smem, i64 %119, !dbg !26
136
+ %121 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %120, i1 %103) #3, !dbg !26
137
+ %122 = bitcast float %121 to i32, !dbg !26
138
+ %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 2, i32 31), !dbg !26
139
+ %124 = bitcast i32 %123 to float, !dbg !26
140
+ %125 = fadd float %121, %124, !dbg !30
141
+ %126 = bitcast float %125 to i32, !dbg !26
142
+ %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 1, i32 31), !dbg !26
143
+ %128 = bitcast i32 %127 to float, !dbg !26
144
+ %129 = fadd float %125, %128, !dbg !30
145
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %129, i1 %117) #3, !dbg !26
146
+ tail call void @llvm.nvvm.barrier0(), !dbg !26
147
+ %130 = zext nneg i32 %87 to i64, !dbg !26
148
+ %131 = getelementptr float, ptr addrspace(3) @global_smem, i64 %130, !dbg !26
149
+ %132 = load float, ptr addrspace(3) %131, align 4, !dbg !26
150
+ %133 = zext nneg i32 %91 to i64, !dbg !26
151
+ %134 = getelementptr float, ptr addrspace(3) @global_smem, i64 %133, !dbg !26
152
+ %135 = load float, ptr addrspace(3) %134, align 4, !dbg !26
153
+ %136 = zext nneg i32 %95 to i64, !dbg !26
154
+ %137 = getelementptr float, ptr addrspace(3) @global_smem, i64 %136, !dbg !26
155
+ %138 = load float, ptr addrspace(3) %137, align 4, !dbg !26
156
+ %139 = zext nneg i32 %99 to i64, !dbg !26
157
+ %140 = getelementptr float, ptr addrspace(3) @global_smem, i64 %139, !dbg !26
158
+ %141 = load float, ptr addrspace(3) %140, align 4, !dbg !26
159
+ tail call void @llvm.nvvm.barrier0(), !dbg !34
160
+ %142 = zext nneg i32 %10 to i64, !dbg !34
161
+ %143 = getelementptr float, ptr addrspace(3) @global_smem, i64 %142, !dbg !34
162
+ %144 = insertelement <1 x float> undef, float %132, i64 0, !dbg !34
163
+ store <1 x float> %144, ptr addrspace(3) %143, align 4, !dbg !34
164
+ %145 = zext nneg i32 %65 to i64, !dbg !34
165
+ %146 = getelementptr float, ptr addrspace(3) @global_smem, i64 %145, !dbg !34
166
+ %147 = insertelement <1 x float> undef, float %135, i64 0, !dbg !34
167
+ store <1 x float> %147, ptr addrspace(3) %146, align 4, !dbg !34
168
+ %148 = zext nneg i32 %64 to i64, !dbg !34
169
+ %149 = getelementptr float, ptr addrspace(3) @global_smem, i64 %148, !dbg !34
170
+ %150 = insertelement <1 x float> undef, float %138, i64 0, !dbg !34
171
+ store <1 x float> %150, ptr addrspace(3) %149, align 4, !dbg !34
172
+ %151 = zext nneg i32 %63 to i64, !dbg !34
173
+ %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !34
174
+ %153 = insertelement <1 x float> undef, float %141, i64 0, !dbg !34
175
+ store <1 x float> %153, ptr addrspace(3) %152, align 4, !dbg !34
176
+ tail call void @llvm.nvvm.barrier0(), !dbg !34
177
+ %154 = zext nneg i32 %61 to i64, !dbg !34
178
+ %155 = getelementptr float, ptr addrspace(3) @global_smem, i64 %154, !dbg !34
179
+ %156 = load i32, ptr addrspace(3) %155, align 4, !dbg !34
180
+ %157 = sext i32 %62 to i64, !dbg !35
181
+ %158 = getelementptr float, ptr addrspace(1) %2, i64 %157, !dbg !35
182
+ %159 = and i32 %6, 64, !dbg !36
183
+ %160 = icmp eq i32 %159, 0, !dbg !36
184
+ tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %156, ptr addrspace(1) %158, i1 %160) #3, !dbg !36
185
+ ret void, !dbg !37
186
+ }
187
+
188
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
189
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
190
+
191
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
192
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
193
+
194
+ ; Function Attrs: convergent nocallback nounwind
195
+ declare void @llvm.nvvm.barrier0() #2
196
+
197
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
198
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
199
+ attributes #2 = { convergent nocallback nounwind }
200
+ attributes #3 = { nounwind }
201
+
202
+ !llvm.module.flags = !{!0}
203
+ !llvm.dbg.cu = !{!1}
204
+ !nvvm.annotations = !{!3, !4, !4, !3}
205
+
206
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
207
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
208
+ !2 = !DIFile(filename: "csjd7mlrjujd4uwze5tkg7ptteagpihgt5ztatfqchprcrax22ls.py", directory: "/tmp/torchinductor_root/sj")
209
+ !3 = !{ptr @triton__0d1d2d3de4de, !"kernel", i32 1}
210
+ !4 = !{ptr @triton__0d1d2d3de4de, !"maxntidx", i32 128}
211
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3de4de", linkageName: "triton__0d1d2d3de4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
212
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
213
+ !7 = !{}
214
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
215
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
216
+ !10 = !DILocation(line: 21, column: 28, scope: !5)
217
+ !11 = !DILocation(line: 21, column: 33, scope: !5)
218
+ !12 = !DILocation(line: 22, column: 23, scope: !5)
219
+ !13 = !DILocation(line: 26, column: 20, scope: !5)
220
+ !14 = !DILocation(line: 33, column: 57, scope: !5)
221
+ !15 = !DILocation(line: 29, column: 36, scope: !5)
222
+ !16 = !DILocation(line: 30, column: 27, scope: !5)
223
+ !17 = !DILocation(line: 33, column: 44, scope: !5)
224
+ !18 = !DILocation(line: 33, column: 51, scope: !5)
225
+ !19 = !DILocation(line: 33, column: 34, scope: !5)
226
+ !20 = !DILocation(line: 33, column: 63, scope: !5)
227
+ !21 = !DILocation(line: 33, column: 115, scope: !5)
228
+ !22 = !DILocation(line: 34, column: 34, scope: !5)
229
+ !23 = !DILocation(line: 34, column: 63, scope: !5)
230
+ !24 = !DILocation(line: 36, column: 22, scope: !5)
231
+ !25 = !DILocation(line: 39, column: 38, scope: !5)
232
+ !26 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !29)
233
+ !27 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0)
234
+ !28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
235
+ !29 = !DILocation(line: 40, column: 25, scope: !27)
236
+ !30 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !32)
237
+ !31 = distinct !DILexicalBlockFile(scope: !27, file: !28, discriminator: 0)
238
+ !32 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !33)
239
+ !33 = !DILocation(line: 40, column: 25, scope: !31)
240
+ !34 = !DILocation(line: 40, column: 28, scope: !5)
241
+ !35 = !DILocation(line: 41, column: 25, scope: !5)
242
+ !36 = !DILocation(line: 41, column: 36, scope: !5)
243
+ !37 = !DILocation(line: 41, column: 4, scope: !5)
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ptx ADDED
@@ -0,0 +1,577 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3de4de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3de4de(
13
+ .param .u64 triton__0d1d2d3de4de_param_0,
14
+ .param .u64 triton__0d1d2d3de4de_param_1,
15
+ .param .u64 triton__0d1d2d3de4de_param_2,
16
+ .param .u32 triton__0d1d2d3de4de_param_3,
17
+ .param .u32 triton__0d1d2d3de4de_param_4
18
+ )
19
+ .maxntid 128, 1, 1
20
+ {
21
+ .reg .pred %p<20>;
22
+ .reg .b16 %rs<5>;
23
+ .reg .b32 %r<98>;
24
+ .reg .f32 %f<47>;
25
+ .reg .b64 %rd<10>;
26
+ .loc 1 18 0
27
+ $L__func_begin0:
28
+ .loc 1 18 0
29
+
30
+ ld.param.u64 %rd3, [triton__0d1d2d3de4de_param_2];
31
+ ld.param.u64 %rd2, [triton__0d1d2d3de4de_param_1];
32
+ ld.param.u64 %rd1, [triton__0d1d2d3de4de_param_0];
33
+ $L__tmp0:
34
+ .loc 1 22 44
35
+ mov.u32 %r1, %tid.x;
36
+ and.b32 %r2, %r1, 31;
37
+ shl.b32 %r13, %r1, 2;
38
+ and.b32 %r3, %r13, 60;
39
+ .loc 1 24 33
40
+ bfe.u32 %r4, %r1, 5, 2;
41
+ .loc 1 21 28
42
+ mov.u32 %r11, %ctaid.x;
43
+ .loc 1 21 33
44
+ shl.b32 %r5, %r11, 6;
45
+ .loc 1 22 23
46
+ or.b32 %r14, %r5, %r3;
47
+ .loc 1 26 20
48
+ shr.s32 %r16, %r14, 31;
49
+ shr.u32 %r17, %r16, 24;
50
+ add.s32 %r18, %r14, %r17;
51
+ shr.s32 %r19, %r18, 8;
52
+ .loc 1 29 36
53
+ mad.lo.s32 %r20, %r19, 32512, %r14;
54
+ shl.b32 %r21, %r4, 9;
55
+ add.s32 %r22, %r20, %r21;
56
+ shl.b32 %r23, %r1, 4;
57
+ and.b32 %r24, %r23, 256;
58
+ add.s32 %r96, %r22, %r24;
59
+ mov.f32 %f43, 0f00000000;
60
+ mov.b32 %r97, -8;
61
+ mov.pred %p1, -1;
62
+ mov.f32 %f44, %f43;
63
+ mov.f32 %f45, %f43;
64
+ mov.f32 %f46, %f43;
65
+ $L__BB0_1:
66
+ .loc 1 33 34
67
+ mul.wide.s32 %rd6, %r96, 2;
68
+ add.s64 %rd4, %rd1, %rd6;
69
+ mov.b32 %r27, 0;
70
+ .loc 1 33 63
71
+ mov.u32 %r25, 0x0;
72
+ mov.u32 %r26, 0x0;
73
+ @%p1 ld.global.L1::evict_first.v2.b32 { %r25, %r26 }, [ %rd4 + 0 ];
74
+ @!%p1 mov.u32 %r25, %r27;
75
+ @!%p1 mov.u32 %r26, %r27;
76
+ cvt.u16.u32 %rs1, %r25;
77
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r25; }
78
+ cvt.u16.u32 %rs3, %r26;
79
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r26; }
80
+ .loc 1 33 115
81
+ cvt.f32.bf16 %r29, %rs1;
82
+ mov.b32 %f13, %r29;
83
+ cvt.f32.bf16 %r30, %rs2;
84
+ mov.b32 %f14, %r30;
85
+ cvt.f32.bf16 %r31, %rs3;
86
+ mov.b32 %f15, %r31;
87
+ cvt.f32.bf16 %r32, %rs4;
88
+ mov.b32 %f16, %r32;
89
+ .loc 1 34 34
90
+ mul.wide.s32 %rd7, %r96, 4;
91
+ add.s64 %rd5, %rd2, %rd7;
92
+ .loc 1 34 63
93
+ mov.u32 %r33, 0x0;
94
+ mov.u32 %r34, 0x0;
95
+ mov.u32 %r35, 0x0;
96
+ mov.u32 %r36, 0x0;
97
+ @%p1 ld.global.L1::evict_first.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd5 + 0 ];
98
+ @!%p1 mov.u32 %r33, %r27;
99
+ @!%p1 mov.u32 %r34, %r27;
100
+ @!%p1 mov.u32 %r35, %r27;
101
+ @!%p1 mov.u32 %r36, %r27;
102
+ mov.b32 %f17, %r33;
103
+ mov.b32 %f18, %r34;
104
+ mov.b32 %f19, %r35;
105
+ mov.b32 %f20, %r36;
106
+ .loc 1 39 38
107
+ fma.rn.f32 %f46, %f16, %f20, %f46;
108
+ fma.rn.f32 %f45, %f15, %f19, %f45;
109
+ fma.rn.f32 %f44, %f14, %f18, %f44;
110
+ fma.rn.f32 %f43, %f13, %f17, %f43;
111
+ .loc 1 29 36
112
+ add.s32 %r97, %r97, 8;
113
+ add.s32 %r96, %r96, 2048;
114
+ setp.lt.u32 %p9, %r97, 120;
115
+ @%p9 bra $L__BB0_1;
116
+ .loc 1 22 44
117
+ and.b32 %r58, %r1, 63;
118
+ .loc 1 22 23
119
+ or.b32 %r59, %r5, %r58;
120
+ $L__tmp1:
121
+ .loc 2 243 36
122
+ mov.b32 %r60, %f43;
123
+ shfl.sync.bfly.b32 %r61, %r60, 16, 31, -1;
124
+ mov.b32 %f21, %r61;
125
+ $L__tmp2:
126
+ .loc 2 233 15
127
+ add.f32 %f22, %f43, %f21;
128
+ $L__tmp3:
129
+ .loc 2 243 36
130
+ mov.b32 %r62, %f44;
131
+ shfl.sync.bfly.b32 %r63, %r62, 16, 31, -1;
132
+ mov.b32 %f23, %r63;
133
+ $L__tmp4:
134
+ .loc 2 233 15
135
+ add.f32 %f24, %f44, %f23;
136
+ $L__tmp5:
137
+ .loc 2 243 36
138
+ mov.b32 %r64, %f45;
139
+ shfl.sync.bfly.b32 %r65, %r64, 16, 31, -1;
140
+ mov.b32 %f25, %r65;
141
+ $L__tmp6:
142
+ .loc 2 233 15
143
+ add.f32 %f26, %f45, %f25;
144
+ $L__tmp7:
145
+ .loc 2 243 36
146
+ mov.b32 %r66, %f46;
147
+ shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1;
148
+ mov.b32 %f27, %r67;
149
+ $L__tmp8:
150
+ .loc 2 233 15
151
+ add.f32 %f28, %f46, %f27;
152
+ $L__tmp9:
153
+ .loc 2 243 36
154
+ setp.lt.u32 %p10, %r2, 16;
155
+ shl.b32 %r68, %r3, 2;
156
+ or.b32 %r69, %r68, %r4;
157
+ shl.b32 %r70, %r69, 2;
158
+ mov.u32 %r71, global_smem;
159
+ add.s32 %r41, %r71, %r70;
160
+ mov.b32 %r42, %f22;
161
+ @%p10 st.shared.b32 [ %r41 + 0 ], %r42;
162
+ shl.b32 %r72, %r4, 2;
163
+ shl.b32 %r73, %r3, 4;
164
+ or.b32 %r74, %r73, 16;
165
+ or.b32 %r75, %r74, %r72;
166
+ add.s32 %r43, %r71, %r75;
167
+ mov.b32 %r44, %f24;
168
+ @%p10 st.shared.b32 [ %r43 + 0 ], %r44;
169
+ or.b32 %r76, %r73, 32;
170
+ or.b32 %r77, %r76, %r72;
171
+ add.s32 %r45, %r71, %r77;
172
+ mov.b32 %r46, %f26;
173
+ @%p10 st.shared.b32 [ %r45 + 0 ], %r46;
174
+ or.b32 %r78, %r73, 48;
175
+ or.b32 %r79, %r78, %r72;
176
+ add.s32 %r47, %r71, %r79;
177
+ mov.b32 %r48, %f28;
178
+ @%p10 st.shared.b32 [ %r47 + 0 ], %r48;
179
+ bar.sync 0;
180
+ setp.lt.s32 %p14, %r1, 256;
181
+ add.s32 %r50, %r71, %r13;
182
+ @%p14 ld.shared.b32 %r49, [ %r50 + 0 ];
183
+ mov.b32 %f29, %r49;
184
+ shfl.sync.bfly.b32 %r81, %r49, 2, 31, -1;
185
+ mov.b32 %f30, %r81;
186
+ $L__tmp10:
187
+ .loc 2 233 15
188
+ add.f32 %f31, %f29, %f30;
189
+ $L__tmp11:
190
+ .loc 2 243 36
191
+ mov.b32 %r82, %f31;
192
+ shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1;
193
+ mov.b32 %f32, %r83;
194
+ $L__tmp12:
195
+ .loc 2 233 15
196
+ add.f32 %f33, %f31, %f32;
197
+ $L__tmp13:
198
+ .loc 2 243 36
199
+ and.b32 %r84, %r1, 3;
200
+ setp.eq.s32 %p19, %r84, 0;
201
+ and.pred %p15, %p14, %p19;
202
+ mov.b32 %r52, %f33;
203
+ @%p15 st.shared.b32 [ %r50 + 0 ], %r52;
204
+ add.s32 %r54, %r50, 512;
205
+ @%p14 ld.shared.b32 %r53, [ %r54 + 0 ];
206
+ mov.b32 %f34, %r53;
207
+ shfl.sync.bfly.b32 %r85, %r53, 2, 31, -1;
208
+ mov.b32 %f35, %r85;
209
+ $L__tmp14:
210
+ .loc 2 233 15
211
+ add.f32 %f36, %f34, %f35;
212
+ $L__tmp15:
213
+ .loc 2 243 36
214
+ mov.b32 %r86, %f36;
215
+ shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1;
216
+ mov.b32 %f37, %r87;
217
+ $L__tmp16:
218
+ .loc 2 233 15
219
+ add.f32 %f38, %f36, %f37;
220
+ $L__tmp17:
221
+ .loc 2 243 36
222
+ mov.b32 %r56, %f38;
223
+ @%p15 st.shared.b32 [ %r54 + 0 ], %r56;
224
+ bar.sync 0;
225
+ add.s32 %r88, %r71, %r73;
226
+ ld.shared.f32 %f39, [%r88];
227
+ add.s32 %r89, %r71, %r74;
228
+ ld.shared.f32 %f40, [%r89];
229
+ add.s32 %r90, %r71, %r76;
230
+ ld.shared.f32 %f41, [%r90];
231
+ add.s32 %r91, %r71, %r78;
232
+ ld.shared.f32 %f42, [%r91];
233
+ $L__tmp18:
234
+ .loc 1 40 28
235
+ bar.sync 0;
236
+ add.s32 %r92, %r71, %r68;
237
+ st.shared.f32 [%r92], %f39;
238
+ st.shared.f32 [%r92+4], %f40;
239
+ st.shared.f32 [%r92+8], %f41;
240
+ st.shared.f32 [%r92+12], %f42;
241
+ bar.sync 0;
242
+ shl.b32 %r93, %r58, 2;
243
+ add.s32 %r94, %r71, %r93;
244
+ ld.shared.u32 %r57, [%r94];
245
+ .loc 1 41 25
246
+ mul.wide.s32 %rd9, %r59, 4;
247
+ add.s64 %rd8, %rd3, %rd9;
248
+ .loc 1 41 36
249
+ and.b32 %r95, %r1, 64;
250
+ setp.eq.s32 %p18, %r95, 0;
251
+ @%p18 st.global.b32 [ %rd8 + 0 ], { %r57 };
252
+ .loc 1 41 4
253
+ ret;
254
+ $L__tmp19:
255
+ $L__func_end0:
256
+
257
+ }
258
+ .file 1 "/tmp/torchinductor_root/sj/csjd7mlrjujd4uwze5tkg7ptteagpihgt5ztatfqchprcrax22ls.py"
259
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
260
+ .section .debug_abbrev
261
+ {
262
+ .b8 1
263
+ .b8 17
264
+ .b8 1
265
+ .b8 37
266
+ .b8 8
267
+ .b8 19
268
+ .b8 5
269
+ .b8 3
270
+ .b8 8
271
+ .b8 16
272
+ .b8 6
273
+ .b8 27
274
+ .b8 8
275
+ .b8 180
276
+ .b8 66
277
+ .b8 12
278
+ .b8 17
279
+ .b8 1
280
+ .b8 18
281
+ .b8 1
282
+ .b8 0
283
+ .b8 0
284
+ .b8 2
285
+ .b8 46
286
+ .b8 0
287
+ .b8 135
288
+ .b8 64
289
+ .b8 8
290
+ .b8 3
291
+ .b8 8
292
+ .b8 58
293
+ .b8 11
294
+ .b8 59
295
+ .b8 11
296
+ .b8 63
297
+ .b8 12
298
+ .b8 32
299
+ .b8 11
300
+ .b8 0
301
+ .b8 0
302
+ .b8 3
303
+ .b8 46
304
+ .b8 1
305
+ .b8 17
306
+ .b8 1
307
+ .b8 18
308
+ .b8 1
309
+ .b8 64
310
+ .b8 10
311
+ .b8 49
312
+ .b8 19
313
+ .b8 0
314
+ .b8 0
315
+ .b8 4
316
+ .b8 29
317
+ .b8 0
318
+ .b8 49
319
+ .b8 19
320
+ .b8 17
321
+ .b8 1
322
+ .b8 18
323
+ .b8 1
324
+ .b8 88
325
+ .b8 11
326
+ .b8 89
327
+ .b8 11
328
+ .b8 87
329
+ .b8 11
330
+ .b8 0
331
+ .b8 0
332
+ .b8 5
333
+ .b8 29
334
+ .b8 1
335
+ .b8 49
336
+ .b8 19
337
+ .b8 17
338
+ .b8 1
339
+ .b8 18
340
+ .b8 1
341
+ .b8 88
342
+ .b8 11
343
+ .b8 89
344
+ .b8 11
345
+ .b8 87
346
+ .b8 11
347
+ .b8 0
348
+ .b8 0
349
+ .b8 0
350
+ }
351
+ .section .debug_info
352
+ {
353
+ .b32 266
354
+ .b8 2
355
+ .b8 0
356
+ .b32 .debug_abbrev
357
+ .b8 8
358
+ .b8 1
359
+ .b8 116
360
+ .b8 114
361
+ .b8 105
362
+ .b8 116
363
+ .b8 111
364
+ .b8 110
365
+ .b8 0
366
+ .b8 2
367
+ .b8 0
368
+ .b8 99
369
+ .b8 115
370
+ .b8 106
371
+ .b8 100
372
+ .b8 55
373
+ .b8 109
374
+ .b8 108
375
+ .b8 114
376
+ .b8 106
377
+ .b8 117
378
+ .b8 106
379
+ .b8 100
380
+ .b8 52
381
+ .b8 117
382
+ .b8 119
383
+ .b8 122
384
+ .b8 101
385
+ .b8 53
386
+ .b8 116
387
+ .b8 107
388
+ .b8 103
389
+ .b8 55
390
+ .b8 112
391
+ .b8 116
392
+ .b8 116
393
+ .b8 101
394
+ .b8 97
395
+ .b8 103
396
+ .b8 112
397
+ .b8 105
398
+ .b8 104
399
+ .b8 103
400
+ .b8 116
401
+ .b8 53
402
+ .b8 122
403
+ .b8 116
404
+ .b8 97
405
+ .b8 116
406
+ .b8 102
407
+ .b8 113
408
+ .b8 99
409
+ .b8 104
410
+ .b8 112
411
+ .b8 114
412
+ .b8 99
413
+ .b8 114
414
+ .b8 97
415
+ .b8 120
416
+ .b8 50
417
+ .b8 50
418
+ .b8 108
419
+ .b8 115
420
+ .b8 46
421
+ .b8 112
422
+ .b8 121
423
+ .b8 0
424
+ .b32 .debug_line
425
+ .b8 47
426
+ .b8 116
427
+ .b8 109
428
+ .b8 112
429
+ .b8 47
430
+ .b8 116
431
+ .b8 111
432
+ .b8 114
433
+ .b8 99
434
+ .b8 104
435
+ .b8 105
436
+ .b8 110
437
+ .b8 100
438
+ .b8 117
439
+ .b8 99
440
+ .b8 116
441
+ .b8 111
442
+ .b8 114
443
+ .b8 95
444
+ .b8 114
445
+ .b8 111
446
+ .b8 111
447
+ .b8 116
448
+ .b8 47
449
+ .b8 115
450
+ .b8 106
451
+ .b8 0
452
+ .b8 1
453
+ .b64 $L__func_begin0
454
+ .b64 $L__func_end0
455
+ .b8 2
456
+ .b8 116
457
+ .b8 114
458
+ .b8 105
459
+ .b8 116
460
+ .b8 111
461
+ .b8 110
462
+ .b8 95
463
+ .b8 95
464
+ .b8 48
465
+ .b8 100
466
+ .b8 49
467
+ .b8 100
468
+ .b8 50
469
+ .b8 100
470
+ .b8 51
471
+ .b8 100
472
+ .b8 101
473
+ .b8 52
474
+ .b8 100
475
+ .b8 101
476
+ .b8 0
477
+ .b8 116
478
+ .b8 114
479
+ .b8 105
480
+ .b8 116
481
+ .b8 111
482
+ .b8 110
483
+ .b8 95
484
+ .b8 95
485
+ .b8 48
486
+ .b8 100
487
+ .b8 49
488
+ .b8 100
489
+ .b8 50
490
+ .b8 100
491
+ .b8 51
492
+ .b8 100
493
+ .b8 101
494
+ .b8 52
495
+ .b8 100
496
+ .b8 101
497
+ .b8 0
498
+ .b8 1
499
+ .b8 18
500
+ .b8 1
501
+ .b8 1
502
+ .b8 3
503
+ .b64 $L__func_begin0
504
+ .b64 $L__func_end0
505
+ .b8 1
506
+ .b8 156
507
+ .b32 125
508
+ .b8 4
509
+ .b32 125
510
+ .b64 $L__tmp1
511
+ .b64 $L__tmp18
512
+ .b8 2
513
+ .b8 40
514
+ .b8 25
515
+ .b8 5
516
+ .b32 125
517
+ .b64 $L__tmp2
518
+ .b64 $L__tmp17
519
+ .b8 2
520
+ .b8 40
521
+ .b8 25
522
+ .b8 4
523
+ .b32 125
524
+ .b64 $L__tmp2
525
+ .b64 $L__tmp17
526
+ .b8 2
527
+ .b8 243
528
+ .b8 36
529
+ .b8 0
530
+ .b8 0
531
+ .b8 0
532
+ }
533
+ .section .debug_pubnames
534
+ {
535
+ .b32 $L__pubNames_end0-$L__pubNames_start0
536
+ $L__pubNames_start0:
537
+ .b8 2
538
+ .b8 0
539
+ .b32 .debug_info
540
+ .b32 270
541
+ .b32 125
542
+ .b8 116
543
+ .b8 114
544
+ .b8 105
545
+ .b8 116
546
+ .b8 111
547
+ .b8 110
548
+ .b8 95
549
+ .b8 95
550
+ .b8 48
551
+ .b8 100
552
+ .b8 49
553
+ .b8 100
554
+ .b8 50
555
+ .b8 100
556
+ .b8 51
557
+ .b8 100
558
+ .b8 101
559
+ .b8 52
560
+ .b8 100
561
+ .b8 101
562
+ .b8 0
563
+ .b32 0
564
+ $L__pubNames_end0:
565
+ }
566
+ .section .debug_pubtypes
567
+ {
568
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
569
+ $L__pubTypes_start0:
570
+ .b8 2
571
+ .b8 0
572
+ .b32 .debug_info
573
+ .b32 270
574
+ .b32 0
575
+ $L__pubTypes_end0:
576
+ }
577
+ .section .debug_loc { }
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttgir ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<256> : tensor<64x1xi32, #blocked>
6
+ %cst_0 = arith.constant dense<32768> : tensor<64x1xi32, #blocked>
7
+ %cst_1 = arith.constant dense<256> : tensor<1x8xi32, #blocked>
8
+ %cst_2 = arith.constant dense<128> : tensor<1x8xi32, #blocked>
9
+ %c0_i32 = arith.constant 0 : i32
10
+ %c128_i32 = arith.constant 128 : i32
11
+ %c8_i32 = arith.constant 8 : i32
12
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
13
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked>
14
+ %c64_i32 = arith.constant 64 : i32
15
+ %0 = tt.get_program_id x : i32
16
+ %1 = arith.muli %0, %c64_i32 : i32
17
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
18
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
19
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
20
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
21
+ %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
22
+ %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
23
+ %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
24
+ %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
25
+ %10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
26
+ %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
27
+ %12 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
28
+ %13 = arith.divsi %8, %cst : tensor<64x1xi32, #blocked>
29
+ %14 = tt.broadcast %12 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
30
+ %15 = arith.muli %13, %cst_0 : tensor<64x1xi32, #blocked>
31
+ %16 = tt.broadcast %15 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
32
+ %17 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
33
+ %18 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
34
+ %19 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32, #blocked>) : i32 {
35
+ %25 = tt.splat %arg5 : (i32) -> tensor<1x8xi32, #blocked>
36
+ %26 = arith.addi %25, %11 : tensor<1x8xi32, #blocked>
37
+ %27 = arith.cmpi slt, %26, %cst_2 : tensor<1x8xi32, #blocked>
38
+ %28 = arith.muli %26, %cst_1 : tensor<1x8xi32, #blocked>
39
+ %29 = tt.broadcast %28 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
40
+ %30 = arith.addi %14, %29 : tensor<64x8xi32, #blocked>
41
+ %31 = arith.addi %30, %16 : tensor<64x8xi32, #blocked>
42
+ %32 = tt.addptr %17, %31 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
43
+ %33 = tt.broadcast %27 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
44
+ %34 = tt.load %32, %33, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
45
+ %35 = arith.extf %34 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
46
+ %36 = tt.addptr %18, %31 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
47
+ %37 = tt.load %36, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
48
+ %38 = arith.mulf %35, %37 : tensor<64x8xf32, #blocked>
49
+ %39 = arith.addf %arg6, %38 : tensor<64x8xf32, #blocked>
50
+ %40 = arith.select %33, %39, %arg6 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
51
+ scf.yield %40 : tensor<64x8xf32, #blocked>
52
+ }
53
+ %20 = "tt.reduce"(%19) <{axis = 1 : i32}> ({
54
+ ^bb0(%arg5: f32, %arg6: f32):
55
+ %25 = arith.addf %arg5, %arg6 : f32
56
+ tt.reduce.return %25 : f32
57
+ }) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
58
+ %21 = triton_gpu.convert_layout %20 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
59
+ %22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xf32, #blocked1>
60
+ %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked1>
61
+ %24 = tt.addptr %23, %9 : tensor<64x1x!tt.ptr<f32, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
62
+ tt.store %24, %22 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked1>
63
+ tt.return
64
+ }
65
+ }
.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttir ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16>
4
+ %c8_i32 = arith.constant 8 : i32
5
+ %c128_i32 = arith.constant 128 : i32
6
+ %c0_i32 = arith.constant 0 : i32
7
+ %cst_0 = arith.constant dense<32768> : tensor<64x1xi32>
8
+ %cst_1 = arith.constant dense<256> : tensor<1x8xi32>
9
+ %cst_2 = arith.constant dense<128> : tensor<1x8xi32>
10
+ %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
11
+ %cst_4 = arith.constant dense<256> : tensor<64x1xi32>
12
+ %c64_i32 = arith.constant 64 : i32
13
+ %0 = tt.get_program_id x : i32
14
+ %1 = arith.muli %0, %c64_i32 : i32
15
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
16
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
17
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
18
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
19
+ %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
20
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
21
+ %8 = arith.remsi %5, %cst_4 : tensor<64x1xi32>
22
+ %9 = arith.divsi %5, %cst_4 : tensor<64x1xi32>
23
+ %10 = tt.broadcast %8 : (tensor<64x1xi32>) -> tensor<64x8xi32>
24
+ %11 = arith.muli %9, %cst_0 : tensor<64x1xi32>
25
+ %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x8xi32>
26
+ %13 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
27
+ %14 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
28
+ %15 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32>) : i32 {
29
+ %20 = tt.splat %arg5 : (i32) -> tensor<1x8xi32>
30
+ %21 = arith.addi %20, %7 : tensor<1x8xi32>
31
+ %22 = arith.cmpi slt, %21, %cst_2 : tensor<1x8xi32>
32
+ %23 = arith.muli %21, %cst_1 : tensor<1x8xi32>
33
+ %24 = tt.broadcast %23 : (tensor<1x8xi32>) -> tensor<64x8xi32>
34
+ %25 = arith.addi %10, %24 : tensor<64x8xi32>
35
+ %26 = arith.addi %25, %12 : tensor<64x8xi32>
36
+ %27 = tt.addptr %13, %26 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
37
+ %28 = tt.broadcast %22 : (tensor<1x8xi1>) -> tensor<64x8xi1>
38
+ %29 = tt.load %27, %28, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
39
+ %30 = arith.extf %29 : tensor<64x8xbf16> to tensor<64x8xf32>
40
+ %31 = tt.addptr %14, %26 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
41
+ %32 = tt.load %31, %28, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
42
+ %33 = arith.mulf %30, %32 : tensor<64x8xf32>
43
+ %34 = arith.addf %arg6, %33 : tensor<64x8xf32>
44
+ %35 = arith.select %28, %34, %arg6 : tensor<64x8xi1>, tensor<64x8xf32>
45
+ scf.yield %35 : tensor<64x8xf32>
46
+ }
47
+ %16 = "tt.reduce"(%15) <{axis = 1 : i32}> ({
48
+ ^bb0(%arg5: f32, %arg6: f32):
49
+ %20 = arith.addf %arg5, %arg6 : f32
50
+ tt.reduce.return %20 : f32
51
+ }) : (tensor<64x8xf32>) -> tensor<64xf32>
52
+ %17 = tt.expand_dims %16 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
53
+ %18 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
54
+ %19 = tt.addptr %18, %5 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi32>
55
+ tt.store %19, %17 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32>
56
+ tt.return
57
+ }
58
+ }
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ptx ADDED
@@ -0,0 +1,456 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3de4e
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3de4e(
13
+ .param .u64 triton__0d1d2d3de4e_param_0,
14
+ .param .u64 triton__0d1d2d3de4e_param_1,
15
+ .param .u64 triton__0d1d2d3de4e_param_2,
16
+ .param .u32 triton__0d1d2d3de4e_param_3,
17
+ .param .u32 triton__0d1d2d3de4e_param_4
18
+ )
19
+ .maxntid 256, 1, 1
20
+ {
21
+ .reg .pred %p<10>;
22
+ .reg .b32 %r<44>;
23
+ .reg .f32 %f<11>;
24
+ .reg .b64 %rd<16>;
25
+ .loc 1 18 0
26
+ $L__func_begin0:
27
+ .loc 1 18 0
28
+
29
+ ld.param.u64 %rd3, [triton__0d1d2d3de4e_param_2];
30
+ ld.param.u64 %rd2, [triton__0d1d2d3de4e_param_1];
31
+ ld.param.u64 %rd1, [triton__0d1d2d3de4e_param_0];
32
+ $L__tmp0:
33
+ .loc 1 22 44
34
+ mov.u32 %r1, %tid.x;
35
+ and.b32 %r2, %r1, 63;
36
+ .loc 1 24 33
37
+ bfe.u32 %r3, %r1, 6, 2;
38
+ .loc 1 21 28
39
+ mov.u32 %r10, %ctaid.x;
40
+ .loc 1 21 33
41
+ shl.b32 %r12, %r10, 6;
42
+ .loc 1 22 23
43
+ or.b32 %r4, %r12, %r2;
44
+ .loc 1 27 36
45
+ shl.b32 %r13, %r3, 17;
46
+ add.s32 %r14, %r13, %r12;
47
+ or.b32 %r42, %r14, %r2;
48
+ mov.f32 %f10, 0f00000000;
49
+ mov.b32 %r43, -4;
50
+ mov.pred %p4, -1;
51
+ $L__BB0_1:
52
+ .loc 1 31 34
53
+ mul.wide.s32 %rd5, %r42, 4;
54
+ add.s64 %rd4, %rd1, %rd5;
55
+ mov.b32 %r16, 0;
56
+ .loc 1 31 53
57
+ mov.u32 %r15, 0x0;
58
+ @%p4 ld.global.L1::evict_first.b32 { %r15 }, [ %rd4 + 0 ];
59
+ @!%p4 mov.u32 %r15, %r16;
60
+ mov.b32 %f4, %r15;
61
+ .loc 1 34 38
62
+ add.f32 %f10, %f10, %f4;
63
+ .loc 1 27 36
64
+ add.s32 %r43, %r43, 4;
65
+ add.s32 %r42, %r42, 524288;
66
+ setp.lt.u32 %p3, %r43, 116;
67
+ @%p3 bra $L__BB0_1;
68
+ $L__tmp1:
69
+ .loc 2 243 36
70
+ shl.b32 %r25, %r3, 2;
71
+ shl.b32 %r26, %r2, 4;
72
+ or.b32 %r27, %r26, %r25;
73
+ mov.u32 %r28, global_smem;
74
+ add.s32 %r17, %r28, %r27;
75
+ mov.b32 %r18, %f10;
76
+ @%p4 st.shared.b32 [ %r17 + 0 ], %r18;
77
+ bar.sync 0;
78
+ setp.lt.s32 %p5, %r1, 256;
79
+ shl.b32 %r29, %r1, 2;
80
+ add.s32 %r20, %r28, %r29;
81
+ @%p5 ld.shared.b32 %r19, [ %r20 + 0 ];
82
+ mov.b32 %f5, %r19;
83
+ shfl.sync.bfly.b32 %r30, %r19, 2, 31, -1;
84
+ mov.b32 %f6, %r30;
85
+ $L__tmp2:
86
+ .loc 2 233 15
87
+ add.f32 %f7, %f5, %f6;
88
+ $L__tmp3:
89
+ .loc 2 243 36
90
+ mov.b32 %r31, %f7;
91
+ shfl.sync.bfly.b32 %r32, %r31, 1, 31, -1;
92
+ mov.b32 %f8, %r32;
93
+ $L__tmp4:
94
+ .loc 2 233 15
95
+ add.f32 %f9, %f7, %f8;
96
+ $L__tmp5:
97
+ .loc 2 243 36
98
+ and.b32 %r33, %r1, 3;
99
+ setp.eq.s32 %p9, %r33, 0;
100
+ and.pred %p6, %p5, %p9;
101
+ mov.b32 %r22, %f9;
102
+ @%p6 st.shared.b32 [ %r20 + 0 ], %r22;
103
+ bar.sync 0;
104
+ add.s32 %r34, %r28, %r26;
105
+ $L__tmp6:
106
+ .loc 1 36 20
107
+ shr.s32 %r36, %r4, 31;
108
+ shr.u32 %r37, %r36, 24;
109
+ add.s32 %r38, %r4, %r37;
110
+ shr.s32 %r39, %r38, 8;
111
+ and.b32 %r40, %r38, -256;
112
+ sub.s32 %r41, %r4, %r40;
113
+ .loc 1 38 30
114
+ mul.wide.s32 %rd9, %r39, 8;
115
+ add.s64 %rd7, %rd2, %rd9;
116
+ .loc 1 45 55
117
+ ld.shared.u32 %r24, [%r34];
118
+ .loc 1 38 35
119
+ mov.u64 %rd6, 0x0;
120
+ @%p4 ld.global.L1::evict_last.b64 { %rd6 }, [ %rd7 + 0 ];
121
+ .loc 1 41 32
122
+ shr.u64 %rd10, %rd6, 54;
123
+ and.b64 %rd11, %rd10, 512;
124
+ add.s64 %rd12, %rd11, %rd6;
125
+ .loc 1 45 30
126
+ shl.b64 %rd13, %rd12, 10;
127
+ add.s64 %rd14, %rd3, %rd13;
128
+ mul.wide.s32 %rd15, %r41, 4;
129
+ add.s64 %rd8, %rd14, %rd15;
130
+ .loc 1 45 55
131
+ setp.eq.s32 %p8, %r3, 0;
132
+ mov.u32 %r23, 0x0;
133
+ @%p8 atom.global.gpu.acq_rel.add.f32 %r23, [ %rd8 + 0 ], %r24;
134
+ .loc 1 45 4
135
+ ret;
136
+ $L__tmp7:
137
+ $L__func_end0:
138
+
139
+ }
140
+ .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py"
141
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
142
+ .section .debug_abbrev
143
+ {
144
+ .b8 1
145
+ .b8 17
146
+ .b8 1
147
+ .b8 37
148
+ .b8 8
149
+ .b8 19
150
+ .b8 5
151
+ .b8 3
152
+ .b8 8
153
+ .b8 16
154
+ .b8 6
155
+ .b8 27
156
+ .b8 8
157
+ .b8 180
158
+ .b8 66
159
+ .b8 12
160
+ .b8 17
161
+ .b8 1
162
+ .b8 18
163
+ .b8 1
164
+ .b8 0
165
+ .b8 0
166
+ .b8 2
167
+ .b8 46
168
+ .b8 0
169
+ .b8 135
170
+ .b8 64
171
+ .b8 8
172
+ .b8 3
173
+ .b8 8
174
+ .b8 58
175
+ .b8 11
176
+ .b8 59
177
+ .b8 11
178
+ .b8 63
179
+ .b8 12
180
+ .b8 32
181
+ .b8 11
182
+ .b8 0
183
+ .b8 0
184
+ .b8 3
185
+ .b8 46
186
+ .b8 1
187
+ .b8 17
188
+ .b8 1
189
+ .b8 18
190
+ .b8 1
191
+ .b8 64
192
+ .b8 10
193
+ .b8 49
194
+ .b8 19
195
+ .b8 0
196
+ .b8 0
197
+ .b8 4
198
+ .b8 29
199
+ .b8 0
200
+ .b8 49
201
+ .b8 19
202
+ .b8 17
203
+ .b8 1
204
+ .b8 18
205
+ .b8 1
206
+ .b8 88
207
+ .b8 11
208
+ .b8 89
209
+ .b8 11
210
+ .b8 87
211
+ .b8 11
212
+ .b8 0
213
+ .b8 0
214
+ .b8 5
215
+ .b8 29
216
+ .b8 1
217
+ .b8 49
218
+ .b8 19
219
+ .b8 17
220
+ .b8 1
221
+ .b8 18
222
+ .b8 1
223
+ .b8 88
224
+ .b8 11
225
+ .b8 89
226
+ .b8 11
227
+ .b8 87
228
+ .b8 11
229
+ .b8 0
230
+ .b8 0
231
+ .b8 0
232
+ }
233
+ .section .debug_info
234
+ {
235
+ .b32 264
236
+ .b8 2
237
+ .b8 0
238
+ .b32 .debug_abbrev
239
+ .b8 8
240
+ .b8 1
241
+ .b8 116
242
+ .b8 114
243
+ .b8 105
244
+ .b8 116
245
+ .b8 111
246
+ .b8 110
247
+ .b8 0
248
+ .b8 2
249
+ .b8 0
250
+ .b8 99
251
+ .b8 54
252
+ .b8 105
253
+ .b8 107
254
+ .b8 53
255
+ .b8 118
256
+ .b8 120
257
+ .b8 55
258
+ .b8 112
259
+ .b8 50
260
+ .b8 50
261
+ .b8 102
262
+ .b8 112
263
+ .b8 107
264
+ .b8 52
265
+ .b8 100
266
+ .b8 99
267
+ .b8 118
268
+ .b8 104
269
+ .b8 53
270
+ .b8 53
271
+ .b8 122
272
+ .b8 105
273
+ .b8 109
274
+ .b8 119
275
+ .b8 52
276
+ .b8 116
277
+ .b8 53
278
+ .b8 110
279
+ .b8 114
280
+ .b8 53
281
+ .b8 122
282
+ .b8 110
283
+ .b8 50
284
+ .b8 98
285
+ .b8 55
286
+ .b8 105
287
+ .b8 110
288
+ .b8 117
289
+ .b8 106
290
+ .b8 120
291
+ .b8 106
292
+ .b8 97
293
+ .b8 117
294
+ .b8 120
295
+ .b8 115
296
+ .b8 104
297
+ .b8 108
298
+ .b8 106
299
+ .b8 117
300
+ .b8 109
301
+ .b8 109
302
+ .b8 46
303
+ .b8 112
304
+ .b8 121
305
+ .b8 0
306
+ .b32 .debug_line
307
+ .b8 47
308
+ .b8 116
309
+ .b8 109
310
+ .b8 112
311
+ .b8 47
312
+ .b8 116
313
+ .b8 111
314
+ .b8 114
315
+ .b8 99
316
+ .b8 104
317
+ .b8 105
318
+ .b8 110
319
+ .b8 100
320
+ .b8 117
321
+ .b8 99
322
+ .b8 116
323
+ .b8 111
324
+ .b8 114
325
+ .b8 95
326
+ .b8 114
327
+ .b8 111
328
+ .b8 111
329
+ .b8 116
330
+ .b8 47
331
+ .b8 54
332
+ .b8 105
333
+ .b8 0
334
+ .b8 1
335
+ .b64 $L__func_begin0
336
+ .b64 $L__func_end0
337
+ .b8 2
338
+ .b8 116
339
+ .b8 114
340
+ .b8 105
341
+ .b8 116
342
+ .b8 111
343
+ .b8 110
344
+ .b8 95
345
+ .b8 95
346
+ .b8 48
347
+ .b8 100
348
+ .b8 49
349
+ .b8 100
350
+ .b8 50
351
+ .b8 100
352
+ .b8 51
353
+ .b8 100
354
+ .b8 101
355
+ .b8 52
356
+ .b8 101
357
+ .b8 0
358
+ .b8 116
359
+ .b8 114
360
+ .b8 105
361
+ .b8 116
362
+ .b8 111
363
+ .b8 110
364
+ .b8 95
365
+ .b8 95
366
+ .b8 48
367
+ .b8 100
368
+ .b8 49
369
+ .b8 100
370
+ .b8 50
371
+ .b8 100
372
+ .b8 51
373
+ .b8 100
374
+ .b8 101
375
+ .b8 52
376
+ .b8 101
377
+ .b8 0
378
+ .b8 1
379
+ .b8 18
380
+ .b8 1
381
+ .b8 1
382
+ .b8 3
383
+ .b64 $L__func_begin0
384
+ .b64 $L__func_end0
385
+ .b8 1
386
+ .b8 156
387
+ .b32 125
388
+ .b8 4
389
+ .b32 125
390
+ .b64 $L__tmp1
391
+ .b64 $L__tmp6
392
+ .b8 2
393
+ .b8 35
394
+ .b8 25
395
+ .b8 5
396
+ .b32 125
397
+ .b64 $L__tmp2
398
+ .b64 $L__tmp5
399
+ .b8 2
400
+ .b8 35
401
+ .b8 25
402
+ .b8 4
403
+ .b32 125
404
+ .b64 $L__tmp2
405
+ .b64 $L__tmp5
406
+ .b8 2
407
+ .b8 243
408
+ .b8 36
409
+ .b8 0
410
+ .b8 0
411
+ .b8 0
412
+ }
413
+ .section .debug_pubnames
414
+ {
415
+ .b32 $L__pubNames_end0-$L__pubNames_start0
416
+ $L__pubNames_start0:
417
+ .b8 2
418
+ .b8 0
419
+ .b32 .debug_info
420
+ .b32 268
421
+ .b32 125
422
+ .b8 116
423
+ .b8 114
424
+ .b8 105
425
+ .b8 116
426
+ .b8 111
427
+ .b8 110
428
+ .b8 95
429
+ .b8 95
430
+ .b8 48
431
+ .b8 100
432
+ .b8 49
433
+ .b8 100
434
+ .b8 50
435
+ .b8 100
436
+ .b8 51
437
+ .b8 100
438
+ .b8 101
439
+ .b8 52
440
+ .b8 101
441
+ .b8 0
442
+ .b32 0
443
+ $L__pubNames_end0:
444
+ }
445
+ .section .debug_pubtypes
446
+ {
447
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
448
+ $L__pubTypes_start0:
449
+ .b8 2
450
+ .b8 0
451
+ .b32 .debug_info
452
+ .b32 268
453
+ .b32 0
454
+ $L__pubTypes_end0:
455
+ }
456
+ .section .debug_loc { }
.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttir ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<256> : tensor<64x1xi64>
4
+ %cst_0 = arith.constant dense<0> : tensor<64x1xi64>
5
+ %cst_1 = arith.constant dense<512> : tensor<64x1xi64>
6
+ %c4_i32 = arith.constant 4 : i32
7
+ %c120_i32 = arith.constant 120 : i32
8
+ %c0_i32 = arith.constant 0 : i32
9
+ %cst_2 = arith.constant dense<true> : tensor<64x1xi1>
10
+ %cst_3 = arith.constant dense<256> : tensor<64x1xi32>
11
+ %cst_4 = arith.constant dense<131072> : tensor<1x4xi32>
12
+ %cst_5 = arith.constant dense<120> : tensor<1x4xi32>
13
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x4xf32>
14
+ %c64_i32 = arith.constant 64 : i32
15
+ %0 = tt.get_program_id x : i32
16
+ %1 = arith.muli %0, %c64_i32 : i32
17
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
18
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
19
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
20
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
21
+ %6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
22
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32>
23
+ %8 = tt.broadcast %5 : (tensor<64x1xi32>) -> tensor<64x4xi32>
24
+ %9 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
25
+ %10 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c4_i32 iter_args(%arg6 = %cst_6) -> (tensor<64x4xf32>) : i32 {
26
+ %27 = tt.splat %arg5 : (i32) -> tensor<1x4xi32>
27
+ %28 = arith.addi %27, %7 : tensor<1x4xi32>
28
+ %29 = arith.cmpi slt, %28, %cst_5 : tensor<1x4xi32>
29
+ %30 = arith.muli %28, %cst_4 : tensor<1x4xi32>
30
+ %31 = tt.broadcast %30 : (tensor<1x4xi32>) -> tensor<64x4xi32>
31
+ %32 = arith.addi %8, %31 : tensor<64x4xi32>
32
+ %33 = tt.addptr %9, %32 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
33
+ %34 = tt.broadcast %29 : (tensor<1x4xi1>) -> tensor<64x4xi1>
34
+ %35 = tt.load %33, %34, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32>
35
+ %36 = arith.addf %arg6, %35 : tensor<64x4xf32>
36
+ %37 = arith.select %34, %36, %arg6 : tensor<64x4xi1>, tensor<64x4xf32>
37
+ scf.yield %37 : tensor<64x4xf32>
38
+ }
39
+ %11 = "tt.reduce"(%10) <{axis = 1 : i32}> ({
40
+ ^bb0(%arg5: f32, %arg6: f32):
41
+ %27 = arith.addf %arg5, %arg6 : f32
42
+ tt.reduce.return %27 : f32
43
+ }) : (tensor<64x4xf32>) -> tensor<64xf32>
44
+ %12 = tt.expand_dims %11 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
45
+ %13 = arith.divsi %5, %cst_3 : tensor<64x1xi32>
46
+ %14 = arith.remsi %5, %cst_3 : tensor<64x1xi32>
47
+ %15 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
48
+ %16 = tt.addptr %15, %13 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
49
+ %17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
50
+ %18 = arith.addi %17, %cst_1 : tensor<64x1xi64>
51
+ %19 = arith.cmpi slt, %17, %cst_0 : tensor<64x1xi64>
52
+ %20 = arith.select %19, %18, %17 : tensor<64x1xi1>, tensor<64x1xi64>
53
+ %21 = arith.muli %20, %cst : tensor<64x1xi64>
54
+ %22 = arith.extsi %14 : tensor<64x1xi32> to tensor<64x1xi64>
55
+ %23 = arith.addi %22, %21 : tensor<64x1xi64>
56
+ %24 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
57
+ %25 = tt.addptr %24, %23 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi64>
58
+ %26 = "tt.atomic_rmw"(%25, %12, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xf32>, tensor<64x1xi1>) -> tensor<64x1xf32>
59
+ tt.return
60
+ }
61
+ }
.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.cubin ADDED
Binary file (40.4 kB). View file
 
.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ptx ADDED
@@ -0,0 +1,809 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6de7de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
23
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
26
+
27
+ .visible .entry triton__0d1d2d3d4d5d6de7de(
28
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
29
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
30
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
31
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
32
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
33
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
34
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
35
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
36
+ )
37
+ .maxntid 256, 1, 1
38
+ {
39
+ .reg .pred %p<33>;
40
+ .reg .b16 %rs<13>;
41
+ .reg .b32 %r<93>;
42
+ .reg .f32 %f<79>;
43
+ .reg .b64 %rd<92>;
44
+ .loc 1 18 0
45
+ $L__func_begin0:
46
+ .loc 1 18 0
47
+
48
+ ld.param.u64 %rd37, [triton__0d1d2d3d4d5d6de7de_param_4];
49
+ ld.param.u64 %rd36, [triton__0d1d2d3d4d5d6de7de_param_3];
50
+ ld.param.u64 %rd35, [triton__0d1d2d3d4d5d6de7de_param_2];
51
+ ld.param.u64 %rd34, [triton__0d1d2d3d4d5d6de7de_param_1];
52
+ ld.param.u64 %rd43, [triton__0d1d2d3d4d5d6de7de_param_0];
53
+ $L__tmp0:
54
+ .loc 1 22 44
55
+ mov.u32 %r1, %tid.x;
56
+ bfe.u32 %r2, %r1, 2, 6;
57
+ and.b32 %r16, %r1, 63;
58
+ .loc 1 24 33
59
+ and.b32 %r3, %r1, 3;
60
+ .loc 1 21 28
61
+ mov.u32 %r15, %ctaid.x;
62
+ .loc 1 21 33
63
+ shl.b32 %r17, %r15, 6;
64
+ .loc 1 22 23
65
+ or.b32 %r18, %r17, %r2;
66
+ or.b32 %r19, %r17, %r16;
67
+ .loc 1 26 30
68
+ mul.wide.s32 %rd44, %r18, 8;
69
+ add.s64 %rd40, %rd43, %rd44;
70
+ mul.wide.s32 %rd45, %r19, 8;
71
+ add.s64 %rd42, %rd43, %rd45;
72
+ mov.pred %p13, -1;
73
+ .loc 1 26 35
74
+ mov.u64 %rd39, 0x0;
75
+ @%p13 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd40 + 0 ];
76
+ mov.u64 %rd41, 0x0;
77
+ @%p13 ld.global.L1::evict_last.b64 { %rd41 }, [ %rd42 + 0 ];
78
+ .loc 1 27 18
79
+ bfe.s32 %r20, %r15, 25, 1;
80
+ shr.u32 %r21, %r20, 23;
81
+ add.s32 %r22, %r18, %r21;
82
+ and.b32 %r23, %r22, 16776704;
83
+ sub.s32 %r24, %r18, %r23;
84
+ .loc 1 35 44
85
+ shl.b32 %r5, %r24, 8;
86
+ .loc 1 37 22
87
+ add.s64 %rd46, %rd41, 50257;
88
+ .loc 1 38 22
89
+ setp.lt.s64 %p3, %rd39, 0;
90
+ setp.lt.s64 %p4, %rd41, 0;
91
+ .loc 1 39 36
92
+ selp.b64 %rd47, %rd46, %rd41, %p4;
93
+ .loc 1 40 40
94
+ setp.gt.u64 %p5, %rd47, 50256;
95
+ .loc 1 41 44
96
+ shl.b64 %rd48, %rd39, 8;
97
+ add.s64 %rd49, %rd48, 12865792;
98
+ selp.b64 %rd2, %rd49, %rd48, %p3;
99
+ mov.u16 %rs12, 0;
100
+ mov.b32 %r76, 0;
101
+ mov.b32 %r88, 883;
102
+ mov.u64 %rd81, 1;
103
+ .loc 1 40 55
104
+ @%p5 bra $L__BB0_3;
105
+ bra.uni $L__BB0_1;
106
+ $L__BB0_3:
107
+ .loc 1 31 36
108
+ shl.b64 %rd55, %rd2, 2;
109
+ mul.wide.u32 %rd88, %r3, 4;
110
+ add.s64 %rd87, %rd55, %rd88;
111
+ add.s64 %rd83, %rd34, %rd87;
112
+ shl.b32 %r42, %r15, 14;
113
+ shl.b32 %r43, %r2, 8;
114
+ or.b32 %r44, %r42, %r43;
115
+ or.b32 %r91, %r44, %r3;
116
+ add.s32 %r45, %r5, %r3;
117
+ mul.wide.s32 %rd86, %r45, 4;
118
+ add.s64 %rd82, %rd35, %rd86;
119
+ mov.f32 %f78, 0f00000000;
120
+ mov.b32 %r89, -4;
121
+ mov.f32 %f77, %f78;
122
+ mov.f32 %f76, %f78;
123
+ $L__BB0_4:
124
+ .loc 1 35 50
125
+ mov.u32 %r46, 0x0;
126
+ @%p13 ld.global.L1::evict_last.b32 { %r46 }, [ %rd82 + 0 ];
127
+ @!%p13 mov.u32 %r46, %r76;
128
+ mov.b32 %f31, %r46;
129
+ .loc 1 31 36
130
+ add.s32 %r89, %r89, 4;
131
+ .loc 1 36 34
132
+ add.s32 %r54, %r89, %r91;
133
+ mul.wide.s32 %rd59, %r54, 2;
134
+ add.s64 %rd57, %rd36, %rd59;
135
+ .loc 1 36 50
136
+ mov.u16 %rs4, 0x0;
137
+ @%p13 ld.global.L1::evict_last.b16 { %rs4 }, [ %rd57 + 0 ];
138
+ @!%p13 mov.u16 %rs4, %rs12;
139
+ .loc 1 36 101
140
+ cvt.f32.bf16 %r48, %rs4;
141
+ mov.b32 %f32, %r48;
142
+ .loc 1 40 55
143
+ mov.u64 %rd60, assertMessage_0;
144
+ cvta.global.u64 %rd61, %rd60;
145
+ mov.u64 %rd62, assertFile_0;
146
+ cvta.global.u64 %rd63, %rd62;
147
+ mov.u64 %rd64, assertFunc_0;
148
+ cvta.global.u64 %rd65, %rd64;
149
+ { // callseq 10, 0
150
+ .reg .b32 temp_param_reg;
151
+ .param .b64 param0;
152
+ st.param.b64 [param0+0], %rd61;
153
+ .param .b64 param1;
154
+ st.param.b64 [param1+0], %rd63;
155
+ .param .b32 param2;
156
+ st.param.b32 [param2+0], %r88;
157
+ .param .b64 param3;
158
+ st.param.b64 [param3+0], %rd65;
159
+ .param .b64 param4;
160
+ st.param.b64 [param4+0], %rd81;
161
+ call.uni
162
+ __assertfail,
163
+ (
164
+ param0,
165
+ param1,
166
+ param2,
167
+ param3,
168
+ param4
169
+ );
170
+ } // callseq 10
171
+ .loc 1 41 52
172
+ mov.u32 %r49, 0x0;
173
+ @%p13 ld.global.L1::evict_last.b32 { %r49 }, [ %rd83 + 0 ];
174
+ @!%p13 mov.u32 %r49, %r76;
175
+ mov.b32 %f33, %r49;
176
+ .loc 1 42 22
177
+ add.f32 %f34, %f31, %f33;
178
+ .loc 1 44 22
179
+ add.f32 %f35, %f32, %f34;
180
+ $L__tmp1:
181
+ .loc 2 96 20
182
+ sub.f32 %f36, %f35, %f76;
183
+ .loc 2 97 26
184
+ add.f32 %f78, %f78, 0f3F800000;
185
+ .loc 2 98 30
186
+ mov.b32 %r52, %f36;
187
+ mov.b32 %r53, %f78;
188
+ div.full.f32 %r51, %r52, %r53;
189
+ mov.b32 %f37, %r51;
190
+ .loc 2 98 22
191
+ add.f32 %f76, %f76, %f37;
192
+ .loc 2 101 30
193
+ sub.f32 %f38, %f35, %f76;
194
+ $L__tmp2:
195
+ .loc 1 50 50
196
+ fma.rn.f32 %f77, %f36, %f38, %f77;
197
+ .loc 1 31 36
198
+ add.s64 %rd83, %rd83, 16;
199
+ add.s64 %rd82, %rd82, 16;
200
+ setp.lt.u32 %p19, %r89, 252;
201
+ @%p19 bra $L__BB0_4;
202
+ bra.uni $L__BB0_5;
203
+ $L__BB0_1:
204
+ .loc 1 0 36
205
+ mov.b32 %r90, -4;
206
+ .loc 1 31 36
207
+ shl.b64 %rd50, %rd2, 2;
208
+ mul.wide.u32 %rd88, %r3, 4;
209
+ add.s64 %rd87, %rd50, %rd88;
210
+ add.s64 %rd85, %rd34, %rd87;
211
+ shl.b32 %r27, %r15, 14;
212
+ shl.b32 %r28, %r2, 8;
213
+ or.b32 %r29, %r27, %r28;
214
+ or.b32 %r91, %r29, %r3;
215
+ add.s32 %r30, %r5, %r3;
216
+ mul.wide.s32 %rd86, %r30, 4;
217
+ add.s64 %rd84, %rd35, %rd86;
218
+ mov.f32 %f78, 0f00000000;
219
+ mov.f32 %f77, %f78;
220
+ mov.f32 %f76, %f78;
221
+ $L__BB0_2:
222
+ .loc 1 35 50
223
+ mov.u32 %r31, 0x0;
224
+ @%p13 ld.global.L1::evict_last.b32 { %r31 }, [ %rd84 + 0 ];
225
+ @!%p13 mov.u32 %r31, %r76;
226
+ mov.b32 %f22, %r31;
227
+ .loc 1 31 36
228
+ add.s32 %r90, %r90, 4;
229
+ .loc 1 36 34
230
+ add.s32 %r39, %r90, %r91;
231
+ mul.wide.s32 %rd54, %r39, 2;
232
+ add.s64 %rd52, %rd36, %rd54;
233
+ .loc 1 36 50
234
+ mov.u16 %rs1, 0x0;
235
+ @%p13 ld.global.L1::evict_last.b16 { %rs1 }, [ %rd52 + 0 ];
236
+ @!%p13 mov.u16 %rs1, %rs12;
237
+ .loc 1 36 101
238
+ cvt.f32.bf16 %r33, %rs1;
239
+ mov.b32 %f23, %r33;
240
+ .loc 1 41 52
241
+ mov.u32 %r34, 0x0;
242
+ @%p13 ld.global.L1::evict_last.b32 { %r34 }, [ %rd85 + 0 ];
243
+ @!%p13 mov.u32 %r34, %r76;
244
+ mov.b32 %f24, %r34;
245
+ .loc 1 42 22
246
+ add.f32 %f25, %f22, %f24;
247
+ .loc 1 44 22
248
+ add.f32 %f26, %f23, %f25;
249
+ $L__tmp3:
250
+ .loc 2 96 20
251
+ sub.f32 %f27, %f26, %f76;
252
+ .loc 2 97 26
253
+ add.f32 %f78, %f78, 0f3F800000;
254
+ .loc 2 98 30
255
+ mov.b32 %r37, %f27;
256
+ mov.b32 %r38, %f78;
257
+ div.full.f32 %r36, %r37, %r38;
258
+ mov.b32 %f28, %r36;
259
+ .loc 2 98 22
260
+ add.f32 %f76, %f76, %f28;
261
+ .loc 2 101 30
262
+ sub.f32 %f29, %f26, %f76;
263
+ $L__tmp4:
264
+ .loc 1 50 50
265
+ fma.rn.f32 %f77, %f27, %f29, %f77;
266
+ .loc 1 31 36
267
+ add.s64 %rd85, %rd85, 16;
268
+ add.s64 %rd84, %rd84, 16;
269
+ setp.lt.u32 %p12, %r90, 252;
270
+ @%p12 bra $L__BB0_2;
271
+ $L__BB0_5:
272
+ .loc 1 0 36
273
+ ld.param.u64 %rd38, [triton__0d1d2d3d4d5d6de7de_param_5];
274
+ $L__tmp5:
275
+ .loc 2 120 46
276
+ mov.b32 %r66, %f76;
277
+ shfl.sync.bfly.b32 %r67, %r66, 2, 31, -1;
278
+ mov.b32 %f39, %r67;
279
+ mov.b32 %r68, %f77;
280
+ shfl.sync.bfly.b32 %r69, %r68, 2, 31, -1;
281
+ mov.b32 %f40, %r69;
282
+ mov.b32 %r70, %f78;
283
+ shfl.sync.bfly.b32 %r57, %r70, 2, 31, -1;
284
+ mov.b32 %f41, %r57;
285
+ $L__tmp6:
286
+ .loc 2 108 21
287
+ sub.f32 %f42, %f39, %f76;
288
+ .loc 2 109 28
289
+ add.f32 %f43, %f78, %f41;
290
+ .loc 2 110 39
291
+ setp.eq.f32 %p20, %f43, 0f00000000;
292
+ .loc 2 110 60
293
+ mov.b32 %r58, %f43;
294
+ div.full.f32 %r56, %r57, %r58;
295
+ mov.b32 %f44, %r56;
296
+ .loc 2 110 49
297
+ selp.f32 %f45, 0f00000000, %f44, %p20;
298
+ .loc 2 112 17
299
+ fma.rn.f32 %f46, %f42, %f45, %f76;
300
+ .loc 2 113 15
301
+ add.f32 %f47, %f77, %f40;
302
+ .loc 2 113 30
303
+ mul.f32 %f48, %f42, %f42;
304
+ .loc 2 113 38
305
+ mul.f32 %f49, %f78, %f48;
306
+ .loc 2 113 22
307
+ fma.rn.f32 %f50, %f49, %f45, %f47;
308
+ $L__tmp7:
309
+ .loc 2 120 46
310
+ mov.b32 %r71, %f46;
311
+ shfl.sync.bfly.b32 %r72, %r71, 1, 31, -1;
312
+ mov.b32 %f51, %r72;
313
+ mov.b32 %r73, %f50;
314
+ shfl.sync.bfly.b32 %r74, %r73, 1, 31, -1;
315
+ mov.b32 %f52, %r74;
316
+ shfl.sync.bfly.b32 %r60, %r58, 1, 31, -1;
317
+ mov.b32 %f53, %r60;
318
+ $L__tmp8:
319
+ .loc 2 108 21
320
+ sub.f32 %f54, %f51, %f46;
321
+ .loc 2 109 28
322
+ add.f32 %f55, %f43, %f53;
323
+ .loc 2 110 39
324
+ setp.eq.f32 %p21, %f55, 0f00000000;
325
+ .loc 2 110 60
326
+ mov.b32 %r61, %f55;
327
+ div.full.f32 %r59, %r60, %r61;
328
+ mov.b32 %f56, %r59;
329
+ .loc 2 110 49
330
+ selp.f32 %f57, 0f00000000, %f56, %p21;
331
+ .loc 2 112 17
332
+ fma.rn.f32 %f16, %f54, %f57, %f46;
333
+ .loc 2 113 15
334
+ add.f32 %f58, %f50, %f52;
335
+ .loc 2 113 30
336
+ mul.f32 %f59, %f54, %f54;
337
+ .loc 2 113 38
338
+ mul.f32 %f60, %f43, %f59;
339
+ .loc 2 113 22
340
+ fma.rn.f32 %f61, %f57, %f60, %f58;
341
+ $L__tmp9:
342
+ .loc 1 75 24
343
+ mov.b32 %r63, %f61;
344
+ mov.b32 %r64, 1132462080;
345
+ div.full.f32 %r62, %r63, %r64;
346
+ mov.b32 %f62, %r62;
347
+ .loc 1 77 24
348
+ add.f32 %f17, %f62, 0f3727C5AC;
349
+ .loc 1 58 36
350
+ add.s64 %rd91, %rd34, %rd87;
351
+ add.s64 %rd90, %rd37, %rd88;
352
+ add.s64 %rd89, %rd35, %rd86;
353
+ mov.b32 %r92, -4;
354
+ setp.lt.u64 %p28, %rd47, 50257;
355
+ rsqrt.approx.ftz.f32 %f67, %f17;
356
+ bra.uni $L__BB0_6;
357
+ $L__BB0_8:
358
+ .loc 1 0 0
359
+ mov.b32 %f18, %r75;
360
+ cvt.s64.s32 %rd30, %r81;
361
+ cvt.f32.bf16 %r77, %rs7;
362
+ mov.b32 %f19, %r77;
363
+ mov.b32 %f20, %r78;
364
+ .loc 1 69 54
365
+ mov.u32 %r83, 0x0;
366
+ @%p13 ld.global.L1::evict_first.b32 { %r83 }, [ %rd91 + 0 ];
367
+ @!%p13 mov.u32 %r83, %r76;
368
+ mov.b32 %f63, %r83;
369
+ .loc 1 70 24
370
+ add.f32 %f64, %f18, %f63;
371
+ .loc 1 72 24
372
+ add.f32 %f65, %f19, %f64;
373
+ .loc 1 73 24
374
+ sub.f32 %f66, %f65, %f16;
375
+ .loc 1 79 24
376
+ mul.f32 %f68, %f66, %f67;
377
+ .loc 1 80 24
378
+ mul.f32 %f69, %f68, %f20;
379
+ .loc 1 82 29
380
+ shl.b64 %rd80, %rd30, 1;
381
+ add.s64 %rd79, %rd38, %rd80;
382
+ .loc 1 82 52
383
+ mov.b32 %r85, %f69;
384
+ cvt.rn.bf16.f32 %rs10, %r85;
385
+ @%p13 st.global.b16 [ %rd79 + 0 ], { %rs10 };
386
+ .loc 1 58 36
387
+ add.s32 %r92, %r92, 4;
388
+ add.s64 %rd91, %rd91, 16;
389
+ add.s64 %rd90, %rd90, 16;
390
+ add.s64 %rd89, %rd89, 16;
391
+ setp.lt.u32 %p32, %r92, 252;
392
+ @%p32 bra $L__BB0_6;
393
+ bra.uni $L__BB0_9;
394
+ $L__BB0_6:
395
+ .loc 1 62 51
396
+ mov.u32 %r75, 0x0;
397
+ @%p13 ld.global.L1::evict_last.b32 { %r75 }, [ %rd89 + 0 ];
398
+ @!%p13 mov.u32 %r75, %r76;
399
+ .loc 1 63 35
400
+ add.s32 %r80, %r91, %r92;
401
+ add.s32 %r81, %r80, 4;
402
+ mul.wide.s32 %rd70, %r81, 2;
403
+ add.s64 %rd68, %rd36, %rd70;
404
+ .loc 1 63 51
405
+ mov.u16 %rs7, 0x0;
406
+ @%p13 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd68 + 0 ];
407
+ @!%p13 mov.u16 %rs7, %rs12;
408
+ .loc 1 64 40
409
+ mov.u32 %r78, 0x0;
410
+ @%p13 ld.global.L1::evict_last.b32 { %r78 }, [ %rd90 + 0 ];
411
+ @!%p13 mov.u32 %r78, %r76;
412
+ .loc 1 68 57
413
+ @%p28 bra $L__BB0_8;
414
+ mov.u64 %rd71, assertMessage_1;
415
+ cvta.global.u64 %rd72, %rd71;
416
+ mov.u64 %rd73, assertFile_1;
417
+ cvta.global.u64 %rd74, %rd73;
418
+ mov.u64 %rd75, assertFunc_1;
419
+ cvta.global.u64 %rd76, %rd75;
420
+ { // callseq 11, 0
421
+ .reg .b32 temp_param_reg;
422
+ .param .b64 param0;
423
+ st.param.b64 [param0+0], %rd72;
424
+ .param .b64 param1;
425
+ st.param.b64 [param1+0], %rd74;
426
+ .param .b32 param2;
427
+ st.param.b32 [param2+0], %r88;
428
+ .param .b64 param3;
429
+ st.param.b64 [param3+0], %rd76;
430
+ .param .b64 param4;
431
+ st.param.b64 [param4+0], %rd81;
432
+ call.uni
433
+ __assertfail,
434
+ (
435
+ param0,
436
+ param1,
437
+ param2,
438
+ param3,
439
+ param4
440
+ );
441
+ } // callseq 11
442
+ bra.uni $L__BB0_8;
443
+ $L__BB0_9:
444
+ .loc 1 58 4
445
+ ret;
446
+ $L__tmp10:
447
+ $L__func_end0:
448
+
449
+ }
450
+ // .globl __nv_rsqrtf
451
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
452
+ .param .b32 __nv_rsqrtf_param_0
453
+ )
454
+ {
455
+ .reg .f32 %f<3>;
456
+ $L__func_begin1:
457
+
458
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
459
+ rsqrt.approx.ftz.f32 %f2, %f1;
460
+ st.param.f32 [func_retval0+0], %f2;
461
+ ret;
462
+ $L__func_end1:
463
+
464
+ }
465
+ .file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py"
466
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
467
+ .section .debug_abbrev
468
+ {
469
+ .b8 1
470
+ .b8 17
471
+ .b8 1
472
+ .b8 37
473
+ .b8 8
474
+ .b8 19
475
+ .b8 5
476
+ .b8 3
477
+ .b8 8
478
+ .b8 16
479
+ .b8 6
480
+ .b8 27
481
+ .b8 8
482
+ .b8 180
483
+ .b8 66
484
+ .b8 12
485
+ .b8 17
486
+ .b8 1
487
+ .b8 18
488
+ .b8 1
489
+ .b8 0
490
+ .b8 0
491
+ .b8 2
492
+ .b8 46
493
+ .b8 0
494
+ .b8 135
495
+ .b8 64
496
+ .b8 8
497
+ .b8 3
498
+ .b8 8
499
+ .b8 58
500
+ .b8 11
501
+ .b8 59
502
+ .b8 11
503
+ .b8 63
504
+ .b8 12
505
+ .b8 32
506
+ .b8 11
507
+ .b8 0
508
+ .b8 0
509
+ .b8 3
510
+ .b8 46
511
+ .b8 1
512
+ .b8 17
513
+ .b8 1
514
+ .b8 18
515
+ .b8 1
516
+ .b8 64
517
+ .b8 10
518
+ .b8 49
519
+ .b8 19
520
+ .b8 0
521
+ .b8 0
522
+ .b8 4
523
+ .b8 29
524
+ .b8 0
525
+ .b8 49
526
+ .b8 19
527
+ .b8 17
528
+ .b8 1
529
+ .b8 18
530
+ .b8 1
531
+ .b8 88
532
+ .b8 11
533
+ .b8 89
534
+ .b8 11
535
+ .b8 87
536
+ .b8 11
537
+ .b8 0
538
+ .b8 0
539
+ .b8 5
540
+ .b8 29
541
+ .b8 1
542
+ .b8 49
543
+ .b8 19
544
+ .b8 17
545
+ .b8 1
546
+ .b8 18
547
+ .b8 1
548
+ .b8 88
549
+ .b8 11
550
+ .b8 89
551
+ .b8 11
552
+ .b8 87
553
+ .b8 11
554
+ .b8 0
555
+ .b8 0
556
+ .b8 0
557
+ }
558
+ .section .debug_info
559
+ {
560
+ .b32 302
561
+ .b8 2
562
+ .b8 0
563
+ .b32 .debug_abbrev
564
+ .b8 8
565
+ .b8 1
566
+ .b8 116
567
+ .b8 114
568
+ .b8 105
569
+ .b8 116
570
+ .b8 111
571
+ .b8 110
572
+ .b8 0
573
+ .b8 2
574
+ .b8 0
575
+ .b8 99
576
+ .b8 112
577
+ .b8 110
578
+ .b8 51
579
+ .b8 108
580
+ .b8 97
581
+ .b8 119
582
+ .b8 103
583
+ .b8 54
584
+ .b8 53
585
+ .b8 108
586
+ .b8 112
587
+ .b8 105
588
+ .b8 54
589
+ .b8 51
590
+ .b8 103
591
+ .b8 118
592
+ .b8 54
593
+ .b8 99
594
+ .b8 54
595
+ .b8 112
596
+ .b8 110
597
+ .b8 52
598
+ .b8 111
599
+ .b8 105
600
+ .b8 107
601
+ .b8 104
602
+ .b8 103
603
+ .b8 54
604
+ .b8 113
605
+ .b8 118
606
+ .b8 97
607
+ .b8 50
608
+ .b8 104
609
+ .b8 50
610
+ .b8 113
611
+ .b8 106
612
+ .b8 100
613
+ .b8 112
614
+ .b8 120
615
+ .b8 101
616
+ .b8 54
617
+ .b8 113
618
+ .b8 106
619
+ .b8 52
620
+ .b8 108
621
+ .b8 118
622
+ .b8 116
623
+ .b8 116
624
+ .b8 119
625
+ .b8 101
626
+ .b8 122
627
+ .b8 46
628
+ .b8 112
629
+ .b8 121
630
+ .b8 0
631
+ .b32 .debug_line
632
+ .b8 47
633
+ .b8 116
634
+ .b8 109
635
+ .b8 112
636
+ .b8 47
637
+ .b8 116
638
+ .b8 111
639
+ .b8 114
640
+ .b8 99
641
+ .b8 104
642
+ .b8 105
643
+ .b8 110
644
+ .b8 100
645
+ .b8 117
646
+ .b8 99
647
+ .b8 116
648
+ .b8 111
649
+ .b8 114
650
+ .b8 95
651
+ .b8 114
652
+ .b8 111
653
+ .b8 111
654
+ .b8 116
655
+ .b8 47
656
+ .b8 112
657
+ .b8 110
658
+ .b8 0
659
+ .b8 1
660
+ .b64 $L__func_begin0
661
+ .b64 $L__func_end0
662
+ .b8 2
663
+ .b8 116
664
+ .b8 114
665
+ .b8 105
666
+ .b8 116
667
+ .b8 111
668
+ .b8 110
669
+ .b8 95
670
+ .b8 95
671
+ .b8 48
672
+ .b8 100
673
+ .b8 49
674
+ .b8 100
675
+ .b8 50
676
+ .b8 100
677
+ .b8 51
678
+ .b8 100
679
+ .b8 52
680
+ .b8 100
681
+ .b8 53
682
+ .b8 100
683
+ .b8 54
684
+ .b8 100
685
+ .b8 101
686
+ .b8 55
687
+ .b8 100
688
+ .b8 101
689
+ .b8 0
690
+ .b8 116
691
+ .b8 114
692
+ .b8 105
693
+ .b8 116
694
+ .b8 111
695
+ .b8 110
696
+ .b8 95
697
+ .b8 95
698
+ .b8 48
699
+ .b8 100
700
+ .b8 49
701
+ .b8 100
702
+ .b8 50
703
+ .b8 100
704
+ .b8 51
705
+ .b8 100
706
+ .b8 52
707
+ .b8 100
708
+ .b8 53
709
+ .b8 100
710
+ .b8 54
711
+ .b8 100
712
+ .b8 101
713
+ .b8 55
714
+ .b8 100
715
+ .b8 101
716
+ .b8 0
717
+ .b8 1
718
+ .b8 18
719
+ .b8 1
720
+ .b8 1
721
+ .b8 3
722
+ .b64 $L__func_begin0
723
+ .b64 $L__func_end0
724
+ .b8 1
725
+ .b8 156
726
+ .b32 125
727
+ .b8 4
728
+ .b32 125
729
+ .b64 $L__tmp1
730
+ .b64 $L__tmp4
731
+ .b8 2
732
+ .b8 47
733
+ .b8 41
734
+ .b8 4
735
+ .b32 125
736
+ .b64 $L__tmp5
737
+ .b64 $L__tmp8
738
+ .b8 2
739
+ .b8 53
740
+ .b8 44
741
+ .b8 5
742
+ .b32 125
743
+ .b64 $L__tmp6
744
+ .b64 $L__tmp9
745
+ .b8 2
746
+ .b8 53
747
+ .b8 44
748
+ .b8 4
749
+ .b32 125
750
+ .b64 $L__tmp6
751
+ .b64 $L__tmp9
752
+ .b8 2
753
+ .b8 120
754
+ .b8 46
755
+ .b8 0
756
+ .b8 0
757
+ .b8 0
758
+ }
759
+ .section .debug_pubnames
760
+ {
761
+ .b32 $L__pubNames_end0-$L__pubNames_start0
762
+ $L__pubNames_start0:
763
+ .b8 2
764
+ .b8 0
765
+ .b32 .debug_info
766
+ .b32 306
767
+ .b32 125
768
+ .b8 116
769
+ .b8 114
770
+ .b8 105
771
+ .b8 116
772
+ .b8 111
773
+ .b8 110
774
+ .b8 95
775
+ .b8 95
776
+ .b8 48
777
+ .b8 100
778
+ .b8 49
779
+ .b8 100
780
+ .b8 50
781
+ .b8 100
782
+ .b8 51
783
+ .b8 100
784
+ .b8 52
785
+ .b8 100
786
+ .b8 53
787
+ .b8 100
788
+ .b8 54
789
+ .b8 100
790
+ .b8 101
791
+ .b8 55
792
+ .b8 100
793
+ .b8 101
794
+ .b8 0
795
+ .b32 0
796
+ $L__pubNames_end0:
797
+ }
798
+ .section .debug_pubtypes
799
+ {
800
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
801
+ $L__pubTypes_start0:
802
+ .b8 2
803
+ .b8 0
804
+ .b32 .debug_info
805
+ .b32 306
806
+ .b32 0
807
+ $L__pubTypes_end0:
808
+ }
809
+ .section .debug_loc { }
.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ptx ADDED
@@ -0,0 +1,717 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5de6de
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
12
+
13
+ .visible .entry triton__0d1d2d3d4d5de6de(
14
+ .param .u64 triton__0d1d2d3d4d5de6de_param_0,
15
+ .param .u64 triton__0d1d2d3d4d5de6de_param_1,
16
+ .param .u64 triton__0d1d2d3d4d5de6de_param_2,
17
+ .param .u64 triton__0d1d2d3d4d5de6de_param_3,
18
+ .param .u64 triton__0d1d2d3d4d5de6de_param_4,
19
+ .param .u32 triton__0d1d2d3d4d5de6de_param_5,
20
+ .param .u32 triton__0d1d2d3d4d5de6de_param_6
21
+ )
22
+ .maxntid 64, 1, 1
23
+ {
24
+ .reg .pred %p<26>;
25
+ .reg .b16 %rs<9>;
26
+ .reg .b32 %r<88>;
27
+ .reg .f32 %f<78>;
28
+ .reg .b64 %rd<14>;
29
+ .loc 1 18 0
30
+ $L__func_begin0:
31
+ .loc 1 18 0
32
+
33
+ ld.param.u64 %rd6, [triton__0d1d2d3d4d5de6de_param_0];
34
+ ld.param.u64 %rd7, [triton__0d1d2d3d4d5de6de_param_1];
35
+ $L__tmp0:
36
+ .loc 1 26 26
37
+ mov.u32 %r56, %tid.x;
38
+ and.b32 %r57, %r56, 31;
39
+ ld.param.u64 %rd8, [triton__0d1d2d3d4d5de6de_param_2];
40
+ ld.param.u64 %rd9, [triton__0d1d2d3d4d5de6de_param_3];
41
+ ld.param.u64 %rd10, [triton__0d1d2d3d4d5de6de_param_4];
42
+ shl.b32 %r58, %r56, 2;
43
+ and.b32 %r59, %r58, 252;
44
+ .loc 1 23 28
45
+ mov.u32 %r1, %ctaid.x;
46
+ .loc 1 30 40
47
+ shl.b32 %r60, %r1, 8;
48
+ .loc 1 30 36
49
+ or.b32 %r61, %r60, %r59;
50
+ .loc 1 30 30
51
+ mul.wide.s32 %rd11, %r61, 4;
52
+ add.s64 %rd1, %rd6, %rd11;
53
+ mov.b32 %r6, 0;
54
+ mov.pred %p1, -1;
55
+ .loc 1 30 46
56
+ mov.u32 %r2, 0x0;
57
+ mov.u32 %r3, 0x0;
58
+ mov.u32 %r4, 0x0;
59
+ mov.u32 %r5, 0x0;
60
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
61
+ @!%p1 mov.u32 %r2, %r6;
62
+ @!%p1 mov.u32 %r3, %r6;
63
+ @!%p1 mov.u32 %r4, %r6;
64
+ @!%p1 mov.u32 %r5, %r6;
65
+ mov.b32 %f1, %r4;
66
+ mov.b32 %f2, %r5;
67
+ .loc 1 31 30
68
+ mul.wide.s32 %rd12, %r61, 2;
69
+ add.s64 %rd2, %rd7, %rd12;
70
+ .loc 1 31 46
71
+ mov.u32 %r10, 0x0;
72
+ mov.u32 %r11, 0x0;
73
+ @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
74
+ @!%p1 mov.u32 %r10, %r6;
75
+ @!%p1 mov.u32 %r11, %r6;
76
+ cvt.u16.u32 %rs1, %r10;
77
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
78
+ cvt.u16.u32 %rs3, %r11;
79
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
80
+ .loc 1 31 67
81
+ cvt.f32.bf16 %r14, %rs1;
82
+ mov.b32 %f3, %r14;
83
+ cvt.f32.bf16 %r15, %rs2;
84
+ mov.b32 %f4, %r15;
85
+ cvt.f32.bf16 %r16, %rs3;
86
+ mov.b32 %f5, %r16;
87
+ cvt.f32.bf16 %r17, %rs4;
88
+ mov.b32 %f6, %r17;
89
+ .loc 1 32 30
90
+ add.s64 %rd3, %rd8, %rd12;
91
+ .loc 1 32 46
92
+ mov.u32 %r18, 0x0;
93
+ mov.u32 %r19, 0x0;
94
+ @%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
95
+ @!%p1 mov.u32 %r18, %r6;
96
+ @!%p1 mov.u32 %r19, %r6;
97
+ cvt.u16.u32 %rs5, %r18;
98
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
99
+ cvt.u16.u32 %rs7, %r19;
100
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
101
+ .loc 1 32 67
102
+ cvt.f32.bf16 %r22, %rs5;
103
+ mov.b32 %f7, %r22;
104
+ cvt.f32.bf16 %r23, %rs6;
105
+ mov.b32 %f8, %r23;
106
+ cvt.f32.bf16 %r24, %rs7;
107
+ mov.b32 %f9, %r24;
108
+ cvt.f32.bf16 %r25, %rs8;
109
+ mov.b32 %f10, %r25;
110
+ .loc 1 33 31
111
+ mul.wide.u32 %rd13, %r59, 4;
112
+ add.s64 %rd4, %rd9, %rd13;
113
+ .loc 1 33 36
114
+ mov.u32 %r26, 0x0;
115
+ mov.u32 %r27, 0x0;
116
+ mov.u32 %r28, 0x0;
117
+ mov.u32 %r29, 0x0;
118
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ];
119
+ @!%p1 mov.u32 %r26, %r6;
120
+ @!%p1 mov.u32 %r27, %r6;
121
+ @!%p1 mov.u32 %r28, %r6;
122
+ @!%p1 mov.u32 %r29, %r6;
123
+ .loc 1 35 18
124
+ add.f32 %f11, %f5, %f1;
125
+ add.f32 %f12, %f6, %f2;
126
+ .loc 1 30 46
127
+ mov.b32 %f13, %r3;
128
+ mov.b32 %f14, %r2;
129
+ .loc 1 35 18
130
+ add.f32 %f15, %f3, %f14;
131
+ add.f32 %f16, %f4, %f13;
132
+ .loc 1 37 18
133
+ add.f32 %f17, %f16, %f8;
134
+ add.f32 %f18, %f15, %f7;
135
+ add.f32 %f19, %f11, %f9;
136
+ add.f32 %f20, %f12, %f10;
137
+ $L__tmp1:
138
+ .loc 2 233 15
139
+ add.f32 %f21, %f18, %f17;
140
+ add.f32 %f22, %f21, %f19;
141
+ add.f32 %f23, %f22, %f20;
142
+ $L__tmp2:
143
+ .loc 2 243 36
144
+ mov.b32 %r62, %f23;
145
+ shfl.sync.bfly.b32 %r63, %r62, 16, 31, -1;
146
+ mov.b32 %f24, %r63;
147
+ $L__tmp3:
148
+ .loc 2 233 15
149
+ add.f32 %f25, %f23, %f24;
150
+ $L__tmp4:
151
+ .loc 2 243 36
152
+ mov.b32 %r64, %f25;
153
+ shfl.sync.bfly.b32 %r65, %r64, 8, 31, -1;
154
+ mov.b32 %f26, %r65;
155
+ $L__tmp5:
156
+ .loc 2 233 15
157
+ add.f32 %f27, %f25, %f26;
158
+ $L__tmp6:
159
+ .loc 2 243 36
160
+ mov.b32 %r66, %f27;
161
+ shfl.sync.bfly.b32 %r67, %r66, 4, 31, -1;
162
+ mov.b32 %f28, %r67;
163
+ $L__tmp7:
164
+ .loc 2 233 15
165
+ add.f32 %f29, %f27, %f28;
166
+ $L__tmp8:
167
+ .loc 2 243 36
168
+ mov.b32 %r68, %f29;
169
+ shfl.sync.bfly.b32 %r69, %r68, 2, 31, -1;
170
+ mov.b32 %f30, %r69;
171
+ $L__tmp9:
172
+ .loc 2 233 15
173
+ add.f32 %f31, %f29, %f30;
174
+ $L__tmp10:
175
+ .loc 2 243 36
176
+ mov.b32 %r70, %f31;
177
+ shfl.sync.bfly.b32 %r71, %r70, 1, 31, -1;
178
+ mov.b32 %f32, %r71;
179
+ $L__tmp11:
180
+ .loc 2 233 15
181
+ add.f32 %f33, %f31, %f32;
182
+ $L__tmp12:
183
+ .loc 2 243 36
184
+ setp.eq.s32 %p17, %r57, 0;
185
+ shr.u32 %r72, %r56, 3;
186
+ and.b32 %r73, %r72, 4;
187
+ mov.u32 %r74, global_smem;
188
+ add.s32 %r34, %r74, %r73;
189
+ mov.b32 %r35, %f33;
190
+ @%p17 st.shared.b32 [ %r34 + 0 ], %r35;
191
+ bar.sync 0;
192
+ setp.lt.s32 %p18, %r56, 2;
193
+ add.s32 %r37, %r74, %r58;
194
+ @%p18 ld.shared.b32 %r36, [ %r37 + 0 ];
195
+ mov.b32 %f34, %r36;
196
+ shfl.sync.bfly.b32 %r75, %r36, 1, 31, -1;
197
+ mov.b32 %f35, %r75;
198
+ $L__tmp13:
199
+ .loc 2 233 15
200
+ add.f32 %f36, %f34, %f35;
201
+ $L__tmp14:
202
+ .loc 2 243 36
203
+ and.b32 %r76, %r56, 1;
204
+ setp.eq.b32 %p24, %r76, 1;
205
+ not.pred %p25, %p24;
206
+ and.pred %p19, %p18, %p25;
207
+ mov.b32 %r39, %f36;
208
+ @%p19 st.shared.b32 [ %r37 + 0 ], %r39;
209
+ bar.sync 0;
210
+ ld.shared.f32 %f37, [global_smem];
211
+ $L__tmp15:
212
+ .loc 3 8 15
213
+ add.f32 %f38, %f37, 0f00000000;
214
+ $L__tmp16:
215
+ .loc 1 45 20
216
+ mov.b32 %r41, %f38;
217
+ mov.b32 %r42, 1132462080;
218
+ div.full.f32 %r40, %r41, %r42;
219
+ mov.b32 %f39, %r40;
220
+ .loc 1 46 19
221
+ sub.f32 %f40, %f18, %f39;
222
+ sub.f32 %f41, %f17, %f39;
223
+ sub.f32 %f42, %f19, %f39;
224
+ sub.f32 %f43, %f20, %f39;
225
+ .loc 1 47 20
226
+ mul.f32 %f44, %f41, %f41;
227
+ $L__tmp17:
228
+ .loc 2 243 36
229
+ bar.sync 0;
230
+ $L__tmp18:
231
+ .loc 2 233 15
232
+ fma.rn.f32 %f45, %f40, %f40, %f44;
233
+ fma.rn.f32 %f46, %f42, %f42, %f45;
234
+ fma.rn.f32 %f47, %f43, %f43, %f46;
235
+ $L__tmp19:
236
+ .loc 2 243 36
237
+ mov.b32 %r77, %f47;
238
+ shfl.sync.bfly.b32 %r78, %r77, 16, 31, -1;
239
+ mov.b32 %f48, %r78;
240
+ $L__tmp20:
241
+ .loc 2 233 15
242
+ add.f32 %f49, %f47, %f48;
243
+ $L__tmp21:
244
+ .loc 2 243 36
245
+ mov.b32 %r79, %f49;
246
+ shfl.sync.bfly.b32 %r80, %r79, 8, 31, -1;
247
+ mov.b32 %f50, %r80;
248
+ $L__tmp22:
249
+ .loc 2 233 15
250
+ add.f32 %f51, %f49, %f50;
251
+ $L__tmp23:
252
+ .loc 2 243 36
253
+ mov.b32 %r81, %f51;
254
+ shfl.sync.bfly.b32 %r82, %r81, 4, 31, -1;
255
+ mov.b32 %f52, %r82;
256
+ $L__tmp24:
257
+ .loc 2 233 15
258
+ add.f32 %f53, %f51, %f52;
259
+ $L__tmp25:
260
+ .loc 2 243 36
261
+ mov.b32 %r83, %f53;
262
+ shfl.sync.bfly.b32 %r84, %r83, 2, 31, -1;
263
+ mov.b32 %f54, %r84;
264
+ $L__tmp26:
265
+ .loc 2 233 15
266
+ add.f32 %f55, %f53, %f54;
267
+ $L__tmp27:
268
+ .loc 2 243 36
269
+ mov.b32 %r85, %f55;
270
+ shfl.sync.bfly.b32 %r86, %r85, 1, 31, -1;
271
+ mov.b32 %f56, %r86;
272
+ $L__tmp28:
273
+ .loc 2 233 15
274
+ add.f32 %f57, %f55, %f56;
275
+ $L__tmp29:
276
+ .loc 2 243 36
277
+ mov.b32 %r44, %f57;
278
+ @%p17 st.shared.b32 [ %r34 + 0 ], %r44;
279
+ bar.sync 0;
280
+ @%p18 ld.shared.b32 %r45, [ %r37 + 0 ];
281
+ mov.b32 %f58, %r45;
282
+ shfl.sync.bfly.b32 %r87, %r45, 1, 31, -1;
283
+ mov.b32 %f59, %r87;
284
+ $L__tmp30:
285
+ .loc 2 233 15
286
+ add.f32 %f60, %f58, %f59;
287
+ $L__tmp31:
288
+ .loc 2 243 36
289
+ mov.b32 %r48, %f60;
290
+ @%p19 st.shared.b32 [ %r37 + 0 ], %r48;
291
+ bar.sync 0;
292
+ ld.shared.f32 %f61, [global_smem];
293
+ $L__tmp32:
294
+ .loc 3 8 15
295
+ add.f32 %f62, %f61, 0f00000000;
296
+ $L__tmp33:
297
+ .loc 1 53 20
298
+ mov.b32 %r50, %f62;
299
+ div.full.f32 %r49, %r50, %r42;
300
+ mov.b32 %f63, %r49;
301
+ .loc 1 55 20
302
+ add.f32 %f64, %f63, 0f3727C5AC;
303
+ .loc 1 56 26
304
+ rsqrt.approx.ftz.f32 %f65, %f64;
305
+ .loc 1 33 36
306
+ mov.b32 %f66, %r29;
307
+ mov.b32 %f67, %r28;
308
+ mov.b32 %f68, %r27;
309
+ mov.b32 %f69, %r26;
310
+ .loc 1 57 20
311
+ mul.f32 %f70, %f40, %f65;
312
+ mul.f32 %f71, %f41, %f65;
313
+ mul.f32 %f72, %f42, %f65;
314
+ mul.f32 %f73, %f43, %f65;
315
+ .loc 1 58 20
316
+ mul.f32 %f74, %f70, %f69;
317
+ mul.f32 %f75, %f71, %f68;
318
+ mul.f32 %f76, %f72, %f67;
319
+ mul.f32 %f77, %f73, %f66;
320
+ .loc 1 59 25
321
+ add.s64 %rd5, %rd10, %rd11;
322
+ .loc 1 59 48
323
+ mov.b32 %r52, %f74;
324
+ mov.b32 %r53, %f75;
325
+ mov.b32 %r54, %f76;
326
+ mov.b32 %r55, %f77;
327
+ @%p1 st.global.v4.b32 [ %rd5 + 0 ], { %r52, %r53, %r54, %r55 };
328
+ .loc 1 59 4
329
+ ret;
330
+ $L__tmp34:
331
+ $L__func_end0:
332
+
333
+ }
334
+ // .globl __nv_rsqrtf
335
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
336
+ .param .b32 __nv_rsqrtf_param_0
337
+ )
338
+ {
339
+ .reg .f32 %f<3>;
340
+ $L__func_begin1:
341
+
342
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
343
+ rsqrt.approx.ftz.f32 %f2, %f1;
344
+ st.param.f32 [func_retval0+0], %f2;
345
+ ret;
346
+ $L__func_end1:
347
+
348
+ }
349
+ .file 1 "/tmp/torchinductor_root/tv/ctvr3xs46luhhbr7xomihgyropjaatss7yata4igaw6kvgwas7g2.py"
350
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
351
+ .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
352
+ .section .debug_abbrev
353
+ {
354
+ .b8 1
355
+ .b8 17
356
+ .b8 1
357
+ .b8 37
358
+ .b8 8
359
+ .b8 19
360
+ .b8 5
361
+ .b8 3
362
+ .b8 8
363
+ .b8 16
364
+ .b8 6
365
+ .b8 27
366
+ .b8 8
367
+ .b8 180
368
+ .b8 66
369
+ .b8 12
370
+ .b8 17
371
+ .b8 1
372
+ .b8 18
373
+ .b8 1
374
+ .b8 0
375
+ .b8 0
376
+ .b8 2
377
+ .b8 46
378
+ .b8 0
379
+ .b8 135
380
+ .b8 64
381
+ .b8 8
382
+ .b8 3
383
+ .b8 8
384
+ .b8 58
385
+ .b8 11
386
+ .b8 59
387
+ .b8 11
388
+ .b8 63
389
+ .b8 12
390
+ .b8 32
391
+ .b8 11
392
+ .b8 0
393
+ .b8 0
394
+ .b8 3
395
+ .b8 46
396
+ .b8 1
397
+ .b8 17
398
+ .b8 1
399
+ .b8 18
400
+ .b8 1
401
+ .b8 64
402
+ .b8 10
403
+ .b8 49
404
+ .b8 19
405
+ .b8 0
406
+ .b8 0
407
+ .b8 4
408
+ .b8 29
409
+ .b8 1
410
+ .b8 49
411
+ .b8 19
412
+ .b8 17
413
+ .b8 1
414
+ .b8 18
415
+ .b8 1
416
+ .b8 88
417
+ .b8 11
418
+ .b8 89
419
+ .b8 11
420
+ .b8 87
421
+ .b8 11
422
+ .b8 0
423
+ .b8 0
424
+ .b8 5
425
+ .b8 29
426
+ .b8 0
427
+ .b8 49
428
+ .b8 19
429
+ .b8 17
430
+ .b8 1
431
+ .b8 18
432
+ .b8 1
433
+ .b8 88
434
+ .b8 11
435
+ .b8 89
436
+ .b8 11
437
+ .b8 87
438
+ .b8 11
439
+ .b8 0
440
+ .b8 0
441
+ .b8 0
442
+ }
443
+ .section .debug_info
444
+ {
445
+ .b32 395
446
+ .b8 2
447
+ .b8 0
448
+ .b32 .debug_abbrev
449
+ .b8 8
450
+ .b8 1
451
+ .b8 116
452
+ .b8 114
453
+ .b8 105
454
+ .b8 116
455
+ .b8 111
456
+ .b8 110
457
+ .b8 0
458
+ .b8 2
459
+ .b8 0
460
+ .b8 99
461
+ .b8 116
462
+ .b8 118
463
+ .b8 114
464
+ .b8 51
465
+ .b8 120
466
+ .b8 115
467
+ .b8 52
468
+ .b8 54
469
+ .b8 108
470
+ .b8 117
471
+ .b8 104
472
+ .b8 104
473
+ .b8 98
474
+ .b8 114
475
+ .b8 55
476
+ .b8 120
477
+ .b8 111
478
+ .b8 109
479
+ .b8 105
480
+ .b8 104
481
+ .b8 103
482
+ .b8 121
483
+ .b8 114
484
+ .b8 111
485
+ .b8 112
486
+ .b8 106
487
+ .b8 97
488
+ .b8 97
489
+ .b8 116
490
+ .b8 115
491
+ .b8 115
492
+ .b8 55
493
+ .b8 121
494
+ .b8 97
495
+ .b8 116
496
+ .b8 97
497
+ .b8 52
498
+ .b8 105
499
+ .b8 103
500
+ .b8 97
501
+ .b8 119
502
+ .b8 54
503
+ .b8 107
504
+ .b8 118
505
+ .b8 103
506
+ .b8 119
507
+ .b8 97
508
+ .b8 115
509
+ .b8 55
510
+ .b8 103
511
+ .b8 50
512
+ .b8 46
513
+ .b8 112
514
+ .b8 121
515
+ .b8 0
516
+ .b32 .debug_line
517
+ .b8 47
518
+ .b8 116
519
+ .b8 109
520
+ .b8 112
521
+ .b8 47
522
+ .b8 116
523
+ .b8 111
524
+ .b8 114
525
+ .b8 99
526
+ .b8 104
527
+ .b8 105
528
+ .b8 110
529
+ .b8 100
530
+ .b8 117
531
+ .b8 99
532
+ .b8 116
533
+ .b8 111
534
+ .b8 114
535
+ .b8 95
536
+ .b8 114
537
+ .b8 111
538
+ .b8 111
539
+ .b8 116
540
+ .b8 47
541
+ .b8 116
542
+ .b8 118
543
+ .b8 0
544
+ .b8 1
545
+ .b64 $L__func_begin0
546
+ .b64 $L__func_end0
547
+ .b8 2
548
+ .b8 116
549
+ .b8 114
550
+ .b8 105
551
+ .b8 116
552
+ .b8 111
553
+ .b8 110
554
+ .b8 95
555
+ .b8 95
556
+ .b8 48
557
+ .b8 100
558
+ .b8 49
559
+ .b8 100
560
+ .b8 50
561
+ .b8 100
562
+ .b8 51
563
+ .b8 100
564
+ .b8 52
565
+ .b8 100
566
+ .b8 53
567
+ .b8 100
568
+ .b8 101
569
+ .b8 54
570
+ .b8 100
571
+ .b8 101
572
+ .b8 0
573
+ .b8 116
574
+ .b8 114
575
+ .b8 105
576
+ .b8 116
577
+ .b8 111
578
+ .b8 110
579
+ .b8 95
580
+ .b8 95
581
+ .b8 48
582
+ .b8 100
583
+ .b8 49
584
+ .b8 100
585
+ .b8 50
586
+ .b8 100
587
+ .b8 51
588
+ .b8 100
589
+ .b8 52
590
+ .b8 100
591
+ .b8 53
592
+ .b8 100
593
+ .b8 101
594
+ .b8 54
595
+ .b8 100
596
+ .b8 101
597
+ .b8 0
598
+ .b8 1
599
+ .b8 18
600
+ .b8 1
601
+ .b8 1
602
+ .b8 3
603
+ .b64 $L__func_begin0
604
+ .b64 $L__func_end0
605
+ .b8 1
606
+ .b8 156
607
+ .b32 125
608
+ .b8 4
609
+ .b32 125
610
+ .b64 $L__tmp1
611
+ .b64 $L__tmp14
612
+ .b8 2
613
+ .b8 42
614
+ .b8 59
615
+ .b8 5
616
+ .b32 125
617
+ .b64 $L__tmp1
618
+ .b64 $L__tmp14
619
+ .b8 2
620
+ .b8 243
621
+ .b8 36
622
+ .b8 0
623
+ .b8 5
624
+ .b32 125
625
+ .b64 $L__tmp2
626
+ .b64 $L__tmp15
627
+ .b8 2
628
+ .b8 42
629
+ .b8 59
630
+ .b8 5
631
+ .b32 125
632
+ .b64 $L__tmp15
633
+ .b64 $L__tmp16
634
+ .b8 3
635
+ .b8 42
636
+ .b8 45
637
+ .b8 5
638
+ .b32 125
639
+ .b64 $L__tmp17
640
+ .b64 $L__tmp32
641
+ .b8 2
642
+ .b8 50
643
+ .b8 59
644
+ .b8 4
645
+ .b32 125
646
+ .b64 $L__tmp18
647
+ .b64 $L__tmp31
648
+ .b8 2
649
+ .b8 50
650
+ .b8 59
651
+ .b8 5
652
+ .b32 125
653
+ .b64 $L__tmp18
654
+ .b64 $L__tmp31
655
+ .b8 2
656
+ .b8 243
657
+ .b8 36
658
+ .b8 0
659
+ .b8 5
660
+ .b32 125
661
+ .b64 $L__tmp32
662
+ .b64 $L__tmp33
663
+ .b8 3
664
+ .b8 50
665
+ .b8 45
666
+ .b8 0
667
+ .b8 0
668
+ }
669
+ .section .debug_pubnames
670
+ {
671
+ .b32 $L__pubNames_end0-$L__pubNames_start0
672
+ $L__pubNames_start0:
673
+ .b8 2
674
+ .b8 0
675
+ .b32 .debug_info
676
+ .b32 399
677
+ .b32 125
678
+ .b8 116
679
+ .b8 114
680
+ .b8 105
681
+ .b8 116
682
+ .b8 111
683
+ .b8 110
684
+ .b8 95
685
+ .b8 95
686
+ .b8 48
687
+ .b8 100
688
+ .b8 49
689
+ .b8 100
690
+ .b8 50
691
+ .b8 100
692
+ .b8 51
693
+ .b8 100
694
+ .b8 52
695
+ .b8 100
696
+ .b8 53
697
+ .b8 100
698
+ .b8 101
699
+ .b8 54
700
+ .b8 100
701
+ .b8 101
702
+ .b8 0
703
+ .b32 0
704
+ $L__pubNames_end0:
705
+ }
706
+ .section .debug_pubtypes
707
+ {
708
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
709
+ $L__pubTypes_start0:
710
+ .b8 2
711
+ .b8 0
712
+ .b32 .debug_info
713
+ .b32 399
714
+ .b32 0
715
+ $L__pubTypes_end0:
716
+ }
717
+ .section .debug_loc { }
.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ttir ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d34e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg3: i32, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
3
+ %c0_i32 = arith.constant 0 : i32
4
+ %cst = arith.constant dense<0> : tensor<1x8xi64>
5
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x8xf32>
6
+ %cst_1 = arith.constant dense<8> : tensor<1x8xi32>
7
+ %0 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
8
+ %1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
9
+ %2 = arith.cmpi slt, %1, %cst_1 : tensor<1x8xi32>
10
+ %3 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>>
11
+ %4 = tt.addptr %3, %1 : tensor<1x8x!tt.ptr<f32, 1>>, tensor<1x8xi32>
12
+ %5 = tt.load %4, %2, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x8xf32>
13
+ %6 = tt.splat %arg2 : (!tt.ptr<i64, 1>) -> tensor<1x8x!tt.ptr<i64, 1>>
14
+ %7 = tt.addptr %6, %1 : tensor<1x8x!tt.ptr<i64, 1>>, tensor<1x8xi32>
15
+ %8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x8xi64>
16
+ %9 = arith.select %2, %5, %cst_0 : tensor<1x8xi1>, tensor<1x8xf32>
17
+ %10 = "tt.reduce"(%9) <{axis = 1 : i32}> ({
18
+ ^bb0(%arg5: f32, %arg6: f32):
19
+ %19 = arith.addf %arg5, %arg6 : f32
20
+ tt.reduce.return %19 : f32
21
+ }) : (tensor<1x8xf32>) -> tensor<1xf32>
22
+ %11 = tt.expand_dims %10 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32>
23
+ %12 = arith.select %2, %8, %cst : tensor<1x8xi1>, tensor<1x8xi64>
24
+ %13 = "tt.reduce"(%12) <{axis = 1 : i32}> ({
25
+ ^bb0(%arg5: i64, %arg6: i64):
26
+ %19 = arith.addi %arg5, %arg6 : i64
27
+ tt.reduce.return %19 : i64
28
+ }) : (tensor<1x8xi64>) -> tensor<1xi64>
29
+ %14 = tt.expand_dims %13 {axis = 1 : i32} : (tensor<1xi64>) -> tensor<1x1xi64>
30
+ %15 = arith.sitofp %14 : tensor<1x1xi64> to tensor<1x1xf32>
31
+ %16 = arith.divf %11, %15 : tensor<1x1xf32>
32
+ gpu.barrier
33
+ %17 = tt.addptr %arg0, %c0_i32 : !tt.ptr<f32, 1>, i32
34
+ %18 = tt.splat %17 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>>
35
+ tt.store %18, %16 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32>
36
+ tt.return
37
+ }
38
+ }
.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ptx ADDED
@@ -0,0 +1,495 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4de
10
+
11
+ .visible .entry triton__0d1d2d3d4de(
12
+ .param .u64 triton__0d1d2d3d4de_param_0,
13
+ .param .u64 triton__0d1d2d3d4de_param_1,
14
+ .param .u64 triton__0d1d2d3d4de_param_2,
15
+ .param .u64 triton__0d1d2d3d4de_param_3,
16
+ .param .u32 triton__0d1d2d3d4de_param_4
17
+ )
18
+ .maxntid 128, 1, 1
19
+ {
20
+ .reg .pred %p<8>;
21
+ .reg .b16 %rs<33>;
22
+ .reg .b32 %r<77>;
23
+ .reg .f32 %f<65>;
24
+ .reg .b64 %rd<11>;
25
+ .loc 1 18 0
26
+ $L__func_begin0:
27
+ .loc 1 18 0
28
+
29
+ ld.param.u64 %rd5, [triton__0d1d2d3d4de_param_0];
30
+ ld.param.u64 %rd6, [triton__0d1d2d3d4de_param_1];
31
+ $L__tmp0:
32
+ .loc 1 21 36
33
+ mov.u32 %r50, %tid.x;
34
+ shl.b32 %r51, %r50, 3;
35
+ ld.param.u64 %rd7, [triton__0d1d2d3d4de_param_2];
36
+ and.b32 %r52, %r51, 1016;
37
+ ld.param.u64 %rd8, [triton__0d1d2d3d4de_param_3];
38
+ .loc 1 20 28
39
+ mov.u32 %r1, %ctaid.x;
40
+ .loc 1 20 33
41
+ shl.b32 %r53, %r1, 10;
42
+ .loc 1 21 23
43
+ or.b32 %r54, %r53, %r52;
44
+ .loc 1 23 20
45
+ shr.s32 %r56, %r54, 31;
46
+ shr.u32 %r57, %r56, 24;
47
+ add.s32 %r58, %r54, %r57;
48
+ shr.s32 %r59, %r58, 8;
49
+ .loc 1 23 27
50
+ mul.hi.s32 %r60, %r59, 1431655766;
51
+ shr.u32 %r61, %r60, 31;
52
+ add.s32 %r62, %r60, %r61;
53
+ mul.lo.s32 %r63, %r62, 3;
54
+ sub.s32 %r64, %r59, %r63;
55
+ and.b32 %r65, %r58, -256;
56
+ sub.s32 %r66, %r54, %r65;
57
+ .loc 1 25 20
58
+ mul.hi.s32 %r67, %r54, 715827883;
59
+ shr.u32 %r68, %r67, 31;
60
+ shr.u32 %r69, %r67, 7;
61
+ add.s32 %r70, %r69, %r68;
62
+ .loc 1 27 40
63
+ shl.b32 %r71, %r70, 8;
64
+ .loc 1 27 36
65
+ add.s32 %r72, %r71, %r66;
66
+ .loc 1 27 30
67
+ mul.wide.s32 %rd9, %r72, 2;
68
+ add.s64 %rd1, %rd5, %rd9;
69
+ mov.pred %p1, -1;
70
+ .loc 1 27 46
71
+ mov.u32 %r2, 0x0;
72
+ mov.u32 %r3, 0x0;
73
+ mov.u32 %r4, 0x0;
74
+ mov.u32 %r5, 0x0;
75
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
76
+ cvt.u16.u32 %rs1, %r2;
77
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
78
+ cvt.u16.u32 %rs3, %r3;
79
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
80
+ cvt.u16.u32 %rs5, %r4;
81
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
82
+ cvt.u16.u32 %rs7, %r5;
83
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
84
+ .loc 1 27 85
85
+ cvt.f32.bf16 %r6, %rs1;
86
+ mov.b32 %f1, %r6;
87
+ cvt.f32.bf16 %r7, %rs2;
88
+ mov.b32 %f2, %r7;
89
+ cvt.f32.bf16 %r8, %rs3;
90
+ mov.b32 %f3, %r8;
91
+ cvt.f32.bf16 %r9, %rs4;
92
+ mov.b32 %f4, %r9;
93
+ cvt.f32.bf16 %r10, %rs5;
94
+ mov.b32 %f5, %r10;
95
+ cvt.f32.bf16 %r11, %rs6;
96
+ mov.b32 %f6, %r11;
97
+ cvt.f32.bf16 %r12, %rs7;
98
+ mov.b32 %f7, %r12;
99
+ cvt.f32.bf16 %r13, %rs8;
100
+ mov.b32 %f8, %r13;
101
+ .loc 1 28 30
102
+ add.s64 %rd2, %rd6, %rd9;
103
+ .loc 1 28 46
104
+ mov.u32 %r14, 0x0;
105
+ mov.u32 %r15, 0x0;
106
+ mov.u32 %r16, 0x0;
107
+ mov.u32 %r17, 0x0;
108
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd2 + 0 ];
109
+ cvt.u16.u32 %rs9, %r14;
110
+ { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r14; }
111
+ cvt.u16.u32 %rs11, %r15;
112
+ { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r15; }
113
+ cvt.u16.u32 %rs13, %r16;
114
+ { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r16; }
115
+ cvt.u16.u32 %rs15, %r17;
116
+ { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r17; }
117
+ .loc 1 28 85
118
+ cvt.f32.bf16 %r18, %rs9;
119
+ mov.b32 %f9, %r18;
120
+ cvt.f32.bf16 %r19, %rs10;
121
+ mov.b32 %f10, %r19;
122
+ cvt.f32.bf16 %r20, %rs11;
123
+ mov.b32 %f11, %r20;
124
+ cvt.f32.bf16 %r21, %rs12;
125
+ mov.b32 %f12, %r21;
126
+ cvt.f32.bf16 %r22, %rs13;
127
+ mov.b32 %f13, %r22;
128
+ cvt.f32.bf16 %r23, %rs14;
129
+ mov.b32 %f14, %r23;
130
+ cvt.f32.bf16 %r24, %rs15;
131
+ mov.b32 %f15, %r24;
132
+ cvt.f32.bf16 %r25, %rs16;
133
+ mov.b32 %f16, %r25;
134
+ .loc 1 29 31
135
+ add.s64 %rd3, %rd7, %rd9;
136
+ .loc 1 29 47
137
+ mov.u32 %r26, 0x0;
138
+ mov.u32 %r27, 0x0;
139
+ mov.u32 %r28, 0x0;
140
+ mov.u32 %r29, 0x0;
141
+ @%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd3 + 0 ];
142
+ cvt.u16.u32 %rs17, %r26;
143
+ { .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r26; }
144
+ cvt.u16.u32 %rs19, %r27;
145
+ { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r27; }
146
+ cvt.u16.u32 %rs21, %r28;
147
+ { .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r28; }
148
+ cvt.u16.u32 %rs23, %r29;
149
+ { .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r29; }
150
+ .loc 1 29 86
151
+ cvt.f32.bf16 %r30, %rs17;
152
+ mov.b32 %f17, %r30;
153
+ cvt.f32.bf16 %r31, %rs18;
154
+ mov.b32 %f18, %r31;
155
+ cvt.f32.bf16 %r32, %rs19;
156
+ mov.b32 %f19, %r32;
157
+ cvt.f32.bf16 %r33, %rs20;
158
+ mov.b32 %f20, %r33;
159
+ cvt.f32.bf16 %r34, %rs21;
160
+ mov.b32 %f21, %r34;
161
+ cvt.f32.bf16 %r35, %rs22;
162
+ mov.b32 %f22, %r35;
163
+ cvt.f32.bf16 %r36, %rs23;
164
+ mov.b32 %f23, %r36;
165
+ cvt.f32.bf16 %r37, %rs24;
166
+ mov.b32 %f24, %r37;
167
+ .loc 1 32 19
168
+ setp.eq.s32 %p5, %r64, 2;
169
+ .loc 1 34 32
170
+ selp.f32 %f25, %f1, 0f00000000, %p5;
171
+ selp.f32 %f26, %f2, 0f00000000, %p5;
172
+ selp.f32 %f27, %f3, 0f00000000, %p5;
173
+ selp.f32 %f28, %f4, 0f00000000, %p5;
174
+ selp.f32 %f29, %f5, 0f00000000, %p5;
175
+ selp.f32 %f30, %f6, 0f00000000, %p5;
176
+ selp.f32 %f31, %f7, 0f00000000, %p5;
177
+ selp.f32 %f32, %f8, 0f00000000, %p5;
178
+ .loc 1 36 19
179
+ setp.eq.s32 %p6, %r64, 1;
180
+ .loc 1 37 32
181
+ selp.f32 %f33, %f9, 0f00000000, %p6;
182
+ selp.f32 %f34, %f10, 0f00000000, %p6;
183
+ selp.f32 %f35, %f11, 0f00000000, %p6;
184
+ selp.f32 %f36, %f12, 0f00000000, %p6;
185
+ selp.f32 %f37, %f13, 0f00000000, %p6;
186
+ selp.f32 %f38, %f14, 0f00000000, %p6;
187
+ selp.f32 %f39, %f15, 0f00000000, %p6;
188
+ selp.f32 %f40, %f16, 0f00000000, %p6;
189
+ .loc 1 38 19
190
+ add.f32 %f41, %f25, %f33;
191
+ add.f32 %f42, %f26, %f34;
192
+ add.f32 %f43, %f27, %f35;
193
+ add.f32 %f44, %f28, %f36;
194
+ add.f32 %f45, %f29, %f37;
195
+ add.f32 %f46, %f30, %f38;
196
+ add.f32 %f47, %f31, %f39;
197
+ add.f32 %f48, %f32, %f40;
198
+ .loc 1 40 20
199
+ setp.eq.s32 %p7, %r64, 0;
200
+ .loc 1 41 35
201
+ selp.f32 %f49, %f17, 0f00000000, %p7;
202
+ selp.f32 %f50, %f18, 0f00000000, %p7;
203
+ selp.f32 %f51, %f19, 0f00000000, %p7;
204
+ selp.f32 %f52, %f20, 0f00000000, %p7;
205
+ selp.f32 %f53, %f21, 0f00000000, %p7;
206
+ selp.f32 %f54, %f22, 0f00000000, %p7;
207
+ selp.f32 %f55, %f23, 0f00000000, %p7;
208
+ selp.f32 %f56, %f24, 0f00000000, %p7;
209
+ .loc 1 42 20
210
+ add.f32 %f57, %f41, %f49;
211
+ add.f32 %f58, %f42, %f50;
212
+ add.f32 %f59, %f43, %f51;
213
+ add.f32 %f60, %f44, %f52;
214
+ add.f32 %f61, %f45, %f53;
215
+ add.f32 %f62, %f46, %f54;
216
+ add.f32 %f63, %f47, %f55;
217
+ add.f32 %f64, %f48, %f56;
218
+ .loc 1 43 25
219
+ mul.wide.s32 %rd10, %r54, 2;
220
+ add.s64 %rd4, %rd8, %rd10;
221
+ .loc 1 43 37
222
+ mov.b32 %r38, %f57;
223
+ cvt.rn.bf16.f32 %rs25, %r38;
224
+ mov.b32 %r39, %f58;
225
+ cvt.rn.bf16.f32 %rs26, %r39;
226
+ mov.b32 %r40, %f59;
227
+ cvt.rn.bf16.f32 %rs27, %r40;
228
+ mov.b32 %r41, %f60;
229
+ cvt.rn.bf16.f32 %rs28, %r41;
230
+ mov.b32 %r42, %f61;
231
+ cvt.rn.bf16.f32 %rs29, %r42;
232
+ mov.b32 %r43, %f62;
233
+ cvt.rn.bf16.f32 %rs30, %r43;
234
+ mov.b32 %r44, %f63;
235
+ cvt.rn.bf16.f32 %rs31, %r44;
236
+ mov.b32 %r45, %f64;
237
+ cvt.rn.bf16.f32 %rs32, %r45;
238
+ mov.b32 %r73, {%rs25, %rs26};
239
+ mov.b32 %r74, {%rs27, %rs28};
240
+ mov.b32 %r75, {%rs29, %rs30};
241
+ mov.b32 %r76, {%rs31, %rs32};
242
+ @%p1 st.global.v4.b32 [ %rd4 + 0 ], { %r73, %r74, %r75, %r76 };
243
+ .loc 1 43 4
244
+ ret;
245
+ $L__tmp1:
246
+ $L__func_end0:
247
+
248
+ }
249
+ .file 1 "/tmp/torchinductor_root/63/c63r7iurwk5ydlswh7rvhcmlx2cfretlrewgw6tljlursshgtfpp.py"
250
+ .section .debug_abbrev
251
+ {
252
+ .b8 1
253
+ .b8 17
254
+ .b8 1
255
+ .b8 37
256
+ .b8 8
257
+ .b8 19
258
+ .b8 5
259
+ .b8 3
260
+ .b8 8
261
+ .b8 16
262
+ .b8 6
263
+ .b8 27
264
+ .b8 8
265
+ .b8 180
266
+ .b8 66
267
+ .b8 12
268
+ .b8 17
269
+ .b8 1
270
+ .b8 18
271
+ .b8 1
272
+ .b8 0
273
+ .b8 0
274
+ .b8 2
275
+ .b8 46
276
+ .b8 0
277
+ .b8 17
278
+ .b8 1
279
+ .b8 18
280
+ .b8 1
281
+ .b8 64
282
+ .b8 10
283
+ .b8 135
284
+ .b8 64
285
+ .b8 8
286
+ .b8 3
287
+ .b8 8
288
+ .b8 58
289
+ .b8 11
290
+ .b8 59
291
+ .b8 11
292
+ .b8 63
293
+ .b8 12
294
+ .b8 0
295
+ .b8 0
296
+ .b8 0
297
+ }
298
+ .section .debug_info
299
+ {
300
+ .b32 184
301
+ .b8 2
302
+ .b8 0
303
+ .b32 .debug_abbrev
304
+ .b8 8
305
+ .b8 1
306
+ .b8 116
307
+ .b8 114
308
+ .b8 105
309
+ .b8 116
310
+ .b8 111
311
+ .b8 110
312
+ .b8 0
313
+ .b8 2
314
+ .b8 0
315
+ .b8 99
316
+ .b8 54
317
+ .b8 51
318
+ .b8 114
319
+ .b8 55
320
+ .b8 105
321
+ .b8 117
322
+ .b8 114
323
+ .b8 119
324
+ .b8 107
325
+ .b8 53
326
+ .b8 121
327
+ .b8 100
328
+ .b8 108
329
+ .b8 115
330
+ .b8 119
331
+ .b8 104
332
+ .b8 55
333
+ .b8 114
334
+ .b8 118
335
+ .b8 104
336
+ .b8 99
337
+ .b8 109
338
+ .b8 108
339
+ .b8 120
340
+ .b8 50
341
+ .b8 99
342
+ .b8 102
343
+ .b8 114
344
+ .b8 101
345
+ .b8 116
346
+ .b8 108
347
+ .b8 114
348
+ .b8 101
349
+ .b8 119
350
+ .b8 103
351
+ .b8 119
352
+ .b8 54
353
+ .b8 116
354
+ .b8 108
355
+ .b8 106
356
+ .b8 108
357
+ .b8 117
358
+ .b8 114
359
+ .b8 115
360
+ .b8 115
361
+ .b8 104
362
+ .b8 103
363
+ .b8 116
364
+ .b8 102
365
+ .b8 112
366
+ .b8 112
367
+ .b8 46
368
+ .b8 112
369
+ .b8 121
370
+ .b8 0
371
+ .b32 .debug_line
372
+ .b8 47
373
+ .b8 116
374
+ .b8 109
375
+ .b8 112
376
+ .b8 47
377
+ .b8 116
378
+ .b8 111
379
+ .b8 114
380
+ .b8 99
381
+ .b8 104
382
+ .b8 105
383
+ .b8 110
384
+ .b8 100
385
+ .b8 117
386
+ .b8 99
387
+ .b8 116
388
+ .b8 111
389
+ .b8 114
390
+ .b8 95
391
+ .b8 114
392
+ .b8 111
393
+ .b8 111
394
+ .b8 116
395
+ .b8 47
396
+ .b8 54
397
+ .b8 51
398
+ .b8 0
399
+ .b8 1
400
+ .b64 $L__func_begin0
401
+ .b64 $L__func_end0
402
+ .b8 2
403
+ .b64 $L__func_begin0
404
+ .b64 $L__func_end0
405
+ .b8 1
406
+ .b8 156
407
+ .b8 116
408
+ .b8 114
409
+ .b8 105
410
+ .b8 116
411
+ .b8 111
412
+ .b8 110
413
+ .b8 95
414
+ .b8 95
415
+ .b8 48
416
+ .b8 100
417
+ .b8 49
418
+ .b8 100
419
+ .b8 50
420
+ .b8 100
421
+ .b8 51
422
+ .b8 100
423
+ .b8 52
424
+ .b8 100
425
+ .b8 101
426
+ .b8 0
427
+ .b8 116
428
+ .b8 114
429
+ .b8 105
430
+ .b8 116
431
+ .b8 111
432
+ .b8 110
433
+ .b8 95
434
+ .b8 95
435
+ .b8 48
436
+ .b8 100
437
+ .b8 49
438
+ .b8 100
439
+ .b8 50
440
+ .b8 100
441
+ .b8 51
442
+ .b8 100
443
+ .b8 52
444
+ .b8 100
445
+ .b8 101
446
+ .b8 0
447
+ .b8 1
448
+ .b8 18
449
+ .b8 1
450
+ .b8 0
451
+ }
452
+ .section .debug_pubnames
453
+ {
454
+ .b32 $L__pubNames_end0-$L__pubNames_start0
455
+ $L__pubNames_start0:
456
+ .b8 2
457
+ .b8 0
458
+ .b32 .debug_info
459
+ .b32 188
460
+ .b32 125
461
+ .b8 116
462
+ .b8 114
463
+ .b8 105
464
+ .b8 116
465
+ .b8 111
466
+ .b8 110
467
+ .b8 95
468
+ .b8 95
469
+ .b8 48
470
+ .b8 100
471
+ .b8 49
472
+ .b8 100
473
+ .b8 50
474
+ .b8 100
475
+ .b8 51
476
+ .b8 100
477
+ .b8 52
478
+ .b8 100
479
+ .b8 101
480
+ .b8 0
481
+ .b32 0
482
+ $L__pubNames_end0:
483
+ }
484
+ .section .debug_pubtypes
485
+ {
486
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
487
+ $L__pubTypes_start0:
488
+ .b8 2
489
+ .b8 0
490
+ .b32 .debug_info
491
+ .b32 188
492
+ .b32 0
493
+ $L__pubTypes_end0:
494
+ }
495
+ .section .debug_loc { }
.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.llir ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !5 {
7
+ %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %10 = and i32 %9, 31, !dbg !8
9
+ %11 = lshr i32 %9, 5, !dbg !8
10
+ %12 = and i32 %11, 1, !dbg !8
11
+ %urem = shl i32 %9, 2, !dbg !8
12
+ %13 = and i32 %urem, 252, !dbg !8
13
+ %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
14
+ %15 = shl i32 %14, 8, !dbg !10
15
+ %16 = or i32 %15, %13, !dbg !11
16
+ %17 = sext i32 %16 to i64, !dbg !12
17
+ %18 = getelementptr i16, ptr addrspace(1) %1, i64 %17, !dbg !12
18
+ %19 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %18, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13
19
+ %20 = extractvalue { i32, i32 } %19, 0, !dbg !13
20
+ %21 = extractvalue { i32, i32 } %19, 1, !dbg !13
21
+ %22 = trunc i32 %20 to i16, !dbg !13
22
+ %extelt.offset = lshr i32 %20, 16, !dbg !13
23
+ %23 = trunc i32 %extelt.offset to i16, !dbg !13
24
+ %24 = trunc i32 %21 to i16, !dbg !13
25
+ %extelt.offset1 = lshr i32 %21, 16, !dbg !13
26
+ %25 = trunc i32 %extelt.offset1 to i16, !dbg !13
27
+ %26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #3, !dbg !14
28
+ %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #3, !dbg !14
29
+ %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #3, !dbg !14
30
+ %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #3, !dbg !14
31
+ %30 = zext nneg i32 %13 to i64, !dbg !15
32
+ %31 = getelementptr float, ptr addrspace(1) %2, i64 %30, !dbg !15
33
+ %32 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %31, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16
34
+ %33 = extractvalue { i32, i32, i32, i32 } %32, 0, !dbg !16
35
+ %34 = extractvalue { i32, i32, i32, i32 } %32, 1, !dbg !16
36
+ %35 = extractvalue { i32, i32, i32, i32 } %32, 2, !dbg !16
37
+ %36 = extractvalue { i32, i32, i32, i32 } %32, 3, !dbg !16
38
+ %37 = bitcast i32 %33 to float, !dbg !16
39
+ %38 = bitcast i32 %34 to float, !dbg !16
40
+ %39 = bitcast i32 %35 to float, !dbg !16
41
+ %40 = bitcast i32 %36 to float, !dbg !16
42
+ %41 = getelementptr float, ptr addrspace(1) %3, i64 %17, !dbg !17
43
+ %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18
44
+ %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !18
45
+ %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !18
46
+ %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !18
47
+ %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !18
48
+ %47 = bitcast i32 %43 to float, !dbg !18
49
+ %48 = bitcast i32 %44 to float, !dbg !18
50
+ %49 = bitcast i32 %45 to float, !dbg !18
51
+ %50 = bitcast i32 %46 to float, !dbg !18
52
+ %51 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !19
53
+ %52 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %51, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20
54
+ %53 = extractvalue { i32, i32, i32, i32 } %52, 0, !dbg !20
55
+ %54 = extractvalue { i32, i32, i32, i32 } %52, 1, !dbg !20
56
+ %55 = extractvalue { i32, i32, i32, i32 } %52, 2, !dbg !20
57
+ %56 = extractvalue { i32, i32, i32, i32 } %52, 3, !dbg !20
58
+ %57 = bitcast i32 %53 to float, !dbg !20
59
+ %58 = bitcast i32 %54 to float, !dbg !20
60
+ %59 = bitcast i32 %55 to float, !dbg !20
61
+ %60 = bitcast i32 %56 to float, !dbg !20
62
+ %61 = sext i32 %14 to i64, !dbg !21
63
+ %62 = getelementptr float, ptr addrspace(1) %4, i64 %61, !dbg !21
64
+ %63 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
65
+ %64 = bitcast i32 %63 to float, !dbg !22
66
+ %65 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
67
+ %66 = bitcast i32 %65 to float, !dbg !22
68
+ %67 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
69
+ %68 = bitcast i32 %67 to float, !dbg !22
70
+ %69 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22
71
+ %70 = bitcast i32 %69 to float, !dbg !22
72
+ %71 = fmul float %26, %37, !dbg !23
73
+ %72 = fmul float %27, %38, !dbg !23
74
+ %73 = fmul float %28, %39, !dbg !23
75
+ %74 = fmul float %29, %40, !dbg !23
76
+ %75 = fadd float %71, %72, !dbg !24
77
+ %76 = fadd float %73, %75, !dbg !24
78
+ %77 = fadd float %74, %76, !dbg !24
79
+ %78 = bitcast float %77 to i32, !dbg !30
80
+ %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 16, i32 31), !dbg !30
81
+ %80 = bitcast i32 %79 to float, !dbg !30
82
+ %81 = fadd float %77, %80, !dbg !24
83
+ %82 = bitcast float %81 to i32, !dbg !30
84
+ %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 8, i32 31), !dbg !30
85
+ %84 = bitcast i32 %83 to float, !dbg !30
86
+ %85 = fadd float %81, %84, !dbg !24
87
+ %86 = bitcast float %85 to i32, !dbg !30
88
+ %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 4, i32 31), !dbg !30
89
+ %88 = bitcast i32 %87 to float, !dbg !30
90
+ %89 = fadd float %85, %88, !dbg !24
91
+ %90 = bitcast float %89 to i32, !dbg !30
92
+ %91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 2, i32 31), !dbg !30
93
+ %92 = bitcast i32 %91 to float, !dbg !30
94
+ %93 = fadd float %89, %92, !dbg !24
95
+ %94 = bitcast float %93 to i32, !dbg !30
96
+ %95 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 1, i32 31), !dbg !30
97
+ %96 = bitcast i32 %95 to float, !dbg !30
98
+ %97 = fadd float %93, %96, !dbg !24
99
+ %98 = icmp eq i32 %10, 0, !dbg !30
100
+ %99 = zext nneg i32 %12 to i64, !dbg !30
101
+ %100 = getelementptr float, ptr addrspace(3) @global_smem, i64 %99, !dbg !30
102
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %97, i1 %98) #3, !dbg !30
103
+ tail call void @llvm.nvvm.barrier0(), !dbg !30
104
+ %101 = icmp slt i32 %9, 2, !dbg !30
105
+ %102 = sext i32 %9 to i64, !dbg !30
106
+ %103 = getelementptr float, ptr addrspace(3) @global_smem, i64 %102, !dbg !30
107
+ %104 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %101) #3, !dbg !30
108
+ %105 = bitcast float %104 to i32, !dbg !30
109
+ %106 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %105, i32 1, i32 31), !dbg !30
110
+ %107 = bitcast i32 %106 to float, !dbg !30
111
+ %108 = fadd float %104, %107, !dbg !24
112
+ %109 = and i32 %9, 1, !dbg !30
113
+ %110 = icmp eq i32 %109, 0, !dbg !30
114
+ %111 = and i1 %101, %110, !dbg !30
115
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, float %108, i1 %111) #3, !dbg !30
116
+ tail call void @llvm.nvvm.barrier0(), !dbg !30
117
+ %112 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !30
118
+ %113 = fadd float %112, 0.000000e+00, !dbg !32
119
+ %114 = fmul float %71, %47, !dbg !36
120
+ %115 = fmul float %72, %48, !dbg !36
121
+ %116 = fmul float %73, %49, !dbg !36
122
+ %117 = fmul float %74, %50, !dbg !36
123
+ tail call void @llvm.nvvm.barrier0(), !dbg !37
124
+ %118 = fadd float %114, %115, !dbg !39
125
+ %119 = fadd float %116, %118, !dbg !39
126
+ %120 = fadd float %117, %119, !dbg !39
127
+ %121 = bitcast float %120 to i32, !dbg !37
128
+ %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 16, i32 31), !dbg !37
129
+ %123 = bitcast i32 %122 to float, !dbg !37
130
+ %124 = fadd float %120, %123, !dbg !39
131
+ %125 = bitcast float %124 to i32, !dbg !37
132
+ %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 8, i32 31), !dbg !37
133
+ %127 = bitcast i32 %126 to float, !dbg !37
134
+ %128 = fadd float %124, %127, !dbg !39
135
+ %129 = bitcast float %128 to i32, !dbg !37
136
+ %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 4, i32 31), !dbg !37
137
+ %131 = bitcast i32 %130 to float, !dbg !37
138
+ %132 = fadd float %128, %131, !dbg !39
139
+ %133 = bitcast float %132 to i32, !dbg !37
140
+ %134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 2, i32 31), !dbg !37
141
+ %135 = bitcast i32 %134 to float, !dbg !37
142
+ %136 = fadd float %132, %135, !dbg !39
143
+ %137 = bitcast float %136 to i32, !dbg !37
144
+ %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 1, i32 31), !dbg !37
145
+ %139 = bitcast i32 %138 to float, !dbg !37
146
+ %140 = fadd float %136, %139, !dbg !39
147
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %140, i1 %98) #3, !dbg !37
148
+ tail call void @llvm.nvvm.barrier0(), !dbg !37
149
+ %141 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %101) #3, !dbg !37
150
+ %142 = bitcast float %141 to i32, !dbg !37
151
+ %143 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 1, i32 31), !dbg !37
152
+ %144 = bitcast i32 %143 to float, !dbg !37
153
+ %145 = fadd float %141, %144, !dbg !39
154
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, float %145, i1 %111) #3, !dbg !37
155
+ tail call void @llvm.nvvm.barrier0(), !dbg !37
156
+ %146 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !37
157
+ %147 = fadd float %146, 0.000000e+00, !dbg !42
158
+ %148 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %64, float 2.560000e+02) #3, !dbg !44
159
+ %149 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %66, float 2.560000e+02) #3, !dbg !44
160
+ %150 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %68, float 2.560000e+02) #3, !dbg !44
161
+ %151 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %70, float 2.560000e+02) #3, !dbg !44
162
+ %152 = fmul float %71, 2.560000e+02, !dbg !45
163
+ %153 = fmul float %72, 2.560000e+02, !dbg !45
164
+ %154 = fmul float %73, 2.560000e+02, !dbg !45
165
+ %155 = fmul float %74, 2.560000e+02, !dbg !45
166
+ %156 = fsub float %152, %113, !dbg !46
167
+ %157 = fsub float %153, %113, !dbg !46
168
+ %158 = fsub float %154, %113, !dbg !46
169
+ %159 = fsub float %155, %113, !dbg !46
170
+ %160 = fmul float %147, %47, !dbg !47
171
+ %161 = fmul float %147, %48, !dbg !47
172
+ %162 = fmul float %147, %49, !dbg !47
173
+ %163 = fmul float %147, %50, !dbg !47
174
+ %164 = fsub float %156, %160, !dbg !48
175
+ %165 = fsub float %157, %161, !dbg !48
176
+ %166 = fsub float %158, %162, !dbg !48
177
+ %167 = fsub float %159, %163, !dbg !48
178
+ %168 = fmul float %148, %164, !dbg !49
179
+ %169 = fmul float %148, %165, !dbg !49
180
+ %170 = fmul float %148, %166, !dbg !49
181
+ %171 = fmul float %148, %167, !dbg !49
182
+ %172 = fadd float %168, %57, !dbg !50
183
+ %173 = fadd float %169, %58, !dbg !50
184
+ %174 = fadd float %170, %59, !dbg !50
185
+ %175 = fadd float %171, %60, !dbg !50
186
+ %176 = bitcast float %172 to i32, !dbg !51
187
+ %177 = bitcast float %173 to i32, !dbg !51
188
+ %178 = bitcast float %174 to i32, !dbg !51
189
+ %179 = bitcast float %175 to i32, !dbg !51
190
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %176, i32 %177, i32 %178, i32 %179, ptr addrspace(1) %51, i1 true) #3, !dbg !51
191
+ %180 = getelementptr i16, ptr addrspace(1) %5, i64 %17, !dbg !52
192
+ %181 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %172) #3, !dbg !53
193
+ %182 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %173) #3, !dbg !53
194
+ %183 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %174) #3, !dbg !53
195
+ %184 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %175) #3, !dbg !53
196
+ %185 = insertelement <2 x i16> undef, i16 %181, i64 0, !dbg !53
197
+ %186 = insertelement <2 x i16> %185, i16 %182, i64 1, !dbg !53
198
+ %187 = bitcast <2 x i16> %186 to i32, !dbg !53
199
+ %188 = insertelement <2 x i16> undef, i16 %183, i64 0, !dbg !53
200
+ %189 = insertelement <2 x i16> %188, i16 %184, i64 1, !dbg !53
201
+ %190 = bitcast <2 x i16> %189 to i32, !dbg !53
202
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %187, i32 %190, ptr addrspace(1) %180, i1 true) #3, !dbg !53
203
+ ret void, !dbg !54
204
+ }
205
+
206
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
207
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
208
+
209
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
210
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
211
+
212
+ ; Function Attrs: convergent nocallback nounwind
213
+ declare void @llvm.nvvm.barrier0() #2
214
+
215
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
216
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
217
+ attributes #2 = { convergent nocallback nounwind }
218
+ attributes #3 = { nounwind }
219
+
220
+ !llvm.module.flags = !{!0}
221
+ !llvm.dbg.cu = !{!1}
222
+ !nvvm.annotations = !{!3, !4, !4, !3}
223
+
224
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
225
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
226
+ !2 = !DIFile(filename: "crnynbmsd2yell2lpjymb46rttfaea2xjwsbxr75j54gctfgi457.py", directory: "/tmp/torchinductor_root/rn")
227
+ !3 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
228
+ !4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64}
229
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
230
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
231
+ !7 = !{}
232
+ !8 = !DILocation(line: 26, column: 26, scope: !5)
233
+ !9 = !DILocation(line: 23, column: 28, scope: !5)
234
+ !10 = !DILocation(line: 30, column: 40, scope: !5)
235
+ !11 = !DILocation(line: 30, column: 36, scope: !5)
236
+ !12 = !DILocation(line: 30, column: 30, scope: !5)
237
+ !13 = !DILocation(line: 30, column: 46, scope: !5)
238
+ !14 = !DILocation(line: 30, column: 67, scope: !5)
239
+ !15 = !DILocation(line: 31, column: 30, scope: !5)
240
+ !16 = !DILocation(line: 31, column: 35, scope: !5)
241
+ !17 = !DILocation(line: 32, column: 30, scope: !5)
242
+ !18 = !DILocation(line: 32, column: 46, scope: !5)
243
+ !19 = !DILocation(line: 33, column: 35, scope: !5)
244
+ !20 = !DILocation(line: 33, column: 51, scope: !5)
245
+ !21 = !DILocation(line: 34, column: 31, scope: !5)
246
+ !22 = !DILocation(line: 34, column: 36, scope: !5)
247
+ !23 = !DILocation(line: 36, column: 18, scope: !5)
248
+ !24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !28)
249
+ !25 = distinct !DILexicalBlockFile(scope: !27, file: !26, discriminator: 0)
250
+ !26 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
251
+ !27 = distinct !DILexicalBlockFile(scope: !5, file: !26, discriminator: 0)
252
+ !28 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !29)
253
+ !29 = !DILocation(line: 39, column: 57, scope: !25)
254
+ !30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
255
+ !31 = !DILocation(line: 39, column: 57, scope: !27)
256
+ !32 = !DILocation(line: 8, column: 15, scope: !33, inlinedAt: !35)
257
+ !33 = distinct !DILexicalBlockFile(scope: !5, file: !34, discriminator: 0)
258
+ !34 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
259
+ !35 = !DILocation(line: 39, column: 44, scope: !33)
260
+ !36 = !DILocation(line: 40, column: 18, scope: !5)
261
+ !37 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !38)
262
+ !38 = !DILocation(line: 43, column: 59, scope: !27)
263
+ !39 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !40)
264
+ !40 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !41)
265
+ !41 = !DILocation(line: 43, column: 59, scope: !25)
266
+ !42 = !DILocation(line: 8, column: 15, scope: !33, inlinedAt: !43)
267
+ !43 = !DILocation(line: 43, column: 45, scope: !33)
268
+ !44 = !DILocation(line: 45, column: 20, scope: !5)
269
+ !45 = !DILocation(line: 46, column: 19, scope: !5)
270
+ !46 = !DILocation(line: 47, column: 20, scope: !5)
271
+ !47 = !DILocation(line: 48, column: 19, scope: !5)
272
+ !48 = !DILocation(line: 49, column: 20, scope: !5)
273
+ !49 = !DILocation(line: 50, column: 20, scope: !5)
274
+ !50 = !DILocation(line: 51, column: 20, scope: !5)
275
+ !51 = !DILocation(line: 53, column: 51, scope: !5)
276
+ !52 = !DILocation(line: 54, column: 25, scope: !5)
277
+ !53 = !DILocation(line: 54, column: 48, scope: !5)
278
+ !54 = !DILocation(line: 54, column: 4, scope: !5)
.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ptx ADDED
@@ -0,0 +1,1154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6de7de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_1[8] = {60, 109, 111, 100, 117, 108, 101, 62};
20
+ .global .align 1 .b8 assertFile_1[68] = {47, 117, 115, 114, 47, 108, 111, 99, 97, 108, 47, 108, 105, 98, 47, 112, 121, 116, 104, 111, 110, 51, 46, 49, 48, 47, 100, 105, 115, 116, 45, 112, 97, 99, 107, 97, 103, 101, 115, 47, 116, 111, 114, 99, 104, 47, 95, 105, 110, 100, 117, 99, 116, 111, 114, 47, 99, 111, 100, 101, 99, 97, 99, 104, 101, 46, 112, 121};
21
+ .global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .global .align 1 .b8 assertFunc_0[8] = {60, 109, 111, 100, 117, 108, 101, 62};
23
+ .global .align 1 .b8 assertFile_0[68] = {47, 117, 115, 114, 47, 108, 111, 99, 97, 108, 47, 108, 105, 98, 47, 112, 121, 116, 104, 111, 110, 51, 46, 49, 48, 47, 100, 105, 115, 116, 45, 112, 97, 99, 107, 97, 103, 101, 115, 47, 116, 111, 114, 99, 104, 47, 95, 105, 110, 100, 117, 99, 116, 111, 114, 47, 99, 111, 100, 101, 99, 97, 99, 104, 101, 46, 112, 121};
24
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
25
+ .extern .shared .align 1 .b8 global_smem[];
26
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
27
+
28
+ .visible .entry triton__0d1d2d3d4d5d6de7de(
29
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_0,
30
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_1,
31
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_2,
32
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_3,
33
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_4,
34
+ .param .u64 triton__0d1d2d3d4d5d6de7de_param_5,
35
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_6,
36
+ .param .u32 triton__0d1d2d3d4d5d6de7de_param_7
37
+ )
38
+ .maxntid 128, 1, 1
39
+ {
40
+ .reg .pred %p<65>;
41
+ .reg .b16 %rs<13>;
42
+ .reg .b32 %r<188>;
43
+ .reg .f32 %f<166>;
44
+ .reg .b64 %rd<99>;
45
+ .loc 1 18 0
46
+ $L__func_begin0:
47
+ .loc 1 18 0
48
+
49
+ ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6de7de_param_3];
50
+ ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6de7de_param_2];
51
+ ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6de7de_param_0];
52
+ $L__tmp0:
53
+ .loc 1 22 44
54
+ mov.u32 %r1, %tid.x;
55
+ ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6de7de_param_1];
56
+ bfe.u32 %r3, %r1, 6, 1;
57
+ and.b32 %r4, %r1, 1;
58
+ .loc 1 24 33
59
+ shl.b32 %r23, %r1, 1;
60
+ and.b32 %r5, %r23, 126;
61
+ .loc 1 21 28
62
+ mov.u32 %r14, %ctaid.x;
63
+ .loc 1 21 33
64
+ shl.b32 %r24, %r14, 1;
65
+ .loc 1 22 23
66
+ or.b32 %r25, %r24, %r3;
67
+ or.b32 %r26, %r24, %r4;
68
+ .loc 1 26 30
69
+ mul.wide.s32 %rd26, %r25, 8;
70
+ add.s64 %rd17, %rd24, %rd26;
71
+ mul.wide.s32 %rd27, %r26, 8;
72
+ add.s64 %rd21, %rd24, %rd27;
73
+ mov.pred %p61, -1;
74
+ .loc 1 26 35
75
+ mov.u64 %rd16, 0x0;
76
+ @%p61 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd17 + 0 ];
77
+ mov.u64 %rd18, 0x0;
78
+ @%p61 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd17 + 0 ];
79
+ mov.u64 %rd20, 0x0;
80
+ @%p61 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ];
81
+ .loc 1 27 18
82
+ bfe.s32 %r27, %r14, 30, 1;
83
+ shr.u32 %r28, %r27, 23;
84
+ add.s32 %r29, %r25, %r28;
85
+ and.b32 %r30, %r29, 16776704;
86
+ sub.s32 %r31, %r25, %r30;
87
+ .loc 1 35 44
88
+ shl.b32 %r6, %r31, 8;
89
+ .loc 1 36 44
90
+ shl.b32 %r7, %r25, 8;
91
+ .loc 1 37 22
92
+ add.s64 %rd28, %rd20, 50257;
93
+ .loc 1 38 22
94
+ setp.lt.s64 %p9, %rd16, 0;
95
+ setp.lt.s64 %p10, %rd20, 0;
96
+ .loc 1 39 36
97
+ selp.b64 %rd1, %rd28, %rd20, %p10;
98
+ .loc 1 40 40
99
+ setp.lt.u64 %p11, %rd1, 50257;
100
+ .loc 1 41 44
101
+ shl.b64 %rd29, %rd16, 8;
102
+ add.s64 %rd30, %rd29, 12865792;
103
+ selp.b64 %rd31, %rd30, %rd29, %p9;
104
+ shl.b64 %rd32, %rd31, 2;
105
+ add.s64 %rd2, %rd25, %rd32;
106
+ .loc 1 35 40
107
+ or.b32 %r32, %r5, %r6;
108
+ .loc 1 35 34
109
+ mul.wide.s32 %rd33, %r32, 4;
110
+ add.s64 %rd62, %rd12, %rd33;
111
+ mov.b32 %r179, 0;
112
+ .loc 1 35 50
113
+ mov.u32 %r15, 0x0;
114
+ mov.u32 %r16, 0x0;
115
+ @%p61 ld.global.L1::evict_last.v2.b32 { %r15, %r16 }, [ %rd62 + 0 ];
116
+ @!%p61 mov.u32 %r15, %r179;
117
+ @!%p61 mov.u32 %r16, %r179;
118
+ mov.b32 %f2, %r16;
119
+ mov.b32 %f1, %r15;
120
+ .loc 1 36 40
121
+ or.b32 %r33, %r5, %r7;
122
+ .loc 1 36 34
123
+ mul.wide.s32 %rd34, %r33, 2;
124
+ add.s64 %rd63, %rd13, %rd34;
125
+ .loc 1 36 50
126
+ mov.u32 %r19, 0x0;
127
+ @%p61 ld.global.L1::evict_last.b32 { %r19 }, [ %rd63 + 0 ];
128
+ @!%p61 mov.u32 %r19, %r179;
129
+ cvt.u16.u32 %rs1, %r19;
130
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r19; }
131
+ .loc 1 36 101
132
+ cvt.f32.bf16 %r21, %rs1;
133
+ mov.b32 %f3, %r21;
134
+ cvt.f32.bf16 %r22, %rs2;
135
+ mov.b32 %f4, %r22;
136
+ mov.u64 %rd95, assertMessage_0;
137
+ mov.u64 %rd96, assertFile_0;
138
+ mov.u64 %rd97, assertFunc_0;
139
+ mov.b32 %r187, 1892;
140
+ mov.u64 %rd98, 1;
141
+ .loc 1 40 55
142
+ @%p11 bra $L__BB0_2;
143
+ cvta.global.u64 %rd36, %rd95;
144
+ cvta.global.u64 %rd38, %rd96;
145
+ cvta.global.u64 %rd40, %rd97;
146
+ { // callseq 2, 0
147
+ .reg .b32 temp_param_reg;
148
+ .param .b64 param0;
149
+ st.param.b64 [param0+0], %rd36;
150
+ .param .b64 param1;
151
+ st.param.b64 [param1+0], %rd38;
152
+ .param .b32 param2;
153
+ st.param.b32 [param2+0], %r187;
154
+ .param .b64 param3;
155
+ st.param.b64 [param3+0], %rd40;
156
+ .param .b64 param4;
157
+ st.param.b64 [param4+0], %rd98;
158
+ call.uni
159
+ __assertfail,
160
+ (
161
+ param0,
162
+ param1,
163
+ param2,
164
+ param3,
165
+ param4
166
+ );
167
+ } // callseq 2
168
+ $L__BB0_2:
169
+ .loc 1 0 55
170
+ ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6de7de_param_4];
171
+ and.b32 %r2, %r1, 31;
172
+ .loc 1 41 40
173
+ cvt.u64.u32 %rd45, %r5;
174
+ .loc 1 41 34
175
+ mul.wide.u32 %rd46, %r5, 4;
176
+ add.s64 %rd73, %rd2, %rd46;
177
+ .loc 1 41 52
178
+ mov.u32 %r35, 0x0;
179
+ mov.u32 %r36, 0x0;
180
+ @%p61 ld.global.L1::evict_last.v2.b32 { %r35, %r36 }, [ %rd73 + 0 ];
181
+ @!%p61 mov.u32 %r35, %r179;
182
+ @!%p61 mov.u32 %r36, %r179;
183
+ mov.b32 %f21, %r36;
184
+ mov.b32 %f22, %r35;
185
+ .loc 1 42 22
186
+ add.f32 %f23, %f1, %f22;
187
+ add.f32 %f24, %f2, %f21;
188
+ .loc 1 44 22
189
+ add.f32 %f25, %f4, %f24;
190
+ mov.b32 %r43, %f25;
191
+ add.f32 %f26, %f3, %f23;
192
+ mov.b32 %r40, %f26;
193
+ mov.b32 %r41, 1065353216;
194
+ $L__tmp1:
195
+ .loc 2 98 30
196
+ div.full.f32 %r39, %r40, %r41;
197
+ mov.b32 %f27, %r39;
198
+ div.full.f32 %r42, %r43, %r41;
199
+ mov.b32 %f28, %r42;
200
+ .loc 2 98 22
201
+ add.f32 %f6, %f28, 0f00000000;
202
+ add.f32 %f5, %f27, 0f00000000;
203
+ .loc 2 101 30
204
+ sub.f32 %f29, %f26, %f5;
205
+ sub.f32 %f30, %f25, %f6;
206
+ $L__tmp2:
207
+ .loc 1 50 50
208
+ fma.rn.f32 %f8, %f25, %f30, 0f00000000;
209
+ fma.rn.f32 %f7, %f26, %f29, 0f00000000;
210
+ .loc 1 35 34
211
+ cvt.s64.s32 %rd47, %r6;
212
+ add.s64 %rd48, %rd45, %rd47;
213
+ shl.b64 %rd49, %rd48, 2;
214
+ add.s64 %rd50, %rd12, %rd49;
215
+ add.s64 %rd75, %rd50, 512;
216
+ .loc 1 35 50
217
+ mov.u32 %r45, 0x0;
218
+ mov.u32 %r46, 0x0;
219
+ @%p61 ld.global.L1::evict_last.v2.b32 { %r45, %r46 }, [ %rd75 + 0 ];
220
+ @!%p61 mov.u32 %r45, %r179;
221
+ @!%p61 mov.u32 %r46, %r179;
222
+ mov.b32 %f10, %r46;
223
+ mov.b32 %f9, %r45;
224
+ .loc 1 36 34
225
+ cvt.s64.s32 %rd51, %r7;
226
+ add.s64 %rd8, %rd45, %rd51;
227
+ shl.b64 %rd52, %rd8, 1;
228
+ add.s64 %rd53, %rd13, %rd52;
229
+ add.s64 %rd76, %rd53, 256;
230
+ .loc 1 36 50
231
+ mov.u32 %r49, 0x0;
232
+ @%p61 ld.global.L1::evict_last.b32 { %r49 }, [ %rd76 + 0 ];
233
+ @!%p61 mov.u32 %r49, %r179;
234
+ cvt.u16.u32 %rs3, %r49;
235
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r49; }
236
+ .loc 1 36 101
237
+ cvt.f32.bf16 %r51, %rs3;
238
+ mov.b32 %f11, %r51;
239
+ cvt.f32.bf16 %r52, %rs4;
240
+ mov.b32 %f12, %r52;
241
+ .loc 1 40 55
242
+ @%p11 bra $L__BB0_4;
243
+ cvta.global.u64 %rd55, %rd95;
244
+ cvta.global.u64 %rd57, %rd96;
245
+ cvta.global.u64 %rd59, %rd97;
246
+ { // callseq 3, 0
247
+ .reg .b32 temp_param_reg;
248
+ .param .b64 param0;
249
+ st.param.b64 [param0+0], %rd55;
250
+ .param .b64 param1;
251
+ st.param.b64 [param1+0], %rd57;
252
+ .param .b32 param2;
253
+ st.param.b32 [param2+0], %r187;
254
+ .param .b64 param3;
255
+ st.param.b64 [param3+0], %rd59;
256
+ .param .b64 param4;
257
+ st.param.b64 [param4+0], %rd98;
258
+ call.uni
259
+ __assertfail,
260
+ (
261
+ param0,
262
+ param1,
263
+ param2,
264
+ param3,
265
+ param4
266
+ );
267
+ } // callseq 3
268
+ $L__BB0_4:
269
+ .loc 1 0 55
270
+ ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6de7de_param_5];
271
+ cvt.s64.s32 %rd4, %r33;
272
+ .loc 1 41 34
273
+ add.s64 %rd86, %rd73, 512;
274
+ .loc 1 41 52
275
+ mov.u32 %r54, 0x0;
276
+ mov.u32 %r55, 0x0;
277
+ @%p61 ld.global.L1::evict_last.v2.b32 { %r54, %r55 }, [ %rd86 + 0 ];
278
+ @!%p61 mov.u32 %r54, %r179;
279
+ @!%p61 mov.u32 %r55, %r179;
280
+ mov.b32 %f31, %r54;
281
+ mov.b32 %f32, %r55;
282
+ .loc 1 42 22
283
+ add.f32 %f33, %f10, %f32;
284
+ add.f32 %f34, %f9, %f31;
285
+ .loc 1 44 22
286
+ add.f32 %f35, %f11, %f34;
287
+ add.f32 %f36, %f12, %f33;
288
+ $L__tmp3:
289
+ .loc 2 96 20
290
+ sub.f32 %f37, %f36, %f6;
291
+ mov.b32 %r62, %f37;
292
+ sub.f32 %f38, %f35, %f5;
293
+ mov.b32 %r59, %f38;
294
+ mov.b32 %r60, 1073741824;
295
+ .loc 2 98 30
296
+ div.full.f32 %r58, %r59, %r60;
297
+ mov.b32 %f39, %r58;
298
+ div.full.f32 %r61, %r62, %r60;
299
+ mov.b32 %f40, %r61;
300
+ .loc 2 98 22
301
+ add.f32 %f41, %f6, %f40;
302
+ add.f32 %f42, %f5, %f39;
303
+ .loc 2 101 30
304
+ sub.f32 %f43, %f35, %f42;
305
+ sub.f32 %f44, %f36, %f41;
306
+ $L__tmp4:
307
+ .loc 1 50 50
308
+ fma.rn.f32 %f45, %f37, %f44, %f8;
309
+ fma.rn.f32 %f46, %f38, %f43, %f7;
310
+ .loc 1 24 33
311
+ and.b32 %r119, %r1, 127;
312
+ .loc 1 31 36
313
+ shl.b32 %r120, %r119, 2;
314
+ mov.u32 %r121, global_smem;
315
+ add.s32 %r8, %r121, %r120;
316
+ st.shared.u32 [%r8], %r60;
317
+ st.shared.u32 [%r8+520], %r60;
318
+ bar.sync 0;
319
+ mad.lo.s32 %r122, %r3, 130, %r5;
320
+ shl.b32 %r123, %r122, 2;
321
+ add.s32 %r124, %r121, %r123;
322
+ ld.shared.v2.f32 {%f47, %f48}, [%r124];
323
+ $L__tmp5:
324
+ .loc 2 120 46
325
+ bar.sync 0;
326
+ $L__tmp6:
327
+ .loc 2 108 21
328
+ sub.f32 %f49, %f41, %f42;
329
+ .loc 2 109 28
330
+ add.f32 %f50, %f47, %f48;
331
+ .loc 2 110 39
332
+ setp.eq.f32 %p41, %f50, 0f00000000;
333
+ .loc 2 110 60
334
+ mov.b32 %r65, %f48;
335
+ mov.b32 %r66, %f50;
336
+ div.full.f32 %r64, %r65, %r66;
337
+ mov.b32 %f51, %r64;
338
+ .loc 2 110 49
339
+ selp.f32 %f52, 0f00000000, %f51, %p41;
340
+ .loc 2 112 17
341
+ fma.rn.f32 %f53, %f49, %f52, %f42;
342
+ .loc 2 113 15
343
+ add.f32 %f54, %f46, %f45;
344
+ .loc 2 113 30
345
+ mul.f32 %f55, %f49, %f49;
346
+ .loc 2 113 38
347
+ mul.f32 %f56, %f55, %f47;
348
+ .loc 2 113 22
349
+ fma.rn.f32 %f57, %f56, %f52, %f54;
350
+ $L__tmp7:
351
+ .loc 2 120 46
352
+ mov.b32 %r125, %f53;
353
+ shfl.sync.bfly.b32 %r126, %r125, 16, 31, -1;
354
+ mov.b32 %f58, %r126;
355
+ mov.b32 %r127, %f57;
356
+ shfl.sync.bfly.b32 %r128, %r127, 16, 31, -1;
357
+ mov.b32 %f59, %r128;
358
+ shfl.sync.bfly.b32 %r68, %r66, 16, 31, -1;
359
+ mov.b32 %f60, %r68;
360
+ $L__tmp8:
361
+ .loc 2 108 21
362
+ sub.f32 %f61, %f58, %f53;
363
+ .loc 2 109 28
364
+ add.f32 %f62, %f50, %f60;
365
+ .loc 2 110 39
366
+ setp.eq.f32 %p42, %f62, 0f00000000;
367
+ .loc 2 110 60
368
+ mov.b32 %r69, %f62;
369
+ div.full.f32 %r67, %r68, %r69;
370
+ mov.b32 %f63, %r67;
371
+ .loc 2 110 49
372
+ selp.f32 %f64, 0f00000000, %f63, %p42;
373
+ .loc 2 112 17
374
+ fma.rn.f32 %f65, %f61, %f64, %f53;
375
+ .loc 2 113 15
376
+ add.f32 %f66, %f57, %f59;
377
+ .loc 2 113 30
378
+ mul.f32 %f67, %f61, %f61;
379
+ .loc 2 113 38
380
+ mul.f32 %f68, %f50, %f67;
381
+ .loc 2 113 22
382
+ fma.rn.f32 %f69, %f68, %f64, %f66;
383
+ $L__tmp9:
384
+ .loc 2 120 46
385
+ mov.b32 %r129, %f65;
386
+ shfl.sync.bfly.b32 %r130, %r129, 8, 31, -1;
387
+ mov.b32 %f70, %r130;
388
+ mov.b32 %r131, %f69;
389
+ shfl.sync.bfly.b32 %r132, %r131, 8, 31, -1;
390
+ mov.b32 %f71, %r132;
391
+ shfl.sync.bfly.b32 %r71, %r69, 8, 31, -1;
392
+ mov.b32 %f72, %r71;
393
+ $L__tmp10:
394
+ .loc 2 108 21
395
+ sub.f32 %f73, %f70, %f65;
396
+ .loc 2 109 28
397
+ add.f32 %f74, %f62, %f72;
398
+ .loc 2 110 39
399
+ setp.eq.f32 %p43, %f74, 0f00000000;
400
+ .loc 2 110 60
401
+ mov.b32 %r72, %f74;
402
+ div.full.f32 %r70, %r71, %r72;
403
+ mov.b32 %f75, %r70;
404
+ .loc 2 110 49
405
+ selp.f32 %f76, 0f00000000, %f75, %p43;
406
+ .loc 2 112 17
407
+ fma.rn.f32 %f77, %f73, %f76, %f65;
408
+ .loc 2 113 15
409
+ add.f32 %f78, %f69, %f71;
410
+ .loc 2 113 30
411
+ mul.f32 %f79, %f73, %f73;
412
+ .loc 2 113 38
413
+ mul.f32 %f80, %f62, %f79;
414
+ .loc 2 113 22
415
+ fma.rn.f32 %f81, %f76, %f80, %f78;
416
+ $L__tmp11:
417
+ .loc 2 120 46
418
+ mov.b32 %r133, %f77;
419
+ shfl.sync.bfly.b32 %r134, %r133, 4, 31, -1;
420
+ mov.b32 %f82, %r134;
421
+ mov.b32 %r135, %f81;
422
+ shfl.sync.bfly.b32 %r136, %r135, 4, 31, -1;
423
+ mov.b32 %f83, %r136;
424
+ shfl.sync.bfly.b32 %r74, %r72, 4, 31, -1;
425
+ mov.b32 %f84, %r74;
426
+ $L__tmp12:
427
+ .loc 2 108 21
428
+ sub.f32 %f85, %f82, %f77;
429
+ .loc 2 109 28
430
+ add.f32 %f86, %f74, %f84;
431
+ .loc 2 110 39
432
+ setp.eq.f32 %p44, %f86, 0f00000000;
433
+ .loc 2 110 60
434
+ mov.b32 %r75, %f86;
435
+ div.full.f32 %r73, %r74, %r75;
436
+ mov.b32 %f87, %r73;
437
+ .loc 2 110 49
438
+ selp.f32 %f88, 0f00000000, %f87, %p44;
439
+ .loc 2 112 17
440
+ fma.rn.f32 %f89, %f85, %f88, %f77;
441
+ .loc 2 113 15
442
+ add.f32 %f90, %f81, %f83;
443
+ .loc 2 113 30
444
+ mul.f32 %f91, %f85, %f85;
445
+ .loc 2 113 38
446
+ mul.f32 %f92, %f74, %f91;
447
+ .loc 2 113 22
448
+ fma.rn.f32 %f93, %f88, %f92, %f90;
449
+ $L__tmp13:
450
+ .loc 2 120 46
451
+ mov.b32 %r137, %f89;
452
+ shfl.sync.bfly.b32 %r138, %r137, 2, 31, -1;
453
+ mov.b32 %f94, %r138;
454
+ mov.b32 %r139, %f93;
455
+ shfl.sync.bfly.b32 %r140, %r139, 2, 31, -1;
456
+ mov.b32 %f95, %r140;
457
+ shfl.sync.bfly.b32 %r77, %r75, 2, 31, -1;
458
+ mov.b32 %f96, %r77;
459
+ $L__tmp14:
460
+ .loc 2 108 21
461
+ sub.f32 %f97, %f94, %f89;
462
+ .loc 2 109 28
463
+ add.f32 %f98, %f86, %f96;
464
+ .loc 2 110 39
465
+ setp.eq.f32 %p45, %f98, 0f00000000;
466
+ .loc 2 110 60
467
+ mov.b32 %r78, %f98;
468
+ div.full.f32 %r76, %r77, %r78;
469
+ mov.b32 %f99, %r76;
470
+ .loc 2 110 49
471
+ selp.f32 %f100, 0f00000000, %f99, %p45;
472
+ .loc 2 112 17
473
+ fma.rn.f32 %f101, %f97, %f100, %f89;
474
+ .loc 2 113 15
475
+ add.f32 %f102, %f93, %f95;
476
+ .loc 2 113 30
477
+ mul.f32 %f103, %f97, %f97;
478
+ .loc 2 113 38
479
+ mul.f32 %f104, %f86, %f103;
480
+ .loc 2 113 22
481
+ fma.rn.f32 %f105, %f100, %f104, %f102;
482
+ $L__tmp15:
483
+ .loc 2 120 46
484
+ mov.b32 %r141, %f101;
485
+ shfl.sync.bfly.b32 %r142, %r141, 1, 31, -1;
486
+ mov.b32 %f106, %r142;
487
+ mov.b32 %r143, %f105;
488
+ shfl.sync.bfly.b32 %r144, %r143, 1, 31, -1;
489
+ mov.b32 %f107, %r144;
490
+ shfl.sync.bfly.b32 %r80, %r78, 1, 31, -1;
491
+ mov.b32 %f108, %r80;
492
+ $L__tmp16:
493
+ .loc 2 108 21
494
+ sub.f32 %f109, %f106, %f101;
495
+ .loc 2 109 28
496
+ add.f32 %f110, %f98, %f108;
497
+ .loc 2 110 39
498
+ setp.eq.f32 %p46, %f110, 0f00000000;
499
+ .loc 2 110 60
500
+ mov.b32 %r81, %f110;
501
+ div.full.f32 %r79, %r80, %r81;
502
+ mov.b32 %f111, %r79;
503
+ .loc 2 110 49
504
+ selp.f32 %f112, 0f00000000, %f111, %p46;
505
+ .loc 2 112 17
506
+ fma.rn.f32 %f113, %f109, %f112, %f101;
507
+ .loc 2 113 15
508
+ add.f32 %f114, %f105, %f107;
509
+ .loc 2 113 30
510
+ mul.f32 %f115, %f109, %f109;
511
+ .loc 2 113 38
512
+ mul.f32 %f116, %f98, %f115;
513
+ .loc 2 113 22
514
+ fma.rn.f32 %f117, %f112, %f116, %f114;
515
+ $L__tmp17:
516
+ .loc 2 120 46
517
+ setp.eq.s32 %p24, %r2, 0;
518
+ shr.u32 %r145, %r1, 3;
519
+ and.b32 %r146, %r145, 4;
520
+ shl.b32 %r147, %r3, 3;
521
+ or.b32 %r148, %r147, %r146;
522
+ add.s32 %r82, %r121, %r148;
523
+ mov.b32 %r83, %f113;
524
+ @%p24 st.shared.b32 [ %r82 + 0 ], %r83;
525
+ add.s32 %r149, %r121, 16;
526
+ add.s32 %r84, %r149, %r148;
527
+ mov.b32 %r85, %f117;
528
+ @%p24 st.shared.b32 [ %r84 + 0 ], %r85;
529
+ add.s32 %r150, %r121, 32;
530
+ add.s32 %r86, %r150, %r148;
531
+ @%p24 st.shared.b32 [ %r86 + 0 ], %r81;
532
+ bar.sync 0;
533
+ setp.lt.s32 %p27, %r1, 4;
534
+ shl.b32 %r151, %r1, 2;
535
+ add.s32 %r89, %r121, %r151;
536
+ @%p27 ld.shared.b32 %r88, [ %r89 + 0 ];
537
+ mov.b32 %f118, %r88;
538
+ add.s32 %r91, %r149, %r151;
539
+ @%p27 ld.shared.b32 %r90, [ %r91 + 0 ];
540
+ mov.b32 %f119, %r90;
541
+ add.s32 %r93, %r150, %r151;
542
+ @%p27 ld.shared.b32 %r92, [ %r93 + 0 ];
543
+ mov.b32 %f120, %r92;
544
+ shfl.sync.bfly.b32 %r152, %r88, 1, 31, -1;
545
+ mov.b32 %f121, %r152;
546
+ shfl.sync.bfly.b32 %r153, %r90, 1, 31, -1;
547
+ mov.b32 %f122, %r153;
548
+ shfl.sync.bfly.b32 %r95, %r92, 1, 31, -1;
549
+ mov.b32 %f123, %r95;
550
+ $L__tmp18:
551
+ .loc 2 108 21
552
+ sub.f32 %f124, %f121, %f118;
553
+ .loc 2 109 28
554
+ add.f32 %f125, %f120, %f123;
555
+ .loc 2 110 39
556
+ setp.eq.f32 %p47, %f125, 0f00000000;
557
+ .loc 2 110 60
558
+ mov.b32 %r96, %f125;
559
+ div.full.f32 %r94, %r95, %r96;
560
+ mov.b32 %f126, %r94;
561
+ .loc 2 110 49
562
+ selp.f32 %f127, 0f00000000, %f126, %p47;
563
+ .loc 2 112 17
564
+ fma.rn.f32 %f128, %f124, %f127, %f118;
565
+ .loc 2 113 15
566
+ add.f32 %f129, %f119, %f122;
567
+ .loc 2 113 30
568
+ mul.f32 %f130, %f124, %f124;
569
+ .loc 2 113 38
570
+ mul.f32 %f131, %f120, %f130;
571
+ .loc 2 113 22
572
+ fma.rn.f32 %f132, %f131, %f127, %f129;
573
+ $L__tmp19:
574
+ .loc 2 120 46
575
+ setp.eq.s32 %p48, %r4, 0;
576
+ and.pred %p30, %p27, %p48;
577
+ mov.b32 %r98, %f128;
578
+ @%p30 st.shared.b32 [ %r89 + 0 ], %r98;
579
+ mov.b32 %r100, %f132;
580
+ @%p30 st.shared.b32 [ %r91 + 0 ], %r100;
581
+ @%p30 st.shared.b32 [ %r93 + 0 ], %r96;
582
+ bar.sync 0;
583
+ add.s32 %r154, %r121, %r147;
584
+ ld.shared.f32 %f13, [%r154];
585
+ add.s32 %r155, %r149, %r147;
586
+ $L__tmp20:
587
+ .loc 1 75 24
588
+ ld.shared.u32 %r104, [%r155];
589
+ mov.b32 %r105, 1132462080;
590
+ div.full.f32 %r103, %r104, %r105;
591
+ mov.b32 %f133, %r103;
592
+ .loc 1 77 24
593
+ add.f32 %f14, %f133, 0f3727C5AC;
594
+ shl.b32 %r156, %r5, 2;
595
+ add.s32 %r9, %r121, %r156;
596
+ .loc 1 62 51
597
+ mov.u32 %r109, 0x0;
598
+ mov.u32 %r110, 0x0;
599
+ @%p61 ld.global.L1::evict_last.v2.b32 { %r109, %r110 }, [ %rd62 + 0 ];
600
+ @!%p61 mov.u32 %r109, %r179;
601
+ @!%p61 mov.u32 %r110, %r179;
602
+ mov.b32 %f15, %r109;
603
+ mov.b32 %f16, %r110;
604
+ .loc 1 63 51
605
+ mov.u32 %r113, 0x0;
606
+ @%p61 ld.global.L1::evict_first.b32 { %r113 }, [ %rd63 + 0 ];
607
+ @!%p61 mov.u32 %r113, %r179;
608
+ cvt.u16.u32 %rs5, %r113;
609
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r113; }
610
+ .loc 1 63 103
611
+ cvt.f32.bf16 %r115, %rs5;
612
+ mov.b32 %f17, %r115;
613
+ cvt.f32.bf16 %r116, %rs6;
614
+ mov.b32 %f18, %r116;
615
+ .loc 1 64 35
616
+ mul.wide.u32 %rd65, %r119, 4;
617
+ add.s64 %rd64, %rd14, %rd65;
618
+ .loc 1 64 40
619
+ mov.u32 %r117, 0x0;
620
+ @%p61 ld.global.L1::evict_last.b32 { %r117 }, [ %rd64 + 0 ];
621
+ @!%p61 mov.u32 %r117, %r179;
622
+ mov.u64 %rd90, assertMessage_1;
623
+ mov.u64 %rd91, assertFile_1;
624
+ mov.u64 %rd92, assertFunc_1;
625
+ .loc 1 68 57
626
+ @%p11 bra $L__BB0_6;
627
+ cvta.global.u64 %rd67, %rd90;
628
+ cvta.global.u64 %rd69, %rd91;
629
+ cvta.global.u64 %rd71, %rd92;
630
+ { // callseq 4, 0
631
+ .reg .b32 temp_param_reg;
632
+ .param .b64 param0;
633
+ st.param.b64 [param0+0], %rd67;
634
+ .param .b64 param1;
635
+ st.param.b64 [param1+0], %rd69;
636
+ .param .b32 param2;
637
+ st.param.b32 [param2+0], %r187;
638
+ .param .b64 param3;
639
+ st.param.b64 [param3+0], %rd71;
640
+ .param .b64 param4;
641
+ st.param.b64 [param4+0], %rd98;
642
+ call.uni
643
+ __assertfail,
644
+ (
645
+ param0,
646
+ param1,
647
+ param2,
648
+ param3,
649
+ param4
650
+ );
651
+ } // callseq 4
652
+ $L__BB0_6:
653
+ .loc 1 69 54
654
+ mov.u32 %r158, 0x0;
655
+ mov.u32 %r159, 0x0;
656
+ @%p61 ld.global.L1::evict_first.v2.b32 { %r158, %r159 }, [ %rd73 + 0 ];
657
+ @!%p61 mov.u32 %r158, %r179;
658
+ @!%p61 mov.u32 %r159, %r179;
659
+ mov.b32 %f134, %r158;
660
+ mov.b32 %f135, %r159;
661
+ .loc 1 70 24
662
+ add.f32 %f136, %f15, %f134;
663
+ add.f32 %f137, %f16, %f135;
664
+ .loc 1 72 24
665
+ add.f32 %f138, %f17, %f136;
666
+ add.f32 %f139, %f18, %f137;
667
+ .loc 1 73 24
668
+ sub.f32 %f140, %f138, %f13;
669
+ sub.f32 %f141, %f139, %f13;
670
+ .loc 1 78 30
671
+ rsqrt.approx.ftz.f32 %f142, %f14;
672
+ .loc 1 79 24
673
+ mul.f32 %f143, %f140, %f142;
674
+ mul.f32 %f144, %f141, %f142;
675
+ .loc 1 80 24
676
+ bar.sync 0;
677
+ st.shared.u32 [%r8], %r117;
678
+ bar.sync 0;
679
+ ld.shared.v2.f32 {%f145, %f146}, [%r9];
680
+ mul.f32 %f147, %f143, %f145;
681
+ mul.f32 %f148, %f144, %f146;
682
+ .loc 1 82 29
683
+ shl.b64 %rd78, %rd4, 1;
684
+ add.s64 %rd74, %rd15, %rd78;
685
+ .loc 1 82 52
686
+ mov.b32 %r162, %f147;
687
+ cvt.rn.bf16.f32 %rs7, %r162;
688
+ mov.b32 %r163, %f148;
689
+ cvt.rn.bf16.f32 %rs8, %r163;
690
+ mov.b32 %r175, {%rs7, %rs8};
691
+ @%p61 st.global.b32 [ %rd74 + 0 ], { %r175 };
692
+ .loc 1 62 51
693
+ mov.u32 %r165, 0x0;
694
+ mov.u32 %r166, 0x0;
695
+ @%p61 ld.global.L1::evict_last.v2.b32 { %r165, %r166 }, [ %rd75 + 0 ];
696
+ @!%p61 mov.u32 %r165, %r179;
697
+ @!%p61 mov.u32 %r166, %r179;
698
+ .loc 1 63 51
699
+ mov.u32 %r169, 0x0;
700
+ @%p61 ld.global.L1::evict_first.b32 { %r169 }, [ %rd76 + 0 ];
701
+ @!%p61 mov.u32 %r169, %r179;
702
+ cvt.u16.u32 %rs9, %r169;
703
+ { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r169; }
704
+ .loc 1 63 103
705
+ cvt.f32.bf16 %r171, %rs9;
706
+ mov.b32 %f19, %r171;
707
+ cvt.f32.bf16 %r172, %rs10;
708
+ mov.b32 %f20, %r172;
709
+ .loc 1 64 35
710
+ add.s64 %rd77, %rd64, 512;
711
+ .loc 1 64 40
712
+ mov.u32 %r173, 0x0;
713
+ @%p61 ld.global.L1::evict_last.b32 { %r173 }, [ %rd77 + 0 ];
714
+ @!%p61 mov.u32 %r173, %r179;
715
+ .loc 1 68 57
716
+ @%p11 bra $L__BB0_8;
717
+ cvta.global.u64 %rd80, %rd90;
718
+ cvta.global.u64 %rd82, %rd91;
719
+ cvta.global.u64 %rd84, %rd92;
720
+ { // callseq 5, 0
721
+ .reg .b32 temp_param_reg;
722
+ .param .b64 param0;
723
+ st.param.b64 [param0+0], %rd80;
724
+ .param .b64 param1;
725
+ st.param.b64 [param1+0], %rd82;
726
+ .param .b32 param2;
727
+ st.param.b32 [param2+0], %r187;
728
+ .param .b64 param3;
729
+ st.param.b64 [param3+0], %rd84;
730
+ .param .b64 param4;
731
+ st.param.b64 [param4+0], %rd98;
732
+ call.uni
733
+ __assertfail,
734
+ (
735
+ param0,
736
+ param1,
737
+ param2,
738
+ param3,
739
+ param4
740
+ );
741
+ } // callseq 5
742
+ $L__BB0_8:
743
+ .loc 1 69 54
744
+ mov.u32 %r177, 0x0;
745
+ mov.u32 %r178, 0x0;
746
+ @%p61 ld.global.L1::evict_first.v2.b32 { %r177, %r178 }, [ %rd86 + 0 ];
747
+ @!%p61 mov.u32 %r177, %r179;
748
+ @!%p61 mov.u32 %r178, %r179;
749
+ .loc 1 62 51
750
+ mov.b32 %f150, %r166;
751
+ .loc 1 69 54
752
+ mov.b32 %f151, %r178;
753
+ .loc 1 70 24
754
+ add.f32 %f152, %f150, %f151;
755
+ .loc 1 72 24
756
+ add.f32 %f153, %f20, %f152;
757
+ .loc 1 73 24
758
+ sub.f32 %f154, %f153, %f13;
759
+ .loc 1 62 51
760
+ mov.b32 %f155, %r165;
761
+ .loc 1 69 54
762
+ mov.b32 %f156, %r177;
763
+ .loc 1 70 24
764
+ add.f32 %f157, %f155, %f156;
765
+ .loc 1 72 24
766
+ add.f32 %f158, %f19, %f157;
767
+ .loc 1 73 24
768
+ sub.f32 %f159, %f158, %f13;
769
+ .loc 1 79 24
770
+ mul.f32 %f160, %f159, %f142;
771
+ mul.f32 %f161, %f154, %f142;
772
+ .loc 1 80 24
773
+ bar.sync 0;
774
+ st.shared.u32 [%r8], %r173;
775
+ bar.sync 0;
776
+ ld.shared.v2.f32 {%f162, %f163}, [%r9];
777
+ mul.f32 %f164, %f160, %f162;
778
+ mul.f32 %f165, %f161, %f163;
779
+ .loc 1 82 29
780
+ add.s64 %rd89, %rd15, %rd52;
781
+ add.s64 %rd87, %rd89, 256;
782
+ .loc 1 82 52
783
+ mov.b32 %r181, %f164;
784
+ cvt.rn.bf16.f32 %rs11, %r181;
785
+ mov.b32 %r182, %f165;
786
+ cvt.rn.bf16.f32 %rs12, %r182;
787
+ mov.b32 %r184, {%rs11, %rs12};
788
+ @%p61 st.global.b32 [ %rd87 + 0 ], { %r184 };
789
+ .loc 1 58 4
790
+ ret;
791
+ $L__tmp21:
792
+ $L__func_end0:
793
+
794
+ }
795
+ // .globl __nv_rsqrtf
796
+ // __nv_rsqrtf: device function returning the approximate reciprocal square root of one f32.
+ // In:  __nv_rsqrtf_param_0 - 32-bit parameter, loaded as f32.
+ // Out: func_retval0        - 32-bit return slot, written as f32.
+ // Uses only its own virtual registers %f1 (input) and %f2 (result); touches no memory besides params.
+ .visible .func (.param .b32 func_retval0) __nv_rsqrtf(
797
+ .param .b32 __nv_rsqrtf_param_0
798
+ )
799
+ {
800
+ .reg .f32 %f<3>;
801
+ $L__func_begin1:
802
+
803
+ ld.param.f32 %f1, [__nv_rsqrtf_param_0];
804
+ // .approx = fast approximate form; .ftz = flush subnormal inputs/results to zero (see PTX ISA, rsqrt).
+ rsqrt.approx.ftz.f32 %f2, %f1;
805
+ st.param.f32 [func_retval0+0], %f2;
806
+ ret;
807
+ $L__func_end1:
808
+
809
+ }
810
+ .file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py"
811
+ .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
812
+ .section .debug_abbrev
813
+ {
814
+ .b8 1
815
+ .b8 17
816
+ .b8 1
817
+ .b8 37
818
+ .b8 8
819
+ .b8 19
820
+ .b8 5
821
+ .b8 3
822
+ .b8 8
823
+ .b8 16
824
+ .b8 6
825
+ .b8 27
826
+ .b8 8
827
+ .b8 180
828
+ .b8 66
829
+ .b8 12
830
+ .b8 17
831
+ .b8 1
832
+ .b8 18
833
+ .b8 1
834
+ .b8 0
835
+ .b8 0
836
+ .b8 2
837
+ .b8 46
838
+ .b8 0
839
+ .b8 135
840
+ .b8 64
841
+ .b8 8
842
+ .b8 3
843
+ .b8 8
844
+ .b8 58
845
+ .b8 11
846
+ .b8 59
847
+ .b8 11
848
+ .b8 63
849
+ .b8 12
850
+ .b8 32
851
+ .b8 11
852
+ .b8 0
853
+ .b8 0
854
+ .b8 3
855
+ .b8 46
856
+ .b8 1
857
+ .b8 17
858
+ .b8 1
859
+ .b8 18
860
+ .b8 1
861
+ .b8 64
862
+ .b8 10
863
+ .b8 49
864
+ .b8 19
865
+ .b8 0
866
+ .b8 0
867
+ .b8 4
868
+ .b8 29
869
+ .b8 0
870
+ .b8 49
871
+ .b8 19
872
+ .b8 17
873
+ .b8 1
874
+ .b8 18
875
+ .b8 1
876
+ .b8 88
877
+ .b8 11
878
+ .b8 89
879
+ .b8 11
880
+ .b8 87
881
+ .b8 11
882
+ .b8 0
883
+ .b8 0
884
+ .b8 5
885
+ .b8 29
886
+ .b8 1
887
+ .b8 49
888
+ .b8 19
889
+ .b8 17
890
+ .b8 1
891
+ .b8 18
892
+ .b8 1
893
+ .b8 88
894
+ .b8 11
895
+ .b8 89
896
+ .b8 11
897
+ .b8 87
898
+ .b8 11
899
+ .b8 0
900
+ .b8 0
901
+ .b8 0
902
+ }
903
+ .section .debug_info
904
+ {
905
+ .b32 302
906
+ .b8 2
907
+ .b8 0
908
+ .b32 .debug_abbrev
909
+ .b8 8
910
+ .b8 1
911
+ .b8 116
912
+ .b8 114
913
+ .b8 105
914
+ .b8 116
915
+ .b8 111
916
+ .b8 110
917
+ .b8 0
918
+ .b8 2
919
+ .b8 0
920
+ .b8 99
921
+ .b8 112
922
+ .b8 110
923
+ .b8 51
924
+ .b8 108
925
+ .b8 97
926
+ .b8 119
927
+ .b8 103
928
+ .b8 54
929
+ .b8 53
930
+ .b8 108
931
+ .b8 112
932
+ .b8 105
933
+ .b8 54
934
+ .b8 51
935
+ .b8 103
936
+ .b8 118
937
+ .b8 54
938
+ .b8 99
939
+ .b8 54
940
+ .b8 112
941
+ .b8 110
942
+ .b8 52
943
+ .b8 111
944
+ .b8 105
945
+ .b8 107
946
+ .b8 104
947
+ .b8 103
948
+ .b8 54
949
+ .b8 113
950
+ .b8 118
951
+ .b8 97
952
+ .b8 50
953
+ .b8 104
954
+ .b8 50
955
+ .b8 113
956
+ .b8 106
957
+ .b8 100
958
+ .b8 112
959
+ .b8 120
960
+ .b8 101
961
+ .b8 54
962
+ .b8 113
963
+ .b8 106
964
+ .b8 52
965
+ .b8 108
966
+ .b8 118
967
+ .b8 116
968
+ .b8 116
969
+ .b8 119
970
+ .b8 101
971
+ .b8 122
972
+ .b8 46
973
+ .b8 112
974
+ .b8 121
975
+ .b8 0
976
+ .b32 .debug_line
977
+ .b8 47
978
+ .b8 116
979
+ .b8 109
980
+ .b8 112
981
+ .b8 47
982
+ .b8 116
983
+ .b8 111
984
+ .b8 114
985
+ .b8 99
986
+ .b8 104
987
+ .b8 105
988
+ .b8 110
989
+ .b8 100
990
+ .b8 117
991
+ .b8 99
992
+ .b8 116
993
+ .b8 111
994
+ .b8 114
995
+ .b8 95
996
+ .b8 114
997
+ .b8 111
998
+ .b8 111
999
+ .b8 116
1000
+ .b8 47
1001
+ .b8 112
1002
+ .b8 110
1003
+ .b8 0
1004
+ .b8 1
1005
+ .b64 $L__func_begin0
1006
+ .b64 $L__func_end0
1007
+ .b8 2
1008
+ .b8 116
1009
+ .b8 114
1010
+ .b8 105
1011
+ .b8 116
1012
+ .b8 111
1013
+ .b8 110
1014
+ .b8 95
1015
+ .b8 95
1016
+ .b8 48
1017
+ .b8 100
1018
+ .b8 49
1019
+ .b8 100
1020
+ .b8 50
1021
+ .b8 100
1022
+ .b8 51
1023
+ .b8 100
1024
+ .b8 52
1025
+ .b8 100
1026
+ .b8 53
1027
+ .b8 100
1028
+ .b8 54
1029
+ .b8 100
1030
+ .b8 101
1031
+ .b8 55
1032
+ .b8 100
1033
+ .b8 101
1034
+ .b8 0
1035
+ .b8 116
1036
+ .b8 114
1037
+ .b8 105
1038
+ .b8 116
1039
+ .b8 111
1040
+ .b8 110
1041
+ .b8 95
1042
+ .b8 95
1043
+ .b8 48
1044
+ .b8 100
1045
+ .b8 49
1046
+ .b8 100
1047
+ .b8 50
1048
+ .b8 100
1049
+ .b8 51
1050
+ .b8 100
1051
+ .b8 52
1052
+ .b8 100
1053
+ .b8 53
1054
+ .b8 100
1055
+ .b8 54
1056
+ .b8 100
1057
+ .b8 101
1058
+ .b8 55
1059
+ .b8 100
1060
+ .b8 101
1061
+ .b8 0
1062
+ .b8 1
1063
+ .b8 18
1064
+ .b8 1
1065
+ .b8 1
1066
+ .b8 3
1067
+ .b64 $L__func_begin0
1068
+ .b64 $L__func_end0
1069
+ .b8 1
1070
+ .b8 156
1071
+ .b32 125
1072
+ .b8 4
1073
+ .b32 125
1074
+ .b64 $L__tmp1
1075
+ .b64 $L__tmp4
1076
+ .b8 2
1077
+ .b8 47
1078
+ .b8 41
1079
+ .b8 4
1080
+ .b32 125
1081
+ .b64 $L__tmp5
1082
+ .b64 $L__tmp20
1083
+ .b8 2
1084
+ .b8 53
1085
+ .b8 44
1086
+ .b8 5
1087
+ .b32 125
1088
+ .b64 $L__tmp6
1089
+ .b64 $L__tmp19
1090
+ .b8 2
1091
+ .b8 53
1092
+ .b8 44
1093
+ .b8 4
1094
+ .b32 125
1095
+ .b64 $L__tmp6
1096
+ .b64 $L__tmp19
1097
+ .b8 2
1098
+ .b8 120
1099
+ .b8 46
1100
+ .b8 0
1101
+ .b8 0
1102
+ .b8 0
1103
+ }
1104
+ .section .debug_pubnames
1105
+ {
1106
+ .b32 $L__pubNames_end0-$L__pubNames_start0
1107
+ $L__pubNames_start0:
1108
+ .b8 2
1109
+ .b8 0
1110
+ .b32 .debug_info
1111
+ .b32 306
1112
+ .b32 125
1113
+ .b8 116
1114
+ .b8 114
1115
+ .b8 105
1116
+ .b8 116
1117
+ .b8 111
1118
+ .b8 110
1119
+ .b8 95
1120
+ .b8 95
1121
+ .b8 48
1122
+ .b8 100
1123
+ .b8 49
1124
+ .b8 100
1125
+ .b8 50
1126
+ .b8 100
1127
+ .b8 51
1128
+ .b8 100
1129
+ .b8 52
1130
+ .b8 100
1131
+ .b8 53
1132
+ .b8 100
1133
+ .b8 54
1134
+ .b8 100
1135
+ .b8 101
1136
+ .b8 55
1137
+ .b8 100
1138
+ .b8 101
1139
+ .b8 0
1140
+ .b32 0
1141
+ $L__pubNames_end0:
1142
+ }
1143
+ .section .debug_pubtypes
1144
+ {
1145
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1146
+ $L__pubTypes_start0:
1147
+ .b8 2
1148
+ .b8 0
1149
+ .b32 .debug_info
1150
+ .b32 306
1151
+ .b32 0
1152
+ $L__pubTypes_end0:
1153
+ }
1154
+ .section .debug_loc { }
.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ptx ADDED
@@ -0,0 +1,1608 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6e7de
10
+ // External assertion-failure handler, defined outside this module; takes five parameters (b64, b64, b32, b64, b64).
+ // NOTE(review): argument roles look like (message ptr, file ptr, line number, function ptr, char size) - confirm against the CUDA device runtime declaration.
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ // ASCII bytes (no NUL terminator): "_call_with_frames_removed"
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ // ASCII bytes (no NUL terminator): "<frozen importlib._bootstrap_external>"
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ // ASCII bytes (no NUL terminator): "index out of bounds: 0 <= tmp7 < 50257"
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 55, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ // Shared-memory byte array declared extern with no size here; the size is supplied outside this module.
+ .extern .shared .align 1 .b8 global_smem[];
23
+ // NUL-terminated ASCII string "__CUDA_FTZ"
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
24
+
25
+ .visible .entry triton__0d1d2d3d4d5d6e7de(
26
+ .param .u64 triton__0d1d2d3d4d5d6e7de_param_0,
27
+ .param .u64 triton__0d1d2d3d4d5d6e7de_param_1,
28
+ .param .u64 triton__0d1d2d3d4d5d6e7de_param_2,
29
+ .param .u64 triton__0d1d2d3d4d5d6e7de_param_3,
30
+ .param .u64 triton__0d1d2d3d4d5d6e7de_param_4,
31
+ .param .u64 triton__0d1d2d3d4d5d6e7de_param_5,
32
+ .param .u64 triton__0d1d2d3d4d5d6e7de_param_6,
33
+ .param .u64 triton__0d1d2d3d4d5d6e7de_param_7
34
+ )
35
+ .maxntid 256, 1, 1
36
+ {
37
+ .reg .pred %p<154>;
38
+ .reg .b16 %rs<83>;
39
+ .reg .b32 %r<247>;
40
+ .reg .f32 %f<401>;
41
+ .reg .b64 %rd<217>;
42
+ .loc 1 18 0
43
+ $L__func_begin0:
44
+ .loc 1 18 0
45
+
46
+ ld.param.u64 %rd48, [triton__0d1d2d3d4d5d6e7de_param_5];
47
+ ld.param.u64 %rd47, [triton__0d1d2d3d4d5d6e7de_param_4];
48
+ ld.param.u64 %rd57, [triton__0d1d2d3d4d5d6e7de_param_0];
49
+ ld.param.u64 %rd58, [triton__0d1d2d3d4d5d6e7de_param_1];
50
+ $L__tmp0:
51
+ .loc 1 24 33
52
+ mov.u32 %r1, %tid.x;
53
+ ld.param.u64 %rd59, [triton__0d1d2d3d4d5d6e7de_param_2];
54
+ and.b32 %r2, %r1, 255;
55
+ ld.param.u64 %rd60, [triton__0d1d2d3d4d5d6e7de_param_3];
56
+ shl.b32 %r3, %r2, 2;
57
+ or.b32 %r36, %r3, 1;
58
+ or.b32 %r37, %r3, 2;
59
+ or.b32 %r38, %r3, 3;
60
+ or.b32 %r39, %r2, 256;
61
+ or.b32 %r40, %r2, 512;
62
+ or.b32 %r41, %r2, 768;
63
+ .loc 1 21 28
64
+ mov.u32 %r34, %ctaid.x;
65
+ .loc 1 21 34
66
+ cvt.s64.s32 %rd1, %r34;
67
+ .loc 1 23 21
68
+ setp.lt.s32 %p1, %r34, 8;
69
+ shl.b32 %r42, %r2, 3;
70
+ shl.b32 %r43, %r2, 5;
71
+ mov.u32 %r44, global_smem;
72
+ add.s32 %r4, %r44, %r43;
73
+ shl.b32 %r45, %r36, 1;
74
+ shl.b32 %r46, %r36, 3;
75
+ add.s32 %r5, %r44, %r46;
76
+ shl.b32 %r47, %r37, 1;
77
+ shl.b32 %r48, %r37, 3;
78
+ add.s32 %r6, %r44, %r48;
79
+ shl.b32 %r49, %r38, 1;
80
+ shl.b32 %r50, %r38, 3;
81
+ add.s32 %r7, %r44, %r50;
82
+ shl.b32 %r51, %r2, 1;
83
+ add.s32 %r12, %r44, %r42;
84
+ shl.b32 %r52, %r39, 1;
85
+ shl.b32 %r53, %r39, 3;
86
+ add.s32 %r9, %r44, %r53;
87
+ shl.b32 %r54, %r40, 1;
88
+ shl.b32 %r55, %r40, 3;
89
+ add.s32 %r10, %r44, %r55;
90
+ shl.b32 %r56, %r41, 1;
91
+ shl.b32 %r57, %r41, 3;
92
+ add.s32 %r11, %r44, %r57;
93
+ add.s32 %r13, %r44, %r45;
94
+ add.s32 %r14, %r44, %r47;
95
+ add.s32 %r15, %r44, %r49;
96
+ add.s32 %r16, %r44, %r51;
97
+ add.s32 %r17, %r44, %r52;
98
+ add.s32 %r18, %r44, %r54;
99
+ add.s32 %r19, %r44, %r56;
100
+ add.s32 %r20, %r44, %r3;
101
+ add.s32 %r21, %r44, %r2;
102
+ shl.b32 %r58, %r2, 6;
103
+ add.s32 %r22, %r44, %r58;
104
+ shl.b32 %r59, %r36, 4;
105
+ add.s32 %r23, %r44, %r59;
106
+ shl.b32 %r60, %r37, 4;
107
+ add.s32 %r24, %r44, %r60;
108
+ shl.b32 %r61, %r38, 4;
109
+ add.s32 %r25, %r44, %r61;
110
+ shl.b32 %r62, %r2, 4;
111
+ add.s32 %r26, %r44, %r62;
112
+ shl.b32 %r63, %r39, 4;
113
+ add.s32 %r27, %r44, %r63;
114
+ shl.b32 %r64, %r40, 4;
115
+ add.s32 %r28, %r44, %r64;
116
+ shl.b32 %r65, %r41, 4;
117
+ add.s32 %r29, %r44, %r65;
118
+ .loc 1 28 36
119
+ mul.wide.s32 %rd61, %r34, 61440;
120
+ mul.wide.u32 %rd62, %r2, 32;
121
+ add.s64 %rd63, %rd61, %rd62;
122
+ add.s64 %rd64, %rd63, %rd57;
123
+ add.s64 %rd208, %rd64, 8208;
124
+ mul.wide.s32 %rd65, %r34, 771947520;
125
+ add.s64 %rd66, %rd58, %rd65;
126
+ mul.wide.u32 %rd67, %r2, 402056;
127
+ add.s64 %rd68, %rd66, %rd67;
128
+ add.s64 %rd207, %rd68, 103227878;
129
+ mul.wide.u32 %rd4, %r2, 16;
130
+ mul.wide.s32 %rd69, %r34, 30720;
131
+ add.s64 %rd206, %rd60, %rd69;
132
+ add.s64 %rd205, %rd59, %rd69;
133
+ mov.u64 %rd209, 0;
134
+ mov.f32 %f385, 0f00000000;
135
+ mov.b32 %r246, -2048;
136
+ mov.u16 %rs44, 0;
137
+ mov.f32 %f386, %f385;
138
+ mov.f32 %f387, %f385;
139
+ mov.f32 %f388, %f385;
140
+ mov.f32 %f389, %f385;
141
+ mov.f32 %f390, %f385;
142
+ mov.f32 %f391, %f385;
143
+ mov.f32 %f392, %f385;
144
+ mov.u64 %rd210, %rd209;
145
+ mov.u64 %rd211, %rd209;
146
+ mov.u64 %rd212, %rd209;
147
+ mov.u64 %rd213, %rd209;
148
+ mov.u64 %rd214, %rd209;
149
+ mov.u64 %rd215, %rd209;
150
+ mov.u64 %rd216, %rd209;
151
+ bra.uni $L__BB0_1;
152
+ $L__BB0_19:
153
+ .loc 1 36 23
154
+ bfe.s32 %r172, %r115, 0, 8;
155
+ cvt.u16.u32 %rs67, %r172;
156
+ and.b16 %rs68, %rs67, 255;
157
+ setp.eq.s16 %p117, %rs68, 0;
158
+ bfe.s32 %r173, %r115, 8, 8;
159
+ cvt.u16.u32 %rs69, %r173;
160
+ and.b16 %rs70, %rs69, 255;
161
+ setp.eq.s16 %p118, %rs70, 0;
162
+ bfe.s32 %r174, %r115, 16, 8;
163
+ cvt.u16.u32 %rs71, %r174;
164
+ and.b16 %rs72, %rs71, 255;
165
+ setp.eq.s16 %p119, %rs72, 0;
166
+ bfe.s32 %r175, %r115, 24, 8;
167
+ cvt.u16.u32 %rs73, %r175;
168
+ and.b16 %rs74, %rs73, 255;
169
+ setp.eq.s16 %p120, %rs74, 0;
170
+ bfe.s32 %r176, %r108, 0, 8;
171
+ cvt.u16.u32 %rs75, %r176;
172
+ and.b16 %rs76, %rs75, 255;
173
+ setp.eq.s16 %p121, %rs76, 0;
174
+ bfe.s32 %r177, %r108, 8, 8;
175
+ cvt.u16.u32 %rs77, %r177;
176
+ and.b16 %rs78, %rs77, 255;
177
+ setp.eq.s16 %p122, %rs78, 0;
178
+ bfe.s32 %r178, %r108, 16, 8;
179
+ cvt.u16.u32 %rs79, %r178;
180
+ and.b16 %rs80, %rs79, 255;
181
+ setp.eq.s16 %p123, %rs80, 0;
182
+ bfe.s32 %r179, %r108, 24, 8;
183
+ cvt.u16.u32 %rs81, %r179;
184
+ and.b16 %rs82, %rs81, 255;
185
+ setp.eq.s16 %p124, %rs82, 0;
186
+ .loc 1 46 23
187
+ setp.eq.f32 %p133, %f68, 0f00000000;
188
+ selp.f32 %f320, 0fFF800000, %f400, %p133;
189
+ bar.sync 0;
190
+ st.shared.f32 [%r4], %f37;
191
+ st.shared.f32 [%r5], %f42;
192
+ st.shared.f32 [%r6], %f47;
193
+ st.shared.f32 [%r7], %f52;
194
+ bar.sync 0;
195
+ ld.shared.f32 %f321, [%r12];
196
+ ld.shared.f32 %f322, [%r9];
197
+ ld.shared.f32 %f323, [%r10];
198
+ ld.shared.f32 %f324, [%r11];
199
+ bar.sync 0;
200
+ st.shared.f32 [%r4], %f57;
201
+ st.shared.f32 [%r5], %f62;
202
+ st.shared.f32 [%r6], %f67;
203
+ st.shared.f32 [%r7], %f320;
204
+ bar.sync 0;
205
+ ld.shared.f32 %f325, [%r12];
206
+ ld.shared.f32 %f326, [%r9];
207
+ ld.shared.f32 %f327, [%r10];
208
+ ld.shared.f32 %f328, [%r11];
209
+ .loc 1 48 17
210
+ sub.f32 %f329, %f324, %f28;
211
+ sub.f32 %f330, %f323, %f27;
212
+ sub.f32 %f331, %f322, %f26;
213
+ sub.f32 %f332, %f321, %f25;
214
+ sub.f32 %f333, %f328, %f32;
215
+ sub.f32 %f334, %f327, %f31;
216
+ sub.f32 %f335, %f326, %f30;
217
+ sub.f32 %f336, %f325, %f29;
218
+ add.f32 %f337, %f336, 0f00000000;
219
+ add.f32 %f338, %f335, 0f00000000;
220
+ add.f32 %f339, %f334, 0f00000000;
221
+ add.f32 %f340, %f333, 0f00000000;
222
+ add.f32 %f341, %f332, 0f00000000;
223
+ add.f32 %f342, %f331, 0f00000000;
224
+ add.f32 %f343, %f330, 0f00000000;
225
+ add.f32 %f344, %f329, 0f00000000;
226
+ .loc 1 50 38
227
+ selp.f32 %f345, 0f00000000, %f344, %p124;
228
+ selp.f32 %f346, 0f00000000, %f343, %p123;
229
+ selp.f32 %f347, 0f00000000, %f342, %p122;
230
+ selp.f32 %f348, 0f00000000, %f341, %p121;
231
+ selp.f32 %f349, 0f00000000, %f340, %p120;
232
+ selp.f32 %f350, 0f00000000, %f339, %p119;
233
+ selp.f32 %f351, 0f00000000, %f338, %p118;
234
+ selp.f32 %f352, 0f00000000, %f337, %p117;
235
+ .loc 1 53 48
236
+ selp.f32 %f353, %f352, 0f80000000, %p1;
237
+ selp.f32 %f354, %f351, 0f80000000, %p1;
238
+ selp.f32 %f355, %f350, 0f80000000, %p90;
239
+ selp.f32 %f356, %f349, 0f80000000, %p90;
240
+ selp.f32 %f357, %f348, 0f80000000, %p1;
241
+ selp.f32 %f358, %f347, 0f80000000, %p1;
242
+ selp.f32 %f359, %f346, 0f80000000, %p1;
243
+ selp.f32 %f360, %f345, 0f80000000, %p1;
244
+ add.f32 %f388, %f388, %f360;
245
+ add.f32 %f387, %f387, %f359;
246
+ add.f32 %f386, %f386, %f358;
247
+ add.f32 %f385, %f385, %f357;
248
+ add.f32 %f392, %f392, %f356;
249
+ add.f32 %f391, %f391, %f355;
250
+ add.f32 %f390, %f390, %f354;
251
+ add.f32 %f389, %f389, %f353;
252
+ .loc 1 57 48
253
+ and.pred %p134, %p1, %p52;
254
+ and.pred %p135, %p1, %p51;
255
+ and.pred %p136, %p1, %p50;
256
+ and.pred %p137, %p1, %p49;
257
+ and.pred %p138, %p17, %p48;
258
+ and.pred %p139, %p17, %p47;
259
+ and.pred %p140, %p17, %p46;
260
+ and.pred %p141, %p17, %p45;
261
+ selp.u64 %rd140, 1, 0, %p141;
262
+ selp.u64 %rd141, 1, 0, %p140;
263
+ selp.u64 %rd142, 1, 0, %p139;
264
+ selp.u64 %rd143, 1, 0, %p138;
265
+ selp.u64 %rd144, 1, 0, %p137;
266
+ selp.u64 %rd145, 1, 0, %p136;
267
+ selp.u64 %rd146, 1, 0, %p135;
268
+ selp.u64 %rd147, 1, 0, %p134;
269
+ add.s64 %rd209, %rd209, %rd147;
270
+ add.s64 %rd210, %rd210, %rd146;
271
+ add.s64 %rd211, %rd211, %rd145;
272
+ add.s64 %rd212, %rd212, %rd144;
273
+ add.s64 %rd213, %rd213, %rd143;
274
+ add.s64 %rd214, %rd214, %rd142;
275
+ add.s64 %rd215, %rd215, %rd141;
276
+ add.s64 %rd216, %rd216, %rd140;
277
+ .loc 1 28 36
278
+ add.s64 %rd208, %rd208, 16384;
279
+ add.s32 %r246, %r246, 2048;
280
+ add.s64 %rd207, %rd207, 205852672;
281
+ add.s64 %rd206, %rd206, 8192;
282
+ add.s64 %rd205, %rd205, 8192;
283
+ setp.lt.u32 %p142, %r246, 5632;
284
+ @%p142 bra $L__BB0_1;
285
+ bra.uni $L__BB0_20;
286
+ $L__BB0_1:
287
+ .loc 1 0 36
288
+ cvt.u32.u64 %r98, %rd1;
289
+ .loc 1 23 21
290
+ setp.lt.s32 %p78, %r98, 8;
291
+ .loc 1 29 27
292
+ add.s32 %r99, %r3, %r246;
293
+ add.s32 %r100, %r99, 3072;
294
+ .loc 1 30 25
295
+ add.s32 %r101, %r246, 3584;
296
+ setp.lt.u32 %p43, %r100, 7680;
297
+ setp.lt.u32 %p44, %r101, 7680;
298
+ .loc 1 29 27
299
+ add.s64 %rd72, %rd208, -8208;
300
+ .loc 1 32 34
301
+ add.s64 %rd75, %rd208, -8192;
302
+ add.s64 %rd78, %rd208, -16;
303
+ .loc 1 32 59
304
+ and.pred %p17, %p78, %p43;
305
+ and.pred %p90, %p78, %p44;
306
+ .loc 1 32 51
307
+ mov.u64 %rd70, 0x0;
308
+ mov.u64 %rd71, 0x0;
309
+ @%p78 ld.global.L1::evict_first.v2.b64 { %rd70, %rd71 }, [ %rd72 + 0 ];
310
+ @!%p78 mov.u64 %rd70, 0x0;
311
+ @!%p78 mov.u64 %rd71, 0x0;
312
+ mov.u64 %rd73, 0x0;
313
+ mov.u64 %rd74, 0x0;
314
+ @%p78 ld.global.L1::evict_first.v2.b64 { %rd73, %rd74 }, [ %rd75 + 0 ];
315
+ @!%p78 mov.u64 %rd73, 0x0;
316
+ @!%p78 mov.u64 %rd74, 0x0;
317
+ mov.u64 %rd76, 0x0;
318
+ mov.u64 %rd77, 0x0;
319
+ @%p17 ld.global.L1::evict_first.v2.b64 { %rd76, %rd77 }, [ %rd78 + 0 ];
320
+ @!%p17 mov.u64 %rd76, 0x0;
321
+ @!%p17 mov.u64 %rd77, 0x0;
322
+ mov.u64 %rd79, 0x0;
323
+ mov.u64 %rd80, 0x0;
324
+ @%p17 ld.global.L1::evict_first.v2.b64 { %rd79, %rd80 }, [ %rd208 + 0 ];
325
+ @!%p17 mov.u64 %rd79, 0x0;
326
+ @!%p17 mov.u64 %rd80, 0x0;
327
+ .loc 1 33 35
328
+ add.s64 %rd82, %rd205, %rd4;
329
+ .loc 1 33 52
330
+ add.s64 %rd83, %rd82, 4096;
331
+ mov.b32 %r70, 0;
332
+ mov.u32 %r66, 0x0;
333
+ mov.u32 %r67, 0x0;
334
+ mov.u32 %r68, 0x0;
335
+ mov.u32 %r69, 0x0;
336
+ @%p78 ld.global.L1::evict_first.v4.b32 { %r66, %r67, %r68, %r69 }, [ %rd82 + 0 ];
337
+ @!%p78 mov.u32 %r66, %r70;
338
+ @!%p78 mov.u32 %r67, %r70;
339
+ @!%p78 mov.u32 %r68, %r70;
340
+ @!%p78 mov.u32 %r69, %r70;
341
+ mov.u32 %r74, 0x0;
342
+ mov.u32 %r75, 0x0;
343
+ mov.u32 %r76, 0x0;
344
+ mov.u32 %r77, 0x0;
345
+ @%p17 ld.global.L1::evict_first.v4.b32 { %r74, %r75, %r76, %r77 }, [ %rd83 + 0 ];
346
+ @!%p17 mov.u32 %r74, %r70;
347
+ @!%p17 mov.u32 %r75, %r70;
348
+ @!%p17 mov.u32 %r76, %r70;
349
+ @!%p17 mov.u32 %r77, %r70;
350
+ bar.sync 0;
351
+ st.shared.u32 [%r4], %r66;
352
+ st.shared.u32 [%r5], %r67;
353
+ st.shared.u32 [%r6], %r68;
354
+ st.shared.u32 [%r7], %r69;
355
+ bar.sync 0;
356
+ ld.shared.f32 %f9, [%r12];
357
+ ld.shared.f32 %f10, [%r9];
358
+ ld.shared.f32 %f11, [%r10];
359
+ ld.shared.f32 %f12, [%r11];
360
+ bar.sync 0;
361
+ st.shared.u32 [%r4], %r74;
362
+ st.shared.u32 [%r5], %r75;
363
+ st.shared.u32 [%r6], %r76;
364
+ st.shared.u32 [%r7], %r77;
365
+ bar.sync 0;
366
+ ld.shared.f32 %f13, [%r12];
367
+ ld.shared.f32 %f14, [%r9];
368
+ ld.shared.f32 %f15, [%r10];
369
+ ld.shared.f32 %f16, [%r11];
370
+ .loc 1 34 35
371
+ add.s64 %rd84, %rd206, %rd4;
372
+ .loc 1 34 52
373
+ add.s64 %rd85, %rd84, 4096;
374
+ mov.u32 %r82, 0x0;
375
+ mov.u32 %r83, 0x0;
376
+ mov.u32 %r84, 0x0;
377
+ mov.u32 %r85, 0x0;
378
+ @%p78 ld.global.L1::evict_first.v4.b32 { %r82, %r83, %r84, %r85 }, [ %rd84 + 0 ];
379
+ @!%p78 mov.u32 %r82, %r70;
380
+ @!%p78 mov.u32 %r83, %r70;
381
+ @!%p78 mov.u32 %r84, %r70;
382
+ @!%p78 mov.u32 %r85, %r70;
383
+ mov.b32 %f17, %r82;
384
+ mov.u32 %r90, 0x0;
385
+ mov.u32 %r91, 0x0;
386
+ mov.u32 %r92, 0x0;
387
+ mov.u32 %r93, 0x0;
388
+ @%p17 ld.global.L1::evict_first.v4.b32 { %r90, %r91, %r92, %r93 }, [ %rd85 + 0 ];
389
+ @!%p17 mov.u32 %r90, %r70;
390
+ @!%p17 mov.u32 %r91, %r70;
391
+ @!%p17 mov.u32 %r92, %r70;
392
+ @!%p17 mov.u32 %r93, %r70;
393
+ .loc 1 36 23
394
+ setp.ne.s64 %p45, %rd80, -1;
395
+ setp.ne.s64 %p46, %rd79, -1;
396
+ setp.ne.s64 %p47, %rd77, -1;
397
+ setp.ne.s64 %p48, %rd76, -1;
398
+ setp.ne.s64 %p49, %rd74, -1;
399
+ setp.ne.s64 %p50, %rd73, -1;
400
+ setp.ne.s64 %p51, %rd71, -1;
401
+ setp.ne.s64 %p52, %rd70, -1;
402
+ bar.sync 0;
403
+ selp.u16 %rs1, 1, 0, %p52;
404
+ st.shared.u8 [%r12], %rs1;
405
+ selp.u16 %rs2, 1, 0, %p51;
406
+ st.shared.u8 [%r13], %rs2;
407
+ selp.u16 %rs3, 1, 0, %p50;
408
+ st.shared.u8 [%r14], %rs3;
409
+ selp.u16 %rs4, 1, 0, %p49;
410
+ st.shared.u8 [%r15], %rs4;
411
+ bar.sync 0;
412
+ ld.shared.u8 %r102, [%r19];
413
+ ld.shared.u8 %r103, [%r18];
414
+ ld.shared.u8 %r104, [%r17];
415
+ ld.shared.u8 %r105, [%r16];
416
+ bar.sync 0;
417
+ selp.u16 %rs5, 1, 0, %p48;
418
+ st.shared.u8 [%r12], %rs5;
419
+ selp.u16 %rs6, 1, 0, %p47;
420
+ st.shared.u8 [%r13], %rs6;
421
+ selp.u16 %rs7, 1, 0, %p46;
422
+ st.shared.u8 [%r14], %rs7;
423
+ selp.u16 %rs8, 1, 0, %p45;
424
+ st.shared.u8 [%r15], %rs8;
425
+ bar.sync 0;
426
+ bfi.b32 %r106, %r104, %r105, 8, 8;
427
+ bfi.b32 %r107, %r103, %r106, 16, 8;
428
+ bfi.b32 %r108, %r102, %r107, 24, 8;
429
+ ld.shared.u8 %r109, [%r16];
430
+ ld.shared.u8 %r110, [%r17];
431
+ bfi.b32 %r111, %r110, %r109, 8, 8;
432
+ ld.shared.u8 %r112, [%r18];
433
+ bfi.b32 %r113, %r112, %r111, 16, 8;
434
+ ld.shared.u8 %r114, [%r19];
435
+ bfi.b32 %r115, %r114, %r113, 24, 8;
436
+ .loc 1 42 40
437
+ bar.sync 0;
438
+ .loc 1 38 36
439
+ selp.b64 %rd86, %rd70, 0, %p52;
440
+ selp.b64 %rd87, %rd71, 0, %p51;
441
+ selp.b64 %rd88, %rd73, 0, %p50;
442
+ selp.b64 %rd89, %rd74, 0, %p49;
443
+ .loc 1 39 22
444
+ add.s64 %rd90, %rd89, 50257;
445
+ add.s64 %rd91, %rd88, 50257;
446
+ add.s64 %rd92, %rd87, 50257;
447
+ add.s64 %rd93, %rd86, 50257;
448
+ .loc 1 40 22
449
+ setp.lt.s64 %p53, %rd89, 0;
450
+ setp.lt.s64 %p54, %rd88, 0;
451
+ setp.lt.s64 %p55, %rd87, 0;
452
+ setp.lt.s64 %p56, %rd86, 0;
453
+ .loc 1 41 36
454
+ selp.b64 %rd27, %rd93, %rd86, %p56;
455
+ selp.b64 %rd28, %rd92, %rd87, %p55;
456
+ selp.b64 %rd29, %rd91, %rd88, %p54;
457
+ selp.b64 %rd30, %rd90, %rd89, %p53;
458
+ .loc 1 42 40
459
+ setp.lt.u64 %p57, %rd30, 50257;
460
+ setp.lt.u64 %p58, %rd29, 50257;
461
+ setp.lt.u64 %p59, %rd28, 50257;
462
+ setp.lt.u64 %p60, %rd27, 50257;
463
+ selp.u32 %r116, 1, 0, %p60;
464
+ selp.u32 %r117, 1, 0, %p59;
465
+ bfi.b32 %r118, %r117, %r116, 8, 8;
466
+ selp.u32 %r119, 1, 0, %p58;
467
+ bfi.b32 %r120, %r119, %r118, 16, 8;
468
+ selp.u32 %r121, 1, 0, %p57;
469
+ bfi.b32 %r122, %r121, %r120, 24, 8;
470
+ st.shared.u32 [%r20], %r122;
471
+ bar.sync 0;
472
+ ld.shared.u8 %rs9, [%r21];
473
+ ld.shared.u8 %rs10, [%r21+256];
474
+ ld.shared.u8 %rs11, [%r21+512];
475
+ ld.shared.u8 %rs12, [%r21+768];
476
+ bar.sync 0;
477
+ .loc 1 38 36
478
+ selp.b64 %rd94, %rd76, 0, %p48;
479
+ selp.b64 %rd95, %rd77, 0, %p47;
480
+ selp.b64 %rd96, %rd79, 0, %p46;
481
+ selp.b64 %rd97, %rd80, 0, %p45;
482
+ .loc 1 39 22
483
+ add.s64 %rd98, %rd97, 50257;
484
+ add.s64 %rd99, %rd96, 50257;
485
+ add.s64 %rd100, %rd95, 50257;
486
+ add.s64 %rd101, %rd94, 50257;
487
+ .loc 1 40 22
488
+ setp.lt.s64 %p61, %rd97, 0;
489
+ setp.lt.s64 %p62, %rd96, 0;
490
+ setp.lt.s64 %p63, %rd95, 0;
491
+ setp.lt.s64 %p64, %rd94, 0;
492
+ .loc 1 41 36
493
+ selp.b64 %rd31, %rd101, %rd94, %p64;
494
+ selp.b64 %rd32, %rd100, %rd95, %p63;
495
+ selp.b64 %rd33, %rd99, %rd96, %p62;
496
+ selp.b64 %rd34, %rd98, %rd97, %p61;
497
+ .loc 1 42 40
498
+ setp.lt.u64 %p65, %rd34, 50257;
499
+ setp.lt.u64 %p66, %rd33, 50257;
500
+ setp.lt.u64 %p67, %rd32, 50257;
501
+ setp.lt.u64 %p68, %rd31, 50257;
502
+ selp.u32 %r123, 1, 0, %p68;
503
+ selp.u32 %r124, 1, 0, %p67;
504
+ bfi.b32 %r125, %r124, %r123, 8, 8;
505
+ selp.u32 %r126, 1, 0, %p66;
506
+ bfi.b32 %r127, %r126, %r125, 16, 8;
507
+ selp.u32 %r128, 1, 0, %p65;
508
+ bfi.b32 %r129, %r128, %r127, 24, 8;
509
+ st.shared.u32 [%r20], %r129;
510
+ bar.sync 0;
511
+ ld.shared.u8 %rs13, [%r21];
512
+ ld.shared.u8 %rs14, [%r21+256];
513
+ ld.shared.u8 %rs15, [%r21+512];
514
+ ld.shared.u8 %rs16, [%r21+768];
515
+ setp.eq.s16 %p69, %rs11, 0;
516
+ selp.u16 %rs17, 1, 0, %p69;
517
+ shl.b16 %rs18, %rs17, 2;
518
+ setp.eq.s16 %p70, %rs12, 0;
519
+ selp.u16 %rs19, -1, 0, %p70;
520
+ shl.b16 %rs20, %rs19, 3;
521
+ or.b16 %rs21, %rs20, %rs18;
522
+ setp.eq.s16 %p71, %rs10, 0;
523
+ selp.u16 %rs22, 1, 0, %p71;
524
+ setp.eq.s16 %p72, %rs9, 0;
525
+ selp.u16 %rs23, -1, 0, %p72;
526
+ shl.b16 %rs24, %rs23, 1;
527
+ or.b16 %rs25, %rs22, %rs24;
528
+ and.b16 %rs26, %rs25, 3;
529
+ or.b16 %rs27, %rs26, %rs21;
530
+ and.b16 %rs28, %rs27, 15;
531
+ setp.eq.s16 %p73, %rs15, 0;
532
+ selp.u16 %rs29, 1, 0, %p73;
533
+ shl.b16 %rs30, %rs29, 2;
534
+ setp.eq.s16 %p74, %rs16, 0;
535
+ selp.u16 %rs31, -1, 0, %p74;
536
+ shl.b16 %rs32, %rs31, 3;
537
+ or.b16 %rs33, %rs32, %rs30;
538
+ setp.eq.s16 %p75, %rs13, 0;
539
+ selp.u16 %rs34, 1, 0, %p75;
540
+ setp.eq.s16 %p76, %rs14, 0;
541
+ selp.u16 %rs35, -1, 0, %p76;
542
+ shl.b16 %rs36, %rs35, 1;
543
+ or.b16 %rs37, %rs34, %rs36;
544
+ and.b16 %rs38, %rs37, 3;
545
+ or.b16 %rs39, %rs38, %rs33;
546
+ shl.b16 %rs40, %rs39, 4;
547
+ or.b16 %rs41, %rs28, %rs40;
548
+ .loc 1 42 55
549
+ and.b16 %rs42, %rs41, 255;
550
+ setp.eq.s16 %p77, %rs42, 0;
551
+ @%p77 bra $L__BB0_3;
552
+ mov.u64 %rd102, assertMessage_0;
553
+ cvta.global.u64 %rd103, %rd102;
554
+ mov.u64 %rd104, assertFile_0;
555
+ cvta.global.u64 %rd105, %rd104;
556
+ mov.u64 %rd106, assertFunc_0;
557
+ cvta.global.u64 %rd107, %rd106;
558
+ mov.b32 %r130, 883;
559
+ mov.u64 %rd108, 1;
560
+ { // callseq 0, 0
561
+ .reg .b32 temp_param_reg;
562
+ .param .b64 param0;
563
+ st.param.b64 [param0+0], %rd103;
564
+ .param .b64 param1;
565
+ st.param.b64 [param1+0], %rd105;
566
+ .param .b32 param2;
567
+ st.param.b32 [param2+0], %r130;
568
+ .param .b64 param3;
569
+ st.param.b64 [param3+0], %rd107;
570
+ .param .b64 param4;
571
+ st.param.b64 [param4+0], %rd108;
572
+ call.uni
573
+ __assertfail,
574
+ (
575
+ param0,
576
+ param1,
577
+ param2,
578
+ param3,
579
+ param4
580
+ );
581
+ } // callseq 0
582
+ $L__BB0_3:
583
+ .loc 1 43 71
584
+ bar.sync 0;
585
+ shl.b64 %rd117, %rd27, 1;
586
+ add.s64 %rd118, %rd207, %rd117;
587
+ add.s64 %rd119, %rd118, -103227878;
588
+ st.shared.u64 [%r22], %rd119;
589
+ shl.b64 %rd120, %rd28, 1;
590
+ add.s64 %rd121, %rd207, %rd120;
591
+ add.s64 %rd122, %rd121, -103127364;
592
+ st.shared.u64 [%r23], %rd122;
593
+ shl.b64 %rd123, %rd29, 1;
594
+ add.s64 %rd124, %rd207, %rd123;
595
+ add.s64 %rd125, %rd124, -103026850;
596
+ st.shared.u64 [%r24], %rd125;
597
+ shl.b64 %rd126, %rd30, 1;
598
+ add.s64 %rd127, %rd207, %rd126;
599
+ add.s64 %rd128, %rd127, -102926336;
600
+ st.shared.u64 [%r25], %rd128;
601
+ bar.sync 0;
602
+ ld.shared.u64 %rd109, [%r26];
603
+ ld.shared.u64 %rd110, [%r27];
604
+ ld.shared.u64 %rd111, [%r28];
605
+ ld.shared.u64 %rd112, [%r29];
606
+ bar.sync 0;
607
+ shl.b64 %rd129, %rd31, 1;
608
+ add.s64 %rd130, %rd207, %rd129;
609
+ add.s64 %rd131, %rd130, -301542;
610
+ st.shared.u64 [%r22], %rd131;
611
+ shl.b64 %rd132, %rd32, 1;
612
+ add.s64 %rd133, %rd207, %rd132;
613
+ add.s64 %rd134, %rd133, -201028;
614
+ st.shared.u64 [%r23], %rd134;
615
+ shl.b64 %rd135, %rd33, 1;
616
+ add.s64 %rd136, %rd207, %rd135;
617
+ add.s64 %rd137, %rd136, -100514;
618
+ st.shared.u64 [%r24], %rd137;
619
+ shl.b64 %rd138, %rd34, 1;
620
+ add.s64 %rd139, %rd207, %rd138;
621
+ st.shared.u64 [%r25], %rd139;
622
+ bar.sync 0;
623
+ ld.shared.u64 %rd113, [%r26];
624
+ ld.shared.u64 %rd114, [%r27];
625
+ ld.shared.u64 %rd115, [%r28];
626
+ ld.shared.u64 %rd116, [%r29];
627
+ mov.u16 %rs43, 0x0;
628
+ @%p78 ld.global.L1::evict_last.b16 { %rs43 }, [ %rd109 + 0 ];
629
+ @!%p78 mov.u16 %rs43, %rs44;
630
+ mov.u16 %rs45, 0x0;
631
+ @%p78 ld.global.L1::evict_last.b16 { %rs45 }, [ %rd110 + 0 ];
632
+ @!%p78 mov.u16 %rs45, %rs44;
633
+ mov.u16 %rs47, 0x0;
634
+ @%p78 ld.global.L1::evict_last.b16 { %rs47 }, [ %rd111 + 0 ];
635
+ @!%p78 mov.u16 %rs47, %rs44;
636
+ mov.u16 %rs49, 0x0;
637
+ @%p78 ld.global.L1::evict_last.b16 { %rs49 }, [ %rd112 + 0 ];
638
+ @!%p78 mov.u16 %rs49, %rs44;
639
+ mov.u16 %rs51, 0x0;
640
+ @%p78 ld.global.L1::evict_last.b16 { %rs51 }, [ %rd113 + 0 ];
641
+ @!%p78 mov.u16 %rs51, %rs44;
642
+ mov.u16 %rs53, 0x0;
643
+ @%p78 ld.global.L1::evict_last.b16 { %rs53 }, [ %rd114 + 0 ];
644
+ @!%p78 mov.u16 %rs53, %rs44;
645
+ mov.u16 %rs55, 0x0;
646
+ @%p90 ld.global.L1::evict_last.b16 { %rs55 }, [ %rd115 + 0 ];
647
+ @!%p90 mov.u16 %rs55, %rs44;
648
+ mov.u16 %rs57, 0x0;
649
+ @%p90 ld.global.L1::evict_last.b16 { %rs57 }, [ %rd116 + 0 ];
650
+ @!%p90 mov.u16 %rs57, %rs44;
651
+ .loc 1 46 23
652
+ setp.lt.f32 %p94, %f17, 0f00800000;
653
+ mul.f32 %f96, %f17, 0f4B000000;
654
+ selp.f32 %f33, %f96, %f17, %p94;
655
+ selp.f32 %f97, 0fC1B80000, 0f00000000, %p94;
656
+ mov.b32 %r140, %f33;
657
+ add.s32 %r141, %r140, -1059760811;
658
+ and.b32 %r142, %r141, -8388608;
659
+ sub.s32 %r143, %r140, %r142;
660
+ mov.b32 %f98, %r143;
661
+ cvt.rn.f32.s32 %f99, %r142;
662
+ mov.f32 %f100, 0f34000000;
663
+ fma.rn.ftz.f32 %f101, %f99, %f100, %f97;
664
+ add.f32 %f102, %f98, 0fBF800000;
665
+ mov.f32 %f103, 0f3E1039F6;
666
+ mov.f32 %f104, 0fBE055027;
667
+ fma.rn.ftz.f32 %f105, %f104, %f102, %f103;
668
+ mov.f32 %f106, 0fBDF8CDCC;
669
+ fma.rn.ftz.f32 %f107, %f105, %f102, %f106;
670
+ mov.f32 %f108, 0f3E0F2955;
671
+ fma.rn.ftz.f32 %f109, %f107, %f102, %f108;
672
+ mov.f32 %f110, 0fBE2AD8B9;
673
+ fma.rn.ftz.f32 %f111, %f109, %f102, %f110;
674
+ mov.f32 %f112, 0f3E4CED0B;
675
+ fma.rn.ftz.f32 %f113, %f111, %f102, %f112;
676
+ mov.f32 %f114, 0fBE7FFF22;
677
+ fma.rn.ftz.f32 %f115, %f113, %f102, %f114;
678
+ mov.f32 %f116, 0f3EAAAA78;
679
+ fma.rn.ftz.f32 %f117, %f115, %f102, %f116;
680
+ mov.f32 %f118, 0fBF000000;
681
+ fma.rn.ftz.f32 %f119, %f117, %f102, %f118;
682
+ mul.f32 %f120, %f102, %f119;
683
+ fma.rn.ftz.f32 %f121, %f120, %f102, %f102;
684
+ mov.f32 %f122, 0f3F317218;
685
+ fma.rn.ftz.f32 %f393, %f101, %f122, %f121;
686
+ setp.lt.u32 %p95, %r140, 2139095040;
687
+ mov.f32 %f123, 0f7F800000;
688
+ @%p95 bra $L__BB0_5;
689
+ .loc 1 0 23
690
+ fma.rn.ftz.f32 %f393, %f33, %f123, %f123;
691
+ $L__BB0_5:
692
+ mov.b32 %f18, %r83;
693
+ .loc 1 46 23
694
+ setp.lt.f32 %p97, %f18, 0f00800000;
695
+ mul.f32 %f124, %f18, 0f4B000000;
696
+ selp.f32 %f38, %f124, %f18, %p97;
697
+ selp.f32 %f125, 0fC1B80000, 0f00000000, %p97;
698
+ mov.b32 %r144, %f38;
699
+ add.s32 %r145, %r144, -1059760811;
700
+ and.b32 %r146, %r145, -8388608;
701
+ sub.s32 %r147, %r144, %r146;
702
+ mov.b32 %f126, %r147;
703
+ cvt.rn.f32.s32 %f127, %r146;
704
+ fma.rn.ftz.f32 %f129, %f127, %f100, %f125;
705
+ add.f32 %f130, %f126, 0fBF800000;
706
+ fma.rn.ftz.f32 %f133, %f104, %f130, %f103;
707
+ fma.rn.ftz.f32 %f135, %f133, %f130, %f106;
708
+ fma.rn.ftz.f32 %f137, %f135, %f130, %f108;
709
+ fma.rn.ftz.f32 %f139, %f137, %f130, %f110;
710
+ fma.rn.ftz.f32 %f141, %f139, %f130, %f112;
711
+ fma.rn.ftz.f32 %f143, %f141, %f130, %f114;
712
+ fma.rn.ftz.f32 %f145, %f143, %f130, %f116;
713
+ fma.rn.ftz.f32 %f147, %f145, %f130, %f118;
714
+ mul.f32 %f148, %f130, %f147;
715
+ fma.rn.ftz.f32 %f149, %f148, %f130, %f130;
716
+ fma.rn.ftz.f32 %f394, %f129, %f122, %f149;
717
+ setp.lt.u32 %p98, %r144, 2139095040;
718
+ @%p98 bra $L__BB0_7;
719
+ .loc 1 0 23
720
+ fma.rn.ftz.f32 %f394, %f38, %f123, %f123;
721
+ $L__BB0_7:
722
+ mov.b32 %f19, %r84;
723
+ .loc 1 46 23
724
+ setp.lt.f32 %p100, %f19, 0f00800000;
725
+ mul.f32 %f152, %f19, 0f4B000000;
726
+ selp.f32 %f43, %f152, %f19, %p100;
727
+ selp.f32 %f153, 0fC1B80000, 0f00000000, %p100;
728
+ mov.b32 %r148, %f43;
729
+ add.s32 %r149, %r148, -1059760811;
730
+ and.b32 %r150, %r149, -8388608;
731
+ sub.s32 %r151, %r148, %r150;
732
+ mov.b32 %f154, %r151;
733
+ cvt.rn.f32.s32 %f155, %r150;
734
+ fma.rn.ftz.f32 %f157, %f155, %f100, %f153;
735
+ add.f32 %f158, %f154, 0fBF800000;
736
+ fma.rn.ftz.f32 %f161, %f104, %f158, %f103;
737
+ fma.rn.ftz.f32 %f163, %f161, %f158, %f106;
738
+ fma.rn.ftz.f32 %f165, %f163, %f158, %f108;
739
+ fma.rn.ftz.f32 %f167, %f165, %f158, %f110;
740
+ fma.rn.ftz.f32 %f169, %f167, %f158, %f112;
741
+ fma.rn.ftz.f32 %f171, %f169, %f158, %f114;
742
+ fma.rn.ftz.f32 %f173, %f171, %f158, %f116;
743
+ fma.rn.ftz.f32 %f175, %f173, %f158, %f118;
744
+ mul.f32 %f176, %f158, %f175;
745
+ fma.rn.ftz.f32 %f177, %f176, %f158, %f158;
746
+ fma.rn.ftz.f32 %f395, %f157, %f122, %f177;
747
+ setp.lt.u32 %p101, %r148, 2139095040;
748
+ @%p101 bra $L__BB0_9;
749
+ .loc 1 0 23
750
+ fma.rn.ftz.f32 %f395, %f43, %f123, %f123;
751
+ $L__BB0_9:
752
+ mov.b32 %f20, %r85;
753
+ .loc 1 46 23
754
+ setp.lt.f32 %p103, %f20, 0f00800000;
755
+ mul.f32 %f180, %f20, 0f4B000000;
756
+ selp.f32 %f48, %f180, %f20, %p103;
757
+ selp.f32 %f181, 0fC1B80000, 0f00000000, %p103;
758
+ mov.b32 %r152, %f48;
759
+ add.s32 %r153, %r152, -1059760811;
760
+ and.b32 %r154, %r153, -8388608;
761
+ sub.s32 %r155, %r152, %r154;
762
+ mov.b32 %f182, %r155;
763
+ cvt.rn.f32.s32 %f183, %r154;
764
+ fma.rn.ftz.f32 %f185, %f183, %f100, %f181;
765
+ add.f32 %f186, %f182, 0fBF800000;
766
+ fma.rn.ftz.f32 %f189, %f104, %f186, %f103;
767
+ fma.rn.ftz.f32 %f191, %f189, %f186, %f106;
768
+ fma.rn.ftz.f32 %f193, %f191, %f186, %f108;
769
+ fma.rn.ftz.f32 %f195, %f193, %f186, %f110;
770
+ fma.rn.ftz.f32 %f197, %f195, %f186, %f112;
771
+ fma.rn.ftz.f32 %f199, %f197, %f186, %f114;
772
+ fma.rn.ftz.f32 %f201, %f199, %f186, %f116;
773
+ fma.rn.ftz.f32 %f203, %f201, %f186, %f118;
774
+ mul.f32 %f204, %f186, %f203;
775
+ fma.rn.ftz.f32 %f205, %f204, %f186, %f186;
776
+ fma.rn.ftz.f32 %f396, %f185, %f122, %f205;
777
+ setp.lt.u32 %p104, %r152, 2139095040;
778
+ @%p104 bra $L__BB0_11;
779
+ .loc 1 0 23
780
+ fma.rn.ftz.f32 %f396, %f48, %f123, %f123;
781
+ $L__BB0_11:
782
+ mov.b32 %f21, %r90;
783
+ .loc 1 46 23
784
+ setp.lt.f32 %p106, %f21, 0f00800000;
785
+ mul.f32 %f208, %f21, 0f4B000000;
786
+ selp.f32 %f53, %f208, %f21, %p106;
787
+ selp.f32 %f209, 0fC1B80000, 0f00000000, %p106;
788
+ mov.b32 %r156, %f53;
789
+ add.s32 %r157, %r156, -1059760811;
790
+ and.b32 %r158, %r157, -8388608;
791
+ sub.s32 %r159, %r156, %r158;
792
+ mov.b32 %f210, %r159;
793
+ cvt.rn.f32.s32 %f211, %r158;
794
+ fma.rn.ftz.f32 %f213, %f211, %f100, %f209;
795
+ add.f32 %f214, %f210, 0fBF800000;
796
+ fma.rn.ftz.f32 %f217, %f104, %f214, %f103;
797
+ fma.rn.ftz.f32 %f219, %f217, %f214, %f106;
798
+ fma.rn.ftz.f32 %f221, %f219, %f214, %f108;
799
+ fma.rn.ftz.f32 %f223, %f221, %f214, %f110;
800
+ fma.rn.ftz.f32 %f225, %f223, %f214, %f112;
801
+ fma.rn.ftz.f32 %f227, %f225, %f214, %f114;
802
+ fma.rn.ftz.f32 %f229, %f227, %f214, %f116;
803
+ fma.rn.ftz.f32 %f231, %f229, %f214, %f118;
804
+ mul.f32 %f232, %f214, %f231;
805
+ fma.rn.ftz.f32 %f233, %f232, %f214, %f214;
806
+ fma.rn.ftz.f32 %f397, %f213, %f122, %f233;
807
+ setp.lt.u32 %p107, %r156, 2139095040;
808
+ @%p107 bra $L__BB0_13;
809
+ .loc 1 0 23
810
+ fma.rn.ftz.f32 %f397, %f53, %f123, %f123;
811
+ $L__BB0_13:
812
+ mov.b32 %f22, %r91;
813
+ .loc 1 46 23
814
+ setp.lt.f32 %p109, %f22, 0f00800000;
815
+ mul.f32 %f236, %f22, 0f4B000000;
816
+ selp.f32 %f58, %f236, %f22, %p109;
817
+ selp.f32 %f237, 0fC1B80000, 0f00000000, %p109;
818
+ mov.b32 %r160, %f58;
819
+ add.s32 %r161, %r160, -1059760811;
820
+ and.b32 %r162, %r161, -8388608;
821
+ sub.s32 %r163, %r160, %r162;
822
+ mov.b32 %f238, %r163;
823
+ cvt.rn.f32.s32 %f239, %r162;
824
+ fma.rn.ftz.f32 %f241, %f239, %f100, %f237;
825
+ add.f32 %f242, %f238, 0fBF800000;
826
+ fma.rn.ftz.f32 %f245, %f104, %f242, %f103;
827
+ fma.rn.ftz.f32 %f247, %f245, %f242, %f106;
828
+ fma.rn.ftz.f32 %f249, %f247, %f242, %f108;
829
+ fma.rn.ftz.f32 %f251, %f249, %f242, %f110;
830
+ fma.rn.ftz.f32 %f253, %f251, %f242, %f112;
831
+ fma.rn.ftz.f32 %f255, %f253, %f242, %f114;
832
+ fma.rn.ftz.f32 %f257, %f255, %f242, %f116;
833
+ fma.rn.ftz.f32 %f259, %f257, %f242, %f118;
834
+ mul.f32 %f260, %f242, %f259;
835
+ fma.rn.ftz.f32 %f261, %f260, %f242, %f242;
836
+ fma.rn.ftz.f32 %f398, %f241, %f122, %f261;
837
+ setp.lt.u32 %p110, %r160, 2139095040;
838
+ @%p110 bra $L__BB0_15;
839
+ .loc 1 0 23
840
+ fma.rn.ftz.f32 %f398, %f58, %f123, %f123;
841
+ $L__BB0_15:
842
+ setp.eq.f32 %p96, %f33, 0f00000000;
843
+ setp.eq.f32 %p99, %f38, 0f00000000;
844
+ setp.eq.f32 %p102, %f43, 0f00000000;
845
+ setp.eq.f32 %p105, %f48, 0f00000000;
846
+ setp.eq.f32 %p108, %f53, 0f00000000;
847
+ mov.b32 %f23, %r92;
848
+ .loc 1 46 23
849
+ setp.eq.f32 %p111, %f58, 0f00000000;
850
+ setp.lt.f32 %p112, %f23, 0f00800000;
851
+ mul.f32 %f264, %f23, 0f4B000000;
852
+ selp.f32 %f63, %f264, %f23, %p112;
853
+ selp.f32 %f265, 0fC1B80000, 0f00000000, %p112;
854
+ mov.b32 %r164, %f63;
855
+ add.s32 %r165, %r164, -1059760811;
856
+ and.b32 %r166, %r165, -8388608;
857
+ sub.s32 %r167, %r164, %r166;
858
+ mov.b32 %f266, %r167;
859
+ cvt.rn.f32.s32 %f267, %r166;
860
+ fma.rn.ftz.f32 %f269, %f267, %f100, %f265;
861
+ add.f32 %f270, %f266, 0fBF800000;
862
+ fma.rn.ftz.f32 %f273, %f104, %f270, %f103;
863
+ fma.rn.ftz.f32 %f275, %f273, %f270, %f106;
864
+ fma.rn.ftz.f32 %f277, %f275, %f270, %f108;
865
+ fma.rn.ftz.f32 %f279, %f277, %f270, %f110;
866
+ fma.rn.ftz.f32 %f281, %f279, %f270, %f112;
867
+ fma.rn.ftz.f32 %f283, %f281, %f270, %f114;
868
+ fma.rn.ftz.f32 %f285, %f283, %f270, %f116;
869
+ fma.rn.ftz.f32 %f287, %f285, %f270, %f118;
870
+ mul.f32 %f288, %f270, %f287;
871
+ fma.rn.ftz.f32 %f289, %f288, %f270, %f270;
872
+ fma.rn.ftz.f32 %f399, %f269, %f122, %f289;
873
+ setp.lt.u32 %p113, %r164, 2139095040;
874
+ @%p113 bra $L__BB0_17;
875
+ .loc 1 0 23
876
+ fma.rn.ftz.f32 %f399, %f63, %f123, %f123;
877
+ $L__BB0_17:
878
+ mov.b32 %f24, %r93;
879
+ cvt.f32.bf16 %r131, %rs43;
880
+ mov.b32 %f88, %r131;
881
+ cvt.f32.bf16 %r132, %rs45;
882
+ mov.b32 %f89, %r132;
883
+ cvt.f32.bf16 %r133, %rs47;
884
+ mov.b32 %f90, %r133;
885
+ cvt.f32.bf16 %r134, %rs49;
886
+ mov.b32 %f91, %r134;
887
+ cvt.f32.bf16 %r135, %rs51;
888
+ mov.b32 %f92, %r135;
889
+ cvt.f32.bf16 %r136, %rs53;
890
+ mov.b32 %f93, %r136;
891
+ cvt.f32.bf16 %r137, %rs55;
892
+ mov.b32 %f94, %r137;
893
+ cvt.f32.bf16 %r138, %rs57;
894
+ mov.b32 %f95, %r138;
895
+ sub.f32 %f32, %f95, %f16;
896
+ sub.f32 %f31, %f94, %f15;
897
+ sub.f32 %f30, %f93, %f14;
898
+ sub.f32 %f29, %f92, %f13;
899
+ sub.f32 %f28, %f91, %f12;
900
+ sub.f32 %f27, %f90, %f11;
901
+ sub.f32 %f26, %f89, %f10;
902
+ sub.f32 %f25, %f88, %f9;
903
+ .loc 1 46 23
904
+ selp.f32 %f37, 0fFF800000, %f393, %p96;
905
+ selp.f32 %f42, 0fFF800000, %f394, %p99;
906
+ selp.f32 %f47, 0fFF800000, %f395, %p102;
907
+ selp.f32 %f52, 0fFF800000, %f396, %p105;
908
+ selp.f32 %f57, 0fFF800000, %f397, %p108;
909
+ selp.f32 %f62, 0fFF800000, %f398, %p111;
910
+ setp.eq.f32 %p114, %f63, 0f00000000;
911
+ selp.f32 %f67, 0fFF800000, %f399, %p114;
912
+ setp.lt.f32 %p115, %f24, 0f00800000;
913
+ mul.f32 %f292, %f24, 0f4B000000;
914
+ selp.f32 %f68, %f292, %f24, %p115;
915
+ selp.f32 %f293, 0fC1B80000, 0f00000000, %p115;
916
+ mov.b32 %r168, %f68;
917
+ add.s32 %r169, %r168, -1059760811;
918
+ and.b32 %r170, %r169, -8388608;
919
+ sub.s32 %r171, %r168, %r170;
920
+ mov.b32 %f294, %r171;
921
+ cvt.rn.f32.s32 %f295, %r170;
922
+ fma.rn.ftz.f32 %f297, %f295, %f100, %f293;
923
+ add.f32 %f298, %f294, 0fBF800000;
924
+ fma.rn.ftz.f32 %f301, %f104, %f298, %f103;
925
+ fma.rn.ftz.f32 %f303, %f301, %f298, %f106;
926
+ fma.rn.ftz.f32 %f305, %f303, %f298, %f108;
927
+ fma.rn.ftz.f32 %f307, %f305, %f298, %f110;
928
+ fma.rn.ftz.f32 %f309, %f307, %f298, %f112;
929
+ fma.rn.ftz.f32 %f311, %f309, %f298, %f114;
930
+ fma.rn.ftz.f32 %f313, %f311, %f298, %f116;
931
+ fma.rn.ftz.f32 %f315, %f313, %f298, %f118;
932
+ mul.f32 %f316, %f298, %f315;
933
+ fma.rn.ftz.f32 %f317, %f316, %f298, %f298;
934
+ fma.rn.ftz.f32 %f400, %f297, %f122, %f317;
935
+ setp.lt.u32 %p116, %r168, 2139095040;
936
+ @%p116 bra $L__BB0_19;
937
+ .loc 1 0 23
938
+ fma.rn.ftz.f32 %f400, %f68, %f123, %f123;
939
+ bra.uni $L__BB0_19;
940
+ $L__BB0_20:
941
+ .loc 1 24 33
942
+ bfe.u32 %r191, %r1, 5, 3;
943
+ and.b32 %r192, %r1, 31;
944
+ $L__tmp1:
945
+ .loc 2 243 36
946
+ bar.sync 0;
947
+ $L__tmp2:
948
+ .loc 2 233 15
949
+ add.f32 %f361, %f385, %f386;
950
+ add.f32 %f362, %f387, %f361;
951
+ add.f32 %f363, %f388, %f362;
952
+ add.f32 %f364, %f389, %f363;
953
+ add.f32 %f365, %f390, %f364;
954
+ add.f32 %f366, %f391, %f365;
955
+ add.f32 %f367, %f392, %f366;
956
+ $L__tmp3:
957
+ .loc 2 243 36
958
+ mov.b32 %r193, %f367;
959
+ shfl.sync.bfly.b32 %r194, %r193, 16, 31, -1;
960
+ mov.b32 %f368, %r194;
961
+ $L__tmp4:
962
+ .loc 2 233 15
963
+ add.f32 %f369, %f367, %f368;
964
+ $L__tmp5:
965
+ .loc 2 243 36
966
+ mov.b32 %r195, %f369;
967
+ shfl.sync.bfly.b32 %r196, %r195, 8, 31, -1;
968
+ mov.b32 %f370, %r196;
969
+ $L__tmp6:
970
+ .loc 2 233 15
971
+ add.f32 %f371, %f369, %f370;
972
+ $L__tmp7:
973
+ .loc 2 243 36
974
+ mov.b32 %r197, %f371;
975
+ shfl.sync.bfly.b32 %r198, %r197, 4, 31, -1;
976
+ mov.b32 %f372, %r198;
977
+ $L__tmp8:
978
+ .loc 2 233 15
979
+ add.f32 %f373, %f371, %f372;
980
+ $L__tmp9:
981
+ .loc 2 243 36
982
+ mov.b32 %r199, %f373;
983
+ shfl.sync.bfly.b32 %r200, %r199, 2, 31, -1;
984
+ mov.b32 %f374, %r200;
985
+ $L__tmp10:
986
+ .loc 2 233 15
987
+ add.f32 %f375, %f373, %f374;
988
+ $L__tmp11:
989
+ .loc 2 243 36
990
+ mov.b32 %r201, %f375;
991
+ shfl.sync.bfly.b32 %r202, %r201, 1, 31, -1;
992
+ mov.b32 %f376, %r202;
993
+ $L__tmp12:
994
+ .loc 2 233 15
995
+ add.f32 %f377, %f375, %f376;
996
+ $L__tmp13:
997
+ .loc 2 243 36
998
+ setp.eq.s32 %p143, %r192, 0;
999
+ shl.b32 %r203, %r191, 2;
1000
+ add.s32 %r180, %r44, %r203;
1001
+ mov.b32 %r181, %f377;
1002
+ @%p143 st.shared.b32 [ %r180 + 0 ], %r181;
1003
+ bar.sync 0;
1004
+ setp.lt.s32 %p144, %r1, 8;
1005
+ shl.b32 %r205, %r1, 2;
1006
+ add.s32 %r183, %r44, %r205;
1007
+ @%p144 ld.shared.b32 %r182, [ %r183 + 0 ];
1008
+ mov.b32 %f378, %r182;
1009
+ shfl.sync.bfly.b32 %r206, %r182, 4, 31, -1;
1010
+ mov.b32 %f379, %r206;
1011
+ $L__tmp14:
1012
+ .loc 2 233 15
1013
+ add.f32 %f380, %f378, %f379;
1014
+ $L__tmp15:
1015
+ .loc 2 243 36
1016
+ mov.b32 %r207, %f380;
1017
+ shfl.sync.bfly.b32 %r208, %r207, 2, 31, -1;
1018
+ mov.b32 %f381, %r208;
1019
+ $L__tmp16:
1020
+ .loc 2 233 15
1021
+ add.f32 %f382, %f380, %f381;
1022
+ $L__tmp17:
1023
+ .loc 2 243 36
1024
+ mov.b32 %r209, %f382;
1025
+ shfl.sync.bfly.b32 %r210, %r209, 1, 31, -1;
1026
+ mov.b32 %f383, %r210;
1027
+ $L__tmp18:
1028
+ .loc 2 233 15
1029
+ add.f32 %f384, %f382, %f383;
1030
+ $L__tmp19:
1031
+ .loc 2 243 36
1032
+ and.b32 %r211, %r1, 7;
1033
+ setp.eq.s32 %p152, %r211, 0;
1034
+ and.pred %p145, %p144, %p152;
1035
+ mov.b32 %r185, %f384;
1036
+ @%p145 st.shared.b32 [ %r183 + 0 ], %r185;
1037
+ bar.sync 0;
1038
+ ld.shared.u32 %r186, [global_smem];
1039
+ $L__tmp20:
1040
+ .loc 1 59 25
1041
+ shl.b64 %rd154, %rd1, 2;
1042
+ add.s64 %rd148, %rd47, %rd154;
1043
+ .loc 1 59 37
1044
+ setp.eq.s32 %p153, %r2, 0;
1045
+ and.pred %p146, %p153, %p78;
1046
+ @%p146 st.global.b32 [ %rd148 + 0 ], { %r186 };
1047
+ $L__tmp21:
1048
+ .loc 2 243 36
1049
+ bar.sync 0;
1050
+ $L__tmp22:
1051
+ .loc 2 233 15
1052
+ add.s64 %rd155, %rd209, %rd210;
1053
+ add.s64 %rd156, %rd155, %rd211;
1054
+ add.s64 %rd157, %rd156, %rd212;
1055
+ add.s64 %rd158, %rd157, %rd213;
1056
+ add.s64 %rd159, %rd158, %rd214;
1057
+ add.s64 %rd160, %rd159, %rd215;
1058
+ add.s64 %rd161, %rd160, %rd216;
1059
+ $L__tmp23:
1060
+ .loc 2 243 36
1061
+ cvt.u32.u64 %r212, %rd161;
1062
+ shfl.sync.bfly.b32 %r213, %r212, 16, 31, -1;
1063
+ { .reg .b32 tmp; mov.b64 {tmp, %r214}, %rd161; }
1064
+ shfl.sync.bfly.b32 %r215, %r214, 16, 31, -1;
1065
+ cvt.u64.u32 %rd162, %r213;
1066
+ cvt.u64.u32 %rd163, %r215;
1067
+ shl.b64 %rd164, %rd163, 32;
1068
+ or.b64 %rd165, %rd162, %rd164;
1069
+ $L__tmp24:
1070
+ .loc 2 233 15
1071
+ add.s64 %rd166, %rd161, %rd165;
1072
+ $L__tmp25:
1073
+ .loc 2 243 36
1074
+ cvt.u32.u64 %r216, %rd166;
1075
+ shfl.sync.bfly.b32 %r217, %r216, 8, 31, -1;
1076
+ { .reg .b32 tmp; mov.b64 {tmp, %r218}, %rd166; }
1077
+ shfl.sync.bfly.b32 %r219, %r218, 8, 31, -1;
1078
+ cvt.u64.u32 %rd167, %r217;
1079
+ cvt.u64.u32 %rd168, %r219;
1080
+ shl.b64 %rd169, %rd168, 32;
1081
+ or.b64 %rd170, %rd167, %rd169;
1082
+ $L__tmp26:
1083
+ .loc 2 233 15
1084
+ add.s64 %rd171, %rd166, %rd170;
1085
+ $L__tmp27:
1086
+ .loc 2 243 36
1087
+ cvt.u32.u64 %r220, %rd171;
1088
+ shfl.sync.bfly.b32 %r221, %r220, 4, 31, -1;
1089
+ { .reg .b32 tmp; mov.b64 {tmp, %r222}, %rd171; }
1090
+ shfl.sync.bfly.b32 %r223, %r222, 4, 31, -1;
1091
+ cvt.u64.u32 %rd172, %r221;
1092
+ cvt.u64.u32 %rd173, %r223;
1093
+ shl.b64 %rd174, %rd173, 32;
1094
+ or.b64 %rd175, %rd172, %rd174;
1095
+ $L__tmp28:
1096
+ .loc 2 233 15
1097
+ add.s64 %rd176, %rd171, %rd175;
1098
+ $L__tmp29:
1099
+ .loc 2 243 36
1100
+ cvt.u32.u64 %r224, %rd176;
1101
+ shfl.sync.bfly.b32 %r225, %r224, 2, 31, -1;
1102
+ { .reg .b32 tmp; mov.b64 {tmp, %r226}, %rd176; }
1103
+ shfl.sync.bfly.b32 %r227, %r226, 2, 31, -1;
1104
+ cvt.u64.u32 %rd177, %r225;
1105
+ cvt.u64.u32 %rd178, %r227;
1106
+ shl.b64 %rd179, %rd178, 32;
1107
+ or.b64 %rd180, %rd177, %rd179;
1108
+ $L__tmp30:
1109
+ .loc 2 233 15
1110
+ add.s64 %rd181, %rd176, %rd180;
1111
+ $L__tmp31:
1112
+ .loc 2 243 36
1113
+ cvt.u32.u64 %r228, %rd181;
1114
+ shfl.sync.bfly.b32 %r229, %r228, 1, 31, -1;
1115
+ { .reg .b32 tmp; mov.b64 {tmp, %r230}, %rd181; }
1116
+ shfl.sync.bfly.b32 %r231, %r230, 1, 31, -1;
1117
+ cvt.u64.u32 %rd182, %r229;
1118
+ cvt.u64.u32 %rd183, %r231;
1119
+ shl.b64 %rd184, %rd183, 32;
1120
+ or.b64 %rd185, %rd182, %rd184;
1121
+ $L__tmp32:
1122
+ .loc 2 233 15
1123
+ add.s64 %rd149, %rd181, %rd185;
1124
+ $L__tmp33:
1125
+ .loc 2 243 36
1126
+ shl.b32 %r232, %r191, 3;
1127
+ add.s32 %r187, %r44, %r232;
1128
+ @%p143 st.shared.b64 [ %r187 + 0 ], %rd149;
1129
+ bar.sync 0;
1130
+ shl.b32 %r233, %r1, 3;
1131
+ add.s32 %r188, %r44, %r233;
1132
+ @%p144 ld.shared.b64 %rd150, [ %r188 + 0 ];
1133
+ cvt.u32.u64 %r234, %rd150;
1134
+ shfl.sync.bfly.b32 %r235, %r234, 4, 31, -1;
1135
+ { .reg .b32 tmp; mov.b64 {tmp, %r236}, %rd150; }
1136
+ shfl.sync.bfly.b32 %r237, %r236, 4, 31, -1;
1137
+ cvt.u64.u32 %rd186, %r235;
1138
+ cvt.u64.u32 %rd187, %r237;
1139
+ shl.b64 %rd188, %rd187, 32;
1140
+ or.b64 %rd189, %rd186, %rd188;
1141
+ $L__tmp34:
1142
+ .loc 2 233 15
1143
+ add.s64 %rd190, %rd150, %rd189;
1144
+ $L__tmp35:
1145
+ .loc 2 243 36
1146
+ cvt.u32.u64 %r238, %rd190;
1147
+ shfl.sync.bfly.b32 %r239, %r238, 2, 31, -1;
1148
+ { .reg .b32 tmp; mov.b64 {tmp, %r240}, %rd190; }
1149
+ shfl.sync.bfly.b32 %r241, %r240, 2, 31, -1;
1150
+ cvt.u64.u32 %rd191, %r239;
1151
+ cvt.u64.u32 %rd192, %r241;
1152
+ shl.b64 %rd193, %rd192, 32;
1153
+ or.b64 %rd194, %rd191, %rd193;
1154
+ $L__tmp36:
1155
+ .loc 2 233 15
1156
+ add.s64 %rd195, %rd190, %rd194;
1157
+ $L__tmp37:
1158
+ .loc 2 243 36
1159
+ cvt.u32.u64 %r242, %rd195;
1160
+ shfl.sync.bfly.b32 %r243, %r242, 1, 31, -1;
1161
+ { .reg .b32 tmp; mov.b64 {tmp, %r244}, %rd195; }
1162
+ shfl.sync.bfly.b32 %r245, %r244, 1, 31, -1;
1163
+ cvt.u64.u32 %rd196, %r243;
1164
+ cvt.u64.u32 %rd197, %r245;
1165
+ shl.b64 %rd198, %rd197, 32;
1166
+ or.b64 %rd199, %rd196, %rd198;
1167
+ $L__tmp38:
1168
+ .loc 2 233 15
1169
+ add.s64 %rd151, %rd195, %rd199;
1170
+ $L__tmp39:
1171
+ .loc 2 243 36
1172
+ @%p145 st.shared.b64 [ %r188 + 0 ], %rd151;
1173
+ bar.sync 0;
1174
+ ld.shared.u32 %rd200, [global_smem+4];
1175
+ shl.b64 %rd201, %rd200, 32;
1176
+ ld.shared.u32 %rd202, [global_smem];
1177
+ or.b64 %rd203, %rd201, %rd202;
1178
+ $L__tmp40:
1179
+ .loc 1 60 30
1180
+ bar.sync 0;
1181
+ st.shared.u64 [global_smem], %rd203;
1182
+ bar.sync 0;
1183
+ ld.shared.u64 %rd152, [global_smem];
1184
+ .loc 1 61 25
1185
+ shl.b64 %rd204, %rd1, 3;
1186
+ add.s64 %rd153, %rd48, %rd204;
1187
+ .loc 1 61 37
1188
+ @%p146 st.global.b64 [ %rd153 + 0 ], { %rd152 };
1189
+ .loc 1 61 4
1190
+ ret;
1191
+ $L__tmp41:
1192
+ $L__func_end0:
1193
+
1194
+ }
1195
+ // .globl __nv_logf -- f32 logarithm helper: result = e*ln(2) + (t + t^2*p(t)), where e is the input's binary exponent and t = mantissa - 1; special inputs are patched at the end.
1196
+ .visible .func (.param .b32 func_retval0) __nv_logf(
1197
+ .param .b32 __nv_logf_param_0 // input x; read below with ld.param.f32
1198
+ )
1199
+ {
1200
+ .reg .pred %p<4>;
1201
+ .reg .b32 %r<5>;
1202
+ .reg .f32 %f<36>;
1203
+ $L__func_begin1:
1204
+
1205
+ ld.param.f32 %f5, [__nv_logf_param_0]; // %f5 = x
1206
+ setp.lt.f32 %p1, %f5, 0f00800000; // %p1 = x < 2^-126 (smallest normal f32): subnormal, zero or negative
1207
+ mul.f32 %f6, %f5, 0f4B000000; // x * 2^23
1208
+ selp.f32 %f1, %f6, %f5, %p1; // %f1 = %p1 ? x * 2^23 : x
1209
+ selp.f32 %f7, 0fC1B80000, 0f00000000, %p1; // %f7 = %p1 ? -23.0 : 0.0, cancels the 2^23 pre-scale in the exponent term
1210
+ mov.b32 %r1, %f1; // raw bits of the (possibly pre-scaled) input
1211
+ add.s32 %r2, %r1, -1059760811; // subtract 0x3F2AAAAB, the bit pattern of ~2/3
1212
+ and.b32 %r3, %r2, -8388608; // mask 0xFF800000 keeps the sign+exponent field: %r3 = e << 23
1213
+ sub.s32 %r4, %r1, %r3; // remove that exponent from the input bits
1214
+ mov.b32 %f8, %r4; // %f8 = reduced mantissa m, reinterpreted as f32
1215
+ cvt.rn.f32.s32 %f9, %r3; // (e << 23) converted to float
1216
+ mov.f32 %f10, 0f34000000; // 2^-23
1217
+ fma.rn.ftz.f32 %f11, %f9, %f10, %f7; // %f11 = e + %f7 (exponent, corrected for the pre-scale)
1218
+ add.f32 %f12, %f8, 0fBF800000; // t = m - 1.0
1219
+ mov.f32 %f13, 0f3E1039F6; // Horner evaluation of p(t) starts here; the mov.f32 constants below are its coefficients
1220
+ mov.f32 %f14, 0fBE055027;
1221
+ fma.rn.ftz.f32 %f15, %f14, %f12, %f13;
1222
+ mov.f32 %f16, 0fBDF8CDCC;
1223
+ fma.rn.ftz.f32 %f17, %f15, %f12, %f16;
1224
+ mov.f32 %f18, 0f3E0F2955;
1225
+ fma.rn.ftz.f32 %f19, %f17, %f12, %f18;
1226
+ mov.f32 %f20, 0fBE2AD8B9;
1227
+ fma.rn.ftz.f32 %f21, %f19, %f12, %f20;
1228
+ mov.f32 %f22, 0f3E4CED0B;
1229
+ fma.rn.ftz.f32 %f23, %f21, %f12, %f22;
1230
+ mov.f32 %f24, 0fBE7FFF22;
1231
+ fma.rn.ftz.f32 %f25, %f23, %f12, %f24;
1232
+ mov.f32 %f26, 0f3EAAAA78;
1233
+ fma.rn.ftz.f32 %f27, %f25, %f12, %f26;
1234
+ mov.f32 %f28, 0fBF000000; // -0.5, the last coefficient
1235
+ fma.rn.ftz.f32 %f29, %f27, %f12, %f28; // %f29 = p(t), end of the Horner chain
1236
+ mul.f32 %f30, %f12, %f29; // t * p(t)
1237
+ fma.rn.ftz.f32 %f31, %f30, %f12, %f12; // t + t^2 * p(t)
1238
+ mov.f32 %f32, 0f3F317218; // ~ln(2)
1239
+ fma.rn.ftz.f32 %f35, %f11, %f32, %f31; // result = e * ln(2) + (t + t^2 * p(t))
1240
+ setp.lt.u32 %p2, %r1, 2139095040; // bits < 0x7F800000 unsigned: sign bit clear and exponent not all-ones
1241
+ @%p2 bra $L__BB1_2; // ordinary input: keep the polynomial result
1242
+ mov.f32 %f33, 0f7F800000; // +inf
1243
+ fma.rn.ftz.f32 %f35, %f1, %f33, %f33; // x * inf + inf: +inf stays +inf, NaN and negative inputs give NaN
1244
+ $L__BB1_2:
1245
+ setp.eq.f32 %p3, %f1, 0f00000000; // zero input (compares equal for either sign)
1246
+ selp.f32 %f34, 0fFF800000, %f35, %p3; // zero -> -inf, otherwise the value computed above
1247
+ st.param.f32 [func_retval0+0], %f34; // store the return value
1248
+ ret;
1249
+ $L__func_end1:
1250
+
1251
+ }
1252
+ .file 1 "/tmp/torchinductor_root/ns/cnshxlw3p7kytog7ihat33cfh5n4z4tq3l77zyi5jxajo5uonq7m.py"
1253
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
1254
+ .section .debug_abbrev
1255
+ {
1256
+ .b8 1
1257
+ .b8 17
1258
+ .b8 1
1259
+ .b8 37
1260
+ .b8 8
1261
+ .b8 19
1262
+ .b8 5
1263
+ .b8 3
1264
+ .b8 8
1265
+ .b8 16
1266
+ .b8 6
1267
+ .b8 27
1268
+ .b8 8
1269
+ .b8 180
1270
+ .b8 66
1271
+ .b8 12
1272
+ .b8 17
1273
+ .b8 1
1274
+ .b8 18
1275
+ .b8 1
1276
+ .b8 0
1277
+ .b8 0
1278
+ .b8 2
1279
+ .b8 46
1280
+ .b8 0
1281
+ .b8 135
1282
+ .b8 64
1283
+ .b8 8
1284
+ .b8 3
1285
+ .b8 8
1286
+ .b8 58
1287
+ .b8 11
1288
+ .b8 59
1289
+ .b8 11
1290
+ .b8 63
1291
+ .b8 12
1292
+ .b8 32
1293
+ .b8 11
1294
+ .b8 0
1295
+ .b8 0
1296
+ .b8 3
1297
+ .b8 46
1298
+ .b8 1
1299
+ .b8 17
1300
+ .b8 1
1301
+ .b8 18
1302
+ .b8 1
1303
+ .b8 64
1304
+ .b8 10
1305
+ .b8 49
1306
+ .b8 19
1307
+ .b8 0
1308
+ .b8 0
1309
+ .b8 4
1310
+ .b8 29
1311
+ .b8 0
1312
+ .b8 49
1313
+ .b8 19
1314
+ .b8 17
1315
+ .b8 1
1316
+ .b8 18
1317
+ .b8 1
1318
+ .b8 88
1319
+ .b8 11
1320
+ .b8 89
1321
+ .b8 11
1322
+ .b8 87
1323
+ .b8 11
1324
+ .b8 0
1325
+ .b8 0
1326
+ .b8 5
1327
+ .b8 29
1328
+ .b8 1
1329
+ .b8 49
1330
+ .b8 19
1331
+ .b8 17
1332
+ .b8 1
1333
+ .b8 18
1334
+ .b8 1
1335
+ .b8 88
1336
+ .b8 11
1337
+ .b8 89
1338
+ .b8 11
1339
+ .b8 87
1340
+ .b8 11
1341
+ .b8 0
1342
+ .b8 0
1343
+ .b8 0
1344
+ }
1345
+ .section .debug_info
1346
+ {
1347
+ .b32 349
1348
+ .b8 2
1349
+ .b8 0
1350
+ .b32 .debug_abbrev
1351
+ .b8 8
1352
+ .b8 1
1353
+ .b8 116
1354
+ .b8 114
1355
+ .b8 105
1356
+ .b8 116
1357
+ .b8 111
1358
+ .b8 110
1359
+ .b8 0
1360
+ .b8 2
1361
+ .b8 0
1362
+ .b8 99
1363
+ .b8 110
1364
+ .b8 115
1365
+ .b8 104
1366
+ .b8 120
1367
+ .b8 108
1368
+ .b8 119
1369
+ .b8 51
1370
+ .b8 112
1371
+ .b8 55
1372
+ .b8 107
1373
+ .b8 121
1374
+ .b8 116
1375
+ .b8 111
1376
+ .b8 103
1377
+ .b8 55
1378
+ .b8 105
1379
+ .b8 104
1380
+ .b8 97
1381
+ .b8 116
1382
+ .b8 51
1383
+ .b8 51
1384
+ .b8 99
1385
+ .b8 102
1386
+ .b8 104
1387
+ .b8 53
1388
+ .b8 110
1389
+ .b8 52
1390
+ .b8 122
1391
+ .b8 52
1392
+ .b8 116
1393
+ .b8 113
1394
+ .b8 51
1395
+ .b8 108
1396
+ .b8 55
1397
+ .b8 55
1398
+ .b8 122
1399
+ .b8 121
1400
+ .b8 105
1401
+ .b8 53
1402
+ .b8 106
1403
+ .b8 120
1404
+ .b8 97
1405
+ .b8 106
1406
+ .b8 111
1407
+ .b8 53
1408
+ .b8 117
1409
+ .b8 111
1410
+ .b8 110
1411
+ .b8 113
1412
+ .b8 55
1413
+ .b8 109
1414
+ .b8 46
1415
+ .b8 112
1416
+ .b8 121
1417
+ .b8 0
1418
+ .b32 .debug_line
1419
+ .b8 47
1420
+ .b8 116
1421
+ .b8 109
1422
+ .b8 112
1423
+ .b8 47
1424
+ .b8 116
1425
+ .b8 111
1426
+ .b8 114
1427
+ .b8 99
1428
+ .b8 104
1429
+ .b8 105
1430
+ .b8 110
1431
+ .b8 100
1432
+ .b8 117
1433
+ .b8 99
1434
+ .b8 116
1435
+ .b8 111
1436
+ .b8 114
1437
+ .b8 95
1438
+ .b8 114
1439
+ .b8 111
1440
+ .b8 111
1441
+ .b8 116
1442
+ .b8 47
1443
+ .b8 110
1444
+ .b8 115
1445
+ .b8 0
1446
+ .b8 1
1447
+ .b64 $L__func_begin0
1448
+ .b64 $L__func_end0
1449
+ .b8 2
1450
+ .b8 116
1451
+ .b8 114
1452
+ .b8 105
1453
+ .b8 116
1454
+ .b8 111
1455
+ .b8 110
1456
+ .b8 95
1457
+ .b8 95
1458
+ .b8 48
1459
+ .b8 100
1460
+ .b8 49
1461
+ .b8 100
1462
+ .b8 50
1463
+ .b8 100
1464
+ .b8 51
1465
+ .b8 100
1466
+ .b8 52
1467
+ .b8 100
1468
+ .b8 53
1469
+ .b8 100
1470
+ .b8 54
1471
+ .b8 101
1472
+ .b8 55
1473
+ .b8 100
1474
+ .b8 101
1475
+ .b8 0
1476
+ .b8 116
1477
+ .b8 114
1478
+ .b8 105
1479
+ .b8 116
1480
+ .b8 111
1481
+ .b8 110
1482
+ .b8 95
1483
+ .b8 95
1484
+ .b8 48
1485
+ .b8 100
1486
+ .b8 49
1487
+ .b8 100
1488
+ .b8 50
1489
+ .b8 100
1490
+ .b8 51
1491
+ .b8 100
1492
+ .b8 52
1493
+ .b8 100
1494
+ .b8 53
1495
+ .b8 100
1496
+ .b8 54
1497
+ .b8 101
1498
+ .b8 55
1499
+ .b8 100
1500
+ .b8 101
1501
+ .b8 0
1502
+ .b8 1
1503
+ .b8 18
1504
+ .b8 1
1505
+ .b8 1
1506
+ .b8 3
1507
+ .b64 $L__func_begin0
1508
+ .b64 $L__func_end0
1509
+ .b8 1
1510
+ .b8 156
1511
+ .b32 125
1512
+ .b8 4
1513
+ .b32 125
1514
+ .b64 $L__tmp1
1515
+ .b64 $L__tmp20
1516
+ .b8 2
1517
+ .b8 58
1518
+ .b8 27
1519
+ .b8 5
1520
+ .b32 125
1521
+ .b64 $L__tmp2
1522
+ .b64 $L__tmp19
1523
+ .b8 2
1524
+ .b8 58
1525
+ .b8 27
1526
+ .b8 4
1527
+ .b32 125
1528
+ .b64 $L__tmp2
1529
+ .b64 $L__tmp19
1530
+ .b8 2
1531
+ .b8 243
1532
+ .b8 36
1533
+ .b8 0
1534
+ .b8 4
1535
+ .b32 125
1536
+ .b64 $L__tmp21
1537
+ .b64 $L__tmp40
1538
+ .b8 2
1539
+ .b8 60
1540
+ .b8 27
1541
+ .b8 5
1542
+ .b32 125
1543
+ .b64 $L__tmp22
1544
+ .b64 $L__tmp39
1545
+ .b8 2
1546
+ .b8 60
1547
+ .b8 27
1548
+ .b8 4
1549
+ .b32 125
1550
+ .b64 $L__tmp22
1551
+ .b64 $L__tmp39
1552
+ .b8 2
1553
+ .b8 243
1554
+ .b8 36
1555
+ .b8 0
1556
+ .b8 0
1557
+ .b8 0
1558
+ }
1559
+ .section .debug_pubnames
1560
+ {
1561
+ .b32 $L__pubNames_end0-$L__pubNames_start0
1562
+ $L__pubNames_start0:
1563
+ .b8 2
1564
+ .b8 0
1565
+ .b32 .debug_info
1566
+ .b32 353
1567
+ .b32 125
1568
+ .b8 116
1569
+ .b8 114
1570
+ .b8 105
1571
+ .b8 116
1572
+ .b8 111
1573
+ .b8 110
1574
+ .b8 95
1575
+ .b8 95
1576
+ .b8 48
1577
+ .b8 100
1578
+ .b8 49
1579
+ .b8 100
1580
+ .b8 50
1581
+ .b8 100
1582
+ .b8 51
1583
+ .b8 100
1584
+ .b8 52
1585
+ .b8 100
1586
+ .b8 53
1587
+ .b8 100
1588
+ .b8 54
1589
+ .b8 101
1590
+ .b8 55
1591
+ .b8 100
1592
+ .b8 101
1593
+ .b8 0
1594
+ .b32 0
1595
+ $L__pubNames_end0:
1596
+ }
1597
+ .section .debug_pubtypes
1598
+ {
1599
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1600
+ $L__pubTypes_start0:
1601
+ .b8 2
1602
+ .b8 0
1603
+ .b32 .debug_info
1604
+ .b32 353
1605
+ .b32 0
1606
+ $L__pubTypes_end0:
1607
+ }
1608
+ .section .debug_loc { }
.triton/dump/93e5abc5363b9438178c618128714f73/triton_.llir ADDED
@@ -0,0 +1,949 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
5
+
6
+ define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !7 {
7
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
8
+ %5 = shl i32 %4, 3, !dbg !10
9
+ %6 = and i32 %5, 1016, !dbg !10
10
+ %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
11
+ %8 = shl i32 %7, 10, !dbg !12
12
+ %9 = or i32 %8, %6, !dbg !13
13
+ %10 = sext i32 %9 to i64, !dbg !14
14
+ %11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !14
15
+ %12 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #4, !dbg !15
16
+ %13 = extractvalue { i32, i32, i32, i32 } %12, 0, !dbg !15
17
+ %14 = extractvalue { i32, i32, i32, i32 } %12, 1, !dbg !15
18
+ %15 = extractvalue { i32, i32, i32, i32 } %12, 2, !dbg !15
19
+ %16 = extractvalue { i32, i32, i32, i32 } %12, 3, !dbg !15
20
+ %17 = trunc i32 %13 to i16, !dbg !15
21
+ %extelt.offset = lshr i32 %13, 16, !dbg !15
22
+ %18 = trunc i32 %extelt.offset to i16, !dbg !15
23
+ %19 = trunc i32 %14 to i16, !dbg !15
24
+ %extelt.offset1 = lshr i32 %14, 16, !dbg !15
25
+ %20 = trunc i32 %extelt.offset1 to i16, !dbg !15
26
+ %21 = trunc i32 %15 to i16, !dbg !15
27
+ %extelt.offset2 = lshr i32 %15, 16, !dbg !15
28
+ %22 = trunc i32 %extelt.offset2 to i16, !dbg !15
29
+ %23 = trunc i32 %16 to i16, !dbg !15
30
+ %extelt.offset3 = lshr i32 %16, 16, !dbg !15
31
+ %24 = trunc i32 %extelt.offset3 to i16, !dbg !15
32
+ %25 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %17) #4, !dbg !16
33
+ %26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %18) #4, !dbg !16
34
+ %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !16
35
+ %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !16
36
+ %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %21) #4, !dbg !16
37
+ %30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #4, !dbg !16
38
+ %31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #4, !dbg !16
39
+ %32 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #4, !dbg !16
40
+ %33 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !17
41
+ %34 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %33, i1 true) #4, !dbg !18
42
+ %35 = extractvalue { i32, i32, i32, i32 } %34, 0, !dbg !18
43
+ %36 = extractvalue { i32, i32, i32, i32 } %34, 1, !dbg !18
44
+ %37 = extractvalue { i32, i32, i32, i32 } %34, 2, !dbg !18
45
+ %38 = extractvalue { i32, i32, i32, i32 } %34, 3, !dbg !18
46
+ %39 = trunc i32 %35 to i16, !dbg !18
47
+ %extelt.offset4 = lshr i32 %35, 16, !dbg !18
48
+ %40 = trunc i32 %extelt.offset4 to i16, !dbg !18
49
+ %41 = trunc i32 %36 to i16, !dbg !18
50
+ %extelt.offset5 = lshr i32 %36, 16, !dbg !18
51
+ %42 = trunc i32 %extelt.offset5 to i16, !dbg !18
52
+ %43 = trunc i32 %37 to i16, !dbg !18
53
+ %extelt.offset6 = lshr i32 %37, 16, !dbg !18
54
+ %44 = trunc i32 %extelt.offset6 to i16, !dbg !18
55
+ %45 = trunc i32 %38 to i16, !dbg !18
56
+ %extelt.offset7 = lshr i32 %38, 16, !dbg !18
57
+ %46 = trunc i32 %extelt.offset7 to i16, !dbg !18
58
+ %47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %39) #4, !dbg !19
59
+ %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %40) #4, !dbg !19
60
+ %49 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %41) #4, !dbg !19
61
+ %50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #4, !dbg !19
62
+ %51 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #4, !dbg !19
63
+ %52 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #4, !dbg !19
64
+ %53 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #4, !dbg !19
65
+ %54 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #4, !dbg !19
66
+ %55 = fmul float %47, 0x3FE6A09E60000000, !dbg !20
67
+ %56 = fmul float %48, 0x3FE6A09E60000000, !dbg !20
68
+ %57 = fmul float %49, 0x3FE6A09E60000000, !dbg !20
69
+ %58 = fmul float %50, 0x3FE6A09E60000000, !dbg !20
70
+ %59 = fmul float %51, 0x3FE6A09E60000000, !dbg !20
71
+ %60 = fmul float %52, 0x3FE6A09E60000000, !dbg !20
72
+ %61 = fmul float %53, 0x3FE6A09E60000000, !dbg !20
73
+ %62 = fmul float %54, 0x3FE6A09E60000000, !dbg !20
74
+ %63 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
75
+ %.not.i = icmp eq i32 %63, 0, !dbg !21
76
+ %64 = tail call float @llvm.nvvm.fabs.ftz.f(float %55) #4, !dbg !21
77
+ %65 = tail call float @llvm.nvvm.fabs.f(float %55) #4, !dbg !21
78
+ %.0.i = select i1 %.not.i, float %65, float %64, !dbg !21
79
+ %66 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !21
80
+ br i1 %66, label %__nv_fabsf.exit1.i, label %68, !dbg !21
81
+
82
+ __nv_fabsf.exit1.i: ; preds = %3
83
+ %67 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
84
+ %.not1.i = icmp eq i32 %67, 0, !dbg !21
85
+ %.01.i = select i1 %.not1.i, float %65, float %64, !dbg !21
86
+ br label %__internal_fmad.exit.i, !dbg !21
87
+
88
+ 68: ; preds = %3
89
+ %69 = fmul float %55, %55, !dbg !21
90
+ br label %__internal_fmad.exit.i, !dbg !21
91
+
92
+ __internal_fmad.exit.i: ; preds = %68, %__nv_fabsf.exit1.i
93
+ %70 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %68 ], !dbg !21
94
+ %71 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %68 ], !dbg !21
95
+ %72 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %68 ], !dbg !21
96
+ %73 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %68 ], !dbg !21
97
+ %74 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %68 ], !dbg !21
98
+ %75 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %68 ], !dbg !21
99
+ %76 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %68 ], !dbg !21
100
+ %77 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %69, %68 ], !dbg !21
101
+ %78 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
102
+ %.not2.i = icmp eq i32 %78, 0, !dbg !21
103
+ %79 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %76, float %77, float %75) #4, !dbg !21
104
+ %80 = tail call float @llvm.nvvm.fma.rn.f(float %76, float %77, float %75) #4, !dbg !21
105
+ %.02.i = select i1 %.not2.i, float %80, float %79, !dbg !21
106
+ %81 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
107
+ %.not3.i = icmp eq i32 %81, 0, !dbg !21
108
+ %82 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %77, float %74) #4, !dbg !21
109
+ %83 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %77, float %74) #4, !dbg !21
110
+ %.03.i = select i1 %.not3.i, float %83, float %82, !dbg !21
111
+ %84 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
112
+ %.not4.i = icmp eq i32 %84, 0, !dbg !21
113
+ %85 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %77, float %73) #4, !dbg !21
114
+ %86 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %77, float %73) #4, !dbg !21
115
+ %.04.i = select i1 %.not4.i, float %86, float %85, !dbg !21
116
+ %87 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
117
+ %.not5.i = icmp eq i32 %87, 0, !dbg !21
118
+ %88 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %77, float %72) #4, !dbg !21
119
+ %89 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %77, float %72) #4, !dbg !21
120
+ %.05.i = select i1 %.not5.i, float %89, float %88, !dbg !21
121
+ %90 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
122
+ %.not6.i = icmp eq i32 %90, 0, !dbg !21
123
+ %91 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %77, float %71) #4, !dbg !21
124
+ %92 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %77, float %71) #4, !dbg !21
125
+ %.06.i = select i1 %.not6.i, float %92, float %91, !dbg !21
126
+ %93 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
127
+ %.not7.i = icmp eq i32 %93, 0, !dbg !21
128
+ %94 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %77, float %70) #4, !dbg !21
129
+ %95 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %77, float %70) #4, !dbg !21
130
+ %.07.i = select i1 %.not7.i, float %95, float %94, !dbg !21
131
+ %96 = fneg float %77, !dbg !21
132
+ %97 = select i1 %66, float %96, float %55, !dbg !21
133
+ %98 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
134
+ %.not8.i = icmp eq i32 %98, 0, !dbg !21
135
+ %99 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %97, float %97) #4, !dbg !21
136
+ %100 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %97, float %97) #4, !dbg !21
137
+ %.08.i = select i1 %.not8.i, float %100, float %99, !dbg !21
138
+ br i1 %66, label %101, label %__nv_erff.exit, !dbg !21
139
+
140
+ 101: ; preds = %__internal_fmad.exit.i
141
+ %102 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !21
142
+ %103 = fsub float 1.000000e+00, %102, !dbg !21
143
+ %104 = bitcast float %103 to i32, !dbg !21
144
+ %105 = bitcast float %55 to i32, !dbg !21
145
+ %106 = and i32 %105, -2147483648, !dbg !21
146
+ %107 = or i32 %106, %104, !dbg !21
147
+ %108 = bitcast i32 %107 to float, !dbg !21
148
+ br label %__nv_erff.exit, !dbg !21
149
+
150
+ __nv_erff.exit: ; preds = %__internal_fmad.exit.i, %101
151
+ %r.0.i = phi float [ %108, %101 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !21
152
+ %109 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
153
+ %.not.i8 = icmp eq i32 %109, 0, !dbg !21
154
+ %110 = tail call float @llvm.nvvm.fabs.ftz.f(float %56) #4, !dbg !21
155
+ %111 = tail call float @llvm.nvvm.fabs.f(float %56) #4, !dbg !21
156
+ %.0.i9 = select i1 %.not.i8, float %111, float %110, !dbg !21
157
+ %112 = fcmp oge float %.0.i9, 0x3FF00C1FC0000000, !dbg !21
158
+ br i1 %112, label %__nv_fabsf.exit1.i26, label %114, !dbg !21
159
+
160
+ __nv_fabsf.exit1.i26: ; preds = %__nv_erff.exit
161
+ %113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
162
+ %.not1.i27 = icmp eq i32 %113, 0, !dbg !21
163
+ %.01.i28 = select i1 %.not1.i27, float %111, float %110, !dbg !21
164
+ br label %__internal_fmad.exit.i10, !dbg !21
165
+
166
+ 114: ; preds = %__nv_erff.exit
167
+ %115 = fmul float %56, %56, !dbg !21
168
+ br label %__internal_fmad.exit.i10, !dbg !21
169
+
170
+ __internal_fmad.exit.i10: ; preds = %114, %__nv_fabsf.exit1.i26
171
+ %116 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i26 ], [ 0x3FC06EBA60000000, %114 ], !dbg !21
172
+ %117 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i26 ], [ 0xBFD8127580000000, %114 ], !dbg !21
173
+ %118 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i26 ], [ 0x3FBCE315E0000000, %114 ], !dbg !21
174
+ %119 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i26 ], [ 0xBF9B837CE0000000, %114 ], !dbg !21
175
+ %120 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i26 ], [ 0x3F755ABD40000000, %114 ], !dbg !21
176
+ %121 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i26 ], [ 0xBF4AE9A400000000, %114 ], !dbg !21
177
+ %122 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i26 ], [ 0x3F163D2D40000000, %114 ], !dbg !21
178
+ %123 = phi float [ %.01.i28, %__nv_fabsf.exit1.i26 ], [ %115, %114 ], !dbg !21
179
+ %124 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
180
+ %.not2.i11 = icmp eq i32 %124, 0, !dbg !21
181
+ %125 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %122, float %123, float %121) #4, !dbg !21
182
+ %126 = tail call float @llvm.nvvm.fma.rn.f(float %122, float %123, float %121) #4, !dbg !21
183
+ %.02.i12 = select i1 %.not2.i11, float %126, float %125, !dbg !21
184
+ %127 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
185
+ %.not3.i13 = icmp eq i32 %127, 0, !dbg !21
186
+ %128 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i12, float %123, float %120) #4, !dbg !21
187
+ %129 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i12, float %123, float %120) #4, !dbg !21
188
+ %.03.i14 = select i1 %.not3.i13, float %129, float %128, !dbg !21
189
+ %130 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
190
+ %.not4.i15 = icmp eq i32 %130, 0, !dbg !21
191
+ %131 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i14, float %123, float %119) #4, !dbg !21
192
+ %132 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i14, float %123, float %119) #4, !dbg !21
193
+ %.04.i16 = select i1 %.not4.i15, float %132, float %131, !dbg !21
194
+ %133 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
195
+ %.not5.i17 = icmp eq i32 %133, 0, !dbg !21
196
+ %134 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i16, float %123, float %118) #4, !dbg !21
197
+ %135 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i16, float %123, float %118) #4, !dbg !21
198
+ %.05.i18 = select i1 %.not5.i17, float %135, float %134, !dbg !21
199
+ %136 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
200
+ %.not6.i19 = icmp eq i32 %136, 0, !dbg !21
201
+ %137 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i18, float %123, float %117) #4, !dbg !21
202
+ %138 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i18, float %123, float %117) #4, !dbg !21
203
+ %.06.i20 = select i1 %.not6.i19, float %138, float %137, !dbg !21
204
+ %139 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
205
+ %.not7.i21 = icmp eq i32 %139, 0, !dbg !21
206
+ %140 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i20, float %123, float %116) #4, !dbg !21
207
+ %141 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i20, float %123, float %116) #4, !dbg !21
208
+ %.07.i22 = select i1 %.not7.i21, float %141, float %140, !dbg !21
209
+ %142 = fneg float %123, !dbg !21
210
+ %143 = select i1 %112, float %142, float %56, !dbg !21
211
+ %144 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
212
+ %.not8.i23 = icmp eq i32 %144, 0, !dbg !21
213
+ %145 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i22, float %143, float %143) #4, !dbg !21
214
+ %146 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i22, float %143, float %143) #4, !dbg !21
215
+ %.08.i24 = select i1 %.not8.i23, float %146, float %145, !dbg !21
216
+ br i1 %112, label %147, label %__nv_erff.exit29, !dbg !21
217
+
218
+ 147: ; preds = %__internal_fmad.exit.i10
219
+ %148 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i24) #4, !dbg !21
220
+ %149 = fsub float 1.000000e+00, %148, !dbg !21
221
+ %150 = bitcast float %149 to i32, !dbg !21
222
+ %151 = bitcast float %56 to i32, !dbg !21
223
+ %152 = and i32 %151, -2147483648, !dbg !21
224
+ %153 = or i32 %152, %150, !dbg !21
225
+ %154 = bitcast i32 %153 to float, !dbg !21
226
+ br label %__nv_erff.exit29, !dbg !21
227
+
228
+ __nv_erff.exit29: ; preds = %__internal_fmad.exit.i10, %147
229
+ %r.0.i25 = phi float [ %154, %147 ], [ %.08.i24, %__internal_fmad.exit.i10 ], !dbg !21
230
+ %155 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
231
+ %.not.i30 = icmp eq i32 %155, 0, !dbg !21
232
+ %156 = tail call float @llvm.nvvm.fabs.ftz.f(float %57) #4, !dbg !21
233
+ %157 = tail call float @llvm.nvvm.fabs.f(float %57) #4, !dbg !21
234
+ %.0.i31 = select i1 %.not.i30, float %157, float %156, !dbg !21
235
+ %158 = fcmp oge float %.0.i31, 0x3FF00C1FC0000000, !dbg !21
236
+ br i1 %158, label %__nv_fabsf.exit1.i48, label %160, !dbg !21
237
+
238
+ __nv_fabsf.exit1.i48: ; preds = %__nv_erff.exit29
239
+ %159 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
240
+ %.not1.i49 = icmp eq i32 %159, 0, !dbg !21
241
+ %.01.i50 = select i1 %.not1.i49, float %157, float %156, !dbg !21
242
+ br label %__internal_fmad.exit.i32, !dbg !21
243
+
244
+ 160: ; preds = %__nv_erff.exit29
245
+ %161 = fmul float %57, %57, !dbg !21
246
+ br label %__internal_fmad.exit.i32, !dbg !21
247
+
248
+ __internal_fmad.exit.i32: ; preds = %160, %__nv_fabsf.exit1.i48
249
+ %162 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i48 ], [ 0x3FC06EBA60000000, %160 ], !dbg !21
250
+ %163 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i48 ], [ 0xBFD8127580000000, %160 ], !dbg !21
251
+ %164 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i48 ], [ 0x3FBCE315E0000000, %160 ], !dbg !21
252
+ %165 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i48 ], [ 0xBF9B837CE0000000, %160 ], !dbg !21
253
+ %166 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i48 ], [ 0x3F755ABD40000000, %160 ], !dbg !21
254
+ %167 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i48 ], [ 0xBF4AE9A400000000, %160 ], !dbg !21
255
+ %168 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i48 ], [ 0x3F163D2D40000000, %160 ], !dbg !21
256
+ %169 = phi float [ %.01.i50, %__nv_fabsf.exit1.i48 ], [ %161, %160 ], !dbg !21
257
+ %170 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
258
+ %.not2.i33 = icmp eq i32 %170, 0, !dbg !21
259
+ %171 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %168, float %169, float %167) #4, !dbg !21
260
+ %172 = tail call float @llvm.nvvm.fma.rn.f(float %168, float %169, float %167) #4, !dbg !21
261
+ %.02.i34 = select i1 %.not2.i33, float %172, float %171, !dbg !21
262
+ %173 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
263
+ %.not3.i35 = icmp eq i32 %173, 0, !dbg !21
264
+ %174 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i34, float %169, float %166) #4, !dbg !21
265
+ %175 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i34, float %169, float %166) #4, !dbg !21
266
+ %.03.i36 = select i1 %.not3.i35, float %175, float %174, !dbg !21
267
+ %176 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
268
+ %.not4.i37 = icmp eq i32 %176, 0, !dbg !21
269
+ %177 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i36, float %169, float %165) #4, !dbg !21
270
+ %178 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i36, float %169, float %165) #4, !dbg !21
271
+ %.04.i38 = select i1 %.not4.i37, float %178, float %177, !dbg !21
272
+ %179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
273
+ %.not5.i39 = icmp eq i32 %179, 0, !dbg !21
274
+ %180 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i38, float %169, float %164) #4, !dbg !21
275
+ %181 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i38, float %169, float %164) #4, !dbg !21
276
+ %.05.i40 = select i1 %.not5.i39, float %181, float %180, !dbg !21
277
+ %182 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
278
+ %.not6.i41 = icmp eq i32 %182, 0, !dbg !21
279
+ %183 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i40, float %169, float %163) #4, !dbg !21
280
+ %184 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i40, float %169, float %163) #4, !dbg !21
281
+ %.06.i42 = select i1 %.not6.i41, float %184, float %183, !dbg !21
282
+ %185 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
283
+ %.not7.i43 = icmp eq i32 %185, 0, !dbg !21
284
+ %186 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i42, float %169, float %162) #4, !dbg !21
285
+ %187 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i42, float %169, float %162) #4, !dbg !21
286
+ %.07.i44 = select i1 %.not7.i43, float %187, float %186, !dbg !21
287
+ %188 = fneg float %169, !dbg !21
288
+ %189 = select i1 %158, float %188, float %57, !dbg !21
289
+ %190 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
290
+ %.not8.i45 = icmp eq i32 %190, 0, !dbg !21
291
+ %191 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i44, float %189, float %189) #4, !dbg !21
292
+ %192 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i44, float %189, float %189) #4, !dbg !21
293
+ %.08.i46 = select i1 %.not8.i45, float %192, float %191, !dbg !21
294
+ br i1 %158, label %193, label %__nv_erff.exit51, !dbg !21
295
+
296
+ 193: ; preds = %__internal_fmad.exit.i32
297
+ %194 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i46) #4, !dbg !21
298
+ %195 = fsub float 1.000000e+00, %194, !dbg !21
299
+ %196 = bitcast float %195 to i32, !dbg !21
300
+ %197 = bitcast float %57 to i32, !dbg !21
301
+ %198 = and i32 %197, -2147483648, !dbg !21
302
+ %199 = or i32 %198, %196, !dbg !21
303
+ %200 = bitcast i32 %199 to float, !dbg !21
304
+ br label %__nv_erff.exit51, !dbg !21
305
+
306
+ __nv_erff.exit51: ; preds = %__internal_fmad.exit.i32, %193
307
+ %r.0.i47 = phi float [ %200, %193 ], [ %.08.i46, %__internal_fmad.exit.i32 ], !dbg !21
308
+ %201 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
309
+ %.not.i52 = icmp eq i32 %201, 0, !dbg !21
310
+ %202 = tail call float @llvm.nvvm.fabs.ftz.f(float %58) #4, !dbg !21
311
+ %203 = tail call float @llvm.nvvm.fabs.f(float %58) #4, !dbg !21
312
+ %.0.i53 = select i1 %.not.i52, float %203, float %202, !dbg !21
313
+ %204 = fcmp oge float %.0.i53, 0x3FF00C1FC0000000, !dbg !21
314
+ br i1 %204, label %__nv_fabsf.exit1.i70, label %206, !dbg !21
315
+
316
+ __nv_fabsf.exit1.i70: ; preds = %__nv_erff.exit51
317
+ %205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
318
+ %.not1.i71 = icmp eq i32 %205, 0, !dbg !21
319
+ %.01.i72 = select i1 %.not1.i71, float %203, float %202, !dbg !21
320
+ br label %__internal_fmad.exit.i54, !dbg !21
321
+
322
+ 206: ; preds = %__nv_erff.exit51
323
+ %207 = fmul float %58, %58, !dbg !21
324
+ br label %__internal_fmad.exit.i54, !dbg !21
325
+
326
+ __internal_fmad.exit.i54: ; preds = %206, %__nv_fabsf.exit1.i70
327
+ %208 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i70 ], [ 0x3FC06EBA60000000, %206 ], !dbg !21
328
+ %209 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i70 ], [ 0xBFD8127580000000, %206 ], !dbg !21
329
+ %210 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i70 ], [ 0x3FBCE315E0000000, %206 ], !dbg !21
330
+ %211 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i70 ], [ 0xBF9B837CE0000000, %206 ], !dbg !21
331
+ %212 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i70 ], [ 0x3F755ABD40000000, %206 ], !dbg !21
332
+ %213 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i70 ], [ 0xBF4AE9A400000000, %206 ], !dbg !21
333
+ %214 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i70 ], [ 0x3F163D2D40000000, %206 ], !dbg !21
334
+ %215 = phi float [ %.01.i72, %__nv_fabsf.exit1.i70 ], [ %207, %206 ], !dbg !21
335
+ %216 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
336
+ %.not2.i55 = icmp eq i32 %216, 0, !dbg !21
337
+ %217 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %214, float %215, float %213) #4, !dbg !21
338
+ %218 = tail call float @llvm.nvvm.fma.rn.f(float %214, float %215, float %213) #4, !dbg !21
339
+ %.02.i56 = select i1 %.not2.i55, float %218, float %217, !dbg !21
340
+ %219 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
341
+ %.not3.i57 = icmp eq i32 %219, 0, !dbg !21
342
+ %220 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i56, float %215, float %212) #4, !dbg !21
343
+ %221 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i56, float %215, float %212) #4, !dbg !21
344
+ %.03.i58 = select i1 %.not3.i57, float %221, float %220, !dbg !21
345
+ %222 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
346
+ %.not4.i59 = icmp eq i32 %222, 0, !dbg !21
347
+ %223 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i58, float %215, float %211) #4, !dbg !21
348
+ %224 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i58, float %215, float %211) #4, !dbg !21
349
+ %.04.i60 = select i1 %.not4.i59, float %224, float %223, !dbg !21
350
+ %225 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
351
+ %.not5.i61 = icmp eq i32 %225, 0, !dbg !21
352
+ %226 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i60, float %215, float %210) #4, !dbg !21
353
+ %227 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i60, float %215, float %210) #4, !dbg !21
354
+ %.05.i62 = select i1 %.not5.i61, float %227, float %226, !dbg !21
355
+ %228 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
356
+ %.not6.i63 = icmp eq i32 %228, 0, !dbg !21
357
+ %229 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i62, float %215, float %209) #4, !dbg !21
358
+ %230 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i62, float %215, float %209) #4, !dbg !21
359
+ %.06.i64 = select i1 %.not6.i63, float %230, float %229, !dbg !21
360
+ %231 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
361
+ %.not7.i65 = icmp eq i32 %231, 0, !dbg !21
362
+ %232 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i64, float %215, float %208) #4, !dbg !21
363
+ %233 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i64, float %215, float %208) #4, !dbg !21
364
+ %.07.i66 = select i1 %.not7.i65, float %233, float %232, !dbg !21
365
+ %234 = fneg float %215, !dbg !21
366
+ %235 = select i1 %204, float %234, float %58, !dbg !21
367
+ %236 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
368
+ %.not8.i67 = icmp eq i32 %236, 0, !dbg !21
369
+ %237 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i66, float %235, float %235) #4, !dbg !21
370
+ %238 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i66, float %235, float %235) #4, !dbg !21
371
+ %.08.i68 = select i1 %.not8.i67, float %238, float %237, !dbg !21
372
+ br i1 %204, label %239, label %__nv_erff.exit73, !dbg !21
373
+
374
+ 239: ; preds = %__internal_fmad.exit.i54
375
+ %240 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i68) #4, !dbg !21
376
+ %241 = fsub float 1.000000e+00, %240, !dbg !21
377
+ %242 = bitcast float %241 to i32, !dbg !21
378
+ %243 = bitcast float %58 to i32, !dbg !21
379
+ %244 = and i32 %243, -2147483648, !dbg !21
380
+ %245 = or i32 %244, %242, !dbg !21
381
+ %246 = bitcast i32 %245 to float, !dbg !21
382
+ br label %__nv_erff.exit73, !dbg !21
383
+
384
+ __nv_erff.exit73: ; preds = %__internal_fmad.exit.i54, %239
385
+ %r.0.i69 = phi float [ %246, %239 ], [ %.08.i68, %__internal_fmad.exit.i54 ], !dbg !21
386
+ %247 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
387
+ %.not.i74 = icmp eq i32 %247, 0, !dbg !21
388
+ %248 = tail call float @llvm.nvvm.fabs.ftz.f(float %59) #4, !dbg !21
389
+ %249 = tail call float @llvm.nvvm.fabs.f(float %59) #4, !dbg !21
390
+ %.0.i75 = select i1 %.not.i74, float %249, float %248, !dbg !21
391
+ %250 = fcmp oge float %.0.i75, 0x3FF00C1FC0000000, !dbg !21
392
+ br i1 %250, label %__nv_fabsf.exit1.i92, label %252, !dbg !21
393
+
394
+ __nv_fabsf.exit1.i92: ; preds = %__nv_erff.exit73
395
+ %251 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
396
+ %.not1.i93 = icmp eq i32 %251, 0, !dbg !21
397
+ %.01.i94 = select i1 %.not1.i93, float %249, float %248, !dbg !21
398
+ br label %__internal_fmad.exit.i76, !dbg !21
399
+
400
+ 252: ; preds = %__nv_erff.exit73
401
+ %253 = fmul float %59, %59, !dbg !21
402
+ br label %__internal_fmad.exit.i76, !dbg !21
403
+
404
+ __internal_fmad.exit.i76: ; preds = %252, %__nv_fabsf.exit1.i92
405
+ %254 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i92 ], [ 0x3FC06EBA60000000, %252 ], !dbg !21
406
+ %255 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i92 ], [ 0xBFD8127580000000, %252 ], !dbg !21
407
+ %256 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i92 ], [ 0x3FBCE315E0000000, %252 ], !dbg !21
408
+ %257 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i92 ], [ 0xBF9B837CE0000000, %252 ], !dbg !21
409
+ %258 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i92 ], [ 0x3F755ABD40000000, %252 ], !dbg !21
410
+ %259 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i92 ], [ 0xBF4AE9A400000000, %252 ], !dbg !21
411
+ %260 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i92 ], [ 0x3F163D2D40000000, %252 ], !dbg !21
412
+ %261 = phi float [ %.01.i94, %__nv_fabsf.exit1.i92 ], [ %253, %252 ], !dbg !21
413
+ %262 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
414
+ %.not2.i77 = icmp eq i32 %262, 0, !dbg !21
415
+ %263 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %260, float %261, float %259) #4, !dbg !21
416
+ %264 = tail call float @llvm.nvvm.fma.rn.f(float %260, float %261, float %259) #4, !dbg !21
417
+ %.02.i78 = select i1 %.not2.i77, float %264, float %263, !dbg !21
418
+ %265 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
419
+ %.not3.i79 = icmp eq i32 %265, 0, !dbg !21
420
+ %266 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i78, float %261, float %258) #4, !dbg !21
421
+ %267 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i78, float %261, float %258) #4, !dbg !21
422
+ %.03.i80 = select i1 %.not3.i79, float %267, float %266, !dbg !21
423
+ %268 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
424
+ %.not4.i81 = icmp eq i32 %268, 0, !dbg !21
425
+ %269 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i80, float %261, float %257) #4, !dbg !21
426
+ %270 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i80, float %261, float %257) #4, !dbg !21
427
+ %.04.i82 = select i1 %.not4.i81, float %270, float %269, !dbg !21
428
+ %271 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
429
+ %.not5.i83 = icmp eq i32 %271, 0, !dbg !21
430
+ %272 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i82, float %261, float %256) #4, !dbg !21
431
+ %273 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i82, float %261, float %256) #4, !dbg !21
432
+ %.05.i84 = select i1 %.not5.i83, float %273, float %272, !dbg !21
433
+ %274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
434
+ %.not6.i85 = icmp eq i32 %274, 0, !dbg !21
435
+ %275 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i84, float %261, float %255) #4, !dbg !21
436
+ %276 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i84, float %261, float %255) #4, !dbg !21
437
+ %.06.i86 = select i1 %.not6.i85, float %276, float %275, !dbg !21
438
+ %277 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
439
+ %.not7.i87 = icmp eq i32 %277, 0, !dbg !21
440
+ %278 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i86, float %261, float %254) #4, !dbg !21
441
+ %279 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i86, float %261, float %254) #4, !dbg !21
442
+ %.07.i88 = select i1 %.not7.i87, float %279, float %278, !dbg !21
443
+ %280 = fneg float %261, !dbg !21
444
+ %281 = select i1 %250, float %280, float %59, !dbg !21
445
+ %282 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
446
+ %.not8.i89 = icmp eq i32 %282, 0, !dbg !21
447
+ %283 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i88, float %281, float %281) #4, !dbg !21
448
+ %284 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i88, float %281, float %281) #4, !dbg !21
449
+ %.08.i90 = select i1 %.not8.i89, float %284, float %283, !dbg !21
450
+ br i1 %250, label %285, label %__nv_erff.exit95, !dbg !21
451
+
452
+ 285: ; preds = %__internal_fmad.exit.i76
453
+ %286 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i90) #4, !dbg !21
454
+ %287 = fsub float 1.000000e+00, %286, !dbg !21
455
+ %288 = bitcast float %287 to i32, !dbg !21
456
+ %289 = bitcast float %59 to i32, !dbg !21
457
+ %290 = and i32 %289, -2147483648, !dbg !21
458
+ %291 = or i32 %290, %288, !dbg !21
459
+ %292 = bitcast i32 %291 to float, !dbg !21
460
+ br label %__nv_erff.exit95, !dbg !21
461
+
462
+ __nv_erff.exit95: ; preds = %__internal_fmad.exit.i76, %285
463
+ %r.0.i91 = phi float [ %292, %285 ], [ %.08.i90, %__internal_fmad.exit.i76 ], !dbg !21
464
+ %293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
465
+ %.not.i96 = icmp eq i32 %293, 0, !dbg !21
466
+ %294 = tail call float @llvm.nvvm.fabs.ftz.f(float %60) #4, !dbg !21
467
+ %295 = tail call float @llvm.nvvm.fabs.f(float %60) #4, !dbg !21
468
+ %.0.i97 = select i1 %.not.i96, float %295, float %294, !dbg !21
469
+ %296 = fcmp oge float %.0.i97, 0x3FF00C1FC0000000, !dbg !21
470
+ br i1 %296, label %__nv_fabsf.exit1.i114, label %298, !dbg !21
471
+
472
+ __nv_fabsf.exit1.i114: ; preds = %__nv_erff.exit95
473
+ %297 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
474
+ %.not1.i115 = icmp eq i32 %297, 0, !dbg !21
475
+ %.01.i116 = select i1 %.not1.i115, float %295, float %294, !dbg !21
476
+ br label %__internal_fmad.exit.i98, !dbg !21
477
+
478
+ 298: ; preds = %__nv_erff.exit95
479
+ %299 = fmul float %60, %60, !dbg !21
480
+ br label %__internal_fmad.exit.i98, !dbg !21
481
+
482
+ __internal_fmad.exit.i98: ; preds = %298, %__nv_fabsf.exit1.i114
483
+ %300 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i114 ], [ 0x3FC06EBA60000000, %298 ], !dbg !21
484
+ %301 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i114 ], [ 0xBFD8127580000000, %298 ], !dbg !21
485
+ %302 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i114 ], [ 0x3FBCE315E0000000, %298 ], !dbg !21
486
+ %303 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i114 ], [ 0xBF9B837CE0000000, %298 ], !dbg !21
487
+ %304 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i114 ], [ 0x3F755ABD40000000, %298 ], !dbg !21
488
+ %305 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i114 ], [ 0xBF4AE9A400000000, %298 ], !dbg !21
489
+ %306 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i114 ], [ 0x3F163D2D40000000, %298 ], !dbg !21
490
+ %307 = phi float [ %.01.i116, %__nv_fabsf.exit1.i114 ], [ %299, %298 ], !dbg !21
491
+ %308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
492
+ %.not2.i99 = icmp eq i32 %308, 0, !dbg !21
493
+ %309 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %306, float %307, float %305) #4, !dbg !21
494
+ %310 = tail call float @llvm.nvvm.fma.rn.f(float %306, float %307, float %305) #4, !dbg !21
495
+ %.02.i100 = select i1 %.not2.i99, float %310, float %309, !dbg !21
496
+ %311 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
497
+ %.not3.i101 = icmp eq i32 %311, 0, !dbg !21
498
+ %312 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i100, float %307, float %304) #4, !dbg !21
499
+ %313 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i100, float %307, float %304) #4, !dbg !21
500
+ %.03.i102 = select i1 %.not3.i101, float %313, float %312, !dbg !21
501
+ %314 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
502
+ %.not4.i103 = icmp eq i32 %314, 0, !dbg !21
503
+ %315 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i102, float %307, float %303) #4, !dbg !21
504
+ %316 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i102, float %307, float %303) #4, !dbg !21
505
+ %.04.i104 = select i1 %.not4.i103, float %316, float %315, !dbg !21
506
+ %317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
507
+ %.not5.i105 = icmp eq i32 %317, 0, !dbg !21
508
+ %318 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i104, float %307, float %302) #4, !dbg !21
509
+ %319 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i104, float %307, float %302) #4, !dbg !21
510
+ %.05.i106 = select i1 %.not5.i105, float %319, float %318, !dbg !21
511
+ %320 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
512
+ %.not6.i107 = icmp eq i32 %320, 0, !dbg !21
513
+ %321 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i106, float %307, float %301) #4, !dbg !21
514
+ %322 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i106, float %307, float %301) #4, !dbg !21
515
+ %.06.i108 = select i1 %.not6.i107, float %322, float %321, !dbg !21
516
+ %323 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
517
+ %.not7.i109 = icmp eq i32 %323, 0, !dbg !21
518
+ %324 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i108, float %307, float %300) #4, !dbg !21
519
+ %325 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i108, float %307, float %300) #4, !dbg !21
520
+ %.07.i110 = select i1 %.not7.i109, float %325, float %324, !dbg !21
521
+ %326 = fneg float %307, !dbg !21
522
+ %327 = select i1 %296, float %326, float %60, !dbg !21
523
+ %328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
524
+ %.not8.i111 = icmp eq i32 %328, 0, !dbg !21
525
+ %329 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i110, float %327, float %327) #4, !dbg !21
526
+ %330 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i110, float %327, float %327) #4, !dbg !21
527
+ %.08.i112 = select i1 %.not8.i111, float %330, float %329, !dbg !21
528
+ br i1 %296, label %331, label %__nv_erff.exit117, !dbg !21
529
+
530
+ 331: ; preds = %__internal_fmad.exit.i98
531
+ %332 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i112) #4, !dbg !21
532
+ %333 = fsub float 1.000000e+00, %332, !dbg !21
533
+ %334 = bitcast float %333 to i32, !dbg !21
534
+ %335 = bitcast float %60 to i32, !dbg !21
535
+ %336 = and i32 %335, -2147483648, !dbg !21
536
+ %337 = or i32 %336, %334, !dbg !21
537
+ %338 = bitcast i32 %337 to float, !dbg !21
538
+ br label %__nv_erff.exit117, !dbg !21
539
+
540
+ __nv_erff.exit117: ; preds = %__internal_fmad.exit.i98, %331
541
+ %r.0.i113 = phi float [ %338, %331 ], [ %.08.i112, %__internal_fmad.exit.i98 ], !dbg !21
542
+ %339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
543
+ %.not.i118 = icmp eq i32 %339, 0, !dbg !21
544
+ %340 = tail call float @llvm.nvvm.fabs.ftz.f(float %61) #4, !dbg !21
545
+ %341 = tail call float @llvm.nvvm.fabs.f(float %61) #4, !dbg !21
546
+ %.0.i119 = select i1 %.not.i118, float %341, float %340, !dbg !21
547
+ %342 = fcmp oge float %.0.i119, 0x3FF00C1FC0000000, !dbg !21
548
+ br i1 %342, label %__nv_fabsf.exit1.i136, label %344, !dbg !21
549
+
550
+ __nv_fabsf.exit1.i136: ; preds = %__nv_erff.exit117
551
+ %343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
552
+ %.not1.i137 = icmp eq i32 %343, 0, !dbg !21
553
+ %.01.i138 = select i1 %.not1.i137, float %341, float %340, !dbg !21
554
+ br label %__internal_fmad.exit.i120, !dbg !21
555
+
556
+ 344: ; preds = %__nv_erff.exit117
557
+ %345 = fmul float %61, %61, !dbg !21
558
+ br label %__internal_fmad.exit.i120, !dbg !21
559
+
560
+ __internal_fmad.exit.i120: ; preds = %344, %__nv_fabsf.exit1.i136
561
+ %346 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i136 ], [ 0x3FC06EBA60000000, %344 ], !dbg !21
562
+ %347 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i136 ], [ 0xBFD8127580000000, %344 ], !dbg !21
563
+ %348 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i136 ], [ 0x3FBCE315E0000000, %344 ], !dbg !21
564
+ %349 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i136 ], [ 0xBF9B837CE0000000, %344 ], !dbg !21
565
+ %350 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i136 ], [ 0x3F755ABD40000000, %344 ], !dbg !21
566
+ %351 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i136 ], [ 0xBF4AE9A400000000, %344 ], !dbg !21
567
+ %352 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i136 ], [ 0x3F163D2D40000000, %344 ], !dbg !21
568
+ %353 = phi float [ %.01.i138, %__nv_fabsf.exit1.i136 ], [ %345, %344 ], !dbg !21
569
+ %354 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
570
+ %.not2.i121 = icmp eq i32 %354, 0, !dbg !21
571
+ %355 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %352, float %353, float %351) #4, !dbg !21
572
+ %356 = tail call float @llvm.nvvm.fma.rn.f(float %352, float %353, float %351) #4, !dbg !21
573
+ %.02.i122 = select i1 %.not2.i121, float %356, float %355, !dbg !21
574
+ %357 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
575
+ %.not3.i123 = icmp eq i32 %357, 0, !dbg !21
576
+ %358 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i122, float %353, float %350) #4, !dbg !21
577
+ %359 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i122, float %353, float %350) #4, !dbg !21
578
+ %.03.i124 = select i1 %.not3.i123, float %359, float %358, !dbg !21
579
+ %360 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
580
+ %.not4.i125 = icmp eq i32 %360, 0, !dbg !21
581
+ %361 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i124, float %353, float %349) #4, !dbg !21
582
+ %362 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i124, float %353, float %349) #4, !dbg !21
583
+ %.04.i126 = select i1 %.not4.i125, float %362, float %361, !dbg !21
584
+ %363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
585
+ %.not5.i127 = icmp eq i32 %363, 0, !dbg !21
586
+ %364 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i126, float %353, float %348) #4, !dbg !21
587
+ %365 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i126, float %353, float %348) #4, !dbg !21
588
+ %.05.i128 = select i1 %.not5.i127, float %365, float %364, !dbg !21
589
+ %366 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
590
+ %.not6.i129 = icmp eq i32 %366, 0, !dbg !21
591
+ %367 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i128, float %353, float %347) #4, !dbg !21
592
+ %368 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i128, float %353, float %347) #4, !dbg !21
593
+ %.06.i130 = select i1 %.not6.i129, float %368, float %367, !dbg !21
594
+ %369 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
595
+ %.not7.i131 = icmp eq i32 %369, 0, !dbg !21
596
+ %370 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i130, float %353, float %346) #4, !dbg !21
597
+ %371 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i130, float %353, float %346) #4, !dbg !21
598
+ %.07.i132 = select i1 %.not7.i131, float %371, float %370, !dbg !21
599
+ %372 = fneg float %353, !dbg !21
600
+ %373 = select i1 %342, float %372, float %61, !dbg !21
601
+ %374 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
602
+ %.not8.i133 = icmp eq i32 %374, 0, !dbg !21
603
+ %375 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i132, float %373, float %373) #4, !dbg !21
604
+ %376 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i132, float %373, float %373) #4, !dbg !21
605
+ %.08.i134 = select i1 %.not8.i133, float %376, float %375, !dbg !21
606
+ br i1 %342, label %377, label %__nv_erff.exit139, !dbg !21
607
+
608
+ 377: ; preds = %__internal_fmad.exit.i120
609
+ %378 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i134) #4, !dbg !21
610
+ %379 = fsub float 1.000000e+00, %378, !dbg !21
611
+ %380 = bitcast float %379 to i32, !dbg !21
612
+ %381 = bitcast float %61 to i32, !dbg !21
613
+ %382 = and i32 %381, -2147483648, !dbg !21
614
+ %383 = or i32 %382, %380, !dbg !21
615
+ %384 = bitcast i32 %383 to float, !dbg !21
616
+ br label %__nv_erff.exit139, !dbg !21
617
+
618
+ __nv_erff.exit139: ; preds = %__internal_fmad.exit.i120, %377
619
+ %r.0.i135 = phi float [ %384, %377 ], [ %.08.i134, %__internal_fmad.exit.i120 ], !dbg !21
620
+ %385 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
621
+ %.not.i140 = icmp eq i32 %385, 0, !dbg !21
622
+ %386 = tail call float @llvm.nvvm.fabs.ftz.f(float %62) #4, !dbg !21
623
+ %387 = tail call float @llvm.nvvm.fabs.f(float %62) #4, !dbg !21
624
+ %.0.i141 = select i1 %.not.i140, float %387, float %386, !dbg !21
625
+ %388 = fcmp oge float %.0.i141, 0x3FF00C1FC0000000, !dbg !21
626
+ br i1 %388, label %__nv_fabsf.exit1.i158, label %390, !dbg !21
627
+
628
+ __nv_fabsf.exit1.i158: ; preds = %__nv_erff.exit139
629
+ %389 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
630
+ %.not1.i159 = icmp eq i32 %389, 0, !dbg !21
631
+ %.01.i160 = select i1 %.not1.i159, float %387, float %386, !dbg !21
632
+ br label %__internal_fmad.exit.i142, !dbg !21
633
+
634
+ 390: ; preds = %__nv_erff.exit139
635
+ %391 = fmul float %62, %62, !dbg !21
636
+ br label %__internal_fmad.exit.i142, !dbg !21
637
+
638
+ __internal_fmad.exit.i142: ; preds = %390, %__nv_fabsf.exit1.i158
639
+ %392 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i158 ], [ 0x3FC06EBA60000000, %390 ], !dbg !21
640
+ %393 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i158 ], [ 0xBFD8127580000000, %390 ], !dbg !21
641
+ %394 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i158 ], [ 0x3FBCE315E0000000, %390 ], !dbg !21
642
+ %395 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i158 ], [ 0xBF9B837CE0000000, %390 ], !dbg !21
643
+ %396 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i158 ], [ 0x3F755ABD40000000, %390 ], !dbg !21
644
+ %397 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i158 ], [ 0xBF4AE9A400000000, %390 ], !dbg !21
645
+ %398 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i158 ], [ 0x3F163D2D40000000, %390 ], !dbg !21
646
+ %399 = phi float [ %.01.i160, %__nv_fabsf.exit1.i158 ], [ %391, %390 ], !dbg !21
647
+ %400 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
648
+ %.not2.i143 = icmp eq i32 %400, 0, !dbg !21
649
+ %401 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %398, float %399, float %397) #4, !dbg !21
650
+ %402 = tail call float @llvm.nvvm.fma.rn.f(float %398, float %399, float %397) #4, !dbg !21
651
+ %.02.i144 = select i1 %.not2.i143, float %402, float %401, !dbg !21
652
+ %403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
653
+ %.not3.i145 = icmp eq i32 %403, 0, !dbg !21
654
+ %404 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i144, float %399, float %396) #4, !dbg !21
655
+ %405 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i144, float %399, float %396) #4, !dbg !21
656
+ %.03.i146 = select i1 %.not3.i145, float %405, float %404, !dbg !21
657
+ %406 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
658
+ %.not4.i147 = icmp eq i32 %406, 0, !dbg !21
659
+ %407 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i146, float %399, float %395) #4, !dbg !21
660
+ %408 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i146, float %399, float %395) #4, !dbg !21
661
+ %.04.i148 = select i1 %.not4.i147, float %408, float %407, !dbg !21
662
+ %409 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
663
+ %.not5.i149 = icmp eq i32 %409, 0, !dbg !21
664
+ %410 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i148, float %399, float %394) #4, !dbg !21
665
+ %411 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i148, float %399, float %394) #4, !dbg !21
666
+ %.05.i150 = select i1 %.not5.i149, float %411, float %410, !dbg !21
667
+ %412 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
668
+ %.not6.i151 = icmp eq i32 %412, 0, !dbg !21
669
+ %413 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i150, float %399, float %393) #4, !dbg !21
670
+ %414 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i150, float %399, float %393) #4, !dbg !21
671
+ %.06.i152 = select i1 %.not6.i151, float %414, float %413, !dbg !21
672
+ %415 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
673
+ %.not7.i153 = icmp eq i32 %415, 0, !dbg !21
674
+ %416 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i152, float %399, float %392) #4, !dbg !21
675
+ %417 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i152, float %399, float %392) #4, !dbg !21
676
+ %.07.i154 = select i1 %.not7.i153, float %417, float %416, !dbg !21
677
+ %418 = fneg float %399, !dbg !21
678
+ %419 = select i1 %388, float %418, float %62, !dbg !21
679
+ %420 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21
680
+ %.not8.i155 = icmp eq i32 %420, 0, !dbg !21
681
+ %421 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i154, float %419, float %419) #4, !dbg !21
682
+ %422 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i154, float %419, float %419) #4, !dbg !21
683
+ %.08.i156 = select i1 %.not8.i155, float %422, float %421, !dbg !21
684
+ br i1 %388, label %423, label %__nv_erff.exit161, !dbg !21
685
+
686
+ 423: ; preds = %__internal_fmad.exit.i142
687
+ %424 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i156) #4, !dbg !21
688
+ %425 = fsub float 1.000000e+00, %424, !dbg !21
689
+ %426 = bitcast float %425 to i32, !dbg !21
690
+ %427 = bitcast float %62 to i32, !dbg !21
691
+ %428 = and i32 %427, -2147483648, !dbg !21
692
+ %429 = or i32 %428, %426, !dbg !21
693
+ %430 = bitcast i32 %429 to float, !dbg !21
694
+ br label %__nv_erff.exit161, !dbg !21
695
+
696
+ __nv_erff.exit161: ; preds = %__internal_fmad.exit.i142, %423
697
+ %r.0.i157 = phi float [ %430, %423 ], [ %.08.i156, %__internal_fmad.exit.i142 ], !dbg !21
698
+ %431 = fadd float %r.0.i, 1.000000e+00, !dbg !22
699
+ %432 = fadd float %r.0.i25, 1.000000e+00, !dbg !22
700
+ %433 = fadd float %r.0.i47, 1.000000e+00, !dbg !22
701
+ %434 = fadd float %r.0.i69, 1.000000e+00, !dbg !22
702
+ %435 = fadd float %r.0.i91, 1.000000e+00, !dbg !22
703
+ %436 = fadd float %r.0.i113, 1.000000e+00, !dbg !22
704
+ %437 = fadd float %r.0.i135, 1.000000e+00, !dbg !22
705
+ %438 = fadd float %r.0.i157, 1.000000e+00, !dbg !22
706
+ %439 = fmul float %431, 5.000000e-01, !dbg !23
707
+ %440 = fmul float %432, 5.000000e-01, !dbg !23
708
+ %441 = fmul float %433, 5.000000e-01, !dbg !23
709
+ %442 = fmul float %434, 5.000000e-01, !dbg !23
710
+ %443 = fmul float %435, 5.000000e-01, !dbg !23
711
+ %444 = fmul float %436, 5.000000e-01, !dbg !23
712
+ %445 = fmul float %437, 5.000000e-01, !dbg !23
713
+ %446 = fmul float %438, 5.000000e-01, !dbg !23
714
+ %447 = fmul float %47, %47, !dbg !24
715
+ %448 = fmul float %48, %48, !dbg !24
716
+ %449 = fmul float %49, %49, !dbg !24
717
+ %450 = fmul float %50, %50, !dbg !24
718
+ %451 = fmul float %51, %51, !dbg !24
719
+ %452 = fmul float %52, %52, !dbg !24
720
+ %453 = fmul float %53, %53, !dbg !24
721
+ %454 = fmul float %54, %54, !dbg !24
722
+ %455 = fmul float %447, -5.000000e-01, !dbg !25
723
+ %456 = fmul float %448, -5.000000e-01, !dbg !25
724
+ %457 = fmul float %449, -5.000000e-01, !dbg !25
725
+ %458 = fmul float %450, -5.000000e-01, !dbg !25
726
+ %459 = fmul float %451, -5.000000e-01, !dbg !25
727
+ %460 = fmul float %452, -5.000000e-01, !dbg !25
728
+ %461 = fmul float %453, -5.000000e-01, !dbg !25
729
+ %462 = fmul float %454, -5.000000e-01, !dbg !25
730
+ %463 = fmul float %455, 0x3FF7154760000000, !dbg !26
731
+ %464 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %463) #4, !dbg !26
732
+ %465 = fmul float %456, 0x3FF7154760000000, !dbg !26
733
+ %466 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %465) #4, !dbg !26
734
+ %467 = fmul float %457, 0x3FF7154760000000, !dbg !26
735
+ %468 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %467) #4, !dbg !26
736
+ %469 = fmul float %458, 0x3FF7154760000000, !dbg !26
737
+ %470 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %469) #4, !dbg !26
738
+ %471 = fmul float %459, 0x3FF7154760000000, !dbg !26
739
+ %472 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %471) #4, !dbg !26
740
+ %473 = fmul float %460, 0x3FF7154760000000, !dbg !26
741
+ %474 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %473) #4, !dbg !26
742
+ %475 = fmul float %461, 0x3FF7154760000000, !dbg !26
743
+ %476 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %475) #4, !dbg !26
744
+ %477 = fmul float %462, 0x3FF7154760000000, !dbg !26
745
+ %478 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %477) #4, !dbg !26
746
+ %479 = fmul float %464, 0x3FD9884540000000, !dbg !27
747
+ %480 = fmul float %466, 0x3FD9884540000000, !dbg !27
748
+ %481 = fmul float %468, 0x3FD9884540000000, !dbg !27
749
+ %482 = fmul float %470, 0x3FD9884540000000, !dbg !27
750
+ %483 = fmul float %472, 0x3FD9884540000000, !dbg !27
751
+ %484 = fmul float %474, 0x3FD9884540000000, !dbg !27
752
+ %485 = fmul float %476, 0x3FD9884540000000, !dbg !27
753
+ %486 = fmul float %478, 0x3FD9884540000000, !dbg !27
754
+ %487 = fmul float %47, %479, !dbg !28
755
+ %488 = fmul float %48, %480, !dbg !28
756
+ %489 = fmul float %49, %481, !dbg !28
757
+ %490 = fmul float %50, %482, !dbg !28
758
+ %491 = fmul float %51, %483, !dbg !28
759
+ %492 = fmul float %52, %484, !dbg !28
760
+ %493 = fmul float %53, %485, !dbg !28
761
+ %494 = fmul float %54, %486, !dbg !28
762
+ %495 = fadd float %439, %487, !dbg !29
763
+ %496 = fadd float %440, %488, !dbg !29
764
+ %497 = fadd float %441, %489, !dbg !29
765
+ %498 = fadd float %442, %490, !dbg !29
766
+ %499 = fadd float %443, %491, !dbg !29
767
+ %500 = fadd float %444, %492, !dbg !29
768
+ %501 = fadd float %445, %493, !dbg !29
769
+ %502 = fadd float %446, %494, !dbg !29
770
+ %503 = fmul float %25, %495, !dbg !30
771
+ %504 = fmul float %26, %496, !dbg !30
772
+ %505 = fmul float %27, %497, !dbg !30
773
+ %506 = fmul float %28, %498, !dbg !30
774
+ %507 = fmul float %29, %499, !dbg !30
775
+ %508 = fmul float %30, %500, !dbg !30
776
+ %509 = fmul float %31, %501, !dbg !30
777
+ %510 = fmul float %32, %502, !dbg !30
778
+ %511 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %503) #4, !dbg !31
779
+ %512 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %504) #4, !dbg !31
780
+ %513 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %505) #4, !dbg !31
781
+ %514 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %506) #4, !dbg !31
782
+ %515 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %507) #4, !dbg !31
783
+ %516 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %508) #4, !dbg !31
784
+ %517 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %509) #4, !dbg !31
785
+ %518 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %510) #4, !dbg !31
786
+ %519 = insertelement <2 x i16> undef, i16 %511, i64 0, !dbg !31
787
+ %520 = insertelement <2 x i16> %519, i16 %512, i64 1, !dbg !31
788
+ %521 = bitcast <2 x i16> %520 to i32, !dbg !31
789
+ %522 = insertelement <2 x i16> undef, i16 %513, i64 0, !dbg !31
790
+ %523 = insertelement <2 x i16> %522, i16 %514, i64 1, !dbg !31
791
+ %524 = bitcast <2 x i16> %523 to i32, !dbg !31
792
+ %525 = insertelement <2 x i16> undef, i16 %515, i64 0, !dbg !31
793
+ %526 = insertelement <2 x i16> %525, i16 %516, i64 1, !dbg !31
794
+ %527 = bitcast <2 x i16> %526 to i32, !dbg !31
795
+ %528 = insertelement <2 x i16> undef, i16 %517, i64 0, !dbg !31
796
+ %529 = insertelement <2 x i16> %528, i16 %518, i64 1, !dbg !31
797
+ %530 = bitcast <2 x i16> %529 to i32, !dbg !31
798
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %521, i32 %524, i32 %527, i32 %530, ptr addrspace(1) %11, i1 true) #4, !dbg !31
799
+ ret void, !dbg !32
800
+ }
801
+
802
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
803
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
804
+
805
+ ; Function Attrs: alwaysinline nounwind
806
+ define float @__nv_erff(float %a) local_unnamed_addr #1 {
807
+ __nv_fabsf.exit:
808
+ %0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
809
+ %.not = icmp eq i32 %0, 0
810
+ %1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
811
+ %2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
812
+ %.0 = select i1 %.not, float %2, float %1
813
+ %3 = fcmp oge float %.0, 0x3FF00C1FC0000000
814
+ br i1 %3, label %__nv_fabsf.exit1, label %5
815
+
816
+ __nv_fabsf.exit1: ; preds = %__nv_fabsf.exit
817
+ %4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
818
+ %.not1 = icmp eq i32 %4, 0
819
+ %.01 = select i1 %.not1, float %2, float %1
820
+ br label %__internal_fmad.exit
821
+
822
+ 5: ; preds = %__nv_fabsf.exit
823
+ %6 = fmul float %a, %a
824
+ br label %__internal_fmad.exit
825
+
826
+ __internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1
827
+ %7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
828
+ %8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
829
+ %9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
830
+ %10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
831
+ %11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
832
+ %12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
833
+ %13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
834
+ %14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
835
+ %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
836
+ %.not2 = icmp eq i32 %15, 0
837
+ %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
838
+ %17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
839
+ %.02 = select i1 %.not2, float %17, float %16
840
+ %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
841
+ %.not3 = icmp eq i32 %18, 0
842
+ %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
843
+ %20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
844
+ %.03 = select i1 %.not3, float %20, float %19
845
+ %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
846
+ %.not4 = icmp eq i32 %21, 0
847
+ %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
848
+ %23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
849
+ %.04 = select i1 %.not4, float %23, float %22
850
+ %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
851
+ %.not5 = icmp eq i32 %24, 0
852
+ %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
853
+ %26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
854
+ %.05 = select i1 %.not5, float %26, float %25
855
+ %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
856
+ %.not6 = icmp eq i32 %27, 0
857
+ %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
858
+ %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
859
+ %.06 = select i1 %.not6, float %29, float %28
860
+ %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
861
+ %.not7 = icmp eq i32 %30, 0
862
+ %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
863
+ %32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
864
+ %.07 = select i1 %.not7, float %32, float %31
865
+ %33 = fneg float %14
866
+ %34 = select i1 %3, float %33, float %a
867
+ %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
868
+ %.not8 = icmp eq i32 %35, 0
869
+ %36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
870
+ %37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
871
+ %.08 = select i1 %.not8, float %37, float %36
872
+ br i1 %3, label %38, label %46
873
+
874
+ 38: ; preds = %__internal_fmad.exit
875
+ %39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
876
+ %40 = fsub float 1.000000e+00, %39
877
+ %41 = bitcast float %40 to i32
878
+ %42 = bitcast float %a to i32
879
+ %43 = and i32 %42, -2147483648
880
+ %44 = or i32 %43, %41
881
+ %45 = bitcast i32 %44 to float
882
+ br label %46
883
+
884
+ 46: ; preds = %38, %__internal_fmad.exit
885
+ %r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
886
+ ret float %r.0
887
+ }
888
+
889
+ declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
890
+
891
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
892
+ declare float @llvm.nvvm.fabs.ftz.f(float) #0
893
+
894
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
895
+ declare float @llvm.nvvm.fabs.f(float) #0
896
+
897
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
898
+ declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
899
+
900
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
901
+ declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
902
+
903
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
904
+ declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
905
+
906
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
907
+ attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
908
+ attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
909
+ attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
910
+ attributes #4 = { nounwind }
911
+
912
+ !llvm.module.flags = !{!0, !1}
913
+ !llvm.dbg.cu = !{!2}
914
+ !nvvm.annotations = !{!4, !5, !5, !4}
915
+ !llvm.ident = !{!6}
916
+
917
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
918
+ !1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
919
+ !2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
920
+ !3 = !DIFile(filename: "c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py", directory: "/tmp/torchinductor_root/5j")
921
+ !4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
922
+ !5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
923
+ !6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
924
+ !7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
925
+ !8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
926
+ !9 = !{}
927
+ !10 = !DILocation(line: 21, column: 36, scope: !7)
928
+ !11 = !DILocation(line: 20, column: 28, scope: !7)
929
+ !12 = !DILocation(line: 20, column: 33, scope: !7)
930
+ !13 = !DILocation(line: 21, column: 23, scope: !7)
931
+ !14 = !DILocation(line: 24, column: 34, scope: !7)
932
+ !15 = !DILocation(line: 24, column: 39, scope: !7)
933
+ !16 = !DILocation(line: 24, column: 48, scope: !7)
934
+ !17 = !DILocation(line: 25, column: 30, scope: !7)
935
+ !18 = !DILocation(line: 25, column: 35, scope: !7)
936
+ !19 = !DILocation(line: 25, column: 44, scope: !7)
937
+ !20 = !DILocation(line: 29, column: 18, scope: !7)
938
+ !21 = !DILocation(line: 30, column: 23, scope: !7)
939
+ !22 = !DILocation(line: 32, column: 18, scope: !7)
940
+ !23 = !DILocation(line: 34, column: 19, scope: !7)
941
+ !24 = !DILocation(line: 35, column: 19, scope: !7)
942
+ !25 = !DILocation(line: 37, column: 20, scope: !7)
943
+ !26 = !DILocation(line: 38, column: 19, scope: !7)
944
+ !27 = !DILocation(line: 40, column: 20, scope: !7)
945
+ !28 = !DILocation(line: 41, column: 19, scope: !7)
946
+ !29 = !DILocation(line: 42, column: 20, scope: !7)
947
+ !30 = !DILocation(line: 43, column: 19, scope: !7)
948
+ !31 = !DILocation(line: 45, column: 40, scope: !7)
949
+ !32 = !DILocation(line: 45, column: 4, scope: !7)
.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttgir ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<0.398942292> : tensor<1024xf32, #blocked>
5
+ %cst_0 = arith.constant dense<-5.000000e-01> : tensor<1024xf32, #blocked>
6
+ %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32, #blocked>
7
+ %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked>
8
+ %cst_3 = arith.constant dense<0.707106769> : tensor<1024xf32, #blocked>
9
+ %c1024_i32 = arith.constant 1024 : i32
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = arith.muli %0, %c1024_i32 : i32
12
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
13
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
14
+ %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
15
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
16
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
17
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
18
+ %8 = arith.extf %7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
19
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
20
+ %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
21
+ %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
22
+ %12 = arith.extf %11 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
23
+ %13 = arith.mulf %12, %cst_3 : tensor<1024xf32, #blocked>
24
+ %14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked>
25
+ %15 = arith.addf %14, %cst_2 : tensor<1024xf32, #blocked>
26
+ %16 = arith.mulf %15, %cst_1 : tensor<1024xf32, #blocked>
27
+ %17 = arith.mulf %12, %12 : tensor<1024xf32, #blocked>
28
+ %18 = arith.mulf %17, %cst_0 : tensor<1024xf32, #blocked>
29
+ %19 = math.exp %18 : tensor<1024xf32, #blocked>
30
+ %20 = arith.mulf %19, %cst : tensor<1024xf32, #blocked>
31
+ %21 = arith.mulf %12, %20 : tensor<1024xf32, #blocked>
32
+ %22 = arith.addf %16, %21 : tensor<1024xf32, #blocked>
33
+ %23 = arith.mulf %8, %22 : tensor<1024xf32, #blocked>
34
+ %24 = arith.truncf %23 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
35
+ tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
36
+ tt.return
37
+ }
38
+ }
.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttir ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.398942292> : tensor<1024xf32>
4
+ %cst_0 = arith.constant dense<-5.000000e-01> : tensor<1024xf32>
5
+ %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32>
6
+ %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32>
7
+ %cst_3 = arith.constant dense<0.707106769> : tensor<1024xf32>
8
+ %c1024_i32 = arith.constant 1024 : i32
9
+ %0 = tt.get_program_id x : i32
10
+ %1 = arith.muli %0, %c1024_i32 : i32
11
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
12
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
13
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
14
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
15
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
16
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
17
+ %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
18
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
19
+ %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
20
+ %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
21
+ %12 = arith.extf %11 : tensor<1024xbf16> to tensor<1024xf32>
22
+ %13 = arith.mulf %12, %cst_3 : tensor<1024xf32>
23
+ %14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32>
24
+ %15 = arith.addf %14, %cst_2 : tensor<1024xf32>
25
+ %16 = arith.mulf %15, %cst_1 : tensor<1024xf32>
26
+ %17 = arith.mulf %12, %12 : tensor<1024xf32>
27
+ %18 = arith.mulf %17, %cst_0 : tensor<1024xf32>
28
+ %19 = math.exp %18 : tensor<1024xf32>
29
+ %20 = arith.mulf %19, %cst : tensor<1024xf32>
30
+ %21 = arith.mulf %12, %20 : tensor<1024xf32>
31
+ %22 = arith.addf %16, %21 : tensor<1024xf32>
32
+ %23 = arith.mulf %8, %22 : tensor<1024xf32>
33
+ %24 = arith.truncf %23 : tensor<1024xf32> to tensor<1024xbf16>
34
+ tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
35
+ tt.return
36
+ }
37
+ }
.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.llir ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3d4d5d6d7d8de9de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, i32 %8, i32 %9) local_unnamed_addr !dbg !5 {
7
+ %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %12 = and i32 %11, 31, !dbg !8
9
+ %13 = lshr i32 %11, 5, !dbg !8
10
+ %14 = and i32 %13, 1, !dbg !8
11
+ %urem = shl i32 %11, 2, !dbg !8
12
+ %15 = and i32 %urem, 252, !dbg !8
13
+ %16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
14
+ %17 = shl i32 %16, 8, !dbg !10
15
+ %18 = or i32 %17, %15, !dbg !11
16
+ %19 = sext i32 %18 to i64, !dbg !12
17
+ %20 = getelementptr i16, ptr addrspace(1) %1, i64 %19, !dbg !12
18
+ %21 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13
19
+ %22 = extractvalue { i32, i32 } %21, 0, !dbg !13
20
+ %23 = extractvalue { i32, i32 } %21, 1, !dbg !13
21
+ %24 = trunc i32 %22 to i16, !dbg !13
22
+ %extelt.offset = lshr i32 %22, 16, !dbg !13
23
+ %25 = trunc i32 %extelt.offset to i16, !dbg !13
24
+ %26 = trunc i32 %23 to i16, !dbg !13
25
+ %extelt.offset1 = lshr i32 %23, 16, !dbg !13
26
+ %27 = trunc i32 %extelt.offset1 to i16, !dbg !13
27
+ %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #3, !dbg !14
28
+ %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #3, !dbg !14
29
+ %30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %26) #3, !dbg !14
30
+ %31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %27) #3, !dbg !14
31
+ %32 = zext nneg i32 %15 to i64, !dbg !15
32
+ %33 = getelementptr float, ptr addrspace(1) %2, i64 %32, !dbg !15
33
+ %34 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %33, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16
34
+ %35 = extractvalue { i32, i32, i32, i32 } %34, 0, !dbg !16
35
+ %36 = extractvalue { i32, i32, i32, i32 } %34, 1, !dbg !16
36
+ %37 = extractvalue { i32, i32, i32, i32 } %34, 2, !dbg !16
37
+ %38 = extractvalue { i32, i32, i32, i32 } %34, 3, !dbg !16
38
+ %39 = bitcast i32 %35 to float, !dbg !16
39
+ %40 = bitcast i32 %36 to float, !dbg !16
40
+ %41 = bitcast i32 %37 to float, !dbg !16
41
+ %42 = bitcast i32 %38 to float, !dbg !16
42
+ %43 = getelementptr float, ptr addrspace(1) %3, i64 %19, !dbg !17
43
+ %44 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %43, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18
44
+ %45 = extractvalue { i32, i32, i32, i32 } %44, 0, !dbg !18
45
+ %46 = extractvalue { i32, i32, i32, i32 } %44, 1, !dbg !18
46
+ %47 = extractvalue { i32, i32, i32, i32 } %44, 2, !dbg !18
47
+ %48 = extractvalue { i32, i32, i32, i32 } %44, 3, !dbg !18
48
+ %49 = bitcast i32 %45 to float, !dbg !18
49
+ %50 = bitcast i32 %46 to float, !dbg !18
50
+ %51 = bitcast i32 %47 to float, !dbg !18
51
+ %52 = bitcast i32 %48 to float, !dbg !18
52
+ %53 = getelementptr i16, ptr addrspace(1) %4, i64 %19, !dbg !19
53
+ %54 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %53, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20
54
+ %55 = extractvalue { i32, i32 } %54, 0, !dbg !20
55
+ %56 = extractvalue { i32, i32 } %54, 1, !dbg !20
56
+ %57 = trunc i32 %55 to i16, !dbg !20
57
+ %extelt.offset2 = lshr i32 %55, 16, !dbg !20
58
+ %58 = trunc i32 %extelt.offset2 to i16, !dbg !20
59
+ %59 = trunc i32 %56 to i16, !dbg !20
60
+ %extelt.offset3 = lshr i32 %56, 16, !dbg !20
61
+ %60 = trunc i32 %extelt.offset3 to i16, !dbg !20
62
+ %61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #3, !dbg !21
63
+ %62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %58) #3, !dbg !21
64
+ %63 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #3, !dbg !21
65
+ %64 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %60) #3, !dbg !21
66
+ %65 = sext i32 %16 to i64, !dbg !22
67
+ %66 = getelementptr float, ptr addrspace(1) %5, i64 %65, !dbg !22
68
+ %67 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %66, i1 true) #3, !dbg !23
69
+ %68 = bitcast i32 %67 to float, !dbg !23
70
+ %69 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %66, i1 true) #3, !dbg !23
71
+ %70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %66, i1 true) #3, !dbg !23
72
+ %71 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %66, i1 true) #3, !dbg !23
73
+ %72 = getelementptr float, ptr addrspace(1) %6, i64 %65, !dbg !24
74
+ %73 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !25
75
+ %74 = bitcast i32 %73 to float, !dbg !25
76
+ %75 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !25
77
+ %76 = bitcast i32 %75 to float, !dbg !25
78
+ %77 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !25
79
+ %78 = bitcast i32 %77 to float, !dbg !25
80
+ %79 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !25
81
+ %80 = bitcast i32 %79 to float, !dbg !25
82
+ %81 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !26
83
+ %82 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %81, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !27
84
+ %83 = extractvalue { i32, i32, i32, i32 } %82, 0, !dbg !27
85
+ %84 = extractvalue { i32, i32, i32, i32 } %82, 1, !dbg !27
86
+ %85 = extractvalue { i32, i32, i32, i32 } %82, 2, !dbg !27
87
+ %86 = extractvalue { i32, i32, i32, i32 } %82, 3, !dbg !27
88
+ %87 = bitcast i32 %83 to float, !dbg !27
89
+ %88 = bitcast i32 %84 to float, !dbg !27
90
+ %89 = bitcast i32 %85 to float, !dbg !27
91
+ %90 = bitcast i32 %86 to float, !dbg !27
92
+ %91 = fmul float %28, %39, !dbg !28
93
+ %92 = fmul float %29, %40, !dbg !28
94
+ %93 = fmul float %30, %41, !dbg !28
95
+ %94 = fmul float %31, %42, !dbg !28
96
+ %95 = fadd float %91, %92, !dbg !29
97
+ %96 = fadd float %93, %95, !dbg !29
98
+ %97 = fadd float %94, %96, !dbg !29
99
+ %98 = bitcast float %97 to i32, !dbg !35
100
+ %99 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %98, i32 16, i32 31), !dbg !35
101
+ %100 = bitcast i32 %99 to float, !dbg !35
102
+ %101 = fadd float %97, %100, !dbg !29
103
+ %102 = bitcast float %101 to i32, !dbg !35
104
+ %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 8, i32 31), !dbg !35
105
+ %104 = bitcast i32 %103 to float, !dbg !35
106
+ %105 = fadd float %101, %104, !dbg !29
107
+ %106 = bitcast float %105 to i32, !dbg !35
108
+ %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 4, i32 31), !dbg !35
109
+ %108 = bitcast i32 %107 to float, !dbg !35
110
+ %109 = fadd float %105, %108, !dbg !29
111
+ %110 = bitcast float %109 to i32, !dbg !35
112
+ %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 2, i32 31), !dbg !35
113
+ %112 = bitcast i32 %111 to float, !dbg !35
114
+ %113 = fadd float %109, %112, !dbg !29
115
+ %114 = bitcast float %113 to i32, !dbg !35
116
+ %115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 1, i32 31), !dbg !35
117
+ %116 = bitcast i32 %115 to float, !dbg !35
118
+ %117 = fadd float %113, %116, !dbg !29
119
+ %118 = icmp eq i32 %12, 0, !dbg !35
120
+ %119 = zext nneg i32 %14 to i64, !dbg !35
121
+ %120 = getelementptr float, ptr addrspace(3) @global_smem, i64 %119, !dbg !35
122
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %117, i1 %118) #3, !dbg !35
123
+ tail call void @llvm.nvvm.barrier0(), !dbg !35
124
+ %121 = icmp slt i32 %11, 2, !dbg !35
125
+ %122 = sext i32 %11 to i64, !dbg !35
126
+ %123 = getelementptr float, ptr addrspace(3) @global_smem, i64 %122, !dbg !35
127
+ %124 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %123, i1 %121) #3, !dbg !35
128
+ %125 = bitcast float %124 to i32, !dbg !35
129
+ %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 1, i32 31), !dbg !35
130
+ %127 = bitcast i32 %126 to float, !dbg !35
131
+ %128 = fadd float %124, %127, !dbg !29
132
+ %129 = and i32 %11, 1, !dbg !35
133
+ %130 = icmp eq i32 %129, 0, !dbg !35
134
+ %131 = and i1 %121, %130, !dbg !35
135
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %123, float %128, i1 %131) #3, !dbg !35
136
+ tail call void @llvm.nvvm.barrier0(), !dbg !35
137
+ %132 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !35
138
+ %133 = fadd float %132, 0.000000e+00, !dbg !37
139
+ %134 = fadd float %61, %49, !dbg !41
140
+ %135 = fadd float %62, %50, !dbg !41
141
+ %136 = fadd float %63, %51, !dbg !41
142
+ %137 = fadd float %64, %52, !dbg !41
143
+ %138 = fsub float %134, %68, !dbg !42
144
+ %139 = fsub float %135, %68, !dbg !42
145
+ %140 = fsub float %136, %68, !dbg !42
146
+ %141 = fsub float %137, %68, !dbg !42
147
+ %142 = fmul float %138, %74, !dbg !43
148
+ %143 = fmul float %139, %74, !dbg !43
149
+ %144 = fmul float %140, %74, !dbg !43
150
+ %145 = fmul float %141, %74, !dbg !43
151
+ %146 = fmul float %91, %142, !dbg !44
152
+ %147 = fmul float %92, %143, !dbg !44
153
+ %148 = fmul float %93, %144, !dbg !44
154
+ %149 = fmul float %94, %145, !dbg !44
155
+ tail call void @llvm.nvvm.barrier0(), !dbg !45
156
+ %150 = fadd float %146, %147, !dbg !47
157
+ %151 = fadd float %148, %150, !dbg !47
158
+ %152 = fadd float %149, %151, !dbg !47
159
+ %153 = bitcast float %152 to i32, !dbg !45
160
+ %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 16, i32 31), !dbg !45
161
+ %155 = bitcast i32 %154 to float, !dbg !45
162
+ %156 = fadd float %152, %155, !dbg !47
163
+ %157 = bitcast float %156 to i32, !dbg !45
164
+ %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 8, i32 31), !dbg !45
165
+ %159 = bitcast i32 %158 to float, !dbg !45
166
+ %160 = fadd float %156, %159, !dbg !47
167
+ %161 = bitcast float %160 to i32, !dbg !45
168
+ %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 4, i32 31), !dbg !45
169
+ %163 = bitcast i32 %162 to float, !dbg !45
170
+ %164 = fadd float %160, %163, !dbg !47
171
+ %165 = bitcast float %164 to i32, !dbg !45
172
+ %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 2, i32 31), !dbg !45
173
+ %167 = bitcast i32 %166 to float, !dbg !45
174
+ %168 = fadd float %164, %167, !dbg !47
175
+ %169 = bitcast float %168 to i32, !dbg !45
176
+ %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 1, i32 31), !dbg !45
177
+ %171 = bitcast i32 %170 to float, !dbg !45
178
+ %172 = fadd float %168, %171, !dbg !47
179
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %172, i1 %118) #3, !dbg !45
180
+ tail call void @llvm.nvvm.barrier0(), !dbg !45
181
+ %173 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %123, i1 %121) #3, !dbg !45
182
+ %174 = bitcast float %173 to i32, !dbg !45
183
+ %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 1, i32 31), !dbg !45
184
+ %176 = bitcast i32 %175 to float, !dbg !45
185
+ %177 = fadd float %173, %176, !dbg !47
186
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %123, float %177, i1 %131) #3, !dbg !45
187
+ tail call void @llvm.nvvm.barrier0(), !dbg !45
188
+ %178 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !45
189
+ %179 = fadd float %178, 0.000000e+00, !dbg !50
190
+ %180 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %74, float 2.560000e+02) #3, !dbg !52
191
+ %181 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %76, float 2.560000e+02) #3, !dbg !52
192
+ %182 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %78, float 2.560000e+02) #3, !dbg !52
193
+ %183 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %80, float 2.560000e+02) #3, !dbg !52
194
+ %184 = fmul float %91, 2.560000e+02, !dbg !53
195
+ %185 = fmul float %92, 2.560000e+02, !dbg !53
196
+ %186 = fmul float %93, 2.560000e+02, !dbg !53
197
+ %187 = fmul float %94, 2.560000e+02, !dbg !53
198
+ %188 = fsub float %184, %133, !dbg !54
199
+ %189 = fsub float %185, %133, !dbg !54
200
+ %190 = fsub float %186, %133, !dbg !54
201
+ %191 = fsub float %187, %133, !dbg !54
202
+ %192 = fmul float %142, %179, !dbg !55
203
+ %193 = fmul float %143, %179, !dbg !55
204
+ %194 = fmul float %144, %179, !dbg !55
205
+ %195 = fmul float %145, %179, !dbg !55
206
+ %196 = fsub float %188, %192, !dbg !56
207
+ %197 = fsub float %189, %193, !dbg !56
208
+ %198 = fsub float %190, %194, !dbg !56
209
+ %199 = fsub float %191, %195, !dbg !56
210
+ %200 = fmul float %180, %196, !dbg !57
211
+ %201 = fmul float %180, %197, !dbg !57
212
+ %202 = fmul float %180, %198, !dbg !57
213
+ %203 = fmul float %180, %199, !dbg !57
214
+ %204 = fadd float %200, %87, !dbg !58
215
+ %205 = fadd float %201, %88, !dbg !58
216
+ %206 = fadd float %202, %89, !dbg !58
217
+ %207 = fadd float %203, %90, !dbg !58
218
+ %208 = bitcast float %204 to i32, !dbg !59
219
+ %209 = bitcast float %205 to i32, !dbg !59
220
+ %210 = bitcast float %206 to i32, !dbg !59
221
+ %211 = bitcast float %207 to i32, !dbg !59
222
+ tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %208, i32 %209, i32 %210, i32 %211, ptr addrspace(1) %81, i1 true) #3, !dbg !59
223
+ %212 = getelementptr i16, ptr addrspace(1) %7, i64 %19, !dbg !60
224
+ %213 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %204) #3, !dbg !61
225
+ %214 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %205) #3, !dbg !61
226
+ %215 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %206) #3, !dbg !61
227
+ %216 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %207) #3, !dbg !61
228
+ %217 = insertelement <2 x i16> undef, i16 %213, i64 0, !dbg !61
229
+ %218 = insertelement <2 x i16> %217, i16 %214, i64 1, !dbg !61
230
+ %219 = bitcast <2 x i16> %218 to i32, !dbg !61
231
+ %220 = insertelement <2 x i16> undef, i16 %215, i64 0, !dbg !61
232
+ %221 = insertelement <2 x i16> %220, i16 %216, i64 1, !dbg !61
233
+ %222 = bitcast <2 x i16> %221 to i32, !dbg !61
234
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %219, i32 %222, ptr addrspace(1) %212, i1 true) #3, !dbg !61
235
+ ret void, !dbg !62
236
+ }
237
+
238
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
239
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
240
+
241
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
242
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
243
+
244
+ ; Function Attrs: convergent nocallback nounwind
245
+ declare void @llvm.nvvm.barrier0() #2
246
+
247
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
248
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
249
+ attributes #2 = { convergent nocallback nounwind }
250
+ attributes #3 = { nounwind }
251
+
252
+ !llvm.module.flags = !{!0}
253
+ !llvm.dbg.cu = !{!1}
254
+ !nvvm.annotations = !{!3, !4, !4, !3}
255
+
256
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
257
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
258
+ !2 = !DIFile(filename: "cfhjzwujbd4bpel57x4hxw7d3m3qqfwrjg6bfe6e4wk2cyh77u45.py", directory: "/tmp/torchinductor_root/fh")
259
+ !3 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"kernel", i32 1}
260
+ !4 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"maxntidx", i32 64}
261
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8de9de", linkageName: "triton__0d1d2d3d4d5d6d7d8de9de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
262
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
263
+ !7 = !{}
264
+ !8 = !DILocation(line: 26, column: 26, scope: !5)
265
+ !9 = !DILocation(line: 23, column: 28, scope: !5)
266
+ !10 = !DILocation(line: 30, column: 40, scope: !5)
267
+ !11 = !DILocation(line: 30, column: 36, scope: !5)
268
+ !12 = !DILocation(line: 30, column: 30, scope: !5)
269
+ !13 = !DILocation(line: 30, column: 46, scope: !5)
270
+ !14 = !DILocation(line: 30, column: 67, scope: !5)
271
+ !15 = !DILocation(line: 31, column: 30, scope: !5)
272
+ !16 = !DILocation(line: 31, column: 35, scope: !5)
273
+ !17 = !DILocation(line: 32, column: 30, scope: !5)
274
+ !18 = !DILocation(line: 32, column: 46, scope: !5)
275
+ !19 = !DILocation(line: 33, column: 30, scope: !5)
276
+ !20 = !DILocation(line: 33, column: 46, scope: !5)
277
+ !21 = !DILocation(line: 33, column: 67, scope: !5)
278
+ !22 = !DILocation(line: 34, column: 31, scope: !5)
279
+ !23 = !DILocation(line: 34, column: 36, scope: !5)
280
+ !24 = !DILocation(line: 35, column: 31, scope: !5)
281
+ !25 = !DILocation(line: 35, column: 36, scope: !5)
282
+ !26 = !DILocation(line: 36, column: 35, scope: !5)
283
+ !27 = !DILocation(line: 36, column: 51, scope: !5)
284
+ !28 = !DILocation(line: 38, column: 18, scope: !5)
285
+ !29 = !DILocation(line: 233, column: 15, scope: !30, inlinedAt: !33)
286
+ !30 = distinct !DILexicalBlockFile(scope: !32, file: !31, discriminator: 0)
287
+ !31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
288
+ !32 = distinct !DILexicalBlockFile(scope: !5, file: !31, discriminator: 0)
289
+ !33 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !34)
290
+ !34 = !DILocation(line: 41, column: 57, scope: !30)
291
+ !35 = !DILocation(line: 243, column: 36, scope: !32, inlinedAt: !36)
292
+ !36 = !DILocation(line: 41, column: 57, scope: !32)
293
+ !37 = !DILocation(line: 8, column: 15, scope: !38, inlinedAt: !40)
294
+ !38 = distinct !DILexicalBlockFile(scope: !5, file: !39, discriminator: 0)
295
+ !39 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
296
+ !40 = !DILocation(line: 41, column: 44, scope: !38)
297
+ !41 = !DILocation(line: 43, column: 19, scope: !5)
298
+ !42 = !DILocation(line: 44, column: 20, scope: !5)
299
+ !43 = !DILocation(line: 45, column: 20, scope: !5)
300
+ !44 = !DILocation(line: 46, column: 19, scope: !5)
301
+ !45 = !DILocation(line: 243, column: 36, scope: !32, inlinedAt: !46)
302
+ !46 = !DILocation(line: 49, column: 59, scope: !32)
303
+ !47 = !DILocation(line: 233, column: 15, scope: !30, inlinedAt: !48)
304
+ !48 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !49)
305
+ !49 = !DILocation(line: 49, column: 59, scope: !30)
306
+ !50 = !DILocation(line: 8, column: 15, scope: !38, inlinedAt: !51)
307
+ !51 = !DILocation(line: 49, column: 45, scope: !38)
308
+ !52 = !DILocation(line: 51, column: 20, scope: !5)
309
+ !53 = !DILocation(line: 52, column: 19, scope: !5)
310
+ !54 = !DILocation(line: 53, column: 20, scope: !5)
311
+ !55 = !DILocation(line: 54, column: 20, scope: !5)
312
+ !56 = !DILocation(line: 55, column: 20, scope: !5)
313
+ !57 = !DILocation(line: 56, column: 20, scope: !5)
314
+ !58 = !DILocation(line: 57, column: 20, scope: !5)
315
+ !59 = !DILocation(line: 59, column: 51, scope: !5)
316
+ !60 = !DILocation(line: 60, column: 25, scope: !5)
317
+ !61 = !DILocation(line: 60, column: 48, scope: !5)
318
+ !62 = !DILocation(line: 60, column: 4, scope: !5)