Add files using upload-large-folder tool
Browse files- .triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.llir +85 -0
- .triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ttgir +152 -0
- .triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ttir +153 -0
- .triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.llir +109 -0
- .triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.llir +600 -0
- .triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ttgir +169 -0
- .triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttgir +18 -0
- .triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ttgir +86 -0
- .triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.cubin +0 -0
- .triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.llir +760 -0
- .triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.llir +839 -0
.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.llir
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
|
5 |
+
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%5 = shl i32 %4, 3, !dbg !8
|
7 |
+
%6 = and i32 %5, 1016, !dbg !8
|
8 |
+
%7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
|
9 |
+
%8 = shl i32 %7, 10, !dbg !10
|
10 |
+
%9 = or i32 %8, %6, !dbg !11
|
11 |
+
%10 = or i32 %9, 4, !dbg !11
|
12 |
+
%11 = sext i32 %9 to i64, !dbg !12
|
13 |
+
%12 = getelementptr float, ptr addrspace(1) %0, i64 %11, !dbg !12
|
14 |
+
%13 = sext i32 %10 to i64, !dbg !12
|
15 |
+
%14 = getelementptr float, ptr addrspace(1) %0, i64 %13, !dbg !12
|
16 |
+
%15 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %12, i1 true) #1, !dbg !13
|
17 |
+
%16 = extractvalue { i32, i32, i32, i32 } %15, 0, !dbg !13
|
18 |
+
%17 = extractvalue { i32, i32, i32, i32 } %15, 1, !dbg !13
|
19 |
+
%18 = extractvalue { i32, i32, i32, i32 } %15, 2, !dbg !13
|
20 |
+
%19 = extractvalue { i32, i32, i32, i32 } %15, 3, !dbg !13
|
21 |
+
%20 = bitcast i32 %16 to float, !dbg !13
|
22 |
+
%21 = bitcast i32 %17 to float, !dbg !13
|
23 |
+
%22 = bitcast i32 %18 to float, !dbg !13
|
24 |
+
%23 = bitcast i32 %19 to float, !dbg !13
|
25 |
+
%24 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %14, i1 true) #1, !dbg !13
|
26 |
+
%25 = extractvalue { i32, i32, i32, i32 } %24, 0, !dbg !13
|
27 |
+
%26 = extractvalue { i32, i32, i32, i32 } %24, 1, !dbg !13
|
28 |
+
%27 = extractvalue { i32, i32, i32, i32 } %24, 2, !dbg !13
|
29 |
+
%28 = extractvalue { i32, i32, i32, i32 } %24, 3, !dbg !13
|
30 |
+
%29 = bitcast i32 %25 to float, !dbg !13
|
31 |
+
%30 = bitcast i32 %26 to float, !dbg !13
|
32 |
+
%31 = bitcast i32 %27 to float, !dbg !13
|
33 |
+
%32 = bitcast i32 %28 to float, !dbg !13
|
34 |
+
%33 = getelementptr i16, ptr addrspace(1) %1, i64 %11, !dbg !14
|
35 |
+
%34 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %20) #1, !dbg !15
|
36 |
+
%35 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %21) #1, !dbg !15
|
37 |
+
%36 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %22) #1, !dbg !15
|
38 |
+
%37 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %23) #1, !dbg !15
|
39 |
+
%38 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %29) #1, !dbg !15
|
40 |
+
%39 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %30) #1, !dbg !15
|
41 |
+
%40 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %31) #1, !dbg !15
|
42 |
+
%41 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %32) #1, !dbg !15
|
43 |
+
%42 = insertelement <2 x i16> undef, i16 %34, i64 0, !dbg !15
|
44 |
+
%43 = insertelement <2 x i16> %42, i16 %35, i64 1, !dbg !15
|
45 |
+
%44 = bitcast <2 x i16> %43 to i32, !dbg !15
|
46 |
+
%45 = insertelement <2 x i16> undef, i16 %36, i64 0, !dbg !15
|
47 |
+
%46 = insertelement <2 x i16> %45, i16 %37, i64 1, !dbg !15
|
48 |
+
%47 = bitcast <2 x i16> %46 to i32, !dbg !15
|
49 |
+
%48 = insertelement <2 x i16> undef, i16 %38, i64 0, !dbg !15
|
50 |
+
%49 = insertelement <2 x i16> %48, i16 %39, i64 1, !dbg !15
|
51 |
+
%50 = bitcast <2 x i16> %49 to i32, !dbg !15
|
52 |
+
%51 = insertelement <2 x i16> undef, i16 %40, i64 0, !dbg !15
|
53 |
+
%52 = insertelement <2 x i16> %51, i16 %41, i64 1, !dbg !15
|
54 |
+
%53 = bitcast <2 x i16> %52 to i32, !dbg !15
|
55 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %44, i32 %47, i32 %50, i32 %53, ptr addrspace(1) %33, i1 true) #1, !dbg !15
|
56 |
+
ret void, !dbg !16
|
57 |
+
}
|
58 |
+
|
59 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
60 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
61 |
+
|
62 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
63 |
+
attributes #1 = { nounwind }
|
64 |
+
|
65 |
+
!llvm.module.flags = !{!0}
|
66 |
+
!llvm.dbg.cu = !{!1}
|
67 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
68 |
+
|
69 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
70 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
71 |
+
!2 = !DIFile(filename: "c5tryp5qwkhreijk7s5x327wofz54lwj4kvctuqdzv2vrf2xyons.py", directory: "/tmp/torchinductor_root/5t")
|
72 |
+
!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
|
73 |
+
!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
|
74 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
75 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
76 |
+
!7 = !{}
|
77 |
+
!8 = !DILocation(line: 21, column: 36, scope: !5)
|
78 |
+
!9 = !DILocation(line: 20, column: 28, scope: !5)
|
79 |
+
!10 = !DILocation(line: 20, column: 33, scope: !5)
|
80 |
+
!11 = !DILocation(line: 21, column: 23, scope: !5)
|
81 |
+
!12 = !DILocation(line: 24, column: 30, scope: !5)
|
82 |
+
!13 = !DILocation(line: 24, column: 35, scope: !5)
|
83 |
+
!14 = !DILocation(line: 26, column: 25, scope: !5)
|
84 |
+
!15 = !DILocation(line: 26, column: 36, scope: !5)
|
85 |
+
!16 = !DILocation(line: 26, column: 4, scope: !5)
|
.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ttgir
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<256> : tensor<64x1xi32, #blocked>
|
7 |
+
%cst_1 = arith.constant dense<256> : tensor<64x1xi64, #blocked>
|
8 |
+
%cst_2 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
|
9 |
+
%cst_3 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
|
10 |
+
%cst_4 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
|
11 |
+
%cst_5 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
|
12 |
+
%c0_i32 = arith.constant 0 : i32
|
13 |
+
%c4_i32 = arith.constant 4 : i32
|
14 |
+
%c256_i32 = arith.constant 256 : i32
|
15 |
+
%cst_6 = arith.constant dense<1.000000e+00> : tensor<64x4xf32, #blocked>
|
16 |
+
%cst_7 = arith.constant 0.000000e+00 : f32
|
17 |
+
%cst_8 = arith.constant dense<0.000000e+00> : tensor<64x4xbf16, #blocked>
|
18 |
+
%cst_9 = arith.constant dense<0.000000e+00> : tensor<1x4xf32, #blocked>
|
19 |
+
%cst_10 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked>
|
20 |
+
%cst_11 = arith.constant dense<256> : tensor<1x4xi32, #blocked>
|
21 |
+
%cst_12 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>
|
22 |
+
%cst_13 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>
|
23 |
+
%c64_i32 = arith.constant 64 : i32
|
24 |
+
%0 = tt.get_program_id x : i32
|
25 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
26 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
27 |
+
%3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
28 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
|
29 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
|
30 |
+
%6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
|
31 |
+
%7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
|
32 |
+
%8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>
|
33 |
+
%9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>
|
34 |
+
%10 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
35 |
+
%11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x4xi32, #blocked>
|
36 |
+
%12 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
|
37 |
+
%13 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
|
38 |
+
%14 = tt.addptr %12, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
|
39 |
+
%15 = tt.addptr %13, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
|
40 |
+
%16 = tt.load %14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
|
41 |
+
%17 = tt.load %15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>
|
42 |
+
%18 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>
|
43 |
+
%19 = arith.muli %18, %cst_0 : tensor<64x1xi32, #blocked>
|
44 |
+
%20 = tt.broadcast %19 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked>
|
45 |
+
%21 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
|
46 |
+
%22 = arith.muli %8, %cst_0 : tensor<64x1xi32, #blocked>
|
47 |
+
%23 = tt.broadcast %22 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked>
|
48 |
+
%24 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>, #blocked>
|
49 |
+
%25 = arith.addi %16, %cst_3 : tensor<64x1xi64, #blocked>
|
50 |
+
%26 = arith.addi %17, %cst_4 : tensor<64x1xi64, #blocked1>
|
51 |
+
%27 = arith.cmpi slt, %16, %cst_2 : tensor<64x1xi64, #blocked>
|
52 |
+
%28 = arith.cmpi slt, %17, %cst_5 : tensor<64x1xi64, #blocked1>
|
53 |
+
%29 = arith.select %27, %25, %16 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>
|
54 |
+
%30 = arith.select %28, %26, %17 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
|
55 |
+
%31 = arith.cmpi sge, %30, %cst_5 : tensor<64x1xi64, #blocked1>
|
56 |
+
%32 = arith.cmpi slt, %30, %cst_4 : tensor<64x1xi64, #blocked1>
|
57 |
+
%33 = arith.andi %31, %32 : tensor<64x1xi1, #blocked1>
|
58 |
+
%34 = arith.muli %29, %cst_1 : tensor<64x1xi64, #blocked>
|
59 |
+
%35 = tt.broadcast %34 : (tensor<64x1xi64, #blocked>) -> tensor<64x4xi64, #blocked>
|
60 |
+
%36 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
|
61 |
+
%37:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg9 = %cst_10, %arg10 = %cst_10, %arg11 = %cst_10) -> (tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>) : i32 {
|
62 |
+
%46 = tt.splat %arg8 : (i32) -> tensor<1x4xi32, #blocked>
|
63 |
+
%47 = arith.addi %46, %11 : tensor<1x4xi32, #blocked>
|
64 |
+
%48 = arith.cmpi slt, %47, %cst_11 : tensor<1x4xi32, #blocked>
|
65 |
+
%49 = tt.broadcast %47 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked>
|
66 |
+
%50 = arith.addi %49, %20 : tensor<64x4xi32, #blocked>
|
67 |
+
%51 = tt.addptr %21, %50 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi32, #blocked>
|
68 |
+
%52 = tt.broadcast %48 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
|
69 |
+
%53 = tt.load %51, %52, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
|
70 |
+
%54 = arith.addi %49, %23 : tensor<64x4xi32, #blocked>
|
71 |
+
%55 = tt.addptr %24, %54 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi32, #blocked>
|
72 |
+
%56 = tt.load %55, %52, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xbf16, #blocked>
|
73 |
+
%57 = arith.extf %56 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked>
|
74 |
+
tt.assert %33, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
|
75 |
+
%58 = arith.extsi %47 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked>
|
76 |
+
%59 = tt.broadcast %58 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
|
77 |
+
%60 = arith.addi %59, %35 : tensor<64x4xi64, #blocked>
|
78 |
+
%61 = tt.addptr %36, %60 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi64, #blocked>
|
79 |
+
%62 = tt.load %61, %52, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
|
80 |
+
%63 = arith.addf %62, %53 : tensor<64x4xf32, #blocked>
|
81 |
+
%64 = arith.addf %63, %57 : tensor<64x4xf32, #blocked>
|
82 |
+
%65 = arith.subf %64, %arg9 : tensor<64x4xf32, #blocked>
|
83 |
+
%66 = arith.addf %arg11, %cst_6 : tensor<64x4xf32, #blocked>
|
84 |
+
%67 = arith.divf %65, %66 : tensor<64x4xf32, #blocked>
|
85 |
+
%68 = arith.addf %arg9, %67 : tensor<64x4xf32, #blocked>
|
86 |
+
%69 = arith.subf %64, %68 : tensor<64x4xf32, #blocked>
|
87 |
+
%70 = arith.mulf %65, %69 : tensor<64x4xf32, #blocked>
|
88 |
+
%71 = arith.addf %arg10, %70 : tensor<64x4xf32, #blocked>
|
89 |
+
%72 = arith.select %52, %68, %arg9 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
|
90 |
+
%73 = arith.select %52, %71, %arg10 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
|
91 |
+
%74 = arith.select %52, %66, %arg11 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>
|
92 |
+
scf.yield %72, %73, %74 : tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>
|
93 |
+
}
|
94 |
+
%38:3 = "tt.reduce"(%37#0, %37#1, %37#2) <{axis = 1 : i32}> ({
|
95 |
+
^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
|
96 |
+
%46 = arith.subf %arg11, %arg8 : f32
|
97 |
+
%47 = arith.addf %arg10, %arg13 : f32
|
98 |
+
%48 = arith.cmpf oeq, %47, %cst_7 : f32
|
99 |
+
%49 = arith.divf %arg13, %47 : f32
|
100 |
+
%50 = arith.select %48, %cst_7, %49 : f32
|
101 |
+
%51 = arith.mulf %46, %50 : f32
|
102 |
+
%52 = arith.addf %arg8, %51 : f32
|
103 |
+
%53 = arith.addf %arg9, %arg12 : f32
|
104 |
+
%54 = arith.mulf %46, %46 : f32
|
105 |
+
%55 = arith.mulf %54, %arg10 : f32
|
106 |
+
%56 = arith.mulf %55, %50 : f32
|
107 |
+
%57 = arith.addf %53, %56 : f32
|
108 |
+
tt.reduce.return %52, %57, %47 : f32, f32, f32
|
109 |
+
}) : (tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
|
110 |
+
%39 = tt.expand_dims %38#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
|
111 |
+
%40 = tt.expand_dims %38#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
|
112 |
+
%41 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x4x!tt.ptr<f32, 1>, #blocked>
|
113 |
+
%42 = tt.broadcast %39 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
|
114 |
+
%43 = arith.divf %40, %cst_13 : tensor<64x1xf32, #blocked>
|
115 |
+
%44 = arith.addf %43, %cst_12 : tensor<64x1xf32, #blocked>
|
116 |
+
%45 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>, #blocked>
|
117 |
+
scf.for %arg8 = %c0_i32 to %c256_i32 step %c4_i32 : i32 {
|
118 |
+
%46 = tt.splat %arg8 : (i32) -> tensor<1x4xi32, #blocked>
|
119 |
+
%47 = arith.addi %46, %11 : tensor<1x4xi32, #blocked>
|
120 |
+
%48 = arith.cmpi slt, %47, %cst_11 : tensor<1x4xi32, #blocked>
|
121 |
+
%49 = tt.broadcast %47 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked>
|
122 |
+
%50 = arith.addi %49, %20 : tensor<64x4xi32, #blocked>
|
123 |
+
%51 = tt.addptr %21, %50 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi32, #blocked>
|
124 |
+
%52 = tt.broadcast %48 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
|
125 |
+
%53 = tt.load %51, %52, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
|
126 |
+
%54 = arith.addi %49, %23 : tensor<64x4xi32, #blocked>
|
127 |
+
%55 = tt.addptr %24, %54 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi32, #blocked>
|
128 |
+
%56 = tt.load %55, %52, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16, #blocked>
|
129 |
+
%57 = arith.extf %56 : tensor<64x4xbf16, #blocked> to tensor<64x4xf32, #blocked>
|
130 |
+
%58 = tt.addptr %41, %47 : tensor<1x4x!tt.ptr<f32, 1>, #blocked>, tensor<1x4xi32, #blocked>
|
131 |
+
%59 = tt.load %58, %48, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32, #blocked>
|
132 |
+
tt.assert %33, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>
|
133 |
+
%60 = arith.extsi %47 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked>
|
134 |
+
%61 = tt.broadcast %60 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked>
|
135 |
+
%62 = arith.addi %61, %35 : tensor<64x4xi64, #blocked>
|
136 |
+
%63 = tt.addptr %36, %62 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi64, #blocked>
|
137 |
+
%64 = tt.load %63, %52, %cst_10 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>
|
138 |
+
%65 = arith.addf %64, %53 : tensor<64x4xf32, #blocked>
|
139 |
+
%66 = arith.addf %65, %57 : tensor<64x4xf32, #blocked>
|
140 |
+
%67 = arith.subf %66, %42 : tensor<64x4xf32, #blocked>
|
141 |
+
%68 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>
|
142 |
+
%69 = tt.broadcast %68 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked>
|
143 |
+
%70 = arith.mulf %67, %69 : tensor<64x4xf32, #blocked>
|
144 |
+
%71 = tt.broadcast %59 : (tensor<1x4xf32, #blocked>) -> tensor<64x4xf32, #blocked>
|
145 |
+
%72 = arith.mulf %70, %71 : tensor<64x4xf32, #blocked>
|
146 |
+
%73 = tt.addptr %45, %54 : tensor<64x4x!tt.ptr<bf16, 1>, #blocked>, tensor<64x4xi32, #blocked>
|
147 |
+
%74 = arith.truncf %72 : tensor<64x4xf32, #blocked> to tensor<64x4xbf16, #blocked>
|
148 |
+
tt.store %73, %74, %52 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16, #blocked>
|
149 |
+
}
|
150 |
+
tt.return
|
151 |
+
}
|
152 |
+
}
|
.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ttir
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<0.000000e+00> : tensor<64x4xbf16>
|
4 |
+
%cst_0 = arith.constant 0.000000e+00 : f32
|
5 |
+
%cst_1 = arith.constant dense<1.000000e+00> : tensor<64x4xf32>
|
6 |
+
%c256_i32 = arith.constant 256 : i32
|
7 |
+
%c4_i32 = arith.constant 4 : i32
|
8 |
+
%c0_i32 = arith.constant 0 : i32
|
9 |
+
%cst_2 = arith.constant dense<256> : tensor<64x1xi64>
|
10 |
+
%cst_3 = arith.constant dense<0> : tensor<64x1xi64>
|
11 |
+
%cst_4 = arith.constant dense<50257> : tensor<64x1xi64>
|
12 |
+
%cst_5 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
|
13 |
+
%cst_6 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
|
14 |
+
%cst_7 = arith.constant dense<0.000000e+00> : tensor<1x4xf32>
|
15 |
+
%cst_8 = arith.constant dense<0.000000e+00> : tensor<64x4xf32>
|
16 |
+
%cst_9 = arith.constant dense<256> : tensor<64x1xi32>
|
17 |
+
%cst_10 = arith.constant dense<256> : tensor<1x4xi32>
|
18 |
+
%cst_11 = arith.constant dense<512> : tensor<64x1xi32>
|
19 |
+
%c64_i32 = arith.constant 64 : i32
|
20 |
+
%0 = tt.get_program_id x : i32
|
21 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
22 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
|
23 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
|
24 |
+
%4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
|
25 |
+
%5 = arith.addi %4, %3 : tensor<64x1xi32>
|
26 |
+
%6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32>
|
27 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32>
|
28 |
+
%8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
|
29 |
+
%9 = tt.addptr %8, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
|
30 |
+
%10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
|
31 |
+
%11 = arith.remsi %5, %cst_11 : tensor<64x1xi32>
|
32 |
+
%12 = arith.muli %11, %cst_9 : tensor<64x1xi32>
|
33 |
+
%13 = tt.broadcast %12 : (tensor<64x1xi32>) -> tensor<64x4xi32>
|
34 |
+
%14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
|
35 |
+
%15 = arith.muli %5, %cst_9 : tensor<64x1xi32>
|
36 |
+
%16 = tt.broadcast %15 : (tensor<64x1xi32>) -> tensor<64x4xi32>
|
37 |
+
%17 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
|
38 |
+
%18 = arith.addi %10, %cst_4 : tensor<64x1xi64>
|
39 |
+
%19 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64>
|
40 |
+
%20 = arith.select %19, %18, %10 : tensor<64x1xi1>, tensor<64x1xi64>
|
41 |
+
%21 = arith.cmpi sge, %20, %cst_3 : tensor<64x1xi64>
|
42 |
+
%22 = arith.cmpi slt, %20, %cst_4 : tensor<64x1xi64>
|
43 |
+
%23 = arith.andi %21, %22 : tensor<64x1xi1>
|
44 |
+
%24 = arith.muli %20, %cst_2 : tensor<64x1xi64>
|
45 |
+
%25 = tt.broadcast %24 : (tensor<64x1xi64>) -> tensor<64x4xi64>
|
46 |
+
%26 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
|
47 |
+
%27:3 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg9 = %cst_8, %arg10 = %cst_8, %arg11 = %cst_8) -> (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) : i32 {
|
48 |
+
%51 = tt.splat %arg8 : (i32) -> tensor<1x4xi32>
|
49 |
+
%52 = arith.addi %51, %7 : tensor<1x4xi32>
|
50 |
+
%53 = arith.cmpi slt, %52, %cst_10 : tensor<1x4xi32>
|
51 |
+
%54 = tt.broadcast %52 : (tensor<1x4xi32>) -> tensor<64x4xi32>
|
52 |
+
%55 = arith.addi %54, %13 : tensor<64x4xi32>
|
53 |
+
%56 = tt.addptr %14, %55 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
|
54 |
+
%57 = tt.broadcast %53 : (tensor<1x4xi1>) -> tensor<64x4xi1>
|
55 |
+
%58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
|
56 |
+
%59 = arith.addi %54, %16 : tensor<64x4xi32>
|
57 |
+
%60 = tt.addptr %17, %59 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi32>
|
58 |
+
%61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xbf16>
|
59 |
+
%62 = arith.extf %61 : tensor<64x4xbf16> to tensor<64x4xf32>
|
60 |
+
tt.assert %23, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
|
61 |
+
%63 = arith.extsi %52 : tensor<1x4xi32> to tensor<1x4xi64>
|
62 |
+
%64 = tt.broadcast %63 : (tensor<1x4xi64>) -> tensor<64x4xi64>
|
63 |
+
%65 = arith.addi %64, %25 : tensor<64x4xi64>
|
64 |
+
%66 = tt.addptr %26, %65 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
|
65 |
+
%67 = tt.load %66, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
|
66 |
+
%68 = arith.addf %67, %58 : tensor<64x4xf32>
|
67 |
+
%69 = arith.addf %68, %62 : tensor<64x4xf32>
|
68 |
+
%70 = arith.subf %69, %arg9 : tensor<64x4xf32>
|
69 |
+
%71 = arith.addf %arg11, %cst_1 : tensor<64x4xf32>
|
70 |
+
%72 = arith.divf %70, %71 : tensor<64x4xf32>
|
71 |
+
%73 = arith.addf %arg9, %72 : tensor<64x4xf32>
|
72 |
+
%74 = arith.subf %69, %73 : tensor<64x4xf32>
|
73 |
+
%75 = arith.mulf %70, %74 : tensor<64x4xf32>
|
74 |
+
%76 = arith.addf %arg10, %75 : tensor<64x4xf32>
|
75 |
+
%77 = arith.select %57, %73, %arg9 : tensor<64x4xi1>, tensor<64x4xf32>
|
76 |
+
%78 = arith.select %57, %76, %arg10 : tensor<64x4xi1>, tensor<64x4xf32>
|
77 |
+
%79 = arith.select %57, %71, %arg11 : tensor<64x4xi1>, tensor<64x4xf32>
|
78 |
+
scf.yield %77, %78, %79 : tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>
|
79 |
+
}
|
80 |
+
%28:3 = "tt.reduce"(%27#0, %27#1, %27#2) <{axis = 1 : i32}> ({
|
81 |
+
^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
|
82 |
+
%51 = arith.subf %arg11, %arg8 : f32
|
83 |
+
%52 = arith.addf %arg10, %arg13 : f32
|
84 |
+
%53 = arith.cmpf oeq, %52, %cst_0 : f32
|
85 |
+
%54 = arith.divf %arg13, %52 : f32
|
86 |
+
%55 = arith.select %53, %cst_0, %54 : f32
|
87 |
+
%56 = arith.mulf %51, %55 : f32
|
88 |
+
%57 = arith.addf %arg8, %56 : f32
|
89 |
+
%58 = arith.addf %arg9, %arg12 : f32
|
90 |
+
%59 = arith.mulf %51, %51 : f32
|
91 |
+
%60 = arith.mulf %59, %arg10 : f32
|
92 |
+
%61 = arith.mulf %60, %55 : f32
|
93 |
+
%62 = arith.addf %58, %61 : f32
|
94 |
+
tt.reduce.return %57, %62, %52 : f32, f32, f32
|
95 |
+
}) : (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
|
96 |
+
%29 = tt.expand_dims %28#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
97 |
+
%30 = tt.expand_dims %28#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
98 |
+
%31 = arith.muli %11, %cst_9 : tensor<64x1xi32>
|
99 |
+
%32 = tt.broadcast %31 : (tensor<64x1xi32>) -> tensor<64x4xi32>
|
100 |
+
%33 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
|
101 |
+
%34 = arith.muli %5, %cst_9 : tensor<64x1xi32>
|
102 |
+
%35 = tt.broadcast %34 : (tensor<64x1xi32>) -> tensor<64x4xi32>
|
103 |
+
%36 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
|
104 |
+
%37 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x4x!tt.ptr<f32, 1>>
|
105 |
+
%38 = arith.addi %10, %cst_4 : tensor<64x1xi64>
|
106 |
+
%39 = arith.cmpi slt, %10, %cst_3 : tensor<64x1xi64>
|
107 |
+
%40 = arith.select %39, %38, %10 : tensor<64x1xi1>, tensor<64x1xi64>
|
108 |
+
%41 = arith.cmpi sge, %40, %cst_3 : tensor<64x1xi64>
|
109 |
+
%42 = arith.cmpi slt, %40, %cst_4 : tensor<64x1xi64>
|
110 |
+
%43 = arith.andi %41, %42 : tensor<64x1xi1>
|
111 |
+
%44 = arith.muli %40, %cst_2 : tensor<64x1xi64>
|
112 |
+
%45 = tt.broadcast %44 : (tensor<64x1xi64>) -> tensor<64x4xi64>
|
113 |
+
%46 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>>
|
114 |
+
%47 = tt.broadcast %29 : (tensor<64x1xf32>) -> tensor<64x4xf32>
|
115 |
+
%48 = arith.divf %30, %cst_6 : tensor<64x1xf32>
|
116 |
+
%49 = arith.addf %48, %cst_5 : tensor<64x1xf32>
|
117 |
+
%50 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x4x!tt.ptr<bf16, 1>>
|
118 |
+
scf.for %arg8 = %c0_i32 to %c256_i32 step %c4_i32 : i32 {
|
119 |
+
%51 = tt.splat %arg8 : (i32) -> tensor<1x4xi32>
|
120 |
+
%52 = arith.addi %51, %7 : tensor<1x4xi32>
|
121 |
+
%53 = arith.cmpi slt, %52, %cst_10 : tensor<1x4xi32>
|
122 |
+
%54 = tt.broadcast %52 : (tensor<1x4xi32>) -> tensor<64x4xi32>
|
123 |
+
%55 = arith.addi %54, %32 : tensor<64x4xi32>
|
124 |
+
%56 = tt.addptr %33, %55 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi32>
|
125 |
+
%57 = tt.broadcast %53 : (tensor<1x4xi1>) -> tensor<64x4xi1>
|
126 |
+
%58 = tt.load %56, %57, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32>
|
127 |
+
%59 = arith.addi %54, %35 : tensor<64x4xi32>
|
128 |
+
%60 = tt.addptr %36, %59 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi32>
|
129 |
+
%61 = tt.load %60, %57, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xbf16>
|
130 |
+
%62 = arith.extf %61 : tensor<64x4xbf16> to tensor<64x4xf32>
|
131 |
+
%63 = tt.addptr %37, %52 : tensor<1x4x!tt.ptr<f32, 1>>, tensor<1x4xi32>
|
132 |
+
%64 = tt.load %63, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32>
|
133 |
+
tt.assert %43, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
|
134 |
+
%65 = arith.extsi %52 : tensor<1x4xi32> to tensor<1x4xi64>
|
135 |
+
%66 = tt.broadcast %65 : (tensor<1x4xi64>) -> tensor<64x4xi64>
|
136 |
+
%67 = arith.addi %66, %45 : tensor<64x4xi64>
|
137 |
+
%68 = tt.addptr %46, %67 : tensor<64x4x!tt.ptr<f32, 1>>, tensor<64x4xi64>
|
138 |
+
%69 = tt.load %68, %57, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32>
|
139 |
+
%70 = arith.addf %69, %58 : tensor<64x4xf32>
|
140 |
+
%71 = arith.addf %70, %62 : tensor<64x4xf32>
|
141 |
+
%72 = arith.subf %71, %47 : tensor<64x4xf32>
|
142 |
+
%73 = tt.extern_elementwise %49 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
|
143 |
+
%74 = tt.broadcast %73 : (tensor<64x1xf32>) -> tensor<64x4xf32>
|
144 |
+
%75 = arith.mulf %72, %74 : tensor<64x4xf32>
|
145 |
+
%76 = tt.broadcast %64 : (tensor<1x4xf32>) -> tensor<64x4xf32>
|
146 |
+
%77 = arith.mulf %75, %76 : tensor<64x4xf32>
|
147 |
+
%78 = tt.addptr %50, %59 : tensor<64x4x!tt.ptr<bf16, 1>>, tensor<64x4xi32>
|
148 |
+
%79 = arith.truncf %77 : tensor<64x4xf32> to tensor<64x4xbf16>
|
149 |
+
tt.store %78, %79, %57 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16>
|
150 |
+
}
|
151 |
+
tt.return
|
152 |
+
}
|
153 |
+
}
|
.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.llir
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
define void @triton__0d1d2d34e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {
|
5 |
+
%6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
|
6 |
+
%7 = and i32 %6, 7, !dbg !8
|
7 |
+
%8 = zext nneg i32 %7 to i64, !dbg !9
|
8 |
+
%9 = getelementptr float, ptr addrspace(1) %1, i64 %8, !dbg !9
|
9 |
+
%10 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %9, i1 true, i32 0, i1 true) #3, !dbg !10
|
10 |
+
%11 = bitcast i32 %10 to float, !dbg !10
|
11 |
+
%12 = getelementptr i64, ptr addrspace(1) %2, i64 %8, !dbg !11
|
12 |
+
%13 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];\0A\09@!$3 mov.u64 $0, 0x0;", "=l,l,b,b"(ptr addrspace(1) %12, i1 true, i1 true) #3, !dbg !12
|
13 |
+
%14 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %10, i32 4, i32 31), !dbg !13
|
14 |
+
%15 = bitcast i32 %14 to float, !dbg !13
|
15 |
+
%16 = fadd float %11, %15, !dbg !17
|
16 |
+
%17 = bitcast float %16 to i32, !dbg !13
|
17 |
+
%18 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %17, i32 2, i32 31), !dbg !13
|
18 |
+
%19 = bitcast i32 %18 to float, !dbg !13
|
19 |
+
%20 = fadd float %16, %19, !dbg !17
|
20 |
+
%21 = bitcast float %20 to i32, !dbg !13
|
21 |
+
%22 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %21, i32 1, i32 31), !dbg !13
|
22 |
+
%23 = bitcast i32 %22 to float, !dbg !13
|
23 |
+
%24 = fadd float %20, %23, !dbg !17
|
24 |
+
%25 = trunc i64 %13 to i32, !dbg !21
|
25 |
+
%26 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %25, i32 4, i32 31), !dbg !21
|
26 |
+
%bc = bitcast i64 %13 to <2 x i32>, !dbg !21
|
27 |
+
%27 = extractelement <2 x i32> %bc, i64 1, !dbg !21
|
28 |
+
%28 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %27, i32 4, i32 31), !dbg !21
|
29 |
+
%29 = insertelement <2 x i32> undef, i32 %26, i64 0, !dbg !21
|
30 |
+
%30 = insertelement <2 x i32> %29, i32 %28, i64 1, !dbg !21
|
31 |
+
%31 = bitcast <2 x i32> %30 to i64, !dbg !21
|
32 |
+
%32 = add i64 %13, %31, !dbg !23
|
33 |
+
%33 = trunc i64 %32 to i32, !dbg !21
|
34 |
+
%34 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %33, i32 2, i32 31), !dbg !21
|
35 |
+
%bc1 = bitcast i64 %32 to <2 x i32>, !dbg !21
|
36 |
+
%35 = extractelement <2 x i32> %bc1, i64 1, !dbg !21
|
37 |
+
%36 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %35, i32 2, i32 31), !dbg !21
|
38 |
+
%37 = insertelement <2 x i32> undef, i32 %34, i64 0, !dbg !21
|
39 |
+
%38 = insertelement <2 x i32> %37, i32 %36, i64 1, !dbg !21
|
40 |
+
%39 = bitcast <2 x i32> %38 to i64, !dbg !21
|
41 |
+
%40 = add i64 %32, %39, !dbg !23
|
42 |
+
%41 = trunc i64 %40 to i32, !dbg !21
|
43 |
+
%42 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %41, i32 1, i32 31), !dbg !21
|
44 |
+
%bc2 = bitcast i64 %40 to <2 x i32>, !dbg !21
|
45 |
+
%43 = extractelement <2 x i32> %bc2, i64 1, !dbg !21
|
46 |
+
%44 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %43, i32 1, i32 31), !dbg !21
|
47 |
+
%45 = insertelement <2 x i32> undef, i32 %42, i64 0, !dbg !21
|
48 |
+
%46 = insertelement <2 x i32> %45, i32 %44, i64 1, !dbg !21
|
49 |
+
%47 = bitcast <2 x i32> %46 to i64, !dbg !21
|
50 |
+
%48 = add i64 %40, %47, !dbg !23
|
51 |
+
%49 = sitofp i64 %48 to float, !dbg !26
|
52 |
+
%50 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %24, float %49) #3, !dbg !27
|
53 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !28
|
54 |
+
%51 = and i32 %6, 63, !dbg !29
|
55 |
+
%52 = icmp eq i32 %51, 0, !dbg !29
|
56 |
+
%53 = bitcast float %50 to i32, !dbg !29
|
57 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %53, ptr addrspace(1) %0, i1 %52) #3, !dbg !29
|
58 |
+
ret void, !dbg !30
|
59 |
+
}
|
60 |
+
|
61 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
62 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
63 |
+
|
64 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
65 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
|
66 |
+
|
67 |
+
; Function Attrs: convergent nocallback nounwind
|
68 |
+
declare void @llvm.nvvm.barrier0() #2
|
69 |
+
|
70 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
71 |
+
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
72 |
+
attributes #2 = { convergent nocallback nounwind }
|
73 |
+
attributes #3 = { nounwind }
|
74 |
+
|
75 |
+
!llvm.module.flags = !{!0}
|
76 |
+
!llvm.dbg.cu = !{!1}
|
77 |
+
!nvvm.annotations = !{!3, !4, !4, !3}
|
78 |
+
|
79 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
80 |
+
!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
81 |
+
!2 = !DIFile(filename: "c7zrzealf5bsn7qskl6y72zb73mh5bzf6uskuswp33lv4y5kk64w.py", directory: "/tmp/torchinductor_root/7z")
|
82 |
+
!3 = !{ptr @triton__0d1d2d34e, !"kernel", i32 1}
|
83 |
+
!4 = !{ptr @triton__0d1d2d34e, !"maxntidx", i32 64}
|
84 |
+
!5 = distinct !DISubprogram(name: "triton__0d1d2d34e", linkageName: "triton__0d1d2d34e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
|
85 |
+
!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
|
86 |
+
!7 = !{}
|
87 |
+
!8 = !DILocation(line: 25, column: 34, scope: !5)
|
88 |
+
!9 = !DILocation(line: 28, column: 30, scope: !5)
|
89 |
+
!10 = !DILocation(line: 28, column: 35, scope: !5)
|
90 |
+
!11 = !DILocation(line: 29, column: 30, scope: !5)
|
91 |
+
!12 = !DILocation(line: 29, column: 35, scope: !5)
|
92 |
+
!13 = !DILocation(line: 243, column: 36, scope: !14, inlinedAt: !16)
|
93 |
+
!14 = distinct !DILexicalBlockFile(scope: !5, file: !15, discriminator: 0)
|
94 |
+
!15 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
|
95 |
+
!16 = !DILocation(line: 32, column: 24, scope: !14)
|
96 |
+
!17 = !DILocation(line: 233, column: 15, scope: !18, inlinedAt: !19)
|
97 |
+
!18 = distinct !DILexicalBlockFile(scope: !14, file: !15, discriminator: 0)
|
98 |
+
!19 = !DILocation(line: 243, column: 36, scope: !18, inlinedAt: !20)
|
99 |
+
!20 = !DILocation(line: 32, column: 24, scope: !18)
|
100 |
+
!21 = !DILocation(line: 243, column: 36, scope: !14, inlinedAt: !22)
|
101 |
+
!22 = !DILocation(line: 35, column: 24, scope: !14)
|
102 |
+
!23 = !DILocation(line: 233, column: 15, scope: !18, inlinedAt: !24)
|
103 |
+
!24 = !DILocation(line: 243, column: 36, scope: !18, inlinedAt: !25)
|
104 |
+
!25 = !DILocation(line: 35, column: 24, scope: !18)
|
105 |
+
!26 = !DILocation(line: 36, column: 20, scope: !5)
|
106 |
+
!27 = !DILocation(line: 37, column: 19, scope: !5)
|
107 |
+
!28 = !DILocation(line: 38, column: 4, scope: !5)
|
108 |
+
!29 = !DILocation(line: 39, column: 71, scope: !5)
|
109 |
+
!30 = !DILocation(line: 39, column: 4, scope: !5)
|
.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.llir
ADDED
@@ -0,0 +1,600 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@assertFunc_1 = internal constant [8 x i8] c"<module>"
|
5 |
+
@assertFile_1 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py"
|
6 |
+
@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
|
7 |
+
@assertFunc_0 = internal constant [8 x i8] c"<module>"
|
8 |
+
@assertFile_0 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py"
|
9 |
+
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
|
10 |
+
@global_smem = external addrspace(3) global [0 x i8]
|
11 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
12 |
+
|
13 |
+
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
|
14 |
+
|
15 |
+
define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
|
16 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
17 |
+
%10 = and i32 %9, 31, !dbg !10
|
18 |
+
%11 = lshr i32 %9, 6, !dbg !10
|
19 |
+
%12 = and i32 %11, 1, !dbg !10
|
20 |
+
%13 = and i32 %9, 1, !dbg !10
|
21 |
+
%urem = shl i32 %9, 1, !dbg !11
|
22 |
+
%14 = and i32 %urem, 126, !dbg !11
|
23 |
+
%15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
|
24 |
+
%16 = shl i32 %15, 1, !dbg !13
|
25 |
+
%17 = or i32 %16, %12, !dbg !14
|
26 |
+
%18 = or i32 %16, %13, !dbg !14
|
27 |
+
%19 = sext i32 %17 to i64, !dbg !15
|
28 |
+
%20 = getelementptr i64, ptr addrspace(1) %0, i64 %19, !dbg !15
|
29 |
+
%21 = sext i32 %18 to i64, !dbg !15
|
30 |
+
%22 = getelementptr i64, ptr addrspace(1) %0, i64 %21, !dbg !15
|
31 |
+
%23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
|
32 |
+
%24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
|
33 |
+
%25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
|
34 |
+
%26 = srem i32 %17, 512, !dbg !17
|
35 |
+
%27 = shl nsw i32 %26, 8, !dbg !18
|
36 |
+
%28 = shl i32 %17, 8, !dbg !19
|
37 |
+
%29 = add i64 %25, 50257, !dbg !20
|
38 |
+
%30 = icmp slt i64 %23, 0, !dbg !21
|
39 |
+
%31 = icmp slt i64 %25, 0, !dbg !21
|
40 |
+
%32 = select i1 %31, i64 %29, i64 %25, !dbg !22
|
41 |
+
%33 = icmp ugt i64 %32, 50256, !dbg !23
|
42 |
+
%34 = shl i64 %23, 8, !dbg !24
|
43 |
+
%35 = add i64 %34, 12865792, !dbg !24
|
44 |
+
%36 = select i1 %30, i64 %35, i64 %34, !dbg !24
|
45 |
+
%37 = getelementptr float, ptr addrspace(1) %1, i64 %36
|
46 |
+
%38 = or i32 %14, %27, !dbg !25
|
47 |
+
%39 = sext i32 %38 to i64, !dbg !26
|
48 |
+
%40 = getelementptr float, ptr addrspace(1) %2, i64 %39, !dbg !26
|
49 |
+
%41 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %40, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !27
|
50 |
+
%42 = extractvalue { i32, i32 } %41, 0, !dbg !27
|
51 |
+
%43 = extractvalue { i32, i32 } %41, 1, !dbg !27
|
52 |
+
%44 = insertelement <2 x i32> poison, i32 %42, i64 0, !dbg !27
|
53 |
+
%45 = insertelement <2 x i32> %44, i32 %43, i64 1, !dbg !27
|
54 |
+
%46 = bitcast <2 x i32> %45 to <2 x float>, !dbg !27
|
55 |
+
%47 = or i32 %14, %28, !dbg !28
|
56 |
+
%48 = sext i32 %47 to i64, !dbg !29
|
57 |
+
%49 = getelementptr i16, ptr addrspace(1) %3, i64 %48, !dbg !29
|
58 |
+
%50 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %49, i1 true, i32 0, i1 true) #6, !dbg !30
|
59 |
+
%51 = trunc i32 %50 to i16, !dbg !30
|
60 |
+
%extelt.offset2 = lshr i32 %50, 16, !dbg !30
|
61 |
+
%52 = trunc i32 %extelt.offset2 to i16, !dbg !30
|
62 |
+
%53 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %51) #6, !dbg !31
|
63 |
+
%54 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %52) #6, !dbg !31
|
64 |
+
br i1 %33, label %55, label %56, !dbg !32
|
65 |
+
|
66 |
+
55: ; preds = %8
|
67 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 1892, ptr nonnull @assertFunc_0, i64 1), !dbg !32
|
68 |
+
br label %56, !dbg !32
|
69 |
+
|
70 |
+
56: ; preds = %55, %8
|
71 |
+
%57 = zext nneg i32 %14 to i64, !dbg !33
|
72 |
+
%58 = getelementptr float, ptr addrspace(1) %37, i64 %57, !dbg !34
|
73 |
+
%59 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %58, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
|
74 |
+
%60 = extractvalue { i32, i32 } %59, 0, !dbg !35
|
75 |
+
%61 = extractvalue { i32, i32 } %59, 1, !dbg !35
|
76 |
+
%62 = insertelement <2 x i32> poison, i32 %60, i64 0, !dbg !35
|
77 |
+
%63 = insertelement <2 x i32> %62, i32 %61, i64 1, !dbg !35
|
78 |
+
%64 = bitcast <2 x i32> %63 to <2 x float>, !dbg !35
|
79 |
+
%65 = fadd <2 x float> %46, %64, !dbg !36
|
80 |
+
%66 = insertelement <2 x float> poison, float %53, i64 0, !dbg !37
|
81 |
+
%67 = insertelement <2 x float> %66, float %54, i64 1, !dbg !37
|
82 |
+
%68 = fadd <2 x float> %67, %65, !dbg !37
|
83 |
+
%69 = extractelement <2 x float> %68, i64 0, !dbg !38
|
84 |
+
%70 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %69, float 1.000000e+00) #6, !dbg !38
|
85 |
+
%71 = extractelement <2 x float> %68, i64 1, !dbg !38
|
86 |
+
%72 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %71, float 1.000000e+00) #6, !dbg !38
|
87 |
+
%73 = insertelement <2 x float> poison, float %70, i64 0, !dbg !42
|
88 |
+
%74 = insertelement <2 x float> %73, float %72, i64 1, !dbg !42
|
89 |
+
%75 = fadd <2 x float> %74, zeroinitializer, !dbg !42
|
90 |
+
%76 = fsub <2 x float> %68, %75, !dbg !43
|
91 |
+
%77 = fmul <2 x float> %68, %76, !dbg !44
|
92 |
+
%78 = fadd <2 x float> %77, zeroinitializer, !dbg !45
|
93 |
+
%79 = or i32 %14, 128, !dbg !46
|
94 |
+
%80 = or i32 %79, %27, !dbg !25
|
95 |
+
%81 = sext i32 %80 to i64, !dbg !26
|
96 |
+
%82 = getelementptr float, ptr addrspace(1) %2, i64 %81, !dbg !26
|
97 |
+
%83 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %82, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !27
|
98 |
+
%84 = extractvalue { i32, i32 } %83, 0, !dbg !27
|
99 |
+
%85 = extractvalue { i32, i32 } %83, 1, !dbg !27
|
100 |
+
%86 = insertelement <2 x i32> poison, i32 %84, i64 0, !dbg !27
|
101 |
+
%87 = insertelement <2 x i32> %86, i32 %85, i64 1, !dbg !27
|
102 |
+
%88 = bitcast <2 x i32> %87 to <2 x float>, !dbg !27
|
103 |
+
%89 = or i32 %79, %28, !dbg !28
|
104 |
+
%90 = sext i32 %89 to i64, !dbg !29
|
105 |
+
%91 = getelementptr i16, ptr addrspace(1) %3, i64 %90, !dbg !29
|
106 |
+
%92 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %91, i1 true, i32 0, i1 true) #6, !dbg !30
|
107 |
+
%93 = trunc i32 %92 to i16, !dbg !30
|
108 |
+
%extelt.offset2.1 = lshr i32 %92, 16, !dbg !30
|
109 |
+
%94 = trunc i32 %extelt.offset2.1 to i16, !dbg !30
|
110 |
+
%95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %93) #6, !dbg !31
|
111 |
+
%96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %94) #6, !dbg !31
|
112 |
+
br i1 %33, label %97, label %98, !dbg !32
|
113 |
+
|
114 |
+
97: ; preds = %56
|
115 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 1892, ptr nonnull @assertFunc_0, i64 1), !dbg !32
|
116 |
+
br label %98, !dbg !32
|
117 |
+
|
118 |
+
98: ; preds = %97, %56
|
119 |
+
%99 = zext nneg i32 %79 to i64, !dbg !33
|
120 |
+
%100 = getelementptr float, ptr addrspace(1) %37, i64 %99, !dbg !34
|
121 |
+
%101 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %100, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !35
|
122 |
+
%102 = extractvalue { i32, i32 } %101, 0, !dbg !35
|
123 |
+
%103 = extractvalue { i32, i32 } %101, 1, !dbg !35
|
124 |
+
%104 = insertelement <2 x i32> poison, i32 %102, i64 0, !dbg !35
|
125 |
+
%105 = insertelement <2 x i32> %104, i32 %103, i64 1, !dbg !35
|
126 |
+
%106 = bitcast <2 x i32> %105 to <2 x float>, !dbg !35
|
127 |
+
%107 = fadd <2 x float> %88, %106, !dbg !36
|
128 |
+
%108 = insertelement <2 x float> poison, float %95, i64 0, !dbg !37
|
129 |
+
%109 = insertelement <2 x float> %108, float %96, i64 1, !dbg !37
|
130 |
+
%110 = fadd <2 x float> %109, %107, !dbg !37
|
131 |
+
%111 = fsub <2 x float> %110, %75, !dbg !47
|
132 |
+
%112 = extractelement <2 x float> %111, i64 0, !dbg !38
|
133 |
+
%113 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %112, float 2.000000e+00) #6, !dbg !38
|
134 |
+
%114 = extractelement <2 x float> %111, i64 1, !dbg !38
|
135 |
+
%115 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %114, float 2.000000e+00) #6, !dbg !38
|
136 |
+
%116 = insertelement <2 x float> poison, float %113, i64 0, !dbg !42
|
137 |
+
%117 = insertelement <2 x float> %116, float %115, i64 1, !dbg !42
|
138 |
+
%118 = fadd <2 x float> %75, %117, !dbg !42
|
139 |
+
%119 = fsub <2 x float> %110, %118, !dbg !43
|
140 |
+
%120 = fmul <2 x float> %111, %119, !dbg !44
|
141 |
+
%121 = fadd <2 x float> %78, %120, !dbg !45
|
142 |
+
%122 = lshr i32 %9, 5, !dbg !10
|
143 |
+
%123 = and i32 %122, 1, !dbg !11
|
144 |
+
%124 = and i32 %9, 127, !dbg !11
|
145 |
+
%125 = zext nneg i32 %124 to i64, !dbg !48
|
146 |
+
%126 = getelementptr float, ptr addrspace(3) @global_smem, i64 %125, !dbg !48
|
147 |
+
store <1 x float> <float 2.000000e+00>, ptr addrspace(3) %126, align 4, !dbg !48
|
148 |
+
%127 = add nuw nsw i32 %124, 130, !dbg !48
|
149 |
+
%128 = zext nneg i32 %127 to i64, !dbg !48
|
150 |
+
%129 = getelementptr float, ptr addrspace(3) @global_smem, i64 %128, !dbg !48
|
151 |
+
store <1 x float> <float 2.000000e+00>, ptr addrspace(3) %129, align 4, !dbg !48
|
152 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !48
|
153 |
+
%130 = mul nuw nsw i32 %12, 130, !dbg !48
|
154 |
+
%131 = add nuw nsw i32 %130, %14, !dbg !48
|
155 |
+
%132 = zext nneg i32 %131 to i64, !dbg !48
|
156 |
+
%133 = getelementptr float, ptr addrspace(3) @global_smem, i64 %132, !dbg !48
|
157 |
+
%134 = load float, ptr addrspace(3) %133, align 8, !dbg !48
|
158 |
+
%135 = getelementptr inbounds <2 x float>, ptr addrspace(3) %133, i64 0, i64 1, !dbg !48
|
159 |
+
%136 = load float, ptr addrspace(3) %135, align 4, !dbg !48
|
160 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !49
|
161 |
+
%137 = extractelement <2 x float> %118, i64 0, !dbg !51
|
162 |
+
%138 = extractelement <2 x float> %118, i64 1, !dbg !55
|
163 |
+
%139 = fsub float %138, %137, !dbg !55
|
164 |
+
%140 = fadd float %134, %136, !dbg !56
|
165 |
+
%141 = fcmp oeq float %140, 0.000000e+00, !dbg !57
|
166 |
+
%142 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %136, float %140) #6, !dbg !58
|
167 |
+
%143 = select i1 %141, float 0.000000e+00, float %142, !dbg !59
|
168 |
+
%144 = fmul float %139, %143, !dbg !60
|
169 |
+
%145 = fadd float %137, %144, !dbg !51
|
170 |
+
%shift = shufflevector <2 x float> %121, <2 x float> poison, <2 x i32> <i32 1, i32 poison>, !dbg !61
|
171 |
+
%146 = fadd <2 x float> %121, %shift, !dbg !61
|
172 |
+
%147 = extractelement <2 x float> %146, i64 0, !dbg !61
|
173 |
+
%148 = fmul float %139, %139, !dbg !62
|
174 |
+
%149 = fmul float %148, %134, !dbg !63
|
175 |
+
%150 = fmul float %149, %143, !dbg !64
|
176 |
+
%151 = fadd float %147, %150, !dbg !65
|
177 |
+
%152 = bitcast float %145 to i32, !dbg !49
|
178 |
+
%153 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %152, i32 16, i32 31), !dbg !49
|
179 |
+
%154 = bitcast i32 %153 to float, !dbg !49
|
180 |
+
%155 = bitcast float %151 to i32, !dbg !49
|
181 |
+
%156 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %155, i32 16, i32 31), !dbg !49
|
182 |
+
%157 = bitcast i32 %156 to float, !dbg !49
|
183 |
+
%158 = bitcast float %140 to i32, !dbg !49
|
184 |
+
%159 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %158, i32 16, i32 31), !dbg !49
|
185 |
+
%160 = bitcast i32 %159 to float, !dbg !49
|
186 |
+
%161 = fsub float %154, %145, !dbg !55
|
187 |
+
%162 = fadd float %140, %160, !dbg !56
|
188 |
+
%163 = fcmp oeq float %162, 0.000000e+00, !dbg !57
|
189 |
+
%164 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %160, float %162) #6, !dbg !58
|
190 |
+
%165 = select i1 %163, float 0.000000e+00, float %164, !dbg !59
|
191 |
+
%166 = fmul float %161, %165, !dbg !60
|
192 |
+
%167 = fadd float %145, %166, !dbg !51
|
193 |
+
%168 = fadd float %151, %157, !dbg !61
|
194 |
+
%169 = fmul float %161, %161, !dbg !62
|
195 |
+
%170 = fmul float %140, %169, !dbg !63
|
196 |
+
%171 = fmul float %170, %165, !dbg !64
|
197 |
+
%172 = fadd float %168, %171, !dbg !65
|
198 |
+
%173 = bitcast float %167 to i32, !dbg !49
|
199 |
+
%174 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %173, i32 8, i32 31), !dbg !49
|
200 |
+
%175 = bitcast i32 %174 to float, !dbg !49
|
201 |
+
%176 = bitcast float %172 to i32, !dbg !49
|
202 |
+
%177 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 8, i32 31), !dbg !49
|
203 |
+
%178 = bitcast i32 %177 to float, !dbg !49
|
204 |
+
%179 = bitcast float %162 to i32, !dbg !49
|
205 |
+
%180 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %179, i32 8, i32 31), !dbg !49
|
206 |
+
%181 = bitcast i32 %180 to float, !dbg !49
|
207 |
+
%182 = fsub float %175, %167, !dbg !55
|
208 |
+
%183 = fadd float %162, %181, !dbg !56
|
209 |
+
%184 = fcmp oeq float %183, 0.000000e+00, !dbg !57
|
210 |
+
%185 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %181, float %183) #6, !dbg !58
|
211 |
+
%186 = select i1 %184, float 0.000000e+00, float %185, !dbg !59
|
212 |
+
%187 = fmul float %182, %186, !dbg !60
|
213 |
+
%188 = fadd float %167, %187, !dbg !51
|
214 |
+
%189 = fadd float %172, %178, !dbg !61
|
215 |
+
%190 = fmul float %182, %182, !dbg !62
|
216 |
+
%191 = fmul float %162, %190, !dbg !63
|
217 |
+
%192 = fmul float %186, %191, !dbg !64
|
218 |
+
%193 = fadd float %189, %192, !dbg !65
|
219 |
+
%194 = bitcast float %188 to i32, !dbg !49
|
220 |
+
%195 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %194, i32 4, i32 31), !dbg !49
|
221 |
+
%196 = bitcast i32 %195 to float, !dbg !49
|
222 |
+
%197 = bitcast float %193 to i32, !dbg !49
|
223 |
+
%198 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %197, i32 4, i32 31), !dbg !49
|
224 |
+
%199 = bitcast i32 %198 to float, !dbg !49
|
225 |
+
%200 = bitcast float %183 to i32, !dbg !49
|
226 |
+
%201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 4, i32 31), !dbg !49
|
227 |
+
%202 = bitcast i32 %201 to float, !dbg !49
|
228 |
+
%203 = fsub float %196, %188, !dbg !55
|
229 |
+
%204 = fadd float %183, %202, !dbg !56
|
230 |
+
%205 = fcmp oeq float %204, 0.000000e+00, !dbg !57
|
231 |
+
%206 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %202, float %204) #6, !dbg !58
|
232 |
+
%207 = select i1 %205, float 0.000000e+00, float %206, !dbg !59
|
233 |
+
%208 = fmul float %203, %207, !dbg !60
|
234 |
+
%209 = fadd float %188, %208, !dbg !51
|
235 |
+
%210 = fadd float %193, %199, !dbg !61
|
236 |
+
%211 = fmul float %203, %203, !dbg !62
|
237 |
+
%212 = fmul float %183, %211, !dbg !63
|
238 |
+
%213 = fmul float %207, %212, !dbg !64
|
239 |
+
%214 = fadd float %210, %213, !dbg !65
|
240 |
+
%215 = bitcast float %209 to i32, !dbg !49
|
241 |
+
%216 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %215, i32 2, i32 31), !dbg !49
|
242 |
+
%217 = bitcast i32 %216 to float, !dbg !49
|
243 |
+
%218 = bitcast float %214 to i32, !dbg !49
|
244 |
+
%219 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %218, i32 2, i32 31), !dbg !49
|
245 |
+
%220 = bitcast i32 %219 to float, !dbg !49
|
246 |
+
%221 = bitcast float %204 to i32, !dbg !49
|
247 |
+
%222 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %221, i32 2, i32 31), !dbg !49
|
248 |
+
%223 = bitcast i32 %222 to float, !dbg !49
|
249 |
+
%224 = fsub float %217, %209, !dbg !55
|
250 |
+
%225 = fadd float %204, %223, !dbg !56
|
251 |
+
%226 = fcmp oeq float %225, 0.000000e+00, !dbg !57
|
252 |
+
%227 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %223, float %225) #6, !dbg !58
|
253 |
+
%228 = select i1 %226, float 0.000000e+00, float %227, !dbg !59
|
254 |
+
%229 = fmul float %224, %228, !dbg !60
|
255 |
+
%230 = fadd float %209, %229, !dbg !51
|
256 |
+
%231 = fadd float %214, %220, !dbg !61
|
257 |
+
%232 = fmul float %224, %224, !dbg !62
|
258 |
+
%233 = fmul float %204, %232, !dbg !63
|
259 |
+
%234 = fmul float %228, %233, !dbg !64
|
260 |
+
%235 = fadd float %231, %234, !dbg !65
|
261 |
+
%236 = bitcast float %230 to i32, !dbg !49
|
262 |
+
%237 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %236, i32 1, i32 31), !dbg !49
|
263 |
+
%238 = bitcast i32 %237 to float, !dbg !49
|
264 |
+
%239 = bitcast float %235 to i32, !dbg !49
|
265 |
+
%240 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %239, i32 1, i32 31), !dbg !49
|
266 |
+
%241 = bitcast i32 %240 to float, !dbg !49
|
267 |
+
%242 = bitcast float %225 to i32, !dbg !49
|
268 |
+
%243 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %242, i32 1, i32 31), !dbg !49
|
269 |
+
%244 = bitcast i32 %243 to float, !dbg !49
|
270 |
+
%245 = fsub float %238, %230, !dbg !55
|
271 |
+
%246 = fadd float %225, %244, !dbg !56
|
272 |
+
%247 = fcmp oeq float %246, 0.000000e+00, !dbg !57
|
273 |
+
%248 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %244, float %246) #6, !dbg !58
|
274 |
+
%249 = select i1 %247, float 0.000000e+00, float %248, !dbg !59
|
275 |
+
%250 = fmul float %245, %249, !dbg !60
|
276 |
+
%251 = fadd float %230, %250, !dbg !51
|
277 |
+
%252 = fadd float %235, %241, !dbg !61
|
278 |
+
%253 = fmul float %245, %245, !dbg !62
|
279 |
+
%254 = fmul float %225, %253, !dbg !63
|
280 |
+
%255 = fmul float %249, %254, !dbg !64
|
281 |
+
%256 = fadd float %252, %255, !dbg !65
|
282 |
+
%257 = icmp eq i32 %10, 0, !dbg !49
|
283 |
+
%258 = shl nuw nsw i32 %12, 1, !dbg !49
|
284 |
+
%259 = or i32 %258, %123, !dbg !49
|
285 |
+
%260 = zext nneg i32 %259 to i64, !dbg !49
|
286 |
+
%261 = getelementptr float, ptr addrspace(3) @global_smem, i64 %260, !dbg !49
|
287 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %261, float %251, i1 %257) #6, !dbg !49
|
288 |
+
%262 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %260, !dbg !49
|
289 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %262, float %256, i1 %257) #6, !dbg !49
|
290 |
+
%263 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %260, !dbg !49
|
291 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %263, float %246, i1 %257) #6, !dbg !49
|
292 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !49
|
293 |
+
%264 = icmp slt i32 %9, 4, !dbg !49
|
294 |
+
%265 = sext i32 %9 to i64, !dbg !49
|
295 |
+
%266 = getelementptr float, ptr addrspace(3) @global_smem, i64 %265, !dbg !49
|
296 |
+
%267 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %266, i1 %264) #6, !dbg !49
|
297 |
+
%268 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %265, !dbg !49
|
298 |
+
%269 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %268, i1 %264) #6, !dbg !49
|
299 |
+
%270 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), i64 %265, !dbg !49
|
300 |
+
%271 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %270, i1 %264) #6, !dbg !49
|
301 |
+
%272 = bitcast float %267 to i32, !dbg !49
|
302 |
+
%273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 1, i32 31), !dbg !49
|
303 |
+
%274 = bitcast i32 %273 to float, !dbg !49
|
304 |
+
%275 = bitcast float %269 to i32, !dbg !49
|
305 |
+
%276 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %275, i32 1, i32 31), !dbg !49
|
306 |
+
%277 = bitcast i32 %276 to float, !dbg !49
|
307 |
+
%278 = bitcast float %271 to i32, !dbg !49
|
308 |
+
%279 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %278, i32 1, i32 31), !dbg !49
|
309 |
+
%280 = bitcast i32 %279 to float, !dbg !49
|
310 |
+
%281 = fsub float %274, %267, !dbg !55
|
311 |
+
%282 = fadd float %271, %280, !dbg !56
|
312 |
+
%283 = fcmp oeq float %282, 0.000000e+00, !dbg !57
|
313 |
+
%284 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %280, float %282) #6, !dbg !58
|
314 |
+
%285 = select i1 %283, float 0.000000e+00, float %284, !dbg !59
|
315 |
+
%286 = fmul float %281, %285, !dbg !60
|
316 |
+
%287 = fadd float %267, %286, !dbg !51
|
317 |
+
%288 = fadd float %269, %277, !dbg !61
|
318 |
+
%289 = fmul float %281, %281, !dbg !62
|
319 |
+
%290 = fmul float %271, %289, !dbg !63
|
320 |
+
%291 = fmul float %290, %285, !dbg !64
|
321 |
+
%292 = fadd float %288, %291, !dbg !65
|
322 |
+
%293 = icmp eq i32 %13, 0, !dbg !49
|
323 |
+
%294 = and i1 %264, %293, !dbg !49
|
324 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %266, float %287, i1 %294) #6, !dbg !49
|
325 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %268, float %292, i1 %294) #6, !dbg !49
|
326 |
+
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %270, float %282, i1 %294) #6, !dbg !49
|
327 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !49
|
328 |
+
%295 = zext nneg i32 %258 to i64, !dbg !49
|
329 |
+
%296 = getelementptr float, ptr addrspace(3) @global_smem, i64 %295, !dbg !49
|
330 |
+
%297 = load float, ptr addrspace(3) %296, align 4, !dbg !49
|
331 |
+
%298 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %295, !dbg !49
|
332 |
+
%299 = load float, ptr addrspace(3) %298, align 4, !dbg !49
|
333 |
+
%300 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %299, float 2.560000e+02) #6, !dbg !66
|
334 |
+
%301 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %299, float 2.560000e+02) #6, !dbg !66
|
335 |
+
%302 = fadd float %300, 0x3EE4F8B580000000, !dbg !67
|
336 |
+
%303 = getelementptr float, ptr addrspace(3) @global_smem, i64 %57
|
337 |
+
%304 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %40, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
|
338 |
+
%305 = extractvalue { i32, i32 } %304, 0, !dbg !68
|
339 |
+
%306 = extractvalue { i32, i32 } %304, 1, !dbg !68
|
340 |
+
%307 = bitcast i32 %305 to float, !dbg !68
|
341 |
+
%308 = bitcast i32 %306 to float, !dbg !68
|
342 |
+
%309 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %49, i1 true, i32 0, i1 true) #6, !dbg !69
|
343 |
+
%310 = trunc i32 %309 to i16, !dbg !69
|
344 |
+
%extelt.offset = lshr i32 %309, 16, !dbg !69
|
345 |
+
%311 = trunc i32 %extelt.offset to i16, !dbg !69
|
346 |
+
%312 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %310) #6, !dbg !70
|
347 |
+
%313 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %311) #6, !dbg !70
|
348 |
+
%314 = getelementptr float, ptr addrspace(1) %4, i64 %125, !dbg !71
|
349 |
+
%315 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %314, i1 true, i32 0, i1 true) #6, !dbg !72
|
350 |
+
br i1 %33, label %316, label %317, !dbg !73
|
351 |
+
|
352 |
+
316: ; preds = %98
|
353 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 1892, ptr nonnull @assertFunc_1, i64 1), !dbg !73
|
354 |
+
br label %317, !dbg !73
|
355 |
+
|
356 |
+
317: ; preds = %316, %98
|
357 |
+
%318 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %58, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !74
|
358 |
+
%319 = extractvalue { i32, i32 } %318, 0, !dbg !74
|
359 |
+
%320 = extractvalue { i32, i32 } %318, 1, !dbg !74
|
360 |
+
%321 = bitcast i32 %319 to float, !dbg !74
|
361 |
+
%322 = bitcast i32 %320 to float, !dbg !74
|
362 |
+
%323 = fadd float %307, %321, !dbg !75
|
363 |
+
%324 = fadd float %308, %322, !dbg !75
|
364 |
+
%325 = fadd float %312, %323, !dbg !76
|
365 |
+
%326 = fadd float %313, %324, !dbg !76
|
366 |
+
%327 = fsub float %325, %297, !dbg !77
|
367 |
+
%328 = fsub float %326, %297, !dbg !77
|
368 |
+
%329 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !78
|
369 |
+
%.not.i = icmp eq i32 %329, 0, !dbg !78
|
370 |
+
br i1 %.not.i, label %332, label %330, !dbg !78
|
371 |
+
|
372 |
+
330: ; preds = %317
|
373 |
+
%331 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %302), !dbg !78
|
374 |
+
br label %__nv_rsqrtf.exit, !dbg !78
|
375 |
+
|
376 |
+
332: ; preds = %317
|
377 |
+
%333 = tail call float @llvm.nvvm.rsqrt.approx.f(float %302), !dbg !78
|
378 |
+
br label %__nv_rsqrtf.exit, !dbg !78
|
379 |
+
|
380 |
+
__nv_rsqrtf.exit: ; preds = %330, %332
|
381 |
+
%.0.i = phi float [ %331, %330 ], [ %333, %332 ], !dbg !78
|
382 |
+
%334 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !78
|
383 |
+
%335 = fmul float %327, %.0.i, !dbg !79
|
384 |
+
%336 = fmul float %328, %.0.i, !dbg !79
|
385 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !80
|
386 |
+
store i32 %315, ptr addrspace(3) %126, align 4, !dbg !80
|
387 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !80
|
388 |
+
%337 = load float, ptr addrspace(3) %303, align 8, !dbg !80
|
389 |
+
%338 = getelementptr inbounds <2 x float>, ptr addrspace(3) %303, i64 0, i64 1, !dbg !80
|
390 |
+
%339 = load float, ptr addrspace(3) %338, align 4, !dbg !80
|
391 |
+
%340 = fmul float %335, %337, !dbg !80
|
392 |
+
%341 = fmul float %336, %339, !dbg !80
|
393 |
+
%342 = getelementptr i16, ptr addrspace(1) %5, i64 %48, !dbg !81
|
394 |
+
%343 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %340) #6, !dbg !82
|
395 |
+
%344 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %341) #6, !dbg !82
|
396 |
+
%345 = insertelement <2 x i16> undef, i16 %343, i64 0, !dbg !82
|
397 |
+
%346 = insertelement <2 x i16> %345, i16 %344, i64 1, !dbg !82
|
398 |
+
%347 = bitcast <2 x i16> %346 to i32, !dbg !82
|
399 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %347, ptr addrspace(1) %342, i1 true) #6, !dbg !82
|
400 |
+
%348 = or i32 %124, 128, !dbg !83
|
401 |
+
%349 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %82, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
|
402 |
+
%350 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %91, i1 true, i32 0, i1 true) #6, !dbg !69
|
403 |
+
%351 = trunc i32 %350 to i16, !dbg !69
|
404 |
+
%extelt.offset.1 = lshr i32 %350, 16, !dbg !69
|
405 |
+
%352 = trunc i32 %extelt.offset.1 to i16, !dbg !69
|
406 |
+
%353 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %351) #6, !dbg !70
|
407 |
+
%354 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %352) #6, !dbg !70
|
408 |
+
%355 = zext nneg i32 %348 to i64, !dbg !71
|
409 |
+
%356 = getelementptr float, ptr addrspace(1) %4, i64 %355, !dbg !71
|
410 |
+
%357 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %356, i1 true, i32 0, i1 true) #6, !dbg !72
|
411 |
+
br i1 %33, label %358, label %359, !dbg !73
|
412 |
+
|
413 |
+
358: ; preds = %__nv_rsqrtf.exit
|
414 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 1892, ptr nonnull @assertFunc_1, i64 1), !dbg !73
|
415 |
+
br label %359, !dbg !73
|
416 |
+
|
417 |
+
359: ; preds = %358, %__nv_rsqrtf.exit
|
418 |
+
%360 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %100, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !74
|
419 |
+
%361 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !78
|
420 |
+
%.not.i.1 = icmp eq i32 %361, 0, !dbg !78
|
421 |
+
br i1 %.not.i.1, label %364, label %362, !dbg !78
|
422 |
+
|
423 |
+
362: ; preds = %359
|
424 |
+
%363 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %302), !dbg !78
|
425 |
+
br label %__nv_rsqrtf.exit.1, !dbg !78
|
426 |
+
|
427 |
+
364: ; preds = %359
|
428 |
+
%365 = tail call float @llvm.nvvm.rsqrt.approx.f(float %302), !dbg !78
|
429 |
+
br label %__nv_rsqrtf.exit.1, !dbg !78
|
430 |
+
|
431 |
+
__nv_rsqrtf.exit.1: ; preds = %364, %362
|
432 |
+
%.0.i.1 = phi float [ %363, %362 ], [ %365, %364 ], !dbg !78
|
433 |
+
%366 = extractvalue { i32, i32 } %349, 1, !dbg !68
|
434 |
+
%367 = bitcast i32 %366 to float, !dbg !68
|
435 |
+
%368 = extractvalue { i32, i32 } %360, 1, !dbg !74
|
436 |
+
%369 = bitcast i32 %368 to float, !dbg !74
|
437 |
+
%370 = fadd float %367, %369, !dbg !75
|
438 |
+
%371 = fadd float %354, %370, !dbg !76
|
439 |
+
%372 = fsub float %371, %297, !dbg !77
|
440 |
+
%373 = extractvalue { i32, i32 } %349, 0, !dbg !68
|
441 |
+
%374 = bitcast i32 %373 to float, !dbg !68
|
442 |
+
%375 = extractvalue { i32, i32 } %360, 0, !dbg !74
|
443 |
+
%376 = bitcast i32 %375 to float, !dbg !74
|
444 |
+
%377 = fadd float %374, %376, !dbg !75
|
445 |
+
%378 = fadd float %353, %377, !dbg !76
|
446 |
+
%379 = fsub float %378, %297, !dbg !77
|
447 |
+
%380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !78
|
448 |
+
%381 = fmul float %379, %.0.i.1, !dbg !79
|
449 |
+
%382 = fmul float %372, %.0.i.1, !dbg !79
|
450 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !80
|
451 |
+
store i32 %357, ptr addrspace(3) %126, align 4, !dbg !80
|
452 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !80
|
453 |
+
%383 = load float, ptr addrspace(3) %303, align 8, !dbg !80
|
454 |
+
%384 = load float, ptr addrspace(3) %338, align 4, !dbg !80
|
455 |
+
%385 = fmul float %381, %383, !dbg !80
|
456 |
+
%386 = fmul float %382, %384, !dbg !80
|
457 |
+
%387 = getelementptr i16, ptr addrspace(1) %5, i64 %90, !dbg !81
|
458 |
+
%388 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %385) #6, !dbg !82
|
459 |
+
%389 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %386) #6, !dbg !82
|
460 |
+
%390 = insertelement <2 x i16> undef, i16 %388, i64 0, !dbg !82
|
461 |
+
%391 = insertelement <2 x i16> %390, i16 %389, i64 1, !dbg !82
|
462 |
+
%392 = bitcast <2 x i16> %391 to i32, !dbg !82
|
463 |
+
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %392, ptr addrspace(1) %387, i1 true) #6, !dbg !82
|
464 |
+
ret void, !dbg !84
|
465 |
+
}
|
466 |
+
|
467 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
468 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
469 |
+
|
470 |
+
; Function Attrs: convergent nocallback nounwind
|
471 |
+
declare void @llvm.nvvm.barrier0() #1
|
472 |
+
|
473 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
474 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
|
475 |
+
|
476 |
+
; Function Attrs: alwaysinline nounwind
; __nv_rsqrtf(float %x) -> float
; Returns the result of one of two NVVM rsqrt.approx intrinsics applied to %x.
; The variant is chosen from the value returned by @__nvvm_reflect(@.str):
;   nonzero -> @llvm.nvvm.rsqrt.approx.ftz.f
;   zero    -> @llvm.nvvm.rsqrt.approx.f
; NOTE(review): the contents of @.str are not visible in this dump; presumably
; it is the "__CUDA_FTZ" query string -- confirm against the full module.
|
477 |
+
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
478 |
+
; Fetch the reflect value that selects the intrinsic variant.
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
479 |
+
; %.not is true when the reflect value equals 0.
%.not = icmp eq i32 %1, 0
|
480 |
+
; reflect == 0 -> block %4 (non-ftz); otherwise -> block %2 (ftz).
br i1 %.not, label %4, label %2
|
481 |
+
|
482 |
+
2: ; preds = %0
|
483 |
+
; ftz variant of the approximate rsqrt.
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
484 |
+
br label %6
|
485 |
+
|
486 |
+
4: ; preds = %0
|
487 |
+
; non-ftz variant of the approximate rsqrt.
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
488 |
+
br label %6
|
489 |
+
|
490 |
+
6: ; preds = %4, %2
|
491 |
+
; Merge the value produced by whichever branch executed and return it.
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
492 |
+
ret float %.0
|
493 |
+
}
|
494 |
+
|
495 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
496 |
+
|
497 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
498 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
499 |
+
|
500 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
501 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
502 |
+
|
503 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
504 |
+
attributes #1 = { convergent nocallback nounwind }
|
505 |
+
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
506 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
507 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
508 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
509 |
+
attributes #6 = { nounwind }
|
510 |
+
|
511 |
+
!llvm.module.flags = !{!0, !1}
|
512 |
+
!llvm.dbg.cu = !{!2}
|
513 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
514 |
+
!llvm.ident = !{!6}
|
515 |
+
|
516 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
517 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
518 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
519 |
+
!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
|
520 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
|
521 |
+
!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 128}
|
522 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
523 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
524 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
525 |
+
!9 = !{}
|
526 |
+
!10 = !DILocation(line: 22, column: 44, scope: !7)
|
527 |
+
!11 = !DILocation(line: 24, column: 33, scope: !7)
|
528 |
+
!12 = !DILocation(line: 21, column: 28, scope: !7)
|
529 |
+
!13 = !DILocation(line: 21, column: 33, scope: !7)
|
530 |
+
!14 = !DILocation(line: 22, column: 23, scope: !7)
|
531 |
+
!15 = !DILocation(line: 26, column: 30, scope: !7)
|
532 |
+
!16 = !DILocation(line: 26, column: 35, scope: !7)
|
533 |
+
!17 = !DILocation(line: 27, column: 18, scope: !7)
|
534 |
+
!18 = !DILocation(line: 35, column: 44, scope: !7)
|
535 |
+
!19 = !DILocation(line: 36, column: 44, scope: !7)
|
536 |
+
!20 = !DILocation(line: 37, column: 22, scope: !7)
|
537 |
+
!21 = !DILocation(line: 38, column: 22, scope: !7)
|
538 |
+
!22 = !DILocation(line: 39, column: 36, scope: !7)
|
539 |
+
!23 = !DILocation(line: 40, column: 40, scope: !7)
|
540 |
+
!24 = !DILocation(line: 41, column: 44, scope: !7)
|
541 |
+
!25 = !DILocation(line: 35, column: 40, scope: !7)
|
542 |
+
!26 = !DILocation(line: 35, column: 34, scope: !7)
|
543 |
+
!27 = !DILocation(line: 35, column: 50, scope: !7)
|
544 |
+
!28 = !DILocation(line: 36, column: 40, scope: !7)
|
545 |
+
!29 = !DILocation(line: 36, column: 34, scope: !7)
|
546 |
+
!30 = !DILocation(line: 36, column: 50, scope: !7)
|
547 |
+
!31 = !DILocation(line: 36, column: 101, scope: !7)
|
548 |
+
!32 = !DILocation(line: 40, column: 55, scope: !7)
|
549 |
+
!33 = !DILocation(line: 41, column: 40, scope: !7)
|
550 |
+
!34 = !DILocation(line: 41, column: 34, scope: !7)
|
551 |
+
!35 = !DILocation(line: 41, column: 52, scope: !7)
|
552 |
+
!36 = !DILocation(line: 42, column: 22, scope: !7)
|
553 |
+
!37 = !DILocation(line: 44, column: 22, scope: !7)
|
554 |
+
!38 = !DILocation(line: 98, column: 30, scope: !39, inlinedAt: !41)
|
555 |
+
!39 = distinct !DILexicalBlockFile(scope: !7, file: !40, discriminator: 0)
|
556 |
+
!40 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
557 |
+
!41 = !DILocation(line: 47, column: 41, scope: !39)
|
558 |
+
!42 = !DILocation(line: 98, column: 22, scope: !39, inlinedAt: !41)
|
559 |
+
!43 = !DILocation(line: 101, column: 30, scope: !39, inlinedAt: !41)
|
560 |
+
!44 = !DILocation(line: 101, column: 22, scope: !39, inlinedAt: !41)
|
561 |
+
!45 = !DILocation(line: 50, column: 50, scope: !7)
|
562 |
+
!46 = !DILocation(line: 32, column: 27, scope: !7)
|
563 |
+
!47 = !DILocation(line: 96, column: 20, scope: !39, inlinedAt: !41)
|
564 |
+
!48 = !DILocation(line: 31, column: 36, scope: !7)
|
565 |
+
!49 = !DILocation(line: 120, column: 46, scope: !39, inlinedAt: !50)
|
566 |
+
!50 = !DILocation(line: 53, column: 44, scope: !39)
|
567 |
+
!51 = !DILocation(line: 112, column: 17, scope: !52, inlinedAt: !53)
|
568 |
+
!52 = distinct !DILexicalBlockFile(scope: !39, file: !40, discriminator: 0)
|
569 |
+
!53 = !DILocation(line: 120, column: 46, scope: !52, inlinedAt: !54)
|
570 |
+
!54 = !DILocation(line: 53, column: 44, scope: !52)
|
571 |
+
!55 = !DILocation(line: 108, column: 21, scope: !52, inlinedAt: !53)
|
572 |
+
!56 = !DILocation(line: 109, column: 28, scope: !52, inlinedAt: !53)
|
573 |
+
!57 = !DILocation(line: 110, column: 39, scope: !52, inlinedAt: !53)
|
574 |
+
!58 = !DILocation(line: 110, column: 60, scope: !52, inlinedAt: !53)
|
575 |
+
!59 = !DILocation(line: 110, column: 49, scope: !52, inlinedAt: !53)
|
576 |
+
!60 = !DILocation(line: 112, column: 25, scope: !52, inlinedAt: !53)
|
577 |
+
!61 = !DILocation(line: 113, column: 15, scope: !52, inlinedAt: !53)
|
578 |
+
!62 = !DILocation(line: 113, column: 30, scope: !52, inlinedAt: !53)
|
579 |
+
!63 = !DILocation(line: 113, column: 38, scope: !52, inlinedAt: !53)
|
580 |
+
!64 = !DILocation(line: 113, column: 49, scope: !52, inlinedAt: !53)
|
581 |
+
!65 = !DILocation(line: 113, column: 22, scope: !52, inlinedAt: !53)
|
582 |
+
!66 = !DILocation(line: 75, column: 24, scope: !7)
|
583 |
+
!67 = !DILocation(line: 77, column: 24, scope: !7)
|
584 |
+
!68 = !DILocation(line: 62, column: 51, scope: !7)
|
585 |
+
!69 = !DILocation(line: 63, column: 51, scope: !7)
|
586 |
+
!70 = !DILocation(line: 63, column: 103, scope: !7)
|
587 |
+
!71 = !DILocation(line: 64, column: 35, scope: !7)
|
588 |
+
!72 = !DILocation(line: 64, column: 40, scope: !7)
|
589 |
+
!73 = !DILocation(line: 68, column: 57, scope: !7)
|
590 |
+
!74 = !DILocation(line: 69, column: 54, scope: !7)
|
591 |
+
!75 = !DILocation(line: 70, column: 24, scope: !7)
|
592 |
+
!76 = !DILocation(line: 72, column: 24, scope: !7)
|
593 |
+
!77 = !DILocation(line: 73, column: 24, scope: !7)
|
594 |
+
!78 = !DILocation(line: 78, column: 30, scope: !7)
|
595 |
+
!79 = !DILocation(line: 79, column: 24, scope: !7)
|
596 |
+
!80 = !DILocation(line: 80, column: 24, scope: !7)
|
597 |
+
!81 = !DILocation(line: 82, column: 29, scope: !7)
|
598 |
+
!82 = !DILocation(line: 82, column: 52, scope: !7)
|
599 |
+
!83 = !DILocation(line: 59, column: 27, scope: !7)
|
600 |
+
!84 = !DILocation(line: 58, column: 4, scope: !7)
|
.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ttgir
ADDED
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// TTGIR for kernel @triton__0d1d2d3d4d5d6de7de.
// Each program handles 2 rows x 256 columns, walked in two 128-column chunks:
//   pass 1: x = arg1[idx*256 + c] + arg2[(row % 512)*256 + c] + f32(arg3[row*256 + c])
//           is folded into a running mean / sum of squared deviations / count;
//   reduce: the per-element statistics are combined along axis 1;
//   pass 2: x is recomputed and bf16((x - mean) * rsqrt(m2/256 + cst_14) * arg4[c])
//           is stored to arg5[row*256 + c].
// idx is the i64 loaded from arg0[row], with 50257 added when it is negative.
// NOTE(review): this looks like an embedding gather + add followed by a
// layer-norm-style normalization without a bias term -- confirm against the
// generating Python source. %arg6 and %arg7 are not referenced in the body.
//
// Layouts: #blocked holds 2 contiguous columns per thread, #blocked1 spreads
// threads along dim 0, #blocked2 holds 1 column per thread along dim 1.
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 2], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
4 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
5 |
+
tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
6 |
+
// Constants; several are duplicated because each layout needs its own copy.
%cst = arith.constant dense<512> : tensor<2x1xi32, #blocked>
|
7 |
+
%cst_0 = arith.constant dense<256> : tensor<1x128xi32, #blocked>
|
8 |
+
%cst_1 = arith.constant dense<256> : tensor<2x1xi32, #blocked>
|
9 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<2x128xf32, #blocked>
|
10 |
+
%cst_3 = arith.constant dense<1.000000e+00> : tensor<2x128xf32, #blocked>
|
11 |
+
%cst_4 = arith.constant dense<256> : tensor<2x1xi64, #blocked>
|
12 |
+
%cst_5 = arith.constant dense<0> : tensor<2x1xi64, #blocked>
|
13 |
+
%cst_6 = arith.constant dense<50257> : tensor<2x1xi64, #blocked>
|
14 |
+
%cst_7 = arith.constant dense<50257> : tensor<2x1xi64, #blocked1>
|
15 |
+
%cst_8 = arith.constant dense<0> : tensor<2x1xi64, #blocked1>
|
16 |
+
%c0_i32 = arith.constant 0 : i32
|
17 |
+
%c128_i32 = arith.constant 128 : i32
|
18 |
+
%c256_i32 = arith.constant 256 : i32
|
19 |
+
%cst_9 = arith.constant dense<1.000000e+00> : tensor<2x128xf32, #blocked2>
|
20 |
+
%cst_10 = arith.constant 0.000000e+00 : f32
|
21 |
+
%cst_11 = arith.constant dense<0.000000e+00> : tensor<1x128xf32, #blocked2>
|
22 |
+
%cst_12 = arith.constant dense<0.000000e+00> : tensor<2x128xf32, #blocked2>
|
23 |
+
%cst_13 = arith.constant dense<256> : tensor<1x128xi32, #blocked2>
|
24 |
+
%cst_14 = arith.constant dense<9.99999974E-6> : tensor<2x1xf32, #blocked>
|
25 |
+
%cst_15 = arith.constant dense<2.560000e+02> : tensor<2x1xf32, #blocked>
|
26 |
+
%cst_16 = arith.constant dense<0.000000e+00> : tensor<2x128xbf16, #blocked>
|
27 |
+
%c2_i32 = arith.constant 2 : i32
|
28 |
+
// Row indices for this program: program_id * 2 + {0, 1}, built in two layouts (%8, %9).
%0 = tt.get_program_id x : i32
|
29 |
+
%1 = arith.muli %0, %c2_i32 : i32
|
30 |
+
%2 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
31 |
+
%3 = tt.make_range {end = 2 : i32, start = 0 : i32} : tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
32 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xi32, #blocked>
|
33 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<2xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<2x1xi32, #blocked1>
|
34 |
+
%6 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked>
|
35 |
+
%7 = tt.splat %1 : (i32) -> tensor<2x1xi32, #blocked1>
|
36 |
+
%8 = arith.addi %6, %4 : tensor<2x1xi32, #blocked>
|
37 |
+
%9 = arith.addi %7, %5 : tensor<2x1xi32, #blocked1>
|
38 |
+
// Column offsets 0..127 inside one chunk, again in two layouts (%12, %13).
%10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
|
39 |
+
%11 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
|
40 |
+
%12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x128xi32, #blocked>
|
41 |
+
%13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x128xi32, #blocked2>
|
42 |
+
// Load the per-row i64 index from %arg0 (unmasked), once per layout.
%14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked>
|
43 |
+
%15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<2x1x!tt.ptr<i64, 1>, #blocked1>
|
44 |
+
%16 = tt.addptr %14, %8 : tensor<2x1x!tt.ptr<i64, 1>, #blocked>, tensor<2x1xi32, #blocked>
|
45 |
+
%17 = tt.addptr %15, %9 : tensor<2x1x!tt.ptr<i64, 1>, #blocked1>, tensor<2x1xi32, #blocked1>
|
46 |
+
%18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked>
|
47 |
+
%19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x1xi64, #blocked1>
|
48 |
+
// Row base offsets: (row % 512) * 256 for %arg2 (%22) and row * 256 for %arg3 (%25).
%20 = arith.remsi %8, %cst : tensor<2x1xi32, #blocked>
|
49 |
+
%21 = arith.muli %20, %cst_1 : tensor<2x1xi32, #blocked>
|
50 |
+
%22 = tt.broadcast %21 : (tensor<2x1xi32, #blocked>) -> tensor<2x128xi32, #blocked>
|
51 |
+
%23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<2x128x!tt.ptr<f32, 1>, #blocked>
|
52 |
+
%24 = arith.muli %8, %cst_1 : tensor<2x1xi32, #blocked>
|
53 |
+
%25 = tt.broadcast %24 : (tensor<2x1xi32, #blocked>) -> tensor<2x128xi32, #blocked>
|
54 |
+
%26 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<2x128x!tt.ptr<bf16, 1>, #blocked>
|
55 |
+
// Wrap negative indices by adding 50257 (%31, %32).
%27 = arith.addi %18, %cst_6 : tensor<2x1xi64, #blocked>
|
56 |
+
%28 = arith.addi %19, %cst_7 : tensor<2x1xi64, #blocked1>
|
57 |
+
%29 = arith.cmpi slt, %18, %cst_5 : tensor<2x1xi64, #blocked>
|
58 |
+
%30 = arith.cmpi slt, %19, %cst_8 : tensor<2x1xi64, #blocked1>
|
59 |
+
%31 = arith.select %29, %27, %18 : tensor<2x1xi1, #blocked>, tensor<2x1xi64, #blocked>
|
60 |
+
%32 = arith.select %30, %28, %19 : tensor<2x1xi1, #blocked1>, tensor<2x1xi64, #blocked1>
|
61 |
+
// %35 = (0 <= idx) & (idx < 50257): the predicate checked by the tt.assert ops below.
%33 = arith.cmpi sge, %32, %cst_8 : tensor<2x1xi64, #blocked1>
|
62 |
+
%34 = arith.cmpi slt, %32, %cst_7 : tensor<2x1xi64, #blocked1>
|
63 |
+
%35 = arith.andi %33, %34 : tensor<2x1xi1, #blocked1>
|
64 |
+
// Base offset idx * 256 into %arg1, kept as i64.
%36 = arith.muli %31, %cst_4 : tensor<2x1xi64, #blocked>
|
65 |
+
%37 = tt.broadcast %36 : (tensor<2x1xi64, #blocked>) -> tensor<2x128xi64, #blocked>
|
66 |
+
%38 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<2x128x!tt.ptr<f32, 1>, #blocked>
|
67 |
+
// Pass 1 over column chunks starting at 0 and 128. Loop-carried values:
//   %arg9  = running mean, %arg10 = running sum of squared deviations,
//   %arg11 / %arg12 = element count (same quantity in #blocked2 and #blocked).
%39:4 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 iter_args(%arg9 = %cst_2, %arg10 = %cst_2, %arg11 = %cst_12, %arg12 = %cst_2) -> (tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked2>, tensor<2x128xf32, #blocked>) : i32 {
|
68 |
+
%49 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked>
|
69 |
+
%50 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked2>
|
70 |
+
%51 = arith.addi %49, %12 : tensor<1x128xi32, #blocked>
|
71 |
+
%52 = arith.addi %50, %13 : tensor<1x128xi32, #blocked2>
|
72 |
+
// Column mask: column < 256 (one copy per layout).
%53 = arith.cmpi slt, %51, %cst_0 : tensor<1x128xi32, #blocked>
|
73 |
+
%54 = arith.cmpi slt, %52, %cst_13 : tensor<1x128xi32, #blocked2>
|
74 |
+
%55 = tt.broadcast %51 : (tensor<1x128xi32, #blocked>) -> tensor<2x128xi32, #blocked>
|
75 |
+
%56 = arith.addi %55, %22 : tensor<2x128xi32, #blocked>
|
76 |
+
%57 = tt.addptr %23, %56 : tensor<2x128x!tt.ptr<f32, 1>, #blocked>, tensor<2x128xi32, #blocked>
|
77 |
+
%58 = tt.broadcast %53 : (tensor<1x128xi1, #blocked>) -> tensor<2x128xi1, #blocked>
|
78 |
+
%59 = tt.broadcast %54 : (tensor<1x128xi1, #blocked2>) -> tensor<2x128xi1, #blocked2>
|
79 |
+
// Masked loads: %60 from %arg2, %63 from %arg3 (bf16, widened to f32 in %64).
%60 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x128xf32, #blocked>
|
80 |
+
%61 = arith.addi %55, %25 : tensor<2x128xi32, #blocked>
|
81 |
+
%62 = tt.addptr %26, %61 : tensor<2x128x!tt.ptr<bf16, 1>, #blocked>, tensor<2x128xi32, #blocked>
|
82 |
+
%63 = tt.load %62, %58, %cst_16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x128xbf16, #blocked>
|
83 |
+
%64 = arith.extf %63 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked>
|
84 |
+
// Assert on the bounds predicate %35 before the index-based load from %arg1.
tt.assert %35, "index out of bounds: 0 <= tmp3 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<2x1xi1, #blocked1>
|
85 |
+
%65 = arith.extsi %51 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked>
|
86 |
+
%66 = tt.broadcast %65 : (tensor<1x128xi64, #blocked>) -> tensor<2x128xi64, #blocked>
|
87 |
+
%67 = arith.addi %66, %37 : tensor<2x128xi64, #blocked>
|
88 |
+
%68 = tt.addptr %38, %67 : tensor<2x128x!tt.ptr<f32, 1>, #blocked>, tensor<2x128xi64, #blocked>
|
89 |
+
%69 = tt.load %68, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x128xf32, #blocked>
|
90 |
+
// x (%71) = arg1 value + arg2 value + widened arg3 value.
%70 = arith.addf %69, %60 : tensor<2x128xf32, #blocked>
|
91 |
+
%71 = arith.addf %70, %64 : tensor<2x128xf32, #blocked>
|
92 |
+
// Running update: delta = x - mean; count += 1; mean += delta / count;
// m2 += delta * (x - new_mean).
%72 = arith.subf %71, %arg9 : tensor<2x128xf32, #blocked>
|
93 |
+
%73 = arith.addf %arg12, %cst_3 : tensor<2x128xf32, #blocked>
|
94 |
+
%74 = arith.addf %arg11, %cst_9 : tensor<2x128xf32, #blocked2>
|
95 |
+
%75 = arith.divf %72, %73 : tensor<2x128xf32, #blocked>
|
96 |
+
%76 = arith.addf %arg9, %75 : tensor<2x128xf32, #blocked>
|
97 |
+
%77 = arith.subf %71, %76 : tensor<2x128xf32, #blocked>
|
98 |
+
%78 = arith.mulf %72, %77 : tensor<2x128xf32, #blocked>
|
99 |
+
%79 = arith.addf %arg10, %78 : tensor<2x128xf32, #blocked>
|
100 |
+
// Where the column mask is false, carry the previous accumulators forward unchanged.
%80 = arith.select %58, %76, %arg9 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked>
|
101 |
+
%81 = arith.select %58, %79, %arg10 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked>
|
102 |
+
%82 = arith.select %58, %73, %arg12 : tensor<2x128xi1, #blocked>, tensor<2x128xf32, #blocked>
|
103 |
+
%83 = arith.select %59, %74, %arg11 : tensor<2x128xi1, #blocked2>, tensor<2x128xf32, #blocked2>
|
104 |
+
scf.yield %80, %81, %83, %82 : tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked2>, tensor<2x128xf32, #blocked>
|
105 |
+
}
|
106 |
+
// The count carried in #blocked2 is converted to #blocked so it can be reduced with the others.
%40 = triton_gpu.convert_layout %39#2 : (tensor<2x128xf32, #blocked2>) -> tensor<2x128xf32, #blocked>
|
107 |
+
// Reduce (mean, m2, count) triples along axis 1.
// Combiner arguments: (%arg8, %arg9, %arg10) and (%arg11, %arg12, %arg13) are the
// two (mean, m2, weight) triples being merged.
%41:3 = "tt.reduce"(%39#0, %39#1, %40) <{axis = 1 : i32}> ({
|
108 |
+
^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):
|
109 |
+
// delta = mean_b - mean_a; total weight = w_a + w_b.
%49 = arith.subf %arg11, %arg8 : f32
|
110 |
+
%50 = arith.addf %arg10, %arg13 : f32
|
111 |
+
// ratio = w_b / total, forced to 0 when total == 0 to avoid using a 0/0 result.
%51 = arith.cmpf oeq, %50, %cst_10 : f32
|
112 |
+
%52 = arith.divf %arg13, %50 : f32
|
113 |
+
%53 = arith.select %51, %cst_10, %52 : f32
|
114 |
+
// merged mean = mean_a + delta * ratio.
%54 = arith.mulf %49, %53 : f32
|
115 |
+
%55 = arith.addf %arg8, %54 : f32
|
116 |
+
// merged m2 = m2_a + m2_b + delta^2 * w_a * ratio.
%56 = arith.addf %arg9, %arg12 : f32
|
117 |
+
%57 = arith.mulf %49, %49 : f32
|
118 |
+
%58 = arith.mulf %57, %arg10 : f32
|
119 |
+
%59 = arith.mulf %58, %53 : f32
|
120 |
+
%60 = arith.addf %56, %59 : f32
|
121 |
+
tt.reduce.return %55, %60, %50 : f32, f32, f32
|
122 |
+
}) : (tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked>, tensor<2x128xf32, #blocked>) -> (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
|
123 |
+
// %42 = per-row mean, %43 = per-row m2; the reduced weight (%41#2) is not used afterwards.
%42 = tt.expand_dims %41#0 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
|
124 |
+
%43 = tt.expand_dims %41#1 {axis = 1 : i32} : (tensor<2xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<2x1xf32, #blocked>
|
125 |
+
%44 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x128x!tt.ptr<f32, 1>, #blocked2>
|
126 |
+
%45 = tt.broadcast %42 : (tensor<2x1xf32, #blocked>) -> tensor<2x128xf32, #blocked>
|
127 |
+
// %47 = m2 / 256 + cst_14 (9.99999974E-6): the value later passed to __nv_rsqrtf.
%46 = arith.divf %43, %cst_15 : tensor<2x1xf32, #blocked>
|
128 |
+
%47 = arith.addf %46, %cst_14 : tensor<2x1xf32, #blocked>
|
129 |
+
%48 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<2x128x!tt.ptr<bf16, 1>, #blocked>
|
130 |
+
// Pass 2 over the same two column chunks: recompute x, normalize, scale, store.
scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 : i32 {
|
131 |
+
%49 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked>
|
132 |
+
%50 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked2>
|
133 |
+
%51 = arith.addi %49, %12 : tensor<1x128xi32, #blocked>
|
134 |
+
%52 = arith.addi %50, %13 : tensor<1x128xi32, #blocked2>
|
135 |
+
%53 = arith.cmpi slt, %51, %cst_0 : tensor<1x128xi32, #blocked>
|
136 |
+
%54 = arith.cmpi slt, %52, %cst_13 : tensor<1x128xi32, #blocked2>
|
137 |
+
%55 = tt.broadcast %51 : (tensor<1x128xi32, #blocked>) -> tensor<2x128xi32, #blocked>
|
138 |
+
%56 = arith.addi %55, %22 : tensor<2x128xi32, #blocked>
|
139 |
+
%57 = tt.addptr %23, %56 : tensor<2x128x!tt.ptr<f32, 1>, #blocked>, tensor<2x128xi32, #blocked>
|
140 |
+
%58 = tt.broadcast %53 : (tensor<1x128xi1, #blocked>) -> tensor<2x128xi1, #blocked>
|
141 |
+
%59 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<2x128xf32, #blocked>
|
142 |
+
%60 = arith.addi %55, %25 : tensor<2x128xi32, #blocked>
|
143 |
+
%61 = tt.addptr %26, %60 : tensor<2x128x!tt.ptr<bf16, 1>, #blocked>, tensor<2x128xi32, #blocked>
|
144 |
+
%62 = tt.load %61, %58, %cst_16 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x128xbf16, #blocked>
|
145 |
+
%63 = arith.extf %62 : tensor<2x128xbf16, #blocked> to tensor<2x128xf32, #blocked>
|
146 |
+
// Per-column scale loaded from %arg4 in layout #blocked2 (converted to #blocked in %77).
%64 = tt.addptr %44, %52 : tensor<1x128x!tt.ptr<f32, 1>, #blocked2>, tensor<1x128xi32, #blocked2>
|
147 |
+
%65 = tt.load %64, %54, %cst_11 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x128xf32, #blocked2>
|
148 |
+
tt.assert %35, "index out of bounds: 0 <= tmp16 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "<module>", 1892 : tensor<2x1xi1, #blocked1>
|
149 |
+
%66 = arith.extsi %51 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked>
|
150 |
+
%67 = tt.broadcast %66 : (tensor<1x128xi64, #blocked>) -> tensor<2x128xi64, #blocked>
|
151 |
+
%68 = arith.addi %67, %37 : tensor<2x128xi64, #blocked>
|
152 |
+
%69 = tt.addptr %38, %68 : tensor<2x128x!tt.ptr<f32, 1>, #blocked>, tensor<2x128xi64, #blocked>
|
153 |
+
%70 = tt.load %69, %58, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<2x128xf32, #blocked>
|
154 |
+
// x (%72) recomputed as in pass 1, then centered on the per-row mean (%73).
%71 = arith.addf %70, %59 : tensor<2x128xf32, #blocked>
|
155 |
+
%72 = arith.addf %71, %63 : tensor<2x128xf32, #blocked>
|
156 |
+
%73 = arith.subf %72, %45 : tensor<2x128xf32, #blocked>
|
157 |
+
// rsqrt of %47 via libdevice; its operand does not depend on %arg8, so it is
// the same value on every iteration.
%74 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<2x1xf32, #blocked>) -> tensor<2x1xf32, #blocked>
|
158 |
+
%75 = tt.broadcast %74 : (tensor<2x1xf32, #blocked>) -> tensor<2x128xf32, #blocked>
|
159 |
+
%76 = arith.mulf %73, %75 : tensor<2x128xf32, #blocked>
|
160 |
+
%77 = triton_gpu.convert_layout %65 : (tensor<1x128xf32, #blocked2>) -> tensor<1x128xf32, #blocked>
|
161 |
+
%78 = tt.broadcast %77 : (tensor<1x128xf32, #blocked>) -> tensor<2x128xf32, #blocked>
|
162 |
+
%79 = arith.mulf %76, %78 : tensor<2x128xf32, #blocked>
|
163 |
+
// Truncate to bf16 and store to %arg5 at offset row * 256 + column, under the column mask.
%80 = tt.addptr %48, %60 : tensor<2x128x!tt.ptr<bf16, 1>, #blocked>, tensor<2x128xi32, #blocked>
|
164 |
+
%81 = arith.truncf %79 : tensor<2x128xf32, #blocked> to tensor<2x128xbf16, #blocked>
|
165 |
+
tt.store %80, %81, %58 {cache = 1 : i32, evict = 1 : i32} : tensor<2x128xbf16, #blocked>
|
166 |
+
}
|
167 |
+
tt.return
|
168 |
+
}
|
169 |
+
}
|
.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttgir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<512> : tensor<256xi32, #blocked>
|
5 |
+
%c256_i32 = arith.constant 256 : i32
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.muli %0, %c256_i32 : i32
|
8 |
+
%2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
9 |
+
%3 = tt.splat %1 : (i32) -> tensor<256xi32, #blocked>
|
10 |
+
%4 = arith.addi %3, %2 : tensor<256xi32, #blocked>
|
11 |
+
%5 = arith.cmpi slt, %4, %cst : tensor<256xi32, #blocked>
|
12 |
+
%6 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<256x!tt.ptr<i64, 1>, #blocked>
|
13 |
+
%7 = tt.addptr %6, %4 : tensor<256x!tt.ptr<i64, 1>, #blocked>, tensor<256xi32, #blocked>
|
14 |
+
%8 = arith.extsi %4 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked>
|
15 |
+
tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<256xi64, #blocked>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ttgir
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
6 |
+
%cst_0 = arith.constant 9.99999974E-6 : f32
|
7 |
+
%cst_1 = arith.constant 2.560000e+02 : f32
|
8 |
+
%cst_2 = arith.constant 0.000000e+00 : f32
|
9 |
+
%c256_i32 = arith.constant 256 : i32
|
10 |
+
%cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
11 |
+
%cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
12 |
+
%0 = tt.get_program_id x : i32
|
13 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
14 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
15 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
16 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
|
17 |
+
%5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
|
18 |
+
%6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
19 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
20 |
+
%8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
21 |
+
%9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
22 |
+
%10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
23 |
+
%11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
24 |
+
%12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
25 |
+
%13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
26 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
27 |
+
%15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
28 |
+
%16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
29 |
+
%17 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
30 |
+
%18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
31 |
+
%19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
32 |
+
%20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
33 |
+
%21 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
34 |
+
%22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
35 |
+
%23 = tt.load %22, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
36 |
+
%24 = arith.extf %23 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
37 |
+
%25 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
38 |
+
%26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
39 |
+
%27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
40 |
+
%28 = arith.addf %8, %12 : tensor<256xf32, #blocked>
|
41 |
+
%29 = arith.addf %28, %16 : tensor<256xf32, #blocked>
|
42 |
+
%30 = arith.addf %29, %20 : tensor<256xf32, #blocked>
|
43 |
+
%31 = arith.addf %30, %24 : tensor<256xf32, #blocked>
|
44 |
+
%32 = arith.select %2, %31, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
45 |
+
%33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
|
46 |
+
^bb0(%arg12: f32, %arg13: f32):
|
47 |
+
%59 = arith.addf %arg12, %arg13 : f32
|
48 |
+
tt.reduce.return %59 : f32
|
49 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
50 |
+
%34 = arith.addf %33, %cst_2 : f32
|
51 |
+
%35 = arith.divf %34, %cst_1 : f32
|
52 |
+
%36 = tt.splat %35 : (f32) -> tensor<1xf32, #blocked1>
|
53 |
+
%37 = tt.splat %35 : (f32) -> tensor<256xf32, #blocked>
|
54 |
+
%38 = arith.subf %31, %37 : tensor<256xf32, #blocked>
|
55 |
+
%39 = arith.mulf %38, %38 : tensor<256xf32, #blocked>
|
56 |
+
%40 = arith.select %2, %39, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
57 |
+
%41 = "tt.reduce"(%40) <{axis = 0 : i32}> ({
|
58 |
+
^bb0(%arg12: f32, %arg13: f32):
|
59 |
+
%59 = arith.addf %arg12, %arg13 : f32
|
60 |
+
tt.reduce.return %59 : f32
|
61 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
62 |
+
%42 = arith.addf %41, %cst_2 : f32
|
63 |
+
%43 = arith.divf %42, %cst_1 : f32
|
64 |
+
%44 = arith.addf %43, %cst_0 : f32
|
65 |
+
%45 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
66 |
+
%46 = tt.splat %45 : (f32) -> tensor<1xf32, #blocked1>
|
67 |
+
%47 = tt.splat %45 : (f32) -> tensor<256xf32, #blocked>
|
68 |
+
%48 = arith.mulf %38, %47 : tensor<256xf32, #blocked>
|
69 |
+
%49 = arith.mulf %48, %27 : tensor<256xf32, #blocked>
|
70 |
+
%50 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
71 |
+
%51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
72 |
+
tt.store %51, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>
|
73 |
+
gpu.barrier
|
74 |
+
%52 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
|
75 |
+
%53 = tt.splat %52 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
|
76 |
+
tt.store %53, %46 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
|
77 |
+
%54 = tt.splat %arg9 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
78 |
+
%55 = tt.addptr %54, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
79 |
+
%56 = arith.truncf %49 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
|
80 |
+
tt.store %55, %56, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
|
81 |
+
%57 = tt.addptr %arg8, %0 : !tt.ptr<f32, 1>, i32
|
82 |
+
%58 = tt.splat %57 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked1>
|
83 |
+
tt.store %58, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32, #blocked1>
|
84 |
+
tt.return
|
85 |
+
}
|
86 |
+
}
|
.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.cubin
ADDED
Binary file (42.6 kB). View file
|
|
.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.llir
ADDED
@@ -0,0 +1,760 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@assertFunc_1 = internal constant [8 x i8] c"<module>"
|
5 |
+
@assertFile_1 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py"
|
6 |
+
@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
|
7 |
+
@assertFunc_0 = internal constant [8 x i8] c"<module>"
|
8 |
+
@assertFile_0 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py"
|
9 |
+
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
|
10 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
|
11 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
12 |
+
|
13 |
+
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
|
14 |
+
|
15 |
+
define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
|
16 |
+
%8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
17 |
+
%9 = lshr i32 %8, 4, !dbg !10
|
18 |
+
%10 = and i32 %9, 15, !dbg !10
|
19 |
+
%11 = and i32 %8, 15, !dbg !10
|
20 |
+
%12 = shl nuw nsw i32 %11, 3, !dbg !11
|
21 |
+
%13 = or i32 %12, 4, !dbg !11
|
22 |
+
%14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
|
23 |
+
%15 = shl i32 %14, 4, !dbg !13
|
24 |
+
%16 = or i32 %15, %10, !dbg !14
|
25 |
+
%17 = or i32 %15, %11, !dbg !14
|
26 |
+
%18 = sext i32 %16 to i64, !dbg !15
|
27 |
+
%19 = getelementptr i64, ptr addrspace(1) %0, i64 %18, !dbg !15
|
28 |
+
%20 = sext i32 %17 to i64, !dbg !15
|
29 |
+
%21 = getelementptr i64, ptr addrspace(1) %0, i64 %20, !dbg !15
|
30 |
+
%22 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
|
31 |
+
%23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
|
32 |
+
%24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
|
33 |
+
%25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
|
34 |
+
%26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
|
35 |
+
%27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
|
36 |
+
%28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
|
37 |
+
%29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %19, i1 true) #6, !dbg !16
|
38 |
+
%30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %21, i1 true) #6, !dbg !16
|
39 |
+
%31 = srem i32 %16, 512, !dbg !17
|
40 |
+
%32 = shl nsw i32 %31, 8, !dbg !18
|
41 |
+
%33 = add i64 %30, 50257, !dbg !19
|
42 |
+
%34 = icmp slt i64 %22, 0, !dbg !20
|
43 |
+
%35 = icmp slt i64 %30, 0, !dbg !20
|
44 |
+
%36 = select i1 %35, i64 %33, i64 %30, !dbg !21
|
45 |
+
%37 = icmp ugt i64 %36, 50256, !dbg !22
|
46 |
+
%38 = shl i64 %22, 8, !dbg !23
|
47 |
+
%39 = add i64 %38, 12865792, !dbg !23
|
48 |
+
%40 = select i1 %34, i64 %39, i64 %38, !dbg !23
|
49 |
+
%41 = getelementptr float, ptr addrspace(1) %1, i64 %40
|
50 |
+
br label %42, !dbg !24
|
51 |
+
|
52 |
+
42: ; preds = %7, %104
|
53 |
+
%43 = phi float [ 0.000000e+00, %7 ], [ %143, %104 ]
|
54 |
+
%44 = phi float [ 0.000000e+00, %7 ], [ %144, %104 ]
|
55 |
+
%45 = phi float [ 0.000000e+00, %7 ], [ %145, %104 ]
|
56 |
+
%46 = phi float [ 0.000000e+00, %7 ], [ %146, %104 ]
|
57 |
+
%47 = phi float [ 0.000000e+00, %7 ], [ %147, %104 ]
|
58 |
+
%48 = phi float [ 0.000000e+00, %7 ], [ %148, %104 ]
|
59 |
+
%49 = phi float [ 0.000000e+00, %7 ], [ %149, %104 ]
|
60 |
+
%50 = phi float [ 0.000000e+00, %7 ], [ %150, %104 ]
|
61 |
+
%51 = phi float [ 0.000000e+00, %7 ], [ %151, %104 ]
|
62 |
+
%52 = phi float [ 0.000000e+00, %7 ], [ %152, %104 ]
|
63 |
+
%53 = phi float [ 0.000000e+00, %7 ], [ %153, %104 ]
|
64 |
+
%54 = phi float [ 0.000000e+00, %7 ], [ %154, %104 ]
|
65 |
+
%55 = phi float [ 0.000000e+00, %7 ], [ %155, %104 ]
|
66 |
+
%56 = phi float [ 0.000000e+00, %7 ], [ %156, %104 ]
|
67 |
+
%57 = phi float [ 0.000000e+00, %7 ], [ %157, %104 ]
|
68 |
+
%58 = phi float [ 0.000000e+00, %7 ], [ %158, %104 ]
|
69 |
+
%59 = phi float [ 0.000000e+00, %7 ], [ %191, %104 ]
|
70 |
+
%60 = phi float [ 0.000000e+00, %7 ], [ %192, %104 ]
|
71 |
+
%61 = phi float [ 0.000000e+00, %7 ], [ %193, %104 ]
|
72 |
+
%62 = phi float [ 0.000000e+00, %7 ], [ %194, %104 ]
|
73 |
+
%63 = phi float [ 0.000000e+00, %7 ], [ %195, %104 ]
|
74 |
+
%64 = phi float [ 0.000000e+00, %7 ], [ %196, %104 ]
|
75 |
+
%65 = phi float [ 0.000000e+00, %7 ], [ %197, %104 ]
|
76 |
+
%66 = phi float [ 0.000000e+00, %7 ], [ %198, %104 ]
|
77 |
+
%67 = phi float [ 0.000000e+00, %7 ], [ %167, %104 ]
|
78 |
+
%68 = phi float [ 0.000000e+00, %7 ], [ %168, %104 ]
|
79 |
+
%69 = phi float [ 0.000000e+00, %7 ], [ %169, %104 ]
|
80 |
+
%70 = phi float [ 0.000000e+00, %7 ], [ %170, %104 ]
|
81 |
+
%71 = phi float [ 0.000000e+00, %7 ], [ %171, %104 ]
|
82 |
+
%72 = phi float [ 0.000000e+00, %7 ], [ %172, %104 ]
|
83 |
+
%73 = phi float [ 0.000000e+00, %7 ], [ %173, %104 ]
|
84 |
+
%74 = phi float [ 0.000000e+00, %7 ], [ %174, %104 ]
|
85 |
+
%75 = phi i1 [ true, %7 ], [ false, %104 ]
|
86 |
+
%76 = phi i32 [ 0, %7 ], [ 128, %104 ]
|
87 |
+
%77 = or i32 %76, %12, !dbg !25
|
88 |
+
%78 = or i32 %76, %13, !dbg !25
|
89 |
+
%79 = or i32 %77, %32, !dbg !26
|
90 |
+
%80 = or i32 %78, %32, !dbg !26
|
91 |
+
%81 = sext i32 %79 to i64, !dbg !27
|
92 |
+
%82 = getelementptr float, ptr addrspace(1) %2, i64 %81, !dbg !27
|
93 |
+
%83 = sext i32 %80 to i64, !dbg !27
|
94 |
+
%84 = getelementptr float, ptr addrspace(1) %2, i64 %83, !dbg !27
|
95 |
+
%85 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %82, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
|
96 |
+
%86 = extractvalue { i32, i32, i32, i32 } %85, 0, !dbg !28
|
97 |
+
%87 = extractvalue { i32, i32, i32, i32 } %85, 1, !dbg !28
|
98 |
+
%88 = extractvalue { i32, i32, i32, i32 } %85, 2, !dbg !28
|
99 |
+
%89 = extractvalue { i32, i32, i32, i32 } %85, 3, !dbg !28
|
100 |
+
%90 = bitcast i32 %86 to float, !dbg !28
|
101 |
+
%91 = bitcast i32 %87 to float, !dbg !28
|
102 |
+
%92 = bitcast i32 %88 to float, !dbg !28
|
103 |
+
%93 = bitcast i32 %89 to float, !dbg !28
|
104 |
+
%94 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %84, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
|
105 |
+
%95 = extractvalue { i32, i32, i32, i32 } %94, 0, !dbg !28
|
106 |
+
%96 = extractvalue { i32, i32, i32, i32 } %94, 1, !dbg !28
|
107 |
+
%97 = extractvalue { i32, i32, i32, i32 } %94, 2, !dbg !28
|
108 |
+
%98 = extractvalue { i32, i32, i32, i32 } %94, 3, !dbg !28
|
109 |
+
%99 = bitcast i32 %95 to float, !dbg !28
|
110 |
+
%100 = bitcast i32 %96 to float, !dbg !28
|
111 |
+
%101 = bitcast i32 %97 to float, !dbg !28
|
112 |
+
%102 = bitcast i32 %98 to float, !dbg !28
|
113 |
+
br i1 %37, label %103, label %104, !dbg !29
|
114 |
+
|
115 |
+
103: ; preds = %42
|
116 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 1892, ptr nonnull @assertFunc_0, i64 1), !dbg !29
|
117 |
+
br label %104, !dbg !29
|
118 |
+
|
119 |
+
104: ; preds = %103, %42
|
120 |
+
%105 = zext nneg i32 %77 to i64, !dbg !30
|
121 |
+
%106 = zext nneg i32 %78 to i64, !dbg !30
|
122 |
+
%107 = getelementptr float, ptr addrspace(1) %41, i64 %105, !dbg !31
|
123 |
+
%108 = getelementptr float, ptr addrspace(1) %41, i64 %106, !dbg !31
|
124 |
+
%109 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %107, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
|
125 |
+
%110 = extractvalue { i32, i32, i32, i32 } %109, 0, !dbg !32
|
126 |
+
%111 = extractvalue { i32, i32, i32, i32 } %109, 1, !dbg !32
|
127 |
+
%112 = extractvalue { i32, i32, i32, i32 } %109, 2, !dbg !32
|
128 |
+
%113 = extractvalue { i32, i32, i32, i32 } %109, 3, !dbg !32
|
129 |
+
%114 = bitcast i32 %110 to float, !dbg !32
|
130 |
+
%115 = bitcast i32 %111 to float, !dbg !32
|
131 |
+
%116 = bitcast i32 %112 to float, !dbg !32
|
132 |
+
%117 = bitcast i32 %113 to float, !dbg !32
|
133 |
+
%118 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %108, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
|
134 |
+
%119 = extractvalue { i32, i32, i32, i32 } %118, 0, !dbg !32
|
135 |
+
%120 = extractvalue { i32, i32, i32, i32 } %118, 1, !dbg !32
|
136 |
+
%121 = extractvalue { i32, i32, i32, i32 } %118, 2, !dbg !32
|
137 |
+
%122 = extractvalue { i32, i32, i32, i32 } %118, 3, !dbg !32
|
138 |
+
%123 = bitcast i32 %119 to float, !dbg !32
|
139 |
+
%124 = bitcast i32 %120 to float, !dbg !32
|
140 |
+
%125 = bitcast i32 %121 to float, !dbg !32
|
141 |
+
%126 = bitcast i32 %122 to float, !dbg !32
|
142 |
+
%127 = fadd float %90, %114, !dbg !33
|
143 |
+
%128 = fadd float %91, %115, !dbg !33
|
144 |
+
%129 = fadd float %92, %116, !dbg !33
|
145 |
+
%130 = fadd float %93, %117, !dbg !33
|
146 |
+
%131 = fadd float %99, %123, !dbg !33
|
147 |
+
%132 = fadd float %100, %124, !dbg !33
|
148 |
+
%133 = fadd float %101, %125, !dbg !33
|
149 |
+
%134 = fadd float %102, %126, !dbg !33
|
150 |
+
%135 = fsub float %127, %67, !dbg !34
|
151 |
+
%136 = fsub float %128, %68, !dbg !34
|
152 |
+
%137 = fsub float %129, %69, !dbg !34
|
153 |
+
%138 = fsub float %130, %70, !dbg !34
|
154 |
+
%139 = fsub float %131, %71, !dbg !34
|
155 |
+
%140 = fsub float %132, %72, !dbg !34
|
156 |
+
%141 = fsub float %133, %73, !dbg !34
|
157 |
+
%142 = fsub float %134, %74, !dbg !34
|
158 |
+
%143 = fadd float %43, 1.000000e+00, !dbg !38
|
159 |
+
%144 = fadd float %44, 1.000000e+00, !dbg !38
|
160 |
+
%145 = fadd float %45, 1.000000e+00, !dbg !38
|
161 |
+
%146 = fadd float %46, 1.000000e+00, !dbg !38
|
162 |
+
%147 = fadd float %47, 1.000000e+00, !dbg !38
|
163 |
+
%148 = fadd float %48, 1.000000e+00, !dbg !38
|
164 |
+
%149 = fadd float %49, 1.000000e+00, !dbg !38
|
165 |
+
%150 = fadd float %50, 1.000000e+00, !dbg !38
|
166 |
+
%151 = fadd float %51, 1.000000e+00, !dbg !38
|
167 |
+
%152 = fadd float %52, 1.000000e+00, !dbg !38
|
168 |
+
%153 = fadd float %53, 1.000000e+00, !dbg !38
|
169 |
+
%154 = fadd float %54, 1.000000e+00, !dbg !38
|
170 |
+
%155 = fadd float %55, 1.000000e+00, !dbg !38
|
171 |
+
%156 = fadd float %56, 1.000000e+00, !dbg !38
|
172 |
+
%157 = fadd float %57, 1.000000e+00, !dbg !38
|
173 |
+
%158 = fadd float %58, 1.000000e+00, !dbg !38
|
174 |
+
%159 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %135, float %143) #6, !dbg !39
|
175 |
+
%160 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %136, float %144) #6, !dbg !39
|
176 |
+
%161 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %137, float %145) #6, !dbg !39
|
177 |
+
%162 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %138, float %146) #6, !dbg !39
|
178 |
+
%163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %139, float %147) #6, !dbg !39
|
179 |
+
%164 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %140, float %148) #6, !dbg !39
|
180 |
+
%165 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %141, float %149) #6, !dbg !39
|
181 |
+
%166 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %142, float %150) #6, !dbg !39
|
182 |
+
%167 = fadd float %67, %159, !dbg !40
|
183 |
+
%168 = fadd float %68, %160, !dbg !40
|
184 |
+
%169 = fadd float %69, %161, !dbg !40
|
185 |
+
%170 = fadd float %70, %162, !dbg !40
|
186 |
+
%171 = fadd float %71, %163, !dbg !40
|
187 |
+
%172 = fadd float %72, %164, !dbg !40
|
188 |
+
%173 = fadd float %73, %165, !dbg !40
|
189 |
+
%174 = fadd float %74, %166, !dbg !40
|
190 |
+
%175 = fsub float %127, %167, !dbg !41
|
191 |
+
%176 = fsub float %128, %168, !dbg !41
|
192 |
+
%177 = fsub float %129, %169, !dbg !41
|
193 |
+
%178 = fsub float %130, %170, !dbg !41
|
194 |
+
%179 = fsub float %131, %171, !dbg !41
|
195 |
+
%180 = fsub float %132, %172, !dbg !41
|
196 |
+
%181 = fsub float %133, %173, !dbg !41
|
197 |
+
%182 = fsub float %134, %174, !dbg !41
|
198 |
+
%183 = fmul float %135, %175, !dbg !42
|
199 |
+
%184 = fmul float %136, %176, !dbg !42
|
200 |
+
%185 = fmul float %137, %177, !dbg !42
|
201 |
+
%186 = fmul float %138, %178, !dbg !42
|
202 |
+
%187 = fmul float %139, %179, !dbg !42
|
203 |
+
%188 = fmul float %140, %180, !dbg !42
|
204 |
+
%189 = fmul float %141, %181, !dbg !42
|
205 |
+
%190 = fmul float %142, %182, !dbg !42
|
206 |
+
%191 = fadd float %59, %183, !dbg !43
|
207 |
+
%192 = fadd float %60, %184, !dbg !43
|
208 |
+
%193 = fadd float %61, %185, !dbg !43
|
209 |
+
%194 = fadd float %62, %186, !dbg !43
|
210 |
+
%195 = fadd float %63, %187, !dbg !43
|
211 |
+
%196 = fadd float %64, %188, !dbg !43
|
212 |
+
%197 = fadd float %65, %189, !dbg !43
|
213 |
+
%198 = fadd float %66, %190, !dbg !43
|
214 |
+
br i1 %75, label %42, label %199, !dbg !24
|
215 |
+
|
216 |
+
199: ; preds = %104
|
217 |
+
%200 = and i32 %8, 127, !dbg !11
|
218 |
+
%201 = and i32 %8, 128, !dbg !24
|
219 |
+
%.not = icmp eq i32 %201, 0, !dbg !24
|
220 |
+
%202 = select i1 %.not, i32 0, i32 136, !dbg !24
|
221 |
+
%203 = add nuw nsw i32 %202, %200, !dbg !24
|
222 |
+
%204 = zext nneg i32 %203 to i64, !dbg !24
|
223 |
+
%205 = getelementptr float, ptr addrspace(3) @global_smem, i64 %204, !dbg !24
|
224 |
+
%206 = insertelement <1 x float> undef, float %151, i64 0, !dbg !24
|
225 |
+
store <1 x float> %206, ptr addrspace(3) %205, align 4, !dbg !24
|
226 |
+
%207 = add nuw nsw i32 %200, 272, !dbg !24
|
227 |
+
%208 = add nuw nsw i32 %207, %202, !dbg !24
|
228 |
+
%209 = zext nneg i32 %208 to i64, !dbg !24
|
229 |
+
%210 = getelementptr float, ptr addrspace(3) @global_smem, i64 %209, !dbg !24
|
230 |
+
%211 = insertelement <1 x float> undef, float %152, i64 0, !dbg !24
|
231 |
+
store <1 x float> %211, ptr addrspace(3) %210, align 4, !dbg !24
|
232 |
+
%212 = add nuw nsw i32 %200, 544, !dbg !24
|
233 |
+
%213 = add nuw nsw i32 %212, %202, !dbg !24
|
234 |
+
%214 = zext nneg i32 %213 to i64, !dbg !24
|
235 |
+
%215 = getelementptr float, ptr addrspace(3) @global_smem, i64 %214, !dbg !24
|
236 |
+
%216 = insertelement <1 x float> undef, float %153, i64 0, !dbg !24
|
237 |
+
store <1 x float> %216, ptr addrspace(3) %215, align 4, !dbg !24
|
238 |
+
%217 = add nuw nsw i32 %200, 816, !dbg !24
|
239 |
+
%218 = add nuw nsw i32 %217, %202, !dbg !24
|
240 |
+
%219 = zext nneg i32 %218 to i64, !dbg !24
|
241 |
+
%220 = getelementptr float, ptr addrspace(3) @global_smem, i64 %219, !dbg !24
|
242 |
+
%221 = insertelement <1 x float> undef, float %154, i64 0, !dbg !24
|
243 |
+
store <1 x float> %221, ptr addrspace(3) %220, align 4, !dbg !24
|
244 |
+
%222 = add nuw nsw i32 %200, 1088, !dbg !24
|
245 |
+
%223 = add nuw nsw i32 %222, %202, !dbg !24
|
246 |
+
%224 = zext nneg i32 %223 to i64, !dbg !24
|
247 |
+
%225 = getelementptr float, ptr addrspace(3) @global_smem, i64 %224, !dbg !24
|
248 |
+
%226 = insertelement <1 x float> undef, float %155, i64 0, !dbg !24
|
249 |
+
store <1 x float> %226, ptr addrspace(3) %225, align 4, !dbg !24
|
250 |
+
%227 = add nuw nsw i32 %200, 1360, !dbg !24
|
251 |
+
%228 = add nuw nsw i32 %227, %202, !dbg !24
|
252 |
+
%229 = zext nneg i32 %228 to i64, !dbg !24
|
253 |
+
%230 = getelementptr float, ptr addrspace(3) @global_smem, i64 %229, !dbg !24
|
254 |
+
%231 = insertelement <1 x float> undef, float %156, i64 0, !dbg !24
|
255 |
+
store <1 x float> %231, ptr addrspace(3) %230, align 4, !dbg !24
|
256 |
+
%232 = add nuw nsw i32 %200, 1632, !dbg !24
|
257 |
+
%233 = add nuw nsw i32 %232, %202, !dbg !24
|
258 |
+
%234 = zext nneg i32 %233 to i64, !dbg !24
|
259 |
+
%235 = getelementptr float, ptr addrspace(3) @global_smem, i64 %234, !dbg !24
|
260 |
+
%236 = insertelement <1 x float> undef, float %157, i64 0, !dbg !24
|
261 |
+
store <1 x float> %236, ptr addrspace(3) %235, align 4, !dbg !24
|
262 |
+
%237 = add nuw nsw i32 %200, 1904, !dbg !24
|
263 |
+
%238 = add nuw nsw i32 %237, %202, !dbg !24
|
264 |
+
%239 = zext nneg i32 %238 to i64, !dbg !24
|
265 |
+
%240 = getelementptr float, ptr addrspace(3) @global_smem, i64 %239, !dbg !24
|
266 |
+
%241 = insertelement <1 x float> undef, float %158, i64 0, !dbg !24
|
267 |
+
store <1 x float> %241, ptr addrspace(3) %240, align 4, !dbg !24
|
268 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !24
|
269 |
+
%242 = mul nuw nsw i32 %10, 136, !dbg !24
|
270 |
+
%243 = add nuw nsw i32 %242, %12, !dbg !24
|
271 |
+
%244 = zext nneg i32 %243 to i64, !dbg !24
|
272 |
+
%245 = getelementptr float, ptr addrspace(3) @global_smem, i64 %244, !dbg !24
|
273 |
+
%246 = load float, ptr addrspace(3) %245, align 32, !dbg !24
|
274 |
+
%247 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 1, !dbg !24
|
275 |
+
%248 = load float, ptr addrspace(3) %247, align 4, !dbg !24
|
276 |
+
%249 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 2, !dbg !24
|
277 |
+
%250 = load float, ptr addrspace(3) %249, align 8, !dbg !24
|
278 |
+
%251 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 3, !dbg !24
|
279 |
+
%252 = load float, ptr addrspace(3) %251, align 4, !dbg !24
|
280 |
+
%253 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 4, !dbg !24
|
281 |
+
%254 = load float, ptr addrspace(3) %253, align 16, !dbg !24
|
282 |
+
%255 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 5, !dbg !24
|
283 |
+
%256 = load float, ptr addrspace(3) %255, align 4, !dbg !24
|
284 |
+
%257 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 6, !dbg !24
|
285 |
+
%258 = load float, ptr addrspace(3) %257, align 8, !dbg !24
|
286 |
+
%259 = getelementptr inbounds <8 x float>, ptr addrspace(3) %245, i64 0, i64 7, !dbg !24
|
287 |
+
%260 = load float, ptr addrspace(3) %259, align 4, !dbg !24
|
288 |
+
%261 = fsub float %168, %167, !dbg !44
|
289 |
+
%262 = fadd float %246, %248, !dbg !48
|
290 |
+
%263 = fcmp oeq float %262, 0.000000e+00, !dbg !49
|
291 |
+
%264 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %248, float %262) #6, !dbg !50
|
292 |
+
%265 = select i1 %263, float 0.000000e+00, float %264, !dbg !51
|
293 |
+
%266 = fmul float %261, %265, !dbg !52
|
294 |
+
%267 = fadd float %167, %266, !dbg !53
|
295 |
+
%268 = fadd float %191, %192, !dbg !54
|
296 |
+
%269 = fmul float %261, %261, !dbg !55
|
297 |
+
%270 = fmul float %269, %246, !dbg !56
|
298 |
+
%271 = fmul float %270, %265, !dbg !57
|
299 |
+
%272 = fadd float %268, %271, !dbg !58
|
300 |
+
%273 = fsub float %169, %267, !dbg !44
|
301 |
+
%274 = fadd float %250, %262, !dbg !48
|
302 |
+
%275 = fcmp oeq float %274, 0.000000e+00, !dbg !49
|
303 |
+
%276 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %250, float %274) #6, !dbg !50
|
304 |
+
%277 = select i1 %275, float 0.000000e+00, float %276, !dbg !51
|
305 |
+
%278 = fmul float %277, %273, !dbg !52
|
306 |
+
%279 = fadd float %267, %278, !dbg !53
|
307 |
+
%280 = fadd float %193, %272, !dbg !54
|
308 |
+
%281 = fmul float %273, %273, !dbg !55
|
309 |
+
%282 = fmul float %262, %281, !dbg !56
|
310 |
+
%283 = fmul float %277, %282, !dbg !57
|
311 |
+
%284 = fadd float %280, %283, !dbg !58
|
312 |
+
%285 = fsub float %170, %279, !dbg !44
|
313 |
+
%286 = fadd float %252, %274, !dbg !48
|
314 |
+
%287 = fcmp oeq float %286, 0.000000e+00, !dbg !49
|
315 |
+
%288 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %252, float %286) #6, !dbg !50
|
316 |
+
%289 = select i1 %287, float 0.000000e+00, float %288, !dbg !51
|
317 |
+
%290 = fmul float %289, %285, !dbg !52
|
318 |
+
%291 = fadd float %279, %290, !dbg !53
|
319 |
+
%292 = fadd float %194, %284, !dbg !54
|
320 |
+
%293 = fmul float %285, %285, !dbg !55
|
321 |
+
%294 = fmul float %274, %293, !dbg !56
|
322 |
+
%295 = fmul float %289, %294, !dbg !57
|
323 |
+
%296 = fadd float %292, %295, !dbg !58
|
324 |
+
%297 = fsub float %171, %291, !dbg !44
|
325 |
+
%298 = fadd float %254, %286, !dbg !48
|
326 |
+
%299 = fcmp oeq float %298, 0.000000e+00, !dbg !49
|
327 |
+
%300 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %254, float %298) #6, !dbg !50
|
328 |
+
%301 = select i1 %299, float 0.000000e+00, float %300, !dbg !51
|
329 |
+
%302 = fmul float %301, %297, !dbg !52
|
330 |
+
%303 = fadd float %291, %302, !dbg !53
|
331 |
+
%304 = fadd float %195, %296, !dbg !54
|
332 |
+
%305 = fmul float %297, %297, !dbg !55
|
333 |
+
%306 = fmul float %286, %305, !dbg !56
|
334 |
+
%307 = fmul float %301, %306, !dbg !57
|
335 |
+
%308 = fadd float %304, %307, !dbg !58
|
336 |
+
%309 = fsub float %172, %303, !dbg !44
|
337 |
+
%310 = fadd float %256, %298, !dbg !48
|
338 |
+
%311 = fcmp oeq float %310, 0.000000e+00, !dbg !49
|
339 |
+
%312 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %256, float %310) #6, !dbg !50
|
340 |
+
%313 = select i1 %311, float 0.000000e+00, float %312, !dbg !51
|
341 |
+
%314 = fmul float %313, %309, !dbg !52
|
342 |
+
%315 = fadd float %303, %314, !dbg !53
|
343 |
+
%316 = fadd float %196, %308, !dbg !54
|
344 |
+
%317 = fmul float %309, %309, !dbg !55
|
345 |
+
%318 = fmul float %298, %317, !dbg !56
|
346 |
+
%319 = fmul float %313, %318, !dbg !57
|
347 |
+
%320 = fadd float %316, %319, !dbg !58
|
348 |
+
%321 = fsub float %173, %315, !dbg !44
|
349 |
+
%322 = fadd float %258, %310, !dbg !48
|
350 |
+
%323 = fcmp oeq float %322, 0.000000e+00, !dbg !49
|
351 |
+
%324 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %258, float %322) #6, !dbg !50
|
352 |
+
%325 = select i1 %323, float 0.000000e+00, float %324, !dbg !51
|
353 |
+
%326 = fmul float %325, %321, !dbg !52
|
354 |
+
%327 = fadd float %315, %326, !dbg !53
|
355 |
+
%328 = fadd float %197, %320, !dbg !54
|
356 |
+
%329 = fmul float %321, %321, !dbg !55
|
357 |
+
%330 = fmul float %310, %329, !dbg !56
|
358 |
+
%331 = fmul float %325, %330, !dbg !57
|
359 |
+
%332 = fadd float %328, %331, !dbg !58
|
360 |
+
%333 = fsub float %174, %327, !dbg !44
|
361 |
+
%334 = fadd float %260, %322, !dbg !48
|
362 |
+
%335 = fcmp oeq float %334, 0.000000e+00, !dbg !49
|
363 |
+
%336 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %260, float %334) #6, !dbg !50
|
364 |
+
%337 = select i1 %335, float 0.000000e+00, float %336, !dbg !51
|
365 |
+
%338 = fmul float %337, %333, !dbg !52
|
366 |
+
%339 = fadd float %327, %338, !dbg !53
|
367 |
+
%340 = fadd float %198, %332, !dbg !54
|
368 |
+
%341 = fmul float %333, %333, !dbg !55
|
369 |
+
%342 = fmul float %322, %341, !dbg !56
|
370 |
+
%343 = fmul float %337, %342, !dbg !57
|
371 |
+
%344 = fadd float %340, %343, !dbg !58
|
372 |
+
%345 = bitcast float %339 to i32, !dbg !59
|
373 |
+
%346 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %345, i32 8, i32 31), !dbg !59
|
374 |
+
%347 = bitcast i32 %346 to float, !dbg !59
|
375 |
+
%348 = bitcast float %344 to i32, !dbg !59
|
376 |
+
%349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %348, i32 8, i32 31), !dbg !59
|
377 |
+
%350 = bitcast i32 %349 to float, !dbg !59
|
378 |
+
%351 = bitcast float %334 to i32, !dbg !59
|
379 |
+
%352 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %351, i32 8, i32 31), !dbg !59
|
380 |
+
%353 = bitcast i32 %352 to float, !dbg !59
|
381 |
+
%354 = fsub float %347, %339, !dbg !44
|
382 |
+
%355 = fadd float %334, %353, !dbg !48
|
383 |
+
%356 = fcmp oeq float %355, 0.000000e+00, !dbg !49
|
384 |
+
%357 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %353, float %355) #6, !dbg !50
|
385 |
+
%358 = select i1 %356, float 0.000000e+00, float %357, !dbg !51
|
386 |
+
%359 = fmul float %358, %354, !dbg !52
|
387 |
+
%360 = fadd float %339, %359, !dbg !53
|
388 |
+
%361 = fadd float %344, %350, !dbg !54
|
389 |
+
%362 = fmul float %354, %354, !dbg !55
|
390 |
+
%363 = fmul float %334, %362, !dbg !56
|
391 |
+
%364 = fmul float %358, %363, !dbg !57
|
392 |
+
%365 = fadd float %361, %364, !dbg !58
|
393 |
+
%366 = bitcast float %360 to i32, !dbg !59
|
394 |
+
%367 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %366, i32 4, i32 31), !dbg !59
|
395 |
+
%368 = bitcast i32 %367 to float, !dbg !59
|
396 |
+
%369 = bitcast float %365 to i32, !dbg !59
|
397 |
+
%370 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %369, i32 4, i32 31), !dbg !59
|
398 |
+
%371 = bitcast i32 %370 to float, !dbg !59
|
399 |
+
%372 = bitcast float %355 to i32, !dbg !59
|
400 |
+
%373 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %372, i32 4, i32 31), !dbg !59
|
401 |
+
%374 = bitcast i32 %373 to float, !dbg !59
|
402 |
+
%375 = fsub float %368, %360, !dbg !44
|
403 |
+
%376 = fadd float %355, %374, !dbg !48
|
404 |
+
%377 = fcmp oeq float %376, 0.000000e+00, !dbg !49
|
405 |
+
%378 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %374, float %376) #6, !dbg !50
|
406 |
+
%379 = select i1 %377, float 0.000000e+00, float %378, !dbg !51
|
407 |
+
%380 = fmul float %379, %375, !dbg !52
|
408 |
+
%381 = fadd float %360, %380, !dbg !53
|
409 |
+
%382 = fadd float %365, %371, !dbg !54
|
410 |
+
%383 = fmul float %375, %375, !dbg !55
|
411 |
+
%384 = fmul float %355, %383, !dbg !56
|
412 |
+
%385 = fmul float %379, %384, !dbg !57
|
413 |
+
%386 = fadd float %382, %385, !dbg !58
|
414 |
+
%387 = bitcast float %381 to i32, !dbg !59
|
415 |
+
%388 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %387, i32 2, i32 31), !dbg !59
|
416 |
+
%389 = bitcast i32 %388 to float, !dbg !59
|
417 |
+
%390 = bitcast float %386 to i32, !dbg !59
|
418 |
+
%391 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %390, i32 2, i32 31), !dbg !59
|
419 |
+
%392 = bitcast i32 %391 to float, !dbg !59
|
420 |
+
%393 = bitcast float %376 to i32, !dbg !59
|
421 |
+
%394 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %393, i32 2, i32 31), !dbg !59
|
422 |
+
%395 = bitcast i32 %394 to float, !dbg !59
|
423 |
+
%396 = fsub float %389, %381, !dbg !44
|
424 |
+
%397 = fadd float %376, %395, !dbg !48
|
425 |
+
%398 = fcmp oeq float %397, 0.000000e+00, !dbg !49
|
426 |
+
%399 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %395, float %397) #6, !dbg !50
|
427 |
+
%400 = select i1 %398, float 0.000000e+00, float %399, !dbg !51
|
428 |
+
%401 = fmul float %400, %396, !dbg !52
|
429 |
+
%402 = fadd float %381, %401, !dbg !53
|
430 |
+
%403 = fadd float %386, %392, !dbg !54
|
431 |
+
%404 = fmul float %396, %396, !dbg !55
|
432 |
+
%405 = fmul float %376, %404, !dbg !56
|
433 |
+
%406 = fmul float %400, %405, !dbg !57
|
434 |
+
%407 = fadd float %403, %406, !dbg !58
|
435 |
+
%408 = bitcast float %402 to i32, !dbg !59
|
436 |
+
%409 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %408, i32 1, i32 31), !dbg !59
|
437 |
+
%410 = bitcast i32 %409 to float, !dbg !59
|
438 |
+
%411 = bitcast float %407 to i32, !dbg !59
|
439 |
+
%412 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %411, i32 1, i32 31), !dbg !59
|
440 |
+
%413 = bitcast i32 %412 to float, !dbg !59
|
441 |
+
%414 = bitcast float %397 to i32, !dbg !59
|
442 |
+
%415 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %414, i32 1, i32 31), !dbg !59
|
443 |
+
%416 = bitcast i32 %415 to float, !dbg !59
|
444 |
+
%417 = fsub float %410, %402, !dbg !44
|
445 |
+
%418 = fadd float %397, %416, !dbg !48
|
446 |
+
%419 = fcmp oeq float %418, 0.000000e+00, !dbg !49
|
447 |
+
%420 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %416, float %418) #6, !dbg !50
|
448 |
+
%421 = select i1 %419, float 0.000000e+00, float %420, !dbg !51
|
449 |
+
%422 = fmul float %421, %417, !dbg !52
|
450 |
+
%423 = fadd float %402, %422, !dbg !53
|
451 |
+
%424 = fadd float %407, %413, !dbg !54
|
452 |
+
%425 = fmul float %417, %417, !dbg !55
|
453 |
+
%426 = fmul float %397, %425, !dbg !56
|
454 |
+
%427 = fmul float %421, %426, !dbg !57
|
455 |
+
%428 = fadd float %424, %427, !dbg !58
|
456 |
+
%429 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
|
457 |
+
%430 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
|
458 |
+
%431 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
|
459 |
+
%432 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
|
460 |
+
%433 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
|
461 |
+
%434 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
|
462 |
+
%435 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
|
463 |
+
%436 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %428, float 2.560000e+02) #6, !dbg !61
|
464 |
+
%437 = fadd float %429, 0x3EE4F8B580000000, !dbg !62
|
465 |
+
%438 = shl i32 %16, 8, !dbg !63
|
466 |
+
br label %439, !dbg !64
|
467 |
+
|
468 |
+
439: ; preds = %199, %__nv_rsqrtf.exit
|
469 |
+
%440 = phi i1 [ true, %199 ], [ false, %__nv_rsqrtf.exit ]
|
470 |
+
%441 = phi i32 [ 0, %199 ], [ 128, %__nv_rsqrtf.exit ]
|
471 |
+
%442 = or i32 %441, %12, !dbg !65
|
472 |
+
%443 = or i32 %441, %13, !dbg !65
|
473 |
+
%444 = or i32 %442, %32, !dbg !66
|
474 |
+
%445 = or i32 %443, %32, !dbg !66
|
475 |
+
%446 = sext i32 %444 to i64, !dbg !67
|
476 |
+
%447 = getelementptr float, ptr addrspace(1) %2, i64 %446, !dbg !67
|
477 |
+
%448 = sext i32 %445 to i64, !dbg !67
|
478 |
+
%449 = getelementptr float, ptr addrspace(1) %2, i64 %448, !dbg !67
|
479 |
+
%450 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %447, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
|
480 |
+
%451 = extractvalue { i32, i32, i32, i32 } %450, 0, !dbg !68
|
481 |
+
%452 = extractvalue { i32, i32, i32, i32 } %450, 1, !dbg !68
|
482 |
+
%453 = extractvalue { i32, i32, i32, i32 } %450, 2, !dbg !68
|
483 |
+
%454 = extractvalue { i32, i32, i32, i32 } %450, 3, !dbg !68
|
484 |
+
%455 = bitcast i32 %451 to float, !dbg !68
|
485 |
+
%456 = bitcast i32 %452 to float, !dbg !68
|
486 |
+
%457 = bitcast i32 %453 to float, !dbg !68
|
487 |
+
%458 = bitcast i32 %454 to float, !dbg !68
|
488 |
+
%459 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %449, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
|
489 |
+
%460 = extractvalue { i32, i32, i32, i32 } %459, 0, !dbg !68
|
490 |
+
%461 = extractvalue { i32, i32, i32, i32 } %459, 1, !dbg !68
|
491 |
+
%462 = extractvalue { i32, i32, i32, i32 } %459, 2, !dbg !68
|
492 |
+
%463 = extractvalue { i32, i32, i32, i32 } %459, 3, !dbg !68
|
493 |
+
%464 = bitcast i32 %460 to float, !dbg !68
|
494 |
+
%465 = bitcast i32 %461 to float, !dbg !68
|
495 |
+
%466 = bitcast i32 %462 to float, !dbg !68
|
496 |
+
%467 = bitcast i32 %463 to float, !dbg !68
|
497 |
+
%468 = zext nneg i32 %442 to i64, !dbg !69
|
498 |
+
%469 = getelementptr float, ptr addrspace(1) %3, i64 %468, !dbg !69
|
499 |
+
%470 = zext nneg i32 %443 to i64, !dbg !69
|
500 |
+
%471 = getelementptr float, ptr addrspace(1) %3, i64 %470, !dbg !69
|
501 |
+
%472 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %469, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
|
502 |
+
%473 = extractvalue { i32, i32, i32, i32 } %472, 0, !dbg !70
|
503 |
+
%474 = extractvalue { i32, i32, i32, i32 } %472, 1, !dbg !70
|
504 |
+
%475 = extractvalue { i32, i32, i32, i32 } %472, 2, !dbg !70
|
505 |
+
%476 = extractvalue { i32, i32, i32, i32 } %472, 3, !dbg !70
|
506 |
+
%477 = bitcast i32 %473 to float, !dbg !70
|
507 |
+
%478 = bitcast i32 %474 to float, !dbg !70
|
508 |
+
%479 = bitcast i32 %475 to float, !dbg !70
|
509 |
+
%480 = bitcast i32 %476 to float, !dbg !70
|
510 |
+
%481 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %471, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
|
511 |
+
%482 = extractvalue { i32, i32, i32, i32 } %481, 0, !dbg !70
|
512 |
+
%483 = extractvalue { i32, i32, i32, i32 } %481, 1, !dbg !70
|
513 |
+
%484 = extractvalue { i32, i32, i32, i32 } %481, 2, !dbg !70
|
514 |
+
%485 = extractvalue { i32, i32, i32, i32 } %481, 3, !dbg !70
|
515 |
+
%486 = bitcast i32 %482 to float, !dbg !70
|
516 |
+
%487 = bitcast i32 %483 to float, !dbg !70
|
517 |
+
%488 = bitcast i32 %484 to float, !dbg !70
|
518 |
+
%489 = bitcast i32 %485 to float, !dbg !70
|
519 |
+
br i1 %37, label %490, label %491, !dbg !71
|
520 |
+
|
521 |
+
490: ; preds = %439
|
522 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 1892, ptr nonnull @assertFunc_1, i64 1), !dbg !71
|
523 |
+
br label %491, !dbg !71
|
524 |
+
|
525 |
+
491: ; preds = %490, %439
|
526 |
+
%492 = getelementptr float, ptr addrspace(1) %41, i64 %468, !dbg !72
|
527 |
+
%493 = getelementptr float, ptr addrspace(1) %41, i64 %470, !dbg !72
|
528 |
+
%494 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %492, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
|
529 |
+
%495 = extractvalue { i32, i32, i32, i32 } %494, 0, !dbg !73
|
530 |
+
%496 = extractvalue { i32, i32, i32, i32 } %494, 1, !dbg !73
|
531 |
+
%497 = extractvalue { i32, i32, i32, i32 } %494, 2, !dbg !73
|
532 |
+
%498 = extractvalue { i32, i32, i32, i32 } %494, 3, !dbg !73
|
533 |
+
%499 = bitcast i32 %495 to float, !dbg !73
|
534 |
+
%500 = bitcast i32 %496 to float, !dbg !73
|
535 |
+
%501 = bitcast i32 %497 to float, !dbg !73
|
536 |
+
%502 = bitcast i32 %498 to float, !dbg !73
|
537 |
+
%503 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %493, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
|
538 |
+
%504 = extractvalue { i32, i32, i32, i32 } %503, 0, !dbg !73
|
539 |
+
%505 = extractvalue { i32, i32, i32, i32 } %503, 1, !dbg !73
|
540 |
+
%506 = extractvalue { i32, i32, i32, i32 } %503, 2, !dbg !73
|
541 |
+
%507 = extractvalue { i32, i32, i32, i32 } %503, 3, !dbg !73
|
542 |
+
%508 = bitcast i32 %504 to float, !dbg !73
|
543 |
+
%509 = bitcast i32 %505 to float, !dbg !73
|
544 |
+
%510 = bitcast i32 %506 to float, !dbg !73
|
545 |
+
%511 = bitcast i32 %507 to float, !dbg !73
|
546 |
+
%512 = fadd float %455, %499, !dbg !74
|
547 |
+
%513 = fadd float %456, %500, !dbg !74
|
548 |
+
%514 = fadd float %457, %501, !dbg !74
|
549 |
+
%515 = fadd float %458, %502, !dbg !74
|
550 |
+
%516 = fadd float %464, %508, !dbg !74
|
551 |
+
%517 = fadd float %465, %509, !dbg !74
|
552 |
+
%518 = fadd float %466, %510, !dbg !74
|
553 |
+
%519 = fadd float %467, %511, !dbg !74
|
554 |
+
%520 = fsub float %512, %423, !dbg !75
|
555 |
+
%521 = fsub float %513, %423, !dbg !75
|
556 |
+
%522 = fsub float %514, %423, !dbg !75
|
557 |
+
%523 = fsub float %515, %423, !dbg !75
|
558 |
+
%524 = fsub float %516, %423, !dbg !75
|
559 |
+
%525 = fsub float %517, %423, !dbg !75
|
560 |
+
%526 = fsub float %518, %423, !dbg !75
|
561 |
+
%527 = fsub float %519, %423, !dbg !75
|
562 |
+
%528 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
563 |
+
%.not.i = icmp eq i32 %528, 0, !dbg !76
|
564 |
+
br i1 %.not.i, label %531, label %529, !dbg !76
|
565 |
+
|
566 |
+
529: ; preds = %491
|
567 |
+
%530 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %437), !dbg !76
|
568 |
+
br label %__nv_rsqrtf.exit, !dbg !76
|
569 |
+
|
570 |
+
531: ; preds = %491
|
571 |
+
%532 = tail call float @llvm.nvvm.rsqrt.approx.f(float %437), !dbg !76
|
572 |
+
br label %__nv_rsqrtf.exit, !dbg !76
|
573 |
+
|
574 |
+
__nv_rsqrtf.exit: ; preds = %529, %531
|
575 |
+
%.0.i = phi float [ %530, %529 ], [ %532, %531 ], !dbg !76
|
576 |
+
%533 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
577 |
+
%534 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
578 |
+
%535 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
579 |
+
%536 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
580 |
+
%537 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
581 |
+
%538 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
582 |
+
%539 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
|
583 |
+
%540 = fmul float %520, %.0.i, !dbg !77
|
584 |
+
%541 = fmul float %521, %.0.i, !dbg !77
|
585 |
+
%542 = fmul float %522, %.0.i, !dbg !77
|
586 |
+
%543 = fmul float %523, %.0.i, !dbg !77
|
587 |
+
%544 = fmul float %524, %.0.i, !dbg !77
|
588 |
+
%545 = fmul float %525, %.0.i, !dbg !77
|
589 |
+
%546 = fmul float %526, %.0.i, !dbg !77
|
590 |
+
%547 = fmul float %527, %.0.i, !dbg !77
|
591 |
+
%548 = fmul float %540, %477, !dbg !78
|
592 |
+
%549 = fmul float %541, %478, !dbg !78
|
593 |
+
%550 = fmul float %542, %479, !dbg !78
|
594 |
+
%551 = fmul float %543, %480, !dbg !78
|
595 |
+
%552 = fmul float %544, %486, !dbg !78
|
596 |
+
%553 = fmul float %545, %487, !dbg !78
|
597 |
+
%554 = fmul float %546, %488, !dbg !78
|
598 |
+
%555 = fmul float %547, %489, !dbg !78
|
599 |
+
%556 = or i32 %442, %438, !dbg !79
|
600 |
+
%557 = sext i32 %556 to i64, !dbg !80
|
601 |
+
%558 = getelementptr i16, ptr addrspace(1) %4, i64 %557, !dbg !80
|
602 |
+
%559 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %548) #6, !dbg !81
|
603 |
+
%560 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %549) #6, !dbg !81
|
604 |
+
%561 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %550) #6, !dbg !81
|
605 |
+
%562 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %551) #6, !dbg !81
|
606 |
+
%563 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %552) #6, !dbg !81
|
607 |
+
%564 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %553) #6, !dbg !81
|
608 |
+
%565 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %554) #6, !dbg !81
|
609 |
+
%566 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %555) #6, !dbg !81
|
610 |
+
%567 = insertelement <2 x i16> undef, i16 %559, i64 0, !dbg !81
|
611 |
+
%568 = insertelement <2 x i16> %567, i16 %560, i64 1, !dbg !81
|
612 |
+
%569 = bitcast <2 x i16> %568 to i32, !dbg !81
|
613 |
+
%570 = insertelement <2 x i16> undef, i16 %561, i64 0, !dbg !81
|
614 |
+
%571 = insertelement <2 x i16> %570, i16 %562, i64 1, !dbg !81
|
615 |
+
%572 = bitcast <2 x i16> %571 to i32, !dbg !81
|
616 |
+
%573 = insertelement <2 x i16> undef, i16 %563, i64 0, !dbg !81
|
617 |
+
%574 = insertelement <2 x i16> %573, i16 %564, i64 1, !dbg !81
|
618 |
+
%575 = bitcast <2 x i16> %574 to i32, !dbg !81
|
619 |
+
%576 = insertelement <2 x i16> undef, i16 %565, i64 0, !dbg !81
|
620 |
+
%577 = insertelement <2 x i16> %576, i16 %566, i64 1, !dbg !81
|
621 |
+
%578 = bitcast <2 x i16> %577 to i32, !dbg !81
|
622 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %569, i32 %572, i32 %575, i32 %578, ptr addrspace(1) %558, i1 true) #6, !dbg !81
|
623 |
+
br i1 %440, label %439, label %579, !dbg !64
|
624 |
+
|
625 |
+
579: ; preds = %__nv_rsqrtf.exit
|
626 |
+
ret void, !dbg !82
|
627 |
+
}
|
628 |
+
|
629 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
630 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
631 |
+
|
632 |
+
; Function Attrs: convergent nocallback nounwind
|
633 |
+
declare void @llvm.nvvm.barrier0() #1
|
634 |
+
|
635 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
636 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
|
637 |
+
|
638 |
+
; Function Attrs: alwaysinline nounwind
|
639 |
+
; __nv_rsqrtf(float %x) -> float
; Applies one of two NVVM approximate rsqrt intrinsics to %x and returns the
; result. The variant is selected by the value of @__nvvm_reflect(@.str):
;   non-zero -> @llvm.nvvm.rsqrt.approx.ftz.f
;   zero     -> @llvm.nvvm.rsqrt.approx.f
; NOTE(review): @.str is defined outside this excerpt; presumably it names the
; flush-to-zero reflect key (cf. the "nvvm-reflect-ftz" module flag) - confirm.
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
640 |
+
; Query the reflect value that decides which intrinsic variant is used.
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
|
641 |
+
%.not = icmp eq i32 %1, 0
|
642 |
+
; Reflect value == 0 goes to block 4 (non-ftz); anything else goes to block 2 (ftz).
br i1 %.not, label %4, label %2
|
643 |
+
|
644 |
+
2: ; preds = %0
|
645 |
+
; Reflect value was non-zero: use the .ftz variant.
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
|
646 |
+
br label %6
|
647 |
+
|
648 |
+
4: ; preds = %0
|
649 |
+
; Reflect value was zero: use the non-ftz variant.
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
|
650 |
+
br label %6
|
651 |
+
|
652 |
+
6: ; preds = %4, %2
|
653 |
+
; Merge the result from whichever branch executed.
%.0 = phi float [ %3, %2 ], [ %5, %4 ]
|
654 |
+
ret float %.0
|
655 |
+
}
|
656 |
+
|
657 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
658 |
+
|
659 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
660 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
661 |
+
|
662 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
663 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
664 |
+
|
665 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
666 |
+
attributes #1 = { convergent nocallback nounwind }
|
667 |
+
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
668 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
669 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
670 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
671 |
+
attributes #6 = { nounwind }
|
672 |
+
|
673 |
+
!llvm.module.flags = !{!0, !1}
|
674 |
+
!llvm.dbg.cu = !{!2}
|
675 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
676 |
+
!llvm.ident = !{!6}
|
677 |
+
|
678 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
679 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
680 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
681 |
+
!3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx")
|
682 |
+
!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
|
683 |
+
!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 256}
|
684 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
685 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
686 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
687 |
+
!9 = !{}
|
688 |
+
!10 = !DILocation(line: 22, column: 44, scope: !7)
|
689 |
+
!11 = !DILocation(line: 24, column: 33, scope: !7)
|
690 |
+
!12 = !DILocation(line: 21, column: 28, scope: !7)
|
691 |
+
!13 = !DILocation(line: 21, column: 33, scope: !7)
|
692 |
+
!14 = !DILocation(line: 22, column: 23, scope: !7)
|
693 |
+
!15 = !DILocation(line: 26, column: 30, scope: !7)
|
694 |
+
!16 = !DILocation(line: 26, column: 35, scope: !7)
|
695 |
+
!17 = !DILocation(line: 27, column: 18, scope: !7)
|
696 |
+
!18 = !DILocation(line: 35, column: 44, scope: !7)
|
697 |
+
!19 = !DILocation(line: 36, column: 22, scope: !7)
|
698 |
+
!20 = !DILocation(line: 37, column: 22, scope: !7)
|
699 |
+
!21 = !DILocation(line: 38, column: 36, scope: !7)
|
700 |
+
!22 = !DILocation(line: 39, column: 40, scope: !7)
|
701 |
+
!23 = !DILocation(line: 40, column: 44, scope: !7)
|
702 |
+
!24 = !DILocation(line: 31, column: 36, scope: !7)
|
703 |
+
!25 = !DILocation(line: 32, column: 27, scope: !7)
|
704 |
+
!26 = !DILocation(line: 35, column: 40, scope: !7)
|
705 |
+
!27 = !DILocation(line: 35, column: 34, scope: !7)
|
706 |
+
!28 = !DILocation(line: 35, column: 50, scope: !7)
|
707 |
+
!29 = !DILocation(line: 39, column: 55, scope: !7)
|
708 |
+
!30 = !DILocation(line: 40, column: 40, scope: !7)
|
709 |
+
!31 = !DILocation(line: 40, column: 34, scope: !7)
|
710 |
+
!32 = !DILocation(line: 40, column: 52, scope: !7)
|
711 |
+
!33 = !DILocation(line: 41, column: 22, scope: !7)
|
712 |
+
!34 = !DILocation(line: 96, column: 20, scope: !35, inlinedAt: !37)
|
713 |
+
!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
|
714 |
+
!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
715 |
+
!37 = !DILocation(line: 44, column: 38, scope: !35)
|
716 |
+
!38 = !DILocation(line: 97, column: 26, scope: !35, inlinedAt: !37)
|
717 |
+
!39 = !DILocation(line: 98, column: 30, scope: !35, inlinedAt: !37)
|
718 |
+
!40 = !DILocation(line: 98, column: 22, scope: !35, inlinedAt: !37)
|
719 |
+
!41 = !DILocation(line: 101, column: 30, scope: !35, inlinedAt: !37)
|
720 |
+
!42 = !DILocation(line: 101, column: 22, scope: !35, inlinedAt: !37)
|
721 |
+
!43 = !DILocation(line: 47, column: 48, scope: !7)
|
722 |
+
!44 = !DILocation(line: 108, column: 21, scope: !45, inlinedAt: !46)
|
723 |
+
!45 = distinct !DILexicalBlockFile(scope: !35, file: !36, discriminator: 0)
|
724 |
+
!46 = !DILocation(line: 120, column: 46, scope: !45, inlinedAt: !47)
|
725 |
+
!47 = !DILocation(line: 50, column: 41, scope: !45)
|
726 |
+
!48 = !DILocation(line: 109, column: 28, scope: !45, inlinedAt: !46)
|
727 |
+
!49 = !DILocation(line: 110, column: 39, scope: !45, inlinedAt: !46)
|
728 |
+
!50 = !DILocation(line: 110, column: 60, scope: !45, inlinedAt: !46)
|
729 |
+
!51 = !DILocation(line: 110, column: 49, scope: !45, inlinedAt: !46)
|
730 |
+
!52 = !DILocation(line: 112, column: 25, scope: !45, inlinedAt: !46)
|
731 |
+
!53 = !DILocation(line: 112, column: 17, scope: !45, inlinedAt: !46)
|
732 |
+
!54 = !DILocation(line: 113, column: 15, scope: !45, inlinedAt: !46)
|
733 |
+
!55 = !DILocation(line: 113, column: 30, scope: !45, inlinedAt: !46)
|
734 |
+
!56 = !DILocation(line: 113, column: 38, scope: !45, inlinedAt: !46)
|
735 |
+
!57 = !DILocation(line: 113, column: 49, scope: !45, inlinedAt: !46)
|
736 |
+
!58 = !DILocation(line: 113, column: 22, scope: !45, inlinedAt: !46)
|
737 |
+
!59 = !DILocation(line: 120, column: 46, scope: !35, inlinedAt: !60)
|
738 |
+
!60 = !DILocation(line: 50, column: 41, scope: !35)
|
739 |
+
!61 = !DILocation(line: 69, column: 23, scope: !7)
|
740 |
+
!62 = !DILocation(line: 71, column: 24, scope: !7)
|
741 |
+
!63 = !DILocation(line: 76, column: 39, scope: !7)
|
742 |
+
!64 = !DILocation(line: 55, column: 36, scope: !7)
|
743 |
+
!65 = !DILocation(line: 56, column: 27, scope: !7)
|
744 |
+
!66 = !DILocation(line: 59, column: 41, scope: !7)
|
745 |
+
!67 = !DILocation(line: 59, column: 35, scope: !7)
|
746 |
+
!68 = !DILocation(line: 59, column: 51, scope: !7)
|
747 |
+
!69 = !DILocation(line: 60, column: 35, scope: !7)
|
748 |
+
!70 = !DILocation(line: 60, column: 40, scope: !7)
|
749 |
+
!71 = !DILocation(line: 64, column: 57, scope: !7)
|
750 |
+
!72 = !DILocation(line: 65, column: 35, scope: !7)
|
751 |
+
!73 = !DILocation(line: 65, column: 54, scope: !7)
|
752 |
+
!74 = !DILocation(line: 66, column: 24, scope: !7)
|
753 |
+
!75 = !DILocation(line: 67, column: 24, scope: !7)
|
754 |
+
!76 = !DILocation(line: 72, column: 30, scope: !7)
|
755 |
+
!77 = !DILocation(line: 73, column: 24, scope: !7)
|
756 |
+
!78 = !DILocation(line: 74, column: 24, scope: !7)
|
757 |
+
!79 = !DILocation(line: 76, column: 35, scope: !7)
|
758 |
+
!80 = !DILocation(line: 76, column: 29, scope: !7)
|
759 |
+
!81 = !DILocation(line: 76, column: 52, scope: !7)
|
760 |
+
!82 = !DILocation(line: 55, column: 4, scope: !7)
|
.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.llir
ADDED
@@ -0,0 +1,839 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
; ModuleID = 'LLVMDialectModule'
|
2 |
+
source_filename = "LLVMDialectModule"
|
3 |
+
|
4 |
+
@assertFunc_1 = internal constant [8 x i8] c"<module>"
|
5 |
+
@assertFile_1 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py"
|
6 |
+
@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
|
7 |
+
@assertFunc_0 = internal constant [8 x i8] c"<module>"
|
8 |
+
@assertFile_0 = internal constant [68 x i8] c"/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py"
|
9 |
+
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
|
10 |
+
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
|
11 |
+
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
|
12 |
+
|
13 |
+
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
|
14 |
+
|
15 |
+
define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
|
16 |
+
%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
|
17 |
+
%10 = lshr i32 %9, 4, !dbg !10
|
18 |
+
%11 = and i32 %10, 15, !dbg !10
|
19 |
+
%12 = and i32 %9, 15, !dbg !10
|
20 |
+
%13 = shl nuw nsw i32 %12, 3, !dbg !11
|
21 |
+
%14 = or i32 %13, 4, !dbg !11
|
22 |
+
%15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !12
|
23 |
+
%16 = shl i32 %15, 4, !dbg !13
|
24 |
+
%17 = or i32 %16, %11, !dbg !14
|
25 |
+
%18 = or i32 %16, %12, !dbg !14
|
26 |
+
%19 = sext i32 %17 to i64, !dbg !15
|
27 |
+
%20 = getelementptr i64, ptr addrspace(1) %0, i64 %19, !dbg !15
|
28 |
+
%21 = sext i32 %18 to i64, !dbg !15
|
29 |
+
%22 = getelementptr i64, ptr addrspace(1) %0, i64 %21, !dbg !15
|
30 |
+
%23 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
|
31 |
+
%24 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
|
32 |
+
%25 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
|
33 |
+
%26 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
|
34 |
+
%27 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
|
35 |
+
%28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
|
36 |
+
%29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
|
37 |
+
%30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %20, i1 true) #6, !dbg !16
|
38 |
+
%31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %22, i1 true) #6, !dbg !16
|
39 |
+
%32 = srem i32 %17, 512, !dbg !17
|
40 |
+
%33 = shl nsw i32 %32, 8, !dbg !18
|
41 |
+
%34 = shl i32 %17, 8, !dbg !19
|
42 |
+
%35 = add i64 %31, 50257, !dbg !20
|
43 |
+
%36 = icmp slt i64 %23, 0, !dbg !21
|
44 |
+
%37 = icmp slt i64 %31, 0, !dbg !21
|
45 |
+
%38 = select i1 %37, i64 %35, i64 %31, !dbg !22
|
46 |
+
%39 = icmp ugt i64 %38, 50256, !dbg !23
|
47 |
+
%40 = shl i64 %23, 8, !dbg !24
|
48 |
+
%41 = add i64 %40, 12865792, !dbg !24
|
49 |
+
%42 = select i1 %36, i64 %41, i64 %40, !dbg !24
|
50 |
+
%43 = getelementptr float, ptr addrspace(1) %1, i64 %42
|
51 |
+
br label %44, !dbg !25
|
52 |
+
|
53 |
+
44: ; preds = %8, %130
|
54 |
+
%45 = phi float [ 0.000000e+00, %8 ], [ %177, %130 ]
|
55 |
+
%46 = phi float [ 0.000000e+00, %8 ], [ %178, %130 ]
|
56 |
+
%47 = phi float [ 0.000000e+00, %8 ], [ %179, %130 ]
|
57 |
+
%48 = phi float [ 0.000000e+00, %8 ], [ %180, %130 ]
|
58 |
+
%49 = phi float [ 0.000000e+00, %8 ], [ %181, %130 ]
|
59 |
+
%50 = phi float [ 0.000000e+00, %8 ], [ %182, %130 ]
|
60 |
+
%51 = phi float [ 0.000000e+00, %8 ], [ %183, %130 ]
|
61 |
+
%52 = phi float [ 0.000000e+00, %8 ], [ %184, %130 ]
|
62 |
+
%53 = phi float [ 0.000000e+00, %8 ], [ %185, %130 ]
|
63 |
+
%54 = phi float [ 0.000000e+00, %8 ], [ %186, %130 ]
|
64 |
+
%55 = phi float [ 0.000000e+00, %8 ], [ %187, %130 ]
|
65 |
+
%56 = phi float [ 0.000000e+00, %8 ], [ %188, %130 ]
|
66 |
+
%57 = phi float [ 0.000000e+00, %8 ], [ %189, %130 ]
|
67 |
+
%58 = phi float [ 0.000000e+00, %8 ], [ %190, %130 ]
|
68 |
+
%59 = phi float [ 0.000000e+00, %8 ], [ %191, %130 ]
|
69 |
+
%60 = phi float [ 0.000000e+00, %8 ], [ %192, %130 ]
|
70 |
+
%61 = phi float [ 0.000000e+00, %8 ], [ %225, %130 ]
|
71 |
+
%62 = phi float [ 0.000000e+00, %8 ], [ %226, %130 ]
|
72 |
+
%63 = phi float [ 0.000000e+00, %8 ], [ %227, %130 ]
|
73 |
+
%64 = phi float [ 0.000000e+00, %8 ], [ %228, %130 ]
|
74 |
+
%65 = phi float [ 0.000000e+00, %8 ], [ %229, %130 ]
|
75 |
+
%66 = phi float [ 0.000000e+00, %8 ], [ %230, %130 ]
|
76 |
+
%67 = phi float [ 0.000000e+00, %8 ], [ %231, %130 ]
|
77 |
+
%68 = phi float [ 0.000000e+00, %8 ], [ %232, %130 ]
|
78 |
+
%69 = phi float [ 0.000000e+00, %8 ], [ %201, %130 ]
|
79 |
+
%70 = phi float [ 0.000000e+00, %8 ], [ %202, %130 ]
|
80 |
+
%71 = phi float [ 0.000000e+00, %8 ], [ %203, %130 ]
|
81 |
+
%72 = phi float [ 0.000000e+00, %8 ], [ %204, %130 ]
|
82 |
+
%73 = phi float [ 0.000000e+00, %8 ], [ %205, %130 ]
|
83 |
+
%74 = phi float [ 0.000000e+00, %8 ], [ %206, %130 ]
|
84 |
+
%75 = phi float [ 0.000000e+00, %8 ], [ %207, %130 ]
|
85 |
+
%76 = phi float [ 0.000000e+00, %8 ], [ %208, %130 ]
|
86 |
+
%77 = phi i1 [ true, %8 ], [ false, %130 ]
|
87 |
+
%78 = phi i32 [ 0, %8 ], [ 128, %130 ]
|
88 |
+
%79 = or i32 %78, %13, !dbg !26
|
89 |
+
%80 = or i32 %78, %14, !dbg !26
|
90 |
+
%81 = or i32 %79, %33, !dbg !27
|
91 |
+
%82 = or i32 %80, %33, !dbg !27
|
92 |
+
%83 = sext i32 %81 to i64, !dbg !28
|
93 |
+
%84 = getelementptr float, ptr addrspace(1) %2, i64 %83, !dbg !28
|
94 |
+
%85 = sext i32 %82 to i64, !dbg !28
|
95 |
+
%86 = getelementptr float, ptr addrspace(1) %2, i64 %85, !dbg !28
|
96 |
+
%87 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %84, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
|
97 |
+
%88 = extractvalue { i32, i32, i32, i32 } %87, 0, !dbg !29
|
98 |
+
%89 = extractvalue { i32, i32, i32, i32 } %87, 1, !dbg !29
|
99 |
+
%90 = extractvalue { i32, i32, i32, i32 } %87, 2, !dbg !29
|
100 |
+
%91 = extractvalue { i32, i32, i32, i32 } %87, 3, !dbg !29
|
101 |
+
%92 = bitcast i32 %88 to float, !dbg !29
|
102 |
+
%93 = bitcast i32 %89 to float, !dbg !29
|
103 |
+
%94 = bitcast i32 %90 to float, !dbg !29
|
104 |
+
%95 = bitcast i32 %91 to float, !dbg !29
|
105 |
+
%96 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %86, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
|
106 |
+
%97 = extractvalue { i32, i32, i32, i32 } %96, 0, !dbg !29
|
107 |
+
%98 = extractvalue { i32, i32, i32, i32 } %96, 1, !dbg !29
|
108 |
+
%99 = extractvalue { i32, i32, i32, i32 } %96, 2, !dbg !29
|
109 |
+
%100 = extractvalue { i32, i32, i32, i32 } %96, 3, !dbg !29
|
110 |
+
%101 = bitcast i32 %97 to float, !dbg !29
|
111 |
+
%102 = bitcast i32 %98 to float, !dbg !29
|
112 |
+
%103 = bitcast i32 %99 to float, !dbg !29
|
113 |
+
%104 = bitcast i32 %100 to float, !dbg !29
|
114 |
+
%105 = or i32 %79, %34, !dbg !30
|
115 |
+
%106 = sext i32 %105 to i64, !dbg !31
|
116 |
+
%107 = getelementptr i16, ptr addrspace(1) %3, i64 %106, !dbg !31
|
117 |
+
%108 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %107, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
|
118 |
+
%109 = extractvalue { i32, i32, i32, i32 } %108, 0, !dbg !32
|
119 |
+
%110 = extractvalue { i32, i32, i32, i32 } %108, 1, !dbg !32
|
120 |
+
%111 = extractvalue { i32, i32, i32, i32 } %108, 2, !dbg !32
|
121 |
+
%112 = extractvalue { i32, i32, i32, i32 } %108, 3, !dbg !32
|
122 |
+
%113 = trunc i32 %109 to i16, !dbg !32
|
123 |
+
%extelt.offset5 = lshr i32 %109, 16, !dbg !32
|
124 |
+
%114 = trunc i32 %extelt.offset5 to i16, !dbg !32
|
125 |
+
%115 = trunc i32 %110 to i16, !dbg !32
|
126 |
+
%extelt.offset6 = lshr i32 %110, 16, !dbg !32
|
127 |
+
%116 = trunc i32 %extelt.offset6 to i16, !dbg !32
|
128 |
+
%117 = trunc i32 %111 to i16, !dbg !32
|
129 |
+
%extelt.offset7 = lshr i32 %111, 16, !dbg !32
|
130 |
+
%118 = trunc i32 %extelt.offset7 to i16, !dbg !32
|
131 |
+
%119 = trunc i32 %112 to i16, !dbg !32
|
132 |
+
%extelt.offset8 = lshr i32 %112, 16, !dbg !32
|
133 |
+
%120 = trunc i32 %extelt.offset8 to i16, !dbg !32
|
134 |
+
%121 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %113) #6, !dbg !33
|
135 |
+
%122 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %114) #6, !dbg !33
|
136 |
+
%123 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %115) #6, !dbg !33
|
137 |
+
%124 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %116) #6, !dbg !33
|
138 |
+
%125 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %117) #6, !dbg !33
|
139 |
+
%126 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %118) #6, !dbg !33
|
140 |
+
%127 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %119) #6, !dbg !33
|
141 |
+
%128 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %120) #6, !dbg !33
|
142 |
+
br i1 %39, label %129, label %130, !dbg !34
|
143 |
+
|
144 |
+
129: ; preds = %44
|
145 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 1892, ptr nonnull @assertFunc_0, i64 1), !dbg !34
|
146 |
+
br label %130, !dbg !34
|
147 |
+
|
148 |
+
130: ; preds = %129, %44
|
149 |
+
%131 = zext nneg i32 %79 to i64, !dbg !35
|
150 |
+
%132 = zext nneg i32 %80 to i64, !dbg !35
|
151 |
+
%133 = getelementptr float, ptr addrspace(1) %43, i64 %131, !dbg !36
|
152 |
+
%134 = getelementptr float, ptr addrspace(1) %43, i64 %132, !dbg !36
|
153 |
+
%135 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %133, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
|
154 |
+
%136 = extractvalue { i32, i32, i32, i32 } %135, 0, !dbg !37
|
155 |
+
%137 = extractvalue { i32, i32, i32, i32 } %135, 1, !dbg !37
|
156 |
+
%138 = extractvalue { i32, i32, i32, i32 } %135, 2, !dbg !37
|
157 |
+
%139 = extractvalue { i32, i32, i32, i32 } %135, 3, !dbg !37
|
158 |
+
%140 = bitcast i32 %136 to float, !dbg !37
|
159 |
+
%141 = bitcast i32 %137 to float, !dbg !37
|
160 |
+
%142 = bitcast i32 %138 to float, !dbg !37
|
161 |
+
%143 = bitcast i32 %139 to float, !dbg !37
|
162 |
+
%144 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %134, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
|
163 |
+
%145 = extractvalue { i32, i32, i32, i32 } %144, 0, !dbg !37
|
164 |
+
%146 = extractvalue { i32, i32, i32, i32 } %144, 1, !dbg !37
|
165 |
+
%147 = extractvalue { i32, i32, i32, i32 } %144, 2, !dbg !37
|
166 |
+
%148 = extractvalue { i32, i32, i32, i32 } %144, 3, !dbg !37
|
167 |
+
%149 = bitcast i32 %145 to float, !dbg !37
|
168 |
+
%150 = bitcast i32 %146 to float, !dbg !37
|
169 |
+
%151 = bitcast i32 %147 to float, !dbg !37
|
170 |
+
%152 = bitcast i32 %148 to float, !dbg !37
|
171 |
+
%153 = fadd float %92, %140, !dbg !38
|
172 |
+
%154 = fadd float %93, %141, !dbg !38
|
173 |
+
%155 = fadd float %94, %142, !dbg !38
|
174 |
+
%156 = fadd float %95, %143, !dbg !38
|
175 |
+
%157 = fadd float %101, %149, !dbg !38
|
176 |
+
%158 = fadd float %102, %150, !dbg !38
|
177 |
+
%159 = fadd float %103, %151, !dbg !38
|
178 |
+
%160 = fadd float %104, %152, !dbg !38
|
179 |
+
%161 = fadd float %121, %153, !dbg !39
|
180 |
+
%162 = fadd float %122, %154, !dbg !39
|
181 |
+
%163 = fadd float %123, %155, !dbg !39
|
182 |
+
%164 = fadd float %124, %156, !dbg !39
|
183 |
+
%165 = fadd float %125, %157, !dbg !39
|
184 |
+
%166 = fadd float %126, %158, !dbg !39
|
185 |
+
%167 = fadd float %127, %159, !dbg !39
|
186 |
+
%168 = fadd float %128, %160, !dbg !39
|
187 |
+
%169 = fsub float %161, %69, !dbg !40
|
188 |
+
%170 = fsub float %162, %70, !dbg !40
|
189 |
+
%171 = fsub float %163, %71, !dbg !40
|
190 |
+
%172 = fsub float %164, %72, !dbg !40
|
191 |
+
%173 = fsub float %165, %73, !dbg !40
|
192 |
+
%174 = fsub float %166, %74, !dbg !40
|
193 |
+
%175 = fsub float %167, %75, !dbg !40
|
194 |
+
%176 = fsub float %168, %76, !dbg !40
|
195 |
+
%177 = fadd float %45, 1.000000e+00, !dbg !44
|
196 |
+
%178 = fadd float %46, 1.000000e+00, !dbg !44
|
197 |
+
%179 = fadd float %47, 1.000000e+00, !dbg !44
|
198 |
+
%180 = fadd float %48, 1.000000e+00, !dbg !44
|
199 |
+
%181 = fadd float %49, 1.000000e+00, !dbg !44
|
200 |
+
%182 = fadd float %50, 1.000000e+00, !dbg !44
|
201 |
+
%183 = fadd float %51, 1.000000e+00, !dbg !44
|
202 |
+
%184 = fadd float %52, 1.000000e+00, !dbg !44
|
203 |
+
%185 = fadd float %53, 1.000000e+00, !dbg !44
|
204 |
+
%186 = fadd float %54, 1.000000e+00, !dbg !44
|
205 |
+
%187 = fadd float %55, 1.000000e+00, !dbg !44
|
206 |
+
%188 = fadd float %56, 1.000000e+00, !dbg !44
|
207 |
+
%189 = fadd float %57, 1.000000e+00, !dbg !44
|
208 |
+
%190 = fadd float %58, 1.000000e+00, !dbg !44
|
209 |
+
%191 = fadd float %59, 1.000000e+00, !dbg !44
|
210 |
+
%192 = fadd float %60, 1.000000e+00, !dbg !44
|
211 |
+
%193 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %169, float %177) #6, !dbg !45
|
212 |
+
%194 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %170, float %178) #6, !dbg !45
|
213 |
+
%195 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %171, float %179) #6, !dbg !45
|
214 |
+
%196 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %172, float %180) #6, !dbg !45
|
215 |
+
%197 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %173, float %181) #6, !dbg !45
|
216 |
+
%198 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %174, float %182) #6, !dbg !45
|
217 |
+
%199 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %175, float %183) #6, !dbg !45
|
218 |
+
%200 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %176, float %184) #6, !dbg !45
|
219 |
+
%201 = fadd float %69, %193, !dbg !46
|
220 |
+
%202 = fadd float %70, %194, !dbg !46
|
221 |
+
%203 = fadd float %71, %195, !dbg !46
|
222 |
+
%204 = fadd float %72, %196, !dbg !46
|
223 |
+
%205 = fadd float %73, %197, !dbg !46
|
224 |
+
%206 = fadd float %74, %198, !dbg !46
|
225 |
+
%207 = fadd float %75, %199, !dbg !46
|
226 |
+
%208 = fadd float %76, %200, !dbg !46
|
227 |
+
%209 = fsub float %161, %201, !dbg !47
|
228 |
+
%210 = fsub float %162, %202, !dbg !47
|
229 |
+
%211 = fsub float %163, %203, !dbg !47
|
230 |
+
%212 = fsub float %164, %204, !dbg !47
|
231 |
+
%213 = fsub float %165, %205, !dbg !47
|
232 |
+
%214 = fsub float %166, %206, !dbg !47
|
233 |
+
%215 = fsub float %167, %207, !dbg !47
|
234 |
+
%216 = fsub float %168, %208, !dbg !47
|
235 |
+
%217 = fmul float %169, %209, !dbg !48
|
236 |
+
%218 = fmul float %170, %210, !dbg !48
|
237 |
+
%219 = fmul float %171, %211, !dbg !48
|
238 |
+
%220 = fmul float %172, %212, !dbg !48
|
239 |
+
%221 = fmul float %173, %213, !dbg !48
|
240 |
+
%222 = fmul float %174, %214, !dbg !48
|
241 |
+
%223 = fmul float %175, %215, !dbg !48
|
242 |
+
%224 = fmul float %176, %216, !dbg !48
|
243 |
+
%225 = fadd float %61, %217, !dbg !49
|
244 |
+
%226 = fadd float %62, %218, !dbg !49
|
245 |
+
%227 = fadd float %63, %219, !dbg !49
|
246 |
+
%228 = fadd float %64, %220, !dbg !49
|
247 |
+
%229 = fadd float %65, %221, !dbg !49
|
248 |
+
%230 = fadd float %66, %222, !dbg !49
|
249 |
+
%231 = fadd float %67, %223, !dbg !49
|
250 |
+
%232 = fadd float %68, %224, !dbg !49
|
251 |
+
br i1 %77, label %44, label %233, !dbg !25
|
252 |
+
|
253 |
+
233: ; preds = %130
|
254 |
+
%234 = and i32 %9, 127, !dbg !11
|
255 |
+
%235 = and i32 %9, 128, !dbg !25
|
256 |
+
%.not = icmp eq i32 %235, 0, !dbg !25
|
257 |
+
%236 = select i1 %.not, i32 0, i32 136, !dbg !25
|
258 |
+
%237 = add nuw nsw i32 %236, %234, !dbg !25
|
259 |
+
%238 = zext nneg i32 %237 to i64, !dbg !25
|
260 |
+
%239 = getelementptr float, ptr addrspace(3) @global_smem, i64 %238, !dbg !25
|
261 |
+
%240 = insertelement <1 x float> undef, float %185, i64 0, !dbg !25
|
262 |
+
store <1 x float> %240, ptr addrspace(3) %239, align 4, !dbg !25
|
263 |
+
%241 = add nuw nsw i32 %234, 272, !dbg !25
|
264 |
+
%242 = add nuw nsw i32 %241, %236, !dbg !25
|
265 |
+
%243 = zext nneg i32 %242 to i64, !dbg !25
|
266 |
+
%244 = getelementptr float, ptr addrspace(3) @global_smem, i64 %243, !dbg !25
|
267 |
+
%245 = insertelement <1 x float> undef, float %186, i64 0, !dbg !25
|
268 |
+
store <1 x float> %245, ptr addrspace(3) %244, align 4, !dbg !25
|
269 |
+
%246 = add nuw nsw i32 %234, 544, !dbg !25
|
270 |
+
%247 = add nuw nsw i32 %246, %236, !dbg !25
|
271 |
+
%248 = zext nneg i32 %247 to i64, !dbg !25
|
272 |
+
%249 = getelementptr float, ptr addrspace(3) @global_smem, i64 %248, !dbg !25
|
273 |
+
%250 = insertelement <1 x float> undef, float %187, i64 0, !dbg !25
|
274 |
+
store <1 x float> %250, ptr addrspace(3) %249, align 4, !dbg !25
|
275 |
+
%251 = add nuw nsw i32 %234, 816, !dbg !25
|
276 |
+
%252 = add nuw nsw i32 %251, %236, !dbg !25
|
277 |
+
%253 = zext nneg i32 %252 to i64, !dbg !25
|
278 |
+
%254 = getelementptr float, ptr addrspace(3) @global_smem, i64 %253, !dbg !25
|
279 |
+
%255 = insertelement <1 x float> undef, float %188, i64 0, !dbg !25
|
280 |
+
store <1 x float> %255, ptr addrspace(3) %254, align 4, !dbg !25
|
281 |
+
%256 = add nuw nsw i32 %234, 1088, !dbg !25
|
282 |
+
%257 = add nuw nsw i32 %256, %236, !dbg !25
|
283 |
+
%258 = zext nneg i32 %257 to i64, !dbg !25
|
284 |
+
%259 = getelementptr float, ptr addrspace(3) @global_smem, i64 %258, !dbg !25
|
285 |
+
%260 = insertelement <1 x float> undef, float %189, i64 0, !dbg !25
|
286 |
+
store <1 x float> %260, ptr addrspace(3) %259, align 4, !dbg !25
|
287 |
+
%261 = add nuw nsw i32 %234, 1360, !dbg !25
|
288 |
+
%262 = add nuw nsw i32 %261, %236, !dbg !25
|
289 |
+
%263 = zext nneg i32 %262 to i64, !dbg !25
|
290 |
+
%264 = getelementptr float, ptr addrspace(3) @global_smem, i64 %263, !dbg !25
|
291 |
+
%265 = insertelement <1 x float> undef, float %190, i64 0, !dbg !25
|
292 |
+
store <1 x float> %265, ptr addrspace(3) %264, align 4, !dbg !25
|
293 |
+
%266 = add nuw nsw i32 %234, 1632, !dbg !25
|
294 |
+
%267 = add nuw nsw i32 %266, %236, !dbg !25
|
295 |
+
%268 = zext nneg i32 %267 to i64, !dbg !25
|
296 |
+
%269 = getelementptr float, ptr addrspace(3) @global_smem, i64 %268, !dbg !25
|
297 |
+
%270 = insertelement <1 x float> undef, float %191, i64 0, !dbg !25
|
298 |
+
store <1 x float> %270, ptr addrspace(3) %269, align 4, !dbg !25
|
299 |
+
%271 = add nuw nsw i32 %234, 1904, !dbg !25
|
300 |
+
%272 = add nuw nsw i32 %271, %236, !dbg !25
|
301 |
+
%273 = zext nneg i32 %272 to i64, !dbg !25
|
302 |
+
%274 = getelementptr float, ptr addrspace(3) @global_smem, i64 %273, !dbg !25
|
303 |
+
%275 = insertelement <1 x float> undef, float %192, i64 0, !dbg !25
|
304 |
+
store <1 x float> %275, ptr addrspace(3) %274, align 4, !dbg !25
|
305 |
+
tail call void @llvm.nvvm.barrier0(), !dbg !25
|
306 |
+
%276 = mul nuw nsw i32 %11, 136, !dbg !25
|
307 |
+
%277 = add nuw nsw i32 %276, %13, !dbg !25
|
308 |
+
%278 = zext nneg i32 %277 to i64, !dbg !25
|
309 |
+
%279 = getelementptr float, ptr addrspace(3) @global_smem, i64 %278, !dbg !25
|
310 |
+
%280 = load float, ptr addrspace(3) %279, align 32, !dbg !25
|
311 |
+
%281 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 1, !dbg !25
|
312 |
+
%282 = load float, ptr addrspace(3) %281, align 4, !dbg !25
|
313 |
+
%283 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 2, !dbg !25
|
314 |
+
%284 = load float, ptr addrspace(3) %283, align 8, !dbg !25
|
315 |
+
%285 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 3, !dbg !25
|
316 |
+
%286 = load float, ptr addrspace(3) %285, align 4, !dbg !25
|
317 |
+
%287 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 4, !dbg !25
|
318 |
+
%288 = load float, ptr addrspace(3) %287, align 16, !dbg !25
|
319 |
+
%289 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 5, !dbg !25
|
320 |
+
%290 = load float, ptr addrspace(3) %289, align 4, !dbg !25
|
321 |
+
%291 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 6, !dbg !25
|
322 |
+
%292 = load float, ptr addrspace(3) %291, align 8, !dbg !25
|
323 |
+
%293 = getelementptr inbounds <8 x float>, ptr addrspace(3) %279, i64 0, i64 7, !dbg !25
|
324 |
+
%294 = load float, ptr addrspace(3) %293, align 4, !dbg !25
|
325 |
+
%295 = fsub float %202, %201, !dbg !50
|
326 |
+
%296 = fadd float %280, %282, !dbg !54
|
327 |
+
%297 = fcmp oeq float %296, 0.000000e+00, !dbg !55
|
328 |
+
%298 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %282, float %296) #6, !dbg !56
|
329 |
+
%299 = select i1 %297, float 0.000000e+00, float %298, !dbg !57
|
330 |
+
%300 = fmul float %295, %299, !dbg !58
|
331 |
+
%301 = fadd float %201, %300, !dbg !59
|
332 |
+
%302 = fadd float %225, %226, !dbg !60
|
333 |
+
%303 = fmul float %295, %295, !dbg !61
|
334 |
+
%304 = fmul float %303, %280, !dbg !62
|
335 |
+
%305 = fmul float %304, %299, !dbg !63
|
336 |
+
%306 = fadd float %302, %305, !dbg !64
|
337 |
+
%307 = fsub float %203, %301, !dbg !50
|
338 |
+
%308 = fadd float %284, %296, !dbg !54
|
339 |
+
%309 = fcmp oeq float %308, 0.000000e+00, !dbg !55
|
340 |
+
%310 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %284, float %308) #6, !dbg !56
|
341 |
+
%311 = select i1 %309, float 0.000000e+00, float %310, !dbg !57
|
342 |
+
%312 = fmul float %311, %307, !dbg !58
|
343 |
+
%313 = fadd float %301, %312, !dbg !59
|
344 |
+
%314 = fadd float %227, %306, !dbg !60
|
345 |
+
%315 = fmul float %307, %307, !dbg !61
|
346 |
+
%316 = fmul float %296, %315, !dbg !62
|
347 |
+
%317 = fmul float %311, %316, !dbg !63
|
348 |
+
%318 = fadd float %314, %317, !dbg !64
|
349 |
+
%319 = fsub float %204, %313, !dbg !50
|
350 |
+
%320 = fadd float %286, %308, !dbg !54
|
351 |
+
%321 = fcmp oeq float %320, 0.000000e+00, !dbg !55
|
352 |
+
%322 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %286, float %320) #6, !dbg !56
|
353 |
+
%323 = select i1 %321, float 0.000000e+00, float %322, !dbg !57
|
354 |
+
%324 = fmul float %323, %319, !dbg !58
|
355 |
+
%325 = fadd float %313, %324, !dbg !59
|
356 |
+
%326 = fadd float %228, %318, !dbg !60
|
357 |
+
%327 = fmul float %319, %319, !dbg !61
|
358 |
+
%328 = fmul float %308, %327, !dbg !62
|
359 |
+
%329 = fmul float %323, %328, !dbg !63
|
360 |
+
%330 = fadd float %326, %329, !dbg !64
|
361 |
+
%331 = fsub float %205, %325, !dbg !50
|
362 |
+
%332 = fadd float %288, %320, !dbg !54
|
363 |
+
%333 = fcmp oeq float %332, 0.000000e+00, !dbg !55
|
364 |
+
%334 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %288, float %332) #6, !dbg !56
|
365 |
+
%335 = select i1 %333, float 0.000000e+00, float %334, !dbg !57
|
366 |
+
%336 = fmul float %335, %331, !dbg !58
|
367 |
+
%337 = fadd float %325, %336, !dbg !59
|
368 |
+
%338 = fadd float %229, %330, !dbg !60
|
369 |
+
%339 = fmul float %331, %331, !dbg !61
|
370 |
+
%340 = fmul float %320, %339, !dbg !62
|
371 |
+
%341 = fmul float %335, %340, !dbg !63
|
372 |
+
%342 = fadd float %338, %341, !dbg !64
|
373 |
+
%343 = fsub float %206, %337, !dbg !50
|
374 |
+
%344 = fadd float %290, %332, !dbg !54
|
375 |
+
%345 = fcmp oeq float %344, 0.000000e+00, !dbg !55
|
376 |
+
%346 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %290, float %344) #6, !dbg !56
|
377 |
+
%347 = select i1 %345, float 0.000000e+00, float %346, !dbg !57
|
378 |
+
%348 = fmul float %347, %343, !dbg !58
|
379 |
+
%349 = fadd float %337, %348, !dbg !59
|
380 |
+
%350 = fadd float %230, %342, !dbg !60
|
381 |
+
%351 = fmul float %343, %343, !dbg !61
|
382 |
+
%352 = fmul float %332, %351, !dbg !62
|
383 |
+
%353 = fmul float %347, %352, !dbg !63
|
384 |
+
%354 = fadd float %350, %353, !dbg !64
|
385 |
+
%355 = fsub float %207, %349, !dbg !50
|
386 |
+
%356 = fadd float %292, %344, !dbg !54
|
387 |
+
%357 = fcmp oeq float %356, 0.000000e+00, !dbg !55
|
388 |
+
%358 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %292, float %356) #6, !dbg !56
|
389 |
+
%359 = select i1 %357, float 0.000000e+00, float %358, !dbg !57
|
390 |
+
%360 = fmul float %359, %355, !dbg !58
|
391 |
+
%361 = fadd float %349, %360, !dbg !59
|
392 |
+
%362 = fadd float %231, %354, !dbg !60
|
393 |
+
%363 = fmul float %355, %355, !dbg !61
|
394 |
+
%364 = fmul float %344, %363, !dbg !62
|
395 |
+
%365 = fmul float %359, %364, !dbg !63
|
396 |
+
%366 = fadd float %362, %365, !dbg !64
|
397 |
+
%367 = fsub float %208, %361, !dbg !50
|
398 |
+
%368 = fadd float %294, %356, !dbg !54
|
399 |
+
%369 = fcmp oeq float %368, 0.000000e+00, !dbg !55
|
400 |
+
%370 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %294, float %368) #6, !dbg !56
|
401 |
+
%371 = select i1 %369, float 0.000000e+00, float %370, !dbg !57
|
402 |
+
%372 = fmul float %371, %367, !dbg !58
|
403 |
+
%373 = fadd float %361, %372, !dbg !59
|
404 |
+
%374 = fadd float %232, %366, !dbg !60
|
405 |
+
%375 = fmul float %367, %367, !dbg !61
|
406 |
+
%376 = fmul float %356, %375, !dbg !62
|
407 |
+
%377 = fmul float %371, %376, !dbg !63
|
408 |
+
%378 = fadd float %374, %377, !dbg !64
|
409 |
+
%379 = bitcast float %373 to i32, !dbg !65
|
410 |
+
%380 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %379, i32 8, i32 31), !dbg !65
|
411 |
+
%381 = bitcast i32 %380 to float, !dbg !65
|
412 |
+
%382 = bitcast float %378 to i32, !dbg !65
|
413 |
+
%383 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %382, i32 8, i32 31), !dbg !65
|
414 |
+
%384 = bitcast i32 %383 to float, !dbg !65
|
415 |
+
%385 = bitcast float %368 to i32, !dbg !65
|
416 |
+
%386 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %385, i32 8, i32 31), !dbg !65
|
417 |
+
%387 = bitcast i32 %386 to float, !dbg !65
|
418 |
+
%388 = fsub float %381, %373, !dbg !50
|
419 |
+
%389 = fadd float %368, %387, !dbg !54
|
420 |
+
%390 = fcmp oeq float %389, 0.000000e+00, !dbg !55
|
421 |
+
%391 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %387, float %389) #6, !dbg !56
|
422 |
+
%392 = select i1 %390, float 0.000000e+00, float %391, !dbg !57
|
423 |
+
%393 = fmul float %392, %388, !dbg !58
|
424 |
+
%394 = fadd float %373, %393, !dbg !59
|
425 |
+
%395 = fadd float %378, %384, !dbg !60
|
426 |
+
%396 = fmul float %388, %388, !dbg !61
|
427 |
+
%397 = fmul float %368, %396, !dbg !62
|
428 |
+
%398 = fmul float %392, %397, !dbg !63
|
429 |
+
%399 = fadd float %395, %398, !dbg !64
|
430 |
+
%400 = bitcast float %394 to i32, !dbg !65
|
431 |
+
%401 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %400, i32 4, i32 31), !dbg !65
|
432 |
+
%402 = bitcast i32 %401 to float, !dbg !65
|
433 |
+
%403 = bitcast float %399 to i32, !dbg !65
|
434 |
+
%404 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %403, i32 4, i32 31), !dbg !65
|
435 |
+
%405 = bitcast i32 %404 to float, !dbg !65
|
436 |
+
%406 = bitcast float %389 to i32, !dbg !65
|
437 |
+
%407 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %406, i32 4, i32 31), !dbg !65
|
438 |
+
%408 = bitcast i32 %407 to float, !dbg !65
|
439 |
+
%409 = fsub float %402, %394, !dbg !50
|
440 |
+
%410 = fadd float %389, %408, !dbg !54
|
441 |
+
%411 = fcmp oeq float %410, 0.000000e+00, !dbg !55
|
442 |
+
%412 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %408, float %410) #6, !dbg !56
|
443 |
+
%413 = select i1 %411, float 0.000000e+00, float %412, !dbg !57
|
444 |
+
%414 = fmul float %413, %409, !dbg !58
|
445 |
+
%415 = fadd float %394, %414, !dbg !59
|
446 |
+
%416 = fadd float %399, %405, !dbg !60
|
447 |
+
%417 = fmul float %409, %409, !dbg !61
|
448 |
+
%418 = fmul float %389, %417, !dbg !62
|
449 |
+
%419 = fmul float %413, %418, !dbg !63
|
450 |
+
%420 = fadd float %416, %419, !dbg !64
|
451 |
+
%421 = bitcast float %415 to i32, !dbg !65
|
452 |
+
%422 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %421, i32 2, i32 31), !dbg !65
|
453 |
+
%423 = bitcast i32 %422 to float, !dbg !65
|
454 |
+
%424 = bitcast float %420 to i32, !dbg !65
|
455 |
+
%425 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %424, i32 2, i32 31), !dbg !65
|
456 |
+
%426 = bitcast i32 %425 to float, !dbg !65
|
457 |
+
%427 = bitcast float %410 to i32, !dbg !65
|
458 |
+
%428 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %427, i32 2, i32 31), !dbg !65
|
459 |
+
%429 = bitcast i32 %428 to float, !dbg !65
|
460 |
+
%430 = fsub float %423, %415, !dbg !50
|
461 |
+
%431 = fadd float %410, %429, !dbg !54
|
462 |
+
%432 = fcmp oeq float %431, 0.000000e+00, !dbg !55
|
463 |
+
%433 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %429, float %431) #6, !dbg !56
|
464 |
+
%434 = select i1 %432, float 0.000000e+00, float %433, !dbg !57
|
465 |
+
%435 = fmul float %434, %430, !dbg !58
|
466 |
+
%436 = fadd float %415, %435, !dbg !59
|
467 |
+
%437 = fadd float %420, %426, !dbg !60
|
468 |
+
%438 = fmul float %430, %430, !dbg !61
|
469 |
+
%439 = fmul float %410, %438, !dbg !62
|
470 |
+
%440 = fmul float %434, %439, !dbg !63
|
471 |
+
%441 = fadd float %437, %440, !dbg !64
|
472 |
+
%442 = bitcast float %436 to i32, !dbg !65
|
473 |
+
%443 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %442, i32 1, i32 31), !dbg !65
|
474 |
+
%444 = bitcast i32 %443 to float, !dbg !65
|
475 |
+
%445 = bitcast float %441 to i32, !dbg !65
|
476 |
+
%446 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %445, i32 1, i32 31), !dbg !65
|
477 |
+
%447 = bitcast i32 %446 to float, !dbg !65
|
478 |
+
%448 = bitcast float %431 to i32, !dbg !65
|
479 |
+
%449 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %448, i32 1, i32 31), !dbg !65
|
480 |
+
%450 = bitcast i32 %449 to float, !dbg !65
|
481 |
+
%451 = fsub float %444, %436, !dbg !50
|
482 |
+
%452 = fadd float %431, %450, !dbg !54
|
483 |
+
%453 = fcmp oeq float %452, 0.000000e+00, !dbg !55
|
484 |
+
%454 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %450, float %452) #6, !dbg !56
|
485 |
+
%455 = select i1 %453, float 0.000000e+00, float %454, !dbg !57
|
486 |
+
%456 = fmul float %455, %451, !dbg !58
|
487 |
+
%457 = fadd float %436, %456, !dbg !59
|
488 |
+
%458 = fadd float %441, %447, !dbg !60
|
489 |
+
%459 = fmul float %451, %451, !dbg !61
|
490 |
+
%460 = fmul float %431, %459, !dbg !62
|
491 |
+
%461 = fmul float %455, %460, !dbg !63
|
492 |
+
%462 = fadd float %458, %461, !dbg !64
|
493 |
+
%463 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
|
494 |
+
%464 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
|
495 |
+
%465 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
|
496 |
+
%466 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
|
497 |
+
%467 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
|
498 |
+
%468 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
|
499 |
+
%469 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
|
500 |
+
%470 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %462, float 2.560000e+02) #6, !dbg !67
|
501 |
+
%471 = fadd float %463, 0x3EE4F8B580000000, !dbg !68
|
502 |
+
br label %472, !dbg !69
|
503 |
+
|
504 |
+
472: ; preds = %233, %__nv_rsqrtf.exit
|
505 |
+
%473 = phi i1 [ true, %233 ], [ false, %__nv_rsqrtf.exit ]
|
506 |
+
%474 = phi i32 [ 0, %233 ], [ 128, %__nv_rsqrtf.exit ]
|
507 |
+
%475 = or i32 %474, %13, !dbg !70
|
508 |
+
%476 = or i32 %474, %14, !dbg !70
|
509 |
+
%477 = or i32 %475, %33, !dbg !71
|
510 |
+
%478 = or i32 %476, %33, !dbg !71
|
511 |
+
%479 = sext i32 %477 to i64, !dbg !72
|
512 |
+
%480 = getelementptr float, ptr addrspace(1) %2, i64 %479, !dbg !72
|
513 |
+
%481 = sext i32 %478 to i64, !dbg !72
|
514 |
+
%482 = getelementptr float, ptr addrspace(1) %2, i64 %481, !dbg !72
|
515 |
+
%483 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %480, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
|
516 |
+
%484 = extractvalue { i32, i32, i32, i32 } %483, 0, !dbg !73
|
517 |
+
%485 = extractvalue { i32, i32, i32, i32 } %483, 1, !dbg !73
|
518 |
+
%486 = extractvalue { i32, i32, i32, i32 } %483, 2, !dbg !73
|
519 |
+
%487 = extractvalue { i32, i32, i32, i32 } %483, 3, !dbg !73
|
520 |
+
%488 = bitcast i32 %484 to float, !dbg !73
|
521 |
+
%489 = bitcast i32 %485 to float, !dbg !73
|
522 |
+
%490 = bitcast i32 %486 to float, !dbg !73
|
523 |
+
%491 = bitcast i32 %487 to float, !dbg !73
|
524 |
+
%492 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %482, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
|
525 |
+
%493 = extractvalue { i32, i32, i32, i32 } %492, 0, !dbg !73
|
526 |
+
%494 = extractvalue { i32, i32, i32, i32 } %492, 1, !dbg !73
|
527 |
+
%495 = extractvalue { i32, i32, i32, i32 } %492, 2, !dbg !73
|
528 |
+
%496 = extractvalue { i32, i32, i32, i32 } %492, 3, !dbg !73
|
529 |
+
%497 = bitcast i32 %493 to float, !dbg !73
|
530 |
+
%498 = bitcast i32 %494 to float, !dbg !73
|
531 |
+
%499 = bitcast i32 %495 to float, !dbg !73
|
532 |
+
%500 = bitcast i32 %496 to float, !dbg !73
|
533 |
+
%501 = or i32 %475, %34, !dbg !74
|
534 |
+
%502 = sext i32 %501 to i64, !dbg !75
|
535 |
+
%503 = getelementptr i16, ptr addrspace(1) %3, i64 %502, !dbg !75
|
536 |
+
%504 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %503, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76
|
537 |
+
%505 = extractvalue { i32, i32, i32, i32 } %504, 0, !dbg !76
|
538 |
+
%506 = extractvalue { i32, i32, i32, i32 } %504, 1, !dbg !76
|
539 |
+
%507 = extractvalue { i32, i32, i32, i32 } %504, 2, !dbg !76
|
540 |
+
%508 = extractvalue { i32, i32, i32, i32 } %504, 3, !dbg !76
|
541 |
+
%509 = trunc i32 %505 to i16, !dbg !76
|
542 |
+
%extelt.offset = lshr i32 %505, 16, !dbg !76
|
543 |
+
%510 = trunc i32 %extelt.offset to i16, !dbg !76
|
544 |
+
%511 = trunc i32 %506 to i16, !dbg !76
|
545 |
+
%extelt.offset2 = lshr i32 %506, 16, !dbg !76
|
546 |
+
%512 = trunc i32 %extelt.offset2 to i16, !dbg !76
|
547 |
+
%513 = trunc i32 %507 to i16, !dbg !76
|
548 |
+
%extelt.offset3 = lshr i32 %507, 16, !dbg !76
|
549 |
+
%514 = trunc i32 %extelt.offset3 to i16, !dbg !76
|
550 |
+
%515 = trunc i32 %508 to i16, !dbg !76
|
551 |
+
%extelt.offset4 = lshr i32 %508, 16, !dbg !76
|
552 |
+
%516 = trunc i32 %extelt.offset4 to i16, !dbg !76
|
553 |
+
%517 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %509) #6, !dbg !77
|
554 |
+
%518 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %510) #6, !dbg !77
|
555 |
+
%519 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %511) #6, !dbg !77
|
556 |
+
%520 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %512) #6, !dbg !77
|
557 |
+
%521 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %513) #6, !dbg !77
|
558 |
+
%522 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %514) #6, !dbg !77
|
559 |
+
%523 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %515) #6, !dbg !77
|
560 |
+
%524 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %516) #6, !dbg !77
|
561 |
+
%525 = zext nneg i32 %475 to i64, !dbg !78
|
562 |
+
%526 = getelementptr float, ptr addrspace(1) %4, i64 %525, !dbg !78
|
563 |
+
%527 = zext nneg i32 %476 to i64, !dbg !78
|
564 |
+
%528 = getelementptr float, ptr addrspace(1) %4, i64 %527, !dbg !78
|
565 |
+
%529 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %526, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
|
566 |
+
%530 = extractvalue { i32, i32, i32, i32 } %529, 0, !dbg !79
|
567 |
+
%531 = extractvalue { i32, i32, i32, i32 } %529, 1, !dbg !79
|
568 |
+
%532 = extractvalue { i32, i32, i32, i32 } %529, 2, !dbg !79
|
569 |
+
%533 = extractvalue { i32, i32, i32, i32 } %529, 3, !dbg !79
|
570 |
+
%534 = bitcast i32 %530 to float, !dbg !79
|
571 |
+
%535 = bitcast i32 %531 to float, !dbg !79
|
572 |
+
%536 = bitcast i32 %532 to float, !dbg !79
|
573 |
+
%537 = bitcast i32 %533 to float, !dbg !79
|
574 |
+
%538 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %528, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
|
575 |
+
%539 = extractvalue { i32, i32, i32, i32 } %538, 0, !dbg !79
|
576 |
+
%540 = extractvalue { i32, i32, i32, i32 } %538, 1, !dbg !79
|
577 |
+
%541 = extractvalue { i32, i32, i32, i32 } %538, 2, !dbg !79
|
578 |
+
%542 = extractvalue { i32, i32, i32, i32 } %538, 3, !dbg !79
|
579 |
+
%543 = bitcast i32 %539 to float, !dbg !79
|
580 |
+
%544 = bitcast i32 %540 to float, !dbg !79
|
581 |
+
%545 = bitcast i32 %541 to float, !dbg !79
|
582 |
+
%546 = bitcast i32 %542 to float, !dbg !79
|
583 |
+
br i1 %39, label %547, label %548, !dbg !80
|
584 |
+
|
585 |
+
547: ; preds = %472
|
586 |
+
tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 1892, ptr nonnull @assertFunc_1, i64 1), !dbg !80
|
587 |
+
br label %548, !dbg !80
|
588 |
+
|
589 |
+
548: ; preds = %547, %472
|
590 |
+
%549 = getelementptr float, ptr addrspace(1) %43, i64 %525, !dbg !81
|
591 |
+
%550 = getelementptr float, ptr addrspace(1) %43, i64 %527, !dbg !81
|
592 |
+
%551 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %549, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
|
593 |
+
%552 = extractvalue { i32, i32, i32, i32 } %551, 0, !dbg !82
|
594 |
+
%553 = extractvalue { i32, i32, i32, i32 } %551, 1, !dbg !82
|
595 |
+
%554 = extractvalue { i32, i32, i32, i32 } %551, 2, !dbg !82
|
596 |
+
%555 = extractvalue { i32, i32, i32, i32 } %551, 3, !dbg !82
|
597 |
+
%556 = bitcast i32 %552 to float, !dbg !82
|
598 |
+
%557 = bitcast i32 %553 to float, !dbg !82
|
599 |
+
%558 = bitcast i32 %554 to float, !dbg !82
|
600 |
+
%559 = bitcast i32 %555 to float, !dbg !82
|
601 |
+
%560 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %550, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
|
602 |
+
%561 = extractvalue { i32, i32, i32, i32 } %560, 0, !dbg !82
|
603 |
+
%562 = extractvalue { i32, i32, i32, i32 } %560, 1, !dbg !82
|
604 |
+
%563 = extractvalue { i32, i32, i32, i32 } %560, 2, !dbg !82
|
605 |
+
%564 = extractvalue { i32, i32, i32, i32 } %560, 3, !dbg !82
|
606 |
+
%565 = bitcast i32 %561 to float, !dbg !82
|
607 |
+
%566 = bitcast i32 %562 to float, !dbg !82
|
608 |
+
%567 = bitcast i32 %563 to float, !dbg !82
|
609 |
+
%568 = bitcast i32 %564 to float, !dbg !82
|
610 |
+
%569 = fadd float %488, %556, !dbg !83
|
611 |
+
%570 = fadd float %489, %557, !dbg !83
|
612 |
+
%571 = fadd float %490, %558, !dbg !83
|
613 |
+
%572 = fadd float %491, %559, !dbg !83
|
614 |
+
%573 = fadd float %497, %565, !dbg !83
|
615 |
+
%574 = fadd float %498, %566, !dbg !83
|
616 |
+
%575 = fadd float %499, %567, !dbg !83
|
617 |
+
%576 = fadd float %500, %568, !dbg !83
|
618 |
+
%577 = fadd float %517, %569, !dbg !84
|
619 |
+
%578 = fadd float %518, %570, !dbg !84
|
620 |
+
%579 = fadd float %519, %571, !dbg !84
|
621 |
+
%580 = fadd float %520, %572, !dbg !84
|
622 |
+
%581 = fadd float %521, %573, !dbg !84
|
623 |
+
%582 = fadd float %522, %574, !dbg !84
|
624 |
+
%583 = fadd float %523, %575, !dbg !84
|
625 |
+
%584 = fadd float %524, %576, !dbg !84
|
626 |
+
%585 = fsub float %577, %457, !dbg !85
|
627 |
+
%586 = fsub float %578, %457, !dbg !85
|
628 |
+
%587 = fsub float %579, %457, !dbg !85
|
629 |
+
%588 = fsub float %580, %457, !dbg !85
|
630 |
+
%589 = fsub float %581, %457, !dbg !85
|
631 |
+
%590 = fsub float %582, %457, !dbg !85
|
632 |
+
%591 = fsub float %583, %457, !dbg !85
|
633 |
+
%592 = fsub float %584, %457, !dbg !85
|
634 |
+
%593 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
635 |
+
%.not.i = icmp eq i32 %593, 0, !dbg !86
|
636 |
+
br i1 %.not.i, label %596, label %594, !dbg !86
|
637 |
+
|
638 |
+
594: ; preds = %548
|
639 |
+
%595 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %471), !dbg !86
|
640 |
+
br label %__nv_rsqrtf.exit, !dbg !86
|
641 |
+
|
642 |
+
596: ; preds = %548
|
643 |
+
%597 = tail call float @llvm.nvvm.rsqrt.approx.f(float %471), !dbg !86
|
644 |
+
br label %__nv_rsqrtf.exit, !dbg !86
|
645 |
+
|
646 |
+
__nv_rsqrtf.exit: ; preds = %594, %596
|
647 |
+
%.0.i = phi float [ %595, %594 ], [ %597, %596 ], !dbg !86
|
648 |
+
%598 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
649 |
+
%599 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
650 |
+
%600 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
651 |
+
%601 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
652 |
+
%602 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
653 |
+
%603 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
654 |
+
%604 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
|
655 |
+
%605 = fmul float %585, %.0.i, !dbg !87
|
656 |
+
%606 = fmul float %586, %.0.i, !dbg !87
|
657 |
+
%607 = fmul float %587, %.0.i, !dbg !87
|
658 |
+
%608 = fmul float %588, %.0.i, !dbg !87
|
659 |
+
%609 = fmul float %589, %.0.i, !dbg !87
|
660 |
+
%610 = fmul float %590, %.0.i, !dbg !87
|
661 |
+
%611 = fmul float %591, %.0.i, !dbg !87
|
662 |
+
%612 = fmul float %592, %.0.i, !dbg !87
|
663 |
+
%613 = fmul float %605, %534, !dbg !88
|
664 |
+
%614 = fmul float %606, %535, !dbg !88
|
665 |
+
%615 = fmul float %607, %536, !dbg !88
|
666 |
+
%616 = fmul float %608, %537, !dbg !88
|
667 |
+
%617 = fmul float %609, %543, !dbg !88
|
668 |
+
%618 = fmul float %610, %544, !dbg !88
|
669 |
+
%619 = fmul float %611, %545, !dbg !88
|
670 |
+
%620 = fmul float %612, %546, !dbg !88
|
671 |
+
%621 = getelementptr i16, ptr addrspace(1) %5, i64 %502, !dbg !89
|
672 |
+
%622 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %613) #6, !dbg !90
|
673 |
+
%623 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %614) #6, !dbg !90
|
674 |
+
%624 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %615) #6, !dbg !90
|
675 |
+
%625 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %616) #6, !dbg !90
|
676 |
+
%626 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %617) #6, !dbg !90
|
677 |
+
%627 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %618) #6, !dbg !90
|
678 |
+
%628 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %619) #6, !dbg !90
|
679 |
+
%629 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %620) #6, !dbg !90
|
680 |
+
%630 = insertelement <2 x i16> undef, i16 %622, i64 0, !dbg !90
|
681 |
+
%631 = insertelement <2 x i16> %630, i16 %623, i64 1, !dbg !90
|
682 |
+
%632 = bitcast <2 x i16> %631 to i32, !dbg !90
|
683 |
+
%633 = insertelement <2 x i16> undef, i16 %624, i64 0, !dbg !90
|
684 |
+
%634 = insertelement <2 x i16> %633, i16 %625, i64 1, !dbg !90
|
685 |
+
%635 = bitcast <2 x i16> %634 to i32, !dbg !90
|
686 |
+
%636 = insertelement <2 x i16> undef, i16 %626, i64 0, !dbg !90
|
687 |
+
%637 = insertelement <2 x i16> %636, i16 %627, i64 1, !dbg !90
|
688 |
+
%638 = bitcast <2 x i16> %637 to i32, !dbg !90
|
689 |
+
%639 = insertelement <2 x i16> undef, i16 %628, i64 0, !dbg !90
|
690 |
+
%640 = insertelement <2 x i16> %639, i16 %629, i64 1, !dbg !90
|
691 |
+
%641 = bitcast <2 x i16> %640 to i32, !dbg !90
|
692 |
+
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %632, i32 %635, i32 %638, i32 %641, ptr addrspace(1) %621, i1 true) #6, !dbg !90
|
693 |
+
br i1 %473, label %472, label %642, !dbg !69
|
694 |
+
|
695 |
+
642: ; preds = %__nv_rsqrtf.exit
|
696 |
+
ret void, !dbg !91
|
697 |
+
}
|
698 |
+
|
699 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
700 |
+
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
|
701 |
+
|
702 |
+
; Function Attrs: convergent nocallback nounwind
|
703 |
+
declare void @llvm.nvvm.barrier0() #1
|
704 |
+
|
705 |
+
; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
|
706 |
+
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
|
707 |
+
|
708 |
+
; Function Attrs: alwaysinline nounwind
|
709 |
+
; __nv_rsqrtf(float %x) -> float
; Selects between the two NVVM rsqrt.approx intrinsics based on the value
; returned by __nvvm_reflect(@.str), then returns the chosen result.
;   reflect != 0 -> llvm.nvvm.rsqrt.approx.ftz.f(%x)
;   reflect == 0 -> llvm.nvvm.rsqrt.approx.f(%x)
; NOTE(review): @.str is defined outside this view; presumably it is the FTZ
; query string that pairs with the module's "nvvm-reflect-ftz" flag - confirm.
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
|
710 |
+
%1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 ; query the reflect setting named by @.str
|
711 |
+
%.not = icmp eq i32 %1, 0 ; true when the reflect query returned 0
|
712 |
+
br i1 %.not, label %4, label %2 ; 0 -> block %4 (non-ftz), non-zero -> block %2 (ftz)
|
713 |
+
|
714 |
+
2: ; preds = %0
|
715 |
+
%3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) ; ".ftz" variant of the intrinsic
|
716 |
+
br label %6
|
717 |
+
|
718 |
+
4: ; preds = %0
|
719 |
+
%5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) ; variant without the ".ftz" suffix
|
720 |
+
br label %6
|
721 |
+
|
722 |
+
6: ; preds = %4, %2
|
723 |
+
%.0 = phi float [ %3, %2 ], [ %5, %4 ] ; merge the result from whichever branch ran
|
724 |
+
ret float %.0
|
725 |
+
}
|
726 |
+
|
727 |
+
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
|
728 |
+
|
729 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
730 |
+
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
|
731 |
+
|
732 |
+
; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
|
733 |
+
declare float @llvm.nvvm.rsqrt.approx.f(float) #5
|
734 |
+
|
735 |
+
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
736 |
+
attributes #1 = { convergent nocallback nounwind }
|
737 |
+
attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
|
738 |
+
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
739 |
+
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
740 |
+
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
|
741 |
+
attributes #6 = { nounwind }
|
742 |
+
|
743 |
+
!llvm.module.flags = !{!0, !1}
|
744 |
+
!llvm.dbg.cu = !{!2}
|
745 |
+
!nvvm.annotations = !{!4, !5, !5, !4}
|
746 |
+
!llvm.ident = !{!6}
|
747 |
+
|
748 |
+
!0 = !{i32 2, !"Debug Info Version", i32 3}
|
749 |
+
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
|
750 |
+
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
|
751 |
+
!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
|
752 |
+
!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
|
753 |
+
!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 256}
|
754 |
+
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
|
755 |
+
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
|
756 |
+
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
|
757 |
+
!9 = !{}
|
758 |
+
!10 = !DILocation(line: 22, column: 44, scope: !7)
|
759 |
+
!11 = !DILocation(line: 24, column: 33, scope: !7)
|
760 |
+
!12 = !DILocation(line: 21, column: 28, scope: !7)
|
761 |
+
!13 = !DILocation(line: 21, column: 33, scope: !7)
|
762 |
+
!14 = !DILocation(line: 22, column: 23, scope: !7)
|
763 |
+
!15 = !DILocation(line: 26, column: 30, scope: !7)
|
764 |
+
!16 = !DILocation(line: 26, column: 35, scope: !7)
|
765 |
+
!17 = !DILocation(line: 27, column: 18, scope: !7)
|
766 |
+
!18 = !DILocation(line: 35, column: 44, scope: !7)
|
767 |
+
!19 = !DILocation(line: 36, column: 44, scope: !7)
|
768 |
+
!20 = !DILocation(line: 37, column: 22, scope: !7)
|
769 |
+
!21 = !DILocation(line: 38, column: 22, scope: !7)
|
770 |
+
!22 = !DILocation(line: 39, column: 36, scope: !7)
|
771 |
+
!23 = !DILocation(line: 40, column: 40, scope: !7)
|
772 |
+
!24 = !DILocation(line: 41, column: 44, scope: !7)
|
773 |
+
!25 = !DILocation(line: 31, column: 36, scope: !7)
|
774 |
+
!26 = !DILocation(line: 32, column: 27, scope: !7)
|
775 |
+
!27 = !DILocation(line: 35, column: 40, scope: !7)
|
776 |
+
!28 = !DILocation(line: 35, column: 34, scope: !7)
|
777 |
+
!29 = !DILocation(line: 35, column: 50, scope: !7)
|
778 |
+
!30 = !DILocation(line: 36, column: 40, scope: !7)
|
779 |
+
!31 = !DILocation(line: 36, column: 34, scope: !7)
|
780 |
+
!32 = !DILocation(line: 36, column: 50, scope: !7)
|
781 |
+
!33 = !DILocation(line: 36, column: 101, scope: !7)
|
782 |
+
!34 = !DILocation(line: 40, column: 55, scope: !7)
|
783 |
+
!35 = !DILocation(line: 41, column: 40, scope: !7)
|
784 |
+
!36 = !DILocation(line: 41, column: 34, scope: !7)
|
785 |
+
!37 = !DILocation(line: 41, column: 52, scope: !7)
|
786 |
+
!38 = !DILocation(line: 42, column: 22, scope: !7)
|
787 |
+
!39 = !DILocation(line: 44, column: 22, scope: !7)
|
788 |
+
!40 = !DILocation(line: 96, column: 20, scope: !41, inlinedAt: !43)
|
789 |
+
!41 = distinct !DILexicalBlockFile(scope: !7, file: !42, discriminator: 0)
|
790 |
+
!42 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
|
791 |
+
!43 = !DILocation(line: 47, column: 41, scope: !41)
|
792 |
+
!44 = !DILocation(line: 97, column: 26, scope: !41, inlinedAt: !43)
|
793 |
+
!45 = !DILocation(line: 98, column: 30, scope: !41, inlinedAt: !43)
|
794 |
+
!46 = !DILocation(line: 98, column: 22, scope: !41, inlinedAt: !43)
|
795 |
+
!47 = !DILocation(line: 101, column: 30, scope: !41, inlinedAt: !43)
|
796 |
+
!48 = !DILocation(line: 101, column: 22, scope: !41, inlinedAt: !43)
|
797 |
+
!49 = !DILocation(line: 50, column: 50, scope: !7)
|
798 |
+
!50 = !DILocation(line: 108, column: 21, scope: !51, inlinedAt: !52)
|
799 |
+
!51 = distinct !DILexicalBlockFile(scope: !41, file: !42, discriminator: 0)
|
800 |
+
!52 = !DILocation(line: 120, column: 46, scope: !51, inlinedAt: !53)
|
801 |
+
!53 = !DILocation(line: 53, column: 44, scope: !51)
|
802 |
+
!54 = !DILocation(line: 109, column: 28, scope: !51, inlinedAt: !52)
|
803 |
+
!55 = !DILocation(line: 110, column: 39, scope: !51, inlinedAt: !52)
|
804 |
+
!56 = !DILocation(line: 110, column: 60, scope: !51, inlinedAt: !52)
|
805 |
+
!57 = !DILocation(line: 110, column: 49, scope: !51, inlinedAt: !52)
|
806 |
+
!58 = !DILocation(line: 112, column: 25, scope: !51, inlinedAt: !52)
|
807 |
+
!59 = !DILocation(line: 112, column: 17, scope: !51, inlinedAt: !52)
|
808 |
+
!60 = !DILocation(line: 113, column: 15, scope: !51, inlinedAt: !52)
|
809 |
+
!61 = !DILocation(line: 113, column: 30, scope: !51, inlinedAt: !52)
|
810 |
+
!62 = !DILocation(line: 113, column: 38, scope: !51, inlinedAt: !52)
|
811 |
+
!63 = !DILocation(line: 113, column: 49, scope: !51, inlinedAt: !52)
|
812 |
+
!64 = !DILocation(line: 113, column: 22, scope: !51, inlinedAt: !52)
|
813 |
+
!65 = !DILocation(line: 120, column: 46, scope: !41, inlinedAt: !66)
|
814 |
+
!66 = !DILocation(line: 53, column: 44, scope: !41)
|
815 |
+
!67 = !DILocation(line: 75, column: 24, scope: !7)
|
816 |
+
!68 = !DILocation(line: 77, column: 24, scope: !7)
|
817 |
+
!69 = !DILocation(line: 58, column: 36, scope: !7)
|
818 |
+
!70 = !DILocation(line: 59, column: 27, scope: !7)
|
819 |
+
!71 = !DILocation(line: 62, column: 41, scope: !7)
|
820 |
+
!72 = !DILocation(line: 62, column: 35, scope: !7)
|
821 |
+
!73 = !DILocation(line: 62, column: 51, scope: !7)
|
822 |
+
!74 = !DILocation(line: 63, column: 41, scope: !7)
|
823 |
+
!75 = !DILocation(line: 63, column: 35, scope: !7)
|
824 |
+
!76 = !DILocation(line: 63, column: 51, scope: !7)
|
825 |
+
!77 = !DILocation(line: 63, column: 103, scope: !7)
|
826 |
+
!78 = !DILocation(line: 64, column: 35, scope: !7)
|
827 |
+
!79 = !DILocation(line: 64, column: 40, scope: !7)
|
828 |
+
!80 = !DILocation(line: 68, column: 57, scope: !7)
|
829 |
+
!81 = !DILocation(line: 69, column: 35, scope: !7)
|
830 |
+
!82 = !DILocation(line: 69, column: 54, scope: !7)
|
831 |
+
!83 = !DILocation(line: 70, column: 24, scope: !7)
|
832 |
+
!84 = !DILocation(line: 72, column: 24, scope: !7)
|
833 |
+
!85 = !DILocation(line: 73, column: 24, scope: !7)
|
834 |
+
!86 = !DILocation(line: 78, column: 30, scope: !7)
|
835 |
+
!87 = !DILocation(line: 79, column: 24, scope: !7)
|
836 |
+
!88 = !DILocation(line: 80, column: 24, scope: !7)
|
837 |
+
!89 = !DILocation(line: 82, column: 29, scope: !7)
|
838 |
+
!90 = !DILocation(line: 82, column: 52, scope: !7)
|
839 |
+
!91 = !DILocation(line: 58, column: 4, scope: !7)
|