|
|
|
source_filename = "LLVMDialectModule" |
|
|
|
; Externally defined, zero-length byte array in address space 3. The kernel in
; this module addresses it as an array of floats and accesses it with
; st.shared / ld.shared as scratch space for its block-wide reductions.
@global_smem = external addrspace(3) global [0 x i8] |
|
; NUL-terminated string "__CUDA_FTZ" (11 bytes). It is the argument passed to
; @__nvvm_reflect to choose between the .ftz and non-.ftz rsqrt intrinsics.
@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 |
|
|
|
; Kernel: per-row sum of five inputs followed by a mean/variance normalization.
;
; Each thread handles 4 consecutive elements; one thread block handles 256
; elements selected by %ctaid.x. Steps performed by the body:
;   1. load 4 x f32 from %0 and 4 x bf16 from each of %1..%4 (bf16 -> f32),
;   2. add the five inputs element-wise,
;   3. reduce the sums across the block, divide by 256 (mean),
;   4. reduce the squared deviations across the block, divide by 256,
;      add 0x3EE4F8B580000000 (about 1e-5) and take the reciprocal square root,
;   5. scale the deviations by that value and by the 4 x f32 loaded from %5,
;   6. store the element-wise sums to %6 as f32 and the scaled result to %7
;      as bf16.
;
; Arguments:
;   %0      global f32 input, indexed by ctaid.x * 256 + thread offset
;   %1..%4  global bf16 inputs, same indexing
;   %5      global f32 input, indexed by the thread offset only
;   %6      global f32 output (element-wise sums)
;   %7      global bf16 output (normalized and scaled values)
;   %8, %9  i32 values that are not referenced anywhere in the body
;
; Every global load/store predicate is the constant true, so no bounds masking
; is applied. The reduction code handles exactly two warps: the warp index is
; masked to one bit and the second stage combines two shared-memory slots.
; NOTE(review): the computation has the shape of a LayerNorm over a 256-wide
; row with a weight and no bias - confirm against the generating Python source.
define void @triton__0d1d2d3d4d5d6d7d8de9de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, i32 %8, i32 %9) local_unnamed_addr !dbg !7 { |
|
; --- Thread indexing ---------------------------------------------------------
; %11 = tid.x, %12 = lane within the warp (tid & 31),
; %14 = warp index limited to one bit ((tid >> 5) & 1),
; %15 = (tid * 4) & 252 = offset of this thread's first element in the row.
%11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 |
|
%12 = and i32 %11, 31, !dbg !10 |
|
%13 = lshr i32 %11, 5, !dbg !10 |
|
%14 = and i32 %13, 1, !dbg !10 |
|
%urem = shl i32 %11, 2, !dbg !10 |
|
%15 = and i32 %urem, 252, !dbg !10 |
|
; %18 = ctaid.x * 256 | %15 is the global element index; %19 is its i64 form.
%16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11 |
|
%17 = shl i32 %16, 8, !dbg !12 |
|
%18 = or i32 %17, %15, !dbg !13 |
|
%19 = sext i32 %18 to i64, !dbg !14 |
|
; --- Input %0: four f32 values (one 128-bit vector load) -> %26..%29 ----------
%20 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !14 |
|
%21 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15 |
|
%22 = extractvalue { i32, i32, i32, i32 } %21, 0, !dbg !15 |
|
%23 = extractvalue { i32, i32, i32, i32 } %21, 1, !dbg !15 |
|
%24 = extractvalue { i32, i32, i32, i32 } %21, 2, !dbg !15 |
|
%25 = extractvalue { i32, i32, i32, i32 } %21, 3, !dbg !15 |
|
%26 = bitcast i32 %22 to float, !dbg !15 |
|
%27 = bitcast i32 %23 to float, !dbg !15 |
|
%28 = bitcast i32 %24 to float, !dbg !15 |
|
%29 = bitcast i32 %25 to float, !dbg !15 |
|
; --- Input %1: four bf16 values -> %38..%41 (f32) -----------------------------
; Two 32-bit words are loaded; each is split into its low and high 16-bit
; halves, and every half is converted with cvt.f32.bf16.
%30 = getelementptr i16, ptr addrspace(1) %1, i64 %19, !dbg !16 |
|
%31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %30, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17 |
|
%32 = extractvalue { i32, i32 } %31, 0, !dbg !17 |
|
%33 = extractvalue { i32, i32 } %31, 1, !dbg !17 |
|
%34 = trunc i32 %32 to i16, !dbg !17 |
|
%extelt.offset = lshr i32 %32, 16, !dbg !17 |
|
%35 = trunc i32 %extelt.offset to i16, !dbg !17 |
|
%36 = trunc i32 %33 to i16, !dbg !17 |
|
%extelt.offset1 = lshr i32 %33, 16, !dbg !17 |
|
%37 = trunc i32 %extelt.offset1 to i16, !dbg !17 |
|
%38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #6, !dbg !18 |
|
%39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #6, !dbg !18 |
|
%40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #6, !dbg !18 |
|
%41 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %37) #6, !dbg !18 |
|
; --- Input %2: four bf16 values -> %50..%53 (f32), same scheme as %1 ----------
%42 = getelementptr i16, ptr addrspace(1) %2, i64 %19, !dbg !19 |
|
%43 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %42, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20 |
|
%44 = extractvalue { i32, i32 } %43, 0, !dbg !20 |
|
%45 = extractvalue { i32, i32 } %43, 1, !dbg !20 |
|
%46 = trunc i32 %44 to i16, !dbg !20 |
|
%extelt.offset2 = lshr i32 %44, 16, !dbg !20 |
|
%47 = trunc i32 %extelt.offset2 to i16, !dbg !20 |
|
%48 = trunc i32 %45 to i16, !dbg !20 |
|
%extelt.offset3 = lshr i32 %45, 16, !dbg !20 |
|
%49 = trunc i32 %extelt.offset3 to i16, !dbg !20 |
|
%50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #6, !dbg !21 |
|
%51 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #6, !dbg !21 |
|
%52 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %48) #6, !dbg !21 |
|
%53 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #6, !dbg !21 |
|
; --- Input %3: four bf16 values -> %62..%65 (f32), same scheme as %1 ----------
%54 = getelementptr i16, ptr addrspace(1) %3, i64 %19, !dbg !22 |
|
%55 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %54, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23 |
|
%56 = extractvalue { i32, i32 } %55, 0, !dbg !23 |
|
%57 = extractvalue { i32, i32 } %55, 1, !dbg !23 |
|
%58 = trunc i32 %56 to i16, !dbg !23 |
|
%extelt.offset4 = lshr i32 %56, 16, !dbg !23 |
|
%59 = trunc i32 %extelt.offset4 to i16, !dbg !23 |
|
%60 = trunc i32 %57 to i16, !dbg !23 |
|
%extelt.offset5 = lshr i32 %57, 16, !dbg !23 |
|
%61 = trunc i32 %extelt.offset5 to i16, !dbg !23 |
|
%62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %58) #6, !dbg !24 |
|
%63 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #6, !dbg !24 |
|
%64 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %60) #6, !dbg !24 |
|
%65 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %61) #6, !dbg !24 |
|
; --- Input %4: four bf16 values -> %74..%77 (f32), same scheme as %1 ----------
%66 = getelementptr i16, ptr addrspace(1) %4, i64 %19, !dbg !25 |
|
%67 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %66, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !26 |
|
%68 = extractvalue { i32, i32 } %67, 0, !dbg !26 |
|
%69 = extractvalue { i32, i32 } %67, 1, !dbg !26 |
|
%70 = trunc i32 %68 to i16, !dbg !26 |
|
%extelt.offset6 = lshr i32 %68, 16, !dbg !26 |
|
%71 = trunc i32 %extelt.offset6 to i16, !dbg !26 |
|
%72 = trunc i32 %69 to i16, !dbg !26 |
|
%extelt.offset7 = lshr i32 %69, 16, !dbg !26 |
|
%73 = trunc i32 %extelt.offset7 to i16, !dbg !26 |
|
%74 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #6, !dbg !27 |
|
%75 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %71) #6, !dbg !27 |
|
%76 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %72) #6, !dbg !27 |
|
%77 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #6, !dbg !27 |
|
; --- Input %5: four f32 values indexed by %15 only (no ctaid.x term) ----------
; Loaded with the L1::evict_last qualifier; the result %80 is unpacked only
; after the rsqrt, in block %__nv_rsqrtf.exit.
%78 = zext nneg i32 %15 to i64, !dbg !28 |
|
%79 = getelementptr float, ptr addrspace(1) %5, i64 %78, !dbg !28 |
|
%80 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %79, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29 |
|
; --- Element-wise sum of the five inputs --------------------------------------
; The additions are split between scalar and <2 x float> operations. Final
; per-element totals, each accumulated as (((%1 + %0) + %2) + %3) + %4:
;   element 0 -> lane 0 of %98
;   element 1 -> %89
;   element 2 -> %90
;   element 3 -> lane 1 of %104 (extracted as %106)
; Lane 0 of %104 holds element0 + element1 + element2, so %107 is this
; thread's sum over its four elements.
%81 = fadd float %38, %26, !dbg !30 |
|
%82 = fadd float %39, %27, !dbg !30 |
|
%83 = fadd float %40, %28, !dbg !30 |
|
%84 = fadd float %81, %50, !dbg !31 |
|
%85 = fadd float %82, %51, !dbg !31 |
|
%86 = fadd float %83, %52, !dbg !31 |
|
%87 = fadd float %85, %63, !dbg !32 |
|
%88 = fadd float %86, %64, !dbg !32 |
|
%89 = fadd float %87, %75, !dbg !33 |
|
%90 = fadd float %88, %76, !dbg !33 |
|
%91 = insertelement <2 x float> poison, float %84, i64 0, !dbg !32 |
|
%92 = insertelement <2 x float> %91, float %41, i64 1, !dbg !32 |
|
%93 = insertelement <2 x float> poison, float %62, i64 0, !dbg !32 |
|
%94 = insertelement <2 x float> %93, float %29, i64 1, !dbg !32 |
|
%95 = fadd <2 x float> %92, %94, !dbg !32 |
|
%96 = insertelement <2 x float> poison, float %74, i64 0, !dbg !33 |
|
%97 = insertelement <2 x float> %96, float %53, i64 1, !dbg !33 |
|
%98 = fadd <2 x float> %95, %97, !dbg !33 |
|
%99 = insertelement <2 x float> poison, float %89, i64 0, !dbg !34 |
|
%100 = insertelement <2 x float> %99, float %65, i64 1, !dbg !34 |
|
%101 = fadd <2 x float> %98, %100, !dbg !34 |
|
%102 = insertelement <2 x float> poison, float %90, i64 0, !dbg !34 |
|
%103 = insertelement <2 x float> %102, float %77, i64 1, !dbg !34 |
|
%104 = fadd <2 x float> %101, %103, !dbg !34 |
|
%105 = extractelement <2 x float> %104, i64 0, !dbg !34 |
|
%106 = extractelement <2 x float> %104, i64 1, !dbg !34 |
|
%107 = fadd float %105, %106, !dbg !34 |
|
; --- Reduction 1, stage 1: butterfly shuffles within the warp -----------------
; XOR offsets 16, 8, 4, 2, 1 with member mask -1; %127 is the warp-wide sum.
%108 = bitcast float %107 to i32, !dbg !40 |
|
%109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 16, i32 31), !dbg !40 |
|
%110 = bitcast i32 %109 to float, !dbg !40 |
|
%111 = fadd float %107, %110, !dbg !34 |
|
%112 = bitcast float %111 to i32, !dbg !40 |
|
%113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 8, i32 31), !dbg !40 |
|
%114 = bitcast i32 %113 to float, !dbg !40 |
|
%115 = fadd float %111, %114, !dbg !34 |
|
%116 = bitcast float %115 to i32, !dbg !40 |
|
%117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 4, i32 31), !dbg !40 |
|
%118 = bitcast i32 %117 to float, !dbg !40 |
|
%119 = fadd float %115, %118, !dbg !34 |
|
%120 = bitcast float %119 to i32, !dbg !40 |
|
%121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 2, i32 31), !dbg !40 |
|
%122 = bitcast i32 %121 to float, !dbg !40 |
|
%123 = fadd float %119, %122, !dbg !34 |
|
%124 = bitcast float %123 to i32, !dbg !40 |
|
%125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 1, i32 31), !dbg !40 |
|
%126 = bitcast i32 %125 to float, !dbg !40 |
|
%127 = fadd float %123, %126, !dbg !34 |
|
; --- Reduction 1, stage 2: combine the per-warp sums through @global_smem -----
; Lane 0 of each warp (%128) stores its warp sum to float slot [%14].
%128 = icmp eq i32 %12, 0, !dbg !40 |
|
%129 = zext nneg i32 %14 to i64, !dbg !40 |
|
%130 = getelementptr float, ptr addrspace(3) @global_smem, i64 %129, !dbg !40 |
|
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %127, i1 %128) #6, !dbg !40 |
|
tail call void @llvm.nvvm.barrier0(), !dbg !40 |
|
; The load from slot [tid] is predicated on tid < 2 (%131). One more butterfly
; step (offset 1) adds the two slots, and the store back to slot [tid] is
; predicated on tid < 2 and tid even (%141).
%131 = icmp slt i32 %11, 2, !dbg !40 |
|
%132 = sext i32 %11 to i64, !dbg !40 |
|
%133 = getelementptr float, ptr addrspace(3) @global_smem, i64 %132, !dbg !40 |
|
%134 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %133, i1 %131) #6, !dbg !40 |
|
%135 = bitcast float %134 to i32, !dbg !40 |
|
%136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 1, i32 31), !dbg !40 |
|
%137 = bitcast i32 %136 to float, !dbg !40 |
|
%138 = fadd float %134, %137, !dbg !34 |
|
%139 = and i32 %11, 1, !dbg !40 |
|
%140 = icmp eq i32 %139, 0, !dbg !40 |
|
%141 = and i1 %131, %140, !dbg !40 |
|
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %133, float %138, i1 %141) #6, !dbg !40 |
|
tail call void @llvm.nvvm.barrier0(), !dbg !40 |
|
; Every thread reads the block total from slot 0; %144 = (total + 0.0) / 256.
%142 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !40 |
|
%143 = fadd float %142, 0.000000e+00, !dbg !42 |
|
%144 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %143, float 2.560000e+02) #6, !dbg !46 |
|
; --- Deviations from the mean (%146..%149) and their squares (%150..%153) -----
; Operands are the element totals in order: element 0, 1, 2, 3.
%145 = extractelement <2 x float> %98, i64 0, !dbg !47 |
|
%146 = fsub float %145, %144, !dbg !47 |
|
%147 = fsub float %89, %144, !dbg !47 |
|
%148 = fsub float %90, %144, !dbg !47 |
|
%149 = fsub float %106, %144, !dbg !47 |
|
%150 = fmul float %146, %146, !dbg !48 |
|
%151 = fmul float %147, %147, !dbg !48 |
|
%152 = fmul float %148, %148, !dbg !48 |
|
%153 = fmul float %149, %149, !dbg !48 |
|
; Barrier ahead of the second reduction, which writes the same shared-memory
; slots again (presumably so that every thread has finished reading slot 0
; before it is overwritten).
tail call void @llvm.nvvm.barrier0(), !dbg !49 |
|
; --- Reduction 2: sum of squared deviations, same two-stage scheme ------------
%154 = fadd float %150, %151, !dbg !51 |
|
%155 = fadd float %152, %154, !dbg !51 |
|
%156 = fadd float %153, %155, !dbg !51 |
|
%157 = bitcast float %156 to i32, !dbg !49 |
|
%158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 16, i32 31), !dbg !49 |
|
%159 = bitcast i32 %158 to float, !dbg !49 |
|
%160 = fadd float %156, %159, !dbg !51 |
|
%161 = bitcast float %160 to i32, !dbg !49 |
|
%162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 8, i32 31), !dbg !49 |
|
%163 = bitcast i32 %162 to float, !dbg !49 |
|
%164 = fadd float %160, %163, !dbg !51 |
|
%165 = bitcast float %164 to i32, !dbg !49 |
|
%166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 4, i32 31), !dbg !49 |
|
%167 = bitcast i32 %166 to float, !dbg !49 |
|
%168 = fadd float %164, %167, !dbg !51 |
|
%169 = bitcast float %168 to i32, !dbg !49 |
|
%170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 2, i32 31), !dbg !49 |
|
%171 = bitcast i32 %170 to float, !dbg !49 |
|
%172 = fadd float %168, %171, !dbg !51 |
|
%173 = bitcast float %172 to i32, !dbg !49 |
|
%174 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %173, i32 1, i32 31), !dbg !49 |
|
%175 = bitcast i32 %174 to float, !dbg !49 |
|
%176 = fadd float %172, %175, !dbg !51 |
|
; Cross-warp stage; reuses pointers %130 / %133 and predicates %128 / %131 /
; %141 computed for the first reduction.
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %176, i1 %128) #6, !dbg !49 |
|
tail call void @llvm.nvvm.barrier0(), !dbg !49 |
|
%177 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %133, i1 %131) #6, !dbg !49 |
|
%178 = bitcast float %177 to i32, !dbg !49 |
|
%179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 1, i32 31), !dbg !49 |
|
%180 = bitcast i32 %179 to float, !dbg !49 |
|
%181 = fadd float %177, %180, !dbg !51 |
|
tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %133, float %181, i1 %141) #6, !dbg !49 |
|
tail call void @llvm.nvvm.barrier0(), !dbg !49 |
|
; %184 = (sum of squared deviations + 0.0) / 256;
; %185 = %184 + 0x3EE4F8B580000000 (about 1e-5).
%182 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !49 |
|
%183 = fadd float %182, 0.000000e+00, !dbg !54 |
|
%184 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %183, float 2.560000e+02) #6, !dbg !56 |
|
%185 = fadd float %184, 0x3EE4F8B580000000, !dbg !57 |
|
; --- Reciprocal square root of %185 -------------------------------------------
; Same control flow as @__nv_rsqrtf defined later in this module:
; __nvvm_reflect("__CUDA_FTZ") == 0 takes the non-.ftz intrinsic (block 189),
; any other value takes the .ftz intrinsic (block 187).
%186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 |
|
%.not.i = icmp eq i32 %186, 0, !dbg !58 |
|
br i1 %.not.i, label %189, label %187, !dbg !58 |
|
|
|
187: |
|
%188 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %185), !dbg !58 |
|
br label %__nv_rsqrtf.exit, !dbg !58 |
|
|
|
189: |
|
%190 = tail call float @llvm.nvvm.rsqrt.approx.f(float %185), !dbg !58 |
|
br label %__nv_rsqrtf.exit, !dbg !58 |
|
|
|
__nv_rsqrtf.exit: |
|
; %.0.i is the reciprocal square root chosen by the branch above.
%.0.i = phi float [ %188, %187 ], [ %190, %189 ], !dbg !58 |
|
; Unpack the four f32 values loaded from %5 (extracted in order 3, 2, 1, 0):
; %198 = element 0, %196 = element 1, %194 = element 2, %192 = element 3.
%191 = extractvalue { i32, i32, i32, i32 } %80, 3, !dbg !29 |
|
%192 = bitcast i32 %191 to float, !dbg !29 |
|
%193 = extractvalue { i32, i32, i32, i32 } %80, 2, !dbg !29 |
|
%194 = bitcast i32 %193 to float, !dbg !29 |
|
%195 = extractvalue { i32, i32, i32, i32 } %80, 1, !dbg !29 |
|
%196 = bitcast i32 %195 to float, !dbg !29 |
|
%197 = extractvalue { i32, i32, i32, i32 } %80, 0, !dbg !29 |
|
%198 = bitcast i32 %197 to float, !dbg !29 |
|
; Scale each deviation by the rsqrt, then by the matching value from %5.
%199 = fmul float %146, %.0.i, !dbg !59 |
|
%200 = fmul float %147, %.0.i, !dbg !59 |
|
%201 = fmul float %148, %.0.i, !dbg !59 |
|
%202 = fmul float %149, %.0.i, !dbg !59 |
|
%203 = fmul float %199, %198, !dbg !60 |
|
%204 = fmul float %200, %196, !dbg !60 |
|
%205 = fmul float %201, %194, !dbg !60 |
|
%206 = fmul float %202, %192, !dbg !60 |
|
; --- Output %6: the four element-wise sums, stored as f32 ---------------------
%207 = getelementptr float, ptr addrspace(1) %6, i64 %19, !dbg !61 |
|
%208 = bitcast float %145 to i32, !dbg !62 |
|
%209 = bitcast float %89 to i32, !dbg !62 |
|
%210 = bitcast float %90 to i32, !dbg !62 |
|
%211 = bitcast float %106 to i32, !dbg !62 |
|
tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %208, i32 %209, i32 %210, i32 %211, ptr addrspace(1) %207, i1 true) #6, !dbg !62 |
|
; --- Output %7: scaled results converted to bf16 ------------------------------
; cvt.rn.bf16.f32 converts each value; pairs are packed into 32-bit words
; (elements 0,1 -> %219; elements 2,3 -> %222) and stored with one v2.b32 store.
%212 = getelementptr i16, ptr addrspace(1) %7, i64 %19, !dbg !63 |
|
%213 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %203) #6, !dbg !64 |
|
%214 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %204) #6, !dbg !64 |
|
%215 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %205) #6, !dbg !64 |
|
%216 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %206) #6, !dbg !64 |
|
%217 = insertelement <2 x i16> undef, i16 %213, i64 0, !dbg !64 |
|
%218 = insertelement <2 x i16> %217, i16 %214, i64 1, !dbg !64 |
|
%219 = bitcast <2 x i16> %218 to i32, !dbg !64 |
|
%220 = insertelement <2 x i16> undef, i16 %215, i64 0, !dbg !64 |
|
%221 = insertelement <2 x i16> %220, i16 %216, i64 1, !dbg !64 |
|
%222 = bitcast <2 x i16> %221 to i32, !dbg !64 |
|
tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %219, i32 %222, ptr addrspace(1) %212, i1 true) #6, !dbg !64 |
|
ret void, !dbg !65 |
|
} |
|
|
|
|
|
; Reads the PTX special register tid.x; called once at kernel entry.
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 |
|
|
|
|
|
; Butterfly shuffle; the kernel calls it as
; (member mask -1, 32-bit value, XOR lane offset, 31).
declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 |
|
|
|
|
|
; Barrier used by the kernel between the shared-memory stores and loads of its
; block reductions.
declare void @llvm.nvvm.barrier0() #2 |
|
|
|
|
|
; float __nv_rsqrtf(float x)
;
; Approximate reciprocal square root of %x. The variant is selected by
; @__nvvm_reflect("__CUDA_FTZ"): a non-zero result uses the .ftz intrinsic,
; zero uses the non-.ftz intrinsic. Marked alwaysinline via attribute group #3.
define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { |
|
entry: |
|
%ftz.mode = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 |
|
%ftz.enabled = icmp ne i32 %ftz.mode, 0 |
|
br i1 %ftz.enabled, label %approx.ftz, label %approx.noftz |
|
|
|
approx.ftz: |
|
%rsqrt.ftz = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) |
|
br label %done |
|
|
|
approx.noftz: |
|
%rsqrt.noftz = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) |
|
br label %done |
|
|
|
done: |
|
%result = phi float [ %rsqrt.ftz, %approx.ftz ], [ %rsqrt.noftz, %approx.noftz ] |
|
ret float %result |
|
} |
|
|
|
; Queried with the string "__CUDA_FTZ" (@.str) by both functions in this
; module; the i32 result is compared against 0 to pick the rsqrt variant.
declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 |
|
|
|
|
|
; rsqrt intrinsic taken when __nvvm_reflect("__CUDA_FTZ") returns non-zero.
declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 |
|
|
|
|
|
; rsqrt intrinsic taken when __nvvm_reflect("__CUDA_FTZ") returns zero.
declare float @llvm.nvvm.rsqrt.approx.f(float) #5 |
|
|
|
; #0: attached to the declaration of @llvm.nvvm.read.ptx.sreg.tid.x.
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } |
|
; #1: attached to the declaration of @llvm.nvvm.shfl.sync.bfly.i32.
attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } |
|
; #2: attached to the declaration of @llvm.nvvm.barrier0.
attributes #2 = { convergent nocallback nounwind } |
|
; #3: attached to the definition of @__nv_rsqrtf (note alwaysinline).
attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } |
|
; #4: attached to the declaration of @__nvvm_reflect.
attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } |
|
; #5: attached to the declarations of both @llvm.nvvm.rsqrt.approx.* intrinsics.
attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } |
|
; #6: call-site attribute on the inline-asm calls and @__nvvm_reflect calls.
attributes #6 = { nounwind } |
|
|
|
!llvm.module.flags = !{!0, !1} |
|
!llvm.dbg.cu = !{!2} |
|
!nvvm.annotations = !{!4, !5, !5, !4} |
|
!llvm.ident = !{!6} |
|
|
|
!0 = !{i32 2, !"Debug Info Version", i32 3} |
|
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} |
|
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) |
|
!3 = !DIFile(filename: "cjbnqg5u4sj7a4xstjer3a6tdgnnigb2iymd27gcs6o7oduhxy2v.py", directory: "/tmp/torchinductor_root/jb") |
|
!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"kernel", i32 1} |
|
!5 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"maxntidx", i32 64} |
|
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} |
|
!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8de9de", linkageName: "triton__0d1d2d3d4d5d6d7d8de9de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) |
|
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) |
|
!9 = !{} |
|
!10 = !DILocation(line: 26, column: 26, scope: !7) |
|
!11 = !DILocation(line: 23, column: 28, scope: !7) |
|
!12 = !DILocation(line: 30, column: 40, scope: !7) |
|
!13 = !DILocation(line: 30, column: 36, scope: !7) |
|
!14 = !DILocation(line: 30, column: 30, scope: !7) |
|
!15 = !DILocation(line: 30, column: 46, scope: !7) |
|
!16 = !DILocation(line: 31, column: 30, scope: !7) |
|
!17 = !DILocation(line: 31, column: 46, scope: !7) |
|
!18 = !DILocation(line: 31, column: 67, scope: !7) |
|
!19 = !DILocation(line: 32, column: 30, scope: !7) |
|
!20 = !DILocation(line: 32, column: 46, scope: !7) |
|
!21 = !DILocation(line: 32, column: 67, scope: !7) |
|
!22 = !DILocation(line: 33, column: 30, scope: !7) |
|
!23 = !DILocation(line: 33, column: 46, scope: !7) |
|
!24 = !DILocation(line: 33, column: 67, scope: !7) |
|
!25 = !DILocation(line: 34, column: 31, scope: !7) |
|
!26 = !DILocation(line: 34, column: 47, scope: !7) |
|
!27 = !DILocation(line: 34, column: 68, scope: !7) |
|
!28 = !DILocation(line: 35, column: 31, scope: !7) |
|
!29 = !DILocation(line: 35, column: 36, scope: !7) |
|
!30 = !DILocation(line: 37, column: 18, scope: !7) |
|
!31 = !DILocation(line: 39, column: 18, scope: !7) |
|
!32 = !DILocation(line: 41, column: 18, scope: !7) |
|
!33 = !DILocation(line: 43, column: 19, scope: !7) |
|
!34 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !38) |
|
!35 = distinct !DILexicalBlockFile(scope: !37, file: !36, discriminator: 0) |
|
!36 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") |
|
!37 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0) |
|
!38 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !39) |
|
!39 = !DILocation(line: 48, column: 59, scope: !35) |
|
!40 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !41) |
|
!41 = !DILocation(line: 48, column: 59, scope: !37) |
|
!42 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !45) |
|
!43 = distinct !DILexicalBlockFile(scope: !7, file: !44, discriminator: 0) |
|
!44 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") |
|
!45 = !DILocation(line: 48, column: 45, scope: !43) |
|
!46 = !DILocation(line: 51, column: 20, scope: !7) |
|
!47 = !DILocation(line: 52, column: 20, scope: !7) |
|
!48 = !DILocation(line: 53, column: 20, scope: !7) |
|
!49 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !50) |
|
!50 = !DILocation(line: 56, column: 59, scope: !37) |
|
!51 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !52) |
|
!52 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !53) |
|
!53 = !DILocation(line: 56, column: 59, scope: !35) |
|
!54 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !55) |
|
!55 = !DILocation(line: 56, column: 45, scope: !43) |
|
!56 = !DILocation(line: 59, column: 20, scope: !7) |
|
!57 = !DILocation(line: 61, column: 20, scope: !7) |
|
!58 = !DILocation(line: 62, column: 26, scope: !7) |
|
!59 = !DILocation(line: 63, column: 20, scope: !7) |
|
!60 = !DILocation(line: 64, column: 20, scope: !7) |
|
!61 = !DILocation(line: 66, column: 25, scope: !7) |
|
!62 = !DILocation(line: 66, column: 48, scope: !7) |
|
!63 = !DILocation(line: 67, column: 25, scope: !7) |
|
!64 = !DILocation(line: 67, column: 48, scope: !7) |
|
!65 = !DILocation(line: 67, column: 4, scope: !7) |
|
|