diff --git a/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.cubin b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..2f629b82a77126198090286b0d26a95bce508a5b Binary files /dev/null and b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.cubin differ diff --git a/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.llir b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..0cc3bd80940c2c0b96c72537ae39acb77904069e --- /dev/null +++ b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.llir @@ -0,0 +1,366 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +define void @triton__0d1d2d3d4d5d6d7d8de9de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, i32 %8, i32 %9) local_unnamed_addr !dbg !7 { + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %12 = and i32 %11, 31, !dbg !10 + %13 = lshr i32 %11, 5, !dbg !10 + %14 = and i32 %13, 1, !dbg !10 + %urem = shl i32 %11, 2, !dbg !10 + %15 = and i32 %urem, 252, !dbg !10 + %16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11 + %17 = shl i32 %16, 8, !dbg !12 + %18 = or i32 %17, %15, !dbg !13 + %19 = sext i32 %18 to i64, !dbg !14 + %20 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !14 + %21 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15 + %22 = extractvalue { i32, i32, i32, i32 } %21, 0, !dbg !15 + %23 = extractvalue { i32, i32, i32, i32 } %21, 1, !dbg !15 + %24 = extractvalue { i32, i32, i32, i32 } %21, 2, !dbg !15 + %25 = extractvalue { i32, i32, i32, i32 } %21, 3, !dbg !15 + %26 = bitcast i32 %22 to float, !dbg !15 + %27 = bitcast i32 %23 to float, !dbg !15 + %28 = bitcast i32 %24 to float, !dbg !15 + %29 = bitcast i32 %25 to float, !dbg !15 + %30 = getelementptr i16, ptr addrspace(1) %1, i64 %19, !dbg !16 + %31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %30, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17 + %32 = extractvalue { i32, i32 } %31, 0, !dbg !17 + %33 = extractvalue { i32, i32 } %31, 1, !dbg !17 + %34 = trunc i32 %32 to i16, !dbg !17 + %extelt.offset = lshr i32 %32, 16, !dbg !17 + %35 = trunc i32 %extelt.offset to i16, !dbg !17 + %36 = trunc i32 %33 to i16, !dbg !17 + %extelt.offset1 = lshr i32 %33, 16, !dbg !17 + %37 = trunc i32 %extelt.offset1 to i16, !dbg !17 + %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #6, !dbg !18 + %39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #6, !dbg !18 + %40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #6, !dbg !18 + %41 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %37) #6, !dbg !18 + %42 = getelementptr i16, ptr addrspace(1) %2, i64 %19, !dbg !19 + %43 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %42, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20 + %44 = extractvalue { i32, i32 } %43, 0, !dbg !20 + %45 = extractvalue { i32, i32 } %43, 1, !dbg !20 + %46 = trunc i32 %44 to i16, !dbg !20 + %extelt.offset2 = lshr i32 %44, 16, !dbg !20 + %47 = trunc i32 %extelt.offset2 to i16, !dbg !20 + %48 = trunc i32 %45 to i16, !dbg !20 + %extelt.offset3 = lshr i32 %45, 16, !dbg !20 + %49 = trunc i32 %extelt.offset3 to i16, !dbg !20 + %50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #6, !dbg !21 + %51 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #6, !dbg !21 + %52 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %48) #6, !dbg !21 + %53 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #6, !dbg !21 + %54 = getelementptr i16, ptr addrspace(1) %3, i64 %19, !dbg !22 + %55 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %54, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23 + %56 = extractvalue { i32, i32 } %55, 0, !dbg !23 + %57 = extractvalue { i32, i32 } %55, 1, !dbg !23 + %58 = trunc i32 %56 to i16, !dbg !23 + %extelt.offset4 = lshr i32 %56, 16, !dbg !23 + %59 = trunc i32 %extelt.offset4 to i16, !dbg !23 + %60 = trunc i32 %57 to i16, !dbg !23 + %extelt.offset5 = lshr i32 %57, 16, !dbg !23 + %61 = trunc i32 %extelt.offset5 to i16, !dbg !23 + %62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %58) #6, !dbg !24 + %63 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #6, !dbg !24 + %64 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %60) #6, !dbg !24 + %65 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %61) #6, !dbg !24 + %66 = getelementptr i16, ptr addrspace(1) %4, i64 %19, !dbg !25 + %67 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %66, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !26 + %68 = extractvalue { i32, i32 } %67, 0, !dbg !26 + %69 = extractvalue { i32, i32 } %67, 1, !dbg !26 + %70 = trunc i32 %68 to i16, !dbg !26 + %extelt.offset6 = lshr i32 %68, 16, !dbg !26 + %71 = trunc i32 %extelt.offset6 to i16, !dbg !26 + %72 = trunc i32 %69 to i16, !dbg !26 + %extelt.offset7 = lshr i32 %69, 16, !dbg !26 + %73 = trunc i32 %extelt.offset7 to i16, !dbg !26 + %74 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #6, !dbg !27 + %75 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %71) #6, !dbg !27 + %76 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %72) #6, !dbg !27 + %77 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #6, !dbg !27 + %78 = zext nneg i32 %15 to i64, !dbg !28 + %79 = getelementptr float, ptr addrspace(1) %5, i64 %78, !dbg !28 + %80 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %79, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29 + %81 = fadd float %38, %26, !dbg !30 + %82 = fadd float %39, %27, !dbg !30 + %83 = fadd float %40, %28, !dbg !30 + %84 = fadd float %81, %50, !dbg !31 + %85 = fadd float %82, %51, !dbg !31 + %86 = fadd float %83, %52, !dbg !31 + %87 = fadd float %85, %63, !dbg !32 + %88 = fadd float %86, %64, !dbg !32 + %89 = fadd float %87, %75, !dbg !33 + %90 = fadd float %88, %76, !dbg !33 + %91 = insertelement <2 x float> poison, float %84, i64 0, !dbg !32 + %92 = insertelement <2 x float> %91, float %41, i64 1, !dbg !32 + %93 = insertelement <2 x float> poison, float %62, i64 0, !dbg !32 + %94 = insertelement <2 x float> %93, float %29, i64 1, !dbg !32 + %95 = fadd <2 x float> %92, %94, !dbg !32 + %96 = insertelement <2 x float> poison, float %74, i64 0, !dbg !33 + %97 = insertelement <2 x float> %96, float %53, i64 1, !dbg !33 + %98 = fadd <2 x float> %95, %97, !dbg !33 + %99 = insertelement <2 x float> poison, float %89, i64 0, !dbg !34 + %100 = insertelement <2 x float> %99, float %65, i64 1, !dbg !34 + %101 = fadd <2 x float> %98, %100, !dbg !34 + %102 = insertelement <2 x float> poison, float %90, i64 0, !dbg !34 + %103 = insertelement <2 x float> %102, float %77, i64 1, !dbg !34 + %104 = fadd <2 x float> %101, %103, !dbg !34 + %105 = extractelement <2 x float> %104, i64 0, !dbg !34 + %106 = extractelement <2 x float> %104, i64 1, !dbg !34 + %107 = fadd float %105, %106, !dbg !34 + %108 = bitcast float %107 to i32, !dbg !40 + %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 16, i32 31), !dbg !40 + %110 = bitcast i32 %109 to float, !dbg !40 + %111 = fadd float %107, %110, !dbg !34 + %112 = bitcast float %111 to i32, !dbg !40 + %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 8, i32 31), !dbg !40 + %114 = bitcast i32 %113 to float, !dbg !40 + %115 = fadd float %111, %114, !dbg !34 + %116 = bitcast float %115 to i32, !dbg !40 + %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 4, i32 31), !dbg !40 + %118 = bitcast i32 %117 to float, !dbg !40 + %119 = fadd float %115, %118, !dbg !34 + %120 = bitcast float %119 to i32, !dbg !40 + %121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 2, i32 31), !dbg !40 + %122 = bitcast i32 %121 to float, !dbg !40 + %123 = fadd float %119, %122, !dbg !34 + %124 = bitcast float %123 to i32, !dbg !40 + %125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 1, i32 31), !dbg !40 + %126 = bitcast i32 %125 to float, !dbg !40 + %127 = fadd float %123, %126, !dbg !34 + %128 = icmp eq i32 %12, 0, !dbg !40 + %129 = zext nneg i32 %14 to i64, !dbg !40 + %130 = getelementptr float, ptr addrspace(3) @global_smem, i64 %129, !dbg !40 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %127, i1 %128) #6, !dbg !40 + tail call void @llvm.nvvm.barrier0(), !dbg !40 + %131 = icmp slt i32 %11, 2, !dbg !40 + %132 = sext i32 %11 to i64, !dbg !40 + %133 = getelementptr float, ptr addrspace(3) @global_smem, i64 %132, !dbg !40 + %134 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %133, i1 %131) #6, !dbg !40 + %135 = bitcast float %134 to i32, !dbg !40 + %136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 1, i32 31), !dbg !40 + %137 = bitcast i32 %136 to float, !dbg !40 + %138 = fadd float %134, %137, !dbg !34 + %139 = and i32 %11, 1, !dbg !40 + %140 = icmp eq i32 %139, 0, !dbg !40 + %141 = and i1 %131, %140, !dbg !40 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %133, float %138, i1 %141) #6, !dbg !40 + tail call void @llvm.nvvm.barrier0(), !dbg !40 + %142 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !40 + %143 = fadd float %142, 0.000000e+00, !dbg !42 + %144 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %143, float 2.560000e+02) #6, !dbg !46 + %145 = extractelement <2 x float> %98, i64 0, !dbg !47 + %146 = fsub float %145, %144, !dbg !47 + %147 = fsub float %89, %144, !dbg !47 + %148 = fsub float %90, %144, !dbg !47 + %149 = fsub float %106, %144, !dbg !47 + %150 = fmul float %146, %146, !dbg !48 + %151 = fmul float %147, %147, !dbg !48 + %152 = fmul float %148, %148, !dbg !48 + %153 = fmul float %149, %149, !dbg !48 + tail call void @llvm.nvvm.barrier0(), !dbg !49 + %154 = fadd float %150, %151, !dbg !51 + %155 = fadd float %152, %154, !dbg !51 + %156 = fadd float %153, %155, !dbg !51 + %157 = bitcast float %156 to i32, !dbg !49 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 16, i32 31), !dbg !49 + %159 = bitcast i32 %158 to float, !dbg !49 + %160 = fadd float %156, %159, !dbg !51 + %161 = bitcast float %160 to i32, !dbg !49 + %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 8, i32 31), !dbg !49 + %163 = bitcast i32 %162 to float, !dbg !49 + %164 = fadd float %160, %163, !dbg !51 + %165 = bitcast float %164 to i32, !dbg !49 + %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 4, i32 31), !dbg !49 + %167 = bitcast i32 %166 to float, !dbg !49 + %168 = fadd float %164, %167, !dbg !51 + %169 = bitcast float %168 to i32, !dbg !49 + %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 2, i32 31), !dbg !49 + %171 = bitcast i32 %170 to float, !dbg !49 + %172 = fadd float %168, %171, !dbg !51 + %173 = bitcast float %172 to i32, !dbg !49 + %174 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %173, i32 1, i32 31), !dbg !49 + %175 = bitcast i32 %174 to float, !dbg !49 + %176 = fadd float %172, %175, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %176, i1 %128) #6, !dbg !49 + tail call void @llvm.nvvm.barrier0(), !dbg !49 + %177 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %133, i1 %131) #6, !dbg !49 + %178 = bitcast float %177 to i32, !dbg !49 + %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 1, i32 31), !dbg !49 + %180 = bitcast i32 %179 to float, !dbg !49 + %181 = fadd float %177, %180, !dbg !51 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %133, float %181, i1 %141) #6, !dbg !49 + tail call void @llvm.nvvm.barrier0(), !dbg !49 + %182 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !49 + %183 = fadd float %182, 0.000000e+00, !dbg !54 + %184 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %183, float 2.560000e+02) #6, !dbg !56 + %185 = fadd float %184, 0x3EE4F8B580000000, !dbg !57 + %186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 + %.not.i = icmp eq i32 %186, 0, !dbg !58 + br i1 %.not.i, label %189, label %187, !dbg !58 + +187: ; preds = %10 + %188 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %185), !dbg !58 + br label %__nv_rsqrtf.exit, !dbg !58 + +189: ; preds = %10 + %190 = tail call float @llvm.nvvm.rsqrt.approx.f(float %185), !dbg !58 + br label %__nv_rsqrtf.exit, !dbg !58 + +__nv_rsqrtf.exit: ; preds = %187, %189 + %.0.i = phi float [ %188, %187 ], [ %190, %189 ], !dbg !58 + %191 = extractvalue { i32, i32, i32, i32 } %80, 3, !dbg !29 + %192 = bitcast i32 %191 to float, !dbg !29 + %193 = extractvalue { i32, i32, i32, i32 } %80, 2, !dbg !29 + %194 = bitcast i32 %193 to float, !dbg !29 + %195 = extractvalue { i32, i32, i32, i32 } %80, 1, !dbg !29 + %196 = bitcast i32 %195 to float, !dbg !29 + %197 = extractvalue { i32, i32, i32, i32 } %80, 0, !dbg !29 + %198 = bitcast i32 %197 to float, !dbg !29 + %199 = fmul float %146, %.0.i, !dbg !59 + %200 = fmul float %147, %.0.i, !dbg !59 + %201 = fmul float %148, %.0.i, !dbg !59 + %202 = fmul float %149, %.0.i, !dbg !59 + %203 = fmul float %199, %198, !dbg !60 + %204 = fmul float %200, %196, !dbg !60 + %205 = fmul float %201, %194, !dbg !60 + %206 = fmul float %202, %192, !dbg !60 + %207 = getelementptr float, ptr addrspace(1) %6, i64 %19, !dbg !61 + %208 = bitcast float %145 to i32, !dbg !62 + %209 = bitcast float %89 to i32, !dbg !62 + %210 = bitcast float %90 to i32, !dbg !62 + %211 = bitcast float %106 to i32, !dbg !62 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %208, i32 %209, i32 %210, i32 %211, ptr addrspace(1) %207, i1 true) #6, !dbg !62 + %212 = getelementptr i16, ptr addrspace(1) %7, i64 %19, !dbg !63 + %213 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %203) #6, !dbg !64 + %214 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %204) #6, !dbg !64 + %215 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %205) #6, !dbg !64 + %216 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %206) #6, !dbg !64 + %217 = insertelement <2 x i16> undef, i16 %213, i64 0, !dbg !64 + %218 = insertelement <2 x i16> %217, i16 %214, i64 1, !dbg !64 + %219 = bitcast <2 x i16> %218 to i32, !dbg !64 + %220 = insertelement <2 x i16> undef, i16 %215, i64 0, !dbg !64 + %221 = insertelement <2 x i16> %220, i16 %216, i64 1, !dbg !64 + %222 = bitcast <2 x i16> %221 to i32, !dbg !64 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %219, i32 %222, ptr addrspace(1) %212, i1 true) #6, !dbg !64 + ret void, !dbg !65 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "cjbnqg5u4sj7a4xstjer3a6tdgnnigb2iymd27gcs6o7oduhxy2v.py", directory: "/tmp/torchinductor_root/jb") +!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"maxntidx", i32 64} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8de9de", linkageName: "triton__0d1d2d3d4d5d6d7d8de9de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 26, column: 26, scope: !7) +!11 = !DILocation(line: 23, column: 28, scope: !7) +!12 = !DILocation(line: 30, column: 40, scope: !7) +!13 = !DILocation(line: 30, column: 36, scope: !7) +!14 = !DILocation(line: 30, column: 30, scope: !7) +!15 = !DILocation(line: 30, column: 46, scope: !7) +!16 = !DILocation(line: 31, column: 30, scope: !7) +!17 = !DILocation(line: 31, column: 46, scope: !7) +!18 = !DILocation(line: 31, column: 67, scope: !7) +!19 = !DILocation(line: 32, column: 30, scope: !7) +!20 = !DILocation(line: 32, column: 46, scope: !7) +!21 = !DILocation(line: 32, column: 67, scope: !7) +!22 = !DILocation(line: 33, column: 30, scope: !7) +!23 = !DILocation(line: 33, column: 46, scope: !7) +!24 = !DILocation(line: 33, column: 67, scope: !7) +!25 = !DILocation(line: 34, column: 31, scope: !7) +!26 = !DILocation(line: 34, column: 47, scope: !7) +!27 = !DILocation(line: 34, column: 68, scope: !7) +!28 = !DILocation(line: 35, column: 31, scope: !7) +!29 = !DILocation(line: 35, column: 36, scope: !7) +!30 = !DILocation(line: 37, column: 18, scope: !7) +!31 = !DILocation(line: 39, column: 18, scope: !7) +!32 = !DILocation(line: 41, column: 18, scope: !7) +!33 = !DILocation(line: 43, column: 19, scope: !7) +!34 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !38) +!35 = distinct !DILexicalBlockFile(scope: !37, file: !36, discriminator: 0) +!36 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!37 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0) +!38 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !39) +!39 = !DILocation(line: 48, column: 59, scope: !35) +!40 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !41) +!41 = !DILocation(line: 48, column: 59, scope: !37) +!42 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !45) +!43 = distinct !DILexicalBlockFile(scope: !7, file: !44, discriminator: 0) +!44 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!45 = !DILocation(line: 48, column: 45, scope: !43) +!46 = !DILocation(line: 51, column: 20, scope: !7) +!47 = !DILocation(line: 52, column: 20, scope: !7) +!48 = !DILocation(line: 53, column: 20, scope: !7) +!49 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !50) +!50 = !DILocation(line: 56, column: 59, scope: !37) +!51 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !52) +!52 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !53) +!53 = !DILocation(line: 56, column: 59, scope: !35) +!54 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !55) +!55 = !DILocation(line: 56, column: 45, scope: !43) +!56 = !DILocation(line: 59, column: 20, scope: !7) +!57 = !DILocation(line: 61, column: 20, scope: !7) +!58 = !DILocation(line: 62, column: 26, scope: !7) +!59 = !DILocation(line: 63, column: 20, scope: !7) +!60 = !DILocation(line: 64, column: 20, scope: !7) +!61 = !DILocation(line: 66, column: 25, scope: !7) +!62 = !DILocation(line: 66, column: 48, scope: !7) +!63 = !DILocation(line: 67, column: 25, scope: !7) +!64 = !DILocation(line: 67, column: 48, scope: !7) +!65 = !DILocation(line: 67, column: 4, scope: !7) diff --git a/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttgir b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..5d8f556a26051cd68c5a9cbc11f33a2d1ce6eeb5 --- /dev/null +++ b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttgir @@ -0,0 +1,76 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant 9.99999974E-6 : f32 + %cst_1 = arith.constant 2.560000e+02 : f32 + %cst_2 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked> + %5 = arith.addi %1, %4 : tensor<256xi32, #blocked> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %17 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %19 = tt.load %18, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %20 = arith.extf %19 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %21 = tt.splat %arg4 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %22 = tt.addptr %21, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %23 = tt.load %22, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %24 = arith.extf %23 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %25 = tt.splat %arg5 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %26 = tt.addptr %25, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %28 = arith.addf %8, %12 : tensor<256xf32, #blocked> + %29 = arith.addf %28, %16 : tensor<256xf32, #blocked> + %30 = arith.addf %29, %20 : tensor<256xf32, #blocked> + %31 = arith.addf %30, %24 : tensor<256xf32, #blocked> + %32 = arith.select %2, %31, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %53 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %53 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %34 = arith.addf %33, %cst_2 : f32 + %35 = arith.divf %34, %cst_1 : f32 + %36 = tt.splat %35 : (f32) -> tensor<256xf32, #blocked> + %37 = arith.subf %31, %36 : tensor<256xf32, #blocked> + %38 = arith.mulf %37, %37 : tensor<256xf32, #blocked> + %39 = arith.select %2, %38, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %40 = "tt.reduce"(%39) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %53 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %53 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %41 = arith.addf %40, %cst_2 : f32 + %42 = arith.divf %41, %cst_1 : f32 + %43 = arith.addf %42, %cst_0 : f32 + %44 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %45 = tt.splat %44 : (f32) -> tensor<256xf32, #blocked> + %46 = arith.mulf %37, %45 : tensor<256xf32, #blocked> + %47 = arith.mulf %46, %27 : tensor<256xf32, #blocked> + %48 = tt.splat %arg6 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %49 = tt.addptr %48, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %49, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %50 = tt.splat %arg7 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %51 = tt.addptr %50, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %52 = arith.truncf %47 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked> + tt.store %51, %52, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttir b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..0a13f05ba11e7af3195bafdd7703f65963e5d35d --- /dev/null +++ b/.triton/dump/0bb244fe116b8bad2e3be6ce32964a26/triton_.ttir @@ -0,0 +1,75 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 2.560000e+02 : f32 + %cst_2 = arith.constant 9.99999974E-6 : f32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> + %cst_4 = arith.constant dense<256> : tensor<256xi32> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32> + %5 = arith.addi %1, %4 : tensor<256xi32> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32> + %17 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr> + %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %19 = tt.load %18, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %20 = arith.extf %19 : tensor<256xbf16> to tensor<256xf32> + %21 = tt.splat %arg4 : (!tt.ptr) -> tensor<256x!tt.ptr> + %22 = tt.addptr %21, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %23 = tt.load %22, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %24 = arith.extf %23 : tensor<256xbf16> to tensor<256xf32> + %25 = tt.splat %arg5 : (!tt.ptr) -> tensor<256x!tt.ptr> + %26 = tt.addptr %25, %1 : tensor<256x!tt.ptr>, tensor<256xi32> + %27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> + %28 = arith.addf %8, %12 : tensor<256xf32> + %29 = arith.addf %28, %16 : tensor<256xf32> + %30 = arith.addf %29, %20 : tensor<256xf32> + %31 = arith.addf %30, %24 : tensor<256xf32> + %32 = arith.select %2, %31, %cst_3 : tensor<256xi1>, tensor<256xf32> + %33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %53 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %53 : f32 + }) : (tensor<256xf32>) -> f32 + %34 = arith.addf %33, %cst_0 : f32 + %35 = arith.divf %34, %cst_1 : f32 + %36 = tt.splat %35 : (f32) -> tensor<256xf32> + %37 = arith.subf %31, %36 : tensor<256xf32> + %38 = arith.mulf %37, %37 : tensor<256xf32> + %39 = arith.select %2, %38, %cst_3 : tensor<256xi1>, tensor<256xf32> + %40 = "tt.reduce"(%39) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %53 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %53 : f32 + }) : (tensor<256xf32>) -> f32 + %41 = arith.addf %40, %cst_0 : f32 + %42 = arith.divf %41, %cst_1 : f32 + %43 = arith.addf %42, %cst_2 : f32 + %44 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %45 = tt.splat %44 : (f32) -> tensor<256xf32> + %46 = arith.mulf %37, %45 : tensor<256xf32> + %47 = arith.mulf %46, %27 : tensor<256xf32> + %48 = tt.splat %arg6 : (!tt.ptr) -> tensor<256x!tt.ptr> + %49 = tt.addptr %48, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + tt.store %49, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + %50 = tt.splat %arg7 : (!tt.ptr) -> tensor<256x!tt.ptr> + %51 = tt.addptr %50, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %52 = arith.truncf %47 : tensor<256xf32> to tensor<256xbf16> + tt.store %51, %52, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16> + tt.return + } +} diff --git a/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.cubin b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..3a8d5c32a62691c08d1e0f5869994e206e4639eb Binary files /dev/null and b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.cubin differ diff --git a/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.llir b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..853a4acd7a83c86c8f6d2fbd5fc8703cea577e81 --- /dev/null +++ b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.llir @@ -0,0 +1,283 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +define void @triton__0d1d2d3d4de5de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5) local_unnamed_addr !dbg !7 { + %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %8 = and i32 %7, 31, !dbg !10 + %9 = lshr i32 %7, 5, !dbg !10 + %10 = and i32 %9, 1, !dbg !10 + %urem = shl i32 %7, 2, !dbg !10 + %11 = and i32 %urem, 252, !dbg !10 + %12 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11 + %13 = shl i32 %12, 8, !dbg !12 + %14 = or i32 %13, %11, !dbg !13 + %15 = sext i32 %14 to i64, !dbg !14 + %16 = getelementptr float, ptr addrspace(1) %0, i64 %15, !dbg !14 + %17 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %16, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15 + %18 = extractvalue { i32, i32, i32, i32 } %17, 0, !dbg !15 + %19 = extractvalue { i32, i32, i32, i32 } %17, 1, !dbg !15 + %20 = extractvalue { i32, i32, i32, i32 } %17, 2, !dbg !15 + %21 = extractvalue { i32, i32, i32, i32 } %17, 3, !dbg !15 + %22 = bitcast i32 %18 to float, !dbg !15 + %23 = bitcast i32 %19 to float, !dbg !15 + %24 = bitcast i32 %20 to float, !dbg !15 + %25 = bitcast i32 %21 to float, !dbg !15 + %26 = getelementptr i16, ptr addrspace(1) %1, i64 %15, !dbg !16 + %27 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %26, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17 + %28 = extractvalue { i32, i32 } %27, 0, !dbg !17 + %29 = extractvalue { i32, i32 } %27, 1, !dbg !17 + %30 = trunc i32 %28 to i16, !dbg !17 + %extelt.offset = lshr i32 %28, 16, !dbg !17 + %31 = trunc i32 %extelt.offset to i16, !dbg !17 + %32 = trunc i32 %29 to i16, !dbg !17 + %extelt.offset1 = lshr i32 %29, 16, !dbg !17 + %33 = trunc i32 %extelt.offset1 to i16, !dbg !17 + %34 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %30) #6, !dbg !18 + %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18 + %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18 + %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #6, !dbg !18 + %38 = zext nneg i32 %11 to i64, !dbg !19 + %39 = getelementptr float, ptr addrspace(1) %2, i64 %38, !dbg !19 + %40 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %39, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20 + %41 = fadd float %34, %22, !dbg !21 + %42 = fadd float %35, %23, !dbg !21 + %43 = fadd float %36, %24, !dbg !21 + %44 = fadd float %37, %25, !dbg !21 + %45 = fadd float %41, %42, !dbg !22 + %46 = fadd float %45, %43, !dbg !22 + %47 = fadd float %46, %44, !dbg !22 + %48 = bitcast float %47 to i32, !dbg !28 + %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 16, i32 31), !dbg !28 + %50 = bitcast i32 %49 to float, !dbg !28 + %51 = fadd float %47, %50, !dbg !22 + %52 = bitcast float %51 to i32, !dbg !28 + %53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 8, i32 31), !dbg !28 + %54 = bitcast i32 %53 to float, !dbg !28 + %55 = fadd float %51, %54, !dbg !22 + %56 = bitcast float %55 to i32, !dbg !28 + %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 4, i32 31), !dbg !28 + %58 = bitcast i32 %57 to float, !dbg !28 + %59 = fadd float %55, %58, !dbg !22 + %60 = bitcast float %59 to i32, !dbg !28 + %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 2, i32 31), !dbg !28 + %62 = bitcast i32 %61 to float, !dbg !28 + %63 = fadd float %59, %62, !dbg !22 + %64 = bitcast float %63 to i32, !dbg !28 + %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 1, i32 31), !dbg !28 + %66 = bitcast i32 %65 to float, !dbg !28 + %67 = fadd float %63, %66, !dbg !22 + %68 = icmp eq i32 %8, 0, !dbg !28 + %69 = zext nneg i32 %10 to i64, !dbg !28 + %70 = getelementptr float, ptr addrspace(3) @global_smem, i64 %69, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, float %67, i1 %68) #6, !dbg !28 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %71 = icmp slt i32 %7, 2, !dbg !28 + %72 = sext i32 %7 to i64, !dbg !28 + %73 = getelementptr float, ptr addrspace(3) @global_smem, i64 %72, !dbg !28 + %74 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %73, i1 %71) #6, !dbg !28 + %75 = bitcast float %74 to i32, !dbg !28 + %76 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %75, i32 1, i32 31), !dbg !28 + %77 = bitcast i32 %76 to float, !dbg !28 + %78 = fadd float %74, %77, !dbg !22 + %79 = and i32 %7, 1, !dbg !28 + %80 = icmp eq i32 %79, 0, !dbg !28 + %81 = and i1 %71, %80, !dbg !28 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %73, float %78, i1 %81) #6, !dbg !28 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %82 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !28 + %83 = fadd float %82, 0.000000e+00, !dbg !30 + %84 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %83, float 2.560000e+02) #6, !dbg !34 + %85 = fsub float %41, %84, !dbg !35 + %86 = fsub float %42, %84, !dbg !35 + %87 = fsub float %43, %84, !dbg !35 + %88 = fsub float %44, %84, !dbg !35 + %89 = fmul float %85, %85, !dbg !36 + %90 = fmul float %86, %86, !dbg !36 + %91 = fmul float %87, %87, !dbg !36 + %92 = fmul float %88, %88, !dbg !36 + tail call void @llvm.nvvm.barrier0(), !dbg !37 + %93 = fadd float %89, %90, !dbg !39 + %94 = fadd float %91, %93, !dbg !39 + %95 = fadd float %92, %94, !dbg !39 + %96 = bitcast float %95 to i32, !dbg !37 + %97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 16, i32 31), !dbg !37 + %98 = bitcast i32 %97 to float, !dbg !37 + %99 = fadd float %95, %98, !dbg !39 + %100 = bitcast float %99 to i32, !dbg !37 + %101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 8, i32 31), !dbg !37 + %102 = bitcast i32 %101 to float, !dbg !37 + %103 = fadd float %99, %102, !dbg !39 + %104 = bitcast float %103 to i32, !dbg !37 + %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 4, i32 31), !dbg !37 + %106 = bitcast i32 %105 to float, !dbg !37 + %107 = fadd float %103, %106, !dbg !39 + %108 = bitcast float %107 to i32, !dbg !37 + %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 2, i32 31), !dbg !37 + %110 = bitcast i32 %109 to float, !dbg !37 + %111 = fadd float %107, %110, !dbg !39 + %112 = bitcast float %111 to i32, !dbg !37 + %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !37 + %114 = bitcast i32 %113 to float, !dbg !37 + %115 = fadd float %111, %114, !dbg !39 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %70, float %115, i1 %68) #6, !dbg !37 + tail call void @llvm.nvvm.barrier0(), !dbg !37 + %116 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %73, i1 %71) #6, !dbg !37 + %117 = bitcast float %116 to i32, !dbg !37 + %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 1, i32 31), !dbg !37 + %119 = bitcast i32 %118 to float, !dbg !37 + %120 = fadd float %116, %119, !dbg !39 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %73, float %120, i1 %81) #6, !dbg !37 + tail call void @llvm.nvvm.barrier0(), !dbg !37 + %121 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !37 + %122 = fadd float %121, 0.000000e+00, !dbg !42 + %123 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %122, float 2.560000e+02) #6, !dbg !44 + %124 = fadd float %123, 0x3EE4F8B580000000, !dbg !45 + %125 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 + %.not.i = icmp eq i32 %125, 0, !dbg !46 + br i1 %.not.i, label %128, label %126, !dbg !46 + +126: ; preds = %6 + %127 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %124), !dbg !46 + br label %__nv_rsqrtf.exit, !dbg !46 + +128: ; preds = %6 + %129 = tail call float @llvm.nvvm.rsqrt.approx.f(float %124), !dbg !46 + br label %__nv_rsqrtf.exit, !dbg !46 + +__nv_rsqrtf.exit: ; preds = %126, %128 + %.0.i = phi float [ %127, %126 ], [ %129, %128 ], !dbg !46 + %130 = extractvalue { i32, i32, i32, i32 } %40, 3, !dbg !20 + %131 = bitcast i32 %130 to float, !dbg !20 + %132 = extractvalue { i32, i32, i32, i32 } %40, 2, !dbg !20 + %133 = bitcast i32 %132 to float, !dbg !20 + %134 = extractvalue { i32, i32, i32, i32 } %40, 1, !dbg !20 + %135 = bitcast i32 %134 to float, !dbg !20 + %136 = extractvalue { i32, i32, i32, i32 } %40, 0, !dbg !20 + %137 = bitcast i32 %136 to float, !dbg !20 + %138 = fmul float %85, %.0.i, !dbg !47 + %139 = fmul float %86, %.0.i, !dbg !47 + %140 = fmul float %87, %.0.i, !dbg !47 + %141 = fmul float %88, %.0.i, !dbg !47 + %142 = fmul float %138, %137, !dbg !48 + %143 = fmul float %139, %135, !dbg !48 + %144 = fmul float %140, %133, !dbg !48 + %145 = fmul float %141, %131, !dbg !48 + %146 = getelementptr i16, ptr addrspace(1) %3, i64 %15, !dbg !49 + %147 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %142) #6, !dbg !50 + %148 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %143) #6, !dbg !50 + %149 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %144) #6, !dbg !50 + %150 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %145) #6, !dbg !50 + %151 = insertelement <2 x i16> undef, i16 %147, i64 0, !dbg !50 + %152 = insertelement <2 x i16> %151, i16 %148, i64 1, !dbg !50 + %153 = bitcast <2 x i16> %152 to i32, !dbg !50 + %154 = insertelement <2 x i16> undef, i16 %149, i64 0, !dbg !50 + %155 = insertelement <2 x i16> %154, i16 %150, i64 1, !dbg !50 + %156 = bitcast <2 x i16> %155 to i32, !dbg !50 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %153, i32 %156, ptr addrspace(1) %146, i1 true) #6, !dbg !50 + ret void, !dbg !51 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "cqh2dj355iatjzvi5cmz4txvjd3ap52shgash4czifdcnafnkkam.py", directory: "/tmp/torchinductor_root/qh") +!4 = !{ptr @triton__0d1d2d3d4de5de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4de5de, !"maxntidx", i32 64} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4de5de", linkageName: "triton__0d1d2d3d4de5de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 26, column: 26, scope: !7) +!11 = !DILocation(line: 23, column: 28, scope: !7) +!12 = !DILocation(line: 30, column: 40, scope: !7) +!13 = !DILocation(line: 30, column: 36, scope: !7) +!14 = !DILocation(line: 30, column: 30, scope: !7) +!15 = !DILocation(line: 30, column: 46, scope: !7) +!16 = !DILocation(line: 31, column: 30, scope: !7) +!17 = !DILocation(line: 31, column: 46, scope: !7) +!18 = !DILocation(line: 31, column: 67, scope: !7) +!19 = !DILocation(line: 32, column: 31, scope: !7) +!20 = !DILocation(line: 32, column: 36, scope: !7) +!21 = !DILocation(line: 34, column: 18, scope: !7) +!22 = !DILocation(line: 233, column: 15, scope: !23, inlinedAt: !26) +!23 = distinct !DILexicalBlockFile(scope: !25, file: !24, discriminator: 0) +!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!25 = distinct !DILexicalBlockFile(scope: !7, file: !24, discriminator: 0) +!26 = !DILocation(line: 243, column: 36, scope: !23, inlinedAt: !27) +!27 = !DILocation(line: 39, column: 58, scope: !23) +!28 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !29) +!29 = !DILocation(line: 39, column: 58, scope: !25) +!30 = !DILocation(line: 8, column: 15, scope: !31, inlinedAt: !33) +!31 = distinct !DILexicalBlockFile(scope: !7, file: !32, discriminator: 0) +!32 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!33 = !DILocation(line: 39, column: 45, scope: !31) +!34 = !DILocation(line: 42, column: 20, scope: !7) +!35 = !DILocation(line: 43, column: 19, scope: !7) +!36 = !DILocation(line: 44, column: 20, scope: !7) +!37 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !38) +!38 = !DILocation(line: 47, column: 59, scope: !25) +!39 = !DILocation(line: 233, column: 15, scope: !23, inlinedAt: !40) +!40 = !DILocation(line: 243, column: 36, scope: !23, inlinedAt: !41) +!41 = !DILocation(line: 47, column: 59, scope: !23) +!42 = !DILocation(line: 8, column: 15, scope: !31, inlinedAt: !43) +!43 = !DILocation(line: 47, column: 45, scope: !31) +!44 = !DILocation(line: 50, column: 20, scope: !7) +!45 = !DILocation(line: 52, column: 20, scope: !7) +!46 = !DILocation(line: 53, column: 26, scope: !7) +!47 = !DILocation(line: 54, column: 20, scope: !7) +!48 = !DILocation(line: 55, column: 20, scope: !7) +!49 = !DILocation(line: 57, column: 25, scope: !7) +!50 = !DILocation(line: 57, column: 48, scope: !7) +!51 = !DILocation(line: 57, column: 4, scope: !7) diff --git a/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ptx b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..d081a462a34c1032b2c124b82ba12cb96ac1dc33 --- /dev/null +++ b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ptx @@ -0,0 +1,687 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4de5de +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4de5de( + .param .u64 triton__0d1d2d3d4de5de_param_0, + .param .u64 triton__0d1d2d3d4de5de_param_1, + .param .u64 triton__0d1d2d3d4de5de_param_2, + .param .u64 triton__0d1d2d3d4de5de_param_3, + .param .u32 triton__0d1d2d3d4de5de_param_4, + .param .u32 triton__0d1d2d3d4de5de_param_5 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<23>; + .reg .b16 %rs<9>; + .reg .b32 %r<84>; + .reg .f32 %f<70>; + .reg .b64 %rd<12>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd5, [triton__0d1d2d3d4de5de_param_0]; + ld.param.u64 %rd6, [triton__0d1d2d3d4de5de_param_1]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r50, %tid.x; + and.b32 %r51, %r50, 31; + ld.param.u64 %rd7, [triton__0d1d2d3d4de5de_param_2]; + ld.param.u64 %rd8, [triton__0d1d2d3d4de5de_param_3]; + shl.b32 %r52, %r50, 2; + and.b32 %r53, %r52, 252; + .loc 1 23 28 + mov.u32 %r1, %ctaid.x; + .loc 1 30 40 + shl.b32 %r54, %r1, 8; + .loc 1 30 36 + or.b32 %r55, %r54, %r53; + .loc 1 30 30 + mul.wide.s32 %rd9, %r55, 4; + add.s64 %rd1, %rd5, %rd9; + mov.b32 %r6, 0; + mov.pred %p1, -1; + .loc 1 30 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + mov.b32 %f1, %r2; + mov.b32 %f2, %r3; + mov.b32 %f3, %r4; + mov.b32 %f4, %r5; + .loc 1 31 30 + mul.wide.s32 %rd10, %r55, 2; + add.s64 %rd2, %rd6, %rd10; + .loc 1 31 46 + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r6; + @!%p1 mov.u32 %r11, %r6; + cvt.u16.u32 %rs1, %r10; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; } + cvt.u16.u32 %rs3, %r11; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; } + .loc 1 31 67 + cvt.f32.bf16 %r14, %rs1; + mov.b32 %f5, %r14; + cvt.f32.bf16 %r15, %rs2; + mov.b32 %f6, %r15; + cvt.f32.bf16 %r16, %rs3; + mov.b32 %f7, %r16; + cvt.f32.bf16 %r17, %rs4; + mov.b32 %f8, %r17; + .loc 1 32 31 + mul.wide.u32 %rd11, %r53, 4; + add.s64 %rd3, %rd7, %rd11; + .loc 1 32 36 + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + mov.u32 %r20, 0x0; + mov.u32 %r21, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r6; + @!%p1 mov.u32 %r19, %r6; + @!%p1 mov.u32 %r20, %r6; + @!%p1 mov.u32 %r21, %r6; + .loc 1 34 18 + add.f32 %f9, %f5, %f1; + add.f32 %f10, %f6, %f2; + add.f32 %f11, %f7, %f3; + add.f32 %f12, %f8, %f4; +$L__tmp1: + .loc 2 233 15 + add.f32 %f13, %f9, %f10; + add.f32 %f14, %f13, %f11; + add.f32 %f15, %f14, %f12; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r56, %f15; + shfl.sync.bfly.b32 %r57, %r56, 16, 31, -1; + mov.b32 %f16, %r57; +$L__tmp3: + .loc 2 233 15 + add.f32 %f17, %f15, %f16; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r58, %f17; + shfl.sync.bfly.b32 %r59, %r58, 8, 31, -1; + mov.b32 %f18, %r59; +$L__tmp5: + .loc 2 233 15 + add.f32 %f19, %f17, %f18; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r60, %f19; + shfl.sync.bfly.b32 %r61, %r60, 4, 31, -1; + mov.b32 %f20, %r61; +$L__tmp7: + .loc 2 233 15 + add.f32 %f21, %f19, %f20; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r62, %f21; + shfl.sync.bfly.b32 %r63, %r62, 2, 31, -1; + mov.b32 %f22, %r63; +$L__tmp9: + .loc 2 233 15 + add.f32 %f23, %f21, %f22; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r64, %f23; + shfl.sync.bfly.b32 %r65, %r64, 1, 31, -1; + mov.b32 %f24, %r65; +$L__tmp11: + .loc 2 233 15 + add.f32 %f25, %f23, %f24; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p14, %r51, 0; + shr.u32 %r66, %r50, 3; + and.b32 %r67, %r66, 4; + mov.u32 %r68, global_smem; + add.s32 %r26, %r68, %r67; + mov.b32 %r27, %f25; + @%p14 st.shared.b32 [ %r26 + 0 ], %r27; + bar.sync 0; + setp.lt.s32 %p15, %r50, 2; + add.s32 %r29, %r68, %r52; + @%p15 ld.shared.b32 %r28, [ %r29 + 0 ]; + mov.b32 %f26, %r28; + shfl.sync.bfly.b32 %r69, %r28, 1, 31, -1; + mov.b32 %f27, %r69; +$L__tmp13: + .loc 2 233 15 + add.f32 %f28, %f26, %f27; +$L__tmp14: + .loc 2 243 36 + and.b32 %r70, %r50, 1; + setp.eq.b32 %p21, %r70, 1; + not.pred %p22, %p21; + and.pred %p16, %p15, %p22; + mov.b32 %r31, %f28; + @%p16 st.shared.b32 [ %r29 + 0 ], %r31; + bar.sync 0; + ld.shared.f32 %f29, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f30, %f29, 0f00000000; +$L__tmp16: + .loc 1 42 20 + mov.b32 %r33, %f30; + mov.b32 %r34, 1132462080; + div.full.f32 %r32, %r33, %r34; + mov.b32 %f31, %r32; + .loc 1 43 19 + sub.f32 %f32, %f9, %f31; + sub.f32 %f33, %f10, %f31; + sub.f32 %f34, %f11, %f31; + sub.f32 %f35, %f12, %f31; + .loc 1 44 20 + mul.f32 %f36, %f33, %f33; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f37, %f32, %f32, %f36; + fma.rn.f32 %f38, %f34, %f34, %f37; + fma.rn.f32 %f39, %f35, %f35, %f38; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r71, %f39; + shfl.sync.bfly.b32 %r72, %r71, 16, 31, -1; + mov.b32 %f40, %r72; +$L__tmp20: + .loc 2 233 15 + add.f32 %f41, %f39, %f40; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r73, %f41; + shfl.sync.bfly.b32 %r74, %r73, 8, 31, -1; + mov.b32 %f42, %r74; +$L__tmp22: + .loc 2 233 15 + add.f32 %f43, %f41, %f42; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r75, %f43; + shfl.sync.bfly.b32 %r76, %r75, 4, 31, -1; + mov.b32 %f44, %r76; +$L__tmp24: + .loc 2 233 15 + add.f32 %f45, %f43, %f44; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r77, %f45; + shfl.sync.bfly.b32 %r78, %r77, 2, 31, -1; + mov.b32 %f46, %r78; +$L__tmp26: + .loc 2 233 15 + add.f32 %f47, %f45, %f46; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r79, %f47; + shfl.sync.bfly.b32 %r80, %r79, 1, 31, -1; + mov.b32 %f48, %r80; +$L__tmp28: + .loc 2 233 15 + add.f32 %f49, %f47, %f48; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r36, %f49; + @%p14 st.shared.b32 [ %r26 + 0 ], %r36; + bar.sync 0; + @%p15 ld.shared.b32 %r37, [ %r29 + 0 ]; + mov.b32 %f50, %r37; + shfl.sync.bfly.b32 %r81, %r37, 1, 31, -1; + mov.b32 %f51, %r81; +$L__tmp30: + .loc 2 233 15 + add.f32 %f52, %f50, %f51; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r40, %f52; + @%p16 st.shared.b32 [ %r29 + 0 ], %r40; + bar.sync 0; + ld.shared.f32 %f53, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f54, %f53, 0f00000000; +$L__tmp33: + .loc 1 50 20 + mov.b32 %r42, %f54; + div.full.f32 %r41, %r42, %r34; + mov.b32 %f55, %r41; + .loc 1 52 20 + add.f32 %f56, %f55, 0f3727C5AC; + .loc 1 53 26 + rsqrt.approx.ftz.f32 %f57, %f56; + .loc 1 32 36 + mov.b32 %f58, %r21; + mov.b32 %f59, %r20; + mov.b32 %f60, %r19; + mov.b32 %f61, %r18; + .loc 1 54 20 + mul.f32 %f62, %f32, %f57; + mul.f32 %f63, %f33, %f57; + mul.f32 %f64, %f34, %f57; + mul.f32 %f65, %f35, %f57; + .loc 1 55 20 + mul.f32 %f66, %f62, %f61; + mul.f32 %f67, %f63, %f60; + mul.f32 %f68, %f64, %f59; + mul.f32 %f69, %f65, %f58; + .loc 1 57 25 + add.s64 %rd4, %rd8, %rd10; + .loc 1 57 48 + mov.b32 %r44, %f66; + cvt.rn.bf16.f32 %rs5, %r44; + mov.b32 %r45, %f67; + cvt.rn.bf16.f32 %rs6, %r45; + mov.b32 %r46, %f68; + cvt.rn.bf16.f32 %rs7, %r46; + mov.b32 %r47, %f69; + cvt.rn.bf16.f32 %rs8, %r47; + mov.b32 %r82, {%rs5, %rs6}; + mov.b32 %r83, {%rs7, %rs8}; + @%p1 st.global.v2.b32 [ %rd4 + 0 ], { %r82, %r83 }; + .loc 1 57 4 + ret; +$L__tmp34: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/qh/cqh2dj355iatjzvi5cmz4txvjd3ap52shgash4czifdcnafnkkam.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 391 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 113 +.b8 104 +.b8 50 +.b8 100 +.b8 106 +.b8 51 +.b8 53 +.b8 53 +.b8 105 +.b8 97 +.b8 116 +.b8 106 +.b8 122 +.b8 118 +.b8 105 +.b8 53 +.b8 99 +.b8 109 +.b8 122 +.b8 52 +.b8 116 +.b8 120 +.b8 118 +.b8 106 +.b8 100 +.b8 51 +.b8 97 +.b8 112 +.b8 53 +.b8 50 +.b8 115 +.b8 104 +.b8 103 +.b8 97 +.b8 115 +.b8 104 +.b8 52 +.b8 99 +.b8 122 +.b8 105 +.b8 102 +.b8 100 +.b8 99 +.b8 110 +.b8 97 +.b8 102 +.b8 110 +.b8 107 +.b8 107 +.b8 97 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 113 +.b8 104 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 101 +.b8 53 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 101 +.b8 53 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 39 +.b8 58 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 39 +.b8 58 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 39 +.b8 45 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 47 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 47 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 47 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 395 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 101 +.b8 53 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 395 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttgir b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..baeef93f1bde8390ba5b72235755ba68556b3cd1 --- /dev/null +++ b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttgir @@ -0,0 +1,58 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4de5de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant 9.99999974E-6 : f32 + %cst_1 = arith.constant 2.560000e+02 : f32 + %cst_2 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked> + %5 = arith.addi %1, %4 : tensor<256xi32, #blocked> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %16 = arith.addf %8, %12 : tensor<256xf32, #blocked> + %17 = arith.select %2, %16, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({ + ^bb0(%arg6: f32, %arg7: f32): + %36 = arith.addf %arg6, %arg7 : f32 + tt.reduce.return %36 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %19 = arith.addf %18, %cst_2 : f32 + %20 = arith.divf %19, %cst_1 : f32 + %21 = tt.splat %20 : (f32) -> tensor<256xf32, #blocked> + %22 = arith.subf %16, %21 : tensor<256xf32, #blocked> + %23 = arith.mulf %22, %22 : tensor<256xf32, #blocked> + %24 = arith.select %2, %23, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %25 = "tt.reduce"(%24) <{axis = 0 : i32}> ({ + ^bb0(%arg6: f32, %arg7: f32): + %36 = arith.addf %arg6, %arg7 : f32 + tt.reduce.return %36 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %26 = arith.addf %25, %cst_2 : f32 + %27 = arith.divf %26, %cst_1 : f32 + %28 = arith.addf %27, %cst_0 : f32 + %29 = tt.extern_elementwise %28 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %30 = tt.splat %29 : (f32) -> tensor<256xf32, #blocked> + %31 = arith.mulf %22, %30 : tensor<256xf32, #blocked> + %32 = arith.mulf %31, %15 : tensor<256xf32, #blocked> + %33 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %34 = tt.addptr %33, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %35 = arith.truncf %32 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked> + tt.store %34, %35, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttir b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..a22267fb52fc938b1ffd5794663968a09ee5a8e6 --- /dev/null +++ b/.triton/dump/0bbf368bb6fbbd0528742f708246d167/triton_.ttir @@ -0,0 +1,57 @@ +module { + tt.func public @triton__0d1d2d3d4de5de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 2.560000e+02 : f32 + %cst_2 = arith.constant 9.99999974E-6 : f32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> + %cst_4 = arith.constant dense<256> : tensor<256xi32> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32> + %5 = arith.addi %1, %4 : tensor<256xi32> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr> + %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr>, tensor<256xi32> + %15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> + %16 = arith.addf %8, %12 : tensor<256xf32> + %17 = arith.select %2, %16, %cst_3 : tensor<256xi1>, tensor<256xf32> + %18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({ + ^bb0(%arg6: f32, %arg7: f32): + %36 = arith.addf %arg6, %arg7 : f32 + tt.reduce.return %36 : f32 + }) : (tensor<256xf32>) -> f32 + %19 = arith.addf %18, %cst_0 : f32 + %20 = arith.divf %19, %cst_1 : f32 + %21 = tt.splat %20 : (f32) -> tensor<256xf32> + %22 = arith.subf %16, %21 : tensor<256xf32> + %23 = arith.mulf %22, %22 : tensor<256xf32> + %24 = arith.select %2, %23, %cst_3 : tensor<256xi1>, tensor<256xf32> + %25 = "tt.reduce"(%24) <{axis = 0 : i32}> ({ + ^bb0(%arg6: f32, %arg7: f32): + %36 = arith.addf %arg6, %arg7 : f32 + tt.reduce.return %36 : f32 + }) : (tensor<256xf32>) -> f32 + %26 = arith.addf %25, %cst_0 : f32 + %27 = arith.divf %26, %cst_1 : f32 + %28 = arith.addf %27, %cst_2 : f32 + %29 = tt.extern_elementwise %28 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %30 = tt.splat %29 : (f32) -> tensor<256xf32> + %31 = arith.mulf %22, %30 : tensor<256xf32> + %32 = arith.mulf %31, %15 : tensor<256xf32> + %33 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr> + %34 = tt.addptr %33, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %35 = arith.truncf %32 : tensor<256xf32> to tensor<256xbf16> + tt.store %34, %35, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16> + tt.return + } +} diff --git a/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.cubin b/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..1a1019279a9e5949df1f24315c60d36abd2f603c Binary files /dev/null and b/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.cubin differ diff --git a/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ptx b/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..eec42267f35de6405fcd2d17eec6c194569a0b2c --- /dev/null +++ b/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ptx @@ -0,0 +1,1854 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6de7de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5d6de7de( + .param .u64 triton__0d1d2d3d4d5d6de7de_param_0, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_1, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_2, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_3, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_4, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_5, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_6, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_7 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<137>; + .reg .b16 %rs<49>; + .reg .b32 %r<439>; + .reg .f32 %f<487>; + .reg .b64 %rd<124>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6de7de_param_4]; + ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6de7de_param_1]; + ld.param.u64 %rd59, [triton__0d1d2d3d4d5d6de7de_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r89, %tid.x; + ld.param.u64 %rd60, [triton__0d1d2d3d4d5d6de7de_param_2]; + bfe.u32 %r90, %r89, 5, 3; + ld.param.u64 %rd61, [triton__0d1d2d3d4d5d6de7de_param_3]; + and.b32 %r91, %r89, 15; + .loc 1 24 33 + shl.b32 %r92, %r89, 3; + and.b32 %r1, %r92, 248; + and.b32 %r2, %r89, 255; + .loc 1 21 28 + mov.u32 %r24, %ctaid.x; + .loc 1 21 33 + shl.b32 %r93, %r24, 4; + .loc 1 22 23 + or.b32 %r94, %r93, %r90; + or.b32 %r95, %r94, 8; + or.b32 %r96, %r93, %r91; + .loc 1 26 30 + mul.wide.s32 %rd62, %r94, 8; + add.s64 %rd20, %rd59, %rd62; + add.s64 %rd36, %rd20, 64; + mul.wide.s32 %rd63, %r96, 8; + add.s64 %rd52, %rd59, %rd63; + mov.pred %p113, -1; + .loc 1 26 35 + mov.u64 %rd19, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd20 + 0 ]; + mov.u64 %rd21, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd20 + 0 ]; + mov.u64 %rd23, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd20 + 0 ]; + mov.u64 %rd25, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd20 + 0 ]; + mov.u64 %rd27, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd20 + 0 ]; + mov.u64 %rd29, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd29 }, [ %rd20 + 0 ]; + mov.u64 %rd31, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd31 }, [ %rd20 + 0 ]; + mov.u64 %rd33, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd33 }, [ %rd20 + 0 ]; + mov.u64 %rd35, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd35 }, [ %rd36 + 0 ]; + mov.u64 %rd37, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd36 + 0 ]; + mov.u64 %rd39, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd36 + 0 ]; + mov.u64 %rd41, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd41 }, [ %rd36 + 0 ]; + mov.u64 %rd43, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd43 }, [ %rd36 + 0 ]; + mov.u64 %rd45, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd45 }, [ %rd36 + 0 ]; + mov.u64 %rd47, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd47 }, [ %rd36 + 0 ]; + mov.u64 %rd49, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd49 }, [ %rd36 + 0 ]; + mov.u64 %rd51, 0x0; + @%p113 ld.global.L1::evict_last.b64 { %rd51 }, [ %rd52 + 0 ]; + .loc 1 27 18 + bfe.s32 %r97, %r24, 27, 1; + shr.u32 %r98, %r97, 23; + add.s32 %r99, %r94, %r98; + and.b32 %r100, %r99, 16776704; + sub.s32 %r101, %r94, %r100; + add.s32 %r102, %r95, %r98; + and.b32 %r103, %r102, 16776704; + sub.s32 %r104, %r95, %r103; + .loc 1 35 44 + shl.b32 %r105, %r101, 8; + shl.b32 %r106, %r104, 8; + .loc 1 35 40 + or.b32 %r107, %r105, %r1; + or.b32 %r108, %r106, %r1; + .loc 1 35 34 + mul.wide.s32 %rd64, %r107, 4; + add.s64 %rd89, %rd60, %rd64; + cvt.s64.s32 %rd65, %r105; + cvt.u64.u32 %rd66, %r1; + or.b64 %rd67, %rd65, %rd66; + shl.b64 %rd68, %rd67, 2; + add.s64 %rd69, %rd60, %rd68; + add.s64 %rd90, %rd69, 16; + mul.wide.s32 %rd70, %r108, 4; + add.s64 %rd91, %rd60, %rd70; + cvt.s64.s32 %rd71, %r106; + or.b64 %rd72, %rd71, %rd66; + shl.b64 %rd73, %rd72, 2; + add.s64 %rd74, %rd60, %rd73; + add.s64 %rd92, %rd74, 16; + mov.b32 %r325, 0; + .loc 1 35 50 + mov.u32 %r25, 0x0; + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + mov.u32 %r28, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd89 + 0 ]; + @!%p113 mov.u32 %r25, %r325; + @!%p113 mov.u32 %r26, %r325; + @!%p113 mov.u32 %r27, %r325; + @!%p113 mov.u32 %r28, %r325; + mov.b32 %f1, %r25; + mov.b32 %f2, %r26; + mov.b32 %f3, %r27; + mov.b32 %f4, %r28; + mov.u32 %r33, 0x0; + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + mov.u32 %r36, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd90 + 0 ]; + @!%p113 mov.u32 %r33, %r325; + @!%p113 mov.u32 %r34, %r325; + @!%p113 mov.u32 %r35, %r325; + @!%p113 mov.u32 %r36, %r325; + mov.b32 %f5, %r33; + mov.b32 %f6, %r34; + mov.b32 %f7, %r35; + mov.b32 %f8, %r36; + mov.u32 %r41, 0x0; + mov.u32 %r42, 0x0; + mov.u32 %r43, 0x0; + mov.u32 %r44, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r41, %r42, %r43, %r44 }, [ %rd91 + 0 ]; + @!%p113 mov.u32 %r41, %r325; + @!%p113 mov.u32 %r42, %r325; + @!%p113 mov.u32 %r43, %r325; + @!%p113 mov.u32 %r44, %r325; + mov.b32 %f9, %r41; + mov.b32 %f10, %r42; + mov.b32 %f11, %r43; + mov.b32 %f12, %r44; + mov.u32 %r49, 0x0; + mov.u32 %r50, 0x0; + mov.u32 %r51, 0x0; + mov.u32 %r52, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r49, %r50, %r51, %r52 }, [ %rd92 + 0 ]; + @!%p113 mov.u32 %r49, %r325; + @!%p113 mov.u32 %r50, %r325; + @!%p113 mov.u32 %r51, %r325; + @!%p113 mov.u32 %r52, %r325; + mov.b32 %f13, %r49; + mov.b32 %f14, %r50; + mov.b32 %f15, %r51; + mov.b32 %f16, %r52; + .loc 1 36 44 + shl.b32 %r109, %r94, 8; + shl.b32 %r110, %r95, 8; + .loc 1 36 40 + or.b32 %r111, %r109, %r1; + or.b32 %r112, %r110, %r1; + .loc 1 36 34 + mul.wide.s32 %rd75, %r111, 2; + add.s64 %rd93, %rd61, %rd75; + mul.wide.s32 %rd76, %r112, 2; + add.s64 %rd94, %rd61, %rd76; + .loc 1 36 50 + mov.u32 %r57, 0x0; + mov.u32 %r58, 0x0; + mov.u32 %r59, 0x0; + mov.u32 %r60, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r57, %r58, %r59, %r60 }, [ %rd93 + 0 ]; + @!%p113 mov.u32 %r57, %r325; + @!%p113 mov.u32 %r58, %r325; + @!%p113 mov.u32 %r59, %r325; + @!%p113 mov.u32 %r60, %r325; + cvt.u16.u32 %rs1, %r57; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r57; } + cvt.u16.u32 %rs3, %r58; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r58; } + cvt.u16.u32 %rs5, %r59; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r59; } + cvt.u16.u32 %rs7, %r60; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r60; } + mov.u32 %r65, 0x0; + mov.u32 %r66, 0x0; + mov.u32 %r67, 0x0; + mov.u32 %r68, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r65, %r66, %r67, %r68 }, [ %rd94 + 0 ]; + @!%p113 mov.u32 %r65, %r325; + @!%p113 mov.u32 %r66, %r325; + @!%p113 mov.u32 %r67, %r325; + @!%p113 mov.u32 %r68, %r325; + cvt.u16.u32 %rs9, %r65; + { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r65; } + cvt.u16.u32 %rs11, %r66; + { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r66; } + cvt.u16.u32 %rs13, %r67; + { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r67; } + cvt.u16.u32 %rs15, %r68; + { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r68; } + .loc 1 36 101 + cvt.f32.bf16 %r73, %rs1; + mov.b32 %f17, %r73; + cvt.f32.bf16 %r74, %rs2; + mov.b32 %f18, %r74; + cvt.f32.bf16 %r75, %rs3; + mov.b32 %f19, %r75; + cvt.f32.bf16 %r76, %rs4; + mov.b32 %f20, %r76; + cvt.f32.bf16 %r77, %rs5; + mov.b32 %f21, %r77; + cvt.f32.bf16 %r78, %rs6; + mov.b32 %f22, %r78; + cvt.f32.bf16 %r79, %rs7; + mov.b32 %f23, %r79; + cvt.f32.bf16 %r80, %rs8; + mov.b32 %f24, %r80; + cvt.f32.bf16 %r81, %rs9; + mov.b32 %f25, %r81; + cvt.f32.bf16 %r82, %rs10; + mov.b32 %f26, %r82; + cvt.f32.bf16 %r83, %rs11; + mov.b32 %f27, %r83; + cvt.f32.bf16 %r84, %rs12; + mov.b32 %f28, %r84; + cvt.f32.bf16 %r85, %rs13; + mov.b32 %f29, %r85; + cvt.f32.bf16 %r86, %rs14; + mov.b32 %f30, %r86; + cvt.f32.bf16 %r87, %rs15; + mov.b32 %f31, %r87; + cvt.f32.bf16 %r88, %rs16; + mov.b32 %f32, %r88; + .loc 1 37 22 + add.s64 %rd77, %rd51, 50257; + .loc 1 38 22 + setp.lt.s64 %p48, %rd51, 0; + .loc 1 39 36 + selp.b64 %rd11, %rd77, %rd51, %p48; + .loc 1 40 40 + setp.lt.u64 %p49, %rd11, 50257; + mov.b32 %r438, 883; + mov.u64 %rd123, 1; + .loc 1 40 55 + @%p49 bra $L__BB0_2; + mov.u64 %rd78, assertMessage_0; + cvta.global.u64 %rd79, %rd78; + mov.u64 %rd80, assertFile_0; + cvta.global.u64 %rd81, %rd80; + mov.u64 %rd82, assertFunc_0; + cvta.global.u64 %rd83, %rd82; + { // callseq 8, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd79; + .param .b64 param1; + st.param.b64 [param1+0], %rd81; + .param .b32 param2; + st.param.b32 [param2+0], %r438; + .param .b64 param3; + st.param.b64 [param3+0], %rd83; + .param .b64 param4; + st.param.b64 [param4+0], %rd123; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 8 +$L__BB0_2: + .loc 1 0 55 + ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6de7de_param_5]; + cvt.s64.s32 %rd7, %r111; + cvt.s64.s32 %rd9, %r112; + .loc 1 38 22 + setp.lt.s64 %p103, %rd35, 0; + setp.lt.s64 %p104, %rd19, 0; + .loc 1 41 44 + shl.b64 %rd96, %rd19, 8; + add.s64 %rd97, %rd96, 12865792; + selp.b64 %rd98, %rd97, %rd96, %p104; + shl.b64 %rd99, %rd35, 8; + add.s64 %rd100, %rd99, 12865792; + selp.b64 %rd101, %rd100, %rd99, %p103; + .loc 1 41 40 + or.b64 %rd103, %rd98, %rd66; + or.b64 %rd104, %rd101, %rd66; + .loc 1 41 34 + shl.b64 %rd105, %rd103, 2; + add.s64 %rd115, %rd16, %rd105; + add.s64 %rd116, %rd115, 16; + shl.b64 %rd106, %rd104, 2; + add.s64 %rd117, %rd16, %rd106; + add.s64 %rd118, %rd117, 16; + .loc 1 41 52 + mov.u32 %r114, 0x0; + mov.u32 %r115, 0x0; + mov.u32 %r116, 0x0; + mov.u32 %r117, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r114, %r115, %r116, %r117 }, [ %rd115 + 0 ]; + @!%p113 mov.u32 %r114, %r325; + @!%p113 mov.u32 %r115, %r325; + @!%p113 mov.u32 %r116, %r325; + @!%p113 mov.u32 %r117, %r325; + mov.b32 %f59, %r114; + mov.b32 %f60, %r115; + mov.b32 %f61, %r116; + mov.b32 %f62, %r117; + mov.u32 %r122, 0x0; + mov.u32 %r123, 0x0; + mov.u32 %r124, 0x0; + mov.u32 %r125, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r122, %r123, %r124, %r125 }, [ %rd116 + 0 ]; + @!%p113 mov.u32 %r122, %r325; + @!%p113 mov.u32 %r123, %r325; + @!%p113 mov.u32 %r124, %r325; + @!%p113 mov.u32 %r125, %r325; + mov.b32 %f63, %r122; + mov.b32 %f64, %r123; + mov.b32 %f65, %r124; + mov.b32 %f66, %r125; + mov.u32 %r130, 0x0; + mov.u32 %r131, 0x0; + mov.u32 %r132, 0x0; + mov.u32 %r133, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r130, %r131, %r132, %r133 }, [ %rd117 + 0 ]; + @!%p113 mov.u32 %r130, %r325; + @!%p113 mov.u32 %r131, %r325; + @!%p113 mov.u32 %r132, %r325; + @!%p113 mov.u32 %r133, %r325; + mov.b32 %f67, %r130; + mov.b32 %f68, %r131; + mov.b32 %f69, %r132; + mov.b32 %f70, %r133; + mov.u32 %r138, 0x0; + mov.u32 %r139, 0x0; + mov.u32 %r140, 0x0; + mov.u32 %r141, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r138, %r139, %r140, %r141 }, [ %rd118 + 0 ]; + @!%p113 mov.u32 %r138, %r325; + @!%p113 mov.u32 %r139, %r325; + @!%p113 mov.u32 %r140, %r325; + @!%p113 mov.u32 %r141, %r325; + mov.b32 %f71, %r138; + mov.b32 %f72, %r139; + mov.b32 %f73, %r140; + mov.b32 %f74, %r141; + .loc 1 42 22 + add.f32 %f75, %f1, %f59; + add.f32 %f76, %f2, %f60; + add.f32 %f77, %f3, %f61; + add.f32 %f78, %f4, %f62; + add.f32 %f79, %f5, %f63; + add.f32 %f80, %f6, %f64; + add.f32 %f81, %f7, %f65; + add.f32 %f82, %f8, %f66; + add.f32 %f83, %f9, %f67; + add.f32 %f84, %f10, %f68; + add.f32 %f85, %f11, %f69; + add.f32 %f86, %f12, %f70; + add.f32 %f87, %f13, %f71; + add.f32 %f88, %f14, %f72; + add.f32 %f89, %f15, %f73; + add.f32 %f90, %f16, %f74; + .loc 1 44 22 + add.f32 %f91, %f17, %f75; + add.f32 %f92, %f18, %f76; + add.f32 %f93, %f19, %f77; + add.f32 %f94, %f20, %f78; + add.f32 %f95, %f21, %f79; + add.f32 %f96, %f22, %f80; + add.f32 %f97, %f23, %f81; + add.f32 %f98, %f24, %f82; + add.f32 %f99, %f25, %f83; + add.f32 %f100, %f26, %f84; + add.f32 %f101, %f27, %f85; + add.f32 %f102, %f28, %f86; + add.f32 %f103, %f29, %f87; + add.f32 %f104, %f30, %f88; + add.f32 %f105, %f31, %f89; + add.f32 %f106, %f32, %f90; +$L__tmp1: + .loc 2 98 22 + add.f32 %f107, %f91, 0f00000000; + add.f32 %f108, %f92, 0f00000000; + add.f32 %f109, %f93, 0f00000000; + add.f32 %f110, %f94, 0f00000000; + add.f32 %f111, %f95, 0f00000000; + add.f32 %f112, %f96, 0f00000000; + add.f32 %f113, %f97, 0f00000000; + add.f32 %f114, %f98, 0f00000000; + add.f32 %f115, %f99, 0f00000000; + add.f32 %f116, %f100, 0f00000000; + add.f32 %f117, %f101, 0f00000000; + add.f32 %f118, %f102, 0f00000000; + add.f32 %f119, %f103, 0f00000000; + add.f32 %f120, %f104, 0f00000000; + add.f32 %f121, %f105, 0f00000000; + add.f32 %f122, %f106, 0f00000000; + .loc 2 101 30 + sub.f32 %f123, %f91, %f107; + sub.f32 %f124, %f92, %f108; + sub.f32 %f125, %f93, %f109; + sub.f32 %f126, %f94, %f110; + sub.f32 %f127, %f95, %f111; + sub.f32 %f128, %f96, %f112; + sub.f32 %f129, %f97, %f113; + sub.f32 %f130, %f98, %f114; + sub.f32 %f131, %f99, %f115; + sub.f32 %f132, %f100, %f116; + sub.f32 %f133, %f101, %f117; + sub.f32 %f134, %f102, %f118; + sub.f32 %f135, %f103, %f119; + sub.f32 %f136, %f104, %f120; + sub.f32 %f137, %f105, %f121; + sub.f32 %f138, %f106, %f122; + .loc 2 101 13 + fma.rn.f32 %f139, %f91, %f123, 0f00000000; + fma.rn.f32 %f140, %f92, %f124, 0f00000000; + fma.rn.f32 %f141, %f93, %f125, 0f00000000; + fma.rn.f32 %f142, %f94, %f126, 0f00000000; + fma.rn.f32 %f143, %f95, %f127, 0f00000000; + fma.rn.f32 %f144, %f96, %f128, 0f00000000; + fma.rn.f32 %f145, %f97, %f129, 0f00000000; + fma.rn.f32 %f146, %f98, %f130, 0f00000000; + fma.rn.f32 %f147, %f99, %f131, 0f00000000; + fma.rn.f32 %f148, %f100, %f132, 0f00000000; + fma.rn.f32 %f149, %f101, %f133, 0f00000000; + fma.rn.f32 %f150, %f102, %f134, 0f00000000; + fma.rn.f32 %f151, %f103, %f135, 0f00000000; + fma.rn.f32 %f152, %f104, %f136, 0f00000000; + fma.rn.f32 %f153, %f105, %f137, 0f00000000; + fma.rn.f32 %f154, %f106, %f138, 0f00000000; +$L__tmp2: + .loc 2 108 21 + sub.f32 %f155, %f108, %f107; + mov.b32 %r147, 1065353216; + mov.b32 %r148, 1073741824; + .loc 2 110 60 + div.full.f32 %r146, %r147, %r148; + mov.b32 %f156, %r146; + .loc 2 112 17 + fma.rn.f32 %f157, %f156, %f155, %f107; + .loc 2 113 15 + add.f32 %f158, %f139, %f140; + .loc 2 113 30 + mul.f32 %f159, %f155, %f155; + .loc 2 113 22 + fma.rn.f32 %f160, %f156, %f159, %f158; + .loc 2 108 21 + sub.f32 %f161, %f109, %f157; + mov.b32 %r151, 1077936128; + .loc 2 110 60 + div.full.f32 %r149, %r147, %r151; + mov.b32 %f162, %r149; + .loc 2 112 17 + fma.rn.f32 %f163, %f162, %f161, %f157; + .loc 2 113 15 + add.f32 %f164, %f141, %f160; + .loc 2 113 30 + mul.f32 %f165, %f161, %f161; + .loc 2 113 38 + fma.rn.f32 %f166, %f161, %f161, %f165; + .loc 2 113 22 + fma.rn.f32 %f167, %f162, %f166, %f164; + .loc 2 108 21 + sub.f32 %f168, %f110, %f163; + mov.b32 %r154, 1082130432; + .loc 2 110 60 + div.full.f32 %r152, %r147, %r154; + mov.b32 %f169, %r152; + .loc 2 112 17 + fma.rn.f32 %f170, %f169, %f168, %f163; + .loc 2 113 15 + add.f32 %f171, %f142, %f167; + .loc 2 113 30 + mul.f32 %f172, %f168, %f168; + .loc 2 113 38 + mul.f32 %f173, %f172, 0f40400000; + .loc 2 113 22 + fma.rn.f32 %f174, %f169, %f173, %f171; + .loc 2 108 21 + sub.f32 %f175, %f111, %f170; + mov.b32 %r157, 1084227584; + .loc 2 110 60 + div.full.f32 %r155, %r147, %r157; + mov.b32 %f176, %r155; + .loc 2 112 17 + fma.rn.f32 %f177, %f176, %f175, %f170; + .loc 2 113 15 + add.f32 %f178, %f143, %f174; + .loc 2 113 30 + mul.f32 %f179, %f175, %f175; + .loc 2 113 38 + mul.f32 %f180, %f179, 0f40800000; + .loc 2 113 22 + fma.rn.f32 %f181, %f176, %f180, %f178; + .loc 2 108 21 + sub.f32 %f182, %f112, %f177; + mov.b32 %r160, 1086324736; + .loc 2 110 60 + div.full.f32 %r158, %r147, %r160; + mov.b32 %f183, %r158; + .loc 2 112 17 + fma.rn.f32 %f184, %f183, %f182, %f177; + .loc 2 113 15 + add.f32 %f185, %f144, %f181; + .loc 2 113 30 + mul.f32 %f186, %f182, %f182; + .loc 2 113 38 + mul.f32 %f187, %f186, 0f40A00000; + .loc 2 113 22 + fma.rn.f32 %f188, %f183, %f187, %f185; + .loc 2 108 21 + sub.f32 %f189, %f113, %f184; + mov.b32 %r163, 1088421888; + .loc 2 110 60 + div.full.f32 %r161, %r147, %r163; + mov.b32 %f190, %r161; + .loc 2 112 17 + fma.rn.f32 %f191, %f190, %f189, %f184; + .loc 2 113 15 + add.f32 %f192, %f145, %f188; + .loc 2 113 30 + mul.f32 %f193, %f189, %f189; + .loc 2 113 38 + mul.f32 %f194, %f193, 0f40C00000; + .loc 2 113 22 + fma.rn.f32 %f195, %f190, %f194, %f192; + .loc 2 108 21 + sub.f32 %f196, %f114, %f191; + mov.b32 %r166, 1090519040; + .loc 2 110 60 + div.full.f32 %r164, %r147, %r166; + mov.b32 %f197, %r164; + .loc 2 112 17 + fma.rn.f32 %f198, %f197, %f196, %f191; + .loc 2 113 15 + add.f32 %f199, %f146, %f195; + .loc 2 113 30 + mul.f32 %f200, %f196, %f196; + .loc 2 113 38 + mul.f32 %f201, %f200, 0f40E00000; + .loc 2 113 22 + fma.rn.f32 %f202, %f197, %f201, %f199; + .loc 2 108 21 + sub.f32 %f203, %f116, %f115; + .loc 2 110 60 + div.full.f32 %r167, %r147, %r148; + mov.b32 %f204, %r167; + .loc 2 112 17 + fma.rn.f32 %f205, %f203, %f204, %f115; + .loc 2 113 15 + add.f32 %f206, %f147, %f148; + .loc 2 113 30 + mul.f32 %f207, %f203, %f203; + .loc 2 113 22 + fma.rn.f32 %f208, %f207, %f204, %f206; + .loc 2 108 21 + sub.f32 %f209, %f117, %f205; + .loc 2 110 60 + div.full.f32 %r170, %r147, %r151; + mov.b32 %f210, %r170; + .loc 2 112 17 + fma.rn.f32 %f211, %f210, %f209, %f205; + .loc 2 113 15 + add.f32 %f212, %f149, %f208; + .loc 2 113 30 + mul.f32 %f213, %f209, %f209; + .loc 2 113 38 + fma.rn.f32 %f214, %f209, %f209, %f213; + .loc 2 113 22 + fma.rn.f32 %f215, %f210, %f214, %f212; + .loc 2 108 21 + sub.f32 %f216, %f118, %f211; + .loc 2 110 60 + div.full.f32 %r173, %r147, %r154; + mov.b32 %f217, %r173; + .loc 2 112 17 + fma.rn.f32 %f218, %f217, %f216, %f211; + .loc 2 113 15 + add.f32 %f219, %f150, %f215; + .loc 2 113 30 + mul.f32 %f220, %f216, %f216; + .loc 2 113 38 + mul.f32 %f221, %f220, 0f40400000; + .loc 2 113 22 + fma.rn.f32 %f222, %f217, %f221, %f219; + .loc 2 108 21 + sub.f32 %f223, %f119, %f218; + .loc 2 110 60 + div.full.f32 %r176, %r147, %r157; + mov.b32 %f224, %r176; + .loc 2 112 17 + fma.rn.f32 %f225, %f224, %f223, %f218; + .loc 2 113 15 + add.f32 %f226, %f151, %f222; + .loc 2 113 30 + mul.f32 %f227, %f223, %f223; + .loc 2 113 38 + mul.f32 %f228, %f227, 0f40800000; + .loc 2 113 22 + fma.rn.f32 %f229, %f224, %f228, %f226; + .loc 2 108 21 + sub.f32 %f230, %f120, %f225; + .loc 2 110 60 + div.full.f32 %r179, %r147, %r160; + mov.b32 %f231, %r179; + .loc 2 112 17 + fma.rn.f32 %f232, %f231, %f230, %f225; + .loc 2 113 15 + add.f32 %f233, %f152, %f229; + .loc 2 113 30 + mul.f32 %f234, %f230, %f230; + .loc 2 113 38 + mul.f32 %f235, %f234, 0f40A00000; + .loc 2 113 22 + fma.rn.f32 %f236, %f231, %f235, %f233; + .loc 2 108 21 + sub.f32 %f237, %f121, %f232; + .loc 2 110 60 + div.full.f32 %r182, %r147, %r163; + mov.b32 %f238, %r182; + .loc 2 112 17 + fma.rn.f32 %f239, %f238, %f237, %f232; + .loc 2 113 15 + add.f32 %f240, %f153, %f236; + .loc 2 113 30 + mul.f32 %f241, %f237, %f237; + .loc 2 113 38 + mul.f32 %f242, %f241, 0f40C00000; + .loc 2 113 22 + fma.rn.f32 %f243, %f238, %f242, %f240; + .loc 2 108 21 + sub.f32 %f244, %f122, %f239; + .loc 2 110 60 + div.full.f32 %r185, %r147, %r166; + mov.b32 %f245, %r185; + .loc 2 112 17 + fma.rn.f32 %f246, %f245, %f244, %f239; + .loc 2 113 15 + add.f32 %f247, %f154, %f243; + .loc 2 113 30 + mul.f32 %f248, %f244, %f244; + .loc 2 113 38 + mul.f32 %f249, %f248, 0f40E00000; + .loc 2 113 22 + fma.rn.f32 %f250, %f245, %f249, %f247; +$L__tmp3: + .loc 2 120 46 + mov.b32 %r284, %f198; + shfl.sync.bfly.b32 %r285, %r284, 16, 31, -1; + mov.b32 %f251, %r285; + mov.b32 %r286, %f202; + shfl.sync.bfly.b32 %r287, %r286, 16, 31, -1; + mov.b32 %f252, %r287; + shfl.sync.bfly.b32 %r189, %r166, 16, 31, -1; + mov.b32 %f253, %r189; +$L__tmp4: + .loc 2 108 21 + sub.f32 %f254, %f251, %f198; + .loc 2 109 28 + add.f32 %f255, %f253, 0f41000000; + .loc 2 110 39 + setp.eq.f32 %p105, %f255, 0f00000000; + .loc 2 110 60 + mov.b32 %r190, %f255; + div.full.f32 %r188, %r189, %r190; + mov.b32 %f256, %r188; + .loc 2 110 49 + selp.f32 %f257, 0f00000000, %f256, %p105; + .loc 2 112 17 + fma.rn.f32 %f258, %f257, %f254, %f198; + .loc 2 113 15 + add.f32 %f259, %f202, %f252; + .loc 2 113 30 + mul.f32 %f260, %f254, %f254; + .loc 2 113 38 + mul.f32 %f261, %f260, 0f41000000; + .loc 2 113 22 + fma.rn.f32 %f262, %f257, %f261, %f259; +$L__tmp5: + .loc 2 120 46 + mov.b32 %r288, %f258; + shfl.sync.bfly.b32 %r289, %r288, 8, 31, -1; + mov.b32 %f263, %r289; + mov.b32 %r290, %f262; + shfl.sync.bfly.b32 %r291, %r290, 8, 31, -1; + mov.b32 %f264, %r291; + shfl.sync.bfly.b32 %r192, %r190, 8, 31, -1; + mov.b32 %f265, %r192; +$L__tmp6: + .loc 2 108 21 + sub.f32 %f266, %f263, %f258; + .loc 2 109 28 + add.f32 %f267, %f255, %f265; + .loc 2 110 39 + setp.eq.f32 %p106, %f267, 0f00000000; + .loc 2 110 60 + mov.b32 %r193, %f267; + div.full.f32 %r191, %r192, %r193; + mov.b32 %f268, %r191; + .loc 2 110 49 + selp.f32 %f269, 0f00000000, %f268, %p106; + .loc 2 112 17 + fma.rn.f32 %f270, %f269, %f266, %f258; + .loc 2 113 15 + add.f32 %f271, %f262, %f264; + .loc 2 113 30 + mul.f32 %f272, %f266, %f266; + .loc 2 113 38 + mul.f32 %f273, %f255, %f272; + .loc 2 113 22 + fma.rn.f32 %f274, %f269, %f273, %f271; +$L__tmp7: + .loc 2 120 46 + mov.b32 %r292, %f270; + shfl.sync.bfly.b32 %r293, %r292, 4, 31, -1; + mov.b32 %f275, %r293; + mov.b32 %r294, %f274; + shfl.sync.bfly.b32 %r295, %r294, 4, 31, -1; + mov.b32 %f276, %r295; + shfl.sync.bfly.b32 %r195, %r193, 4, 31, -1; + mov.b32 %f277, %r195; +$L__tmp8: + .loc 2 108 21 + sub.f32 %f278, %f275, %f270; + .loc 2 109 28 + add.f32 %f279, %f267, %f277; + .loc 2 110 39 + setp.eq.f32 %p107, %f279, 0f00000000; + .loc 2 110 60 + mov.b32 %r196, %f279; + div.full.f32 %r194, %r195, %r196; + mov.b32 %f280, %r194; + .loc 2 110 49 + selp.f32 %f281, 0f00000000, %f280, %p107; + .loc 2 112 17 + fma.rn.f32 %f282, %f281, %f278, %f270; + .loc 2 113 15 + add.f32 %f283, %f274, %f276; + .loc 2 113 30 + mul.f32 %f284, %f278, %f278; + .loc 2 113 38 + mul.f32 %f285, %f267, %f284; + .loc 2 113 22 + fma.rn.f32 %f286, %f281, %f285, %f283; +$L__tmp9: + .loc 2 120 46 + mov.b32 %r296, %f282; + shfl.sync.bfly.b32 %r297, %r296, 2, 31, -1; + mov.b32 %f287, %r297; + mov.b32 %r298, %f286; + shfl.sync.bfly.b32 %r299, %r298, 2, 31, -1; + mov.b32 %f288, %r299; + shfl.sync.bfly.b32 %r198, %r196, 2, 31, -1; + mov.b32 %f289, %r198; +$L__tmp10: + .loc 2 108 21 + sub.f32 %f290, %f287, %f282; + .loc 2 109 28 + add.f32 %f33, %f279, %f289; + .loc 2 110 39 + setp.eq.f32 %p108, %f33, 0f00000000; + .loc 2 110 60 + mov.b32 %r199, %f33; + div.full.f32 %r197, %r198, %r199; + mov.b32 %f291, %r197; + .loc 2 110 49 + selp.f32 %f292, 0f00000000, %f291, %p108; + .loc 2 112 17 + fma.rn.f32 %f34, %f290, %f292, %f282; + .loc 2 113 15 + add.f32 %f293, %f286, %f288; + .loc 2 113 30 + mul.f32 %f294, %f290, %f290; + .loc 2 113 38 + mul.f32 %f295, %f279, %f294; + .loc 2 113 22 + fma.rn.f32 %f35, %f292, %f295, %f293; +$L__tmp11: + .loc 2 120 46 + mov.b32 %r300, %f34; + shfl.sync.bfly.b32 %r3, %r300, 1, 31, -1; + mov.b32 %r301, %f35; + shfl.sync.bfly.b32 %r4, %r301, 1, 31, -1; + shfl.sync.bfly.b32 %r201, %r199, 1, 31, -1; + mov.b32 %f296, %r201; +$L__tmp12: + .loc 2 109 28 + add.f32 %f36, %f33, %f296; + .loc 2 110 60 + mov.b32 %r202, %f36; + div.full.f32 %r200, %r201, %r202; + mov.b32 %f37, %r200; +$L__tmp13: + .loc 2 120 46 + mov.b32 %r302, %f246; + shfl.sync.bfly.b32 %r303, %r302, 16, 31, -1; + mov.b32 %f297, %r303; + mov.b32 %r304, %f250; + shfl.sync.bfly.b32 %r305, %r304, 16, 31, -1; + mov.b32 %f298, %r305; + shfl.sync.bfly.b32 %r204, %r166, 16, 31, -1; + mov.b32 %f299, %r204; +$L__tmp14: + .loc 2 108 21 + sub.f32 %f300, %f297, %f246; + .loc 2 109 28 + add.f32 %f301, %f299, 0f41000000; + .loc 2 110 39 + setp.eq.f32 %p109, %f301, 0f00000000; + .loc 2 110 60 + mov.b32 %r205, %f301; + div.full.f32 %r203, %r204, %r205; + mov.b32 %f302, %r203; + .loc 2 110 49 + selp.f32 %f303, 0f00000000, %f302, %p109; + .loc 2 112 17 + fma.rn.f32 %f304, %f300, %f303, %f246; + .loc 2 113 15 + add.f32 %f305, %f250, %f298; + .loc 2 113 30 + mul.f32 %f306, %f300, %f300; + .loc 2 113 38 + mul.f32 %f307, %f306, 0f41000000; + .loc 2 113 22 + fma.rn.f32 %f308, %f307, %f303, %f305; +$L__tmp15: + .loc 2 120 46 + mov.b32 %r306, %f304; + shfl.sync.bfly.b32 %r307, %r306, 8, 31, -1; + mov.b32 %f309, %r307; + mov.b32 %r308, %f308; + shfl.sync.bfly.b32 %r309, %r308, 8, 31, -1; + mov.b32 %f310, %r309; + shfl.sync.bfly.b32 %r207, %r205, 8, 31, -1; + mov.b32 %f311, %r207; +$L__tmp16: + .loc 2 108 21 + sub.f32 %f312, %f309, %f304; + .loc 2 109 28 + add.f32 %f313, %f301, %f311; + .loc 2 110 39 + setp.eq.f32 %p110, %f313, 0f00000000; + .loc 2 110 60 + mov.b32 %r208, %f313; + div.full.f32 %r206, %r207, %r208; + mov.b32 %f314, %r206; + .loc 2 110 49 + selp.f32 %f315, 0f00000000, %f314, %p110; + .loc 2 112 17 + fma.rn.f32 %f316, %f312, %f315, %f304; + .loc 2 113 15 + add.f32 %f317, %f308, %f310; + .loc 2 113 30 + mul.f32 %f318, %f312, %f312; + .loc 2 113 38 + mul.f32 %f319, %f301, %f318; + .loc 2 113 22 + fma.rn.f32 %f320, %f315, %f319, %f317; +$L__tmp17: + .loc 2 120 46 + mov.b32 %r310, %f316; + shfl.sync.bfly.b32 %r311, %r310, 4, 31, -1; + mov.b32 %f321, %r311; + mov.b32 %r312, %f320; + shfl.sync.bfly.b32 %r313, %r312, 4, 31, -1; + mov.b32 %f322, %r313; + shfl.sync.bfly.b32 %r210, %r208, 4, 31, -1; + mov.b32 %f323, %r210; +$L__tmp18: + .loc 2 108 21 + sub.f32 %f324, %f321, %f316; + .loc 2 109 28 + add.f32 %f325, %f313, %f323; + .loc 2 110 39 + setp.eq.f32 %p111, %f325, 0f00000000; + .loc 2 110 60 + mov.b32 %r211, %f325; + div.full.f32 %r209, %r210, %r211; + mov.b32 %f326, %r209; + .loc 2 110 49 + selp.f32 %f327, 0f00000000, %f326, %p111; + .loc 2 112 17 + fma.rn.f32 %f328, %f324, %f327, %f316; + .loc 2 113 15 + add.f32 %f329, %f320, %f322; + .loc 2 113 30 + mul.f32 %f330, %f324, %f324; + .loc 2 113 38 + mul.f32 %f331, %f313, %f330; + .loc 2 113 22 + fma.rn.f32 %f332, %f327, %f331, %f329; +$L__tmp19: + .loc 2 120 46 + mov.b32 %r314, %f328; + shfl.sync.bfly.b32 %r315, %r314, 2, 31, -1; + mov.b32 %f333, %r315; + mov.b32 %r316, %f332; + shfl.sync.bfly.b32 %r317, %r316, 2, 31, -1; + mov.b32 %f334, %r317; + shfl.sync.bfly.b32 %r213, %r211, 2, 31, -1; + mov.b32 %f335, %r213; +$L__tmp20: + .loc 2 108 21 + sub.f32 %f336, %f333, %f328; + .loc 2 109 28 + add.f32 %f38, %f325, %f335; + .loc 2 110 39 + setp.eq.f32 %p112, %f38, 0f00000000; + .loc 2 110 60 + mov.b32 %r214, %f38; + div.full.f32 %r212, %r213, %r214; + mov.b32 %f337, %r212; + .loc 2 110 49 + selp.f32 %f338, 0f00000000, %f337, %p112; + .loc 2 112 17 + fma.rn.f32 %f39, %f336, %f338, %f328; + .loc 2 113 15 + add.f32 %f339, %f332, %f334; + .loc 2 113 30 + mul.f32 %f340, %f336, %f336; + .loc 2 113 38 + mul.f32 %f341, %f325, %f340; + .loc 2 113 22 + fma.rn.f32 %f40, %f338, %f341, %f339; +$L__tmp21: + .loc 2 120 46 + mov.b32 %r318, %f39; + shfl.sync.bfly.b32 %r5, %r318, 1, 31, -1; + mov.b32 %r319, %f40; + shfl.sync.bfly.b32 %r6, %r319, 1, 31, -1; + shfl.sync.bfly.b32 %r216, %r214, 1, 31, -1; + mov.b32 %f342, %r216; +$L__tmp22: + .loc 2 109 28 + add.f32 %f41, %f38, %f342; + .loc 2 110 60 + mov.b32 %r217, %f41; + div.full.f32 %r215, %r216, %r217; + mov.b32 %f42, %r215; +$L__tmp23: + .loc 1 62 51 + mov.u32 %r218, 0x0; + mov.u32 %r219, 0x0; + mov.u32 %r220, 0x0; + mov.u32 %r221, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r218, %r219, %r220, %r221 }, [ %rd89 + 0 ]; + @!%p113 mov.u32 %r218, %r325; + @!%p113 mov.u32 %r219, %r325; + @!%p113 mov.u32 %r220, %r325; + @!%p113 mov.u32 %r221, %r325; + mov.u32 %r226, 0x0; + mov.u32 %r227, 0x0; + mov.u32 %r228, 0x0; + mov.u32 %r229, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r226, %r227, %r228, %r229 }, [ %rd90 + 0 ]; + @!%p113 mov.u32 %r226, %r325; + @!%p113 mov.u32 %r227, %r325; + @!%p113 mov.u32 %r228, %r325; + @!%p113 mov.u32 %r229, %r325; + mov.u32 %r234, 0x0; + mov.u32 %r235, 0x0; + mov.u32 %r236, 0x0; + mov.u32 %r237, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r234, %r235, %r236, %r237 }, [ %rd91 + 0 ]; + @!%p113 mov.u32 %r234, %r325; + @!%p113 mov.u32 %r235, %r325; + @!%p113 mov.u32 %r236, %r325; + @!%p113 mov.u32 %r237, %r325; + mov.u32 %r242, 0x0; + mov.u32 %r243, 0x0; + mov.u32 %r244, 0x0; + mov.u32 %r245, 0x0; + @%p113 ld.global.L1::evict_last.v4.b32 { %r242, %r243, %r244, %r245 }, [ %rd92 + 0 ]; + @!%p113 mov.u32 %r242, %r325; + @!%p113 mov.u32 %r243, %r325; + @!%p113 mov.u32 %r244, %r325; + @!%p113 mov.u32 %r245, %r325; + .loc 1 63 51 + mov.u32 %r250, 0x0; + mov.u32 %r251, 0x0; + mov.u32 %r252, 0x0; + mov.u32 %r253, 0x0; + @%p113 ld.global.L1::evict_first.v4.b32 { %r250, %r251, %r252, %r253 }, [ %rd93 + 0 ]; + @!%p113 mov.u32 %r250, %r325; + @!%p113 mov.u32 %r251, %r325; + @!%p113 mov.u32 %r252, %r325; + @!%p113 mov.u32 %r253, %r325; + cvt.u16.u32 %rs17, %r250; + { .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r250; } + cvt.u16.u32 %rs19, %r251; + { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r251; } + cvt.u16.u32 %rs21, %r252; + { .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r252; } + cvt.u16.u32 %rs23, %r253; + { .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r253; } + mov.u32 %r258, 0x0; + mov.u32 %r259, 0x0; + mov.u32 %r260, 0x0; + mov.u32 %r261, 0x0; + @%p113 ld.global.L1::evict_first.v4.b32 { %r258, %r259, %r260, %r261 }, [ %rd94 + 0 ]; + @!%p113 mov.u32 %r258, %r325; + @!%p113 mov.u32 %r259, %r325; + @!%p113 mov.u32 %r260, %r325; + @!%p113 mov.u32 %r261, %r325; + cvt.u16.u32 %rs25, %r258; + { .reg .b16 tmp; mov.b32 {tmp, %rs26}, %r258; } + cvt.u16.u32 %rs27, %r259; + { .reg .b16 tmp; mov.b32 {tmp, %rs28}, %r259; } + cvt.u16.u32 %rs29, %r260; + { .reg .b16 tmp; mov.b32 {tmp, %rs30}, %r260; } + cvt.u16.u32 %rs31, %r261; + { .reg .b16 tmp; mov.b32 {tmp, %rs32}, %r261; } + .loc 1 63 103 + cvt.f32.bf16 %r266, %rs17; + mov.b32 %f43, %r266; + cvt.f32.bf16 %r267, %rs18; + mov.b32 %f44, %r267; + cvt.f32.bf16 %r268, %rs19; + mov.b32 %f45, %r268; + cvt.f32.bf16 %r269, %rs20; + mov.b32 %f46, %r269; + cvt.f32.bf16 %r270, %rs21; + mov.b32 %f47, %r270; + cvt.f32.bf16 %r271, %rs22; + mov.b32 %f48, %r271; + cvt.f32.bf16 %r272, %rs23; + mov.b32 %f49, %r272; + cvt.f32.bf16 %r273, %rs24; + mov.b32 %f50, %r273; + cvt.f32.bf16 %r274, %rs25; + mov.b32 %f51, %r274; + cvt.f32.bf16 %r275, %rs26; + mov.b32 %f52, %r275; + cvt.f32.bf16 %r276, %rs27; + mov.b32 %f53, %r276; + cvt.f32.bf16 %r277, %rs28; + mov.b32 %f54, %r277; + cvt.f32.bf16 %r278, %rs29; + mov.b32 %f55, %r278; + cvt.f32.bf16 %r279, %rs30; + mov.b32 %f56, %r279; + cvt.f32.bf16 %r280, %rs31; + mov.b32 %f57, %r280; + cvt.f32.bf16 %r281, %rs32; + mov.b32 %f58, %r281; + .loc 1 64 35 + mul.wide.u32 %rd107, %r2, 4; + add.s64 %rd95, %rd17, %rd107; + .loc 1 64 40 + mov.u32 %r282, 0x0; + @%p113 ld.global.L1::evict_last.b32 { %r282 }, [ %rd95 + 0 ]; + @!%p113 mov.u32 %r282, %r325; + .loc 1 68 57 + @%p49 bra $L__BB0_4; + mov.u64 %rd108, assertMessage_1; + cvta.global.u64 %rd109, %rd108; + mov.u64 %rd110, assertFile_1; + cvta.global.u64 %rd111, %rd110; + mov.u64 %rd112, assertFunc_1; + cvta.global.u64 %rd113, %rd112; + { // callseq 9, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd109; + .param .b64 param1; + st.param.b64 [param1+0], %rd111; + .param .b32 param2; + st.param.b32 [param2+0], %r438; + .param .b64 param3; + st.param.b64 [param3+0], %rd113; + .param .b64 param4; + st.param.b64 [param4+0], %rd123; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 9 +$L__BB0_4: +$L__tmp24: + .loc 2 120 46 + mov.b32 %f343, %r6; +$L__tmp25: + .loc 2 113 15 + add.f32 %f344, %f40, %f343; +$L__tmp26: + .loc 2 120 46 + mov.b32 %f345, %r5; +$L__tmp27: + .loc 2 108 21 + sub.f32 %f346, %f345, %f39; + .loc 2 113 30 + mul.f32 %f347, %f346, %f346; + .loc 2 113 38 + mul.f32 %f348, %f38, %f347; + .loc 2 110 39 + setp.eq.f32 %p135, %f41, 0f00000000; + .loc 2 110 49 + selp.f32 %f349, 0f00000000, %f42, %p135; + .loc 2 113 22 + fma.rn.f32 %f350, %f349, %f348, %f344; +$L__tmp28: + .loc 2 120 46 + mov.b32 %f351, %r4; +$L__tmp29: + .loc 2 113 15 + add.f32 %f352, %f35, %f351; +$L__tmp30: + .loc 2 120 46 + mov.b32 %f353, %r3; +$L__tmp31: + .loc 2 108 21 + sub.f32 %f354, %f353, %f34; + .loc 2 113 30 + mul.f32 %f355, %f354, %f354; + .loc 2 113 38 + mul.f32 %f356, %f33, %f355; + .loc 2 110 39 + setp.eq.f32 %p136, %f36, 0f00000000; + .loc 2 110 49 + selp.f32 %f357, 0f00000000, %f37, %p136; + .loc 2 113 22 + fma.rn.f32 %f358, %f357, %f356, %f352; +$L__tmp32: + .loc 1 69 54 + mov.u32 %r321, 0x0; + mov.u32 %r322, 0x0; + mov.u32 %r323, 0x0; + mov.u32 %r324, 0x0; + @%p113 ld.global.L1::evict_first.v4.b32 { %r321, %r322, %r323, %r324 }, [ %rd115 + 0 ]; + @!%p113 mov.u32 %r321, %r325; + @!%p113 mov.u32 %r322, %r325; + @!%p113 mov.u32 %r323, %r325; + @!%p113 mov.u32 %r324, %r325; + mov.u32 %r329, 0x0; + mov.u32 %r330, 0x0; + mov.u32 %r331, 0x0; + mov.u32 %r332, 0x0; + @%p113 ld.global.L1::evict_first.v4.b32 { %r329, %r330, %r331, %r332 }, [ %rd116 + 0 ]; + @!%p113 mov.u32 %r329, %r325; + @!%p113 mov.u32 %r330, %r325; + @!%p113 mov.u32 %r331, %r325; + @!%p113 mov.u32 %r332, %r325; + mov.u32 %r337, 0x0; + mov.u32 %r338, 0x0; + mov.u32 %r339, 0x0; + mov.u32 %r340, 0x0; + @%p113 ld.global.L1::evict_first.v4.b32 { %r337, %r338, %r339, %r340 }, [ %rd117 + 0 ]; + @!%p113 mov.u32 %r337, %r325; + @!%p113 mov.u32 %r338, %r325; + @!%p113 mov.u32 %r339, %r325; + @!%p113 mov.u32 %r340, %r325; + mov.u32 %r345, 0x0; + mov.u32 %r346, 0x0; + mov.u32 %r347, 0x0; + mov.u32 %r348, 0x0; + @%p113 ld.global.L1::evict_first.v4.b32 { %r345, %r346, %r347, %r348 }, [ %rd118 + 0 ]; + @!%p113 mov.u32 %r345, %r325; + @!%p113 mov.u32 %r346, %r325; + @!%p113 mov.u32 %r347, %r325; + @!%p113 mov.u32 %r348, %r325; + .loc 1 75 24 + mov.b32 %r354, %f358; + mov.b32 %r355, 1132462080; + div.full.f32 %r353, %r354, %r355; + mov.b32 %f359, %r353; + mov.b32 %r378, %f350; + div.full.f32 %r377, %r378, %r355; + mov.b32 %f360, %r377; + .loc 1 77 24 + add.f32 %f361, %f359, 0f3727C5AC; + add.f32 %f362, %f360, 0f3727C5AC; + .loc 1 78 30 + rsqrt.approx.ftz.f32 %f363, %f361; + rsqrt.approx.ftz.f32 %f364, %f362; + .loc 1 69 54 + mov.b32 %f365, %r348; + .loc 1 62 51 + mov.b32 %f366, %r245; + .loc 1 70 24 + add.f32 %f367, %f366, %f365; + .loc 1 72 24 + add.f32 %f368, %f58, %f367; +$L__tmp33: + .loc 2 112 17 + fma.rn.f32 %f369, %f346, %f349, %f39; +$L__tmp34: + .loc 1 73 24 + sub.f32 %f370, %f368, %f369; + .loc 1 69 54 + mov.b32 %f371, %r347; + .loc 1 62 51 + mov.b32 %f372, %r244; + .loc 1 70 24 + add.f32 %f373, %f372, %f371; + .loc 1 72 24 + add.f32 %f374, %f57, %f373; + .loc 1 73 24 + sub.f32 %f375, %f374, %f369; + .loc 1 69 54 + mov.b32 %f376, %r346; + .loc 1 62 51 + mov.b32 %f377, %r243; + .loc 1 70 24 + add.f32 %f378, %f377, %f376; + .loc 1 72 24 + add.f32 %f379, %f56, %f378; + .loc 1 73 24 + sub.f32 %f380, %f379, %f369; + .loc 1 69 54 + mov.b32 %f381, %r345; + .loc 1 62 51 + mov.b32 %f382, %r242; + .loc 1 70 24 + add.f32 %f383, %f382, %f381; + .loc 1 72 24 + add.f32 %f384, %f55, %f383; + .loc 1 73 24 + sub.f32 %f385, %f384, %f369; + .loc 1 69 54 + mov.b32 %f386, %r340; + .loc 1 62 51 + mov.b32 %f387, %r237; + .loc 1 70 24 + add.f32 %f388, %f387, %f386; + .loc 1 72 24 + add.f32 %f389, %f54, %f388; + .loc 1 73 24 + sub.f32 %f390, %f389, %f369; + .loc 1 69 54 + mov.b32 %f391, %r339; + .loc 1 62 51 + mov.b32 %f392, %r236; + .loc 1 70 24 + add.f32 %f393, %f392, %f391; + .loc 1 72 24 + add.f32 %f394, %f53, %f393; + .loc 1 73 24 + sub.f32 %f395, %f394, %f369; + .loc 1 69 54 + mov.b32 %f396, %r338; + .loc 1 62 51 + mov.b32 %f397, %r235; + .loc 1 70 24 + add.f32 %f398, %f397, %f396; + .loc 1 72 24 + add.f32 %f399, %f52, %f398; + .loc 1 73 24 + sub.f32 %f400, %f399, %f369; + .loc 1 69 54 + mov.b32 %f401, %r337; + .loc 1 62 51 + mov.b32 %f402, %r234; + .loc 1 70 24 + add.f32 %f403, %f402, %f401; + .loc 1 72 24 + add.f32 %f404, %f51, %f403; + .loc 1 73 24 + sub.f32 %f405, %f404, %f369; + .loc 1 69 54 + mov.b32 %f406, %r332; + .loc 1 62 51 + mov.b32 %f407, %r229; + .loc 1 70 24 + add.f32 %f408, %f407, %f406; + .loc 1 72 24 + add.f32 %f409, %f50, %f408; +$L__tmp35: + .loc 2 112 17 + fma.rn.f32 %f410, %f354, %f357, %f34; +$L__tmp36: + .loc 1 73 24 + sub.f32 %f411, %f409, %f410; + .loc 1 69 54 + mov.b32 %f412, %r331; + .loc 1 62 51 + mov.b32 %f413, %r228; + .loc 1 70 24 + add.f32 %f414, %f413, %f412; + .loc 1 72 24 + add.f32 %f415, %f49, %f414; + .loc 1 73 24 + sub.f32 %f416, %f415, %f410; + .loc 1 69 54 + mov.b32 %f417, %r330; + .loc 1 62 51 + mov.b32 %f418, %r227; + .loc 1 70 24 + add.f32 %f419, %f418, %f417; + .loc 1 72 24 + add.f32 %f420, %f48, %f419; + .loc 1 73 24 + sub.f32 %f421, %f420, %f410; + .loc 1 69 54 + mov.b32 %f422, %r329; + .loc 1 62 51 + mov.b32 %f423, %r226; + .loc 1 70 24 + add.f32 %f424, %f423, %f422; + .loc 1 72 24 + add.f32 %f425, %f47, %f424; + .loc 1 73 24 + sub.f32 %f426, %f425, %f410; + .loc 1 69 54 + mov.b32 %f427, %r324; + .loc 1 62 51 + mov.b32 %f428, %r221; + .loc 1 70 24 + add.f32 %f429, %f428, %f427; + .loc 1 72 24 + add.f32 %f430, %f46, %f429; + .loc 1 73 24 + sub.f32 %f431, %f430, %f410; + .loc 1 69 54 + mov.b32 %f432, %r323; + .loc 1 62 51 + mov.b32 %f433, %r220; + .loc 1 70 24 + add.f32 %f434, %f433, %f432; + .loc 1 72 24 + add.f32 %f435, %f45, %f434; + .loc 1 73 24 + sub.f32 %f436, %f435, %f410; + .loc 1 69 54 + mov.b32 %f437, %r322; + .loc 1 62 51 + mov.b32 %f438, %r219; + .loc 1 70 24 + add.f32 %f439, %f438, %f437; + .loc 1 72 24 + add.f32 %f440, %f44, %f439; + .loc 1 73 24 + sub.f32 %f441, %f440, %f410; + .loc 1 69 54 + mov.b32 %f442, %r321; + .loc 1 62 51 + mov.b32 %f443, %r218; + .loc 1 70 24 + add.f32 %f444, %f443, %f442; + .loc 1 72 24 + add.f32 %f445, %f43, %f444; + .loc 1 73 24 + sub.f32 %f446, %f445, %f410; + .loc 1 79 24 + mul.f32 %f447, %f446, %f363; + mul.f32 %f448, %f441, %f363; + mul.f32 %f449, %f436, %f363; + mul.f32 %f450, %f431, %f363; + mul.f32 %f451, %f426, %f363; + mul.f32 %f452, %f421, %f363; + mul.f32 %f453, %f416, %f363; + mul.f32 %f454, %f411, %f363; + mul.f32 %f455, %f405, %f364; + mul.f32 %f456, %f400, %f364; + mul.f32 %f457, %f395, %f364; + mul.f32 %f458, %f390, %f364; + mul.f32 %f459, %f385, %f364; + mul.f32 %f460, %f380, %f364; + mul.f32 %f461, %f375, %f364; + mul.f32 %f462, %f370, %f364; + .loc 1 80 24 + shl.b32 %r425, %r2, 2; + mov.u32 %r426, global_smem; + add.s32 %r427, %r426, %r425; + st.shared.u32 [%r427], %r282; + bar.sync 0; + shl.b32 %r428, %r1, 2; + add.s32 %r429, %r426, %r428; + ld.shared.v4.f32 {%f463, %f464, %f465, %f466}, [%r429]; + ld.shared.v4.f32 {%f467, %f468, %f469, %f470}, [%r429+16]; + mul.f32 %f471, %f447, %f463; + mul.f32 %f472, %f448, %f464; + mul.f32 %f473, %f449, %f465; + mul.f32 %f474, %f450, %f466; + mul.f32 %f475, %f451, %f467; + mul.f32 %f476, %f452, %f468; + mul.f32 %f477, %f453, %f469; + mul.f32 %f478, %f454, %f470; + mul.f32 %f479, %f455, %f463; + mul.f32 %f480, %f456, %f464; + mul.f32 %f481, %f457, %f465; + mul.f32 %f482, %f458, %f466; + mul.f32 %f483, %f459, %f467; + mul.f32 %f484, %f460, %f468; + mul.f32 %f485, %f461, %f469; + mul.f32 %f486, %f462, %f470; + .loc 1 82 29 + shl.b64 %rd121, %rd7, 1; + add.s64 %rd119, %rd18, %rd121; + shl.b64 %rd122, %rd9, 1; + add.s64 %rd120, %rd18, %rd122; + .loc 1 82 52 + mov.b32 %r401, %f471; + cvt.rn.bf16.f32 %rs33, %r401; + mov.b32 %r402, %f472; + cvt.rn.bf16.f32 %rs34, %r402; + mov.b32 %r403, %f473; + cvt.rn.bf16.f32 %rs35, %r403; + mov.b32 %r404, %f474; + cvt.rn.bf16.f32 %rs36, %r404; + mov.b32 %r405, %f475; + cvt.rn.bf16.f32 %rs37, %r405; + mov.b32 %r406, %f476; + cvt.rn.bf16.f32 %rs38, %r406; + mov.b32 %r407, %f477; + cvt.rn.bf16.f32 %rs39, %r407; + mov.b32 %r408, %f478; + cvt.rn.bf16.f32 %rs40, %r408; + mov.b32 %r409, %f479; + cvt.rn.bf16.f32 %rs41, %r409; + mov.b32 %r410, %f480; + cvt.rn.bf16.f32 %rs42, %r410; + mov.b32 %r411, %f481; + cvt.rn.bf16.f32 %rs43, %r411; + mov.b32 %r412, %f482; + cvt.rn.bf16.f32 %rs44, %r412; + mov.b32 %r413, %f483; + cvt.rn.bf16.f32 %rs45, %r413; + mov.b32 %r414, %f484; + cvt.rn.bf16.f32 %rs46, %r414; + mov.b32 %r415, %f485; + cvt.rn.bf16.f32 %rs47, %r415; + mov.b32 %r416, %f486; + cvt.rn.bf16.f32 %rs48, %r416; + mov.b32 %r430, {%rs33, %rs34}; + mov.b32 %r431, {%rs35, %rs36}; + mov.b32 %r432, {%rs37, %rs38}; + mov.b32 %r433, {%rs39, %rs40}; + @%p113 st.global.v4.b32 [ %rd119 + 0 ], { %r430, %r431, %r432, %r433 }; + mov.b32 %r434, {%rs41, %rs42}; + mov.b32 %r435, {%rs43, %rs44}; + mov.b32 %r436, {%rs45, %rs46}; + mov.b32 %r437, {%rs47, %rs48}; + @%p113 st.global.v4.b32 [ %rd120 + 0 ], { %r434, %r435, %r436, %r437 }; + .loc 1 58 4 + ret; +$L__tmp37: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 302 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 112 +.b8 110 +.b8 51 +.b8 108 +.b8 97 +.b8 119 +.b8 103 +.b8 54 +.b8 53 +.b8 108 +.b8 112 +.b8 105 +.b8 54 +.b8 51 +.b8 103 +.b8 118 +.b8 54 +.b8 99 +.b8 54 +.b8 112 +.b8 110 +.b8 52 +.b8 111 +.b8 105 +.b8 107 +.b8 104 +.b8 103 +.b8 54 +.b8 113 +.b8 118 +.b8 97 +.b8 50 +.b8 104 +.b8 50 +.b8 113 +.b8 106 +.b8 100 +.b8 112 +.b8 120 +.b8 101 +.b8 54 +.b8 113 +.b8 106 +.b8 52 +.b8 108 +.b8 118 +.b8 116 +.b8 116 +.b8 119 +.b8 101 +.b8 122 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 112 +.b8 110 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp2 +.b8 2 +.b8 47 +.b8 41 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp36 +.b8 2 +.b8 53 +.b8 44 +.b8 4 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp36 +.b8 2 +.b8 120 +.b8 46 +.b8 0 +.b8 4 +.b32 125 +.b64 $L__tmp3 +.b64 $L__tmp31 +.b8 2 +.b8 53 +.b8 44 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttgir b/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..17c7b6ff0a1e1c4975411464963e443d6507abb0 --- /dev/null +++ b/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttgir @@ -0,0 +1,134 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<512> : tensor<16x1xi32, #blocked> + %cst_0 = arith.constant dense<256> : tensor<1x256xi32, #blocked> + %cst_1 = arith.constant dense<256> : tensor<16x1xi32, #blocked> + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked> + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked> + %cst_4 = arith.constant dense<256> : tensor<16x1xi64, #blocked> + %cst_5 = arith.constant dense<50257> : tensor<16x1xi64, #blocked> + %cst_6 = arith.constant dense<0> : tensor<16x1xi64, #blocked> + %cst_7 = arith.constant dense<0> : tensor<16x1xi64, #blocked1> + %cst_8 = arith.constant dense<50257> : tensor<16x1xi64, #blocked1> + %cst_9 = arith.constant 0.000000e+00 : f32 + %cst_10 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked2> + %cst_11 = arith.constant dense<256> : tensor<1x256xi32, #blocked2> + %cst_12 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32, #blocked> + %cst_13 = arith.constant dense<2.560000e+02> : tensor<16x1xf32, #blocked> + %cst_14 = arith.constant dense<0.000000e+00> : tensor<16x256xf32, #blocked> + %cst_15 = arith.constant dense<0.000000e+00> : tensor<16x256xbf16, #blocked> + %c16_i32 = arith.constant 16 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c16_i32 : i32 + %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1> + %6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked> + %7 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked1> + %8 = arith.addi %6, %4 : tensor<16x1xi32, #blocked> + %9 = arith.addi %7, %5 : tensor<16x1xi32, #blocked1> + %10 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %11 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> + %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked> + %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x256xi32, #blocked2> + %14 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x1x!tt.ptr, #blocked> + %15 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x1x!tt.ptr, #blocked1> + %16 = tt.addptr %14, %8 : tensor<16x1x!tt.ptr, #blocked>, tensor<16x1xi32, #blocked> + %17 = tt.addptr %15, %9 : tensor<16x1x!tt.ptr, #blocked1>, tensor<16x1xi32, #blocked1> + %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked> + %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked1> + %20 = arith.remsi %8, %cst : tensor<16x1xi32, #blocked> + %21 = arith.cmpi slt, %12, %cst_0 : tensor<1x256xi32, #blocked> + %22 = arith.cmpi slt, %13, %cst_11 : tensor<1x256xi32, #blocked2> + %23 = arith.muli %20, %cst_1 : tensor<16x1xi32, #blocked> + %24 = tt.broadcast %12 : (tensor<1x256xi32, #blocked>) -> tensor<16x256xi32, #blocked> + %25 = tt.broadcast %23 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked> + %26 = arith.addi %24, %25 : tensor<16x256xi32, #blocked> + %27 = tt.splat %arg2 : (!tt.ptr) -> tensor<16x256x!tt.ptr, #blocked> + %28 = tt.addptr %27, %26 : tensor<16x256x!tt.ptr, #blocked>, tensor<16x256xi32, #blocked> + %29 = tt.broadcast %21 : (tensor<1x256xi1, #blocked>) -> tensor<16x256xi1, #blocked> + %30 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked> + %31 = arith.muli %8, %cst_1 : tensor<16x1xi32, #blocked> + %32 = tt.broadcast %31 : (tensor<16x1xi32, #blocked>) -> tensor<16x256xi32, #blocked> + %33 = arith.addi %24, %32 : tensor<16x256xi32, #blocked> + %34 = tt.splat %arg3 : (!tt.ptr) -> tensor<16x256x!tt.ptr, #blocked> + %35 = tt.addptr %34, %33 : tensor<16x256x!tt.ptr, #blocked>, tensor<16x256xi32, #blocked> + %36 = tt.load %35, %29, %cst_15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xbf16, #blocked> + %37 = arith.extf %36 : tensor<16x256xbf16, #blocked> to tensor<16x256xf32, #blocked> + %38 = arith.addi %18, %cst_5 : tensor<16x1xi64, #blocked> + %39 = arith.addi %19, %cst_8 : tensor<16x1xi64, #blocked1> + %40 = arith.cmpi slt, %18, %cst_6 : tensor<16x1xi64, #blocked> + %41 = arith.cmpi slt, %19, %cst_7 : tensor<16x1xi64, #blocked1> + %42 = arith.select %40, %38, %18 : tensor<16x1xi1, #blocked>, tensor<16x1xi64, #blocked> + %43 = arith.select %41, %39, %19 : tensor<16x1xi1, #blocked1>, tensor<16x1xi64, #blocked1> + %44 = arith.cmpi sge, %43, %cst_7 : tensor<16x1xi64, #blocked1> + %45 = arith.cmpi slt, %43, %cst_8 : tensor<16x1xi64, #blocked1> + %46 = arith.andi %44, %45 : tensor<16x1xi1, #blocked1> + tt.assert %46, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1> + %47 = arith.muli %42, %cst_4 : tensor<16x1xi64, #blocked> + %48 = tt.broadcast %47 : (tensor<16x1xi64, #blocked>) -> tensor<16x256xi64, #blocked> + %49 = arith.extsi %12 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked> + %50 = tt.broadcast %49 : (tensor<1x256xi64, #blocked>) -> tensor<16x256xi64, #blocked> + %51 = arith.addi %50, %48 : tensor<16x256xi64, #blocked> + %52 = tt.splat %arg1 : (!tt.ptr) -> tensor<16x256x!tt.ptr, #blocked> + %53 = tt.addptr %52, %51 : tensor<16x256x!tt.ptr, #blocked>, tensor<16x256xi64, #blocked> + %54 = tt.load %53, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked> + %55 = arith.addf %54, %30 : tensor<16x256xf32, #blocked> + %56 = arith.addf %55, %37 : tensor<16x256xf32, #blocked> + %57 = arith.addf %56, %cst_14 : tensor<16x256xf32, #blocked> + %58 = arith.subf %56, %57 : tensor<16x256xf32, #blocked> + %59 = arith.mulf %56, %58 : tensor<16x256xf32, #blocked> + %60 = arith.addf %59, %cst_14 : tensor<16x256xf32, #blocked> + %61 = arith.select %29, %57, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked> + %62 = arith.select %29, %60, %cst_14 : tensor<16x256xi1, #blocked>, tensor<16x256xf32, #blocked> + %63 = arith.select %21, %cst_2, %cst_3 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked> + %64 = tt.broadcast %63 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked> + %65:3 = "tt.reduce"(%61, %62, %64) <{axis = 1 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): + %90 = arith.subf %arg11, %arg8 : f32 + %91 = arith.addf %arg10, %arg13 : f32 + %92 = arith.cmpf oeq, %91, %cst_9 : f32 + %93 = arith.divf %arg13, %91 : f32 + %94 = arith.select %92, %cst_9, %93 : f32 + %95 = arith.mulf %90, %94 : f32 + %96 = arith.addf %arg8, %95 : f32 + %97 = arith.addf %arg9, %arg12 : f32 + %98 = arith.mulf %90, %90 : f32 + %99 = arith.mulf %98, %arg10 : f32 + %100 = arith.mulf %99, %94 : f32 + %101 = arith.addf %97, %100 : f32 + tt.reduce.return %96, %101, %91 : f32, f32, f32 + }) : (tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>, tensor<16x256xf32, #blocked>) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) + %66 = tt.expand_dims %65#0 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked> + %67 = tt.expand_dims %65#1 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked> + %68 = tt.load %28, %29, %cst_14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32, #blocked> + %69 = tt.load %35, %29, %cst_15 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xbf16, #blocked> + %70 = arith.extf %69 : tensor<16x256xbf16, #blocked> to tensor<16x256xf32, #blocked> + %71 = tt.splat %arg4 : (!tt.ptr) -> tensor<1x256x!tt.ptr, #blocked2> + %72 = tt.addptr %71, %13 : tensor<1x256x!tt.ptr, #blocked2>, tensor<1x256xi32, #blocked2> + %73 = tt.load %72, %22, %cst_10 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked2> + tt.assert %46, "index out of bounds: 0 <= tmp16 < 50257", "", "_call_with_frames_removed", 883 : tensor<16x1xi1, #blocked1> + %74 = tt.load %53, %29, %cst_14 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xf32, #blocked> + %75 = arith.addf %74, %68 : tensor<16x256xf32, #blocked> + %76 = arith.addf %75, %70 : tensor<16x256xf32, #blocked> + %77 = tt.broadcast %66 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked> + %78 = arith.subf %76, %77 : tensor<16x256xf32, #blocked> + %79 = arith.divf %67, %cst_13 : tensor<16x1xf32, #blocked> + %80 = arith.addf %79, %cst_12 : tensor<16x1xf32, #blocked> + %81 = tt.extern_elementwise %80 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32, #blocked>) -> tensor<16x1xf32, #blocked> + %82 = tt.broadcast %81 : (tensor<16x1xf32, #blocked>) -> tensor<16x256xf32, #blocked> + %83 = arith.mulf %78, %82 : tensor<16x256xf32, #blocked> + %84 = triton_gpu.convert_layout %73 : (tensor<1x256xf32, #blocked2>) -> tensor<1x256xf32, #blocked> + %85 = tt.broadcast %84 : (tensor<1x256xf32, #blocked>) -> tensor<16x256xf32, #blocked> + %86 = arith.mulf %83, %85 : tensor<16x256xf32, #blocked> + %87 = tt.splat %arg5 : (!tt.ptr) -> tensor<16x256x!tt.ptr, #blocked> + %88 = tt.addptr %87, %33 : tensor<16x256x!tt.ptr, #blocked>, tensor<16x256xi32, #blocked> + %89 = arith.truncf %86 : tensor<16x256xf32, #blocked> to tensor<16x256xbf16, #blocked> + tt.store %88, %89, %29 {cache = 1 : i32, evict = 1 : i32} : tensor<16x256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttir b/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..d5250a6fc582c4e878624873634db49573179270 --- /dev/null +++ b/.triton/dump/0db70b0f0846c3c6c38c4ccb3ef979e3/triton_.ttir @@ -0,0 +1,113 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<16x256xbf16> + %cst_0 = arith.constant dense<1.000000e+00> : tensor<1x256xf32> + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x256xf32> + %cst_2 = arith.constant 0.000000e+00 : f32 + %cst_3 = arith.constant dense<256> : tensor<16x1xi64> + %cst_4 = arith.constant dense<50257> : tensor<16x1xi64> + %cst_5 = arith.constant dense<0> : tensor<16x1xi64> + %cst_6 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32> + %cst_7 = arith.constant dense<2.560000e+02> : tensor<16x1xf32> + %cst_8 = arith.constant dense<0.000000e+00> : tensor<16x256xf32> + %cst_9 = arith.constant dense<256> : tensor<16x1xi32> + %cst_10 = arith.constant dense<256> : tensor<1x256xi32> + %cst_11 = arith.constant dense<512> : tensor<16x1xi32> + %c16_i32 = arith.constant 16 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c16_i32 : i32 + %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<16x1xi32> + %5 = arith.addi %4, %3 : tensor<16x1xi32> + %6 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32> + %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x1x!tt.ptr> + %9 = tt.addptr %8, %5 : tensor<16x1x!tt.ptr>, tensor<16x1xi32> + %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64> + %11 = arith.remsi %5, %cst_11 : tensor<16x1xi32> + %12 = arith.cmpi slt, %7, %cst_10 : tensor<1x256xi32> + %13 = arith.muli %11, %cst_9 : tensor<16x1xi32> + %14 = tt.broadcast %7 : (tensor<1x256xi32>) -> tensor<16x256xi32> + %15 = tt.broadcast %13 : (tensor<16x1xi32>) -> tensor<16x256xi32> + %16 = arith.addi %14, %15 : tensor<16x256xi32> + %17 = tt.splat %arg2 : (!tt.ptr) -> tensor<16x256x!tt.ptr> + %18 = tt.addptr %17, %16 : tensor<16x256x!tt.ptr>, tensor<16x256xi32> + %19 = tt.broadcast %12 : (tensor<1x256xi1>) -> tensor<16x256xi1> + %20 = tt.load %18, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32> + %21 = arith.muli %5, %cst_9 : tensor<16x1xi32> + %22 = tt.broadcast %21 : (tensor<16x1xi32>) -> tensor<16x256xi32> + %23 = arith.addi %14, %22 : tensor<16x256xi32> + %24 = tt.splat %arg3 : (!tt.ptr) -> tensor<16x256x!tt.ptr> + %25 = tt.addptr %24, %23 : tensor<16x256x!tt.ptr>, tensor<16x256xi32> + %26 = tt.load %25, %19, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xbf16> + %27 = arith.extf %26 : tensor<16x256xbf16> to tensor<16x256xf32> + %28 = arith.addi %10, %cst_4 : tensor<16x1xi64> + %29 = arith.cmpi slt, %10, %cst_5 : tensor<16x1xi64> + %30 = arith.select %29, %28, %10 : tensor<16x1xi1>, tensor<16x1xi64> + %31 = arith.cmpi sge, %30, %cst_5 : tensor<16x1xi64> + %32 = arith.cmpi slt, %30, %cst_4 : tensor<16x1xi64> + %33 = arith.andi %31, %32 : tensor<16x1xi1> + tt.assert %33, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<16x1xi1> + %34 = arith.muli %30, %cst_3 : tensor<16x1xi64> + %35 = tt.broadcast %34 : (tensor<16x1xi64>) -> tensor<16x256xi64> + %36 = arith.extsi %7 : tensor<1x256xi32> to tensor<1x256xi64> + %37 = tt.broadcast %36 : (tensor<1x256xi64>) -> tensor<16x256xi64> + %38 = arith.addi %37, %35 : tensor<16x256xi64> + %39 = tt.splat %arg1 : (!tt.ptr) -> tensor<16x256x!tt.ptr> + %40 = tt.addptr %39, %38 : tensor<16x256x!tt.ptr>, tensor<16x256xi64> + %41 = tt.load %40, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32> + %42 = arith.addf %41, %20 : tensor<16x256xf32> + %43 = arith.addf %42, %27 : tensor<16x256xf32> + %44 = arith.addf %43, %cst_8 : tensor<16x256xf32> + %45 = arith.subf %43, %44 : tensor<16x256xf32> + %46 = arith.mulf %43, %45 : tensor<16x256xf32> + %47 = arith.addf %46, %cst_8 : tensor<16x256xf32> + %48 = arith.select %19, %44, %cst_8 : tensor<16x256xi1>, tensor<16x256xf32> + %49 = arith.select %19, %47, %cst_8 : tensor<16x256xi1>, tensor<16x256xf32> + %50 = arith.select %12, %cst_0, %cst_1 : tensor<1x256xi1>, tensor<1x256xf32> + %51 = tt.broadcast %50 : (tensor<1x256xf32>) -> tensor<16x256xf32> + %52:3 = "tt.reduce"(%48, %49, %51) <{axis = 1 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): + %76 = arith.subf %arg11, %arg8 : f32 + %77 = arith.addf %arg10, %arg13 : f32 + %78 = arith.cmpf oeq, %77, %cst_2 : f32 + %79 = arith.divf %arg13, %77 : f32 + %80 = arith.select %78, %cst_2, %79 : f32 + %81 = arith.mulf %76, %80 : f32 + %82 = arith.addf %arg8, %81 : f32 + %83 = arith.addf %arg9, %arg12 : f32 + %84 = arith.mulf %76, %76 : f32 + %85 = arith.mulf %84, %arg10 : f32 + %86 = arith.mulf %85, %80 : f32 + %87 = arith.addf %83, %86 : f32 + tt.reduce.return %82, %87, %77 : f32, f32, f32 + }) : (tensor<16x256xf32>, tensor<16x256xf32>, tensor<16x256xf32>) -> (tensor<16xf32>, tensor<16xf32>, tensor<16xf32>) + %53 = tt.expand_dims %52#0 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32> + %54 = tt.expand_dims %52#1 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32> + %55 = tt.load %18, %19, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x256xf32> + %56 = tt.load %25, %19, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xbf16> + %57 = arith.extf %56 : tensor<16x256xbf16> to tensor<16x256xf32> + %58 = tt.splat %arg4 : (!tt.ptr) -> tensor<1x256x!tt.ptr> + %59 = tt.addptr %58, %7 : tensor<1x256x!tt.ptr>, tensor<1x256xi32> + %60 = tt.load %59, %12, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32> + tt.assert %33, "index out of bounds: 0 <= tmp16 < 50257", "", "_call_with_frames_removed", 883 : tensor<16x1xi1> + %61 = tt.load %40, %19, %cst_8 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x256xf32> + %62 = arith.addf %61, %55 : tensor<16x256xf32> + %63 = arith.addf %62, %57 : tensor<16x256xf32> + %64 = tt.broadcast %53 : (tensor<16x1xf32>) -> tensor<16x256xf32> + %65 = arith.subf %63, %64 : tensor<16x256xf32> + %66 = arith.divf %54, %cst_7 : tensor<16x1xf32> + %67 = arith.addf %66, %cst_6 : tensor<16x1xf32> + %68 = tt.extern_elementwise %67 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32>) -> tensor<16x1xf32> + %69 = tt.broadcast %68 : (tensor<16x1xf32>) -> tensor<16x256xf32> + %70 = arith.mulf %65, %69 : tensor<16x256xf32> + %71 = tt.broadcast %60 : (tensor<1x256xf32>) -> tensor<16x256xf32> + %72 = arith.mulf %70, %71 : tensor<16x256xf32> + %73 = tt.splat %arg5 : (!tt.ptr) -> tensor<16x256x!tt.ptr> + %74 = tt.addptr %73, %23 : tensor<16x256x!tt.ptr>, tensor<16x256xi32> + %75 = arith.truncf %72 : tensor<16x256xf32> to tensor<16x256xbf16> + tt.store %74, %75, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<16x256xbf16> + tt.return + } +} diff --git a/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.llir b/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..da6ce5a2908c1f7b07d13f6e312d6b02d2777059 --- /dev/null +++ b/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.llir @@ -0,0 +1,245 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %7 = and i32 %6, 31, !dbg !8 + %8 = lshr i32 %6, 5, !dbg !8 + %9 = and i32 %6, 3, !dbg !8 + %10 = and i32 %8, 3, !dbg !9 + %urem = and i32 %6, 127, !dbg !9 + %11 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10 + %12 = shl i32 %11, 2, !dbg !11 + %13 = or i32 %12, %9, !dbg !12 + %14 = icmp ult i32 %urem, 120, !dbg !13 + %15 = shl nuw nsw i32 %urem, 17, !dbg !14 + %16 = add i32 %12, %15, !dbg !15 + %17 = sext i32 %16 to i64, !dbg !16 + %18 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !16 + %19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %18, i1 %14, i32 0, i1 %14, i32 0, i1 %14, i32 0, i1 %14, i32 0, i1 %14) #3, !dbg !17 + %20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !17 + %21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !17 + %22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !17 + %23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !17 + %24 = bitcast i32 %20 to float, !dbg !17 + %25 = bitcast i32 %21 to float, !dbg !17 + %26 = bitcast i32 %22 to float, !dbg !17 + %27 = bitcast i32 %23 to float, !dbg !17 + %28 = fadd float %24, 0.000000e+00, !dbg !18 + %29 = fadd float %25, 0.000000e+00, !dbg !18 + %30 = fadd float %26, 0.000000e+00, !dbg !18 + %31 = fadd float %27, 0.000000e+00, !dbg !18 + %32 = select i1 %14, float %28, float 0.000000e+00, !dbg !19 + %33 = select i1 %14, float %29, float 0.000000e+00, !dbg !19 + %34 = select i1 %14, float %30, float 0.000000e+00, !dbg !19 + %35 = select i1 %14, float %31, float 0.000000e+00, !dbg !19 + %36 = bitcast float %32 to i32, !dbg !20 + %37 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 16, i32 31), !dbg !20 + %38 = bitcast i32 %37 to float, !dbg !20 + %39 = fadd float %32, %38, !dbg !24 + %40 = bitcast float %39 to i32, !dbg !20 + %41 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %40, i32 8, i32 31), !dbg !20 + %42 = bitcast i32 %41 to float, !dbg !20 + %43 = fadd float %39, %42, !dbg !24 + %44 = bitcast float %43 to i32, !dbg !20 + %45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 4, i32 31), !dbg !20 + %46 = bitcast i32 %45 to float, !dbg !20 + %47 = fadd float %43, %46, !dbg !24 + %48 = bitcast float %47 to i32, !dbg !20 + %49 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %48, i32 2, i32 31), !dbg !20 + %50 = bitcast i32 %49 to float, !dbg !20 + %51 = fadd float %47, %50, !dbg !24 + %52 = bitcast float %51 to i32, !dbg !20 + %53 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %52, i32 1, i32 31), !dbg !20 + %54 = bitcast i32 %53 to float, !dbg !20 + %55 = fadd float %51, %54, !dbg !24 + %56 = bitcast float %33 to i32, !dbg !20 + %57 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %56, i32 16, i32 31), !dbg !20 + %58 = bitcast i32 %57 to float, !dbg !20 + %59 = fadd float %33, %58, !dbg !24 + %60 = bitcast float %59 to i32, !dbg !20 + %61 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %60, i32 8, i32 31), !dbg !20 + %62 = bitcast i32 %61 to float, !dbg !20 + %63 = fadd float %59, %62, !dbg !24 + %64 = bitcast float %63 to i32, !dbg !20 + %65 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %64, i32 4, i32 31), !dbg !20 + %66 = bitcast i32 %65 to float, !dbg !20 + %67 = fadd float %63, %66, !dbg !24 + %68 = bitcast float %67 to i32, !dbg !20 + %69 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %68, i32 2, i32 31), !dbg !20 + %70 = bitcast i32 %69 to float, !dbg !20 + %71 = fadd float %67, %70, !dbg !24 + %72 = bitcast float %71 to i32, !dbg !20 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 1, i32 31), !dbg !20 + %74 = bitcast i32 %73 to float, !dbg !20 + %75 = fadd float %71, %74, !dbg !24 + %76 = bitcast float %34 to i32, !dbg !20 + %77 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %76, i32 16, i32 31), !dbg !20 + %78 = bitcast i32 %77 to float, !dbg !20 + %79 = fadd float %34, %78, !dbg !24 + %80 = bitcast float %79 to i32, !dbg !20 + %81 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %80, i32 8, i32 31), !dbg !20 + %82 = bitcast i32 %81 to float, !dbg !20 + %83 = fadd float %79, %82, !dbg !24 + %84 = bitcast float %83 to i32, !dbg !20 + %85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 4, i32 31), !dbg !20 + %86 = bitcast i32 %85 to float, !dbg !20 + %87 = fadd float %83, %86, !dbg !24 + %88 = bitcast float %87 to i32, !dbg !20 + %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 2, i32 31), !dbg !20 + %90 = bitcast i32 %89 to float, !dbg !20 + %91 = fadd float %87, %90, !dbg !24 + %92 = bitcast float %91 to i32, !dbg !20 + %93 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %92, i32 1, i32 31), !dbg !20 + %94 = bitcast i32 %93 to float, !dbg !20 + %95 = fadd float %91, %94, !dbg !24 + %96 = bitcast float %35 to i32, !dbg !20 + %97 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %96, i32 16, i32 31), !dbg !20 + %98 = bitcast i32 %97 to float, !dbg !20 + %99 = fadd float %35, %98, !dbg !24 + %100 = bitcast float %99 to i32, !dbg !20 + %101 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %100, i32 8, i32 31), !dbg !20 + %102 = bitcast i32 %101 to float, !dbg !20 + %103 = fadd float %99, %102, !dbg !24 + %104 = bitcast float %103 to i32, !dbg !20 + %105 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %104, i32 4, i32 31), !dbg !20 + %106 = bitcast i32 %105 to float, !dbg !20 + %107 = fadd float %103, %106, !dbg !24 + %108 = bitcast float %107 to i32, !dbg !20 + %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 2, i32 31), !dbg !20 + %110 = bitcast i32 %109 to float, !dbg !20 + %111 = fadd float %107, %110, !dbg !24 + %112 = bitcast float %111 to i32, !dbg !20 + %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 1, i32 31), !dbg !20 + %114 = bitcast i32 %113 to float, !dbg !20 + %115 = fadd float %111, %114, !dbg !24 + %116 = icmp eq i32 %7, 0, !dbg !20 + %117 = zext nneg i32 %10 to i64, !dbg !20 + %118 = getelementptr float, ptr addrspace(3) @global_smem, i64 %117, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %118, float %55, i1 %116) #3, !dbg !20 + %119 = or i32 %10, 4, !dbg !20 + %120 = zext nneg i32 %119 to i64, !dbg !20 + %121 = getelementptr float, ptr addrspace(3) @global_smem, i64 %120, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %121, float %75, i1 %116) #3, !dbg !20 + %122 = or i32 %10, 8, !dbg !20 + %123 = zext nneg i32 %122 to i64, !dbg !20 + %124 = getelementptr float, ptr addrspace(3) @global_smem, i64 %123, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %124, float %95, i1 %116) #3, !dbg !20 + %125 = or i32 %10, 12, !dbg !20 + %126 = zext nneg i32 %125 to i64, !dbg !20 + %127 = getelementptr float, ptr addrspace(3) @global_smem, i64 %126, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %127, float %115, i1 %116) #3, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !20 + %128 = icmp slt i32 %6, 16, !dbg !20 + %129 = sext i32 %6 to i64, !dbg !20 + %130 = getelementptr float, ptr addrspace(3) @global_smem, i64 %129, !dbg !20 + %131 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %130, i1 %128) #3, !dbg !20 + %132 = bitcast float %131 to i32, !dbg !20 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 2, i32 31), !dbg !20 + %134 = bitcast i32 %133 to float, !dbg !20 + %135 = fadd float %131, %134, !dbg !24 + %136 = bitcast float %135 to i32, !dbg !20 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 1, i32 31), !dbg !20 + %138 = bitcast i32 %137 to float, !dbg !20 + %139 = fadd float %135, %138, !dbg !24 + %140 = icmp eq i32 %9, 0, !dbg !20 + %141 = and i1 %128, %140, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %130, float %139, i1 %141) #3, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !20 + %142 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !20 + %143 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), align 4, !dbg !20 + %144 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), align 4, !dbg !20 + %145 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 48), align 4, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %146 = insertelement <1 x float> undef, float %142, i64 0, !dbg !28 + store <1 x float> %146, ptr addrspace(3) @global_smem, align 4, !dbg !28 + %147 = insertelement <1 x float> undef, float %143, i64 0, !dbg !28 + store <1 x float> %147, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 4), align 4, !dbg !28 + %148 = insertelement <1 x float> undef, float %144, i64 0, !dbg !28 + store <1 x float> %148, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), align 4, !dbg !28 + %149 = insertelement <1 x float> undef, float %145, i64 0, !dbg !28 + store <1 x float> %149, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 12), align 4, !dbg !28 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %150 = zext nneg i32 %9 to i64, !dbg !28 + %151 = getelementptr float, ptr addrspace(3) @global_smem, i64 %150, !dbg !28 + %152 = load <1 x float>, ptr addrspace(3) %151, align 4, !dbg !28 + %.frozen = freeze i32 %13 + %153 = sdiv i32 %.frozen, 256, !dbg !29 + %154 = mul i32 %153, 256 + %.decomposed = sub i32 %.frozen, %154 + %155 = sext i32 %153 to i64, !dbg !30 + %156 = getelementptr i64, ptr addrspace(1) %1, i64 %155, !dbg !30 + %157 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %156, i1 true) #3, !dbg !31 + %158 = lshr i64 %157, 54, !dbg !32 + %159 = and i64 %158, 512, !dbg !32 + %160 = add i64 %159, %157, !dbg !32 + %161 = shl i64 %160, 8, !dbg !33 + %162 = sext i32 %.decomposed to i64, !dbg !34 + %163 = getelementptr float, ptr addrspace(1) %2, i64 %161, !dbg !35 + %164 = getelementptr float, ptr addrspace(1) %163, i64 %162, !dbg !35 + %165 = lshr i32 %7, 2, !dbg !36 + %166 = shl nuw nsw i32 %10, 3, !dbg !36 + %167 = or i32 %166, %165, !dbg !36 + %168 = icmp eq i32 %167, 0, !dbg !36 + %169 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %164, <1 x float> %152, i1 %168) #3, !dbg !36 + ret void, !dbg !37 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i") +!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 128} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 22, column: 44, scope: !5) +!9 = !DILocation(line: 24, column: 33, scope: !5) +!10 = !DILocation(line: 21, column: 28, scope: !5) +!11 = !DILocation(line: 21, column: 33, scope: !5) +!12 = !DILocation(line: 22, column: 23, scope: !5) +!13 = !DILocation(line: 29, column: 25, scope: !5) +!14 = !DILocation(line: 31, column: 47, scope: !5) +!15 = !DILocation(line: 31, column: 40, scope: !5) +!16 = !DILocation(line: 31, column: 34, scope: !5) +!17 = !DILocation(line: 31, column: 53, scope: !5) +!18 = !DILocation(line: 33, column: 23, scope: !5) +!19 = !DILocation(line: 34, column: 38, scope: !5) +!20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23) +!21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0) +!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!23 = !DILocation(line: 35, column: 25, scope: !21) +!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26) +!25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0) +!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27) +!27 = !DILocation(line: 35, column: 25, scope: !25) +!28 = !DILocation(line: 35, column: 28, scope: !5) +!29 = !DILocation(line: 36, column: 20, scope: !5) +!30 = !DILocation(line: 38, column: 30, scope: !5) +!31 = !DILocation(line: 38, column: 35, scope: !5) +!32 = !DILocation(line: 41, column: 32, scope: !5) +!33 = !DILocation(line: 45, column: 40, scope: !5) +!34 = !DILocation(line: 45, column: 36, scope: !5) +!35 = !DILocation(line: 45, column: 30, scope: !5) +!36 = !DILocation(line: 45, column: 55, scope: !5) +!37 = !DILocation(line: 45, column: 4, scope: !5) diff --git a/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ptx b/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..360da4beec5a89422aef0534b8c4f1d2d1773c7f --- /dev/null +++ b/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ptx @@ -0,0 +1,651 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3de4e +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3de4e( + .param .u64 triton__0d1d2d3de4e_param_0, + .param .u64 triton__0d1d2d3de4e_param_1, + .param .u64 triton__0d1d2d3de4e_param_2, + .param .u32 triton__0d1d2d3de4e_param_3, + .param .u32 triton__0d1d2d3de4e_param_4 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<15>; + .reg .b32 %r<91>; + .reg .f32 %f<62>; + .reg .b64 %rd<16>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd5, [triton__0d1d2d3de4e_param_0]; + ld.param.u64 %rd6, [triton__0d1d2d3de4e_param_1]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r24, %tid.x; + and.b32 %r25, %r24, 31; + ld.param.u64 %rd7, [triton__0d1d2d3de4e_param_2]; + and.b32 %r26, %r24, 3; + .loc 1 24 33 + bfe.u32 %r27, %r24, 5, 2; + and.b32 %r28, %r24, 127; + .loc 1 21 28 + mov.u32 %r1, %ctaid.x; + .loc 1 21 33 + shl.b32 %r29, %r1, 2; + .loc 1 22 23 + or.b32 %r30, %r29, %r26; + .loc 1 29 25 + setp.lt.u32 %p1, %r28, 120; + .loc 1 31 47 + shl.b32 %r31, %r28, 17; + .loc 1 31 40 + add.s32 %r32, %r29, %r31; + .loc 1 31 34 + mul.wide.s32 %rd8, %r32, 4; + add.s64 %rd1, %rd5, %rd8; + mov.b32 %r6, 0; + .loc 1 31 53 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + mov.b32 %f1, %r2; + mov.b32 %f2, %r3; + mov.b32 %f3, %r4; + mov.b32 %f4, %r5; + .loc 1 33 23 + add.f32 %f5, %f1, 0f00000000; + add.f32 %f6, %f2, 0f00000000; + add.f32 %f7, %f3, 0f00000000; + add.f32 %f8, %f4, 0f00000000; + .loc 1 34 38 + selp.f32 %f9, %f5, 0f00000000, %p1; + selp.f32 %f10, %f6, 0f00000000, %p1; + selp.f32 %f11, %f7, 0f00000000, %p1; + selp.f32 %f12, %f8, 0f00000000, %p1; +$L__tmp1: + .loc 2 243 36 + mov.b32 %r33, %f9; + shfl.sync.bfly.b32 %r34, %r33, 16, 31, -1; + mov.b32 %f13, %r34; +$L__tmp2: + .loc 2 233 15 + add.f32 %f14, %f9, %f13; +$L__tmp3: + .loc 2 243 36 + mov.b32 %r35, %f14; + shfl.sync.bfly.b32 %r36, %r35, 8, 31, -1; + mov.b32 %f15, %r36; +$L__tmp4: + .loc 2 233 15 + add.f32 %f16, %f14, %f15; +$L__tmp5: + .loc 2 243 36 + mov.b32 %r37, %f16; + shfl.sync.bfly.b32 %r38, %r37, 4, 31, -1; + mov.b32 %f17, %r38; +$L__tmp6: + .loc 2 233 15 + add.f32 %f18, %f16, %f17; +$L__tmp7: + .loc 2 243 36 + mov.b32 %r39, %f18; + shfl.sync.bfly.b32 %r40, %r39, 2, 31, -1; + mov.b32 %f19, %r40; +$L__tmp8: + .loc 2 233 15 + add.f32 %f20, %f18, %f19; +$L__tmp9: + .loc 2 243 36 + mov.b32 %r41, %f20; + shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1; + mov.b32 %f21, %r42; +$L__tmp10: + .loc 2 233 15 + add.f32 %f22, %f20, %f21; +$L__tmp11: + .loc 2 243 36 + mov.b32 %r43, %f10; + shfl.sync.bfly.b32 %r44, %r43, 16, 31, -1; + mov.b32 %f23, %r44; +$L__tmp12: + .loc 2 233 15 + add.f32 %f24, %f10, %f23; +$L__tmp13: + .loc 2 243 36 + mov.b32 %r45, %f24; + shfl.sync.bfly.b32 %r46, %r45, 8, 31, -1; + mov.b32 %f25, %r46; +$L__tmp14: + .loc 2 233 15 + add.f32 %f26, %f24, %f25; +$L__tmp15: + .loc 2 243 36 + mov.b32 %r47, %f26; + shfl.sync.bfly.b32 %r48, %r47, 4, 31, -1; + mov.b32 %f27, %r48; +$L__tmp16: + .loc 2 233 15 + add.f32 %f28, %f26, %f27; +$L__tmp17: + .loc 2 243 36 + mov.b32 %r49, %f28; + shfl.sync.bfly.b32 %r50, %r49, 2, 31, -1; + mov.b32 %f29, %r50; +$L__tmp18: + .loc 2 233 15 + add.f32 %f30, %f28, %f29; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r51, %f30; + shfl.sync.bfly.b32 %r52, %r51, 1, 31, -1; + mov.b32 %f31, %r52; +$L__tmp20: + .loc 2 233 15 + add.f32 %f32, %f30, %f31; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r53, %f11; + shfl.sync.bfly.b32 %r54, %r53, 16, 31, -1; + mov.b32 %f33, %r54; +$L__tmp22: + .loc 2 233 15 + add.f32 %f34, %f11, %f33; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r55, %f34; + shfl.sync.bfly.b32 %r56, %r55, 8, 31, -1; + mov.b32 %f35, %r56; +$L__tmp24: + .loc 2 233 15 + add.f32 %f36, %f34, %f35; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r57, %f36; + shfl.sync.bfly.b32 %r58, %r57, 4, 31, -1; + mov.b32 %f37, %r58; +$L__tmp26: + .loc 2 233 15 + add.f32 %f38, %f36, %f37; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r59, %f38; + shfl.sync.bfly.b32 %r60, %r59, 2, 31, -1; + mov.b32 %f39, %r60; +$L__tmp28: + .loc 2 233 15 + add.f32 %f40, %f38, %f39; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r61, %f40; + shfl.sync.bfly.b32 %r62, %r61, 1, 31, -1; + mov.b32 %f41, %r62; +$L__tmp30: + .loc 2 233 15 + add.f32 %f42, %f40, %f41; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r63, %f12; + shfl.sync.bfly.b32 %r64, %r63, 16, 31, -1; + mov.b32 %f43, %r64; +$L__tmp32: + .loc 2 233 15 + add.f32 %f44, %f12, %f43; +$L__tmp33: + .loc 2 243 36 + mov.b32 %r65, %f44; + shfl.sync.bfly.b32 %r66, %r65, 8, 31, -1; + mov.b32 %f45, %r66; +$L__tmp34: + .loc 2 233 15 + add.f32 %f46, %f44, %f45; +$L__tmp35: + .loc 2 243 36 + mov.b32 %r67, %f46; + shfl.sync.bfly.b32 %r68, %r67, 4, 31, -1; + mov.b32 %f47, %r68; +$L__tmp36: + .loc 2 233 15 + add.f32 %f48, %f46, %f47; +$L__tmp37: + .loc 2 243 36 + mov.b32 %r69, %f48; + shfl.sync.bfly.b32 %r70, %r69, 2, 31, -1; + mov.b32 %f49, %r70; +$L__tmp38: + .loc 2 233 15 + add.f32 %f50, %f48, %f49; +$L__tmp39: + .loc 2 243 36 + mov.b32 %r71, %f50; + shfl.sync.bfly.b32 %r72, %r71, 1, 31, -1; + mov.b32 %f51, %r72; +$L__tmp40: + .loc 2 233 15 + add.f32 %f52, %f50, %f51; +$L__tmp41: + .loc 2 243 36 + setp.eq.s32 %p6, %r25, 0; + shl.b32 %r73, %r27, 2; + mov.u32 %r74, global_smem; + add.s32 %r10, %r74, %r73; + mov.b32 %r11, %f22; + @%p6 st.shared.b32 [ %r10 + 0 ], %r11; + add.s32 %r12, %r10, 16; + mov.b32 %r13, %f32; + @%p6 st.shared.b32 [ %r12 + 0 ], %r13; + add.s32 %r14, %r10, 32; + mov.b32 %r15, %f42; + @%p6 st.shared.b32 [ %r14 + 0 ], %r15; + add.s32 %r16, %r10, 48; + mov.b32 %r17, %f52; + @%p6 st.shared.b32 [ %r16 + 0 ], %r17; + bar.sync 0; + setp.lt.s32 %p10, %r24, 16; + shl.b32 %r75, %r24, 2; + add.s32 %r19, %r74, %r75; + @%p10 ld.shared.b32 %r18, [ %r19 + 0 ]; + mov.b32 %f53, %r18; + shfl.sync.bfly.b32 %r76, %r18, 2, 31, -1; + mov.b32 %f54, %r76; +$L__tmp42: + .loc 2 233 15 + add.f32 %f55, %f53, %f54; +$L__tmp43: + .loc 2 243 36 + mov.b32 %r77, %f55; + shfl.sync.bfly.b32 %r78, %r77, 1, 31, -1; + mov.b32 %f56, %r78; +$L__tmp44: + .loc 2 233 15 + add.f32 %f57, %f55, %f56; +$L__tmp45: + .loc 2 243 36 + setp.eq.s32 %p14, %r26, 0; + and.pred %p11, %p10, %p14; + mov.b32 %r21, %f57; + @%p11 st.shared.b32 [ %r19 + 0 ], %r21; + bar.sync 0; + ld.shared.f32 %f58, [global_smem]; + ld.shared.f32 %f59, [global_smem+16]; + ld.shared.f32 %f60, [global_smem+32]; + ld.shared.f32 %f61, [global_smem+48]; +$L__tmp46: + .loc 1 35 28 + bar.sync 0; + st.shared.f32 [global_smem], %f58; + st.shared.f32 [global_smem+4], %f59; + st.shared.f32 [global_smem+8], %f60; + st.shared.f32 [global_smem+12], %f61; + bar.sync 0; + shl.b32 %r79, %r26, 2; + add.s32 %r80, %r74, %r79; + .loc 1 36 20 + shr.s32 %r82, %r30, 31; + shr.u32 %r83, %r82, 24; + add.s32 %r84, %r30, %r83; + shr.s32 %r85, %r84, 8; + and.b32 %r86, %r84, -256; + sub.s32 %r87, %r30, %r86; + .loc 1 38 30 + mul.wide.s32 %rd9, %r85, 8; + add.s64 %rd3, %rd6, %rd9; + .loc 1 45 55 + ld.shared.u32 %r23, [%r80]; + mov.pred %p12, -1; + .loc 1 38 35 + mov.u64 %rd2, 0x0; + @%p12 ld.global.L1::evict_last.b64 { %rd2 }, [ %rd3 + 0 ]; + .loc 1 41 32 + shr.u64 %rd10, %rd2, 54; + and.b64 %rd11, %rd10, 512; + add.s64 %rd12, %rd11, %rd2; + .loc 1 45 30 + shl.b64 %rd13, %rd12, 10; + add.s64 %rd14, %rd7, %rd13; + mul.wide.s32 %rd15, %r87, 4; + add.s64 %rd4, %rd14, %rd15; + .loc 1 45 55 + bfe.u32 %r88, %r24, 2, 3; + shl.b32 %r89, %r27, 3; + or.b32 %r90, %r89, %r88; + setp.eq.s32 %p13, %r90, 0; + mov.u32 %r22, 0x0; + @%p13 atom.global.gpu.acq_rel.add.f32 %r22, [ %rd4 + 0 ], %r23; + .loc 1 45 4 + ret; +$L__tmp47: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 264 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 54 +.b8 105 +.b8 107 +.b8 53 +.b8 118 +.b8 120 +.b8 55 +.b8 112 +.b8 50 +.b8 50 +.b8 102 +.b8 112 +.b8 107 +.b8 52 +.b8 100 +.b8 99 +.b8 118 +.b8 104 +.b8 53 +.b8 53 +.b8 122 +.b8 105 +.b8 109 +.b8 119 +.b8 52 +.b8 116 +.b8 53 +.b8 110 +.b8 114 +.b8 53 +.b8 122 +.b8 110 +.b8 50 +.b8 98 +.b8 55 +.b8 105 +.b8 110 +.b8 117 +.b8 106 +.b8 120 +.b8 106 +.b8 97 +.b8 117 +.b8 120 +.b8 115 +.b8 104 +.b8 108 +.b8 106 +.b8 117 +.b8 109 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 54 +.b8 105 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp46 +.b8 2 +.b8 35 +.b8 25 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp45 +.b8 2 +.b8 35 +.b8 25 +.b8 4 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp45 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttir b/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..fa45d8a9764aa0d88c392f939d5f613c3b46a9be --- /dev/null +++ b/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttir @@ -0,0 +1,53 @@ +module { + tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<4x1xi64> + %cst_0 = arith.constant dense<0> : tensor<4x1xi64> + %cst_1 = arith.constant dense<512> : tensor<4x1xi64> + %cst_2 = arith.constant dense : tensor<4x1xi1> + %cst_3 = arith.constant dense<256> : tensor<4x1xi32> + %cst_4 = arith.constant dense<131072> : tensor<1x128xi32> + %cst_5 = arith.constant dense<120> : tensor<1x128xi32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<4x128xf32> + %c4_i32 = arith.constant 4 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c4_i32 : i32 + %2 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<4xi32>) -> tensor<4x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<4x1xi32> + %5 = arith.addi %4, %3 : tensor<4x1xi32> + %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32> + %8 = arith.cmpi slt, %7, %cst_5 : tensor<1x128xi32> + %9 = arith.muli %7, %cst_4 : tensor<1x128xi32> + %10 = tt.broadcast %5 : (tensor<4x1xi32>) -> tensor<4x128xi32> + %11 = tt.broadcast %9 : (tensor<1x128xi32>) -> tensor<4x128xi32> + %12 = arith.addi %10, %11 : tensor<4x128xi32> + %13 = tt.splat %arg0 : (!tt.ptr) -> tensor<4x128x!tt.ptr> + %14 = tt.addptr %13, %12 : tensor<4x128x!tt.ptr>, tensor<4x128xi32> + %15 = tt.broadcast %8 : (tensor<1x128xi1>) -> tensor<4x128xi1> + %16 = tt.load %14, %15, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<4x128xf32> + %17 = arith.addf %16, %cst_6 : tensor<4x128xf32> + %18 = arith.select %15, %17, %cst_6 : tensor<4x128xi1>, tensor<4x128xf32> + %19 = "tt.reduce"(%18) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %35 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %35 : f32 + }) : (tensor<4x128xf32>) -> tensor<4xf32> + %20 = tt.expand_dims %19 {axis = 1 : i32} : (tensor<4xf32>) -> tensor<4x1xf32> + %21 = arith.divsi %5, %cst_3 : tensor<4x1xi32> + %22 = arith.remsi %5, %cst_3 : tensor<4x1xi32> + %23 = tt.splat %arg1 : (!tt.ptr) -> tensor<4x1x!tt.ptr> + %24 = tt.addptr %23, %21 : tensor<4x1x!tt.ptr>, tensor<4x1xi32> + %25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<4x1xi64> + %26 = arith.addi %25, %cst_1 : tensor<4x1xi64> + %27 = arith.cmpi slt, %25, %cst_0 : tensor<4x1xi64> + %28 = arith.select %27, %26, %25 : tensor<4x1xi1>, tensor<4x1xi64> + %29 = arith.muli %28, %cst : tensor<4x1xi64> + %30 = arith.extsi %22 : tensor<4x1xi32> to tensor<4x1xi64> + %31 = arith.addi %30, %29 : tensor<4x1xi64> + %32 = tt.splat %arg2 : (!tt.ptr) -> tensor<4x1x!tt.ptr> + %33 = tt.addptr %32, %31 : tensor<4x1x!tt.ptr>, tensor<4x1xi64> + %34 = "tt.atomic_rmw"(%33, %20, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<4x1x!tt.ptr>, tensor<4x1xf32>, tensor<4x1xi1>) -> tensor<4x1xf32> + tt.return + } +} diff --git a/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.cubin b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..62df3ec662d11705beb899d39cea9640ae4ff22c Binary files /dev/null and b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.cubin differ diff --git a/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.llir b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..04ba449ff9d9cce468828882c57fc782ee4fb120 --- /dev/null +++ b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.llir @@ -0,0 +1,858 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !7 { + %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %4 = shl i32 %3, 3, !dbg !10 + %5 = and i32 %4, 1016, !dbg !10 + %6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11 + %7 = shl i32 %6, 10, !dbg !12 + %8 = or i32 %7, %5, !dbg !13 + %9 = sext i32 %8 to i64, !dbg !14 + %10 = getelementptr i16, ptr addrspace(1) %0, i64 %9, !dbg !14 + %11 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %10, i1 true) #4, !dbg !15 + %12 = extractvalue { i32, i32, i32, i32 } %11, 0, !dbg !15 + %13 = extractvalue { i32, i32, i32, i32 } %11, 1, !dbg !15 + %14 = extractvalue { i32, i32, i32, i32 } %11, 2, !dbg !15 + %15 = extractvalue { i32, i32, i32, i32 } %11, 3, !dbg !15 + %16 = trunc i32 %12 to i16, !dbg !15 + %extelt.offset = lshr i32 %12, 16, !dbg !15 + %17 = trunc i32 %extelt.offset to i16, !dbg !15 + %18 = trunc i32 %13 to i16, !dbg !15 + %extelt.offset1 = lshr i32 %13, 16, !dbg !15 + %19 = trunc i32 %extelt.offset1 to i16, !dbg !15 + %20 = trunc i32 %14 to i16, !dbg !15 + %extelt.offset2 = lshr i32 %14, 16, !dbg !15 + %21 = trunc i32 %extelt.offset2 to i16, !dbg !15 + %22 = trunc i32 %15 to i16, !dbg !15 + %extelt.offset3 = lshr i32 %15, 16, !dbg !15 + %23 = trunc i32 %extelt.offset3 to i16, !dbg !15 + %24 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %16) #4, !dbg !16 + %25 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %17) #4, !dbg !16 + %26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %18) #4, !dbg !16 + %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !16 + %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !16 + %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %21) #4, !dbg !16 + %30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #4, !dbg !16 + %31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #4, !dbg !16 + %32 = fmul float %24, 0x3FE6A09E60000000, !dbg !17 + %33 = fmul float %25, 0x3FE6A09E60000000, !dbg !17 + %34 = fmul float %26, 0x3FE6A09E60000000, !dbg !17 + %35 = fmul float %27, 0x3FE6A09E60000000, !dbg !17 + %36 = fmul float %28, 0x3FE6A09E60000000, !dbg !17 + %37 = fmul float %29, 0x3FE6A09E60000000, !dbg !17 + %38 = fmul float %30, 0x3FE6A09E60000000, !dbg !17 + %39 = fmul float %31, 0x3FE6A09E60000000, !dbg !17 + %40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not.i = icmp eq i32 %40, 0, !dbg !18 + %41 = tail call float @llvm.nvvm.fabs.ftz.f(float %32) #4, !dbg !18 + %42 = tail call float @llvm.nvvm.fabs.f(float %32) #4, !dbg !18 + %.0.i = select i1 %.not.i, float %42, float %41, !dbg !18 + %43 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !18 + br i1 %43, label %__nv_fabsf.exit1.i, label %45, !dbg !18 + +__nv_fabsf.exit1.i: ; preds = %2 + %44 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not1.i = icmp eq i32 %44, 0, !dbg !18 + %.01.i = select i1 %.not1.i, float %42, float %41, !dbg !18 + br label %__internal_fmad.exit.i, !dbg !18 + +45: ; preds = %2 + %46 = fmul float %32, %32, !dbg !18 + br label %__internal_fmad.exit.i, !dbg !18 + +__internal_fmad.exit.i: ; preds = %45, %__nv_fabsf.exit1.i + %47 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %45 ], !dbg !18 + %48 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %45 ], !dbg !18 + %49 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %45 ], !dbg !18 + %50 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %45 ], !dbg !18 + %51 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %45 ], !dbg !18 + %52 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %45 ], !dbg !18 + %53 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %45 ], !dbg !18 + %54 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %46, %45 ], !dbg !18 + %55 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not2.i = icmp eq i32 %55, 0, !dbg !18 + %56 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %53, float %54, float %52) #4, !dbg !18 + %57 = tail call float @llvm.nvvm.fma.rn.f(float %53, float %54, float %52) #4, !dbg !18 + %.02.i = select i1 %.not2.i, float %57, float %56, !dbg !18 + %58 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not3.i = icmp eq i32 %58, 0, !dbg !18 + %59 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %54, float %51) #4, !dbg !18 + %60 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %54, float %51) #4, !dbg !18 + %.03.i = select i1 %.not3.i, float %60, float %59, !dbg !18 + %61 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not4.i = icmp eq i32 %61, 0, !dbg !18 + %62 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %54, float %50) #4, !dbg !18 + %63 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %54, float %50) #4, !dbg !18 + %.04.i = select i1 %.not4.i, float %63, float %62, !dbg !18 + %64 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not5.i = icmp eq i32 %64, 0, !dbg !18 + %65 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %54, float %49) #4, !dbg !18 + %66 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %54, float %49) #4, !dbg !18 + %.05.i = select i1 %.not5.i, float %66, float %65, !dbg !18 + %67 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not6.i = icmp eq i32 %67, 0, !dbg !18 + %68 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %54, float %48) #4, !dbg !18 + %69 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %54, float %48) #4, !dbg !18 + %.06.i = select i1 %.not6.i, float %69, float %68, !dbg !18 + %70 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not7.i = icmp eq i32 %70, 0, !dbg !18 + %71 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %54, float %47) #4, !dbg !18 + %72 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %54, float %47) #4, !dbg !18 + %.07.i = select i1 %.not7.i, float %72, float %71, !dbg !18 + %73 = fneg float %54, !dbg !18 + %74 = select i1 %43, float %73, float %32, !dbg !18 + %75 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not8.i = icmp eq i32 %75, 0, !dbg !18 + %76 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %74, float %74) #4, !dbg !18 + %77 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %74, float %74) #4, !dbg !18 + %.08.i = select i1 %.not8.i, float %77, float %76, !dbg !18 + br i1 %43, label %78, label %__nv_erff.exit, !dbg !18 + +78: ; preds = %__internal_fmad.exit.i + %79 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !18 + %80 = fsub float 1.000000e+00, %79, !dbg !18 + %81 = bitcast float %80 to i32, !dbg !18 + %82 = bitcast float %32 to i32, !dbg !18 + %83 = and i32 %82, -2147483648, !dbg !18 + %84 = or i32 %83, %81, !dbg !18 + %85 = bitcast i32 %84 to float, !dbg !18 + br label %__nv_erff.exit, !dbg !18 + +__nv_erff.exit: ; preds = %__internal_fmad.exit.i, %78 + %r.0.i = phi float [ %85, %78 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !18 + %86 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not.i4 = icmp eq i32 %86, 0, !dbg !18 + %87 = tail call float @llvm.nvvm.fabs.ftz.f(float %33) #4, !dbg !18 + %88 = tail call float @llvm.nvvm.fabs.f(float %33) #4, !dbg !18 + %.0.i5 = select i1 %.not.i4, float %88, float %87, !dbg !18 + %89 = fcmp oge float %.0.i5, 0x3FF00C1FC0000000, !dbg !18 + br i1 %89, label %__nv_fabsf.exit1.i22, label %91, !dbg !18 + +__nv_fabsf.exit1.i22: ; preds = %__nv_erff.exit + %90 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not1.i23 = icmp eq i32 %90, 0, !dbg !18 + %.01.i24 = select i1 %.not1.i23, float %88, float %87, !dbg !18 + br label %__internal_fmad.exit.i6, !dbg !18 + +91: ; preds = %__nv_erff.exit + %92 = fmul float %33, %33, !dbg !18 + br label %__internal_fmad.exit.i6, !dbg !18 + +__internal_fmad.exit.i6: ; preds = %91, %__nv_fabsf.exit1.i22 + %93 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i22 ], [ 0x3FC06EBA60000000, %91 ], !dbg !18 + %94 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i22 ], [ 0xBFD8127580000000, %91 ], !dbg !18 + %95 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i22 ], [ 0x3FBCE315E0000000, %91 ], !dbg !18 + %96 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i22 ], [ 0xBF9B837CE0000000, %91 ], !dbg !18 + %97 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i22 ], [ 0x3F755ABD40000000, %91 ], !dbg !18 + %98 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i22 ], [ 0xBF4AE9A400000000, %91 ], !dbg !18 + %99 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i22 ], [ 0x3F163D2D40000000, %91 ], !dbg !18 + %100 = phi float [ %.01.i24, %__nv_fabsf.exit1.i22 ], [ %92, %91 ], !dbg !18 + %101 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not2.i7 = icmp eq i32 %101, 0, !dbg !18 + %102 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %99, float %100, float %98) #4, !dbg !18 + %103 = tail call float @llvm.nvvm.fma.rn.f(float %99, float %100, float %98) #4, !dbg !18 + %.02.i8 = select i1 %.not2.i7, float %103, float %102, !dbg !18 + %104 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not3.i9 = icmp eq i32 %104, 0, !dbg !18 + %105 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i8, float %100, float %97) #4, !dbg !18 + %106 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i8, float %100, float %97) #4, !dbg !18 + %.03.i10 = select i1 %.not3.i9, float %106, float %105, !dbg !18 + %107 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not4.i11 = icmp eq i32 %107, 0, !dbg !18 + %108 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i10, float %100, float %96) #4, !dbg !18 + %109 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i10, float %100, float %96) #4, !dbg !18 + %.04.i12 = select i1 %.not4.i11, float %109, float %108, !dbg !18 + %110 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not5.i13 = icmp eq i32 %110, 0, !dbg !18 + %111 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i12, float %100, float %95) #4, !dbg !18 + %112 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i12, float %100, float %95) #4, !dbg !18 + %.05.i14 = select i1 %.not5.i13, float %112, float %111, !dbg !18 + %113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not6.i15 = icmp eq i32 %113, 0, !dbg !18 + %114 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i14, float %100, float %94) #4, !dbg !18 + %115 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i14, float %100, float %94) #4, !dbg !18 + %.06.i16 = select i1 %.not6.i15, float %115, float %114, !dbg !18 + %116 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not7.i17 = icmp eq i32 %116, 0, !dbg !18 + %117 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i16, float %100, float %93) #4, !dbg !18 + %118 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i16, float %100, float %93) #4, !dbg !18 + %.07.i18 = select i1 %.not7.i17, float %118, float %117, !dbg !18 + %119 = fneg float %100, !dbg !18 + %120 = select i1 %89, float %119, float %33, !dbg !18 + %121 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not8.i19 = icmp eq i32 %121, 0, !dbg !18 + %122 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i18, float %120, float %120) #4, !dbg !18 + %123 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i18, float %120, float %120) #4, !dbg !18 + %.08.i20 = select i1 %.not8.i19, float %123, float %122, !dbg !18 + br i1 %89, label %124, label %__nv_erff.exit25, !dbg !18 + +124: ; preds = %__internal_fmad.exit.i6 + %125 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i20) #4, !dbg !18 + %126 = fsub float 1.000000e+00, %125, !dbg !18 + %127 = bitcast float %126 to i32, !dbg !18 + %128 = bitcast float %33 to i32, !dbg !18 + %129 = and i32 %128, -2147483648, !dbg !18 + %130 = or i32 %129, %127, !dbg !18 + %131 = bitcast i32 %130 to float, !dbg !18 + br label %__nv_erff.exit25, !dbg !18 + +__nv_erff.exit25: ; preds = %__internal_fmad.exit.i6, %124 + %r.0.i21 = phi float [ %131, %124 ], [ %.08.i20, %__internal_fmad.exit.i6 ], !dbg !18 + %132 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not.i26 = icmp eq i32 %132, 0, !dbg !18 + %133 = tail call float @llvm.nvvm.fabs.ftz.f(float %34) #4, !dbg !18 + %134 = tail call float @llvm.nvvm.fabs.f(float %34) #4, !dbg !18 + %.0.i27 = select i1 %.not.i26, float %134, float %133, !dbg !18 + %135 = fcmp oge float %.0.i27, 0x3FF00C1FC0000000, !dbg !18 + br i1 %135, label %__nv_fabsf.exit1.i44, label %137, !dbg !18 + +__nv_fabsf.exit1.i44: ; preds = %__nv_erff.exit25 + %136 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not1.i45 = icmp eq i32 %136, 0, !dbg !18 + %.01.i46 = select i1 %.not1.i45, float %134, float %133, !dbg !18 + br label %__internal_fmad.exit.i28, !dbg !18 + +137: ; preds = %__nv_erff.exit25 + %138 = fmul float %34, %34, !dbg !18 + br label %__internal_fmad.exit.i28, !dbg !18 + +__internal_fmad.exit.i28: ; preds = %137, %__nv_fabsf.exit1.i44 + %139 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i44 ], [ 0x3FC06EBA60000000, %137 ], !dbg !18 + %140 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i44 ], [ 0xBFD8127580000000, %137 ], !dbg !18 + %141 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i44 ], [ 0x3FBCE315E0000000, %137 ], !dbg !18 + %142 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i44 ], [ 0xBF9B837CE0000000, %137 ], !dbg !18 + %143 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i44 ], [ 0x3F755ABD40000000, %137 ], !dbg !18 + %144 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i44 ], [ 0xBF4AE9A400000000, %137 ], !dbg !18 + %145 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i44 ], [ 0x3F163D2D40000000, %137 ], !dbg !18 + %146 = phi float [ %.01.i46, %__nv_fabsf.exit1.i44 ], [ %138, %137 ], !dbg !18 + %147 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not2.i29 = icmp eq i32 %147, 0, !dbg !18 + %148 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %145, float %146, float %144) #4, !dbg !18 + %149 = tail call float @llvm.nvvm.fma.rn.f(float %145, float %146, float %144) #4, !dbg !18 + %.02.i30 = select i1 %.not2.i29, float %149, float %148, !dbg !18 + %150 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not3.i31 = icmp eq i32 %150, 0, !dbg !18 + %151 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i30, float %146, float %143) #4, !dbg !18 + %152 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i30, float %146, float %143) #4, !dbg !18 + %.03.i32 = select i1 %.not3.i31, float %152, float %151, !dbg !18 + %153 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not4.i33 = icmp eq i32 %153, 0, !dbg !18 + %154 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i32, float %146, float %142) #4, !dbg !18 + %155 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i32, float %146, float %142) #4, !dbg !18 + %.04.i34 = select i1 %.not4.i33, float %155, float %154, !dbg !18 + %156 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not5.i35 = icmp eq i32 %156, 0, !dbg !18 + %157 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i34, float %146, float %141) #4, !dbg !18 + %158 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i34, float %146, float %141) #4, !dbg !18 + %.05.i36 = select i1 %.not5.i35, float %158, float %157, !dbg !18 + %159 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not6.i37 = icmp eq i32 %159, 0, !dbg !18 + %160 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i36, float %146, float %140) #4, !dbg !18 + %161 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i36, float %146, float %140) #4, !dbg !18 + %.06.i38 = select i1 %.not6.i37, float %161, float %160, !dbg !18 + %162 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not7.i39 = icmp eq i32 %162, 0, !dbg !18 + %163 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i38, float %146, float %139) #4, !dbg !18 + %164 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i38, float %146, float %139) #4, !dbg !18 + %.07.i40 = select i1 %.not7.i39, float %164, float %163, !dbg !18 + %165 = fneg float %146, !dbg !18 + %166 = select i1 %135, float %165, float %34, !dbg !18 + %167 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not8.i41 = icmp eq i32 %167, 0, !dbg !18 + %168 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i40, float %166, float %166) #4, !dbg !18 + %169 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i40, float %166, float %166) #4, !dbg !18 + %.08.i42 = select i1 %.not8.i41, float %169, float %168, !dbg !18 + br i1 %135, label %170, label %__nv_erff.exit47, !dbg !18 + +170: ; preds = %__internal_fmad.exit.i28 + %171 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i42) #4, !dbg !18 + %172 = fsub float 1.000000e+00, %171, !dbg !18 + %173 = bitcast float %172 to i32, !dbg !18 + %174 = bitcast float %34 to i32, !dbg !18 + %175 = and i32 %174, -2147483648, !dbg !18 + %176 = or i32 %175, %173, !dbg !18 + %177 = bitcast i32 %176 to float, !dbg !18 + br label %__nv_erff.exit47, !dbg !18 + +__nv_erff.exit47: ; preds = %__internal_fmad.exit.i28, %170 + %r.0.i43 = phi float [ %177, %170 ], [ %.08.i42, %__internal_fmad.exit.i28 ], !dbg !18 + %178 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not.i48 = icmp eq i32 %178, 0, !dbg !18 + %179 = tail call float @llvm.nvvm.fabs.ftz.f(float %35) #4, !dbg !18 + %180 = tail call float @llvm.nvvm.fabs.f(float %35) #4, !dbg !18 + %.0.i49 = select i1 %.not.i48, float %180, float %179, !dbg !18 + %181 = fcmp oge float %.0.i49, 0x3FF00C1FC0000000, !dbg !18 + br i1 %181, label %__nv_fabsf.exit1.i66, label %183, !dbg !18 + +__nv_fabsf.exit1.i66: ; preds = %__nv_erff.exit47 + %182 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not1.i67 = icmp eq i32 %182, 0, !dbg !18 + %.01.i68 = select i1 %.not1.i67, float %180, float %179, !dbg !18 + br label %__internal_fmad.exit.i50, !dbg !18 + +183: ; preds = %__nv_erff.exit47 + %184 = fmul float %35, %35, !dbg !18 + br label %__internal_fmad.exit.i50, !dbg !18 + +__internal_fmad.exit.i50: ; preds = %183, %__nv_fabsf.exit1.i66 + %185 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i66 ], [ 0x3FC06EBA60000000, %183 ], !dbg !18 + %186 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i66 ], [ 0xBFD8127580000000, %183 ], !dbg !18 + %187 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i66 ], [ 0x3FBCE315E0000000, %183 ], !dbg !18 + %188 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i66 ], [ 0xBF9B837CE0000000, %183 ], !dbg !18 + %189 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i66 ], [ 0x3F755ABD40000000, %183 ], !dbg !18 + %190 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i66 ], [ 0xBF4AE9A400000000, %183 ], !dbg !18 + %191 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i66 ], [ 0x3F163D2D40000000, %183 ], !dbg !18 + %192 = phi float [ %.01.i68, %__nv_fabsf.exit1.i66 ], [ %184, %183 ], !dbg !18 + %193 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not2.i51 = icmp eq i32 %193, 0, !dbg !18 + %194 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %191, float %192, float %190) #4, !dbg !18 + %195 = tail call float @llvm.nvvm.fma.rn.f(float %191, float %192, float %190) #4, !dbg !18 + %.02.i52 = select i1 %.not2.i51, float %195, float %194, !dbg !18 + %196 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not3.i53 = icmp eq i32 %196, 0, !dbg !18 + %197 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i52, float %192, float %189) #4, !dbg !18 + %198 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i52, float %192, float %189) #4, !dbg !18 + %.03.i54 = select i1 %.not3.i53, float %198, float %197, !dbg !18 + %199 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not4.i55 = icmp eq i32 %199, 0, !dbg !18 + %200 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i54, float %192, float %188) #4, !dbg !18 + %201 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i54, float %192, float %188) #4, !dbg !18 + %.04.i56 = select i1 %.not4.i55, float %201, float %200, !dbg !18 + %202 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not5.i57 = icmp eq i32 %202, 0, !dbg !18 + %203 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i56, float %192, float %187) #4, !dbg !18 + %204 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i56, float %192, float %187) #4, !dbg !18 + %.05.i58 = select i1 %.not5.i57, float %204, float %203, !dbg !18 + %205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not6.i59 = icmp eq i32 %205, 0, !dbg !18 + %206 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i58, float %192, float %186) #4, !dbg !18 + %207 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i58, float %192, float %186) #4, !dbg !18 + %.06.i60 = select i1 %.not6.i59, float %207, float %206, !dbg !18 + %208 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not7.i61 = icmp eq i32 %208, 0, !dbg !18 + %209 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i60, float %192, float %185) #4, !dbg !18 + %210 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i60, float %192, float %185) #4, !dbg !18 + %.07.i62 = select i1 %.not7.i61, float %210, float %209, !dbg !18 + %211 = fneg float %192, !dbg !18 + %212 = select i1 %181, float %211, float %35, !dbg !18 + %213 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not8.i63 = icmp eq i32 %213, 0, !dbg !18 + %214 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i62, float %212, float %212) #4, !dbg !18 + %215 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i62, float %212, float %212) #4, !dbg !18 + %.08.i64 = select i1 %.not8.i63, float %215, float %214, !dbg !18 + br i1 %181, label %216, label %__nv_erff.exit69, !dbg !18 + +216: ; preds = %__internal_fmad.exit.i50 + %217 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i64) #4, !dbg !18 + %218 = fsub float 1.000000e+00, %217, !dbg !18 + %219 = bitcast float %218 to i32, !dbg !18 + %220 = bitcast float %35 to i32, !dbg !18 + %221 = and i32 %220, -2147483648, !dbg !18 + %222 = or i32 %221, %219, !dbg !18 + %223 = bitcast i32 %222 to float, !dbg !18 + br label %__nv_erff.exit69, !dbg !18 + +__nv_erff.exit69: ; preds = %__internal_fmad.exit.i50, %216 + %r.0.i65 = phi float [ %223, %216 ], [ %.08.i64, %__internal_fmad.exit.i50 ], !dbg !18 + %224 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not.i70 = icmp eq i32 %224, 0, !dbg !18 + %225 = tail call float @llvm.nvvm.fabs.ftz.f(float %36) #4, !dbg !18 + %226 = tail call float @llvm.nvvm.fabs.f(float %36) #4, !dbg !18 + %.0.i71 = select i1 %.not.i70, float %226, float %225, !dbg !18 + %227 = fcmp oge float %.0.i71, 0x3FF00C1FC0000000, !dbg !18 + br i1 %227, label %__nv_fabsf.exit1.i88, label %229, !dbg !18 + +__nv_fabsf.exit1.i88: ; preds = %__nv_erff.exit69 + %228 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not1.i89 = icmp eq i32 %228, 0, !dbg !18 + %.01.i90 = select i1 %.not1.i89, float %226, float %225, !dbg !18 + br label %__internal_fmad.exit.i72, !dbg !18 + +229: ; preds = %__nv_erff.exit69 + %230 = fmul float %36, %36, !dbg !18 + br label %__internal_fmad.exit.i72, !dbg !18 + +__internal_fmad.exit.i72: ; preds = %229, %__nv_fabsf.exit1.i88 + %231 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i88 ], [ 0x3FC06EBA60000000, %229 ], !dbg !18 + %232 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i88 ], [ 0xBFD8127580000000, %229 ], !dbg !18 + %233 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i88 ], [ 0x3FBCE315E0000000, %229 ], !dbg !18 + %234 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i88 ], [ 0xBF9B837CE0000000, %229 ], !dbg !18 + %235 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i88 ], [ 0x3F755ABD40000000, %229 ], !dbg !18 + %236 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i88 ], [ 0xBF4AE9A400000000, %229 ], !dbg !18 + %237 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i88 ], [ 0x3F163D2D40000000, %229 ], !dbg !18 + %238 = phi float [ %.01.i90, %__nv_fabsf.exit1.i88 ], [ %230, %229 ], !dbg !18 + %239 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not2.i73 = icmp eq i32 %239, 0, !dbg !18 + %240 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %237, float %238, float %236) #4, !dbg !18 + %241 = tail call float @llvm.nvvm.fma.rn.f(float %237, float %238, float %236) #4, !dbg !18 + %.02.i74 = select i1 %.not2.i73, float %241, float %240, !dbg !18 + %242 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not3.i75 = icmp eq i32 %242, 0, !dbg !18 + %243 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i74, float %238, float %235) #4, !dbg !18 + %244 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i74, float %238, float %235) #4, !dbg !18 + %.03.i76 = select i1 %.not3.i75, float %244, float %243, !dbg !18 + %245 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not4.i77 = icmp eq i32 %245, 0, !dbg !18 + %246 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i76, float %238, float %234) #4, !dbg !18 + %247 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i76, float %238, float %234) #4, !dbg !18 + %.04.i78 = select i1 %.not4.i77, float %247, float %246, !dbg !18 + %248 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not5.i79 = icmp eq i32 %248, 0, !dbg !18 + %249 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i78, float %238, float %233) #4, !dbg !18 + %250 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i78, float %238, float %233) #4, !dbg !18 + %.05.i80 = select i1 %.not5.i79, float %250, float %249, !dbg !18 + %251 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not6.i81 = icmp eq i32 %251, 0, !dbg !18 + %252 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i80, float %238, float %232) #4, !dbg !18 + %253 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i80, float %238, float %232) #4, !dbg !18 + %.06.i82 = select i1 %.not6.i81, float %253, float %252, !dbg !18 + %254 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not7.i83 = icmp eq i32 %254, 0, !dbg !18 + %255 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i82, float %238, float %231) #4, !dbg !18 + %256 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i82, float %238, float %231) #4, !dbg !18 + %.07.i84 = select i1 %.not7.i83, float %256, float %255, !dbg !18 + %257 = fneg float %238, !dbg !18 + %258 = select i1 %227, float %257, float %36, !dbg !18 + %259 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not8.i85 = icmp eq i32 %259, 0, !dbg !18 + %260 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i84, float %258, float %258) #4, !dbg !18 + %261 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i84, float %258, float %258) #4, !dbg !18 + %.08.i86 = select i1 %.not8.i85, float %261, float %260, !dbg !18 + br i1 %227, label %262, label %__nv_erff.exit91, !dbg !18 + +262: ; preds = %__internal_fmad.exit.i72 + %263 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i86) #4, !dbg !18 + %264 = fsub float 1.000000e+00, %263, !dbg !18 + %265 = bitcast float %264 to i32, !dbg !18 + %266 = bitcast float %36 to i32, !dbg !18 + %267 = and i32 %266, -2147483648, !dbg !18 + %268 = or i32 %267, %265, !dbg !18 + %269 = bitcast i32 %268 to float, !dbg !18 + br label %__nv_erff.exit91, !dbg !18 + +__nv_erff.exit91: ; preds = %__internal_fmad.exit.i72, %262 + %r.0.i87 = phi float [ %269, %262 ], [ %.08.i86, %__internal_fmad.exit.i72 ], !dbg !18 + %270 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not.i92 = icmp eq i32 %270, 0, !dbg !18 + %271 = tail call float @llvm.nvvm.fabs.ftz.f(float %37) #4, !dbg !18 + %272 = tail call float @llvm.nvvm.fabs.f(float %37) #4, !dbg !18 + %.0.i93 = select i1 %.not.i92, float %272, float %271, !dbg !18 + %273 = fcmp oge float %.0.i93, 0x3FF00C1FC0000000, !dbg !18 + br i1 %273, label %__nv_fabsf.exit1.i110, label %275, !dbg !18 + +__nv_fabsf.exit1.i110: ; preds = %__nv_erff.exit91 + %274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not1.i111 = icmp eq i32 %274, 0, !dbg !18 + %.01.i112 = select i1 %.not1.i111, float %272, float %271, !dbg !18 + br label %__internal_fmad.exit.i94, !dbg !18 + +275: ; preds = %__nv_erff.exit91 + %276 = fmul float %37, %37, !dbg !18 + br label %__internal_fmad.exit.i94, !dbg !18 + +__internal_fmad.exit.i94: ; preds = %275, %__nv_fabsf.exit1.i110 + %277 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i110 ], [ 0x3FC06EBA60000000, %275 ], !dbg !18 + %278 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i110 ], [ 0xBFD8127580000000, %275 ], !dbg !18 + %279 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i110 ], [ 0x3FBCE315E0000000, %275 ], !dbg !18 + %280 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i110 ], [ 0xBF9B837CE0000000, %275 ], !dbg !18 + %281 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i110 ], [ 0x3F755ABD40000000, %275 ], !dbg !18 + %282 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i110 ], [ 0xBF4AE9A400000000, %275 ], !dbg !18 + %283 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i110 ], [ 0x3F163D2D40000000, %275 ], !dbg !18 + %284 = phi float [ %.01.i112, %__nv_fabsf.exit1.i110 ], [ %276, %275 ], !dbg !18 + %285 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not2.i95 = icmp eq i32 %285, 0, !dbg !18 + %286 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %283, float %284, float %282) #4, !dbg !18 + %287 = tail call float @llvm.nvvm.fma.rn.f(float %283, float %284, float %282) #4, !dbg !18 + %.02.i96 = select i1 %.not2.i95, float %287, float %286, !dbg !18 + %288 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not3.i97 = icmp eq i32 %288, 0, !dbg !18 + %289 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i96, float %284, float %281) #4, !dbg !18 + %290 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i96, float %284, float %281) #4, !dbg !18 + %.03.i98 = select i1 %.not3.i97, float %290, float %289, !dbg !18 + %291 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not4.i99 = icmp eq i32 %291, 0, !dbg !18 + %292 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i98, float %284, float %280) #4, !dbg !18 + %293 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i98, float %284, float %280) #4, !dbg !18 + %.04.i100 = select i1 %.not4.i99, float %293, float %292, !dbg !18 + %294 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not5.i101 = icmp eq i32 %294, 0, !dbg !18 + %295 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i100, float %284, float %279) #4, !dbg !18 + %296 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i100, float %284, float %279) #4, !dbg !18 + %.05.i102 = select i1 %.not5.i101, float %296, float %295, !dbg !18 + %297 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not6.i103 = icmp eq i32 %297, 0, !dbg !18 + %298 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i102, float %284, float %278) #4, !dbg !18 + %299 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i102, float %284, float %278) #4, !dbg !18 + %.06.i104 = select i1 %.not6.i103, float %299, float %298, !dbg !18 + %300 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not7.i105 = icmp eq i32 %300, 0, !dbg !18 + %301 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i104, float %284, float %277) #4, !dbg !18 + %302 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i104, float %284, float %277) #4, !dbg !18 + %.07.i106 = select i1 %.not7.i105, float %302, float %301, !dbg !18 + %303 = fneg float %284, !dbg !18 + %304 = select i1 %273, float %303, float %37, !dbg !18 + %305 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not8.i107 = icmp eq i32 %305, 0, !dbg !18 + %306 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i106, float %304, float %304) #4, !dbg !18 + %307 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i106, float %304, float %304) #4, !dbg !18 + %.08.i108 = select i1 %.not8.i107, float %307, float %306, !dbg !18 + br i1 %273, label %308, label %__nv_erff.exit113, !dbg !18 + +308: ; preds = %__internal_fmad.exit.i94 + %309 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i108) #4, !dbg !18 + %310 = fsub float 1.000000e+00, %309, !dbg !18 + %311 = bitcast float %310 to i32, !dbg !18 + %312 = bitcast float %37 to i32, !dbg !18 + %313 = and i32 %312, -2147483648, !dbg !18 + %314 = or i32 %313, %311, !dbg !18 + %315 = bitcast i32 %314 to float, !dbg !18 + br label %__nv_erff.exit113, !dbg !18 + +__nv_erff.exit113: ; preds = %__internal_fmad.exit.i94, %308 + %r.0.i109 = phi float [ %315, %308 ], [ %.08.i108, %__internal_fmad.exit.i94 ], !dbg !18 + %316 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not.i114 = icmp eq i32 %316, 0, !dbg !18 + %317 = tail call float @llvm.nvvm.fabs.ftz.f(float %38) #4, !dbg !18 + %318 = tail call float @llvm.nvvm.fabs.f(float %38) #4, !dbg !18 + %.0.i115 = select i1 %.not.i114, float %318, float %317, !dbg !18 + %319 = fcmp oge float %.0.i115, 0x3FF00C1FC0000000, !dbg !18 + br i1 %319, label %__nv_fabsf.exit1.i132, label %321, !dbg !18 + +__nv_fabsf.exit1.i132: ; preds = %__nv_erff.exit113 + %320 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not1.i133 = icmp eq i32 %320, 0, !dbg !18 + %.01.i134 = select i1 %.not1.i133, float %318, float %317, !dbg !18 + br label %__internal_fmad.exit.i116, !dbg !18 + +321: ; preds = %__nv_erff.exit113 + %322 = fmul float %38, %38, !dbg !18 + br label %__internal_fmad.exit.i116, !dbg !18 + +__internal_fmad.exit.i116: ; preds = %321, %__nv_fabsf.exit1.i132 + %323 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i132 ], [ 0x3FC06EBA60000000, %321 ], !dbg !18 + %324 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i132 ], [ 0xBFD8127580000000, %321 ], !dbg !18 + %325 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i132 ], [ 0x3FBCE315E0000000, %321 ], !dbg !18 + %326 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i132 ], [ 0xBF9B837CE0000000, %321 ], !dbg !18 + %327 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i132 ], [ 0x3F755ABD40000000, %321 ], !dbg !18 + %328 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i132 ], [ 0xBF4AE9A400000000, %321 ], !dbg !18 + %329 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i132 ], [ 0x3F163D2D40000000, %321 ], !dbg !18 + %330 = phi float [ %.01.i134, %__nv_fabsf.exit1.i132 ], [ %322, %321 ], !dbg !18 + %331 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not2.i117 = icmp eq i32 %331, 0, !dbg !18 + %332 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %329, float %330, float %328) #4, !dbg !18 + %333 = tail call float @llvm.nvvm.fma.rn.f(float %329, float %330, float %328) #4, !dbg !18 + %.02.i118 = select i1 %.not2.i117, float %333, float %332, !dbg !18 + %334 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not3.i119 = icmp eq i32 %334, 0, !dbg !18 + %335 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i118, float %330, float %327) #4, !dbg !18 + %336 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i118, float %330, float %327) #4, !dbg !18 + %.03.i120 = select i1 %.not3.i119, float %336, float %335, !dbg !18 + %337 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not4.i121 = icmp eq i32 %337, 0, !dbg !18 + %338 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i120, float %330, float %326) #4, !dbg !18 + %339 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i120, float %330, float %326) #4, !dbg !18 + %.04.i122 = select i1 %.not4.i121, float %339, float %338, !dbg !18 + %340 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not5.i123 = icmp eq i32 %340, 0, !dbg !18 + %341 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i122, float %330, float %325) #4, !dbg !18 + %342 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i122, float %330, float %325) #4, !dbg !18 + %.05.i124 = select i1 %.not5.i123, float %342, float %341, !dbg !18 + %343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not6.i125 = icmp eq i32 %343, 0, !dbg !18 + %344 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i124, float %330, float %324) #4, !dbg !18 + %345 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i124, float %330, float %324) #4, !dbg !18 + %.06.i126 = select i1 %.not6.i125, float %345, float %344, !dbg !18 + %346 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not7.i127 = icmp eq i32 %346, 0, !dbg !18 + %347 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i126, float %330, float %323) #4, !dbg !18 + %348 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i126, float %330, float %323) #4, !dbg !18 + %.07.i128 = select i1 %.not7.i127, float %348, float %347, !dbg !18 + %349 = fneg float %330, !dbg !18 + %350 = select i1 %319, float %349, float %38, !dbg !18 + %351 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not8.i129 = icmp eq i32 %351, 0, !dbg !18 + %352 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i128, float %350, float %350) #4, !dbg !18 + %353 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i128, float %350, float %350) #4, !dbg !18 + %.08.i130 = select i1 %.not8.i129, float %353, float %352, !dbg !18 + br i1 %319, label %354, label %__nv_erff.exit135, !dbg !18 + +354: ; preds = %__internal_fmad.exit.i116 + %355 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i130) #4, !dbg !18 + %356 = fsub float 1.000000e+00, %355, !dbg !18 + %357 = bitcast float %356 to i32, !dbg !18 + %358 = bitcast float %38 to i32, !dbg !18 + %359 = and i32 %358, -2147483648, !dbg !18 + %360 = or i32 %359, %357, !dbg !18 + %361 = bitcast i32 %360 to float, !dbg !18 + br label %__nv_erff.exit135, !dbg !18 + +__nv_erff.exit135: ; preds = %__internal_fmad.exit.i116, %354 + %r.0.i131 = phi float [ %361, %354 ], [ %.08.i130, %__internal_fmad.exit.i116 ], !dbg !18 + %362 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not.i136 = icmp eq i32 %362, 0, !dbg !18 + %363 = tail call float @llvm.nvvm.fabs.ftz.f(float %39) #4, !dbg !18 + %364 = tail call float @llvm.nvvm.fabs.f(float %39) #4, !dbg !18 + %.0.i137 = select i1 %.not.i136, float %364, float %363, !dbg !18 + %365 = fcmp oge float %.0.i137, 0x3FF00C1FC0000000, !dbg !18 + br i1 %365, label %__nv_fabsf.exit1.i154, label %367, !dbg !18 + +__nv_fabsf.exit1.i154: ; preds = %__nv_erff.exit135 + %366 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not1.i155 = icmp eq i32 %366, 0, !dbg !18 + %.01.i156 = select i1 %.not1.i155, float %364, float %363, !dbg !18 + br label %__internal_fmad.exit.i138, !dbg !18 + +367: ; preds = %__nv_erff.exit135 + %368 = fmul float %39, %39, !dbg !18 + br label %__internal_fmad.exit.i138, !dbg !18 + +__internal_fmad.exit.i138: ; preds = %367, %__nv_fabsf.exit1.i154 + %369 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i154 ], [ 0x3FC06EBA60000000, %367 ], !dbg !18 + %370 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i154 ], [ 0xBFD8127580000000, %367 ], !dbg !18 + %371 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i154 ], [ 0x3FBCE315E0000000, %367 ], !dbg !18 + %372 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i154 ], [ 0xBF9B837CE0000000, %367 ], !dbg !18 + %373 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i154 ], [ 0x3F755ABD40000000, %367 ], !dbg !18 + %374 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i154 ], [ 0xBF4AE9A400000000, %367 ], !dbg !18 + %375 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i154 ], [ 0x3F163D2D40000000, %367 ], !dbg !18 + %376 = phi float [ %.01.i156, %__nv_fabsf.exit1.i154 ], [ %368, %367 ], !dbg !18 + %377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not2.i139 = icmp eq i32 %377, 0, !dbg !18 + %378 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %375, float %376, float %374) #4, !dbg !18 + %379 = tail call float @llvm.nvvm.fma.rn.f(float %375, float %376, float %374) #4, !dbg !18 + %.02.i140 = select i1 %.not2.i139, float %379, float %378, !dbg !18 + %380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not3.i141 = icmp eq i32 %380, 0, !dbg !18 + %381 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i140, float %376, float %373) #4, !dbg !18 + %382 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i140, float %376, float %373) #4, !dbg !18 + %.03.i142 = select i1 %.not3.i141, float %382, float %381, !dbg !18 + %383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not4.i143 = icmp eq i32 %383, 0, !dbg !18 + %384 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i142, float %376, float %372) #4, !dbg !18 + %385 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i142, float %376, float %372) #4, !dbg !18 + %.04.i144 = select i1 %.not4.i143, float %385, float %384, !dbg !18 + %386 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not5.i145 = icmp eq i32 %386, 0, !dbg !18 + %387 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i144, float %376, float %371) #4, !dbg !18 + %388 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i144, float %376, float %371) #4, !dbg !18 + %.05.i146 = select i1 %.not5.i145, float %388, float %387, !dbg !18 + %389 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not6.i147 = icmp eq i32 %389, 0, !dbg !18 + %390 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i146, float %376, float %370) #4, !dbg !18 + %391 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i146, float %376, float %370) #4, !dbg !18 + %.06.i148 = select i1 %.not6.i147, float %391, float %390, !dbg !18 + %392 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not7.i149 = icmp eq i32 %392, 0, !dbg !18 + %393 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i148, float %376, float %369) #4, !dbg !18 + %394 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i148, float %376, float %369) #4, !dbg !18 + %.07.i150 = select i1 %.not7.i149, float %394, float %393, !dbg !18 + %395 = fneg float %376, !dbg !18 + %396 = select i1 %365, float %395, float %39, !dbg !18 + %397 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18 + %.not8.i151 = icmp eq i32 %397, 0, !dbg !18 + %398 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i150, float %396, float %396) #4, !dbg !18 + %399 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i150, float %396, float %396) #4, !dbg !18 + %.08.i152 = select i1 %.not8.i151, float %399, float %398, !dbg !18 + br i1 %365, label %400, label %__nv_erff.exit157, !dbg !18 + +400: ; preds = %__internal_fmad.exit.i138 + %401 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i152) #4, !dbg !18 + %402 = fsub float 1.000000e+00, %401, !dbg !18 + %403 = bitcast float %402 to i32, !dbg !18 + %404 = bitcast float %39 to i32, !dbg !18 + %405 = and i32 %404, -2147483648, !dbg !18 + %406 = or i32 %405, %403, !dbg !18 + %407 = bitcast i32 %406 to float, !dbg !18 + br label %__nv_erff.exit157, !dbg !18 + +__nv_erff.exit157: ; preds = %__internal_fmad.exit.i138, %400 + %r.0.i153 = phi float [ %407, %400 ], [ %.08.i152, %__internal_fmad.exit.i138 ], !dbg !18 + %408 = fmul float %31, 5.000000e-01, !dbg !19 + %409 = fmul float %30, 5.000000e-01, !dbg !19 + %410 = fmul float %29, 5.000000e-01, !dbg !19 + %411 = fmul float %28, 5.000000e-01, !dbg !19 + %412 = fmul float %27, 5.000000e-01, !dbg !19 + %413 = fmul float %26, 5.000000e-01, !dbg !19 + %414 = fmul float %25, 5.000000e-01, !dbg !19 + %415 = fmul float %24, 5.000000e-01, !dbg !19 + %416 = fadd float %r.0.i, 1.000000e+00, !dbg !20 + %417 = fadd float %r.0.i21, 1.000000e+00, !dbg !20 + %418 = fadd float %r.0.i43, 1.000000e+00, !dbg !20 + %419 = fadd float %r.0.i65, 1.000000e+00, !dbg !20 + %420 = fadd float %r.0.i87, 1.000000e+00, !dbg !20 + %421 = fadd float %r.0.i109, 1.000000e+00, !dbg !20 + %422 = fadd float %r.0.i131, 1.000000e+00, !dbg !20 + %423 = fadd float %r.0.i153, 1.000000e+00, !dbg !20 + %424 = fmul float %415, %416, !dbg !21 + %425 = fmul float %414, %417, !dbg !21 + %426 = fmul float %413, %418, !dbg !21 + %427 = fmul float %412, %419, !dbg !21 + %428 = fmul float %411, %420, !dbg !21 + %429 = fmul float %410, %421, !dbg !21 + %430 = fmul float %409, %422, !dbg !21 + %431 = fmul float %408, %423, !dbg !21 + %432 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %424) #4, !dbg !22 + %433 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %425) #4, !dbg !22 + %434 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %426) #4, !dbg !22 + %435 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %427) #4, !dbg !22 + %436 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %428) #4, !dbg !22 + %437 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %429) #4, !dbg !22 + %438 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %430) #4, !dbg !22 + %439 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %431) #4, !dbg !22 + %440 = insertelement <2 x i16> undef, i16 %432, i64 0, !dbg !22 + %441 = insertelement <2 x i16> %440, i16 %433, i64 1, !dbg !22 + %442 = bitcast <2 x i16> %441 to i32, !dbg !22 + %443 = insertelement <2 x i16> undef, i16 %434, i64 0, !dbg !22 + %444 = insertelement <2 x i16> %443, i16 %435, i64 1, !dbg !22 + %445 = bitcast <2 x i16> %444 to i32, !dbg !22 + %446 = insertelement <2 x i16> undef, i16 %436, i64 0, !dbg !22 + %447 = insertelement <2 x i16> %446, i16 %437, i64 1, !dbg !22 + %448 = bitcast <2 x i16> %447 to i32, !dbg !22 + %449 = insertelement <2 x i16> undef, i16 %438, i64 0, !dbg !22 + %450 = insertelement <2 x i16> %449, i16 %439, i64 1, !dbg !22 + %451 = bitcast <2 x i16> %450 to i32, !dbg !22 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %442, i32 %445, i32 %448, i32 %451, ptr addrspace(1) %10, i1 true) #4, !dbg !22 + ret void, !dbg !23 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: alwaysinline nounwind +define float @__nv_erff(float %a) local_unnamed_addr #1 { +__nv_fabsf.exit: + %0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not = icmp eq i32 %0, 0 + %1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4 + %2 = tail call float @llvm.nvvm.fabs.f(float %a) #4 + %.0 = select i1 %.not, float %2, float %1 + %3 = fcmp oge float %.0, 0x3FF00C1FC0000000 + br i1 %3, label %__nv_fabsf.exit1, label %5 + +__nv_fabsf.exit1: ; preds = %__nv_fabsf.exit + %4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not1 = icmp eq i32 %4, 0 + %.01 = select i1 %.not1, float %2, float %1 + br label %__internal_fmad.exit + +5: ; preds = %__nv_fabsf.exit + %6 = fmul float %a, %a + br label %__internal_fmad.exit + +__internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1 + %7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ] + %8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ] + %9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ] + %10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ] + %11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ] + %12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ] + %13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ] + %14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ] + %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not2 = icmp eq i32 %15, 0 + %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4 + %17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4 + %.02 = select i1 %.not2, float %17, float %16 + %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not3 = icmp eq i32 %18, 0 + %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4 + %20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4 + %.03 = select i1 %.not3, float %20, float %19 + %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not4 = icmp eq i32 %21, 0 + %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4 + %23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4 + %.04 = select i1 %.not4, float %23, float %22 + %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not5 = icmp eq i32 %24, 0 + %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4 + %26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4 + %.05 = select i1 %.not5, float %26, float %25 + %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not6 = icmp eq i32 %27, 0 + %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4 + %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4 + %.06 = select i1 %.not6, float %29, float %28 + %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not7 = icmp eq i32 %30, 0 + %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4 + %32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4 + %.07 = select i1 %.not7, float %32, float %31 + %33 = fneg float %14 + %34 = select i1 %3, float %33, float %a + %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not8 = icmp eq i32 %35, 0 + %36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4 + %37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4 + %.08 = select i1 %.not8, float %37, float %36 + br i1 %3, label %38, label %46 + +38: ; preds = %__internal_fmad.exit + %39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4 + %40 = fsub float 1.000000e+00, %39 + %41 = bitcast float %40 to i32 + %42 = bitcast float %a to i32 + %43 = and i32 %42, -2147483648 + %44 = or i32 %43, %41 + %45 = bitcast i32 %44 to float + br label %46 + +46: ; preds = %38, %__internal_fmad.exit + %r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ] + ret float %r.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fabs.ftz.f(float) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fabs.f(float) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #4 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "ckphrtdpgsxl7sfarkkzylhv4st3uhmzvg3u6z5excfp6ydybq74.py", directory: "/tmp/torchinductor_root/kp") +!4 = !{ptr @triton__0d1de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1de, !"maxntidx", i32 128} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 21, column: 36, scope: !7) +!11 = !DILocation(line: 20, column: 28, scope: !7) +!12 = !DILocation(line: 20, column: 33, scope: !7) +!13 = !DILocation(line: 21, column: 23, scope: !7) +!14 = !DILocation(line: 24, column: 34, scope: !7) +!15 = !DILocation(line: 24, column: 39, scope: !7) +!16 = !DILocation(line: 24, column: 48, scope: !7) +!17 = !DILocation(line: 29, column: 18, scope: !7) +!18 = !DILocation(line: 30, column: 23, scope: !7) +!19 = !DILocation(line: 27, column: 18, scope: !7) +!20 = !DILocation(line: 32, column: 18, scope: !7) +!21 = !DILocation(line: 33, column: 18, scope: !7) +!22 = !DILocation(line: 35, column: 40, scope: !7) +!23 = !DILocation(line: 35, column: 4, scope: !7) diff --git a/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.llir b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..5d9bab6a92c0f5bd59a83f20f4665258f2e57ea1 --- /dev/null +++ b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.llir @@ -0,0 +1,290 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %7 = and i32 %6, 31, !dbg !8 + %8 = lshr i32 %6, 5, !dbg !8 + %9 = and i32 %6, 7, !dbg !8 + %10 = shl nuw nsw i32 %9, 2, !dbg !8 + %11 = and i32 %8, 7, !dbg !9 + %12 = lshr i32 %7, 3, !dbg !9 + %13 = shl nuw nsw i32 %11, 2, !dbg !9 + %14 = or i32 %13, %12, !dbg !9 + %15 = or i32 %14, 96, !dbg !9 + %16 = or i32 %10, 1, !dbg !10 + %17 = or i32 %10, 2, !dbg !10 + %18 = or i32 %10, 3, !dbg !10 + %19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !14 + %20 = shl i32 %19, 5, !dbg !15 + %21 = or i32 %20, %10, !dbg !16 + %22 = or i32 %20, %7, !dbg !16 + %23 = icmp ult i32 %15, 120, !dbg !17 + %24 = shl nuw nsw i32 %14, 17, !dbg !18 + %25 = or i32 %24, 4194304, !dbg !18 + %26 = or i32 %24, 8388608, !dbg !18 + %27 = shl nuw nsw i32 %15, 17, !dbg !18 + %28 = add i32 %21, %24, !dbg !19 + %29 = add i32 %25, %21, !dbg !19 + %30 = add i32 %26, %21, !dbg !19 + %31 = add i32 %21, %27, !dbg !19 + %32 = sext i32 %28 to i64, !dbg !20 + %33 = getelementptr float, ptr addrspace(1) %0, i64 %32, !dbg !20 + %34 = sext i32 %29 to i64, !dbg !20 + %35 = getelementptr float, ptr addrspace(1) %0, i64 %34, !dbg !20 + %36 = sext i32 %30 to i64, !dbg !20 + %37 = getelementptr float, ptr addrspace(1) %0, i64 %36, !dbg !20 + %38 = sext i32 %31 to i64, !dbg !20 + %39 = getelementptr float, ptr addrspace(1) %0, i64 %38, !dbg !20 + %40 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %33, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21 + %41 = extractvalue { i32, i32, i32, i32 } %40, 0, !dbg !21 + %42 = extractvalue { i32, i32, i32, i32 } %40, 1, !dbg !21 + %43 = extractvalue { i32, i32, i32, i32 } %40, 2, !dbg !21 + %44 = extractvalue { i32, i32, i32, i32 } %40, 3, !dbg !21 + %45 = bitcast i32 %41 to float, !dbg !21 + %46 = bitcast i32 %42 to float, !dbg !21 + %47 = bitcast i32 %43 to float, !dbg !21 + %48 = bitcast i32 %44 to float, !dbg !21 + %49 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21 + %50 = extractvalue { i32, i32, i32, i32 } %49, 0, !dbg !21 + %51 = extractvalue { i32, i32, i32, i32 } %49, 1, !dbg !21 + %52 = extractvalue { i32, i32, i32, i32 } %49, 2, !dbg !21 + %53 = extractvalue { i32, i32, i32, i32 } %49, 3, !dbg !21 + %54 = bitcast i32 %50 to float, !dbg !21 + %55 = bitcast i32 %51 to float, !dbg !21 + %56 = bitcast i32 %52 to float, !dbg !21 + %57 = bitcast i32 %53 to float, !dbg !21 + %58 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %37, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !21 + %59 = extractvalue { i32, i32, i32, i32 } %58, 0, !dbg !21 + %60 = extractvalue { i32, i32, i32, i32 } %58, 1, !dbg !21 + %61 = extractvalue { i32, i32, i32, i32 } %58, 2, !dbg !21 + %62 = extractvalue { i32, i32, i32, i32 } %58, 3, !dbg !21 + %63 = bitcast i32 %59 to float, !dbg !21 + %64 = bitcast i32 %60 to float, !dbg !21 + %65 = bitcast i32 %61 to float, !dbg !21 + %66 = bitcast i32 %62 to float, !dbg !21 + %67 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %39, i1 %23, i32 0, i1 %23, i32 0, i1 %23, i32 0, i1 %23, i32 0, i1 %23) #3, !dbg !21 + %68 = extractvalue { i32, i32, i32, i32 } %67, 0, !dbg !21 + %69 = extractvalue { i32, i32, i32, i32 } %67, 1, !dbg !21 + %70 = extractvalue { i32, i32, i32, i32 } %67, 2, !dbg !21 + %71 = extractvalue { i32, i32, i32, i32 } %67, 3, !dbg !21 + %72 = bitcast i32 %68 to float, !dbg !21 + %73 = bitcast i32 %69 to float, !dbg !21 + %74 = bitcast i32 %70 to float, !dbg !21 + %75 = bitcast i32 %71 to float, !dbg !21 + %76 = fadd float %45, 0.000000e+00, !dbg !22 + %77 = fadd float %46, 0.000000e+00, !dbg !22 + %78 = fadd float %47, 0.000000e+00, !dbg !22 + %79 = fadd float %48, 0.000000e+00, !dbg !22 + %80 = fadd float %54, 0.000000e+00, !dbg !22 + %81 = fadd float %55, 0.000000e+00, !dbg !22 + %82 = fadd float %56, 0.000000e+00, !dbg !22 + %83 = fadd float %57, 0.000000e+00, !dbg !22 + %84 = fadd float %63, 0.000000e+00, !dbg !22 + %85 = fadd float %64, 0.000000e+00, !dbg !22 + %86 = fadd float %65, 0.000000e+00, !dbg !22 + %87 = fadd float %66, 0.000000e+00, !dbg !22 + %88 = fadd float %72, 0.000000e+00, !dbg !22 + %89 = fadd float %73, 0.000000e+00, !dbg !22 + %90 = fadd float %74, 0.000000e+00, !dbg !22 + %91 = fadd float %75, 0.000000e+00, !dbg !22 + %92 = select i1 %23, float %88, float 0.000000e+00, !dbg !23 + %93 = select i1 %23, float %89, float 0.000000e+00, !dbg !23 + %94 = select i1 %23, float %90, float 0.000000e+00, !dbg !23 + %95 = select i1 %23, float %91, float 0.000000e+00, !dbg !23 + %96 = fadd float %76, %80, !dbg !24 + %97 = fadd float %77, %81, !dbg !24 + %98 = fadd float %78, %82, !dbg !24 + %99 = fadd float %79, %83, !dbg !24 + %100 = fadd float %96, %84, !dbg !24 + %101 = fadd float %97, %85, !dbg !24 + %102 = fadd float %98, %86, !dbg !24 + %103 = fadd float %99, %87, !dbg !24 + %104 = fadd float %100, %92, !dbg !24 + %105 = fadd float %101, %93, !dbg !24 + %106 = fadd float %102, %94, !dbg !24 + %107 = fadd float %103, %95, !dbg !24 + %108 = bitcast float %104 to i32, !dbg !10 + %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %108, i32 16, i32 31), !dbg !10 + %110 = bitcast i32 %109 to float, !dbg !10 + %111 = fadd float %104, %110, !dbg !24 + %112 = bitcast float %111 to i32, !dbg !10 + %113 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %112, i32 8, i32 31), !dbg !10 + %114 = bitcast i32 %113 to float, !dbg !10 + %115 = fadd float %111, %114, !dbg !24 + %116 = bitcast float %105 to i32, !dbg !10 + %117 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %116, i32 16, i32 31), !dbg !10 + %118 = bitcast i32 %117 to float, !dbg !10 + %119 = fadd float %105, %118, !dbg !24 + %120 = bitcast float %119 to i32, !dbg !10 + %121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 8, i32 31), !dbg !10 + %122 = bitcast i32 %121 to float, !dbg !10 + %123 = fadd float %119, %122, !dbg !24 + %124 = bitcast float %106 to i32, !dbg !10 + %125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 16, i32 31), !dbg !10 + %126 = bitcast i32 %125 to float, !dbg !10 + %127 = fadd float %106, %126, !dbg !24 + %128 = bitcast float %127 to i32, !dbg !10 + %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 8, i32 31), !dbg !10 + %130 = bitcast i32 %129 to float, !dbg !10 + %131 = fadd float %127, %130, !dbg !24 + %132 = bitcast float %107 to i32, !dbg !10 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 16, i32 31), !dbg !10 + %134 = bitcast i32 %133 to float, !dbg !10 + %135 = fadd float %107, %134, !dbg !24 + %136 = bitcast float %135 to i32, !dbg !10 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 8, i32 31), !dbg !10 + %138 = bitcast i32 %137 to float, !dbg !10 + %139 = fadd float %135, %138, !dbg !24 + %140 = icmp ult i32 %7, 8, !dbg !10 + %141 = shl nuw nsw i32 %9, 5, !dbg !10 + %142 = or i32 %141, %11, !dbg !10 + %143 = zext nneg i32 %142 to i64, !dbg !10 + %144 = getelementptr float, ptr addrspace(3) @global_smem, i64 %143, !dbg !10 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %144, float %115, i1 %140) #3, !dbg !10 + %145 = shl nuw nsw i32 %16, 3, !dbg !10 + %146 = or i32 %145, %11, !dbg !10 + %147 = zext nneg i32 %146 to i64, !dbg !10 + %148 = getelementptr float, ptr addrspace(3) @global_smem, i64 %147, !dbg !10 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %148, float %123, i1 %140) #3, !dbg !10 + %149 = shl nuw nsw i32 %17, 3, !dbg !10 + %150 = or i32 %149, %11, !dbg !10 + %151 = zext nneg i32 %150 to i64, !dbg !10 + %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !10 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %152, float %131, i1 %140) #3, !dbg !10 + %153 = shl nuw nsw i32 %18, 3, !dbg !10 + %154 = or i32 %153, %11, !dbg !10 + %155 = zext nneg i32 %154 to i64, !dbg !10 + %156 = getelementptr float, ptr addrspace(3) @global_smem, i64 %155, !dbg !10 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %156, float %139, i1 %140) #3, !dbg !10 + tail call void @llvm.nvvm.barrier0(), !dbg !10 + %157 = icmp slt i32 %6, 256, !dbg !10 + %158 = sext i32 %6 to i64, !dbg !10 + %159 = getelementptr float, ptr addrspace(3) @global_smem, i64 %158, !dbg !10 + %160 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %159, i1 %157) #3, !dbg !10 + %161 = bitcast float %160 to i32, !dbg !10 + %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 4, i32 31), !dbg !10 + %163 = bitcast i32 %162 to float, !dbg !10 + %164 = fadd float %160, %163, !dbg !24 + %165 = bitcast float %164 to i32, !dbg !10 + %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 2, i32 31), !dbg !10 + %167 = bitcast i32 %166 to float, !dbg !10 + %168 = fadd float %164, %167, !dbg !24 + %169 = bitcast float %168 to i32, !dbg !10 + %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 1, i32 31), !dbg !10 + %171 = bitcast i32 %170 to float, !dbg !10 + %172 = fadd float %168, %171, !dbg !24 + %173 = icmp eq i32 %9, 0, !dbg !10 + %174 = and i1 %157, %173, !dbg !10 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %159, float %172, i1 %174) #3, !dbg !10 + tail call void @llvm.nvvm.barrier0(), !dbg !10 + %175 = zext nneg i32 %141 to i64, !dbg !10 + %176 = getelementptr float, ptr addrspace(3) @global_smem, i64 %175, !dbg !10 + %177 = load float, ptr addrspace(3) %176, align 4, !dbg !10 + %178 = zext nneg i32 %145 to i64, !dbg !10 + %179 = getelementptr float, ptr addrspace(3) @global_smem, i64 %178, !dbg !10 + %180 = load float, ptr addrspace(3) %179, align 4, !dbg !10 + %181 = zext nneg i32 %149 to i64, !dbg !10 + %182 = getelementptr float, ptr addrspace(3) @global_smem, i64 %181, !dbg !10 + %183 = load float, ptr addrspace(3) %182, align 4, !dbg !10 + %184 = zext nneg i32 %153 to i64, !dbg !10 + %185 = getelementptr float, ptr addrspace(3) @global_smem, i64 %184, !dbg !10 + %186 = load float, ptr addrspace(3) %185, align 4, !dbg !10 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %187 = zext nneg i32 %10 to i64, !dbg !28 + %188 = getelementptr float, ptr addrspace(3) @global_smem, i64 %187, !dbg !28 + %189 = insertelement <1 x float> undef, float %177, i64 0, !dbg !28 + store <1 x float> %189, ptr addrspace(3) %188, align 4, !dbg !28 + %190 = zext nneg i32 %16 to i64, !dbg !28 + %191 = getelementptr float, ptr addrspace(3) @global_smem, i64 %190, !dbg !28 + %192 = insertelement <1 x float> undef, float %180, i64 0, !dbg !28 + store <1 x float> %192, ptr addrspace(3) %191, align 4, !dbg !28 + %193 = zext nneg i32 %17 to i64, !dbg !28 + %194 = getelementptr float, ptr addrspace(3) @global_smem, i64 %193, !dbg !28 + %195 = insertelement <1 x float> undef, float %183, i64 0, !dbg !28 + store <1 x float> %195, ptr addrspace(3) %194, align 4, !dbg !28 + %196 = zext nneg i32 %18 to i64, !dbg !28 + %197 = getelementptr float, ptr addrspace(3) @global_smem, i64 %196, !dbg !28 + %198 = insertelement <1 x float> undef, float %186, i64 0, !dbg !28 + store <1 x float> %198, ptr addrspace(3) %197, align 4, !dbg !28 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %199 = zext nneg i32 %7 to i64, !dbg !28 + %200 = getelementptr float, ptr addrspace(3) @global_smem, i64 %199, !dbg !28 + %201 = load <1 x float>, ptr addrspace(3) %200, align 4, !dbg !28 + %.frozen = freeze i32 %22 + %202 = sdiv i32 %.frozen, 256, !dbg !29 + %203 = mul i32 %202, 256 + %.decomposed = sub i32 %.frozen, %203 + %204 = sext i32 %202 to i64, !dbg !30 + %205 = getelementptr i64, ptr addrspace(1) %1, i64 %204, !dbg !30 + %206 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %205, i1 true) #3, !dbg !31 + %207 = lshr i64 %206, 54, !dbg !32 + %208 = and i64 %207, 512, !dbg !32 + %209 = add i64 %208, %206, !dbg !32 + %210 = shl i64 %209, 8, !dbg !33 + %211 = sext i32 %.decomposed to i64, !dbg !34 + %212 = getelementptr float, ptr addrspace(1) %2, i64 %210, !dbg !35 + %213 = getelementptr float, ptr addrspace(1) %212, i64 %211, !dbg !35 + %214 = icmp eq i32 %11, 0, !dbg !36 + %215 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %213, <1 x float> %201, i1 %214) #3, !dbg !36 + ret void, !dbg !37 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i") +!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 22, column: 44, scope: !5) +!9 = !DILocation(line: 24, column: 33, scope: !5) +!10 = !DILocation(line: 243, column: 36, scope: !11, inlinedAt: !13) +!11 = distinct !DILexicalBlockFile(scope: !5, file: !12, discriminator: 0) +!12 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!13 = !DILocation(line: 35, column: 25, scope: !11) +!14 = !DILocation(line: 21, column: 28, scope: !5) +!15 = !DILocation(line: 21, column: 33, scope: !5) +!16 = !DILocation(line: 22, column: 23, scope: !5) +!17 = !DILocation(line: 29, column: 25, scope: !5) +!18 = !DILocation(line: 31, column: 47, scope: !5) +!19 = !DILocation(line: 31, column: 40, scope: !5) +!20 = !DILocation(line: 31, column: 34, scope: !5) +!21 = !DILocation(line: 31, column: 53, scope: !5) +!22 = !DILocation(line: 33, column: 23, scope: !5) +!23 = !DILocation(line: 34, column: 38, scope: !5) +!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26) +!25 = distinct !DILexicalBlockFile(scope: !11, file: !12, discriminator: 0) +!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27) +!27 = !DILocation(line: 35, column: 25, scope: !25) +!28 = !DILocation(line: 35, column: 28, scope: !5) +!29 = !DILocation(line: 36, column: 20, scope: !5) +!30 = !DILocation(line: 38, column: 30, scope: !5) +!31 = !DILocation(line: 38, column: 35, scope: !5) +!32 = !DILocation(line: 41, column: 32, scope: !5) +!33 = !DILocation(line: 45, column: 40, scope: !5) +!34 = !DILocation(line: 45, column: 36, scope: !5) +!35 = !DILocation(line: 45, column: 30, scope: !5) +!36 = !DILocation(line: 45, column: 55, scope: !5) +!37 = !DILocation(line: 45, column: 4, scope: !5) diff --git a/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ptx b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..bd9dd69cfa519de6efe25f21d9290d1f30494411 --- /dev/null +++ b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ptx @@ -0,0 +1,653 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3de4e +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3de4e( + .param .u64 triton__0d1d2d3de4e_param_0, + .param .u64 triton__0d1d2d3de4e_param_1, + .param .u64 triton__0d1d2d3de4e_param_2, + .param .u32 triton__0d1d2d3de4e_param_3, + .param .u32 triton__0d1d2d3de4e_param_4 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<30>; + .reg .b32 %r<112>; + .reg .f32 %f<76>; + .reg .b64 %rd<22>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd8, [triton__0d1d2d3de4e_param_0]; + ld.param.u64 %rd9, [triton__0d1d2d3de4e_param_1]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r48, %tid.x; + and.b32 %r49, %r48, 31; + ld.param.u64 %rd10, [triton__0d1d2d3de4e_param_2]; + and.b32 %r50, %r48, 7; + shl.b32 %r51, %r50, 2; + .loc 1 24 33 + bfe.u32 %r52, %r48, 5, 3; + bfe.u32 %r53, %r48, 3, 2; + shl.b32 %r54, %r52, 2; + or.b32 %r55, %r54, %r53; + or.b32 %r56, %r55, 96; + .loc 1 21 28 + mov.u32 %r1, %ctaid.x; + .loc 1 21 33 + shl.b32 %r57, %r1, 5; + .loc 1 22 23 + or.b32 %r58, %r57, %r51; + or.b32 %r59, %r57, %r49; + .loc 1 29 25 + setp.lt.u32 %p16, %r56, 120; + .loc 1 31 47 + shl.b32 %r60, %r55, 17; + shl.b32 %r61, %r56, 17; + .loc 1 31 40 + add.s32 %r62, %r58, %r60; + add.s32 %r63, %r62, 4194304; + add.s32 %r64, %r62, 8388608; + add.s32 %r65, %r58, %r61; + .loc 1 31 34 + mul.wide.s32 %rd11, %r62, 4; + add.s64 %rd1, %rd8, %rd11; + mul.wide.s32 %rd12, %r63, 4; + add.s64 %rd2, %rd8, %rd12; + mul.wide.s32 %rd13, %r64, 4; + add.s64 %rd3, %rd8, %rd13; + mul.wide.s32 %rd14, %r65, 4; + add.s64 %rd4, %rd8, %rd14; + mov.b32 %r6, 0; + mov.pred %p1, -1; + .loc 1 31 53 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + mov.b32 %f1, %r2; + mov.b32 %f2, %r3; + mov.b32 %f3, %r4; + mov.b32 %f4, %r5; + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + mov.u32 %r13, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r6; + @!%p1 mov.u32 %r11, %r6; + @!%p1 mov.u32 %r12, %r6; + @!%p1 mov.u32 %r13, %r6; + mov.b32 %f5, %r10; + mov.b32 %f6, %r11; + mov.b32 %f7, %r12; + mov.b32 %f8, %r13; + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + mov.u32 %r20, 0x0; + mov.u32 %r21, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r6; + @!%p1 mov.u32 %r19, %r6; + @!%p1 mov.u32 %r20, %r6; + @!%p1 mov.u32 %r21, %r6; + mov.b32 %f9, %r18; + mov.b32 %f10, %r19; + mov.b32 %f11, %r20; + mov.b32 %f12, %r21; + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + mov.u32 %r28, 0x0; + mov.u32 %r29, 0x0; + @%p16 ld.global.L1::evict_first.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ]; + @!%p16 mov.u32 %r26, %r6; + @!%p16 mov.u32 %r27, %r6; + @!%p16 mov.u32 %r28, %r6; + @!%p16 mov.u32 %r29, %r6; + mov.b32 %f13, %r26; + mov.b32 %f14, %r27; + mov.b32 %f15, %r28; + mov.b32 %f16, %r29; + .loc 1 33 23 + add.f32 %f17, %f1, 0f00000000; + add.f32 %f18, %f2, 0f00000000; + add.f32 %f19, %f3, 0f00000000; + add.f32 %f20, %f4, 0f00000000; + add.f32 %f21, %f5, 0f00000000; + add.f32 %f22, %f6, 0f00000000; + add.f32 %f23, %f7, 0f00000000; + add.f32 %f24, %f8, 0f00000000; + add.f32 %f25, %f9, 0f00000000; + add.f32 %f26, %f10, 0f00000000; + add.f32 %f27, %f11, 0f00000000; + add.f32 %f28, %f12, 0f00000000; + add.f32 %f29, %f13, 0f00000000; + add.f32 %f30, %f14, 0f00000000; + add.f32 %f31, %f15, 0f00000000; + add.f32 %f32, %f16, 0f00000000; + .loc 1 34 38 + selp.f32 %f33, %f29, 0f00000000, %p16; + selp.f32 %f34, %f30, 0f00000000, %p16; + selp.f32 %f35, %f31, 0f00000000, %p16; + selp.f32 %f36, %f32, 0f00000000, %p16; +$L__tmp1: + .loc 2 233 15 + add.f32 %f37, %f17, %f21; + add.f32 %f38, %f18, %f22; + add.f32 %f39, %f19, %f23; + add.f32 %f40, %f20, %f24; + add.f32 %f41, %f37, %f25; + add.f32 %f42, %f38, %f26; + add.f32 %f43, %f39, %f27; + add.f32 %f44, %f40, %f28; + add.f32 %f45, %f41, %f33; + add.f32 %f46, %f42, %f34; + add.f32 %f47, %f43, %f35; + add.f32 %f48, %f44, %f36; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r66, %f45; + shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1; + mov.b32 %f49, %r67; +$L__tmp3: + .loc 2 233 15 + add.f32 %f50, %f45, %f49; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r68, %f50; + shfl.sync.bfly.b32 %r69, %r68, 8, 31, -1; + mov.b32 %f51, %r69; +$L__tmp5: + .loc 2 233 15 + add.f32 %f52, %f50, %f51; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r70, %f46; + shfl.sync.bfly.b32 %r71, %r70, 16, 31, -1; + mov.b32 %f53, %r71; +$L__tmp7: + .loc 2 233 15 + add.f32 %f54, %f46, %f53; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r72, %f54; + shfl.sync.bfly.b32 %r73, %r72, 8, 31, -1; + mov.b32 %f55, %r73; +$L__tmp9: + .loc 2 233 15 + add.f32 %f56, %f54, %f55; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r74, %f47; + shfl.sync.bfly.b32 %r75, %r74, 16, 31, -1; + mov.b32 %f57, %r75; +$L__tmp11: + .loc 2 233 15 + add.f32 %f58, %f47, %f57; +$L__tmp12: + .loc 2 243 36 + mov.b32 %r76, %f58; + shfl.sync.bfly.b32 %r77, %r76, 8, 31, -1; + mov.b32 %f59, %r77; +$L__tmp13: + .loc 2 233 15 + add.f32 %f60, %f58, %f59; +$L__tmp14: + .loc 2 243 36 + mov.b32 %r78, %f48; + shfl.sync.bfly.b32 %r79, %r78, 16, 31, -1; + mov.b32 %f61, %r79; +$L__tmp15: + .loc 2 233 15 + add.f32 %f62, %f48, %f61; +$L__tmp16: + .loc 2 243 36 + mov.b32 %r80, %f62; + shfl.sync.bfly.b32 %r81, %r80, 8, 31, -1; + mov.b32 %f63, %r81; +$L__tmp17: + .loc 2 233 15 + add.f32 %f64, %f62, %f63; +$L__tmp18: + .loc 2 243 36 + setp.lt.u32 %p21, %r49, 8; + shl.b32 %r82, %r50, 7; + or.b32 %r83, %r82, %r54; + mov.u32 %r84, global_smem; + add.s32 %r34, %r84, %r83; + mov.b32 %r35, %f52; + @%p21 st.shared.b32 [ %r34 + 0 ], %r35; + or.b32 %r85, %r82, 32; + or.b32 %r86, %r85, %r54; + add.s32 %r36, %r84, %r86; + mov.b32 %r37, %f56; + @%p21 st.shared.b32 [ %r36 + 0 ], %r37; + or.b32 %r87, %r82, 64; + or.b32 %r88, %r87, %r54; + add.s32 %r38, %r84, %r88; + mov.b32 %r39, %f60; + @%p21 st.shared.b32 [ %r38 + 0 ], %r39; + or.b32 %r89, %r82, 96; + or.b32 %r90, %r89, %r54; + add.s32 %r40, %r84, %r90; + mov.b32 %r41, %f64; + @%p21 st.shared.b32 [ %r40 + 0 ], %r41; + bar.sync 0; + setp.lt.s32 %p25, %r48, 256; + shl.b32 %r91, %r48, 2; + add.s32 %r43, %r84, %r91; + @%p25 ld.shared.b32 %r42, [ %r43 + 0 ]; + mov.b32 %f65, %r42; + shfl.sync.bfly.b32 %r92, %r42, 4, 31, -1; + mov.b32 %f66, %r92; +$L__tmp19: + .loc 2 233 15 + add.f32 %f67, %f65, %f66; +$L__tmp20: + .loc 2 243 36 + mov.b32 %r93, %f67; + shfl.sync.bfly.b32 %r94, %r93, 2, 31, -1; + mov.b32 %f68, %r94; +$L__tmp21: + .loc 2 233 15 + add.f32 %f69, %f67, %f68; +$L__tmp22: + .loc 2 243 36 + mov.b32 %r95, %f69; + shfl.sync.bfly.b32 %r96, %r95, 1, 31, -1; + mov.b32 %f70, %r96; +$L__tmp23: + .loc 2 233 15 + add.f32 %f71, %f69, %f70; +$L__tmp24: + .loc 2 243 36 + setp.eq.s32 %p29, %r50, 0; + and.pred %p26, %p25, %p29; + mov.b32 %r45, %f71; + @%p26 st.shared.b32 [ %r43 + 0 ], %r45; + bar.sync 0; + add.s32 %r97, %r84, %r82; + ld.shared.f32 %f72, [%r97]; + add.s32 %r98, %r84, %r85; + ld.shared.f32 %f73, [%r98]; + add.s32 %r99, %r84, %r87; + ld.shared.f32 %f74, [%r99]; + add.s32 %r100, %r84, %r89; + ld.shared.f32 %f75, [%r100]; +$L__tmp25: + .loc 1 35 28 + bar.sync 0; + shl.b32 %r101, %r50, 4; + add.s32 %r102, %r84, %r101; + st.shared.f32 [%r102], %f72; + st.shared.f32 [%r102+4], %f73; + st.shared.f32 [%r102+8], %f74; + st.shared.f32 [%r102+12], %f75; + bar.sync 0; + shl.b32 %r103, %r49, 2; + add.s32 %r104, %r84, %r103; + .loc 1 36 20 + shr.s32 %r106, %r59, 31; + shr.u32 %r107, %r106, 24; + add.s32 %r108, %r59, %r107; + shr.s32 %r109, %r108, 8; + and.b32 %r110, %r108, -256; + sub.s32 %r111, %r59, %r110; + .loc 1 38 30 + mul.wide.s32 %rd15, %r109, 8; + add.s64 %rd6, %rd9, %rd15; + .loc 1 45 55 + ld.shared.u32 %r47, [%r104]; + .loc 1 38 35 + mov.u64 %rd5, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd5 }, [ %rd6 + 0 ]; + .loc 1 41 32 + shr.u64 %rd16, %rd5, 54; + and.b64 %rd17, %rd16, 512; + add.s64 %rd18, %rd17, %rd5; + .loc 1 45 30 + shl.b64 %rd19, %rd18, 10; + add.s64 %rd20, %rd10, %rd19; + mul.wide.s32 %rd21, %r111, 4; + add.s64 %rd7, %rd20, %rd21; + .loc 1 45 55 + setp.eq.s32 %p28, %r52, 0; + mov.u32 %r46, 0x0; + @%p28 atom.global.gpu.acq_rel.add.f32 %r46, [ %rd7 + 0 ], %r47; + .loc 1 45 4 + ret; +$L__tmp26: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 264 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 54 +.b8 105 +.b8 107 +.b8 53 +.b8 118 +.b8 120 +.b8 55 +.b8 112 +.b8 50 +.b8 50 +.b8 102 +.b8 112 +.b8 107 +.b8 52 +.b8 100 +.b8 99 +.b8 118 +.b8 104 +.b8 53 +.b8 53 +.b8 122 +.b8 105 +.b8 109 +.b8 119 +.b8 52 +.b8 116 +.b8 53 +.b8 110 +.b8 114 +.b8 53 +.b8 122 +.b8 110 +.b8 50 +.b8 98 +.b8 55 +.b8 105 +.b8 110 +.b8 117 +.b8 106 +.b8 120 +.b8 106 +.b8 97 +.b8 117 +.b8 120 +.b8 115 +.b8 104 +.b8 108 +.b8 106 +.b8 117 +.b8 109 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 54 +.b8 105 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp24 +.b8 2 +.b8 35 +.b8 25 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp24 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp25 +.b8 2 +.b8 35 +.b8 25 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.llir b/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..a4ba494c14da123a9d9ec428c261d8eba3e086a4 --- /dev/null +++ b/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.llir @@ -0,0 +1,162 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8] + +define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 { + %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %5 = and i32 %4, 127, !dbg !8 + %6 = shl nuw nsw i32 %5, 3, !dbg !8 + %7 = shl nuw nsw i32 %5, 2, !dbg !8 + %8 = or i32 %7, 512, !dbg !8 + %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !9 + %10 = shl i32 %9, 10, !dbg !10 + %11 = or i32 %10, %6, !dbg !11 + %12 = or i32 %10, %7, !dbg !11 + %13 = or i32 %10, %8, !dbg !11 + %14 = sext i32 %11 to i64, !dbg !12 + %15 = getelementptr i16, ptr addrspace(1) %0, i64 %14, !dbg !12 + %16 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %15, i1 true) #2, !dbg !13 + %17 = extractvalue { i32, i32, i32, i32 } %16, 0, !dbg !13 + %18 = extractvalue { i32, i32, i32, i32 } %16, 1, !dbg !13 + %19 = extractvalue { i32, i32, i32, i32 } %16, 2, !dbg !13 + %20 = extractvalue { i32, i32, i32, i32 } %16, 3, !dbg !13 + %21 = trunc i32 %17 to i16, !dbg !13 + %extelt.offset = lshr i32 %17, 16, !dbg !13 + %22 = trunc i32 %extelt.offset to i16, !dbg !13 + %23 = trunc i32 %18 to i16, !dbg !13 + %extelt.offset1 = lshr i32 %18, 16, !dbg !13 + %24 = trunc i32 %extelt.offset1 to i16, !dbg !13 + %25 = trunc i32 %19 to i16, !dbg !13 + %extelt.offset2 = lshr i32 %19, 16, !dbg !13 + %26 = trunc i32 %extelt.offset2 to i16, !dbg !13 + %27 = trunc i32 %20 to i16, !dbg !13 + %extelt.offset3 = lshr i32 %20, 16, !dbg !13 + %28 = trunc i32 %extelt.offset3 to i16, !dbg !13 + %29 = zext nneg i32 %6 to i64, !dbg !14 + %30 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %29, !dbg !14 + %31 = insertelement <1 x i16> undef, i16 %21, i64 0, !dbg !14 + store <1 x i16> %31, ptr addrspace(3) %30, align 2, !dbg !14 + %32 = or i32 %6, 1, !dbg !14 + %33 = zext nneg i32 %32 to i64, !dbg !14 + %34 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %33, !dbg !14 + %35 = insertelement <1 x i16> undef, i16 %22, i64 0, !dbg !14 + store <1 x i16> %35, ptr addrspace(3) %34, align 2, !dbg !14 + %36 = or i32 %6, 2, !dbg !14 + %37 = zext nneg i32 %36 to i64, !dbg !14 + %38 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %37, !dbg !14 + %39 = insertelement <1 x i16> undef, i16 %23, i64 0, !dbg !14 + store <1 x i16> %39, ptr addrspace(3) %38, align 2, !dbg !14 + %40 = or i32 %6, 3, !dbg !14 + %41 = zext nneg i32 %40 to i64, !dbg !14 + %42 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %41, !dbg !14 + %43 = insertelement <1 x i16> undef, i16 %24, i64 0, !dbg !14 + store <1 x i16> %43, ptr addrspace(3) %42, align 2, !dbg !14 + %44 = or i32 %6, 4, !dbg !14 + %45 = zext nneg i32 %44 to i64, !dbg !14 + %46 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %45, !dbg !14 + %47 = insertelement <1 x i16> undef, i16 %25, i64 0, !dbg !14 + store <1 x i16> %47, ptr addrspace(3) %46, align 2, !dbg !14 + %48 = or i32 %6, 5, !dbg !14 + %49 = zext nneg i32 %48 to i64, !dbg !14 + %50 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %49, !dbg !14 + %51 = insertelement <1 x i16> undef, i16 %26, i64 0, !dbg !14 + store <1 x i16> %51, ptr addrspace(3) %50, align 2, !dbg !14 + %52 = or i32 %6, 6, !dbg !14 + %53 = zext nneg i32 %52 to i64, !dbg !14 + %54 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %53, !dbg !14 + %55 = insertelement <1 x i16> undef, i16 %27, i64 0, !dbg !14 + store <1 x i16> %55, ptr addrspace(3) %54, align 2, !dbg !14 + %56 = or i32 %6, 7, !dbg !14 + %57 = zext nneg i32 %56 to i64, !dbg !14 + %58 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %57, !dbg !14 + %59 = insertelement <1 x i16> undef, i16 %28, i64 0, !dbg !14 + store <1 x i16> %59, ptr addrspace(3) %58, align 2, !dbg !14 + tail call void @llvm.nvvm.barrier0(), !dbg !14 + %60 = zext nneg i32 %7 to i64, !dbg !14 + %61 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %60, !dbg !14 + %62 = load i16, ptr addrspace(3) %61, align 2, !dbg !14 + %63 = or i32 %7, 1, !dbg !14 + %64 = zext nneg i32 %63 to i64, !dbg !14 + %65 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %64, !dbg !14 + %66 = load i16, ptr addrspace(3) %65, align 2, !dbg !14 + %67 = or i32 %7, 2, !dbg !14 + %68 = zext nneg i32 %67 to i64, !dbg !14 + %69 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %68, !dbg !14 + %70 = load i16, ptr addrspace(3) %69, align 2, !dbg !14 + %71 = or i32 %7, 3, !dbg !14 + %72 = zext nneg i32 %71 to i64, !dbg !14 + %73 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %72, !dbg !14 + %74 = load i16, ptr addrspace(3) %73, align 2, !dbg !14 + %75 = zext nneg i32 %8 to i64, !dbg !14 + %76 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %75, !dbg !14 + %77 = load i16, ptr addrspace(3) %76, align 2, !dbg !14 + %78 = or i32 %7, 513, !dbg !14 + %79 = zext nneg i32 %78 to i64, !dbg !14 + %80 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %79, !dbg !14 + %81 = load i16, ptr addrspace(3) %80, align 2, !dbg !14 + %82 = or i32 %7, 514, !dbg !14 + %83 = zext nneg i32 %82 to i64, !dbg !14 + %84 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %83, !dbg !14 + %85 = load i16, ptr addrspace(3) %84, align 2, !dbg !14 + %86 = or i32 %7, 515, !dbg !14 + %87 = zext nneg i32 %86 to i64, !dbg !14 + %88 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %87, !dbg !14 + %89 = load i16, ptr addrspace(3) %88, align 2, !dbg !14 + %90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %62) #2, !dbg !14 + %91 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %66) #2, !dbg !14 + %92 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #2, !dbg !14 + %93 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %74) #2, !dbg !14 + %94 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %77) #2, !dbg !14 + %95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %81) #2, !dbg !14 + %96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #2, !dbg !14 + %97 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %89) #2, !dbg !14 + %98 = sext i32 %12 to i64, !dbg !15 + %99 = getelementptr float, ptr addrspace(1) %1, i64 %98, !dbg !15 + %100 = sext i32 %13 to i64, !dbg !15 + %101 = getelementptr float, ptr addrspace(1) %1, i64 %100, !dbg !15 + %102 = bitcast float %90 to i32, !dbg !16 + %103 = bitcast float %91 to i32, !dbg !16 + %104 = bitcast float %92 to i32, !dbg !16 + %105 = bitcast float %93 to i32, !dbg !16 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %102, i32 %103, i32 %104, i32 %105, ptr addrspace(1) %99, i1 true) #2, !dbg !16 + %106 = bitcast float %94 to i32, !dbg !16 + %107 = bitcast float %95 to i32, !dbg !16 + %108 = bitcast float %96 to i32, !dbg !16 + %109 = bitcast float %97 to i32, !dbg !16 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %106, i32 %107, i32 %108, i32 %109, ptr addrspace(1) %101, i1 true) #2, !dbg !16 + ret void, !dbg !17 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #1 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "cotbhet37v6mh5samql7uxre3hprpnbhuvim3fmrjpq5fgg6lwbi.py", directory: "/tmp/torchinductor_root/ot") +!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128} +!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 36, scope: !5) +!9 = !DILocation(line: 20, column: 28, scope: !5) +!10 = !DILocation(line: 20, column: 33, scope: !5) +!11 = !DILocation(line: 21, column: 23, scope: !5) +!12 = !DILocation(line: 24, column: 30, scope: !5) +!13 = !DILocation(line: 24, column: 35, scope: !5) +!14 = !DILocation(line: 24, column: 44, scope: !5) +!15 = !DILocation(line: 26, column: 25, scope: !5) +!16 = !DILocation(line: 26, column: 36, scope: !5) +!17 = !DILocation(line: 26, column: 4, scope: !5) diff --git a/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ptx b/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..873c0ca013828443532e105c9793a002a07dc72f --- /dev/null +++ b/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ptx @@ -0,0 +1,338 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2de +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2de( + .param .u64 triton__0d1d2de_param_0, + .param .u64 triton__0d1d2de_param_1, + .param .u32 triton__0d1d2de_param_2 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<4>; + .reg .b16 %rs<9>; + .reg .b32 %r<37>; + .reg .b64 %rd<13>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd4, [triton__0d1d2de_param_0]; + ld.param.u64 %rd5, [triton__0d1d2de_param_1]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r22, %tid.x; + and.b32 %r23, %r22, 127; + shl.b32 %r24, %r23, 3; + shl.b32 %r25, %r23, 2; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r26, %r1, 10; + .loc 1 21 23 + or.b32 %r27, %r26, %r24; + or.b32 %r28, %r26, %r25; + .loc 1 24 30 + mul.wide.s32 %rd6, %r27, 2; + add.s64 %rd1, %rd4, %rd6; + mov.pred %p1, -1; + .loc 1 24 35 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + shr.u32 %r29, %r2, 16; + shr.u32 %r30, %r3, 16; + shr.u32 %r31, %r4, 16; + shr.u32 %r32, %r5, 16; + .loc 1 24 44 + shl.b32 %r33, %r23, 4; + mov.u32 %r34, global_smem; + add.s32 %r35, %r34, %r33; + st.shared.u16 [%r35], %r2; + st.shared.u16 [%r35+2], %r29; + st.shared.u16 [%r35+4], %r3; + st.shared.u16 [%r35+6], %r30; + st.shared.u16 [%r35+8], %r4; + st.shared.u16 [%r35+10], %r31; + st.shared.u16 [%r35+12], %r5; + st.shared.u16 [%r35+14], %r32; + bar.sync 0; + add.s32 %r36, %r34, %r24; + ld.shared.u16 %rs1, [%r36]; + ld.shared.u16 %rs2, [%r36+2]; + ld.shared.u16 %rs3, [%r36+4]; + ld.shared.u16 %rs4, [%r36+6]; + ld.shared.u16 %rs5, [%r36+1024]; + ld.shared.u16 %rs6, [%r36+1026]; + ld.shared.u16 %rs7, [%r36+1028]; + ld.shared.u16 %rs8, [%r36+1030]; + cvt.f32.bf16 %r14, %rs1; + cvt.f32.bf16 %r15, %rs2; + cvt.f32.bf16 %r16, %rs3; + cvt.f32.bf16 %r17, %rs4; + cvt.f32.bf16 %r18, %rs5; + cvt.f32.bf16 %r19, %rs6; + cvt.f32.bf16 %r20, %rs7; + cvt.f32.bf16 %r21, %rs8; + .loc 1 26 25 + mul.wide.s32 %rd7, %r28, 4; + add.s64 %rd2, %rd5, %rd7; + cvt.s64.s32 %rd8, %r26; + cvt.u64.u32 %rd9, %r25; + or.b64 %rd10, %rd8, %rd9; + shl.b64 %rd11, %rd10, 2; + add.s64 %rd12, %rd5, %rd11; + add.s64 %rd3, %rd12, 2048; + .loc 1 26 36 + @%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r14, %r15, %r16, %r17 }; + @%p1 st.global.v4.b32 [ %rd3 + 0 ], { %r18, %r19, %r20, %r21 }; + .loc 1 26 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/ot/cotbhet37v6mh5samql7uxre3hprpnbhuvim3fmrjpq5fgg6lwbi.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 176 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 111 +.b8 116 +.b8 98 +.b8 104 +.b8 101 +.b8 116 +.b8 51 +.b8 55 +.b8 118 +.b8 54 +.b8 109 +.b8 104 +.b8 53 +.b8 115 +.b8 97 +.b8 109 +.b8 113 +.b8 108 +.b8 55 +.b8 117 +.b8 120 +.b8 114 +.b8 101 +.b8 51 +.b8 104 +.b8 112 +.b8 114 +.b8 112 +.b8 110 +.b8 98 +.b8 104 +.b8 117 +.b8 118 +.b8 105 +.b8 109 +.b8 51 +.b8 102 +.b8 109 +.b8 114 +.b8 106 +.b8 112 +.b8 113 +.b8 53 +.b8 102 +.b8 103 +.b8 103 +.b8 54 +.b8 108 +.b8 119 +.b8 98 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 111 +.b8 116 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttgir b/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..8dfc219ffbb3021e19f3b35e9be96086e23c9c4b --- /dev/null +++ b/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttgir @@ -0,0 +1,24 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1> + %4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked> + %5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1> + %6 = arith.addi %4, %2 : tensor<1024xi32, #blocked> + %7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1> + %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked> + %9 = tt.addptr %8, %6 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %10 = tt.load %9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked> + %11 = triton_gpu.convert_layout %10 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1> + %12 = arith.extf %11 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1> + %13 = tt.splat %arg1 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked1> + %14 = tt.addptr %13, %7 : tensor<1024x!tt.ptr, #blocked1>, tensor<1024xi32, #blocked1> + tt.store %14, %12 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1> + tt.return + } +} diff --git a/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttir b/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..d2a9a5cb9370160003226fec8c61be7abfe7c35e --- /dev/null +++ b/.triton/dump/15fa39c568de5a2b912a7bda93a479b3/triton_.ttir @@ -0,0 +1,18 @@ +module { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32> + %4 = arith.addi %3, %2 : tensor<1024xi32> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr> + %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16> + %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<1024x!tt.ptr> + %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> + tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32> + tt.return + } +} diff --git a/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.cubin b/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..af30cdbd81db00a48d593f6e50c6169f9222421b Binary files /dev/null and b/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.cubin differ diff --git a/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ptx b/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..602a76a8390065d8fb6a1d457db6e104518bb183 --- /dev/null +++ b/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ptx @@ -0,0 +1,756 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5de6de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5de6de( + .param .u64 triton__0d1d2d3d4d5de6de_param_0, + .param .u64 triton__0d1d2d3d4d5de6de_param_1, + .param .u64 triton__0d1d2d3d4d5de6de_param_2, + .param .u64 triton__0d1d2d3d4d5de6de_param_3, + .param .u64 triton__0d1d2d3d4d5de6de_param_4, + .param .u32 triton__0d1d2d3d4d5de6de_param_5, + .param .u32 triton__0d1d2d3d4d5de6de_param_6 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<27>; + .reg .b16 %rs<3>; + .reg .b32 %r<81>; + .reg .f32 %f<73>; + .reg .b64 %rd<84>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd35, [triton__0d1d2d3d4d5de6de_param_3]; + ld.param.u64 %rd34, [triton__0d1d2d3d4d5de6de_param_2]; + ld.param.u64 %rd33, [triton__0d1d2d3d4d5de6de_param_1]; + ld.param.u64 %rd41, [triton__0d1d2d3d4d5de6de_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + bfe.u32 %r2, %r1, 2, 6; + and.b32 %r14, %r1, 63; + .loc 1 24 33 + and.b32 %r3, %r1, 3; + .loc 1 21 28 + mov.u32 %r13, %ctaid.x; + .loc 1 21 33 + shl.b32 %r15, %r13, 6; + .loc 1 22 23 + or.b32 %r16, %r15, %r2; + or.b32 %r17, %r15, %r14; + .loc 1 26 30 + mul.wide.s32 %rd42, %r16, 8; + add.s64 %rd38, %rd41, %rd42; + mul.wide.s32 %rd43, %r17, 8; + add.s64 %rd40, %rd41, %rd43; + mov.pred %p11, -1; + .loc 1 26 35 + mov.u64 %rd37, 0x0; + @%p11 ld.global.L1::evict_last.b64 { %rd37 }, [ %rd38 + 0 ]; + mov.u64 %rd39, 0x0; + @%p11 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd40 + 0 ]; + .loc 1 27 18 + bfe.s32 %r18, %r13, 25, 1; + shr.u32 %r19, %r18, 23; + add.s32 %r20, %r16, %r19; + and.b32 %r21, %r20, 16776704; + sub.s32 %r22, %r16, %r21; + .loc 1 35 44 + shl.b32 %r5, %r22, 8; + .loc 1 36 22 + add.s64 %rd44, %rd39, 50257; + .loc 1 37 22 + setp.lt.s64 %p3, %rd37, 0; + setp.lt.s64 %p4, %rd39, 0; + .loc 1 38 36 + selp.b64 %rd45, %rd44, %rd39, %p4; + .loc 1 39 40 + setp.gt.u64 %p5, %rd45, 50256; + .loc 1 40 44 + shl.b64 %rd46, %rd37, 8; + add.s64 %rd47, %rd46, 12865792; + selp.b64 %rd2, %rd47, %rd46, %p3; + mov.b32 %r67, 0; + mov.b32 %r77, 883; + mov.u64 %rd73, 1; + .loc 1 39 55 + @%p5 bra $L__BB0_3; + bra.uni $L__BB0_1; +$L__BB0_3: + .loc 1 31 36 + shl.b64 %rd51, %rd2, 2; + mul.wide.u32 %rd80, %r3, 4; + add.s64 %rd79, %rd51, %rd80; + add.s64 %rd75, %rd33, %rd79; + add.s32 %r35, %r5, %r3; + mul.wide.s32 %rd78, %r35, 4; + add.s64 %rd74, %rd34, %rd78; + mov.f32 %f72, 0f00000000; + mov.b32 %r78, -4; + mov.f32 %f71, %f72; + mov.f32 %f70, %f72; +$L__BB0_4: + .loc 1 35 50 + mov.u32 %r36, 0x0; + @%p11 ld.global.L1::evict_last.b32 { %r36 }, [ %rd74 + 0 ]; + @!%p11 mov.u32 %r36, %r67; + mov.b32 %f28, %r36; + .loc 1 39 55 + mov.u64 %rd54, assertMessage_0; + cvta.global.u64 %rd55, %rd54; + mov.u64 %rd56, assertFile_0; + cvta.global.u64 %rd57, %rd56; + mov.u64 %rd58, assertFunc_0; + cvta.global.u64 %rd59, %rd58; + { // callseq 10, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd55; + .param .b64 param1; + st.param.b64 [param1+0], %rd57; + .param .b32 param2; + st.param.b32 [param2+0], %r77; + .param .b64 param3; + st.param.b64 [param3+0], %rd59; + .param .b64 param4; + st.param.b64 [param4+0], %rd73; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 10 + .loc 1 40 52 + mov.u32 %r38, 0x0; + @%p11 ld.global.L1::evict_last.b32 { %r38 }, [ %rd75 + 0 ]; + @!%p11 mov.u32 %r38, %r67; + mov.b32 %f29, %r38; + .loc 1 41 22 + add.f32 %f30, %f28, %f29; +$L__tmp1: + .loc 2 96 20 + sub.f32 %f31, %f30, %f70; + .loc 2 97 26 + add.f32 %f72, %f72, 0f3F800000; + .loc 2 98 30 + mov.b32 %r41, %f31; + mov.b32 %r42, %f72; + div.full.f32 %r40, %r41, %r42; + mov.b32 %f32, %r40; + .loc 2 98 22 + add.f32 %f70, %f70, %f32; + .loc 2 101 30 + sub.f32 %f33, %f30, %f70; +$L__tmp2: + .loc 1 47 48 + fma.rn.f32 %f71, %f31, %f33, %f71; + .loc 1 31 36 + add.s32 %r78, %r78, 4; + add.s64 %rd75, %rd75, 16; + add.s64 %rd74, %rd74, 16; + setp.lt.u32 %p15, %r78, 252; + @%p15 bra $L__BB0_4; + bra.uni $L__BB0_5; +$L__BB0_1: + .loc 1 0 36 + mov.b32 %r79, -4; + .loc 1 31 36 + shl.b64 %rd48, %rd2, 2; + mul.wide.u32 %rd80, %r3, 4; + add.s64 %rd79, %rd48, %rd80; + add.s64 %rd77, %rd33, %rd79; + add.s32 %r25, %r5, %r3; + mul.wide.s32 %rd78, %r25, 4; + add.s64 %rd76, %rd34, %rd78; + mov.f32 %f72, 0f00000000; + mov.f32 %f71, %f72; + mov.f32 %f70, %f72; +$L__BB0_2: + .loc 1 35 50 + mov.u32 %r26, 0x0; + @%p11 ld.global.L1::evict_last.b32 { %r26 }, [ %rd76 + 0 ]; + @!%p11 mov.u32 %r26, %r67; + mov.b32 %f21, %r26; + .loc 1 40 52 + mov.u32 %r28, 0x0; + @%p11 ld.global.L1::evict_last.b32 { %r28 }, [ %rd77 + 0 ]; + @!%p11 mov.u32 %r28, %r67; + mov.b32 %f22, %r28; + .loc 1 41 22 + add.f32 %f23, %f21, %f22; +$L__tmp3: + .loc 2 96 20 + sub.f32 %f24, %f23, %f70; + .loc 2 97 26 + add.f32 %f72, %f72, 0f3F800000; + .loc 2 98 30 + mov.b32 %r31, %f24; + mov.b32 %r32, %f72; + div.full.f32 %r30, %r31, %r32; + mov.b32 %f25, %r30; + .loc 2 98 22 + add.f32 %f70, %f70, %f25; + .loc 2 101 30 + sub.f32 %f26, %f23, %f70; +$L__tmp4: + .loc 1 47 48 + fma.rn.f32 %f71, %f24, %f26, %f71; + .loc 1 31 36 + add.s32 %r79, %r79, 4; + add.s64 %rd77, %rd77, 16; + add.s64 %rd76, %rd76, 16; + setp.lt.u32 %p10, %r79, 252; + @%p10 bra $L__BB0_2; +$L__BB0_5: + .loc 1 0 36 + ld.param.u64 %rd36, [triton__0d1d2d3d4d5de6de_param_4]; +$L__tmp5: + .loc 2 120 46 + mov.b32 %r54, %f70; + shfl.sync.bfly.b32 %r55, %r54, 2, 31, -1; + mov.b32 %f34, %r55; + mov.b32 %r56, %f71; + shfl.sync.bfly.b32 %r57, %r56, 2, 31, -1; + mov.b32 %f35, %r57; + mov.b32 %r58, %f72; + shfl.sync.bfly.b32 %r45, %r58, 2, 31, -1; + mov.b32 %f36, %r45; +$L__tmp6: + .loc 2 108 21 + sub.f32 %f37, %f34, %f70; + .loc 2 109 28 + add.f32 %f38, %f72, %f36; + .loc 2 110 39 + setp.eq.f32 %p16, %f38, 0f00000000; + .loc 2 110 60 + mov.b32 %r46, %f38; + div.full.f32 %r44, %r45, %r46; + mov.b32 %f39, %r44; + .loc 2 110 49 + selp.f32 %f40, 0f00000000, %f39, %p16; + .loc 2 112 17 + fma.rn.f32 %f41, %f37, %f40, %f70; + .loc 2 113 15 + add.f32 %f42, %f71, %f35; + .loc 2 113 30 + mul.f32 %f43, %f37, %f37; + .loc 2 113 38 + mul.f32 %f44, %f72, %f43; + .loc 2 113 22 + fma.rn.f32 %f45, %f44, %f40, %f42; +$L__tmp7: + .loc 2 120 46 + mov.b32 %r59, %f41; + shfl.sync.bfly.b32 %r60, %r59, 1, 31, -1; + mov.b32 %f46, %r60; + mov.b32 %r61, %f45; + shfl.sync.bfly.b32 %r62, %r61, 1, 31, -1; + mov.b32 %f47, %r62; + shfl.sync.bfly.b32 %r48, %r46, 1, 31, -1; + mov.b32 %f48, %r48; +$L__tmp8: + .loc 2 108 21 + sub.f32 %f49, %f46, %f41; + .loc 2 109 28 + add.f32 %f50, %f38, %f48; + .loc 2 110 39 + setp.eq.f32 %p17, %f50, 0f00000000; + .loc 2 110 60 + mov.b32 %r49, %f50; + div.full.f32 %r47, %r48, %r49; + mov.b32 %f51, %r47; + .loc 2 110 49 + selp.f32 %f52, 0f00000000, %f51, %p17; + .loc 2 112 17 + fma.rn.f32 %f16, %f49, %f52, %f41; + .loc 2 113 15 + add.f32 %f53, %f45, %f47; + .loc 2 113 30 + mul.f32 %f54, %f49, %f49; + .loc 2 113 38 + mul.f32 %f55, %f38, %f54; + .loc 2 113 22 + fma.rn.f32 %f56, %f52, %f55, %f53; +$L__tmp9: + .loc 1 69 23 + mov.b32 %r51, %f56; + mov.b32 %r52, 1132462080; + div.full.f32 %r50, %r51, %r52; + mov.b32 %f57, %r50; + .loc 1 71 24 + add.f32 %f17, %f57, 0f3727C5AC; + .loc 1 55 36 + shl.b32 %r63, %r13, 14; + shl.b32 %r64, %r2, 8; + or.b32 %r65, %r63, %r64; + or.b32 %r10, %r65, %r3; + add.s64 %rd83, %rd33, %rd79; + add.s64 %rd82, %rd35, %rd80; + add.s64 %rd81, %rd34, %rd78; + mov.b32 %r80, -4; + setp.lt.u64 %p22, %rd45, 50257; + rsqrt.approx.ftz.f32 %f61, %f17; + bra.uni $L__BB0_6; +$L__BB0_8: + .loc 1 0 0 + mov.b32 %f18, %r66; + mov.b32 %f19, %r68; + .loc 1 65 54 + mov.u32 %r71, 0x0; + @%p11 ld.global.L1::evict_first.b32 { %r71 }, [ %rd83 + 0 ]; + @!%p11 mov.u32 %r71, %r67; + mov.b32 %f58, %r71; + .loc 1 66 24 + add.f32 %f59, %f18, %f58; + .loc 1 67 24 + sub.f32 %f60, %f59, %f16; + .loc 1 73 24 + mul.f32 %f62, %f60, %f61; + .loc 1 74 24 + mul.f32 %f63, %f62, %f19; + .loc 1 55 36 + add.s32 %r80, %r80, 4; + .loc 1 76 29 + add.s32 %r74, %r80, %r10; + mul.wide.s32 %rd72, %r74, 2; + add.s64 %rd71, %rd36, %rd72; + .loc 1 76 52 + mov.b32 %r73, %f63; + cvt.rn.bf16.f32 %rs1, %r73; + @%p11 st.global.b16 [ %rd71 + 0 ], { %rs1 }; + .loc 1 55 36 + add.s64 %rd83, %rd83, 16; + add.s64 %rd82, %rd82, 16; + add.s64 %rd81, %rd81, 16; + setp.lt.u32 %p26, %r80, 252; + @%p26 bra $L__BB0_6; + bra.uni $L__BB0_9; +$L__BB0_6: + .loc 1 59 51 + mov.u32 %r66, 0x0; + @%p11 ld.global.L1::evict_last.b32 { %r66 }, [ %rd81 + 0 ]; + @!%p11 mov.u32 %r66, %r67; + .loc 1 60 40 + mov.u32 %r68, 0x0; + @%p11 ld.global.L1::evict_last.b32 { %r68 }, [ %rd82 + 0 ]; + @!%p11 mov.u32 %r68, %r67; + .loc 1 64 57 + @%p22 bra $L__BB0_8; + mov.u64 %rd63, assertMessage_1; + cvta.global.u64 %rd64, %rd63; + mov.u64 %rd65, assertFile_1; + cvta.global.u64 %rd66, %rd65; + mov.u64 %rd67, assertFunc_1; + cvta.global.u64 %rd68, %rd67; + { // callseq 11, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd64; + .param .b64 param1; + st.param.b64 [param1+0], %rd66; + .param .b32 param2; + st.param.b32 [param2+0], %r77; + .param .b64 param3; + st.param.b64 [param3+0], %rd68; + .param .b64 param4; + st.param.b64 [param4+0], %rd73; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 11 + bra.uni $L__BB0_8; +$L__BB0_9: + .loc 1 55 4 + ret; +$L__tmp10: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/gx/cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 298 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 103 +.b8 120 +.b8 53 +.b8 108 +.b8 120 +.b8 112 +.b8 117 +.b8 101 +.b8 120 +.b8 112 +.b8 105 +.b8 110 +.b8 100 +.b8 106 +.b8 52 +.b8 100 +.b8 115 +.b8 109 +.b8 106 +.b8 122 +.b8 53 +.b8 120 +.b8 52 +.b8 50 +.b8 117 +.b8 104 +.b8 121 +.b8 121 +.b8 55 +.b8 105 +.b8 115 +.b8 107 +.b8 101 +.b8 118 +.b8 113 +.b8 55 +.b8 111 +.b8 118 +.b8 122 +.b8 112 +.b8 119 +.b8 97 +.b8 103 +.b8 98 +.b8 51 +.b8 116 +.b8 53 +.b8 112 +.b8 111 +.b8 119 +.b8 106 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 103 +.b8 120 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp4 +.b8 2 +.b8 44 +.b8 38 +.b8 4 +.b32 125 +.b64 $L__tmp5 +.b64 $L__tmp8 +.b8 2 +.b8 50 +.b8 41 +.b8 5 +.b32 125 +.b64 $L__tmp6 +.b64 $L__tmp9 +.b8 2 +.b8 50 +.b8 41 +.b8 4 +.b32 125 +.b64 $L__tmp6 +.b64 $L__tmp9 +.b8 2 +.b8 120 +.b8 46 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 302 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 302 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttgir b/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..2bfb2534267e2d16953e769c87c8995d28982d84 --- /dev/null +++ b/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttgir @@ -0,0 +1,141 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 4], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked> + %cst_0 = arith.constant dense<256> : tensor<64x1xi32, #blocked> + %cst_1 = arith.constant dense<256> : tensor<64x1xi64, #blocked> + %cst_2 = arith.constant dense<0> : tensor<64x1xi64, #blocked> + %cst_3 = arith.constant dense<50257> : tensor<64x1xi64, #blocked> + %cst_4 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1> + %cst_5 = arith.constant dense<0> : tensor<64x1xi64, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %c4_i32 = arith.constant 4 : i32 + %c256_i32 = arith.constant 256 : i32 + %cst_6 = arith.constant dense<1.000000e+00> : tensor<64x4xf32, #blocked> + %cst_7 = arith.constant 0.000000e+00 : f32 + %cst_8 = arith.constant dense<0.000000e+00> : tensor<1x4xf32, #blocked> + %cst_9 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked> + %cst_10 = arith.constant dense<256> : tensor<1x4xi32, #blocked> + %cst_11 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked> + %cst_12 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1> + %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked> + %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1> + %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked> + %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1> + %10 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x4xi32, #blocked> + %12 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked> + %13 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked1> + %14 = tt.addptr %12, %8 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> + %15 = tt.addptr %13, %9 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> + %16 = tt.load %14 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked> + %17 = tt.load %15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1> + %18 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked> + %19 = arith.muli %18, %cst_0 : tensor<64x1xi32, #blocked> + %20 = tt.broadcast %19 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked> + %21 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x4x!tt.ptr, #blocked> + %22 = arith.addi %16, %cst_3 : tensor<64x1xi64, #blocked> + %23 = arith.addi %17, %cst_4 : tensor<64x1xi64, #blocked1> + %24 = arith.cmpi slt, %16, %cst_2 : tensor<64x1xi64, #blocked> + %25 = arith.cmpi slt, %17, %cst_5 : tensor<64x1xi64, #blocked1> + %26 = arith.select %24, %22, %16 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked> + %27 = arith.select %25, %23, %17 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1> + %28 = arith.cmpi sge, %27, %cst_5 : tensor<64x1xi64, #blocked1> + %29 = arith.cmpi slt, %27, %cst_4 : tensor<64x1xi64, #blocked1> + %30 = arith.andi %28, %29 : tensor<64x1xi1, #blocked1> + %31 = arith.muli %26, %cst_1 : tensor<64x1xi64, #blocked> + %32 = tt.broadcast %31 : (tensor<64x1xi64, #blocked>) -> tensor<64x4xi64, #blocked> + %33 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x4x!tt.ptr, #blocked> + %34:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg8 = %cst_9, %arg9 = %cst_9, %arg10 = %cst_9) -> (tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>) : i32 { + %45 = tt.splat %arg7 : (i32) -> tensor<1x4xi32, #blocked> + %46 = arith.addi %45, %11 : tensor<1x4xi32, #blocked> + %47 = arith.cmpi slt, %46, %cst_10 : tensor<1x4xi32, #blocked> + %48 = tt.broadcast %46 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked> + %49 = arith.addi %48, %20 : tensor<64x4xi32, #blocked> + %50 = tt.addptr %21, %49 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi32, #blocked> + %51 = tt.broadcast %47 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked> + %52 = tt.load %50, %51, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked> + tt.assert %30, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1> + %53 = arith.extsi %46 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked> + %54 = tt.broadcast %53 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked> + %55 = arith.addi %54, %32 : tensor<64x4xi64, #blocked> + %56 = tt.addptr %33, %55 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi64, #blocked> + %57 = tt.load %56, %51, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked> + %58 = arith.addf %57, %52 : tensor<64x4xf32, #blocked> + %59 = arith.subf %58, %arg8 : tensor<64x4xf32, #blocked> + %60 = arith.addf %arg10, %cst_6 : tensor<64x4xf32, #blocked> + %61 = arith.divf %59, %60 : tensor<64x4xf32, #blocked> + %62 = arith.addf %arg8, %61 : tensor<64x4xf32, #blocked> + %63 = arith.subf %58, %62 : tensor<64x4xf32, #blocked> + %64 = arith.mulf %59, %63 : tensor<64x4xf32, #blocked> + %65 = arith.addf %arg9, %64 : tensor<64x4xf32, #blocked> + %66 = arith.select %51, %62, %arg8 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> + %67 = arith.select %51, %65, %arg9 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> + %68 = arith.select %51, %60, %arg10 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked> + scf.yield %66, %67, %68 : tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked> + } + %35:3 = "tt.reduce"(%34#0, %34#1, %34#2) <{axis = 1 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32): + %45 = arith.subf %arg10, %arg7 : f32 + %46 = arith.addf %arg9, %arg12 : f32 + %47 = arith.cmpf oeq, %46, %cst_7 : f32 + %48 = arith.divf %arg12, %46 : f32 + %49 = arith.select %47, %cst_7, %48 : f32 + %50 = arith.mulf %45, %49 : f32 + %51 = arith.addf %arg7, %50 : f32 + %52 = arith.addf %arg8, %arg11 : f32 + %53 = arith.mulf %45, %45 : f32 + %54 = arith.mulf %53, %arg9 : f32 + %55 = arith.mulf %54, %49 : f32 + %56 = arith.addf %52, %55 : f32 + tt.reduce.return %51, %56, %46 : f32, f32, f32 + }) : (tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>, tensor<64x4xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) + %36 = tt.expand_dims %35#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked> + %37 = tt.expand_dims %35#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked> + %38 = tt.splat %arg3 : (!tt.ptr) -> tensor<1x4x!tt.ptr, #blocked> + %39 = tt.broadcast %36 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked> + %40 = arith.divf %37, %cst_12 : tensor<64x1xf32, #blocked> + %41 = arith.addf %40, %cst_11 : tensor<64x1xf32, #blocked> + %42 = arith.muli %8, %cst_0 : tensor<64x1xi32, #blocked> + %43 = tt.broadcast %42 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked> + %44 = tt.splat %arg4 : (!tt.ptr) -> tensor<64x4x!tt.ptr, #blocked> + scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 : i32 { + %45 = tt.splat %arg7 : (i32) -> tensor<1x4xi32, #blocked> + %46 = arith.addi %45, %11 : tensor<1x4xi32, #blocked> + %47 = arith.cmpi slt, %46, %cst_10 : tensor<1x4xi32, #blocked> + %48 = tt.broadcast %46 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked> + %49 = arith.addi %48, %20 : tensor<64x4xi32, #blocked> + %50 = tt.addptr %21, %49 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi32, #blocked> + %51 = tt.broadcast %47 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked> + %52 = tt.load %50, %51, %cst_9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32, #blocked> + %53 = tt.addptr %38, %46 : tensor<1x4x!tt.ptr, #blocked>, tensor<1x4xi32, #blocked> + %54 = tt.load %53, %47, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32, #blocked> + tt.assert %30, "index out of bounds: 0 <= tmp13 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1> + %55 = arith.extsi %46 : tensor<1x4xi32, #blocked> to tensor<1x4xi64, #blocked> + %56 = tt.broadcast %55 : (tensor<1x4xi64, #blocked>) -> tensor<64x4xi64, #blocked> + %57 = arith.addi %56, %32 : tensor<64x4xi64, #blocked> + %58 = tt.addptr %33, %57 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi64, #blocked> + %59 = tt.load %58, %51, %cst_9 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32, #blocked> + %60 = arith.addf %59, %52 : tensor<64x4xf32, #blocked> + %61 = arith.subf %60, %39 : tensor<64x4xf32, #blocked> + %62 = tt.extern_elementwise %41 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked> + %63 = tt.broadcast %62 : (tensor<64x1xf32, #blocked>) -> tensor<64x4xf32, #blocked> + %64 = arith.mulf %61, %63 : tensor<64x4xf32, #blocked> + %65 = tt.broadcast %54 : (tensor<1x4xf32, #blocked>) -> tensor<64x4xf32, #blocked> + %66 = arith.mulf %64, %65 : tensor<64x4xf32, #blocked> + %67 = arith.addi %48, %43 : tensor<64x4xi32, #blocked> + %68 = tt.addptr %44, %67 : tensor<64x4x!tt.ptr, #blocked>, tensor<64x4xi32, #blocked> + %69 = arith.truncf %66 : tensor<64x4xf32, #blocked> to tensor<64x4xbf16, #blocked> + tt.store %68, %69, %51 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16, #blocked> + } + tt.return + } +} diff --git a/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttir b/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..24396c81e1300c5ffeb599c07262ada7e3ea0a4e --- /dev/null +++ b/.triton/dump/174400122b6dbc99e086544aa1856b9f/triton_.ttir @@ -0,0 +1,139 @@ +module { + tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant dense<1.000000e+00> : tensor<64x4xf32> + %c256_i32 = arith.constant 256 : i32 + %c4_i32 = arith.constant 4 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_1 = arith.constant dense<256> : tensor<64x1xi64> + %cst_2 = arith.constant dense<0> : tensor<64x1xi64> + %cst_3 = arith.constant dense<50257> : tensor<64x1xi64> + %cst_4 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32> + %cst_5 = arith.constant dense<2.560000e+02> : tensor<64x1xf32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x4xf32> + %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> + %cst_8 = arith.constant dense<256> : tensor<64x1xi32> + %cst_9 = arith.constant dense<256> : tensor<1x4xi32> + %cst_10 = arith.constant dense<512> : tensor<64x1xi32> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<64x1xi32> + %5 = arith.addi %4, %3 : tensor<64x1xi32> + %6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32> + %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %9 = tt.addptr %8, %5 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> + %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64> + %11 = arith.remsi %5, %cst_10 : tensor<64x1xi32> + %12 = arith.muli %11, %cst_8 : tensor<64x1xi32> + %13 = tt.broadcast %12 : (tensor<64x1xi32>) -> tensor<64x4xi32> + %14 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + %15 = arith.addi %10, %cst_3 : tensor<64x1xi64> + %16 = arith.cmpi slt, %10, %cst_2 : tensor<64x1xi64> + %17 = arith.select %16, %15, %10 : tensor<64x1xi1>, tensor<64x1xi64> + %18 = arith.cmpi sge, %17, %cst_2 : tensor<64x1xi64> + %19 = arith.cmpi slt, %17, %cst_3 : tensor<64x1xi64> + %20 = arith.andi %18, %19 : tensor<64x1xi1> + %21 = arith.muli %17, %cst_1 : tensor<64x1xi64> + %22 = tt.broadcast %21 : (tensor<64x1xi64>) -> tensor<64x4xi64> + %23 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + %24:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 iter_args(%arg8 = %cst_7, %arg9 = %cst_7, %arg10 = %cst_7) -> (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) : i32 { + %47 = tt.splat %arg7 : (i32) -> tensor<1x4xi32> + %48 = arith.addi %47, %7 : tensor<1x4xi32> + %49 = arith.cmpi slt, %48, %cst_9 : tensor<1x4xi32> + %50 = tt.broadcast %48 : (tensor<1x4xi32>) -> tensor<64x4xi32> + %51 = arith.addi %50, %13 : tensor<64x4xi32> + %52 = tt.addptr %14, %51 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> + %53 = tt.broadcast %49 : (tensor<1x4xi1>) -> tensor<64x4xi1> + %54 = tt.load %52, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32> + tt.assert %20, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1> + %55 = arith.extsi %48 : tensor<1x4xi32> to tensor<1x4xi64> + %56 = tt.broadcast %55 : (tensor<1x4xi64>) -> tensor<64x4xi64> + %57 = arith.addi %56, %22 : tensor<64x4xi64> + %58 = tt.addptr %23, %57 : tensor<64x4x!tt.ptr>, tensor<64x4xi64> + %59 = tt.load %58, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32> + %60 = arith.addf %59, %54 : tensor<64x4xf32> + %61 = arith.subf %60, %arg8 : tensor<64x4xf32> + %62 = arith.addf %arg10, %cst_0 : tensor<64x4xf32> + %63 = arith.divf %61, %62 : tensor<64x4xf32> + %64 = arith.addf %arg8, %63 : tensor<64x4xf32> + %65 = arith.subf %60, %64 : tensor<64x4xf32> + %66 = arith.mulf %61, %65 : tensor<64x4xf32> + %67 = arith.addf %arg9, %66 : tensor<64x4xf32> + %68 = arith.select %53, %64, %arg8 : tensor<64x4xi1>, tensor<64x4xf32> + %69 = arith.select %53, %67, %arg9 : tensor<64x4xi1>, tensor<64x4xf32> + %70 = arith.select %53, %62, %arg10 : tensor<64x4xi1>, tensor<64x4xf32> + scf.yield %68, %69, %70 : tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32> + } + %25:3 = "tt.reduce"(%24#0, %24#1, %24#2) <{axis = 1 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32): + %47 = arith.subf %arg10, %arg7 : f32 + %48 = arith.addf %arg9, %arg12 : f32 + %49 = arith.cmpf oeq, %48, %cst : f32 + %50 = arith.divf %arg12, %48 : f32 + %51 = arith.select %49, %cst, %50 : f32 + %52 = arith.mulf %47, %51 : f32 + %53 = arith.addf %arg7, %52 : f32 + %54 = arith.addf %arg8, %arg11 : f32 + %55 = arith.mulf %47, %47 : f32 + %56 = arith.mulf %55, %arg9 : f32 + %57 = arith.mulf %56, %51 : f32 + %58 = arith.addf %54, %57 : f32 + tt.reduce.return %53, %58, %48 : f32, f32, f32 + }) : (tensor<64x4xf32>, tensor<64x4xf32>, tensor<64x4xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) + %26 = tt.expand_dims %25#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %27 = tt.expand_dims %25#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %28 = arith.muli %11, %cst_8 : tensor<64x1xi32> + %29 = tt.broadcast %28 : (tensor<64x1xi32>) -> tensor<64x4xi32> + %30 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + %31 = tt.splat %arg3 : (!tt.ptr) -> tensor<1x4x!tt.ptr> + %32 = arith.addi %10, %cst_3 : tensor<64x1xi64> + %33 = arith.cmpi slt, %10, %cst_2 : tensor<64x1xi64> + %34 = arith.select %33, %32, %10 : tensor<64x1xi1>, tensor<64x1xi64> + %35 = arith.cmpi sge, %34, %cst_2 : tensor<64x1xi64> + %36 = arith.cmpi slt, %34, %cst_3 : tensor<64x1xi64> + %37 = arith.andi %35, %36 : tensor<64x1xi1> + %38 = arith.muli %34, %cst_1 : tensor<64x1xi64> + %39 = tt.broadcast %38 : (tensor<64x1xi64>) -> tensor<64x4xi64> + %40 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + %41 = tt.broadcast %26 : (tensor<64x1xf32>) -> tensor<64x4xf32> + %42 = arith.divf %27, %cst_5 : tensor<64x1xf32> + %43 = arith.addf %42, %cst_4 : tensor<64x1xf32> + %44 = arith.muli %5, %cst_8 : tensor<64x1xi32> + %45 = tt.broadcast %44 : (tensor<64x1xi32>) -> tensor<64x4xi32> + %46 = tt.splat %arg4 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + scf.for %arg7 = %c0_i32 to %c256_i32 step %c4_i32 : i32 { + %47 = tt.splat %arg7 : (i32) -> tensor<1x4xi32> + %48 = arith.addi %47, %7 : tensor<1x4xi32> + %49 = arith.cmpi slt, %48, %cst_9 : tensor<1x4xi32> + %50 = tt.broadcast %48 : (tensor<1x4xi32>) -> tensor<64x4xi32> + %51 = arith.addi %50, %29 : tensor<64x4xi32> + %52 = tt.addptr %30, %51 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> + %53 = tt.broadcast %49 : (tensor<1x4xi1>) -> tensor<64x4xi1> + %54 = tt.load %52, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x4xf32> + %55 = tt.addptr %31, %48 : tensor<1x4x!tt.ptr>, tensor<1x4xi32> + %56 = tt.load %55, %49, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x4xf32> + tt.assert %37, "index out of bounds: 0 <= tmp13 < 50257", "", "_call_with_frames_removed", 883 : tensor<64x1xi1> + %57 = arith.extsi %48 : tensor<1x4xi32> to tensor<1x4xi64> + %58 = tt.broadcast %57 : (tensor<1x4xi64>) -> tensor<64x4xi64> + %59 = arith.addi %58, %39 : tensor<64x4xi64> + %60 = tt.addptr %40, %59 : tensor<64x4x!tt.ptr>, tensor<64x4xi64> + %61 = tt.load %60, %53, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32> + %62 = arith.addf %61, %54 : tensor<64x4xf32> + %63 = arith.subf %62, %41 : tensor<64x4xf32> + %64 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32> + %65 = tt.broadcast %64 : (tensor<64x1xf32>) -> tensor<64x4xf32> + %66 = arith.mulf %63, %65 : tensor<64x4xf32> + %67 = tt.broadcast %56 : (tensor<1x4xf32>) -> tensor<64x4xf32> + %68 = arith.mulf %66, %67 : tensor<64x4xf32> + %69 = arith.addi %50, %45 : tensor<64x4xi32> + %70 = tt.addptr %46, %69 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> + %71 = arith.truncf %68 : tensor<64x4xf32> to tensor<64x4xbf16> + tt.store %70, %71, %53 {cache = 1 : i32, evict = 1 : i32} : tensor<64x4xbf16> + } + tt.return + } +} diff --git a/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.llir b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..ef91adf029023f7e93531c46cfeb0259ee37a45e --- /dev/null +++ b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.llir @@ -0,0 +1,235 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3de4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %7 = and i32 %6, 31, !dbg !8 + %8 = lshr i32 %6, 5, !dbg !8 + %9 = shl i32 %6, 2, !dbg !8 + %10 = and i32 %9, 60, !dbg !8 + %11 = and i32 %8, 3, !dbg !9 + %12 = lshr i32 %7, 4, !dbg !9 + %13 = shl nuw nsw i32 %11, 1, !dbg !9 + %14 = or i32 %13, %12, !dbg !9 + %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10 + %16 = shl i32 %15, 6, !dbg !11 + %17 = or i32 %16, %10, !dbg !12 + %.frozen = freeze i32 %17 + %18 = sdiv i32 %.frozen, 256, !dbg !13 + %19 = mul i32 %18, 256 + %.decomposed = sub i32 %.frozen, %19 + %20 = shl i32 %18, 15, !dbg !14 + %21 = add i32 %20, %.decomposed + br label %22, !dbg !15 + +22: ; preds = %5, %22 + %23 = phi i32 [ 0, %5 ], [ %53, %22 ] + %24 = phi <4 x float> [ zeroinitializer, %5 ], [ %52, %22 ] + %25 = or i32 %23, %14, !dbg !16 + %26 = shl i32 %25, 8, !dbg !17 + %27 = add i32 %21, %26, !dbg !18 + %28 = sext i32 %27 to i64, !dbg !19 + %29 = getelementptr float, ptr addrspace(1) %0, i64 %28, !dbg !19 + %30 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %29, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20 + %31 = extractvalue { i32, i32, i32, i32 } %30, 0, !dbg !20 + %32 = extractvalue { i32, i32, i32, i32 } %30, 1, !dbg !20 + %33 = extractvalue { i32, i32, i32, i32 } %30, 2, !dbg !20 + %34 = extractvalue { i32, i32, i32, i32 } %30, 3, !dbg !20 + %35 = getelementptr float, ptr addrspace(1) %1, i64 %28, !dbg !21 + %36 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %35, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !22 + %37 = extractvalue { i32, i32, i32, i32 } %36, 0, !dbg !22 + %38 = extractvalue { i32, i32, i32, i32 } %36, 1, !dbg !22 + %39 = extractvalue { i32, i32, i32, i32 } %36, 2, !dbg !22 + %40 = extractvalue { i32, i32, i32, i32 } %36, 3, !dbg !22 + %41 = insertelement <4 x i32> poison, i32 %31, i64 0, !dbg !20 + %42 = insertelement <4 x i32> %41, i32 %32, i64 1, !dbg !20 + %43 = insertelement <4 x i32> %42, i32 %33, i64 2, !dbg !20 + %44 = insertelement <4 x i32> %43, i32 %34, i64 3, !dbg !20 + %45 = bitcast <4 x i32> %44 to <4 x float>, !dbg !20 + %46 = insertelement <4 x i32> poison, i32 %37, i64 0, !dbg !22 + %47 = insertelement <4 x i32> %46, i32 %38, i64 1, !dbg !22 + %48 = insertelement <4 x i32> %47, i32 %39, i64 2, !dbg !22 + %49 = insertelement <4 x i32> %48, i32 %40, i64 3, !dbg !22 + %50 = bitcast <4 x i32> %49 to <4 x float>, !dbg !22 + %51 = fmul <4 x float> %45, %50, !dbg !23 + %52 = fadd <4 x float> %24, %51, !dbg !24 + %53 = add nuw nsw i32 %23, 8, !dbg !15 + %54 = icmp ult i32 %23, 120, !dbg !15 + br i1 %54, label %22, label %55, !dbg !15 + +55: ; preds = %22 + %56 = and i32 %6, 63, !dbg !8 + %57 = or i32 %16, %56, !dbg !12 + %58 = or i32 %10, 3, !dbg !25 + %59 = or i32 %10, 2, !dbg !25 + %60 = or i32 %10, 1, !dbg !25 + %61 = extractelement <4 x float> %52, i64 0, !dbg !25 + %62 = bitcast float %61 to i32, !dbg !25 + %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 16, i32 31), !dbg !25 + %64 = bitcast i32 %63 to float, !dbg !25 + %65 = fadd float %61, %64, !dbg !29 + %66 = extractelement <4 x float> %52, i64 1, !dbg !25 + %67 = bitcast float %66 to i32, !dbg !25 + %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 16, i32 31), !dbg !25 + %69 = bitcast i32 %68 to float, !dbg !25 + %70 = fadd float %66, %69, !dbg !29 + %71 = extractelement <4 x float> %52, i64 2, !dbg !25 + %72 = bitcast float %71 to i32, !dbg !25 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 16, i32 31), !dbg !25 + %74 = bitcast i32 %73 to float, !dbg !25 + %75 = fadd float %71, %74, !dbg !29 + %76 = extractelement <4 x float> %52, i64 3, !dbg !25 + %77 = bitcast float %76 to i32, !dbg !25 + %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 16, i32 31), !dbg !25 + %79 = bitcast i32 %78 to float, !dbg !25 + %80 = fadd float %76, %79, !dbg !29 + %81 = icmp ult i32 %7, 16, !dbg !25 + %82 = shl nuw nsw i32 %10, 2, !dbg !25 + %83 = or i32 %82, %11, !dbg !25 + %84 = zext nneg i32 %83 to i64, !dbg !25 + %85 = getelementptr float, ptr addrspace(3) @global_smem, i64 %84, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %85, float %65, i1 %81) #3, !dbg !25 + %86 = shl nuw nsw i32 %60, 2, !dbg !25 + %87 = or i32 %86, %11, !dbg !25 + %88 = zext nneg i32 %87 to i64, !dbg !25 + %89 = getelementptr float, ptr addrspace(3) @global_smem, i64 %88, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %89, float %70, i1 %81) #3, !dbg !25 + %90 = shl nuw nsw i32 %59, 2, !dbg !25 + %91 = or i32 %90, %11, !dbg !25 + %92 = zext nneg i32 %91 to i64, !dbg !25 + %93 = getelementptr float, ptr addrspace(3) @global_smem, i64 %92, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %93, float %75, i1 %81) #3, !dbg !25 + %94 = shl nuw nsw i32 %58, 2, !dbg !25 + %95 = or i32 %94, %11, !dbg !25 + %96 = zext nneg i32 %95 to i64, !dbg !25 + %97 = getelementptr float, ptr addrspace(3) @global_smem, i64 %96, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %97, float %80, i1 %81) #3, !dbg !25 + tail call void @llvm.nvvm.barrier0(), !dbg !25 + %98 = icmp slt i32 %6, 256, !dbg !25 + %99 = sext i32 %6 to i64, !dbg !25 + %100 = getelementptr float, ptr addrspace(3) @global_smem, i64 %99, !dbg !25 + %101 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %100, i1 %98) #3, !dbg !25 + %102 = bitcast float %101 to i32, !dbg !25 + %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 2, i32 31), !dbg !25 + %104 = bitcast i32 %103 to float, !dbg !25 + %105 = fadd float %101, %104, !dbg !29 + %106 = bitcast float %105 to i32, !dbg !25 + %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 1, i32 31), !dbg !25 + %108 = bitcast i32 %107 to float, !dbg !25 + %109 = fadd float %105, %108, !dbg !29 + %110 = and i32 %6, 3, !dbg !25 + %111 = icmp eq i32 %110, 0, !dbg !25 + %112 = and i1 %98, %111, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %109, i1 %112) #3, !dbg !25 + %113 = add i32 %6, 128, !dbg !25 + %114 = sext i32 %113 to i64, !dbg !25 + %115 = getelementptr float, ptr addrspace(3) @global_smem, i64 %114, !dbg !25 + %116 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %115, i1 %98) #3, !dbg !25 + %117 = bitcast float %116 to i32, !dbg !25 + %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 2, i32 31), !dbg !25 + %119 = bitcast i32 %118 to float, !dbg !25 + %120 = fadd float %116, %119, !dbg !29 + %121 = bitcast float %120 to i32, !dbg !25 + %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 1, i32 31), !dbg !25 + %123 = bitcast i32 %122 to float, !dbg !25 + %124 = fadd float %120, %123, !dbg !29 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %115, float %124, i1 %112) #3, !dbg !25 + tail call void @llvm.nvvm.barrier0(), !dbg !25 + %125 = zext nneg i32 %82 to i64, !dbg !25 + %126 = getelementptr float, ptr addrspace(3) @global_smem, i64 %125, !dbg !25 + %127 = load float, ptr addrspace(3) %126, align 4, !dbg !25 + %128 = zext nneg i32 %86 to i64, !dbg !25 + %129 = getelementptr float, ptr addrspace(3) @global_smem, i64 %128, !dbg !25 + %130 = load float, ptr addrspace(3) %129, align 4, !dbg !25 + %131 = zext nneg i32 %90 to i64, !dbg !25 + %132 = getelementptr float, ptr addrspace(3) @global_smem, i64 %131, !dbg !25 + %133 = load float, ptr addrspace(3) %132, align 4, !dbg !25 + %134 = zext nneg i32 %94 to i64, !dbg !25 + %135 = getelementptr float, ptr addrspace(3) @global_smem, i64 %134, !dbg !25 + %136 = load float, ptr addrspace(3) %135, align 4, !dbg !25 + tail call void @llvm.nvvm.barrier0(), !dbg !33 + %137 = zext nneg i32 %10 to i64, !dbg !33 + %138 = getelementptr float, ptr addrspace(3) @global_smem, i64 %137, !dbg !33 + %139 = insertelement <1 x float> undef, float %127, i64 0, !dbg !33 + store <1 x float> %139, ptr addrspace(3) %138, align 4, !dbg !33 + %140 = zext nneg i32 %60 to i64, !dbg !33 + %141 = getelementptr float, ptr addrspace(3) @global_smem, i64 %140, !dbg !33 + %142 = insertelement <1 x float> undef, float %130, i64 0, !dbg !33 + store <1 x float> %142, ptr addrspace(3) %141, align 4, !dbg !33 + %143 = zext nneg i32 %59 to i64, !dbg !33 + %144 = getelementptr float, ptr addrspace(3) @global_smem, i64 %143, !dbg !33 + %145 = insertelement <1 x float> undef, float %133, i64 0, !dbg !33 + store <1 x float> %145, ptr addrspace(3) %144, align 4, !dbg !33 + %146 = zext nneg i32 %58 to i64, !dbg !33 + %147 = getelementptr float, ptr addrspace(3) @global_smem, i64 %146, !dbg !33 + %148 = insertelement <1 x float> undef, float %136, i64 0, !dbg !33 + store <1 x float> %148, ptr addrspace(3) %147, align 4, !dbg !33 + tail call void @llvm.nvvm.barrier0(), !dbg !33 + %149 = zext nneg i32 %56 to i64, !dbg !33 + %150 = getelementptr float, ptr addrspace(3) @global_smem, i64 %149, !dbg !33 + %151 = load i32, ptr addrspace(3) %150, align 4, !dbg !33 + %152 = sext i32 %57 to i64, !dbg !34 + %153 = getelementptr float, ptr addrspace(1) %2, i64 %152, !dbg !34 + %154 = and i32 %6, 64, !dbg !35 + %155 = icmp eq i32 %154, 0, !dbg !35 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %151, ptr addrspace(1) %153, i1 %155) #3, !dbg !35 + ret void, !dbg !36 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "cqdvltndxc7vwj5j5dnsb73tk763gajftjwvmbfq7i6sitk5gwoy.py", directory: "/tmp/torchinductor_root/qd") +!3 = !{ptr @triton__0d1d2d3de4de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3de4de, !"maxntidx", i32 128} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4de", linkageName: "triton__0d1d2d3de4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 22, column: 44, scope: !5) +!9 = !DILocation(line: 24, column: 33, scope: !5) +!10 = !DILocation(line: 21, column: 28, scope: !5) +!11 = !DILocation(line: 21, column: 33, scope: !5) +!12 = !DILocation(line: 22, column: 23, scope: !5) +!13 = !DILocation(line: 26, column: 20, scope: !5) +!14 = !DILocation(line: 33, column: 57, scope: !5) +!15 = !DILocation(line: 29, column: 36, scope: !5) +!16 = !DILocation(line: 30, column: 27, scope: !5) +!17 = !DILocation(line: 33, column: 44, scope: !5) +!18 = !DILocation(line: 33, column: 51, scope: !5) +!19 = !DILocation(line: 33, column: 34, scope: !5) +!20 = !DILocation(line: 33, column: 63, scope: !5) +!21 = !DILocation(line: 34, column: 34, scope: !5) +!22 = !DILocation(line: 34, column: 63, scope: !5) +!23 = !DILocation(line: 35, column: 22, scope: !5) +!24 = !DILocation(line: 38, column: 38, scope: !5) +!25 = !DILocation(line: 243, column: 36, scope: !26, inlinedAt: !28) +!26 = distinct !DILexicalBlockFile(scope: !5, file: !27, discriminator: 0) +!27 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!28 = !DILocation(line: 39, column: 25, scope: !26) +!29 = !DILocation(line: 233, column: 15, scope: !30, inlinedAt: !31) +!30 = distinct !DILexicalBlockFile(scope: !26, file: !27, discriminator: 0) +!31 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !32) +!32 = !DILocation(line: 39, column: 25, scope: !30) +!33 = !DILocation(line: 39, column: 28, scope: !5) +!34 = !DILocation(line: 40, column: 25, scope: !5) +!35 = !DILocation(line: 40, column: 36, scope: !5) +!36 = !DILocation(line: 40, column: 4, scope: !5) diff --git a/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ptx b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..28d5527aac1d34ee891611b3155e45c41b50af02 --- /dev/null +++ b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ptx @@ -0,0 +1,572 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3de4de +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3de4de( + .param .u64 triton__0d1d2d3de4de_param_0, + .param .u64 triton__0d1d2d3de4de_param_1, + .param .u64 triton__0d1d2d3de4de_param_2, + .param .u32 triton__0d1d2d3de4de_param_3, + .param .u32 triton__0d1d2d3de4de_param_4 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<22>; + .reg .b32 %r<98>; + .reg .f32 %f<47>; + .reg .b64 %rd<9>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd3, [triton__0d1d2d3de4de_param_2]; + ld.param.u64 %rd2, [triton__0d1d2d3de4de_param_1]; + ld.param.u64 %rd1, [triton__0d1d2d3de4de_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + shl.b32 %r13, %r1, 2; + and.b32 %r3, %r13, 60; + .loc 1 24 33 + bfe.u32 %r4, %r1, 5, 2; + .loc 1 21 28 + mov.u32 %r11, %ctaid.x; + .loc 1 21 33 + shl.b32 %r5, %r11, 6; + .loc 1 22 23 + or.b32 %r14, %r5, %r3; + .loc 1 26 20 + shr.s32 %r16, %r14, 31; + shr.u32 %r17, %r16, 24; + add.s32 %r18, %r14, %r17; + shr.s32 %r19, %r18, 8; + .loc 1 29 36 + mad.lo.s32 %r20, %r19, 32512, %r14; + shl.b32 %r21, %r4, 9; + add.s32 %r22, %r20, %r21; + shl.b32 %r23, %r1, 4; + and.b32 %r24, %r23, 256; + add.s32 %r96, %r22, %r24; + mov.f32 %f43, 0f00000000; + mov.b32 %r97, -8; + mov.pred %p1, -1; + mov.f32 %f44, %f43; + mov.f32 %f45, %f43; + mov.f32 %f46, %f43; +$L__BB0_1: + .loc 1 33 34 + mul.wide.s32 %rd6, %r96, 4; + add.s64 %rd4, %rd1, %rd6; + mov.b32 %r29, 0; + .loc 1 33 63 + mov.u32 %r25, 0x0; + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + mov.u32 %r28, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r25, %r26, %r27, %r28 }, [ %rd4 + 0 ]; + @!%p1 mov.u32 %r25, %r29; + @!%p1 mov.u32 %r26, %r29; + @!%p1 mov.u32 %r27, %r29; + @!%p1 mov.u32 %r28, %r29; + .loc 1 34 34 + add.s64 %rd5, %rd2, %rd6; + .loc 1 34 63 + mov.u32 %r33, 0x0; + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + mov.u32 %r36, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd5 + 0 ]; + @!%p1 mov.u32 %r33, %r29; + @!%p1 mov.u32 %r34, %r29; + @!%p1 mov.u32 %r35, %r29; + @!%p1 mov.u32 %r36, %r29; + .loc 1 33 63 + mov.b32 %f13, %r25; + mov.b32 %f14, %r26; + mov.b32 %f15, %r27; + mov.b32 %f16, %r28; + .loc 1 34 63 + mov.b32 %f17, %r33; + mov.b32 %f18, %r34; + mov.b32 %f19, %r35; + mov.b32 %f20, %r36; + .loc 1 38 38 + fma.rn.f32 %f46, %f16, %f20, %f46; + fma.rn.f32 %f45, %f15, %f19, %f45; + fma.rn.f32 %f44, %f14, %f18, %f44; + fma.rn.f32 %f43, %f13, %f17, %f43; + .loc 1 29 36 + add.s32 %r97, %r97, 8; + add.s32 %r96, %r96, 2048; + setp.lt.u32 %p11, %r97, 120; + @%p11 bra $L__BB0_1; + .loc 1 22 44 + and.b32 %r58, %r1, 63; + .loc 1 22 23 + or.b32 %r59, %r5, %r58; +$L__tmp1: + .loc 2 243 36 + mov.b32 %r60, %f43; + shfl.sync.bfly.b32 %r61, %r60, 16, 31, -1; + mov.b32 %f21, %r61; +$L__tmp2: + .loc 2 233 15 + add.f32 %f22, %f43, %f21; +$L__tmp3: + .loc 2 243 36 + mov.b32 %r62, %f44; + shfl.sync.bfly.b32 %r63, %r62, 16, 31, -1; + mov.b32 %f23, %r63; +$L__tmp4: + .loc 2 233 15 + add.f32 %f24, %f44, %f23; +$L__tmp5: + .loc 2 243 36 + mov.b32 %r64, %f45; + shfl.sync.bfly.b32 %r65, %r64, 16, 31, -1; + mov.b32 %f25, %r65; +$L__tmp6: + .loc 2 233 15 + add.f32 %f26, %f45, %f25; +$L__tmp7: + .loc 2 243 36 + mov.b32 %r66, %f46; + shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1; + mov.b32 %f27, %r67; +$L__tmp8: + .loc 2 233 15 + add.f32 %f28, %f46, %f27; +$L__tmp9: + .loc 2 243 36 + setp.lt.u32 %p12, %r2, 16; + shl.b32 %r68, %r3, 2; + or.b32 %r69, %r68, %r4; + shl.b32 %r70, %r69, 2; + mov.u32 %r71, global_smem; + add.s32 %r41, %r71, %r70; + mov.b32 %r42, %f22; + @%p12 st.shared.b32 [ %r41 + 0 ], %r42; + shl.b32 %r72, %r4, 2; + shl.b32 %r73, %r3, 4; + or.b32 %r74, %r73, 16; + or.b32 %r75, %r74, %r72; + add.s32 %r43, %r71, %r75; + mov.b32 %r44, %f24; + @%p12 st.shared.b32 [ %r43 + 0 ], %r44; + or.b32 %r76, %r73, 32; + or.b32 %r77, %r76, %r72; + add.s32 %r45, %r71, %r77; + mov.b32 %r46, %f26; + @%p12 st.shared.b32 [ %r45 + 0 ], %r46; + or.b32 %r78, %r73, 48; + or.b32 %r79, %r78, %r72; + add.s32 %r47, %r71, %r79; + mov.b32 %r48, %f28; + @%p12 st.shared.b32 [ %r47 + 0 ], %r48; + bar.sync 0; + setp.lt.s32 %p16, %r1, 256; + add.s32 %r50, %r71, %r13; + @%p16 ld.shared.b32 %r49, [ %r50 + 0 ]; + mov.b32 %f29, %r49; + shfl.sync.bfly.b32 %r81, %r49, 2, 31, -1; + mov.b32 %f30, %r81; +$L__tmp10: + .loc 2 233 15 + add.f32 %f31, %f29, %f30; +$L__tmp11: + .loc 2 243 36 + mov.b32 %r82, %f31; + shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1; + mov.b32 %f32, %r83; +$L__tmp12: + .loc 2 233 15 + add.f32 %f33, %f31, %f32; +$L__tmp13: + .loc 2 243 36 + and.b32 %r84, %r1, 3; + setp.eq.s32 %p21, %r84, 0; + and.pred %p17, %p16, %p21; + mov.b32 %r52, %f33; + @%p17 st.shared.b32 [ %r50 + 0 ], %r52; + add.s32 %r54, %r50, 512; + @%p16 ld.shared.b32 %r53, [ %r54 + 0 ]; + mov.b32 %f34, %r53; + shfl.sync.bfly.b32 %r85, %r53, 2, 31, -1; + mov.b32 %f35, %r85; +$L__tmp14: + .loc 2 233 15 + add.f32 %f36, %f34, %f35; +$L__tmp15: + .loc 2 243 36 + mov.b32 %r86, %f36; + shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1; + mov.b32 %f37, %r87; +$L__tmp16: + .loc 2 233 15 + add.f32 %f38, %f36, %f37; +$L__tmp17: + .loc 2 243 36 + mov.b32 %r56, %f38; + @%p17 st.shared.b32 [ %r54 + 0 ], %r56; + bar.sync 0; + add.s32 %r88, %r71, %r73; + ld.shared.f32 %f39, [%r88]; + add.s32 %r89, %r71, %r74; + ld.shared.f32 %f40, [%r89]; + add.s32 %r90, %r71, %r76; + ld.shared.f32 %f41, [%r90]; + add.s32 %r91, %r71, %r78; + ld.shared.f32 %f42, [%r91]; +$L__tmp18: + .loc 1 39 28 + bar.sync 0; + add.s32 %r92, %r71, %r68; + st.shared.f32 [%r92], %f39; + st.shared.f32 [%r92+4], %f40; + st.shared.f32 [%r92+8], %f41; + st.shared.f32 [%r92+12], %f42; + bar.sync 0; + shl.b32 %r93, %r58, 2; + add.s32 %r94, %r71, %r93; + ld.shared.u32 %r57, [%r94]; + .loc 1 40 25 + mul.wide.s32 %rd8, %r59, 4; + add.s64 %rd7, %rd3, %rd8; + .loc 1 40 36 + and.b32 %r95, %r1, 64; + setp.eq.s32 %p20, %r95, 0; + @%p20 st.global.b32 [ %rd7 + 0 ], { %r57 }; + .loc 1 40 4 + ret; +$L__tmp19: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/qd/cqdvltndxc7vwj5j5dnsb73tk763gajftjwvmbfq7i6sitk5gwoy.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 266 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 113 +.b8 100 +.b8 118 +.b8 108 +.b8 116 +.b8 110 +.b8 100 +.b8 120 +.b8 99 +.b8 55 +.b8 118 +.b8 119 +.b8 106 +.b8 53 +.b8 106 +.b8 53 +.b8 100 +.b8 110 +.b8 115 +.b8 98 +.b8 55 +.b8 51 +.b8 116 +.b8 107 +.b8 55 +.b8 54 +.b8 51 +.b8 103 +.b8 97 +.b8 106 +.b8 102 +.b8 116 +.b8 106 +.b8 119 +.b8 118 +.b8 109 +.b8 98 +.b8 102 +.b8 113 +.b8 55 +.b8 105 +.b8 54 +.b8 115 +.b8 105 +.b8 116 +.b8 107 +.b8 53 +.b8 103 +.b8 119 +.b8 111 +.b8 121 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 113 +.b8 100 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp18 +.b8 2 +.b8 39 +.b8 25 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp17 +.b8 2 +.b8 39 +.b8 25 +.b8 4 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp17 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 270 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 270 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttgir b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..d72a62515fa876a92b19f5e1298f1a6c96bc6045 --- /dev/null +++ b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttgir @@ -0,0 +1,63 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<64x1xi32, #blocked> + %cst_0 = arith.constant dense<32768> : tensor<64x1xi32, #blocked> + %cst_1 = arith.constant dense<256> : tensor<1x8xi32, #blocked> + %cst_2 = arith.constant dense<128> : tensor<1x8xi32, #blocked> + %c0_i32 = arith.constant 0 : i32 + %c128_i32 = arith.constant 128 : i32 + %c8_i32 = arith.constant 8 : i32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1> + %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked> + %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1> + %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked> + %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1> + %10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked> + %12 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked> + %13 = arith.divsi %8, %cst : tensor<64x1xi32, #blocked> + %14 = tt.broadcast %12 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %15 = arith.muli %13, %cst_0 : tensor<64x1xi32, #blocked> + %16 = tt.broadcast %15 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %17 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + %18 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + %19 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32, #blocked>) : i32 { + %25 = tt.splat %arg5 : (i32) -> tensor<1x8xi32, #blocked> + %26 = arith.addi %25, %11 : tensor<1x8xi32, #blocked> + %27 = arith.cmpi slt, %26, %cst_2 : tensor<1x8xi32, #blocked> + %28 = arith.muli %26, %cst_1 : tensor<1x8xi32, #blocked> + %29 = tt.broadcast %28 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %30 = arith.addi %14, %29 : tensor<64x8xi32, #blocked> + %31 = arith.addi %30, %16 : tensor<64x8xi32, #blocked> + %32 = tt.addptr %17, %31 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %33 = tt.broadcast %27 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked> + %34 = tt.load %32, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked> + %35 = tt.addptr %18, %31 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %36 = tt.load %35, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked> + %37 = arith.mulf %34, %36 : tensor<64x8xf32, #blocked> + %38 = arith.addf %arg6, %37 : tensor<64x8xf32, #blocked> + %39 = arith.select %33, %38, %arg6 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> + scf.yield %39 : tensor<64x8xf32, #blocked> + } + %20 = "tt.reduce"(%19) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %25 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %25 : f32 + }) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %21 = triton_gpu.convert_layout %20 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xf32, #blocked1> + %23 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked1> + %24 = tt.addptr %23, %9 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> + tt.store %24, %22 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked1> + tt.return + } +} diff --git a/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.cubin b/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..5f55166bf82e9de33c296e415ec524467bd37325 Binary files /dev/null and b/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.cubin differ diff --git a/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.llir b/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..034f88c06450217ba26fedb2955794d0a53bf198 --- /dev/null +++ b/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.llir @@ -0,0 +1,243 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3de4de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %7 = and i32 %6, 31, !dbg !8 + %8 = lshr i32 %6, 5, !dbg !8 + %9 = shl i32 %6, 2, !dbg !8 + %10 = and i32 %9, 60, !dbg !8 + %11 = and i32 %8, 3, !dbg !9 + %12 = lshr i32 %7, 4, !dbg !9 + %13 = shl nuw nsw i32 %11, 1, !dbg !9 + %14 = or i32 %13, %12, !dbg !9 + %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10 + %16 = shl i32 %15, 6, !dbg !11 + %17 = or i32 %16, %10, !dbg !12 + %.frozen = freeze i32 %17 + %18 = sdiv i32 %.frozen, 256, !dbg !13 + %19 = mul i32 %18, 256 + %.decomposed = sub i32 %.frozen, %19 + %20 = shl i32 %18, 15, !dbg !14 + %21 = add i32 %20, %.decomposed + br label %22, !dbg !15 + +22: ; preds = %5, %22 + %23 = phi i32 [ 0, %5 ], [ %58, %22 ] + %24 = phi <4 x float> [ zeroinitializer, %5 ], [ %57, %22 ] + %25 = or i32 %23, %14, !dbg !16 + %26 = shl i32 %25, 8, !dbg !17 + %27 = add i32 %21, %26, !dbg !18 + %28 = sext i32 %27 to i64, !dbg !19 + %29 = getelementptr i16, ptr addrspace(1) %0, i64 %28, !dbg !19 + %30 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %29, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20 + %31 = extractvalue { i32, i32 } %30, 0, !dbg !20 + %32 = extractvalue { i32, i32 } %30, 1, !dbg !20 + %33 = trunc i32 %31 to i16, !dbg !20 + %extelt.offset = lshr i32 %31, 16, !dbg !20 + %34 = trunc i32 %extelt.offset to i16, !dbg !20 + %35 = trunc i32 %32 to i16, !dbg !20 + %extelt.offset1 = lshr i32 %32, 16, !dbg !20 + %36 = trunc i32 %extelt.offset1 to i16, !dbg !20 + %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #3, !dbg !21 + %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #3, !dbg !21 + %39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #3, !dbg !21 + %40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #3, !dbg !21 + %41 = getelementptr float, ptr addrspace(1) %1, i64 %28, !dbg !22 + %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !23 + %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !23 + %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !23 + %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !23 + %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !23 + %47 = insertelement <4 x i32> poison, i32 %43, i64 0, !dbg !23 + %48 = insertelement <4 x i32> %47, i32 %44, i64 1, !dbg !23 + %49 = insertelement <4 x i32> %48, i32 %45, i64 2, !dbg !23 + %50 = insertelement <4 x i32> %49, i32 %46, i64 3, !dbg !23 + %51 = bitcast <4 x i32> %50 to <4 x float>, !dbg !23 + %52 = insertelement <4 x float> poison, float %37, i64 0, !dbg !24 + %53 = insertelement <4 x float> %52, float %38, i64 1, !dbg !24 + %54 = insertelement <4 x float> %53, float %39, i64 2, !dbg !24 + %55 = insertelement <4 x float> %54, float %40, i64 3, !dbg !24 + %56 = fmul <4 x float> %55, %51, !dbg !24 + %57 = fadd <4 x float> %24, %56, !dbg !25 + %58 = add nuw nsw i32 %23, 8, !dbg !15 + %59 = icmp ult i32 %23, 120, !dbg !15 + br i1 %59, label %22, label %60, !dbg !15 + +60: ; preds = %22 + %61 = and i32 %6, 63, !dbg !8 + %62 = or i32 %16, %61, !dbg !12 + %63 = or i32 %10, 3, !dbg !26 + %64 = or i32 %10, 2, !dbg !26 + %65 = or i32 %10, 1, !dbg !26 + %66 = extractelement <4 x float> %57, i64 0, !dbg !26 + %67 = bitcast float %66 to i32, !dbg !26 + %68 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %67, i32 16, i32 31), !dbg !26 + %69 = bitcast i32 %68 to float, !dbg !26 + %70 = fadd float %66, %69, !dbg !30 + %71 = extractelement <4 x float> %57, i64 1, !dbg !26 + %72 = bitcast float %71 to i32, !dbg !26 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 16, i32 31), !dbg !26 + %74 = bitcast i32 %73 to float, !dbg !26 + %75 = fadd float %71, %74, !dbg !30 + %76 = extractelement <4 x float> %57, i64 2, !dbg !26 + %77 = bitcast float %76 to i32, !dbg !26 + %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 16, i32 31), !dbg !26 + %79 = bitcast i32 %78 to float, !dbg !26 + %80 = fadd float %76, %79, !dbg !30 + %81 = extractelement <4 x float> %57, i64 3, !dbg !26 + %82 = bitcast float %81 to i32, !dbg !26 + %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 16, i32 31), !dbg !26 + %84 = bitcast i32 %83 to float, !dbg !26 + %85 = fadd float %81, %84, !dbg !30 + %86 = icmp ult i32 %7, 16, !dbg !26 + %87 = shl nuw nsw i32 %10, 2, !dbg !26 + %88 = or i32 %87, %11, !dbg !26 + %89 = zext nneg i32 %88 to i64, !dbg !26 + %90 = getelementptr float, ptr addrspace(3) @global_smem, i64 %89, !dbg !26 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %90, float %70, i1 %86) #3, !dbg !26 + %91 = shl nuw nsw i32 %65, 2, !dbg !26 + %92 = or i32 %91, %11, !dbg !26 + %93 = zext nneg i32 %92 to i64, !dbg !26 + %94 = getelementptr float, ptr addrspace(3) @global_smem, i64 %93, !dbg !26 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %94, float %75, i1 %86) #3, !dbg !26 + %95 = shl nuw nsw i32 %64, 2, !dbg !26 + %96 = or i32 %95, %11, !dbg !26 + %97 = zext nneg i32 %96 to i64, !dbg !26 + %98 = getelementptr float, ptr addrspace(3) @global_smem, i64 %97, !dbg !26 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %98, float %80, i1 %86) #3, !dbg !26 + %99 = shl nuw nsw i32 %63, 2, !dbg !26 + %100 = or i32 %99, %11, !dbg !26 + %101 = zext nneg i32 %100 to i64, !dbg !26 + %102 = getelementptr float, ptr addrspace(3) @global_smem, i64 %101, !dbg !26 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %102, float %85, i1 %86) #3, !dbg !26 + tail call void @llvm.nvvm.barrier0(), !dbg !26 + %103 = icmp slt i32 %6, 256, !dbg !26 + %104 = sext i32 %6 to i64, !dbg !26 + %105 = getelementptr float, ptr addrspace(3) @global_smem, i64 %104, !dbg !26 + %106 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %105, i1 %103) #3, !dbg !26 + %107 = bitcast float %106 to i32, !dbg !26 + %108 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %107, i32 2, i32 31), !dbg !26 + %109 = bitcast i32 %108 to float, !dbg !26 + %110 = fadd float %106, %109, !dbg !30 + %111 = bitcast float %110 to i32, !dbg !26 + %112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %111, i32 1, i32 31), !dbg !26 + %113 = bitcast i32 %112 to float, !dbg !26 + %114 = fadd float %110, %113, !dbg !30 + %115 = and i32 %6, 3, !dbg !26 + %116 = icmp eq i32 %115, 0, !dbg !26 + %117 = and i1 %103, %116, !dbg !26 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %105, float %114, i1 %117) #3, !dbg !26 + %118 = add i32 %6, 128, !dbg !26 + %119 = sext i32 %118 to i64, !dbg !26 + %120 = getelementptr float, ptr addrspace(3) @global_smem, i64 %119, !dbg !26 + %121 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %120, i1 %103) #3, !dbg !26 + %122 = bitcast float %121 to i32, !dbg !26 + %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 2, i32 31), !dbg !26 + %124 = bitcast i32 %123 to float, !dbg !26 + %125 = fadd float %121, %124, !dbg !30 + %126 = bitcast float %125 to i32, !dbg !26 + %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 1, i32 31), !dbg !26 + %128 = bitcast i32 %127 to float, !dbg !26 + %129 = fadd float %125, %128, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %129, i1 %117) #3, !dbg !26 + tail call void @llvm.nvvm.barrier0(), !dbg !26 + %130 = zext nneg i32 %87 to i64, !dbg !26 + %131 = getelementptr float, ptr addrspace(3) @global_smem, i64 %130, !dbg !26 + %132 = load float, ptr addrspace(3) %131, align 4, !dbg !26 + %133 = zext nneg i32 %91 to i64, !dbg !26 + %134 = getelementptr float, ptr addrspace(3) @global_smem, i64 %133, !dbg !26 + %135 = load float, ptr addrspace(3) %134, align 4, !dbg !26 + %136 = zext nneg i32 %95 to i64, !dbg !26 + %137 = getelementptr float, ptr addrspace(3) @global_smem, i64 %136, !dbg !26 + %138 = load float, ptr addrspace(3) %137, align 4, !dbg !26 + %139 = zext nneg i32 %99 to i64, !dbg !26 + %140 = getelementptr float, ptr addrspace(3) @global_smem, i64 %139, !dbg !26 + %141 = load float, ptr addrspace(3) %140, align 4, !dbg !26 + tail call void @llvm.nvvm.barrier0(), !dbg !34 + %142 = zext nneg i32 %10 to i64, !dbg !34 + %143 = getelementptr float, ptr addrspace(3) @global_smem, i64 %142, !dbg !34 + %144 = insertelement <1 x float> undef, float %132, i64 0, !dbg !34 + store <1 x float> %144, ptr addrspace(3) %143, align 4, !dbg !34 + %145 = zext nneg i32 %65 to i64, !dbg !34 + %146 = getelementptr float, ptr addrspace(3) @global_smem, i64 %145, !dbg !34 + %147 = insertelement <1 x float> undef, float %135, i64 0, !dbg !34 + store <1 x float> %147, ptr addrspace(3) %146, align 4, !dbg !34 + %148 = zext nneg i32 %64 to i64, !dbg !34 + %149 = getelementptr float, ptr addrspace(3) @global_smem, i64 %148, !dbg !34 + %150 = insertelement <1 x float> undef, float %138, i64 0, !dbg !34 + store <1 x float> %150, ptr addrspace(3) %149, align 4, !dbg !34 + %151 = zext nneg i32 %63 to i64, !dbg !34 + %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !34 + %153 = insertelement <1 x float> undef, float %141, i64 0, !dbg !34 + store <1 x float> %153, ptr addrspace(3) %152, align 4, !dbg !34 + tail call void @llvm.nvvm.barrier0(), !dbg !34 + %154 = zext nneg i32 %61 to i64, !dbg !34 + %155 = getelementptr float, ptr addrspace(3) @global_smem, i64 %154, !dbg !34 + %156 = load i32, ptr addrspace(3) %155, align 4, !dbg !34 + %157 = sext i32 %62 to i64, !dbg !35 + %158 = getelementptr float, ptr addrspace(1) %2, i64 %157, !dbg !35 + %159 = and i32 %6, 64, !dbg !36 + %160 = icmp eq i32 %159, 0, !dbg !36 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %156, ptr addrspace(1) %158, i1 %160) #3, !dbg !36 + ret void, !dbg !37 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "csjd7mlrjujd4uwze5tkg7ptteagpihgt5ztatfqchprcrax22ls.py", directory: "/tmp/torchinductor_root/sj") +!3 = !{ptr @triton__0d1d2d3de4de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3de4de, !"maxntidx", i32 128} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4de", linkageName: "triton__0d1d2d3de4de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 22, column: 44, scope: !5) +!9 = !DILocation(line: 24, column: 33, scope: !5) +!10 = !DILocation(line: 21, column: 28, scope: !5) +!11 = !DILocation(line: 21, column: 33, scope: !5) +!12 = !DILocation(line: 22, column: 23, scope: !5) +!13 = !DILocation(line: 26, column: 20, scope: !5) +!14 = !DILocation(line: 33, column: 57, scope: !5) +!15 = !DILocation(line: 29, column: 36, scope: !5) +!16 = !DILocation(line: 30, column: 27, scope: !5) +!17 = !DILocation(line: 33, column: 44, scope: !5) +!18 = !DILocation(line: 33, column: 51, scope: !5) +!19 = !DILocation(line: 33, column: 34, scope: !5) +!20 = !DILocation(line: 33, column: 63, scope: !5) +!21 = !DILocation(line: 33, column: 115, scope: !5) +!22 = !DILocation(line: 34, column: 34, scope: !5) +!23 = !DILocation(line: 34, column: 63, scope: !5) +!24 = !DILocation(line: 36, column: 22, scope: !5) +!25 = !DILocation(line: 39, column: 38, scope: !5) +!26 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !29) +!27 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0) +!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!29 = !DILocation(line: 40, column: 25, scope: !27) +!30 = !DILocation(line: 233, column: 15, scope: !31, inlinedAt: !32) +!31 = distinct !DILexicalBlockFile(scope: !27, file: !28, discriminator: 0) +!32 = !DILocation(line: 243, column: 36, scope: !31, inlinedAt: !33) +!33 = !DILocation(line: 40, column: 25, scope: !31) +!34 = !DILocation(line: 40, column: 28, scope: !5) +!35 = !DILocation(line: 41, column: 25, scope: !5) +!36 = !DILocation(line: 41, column: 36, scope: !5) +!37 = !DILocation(line: 41, column: 4, scope: !5) diff --git a/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ptx b/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..185a499db8070473dada441e0fe7d45e7bd06602 --- /dev/null +++ b/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ptx @@ -0,0 +1,577 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3de4de +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3de4de( + .param .u64 triton__0d1d2d3de4de_param_0, + .param .u64 triton__0d1d2d3de4de_param_1, + .param .u64 triton__0d1d2d3de4de_param_2, + .param .u32 triton__0d1d2d3de4de_param_3, + .param .u32 triton__0d1d2d3de4de_param_4 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<20>; + .reg .b16 %rs<5>; + .reg .b32 %r<98>; + .reg .f32 %f<47>; + .reg .b64 %rd<10>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd3, [triton__0d1d2d3de4de_param_2]; + ld.param.u64 %rd2, [triton__0d1d2d3de4de_param_1]; + ld.param.u64 %rd1, [triton__0d1d2d3de4de_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + shl.b32 %r13, %r1, 2; + and.b32 %r3, %r13, 60; + .loc 1 24 33 + bfe.u32 %r4, %r1, 5, 2; + .loc 1 21 28 + mov.u32 %r11, %ctaid.x; + .loc 1 21 33 + shl.b32 %r5, %r11, 6; + .loc 1 22 23 + or.b32 %r14, %r5, %r3; + .loc 1 26 20 + shr.s32 %r16, %r14, 31; + shr.u32 %r17, %r16, 24; + add.s32 %r18, %r14, %r17; + shr.s32 %r19, %r18, 8; + .loc 1 29 36 + mad.lo.s32 %r20, %r19, 32512, %r14; + shl.b32 %r21, %r4, 9; + add.s32 %r22, %r20, %r21; + shl.b32 %r23, %r1, 4; + and.b32 %r24, %r23, 256; + add.s32 %r96, %r22, %r24; + mov.f32 %f43, 0f00000000; + mov.b32 %r97, -8; + mov.pred %p1, -1; + mov.f32 %f44, %f43; + mov.f32 %f45, %f43; + mov.f32 %f46, %f43; +$L__BB0_1: + .loc 1 33 34 + mul.wide.s32 %rd6, %r96, 2; + add.s64 %rd4, %rd1, %rd6; + mov.b32 %r27, 0; + .loc 1 33 63 + mov.u32 %r25, 0x0; + mov.u32 %r26, 0x0; + @%p1 ld.global.L1::evict_first.v2.b32 { %r25, %r26 }, [ %rd4 + 0 ]; + @!%p1 mov.u32 %r25, %r27; + @!%p1 mov.u32 %r26, %r27; + cvt.u16.u32 %rs1, %r25; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r25; } + cvt.u16.u32 %rs3, %r26; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r26; } + .loc 1 33 115 + cvt.f32.bf16 %r29, %rs1; + mov.b32 %f13, %r29; + cvt.f32.bf16 %r30, %rs2; + mov.b32 %f14, %r30; + cvt.f32.bf16 %r31, %rs3; + mov.b32 %f15, %r31; + cvt.f32.bf16 %r32, %rs4; + mov.b32 %f16, %r32; + .loc 1 34 34 + mul.wide.s32 %rd7, %r96, 4; + add.s64 %rd5, %rd2, %rd7; + .loc 1 34 63 + mov.u32 %r33, 0x0; + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + mov.u32 %r36, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r33, %r34, %r35, %r36 }, [ %rd5 + 0 ]; + @!%p1 mov.u32 %r33, %r27; + @!%p1 mov.u32 %r34, %r27; + @!%p1 mov.u32 %r35, %r27; + @!%p1 mov.u32 %r36, %r27; + mov.b32 %f17, %r33; + mov.b32 %f18, %r34; + mov.b32 %f19, %r35; + mov.b32 %f20, %r36; + .loc 1 39 38 + fma.rn.f32 %f46, %f16, %f20, %f46; + fma.rn.f32 %f45, %f15, %f19, %f45; + fma.rn.f32 %f44, %f14, %f18, %f44; + fma.rn.f32 %f43, %f13, %f17, %f43; + .loc 1 29 36 + add.s32 %r97, %r97, 8; + add.s32 %r96, %r96, 2048; + setp.lt.u32 %p9, %r97, 120; + @%p9 bra $L__BB0_1; + .loc 1 22 44 + and.b32 %r58, %r1, 63; + .loc 1 22 23 + or.b32 %r59, %r5, %r58; +$L__tmp1: + .loc 2 243 36 + mov.b32 %r60, %f43; + shfl.sync.bfly.b32 %r61, %r60, 16, 31, -1; + mov.b32 %f21, %r61; +$L__tmp2: + .loc 2 233 15 + add.f32 %f22, %f43, %f21; +$L__tmp3: + .loc 2 243 36 + mov.b32 %r62, %f44; + shfl.sync.bfly.b32 %r63, %r62, 16, 31, -1; + mov.b32 %f23, %r63; +$L__tmp4: + .loc 2 233 15 + add.f32 %f24, %f44, %f23; +$L__tmp5: + .loc 2 243 36 + mov.b32 %r64, %f45; + shfl.sync.bfly.b32 %r65, %r64, 16, 31, -1; + mov.b32 %f25, %r65; +$L__tmp6: + .loc 2 233 15 + add.f32 %f26, %f45, %f25; +$L__tmp7: + .loc 2 243 36 + mov.b32 %r66, %f46; + shfl.sync.bfly.b32 %r67, %r66, 16, 31, -1; + mov.b32 %f27, %r67; +$L__tmp8: + .loc 2 233 15 + add.f32 %f28, %f46, %f27; +$L__tmp9: + .loc 2 243 36 + setp.lt.u32 %p10, %r2, 16; + shl.b32 %r68, %r3, 2; + or.b32 %r69, %r68, %r4; + shl.b32 %r70, %r69, 2; + mov.u32 %r71, global_smem; + add.s32 %r41, %r71, %r70; + mov.b32 %r42, %f22; + @%p10 st.shared.b32 [ %r41 + 0 ], %r42; + shl.b32 %r72, %r4, 2; + shl.b32 %r73, %r3, 4; + or.b32 %r74, %r73, 16; + or.b32 %r75, %r74, %r72; + add.s32 %r43, %r71, %r75; + mov.b32 %r44, %f24; + @%p10 st.shared.b32 [ %r43 + 0 ], %r44; + or.b32 %r76, %r73, 32; + or.b32 %r77, %r76, %r72; + add.s32 %r45, %r71, %r77; + mov.b32 %r46, %f26; + @%p10 st.shared.b32 [ %r45 + 0 ], %r46; + or.b32 %r78, %r73, 48; + or.b32 %r79, %r78, %r72; + add.s32 %r47, %r71, %r79; + mov.b32 %r48, %f28; + @%p10 st.shared.b32 [ %r47 + 0 ], %r48; + bar.sync 0; + setp.lt.s32 %p14, %r1, 256; + add.s32 %r50, %r71, %r13; + @%p14 ld.shared.b32 %r49, [ %r50 + 0 ]; + mov.b32 %f29, %r49; + shfl.sync.bfly.b32 %r81, %r49, 2, 31, -1; + mov.b32 %f30, %r81; +$L__tmp10: + .loc 2 233 15 + add.f32 %f31, %f29, %f30; +$L__tmp11: + .loc 2 243 36 + mov.b32 %r82, %f31; + shfl.sync.bfly.b32 %r83, %r82, 1, 31, -1; + mov.b32 %f32, %r83; +$L__tmp12: + .loc 2 233 15 + add.f32 %f33, %f31, %f32; +$L__tmp13: + .loc 2 243 36 + and.b32 %r84, %r1, 3; + setp.eq.s32 %p19, %r84, 0; + and.pred %p15, %p14, %p19; + mov.b32 %r52, %f33; + @%p15 st.shared.b32 [ %r50 + 0 ], %r52; + add.s32 %r54, %r50, 512; + @%p14 ld.shared.b32 %r53, [ %r54 + 0 ]; + mov.b32 %f34, %r53; + shfl.sync.bfly.b32 %r85, %r53, 2, 31, -1; + mov.b32 %f35, %r85; +$L__tmp14: + .loc 2 233 15 + add.f32 %f36, %f34, %f35; +$L__tmp15: + .loc 2 243 36 + mov.b32 %r86, %f36; + shfl.sync.bfly.b32 %r87, %r86, 1, 31, -1; + mov.b32 %f37, %r87; +$L__tmp16: + .loc 2 233 15 + add.f32 %f38, %f36, %f37; +$L__tmp17: + .loc 2 243 36 + mov.b32 %r56, %f38; + @%p15 st.shared.b32 [ %r54 + 0 ], %r56; + bar.sync 0; + add.s32 %r88, %r71, %r73; + ld.shared.f32 %f39, [%r88]; + add.s32 %r89, %r71, %r74; + ld.shared.f32 %f40, [%r89]; + add.s32 %r90, %r71, %r76; + ld.shared.f32 %f41, [%r90]; + add.s32 %r91, %r71, %r78; + ld.shared.f32 %f42, [%r91]; +$L__tmp18: + .loc 1 40 28 + bar.sync 0; + add.s32 %r92, %r71, %r68; + st.shared.f32 [%r92], %f39; + st.shared.f32 [%r92+4], %f40; + st.shared.f32 [%r92+8], %f41; + st.shared.f32 [%r92+12], %f42; + bar.sync 0; + shl.b32 %r93, %r58, 2; + add.s32 %r94, %r71, %r93; + ld.shared.u32 %r57, [%r94]; + .loc 1 41 25 + mul.wide.s32 %rd9, %r59, 4; + add.s64 %rd8, %rd3, %rd9; + .loc 1 41 36 + and.b32 %r95, %r1, 64; + setp.eq.s32 %p18, %r95, 0; + @%p18 st.global.b32 [ %rd8 + 0 ], { %r57 }; + .loc 1 41 4 + ret; +$L__tmp19: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/sj/csjd7mlrjujd4uwze5tkg7ptteagpihgt5ztatfqchprcrax22ls.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 266 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 115 +.b8 106 +.b8 100 +.b8 55 +.b8 109 +.b8 108 +.b8 114 +.b8 106 +.b8 117 +.b8 106 +.b8 100 +.b8 52 +.b8 117 +.b8 119 +.b8 122 +.b8 101 +.b8 53 +.b8 116 +.b8 107 +.b8 103 +.b8 55 +.b8 112 +.b8 116 +.b8 116 +.b8 101 +.b8 97 +.b8 103 +.b8 112 +.b8 105 +.b8 104 +.b8 103 +.b8 116 +.b8 53 +.b8 122 +.b8 116 +.b8 97 +.b8 116 +.b8 102 +.b8 113 +.b8 99 +.b8 104 +.b8 112 +.b8 114 +.b8 99 +.b8 114 +.b8 97 +.b8 120 +.b8 50 +.b8 50 +.b8 108 +.b8 115 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 115 +.b8 106 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp18 +.b8 2 +.b8 40 +.b8 25 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp17 +.b8 2 +.b8 40 +.b8 25 +.b8 4 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp17 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 270 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 270 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttgir b/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..96f5375ad075533186e3f048679c44d977bf40d5 --- /dev/null +++ b/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttgir @@ -0,0 +1,65 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<64x1xi32, #blocked> + %cst_0 = arith.constant dense<32768> : tensor<64x1xi32, #blocked> + %cst_1 = arith.constant dense<256> : tensor<1x8xi32, #blocked> + %cst_2 = arith.constant dense<128> : tensor<1x8xi32, #blocked> + %c0_i32 = arith.constant 0 : i32 + %c128_i32 = arith.constant 128 : i32 + %c8_i32 = arith.constant 8 : i32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1> + %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked> + %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1> + %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked> + %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1> + %10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked> + %12 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked> + %13 = arith.divsi %8, %cst : tensor<64x1xi32, #blocked> + %14 = tt.broadcast %12 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %15 = arith.muli %13, %cst_0 : tensor<64x1xi32, #blocked> + %16 = tt.broadcast %15 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %17 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + %18 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x8x!tt.ptr, #blocked> + %19 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32, #blocked>) : i32 { + %25 = tt.splat %arg5 : (i32) -> tensor<1x8xi32, #blocked> + %26 = arith.addi %25, %11 : tensor<1x8xi32, #blocked> + %27 = arith.cmpi slt, %26, %cst_2 : tensor<1x8xi32, #blocked> + %28 = arith.muli %26, %cst_1 : tensor<1x8xi32, #blocked> + %29 = tt.broadcast %28 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked> + %30 = arith.addi %14, %29 : tensor<64x8xi32, #blocked> + %31 = arith.addi %30, %16 : tensor<64x8xi32, #blocked> + %32 = tt.addptr %17, %31 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %33 = tt.broadcast %27 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked> + %34 = tt.load %32, %33, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked> + %35 = arith.extf %34 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked> + %36 = tt.addptr %18, %31 : tensor<64x8x!tt.ptr, #blocked>, tensor<64x8xi32, #blocked> + %37 = tt.load %36, %33, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked> + %38 = arith.mulf %35, %37 : tensor<64x8xf32, #blocked> + %39 = arith.addf %arg6, %38 : tensor<64x8xf32, #blocked> + %40 = arith.select %33, %39, %arg6 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> + scf.yield %40 : tensor<64x8xf32, #blocked> + } + %20 = "tt.reduce"(%19) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %25 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %25 : f32 + }) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %21 = triton_gpu.convert_layout %20 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xf32, #blocked1> + %23 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked1> + %24 = tt.addptr %23, %9 : tensor<64x1x!tt.ptr, #blocked1>, tensor<64x1xi32, #blocked1> + tt.store %24, %22 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32, #blocked1> + tt.return + } +} diff --git a/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttir b/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..3ef8f11a5ab02e6f395a2936666e794288e95117 --- /dev/null +++ b/.triton/dump/284d053ae6736ef59b97361c588791fb/triton_.ttir @@ -0,0 +1,58 @@ +module { + tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16> + %c8_i32 = arith.constant 8 : i32 + %c128_i32 = arith.constant 128 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_0 = arith.constant dense<32768> : tensor<64x1xi32> + %cst_1 = arith.constant dense<256> : tensor<1x8xi32> + %cst_2 = arith.constant dense<128> : tensor<1x8xi32> + %cst_3 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> + %cst_4 = arith.constant dense<256> : tensor<64x1xi32> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<64x1xi32> + %5 = arith.addi %4, %3 : tensor<64x1xi32> + %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32> + %8 = arith.remsi %5, %cst_4 : tensor<64x1xi32> + %9 = arith.divsi %5, %cst_4 : tensor<64x1xi32> + %10 = tt.broadcast %8 : (tensor<64x1xi32>) -> tensor<64x8xi32> + %11 = arith.muli %9, %cst_0 : tensor<64x1xi32> + %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x8xi32> + %13 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %14 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %15 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_3) -> (tensor<64x8xf32>) : i32 { + %20 = tt.splat %arg5 : (i32) -> tensor<1x8xi32> + %21 = arith.addi %20, %7 : tensor<1x8xi32> + %22 = arith.cmpi slt, %21, %cst_2 : tensor<1x8xi32> + %23 = arith.muli %21, %cst_1 : tensor<1x8xi32> + %24 = tt.broadcast %23 : (tensor<1x8xi32>) -> tensor<64x8xi32> + %25 = arith.addi %10, %24 : tensor<64x8xi32> + %26 = arith.addi %25, %12 : tensor<64x8xi32> + %27 = tt.addptr %13, %26 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %28 = tt.broadcast %22 : (tensor<1x8xi1>) -> tensor<64x8xi1> + %29 = tt.load %27, %28, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16> + %30 = arith.extf %29 : tensor<64x8xbf16> to tensor<64x8xf32> + %31 = tt.addptr %14, %26 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %32 = tt.load %31, %28, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32> + %33 = arith.mulf %30, %32 : tensor<64x8xf32> + %34 = arith.addf %arg6, %33 : tensor<64x8xf32> + %35 = arith.select %28, %34, %arg6 : tensor<64x8xi1>, tensor<64x8xf32> + scf.yield %35 : tensor<64x8xf32> + } + %16 = "tt.reduce"(%15) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %20 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %20 : f32 + }) : (tensor<64x8xf32>) -> tensor<64xf32> + %17 = tt.expand_dims %16 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %18 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %19 = tt.addptr %18, %5 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> + tt.store %19, %17 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32> + tt.return + } +} diff --git a/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ptx b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..c08911a056519cbfce8a8991f889ab5659121a49 --- /dev/null +++ b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ptx @@ -0,0 +1,456 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3de4e +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3de4e( + .param .u64 triton__0d1d2d3de4e_param_0, + .param .u64 triton__0d1d2d3de4e_param_1, + .param .u64 triton__0d1d2d3de4e_param_2, + .param .u32 triton__0d1d2d3de4e_param_3, + .param .u32 triton__0d1d2d3de4e_param_4 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<10>; + .reg .b32 %r<44>; + .reg .f32 %f<11>; + .reg .b64 %rd<16>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd3, [triton__0d1d2d3de4e_param_2]; + ld.param.u64 %rd2, [triton__0d1d2d3de4e_param_1]; + ld.param.u64 %rd1, [triton__0d1d2d3de4e_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 63; + .loc 1 24 33 + bfe.u32 %r3, %r1, 6, 2; + .loc 1 21 28 + mov.u32 %r10, %ctaid.x; + .loc 1 21 33 + shl.b32 %r12, %r10, 6; + .loc 1 22 23 + or.b32 %r4, %r12, %r2; + .loc 1 27 36 + shl.b32 %r13, %r3, 17; + add.s32 %r14, %r13, %r12; + or.b32 %r42, %r14, %r2; + mov.f32 %f10, 0f00000000; + mov.b32 %r43, -4; + mov.pred %p4, -1; +$L__BB0_1: + .loc 1 31 34 + mul.wide.s32 %rd5, %r42, 4; + add.s64 %rd4, %rd1, %rd5; + mov.b32 %r16, 0; + .loc 1 31 53 + mov.u32 %r15, 0x0; + @%p4 ld.global.L1::evict_first.b32 { %r15 }, [ %rd4 + 0 ]; + @!%p4 mov.u32 %r15, %r16; + mov.b32 %f4, %r15; + .loc 1 34 38 + add.f32 %f10, %f10, %f4; + .loc 1 27 36 + add.s32 %r43, %r43, 4; + add.s32 %r42, %r42, 524288; + setp.lt.u32 %p3, %r43, 116; + @%p3 bra $L__BB0_1; +$L__tmp1: + .loc 2 243 36 + shl.b32 %r25, %r3, 2; + shl.b32 %r26, %r2, 4; + or.b32 %r27, %r26, %r25; + mov.u32 %r28, global_smem; + add.s32 %r17, %r28, %r27; + mov.b32 %r18, %f10; + @%p4 st.shared.b32 [ %r17 + 0 ], %r18; + bar.sync 0; + setp.lt.s32 %p5, %r1, 256; + shl.b32 %r29, %r1, 2; + add.s32 %r20, %r28, %r29; + @%p5 ld.shared.b32 %r19, [ %r20 + 0 ]; + mov.b32 %f5, %r19; + shfl.sync.bfly.b32 %r30, %r19, 2, 31, -1; + mov.b32 %f6, %r30; +$L__tmp2: + .loc 2 233 15 + add.f32 %f7, %f5, %f6; +$L__tmp3: + .loc 2 243 36 + mov.b32 %r31, %f7; + shfl.sync.bfly.b32 %r32, %r31, 1, 31, -1; + mov.b32 %f8, %r32; +$L__tmp4: + .loc 2 233 15 + add.f32 %f9, %f7, %f8; +$L__tmp5: + .loc 2 243 36 + and.b32 %r33, %r1, 3; + setp.eq.s32 %p9, %r33, 0; + and.pred %p6, %p5, %p9; + mov.b32 %r22, %f9; + @%p6 st.shared.b32 [ %r20 + 0 ], %r22; + bar.sync 0; + add.s32 %r34, %r28, %r26; +$L__tmp6: + .loc 1 36 20 + shr.s32 %r36, %r4, 31; + shr.u32 %r37, %r36, 24; + add.s32 %r38, %r4, %r37; + shr.s32 %r39, %r38, 8; + and.b32 %r40, %r38, -256; + sub.s32 %r41, %r4, %r40; + .loc 1 38 30 + mul.wide.s32 %rd9, %r39, 8; + add.s64 %rd7, %rd2, %rd9; + .loc 1 45 55 + ld.shared.u32 %r24, [%r34]; + .loc 1 38 35 + mov.u64 %rd6, 0x0; + @%p4 ld.global.L1::evict_last.b64 { %rd6 }, [ %rd7 + 0 ]; + .loc 1 41 32 + shr.u64 %rd10, %rd6, 54; + and.b64 %rd11, %rd10, 512; + add.s64 %rd12, %rd11, %rd6; + .loc 1 45 30 + shl.b64 %rd13, %rd12, 10; + add.s64 %rd14, %rd3, %rd13; + mul.wide.s32 %rd15, %r41, 4; + add.s64 %rd8, %rd14, %rd15; + .loc 1 45 55 + setp.eq.s32 %p8, %r3, 0; + mov.u32 %r23, 0x0; + @%p8 atom.global.gpu.acq_rel.add.f32 %r23, [ %rd8 + 0 ], %r24; + .loc 1 45 4 + ret; +$L__tmp7: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 264 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 54 +.b8 105 +.b8 107 +.b8 53 +.b8 118 +.b8 120 +.b8 55 +.b8 112 +.b8 50 +.b8 50 +.b8 102 +.b8 112 +.b8 107 +.b8 52 +.b8 100 +.b8 99 +.b8 118 +.b8 104 +.b8 53 +.b8 53 +.b8 122 +.b8 105 +.b8 109 +.b8 119 +.b8 52 +.b8 116 +.b8 53 +.b8 110 +.b8 114 +.b8 53 +.b8 122 +.b8 110 +.b8 50 +.b8 98 +.b8 55 +.b8 105 +.b8 110 +.b8 117 +.b8 106 +.b8 120 +.b8 106 +.b8 97 +.b8 117 +.b8 120 +.b8 115 +.b8 104 +.b8 108 +.b8 106 +.b8 117 +.b8 109 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 54 +.b8 105 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp6 +.b8 2 +.b8 35 +.b8 25 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp5 +.b8 2 +.b8 35 +.b8 25 +.b8 4 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp5 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttir b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..42ac242162058bd1f96006963526019a8211b9b3 --- /dev/null +++ b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttir @@ -0,0 +1,61 @@ +module { + tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<64x1xi64> + %cst_0 = arith.constant dense<0> : tensor<64x1xi64> + %cst_1 = arith.constant dense<512> : tensor<64x1xi64> + %c4_i32 = arith.constant 4 : i32 + %c120_i32 = arith.constant 120 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_2 = arith.constant dense : tensor<64x1xi1> + %cst_3 = arith.constant dense<256> : tensor<64x1xi32> + %cst_4 = arith.constant dense<131072> : tensor<1x4xi32> + %cst_5 = arith.constant dense<120> : tensor<1x4xi32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x4xf32> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<64x1xi32> + %5 = arith.addi %4, %3 : tensor<64x1xi32> + %6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32>) -> tensor<1x4xi32> + %8 = tt.broadcast %5 : (tensor<64x1xi32>) -> tensor<64x4xi32> + %9 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x4x!tt.ptr> + %10 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c4_i32 iter_args(%arg6 = %cst_6) -> (tensor<64x4xf32>) : i32 { + %27 = tt.splat %arg5 : (i32) -> tensor<1x4xi32> + %28 = arith.addi %27, %7 : tensor<1x4xi32> + %29 = arith.cmpi slt, %28, %cst_5 : tensor<1x4xi32> + %30 = arith.muli %28, %cst_4 : tensor<1x4xi32> + %31 = tt.broadcast %30 : (tensor<1x4xi32>) -> tensor<64x4xi32> + %32 = arith.addi %8, %31 : tensor<64x4xi32> + %33 = tt.addptr %9, %32 : tensor<64x4x!tt.ptr>, tensor<64x4xi32> + %34 = tt.broadcast %29 : (tensor<1x4xi1>) -> tensor<64x4xi1> + %35 = tt.load %33, %34, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32> + %36 = arith.addf %arg6, %35 : tensor<64x4xf32> + %37 = arith.select %34, %36, %arg6 : tensor<64x4xi1>, tensor<64x4xf32> + scf.yield %37 : tensor<64x4xf32> + } + %11 = "tt.reduce"(%10) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %27 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %27 : f32 + }) : (tensor<64x4xf32>) -> tensor<64xf32> + %12 = tt.expand_dims %11 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %13 = arith.divsi %5, %cst_3 : tensor<64x1xi32> + %14 = arith.remsi %5, %cst_3 : tensor<64x1xi32> + %15 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %16 = tt.addptr %15, %13 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> + %17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64> + %18 = arith.addi %17, %cst_1 : tensor<64x1xi64> + %19 = arith.cmpi slt, %17, %cst_0 : tensor<64x1xi64> + %20 = arith.select %19, %18, %17 : tensor<64x1xi1>, tensor<64x1xi64> + %21 = arith.muli %20, %cst : tensor<64x1xi64> + %22 = arith.extsi %14 : tensor<64x1xi32> to tensor<64x1xi64> + %23 = arith.addi %22, %21 : tensor<64x1xi64> + %24 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %25 = tt.addptr %24, %23 : tensor<64x1x!tt.ptr>, tensor<64x1xi64> + %26 = "tt.atomic_rmw"(%25, %12, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr>, tensor<64x1xf32>, tensor<64x1xi1>) -> tensor<64x1xf32> + tt.return + } +} diff --git a/.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.cubin b/.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..0fd8f5b667af6b6e50fa087b33a10ef7ad9294b4 Binary files /dev/null and b/.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.cubin differ diff --git a/.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ptx b/.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..1c2271a41ef6eb2139f34f221ea040af4f5e6bd6 --- /dev/null +++ b/.triton/dump/3cd3b6d7993c56f7d0340d40c84f737c/triton_.ptx @@ -0,0 +1,809 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6de7de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5d6de7de( + .param .u64 triton__0d1d2d3d4d5d6de7de_param_0, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_1, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_2, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_3, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_4, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_5, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_6, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_7 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<33>; + .reg .b16 %rs<13>; + .reg .b32 %r<93>; + .reg .f32 %f<79>; + .reg .b64 %rd<92>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd37, [triton__0d1d2d3d4d5d6de7de_param_4]; + ld.param.u64 %rd36, [triton__0d1d2d3d4d5d6de7de_param_3]; + ld.param.u64 %rd35, [triton__0d1d2d3d4d5d6de7de_param_2]; + ld.param.u64 %rd34, [triton__0d1d2d3d4d5d6de7de_param_1]; + ld.param.u64 %rd43, [triton__0d1d2d3d4d5d6de7de_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + bfe.u32 %r2, %r1, 2, 6; + and.b32 %r16, %r1, 63; + .loc 1 24 33 + and.b32 %r3, %r1, 3; + .loc 1 21 28 + mov.u32 %r15, %ctaid.x; + .loc 1 21 33 + shl.b32 %r17, %r15, 6; + .loc 1 22 23 + or.b32 %r18, %r17, %r2; + or.b32 %r19, %r17, %r16; + .loc 1 26 30 + mul.wide.s32 %rd44, %r18, 8; + add.s64 %rd40, %rd43, %rd44; + mul.wide.s32 %rd45, %r19, 8; + add.s64 %rd42, %rd43, %rd45; + mov.pred %p13, -1; + .loc 1 26 35 + mov.u64 %rd39, 0x0; + @%p13 ld.global.L1::evict_last.b64 { %rd39 }, [ %rd40 + 0 ]; + mov.u64 %rd41, 0x0; + @%p13 ld.global.L1::evict_last.b64 { %rd41 }, [ %rd42 + 0 ]; + .loc 1 27 18 + bfe.s32 %r20, %r15, 25, 1; + shr.u32 %r21, %r20, 23; + add.s32 %r22, %r18, %r21; + and.b32 %r23, %r22, 16776704; + sub.s32 %r24, %r18, %r23; + .loc 1 35 44 + shl.b32 %r5, %r24, 8; + .loc 1 37 22 + add.s64 %rd46, %rd41, 50257; + .loc 1 38 22 + setp.lt.s64 %p3, %rd39, 0; + setp.lt.s64 %p4, %rd41, 0; + .loc 1 39 36 + selp.b64 %rd47, %rd46, %rd41, %p4; + .loc 1 40 40 + setp.gt.u64 %p5, %rd47, 50256; + .loc 1 41 44 + shl.b64 %rd48, %rd39, 8; + add.s64 %rd49, %rd48, 12865792; + selp.b64 %rd2, %rd49, %rd48, %p3; + mov.u16 %rs12, 0; + mov.b32 %r76, 0; + mov.b32 %r88, 883; + mov.u64 %rd81, 1; + .loc 1 40 55 + @%p5 bra $L__BB0_3; + bra.uni $L__BB0_1; +$L__BB0_3: + .loc 1 31 36 + shl.b64 %rd55, %rd2, 2; + mul.wide.u32 %rd88, %r3, 4; + add.s64 %rd87, %rd55, %rd88; + add.s64 %rd83, %rd34, %rd87; + shl.b32 %r42, %r15, 14; + shl.b32 %r43, %r2, 8; + or.b32 %r44, %r42, %r43; + or.b32 %r91, %r44, %r3; + add.s32 %r45, %r5, %r3; + mul.wide.s32 %rd86, %r45, 4; + add.s64 %rd82, %rd35, %rd86; + mov.f32 %f78, 0f00000000; + mov.b32 %r89, -4; + mov.f32 %f77, %f78; + mov.f32 %f76, %f78; +$L__BB0_4: + .loc 1 35 50 + mov.u32 %r46, 0x0; + @%p13 ld.global.L1::evict_last.b32 { %r46 }, [ %rd82 + 0 ]; + @!%p13 mov.u32 %r46, %r76; + mov.b32 %f31, %r46; + .loc 1 31 36 + add.s32 %r89, %r89, 4; + .loc 1 36 34 + add.s32 %r54, %r89, %r91; + mul.wide.s32 %rd59, %r54, 2; + add.s64 %rd57, %rd36, %rd59; + .loc 1 36 50 + mov.u16 %rs4, 0x0; + @%p13 ld.global.L1::evict_last.b16 { %rs4 }, [ %rd57 + 0 ]; + @!%p13 mov.u16 %rs4, %rs12; + .loc 1 36 101 + cvt.f32.bf16 %r48, %rs4; + mov.b32 %f32, %r48; + .loc 1 40 55 + mov.u64 %rd60, assertMessage_0; + cvta.global.u64 %rd61, %rd60; + mov.u64 %rd62, assertFile_0; + cvta.global.u64 %rd63, %rd62; + mov.u64 %rd64, assertFunc_0; + cvta.global.u64 %rd65, %rd64; + { // callseq 10, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd61; + .param .b64 param1; + st.param.b64 [param1+0], %rd63; + .param .b32 param2; + st.param.b32 [param2+0], %r88; + .param .b64 param3; + st.param.b64 [param3+0], %rd65; + .param .b64 param4; + st.param.b64 [param4+0], %rd81; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 10 + .loc 1 41 52 + mov.u32 %r49, 0x0; + @%p13 ld.global.L1::evict_last.b32 { %r49 }, [ %rd83 + 0 ]; + @!%p13 mov.u32 %r49, %r76; + mov.b32 %f33, %r49; + .loc 1 42 22 + add.f32 %f34, %f31, %f33; + .loc 1 44 22 + add.f32 %f35, %f32, %f34; +$L__tmp1: + .loc 2 96 20 + sub.f32 %f36, %f35, %f76; + .loc 2 97 26 + add.f32 %f78, %f78, 0f3F800000; + .loc 2 98 30 + mov.b32 %r52, %f36; + mov.b32 %r53, %f78; + div.full.f32 %r51, %r52, %r53; + mov.b32 %f37, %r51; + .loc 2 98 22 + add.f32 %f76, %f76, %f37; + .loc 2 101 30 + sub.f32 %f38, %f35, %f76; +$L__tmp2: + .loc 1 50 50 + fma.rn.f32 %f77, %f36, %f38, %f77; + .loc 1 31 36 + add.s64 %rd83, %rd83, 16; + add.s64 %rd82, %rd82, 16; + setp.lt.u32 %p19, %r89, 252; + @%p19 bra $L__BB0_4; + bra.uni $L__BB0_5; +$L__BB0_1: + .loc 1 0 36 + mov.b32 %r90, -4; + .loc 1 31 36 + shl.b64 %rd50, %rd2, 2; + mul.wide.u32 %rd88, %r3, 4; + add.s64 %rd87, %rd50, %rd88; + add.s64 %rd85, %rd34, %rd87; + shl.b32 %r27, %r15, 14; + shl.b32 %r28, %r2, 8; + or.b32 %r29, %r27, %r28; + or.b32 %r91, %r29, %r3; + add.s32 %r30, %r5, %r3; + mul.wide.s32 %rd86, %r30, 4; + add.s64 %rd84, %rd35, %rd86; + mov.f32 %f78, 0f00000000; + mov.f32 %f77, %f78; + mov.f32 %f76, %f78; +$L__BB0_2: + .loc 1 35 50 + mov.u32 %r31, 0x0; + @%p13 ld.global.L1::evict_last.b32 { %r31 }, [ %rd84 + 0 ]; + @!%p13 mov.u32 %r31, %r76; + mov.b32 %f22, %r31; + .loc 1 31 36 + add.s32 %r90, %r90, 4; + .loc 1 36 34 + add.s32 %r39, %r90, %r91; + mul.wide.s32 %rd54, %r39, 2; + add.s64 %rd52, %rd36, %rd54; + .loc 1 36 50 + mov.u16 %rs1, 0x0; + @%p13 ld.global.L1::evict_last.b16 { %rs1 }, [ %rd52 + 0 ]; + @!%p13 mov.u16 %rs1, %rs12; + .loc 1 36 101 + cvt.f32.bf16 %r33, %rs1; + mov.b32 %f23, %r33; + .loc 1 41 52 + mov.u32 %r34, 0x0; + @%p13 ld.global.L1::evict_last.b32 { %r34 }, [ %rd85 + 0 ]; + @!%p13 mov.u32 %r34, %r76; + mov.b32 %f24, %r34; + .loc 1 42 22 + add.f32 %f25, %f22, %f24; + .loc 1 44 22 + add.f32 %f26, %f23, %f25; +$L__tmp3: + .loc 2 96 20 + sub.f32 %f27, %f26, %f76; + .loc 2 97 26 + add.f32 %f78, %f78, 0f3F800000; + .loc 2 98 30 + mov.b32 %r37, %f27; + mov.b32 %r38, %f78; + div.full.f32 %r36, %r37, %r38; + mov.b32 %f28, %r36; + .loc 2 98 22 + add.f32 %f76, %f76, %f28; + .loc 2 101 30 + sub.f32 %f29, %f26, %f76; +$L__tmp4: + .loc 1 50 50 + fma.rn.f32 %f77, %f27, %f29, %f77; + .loc 1 31 36 + add.s64 %rd85, %rd85, 16; + add.s64 %rd84, %rd84, 16; + setp.lt.u32 %p12, %r90, 252; + @%p12 bra $L__BB0_2; +$L__BB0_5: + .loc 1 0 36 + ld.param.u64 %rd38, [triton__0d1d2d3d4d5d6de7de_param_5]; +$L__tmp5: + .loc 2 120 46 + mov.b32 %r66, %f76; + shfl.sync.bfly.b32 %r67, %r66, 2, 31, -1; + mov.b32 %f39, %r67; + mov.b32 %r68, %f77; + shfl.sync.bfly.b32 %r69, %r68, 2, 31, -1; + mov.b32 %f40, %r69; + mov.b32 %r70, %f78; + shfl.sync.bfly.b32 %r57, %r70, 2, 31, -1; + mov.b32 %f41, %r57; +$L__tmp6: + .loc 2 108 21 + sub.f32 %f42, %f39, %f76; + .loc 2 109 28 + add.f32 %f43, %f78, %f41; + .loc 2 110 39 + setp.eq.f32 %p20, %f43, 0f00000000; + .loc 2 110 60 + mov.b32 %r58, %f43; + div.full.f32 %r56, %r57, %r58; + mov.b32 %f44, %r56; + .loc 2 110 49 + selp.f32 %f45, 0f00000000, %f44, %p20; + .loc 2 112 17 + fma.rn.f32 %f46, %f42, %f45, %f76; + .loc 2 113 15 + add.f32 %f47, %f77, %f40; + .loc 2 113 30 + mul.f32 %f48, %f42, %f42; + .loc 2 113 38 + mul.f32 %f49, %f78, %f48; + .loc 2 113 22 + fma.rn.f32 %f50, %f49, %f45, %f47; +$L__tmp7: + .loc 2 120 46 + mov.b32 %r71, %f46; + shfl.sync.bfly.b32 %r72, %r71, 1, 31, -1; + mov.b32 %f51, %r72; + mov.b32 %r73, %f50; + shfl.sync.bfly.b32 %r74, %r73, 1, 31, -1; + mov.b32 %f52, %r74; + shfl.sync.bfly.b32 %r60, %r58, 1, 31, -1; + mov.b32 %f53, %r60; +$L__tmp8: + .loc 2 108 21 + sub.f32 %f54, %f51, %f46; + .loc 2 109 28 + add.f32 %f55, %f43, %f53; + .loc 2 110 39 + setp.eq.f32 %p21, %f55, 0f00000000; + .loc 2 110 60 + mov.b32 %r61, %f55; + div.full.f32 %r59, %r60, %r61; + mov.b32 %f56, %r59; + .loc 2 110 49 + selp.f32 %f57, 0f00000000, %f56, %p21; + .loc 2 112 17 + fma.rn.f32 %f16, %f54, %f57, %f46; + .loc 2 113 15 + add.f32 %f58, %f50, %f52; + .loc 2 113 30 + mul.f32 %f59, %f54, %f54; + .loc 2 113 38 + mul.f32 %f60, %f43, %f59; + .loc 2 113 22 + fma.rn.f32 %f61, %f57, %f60, %f58; +$L__tmp9: + .loc 1 75 24 + mov.b32 %r63, %f61; + mov.b32 %r64, 1132462080; + div.full.f32 %r62, %r63, %r64; + mov.b32 %f62, %r62; + .loc 1 77 24 + add.f32 %f17, %f62, 0f3727C5AC; + .loc 1 58 36 + add.s64 %rd91, %rd34, %rd87; + add.s64 %rd90, %rd37, %rd88; + add.s64 %rd89, %rd35, %rd86; + mov.b32 %r92, -4; + setp.lt.u64 %p28, %rd47, 50257; + rsqrt.approx.ftz.f32 %f67, %f17; + bra.uni $L__BB0_6; +$L__BB0_8: + .loc 1 0 0 + mov.b32 %f18, %r75; + cvt.s64.s32 %rd30, %r81; + cvt.f32.bf16 %r77, %rs7; + mov.b32 %f19, %r77; + mov.b32 %f20, %r78; + .loc 1 69 54 + mov.u32 %r83, 0x0; + @%p13 ld.global.L1::evict_first.b32 { %r83 }, [ %rd91 + 0 ]; + @!%p13 mov.u32 %r83, %r76; + mov.b32 %f63, %r83; + .loc 1 70 24 + add.f32 %f64, %f18, %f63; + .loc 1 72 24 + add.f32 %f65, %f19, %f64; + .loc 1 73 24 + sub.f32 %f66, %f65, %f16; + .loc 1 79 24 + mul.f32 %f68, %f66, %f67; + .loc 1 80 24 + mul.f32 %f69, %f68, %f20; + .loc 1 82 29 + shl.b64 %rd80, %rd30, 1; + add.s64 %rd79, %rd38, %rd80; + .loc 1 82 52 + mov.b32 %r85, %f69; + cvt.rn.bf16.f32 %rs10, %r85; + @%p13 st.global.b16 [ %rd79 + 0 ], { %rs10 }; + .loc 1 58 36 + add.s32 %r92, %r92, 4; + add.s64 %rd91, %rd91, 16; + add.s64 %rd90, %rd90, 16; + add.s64 %rd89, %rd89, 16; + setp.lt.u32 %p32, %r92, 252; + @%p32 bra $L__BB0_6; + bra.uni $L__BB0_9; +$L__BB0_6: + .loc 1 62 51 + mov.u32 %r75, 0x0; + @%p13 ld.global.L1::evict_last.b32 { %r75 }, [ %rd89 + 0 ]; + @!%p13 mov.u32 %r75, %r76; + .loc 1 63 35 + add.s32 %r80, %r91, %r92; + add.s32 %r81, %r80, 4; + mul.wide.s32 %rd70, %r81, 2; + add.s64 %rd68, %rd36, %rd70; + .loc 1 63 51 + mov.u16 %rs7, 0x0; + @%p13 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd68 + 0 ]; + @!%p13 mov.u16 %rs7, %rs12; + .loc 1 64 40 + mov.u32 %r78, 0x0; + @%p13 ld.global.L1::evict_last.b32 { %r78 }, [ %rd90 + 0 ]; + @!%p13 mov.u32 %r78, %r76; + .loc 1 68 57 + @%p28 bra $L__BB0_8; + mov.u64 %rd71, assertMessage_1; + cvta.global.u64 %rd72, %rd71; + mov.u64 %rd73, assertFile_1; + cvta.global.u64 %rd74, %rd73; + mov.u64 %rd75, assertFunc_1; + cvta.global.u64 %rd76, %rd75; + { // callseq 11, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd72; + .param .b64 param1; + st.param.b64 [param1+0], %rd74; + .param .b32 param2; + st.param.b32 [param2+0], %r88; + .param .b64 param3; + st.param.b64 [param3+0], %rd76; + .param .b64 param4; + st.param.b64 [param4+0], %rd81; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 11 + bra.uni $L__BB0_8; +$L__BB0_9: + .loc 1 58 4 + ret; +$L__tmp10: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 302 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 112 +.b8 110 +.b8 51 +.b8 108 +.b8 97 +.b8 119 +.b8 103 +.b8 54 +.b8 53 +.b8 108 +.b8 112 +.b8 105 +.b8 54 +.b8 51 +.b8 103 +.b8 118 +.b8 54 +.b8 99 +.b8 54 +.b8 112 +.b8 110 +.b8 52 +.b8 111 +.b8 105 +.b8 107 +.b8 104 +.b8 103 +.b8 54 +.b8 113 +.b8 118 +.b8 97 +.b8 50 +.b8 104 +.b8 50 +.b8 113 +.b8 106 +.b8 100 +.b8 112 +.b8 120 +.b8 101 +.b8 54 +.b8 113 +.b8 106 +.b8 52 +.b8 108 +.b8 118 +.b8 116 +.b8 116 +.b8 119 +.b8 101 +.b8 122 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 112 +.b8 110 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp4 +.b8 2 +.b8 47 +.b8 41 +.b8 4 +.b32 125 +.b64 $L__tmp5 +.b64 $L__tmp8 +.b8 2 +.b8 53 +.b8 44 +.b8 5 +.b32 125 +.b64 $L__tmp6 +.b64 $L__tmp9 +.b8 2 +.b8 53 +.b8 44 +.b8 4 +.b32 125 +.b64 $L__tmp6 +.b64 $L__tmp9 +.b8 2 +.b8 120 +.b8 46 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ptx b/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..ce09ef7254f94f26f759e203ef94afed22fbb348 --- /dev/null +++ b/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ptx @@ -0,0 +1,717 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5de6de +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5de6de( + .param .u64 triton__0d1d2d3d4d5de6de_param_0, + .param .u64 triton__0d1d2d3d4d5de6de_param_1, + .param .u64 triton__0d1d2d3d4d5de6de_param_2, + .param .u64 triton__0d1d2d3d4d5de6de_param_3, + .param .u64 triton__0d1d2d3d4d5de6de_param_4, + .param .u32 triton__0d1d2d3d4d5de6de_param_5, + .param .u32 triton__0d1d2d3d4d5de6de_param_6 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<26>; + .reg .b16 %rs<9>; + .reg .b32 %r<88>; + .reg .f32 %f<78>; + .reg .b64 %rd<14>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd6, [triton__0d1d2d3d4d5de6de_param_0]; + ld.param.u64 %rd7, [triton__0d1d2d3d4d5de6de_param_1]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r56, %tid.x; + and.b32 %r57, %r56, 31; + ld.param.u64 %rd8, [triton__0d1d2d3d4d5de6de_param_2]; + ld.param.u64 %rd9, [triton__0d1d2d3d4d5de6de_param_3]; + ld.param.u64 %rd10, [triton__0d1d2d3d4d5de6de_param_4]; + shl.b32 %r58, %r56, 2; + and.b32 %r59, %r58, 252; + .loc 1 23 28 + mov.u32 %r1, %ctaid.x; + .loc 1 30 40 + shl.b32 %r60, %r1, 8; + .loc 1 30 36 + or.b32 %r61, %r60, %r59; + .loc 1 30 30 + mul.wide.s32 %rd11, %r61, 4; + add.s64 %rd1, %rd6, %rd11; + mov.b32 %r6, 0; + mov.pred %p1, -1; + .loc 1 30 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + mov.b32 %f1, %r4; + mov.b32 %f2, %r5; + .loc 1 31 30 + mul.wide.s32 %rd12, %r61, 2; + add.s64 %rd2, %rd7, %rd12; + .loc 1 31 46 + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r6; + @!%p1 mov.u32 %r11, %r6; + cvt.u16.u32 %rs1, %r10; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; } + cvt.u16.u32 %rs3, %r11; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; } + .loc 1 31 67 + cvt.f32.bf16 %r14, %rs1; + mov.b32 %f3, %r14; + cvt.f32.bf16 %r15, %rs2; + mov.b32 %f4, %r15; + cvt.f32.bf16 %r16, %rs3; + mov.b32 %f5, %r16; + cvt.f32.bf16 %r17, %rs4; + mov.b32 %f6, %r17; + .loc 1 32 30 + add.s64 %rd3, %rd8, %rd12; + .loc 1 32 46 + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + @%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r6; + @!%p1 mov.u32 %r19, %r6; + cvt.u16.u32 %rs5, %r18; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; } + cvt.u16.u32 %rs7, %r19; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; } + .loc 1 32 67 + cvt.f32.bf16 %r22, %rs5; + mov.b32 %f7, %r22; + cvt.f32.bf16 %r23, %rs6; + mov.b32 %f8, %r23; + cvt.f32.bf16 %r24, %rs7; + mov.b32 %f9, %r24; + cvt.f32.bf16 %r25, %rs8; + mov.b32 %f10, %r25; + .loc 1 33 31 + mul.wide.u32 %rd13, %r59, 4; + add.s64 %rd4, %rd9, %rd13; + .loc 1 33 36 + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + mov.u32 %r28, 0x0; + mov.u32 %r29, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ]; + @!%p1 mov.u32 %r26, %r6; + @!%p1 mov.u32 %r27, %r6; + @!%p1 mov.u32 %r28, %r6; + @!%p1 mov.u32 %r29, %r6; + .loc 1 35 18 + add.f32 %f11, %f5, %f1; + add.f32 %f12, %f6, %f2; + .loc 1 30 46 + mov.b32 %f13, %r3; + mov.b32 %f14, %r2; + .loc 1 35 18 + add.f32 %f15, %f3, %f14; + add.f32 %f16, %f4, %f13; + .loc 1 37 18 + add.f32 %f17, %f16, %f8; + add.f32 %f18, %f15, %f7; + add.f32 %f19, %f11, %f9; + add.f32 %f20, %f12, %f10; +$L__tmp1: + .loc 2 233 15 + add.f32 %f21, %f18, %f17; + add.f32 %f22, %f21, %f19; + add.f32 %f23, %f22, %f20; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r62, %f23; + shfl.sync.bfly.b32 %r63, %r62, 16, 31, -1; + mov.b32 %f24, %r63; +$L__tmp3: + .loc 2 233 15 + add.f32 %f25, %f23, %f24; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r64, %f25; + shfl.sync.bfly.b32 %r65, %r64, 8, 31, -1; + mov.b32 %f26, %r65; +$L__tmp5: + .loc 2 233 15 + add.f32 %f27, %f25, %f26; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r66, %f27; + shfl.sync.bfly.b32 %r67, %r66, 4, 31, -1; + mov.b32 %f28, %r67; +$L__tmp7: + .loc 2 233 15 + add.f32 %f29, %f27, %f28; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r68, %f29; + shfl.sync.bfly.b32 %r69, %r68, 2, 31, -1; + mov.b32 %f30, %r69; +$L__tmp9: + .loc 2 233 15 + add.f32 %f31, %f29, %f30; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r70, %f31; + shfl.sync.bfly.b32 %r71, %r70, 1, 31, -1; + mov.b32 %f32, %r71; +$L__tmp11: + .loc 2 233 15 + add.f32 %f33, %f31, %f32; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p17, %r57, 0; + shr.u32 %r72, %r56, 3; + and.b32 %r73, %r72, 4; + mov.u32 %r74, global_smem; + add.s32 %r34, %r74, %r73; + mov.b32 %r35, %f33; + @%p17 st.shared.b32 [ %r34 + 0 ], %r35; + bar.sync 0; + setp.lt.s32 %p18, %r56, 2; + add.s32 %r37, %r74, %r58; + @%p18 ld.shared.b32 %r36, [ %r37 + 0 ]; + mov.b32 %f34, %r36; + shfl.sync.bfly.b32 %r75, %r36, 1, 31, -1; + mov.b32 %f35, %r75; +$L__tmp13: + .loc 2 233 15 + add.f32 %f36, %f34, %f35; +$L__tmp14: + .loc 2 243 36 + and.b32 %r76, %r56, 1; + setp.eq.b32 %p24, %r76, 1; + not.pred %p25, %p24; + and.pred %p19, %p18, %p25; + mov.b32 %r39, %f36; + @%p19 st.shared.b32 [ %r37 + 0 ], %r39; + bar.sync 0; + ld.shared.f32 %f37, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f38, %f37, 0f00000000; +$L__tmp16: + .loc 1 45 20 + mov.b32 %r41, %f38; + mov.b32 %r42, 1132462080; + div.full.f32 %r40, %r41, %r42; + mov.b32 %f39, %r40; + .loc 1 46 19 + sub.f32 %f40, %f18, %f39; + sub.f32 %f41, %f17, %f39; + sub.f32 %f42, %f19, %f39; + sub.f32 %f43, %f20, %f39; + .loc 1 47 20 + mul.f32 %f44, %f41, %f41; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f45, %f40, %f40, %f44; + fma.rn.f32 %f46, %f42, %f42, %f45; + fma.rn.f32 %f47, %f43, %f43, %f46; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r77, %f47; + shfl.sync.bfly.b32 %r78, %r77, 16, 31, -1; + mov.b32 %f48, %r78; +$L__tmp20: + .loc 2 233 15 + add.f32 %f49, %f47, %f48; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r79, %f49; + shfl.sync.bfly.b32 %r80, %r79, 8, 31, -1; + mov.b32 %f50, %r80; +$L__tmp22: + .loc 2 233 15 + add.f32 %f51, %f49, %f50; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r81, %f51; + shfl.sync.bfly.b32 %r82, %r81, 4, 31, -1; + mov.b32 %f52, %r82; +$L__tmp24: + .loc 2 233 15 + add.f32 %f53, %f51, %f52; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r83, %f53; + shfl.sync.bfly.b32 %r84, %r83, 2, 31, -1; + mov.b32 %f54, %r84; +$L__tmp26: + .loc 2 233 15 + add.f32 %f55, %f53, %f54; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r85, %f55; + shfl.sync.bfly.b32 %r86, %r85, 1, 31, -1; + mov.b32 %f56, %r86; +$L__tmp28: + .loc 2 233 15 + add.f32 %f57, %f55, %f56; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r44, %f57; + @%p17 st.shared.b32 [ %r34 + 0 ], %r44; + bar.sync 0; + @%p18 ld.shared.b32 %r45, [ %r37 + 0 ]; + mov.b32 %f58, %r45; + shfl.sync.bfly.b32 %r87, %r45, 1, 31, -1; + mov.b32 %f59, %r87; +$L__tmp30: + .loc 2 233 15 + add.f32 %f60, %f58, %f59; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r48, %f60; + @%p19 st.shared.b32 [ %r37 + 0 ], %r48; + bar.sync 0; + ld.shared.f32 %f61, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f62, %f61, 0f00000000; +$L__tmp33: + .loc 1 53 20 + mov.b32 %r50, %f62; + div.full.f32 %r49, %r50, %r42; + mov.b32 %f63, %r49; + .loc 1 55 20 + add.f32 %f64, %f63, 0f3727C5AC; + .loc 1 56 26 + rsqrt.approx.ftz.f32 %f65, %f64; + .loc 1 33 36 + mov.b32 %f66, %r29; + mov.b32 %f67, %r28; + mov.b32 %f68, %r27; + mov.b32 %f69, %r26; + .loc 1 57 20 + mul.f32 %f70, %f40, %f65; + mul.f32 %f71, %f41, %f65; + mul.f32 %f72, %f42, %f65; + mul.f32 %f73, %f43, %f65; + .loc 1 58 20 + mul.f32 %f74, %f70, %f69; + mul.f32 %f75, %f71, %f68; + mul.f32 %f76, %f72, %f67; + mul.f32 %f77, %f73, %f66; + .loc 1 59 25 + add.s64 %rd5, %rd10, %rd11; + .loc 1 59 48 + mov.b32 %r52, %f74; + mov.b32 %r53, %f75; + mov.b32 %r54, %f76; + mov.b32 %r55, %f77; + @%p1 st.global.v4.b32 [ %rd5 + 0 ], { %r52, %r53, %r54, %r55 }; + .loc 1 59 4 + ret; +$L__tmp34: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/tv/ctvr3xs46luhhbr7xomihgyropjaatss7yata4igaw6kvgwas7g2.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 395 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 116 +.b8 118 +.b8 114 +.b8 51 +.b8 120 +.b8 115 +.b8 52 +.b8 54 +.b8 108 +.b8 117 +.b8 104 +.b8 104 +.b8 98 +.b8 114 +.b8 55 +.b8 120 +.b8 111 +.b8 109 +.b8 105 +.b8 104 +.b8 103 +.b8 121 +.b8 114 +.b8 111 +.b8 112 +.b8 106 +.b8 97 +.b8 97 +.b8 116 +.b8 115 +.b8 115 +.b8 55 +.b8 121 +.b8 97 +.b8 116 +.b8 97 +.b8 52 +.b8 105 +.b8 103 +.b8 97 +.b8 119 +.b8 54 +.b8 107 +.b8 118 +.b8 103 +.b8 119 +.b8 97 +.b8 115 +.b8 55 +.b8 103 +.b8 50 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 116 +.b8 118 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 42 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 42 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 42 +.b8 45 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 50 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 50 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 50 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 399 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 399 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ttir b/.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..9c1dfd0f82dd96dcabfb644693364a3d08446a3a --- /dev/null +++ b/.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ttir @@ -0,0 +1,38 @@ +module { + tt.func public @triton__0d1d2d34e(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %c0_i32 = arith.constant 0 : i32 + %cst = arith.constant dense<0> : tensor<1x8xi64> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x8xf32> + %cst_1 = arith.constant dense<8> : tensor<1x8xi32> + %0 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> + %1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32> + %2 = arith.cmpi slt, %1, %cst_1 : tensor<1x8xi32> + %3 = tt.splat %arg1 : (!tt.ptr) -> tensor<1x8x!tt.ptr> + %4 = tt.addptr %3, %1 : tensor<1x8x!tt.ptr>, tensor<1x8xi32> + %5 = tt.load %4, %2, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x8xf32> + %6 = tt.splat %arg2 : (!tt.ptr) -> tensor<1x8x!tt.ptr> + %7 = tt.addptr %6, %1 : tensor<1x8x!tt.ptr>, tensor<1x8xi32> + %8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x8xi64> + %9 = arith.select %2, %5, %cst_0 : tensor<1x8xi1>, tensor<1x8xf32> + %10 = "tt.reduce"(%9) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %19 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %19 : f32 + }) : (tensor<1x8xf32>) -> tensor<1xf32> + %11 = tt.expand_dims %10 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32> + %12 = arith.select %2, %8, %cst : tensor<1x8xi1>, tensor<1x8xi64> + %13 = "tt.reduce"(%12) <{axis = 1 : i32}> ({ + ^bb0(%arg5: i64, %arg6: i64): + %19 = arith.addi %arg5, %arg6 : i64 + tt.reduce.return %19 : i64 + }) : (tensor<1x8xi64>) -> tensor<1xi64> + %14 = tt.expand_dims %13 {axis = 1 : i32} : (tensor<1xi64>) -> tensor<1x1xi64> + %15 = arith.sitofp %14 : tensor<1x1xi64> to tensor<1x1xf32> + %16 = arith.divf %11, %15 : tensor<1x1xf32> + gpu.barrier + %17 = tt.addptr %arg0, %c0_i32 : !tt.ptr, i32 + %18 = tt.splat %17 : (!tt.ptr) -> tensor<1x1x!tt.ptr> + tt.store %18, %16 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32> + tt.return + } +} diff --git a/.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ptx b/.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..692e6b79c189bcbf821a65cb631a78fd5d621abb --- /dev/null +++ b/.triton/dump/4a587ee49c44b4c47e51f28541749625/triton_.ptx @@ -0,0 +1,495 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4de + +.visible .entry triton__0d1d2d3d4de( + .param .u64 triton__0d1d2d3d4de_param_0, + .param .u64 triton__0d1d2d3d4de_param_1, + .param .u64 triton__0d1d2d3d4de_param_2, + .param .u64 triton__0d1d2d3d4de_param_3, + .param .u32 triton__0d1d2d3d4de_param_4 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<8>; + .reg .b16 %rs<33>; + .reg .b32 %r<77>; + .reg .f32 %f<65>; + .reg .b64 %rd<11>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd5, [triton__0d1d2d3d4de_param_0]; + ld.param.u64 %rd6, [triton__0d1d2d3d4de_param_1]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r50, %tid.x; + shl.b32 %r51, %r50, 3; + ld.param.u64 %rd7, [triton__0d1d2d3d4de_param_2]; + and.b32 %r52, %r51, 1016; + ld.param.u64 %rd8, [triton__0d1d2d3d4de_param_3]; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r53, %r1, 10; + .loc 1 21 23 + or.b32 %r54, %r53, %r52; + .loc 1 23 20 + shr.s32 %r56, %r54, 31; + shr.u32 %r57, %r56, 24; + add.s32 %r58, %r54, %r57; + shr.s32 %r59, %r58, 8; + .loc 1 23 27 + mul.hi.s32 %r60, %r59, 1431655766; + shr.u32 %r61, %r60, 31; + add.s32 %r62, %r60, %r61; + mul.lo.s32 %r63, %r62, 3; + sub.s32 %r64, %r59, %r63; + and.b32 %r65, %r58, -256; + sub.s32 %r66, %r54, %r65; + .loc 1 25 20 + mul.hi.s32 %r67, %r54, 715827883; + shr.u32 %r68, %r67, 31; + shr.u32 %r69, %r67, 7; + add.s32 %r70, %r69, %r68; + .loc 1 27 40 + shl.b32 %r71, %r70, 8; + .loc 1 27 36 + add.s32 %r72, %r71, %r66; + .loc 1 27 30 + mul.wide.s32 %rd9, %r72, 2; + add.s64 %rd1, %rd5, %rd9; + mov.pred %p1, -1; + .loc 1 27 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + cvt.u16.u32 %rs3, %r3; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; } + cvt.u16.u32 %rs5, %r4; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; } + cvt.u16.u32 %rs7, %r5; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; } + .loc 1 27 85 + cvt.f32.bf16 %r6, %rs1; + mov.b32 %f1, %r6; + cvt.f32.bf16 %r7, %rs2; + mov.b32 %f2, %r7; + cvt.f32.bf16 %r8, %rs3; + mov.b32 %f3, %r8; + cvt.f32.bf16 %r9, %rs4; + mov.b32 %f4, %r9; + cvt.f32.bf16 %r10, %rs5; + mov.b32 %f5, %r10; + cvt.f32.bf16 %r11, %rs6; + mov.b32 %f6, %r11; + cvt.f32.bf16 %r12, %rs7; + mov.b32 %f7, %r12; + cvt.f32.bf16 %r13, %rs8; + mov.b32 %f8, %r13; + .loc 1 28 30 + add.s64 %rd2, %rd6, %rd9; + .loc 1 28 46 + mov.u32 %r14, 0x0; + mov.u32 %r15, 0x0; + mov.u32 %r16, 0x0; + mov.u32 %r17, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd2 + 0 ]; + cvt.u16.u32 %rs9, %r14; + { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r14; } + cvt.u16.u32 %rs11, %r15; + { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r15; } + cvt.u16.u32 %rs13, %r16; + { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r16; } + cvt.u16.u32 %rs15, %r17; + { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r17; } + .loc 1 28 85 + cvt.f32.bf16 %r18, %rs9; + mov.b32 %f9, %r18; + cvt.f32.bf16 %r19, %rs10; + mov.b32 %f10, %r19; + cvt.f32.bf16 %r20, %rs11; + mov.b32 %f11, %r20; + cvt.f32.bf16 %r21, %rs12; + mov.b32 %f12, %r21; + cvt.f32.bf16 %r22, %rs13; + mov.b32 %f13, %r22; + cvt.f32.bf16 %r23, %rs14; + mov.b32 %f14, %r23; + cvt.f32.bf16 %r24, %rs15; + mov.b32 %f15, %r24; + cvt.f32.bf16 %r25, %rs16; + mov.b32 %f16, %r25; + .loc 1 29 31 + add.s64 %rd3, %rd7, %rd9; + .loc 1 29 47 + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + mov.u32 %r28, 0x0; + mov.u32 %r29, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd3 + 0 ]; + cvt.u16.u32 %rs17, %r26; + { .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r26; } + cvt.u16.u32 %rs19, %r27; + { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r27; } + cvt.u16.u32 %rs21, %r28; + { .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r28; } + cvt.u16.u32 %rs23, %r29; + { .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r29; } + .loc 1 29 86 + cvt.f32.bf16 %r30, %rs17; + mov.b32 %f17, %r30; + cvt.f32.bf16 %r31, %rs18; + mov.b32 %f18, %r31; + cvt.f32.bf16 %r32, %rs19; + mov.b32 %f19, %r32; + cvt.f32.bf16 %r33, %rs20; + mov.b32 %f20, %r33; + cvt.f32.bf16 %r34, %rs21; + mov.b32 %f21, %r34; + cvt.f32.bf16 %r35, %rs22; + mov.b32 %f22, %r35; + cvt.f32.bf16 %r36, %rs23; + mov.b32 %f23, %r36; + cvt.f32.bf16 %r37, %rs24; + mov.b32 %f24, %r37; + .loc 1 32 19 + setp.eq.s32 %p5, %r64, 2; + .loc 1 34 32 + selp.f32 %f25, %f1, 0f00000000, %p5; + selp.f32 %f26, %f2, 0f00000000, %p5; + selp.f32 %f27, %f3, 0f00000000, %p5; + selp.f32 %f28, %f4, 0f00000000, %p5; + selp.f32 %f29, %f5, 0f00000000, %p5; + selp.f32 %f30, %f6, 0f00000000, %p5; + selp.f32 %f31, %f7, 0f00000000, %p5; + selp.f32 %f32, %f8, 0f00000000, %p5; + .loc 1 36 19 + setp.eq.s32 %p6, %r64, 1; + .loc 1 37 32 + selp.f32 %f33, %f9, 0f00000000, %p6; + selp.f32 %f34, %f10, 0f00000000, %p6; + selp.f32 %f35, %f11, 0f00000000, %p6; + selp.f32 %f36, %f12, 0f00000000, %p6; + selp.f32 %f37, %f13, 0f00000000, %p6; + selp.f32 %f38, %f14, 0f00000000, %p6; + selp.f32 %f39, %f15, 0f00000000, %p6; + selp.f32 %f40, %f16, 0f00000000, %p6; + .loc 1 38 19 + add.f32 %f41, %f25, %f33; + add.f32 %f42, %f26, %f34; + add.f32 %f43, %f27, %f35; + add.f32 %f44, %f28, %f36; + add.f32 %f45, %f29, %f37; + add.f32 %f46, %f30, %f38; + add.f32 %f47, %f31, %f39; + add.f32 %f48, %f32, %f40; + .loc 1 40 20 + setp.eq.s32 %p7, %r64, 0; + .loc 1 41 35 + selp.f32 %f49, %f17, 0f00000000, %p7; + selp.f32 %f50, %f18, 0f00000000, %p7; + selp.f32 %f51, %f19, 0f00000000, %p7; + selp.f32 %f52, %f20, 0f00000000, %p7; + selp.f32 %f53, %f21, 0f00000000, %p7; + selp.f32 %f54, %f22, 0f00000000, %p7; + selp.f32 %f55, %f23, 0f00000000, %p7; + selp.f32 %f56, %f24, 0f00000000, %p7; + .loc 1 42 20 + add.f32 %f57, %f41, %f49; + add.f32 %f58, %f42, %f50; + add.f32 %f59, %f43, %f51; + add.f32 %f60, %f44, %f52; + add.f32 %f61, %f45, %f53; + add.f32 %f62, %f46, %f54; + add.f32 %f63, %f47, %f55; + add.f32 %f64, %f48, %f56; + .loc 1 43 25 + mul.wide.s32 %rd10, %r54, 2; + add.s64 %rd4, %rd8, %rd10; + .loc 1 43 37 + mov.b32 %r38, %f57; + cvt.rn.bf16.f32 %rs25, %r38; + mov.b32 %r39, %f58; + cvt.rn.bf16.f32 %rs26, %r39; + mov.b32 %r40, %f59; + cvt.rn.bf16.f32 %rs27, %r40; + mov.b32 %r41, %f60; + cvt.rn.bf16.f32 %rs28, %r41; + mov.b32 %r42, %f61; + cvt.rn.bf16.f32 %rs29, %r42; + mov.b32 %r43, %f62; + cvt.rn.bf16.f32 %rs30, %r43; + mov.b32 %r44, %f63; + cvt.rn.bf16.f32 %rs31, %r44; + mov.b32 %r45, %f64; + cvt.rn.bf16.f32 %rs32, %r45; + mov.b32 %r73, {%rs25, %rs26}; + mov.b32 %r74, {%rs27, %rs28}; + mov.b32 %r75, {%rs29, %rs30}; + mov.b32 %r76, {%rs31, %rs32}; + @%p1 st.global.v4.b32 [ %rd4 + 0 ], { %r73, %r74, %r75, %r76 }; + .loc 1 43 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/63/c63r7iurwk5ydlswh7rvhcmlx2cfretlrewgw6tljlursshgtfpp.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 184 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 54 +.b8 51 +.b8 114 +.b8 55 +.b8 105 +.b8 117 +.b8 114 +.b8 119 +.b8 107 +.b8 53 +.b8 121 +.b8 100 +.b8 108 +.b8 115 +.b8 119 +.b8 104 +.b8 55 +.b8 114 +.b8 118 +.b8 104 +.b8 99 +.b8 109 +.b8 108 +.b8 120 +.b8 50 +.b8 99 +.b8 102 +.b8 114 +.b8 101 +.b8 116 +.b8 108 +.b8 114 +.b8 101 +.b8 119 +.b8 103 +.b8 119 +.b8 54 +.b8 116 +.b8 108 +.b8 106 +.b8 108 +.b8 117 +.b8 114 +.b8 115 +.b8 115 +.b8 104 +.b8 103 +.b8 116 +.b8 102 +.b8 112 +.b8 112 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 54 +.b8 51 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 188 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 188 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.llir b/.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..102e67c9802341a53b63197c7751569781fdf423 --- /dev/null +++ b/.triton/dump/63ac7476060ddeef758fa13ad6ed58f5/triton_.llir @@ -0,0 +1,278 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !5 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %10 = and i32 %9, 31, !dbg !8 + %11 = lshr i32 %9, 5, !dbg !8 + %12 = and i32 %11, 1, !dbg !8 + %urem = shl i32 %9, 2, !dbg !8 + %13 = and i32 %urem, 252, !dbg !8 + %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9 + %15 = shl i32 %14, 8, !dbg !10 + %16 = or i32 %15, %13, !dbg !11 + %17 = sext i32 %16 to i64, !dbg !12 + %18 = getelementptr i16, ptr addrspace(1) %1, i64 %17, !dbg !12 + %19 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %18, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13 + %20 = extractvalue { i32, i32 } %19, 0, !dbg !13 + %21 = extractvalue { i32, i32 } %19, 1, !dbg !13 + %22 = trunc i32 %20 to i16, !dbg !13 + %extelt.offset = lshr i32 %20, 16, !dbg !13 + %23 = trunc i32 %extelt.offset to i16, !dbg !13 + %24 = trunc i32 %21 to i16, !dbg !13 + %extelt.offset1 = lshr i32 %21, 16, !dbg !13 + %25 = trunc i32 %extelt.offset1 to i16, !dbg !13 + %26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #3, !dbg !14 + %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #3, !dbg !14 + %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #3, !dbg !14 + %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #3, !dbg !14 + %30 = zext nneg i32 %13 to i64, !dbg !15 + %31 = getelementptr float, ptr addrspace(1) %2, i64 %30, !dbg !15 + %32 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %31, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %33 = extractvalue { i32, i32, i32, i32 } %32, 0, !dbg !16 + %34 = extractvalue { i32, i32, i32, i32 } %32, 1, !dbg !16 + %35 = extractvalue { i32, i32, i32, i32 } %32, 2, !dbg !16 + %36 = extractvalue { i32, i32, i32, i32 } %32, 3, !dbg !16 + %37 = bitcast i32 %33 to float, !dbg !16 + %38 = bitcast i32 %34 to float, !dbg !16 + %39 = bitcast i32 %35 to float, !dbg !16 + %40 = bitcast i32 %36 to float, !dbg !16 + %41 = getelementptr float, ptr addrspace(1) %3, i64 %17, !dbg !17 + %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18 + %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !18 + %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !18 + %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !18 + %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !18 + %47 = bitcast i32 %43 to float, !dbg !18 + %48 = bitcast i32 %44 to float, !dbg !18 + %49 = bitcast i32 %45 to float, !dbg !18 + %50 = bitcast i32 %46 to float, !dbg !18 + %51 = getelementptr float, ptr addrspace(1) %0, i64 %17, !dbg !19 + %52 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %51, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20 + %53 = extractvalue { i32, i32, i32, i32 } %52, 0, !dbg !20 + %54 = extractvalue { i32, i32, i32, i32 } %52, 1, !dbg !20 + %55 = extractvalue { i32, i32, i32, i32 } %52, 2, !dbg !20 + %56 = extractvalue { i32, i32, i32, i32 } %52, 3, !dbg !20 + %57 = bitcast i32 %53 to float, !dbg !20 + %58 = bitcast i32 %54 to float, !dbg !20 + %59 = bitcast i32 %55 to float, !dbg !20 + %60 = bitcast i32 %56 to float, !dbg !20 + %61 = sext i32 %14 to i64, !dbg !21 + %62 = getelementptr float, ptr addrspace(1) %4, i64 %61, !dbg !21 + %63 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22 + %64 = bitcast i32 %63 to float, !dbg !22 + %65 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22 + %66 = bitcast i32 %65 to float, !dbg !22 + %67 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22 + %68 = bitcast i32 %67 to float, !dbg !22 + %69 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %62, i1 true) #3, !dbg !22 + %70 = bitcast i32 %69 to float, !dbg !22 + %71 = fmul float %26, %37, !dbg !23 + %72 = fmul float %27, %38, !dbg !23 + %73 = fmul float %28, %39, !dbg !23 + %74 = fmul float %29, %40, !dbg !23 + %75 = fadd float %71, %72, !dbg !24 + %76 = fadd float %73, %75, !dbg !24 + %77 = fadd float %74, %76, !dbg !24 + %78 = bitcast float %77 to i32, !dbg !30 + %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 16, i32 31), !dbg !30 + %80 = bitcast i32 %79 to float, !dbg !30 + %81 = fadd float %77, %80, !dbg !24 + %82 = bitcast float %81 to i32, !dbg !30 + %83 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %82, i32 8, i32 31), !dbg !30 + %84 = bitcast i32 %83 to float, !dbg !30 + %85 = fadd float %81, %84, !dbg !24 + %86 = bitcast float %85 to i32, !dbg !30 + %87 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %86, i32 4, i32 31), !dbg !30 + %88 = bitcast i32 %87 to float, !dbg !30 + %89 = fadd float %85, %88, !dbg !24 + %90 = bitcast float %89 to i32, !dbg !30 + %91 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %90, i32 2, i32 31), !dbg !30 + %92 = bitcast i32 %91 to float, !dbg !30 + %93 = fadd float %89, %92, !dbg !24 + %94 = bitcast float %93 to i32, !dbg !30 + %95 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %94, i32 1, i32 31), !dbg !30 + %96 = bitcast i32 %95 to float, !dbg !30 + %97 = fadd float %93, %96, !dbg !24 + %98 = icmp eq i32 %10, 0, !dbg !30 + %99 = zext nneg i32 %12 to i64, !dbg !30 + %100 = getelementptr float, ptr addrspace(3) @global_smem, i64 %99, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %97, i1 %98) #3, !dbg !30 + tail call void @llvm.nvvm.barrier0(), !dbg !30 + %101 = icmp slt i32 %9, 2, !dbg !30 + %102 = sext i32 %9 to i64, !dbg !30 + %103 = getelementptr float, ptr addrspace(3) @global_smem, i64 %102, !dbg !30 + %104 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %101) #3, !dbg !30 + %105 = bitcast float %104 to i32, !dbg !30 + %106 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %105, i32 1, i32 31), !dbg !30 + %107 = bitcast i32 %106 to float, !dbg !30 + %108 = fadd float %104, %107, !dbg !24 + %109 = and i32 %9, 1, !dbg !30 + %110 = icmp eq i32 %109, 0, !dbg !30 + %111 = and i1 %101, %110, !dbg !30 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, float %108, i1 %111) #3, !dbg !30 + tail call void @llvm.nvvm.barrier0(), !dbg !30 + %112 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !30 + %113 = fadd float %112, 0.000000e+00, !dbg !32 + %114 = fmul float %71, %47, !dbg !36 + %115 = fmul float %72, %48, !dbg !36 + %116 = fmul float %73, %49, !dbg !36 + %117 = fmul float %74, %50, !dbg !36 + tail call void @llvm.nvvm.barrier0(), !dbg !37 + %118 = fadd float %114, %115, !dbg !39 + %119 = fadd float %116, %118, !dbg !39 + %120 = fadd float %117, %119, !dbg !39 + %121 = bitcast float %120 to i32, !dbg !37 + %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 16, i32 31), !dbg !37 + %123 = bitcast i32 %122 to float, !dbg !37 + %124 = fadd float %120, %123, !dbg !39 + %125 = bitcast float %124 to i32, !dbg !37 + %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 8, i32 31), !dbg !37 + %127 = bitcast i32 %126 to float, !dbg !37 + %128 = fadd float %124, %127, !dbg !39 + %129 = bitcast float %128 to i32, !dbg !37 + %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 4, i32 31), !dbg !37 + %131 = bitcast i32 %130 to float, !dbg !37 + %132 = fadd float %128, %131, !dbg !39 + %133 = bitcast float %132 to i32, !dbg !37 + %134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 2, i32 31), !dbg !37 + %135 = bitcast i32 %134 to float, !dbg !37 + %136 = fadd float %132, %135, !dbg !39 + %137 = bitcast float %136 to i32, !dbg !37 + %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 1, i32 31), !dbg !37 + %139 = bitcast i32 %138 to float, !dbg !37 + %140 = fadd float %136, %139, !dbg !39 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %100, float %140, i1 %98) #3, !dbg !37 + tail call void @llvm.nvvm.barrier0(), !dbg !37 + %141 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %103, i1 %101) #3, !dbg !37 + %142 = bitcast float %141 to i32, !dbg !37 + %143 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 1, i32 31), !dbg !37 + %144 = bitcast i32 %143 to float, !dbg !37 + %145 = fadd float %141, %144, !dbg !39 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %103, float %145, i1 %111) #3, !dbg !37 + tail call void @llvm.nvvm.barrier0(), !dbg !37 + %146 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !37 + %147 = fadd float %146, 0.000000e+00, !dbg !42 + %148 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %64, float 2.560000e+02) #3, !dbg !44 + %149 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %66, float 2.560000e+02) #3, !dbg !44 + %150 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %68, float 2.560000e+02) #3, !dbg !44 + %151 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %70, float 2.560000e+02) #3, !dbg !44 + %152 = fmul float %71, 2.560000e+02, !dbg !45 + %153 = fmul float %72, 2.560000e+02, !dbg !45 + %154 = fmul float %73, 2.560000e+02, !dbg !45 + %155 = fmul float %74, 2.560000e+02, !dbg !45 + %156 = fsub float %152, %113, !dbg !46 + %157 = fsub float %153, %113, !dbg !46 + %158 = fsub float %154, %113, !dbg !46 + %159 = fsub float %155, %113, !dbg !46 + %160 = fmul float %147, %47, !dbg !47 + %161 = fmul float %147, %48, !dbg !47 + %162 = fmul float %147, %49, !dbg !47 + %163 = fmul float %147, %50, !dbg !47 + %164 = fsub float %156, %160, !dbg !48 + %165 = fsub float %157, %161, !dbg !48 + %166 = fsub float %158, %162, !dbg !48 + %167 = fsub float %159, %163, !dbg !48 + %168 = fmul float %148, %164, !dbg !49 + %169 = fmul float %148, %165, !dbg !49 + %170 = fmul float %148, %166, !dbg !49 + %171 = fmul float %148, %167, !dbg !49 + %172 = fadd float %168, %57, !dbg !50 + %173 = fadd float %169, %58, !dbg !50 + %174 = fadd float %170, %59, !dbg !50 + %175 = fadd float %171, %60, !dbg !50 + %176 = bitcast float %172 to i32, !dbg !51 + %177 = bitcast float %173 to i32, !dbg !51 + %178 = bitcast float %174 to i32, !dbg !51 + %179 = bitcast float %175 to i32, !dbg !51 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %176, i32 %177, i32 %178, i32 %179, ptr addrspace(1) %51, i1 true) #3, !dbg !51 + %180 = getelementptr i16, ptr addrspace(1) %5, i64 %17, !dbg !52 + %181 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %172) #3, !dbg !53 + %182 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %173) #3, !dbg !53 + %183 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %174) #3, !dbg !53 + %184 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %175) #3, !dbg !53 + %185 = insertelement <2 x i16> undef, i16 %181, i64 0, !dbg !53 + %186 = insertelement <2 x i16> %185, i16 %182, i64 1, !dbg !53 + %187 = bitcast <2 x i16> %186 to i32, !dbg !53 + %188 = insertelement <2 x i16> undef, i16 %183, i64 0, !dbg !53 + %189 = insertelement <2 x i16> %188, i16 %184, i64 1, !dbg !53 + %190 = bitcast <2 x i16> %189 to i32, !dbg !53 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %187, i32 %190, ptr addrspace(1) %180, i1 true) #3, !dbg !53 + ret void, !dbg !54 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "crnynbmsd2yell2lpjymb46rttfaea2xjwsbxr75j54gctfgi457.py", directory: "/tmp/torchinductor_root/rn") +!3 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 26, column: 26, scope: !5) +!9 = !DILocation(line: 23, column: 28, scope: !5) +!10 = !DILocation(line: 30, column: 40, scope: !5) +!11 = !DILocation(line: 30, column: 36, scope: !5) +!12 = !DILocation(line: 30, column: 30, scope: !5) +!13 = !DILocation(line: 30, column: 46, scope: !5) +!14 = !DILocation(line: 30, column: 67, scope: !5) +!15 = !DILocation(line: 31, column: 30, scope: !5) +!16 = !DILocation(line: 31, column: 35, scope: !5) +!17 = !DILocation(line: 32, column: 30, scope: !5) +!18 = !DILocation(line: 32, column: 46, scope: !5) +!19 = !DILocation(line: 33, column: 35, scope: !5) +!20 = !DILocation(line: 33, column: 51, scope: !5) +!21 = !DILocation(line: 34, column: 31, scope: !5) +!22 = !DILocation(line: 34, column: 36, scope: !5) +!23 = !DILocation(line: 36, column: 18, scope: !5) +!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !28) +!25 = distinct !DILexicalBlockFile(scope: !27, file: !26, discriminator: 0) +!26 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!27 = distinct !DILexicalBlockFile(scope: !5, file: !26, discriminator: 0) +!28 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !29) +!29 = !DILocation(line: 39, column: 57, scope: !25) +!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31) +!31 = !DILocation(line: 39, column: 57, scope: !27) +!32 = !DILocation(line: 8, column: 15, scope: !33, inlinedAt: !35) +!33 = distinct !DILexicalBlockFile(scope: !5, file: !34, discriminator: 0) +!34 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!35 = !DILocation(line: 39, column: 44, scope: !33) +!36 = !DILocation(line: 40, column: 18, scope: !5) +!37 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !38) +!38 = !DILocation(line: 43, column: 59, scope: !27) +!39 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !40) +!40 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !41) +!41 = !DILocation(line: 43, column: 59, scope: !25) +!42 = !DILocation(line: 8, column: 15, scope: !33, inlinedAt: !43) +!43 = !DILocation(line: 43, column: 45, scope: !33) +!44 = !DILocation(line: 45, column: 20, scope: !5) +!45 = !DILocation(line: 46, column: 19, scope: !5) +!46 = !DILocation(line: 47, column: 20, scope: !5) +!47 = !DILocation(line: 48, column: 19, scope: !5) +!48 = !DILocation(line: 49, column: 20, scope: !5) +!49 = !DILocation(line: 50, column: 20, scope: !5) +!50 = !DILocation(line: 51, column: 20, scope: !5) +!51 = !DILocation(line: 53, column: 51, scope: !5) +!52 = !DILocation(line: 54, column: 25, scope: !5) +!53 = !DILocation(line: 54, column: 48, scope: !5) +!54 = !DILocation(line: 54, column: 4, scope: !5) diff --git a/.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ptx b/.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..8fe4676895b13d6c78ab8f713ffae182d899fb00 --- /dev/null +++ b/.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.ptx @@ -0,0 +1,1154 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6de7de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_1[8] = {60, 109, 111, 100, 117, 108, 101, 62}; +.global .align 1 .b8 assertFile_1[68] = {47, 117, 115, 114, 47, 108, 111, 99, 97, 108, 47, 108, 105, 98, 47, 112, 121, 116, 104, 111, 110, 51, 46, 49, 48, 47, 100, 105, 115, 116, 45, 112, 97, 99, 107, 97, 103, 101, 115, 47, 116, 111, 114, 99, 104, 47, 95, 105, 110, 100, 117, 99, 116, 111, 114, 47, 99, 111, 100, 101, 99, 97, 99, 104, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 assertFunc_0[8] = {60, 109, 111, 100, 117, 108, 101, 62}; +.global .align 1 .b8 assertFile_0[68] = {47, 117, 115, 114, 47, 108, 111, 99, 97, 108, 47, 108, 105, 98, 47, 112, 121, 116, 104, 111, 110, 51, 46, 49, 48, 47, 100, 105, 115, 116, 45, 112, 97, 99, 107, 97, 103, 101, 115, 47, 116, 111, 114, 99, 104, 47, 95, 105, 110, 100, 117, 99, 116, 111, 114, 47, 99, 111, 100, 101, 99, 97, 99, 104, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5d6de7de( + .param .u64 triton__0d1d2d3d4d5d6de7de_param_0, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_1, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_2, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_3, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_4, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_5, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_6, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_7 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<65>; + .reg .b16 %rs<13>; + .reg .b32 %r<188>; + .reg .f32 %f<166>; + .reg .b64 %rd<99>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6de7de_param_3]; + ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6de7de_param_2]; + ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6de7de_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6de7de_param_1]; + bfe.u32 %r3, %r1, 6, 1; + and.b32 %r4, %r1, 1; + .loc 1 24 33 + shl.b32 %r23, %r1, 1; + and.b32 %r5, %r23, 126; + .loc 1 21 28 + mov.u32 %r14, %ctaid.x; + .loc 1 21 33 + shl.b32 %r24, %r14, 1; + .loc 1 22 23 + or.b32 %r25, %r24, %r3; + or.b32 %r26, %r24, %r4; + .loc 1 26 30 + mul.wide.s32 %rd26, %r25, 8; + add.s64 %rd17, %rd24, %rd26; + mul.wide.s32 %rd27, %r26, 8; + add.s64 %rd21, %rd24, %rd27; + mov.pred %p61, -1; + .loc 1 26 35 + mov.u64 %rd16, 0x0; + @%p61 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd17 + 0 ]; + mov.u64 %rd18, 0x0; + @%p61 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd17 + 0 ]; + mov.u64 %rd20, 0x0; + @%p61 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ]; + .loc 1 27 18 + bfe.s32 %r27, %r14, 30, 1; + shr.u32 %r28, %r27, 23; + add.s32 %r29, %r25, %r28; + and.b32 %r30, %r29, 16776704; + sub.s32 %r31, %r25, %r30; + .loc 1 35 44 + shl.b32 %r6, %r31, 8; + .loc 1 36 44 + shl.b32 %r7, %r25, 8; + .loc 1 37 22 + add.s64 %rd28, %rd20, 50257; + .loc 1 38 22 + setp.lt.s64 %p9, %rd16, 0; + setp.lt.s64 %p10, %rd20, 0; + .loc 1 39 36 + selp.b64 %rd1, %rd28, %rd20, %p10; + .loc 1 40 40 + setp.lt.u64 %p11, %rd1, 50257; + .loc 1 41 44 + shl.b64 %rd29, %rd16, 8; + add.s64 %rd30, %rd29, 12865792; + selp.b64 %rd31, %rd30, %rd29, %p9; + shl.b64 %rd32, %rd31, 2; + add.s64 %rd2, %rd25, %rd32; + .loc 1 35 40 + or.b32 %r32, %r5, %r6; + .loc 1 35 34 + mul.wide.s32 %rd33, %r32, 4; + add.s64 %rd62, %rd12, %rd33; + mov.b32 %r179, 0; + .loc 1 35 50 + mov.u32 %r15, 0x0; + mov.u32 %r16, 0x0; + @%p61 ld.global.L1::evict_last.v2.b32 { %r15, %r16 }, [ %rd62 + 0 ]; + @!%p61 mov.u32 %r15, %r179; + @!%p61 mov.u32 %r16, %r179; + mov.b32 %f2, %r16; + mov.b32 %f1, %r15; + .loc 1 36 40 + or.b32 %r33, %r5, %r7; + .loc 1 36 34 + mul.wide.s32 %rd34, %r33, 2; + add.s64 %rd63, %rd13, %rd34; + .loc 1 36 50 + mov.u32 %r19, 0x0; + @%p61 ld.global.L1::evict_last.b32 { %r19 }, [ %rd63 + 0 ]; + @!%p61 mov.u32 %r19, %r179; + cvt.u16.u32 %rs1, %r19; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r19; } + .loc 1 36 101 + cvt.f32.bf16 %r21, %rs1; + mov.b32 %f3, %r21; + cvt.f32.bf16 %r22, %rs2; + mov.b32 %f4, %r22; + mov.u64 %rd95, assertMessage_0; + mov.u64 %rd96, assertFile_0; + mov.u64 %rd97, assertFunc_0; + mov.b32 %r187, 1892; + mov.u64 %rd98, 1; + .loc 1 40 55 + @%p11 bra $L__BB0_2; + cvta.global.u64 %rd36, %rd95; + cvta.global.u64 %rd38, %rd96; + cvta.global.u64 %rd40, %rd97; + { // callseq 2, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd36; + .param .b64 param1; + st.param.b64 [param1+0], %rd38; + .param .b32 param2; + st.param.b32 [param2+0], %r187; + .param .b64 param3; + st.param.b64 [param3+0], %rd40; + .param .b64 param4; + st.param.b64 [param4+0], %rd98; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 2 +$L__BB0_2: + .loc 1 0 55 + ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6de7de_param_4]; + and.b32 %r2, %r1, 31; + .loc 1 41 40 + cvt.u64.u32 %rd45, %r5; + .loc 1 41 34 + mul.wide.u32 %rd46, %r5, 4; + add.s64 %rd73, %rd2, %rd46; + .loc 1 41 52 + mov.u32 %r35, 0x0; + mov.u32 %r36, 0x0; + @%p61 ld.global.L1::evict_last.v2.b32 { %r35, %r36 }, [ %rd73 + 0 ]; + @!%p61 mov.u32 %r35, %r179; + @!%p61 mov.u32 %r36, %r179; + mov.b32 %f21, %r36; + mov.b32 %f22, %r35; + .loc 1 42 22 + add.f32 %f23, %f1, %f22; + add.f32 %f24, %f2, %f21; + .loc 1 44 22 + add.f32 %f25, %f4, %f24; + mov.b32 %r43, %f25; + add.f32 %f26, %f3, %f23; + mov.b32 %r40, %f26; + mov.b32 %r41, 1065353216; +$L__tmp1: + .loc 2 98 30 + div.full.f32 %r39, %r40, %r41; + mov.b32 %f27, %r39; + div.full.f32 %r42, %r43, %r41; + mov.b32 %f28, %r42; + .loc 2 98 22 + add.f32 %f6, %f28, 0f00000000; + add.f32 %f5, %f27, 0f00000000; + .loc 2 101 30 + sub.f32 %f29, %f26, %f5; + sub.f32 %f30, %f25, %f6; +$L__tmp2: + .loc 1 50 50 + fma.rn.f32 %f8, %f25, %f30, 0f00000000; + fma.rn.f32 %f7, %f26, %f29, 0f00000000; + .loc 1 35 34 + cvt.s64.s32 %rd47, %r6; + add.s64 %rd48, %rd45, %rd47; + shl.b64 %rd49, %rd48, 2; + add.s64 %rd50, %rd12, %rd49; + add.s64 %rd75, %rd50, 512; + .loc 1 35 50 + mov.u32 %r45, 0x0; + mov.u32 %r46, 0x0; + @%p61 ld.global.L1::evict_last.v2.b32 { %r45, %r46 }, [ %rd75 + 0 ]; + @!%p61 mov.u32 %r45, %r179; + @!%p61 mov.u32 %r46, %r179; + mov.b32 %f10, %r46; + mov.b32 %f9, %r45; + .loc 1 36 34 + cvt.s64.s32 %rd51, %r7; + add.s64 %rd8, %rd45, %rd51; + shl.b64 %rd52, %rd8, 1; + add.s64 %rd53, %rd13, %rd52; + add.s64 %rd76, %rd53, 256; + .loc 1 36 50 + mov.u32 %r49, 0x0; + @%p61 ld.global.L1::evict_last.b32 { %r49 }, [ %rd76 + 0 ]; + @!%p61 mov.u32 %r49, %r179; + cvt.u16.u32 %rs3, %r49; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r49; } + .loc 1 36 101 + cvt.f32.bf16 %r51, %rs3; + mov.b32 %f11, %r51; + cvt.f32.bf16 %r52, %rs4; + mov.b32 %f12, %r52; + .loc 1 40 55 + @%p11 bra $L__BB0_4; + cvta.global.u64 %rd55, %rd95; + cvta.global.u64 %rd57, %rd96; + cvta.global.u64 %rd59, %rd97; + { // callseq 3, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd55; + .param .b64 param1; + st.param.b64 [param1+0], %rd57; + .param .b32 param2; + st.param.b32 [param2+0], %r187; + .param .b64 param3; + st.param.b64 [param3+0], %rd59; + .param .b64 param4; + st.param.b64 [param4+0], %rd98; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 3 +$L__BB0_4: + .loc 1 0 55 + ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6de7de_param_5]; + cvt.s64.s32 %rd4, %r33; + .loc 1 41 34 + add.s64 %rd86, %rd73, 512; + .loc 1 41 52 + mov.u32 %r54, 0x0; + mov.u32 %r55, 0x0; + @%p61 ld.global.L1::evict_last.v2.b32 { %r54, %r55 }, [ %rd86 + 0 ]; + @!%p61 mov.u32 %r54, %r179; + @!%p61 mov.u32 %r55, %r179; + mov.b32 %f31, %r54; + mov.b32 %f32, %r55; + .loc 1 42 22 + add.f32 %f33, %f10, %f32; + add.f32 %f34, %f9, %f31; + .loc 1 44 22 + add.f32 %f35, %f11, %f34; + add.f32 %f36, %f12, %f33; +$L__tmp3: + .loc 2 96 20 + sub.f32 %f37, %f36, %f6; + mov.b32 %r62, %f37; + sub.f32 %f38, %f35, %f5; + mov.b32 %r59, %f38; + mov.b32 %r60, 1073741824; + .loc 2 98 30 + div.full.f32 %r58, %r59, %r60; + mov.b32 %f39, %r58; + div.full.f32 %r61, %r62, %r60; + mov.b32 %f40, %r61; + .loc 2 98 22 + add.f32 %f41, %f6, %f40; + add.f32 %f42, %f5, %f39; + .loc 2 101 30 + sub.f32 %f43, %f35, %f42; + sub.f32 %f44, %f36, %f41; +$L__tmp4: + .loc 1 50 50 + fma.rn.f32 %f45, %f37, %f44, %f8; + fma.rn.f32 %f46, %f38, %f43, %f7; + .loc 1 24 33 + and.b32 %r119, %r1, 127; + .loc 1 31 36 + shl.b32 %r120, %r119, 2; + mov.u32 %r121, global_smem; + add.s32 %r8, %r121, %r120; + st.shared.u32 [%r8], %r60; + st.shared.u32 [%r8+520], %r60; + bar.sync 0; + mad.lo.s32 %r122, %r3, 130, %r5; + shl.b32 %r123, %r122, 2; + add.s32 %r124, %r121, %r123; + ld.shared.v2.f32 {%f47, %f48}, [%r124]; +$L__tmp5: + .loc 2 120 46 + bar.sync 0; +$L__tmp6: + .loc 2 108 21 + sub.f32 %f49, %f41, %f42; + .loc 2 109 28 + add.f32 %f50, %f47, %f48; + .loc 2 110 39 + setp.eq.f32 %p41, %f50, 0f00000000; + .loc 2 110 60 + mov.b32 %r65, %f48; + mov.b32 %r66, %f50; + div.full.f32 %r64, %r65, %r66; + mov.b32 %f51, %r64; + .loc 2 110 49 + selp.f32 %f52, 0f00000000, %f51, %p41; + .loc 2 112 17 + fma.rn.f32 %f53, %f49, %f52, %f42; + .loc 2 113 15 + add.f32 %f54, %f46, %f45; + .loc 2 113 30 + mul.f32 %f55, %f49, %f49; + .loc 2 113 38 + mul.f32 %f56, %f55, %f47; + .loc 2 113 22 + fma.rn.f32 %f57, %f56, %f52, %f54; +$L__tmp7: + .loc 2 120 46 + mov.b32 %r125, %f53; + shfl.sync.bfly.b32 %r126, %r125, 16, 31, -1; + mov.b32 %f58, %r126; + mov.b32 %r127, %f57; + shfl.sync.bfly.b32 %r128, %r127, 16, 31, -1; + mov.b32 %f59, %r128; + shfl.sync.bfly.b32 %r68, %r66, 16, 31, -1; + mov.b32 %f60, %r68; +$L__tmp8: + .loc 2 108 21 + sub.f32 %f61, %f58, %f53; + .loc 2 109 28 + add.f32 %f62, %f50, %f60; + .loc 2 110 39 + setp.eq.f32 %p42, %f62, 0f00000000; + .loc 2 110 60 + mov.b32 %r69, %f62; + div.full.f32 %r67, %r68, %r69; + mov.b32 %f63, %r67; + .loc 2 110 49 + selp.f32 %f64, 0f00000000, %f63, %p42; + .loc 2 112 17 + fma.rn.f32 %f65, %f61, %f64, %f53; + .loc 2 113 15 + add.f32 %f66, %f57, %f59; + .loc 2 113 30 + mul.f32 %f67, %f61, %f61; + .loc 2 113 38 + mul.f32 %f68, %f50, %f67; + .loc 2 113 22 + fma.rn.f32 %f69, %f68, %f64, %f66; +$L__tmp9: + .loc 2 120 46 + mov.b32 %r129, %f65; + shfl.sync.bfly.b32 %r130, %r129, 8, 31, -1; + mov.b32 %f70, %r130; + mov.b32 %r131, %f69; + shfl.sync.bfly.b32 %r132, %r131, 8, 31, -1; + mov.b32 %f71, %r132; + shfl.sync.bfly.b32 %r71, %r69, 8, 31, -1; + mov.b32 %f72, %r71; +$L__tmp10: + .loc 2 108 21 + sub.f32 %f73, %f70, %f65; + .loc 2 109 28 + add.f32 %f74, %f62, %f72; + .loc 2 110 39 + setp.eq.f32 %p43, %f74, 0f00000000; + .loc 2 110 60 + mov.b32 %r72, %f74; + div.full.f32 %r70, %r71, %r72; + mov.b32 %f75, %r70; + .loc 2 110 49 + selp.f32 %f76, 0f00000000, %f75, %p43; + .loc 2 112 17 + fma.rn.f32 %f77, %f73, %f76, %f65; + .loc 2 113 15 + add.f32 %f78, %f69, %f71; + .loc 2 113 30 + mul.f32 %f79, %f73, %f73; + .loc 2 113 38 + mul.f32 %f80, %f62, %f79; + .loc 2 113 22 + fma.rn.f32 %f81, %f76, %f80, %f78; +$L__tmp11: + .loc 2 120 46 + mov.b32 %r133, %f77; + shfl.sync.bfly.b32 %r134, %r133, 4, 31, -1; + mov.b32 %f82, %r134; + mov.b32 %r135, %f81; + shfl.sync.bfly.b32 %r136, %r135, 4, 31, -1; + mov.b32 %f83, %r136; + shfl.sync.bfly.b32 %r74, %r72, 4, 31, -1; + mov.b32 %f84, %r74; +$L__tmp12: + .loc 2 108 21 + sub.f32 %f85, %f82, %f77; + .loc 2 109 28 + add.f32 %f86, %f74, %f84; + .loc 2 110 39 + setp.eq.f32 %p44, %f86, 0f00000000; + .loc 2 110 60 + mov.b32 %r75, %f86; + div.full.f32 %r73, %r74, %r75; + mov.b32 %f87, %r73; + .loc 2 110 49 + selp.f32 %f88, 0f00000000, %f87, %p44; + .loc 2 112 17 + fma.rn.f32 %f89, %f85, %f88, %f77; + .loc 2 113 15 + add.f32 %f90, %f81, %f83; + .loc 2 113 30 + mul.f32 %f91, %f85, %f85; + .loc 2 113 38 + mul.f32 %f92, %f74, %f91; + .loc 2 113 22 + fma.rn.f32 %f93, %f88, %f92, %f90; +$L__tmp13: + .loc 2 120 46 + mov.b32 %r137, %f89; + shfl.sync.bfly.b32 %r138, %r137, 2, 31, -1; + mov.b32 %f94, %r138; + mov.b32 %r139, %f93; + shfl.sync.bfly.b32 %r140, %r139, 2, 31, -1; + mov.b32 %f95, %r140; + shfl.sync.bfly.b32 %r77, %r75, 2, 31, -1; + mov.b32 %f96, %r77; +$L__tmp14: + .loc 2 108 21 + sub.f32 %f97, %f94, %f89; + .loc 2 109 28 + add.f32 %f98, %f86, %f96; + .loc 2 110 39 + setp.eq.f32 %p45, %f98, 0f00000000; + .loc 2 110 60 + mov.b32 %r78, %f98; + div.full.f32 %r76, %r77, %r78; + mov.b32 %f99, %r76; + .loc 2 110 49 + selp.f32 %f100, 0f00000000, %f99, %p45; + .loc 2 112 17 + fma.rn.f32 %f101, %f97, %f100, %f89; + .loc 2 113 15 + add.f32 %f102, %f93, %f95; + .loc 2 113 30 + mul.f32 %f103, %f97, %f97; + .loc 2 113 38 + mul.f32 %f104, %f86, %f103; + .loc 2 113 22 + fma.rn.f32 %f105, %f100, %f104, %f102; +$L__tmp15: + .loc 2 120 46 + mov.b32 %r141, %f101; + shfl.sync.bfly.b32 %r142, %r141, 1, 31, -1; + mov.b32 %f106, %r142; + mov.b32 %r143, %f105; + shfl.sync.bfly.b32 %r144, %r143, 1, 31, -1; + mov.b32 %f107, %r144; + shfl.sync.bfly.b32 %r80, %r78, 1, 31, -1; + mov.b32 %f108, %r80; +$L__tmp16: + .loc 2 108 21 + sub.f32 %f109, %f106, %f101; + .loc 2 109 28 + add.f32 %f110, %f98, %f108; + .loc 2 110 39 + setp.eq.f32 %p46, %f110, 0f00000000; + .loc 2 110 60 + mov.b32 %r81, %f110; + div.full.f32 %r79, %r80, %r81; + mov.b32 %f111, %r79; + .loc 2 110 49 + selp.f32 %f112, 0f00000000, %f111, %p46; + .loc 2 112 17 + fma.rn.f32 %f113, %f109, %f112, %f101; + .loc 2 113 15 + add.f32 %f114, %f105, %f107; + .loc 2 113 30 + mul.f32 %f115, %f109, %f109; + .loc 2 113 38 + mul.f32 %f116, %f98, %f115; + .loc 2 113 22 + fma.rn.f32 %f117, %f112, %f116, %f114; +$L__tmp17: + .loc 2 120 46 + setp.eq.s32 %p24, %r2, 0; + shr.u32 %r145, %r1, 3; + and.b32 %r146, %r145, 4; + shl.b32 %r147, %r3, 3; + or.b32 %r148, %r147, %r146; + add.s32 %r82, %r121, %r148; + mov.b32 %r83, %f113; + @%p24 st.shared.b32 [ %r82 + 0 ], %r83; + add.s32 %r149, %r121, 16; + add.s32 %r84, %r149, %r148; + mov.b32 %r85, %f117; + @%p24 st.shared.b32 [ %r84 + 0 ], %r85; + add.s32 %r150, %r121, 32; + add.s32 %r86, %r150, %r148; + @%p24 st.shared.b32 [ %r86 + 0 ], %r81; + bar.sync 0; + setp.lt.s32 %p27, %r1, 4; + shl.b32 %r151, %r1, 2; + add.s32 %r89, %r121, %r151; + @%p27 ld.shared.b32 %r88, [ %r89 + 0 ]; + mov.b32 %f118, %r88; + add.s32 %r91, %r149, %r151; + @%p27 ld.shared.b32 %r90, [ %r91 + 0 ]; + mov.b32 %f119, %r90; + add.s32 %r93, %r150, %r151; + @%p27 ld.shared.b32 %r92, [ %r93 + 0 ]; + mov.b32 %f120, %r92; + shfl.sync.bfly.b32 %r152, %r88, 1, 31, -1; + mov.b32 %f121, %r152; + shfl.sync.bfly.b32 %r153, %r90, 1, 31, -1; + mov.b32 %f122, %r153; + shfl.sync.bfly.b32 %r95, %r92, 1, 31, -1; + mov.b32 %f123, %r95; +$L__tmp18: + .loc 2 108 21 + sub.f32 %f124, %f121, %f118; + .loc 2 109 28 + add.f32 %f125, %f120, %f123; + .loc 2 110 39 + setp.eq.f32 %p47, %f125, 0f00000000; + .loc 2 110 60 + mov.b32 %r96, %f125; + div.full.f32 %r94, %r95, %r96; + mov.b32 %f126, %r94; + .loc 2 110 49 + selp.f32 %f127, 0f00000000, %f126, %p47; + .loc 2 112 17 + fma.rn.f32 %f128, %f124, %f127, %f118; + .loc 2 113 15 + add.f32 %f129, %f119, %f122; + .loc 2 113 30 + mul.f32 %f130, %f124, %f124; + .loc 2 113 38 + mul.f32 %f131, %f120, %f130; + .loc 2 113 22 + fma.rn.f32 %f132, %f131, %f127, %f129; +$L__tmp19: + .loc 2 120 46 + setp.eq.s32 %p48, %r4, 0; + and.pred %p30, %p27, %p48; + mov.b32 %r98, %f128; + @%p30 st.shared.b32 [ %r89 + 0 ], %r98; + mov.b32 %r100, %f132; + @%p30 st.shared.b32 [ %r91 + 0 ], %r100; + @%p30 st.shared.b32 [ %r93 + 0 ], %r96; + bar.sync 0; + add.s32 %r154, %r121, %r147; + ld.shared.f32 %f13, [%r154]; + add.s32 %r155, %r149, %r147; +$L__tmp20: + .loc 1 75 24 + ld.shared.u32 %r104, [%r155]; + mov.b32 %r105, 1132462080; + div.full.f32 %r103, %r104, %r105; + mov.b32 %f133, %r103; + .loc 1 77 24 + add.f32 %f14, %f133, 0f3727C5AC; + shl.b32 %r156, %r5, 2; + add.s32 %r9, %r121, %r156; + .loc 1 62 51 + mov.u32 %r109, 0x0; + mov.u32 %r110, 0x0; + @%p61 ld.global.L1::evict_last.v2.b32 { %r109, %r110 }, [ %rd62 + 0 ]; + @!%p61 mov.u32 %r109, %r179; + @!%p61 mov.u32 %r110, %r179; + mov.b32 %f15, %r109; + mov.b32 %f16, %r110; + .loc 1 63 51 + mov.u32 %r113, 0x0; + @%p61 ld.global.L1::evict_first.b32 { %r113 }, [ %rd63 + 0 ]; + @!%p61 mov.u32 %r113, %r179; + cvt.u16.u32 %rs5, %r113; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r113; } + .loc 1 63 103 + cvt.f32.bf16 %r115, %rs5; + mov.b32 %f17, %r115; + cvt.f32.bf16 %r116, %rs6; + mov.b32 %f18, %r116; + .loc 1 64 35 + mul.wide.u32 %rd65, %r119, 4; + add.s64 %rd64, %rd14, %rd65; + .loc 1 64 40 + mov.u32 %r117, 0x0; + @%p61 ld.global.L1::evict_last.b32 { %r117 }, [ %rd64 + 0 ]; + @!%p61 mov.u32 %r117, %r179; + mov.u64 %rd90, assertMessage_1; + mov.u64 %rd91, assertFile_1; + mov.u64 %rd92, assertFunc_1; + .loc 1 68 57 + @%p11 bra $L__BB0_6; + cvta.global.u64 %rd67, %rd90; + cvta.global.u64 %rd69, %rd91; + cvta.global.u64 %rd71, %rd92; + { // callseq 4, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd67; + .param .b64 param1; + st.param.b64 [param1+0], %rd69; + .param .b32 param2; + st.param.b32 [param2+0], %r187; + .param .b64 param3; + st.param.b64 [param3+0], %rd71; + .param .b64 param4; + st.param.b64 [param4+0], %rd98; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 4 +$L__BB0_6: + .loc 1 69 54 + mov.u32 %r158, 0x0; + mov.u32 %r159, 0x0; + @%p61 ld.global.L1::evict_first.v2.b32 { %r158, %r159 }, [ %rd73 + 0 ]; + @!%p61 mov.u32 %r158, %r179; + @!%p61 mov.u32 %r159, %r179; + mov.b32 %f134, %r158; + mov.b32 %f135, %r159; + .loc 1 70 24 + add.f32 %f136, %f15, %f134; + add.f32 %f137, %f16, %f135; + .loc 1 72 24 + add.f32 %f138, %f17, %f136; + add.f32 %f139, %f18, %f137; + .loc 1 73 24 + sub.f32 %f140, %f138, %f13; + sub.f32 %f141, %f139, %f13; + .loc 1 78 30 + rsqrt.approx.ftz.f32 %f142, %f14; + .loc 1 79 24 + mul.f32 %f143, %f140, %f142; + mul.f32 %f144, %f141, %f142; + .loc 1 80 24 + bar.sync 0; + st.shared.u32 [%r8], %r117; + bar.sync 0; + ld.shared.v2.f32 {%f145, %f146}, [%r9]; + mul.f32 %f147, %f143, %f145; + mul.f32 %f148, %f144, %f146; + .loc 1 82 29 + shl.b64 %rd78, %rd4, 1; + add.s64 %rd74, %rd15, %rd78; + .loc 1 82 52 + mov.b32 %r162, %f147; + cvt.rn.bf16.f32 %rs7, %r162; + mov.b32 %r163, %f148; + cvt.rn.bf16.f32 %rs8, %r163; + mov.b32 %r175, {%rs7, %rs8}; + @%p61 st.global.b32 [ %rd74 + 0 ], { %r175 }; + .loc 1 62 51 + mov.u32 %r165, 0x0; + mov.u32 %r166, 0x0; + @%p61 ld.global.L1::evict_last.v2.b32 { %r165, %r166 }, [ %rd75 + 0 ]; + @!%p61 mov.u32 %r165, %r179; + @!%p61 mov.u32 %r166, %r179; + .loc 1 63 51 + mov.u32 %r169, 0x0; + @%p61 ld.global.L1::evict_first.b32 { %r169 }, [ %rd76 + 0 ]; + @!%p61 mov.u32 %r169, %r179; + cvt.u16.u32 %rs9, %r169; + { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r169; } + .loc 1 63 103 + cvt.f32.bf16 %r171, %rs9; + mov.b32 %f19, %r171; + cvt.f32.bf16 %r172, %rs10; + mov.b32 %f20, %r172; + .loc 1 64 35 + add.s64 %rd77, %rd64, 512; + .loc 1 64 40 + mov.u32 %r173, 0x0; + @%p61 ld.global.L1::evict_last.b32 { %r173 }, [ %rd77 + 0 ]; + @!%p61 mov.u32 %r173, %r179; + .loc 1 68 57 + @%p11 bra $L__BB0_8; + cvta.global.u64 %rd80, %rd90; + cvta.global.u64 %rd82, %rd91; + cvta.global.u64 %rd84, %rd92; + { // callseq 5, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd80; + .param .b64 param1; + st.param.b64 [param1+0], %rd82; + .param .b32 param2; + st.param.b32 [param2+0], %r187; + .param .b64 param3; + st.param.b64 [param3+0], %rd84; + .param .b64 param4; + st.param.b64 [param4+0], %rd98; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 5 +$L__BB0_8: + .loc 1 69 54 + mov.u32 %r177, 0x0; + mov.u32 %r178, 0x0; + @%p61 ld.global.L1::evict_first.v2.b32 { %r177, %r178 }, [ %rd86 + 0 ]; + @!%p61 mov.u32 %r177, %r179; + @!%p61 mov.u32 %r178, %r179; + .loc 1 62 51 + mov.b32 %f150, %r166; + .loc 1 69 54 + mov.b32 %f151, %r178; + .loc 1 70 24 + add.f32 %f152, %f150, %f151; + .loc 1 72 24 + add.f32 %f153, %f20, %f152; + .loc 1 73 24 + sub.f32 %f154, %f153, %f13; + .loc 1 62 51 + mov.b32 %f155, %r165; + .loc 1 69 54 + mov.b32 %f156, %r177; + .loc 1 70 24 + add.f32 %f157, %f155, %f156; + .loc 1 72 24 + add.f32 %f158, %f19, %f157; + .loc 1 73 24 + sub.f32 %f159, %f158, %f13; + .loc 1 79 24 + mul.f32 %f160, %f159, %f142; + mul.f32 %f161, %f154, %f142; + .loc 1 80 24 + bar.sync 0; + st.shared.u32 [%r8], %r173; + bar.sync 0; + ld.shared.v2.f32 {%f162, %f163}, [%r9]; + mul.f32 %f164, %f160, %f162; + mul.f32 %f165, %f161, %f163; + .loc 1 82 29 + add.s64 %rd89, %rd15, %rd52; + add.s64 %rd87, %rd89, 256; + .loc 1 82 52 + mov.b32 %r181, %f164; + cvt.rn.bf16.f32 %rs11, %r181; + mov.b32 %r182, %f165; + cvt.rn.bf16.f32 %rs12, %r182; + mov.b32 %r184, {%rs11, %rs12}; + @%p61 st.global.b32 [ %rd87 + 0 ], { %r184 }; + .loc 1 58 4 + ret; +$L__tmp21: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 302 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 112 +.b8 110 +.b8 51 +.b8 108 +.b8 97 +.b8 119 +.b8 103 +.b8 54 +.b8 53 +.b8 108 +.b8 112 +.b8 105 +.b8 54 +.b8 51 +.b8 103 +.b8 118 +.b8 54 +.b8 99 +.b8 54 +.b8 112 +.b8 110 +.b8 52 +.b8 111 +.b8 105 +.b8 107 +.b8 104 +.b8 103 +.b8 54 +.b8 113 +.b8 118 +.b8 97 +.b8 50 +.b8 104 +.b8 50 +.b8 113 +.b8 106 +.b8 100 +.b8 112 +.b8 120 +.b8 101 +.b8 54 +.b8 113 +.b8 106 +.b8 52 +.b8 108 +.b8 118 +.b8 116 +.b8 116 +.b8 119 +.b8 101 +.b8 122 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 112 +.b8 110 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp4 +.b8 2 +.b8 47 +.b8 41 +.b8 4 +.b32 125 +.b64 $L__tmp5 +.b64 $L__tmp20 +.b8 2 +.b8 53 +.b8 44 +.b8 5 +.b32 125 +.b64 $L__tmp6 +.b64 $L__tmp19 +.b8 2 +.b8 53 +.b8 44 +.b8 4 +.b32 125 +.b64 $L__tmp6 +.b64 $L__tmp19 +.b8 2 +.b8 120 +.b8 46 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ptx b/.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..94e65d92b821ee4c712ad4579908a8c2b96a6dc8 --- /dev/null +++ b/.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.ptx @@ -0,0 +1,1608 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6e7de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 55, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5d6e7de( + .param .u64 triton__0d1d2d3d4d5d6e7de_param_0, + .param .u64 triton__0d1d2d3d4d5d6e7de_param_1, + .param .u64 triton__0d1d2d3d4d5d6e7de_param_2, + .param .u64 triton__0d1d2d3d4d5d6e7de_param_3, + .param .u64 triton__0d1d2d3d4d5d6e7de_param_4, + .param .u64 triton__0d1d2d3d4d5d6e7de_param_5, + .param .u64 triton__0d1d2d3d4d5d6e7de_param_6, + .param .u64 triton__0d1d2d3d4d5d6e7de_param_7 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<154>; + .reg .b16 %rs<83>; + .reg .b32 %r<247>; + .reg .f32 %f<401>; + .reg .b64 %rd<217>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd48, [triton__0d1d2d3d4d5d6e7de_param_5]; + ld.param.u64 %rd47, [triton__0d1d2d3d4d5d6e7de_param_4]; + ld.param.u64 %rd57, [triton__0d1d2d3d4d5d6e7de_param_0]; + ld.param.u64 %rd58, [triton__0d1d2d3d4d5d6e7de_param_1]; +$L__tmp0: + .loc 1 24 33 + mov.u32 %r1, %tid.x; + ld.param.u64 %rd59, [triton__0d1d2d3d4d5d6e7de_param_2]; + and.b32 %r2, %r1, 255; + ld.param.u64 %rd60, [triton__0d1d2d3d4d5d6e7de_param_3]; + shl.b32 %r3, %r2, 2; + or.b32 %r36, %r3, 1; + or.b32 %r37, %r3, 2; + or.b32 %r38, %r3, 3; + or.b32 %r39, %r2, 256; + or.b32 %r40, %r2, 512; + or.b32 %r41, %r2, 768; + .loc 1 21 28 + mov.u32 %r34, %ctaid.x; + .loc 1 21 34 + cvt.s64.s32 %rd1, %r34; + .loc 1 23 21 + setp.lt.s32 %p1, %r34, 8; + shl.b32 %r42, %r2, 3; + shl.b32 %r43, %r2, 5; + mov.u32 %r44, global_smem; + add.s32 %r4, %r44, %r43; + shl.b32 %r45, %r36, 1; + shl.b32 %r46, %r36, 3; + add.s32 %r5, %r44, %r46; + shl.b32 %r47, %r37, 1; + shl.b32 %r48, %r37, 3; + add.s32 %r6, %r44, %r48; + shl.b32 %r49, %r38, 1; + shl.b32 %r50, %r38, 3; + add.s32 %r7, %r44, %r50; + shl.b32 %r51, %r2, 1; + add.s32 %r12, %r44, %r42; + shl.b32 %r52, %r39, 1; + shl.b32 %r53, %r39, 3; + add.s32 %r9, %r44, %r53; + shl.b32 %r54, %r40, 1; + shl.b32 %r55, %r40, 3; + add.s32 %r10, %r44, %r55; + shl.b32 %r56, %r41, 1; + shl.b32 %r57, %r41, 3; + add.s32 %r11, %r44, %r57; + add.s32 %r13, %r44, %r45; + add.s32 %r14, %r44, %r47; + add.s32 %r15, %r44, %r49; + add.s32 %r16, %r44, %r51; + add.s32 %r17, %r44, %r52; + add.s32 %r18, %r44, %r54; + add.s32 %r19, %r44, %r56; + add.s32 %r20, %r44, %r3; + add.s32 %r21, %r44, %r2; + shl.b32 %r58, %r2, 6; + add.s32 %r22, %r44, %r58; + shl.b32 %r59, %r36, 4; + add.s32 %r23, %r44, %r59; + shl.b32 %r60, %r37, 4; + add.s32 %r24, %r44, %r60; + shl.b32 %r61, %r38, 4; + add.s32 %r25, %r44, %r61; + shl.b32 %r62, %r2, 4; + add.s32 %r26, %r44, %r62; + shl.b32 %r63, %r39, 4; + add.s32 %r27, %r44, %r63; + shl.b32 %r64, %r40, 4; + add.s32 %r28, %r44, %r64; + shl.b32 %r65, %r41, 4; + add.s32 %r29, %r44, %r65; + .loc 1 28 36 + mul.wide.s32 %rd61, %r34, 61440; + mul.wide.u32 %rd62, %r2, 32; + add.s64 %rd63, %rd61, %rd62; + add.s64 %rd64, %rd63, %rd57; + add.s64 %rd208, %rd64, 8208; + mul.wide.s32 %rd65, %r34, 771947520; + add.s64 %rd66, %rd58, %rd65; + mul.wide.u32 %rd67, %r2, 402056; + add.s64 %rd68, %rd66, %rd67; + add.s64 %rd207, %rd68, 103227878; + mul.wide.u32 %rd4, %r2, 16; + mul.wide.s32 %rd69, %r34, 30720; + add.s64 %rd206, %rd60, %rd69; + add.s64 %rd205, %rd59, %rd69; + mov.u64 %rd209, 0; + mov.f32 %f385, 0f00000000; + mov.b32 %r246, -2048; + mov.u16 %rs44, 0; + mov.f32 %f386, %f385; + mov.f32 %f387, %f385; + mov.f32 %f388, %f385; + mov.f32 %f389, %f385; + mov.f32 %f390, %f385; + mov.f32 %f391, %f385; + mov.f32 %f392, %f385; + mov.u64 %rd210, %rd209; + mov.u64 %rd211, %rd209; + mov.u64 %rd212, %rd209; + mov.u64 %rd213, %rd209; + mov.u64 %rd214, %rd209; + mov.u64 %rd215, %rd209; + mov.u64 %rd216, %rd209; + bra.uni $L__BB0_1; +$L__BB0_19: + .loc 1 36 23 + bfe.s32 %r172, %r115, 0, 8; + cvt.u16.u32 %rs67, %r172; + and.b16 %rs68, %rs67, 255; + setp.eq.s16 %p117, %rs68, 0; + bfe.s32 %r173, %r115, 8, 8; + cvt.u16.u32 %rs69, %r173; + and.b16 %rs70, %rs69, 255; + setp.eq.s16 %p118, %rs70, 0; + bfe.s32 %r174, %r115, 16, 8; + cvt.u16.u32 %rs71, %r174; + and.b16 %rs72, %rs71, 255; + setp.eq.s16 %p119, %rs72, 0; + bfe.s32 %r175, %r115, 24, 8; + cvt.u16.u32 %rs73, %r175; + and.b16 %rs74, %rs73, 255; + setp.eq.s16 %p120, %rs74, 0; + bfe.s32 %r176, %r108, 0, 8; + cvt.u16.u32 %rs75, %r176; + and.b16 %rs76, %rs75, 255; + setp.eq.s16 %p121, %rs76, 0; + bfe.s32 %r177, %r108, 8, 8; + cvt.u16.u32 %rs77, %r177; + and.b16 %rs78, %rs77, 255; + setp.eq.s16 %p122, %rs78, 0; + bfe.s32 %r178, %r108, 16, 8; + cvt.u16.u32 %rs79, %r178; + and.b16 %rs80, %rs79, 255; + setp.eq.s16 %p123, %rs80, 0; + bfe.s32 %r179, %r108, 24, 8; + cvt.u16.u32 %rs81, %r179; + and.b16 %rs82, %rs81, 255; + setp.eq.s16 %p124, %rs82, 0; + .loc 1 46 23 + setp.eq.f32 %p133, %f68, 0f00000000; + selp.f32 %f320, 0fFF800000, %f400, %p133; + bar.sync 0; + st.shared.f32 [%r4], %f37; + st.shared.f32 [%r5], %f42; + st.shared.f32 [%r6], %f47; + st.shared.f32 [%r7], %f52; + bar.sync 0; + ld.shared.f32 %f321, [%r12]; + ld.shared.f32 %f322, [%r9]; + ld.shared.f32 %f323, [%r10]; + ld.shared.f32 %f324, [%r11]; + bar.sync 0; + st.shared.f32 [%r4], %f57; + st.shared.f32 [%r5], %f62; + st.shared.f32 [%r6], %f67; + st.shared.f32 [%r7], %f320; + bar.sync 0; + ld.shared.f32 %f325, [%r12]; + ld.shared.f32 %f326, [%r9]; + ld.shared.f32 %f327, [%r10]; + ld.shared.f32 %f328, [%r11]; + .loc 1 48 17 + sub.f32 %f329, %f324, %f28; + sub.f32 %f330, %f323, %f27; + sub.f32 %f331, %f322, %f26; + sub.f32 %f332, %f321, %f25; + sub.f32 %f333, %f328, %f32; + sub.f32 %f334, %f327, %f31; + sub.f32 %f335, %f326, %f30; + sub.f32 %f336, %f325, %f29; + add.f32 %f337, %f336, 0f00000000; + add.f32 %f338, %f335, 0f00000000; + add.f32 %f339, %f334, 0f00000000; + add.f32 %f340, %f333, 0f00000000; + add.f32 %f341, %f332, 0f00000000; + add.f32 %f342, %f331, 0f00000000; + add.f32 %f343, %f330, 0f00000000; + add.f32 %f344, %f329, 0f00000000; + .loc 1 50 38 + selp.f32 %f345, 0f00000000, %f344, %p124; + selp.f32 %f346, 0f00000000, %f343, %p123; + selp.f32 %f347, 0f00000000, %f342, %p122; + selp.f32 %f348, 0f00000000, %f341, %p121; + selp.f32 %f349, 0f00000000, %f340, %p120; + selp.f32 %f350, 0f00000000, %f339, %p119; + selp.f32 %f351, 0f00000000, %f338, %p118; + selp.f32 %f352, 0f00000000, %f337, %p117; + .loc 1 53 48 + selp.f32 %f353, %f352, 0f80000000, %p1; + selp.f32 %f354, %f351, 0f80000000, %p1; + selp.f32 %f355, %f350, 0f80000000, %p90; + selp.f32 %f356, %f349, 0f80000000, %p90; + selp.f32 %f357, %f348, 0f80000000, %p1; + selp.f32 %f358, %f347, 0f80000000, %p1; + selp.f32 %f359, %f346, 0f80000000, %p1; + selp.f32 %f360, %f345, 0f80000000, %p1; + add.f32 %f388, %f388, %f360; + add.f32 %f387, %f387, %f359; + add.f32 %f386, %f386, %f358; + add.f32 %f385, %f385, %f357; + add.f32 %f392, %f392, %f356; + add.f32 %f391, %f391, %f355; + add.f32 %f390, %f390, %f354; + add.f32 %f389, %f389, %f353; + .loc 1 57 48 + and.pred %p134, %p1, %p52; + and.pred %p135, %p1, %p51; + and.pred %p136, %p1, %p50; + and.pred %p137, %p1, %p49; + and.pred %p138, %p17, %p48; + and.pred %p139, %p17, %p47; + and.pred %p140, %p17, %p46; + and.pred %p141, %p17, %p45; + selp.u64 %rd140, 1, 0, %p141; + selp.u64 %rd141, 1, 0, %p140; + selp.u64 %rd142, 1, 0, %p139; + selp.u64 %rd143, 1, 0, %p138; + selp.u64 %rd144, 1, 0, %p137; + selp.u64 %rd145, 1, 0, %p136; + selp.u64 %rd146, 1, 0, %p135; + selp.u64 %rd147, 1, 0, %p134; + add.s64 %rd209, %rd209, %rd147; + add.s64 %rd210, %rd210, %rd146; + add.s64 %rd211, %rd211, %rd145; + add.s64 %rd212, %rd212, %rd144; + add.s64 %rd213, %rd213, %rd143; + add.s64 %rd214, %rd214, %rd142; + add.s64 %rd215, %rd215, %rd141; + add.s64 %rd216, %rd216, %rd140; + .loc 1 28 36 + add.s64 %rd208, %rd208, 16384; + add.s32 %r246, %r246, 2048; + add.s64 %rd207, %rd207, 205852672; + add.s64 %rd206, %rd206, 8192; + add.s64 %rd205, %rd205, 8192; + setp.lt.u32 %p142, %r246, 5632; + @%p142 bra $L__BB0_1; + bra.uni $L__BB0_20; +$L__BB0_1: + .loc 1 0 36 + cvt.u32.u64 %r98, %rd1; + .loc 1 23 21 + setp.lt.s32 %p78, %r98, 8; + .loc 1 29 27 + add.s32 %r99, %r3, %r246; + add.s32 %r100, %r99, 3072; + .loc 1 30 25 + add.s32 %r101, %r246, 3584; + setp.lt.u32 %p43, %r100, 7680; + setp.lt.u32 %p44, %r101, 7680; + .loc 1 29 27 + add.s64 %rd72, %rd208, -8208; + .loc 1 32 34 + add.s64 %rd75, %rd208, -8192; + add.s64 %rd78, %rd208, -16; + .loc 1 32 59 + and.pred %p17, %p78, %p43; + and.pred %p90, %p78, %p44; + .loc 1 32 51 + mov.u64 %rd70, 0x0; + mov.u64 %rd71, 0x0; + @%p78 ld.global.L1::evict_first.v2.b64 { %rd70, %rd71 }, [ %rd72 + 0 ]; + @!%p78 mov.u64 %rd70, 0x0; + @!%p78 mov.u64 %rd71, 0x0; + mov.u64 %rd73, 0x0; + mov.u64 %rd74, 0x0; + @%p78 ld.global.L1::evict_first.v2.b64 { %rd73, %rd74 }, [ %rd75 + 0 ]; + @!%p78 mov.u64 %rd73, 0x0; + @!%p78 mov.u64 %rd74, 0x0; + mov.u64 %rd76, 0x0; + mov.u64 %rd77, 0x0; + @%p17 ld.global.L1::evict_first.v2.b64 { %rd76, %rd77 }, [ %rd78 + 0 ]; + @!%p17 mov.u64 %rd76, 0x0; + @!%p17 mov.u64 %rd77, 0x0; + mov.u64 %rd79, 0x0; + mov.u64 %rd80, 0x0; + @%p17 ld.global.L1::evict_first.v2.b64 { %rd79, %rd80 }, [ %rd208 + 0 ]; + @!%p17 mov.u64 %rd79, 0x0; + @!%p17 mov.u64 %rd80, 0x0; + .loc 1 33 35 + add.s64 %rd82, %rd205, %rd4; + .loc 1 33 52 + add.s64 %rd83, %rd82, 4096; + mov.b32 %r70, 0; + mov.u32 %r66, 0x0; + mov.u32 %r67, 0x0; + mov.u32 %r68, 0x0; + mov.u32 %r69, 0x0; + @%p78 ld.global.L1::evict_first.v4.b32 { %r66, %r67, %r68, %r69 }, [ %rd82 + 0 ]; + @!%p78 mov.u32 %r66, %r70; + @!%p78 mov.u32 %r67, %r70; + @!%p78 mov.u32 %r68, %r70; + @!%p78 mov.u32 %r69, %r70; + mov.u32 %r74, 0x0; + mov.u32 %r75, 0x0; + mov.u32 %r76, 0x0; + mov.u32 %r77, 0x0; + @%p17 ld.global.L1::evict_first.v4.b32 { %r74, %r75, %r76, %r77 }, [ %rd83 + 0 ]; + @!%p17 mov.u32 %r74, %r70; + @!%p17 mov.u32 %r75, %r70; + @!%p17 mov.u32 %r76, %r70; + @!%p17 mov.u32 %r77, %r70; + bar.sync 0; + st.shared.u32 [%r4], %r66; + st.shared.u32 [%r5], %r67; + st.shared.u32 [%r6], %r68; + st.shared.u32 [%r7], %r69; + bar.sync 0; + ld.shared.f32 %f9, [%r12]; + ld.shared.f32 %f10, [%r9]; + ld.shared.f32 %f11, [%r10]; + ld.shared.f32 %f12, [%r11]; + bar.sync 0; + st.shared.u32 [%r4], %r74; + st.shared.u32 [%r5], %r75; + st.shared.u32 [%r6], %r76; + st.shared.u32 [%r7], %r77; + bar.sync 0; + ld.shared.f32 %f13, [%r12]; + ld.shared.f32 %f14, [%r9]; + ld.shared.f32 %f15, [%r10]; + ld.shared.f32 %f16, [%r11]; + .loc 1 34 35 + add.s64 %rd84, %rd206, %rd4; + .loc 1 34 52 + add.s64 %rd85, %rd84, 4096; + mov.u32 %r82, 0x0; + mov.u32 %r83, 0x0; + mov.u32 %r84, 0x0; + mov.u32 %r85, 0x0; + @%p78 ld.global.L1::evict_first.v4.b32 { %r82, %r83, %r84, %r85 }, [ %rd84 + 0 ]; + @!%p78 mov.u32 %r82, %r70; + @!%p78 mov.u32 %r83, %r70; + @!%p78 mov.u32 %r84, %r70; + @!%p78 mov.u32 %r85, %r70; + mov.b32 %f17, %r82; + mov.u32 %r90, 0x0; + mov.u32 %r91, 0x0; + mov.u32 %r92, 0x0; + mov.u32 %r93, 0x0; + @%p17 ld.global.L1::evict_first.v4.b32 { %r90, %r91, %r92, %r93 }, [ %rd85 + 0 ]; + @!%p17 mov.u32 %r90, %r70; + @!%p17 mov.u32 %r91, %r70; + @!%p17 mov.u32 %r92, %r70; + @!%p17 mov.u32 %r93, %r70; + .loc 1 36 23 + setp.ne.s64 %p45, %rd80, -1; + setp.ne.s64 %p46, %rd79, -1; + setp.ne.s64 %p47, %rd77, -1; + setp.ne.s64 %p48, %rd76, -1; + setp.ne.s64 %p49, %rd74, -1; + setp.ne.s64 %p50, %rd73, -1; + setp.ne.s64 %p51, %rd71, -1; + setp.ne.s64 %p52, %rd70, -1; + bar.sync 0; + selp.u16 %rs1, 1, 0, %p52; + st.shared.u8 [%r12], %rs1; + selp.u16 %rs2, 1, 0, %p51; + st.shared.u8 [%r13], %rs2; + selp.u16 %rs3, 1, 0, %p50; + st.shared.u8 [%r14], %rs3; + selp.u16 %rs4, 1, 0, %p49; + st.shared.u8 [%r15], %rs4; + bar.sync 0; + ld.shared.u8 %r102, [%r19]; + ld.shared.u8 %r103, [%r18]; + ld.shared.u8 %r104, [%r17]; + ld.shared.u8 %r105, [%r16]; + bar.sync 0; + selp.u16 %rs5, 1, 0, %p48; + st.shared.u8 [%r12], %rs5; + selp.u16 %rs6, 1, 0, %p47; + st.shared.u8 [%r13], %rs6; + selp.u16 %rs7, 1, 0, %p46; + st.shared.u8 [%r14], %rs7; + selp.u16 %rs8, 1, 0, %p45; + st.shared.u8 [%r15], %rs8; + bar.sync 0; + bfi.b32 %r106, %r104, %r105, 8, 8; + bfi.b32 %r107, %r103, %r106, 16, 8; + bfi.b32 %r108, %r102, %r107, 24, 8; + ld.shared.u8 %r109, [%r16]; + ld.shared.u8 %r110, [%r17]; + bfi.b32 %r111, %r110, %r109, 8, 8; + ld.shared.u8 %r112, [%r18]; + bfi.b32 %r113, %r112, %r111, 16, 8; + ld.shared.u8 %r114, [%r19]; + bfi.b32 %r115, %r114, %r113, 24, 8; + .loc 1 42 40 + bar.sync 0; + .loc 1 38 36 + selp.b64 %rd86, %rd70, 0, %p52; + selp.b64 %rd87, %rd71, 0, %p51; + selp.b64 %rd88, %rd73, 0, %p50; + selp.b64 %rd89, %rd74, 0, %p49; + .loc 1 39 22 + add.s64 %rd90, %rd89, 50257; + add.s64 %rd91, %rd88, 50257; + add.s64 %rd92, %rd87, 50257; + add.s64 %rd93, %rd86, 50257; + .loc 1 40 22 + setp.lt.s64 %p53, %rd89, 0; + setp.lt.s64 %p54, %rd88, 0; + setp.lt.s64 %p55, %rd87, 0; + setp.lt.s64 %p56, %rd86, 0; + .loc 1 41 36 + selp.b64 %rd27, %rd93, %rd86, %p56; + selp.b64 %rd28, %rd92, %rd87, %p55; + selp.b64 %rd29, %rd91, %rd88, %p54; + selp.b64 %rd30, %rd90, %rd89, %p53; + .loc 1 42 40 + setp.lt.u64 %p57, %rd30, 50257; + setp.lt.u64 %p58, %rd29, 50257; + setp.lt.u64 %p59, %rd28, 50257; + setp.lt.u64 %p60, %rd27, 50257; + selp.u32 %r116, 1, 0, %p60; + selp.u32 %r117, 1, 0, %p59; + bfi.b32 %r118, %r117, %r116, 8, 8; + selp.u32 %r119, 1, 0, %p58; + bfi.b32 %r120, %r119, %r118, 16, 8; + selp.u32 %r121, 1, 0, %p57; + bfi.b32 %r122, %r121, %r120, 24, 8; + st.shared.u32 [%r20], %r122; + bar.sync 0; + ld.shared.u8 %rs9, [%r21]; + ld.shared.u8 %rs10, [%r21+256]; + ld.shared.u8 %rs11, [%r21+512]; + ld.shared.u8 %rs12, [%r21+768]; + bar.sync 0; + .loc 1 38 36 + selp.b64 %rd94, %rd76, 0, %p48; + selp.b64 %rd95, %rd77, 0, %p47; + selp.b64 %rd96, %rd79, 0, %p46; + selp.b64 %rd97, %rd80, 0, %p45; + .loc 1 39 22 + add.s64 %rd98, %rd97, 50257; + add.s64 %rd99, %rd96, 50257; + add.s64 %rd100, %rd95, 50257; + add.s64 %rd101, %rd94, 50257; + .loc 1 40 22 + setp.lt.s64 %p61, %rd97, 0; + setp.lt.s64 %p62, %rd96, 0; + setp.lt.s64 %p63, %rd95, 0; + setp.lt.s64 %p64, %rd94, 0; + .loc 1 41 36 + selp.b64 %rd31, %rd101, %rd94, %p64; + selp.b64 %rd32, %rd100, %rd95, %p63; + selp.b64 %rd33, %rd99, %rd96, %p62; + selp.b64 %rd34, %rd98, %rd97, %p61; + .loc 1 42 40 + setp.lt.u64 %p65, %rd34, 50257; + setp.lt.u64 %p66, %rd33, 50257; + setp.lt.u64 %p67, %rd32, 50257; + setp.lt.u64 %p68, %rd31, 50257; + selp.u32 %r123, 1, 0, %p68; + selp.u32 %r124, 1, 0, %p67; + bfi.b32 %r125, %r124, %r123, 8, 8; + selp.u32 %r126, 1, 0, %p66; + bfi.b32 %r127, %r126, %r125, 16, 8; + selp.u32 %r128, 1, 0, %p65; + bfi.b32 %r129, %r128, %r127, 24, 8; + st.shared.u32 [%r20], %r129; + bar.sync 0; + ld.shared.u8 %rs13, [%r21]; + ld.shared.u8 %rs14, [%r21+256]; + ld.shared.u8 %rs15, [%r21+512]; + ld.shared.u8 %rs16, [%r21+768]; + setp.eq.s16 %p69, %rs11, 0; + selp.u16 %rs17, 1, 0, %p69; + shl.b16 %rs18, %rs17, 2; + setp.eq.s16 %p70, %rs12, 0; + selp.u16 %rs19, -1, 0, %p70; + shl.b16 %rs20, %rs19, 3; + or.b16 %rs21, %rs20, %rs18; + setp.eq.s16 %p71, %rs10, 0; + selp.u16 %rs22, 1, 0, %p71; + setp.eq.s16 %p72, %rs9, 0; + selp.u16 %rs23, -1, 0, %p72; + shl.b16 %rs24, %rs23, 1; + or.b16 %rs25, %rs22, %rs24; + and.b16 %rs26, %rs25, 3; + or.b16 %rs27, %rs26, %rs21; + and.b16 %rs28, %rs27, 15; + setp.eq.s16 %p73, %rs15, 0; + selp.u16 %rs29, 1, 0, %p73; + shl.b16 %rs30, %rs29, 2; + setp.eq.s16 %p74, %rs16, 0; + selp.u16 %rs31, -1, 0, %p74; + shl.b16 %rs32, %rs31, 3; + or.b16 %rs33, %rs32, %rs30; + setp.eq.s16 %p75, %rs13, 0; + selp.u16 %rs34, 1, 0, %p75; + setp.eq.s16 %p76, %rs14, 0; + selp.u16 %rs35, -1, 0, %p76; + shl.b16 %rs36, %rs35, 1; + or.b16 %rs37, %rs34, %rs36; + and.b16 %rs38, %rs37, 3; + or.b16 %rs39, %rs38, %rs33; + shl.b16 %rs40, %rs39, 4; + or.b16 %rs41, %rs28, %rs40; + .loc 1 42 55 + and.b16 %rs42, %rs41, 255; + setp.eq.s16 %p77, %rs42, 0; + @%p77 bra $L__BB0_3; + mov.u64 %rd102, assertMessage_0; + cvta.global.u64 %rd103, %rd102; + mov.u64 %rd104, assertFile_0; + cvta.global.u64 %rd105, %rd104; + mov.u64 %rd106, assertFunc_0; + cvta.global.u64 %rd107, %rd106; + mov.b32 %r130, 883; + mov.u64 %rd108, 1; + { // callseq 0, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd103; + .param .b64 param1; + st.param.b64 [param1+0], %rd105; + .param .b32 param2; + st.param.b32 [param2+0], %r130; + .param .b64 param3; + st.param.b64 [param3+0], %rd107; + .param .b64 param4; + st.param.b64 [param4+0], %rd108; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 0 +$L__BB0_3: + .loc 1 43 71 + bar.sync 0; + shl.b64 %rd117, %rd27, 1; + add.s64 %rd118, %rd207, %rd117; + add.s64 %rd119, %rd118, -103227878; + st.shared.u64 [%r22], %rd119; + shl.b64 %rd120, %rd28, 1; + add.s64 %rd121, %rd207, %rd120; + add.s64 %rd122, %rd121, -103127364; + st.shared.u64 [%r23], %rd122; + shl.b64 %rd123, %rd29, 1; + add.s64 %rd124, %rd207, %rd123; + add.s64 %rd125, %rd124, -103026850; + st.shared.u64 [%r24], %rd125; + shl.b64 %rd126, %rd30, 1; + add.s64 %rd127, %rd207, %rd126; + add.s64 %rd128, %rd127, -102926336; + st.shared.u64 [%r25], %rd128; + bar.sync 0; + ld.shared.u64 %rd109, [%r26]; + ld.shared.u64 %rd110, [%r27]; + ld.shared.u64 %rd111, [%r28]; + ld.shared.u64 %rd112, [%r29]; + bar.sync 0; + shl.b64 %rd129, %rd31, 1; + add.s64 %rd130, %rd207, %rd129; + add.s64 %rd131, %rd130, -301542; + st.shared.u64 [%r22], %rd131; + shl.b64 %rd132, %rd32, 1; + add.s64 %rd133, %rd207, %rd132; + add.s64 %rd134, %rd133, -201028; + st.shared.u64 [%r23], %rd134; + shl.b64 %rd135, %rd33, 1; + add.s64 %rd136, %rd207, %rd135; + add.s64 %rd137, %rd136, -100514; + st.shared.u64 [%r24], %rd137; + shl.b64 %rd138, %rd34, 1; + add.s64 %rd139, %rd207, %rd138; + st.shared.u64 [%r25], %rd139; + bar.sync 0; + ld.shared.u64 %rd113, [%r26]; + ld.shared.u64 %rd114, [%r27]; + ld.shared.u64 %rd115, [%r28]; + ld.shared.u64 %rd116, [%r29]; + mov.u16 %rs43, 0x0; + @%p78 ld.global.L1::evict_last.b16 { %rs43 }, [ %rd109 + 0 ]; + @!%p78 mov.u16 %rs43, %rs44; + mov.u16 %rs45, 0x0; + @%p78 ld.global.L1::evict_last.b16 { %rs45 }, [ %rd110 + 0 ]; + @!%p78 mov.u16 %rs45, %rs44; + mov.u16 %rs47, 0x0; + @%p78 ld.global.L1::evict_last.b16 { %rs47 }, [ %rd111 + 0 ]; + @!%p78 mov.u16 %rs47, %rs44; + mov.u16 %rs49, 0x0; + @%p78 ld.global.L1::evict_last.b16 { %rs49 }, [ %rd112 + 0 ]; + @!%p78 mov.u16 %rs49, %rs44; + mov.u16 %rs51, 0x0; + @%p78 ld.global.L1::evict_last.b16 { %rs51 }, [ %rd113 + 0 ]; + @!%p78 mov.u16 %rs51, %rs44; + mov.u16 %rs53, 0x0; + @%p78 ld.global.L1::evict_last.b16 { %rs53 }, [ %rd114 + 0 ]; + @!%p78 mov.u16 %rs53, %rs44; + mov.u16 %rs55, 0x0; + @%p90 ld.global.L1::evict_last.b16 { %rs55 }, [ %rd115 + 0 ]; + @!%p90 mov.u16 %rs55, %rs44; + mov.u16 %rs57, 0x0; + @%p90 ld.global.L1::evict_last.b16 { %rs57 }, [ %rd116 + 0 ]; + @!%p90 mov.u16 %rs57, %rs44; + .loc 1 46 23 + setp.lt.f32 %p94, %f17, 0f00800000; + mul.f32 %f96, %f17, 0f4B000000; + selp.f32 %f33, %f96, %f17, %p94; + selp.f32 %f97, 0fC1B80000, 0f00000000, %p94; + mov.b32 %r140, %f33; + add.s32 %r141, %r140, -1059760811; + and.b32 %r142, %r141, -8388608; + sub.s32 %r143, %r140, %r142; + mov.b32 %f98, %r143; + cvt.rn.f32.s32 %f99, %r142; + mov.f32 %f100, 0f34000000; + fma.rn.ftz.f32 %f101, %f99, %f100, %f97; + add.f32 %f102, %f98, 0fBF800000; + mov.f32 %f103, 0f3E1039F6; + mov.f32 %f104, 0fBE055027; + fma.rn.ftz.f32 %f105, %f104, %f102, %f103; + mov.f32 %f106, 0fBDF8CDCC; + fma.rn.ftz.f32 %f107, %f105, %f102, %f106; + mov.f32 %f108, 0f3E0F2955; + fma.rn.ftz.f32 %f109, %f107, %f102, %f108; + mov.f32 %f110, 0fBE2AD8B9; + fma.rn.ftz.f32 %f111, %f109, %f102, %f110; + mov.f32 %f112, 0f3E4CED0B; + fma.rn.ftz.f32 %f113, %f111, %f102, %f112; + mov.f32 %f114, 0fBE7FFF22; + fma.rn.ftz.f32 %f115, %f113, %f102, %f114; + mov.f32 %f116, 0f3EAAAA78; + fma.rn.ftz.f32 %f117, %f115, %f102, %f116; + mov.f32 %f118, 0fBF000000; + fma.rn.ftz.f32 %f119, %f117, %f102, %f118; + mul.f32 %f120, %f102, %f119; + fma.rn.ftz.f32 %f121, %f120, %f102, %f102; + mov.f32 %f122, 0f3F317218; + fma.rn.ftz.f32 %f393, %f101, %f122, %f121; + setp.lt.u32 %p95, %r140, 2139095040; + mov.f32 %f123, 0f7F800000; + @%p95 bra $L__BB0_5; + .loc 1 0 23 + fma.rn.ftz.f32 %f393, %f33, %f123, %f123; +$L__BB0_5: + mov.b32 %f18, %r83; + .loc 1 46 23 + setp.lt.f32 %p97, %f18, 0f00800000; + mul.f32 %f124, %f18, 0f4B000000; + selp.f32 %f38, %f124, %f18, %p97; + selp.f32 %f125, 0fC1B80000, 0f00000000, %p97; + mov.b32 %r144, %f38; + add.s32 %r145, %r144, -1059760811; + and.b32 %r146, %r145, -8388608; + sub.s32 %r147, %r144, %r146; + mov.b32 %f126, %r147; + cvt.rn.f32.s32 %f127, %r146; + fma.rn.ftz.f32 %f129, %f127, %f100, %f125; + add.f32 %f130, %f126, 0fBF800000; + fma.rn.ftz.f32 %f133, %f104, %f130, %f103; + fma.rn.ftz.f32 %f135, %f133, %f130, %f106; + fma.rn.ftz.f32 %f137, %f135, %f130, %f108; + fma.rn.ftz.f32 %f139, %f137, %f130, %f110; + fma.rn.ftz.f32 %f141, %f139, %f130, %f112; + fma.rn.ftz.f32 %f143, %f141, %f130, %f114; + fma.rn.ftz.f32 %f145, %f143, %f130, %f116; + fma.rn.ftz.f32 %f147, %f145, %f130, %f118; + mul.f32 %f148, %f130, %f147; + fma.rn.ftz.f32 %f149, %f148, %f130, %f130; + fma.rn.ftz.f32 %f394, %f129, %f122, %f149; + setp.lt.u32 %p98, %r144, 2139095040; + @%p98 bra $L__BB0_7; + .loc 1 0 23 + fma.rn.ftz.f32 %f394, %f38, %f123, %f123; +$L__BB0_7: + mov.b32 %f19, %r84; + .loc 1 46 23 + setp.lt.f32 %p100, %f19, 0f00800000; + mul.f32 %f152, %f19, 0f4B000000; + selp.f32 %f43, %f152, %f19, %p100; + selp.f32 %f153, 0fC1B80000, 0f00000000, %p100; + mov.b32 %r148, %f43; + add.s32 %r149, %r148, -1059760811; + and.b32 %r150, %r149, -8388608; + sub.s32 %r151, %r148, %r150; + mov.b32 %f154, %r151; + cvt.rn.f32.s32 %f155, %r150; + fma.rn.ftz.f32 %f157, %f155, %f100, %f153; + add.f32 %f158, %f154, 0fBF800000; + fma.rn.ftz.f32 %f161, %f104, %f158, %f103; + fma.rn.ftz.f32 %f163, %f161, %f158, %f106; + fma.rn.ftz.f32 %f165, %f163, %f158, %f108; + fma.rn.ftz.f32 %f167, %f165, %f158, %f110; + fma.rn.ftz.f32 %f169, %f167, %f158, %f112; + fma.rn.ftz.f32 %f171, %f169, %f158, %f114; + fma.rn.ftz.f32 %f173, %f171, %f158, %f116; + fma.rn.ftz.f32 %f175, %f173, %f158, %f118; + mul.f32 %f176, %f158, %f175; + fma.rn.ftz.f32 %f177, %f176, %f158, %f158; + fma.rn.ftz.f32 %f395, %f157, %f122, %f177; + setp.lt.u32 %p101, %r148, 2139095040; + @%p101 bra $L__BB0_9; + .loc 1 0 23 + fma.rn.ftz.f32 %f395, %f43, %f123, %f123; +$L__BB0_9: + mov.b32 %f20, %r85; + .loc 1 46 23 + setp.lt.f32 %p103, %f20, 0f00800000; + mul.f32 %f180, %f20, 0f4B000000; + selp.f32 %f48, %f180, %f20, %p103; + selp.f32 %f181, 0fC1B80000, 0f00000000, %p103; + mov.b32 %r152, %f48; + add.s32 %r153, %r152, -1059760811; + and.b32 %r154, %r153, -8388608; + sub.s32 %r155, %r152, %r154; + mov.b32 %f182, %r155; + cvt.rn.f32.s32 %f183, %r154; + fma.rn.ftz.f32 %f185, %f183, %f100, %f181; + add.f32 %f186, %f182, 0fBF800000; + fma.rn.ftz.f32 %f189, %f104, %f186, %f103; + fma.rn.ftz.f32 %f191, %f189, %f186, %f106; + fma.rn.ftz.f32 %f193, %f191, %f186, %f108; + fma.rn.ftz.f32 %f195, %f193, %f186, %f110; + fma.rn.ftz.f32 %f197, %f195, %f186, %f112; + fma.rn.ftz.f32 %f199, %f197, %f186, %f114; + fma.rn.ftz.f32 %f201, %f199, %f186, %f116; + fma.rn.ftz.f32 %f203, %f201, %f186, %f118; + mul.f32 %f204, %f186, %f203; + fma.rn.ftz.f32 %f205, %f204, %f186, %f186; + fma.rn.ftz.f32 %f396, %f185, %f122, %f205; + setp.lt.u32 %p104, %r152, 2139095040; + @%p104 bra $L__BB0_11; + .loc 1 0 23 + fma.rn.ftz.f32 %f396, %f48, %f123, %f123; +$L__BB0_11: + mov.b32 %f21, %r90; + .loc 1 46 23 + setp.lt.f32 %p106, %f21, 0f00800000; + mul.f32 %f208, %f21, 0f4B000000; + selp.f32 %f53, %f208, %f21, %p106; + selp.f32 %f209, 0fC1B80000, 0f00000000, %p106; + mov.b32 %r156, %f53; + add.s32 %r157, %r156, -1059760811; + and.b32 %r158, %r157, -8388608; + sub.s32 %r159, %r156, %r158; + mov.b32 %f210, %r159; + cvt.rn.f32.s32 %f211, %r158; + fma.rn.ftz.f32 %f213, %f211, %f100, %f209; + add.f32 %f214, %f210, 0fBF800000; + fma.rn.ftz.f32 %f217, %f104, %f214, %f103; + fma.rn.ftz.f32 %f219, %f217, %f214, %f106; + fma.rn.ftz.f32 %f221, %f219, %f214, %f108; + fma.rn.ftz.f32 %f223, %f221, %f214, %f110; + fma.rn.ftz.f32 %f225, %f223, %f214, %f112; + fma.rn.ftz.f32 %f227, %f225, %f214, %f114; + fma.rn.ftz.f32 %f229, %f227, %f214, %f116; + fma.rn.ftz.f32 %f231, %f229, %f214, %f118; + mul.f32 %f232, %f214, %f231; + fma.rn.ftz.f32 %f233, %f232, %f214, %f214; + fma.rn.ftz.f32 %f397, %f213, %f122, %f233; + setp.lt.u32 %p107, %r156, 2139095040; + @%p107 bra $L__BB0_13; + .loc 1 0 23 + fma.rn.ftz.f32 %f397, %f53, %f123, %f123; +$L__BB0_13: + mov.b32 %f22, %r91; + .loc 1 46 23 + setp.lt.f32 %p109, %f22, 0f00800000; + mul.f32 %f236, %f22, 0f4B000000; + selp.f32 %f58, %f236, %f22, %p109; + selp.f32 %f237, 0fC1B80000, 0f00000000, %p109; + mov.b32 %r160, %f58; + add.s32 %r161, %r160, -1059760811; + and.b32 %r162, %r161, -8388608; + sub.s32 %r163, %r160, %r162; + mov.b32 %f238, %r163; + cvt.rn.f32.s32 %f239, %r162; + fma.rn.ftz.f32 %f241, %f239, %f100, %f237; + add.f32 %f242, %f238, 0fBF800000; + fma.rn.ftz.f32 %f245, %f104, %f242, %f103; + fma.rn.ftz.f32 %f247, %f245, %f242, %f106; + fma.rn.ftz.f32 %f249, %f247, %f242, %f108; + fma.rn.ftz.f32 %f251, %f249, %f242, %f110; + fma.rn.ftz.f32 %f253, %f251, %f242, %f112; + fma.rn.ftz.f32 %f255, %f253, %f242, %f114; + fma.rn.ftz.f32 %f257, %f255, %f242, %f116; + fma.rn.ftz.f32 %f259, %f257, %f242, %f118; + mul.f32 %f260, %f242, %f259; + fma.rn.ftz.f32 %f261, %f260, %f242, %f242; + fma.rn.ftz.f32 %f398, %f241, %f122, %f261; + setp.lt.u32 %p110, %r160, 2139095040; + @%p110 bra $L__BB0_15; + .loc 1 0 23 + fma.rn.ftz.f32 %f398, %f58, %f123, %f123; +$L__BB0_15: + setp.eq.f32 %p96, %f33, 0f00000000; + setp.eq.f32 %p99, %f38, 0f00000000; + setp.eq.f32 %p102, %f43, 0f00000000; + setp.eq.f32 %p105, %f48, 0f00000000; + setp.eq.f32 %p108, %f53, 0f00000000; + mov.b32 %f23, %r92; + .loc 1 46 23 + setp.eq.f32 %p111, %f58, 0f00000000; + setp.lt.f32 %p112, %f23, 0f00800000; + mul.f32 %f264, %f23, 0f4B000000; + selp.f32 %f63, %f264, %f23, %p112; + selp.f32 %f265, 0fC1B80000, 0f00000000, %p112; + mov.b32 %r164, %f63; + add.s32 %r165, %r164, -1059760811; + and.b32 %r166, %r165, -8388608; + sub.s32 %r167, %r164, %r166; + mov.b32 %f266, %r167; + cvt.rn.f32.s32 %f267, %r166; + fma.rn.ftz.f32 %f269, %f267, %f100, %f265; + add.f32 %f270, %f266, 0fBF800000; + fma.rn.ftz.f32 %f273, %f104, %f270, %f103; + fma.rn.ftz.f32 %f275, %f273, %f270, %f106; + fma.rn.ftz.f32 %f277, %f275, %f270, %f108; + fma.rn.ftz.f32 %f279, %f277, %f270, %f110; + fma.rn.ftz.f32 %f281, %f279, %f270, %f112; + fma.rn.ftz.f32 %f283, %f281, %f270, %f114; + fma.rn.ftz.f32 %f285, %f283, %f270, %f116; + fma.rn.ftz.f32 %f287, %f285, %f270, %f118; + mul.f32 %f288, %f270, %f287; + fma.rn.ftz.f32 %f289, %f288, %f270, %f270; + fma.rn.ftz.f32 %f399, %f269, %f122, %f289; + setp.lt.u32 %p113, %r164, 2139095040; + @%p113 bra $L__BB0_17; + .loc 1 0 23 + fma.rn.ftz.f32 %f399, %f63, %f123, %f123; +$L__BB0_17: + mov.b32 %f24, %r93; + cvt.f32.bf16 %r131, %rs43; + mov.b32 %f88, %r131; + cvt.f32.bf16 %r132, %rs45; + mov.b32 %f89, %r132; + cvt.f32.bf16 %r133, %rs47; + mov.b32 %f90, %r133; + cvt.f32.bf16 %r134, %rs49; + mov.b32 %f91, %r134; + cvt.f32.bf16 %r135, %rs51; + mov.b32 %f92, %r135; + cvt.f32.bf16 %r136, %rs53; + mov.b32 %f93, %r136; + cvt.f32.bf16 %r137, %rs55; + mov.b32 %f94, %r137; + cvt.f32.bf16 %r138, %rs57; + mov.b32 %f95, %r138; + sub.f32 %f32, %f95, %f16; + sub.f32 %f31, %f94, %f15; + sub.f32 %f30, %f93, %f14; + sub.f32 %f29, %f92, %f13; + sub.f32 %f28, %f91, %f12; + sub.f32 %f27, %f90, %f11; + sub.f32 %f26, %f89, %f10; + sub.f32 %f25, %f88, %f9; + .loc 1 46 23 + selp.f32 %f37, 0fFF800000, %f393, %p96; + selp.f32 %f42, 0fFF800000, %f394, %p99; + selp.f32 %f47, 0fFF800000, %f395, %p102; + selp.f32 %f52, 0fFF800000, %f396, %p105; + selp.f32 %f57, 0fFF800000, %f397, %p108; + selp.f32 %f62, 0fFF800000, %f398, %p111; + setp.eq.f32 %p114, %f63, 0f00000000; + selp.f32 %f67, 0fFF800000, %f399, %p114; + setp.lt.f32 %p115, %f24, 0f00800000; + mul.f32 %f292, %f24, 0f4B000000; + selp.f32 %f68, %f292, %f24, %p115; + selp.f32 %f293, 0fC1B80000, 0f00000000, %p115; + mov.b32 %r168, %f68; + add.s32 %r169, %r168, -1059760811; + and.b32 %r170, %r169, -8388608; + sub.s32 %r171, %r168, %r170; + mov.b32 %f294, %r171; + cvt.rn.f32.s32 %f295, %r170; + fma.rn.ftz.f32 %f297, %f295, %f100, %f293; + add.f32 %f298, %f294, 0fBF800000; + fma.rn.ftz.f32 %f301, %f104, %f298, %f103; + fma.rn.ftz.f32 %f303, %f301, %f298, %f106; + fma.rn.ftz.f32 %f305, %f303, %f298, %f108; + fma.rn.ftz.f32 %f307, %f305, %f298, %f110; + fma.rn.ftz.f32 %f309, %f307, %f298, %f112; + fma.rn.ftz.f32 %f311, %f309, %f298, %f114; + fma.rn.ftz.f32 %f313, %f311, %f298, %f116; + fma.rn.ftz.f32 %f315, %f313, %f298, %f118; + mul.f32 %f316, %f298, %f315; + fma.rn.ftz.f32 %f317, %f316, %f298, %f298; + fma.rn.ftz.f32 %f400, %f297, %f122, %f317; + setp.lt.u32 %p116, %r168, 2139095040; + @%p116 bra $L__BB0_19; + .loc 1 0 23 + fma.rn.ftz.f32 %f400, %f68, %f123, %f123; + bra.uni $L__BB0_19; +$L__BB0_20: + .loc 1 24 33 + bfe.u32 %r191, %r1, 5, 3; + and.b32 %r192, %r1, 31; +$L__tmp1: + .loc 2 243 36 + bar.sync 0; +$L__tmp2: + .loc 2 233 15 + add.f32 %f361, %f385, %f386; + add.f32 %f362, %f387, %f361; + add.f32 %f363, %f388, %f362; + add.f32 %f364, %f389, %f363; + add.f32 %f365, %f390, %f364; + add.f32 %f366, %f391, %f365; + add.f32 %f367, %f392, %f366; +$L__tmp3: + .loc 2 243 36 + mov.b32 %r193, %f367; + shfl.sync.bfly.b32 %r194, %r193, 16, 31, -1; + mov.b32 %f368, %r194; +$L__tmp4: + .loc 2 233 15 + add.f32 %f369, %f367, %f368; +$L__tmp5: + .loc 2 243 36 + mov.b32 %r195, %f369; + shfl.sync.bfly.b32 %r196, %r195, 8, 31, -1; + mov.b32 %f370, %r196; +$L__tmp6: + .loc 2 233 15 + add.f32 %f371, %f369, %f370; +$L__tmp7: + .loc 2 243 36 + mov.b32 %r197, %f371; + shfl.sync.bfly.b32 %r198, %r197, 4, 31, -1; + mov.b32 %f372, %r198; +$L__tmp8: + .loc 2 233 15 + add.f32 %f373, %f371, %f372; +$L__tmp9: + .loc 2 243 36 + mov.b32 %r199, %f373; + shfl.sync.bfly.b32 %r200, %r199, 2, 31, -1; + mov.b32 %f374, %r200; +$L__tmp10: + .loc 2 233 15 + add.f32 %f375, %f373, %f374; +$L__tmp11: + .loc 2 243 36 + mov.b32 %r201, %f375; + shfl.sync.bfly.b32 %r202, %r201, 1, 31, -1; + mov.b32 %f376, %r202; +$L__tmp12: + .loc 2 233 15 + add.f32 %f377, %f375, %f376; +$L__tmp13: + .loc 2 243 36 + setp.eq.s32 %p143, %r192, 0; + shl.b32 %r203, %r191, 2; + add.s32 %r180, %r44, %r203; + mov.b32 %r181, %f377; + @%p143 st.shared.b32 [ %r180 + 0 ], %r181; + bar.sync 0; + setp.lt.s32 %p144, %r1, 8; + shl.b32 %r205, %r1, 2; + add.s32 %r183, %r44, %r205; + @%p144 ld.shared.b32 %r182, [ %r183 + 0 ]; + mov.b32 %f378, %r182; + shfl.sync.bfly.b32 %r206, %r182, 4, 31, -1; + mov.b32 %f379, %r206; +$L__tmp14: + .loc 2 233 15 + add.f32 %f380, %f378, %f379; +$L__tmp15: + .loc 2 243 36 + mov.b32 %r207, %f380; + shfl.sync.bfly.b32 %r208, %r207, 2, 31, -1; + mov.b32 %f381, %r208; +$L__tmp16: + .loc 2 233 15 + add.f32 %f382, %f380, %f381; +$L__tmp17: + .loc 2 243 36 + mov.b32 %r209, %f382; + shfl.sync.bfly.b32 %r210, %r209, 1, 31, -1; + mov.b32 %f383, %r210; +$L__tmp18: + .loc 2 233 15 + add.f32 %f384, %f382, %f383; +$L__tmp19: + .loc 2 243 36 + and.b32 %r211, %r1, 7; + setp.eq.s32 %p152, %r211, 0; + and.pred %p145, %p144, %p152; + mov.b32 %r185, %f384; + @%p145 st.shared.b32 [ %r183 + 0 ], %r185; + bar.sync 0; + ld.shared.u32 %r186, [global_smem]; +$L__tmp20: + .loc 1 59 25 + shl.b64 %rd154, %rd1, 2; + add.s64 %rd148, %rd47, %rd154; + .loc 1 59 37 + setp.eq.s32 %p153, %r2, 0; + and.pred %p146, %p153, %p78; + @%p146 st.global.b32 [ %rd148 + 0 ], { %r186 }; +$L__tmp21: + .loc 2 243 36 + bar.sync 0; +$L__tmp22: + .loc 2 233 15 + add.s64 %rd155, %rd209, %rd210; + add.s64 %rd156, %rd155, %rd211; + add.s64 %rd157, %rd156, %rd212; + add.s64 %rd158, %rd157, %rd213; + add.s64 %rd159, %rd158, %rd214; + add.s64 %rd160, %rd159, %rd215; + add.s64 %rd161, %rd160, %rd216; +$L__tmp23: + .loc 2 243 36 + cvt.u32.u64 %r212, %rd161; + shfl.sync.bfly.b32 %r213, %r212, 16, 31, -1; + { .reg .b32 tmp; mov.b64 {tmp, %r214}, %rd161; } + shfl.sync.bfly.b32 %r215, %r214, 16, 31, -1; + cvt.u64.u32 %rd162, %r213; + cvt.u64.u32 %rd163, %r215; + shl.b64 %rd164, %rd163, 32; + or.b64 %rd165, %rd162, %rd164; +$L__tmp24: + .loc 2 233 15 + add.s64 %rd166, %rd161, %rd165; +$L__tmp25: + .loc 2 243 36 + cvt.u32.u64 %r216, %rd166; + shfl.sync.bfly.b32 %r217, %r216, 8, 31, -1; + { .reg .b32 tmp; mov.b64 {tmp, %r218}, %rd166; } + shfl.sync.bfly.b32 %r219, %r218, 8, 31, -1; + cvt.u64.u32 %rd167, %r217; + cvt.u64.u32 %rd168, %r219; + shl.b64 %rd169, %rd168, 32; + or.b64 %rd170, %rd167, %rd169; +$L__tmp26: + .loc 2 233 15 + add.s64 %rd171, %rd166, %rd170; +$L__tmp27: + .loc 2 243 36 + cvt.u32.u64 %r220, %rd171; + shfl.sync.bfly.b32 %r221, %r220, 4, 31, -1; + { .reg .b32 tmp; mov.b64 {tmp, %r222}, %rd171; } + shfl.sync.bfly.b32 %r223, %r222, 4, 31, -1; + cvt.u64.u32 %rd172, %r221; + cvt.u64.u32 %rd173, %r223; + shl.b64 %rd174, %rd173, 32; + or.b64 %rd175, %rd172, %rd174; +$L__tmp28: + .loc 2 233 15 + add.s64 %rd176, %rd171, %rd175; +$L__tmp29: + .loc 2 243 36 + cvt.u32.u64 %r224, %rd176; + shfl.sync.bfly.b32 %r225, %r224, 2, 31, -1; + { .reg .b32 tmp; mov.b64 {tmp, %r226}, %rd176; } + shfl.sync.bfly.b32 %r227, %r226, 2, 31, -1; + cvt.u64.u32 %rd177, %r225; + cvt.u64.u32 %rd178, %r227; + shl.b64 %rd179, %rd178, 32; + or.b64 %rd180, %rd177, %rd179; +$L__tmp30: + .loc 2 233 15 + add.s64 %rd181, %rd176, %rd180; +$L__tmp31: + .loc 2 243 36 + cvt.u32.u64 %r228, %rd181; + shfl.sync.bfly.b32 %r229, %r228, 1, 31, -1; + { .reg .b32 tmp; mov.b64 {tmp, %r230}, %rd181; } + shfl.sync.bfly.b32 %r231, %r230, 1, 31, -1; + cvt.u64.u32 %rd182, %r229; + cvt.u64.u32 %rd183, %r231; + shl.b64 %rd184, %rd183, 32; + or.b64 %rd185, %rd182, %rd184; +$L__tmp32: + .loc 2 233 15 + add.s64 %rd149, %rd181, %rd185; +$L__tmp33: + .loc 2 243 36 + shl.b32 %r232, %r191, 3; + add.s32 %r187, %r44, %r232; + @%p143 st.shared.b64 [ %r187 + 0 ], %rd149; + bar.sync 0; + shl.b32 %r233, %r1, 3; + add.s32 %r188, %r44, %r233; + @%p144 ld.shared.b64 %rd150, [ %r188 + 0 ]; + cvt.u32.u64 %r234, %rd150; + shfl.sync.bfly.b32 %r235, %r234, 4, 31, -1; + { .reg .b32 tmp; mov.b64 {tmp, %r236}, %rd150; } + shfl.sync.bfly.b32 %r237, %r236, 4, 31, -1; + cvt.u64.u32 %rd186, %r235; + cvt.u64.u32 %rd187, %r237; + shl.b64 %rd188, %rd187, 32; + or.b64 %rd189, %rd186, %rd188; +$L__tmp34: + .loc 2 233 15 + add.s64 %rd190, %rd150, %rd189; +$L__tmp35: + .loc 2 243 36 + cvt.u32.u64 %r238, %rd190; + shfl.sync.bfly.b32 %r239, %r238, 2, 31, -1; + { .reg .b32 tmp; mov.b64 {tmp, %r240}, %rd190; } + shfl.sync.bfly.b32 %r241, %r240, 2, 31, -1; + cvt.u64.u32 %rd191, %r239; + cvt.u64.u32 %rd192, %r241; + shl.b64 %rd193, %rd192, 32; + or.b64 %rd194, %rd191, %rd193; +$L__tmp36: + .loc 2 233 15 + add.s64 %rd195, %rd190, %rd194; +$L__tmp37: + .loc 2 243 36 + cvt.u32.u64 %r242, %rd195; + shfl.sync.bfly.b32 %r243, %r242, 1, 31, -1; + { .reg .b32 tmp; mov.b64 {tmp, %r244}, %rd195; } + shfl.sync.bfly.b32 %r245, %r244, 1, 31, -1; + cvt.u64.u32 %rd196, %r243; + cvt.u64.u32 %rd197, %r245; + shl.b64 %rd198, %rd197, 32; + or.b64 %rd199, %rd196, %rd198; +$L__tmp38: + .loc 2 233 15 + add.s64 %rd151, %rd195, %rd199; +$L__tmp39: + .loc 2 243 36 + @%p145 st.shared.b64 [ %r188 + 0 ], %rd151; + bar.sync 0; + ld.shared.u32 %rd200, [global_smem+4]; + shl.b64 %rd201, %rd200, 32; + ld.shared.u32 %rd202, [global_smem]; + or.b64 %rd203, %rd201, %rd202; +$L__tmp40: + .loc 1 60 30 + bar.sync 0; + st.shared.u64 [global_smem], %rd203; + bar.sync 0; + ld.shared.u64 %rd152, [global_smem]; + .loc 1 61 25 + shl.b64 %rd204, %rd1, 3; + add.s64 %rd153, %rd48, %rd204; + .loc 1 61 37 + @%p146 st.global.b64 [ %rd153 + 0 ], { %rd152 }; + .loc 1 61 4 + ret; +$L__tmp41: +$L__func_end0: + +} + // .globl __nv_logf +.visible .func (.param .b32 func_retval0) __nv_logf( + .param .b32 __nv_logf_param_0 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<5>; + .reg .f32 %f<36>; +$L__func_begin1: + + ld.param.f32 %f5, [__nv_logf_param_0]; + setp.lt.f32 %p1, %f5, 0f00800000; + mul.f32 %f6, %f5, 0f4B000000; + selp.f32 %f1, %f6, %f5, %p1; + selp.f32 %f7, 0fC1B80000, 0f00000000, %p1; + mov.b32 %r1, %f1; + add.s32 %r2, %r1, -1059760811; + and.b32 %r3, %r2, -8388608; + sub.s32 %r4, %r1, %r3; + mov.b32 %f8, %r4; + cvt.rn.f32.s32 %f9, %r3; + mov.f32 %f10, 0f34000000; + fma.rn.ftz.f32 %f11, %f9, %f10, %f7; + add.f32 %f12, %f8, 0fBF800000; + mov.f32 %f13, 0f3E1039F6; + mov.f32 %f14, 0fBE055027; + fma.rn.ftz.f32 %f15, %f14, %f12, %f13; + mov.f32 %f16, 0fBDF8CDCC; + fma.rn.ftz.f32 %f17, %f15, %f12, %f16; + mov.f32 %f18, 0f3E0F2955; + fma.rn.ftz.f32 %f19, %f17, %f12, %f18; + mov.f32 %f20, 0fBE2AD8B9; + fma.rn.ftz.f32 %f21, %f19, %f12, %f20; + mov.f32 %f22, 0f3E4CED0B; + fma.rn.ftz.f32 %f23, %f21, %f12, %f22; + mov.f32 %f24, 0fBE7FFF22; + fma.rn.ftz.f32 %f25, %f23, %f12, %f24; + mov.f32 %f26, 0f3EAAAA78; + fma.rn.ftz.f32 %f27, %f25, %f12, %f26; + mov.f32 %f28, 0fBF000000; + fma.rn.ftz.f32 %f29, %f27, %f12, %f28; + mul.f32 %f30, %f12, %f29; + fma.rn.ftz.f32 %f31, %f30, %f12, %f12; + mov.f32 %f32, 0f3F317218; + fma.rn.ftz.f32 %f35, %f11, %f32, %f31; + setp.lt.u32 %p2, %r1, 2139095040; + @%p2 bra $L__BB1_2; + mov.f32 %f33, 0f7F800000; + fma.rn.ftz.f32 %f35, %f1, %f33, %f33; +$L__BB1_2: + setp.eq.f32 %p3, %f1, 0f00000000; + selp.f32 %f34, 0fFF800000, %f35, %p3; + st.param.f32 [func_retval0+0], %f34; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/ns/cnshxlw3p7kytog7ihat33cfh5n4z4tq3l77zyi5jxajo5uonq7m.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 349 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 110 +.b8 115 +.b8 104 +.b8 120 +.b8 108 +.b8 119 +.b8 51 +.b8 112 +.b8 55 +.b8 107 +.b8 121 +.b8 116 +.b8 111 +.b8 103 +.b8 55 +.b8 105 +.b8 104 +.b8 97 +.b8 116 +.b8 51 +.b8 51 +.b8 99 +.b8 102 +.b8 104 +.b8 53 +.b8 110 +.b8 52 +.b8 122 +.b8 52 +.b8 116 +.b8 113 +.b8 51 +.b8 108 +.b8 55 +.b8 55 +.b8 122 +.b8 121 +.b8 105 +.b8 53 +.b8 106 +.b8 120 +.b8 97 +.b8 106 +.b8 111 +.b8 53 +.b8 117 +.b8 111 +.b8 110 +.b8 113 +.b8 55 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 110 +.b8 115 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp20 +.b8 2 +.b8 58 +.b8 27 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp19 +.b8 2 +.b8 58 +.b8 27 +.b8 4 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp19 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 4 +.b32 125 +.b64 $L__tmp21 +.b64 $L__tmp40 +.b8 2 +.b8 60 +.b8 27 +.b8 5 +.b32 125 +.b64 $L__tmp22 +.b64 $L__tmp39 +.b8 2 +.b8 60 +.b8 27 +.b8 4 +.b32 125 +.b64 $L__tmp22 +.b64 $L__tmp39 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 353 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 353 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.llir b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..65b6595e24a96c640e3f1cee10ccfe43fb9a17cd --- /dev/null +++ b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.llir @@ -0,0 +1,949 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !7 { + %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %5 = shl i32 %4, 3, !dbg !10 + %6 = and i32 %5, 1016, !dbg !10 + %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11 + %8 = shl i32 %7, 10, !dbg !12 + %9 = or i32 %8, %6, !dbg !13 + %10 = sext i32 %9 to i64, !dbg !14 + %11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !14 + %12 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #4, !dbg !15 + %13 = extractvalue { i32, i32, i32, i32 } %12, 0, !dbg !15 + %14 = extractvalue { i32, i32, i32, i32 } %12, 1, !dbg !15 + %15 = extractvalue { i32, i32, i32, i32 } %12, 2, !dbg !15 + %16 = extractvalue { i32, i32, i32, i32 } %12, 3, !dbg !15 + %17 = trunc i32 %13 to i16, !dbg !15 + %extelt.offset = lshr i32 %13, 16, !dbg !15 + %18 = trunc i32 %extelt.offset to i16, !dbg !15 + %19 = trunc i32 %14 to i16, !dbg !15 + %extelt.offset1 = lshr i32 %14, 16, !dbg !15 + %20 = trunc i32 %extelt.offset1 to i16, !dbg !15 + %21 = trunc i32 %15 to i16, !dbg !15 + %extelt.offset2 = lshr i32 %15, 16, !dbg !15 + %22 = trunc i32 %extelt.offset2 to i16, !dbg !15 + %23 = trunc i32 %16 to i16, !dbg !15 + %extelt.offset3 = lshr i32 %16, 16, !dbg !15 + %24 = trunc i32 %extelt.offset3 to i16, !dbg !15 + %25 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %17) #4, !dbg !16 + %26 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %18) #4, !dbg !16 + %27 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %19) #4, !dbg !16 + %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %20) #4, !dbg !16 + %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %21) #4, !dbg !16 + %30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %22) #4, !dbg !16 + %31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %23) #4, !dbg !16 + %32 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #4, !dbg !16 + %33 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !17 + %34 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %33, i1 true) #4, !dbg !18 + %35 = extractvalue { i32, i32, i32, i32 } %34, 0, !dbg !18 + %36 = extractvalue { i32, i32, i32, i32 } %34, 1, !dbg !18 + %37 = extractvalue { i32, i32, i32, i32 } %34, 2, !dbg !18 + %38 = extractvalue { i32, i32, i32, i32 } %34, 3, !dbg !18 + %39 = trunc i32 %35 to i16, !dbg !18 + %extelt.offset4 = lshr i32 %35, 16, !dbg !18 + %40 = trunc i32 %extelt.offset4 to i16, !dbg !18 + %41 = trunc i32 %36 to i16, !dbg !18 + %extelt.offset5 = lshr i32 %36, 16, !dbg !18 + %42 = trunc i32 %extelt.offset5 to i16, !dbg !18 + %43 = trunc i32 %37 to i16, !dbg !18 + %extelt.offset6 = lshr i32 %37, 16, !dbg !18 + %44 = trunc i32 %extelt.offset6 to i16, !dbg !18 + %45 = trunc i32 %38 to i16, !dbg !18 + %extelt.offset7 = lshr i32 %38, 16, !dbg !18 + %46 = trunc i32 %extelt.offset7 to i16, !dbg !18 + %47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %39) #4, !dbg !19 + %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %40) #4, !dbg !19 + %49 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %41) #4, !dbg !19 + %50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %42) #4, !dbg !19 + %51 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #4, !dbg !19 + %52 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #4, !dbg !19 + %53 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #4, !dbg !19 + %54 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #4, !dbg !19 + %55 = fmul float %47, 0x3FE6A09E60000000, !dbg !20 + %56 = fmul float %48, 0x3FE6A09E60000000, !dbg !20 + %57 = fmul float %49, 0x3FE6A09E60000000, !dbg !20 + %58 = fmul float %50, 0x3FE6A09E60000000, !dbg !20 + %59 = fmul float %51, 0x3FE6A09E60000000, !dbg !20 + %60 = fmul float %52, 0x3FE6A09E60000000, !dbg !20 + %61 = fmul float %53, 0x3FE6A09E60000000, !dbg !20 + %62 = fmul float %54, 0x3FE6A09E60000000, !dbg !20 + %63 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not.i = icmp eq i32 %63, 0, !dbg !21 + %64 = tail call float @llvm.nvvm.fabs.ftz.f(float %55) #4, !dbg !21 + %65 = tail call float @llvm.nvvm.fabs.f(float %55) #4, !dbg !21 + %.0.i = select i1 %.not.i, float %65, float %64, !dbg !21 + %66 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !21 + br i1 %66, label %__nv_fabsf.exit1.i, label %68, !dbg !21 + +__nv_fabsf.exit1.i: ; preds = %3 + %67 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not1.i = icmp eq i32 %67, 0, !dbg !21 + %.01.i = select i1 %.not1.i, float %65, float %64, !dbg !21 + br label %__internal_fmad.exit.i, !dbg !21 + +68: ; preds = %3 + %69 = fmul float %55, %55, !dbg !21 + br label %__internal_fmad.exit.i, !dbg !21 + +__internal_fmad.exit.i: ; preds = %68, %__nv_fabsf.exit1.i + %70 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %68 ], !dbg !21 + %71 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %68 ], !dbg !21 + %72 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %68 ], !dbg !21 + %73 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %68 ], !dbg !21 + %74 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %68 ], !dbg !21 + %75 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %68 ], !dbg !21 + %76 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %68 ], !dbg !21 + %77 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %69, %68 ], !dbg !21 + %78 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not2.i = icmp eq i32 %78, 0, !dbg !21 + %79 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %76, float %77, float %75) #4, !dbg !21 + %80 = tail call float @llvm.nvvm.fma.rn.f(float %76, float %77, float %75) #4, !dbg !21 + %.02.i = select i1 %.not2.i, float %80, float %79, !dbg !21 + %81 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not3.i = icmp eq i32 %81, 0, !dbg !21 + %82 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %77, float %74) #4, !dbg !21 + %83 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %77, float %74) #4, !dbg !21 + %.03.i = select i1 %.not3.i, float %83, float %82, !dbg !21 + %84 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not4.i = icmp eq i32 %84, 0, !dbg !21 + %85 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %77, float %73) #4, !dbg !21 + %86 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %77, float %73) #4, !dbg !21 + %.04.i = select i1 %.not4.i, float %86, float %85, !dbg !21 + %87 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not5.i = icmp eq i32 %87, 0, !dbg !21 + %88 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %77, float %72) #4, !dbg !21 + %89 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %77, float %72) #4, !dbg !21 + %.05.i = select i1 %.not5.i, float %89, float %88, !dbg !21 + %90 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not6.i = icmp eq i32 %90, 0, !dbg !21 + %91 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %77, float %71) #4, !dbg !21 + %92 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %77, float %71) #4, !dbg !21 + %.06.i = select i1 %.not6.i, float %92, float %91, !dbg !21 + %93 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not7.i = icmp eq i32 %93, 0, !dbg !21 + %94 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %77, float %70) #4, !dbg !21 + %95 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %77, float %70) #4, !dbg !21 + %.07.i = select i1 %.not7.i, float %95, float %94, !dbg !21 + %96 = fneg float %77, !dbg !21 + %97 = select i1 %66, float %96, float %55, !dbg !21 + %98 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not8.i = icmp eq i32 %98, 0, !dbg !21 + %99 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %97, float %97) #4, !dbg !21 + %100 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %97, float %97) #4, !dbg !21 + %.08.i = select i1 %.not8.i, float %100, float %99, !dbg !21 + br i1 %66, label %101, label %__nv_erff.exit, !dbg !21 + +101: ; preds = %__internal_fmad.exit.i + %102 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !21 + %103 = fsub float 1.000000e+00, %102, !dbg !21 + %104 = bitcast float %103 to i32, !dbg !21 + %105 = bitcast float %55 to i32, !dbg !21 + %106 = and i32 %105, -2147483648, !dbg !21 + %107 = or i32 %106, %104, !dbg !21 + %108 = bitcast i32 %107 to float, !dbg !21 + br label %__nv_erff.exit, !dbg !21 + +__nv_erff.exit: ; preds = %__internal_fmad.exit.i, %101 + %r.0.i = phi float [ %108, %101 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !21 + %109 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not.i8 = icmp eq i32 %109, 0, !dbg !21 + %110 = tail call float @llvm.nvvm.fabs.ftz.f(float %56) #4, !dbg !21 + %111 = tail call float @llvm.nvvm.fabs.f(float %56) #4, !dbg !21 + %.0.i9 = select i1 %.not.i8, float %111, float %110, !dbg !21 + %112 = fcmp oge float %.0.i9, 0x3FF00C1FC0000000, !dbg !21 + br i1 %112, label %__nv_fabsf.exit1.i26, label %114, !dbg !21 + +__nv_fabsf.exit1.i26: ; preds = %__nv_erff.exit + %113 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not1.i27 = icmp eq i32 %113, 0, !dbg !21 + %.01.i28 = select i1 %.not1.i27, float %111, float %110, !dbg !21 + br label %__internal_fmad.exit.i10, !dbg !21 + +114: ; preds = %__nv_erff.exit + %115 = fmul float %56, %56, !dbg !21 + br label %__internal_fmad.exit.i10, !dbg !21 + +__internal_fmad.exit.i10: ; preds = %114, %__nv_fabsf.exit1.i26 + %116 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i26 ], [ 0x3FC06EBA60000000, %114 ], !dbg !21 + %117 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i26 ], [ 0xBFD8127580000000, %114 ], !dbg !21 + %118 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i26 ], [ 0x3FBCE315E0000000, %114 ], !dbg !21 + %119 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i26 ], [ 0xBF9B837CE0000000, %114 ], !dbg !21 + %120 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i26 ], [ 0x3F755ABD40000000, %114 ], !dbg !21 + %121 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i26 ], [ 0xBF4AE9A400000000, %114 ], !dbg !21 + %122 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i26 ], [ 0x3F163D2D40000000, %114 ], !dbg !21 + %123 = phi float [ %.01.i28, %__nv_fabsf.exit1.i26 ], [ %115, %114 ], !dbg !21 + %124 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not2.i11 = icmp eq i32 %124, 0, !dbg !21 + %125 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %122, float %123, float %121) #4, !dbg !21 + %126 = tail call float @llvm.nvvm.fma.rn.f(float %122, float %123, float %121) #4, !dbg !21 + %.02.i12 = select i1 %.not2.i11, float %126, float %125, !dbg !21 + %127 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not3.i13 = icmp eq i32 %127, 0, !dbg !21 + %128 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i12, float %123, float %120) #4, !dbg !21 + %129 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i12, float %123, float %120) #4, !dbg !21 + %.03.i14 = select i1 %.not3.i13, float %129, float %128, !dbg !21 + %130 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not4.i15 = icmp eq i32 %130, 0, !dbg !21 + %131 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i14, float %123, float %119) #4, !dbg !21 + %132 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i14, float %123, float %119) #4, !dbg !21 + %.04.i16 = select i1 %.not4.i15, float %132, float %131, !dbg !21 + %133 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not5.i17 = icmp eq i32 %133, 0, !dbg !21 + %134 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i16, float %123, float %118) #4, !dbg !21 + %135 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i16, float %123, float %118) #4, !dbg !21 + %.05.i18 = select i1 %.not5.i17, float %135, float %134, !dbg !21 + %136 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not6.i19 = icmp eq i32 %136, 0, !dbg !21 + %137 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i18, float %123, float %117) #4, !dbg !21 + %138 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i18, float %123, float %117) #4, !dbg !21 + %.06.i20 = select i1 %.not6.i19, float %138, float %137, !dbg !21 + %139 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not7.i21 = icmp eq i32 %139, 0, !dbg !21 + %140 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i20, float %123, float %116) #4, !dbg !21 + %141 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i20, float %123, float %116) #4, !dbg !21 + %.07.i22 = select i1 %.not7.i21, float %141, float %140, !dbg !21 + %142 = fneg float %123, !dbg !21 + %143 = select i1 %112, float %142, float %56, !dbg !21 + %144 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not8.i23 = icmp eq i32 %144, 0, !dbg !21 + %145 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i22, float %143, float %143) #4, !dbg !21 + %146 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i22, float %143, float %143) #4, !dbg !21 + %.08.i24 = select i1 %.not8.i23, float %146, float %145, !dbg !21 + br i1 %112, label %147, label %__nv_erff.exit29, !dbg !21 + +147: ; preds = %__internal_fmad.exit.i10 + %148 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i24) #4, !dbg !21 + %149 = fsub float 1.000000e+00, %148, !dbg !21 + %150 = bitcast float %149 to i32, !dbg !21 + %151 = bitcast float %56 to i32, !dbg !21 + %152 = and i32 %151, -2147483648, !dbg !21 + %153 = or i32 %152, %150, !dbg !21 + %154 = bitcast i32 %153 to float, !dbg !21 + br label %__nv_erff.exit29, !dbg !21 + +__nv_erff.exit29: ; preds = %__internal_fmad.exit.i10, %147 + %r.0.i25 = phi float [ %154, %147 ], [ %.08.i24, %__internal_fmad.exit.i10 ], !dbg !21 + %155 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not.i30 = icmp eq i32 %155, 0, !dbg !21 + %156 = tail call float @llvm.nvvm.fabs.ftz.f(float %57) #4, !dbg !21 + %157 = tail call float @llvm.nvvm.fabs.f(float %57) #4, !dbg !21 + %.0.i31 = select i1 %.not.i30, float %157, float %156, !dbg !21 + %158 = fcmp oge float %.0.i31, 0x3FF00C1FC0000000, !dbg !21 + br i1 %158, label %__nv_fabsf.exit1.i48, label %160, !dbg !21 + +__nv_fabsf.exit1.i48: ; preds = %__nv_erff.exit29 + %159 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not1.i49 = icmp eq i32 %159, 0, !dbg !21 + %.01.i50 = select i1 %.not1.i49, float %157, float %156, !dbg !21 + br label %__internal_fmad.exit.i32, !dbg !21 + +160: ; preds = %__nv_erff.exit29 + %161 = fmul float %57, %57, !dbg !21 + br label %__internal_fmad.exit.i32, !dbg !21 + +__internal_fmad.exit.i32: ; preds = %160, %__nv_fabsf.exit1.i48 + %162 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i48 ], [ 0x3FC06EBA60000000, %160 ], !dbg !21 + %163 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i48 ], [ 0xBFD8127580000000, %160 ], !dbg !21 + %164 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i48 ], [ 0x3FBCE315E0000000, %160 ], !dbg !21 + %165 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i48 ], [ 0xBF9B837CE0000000, %160 ], !dbg !21 + %166 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i48 ], [ 0x3F755ABD40000000, %160 ], !dbg !21 + %167 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i48 ], [ 0xBF4AE9A400000000, %160 ], !dbg !21 + %168 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i48 ], [ 0x3F163D2D40000000, %160 ], !dbg !21 + %169 = phi float [ %.01.i50, %__nv_fabsf.exit1.i48 ], [ %161, %160 ], !dbg !21 + %170 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not2.i33 = icmp eq i32 %170, 0, !dbg !21 + %171 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %168, float %169, float %167) #4, !dbg !21 + %172 = tail call float @llvm.nvvm.fma.rn.f(float %168, float %169, float %167) #4, !dbg !21 + %.02.i34 = select i1 %.not2.i33, float %172, float %171, !dbg !21 + %173 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not3.i35 = icmp eq i32 %173, 0, !dbg !21 + %174 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i34, float %169, float %166) #4, !dbg !21 + %175 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i34, float %169, float %166) #4, !dbg !21 + %.03.i36 = select i1 %.not3.i35, float %175, float %174, !dbg !21 + %176 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not4.i37 = icmp eq i32 %176, 0, !dbg !21 + %177 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i36, float %169, float %165) #4, !dbg !21 + %178 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i36, float %169, float %165) #4, !dbg !21 + %.04.i38 = select i1 %.not4.i37, float %178, float %177, !dbg !21 + %179 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not5.i39 = icmp eq i32 %179, 0, !dbg !21 + %180 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i38, float %169, float %164) #4, !dbg !21 + %181 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i38, float %169, float %164) #4, !dbg !21 + %.05.i40 = select i1 %.not5.i39, float %181, float %180, !dbg !21 + %182 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not6.i41 = icmp eq i32 %182, 0, !dbg !21 + %183 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i40, float %169, float %163) #4, !dbg !21 + %184 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i40, float %169, float %163) #4, !dbg !21 + %.06.i42 = select i1 %.not6.i41, float %184, float %183, !dbg !21 + %185 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not7.i43 = icmp eq i32 %185, 0, !dbg !21 + %186 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i42, float %169, float %162) #4, !dbg !21 + %187 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i42, float %169, float %162) #4, !dbg !21 + %.07.i44 = select i1 %.not7.i43, float %187, float %186, !dbg !21 + %188 = fneg float %169, !dbg !21 + %189 = select i1 %158, float %188, float %57, !dbg !21 + %190 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not8.i45 = icmp eq i32 %190, 0, !dbg !21 + %191 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i44, float %189, float %189) #4, !dbg !21 + %192 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i44, float %189, float %189) #4, !dbg !21 + %.08.i46 = select i1 %.not8.i45, float %192, float %191, !dbg !21 + br i1 %158, label %193, label %__nv_erff.exit51, !dbg !21 + +193: ; preds = %__internal_fmad.exit.i32 + %194 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i46) #4, !dbg !21 + %195 = fsub float 1.000000e+00, %194, !dbg !21 + %196 = bitcast float %195 to i32, !dbg !21 + %197 = bitcast float %57 to i32, !dbg !21 + %198 = and i32 %197, -2147483648, !dbg !21 + %199 = or i32 %198, %196, !dbg !21 + %200 = bitcast i32 %199 to float, !dbg !21 + br label %__nv_erff.exit51, !dbg !21 + +__nv_erff.exit51: ; preds = %__internal_fmad.exit.i32, %193 + %r.0.i47 = phi float [ %200, %193 ], [ %.08.i46, %__internal_fmad.exit.i32 ], !dbg !21 + %201 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not.i52 = icmp eq i32 %201, 0, !dbg !21 + %202 = tail call float @llvm.nvvm.fabs.ftz.f(float %58) #4, !dbg !21 + %203 = tail call float @llvm.nvvm.fabs.f(float %58) #4, !dbg !21 + %.0.i53 = select i1 %.not.i52, float %203, float %202, !dbg !21 + %204 = fcmp oge float %.0.i53, 0x3FF00C1FC0000000, !dbg !21 + br i1 %204, label %__nv_fabsf.exit1.i70, label %206, !dbg !21 + +__nv_fabsf.exit1.i70: ; preds = %__nv_erff.exit51 + %205 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not1.i71 = icmp eq i32 %205, 0, !dbg !21 + %.01.i72 = select i1 %.not1.i71, float %203, float %202, !dbg !21 + br label %__internal_fmad.exit.i54, !dbg !21 + +206: ; preds = %__nv_erff.exit51 + %207 = fmul float %58, %58, !dbg !21 + br label %__internal_fmad.exit.i54, !dbg !21 + +__internal_fmad.exit.i54: ; preds = %206, %__nv_fabsf.exit1.i70 + %208 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i70 ], [ 0x3FC06EBA60000000, %206 ], !dbg !21 + %209 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i70 ], [ 0xBFD8127580000000, %206 ], !dbg !21 + %210 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i70 ], [ 0x3FBCE315E0000000, %206 ], !dbg !21 + %211 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i70 ], [ 0xBF9B837CE0000000, %206 ], !dbg !21 + %212 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i70 ], [ 0x3F755ABD40000000, %206 ], !dbg !21 + %213 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i70 ], [ 0xBF4AE9A400000000, %206 ], !dbg !21 + %214 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i70 ], [ 0x3F163D2D40000000, %206 ], !dbg !21 + %215 = phi float [ %.01.i72, %__nv_fabsf.exit1.i70 ], [ %207, %206 ], !dbg !21 + %216 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not2.i55 = icmp eq i32 %216, 0, !dbg !21 + %217 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %214, float %215, float %213) #4, !dbg !21 + %218 = tail call float @llvm.nvvm.fma.rn.f(float %214, float %215, float %213) #4, !dbg !21 + %.02.i56 = select i1 %.not2.i55, float %218, float %217, !dbg !21 + %219 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not3.i57 = icmp eq i32 %219, 0, !dbg !21 + %220 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i56, float %215, float %212) #4, !dbg !21 + %221 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i56, float %215, float %212) #4, !dbg !21 + %.03.i58 = select i1 %.not3.i57, float %221, float %220, !dbg !21 + %222 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not4.i59 = icmp eq i32 %222, 0, !dbg !21 + %223 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i58, float %215, float %211) #4, !dbg !21 + %224 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i58, float %215, float %211) #4, !dbg !21 + %.04.i60 = select i1 %.not4.i59, float %224, float %223, !dbg !21 + %225 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not5.i61 = icmp eq i32 %225, 0, !dbg !21 + %226 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i60, float %215, float %210) #4, !dbg !21 + %227 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i60, float %215, float %210) #4, !dbg !21 + %.05.i62 = select i1 %.not5.i61, float %227, float %226, !dbg !21 + %228 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not6.i63 = icmp eq i32 %228, 0, !dbg !21 + %229 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i62, float %215, float %209) #4, !dbg !21 + %230 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i62, float %215, float %209) #4, !dbg !21 + %.06.i64 = select i1 %.not6.i63, float %230, float %229, !dbg !21 + %231 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not7.i65 = icmp eq i32 %231, 0, !dbg !21 + %232 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i64, float %215, float %208) #4, !dbg !21 + %233 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i64, float %215, float %208) #4, !dbg !21 + %.07.i66 = select i1 %.not7.i65, float %233, float %232, !dbg !21 + %234 = fneg float %215, !dbg !21 + %235 = select i1 %204, float %234, float %58, !dbg !21 + %236 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not8.i67 = icmp eq i32 %236, 0, !dbg !21 + %237 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i66, float %235, float %235) #4, !dbg !21 + %238 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i66, float %235, float %235) #4, !dbg !21 + %.08.i68 = select i1 %.not8.i67, float %238, float %237, !dbg !21 + br i1 %204, label %239, label %__nv_erff.exit73, !dbg !21 + +239: ; preds = %__internal_fmad.exit.i54 + %240 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i68) #4, !dbg !21 + %241 = fsub float 1.000000e+00, %240, !dbg !21 + %242 = bitcast float %241 to i32, !dbg !21 + %243 = bitcast float %58 to i32, !dbg !21 + %244 = and i32 %243, -2147483648, !dbg !21 + %245 = or i32 %244, %242, !dbg !21 + %246 = bitcast i32 %245 to float, !dbg !21 + br label %__nv_erff.exit73, !dbg !21 + +__nv_erff.exit73: ; preds = %__internal_fmad.exit.i54, %239 + %r.0.i69 = phi float [ %246, %239 ], [ %.08.i68, %__internal_fmad.exit.i54 ], !dbg !21 + %247 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not.i74 = icmp eq i32 %247, 0, !dbg !21 + %248 = tail call float @llvm.nvvm.fabs.ftz.f(float %59) #4, !dbg !21 + %249 = tail call float @llvm.nvvm.fabs.f(float %59) #4, !dbg !21 + %.0.i75 = select i1 %.not.i74, float %249, float %248, !dbg !21 + %250 = fcmp oge float %.0.i75, 0x3FF00C1FC0000000, !dbg !21 + br i1 %250, label %__nv_fabsf.exit1.i92, label %252, !dbg !21 + +__nv_fabsf.exit1.i92: ; preds = %__nv_erff.exit73 + %251 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not1.i93 = icmp eq i32 %251, 0, !dbg !21 + %.01.i94 = select i1 %.not1.i93, float %249, float %248, !dbg !21 + br label %__internal_fmad.exit.i76, !dbg !21 + +252: ; preds = %__nv_erff.exit73 + %253 = fmul float %59, %59, !dbg !21 + br label %__internal_fmad.exit.i76, !dbg !21 + +__internal_fmad.exit.i76: ; preds = %252, %__nv_fabsf.exit1.i92 + %254 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i92 ], [ 0x3FC06EBA60000000, %252 ], !dbg !21 + %255 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i92 ], [ 0xBFD8127580000000, %252 ], !dbg !21 + %256 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i92 ], [ 0x3FBCE315E0000000, %252 ], !dbg !21 + %257 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i92 ], [ 0xBF9B837CE0000000, %252 ], !dbg !21 + %258 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i92 ], [ 0x3F755ABD40000000, %252 ], !dbg !21 + %259 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i92 ], [ 0xBF4AE9A400000000, %252 ], !dbg !21 + %260 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i92 ], [ 0x3F163D2D40000000, %252 ], !dbg !21 + %261 = phi float [ %.01.i94, %__nv_fabsf.exit1.i92 ], [ %253, %252 ], !dbg !21 + %262 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not2.i77 = icmp eq i32 %262, 0, !dbg !21 + %263 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %260, float %261, float %259) #4, !dbg !21 + %264 = tail call float @llvm.nvvm.fma.rn.f(float %260, float %261, float %259) #4, !dbg !21 + %.02.i78 = select i1 %.not2.i77, float %264, float %263, !dbg !21 + %265 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not3.i79 = icmp eq i32 %265, 0, !dbg !21 + %266 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i78, float %261, float %258) #4, !dbg !21 + %267 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i78, float %261, float %258) #4, !dbg !21 + %.03.i80 = select i1 %.not3.i79, float %267, float %266, !dbg !21 + %268 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not4.i81 = icmp eq i32 %268, 0, !dbg !21 + %269 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i80, float %261, float %257) #4, !dbg !21 + %270 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i80, float %261, float %257) #4, !dbg !21 + %.04.i82 = select i1 %.not4.i81, float %270, float %269, !dbg !21 + %271 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not5.i83 = icmp eq i32 %271, 0, !dbg !21 + %272 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i82, float %261, float %256) #4, !dbg !21 + %273 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i82, float %261, float %256) #4, !dbg !21 + %.05.i84 = select i1 %.not5.i83, float %273, float %272, !dbg !21 + %274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not6.i85 = icmp eq i32 %274, 0, !dbg !21 + %275 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i84, float %261, float %255) #4, !dbg !21 + %276 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i84, float %261, float %255) #4, !dbg !21 + %.06.i86 = select i1 %.not6.i85, float %276, float %275, !dbg !21 + %277 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not7.i87 = icmp eq i32 %277, 0, !dbg !21 + %278 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i86, float %261, float %254) #4, !dbg !21 + %279 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i86, float %261, float %254) #4, !dbg !21 + %.07.i88 = select i1 %.not7.i87, float %279, float %278, !dbg !21 + %280 = fneg float %261, !dbg !21 + %281 = select i1 %250, float %280, float %59, !dbg !21 + %282 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not8.i89 = icmp eq i32 %282, 0, !dbg !21 + %283 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i88, float %281, float %281) #4, !dbg !21 + %284 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i88, float %281, float %281) #4, !dbg !21 + %.08.i90 = select i1 %.not8.i89, float %284, float %283, !dbg !21 + br i1 %250, label %285, label %__nv_erff.exit95, !dbg !21 + +285: ; preds = %__internal_fmad.exit.i76 + %286 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i90) #4, !dbg !21 + %287 = fsub float 1.000000e+00, %286, !dbg !21 + %288 = bitcast float %287 to i32, !dbg !21 + %289 = bitcast float %59 to i32, !dbg !21 + %290 = and i32 %289, -2147483648, !dbg !21 + %291 = or i32 %290, %288, !dbg !21 + %292 = bitcast i32 %291 to float, !dbg !21 + br label %__nv_erff.exit95, !dbg !21 + +__nv_erff.exit95: ; preds = %__internal_fmad.exit.i76, %285 + %r.0.i91 = phi float [ %292, %285 ], [ %.08.i90, %__internal_fmad.exit.i76 ], !dbg !21 + %293 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not.i96 = icmp eq i32 %293, 0, !dbg !21 + %294 = tail call float @llvm.nvvm.fabs.ftz.f(float %60) #4, !dbg !21 + %295 = tail call float @llvm.nvvm.fabs.f(float %60) #4, !dbg !21 + %.0.i97 = select i1 %.not.i96, float %295, float %294, !dbg !21 + %296 = fcmp oge float %.0.i97, 0x3FF00C1FC0000000, !dbg !21 + br i1 %296, label %__nv_fabsf.exit1.i114, label %298, !dbg !21 + +__nv_fabsf.exit1.i114: ; preds = %__nv_erff.exit95 + %297 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not1.i115 = icmp eq i32 %297, 0, !dbg !21 + %.01.i116 = select i1 %.not1.i115, float %295, float %294, !dbg !21 + br label %__internal_fmad.exit.i98, !dbg !21 + +298: ; preds = %__nv_erff.exit95 + %299 = fmul float %60, %60, !dbg !21 + br label %__internal_fmad.exit.i98, !dbg !21 + +__internal_fmad.exit.i98: ; preds = %298, %__nv_fabsf.exit1.i114 + %300 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i114 ], [ 0x3FC06EBA60000000, %298 ], !dbg !21 + %301 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i114 ], [ 0xBFD8127580000000, %298 ], !dbg !21 + %302 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i114 ], [ 0x3FBCE315E0000000, %298 ], !dbg !21 + %303 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i114 ], [ 0xBF9B837CE0000000, %298 ], !dbg !21 + %304 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i114 ], [ 0x3F755ABD40000000, %298 ], !dbg !21 + %305 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i114 ], [ 0xBF4AE9A400000000, %298 ], !dbg !21 + %306 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i114 ], [ 0x3F163D2D40000000, %298 ], !dbg !21 + %307 = phi float [ %.01.i116, %__nv_fabsf.exit1.i114 ], [ %299, %298 ], !dbg !21 + %308 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not2.i99 = icmp eq i32 %308, 0, !dbg !21 + %309 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %306, float %307, float %305) #4, !dbg !21 + %310 = tail call float @llvm.nvvm.fma.rn.f(float %306, float %307, float %305) #4, !dbg !21 + %.02.i100 = select i1 %.not2.i99, float %310, float %309, !dbg !21 + %311 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not3.i101 = icmp eq i32 %311, 0, !dbg !21 + %312 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i100, float %307, float %304) #4, !dbg !21 + %313 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i100, float %307, float %304) #4, !dbg !21 + %.03.i102 = select i1 %.not3.i101, float %313, float %312, !dbg !21 + %314 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not4.i103 = icmp eq i32 %314, 0, !dbg !21 + %315 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i102, float %307, float %303) #4, !dbg !21 + %316 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i102, float %307, float %303) #4, !dbg !21 + %.04.i104 = select i1 %.not4.i103, float %316, float %315, !dbg !21 + %317 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not5.i105 = icmp eq i32 %317, 0, !dbg !21 + %318 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i104, float %307, float %302) #4, !dbg !21 + %319 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i104, float %307, float %302) #4, !dbg !21 + %.05.i106 = select i1 %.not5.i105, float %319, float %318, !dbg !21 + %320 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not6.i107 = icmp eq i32 %320, 0, !dbg !21 + %321 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i106, float %307, float %301) #4, !dbg !21 + %322 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i106, float %307, float %301) #4, !dbg !21 + %.06.i108 = select i1 %.not6.i107, float %322, float %321, !dbg !21 + %323 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not7.i109 = icmp eq i32 %323, 0, !dbg !21 + %324 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i108, float %307, float %300) #4, !dbg !21 + %325 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i108, float %307, float %300) #4, !dbg !21 + %.07.i110 = select i1 %.not7.i109, float %325, float %324, !dbg !21 + %326 = fneg float %307, !dbg !21 + %327 = select i1 %296, float %326, float %60, !dbg !21 + %328 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not8.i111 = icmp eq i32 %328, 0, !dbg !21 + %329 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i110, float %327, float %327) #4, !dbg !21 + %330 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i110, float %327, float %327) #4, !dbg !21 + %.08.i112 = select i1 %.not8.i111, float %330, float %329, !dbg !21 + br i1 %296, label %331, label %__nv_erff.exit117, !dbg !21 + +331: ; preds = %__internal_fmad.exit.i98 + %332 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i112) #4, !dbg !21 + %333 = fsub float 1.000000e+00, %332, !dbg !21 + %334 = bitcast float %333 to i32, !dbg !21 + %335 = bitcast float %60 to i32, !dbg !21 + %336 = and i32 %335, -2147483648, !dbg !21 + %337 = or i32 %336, %334, !dbg !21 + %338 = bitcast i32 %337 to float, !dbg !21 + br label %__nv_erff.exit117, !dbg !21 + +__nv_erff.exit117: ; preds = %__internal_fmad.exit.i98, %331 + %r.0.i113 = phi float [ %338, %331 ], [ %.08.i112, %__internal_fmad.exit.i98 ], !dbg !21 + %339 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not.i118 = icmp eq i32 %339, 0, !dbg !21 + %340 = tail call float @llvm.nvvm.fabs.ftz.f(float %61) #4, !dbg !21 + %341 = tail call float @llvm.nvvm.fabs.f(float %61) #4, !dbg !21 + %.0.i119 = select i1 %.not.i118, float %341, float %340, !dbg !21 + %342 = fcmp oge float %.0.i119, 0x3FF00C1FC0000000, !dbg !21 + br i1 %342, label %__nv_fabsf.exit1.i136, label %344, !dbg !21 + +__nv_fabsf.exit1.i136: ; preds = %__nv_erff.exit117 + %343 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not1.i137 = icmp eq i32 %343, 0, !dbg !21 + %.01.i138 = select i1 %.not1.i137, float %341, float %340, !dbg !21 + br label %__internal_fmad.exit.i120, !dbg !21 + +344: ; preds = %__nv_erff.exit117 + %345 = fmul float %61, %61, !dbg !21 + br label %__internal_fmad.exit.i120, !dbg !21 + +__internal_fmad.exit.i120: ; preds = %344, %__nv_fabsf.exit1.i136 + %346 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i136 ], [ 0x3FC06EBA60000000, %344 ], !dbg !21 + %347 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i136 ], [ 0xBFD8127580000000, %344 ], !dbg !21 + %348 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i136 ], [ 0x3FBCE315E0000000, %344 ], !dbg !21 + %349 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i136 ], [ 0xBF9B837CE0000000, %344 ], !dbg !21 + %350 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i136 ], [ 0x3F755ABD40000000, %344 ], !dbg !21 + %351 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i136 ], [ 0xBF4AE9A400000000, %344 ], !dbg !21 + %352 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i136 ], [ 0x3F163D2D40000000, %344 ], !dbg !21 + %353 = phi float [ %.01.i138, %__nv_fabsf.exit1.i136 ], [ %345, %344 ], !dbg !21 + %354 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not2.i121 = icmp eq i32 %354, 0, !dbg !21 + %355 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %352, float %353, float %351) #4, !dbg !21 + %356 = tail call float @llvm.nvvm.fma.rn.f(float %352, float %353, float %351) #4, !dbg !21 + %.02.i122 = select i1 %.not2.i121, float %356, float %355, !dbg !21 + %357 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not3.i123 = icmp eq i32 %357, 0, !dbg !21 + %358 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i122, float %353, float %350) #4, !dbg !21 + %359 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i122, float %353, float %350) #4, !dbg !21 + %.03.i124 = select i1 %.not3.i123, float %359, float %358, !dbg !21 + %360 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not4.i125 = icmp eq i32 %360, 0, !dbg !21 + %361 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i124, float %353, float %349) #4, !dbg !21 + %362 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i124, float %353, float %349) #4, !dbg !21 + %.04.i126 = select i1 %.not4.i125, float %362, float %361, !dbg !21 + %363 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not5.i127 = icmp eq i32 %363, 0, !dbg !21 + %364 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i126, float %353, float %348) #4, !dbg !21 + %365 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i126, float %353, float %348) #4, !dbg !21 + %.05.i128 = select i1 %.not5.i127, float %365, float %364, !dbg !21 + %366 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not6.i129 = icmp eq i32 %366, 0, !dbg !21 + %367 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i128, float %353, float %347) #4, !dbg !21 + %368 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i128, float %353, float %347) #4, !dbg !21 + %.06.i130 = select i1 %.not6.i129, float %368, float %367, !dbg !21 + %369 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not7.i131 = icmp eq i32 %369, 0, !dbg !21 + %370 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i130, float %353, float %346) #4, !dbg !21 + %371 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i130, float %353, float %346) #4, !dbg !21 + %.07.i132 = select i1 %.not7.i131, float %371, float %370, !dbg !21 + %372 = fneg float %353, !dbg !21 + %373 = select i1 %342, float %372, float %61, !dbg !21 + %374 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not8.i133 = icmp eq i32 %374, 0, !dbg !21 + %375 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i132, float %373, float %373) #4, !dbg !21 + %376 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i132, float %373, float %373) #4, !dbg !21 + %.08.i134 = select i1 %.not8.i133, float %376, float %375, !dbg !21 + br i1 %342, label %377, label %__nv_erff.exit139, !dbg !21 + +377: ; preds = %__internal_fmad.exit.i120 + %378 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i134) #4, !dbg !21 + %379 = fsub float 1.000000e+00, %378, !dbg !21 + %380 = bitcast float %379 to i32, !dbg !21 + %381 = bitcast float %61 to i32, !dbg !21 + %382 = and i32 %381, -2147483648, !dbg !21 + %383 = or i32 %382, %380, !dbg !21 + %384 = bitcast i32 %383 to float, !dbg !21 + br label %__nv_erff.exit139, !dbg !21 + +__nv_erff.exit139: ; preds = %__internal_fmad.exit.i120, %377 + %r.0.i135 = phi float [ %384, %377 ], [ %.08.i134, %__internal_fmad.exit.i120 ], !dbg !21 + %385 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not.i140 = icmp eq i32 %385, 0, !dbg !21 + %386 = tail call float @llvm.nvvm.fabs.ftz.f(float %62) #4, !dbg !21 + %387 = tail call float @llvm.nvvm.fabs.f(float %62) #4, !dbg !21 + %.0.i141 = select i1 %.not.i140, float %387, float %386, !dbg !21 + %388 = fcmp oge float %.0.i141, 0x3FF00C1FC0000000, !dbg !21 + br i1 %388, label %__nv_fabsf.exit1.i158, label %390, !dbg !21 + +__nv_fabsf.exit1.i158: ; preds = %__nv_erff.exit139 + %389 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not1.i159 = icmp eq i32 %389, 0, !dbg !21 + %.01.i160 = select i1 %.not1.i159, float %387, float %386, !dbg !21 + br label %__internal_fmad.exit.i142, !dbg !21 + +390: ; preds = %__nv_erff.exit139 + %391 = fmul float %62, %62, !dbg !21 + br label %__internal_fmad.exit.i142, !dbg !21 + +__internal_fmad.exit.i142: ; preds = %390, %__nv_fabsf.exit1.i158 + %392 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i158 ], [ 0x3FC06EBA60000000, %390 ], !dbg !21 + %393 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i158 ], [ 0xBFD8127580000000, %390 ], !dbg !21 + %394 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i158 ], [ 0x3FBCE315E0000000, %390 ], !dbg !21 + %395 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i158 ], [ 0xBF9B837CE0000000, %390 ], !dbg !21 + %396 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i158 ], [ 0x3F755ABD40000000, %390 ], !dbg !21 + %397 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i158 ], [ 0xBF4AE9A400000000, %390 ], !dbg !21 + %398 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i158 ], [ 0x3F163D2D40000000, %390 ], !dbg !21 + %399 = phi float [ %.01.i160, %__nv_fabsf.exit1.i158 ], [ %391, %390 ], !dbg !21 + %400 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not2.i143 = icmp eq i32 %400, 0, !dbg !21 + %401 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %398, float %399, float %397) #4, !dbg !21 + %402 = tail call float @llvm.nvvm.fma.rn.f(float %398, float %399, float %397) #4, !dbg !21 + %.02.i144 = select i1 %.not2.i143, float %402, float %401, !dbg !21 + %403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not3.i145 = icmp eq i32 %403, 0, !dbg !21 + %404 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i144, float %399, float %396) #4, !dbg !21 + %405 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i144, float %399, float %396) #4, !dbg !21 + %.03.i146 = select i1 %.not3.i145, float %405, float %404, !dbg !21 + %406 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not4.i147 = icmp eq i32 %406, 0, !dbg !21 + %407 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i146, float %399, float %395) #4, !dbg !21 + %408 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i146, float %399, float %395) #4, !dbg !21 + %.04.i148 = select i1 %.not4.i147, float %408, float %407, !dbg !21 + %409 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not5.i149 = icmp eq i32 %409, 0, !dbg !21 + %410 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i148, float %399, float %394) #4, !dbg !21 + %411 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i148, float %399, float %394) #4, !dbg !21 + %.05.i150 = select i1 %.not5.i149, float %411, float %410, !dbg !21 + %412 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not6.i151 = icmp eq i32 %412, 0, !dbg !21 + %413 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i150, float %399, float %393) #4, !dbg !21 + %414 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i150, float %399, float %393) #4, !dbg !21 + %.06.i152 = select i1 %.not6.i151, float %414, float %413, !dbg !21 + %415 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not7.i153 = icmp eq i32 %415, 0, !dbg !21 + %416 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i152, float %399, float %392) #4, !dbg !21 + %417 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i152, float %399, float %392) #4, !dbg !21 + %.07.i154 = select i1 %.not7.i153, float %417, float %416, !dbg !21 + %418 = fneg float %399, !dbg !21 + %419 = select i1 %388, float %418, float %62, !dbg !21 + %420 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !21 + %.not8.i155 = icmp eq i32 %420, 0, !dbg !21 + %421 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i154, float %419, float %419) #4, !dbg !21 + %422 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i154, float %419, float %419) #4, !dbg !21 + %.08.i156 = select i1 %.not8.i155, float %422, float %421, !dbg !21 + br i1 %388, label %423, label %__nv_erff.exit161, !dbg !21 + +423: ; preds = %__internal_fmad.exit.i142 + %424 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i156) #4, !dbg !21 + %425 = fsub float 1.000000e+00, %424, !dbg !21 + %426 = bitcast float %425 to i32, !dbg !21 + %427 = bitcast float %62 to i32, !dbg !21 + %428 = and i32 %427, -2147483648, !dbg !21 + %429 = or i32 %428, %426, !dbg !21 + %430 = bitcast i32 %429 to float, !dbg !21 + br label %__nv_erff.exit161, !dbg !21 + +__nv_erff.exit161: ; preds = %__internal_fmad.exit.i142, %423 + %r.0.i157 = phi float [ %430, %423 ], [ %.08.i156, %__internal_fmad.exit.i142 ], !dbg !21 + %431 = fadd float %r.0.i, 1.000000e+00, !dbg !22 + %432 = fadd float %r.0.i25, 1.000000e+00, !dbg !22 + %433 = fadd float %r.0.i47, 1.000000e+00, !dbg !22 + %434 = fadd float %r.0.i69, 1.000000e+00, !dbg !22 + %435 = fadd float %r.0.i91, 1.000000e+00, !dbg !22 + %436 = fadd float %r.0.i113, 1.000000e+00, !dbg !22 + %437 = fadd float %r.0.i135, 1.000000e+00, !dbg !22 + %438 = fadd float %r.0.i157, 1.000000e+00, !dbg !22 + %439 = fmul float %431, 5.000000e-01, !dbg !23 + %440 = fmul float %432, 5.000000e-01, !dbg !23 + %441 = fmul float %433, 5.000000e-01, !dbg !23 + %442 = fmul float %434, 5.000000e-01, !dbg !23 + %443 = fmul float %435, 5.000000e-01, !dbg !23 + %444 = fmul float %436, 5.000000e-01, !dbg !23 + %445 = fmul float %437, 5.000000e-01, !dbg !23 + %446 = fmul float %438, 5.000000e-01, !dbg !23 + %447 = fmul float %47, %47, !dbg !24 + %448 = fmul float %48, %48, !dbg !24 + %449 = fmul float %49, %49, !dbg !24 + %450 = fmul float %50, %50, !dbg !24 + %451 = fmul float %51, %51, !dbg !24 + %452 = fmul float %52, %52, !dbg !24 + %453 = fmul float %53, %53, !dbg !24 + %454 = fmul float %54, %54, !dbg !24 + %455 = fmul float %447, -5.000000e-01, !dbg !25 + %456 = fmul float %448, -5.000000e-01, !dbg !25 + %457 = fmul float %449, -5.000000e-01, !dbg !25 + %458 = fmul float %450, -5.000000e-01, !dbg !25 + %459 = fmul float %451, -5.000000e-01, !dbg !25 + %460 = fmul float %452, -5.000000e-01, !dbg !25 + %461 = fmul float %453, -5.000000e-01, !dbg !25 + %462 = fmul float %454, -5.000000e-01, !dbg !25 + %463 = fmul float %455, 0x3FF7154760000000, !dbg !26 + %464 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %463) #4, !dbg !26 + %465 = fmul float %456, 0x3FF7154760000000, !dbg !26 + %466 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %465) #4, !dbg !26 + %467 = fmul float %457, 0x3FF7154760000000, !dbg !26 + %468 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %467) #4, !dbg !26 + %469 = fmul float %458, 0x3FF7154760000000, !dbg !26 + %470 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %469) #4, !dbg !26 + %471 = fmul float %459, 0x3FF7154760000000, !dbg !26 + %472 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %471) #4, !dbg !26 + %473 = fmul float %460, 0x3FF7154760000000, !dbg !26 + %474 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %473) #4, !dbg !26 + %475 = fmul float %461, 0x3FF7154760000000, !dbg !26 + %476 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %475) #4, !dbg !26 + %477 = fmul float %462, 0x3FF7154760000000, !dbg !26 + %478 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %477) #4, !dbg !26 + %479 = fmul float %464, 0x3FD9884540000000, !dbg !27 + %480 = fmul float %466, 0x3FD9884540000000, !dbg !27 + %481 = fmul float %468, 0x3FD9884540000000, !dbg !27 + %482 = fmul float %470, 0x3FD9884540000000, !dbg !27 + %483 = fmul float %472, 0x3FD9884540000000, !dbg !27 + %484 = fmul float %474, 0x3FD9884540000000, !dbg !27 + %485 = fmul float %476, 0x3FD9884540000000, !dbg !27 + %486 = fmul float %478, 0x3FD9884540000000, !dbg !27 + %487 = fmul float %47, %479, !dbg !28 + %488 = fmul float %48, %480, !dbg !28 + %489 = fmul float %49, %481, !dbg !28 + %490 = fmul float %50, %482, !dbg !28 + %491 = fmul float %51, %483, !dbg !28 + %492 = fmul float %52, %484, !dbg !28 + %493 = fmul float %53, %485, !dbg !28 + %494 = fmul float %54, %486, !dbg !28 + %495 = fadd float %439, %487, !dbg !29 + %496 = fadd float %440, %488, !dbg !29 + %497 = fadd float %441, %489, !dbg !29 + %498 = fadd float %442, %490, !dbg !29 + %499 = fadd float %443, %491, !dbg !29 + %500 = fadd float %444, %492, !dbg !29 + %501 = fadd float %445, %493, !dbg !29 + %502 = fadd float %446, %494, !dbg !29 + %503 = fmul float %25, %495, !dbg !30 + %504 = fmul float %26, %496, !dbg !30 + %505 = fmul float %27, %497, !dbg !30 + %506 = fmul float %28, %498, !dbg !30 + %507 = fmul float %29, %499, !dbg !30 + %508 = fmul float %30, %500, !dbg !30 + %509 = fmul float %31, %501, !dbg !30 + %510 = fmul float %32, %502, !dbg !30 + %511 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %503) #4, !dbg !31 + %512 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %504) #4, !dbg !31 + %513 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %505) #4, !dbg !31 + %514 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %506) #4, !dbg !31 + %515 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %507) #4, !dbg !31 + %516 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %508) #4, !dbg !31 + %517 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %509) #4, !dbg !31 + %518 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %510) #4, !dbg !31 + %519 = insertelement <2 x i16> undef, i16 %511, i64 0, !dbg !31 + %520 = insertelement <2 x i16> %519, i16 %512, i64 1, !dbg !31 + %521 = bitcast <2 x i16> %520 to i32, !dbg !31 + %522 = insertelement <2 x i16> undef, i16 %513, i64 0, !dbg !31 + %523 = insertelement <2 x i16> %522, i16 %514, i64 1, !dbg !31 + %524 = bitcast <2 x i16> %523 to i32, !dbg !31 + %525 = insertelement <2 x i16> undef, i16 %515, i64 0, !dbg !31 + %526 = insertelement <2 x i16> %525, i16 %516, i64 1, !dbg !31 + %527 = bitcast <2 x i16> %526 to i32, !dbg !31 + %528 = insertelement <2 x i16> undef, i16 %517, i64 0, !dbg !31 + %529 = insertelement <2 x i16> %528, i16 %518, i64 1, !dbg !31 + %530 = bitcast <2 x i16> %529 to i32, !dbg !31 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %521, i32 %524, i32 %527, i32 %530, ptr addrspace(1) %11, i1 true) #4, !dbg !31 + ret void, !dbg !32 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: alwaysinline nounwind +define float @__nv_erff(float %a) local_unnamed_addr #1 { +__nv_fabsf.exit: + %0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not = icmp eq i32 %0, 0 + %1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4 + %2 = tail call float @llvm.nvvm.fabs.f(float %a) #4 + %.0 = select i1 %.not, float %2, float %1 + %3 = fcmp oge float %.0, 0x3FF00C1FC0000000 + br i1 %3, label %__nv_fabsf.exit1, label %5 + +__nv_fabsf.exit1: ; preds = %__nv_fabsf.exit + %4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not1 = icmp eq i32 %4, 0 + %.01 = select i1 %.not1, float %2, float %1 + br label %__internal_fmad.exit + +5: ; preds = %__nv_fabsf.exit + %6 = fmul float %a, %a + br label %__internal_fmad.exit + +__internal_fmad.exit: ; preds = %5, %__nv_fabsf.exit1 + %7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ] + %8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ] + %9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ] + %10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ] + %11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ] + %12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ] + %13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ] + %14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ] + %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not2 = icmp eq i32 %15, 0 + %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4 + %17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4 + %.02 = select i1 %.not2, float %17, float %16 + %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not3 = icmp eq i32 %18, 0 + %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4 + %20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4 + %.03 = select i1 %.not3, float %20, float %19 + %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not4 = icmp eq i32 %21, 0 + %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4 + %23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4 + %.04 = select i1 %.not4, float %23, float %22 + %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not5 = icmp eq i32 %24, 0 + %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4 + %26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4 + %.05 = select i1 %.not5, float %26, float %25 + %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not6 = icmp eq i32 %27, 0 + %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4 + %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4 + %.06 = select i1 %.not6, float %29, float %28 + %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not7 = icmp eq i32 %30, 0 + %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4 + %32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4 + %.07 = select i1 %.not7, float %32, float %31 + %33 = fneg float %14 + %34 = select i1 %3, float %33, float %a + %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4 + %.not8 = icmp eq i32 %35, 0 + %36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4 + %37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4 + %.08 = select i1 %.not8, float %37, float %36 + br i1 %3, label %38, label %46 + +38: ; preds = %__internal_fmad.exit + %39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4 + %40 = fsub float 1.000000e+00, %39 + %41 = bitcast float %40 to i32 + %42 = bitcast float %a to i32 + %43 = and i32 %42, -2147483648 + %44 = or i32 %43, %41 + %45 = bitcast i32 %44 to float + br label %46 + +46: ; preds = %38, %__internal_fmad.exit + %r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ] + ret float %r.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fabs.ftz.f(float) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fabs.f(float) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #4 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py", directory: "/tmp/torchinductor_root/5j") +!4 = !{ptr @triton__0d1d2de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 21, column: 36, scope: !7) +!11 = !DILocation(line: 20, column: 28, scope: !7) +!12 = !DILocation(line: 20, column: 33, scope: !7) +!13 = !DILocation(line: 21, column: 23, scope: !7) +!14 = !DILocation(line: 24, column: 34, scope: !7) +!15 = !DILocation(line: 24, column: 39, scope: !7) +!16 = !DILocation(line: 24, column: 48, scope: !7) +!17 = !DILocation(line: 25, column: 30, scope: !7) +!18 = !DILocation(line: 25, column: 35, scope: !7) +!19 = !DILocation(line: 25, column: 44, scope: !7) +!20 = !DILocation(line: 29, column: 18, scope: !7) +!21 = !DILocation(line: 30, column: 23, scope: !7) +!22 = !DILocation(line: 32, column: 18, scope: !7) +!23 = !DILocation(line: 34, column: 19, scope: !7) +!24 = !DILocation(line: 35, column: 19, scope: !7) +!25 = !DILocation(line: 37, column: 20, scope: !7) +!26 = !DILocation(line: 38, column: 19, scope: !7) +!27 = !DILocation(line: 40, column: 20, scope: !7) +!28 = !DILocation(line: 41, column: 19, scope: !7) +!29 = !DILocation(line: 42, column: 20, scope: !7) +!30 = !DILocation(line: 43, column: 19, scope: !7) +!31 = !DILocation(line: 45, column: 40, scope: !7) +!32 = !DILocation(line: 45, column: 4, scope: !7) diff --git a/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttgir b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..7d915f749f517d5b7fd431b32400f530961585b2 --- /dev/null +++ b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttgir @@ -0,0 +1,38 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.398942292> : tensor<1024xf32, #blocked> + %cst_0 = arith.constant dense<-5.000000e-01> : tensor<1024xf32, #blocked> + %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32, #blocked> + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked> + %cst_3 = arith.constant dense<0.707106769> : tensor<1024xf32, #blocked> + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked> + %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked> + %8 = arith.extf %7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked> + %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked> + %12 = arith.extf %11 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked> + %13 = arith.mulf %12, %cst_3 : tensor<1024xf32, #blocked> + %14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked> + %15 = arith.addf %14, %cst_2 : tensor<1024xf32, #blocked> + %16 = arith.mulf %15, %cst_1 : tensor<1024xf32, #blocked> + %17 = arith.mulf %12, %12 : tensor<1024xf32, #blocked> + %18 = arith.mulf %17, %cst_0 : tensor<1024xf32, #blocked> + %19 = math.exp %18 : tensor<1024xf32, #blocked> + %20 = arith.mulf %19, %cst : tensor<1024xf32, #blocked> + %21 = arith.mulf %12, %20 : tensor<1024xf32, #blocked> + %22 = arith.addf %16, %21 : tensor<1024xf32, #blocked> + %23 = arith.mulf %8, %22 : tensor<1024xf32, #blocked> + %24 = arith.truncf %23 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> + tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttir b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..297b7574c8f32a6d7465900b7eb16ea68ebde7dd --- /dev/null +++ b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ttir @@ -0,0 +1,37 @@ +module { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.398942292> : tensor<1024xf32> + %cst_0 = arith.constant dense<-5.000000e-01> : tensor<1024xf32> + %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32> + %cst_2 = arith.constant dense<1.000000e+00> : tensor<1024xf32> + %cst_3 = arith.constant dense<0.707106769> : tensor<1024xf32> + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32> + %4 = arith.addi %3, %2 : tensor<1024xi32> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr> + %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16> + %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<1024x!tt.ptr> + %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> + %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16> + %12 = arith.extf %11 : tensor<1024xbf16> to tensor<1024xf32> + %13 = arith.mulf %12, %cst_3 : tensor<1024xf32> + %14 = tt.extern_elementwise %13 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32> + %15 = arith.addf %14, %cst_2 : tensor<1024xf32> + %16 = arith.mulf %15, %cst_1 : tensor<1024xf32> + %17 = arith.mulf %12, %12 : tensor<1024xf32> + %18 = arith.mulf %17, %cst_0 : tensor<1024xf32> + %19 = math.exp %18 : tensor<1024xf32> + %20 = arith.mulf %19, %cst : tensor<1024xf32> + %21 = arith.mulf %12, %20 : tensor<1024xf32> + %22 = arith.addf %16, %21 : tensor<1024xf32> + %23 = arith.mulf %8, %22 : tensor<1024xf32> + %24 = arith.truncf %23 : tensor<1024xf32> to tensor<1024xbf16> + tt.store %6, %24 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16> + tt.return + } +} diff --git a/.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.llir b/.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..4b6ce81d90e355b95065df75370d706c0d6f2896 --- /dev/null +++ b/.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.llir @@ -0,0 +1,318 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3d4d5d6d7d8de9de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, i32 %8, i32 %9) local_unnamed_addr !dbg !5 { + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %12 = and i32 %11, 31, !dbg !8 + %13 = lshr i32 %11, 5, !dbg !8 + %14 = and i32 %13, 1, !dbg !8 + %urem = shl i32 %11, 2, !dbg !8 + %15 = and i32 %urem, 252, !dbg !8 + %16 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9 + %17 = shl i32 %16, 8, !dbg !10 + %18 = or i32 %17, %15, !dbg !11 + %19 = sext i32 %18 to i64, !dbg !12 + %20 = getelementptr i16, ptr addrspace(1) %1, i64 %19, !dbg !12 + %21 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13 + %22 = extractvalue { i32, i32 } %21, 0, !dbg !13 + %23 = extractvalue { i32, i32 } %21, 1, !dbg !13 + %24 = trunc i32 %22 to i16, !dbg !13 + %extelt.offset = lshr i32 %22, 16, !dbg !13 + %25 = trunc i32 %extelt.offset to i16, !dbg !13 + %26 = trunc i32 %23 to i16, !dbg !13 + %extelt.offset1 = lshr i32 %23, 16, !dbg !13 + %27 = trunc i32 %extelt.offset1 to i16, !dbg !13 + %28 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %24) #3, !dbg !14 + %29 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %25) #3, !dbg !14 + %30 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %26) #3, !dbg !14 + %31 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %27) #3, !dbg !14 + %32 = zext nneg i32 %15 to i64, !dbg !15 + %33 = getelementptr float, ptr addrspace(1) %2, i64 %32, !dbg !15 + %34 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %33, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %35 = extractvalue { i32, i32, i32, i32 } %34, 0, !dbg !16 + %36 = extractvalue { i32, i32, i32, i32 } %34, 1, !dbg !16 + %37 = extractvalue { i32, i32, i32, i32 } %34, 2, !dbg !16 + %38 = extractvalue { i32, i32, i32, i32 } %34, 3, !dbg !16 + %39 = bitcast i32 %35 to float, !dbg !16 + %40 = bitcast i32 %36 to float, !dbg !16 + %41 = bitcast i32 %37 to float, !dbg !16 + %42 = bitcast i32 %38 to float, !dbg !16 + %43 = getelementptr float, ptr addrspace(1) %3, i64 %19, !dbg !17 + %44 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %43, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18 + %45 = extractvalue { i32, i32, i32, i32 } %44, 0, !dbg !18 + %46 = extractvalue { i32, i32, i32, i32 } %44, 1, !dbg !18 + %47 = extractvalue { i32, i32, i32, i32 } %44, 2, !dbg !18 + %48 = extractvalue { i32, i32, i32, i32 } %44, 3, !dbg !18 + %49 = bitcast i32 %45 to float, !dbg !18 + %50 = bitcast i32 %46 to float, !dbg !18 + %51 = bitcast i32 %47 to float, !dbg !18 + %52 = bitcast i32 %48 to float, !dbg !18 + %53 = getelementptr i16, ptr addrspace(1) %4, i64 %19, !dbg !19 + %54 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %53, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !20 + %55 = extractvalue { i32, i32 } %54, 0, !dbg !20 + %56 = extractvalue { i32, i32 } %54, 1, !dbg !20 + %57 = trunc i32 %55 to i16, !dbg !20 + %extelt.offset2 = lshr i32 %55, 16, !dbg !20 + %58 = trunc i32 %extelt.offset2 to i16, !dbg !20 + %59 = trunc i32 %56 to i16, !dbg !20 + %extelt.offset3 = lshr i32 %56, 16, !dbg !20 + %60 = trunc i32 %extelt.offset3 to i16, !dbg !20 + %61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #3, !dbg !21 + %62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %58) #3, !dbg !21 + %63 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #3, !dbg !21 + %64 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %60) #3, !dbg !21 + %65 = sext i32 %16 to i64, !dbg !22 + %66 = getelementptr float, ptr addrspace(1) %5, i64 %65, !dbg !22 + %67 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %66, i1 true) #3, !dbg !23 + %68 = bitcast i32 %67 to float, !dbg !23 + %69 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %66, i1 true) #3, !dbg !23 + %70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %66, i1 true) #3, !dbg !23 + %71 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %66, i1 true) #3, !dbg !23 + %72 = getelementptr float, ptr addrspace(1) %6, i64 %65, !dbg !24 + %73 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !25 + %74 = bitcast i32 %73 to float, !dbg !25 + %75 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !25 + %76 = bitcast i32 %75 to float, !dbg !25 + %77 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !25 + %78 = bitcast i32 %77 to float, !dbg !25 + %79 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !25 + %80 = bitcast i32 %79 to float, !dbg !25 + %81 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !26 + %82 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %81, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !27 + %83 = extractvalue { i32, i32, i32, i32 } %82, 0, !dbg !27 + %84 = extractvalue { i32, i32, i32, i32 } %82, 1, !dbg !27 + %85 = extractvalue { i32, i32, i32, i32 } %82, 2, !dbg !27 + %86 = extractvalue { i32, i32, i32, i32 } %82, 3, !dbg !27 + %87 = bitcast i32 %83 to float, !dbg !27 + %88 = bitcast i32 %84 to float, !dbg !27 + %89 = bitcast i32 %85 to float, !dbg !27 + %90 = bitcast i32 %86 to float, !dbg !27 + %91 = fmul float %28, %39, !dbg !28 + %92 = fmul float %29, %40, !dbg !28 + %93 = fmul float %30, %41, !dbg !28 + %94 = fmul float %31, %42, !dbg !28 + %95 = fadd float %91, %92, !dbg !29 + %96 = fadd float %93, %95, !dbg !29 + %97 = fadd float %94, %96, !dbg !29 + %98 = bitcast float %97 to i32, !dbg !35 + %99 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %98, i32 16, i32 31), !dbg !35 + %100 = bitcast i32 %99 to float, !dbg !35 + %101 = fadd float %97, %100, !dbg !29 + %102 = bitcast float %101 to i32, !dbg !35 + %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 8, i32 31), !dbg !35 + %104 = bitcast i32 %103 to float, !dbg !35 + %105 = fadd float %101, %104, !dbg !29 + %106 = bitcast float %105 to i32, !dbg !35 + %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 4, i32 31), !dbg !35 + %108 = bitcast i32 %107 to float, !dbg !35 + %109 = fadd float %105, %108, !dbg !29 + %110 = bitcast float %109 to i32, !dbg !35 + %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 2, i32 31), !dbg !35 + %112 = bitcast i32 %111 to float, !dbg !35 + %113 = fadd float %109, %112, !dbg !29 + %114 = bitcast float %113 to i32, !dbg !35 + %115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 1, i32 31), !dbg !35 + %116 = bitcast i32 %115 to float, !dbg !35 + %117 = fadd float %113, %116, !dbg !29 + %118 = icmp eq i32 %12, 0, !dbg !35 + %119 = zext nneg i32 %14 to i64, !dbg !35 + %120 = getelementptr float, ptr addrspace(3) @global_smem, i64 %119, !dbg !35 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %117, i1 %118) #3, !dbg !35 + tail call void @llvm.nvvm.barrier0(), !dbg !35 + %121 = icmp slt i32 %11, 2, !dbg !35 + %122 = sext i32 %11 to i64, !dbg !35 + %123 = getelementptr float, ptr addrspace(3) @global_smem, i64 %122, !dbg !35 + %124 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %123, i1 %121) #3, !dbg !35 + %125 = bitcast float %124 to i32, !dbg !35 + %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 1, i32 31), !dbg !35 + %127 = bitcast i32 %126 to float, !dbg !35 + %128 = fadd float %124, %127, !dbg !29 + %129 = and i32 %11, 1, !dbg !35 + %130 = icmp eq i32 %129, 0, !dbg !35 + %131 = and i1 %121, %130, !dbg !35 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %123, float %128, i1 %131) #3, !dbg !35 + tail call void @llvm.nvvm.barrier0(), !dbg !35 + %132 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !35 + %133 = fadd float %132, 0.000000e+00, !dbg !37 + %134 = fadd float %61, %49, !dbg !41 + %135 = fadd float %62, %50, !dbg !41 + %136 = fadd float %63, %51, !dbg !41 + %137 = fadd float %64, %52, !dbg !41 + %138 = fsub float %134, %68, !dbg !42 + %139 = fsub float %135, %68, !dbg !42 + %140 = fsub float %136, %68, !dbg !42 + %141 = fsub float %137, %68, !dbg !42 + %142 = fmul float %138, %74, !dbg !43 + %143 = fmul float %139, %74, !dbg !43 + %144 = fmul float %140, %74, !dbg !43 + %145 = fmul float %141, %74, !dbg !43 + %146 = fmul float %91, %142, !dbg !44 + %147 = fmul float %92, %143, !dbg !44 + %148 = fmul float %93, %144, !dbg !44 + %149 = fmul float %94, %145, !dbg !44 + tail call void @llvm.nvvm.barrier0(), !dbg !45 + %150 = fadd float %146, %147, !dbg !47 + %151 = fadd float %148, %150, !dbg !47 + %152 = fadd float %149, %151, !dbg !47 + %153 = bitcast float %152 to i32, !dbg !45 + %154 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %153, i32 16, i32 31), !dbg !45 + %155 = bitcast i32 %154 to float, !dbg !45 + %156 = fadd float %152, %155, !dbg !47 + %157 = bitcast float %156 to i32, !dbg !45 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 8, i32 31), !dbg !45 + %159 = bitcast i32 %158 to float, !dbg !45 + %160 = fadd float %156, %159, !dbg !47 + %161 = bitcast float %160 to i32, !dbg !45 + %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 4, i32 31), !dbg !45 + %163 = bitcast i32 %162 to float, !dbg !45 + %164 = fadd float %160, %163, !dbg !47 + %165 = bitcast float %164 to i32, !dbg !45 + %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 2, i32 31), !dbg !45 + %167 = bitcast i32 %166 to float, !dbg !45 + %168 = fadd float %164, %167, !dbg !47 + %169 = bitcast float %168 to i32, !dbg !45 + %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 1, i32 31), !dbg !45 + %171 = bitcast i32 %170 to float, !dbg !45 + %172 = fadd float %168, %171, !dbg !47 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %172, i1 %118) #3, !dbg !45 + tail call void @llvm.nvvm.barrier0(), !dbg !45 + %173 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %123, i1 %121) #3, !dbg !45 + %174 = bitcast float %173 to i32, !dbg !45 + %175 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %174, i32 1, i32 31), !dbg !45 + %176 = bitcast i32 %175 to float, !dbg !45 + %177 = fadd float %173, %176, !dbg !47 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %123, float %177, i1 %131) #3, !dbg !45 + tail call void @llvm.nvvm.barrier0(), !dbg !45 + %178 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !45 + %179 = fadd float %178, 0.000000e+00, !dbg !50 + %180 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %74, float 2.560000e+02) #3, !dbg !52 + %181 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %76, float 2.560000e+02) #3, !dbg !52 + %182 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %78, float 2.560000e+02) #3, !dbg !52 + %183 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %80, float 2.560000e+02) #3, !dbg !52 + %184 = fmul float %91, 2.560000e+02, !dbg !53 + %185 = fmul float %92, 2.560000e+02, !dbg !53 + %186 = fmul float %93, 2.560000e+02, !dbg !53 + %187 = fmul float %94, 2.560000e+02, !dbg !53 + %188 = fsub float %184, %133, !dbg !54 + %189 = fsub float %185, %133, !dbg !54 + %190 = fsub float %186, %133, !dbg !54 + %191 = fsub float %187, %133, !dbg !54 + %192 = fmul float %142, %179, !dbg !55 + %193 = fmul float %143, %179, !dbg !55 + %194 = fmul float %144, %179, !dbg !55 + %195 = fmul float %145, %179, !dbg !55 + %196 = fsub float %188, %192, !dbg !56 + %197 = fsub float %189, %193, !dbg !56 + %198 = fsub float %190, %194, !dbg !56 + %199 = fsub float %191, %195, !dbg !56 + %200 = fmul float %180, %196, !dbg !57 + %201 = fmul float %180, %197, !dbg !57 + %202 = fmul float %180, %198, !dbg !57 + %203 = fmul float %180, %199, !dbg !57 + %204 = fadd float %200, %87, !dbg !58 + %205 = fadd float %201, %88, !dbg !58 + %206 = fadd float %202, %89, !dbg !58 + %207 = fadd float %203, %90, !dbg !58 + %208 = bitcast float %204 to i32, !dbg !59 + %209 = bitcast float %205 to i32, !dbg !59 + %210 = bitcast float %206 to i32, !dbg !59 + %211 = bitcast float %207 to i32, !dbg !59 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %208, i32 %209, i32 %210, i32 %211, ptr addrspace(1) %81, i1 true) #3, !dbg !59 + %212 = getelementptr i16, ptr addrspace(1) %7, i64 %19, !dbg !60 + %213 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %204) #3, !dbg !61 + %214 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %205) #3, !dbg !61 + %215 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %206) #3, !dbg !61 + %216 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %207) #3, !dbg !61 + %217 = insertelement <2 x i16> undef, i16 %213, i64 0, !dbg !61 + %218 = insertelement <2 x i16> %217, i16 %214, i64 1, !dbg !61 + %219 = bitcast <2 x i16> %218 to i32, !dbg !61 + %220 = insertelement <2 x i16> undef, i16 %215, i64 0, !dbg !61 + %221 = insertelement <2 x i16> %220, i16 %216, i64 1, !dbg !61 + %222 = bitcast <2 x i16> %221 to i32, !dbg !61 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %219, i32 %222, ptr addrspace(1) %212, i1 true) #3, !dbg !61 + ret void, !dbg !62 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "cfhjzwujbd4bpel57x4hxw7d3m3qqfwrjg6bfe6e4wk2cyh77u45.py", directory: "/tmp/torchinductor_root/fh") +!3 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"maxntidx", i32 64} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8de9de", linkageName: "triton__0d1d2d3d4d5d6d7d8de9de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 26, column: 26, scope: !5) +!9 = !DILocation(line: 23, column: 28, scope: !5) +!10 = !DILocation(line: 30, column: 40, scope: !5) +!11 = !DILocation(line: 30, column: 36, scope: !5) +!12 = !DILocation(line: 30, column: 30, scope: !5) +!13 = !DILocation(line: 30, column: 46, scope: !5) +!14 = !DILocation(line: 30, column: 67, scope: !5) +!15 = !DILocation(line: 31, column: 30, scope: !5) +!16 = !DILocation(line: 31, column: 35, scope: !5) +!17 = !DILocation(line: 32, column: 30, scope: !5) +!18 = !DILocation(line: 32, column: 46, scope: !5) +!19 = !DILocation(line: 33, column: 30, scope: !5) +!20 = !DILocation(line: 33, column: 46, scope: !5) +!21 = !DILocation(line: 33, column: 67, scope: !5) +!22 = !DILocation(line: 34, column: 31, scope: !5) +!23 = !DILocation(line: 34, column: 36, scope: !5) +!24 = !DILocation(line: 35, column: 31, scope: !5) +!25 = !DILocation(line: 35, column: 36, scope: !5) +!26 = !DILocation(line: 36, column: 35, scope: !5) +!27 = !DILocation(line: 36, column: 51, scope: !5) +!28 = !DILocation(line: 38, column: 18, scope: !5) +!29 = !DILocation(line: 233, column: 15, scope: !30, inlinedAt: !33) +!30 = distinct !DILexicalBlockFile(scope: !32, file: !31, discriminator: 0) +!31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!32 = distinct !DILexicalBlockFile(scope: !5, file: !31, discriminator: 0) +!33 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !34) +!34 = !DILocation(line: 41, column: 57, scope: !30) +!35 = !DILocation(line: 243, column: 36, scope: !32, inlinedAt: !36) +!36 = !DILocation(line: 41, column: 57, scope: !32) +!37 = !DILocation(line: 8, column: 15, scope: !38, inlinedAt: !40) +!38 = distinct !DILexicalBlockFile(scope: !5, file: !39, discriminator: 0) +!39 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!40 = !DILocation(line: 41, column: 44, scope: !38) +!41 = !DILocation(line: 43, column: 19, scope: !5) +!42 = !DILocation(line: 44, column: 20, scope: !5) +!43 = !DILocation(line: 45, column: 20, scope: !5) +!44 = !DILocation(line: 46, column: 19, scope: !5) +!45 = !DILocation(line: 243, column: 36, scope: !32, inlinedAt: !46) +!46 = !DILocation(line: 49, column: 59, scope: !32) +!47 = !DILocation(line: 233, column: 15, scope: !30, inlinedAt: !48) +!48 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !49) +!49 = !DILocation(line: 49, column: 59, scope: !30) +!50 = !DILocation(line: 8, column: 15, scope: !38, inlinedAt: !51) +!51 = !DILocation(line: 49, column: 45, scope: !38) +!52 = !DILocation(line: 51, column: 20, scope: !5) +!53 = !DILocation(line: 52, column: 19, scope: !5) +!54 = !DILocation(line: 53, column: 20, scope: !5) +!55 = !DILocation(line: 54, column: 20, scope: !5) +!56 = !DILocation(line: 55, column: 20, scope: !5) +!57 = !DILocation(line: 56, column: 20, scope: !5) +!58 = !DILocation(line: 57, column: 20, scope: !5) +!59 = !DILocation(line: 59, column: 51, scope: !5) +!60 = !DILocation(line: 60, column: 25, scope: !5) +!61 = !DILocation(line: 60, column: 48, scope: !5) +!62 = !DILocation(line: 60, column: 4, scope: !5) diff --git a/.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ttir b/.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..7b6a331fe45289d067b16eea74d24da2bce39bc9 --- /dev/null +++ b/.triton/dump/a99a310eb97b8a71c8c7102625cff179/triton_.ttir @@ -0,0 +1,77 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant dense<0.000000e+00> : tensor<256xf32> + %cst_2 = arith.constant dense<2.560000e+02> : tensor<256xf32> + %cst_3 = arith.constant dense<2.560000e+02> : tensor<1xf32> + %cst_4 = arith.constant dense<256> : tensor<256xi32> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32> + %5 = arith.addi %1, %4 : tensor<256xi32> + %6 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %9 = arith.extf %8 : tensor<256xbf16> to tensor<256xf32> + %10 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr> + %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr>, tensor<256xi32> + %12 = tt.load %11, %2, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> + %13 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %15 = tt.load %14, %2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %16 = tt.splat %arg4 : (!tt.ptr) -> tensor<256x!tt.ptr> + %17 = tt.addptr %16, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %18 = tt.load %17, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %19 = arith.extf %18 : tensor<256xbf16> to tensor<256xf32> + %20 = tt.addptr %arg5, %0 : !tt.ptr, i32 + %21 = tt.splat %20 : (!tt.ptr) -> tensor<1x!tt.ptr> + %22 = tt.load %21 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %23 = tt.addptr %arg6, %0 : !tt.ptr, i32 + %24 = tt.splat %23 : (!tt.ptr) -> tensor<1x!tt.ptr> + %25 = tt.load %24 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %26 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr> + %27 = tt.addptr %26, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %28 = tt.load %27, %2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %29 = arith.mulf %9, %12 : tensor<256xf32> + %30 = arith.select %2, %29, %cst_1 : tensor<256xi1>, tensor<256xf32> + %31 = "tt.reduce"(%30) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %55 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %55 : f32 + }) : (tensor<256xf32>) -> f32 + %32 = arith.addf %31, %cst_0 : f32 + %33 = arith.addf %15, %19 : tensor<256xf32> + %34 = tt.broadcast %22 : (tensor<1xf32>) -> tensor<256xf32> + %35 = arith.subf %33, %34 : tensor<256xf32> + %36 = tt.broadcast %25 : (tensor<1xf32>) -> tensor<256xf32> + %37 = arith.mulf %35, %36 : tensor<256xf32> + %38 = arith.mulf %29, %37 : tensor<256xf32> + %39 = arith.select %2, %38, %cst_1 : tensor<256xi1>, tensor<256xf32> + %40 = "tt.reduce"(%39) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %55 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %55 : f32 + }) : (tensor<256xf32>) -> f32 + %41 = arith.addf %40, %cst_0 : f32 + %42 = arith.divf %25, %cst_3 : tensor<1xf32> + %43 = arith.mulf %29, %cst_2 : tensor<256xf32> + %44 = tt.splat %32 : (f32) -> tensor<256xf32> + %45 = arith.subf %43, %44 : tensor<256xf32> + %46 = tt.splat %41 : (f32) -> tensor<256xf32> + %47 = arith.mulf %37, %46 : tensor<256xf32> + %48 = arith.subf %45, %47 : tensor<256xf32> + %49 = tt.broadcast %42 : (tensor<1xf32>) -> tensor<256xf32> + %50 = arith.mulf %49, %48 : tensor<256xf32> + %51 = arith.addf %28, %50 : tensor<256xf32> + tt.store %27, %51, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + %52 = tt.splat %arg7 : (!tt.ptr) -> tensor<256x!tt.ptr> + %53 = tt.addptr %52, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %54 = arith.truncf %51 : tensor<256xf32> to tensor<256xbf16> + tt.store %53, %54, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16> + tt.return + } +} diff --git a/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.cubin b/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..c447423b8f2c1a330ae40f14eaf726654dfb1a72 Binary files /dev/null and b/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.cubin differ diff --git a/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.llir b/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..177af98319e61ba549a899d48b265ba4d24f9dcd --- /dev/null +++ b/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.llir @@ -0,0 +1,379 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_0 = internal constant [38 x i8] c"" +@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257" +@global_smem = external addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr + +define void @triton__0d1d2d3d4d5d6d7d8de9de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, i32 %8, i32 %9) local_unnamed_addr !dbg !7 { + %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %urem = shl i32 %11, 2, !dbg !10 + %12 = and i32 %urem, 252, !dbg !10 + %13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11 + %14 = srem i32 %13, 512, !dbg !12 + %15 = sext i32 %13 to i64, !dbg !13 + %16 = getelementptr i64, ptr addrspace(1) %0, i64 %15, !dbg !13 + %17 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !14 + %18 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !14 + %19 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !14 + %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !14 + %21 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %16, i1 true) #6, !dbg !14 + %22 = shl nsw i32 %14, 8, !dbg !15 + %23 = or i32 %22, %12, !dbg !16 + %24 = sext i32 %23 to i64, !dbg !17 + %25 = getelementptr float, ptr addrspace(1) %2, i64 %24, !dbg !17 + %26 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !18 + %27 = shl i32 %13, 8, !dbg !19 + %28 = or i32 %27, %12, !dbg !20 + %29 = sext i32 %28 to i64, !dbg !21 + %30 = getelementptr i16, ptr addrspace(1) %3, i64 %29, !dbg !21 + %31 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %30, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !22 + %32 = extractvalue { i32, i32 } %31, 0, !dbg !22 + %33 = extractvalue { i32, i32 } %31, 1, !dbg !22 + %34 = trunc i32 %32 to i16, !dbg !22 + %extelt.offset = lshr i32 %32, 16, !dbg !22 + %35 = trunc i32 %extelt.offset to i16, !dbg !22 + %36 = trunc i32 %33 to i16, !dbg !22 + %extelt.offset1 = lshr i32 %33, 16, !dbg !22 + %37 = trunc i32 %extelt.offset1 to i16, !dbg !22 + %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #6, !dbg !23 + %39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #6, !dbg !23 + %40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #6, !dbg !23 + %41 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %37) #6, !dbg !23 + %42 = getelementptr i16, ptr addrspace(1) %4, i64 %29, !dbg !24 + %43 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %42, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !25 + %44 = extractvalue { i32, i32 } %43, 0, !dbg !25 + %45 = extractvalue { i32, i32 } %43, 1, !dbg !25 + %46 = trunc i32 %44 to i16, !dbg !25 + %extelt.offset2 = lshr i32 %44, 16, !dbg !25 + %47 = trunc i32 %extelt.offset2 to i16, !dbg !25 + %48 = trunc i32 %45 to i16, !dbg !25 + %extelt.offset3 = lshr i32 %45, 16, !dbg !25 + %49 = trunc i32 %extelt.offset3 to i16, !dbg !25 + %50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #6, !dbg !26 + %51 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #6, !dbg !26 + %52 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %48) #6, !dbg !26 + %53 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #6, !dbg !26 + %54 = zext nneg i32 %12 to i64, !dbg !27 + %55 = getelementptr float, ptr addrspace(1) %5, i64 %54, !dbg !27 + %56 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %55, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28 + %57 = add i64 %21, 50257, !dbg !29 + %58 = icmp slt i64 %21, 0, !dbg !30 + %59 = select i1 %58, i64 %57, i64 %21, !dbg !31 + %60 = icmp ugt i64 %59, 50256, !dbg !32 + br i1 %60, label %61, label %62, !dbg !33 + +61: ; preds = %10 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !33 + br label %62, !dbg !33 + +62: ; preds = %61, %10 + %63 = icmp slt i64 %17, 0, !dbg !30 + %64 = extractvalue { i32, i32, i32, i32 } %26, 3, !dbg !18 + %65 = bitcast i32 %64 to float, !dbg !18 + %66 = extractvalue { i32, i32, i32, i32 } %26, 2, !dbg !18 + %67 = bitcast i32 %66 to float, !dbg !18 + %68 = extractvalue { i32, i32, i32, i32 } %26, 1, !dbg !18 + %69 = extractvalue { i32, i32, i32, i32 } %26, 0, !dbg !18 + %70 = lshr i32 %11, 5, !dbg !10 + %71 = and i32 %70, 1, !dbg !10 + %72 = and i32 %11, 31, !dbg !10 + %73 = shl i64 %17, 8, !dbg !34 + %74 = add i64 %73, 12865792, !dbg !34 + %75 = select i1 %63, i64 %74, i64 %73, !dbg !34 + %76 = or i64 %75, %54, !dbg !35 + %77 = getelementptr float, ptr addrspace(1) %1, i64 %76, !dbg !36 + %78 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %77, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37 + %79 = extractvalue { i32, i32, i32, i32 } %78, 0, !dbg !37 + %80 = extractvalue { i32, i32, i32, i32 } %78, 1, !dbg !37 + %81 = extractvalue { i32, i32, i32, i32 } %78, 2, !dbg !37 + %82 = extractvalue { i32, i32, i32, i32 } %78, 3, !dbg !37 + %83 = bitcast i32 %81 to float, !dbg !37 + %84 = bitcast i32 %82 to float, !dbg !37 + %85 = fadd float %67, %83, !dbg !38 + %86 = fadd float %65, %84, !dbg !38 + %87 = fadd float %40, %85, !dbg !39 + %88 = fadd float %41, %86, !dbg !39 + %89 = insertelement <2 x i32> poison, i32 %69, i64 0, !dbg !18 + %90 = insertelement <2 x i32> %89, i32 %68, i64 1, !dbg !18 + %91 = bitcast <2 x i32> %90 to <2 x float>, !dbg !18 + %92 = insertelement <2 x i32> poison, i32 %79, i64 0, !dbg !37 + %93 = insertelement <2 x i32> %92, i32 %80, i64 1, !dbg !37 + %94 = bitcast <2 x i32> %93 to <2 x float>, !dbg !37 + %95 = fadd <2 x float> %91, %94, !dbg !38 + %96 = insertelement <2 x float> poison, float %38, i64 0, !dbg !39 + %97 = insertelement <2 x float> %96, float %39, i64 1, !dbg !39 + %98 = fadd <2 x float> %97, %95, !dbg !39 + %99 = insertelement <2 x float> poison, float %50, i64 0, !dbg !40 + %100 = insertelement <2 x float> %99, float %51, i64 1, !dbg !40 + %101 = fadd <2 x float> %100, %98, !dbg !40 + %102 = fadd float %52, %87, !dbg !40 + %103 = fadd float %53, %88, !dbg !40 + %104 = extractelement <2 x float> %101, i64 0, !dbg !41 + %105 = extractelement <2 x float> %101, i64 1, !dbg !41 + %106 = fadd float %104, %105, !dbg !41 + %107 = fadd float %102, %106, !dbg !41 + %108 = fadd float %103, %107, !dbg !41 + %109 = bitcast float %108 to i32, !dbg !47 + %110 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %109, i32 16, i32 31), !dbg !47 + %111 = bitcast i32 %110 to float, !dbg !47 + %112 = fadd float %108, %111, !dbg !41 + %113 = bitcast float %112 to i32, !dbg !47 + %114 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %113, i32 8, i32 31), !dbg !47 + %115 = bitcast i32 %114 to float, !dbg !47 + %116 = fadd float %112, %115, !dbg !41 + %117 = bitcast float %116 to i32, !dbg !47 + %118 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %117, i32 4, i32 31), !dbg !47 + %119 = bitcast i32 %118 to float, !dbg !47 + %120 = fadd float %116, %119, !dbg !41 + %121 = bitcast float %120 to i32, !dbg !47 + %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 2, i32 31), !dbg !47 + %123 = bitcast i32 %122 to float, !dbg !47 + %124 = fadd float %120, %123, !dbg !41 + %125 = bitcast float %124 to i32, !dbg !47 + %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 1, i32 31), !dbg !47 + %127 = bitcast i32 %126 to float, !dbg !47 + %128 = fadd float %124, %127, !dbg !41 + %129 = icmp eq i32 %72, 0, !dbg !47 + %130 = zext nneg i32 %71 to i64, !dbg !47 + %131 = getelementptr float, ptr addrspace(3) @global_smem, i64 %130, !dbg !47 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %131, float %128, i1 %129) #6, !dbg !47 + tail call void @llvm.nvvm.barrier0(), !dbg !47 + %132 = icmp slt i32 %11, 2, !dbg !47 + %133 = sext i32 %11 to i64, !dbg !47 + %134 = getelementptr float, ptr addrspace(3) @global_smem, i64 %133, !dbg !47 + %135 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %134, i1 %132) #6, !dbg !47 + %136 = bitcast float %135 to i32, !dbg !47 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 1, i32 31), !dbg !47 + %138 = bitcast i32 %137 to float, !dbg !47 + %139 = fadd float %135, %138, !dbg !41 + %140 = and i32 %11, 1, !dbg !47 + %141 = icmp eq i32 %140, 0, !dbg !47 + %142 = and i1 %132, %141, !dbg !47 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %134, float %139, i1 %142) #6, !dbg !47 + tail call void @llvm.nvvm.barrier0(), !dbg !47 + %143 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !47 + %144 = fadd float %143, 0.000000e+00, !dbg !49 + %145 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %144, float 2.560000e+02) #6, !dbg !53 + %146 = fsub float %104, %145, !dbg !54 + %147 = fsub float %105, %145, !dbg !54 + %148 = fsub float %102, %145, !dbg !54 + %149 = fsub float %103, %145, !dbg !54 + %150 = fmul float %146, %146, !dbg !55 + %151 = fmul float %147, %147, !dbg !55 + %152 = fmul float %148, %148, !dbg !55 + %153 = fmul float %149, %149, !dbg !55 + tail call void @llvm.nvvm.barrier0(), !dbg !56 + %154 = fadd float %150, %151, !dbg !58 + %155 = fadd float %152, %154, !dbg !58 + %156 = fadd float %153, %155, !dbg !58 + %157 = bitcast float %156 to i32, !dbg !56 + %158 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %157, i32 16, i32 31), !dbg !56 + %159 = bitcast i32 %158 to float, !dbg !56 + %160 = fadd float %156, %159, !dbg !58 + %161 = bitcast float %160 to i32, !dbg !56 + %162 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %161, i32 8, i32 31), !dbg !56 + %163 = bitcast i32 %162 to float, !dbg !56 + %164 = fadd float %160, %163, !dbg !58 + %165 = bitcast float %164 to i32, !dbg !56 + %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 4, i32 31), !dbg !56 + %167 = bitcast i32 %166 to float, !dbg !56 + %168 = fadd float %164, %167, !dbg !58 + %169 = bitcast float %168 to i32, !dbg !56 + %170 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %169, i32 2, i32 31), !dbg !56 + %171 = bitcast i32 %170 to float, !dbg !56 + %172 = fadd float %168, %171, !dbg !58 + %173 = bitcast float %172 to i32, !dbg !56 + %174 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %173, i32 1, i32 31), !dbg !56 + %175 = bitcast i32 %174 to float, !dbg !56 + %176 = fadd float %172, %175, !dbg !58 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %131, float %176, i1 %129) #6, !dbg !56 + tail call void @llvm.nvvm.barrier0(), !dbg !56 + %177 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %134, i1 %132) #6, !dbg !56 + %178 = bitcast float %177 to i32, !dbg !56 + %179 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %178, i32 1, i32 31), !dbg !56 + %180 = bitcast i32 %179 to float, !dbg !56 + %181 = fadd float %177, %180, !dbg !58 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %134, float %181, i1 %142) #6, !dbg !56 + tail call void @llvm.nvvm.barrier0(), !dbg !56 + %182 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !56 + %183 = fadd float %182, 0.000000e+00, !dbg !61 + %184 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %183, float 2.560000e+02) #6, !dbg !63 + %185 = fadd float %184, 0x3EE4F8B580000000, !dbg !64 + %186 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !65 + %.not.i = icmp eq i32 %186, 0, !dbg !65 + br i1 %.not.i, label %189, label %187, !dbg !65 + +187: ; preds = %62 + %188 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %185), !dbg !65 + br label %__nv_rsqrtf.exit, !dbg !65 + +189: ; preds = %62 + %190 = tail call float @llvm.nvvm.rsqrt.approx.f(float %185), !dbg !65 + br label %__nv_rsqrtf.exit, !dbg !65 + +__nv_rsqrtf.exit: ; preds = %187, %189 + %.0.i = phi float [ %188, %187 ], [ %190, %189 ], !dbg !65 + %191 = extractvalue { i32, i32, i32, i32 } %56, 0, !dbg !28 + %192 = bitcast i32 %191 to float, !dbg !28 + %193 = extractvalue { i32, i32, i32, i32 } %56, 1, !dbg !28 + %194 = bitcast i32 %193 to float, !dbg !28 + %195 = extractvalue { i32, i32, i32, i32 } %56, 2, !dbg !28 + %196 = bitcast i32 %195 to float, !dbg !28 + %197 = extractvalue { i32, i32, i32, i32 } %56, 3, !dbg !28 + %198 = bitcast i32 %197 to float, !dbg !28 + %199 = fmul float %146, %.0.i, !dbg !66 + %200 = fmul float %147, %.0.i, !dbg !66 + %201 = fmul float %148, %.0.i, !dbg !66 + %202 = fmul float %149, %.0.i, !dbg !66 + %203 = fmul float %199, %192, !dbg !67 + %204 = fmul float %200, %194, !dbg !67 + %205 = fmul float %201, %196, !dbg !67 + %206 = fmul float %202, %198, !dbg !67 + %207 = getelementptr float, ptr addrspace(1) %6, i64 %29, !dbg !68 + %208 = bitcast float %104 to i32, !dbg !69 + %209 = bitcast float %105 to i32, !dbg !69 + %210 = bitcast float %102 to i32, !dbg !69 + %211 = bitcast float %103 to i32, !dbg !69 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %208, i32 %209, i32 %210, i32 %211, ptr addrspace(1) %207, i1 true) #6, !dbg !69 + %212 = getelementptr i16, ptr addrspace(1) %7, i64 %29, !dbg !70 + %213 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %203) #6, !dbg !71 + %214 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %204) #6, !dbg !71 + %215 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %205) #6, !dbg !71 + %216 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %206) #6, !dbg !71 + %217 = insertelement <2 x i16> undef, i16 %213, i64 0, !dbg !71 + %218 = insertelement <2 x i16> %217, i16 %214, i64 1, !dbg !71 + %219 = bitcast <2 x i16> %218 to i32, !dbg !71 + %220 = insertelement <2 x i16> undef, i16 %215, i64 0, !dbg !71 + %221 = insertelement <2 x i16> %220, i16 %216, i64 1, !dbg !71 + %222 = bitcast <2 x i16> %221 to i32, !dbg !71 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %219, i32 %222, ptr addrspace(1) %212, i1 true) #6, !dbg !71 + ret void, !dbg !72 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "cjmwwjkk6clul35jp6rpoowqvjcl7grtlxvbncluwgfnvz5hpoxs.py", directory: "/tmp/torchinductor_root/jm") +!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5d6d7d8de9de, !"maxntidx", i32 64} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8de9de", linkageName: "triton__0d1d2d3d4d5d6d7d8de9de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 26, column: 26, scope: !7) +!11 = !DILocation(line: 23, column: 28, scope: !7) +!12 = !DILocation(line: 30, column: 18, scope: !7) +!13 = !DILocation(line: 31, column: 30, scope: !7) +!14 = !DILocation(line: 31, column: 35, scope: !7) +!15 = !DILocation(line: 32, column: 40, scope: !7) +!16 = !DILocation(line: 32, column: 36, scope: !7) +!17 = !DILocation(line: 32, column: 30, scope: !7) +!18 = !DILocation(line: 32, column: 46, scope: !7) +!19 = !DILocation(line: 33, column: 40, scope: !7) +!20 = !DILocation(line: 33, column: 36, scope: !7) +!21 = !DILocation(line: 33, column: 30, scope: !7) +!22 = !DILocation(line: 33, column: 46, scope: !7) +!23 = !DILocation(line: 33, column: 67, scope: !7) +!24 = !DILocation(line: 34, column: 31, scope: !7) +!25 = !DILocation(line: 34, column: 47, scope: !7) +!26 = !DILocation(line: 34, column: 68, scope: !7) +!27 = !DILocation(line: 35, column: 31, scope: !7) +!28 = !DILocation(line: 35, column: 36, scope: !7) +!29 = !DILocation(line: 36, column: 18, scope: !7) +!30 = !DILocation(line: 37, column: 18, scope: !7) +!31 = !DILocation(line: 38, column: 32, scope: !7) +!32 = !DILocation(line: 39, column: 36, scope: !7) +!33 = !DILocation(line: 39, column: 51, scope: !7) +!34 = !DILocation(line: 40, column: 40, scope: !7) +!35 = !DILocation(line: 40, column: 36, scope: !7) +!36 = !DILocation(line: 40, column: 30, scope: !7) +!37 = !DILocation(line: 40, column: 48, scope: !7) +!38 = !DILocation(line: 41, column: 18, scope: !7) +!39 = !DILocation(line: 43, column: 18, scope: !7) +!40 = !DILocation(line: 45, column: 19, scope: !7) +!41 = !DILocation(line: 233, column: 15, scope: !42, inlinedAt: !45) +!42 = distinct !DILexicalBlockFile(scope: !44, file: !43, discriminator: 0) +!43 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!44 = distinct !DILexicalBlockFile(scope: !7, file: !43, discriminator: 0) +!45 = !DILocation(line: 243, column: 36, scope: !42, inlinedAt: !46) +!46 = !DILocation(line: 50, column: 59, scope: !42) +!47 = !DILocation(line: 243, column: 36, scope: !44, inlinedAt: !48) +!48 = !DILocation(line: 50, column: 59, scope: !44) +!49 = !DILocation(line: 8, column: 15, scope: !50, inlinedAt: !52) +!50 = distinct !DILexicalBlockFile(scope: !7, file: !51, discriminator: 0) +!51 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!52 = !DILocation(line: 50, column: 45, scope: !50) +!53 = !DILocation(line: 53, column: 20, scope: !7) +!54 = !DILocation(line: 54, column: 20, scope: !7) +!55 = !DILocation(line: 55, column: 20, scope: !7) +!56 = !DILocation(line: 243, column: 36, scope: !44, inlinedAt: !57) +!57 = !DILocation(line: 58, column: 59, scope: !44) +!58 = !DILocation(line: 233, column: 15, scope: !42, inlinedAt: !59) +!59 = !DILocation(line: 243, column: 36, scope: !42, inlinedAt: !60) +!60 = !DILocation(line: 58, column: 59, scope: !42) +!61 = !DILocation(line: 8, column: 15, scope: !50, inlinedAt: !62) +!62 = !DILocation(line: 58, column: 45, scope: !50) +!63 = !DILocation(line: 61, column: 20, scope: !7) +!64 = !DILocation(line: 63, column: 20, scope: !7) +!65 = !DILocation(line: 64, column: 26, scope: !7) +!66 = !DILocation(line: 65, column: 20, scope: !7) +!67 = !DILocation(line: 66, column: 20, scope: !7) +!68 = !DILocation(line: 68, column: 25, scope: !7) +!69 = !DILocation(line: 68, column: 48, scope: !7) +!70 = !DILocation(line: 69, column: 25, scope: !7) +!71 = !DILocation(line: 69, column: 48, scope: !7) +!72 = !DILocation(line: 69, column: 4, scope: !7) diff --git a/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ptx b/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..ef3087f068116ddf87f324e58fe7bd66f1781843 --- /dev/null +++ b/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ptx @@ -0,0 +1,871 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6d7d8de9de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5d6d7d8de9de( + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_0, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_1, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_2, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_3, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_4, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_5, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_6, + .param .u64 triton__0d1d2d3d4d5d6d7d8de9de_param_7, + .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_8, + .param .u32 triton__0d1d2d3d4d5d6d7d8de9de_param_9 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<40>; + .reg .b16 %rs<13>; + .reg .b32 %r<121>; + .reg .f32 %f<86>; + .reg .b64 %rd<49>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd6, [triton__0d1d2d3d4d5d6d7d8de9de_param_7]; + ld.param.u64 %rd5, [triton__0d1d2d3d4d5d6d7d8de9de_param_6]; + ld.param.u64 %rd4, [triton__0d1d2d3d4d5d6d7d8de9de_param_1]; + ld.param.u64 %rd21, [triton__0d1d2d3d4d5d6d7d8de9de_param_0]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r1, %tid.x; + shl.b32 %r43, %r1, 2; + ld.param.u64 %rd22, [triton__0d1d2d3d4d5d6d7d8de9de_param_2]; + and.b32 %r44, %r43, 252; + ld.param.u64 %rd23, [triton__0d1d2d3d4d5d6d7d8de9de_param_3]; + ld.param.u64 %rd24, [triton__0d1d2d3d4d5d6d7d8de9de_param_4]; + .loc 1 23 28 + mov.u32 %r10, %ctaid.x; + .loc 1 30 18 + shr.s32 %r45, %r10, 31; + shr.u32 %r46, %r45, 23; + add.s32 %r47, %r10, %r46; + and.b32 %r48, %r47, 16776704; + sub.s32 %r49, %r10, %r48; + ld.param.u64 %rd25, [triton__0d1d2d3d4d5d6d7d8de9de_param_5]; + .loc 1 31 30 + mul.wide.s32 %rd26, %r10, 8; + add.s64 %rd8, %rd21, %rd26; + mov.pred %p24, -1; + .loc 1 31 35 + mov.u64 %rd7, 0x0; + @%p24 ld.global.L1::evict_last.b64 { %rd7 }, [ %rd8 + 0 ]; + mov.u64 %rd9, 0x0; + @%p24 ld.global.L1::evict_last.b64 { %rd9 }, [ %rd8 + 0 ]; + mov.u64 %rd11, 0x0; + @%p24 ld.global.L1::evict_last.b64 { %rd11 }, [ %rd8 + 0 ]; + mov.u64 %rd13, 0x0; + @%p24 ld.global.L1::evict_last.b64 { %rd13 }, [ %rd8 + 0 ]; + mov.u64 %rd15, 0x0; + @%p24 ld.global.L1::evict_last.b64 { %rd15 }, [ %rd8 + 0 ]; + .loc 1 32 40 + shl.b32 %r50, %r49, 8; + .loc 1 32 36 + or.b32 %r51, %r50, %r44; + .loc 1 32 30 + mul.wide.s32 %rd27, %r51, 4; + add.s64 %rd17, %rd22, %rd27; + mov.b32 %r59, 0; + .loc 1 32 46 + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + mov.u32 %r13, 0x0; + mov.u32 %r14, 0x0; + @%p24 ld.global.L1::evict_last.v4.b32 { %r11, %r12, %r13, %r14 }, [ %rd17 + 0 ]; + @!%p24 mov.u32 %r11, %r59; + @!%p24 mov.u32 %r12, %r59; + @!%p24 mov.u32 %r13, %r59; + @!%p24 mov.u32 %r14, %r59; + .loc 1 33 40 + shl.b32 %r52, %r10, 8; + .loc 1 33 36 + or.b32 %r53, %r52, %r44; + .loc 1 33 30 + cvt.s64.s32 %rd2, %r53; + mul.wide.s32 %rd28, %r53, 2; + add.s64 %rd18, %rd23, %rd28; + .loc 1 33 46 + mov.u32 %r19, 0x0; + mov.u32 %r20, 0x0; + @%p24 ld.global.v2.b32 { %r19, %r20 }, [ %rd18 + 0 ]; + @!%p24 mov.u32 %r19, %r59; + @!%p24 mov.u32 %r20, %r59; + cvt.u16.u32 %rs1, %r19; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r19; } + cvt.u16.u32 %rs3, %r20; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r20; } + .loc 1 33 67 + cvt.f32.bf16 %r23, %rs1; + mov.b32 %f1, %r23; + cvt.f32.bf16 %r24, %rs2; + mov.b32 %f2, %r24; + cvt.f32.bf16 %r25, %rs3; + mov.b32 %f3, %r25; + cvt.f32.bf16 %r26, %rs4; + mov.b32 %f4, %r26; + .loc 1 34 31 + add.s64 %rd19, %rd24, %rd28; + .loc 1 34 47 + mov.u32 %r27, 0x0; + mov.u32 %r28, 0x0; + @%p24 ld.global.v2.b32 { %r27, %r28 }, [ %rd19 + 0 ]; + @!%p24 mov.u32 %r27, %r59; + @!%p24 mov.u32 %r28, %r59; + cvt.u16.u32 %rs5, %r27; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r27; } + cvt.u16.u32 %rs7, %r28; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r28; } + .loc 1 34 68 + cvt.f32.bf16 %r31, %rs5; + mov.b32 %f5, %r31; + cvt.f32.bf16 %r32, %rs6; + mov.b32 %f6, %r32; + cvt.f32.bf16 %r33, %rs7; + mov.b32 %f7, %r33; + cvt.f32.bf16 %r34, %rs8; + mov.b32 %f8, %r34; + .loc 1 35 31 + cvt.u64.u32 %rd3, %r44; + mul.wide.u32 %rd29, %r44, 4; + add.s64 %rd20, %rd25, %rd29; + .loc 1 35 36 + mov.u32 %r35, 0x0; + mov.u32 %r36, 0x0; + mov.u32 %r37, 0x0; + mov.u32 %r38, 0x0; + @%p24 ld.global.L1::evict_last.v4.b32 { %r35, %r36, %r37, %r38 }, [ %rd20 + 0 ]; + @!%p24 mov.u32 %r35, %r59; + @!%p24 mov.u32 %r36, %r59; + @!%p24 mov.u32 %r37, %r59; + @!%p24 mov.u32 %r38, %r59; + .loc 1 36 18 + add.s64 %rd30, %rd15, 50257; + .loc 1 37 18 + setp.lt.s64 %p22, %rd15, 0; + .loc 1 38 32 + selp.b64 %rd31, %rd30, %rd15, %p22; + .loc 1 39 36 + setp.lt.u64 %p23, %rd31, 50257; + .loc 1 39 51 + @%p23 bra $L__BB0_2; + mov.u64 %rd32, assertMessage_0; + cvta.global.u64 %rd33, %rd32; + mov.u64 %rd34, assertFile_0; + cvta.global.u64 %rd35, %rd34; + mov.u64 %rd36, assertFunc_0; + cvta.global.u64 %rd37, %rd36; + mov.b32 %r54, 883; + mov.u64 %rd38, 1; + { // callseq 0, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd33; + .param .b64 param1; + st.param.b64 [param1+0], %rd35; + .param .b32 param2; + st.param.b32 [param2+0], %r54; + .param .b64 param3; + st.param.b64 [param3+0], %rd37; + .param .b64 param4; + st.param.b64 [param4+0], %rd38; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 0 +$L__BB0_2: + .loc 1 37 18 + setp.lt.s64 %p37, %rd7, 0; + .loc 1 32 46 + mov.b32 %f9, %r14; + mov.b32 %f10, %r13; + .loc 1 26 26 + and.b32 %r91, %r1, 31; + .loc 1 40 40 + shl.b64 %rd42, %rd7, 8; + add.s64 %rd43, %rd42, 12865792; + selp.b64 %rd44, %rd43, %rd42, %p37; + .loc 1 40 36 + or.b64 %rd45, %rd44, %rd3; + .loc 1 40 30 + shl.b64 %rd46, %rd45, 2; + add.s64 %rd39, %rd4, %rd46; + .loc 1 40 48 + mov.u32 %r55, 0x0; + mov.u32 %r56, 0x0; + mov.u32 %r57, 0x0; + mov.u32 %r58, 0x0; + @%p24 ld.global.v4.b32 { %r55, %r56, %r57, %r58 }, [ %rd39 + 0 ]; + @!%p24 mov.u32 %r55, %r59; + @!%p24 mov.u32 %r56, %r59; + @!%p24 mov.u32 %r57, %r59; + @!%p24 mov.u32 %r58, %r59; + mov.b32 %f11, %r57; + mov.b32 %f12, %r58; + .loc 1 41 18 + add.f32 %f13, %f10, %f11; + add.f32 %f14, %f9, %f12; + .loc 1 43 18 + add.f32 %f15, %f3, %f13; + add.f32 %f16, %f4, %f14; + .loc 1 32 46 + mov.b32 %f17, %r11; + mov.b32 %f18, %r12; + .loc 1 40 48 + mov.b32 %f19, %r55; + mov.b32 %f20, %r56; + .loc 1 41 18 + add.f32 %f21, %f18, %f20; + add.f32 %f22, %f17, %f19; + .loc 1 43 18 + add.f32 %f23, %f1, %f22; + add.f32 %f24, %f2, %f21; + .loc 1 45 19 + add.f32 %f25, %f6, %f24; + mov.b32 %r82, %f25; + add.f32 %f26, %f5, %f23; + add.f32 %f27, %f7, %f15; + add.f32 %f28, %f8, %f16; +$L__tmp1: + .loc 2 233 15 + add.f32 %f29, %f26, %f25; + add.f32 %f30, %f27, %f29; + add.f32 %f31, %f28, %f30; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r92, %f31; + shfl.sync.bfly.b32 %r93, %r92, 16, 31, -1; + mov.b32 %f32, %r93; +$L__tmp3: + .loc 2 233 15 + add.f32 %f33, %f31, %f32; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r94, %f33; + shfl.sync.bfly.b32 %r95, %r94, 8, 31, -1; + mov.b32 %f34, %r95; +$L__tmp5: + .loc 2 233 15 + add.f32 %f35, %f33, %f34; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r96, %f35; + shfl.sync.bfly.b32 %r97, %r96, 4, 31, -1; + mov.b32 %f36, %r97; +$L__tmp7: + .loc 2 233 15 + add.f32 %f37, %f35, %f36; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r98, %f37; + shfl.sync.bfly.b32 %r99, %r98, 2, 31, -1; + mov.b32 %f38, %r99; +$L__tmp9: + .loc 2 233 15 + add.f32 %f39, %f37, %f38; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r100, %f39; + shfl.sync.bfly.b32 %r101, %r100, 1, 31, -1; + mov.b32 %f40, %r101; +$L__tmp11: + .loc 2 233 15 + add.f32 %f41, %f39, %f40; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p29, %r91, 0; + shr.u32 %r102, %r1, 3; + and.b32 %r103, %r102, 4; + mov.u32 %r104, global_smem; + add.s32 %r63, %r104, %r103; + mov.b32 %r64, %f41; + @%p29 st.shared.b32 [ %r63 + 0 ], %r64; + bar.sync 0; + setp.lt.s32 %p30, %r1, 2; + add.s32 %r66, %r104, %r43; + @%p30 ld.shared.b32 %r65, [ %r66 + 0 ]; + mov.b32 %f42, %r65; + shfl.sync.bfly.b32 %r106, %r65, 1, 31, -1; + mov.b32 %f43, %r106; +$L__tmp13: + .loc 2 233 15 + add.f32 %f44, %f42, %f43; +$L__tmp14: + .loc 2 243 36 + and.b32 %r107, %r1, 1; + setp.eq.b32 %p38, %r107, 1; + not.pred %p39, %p38; + and.pred %p31, %p30, %p39; + mov.b32 %r68, %f44; + @%p31 st.shared.b32 [ %r66 + 0 ], %r68; + bar.sync 0; + ld.shared.f32 %f45, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f46, %f45, 0f00000000; +$L__tmp16: + .loc 1 53 20 + mov.b32 %r70, %f46; + mov.b32 %r71, 1132462080; + div.full.f32 %r69, %r70, %r71; + mov.b32 %f47, %r69; + .loc 1 54 20 + sub.f32 %f48, %f26, %f47; + sub.f32 %f49, %f25, %f47; + sub.f32 %f50, %f27, %f47; + sub.f32 %f51, %f28, %f47; + .loc 1 55 20 + mul.f32 %f52, %f49, %f49; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f53, %f48, %f48, %f52; + fma.rn.f32 %f54, %f50, %f50, %f53; + fma.rn.f32 %f55, %f51, %f51, %f54; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r108, %f55; + shfl.sync.bfly.b32 %r109, %r108, 16, 31, -1; + mov.b32 %f56, %r109; +$L__tmp20: + .loc 2 233 15 + add.f32 %f57, %f55, %f56; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r110, %f57; + shfl.sync.bfly.b32 %r111, %r110, 8, 31, -1; + mov.b32 %f58, %r111; +$L__tmp22: + .loc 2 233 15 + add.f32 %f59, %f57, %f58; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r112, %f59; + shfl.sync.bfly.b32 %r113, %r112, 4, 31, -1; + mov.b32 %f60, %r113; +$L__tmp24: + .loc 2 233 15 + add.f32 %f61, %f59, %f60; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r114, %f61; + shfl.sync.bfly.b32 %r115, %r114, 2, 31, -1; + mov.b32 %f62, %r115; +$L__tmp26: + .loc 2 233 15 + add.f32 %f63, %f61, %f62; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r116, %f63; + shfl.sync.bfly.b32 %r117, %r116, 1, 31, -1; + mov.b32 %f64, %r117; +$L__tmp28: + .loc 2 233 15 + add.f32 %f65, %f63, %f64; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r73, %f65; + @%p29 st.shared.b32 [ %r63 + 0 ], %r73; + bar.sync 0; + @%p30 ld.shared.b32 %r74, [ %r66 + 0 ]; + mov.b32 %f66, %r74; + shfl.sync.bfly.b32 %r118, %r74, 1, 31, -1; + mov.b32 %f67, %r118; +$L__tmp30: + .loc 2 233 15 + add.f32 %f68, %f66, %f67; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r77, %f68; + @%p31 st.shared.b32 [ %r66 + 0 ], %r77; + bar.sync 0; + ld.shared.f32 %f69, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f70, %f69, 0f00000000; +$L__tmp33: + .loc 1 61 20 + mov.b32 %r79, %f70; + div.full.f32 %r78, %r79, %r71; + mov.b32 %f71, %r78; + .loc 1 63 20 + add.f32 %f72, %f71, 0f3727C5AC; + .loc 1 64 26 + rsqrt.approx.ftz.f32 %f73, %f72; + .loc 1 35 36 + mov.b32 %f74, %r35; + mov.b32 %f75, %r36; + mov.b32 %f76, %r37; + mov.b32 %f77, %r38; + .loc 1 65 20 + mul.f32 %f78, %f48, %f73; + mul.f32 %f79, %f49, %f73; + mul.f32 %f80, %f50, %f73; + mul.f32 %f81, %f51, %f73; + .loc 1 66 20 + mul.f32 %f82, %f78, %f74; + mul.f32 %f83, %f79, %f75; + mul.f32 %f84, %f80, %f76; + mul.f32 %f85, %f81, %f77; + .loc 1 68 25 + shl.b64 %rd47, %rd2, 2; + add.s64 %rd40, %rd5, %rd47; + .loc 1 45 19 + mov.b32 %r81, %f26; + .loc 1 68 48 + mov.b32 %r83, %f27; + mov.b32 %r84, %f28; + @%p24 st.global.v4.b32 [ %rd40 + 0 ], { %r81, %r82, %r83, %r84 }; + .loc 1 69 25 + shl.b64 %rd48, %rd2, 1; + add.s64 %rd41, %rd6, %rd48; + .loc 1 69 48 + mov.b32 %r85, %f82; + cvt.rn.bf16.f32 %rs9, %r85; + mov.b32 %r86, %f83; + cvt.rn.bf16.f32 %rs10, %r86; + mov.b32 %r87, %f84; + cvt.rn.bf16.f32 %rs11, %r87; + mov.b32 %r88, %f85; + cvt.rn.bf16.f32 %rs12, %r88; + mov.b32 %r119, {%rs9, %rs10}; + mov.b32 %r120, {%rs11, %rs12}; + @%p24 st.global.v2.b32 [ %rd41 + 0 ], { %r119, %r120 }; + .loc 1 69 4 + ret; +$L__tmp34: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/jm/cjmwwjkk6clul35jp6rpoowqvjcl7grtlxvbncluwgfnvz5hpoxs.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 407 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 106 +.b8 109 +.b8 119 +.b8 119 +.b8 106 +.b8 107 +.b8 107 +.b8 54 +.b8 99 +.b8 108 +.b8 117 +.b8 108 +.b8 51 +.b8 53 +.b8 106 +.b8 112 +.b8 54 +.b8 114 +.b8 112 +.b8 111 +.b8 111 +.b8 119 +.b8 113 +.b8 118 +.b8 106 +.b8 99 +.b8 108 +.b8 55 +.b8 103 +.b8 114 +.b8 116 +.b8 108 +.b8 120 +.b8 118 +.b8 98 +.b8 110 +.b8 99 +.b8 108 +.b8 117 +.b8 119 +.b8 103 +.b8 102 +.b8 110 +.b8 118 +.b8 122 +.b8 53 +.b8 104 +.b8 112 +.b8 111 +.b8 120 +.b8 115 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 106 +.b8 109 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 101 +.b8 57 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 101 +.b8 57 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 50 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 50 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 50 +.b8 45 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 58 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 58 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 58 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 411 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 101 +.b8 57 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 411 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ttgir b/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..8efe09e06ac45cb16955328477190b028cc24b64 --- /dev/null +++ b/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ttgir @@ -0,0 +1,100 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant dense<0> : tensor<1xi64, #blocked> + %cst_1 = arith.constant dense<50257> : tensor<1xi64, #blocked> + %cst_2 = arith.constant dense<256> : tensor<1xi64, #blocked> + %cst_3 = arith.constant 9.99999974E-6 : f32 + %cst_4 = arith.constant 2.560000e+02 : f32 + %cst_5 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %c512_i32 = arith.constant 512 : i32 + %cst_6 = arith.constant dense<50257> : tensor<1xi64, #blocked1> + %cst_7 = arith.constant dense<0> : tensor<1xi64, #blocked1> + %cst_8 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_9 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.remsi %0, %c512_i32 : i32 + %4 = tt.addptr %arg0, %0 : !tt.ptr, i32 + %5 = tt.splat %4 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %6 = tt.splat %4 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked1> + %7 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked> + %8 = tt.load %6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked1> + %9 = arith.muli %3, %c256_i32 : i32 + %10 = tt.splat %9 : (i32) -> tensor<256xi32, #blocked> + %11 = arith.addi %1, %10 : tensor<256xi32, #blocked> + %12 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %13 = tt.addptr %12, %11 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %14 = tt.load %13, %2, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %15 = arith.muli %0, %c256_i32 : i32 + %16 = tt.splat %15 : (i32) -> tensor<256xi32, #blocked> + %17 = arith.addi %1, %16 : tensor<256xi32, #blocked> + %18 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %19 = tt.addptr %18, %17 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %20 = tt.load %19, %2, %cst_9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %21 = arith.extf %20 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %22 = tt.splat %arg4 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %23 = tt.addptr %22, %17 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %24 = tt.load %23, %2, %cst_9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %25 = arith.extf %24 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %26 = tt.splat %arg5 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %27 = tt.addptr %26, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %28 = tt.load %27, %2, %cst_8 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %29 = arith.addi %7, %cst_1 : tensor<1xi64, #blocked> + %30 = arith.addi %8, %cst_6 : tensor<1xi64, #blocked1> + %31 = arith.cmpi slt, %7, %cst_0 : tensor<1xi64, #blocked> + %32 = arith.cmpi slt, %8, %cst_7 : tensor<1xi64, #blocked1> + %33 = arith.select %31, %29, %7 : tensor<1xi1, #blocked>, tensor<1xi64, #blocked> + %34 = arith.select %32, %30, %8 : tensor<1xi1, #blocked1>, tensor<1xi64, #blocked1> + %35 = arith.cmpi sge, %34, %cst_7 : tensor<1xi64, #blocked1> + %36 = arith.cmpi slt, %34, %cst_6 : tensor<1xi64, #blocked1> + %37 = arith.andi %35, %36 : tensor<1xi1, #blocked1> + tt.assert %37, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<1xi1, #blocked1> + %38 = arith.muli %33, %cst_2 : tensor<1xi64, #blocked> + %39 = tt.broadcast %38 : (tensor<1xi64, #blocked>) -> tensor<256xi64, #blocked> + %40 = arith.extsi %1 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked> + %41 = arith.addi %40, %39 : tensor<256xi64, #blocked> + %42 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %43 = tt.addptr %42, %41 : tensor<256x!tt.ptr, #blocked>, tensor<256xi64, #blocked> + %44 = tt.load %43, %2, %cst_8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %45 = arith.addf %44, %14 : tensor<256xf32, #blocked> + %46 = arith.addf %45, %21 : tensor<256xf32, #blocked> + %47 = arith.addf %46, %25 : tensor<256xf32, #blocked> + %48 = arith.select %2, %47, %cst_8 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %49 = "tt.reduce"(%48) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %69 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %69 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %50 = arith.addf %49, %cst_5 : f32 + %51 = arith.divf %50, %cst_4 : f32 + %52 = tt.splat %51 : (f32) -> tensor<256xf32, #blocked> + %53 = arith.subf %47, %52 : tensor<256xf32, #blocked> + %54 = arith.mulf %53, %53 : tensor<256xf32, #blocked> + %55 = arith.select %2, %54, %cst_8 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %56 = "tt.reduce"(%55) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %69 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %69 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %57 = arith.addf %56, %cst_5 : f32 + %58 = arith.divf %57, %cst_4 : f32 + %59 = arith.addf %58, %cst_3 : f32 + %60 = tt.extern_elementwise %59 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %61 = tt.splat %60 : (f32) -> tensor<256xf32, #blocked> + %62 = arith.mulf %53, %61 : tensor<256xf32, #blocked> + %63 = arith.mulf %62, %28 : tensor<256xf32, #blocked> + %64 = tt.splat %arg6 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %65 = tt.addptr %64, %17 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %65, %47, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %66 = tt.splat %arg7 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %67 = tt.addptr %66, %17 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %68 = arith.truncf %63 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked> + tt.store %67, %68, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ttir b/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..018c4289c9e5587db85c94a81b9460d6b3aedc4c --- /dev/null +++ b/.triton/dump/ab1bd569084d0d03cf386a4687040895/triton_.ttir @@ -0,0 +1,91 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c512_i32 = arith.constant 512 : i32 + %c256_i32 = arith.constant 256 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 2.560000e+02 : f32 + %cst_2 = arith.constant 9.99999974E-6 : f32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> + %cst_4 = arith.constant dense<256> : tensor<1xi64> + %cst_5 = arith.constant dense<50257> : tensor<1xi64> + %cst_6 = arith.constant dense<0> : tensor<1xi64> + %cst_7 = arith.constant dense<256> : tensor<256xi32> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %2 = arith.cmpi slt, %1, %cst_7 : tensor<256xi32> + %3 = arith.remsi %0, %c512_i32 : i32 + %4 = tt.addptr %arg0, %0 : !tt.ptr, i32 + %5 = tt.splat %4 : (!tt.ptr) -> tensor<1x!tt.ptr> + %6 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64> + %7 = arith.muli %3, %c256_i32 : i32 + %8 = tt.splat %7 : (i32) -> tensor<256xi32> + %9 = arith.addi %1, %8 : tensor<256xi32> + %10 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr> + %11 = tt.addptr %10, %9 : tensor<256x!tt.ptr>, tensor<256xi32> + %12 = tt.load %11, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> + %13 = arith.muli %0, %c256_i32 : i32 + %14 = tt.splat %13 : (i32) -> tensor<256xi32> + %15 = arith.addi %1, %14 : tensor<256xi32> + %16 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr> + %17 = tt.addptr %16, %15 : tensor<256x!tt.ptr>, tensor<256xi32> + %18 = tt.load %17, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %19 = arith.extf %18 : tensor<256xbf16> to tensor<256xf32> + %20 = tt.splat %arg4 : (!tt.ptr) -> tensor<256x!tt.ptr> + %21 = tt.addptr %20, %15 : tensor<256x!tt.ptr>, tensor<256xi32> + %22 = tt.load %21, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %23 = arith.extf %22 : tensor<256xbf16> to tensor<256xf32> + %24 = tt.splat %arg5 : (!tt.ptr) -> tensor<256x!tt.ptr> + %25 = tt.addptr %24, %1 : tensor<256x!tt.ptr>, tensor<256xi32> + %26 = tt.load %25, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> + %27 = arith.addi %6, %cst_5 : tensor<1xi64> + %28 = arith.cmpi slt, %6, %cst_6 : tensor<1xi64> + %29 = arith.select %28, %27, %6 : tensor<1xi1>, tensor<1xi64> + %30 = arith.cmpi sge, %29, %cst_6 : tensor<1xi64> + %31 = arith.cmpi slt, %29, %cst_5 : tensor<1xi64> + %32 = arith.andi %30, %31 : tensor<1xi1> + tt.assert %32, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<1xi1> + %33 = arith.muli %29, %cst_4 : tensor<1xi64> + %34 = tt.broadcast %33 : (tensor<1xi64>) -> tensor<256xi64> + %35 = arith.extsi %1 : tensor<256xi32> to tensor<256xi64> + %36 = arith.addi %35, %34 : tensor<256xi64> + %37 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr> + %38 = tt.addptr %37, %36 : tensor<256x!tt.ptr>, tensor<256xi64> + %39 = tt.load %38, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %40 = arith.addf %39, %12 : tensor<256xf32> + %41 = arith.addf %40, %19 : tensor<256xf32> + %42 = arith.addf %41, %23 : tensor<256xf32> + %43 = arith.select %2, %42, %cst_3 : tensor<256xi1>, tensor<256xf32> + %44 = "tt.reduce"(%43) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %64 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %64 : f32 + }) : (tensor<256xf32>) -> f32 + %45 = arith.addf %44, %cst_0 : f32 + %46 = arith.divf %45, %cst_1 : f32 + %47 = tt.splat %46 : (f32) -> tensor<256xf32> + %48 = arith.subf %42, %47 : tensor<256xf32> + %49 = arith.mulf %48, %48 : tensor<256xf32> + %50 = arith.select %2, %49, %cst_3 : tensor<256xi1>, tensor<256xf32> + %51 = "tt.reduce"(%50) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %64 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %64 : f32 + }) : (tensor<256xf32>) -> f32 + %52 = arith.addf %51, %cst_0 : f32 + %53 = arith.divf %52, %cst_1 : f32 + %54 = arith.addf %53, %cst_2 : f32 + %55 = tt.extern_elementwise %54 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %56 = tt.splat %55 : (f32) -> tensor<256xf32> + %57 = arith.mulf %48, %56 : tensor<256xf32> + %58 = arith.mulf %57, %26 : tensor<256xf32> + %59 = tt.splat %arg6 : (!tt.ptr) -> tensor<256x!tt.ptr> + %60 = tt.addptr %59, %15 : tensor<256x!tt.ptr>, tensor<256xi32> + tt.store %60, %42, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + %61 = tt.splat %arg7 : (!tt.ptr) -> tensor<256x!tt.ptr> + %62 = tt.addptr %61, %15 : tensor<256x!tt.ptr>, tensor<256xi32> + %63 = arith.truncf %58 : tensor<256xf32> to tensor<256xbf16> + tt.store %62, %63, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16> + tt.return + } +} diff --git a/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.llir b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..0edc7601ba8264f81618c33004bdafd57004157a --- /dev/null +++ b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.llir @@ -0,0 +1,384 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 { + %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %7 = and i32 %6, 31, !dbg !8 + %8 = lshr i32 %6, 5, !dbg !8 + %9 = shl i32 %6, 2, !dbg !8 + %10 = and i32 %9, 60, !dbg !8 + %11 = and i32 %8, 7, !dbg !9 + %12 = lshr i32 %7, 4, !dbg !9 + %13 = shl nuw nsw i32 %11, 1, !dbg !9 + %14 = or i32 %13, %12, !dbg !9 + %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10 + %16 = shl i32 %15, 6, !dbg !11 + %17 = or i32 %16, %10, !dbg !12 + %18 = shl nuw nsw i32 %14, 17, !dbg !13 + %19 = shl nuw nsw i32 %14, 17, !dbg !13 + %20 = or i32 %19, 2097152, !dbg !13 + %21 = shl nuw nsw i32 %14, 17, !dbg !13 + %22 = or i32 %21, 4194304, !dbg !13 + %23 = shl nuw nsw i32 %14, 17, !dbg !13 + %24 = or i32 %23, 6291456, !dbg !13 + %25 = add i32 %18, %17, !dbg !14 + %26 = add i32 %20, %17, !dbg !14 + %27 = add i32 %22, %17, !dbg !14 + %28 = add i32 %24, %17, !dbg !14 + %29 = sext i32 %25 to i64, !dbg !15 + %30 = getelementptr float, ptr addrspace(1) %0, i64 %29, !dbg !15 + %31 = sext i32 %26 to i64, !dbg !15 + %32 = getelementptr float, ptr addrspace(1) %0, i64 %31, !dbg !15 + %33 = sext i32 %27 to i64, !dbg !15 + %34 = getelementptr float, ptr addrspace(1) %0, i64 %33, !dbg !15 + %35 = sext i32 %28 to i64, !dbg !15 + %36 = getelementptr float, ptr addrspace(1) %0, i64 %35, !dbg !15 + %37 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %30, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %38 = extractvalue { i32, i32, i32, i32 } %37, 0, !dbg !16 + %39 = extractvalue { i32, i32, i32, i32 } %37, 1, !dbg !16 + %40 = extractvalue { i32, i32, i32, i32 } %37, 2, !dbg !16 + %41 = extractvalue { i32, i32, i32, i32 } %37, 3, !dbg !16 + %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %32, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !16 + %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !16 + %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !16 + %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !16 + %47 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %34, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %48 = extractvalue { i32, i32, i32, i32 } %47, 0, !dbg !16 + %49 = extractvalue { i32, i32, i32, i32 } %47, 1, !dbg !16 + %50 = extractvalue { i32, i32, i32, i32 } %47, 2, !dbg !16 + %51 = extractvalue { i32, i32, i32, i32 } %47, 3, !dbg !16 + %52 = bitcast i32 %48 to float, !dbg !16 + %53 = bitcast i32 %49 to float, !dbg !16 + %54 = bitcast i32 %50 to float, !dbg !16 + %55 = bitcast i32 %51 to float, !dbg !16 + %56 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %36, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %57 = extractvalue { i32, i32, i32, i32 } %56, 0, !dbg !16 + %58 = extractvalue { i32, i32, i32, i32 } %56, 1, !dbg !16 + %59 = extractvalue { i32, i32, i32, i32 } %56, 2, !dbg !16 + %60 = extractvalue { i32, i32, i32, i32 } %56, 3, !dbg !16 + %61 = fadd float %52, 0.000000e+00, !dbg !17 + %62 = fadd float %53, 0.000000e+00, !dbg !17 + %63 = fadd float %54, 0.000000e+00, !dbg !17 + %64 = fadd float %55, 0.000000e+00, !dbg !17 + %65 = or i32 %14, 112, !dbg !18 + %66 = icmp ult i32 %65, 120, !dbg !19 + %67 = shl nuw nsw i32 %14, 17, !dbg !13 + %68 = or i32 %67, 8388608, !dbg !13 + %69 = shl nuw nsw i32 %14, 17, !dbg !13 + %70 = or i32 %69, 10485760, !dbg !13 + %71 = shl nuw nsw i32 %14, 17, !dbg !13 + %72 = or i32 %71, 12582912, !dbg !13 + %73 = shl nuw nsw i32 %65, 17, !dbg !13 + %74 = add i32 %68, %17, !dbg !14 + %75 = add i32 %70, %17, !dbg !14 + %76 = add i32 %72, %17, !dbg !14 + %77 = add i32 %73, %17, !dbg !14 + %78 = sext i32 %74 to i64, !dbg !15 + %79 = getelementptr float, ptr addrspace(1) %0, i64 %78, !dbg !15 + %80 = sext i32 %75 to i64, !dbg !15 + %81 = getelementptr float, ptr addrspace(1) %0, i64 %80, !dbg !15 + %82 = sext i32 %76 to i64, !dbg !15 + %83 = getelementptr float, ptr addrspace(1) %0, i64 %82, !dbg !15 + %84 = sext i32 %77 to i64, !dbg !15 + %85 = getelementptr float, ptr addrspace(1) %0, i64 %84, !dbg !15 + %86 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %79, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %87 = extractvalue { i32, i32, i32, i32 } %86, 0, !dbg !16 + %88 = extractvalue { i32, i32, i32, i32 } %86, 1, !dbg !16 + %89 = extractvalue { i32, i32, i32, i32 } %86, 2, !dbg !16 + %90 = extractvalue { i32, i32, i32, i32 } %86, 3, !dbg !16 + %91 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %81, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %92 = extractvalue { i32, i32, i32, i32 } %91, 0, !dbg !16 + %93 = extractvalue { i32, i32, i32, i32 } %91, 1, !dbg !16 + %94 = extractvalue { i32, i32, i32, i32 } %91, 2, !dbg !16 + %95 = extractvalue { i32, i32, i32, i32 } %91, 3, !dbg !16 + %96 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %83, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !16 + %97 = extractvalue { i32, i32, i32, i32 } %96, 0, !dbg !16 + %98 = extractvalue { i32, i32, i32, i32 } %96, 1, !dbg !16 + %99 = extractvalue { i32, i32, i32, i32 } %96, 2, !dbg !16 + %100 = extractvalue { i32, i32, i32, i32 } %96, 3, !dbg !16 + %101 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %85, i1 %66, i32 0, i1 %66, i32 0, i1 %66, i32 0, i1 %66, i32 0, i1 %66) #3, !dbg !16 + %102 = extractvalue { i32, i32, i32, i32 } %101, 0, !dbg !16 + %103 = extractvalue { i32, i32, i32, i32 } %101, 1, !dbg !16 + %104 = extractvalue { i32, i32, i32, i32 } %101, 2, !dbg !16 + %105 = extractvalue { i32, i32, i32, i32 } %101, 3, !dbg !16 + %106 = bitcast i32 %102 to float, !dbg !16 + %107 = bitcast i32 %103 to float, !dbg !16 + %108 = bitcast i32 %104 to float, !dbg !16 + %109 = bitcast i32 %105 to float, !dbg !16 + %110 = insertelement <2 x i32> poison, i32 %38, i64 0, !dbg !16 + %111 = insertelement <2 x i32> %110, i32 %43, i64 1, !dbg !16 + %112 = bitcast <2 x i32> %111 to <2 x float>, !dbg !16 + %113 = fadd <2 x float> %112, zeroinitializer, !dbg !17 + %114 = insertelement <2 x i32> poison, i32 %87, i64 0, !dbg !16 + %115 = insertelement <2 x i32> %114, i32 %92, i64 1, !dbg !16 + %116 = bitcast <2 x i32> %115 to <2 x float>, !dbg !16 + %117 = fadd <2 x float> %113, %116, !dbg !17 + %118 = insertelement <2 x i32> poison, i32 %39, i64 0, !dbg !16 + %119 = insertelement <2 x i32> %118, i32 %44, i64 1, !dbg !16 + %120 = bitcast <2 x i32> %119 to <2 x float>, !dbg !16 + %121 = fadd <2 x float> %120, zeroinitializer, !dbg !17 + %122 = insertelement <2 x i32> poison, i32 %88, i64 0, !dbg !16 + %123 = insertelement <2 x i32> %122, i32 %93, i64 1, !dbg !16 + %124 = bitcast <2 x i32> %123 to <2 x float>, !dbg !16 + %125 = fadd <2 x float> %121, %124, !dbg !17 + %126 = insertelement <2 x i32> poison, i32 %40, i64 0, !dbg !16 + %127 = insertelement <2 x i32> %126, i32 %45, i64 1, !dbg !16 + %128 = bitcast <2 x i32> %127 to <2 x float>, !dbg !16 + %129 = fadd <2 x float> %128, zeroinitializer, !dbg !17 + %130 = insertelement <2 x i32> poison, i32 %89, i64 0, !dbg !16 + %131 = insertelement <2 x i32> %130, i32 %94, i64 1, !dbg !16 + %132 = bitcast <2 x i32> %131 to <2 x float>, !dbg !16 + %133 = fadd <2 x float> %129, %132, !dbg !17 + %134 = insertelement <2 x i32> poison, i32 %41, i64 0, !dbg !16 + %135 = insertelement <2 x i32> %134, i32 %46, i64 1, !dbg !16 + %136 = bitcast <2 x i32> %135 to <2 x float>, !dbg !16 + %137 = fadd <2 x float> %136, zeroinitializer, !dbg !17 + %138 = insertelement <2 x i32> poison, i32 %90, i64 0, !dbg !16 + %139 = insertelement <2 x i32> %138, i32 %95, i64 1, !dbg !16 + %140 = bitcast <2 x i32> %139 to <2 x float>, !dbg !16 + %141 = fadd <2 x float> %137, %140, !dbg !17 + %142 = select i1 %66, float %106, float -0.000000e+00, !dbg !17 + %143 = select i1 %66, float %107, float -0.000000e+00, !dbg !17 + %144 = select i1 %66, float %108, float -0.000000e+00, !dbg !17 + %145 = select i1 %66, float %109, float -0.000000e+00, !dbg !17 + %146 = and i32 %6, 63, !dbg !8 + %147 = or i32 %16, %146, !dbg !12 + %148 = or i32 %10, 3, !dbg !20 + %149 = or i32 %10, 2, !dbg !20 + %150 = or i32 %10, 1, !dbg !20 + %shift = shufflevector <2 x float> %117, <2 x float> poison, <2 x i32> , !dbg !24 + %151 = fadd <2 x float> %shift, %117, !dbg !24 + %shift16 = shufflevector <2 x float> %125, <2 x float> poison, <2 x i32> , !dbg !24 + %152 = fadd <2 x float> %shift16, %125, !dbg !24 + %shift17 = shufflevector <2 x float> %133, <2 x float> poison, <2 x i32> , !dbg !24 + %153 = fadd <2 x float> %shift17, %133, !dbg !24 + %shift18 = shufflevector <2 x float> %141, <2 x float> poison, <2 x i32> , !dbg !24 + %154 = fadd <2 x float> %shift18, %141, !dbg !24 + %155 = insertelement <2 x i32> poison, i32 %57, i64 0, !dbg !16 + %156 = insertelement <2 x i32> %155, i32 %97, i64 1, !dbg !16 + %157 = bitcast <2 x i32> %156 to <2 x float>, !dbg !16 + %158 = insertelement <2 x float> , float %61, i64 1, !dbg !17 + %159 = fadd <2 x float> %158, %157, !dbg !17 + %160 = insertelement <2 x float> %151, float %142, i64 0, !dbg !17 + %161 = fadd <2 x float> %159, %160, !dbg !17 + %162 = insertelement <2 x i32> poison, i32 %58, i64 0, !dbg !16 + %163 = insertelement <2 x i32> %162, i32 %98, i64 1, !dbg !16 + %164 = bitcast <2 x i32> %163 to <2 x float>, !dbg !16 + %165 = insertelement <2 x float> , float %62, i64 1, !dbg !17 + %166 = fadd <2 x float> %165, %164, !dbg !17 + %167 = insertelement <2 x float> %152, float %143, i64 0, !dbg !17 + %168 = fadd <2 x float> %166, %167, !dbg !17 + %169 = insertelement <2 x i32> poison, i32 %59, i64 0, !dbg !16 + %170 = insertelement <2 x i32> %169, i32 %99, i64 1, !dbg !16 + %171 = bitcast <2 x i32> %170 to <2 x float>, !dbg !16 + %172 = insertelement <2 x float> , float %63, i64 1, !dbg !17 + %173 = fadd <2 x float> %172, %171, !dbg !17 + %174 = insertelement <2 x float> %153, float %144, i64 0, !dbg !17 + %175 = fadd <2 x float> %173, %174, !dbg !17 + %176 = insertelement <2 x i32> poison, i32 %60, i64 0, !dbg !16 + %177 = insertelement <2 x i32> %176, i32 %100, i64 1, !dbg !16 + %178 = bitcast <2 x i32> %177 to <2 x float>, !dbg !16 + %179 = insertelement <2 x float> , float %64, i64 1, !dbg !17 + %180 = fadd <2 x float> %179, %178, !dbg !17 + %181 = insertelement <2 x float> %154, float %145, i64 0, !dbg !17 + %182 = fadd <2 x float> %180, %181, !dbg !17 + %shift19 = shufflevector <2 x float> %161, <2 x float> poison, <2 x i32> , !dbg !24 + %183 = fadd <2 x float> %161, %shift19, !dbg !24 + %184 = extractelement <2 x float> %183, i64 0, !dbg !24 + %shift20 = shufflevector <2 x float> %168, <2 x float> poison, <2 x i32> , !dbg !24 + %185 = fadd <2 x float> %168, %shift20, !dbg !24 + %186 = extractelement <2 x float> %185, i64 0, !dbg !24 + %shift21 = shufflevector <2 x float> %175, <2 x float> poison, <2 x i32> , !dbg !24 + %187 = fadd <2 x float> %175, %shift21, !dbg !24 + %188 = extractelement <2 x float> %187, i64 0, !dbg !24 + %shift22 = shufflevector <2 x float> %182, <2 x float> poison, <2 x i32> , !dbg !24 + %189 = fadd <2 x float> %182, %shift22, !dbg !24 + %190 = extractelement <2 x float> %189, i64 0, !dbg !24 + %191 = bitcast float %184 to i32, !dbg !20 + %192 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %191, i32 16, i32 31), !dbg !20 + %193 = bitcast i32 %192 to float, !dbg !20 + %194 = fadd float %184, %193, !dbg !24 + %195 = bitcast float %186 to i32, !dbg !20 + %196 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %195, i32 16, i32 31), !dbg !20 + %197 = bitcast i32 %196 to float, !dbg !20 + %198 = fadd float %186, %197, !dbg !24 + %199 = bitcast float %188 to i32, !dbg !20 + %200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 16, i32 31), !dbg !20 + %201 = bitcast i32 %200 to float, !dbg !20 + %202 = fadd float %188, %201, !dbg !24 + %203 = bitcast float %190 to i32, !dbg !20 + %204 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %203, i32 16, i32 31), !dbg !20 + %205 = bitcast i32 %204 to float, !dbg !20 + %206 = fadd float %190, %205, !dbg !24 + %207 = icmp ult i32 %7, 16, !dbg !20 + %208 = shl nuw nsw i32 %10, 3, !dbg !20 + %209 = or i32 %208, %11, !dbg !20 + %210 = zext nneg i32 %209 to i64, !dbg !20 + %211 = getelementptr float, ptr addrspace(3) @global_smem, i64 %210, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %211, float %194, i1 %207) #3, !dbg !20 + %212 = shl nuw nsw i32 %150, 3, !dbg !20 + %213 = or i32 %212, %11, !dbg !20 + %214 = zext nneg i32 %213 to i64, !dbg !20 + %215 = getelementptr float, ptr addrspace(3) @global_smem, i64 %214, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %215, float %198, i1 %207) #3, !dbg !20 + %216 = shl nuw nsw i32 %149, 3, !dbg !20 + %217 = or i32 %216, %11, !dbg !20 + %218 = zext nneg i32 %217 to i64, !dbg !20 + %219 = getelementptr float, ptr addrspace(3) @global_smem, i64 %218, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %219, float %202, i1 %207) #3, !dbg !20 + %220 = shl nuw nsw i32 %148, 3, !dbg !20 + %221 = or i32 %220, %11, !dbg !20 + %222 = zext nneg i32 %221 to i64, !dbg !20 + %223 = getelementptr float, ptr addrspace(3) @global_smem, i64 %222, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %223, float %206, i1 %207) #3, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !20 + %224 = icmp slt i32 %6, 512, !dbg !20 + %225 = sext i32 %6 to i64, !dbg !20 + %226 = getelementptr float, ptr addrspace(3) @global_smem, i64 %225, !dbg !20 + %227 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %226, i1 %224) #3, !dbg !20 + %228 = bitcast float %227 to i32, !dbg !20 + %229 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %228, i32 4, i32 31), !dbg !20 + %230 = bitcast i32 %229 to float, !dbg !20 + %231 = fadd float %227, %230, !dbg !24 + %232 = bitcast float %231 to i32, !dbg !20 + %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 2, i32 31), !dbg !20 + %234 = bitcast i32 %233 to float, !dbg !20 + %235 = fadd float %231, %234, !dbg !24 + %236 = bitcast float %235 to i32, !dbg !20 + %237 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %236, i32 1, i32 31), !dbg !20 + %238 = bitcast i32 %237 to float, !dbg !20 + %239 = fadd float %235, %238, !dbg !24 + %240 = and i32 %6, 7, !dbg !20 + %241 = icmp eq i32 %240, 0, !dbg !20 + %242 = and i1 %224, %241, !dbg !20 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %226, float %239, i1 %242) #3, !dbg !20 + %243 = add i32 %6, 256, !dbg !20 + %244 = sext i32 %243 to i64, !dbg !20 + %245 = getelementptr float, ptr addrspace(3) @global_smem, i64 %244, !dbg !20 + %246 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %245, i1 %224) #3, !dbg !20 + %247 = bitcast float %246 to i32, !dbg !20 + %248 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %247, i32 4, i32 31), !dbg !20 + %249 = bitcast i32 %248 to float, !dbg !20 + %250 = fadd float %246, %249, !dbg !24 + %251 = bitcast float %250 to i32, !dbg !20 + %252 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %251, i32 2, i32 31), !dbg !20 + %253 = bitcast i32 %252 to float, !dbg !20 + %254 = fadd float %250, %253, !dbg !24 + %255 = bitcast float %254 to i32, !dbg !20 + %256 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %255, i32 1, i32 31), !dbg !20 + %257 = bitcast i32 %256 to float, !dbg !20 + %258 = fadd float %254, %257, !dbg !24 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %245, float %258, i1 %242) #3, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !20 + %259 = zext nneg i32 %208 to i64, !dbg !20 + %260 = getelementptr float, ptr addrspace(3) @global_smem, i64 %259, !dbg !20 + %261 = load float, ptr addrspace(3) %260, align 4, !dbg !20 + %262 = zext nneg i32 %212 to i64, !dbg !20 + %263 = getelementptr float, ptr addrspace(3) @global_smem, i64 %262, !dbg !20 + %264 = load float, ptr addrspace(3) %263, align 4, !dbg !20 + %265 = zext nneg i32 %216 to i64, !dbg !20 + %266 = getelementptr float, ptr addrspace(3) @global_smem, i64 %265, !dbg !20 + %267 = load float, ptr addrspace(3) %266, align 4, !dbg !20 + %268 = zext nneg i32 %220 to i64, !dbg !20 + %269 = getelementptr float, ptr addrspace(3) @global_smem, i64 %268, !dbg !20 + %270 = load float, ptr addrspace(3) %269, align 4, !dbg !20 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %271 = zext nneg i32 %10 to i64, !dbg !28 + %272 = getelementptr float, ptr addrspace(3) @global_smem, i64 %271, !dbg !28 + %273 = insertelement <1 x float> undef, float %261, i64 0, !dbg !28 + store <1 x float> %273, ptr addrspace(3) %272, align 4, !dbg !28 + %274 = zext nneg i32 %150 to i64, !dbg !28 + %275 = getelementptr float, ptr addrspace(3) @global_smem, i64 %274, !dbg !28 + %276 = insertelement <1 x float> undef, float %264, i64 0, !dbg !28 + store <1 x float> %276, ptr addrspace(3) %275, align 4, !dbg !28 + %277 = zext nneg i32 %149 to i64, !dbg !28 + %278 = getelementptr float, ptr addrspace(3) @global_smem, i64 %277, !dbg !28 + %279 = insertelement <1 x float> undef, float %267, i64 0, !dbg !28 + store <1 x float> %279, ptr addrspace(3) %278, align 4, !dbg !28 + %280 = zext nneg i32 %148 to i64, !dbg !28 + %281 = getelementptr float, ptr addrspace(3) @global_smem, i64 %280, !dbg !28 + %282 = insertelement <1 x float> undef, float %270, i64 0, !dbg !28 + store <1 x float> %282, ptr addrspace(3) %281, align 4, !dbg !28 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %283 = zext nneg i32 %146 to i64, !dbg !28 + %284 = getelementptr float, ptr addrspace(3) @global_smem, i64 %283, !dbg !28 + %285 = load <1 x float>, ptr addrspace(3) %284, align 4, !dbg !28 + %.frozen = freeze i32 %147 + %286 = sdiv i32 %.frozen, 256, !dbg !29 + %287 = mul i32 %286, 256 + %.decomposed = sub i32 %.frozen, %287 + %288 = sext i32 %286 to i64, !dbg !30 + %289 = getelementptr i64, ptr addrspace(1) %1, i64 %288, !dbg !30 + %290 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %289, i1 true) #3, !dbg !31 + %291 = lshr i64 %290, 54, !dbg !32 + %292 = and i64 %291, 512, !dbg !32 + %293 = add i64 %292, %290, !dbg !32 + %294 = shl i64 %293, 8, !dbg !33 + %295 = sext i32 %.decomposed to i64, !dbg !34 + %296 = getelementptr float, ptr addrspace(1) %2, i64 %294, !dbg !35 + %297 = getelementptr float, ptr addrspace(1) %296, i64 %295, !dbg !35 + %298 = and i32 %6, 192, !dbg !36 + %299 = icmp eq i32 %298, 0, !dbg !36 + %300 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %297, <1 x float> %285, i1 %299) #3, !dbg !36 + ret void, !dbg !37 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i") +!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 22, column: 44, scope: !5) +!9 = !DILocation(line: 24, column: 33, scope: !5) +!10 = !DILocation(line: 21, column: 28, scope: !5) +!11 = !DILocation(line: 21, column: 33, scope: !5) +!12 = !DILocation(line: 22, column: 23, scope: !5) +!13 = !DILocation(line: 31, column: 47, scope: !5) +!14 = !DILocation(line: 31, column: 40, scope: !5) +!15 = !DILocation(line: 31, column: 34, scope: !5) +!16 = !DILocation(line: 31, column: 53, scope: !5) +!17 = !DILocation(line: 34, column: 38, scope: !5) +!18 = !DILocation(line: 28, column: 27, scope: !5) +!19 = !DILocation(line: 29, column: 25, scope: !5) +!20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23) +!21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0) +!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!23 = !DILocation(line: 35, column: 25, scope: !21) +!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26) +!25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0) +!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27) +!27 = !DILocation(line: 35, column: 25, scope: !25) +!28 = !DILocation(line: 35, column: 28, scope: !5) +!29 = !DILocation(line: 36, column: 20, scope: !5) +!30 = !DILocation(line: 38, column: 30, scope: !5) +!31 = !DILocation(line: 38, column: 35, scope: !5) +!32 = !DILocation(line: 41, column: 32, scope: !5) +!33 = !DILocation(line: 45, column: 40, scope: !5) +!34 = !DILocation(line: 45, column: 36, scope: !5) +!35 = !DILocation(line: 45, column: 30, scope: !5) +!36 = !DILocation(line: 45, column: 55, scope: !5) +!37 = !DILocation(line: 45, column: 4, scope: !5) diff --git a/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ptx b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..b0d269ae0f8fadc3a630a2b150507e8a1f4a2279 --- /dev/null +++ b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ptx @@ -0,0 +1,764 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3de4e +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3de4e( + .param .u64 triton__0d1d2d3de4e_param_0, + .param .u64 triton__0d1d2d3de4e_param_1, + .param .u64 triton__0d1d2d3de4e_param_2, + .param .u32 triton__0d1d2d3de4e_param_3, + .param .u32 triton__0d1d2d3de4e_param_4 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<52>; + .reg .b32 %r<152>; + .reg .f32 %f<107>; + .reg .b64 %rd<30>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd12, [triton__0d1d2d3de4e_param_0]; + ld.param.u64 %rd13, [triton__0d1d2d3de4e_param_1]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r84, %tid.x; + and.b32 %r85, %r84, 31; + ld.param.u64 %rd14, [triton__0d1d2d3de4e_param_2]; + shl.b32 %r86, %r84, 2; + and.b32 %r87, %r86, 60; + .loc 1 24 33 + bfe.u32 %r88, %r84, 5, 3; + bfe.u32 %r89, %r84, 4, 1; + shl.b32 %r90, %r88, 1; + or.b32 %r91, %r90, %r89; + .loc 1 21 28 + mov.u32 %r1, %ctaid.x; + .loc 1 21 33 + shl.b32 %r92, %r1, 6; + .loc 1 22 23 + or.b32 %r93, %r92, %r87; + .loc 1 31 47 + shl.b32 %r94, %r91, 17; + .loc 1 31 40 + add.s32 %r95, %r94, %r93; + add.s32 %r96, %r95, 2097152; + add.s32 %r97, %r95, 4194304; + add.s32 %r98, %r95, 6291456; + .loc 1 31 34 + mul.wide.s32 %rd15, %r95, 4; + add.s64 %rd1, %rd12, %rd15; + mul.wide.s32 %rd16, %r96, 4; + add.s64 %rd2, %rd12, %rd16; + mul.wide.s32 %rd17, %r97, 4; + add.s64 %rd3, %rd12, %rd17; + mul.wide.s32 %rd18, %r98, 4; + add.s64 %rd4, %rd12, %rd18; + mov.b32 %r6, 0; + mov.pred %p1, -1; + .loc 1 31 53 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + mov.u32 %r13, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r6; + @!%p1 mov.u32 %r11, %r6; + @!%p1 mov.u32 %r12, %r6; + @!%p1 mov.u32 %r13, %r6; + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + mov.u32 %r20, 0x0; + mov.u32 %r21, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r18, %r19, %r20, %r21 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r6; + @!%p1 mov.u32 %r19, %r6; + @!%p1 mov.u32 %r20, %r6; + @!%p1 mov.u32 %r21, %r6; + mov.b32 %f1, %r18; + mov.b32 %f2, %r19; + mov.b32 %f3, %r20; + mov.b32 %f4, %r21; + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + mov.u32 %r28, 0x0; + mov.u32 %r29, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ]; + @!%p1 mov.u32 %r26, %r6; + @!%p1 mov.u32 %r27, %r6; + @!%p1 mov.u32 %r28, %r6; + @!%p1 mov.u32 %r29, %r6; + .loc 1 34 38 + add.f32 %f5, %f1, 0f00000000; + add.f32 %f6, %f2, 0f00000000; + add.f32 %f7, %f3, 0f00000000; + add.f32 %f8, %f4, 0f00000000; + .loc 1 28 27 + or.b32 %r99, %r91, 112; + .loc 1 29 25 + setp.lt.u32 %p36, %r99, 120; + .loc 1 31 47 + shl.b32 %r100, %r99, 17; + .loc 1 31 40 + add.s32 %r101, %r95, 8388608; + add.s32 %r102, %r95, 10485760; + add.s32 %r103, %r95, 12582912; + add.s32 %r104, %r100, %r93; + .loc 1 31 34 + mul.wide.s32 %rd19, %r101, 4; + add.s64 %rd5, %rd12, %rd19; + mul.wide.s32 %rd20, %r102, 4; + add.s64 %rd6, %rd12, %rd20; + mul.wide.s32 %rd21, %r103, 4; + add.s64 %rd7, %rd12, %rd21; + mul.wide.s32 %rd22, %r104, 4; + add.s64 %rd8, %rd12, %rd22; + .loc 1 31 53 + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + mov.u32 %r36, 0x0; + mov.u32 %r37, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r34, %r35, %r36, %r37 }, [ %rd5 + 0 ]; + @!%p1 mov.u32 %r34, %r6; + @!%p1 mov.u32 %r35, %r6; + @!%p1 mov.u32 %r36, %r6; + @!%p1 mov.u32 %r37, %r6; + mov.u32 %r42, 0x0; + mov.u32 %r43, 0x0; + mov.u32 %r44, 0x0; + mov.u32 %r45, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd6 + 0 ]; + @!%p1 mov.u32 %r42, %r6; + @!%p1 mov.u32 %r43, %r6; + @!%p1 mov.u32 %r44, %r6; + @!%p1 mov.u32 %r45, %r6; + mov.u32 %r50, 0x0; + mov.u32 %r51, 0x0; + mov.u32 %r52, 0x0; + mov.u32 %r53, 0x0; + @%p1 ld.global.L1::evict_first.v4.b32 { %r50, %r51, %r52, %r53 }, [ %rd7 + 0 ]; + @!%p1 mov.u32 %r50, %r6; + @!%p1 mov.u32 %r51, %r6; + @!%p1 mov.u32 %r52, %r6; + @!%p1 mov.u32 %r53, %r6; + mov.u32 %r58, 0x0; + mov.u32 %r59, 0x0; + mov.u32 %r60, 0x0; + mov.u32 %r61, 0x0; + @%p36 ld.global.L1::evict_first.v4.b32 { %r58, %r59, %r60, %r61 }, [ %rd8 + 0 ]; + @!%p36 mov.u32 %r58, %r6; + @!%p36 mov.u32 %r59, %r6; + @!%p36 mov.u32 %r60, %r6; + @!%p36 mov.u32 %r61, %r6; + mov.b32 %f9, %r58; + mov.b32 %f10, %r59; + mov.b32 %f11, %r60; + mov.b32 %f12, %r61; + mov.b32 %f13, %r2; + mov.b32 %f14, %r10; + .loc 1 34 38 + add.f32 %f15, %f14, 0f00000000; + add.f32 %f16, %f13, 0f00000000; + .loc 1 31 53 + mov.b32 %f17, %r42; + mov.b32 %f18, %r34; + .loc 1 34 38 + add.f32 %f19, %f16, %f18; + add.f32 %f20, %f15, %f17; + .loc 1 31 53 + mov.b32 %f21, %r3; + mov.b32 %f22, %r11; + .loc 1 34 38 + add.f32 %f23, %f22, 0f00000000; + add.f32 %f24, %f21, 0f00000000; + .loc 1 31 53 + mov.b32 %f25, %r43; + mov.b32 %f26, %r35; + .loc 1 34 38 + add.f32 %f27, %f24, %f26; + add.f32 %f28, %f23, %f25; + .loc 1 31 53 + mov.b32 %f29, %r4; + mov.b32 %f30, %r12; + .loc 1 34 38 + add.f32 %f31, %f30, 0f00000000; + add.f32 %f32, %f29, 0f00000000; + .loc 1 31 53 + mov.b32 %f33, %r44; + mov.b32 %f34, %r36; + .loc 1 34 38 + add.f32 %f35, %f32, %f34; + add.f32 %f36, %f31, %f33; + .loc 1 31 53 + mov.b32 %f37, %r5; + mov.b32 %f38, %r13; + .loc 1 34 38 + add.f32 %f39, %f38, 0f00000000; + add.f32 %f40, %f37, 0f00000000; + .loc 1 31 53 + mov.b32 %f41, %r45; + mov.b32 %f42, %r37; + .loc 1 34 38 + add.f32 %f43, %f40, %f42; + add.f32 %f44, %f39, %f41; + selp.f32 %f45, %f9, 0f80000000, %p36; + selp.f32 %f46, %f10, 0f80000000, %p36; + selp.f32 %f47, %f11, 0f80000000, %p36; + selp.f32 %f48, %f12, 0f80000000, %p36; + .loc 1 22 44 + and.b32 %r105, %r84, 63; + .loc 1 22 23 + or.b32 %r106, %r92, %r105; +$L__tmp1: + .loc 2 233 15 + add.f32 %f49, %f19, %f20; + add.f32 %f50, %f27, %f28; + add.f32 %f51, %f35, %f36; + add.f32 %f52, %f43, %f44; +$L__tmp2: + .loc 1 31 53 + mov.b32 %f53, %r26; + mov.b32 %f54, %r50; + .loc 1 34 38 + add.f32 %f55, %f5, %f54; + add.f32 %f56, %f53, 0f00000000; + add.f32 %f57, %f55, %f49; + add.f32 %f58, %f56, %f45; + .loc 1 31 53 + mov.b32 %f59, %r27; + mov.b32 %f60, %r51; + .loc 1 34 38 + add.f32 %f61, %f6, %f60; + add.f32 %f62, %f59, 0f00000000; + add.f32 %f63, %f61, %f50; + add.f32 %f64, %f62, %f46; + .loc 1 31 53 + mov.b32 %f65, %r28; + mov.b32 %f66, %r52; + .loc 1 34 38 + add.f32 %f67, %f7, %f66; + add.f32 %f68, %f65, 0f00000000; + add.f32 %f69, %f67, %f51; + add.f32 %f70, %f68, %f47; + .loc 1 31 53 + mov.b32 %f71, %r29; + mov.b32 %f72, %r53; + .loc 1 34 38 + add.f32 %f73, %f8, %f72; + add.f32 %f74, %f71, 0f00000000; + add.f32 %f75, %f73, %f52; + add.f32 %f76, %f74, %f48; +$L__tmp3: + .loc 2 233 15 + add.f32 %f77, %f58, %f57; + add.f32 %f78, %f64, %f63; + add.f32 %f79, %f70, %f69; + add.f32 %f80, %f76, %f75; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r107, %f77; + shfl.sync.bfly.b32 %r108, %r107, 16, 31, -1; + mov.b32 %f81, %r108; +$L__tmp5: + .loc 2 233 15 + add.f32 %f82, %f77, %f81; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r109, %f78; + shfl.sync.bfly.b32 %r110, %r109, 16, 31, -1; + mov.b32 %f83, %r110; +$L__tmp7: + .loc 2 233 15 + add.f32 %f84, %f78, %f83; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r111, %f79; + shfl.sync.bfly.b32 %r112, %r111, 16, 31, -1; + mov.b32 %f85, %r112; +$L__tmp9: + .loc 2 233 15 + add.f32 %f86, %f79, %f85; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r113, %f80; + shfl.sync.bfly.b32 %r114, %r113, 16, 31, -1; + mov.b32 %f87, %r114; +$L__tmp11: + .loc 2 233 15 + add.f32 %f88, %f80, %f87; +$L__tmp12: + .loc 2 243 36 + setp.lt.u32 %p41, %r85, 16; + shl.b32 %r115, %r88, 2; + shl.b32 %r116, %r87, 5; + or.b32 %r117, %r116, %r115; + mov.u32 %r118, global_smem; + add.s32 %r66, %r118, %r117; + mov.b32 %r67, %f82; + @%p41 st.shared.b32 [ %r66 + 0 ], %r67; + or.b32 %r119, %r116, 32; + or.b32 %r120, %r119, %r115; + add.s32 %r68, %r118, %r120; + mov.b32 %r69, %f84; + @%p41 st.shared.b32 [ %r68 + 0 ], %r69; + or.b32 %r121, %r116, 64; + or.b32 %r122, %r121, %r115; + add.s32 %r70, %r118, %r122; + mov.b32 %r71, %f86; + @%p41 st.shared.b32 [ %r70 + 0 ], %r71; + or.b32 %r123, %r116, 96; + or.b32 %r124, %r123, %r115; + add.s32 %r72, %r118, %r124; + mov.b32 %r73, %f88; + @%p41 st.shared.b32 [ %r72 + 0 ], %r73; + bar.sync 0; + setp.lt.s32 %p45, %r84, 512; + add.s32 %r75, %r118, %r86; + @%p45 ld.shared.b32 %r74, [ %r75 + 0 ]; + mov.b32 %f89, %r74; + shfl.sync.bfly.b32 %r125, %r74, 4, 31, -1; + mov.b32 %f90, %r125; +$L__tmp13: + .loc 2 233 15 + add.f32 %f91, %f89, %f90; +$L__tmp14: + .loc 2 243 36 + mov.b32 %r126, %f91; + shfl.sync.bfly.b32 %r127, %r126, 2, 31, -1; + mov.b32 %f92, %r127; +$L__tmp15: + .loc 2 233 15 + add.f32 %f93, %f91, %f92; +$L__tmp16: + .loc 2 243 36 + mov.b32 %r128, %f93; + shfl.sync.bfly.b32 %r129, %r128, 1, 31, -1; + mov.b32 %f94, %r129; +$L__tmp17: + .loc 2 233 15 + add.f32 %f95, %f93, %f94; +$L__tmp18: + .loc 2 243 36 + and.b32 %r130, %r84, 7; + setp.eq.s32 %p51, %r130, 0; + and.pred %p46, %p45, %p51; + mov.b32 %r77, %f95; + @%p46 st.shared.b32 [ %r75 + 0 ], %r77; + add.s32 %r79, %r75, 1024; + @%p45 ld.shared.b32 %r78, [ %r79 + 0 ]; + mov.b32 %f96, %r78; + shfl.sync.bfly.b32 %r131, %r78, 4, 31, -1; + mov.b32 %f97, %r131; +$L__tmp19: + .loc 2 233 15 + add.f32 %f98, %f96, %f97; +$L__tmp20: + .loc 2 243 36 + mov.b32 %r132, %f98; + shfl.sync.bfly.b32 %r133, %r132, 2, 31, -1; + mov.b32 %f99, %r133; +$L__tmp21: + .loc 2 233 15 + add.f32 %f100, %f98, %f99; +$L__tmp22: + .loc 2 243 36 + mov.b32 %r134, %f100; + shfl.sync.bfly.b32 %r135, %r134, 1, 31, -1; + mov.b32 %f101, %r135; +$L__tmp23: + .loc 2 233 15 + add.f32 %f102, %f100, %f101; +$L__tmp24: + .loc 2 243 36 + mov.b32 %r81, %f102; + @%p46 st.shared.b32 [ %r79 + 0 ], %r81; + bar.sync 0; + add.s32 %r136, %r118, %r116; + ld.shared.f32 %f103, [%r136]; + add.s32 %r137, %r118, %r119; + ld.shared.f32 %f104, [%r137]; + add.s32 %r138, %r118, %r121; + ld.shared.f32 %f105, [%r138]; + add.s32 %r139, %r118, %r123; + ld.shared.f32 %f106, [%r139]; +$L__tmp25: + .loc 1 35 28 + bar.sync 0; + shl.b32 %r140, %r87, 2; + add.s32 %r141, %r118, %r140; + st.shared.f32 [%r141], %f103; + st.shared.f32 [%r141+4], %f104; + st.shared.f32 [%r141+8], %f105; + st.shared.f32 [%r141+12], %f106; + bar.sync 0; + shl.b32 %r142, %r105, 2; + add.s32 %r143, %r118, %r142; + .loc 1 36 20 + shr.s32 %r145, %r106, 31; + shr.u32 %r146, %r145, 24; + add.s32 %r147, %r106, %r146; + shr.s32 %r148, %r147, 8; + and.b32 %r149, %r147, -256; + sub.s32 %r150, %r106, %r149; + .loc 1 38 30 + mul.wide.s32 %rd23, %r148, 8; + add.s64 %rd10, %rd13, %rd23; + .loc 1 45 55 + ld.shared.u32 %r83, [%r143]; + .loc 1 38 35 + mov.u64 %rd9, 0x0; + @%p1 ld.global.L1::evict_last.b64 { %rd9 }, [ %rd10 + 0 ]; + .loc 1 41 32 + shr.u64 %rd24, %rd9, 54; + and.b64 %rd25, %rd24, 512; + add.s64 %rd26, %rd25, %rd9; + .loc 1 45 30 + shl.b64 %rd27, %rd26, 10; + add.s64 %rd28, %rd14, %rd27; + mul.wide.s32 %rd29, %r150, 4; + add.s64 %rd11, %rd28, %rd29; + .loc 1 45 55 + and.b32 %r151, %r84, 192; + setp.eq.s32 %p50, %r151, 0; + mov.u32 %r82, 0x0; + @%p50 atom.global.gpu.acq_rel.add.f32 %r82, [ %rd11 + 0 ], %r83; + .loc 1 45 4 + ret; +$L__tmp26: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/6i/c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 264 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 54 +.b8 105 +.b8 107 +.b8 53 +.b8 118 +.b8 120 +.b8 55 +.b8 112 +.b8 50 +.b8 50 +.b8 102 +.b8 112 +.b8 107 +.b8 52 +.b8 100 +.b8 99 +.b8 118 +.b8 104 +.b8 53 +.b8 53 +.b8 122 +.b8 105 +.b8 109 +.b8 119 +.b8 52 +.b8 116 +.b8 53 +.b8 110 +.b8 114 +.b8 53 +.b8 122 +.b8 110 +.b8 50 +.b8 98 +.b8 55 +.b8 105 +.b8 110 +.b8 117 +.b8 106 +.b8 120 +.b8 106 +.b8 97 +.b8 117 +.b8 120 +.b8 115 +.b8 104 +.b8 108 +.b8 106 +.b8 117 +.b8 109 +.b8 109 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 54 +.b8 105 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp24 +.b8 2 +.b8 35 +.b8 25 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp24 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp4 +.b64 $L__tmp25 +.b8 2 +.b8 35 +.b8 25 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 101 +.b8 52 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 268 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ttgir b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..f122a6130de200e26956e978263c837934848e70 --- /dev/null +++ b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ttgir @@ -0,0 +1,67 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<64x1xi64, #blocked> + %cst_0 = arith.constant dense<0> : tensor<64x1xi64, #blocked> + %cst_1 = arith.constant dense<512> : tensor<64x1xi64, #blocked> + %cst_2 = arith.constant dense<256> : tensor<64x1xi32, #blocked> + %cst_3 = arith.constant dense<131072> : tensor<1x64xi32, #blocked1> + %cst_4 = arith.constant dense<120> : tensor<1x64xi32, #blocked1> + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c120_i32 = arith.constant 120 : i32 + %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked1> + %cst_6 = arith.constant dense : tensor<64x1xi1, #blocked> + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked> + %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1> + %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked> + %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked1> + %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked> + %10 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x64xi32, #blocked1> + %12 = tt.broadcast %8 : (tensor<64x1xi32, #blocked1>) -> tensor<64x64xi32, #blocked1> + %13 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x64x!tt.ptr, #blocked1> + %14 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c64_i32 iter_args(%arg6 = %cst_5) -> (tensor<64x64xf32, #blocked1>) : i32 { + %32 = tt.splat %arg5 : (i32) -> tensor<1x64xi32, #blocked1> + %33 = arith.addi %32, %11 : tensor<1x64xi32, #blocked1> + %34 = arith.cmpi slt, %33, %cst_4 : tensor<1x64xi32, #blocked1> + %35 = arith.muli %33, %cst_3 : tensor<1x64xi32, #blocked1> + %36 = tt.broadcast %35 : (tensor<1x64xi32, #blocked1>) -> tensor<64x64xi32, #blocked1> + %37 = arith.addi %12, %36 : tensor<64x64xi32, #blocked1> + %38 = tt.addptr %13, %37 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi32, #blocked1> + %39 = tt.broadcast %34 : (tensor<1x64xi1, #blocked1>) -> tensor<64x64xi1, #blocked1> + %40 = tt.load %38, %39, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32, #blocked1> + %41 = arith.addf %arg6, %40 : tensor<64x64xf32, #blocked1> + %42 = arith.select %39, %41, %arg6 : tensor<64x64xi1, #blocked1>, tensor<64x64xf32, #blocked1> + scf.yield %42 : tensor<64x64xf32, #blocked1> + } + %15 = "tt.reduce"(%14) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %32 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %32 : f32 + }) : (tensor<64x64xf32, #blocked1>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %16 = triton_gpu.convert_layout %15 : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %17 = tt.expand_dims %16 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked> + %18 = arith.divsi %9, %cst_2 : tensor<64x1xi32, #blocked> + %19 = arith.remsi %9, %cst_2 : tensor<64x1xi32, #blocked> + %20 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked> + %21 = tt.addptr %20, %18 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi32, #blocked> + %22 = tt.load %21 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked> + %23 = arith.addi %22, %cst_1 : tensor<64x1xi64, #blocked> + %24 = arith.cmpi slt, %22, %cst_0 : tensor<64x1xi64, #blocked> + %25 = arith.select %24, %23, %22 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked> + %26 = arith.muli %25, %cst : tensor<64x1xi64, #blocked> + %27 = arith.extsi %19 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> + %28 = arith.addi %27, %26 : tensor<64x1xi64, #blocked> + %29 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x1x!tt.ptr, #blocked> + %30 = tt.addptr %29, %28 : tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xi64, #blocked> + %31 = "tt.atomic_rmw"(%30, %17, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr, #blocked>, tensor<64x1xf32, #blocked>, tensor<64x1xi1, #blocked>) -> tensor<64x1xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ttir b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..11f62c676ae3b36e8a565af04f5be4d9481e29cf --- /dev/null +++ b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.ttir @@ -0,0 +1,59 @@ +module { + tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<64x1xi64> + %cst_0 = arith.constant dense<0> : tensor<64x1xi64> + %cst_1 = arith.constant dense<512> : tensor<64x1xi64> + %c120_i32 = arith.constant 120 : i32 + %c0_i32 = arith.constant 0 : i32 + %c64_i32 = arith.constant 64 : i32 + %cst_2 = arith.constant dense : tensor<64x1xi1> + %cst_3 = arith.constant dense<256> : tensor<64x1xi32> + %cst_4 = arith.constant dense<131072> : tensor<1x64xi32> + %cst_5 = arith.constant dense<120> : tensor<1x64xi32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<64x64xf32> + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<64x1xi32> + %5 = arith.addi %4, %3 : tensor<64x1xi32> + %6 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<64xi32>) -> tensor<1x64xi32> + %7 = tt.broadcast %5 : (tensor<64x1xi32>) -> tensor<64x64xi32> + %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x64x!tt.ptr> + %9 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c64_i32 iter_args(%arg6 = %cst_6) -> (tensor<64x64xf32>) : i32 { + %26 = tt.splat %arg5 : (i32) -> tensor<1x64xi32> + %27 = arith.addi %26, %6 : tensor<1x64xi32> + %28 = arith.cmpi slt, %27, %cst_5 : tensor<1x64xi32> + %29 = arith.muli %27, %cst_4 : tensor<1x64xi32> + %30 = tt.broadcast %29 : (tensor<1x64xi32>) -> tensor<64x64xi32> + %31 = arith.addi %7, %30 : tensor<64x64xi32> + %32 = tt.addptr %8, %31 : tensor<64x64x!tt.ptr>, tensor<64x64xi32> + %33 = tt.broadcast %28 : (tensor<1x64xi1>) -> tensor<64x64xi1> + %34 = tt.load %32, %33, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32> + %35 = arith.addf %arg6, %34 : tensor<64x64xf32> + %36 = arith.select %33, %35, %arg6 : tensor<64x64xi1>, tensor<64x64xf32> + scf.yield %36 : tensor<64x64xf32> + } + %10 = "tt.reduce"(%9) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %26 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %26 : f32 + }) : (tensor<64x64xf32>) -> tensor<64xf32> + %11 = tt.expand_dims %10 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %12 = arith.divsi %5, %cst_3 : tensor<64x1xi32> + %13 = arith.remsi %5, %cst_3 : tensor<64x1xi32> + %14 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %15 = tt.addptr %14, %12 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> + %16 = tt.load %15 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64> + %17 = arith.addi %16, %cst_1 : tensor<64x1xi64> + %18 = arith.cmpi slt, %16, %cst_0 : tensor<64x1xi64> + %19 = arith.select %18, %17, %16 : tensor<64x1xi1>, tensor<64x1xi64> + %20 = arith.muli %19, %cst : tensor<64x1xi64> + %21 = arith.extsi %13 : tensor<64x1xi32> to tensor<64x1xi64> + %22 = arith.addi %21, %20 : tensor<64x1xi64> + %23 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %24 = tt.addptr %23, %22 : tensor<64x1x!tt.ptr>, tensor<64x1xi64> + %25 = "tt.atomic_rmw"(%24, %11, %cst_2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr>, tensor<64x1xf32>, tensor<64x1xi1>) -> tensor<64x1xf32> + tt.return + } +} diff --git a/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.cubin b/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..b97fb5711ee0ec5e4a4ae561d9a86d1ea693b58a Binary files /dev/null and b/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.cubin differ diff --git a/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.ptx b/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..9b0f1d0b8732248e49ef6c319abcb29279feed2c --- /dev/null +++ b/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.ptx @@ -0,0 +1,975 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5de6de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5de6de( + .param .u64 triton__0d1d2d3d4d5de6de_param_0, + .param .u64 triton__0d1d2d3d4d5de6de_param_1, + .param .u64 triton__0d1d2d3d4d5de6de_param_2, + .param .u64 triton__0d1d2d3d4d5de6de_param_3, + .param .u64 triton__0d1d2d3d4d5de6de_param_4, + .param .u32 triton__0d1d2d3d4d5de6de_param_5, + .param .u32 triton__0d1d2d3d4d5de6de_param_6 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<53>; + .reg .b16 %rs<5>; + .reg .b32 %r<161>; + .reg .f32 %f<153>; + .reg .b64 %rd<52>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd6, [triton__0d1d2d3d4d5de6de_param_3]; + ld.param.u64 %rd5, [triton__0d1d2d3d4d5de6de_param_1]; + ld.param.u64 %rd19, [triton__0d1d2d3d4d5de6de_param_0]; +$L__tmp0: + .loc 1 24 33 + mov.u32 %r1, %tid.x; + and.b32 %r2, %r1, 31; + ld.param.u64 %rd20, [triton__0d1d2d3d4d5de6de_param_2]; + bfe.u32 %r3, %r1, 5, 1; + shl.b32 %r23, %r1, 2; + and.b32 %r4, %r23, 252; + .loc 1 21 28 + mov.u32 %r14, %ctaid.x; + .loc 1 26 30 + mul.wide.s32 %rd21, %r14, 8; + add.s64 %rd9, %rd19, %rd21; + mov.pred %p47, -1; + .loc 1 26 35 + mov.u64 %rd8, 0x0; + @%p47 ld.global.L1::evict_last.b64 { %rd8 }, [ %rd9 + 0 ]; + mov.u64 %rd10, 0x0; + @%p47 ld.global.L1::evict_last.b64 { %rd10 }, [ %rd9 + 0 ]; + mov.u64 %rd12, 0x0; + @%p47 ld.global.L1::evict_last.b64 { %rd12 }, [ %rd9 + 0 ]; + mov.u64 %rd14, 0x0; + @%p47 ld.global.L1::evict_last.b64 { %rd14 }, [ %rd9 + 0 ]; + mov.u64 %rd16, 0x0; + @%p47 ld.global.L1::evict_last.b64 { %rd16 }, [ %rd9 + 0 ]; + .loc 1 27 18 + shr.s32 %r24, %r14, 31; + shr.u32 %r25, %r24, 23; + add.s32 %r26, %r14, %r25; + and.b32 %r27, %r26, 16776704; + sub.s32 %r28, %r14, %r27; + .loc 1 35 44 + shl.b32 %r29, %r28, 8; + .loc 1 35 40 + or.b32 %r30, %r29, %r4; + .loc 1 35 34 + mul.wide.s32 %rd22, %r30, 4; + add.s64 %rd32, %rd20, %rd22; + mov.b32 %r134, 0; + .loc 1 35 50 + mov.u32 %r15, 0x0; + mov.u32 %r16, 0x0; + mov.u32 %r17, 0x0; + mov.u32 %r18, 0x0; + @%p47 ld.global.L1::evict_last.v4.b32 { %r15, %r16, %r17, %r18 }, [ %rd32 + 0 ]; + @!%p47 mov.u32 %r15, %r134; + @!%p47 mov.u32 %r16, %r134; + @!%p47 mov.u32 %r17, %r134; + @!%p47 mov.u32 %r18, %r134; + mov.b32 %f1, %r15; + mov.b32 %f2, %r16; + mov.b32 %f3, %r17; + mov.b32 %f4, %r18; + .loc 1 36 22 + add.s64 %rd23, %rd16, 50257; + .loc 1 37 22 + setp.lt.s64 %p11, %rd16, 0; + .loc 1 38 36 + selp.b64 %rd3, %rd23, %rd16, %p11; + .loc 1 39 40 + setp.lt.u64 %p12, %rd3, 50257; + mov.b32 %r160, 883; + mov.u64 %rd51, 1; + .loc 1 39 55 + @%p12 bra $L__BB0_2; + mov.u64 %rd24, assertMessage_0; + cvta.global.u64 %rd25, %rd24; + mov.u64 %rd26, assertFile_0; + cvta.global.u64 %rd27, %rd26; + mov.u64 %rd28, assertFunc_0; + cvta.global.u64 %rd29, %rd28; + { // callseq 0, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd25; + .param .b64 param1; + st.param.b64 [param1+0], %rd27; + .param .b32 param2; + st.param.b32 [param2+0], %r160; + .param .b64 param3; + st.param.b64 [param3+0], %rd29; + .param .b64 param4; + st.param.b64 [param4+0], %rd51; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 0 +$L__BB0_2: + .loc 1 0 55 + ld.param.u64 %rd7, [triton__0d1d2d3d4d5de6de_param_4]; + .loc 1 37 22 + setp.lt.s64 %p38, %rd8, 0; + .loc 1 40 44 + shl.b64 %rd34, %rd8, 8; + add.s64 %rd35, %rd34, 12865792; + selp.b64 %rd36, %rd35, %rd34, %p38; + cvt.u64.u32 %rd37, %r4; + .loc 1 40 40 + or.b64 %rd38, %rd36, %rd37; + .loc 1 40 34 + shl.b64 %rd39, %rd38, 2; + add.s64 %rd48, %rd5, %rd39; + .loc 1 40 52 + mov.u32 %r32, 0x0; + mov.u32 %r33, 0x0; + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + @%p47 ld.global.L1::evict_last.v4.b32 { %r32, %r33, %r34, %r35 }, [ %rd48 + 0 ]; + @!%p47 mov.u32 %r32, %r134; + @!%p47 mov.u32 %r33, %r134; + @!%p47 mov.u32 %r34, %r134; + @!%p47 mov.u32 %r35, %r134; + mov.b32 %f7, %r32; + mov.b32 %f8, %r33; + mov.b32 %f9, %r34; + mov.b32 %f10, %r35; + .loc 1 41 22 + add.f32 %f11, %f1, %f7; + add.f32 %f12, %f2, %f8; + add.f32 %f13, %f3, %f9; + add.f32 %f14, %f4, %f10; +$L__tmp1: + .loc 2 98 22 + add.f32 %f15, %f11, 0f00000000; + add.f32 %f16, %f12, 0f00000000; + add.f32 %f17, %f13, 0f00000000; + add.f32 %f18, %f14, 0f00000000; + .loc 2 101 30 + sub.f32 %f19, %f11, %f15; + sub.f32 %f20, %f12, %f16; + sub.f32 %f21, %f13, %f17; + sub.f32 %f22, %f14, %f18; + .loc 2 101 13 + fma.rn.f32 %f23, %f11, %f19, 0f00000000; + fma.rn.f32 %f24, %f12, %f20, 0f00000000; + fma.rn.f32 %f25, %f13, %f21, 0f00000000; + fma.rn.f32 %f26, %f14, %f22, 0f00000000; +$L__tmp2: + .loc 2 108 21 + sub.f32 %f27, %f16, %f15; + mov.b32 %r41, 1065353216; + mov.b32 %r42, 1073741824; + .loc 2 110 60 + div.full.f32 %r40, %r41, %r42; + mov.b32 %f28, %r40; + .loc 2 112 17 + fma.rn.f32 %f29, %f28, %f27, %f15; + .loc 2 113 15 + add.f32 %f30, %f23, %f24; + .loc 2 113 30 + mul.f32 %f31, %f27, %f27; + .loc 2 113 22 + fma.rn.f32 %f32, %f28, %f31, %f30; + .loc 2 108 21 + sub.f32 %f33, %f17, %f29; + mov.b32 %r45, 1077936128; + .loc 2 110 60 + div.full.f32 %r43, %r41, %r45; + mov.b32 %f34, %r43; + .loc 2 112 17 + fma.rn.f32 %f35, %f34, %f33, %f29; + .loc 2 113 15 + add.f32 %f36, %f25, %f32; + .loc 2 113 30 + mul.f32 %f37, %f33, %f33; + .loc 2 113 38 + fma.rn.f32 %f38, %f33, %f33, %f37; + .loc 2 113 22 + fma.rn.f32 %f39, %f34, %f38, %f36; + .loc 2 108 21 + sub.f32 %f40, %f18, %f35; + mov.b32 %r48, 1082130432; + .loc 2 110 60 + div.full.f32 %r46, %r41, %r48; + mov.b32 %f41, %r46; + .loc 2 112 17 + fma.rn.f32 %f42, %f41, %f40, %f35; + .loc 2 113 15 + add.f32 %f43, %f26, %f39; + .loc 2 113 30 + mul.f32 %f44, %f40, %f40; + .loc 2 113 38 + mul.f32 %f45, %f44, 0f40400000; + .loc 2 113 22 + fma.rn.f32 %f46, %f41, %f45, %f43; +$L__tmp3: + .loc 2 120 46 + mov.b32 %r101, %f42; + shfl.sync.bfly.b32 %r102, %r101, 16, 31, -1; + mov.b32 %f47, %r102; + mov.b32 %r103, %f46; + shfl.sync.bfly.b32 %r104, %r103, 16, 31, -1; + mov.b32 %f48, %r104; + shfl.sync.bfly.b32 %r50, %r48, 16, 31, -1; + mov.b32 %f49, %r50; +$L__tmp4: + .loc 2 108 21 + sub.f32 %f50, %f47, %f42; + .loc 2 109 28 + add.f32 %f51, %f49, 0f40800000; + .loc 2 110 39 + setp.eq.f32 %p39, %f51, 0f00000000; + .loc 2 110 60 + mov.b32 %r51, %f51; + div.full.f32 %r49, %r50, %r51; + mov.b32 %f52, %r49; + .loc 2 110 49 + selp.f32 %f53, 0f00000000, %f52, %p39; + .loc 2 112 17 + fma.rn.f32 %f54, %f53, %f50, %f42; + .loc 2 113 15 + add.f32 %f55, %f46, %f48; + .loc 2 113 30 + mul.f32 %f56, %f50, %f50; + .loc 2 113 38 + mul.f32 %f57, %f56, 0f40800000; + .loc 2 113 22 + fma.rn.f32 %f58, %f53, %f57, %f55; +$L__tmp5: + .loc 2 120 46 + mov.b32 %r105, %f54; + shfl.sync.bfly.b32 %r106, %r105, 8, 31, -1; + mov.b32 %f59, %r106; + mov.b32 %r107, %f58; + shfl.sync.bfly.b32 %r108, %r107, 8, 31, -1; + mov.b32 %f60, %r108; + shfl.sync.bfly.b32 %r53, %r51, 8, 31, -1; + mov.b32 %f61, %r53; +$L__tmp6: + .loc 2 108 21 + sub.f32 %f62, %f59, %f54; + .loc 2 109 28 + add.f32 %f63, %f51, %f61; + .loc 2 110 39 + setp.eq.f32 %p40, %f63, 0f00000000; + .loc 2 110 60 + mov.b32 %r54, %f63; + div.full.f32 %r52, %r53, %r54; + mov.b32 %f64, %r52; + .loc 2 110 49 + selp.f32 %f65, 0f00000000, %f64, %p40; + .loc 2 112 17 + fma.rn.f32 %f66, %f65, %f62, %f54; + .loc 2 113 15 + add.f32 %f67, %f58, %f60; + .loc 2 113 30 + mul.f32 %f68, %f62, %f62; + .loc 2 113 38 + mul.f32 %f69, %f51, %f68; + .loc 2 113 22 + fma.rn.f32 %f70, %f65, %f69, %f67; +$L__tmp7: + .loc 2 120 46 + mov.b32 %r109, %f66; + shfl.sync.bfly.b32 %r110, %r109, 4, 31, -1; + mov.b32 %f71, %r110; + mov.b32 %r111, %f70; + shfl.sync.bfly.b32 %r112, %r111, 4, 31, -1; + mov.b32 %f72, %r112; + shfl.sync.bfly.b32 %r56, %r54, 4, 31, -1; + mov.b32 %f73, %r56; +$L__tmp8: + .loc 2 108 21 + sub.f32 %f74, %f71, %f66; + .loc 2 109 28 + add.f32 %f75, %f63, %f73; + .loc 2 110 39 + setp.eq.f32 %p41, %f75, 0f00000000; + .loc 2 110 60 + mov.b32 %r57, %f75; + div.full.f32 %r55, %r56, %r57; + mov.b32 %f76, %r55; + .loc 2 110 49 + selp.f32 %f77, 0f00000000, %f76, %p41; + .loc 2 112 17 + fma.rn.f32 %f78, %f77, %f74, %f66; + .loc 2 113 15 + add.f32 %f79, %f70, %f72; + .loc 2 113 30 + mul.f32 %f80, %f74, %f74; + .loc 2 113 38 + mul.f32 %f81, %f63, %f80; + .loc 2 113 22 + fma.rn.f32 %f82, %f77, %f81, %f79; +$L__tmp9: + .loc 2 120 46 + mov.b32 %r113, %f78; + shfl.sync.bfly.b32 %r114, %r113, 2, 31, -1; + mov.b32 %f83, %r114; + mov.b32 %r115, %f82; + shfl.sync.bfly.b32 %r116, %r115, 2, 31, -1; + mov.b32 %f84, %r116; + shfl.sync.bfly.b32 %r59, %r57, 2, 31, -1; + mov.b32 %f85, %r59; +$L__tmp10: + .loc 2 108 21 + sub.f32 %f86, %f83, %f78; + .loc 2 109 28 + add.f32 %f87, %f75, %f85; + .loc 2 110 39 + setp.eq.f32 %p42, %f87, 0f00000000; + .loc 2 110 60 + mov.b32 %r60, %f87; + div.full.f32 %r58, %r59, %r60; + mov.b32 %f88, %r58; + .loc 2 110 49 + selp.f32 %f89, 0f00000000, %f88, %p42; + .loc 2 112 17 + fma.rn.f32 %f90, %f89, %f86, %f78; + .loc 2 113 15 + add.f32 %f91, %f82, %f84; + .loc 2 113 30 + mul.f32 %f92, %f86, %f86; + .loc 2 113 38 + mul.f32 %f93, %f75, %f92; + .loc 2 113 22 + fma.rn.f32 %f94, %f89, %f93, %f91; +$L__tmp11: + .loc 2 120 46 + mov.b32 %r117, %f90; + shfl.sync.bfly.b32 %r118, %r117, 1, 31, -1; + mov.b32 %f95, %r118; + mov.b32 %r119, %f94; + shfl.sync.bfly.b32 %r120, %r119, 1, 31, -1; + mov.b32 %f96, %r120; + shfl.sync.bfly.b32 %r62, %r60, 1, 31, -1; + mov.b32 %f97, %r62; +$L__tmp12: + .loc 2 108 21 + sub.f32 %f98, %f95, %f90; + .loc 2 109 28 + add.f32 %f99, %f87, %f97; + .loc 2 110 39 + setp.eq.f32 %p43, %f99, 0f00000000; + .loc 2 110 60 + mov.b32 %r63, %f99; + div.full.f32 %r61, %r62, %r63; + mov.b32 %f100, %r61; + .loc 2 110 49 + selp.f32 %f101, 0f00000000, %f100, %p43; + .loc 2 112 17 + fma.rn.f32 %f102, %f98, %f101, %f90; + .loc 2 113 15 + add.f32 %f103, %f94, %f96; + .loc 2 113 30 + mul.f32 %f104, %f98, %f98; + .loc 2 113 38 + mul.f32 %f105, %f87, %f104; + .loc 2 113 22 + fma.rn.f32 %f106, %f101, %f105, %f103; +$L__tmp13: + .loc 2 120 46 + setp.eq.s32 %p18, %r2, 0; + shl.b32 %r121, %r3, 2; + mov.u32 %r122, global_smem; + add.s32 %r64, %r122, %r121; + mov.b32 %r65, %f102; + @%p18 st.shared.b32 [ %r64 + 0 ], %r65; + add.s32 %r123, %r122, 8; + add.s32 %r66, %r123, %r121; + mov.b32 %r67, %f106; + @%p18 st.shared.b32 [ %r66 + 0 ], %r67; + add.s32 %r124, %r122, 16; + add.s32 %r68, %r124, %r121; + @%p18 st.shared.b32 [ %r68 + 0 ], %r63; + bar.sync 0; + setp.lt.s32 %p21, %r1, 2; + add.s32 %r71, %r122, %r23; + @%p21 ld.shared.b32 %r70, [ %r71 + 0 ]; + mov.b32 %f107, %r70; + add.s32 %r73, %r123, %r23; + @%p21 ld.shared.b32 %r72, [ %r73 + 0 ]; + mov.b32 %f108, %r72; + add.s32 %r75, %r124, %r23; + @%p21 ld.shared.b32 %r74, [ %r75 + 0 ]; + mov.b32 %f109, %r74; + shfl.sync.bfly.b32 %r126, %r70, 1, 31, -1; + mov.b32 %f110, %r126; + shfl.sync.bfly.b32 %r127, %r72, 1, 31, -1; + mov.b32 %f111, %r127; + shfl.sync.bfly.b32 %r77, %r74, 1, 31, -1; + mov.b32 %f112, %r77; +$L__tmp14: + .loc 2 108 21 + sub.f32 %f113, %f110, %f107; + .loc 2 109 28 + add.f32 %f114, %f109, %f112; + .loc 2 110 39 + setp.eq.f32 %p44, %f114, 0f00000000; + .loc 2 110 60 + mov.b32 %r78, %f114; + div.full.f32 %r76, %r77, %r78; + mov.b32 %f115, %r76; + .loc 2 110 49 + selp.f32 %f116, 0f00000000, %f115, %p44; + .loc 2 112 17 + fma.rn.f32 %f117, %f113, %f116, %f107; + .loc 2 113 15 + add.f32 %f118, %f108, %f111; + .loc 2 113 30 + mul.f32 %f119, %f113, %f113; + .loc 2 113 38 + mul.f32 %f120, %f109, %f119; + .loc 2 113 22 + fma.rn.f32 %f121, %f120, %f116, %f118; +$L__tmp15: + .loc 2 120 46 + and.b32 %r128, %r1, 1; + setp.eq.b32 %p45, %r128, 1; + not.pred %p46, %p45; + and.pred %p24, %p21, %p46; + mov.b32 %r80, %f117; + @%p24 st.shared.b32 [ %r71 + 0 ], %r80; + mov.b32 %r82, %f121; + @%p24 st.shared.b32 [ %r73 + 0 ], %r82; + @%p24 st.shared.b32 [ %r75 + 0 ], %r78; + bar.sync 0; + ld.shared.f32 %f5, [global_smem]; + ld.shared.f32 %f6, [global_smem+8]; +$L__tmp16: + .loc 1 59 51 + mov.u32 %r85, 0x0; + mov.u32 %r86, 0x0; + mov.u32 %r87, 0x0; + mov.u32 %r88, 0x0; + @%p47 ld.global.L1::evict_last.v4.b32 { %r85, %r86, %r87, %r88 }, [ %rd32 + 0 ]; + @!%p47 mov.u32 %r85, %r134; + @!%p47 mov.u32 %r86, %r134; + @!%p47 mov.u32 %r87, %r134; + @!%p47 mov.u32 %r88, %r134; + .loc 1 60 35 + mul.wide.u32 %rd40, %r4, 4; + add.s64 %rd33, %rd6, %rd40; + .loc 1 60 40 + mov.u32 %r93, 0x0; + mov.u32 %r94, 0x0; + mov.u32 %r95, 0x0; + mov.u32 %r96, 0x0; + @%p47 ld.global.L1::evict_last.v4.b32 { %r93, %r94, %r95, %r96 }, [ %rd33 + 0 ]; + @!%p47 mov.u32 %r93, %r134; + @!%p47 mov.u32 %r94, %r134; + @!%p47 mov.u32 %r95, %r134; + @!%p47 mov.u32 %r96, %r134; + .loc 1 64 57 + @%p12 bra $L__BB0_4; + mov.u64 %rd41, assertMessage_1; + cvta.global.u64 %rd42, %rd41; + mov.u64 %rd43, assertFile_1; + cvta.global.u64 %rd44, %rd43; + mov.u64 %rd45, assertFunc_1; + cvta.global.u64 %rd46, %rd45; + { // callseq 1, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd42; + .param .b64 param1; + st.param.b64 [param1+0], %rd44; + .param .b32 param2; + st.param.b32 [param2+0], %r160; + .param .b64 param3; + st.param.b64 [param3+0], %rd46; + .param .b64 param4; + st.param.b64 [param4+0], %rd51; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 1 +$L__BB0_4: + .loc 1 65 54 + mov.u32 %r130, 0x0; + mov.u32 %r131, 0x0; + mov.u32 %r132, 0x0; + mov.u32 %r133, 0x0; + @%p47 ld.global.L1::evict_first.v4.b32 { %r130, %r131, %r132, %r133 }, [ %rd48 + 0 ]; + @!%p47 mov.u32 %r130, %r134; + @!%p47 mov.u32 %r131, %r134; + @!%p47 mov.u32 %r132, %r134; + @!%p47 mov.u32 %r133, %r134; + .loc 1 69 23 + mov.b32 %r139, %f6; + mov.b32 %r140, 1132462080; + div.full.f32 %r138, %r139, %r140; + mov.b32 %f122, %r138; + .loc 1 71 24 + add.f32 %f123, %f122, 0f3727C5AC; + .loc 1 72 30 + rsqrt.approx.ftz.f32 %f124, %f123; + .loc 1 65 54 + mov.b32 %f125, %r133; + .loc 1 59 51 + mov.b32 %f126, %r88; + .loc 1 66 24 + add.f32 %f127, %f126, %f125; + .loc 1 67 24 + sub.f32 %f128, %f127, %f5; + .loc 1 65 54 + mov.b32 %f129, %r132; + .loc 1 59 51 + mov.b32 %f130, %r87; + .loc 1 66 24 + add.f32 %f131, %f130, %f129; + .loc 1 67 24 + sub.f32 %f132, %f131, %f5; + .loc 1 65 54 + mov.b32 %f133, %r131; + .loc 1 59 51 + mov.b32 %f134, %r86; + .loc 1 66 24 + add.f32 %f135, %f134, %f133; + .loc 1 67 24 + sub.f32 %f136, %f135, %f5; + .loc 1 65 54 + mov.b32 %f137, %r130; + .loc 1 59 51 + mov.b32 %f138, %r85; + .loc 1 66 24 + add.f32 %f139, %f138, %f137; + .loc 1 67 24 + sub.f32 %f140, %f139, %f5; + .loc 1 60 40 + mov.b32 %f141, %r93; + mov.b32 %f142, %r94; + mov.b32 %f143, %r95; + mov.b32 %f144, %r96; + .loc 1 73 24 + mul.f32 %f145, %f140, %f124; + mul.f32 %f146, %f136, %f124; + mul.f32 %f147, %f132, %f124; + mul.f32 %f148, %f128, %f124; + .loc 1 74 24 + mul.f32 %f149, %f145, %f141; + mul.f32 %f150, %f146, %f142; + mul.f32 %f151, %f147, %f143; + mul.f32 %f152, %f148, %f144; + .loc 1 76 39 + shl.b32 %r156, %r14, 8; + .loc 1 76 35 + or.b32 %r157, %r156, %r4; + .loc 1 76 29 + mul.wide.s32 %rd50, %r157, 2; + add.s64 %rd49, %rd7, %rd50; + .loc 1 76 52 + mov.b32 %r150, %f149; + cvt.rn.bf16.f32 %rs1, %r150; + mov.b32 %r151, %f150; + cvt.rn.bf16.f32 %rs2, %r151; + mov.b32 %r152, %f151; + cvt.rn.bf16.f32 %rs3, %r152; + mov.b32 %r153, %f152; + cvt.rn.bf16.f32 %rs4, %r153; + mov.b32 %r158, {%rs1, %rs2}; + mov.b32 %r159, {%rs3, %rs4}; + @%p47 st.global.v2.b32 [ %rd49 + 0 ], { %r158, %r159 }; + .loc 1 55 4 + ret; +$L__tmp17: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/gx/cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 298 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 103 +.b8 120 +.b8 53 +.b8 108 +.b8 120 +.b8 112 +.b8 117 +.b8 101 +.b8 120 +.b8 112 +.b8 105 +.b8 110 +.b8 100 +.b8 106 +.b8 52 +.b8 100 +.b8 115 +.b8 109 +.b8 106 +.b8 122 +.b8 53 +.b8 120 +.b8 52 +.b8 50 +.b8 117 +.b8 104 +.b8 121 +.b8 121 +.b8 55 +.b8 105 +.b8 115 +.b8 107 +.b8 101 +.b8 118 +.b8 113 +.b8 55 +.b8 111 +.b8 118 +.b8 122 +.b8 112 +.b8 119 +.b8 97 +.b8 103 +.b8 98 +.b8 51 +.b8 116 +.b8 53 +.b8 112 +.b8 111 +.b8 119 +.b8 106 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 103 +.b8 120 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp2 +.b8 2 +.b8 44 +.b8 38 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 50 +.b8 41 +.b8 4 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 120 +.b8 46 +.b8 0 +.b8 4 +.b32 125 +.b64 $L__tmp3 +.b64 $L__tmp16 +.b8 2 +.b8 50 +.b8 41 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 302 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 302 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.ttgir b/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..f9b1ae94642c23829f38f0ec382c75ff2c763d23 --- /dev/null +++ b/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.ttgir @@ -0,0 +1,101 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [1, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<1x256xi32, #blocked> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x256xf32, #blocked> + %cst_1 = arith.constant dense<1.000000e+00> : tensor<1x256xf32, #blocked> + %cst_2 = arith.constant dense<256> : tensor<1x1xi64, #blocked> + %cst_3 = arith.constant dense<50257> : tensor<1x1xi64, #blocked> + %cst_4 = arith.constant dense<0> : tensor<1x1xi64, #blocked> + %cst_5 = arith.constant dense<0> : tensor<1x1xi64, #blocked1> + %cst_6 = arith.constant dense<50257> : tensor<1x1xi64, #blocked1> + %cst_7 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %c512_i32 = arith.constant 512 : i32 + %cst_8 = arith.constant dense<9.99999974E-6> : tensor<1x1xf32, #blocked> + %cst_9 = arith.constant dense<2.560000e+02> : tensor<1x1xf32, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<256xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x256xi32, #blocked> + %3 = tt.addptr %arg0, %0 : !tt.ptr, i32 + %4 = tt.splat %3 : (!tt.ptr) -> tensor<1x1x!tt.ptr, #blocked> + %5 = tt.splat %3 : (!tt.ptr) -> tensor<1x1x!tt.ptr, #blocked1> + %6 = tt.load %4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64, #blocked> + %7 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64, #blocked1> + %8 = arith.remsi %0, %c512_i32 : i32 + %9 = arith.cmpi slt, %2, %cst : tensor<1x256xi32, #blocked> + %10 = arith.muli %8, %c256_i32 : i32 + %11 = tt.splat %10 : (i32) -> tensor<1x256xi32, #blocked> + %12 = arith.addi %2, %11 : tensor<1x256xi32, #blocked> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<1x256x!tt.ptr, #blocked> + %14 = tt.addptr %13, %12 : tensor<1x256x!tt.ptr, #blocked>, tensor<1x256xi32, #blocked> + %15 = tt.load %14, %9, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked> + %16 = arith.addi %6, %cst_3 : tensor<1x1xi64, #blocked> + %17 = arith.addi %7, %cst_6 : tensor<1x1xi64, #blocked1> + %18 = arith.cmpi slt, %6, %cst_4 : tensor<1x1xi64, #blocked> + %19 = arith.cmpi slt, %7, %cst_5 : tensor<1x1xi64, #blocked1> + %20 = arith.select %18, %16, %6 : tensor<1x1xi1, #blocked>, tensor<1x1xi64, #blocked> + %21 = arith.select %19, %17, %7 : tensor<1x1xi1, #blocked1>, tensor<1x1xi64, #blocked1> + %22 = arith.cmpi sge, %21, %cst_5 : tensor<1x1xi64, #blocked1> + %23 = arith.cmpi slt, %21, %cst_6 : tensor<1x1xi64, #blocked1> + %24 = arith.andi %22, %23 : tensor<1x1xi1, #blocked1> + tt.assert %24, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<1x1xi1, #blocked1> + %25 = arith.muli %20, %cst_2 : tensor<1x1xi64, #blocked> + %26 = tt.broadcast %25 : (tensor<1x1xi64, #blocked>) -> tensor<1x256xi64, #blocked> + %27 = arith.extsi %2 : tensor<1x256xi32, #blocked> to tensor<1x256xi64, #blocked> + %28 = arith.addi %27, %26 : tensor<1x256xi64, #blocked> + %29 = tt.splat %arg1 : (!tt.ptr) -> tensor<1x256x!tt.ptr, #blocked> + %30 = tt.addptr %29, %28 : tensor<1x256x!tt.ptr, #blocked>, tensor<1x256xi64, #blocked> + %31 = tt.load %30, %9, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked> + %32 = arith.addf %31, %15 : tensor<1x256xf32, #blocked> + %33 = arith.addf %32, %cst_0 : tensor<1x256xf32, #blocked> + %34 = arith.subf %32, %33 : tensor<1x256xf32, #blocked> + %35 = arith.mulf %32, %34 : tensor<1x256xf32, #blocked> + %36 = arith.addf %35, %cst_0 : tensor<1x256xf32, #blocked> + %37 = arith.select %9, %33, %cst_0 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked> + %38 = arith.select %9, %36, %cst_0 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked> + %39 = arith.select %9, %cst_1, %cst_0 : tensor<1x256xi1, #blocked>, tensor<1x256xf32, #blocked> + %40:3 = "tt.reduce"(%37, %38, %39) <{axis = 1 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32): + %63 = arith.subf %arg10, %arg7 : f32 + %64 = arith.addf %arg9, %arg12 : f32 + %65 = arith.cmpf oeq, %64, %cst_7 : f32 + %66 = arith.divf %arg12, %64 : f32 + %67 = arith.select %65, %cst_7, %66 : f32 + %68 = arith.mulf %63, %67 : f32 + %69 = arith.addf %arg7, %68 : f32 + %70 = arith.addf %arg8, %arg11 : f32 + %71 = arith.mulf %63, %63 : f32 + %72 = arith.mulf %71, %arg9 : f32 + %73 = arith.mulf %72, %67 : f32 + %74 = arith.addf %70, %73 : f32 + tt.reduce.return %69, %74, %64 : f32, f32, f32 + }) : (tensor<1x256xf32, #blocked>, tensor<1x256xf32, #blocked>, tensor<1x256xf32, #blocked>) -> (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) + %41 = tt.expand_dims %40#0 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked> + %42 = tt.expand_dims %40#1 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked> + %43 = tt.load %14, %9, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked> + %44 = tt.splat %arg3 : (!tt.ptr) -> tensor<1x256x!tt.ptr, #blocked> + %45 = tt.addptr %44, %2 : tensor<1x256x!tt.ptr, #blocked>, tensor<1x256xi32, #blocked> + %46 = tt.load %45, %9, %cst_0 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32, #blocked> + tt.assert %24, "index out of bounds: 0 <= tmp13 < 50257", "", "_call_with_frames_removed", 883 : tensor<1x1xi1, #blocked1> + %47 = tt.load %30, %9, %cst_0 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x256xf32, #blocked> + %48 = arith.addf %47, %43 : tensor<1x256xf32, #blocked> + %49 = tt.broadcast %41 : (tensor<1x1xf32, #blocked>) -> tensor<1x256xf32, #blocked> + %50 = arith.subf %48, %49 : tensor<1x256xf32, #blocked> + %51 = arith.divf %42, %cst_9 : tensor<1x1xf32, #blocked> + %52 = arith.addf %51, %cst_8 : tensor<1x1xf32, #blocked> + %53 = tt.extern_elementwise %52 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32, #blocked>) -> tensor<1x1xf32, #blocked> + %54 = tt.broadcast %53 : (tensor<1x1xf32, #blocked>) -> tensor<1x256xf32, #blocked> + %55 = arith.mulf %50, %54 : tensor<1x256xf32, #blocked> + %56 = arith.mulf %55, %46 : tensor<1x256xf32, #blocked> + %57 = arith.muli %0, %c256_i32 : i32 + %58 = tt.splat %57 : (i32) -> tensor<1x256xi32, #blocked> + %59 = arith.addi %2, %58 : tensor<1x256xi32, #blocked> + %60 = tt.splat %arg4 : (!tt.ptr) -> tensor<1x256x!tt.ptr, #blocked> + %61 = tt.addptr %60, %59 : tensor<1x256x!tt.ptr, #blocked>, tensor<1x256xi32, #blocked> + %62 = arith.truncf %56 : tensor<1x256xf32, #blocked> to tensor<1x256xbf16, #blocked> + tt.store %61, %62, %9 {cache = 1 : i32, evict = 1 : i32} : tensor<1x256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.ttir b/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..c4b42c11f042d77b8a73277feea11104b4195ebe --- /dev/null +++ b/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.ttir @@ -0,0 +1,92 @@ +module { + tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c512_i32 = arith.constant 512 : i32 + %c256_i32 = arith.constant 256 : i32 + %cst = arith.constant dense<1.000000e+00> : tensor<1x256xf32> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant dense<256> : tensor<1x1xi64> + %cst_2 = arith.constant dense<50257> : tensor<1x1xi64> + %cst_3 = arith.constant dense<0> : tensor<1x1xi64> + %cst_4 = arith.constant dense<9.99999974E-6> : tensor<1x1xf32> + %cst_5 = arith.constant dense<2.560000e+02> : tensor<1x1xf32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x256xf32> + %cst_7 = arith.constant dense<256> : tensor<1x256xi32> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %2 = tt.expand_dims %1 {axis = 0 : i32} : (tensor<256xi32>) -> tensor<1x256xi32> + %3 = tt.addptr %arg0, %0 : !tt.ptr, i32 + %4 = tt.splat %3 : (!tt.ptr) -> tensor<1x1x!tt.ptr> + %5 = tt.load %4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64> + %6 = arith.remsi %0, %c512_i32 : i32 + %7 = arith.cmpi slt, %2, %cst_7 : tensor<1x256xi32> + %8 = arith.muli %6, %c256_i32 : i32 + %9 = tt.splat %8 : (i32) -> tensor<1x256xi32> + %10 = arith.addi %2, %9 : tensor<1x256xi32> + %11 = tt.splat %arg2 : (!tt.ptr) -> tensor<1x256x!tt.ptr> + %12 = tt.addptr %11, %10 : tensor<1x256x!tt.ptr>, tensor<1x256xi32> + %13 = tt.load %12, %7, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32> + %14 = arith.addi %5, %cst_2 : tensor<1x1xi64> + %15 = arith.cmpi slt, %5, %cst_3 : tensor<1x1xi64> + %16 = arith.select %15, %14, %5 : tensor<1x1xi1>, tensor<1x1xi64> + %17 = arith.cmpi sge, %16, %cst_3 : tensor<1x1xi64> + %18 = arith.cmpi slt, %16, %cst_2 : tensor<1x1xi64> + %19 = arith.andi %17, %18 : tensor<1x1xi1> + tt.assert %19, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<1x1xi1> + %20 = arith.muli %16, %cst_1 : tensor<1x1xi64> + %21 = tt.broadcast %20 : (tensor<1x1xi64>) -> tensor<1x256xi64> + %22 = arith.extsi %2 : tensor<1x256xi32> to tensor<1x256xi64> + %23 = arith.addi %22, %21 : tensor<1x256xi64> + %24 = tt.splat %arg1 : (!tt.ptr) -> tensor<1x256x!tt.ptr> + %25 = tt.addptr %24, %23 : tensor<1x256x!tt.ptr>, tensor<1x256xi64> + %26 = tt.load %25, %7, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32> + %27 = arith.addf %26, %13 : tensor<1x256xf32> + %28 = arith.addf %27, %cst_6 : tensor<1x256xf32> + %29 = arith.subf %27, %28 : tensor<1x256xf32> + %30 = arith.mulf %27, %29 : tensor<1x256xf32> + %31 = arith.addf %30, %cst_6 : tensor<1x256xf32> + %32 = arith.select %7, %28, %cst_6 : tensor<1x256xi1>, tensor<1x256xf32> + %33 = arith.select %7, %31, %cst_6 : tensor<1x256xi1>, tensor<1x256xf32> + %34 = arith.select %7, %cst, %cst_6 : tensor<1x256xi1>, tensor<1x256xf32> + %35:3 = "tt.reduce"(%32, %33, %34) <{axis = 1 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32): + %58 = arith.subf %arg10, %arg7 : f32 + %59 = arith.addf %arg9, %arg12 : f32 + %60 = arith.cmpf oeq, %59, %cst_0 : f32 + %61 = arith.divf %arg12, %59 : f32 + %62 = arith.select %60, %cst_0, %61 : f32 + %63 = arith.mulf %58, %62 : f32 + %64 = arith.addf %arg7, %63 : f32 + %65 = arith.addf %arg8, %arg11 : f32 + %66 = arith.mulf %58, %58 : f32 + %67 = arith.mulf %66, %arg9 : f32 + %68 = arith.mulf %67, %62 : f32 + %69 = arith.addf %65, %68 : f32 + tt.reduce.return %64, %69, %59 : f32, f32, f32 + }) : (tensor<1x256xf32>, tensor<1x256xf32>, tensor<1x256xf32>) -> (tensor<1xf32>, tensor<1xf32>, tensor<1xf32>) + %36 = tt.expand_dims %35#0 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32> + %37 = tt.expand_dims %35#1 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32> + %38 = tt.load %12, %7, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32> + %39 = tt.splat %arg3 : (!tt.ptr) -> tensor<1x256x!tt.ptr> + %40 = tt.addptr %39, %2 : tensor<1x256x!tt.ptr>, tensor<1x256xi32> + %41 = tt.load %40, %7, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x256xf32> + tt.assert %19, "index out of bounds: 0 <= tmp13 < 50257", "", "_call_with_frames_removed", 883 : tensor<1x1xi1> + %42 = tt.load %25, %7, %cst_6 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x256xf32> + %43 = arith.addf %42, %38 : tensor<1x256xf32> + %44 = tt.broadcast %36 : (tensor<1x1xf32>) -> tensor<1x256xf32> + %45 = arith.subf %43, %44 : tensor<1x256xf32> + %46 = arith.divf %37, %cst_5 : tensor<1x1xf32> + %47 = arith.addf %46, %cst_4 : tensor<1x1xf32> + %48 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<1x1xf32>) -> tensor<1x1xf32> + %49 = tt.broadcast %48 : (tensor<1x1xf32>) -> tensor<1x256xf32> + %50 = arith.mulf %45, %49 : tensor<1x256xf32> + %51 = arith.mulf %50, %41 : tensor<1x256xf32> + %52 = arith.muli %0, %c256_i32 : i32 + %53 = tt.splat %52 : (i32) -> tensor<1x256xi32> + %54 = arith.addi %2, %53 : tensor<1x256xi32> + %55 = tt.splat %arg4 : (!tt.ptr) -> tensor<1x256x!tt.ptr> + %56 = tt.addptr %55, %54 : tensor<1x256x!tt.ptr>, tensor<1x256xi32> + %57 = arith.truncf %51 : tensor<1x256xf32> to tensor<1x256xbf16> + tt.store %56, %57, %7 {cache = 1 : i32, evict = 1 : i32} : tensor<1x256xbf16> + tt.return + } +} diff --git a/.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.ptx b/.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..a57d3dab159f677ac1e1353e92f58df4ed6ff2f8 --- /dev/null +++ b/.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.ptx @@ -0,0 +1,1278 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5de6de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_1[8] = {60, 109, 111, 100, 117, 108, 101, 62}; +.global .align 1 .b8 assertFile_1[68] = {47, 117, 115, 114, 47, 108, 111, 99, 97, 108, 47, 108, 105, 98, 47, 112, 121, 116, 104, 111, 110, 51, 46, 49, 48, 47, 100, 105, 115, 116, 45, 112, 97, 99, 107, 97, 103, 101, 115, 47, 116, 111, 114, 99, 104, 47, 95, 105, 110, 100, 117, 99, 116, 111, 114, 47, 99, 111, 100, 101, 99, 97, 99, 104, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 assertFunc_0[8] = {60, 109, 111, 100, 117, 108, 101, 62}; +.global .align 1 .b8 assertFile_0[68] = {47, 117, 115, 114, 47, 108, 111, 99, 97, 108, 47, 108, 105, 98, 47, 112, 121, 116, 104, 111, 110, 51, 46, 49, 48, 47, 100, 105, 115, 116, 45, 112, 97, 99, 107, 97, 103, 101, 115, 47, 116, 111, 114, 99, 104, 47, 95, 105, 110, 100, 117, 99, 116, 111, 114, 47, 99, 111, 100, 101, 99, 97, 99, 104, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5de6de( + .param .u64 triton__0d1d2d3d4d5de6de_param_0, + .param .u64 triton__0d1d2d3d4d5de6de_param_1, + .param .u64 triton__0d1d2d3d4d5de6de_param_2, + .param .u64 triton__0d1d2d3d4d5de6de_param_3, + .param .u64 triton__0d1d2d3d4d5de6de_param_4, + .param .u32 triton__0d1d2d3d4d5de6de_param_5, + .param .u32 triton__0d1d2d3d4d5de6de_param_6 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<84>; + .reg .b16 %rs<9>; + .reg .b32 %r<236>; + .reg .f32 %f<324>; + .reg .b64 %rd<89>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd10, [triton__0d1d2d3d4d5de6de_param_4]; + ld.param.u64 %rd9, [triton__0d1d2d3d4d5de6de_param_3]; + ld.param.u64 %rd8, [triton__0d1d2d3d4d5de6de_param_2]; + ld.param.u64 %rd29, [triton__0d1d2d3d4d5de6de_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + ld.param.u64 %rd30, [triton__0d1d2d3d4d5de6de_param_1]; + bfe.u32 %r2, %r1, 4, 4; + and.b32 %r12, %r1, 15; + .loc 1 24 33 + shl.b32 %r3, %r12, 3; + .loc 1 21 28 + mov.u32 %r10, %ctaid.x; + .loc 1 21 33 + shl.b32 %r13, %r10, 4; + .loc 1 22 23 + or.b32 %r4, %r13, %r2; + or.b32 %r14, %r13, %r12; + .loc 1 26 30 + mul.wide.s32 %rd31, %r4, 8; + add.s64 %rd12, %rd29, %rd31; + mul.wide.s32 %rd32, %r14, 8; + add.s64 %rd28, %rd29, %rd32; + mov.pred %p3, -1; + .loc 1 26 35 + mov.u64 %rd11, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd11 }, [ %rd12 + 0 ]; + mov.u64 %rd13, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd13 }, [ %rd12 + 0 ]; + mov.u64 %rd15, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd15 }, [ %rd12 + 0 ]; + mov.u64 %rd17, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd17 }, [ %rd12 + 0 ]; + mov.u64 %rd19, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd12 + 0 ]; + mov.u64 %rd21, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd12 + 0 ]; + mov.u64 %rd23, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd12 + 0 ]; + mov.u64 %rd25, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd12 + 0 ]; + mov.u64 %rd27, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd28 + 0 ]; + .loc 1 27 18 + bfe.s32 %r15, %r10, 27, 1; + shr.u32 %r16, %r15, 23; + add.s32 %r17, %r4, %r16; + and.b32 %r18, %r17, 16776704; + sub.s32 %r19, %r4, %r18; + .loc 1 35 44 + shl.b32 %r5, %r19, 8; + .loc 1 36 22 + add.s64 %rd33, %rd27, 50257; + .loc 1 37 22 + setp.lt.s64 %p13, %rd11, 0; + setp.lt.s64 %p14, %rd27, 0; + .loc 1 38 36 + selp.b64 %rd1, %rd33, %rd27, %p14; + .loc 1 40 44 + shl.b64 %rd34, %rd11, 8; + add.s64 %rd35, %rd34, 12865792; + selp.b64 %rd36, %rd35, %rd34, %p13; + shl.b64 %rd37, %rd36, 2; + add.s64 %rd2, %rd30, %rd37; + mov.b32 %r24, 0; + mov.f32 %f292, 0f00000000; + mov.f32 %f293, %f292; + mov.f32 %f294, %f292; + mov.f32 %f295, %f292; + mov.f32 %f296, %f292; + mov.f32 %f297, %f292; + mov.f32 %f298, %f292; + mov.f32 %f299, %f292; + mov.f32 %f300, %f292; + mov.f32 %f301, %f292; + mov.f32 %f302, %f292; + mov.f32 %f303, %f292; + mov.f32 %f304, %f292; + mov.f32 %f305, %f292; + mov.f32 %f306, %f292; + mov.f32 %f307, %f292; + mov.f32 %f308, %f292; + mov.f32 %f309, %f292; + mov.f32 %f310, %f292; + mov.f32 %f311, %f292; + mov.f32 %f312, %f292; + mov.f32 %f313, %f292; + mov.f32 %f314, %f292; + mov.f32 %f315, %f292; + mov.f32 %f316, %f292; + mov.f32 %f317, %f292; + mov.f32 %f318, %f292; + mov.f32 %f319, %f292; + mov.f32 %f320, %f292; + mov.f32 %f321, %f292; + mov.f32 %f322, %f292; + mov.f32 %f323, %f292; + mov.pred %p82, %p3; + mov.u32 %r234, %r24; + bra.uni $L__BB0_1; +$L__BB0_3: + .loc 1 0 0 + mov.b32 %f33, %r20; + mov.b32 %f34, %r21; + mov.b32 %f35, %r22; + mov.b32 %f36, %r23; + mov.b32 %f37, %r28; + mov.b32 %f38, %r29; + mov.b32 %f39, %r30; + mov.b32 %f40, %r31; + .loc 1 40 34 + mul.wide.u32 %rd55, %r7, 4; + add.s64 %rd53, %rd2, %rd55; + .loc 1 40 40 + cvt.u64.u32 %rd56, %r234; + add.s64 %rd57, %rd56, %rd4; + .loc 1 40 34 + shl.b64 %rd58, %rd57, 2; + add.s64 %rd59, %rd2, %rd58; + add.s64 %rd54, %rd59, 16; + mov.b32 %r167, 0; + mov.pred %p49, -1; + .loc 1 40 52 + mov.u32 %r38, 0x0; + mov.u32 %r39, 0x0; + mov.u32 %r40, 0x0; + mov.u32 %r41, 0x0; + @%p49 ld.global.L1::evict_last.v4.b32 { %r38, %r39, %r40, %r41 }, [ %rd53 + 0 ]; + @!%p49 mov.u32 %r38, %r167; + @!%p49 mov.u32 %r39, %r167; + @!%p49 mov.u32 %r40, %r167; + @!%p49 mov.u32 %r41, %r167; + mov.b32 %f92, %r38; + mov.b32 %f93, %r39; + mov.b32 %f94, %r40; + mov.b32 %f95, %r41; + mov.u32 %r46, 0x0; + mov.u32 %r47, 0x0; + mov.u32 %r48, 0x0; + mov.u32 %r49, 0x0; + @%p49 ld.global.L1::evict_last.v4.b32 { %r46, %r47, %r48, %r49 }, [ %rd54 + 0 ]; + @!%p49 mov.u32 %r46, %r167; + @!%p49 mov.u32 %r47, %r167; + @!%p49 mov.u32 %r48, %r167; + @!%p49 mov.u32 %r49, %r167; + mov.b32 %f96, %r46; + mov.b32 %f97, %r47; + mov.b32 %f98, %r48; + mov.b32 %f99, %r49; + .loc 1 41 22 + add.f32 %f100, %f33, %f92; + add.f32 %f101, %f34, %f93; + add.f32 %f102, %f35, %f94; + add.f32 %f103, %f36, %f95; + add.f32 %f104, %f37, %f96; + add.f32 %f105, %f38, %f97; + add.f32 %f106, %f39, %f98; + add.f32 %f107, %f40, %f99; +$L__tmp1: + .loc 2 96 20 + sub.f32 %f108, %f100, %f316; + sub.f32 %f109, %f101, %f317; + sub.f32 %f110, %f102, %f318; + sub.f32 %f111, %f103, %f319; + sub.f32 %f112, %f104, %f320; + sub.f32 %f113, %f105, %f321; + sub.f32 %f114, %f106, %f322; + sub.f32 %f115, %f107, %f323; + .loc 2 97 26 + add.f32 %f292, %f292, 0f3F800000; + add.f32 %f293, %f293, 0f3F800000; + add.f32 %f294, %f294, 0f3F800000; + add.f32 %f295, %f295, 0f3F800000; + add.f32 %f296, %f296, 0f3F800000; + add.f32 %f297, %f297, 0f3F800000; + add.f32 %f298, %f298, 0f3F800000; + add.f32 %f299, %f299, 0f3F800000; + add.f32 %f300, %f300, 0f3F800000; + add.f32 %f301, %f301, 0f3F800000; + add.f32 %f302, %f302, 0f3F800000; + add.f32 %f303, %f303, 0f3F800000; + add.f32 %f304, %f304, 0f3F800000; + add.f32 %f305, %f305, 0f3F800000; + add.f32 %f306, %f306, 0f3F800000; + add.f32 %f307, %f307, 0f3F800000; + .loc 2 98 30 + mov.b32 %r55, %f108; + mov.b32 %r56, %f292; + div.full.f32 %r54, %r55, %r56; + mov.b32 %f116, %r54; + mov.b32 %r58, %f109; + mov.b32 %r59, %f293; + div.full.f32 %r57, %r58, %r59; + mov.b32 %f117, %r57; + mov.b32 %r61, %f110; + mov.b32 %r62, %f294; + div.full.f32 %r60, %r61, %r62; + mov.b32 %f118, %r60; + mov.b32 %r64, %f111; + mov.b32 %r65, %f295; + div.full.f32 %r63, %r64, %r65; + mov.b32 %f119, %r63; + mov.b32 %r67, %f112; + mov.b32 %r68, %f296; + div.full.f32 %r66, %r67, %r68; + mov.b32 %f120, %r66; + mov.b32 %r70, %f113; + mov.b32 %r71, %f297; + div.full.f32 %r69, %r70, %r71; + mov.b32 %f121, %r69; + mov.b32 %r73, %f114; + mov.b32 %r74, %f298; + div.full.f32 %r72, %r73, %r74; + mov.b32 %f122, %r72; + mov.b32 %r76, %f115; + mov.b32 %r77, %f299; + div.full.f32 %r75, %r76, %r77; + mov.b32 %f123, %r75; + .loc 2 98 22 + add.f32 %f316, %f316, %f116; + add.f32 %f317, %f317, %f117; + add.f32 %f318, %f318, %f118; + add.f32 %f319, %f319, %f119; + add.f32 %f320, %f320, %f120; + add.f32 %f321, %f321, %f121; + add.f32 %f322, %f322, %f122; + add.f32 %f323, %f323, %f123; + .loc 2 101 30 + sub.f32 %f124, %f100, %f316; + sub.f32 %f125, %f101, %f317; + sub.f32 %f126, %f102, %f318; + sub.f32 %f127, %f103, %f319; + sub.f32 %f128, %f104, %f320; + sub.f32 %f129, %f105, %f321; + sub.f32 %f130, %f106, %f322; + sub.f32 %f131, %f107, %f323; +$L__tmp2: + .loc 1 47 48 + fma.rn.f32 %f308, %f108, %f124, %f308; + fma.rn.f32 %f309, %f109, %f125, %f309; + fma.rn.f32 %f310, %f110, %f126, %f310; + fma.rn.f32 %f311, %f111, %f127, %f311; + fma.rn.f32 %f312, %f112, %f128, %f312; + fma.rn.f32 %f313, %f113, %f129, %f313; + fma.rn.f32 %f314, %f114, %f130, %f314; + fma.rn.f32 %f315, %f115, %f131, %f315; + mov.b32 %r234, 128; + mov.pred %p82, 0; + .loc 1 31 36 + @%p1 bra $L__BB0_1; + bra.uni $L__BB0_4; +$L__BB0_1: + .loc 1 0 36 + mov.pred %p1, %p82; + .loc 1 39 40 + setp.lt.u64 %p25, %rd1, 50257; + .loc 1 32 27 + or.b32 %r7, %r234, %r3; + .loc 1 35 40 + or.b32 %r36, %r7, %r5; + .loc 1 35 34 + mul.wide.s32 %rd40, %r36, 4; + add.s64 %rd38, %rd8, %rd40; + cvt.s64.s32 %rd3, %r5; + cvt.s64.s32 %rd41, %r234; + cvt.u64.u32 %rd4, %r3; + add.s64 %rd42, %rd41, %rd4; + add.s64 %rd43, %rd42, %rd3; + shl.b64 %rd44, %rd43, 2; + add.s64 %rd45, %rd8, %rd44; + add.s64 %rd39, %rd45, 16; + .loc 1 35 50 + mov.u32 %r20, 0x0; + mov.u32 %r21, 0x0; + mov.u32 %r22, 0x0; + mov.u32 %r23, 0x0; + @%p3 ld.global.L1::evict_last.v4.b32 { %r20, %r21, %r22, %r23 }, [ %rd38 + 0 ]; + @!%p3 mov.u32 %r20, %r24; + @!%p3 mov.u32 %r21, %r24; + @!%p3 mov.u32 %r22, %r24; + @!%p3 mov.u32 %r23, %r24; + mov.u32 %r28, 0x0; + mov.u32 %r29, 0x0; + mov.u32 %r30, 0x0; + mov.u32 %r31, 0x0; + @%p3 ld.global.L1::evict_last.v4.b32 { %r28, %r29, %r30, %r31 }, [ %rd39 + 0 ]; + @!%p3 mov.u32 %r28, %r24; + @!%p3 mov.u32 %r29, %r24; + @!%p3 mov.u32 %r30, %r24; + @!%p3 mov.u32 %r31, %r24; + mov.b32 %r233, 1892; + mov.u64 %rd88, 1; + .loc 1 39 55 + @%p25 bra $L__BB0_3; + mov.u64 %rd46, assertMessage_0; + cvta.global.u64 %rd47, %rd46; + mov.u64 %rd48, assertFile_0; + cvta.global.u64 %rd49, %rd48; + mov.u64 %rd50, assertFunc_0; + cvta.global.u64 %rd51, %rd50; + { // callseq 0, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd47; + .param .b64 param1; + st.param.b64 [param1+0], %rd49; + .param .b32 param2; + st.param.b32 [param2+0], %r233; + .param .b64 param3; + st.param.b64 [param3+0], %rd51; + .param .b64 param4; + st.param.b64 [param4+0], %rd88; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 0 + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 24 33 + and.b32 %r137, %r1, 127; + .loc 1 31 36 + bfe.s32 %r138, %r1, 7, 1; + and.b32 %r139, %r138, 136; + add.s32 %r140, %r139, %r137; + shl.b32 %r141, %r140, 2; + mov.u32 %r142, global_smem; + add.s32 %r143, %r142, %r141; + st.shared.f32 [%r143], %f300; + st.shared.f32 [%r143+1088], %f301; + st.shared.f32 [%r143+2176], %f302; + st.shared.f32 [%r143+3264], %f303; + st.shared.f32 [%r143+4352], %f304; + st.shared.f32 [%r143+5440], %f305; + st.shared.f32 [%r143+6528], %f306; + st.shared.f32 [%r143+7616], %f307; + bar.sync 0; + mad.lo.s32 %r144, %r2, 136, %r3; + shl.b32 %r145, %r144, 2; + add.s32 %r146, %r142, %r145; + ld.shared.v4.f32 {%f132, %f133, %f134, %f135}, [%r146]; + ld.shared.v4.f32 {%f136, %f137, %f138, %f139}, [%r146+16]; +$L__tmp3: + .loc 2 108 21 + sub.f32 %f140, %f317, %f316; + .loc 2 109 28 + add.f32 %f141, %f132, %f133; + .loc 2 110 39 + setp.eq.f32 %p38, %f141, 0f00000000; + .loc 2 110 60 + mov.b32 %r80, %f133; + mov.b32 %r81, %f141; + div.full.f32 %r79, %r80, %r81; + mov.b32 %f142, %r79; + .loc 2 110 49 + selp.f32 %f143, 0f00000000, %f142, %p38; + .loc 2 112 17 + fma.rn.f32 %f144, %f140, %f143, %f316; + .loc 2 113 15 + add.f32 %f145, %f308, %f309; + .loc 2 113 30 + mul.f32 %f146, %f140, %f140; + .loc 2 113 38 + mul.f32 %f147, %f146, %f132; + .loc 2 113 22 + fma.rn.f32 %f148, %f147, %f143, %f145; + .loc 2 108 21 + sub.f32 %f149, %f318, %f144; + .loc 2 109 28 + add.f32 %f150, %f134, %f141; + .loc 2 110 39 + setp.eq.f32 %p39, %f150, 0f00000000; + .loc 2 110 60 + mov.b32 %r84, %f150; + mov.b32 %r83, %f134; + div.full.f32 %r82, %r83, %r84; + mov.b32 %f151, %r82; + .loc 2 110 49 + selp.f32 %f152, 0f00000000, %f151, %p39; + .loc 2 112 17 + fma.rn.f32 %f153, %f152, %f149, %f144; + .loc 2 113 15 + add.f32 %f154, %f310, %f148; + .loc 2 113 30 + mul.f32 %f155, %f149, %f149; + .loc 2 113 38 + mul.f32 %f156, %f141, %f155; + .loc 2 113 22 + fma.rn.f32 %f157, %f152, %f156, %f154; + .loc 2 108 21 + sub.f32 %f158, %f319, %f153; + .loc 2 109 28 + add.f32 %f159, %f135, %f150; + .loc 2 110 39 + setp.eq.f32 %p40, %f159, 0f00000000; + .loc 2 110 60 + mov.b32 %r87, %f159; + mov.b32 %r86, %f135; + div.full.f32 %r85, %r86, %r87; + mov.b32 %f160, %r85; + .loc 2 110 49 + selp.f32 %f161, 0f00000000, %f160, %p40; + .loc 2 112 17 + fma.rn.f32 %f162, %f161, %f158, %f153; + .loc 2 113 15 + add.f32 %f163, %f311, %f157; + .loc 2 113 30 + mul.f32 %f164, %f158, %f158; + .loc 2 113 38 + mul.f32 %f165, %f150, %f164; + .loc 2 113 22 + fma.rn.f32 %f166, %f161, %f165, %f163; + .loc 2 108 21 + sub.f32 %f167, %f320, %f162; + .loc 2 109 28 + add.f32 %f168, %f136, %f159; + .loc 2 110 39 + setp.eq.f32 %p41, %f168, 0f00000000; + .loc 2 110 60 + mov.b32 %r90, %f168; + mov.b32 %r89, %f136; + div.full.f32 %r88, %r89, %r90; + mov.b32 %f169, %r88; + .loc 2 110 49 + selp.f32 %f170, 0f00000000, %f169, %p41; + .loc 2 112 17 + fma.rn.f32 %f171, %f170, %f167, %f162; + .loc 2 113 15 + add.f32 %f172, %f312, %f166; + .loc 2 113 30 + mul.f32 %f173, %f167, %f167; + .loc 2 113 38 + mul.f32 %f174, %f159, %f173; + .loc 2 113 22 + fma.rn.f32 %f175, %f170, %f174, %f172; + .loc 2 108 21 + sub.f32 %f176, %f321, %f171; + .loc 2 109 28 + add.f32 %f177, %f137, %f168; + .loc 2 110 39 + setp.eq.f32 %p42, %f177, 0f00000000; + .loc 2 110 60 + mov.b32 %r93, %f177; + mov.b32 %r92, %f137; + div.full.f32 %r91, %r92, %r93; + mov.b32 %f178, %r91; + .loc 2 110 49 + selp.f32 %f179, 0f00000000, %f178, %p42; + .loc 2 112 17 + fma.rn.f32 %f180, %f179, %f176, %f171; + .loc 2 113 15 + add.f32 %f181, %f313, %f175; + .loc 2 113 30 + mul.f32 %f182, %f176, %f176; + .loc 2 113 38 + mul.f32 %f183, %f168, %f182; + .loc 2 113 22 + fma.rn.f32 %f184, %f179, %f183, %f181; + .loc 2 108 21 + sub.f32 %f185, %f322, %f180; + .loc 2 109 28 + add.f32 %f186, %f138, %f177; + .loc 2 110 39 + setp.eq.f32 %p43, %f186, 0f00000000; + .loc 2 110 60 + mov.b32 %r96, %f186; + mov.b32 %r95, %f138; + div.full.f32 %r94, %r95, %r96; + mov.b32 %f187, %r94; + .loc 2 110 49 + selp.f32 %f188, 0f00000000, %f187, %p43; + .loc 2 112 17 + fma.rn.f32 %f189, %f188, %f185, %f180; + .loc 2 113 15 + add.f32 %f190, %f314, %f184; + .loc 2 113 30 + mul.f32 %f191, %f185, %f185; + .loc 2 113 38 + mul.f32 %f192, %f177, %f191; + .loc 2 113 22 + fma.rn.f32 %f193, %f188, %f192, %f190; + .loc 2 108 21 + sub.f32 %f194, %f323, %f189; + .loc 2 109 28 + add.f32 %f195, %f139, %f186; + .loc 2 110 39 + setp.eq.f32 %p44, %f195, 0f00000000; + .loc 2 110 60 + mov.b32 %r99, %f195; + mov.b32 %r98, %f139; + div.full.f32 %r97, %r98, %r99; + mov.b32 %f196, %r97; + .loc 2 110 49 + selp.f32 %f197, 0f00000000, %f196, %p44; + .loc 2 112 17 + fma.rn.f32 %f198, %f197, %f194, %f189; + .loc 2 113 15 + add.f32 %f199, %f315, %f193; + .loc 2 113 30 + mul.f32 %f200, %f194, %f194; + .loc 2 113 38 + mul.f32 %f201, %f186, %f200; + .loc 2 113 22 + fma.rn.f32 %f202, %f197, %f201, %f199; +$L__tmp4: + .loc 2 120 46 + mov.b32 %r147, %f198; + shfl.sync.bfly.b32 %r148, %r147, 8, 31, -1; + mov.b32 %f203, %r148; + mov.b32 %r149, %f202; + shfl.sync.bfly.b32 %r150, %r149, 8, 31, -1; + mov.b32 %f204, %r150; + shfl.sync.bfly.b32 %r101, %r99, 8, 31, -1; + mov.b32 %f205, %r101; +$L__tmp5: + .loc 2 108 21 + sub.f32 %f206, %f203, %f198; + .loc 2 109 28 + add.f32 %f207, %f195, %f205; + .loc 2 110 39 + setp.eq.f32 %p45, %f207, 0f00000000; + .loc 2 110 60 + mov.b32 %r102, %f207; + div.full.f32 %r100, %r101, %r102; + mov.b32 %f208, %r100; + .loc 2 110 49 + selp.f32 %f209, 0f00000000, %f208, %p45; + .loc 2 112 17 + fma.rn.f32 %f210, %f209, %f206, %f198; + .loc 2 113 15 + add.f32 %f211, %f202, %f204; + .loc 2 113 30 + mul.f32 %f212, %f206, %f206; + .loc 2 113 38 + mul.f32 %f213, %f195, %f212; + .loc 2 113 22 + fma.rn.f32 %f214, %f209, %f213, %f211; +$L__tmp6: + .loc 2 120 46 + mov.b32 %r151, %f210; + shfl.sync.bfly.b32 %r152, %r151, 4, 31, -1; + mov.b32 %f215, %r152; + mov.b32 %r153, %f214; + shfl.sync.bfly.b32 %r154, %r153, 4, 31, -1; + mov.b32 %f216, %r154; + shfl.sync.bfly.b32 %r104, %r102, 4, 31, -1; + mov.b32 %f217, %r104; +$L__tmp7: + .loc 2 108 21 + sub.f32 %f218, %f215, %f210; + .loc 2 109 28 + add.f32 %f219, %f207, %f217; + .loc 2 110 39 + setp.eq.f32 %p46, %f219, 0f00000000; + .loc 2 110 60 + mov.b32 %r105, %f219; + div.full.f32 %r103, %r104, %r105; + mov.b32 %f220, %r103; + .loc 2 110 49 + selp.f32 %f221, 0f00000000, %f220, %p46; + .loc 2 112 17 + fma.rn.f32 %f222, %f221, %f218, %f210; + .loc 2 113 15 + add.f32 %f223, %f214, %f216; + .loc 2 113 30 + mul.f32 %f224, %f218, %f218; + .loc 2 113 38 + mul.f32 %f225, %f207, %f224; + .loc 2 113 22 + fma.rn.f32 %f226, %f221, %f225, %f223; +$L__tmp8: + .loc 2 120 46 + mov.b32 %r155, %f222; + shfl.sync.bfly.b32 %r156, %r155, 2, 31, -1; + mov.b32 %f227, %r156; + mov.b32 %r157, %f226; + shfl.sync.bfly.b32 %r158, %r157, 2, 31, -1; + mov.b32 %f228, %r158; + shfl.sync.bfly.b32 %r107, %r105, 2, 31, -1; + mov.b32 %f229, %r107; +$L__tmp9: + .loc 2 108 21 + sub.f32 %f230, %f227, %f222; + .loc 2 109 28 + add.f32 %f231, %f219, %f229; + .loc 2 110 39 + setp.eq.f32 %p47, %f231, 0f00000000; + .loc 2 110 60 + mov.b32 %r108, %f231; + div.full.f32 %r106, %r107, %r108; + mov.b32 %f232, %r106; + .loc 2 110 49 + selp.f32 %f233, 0f00000000, %f232, %p47; + .loc 2 112 17 + fma.rn.f32 %f234, %f233, %f230, %f222; + .loc 2 113 15 + add.f32 %f235, %f226, %f228; + .loc 2 113 30 + mul.f32 %f236, %f230, %f230; + .loc 2 113 38 + mul.f32 %f237, %f219, %f236; + .loc 2 113 22 + fma.rn.f32 %f238, %f233, %f237, %f235; +$L__tmp10: + .loc 2 120 46 + mov.b32 %r159, %f234; + shfl.sync.bfly.b32 %r160, %r159, 1, 31, -1; + mov.b32 %f239, %r160; + mov.b32 %r161, %f238; + shfl.sync.bfly.b32 %r162, %r161, 1, 31, -1; + mov.b32 %f240, %r162; + shfl.sync.bfly.b32 %r110, %r108, 1, 31, -1; + mov.b32 %f241, %r110; +$L__tmp11: + .loc 2 108 21 + sub.f32 %f242, %f239, %f234; + .loc 2 109 28 + add.f32 %f243, %f231, %f241; + .loc 2 110 39 + setp.eq.f32 %p48, %f243, 0f00000000; + .loc 2 110 60 + mov.b32 %r111, %f243; + div.full.f32 %r109, %r110, %r111; + mov.b32 %f244, %r109; + .loc 2 110 49 + selp.f32 %f245, 0f00000000, %f244, %p48; + .loc 2 112 17 + fma.rn.f32 %f73, %f245, %f242, %f234; + .loc 2 113 15 + add.f32 %f246, %f238, %f240; + .loc 2 113 30 + mul.f32 %f247, %f242, %f242; + .loc 2 113 38 + mul.f32 %f248, %f231, %f247; + .loc 2 113 22 + fma.rn.f32 %f249, %f245, %f248, %f246; +$L__tmp12: + .loc 1 69 23 + mov.b32 %r113, %f249; + mov.b32 %r114, 1132462080; + div.full.f32 %r112, %r113, %r114; + mov.b32 %f250, %r112; + .loc 1 71 24 + add.f32 %f74, %f250, 0f3727C5AC; + .loc 1 76 39 + shl.b32 %r8, %r4, 8; + rsqrt.approx.ftz.f32 %f275, %f74; + mov.pred %p83, %p49; + mov.u32 %r235, %r167; + bra.uni $L__BB0_5; +$L__BB0_7: + .loc 1 65 35 + shl.b64 %rd84, %rd6, 2; + add.s64 %rd81, %rd2, %rd84; + add.s64 %rd86, %rd2, %rd72; + add.s64 %rd82, %rd86, 16; + mov.b32 %r202, 0; + mov.pred %p70, -1; + .loc 1 65 54 + mov.u32 %r198, 0x0; + mov.u32 %r199, 0x0; + mov.u32 %r200, 0x0; + mov.u32 %r201, 0x0; + @%p70 ld.global.L1::evict_first.v4.b32 { %r198, %r199, %r200, %r201 }, [ %rd81 + 0 ]; + @!%p70 mov.u32 %r198, %r202; + @!%p70 mov.u32 %r199, %r202; + @!%p70 mov.u32 %r200, %r202; + @!%p70 mov.u32 %r201, %r202; + mov.b32 %f251, %r198; + mov.b32 %f252, %r199; + mov.b32 %f253, %r200; + mov.b32 %f254, %r201; + mov.u32 %r206, 0x0; + mov.u32 %r207, 0x0; + mov.u32 %r208, 0x0; + mov.u32 %r209, 0x0; + @%p70 ld.global.L1::evict_first.v4.b32 { %r206, %r207, %r208, %r209 }, [ %rd82 + 0 ]; + @!%p70 mov.u32 %r206, %r202; + @!%p70 mov.u32 %r207, %r202; + @!%p70 mov.u32 %r208, %r202; + @!%p70 mov.u32 %r209, %r202; + mov.b32 %f255, %r206; + mov.b32 %f256, %r207; + mov.b32 %f257, %r208; + mov.b32 %f258, %r209; + .loc 1 66 24 + add.f32 %f259, %f75, %f251; + add.f32 %f260, %f76, %f252; + add.f32 %f261, %f77, %f253; + add.f32 %f262, %f78, %f254; + add.f32 %f263, %f79, %f255; + add.f32 %f264, %f80, %f256; + add.f32 %f265, %f81, %f257; + add.f32 %f266, %f82, %f258; + .loc 1 67 24 + sub.f32 %f267, %f259, %f73; + sub.f32 %f268, %f260, %f73; + sub.f32 %f269, %f261, %f73; + sub.f32 %f270, %f262, %f73; + sub.f32 %f271, %f263, %f73; + sub.f32 %f272, %f264, %f73; + sub.f32 %f273, %f265, %f73; + sub.f32 %f274, %f266, %f73; + cvt.u32.u64 %r227, %rd6; + .loc 1 73 24 + mul.f32 %f276, %f267, %f275; + mul.f32 %f277, %f268, %f275; + mul.f32 %f278, %f269, %f275; + mul.f32 %f279, %f270, %f275; + mul.f32 %f280, %f271, %f275; + mul.f32 %f281, %f272, %f275; + mul.f32 %f282, %f273, %f275; + mul.f32 %f283, %f274, %f275; + .loc 1 74 24 + mul.f32 %f284, %f276, %f83; + mul.f32 %f285, %f277, %f84; + mul.f32 %f286, %f278, %f85; + mul.f32 %f287, %f279, %f86; + mul.f32 %f288, %f280, %f87; + mul.f32 %f289, %f281, %f88; + mul.f32 %f290, %f282, %f89; + mul.f32 %f291, %f283, %f90; + .loc 1 76 35 + or.b32 %r228, %r227, %r8; + .loc 1 76 29 + mul.wide.s32 %rd87, %r228, 2; + add.s64 %rd83, %rd10, %rd87; + .loc 1 76 52 + mov.b32 %r214, %f284; + cvt.rn.bf16.f32 %rs1, %r214; + mov.b32 %r215, %f285; + cvt.rn.bf16.f32 %rs2, %r215; + mov.b32 %r216, %f286; + cvt.rn.bf16.f32 %rs3, %r216; + mov.b32 %r217, %f287; + cvt.rn.bf16.f32 %rs4, %r217; + mov.b32 %r218, %f288; + cvt.rn.bf16.f32 %rs5, %r218; + mov.b32 %r219, %f289; + cvt.rn.bf16.f32 %rs6, %r219; + mov.b32 %r220, %f290; + cvt.rn.bf16.f32 %rs7, %r220; + mov.b32 %r221, %f291; + cvt.rn.bf16.f32 %rs8, %r221; + mov.b32 %r229, {%rs1, %rs2}; + mov.b32 %r230, {%rs3, %rs4}; + mov.b32 %r231, {%rs5, %rs6}; + mov.b32 %r232, {%rs7, %rs8}; + @%p70 st.global.v4.b32 [ %rd83 + 0 ], { %r229, %r230, %r231, %r232 }; + mov.b32 %r235, 128; + mov.pred %p83, 0; + .loc 1 55 36 + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_8; +$L__BB0_5: + .loc 1 0 36 + mov.pred %p2, %p83; + .loc 1 56 27 + or.b32 %r195, %r235, %r3; + .loc 1 59 41 + or.b32 %r196, %r195, %r5; + .loc 1 59 35 + mul.wide.s32 %rd64, %r196, 4; + add.s64 %rd60, %rd8, %rd64; + cvt.s64.s32 %rd65, %r235; + add.s64 %rd66, %rd65, %rd4; + add.s64 %rd67, %rd66, %rd3; + shl.b64 %rd68, %rd67, 2; + add.s64 %rd69, %rd8, %rd68; + add.s64 %rd61, %rd69, 16; + .loc 1 59 51 + mov.u32 %r163, 0x0; + mov.u32 %r164, 0x0; + mov.u32 %r165, 0x0; + mov.u32 %r166, 0x0; + @%p49 ld.global.L1::evict_last.v4.b32 { %r163, %r164, %r165, %r166 }, [ %rd60 + 0 ]; + @!%p49 mov.u32 %r163, %r167; + @!%p49 mov.u32 %r164, %r167; + @!%p49 mov.u32 %r165, %r167; + @!%p49 mov.u32 %r166, %r167; + mov.b32 %f75, %r163; + mov.b32 %f76, %r164; + mov.b32 %f77, %r165; + mov.b32 %f78, %r166; + mov.u32 %r171, 0x0; + mov.u32 %r172, 0x0; + mov.u32 %r173, 0x0; + mov.u32 %r174, 0x0; + @%p49 ld.global.L1::evict_last.v4.b32 { %r171, %r172, %r173, %r174 }, [ %rd61 + 0 ]; + @!%p49 mov.u32 %r171, %r167; + @!%p49 mov.u32 %r172, %r167; + @!%p49 mov.u32 %r173, %r167; + @!%p49 mov.u32 %r174, %r167; + mov.b32 %f79, %r171; + mov.b32 %f80, %r172; + mov.b32 %f81, %r173; + mov.b32 %f82, %r174; + .loc 1 60 35 + cvt.u64.u32 %rd6, %r195; + mul.wide.u32 %rd70, %r195, 4; + add.s64 %rd62, %rd9, %rd70; + cvt.u64.u32 %rd71, %r235; + add.s64 %rd7, %rd71, %rd4; + shl.b64 %rd72, %rd7, 2; + add.s64 %rd73, %rd9, %rd72; + add.s64 %rd63, %rd73, 16; + .loc 1 60 40 + mov.u32 %r179, 0x0; + mov.u32 %r180, 0x0; + mov.u32 %r181, 0x0; + mov.u32 %r182, 0x0; + @%p49 ld.global.L1::evict_last.v4.b32 { %r179, %r180, %r181, %r182 }, [ %rd62 + 0 ]; + @!%p49 mov.u32 %r179, %r167; + @!%p49 mov.u32 %r180, %r167; + @!%p49 mov.u32 %r181, %r167; + @!%p49 mov.u32 %r182, %r167; + mov.b32 %f83, %r179; + mov.b32 %f84, %r180; + mov.b32 %f85, %r181; + mov.b32 %f86, %r182; + mov.u32 %r187, 0x0; + mov.u32 %r188, 0x0; + mov.u32 %r189, 0x0; + mov.u32 %r190, 0x0; + @%p49 ld.global.L1::evict_last.v4.b32 { %r187, %r188, %r189, %r190 }, [ %rd63 + 0 ]; + @!%p49 mov.u32 %r187, %r167; + @!%p49 mov.u32 %r188, %r167; + @!%p49 mov.u32 %r189, %r167; + @!%p49 mov.u32 %r190, %r167; + mov.b32 %f87, %r187; + mov.b32 %f88, %r188; + mov.b32 %f89, %r189; + mov.b32 %f90, %r190; + .loc 1 64 57 + @%p25 bra $L__BB0_7; + mov.u64 %rd74, assertMessage_1; + cvta.global.u64 %rd75, %rd74; + mov.u64 %rd76, assertFile_1; + cvta.global.u64 %rd77, %rd76; + mov.u64 %rd78, assertFunc_1; + cvta.global.u64 %rd79, %rd78; + { // callseq 1, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd75; + .param .b64 param1; + st.param.b64 [param1+0], %rd77; + .param .b32 param2; + st.param.b32 [param2+0], %r233; + .param .b64 param3; + st.param.b64 [param3+0], %rd79; + .param .b64 param4; + st.param.b64 [param4+0], %rd88; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 1 + bra.uni $L__BB0_7; +$L__BB0_8: + .loc 1 55 4 + ret; +$L__tmp13: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/gx/cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 298 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 103 +.b8 120 +.b8 53 +.b8 108 +.b8 120 +.b8 112 +.b8 117 +.b8 101 +.b8 120 +.b8 112 +.b8 105 +.b8 110 +.b8 100 +.b8 106 +.b8 52 +.b8 100 +.b8 115 +.b8 109 +.b8 106 +.b8 122 +.b8 53 +.b8 120 +.b8 52 +.b8 50 +.b8 117 +.b8 104 +.b8 121 +.b8 121 +.b8 55 +.b8 105 +.b8 115 +.b8 107 +.b8 101 +.b8 118 +.b8 113 +.b8 55 +.b8 111 +.b8 118 +.b8 122 +.b8 112 +.b8 119 +.b8 97 +.b8 103 +.b8 98 +.b8 51 +.b8 116 +.b8 53 +.b8 112 +.b8 111 +.b8 119 +.b8 106 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 103 +.b8 120 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp2 +.b8 2 +.b8 44 +.b8 38 +.b8 5 +.b32 125 +.b64 $L__tmp3 +.b64 $L__tmp12 +.b8 2 +.b8 50 +.b8 41 +.b8 4 +.b32 125 +.b64 $L__tmp3 +.b64 $L__tmp12 +.b8 2 +.b8 120 +.b8 46 +.b8 0 +.b8 4 +.b32 125 +.b64 $L__tmp4 +.b64 $L__tmp11 +.b8 2 +.b8 50 +.b8 41 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 302 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 101 +.b8 54 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 302 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.ttir b/.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..5b4589d516d59c82c67286e9e46db7ee19b695a3 --- /dev/null +++ b/.triton/dump/cde66a00b7594f6428e4f4dcdfa88537/triton_.ttir @@ -0,0 +1,139 @@ +module { + tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant dense<1.000000e+00> : tensor<16x128xf32> + %c256_i32 = arith.constant 256 : i32 + %c128_i32 = arith.constant 128 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_1 = arith.constant dense<256> : tensor<16x1xi64> + %cst_2 = arith.constant dense<0> : tensor<16x1xi64> + %cst_3 = arith.constant dense<50257> : tensor<16x1xi64> + %cst_4 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32> + %cst_5 = arith.constant dense<2.560000e+02> : tensor<16x1xf32> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x128xf32> + %cst_7 = arith.constant dense<0.000000e+00> : tensor<16x128xf32> + %cst_8 = arith.constant dense<256> : tensor<16x1xi32> + %cst_9 = arith.constant dense<256> : tensor<1x128xi32> + %cst_10 = arith.constant dense<512> : tensor<16x1xi32> + %c16_i32 = arith.constant 16 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c16_i32 : i32 + %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32>) -> tensor<16x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<16x1xi32> + %5 = arith.addi %4, %3 : tensor<16x1xi32> + %6 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<128xi32>) -> tensor<1x128xi32> + %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x1x!tt.ptr> + %9 = tt.addptr %8, %5 : tensor<16x1x!tt.ptr>, tensor<16x1xi32> + %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64> + %11 = arith.remsi %5, %cst_10 : tensor<16x1xi32> + %12 = arith.muli %11, %cst_8 : tensor<16x1xi32> + %13 = tt.broadcast %12 : (tensor<16x1xi32>) -> tensor<16x128xi32> + %14 = tt.splat %arg2 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + %15 = arith.addi %10, %cst_3 : tensor<16x1xi64> + %16 = arith.cmpi slt, %10, %cst_2 : tensor<16x1xi64> + %17 = arith.select %16, %15, %10 : tensor<16x1xi1>, tensor<16x1xi64> + %18 = arith.cmpi sge, %17, %cst_2 : tensor<16x1xi64> + %19 = arith.cmpi slt, %17, %cst_3 : tensor<16x1xi64> + %20 = arith.andi %18, %19 : tensor<16x1xi1> + %21 = arith.muli %17, %cst_1 : tensor<16x1xi64> + %22 = tt.broadcast %21 : (tensor<16x1xi64>) -> tensor<16x128xi64> + %23 = tt.splat %arg1 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + %24:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c128_i32 iter_args(%arg8 = %cst_7, %arg9 = %cst_7, %arg10 = %cst_7) -> (tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32>) : i32 { + %47 = tt.splat %arg7 : (i32) -> tensor<1x128xi32> + %48 = arith.addi %47, %7 : tensor<1x128xi32> + %49 = arith.cmpi slt, %48, %cst_9 : tensor<1x128xi32> + %50 = tt.broadcast %48 : (tensor<1x128xi32>) -> tensor<16x128xi32> + %51 = arith.addi %50, %13 : tensor<16x128xi32> + %52 = tt.addptr %14, %51 : tensor<16x128x!tt.ptr>, tensor<16x128xi32> + %53 = tt.broadcast %49 : (tensor<1x128xi1>) -> tensor<16x128xi1> + %54 = tt.load %52, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32> + tt.assert %20, "index out of bounds: 0 <= tmp3 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "", 1892 : tensor<16x1xi1> + %55 = arith.extsi %48 : tensor<1x128xi32> to tensor<1x128xi64> + %56 = tt.broadcast %55 : (tensor<1x128xi64>) -> tensor<16x128xi64> + %57 = arith.addi %56, %22 : tensor<16x128xi64> + %58 = tt.addptr %23, %57 : tensor<16x128x!tt.ptr>, tensor<16x128xi64> + %59 = tt.load %58, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32> + %60 = arith.addf %59, %54 : tensor<16x128xf32> + %61 = arith.subf %60, %arg8 : tensor<16x128xf32> + %62 = arith.addf %arg10, %cst_0 : tensor<16x128xf32> + %63 = arith.divf %61, %62 : tensor<16x128xf32> + %64 = arith.addf %arg8, %63 : tensor<16x128xf32> + %65 = arith.subf %60, %64 : tensor<16x128xf32> + %66 = arith.mulf %61, %65 : tensor<16x128xf32> + %67 = arith.addf %arg9, %66 : tensor<16x128xf32> + %68 = arith.select %53, %64, %arg8 : tensor<16x128xi1>, tensor<16x128xf32> + %69 = arith.select %53, %67, %arg9 : tensor<16x128xi1>, tensor<16x128xf32> + %70 = arith.select %53, %62, %arg10 : tensor<16x128xi1>, tensor<16x128xf32> + scf.yield %68, %69, %70 : tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32> + } + %25:3 = "tt.reduce"(%24#0, %24#1, %24#2) <{axis = 1 : i32}> ({ + ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32): + %47 = arith.subf %arg10, %arg7 : f32 + %48 = arith.addf %arg9, %arg12 : f32 + %49 = arith.cmpf oeq, %48, %cst : f32 + %50 = arith.divf %arg12, %48 : f32 + %51 = arith.select %49, %cst, %50 : f32 + %52 = arith.mulf %47, %51 : f32 + %53 = arith.addf %arg7, %52 : f32 + %54 = arith.addf %arg8, %arg11 : f32 + %55 = arith.mulf %47, %47 : f32 + %56 = arith.mulf %55, %arg9 : f32 + %57 = arith.mulf %56, %51 : f32 + %58 = arith.addf %54, %57 : f32 + tt.reduce.return %53, %58, %48 : f32, f32, f32 + }) : (tensor<16x128xf32>, tensor<16x128xf32>, tensor<16x128xf32>) -> (tensor<16xf32>, tensor<16xf32>, tensor<16xf32>) + %26 = tt.expand_dims %25#0 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32> + %27 = tt.expand_dims %25#1 {axis = 1 : i32} : (tensor<16xf32>) -> tensor<16x1xf32> + %28 = arith.muli %11, %cst_8 : tensor<16x1xi32> + %29 = tt.broadcast %28 : (tensor<16x1xi32>) -> tensor<16x128xi32> + %30 = tt.splat %arg2 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + %31 = tt.splat %arg3 : (!tt.ptr) -> tensor<1x128x!tt.ptr> + %32 = arith.addi %10, %cst_3 : tensor<16x1xi64> + %33 = arith.cmpi slt, %10, %cst_2 : tensor<16x1xi64> + %34 = arith.select %33, %32, %10 : tensor<16x1xi1>, tensor<16x1xi64> + %35 = arith.cmpi sge, %34, %cst_2 : tensor<16x1xi64> + %36 = arith.cmpi slt, %34, %cst_3 : tensor<16x1xi64> + %37 = arith.andi %35, %36 : tensor<16x1xi1> + %38 = arith.muli %34, %cst_1 : tensor<16x1xi64> + %39 = tt.broadcast %38 : (tensor<16x1xi64>) -> tensor<16x128xi64> + %40 = tt.splat %arg1 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + %41 = tt.broadcast %26 : (tensor<16x1xf32>) -> tensor<16x128xf32> + %42 = arith.divf %27, %cst_5 : tensor<16x1xf32> + %43 = arith.addf %42, %cst_4 : tensor<16x1xf32> + %44 = arith.muli %5, %cst_8 : tensor<16x1xi32> + %45 = tt.broadcast %44 : (tensor<16x1xi32>) -> tensor<16x128xi32> + %46 = tt.splat %arg4 : (!tt.ptr) -> tensor<16x128x!tt.ptr> + scf.for %arg7 = %c0_i32 to %c256_i32 step %c128_i32 : i32 { + %47 = tt.splat %arg7 : (i32) -> tensor<1x128xi32> + %48 = arith.addi %47, %7 : tensor<1x128xi32> + %49 = arith.cmpi slt, %48, %cst_9 : tensor<1x128xi32> + %50 = tt.broadcast %48 : (tensor<1x128xi32>) -> tensor<16x128xi32> + %51 = arith.addi %50, %29 : tensor<16x128xi32> + %52 = tt.addptr %30, %51 : tensor<16x128x!tt.ptr>, tensor<16x128xi32> + %53 = tt.broadcast %49 : (tensor<1x128xi1>) -> tensor<16x128xi1> + %54 = tt.load %52, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32> + %55 = tt.addptr %31, %48 : tensor<1x128x!tt.ptr>, tensor<1x128xi32> + %56 = tt.load %55, %49, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x128xf32> + tt.assert %37, "index out of bounds: 0 <= tmp13 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "", 1892 : tensor<16x1xi1> + %57 = arith.extsi %48 : tensor<1x128xi32> to tensor<1x128xi64> + %58 = tt.broadcast %57 : (tensor<1x128xi64>) -> tensor<16x128xi64> + %59 = arith.addi %58, %39 : tensor<16x128xi64> + %60 = tt.addptr %40, %59 : tensor<16x128x!tt.ptr>, tensor<16x128xi64> + %61 = tt.load %60, %53, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32> + %62 = arith.addf %61, %54 : tensor<16x128xf32> + %63 = arith.subf %62, %41 : tensor<16x128xf32> + %64 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32>) -> tensor<16x1xf32> + %65 = tt.broadcast %64 : (tensor<16x1xf32>) -> tensor<16x128xf32> + %66 = arith.mulf %63, %65 : tensor<16x128xf32> + %67 = tt.broadcast %56 : (tensor<1x128xf32>) -> tensor<16x128xf32> + %68 = arith.mulf %66, %67 : tensor<16x128xf32> + %69 = arith.addi %50, %45 : tensor<16x128xi32> + %70 = tt.addptr %46, %69 : tensor<16x128x!tt.ptr>, tensor<16x128xi32> + %71 = arith.truncf %68 : tensor<16x128xf32> to tensor<16x128xbf16> + tt.store %70, %71, %53 {cache = 1 : i32, evict = 1 : i32} : tensor<16x128xbf16> + } + tt.return + } +} diff --git a/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ptx b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..b09a61c0b75efe10c32a5c44865dbeff5941454c --- /dev/null +++ b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ptx @@ -0,0 +1,296 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2de + +.visible .entry triton__0d1d2de( + .param .u64 triton__0d1d2de_param_0, + .param .u64 triton__0d1d2de_param_1, + .param .u32 triton__0d1d2de_param_2 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<3>; + .reg .b16 %rs<3>; + .reg .b32 %r<12>; + .reg .b64 %rd<7>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd3, [triton__0d1d2de_param_0]; + ld.param.u64 %rd4, [triton__0d1d2de_param_1]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r7, %tid.x; + shl.b32 %r8, %r7, 1; + and.b32 %r9, %r8, 510; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r10, %r1, 9; + .loc 1 21 23 + or.b32 %r11, %r10, %r9; + .loc 1 24 30 + mul.wide.s32 %rd5, %r11, 2; + add.s64 %rd1, %rd3, %rd5; + mov.pred %p1, -1; + .loc 1 24 35 + mov.u32 %r2, 0x0; + @%p1 ld.global.b32 { %r2 }, [ %rd1 + 0 ]; + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + .loc 1 24 44 + cvt.f32.bf16 %r5, %rs1; + cvt.f32.bf16 %r6, %rs2; + .loc 1 26 25 + mul.wide.s32 %rd6, %r11, 4; + add.s64 %rd2, %rd4, %rd6; + .loc 1 26 36 + @%p1 st.global.v2.b32 [ %rd2 + 0 ], { %r5, %r6 }; + .loc 1 26 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/ot/cotbhet37v6mh5samql7uxre3hprpnbhuvim3fmrjpq5fgg6lwbi.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 176 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 111 +.b8 116 +.b8 98 +.b8 104 +.b8 101 +.b8 116 +.b8 51 +.b8 55 +.b8 118 +.b8 54 +.b8 109 +.b8 104 +.b8 53 +.b8 115 +.b8 97 +.b8 109 +.b8 113 +.b8 108 +.b8 55 +.b8 117 +.b8 120 +.b8 114 +.b8 101 +.b8 51 +.b8 104 +.b8 112 +.b8 114 +.b8 112 +.b8 110 +.b8 98 +.b8 104 +.b8 117 +.b8 118 +.b8 105 +.b8 109 +.b8 51 +.b8 102 +.b8 109 +.b8 114 +.b8 106 +.b8 112 +.b8 113 +.b8 53 +.b8 102 +.b8 103 +.b8 103 +.b8 54 +.b8 108 +.b8 119 +.b8 98 +.b8 105 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 111 +.b8 116 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.ttgir b/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..8dfc219ffbb3021e19f3b35e9be96086e23c9c4b --- /dev/null +++ b/.triton/dump/dc55e2c8a829a1c52f571c7b5fc76c05/triton_.ttgir @@ -0,0 +1,24 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1> + %4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked> + %5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1> + %6 = arith.addi %4, %2 : tensor<1024xi32, #blocked> + %7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1> + %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked> + %9 = tt.addptr %8, %6 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %10 = tt.load %9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked> + %11 = triton_gpu.convert_layout %10 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1> + %12 = arith.extf %11 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1> + %13 = tt.splat %arg1 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked1> + %14 = tt.addptr %13, %7 : tensor<1024x!tt.ptr, #blocked1>, tensor<1024xi32, #blocked1> + tt.store %14, %12 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1> + tt.return + } +} diff --git a/.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.cubin b/.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..24a0f111ed2f0a5d119e3d8b4f7f49630902bff6 Binary files /dev/null and b/.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.cubin differ diff --git a/.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ptx b/.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..e4e546826b9e3bc19bbf540d4312956a55e4bcb7 --- /dev/null +++ b/.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ptx @@ -0,0 +1,1382 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6de7de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_1[8] = {60, 109, 111, 100, 117, 108, 101, 62}; +.global .align 1 .b8 assertFile_1[68] = {47, 117, 115, 114, 47, 108, 111, 99, 97, 108, 47, 108, 105, 98, 47, 112, 121, 116, 104, 111, 110, 51, 46, 49, 48, 47, 100, 105, 115, 116, 45, 112, 97, 99, 107, 97, 103, 101, 115, 47, 116, 111, 114, 99, 104, 47, 95, 105, 110, 100, 117, 99, 116, 111, 114, 47, 99, 111, 100, 101, 99, 97, 99, 104, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 54, 32, 60, 32, 53, 48, 50, 53, 55}; +.global .align 1 .b8 assertFunc_0[8] = {60, 109, 111, 100, 117, 108, 101, 62}; +.global .align 1 .b8 assertFile_0[68] = {47, 117, 115, 114, 47, 108, 111, 99, 97, 108, 47, 108, 105, 98, 47, 112, 121, 116, 104, 111, 110, 51, 46, 49, 48, 47, 100, 105, 115, 116, 45, 112, 97, 99, 107, 97, 103, 101, 115, 47, 116, 111, 114, 99, 104, 47, 95, 105, 110, 100, 117, 99, 116, 111, 114, 47, 99, 111, 100, 101, 99, 97, 99, 104, 101, 46, 112, 121}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5d6de7de( + .param .u64 triton__0d1d2d3d4d5d6de7de_param_0, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_1, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_2, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_3, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_4, + .param .u64 triton__0d1d2d3d4d5d6de7de_param_5, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_6, + .param .u32 triton__0d1d2d3d4d5d6de7de_param_7 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<94>; + .reg .b16 %rs<25>; + .reg .b32 %r<268>; + .reg .f32 %f<356>; + .reg .b64 %rd<95>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6de7de_param_5]; + ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6de7de_param_4]; + ld.param.u64 %rd10, [triton__0d1d2d3d4d5d6de7de_param_3]; + ld.param.u64 %rd9, [triton__0d1d2d3d4d5d6de7de_param_2]; + ld.param.u64 %rd31, [triton__0d1d2d3d4d5d6de7de_param_0]; +$L__tmp0: + .loc 1 22 44 + mov.u32 %r1, %tid.x; + ld.param.u64 %rd32, [triton__0d1d2d3d4d5d6de7de_param_1]; + bfe.u32 %r2, %r1, 4, 4; + and.b32 %r11, %r1, 15; + .loc 1 24 33 + shl.b32 %r3, %r11, 3; + .loc 1 21 28 + mov.u32 %r9, %ctaid.x; + .loc 1 21 33 + shl.b32 %r12, %r9, 4; + .loc 1 22 23 + or.b32 %r13, %r12, %r2; + or.b32 %r14, %r12, %r11; + .loc 1 26 30 + mul.wide.s32 %rd33, %r13, 8; + add.s64 %rd14, %rd31, %rd33; + mul.wide.s32 %rd34, %r14, 8; + add.s64 %rd30, %rd31, %rd34; + mov.pred %p3, -1; + .loc 1 26 35 + mov.u64 %rd13, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd13 }, [ %rd14 + 0 ]; + mov.u64 %rd15, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd15 }, [ %rd14 + 0 ]; + mov.u64 %rd17, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd17 }, [ %rd14 + 0 ]; + mov.u64 %rd19, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd19 }, [ %rd14 + 0 ]; + mov.u64 %rd21, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd21 }, [ %rd14 + 0 ]; + mov.u64 %rd23, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd23 }, [ %rd14 + 0 ]; + mov.u64 %rd25, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd25 }, [ %rd14 + 0 ]; + mov.u64 %rd27, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd27 }, [ %rd14 + 0 ]; + mov.u64 %rd29, 0x0; + @%p3 ld.global.L1::evict_last.b64 { %rd29 }, [ %rd30 + 0 ]; + .loc 1 27 18 + bfe.s32 %r15, %r9, 27, 1; + shr.u32 %r16, %r15, 23; + add.s32 %r17, %r13, %r16; + and.b32 %r18, %r17, 16776704; + sub.s32 %r19, %r13, %r18; + .loc 1 35 44 + shl.b32 %r4, %r19, 8; + .loc 1 36 44 + shl.b32 %r5, %r13, 8; + .loc 1 37 22 + add.s64 %rd35, %rd29, 50257; + .loc 1 38 22 + setp.lt.s64 %p13, %rd13, 0; + setp.lt.s64 %p14, %rd29, 0; + .loc 1 39 36 + selp.b64 %rd1, %rd35, %rd29, %p14; + .loc 1 41 44 + shl.b64 %rd36, %rd13, 8; + add.s64 %rd37, %rd36, 12865792; + selp.b64 %rd38, %rd37, %rd36, %p13; + shl.b64 %rd39, %rd38, 2; + add.s64 %rd2, %rd32, %rd39; + mov.b32 %r24, 0; + mov.f32 %f324, 0f00000000; + mov.f32 %f325, %f324; + mov.f32 %f326, %f324; + mov.f32 %f327, %f324; + mov.f32 %f328, %f324; + mov.f32 %f329, %f324; + mov.f32 %f330, %f324; + mov.f32 %f331, %f324; + mov.f32 %f332, %f324; + mov.f32 %f333, %f324; + mov.f32 %f334, %f324; + mov.f32 %f335, %f324; + mov.f32 %f336, %f324; + mov.f32 %f337, %f324; + mov.f32 %f338, %f324; + mov.f32 %f339, %f324; + mov.f32 %f340, %f324; + mov.f32 %f341, %f324; + mov.f32 %f342, %f324; + mov.f32 %f343, %f324; + mov.f32 %f344, %f324; + mov.f32 %f345, %f324; + mov.f32 %f346, %f324; + mov.f32 %f347, %f324; + mov.f32 %f348, %f324; + mov.f32 %f349, %f324; + mov.f32 %f350, %f324; + mov.f32 %f351, %f324; + mov.f32 %f352, %f324; + mov.f32 %f353, %f324; + mov.f32 %f354, %f324; + mov.f32 %f355, %f324; + mov.pred %p92, %p3; + mov.u32 %r266, %r24; + bra.uni $L__BB0_1; +$L__BB0_3: + .loc 1 0 0 + mov.b32 %f33, %r20; + mov.b32 %f34, %r21; + mov.b32 %f35, %r22; + mov.b32 %f36, %r23; + mov.b32 %f37, %r28; + mov.b32 %f38, %r29; + mov.b32 %f39, %r30; + mov.b32 %f40, %r31; + cvt.u16.u32 %rs1, %r36; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r36; } + cvt.u16.u32 %rs3, %r37; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r37; } + cvt.u16.u32 %rs5, %r38; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r38; } + cvt.u16.u32 %rs7, %r39; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r39; } + cvt.f32.bf16 %r44, %rs1; + mov.b32 %f41, %r44; + cvt.f32.bf16 %r45, %rs2; + mov.b32 %f42, %r45; + cvt.f32.bf16 %r46, %rs3; + mov.b32 %f43, %r46; + cvt.f32.bf16 %r47, %rs4; + mov.b32 %f44, %r47; + cvt.f32.bf16 %r48, %rs5; + mov.b32 %f45, %r48; + cvt.f32.bf16 %r49, %rs6; + mov.b32 %f46, %r49; + cvt.f32.bf16 %r50, %rs7; + mov.b32 %f47, %r50; + cvt.f32.bf16 %r51, %rs8; + mov.b32 %f48, %r51; + .loc 1 41 34 + mul.wide.u32 %rd59, %r7, 4; + add.s64 %rd57, %rd2, %rd59; + .loc 1 41 40 + cvt.u64.u32 %rd60, %r266; + add.s64 %rd61, %rd60, %rd4; + .loc 1 41 34 + shl.b64 %rd62, %rd61, 2; + add.s64 %rd63, %rd2, %rd62; + add.s64 %rd58, %rd63, 16; + mov.b32 %r184, 0; + mov.pred %p54, -1; + .loc 1 41 52 + mov.u32 %r55, 0x0; + mov.u32 %r56, 0x0; + mov.u32 %r57, 0x0; + mov.u32 %r58, 0x0; + @%p54 ld.global.L1::evict_last.v4.b32 { %r55, %r56, %r57, %r58 }, [ %rd57 + 0 ]; + @!%p54 mov.u32 %r55, %r184; + @!%p54 mov.u32 %r56, %r184; + @!%p54 mov.u32 %r57, %r184; + @!%p54 mov.u32 %r58, %r184; + mov.b32 %f108, %r55; + mov.b32 %f109, %r56; + mov.b32 %f110, %r57; + mov.b32 %f111, %r58; + mov.u32 %r63, 0x0; + mov.u32 %r64, 0x0; + mov.u32 %r65, 0x0; + mov.u32 %r66, 0x0; + @%p54 ld.global.L1::evict_last.v4.b32 { %r63, %r64, %r65, %r66 }, [ %rd58 + 0 ]; + @!%p54 mov.u32 %r63, %r184; + @!%p54 mov.u32 %r64, %r184; + @!%p54 mov.u32 %r65, %r184; + @!%p54 mov.u32 %r66, %r184; + mov.b32 %f112, %r63; + mov.b32 %f113, %r64; + mov.b32 %f114, %r65; + mov.b32 %f115, %r66; + .loc 1 42 22 + add.f32 %f116, %f33, %f108; + add.f32 %f117, %f34, %f109; + add.f32 %f118, %f35, %f110; + add.f32 %f119, %f36, %f111; + add.f32 %f120, %f37, %f112; + add.f32 %f121, %f38, %f113; + add.f32 %f122, %f39, %f114; + add.f32 %f123, %f40, %f115; + .loc 1 44 22 + add.f32 %f124, %f41, %f116; + add.f32 %f125, %f42, %f117; + add.f32 %f126, %f43, %f118; + add.f32 %f127, %f44, %f119; + add.f32 %f128, %f45, %f120; + add.f32 %f129, %f46, %f121; + add.f32 %f130, %f47, %f122; + add.f32 %f131, %f48, %f123; +$L__tmp1: + .loc 2 96 20 + sub.f32 %f132, %f124, %f348; + sub.f32 %f133, %f125, %f349; + sub.f32 %f134, %f126, %f350; + sub.f32 %f135, %f127, %f351; + sub.f32 %f136, %f128, %f352; + sub.f32 %f137, %f129, %f353; + sub.f32 %f138, %f130, %f354; + sub.f32 %f139, %f131, %f355; + .loc 2 97 26 + add.f32 %f324, %f324, 0f3F800000; + add.f32 %f325, %f325, 0f3F800000; + add.f32 %f326, %f326, 0f3F800000; + add.f32 %f327, %f327, 0f3F800000; + add.f32 %f328, %f328, 0f3F800000; + add.f32 %f329, %f329, 0f3F800000; + add.f32 %f330, %f330, 0f3F800000; + add.f32 %f331, %f331, 0f3F800000; + add.f32 %f332, %f332, 0f3F800000; + add.f32 %f333, %f333, 0f3F800000; + add.f32 %f334, %f334, 0f3F800000; + add.f32 %f335, %f335, 0f3F800000; + add.f32 %f336, %f336, 0f3F800000; + add.f32 %f337, %f337, 0f3F800000; + add.f32 %f338, %f338, 0f3F800000; + add.f32 %f339, %f339, 0f3F800000; + .loc 2 98 30 + mov.b32 %r72, %f132; + mov.b32 %r73, %f324; + div.full.f32 %r71, %r72, %r73; + mov.b32 %f140, %r71; + mov.b32 %r75, %f133; + mov.b32 %r76, %f325; + div.full.f32 %r74, %r75, %r76; + mov.b32 %f141, %r74; + mov.b32 %r78, %f134; + mov.b32 %r79, %f326; + div.full.f32 %r77, %r78, %r79; + mov.b32 %f142, %r77; + mov.b32 %r81, %f135; + mov.b32 %r82, %f327; + div.full.f32 %r80, %r81, %r82; + mov.b32 %f143, %r80; + mov.b32 %r84, %f136; + mov.b32 %r85, %f328; + div.full.f32 %r83, %r84, %r85; + mov.b32 %f144, %r83; + mov.b32 %r87, %f137; + mov.b32 %r88, %f329; + div.full.f32 %r86, %r87, %r88; + mov.b32 %f145, %r86; + mov.b32 %r90, %f138; + mov.b32 %r91, %f330; + div.full.f32 %r89, %r90, %r91; + mov.b32 %f146, %r89; + mov.b32 %r93, %f139; + mov.b32 %r94, %f331; + div.full.f32 %r92, %r93, %r94; + mov.b32 %f147, %r92; + .loc 2 98 22 + add.f32 %f348, %f348, %f140; + add.f32 %f349, %f349, %f141; + add.f32 %f350, %f350, %f142; + add.f32 %f351, %f351, %f143; + add.f32 %f352, %f352, %f144; + add.f32 %f353, %f353, %f145; + add.f32 %f354, %f354, %f146; + add.f32 %f355, %f355, %f147; + .loc 2 101 30 + sub.f32 %f148, %f124, %f348; + sub.f32 %f149, %f125, %f349; + sub.f32 %f150, %f126, %f350; + sub.f32 %f151, %f127, %f351; + sub.f32 %f152, %f128, %f352; + sub.f32 %f153, %f129, %f353; + sub.f32 %f154, %f130, %f354; + sub.f32 %f155, %f131, %f355; +$L__tmp2: + .loc 1 50 50 + fma.rn.f32 %f340, %f132, %f148, %f340; + fma.rn.f32 %f341, %f133, %f149, %f341; + fma.rn.f32 %f342, %f134, %f150, %f342; + fma.rn.f32 %f343, %f135, %f151, %f343; + fma.rn.f32 %f344, %f136, %f152, %f344; + fma.rn.f32 %f345, %f137, %f153, %f345; + fma.rn.f32 %f346, %f138, %f154, %f346; + fma.rn.f32 %f347, %f139, %f155, %f347; + mov.b32 %r266, 128; + mov.pred %p92, 0; + .loc 1 31 36 + @%p1 bra $L__BB0_1; + bra.uni $L__BB0_4; +$L__BB0_1: + .loc 1 0 36 + mov.pred %p1, %p92; + .loc 1 40 40 + setp.lt.u64 %p30, %rd1, 50257; + .loc 1 32 27 + or.b32 %r7, %r266, %r3; + .loc 1 35 40 + or.b32 %r52, %r7, %r4; + .loc 1 35 34 + mul.wide.s32 %rd43, %r52, 4; + add.s64 %rd40, %rd9, %rd43; + cvt.s64.s32 %rd3, %r4; + cvt.s64.s32 %rd44, %r266; + cvt.u64.u32 %rd4, %r3; + add.s64 %rd45, %rd44, %rd4; + add.s64 %rd46, %rd45, %rd3; + shl.b64 %rd47, %rd46, 2; + add.s64 %rd48, %rd9, %rd47; + add.s64 %rd41, %rd48, 16; + .loc 1 35 50 + mov.u32 %r20, 0x0; + mov.u32 %r21, 0x0; + mov.u32 %r22, 0x0; + mov.u32 %r23, 0x0; + @%p3 ld.global.L1::evict_last.v4.b32 { %r20, %r21, %r22, %r23 }, [ %rd40 + 0 ]; + @!%p3 mov.u32 %r20, %r24; + @!%p3 mov.u32 %r21, %r24; + @!%p3 mov.u32 %r22, %r24; + @!%p3 mov.u32 %r23, %r24; + mov.u32 %r28, 0x0; + mov.u32 %r29, 0x0; + mov.u32 %r30, 0x0; + mov.u32 %r31, 0x0; + @%p3 ld.global.L1::evict_last.v4.b32 { %r28, %r29, %r30, %r31 }, [ %rd41 + 0 ]; + @!%p3 mov.u32 %r28, %r24; + @!%p3 mov.u32 %r29, %r24; + @!%p3 mov.u32 %r30, %r24; + @!%p3 mov.u32 %r31, %r24; + .loc 1 36 40 + or.b32 %r53, %r7, %r5; + .loc 1 36 34 + mul.wide.s32 %rd49, %r53, 2; + add.s64 %rd42, %rd10, %rd49; + .loc 1 36 50 + mov.u32 %r36, 0x0; + mov.u32 %r37, 0x0; + mov.u32 %r38, 0x0; + mov.u32 %r39, 0x0; + @%p3 ld.global.L1::evict_last.v4.b32 { %r36, %r37, %r38, %r39 }, [ %rd42 + 0 ]; + @!%p3 mov.u32 %r36, %r24; + @!%p3 mov.u32 %r37, %r24; + @!%p3 mov.u32 %r38, %r24; + @!%p3 mov.u32 %r39, %r24; + mov.b32 %r265, 1892; + mov.u64 %rd94, 1; + .loc 1 40 55 + @%p30 bra $L__BB0_3; + mov.u64 %rd50, assertMessage_0; + cvta.global.u64 %rd51, %rd50; + mov.u64 %rd52, assertFile_0; + cvta.global.u64 %rd53, %rd52; + mov.u64 %rd54, assertFunc_0; + cvta.global.u64 %rd55, %rd54; + { // callseq 6, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd51; + .param .b64 param1; + st.param.b64 [param1+0], %rd53; + .param .b32 param2; + st.param.b32 [param2+0], %r265; + .param .b64 param3; + st.param.b64 [param3+0], %rd55; + .param .b64 param4; + st.param.b64 [param4+0], %rd94; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 6 + bra.uni $L__BB0_3; +$L__BB0_4: + .loc 1 24 33 + and.b32 %r154, %r1, 127; + .loc 1 31 36 + bfe.s32 %r155, %r1, 7, 1; + and.b32 %r156, %r155, 136; + add.s32 %r157, %r156, %r154; + shl.b32 %r158, %r157, 2; + mov.u32 %r159, global_smem; + add.s32 %r160, %r159, %r158; + st.shared.f32 [%r160], %f332; + st.shared.f32 [%r160+1088], %f333; + st.shared.f32 [%r160+2176], %f334; + st.shared.f32 [%r160+3264], %f335; + st.shared.f32 [%r160+4352], %f336; + st.shared.f32 [%r160+5440], %f337; + st.shared.f32 [%r160+6528], %f338; + st.shared.f32 [%r160+7616], %f339; + bar.sync 0; + mad.lo.s32 %r161, %r2, 136, %r3; + shl.b32 %r162, %r161, 2; + add.s32 %r163, %r159, %r162; + ld.shared.v4.f32 {%f156, %f157, %f158, %f159}, [%r163]; + ld.shared.v4.f32 {%f160, %f161, %f162, %f163}, [%r163+16]; +$L__tmp3: + .loc 2 108 21 + sub.f32 %f164, %f349, %f348; + .loc 2 109 28 + add.f32 %f165, %f156, %f157; + .loc 2 110 39 + setp.eq.f32 %p43, %f165, 0f00000000; + .loc 2 110 60 + mov.b32 %r97, %f157; + mov.b32 %r98, %f165; + div.full.f32 %r96, %r97, %r98; + mov.b32 %f166, %r96; + .loc 2 110 49 + selp.f32 %f167, 0f00000000, %f166, %p43; + .loc 2 112 17 + fma.rn.f32 %f168, %f164, %f167, %f348; + .loc 2 113 15 + add.f32 %f169, %f340, %f341; + .loc 2 113 30 + mul.f32 %f170, %f164, %f164; + .loc 2 113 38 + mul.f32 %f171, %f170, %f156; + .loc 2 113 22 + fma.rn.f32 %f172, %f171, %f167, %f169; + .loc 2 108 21 + sub.f32 %f173, %f350, %f168; + .loc 2 109 28 + add.f32 %f174, %f158, %f165; + .loc 2 110 39 + setp.eq.f32 %p44, %f174, 0f00000000; + .loc 2 110 60 + mov.b32 %r101, %f174; + mov.b32 %r100, %f158; + div.full.f32 %r99, %r100, %r101; + mov.b32 %f175, %r99; + .loc 2 110 49 + selp.f32 %f176, 0f00000000, %f175, %p44; + .loc 2 112 17 + fma.rn.f32 %f177, %f176, %f173, %f168; + .loc 2 113 15 + add.f32 %f178, %f342, %f172; + .loc 2 113 30 + mul.f32 %f179, %f173, %f173; + .loc 2 113 38 + mul.f32 %f180, %f165, %f179; + .loc 2 113 22 + fma.rn.f32 %f181, %f176, %f180, %f178; + .loc 2 108 21 + sub.f32 %f182, %f351, %f177; + .loc 2 109 28 + add.f32 %f183, %f159, %f174; + .loc 2 110 39 + setp.eq.f32 %p45, %f183, 0f00000000; + .loc 2 110 60 + mov.b32 %r104, %f183; + mov.b32 %r103, %f159; + div.full.f32 %r102, %r103, %r104; + mov.b32 %f184, %r102; + .loc 2 110 49 + selp.f32 %f185, 0f00000000, %f184, %p45; + .loc 2 112 17 + fma.rn.f32 %f186, %f185, %f182, %f177; + .loc 2 113 15 + add.f32 %f187, %f343, %f181; + .loc 2 113 30 + mul.f32 %f188, %f182, %f182; + .loc 2 113 38 + mul.f32 %f189, %f174, %f188; + .loc 2 113 22 + fma.rn.f32 %f190, %f185, %f189, %f187; + .loc 2 108 21 + sub.f32 %f191, %f352, %f186; + .loc 2 109 28 + add.f32 %f192, %f160, %f183; + .loc 2 110 39 + setp.eq.f32 %p46, %f192, 0f00000000; + .loc 2 110 60 + mov.b32 %r107, %f192; + mov.b32 %r106, %f160; + div.full.f32 %r105, %r106, %r107; + mov.b32 %f193, %r105; + .loc 2 110 49 + selp.f32 %f194, 0f00000000, %f193, %p46; + .loc 2 112 17 + fma.rn.f32 %f195, %f194, %f191, %f186; + .loc 2 113 15 + add.f32 %f196, %f344, %f190; + .loc 2 113 30 + mul.f32 %f197, %f191, %f191; + .loc 2 113 38 + mul.f32 %f198, %f183, %f197; + .loc 2 113 22 + fma.rn.f32 %f199, %f194, %f198, %f196; + .loc 2 108 21 + sub.f32 %f200, %f353, %f195; + .loc 2 109 28 + add.f32 %f201, %f161, %f192; + .loc 2 110 39 + setp.eq.f32 %p47, %f201, 0f00000000; + .loc 2 110 60 + mov.b32 %r110, %f201; + mov.b32 %r109, %f161; + div.full.f32 %r108, %r109, %r110; + mov.b32 %f202, %r108; + .loc 2 110 49 + selp.f32 %f203, 0f00000000, %f202, %p47; + .loc 2 112 17 + fma.rn.f32 %f204, %f203, %f200, %f195; + .loc 2 113 15 + add.f32 %f205, %f345, %f199; + .loc 2 113 30 + mul.f32 %f206, %f200, %f200; + .loc 2 113 38 + mul.f32 %f207, %f192, %f206; + .loc 2 113 22 + fma.rn.f32 %f208, %f203, %f207, %f205; + .loc 2 108 21 + sub.f32 %f209, %f354, %f204; + .loc 2 109 28 + add.f32 %f210, %f162, %f201; + .loc 2 110 39 + setp.eq.f32 %p48, %f210, 0f00000000; + .loc 2 110 60 + mov.b32 %r113, %f210; + mov.b32 %r112, %f162; + div.full.f32 %r111, %r112, %r113; + mov.b32 %f211, %r111; + .loc 2 110 49 + selp.f32 %f212, 0f00000000, %f211, %p48; + .loc 2 112 17 + fma.rn.f32 %f213, %f212, %f209, %f204; + .loc 2 113 15 + add.f32 %f214, %f346, %f208; + .loc 2 113 30 + mul.f32 %f215, %f209, %f209; + .loc 2 113 38 + mul.f32 %f216, %f201, %f215; + .loc 2 113 22 + fma.rn.f32 %f217, %f212, %f216, %f214; + .loc 2 108 21 + sub.f32 %f218, %f355, %f213; + .loc 2 109 28 + add.f32 %f219, %f163, %f210; + .loc 2 110 39 + setp.eq.f32 %p49, %f219, 0f00000000; + .loc 2 110 60 + mov.b32 %r116, %f219; + mov.b32 %r115, %f163; + div.full.f32 %r114, %r115, %r116; + mov.b32 %f220, %r114; + .loc 2 110 49 + selp.f32 %f221, 0f00000000, %f220, %p49; + .loc 2 112 17 + fma.rn.f32 %f222, %f221, %f218, %f213; + .loc 2 113 15 + add.f32 %f223, %f347, %f217; + .loc 2 113 30 + mul.f32 %f224, %f218, %f218; + .loc 2 113 38 + mul.f32 %f225, %f210, %f224; + .loc 2 113 22 + fma.rn.f32 %f226, %f221, %f225, %f223; +$L__tmp4: + .loc 2 120 46 + mov.b32 %r164, %f222; + shfl.sync.bfly.b32 %r165, %r164, 8, 31, -1; + mov.b32 %f227, %r165; + mov.b32 %r166, %f226; + shfl.sync.bfly.b32 %r167, %r166, 8, 31, -1; + mov.b32 %f228, %r167; + shfl.sync.bfly.b32 %r118, %r116, 8, 31, -1; + mov.b32 %f229, %r118; +$L__tmp5: + .loc 2 108 21 + sub.f32 %f230, %f227, %f222; + .loc 2 109 28 + add.f32 %f231, %f219, %f229; + .loc 2 110 39 + setp.eq.f32 %p50, %f231, 0f00000000; + .loc 2 110 60 + mov.b32 %r119, %f231; + div.full.f32 %r117, %r118, %r119; + mov.b32 %f232, %r117; + .loc 2 110 49 + selp.f32 %f233, 0f00000000, %f232, %p50; + .loc 2 112 17 + fma.rn.f32 %f234, %f233, %f230, %f222; + .loc 2 113 15 + add.f32 %f235, %f226, %f228; + .loc 2 113 30 + mul.f32 %f236, %f230, %f230; + .loc 2 113 38 + mul.f32 %f237, %f219, %f236; + .loc 2 113 22 + fma.rn.f32 %f238, %f233, %f237, %f235; +$L__tmp6: + .loc 2 120 46 + mov.b32 %r168, %f234; + shfl.sync.bfly.b32 %r169, %r168, 4, 31, -1; + mov.b32 %f239, %r169; + mov.b32 %r170, %f238; + shfl.sync.bfly.b32 %r171, %r170, 4, 31, -1; + mov.b32 %f240, %r171; + shfl.sync.bfly.b32 %r121, %r119, 4, 31, -1; + mov.b32 %f241, %r121; +$L__tmp7: + .loc 2 108 21 + sub.f32 %f242, %f239, %f234; + .loc 2 109 28 + add.f32 %f243, %f231, %f241; + .loc 2 110 39 + setp.eq.f32 %p51, %f243, 0f00000000; + .loc 2 110 60 + mov.b32 %r122, %f243; + div.full.f32 %r120, %r121, %r122; + mov.b32 %f244, %r120; + .loc 2 110 49 + selp.f32 %f245, 0f00000000, %f244, %p51; + .loc 2 112 17 + fma.rn.f32 %f246, %f245, %f242, %f234; + .loc 2 113 15 + add.f32 %f247, %f238, %f240; + .loc 2 113 30 + mul.f32 %f248, %f242, %f242; + .loc 2 113 38 + mul.f32 %f249, %f231, %f248; + .loc 2 113 22 + fma.rn.f32 %f250, %f245, %f249, %f247; +$L__tmp8: + .loc 2 120 46 + mov.b32 %r172, %f246; + shfl.sync.bfly.b32 %r173, %r172, 2, 31, -1; + mov.b32 %f251, %r173; + mov.b32 %r174, %f250; + shfl.sync.bfly.b32 %r175, %r174, 2, 31, -1; + mov.b32 %f252, %r175; + shfl.sync.bfly.b32 %r124, %r122, 2, 31, -1; + mov.b32 %f253, %r124; +$L__tmp9: + .loc 2 108 21 + sub.f32 %f254, %f251, %f246; + .loc 2 109 28 + add.f32 %f255, %f243, %f253; + .loc 2 110 39 + setp.eq.f32 %p52, %f255, 0f00000000; + .loc 2 110 60 + mov.b32 %r125, %f255; + div.full.f32 %r123, %r124, %r125; + mov.b32 %f256, %r123; + .loc 2 110 49 + selp.f32 %f257, 0f00000000, %f256, %p52; + .loc 2 112 17 + fma.rn.f32 %f258, %f257, %f254, %f246; + .loc 2 113 15 + add.f32 %f259, %f250, %f252; + .loc 2 113 30 + mul.f32 %f260, %f254, %f254; + .loc 2 113 38 + mul.f32 %f261, %f243, %f260; + .loc 2 113 22 + fma.rn.f32 %f262, %f257, %f261, %f259; +$L__tmp10: + .loc 2 120 46 + mov.b32 %r176, %f258; + shfl.sync.bfly.b32 %r177, %r176, 1, 31, -1; + mov.b32 %f263, %r177; + mov.b32 %r178, %f262; + shfl.sync.bfly.b32 %r179, %r178, 1, 31, -1; + mov.b32 %f264, %r179; + shfl.sync.bfly.b32 %r127, %r125, 1, 31, -1; + mov.b32 %f265, %r127; +$L__tmp11: + .loc 2 108 21 + sub.f32 %f266, %f263, %f258; + .loc 2 109 28 + add.f32 %f267, %f255, %f265; + .loc 2 110 39 + setp.eq.f32 %p53, %f267, 0f00000000; + .loc 2 110 60 + mov.b32 %r128, %f267; + div.full.f32 %r126, %r127, %r128; + mov.b32 %f268, %r126; + .loc 2 110 49 + selp.f32 %f269, 0f00000000, %f268, %p53; + .loc 2 112 17 + fma.rn.f32 %f81, %f269, %f266, %f258; + .loc 2 113 15 + add.f32 %f270, %f262, %f264; + .loc 2 113 30 + mul.f32 %f271, %f266, %f266; + .loc 2 113 38 + mul.f32 %f272, %f255, %f271; + .loc 2 113 22 + fma.rn.f32 %f273, %f269, %f272, %f270; +$L__tmp12: + .loc 1 75 24 + mov.b32 %r130, %f273; + mov.b32 %r131, 1132462080; + div.full.f32 %r129, %r130, %r131; + mov.b32 %f274, %r129; + .loc 1 77 24 + add.f32 %f82, %f274, 0f3727C5AC; + rsqrt.approx.ftz.f32 %f307, %f82; + mov.pred %p93, %p54; + mov.u32 %r267, %r184; + bra.uni $L__BB0_5; +$L__BB0_7: + .loc 1 0 0 + mov.b32 %f83, %r180; + mov.b32 %f84, %r181; + mov.b32 %f85, %r182; + mov.b32 %f86, %r183; + mov.b32 %f87, %r188; + mov.b32 %f88, %r189; + mov.b32 %f89, %r190; + mov.b32 %f90, %r191; + cvt.s64.s32 %rd6, %r230; + mov.b32 %f91, %r204; + mov.b32 %f92, %r205; + mov.b32 %f93, %r206; + mov.b32 %f94, %r207; + mov.b32 %f95, %r208; + mov.b32 %f96, %r209; + mov.b32 %f97, %r210; + mov.b32 %f98, %r211; + cvt.u64.u32 %rd7, %r228; + mov.b32 %f99, %r212; + mov.b32 %f100, %r213; + mov.b32 %f101, %r214; + mov.b32 %f102, %r215; + mov.b32 %f103, %r220; + mov.b32 %f104, %r221; + mov.b32 %f105, %r222; + mov.b32 %f106, %r223; + .loc 1 69 35 + shl.b64 %rd90, %rd7, 2; + add.s64 %rd87, %rd2, %rd90; + add.s64 %rd92, %rd2, %rd78; + add.s64 %rd88, %rd92, 16; + mov.b32 %r236, 0; + mov.pred %p80, -1; + .loc 1 69 54 + mov.u32 %r232, 0x0; + mov.u32 %r233, 0x0; + mov.u32 %r234, 0x0; + mov.u32 %r235, 0x0; + @%p80 ld.global.L1::evict_first.v4.b32 { %r232, %r233, %r234, %r235 }, [ %rd87 + 0 ]; + @!%p80 mov.u32 %r232, %r236; + @!%p80 mov.u32 %r233, %r236; + @!%p80 mov.u32 %r234, %r236; + @!%p80 mov.u32 %r235, %r236; + mov.b32 %f275, %r232; + mov.b32 %f276, %r233; + mov.b32 %f277, %r234; + mov.b32 %f278, %r235; + mov.u32 %r240, 0x0; + mov.u32 %r241, 0x0; + mov.u32 %r242, 0x0; + mov.u32 %r243, 0x0; + @%p80 ld.global.L1::evict_first.v4.b32 { %r240, %r241, %r242, %r243 }, [ %rd88 + 0 ]; + @!%p80 mov.u32 %r240, %r236; + @!%p80 mov.u32 %r241, %r236; + @!%p80 mov.u32 %r242, %r236; + @!%p80 mov.u32 %r243, %r236; + mov.b32 %f279, %r240; + mov.b32 %f280, %r241; + mov.b32 %f281, %r242; + mov.b32 %f282, %r243; + .loc 1 70 24 + add.f32 %f283, %f83, %f275; + add.f32 %f284, %f84, %f276; + add.f32 %f285, %f85, %f277; + add.f32 %f286, %f86, %f278; + add.f32 %f287, %f87, %f279; + add.f32 %f288, %f88, %f280; + add.f32 %f289, %f89, %f281; + add.f32 %f290, %f90, %f282; + .loc 1 72 24 + add.f32 %f291, %f91, %f283; + add.f32 %f292, %f92, %f284; + add.f32 %f293, %f93, %f285; + add.f32 %f294, %f94, %f286; + add.f32 %f295, %f95, %f287; + add.f32 %f296, %f96, %f288; + add.f32 %f297, %f97, %f289; + add.f32 %f298, %f98, %f290; + .loc 1 73 24 + sub.f32 %f299, %f291, %f81; + sub.f32 %f300, %f292, %f81; + sub.f32 %f301, %f293, %f81; + sub.f32 %f302, %f294, %f81; + sub.f32 %f303, %f295, %f81; + sub.f32 %f304, %f296, %f81; + sub.f32 %f305, %f297, %f81; + sub.f32 %f306, %f298, %f81; + .loc 1 79 24 + mul.f32 %f308, %f299, %f307; + mul.f32 %f309, %f300, %f307; + mul.f32 %f310, %f301, %f307; + mul.f32 %f311, %f302, %f307; + mul.f32 %f312, %f303, %f307; + mul.f32 %f313, %f304, %f307; + mul.f32 %f314, %f305, %f307; + mul.f32 %f315, %f306, %f307; + .loc 1 80 24 + mul.f32 %f316, %f308, %f99; + mul.f32 %f317, %f309, %f100; + mul.f32 %f318, %f310, %f101; + mul.f32 %f319, %f311, %f102; + mul.f32 %f320, %f312, %f103; + mul.f32 %f321, %f313, %f104; + mul.f32 %f322, %f314, %f105; + mul.f32 %f323, %f315, %f106; + .loc 1 82 29 + shl.b64 %rd93, %rd6, 1; + add.s64 %rd89, %rd12, %rd93; + .loc 1 82 52 + mov.b32 %r248, %f316; + cvt.rn.bf16.f32 %rs17, %r248; + mov.b32 %r249, %f317; + cvt.rn.bf16.f32 %rs18, %r249; + mov.b32 %r250, %f318; + cvt.rn.bf16.f32 %rs19, %r250; + mov.b32 %r251, %f319; + cvt.rn.bf16.f32 %rs20, %r251; + mov.b32 %r252, %f320; + cvt.rn.bf16.f32 %rs21, %r252; + mov.b32 %r253, %f321; + cvt.rn.bf16.f32 %rs22, %r253; + mov.b32 %r254, %f322; + cvt.rn.bf16.f32 %rs23, %r254; + mov.b32 %r255, %f323; + cvt.rn.bf16.f32 %rs24, %r255; + mov.b32 %r261, {%rs17, %rs18}; + mov.b32 %r262, {%rs19, %rs20}; + mov.b32 %r263, {%rs21, %rs22}; + mov.b32 %r264, {%rs23, %rs24}; + @%p80 st.global.v4.b32 [ %rd89 + 0 ], { %r261, %r262, %r263, %r264 }; + mov.b32 %r267, 128; + mov.pred %p93, 0; + .loc 1 58 36 + @%p2 bra $L__BB0_5; + bra.uni $L__BB0_8; +$L__BB0_5: + .loc 1 0 36 + mov.pred %p2, %p93; + .loc 1 59 27 + or.b32 %r228, %r267, %r3; + .loc 1 62 41 + or.b32 %r229, %r228, %r4; + .loc 1 62 35 + mul.wide.s32 %rd69, %r229, 4; + add.s64 %rd64, %rd9, %rd69; + cvt.s64.s32 %rd70, %r267; + add.s64 %rd71, %rd70, %rd4; + add.s64 %rd72, %rd71, %rd3; + shl.b64 %rd73, %rd72, 2; + add.s64 %rd74, %rd9, %rd73; + add.s64 %rd65, %rd74, 16; + .loc 1 62 51 + mov.u32 %r180, 0x0; + mov.u32 %r181, 0x0; + mov.u32 %r182, 0x0; + mov.u32 %r183, 0x0; + @%p54 ld.global.L1::evict_last.v4.b32 { %r180, %r181, %r182, %r183 }, [ %rd64 + 0 ]; + @!%p54 mov.u32 %r180, %r184; + @!%p54 mov.u32 %r181, %r184; + @!%p54 mov.u32 %r182, %r184; + @!%p54 mov.u32 %r183, %r184; + mov.u32 %r188, 0x0; + mov.u32 %r189, 0x0; + mov.u32 %r190, 0x0; + mov.u32 %r191, 0x0; + @%p54 ld.global.L1::evict_last.v4.b32 { %r188, %r189, %r190, %r191 }, [ %rd65 + 0 ]; + @!%p54 mov.u32 %r188, %r184; + @!%p54 mov.u32 %r189, %r184; + @!%p54 mov.u32 %r190, %r184; + @!%p54 mov.u32 %r191, %r184; + .loc 1 63 41 + or.b32 %r230, %r228, %r5; + .loc 1 63 35 + mul.wide.s32 %rd75, %r230, 2; + add.s64 %rd66, %rd10, %rd75; + .loc 1 63 51 + mov.u32 %r196, 0x0; + mov.u32 %r197, 0x0; + mov.u32 %r198, 0x0; + mov.u32 %r199, 0x0; + @%p54 ld.global.L1::evict_first.v4.b32 { %r196, %r197, %r198, %r199 }, [ %rd66 + 0 ]; + @!%p54 mov.u32 %r196, %r184; + @!%p54 mov.u32 %r197, %r184; + @!%p54 mov.u32 %r198, %r184; + @!%p54 mov.u32 %r199, %r184; + cvt.u16.u32 %rs9, %r196; + { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r196; } + cvt.u16.u32 %rs11, %r197; + { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r197; } + cvt.u16.u32 %rs13, %r198; + { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r198; } + cvt.u16.u32 %rs15, %r199; + { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r199; } + .loc 1 63 103 + cvt.f32.bf16 %r204, %rs9; + cvt.f32.bf16 %r205, %rs10; + cvt.f32.bf16 %r206, %rs11; + cvt.f32.bf16 %r207, %rs12; + cvt.f32.bf16 %r208, %rs13; + cvt.f32.bf16 %r209, %rs14; + cvt.f32.bf16 %r210, %rs15; + cvt.f32.bf16 %r211, %rs16; + .loc 1 64 35 + mul.wide.u32 %rd76, %r228, 4; + add.s64 %rd67, %rd11, %rd76; + cvt.u64.u32 %rd77, %r267; + add.s64 %rd8, %rd77, %rd4; + shl.b64 %rd78, %rd8, 2; + add.s64 %rd79, %rd11, %rd78; + add.s64 %rd68, %rd79, 16; + .loc 1 64 40 + mov.u32 %r212, 0x0; + mov.u32 %r213, 0x0; + mov.u32 %r214, 0x0; + mov.u32 %r215, 0x0; + @%p54 ld.global.L1::evict_last.v4.b32 { %r212, %r213, %r214, %r215 }, [ %rd67 + 0 ]; + @!%p54 mov.u32 %r212, %r184; + @!%p54 mov.u32 %r213, %r184; + @!%p54 mov.u32 %r214, %r184; + @!%p54 mov.u32 %r215, %r184; + mov.u32 %r220, 0x0; + mov.u32 %r221, 0x0; + mov.u32 %r222, 0x0; + mov.u32 %r223, 0x0; + @%p54 ld.global.L1::evict_last.v4.b32 { %r220, %r221, %r222, %r223 }, [ %rd68 + 0 ]; + @!%p54 mov.u32 %r220, %r184; + @!%p54 mov.u32 %r221, %r184; + @!%p54 mov.u32 %r222, %r184; + @!%p54 mov.u32 %r223, %r184; + .loc 1 68 57 + @%p30 bra $L__BB0_7; + mov.u64 %rd80, assertMessage_1; + cvta.global.u64 %rd81, %rd80; + mov.u64 %rd82, assertFile_1; + cvta.global.u64 %rd83, %rd82; + mov.u64 %rd84, assertFunc_1; + cvta.global.u64 %rd85, %rd84; + { // callseq 7, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd81; + .param .b64 param1; + st.param.b64 [param1+0], %rd83; + .param .b32 param2; + st.param.b32 [param2+0], %r265; + .param .b64 param3; + st.param.b64 [param3+0], %rd85; + .param .b64 param4; + st.param.b64 [param4+0], %rd94; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 7 + bra.uni $L__BB0_7; +$L__BB0_8: + .loc 1 58 4 + ret; +$L__tmp13: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/pn/cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 302 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 112 +.b8 110 +.b8 51 +.b8 108 +.b8 97 +.b8 119 +.b8 103 +.b8 54 +.b8 53 +.b8 108 +.b8 112 +.b8 105 +.b8 54 +.b8 51 +.b8 103 +.b8 118 +.b8 54 +.b8 99 +.b8 54 +.b8 112 +.b8 110 +.b8 52 +.b8 111 +.b8 105 +.b8 107 +.b8 104 +.b8 103 +.b8 54 +.b8 113 +.b8 118 +.b8 97 +.b8 50 +.b8 104 +.b8 50 +.b8 113 +.b8 106 +.b8 100 +.b8 112 +.b8 120 +.b8 101 +.b8 54 +.b8 113 +.b8 106 +.b8 52 +.b8 108 +.b8 118 +.b8 116 +.b8 116 +.b8 119 +.b8 101 +.b8 122 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 112 +.b8 110 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp2 +.b8 2 +.b8 47 +.b8 41 +.b8 5 +.b32 125 +.b64 $L__tmp3 +.b64 $L__tmp12 +.b8 2 +.b8 53 +.b8 44 +.b8 4 +.b32 125 +.b64 $L__tmp3 +.b64 $L__tmp12 +.b8 2 +.b8 120 +.b8 46 +.b8 0 +.b8 4 +.b32 125 +.b64 $L__tmp4 +.b64 $L__tmp11 +.b8 2 +.b8 53 +.b8 44 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 101 +.b8 55 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 306 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ttgir b/.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..9f1cbda056c2c304428aa1a5f21bdeb5856f9f54 --- /dev/null +++ b/.triton/dump/ea4af10b775ebbfe0bdcca18fd6288c8/triton_.ttgir @@ -0,0 +1,165 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<512> : tensor<16x1xi32, #blocked> + %cst_0 = arith.constant dense<256> : tensor<1x128xi32, #blocked> + %cst_1 = arith.constant dense<256> : tensor<16x1xi32, #blocked> + %cst_2 = arith.constant dense<0.000000e+00> : tensor<16x128xf32, #blocked> + %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x128xf32, #blocked> + %cst_4 = arith.constant dense<1.000000e+00> : tensor<16x128xf32, #blocked> + %cst_5 = arith.constant dense<256> : tensor<16x1xi64, #blocked> + %cst_6 = arith.constant dense<0> : tensor<16x1xi64, #blocked> + %cst_7 = arith.constant dense<50257> : tensor<16x1xi64, #blocked> + %cst_8 = arith.constant dense<50257> : tensor<16x1xi64, #blocked1> + %cst_9 = arith.constant dense<0> : tensor<16x1xi64, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %c128_i32 = arith.constant 128 : i32 + %c256_i32 = arith.constant 256 : i32 + %cst_10 = arith.constant dense<1.000000e+00> : tensor<16x128xf32, #blocked2> + %cst_11 = arith.constant 0.000000e+00 : f32 + %cst_12 = arith.constant dense<0.000000e+00> : tensor<16x128xf32, #blocked2> + %cst_13 = arith.constant dense<256> : tensor<1x128xi32, #blocked2> + %cst_14 = arith.constant dense<9.99999974E-6> : tensor<16x1xf32, #blocked> + %cst_15 = arith.constant dense<2.560000e+02> : tensor<16x1xf32, #blocked> + %cst_16 = arith.constant dense<0.000000e+00> : tensor<16x128xbf16, #blocked> + %c16_i32 = arith.constant 16 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c16_i32 : i32 + %2 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xi32, #blocked> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<16x1xi32, #blocked1> + %6 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked> + %7 = tt.splat %1 : (i32) -> tensor<16x1xi32, #blocked1> + %8 = arith.addi %6, %4 : tensor<16x1xi32, #blocked> + %9 = arith.addi %7, %5 : tensor<16x1xi32, #blocked1> + %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %11 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> + %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x128xi32, #blocked> + %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x128xi32, #blocked2> + %14 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x1x!tt.ptr, #blocked> + %15 = tt.splat %arg0 : (!tt.ptr) -> tensor<16x1x!tt.ptr, #blocked1> + %16 = tt.addptr %14, %8 : tensor<16x1x!tt.ptr, #blocked>, tensor<16x1xi32, #blocked> + %17 = tt.addptr %15, %9 : tensor<16x1x!tt.ptr, #blocked1>, tensor<16x1xi32, #blocked1> + %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked> + %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x1xi64, #blocked1> + %20 = arith.remsi %8, %cst : tensor<16x1xi32, #blocked> + %21 = arith.muli %20, %cst_1 : tensor<16x1xi32, #blocked> + %22 = tt.broadcast %21 : (tensor<16x1xi32, #blocked>) -> tensor<16x128xi32, #blocked> + %23 = tt.splat %arg2 : (!tt.ptr) -> tensor<16x128x!tt.ptr, #blocked> + %24 = arith.muli %8, %cst_1 : tensor<16x1xi32, #blocked> + %25 = tt.broadcast %24 : (tensor<16x1xi32, #blocked>) -> tensor<16x128xi32, #blocked> + %26 = tt.splat %arg3 : (!tt.ptr) -> tensor<16x128x!tt.ptr, #blocked> + %27 = arith.addi %18, %cst_7 : tensor<16x1xi64, #blocked> + %28 = arith.addi %19, %cst_8 : tensor<16x1xi64, #blocked1> + %29 = arith.cmpi slt, %18, %cst_6 : tensor<16x1xi64, #blocked> + %30 = arith.cmpi slt, %19, %cst_9 : tensor<16x1xi64, #blocked1> + %31 = arith.select %29, %27, %18 : tensor<16x1xi1, #blocked>, tensor<16x1xi64, #blocked> + %32 = arith.select %30, %28, %19 : tensor<16x1xi1, #blocked1>, tensor<16x1xi64, #blocked1> + %33 = arith.cmpi sge, %32, %cst_9 : tensor<16x1xi64, #blocked1> + %34 = arith.cmpi slt, %32, %cst_8 : tensor<16x1xi64, #blocked1> + %35 = arith.andi %33, %34 : tensor<16x1xi1, #blocked1> + %36 = arith.muli %31, %cst_5 : tensor<16x1xi64, #blocked> + %37 = tt.broadcast %36 : (tensor<16x1xi64, #blocked>) -> tensor<16x128xi64, #blocked> + %38 = tt.splat %arg1 : (!tt.ptr) -> tensor<16x128x!tt.ptr, #blocked> + %39:4 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 iter_args(%arg9 = %cst_2, %arg10 = %cst_2, %arg11 = %cst_12, %arg12 = %cst_2) -> (tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked2>, tensor<16x128xf32, #blocked>) : i32 { + %49 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked> + %50 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked2> + %51 = arith.addi %49, %12 : tensor<1x128xi32, #blocked> + %52 = arith.addi %50, %13 : tensor<1x128xi32, #blocked2> + %53 = arith.cmpi slt, %51, %cst_0 : tensor<1x128xi32, #blocked> + %54 = arith.cmpi slt, %52, %cst_13 : tensor<1x128xi32, #blocked2> + %55 = tt.broadcast %51 : (tensor<1x128xi32, #blocked>) -> tensor<16x128xi32, #blocked> + %56 = arith.addi %55, %22 : tensor<16x128xi32, #blocked> + %57 = tt.addptr %23, %56 : tensor<16x128x!tt.ptr, #blocked>, tensor<16x128xi32, #blocked> + %58 = tt.broadcast %53 : (tensor<1x128xi1, #blocked>) -> tensor<16x128xi1, #blocked> + %59 = tt.broadcast %54 : (tensor<1x128xi1, #blocked2>) -> tensor<16x128xi1, #blocked2> + %60 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32, #blocked> + %61 = arith.addi %55, %25 : tensor<16x128xi32, #blocked> + %62 = tt.addptr %26, %61 : tensor<16x128x!tt.ptr, #blocked>, tensor<16x128xi32, #blocked> + %63 = tt.load %62, %58, %cst_16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xbf16, #blocked> + %64 = arith.extf %63 : tensor<16x128xbf16, #blocked> to tensor<16x128xf32, #blocked> + tt.assert %35, "index out of bounds: 0 <= tmp3 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "", 1892 : tensor<16x1xi1, #blocked1> + %65 = arith.extsi %51 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked> + %66 = tt.broadcast %65 : (tensor<1x128xi64, #blocked>) -> tensor<16x128xi64, #blocked> + %67 = arith.addi %66, %37 : tensor<16x128xi64, #blocked> + %68 = tt.addptr %38, %67 : tensor<16x128x!tt.ptr, #blocked>, tensor<16x128xi64, #blocked> + %69 = tt.load %68, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32, #blocked> + %70 = arith.addf %69, %60 : tensor<16x128xf32, #blocked> + %71 = arith.addf %70, %64 : tensor<16x128xf32, #blocked> + %72 = arith.subf %71, %arg9 : tensor<16x128xf32, #blocked> + %73 = arith.addf %arg12, %cst_4 : tensor<16x128xf32, #blocked> + %74 = arith.addf %arg11, %cst_10 : tensor<16x128xf32, #blocked2> + %75 = arith.divf %72, %73 : tensor<16x128xf32, #blocked> + %76 = arith.addf %arg9, %75 : tensor<16x128xf32, #blocked> + %77 = arith.subf %71, %76 : tensor<16x128xf32, #blocked> + %78 = arith.mulf %72, %77 : tensor<16x128xf32, #blocked> + %79 = arith.addf %arg10, %78 : tensor<16x128xf32, #blocked> + %80 = arith.select %58, %76, %arg9 : tensor<16x128xi1, #blocked>, tensor<16x128xf32, #blocked> + %81 = arith.select %58, %79, %arg10 : tensor<16x128xi1, #blocked>, tensor<16x128xf32, #blocked> + %82 = arith.select %58, %73, %arg12 : tensor<16x128xi1, #blocked>, tensor<16x128xf32, #blocked> + %83 = arith.select %59, %74, %arg11 : tensor<16x128xi1, #blocked2>, tensor<16x128xf32, #blocked2> + scf.yield %80, %81, %83, %82 : tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked2>, tensor<16x128xf32, #blocked> + } + %40 = triton_gpu.convert_layout %39#2 : (tensor<16x128xf32, #blocked2>) -> tensor<16x128xf32, #blocked> + %41:3 = "tt.reduce"(%39#0, %39#1, %40) <{axis = 1 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): + %49 = arith.subf %arg11, %arg8 : f32 + %50 = arith.addf %arg10, %arg13 : f32 + %51 = arith.cmpf oeq, %50, %cst_11 : f32 + %52 = arith.divf %arg13, %50 : f32 + %53 = arith.select %51, %cst_11, %52 : f32 + %54 = arith.mulf %49, %53 : f32 + %55 = arith.addf %arg8, %54 : f32 + %56 = arith.addf %arg9, %arg12 : f32 + %57 = arith.mulf %49, %49 : f32 + %58 = arith.mulf %57, %arg10 : f32 + %59 = arith.mulf %58, %53 : f32 + %60 = arith.addf %56, %59 : f32 + tt.reduce.return %55, %60, %50 : f32, f32, f32 + }) : (tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked>, tensor<16x128xf32, #blocked>) -> (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) + %42 = tt.expand_dims %41#0 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked> + %43 = tt.expand_dims %41#1 {axis = 1 : i32} : (tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16x1xf32, #blocked> + %44 = tt.splat %arg4 : (!tt.ptr) -> tensor<1x128x!tt.ptr, #blocked> + %45 = tt.broadcast %42 : (tensor<16x1xf32, #blocked>) -> tensor<16x128xf32, #blocked> + %46 = arith.divf %43, %cst_15 : tensor<16x1xf32, #blocked> + %47 = arith.addf %46, %cst_14 : tensor<16x1xf32, #blocked> + %48 = tt.splat %arg5 : (!tt.ptr) -> tensor<16x128x!tt.ptr, #blocked> + scf.for %arg8 = %c0_i32 to %c256_i32 step %c128_i32 : i32 { + %49 = tt.splat %arg8 : (i32) -> tensor<1x128xi32, #blocked> + %50 = arith.addi %49, %12 : tensor<1x128xi32, #blocked> + %51 = arith.cmpi slt, %50, %cst_0 : tensor<1x128xi32, #blocked> + %52 = tt.broadcast %50 : (tensor<1x128xi32, #blocked>) -> tensor<16x128xi32, #blocked> + %53 = arith.addi %52, %22 : tensor<16x128xi32, #blocked> + %54 = tt.addptr %23, %53 : tensor<16x128x!tt.ptr, #blocked>, tensor<16x128xi32, #blocked> + %55 = tt.broadcast %51 : (tensor<1x128xi1, #blocked>) -> tensor<16x128xi1, #blocked> + %56 = tt.load %54, %55, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<16x128xf32, #blocked> + %57 = arith.addi %52, %25 : tensor<16x128xi32, #blocked> + %58 = tt.addptr %26, %57 : tensor<16x128x!tt.ptr, #blocked>, tensor<16x128xi32, #blocked> + %59 = tt.load %58, %55, %cst_16 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xbf16, #blocked> + %60 = arith.extf %59 : tensor<16x128xbf16, #blocked> to tensor<16x128xf32, #blocked> + %61 = tt.addptr %44, %50 : tensor<1x128x!tt.ptr, #blocked>, tensor<1x128xi32, #blocked> + %62 = tt.load %61, %51, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x128xf32, #blocked> + tt.assert %35, "index out of bounds: 0 <= tmp16 < 50257", "/usr/local/lib/python3.10/dist-packages/torch/_inductor/codecache.py", "", 1892 : tensor<16x1xi1, #blocked1> + %63 = arith.extsi %50 : tensor<1x128xi32, #blocked> to tensor<1x128xi64, #blocked> + %64 = tt.broadcast %63 : (tensor<1x128xi64, #blocked>) -> tensor<16x128xi64, #blocked> + %65 = arith.addi %64, %37 : tensor<16x128xi64, #blocked> + %66 = tt.addptr %38, %65 : tensor<16x128x!tt.ptr, #blocked>, tensor<16x128xi64, #blocked> + %67 = tt.load %66, %55, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<16x128xf32, #blocked> + %68 = arith.addf %67, %56 : tensor<16x128xf32, #blocked> + %69 = arith.addf %68, %60 : tensor<16x128xf32, #blocked> + %70 = arith.subf %69, %45 : tensor<16x128xf32, #blocked> + %71 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<16x1xf32, #blocked>) -> tensor<16x1xf32, #blocked> + %72 = tt.broadcast %71 : (tensor<16x1xf32, #blocked>) -> tensor<16x128xf32, #blocked> + %73 = arith.mulf %70, %72 : tensor<16x128xf32, #blocked> + %74 = tt.broadcast %62 : (tensor<1x128xf32, #blocked>) -> tensor<16x128xf32, #blocked> + %75 = arith.mulf %73, %74 : tensor<16x128xf32, #blocked> + %76 = tt.addptr %48, %57 : tensor<16x128x!tt.ptr, #blocked>, tensor<16x128xi32, #blocked> + %77 = arith.truncf %75 : tensor<16x128xf32, #blocked> to tensor<16x128xbf16, #blocked> + tt.store %76, %77, %55 {cache = 1 : i32, evict = 1 : i32} : tensor<16x128xbf16, #blocked> + } + tt.return + } +} diff --git a/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.cubin b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..6a9ac74a7f25a7558d454e5e8eb8f311c61df0cf Binary files /dev/null and b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.cubin differ diff --git a/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.llir b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..66db9c191fb1d24bcf079ba9cb444fd0f758172e --- /dev/null +++ b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.llir @@ -0,0 +1,610 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, ptr addrspace(1) %10, ptr addrspace(1) %11, ptr addrspace(1) %12, ptr addrspace(1) %13, ptr addrspace(1) %14, ptr addrspace(1) %15, ptr addrspace(1) %16, ptr addrspace(1) %17, ptr addrspace(1) %18, ptr addrspace(1) %19, ptr addrspace(1) %20, ptr addrspace(1) %21, ptr addrspace(1) %22, ptr addrspace(1) %23, ptr addrspace(1) %24, ptr addrspace(1) %25, ptr addrspace(1) %26, ptr addrspace(1) %27, ptr addrspace(1) %28, i32 %29, i32 %30) local_unnamed_addr !dbg !5 { + %32 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %33 = and i32 %32, 31, !dbg !8 + %34 = lshr i32 %32, 5, !dbg !8 + %35 = and i32 %34, 1, !dbg !8 + %urem = shl i32 %32, 2, !dbg !8 + %36 = and i32 %urem, 252, !dbg !8 + %37 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9 + %38 = shl i32 %37, 8, !dbg !10 + %39 = or i32 %38, %36, !dbg !11 + %40 = sext i32 %39 to i64, !dbg !12 + %41 = getelementptr float, ptr addrspace(1) %0, i64 %40, !dbg !12 + %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !13 + %43 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !13 + %44 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !13 + %45 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !13 + %46 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !13 + %47 = bitcast i32 %43 to float, !dbg !13 + %48 = bitcast i32 %44 to float, !dbg !13 + %49 = bitcast i32 %45 to float, !dbg !13 + %50 = bitcast i32 %46 to float, !dbg !13 + %51 = getelementptr i16, ptr addrspace(1) %1, i64 %40, !dbg !14 + %52 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %51, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !15 + %53 = extractvalue { i32, i32 } %52, 0, !dbg !15 + %54 = extractvalue { i32, i32 } %52, 1, !dbg !15 + %55 = trunc i32 %53 to i16, !dbg !15 + %extelt.offset = lshr i32 %53, 16, !dbg !15 + %56 = trunc i32 %extelt.offset to i16, !dbg !15 + %57 = trunc i32 %54 to i16, !dbg !15 + %extelt.offset1 = lshr i32 %54, 16, !dbg !15 + %58 = trunc i32 %extelt.offset1 to i16, !dbg !15 + %59 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %55) #3, !dbg !16 + %60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %56) #3, !dbg !16 + %61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #3, !dbg !16 + %62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %58) #3, !dbg !16 + %63 = getelementptr i16, ptr addrspace(1) %2, i64 %40, !dbg !17 + %64 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %63, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !18 + %65 = extractvalue { i32, i32 } %64, 0, !dbg !18 + %66 = extractvalue { i32, i32 } %64, 1, !dbg !18 + %67 = trunc i32 %65 to i16, !dbg !18 + %extelt.offset2 = lshr i32 %65, 16, !dbg !18 + %68 = trunc i32 %extelt.offset2 to i16, !dbg !18 + %69 = trunc i32 %66 to i16, !dbg !18 + %extelt.offset3 = lshr i32 %66, 16, !dbg !18 + %70 = trunc i32 %extelt.offset3 to i16, !dbg !18 + %71 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %67) #3, !dbg !19 + %72 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %68) #3, !dbg !19 + %73 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %69) #3, !dbg !19 + %74 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #3, !dbg !19 + %75 = sext i32 %37 to i64, !dbg !20 + %76 = getelementptr float, ptr addrspace(1) %3, i64 %75, !dbg !20 + %77 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %76, i1 true) #3, !dbg !21 + %78 = bitcast i32 %77 to float, !dbg !21 + %79 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %76, i1 true) #3, !dbg !21 + %80 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %76, i1 true) #3, !dbg !21 + %81 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %76, i1 true) #3, !dbg !21 + %82 = getelementptr float, ptr addrspace(1) %4, i64 %75, !dbg !22 + %83 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %82, i1 true) #3, !dbg !23 + %84 = bitcast i32 %83 to float, !dbg !23 + %85 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %82, i1 true) #3, !dbg !23 + %86 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %82, i1 true) #3, !dbg !23 + %87 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %82, i1 true) #3, !dbg !23 + %88 = getelementptr i16, ptr addrspace(1) %5, i64 %40, !dbg !24 + %89 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %88, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !25 + %90 = extractvalue { i32, i32 } %89, 0, !dbg !25 + %91 = extractvalue { i32, i32 } %89, 1, !dbg !25 + %92 = trunc i32 %90 to i16, !dbg !25 + %extelt.offset4 = lshr i32 %90, 16, !dbg !25 + %93 = trunc i32 %extelt.offset4 to i16, !dbg !25 + %94 = trunc i32 %91 to i16, !dbg !25 + %extelt.offset5 = lshr i32 %91, 16, !dbg !25 + %95 = trunc i32 %extelt.offset5 to i16, !dbg !25 + %96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %92) #3, !dbg !26 + %97 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %93) #3, !dbg !26 + %98 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %94) #3, !dbg !26 + %99 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %95) #3, !dbg !26 + %100 = getelementptr float, ptr addrspace(1) %6, i64 %75, !dbg !27 + %101 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %100, i1 true) #3, !dbg !28 + %102 = bitcast i32 %101 to float, !dbg !28 + %103 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %100, i1 true) #3, !dbg !28 + %104 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %100, i1 true) #3, !dbg !28 + %105 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %100, i1 true) #3, !dbg !28 + %106 = getelementptr float, ptr addrspace(1) %7, i64 %75, !dbg !29 + %107 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %106, i1 true) #3, !dbg !30 + %108 = bitcast i32 %107 to float, !dbg !30 + %109 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %106, i1 true) #3, !dbg !30 + %110 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %106, i1 true) #3, !dbg !30 + %111 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %106, i1 true) #3, !dbg !30 + %112 = getelementptr i16, ptr addrspace(1) %8, i64 %40, !dbg !31 + %113 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %112, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !32 + %114 = extractvalue { i32, i32 } %113, 0, !dbg !32 + %115 = extractvalue { i32, i32 } %113, 1, !dbg !32 + %116 = trunc i32 %114 to i16, !dbg !32 + %extelt.offset6 = lshr i32 %114, 16, !dbg !32 + %117 = trunc i32 %extelt.offset6 to i16, !dbg !32 + %118 = trunc i32 %115 to i16, !dbg !32 + %extelt.offset7 = lshr i32 %115, 16, !dbg !32 + %119 = trunc i32 %extelt.offset7 to i16, !dbg !32 + %120 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %116) #3, !dbg !33 + %121 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %117) #3, !dbg !33 + %122 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %118) #3, !dbg !33 + %123 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %119) #3, !dbg !33 + %124 = getelementptr i16, ptr addrspace(1) %9, i64 %40, !dbg !34 + %125 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %124, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !35 + %126 = extractvalue { i32, i32 } %125, 0, !dbg !35 + %127 = extractvalue { i32, i32 } %125, 1, !dbg !35 + %128 = trunc i32 %126 to i16, !dbg !35 + %extelt.offset8 = lshr i32 %126, 16, !dbg !35 + %129 = trunc i32 %extelt.offset8 to i16, !dbg !35 + %130 = trunc i32 %127 to i16, !dbg !35 + %extelt.offset9 = lshr i32 %127, 16, !dbg !35 + %131 = trunc i32 %extelt.offset9 to i16, !dbg !35 + %132 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %128) #3, !dbg !36 + %133 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %129) #3, !dbg !36 + %134 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %130) #3, !dbg !36 + %135 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %131) #3, !dbg !36 + %136 = getelementptr i16, ptr addrspace(1) %10, i64 %40, !dbg !37 + %137 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %136, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !38 + %138 = extractvalue { i32, i32 } %137, 0, !dbg !38 + %139 = extractvalue { i32, i32 } %137, 1, !dbg !38 + %140 = trunc i32 %138 to i16, !dbg !38 + %extelt.offset10 = lshr i32 %138, 16, !dbg !38 + %141 = trunc i32 %extelt.offset10 to i16, !dbg !38 + %142 = trunc i32 %139 to i16, !dbg !38 + %extelt.offset11 = lshr i32 %139, 16, !dbg !38 + %143 = trunc i32 %extelt.offset11 to i16, !dbg !38 + %144 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %140) #3, !dbg !39 + %145 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %141) #3, !dbg !39 + %146 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %142) #3, !dbg !39 + %147 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %143) #3, !dbg !39 + %148 = getelementptr float, ptr addrspace(1) %11, i64 %75, !dbg !40 + %149 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %148, i1 true) #3, !dbg !41 + %150 = bitcast i32 %149 to float, !dbg !41 + %151 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %148, i1 true) #3, !dbg !41 + %152 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %148, i1 true) #3, !dbg !41 + %153 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %148, i1 true) #3, !dbg !41 + %154 = getelementptr float, ptr addrspace(1) %12, i64 %75, !dbg !42 + %155 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %154, i1 true) #3, !dbg !43 + %156 = bitcast i32 %155 to float, !dbg !43 + %157 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %154, i1 true) #3, !dbg !43 + %158 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %154, i1 true) #3, !dbg !43 + %159 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %154, i1 true) #3, !dbg !43 + %160 = getelementptr i16, ptr addrspace(1) %13, i64 %40, !dbg !44 + %161 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %160, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !45 + %162 = extractvalue { i32, i32 } %161, 0, !dbg !45 + %163 = extractvalue { i32, i32 } %161, 1, !dbg !45 + %164 = trunc i32 %162 to i16, !dbg !45 + %extelt.offset12 = lshr i32 %162, 16, !dbg !45 + %165 = trunc i32 %extelt.offset12 to i16, !dbg !45 + %166 = trunc i32 %163 to i16, !dbg !45 + %extelt.offset13 = lshr i32 %163, 16, !dbg !45 + %167 = trunc i32 %extelt.offset13 to i16, !dbg !45 + %168 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %164) #3, !dbg !46 + %169 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %165) #3, !dbg !46 + %170 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %166) #3, !dbg !46 + %171 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %167) #3, !dbg !46 + %172 = getelementptr float, ptr addrspace(1) %14, i64 %75, !dbg !47 + %173 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %172, i1 true) #3, !dbg !48 + %174 = bitcast i32 %173 to float, !dbg !48 + %175 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %172, i1 true) #3, !dbg !48 + %176 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %172, i1 true) #3, !dbg !48 + %177 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %172, i1 true) #3, !dbg !48 + %178 = getelementptr float, ptr addrspace(1) %15, i64 %75, !dbg !49 + %179 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %178, i1 true) #3, !dbg !50 + %180 = bitcast i32 %179 to float, !dbg !50 + %181 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %178, i1 true) #3, !dbg !50 + %182 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %178, i1 true) #3, !dbg !50 + %183 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %178, i1 true) #3, !dbg !50 + %184 = getelementptr i16, ptr addrspace(1) %16, i64 %40, !dbg !51 + %185 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %184, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !52 + %186 = extractvalue { i32, i32 } %185, 0, !dbg !52 + %187 = extractvalue { i32, i32 } %185, 1, !dbg !52 + %188 = trunc i32 %186 to i16, !dbg !52 + %extelt.offset14 = lshr i32 %186, 16, !dbg !52 + %189 = trunc i32 %extelt.offset14 to i16, !dbg !52 + %190 = trunc i32 %187 to i16, !dbg !52 + %extelt.offset15 = lshr i32 %187, 16, !dbg !52 + %191 = trunc i32 %extelt.offset15 to i16, !dbg !52 + %192 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %188) #3, !dbg !53 + %193 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %189) #3, !dbg !53 + %194 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %190) #3, !dbg !53 + %195 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %191) #3, !dbg !53 + %196 = getelementptr float, ptr addrspace(1) %17, i64 %75, !dbg !54 + %197 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %196, i1 true) #3, !dbg !55 + %198 = bitcast i32 %197 to float, !dbg !55 + %199 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %196, i1 true) #3, !dbg !55 + %200 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %196, i1 true) #3, !dbg !55 + %201 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %196, i1 true) #3, !dbg !55 + %202 = getelementptr float, ptr addrspace(1) %18, i64 %75, !dbg !56 + %203 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %202, i1 true) #3, !dbg !57 + %204 = bitcast i32 %203 to float, !dbg !57 + %205 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %202, i1 true) #3, !dbg !57 + %206 = bitcast i32 %205 to float, !dbg !57 + %207 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %202, i1 true) #3, !dbg !57 + %208 = bitcast i32 %207 to float, !dbg !57 + %209 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %202, i1 true) #3, !dbg !57 + %210 = bitcast i32 %209 to float, !dbg !57 + %211 = getelementptr float, ptr addrspace(1) %19, i64 %40, !dbg !58 + %212 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %211, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !59 + %213 = extractvalue { i32, i32, i32, i32 } %212, 0, !dbg !59 + %214 = extractvalue { i32, i32, i32, i32 } %212, 1, !dbg !59 + %215 = extractvalue { i32, i32, i32, i32 } %212, 2, !dbg !59 + %216 = extractvalue { i32, i32, i32, i32 } %212, 3, !dbg !59 + %217 = zext nneg i32 %36 to i64, !dbg !60 + %218 = getelementptr float, ptr addrspace(1) %20, i64 %217, !dbg !60 + %219 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %218, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #3, !dbg !61 + %220 = extractvalue { i32, i32, i32, i32 } %219, 0, !dbg !61 + %221 = extractvalue { i32, i32, i32, i32 } %219, 1, !dbg !61 + %222 = extractvalue { i32, i32, i32, i32 } %219, 2, !dbg !61 + %223 = extractvalue { i32, i32, i32, i32 } %219, 3, !dbg !61 + %224 = fadd float %59, %47, !dbg !62 + %225 = fadd float %60, %48, !dbg !62 + %226 = fadd float %61, %49, !dbg !62 + %227 = fadd float %62, %50, !dbg !62 + %228 = fadd float %224, %71, !dbg !63 + %229 = fadd float %225, %72, !dbg !63 + %230 = fadd float %226, %73, !dbg !63 + %231 = fadd float %227, %74, !dbg !63 + %232 = fsub float %228, %78, !dbg !64 + %233 = fsub float %229, %78, !dbg !64 + %234 = fsub float %230, %78, !dbg !64 + %235 = fsub float %231, %78, !dbg !64 + %236 = fmul float %232, %84, !dbg !65 + %237 = fmul float %233, %84, !dbg !65 + %238 = fmul float %234, %84, !dbg !65 + %239 = fmul float %235, %84, !dbg !65 + %240 = fadd float %228, %96, !dbg !66 + %241 = fadd float %229, %97, !dbg !66 + %242 = fadd float %230, %98, !dbg !66 + %243 = fadd float %231, %99, !dbg !66 + %244 = fsub float %240, %102, !dbg !67 + %245 = fsub float %241, %102, !dbg !67 + %246 = fsub float %242, %102, !dbg !67 + %247 = fsub float %243, %102, !dbg !67 + %248 = fmul float %244, %108, !dbg !68 + %249 = fmul float %245, %108, !dbg !68 + %250 = fmul float %246, %108, !dbg !68 + %251 = fmul float %247, %108, !dbg !68 + %252 = fadd float %240, %120, !dbg !69 + %253 = fadd float %241, %121, !dbg !69 + %254 = fadd float %242, %122, !dbg !69 + %255 = fadd float %243, %123, !dbg !69 + %256 = fadd float %252, %132, !dbg !70 + %257 = fadd float %253, %133, !dbg !70 + %258 = fadd float %254, %134, !dbg !70 + %259 = fadd float %255, %135, !dbg !70 + %260 = fadd float %256, %144, !dbg !71 + %261 = fadd float %257, %145, !dbg !71 + %262 = fadd float %258, %146, !dbg !71 + %263 = fadd float %259, %147, !dbg !71 + %264 = fsub float %260, %150, !dbg !72 + %265 = fsub float %261, %150, !dbg !72 + %266 = fsub float %262, %150, !dbg !72 + %267 = fsub float %263, %150, !dbg !72 + %268 = fmul float %264, %156, !dbg !73 + %269 = fmul float %265, %156, !dbg !73 + %270 = fmul float %266, %156, !dbg !73 + %271 = fmul float %267, %156, !dbg !73 + %272 = fadd float %260, %168, !dbg !74 + %273 = fadd float %261, %169, !dbg !74 + %274 = fadd float %262, %170, !dbg !74 + %275 = fadd float %263, %171, !dbg !74 + %276 = fsub float %272, %174, !dbg !75 + %277 = fsub float %273, %174, !dbg !75 + %278 = fsub float %274, %174, !dbg !75 + %279 = fsub float %275, %174, !dbg !75 + %280 = fmul float %276, %180, !dbg !76 + %281 = fmul float %277, %180, !dbg !76 + %282 = fmul float %278, %180, !dbg !76 + %283 = fmul float %279, %180, !dbg !76 + %284 = fadd float %272, %192, !dbg !77 + %285 = fadd float %273, %193, !dbg !77 + %286 = fadd float %274, %194, !dbg !77 + %287 = fadd float %275, %195, !dbg !77 + %288 = fsub float %284, %198, !dbg !78 + %289 = fsub float %285, %198, !dbg !78 + %290 = fsub float %286, %198, !dbg !78 + %291 = fsub float %287, %198, !dbg !78 + %292 = fmul float %288, %204, !dbg !79 + %293 = fmul float %289, %204, !dbg !79 + %294 = fmul float %290, %204, !dbg !79 + %295 = fmul float %291, %204, !dbg !79 + %296 = insertelement <2 x i32> poison, i32 %213, i64 0, !dbg !59 + %297 = insertelement <2 x i32> %296, i32 %214, i64 1, !dbg !59 + %298 = bitcast <2 x i32> %297 to <2 x float>, !dbg !59 + %299 = insertelement <2 x i32> poison, i32 %220, i64 0, !dbg !61 + %300 = insertelement <2 x i32> %299, i32 %221, i64 1, !dbg !61 + %301 = bitcast <2 x i32> %300 to <2 x float>, !dbg !61 + %302 = fmul <2 x float> %298, %301, !dbg !80 + %303 = insertelement <2 x i32> poison, i32 %216, i64 0, !dbg !59 + %304 = insertelement <2 x i32> %303, i32 %215, i64 1, !dbg !59 + %305 = bitcast <2 x i32> %304 to <2 x float>, !dbg !59 + %306 = insertelement <2 x i32> poison, i32 %223, i64 0, !dbg !61 + %307 = insertelement <2 x i32> %306, i32 %222, i64 1, !dbg !61 + %308 = bitcast <2 x i32> %307 to <2 x float>, !dbg !61 + %309 = fmul <2 x float> %305, %308, !dbg !80 + %310 = extractelement <2 x float> %302, i64 0, !dbg !81 + %311 = extractelement <2 x float> %302, i64 1, !dbg !81 + %312 = fadd float %310, %311, !dbg !81 + %313 = extractelement <2 x float> %309, i64 1, !dbg !81 + %314 = fadd float %313, %312, !dbg !81 + %315 = extractelement <2 x float> %309, i64 0, !dbg !81 + %316 = fadd float %315, %314, !dbg !81 + %317 = bitcast float %316 to i32, !dbg !87 + %318 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %317, i32 16, i32 31), !dbg !87 + %319 = bitcast i32 %318 to float, !dbg !87 + %320 = fadd float %316, %319, !dbg !81 + %321 = bitcast float %320 to i32, !dbg !87 + %322 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %321, i32 8, i32 31), !dbg !87 + %323 = bitcast i32 %322 to float, !dbg !87 + %324 = fadd float %320, %323, !dbg !81 + %325 = bitcast float %324 to i32, !dbg !87 + %326 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %325, i32 4, i32 31), !dbg !87 + %327 = bitcast i32 %326 to float, !dbg !87 + %328 = fadd float %324, %327, !dbg !81 + %329 = bitcast float %328 to i32, !dbg !87 + %330 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %329, i32 2, i32 31), !dbg !87 + %331 = bitcast i32 %330 to float, !dbg !87 + %332 = fadd float %328, %331, !dbg !81 + %333 = bitcast float %332 to i32, !dbg !87 + %334 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %333, i32 1, i32 31), !dbg !87 + %335 = bitcast i32 %334 to float, !dbg !87 + %336 = fadd float %332, %335, !dbg !81 + %337 = icmp eq i32 %33, 0, !dbg !87 + %338 = zext nneg i32 %35 to i64, !dbg !87 + %339 = getelementptr float, ptr addrspace(3) @global_smem, i64 %338, !dbg !87 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %339, float %336, i1 %337) #3, !dbg !87 + tail call void @llvm.nvvm.barrier0(), !dbg !87 + %340 = icmp slt i32 %32, 2, !dbg !87 + %341 = sext i32 %32 to i64, !dbg !87 + %342 = getelementptr float, ptr addrspace(3) @global_smem, i64 %341, !dbg !87 + %343 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %342, i1 %340) #3, !dbg !87 + %344 = bitcast float %343 to i32, !dbg !87 + %345 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %344, i32 1, i32 31), !dbg !87 + %346 = bitcast i32 %345 to float, !dbg !87 + %347 = fadd float %343, %346, !dbg !81 + %348 = and i32 %32, 1, !dbg !87 + %349 = icmp eq i32 %348, 0, !dbg !87 + %350 = and i1 %340, %349, !dbg !87 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %342, float %347, i1 %350) #3, !dbg !87 + tail call void @llvm.nvvm.barrier0(), !dbg !87 + %351 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !87 + %352 = fadd float %351, 0.000000e+00, !dbg !89 + %353 = fmul float %292, %310, !dbg !93 + %354 = fmul float %293, %311, !dbg !93 + %355 = fmul float %294, %313, !dbg !93 + %356 = fmul float %295, %315, !dbg !93 + tail call void @llvm.nvvm.barrier0(), !dbg !94 + %357 = fadd float %353, %354, !dbg !96 + %358 = fadd float %355, %357, !dbg !96 + %359 = fadd float %356, %358, !dbg !96 + %360 = bitcast float %359 to i32, !dbg !94 + %361 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %360, i32 16, i32 31), !dbg !94 + %362 = bitcast i32 %361 to float, !dbg !94 + %363 = fadd float %359, %362, !dbg !96 + %364 = bitcast float %363 to i32, !dbg !94 + %365 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %364, i32 8, i32 31), !dbg !94 + %366 = bitcast i32 %365 to float, !dbg !94 + %367 = fadd float %363, %366, !dbg !96 + %368 = bitcast float %367 to i32, !dbg !94 + %369 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %368, i32 4, i32 31), !dbg !94 + %370 = bitcast i32 %369 to float, !dbg !94 + %371 = fadd float %367, %370, !dbg !96 + %372 = bitcast float %371 to i32, !dbg !94 + %373 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %372, i32 2, i32 31), !dbg !94 + %374 = bitcast i32 %373 to float, !dbg !94 + %375 = fadd float %371, %374, !dbg !96 + %376 = bitcast float %375 to i32, !dbg !94 + %377 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %376, i32 1, i32 31), !dbg !94 + %378 = bitcast i32 %377 to float, !dbg !94 + %379 = fadd float %375, %378, !dbg !96 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %339, float %379, i1 %337) #3, !dbg !94 + tail call void @llvm.nvvm.barrier0(), !dbg !94 + %380 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %342, i1 %340) #3, !dbg !94 + %381 = bitcast float %380 to i32, !dbg !94 + %382 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %381, i32 1, i32 31), !dbg !94 + %383 = bitcast i32 %382 to float, !dbg !94 + %384 = fadd float %380, %383, !dbg !96 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %342, float %384, i1 %350) #3, !dbg !94 + tail call void @llvm.nvvm.barrier0(), !dbg !94 + %385 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !94 + %386 = fadd float %385, 0.000000e+00, !dbg !99 + %387 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %204, float 2.560000e+02) #3, !dbg !101 + %388 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %206, float 2.560000e+02) #3, !dbg !101 + %389 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %208, float 2.560000e+02) #3, !dbg !101 + %390 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %210, float 2.560000e+02) #3, !dbg !101 + %391 = fmul float %310, 2.560000e+02, !dbg !102 + %392 = fmul float %311, 2.560000e+02, !dbg !102 + %393 = fmul float %313, 2.560000e+02, !dbg !102 + %394 = fmul float %315, 2.560000e+02, !dbg !102 + %395 = fsub float %391, %352, !dbg !103 + %396 = fsub float %392, %352, !dbg !103 + %397 = fsub float %393, %352, !dbg !103 + %398 = fsub float %394, %352, !dbg !103 + %399 = fmul float %292, %386, !dbg !104 + %400 = fmul float %293, %386, !dbg !104 + %401 = fmul float %294, %386, !dbg !104 + %402 = fmul float %295, %386, !dbg !104 + %403 = fsub float %395, %399, !dbg !105 + %404 = fsub float %396, %400, !dbg !105 + %405 = fsub float %397, %401, !dbg !105 + %406 = fsub float %398, %402, !dbg !105 + %407 = fmul float %387, %403, !dbg !106 + %408 = fmul float %387, %404, !dbg !106 + %409 = fmul float %387, %405, !dbg !106 + %410 = fmul float %387, %406, !dbg !106 + %411 = getelementptr float, ptr addrspace(1) %21, i64 %40, !dbg !107 + %412 = bitcast float %236 to i32, !dbg !108 + %413 = bitcast float %237 to i32, !dbg !108 + %414 = bitcast float %238 to i32, !dbg !108 + %415 = bitcast float %239 to i32, !dbg !108 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %412, i32 %413, i32 %414, i32 %415, ptr addrspace(1) %411, i1 true) #3, !dbg !108 + %416 = getelementptr float, ptr addrspace(1) %22, i64 %40, !dbg !109 + %417 = bitcast float %248 to i32, !dbg !110 + %418 = bitcast float %249 to i32, !dbg !110 + %419 = bitcast float %250 to i32, !dbg !110 + %420 = bitcast float %251 to i32, !dbg !110 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %417, i32 %418, i32 %419, i32 %420, ptr addrspace(1) %416, i1 true) #3, !dbg !110 + %421 = getelementptr float, ptr addrspace(1) %23, i64 %40, !dbg !111 + %422 = bitcast float %252 to i32, !dbg !112 + %423 = bitcast float %253 to i32, !dbg !112 + %424 = bitcast float %254 to i32, !dbg !112 + %425 = bitcast float %255 to i32, !dbg !112 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %422, i32 %423, i32 %424, i32 %425, ptr addrspace(1) %421, i1 true) #3, !dbg !112 + %426 = getelementptr float, ptr addrspace(1) %24, i64 %40, !dbg !113 + %427 = bitcast float %268 to i32, !dbg !114 + %428 = bitcast float %269 to i32, !dbg !114 + %429 = bitcast float %270 to i32, !dbg !114 + %430 = bitcast float %271 to i32, !dbg !114 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %427, i32 %428, i32 %429, i32 %430, ptr addrspace(1) %426, i1 true) #3, !dbg !114 + %431 = getelementptr float, ptr addrspace(1) %25, i64 %40, !dbg !115 + %432 = bitcast float %280 to i32, !dbg !116 + %433 = bitcast float %281 to i32, !dbg !116 + %434 = bitcast float %282 to i32, !dbg !116 + %435 = bitcast float %283 to i32, !dbg !116 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %432, i32 %433, i32 %434, i32 %435, ptr addrspace(1) %431, i1 true) #3, !dbg !116 + %436 = getelementptr float, ptr addrspace(1) %26, i64 %40, !dbg !117 + %437 = bitcast float %292 to i32, !dbg !118 + %438 = bitcast float %293 to i32, !dbg !118 + %439 = bitcast float %294 to i32, !dbg !118 + %440 = bitcast float %295 to i32, !dbg !118 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %437, i32 %438, i32 %439, i32 %440, ptr addrspace(1) %436, i1 true) #3, !dbg !118 + %441 = getelementptr float, ptr addrspace(1) %27, i64 %40, !dbg !119 + %442 = bitcast float %407 to i32, !dbg !120 + %443 = bitcast float %408 to i32, !dbg !120 + %444 = bitcast float %409 to i32, !dbg !120 + %445 = bitcast float %410 to i32, !dbg !120 + tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %442, i32 %443, i32 %444, i32 %445, ptr addrspace(1) %441, i1 true) #3, !dbg !120 + %446 = getelementptr i16, ptr addrspace(1) %28, i64 %40, !dbg !121 + %447 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %407) #3, !dbg !122 + %448 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %408) #3, !dbg !122 + %449 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %409) #3, !dbg !122 + %450 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %410) #3, !dbg !122 + %451 = insertelement <2 x i16> undef, i16 %447, i64 0, !dbg !122 + %452 = insertelement <2 x i16> %451, i16 %448, i64 1, !dbg !122 + %453 = bitcast <2 x i16> %452 to i32, !dbg !122 + %454 = insertelement <2 x i16> undef, i16 %449, i64 0, !dbg !122 + %455 = insertelement <2 x i16> %454, i16 %450, i64 1, !dbg !122 + %456 = bitcast <2 x i16> %455 to i32, !dbg !122 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %453, i32 %456, ptr addrspace(1) %446, i1 true) #3, !dbg !122 + ret void, !dbg !123 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "cyo4ksjyladdfw6jgu5nyxbapyihb5b54nc6mogi76rx2lajsiff.py", directory: "/tmp/torchinductor_root/yo") +!3 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de, !"maxntidx", i32 64} +!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de", linkageName: "triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 26, column: 26, scope: !5) +!9 = !DILocation(line: 23, column: 28, scope: !5) +!10 = !DILocation(line: 30, column: 40, scope: !5) +!11 = !DILocation(line: 30, column: 36, scope: !5) +!12 = !DILocation(line: 30, column: 30, scope: !5) +!13 = !DILocation(line: 30, column: 46, scope: !5) +!14 = !DILocation(line: 31, column: 30, scope: !5) +!15 = !DILocation(line: 31, column: 46, scope: !5) +!16 = !DILocation(line: 31, column: 67, scope: !5) +!17 = !DILocation(line: 32, column: 30, scope: !5) +!18 = !DILocation(line: 32, column: 46, scope: !5) +!19 = !DILocation(line: 32, column: 67, scope: !5) +!20 = !DILocation(line: 33, column: 30, scope: !5) +!21 = !DILocation(line: 33, column: 35, scope: !5) +!22 = !DILocation(line: 34, column: 30, scope: !5) +!23 = !DILocation(line: 34, column: 35, scope: !5) +!24 = !DILocation(line: 35, column: 31, scope: !5) +!25 = !DILocation(line: 35, column: 47, scope: !5) +!26 = !DILocation(line: 35, column: 68, scope: !5) +!27 = !DILocation(line: 36, column: 31, scope: !5) +!28 = !DILocation(line: 36, column: 36, scope: !5) +!29 = !DILocation(line: 37, column: 31, scope: !5) +!30 = !DILocation(line: 37, column: 36, scope: !5) +!31 = !DILocation(line: 38, column: 31, scope: !5) +!32 = !DILocation(line: 38, column: 47, scope: !5) +!33 = !DILocation(line: 38, column: 68, scope: !5) +!34 = !DILocation(line: 39, column: 31, scope: !5) +!35 = !DILocation(line: 39, column: 47, scope: !5) +!36 = !DILocation(line: 39, column: 68, scope: !5) +!37 = !DILocation(line: 40, column: 32, scope: !5) +!38 = !DILocation(line: 40, column: 48, scope: !5) +!39 = !DILocation(line: 40, column: 69, scope: !5) +!40 = !DILocation(line: 41, column: 32, scope: !5) +!41 = !DILocation(line: 41, column: 37, scope: !5) +!42 = !DILocation(line: 42, column: 32, scope: !5) +!43 = !DILocation(line: 42, column: 37, scope: !5) +!44 = !DILocation(line: 43, column: 32, scope: !5) +!45 = !DILocation(line: 43, column: 48, scope: !5) +!46 = !DILocation(line: 43, column: 69, scope: !5) +!47 = !DILocation(line: 44, column: 32, scope: !5) +!48 = !DILocation(line: 44, column: 37, scope: !5) +!49 = !DILocation(line: 45, column: 32, scope: !5) +!50 = !DILocation(line: 45, column: 37, scope: !5) +!51 = !DILocation(line: 46, column: 32, scope: !5) +!52 = !DILocation(line: 46, column: 48, scope: !5) +!53 = !DILocation(line: 46, column: 69, scope: !5) +!54 = !DILocation(line: 47, column: 32, scope: !5) +!55 = !DILocation(line: 47, column: 37, scope: !5) +!56 = !DILocation(line: 48, column: 32, scope: !5) +!57 = !DILocation(line: 48, column: 37, scope: !5) +!58 = !DILocation(line: 49, column: 32, scope: !5) +!59 = !DILocation(line: 49, column: 48, scope: !5) +!60 = !DILocation(line: 50, column: 32, scope: !5) +!61 = !DILocation(line: 50, column: 37, scope: !5) +!62 = !DILocation(line: 52, column: 18, scope: !5) +!63 = !DILocation(line: 54, column: 18, scope: !5) +!64 = !DILocation(line: 55, column: 18, scope: !5) +!65 = !DILocation(line: 56, column: 19, scope: !5) +!66 = !DILocation(line: 58, column: 19, scope: !5) +!67 = !DILocation(line: 59, column: 20, scope: !5) +!68 = !DILocation(line: 60, column: 20, scope: !5) +!69 = !DILocation(line: 62, column: 20, scope: !5) +!70 = !DILocation(line: 64, column: 20, scope: !5) +!71 = !DILocation(line: 66, column: 20, scope: !5) +!72 = !DILocation(line: 67, column: 20, scope: !5) +!73 = !DILocation(line: 68, column: 20, scope: !5) +!74 = !DILocation(line: 70, column: 20, scope: !5) +!75 = !DILocation(line: 71, column: 20, scope: !5) +!76 = !DILocation(line: 72, column: 20, scope: !5) +!77 = !DILocation(line: 74, column: 20, scope: !5) +!78 = !DILocation(line: 75, column: 20, scope: !5) +!79 = !DILocation(line: 76, column: 20, scope: !5) +!80 = !DILocation(line: 77, column: 20, scope: !5) +!81 = !DILocation(line: 233, column: 15, scope: !82, inlinedAt: !85) +!82 = distinct !DILexicalBlockFile(scope: !84, file: !83, discriminator: 0) +!83 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!84 = distinct !DILexicalBlockFile(scope: !5, file: !83, discriminator: 0) +!85 = !DILocation(line: 243, column: 36, scope: !82, inlinedAt: !86) +!86 = !DILocation(line: 80, column: 59, scope: !82) +!87 = !DILocation(line: 243, column: 36, scope: !84, inlinedAt: !88) +!88 = !DILocation(line: 80, column: 59, scope: !84) +!89 = !DILocation(line: 8, column: 15, scope: !90, inlinedAt: !92) +!90 = distinct !DILexicalBlockFile(scope: !5, file: !91, discriminator: 0) +!91 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!92 = !DILocation(line: 80, column: 45, scope: !90) +!93 = !DILocation(line: 81, column: 20, scope: !5) +!94 = !DILocation(line: 243, column: 36, scope: !84, inlinedAt: !95) +!95 = !DILocation(line: 84, column: 59, scope: !84) +!96 = !DILocation(line: 233, column: 15, scope: !82, inlinedAt: !97) +!97 = !DILocation(line: 243, column: 36, scope: !82, inlinedAt: !98) +!98 = !DILocation(line: 84, column: 59, scope: !82) +!99 = !DILocation(line: 8, column: 15, scope: !90, inlinedAt: !100) +!100 = !DILocation(line: 84, column: 45, scope: !90) +!101 = !DILocation(line: 86, column: 20, scope: !5) +!102 = !DILocation(line: 87, column: 20, scope: !5) +!103 = !DILocation(line: 88, column: 20, scope: !5) +!104 = !DILocation(line: 89, column: 20, scope: !5) +!105 = !DILocation(line: 90, column: 20, scope: !5) +!106 = !DILocation(line: 91, column: 20, scope: !5) +!107 = !DILocation(line: 93, column: 25, scope: !5) +!108 = !DILocation(line: 93, column: 48, scope: !5) +!109 = !DILocation(line: 94, column: 25, scope: !5) +!110 = !DILocation(line: 94, column: 48, scope: !5) +!111 = !DILocation(line: 95, column: 25, scope: !5) +!112 = !DILocation(line: 95, column: 48, scope: !5) +!113 = !DILocation(line: 96, column: 25, scope: !5) +!114 = !DILocation(line: 96, column: 48, scope: !5) +!115 = !DILocation(line: 97, column: 25, scope: !5) +!116 = !DILocation(line: 97, column: 48, scope: !5) +!117 = !DILocation(line: 98, column: 25, scope: !5) +!118 = !DILocation(line: 98, column: 48, scope: !5) +!119 = !DILocation(line: 99, column: 25, scope: !5) +!120 = !DILocation(line: 99, column: 48, scope: !5) +!121 = !DILocation(line: 100, column: 25, scope: !5) +!122 = !DILocation(line: 100, column: 48, scope: !5) +!123 = !DILocation(line: 100, column: 4, scope: !5) diff --git a/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ttgir b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..845b90d47888616d8876066ba0ab5c85fb673044 --- /dev/null +++ b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ttgir @@ -0,0 +1,168 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: !tt.ptr {tt.divisibility = 16 : i32}, %arg9: !tt.ptr {tt.divisibility = 16 : i32}, %arg10: !tt.ptr {tt.divisibility = 16 : i32}, %arg11: !tt.ptr {tt.divisibility = 16 : i32}, %arg12: !tt.ptr {tt.divisibility = 16 : i32}, %arg13: !tt.ptr {tt.divisibility = 16 : i32}, %arg14: !tt.ptr {tt.divisibility = 16 : i32}, %arg15: !tt.ptr {tt.divisibility = 16 : i32}, %arg16: !tt.ptr {tt.divisibility = 16 : i32}, %arg17: !tt.ptr {tt.divisibility = 16 : i32}, %arg18: !tt.ptr {tt.divisibility = 16 : i32}, %arg19: !tt.ptr {tt.divisibility = 16 : i32}, %arg20: !tt.ptr {tt.divisibility = 16 : i32}, %arg21: !tt.ptr {tt.divisibility = 16 : i32}, %arg22: !tt.ptr {tt.divisibility = 16 : i32}, %arg23: !tt.ptr {tt.divisibility = 16 : i32}, %arg24: !tt.ptr {tt.divisibility = 16 : i32}, %arg25: !tt.ptr {tt.divisibility = 16 : i32}, %arg26: !tt.ptr {tt.divisibility = 16 : i32}, %arg27: !tt.ptr {tt.divisibility = 16 : i32}, %arg28: !tt.ptr {tt.divisibility = 16 : i32}, %arg29: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg30: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked> + %cst_1 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_3 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked> + %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked> + %5 = arith.addi %1, %4 : tensor<256xi32, #blocked> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %8 = tt.load %7, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %17 = tt.addptr %arg3, %0 : !tt.ptr, i32 + %18 = tt.splat %17 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %19 = tt.load %18 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %20 = tt.addptr %arg4, %0 : !tt.ptr, i32 + %21 = tt.splat %20 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %22 = tt.load %21 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %23 = tt.splat %arg5 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %24 = tt.addptr %23, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %25 = tt.load %24, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %26 = arith.extf %25 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %27 = tt.addptr %arg6, %0 : !tt.ptr, i32 + %28 = tt.splat %27 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %29 = tt.load %28 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %30 = tt.addptr %arg7, %0 : !tt.ptr, i32 + %31 = tt.splat %30 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %32 = tt.load %31 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %33 = tt.splat %arg8 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %34 = tt.addptr %33, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %35 = tt.load %34, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %36 = arith.extf %35 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %37 = tt.splat %arg9 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %38 = tt.addptr %37, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %39 = tt.load %38, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %40 = arith.extf %39 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %41 = tt.splat %arg10 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %42 = tt.addptr %41, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %43 = tt.load %42, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %44 = arith.extf %43 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %45 = tt.addptr %arg11, %0 : !tt.ptr, i32 + %46 = tt.splat %45 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %47 = tt.load %46 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %48 = tt.addptr %arg12, %0 : !tt.ptr, i32 + %49 = tt.splat %48 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %50 = tt.load %49 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %51 = tt.splat %arg13 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %52 = tt.addptr %51, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %53 = tt.load %52, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %54 = arith.extf %53 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %55 = tt.addptr %arg14, %0 : !tt.ptr, i32 + %56 = tt.splat %55 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %57 = tt.load %56 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %58 = tt.addptr %arg15, %0 : !tt.ptr, i32 + %59 = tt.splat %58 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %60 = tt.load %59 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %61 = tt.splat %arg16 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %62 = tt.addptr %61, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %63 = tt.load %62, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %64 = arith.extf %63 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %65 = tt.addptr %arg17, %0 : !tt.ptr, i32 + %66 = tt.splat %65 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %67 = tt.load %66 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %68 = tt.addptr %arg18, %0 : !tt.ptr, i32 + %69 = tt.splat %68 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %70 = tt.load %69 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %71 = tt.splat %arg19 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %72 = tt.addptr %71, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %73 = tt.load %72, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %74 = tt.splat %arg20 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %75 = tt.addptr %74, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %76 = tt.load %75, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %77 = arith.addf %8, %12 : tensor<256xf32, #blocked> + %78 = arith.addf %77, %16 : tensor<256xf32, #blocked> + %79 = tt.broadcast %19 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %80 = arith.subf %78, %79 : tensor<256xf32, #blocked> + %81 = tt.broadcast %22 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %82 = arith.mulf %80, %81 : tensor<256xf32, #blocked> + %83 = arith.addf %78, %26 : tensor<256xf32, #blocked> + %84 = tt.broadcast %29 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %85 = arith.subf %83, %84 : tensor<256xf32, #blocked> + %86 = tt.broadcast %32 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %87 = arith.mulf %85, %86 : tensor<256xf32, #blocked> + %88 = arith.addf %83, %36 : tensor<256xf32, #blocked> + %89 = arith.addf %88, %40 : tensor<256xf32, #blocked> + %90 = arith.addf %89, %44 : tensor<256xf32, #blocked> + %91 = tt.broadcast %47 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %92 = arith.subf %90, %91 : tensor<256xf32, #blocked> + %93 = tt.broadcast %50 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %94 = arith.mulf %92, %93 : tensor<256xf32, #blocked> + %95 = arith.addf %90, %54 : tensor<256xf32, #blocked> + %96 = tt.broadcast %57 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %97 = arith.subf %95, %96 : tensor<256xf32, #blocked> + %98 = tt.broadcast %60 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %99 = arith.mulf %97, %98 : tensor<256xf32, #blocked> + %100 = arith.addf %95, %64 : tensor<256xf32, #blocked> + %101 = tt.broadcast %67 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %102 = arith.subf %100, %101 : tensor<256xf32, #blocked> + %103 = tt.broadcast %70 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %104 = arith.mulf %102, %103 : tensor<256xf32, #blocked> + %105 = arith.mulf %73, %76 : tensor<256xf32, #blocked> + %106 = arith.select %2, %105, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %107 = "tt.reduce"(%106) <{axis = 0 : i32}> ({ + ^bb0(%arg31: f32, %arg32: f32): + %139 = arith.addf %arg31, %arg32 : f32 + tt.reduce.return %139 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %108 = arith.addf %107, %cst_1 : f32 + %109 = arith.mulf %105, %104 : tensor<256xf32, #blocked> + %110 = arith.select %2, %109, %cst_2 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %111 = "tt.reduce"(%110) <{axis = 0 : i32}> ({ + ^bb0(%arg31: f32, %arg32: f32): + %139 = arith.addf %arg31, %arg32 : f32 + tt.reduce.return %139 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %112 = arith.addf %111, %cst_1 : f32 + %113 = arith.divf %70, %cst_0 : tensor<1xf32, #blocked> + %114 = arith.mulf %105, %cst_3 : tensor<256xf32, #blocked> + %115 = tt.splat %108 : (f32) -> tensor<256xf32, #blocked> + %116 = arith.subf %114, %115 : tensor<256xf32, #blocked> + %117 = tt.splat %112 : (f32) -> tensor<256xf32, #blocked> + %118 = arith.mulf %104, %117 : tensor<256xf32, #blocked> + %119 = arith.subf %116, %118 : tensor<256xf32, #blocked> + %120 = tt.broadcast %113 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %121 = arith.mulf %120, %119 : tensor<256xf32, #blocked> + %122 = tt.splat %arg21 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %123 = tt.addptr %122, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %123, %82, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %124 = tt.splat %arg22 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %125 = tt.addptr %124, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %125, %87, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %126 = tt.splat %arg23 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %127 = tt.addptr %126, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %127, %88, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %128 = tt.splat %arg24 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %129 = tt.addptr %128, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %129, %94, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %130 = tt.splat %arg25 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %131 = tt.addptr %130, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %131, %99, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %132 = tt.splat %arg26 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %133 = tt.addptr %132, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %133, %104, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %134 = tt.splat %arg27 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %135 = tt.addptr %134, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + tt.store %135, %121, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked> + %136 = tt.splat %arg28 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %137 = tt.addptr %136, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %138 = arith.truncf %121 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked> + tt.store %137, %138, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ttir b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..c0589cbb86d1b1969086d2696b335f085f9b6f48 --- /dev/null +++ b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ttir @@ -0,0 +1,167 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: !tt.ptr {tt.divisibility = 16 : i32}, %arg9: !tt.ptr {tt.divisibility = 16 : i32}, %arg10: !tt.ptr {tt.divisibility = 16 : i32}, %arg11: !tt.ptr {tt.divisibility = 16 : i32}, %arg12: !tt.ptr {tt.divisibility = 16 : i32}, %arg13: !tt.ptr {tt.divisibility = 16 : i32}, %arg14: !tt.ptr {tt.divisibility = 16 : i32}, %arg15: !tt.ptr {tt.divisibility = 16 : i32}, %arg16: !tt.ptr {tt.divisibility = 16 : i32}, %arg17: !tt.ptr {tt.divisibility = 16 : i32}, %arg18: !tt.ptr {tt.divisibility = 16 : i32}, %arg19: !tt.ptr {tt.divisibility = 16 : i32}, %arg20: !tt.ptr {tt.divisibility = 16 : i32}, %arg21: !tt.ptr {tt.divisibility = 16 : i32}, %arg22: !tt.ptr {tt.divisibility = 16 : i32}, %arg23: !tt.ptr {tt.divisibility = 16 : i32}, %arg24: !tt.ptr {tt.divisibility = 16 : i32}, %arg25: !tt.ptr {tt.divisibility = 16 : i32}, %arg26: !tt.ptr {tt.divisibility = 16 : i32}, %arg27: !tt.ptr {tt.divisibility = 16 : i32}, %arg28: !tt.ptr {tt.divisibility = 16 : i32}, %arg29: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg30: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant dense<0.000000e+00> : tensor<256xf32> + %cst_2 = arith.constant dense<2.560000e+02> : tensor<256xf32> + %cst_3 = arith.constant dense<2.560000e+02> : tensor<1xf32> + %cst_4 = arith.constant dense<256> : tensor<256xi32> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32> + %5 = arith.addi %1, %4 : tensor<256xi32> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %8 = tt.load %7, %2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32> + %13 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32> + %17 = tt.addptr %arg3, %0 : !tt.ptr, i32 + %18 = tt.splat %17 : (!tt.ptr) -> tensor<1x!tt.ptr> + %19 = tt.load %18 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %20 = tt.addptr %arg4, %0 : !tt.ptr, i32 + %21 = tt.splat %20 : (!tt.ptr) -> tensor<1x!tt.ptr> + %22 = tt.load %21 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %23 = tt.splat %arg5 : (!tt.ptr) -> tensor<256x!tt.ptr> + %24 = tt.addptr %23, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %25 = tt.load %24, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %26 = arith.extf %25 : tensor<256xbf16> to tensor<256xf32> + %27 = tt.addptr %arg6, %0 : !tt.ptr, i32 + %28 = tt.splat %27 : (!tt.ptr) -> tensor<1x!tt.ptr> + %29 = tt.load %28 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %30 = tt.addptr %arg7, %0 : !tt.ptr, i32 + %31 = tt.splat %30 : (!tt.ptr) -> tensor<1x!tt.ptr> + %32 = tt.load %31 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %33 = tt.splat %arg8 : (!tt.ptr) -> tensor<256x!tt.ptr> + %34 = tt.addptr %33, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %35 = tt.load %34, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %36 = arith.extf %35 : tensor<256xbf16> to tensor<256xf32> + %37 = tt.splat %arg9 : (!tt.ptr) -> tensor<256x!tt.ptr> + %38 = tt.addptr %37, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %39 = tt.load %38, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %40 = arith.extf %39 : tensor<256xbf16> to tensor<256xf32> + %41 = tt.splat %arg10 : (!tt.ptr) -> tensor<256x!tt.ptr> + %42 = tt.addptr %41, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %43 = tt.load %42, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %44 = arith.extf %43 : tensor<256xbf16> to tensor<256xf32> + %45 = tt.addptr %arg11, %0 : !tt.ptr, i32 + %46 = tt.splat %45 : (!tt.ptr) -> tensor<1x!tt.ptr> + %47 = tt.load %46 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %48 = tt.addptr %arg12, %0 : !tt.ptr, i32 + %49 = tt.splat %48 : (!tt.ptr) -> tensor<1x!tt.ptr> + %50 = tt.load %49 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %51 = tt.splat %arg13 : (!tt.ptr) -> tensor<256x!tt.ptr> + %52 = tt.addptr %51, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %53 = tt.load %52, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %54 = arith.extf %53 : tensor<256xbf16> to tensor<256xf32> + %55 = tt.addptr %arg14, %0 : !tt.ptr, i32 + %56 = tt.splat %55 : (!tt.ptr) -> tensor<1x!tt.ptr> + %57 = tt.load %56 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %58 = tt.addptr %arg15, %0 : !tt.ptr, i32 + %59 = tt.splat %58 : (!tt.ptr) -> tensor<1x!tt.ptr> + %60 = tt.load %59 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %61 = tt.splat %arg16 : (!tt.ptr) -> tensor<256x!tt.ptr> + %62 = tt.addptr %61, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %63 = tt.load %62, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %64 = arith.extf %63 : tensor<256xbf16> to tensor<256xf32> + %65 = tt.addptr %arg17, %0 : !tt.ptr, i32 + %66 = tt.splat %65 : (!tt.ptr) -> tensor<1x!tt.ptr> + %67 = tt.load %66 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %68 = tt.addptr %arg18, %0 : !tt.ptr, i32 + %69 = tt.splat %68 : (!tt.ptr) -> tensor<1x!tt.ptr> + %70 = tt.load %69 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32> + %71 = tt.splat %arg19 : (!tt.ptr) -> tensor<256x!tt.ptr> + %72 = tt.addptr %71, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %73 = tt.load %72, %2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %74 = tt.splat %arg20 : (!tt.ptr) -> tensor<256x!tt.ptr> + %75 = tt.addptr %74, %1 : tensor<256x!tt.ptr>, tensor<256xi32> + %76 = tt.load %75, %2, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> + %77 = arith.addf %8, %12 : tensor<256xf32> + %78 = arith.addf %77, %16 : tensor<256xf32> + %79 = tt.broadcast %19 : (tensor<1xf32>) -> tensor<256xf32> + %80 = arith.subf %78, %79 : tensor<256xf32> + %81 = tt.broadcast %22 : (tensor<1xf32>) -> tensor<256xf32> + %82 = arith.mulf %80, %81 : tensor<256xf32> + %83 = arith.addf %78, %26 : tensor<256xf32> + %84 = tt.broadcast %29 : (tensor<1xf32>) -> tensor<256xf32> + %85 = arith.subf %83, %84 : tensor<256xf32> + %86 = tt.broadcast %32 : (tensor<1xf32>) -> tensor<256xf32> + %87 = arith.mulf %85, %86 : tensor<256xf32> + %88 = arith.addf %83, %36 : tensor<256xf32> + %89 = arith.addf %88, %40 : tensor<256xf32> + %90 = arith.addf %89, %44 : tensor<256xf32> + %91 = tt.broadcast %47 : (tensor<1xf32>) -> tensor<256xf32> + %92 = arith.subf %90, %91 : tensor<256xf32> + %93 = tt.broadcast %50 : (tensor<1xf32>) -> tensor<256xf32> + %94 = arith.mulf %92, %93 : tensor<256xf32> + %95 = arith.addf %90, %54 : tensor<256xf32> + %96 = tt.broadcast %57 : (tensor<1xf32>) -> tensor<256xf32> + %97 = arith.subf %95, %96 : tensor<256xf32> + %98 = tt.broadcast %60 : (tensor<1xf32>) -> tensor<256xf32> + %99 = arith.mulf %97, %98 : tensor<256xf32> + %100 = arith.addf %95, %64 : tensor<256xf32> + %101 = tt.broadcast %67 : (tensor<1xf32>) -> tensor<256xf32> + %102 = arith.subf %100, %101 : tensor<256xf32> + %103 = tt.broadcast %70 : (tensor<1xf32>) -> tensor<256xf32> + %104 = arith.mulf %102, %103 : tensor<256xf32> + %105 = arith.mulf %73, %76 : tensor<256xf32> + %106 = arith.select %2, %105, %cst_1 : tensor<256xi1>, tensor<256xf32> + %107 = "tt.reduce"(%106) <{axis = 0 : i32}> ({ + ^bb0(%arg31: f32, %arg32: f32): + %139 = arith.addf %arg31, %arg32 : f32 + tt.reduce.return %139 : f32 + }) : (tensor<256xf32>) -> f32 + %108 = arith.addf %107, %cst_0 : f32 + %109 = arith.mulf %105, %104 : tensor<256xf32> + %110 = arith.select %2, %109, %cst_1 : tensor<256xi1>, tensor<256xf32> + %111 = "tt.reduce"(%110) <{axis = 0 : i32}> ({ + ^bb0(%arg31: f32, %arg32: f32): + %139 = arith.addf %arg31, %arg32 : f32 + tt.reduce.return %139 : f32 + }) : (tensor<256xf32>) -> f32 + %112 = arith.addf %111, %cst_0 : f32 + %113 = arith.divf %70, %cst_3 : tensor<1xf32> + %114 = arith.mulf %105, %cst_2 : tensor<256xf32> + %115 = tt.splat %108 : (f32) -> tensor<256xf32> + %116 = arith.subf %114, %115 : tensor<256xf32> + %117 = tt.splat %112 : (f32) -> tensor<256xf32> + %118 = arith.mulf %104, %117 : tensor<256xf32> + %119 = arith.subf %116, %118 : tensor<256xf32> + %120 = tt.broadcast %113 : (tensor<1xf32>) -> tensor<256xf32> + %121 = arith.mulf %120, %119 : tensor<256xf32> + %122 = tt.splat %arg21 : (!tt.ptr) -> tensor<256x!tt.ptr> + %123 = tt.addptr %122, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + tt.store %123, %82, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + %124 = tt.splat %arg22 : (!tt.ptr) -> tensor<256x!tt.ptr> + %125 = tt.addptr %124, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + tt.store %125, %87, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + %126 = tt.splat %arg23 : (!tt.ptr) -> tensor<256x!tt.ptr> + %127 = tt.addptr %126, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + tt.store %127, %88, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + %128 = tt.splat %arg24 : (!tt.ptr) -> tensor<256x!tt.ptr> + %129 = tt.addptr %128, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + tt.store %129, %94, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + %130 = tt.splat %arg25 : (!tt.ptr) -> tensor<256x!tt.ptr> + %131 = tt.addptr %130, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + tt.store %131, %99, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + %132 = tt.splat %arg26 : (!tt.ptr) -> tensor<256x!tt.ptr> + %133 = tt.addptr %132, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + tt.store %133, %104, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + %134 = tt.splat %arg27 : (!tt.ptr) -> tensor<256x!tt.ptr> + %135 = tt.addptr %134, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + tt.store %135, %121, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + %136 = tt.splat %arg28 : (!tt.ptr) -> tensor<256x!tt.ptr> + %137 = tt.addptr %136, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %138 = arith.truncf %121 : tensor<256xf32> to tensor<256xbf16> + tt.store %137, %138, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16> + tt.return + } +}