0-hero
/

gpt2-pos-encoding-experiment-100B

Model card Files Files and versions Community

gpt2-pos-encoding-experiment-100B / .triton /dump /0471aff594c8c8b8715b81c529738739 /triton_.llir

0-hero

Add files using upload-large-folder tool

f9d5f95 verified 5 months ago

raw

history blame contribute delete

30.9 kB

	; ModuleID = 'LLVMDialectModule'
	source_filename = "LLVMDialectModule"

	; Strings handed to @__assertfail by the two bounds checks in the kernel
	; (function name, file name, message). @__assertfail receives them as plain
	; pointers, so each constant is NUL-terminated here; without the terminator a
	; consumer that treats them as C strings would read past the end of the array.
	@assertFunc_1 = internal constant [26 x i8] c"_call_with_frames_removed\00"
	@assertFile_1 = internal constant [39 x i8] c"<frozen importlib._bootstrap_external>\00"
	@assertMessage_1 = internal constant [40 x i8] c"index out of bounds: 0 <= tmp16 < 50257\00"
	@assertFunc_0 = internal constant [26 x i8] c"_call_with_frames_removed\00"
	@assertFile_0 = internal constant [39 x i8] c"<frozen importlib._bootstrap_external>\00"
	@assertMessage_0 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp3 < 50257\00"
	; Externally sized addrspace(3) byte buffer; the kernel stores to it and reads
	; it back across an @llvm.nvvm.barrier0 call.
	@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
	; Key passed to @__nvvm_reflect to choose between the ftz and non-ftz rsqrt.
	@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1

	; Assert-failure hook: (message, file, line, function, i64). The kernel always
	; passes 1 as the last argument.
	declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr

	; Kernel @triton__0d1d2d3d4d5d6de7de (listed as "kernel" with maxntidx = 128
	; in !nvvm.annotations).
	;
	; Arguments, as they are used in the body:
	;   %0      i64 index array; one index is loaded per row.
	;   %1      float table; the (wrapped) index selects a row of 256 floats.
	;   %2      float input addressed as (row srem 512) * 256 + col.
	;   %3      16-bit input addressed as row * 256 + col, widened with cvt.f32.bf16.
	;   %4      float input addressed by col only.
	;   %5      16-bit output (cvt.rn.bf16.f32), addressed like %3.
	;   %6, %7  i32 arguments that are never referenced in the body.
	;
	; Shape of the computation: each CTA covers 64 rows (row = ctaid.x * 64 + %15),
	; each thread covers 4 consecutive columns per iteration and walks the 256
	; columns in steps of 8. The first loop accumulates a running mean and a
	; running sum of squared deviations of x = %2[..] + %1[..] + bf16(%3[..]);
	; the second loop recomputes x and stores
	; (x - mean) * rsqrt(m2 / 256 + eps) * %4[col] as bf16 into %5.
	define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
	; Thread coordinates from tid.x: %10 = lane (tid & 31), %12 = (tid >> 5) & 3,
	; %15 = row inside the 64-row tile, %18 = first column handled (0 or 4).
	%9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
	%10 = and i32 %9, 31, !dbg !10
	%11 = lshr i32 %9, 5, !dbg !10
	%12 = and i32 %11, 3, !dbg !10
	%13 = lshr i32 %10, 1, !dbg !10
	%14 = shl nuw nsw i32 %12, 4, !dbg !10
	%15 = or i32 %14, %13, !dbg !10
	%16 = and i32 %9, 63, !dbg !10
	%17 = shl i32 %9, 2, !dbg !11
	%18 = and i32 %17, 4, !dbg !11
	%19 = and i32 %9, 7, !dbg !11
	%20 = shl nuw nsw i32 %12, 2, !dbg !12
	; %22 = ctaid.x * 64; %23 / %24 are two row numberings of the same tile
	; (%23 uses %15, %24 uses tid & 63).
	%21 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13
	%22 = shl i32 %21, 6, !dbg !14
	%23 = or i32 %22, %15, !dbg !15
	%24 = or i32 %22, %16, !dbg !15
	%25 = sext i32 %23 to i64, !dbg !16
	%26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !16
	%27 = sext i32 %24 to i64, !dbg !16
	%28 = getelementptr i64, ptr addrspace(1) %0, i64 %27, !dbg !16
	; Index loads. %29..%32 all read the same address %26; only %29 is used later.
	; %33 (row %24) feeds the bounds check below.
	%29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
	%30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
	%31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
	%32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
	%33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %28, i1 true) #6, !dbg !17
	; %35 = (row srem 512) * 256 and %36 = row * 256: row base offsets for %2 and %3/%5.
	%34 = srem i32 %23, 512, !dbg !18
	%35 = shl nsw i32 %34, 8, !dbg !19
	%36 = shl i32 %23, 8, !dbg !20
	; Negative indices are wrapped by adding 50257. %41 is true when the wrapped
	; index is still outside [0, 50257) and guards both @__assertfail calls.
	%37 = add i64 %33, 50257, !dbg !21
	%38 = icmp slt i64 %29, 0, !dbg !22
	%39 = icmp slt i64 %33, 0, !dbg !22
	%40 = select i1 %39, i64 %37, i64 %33, !dbg !23
	%41 = icmp ugt i64 %40, 50256, !dbg !24
	; %45 = &%1[wrapped index * 256]; 12865792 = 50257 * 256 is the same wrap
	; applied after the multiply.
	%42 = shl i64 %29, 8, !dbg !25
	%43 = add i64 %42, 12865792, !dbg !25
	%44 = select i1 %38, i64 %43, i64 %42, !dbg !25
	%45 = getelementptr float, ptr addrspace(1) %1, i64 %44
	br label %46, !dbg !12

	; First pass over the columns (%63 = 0, 8, ..., 248). Per-thread accumulators,
	; four of each (one per column handled by the thread):
	;   %47-%50  element counts used as the divisor of the mean update
	;   %51-%54  a second copy of the counts, later written to @global_smem
	;   %55-%58  running sum of squared deviations
	;   %59-%62  running mean
	46: ; preds = %8, %92
	%47 = phi float [ 0.000000e+00, %8 ], [ %116, %92 ]
	%48 = phi float [ 0.000000e+00, %8 ], [ %117, %92 ]
	%49 = phi float [ 0.000000e+00, %8 ], [ %118, %92 ]
	%50 = phi float [ 0.000000e+00, %8 ], [ %119, %92 ]
	%51 = phi float [ 0.000000e+00, %8 ], [ %120, %92 ]
	%52 = phi float [ 0.000000e+00, %8 ], [ %121, %92 ]
	%53 = phi float [ 0.000000e+00, %8 ], [ %122, %92 ]
	%54 = phi float [ 0.000000e+00, %8 ], [ %123, %92 ]
	%55 = phi float [ 0.000000e+00, %8 ], [ %140, %92 ]
	%56 = phi float [ 0.000000e+00, %8 ], [ %141, %92 ]
	%57 = phi float [ 0.000000e+00, %8 ], [ %142, %92 ]
	%58 = phi float [ 0.000000e+00, %8 ], [ %143, %92 ]
	%59 = phi float [ 0.000000e+00, %8 ], [ %128, %92 ]
	%60 = phi float [ 0.000000e+00, %8 ], [ %129, %92 ]
	%61 = phi float [ 0.000000e+00, %8 ], [ %130, %92 ]
	%62 = phi float [ 0.000000e+00, %8 ], [ %131, %92 ]
	%63 = phi i32 [ 0, %8 ], [ %144, %92 ]
	; %64 = column of the first of the four elements handled in this iteration.
	%64 = or i32 %63, %18, !dbg !26
	%65 = add i32 %64, %35, !dbg !27
	%66 = sext i32 %65 to i64, !dbg !28
	%67 = getelementptr float, ptr addrspace(1) %2, i64 %66, !dbg !28
	; Four floats from %2 (all predicates are constant true).
	%68 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %67, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
	%69 = extractvalue { i32, i32, i32, i32 } %68, 0, !dbg !29
	%70 = extractvalue { i32, i32, i32, i32 } %68, 1, !dbg !29
	%71 = extractvalue { i32, i32, i32, i32 } %68, 2, !dbg !29
	%72 = extractvalue { i32, i32, i32, i32 } %68, 3, !dbg !29
	%73 = bitcast i32 %69 to float, !dbg !29
	%74 = bitcast i32 %70 to float, !dbg !29
	%75 = bitcast i32 %71 to float, !dbg !29
	%76 = bitcast i32 %72 to float, !dbg !29
	%77 = add i32 %64, %36, !dbg !30
	%78 = sext i32 %77 to i64, !dbg !31
	%79 = getelementptr i16, ptr addrspace(1) %3, i64 %78, !dbg !31
	; Four 16-bit values from %3, fetched as two 32-bit words and split into halves.
	%80 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %79, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
	%81 = extractvalue { i32, i32 } %80, 0, !dbg !32
	%82 = extractvalue { i32, i32 } %80, 1, !dbg !32
	%83 = trunc i32 %81 to i16, !dbg !32
	%extelt.offset3 = lshr i32 %81, 16, !dbg !32
	%84 = trunc i32 %extelt.offset3 to i16, !dbg !32
	%85 = trunc i32 %82 to i16, !dbg !32
	%extelt.offset4 = lshr i32 %82, 16, !dbg !32
	%86 = trunc i32 %extelt.offset4 to i16, !dbg !32
	%87 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %83) #6, !dbg !33
	%88 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %84) #6, !dbg !33
	%89 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #6, !dbg !33
	%90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %86) #6, !dbg !33
	; Bounds check. %41 is loop-invariant, so this branch goes the same way on
	; every iteration.
	br i1 %41, label %91, label %92, !dbg !34

	91: ; preds = %46
	tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !34
	br label %92, !dbg !34

	92: ; preds = %91, %46
	; Four floats from the selected row of %1.
	%93 = zext nneg i32 %64 to i64, !dbg !35
	%94 = getelementptr float, ptr addrspace(1) %45, i64 %93, !dbg !36
	%95 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %94, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
	%96 = extractvalue { i32, i32, i32, i32 } %95, 0, !dbg !37
	%97 = extractvalue { i32, i32, i32, i32 } %95, 1, !dbg !37
	%98 = extractvalue { i32, i32, i32, i32 } %95, 2, !dbg !37
	%99 = extractvalue { i32, i32, i32, i32 } %95, 3, !dbg !37
	%100 = bitcast i32 %96 to float, !dbg !37
	%101 = bitcast i32 %97 to float, !dbg !37
	%102 = bitcast i32 %98 to float, !dbg !37
	%103 = bitcast i32 %99 to float, !dbg !37
	; x = %2 value + %1 value + widened %3 value (%108-%111).
	%104 = fadd float %73, %100, !dbg !38
	%105 = fadd float %74, %101, !dbg !38
	%106 = fadd float %75, %102, !dbg !38
	%107 = fadd float %76, %103, !dbg !38
	%108 = fadd float %87, %104, !dbg !39
	%109 = fadd float %88, %105, !dbg !39
	%110 = fadd float %89, %106, !dbg !39
	%111 = fadd float %90, %107, !dbg !39
	; Running-statistics update per element:
	;   delta = x - mean; count += 1; mean += delta / count;
	;   m2 += delta * (x - new mean)
	%112 = fsub float %108, %59, !dbg !40
	%113 = fsub float %109, %60, !dbg !40
	%114 = fsub float %110, %61, !dbg !40
	%115 = fsub float %111, %62, !dbg !40
	%116 = fadd float %47, 1.000000e+00, !dbg !44
	%117 = fadd float %48, 1.000000e+00, !dbg !44
	%118 = fadd float %49, 1.000000e+00, !dbg !44
	%119 = fadd float %50, 1.000000e+00, !dbg !44
	%120 = fadd float %51, 1.000000e+00, !dbg !44
	%121 = fadd float %52, 1.000000e+00, !dbg !44
	%122 = fadd float %53, 1.000000e+00, !dbg !44
	%123 = fadd float %54, 1.000000e+00, !dbg !44
	%124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %112, float %116) #6, !dbg !45
	%125 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %113, float %117) #6, !dbg !45
	%126 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %114, float %118) #6, !dbg !45
	%127 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %115, float %119) #6, !dbg !45
	%128 = fadd float %59, %124, !dbg !46
	%129 = fadd float %60, %125, !dbg !46
	%130 = fadd float %61, %126, !dbg !46
	%131 = fadd float %62, %127, !dbg !46
	%132 = fsub float %108, %128, !dbg !47
	%133 = fsub float %109, %129, !dbg !47
	%134 = fsub float %110, %130, !dbg !47
	%135 = fsub float %111, %131, !dbg !47
	%136 = fmul float %112, %132, !dbg !48
	%137 = fmul float %113, %133, !dbg !48
	%138 = fmul float %114, %134, !dbg !48
	%139 = fmul float %115, %135, !dbg !48
	%140 = fadd float %55, %136, !dbg !49
	%141 = fadd float %56, %137, !dbg !49
	%142 = fadd float %57, %138, !dbg !49
	%143 = fadd float %58, %139, !dbg !49
	; Advance by 8 columns; the compare uses the pre-increment value, so the last
	; iteration runs with %63 = 248.
	%144 = add nuw nsw i32 %63, 8, !dbg !12
	%145 = icmp ult i32 %63, 248, !dbg !12
	br i1 %145, label %46, label %146, !dbg !12

	146: ; preds = %92
	; Write the four counts %120-%123 to @global_smem at float offsets
	; %149 + %19 + {0, 192, 384, 576}, where %149 = ((%12 << 2) | (lane >> 3)) * 12.
	%147 = lshr i32 %10, 3, !dbg !12
	%148 = or i32 %20, %147, !dbg !12
	%149 = mul nuw nsw i32 %148, 12, !dbg !12
	%150 = add nuw nsw i32 %149, %19, !dbg !12
	%151 = zext nneg i32 %150 to i64, !dbg !12
	%152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !12
	%153 = insertelement <1 x float> undef, float %120, i64 0, !dbg !12
	store <1 x float> %153, ptr addrspace(3) %152, align 4, !dbg !12
	%154 = or i32 %19, 192, !dbg !12
	%155 = add nuw nsw i32 %154, %149, !dbg !12
	%156 = zext nneg i32 %155 to i64, !dbg !12
	%157 = getelementptr float, ptr addrspace(3) @global_smem, i64 %156, !dbg !12
	%158 = insertelement <1 x float> undef, float %121, i64 0, !dbg !12
	store <1 x float> %158, ptr addrspace(3) %157, align 4, !dbg !12
	%159 = or i32 %19, 384, !dbg !12
	%160 = add nuw nsw i32 %159, %149, !dbg !12
	%161 = zext nneg i32 %160 to i64, !dbg !12
	%162 = getelementptr float, ptr addrspace(3) @global_smem, i64 %161, !dbg !12
	%163 = insertelement <1 x float> undef, float %122, i64 0, !dbg !12
	store <1 x float> %163, ptr addrspace(3) %162, align 4, !dbg !12
	%164 = or i32 %19, 576, !dbg !12
	%165 = add nuw nsw i32 %164, %149, !dbg !12
	%166 = zext nneg i32 %165 to i64, !dbg !12
	%167 = getelementptr float, ptr addrspace(3) @global_smem, i64 %166, !dbg !12
	%168 = insertelement <1 x float> undef, float %123, i64 0, !dbg !12
	store <1 x float> %168, ptr addrspace(3) %167, align 4, !dbg !12
	; Barrier, then read back four consecutive counts at float offset %15 * 12 + %18.
	tail call void @llvm.nvvm.barrier0(), !dbg !12
	%169 = mul nuw nsw i32 %15, 12, !dbg !12
	%170 = add nuw nsw i32 %169, %18, !dbg !12
	%171 = zext nneg i32 %170 to i64, !dbg !12
	%172 = getelementptr float, ptr addrspace(3) @global_smem, i64 %171, !dbg !12
	%173 = load float, ptr addrspace(3) %172, align 16, !dbg !12
	%174 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 1, !dbg !12
	%175 = load float, ptr addrspace(3) %174, align 4, !dbg !12
	%176 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 2, !dbg !12
	%177 = load float, ptr addrspace(3) %176, align 8, !dbg !12
	%178 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 3, !dbg !12
	%179 = load float, ptr addrspace(3) %178, align 4, !dbg !12
	; Fold the thread's four partial (mean, m2, count) triples into one. Each step:
	;   delta = mean_b - mean_a; w = count_a + count_b;
	;   ratio = (w == 0) ? 0 : count_b / w;
	;   mean = mean_a + delta * ratio;
	;   m2 = m2_a + m2_b + delta * delta * count_a * ratio
	; Step 1: element 0 with element 1.
	%180 = fsub float %129, %128, !dbg !50
	%181 = fadd float %173, %175, !dbg !54
	%182 = fcmp oeq float %181, 0.000000e+00, !dbg !55
	%183 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %175, float %181) #6, !dbg !56
	%184 = select i1 %182, float 0.000000e+00, float %183, !dbg !57
	%185 = fmul float %180, %184, !dbg !58
	%186 = fadd float %128, %185, !dbg !59
	%187 = fadd float %140, %141, !dbg !60
	%188 = fmul float %180, %180, !dbg !61
	%189 = fmul float %188, %173, !dbg !62
	%190 = fmul float %189, %184, !dbg !63
	%191 = fadd float %187, %190, !dbg !64
	; Step 2: running result with element 2.
	%192 = fsub float %130, %186, !dbg !50
	%193 = fadd float %177, %181, !dbg !54
	%194 = fcmp oeq float %193, 0.000000e+00, !dbg !55
	%195 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %177, float %193) #6, !dbg !56
	%196 = select i1 %194, float 0.000000e+00, float %195, !dbg !57
	%197 = fmul float %196, %192, !dbg !58
	%198 = fadd float %186, %197, !dbg !59
	%199 = fadd float %142, %191, !dbg !60
	%200 = fmul float %192, %192, !dbg !61
	%201 = fmul float %181, %200, !dbg !62
	%202 = fmul float %196, %201, !dbg !63
	%203 = fadd float %199, %202, !dbg !64
	; Step 3: running result with element 3.
	%204 = fsub float %131, %198, !dbg !50
	%205 = fadd float %179, %193, !dbg !54
	%206 = fcmp oeq float %205, 0.000000e+00, !dbg !55
	%207 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %179, float %205) #6, !dbg !56
	%208 = select i1 %206, float 0.000000e+00, float %207, !dbg !57
	%209 = fmul float %208, %204, !dbg !58
	%210 = fadd float %198, %209, !dbg !59
	%211 = fadd float %143, %203, !dbg !60
	%212 = fmul float %204, %204, !dbg !61
	%213 = fmul float %193, %212, !dbg !62
	%214 = fmul float %208, %213, !dbg !63
	%215 = fadd float %211, %214, !dbg !64
	; Exchange (mean, m2, count) with the lane whose id differs in bit 0
	; (butterfly shuffle, offset 1, full mask) and apply the same combine once more.
	%216 = bitcast float %210 to i32, !dbg !65
	%217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !65
	%218 = bitcast i32 %217 to float, !dbg !65
	%219 = bitcast float %215 to i32, !dbg !65
	%220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !65
	%221 = bitcast i32 %220 to float, !dbg !65
	%222 = bitcast float %205 to i32, !dbg !65
	%223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 1, i32 31), !dbg !65
	%224 = bitcast i32 %223 to float, !dbg !65
	%225 = fsub float %218, %210, !dbg !50
	%226 = fadd float %205, %224, !dbg !54
	%227 = fcmp oeq float %226, 0.000000e+00, !dbg !55
	%228 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %224, float %226) #6, !dbg !56
	%229 = select i1 %227, float 0.000000e+00, float %228, !dbg !57
	%230 = fmul float %229, %225, !dbg !58
	%231 = fadd float %210, %230, !dbg !59
	%232 = fadd float %215, %221, !dbg !60
	%233 = fmul float %225, %225, !dbg !61
	%234 = fmul float %205, %233, !dbg !62
	%235 = fmul float %229, %234, !dbg !63
	%236 = fadd float %232, %235, !dbg !64
	; %231 = row mean, %236 = row sum of squared deviations.
	; %241 = %236 / 256 + 0x3EE4F8B580000000 (about 1e-5). %238-%240 repeat the
	; division of %237 and are not used afterwards.
	%237 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
	%238 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
	%239 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
	%240 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
	%241 = fadd float %237, 0x3EE4F8B580000000, !dbg !68
	br label %242, !dbg !69

	; Second pass over the columns (%243 = 0, 8, ..., 248): reload the three
	; inputs, rebuild x, normalise it and store the result.
	242: ; preds = %146, %__nv_rsqrtf.exit
	%243 = phi i32 [ 0, %146 ], [ %333, %__nv_rsqrtf.exit ]
	%244 = or i32 %243, %18, !dbg !70
	%245 = add i32 %244, %35, !dbg !71
	%246 = sext i32 %245 to i64, !dbg !72
	%247 = getelementptr float, ptr addrspace(1) %2, i64 %246, !dbg !72
	%248 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %247, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
	%249 = extractvalue { i32, i32, i32, i32 } %248, 0, !dbg !73
	%250 = extractvalue { i32, i32, i32, i32 } %248, 1, !dbg !73
	%251 = extractvalue { i32, i32, i32, i32 } %248, 2, !dbg !73
	%252 = extractvalue { i32, i32, i32, i32 } %248, 3, !dbg !73
	%253 = bitcast i32 %249 to float, !dbg !73
	%254 = bitcast i32 %250 to float, !dbg !73
	%255 = bitcast i32 %251 to float, !dbg !73
	%256 = bitcast i32 %252 to float, !dbg !73
	%257 = add i32 %244, %36, !dbg !74
	%258 = sext i32 %257 to i64, !dbg !75
	%259 = getelementptr i16, ptr addrspace(1) %3, i64 %258, !dbg !75
	; Same %3 load as in the first pass, but with the evict_first cache hint.
	%260 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %259, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76
	%261 = extractvalue { i32, i32 } %260, 0, !dbg !76
	%262 = extractvalue { i32, i32 } %260, 1, !dbg !76
	%263 = trunc i32 %261 to i16, !dbg !76
	%extelt.offset = lshr i32 %261, 16, !dbg !76
	%264 = trunc i32 %extelt.offset to i16, !dbg !76
	%265 = trunc i32 %262 to i16, !dbg !76
	%extelt.offset2 = lshr i32 %262, 16, !dbg !76
	%266 = trunc i32 %extelt.offset2 to i16, !dbg !76
	%267 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %263) #6, !dbg !77
	%268 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %264) #6, !dbg !77
	%269 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %265) #6, !dbg !77
	%270 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %266) #6, !dbg !77
	; Four floats from %4, indexed by column only (%278-%281).
	%271 = zext nneg i32 %244 to i64, !dbg !78
	%272 = getelementptr float, ptr addrspace(1) %4, i64 %271, !dbg !78
	%273 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %272, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
	%274 = extractvalue { i32, i32, i32, i32 } %273, 0, !dbg !79
	%275 = extractvalue { i32, i32, i32, i32 } %273, 1, !dbg !79
	%276 = extractvalue { i32, i32, i32, i32 } %273, 2, !dbg !79
	%277 = extractvalue { i32, i32, i32, i32 } %273, 3, !dbg !79
	%278 = bitcast i32 %274 to float, !dbg !79
	%279 = bitcast i32 %275 to float, !dbg !79
	%280 = bitcast i32 %276 to float, !dbg !79
	%281 = bitcast i32 %277 to float, !dbg !79
	; Same loop-invariant bounds check as in the first pass, reported with the
	; *_1 strings.
	br i1 %41, label %282, label %283, !dbg !80

	282: ; preds = %242
	tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !80
	br label %283, !dbg !80

	283: ; preds = %282, %242
	%284 = getelementptr float, ptr addrspace(1) %45, i64 %271, !dbg !81
	%285 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %284, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
	%286 = extractvalue { i32, i32, i32, i32 } %285, 0, !dbg !82
	%287 = extractvalue { i32, i32, i32, i32 } %285, 1, !dbg !82
	%288 = extractvalue { i32, i32, i32, i32 } %285, 2, !dbg !82
	%289 = extractvalue { i32, i32, i32, i32 } %285, 3, !dbg !82
	%290 = bitcast i32 %286 to float, !dbg !82
	%291 = bitcast i32 %287 to float, !dbg !82
	%292 = bitcast i32 %288 to float, !dbg !82
	%293 = bitcast i32 %289 to float, !dbg !82
	; Rebuild x (%298-%301) and subtract the row mean %231.
	%294 = fadd float %253, %290, !dbg !83
	%295 = fadd float %254, %291, !dbg !83
	%296 = fadd float %255, %292, !dbg !83
	%297 = fadd float %256, %293, !dbg !83
	%298 = fadd float %267, %294, !dbg !84
	%299 = fadd float %268, %295, !dbg !84
	%300 = fadd float %269, %296, !dbg !84
	%301 = fadd float %270, %297, !dbg !84
	%302 = fsub float %298, %231, !dbg !85
	%303 = fsub float %299, %231, !dbg !85
	%304 = fsub float %300, %231, !dbg !85
	%305 = fsub float %301, %231, !dbg !85
	; Inlined @__nv_rsqrtf: @__nvvm_reflect("__CUDA_FTZ") selects the ftz or the
	; non-ftz approximate rsqrt of %241.
	%306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
	%.not.i = icmp eq i32 %306, 0, !dbg !86
	br i1 %.not.i, label %309, label %307, !dbg !86

	307: ; preds = %283
	%308 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %241), !dbg !86
	br label %__nv_rsqrtf.exit, !dbg !86

	309: ; preds = %283
	%310 = tail call float @llvm.nvvm.rsqrt.approx.f(float %241), !dbg !86
	br label %__nv_rsqrtf.exit, !dbg !86

	__nv_rsqrtf.exit: ; preds = %307, %309
	%.0.i = phi float [ %308, %307 ], [ %310, %309 ], !dbg !86
	; %311-%313 are further reflect calls whose results are not used.
	%311 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
	%312 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
	%313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
	; out = (x - mean) * rsqrt(%241) * %4[col]
	%314 = fmul float %302, %.0.i, !dbg !87
	%315 = fmul float %303, %.0.i, !dbg !87
	%316 = fmul float %304, %.0.i, !dbg !87
	%317 = fmul float %305, %.0.i, !dbg !87
	%318 = fmul float %314, %278, !dbg !88
	%319 = fmul float %315, %279, !dbg !88
	%320 = fmul float %316, %280, !dbg !88
	%321 = fmul float %317, %281, !dbg !88
	; Convert to bf16, pack two values per 32-bit word and store both words to
	; %5 at the same element offset (%258) that was used for %3.
	%322 = getelementptr i16, ptr addrspace(1) %5, i64 %258, !dbg !89
	%323 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %318) #6, !dbg !90
	%324 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %319) #6, !dbg !90
	%325 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %320) #6, !dbg !90
	%326 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %321) #6, !dbg !90
	%327 = insertelement <2 x i16> undef, i16 %323, i64 0, !dbg !90
	%328 = insertelement <2 x i16> %327, i16 %324, i64 1, !dbg !90
	%329 = bitcast <2 x i16> %328 to i32, !dbg !90
	%330 = insertelement <2 x i16> undef, i16 %325, i64 0, !dbg !90
	%331 = insertelement <2 x i16> %330, i16 %326, i64 1, !dbg !90
	%332 = bitcast <2 x i16> %331 to i32, !dbg !90
	tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %329, i32 %332, ptr addrspace(1) %322, i1 true) #6, !dbg !90
	%333 = add nuw nsw i32 %243, 8, !dbg !69
	%334 = icmp ult i32 %243, 248, !dbg !69
	br i1 %334, label %242, label %335, !dbg !69

	335: ; preds = %__nv_rsqrtf.exit
	ret void, !dbg !91
	}

	; Reads tid.x; the kernel derives all of its per-thread coordinates from it.
	; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
	declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

	; Barrier called between the kernel's stores to @global_smem and the loads
	; that read them back.
	; Function Attrs: convergent nocallback nounwind
	declare void @llvm.nvvm.barrier0() #1

	; Butterfly shuffle; the kernel calls it as (i32 -1, value, i32 1, i32 31) to
	; swap partial results with the neighbouring lane.
	; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
	declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2

	; float __nv_rsqrtf(float x)
	; Returns the approximate reciprocal square root of %x. The result of
	; @__nvvm_reflect on the "__CUDA_FTZ" key (@.str) picks the intrinsic:
	; non-zero -> llvm.nvvm.rsqrt.approx.ftz.f, zero -> llvm.nvvm.rsqrt.approx.f.
	; Function Attrs: alwaysinline nounwind
	define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
	entry:
	  %ftz.flag = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
	  %ftz.enabled = icmp ne i32 %ftz.flag, 0
	  br i1 %ftz.enabled, label %with.ftz, label %without.ftz

	with.ftz: ; preds = %entry
	  %r.ftz = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
	  br label %done

	without.ftz: ; preds = %entry
	  %r.plain = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
	  br label %done

	done: ; preds = %without.ftz, %with.ftz
	  %result = phi float [ %r.ftz, %with.ftz ], [ %r.plain, %without.ftz ]
	  ret float %result
	}

	; Queried with the "__CUDA_FTZ" string (@.str) by @__nv_rsqrtf and by the copy
	; of it inlined into the kernel.
	declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4

	; Approximate reciprocal square root, ftz variant.
	; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
	declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5

	; Approximate reciprocal square root, non-ftz variant.
	; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
	declare float @llvm.nvvm.rsqrt.approx.f(float) #5

	; Attribute groups:
	;   #0 tid.x read, #1 barrier0, #2 shfl.sync.bfly, #3 @__nv_rsqrtf,
	;   #4 @__nvvm_reflect, #5 rsqrt intrinsics, #6 inline-asm / reflect call sites.
	attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
	attributes #1 = { convergent nocallback nounwind }
	attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
	attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
	attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
	attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
	attributes #6 = { nounwind }

	; Module-level named metadata.
	!llvm.module.flags = !{!0, !1}
	!llvm.dbg.cu = !{!2}
	; !4 marks the function as a kernel and !5 sets maxntidx = 128; both nodes are
	; listed twice, exactly as emitted.
	!nvvm.annotations = !{!4, !5, !5, !4}
	!llvm.ident = !{!6}

	!0 = !{i32 2, !"Debug Info Version", i32 3}
	; Module flag "nvvm-reflect-ftz" = 1.
	!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
	!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
	; The file that all !DILocation nodes scoped directly to !7 refer to.
	!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
	!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
	!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 128}
	!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
	; spFlags are joined with a bare '|'; the escaped "\|" that was here is not
	; valid LLVM IR.
	!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
	!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
	!9 = !{}
	; Source locations attached to the kernel's instructions through !dbg.
	; Nodes with scope !7 refer to the subprogram's own file (!3).
	!10 = !DILocation(line: 22, column: 44, scope: !7)
	!11 = !DILocation(line: 24, column: 33, scope: !7)
	!12 = !DILocation(line: 31, column: 36, scope: !7)
	!13 = !DILocation(line: 21, column: 28, scope: !7)
	!14 = !DILocation(line: 21, column: 33, scope: !7)
	!15 = !DILocation(line: 22, column: 23, scope: !7)
	!16 = !DILocation(line: 26, column: 30, scope: !7)
	!17 = !DILocation(line: 26, column: 35, scope: !7)
	!18 = !DILocation(line: 27, column: 18, scope: !7)
	!19 = !DILocation(line: 35, column: 44, scope: !7)
	!20 = !DILocation(line: 36, column: 44, scope: !7)
	!21 = !DILocation(line: 37, column: 22, scope: !7)
	!22 = !DILocation(line: 38, column: 22, scope: !7)
	!23 = !DILocation(line: 39, column: 36, scope: !7)
	!24 = !DILocation(line: 40, column: 40, scope: !7)
	!25 = !DILocation(line: 41, column: 44, scope: !7)
	!26 = !DILocation(line: 32, column: 27, scope: !7)
	!27 = !DILocation(line: 35, column: 40, scope: !7)
	!28 = !DILocation(line: 35, column: 34, scope: !7)
	!29 = !DILocation(line: 35, column: 50, scope: !7)
	!30 = !DILocation(line: 36, column: 40, scope: !7)
	!31 = !DILocation(line: 36, column: 34, scope: !7)
	!32 = !DILocation(line: 36, column: 50, scope: !7)
	!33 = !DILocation(line: 36, column: 101, scope: !7)
	!34 = !DILocation(line: 40, column: 55, scope: !7)
	!35 = !DILocation(line: 41, column: 40, scope: !7)
	!36 = !DILocation(line: 41, column: 34, scope: !7)
	!37 = !DILocation(line: 41, column: 52, scope: !7)
	!38 = !DILocation(line: 42, column: 22, scope: !7)
	!39 = !DILocation(line: 44, column: 22, scope: !7)
	; Locations in triton_helpers.py (!42), recorded as inlined at !43; they are
	; attached to the per-element statistics update in the first loop.
	!40 = !DILocation(line: 96, column: 20, scope: !41, inlinedAt: !43)
	!41 = distinct !DILexicalBlockFile(scope: !7, file: !42, discriminator: 0)
	!42 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
	!43 = !DILocation(line: 47, column: 41, scope: !41)
	!44 = !DILocation(line: 97, column: 26, scope: !41, inlinedAt: !43)
	!45 = !DILocation(line: 98, column: 30, scope: !41, inlinedAt: !43)
	!46 = !DILocation(line: 98, column: 22, scope: !41, inlinedAt: !43)
	!47 = !DILocation(line: 101, column: 30, scope: !41, inlinedAt: !43)
	!48 = !DILocation(line: 101, column: 22, scope: !41, inlinedAt: !43)
	!49 = !DILocation(line: 50, column: 50, scope: !7)
	; Locations in triton_helpers.py recorded as inlined at !52 (itself inlined at
	; !53); they are attached to the combine steps that follow the first loop.
	!50 = !DILocation(line: 108, column: 21, scope: !51, inlinedAt: !52)
	!51 = distinct !DILexicalBlockFile(scope: !41, file: !42, discriminator: 0)
	!52 = !DILocation(line: 120, column: 46, scope: !51, inlinedAt: !53)
	!53 = !DILocation(line: 53, column: 44, scope: !51)
	!54 = !DILocation(line: 109, column: 28, scope: !51, inlinedAt: !52)
	!55 = !DILocation(line: 110, column: 39, scope: !51, inlinedAt: !52)
	!56 = !DILocation(line: 110, column: 60, scope: !51, inlinedAt: !52)
	!57 = !DILocation(line: 110, column: 49, scope: !51, inlinedAt: !52)
	!58 = !DILocation(line: 112, column: 25, scope: !51, inlinedAt: !52)
	!59 = !DILocation(line: 112, column: 17, scope: !51, inlinedAt: !52)
	!60 = !DILocation(line: 113, column: 15, scope: !51, inlinedAt: !52)
	!61 = !DILocation(line: 113, column: 30, scope: !51, inlinedAt: !52)
	!62 = !DILocation(line: 113, column: 38, scope: !51, inlinedAt: !52)
	!63 = !DILocation(line: 113, column: 49, scope: !51, inlinedAt: !52)
	!64 = !DILocation(line: 113, column: 22, scope: !51, inlinedAt: !52)
	; Location used by the shuffle calls and their bitcasts.
	!65 = !DILocation(line: 120, column: 46, scope: !41, inlinedAt: !66)
	!66 = !DILocation(line: 53, column: 44, scope: !41)
	; Remaining kernel-file locations (final division, second loop, return).
	!67 = !DILocation(line: 75, column: 24, scope: !7)
	!68 = !DILocation(line: 77, column: 24, scope: !7)
	!69 = !DILocation(line: 58, column: 36, scope: !7)
	!70 = !DILocation(line: 59, column: 27, scope: !7)
	!71 = !DILocation(line: 62, column: 41, scope: !7)
	!72 = !DILocation(line: 62, column: 35, scope: !7)
	!73 = !DILocation(line: 62, column: 51, scope: !7)
	!74 = !DILocation(line: 63, column: 41, scope: !7)
	!75 = !DILocation(line: 63, column: 35, scope: !7)
	!76 = !DILocation(line: 63, column: 51, scope: !7)
	!77 = !DILocation(line: 63, column: 103, scope: !7)
	!78 = !DILocation(line: 64, column: 35, scope: !7)
	!79 = !DILocation(line: 64, column: 40, scope: !7)
	!80 = !DILocation(line: 68, column: 57, scope: !7)
	!81 = !DILocation(line: 69, column: 35, scope: !7)
	!82 = !DILocation(line: 69, column: 54, scope: !7)
	!83 = !DILocation(line: 70, column: 24, scope: !7)
	!84 = !DILocation(line: 72, column: 24, scope: !7)
	!85 = !DILocation(line: 73, column: 24, scope: !7)
	!86 = !DILocation(line: 78, column: 30, scope: !7)
	!87 = !DILocation(line: 79, column: 24, scope: !7)
	!88 = !DILocation(line: 80, column: 24, scope: !7)
	!89 = !DILocation(line: 82, column: 29, scope: !7)
	!90 = !DILocation(line: 82, column: 52, scope: !7)
	!91 = !DILocation(line: 58, column: 4, scope: !7)