diff --git a/.local/share/jupyter/nbextensions/help_panel/img/handle-v.png b/.local/share/jupyter/nbextensions/help_panel/img/handle-v.png
new file mode 100644
index 0000000000000000000000000000000000000000..0e1a9598eee2f1c793ce7d78f69b4532e74cbdc9
Binary files /dev/null and b/.local/share/jupyter/nbextensions/help_panel/img/handle-v.png differ
diff --git a/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.llir b/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..e251a7ed5f188ce3dee19f6aea64ef31b25b4b51
--- /dev/null
+++ b/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.llir
@@ -0,0 +1,523 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
+@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
+@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp16 < 50257"
+@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
+@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
+@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
+
+define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 {
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %10 = and i32 %9, 31, !dbg !10
+  %11 = lshr i32 %9, 5, !dbg !10
+  %12 = and i32 %11, 3, !dbg !10
+  %13 = lshr i32 %10, 1, !dbg !10
+  %14 = shl nuw nsw i32 %12, 4, !dbg !10
+  %15 = or i32 %14, %13, !dbg !10
+  %16 = and i32 %9, 63, !dbg !10
+  %17 = shl i32 %9, 2, !dbg !11
+  %18 = and i32 %17, 4, !dbg !11
+  %19 = and i32 %9, 7, !dbg !11
+  %20 = shl nuw nsw i32 %12, 2, !dbg !12
+  %21 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13
+  %22 = shl i32 %21, 6, !dbg !14
+  %23 = or i32 %22, %15, !dbg !15
+  %24 = or i32 %22, %16, !dbg !15
+  %25 = sext i32 %23 to i64, !dbg !16
+  %26 = getelementptr i64, ptr addrspace(1) %0, i64 %25, !dbg !16
+  %27 = sext i32 %24 to i64, !dbg !16
+  %28 = getelementptr i64, ptr addrspace(1) %0, i64 %27, !dbg !16
+  %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
+  %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
+  %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
+  %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #6, !dbg !17
+  %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %28, i1 true) #6, !dbg !17
+  %34 = srem i32 %23, 512, !dbg !18
+  %35 = shl nsw i32 %34, 8, !dbg !19
+  %36 = shl i32 %23, 8, !dbg !20
+  %37 = add i64 %33, 50257, !dbg !21
+  %38 = icmp slt i64 %29, 0, !dbg !22
+  %39 = icmp slt i64 %33, 0, !dbg !22
+  %40 = select i1 %39, i64 %37, i64 %33, !dbg !23
+  %41 = icmp ugt i64 %40, 50256, !dbg !24
+  %42 = shl i64 %29, 8, !dbg !25
+  %43 = add i64 %42, 12865792, !dbg !25
+  %44 = select i1 %38, i64 %43, i64 %42, !dbg !25
+  %45 = getelementptr float, ptr addrspace(1) %1, i64 %44
+  br label %46, !dbg !12
+
+46:                                               ; preds = %8, %92
+  %47 = phi float [ 0.000000e+00, %8 ], [ %116, %92 ]
+  %48 = phi float [ 0.000000e+00, %8 ], [ %117, %92 ]
+  %49 = phi float [ 0.000000e+00, %8 ], [ %118, %92 ]
+  %50 = phi float [ 0.000000e+00, %8 ], [ %119, %92 ]
+  %51 = phi float [ 0.000000e+00, %8 ], [ %120, %92 ]
+  %52 = phi float [ 0.000000e+00, %8 ], [ %121, %92 ]
+  %53 = phi float [ 0.000000e+00, %8 ], [ %122, %92 ]
+  %54 = phi float [ 0.000000e+00, %8 ], [ %123, %92 ]
+  %55 = phi float [ 0.000000e+00, %8 ], [ %140, %92 ]
+  %56 = phi float [ 0.000000e+00, %8 ], [ %141, %92 ]
+  %57 = phi float [ 0.000000e+00, %8 ], [ %142, %92 ]
+  %58 = phi float [ 0.000000e+00, %8 ], [ %143, %92 ]
+  %59 = phi float [ 0.000000e+00, %8 ], [ %128, %92 ]
+  %60 = phi float [ 0.000000e+00, %8 ], [ %129, %92 ]
+  %61 = phi float [ 0.000000e+00, %8 ], [ %130, %92 ]
+  %62 = phi float [ 0.000000e+00, %8 ], [ %131, %92 ]
+  %63 = phi i32 [ 0, %8 ], [ %144, %92 ]
+  %64 = or i32 %63, %18, !dbg !26
+  %65 = add i32 %64, %35, !dbg !27
+  %66 = sext i32 %65 to i64, !dbg !28
+  %67 = getelementptr float, ptr addrspace(1) %2, i64 %66, !dbg !28
+  %68 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %67, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
+  %69 = extractvalue { i32, i32, i32, i32 } %68, 0, !dbg !29
+  %70 = extractvalue { i32, i32, i32, i32 } %68, 1, !dbg !29
+  %71 = extractvalue { i32, i32, i32, i32 } %68, 2, !dbg !29
+  %72 = extractvalue { i32, i32, i32, i32 } %68, 3, !dbg !29
+  %73 = bitcast i32 %69 to float, !dbg !29
+  %74 = bitcast i32 %70 to float, !dbg !29
+  %75 = bitcast i32 %71 to float, !dbg !29
+  %76 = bitcast i32 %72 to float, !dbg !29
+  %77 = add i32 %64, %36, !dbg !30
+  %78 = sext i32 %77 to i64, !dbg !31
+  %79 = getelementptr i16, ptr addrspace(1) %3, i64 %78, !dbg !31
+  %80 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_last.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %79, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
+  %81 = extractvalue { i32, i32 } %80, 0, !dbg !32
+  %82 = extractvalue { i32, i32 } %80, 1, !dbg !32
+  %83 = trunc i32 %81 to i16, !dbg !32
+  %extelt.offset3 = lshr i32 %81, 16, !dbg !32
+  %84 = trunc i32 %extelt.offset3 to i16, !dbg !32
+  %85 = trunc i32 %82 to i16, !dbg !32
+  %extelt.offset4 = lshr i32 %82, 16, !dbg !32
+  %86 = trunc i32 %extelt.offset4 to i16, !dbg !32
+  %87 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %83) #6, !dbg !33
+  %88 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %84) #6, !dbg !33
+  %89 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #6, !dbg !33
+  %90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %86) #6, !dbg !33
+  br i1 %41, label %91, label %92, !dbg !34
+
+91:                                               ; preds = %46
+  tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !34
+  br label %92, !dbg !34
+
+92:                                               ; preds = %91, %46
+  %93 = zext nneg i32 %64 to i64, !dbg !35
+  %94 = getelementptr float, ptr addrspace(1) %45, i64 %93, !dbg !36
+  %95 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %94, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !37
+  %96 = extractvalue { i32, i32, i32, i32 } %95, 0, !dbg !37
+  %97 = extractvalue { i32, i32, i32, i32 } %95, 1, !dbg !37
+  %98 = extractvalue { i32, i32, i32, i32 } %95, 2, !dbg !37
+  %99 = extractvalue { i32, i32, i32, i32 } %95, 3, !dbg !37
+  %100 = bitcast i32 %96 to float, !dbg !37
+  %101 = bitcast i32 %97 to float, !dbg !37
+  %102 = bitcast i32 %98 to float, !dbg !37
+  %103 = bitcast i32 %99 to float, !dbg !37
+  %104 = fadd float %73, %100, !dbg !38
+  %105 = fadd float %74, %101, !dbg !38
+  %106 = fadd float %75, %102, !dbg !38
+  %107 = fadd float %76, %103, !dbg !38
+  %108 = fadd float %87, %104, !dbg !39
+  %109 = fadd float %88, %105, !dbg !39
+  %110 = fadd float %89, %106, !dbg !39
+  %111 = fadd float %90, %107, !dbg !39
+  %112 = fsub float %108, %59, !dbg !40
+  %113 = fsub float %109, %60, !dbg !40
+  %114 = fsub float %110, %61, !dbg !40
+  %115 = fsub float %111, %62, !dbg !40
+  %116 = fadd float %47, 1.000000e+00, !dbg !44
+  %117 = fadd float %48, 1.000000e+00, !dbg !44
+  %118 = fadd float %49, 1.000000e+00, !dbg !44
+  %119 = fadd float %50, 1.000000e+00, !dbg !44
+  %120 = fadd float %51, 1.000000e+00, !dbg !44
+  %121 = fadd float %52, 1.000000e+00, !dbg !44
+  %122 = fadd float %53, 1.000000e+00, !dbg !44
+  %123 = fadd float %54, 1.000000e+00, !dbg !44
+  %124 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %112, float %116) #6, !dbg !45
+  %125 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %113, float %117) #6, !dbg !45
+  %126 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %114, float %118) #6, !dbg !45
+  %127 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %115, float %119) #6, !dbg !45
+  %128 = fadd float %59, %124, !dbg !46
+  %129 = fadd float %60, %125, !dbg !46
+  %130 = fadd float %61, %126, !dbg !46
+  %131 = fadd float %62, %127, !dbg !46
+  %132 = fsub float %108, %128, !dbg !47
+  %133 = fsub float %109, %129, !dbg !47
+  %134 = fsub float %110, %130, !dbg !47
+  %135 = fsub float %111, %131, !dbg !47
+  %136 = fmul float %112, %132, !dbg !48
+  %137 = fmul float %113, %133, !dbg !48
+  %138 = fmul float %114, %134, !dbg !48
+  %139 = fmul float %115, %135, !dbg !48
+  %140 = fadd float %55, %136, !dbg !49
+  %141 = fadd float %56, %137, !dbg !49
+  %142 = fadd float %57, %138, !dbg !49
+  %143 = fadd float %58, %139, !dbg !49
+  %144 = add nuw nsw i32 %63, 8, !dbg !12
+  %145 = icmp ult i32 %63, 248, !dbg !12
+  br i1 %145, label %46, label %146, !dbg !12
+
+146:                                              ; preds = %92
+  %147 = lshr i32 %10, 3, !dbg !12
+  %148 = or i32 %20, %147, !dbg !12
+  %149 = mul nuw nsw i32 %148, 12, !dbg !12
+  %150 = add nuw nsw i32 %149, %19, !dbg !12
+  %151 = zext nneg i32 %150 to i64, !dbg !12
+  %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !12
+  %153 = insertelement <1 x float> undef, float %120, i64 0, !dbg !12
+  store <1 x float> %153, ptr addrspace(3) %152, align 4, !dbg !12
+  %154 = or i32 %19, 192, !dbg !12
+  %155 = add nuw nsw i32 %154, %149, !dbg !12
+  %156 = zext nneg i32 %155 to i64, !dbg !12
+  %157 = getelementptr float, ptr addrspace(3) @global_smem, i64 %156, !dbg !12
+  %158 = insertelement <1 x float> undef, float %121, i64 0, !dbg !12
+  store <1 x float> %158, ptr addrspace(3) %157, align 4, !dbg !12
+  %159 = or i32 %19, 384, !dbg !12
+  %160 = add nuw nsw i32 %159, %149, !dbg !12
+  %161 = zext nneg i32 %160 to i64, !dbg !12
+  %162 = getelementptr float, ptr addrspace(3) @global_smem, i64 %161, !dbg !12
+  %163 = insertelement <1 x float> undef, float %122, i64 0, !dbg !12
+  store <1 x float> %163, ptr addrspace(3) %162, align 4, !dbg !12
+  %164 = or i32 %19, 576, !dbg !12
+  %165 = add nuw nsw i32 %164, %149, !dbg !12
+  %166 = zext nneg i32 %165 to i64, !dbg !12
+  %167 = getelementptr float, ptr addrspace(3) @global_smem, i64 %166, !dbg !12
+  %168 = insertelement <1 x float> undef, float %123, i64 0, !dbg !12
+  store <1 x float> %168, ptr addrspace(3) %167, align 4, !dbg !12
+  tail call void @llvm.nvvm.barrier0(), !dbg !12
+  %169 = mul nuw nsw i32 %15, 12, !dbg !12
+  %170 = add nuw nsw i32 %169, %18, !dbg !12
+  %171 = zext nneg i32 %170 to i64, !dbg !12
+  %172 = getelementptr float, ptr addrspace(3) @global_smem, i64 %171, !dbg !12
+  %173 = load float, ptr addrspace(3) %172, align 16, !dbg !12
+  %174 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 1, !dbg !12
+  %175 = load float, ptr addrspace(3) %174, align 4, !dbg !12
+  %176 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 2, !dbg !12
+  %177 = load float, ptr addrspace(3) %176, align 8, !dbg !12
+  %178 = getelementptr inbounds <4 x float>, ptr addrspace(3) %172, i64 0, i64 3, !dbg !12
+  %179 = load float, ptr addrspace(3) %178, align 4, !dbg !12
+  %180 = fsub float %129, %128, !dbg !50
+  %181 = fadd float %173, %175, !dbg !54
+  %182 = fcmp oeq float %181, 0.000000e+00, !dbg !55
+  %183 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %175, float %181) #6, !dbg !56
+  %184 = select i1 %182, float 0.000000e+00, float %183, !dbg !57
+  %185 = fmul float %180, %184, !dbg !58
+  %186 = fadd float %128, %185, !dbg !59
+  %187 = fadd float %140, %141, !dbg !60
+  %188 = fmul float %180, %180, !dbg !61
+  %189 = fmul float %188, %173, !dbg !62
+  %190 = fmul float %189, %184, !dbg !63
+  %191 = fadd float %187, %190, !dbg !64
+  %192 = fsub float %130, %186, !dbg !50
+  %193 = fadd float %177, %181, !dbg !54
+  %194 = fcmp oeq float %193, 0.000000e+00, !dbg !55
+  %195 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %177, float %193) #6, !dbg !56
+  %196 = select i1 %194, float 0.000000e+00, float %195, !dbg !57
+  %197 = fmul float %196, %192, !dbg !58
+  %198 = fadd float %186, %197, !dbg !59
+  %199 = fadd float %142, %191, !dbg !60
+  %200 = fmul float %192, %192, !dbg !61
+  %201 = fmul float %181, %200, !dbg !62
+  %202 = fmul float %196, %201, !dbg !63
+  %203 = fadd float %199, %202, !dbg !64
+  %204 = fsub float %131, %198, !dbg !50
+  %205 = fadd float %179, %193, !dbg !54
+  %206 = fcmp oeq float %205, 0.000000e+00, !dbg !55
+  %207 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %179, float %205) #6, !dbg !56
+  %208 = select i1 %206, float 0.000000e+00, float %207, !dbg !57
+  %209 = fmul float %208, %204, !dbg !58
+  %210 = fadd float %198, %209, !dbg !59
+  %211 = fadd float %143, %203, !dbg !60
+  %212 = fmul float %204, %204, !dbg !61
+  %213 = fmul float %193, %212, !dbg !62
+  %214 = fmul float %208, %213, !dbg !63
+  %215 = fadd float %211, %214, !dbg !64
+  %216 = bitcast float %210 to i32, !dbg !65
+  %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 1, i32 31), !dbg !65
+  %218 = bitcast i32 %217 to float, !dbg !65
+  %219 = bitcast float %215 to i32, !dbg !65
+  %220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 1, i32 31), !dbg !65
+  %221 = bitcast i32 %220 to float, !dbg !65
+  %222 = bitcast float %205 to i32, !dbg !65
+  %223 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %222, i32 1, i32 31), !dbg !65
+  %224 = bitcast i32 %223 to float, !dbg !65
+  %225 = fsub float %218, %210, !dbg !50
+  %226 = fadd float %205, %224, !dbg !54
+  %227 = fcmp oeq float %226, 0.000000e+00, !dbg !55
+  %228 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %224, float %226) #6, !dbg !56
+  %229 = select i1 %227, float 0.000000e+00, float %228, !dbg !57
+  %230 = fmul float %229, %225, !dbg !58
+  %231 = fadd float %210, %230, !dbg !59
+  %232 = fadd float %215, %221, !dbg !60
+  %233 = fmul float %225, %225, !dbg !61
+  %234 = fmul float %205, %233, !dbg !62
+  %235 = fmul float %229, %234, !dbg !63
+  %236 = fadd float %232, %235, !dbg !64
+  %237 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
+  %238 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
+  %239 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
+  %240 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %236, float 2.560000e+02) #6, !dbg !67
+  %241 = fadd float %237, 0x3EE4F8B580000000, !dbg !68
+  br label %242, !dbg !69
+
+242:                                              ; preds = %146, %__nv_rsqrtf.exit
+  %243 = phi i32 [ 0, %146 ], [ %333, %__nv_rsqrtf.exit ]
+  %244 = or i32 %243, %18, !dbg !70
+  %245 = add i32 %244, %35, !dbg !71
+  %246 = sext i32 %245 to i64, !dbg !72
+  %247 = getelementptr float, ptr addrspace(1) %2, i64 %246, !dbg !72
+  %248 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %247, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
+  %249 = extractvalue { i32, i32, i32, i32 } %248, 0, !dbg !73
+  %250 = extractvalue { i32, i32, i32, i32 } %248, 1, !dbg !73
+  %251 = extractvalue { i32, i32, i32, i32 } %248, 2, !dbg !73
+  %252 = extractvalue { i32, i32, i32, i32 } %248, 3, !dbg !73
+  %253 = bitcast i32 %249 to float, !dbg !73
+  %254 = bitcast i32 %250 to float, !dbg !73
+  %255 = bitcast i32 %251 to float, !dbg !73
+  %256 = bitcast i32 %252 to float, !dbg !73
+  %257 = add i32 %244, %36, !dbg !74
+  %258 = sext i32 %257 to i64, !dbg !75
+  %259 = getelementptr i16, ptr addrspace(1) %3, i64 %258, !dbg !75
+  %260 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %259, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !76
+  %261 = extractvalue { i32, i32 } %260, 0, !dbg !76
+  %262 = extractvalue { i32, i32 } %260, 1, !dbg !76
+  %263 = trunc i32 %261 to i16, !dbg !76
+  %extelt.offset = lshr i32 %261, 16, !dbg !76
+  %264 = trunc i32 %extelt.offset to i16, !dbg !76
+  %265 = trunc i32 %262 to i16, !dbg !76
+  %extelt.offset2 = lshr i32 %262, 16, !dbg !76
+  %266 = trunc i32 %extelt.offset2 to i16, !dbg !76
+  %267 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %263) #6, !dbg !77
+  %268 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %264) #6, !dbg !77
+  %269 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %265) #6, !dbg !77
+  %270 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %266) #6, !dbg !77
+  %271 = zext nneg i32 %244 to i64, !dbg !78
+  %272 = getelementptr float, ptr addrspace(1) %4, i64 %271, !dbg !78
+  %273 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %272, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !79
+  %274 = extractvalue { i32, i32, i32, i32 } %273, 0, !dbg !79
+  %275 = extractvalue { i32, i32, i32, i32 } %273, 1, !dbg !79
+  %276 = extractvalue { i32, i32, i32, i32 } %273, 2, !dbg !79
+  %277 = extractvalue { i32, i32, i32, i32 } %273, 3, !dbg !79
+  %278 = bitcast i32 %274 to float, !dbg !79
+  %279 = bitcast i32 %275 to float, !dbg !79
+  %280 = bitcast i32 %276 to float, !dbg !79
+  %281 = bitcast i32 %277 to float, !dbg !79
+  br i1 %41, label %282, label %283, !dbg !80
+
+282:                                              ; preds = %242
+  tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !80
+  br label %283, !dbg !80
+
+283:                                              ; preds = %282, %242
+  %284 = getelementptr float, ptr addrspace(1) %45, i64 %271, !dbg !81
+  %285 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %284, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !82
+  %286 = extractvalue { i32, i32, i32, i32 } %285, 0, !dbg !82
+  %287 = extractvalue { i32, i32, i32, i32 } %285, 1, !dbg !82
+  %288 = extractvalue { i32, i32, i32, i32 } %285, 2, !dbg !82
+  %289 = extractvalue { i32, i32, i32, i32 } %285, 3, !dbg !82
+  %290 = bitcast i32 %286 to float, !dbg !82
+  %291 = bitcast i32 %287 to float, !dbg !82
+  %292 = bitcast i32 %288 to float, !dbg !82
+  %293 = bitcast i32 %289 to float, !dbg !82
+  %294 = fadd float %253, %290, !dbg !83
+  %295 = fadd float %254, %291, !dbg !83
+  %296 = fadd float %255, %292, !dbg !83
+  %297 = fadd float %256, %293, !dbg !83
+  %298 = fadd float %267, %294, !dbg !84
+  %299 = fadd float %268, %295, !dbg !84
+  %300 = fadd float %269, %296, !dbg !84
+  %301 = fadd float %270, %297, !dbg !84
+  %302 = fsub float %298, %231, !dbg !85
+  %303 = fsub float %299, %231, !dbg !85
+  %304 = fsub float %300, %231, !dbg !85
+  %305 = fsub float %301, %231, !dbg !85
+  %306 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
+  %.not.i = icmp eq i32 %306, 0, !dbg !86
+  br i1 %.not.i, label %309, label %307, !dbg !86
+
+307:                                              ; preds = %283
+  %308 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %241), !dbg !86
+  br label %__nv_rsqrtf.exit, !dbg !86
+
+309:                                              ; preds = %283
+  %310 = tail call float @llvm.nvvm.rsqrt.approx.f(float %241), !dbg !86
+  br label %__nv_rsqrtf.exit, !dbg !86
+
+__nv_rsqrtf.exit:                                 ; preds = %307, %309
+  %.0.i = phi float [ %308, %307 ], [ %310, %309 ], !dbg !86
+  %311 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
+  %312 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
+  %313 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !86
+  %314 = fmul float %302, %.0.i, !dbg !87
+  %315 = fmul float %303, %.0.i, !dbg !87
+  %316 = fmul float %304, %.0.i, !dbg !87
+  %317 = fmul float %305, %.0.i, !dbg !87
+  %318 = fmul float %314, %278, !dbg !88
+  %319 = fmul float %315, %279, !dbg !88
+  %320 = fmul float %316, %280, !dbg !88
+  %321 = fmul float %317, %281, !dbg !88
+  %322 = getelementptr i16, ptr addrspace(1) %5, i64 %258, !dbg !89
+  %323 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %318) #6, !dbg !90
+  %324 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %319) #6, !dbg !90
+  %325 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %320) #6, !dbg !90
+  %326 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %321) #6, !dbg !90
+  %327 = insertelement <2 x i16> undef, i16 %323, i64 0, !dbg !90
+  %328 = insertelement <2 x i16> %327, i16 %324, i64 1, !dbg !90
+  %329 = bitcast <2 x i16> %328 to i32, !dbg !90
+  %330 = insertelement <2 x i16> undef, i16 %325, i64 0, !dbg !90
+  %331 = insertelement <2 x i16> %330, i16 %326, i64 1, !dbg !90
+  %332 = bitcast <2 x i16> %331 to i32, !dbg !90
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %329, i32 %332, ptr addrspace(1) %322, i1 true) #6, !dbg !90
+  %333 = add nuw nsw i32 %243, 8, !dbg !69
+  %334 = icmp ult i32 %243, 248, !dbg !69
+  br i1 %334, label %242, label %335, !dbg !69
+
+335:                                              ; preds = %__nv_rsqrtf.exit
+  ret void, !dbg !91
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: alwaysinline nounwind
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
+  %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
+  %.not = icmp eq i32 %1, 0
+  br i1 %.not, label %4, label %2
+
+2:                                                ; preds = %0
+  %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
+  br label %6
+
+4:                                                ; preds = %0
+  %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
+  br label %6
+
+6:                                                ; preds = %4, %2
+  %.0 = phi float [ %3, %2 ], [ %5, %4 ]
+  ret float %.0
+}
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "cpn3lawg65lpi63gv6c6pn4oikhg6qva2h2qjdpxe6qj4lvttwez.py", directory: "/tmp/torchinductor_root/pn")
+!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 128}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 22, column: 44, scope: !7)
+!11 = !DILocation(line: 24, column: 33, scope: !7)
+!12 = !DILocation(line: 31, column: 36, scope: !7)
+!13 = !DILocation(line: 21, column: 28, scope: !7)
+!14 = !DILocation(line: 21, column: 33, scope: !7)
+!15 = !DILocation(line: 22, column: 23, scope: !7)
+!16 = !DILocation(line: 26, column: 30, scope: !7)
+!17 = !DILocation(line: 26, column: 35, scope: !7)
+!18 = !DILocation(line: 27, column: 18, scope: !7)
+!19 = !DILocation(line: 35, column: 44, scope: !7)
+!20 = !DILocation(line: 36, column: 44, scope: !7)
+!21 = !DILocation(line: 37, column: 22, scope: !7)
+!22 = !DILocation(line: 38, column: 22, scope: !7)
+!23 = !DILocation(line: 39, column: 36, scope: !7)
+!24 = !DILocation(line: 40, column: 40, scope: !7)
+!25 = !DILocation(line: 41, column: 44, scope: !7)
+!26 = !DILocation(line: 32, column: 27, scope: !7)
+!27 = !DILocation(line: 35, column: 40, scope: !7)
+!28 = !DILocation(line: 35, column: 34, scope: !7)
+!29 = !DILocation(line: 35, column: 50, scope: !7)
+!30 = !DILocation(line: 36, column: 40, scope: !7)
+!31 = !DILocation(line: 36, column: 34, scope: !7)
+!32 = !DILocation(line: 36, column: 50, scope: !7)
+!33 = !DILocation(line: 36, column: 101, scope: !7)
+!34 = !DILocation(line: 40, column: 55, scope: !7)
+!35 = !DILocation(line: 41, column: 40, scope: !7)
+!36 = !DILocation(line: 41, column: 34, scope: !7)
+!37 = !DILocation(line: 41, column: 52, scope: !7)
+!38 = !DILocation(line: 42, column: 22, scope: !7)
+!39 = !DILocation(line: 44, column: 22, scope: !7)
+!40 = !DILocation(line: 96, column: 20, scope: !41, inlinedAt: !43)
+!41 = distinct !DILexicalBlockFile(scope: !7, file: !42, discriminator: 0)
+!42 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!43 = !DILocation(line: 47, column: 41, scope: !41)
+!44 = !DILocation(line: 97, column: 26, scope: !41, inlinedAt: !43)
+!45 = !DILocation(line: 98, column: 30, scope: !41, inlinedAt: !43)
+!46 = !DILocation(line: 98, column: 22, scope: !41, inlinedAt: !43)
+!47 = !DILocation(line: 101, column: 30, scope: !41, inlinedAt: !43)
+!48 = !DILocation(line: 101, column: 22, scope: !41, inlinedAt: !43)
+!49 = !DILocation(line: 50, column: 50, scope: !7)
+!50 = !DILocation(line: 108, column: 21, scope: !51, inlinedAt: !52)
+!51 = distinct !DILexicalBlockFile(scope: !41, file: !42, discriminator: 0)
+!52 = !DILocation(line: 120, column: 46, scope: !51, inlinedAt: !53)
+!53 = !DILocation(line: 53, column: 44, scope: !51)
+!54 = !DILocation(line: 109, column: 28, scope: !51, inlinedAt: !52)
+!55 = !DILocation(line: 110, column: 39, scope: !51, inlinedAt: !52)
+!56 = !DILocation(line: 110, column: 60, scope: !51, inlinedAt: !52)
+!57 = !DILocation(line: 110, column: 49, scope: !51, inlinedAt: !52)
+!58 = !DILocation(line: 112, column: 25, scope: !51, inlinedAt: !52)
+!59 = !DILocation(line: 112, column: 17, scope: !51, inlinedAt: !52)
+!60 = !DILocation(line: 113, column: 15, scope: !51, inlinedAt: !52)
+!61 = !DILocation(line: 113, column: 30, scope: !51, inlinedAt: !52)
+!62 = !DILocation(line: 113, column: 38, scope: !51, inlinedAt: !52)
+!63 = !DILocation(line: 113, column: 49, scope: !51, inlinedAt: !52)
+!64 = !DILocation(line: 113, column: 22, scope: !51, inlinedAt: !52)
+!65 = !DILocation(line: 120, column: 46, scope: !41, inlinedAt: !66)
+!66 = !DILocation(line: 53, column: 44, scope: !41)
+!67 = !DILocation(line: 75, column: 24, scope: !7)
+!68 = !DILocation(line: 77, column: 24, scope: !7)
+!69 = !DILocation(line: 58, column: 36, scope: !7)
+!70 = !DILocation(line: 59, column: 27, scope: !7)
+!71 = !DILocation(line: 62, column: 41, scope: !7)
+!72 = !DILocation(line: 62, column: 35, scope: !7)
+!73 = !DILocation(line: 62, column: 51, scope: !7)
+!74 = !DILocation(line: 63, column: 41, scope: !7)
+!75 = !DILocation(line: 63, column: 35, scope: !7)
+!76 = !DILocation(line: 63, column: 51, scope: !7)
+!77 = !DILocation(line: 63, column: 103, scope: !7)
+!78 = !DILocation(line: 64, column: 35, scope: !7)
+!79 = !DILocation(line: 64, column: 40, scope: !7)
+!80 = !DILocation(line: 68, column: 57, scope: !7)
+!81 = !DILocation(line: 69, column: 35, scope: !7)
+!82 = !DILocation(line: 69, column: 54, scope: !7)
+!83 = !DILocation(line: 70, column: 24, scope: !7)
+!84 = !DILocation(line: 72, column: 24, scope: !7)
+!85 = !DILocation(line: 73, column: 24, scope: !7)
+!86 = !DILocation(line: 78, column: 30, scope: !7)
+!87 = !DILocation(line: 79, column: 24, scope: !7)
+!88 = !DILocation(line: 80, column: 24, scope: !7)
+!89 = !DILocation(line: 82, column: 29, scope: !7)
+!90 = !DILocation(line: 82, column: 52, scope: !7)
+!91 = !DILocation(line: 58, column: 4, scope: !7)
diff --git a/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttgir b/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..eca3f80d15a8f035a7a5beb8debd0947f148f760
--- /dev/null
+++ b/.triton/dump/0471aff594c8c8b8715b81c529738739/triton_.ttgir
@@ -0,0 +1,165 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> // layout of the 64x8 data tiles: 4 consecutive dim-1 elements per thread
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> // layout of the 64x1 index tensors that feed the bounds-check asserts
+#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> // layout used only for the second copy of the running count (%arg11)
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // Each program handles 64 rows x 256 columns. Pass 1 builds a running mean / sum of squared deviations of x = arg1[idx*256+c] + arg2[(row%512)*256+c] + f32(arg3[row*256+c]), with idx read from arg0[row]; pass 2 stores bf16((x - mean) * rsqrt(M2/256 + 1e-5) * arg4[c]) to arg5[row*256+c]. %arg6/%arg7 are not used in this body. NOTE(review): looks like a fused gather + add + layer-norm - confirm against the generating source.
+    %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked> // modulus applied to the row index (%20)
+    %cst_0 = arith.constant dense<256> : tensor<1x8xi32, #blocked> // column bound for the load/store mask
+    %cst_1 = arith.constant dense<256> : tensor<64x1xi32, #blocked> // row stride in elements
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x8xf32, #blocked>
+    %cst_4 = arith.constant dense<1.000000e+00> : tensor<64x8xf32, #blocked>
+    %cst_5 = arith.constant dense<256> : tensor<64x1xi64, #blocked> // row stride for the i64 gather offset
+    %cst_6 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
+    %cst_7 = arith.constant dense<50257> : tensor<64x1xi64, #blocked> // exclusive upper bound on the gathered index (see assert message)
+    %cst_8 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>
+    %cst_9 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
+    %c0_i32 = arith.constant 0 : i32
+    %c8_i32 = arith.constant 8 : i32
+    %c256_i32 = arith.constant 256 : i32
+    %cst_10 = arith.constant dense<1.000000e+00> : tensor<64x8xf32, #blocked2>
+    %cst_11 = arith.constant 0.000000e+00 : f32
+    %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked2>
+    %cst_13 = arith.constant dense<256> : tensor<1x8xi32, #blocked2>
+    %cst_14 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked> // added to M2/256 before rsqrt (%47)
+    %cst_15 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked> // divisor applied to the reduced M2 (%46)
+    %cst_16 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked>
+    %c64_i32 = arith.constant 64 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c64_i32 : i32 // first row handled by this program
+    %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+    %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
+    %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
+    %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
+    %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
+    %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked> // row = pid*64 + [0,64)
+    %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1> // same row index, #blocked1 layout
+    %10 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+    %11 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
+    %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
+    %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x8xi32, #blocked2>
+    %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
+    %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
+    %16 = tt.addptr %14, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
+    %17 = tt.addptr %15, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
+    %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked> // idx = arg0[row], unmasked
+    %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1> // same indices again, in the layout used by the assert
+    %20 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked> // row % 512
+    %21 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked> // (row % 512) * 256: row offset into arg2
+    %22 = tt.broadcast %21 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
+    %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
+    %24 = arith.muli %8, %cst_1 : tensor<64x1xi32, #blocked> // row * 256: row offset into arg3 and arg5
+    %25 = tt.broadcast %24 : (tensor<64x1xi32, #blocked>) -> tensor<64x8xi32, #blocked>
+    %26 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
+    %27 = arith.addi %18, %cst_7 : tensor<64x1xi64, #blocked>
+    %28 = arith.addi %19, %cst_8 : tensor<64x1xi64, #blocked1>
+    %29 = arith.cmpi slt, %18, %cst_6 : tensor<64x1xi64, #blocked>
+    %30 = arith.cmpi slt, %19, %cst_9 : tensor<64x1xi64, #blocked1>
+    %31 = arith.select %29, %27, %18 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked> // negative idx wraps: idx < 0 ? idx + 50257 : idx
+    %32 = arith.select %30, %28, %19 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
+    %33 = arith.cmpi sge, %32, %cst_9 : tensor<64x1xi64, #blocked1>
+    %34 = arith.cmpi slt, %32, %cst_8 : tensor<64x1xi64, #blocked1>
+    %35 = arith.andi %33, %34 : tensor<64x1xi1, #blocked1> // 0 <= wrapped idx < 50257; consumed by both tt.assert ops
+    %36 = arith.muli %31, %cst_5 : tensor<64x1xi64, #blocked> // idx * 256: row offset into arg1 (kept in i64)
+    %37 = tt.broadcast %36 : (tensor<64x1xi64, #blocked>) -> tensor<64x8xi64, #blocked>
+    %38 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
+    %39:4 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32 iter_args(%arg9 = %cst_2, %arg10 = %cst_2, %arg11 = %cst_12, %arg12 = %cst_2) -> (tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked2>, tensor<64x8xf32, #blocked>)  : i32 { // pass 1 over columns in steps of 8; %arg9 = running mean, %arg10 = running M2, %arg11/%arg12 = running count in #blocked2/#blocked
+      %49 = tt.splat %arg8 : (i32) -> tensor<1x8xi32, #blocked>
+      %50 = tt.splat %arg8 : (i32) -> tensor<1x8xi32, #blocked2>
+      %51 = arith.addi %49, %12 : tensor<1x8xi32, #blocked> // col = loop offset + [0,8)
+      %52 = arith.addi %50, %13 : tensor<1x8xi32, #blocked2>
+      %53 = arith.cmpi slt, %51, %cst_0 : tensor<1x8xi32, #blocked> // col < 256; with these loop bounds col never exceeds 255
+      %54 = arith.cmpi slt, %52, %cst_13 : tensor<1x8xi32, #blocked2>
+      %55 = tt.broadcast %51 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
+      %56 = arith.addi %55, %22 : tensor<64x8xi32, #blocked>
+      %57 = tt.addptr %23, %56 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
+      %58 = tt.broadcast %53 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
+      %59 = tt.broadcast %54 : (tensor<1x8xi1, #blocked2>) -> tensor<64x8xi1, #blocked2>
+      %60 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked> // arg2[(row%512)*256 + col]
+      %61 = arith.addi %55, %25 : tensor<64x8xi32, #blocked>
+      %62 = tt.addptr %26, %61 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
+      %63 = tt.load %62, %58, %cst_16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked> // arg3[row*256 + col]
+      %64 = arith.extf %63 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
+      tt.assert %35, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1> // operand %35 is defined outside the loop, so the same condition is checked every iteration
+      %65 = arith.extsi %51 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked>
+      %66 = tt.broadcast %65 : (tensor<1x8xi64, #blocked>) -> tensor<64x8xi64, #blocked>
+      %67 = arith.addi %66, %37 : tensor<64x8xi64, #blocked>
+      %68 = tt.addptr %38, %67 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi64, #blocked>
+      %69 = tt.load %68, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked> // arg1[idx*256 + col]
+      %70 = arith.addf %69, %60 : tensor<64x8xf32, #blocked>
+      %71 = arith.addf %70, %64 : tensor<64x8xf32, #blocked> // x = sum of the three loaded operands
+      %72 = arith.subf %71, %arg9 : tensor<64x8xf32, #blocked> // delta = x - mean
+      %73 = arith.addf %arg12, %cst_4 : tensor<64x8xf32, #blocked> // count + 1
+      %74 = arith.addf %arg11, %cst_10 : tensor<64x8xf32, #blocked2> // count + 1 (second copy, #blocked2)
+      %75 = arith.divf %72, %73 : tensor<64x8xf32, #blocked>
+      %76 = arith.addf %arg9, %75 : tensor<64x8xf32, #blocked> // mean' = mean + delta / (count + 1)
+      %77 = arith.subf %71, %76 : tensor<64x8xf32, #blocked>
+      %78 = arith.mulf %72, %77 : tensor<64x8xf32, #blocked>
+      %79 = arith.addf %arg10, %78 : tensor<64x8xf32, #blocked> // M2' = M2 + delta * (x - mean')  (Welford-style update)
+      %80 = arith.select %58, %76, %arg9 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked> // masked-off lanes keep their previous state
+      %81 = arith.select %58, %79, %arg10 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
+      %82 = arith.select %58, %73, %arg12 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
+      %83 = arith.select %59, %74, %arg11 : tensor<64x8xi1, #blocked2>, tensor<64x8xf32, #blocked2>
+      scf.yield %80, %81, %83, %82 : tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked2>, tensor<64x8xf32, #blocked>
+    }
+    %40 = triton_gpu.convert_layout %39#2 : (tensor<64x8xf32, #blocked2>) -> tensor<64x8xf32, #blocked> // the #blocked2 count copy is the one fed to the reduction
+    %41:3 = "tt.reduce"(%39#0, %39#1, %40) <{axis = 1 : i32}> ({ // merge the 8 per-column (mean, M2, count) triples of each row
+    ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32): // (%arg8, %arg9, %arg10) = first triple, (%arg11, %arg12, %arg13) = second triple
+      %49 = arith.subf %arg11, %arg8 : f32 // delta = mean_b - mean_a
+      %50 = arith.addf %arg10, %arg13 : f32 // combined count
+      %51 = arith.cmpf oeq, %50, %cst_11 : f32
+      %52 = arith.divf %arg13, %50 : f32
+      %53 = arith.select %51, %cst_11, %52 : f32 // frac = count_b / combined count, forced to 0 when the combined count is 0
+      %54 = arith.mulf %49, %53 : f32
+      %55 = arith.addf %arg8, %54 : f32 // merged mean = mean_a + delta * frac
+      %56 = arith.addf %arg9, %arg12 : f32
+      %57 = arith.mulf %49, %49 : f32
+      %58 = arith.mulf %57, %arg10 : f32
+      %59 = arith.mulf %58, %53 : f32
+      %60 = arith.addf %56, %59 : f32 // merged M2 = M2_a + M2_b + delta^2 * count_a * frac
+      tt.reduce.return %55, %60, %50 : f32, f32, f32
+    }) : (tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>, tensor<64x8xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
+    %42 = tt.expand_dims %41#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked> // per-row mean
+    %43 = tt.expand_dims %41#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked> // per-row M2; the reduced count (%41#2) is not used afterwards
+    %44 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>, #blocked>
+    %45 = tt.broadcast %42 : (tensor<64x1xf32, #blocked>) -> tensor<64x8xf32, #blocked>
+    %46 = arith.divf %43, %cst_15 : tensor<64x1xf32, #blocked> // M2 / 256
+    %47 = arith.addf %46, %cst_14 : tensor<64x1xf32, #blocked> // + 9.99999974E-6
+    %48 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
+    scf.for %arg8 = %c0_i32 to %c256_i32 step %c8_i32  : i32 { // pass 2: reload the three operands, normalise, scale by arg4 and store to arg5
+      %49 = tt.splat %arg8 : (i32) -> tensor<1x8xi32, #blocked>
+      %50 = arith.addi %49, %12 : tensor<1x8xi32, #blocked>
+      %51 = arith.cmpi slt, %50, %cst_0 : tensor<1x8xi32, #blocked>
+      %52 = tt.broadcast %50 : (tensor<1x8xi32, #blocked>) -> tensor<64x8xi32, #blocked>
+      %53 = arith.addi %52, %22 : tensor<64x8xi32, #blocked>
+      %54 = tt.addptr %23, %53 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi32, #blocked>
+      %55 = tt.broadcast %51 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
+      %56 = tt.load %54, %55, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked> // arg2[(row%512)*256 + col]
+      %57 = arith.addi %52, %25 : tensor<64x8xi32, #blocked> // row*256 + col; reused for the arg5 store address (%76)
+      %58 = tt.addptr %26, %57 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
+      %59 = tt.load %58, %55, %cst_16 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked> // arg3[row*256 + col]
+      %60 = arith.extf %59 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
+      %61 = tt.addptr %44, %50 : tensor<1x8x!tt.ptr<f32, 1>, #blocked>, tensor<1x8xi32, #blocked>
+      %62 = tt.load %61, %51, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x8xf32, #blocked> // arg4[col]: one scale value per column
+      tt.assert %35, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1> // same %35 condition as in pass 1; only the message text differs
+      %63 = arith.extsi %50 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked>
+      %64 = tt.broadcast %63 : (tensor<1x8xi64, #blocked>) -> tensor<64x8xi64, #blocked>
+      %65 = arith.addi %64, %37 : tensor<64x8xi64, #blocked>
+      %66 = tt.addptr %38, %65 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi64, #blocked>
+      %67 = tt.load %66, %55, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked> // arg1[idx*256 + col]
+      %68 = arith.addf %67, %56 : tensor<64x8xf32, #blocked>
+      %69 = arith.addf %68, %60 : tensor<64x8xf32, #blocked> // x, recomputed exactly as in pass 1
+      %70 = arith.subf %69, %45 : tensor<64x8xf32, #blocked> // x - mean
+      %71 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked> // rsqrt(M2/256 + eps); operand %47 is defined outside the loop, so this value is the same every iteration
+      %72 = tt.broadcast %71 : (tensor<64x1xf32, #blocked>) -> tensor<64x8xf32, #blocked>
+      %73 = arith.mulf %70, %72 : tensor<64x8xf32, #blocked>
+      %74 = tt.broadcast %62 : (tensor<1x8xf32, #blocked>) -> tensor<64x8xf32, #blocked>
+      %75 = arith.mulf %73, %74 : tensor<64x8xf32, #blocked> // (x - mean) * rsqrt(...) * arg4[col]
+      %76 = tt.addptr %48, %57 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi32, #blocked>
+      %77 = arith.truncf %75 : tensor<64x8xf32, #blocked> to tensor<64x8xbf16, #blocked>
+      tt.store %76, %77, %55 {cache = 1 : i32, evict = 1 : i32} : tensor<64x8xbf16, #blocked> // arg5[row*256 + col] = bf16 result
+    }
+    tt.return
+  }
+}
diff --git a/.triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.cubin b/.triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..ea9cb339d9123dbb3d3fa0b2d3f013d3aedf1025
Binary files /dev/null and b/.triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.cubin differ
diff --git a/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ptx b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..5a9bf79432ab80d3b016071fa9dd9cf0a26fb274
--- /dev/null
+++ b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ptx
@@ -0,0 +1,764 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1de
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
+
+.visible .entry triton__0d1de(	// In-place elementwise kernel: each thread loads 8 bf16 values, computes 0.5 * x * (1 + erf(x * 0.70710677)) in f32 (erf is inlined; same polynomial as __nv_erff later in this file), and stores the 8 bf16 results back to the same address.
+	.param .u64 triton__0d1de_param_0,	// base address of the buffer; used for both the load and the store
+	.param .u32 triton__0d1de_param_1	// never read in this body
+)
+.maxntid 128, 1, 1
+{
+	.reg .pred 	%p<27>;
+	.reg .b16 	%rs<17>;
+	.reg .b32 	%r<67>;
+	.reg .f32 	%f<431>;
+	.reg .b64 	%rd<6>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd3, [triton__0d1de_param_0];
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r14, %tid.x;
+	shl.b32 	%r15, %r14, 3;	// tid * 8: eight elements per thread
+	and.b32  	%r16, %r15, 1016;	// keep the offset inside [0, 1016] in steps of 8
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	shl.b32 	%r17, %r1, 10;	// ctaid * 1024 elements per block
+	.loc	1 21 23
+	or.b32  	%r18, %r17, %r16;	// element index of this thread's first value
+	.loc	1 24 34
+	mul.wide.s32 	%rd4, %r18, 2;	// byte offset: 2 bytes per element
+	add.s64 	%rd5, %rd3, %rd4;
+	mov.pred 	%p1, -1;	// always-true predicate: the load and the store are unmasked
+	.loc	1 24 39
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd5 + 0 ];	// 16 bytes = 8 packed bf16 values
+	cvt.u16.u32 	%rs1, %r2;	// low half of %r2 -> lane 0
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }	// high half of %r2 -> lane 1
+	cvt.u16.u32 	%rs3, %r3;
+	.loc	1 24 48
+	cvt.f32.bf16 %r6, %rs1;
+	mov.b32 	%f1, %r6;	// x0 as f32
+	cvt.f32.bf16 %r7, %rs2;
+	mov.b32 	%f2, %r7;	// x1 as f32
+	.loc	1 29 18
+	mul.f32 	%f9, %f1, 0f3F3504F3;	// lane 0: a = x0 * 0.70710677
+	.loc	1 30 23
+	abs.ftz.f32 	%f17, %f9;
+	setp.ge.f32 	%p2, %f17, 0f3F8060FE;	// |a| >= ~1.00296 selects the large-argument branch
+	mov.f32 	%f365, 0f3789CA3C;	// %f365..%f359: polynomial coefficients for the large-argument branch
+	mov.f32 	%f364, 0fB9F560B9;
+	mov.f32 	%f363, 0f3BAC840B;
+	mov.f32 	%f362, 0fBD0C8162;
+	mov.f32 	%f361, 0f3E1CF906;
+	mov.f32 	%f360, 0f3F6A937E;
+	mov.f32 	%f359, 0f3F20D842;
+	mov.f32 	%f366, %f17;	// polynomial variable t = |a| on the large branch
+	@%p2 bra 	$L__BB0_2;
+	.loc	1 0 23
+	mov.f32 	%f365, 0f38B1E96A;	// small-argument branch: replace the coefficients
+	mov.f32 	%f364, 0fBA574D20;
+	mov.f32 	%f363, 0f3BAAD5EA;
+	mov.f32 	%f362, 0fBCDC1BE7;
+	mov.f32 	%f361, 0f3DE718AF;
+	mov.f32 	%f360, 0fBEC093AC;
+	mov.f32 	%f359, 0f3E0375D3;
+	.loc	1 30 23
+	mul.f32 	%f366, %f9, %f9;	// and use t = a * a
+$L__BB0_2:
+	.loc	1 0 0
+	cvt.f32.bf16 %r8, %rs3;
+	mul.f32 	%f10, %f2, 0f3F3504F3;	// lane 1 argument, computed early
+	.loc	1 30 23
+	setp.ltu.f32 	%p3, %f17, 0f3F8060FE;	// less-than-or-unordered: small or NaN arguments skip the exp fix-up below
+	fma.rn.ftz.f32 	%f135, %f365, %f366, %f364;	// Horner evaluation of the degree-6 polynomial in t
+	fma.rn.ftz.f32 	%f136, %f135, %f366, %f363;
+	fma.rn.ftz.f32 	%f137, %f136, %f366, %f362;
+	fma.rn.ftz.f32 	%f138, %f137, %f366, %f361;
+	fma.rn.ftz.f32 	%f139, %f138, %f366, %f360;
+	fma.rn.ftz.f32 	%f140, %f139, %f366, %f359;
+	neg.f32 	%f141, %f366;
+	selp.f32 	%f142, %f141, %f9, %p2;	// m = -t on the large branch, a on the small branch
+	fma.rn.ftz.f32 	%f367, %f140, %f142, %f142;	// poly * m + m
+	mov.f32 	%f358, 0f3F800000;	// 1.0, reused by every lane's fix-up
+	@%p3 bra 	$L__BB0_4;
+	ex2.approx.ftz.f32 	%f143, %f367;	// large branch only: result = 1 - 2^(value)
+	sub.f32 	%f145, %f358, %f143;
+	mov.b32 	%r19, %f145;
+	mov.b32 	%r20, %f9;
+	and.b32  	%r21, %r20, -2147483648;	// isolate the sign bit of a
+	or.b32  	%r22, %r21, %r19;	// and OR it into the result
+	mov.b32 	%f367, %r22;
+$L__BB0_4:
+	.loc	1 0 0
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
+	mov.b32 	%f3, %r8;
+	.loc	1 30 23
+	abs.ftz.f32 	%f30, %f10;	// lane 1: same sequence as lane 0, result in %f376
+	setp.ge.f32 	%p5, %f30, 0f3F8060FE;
+	mov.f32 	%f374, 0f3789CA3C;
+	mov.f32 	%f373, 0fB9F560B9;
+	mov.f32 	%f372, 0f3BAC840B;
+	mov.f32 	%f371, 0fBD0C8162;
+	mov.f32 	%f370, 0f3E1CF906;
+	mov.f32 	%f369, 0f3F6A937E;
+	mov.f32 	%f368, 0f3F20D842;
+	mov.f32 	%f375, %f30;
+	@%p5 bra 	$L__BB0_6;
+	mul.f32 	%f375, %f10, %f10;
+	mov.f32 	%f374, 0f38B1E96A;
+	mov.f32 	%f373, 0fBA574D20;
+	mov.f32 	%f372, 0f3BAAD5EA;
+	mov.f32 	%f371, 0fBCDC1BE7;
+	mov.f32 	%f370, 0f3DE718AF;
+	mov.f32 	%f369, 0fBEC093AC;
+	mov.f32 	%f368, 0f3E0375D3;
+$L__BB0_6:
+	.loc	1 0 0
+	cvt.f32.bf16 %r9, %rs4;
+	mul.f32 	%f11, %f3, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p6, %f30, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f160, %f374, %f375, %f373;
+	fma.rn.ftz.f32 	%f161, %f160, %f375, %f372;
+	fma.rn.ftz.f32 	%f162, %f161, %f375, %f371;
+	fma.rn.ftz.f32 	%f163, %f162, %f375, %f370;
+	fma.rn.ftz.f32 	%f164, %f163, %f375, %f369;
+	fma.rn.ftz.f32 	%f165, %f164, %f375, %f368;
+	neg.f32 	%f166, %f375;
+	selp.f32 	%f167, %f166, %f10, %p5;
+	fma.rn.ftz.f32 	%f376, %f165, %f167, %f167;
+	@%p6 bra 	$L__BB0_8;
+	ex2.approx.ftz.f32 	%f168, %f376;
+	sub.f32 	%f170, %f358, %f168;
+	mov.b32 	%r23, %f170;
+	mov.b32 	%r24, %f10;
+	and.b32  	%r25, %r24, -2147483648;
+	or.b32  	%r26, %r25, %r23;
+	mov.b32 	%f376, %r26;
+$L__BB0_8:
+	.loc	1 0 0
+	cvt.u16.u32 	%rs5, %r4;
+	mov.b32 	%f4, %r9;
+	.loc	1 30 23
+	abs.ftz.f32 	%f43, %f11;	// lane 2: same sequence, result in %f385
+	setp.ge.f32 	%p8, %f43, 0f3F8060FE;
+	mov.f32 	%f383, 0f3789CA3C;
+	mov.f32 	%f382, 0fB9F560B9;
+	mov.f32 	%f381, 0f3BAC840B;
+	mov.f32 	%f380, 0fBD0C8162;
+	mov.f32 	%f379, 0f3E1CF906;
+	mov.f32 	%f378, 0f3F6A937E;
+	mov.f32 	%f377, 0f3F20D842;
+	mov.f32 	%f384, %f43;
+	@%p8 bra 	$L__BB0_10;
+	mul.f32 	%f384, %f11, %f11;
+	mov.f32 	%f383, 0f38B1E96A;
+	mov.f32 	%f382, 0fBA574D20;
+	mov.f32 	%f381, 0f3BAAD5EA;
+	mov.f32 	%f380, 0fBCDC1BE7;
+	mov.f32 	%f379, 0f3DE718AF;
+	mov.f32 	%f378, 0fBEC093AC;
+	mov.f32 	%f377, 0f3E0375D3;
+$L__BB0_10:
+	.loc	1 0 0
+	cvt.f32.bf16 %r10, %rs5;
+	mul.f32 	%f12, %f4, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p9, %f43, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f185, %f383, %f384, %f382;
+	fma.rn.ftz.f32 	%f186, %f185, %f384, %f381;
+	fma.rn.ftz.f32 	%f187, %f186, %f384, %f380;
+	fma.rn.ftz.f32 	%f188, %f187, %f384, %f379;
+	fma.rn.ftz.f32 	%f189, %f188, %f384, %f378;
+	fma.rn.ftz.f32 	%f190, %f189, %f384, %f377;
+	neg.f32 	%f191, %f384;
+	selp.f32 	%f192, %f191, %f11, %p8;
+	fma.rn.ftz.f32 	%f385, %f190, %f192, %f192;
+	@%p9 bra 	$L__BB0_12;
+	ex2.approx.ftz.f32 	%f193, %f385;
+	sub.f32 	%f195, %f358, %f193;
+	mov.b32 	%r27, %f195;
+	mov.b32 	%r28, %f11;
+	and.b32  	%r29, %r28, -2147483648;
+	or.b32  	%r30, %r29, %r27;
+	mov.b32 	%f385, %r30;
+$L__BB0_12:
+	.loc	1 0 0
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
+	mov.b32 	%f5, %r10;
+	.loc	1 30 23
+	abs.ftz.f32 	%f56, %f12;	// lane 3: same sequence, result in %f394
+	setp.ge.f32 	%p11, %f56, 0f3F8060FE;
+	mov.f32 	%f392, 0f3789CA3C;
+	mov.f32 	%f391, 0fB9F560B9;
+	mov.f32 	%f390, 0f3BAC840B;
+	mov.f32 	%f389, 0fBD0C8162;
+	mov.f32 	%f388, 0f3E1CF906;
+	mov.f32 	%f387, 0f3F6A937E;
+	mov.f32 	%f386, 0f3F20D842;
+	mov.f32 	%f393, %f56;
+	@%p11 bra 	$L__BB0_14;
+	mul.f32 	%f393, %f12, %f12;
+	mov.f32 	%f392, 0f38B1E96A;
+	mov.f32 	%f391, 0fBA574D20;
+	mov.f32 	%f390, 0f3BAAD5EA;
+	mov.f32 	%f389, 0fBCDC1BE7;
+	mov.f32 	%f388, 0f3DE718AF;
+	mov.f32 	%f387, 0fBEC093AC;
+	mov.f32 	%f386, 0f3E0375D3;
+$L__BB0_14:
+	.loc	1 0 0
+	cvt.f32.bf16 %r11, %rs6;
+	mul.f32 	%f13, %f5, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p12, %f56, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f210, %f392, %f393, %f391;
+	fma.rn.ftz.f32 	%f211, %f210, %f393, %f390;
+	fma.rn.ftz.f32 	%f212, %f211, %f393, %f389;
+	fma.rn.ftz.f32 	%f213, %f212, %f393, %f388;
+	fma.rn.ftz.f32 	%f214, %f213, %f393, %f387;
+	fma.rn.ftz.f32 	%f215, %f214, %f393, %f386;
+	neg.f32 	%f216, %f393;
+	selp.f32 	%f217, %f216, %f12, %p11;
+	fma.rn.ftz.f32 	%f394, %f215, %f217, %f217;
+	@%p12 bra 	$L__BB0_16;
+	ex2.approx.ftz.f32 	%f218, %f394;
+	sub.f32 	%f220, %f358, %f218;
+	mov.b32 	%r31, %f220;
+	mov.b32 	%r32, %f12;
+	and.b32  	%r33, %r32, -2147483648;
+	or.b32  	%r34, %r33, %r31;
+	mov.b32 	%f394, %r34;
+$L__BB0_16:
+	.loc	1 0 0
+	cvt.u16.u32 	%rs7, %r5;
+	mov.b32 	%f6, %r11;
+	.loc	1 30 23
+	abs.ftz.f32 	%f69, %f13;	// lane 4: same sequence, result in %f403
+	setp.ge.f32 	%p14, %f69, 0f3F8060FE;
+	mov.f32 	%f401, 0f3789CA3C;
+	mov.f32 	%f400, 0fB9F560B9;
+	mov.f32 	%f399, 0f3BAC840B;
+	mov.f32 	%f398, 0fBD0C8162;
+	mov.f32 	%f397, 0f3E1CF906;
+	mov.f32 	%f396, 0f3F6A937E;
+	mov.f32 	%f395, 0f3F20D842;
+	mov.f32 	%f402, %f69;
+	@%p14 bra 	$L__BB0_18;
+	mul.f32 	%f402, %f13, %f13;
+	mov.f32 	%f401, 0f38B1E96A;
+	mov.f32 	%f400, 0fBA574D20;
+	mov.f32 	%f399, 0f3BAAD5EA;
+	mov.f32 	%f398, 0fBCDC1BE7;
+	mov.f32 	%f397, 0f3DE718AF;
+	mov.f32 	%f396, 0fBEC093AC;
+	mov.f32 	%f395, 0f3E0375D3;
+$L__BB0_18:
+	.loc	1 0 0
+	cvt.f32.bf16 %r12, %rs7;
+	mul.f32 	%f14, %f6, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p15, %f69, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f235, %f401, %f402, %f400;
+	fma.rn.ftz.f32 	%f236, %f235, %f402, %f399;
+	fma.rn.ftz.f32 	%f237, %f236, %f402, %f398;
+	fma.rn.ftz.f32 	%f238, %f237, %f402, %f397;
+	fma.rn.ftz.f32 	%f239, %f238, %f402, %f396;
+	fma.rn.ftz.f32 	%f240, %f239, %f402, %f395;
+	neg.f32 	%f241, %f402;
+	selp.f32 	%f242, %f241, %f13, %p14;
+	fma.rn.ftz.f32 	%f403, %f240, %f242, %f242;
+	@%p15 bra 	$L__BB0_20;
+	ex2.approx.ftz.f32 	%f243, %f403;
+	sub.f32 	%f245, %f358, %f243;
+	mov.b32 	%r35, %f245;
+	mov.b32 	%r36, %f13;
+	and.b32  	%r37, %r36, -2147483648;
+	or.b32  	%r38, %r37, %r35;
+	mov.b32 	%f403, %r38;
+$L__BB0_20:
+	.loc	1 0 0
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
+	mov.b32 	%f7, %r12;
+	.loc	1 30 23
+	abs.ftz.f32 	%f82, %f14;	// lane 5: same sequence, result in %f412
+	setp.ge.f32 	%p17, %f82, 0f3F8060FE;
+	mov.f32 	%f410, 0f3789CA3C;
+	mov.f32 	%f409, 0fB9F560B9;
+	mov.f32 	%f408, 0f3BAC840B;
+	mov.f32 	%f407, 0fBD0C8162;
+	mov.f32 	%f406, 0f3E1CF906;
+	mov.f32 	%f405, 0f3F6A937E;
+	mov.f32 	%f404, 0f3F20D842;
+	mov.f32 	%f411, %f82;
+	@%p17 bra 	$L__BB0_22;
+	mul.f32 	%f411, %f14, %f14;
+	mov.f32 	%f410, 0f38B1E96A;
+	mov.f32 	%f409, 0fBA574D20;
+	mov.f32 	%f408, 0f3BAAD5EA;
+	mov.f32 	%f407, 0fBCDC1BE7;
+	mov.f32 	%f406, 0f3DE718AF;
+	mov.f32 	%f405, 0fBEC093AC;
+	mov.f32 	%f404, 0f3E0375D3;
+$L__BB0_22:
+	.loc	1 0 0
+	cvt.f32.bf16 %r13, %rs8;
+	mul.f32 	%f15, %f7, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p18, %f82, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f260, %f410, %f411, %f409;
+	fma.rn.ftz.f32 	%f261, %f260, %f411, %f408;
+	fma.rn.ftz.f32 	%f262, %f261, %f411, %f407;
+	fma.rn.ftz.f32 	%f263, %f262, %f411, %f406;
+	fma.rn.ftz.f32 	%f264, %f263, %f411, %f405;
+	fma.rn.ftz.f32 	%f265, %f264, %f411, %f404;
+	neg.f32 	%f266, %f411;
+	selp.f32 	%f267, %f266, %f14, %p17;
+	fma.rn.ftz.f32 	%f412, %f265, %f267, %f267;
+	@%p18 bra 	$L__BB0_24;
+	ex2.approx.ftz.f32 	%f268, %f412;
+	sub.f32 	%f270, %f358, %f268;
+	mov.b32 	%r39, %f270;
+	mov.b32 	%r40, %f14;
+	and.b32  	%r41, %r40, -2147483648;
+	or.b32  	%r42, %r41, %r39;
+	mov.b32 	%f412, %r42;
+$L__BB0_24:
+	.loc	1 0 0
+	mov.b32 	%f8, %r13;
+	.loc	1 30 23
+	abs.ftz.f32 	%f95, %f15;	// lane 6: same sequence, result in %f421
+	setp.ge.f32 	%p20, %f95, 0f3F8060FE;
+	mov.f32 	%f419, 0f3789CA3C;
+	mov.f32 	%f418, 0fB9F560B9;
+	mov.f32 	%f417, 0f3BAC840B;
+	mov.f32 	%f416, 0fBD0C8162;
+	mov.f32 	%f415, 0f3E1CF906;
+	mov.f32 	%f414, 0f3F6A937E;
+	mov.f32 	%f413, 0f3F20D842;
+	mov.f32 	%f420, %f95;
+	@%p20 bra 	$L__BB0_26;
+	mul.f32 	%f420, %f15, %f15;
+	mov.f32 	%f419, 0f38B1E96A;
+	mov.f32 	%f418, 0fBA574D20;
+	mov.f32 	%f417, 0f3BAAD5EA;
+	mov.f32 	%f416, 0fBCDC1BE7;
+	mov.f32 	%f415, 0f3DE718AF;
+	mov.f32 	%f414, 0fBEC093AC;
+	mov.f32 	%f413, 0f3E0375D3;
+$L__BB0_26:
+	.loc	1 0 0
+	mul.f32 	%f16, %f8, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p21, %f95, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f285, %f419, %f420, %f418;
+	fma.rn.ftz.f32 	%f286, %f285, %f420, %f417;
+	fma.rn.ftz.f32 	%f287, %f286, %f420, %f416;
+	fma.rn.ftz.f32 	%f288, %f287, %f420, %f415;
+	fma.rn.ftz.f32 	%f289, %f288, %f420, %f414;
+	fma.rn.ftz.f32 	%f290, %f289, %f420, %f413;
+	neg.f32 	%f291, %f420;
+	selp.f32 	%f292, %f291, %f15, %p20;
+	fma.rn.ftz.f32 	%f421, %f290, %f292, %f292;
+	@%p21 bra 	$L__BB0_28;
+	ex2.approx.ftz.f32 	%f293, %f421;
+	sub.f32 	%f295, %f358, %f293;
+	mov.b32 	%r43, %f295;
+	mov.b32 	%r44, %f15;
+	and.b32  	%r45, %r44, -2147483648;
+	or.b32  	%r46, %r45, %r43;
+	mov.b32 	%f421, %r46;
+$L__BB0_28:
+	abs.ftz.f32 	%f108, %f16;	// lane 7: same sequence, result in %f430
+	setp.ge.f32 	%p23, %f108, 0f3F8060FE;
+	mov.f32 	%f428, 0f3789CA3C;
+	mov.f32 	%f427, 0fB9F560B9;
+	mov.f32 	%f426, 0f3BAC840B;
+	mov.f32 	%f425, 0fBD0C8162;
+	mov.f32 	%f424, 0f3E1CF906;
+	mov.f32 	%f423, 0f3F6A937E;
+	mov.f32 	%f422, 0f3F20D842;
+	mov.f32 	%f429, %f108;
+	@%p23 bra 	$L__BB0_30;
+	mul.f32 	%f429, %f16, %f16;
+	mov.f32 	%f428, 0f38B1E96A;
+	mov.f32 	%f427, 0fBA574D20;
+	mov.f32 	%f426, 0f3BAAD5EA;
+	mov.f32 	%f425, 0fBCDC1BE7;
+	mov.f32 	%f424, 0f3DE718AF;
+	mov.f32 	%f423, 0fBEC093AC;
+	mov.f32 	%f422, 0f3E0375D3;
+$L__BB0_30:
+	setp.ltu.f32 	%p24, %f108, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f310, %f428, %f429, %f427;
+	fma.rn.ftz.f32 	%f311, %f310, %f429, %f426;
+	fma.rn.ftz.f32 	%f312, %f311, %f429, %f425;
+	fma.rn.ftz.f32 	%f313, %f312, %f429, %f424;
+	fma.rn.ftz.f32 	%f314, %f313, %f429, %f423;
+	fma.rn.ftz.f32 	%f315, %f314, %f429, %f422;
+	neg.f32 	%f316, %f429;
+	selp.f32 	%f317, %f316, %f16, %p23;
+	fma.rn.ftz.f32 	%f430, %f315, %f317, %f317;
+	@%p24 bra 	$L__BB0_32;
+	ex2.approx.ftz.f32 	%f318, %f430;
+	sub.f32 	%f320, %f358, %f318;
+	mov.b32 	%r47, %f320;
+	mov.b32 	%r48, %f16;
+	and.b32  	%r49, %r48, -2147483648;
+	or.b32  	%r50, %r49, %r47;
+	mov.b32 	%f430, %r50;
+$L__BB0_32:
+	.loc	1 27 18
+	mul.f32 	%f321, %f8, 0f3F000000;	// 0.5 * x for each of the 8 lanes (lane 7 first)
+	mul.f32 	%f322, %f7, 0f3F000000;
+	mul.f32 	%f323, %f6, 0f3F000000;
+	mul.f32 	%f324, %f5, 0f3F000000;
+	mul.f32 	%f325, %f4, 0f3F000000;
+	mul.f32 	%f326, %f3, 0f3F000000;
+	mul.f32 	%f327, %f2, 0f3F000000;
+	mul.f32 	%f328, %f1, 0f3F000000;
+	.loc	1 32 18
+	add.f32 	%f329, %f367, 0f3F800000;	// erf result + 1.0 for each lane (lane 0 first)
+	add.f32 	%f330, %f376, 0f3F800000;
+	add.f32 	%f331, %f385, 0f3F800000;
+	add.f32 	%f332, %f394, 0f3F800000;
+	add.f32 	%f333, %f403, 0f3F800000;
+	add.f32 	%f334, %f412, 0f3F800000;
+	add.f32 	%f335, %f421, 0f3F800000;
+	add.f32 	%f336, %f430, 0f3F800000;
+	.loc	1 33 18
+	mul.f32 	%f337, %f328, %f329;	// (0.5 * x) * (1 + erf(...)), lane 0 .. lane 7
+	mul.f32 	%f338, %f327, %f330;
+	mul.f32 	%f339, %f326, %f331;
+	mul.f32 	%f340, %f325, %f332;
+	mul.f32 	%f341, %f324, %f333;
+	mul.f32 	%f342, %f323, %f334;
+	mul.f32 	%f343, %f322, %f335;
+	mul.f32 	%f344, %f321, %f336;
+	.loc	1 35 40
+	mov.b32 	%r51, %f337;
+	cvt.rn.bf16.f32 %rs9, %r51;	// round each f32 result to bf16
+	mov.b32 	%r52, %f338;
+	cvt.rn.bf16.f32 %rs10, %r52;
+	mov.b32 	%r53, %f339;
+	cvt.rn.bf16.f32 %rs11, %r53;
+	mov.b32 	%r54, %f340;
+	cvt.rn.bf16.f32 %rs12, %r54;
+	mov.b32 	%r55, %f341;
+	cvt.rn.bf16.f32 %rs13, %r55;
+	mov.b32 	%r56, %f342;
+	cvt.rn.bf16.f32 %rs14, %r56;
+	mov.b32 	%r57, %f343;
+	cvt.rn.bf16.f32 %rs15, %r57;
+	mov.b32 	%r58, %f344;
+	cvt.rn.bf16.f32 %rs16, %r58;
+	mov.b32 	%r63, {%rs9, %rs10};	// repack pairs of bf16 into 32-bit words, same order as loaded
+	mov.b32 	%r64, {%rs11, %rs12};
+	mov.b32 	%r65, {%rs13, %rs14};
+	mov.b32 	%r66, {%rs15, %rs16};
+	@%p1 st.global.v4.b32 [ %rd5 + 0 ], { %r63, %r64, %r65, %r66 };	// write back to the address the inputs were read from
+	.loc	1 35 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+
+}
+	// .globl	__nv_erff
+.visible .func  (.param .b32 func_retval0) __nv_erff(	// Single-precision routine taking one f32 and returning one f32; by its libdevice symbol name this is erf(x). Two polynomial branches are selected by |x| against ~1.00296.
+	.param .b32 __nv_erff_param_0	// input x (read as f32)
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<5>;
+	.reg .f32 	%f<49>;
+$L__func_begin1:
+
+	ld.param.f32 	%f14, [__nv_erff_param_0];
+	abs.ftz.f32 	%f1, %f14;
+	setp.ge.f32 	%p1, %f1, 0f3F8060FE;	// |x| >= ~1.00296 selects the large-argument branch
+	mov.f32 	%f46, 0f3789CA3C;	// %f46..%f40: polynomial coefficients for the large-argument branch
+	mov.f32 	%f45, 0fB9F560B9;
+	mov.f32 	%f44, 0f3BAC840B;
+	mov.f32 	%f43, 0fBD0C8162;
+	mov.f32 	%f42, 0f3E1CF906;
+	mov.f32 	%f41, 0f3F6A937E;
+	mov.f32 	%f40, 0f3F20D842;
+	mov.f32 	%f47, %f1;	// polynomial variable t = |x| on the large branch
+	@%p1 bra 	$L__BB1_2;
+	mul.f32 	%f47, %f14, %f14;	// small branch: t = x * x and a different coefficient set
+	mov.f32 	%f46, 0f38B1E96A;
+	mov.f32 	%f45, 0fBA574D20;
+	mov.f32 	%f44, 0f3BAAD5EA;
+	mov.f32 	%f43, 0fBCDC1BE7;
+	mov.f32 	%f42, 0f3DE718AF;
+	mov.f32 	%f41, 0fBEC093AC;
+	mov.f32 	%f40, 0f3E0375D3;
+$L__BB1_2:
+	setp.ltu.f32 	%p2, %f1, 0f3F8060FE;	// less-than-or-unordered: small or NaN inputs skip the exp fix-up
+	fma.rn.ftz.f32 	%f29, %f46, %f47, %f45;	// Horner evaluation of the degree-6 polynomial in t
+	fma.rn.ftz.f32 	%f30, %f29, %f47, %f44;
+	fma.rn.ftz.f32 	%f31, %f30, %f47, %f43;
+	fma.rn.ftz.f32 	%f32, %f31, %f47, %f42;
+	fma.rn.ftz.f32 	%f33, %f32, %f47, %f41;
+	fma.rn.ftz.f32 	%f34, %f33, %f47, %f40;
+	neg.f32 	%f35, %f47;
+	selp.f32 	%f36, %f35, %f14, %p1;	// m = -t on the large branch, x on the small branch
+	fma.rn.ftz.f32 	%f48, %f34, %f36, %f36;	// poly * m + m; this is the final result on the small branch
+	@%p2 bra 	$L__BB1_4;
+	ex2.approx.ftz.f32 	%f37, %f48;	// large branch: result = 1 - 2^(value)
+	mov.f32 	%f38, 0f3F800000;
+	sub.f32 	%f39, %f38, %f37;
+	mov.b32 	%r1, %f39;
+	mov.b32 	%r2, %f14;
+	and.b32  	%r3, %r2, -2147483648;	// isolate the sign bit of x
+	or.b32  	%r4, %r3, %r1;	// and OR it into the result
+	mov.b32 	%f48, %r4;
+$L__BB1_4:
+	st.param.f32 	[func_retval0+0], %f48;
+	ret;
+$L__func_end1:
+
+}
+	.file	1 "/tmp/torchinductor_root/kp/ckphrtdpgsxl7sfarkkzylhv4st3uhmzvg3u6z5excfp6ydybq74.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 172
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 107
+.b8 112
+.b8 104
+.b8 114
+.b8 116
+.b8 100
+.b8 112
+.b8 103
+.b8 115
+.b8 120
+.b8 108
+.b8 55
+.b8 115
+.b8 102
+.b8 97
+.b8 114
+.b8 107
+.b8 107
+.b8 122
+.b8 121
+.b8 108
+.b8 104
+.b8 118
+.b8 52
+.b8 115
+.b8 116
+.b8 51
+.b8 117
+.b8 104
+.b8 109
+.b8 122
+.b8 118
+.b8 103
+.b8 51
+.b8 117
+.b8 54
+.b8 122
+.b8 53
+.b8 101
+.b8 120
+.b8 99
+.b8 102
+.b8 112
+.b8 54
+.b8 121
+.b8 100
+.b8 121
+.b8 98
+.b8 113
+.b8 55
+.b8 52
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 107
+.b8 112
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 176
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 176
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttgir b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..aa5ee41897a9e19b28af66673e146870a146c655
--- /dev/null
+++ b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttgir
@@ -0,0 +1,26 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<1.000000e+00> : tensor<1024xf32, #blocked>
+    %cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32, #blocked>
+    %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32, #blocked>
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
+    %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
+    %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
+    %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
+    %8 = arith.extf %7 : tensor<1024xbf16, #blocked> to tensor<1024xf32, #blocked>
+    %9 = arith.mulf %8, %cst_1 : tensor<1024xf32, #blocked>
+    %10 = arith.mulf %8, %cst_0 : tensor<1024xf32, #blocked>
+    %11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked>
+    %12 = arith.addf %11, %cst : tensor<1024xf32, #blocked>
+    %13 = arith.mulf %9, %12 : tensor<1024xf32, #blocked>
+    %14 = arith.truncf %13 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
+    tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
+    tt.return
+  }
+}
diff --git a/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttir b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..dad0184e8e826d75c09bb514f3cc2b556ca794fe
--- /dev/null
+++ b/.triton/dump/10ca9c2c168e8529fb752d28f80c40a5/triton_.ttir
@@ -0,0 +1,25 @@
+module {
+  tt.func public @triton__0d1de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<1.000000e+00> : tensor<1024xf32>
+    %cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32>
+    %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32>
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
+    %4 = arith.addi %3, %2 : tensor<1024xi32>
+    %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
+    %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
+    %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
+    %9 = arith.mulf %8, %cst_1 : tensor<1024xf32>
+    %10 = arith.mulf %8, %cst_0 : tensor<1024xf32>
+    %11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32>
+    %12 = arith.addf %11, %cst : tensor<1024xf32>
+    %13 = arith.mulf %9, %12 : tensor<1024xf32>
+    %14 = arith.truncf %13 : tensor<1024xf32> to tensor<1024xbf16>
+    tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
+    tt.return
+  }
+}
diff --git a/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.cubin b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..1f47dc5287c950c238eedd992273255c68891108
Binary files /dev/null and b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.cubin differ
diff --git a/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttgir b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..fd110b3da99f6cd0f9c7f8441eaa1182137be8b3
--- /dev/null
+++ b/.triton/dump/127102ca642eeddf1a14ce3904d9fabf/triton_.ttgir
@@ -0,0 +1,60 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<256> : tensor<32x1xi64, #blocked>
+    %cst_0 = arith.constant dense<0> : tensor<32x1xi64, #blocked>
+    %cst_1 = arith.constant dense<512> : tensor<32x1xi64, #blocked>
+    %cst_2 = arith.constant dense<256> : tensor<32x1xi32, #blocked>
+    %cst_3 = arith.constant dense<131072> : tensor<1x128xi32, #blocked1>
+    %cst_4 = arith.constant dense<120> : tensor<1x128xi32, #blocked1>
+    %cst_5 = arith.constant dense<0.000000e+00> : tensor<32x128xf32, #blocked1>
+    %cst_6 = arith.constant dense<true> : tensor<32x1xi1, #blocked>
+    %c32_i32 = arith.constant 32 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c32_i32 : i32
+    %2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+    %3 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<32x1xi32, #blocked1>
+    %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<32x1xi32, #blocked>
+    %6 = tt.splat %1 : (i32) -> tensor<32x1xi32, #blocked1>
+    %7 = tt.splat %1 : (i32) -> tensor<32x1xi32, #blocked>
+    %8 = arith.addi %6, %4 : tensor<32x1xi32, #blocked1>
+    %9 = arith.addi %7, %5 : tensor<32x1xi32, #blocked>
+    %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
+    %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x128xi32, #blocked1>
+    %12 = arith.cmpi slt, %11, %cst_4 : tensor<1x128xi32, #blocked1>
+    %13 = arith.muli %11, %cst_3 : tensor<1x128xi32, #blocked1>
+    %14 = tt.broadcast %8 : (tensor<32x1xi32, #blocked1>) -> tensor<32x128xi32, #blocked1>
+    %15 = tt.broadcast %13 : (tensor<1x128xi32, #blocked1>) -> tensor<32x128xi32, #blocked1>
+    %16 = arith.addi %14, %15 : tensor<32x128xi32, #blocked1>
+    %17 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<32x128x!tt.ptr<f32, 1>, #blocked1>
+    %18 = tt.addptr %17, %16 : tensor<32x128x!tt.ptr<f32, 1>, #blocked1>, tensor<32x128xi32, #blocked1>
+    %19 = tt.broadcast %12 : (tensor<1x128xi1, #blocked1>) -> tensor<32x128xi1, #blocked1>
+    %20 = tt.load %18, %19, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<32x128xf32, #blocked1>
+    %21 = arith.addf %20, %cst_5 : tensor<32x128xf32, #blocked1>
+    %22 = arith.select %19, %21, %cst_5 : tensor<32x128xi1, #blocked1>, tensor<32x128xf32, #blocked1>
+    %23 = "tt.reduce"(%22) <{axis = 1 : i32}> ({
+    ^bb0(%arg5: f32, %arg6: f32):
+      %40 = arith.addf %arg5, %arg6 : f32
+      tt.reduce.return %40 : f32
+    }) : (tensor<32x128xf32, #blocked1>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+    %24 = triton_gpu.convert_layout %23 : (tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<32x1xf32, #blocked>
+    %26 = arith.divsi %9, %cst_2 : tensor<32x1xi32, #blocked>
+    %27 = arith.remsi %9, %cst_2 : tensor<32x1xi32, #blocked>
+    %28 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<32x1x!tt.ptr<i64, 1>, #blocked>
+    %29 = tt.addptr %28, %26 : tensor<32x1x!tt.ptr<i64, 1>, #blocked>, tensor<32x1xi32, #blocked>
+    %30 = tt.load %29 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<32x1xi64, #blocked>
+    %31 = arith.addi %30, %cst_1 : tensor<32x1xi64, #blocked>
+    %32 = arith.cmpi slt, %30, %cst_0 : tensor<32x1xi64, #blocked>
+    %33 = arith.select %32, %31, %30 : tensor<32x1xi1, #blocked>, tensor<32x1xi64, #blocked>
+    %34 = arith.muli %33, %cst : tensor<32x1xi64, #blocked>
+    %35 = arith.extsi %27 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked>
+    %36 = arith.addi %35, %34 : tensor<32x1xi64, #blocked>
+    %37 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<32x1x!tt.ptr<f32, 1>, #blocked>
+    %38 = tt.addptr %37, %36 : tensor<32x1x!tt.ptr<f32, 1>, #blocked>, tensor<32x1xi64, #blocked>
+    %39 = "tt.atomic_rmw"(%38, %25, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<32x1x!tt.ptr<f32, 1>, #blocked>, tensor<32x1xf32, #blocked>, tensor<32x1xi1, #blocked>) -> tensor<32x1xf32, #blocked>
+    tt.return
+  }
+}
diff --git a/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.llir b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..726b9b50b7b7c50d5dcd7fbac48c6d8ea6d50a08
--- /dev/null
+++ b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.llir
@@ -0,0 +1,184 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
+@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
+@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp7 < 50257"
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
+
+declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
+
+define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2) local_unnamed_addr !dbg !7 {
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %5 = and i32 %4, 127, !dbg !10
+  %6 = shl nuw nsw i32 %5, 1, !dbg !10
+  %7 = or i32 %6, 1, !dbg !10
+  %8 = or i32 %6, 256, !dbg !10
+  %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !11
+  %10 = sext i32 %9 to i64, !dbg !12
+  %11 = shl nsw i64 %10, 9, !dbg !13
+  %12 = zext nneg i32 %6 to i64
+  %13 = zext nneg i32 %8 to i64
+  %14 = or i64 %11, %12, !dbg !14
+  %15 = or i64 %11, %13, !dbg !14
+  %16 = getelementptr i64, ptr addrspace(1) %0, i64 %14, !dbg !15
+  %17 = getelementptr i64, ptr addrspace(1) %0, i64 %15, !dbg !15
+  %18 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %16, i1 true) #2, !dbg !16
+  %19 = extractvalue { i64, i64 } %18, 0, !dbg !16
+  %20 = extractvalue { i64, i64 } %18, 1, !dbg !16
+  %21 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %17, i1 true) #2, !dbg !16
+  %22 = extractvalue { i64, i64 } %21, 0, !dbg !16
+  %23 = extractvalue { i64, i64 } %21, 1, !dbg !16
+  %24 = insertelement <4 x i64> poison, i64 %23, i64 0, !dbg !17
+  %25 = insertelement <4 x i64> %24, i64 %22, i64 1, !dbg !17
+  %26 = insertelement <4 x i64> %25, i64 %20, i64 2, !dbg !17
+  %27 = insertelement <4 x i64> %26, i64 %19, i64 3, !dbg !17
+  %28 = icmp eq <4 x i64> %27, <i64 -1, i64 -1, i64 -1, i64 -1>, !dbg !17
+  %29 = select <4 x i1> %28, <4 x i64> zeroinitializer, <4 x i64> %27, !dbg !18
+  %30 = add <4 x i64> %29, <i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !19
+  %31 = icmp slt <4 x i64> %29, zeroinitializer, !dbg !20
+  %32 = select <4 x i1> %31, <4 x i64> %30, <4 x i64> %29, !dbg !21
+  %33 = icmp ult <4 x i64> %32, <i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !22
+  %34 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %12, !dbg !22
+  %35 = extractelement <4 x i1> %33, i64 3, !dbg !22
+  %36 = zext i1 %35 to i8, !dbg !22
+  %37 = insertelement <1 x i8> undef, i8 %36, i64 0, !dbg !22
+  store <1 x i8> %37, ptr addrspace(3) %34, align 1, !dbg !22
+  %38 = zext nneg i32 %7 to i64, !dbg !22
+  %39 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %38, !dbg !22
+  %40 = extractelement <4 x i1> %33, i64 2, !dbg !22
+  %41 = zext i1 %40 to i8, !dbg !22
+  %42 = insertelement <1 x i8> undef, i8 %41, i64 0, !dbg !22
+  store <1 x i8> %42, ptr addrspace(3) %39, align 1, !dbg !22
+  tail call void @llvm.nvvm.barrier0(), !dbg !22
+  %43 = zext nneg i32 %5 to i64, !dbg !22
+  %44 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %43, !dbg !22
+  %45 = load i8, ptr addrspace(3) %44, align 1, !dbg !22
+  %46 = or i32 %5, 128, !dbg !22
+  %47 = zext nneg i32 %46 to i64, !dbg !22
+  %48 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %47, !dbg !22
+  %49 = load i8, ptr addrspace(3) %48, align 1, !dbg !22
+  tail call void @llvm.nvvm.barrier0(), !dbg !22
+  %50 = extractelement <4 x i1> %33, i64 1, !dbg !22
+  %51 = zext i1 %50 to i8, !dbg !22
+  %52 = insertelement <1 x i8> undef, i8 %51, i64 0, !dbg !22
+  store <1 x i8> %52, ptr addrspace(3) %34, align 1, !dbg !22
+  %53 = extractelement <4 x i1> %33, i64 0, !dbg !22
+  %54 = zext i1 %53 to i8, !dbg !22
+  %55 = insertelement <1 x i8> undef, i8 %54, i64 0, !dbg !22
+  store <1 x i8> %55, ptr addrspace(3) %39, align 1, !dbg !22
+  tail call void @llvm.nvvm.barrier0(), !dbg !22
+  %56 = load i8, ptr addrspace(3) %44, align 1, !dbg !22
+  %57 = load i8, ptr addrspace(3) %48, align 1, !dbg !22
+  %58 = insertelement <4 x i8> poison, i8 %49, i64 0, !dbg !22
+  %59 = insertelement <4 x i8> %58, i8 %45, i64 1, !dbg !22
+  %60 = insertelement <4 x i8> %59, i8 %56, i64 2, !dbg !22
+  %61 = insertelement <4 x i8> %60, i8 %57, i64 3, !dbg !22
+  %62 = icmp eq <4 x i8> %61, zeroinitializer, !dbg !22
+  %63 = bitcast <4 x i1> %62 to i4, !dbg !23
+  %.not = icmp eq i4 %63, 0, !dbg !23
+  br i1 %.not, label %65, label %64, !dbg !23
+
+64:                                               ; preds = %3
+  tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !23
+  br label %65, !dbg !23
+
+65:                                               ; preds = %64, %3
+  %66 = or i32 %6, 257, !dbg !10
+  %67 = zext nneg i32 %66 to i64
+  %68 = or i64 %11, %67, !dbg !14
+  %69 = or i64 %11, %38, !dbg !14
+  %70 = mul nsw i64 %14, 50257, !dbg !24
+  %71 = mul nsw i64 %69, 50257, !dbg !24
+  %72 = mul nsw i64 %15, 50257, !dbg !24
+  %73 = mul nsw i64 %68, 50257, !dbg !24
+  %74 = extractelement <4 x i64> %32, i64 3, !dbg !25
+  %75 = getelementptr float, ptr addrspace(1) %1, i64 %74, !dbg !25
+  %76 = getelementptr float, ptr addrspace(1) %75, i64 %70, !dbg !25
+  %77 = extractelement <4 x i64> %32, i64 2, !dbg !25
+  %78 = getelementptr float, ptr addrspace(1) %1, i64 %77, !dbg !25
+  %79 = getelementptr float, ptr addrspace(1) %78, i64 %71, !dbg !25
+  %80 = extractelement <4 x i64> %32, i64 1, !dbg !25
+  %81 = getelementptr float, ptr addrspace(1) %1, i64 %80, !dbg !25
+  %82 = getelementptr float, ptr addrspace(1) %81, i64 %72, !dbg !25
+  %83 = extractelement <4 x i64> %32, i64 0, !dbg !25
+  %84 = getelementptr float, ptr addrspace(1) %1, i64 %83, !dbg !25
+  %85 = getelementptr float, ptr addrspace(1) %84, i64 %73, !dbg !25
+  tail call void @llvm.nvvm.barrier0(), !dbg !26
+  %86 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %12, !dbg !26
+  %87 = ptrtoint ptr addrspace(1) %76 to i64, !dbg !26
+  %88 = insertelement <1 x i64> undef, i64 %87, i64 0, !dbg !26
+  store <1 x i64> %88, ptr addrspace(3) %86, align 8, !dbg !26
+  %89 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %38, !dbg !26
+  %90 = ptrtoint ptr addrspace(1) %79 to i64, !dbg !26
+  %91 = insertelement <1 x i64> undef, i64 %90, i64 0, !dbg !26
+  store <1 x i64> %91, ptr addrspace(3) %89, align 8, !dbg !26
+  tail call void @llvm.nvvm.barrier0(), !dbg !26
+  %92 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %43, !dbg !26
+  %93 = load i64, ptr addrspace(3) %92, align 8, !dbg !26
+  %94 = inttoptr i64 %93 to ptr addrspace(1), !dbg !26
+  %95 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %47, !dbg !26
+  %96 = load i64, ptr addrspace(3) %95, align 8, !dbg !26
+  %97 = inttoptr i64 %96 to ptr addrspace(1), !dbg !26
+  tail call void @llvm.nvvm.barrier0(), !dbg !26
+  %98 = ptrtoint ptr addrspace(1) %82 to i64, !dbg !26
+  %99 = insertelement <1 x i64> undef, i64 %98, i64 0, !dbg !26
+  store <1 x i64> %99, ptr addrspace(3) %86, align 8, !dbg !26
+  %100 = ptrtoint ptr addrspace(1) %85 to i64, !dbg !26
+  %101 = insertelement <1 x i64> undef, i64 %100, i64 0, !dbg !26
+  store <1 x i64> %101, ptr addrspace(3) %89, align 8, !dbg !26
+  tail call void @llvm.nvvm.barrier0(), !dbg !26
+  %102 = load i64, ptr addrspace(3) %92, align 8, !dbg !26
+  %103 = inttoptr i64 %102 to ptr addrspace(1), !dbg !26
+  %104 = load i64, ptr addrspace(3) %95, align 8, !dbg !26
+  %105 = inttoptr i64 %104 to ptr addrspace(1), !dbg !26
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %94, i1 true) #2, !dbg !26
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %97, i1 true) #2, !dbg !26
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %103, i1 true) #2, !dbg !26
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %105, i1 true) #2, !dbg !26
+  ret void, !dbg !27
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #1
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py", directory: "/tmp/torchinductor_root/hl")
+!4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 21, column: 36, scope: !7)
+!11 = !DILocation(line: 20, column: 28, scope: !7)
+!12 = !DILocation(line: 20, column: 34, scope: !7)
+!13 = !DILocation(line: 20, column: 46, scope: !7)
+!14 = !DILocation(line: 21, column: 23, scope: !7)
+!15 = !DILocation(line: 24, column: 30, scope: !7)
+!16 = !DILocation(line: 24, column: 35, scope: !7)
+!17 = !DILocation(line: 26, column: 19, scope: !7)
+!18 = !DILocation(line: 28, column: 32, scope: !7)
+!19 = !DILocation(line: 29, column: 18, scope: !7)
+!20 = !DILocation(line: 30, column: 18, scope: !7)
+!21 = !DILocation(line: 31, column: 32, scope: !7)
+!22 = !DILocation(line: 32, column: 36, scope: !7)
+!23 = !DILocation(line: 32, column: 51, scope: !7)
+!24 = !DILocation(line: 34, column: 39, scope: !7)
+!25 = !DILocation(line: 34, column: 25, scope: !7)
+!26 = !DILocation(line: 34, column: 51, scope: !7)
+!27 = !DILocation(line: 34, column: 4, scope: !7)
diff --git a/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.cubin b/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..e529fbb37d6106d704c605c1d9f324cd4315370c
Binary files /dev/null and b/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.cubin differ
diff --git a/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttir b/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..e394324cf70ba1d55f57e9682dc705e97894c983
--- /dev/null
+++ b/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttir
@@ -0,0 +1,18 @@
+module {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
+    %4 = arith.addi %3, %2 : tensor<1024xi32>
+    %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
+    %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32>
+    %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
+    %9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
+    %10 = arith.truncf %7 : tensor<1024xf32> to tensor<1024xbf16>
+    tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
+    tt.return
+  }
+}
diff --git a/.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.cubin b/.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..d5c8c8dc1cf687b33afb9a61757693075a0c575a
Binary files /dev/null and b/.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.cubin differ
diff --git a/.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.llir b/.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..01ebfdf197420664c1cdc3232bcfa51a64577e09
--- /dev/null
+++ b/.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.llir
@@ -0,0 +1,332 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+define void @triton__0d1de(ptr addrspace(1) %0, i32 %1) local_unnamed_addr !dbg !7 {
+  %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %4 = shl i32 %3, 1, !dbg !10
+  %5 = and i32 %4, 510, !dbg !10
+  %6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #4, !dbg !11
+  %7 = shl i32 %6, 9, !dbg !12
+  %8 = or i32 %7, %5, !dbg !13
+  %9 = sext i32 %8 to i64, !dbg !14
+  %10 = getelementptr i16, ptr addrspace(1) %0, i64 %9, !dbg !14
+  %11 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %10, i1 true) #4, !dbg !15
+  %12 = trunc i32 %11 to i16, !dbg !15
+  %extelt.offset = lshr i32 %11, 16, !dbg !15
+  %13 = trunc i32 %extelt.offset to i16, !dbg !15
+  %14 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %12) #4, !dbg !16
+  %15 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %13) #4, !dbg !16
+  %16 = fmul float %14, 0x3FE6A09E60000000, !dbg !17
+  %17 = fmul float %15, 0x3FE6A09E60000000, !dbg !17
+  %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not.i = icmp eq i32 %18, 0, !dbg !18
+  %19 = tail call float @llvm.nvvm.fabs.ftz.f(float %16) #4, !dbg !18
+  %20 = tail call float @llvm.nvvm.fabs.f(float %16) #4, !dbg !18
+  %.0.i = select i1 %.not.i, float %20, float %19, !dbg !18
+  %21 = fcmp oge float %.0.i, 0x3FF00C1FC0000000, !dbg !18
+  br i1 %21, label %__nv_fabsf.exit1.i, label %23, !dbg !18
+
+__nv_fabsf.exit1.i:                               ; preds = %2
+  %22 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not1.i = icmp eq i32 %22, 0, !dbg !18
+  %.01.i = select i1 %.not1.i, float %20, float %19, !dbg !18
+  br label %__internal_fmad.exit.i, !dbg !18
+
+23:                                               ; preds = %2
+  %24 = fmul float %16, %16, !dbg !18
+  br label %__internal_fmad.exit.i, !dbg !18
+
+__internal_fmad.exit.i:                           ; preds = %23, %__nv_fabsf.exit1.i
+  %25 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i ], [ 0x3FC06EBA60000000, %23 ], !dbg !18
+  %26 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i ], [ 0xBFD8127580000000, %23 ], !dbg !18
+  %27 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i ], [ 0x3FBCE315E0000000, %23 ], !dbg !18
+  %28 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i ], [ 0xBF9B837CE0000000, %23 ], !dbg !18
+  %29 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i ], [ 0x3F755ABD40000000, %23 ], !dbg !18
+  %30 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i ], [ 0xBF4AE9A400000000, %23 ], !dbg !18
+  %31 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i ], [ 0x3F163D2D40000000, %23 ], !dbg !18
+  %32 = phi float [ %.01.i, %__nv_fabsf.exit1.i ], [ %24, %23 ], !dbg !18
+  %33 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not2.i = icmp eq i32 %33, 0, !dbg !18
+  %34 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %31, float %32, float %30) #4, !dbg !18
+  %35 = tail call float @llvm.nvvm.fma.rn.f(float %31, float %32, float %30) #4, !dbg !18
+  %.02.i = select i1 %.not2.i, float %35, float %34, !dbg !18
+  %36 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not3.i = icmp eq i32 %36, 0, !dbg !18
+  %37 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float %32, float %29) #4, !dbg !18
+  %38 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float %32, float %29) #4, !dbg !18
+  %.03.i = select i1 %.not3.i, float %38, float %37, !dbg !18
+  %39 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not4.i = icmp eq i32 %39, 0, !dbg !18
+  %40 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i, float %32, float %28) #4, !dbg !18
+  %41 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i, float %32, float %28) #4, !dbg !18
+  %.04.i = select i1 %.not4.i, float %41, float %40, !dbg !18
+  %42 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not5.i = icmp eq i32 %42, 0, !dbg !18
+  %43 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i, float %32, float %27) #4, !dbg !18
+  %44 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i, float %32, float %27) #4, !dbg !18
+  %.05.i = select i1 %.not5.i, float %44, float %43, !dbg !18
+  %45 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not6.i = icmp eq i32 %45, 0, !dbg !18
+  %46 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %32, float %26) #4, !dbg !18
+  %47 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %32, float %26) #4, !dbg !18
+  %.06.i = select i1 %.not6.i, float %47, float %46, !dbg !18
+  %48 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not7.i = icmp eq i32 %48, 0, !dbg !18
+  %49 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i, float %32, float %25) #4, !dbg !18
+  %50 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i, float %32, float %25) #4, !dbg !18
+  %.07.i = select i1 %.not7.i, float %50, float %49, !dbg !18
+  %51 = fneg float %32, !dbg !18
+  %52 = select i1 %21, float %51, float %16, !dbg !18
+  %53 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not8.i = icmp eq i32 %53, 0, !dbg !18
+  %54 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i, float %52, float %52) #4, !dbg !18
+  %55 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i, float %52, float %52) #4, !dbg !18
+  %.08.i = select i1 %.not8.i, float %55, float %54, !dbg !18
+  br i1 %21, label %56, label %__nv_erff.exit, !dbg !18
+
+56:                                               ; preds = %__internal_fmad.exit.i
+  %57 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i) #4, !dbg !18
+  %58 = fsub float 1.000000e+00, %57, !dbg !18
+  %59 = bitcast float %58 to i32, !dbg !18
+  %60 = bitcast float %16 to i32, !dbg !18
+  %61 = and i32 %60, -2147483648, !dbg !18
+  %62 = or i32 %61, %59, !dbg !18
+  %63 = bitcast i32 %62 to float, !dbg !18
+  br label %__nv_erff.exit, !dbg !18
+
+__nv_erff.exit:                                   ; preds = %__internal_fmad.exit.i, %56
+  %r.0.i = phi float [ %63, %56 ], [ %.08.i, %__internal_fmad.exit.i ], !dbg !18
+  %64 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not.i1 = icmp eq i32 %64, 0, !dbg !18
+  %65 = tail call float @llvm.nvvm.fabs.ftz.f(float %17) #4, !dbg !18
+  %66 = tail call float @llvm.nvvm.fabs.f(float %17) #4, !dbg !18
+  %.0.i2 = select i1 %.not.i1, float %66, float %65, !dbg !18
+  %67 = fcmp oge float %.0.i2, 0x3FF00C1FC0000000, !dbg !18
+  br i1 %67, label %__nv_fabsf.exit1.i19, label %69, !dbg !18
+
+__nv_fabsf.exit1.i19:                             ; preds = %__nv_erff.exit
+  %68 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not1.i20 = icmp eq i32 %68, 0, !dbg !18
+  %.01.i21 = select i1 %.not1.i20, float %66, float %65, !dbg !18
+  br label %__internal_fmad.exit.i3, !dbg !18
+
+69:                                               ; preds = %__nv_erff.exit
+  %70 = fmul float %17, %17, !dbg !18
+  br label %__internal_fmad.exit.i3, !dbg !18
+
+__internal_fmad.exit.i3:                          ; preds = %69, %__nv_fabsf.exit1.i19
+  %71 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1.i19 ], [ 0x3FC06EBA60000000, %69 ], !dbg !18
+  %72 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1.i19 ], [ 0xBFD8127580000000, %69 ], !dbg !18
+  %73 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1.i19 ], [ 0x3FBCE315E0000000, %69 ], !dbg !18
+  %74 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1.i19 ], [ 0xBF9B837CE0000000, %69 ], !dbg !18
+  %75 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1.i19 ], [ 0x3F755ABD40000000, %69 ], !dbg !18
+  %76 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1.i19 ], [ 0xBF4AE9A400000000, %69 ], !dbg !18
+  %77 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1.i19 ], [ 0x3F163D2D40000000, %69 ], !dbg !18
+  %78 = phi float [ %.01.i21, %__nv_fabsf.exit1.i19 ], [ %70, %69 ], !dbg !18
+  %79 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not2.i4 = icmp eq i32 %79, 0, !dbg !18
+  %80 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %77, float %78, float %76) #4, !dbg !18
+  %81 = tail call float @llvm.nvvm.fma.rn.f(float %77, float %78, float %76) #4, !dbg !18
+  %.02.i5 = select i1 %.not2.i4, float %81, float %80, !dbg !18
+  %82 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not3.i6 = icmp eq i32 %82, 0, !dbg !18
+  %83 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i5, float %78, float %75) #4, !dbg !18
+  %84 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i5, float %78, float %75) #4, !dbg !18
+  %.03.i7 = select i1 %.not3.i6, float %84, float %83, !dbg !18
+  %85 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not4.i8 = icmp eq i32 %85, 0, !dbg !18
+  %86 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03.i7, float %78, float %74) #4, !dbg !18
+  %87 = tail call float @llvm.nvvm.fma.rn.f(float %.03.i7, float %78, float %74) #4, !dbg !18
+  %.04.i9 = select i1 %.not4.i8, float %87, float %86, !dbg !18
+  %88 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not5.i10 = icmp eq i32 %88, 0, !dbg !18
+  %89 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04.i9, float %78, float %73) #4, !dbg !18
+  %90 = tail call float @llvm.nvvm.fma.rn.f(float %.04.i9, float %78, float %73) #4, !dbg !18
+  %.05.i11 = select i1 %.not5.i10, float %90, float %89, !dbg !18
+  %91 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not6.i12 = icmp eq i32 %91, 0, !dbg !18
+  %92 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i11, float %78, float %72) #4, !dbg !18
+  %93 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i11, float %78, float %72) #4, !dbg !18
+  %.06.i13 = select i1 %.not6.i12, float %93, float %92, !dbg !18
+  %94 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not7.i14 = icmp eq i32 %94, 0, !dbg !18
+  %95 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06.i13, float %78, float %71) #4, !dbg !18
+  %96 = tail call float @llvm.nvvm.fma.rn.f(float %.06.i13, float %78, float %71) #4, !dbg !18
+  %.07.i15 = select i1 %.not7.i14, float %96, float %95, !dbg !18
+  %97 = fneg float %78, !dbg !18
+  %98 = select i1 %67, float %97, float %17, !dbg !18
+  %99 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4, !dbg !18
+  %.not8.i16 = icmp eq i32 %99, 0, !dbg !18
+  %100 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07.i15, float %98, float %98) #4, !dbg !18
+  %101 = tail call float @llvm.nvvm.fma.rn.f(float %.07.i15, float %98, float %98) #4, !dbg !18
+  %.08.i17 = select i1 %.not8.i16, float %101, float %100, !dbg !18
+  br i1 %67, label %102, label %__nv_erff.exit22, !dbg !18
+
+102:                                              ; preds = %__internal_fmad.exit.i3
+  %103 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08.i17) #4, !dbg !18
+  %104 = fsub float 1.000000e+00, %103, !dbg !18
+  %105 = bitcast float %104 to i32, !dbg !18
+  %106 = bitcast float %17 to i32, !dbg !18
+  %107 = and i32 %106, -2147483648, !dbg !18
+  %108 = or i32 %107, %105, !dbg !18
+  %109 = bitcast i32 %108 to float, !dbg !18
+  br label %__nv_erff.exit22, !dbg !18
+
+__nv_erff.exit22:                                 ; preds = %__internal_fmad.exit.i3, %102
+  %r.0.i18 = phi float [ %109, %102 ], [ %.08.i17, %__internal_fmad.exit.i3 ], !dbg !18
+  %110 = fmul float %15, 5.000000e-01, !dbg !19
+  %111 = fmul float %14, 5.000000e-01, !dbg !19
+  %112 = fadd float %r.0.i, 1.000000e+00, !dbg !20
+  %113 = fadd float %r.0.i18, 1.000000e+00, !dbg !20
+  %114 = fmul float %111, %112, !dbg !21
+  %115 = fmul float %110, %113, !dbg !21
+  %116 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %114) #4, !dbg !22
+  %117 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %115) #4, !dbg !22
+  %118 = insertelement <2 x i16> undef, i16 %116, i64 0, !dbg !22
+  %119 = insertelement <2 x i16> %118, i16 %117, i64 1, !dbg !22
+  %120 = bitcast <2 x i16> %119 to i32, !dbg !22
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %120, ptr addrspace(1) %10, i1 true) #4, !dbg !22
+  ret void, !dbg !23
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: alwaysinline nounwind
+define float @__nv_erff(float %a) local_unnamed_addr #1 {
+__nv_fabsf.exit:
+  %0 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not = icmp eq i32 %0, 0
+  %1 = tail call float @llvm.nvvm.fabs.ftz.f(float %a) #4
+  %2 = tail call float @llvm.nvvm.fabs.f(float %a) #4
+  %.0 = select i1 %.not, float %2, float %1
+  %3 = fcmp oge float %.0, 0x3FF00C1FC0000000
+  br i1 %3, label %__nv_fabsf.exit1, label %5
+
+__nv_fabsf.exit1:                                 ; preds = %__nv_fabsf.exit
+  %4 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not1 = icmp eq i32 %4, 0
+  %.01 = select i1 %.not1, float %2, float %1
+  br label %__internal_fmad.exit
+
+5:                                                ; preds = %__nv_fabsf.exit
+  %6 = fmul float %a, %a
+  br label %__internal_fmad.exit
+
+__internal_fmad.exit:                             ; preds = %5, %__nv_fabsf.exit1
+  %7 = phi float [ 0x3FE41B0840000000, %__nv_fabsf.exit1 ], [ 0x3FC06EBA60000000, %5 ]
+  %8 = phi float [ 0x3FED526FC0000000, %__nv_fabsf.exit1 ], [ 0xBFD8127580000000, %5 ]
+  %9 = phi float [ 0x3FC39F20C0000000, %__nv_fabsf.exit1 ], [ 0x3FBCE315E0000000, %5 ]
+  %10 = phi float [ 0xBFA1902C40000000, %__nv_fabsf.exit1 ], [ 0xBF9B837CE0000000, %5 ]
+  %11 = phi float [ 0x3F75908160000000, %__nv_fabsf.exit1 ], [ 0x3F755ABD40000000, %5 ]
+  %12 = phi float [ 0xBF3EAC1720000000, %__nv_fabsf.exit1 ], [ 0xBF4AE9A400000000, %5 ]
+  %13 = phi float [ 0x3EF1394780000000, %__nv_fabsf.exit1 ], [ 0x3F163D2D40000000, %5 ]
+  %14 = phi float [ %.01, %__nv_fabsf.exit1 ], [ %6, %5 ]
+  %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not2 = icmp eq i32 %15, 0
+  %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %13, float %14, float %12) #4
+  %17 = tail call float @llvm.nvvm.fma.rn.f(float %13, float %14, float %12) #4
+  %.02 = select i1 %.not2, float %17, float %16
+  %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not3 = icmp eq i32 %18, 0
+  %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float %14, float %11) #4
+  %20 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float %14, float %11) #4
+  %.03 = select i1 %.not3, float %20, float %19
+  %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not4 = icmp eq i32 %21, 0
+  %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.03, float %14, float %10) #4
+  %23 = tail call float @llvm.nvvm.fma.rn.f(float %.03, float %14, float %10) #4
+  %.04 = select i1 %.not4, float %23, float %22
+  %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not5 = icmp eq i32 %24, 0
+  %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.04, float %14, float %9) #4
+  %26 = tail call float @llvm.nvvm.fma.rn.f(float %.04, float %14, float %9) #4
+  %.05 = select i1 %.not5, float %26, float %25
+  %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not6 = icmp eq i32 %27, 0
+  %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %14, float %8) #4
+  %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %14, float %8) #4
+  %.06 = select i1 %.not6, float %29, float %28
+  %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not7 = icmp eq i32 %30, 0
+  %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.06, float %14, float %7) #4
+  %32 = tail call float @llvm.nvvm.fma.rn.f(float %.06, float %14, float %7) #4
+  %.07 = select i1 %.not7, float %32, float %31
+  %33 = fneg float %14
+  %34 = select i1 %3, float %33, float %a
+  %35 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #4
+  %.not8 = icmp eq i32 %35, 0
+  %36 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.07, float %34, float %34) #4
+  %37 = tail call float @llvm.nvvm.fma.rn.f(float %.07, float %34, float %34) #4
+  %.08 = select i1 %.not8, float %37, float %36
+  br i1 %3, label %38, label %46
+
+38:                                               ; preds = %__internal_fmad.exit
+  %39 = tail call float @llvm.nvvm.ex2.approx.ftz.f(float %.08) #4
+  %40 = fsub float 1.000000e+00, %39
+  %41 = bitcast float %40 to i32
+  %42 = bitcast float %a to i32
+  %43 = and i32 %42, -2147483648
+  %44 = or i32 %43, %41
+  %45 = bitcast i32 %44 to float
+  br label %46
+
+46:                                               ; preds = %38, %__internal_fmad.exit
+  %r.0 = phi float [ %45, %38 ], [ %.08, %__internal_fmad.exit ]
+  ret float %r.0
+}
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fabs.ftz.f(float) #0
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fabs.f(float) #0
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.ex2.approx.ftz.f(float) #3
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #4 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "ckphrtdpgsxl7sfarkkzylhv4st3uhmzvg3u6z5excfp6ydybq74.py", directory: "/tmp/torchinductor_root/kp")
+!4 = !{ptr @triton__0d1de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1de, !"maxntidx", i32 256}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 21, column: 36, scope: !7)
+!11 = !DILocation(line: 20, column: 28, scope: !7)
+!12 = !DILocation(line: 20, column: 33, scope: !7)
+!13 = !DILocation(line: 21, column: 23, scope: !7)
+!14 = !DILocation(line: 24, column: 34, scope: !7)
+!15 = !DILocation(line: 24, column: 39, scope: !7)
+!16 = !DILocation(line: 24, column: 48, scope: !7)
+!17 = !DILocation(line: 29, column: 18, scope: !7)
+!18 = !DILocation(line: 30, column: 23, scope: !7)
+!19 = !DILocation(line: 27, column: 18, scope: !7)
+!20 = !DILocation(line: 32, column: 18, scope: !7)
+!21 = !DILocation(line: 33, column: 18, scope: !7)
+!22 = !DILocation(line: 35, column: 40, scope: !7)
+!23 = !DILocation(line: 35, column: 4, scope: !7)
diff --git a/.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ttir b/.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..dba5072301c93a1b7f84b3d298b47b46546ec357
--- /dev/null
+++ b/.triton/dump/1e922bbbab749da355e4bad9c6b245e6/triton_.ttir
@@ -0,0 +1,25 @@
+module {
+  tt.func public @triton__0d1de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<1.000000e+00> : tensor<512xf32>
+    %cst_0 = arith.constant dense<0.707106769> : tensor<512xf32>
+    %cst_1 = arith.constant dense<5.000000e-01> : tensor<512xf32>
+    %c512_i32 = arith.constant 512 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32>
+    %4 = arith.addi %3, %2 : tensor<512xi32>
+    %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
+    %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16>
+    %8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32>
+    %9 = arith.mulf %8, %cst_1 : tensor<512xf32>
+    %10 = arith.mulf %8, %cst_0 : tensor<512xf32>
+    %11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<512xf32>) -> tensor<512xf32>
+    %12 = arith.addf %11, %cst : tensor<512xf32>
+    %13 = arith.mulf %9, %12 : tensor<512xf32>
+    %14 = arith.truncf %13 : tensor<512xf32> to tensor<512xbf16>
+    tt.store %6, %14 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16>
+    tt.return
+  }
+}
diff --git a/.triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.ptx b/.triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..836dedd2eac0af63da4ada80489abd20451b5489
--- /dev/null
+++ b/.triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.ptx
@@ -0,0 +1,278 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1de
+
+.visible .entry triton__0d1de(
+	.param .u64 triton__0d1de_param_0,
+	.param .u64 triton__0d1de_param_1
+)
+.maxntid 256, 1, 1
+{
+	.reg .pred 	%p<2>;
+	.reg .b32 	%r<7>;
+	.reg .b64 	%rd<7>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd2, [triton__0d1de_param_0];
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r4, %tid.x;
+	shl.b32 	%r5, %r4, 1;
+	and.b32  	%r6, %r5, 510;
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 46
+	mul.wide.s32 	%rd3, %r1, 512;
+	cvt.u64.u32 	%rd4, %r6;
+	.loc	1 21 23
+	or.b64  	%rd5, %rd3, %rd4;
+	.loc	1 25 25
+	shl.b64 	%rd6, %rd5, 2;
+	add.s64 	%rd1, %rd2, %rd6;
+	mov.b32 	%r2, 0;
+	mov.pred 	%p1, -1;
+	.loc	1 25 36
+	@%p1 st.global.v2.b32 [ %rd1 + 0 ], { %r2, %r2 };
+	.loc	1 25 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+
+}
+	.file	1 "/tmp/torchinductor_root/pk/cpkw3bdoamlgzvqjeyuk34b3jcjf57htisara7lukflexo3t22ew.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 172
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 112
+.b8 107
+.b8 119
+.b8 51
+.b8 98
+.b8 100
+.b8 111
+.b8 97
+.b8 109
+.b8 108
+.b8 103
+.b8 122
+.b8 118
+.b8 113
+.b8 106
+.b8 101
+.b8 121
+.b8 117
+.b8 107
+.b8 51
+.b8 52
+.b8 98
+.b8 51
+.b8 106
+.b8 99
+.b8 106
+.b8 102
+.b8 53
+.b8 55
+.b8 104
+.b8 116
+.b8 105
+.b8 115
+.b8 97
+.b8 114
+.b8 97
+.b8 55
+.b8 108
+.b8 117
+.b8 107
+.b8 102
+.b8 108
+.b8 101
+.b8 120
+.b8 111
+.b8 51
+.b8 116
+.b8 50
+.b8 50
+.b8 101
+.b8 119
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 112
+.b8 107
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 176
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 176
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.cubin b/.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..f65bbfb07757cd5cafcdbaeb764abf05bc09e15f
Binary files /dev/null and b/.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.cubin differ
diff --git a/.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.llir b/.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..e9fb699a23fe71d84177fa72dcc04f250b59f873
--- /dev/null
+++ b/.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.llir
@@ -0,0 +1,162 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
+
+define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %5 = and i32 %4, 127, !dbg !8
+  %6 = shl nuw nsw i32 %5, 3, !dbg !8
+  %7 = shl nuw nsw i32 %5, 2, !dbg !8
+  %8 = or i32 %7, 512, !dbg !8
+  %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !9
+  %10 = shl i32 %9, 10, !dbg !10
+  %11 = or i32 %10, %6, !dbg !11
+  %12 = or i32 %10, %7, !dbg !11
+  %13 = or i32 %10, %8, !dbg !11
+  %14 = sext i32 %11 to i64, !dbg !12
+  %15 = getelementptr i16, ptr addrspace(1) %0, i64 %14, !dbg !12
+  %16 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %15, i1 true) #2, !dbg !13
+  %17 = extractvalue { i32, i32, i32, i32 } %16, 0, !dbg !13
+  %18 = extractvalue { i32, i32, i32, i32 } %16, 1, !dbg !13
+  %19 = extractvalue { i32, i32, i32, i32 } %16, 2, !dbg !13
+  %20 = extractvalue { i32, i32, i32, i32 } %16, 3, !dbg !13
+  %21 = trunc i32 %17 to i16, !dbg !13
+  %extelt.offset = lshr i32 %17, 16, !dbg !13
+  %22 = trunc i32 %extelt.offset to i16, !dbg !13
+  %23 = trunc i32 %18 to i16, !dbg !13
+  %extelt.offset1 = lshr i32 %18, 16, !dbg !13
+  %24 = trunc i32 %extelt.offset1 to i16, !dbg !13
+  %25 = trunc i32 %19 to i16, !dbg !13
+  %extelt.offset2 = lshr i32 %19, 16, !dbg !13
+  %26 = trunc i32 %extelt.offset2 to i16, !dbg !13
+  %27 = trunc i32 %20 to i16, !dbg !13
+  %extelt.offset3 = lshr i32 %20, 16, !dbg !13
+  %28 = trunc i32 %extelt.offset3 to i16, !dbg !13
+  %29 = zext nneg i32 %6 to i64, !dbg !14
+  %30 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %29, !dbg !14
+  %31 = insertelement <1 x i16> undef, i16 %21, i64 0, !dbg !14
+  store <1 x i16> %31, ptr addrspace(3) %30, align 2, !dbg !14
+  %32 = or i32 %6, 1, !dbg !14
+  %33 = zext nneg i32 %32 to i64, !dbg !14
+  %34 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %33, !dbg !14
+  %35 = insertelement <1 x i16> undef, i16 %22, i64 0, !dbg !14
+  store <1 x i16> %35, ptr addrspace(3) %34, align 2, !dbg !14
+  %36 = or i32 %6, 2, !dbg !14
+  %37 = zext nneg i32 %36 to i64, !dbg !14
+  %38 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %37, !dbg !14
+  %39 = insertelement <1 x i16> undef, i16 %23, i64 0, !dbg !14
+  store <1 x i16> %39, ptr addrspace(3) %38, align 2, !dbg !14
+  %40 = or i32 %6, 3, !dbg !14
+  %41 = zext nneg i32 %40 to i64, !dbg !14
+  %42 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %41, !dbg !14
+  %43 = insertelement <1 x i16> undef, i16 %24, i64 0, !dbg !14
+  store <1 x i16> %43, ptr addrspace(3) %42, align 2, !dbg !14
+  %44 = or i32 %6, 4, !dbg !14
+  %45 = zext nneg i32 %44 to i64, !dbg !14
+  %46 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %45, !dbg !14
+  %47 = insertelement <1 x i16> undef, i16 %25, i64 0, !dbg !14
+  store <1 x i16> %47, ptr addrspace(3) %46, align 2, !dbg !14
+  %48 = or i32 %6, 5, !dbg !14
+  %49 = zext nneg i32 %48 to i64, !dbg !14
+  %50 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %49, !dbg !14
+  %51 = insertelement <1 x i16> undef, i16 %26, i64 0, !dbg !14
+  store <1 x i16> %51, ptr addrspace(3) %50, align 2, !dbg !14
+  %52 = or i32 %6, 6, !dbg !14
+  %53 = zext nneg i32 %52 to i64, !dbg !14
+  %54 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %53, !dbg !14
+  %55 = insertelement <1 x i16> undef, i16 %27, i64 0, !dbg !14
+  store <1 x i16> %55, ptr addrspace(3) %54, align 2, !dbg !14
+  %56 = or i32 %6, 7, !dbg !14
+  %57 = zext nneg i32 %56 to i64, !dbg !14
+  %58 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %57, !dbg !14
+  %59 = insertelement <1 x i16> undef, i16 %28, i64 0, !dbg !14
+  store <1 x i16> %59, ptr addrspace(3) %58, align 2, !dbg !14
+  tail call void @llvm.nvvm.barrier0(), !dbg !14
+  %60 = zext nneg i32 %7 to i64, !dbg !14
+  %61 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %60, !dbg !14
+  %62 = load i16, ptr addrspace(3) %61, align 2, !dbg !14
+  %63 = or i32 %7, 1, !dbg !14
+  %64 = zext nneg i32 %63 to i64, !dbg !14
+  %65 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %64, !dbg !14
+  %66 = load i16, ptr addrspace(3) %65, align 2, !dbg !14
+  %67 = or i32 %7, 2, !dbg !14
+  %68 = zext nneg i32 %67 to i64, !dbg !14
+  %69 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %68, !dbg !14
+  %70 = load i16, ptr addrspace(3) %69, align 2, !dbg !14
+  %71 = or i32 %7, 3, !dbg !14
+  %72 = zext nneg i32 %71 to i64, !dbg !14
+  %73 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %72, !dbg !14
+  %74 = load i16, ptr addrspace(3) %73, align 2, !dbg !14
+  %75 = zext nneg i32 %8 to i64, !dbg !14
+  %76 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %75, !dbg !14
+  %77 = load i16, ptr addrspace(3) %76, align 2, !dbg !14
+  %78 = or i32 %7, 513, !dbg !14
+  %79 = zext nneg i32 %78 to i64, !dbg !14
+  %80 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %79, !dbg !14
+  %81 = load i16, ptr addrspace(3) %80, align 2, !dbg !14
+  %82 = or i32 %7, 514, !dbg !14
+  %83 = zext nneg i32 %82 to i64, !dbg !14
+  %84 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %83, !dbg !14
+  %85 = load i16, ptr addrspace(3) %84, align 2, !dbg !14
+  %86 = or i32 %7, 515, !dbg !14
+  %87 = zext nneg i32 %86 to i64, !dbg !14
+  %88 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %87, !dbg !14
+  %89 = load i16, ptr addrspace(3) %88, align 2, !dbg !14
+  %90 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %62) #2, !dbg !14
+  %91 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %66) #2, !dbg !14
+  %92 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %70) #2, !dbg !14
+  %93 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %74) #2, !dbg !14
+  %94 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %77) #2, !dbg !14
+  %95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %81) #2, !dbg !14
+  %96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %85) #2, !dbg !14
+  %97 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %89) #2, !dbg !14
+  %98 = sext i32 %12 to i64, !dbg !15
+  %99 = getelementptr float, ptr addrspace(1) %1, i64 %98, !dbg !15
+  %100 = sext i32 %13 to i64, !dbg !15
+  %101 = getelementptr float, ptr addrspace(1) %1, i64 %100, !dbg !15
+  %102 = bitcast float %90 to i32, !dbg !16
+  %103 = bitcast float %91 to i32, !dbg !16
+  %104 = bitcast float %92 to i32, !dbg !16
+  %105 = bitcast float %93 to i32, !dbg !16
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %102, i32 %103, i32 %104, i32 %105, ptr addrspace(1) %99, i1 true) #2, !dbg !16
+  %106 = bitcast float %94 to i32, !dbg !16
+  %107 = bitcast float %95 to i32, !dbg !16
+  %108 = bitcast float %96 to i32, !dbg !16
+  %109 = bitcast float %97 to i32, !dbg !16
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %106, i32 %107, i32 %108, i32 %109, ptr addrspace(1) %101, i1 true) #2, !dbg !16
+  ret void, !dbg !17
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #1
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "ck62k2xzbb657snfdowwanzszaij6qzw6vuc7cfidomjpkk6igcm.py", directory: "/tmp/torchinductor_root/k6")
+!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
+!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 36, scope: !5)
+!9 = !DILocation(line: 20, column: 28, scope: !5)
+!10 = !DILocation(line: 20, column: 33, scope: !5)
+!11 = !DILocation(line: 21, column: 23, scope: !5)
+!12 = !DILocation(line: 24, column: 30, scope: !5)
+!13 = !DILocation(line: 24, column: 35, scope: !5)
+!14 = !DILocation(line: 24, column: 44, scope: !5)
+!15 = !DILocation(line: 26, column: 25, scope: !5)
+!16 = !DILocation(line: 26, column: 36, scope: !5)
+!17 = !DILocation(line: 26, column: 4, scope: !5)
diff --git a/.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ptx b/.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..117117e00c3c054fd97ca01c7ea5c74750f209d6
--- /dev/null
+++ b/.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ptx
@@ -0,0 +1,338 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2de
+.extern .shared .align 1 .b8 global_smem[];
+
+.visible .entry triton__0d1d2de(  // kernel: each CTA loads 1024 bf16 values via param_0, widens them to f32 and stores them via param_1
+	.param .u64 triton__0d1d2de_param_0,  // base address for the bf16 global loads
+	.param .u64 triton__0d1d2de_param_1,  // base address for the f32 global stores
+	.param .u32 triton__0d1d2de_param_2  // never loaded anywhere in this body
+)
+.maxntid 128, 1, 1  // thread limit per CTA: 128 in x
+{
+	.reg .pred 	%p<4>;
+	.reg .b16 	%rs<9>;
+	.reg .b32 	%r<37>;
+	.reg .b64 	%rd<13>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd4, [triton__0d1d2de_param_0];  // rd4 = input base
+	ld.param.u64 	%rd5, [triton__0d1d2de_param_1];  // rd5 = output base
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r22, %tid.x;
+	and.b32  	%r23, %r22, 127;  // r23 = tid.x & 127
+	shl.b32 	%r24, %r23, 3;  // r24 = tid*8: first element index for the load (8 bf16 per thread)
+	shl.b32 	%r25, %r23, 2;  // r25 = tid*4: first element index for the store (4 f32 per store)
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	shl.b32 	%r26, %r1, 10;  // r26 = ctaid.x * 1024: first element handled by this CTA
+	.loc	1 21 23
+	or.b32  	%r27, %r26, %r24;  // load index; OR acts as ADD because the low 10 bits of r26 are zero
+	or.b32  	%r28, %r26, %r25;  // store index (same reasoning)
+	.loc	1 24 30
+	mul.wide.s32 	%rd6, %r27, 2;  // byte offset = index * 2 (2 bytes per bf16)
+	add.s64 	%rd1, %rd4, %rd6;
+	mov.pred 	%p1, -1;  // always-true predicate: every load/store below is effectively unmasked
+	.loc	1 24 35
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];  // one 16-byte load = 8 packed bf16 values
+	shr.u32 	%r29, %r2, 16;  // r29..r32 = upper 16 bits of each loaded word (the odd-numbered bf16 elements)
+	shr.u32 	%r30, %r3, 16;
+	shr.u32 	%r31, %r4, 16;
+	shr.u32 	%r32, %r5, 16;
+	.loc	1 24 44
+	shl.b32 	%r33, %r23, 4;  // tid * 16 bytes: this thread's 8-element slot in shared memory
+	mov.u32 	%r34, global_smem;
+	add.s32 	%r35, %r34, %r33;
+	st.shared.u16 	[%r35], %r2;  // write the 8 bf16 values to shared memory in load order
+	st.shared.u16 	[%r35+2], %r29;
+	st.shared.u16 	[%r35+4], %r3;
+	st.shared.u16 	[%r35+6], %r30;
+	st.shared.u16 	[%r35+8], %r4;
+	st.shared.u16 	[%r35+10], %r31;
+	st.shared.u16 	[%r35+12], %r5;
+	st.shared.u16 	[%r35+14], %r32;
+	bar.sync 	0;  // all shared-memory writes must land before the re-distributed reads below
+	add.s32 	%r36, %r34, %r24;  // r24 reused as a byte offset: tid*8 bytes = element tid*4
+	ld.shared.u16 	%rs1, [%r36];  // elements tid*4 .. tid*4+3
+	ld.shared.u16 	%rs2, [%r36+2];
+	ld.shared.u16 	%rs3, [%r36+4];
+	ld.shared.u16 	%rs4, [%r36+6];
+	ld.shared.u16 	%rs5, [%r36+1024];  // +1024 bytes = +512 elements: elements 512+tid*4 .. 512+tid*4+3
+	ld.shared.u16 	%rs6, [%r36+1026];
+	ld.shared.u16 	%rs7, [%r36+1028];
+	ld.shared.u16 	%rs8, [%r36+1030];
+	cvt.f32.bf16 %r14, %rs1;  // widen each bf16 to f32
+	cvt.f32.bf16 %r15, %rs2;
+	cvt.f32.bf16 %r16, %rs3;
+	cvt.f32.bf16 %r17, %rs4;
+	cvt.f32.bf16 %r18, %rs5;
+	cvt.f32.bf16 %r19, %rs6;
+	cvt.f32.bf16 %r20, %rs7;
+	cvt.f32.bf16 %r21, %rs8;
+	.loc	1 26 25
+	mul.wide.s32 	%rd7, %r28, 4;  // byte offset = store index * 4 (4 bytes per f32)
+	add.s64 	%rd2, %rd5, %rd7;  // rd2 = address of the first 4-element store
+	cvt.s64.s32 	%rd8, %r26;  // rd8..rd12 rebuild the same index (r26 | r25) in 64-bit arithmetic
+	cvt.u64.u32 	%rd9, %r25;
+	or.b64  	%rd10, %rd8, %rd9;
+	shl.b64 	%rd11, %rd10, 2;
+	add.s64 	%rd12, %rd5, %rd11;
+	add.s64 	%rd3, %rd12, 2048;  // +2048 bytes = +512 f32 elements: address of the second store
+	.loc	1 26 36
+	@%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r14, %r15, %r16, %r17 };  // elements tid*4 .. tid*4+3 of this CTA's block
+	@%p1 st.global.v4.b32 [ %rd3 + 0 ], { %r18, %r19, %r20, %r21 };  // elements 512+tid*4 .. 512+tid*4+3
+	.loc	1 26 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+
+}
+	.file	1 "/tmp/torchinductor_root/k6/ck62k2xzbb657snfdowwanzszaij6qzw6vuc7cfidomjpkk6igcm.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 176
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 107
+.b8 54
+.b8 50
+.b8 107
+.b8 50
+.b8 120
+.b8 122
+.b8 98
+.b8 98
+.b8 54
+.b8 53
+.b8 55
+.b8 115
+.b8 110
+.b8 102
+.b8 100
+.b8 111
+.b8 119
+.b8 119
+.b8 97
+.b8 110
+.b8 122
+.b8 115
+.b8 122
+.b8 97
+.b8 105
+.b8 106
+.b8 54
+.b8 113
+.b8 122
+.b8 119
+.b8 54
+.b8 118
+.b8 117
+.b8 99
+.b8 55
+.b8 99
+.b8 102
+.b8 105
+.b8 100
+.b8 111
+.b8 109
+.b8 106
+.b8 112
+.b8 107
+.b8 107
+.b8 54
+.b8 105
+.b8 103
+.b8 99
+.b8 109
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 107
+.b8 54
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttgir b/.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..8dfc219ffbb3021e19f3b35e9be96086e23c9c4b
--- /dev/null
+++ b/.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttgir
@@ -0,0 +1,24 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>  // layout of the bf16 load: 8 * 32 * 4 = 1024 elements
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>  // layout of the f32 store: 4 * 32 * 4 = 512 elements, applied to 1024-element tensors
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {  // copies 1024 bf16 values per program from %arg0 to %arg1 as f32; %arg2 is not used in the body
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32  // first element index of this program: pid * 1024
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>  // 0..1023 in the load layout
+    %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1>  // 0..1023 in the store layout
+    %4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
+    %5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1>
+    %6 = arith.addi %4, %2 : tensor<1024xi32, #blocked>  // element indices for the load
+    %7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1>  // the same indices for the store
+    %8 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
+    %9 = tt.addptr %8, %6 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
+    %10 = tt.load %9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>  // load without a mask operand
+    %11 = triton_gpu.convert_layout %10 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1>  // re-distribute the loaded values from #blocked to #blocked1
+    %12 = arith.extf %11 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1>  // widen bf16 -> f32
+    %13 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked1>
+    %14 = tt.addptr %13, %7 : tensor<1024x!tt.ptr<f32, 1>, #blocked1>, tensor<1024xi32, #blocked1>
+    tt.store %14, %12 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1>  // store without a mask operand
+    tt.return
+  }
+}
diff --git a/.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttir b/.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..d2a9a5cb9370160003226fec8c61be7abfe7c35e
--- /dev/null
+++ b/.triton/dump/294d626e055d1f63037cabf3cda4f2ac/triton_.ttir
@@ -0,0 +1,18 @@
+module {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {  // copies 1024 bf16 values per program from %arg0 to %arg1 as f32; %arg2 is not used in the body
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32  // first element index of this program: pid * 1024
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>  // offsets 0..1023 inside the block
+    %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
+    %4 = arith.addi %3, %2 : tensor<1024xi32>  // element indices; shared by the load and the store
+    %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
+    %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>  // load without a mask operand
+    %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>  // widen bf16 -> f32
+    %9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
+    %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
+    tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>  // store without a mask operand
+    tt.return
+  }
+}
diff --git a/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.cubin b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..2891b6454510d32d62d28ae2efbe2a8e82c600e9
Binary files /dev/null and b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.cubin differ
diff --git a/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.llir b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..56acceca79e636922fce5162d7e68c9afe3f1b30
--- /dev/null
+++ b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.llir
@@ -0,0 +1,132 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+@global_smem = external addrspace(3) global [0 x i8]
+
+define void @triton__0d1d2d3de4e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i32 %3, i32 %4) local_unnamed_addr !dbg !5 {  ; kernel: per row x, sums 120 floats read from %0 at stride 131072 and atomically adds the sum into %2 at a row chosen by an i64 index loaded from %1; %3 and %4 are unused in the body
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %7 = and i32 %6, 63, !dbg !8  ; row inside the CTA's 64-row tile: tid & 63
+  %8 = lshr i32 %6, 6, !dbg !9
+  %9 = and i32 %8, 3, !dbg !9  ; column 0..3: (tid >> 6) & 3
+  %10 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
+  %11 = shl i32 %10, 6, !dbg !11  ; ctaid.x * 64
+  %12 = or i32 %11, %7, !dbg !12  ; global row index x = ctaid.x*64 + row (OR acts as ADD: low 6 bits of %11 are zero)
+  br label %13, !dbg !13
+
+13:                                               ; preds = %5, %13
+  %14 = phi float [ 0.000000e+00, %5 ], [ %23, %13 ]  ; running partial sum of this thread, starts at 0.0
+  %15 = phi i32 [ 0, %5 ], [ %24, %13 ]  ; loop counter r0 = 0, 4, 8, ...
+  %16 = or i32 %15, %9, !dbg !14  ; r = r0 + column (r0 is a multiple of 4 and column < 4)
+  %17 = shl i32 %16, 17, !dbg !15  ; r * 131072
+  %18 = add i32 %17, %12, !dbg !16  ; element index = r*131072 + x
+  %19 = sext i32 %18 to i64, !dbg !17
+  %20 = getelementptr float, ptr addrspace(1) %0, i64 %19, !dbg !17
+  %21 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %20, i1 true, i32 0, i1 true) #3, !dbg !18
+  %22 = bitcast i32 %21 to float, !dbg !18
+  %23 = fadd float %14, %22, !dbg !19
+  %24 = add nuw nsw i32 %15, 4, !dbg !13
+  %25 = icmp ult i32 %15, 116, !dbg !13  ; tests the current r0, so the final iteration runs with r0 = 116 (30 iterations in total)
+  br i1 %25, label %13, label %26, !dbg !13
+
+26:                                               ; preds = %13
+  %27 = shl nuw nsw i32 %7, 2, !dbg !20  ; row * 4
+  %28 = or i32 %27, %9, !dbg !20  ; row*4 + column
+  %29 = zext nneg i32 %28 to i64, !dbg !20
+  %30 = getelementptr float, ptr addrspace(3) @global_smem, i64 %29, !dbg !20  ; &smem[row*4 + column]
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %30, float %23, i1 true) #3, !dbg !20
+  tail call void @llvm.nvvm.barrier0(), !dbg !20
+  %31 = icmp slt i32 %6, 256, !dbg !20  ; tid < 256
+  %32 = sext i32 %6 to i64, !dbg !20
+  %33 = getelementptr float, ptr addrspace(3) @global_smem, i64 %32, !dbg !20  ; &smem[tid]: lanes 4k..4k+3 see the four column partials of row k
+  %34 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %33, i1 %31) #3, !dbg !20
+  %35 = bitcast float %34 to i32, !dbg !20
+  %36 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %35, i32 2, i32 31), !dbg !20  ; butterfly exchange with lane ^ 2
+  %37 = bitcast i32 %36 to float, !dbg !20
+  %38 = fadd float %34, %37, !dbg !24
+  %39 = bitcast float %38 to i32, !dbg !20
+  %40 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %39, i32 1, i32 31), !dbg !20  ; butterfly exchange with lane ^ 1
+  %41 = bitcast i32 %40 to float, !dbg !20
+  %42 = fadd float %38, %41, !dbg !24  ; sum over the aligned group of 4 lanes = full sum of one row
+  %43 = and i32 %6, 3, !dbg !20
+  %44 = icmp eq i32 %43, 0, !dbg !20
+  %45 = and i1 %31, %44, !dbg !20  ; only threads with tid % 4 == 0 (and tid < 256) write the row sum back
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %33, float %42, i1 %45) #3, !dbg !20
+  tail call void @llvm.nvvm.barrier0(), !dbg !20
+  %46 = zext nneg i32 %27 to i64, !dbg !20
+  %47 = getelementptr float, ptr addrspace(3) @global_smem, i64 %46, !dbg !20  ; &smem[row*4]
+  %48 = load float, ptr addrspace(3) %47, align 4, !dbg !20  ; reduced sum for this thread's row
+  %.frozen = freeze i32 %12
+  %49 = sdiv i32 %.frozen, 256, !dbg !28  ; x / 256
+  %50 = mul i32 %49, 256
+  %.decomposed = sub i32 %.frozen, %50  ; x - 256*(x/256), i.e. the signed remainder of x by 256
+  %51 = sext i32 %49 to i64, !dbg !29
+  %52 = getelementptr i64, ptr addrspace(1) %1, i64 %51, !dbg !29  ; address of the i64 index at position x/256 in %1
+  %53 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %52, i1 true) #3, !dbg !30
+  %54 = lshr i64 %53, 54, !dbg !31  ; moves the sign bit (bit 63) down to bit 9
+  %55 = and i64 %54, 512, !dbg !31  ; 512 when the loaded index is negative, otherwise 0
+  %56 = add i64 %55, %53, !dbg !31  ; index, plus 512 when it was negative
+  %57 = shl i64 %56, 8, !dbg !32  ; * 256
+  %58 = sext i32 %.decomposed to i64, !dbg !33
+  %59 = getelementptr float, ptr addrspace(1) %2, i64 %57, !dbg !34
+  %60 = getelementptr float, ptr addrspace(1) %59, i64 %58, !dbg !34  ; destination: %2 + index*256 + remainder (float elements)
+  %61 = icmp eq i32 %9, 0, !dbg !35  ; only column-0 threads perform the atomic
+  %62 = insertelement <1 x float> undef, float %48, i64 0, !dbg !35
+  %63 = tail call float asm sideeffect "mov.u32 $0, 0x0;\0A\09@$3 atom.global.gpu.acq_rel.add.f32 $0, [ $1 + 0 ], $2;", "=r,l,r,b"(ptr addrspace(1) %60, <1 x float> %62, i1 %61) #3, !dbg !35
+  ret void, !dbg !36
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0  ; intrinsic for the PTX %tid.x special register
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #1  ; barrier intrinsic; declared convergent through attribute group #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2  ; butterfly warp shuffle; the kernel passes -1 and 31 as first and last operands
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { nounwind }  ; attached to the inline-asm call sites of the kernel
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}  ; !3 and !4 are each listed twice in this tuple
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "c6ik5vx7p22fpk4dcvh55zimw4t5nr5zn2b7inujxjauxshljumm.py", directory: "/tmp/torchinductor_root/6i")
+!3 = !{ptr @triton__0d1d2d3de4e, !"kernel", i32 1}  ; tags @triton__0d1d2d3de4e as a kernel entry point
+!4 = !{ptr @triton__0d1d2d3de4e, !"maxntidx", i32 256}  ; x-dimension thread limit of 256 for the kernel
+!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4e", linkageName: "triton__0d1d2d3de4e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}  ; empty type list: no parameter/return types are recorded in the debug info
+!8 = !DILocation(line: 22, column: 44, scope: !5)  ; locations scoped directly to !5 refer to the generated .py file (!2)
+!9 = !DILocation(line: 24, column: 33, scope: !5)
+!10 = !DILocation(line: 21, column: 28, scope: !5)
+!11 = !DILocation(line: 21, column: 33, scope: !5)
+!12 = !DILocation(line: 22, column: 23, scope: !5)
+!13 = !DILocation(line: 27, column: 36, scope: !5)
+!14 = !DILocation(line: 28, column: 27, scope: !5)
+!15 = !DILocation(line: 31, column: 47, scope: !5)
+!16 = !DILocation(line: 31, column: 40, scope: !5)
+!17 = !DILocation(line: 31, column: 34, scope: !5)
+!18 = !DILocation(line: 31, column: 53, scope: !5)
+!19 = !DILocation(line: 34, column: 38, scope: !5)
+!20 = !DILocation(line: 243, column: 36, scope: !21, inlinedAt: !23)  ; position in standard.py (!22), recorded as inlined at line 35 of the kernel
+!21 = distinct !DILexicalBlockFile(scope: !5, file: !22, discriminator: 0)
+!22 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!23 = !DILocation(line: 35, column: 25, scope: !21)
+!24 = !DILocation(line: 233, column: 15, scope: !25, inlinedAt: !26)  ; used by the fadd instructions of the shuffle reduction
+!25 = distinct !DILexicalBlockFile(scope: !21, file: !22, discriminator: 0)
+!26 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !27)
+!27 = !DILocation(line: 35, column: 25, scope: !25)
+!28 = !DILocation(line: 36, column: 20, scope: !5)
+!29 = !DILocation(line: 38, column: 30, scope: !5)
+!30 = !DILocation(line: 38, column: 35, scope: !5)
+!31 = !DILocation(line: 41, column: 32, scope: !5)
+!32 = !DILocation(line: 45, column: 40, scope: !5)
+!33 = !DILocation(line: 45, column: 36, scope: !5)
+!34 = !DILocation(line: 45, column: 30, scope: !5)
+!35 = !DILocation(line: 45, column: 55, scope: !5)
+!36 = !DILocation(line: 45, column: 4, scope: !5)
diff --git a/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttgir b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..f76247793b898b0e6e78963e133f5ff530eb0a9b
--- /dev/null
+++ b/.triton/dump/397c6f2fc3ba128a214a60f646524724/triton_.ttgir
@@ -0,0 +1,62 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>  // one element per thread; 32*2 = 64 along dim 0 and 4 along dim 1
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {  // per row x: sum %arg0[x + 131072*r] for r in 0..119, then atomically add the sum into %arg2 at a row selected by an index from %arg1; %arg3/%arg4 are unused in the body
+    %cst = arith.constant dense<256> : tensor<64x1xi64, #blocked>  // multiplier applied to the selected index
+    %cst_0 = arith.constant dense<0> : tensor<64x1xi64, #blocked>  // threshold for the negative-index test
+    %cst_1 = arith.constant dense<512> : tensor<64x1xi64, #blocked>  // value added to negative indices
+    %cst_2 = arith.constant dense<256> : tensor<64x1xi32, #blocked>  // divisor/modulus applied to x
+    %cst_3 = arith.constant dense<131072> : tensor<1x4xi32, #blocked>  // stride between consecutive r values
+    %cst_4 = arith.constant dense<120> : tensor<1x4xi32, #blocked>  // upper bound used for the r mask
+    %c0_i32 = arith.constant 0 : i32
+    %c120_i32 = arith.constant 120 : i32
+    %c4_i32 = arith.constant 4 : i32
+    %cst_5 = arith.constant dense<0.000000e+00> : tensor<64x4xf32, #blocked>  // initial accumulator and fill value for masked-off loads
+    %cst_6 = arith.constant dense<true> : tensor<64x1xi1, #blocked>  // all-true mask for the atomic
+    %c64_i32 = arith.constant 64 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c64_i32 : i32  // first row of this program: pid * 64
+    %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
+    %4 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
+    %5 = arith.addi %4, %3 : tensor<64x1xi32, #blocked>  // x: global row indices, shape 64x1
+    %6 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+    %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x4xi32, #blocked>  // column offsets 0..3, shape 1x4
+    %8 = tt.broadcast %5 : (tensor<64x1xi32, #blocked>) -> tensor<64x4xi32, #blocked>
+    %9 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x4x!tt.ptr<f32, 1>, #blocked>
+    %10 = scf.for %arg5 = %c0_i32 to %c120_i32 step %c4_i32 iter_args(%arg6 = %cst_5) -> (tensor<64x4xf32, #blocked>)  : i32 {  // r0 = 0, 4, ..., 116; %arg6 carries the 64x4 partial sums
+      %27 = tt.splat %arg5 : (i32) -> tensor<1x4xi32, #blocked>
+      %28 = arith.addi %27, %7 : tensor<1x4xi32, #blocked>  // r = r0 + column
+      %29 = arith.cmpi slt, %28, %cst_4 : tensor<1x4xi32, #blocked>  // mask: r < 120
+      %30 = arith.muli %28, %cst_3 : tensor<1x4xi32, #blocked>  // r * 131072
+      %31 = tt.broadcast %30 : (tensor<1x4xi32, #blocked>) -> tensor<64x4xi32, #blocked>
+      %32 = arith.addi %8, %31 : tensor<64x4xi32, #blocked>  // element index x + r*131072
+      %33 = tt.addptr %9, %32 : tensor<64x4x!tt.ptr<f32, 1>, #blocked>, tensor<64x4xi32, #blocked>
+      %34 = tt.broadcast %29 : (tensor<1x4xi1, #blocked>) -> tensor<64x4xi1, #blocked>
+      %35 = tt.load %33, %34, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x4xf32, #blocked>  // masked load, 0.0 where the mask is false
+      %36 = arith.addf %arg6, %35 : tensor<64x4xf32, #blocked>
+      %37 = arith.select %34, %36, %arg6 : tensor<64x4xi1, #blocked>, tensor<64x4xf32, #blocked>  // keep the previous accumulator where the mask is false
+      scf.yield %37 : tensor<64x4xf32, #blocked>
+    }
+    %11 = "tt.reduce"(%10) <{axis = 1 : i32}> ({  // sum the 4 column partials of every row
+    ^bb0(%arg5: f32, %arg6: f32):
+      %27 = arith.addf %arg5, %arg6 : f32
+      tt.reduce.return %27 : f32
+    }) : (tensor<64x4xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %12 = tt.expand_dims %11 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
+    %13 = arith.divsi %5, %cst_2 : tensor<64x1xi32, #blocked>  // x / 256
+    %14 = arith.remsi %5, %cst_2 : tensor<64x1xi32, #blocked>  // x % 256
+    %15 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
+    %16 = tt.addptr %15, %13 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
+    %17 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>  // i64 index at position x/256 in %arg1
+    %18 = arith.addi %17, %cst_1 : tensor<64x1xi64, #blocked>  // index + 512
+    %19 = arith.cmpi slt, %17, %cst_0 : tensor<64x1xi64, #blocked>  // index < 0
+    %20 = arith.select %19, %18, %17 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>  // use index + 512 for negative indices, otherwise the index itself
+    %21 = arith.muli %20, %cst : tensor<64x1xi64, #blocked>  // selected index * 256
+    %22 = arith.extsi %14 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked>
+    %23 = arith.addi %22, %21 : tensor<64x1xi64, #blocked>  // destination offset: x % 256 + index * 256
+    %24 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>, #blocked>
+    %25 = tt.addptr %24, %23 : tensor<64x1x!tt.ptr<f32, 1>, #blocked>, tensor<64x1xi64, #blocked>
+    %26 = "tt.atomic_rmw"(%25, %12, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<64x1x!tt.ptr<f32, 1>, #blocked>, tensor<64x1xf32, #blocked>, tensor<64x1xi1, #blocked>) -> tensor<64x1xf32, #blocked>  // atomic update of the destination with the row sums; the .llir dump of this kernel emits atom.global.gpu.acq_rel.add.f32 for it
+    tt.return
+  }
+}
diff --git a/.triton/dump/415aac87553b7d064f52694fa7254686/triton_.cubin b/.triton/dump/415aac87553b7d064f52694fa7254686/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..18959e236a4b1dfc26b5f9fe54cec897bb36ef32
Binary files /dev/null and b/.triton/dump/415aac87553b7d064f52694fa7254686/triton_.cubin differ
diff --git a/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttgir b/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..7f2001fc894b13ca58dbe11761174d70d18d7130
--- /dev/null
+++ b/.triton/dump/44b225411009956bfbae22f8bac7d703/triton_.ttgir
@@ -0,0 +1,62 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>  // 4 * 32 * 2 = 256 elements
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {  // per program: v = arg0 + arg1 + arg2 over 256 elements, normalised by its mean and variance, scaled by arg3, stored to arg4 (NOTE(review): resembles a bias-free LayerNorm over a fused 3-way add -- inferred from the arithmetic only); %arg5/%arg6 are unused in the body
+    %cst = arith.constant dense<256> : tensor<256xi32, #blocked>  // bound for the element mask
+    %cst_0 = arith.constant 9.99999974E-6 : f32  // added to the variance before the reciprocal square root
+    %cst_1 = arith.constant 2.560000e+02 : f32  // divisor for both reductions (256 elements)
+    %cst_2 = arith.constant 0.000000e+00 : f32
+    %c256_i32 = arith.constant 256 : i32
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>  // fill value for masked-off f32 lanes
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>  // fill value for masked-off bf16 lanes
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>  // offsets 0..255
+    %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>  // mask offset < 256; true for every value of the 0..255 range
+    %3 = arith.muli %0, %c256_i32 : i32  // first element of this program: pid * 256
+    %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
+    %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>  // global element indices
+    %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>  // f32 input from %arg0
+    %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>  // bf16 input from %arg1
+    %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>  // bf16 input from %arg2
+    %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %17 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>  // indexed by %1 only (no pid offset): every program reads the same 256 values
+    %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>  // per-element scale from %arg3
+    %20 = arith.addf %8, %12 : tensor<256xf32, #blocked>
+    %21 = arith.addf %20, %16 : tensor<256xf32, #blocked>  // v = arg0 + arg1 + arg2 (computed in f32)
+    %22 = arith.select %2, %21, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({  // sum of v over the 256 elements
+    ^bb0(%arg7: f32, %arg8: f32):
+      %40 = arith.addf %arg7, %arg8 : f32
+      tt.reduce.return %40 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %24 = arith.addf %23, %cst_2 : f32  // adds 0.0 to the sum
+    %25 = arith.divf %24, %cst_1 : f32  // mean = sum / 256
+    %26 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked>
+    %27 = arith.subf %21, %26 : tensor<256xf32, #blocked>  // d = v - mean
+    %28 = arith.mulf %27, %27 : tensor<256xf32, #blocked>  // d * d
+    %29 = arith.select %2, %28, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({  // sum of squared deviations
+    ^bb0(%arg7: f32, %arg8: f32):
+      %40 = arith.addf %arg7, %arg8 : f32
+      tt.reduce.return %40 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %31 = arith.addf %30, %cst_2 : f32  // adds 0.0 to the sum
+    %32 = arith.divf %31, %cst_1 : f32  // variance = sum(d*d) / 256
+    %33 = arith.addf %32, %cst_0 : f32  // variance + 9.99999974E-6
+    %34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32  // libdevice __nv_rsqrtf applied to the value above
+    %35 = tt.splat %34 : (f32) -> tensor<256xf32, #blocked>
+    %36 = arith.mulf %27, %35 : tensor<256xf32, #blocked>  // d * rsqrt result
+    %37 = arith.mulf %36, %19 : tensor<256xf32, #blocked>  // times the per-element scale from %arg3
+    %38 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %39 = tt.addptr %38, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    tt.store %39, %37, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32, #blocked>  // masked store of the result to %arg4
+    tt.return
+  }
+}
diff --git a/.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.cubin b/.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..da2c34c41ee7cf60a7cd2adc8b2b38d56b4b1c87
Binary files /dev/null and b/.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.cubin differ
diff --git a/.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ptx b/.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..ae9b76aea1b21384d8b0c89a97d04a326aea92e0
--- /dev/null
+++ b/.triton/dump/4710f23a3addbad00b260d7a02366fe0/triton_.ptx
@@ -0,0 +1,465 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2d34e
+
+.visible .entry triton__0d1d2d34e(  // kernel: sums 8 f32 values (param_1) and 8 64-bit integers (param_2), divides the float sum by the integer sum converted to f32, and stores the quotient at param_0
+	.param .u64 triton__0d1d2d34e_param_0,  // address of the single f32 result
+	.param .u64 triton__0d1d2d34e_param_1,  // base address of the f32 inputs
+	.param .u64 triton__0d1d2d34e_param_2,  // base address of the 64-bit integer inputs
+	.param .u32 triton__0d1d2d34e_param_3,  // never loaded in this body
+	.param .u32 triton__0d1d2d34e_param_4  // never loaded in this body
+)
+.maxntid 64, 1, 1  // thread limit per CTA: 64 in x
+{
+	.reg .pred 	%p<6>;
+	.reg .b32 	%r<27>;
+	.reg .f32 	%f<9>;
+	.reg .b64 	%rd<24>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd4, [triton__0d1d2d34e_param_0];
+	ld.param.u64 	%rd5, [triton__0d1d2d34e_param_1];
+$L__tmp0:
+	.loc	1 25 34
+	mov.u32 	%r7, %tid.x;
+	and.b32  	%r8, %r7, 7;  // r8 = tid.x & 7: which of the 8 elements this thread loads
+	ld.param.u64 	%rd6, [triton__0d1d2d34e_param_2];
+	.loc	1 28 30
+	mul.wide.u32 	%rd7, %r8, 4;  // byte offset: element index * 4 (f32)
+	add.s64 	%rd1, %rd5, %rd7;
+	mov.b32 	%r2, 0;  // fallback value for the predicated-off path below
+	mov.pred 	%p1, -1;  // always-true predicate
+	.loc	1 28 35
+	mov.u32 %r1, 0x0;
+	@%p1 ld.global.b32 { %r1 }, [ %rd1 + 0 ];  // r1 = bits of the f32 element
+	@!%p1 mov.u32 %r1, %r2;  // never executes: %p1 is constant true
+	mov.b32 	%f1, %r1;
+	.loc	1 29 30
+	mul.wide.u32 	%rd8, %r8, 8;  // byte offset: element index * 8 (64-bit integers)
+	add.s64 	%rd3, %rd6, %rd8;
+	.loc	1 29 35
+	mov.u64 %rd2, 0x0;
+	@%p1 ld.global.b64 { %rd2 }, [ %rd3 + 0 ];  // rd2 = the 64-bit integer element
+	@!%p1 mov.u64 %rd2, 0x0;  // never executes: %p1 is constant true
+$L__tmp1:
+	.loc	2 243 36
+	shfl.sync.bfly.b32	%r9, %r1, 4, 31, -1;  // float reduction, step 1: exchange with lane ^ 4
+	mov.b32 	%f2, %r9;
+$L__tmp2:
+	.loc	2 233 15
+	add.f32 	%f3, %f1, %f2;
+$L__tmp3:
+	.loc	2 243 36
+	mov.b32 	%r10, %f3;
+	shfl.sync.bfly.b32	%r11, %r10, 2, 31, -1;  // step 2: exchange with lane ^ 2
+	mov.b32 	%f4, %r11;
+$L__tmp4:
+	.loc	2 233 15
+	add.f32 	%f5, %f3, %f4;
+$L__tmp5:
+	.loc	2 243 36
+	mov.b32 	%r12, %f5;
+	shfl.sync.bfly.b32	%r13, %r12, 1, 31, -1;  // step 3: exchange with lane ^ 1
+	mov.b32 	%f6, %r13;
+$L__tmp6:
+	.loc	2 233 15
+	add.f32 	%f7, %f5, %f6;  // f7 = sum of the 8 f32 values
+$L__tmp7:
+	.loc	2 243 36
+	cvt.u32.u64 	%r14, %rd2;  // integer reduction: low 32 bits of rd2
+	shfl.sync.bfly.b32	%r15, %r14, 4, 31, -1;  // the 64-bit value is shuffled as two 32-bit halves
+	{ .reg .b32 tmp; mov.b64 {tmp, %r16}, %rd2; }  // r16 = high 32 bits of rd2
+	shfl.sync.bfly.b32	%r17, %r16, 4, 31, -1;
+	cvt.u64.u32 	%rd9, %r15;
+	cvt.u64.u32 	%rd10, %r17;
+	shl.b64 	%rd11, %rd10, 32;
+	or.b64  	%rd12, %rd9, %rd11;  // rd12 = 64-bit value received from lane ^ 4
+$L__tmp8:
+	.loc	2 233 15
+	add.s64 	%rd13, %rd2, %rd12;
+$L__tmp9:
+	.loc	2 243 36
+	cvt.u32.u64 	%r18, %rd13;  // same split/shuffle/recombine sequence for lane ^ 2
+	shfl.sync.bfly.b32	%r19, %r18, 2, 31, -1;
+	{ .reg .b32 tmp; mov.b64 {tmp, %r20}, %rd13; }
+	shfl.sync.bfly.b32	%r21, %r20, 2, 31, -1;
+	cvt.u64.u32 	%rd14, %r19;
+	cvt.u64.u32 	%rd15, %r21;
+	shl.b64 	%rd16, %rd15, 32;
+	or.b64  	%rd17, %rd14, %rd16;
+$L__tmp10:
+	.loc	2 233 15
+	add.s64 	%rd18, %rd13, %rd17;
+$L__tmp11:
+	.loc	2 243 36
+	cvt.u32.u64 	%r22, %rd18;  // and once more for lane ^ 1
+	shfl.sync.bfly.b32	%r23, %r22, 1, 31, -1;
+	{ .reg .b32 tmp; mov.b64 {tmp, %r24}, %rd18; }
+	shfl.sync.bfly.b32	%r25, %r24, 1, 31, -1;
+	cvt.u64.u32 	%rd19, %r23;
+	cvt.u64.u32 	%rd20, %r25;
+	shl.b64 	%rd21, %rd20, 32;
+	or.b64  	%rd22, %rd19, %rd21;
+$L__tmp12:
+	.loc	2 233 15
+	add.s64 	%rd23, %rd18, %rd22;  // rd23 = sum of the 8 integer values
+$L__tmp13:
+	.loc	1 36 20
+	cvt.rn.f32.s64 	%f8, %rd23;  // integer sum converted to f32 (signed, round-to-nearest)
+	.loc	1 37 19
+	mov.b32 	%r4, %f7;
+	mov.b32 	%r5, %f8;
+	div.full.f32 %r6, %r4, %r5;  // r6 = float sum / converted integer sum
+	.loc	1 38 4
+	bar.sync 	0;
+	.loc	1 39 71
+	and.b32  	%r26, %r7, 63;
+	setp.eq.s32 	%p5, %r26, 0;  // true only for threads with tid.x % 64 == 0
+	@%p5 st.global.b32 [ %rd4 + 0 ], { %r6 };  // single predicated store of the quotient to param_0
+	.loc	1 39 4
+	ret;
+$L__tmp14:
+$L__func_end0:
+
+}
+	.file	1 "/tmp/torchinductor_root/7z/c7zrzealf5bsn7qskl6y72zb73mh5bzf6uskuswp33lv4y5kk64w.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 333
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 55
+.b8 122
+.b8 114
+.b8 122
+.b8 101
+.b8 97
+.b8 108
+.b8 102
+.b8 53
+.b8 98
+.b8 115
+.b8 110
+.b8 55
+.b8 113
+.b8 115
+.b8 107
+.b8 108
+.b8 54
+.b8 121
+.b8 55
+.b8 50
+.b8 122
+.b8 98
+.b8 55
+.b8 51
+.b8 109
+.b8 104
+.b8 53
+.b8 98
+.b8 122
+.b8 102
+.b8 54
+.b8 117
+.b8 115
+.b8 107
+.b8 117
+.b8 115
+.b8 119
+.b8 112
+.b8 51
+.b8 51
+.b8 108
+.b8 118
+.b8 52
+.b8 121
+.b8 53
+.b8 107
+.b8 107
+.b8 54
+.b8 52
+.b8 119
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 55
+.b8 122
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 52
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 52
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp6
+.b8 2
+.b8 32
+.b8 24
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp7
+.b8 2
+.b8 32
+.b8 24
+.b8 4
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp7
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 4
+.b32 125
+.b64 $L__tmp7
+.b64 $L__tmp12
+.b8 2
+.b8 35
+.b8 24
+.b8 5
+.b32 125
+.b64 $L__tmp8
+.b64 $L__tmp13
+.b8 2
+.b8 35
+.b8 24
+.b8 4
+.b32 125
+.b64 $L__tmp8
+.b64 $L__tmp13
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 337
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 52
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 337
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.cubin b/.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..f97dece3670ea019efd8fdd70103daf67e21511c
Binary files /dev/null and b/.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.cubin differ
diff --git a/.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.llir b/.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..92808eebe0f2c8863c7ff20b1eeb86b4fa18c046
--- /dev/null
+++ b/.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.llir
@@ -0,0 +1,424 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+@global_smem = external addrspace(3) global [0 x i8]
+
+define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 {
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %11 = lshr i32 %10, 5, !dbg !8
+  %urem = and i32 %10, 255, !dbg !8
+  %12 = or i32 %urem, 256, !dbg !8
+  %13 = or i32 %urem, 512, !dbg !8
+  %14 = or i32 %urem, 768, !dbg !8
+  %15 = or i32 %urem, 1024, !dbg !8
+  %16 = or i32 %urem, 1280, !dbg !8
+  %17 = or i32 %urem, 1536, !dbg !8
+  %18 = or i32 %urem, 1792, !dbg !8
+  %19 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
+  %20 = sext i32 %19 to i64, !dbg !10
+  %21 = insertelement <8 x i32> poison, i32 %urem, i64 0
+  %22 = insertelement <8 x i32> %21, i32 %12, i64 1
+  %23 = insertelement <8 x i32> %22, i32 %13, i64 2
+  %24 = insertelement <8 x i32> %23, i32 %14, i64 3
+  %25 = insertelement <8 x i32> %24, i32 %15, i64 4
+  %26 = insertelement <8 x i32> %25, i32 %16, i64 5
+  %27 = insertelement <8 x i32> %26, i32 %17, i64 6
+  %28 = insertelement <8 x i32> %27, i32 %18, i64 7
+  %29 = zext <8 x i32> %28 to <8 x i64>
+  %30 = getelementptr i64, ptr addrspace(1) %1, i64 %20, !dbg !11
+  %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %30, i1 true) #3, !dbg !12
+  %32 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #3, !dbg !13
+  %33 = bitcast i32 %32 to float, !dbg !13
+  %34 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #3, !dbg !14
+  %35 = bitcast i32 %34 to float, !dbg !14
+  %36 = mul nsw i64 %20, 50257, !dbg !15
+  %.not = icmp eq i64 %31, -1, !dbg !16
+  %37 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %33, float %35) #3, !dbg !17
+  %38 = select i1 %.not, float 0.000000e+00, float %37, !dbg !18
+  %invariant.gep = getelementptr float, ptr addrspace(1) %0, i64 %36, !dbg !19
+  %39 = insertelement <8 x float> poison, float %38, i64 0, !dbg !20
+  %40 = shufflevector <8 x float> %39, <8 x float> poison, <8 x i32> zeroinitializer, !dbg !20
+  br label %41, !dbg !19
+
+41:                                               ; preds = %9, %41
+  %42 = phi i32 [ 0, %9 ], [ %85, %41 ]
+  %43 = phi <8 x float> [ zeroinitializer, %9 ], [ %84, %41 ]
+  %44 = zext nneg i32 %42 to i64, !dbg !21
+  %45 = insertelement <8 x i64> poison, i64 %44, i64 0, !dbg !21
+  %46 = shufflevector <8 x i64> %45, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !21
+  %47 = or <8 x i64> %46, %29, !dbg !21
+  %48 = icmp ult <8 x i64> %47, <i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !22
+  %49 = extractelement <8 x i64> %47, i64 0, !dbg !23
+  %gep = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %49, !dbg !23
+  %50 = extractelement <8 x i64> %47, i64 1, !dbg !23
+  %gep3 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %50, !dbg !23
+  %51 = extractelement <8 x i64> %47, i64 2, !dbg !23
+  %gep5 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %51, !dbg !23
+  %52 = extractelement <8 x i64> %47, i64 3, !dbg !23
+  %gep7 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %52, !dbg !23
+  %53 = extractelement <8 x i64> %47, i64 4, !dbg !23
+  %gep9 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %53, !dbg !23
+  %54 = extractelement <8 x i64> %47, i64 5, !dbg !23
+  %gep11 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %54, !dbg !23
+  %55 = extractelement <8 x i64> %47, i64 6, !dbg !23
+  %gep13 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %55, !dbg !23
+  %56 = extractelement <8 x i64> %47, i64 7, !dbg !23
+  %gep15 = getelementptr float, ptr addrspace(1) %invariant.gep, i64 %56, !dbg !23
+  %57 = extractelement <8 x i1> %48, i64 0, !dbg !24
+  %58 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep, i1 %57, i32 0, i1 %57) #3, !dbg !24
+  %59 = extractelement <8 x i1> %48, i64 1, !dbg !24
+  %60 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep3, i1 %59, i32 0, i1 %59) #3, !dbg !24
+  %61 = extractelement <8 x i1> %48, i64 2, !dbg !24
+  %62 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep5, i1 %61, i32 0, i1 %61) #3, !dbg !24
+  %63 = extractelement <8 x i1> %48, i64 3, !dbg !24
+  %64 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep7, i1 %63, i32 0, i1 %63) #3, !dbg !24
+  %65 = extractelement <8 x i1> %48, i64 4, !dbg !24
+  %66 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep9, i1 %65, i32 0, i1 %65) #3, !dbg !24
+  %67 = extractelement <8 x i1> %48, i64 5, !dbg !24
+  %68 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep11, i1 %67, i32 0, i1 %67) #3, !dbg !24
+  %69 = extractelement <8 x i1> %48, i64 6, !dbg !24
+  %70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep13, i1 %69, i32 0, i1 %69) #3, !dbg !24
+  %71 = extractelement <8 x i1> %48, i64 7, !dbg !24
+  %72 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %gep15, i1 %71, i32 0, i1 %71) #3, !dbg !24
+  %73 = insertelement <8 x i32> poison, i32 %58, i64 0, !dbg !24
+  %74 = insertelement <8 x i32> %73, i32 %60, i64 1, !dbg !24
+  %75 = insertelement <8 x i32> %74, i32 %62, i64 2, !dbg !24
+  %76 = insertelement <8 x i32> %75, i32 %64, i64 3, !dbg !24
+  %77 = insertelement <8 x i32> %76, i32 %66, i64 4, !dbg !24
+  %78 = insertelement <8 x i32> %77, i32 %68, i64 5, !dbg !24
+  %79 = insertelement <8 x i32> %78, i32 %70, i64 6, !dbg !24
+  %80 = insertelement <8 x i32> %79, i32 %72, i64 7, !dbg !24
+  %81 = bitcast <8 x i32> %80 to <8 x float>, !dbg !24
+  %82 = fmul <8 x float> %40, %81, !dbg !20
+  %83 = select <8 x i1> %48, <8 x float> %82, <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !25
+  %84 = fadd <8 x float> %43, %83, !dbg !25
+  %85 = add nuw nsw i32 %42, 2048, !dbg !19
+  %86 = icmp ult i32 %42, 48209, !dbg !19
+  br i1 %86, label %41, label %87, !dbg !19
+
+87:                                               ; preds = %41
+  %88 = and i32 %10, 31, !dbg !8
+  %89 = and i32 %11, 7, !dbg !8
+  %shift = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
+  %90 = fadd <8 x float> %84, %shift, !dbg !26
+  %shift37 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
+  %91 = fadd <8 x float> %shift37, %90, !dbg !26
+  %shift38 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
+  %92 = fadd <8 x float> %shift38, %91, !dbg !26
+  %shift39 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
+  %93 = fadd <8 x float> %shift39, %92, !dbg !26
+  %shift40 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
+  %94 = fadd <8 x float> %shift40, %93, !dbg !26
+  %shift41 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
+  %95 = fadd <8 x float> %shift41, %94, !dbg !26
+  %shift42 = shufflevector <8 x float> %84, <8 x float> poison, <8 x i32> <i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !26
+  %96 = fadd <8 x float> %shift42, %95, !dbg !26
+  %97 = extractelement <8 x float> %96, i64 0, !dbg !26
+  %98 = bitcast float %97 to i32, !dbg !32
+  %99 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %98, i32 16, i32 31), !dbg !32
+  %100 = bitcast i32 %99 to float, !dbg !32
+  %101 = fadd float %97, %100, !dbg !26
+  %102 = bitcast float %101 to i32, !dbg !32
+  %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 8, i32 31), !dbg !32
+  %104 = bitcast i32 %103 to float, !dbg !32
+  %105 = fadd float %101, %104, !dbg !26
+  %106 = bitcast float %105 to i32, !dbg !32
+  %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 4, i32 31), !dbg !32
+  %108 = bitcast i32 %107 to float, !dbg !32
+  %109 = fadd float %105, %108, !dbg !26
+  %110 = bitcast float %109 to i32, !dbg !32
+  %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 2, i32 31), !dbg !32
+  %112 = bitcast i32 %111 to float, !dbg !32
+  %113 = fadd float %109, %112, !dbg !26
+  %114 = bitcast float %113 to i32, !dbg !32
+  %115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 1, i32 31), !dbg !32
+  %116 = bitcast i32 %115 to float, !dbg !32
+  %117 = fadd float %113, %116, !dbg !26
+  %118 = icmp eq i32 %88, 0, !dbg !32
+  %119 = zext nneg i32 %89 to i64, !dbg !32
+  %120 = getelementptr float, ptr addrspace(3) @global_smem, i64 %119, !dbg !32
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %120, float %117, i1 %118) #3, !dbg !32
+  tail call void @llvm.nvvm.barrier0(), !dbg !32
+  %121 = icmp slt i32 %10, 8, !dbg !32
+  %122 = sext i32 %10 to i64, !dbg !32
+  %123 = getelementptr float, ptr addrspace(3) @global_smem, i64 %122, !dbg !32
+  %124 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %123, i1 %121) #3, !dbg !32
+  %125 = bitcast float %124 to i32, !dbg !32
+  %126 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %125, i32 4, i32 31), !dbg !32
+  %127 = bitcast i32 %126 to float, !dbg !32
+  %128 = fadd float %124, %127, !dbg !26
+  %129 = bitcast float %128 to i32, !dbg !32
+  %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 2, i32 31), !dbg !32
+  %131 = bitcast i32 %130 to float, !dbg !32
+  %132 = fadd float %128, %131, !dbg !26
+  %133 = bitcast float %132 to i32, !dbg !32
+  %134 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %133, i32 1, i32 31), !dbg !32
+  %135 = bitcast i32 %134 to float, !dbg !32
+  %136 = fadd float %132, %135, !dbg !26
+  %137 = and i32 %10, 7, !dbg !32
+  %138 = icmp eq i32 %137, 0, !dbg !32
+  %139 = and i1 %121, %138, !dbg !32
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %123, float %136, i1 %139) #3, !dbg !32
+  tail call void @llvm.nvvm.barrier0(), !dbg !32
+  %140 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
+  %141 = extractelement <8 x i64> %29, i64 0, !dbg !34
+  %142 = extractelement <8 x i64> %29, i64 1, !dbg !34
+  %143 = extractelement <8 x i64> %29, i64 2, !dbg !34
+  %144 = extractelement <8 x i64> %29, i64 3, !dbg !34
+  %145 = extractelement <8 x i64> %29, i64 4, !dbg !34
+  %146 = extractelement <8 x i64> %29, i64 5, !dbg !34
+  %147 = extractelement <8 x i64> %29, i64 6, !dbg !34
+  %148 = extractelement <8 x i64> %29, i64 7, !dbg !34
+  br label %149, !dbg !35
+
+149:                                              ; preds = %87, %149
+  %150 = phi i32 [ 0, %87 ], [ %312, %149 ]
+  %151 = zext nneg i32 %150 to i64, !dbg !34
+  %152 = or i64 %141, %151, !dbg !34
+  %153 = or i64 %142, %151, !dbg !34
+  %154 = or i64 %143, %151, !dbg !34
+  %155 = or i64 %144, %151, !dbg !34
+  %156 = or i64 %145, %151, !dbg !34
+  %157 = or i64 %146, %151, !dbg !34
+  %158 = or i64 %147, %151, !dbg !34
+  %159 = or i64 %148, %151, !dbg !34
+  %160 = icmp ult i64 %152, 50257, !dbg !36
+  %161 = icmp ult i64 %153, 50257, !dbg !36
+  %162 = icmp ult i64 %154, 50257, !dbg !36
+  %163 = icmp ult i64 %155, 50257, !dbg !36
+  %164 = icmp ult i64 %156, 50257, !dbg !36
+  %165 = icmp ult i64 %157, 50257, !dbg !36
+  %166 = icmp ult i64 %158, 50257, !dbg !36
+  %167 = icmp ult i64 %159, 50257, !dbg !36
+  %168 = add nsw i64 %152, %36, !dbg !37
+  %169 = add nsw i64 %153, %36, !dbg !37
+  %170 = add nsw i64 %154, %36, !dbg !37
+  %171 = add nsw i64 %155, %36, !dbg !37
+  %172 = add nsw i64 %156, %36, !dbg !37
+  %173 = add nsw i64 %157, %36, !dbg !37
+  %174 = add nsw i64 %158, %36, !dbg !37
+  %175 = add nsw i64 %159, %36, !dbg !37
+  %176 = getelementptr i16, ptr addrspace(1) %4, i64 %168, !dbg !38
+  %177 = getelementptr i16, ptr addrspace(1) %4, i64 %169, !dbg !38
+  %178 = getelementptr i16, ptr addrspace(1) %4, i64 %170, !dbg !38
+  %179 = getelementptr i16, ptr addrspace(1) %4, i64 %171, !dbg !38
+  %180 = getelementptr i16, ptr addrspace(1) %4, i64 %172, !dbg !38
+  %181 = getelementptr i16, ptr addrspace(1) %4, i64 %173, !dbg !38
+  %182 = getelementptr i16, ptr addrspace(1) %4, i64 %174, !dbg !38
+  %183 = getelementptr i16, ptr addrspace(1) %4, i64 %175, !dbg !38
+  %184 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %176, i1 %160, i16 0, i1 %160) #3, !dbg !39
+  %185 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %177, i1 %161, i16 0, i1 %161) #3, !dbg !39
+  %186 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %178, i1 %162, i16 0, i1 %162) #3, !dbg !39
+  %187 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %179, i1 %163, i16 0, i1 %163) #3, !dbg !39
+  %188 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %180, i1 %164, i16 0, i1 %164) #3, !dbg !39
+  %189 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %181, i1 %165, i16 0, i1 %165) #3, !dbg !39
+  %190 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %182, i1 %166, i16 0, i1 %166) #3, !dbg !39
+  %191 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %183, i1 %167, i16 0, i1 %167) #3, !dbg !39
+  %192 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %184) #3, !dbg !40
+  %193 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %185) #3, !dbg !40
+  %194 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %186) #3, !dbg !40
+  %195 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %187) #3, !dbg !40
+  %196 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %188) #3, !dbg !40
+  %197 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %189) #3, !dbg !40
+  %198 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %190) #3, !dbg !40
+  %199 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %191) #3, !dbg !40
+  %200 = getelementptr float, ptr addrspace(1) %0, i64 %168, !dbg !41
+  %201 = getelementptr float, ptr addrspace(1) %0, i64 %169, !dbg !41
+  %202 = getelementptr float, ptr addrspace(1) %0, i64 %170, !dbg !41
+  %203 = getelementptr float, ptr addrspace(1) %0, i64 %171, !dbg !41
+  %204 = getelementptr float, ptr addrspace(1) %0, i64 %172, !dbg !41
+  %205 = getelementptr float, ptr addrspace(1) %0, i64 %173, !dbg !41
+  %206 = getelementptr float, ptr addrspace(1) %0, i64 %174, !dbg !41
+  %207 = getelementptr float, ptr addrspace(1) %0, i64 %175, !dbg !41
+  %208 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %200, i1 %160, i32 0, i1 %160) #3, !dbg !42
+  %209 = bitcast i32 %208 to float, !dbg !42
+  %210 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %201, i1 %161, i32 0, i1 %161) #3, !dbg !42
+  %211 = bitcast i32 %210 to float, !dbg !42
+  %212 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %202, i1 %162, i32 0, i1 %162) #3, !dbg !42
+  %213 = bitcast i32 %212 to float, !dbg !42
+  %214 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %203, i1 %163, i32 0, i1 %163) #3, !dbg !42
+  %215 = bitcast i32 %214 to float, !dbg !42
+  %216 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %204, i1 %164, i32 0, i1 %164) #3, !dbg !42
+  %217 = bitcast i32 %216 to float, !dbg !42
+  %218 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %205, i1 %165, i32 0, i1 %165) #3, !dbg !42
+  %219 = bitcast i32 %218 to float, !dbg !42
+  %220 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %206, i1 %166, i32 0, i1 %166) #3, !dbg !42
+  %221 = bitcast i32 %220 to float, !dbg !42
+  %222 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %207, i1 %167, i32 0, i1 %167) #3, !dbg !42
+  %223 = bitcast i32 %222 to float, !dbg !42
+  %224 = getelementptr i16, ptr addrspace(1) %5, i64 %168, !dbg !43
+  %225 = getelementptr i16, ptr addrspace(1) %5, i64 %169, !dbg !43
+  %226 = getelementptr i16, ptr addrspace(1) %5, i64 %170, !dbg !43
+  %227 = getelementptr i16, ptr addrspace(1) %5, i64 %171, !dbg !43
+  %228 = getelementptr i16, ptr addrspace(1) %5, i64 %172, !dbg !43
+  %229 = getelementptr i16, ptr addrspace(1) %5, i64 %173, !dbg !43
+  %230 = getelementptr i16, ptr addrspace(1) %5, i64 %174, !dbg !43
+  %231 = getelementptr i16, ptr addrspace(1) %5, i64 %175, !dbg !43
+  %232 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %224, i1 %160, i16 0, i1 %160) #3, !dbg !44
+  %233 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %225, i1 %161, i16 0, i1 %161) #3, !dbg !44
+  %234 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %226, i1 %162, i16 0, i1 %162) #3, !dbg !44
+  %235 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %227, i1 %163, i16 0, i1 %163) #3, !dbg !44
+  %236 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %228, i1 %164, i16 0, i1 %164) #3, !dbg !44
+  %237 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %229, i1 %165, i16 0, i1 %165) #3, !dbg !44
+  %238 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %230, i1 %166, i16 0, i1 %166) #3, !dbg !44
+  %239 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %231, i1 %167, i16 0, i1 %167) #3, !dbg !44
+  %240 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %232) #3, !dbg !45
+  %241 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %233) #3, !dbg !45
+  %242 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %234) #3, !dbg !45
+  %243 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %235) #3, !dbg !45
+  %244 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %236) #3, !dbg !45
+  %245 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %237) #3, !dbg !45
+  %246 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %238) #3, !dbg !45
+  %247 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %239) #3, !dbg !45
+  %248 = fmul float %38, %209, !dbg !46
+  %249 = fmul float %38, %211, !dbg !46
+  %250 = fmul float %38, %213, !dbg !46
+  %251 = fmul float %38, %215, !dbg !46
+  %252 = fmul float %38, %217, !dbg !46
+  %253 = fmul float %38, %219, !dbg !46
+  %254 = fmul float %38, %221, !dbg !46
+  %255 = fmul float %38, %223, !dbg !46
+  %256 = fmul float %240, 0x3FF7154760000000, !dbg !47
+  %257 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %256) #3, !dbg !47
+  %258 = fmul float %241, 0x3FF7154760000000, !dbg !47
+  %259 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %258) #3, !dbg !47
+  %260 = fmul float %242, 0x3FF7154760000000, !dbg !47
+  %261 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %260) #3, !dbg !47
+  %262 = fmul float %243, 0x3FF7154760000000, !dbg !47
+  %263 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %262) #3, !dbg !47
+  %264 = fmul float %244, 0x3FF7154760000000, !dbg !47
+  %265 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %264) #3, !dbg !47
+  %266 = fmul float %245, 0x3FF7154760000000, !dbg !47
+  %267 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %266) #3, !dbg !47
+  %268 = fmul float %246, 0x3FF7154760000000, !dbg !47
+  %269 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %268) #3, !dbg !47
+  %270 = fmul float %247, 0x3FF7154760000000, !dbg !47
+  %271 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %270) #3, !dbg !47
+  %272 = fmul float %140, %257, !dbg !48
+  %273 = fmul float %140, %259, !dbg !48
+  %274 = fmul float %140, %261, !dbg !48
+  %275 = fmul float %140, %263, !dbg !48
+  %276 = fmul float %140, %265, !dbg !48
+  %277 = fmul float %140, %267, !dbg !48
+  %278 = fmul float %140, %269, !dbg !48
+  %279 = fmul float %140, %271, !dbg !48
+  %280 = fsub float %248, %272, !dbg !49
+  %281 = fsub float %249, %273, !dbg !49
+  %282 = fsub float %250, %274, !dbg !49
+  %283 = fsub float %251, %275, !dbg !49
+  %284 = fsub float %252, %276, !dbg !49
+  %285 = fsub float %253, %277, !dbg !49
+  %286 = fsub float %254, %278, !dbg !49
+  %287 = fsub float %255, %279, !dbg !49
+  %288 = fadd float %192, %280, !dbg !50
+  %289 = fadd float %193, %281, !dbg !50
+  %290 = fadd float %194, %282, !dbg !50
+  %291 = fadd float %195, %283, !dbg !50
+  %292 = fadd float %196, %284, !dbg !50
+  %293 = fadd float %197, %285, !dbg !50
+  %294 = fadd float %198, %286, !dbg !50
+  %295 = fadd float %199, %287, !dbg !50
+  %296 = getelementptr i16, ptr addrspace(1) %6, i64 %168, !dbg !51
+  %297 = getelementptr i16, ptr addrspace(1) %6, i64 %169, !dbg !51
+  %298 = getelementptr i16, ptr addrspace(1) %6, i64 %170, !dbg !51
+  %299 = getelementptr i16, ptr addrspace(1) %6, i64 %171, !dbg !51
+  %300 = getelementptr i16, ptr addrspace(1) %6, i64 %172, !dbg !51
+  %301 = getelementptr i16, ptr addrspace(1) %6, i64 %173, !dbg !51
+  %302 = getelementptr i16, ptr addrspace(1) %6, i64 %174, !dbg !51
+  %303 = getelementptr i16, ptr addrspace(1) %6, i64 %175, !dbg !51
+  %304 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %288) #3, !dbg !52
+  %305 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %289) #3, !dbg !52
+  %306 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %290) #3, !dbg !52
+  %307 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %291) #3, !dbg !52
+  %308 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %292) #3, !dbg !52
+  %309 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %293) #3, !dbg !52
+  %310 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %294) #3, !dbg !52
+  %311 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %295) #3, !dbg !52
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %304, ptr addrspace(1) %296, i1 %160) #3, !dbg !52
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %305, ptr addrspace(1) %297, i1 %161) #3, !dbg !52
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %306, ptr addrspace(1) %298, i1 %162) #3, !dbg !52
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %307, ptr addrspace(1) %299, i1 %163) #3, !dbg !52
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %308, ptr addrspace(1) %300, i1 %164) #3, !dbg !52
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %309, ptr addrspace(1) %301, i1 %165) #3, !dbg !52
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %310, ptr addrspace(1) %302, i1 %166) #3, !dbg !52
+  tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %311, ptr addrspace(1) %303, i1 %167) #3, !dbg !52
+  %312 = add nuw nsw i32 %150, 2048, !dbg !35
+  %313 = icmp ult i32 %150, 48209, !dbg !35
+  br i1 %313, label %149, label %314, !dbg !35
+
+314:                                              ; preds = %149
+  ret void, !dbg !53
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")
+!3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256}
+!5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 24, column: 33, scope: !5)
+!9 = !DILocation(line: 21, column: 28, scope: !5)
+!10 = !DILocation(line: 21, column: 34, scope: !5)
+!11 = !DILocation(line: 26, column: 30, scope: !5)
+!12 = !DILocation(line: 26, column: 35, scope: !5)
+!13 = !DILocation(line: 27, column: 19, scope: !5)
+!14 = !DILocation(line: 29, column: 19, scope: !5)
+!15 = !DILocation(line: 36, column: 46, scope: !5)
+!16 = !DILocation(line: 38, column: 23, scope: !5)
+!17 = !DILocation(line: 39, column: 22, scope: !5)
+!18 = !DILocation(line: 41, column: 37, scope: !5)
+!19 = !DILocation(line: 32, column: 36, scope: !5)
+!20 = !DILocation(line: 42, column: 23, scope: !5)
+!21 = !DILocation(line: 33, column: 27, scope: !5)
+!22 = !DILocation(line: 34, column: 25, scope: !5)
+!23 = !DILocation(line: 36, column: 34, scope: !5)
+!24 = !DILocation(line: 36, column: 52, scope: !5)
+!25 = !DILocation(line: 45, column: 40, scope: !5)
+!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
+!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
+!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!29 = distinct !DILexicalBlockFile(scope: !5, file: !28, discriminator: 0)
+!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
+!31 = !DILocation(line: 46, column: 27, scope: !27)
+!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
+!33 = !DILocation(line: 46, column: 27, scope: !29)
+!34 = !DILocation(line: 52, column: 27, scope: !5)
+!35 = !DILocation(line: 51, column: 36, scope: !5)
+!36 = !DILocation(line: 53, column: 25, scope: !5)
+!37 = !DILocation(line: 55, column: 41, scope: !5)
+!38 = !DILocation(line: 55, column: 35, scope: !5)
+!39 = !DILocation(line: 55, column: 53, scope: !5)
+!40 = !DILocation(line: 55, column: 105, scope: !5)
+!41 = !DILocation(line: 56, column: 35, scope: !5)
+!42 = !DILocation(line: 56, column: 53, scope: !5)
+!43 = !DILocation(line: 57, column: 35, scope: !5)
+!44 = !DILocation(line: 57, column: 53, scope: !5)
+!45 = !DILocation(line: 57, column: 105, scope: !5)
+!46 = !DILocation(line: 63, column: 24, scope: !5)
+!47 = !DILocation(line: 65, column: 23, scope: !5)
+!48 = !DILocation(line: 66, column: 24, scope: !5)
+!49 = !DILocation(line: 67, column: 24, scope: !5)
+!50 = !DILocation(line: 69, column: 24, scope: !5)
+!51 = !DILocation(line: 70, column: 29, scope: !5)
+!52 = !DILocation(line: 70, column: 54, scope: !5)
+!53 = !DILocation(line: 51, column: 4, scope: !5)
diff --git a/.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ptx b/.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..1cf146b9f52b6c78c86fa41fae3c86b9692cc4fe
--- /dev/null
+++ b/.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ptx
@@ -0,0 +1,921 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2d3d4d5d6d7de8
+.extern .shared .align 1 .b8 global_smem[];
+
+.visible .entry triton__0d1d2d3d4d5d6d7de8(  // One CTA per row of 50257 columns: pass 1 sums scale*param_0 over the row, pass 2 writes param_6 = param_4 + (scale*param_0 - exp(param_5)*sum)
+	.param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,  // base pointer read as 32-bit floats (kept in %rd31)
+	.param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,  // base pointer of 64-bit integers, indexed by %ctaid.x
+	.param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,  // pointer to one 32-bit float (dividend of the scale)
+	.param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,  // pointer to one 32-bit float (divisor of the scale)
+	.param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,  // base pointer of 16-bit bf16 inputs (added to the result)
+	.param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,  // base pointer of 16-bit bf16 inputs (exponentiated)
+	.param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,  // base pointer of 16-bit bf16 outputs
+	.param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,  // never loaded in this body
+	.param .u64 triton__0d1d2d3d4d5d6d7de8_param_8  // never loaded in this body
+)
+.maxntid 256, 1, 1  // at most 256 threads per CTA, all along x
+{
+	.reg .pred 	%p<83>;
+	.reg .b16 	%rs<65>;
+	.reg .b32 	%r<104>;
+	.reg .f32 	%f<164>;
+	.reg .b64 	%rd<126>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd25, [triton__0d1d2d3d4d5d6d7de8_param_6];  // %rd25 = output base
+	ld.param.u64 	%rd24, [triton__0d1d2d3d4d5d6d7de8_param_5];  // %rd24 = bf16 input that is exponentiated
+	ld.param.u64 	%rd23, [triton__0d1d2d3d4d5d6d7de8_param_4];  // %rd23 = bf16 input that is added
+	ld.param.u64 	%rd31, [triton__0d1d2d3d4d5d6d7de8_param_0];  // %rd31 = f32 input base
+$L__tmp0:
+	.loc	1 24 33
+	mov.u32 	%r1, %tid.x;
+	ld.param.u64 	%rd32, [triton__0d1d2d3d4d5d6d7de8_param_1];
+	shr.u32 	%r2, %r1, 5;  // %r2 = tid.x >> 5, used later as the warp index
+	ld.param.u64 	%rd28, [triton__0d1d2d3d4d5d6d7de8_param_2];
+	and.b32  	%r9, %r1, 255;  // %r9 = tid.x & 255: first of eight per-thread column offsets
+	ld.param.u64 	%rd29, [triton__0d1d2d3d4d5d6d7de8_param_3];
+	or.b32  	%r10, %r9, 256;  // %r9 < 256, so each OR below equals %r9 + k*256 (offsets are 256 apart)
+	or.b32  	%r11, %r9, 512;
+	or.b32  	%r12, %r9, 768;
+	or.b32  	%r13, %r9, 1024;
+	or.b32  	%r14, %r9, 1280;
+	or.b32  	%r15, %r9, 1536;
+	or.b32  	%r16, %r9, 1792;
+	.loc	1 21 28
+	mov.u32 %r3, %ctaid.x;  // %r3 = CTA index, used as the row index
+	cvt.u64.u32 	%rd1, %r9;  // %rd1..%rd8 = the eight column offsets zero-extended to 64 bits
+	cvt.u64.u32 	%rd8, %r16;
+	cvt.u64.u32 	%rd7, %r15;
+	cvt.u64.u32 	%rd6, %r14;
+	cvt.u64.u32 	%rd5, %r13;
+	cvt.u64.u32 	%rd4, %r12;
+	cvt.u64.u32 	%rd3, %r11;
+	cvt.u64.u32 	%rd2, %r10;
+	.loc	1 26 30
+	mul.wide.s32 	%rd33, %r3, 8;  // byte offset of the 8-byte element param_1[ctaid.x]
+	add.s64 	%rd27, %rd32, %rd33;
+	mov.pred 	%p1, -1;  // always-true predicate guarding the three scalar loads below
+	.loc	1 26 35
+	mov.u64 %rd26, 0x0;
+	@%p1 ld.global.L1::evict_last.b64 { %rd26 }, [ %rd27 + 0 ];  // %rd26 = param_1[ctaid.x]
+	.loc	1 27 19
+	mov.u32 %r7, 0x0;
+	@%p1 ld.global.b32 { %r7 }, [ %rd28 + 0 ];  // %r7 = bits of *param_2
+	.loc	1 29 19
+	mov.u32 %r8, 0x0;
+	@%p1 ld.global.b32 { %r8 }, [ %rd29 + 0 ];  // %r8 = bits of *param_3
+	.loc	1 36 46
+	mul.wide.s32 	%rd9, %r3, 50257;  // %rd9 = ctaid.x * 50257: element index where this CTA's row starts
+	.loc	1 38 23
+	setp.eq.s64 	%p4, %rd26, -1;  // %p4 = (param_1[ctaid.x] == -1)
+	.loc	1 39 22
+	div.full.f32 %r6, %r7, %r8;  // %r6 = *param_2 / *param_3
+	mov.b32 	%f35, %r6;
+	.loc	1 41 37
+	selp.f32 	%f2, 0f00000000, %f35, %p4;  // %f2 = scale: 0.0 when param_1[ctaid.x] == -1, else the quotient
+	.loc	1 32 36
+	shl.b64 	%rd34, %rd9, 2;  // row start as a byte offset for 4-byte elements
+	add.s64 	%rd10, %rd31, %rd34;  // %rd10 = address of this row inside param_0
+	mov.f32 	%f156, 0f00000000;  // %f156..%f163 = eight per-thread partial sums, all starting at 0.0
+	mov.u64 	%rd124, 0;  // %rd124 = column base of the first loop
+	mov.f32 	%f157, %f156;
+	mov.f32 	%f158, %f156;
+	mov.f32 	%f159, %f156;
+	mov.f32 	%f160, %f156;
+	mov.f32 	%f161, %f156;
+	mov.f32 	%f162, %f156;
+	mov.f32 	%f163, %f156;
+$L__BB0_1:  // first loop: accumulate scale * param_0[row, col], 2048 columns per iteration
+	.loc	1 33 27
+	or.b64  	%rd43, %rd124, %rd1;  // column = base | offset; base is a multiple of 2048 and offsets are < 2048, so OR acts as addition
+	or.b64  	%rd44, %rd124, %rd2;
+	or.b64  	%rd45, %rd124, %rd3;
+	or.b64  	%rd46, %rd124, %rd4;
+	or.b64  	%rd47, %rd124, %rd5;
+	or.b64  	%rd48, %rd124, %rd6;
+	or.b64  	%rd49, %rd124, %rd7;
+	or.b64  	%rd50, %rd124, %rd8;
+	.loc	1 34 25
+	setp.lt.u64 	%p20, %rd50, 50257;  // in-bounds masks: column < 50257
+	setp.lt.u64 	%p18, %rd49, 50257;
+	setp.lt.u64 	%p16, %rd48, 50257;
+	setp.lt.u64 	%p14, %rd47, 50257;
+	setp.lt.u64 	%p12, %rd46, 50257;
+	setp.lt.u64 	%p10, %rd45, 50257;
+	setp.lt.u64 	%p8, %rd44, 50257;
+	setp.lt.u64 	%p6, %rd43, 50257;
+	.loc	1 36 34
+	shl.b64 	%rd51, %rd43, 2;  // addresses: row base + column * 4 bytes
+	add.s64 	%rd35, %rd10, %rd51;
+	shl.b64 	%rd52, %rd44, 2;
+	add.s64 	%rd36, %rd10, %rd52;
+	shl.b64 	%rd53, %rd45, 2;
+	add.s64 	%rd37, %rd10, %rd53;
+	shl.b64 	%rd54, %rd46, 2;
+	add.s64 	%rd38, %rd10, %rd54;
+	shl.b64 	%rd55, %rd47, 2;
+	add.s64 	%rd39, %rd10, %rd55;
+	shl.b64 	%rd56, %rd48, 2;
+	add.s64 	%rd40, %rd10, %rd56;
+	shl.b64 	%rd57, %rd49, 2;
+	add.s64 	%rd41, %rd10, %rd57;
+	shl.b64 	%rd58, %rd50, 2;
+	add.s64 	%rd42, %rd10, %rd58;
+	mov.b32 	%r71, 0;  // %r71 = 0: fallback for masked-off 32-bit loads; the second loop reads it too
+	.loc	1 36 52
+	mov.u32 %r17, 0x0;
+	@%p6 ld.global.L1::evict_last.b32 { %r17 }, [ %rd35 + 0 ];  // predicated load; the @! move below supplies 0 when out of bounds
+	@!%p6 mov.u32 %r17, %r71;
+	mov.u32 %r19, 0x0;
+	@%p8 ld.global.L1::evict_last.b32 { %r19 }, [ %rd36 + 0 ];
+	@!%p8 mov.u32 %r19, %r71;
+	mov.u32 %r21, 0x0;
+	@%p10 ld.global.L1::evict_last.b32 { %r21 }, [ %rd37 + 0 ];
+	@!%p10 mov.u32 %r21, %r71;
+	mov.u32 %r23, 0x0;
+	@%p12 ld.global.L1::evict_last.b32 { %r23 }, [ %rd38 + 0 ];
+	@!%p12 mov.u32 %r23, %r71;
+	mov.u32 %r25, 0x0;
+	@%p14 ld.global.L1::evict_last.b32 { %r25 }, [ %rd39 + 0 ];
+	@!%p14 mov.u32 %r25, %r71;
+	mov.u32 %r27, 0x0;
+	@%p16 ld.global.L1::evict_last.b32 { %r27 }, [ %rd40 + 0 ];
+	@!%p16 mov.u32 %r27, %r71;
+	mov.u32 %r29, 0x0;
+	@%p18 ld.global.L1::evict_last.b32 { %r29 }, [ %rd41 + 0 ];
+	@!%p18 mov.u32 %r29, %r71;
+	mov.u32 %r31, 0x0;
+	@%p20 ld.global.L1::evict_last.b32 { %r31 }, [ %rd42 + 0 ];
+	@!%p20 mov.u32 %r31, %r71;
+	mov.b32 	%f36, %r31;  // reinterpret the loaded bits as f32
+	mov.b32 	%f37, %r29;
+	mov.b32 	%f38, %r27;
+	mov.b32 	%f39, %r25;
+	mov.b32 	%f40, %r23;
+	mov.b32 	%f41, %r21;
+	mov.b32 	%f42, %r19;
+	mov.b32 	%f43, %r17;
+	.loc	1 42 23
+	mul.f32 	%f44, %f2, %f43;  // scale * value
+	mul.f32 	%f45, %f2, %f42;
+	mul.f32 	%f46, %f2, %f41;
+	mul.f32 	%f47, %f2, %f40;
+	mul.f32 	%f48, %f2, %f39;
+	mul.f32 	%f49, %f2, %f38;
+	mul.f32 	%f50, %f2, %f37;
+	mul.f32 	%f51, %f2, %f36;
+	.loc	1 45 40
+	selp.f32 	%f52, %f51, 0f80000000, %p20;  // masked-off columns contribute -0.0, so the add leaves the partial sum unchanged
+	selp.f32 	%f53, %f50, 0f80000000, %p18;
+	selp.f32 	%f54, %f49, 0f80000000, %p16;
+	selp.f32 	%f55, %f48, 0f80000000, %p14;
+	selp.f32 	%f56, %f47, 0f80000000, %p12;
+	selp.f32 	%f57, %f46, 0f80000000, %p10;
+	selp.f32 	%f58, %f45, 0f80000000, %p8;
+	selp.f32 	%f59, %f44, 0f80000000, %p6;
+	add.f32 	%f156, %f156, %f59;
+	add.f32 	%f157, %f157, %f58;
+	add.f32 	%f158, %f158, %f57;
+	add.f32 	%f159, %f159, %f56;
+	add.f32 	%f160, %f160, %f55;
+	add.f32 	%f161, %f161, %f54;
+	add.f32 	%f162, %f162, %f53;
+	add.f32 	%f163, %f163, %f52;
+	.loc	1 32 36
+	add.s64 	%rd124, %rd124, 2048;  // advance the column base by 2048
+	cvt.u32.u64 	%r33, %rd124;
+	add.s32 	%r34, %r33, -2048;  // %r34 = base of the iteration that just finished
+	setp.lt.u32 	%p21, %r34, 48209;  // previous base < 48209 is the same as next base < 50257
+	@%p21 bra 	$L__BB0_1;
+	.loc	1 24 33
+	and.b32  	%r41, %r1, 31;  // %r41 = tid.x & 31 (lane within the warp)
+	and.b32  	%r42, %r2, 7;  // %r42 = (tid.x >> 5) & 7 (warp index)
+$L__tmp1:
+	.loc	2 233 15
+	add.f32 	%f60, %f156, %f157;  // fold the eight per-thread partial sums into one value
+	add.f32 	%f61, %f158, %f60;
+	add.f32 	%f62, %f159, %f61;
+	add.f32 	%f63, %f160, %f62;
+	add.f32 	%f64, %f161, %f63;
+	add.f32 	%f65, %f162, %f64;
+	add.f32 	%f66, %f163, %f65;
+$L__tmp2:
+	.loc	2 243 36
+	mov.b32 	%r43, %f66;
+	shfl.sync.bfly.b32	%r44, %r43, 16, 31, -1;  // butterfly shuffles with XOR distances 16, 8, 4, 2, 1 sum the value across the warp
+	mov.b32 	%f67, %r44;
+$L__tmp3:
+	.loc	2 233 15
+	add.f32 	%f68, %f66, %f67;
+$L__tmp4:
+	.loc	2 243 36
+	mov.b32 	%r45, %f68;
+	shfl.sync.bfly.b32	%r46, %r45, 8, 31, -1;
+	mov.b32 	%f69, %r46;
+$L__tmp5:
+	.loc	2 233 15
+	add.f32 	%f70, %f68, %f69;
+$L__tmp6:
+	.loc	2 243 36
+	mov.b32 	%r47, %f70;
+	shfl.sync.bfly.b32	%r48, %r47, 4, 31, -1;
+	mov.b32 	%f71, %r48;
+$L__tmp7:
+	.loc	2 233 15
+	add.f32 	%f72, %f70, %f71;
+$L__tmp8:
+	.loc	2 243 36
+	mov.b32 	%r49, %f72;
+	shfl.sync.bfly.b32	%r50, %r49, 2, 31, -1;
+	mov.b32 	%f73, %r50;
+$L__tmp9:
+	.loc	2 233 15
+	add.f32 	%f74, %f72, %f73;
+$L__tmp10:
+	.loc	2 243 36
+	mov.b32 	%r51, %f74;
+	shfl.sync.bfly.b32	%r52, %r51, 1, 31, -1;
+	mov.b32 	%f75, %r52;
+$L__tmp11:
+	.loc	2 233 15
+	add.f32 	%f76, %f74, %f75;  // %f76 = sum over the whole warp
+$L__tmp12:
+	.loc	2 243 36
+	setp.eq.s32 	%p22, %r41, 0;  // only lane 0 of each warp publishes its warp sum
+	shl.b32 	%r53, %r42, 2;
+	mov.u32 	%r54, global_smem;
+	add.s32 	%r35, %r54, %r53;  // shared-memory slot: global_smem + warp index * 4
+	mov.b32 	%r36, %f76;
+	@%p22 st.shared.b32 [ %r35 + 0 ], %r36;
+	bar.sync 	0;  // all warp sums must be written before any thread reads them
+	setp.lt.s32 	%p23, %r1, 8;  // threads with tid.x < 8 each pick up one warp sum
+	shl.b32 	%r55, %r1, 2;
+	add.s32 	%r38, %r54, %r55;
+	@%p23 ld.shared.b32 %r37, [ %r38 + 0 ];  // %r37 is written only when tid.x < 8
+	mov.b32 	%f77, %r37;
+	shfl.sync.bfly.b32	%r56, %r37, 4, 31, -1;  // XOR distances 4, 2, 1 combine the eight warp sums
+	mov.b32 	%f78, %r56;
+$L__tmp13:
+	.loc	2 233 15
+	add.f32 	%f79, %f77, %f78;
+$L__tmp14:
+	.loc	2 243 36
+	mov.b32 	%r57, %f79;
+	shfl.sync.bfly.b32	%r58, %r57, 2, 31, -1;
+	mov.b32 	%f80, %r58;
+$L__tmp15:
+	.loc	2 233 15
+	add.f32 	%f81, %f79, %f80;
+$L__tmp16:
+	.loc	2 243 36
+	mov.b32 	%r59, %f81;
+	shfl.sync.bfly.b32	%r60, %r59, 1, 31, -1;
+	mov.b32 	%f82, %r60;
+$L__tmp17:
+	.loc	2 233 15
+	add.f32 	%f83, %f81, %f82;
+$L__tmp18:
+	.loc	2 243 36
+	and.b32  	%r61, %r1, 7;
+	setp.eq.s32 	%p25, %r61, 0;
+	and.pred  	%p24, %p23, %p25;  // tid.x < 8 and (tid.x & 7) == 0
+	mov.b32 	%r40, %f83;
+	@%p24 st.shared.b32 [ %r38 + 0 ], %r40;  // store the combined sum at global_smem + tid.x * 4
+	bar.sync 	0;
+	ld.shared.f32 	%f26, [global_smem];  // %f26 = row sum, read by every thread
+	mov.u64 	%rd125, 0;  // %rd125 = column base of the second loop
+	mov.u16 	%rs2, 0;  // %rs2 = 0: fallback for masked-off 16-bit loads
+$L__tmp19:
+$L__BB0_3:  // second loop: compute and store the output row, 2048 columns per iteration
+	.loc	1 52 27
+	or.b64  	%rd92, %rd1, %rd125;  // column = offset | base, same scheme as the first loop
+	or.b64  	%rd93, %rd2, %rd125;
+	or.b64  	%rd94, %rd3, %rd125;
+	or.b64  	%rd95, %rd4, %rd125;
+	or.b64  	%rd96, %rd5, %rd125;
+	or.b64  	%rd97, %rd6, %rd125;
+	or.b64  	%rd98, %rd7, %rd125;
+	or.b64  	%rd99, %rd8, %rd125;
+	.loc	1 53 25
+	setp.lt.u64 	%p26, %rd92, 50257;  // in-bounds masks: column < 50257
+	setp.lt.u64 	%p28, %rd93, 50257;
+	setp.lt.u64 	%p30, %rd94, 50257;
+	setp.lt.u64 	%p32, %rd95, 50257;
+	setp.lt.u64 	%p34, %rd96, 50257;
+	setp.lt.u64 	%p36, %rd97, 50257;
+	setp.lt.u64 	%p38, %rd98, 50257;
+	setp.lt.u64 	%p40, %rd99, 50257;
+	.loc	1 55 41
+	add.s64 	%rd100, %rd92, %rd9;  // flat element index = column + ctaid.x * 50257
+	add.s64 	%rd101, %rd93, %rd9;
+	add.s64 	%rd102, %rd94, %rd9;
+	add.s64 	%rd103, %rd95, %rd9;
+	add.s64 	%rd104, %rd96, %rd9;
+	add.s64 	%rd105, %rd97, %rd9;
+	add.s64 	%rd106, %rd98, %rd9;
+	add.s64 	%rd107, %rd99, %rd9;
+	.loc	1 55 35
+	shl.b64 	%rd108, %rd100, 1;  // byte offset for 2-byte elements; reused for param_5 and param_6 below
+	add.s64 	%rd60, %rd23, %rd108;  // address inside param_4
+	shl.b64 	%rd109, %rd101, 1;
+	add.s64 	%rd61, %rd23, %rd109;
+	shl.b64 	%rd110, %rd102, 1;
+	add.s64 	%rd62, %rd23, %rd110;
+	shl.b64 	%rd111, %rd103, 1;
+	add.s64 	%rd63, %rd23, %rd111;
+	shl.b64 	%rd112, %rd104, 1;
+	add.s64 	%rd64, %rd23, %rd112;
+	shl.b64 	%rd113, %rd105, 1;
+	add.s64 	%rd65, %rd23, %rd113;
+	shl.b64 	%rd114, %rd106, 1;
+	add.s64 	%rd66, %rd23, %rd114;
+	shl.b64 	%rd115, %rd107, 1;
+	add.s64 	%rd67, %rd23, %rd115;
+	.loc	1 55 53
+	mov.u16 %rs1, 0x0;
+	@%p26 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd60 + 0 ];  // predicated 16-bit load from param_4; falls back to %rs2 (0)
+	@!%p26 mov.u16 %rs1, %rs2;
+	mov.u16 %rs3, 0x0;
+	@%p28 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd61 + 0 ];
+	@!%p28 mov.u16 %rs3, %rs2;
+	mov.u16 %rs5, 0x0;
+	@%p30 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd62 + 0 ];
+	@!%p30 mov.u16 %rs5, %rs2;
+	mov.u16 %rs7, 0x0;
+	@%p32 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd63 + 0 ];
+	@!%p32 mov.u16 %rs7, %rs2;
+	mov.u16 %rs9, 0x0;
+	@%p34 ld.global.L1::evict_first.b16 { %rs9 }, [ %rd64 + 0 ];
+	@!%p34 mov.u16 %rs9, %rs2;
+	mov.u16 %rs11, 0x0;
+	@%p36 ld.global.L1::evict_first.b16 { %rs11 }, [ %rd65 + 0 ];
+	@!%p36 mov.u16 %rs11, %rs2;
+	mov.u16 %rs13, 0x0;
+	@%p38 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd66 + 0 ];
+	@!%p38 mov.u16 %rs13, %rs2;
+	mov.u16 %rs15, 0x0;
+	@%p40 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd67 + 0 ];
+	@!%p40 mov.u16 %rs15, %rs2;
+	.loc	1 55 105
+	cvt.f32.bf16 %r62, %rs1;  // widen the bf16 values from param_4 to f32
+	mov.b32 	%f100, %r62;
+	cvt.f32.bf16 %r63, %rs3;
+	mov.b32 	%f101, %r63;
+	cvt.f32.bf16 %r64, %rs5;
+	mov.b32 	%f102, %r64;
+	cvt.f32.bf16 %r65, %rs7;
+	mov.b32 	%f103, %r65;
+	cvt.f32.bf16 %r66, %rs9;
+	mov.b32 	%f104, %r66;
+	cvt.f32.bf16 %r67, %rs11;
+	mov.b32 	%f105, %r67;
+	cvt.f32.bf16 %r68, %rs13;
+	mov.b32 	%f106, %r68;
+	cvt.f32.bf16 %r69, %rs15;
+	mov.b32 	%f107, %r69;
+	.loc	1 56 35
+	shl.b64 	%rd116, %rd92, 2;  // addresses inside this row of param_0: row base + column * 4 bytes
+	add.s64 	%rd68, %rd10, %rd116;
+	shl.b64 	%rd117, %rd93, 2;
+	add.s64 	%rd69, %rd10, %rd117;
+	shl.b64 	%rd118, %rd94, 2;
+	add.s64 	%rd70, %rd10, %rd118;
+	shl.b64 	%rd119, %rd95, 2;
+	add.s64 	%rd71, %rd10, %rd119;
+	shl.b64 	%rd120, %rd96, 2;
+	add.s64 	%rd72, %rd10, %rd120;
+	shl.b64 	%rd121, %rd97, 2;
+	add.s64 	%rd73, %rd10, %rd121;
+	shl.b64 	%rd122, %rd98, 2;
+	add.s64 	%rd74, %rd10, %rd122;
+	shl.b64 	%rd123, %rd99, 2;
+	add.s64 	%rd75, %rd10, %rd123;
+	.loc	1 56 53
+	mov.u32 %r70, 0x0;
+	@%p26 ld.global.L1::evict_first.b32 { %r70 }, [ %rd68 + 0 ];  // re-read param_0; fallback %r71 was set to 0 inside the first loop
+	@!%p26 mov.u32 %r70, %r71;
+	mov.b32 	%f108, %r70;
+	mov.u32 %r72, 0x0;
+	@%p28 ld.global.L1::evict_first.b32 { %r72 }, [ %rd69 + 0 ];
+	@!%p28 mov.u32 %r72, %r71;
+	mov.b32 	%f109, %r72;
+	mov.u32 %r74, 0x0;
+	@%p30 ld.global.L1::evict_first.b32 { %r74 }, [ %rd70 + 0 ];
+	@!%p30 mov.u32 %r74, %r71;
+	mov.b32 	%f110, %r74;
+	mov.u32 %r76, 0x0;
+	@%p32 ld.global.L1::evict_first.b32 { %r76 }, [ %rd71 + 0 ];
+	@!%p32 mov.u32 %r76, %r71;
+	mov.b32 	%f111, %r76;
+	mov.u32 %r78, 0x0;
+	@%p34 ld.global.L1::evict_first.b32 { %r78 }, [ %rd72 + 0 ];
+	@!%p34 mov.u32 %r78, %r71;
+	mov.b32 	%f112, %r78;
+	mov.u32 %r80, 0x0;
+	@%p36 ld.global.L1::evict_first.b32 { %r80 }, [ %rd73 + 0 ];
+	@!%p36 mov.u32 %r80, %r71;
+	mov.b32 	%f113, %r80;
+	mov.u32 %r82, 0x0;
+	@%p38 ld.global.L1::evict_first.b32 { %r82 }, [ %rd74 + 0 ];
+	@!%p38 mov.u32 %r82, %r71;
+	mov.b32 	%f114, %r82;
+	mov.u32 %r84, 0x0;
+	@%p40 ld.global.L1::evict_first.b32 { %r84 }, [ %rd75 + 0 ];
+	@!%p40 mov.u32 %r84, %r71;
+	mov.b32 	%f115, %r84;
+	.loc	1 57 35
+	add.s64 	%rd76, %rd24, %rd108;  // addresses inside param_5, reusing the 2-byte offsets
+	add.s64 	%rd77, %rd24, %rd109;
+	add.s64 	%rd78, %rd24, %rd110;
+	add.s64 	%rd79, %rd24, %rd111;
+	add.s64 	%rd80, %rd24, %rd112;
+	add.s64 	%rd81, %rd24, %rd113;
+	add.s64 	%rd82, %rd24, %rd114;
+	add.s64 	%rd83, %rd24, %rd115;
+	.loc	1 57 53
+	mov.u16 %rs25, 0x0;
+	@%p26 ld.global.L1::evict_first.b16 { %rs25 }, [ %rd76 + 0 ];  // predicated 16-bit load from param_5; falls back to %rs2 (0)
+	@!%p26 mov.u16 %rs25, %rs2;
+	mov.u16 %rs27, 0x0;
+	@%p28 ld.global.L1::evict_first.b16 { %rs27 }, [ %rd77 + 0 ];
+	@!%p28 mov.u16 %rs27, %rs2;
+	mov.u16 %rs29, 0x0;
+	@%p30 ld.global.L1::evict_first.b16 { %rs29 }, [ %rd78 + 0 ];
+	@!%p30 mov.u16 %rs29, %rs2;
+	mov.u16 %rs31, 0x0;
+	@%p32 ld.global.L1::evict_first.b16 { %rs31 }, [ %rd79 + 0 ];
+	@!%p32 mov.u16 %rs31, %rs2;
+	mov.u16 %rs33, 0x0;
+	@%p34 ld.global.L1::evict_first.b16 { %rs33 }, [ %rd80 + 0 ];
+	@!%p34 mov.u16 %rs33, %rs2;
+	mov.u16 %rs35, 0x0;
+	@%p36 ld.global.L1::evict_first.b16 { %rs35 }, [ %rd81 + 0 ];
+	@!%p36 mov.u16 %rs35, %rs2;
+	mov.u16 %rs37, 0x0;
+	@%p38 ld.global.L1::evict_first.b16 { %rs37 }, [ %rd82 + 0 ];
+	@!%p38 mov.u16 %rs37, %rs2;
+	mov.u16 %rs39, 0x0;
+	@%p40 ld.global.L1::evict_first.b16 { %rs39 }, [ %rd83 + 0 ];
+	@!%p40 mov.u16 %rs39, %rs2;
+	.loc	1 57 105
+	cvt.f32.bf16 %r86, %rs25;  // widen the bf16 values from param_5 to f32
+	mov.b32 	%f116, %r86;
+	cvt.f32.bf16 %r87, %rs27;
+	mov.b32 	%f117, %r87;
+	cvt.f32.bf16 %r88, %rs29;
+	mov.b32 	%f118, %r88;
+	cvt.f32.bf16 %r89, %rs31;
+	mov.b32 	%f119, %r89;
+	cvt.f32.bf16 %r90, %rs33;
+	mov.b32 	%f120, %r90;
+	cvt.f32.bf16 %r91, %rs35;
+	mov.b32 	%f121, %r91;
+	cvt.f32.bf16 %r92, %rs37;
+	mov.b32 	%f122, %r92;
+	cvt.f32.bf16 %r93, %rs39;
+	mov.b32 	%f123, %r93;
+	.loc	1 65 23
+	mul.f32 	%f85, %f116, 0f3FB8AA3B;  // exp(x) as ex2.approx(x * 0f3FB8AA3B); the constant is ~1.442695 (log2 e)
+	ex2.approx.f32 %f84, %f85;
+	mul.f32 	%f87, %f117, 0f3FB8AA3B;
+	ex2.approx.f32 %f86, %f87;
+	mul.f32 	%f89, %f118, 0f3FB8AA3B;
+	ex2.approx.f32 %f88, %f89;
+	mul.f32 	%f91, %f119, 0f3FB8AA3B;
+	ex2.approx.f32 %f90, %f91;
+	mul.f32 	%f93, %f120, 0f3FB8AA3B;
+	ex2.approx.f32 %f92, %f93;
+	mul.f32 	%f95, %f121, 0f3FB8AA3B;
+	ex2.approx.f32 %f94, %f95;
+	mul.f32 	%f97, %f122, 0f3FB8AA3B;
+	ex2.approx.f32 %f96, %f97;
+	mul.f32 	%f99, %f123, 0f3FB8AA3B;
+	ex2.approx.f32 %f98, %f99;
+	.loc	1 66 24
+	mul.f32 	%f124, %f26, %f84;  // row sum * exp(param_5 value)
+	mul.f32 	%f125, %f26, %f86;
+	mul.f32 	%f126, %f26, %f88;
+	mul.f32 	%f127, %f26, %f90;
+	mul.f32 	%f128, %f26, %f92;
+	mul.f32 	%f129, %f26, %f94;
+	mul.f32 	%f130, %f26, %f96;
+	mul.f32 	%f131, %f26, %f98;
+	.loc	1 67 24
+	neg.f32 	%f132, %f124;  // neg + fma computes scale * param_0 value - row sum * exp(...)
+	fma.rn.f32 	%f133, %f2, %f108, %f132;
+	neg.f32 	%f134, %f125;
+	fma.rn.f32 	%f135, %f2, %f109, %f134;
+	neg.f32 	%f136, %f126;
+	fma.rn.f32 	%f137, %f2, %f110, %f136;
+	neg.f32 	%f138, %f127;
+	fma.rn.f32 	%f139, %f2, %f111, %f138;
+	neg.f32 	%f140, %f128;
+	fma.rn.f32 	%f141, %f2, %f112, %f140;
+	neg.f32 	%f142, %f129;
+	fma.rn.f32 	%f143, %f2, %f113, %f142;
+	neg.f32 	%f144, %f130;
+	fma.rn.f32 	%f145, %f2, %f114, %f144;
+	neg.f32 	%f146, %f131;
+	fma.rn.f32 	%f147, %f2, %f115, %f146;
+	.loc	1 69 24
+	add.f32 	%f148, %f100, %f133;  // add the widened param_4 value
+	add.f32 	%f149, %f101, %f135;
+	add.f32 	%f150, %f102, %f137;
+	add.f32 	%f151, %f103, %f139;
+	add.f32 	%f152, %f104, %f141;
+	add.f32 	%f153, %f105, %f143;
+	add.f32 	%f154, %f106, %f145;
+	add.f32 	%f155, %f107, %f147;
+	.loc	1 70 29
+	add.s64 	%rd84, %rd25, %rd108;  // output addresses inside param_6, reusing the 2-byte offsets
+	add.s64 	%rd85, %rd25, %rd109;
+	add.s64 	%rd86, %rd25, %rd110;
+	add.s64 	%rd87, %rd25, %rd111;
+	add.s64 	%rd88, %rd25, %rd112;
+	add.s64 	%rd89, %rd25, %rd113;
+	add.s64 	%rd90, %rd25, %rd114;
+	add.s64 	%rd91, %rd25, %rd115;
+	.loc	1 70 54
+	mov.b32 	%r94, %f148;
+	cvt.rn.bf16.f32 %rs49, %r94;  // round the f32 results to bf16 (round-to-nearest-even)
+	mov.b32 	%r95, %f149;
+	cvt.rn.bf16.f32 %rs50, %r95;
+	mov.b32 	%r96, %f150;
+	cvt.rn.bf16.f32 %rs51, %r96;
+	mov.b32 	%r97, %f151;
+	cvt.rn.bf16.f32 %rs52, %r97;
+	mov.b32 	%r98, %f152;
+	cvt.rn.bf16.f32 %rs53, %r98;
+	mov.b32 	%r99, %f153;
+	cvt.rn.bf16.f32 %rs54, %r99;
+	mov.b32 	%r100, %f154;
+	cvt.rn.bf16.f32 %rs55, %r100;
+	mov.b32 	%r101, %f155;
+	cvt.rn.bf16.f32 %rs56, %r101;
+	@%p26 st.global.b16 [ %rd84 + 0 ], { %rs49 };  // stores are predicated on the same in-bounds masks as the loads
+	@%p28 st.global.b16 [ %rd85 + 0 ], { %rs50 };
+	@%p30 st.global.b16 [ %rd86 + 0 ], { %rs51 };
+	@%p32 st.global.b16 [ %rd87 + 0 ], { %rs52 };
+	@%p34 st.global.b16 [ %rd88 + 0 ], { %rs53 };
+	@%p36 st.global.b16 [ %rd89 + 0 ], { %rs54 };
+	@%p38 st.global.b16 [ %rd90 + 0 ], { %rs55 };
+	@%p40 st.global.b16 [ %rd91 + 0 ], { %rs56 };
+	.loc	1 51 36
+	add.s64 	%rd125, %rd125, 2048;  // advance the column base by 2048
+	cvt.u32.u64 	%r102, %rd125;
+	add.s32 	%r103, %r102, -2048;  // base of the iteration that just finished
+	setp.lt.u32 	%p82, %r103, 48209;  // previous base < 48209 is the same as next base < 50257
+	@%p82 bra 	$L__BB0_3;
+	.loc	1 51 4
+	ret;
+$L__tmp20:
+$L__func_end0:
+
+}
+	.file	1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 278
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 107
+.b8 122
+.b8 103
+.b8 108
+.b8 55
+.b8 116
+.b8 104
+.b8 98
+.b8 52
+.b8 120
+.b8 100
+.b8 102
+.b8 107
+.b8 102
+.b8 110
+.b8 100
+.b8 50
+.b8 116
+.b8 105
+.b8 100
+.b8 107
+.b8 115
+.b8 54
+.b8 109
+.b8 116
+.b8 53
+.b8 102
+.b8 51
+.b8 104
+.b8 97
+.b8 117
+.b8 119
+.b8 102
+.b8 121
+.b8 106
+.b8 102
+.b8 108
+.b8 98
+.b8 116
+.b8 122
+.b8 121
+.b8 101
+.b8 112
+.b8 111
+.b8 53
+.b8 111
+.b8 120
+.b8 107
+.b8 118
+.b8 104
+.b8 107
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 107
+.b8 122
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 101
+.b8 56
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 101
+.b8 56
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp18
+.b8 2
+.b8 46
+.b8 27
+.b8 5
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp18
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp19
+.b8 2
+.b8 46
+.b8 27
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 282
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 101
+.b8 56
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 282
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ttgir b/.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..ea84a19e67362c0c20c0c66eca3bf44a04c5a47c
--- /dev/null
+++ b/.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ttgir
@@ -0,0 +1,81 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>  // layout shared by every tensor below: 1 element per thread, 32 threads per warp and 8 warps per CTA along dim 1
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {  // GPU-lowered form of the kernel: compute capability 89, 1 CTA, 8 warps of 32 threads
+  tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {  // per program id: sum scale*%arg0 over a 50257-wide row, then store %arg6 = %arg4 + (scale*%arg0 - exp(%arg5)*sum); %arg7 and %arg8 are not referenced in the body
+    %cst = arith.constant dense<0.000000e+00> : tensor<1x1xf32, #blocked>  // scale used when %arg1[pid] == -1
+    %cst_0 = arith.constant dense<-1> : tensor<1x1xi64, #blocked>  // value %arg1[pid] is compared against
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked>  // initial accumulator and fill value for masked f32 loads
+    %cst_2 = arith.constant dense<50257> : tensor<1x2048xi64, #blocked>  // row length used for the column mask
+    %c0_i32 = arith.constant 0 : i32
+    %c2048_i32 = arith.constant 2048 : i32  // loop step: columns handled per iteration
+    %c50257_i32 = arith.constant 50257 : i32  // loop upper bound
+    %c50257_i64 = arith.constant 50257 : i64  // row stride in elements
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked>  // fill value for masked bf16 loads
+    %0 = tt.get_program_id x : i32
+    %1 = arith.extsi %0 : i32 to i64  // program id doubles as the row index
+    %2 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+    %3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x2048xi32, #blocked>
+    %4 = arith.extsi %3 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked>  // column offsets 0..2047 within one loop iteration
+    %5 = tt.addptr %arg1, %1 : !tt.ptr<i64, 1>, i64
+    %6 = tt.splat %5 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>, #blocked>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64, #blocked>  // %arg1[pid]
+    %8 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
+    %9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32  // scalar at %arg2[0]
+    %10 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
+    %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32  // scalar at %arg3[0]
+    %12 = arith.muli %1, %c50257_i64 : i64  // element index where this program's row starts
+    %13 = tt.splat %12 : (i64) -> tensor<1x2048xi64, #blocked>
+    %14 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>, #blocked>
+    %15 = arith.cmpi ne, %7, %cst_0 : tensor<1x1xi64, #blocked>  // true when %arg1[pid] != -1
+    %16 = arith.divf %9, %11 : f32
+    %17 = tt.splat %16 : (f32) -> tensor<1x1xf32, #blocked>
+    %18 = arith.select %15, %17, %cst : tensor<1x1xi1, #blocked>, tensor<1x1xf32, #blocked>  // scale = %arg2[0] / %arg3[0], or 0.0 when %arg1[pid] == -1
+    %19 = tt.broadcast %18 : (tensor<1x1xf32, #blocked>) -> tensor<1x2048xf32, #blocked>
+    %20 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg10 = %cst_1) -> (tensor<1x2048xf32, #blocked>)  : i32 {  // pass 1: accumulate scale * %arg0[row, col] in 2048-column tiles
+      %27 = arith.extsi %arg9 : i32 to i64
+      %28 = tt.splat %27 : (i64) -> tensor<1x2048xi64, #blocked>
+      %29 = arith.addi %28, %4 : tensor<1x2048xi64, #blocked>  // absolute column index
+      %30 = arith.cmpi slt, %29, %cst_2 : tensor<1x2048xi64, #blocked>  // mask: column < 50257
+      %31 = arith.addi %29, %13 : tensor<1x2048xi64, #blocked>  // flat element index = column + row * 50257
+      %32 = tt.addptr %14, %31 : tensor<1x2048x!tt.ptr<f32, 1>, #blocked>, tensor<1x2048xi64, #blocked>
+      %33 = tt.load %32, %30, %cst_1 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xf32, #blocked>
+      %34 = arith.mulf %33, %19 : tensor<1x2048xf32, #blocked>
+      %35 = arith.addf %arg10, %34 : tensor<1x2048xf32, #blocked>
+      %36 = arith.select %30, %35, %arg10 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked>  // masked-off columns keep the previous accumulator
+      scf.yield %36 : tensor<1x2048xf32, #blocked>
+    }
+    %21 = "tt.reduce"(%20) <{axis = 1 : i32}> ({  // sum the 2048 partial sums along axis 1
+    ^bb0(%arg9: f32, %arg10: f32):
+      %27 = arith.addf %arg9, %arg10 : f32
+      tt.reduce.return %27 : f32
+    }) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked>
+    %23 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>
+    %24 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>
+    %25 = tt.broadcast %22 : (tensor<1x1xf32, #blocked>) -> tensor<1x2048xf32, #blocked>  // row sum broadcast to tile width
+    %26 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>
+    scf.for %arg9 = %c0_i32 to %c50257_i32 step %c2048_i32  : i32 {  // pass 2: compute and store the output row in 2048-column tiles
+      %27 = arith.extsi %arg9 : i32 to i64
+      %28 = tt.splat %27 : (i64) -> tensor<1x2048xi64, #blocked>
+      %29 = arith.addi %28, %4 : tensor<1x2048xi64, #blocked>  // absolute column index
+      %30 = arith.cmpi slt, %29, %cst_2 : tensor<1x2048xi64, #blocked>  // mask: column < 50257
+      %31 = arith.addi %29, %13 : tensor<1x2048xi64, #blocked>  // flat element index = column + row * 50257
+      %32 = tt.addptr %23, %31 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
+      %33 = tt.load %32, %30, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16, #blocked>
+      %34 = arith.extf %33 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked>  // %arg4 tile widened to f32
+      %35 = tt.addptr %14, %31 : tensor<1x2048x!tt.ptr<f32, 1>, #blocked>, tensor<1x2048xi64, #blocked>
+      %36 = tt.load %35, %30, %cst_1 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xf32, #blocked>  // %arg0 tile, read a second time
+      %37 = tt.addptr %24, %31 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
+      %38 = tt.load %37, %30, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16, #blocked>
+      %39 = arith.extf %38 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked>  // %arg5 tile widened to f32
+      %40 = arith.mulf %36, %19 : tensor<1x2048xf32, #blocked>  // scale * %arg0
+      %41 = math.exp %39 : tensor<1x2048xf32, #blocked>
+      %42 = arith.mulf %41, %25 : tensor<1x2048xf32, #blocked>  // exp(%arg5) * row sum
+      %43 = arith.subf %40, %42 : tensor<1x2048xf32, #blocked>
+      %44 = arith.addf %34, %43 : tensor<1x2048xf32, #blocked>  // %arg4 + (scale * %arg0 - exp(%arg5) * row sum)
+      %45 = tt.addptr %26, %31 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
+      %46 = arith.truncf %44 : tensor<1x2048xf32, #blocked> to tensor<1x2048xbf16, #blocked>
+      tt.store %45, %46, %30 {cache = 1 : i32, evict = 1 : i32} : tensor<1x2048xbf16, #blocked>  // masked store into %arg6
+    }
+    tt.return
+  }
+}
diff --git a/.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ttir b/.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..afa0c07dbfcaeddb384f05d34272cdb309bec890
--- /dev/null
+++ b/.triton/dump/55fe15065c2876112e70d87fa8bae3d1/triton_.ttir
@@ -0,0 +1,88 @@
+module {  // layout-free Triton IR for the kernel below
+  tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {  // per program id: sum scale*%arg0 over a 50257-wide row, then store %arg6 = %arg4 + (scale*%arg0 - exp(%arg5)*sum); %arg7 and %arg8 are not referenced in the body
+    %cst = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16>  // fill value for masked bf16 loads
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x1xf32>  // scale used when %arg1[pid] == -1
+    %c50257_i64 = arith.constant 50257 : i64  // row stride in elements
+    %c50257_i32 = arith.constant 50257 : i32  // loop upper bound
+    %c2048_i32 = arith.constant 2048 : i32  // loop step: columns handled per iteration
+    %c0_i32 = arith.constant 0 : i32
+    %cst_1 = arith.constant dense<50257> : tensor<1x2048xi64>  // row length used for the column mask
+    %cst_2 = arith.constant dense<-1> : tensor<1x1xi64>  // value %arg1[pid] is compared against
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32>  // initial accumulator and fill value for masked f32 loads
+    %0 = tt.get_program_id x : i32
+    %1 = arith.extsi %0 : i32 to i64  // program id doubles as the row index
+    %2 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32>
+    %3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<2048xi32>) -> tensor<1x2048xi32>
+    %4 = arith.extsi %3 : tensor<1x2048xi32> to tensor<1x2048xi64>  // column offsets 0..2047 within one loop iteration
+    %5 = tt.addptr %arg1, %1 : !tt.ptr<i64, 1>, i64
+    %6 = tt.splat %5 : (!tt.ptr<i64, 1>) -> tensor<1x1x!tt.ptr<i64, 1>>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x1xi64>  // %arg1[pid]
+    %8 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
+    %9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32  // scalar at %arg2[0]
+    %10 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
+    %11 = tt.load %10 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32  // scalar at %arg3[0]
+    %12 = arith.muli %1, %c50257_i64 : i64  // element index where this program's row starts
+    %13 = tt.splat %12 : (i64) -> tensor<1x2048xi64>
+    %14 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>>
+    %15 = arith.cmpi ne, %7, %cst_2 : tensor<1x1xi64>  // true when %arg1[pid] != -1
+    %16 = arith.divf %9, %11 : f32
+    %17 = tt.splat %16 : (f32) -> tensor<1x1xf32>
+    %18 = arith.select %15, %17, %cst_0 : tensor<1x1xi1>, tensor<1x1xf32>  // scale = %arg2[0] / %arg3[0], or 0.0 when %arg1[pid] == -1
+    %19 = tt.broadcast %18 : (tensor<1x1xf32>) -> tensor<1x2048xf32>
+    %20 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg10 = %cst_3) -> (tensor<1x2048xf32>)  : i32 {  // pass 1: accumulate scale * %arg0[row, col] in 2048-column tiles
+      %35 = arith.extsi %arg9 : i32 to i64
+      %36 = tt.splat %35 : (i64) -> tensor<1x2048xi64>
+      %37 = arith.addi %36, %4 : tensor<1x2048xi64>  // absolute column index
+      %38 = arith.cmpi slt, %37, %cst_1 : tensor<1x2048xi64>  // mask: column < 50257
+      %39 = arith.addi %37, %13 : tensor<1x2048xi64>  // flat element index = column + row * 50257
+      %40 = tt.addptr %14, %39 : tensor<1x2048x!tt.ptr<f32, 1>>, tensor<1x2048xi64>
+      %41 = tt.load %40, %38, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xf32>
+      %42 = arith.mulf %41, %19 : tensor<1x2048xf32>
+      %43 = arith.addf %arg10, %42 : tensor<1x2048xf32>
+      %44 = arith.select %38, %43, %arg10 : tensor<1x2048xi1>, tensor<1x2048xf32>  // masked-off columns keep the previous accumulator
+      scf.yield %44 : tensor<1x2048xf32>
+    }
+    %21 = "tt.reduce"(%20) <{axis = 1 : i32}> ({  // sum the 2048 partial sums along axis 1
+    ^bb0(%arg9: f32, %arg10: f32):
+      %35 = arith.addf %arg9, %arg10 : f32
+      tt.reduce.return %35 : f32
+    }) : (tensor<1x2048xf32>) -> tensor<1xf32>
+    %22 = tt.expand_dims %21 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32>
+    %23 = arith.muli %1, %c50257_i64 : i64  // same operands as %12
+    %24 = tt.splat %23 : (i64) -> tensor<1x2048xi64>
+    %25 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>>
+    %26 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1x2048x!tt.ptr<f32, 1>>  // same operands as %14
+    %27 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>>
+    %28 = arith.cmpi ne, %7, %cst_2 : tensor<1x1xi64>  // same operands as %15
+    %29 = arith.divf %9, %11 : f32  // same operands as %16
+    %30 = tt.splat %29 : (f32) -> tensor<1x1xf32>
+    %31 = arith.select %28, %30, %cst_0 : tensor<1x1xi1>, tensor<1x1xf32>  // scale, recomputed exactly as %18
+    %32 = tt.broadcast %31 : (tensor<1x1xf32>) -> tensor<1x2048xf32>
+    %33 = tt.broadcast %22 : (tensor<1x1xf32>) -> tensor<1x2048xf32>  // row sum broadcast to tile width
+    %34 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>>
+    scf.for %arg9 = %c0_i32 to %c50257_i32 step %c2048_i32  : i32 {  // pass 2: compute and store the output row in 2048-column tiles
+      %35 = arith.extsi %arg9 : i32 to i64
+      %36 = tt.splat %35 : (i64) -> tensor<1x2048xi64>
+      %37 = arith.addi %36, %4 : tensor<1x2048xi64>  // absolute column index
+      %38 = arith.cmpi slt, %37, %cst_1 : tensor<1x2048xi64>  // mask: column < 50257
+      %39 = arith.addi %37, %24 : tensor<1x2048xi64>  // flat element index = column + row * 50257
+      %40 = tt.addptr %25, %39 : tensor<1x2048x!tt.ptr<bf16, 1>>, tensor<1x2048xi64>
+      %41 = tt.load %40, %38, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16>
+      %42 = arith.extf %41 : tensor<1x2048xbf16> to tensor<1x2048xf32>  // %arg4 tile widened to f32
+      %43 = tt.addptr %26, %39 : tensor<1x2048x!tt.ptr<f32, 1>>, tensor<1x2048xi64>
+      %44 = tt.load %43, %38, %cst_3 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xf32>  // %arg0 tile, read a second time
+      %45 = tt.addptr %27, %39 : tensor<1x2048x!tt.ptr<bf16, 1>>, tensor<1x2048xi64>
+      %46 = tt.load %45, %38, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16>
+      %47 = arith.extf %46 : tensor<1x2048xbf16> to tensor<1x2048xf32>  // %arg5 tile widened to f32
+      %48 = arith.mulf %44, %32 : tensor<1x2048xf32>  // scale * %arg0
+      %49 = math.exp %47 : tensor<1x2048xf32>
+      %50 = arith.mulf %49, %33 : tensor<1x2048xf32>  // exp(%arg5) * row sum
+      %51 = arith.subf %48, %50 : tensor<1x2048xf32>
+      %52 = arith.addf %42, %51 : tensor<1x2048xf32>  // %arg4 + (scale * %arg0 - exp(%arg5) * row sum)
+      %53 = tt.addptr %34, %39 : tensor<1x2048x!tt.ptr<bf16, 1>>, tensor<1x2048xi64>
+      %54 = arith.truncf %52 : tensor<1x2048xf32> to tensor<1x2048xbf16>
+      tt.store %53, %54, %38 {cache = 1 : i32, evict = 1 : i32} : tensor<1x2048xbf16>  // masked store into %arg6
+    }
+    tt.return
+  }
+}
diff --git a/.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.cubin b/.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..595d27e57758bdf99ce286dd09a3757ee3eb2ef6
Binary files /dev/null and b/.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.cubin differ
diff --git a/.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.llir b/.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..f1368681e281798baeac28ac98a33e51b3c38742
--- /dev/null
+++ b/.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.llir
@@ -0,0 +1,55 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 { ; element-wise bf16 -> f32 copy: %0 = 16-bit source, %1 = f32 destination; %2 is never referenced in the body
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %5 = shl i32 %4, 1, !dbg !8 ; each thread handles 2 consecutive elements
+  %6 = and i32 %5, 510, !dbg !8 ; in-block offset: an even value in [0, 510]
+  %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
+  %8 = shl i32 %7, 9, !dbg !10 ; block base = ctaid.x * 512
+  %9 = or i32 %8, %6, !dbg !11 ; global element index; `or` equals add here because the low 9 bits of %8 are zero
+  %10 = icmp slt i32 %9, 12865792, !dbg !12 ; bounds mask against the hard-coded element count
+  %11 = sext i32 %9 to i64, !dbg !13
+  %12 = getelementptr i16, ptr addrspace(1) %0, i64 %11, !dbg !13 ; address of this thread's pair of 16-bit inputs
+  %13 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %12, i1 %10) #1, !dbg !14 ; predicated 32-bit load of both inputs at once; the result stays 0 when %10 is false
+  %14 = trunc i32 %13 to i16, !dbg !14 ; low 16 bits of the loaded word
+  %extelt.offset = lshr i32 %13, 16, !dbg !14
+  %15 = trunc i32 %extelt.offset to i16, !dbg !14 ; high 16 bits of the loaded word
+  %16 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %14) #1, !dbg !15 ; widen bf16 -> f32
+  %17 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %15) #1, !dbg !15
+  %18 = getelementptr float, ptr addrspace(1) %1, i64 %11, !dbg !16 ; same element index, f32-sized stride
+  %19 = bitcast float %16 to i32, !dbg !17 ; the store asm takes its operands as 32-bit integer registers
+  %20 = bitcast float %17 to i32, !dbg !17
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %19, i32 %20, ptr addrspace(1) %18, i1 %10) #1, !dbg !17 ; predicated 2 x 32-bit vector store, guarded by the same mask as the load
+  ret void, !dbg !18
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "cmxm2obucqff2z4vc55zcnscfuvur5s2b3e36dvgm57qobanlpho.py", directory: "/tmp/torchinductor_root/mx")
+!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
+!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 36, scope: !5)
+!9 = !DILocation(line: 20, column: 28, scope: !5)
+!10 = !DILocation(line: 20, column: 33, scope: !5)
+!11 = !DILocation(line: 21, column: 23, scope: !5)
+!12 = !DILocation(line: 22, column: 21, scope: !5)
+!13 = !DILocation(line: 24, column: 30, scope: !5)
+!14 = !DILocation(line: 24, column: 35, scope: !5)
+!15 = !DILocation(line: 24, column: 45, scope: !5)
+!16 = !DILocation(line: 26, column: 25, scope: !5)
+!17 = !DILocation(line: 26, column: 36, scope: !5)
+!18 = !DILocation(line: 26, column: 4, scope: !5)
diff --git a/.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ptx b/.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..a7c8199997c2c36c59766487e45b28ec7a9ac9df
--- /dev/null
+++ b/.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ptx
@@ -0,0 +1,297 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2de
+
+.visible .entry triton__0d1d2de( // element-wise bf16 -> f32 copy, 2 elements per thread
+	.param .u64 triton__0d1d2de_param_0, // source pointer (read below as packed 16-bit pairs)
+	.param .u64 triton__0d1d2de_param_1, // destination pointer (written below as 32-bit values)
+	.param .u32 triton__0d1d2de_param_2 // never loaded in this body
+)
+.maxntid 256, 1, 1
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<3>;
+	.reg .b32 	%r<12>;
+	.reg .b64 	%rd<7>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd3, [triton__0d1d2de_param_0];
+	ld.param.u64 	%rd4, [triton__0d1d2de_param_1];
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r7, %tid.x;
+	shl.b32 	%r8, %r7, 1; // 2 elements per thread
+	and.b32  	%r9, %r8, 510; // in-block offset: even value in [0, 510]
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	shl.b32 	%r10, %r1, 9; // block base = ctaid.x * 512
+	.loc	1 21 23
+	or.b32  	%r11, %r10, %r9; // global element index (low 9 bits of %r10 are zero, so or == add)
+	.loc	1 22 21
+	setp.lt.s32 	%p1, %r11, 12865792; // bounds predicate against the hard-coded element count
+	.loc	1 24 30
+	mul.wide.s32 	%rd5, %r11, 2; // byte offset for 2-byte source elements
+	add.s64 	%rd1, %rd3, %rd5;
+	.loc	1 24 35
+	mov.u32 %r2, 0x0; // value kept when the predicated load is skipped
+	@%p1 ld.global.b32 { %r2 }, [ %rd1 + 0 ];
+	cvt.u16.u32 	%rs1, %r2; // low 16 bits of the loaded word
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } // high 16 bits of the loaded word
+	.loc	1 24 45
+	cvt.f32.bf16 %r5, %rs1;
+	cvt.f32.bf16 %r6, %rs2;
+	.loc	1 26 25
+	mul.wide.s32 	%rd6, %r11, 4; // byte offset for 4-byte destination elements
+	add.s64 	%rd2, %rd4, %rd6;
+	.loc	1 26 36
+	@%p1 st.global.v2.b32 [ %rd2 + 0 ], { %r5, %r6 }; // store guarded by the same predicate as the load
+	.loc	1 26 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+
+}
+	.file	1 "/tmp/torchinductor_root/mx/cmxm2obucqff2z4vc55zcnscfuvur5s2b3e36dvgm57qobanlpho.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 176
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 109
+.b8 120
+.b8 109
+.b8 50
+.b8 111
+.b8 98
+.b8 117
+.b8 99
+.b8 113
+.b8 102
+.b8 102
+.b8 50
+.b8 122
+.b8 52
+.b8 118
+.b8 99
+.b8 53
+.b8 53
+.b8 122
+.b8 99
+.b8 110
+.b8 115
+.b8 99
+.b8 102
+.b8 117
+.b8 118
+.b8 117
+.b8 114
+.b8 53
+.b8 115
+.b8 50
+.b8 98
+.b8 51
+.b8 101
+.b8 51
+.b8 54
+.b8 100
+.b8 118
+.b8 103
+.b8 109
+.b8 53
+.b8 55
+.b8 113
+.b8 111
+.b8 98
+.b8 97
+.b8 110
+.b8 108
+.b8 112
+.b8 104
+.b8 111
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 109
+.b8 120
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ttgir b/.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..84b7fa53fb0830b2e8da8a3282c22b0869284d1d
--- /dev/null
+++ b/.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ttgir
@@ -0,0 +1,21 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> // 2 elements/thread x 32 threads/warp x 8 warps = 512 elements
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // element-wise bf16 -> f32 copy from %arg0 to %arg1; %arg2 is not used in the body
+    %cst = arith.constant dense<12865792> : tensor<512xi32, #blocked> // element-count bound used for the mask
+    %c512_i32 = arith.constant 512 : i32 // elements handled per program
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32 // first element index of this program
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<512xi32, #blocked> // per-element global indices
+    %5 = arith.cmpi slt, %4, %cst : tensor<512xi32, #blocked> // in-bounds mask shared by the load and the store
+    %6 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
+    %7 = tt.addptr %6, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
+    %8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked> // masked load, no explicit `other` operand
+    %9 = arith.extf %8 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> // widen bf16 -> f32
+    %10 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
+    %11 = tt.addptr %10, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
+    tt.store %11, %9, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked> // masked store at the same indices
+    tt.return
+  }
+}
diff --git a/.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ttir b/.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..548532a471f26dd7b1204aaa93cbf75ba717b9e6
--- /dev/null
+++ b/.triton/dump/7264a35f8f1de26b089f0a94e23a0d84/triton_.ttir
@@ -0,0 +1,20 @@
+module {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // element-wise bf16 -> f32 copy from %arg0 to %arg1; %arg2 is not used in the body
+    %cst = arith.constant dense<12865792> : tensor<512xi32> // element-count bound used for the mask
+    %c512_i32 = arith.constant 512 : i32 // elements handled per program
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32 // first element index of this program
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32>
+    %4 = arith.addi %3, %2 : tensor<512xi32> // per-element global indices
+    %5 = arith.cmpi slt, %4, %cst : tensor<512xi32> // in-bounds mask shared by the load and the store
+    %6 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
+    %7 = tt.addptr %6, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
+    %8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16> // masked load, no explicit `other` operand
+    %9 = arith.extf %8 : tensor<512xbf16> to tensor<512xf32> // widen bf16 -> f32
+    %10 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
+    %11 = tt.addptr %10, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
+    tt.store %11, %9, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32> // masked store at the same indices
+    tt.return
+  }
+}
diff --git a/.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.cubin b/.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..312df032c9e8f8d5740dfb3e455ae82e854033ea
Binary files /dev/null and b/.triton/dump/76fb48b96c75cb8e388c291a18ef9b02/triton_.cubin differ
diff --git a/.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.cubin b/.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..f72b84bb4bcc5a2d962638e11d243e8f715d952f
Binary files /dev/null and b/.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.cubin differ
diff --git a/.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.llir b/.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..6a65b33132556fa06684d28ead590318ed49e80f
--- /dev/null
+++ b/.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.llir
@@ -0,0 +1,48 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+define void @triton__0d1de(ptr addrspace(1) %0, i64 %1) local_unnamed_addr !dbg !5 { ; writes zeros to 1024 consecutive 32-bit elements of %0 per block; %1 is never referenced in the body
+  %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %4 = shl i32 %3, 2, !dbg !8 ; 4 consecutive elements per thread per store
+  %5 = and i32 %4, 508, !dbg !8 ; first chunk offset: multiple of 4 in [0, 508]
+  %6 = or i32 %5, 512, !dbg !8 ; second chunk offset = first + 512 (bit 9 of %5 is always clear)
+  %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
+  %8 = sext i32 %7 to i64, !dbg !10 ; index math is done in 64 bits from here on
+  %9 = shl nsw i64 %8, 10, !dbg !11 ; block base = ctaid.x * 1024
+  %10 = zext nneg i32 %5 to i64
+  %11 = zext nneg i32 %6 to i64
+  %12 = or i64 %9, %10, !dbg !12 ; `or` equals add because the low 10 bits of %9 are zero
+  %13 = or i64 %9, %11, !dbg !12
+  %14 = getelementptr float, ptr addrspace(1) %0, i64 %12, !dbg !13
+  %15 = getelementptr float, ptr addrspace(1) %0, i64 %13, !dbg !13
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %14, i1 true) #1, !dbg !14 ; predicate is the constant true: the store is unmasked
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 0, i32 0, i32 0, i32 0, ptr addrspace(1) %15, i1 true) #1, !dbg !14
+  ret void, !dbg !15
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "cpkw3bdoamlgzvqjeyuk34b3jcjf57htisara7lukflexo3t22ew.py", directory: "/tmp/torchinductor_root/pk")
+!3 = !{ptr @triton__0d1de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1de, !"maxntidx", i32 128}
+!5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 36, scope: !5)
+!9 = !DILocation(line: 20, column: 28, scope: !5)
+!10 = !DILocation(line: 20, column: 34, scope: !5)
+!11 = !DILocation(line: 20, column: 46, scope: !5)
+!12 = !DILocation(line: 21, column: 23, scope: !5)
+!13 = !DILocation(line: 25, column: 25, scope: !5)
+!14 = !DILocation(line: 25, column: 36, scope: !5)
+!15 = !DILocation(line: 25, column: 4, scope: !5)
diff --git a/.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ptx b/.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..b7d1d4fd6aa2b35a87b85c220f6abdb861ef8b49
--- /dev/null
+++ b/.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ptx
@@ -0,0 +1,280 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1de
+
+.visible .entry triton__0d1de( // zero-fill kernel: each thread issues two 16-byte stores of zeros
+	.param .u64 triton__0d1de_param_0, // destination pointer
+	.param .u64 triton__0d1de_param_1 // never loaded in this body
+)
+.maxntid 128, 1, 1
+{
+	.reg .pred 	%p<3>;
+	.reg .b32 	%r<13>;
+	.reg .b64 	%rd<8>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd3, [triton__0d1de_param_0];
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r10, %tid.x;
+	shl.b32 	%r11, %r10, 2; // 4 elements per thread per store
+	and.b32  	%r12, %r11, 508; // in-block offset: multiple of 4 in [0, 508]
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 46
+	mul.wide.s32 	%rd4, %r1, 1024; // 64-bit block base = ctaid.x * 1024
+	cvt.u64.u32 	%rd5, %r12;
+	.loc	1 21 23
+	or.b64  	%rd6, %rd4, %rd5; // element index (low 10 bits of %rd4 are zero, so or == add)
+	.loc	1 25 25
+	shl.b64 	%rd7, %rd6, 2; // element index -> byte offset (4-byte elements)
+	add.s64 	%rd1, %rd3, %rd7;
+	add.s64 	%rd2, %rd1, 2048; // second chunk: 2048 bytes = 512 four-byte elements further on
+	mov.b32 	%r2, 0; // the value being stored
+	mov.pred 	%p1, -1; // always-true predicate: both stores are unmasked
+	.loc	1 25 36
+	@%p1 st.global.v4.b32 [ %rd1 + 0 ], { %r2, %r2, %r2, %r2 };
+	@%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r2, %r2, %r2, %r2 };
+	.loc	1 25 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+
+}
+	.file	1 "/tmp/torchinductor_root/pk/cpkw3bdoamlgzvqjeyuk34b3jcjf57htisara7lukflexo3t22ew.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 172
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 112
+.b8 107
+.b8 119
+.b8 51
+.b8 98
+.b8 100
+.b8 111
+.b8 97
+.b8 109
+.b8 108
+.b8 103
+.b8 122
+.b8 118
+.b8 113
+.b8 106
+.b8 101
+.b8 121
+.b8 117
+.b8 107
+.b8 51
+.b8 52
+.b8 98
+.b8 51
+.b8 106
+.b8 99
+.b8 106
+.b8 102
+.b8 53
+.b8 55
+.b8 104
+.b8 116
+.b8 105
+.b8 115
+.b8 97
+.b8 114
+.b8 97
+.b8 55
+.b8 108
+.b8 117
+.b8 107
+.b8 102
+.b8 108
+.b8 101
+.b8 120
+.b8 111
+.b8 51
+.b8 116
+.b8 50
+.b8 50
+.b8 101
+.b8 119
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 112
+.b8 107
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 176
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 176
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttgir b/.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..96eac6da6ca9ddd4829131695b36983e3cb3be42
--- /dev/null
+++ b/.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttgir
@@ -0,0 +1,18 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> // 4 elements/thread x 32 threads/warp x 4 warps = 512 elements
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // fills %arg0 with 0.0 in chunks of 1024 f32 per program; %arg1 is not used in the body
+    %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32, #blocked> // the fill value
+    %c1024_i64 = arith.constant 1024 : i64 // elements handled per program
+    %0 = tt.get_program_id x : i32
+    %1 = arith.extsi %0 : i32 to i64 // widen before multiplying so the index math is 64-bit
+    %2 = arith.muli %1, %c1024_i64 : i64 // first element index of this program
+    %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
+    %4 = arith.extsi %3 : tensor<1024xi32, #blocked> to tensor<1024xi64, #blocked>
+    %5 = tt.splat %2 : (i64) -> tensor<1024xi64, #blocked>
+    %6 = arith.addi %5, %4 : tensor<1024xi64, #blocked> // per-element global indices
+    %7 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
+    %8 = tt.addptr %7, %6 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi64, #blocked>
+    tt.store %8, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked> // no mask operand: every element is written
+    tt.return
+  }
+}
diff --git a/.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttir b/.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..a04ad693ad4ec770239a238f729cf24dc07547e7
--- /dev/null
+++ b/.triton/dump/884b5df35d2a25fd91308249e7657806/triton_.ttir
@@ -0,0 +1,17 @@
+module {
+  tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // fills %arg0 with 0.0 in chunks of 1024 f32 per program; %arg1 is not used in the body
+    %c1024_i64 = arith.constant 1024 : i64 // elements handled per program
+    %cst = arith.constant dense<0.000000e+00> : tensor<1024xf32> // the fill value
+    %0 = tt.get_program_id x : i32
+    %1 = arith.extsi %0 : i32 to i64 // widen before multiplying so the index math is 64-bit
+    %2 = arith.muli %1, %c1024_i64 : i64 // first element index of this program
+    %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
+    %4 = arith.extsi %3 : tensor<1024xi32> to tensor<1024xi64>
+    %5 = tt.splat %2 : (i64) -> tensor<1024xi64>
+    %6 = arith.addi %5, %4 : tensor<1024xi64> // per-element global indices
+    %7 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
+    %8 = tt.addptr %7, %6 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi64>
+    tt.store %8, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32> // no mask operand: every element is written
+    tt.return
+  }
+}
diff --git a/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.llir b/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..5a3dbda459cef5c45b27655c23d3e096a901c49f
--- /dev/null
+++ b/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.llir
@@ -0,0 +1,166 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
+
+define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 { ; element-wise bf16 -> f32 copy, 1024 elements per block, re-distributed across threads through @global_smem between load and store; %2 is never referenced in the body
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %5 = and i32 %4, 127, !dbg !8
+  %6 = shl nuw nsw i32 %5, 3, !dbg !8 ; load-side offset: 8 consecutive elements per thread
+  %7 = shl nuw nsw i32 %5, 2, !dbg !8 ; store-side offset, first group: 4 consecutive elements per thread
+  %8 = or i32 %7, 512, !dbg !8 ; store-side offset, second group: %7 + 512 (bit 9 of %7 is always clear)
+  %9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !9
+  %10 = shl i32 %9, 10, !dbg !10 ; block base = ctaid.x * 1024
+  %11 = or i32 %10, %6, !dbg !11 ; global index used by the load
+  %12 = or i32 %10, %7, !dbg !11 ; global index used by the first store
+  %13 = or i32 %10, %8, !dbg !11 ; global index used by the second store
+  %14 = icmp slt i32 %11, 12865792, !dbg !12 ; bounds mask for the load
+  %15 = icmp slt i32 %12, 12865792, !dbg !12 ; bounds mask for the first store
+  %16 = icmp slt i32 %13, 12865792, !dbg !12 ; bounds mask for the second store
+  %17 = sext i32 %11 to i64, !dbg !13
+  %18 = getelementptr i16, ptr addrspace(1) %0, i64 %17, !dbg !13
+  %19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];", "=r,=r,=r,=r,l,b"(ptr addrspace(1) %18, i1 %14) #2, !dbg !14 ; predicated 4 x 32-bit load = 8 packed 16-bit inputs; all four results stay 0 when %14 is false
+  %20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !14
+  %21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !14
+  %22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !14
+  %23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !14
+  %24 = trunc i32 %20 to i16, !dbg !14 ; unpack: low half of each word via trunc, high half via lshr 16 + trunc
+  %extelt.offset = lshr i32 %20, 16, !dbg !14
+  %25 = trunc i32 %extelt.offset to i16, !dbg !14
+  %26 = trunc i32 %21 to i16, !dbg !14
+  %extelt.offset1 = lshr i32 %21, 16, !dbg !14
+  %27 = trunc i32 %extelt.offset1 to i16, !dbg !14
+  %28 = trunc i32 %22 to i16, !dbg !14
+  %extelt.offset2 = lshr i32 %22, 16, !dbg !14
+  %29 = trunc i32 %extelt.offset2 to i16, !dbg !14
+  %30 = trunc i32 %23 to i16, !dbg !14
+  %extelt.offset3 = lshr i32 %23, 16, !dbg !14
+  %31 = trunc i32 %extelt.offset3 to i16, !dbg !14
+  %32 = zext nneg i32 %6 to i64, !dbg !15
+  %33 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %32, !dbg !15 ; shared-memory slot for element %6 + 0; the next seven blocks fill %6 + 1 .. %6 + 7
+  %34 = insertelement <1 x i16> undef, i16 %24, i64 0, !dbg !15
+  store <1 x i16> %34, ptr addrspace(3) %33, align 2, !dbg !15
+  %35 = or i32 %6, 1, !dbg !15
+  %36 = zext nneg i32 %35 to i64, !dbg !15
+  %37 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %36, !dbg !15
+  %38 = insertelement <1 x i16> undef, i16 %25, i64 0, !dbg !15
+  store <1 x i16> %38, ptr addrspace(3) %37, align 2, !dbg !15
+  %39 = or i32 %6, 2, !dbg !15
+  %40 = zext nneg i32 %39 to i64, !dbg !15
+  %41 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %40, !dbg !15
+  %42 = insertelement <1 x i16> undef, i16 %26, i64 0, !dbg !15
+  store <1 x i16> %42, ptr addrspace(3) %41, align 2, !dbg !15
+  %43 = or i32 %6, 3, !dbg !15
+  %44 = zext nneg i32 %43 to i64, !dbg !15
+  %45 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %44, !dbg !15
+  %46 = insertelement <1 x i16> undef, i16 %27, i64 0, !dbg !15
+  store <1 x i16> %46, ptr addrspace(3) %45, align 2, !dbg !15
+  %47 = or i32 %6, 4, !dbg !15
+  %48 = zext nneg i32 %47 to i64, !dbg !15
+  %49 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %48, !dbg !15
+  %50 = insertelement <1 x i16> undef, i16 %28, i64 0, !dbg !15
+  store <1 x i16> %50, ptr addrspace(3) %49, align 2, !dbg !15
+  %51 = or i32 %6, 5, !dbg !15
+  %52 = zext nneg i32 %51 to i64, !dbg !15
+  %53 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %52, !dbg !15
+  %54 = insertelement <1 x i16> undef, i16 %29, i64 0, !dbg !15
+  store <1 x i16> %54, ptr addrspace(3) %53, align 2, !dbg !15
+  %55 = or i32 %6, 6, !dbg !15
+  %56 = zext nneg i32 %55 to i64, !dbg !15
+  %57 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %56, !dbg !15
+  %58 = insertelement <1 x i16> undef, i16 %30, i64 0, !dbg !15
+  store <1 x i16> %58, ptr addrspace(3) %57, align 2, !dbg !15
+  %59 = or i32 %6, 7, !dbg !15
+  %60 = zext nneg i32 %59 to i64, !dbg !15
+  %61 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %60, !dbg !15
+  %62 = insertelement <1 x i16> undef, i16 %31, i64 0, !dbg !15
+  store <1 x i16> %62, ptr addrspace(3) %61, align 2, !dbg !15
+  tail call void @llvm.nvvm.barrier0(), !dbg !15 ; barrier between the shared-memory writes above and the reads below, which use different per-thread indices
+  %63 = zext nneg i32 %7 to i64, !dbg !15
+  %64 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %63, !dbg !15 ; read back elements %7 + 0 .. %7 + 3 (first store group)
+  %65 = load i16, ptr addrspace(3) %64, align 2, !dbg !15
+  %66 = or i32 %7, 1, !dbg !15
+  %67 = zext nneg i32 %66 to i64, !dbg !15
+  %68 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %67, !dbg !15
+  %69 = load i16, ptr addrspace(3) %68, align 2, !dbg !15
+  %70 = or i32 %7, 2, !dbg !15
+  %71 = zext nneg i32 %70 to i64, !dbg !15
+  %72 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %71, !dbg !15
+  %73 = load i16, ptr addrspace(3) %72, align 2, !dbg !15
+  %74 = or i32 %7, 3, !dbg !15
+  %75 = zext nneg i32 %74 to i64, !dbg !15
+  %76 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %75, !dbg !15
+  %77 = load i16, ptr addrspace(3) %76, align 2, !dbg !15
+  %78 = zext nneg i32 %8 to i64, !dbg !15
+  %79 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %78, !dbg !15 ; read back elements %8 + 0 .. %8 + 3 (second store group)
+  %80 = load i16, ptr addrspace(3) %79, align 2, !dbg !15
+  %81 = or i32 %7, 513, !dbg !15 ; same as %8 + 1: the low two bits and bit 9 of %7 are clear
+  %82 = zext nneg i32 %81 to i64, !dbg !15
+  %83 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %82, !dbg !15
+  %84 = load i16, ptr addrspace(3) %83, align 2, !dbg !15
+  %85 = or i32 %7, 514, !dbg !15
+  %86 = zext nneg i32 %85 to i64, !dbg !15
+  %87 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %86, !dbg !15
+  %88 = load i16, ptr addrspace(3) %87, align 2, !dbg !15
+  %89 = or i32 %7, 515, !dbg !15
+  %90 = zext nneg i32 %89 to i64, !dbg !15
+  %91 = getelementptr i16, ptr addrspace(3) @global_smem, i64 %90, !dbg !15
+  %92 = load i16, ptr addrspace(3) %91, align 2, !dbg !15
+  %93 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %65) #2, !dbg !15 ; widen each of the 8 re-distributed values bf16 -> f32
+  %94 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %69) #2, !dbg !15
+  %95 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #2, !dbg !15
+  %96 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %77) #2, !dbg !15
+  %97 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %80) #2, !dbg !15
+  %98 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %84) #2, !dbg !15
+  %99 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %88) #2, !dbg !15
+  %100 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %92) #2, !dbg !15
+  %101 = sext i32 %12 to i64, !dbg !16
+  %102 = getelementptr float, ptr addrspace(1) %1, i64 %101, !dbg !16 ; destination of the first store group
+  %103 = sext i32 %13 to i64, !dbg !16
+  %104 = getelementptr float, ptr addrspace(1) %1, i64 %103, !dbg !16 ; destination of the second store group
+  %105 = bitcast float %93 to i32, !dbg !17 ; the store asm takes its operands as 32-bit integer registers
+  %106 = bitcast float %94 to i32, !dbg !17
+  %107 = bitcast float %95 to i32, !dbg !17
+  %108 = bitcast float %96 to i32, !dbg !17
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %105, i32 %106, i32 %107, i32 %108, ptr addrspace(1) %102, i1 %15) #2, !dbg !17 ; predicated 4 x 32-bit store, masked by %15
+  %109 = bitcast float %97 to i32, !dbg !17
+  %110 = bitcast float %98 to i32, !dbg !17
+  %111 = bitcast float %99 to i32, !dbg !17
+  %112 = bitcast float %100 to i32, !dbg !17
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %109, i32 %110, i32 %111, i32 %112, ptr addrspace(1) %104, i1 %16) #2, !dbg !17 ; predicated 4 x 32-bit store, masked by %16
+  ret void, !dbg !18
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #1
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "cmxm2obucqff2z4vc55zcnscfuvur5s2b3e36dvgm57qobanlpho.py", directory: "/tmp/torchinductor_root/mx")
+!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
+!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 36, scope: !5)
+!9 = !DILocation(line: 20, column: 28, scope: !5)
+!10 = !DILocation(line: 20, column: 33, scope: !5)
+!11 = !DILocation(line: 21, column: 23, scope: !5)
+!12 = !DILocation(line: 22, column: 21, scope: !5)
+!13 = !DILocation(line: 24, column: 30, scope: !5)
+!14 = !DILocation(line: 24, column: 35, scope: !5)
+!15 = !DILocation(line: 24, column: 45, scope: !5)
+!16 = !DILocation(line: 26, column: 25, scope: !5)
+!17 = !DILocation(line: 26, column: 36, scope: !5)
+!18 = !DILocation(line: 26, column: 4, scope: !5)
diff --git a/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ptx b/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..8fe623da5aae0d50e3d21c940e013616afe33d21
--- /dev/null
+++ b/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ptx
@@ -0,0 +1,342 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2de
+.extern .shared .align 1 .b8 global_smem[];
+
+.visible .entry triton__0d1d2de(
+	.param .u64 triton__0d1d2de_param_0,
+	.param .u64 triton__0d1d2de_param_1,
+	.param .u32 triton__0d1d2de_param_2
+)
+.maxntid 128, 1, 1
+{
+	.reg .pred 	%p<4>;
+	.reg .b16 	%rs<9>;
+	.reg .b32 	%r<38>;
+	.reg .b64 	%rd<13>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd4, [triton__0d1d2de_param_0];
+	ld.param.u64 	%rd5, [triton__0d1d2de_param_1];
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r22, %tid.x;
+	and.b32  	%r23, %r22, 127;
+	shl.b32 	%r24, %r23, 3;
+	shl.b32 	%r25, %r23, 2;
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	shl.b32 	%r26, %r1, 10;
+	.loc	1 21 23
+	or.b32  	%r27, %r26, %r24;
+	or.b32  	%r28, %r26, %r25;
+	or.b32  	%r29, %r28, 512;
+	.loc	1 22 21
+	setp.lt.s32 	%p1, %r27, 12865792;
+	setp.lt.s32 	%p2, %r28, 12865792;
+	setp.lt.s32 	%p3, %r29, 12865792;
+	.loc	1 24 30
+	mul.wide.s32 	%rd6, %r27, 2;
+	add.s64 	%rd1, %rd4, %rd6;
+	.loc	1 24 35
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
+	shr.u32 	%r30, %r2, 16;
+	shr.u32 	%r31, %r3, 16;
+	shr.u32 	%r32, %r4, 16;
+	shr.u32 	%r33, %r5, 16;
+	.loc	1 24 45
+	shl.b32 	%r34, %r23, 4;
+	mov.u32 	%r35, global_smem;
+	add.s32 	%r36, %r35, %r34;
+	st.shared.u16 	[%r36], %r2;
+	st.shared.u16 	[%r36+2], %r30;
+	st.shared.u16 	[%r36+4], %r3;
+	st.shared.u16 	[%r36+6], %r31;
+	st.shared.u16 	[%r36+8], %r4;
+	st.shared.u16 	[%r36+10], %r32;
+	st.shared.u16 	[%r36+12], %r5;
+	st.shared.u16 	[%r36+14], %r33;
+	bar.sync 	0;
+	add.s32 	%r37, %r35, %r24;
+	ld.shared.u16 	%rs1, [%r37];
+	ld.shared.u16 	%rs2, [%r37+2];
+	ld.shared.u16 	%rs3, [%r37+4];
+	ld.shared.u16 	%rs4, [%r37+6];
+	ld.shared.u16 	%rs5, [%r37+1024];
+	ld.shared.u16 	%rs6, [%r37+1026];
+	ld.shared.u16 	%rs7, [%r37+1028];
+	ld.shared.u16 	%rs8, [%r37+1030];
+	cvt.f32.bf16 %r14, %rs1;
+	cvt.f32.bf16 %r15, %rs2;
+	cvt.f32.bf16 %r16, %rs3;
+	cvt.f32.bf16 %r17, %rs4;
+	cvt.f32.bf16 %r18, %rs5;
+	cvt.f32.bf16 %r19, %rs6;
+	cvt.f32.bf16 %r20, %rs7;
+	cvt.f32.bf16 %r21, %rs8;
+	.loc	1 26 25
+	mul.wide.s32 	%rd7, %r28, 4;
+	add.s64 	%rd2, %rd5, %rd7;
+	cvt.s64.s32 	%rd8, %r26;
+	cvt.u64.u32 	%rd9, %r25;
+	or.b64  	%rd10, %rd8, %rd9;
+	shl.b64 	%rd11, %rd10, 2;
+	add.s64 	%rd12, %rd5, %rd11;
+	add.s64 	%rd3, %rd12, 2048;
+	.loc	1 26 36
+	@%p2 st.global.v4.b32 [ %rd2 + 0 ], { %r14, %r15, %r16, %r17 };
+	@%p3 st.global.v4.b32 [ %rd3 + 0 ], { %r18, %r19, %r20, %r21 };
+	.loc	1 26 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+
+}
+	.file	1 "/tmp/torchinductor_root/mx/cmxm2obucqff2z4vc55zcnscfuvur5s2b3e36dvgm57qobanlpho.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 176
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 109
+.b8 120
+.b8 109
+.b8 50
+.b8 111
+.b8 98
+.b8 117
+.b8 99
+.b8 113
+.b8 102
+.b8 102
+.b8 50
+.b8 122
+.b8 52
+.b8 118
+.b8 99
+.b8 53
+.b8 53
+.b8 122
+.b8 99
+.b8 110
+.b8 115
+.b8 99
+.b8 102
+.b8 117
+.b8 118
+.b8 117
+.b8 114
+.b8 53
+.b8 115
+.b8 50
+.b8 98
+.b8 51
+.b8 101
+.b8 51
+.b8 54
+.b8 100
+.b8 118
+.b8 103
+.b8 109
+.b8 53
+.b8 55
+.b8 113
+.b8 111
+.b8 98
+.b8 97
+.b8 110
+.b8 108
+.b8 112
+.b8 104
+.b8 111
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 109
+.b8 120
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttgir b/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..1b2786b1251c23e92335532ce9d1c1775e543619
--- /dev/null
+++ b/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttgir
@@ -0,0 +1,28 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<12865792> : tensor<1024xi32, #blocked>
+    %cst_0 = arith.constant dense<12865792> : tensor<1024xi32, #blocked1>
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
+    %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked1>
+    %4 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
+    %5 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked1>
+    %6 = arith.addi %4, %2 : tensor<1024xi32, #blocked>
+    %7 = arith.addi %5, %3 : tensor<1024xi32, #blocked1>
+    %8 = arith.cmpi slt, %6, %cst : tensor<1024xi32, #blocked>
+    %9 = arith.cmpi slt, %7, %cst_0 : tensor<1024xi32, #blocked1>
+    %10 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
+    %11 = tt.addptr %10, %6 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
+    %12 = tt.load %11, %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16, #blocked>
+    %13 = triton_gpu.convert_layout %12 : (tensor<1024xbf16, #blocked>) -> tensor<1024xbf16, #blocked1>
+    %14 = arith.extf %13 : tensor<1024xbf16, #blocked1> to tensor<1024xf32, #blocked1>
+    %15 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked1>
+    %16 = tt.addptr %15, %7 : tensor<1024x!tt.ptr<f32, 1>, #blocked1>, tensor<1024xi32, #blocked1>
+    tt.store %16, %14, %9 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32, #blocked1>
+    tt.return
+  }
+}
diff --git a/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttir b/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..800a5b71fbf1c2aa0a24d1629524b33d429dd4a6
--- /dev/null
+++ b/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.ttir
@@ -0,0 +1,20 @@
+module {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<12865792> : tensor<1024xi32>
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
+    %4 = arith.addi %3, %2 : tensor<1024xi32>
+    %5 = arith.cmpi slt, %4, %cst : tensor<1024xi32>
+    %6 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
+    %7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
+    %8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
+    %9 = arith.extf %8 : tensor<1024xbf16> to tensor<1024xf32>
+    %10 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>>
+    %11 = tt.addptr %10, %4 : tensor<1024x!tt.ptr<f32, 1>>, tensor<1024xi32>
+    tt.store %11, %9, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xf32>
+    tt.return
+  }
+}
diff --git a/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.cubin b/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..565fc552434e75672d1a014bfc85b40c1349126e
Binary files /dev/null and b/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.cubin differ
diff --git a/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.llir b/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..b576bec99d1d58796ee6682c107ea6ce1122693d
--- /dev/null
+++ b/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.llir
@@ -0,0 +1,476 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
+@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
+@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
+@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
+@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
+@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
+@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
+
+define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %9 = and i32 %8, 31, !dbg !10
+  %10 = lshr i32 %8, 5, !dbg !10
+  %11 = and i32 %10, 3, !dbg !10
+  %12 = lshr i32 %9, 1, !dbg !10
+  %13 = shl nuw nsw i32 %11, 4, !dbg !10
+  %14 = or i32 %13, %12, !dbg !10
+  %15 = and i32 %8, 63, !dbg !10
+  %16 = shl i32 %8, 2, !dbg !11
+  %17 = and i32 %16, 4, !dbg !11
+  %18 = and i32 %8, 7, !dbg !11
+  %19 = shl nuw nsw i32 %11, 2, !dbg !12
+  %20 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !13
+  %21 = shl i32 %20, 6, !dbg !14
+  %22 = or i32 %21, %14, !dbg !15
+  %23 = or i32 %21, %15, !dbg !15
+  %24 = sext i32 %22 to i64, !dbg !16
+  %25 = getelementptr i64, ptr addrspace(1) %0, i64 %24, !dbg !16
+  %26 = sext i32 %23 to i64, !dbg !16
+  %27 = getelementptr i64, ptr addrspace(1) %0, i64 %26, !dbg !16
+  %28 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
+  %29 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
+  %30 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
+  %31 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %25, i1 true) #6, !dbg !17
+  %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %27, i1 true) #6, !dbg !17
+  %33 = srem i32 %22, 512, !dbg !18
+  %34 = shl nsw i32 %33, 8, !dbg !19
+  %35 = add i64 %32, 50257, !dbg !20
+  %36 = icmp slt i64 %28, 0, !dbg !21
+  %37 = icmp slt i64 %32, 0, !dbg !21
+  %38 = select i1 %37, i64 %35, i64 %32, !dbg !22
+  %39 = icmp ugt i64 %38, 50256, !dbg !23
+  %40 = shl i64 %28, 8, !dbg !24
+  %41 = add i64 %40, 12865792, !dbg !24
+  %42 = select i1 %36, i64 %41, i64 %40, !dbg !24
+  %43 = getelementptr float, ptr addrspace(1) %1, i64 %42
+  br label %44, !dbg !12
+
+44:                                               ; preds = %7, %76
+  %45 = phi float [ 0.000000e+00, %7 ], [ %96, %76 ]
+  %46 = phi float [ 0.000000e+00, %7 ], [ %97, %76 ]
+  %47 = phi float [ 0.000000e+00, %7 ], [ %98, %76 ]
+  %48 = phi float [ 0.000000e+00, %7 ], [ %99, %76 ]
+  %49 = phi float [ 0.000000e+00, %7 ], [ %100, %76 ]
+  %50 = phi float [ 0.000000e+00, %7 ], [ %101, %76 ]
+  %51 = phi float [ 0.000000e+00, %7 ], [ %102, %76 ]
+  %52 = phi float [ 0.000000e+00, %7 ], [ %103, %76 ]
+  %53 = phi float [ 0.000000e+00, %7 ], [ %120, %76 ]
+  %54 = phi float [ 0.000000e+00, %7 ], [ %121, %76 ]
+  %55 = phi float [ 0.000000e+00, %7 ], [ %122, %76 ]
+  %56 = phi float [ 0.000000e+00, %7 ], [ %123, %76 ]
+  %57 = phi float [ 0.000000e+00, %7 ], [ %108, %76 ]
+  %58 = phi float [ 0.000000e+00, %7 ], [ %109, %76 ]
+  %59 = phi float [ 0.000000e+00, %7 ], [ %110, %76 ]
+  %60 = phi float [ 0.000000e+00, %7 ], [ %111, %76 ]
+  %61 = phi i32 [ 0, %7 ], [ %124, %76 ]
+  %62 = or i32 %61, %17, !dbg !25
+  %63 = add i32 %62, %34, !dbg !26
+  %64 = sext i32 %63 to i64, !dbg !27
+  %65 = getelementptr float, ptr addrspace(1) %2, i64 %64, !dbg !27
+  %66 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %65, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !28
+  %67 = extractvalue { i32, i32, i32, i32 } %66, 0, !dbg !28
+  %68 = extractvalue { i32, i32, i32, i32 } %66, 1, !dbg !28
+  %69 = extractvalue { i32, i32, i32, i32 } %66, 2, !dbg !28
+  %70 = extractvalue { i32, i32, i32, i32 } %66, 3, !dbg !28
+  %71 = bitcast i32 %67 to float, !dbg !28
+  %72 = bitcast i32 %68 to float, !dbg !28
+  %73 = bitcast i32 %69 to float, !dbg !28
+  %74 = bitcast i32 %70 to float, !dbg !28
+  br i1 %39, label %75, label %76, !dbg !29
+
+75:                                               ; preds = %44
+  tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !29
+  br label %76, !dbg !29
+
+76:                                               ; preds = %75, %44
+  %77 = zext nneg i32 %62 to i64, !dbg !30
+  %78 = getelementptr float, ptr addrspace(1) %43, i64 %77, !dbg !31
+  %79 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %78, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !32
+  %80 = extractvalue { i32, i32, i32, i32 } %79, 0, !dbg !32
+  %81 = extractvalue { i32, i32, i32, i32 } %79, 1, !dbg !32
+  %82 = extractvalue { i32, i32, i32, i32 } %79, 2, !dbg !32
+  %83 = extractvalue { i32, i32, i32, i32 } %79, 3, !dbg !32
+  %84 = bitcast i32 %80 to float, !dbg !32
+  %85 = bitcast i32 %81 to float, !dbg !32
+  %86 = bitcast i32 %82 to float, !dbg !32
+  %87 = bitcast i32 %83 to float, !dbg !32
+  %88 = fadd float %71, %84, !dbg !33
+  %89 = fadd float %72, %85, !dbg !33
+  %90 = fadd float %73, %86, !dbg !33
+  %91 = fadd float %74, %87, !dbg !33
+  %92 = fsub float %88, %57, !dbg !34
+  %93 = fsub float %89, %58, !dbg !34
+  %94 = fsub float %90, %59, !dbg !34
+  %95 = fsub float %91, %60, !dbg !34
+  %96 = fadd float %45, 1.000000e+00, !dbg !38
+  %97 = fadd float %46, 1.000000e+00, !dbg !38
+  %98 = fadd float %47, 1.000000e+00, !dbg !38
+  %99 = fadd float %48, 1.000000e+00, !dbg !38
+  %100 = fadd float %49, 1.000000e+00, !dbg !38
+  %101 = fadd float %50, 1.000000e+00, !dbg !38
+  %102 = fadd float %51, 1.000000e+00, !dbg !38
+  %103 = fadd float %52, 1.000000e+00, !dbg !38
+  %104 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %92, float %96) #6, !dbg !39
+  %105 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %93, float %97) #6, !dbg !39
+  %106 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %94, float %98) #6, !dbg !39
+  %107 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %95, float %99) #6, !dbg !39
+  %108 = fadd float %57, %104, !dbg !40
+  %109 = fadd float %58, %105, !dbg !40
+  %110 = fadd float %59, %106, !dbg !40
+  %111 = fadd float %60, %107, !dbg !40
+  %112 = fsub float %88, %108, !dbg !41
+  %113 = fsub float %89, %109, !dbg !41
+  %114 = fsub float %90, %110, !dbg !41
+  %115 = fsub float %91, %111, !dbg !41
+  %116 = fmul float %92, %112, !dbg !42
+  %117 = fmul float %93, %113, !dbg !42
+  %118 = fmul float %94, %114, !dbg !42
+  %119 = fmul float %95, %115, !dbg !42
+  %120 = fadd float %53, %116, !dbg !43
+  %121 = fadd float %54, %117, !dbg !43
+  %122 = fadd float %55, %118, !dbg !43
+  %123 = fadd float %56, %119, !dbg !43
+  %124 = add nuw nsw i32 %61, 8, !dbg !12
+  %125 = icmp ult i32 %61, 248, !dbg !12
+  br i1 %125, label %44, label %126, !dbg !12
+
+126:                                              ; preds = %76
+  %127 = lshr i32 %9, 3, !dbg !12
+  %128 = or i32 %19, %127, !dbg !12
+  %129 = mul nuw nsw i32 %128, 12, !dbg !12
+  %130 = add nuw nsw i32 %129, %18, !dbg !12
+  %131 = zext nneg i32 %130 to i64, !dbg !12
+  %132 = getelementptr float, ptr addrspace(3) @global_smem, i64 %131, !dbg !12
+  %133 = insertelement <1 x float> undef, float %100, i64 0, !dbg !12
+  store <1 x float> %133, ptr addrspace(3) %132, align 4, !dbg !12
+  %134 = or i32 %18, 192, !dbg !12
+  %135 = add nuw nsw i32 %134, %129, !dbg !12
+  %136 = zext nneg i32 %135 to i64, !dbg !12
+  %137 = getelementptr float, ptr addrspace(3) @global_smem, i64 %136, !dbg !12
+  %138 = insertelement <1 x float> undef, float %101, i64 0, !dbg !12
+  store <1 x float> %138, ptr addrspace(3) %137, align 4, !dbg !12
+  %139 = or i32 %18, 384, !dbg !12
+  %140 = add nuw nsw i32 %139, %129, !dbg !12
+  %141 = zext nneg i32 %140 to i64, !dbg !12
+  %142 = getelementptr float, ptr addrspace(3) @global_smem, i64 %141, !dbg !12
+  %143 = insertelement <1 x float> undef, float %102, i64 0, !dbg !12
+  store <1 x float> %143, ptr addrspace(3) %142, align 4, !dbg !12
+  %144 = or i32 %18, 576, !dbg !12
+  %145 = add nuw nsw i32 %144, %129, !dbg !12
+  %146 = zext nneg i32 %145 to i64, !dbg !12
+  %147 = getelementptr float, ptr addrspace(3) @global_smem, i64 %146, !dbg !12
+  %148 = insertelement <1 x float> undef, float %103, i64 0, !dbg !12
+  store <1 x float> %148, ptr addrspace(3) %147, align 4, !dbg !12
+  tail call void @llvm.nvvm.barrier0(), !dbg !12
+  %149 = mul nuw nsw i32 %14, 12, !dbg !12
+  %150 = add nuw nsw i32 %149, %17, !dbg !12
+  %151 = zext nneg i32 %150 to i64, !dbg !12
+  %152 = getelementptr float, ptr addrspace(3) @global_smem, i64 %151, !dbg !12
+  %153 = load float, ptr addrspace(3) %152, align 16, !dbg !12
+  %154 = getelementptr inbounds <4 x float>, ptr addrspace(3) %152, i64 0, i64 1, !dbg !12
+  %155 = load float, ptr addrspace(3) %154, align 4, !dbg !12
+  %156 = getelementptr inbounds <4 x float>, ptr addrspace(3) %152, i64 0, i64 2, !dbg !12
+  %157 = load float, ptr addrspace(3) %156, align 8, !dbg !12
+  %158 = getelementptr inbounds <4 x float>, ptr addrspace(3) %152, i64 0, i64 3, !dbg !12
+  %159 = load float, ptr addrspace(3) %158, align 4, !dbg !12
+  %160 = fsub float %109, %108, !dbg !44
+  %161 = fadd float %153, %155, !dbg !48
+  %162 = fcmp oeq float %161, 0.000000e+00, !dbg !49
+  %163 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %155, float %161) #6, !dbg !50
+  %164 = select i1 %162, float 0.000000e+00, float %163, !dbg !51
+  %165 = fmul float %160, %164, !dbg !52
+  %166 = fadd float %108, %165, !dbg !53
+  %167 = fadd float %120, %121, !dbg !54
+  %168 = fmul float %160, %160, !dbg !55
+  %169 = fmul float %168, %153, !dbg !56
+  %170 = fmul float %169, %164, !dbg !57
+  %171 = fadd float %167, %170, !dbg !58
+  %172 = fsub float %110, %166, !dbg !44
+  %173 = fadd float %157, %161, !dbg !48
+  %174 = fcmp oeq float %173, 0.000000e+00, !dbg !49
+  %175 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %157, float %173) #6, !dbg !50
+  %176 = select i1 %174, float 0.000000e+00, float %175, !dbg !51
+  %177 = fmul float %176, %172, !dbg !52
+  %178 = fadd float %166, %177, !dbg !53
+  %179 = fadd float %122, %171, !dbg !54
+  %180 = fmul float %172, %172, !dbg !55
+  %181 = fmul float %161, %180, !dbg !56
+  %182 = fmul float %176, %181, !dbg !57
+  %183 = fadd float %179, %182, !dbg !58
+  %184 = fsub float %111, %178, !dbg !44
+  %185 = fadd float %159, %173, !dbg !48
+  %186 = fcmp oeq float %185, 0.000000e+00, !dbg !49
+  %187 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %159, float %185) #6, !dbg !50
+  %188 = select i1 %186, float 0.000000e+00, float %187, !dbg !51
+  %189 = fmul float %188, %184, !dbg !52
+  %190 = fadd float %178, %189, !dbg !53
+  %191 = fadd float %123, %183, !dbg !54
+  %192 = fmul float %184, %184, !dbg !55
+  %193 = fmul float %173, %192, !dbg !56
+  %194 = fmul float %188, %193, !dbg !57
+  %195 = fadd float %191, %194, !dbg !58
+  %196 = bitcast float %190 to i32, !dbg !59
+  %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 1, i32 31), !dbg !59
+  %198 = bitcast i32 %197 to float, !dbg !59
+  %199 = bitcast float %195 to i32, !dbg !59
+  %200 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %199, i32 1, i32 31), !dbg !59
+  %201 = bitcast i32 %200 to float, !dbg !59
+  %202 = bitcast float %185 to i32, !dbg !59
+  %203 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %202, i32 1, i32 31), !dbg !59
+  %204 = bitcast i32 %203 to float, !dbg !59
+  %205 = fsub float %198, %190, !dbg !44
+  %206 = fadd float %185, %204, !dbg !48
+  %207 = fcmp oeq float %206, 0.000000e+00, !dbg !49
+  %208 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %204, float %206) #6, !dbg !50
+  %209 = select i1 %207, float 0.000000e+00, float %208, !dbg !51
+  %210 = fmul float %209, %205, !dbg !52
+  %211 = fadd float %190, %210, !dbg !53
+  %212 = fadd float %195, %201, !dbg !54
+  %213 = fmul float %205, %205, !dbg !55
+  %214 = fmul float %185, %213, !dbg !56
+  %215 = fmul float %209, %214, !dbg !57
+  %216 = fadd float %212, %215, !dbg !58
+  %217 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %216, float 2.560000e+02) #6, !dbg !61
+  %218 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %216, float 2.560000e+02) #6, !dbg !61
+  %219 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %216, float 2.560000e+02) #6, !dbg !61
+  %220 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %216, float 2.560000e+02) #6, !dbg !61
+  %221 = fadd float %217, 0x3EE4F8B580000000, !dbg !62
+  %222 = shl i32 %22, 8, !dbg !63
+  br label %223, !dbg !64
+
+223:                                              ; preds = %126, %__nv_rsqrtf.exit
+  %224 = phi i32 [ 0, %126 ], [ %298, %__nv_rsqrtf.exit ]
+  %225 = or i32 %224, %17, !dbg !65
+  %226 = add i32 %225, %34, !dbg !66
+  %227 = sext i32 %226 to i64, !dbg !67
+  %228 = getelementptr float, ptr addrspace(1) %2, i64 %227, !dbg !67
+  %229 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %228, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !68
+  %230 = extractvalue { i32, i32, i32, i32 } %229, 0, !dbg !68
+  %231 = extractvalue { i32, i32, i32, i32 } %229, 1, !dbg !68
+  %232 = extractvalue { i32, i32, i32, i32 } %229, 2, !dbg !68
+  %233 = extractvalue { i32, i32, i32, i32 } %229, 3, !dbg !68
+  %234 = bitcast i32 %230 to float, !dbg !68
+  %235 = bitcast i32 %231 to float, !dbg !68
+  %236 = bitcast i32 %232 to float, !dbg !68
+  %237 = bitcast i32 %233 to float, !dbg !68
+  %238 = zext nneg i32 %225 to i64, !dbg !69
+  %239 = getelementptr float, ptr addrspace(1) %3, i64 %238, !dbg !69
+  %240 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %239, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !70
+  %241 = extractvalue { i32, i32, i32, i32 } %240, 0, !dbg !70
+  %242 = extractvalue { i32, i32, i32, i32 } %240, 1, !dbg !70
+  %243 = extractvalue { i32, i32, i32, i32 } %240, 2, !dbg !70
+  %244 = extractvalue { i32, i32, i32, i32 } %240, 3, !dbg !70
+  %245 = bitcast i32 %241 to float, !dbg !70
+  %246 = bitcast i32 %242 to float, !dbg !70
+  %247 = bitcast i32 %243 to float, !dbg !70
+  %248 = bitcast i32 %244 to float, !dbg !70
+  br i1 %39, label %249, label %250, !dbg !71
+
+249:                                              ; preds = %223
+  tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !71
+  br label %250, !dbg !71
+
+250:                                              ; preds = %249, %223
+  %251 = getelementptr float, ptr addrspace(1) %43, i64 %238, !dbg !72
+  %252 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %251, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !73
+  %253 = extractvalue { i32, i32, i32, i32 } %252, 0, !dbg !73
+  %254 = extractvalue { i32, i32, i32, i32 } %252, 1, !dbg !73
+  %255 = extractvalue { i32, i32, i32, i32 } %252, 2, !dbg !73
+  %256 = extractvalue { i32, i32, i32, i32 } %252, 3, !dbg !73
+  %257 = bitcast i32 %253 to float, !dbg !73
+  %258 = bitcast i32 %254 to float, !dbg !73
+  %259 = bitcast i32 %255 to float, !dbg !73
+  %260 = bitcast i32 %256 to float, !dbg !73
+  %261 = fadd float %234, %257, !dbg !74
+  %262 = fadd float %235, %258, !dbg !74
+  %263 = fadd float %236, %259, !dbg !74
+  %264 = fadd float %237, %260, !dbg !74
+  %265 = fsub float %261, %211, !dbg !75
+  %266 = fsub float %262, %211, !dbg !75
+  %267 = fsub float %263, %211, !dbg !75
+  %268 = fsub float %264, %211, !dbg !75
+  %269 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %.not.i = icmp eq i32 %269, 0, !dbg !76
+  br i1 %.not.i, label %272, label %270, !dbg !76
+
+270:                                              ; preds = %250
+  %271 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %221), !dbg !76
+  br label %__nv_rsqrtf.exit, !dbg !76
+
+272:                                              ; preds = %250
+  %273 = tail call float @llvm.nvvm.rsqrt.approx.f(float %221), !dbg !76
+  br label %__nv_rsqrtf.exit, !dbg !76
+
+__nv_rsqrtf.exit:                                 ; preds = %270, %272
+  %.0.i = phi float [ %271, %270 ], [ %273, %272 ], !dbg !76
+  %274 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %275 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %276 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !76
+  %277 = fmul float %265, %.0.i, !dbg !77
+  %278 = fmul float %266, %.0.i, !dbg !77
+  %279 = fmul float %267, %.0.i, !dbg !77
+  %280 = fmul float %268, %.0.i, !dbg !77
+  %281 = fmul float %277, %245, !dbg !78
+  %282 = fmul float %278, %246, !dbg !78
+  %283 = fmul float %279, %247, !dbg !78
+  %284 = fmul float %280, %248, !dbg !78
+  %285 = add i32 %225, %222, !dbg !79
+  %286 = sext i32 %285 to i64, !dbg !80
+  %287 = getelementptr i16, ptr addrspace(1) %4, i64 %286, !dbg !80
+  %288 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %281) #6, !dbg !81
+  %289 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %282) #6, !dbg !81
+  %290 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %283) #6, !dbg !81
+  %291 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %284) #6, !dbg !81
+  %292 = insertelement <2 x i16> undef, i16 %288, i64 0, !dbg !81
+  %293 = insertelement <2 x i16> %292, i16 %289, i64 1, !dbg !81
+  %294 = bitcast <2 x i16> %293 to i32, !dbg !81
+  %295 = insertelement <2 x i16> undef, i16 %290, i64 0, !dbg !81
+  %296 = insertelement <2 x i16> %295, i16 %291, i64 1, !dbg !81
+  %297 = bitcast <2 x i16> %296 to i32, !dbg !81
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %294, i32 %297, ptr addrspace(1) %287, i1 true) #6, !dbg !81
+  %298 = add nuw nsw i32 %224, 8, !dbg !64
+  %299 = icmp ult i32 %224, 248, !dbg !64
+  br i1 %299, label %223, label %300, !dbg !64
+
+300:                                              ; preds = %__nv_rsqrtf.exit
+  ret void, !dbg !82
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: alwaysinline nounwind
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
+  %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
+  %.not = icmp eq i32 %1, 0
+  br i1 %.not, label %4, label %2
+
+2:                                                ; preds = %0
+  %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
+  br label %6
+
+4:                                                ; preds = %0
+  %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
+  br label %6
+
+6:                                                ; preds = %4, %2
+  %.0 = phi float [ %3, %2 ], [ %5, %4 ]
+  ret float %.0
+}
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx")
+!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 128}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 22, column: 44, scope: !7)
+!11 = !DILocation(line: 24, column: 33, scope: !7)
+!12 = !DILocation(line: 31, column: 36, scope: !7)
+!13 = !DILocation(line: 21, column: 28, scope: !7)
+!14 = !DILocation(line: 21, column: 33, scope: !7)
+!15 = !DILocation(line: 22, column: 23, scope: !7)
+!16 = !DILocation(line: 26, column: 30, scope: !7)
+!17 = !DILocation(line: 26, column: 35, scope: !7)
+!18 = !DILocation(line: 27, column: 18, scope: !7)
+!19 = !DILocation(line: 35, column: 44, scope: !7)
+!20 = !DILocation(line: 36, column: 22, scope: !7)
+!21 = !DILocation(line: 37, column: 22, scope: !7)
+!22 = !DILocation(line: 38, column: 36, scope: !7)
+!23 = !DILocation(line: 39, column: 40, scope: !7)
+!24 = !DILocation(line: 40, column: 44, scope: !7)
+!25 = !DILocation(line: 32, column: 27, scope: !7)
+!26 = !DILocation(line: 35, column: 40, scope: !7)
+!27 = !DILocation(line: 35, column: 34, scope: !7)
+!28 = !DILocation(line: 35, column: 50, scope: !7)
+!29 = !DILocation(line: 39, column: 55, scope: !7)
+!30 = !DILocation(line: 40, column: 40, scope: !7)
+!31 = !DILocation(line: 40, column: 34, scope: !7)
+!32 = !DILocation(line: 40, column: 52, scope: !7)
+!33 = !DILocation(line: 41, column: 22, scope: !7)
+!34 = !DILocation(line: 96, column: 20, scope: !35, inlinedAt: !37)
+!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
+!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!37 = !DILocation(line: 44, column: 38, scope: !35)
+!38 = !DILocation(line: 97, column: 26, scope: !35, inlinedAt: !37)
+!39 = !DILocation(line: 98, column: 30, scope: !35, inlinedAt: !37)
+!40 = !DILocation(line: 98, column: 22, scope: !35, inlinedAt: !37)
+!41 = !DILocation(line: 101, column: 30, scope: !35, inlinedAt: !37)
+!42 = !DILocation(line: 101, column: 22, scope: !35, inlinedAt: !37)
+!43 = !DILocation(line: 47, column: 48, scope: !7)
+!44 = !DILocation(line: 108, column: 21, scope: !45, inlinedAt: !46)
+!45 = distinct !DILexicalBlockFile(scope: !35, file: !36, discriminator: 0)
+!46 = !DILocation(line: 120, column: 46, scope: !45, inlinedAt: !47)
+!47 = !DILocation(line: 50, column: 41, scope: !45)
+!48 = !DILocation(line: 109, column: 28, scope: !45, inlinedAt: !46)
+!49 = !DILocation(line: 110, column: 39, scope: !45, inlinedAt: !46)
+!50 = !DILocation(line: 110, column: 60, scope: !45, inlinedAt: !46)
+!51 = !DILocation(line: 110, column: 49, scope: !45, inlinedAt: !46)
+!52 = !DILocation(line: 112, column: 25, scope: !45, inlinedAt: !46)
+!53 = !DILocation(line: 112, column: 17, scope: !45, inlinedAt: !46)
+!54 = !DILocation(line: 113, column: 15, scope: !45, inlinedAt: !46)
+!55 = !DILocation(line: 113, column: 30, scope: !45, inlinedAt: !46)
+!56 = !DILocation(line: 113, column: 38, scope: !45, inlinedAt: !46)
+!57 = !DILocation(line: 113, column: 49, scope: !45, inlinedAt: !46)
+!58 = !DILocation(line: 113, column: 22, scope: !45, inlinedAt: !46)
+!59 = !DILocation(line: 120, column: 46, scope: !35, inlinedAt: !60)
+!60 = !DILocation(line: 50, column: 41, scope: !35)
+!61 = !DILocation(line: 69, column: 23, scope: !7)
+!62 = !DILocation(line: 71, column: 24, scope: !7)
+!63 = !DILocation(line: 76, column: 39, scope: !7)
+!64 = !DILocation(line: 55, column: 36, scope: !7)
+!65 = !DILocation(line: 56, column: 27, scope: !7)
+!66 = !DILocation(line: 59, column: 41, scope: !7)
+!67 = !DILocation(line: 59, column: 35, scope: !7)
+!68 = !DILocation(line: 59, column: 51, scope: !7)
+!69 = !DILocation(line: 60, column: 35, scope: !7)
+!70 = !DILocation(line: 60, column: 40, scope: !7)
+!71 = !DILocation(line: 64, column: 57, scope: !7)
+!72 = !DILocation(line: 65, column: 35, scope: !7)
+!73 = !DILocation(line: 65, column: 54, scope: !7)
+!74 = !DILocation(line: 66, column: 24, scope: !7)
+!75 = !DILocation(line: 67, column: 24, scope: !7)
+!76 = !DILocation(line: 72, column: 30, scope: !7)
+!77 = !DILocation(line: 73, column: 24, scope: !7)
+!78 = !DILocation(line: 74, column: 24, scope: !7)
+!79 = !DILocation(line: 76, column: 35, scope: !7)
+!80 = !DILocation(line: 76, column: 29, scope: !7)
+!81 = !DILocation(line: 76, column: 52, scope: !7)
+!82 = !DILocation(line: 55, column: 4, scope: !7)
diff --git a/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.ptx b/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..1d9a5cd1d0a7490b1eb1bf142dc5797882f4ab4e
--- /dev/null
+++ b/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.ptx
@@ -0,0 +1,886 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2d3d4d5de6de
+.extern .func __assertfail
+(
+	.param .b64 __assertfail_param_0,
+	.param .b64 __assertfail_param_1,
+	.param .b32 __assertfail_param_2,
+	.param .b64 __assertfail_param_3,
+	.param .b64 __assertfail_param_4
+)
+;
+.global .align 1 .b8 assertFunc_1[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
+.global .align 1 .b8 assertFile_1[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
+.global .align 1 .b8 assertMessage_1[39] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 49, 51, 32, 60, 32, 53, 48, 50, 53, 55};
+.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
+.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
+.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 51, 32, 60, 32, 53, 48, 50, 53, 55};
+.extern .shared .align 1 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
+
+.visible .entry triton__0d1d2d3d4d5de6de(
+	.param .u64 triton__0d1d2d3d4d5de6de_param_0,
+	.param .u64 triton__0d1d2d3d4d5de6de_param_1,
+	.param .u64 triton__0d1d2d3d4d5de6de_param_2,
+	.param .u64 triton__0d1d2d3d4d5de6de_param_3,
+	.param .u64 triton__0d1d2d3d4d5de6de_param_4,
+	.param .u32 triton__0d1d2d3d4d5de6de_param_5,
+	.param .u32 triton__0d1d2d3d4d5de6de_param_6
+)
+.maxntid 128, 1, 1
+{
+	.reg .pred 	%p<42>;
+	.reg .b16 	%rs<5>;
+	.reg .b32 	%r<140>;
+	.reg .f32 	%f<148>;
+	.reg .b64 	%rd<67>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd17, [triton__0d1d2d3d4d5de6de_param_4];
+	ld.param.u64 	%rd16, [triton__0d1d2d3d4d5de6de_param_3];
+	ld.param.u64 	%rd28, [triton__0d1d2d3d4d5de6de_param_0];
+	ld.param.u64 	%rd29, [triton__0d1d2d3d4d5de6de_param_1];
+$L__tmp0:
+	.loc	1 22 44
+	mov.u32 	%r16, %tid.x;
+	and.b32  	%r1, %r16, 31;
+	ld.param.u64 	%rd30, [triton__0d1d2d3d4d5de6de_param_2];
+	bfe.u32 	%r2, %r16, 5, 2;
+	bfe.u32 	%r3, %r16, 1, 4;
+	shl.b32 	%r17, %r2, 4;
+	or.b32  	%r4, %r17, %r3;
+	and.b32  	%r18, %r16, 63;
+	.loc	1 24 33
+	shl.b32 	%r19, %r16, 2;
+	and.b32  	%r5, %r19, 4;
+	and.b32  	%r6, %r16, 7;
+	.loc	1 31 36
+	shl.b32 	%r7, %r2, 2;
+	.loc	1 21 28
+	mov.u32 %r14, %ctaid.x;
+	.loc	1 21 33
+	shl.b32 	%r20, %r14, 6;
+	.loc	1 22 23
+	or.b32  	%r21, %r20, %r4;
+	or.b32  	%r22, %r20, %r18;
+	.loc	1 26 30
+	mul.wide.s32 	%rd31, %r21, 8;
+	add.s64 	%rd19, %rd28, %rd31;
+	mul.wide.s32 	%rd32, %r22, 8;
+	add.s64 	%rd27, %rd28, %rd32;
+	mov.pred 	%p1, -1;
+	.loc	1 26 35
+	mov.u64 %rd18, 0x0;
+	@%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd19 + 0 ];
+	mov.u64 %rd20, 0x0;
+	@%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd19 + 0 ];
+	mov.u64 %rd22, 0x0;
+	@%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd19 + 0 ];
+	mov.u64 %rd24, 0x0;
+	@%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd19 + 0 ];
+	mov.u64 %rd26, 0x0;
+	@%p1 ld.global.L1::evict_last.b64 { %rd26 }, [ %rd27 + 0 ];
+	.loc	1 27 18
+	bfe.s32 	%r23, %r14, 25, 1;
+	shr.u32 	%r24, %r23, 23;
+	add.s32 	%r25, %r21, %r24;
+	and.b32  	%r26, %r25, 16776704;
+	sub.s32 	%r27, %r21, %r26;
+	.loc	1 35 44
+	shl.b32 	%r28, %r27, 8;
+	.loc	1 36 22
+	add.s64 	%rd33, %rd26, 50257;
+	.loc	1 37 22
+	setp.lt.s64 	%p6, %rd18, 0;
+	setp.lt.s64 	%p7, %rd26, 0;
+	.loc	1 38 36
+	selp.b64 	%rd1, %rd33, %rd26, %p7;
+	.loc	1 40 44
+	shl.b64 	%rd34, %rd18, 8;
+	add.s64 	%rd35, %rd34, 12865792;
+	selp.b64 	%rd36, %rd35, %rd34, %p6;
+	.loc	1 31 36
+	and.b32  	%r29, %r16, 1;
+	mul.wide.u32 	%rd2, %r29, 16;
+	shl.b64 	%rd37, %rd36, 2;
+	or.b64  	%rd38, %rd2, %rd37;
+	add.s64 	%rd66, %rd29, %rd38;
+	or.b32  	%r30, %r28, %r5;
+	mul.wide.s32 	%rd39, %r30, 4;
+	add.s64 	%rd64, %rd30, %rd39;
+	mov.f32 	%f132, 0f00000000;
+	mov.b32 	%r138, -8;
+	mov.u64 	%rd62, %rd64;
+	mov.u64 	%rd63, %rd66;
+	mov.f32 	%f133, %f132;
+	mov.f32 	%f134, %f132;
+	mov.f32 	%f135, %f132;
+	mov.f32 	%f136, %f132;
+	mov.f32 	%f137, %f132;
+	mov.f32 	%f138, %f132;
+	mov.f32 	%f139, %f132;
+	mov.f32 	%f140, %f132;
+	mov.f32 	%f141, %f132;
+	mov.f32 	%f142, %f132;
+	mov.f32 	%f143, %f132;
+	mov.f32 	%f144, %f132;
+	mov.f32 	%f145, %f132;
+	mov.f32 	%f146, %f132;
+	mov.f32 	%f147, %f132;
+	bra.uni 	$L__BB0_1;
+$L__BB0_3:
+	.loc	1 0 0
+	mov.b32 	%f17, %r31;
+	mov.b32 	%f18, %r32;
+	mov.b32 	%f19, %r33;
+	mov.b32 	%f20, %r34;
+	.loc	1 40 52
+	mov.u32 %r40, 0x0;
+	mov.u32 %r41, 0x0;
+	mov.u32 %r42, 0x0;
+	mov.u32 %r43, 0x0;
+	@%p1 ld.global.L1::evict_last.v4.b32 { %r40, %r41, %r42, %r43 }, [ %rd63 + 0 ];
+	@!%p1 mov.u32 %r40, %r124;
+	@!%p1 mov.u32 %r41, %r124;
+	@!%p1 mov.u32 %r42, %r124;
+	@!%p1 mov.u32 %r43, %r124;
+	mov.b32 	%f48, %r40;
+	mov.b32 	%f49, %r41;
+	mov.b32 	%f50, %r42;
+	mov.b32 	%f51, %r43;
+	.loc	1 41 22
+	add.f32 	%f52, %f17, %f48;
+	add.f32 	%f53, %f18, %f49;
+	add.f32 	%f54, %f19, %f50;
+	add.f32 	%f55, %f20, %f51;
+$L__tmp1:
+	.loc	2 96 20
+	sub.f32 	%f56, %f52, %f144;
+	sub.f32 	%f57, %f53, %f145;
+	sub.f32 	%f58, %f54, %f146;
+	sub.f32 	%f59, %f55, %f147;
+	.loc	2 97 26
+	add.f32 	%f132, %f132, 0f3F800000;
+	add.f32 	%f133, %f133, 0f3F800000;
+	add.f32 	%f134, %f134, 0f3F800000;
+	add.f32 	%f135, %f135, 0f3F800000;
+	add.f32 	%f136, %f136, 0f3F800000;
+	add.f32 	%f137, %f137, 0f3F800000;
+	add.f32 	%f138, %f138, 0f3F800000;
+	add.f32 	%f139, %f139, 0f3F800000;
+	.loc	2 98 30
+	mov.b32 	%r49, %f56;
+	mov.b32 	%r50, %f132;
+	div.full.f32 %r48, %r49, %r50;
+	mov.b32 	%f60, %r48;
+	mov.b32 	%r52, %f57;
+	mov.b32 	%r53, %f133;
+	div.full.f32 %r51, %r52, %r53;
+	mov.b32 	%f61, %r51;
+	mov.b32 	%r55, %f58;
+	mov.b32 	%r56, %f134;
+	div.full.f32 %r54, %r55, %r56;
+	mov.b32 	%f62, %r54;
+	mov.b32 	%r58, %f59;
+	mov.b32 	%r59, %f135;
+	div.full.f32 %r57, %r58, %r59;
+	mov.b32 	%f63, %r57;
+	.loc	2 98 22
+	add.f32 	%f144, %f144, %f60;
+	add.f32 	%f145, %f145, %f61;
+	add.f32 	%f146, %f146, %f62;
+	add.f32 	%f147, %f147, %f63;
+	.loc	2 101 30
+	sub.f32 	%f64, %f52, %f144;
+	sub.f32 	%f65, %f53, %f145;
+	sub.f32 	%f66, %f54, %f146;
+	sub.f32 	%f67, %f55, %f147;
+$L__tmp2:
+	.loc	1 47 48
+	fma.rn.f32 	%f140, %f56, %f64, %f140;
+	fma.rn.f32 	%f141, %f57, %f65, %f141;
+	fma.rn.f32 	%f142, %f58, %f66, %f142;
+	fma.rn.f32 	%f143, %f59, %f67, %f143;
+	.loc	1 31 36
+	add.s32 	%r138, %r138, 8;
+	add.s64 	%rd63, %rd63, 32;
+	add.s64 	%rd62, %rd62, 32;
+	setp.lt.u32 	%p19, %r138, 248;
+	@%p19 bra 	$L__BB0_1;
+	bra.uni 	$L__BB0_4;
+$L__BB0_1:
+	.loc	1 39 40
+	setp.lt.u64 	%p13, %rd1, 50257;
+	mov.b32 	%r124, 0;
+	.loc	1 35 50
+	mov.u32 %r31, 0x0;
+	mov.u32 %r32, 0x0;
+	mov.u32 %r33, 0x0;
+	mov.u32 %r34, 0x0;
+	@%p1 ld.global.L1::evict_last.v4.b32 { %r31, %r32, %r33, %r34 }, [ %rd62 + 0 ];
+	@!%p1 mov.u32 %r31, %r124;
+	@!%p1 mov.u32 %r32, %r124;
+	@!%p1 mov.u32 %r33, %r124;
+	@!%p1 mov.u32 %r34, %r124;
+	mov.b32 	%r137, 883;
+	mov.u64 	%rd61, 1;
+	.loc	1 39 55
+	@%p13 bra 	$L__BB0_3;
+	mov.u64 	%rd41, assertMessage_0;
+	cvta.global.u64 	%rd42, %rd41;
+	mov.u64 	%rd43, assertFile_0;
+	cvta.global.u64 	%rd44, %rd43;
+	mov.u64 	%rd45, assertFunc_0;
+	cvta.global.u64 	%rd46, %rd45;
+	{ // callseq 2, 0
+	.reg .b32 temp_param_reg;
+	.param .b64 param0;
+	st.param.b64 	[param0+0], %rd42;
+	.param .b64 param1;
+	st.param.b64 	[param1+0], %rd44;
+	.param .b32 param2;
+	st.param.b32 	[param2+0], %r137;
+	.param .b64 param3;
+	st.param.b64 	[param3+0], %rd46;
+	.param .b64 param4;
+	st.param.b64 	[param4+0], %rd61;
+	call.uni 
+	__assertfail, 
+	(
+	param0, 
+	param1, 
+	param2, 
+	param3, 
+	param4
+	);
+	} // callseq 2
+	bra.uni 	$L__BB0_3;
+$L__BB0_4:
+	.loc	1 31 36
+	shr.u32 	%r85, %r1, 3;
+	or.b32  	%r86, %r7, %r85;
+	mad.lo.s32 	%r87, %r86, 12, %r6;
+	shl.b32 	%r88, %r87, 2;
+	mov.u32 	%r89, global_smem;
+	add.s32 	%r90, %r89, %r88;
+	st.shared.f32 	[%r90], %f136;
+	st.shared.f32 	[%r90+768], %f137;
+	st.shared.f32 	[%r90+1536], %f138;
+	st.shared.f32 	[%r90+2304], %f139;
+	bar.sync 	0;
+	mad.lo.s32 	%r91, %r4, 12, %r5;
+	shl.b32 	%r92, %r91, 2;
+	add.s32 	%r93, %r89, %r92;
+	ld.shared.v4.f32 	{%f68, %f69, %f70, %f71}, [%r93];
+$L__tmp3:
+	.loc	2 108 21
+	sub.f32 	%f72, %f145, %f144;
+	.loc	2 109 28
+	add.f32 	%f73, %f68, %f69;
+	.loc	2 110 39
+	setp.eq.f32 	%p20, %f73, 0f00000000;
+	.loc	2 110 60
+	mov.b32 	%r61, %f69;
+	mov.b32 	%r62, %f73;
+	div.full.f32 %r60, %r61, %r62;
+	mov.b32 	%f74, %r60;
+	.loc	2 110 49
+	selp.f32 	%f75, 0f00000000, %f74, %p20;
+	.loc	2 112 17
+	fma.rn.f32 	%f76, %f72, %f75, %f144;
+	.loc	2 113 15
+	add.f32 	%f77, %f140, %f141;
+	.loc	2 113 30
+	mul.f32 	%f78, %f72, %f72;
+	.loc	2 113 38
+	mul.f32 	%f79, %f78, %f68;
+	.loc	2 113 22
+	fma.rn.f32 	%f80, %f79, %f75, %f77;
+	.loc	2 108 21
+	sub.f32 	%f81, %f146, %f76;
+	.loc	2 109 28
+	add.f32 	%f82, %f70, %f73;
+	.loc	2 110 39
+	setp.eq.f32 	%p21, %f82, 0f00000000;
+	.loc	2 110 60
+	mov.b32 	%r65, %f82;
+	mov.b32 	%r64, %f70;
+	div.full.f32 %r63, %r64, %r65;
+	mov.b32 	%f83, %r63;
+	.loc	2 110 49
+	selp.f32 	%f84, 0f00000000, %f83, %p21;
+	.loc	2 112 17
+	fma.rn.f32 	%f85, %f84, %f81, %f76;
+	.loc	2 113 15
+	add.f32 	%f86, %f142, %f80;
+	.loc	2 113 30
+	mul.f32 	%f87, %f81, %f81;
+	.loc	2 113 38
+	mul.f32 	%f88, %f73, %f87;
+	.loc	2 113 22
+	fma.rn.f32 	%f89, %f84, %f88, %f86;
+	.loc	2 108 21
+	sub.f32 	%f90, %f147, %f85;
+	.loc	2 109 28
+	add.f32 	%f91, %f71, %f82;
+	.loc	2 110 39
+	setp.eq.f32 	%p22, %f91, 0f00000000;
+	.loc	2 110 60
+	mov.b32 	%r68, %f91;
+	mov.b32 	%r67, %f71;
+	div.full.f32 %r66, %r67, %r68;
+	mov.b32 	%f92, %r66;
+	.loc	2 110 49
+	selp.f32 	%f93, 0f00000000, %f92, %p22;
+	.loc	2 112 17
+	fma.rn.f32 	%f94, %f93, %f90, %f85;
+	.loc	2 113 15
+	add.f32 	%f95, %f143, %f89;
+	.loc	2 113 30
+	mul.f32 	%f96, %f90, %f90;
+	.loc	2 113 38
+	mul.f32 	%f97, %f82, %f96;
+	.loc	2 113 22
+	fma.rn.f32 	%f98, %f93, %f97, %f95;
+$L__tmp4:
+	.loc	2 120 46
+	mov.b32 	%r94, %f94;
+	shfl.sync.bfly.b32	%r95, %r94, 1, 31, -1;
+	mov.b32 	%f99, %r95;
+	mov.b32 	%r96, %f98;
+	shfl.sync.bfly.b32	%r97, %r96, 1, 31, -1;
+	mov.b32 	%f100, %r97;
+	shfl.sync.bfly.b32	%r70, %r68, 1, 31, -1;
+	mov.b32 	%f101, %r70;
+$L__tmp5:
+	.loc	2 108 21
+	sub.f32 	%f102, %f99, %f94;
+	.loc	2 109 28
+	add.f32 	%f103, %f91, %f101;
+	.loc	2 110 39
+	setp.eq.f32 	%p23, %f103, 0f00000000;
+	.loc	2 110 60
+	mov.b32 	%r71, %f103;
+	div.full.f32 %r69, %r70, %r71;
+	mov.b32 	%f104, %r69;
+	.loc	2 110 49
+	selp.f32 	%f105, 0f00000000, %f104, %p23;
+	.loc	2 112 17
+	fma.rn.f32 	%f37, %f105, %f102, %f94;
+	.loc	2 113 15
+	add.f32 	%f106, %f98, %f100;
+	.loc	2 113 30
+	mul.f32 	%f107, %f102, %f102;
+	.loc	2 113 38
+	mul.f32 	%f108, %f91, %f107;
+	.loc	2 113 22
+	fma.rn.f32 	%f109, %f105, %f108, %f106;
+$L__tmp6:
+	.loc	1 69 23
+	mov.b32 	%r73, %f109;
+	mov.b32 	%r74, 1132462080;
+	div.full.f32 %r72, %r73, %r74;
+	mov.b32 	%f110, %r72;
+	.loc	1 71 24
+	add.f32 	%f38, %f110, 0f3727C5AC;
+	.loc	1 55 36
+	shl.b32 	%r98, %r14, 14;
+	shl.b32 	%r99, %r2, 12;
+	or.b32  	%r100, %r98, %r99;
+	shl.b32 	%r101, %r3, 8;
+	or.b32  	%r102, %r100, %r101;
+	or.b32  	%r11, %r102, %r5;
+	add.s64 	%rd65, %rd16, %rd2;
+	mov.b32 	%r139, -8;
+	rsqrt.approx.ftz.f32 	%f123, %f38;
+	bra.uni 	$L__BB0_5;
+$L__BB0_7:
+	.loc	1 65 54
+	mov.u32 %r120, 0x0;
+	mov.u32 %r121, 0x0;
+	mov.u32 %r122, 0x0;
+	mov.u32 %r123, 0x0;
+	@%p1 ld.global.L1::evict_first.v4.b32 { %r120, %r121, %r122, %r123 }, [ %rd66 + 0 ];
+	@!%p1 mov.u32 %r120, %r124;
+	@!%p1 mov.u32 %r121, %r124;
+	@!%p1 mov.u32 %r122, %r124;
+	@!%p1 mov.u32 %r123, %r124;
+	mov.b32 	%f111, %r120;
+	mov.b32 	%f112, %r121;
+	mov.b32 	%f113, %r122;
+	mov.b32 	%f114, %r123;
+	.loc	1 66 24
+	add.f32 	%f115, %f39, %f111;
+	add.f32 	%f116, %f40, %f112;
+	add.f32 	%f117, %f41, %f113;
+	add.f32 	%f118, %f42, %f114;
+	.loc	1 67 24
+	sub.f32 	%f119, %f115, %f37;
+	sub.f32 	%f120, %f116, %f37;
+	sub.f32 	%f121, %f117, %f37;
+	sub.f32 	%f122, %f118, %f37;
+	.loc	1 73 24
+	mul.f32 	%f124, %f119, %f123;
+	mul.f32 	%f125, %f120, %f123;
+	mul.f32 	%f126, %f121, %f123;
+	mul.f32 	%f127, %f122, %f123;
+	.loc	1 74 24
+	mul.f32 	%f128, %f124, %f43;
+	mul.f32 	%f129, %f125, %f44;
+	mul.f32 	%f130, %f126, %f45;
+	mul.f32 	%f131, %f127, %f46;
+	.loc	1 55 36
+	add.s32 	%r139, %r139, 8;
+	.loc	1 76 29
+	add.s32 	%r134, %r139, %r11;
+	mul.wide.s32 	%rd60, %r134, 2;
+	add.s64 	%rd59, %rd17, %rd60;
+	.loc	1 76 52
+	mov.b32 	%r128, %f128;
+	cvt.rn.bf16.f32 %rs1, %r128;
+	mov.b32 	%r129, %f129;
+	cvt.rn.bf16.f32 %rs2, %r129;
+	mov.b32 	%r130, %f130;
+	cvt.rn.bf16.f32 %rs3, %r130;
+	mov.b32 	%r131, %f131;
+	cvt.rn.bf16.f32 %rs4, %r131;
+	mov.b32 	%r135, {%rs1, %rs2};
+	mov.b32 	%r136, {%rs3, %rs4};
+	@%p1 st.global.v2.b32 [ %rd59 + 0 ], { %r135, %r136 };
+	.loc	1 55 36
+	add.s64 	%rd66, %rd66, 32;
+	add.s64 	%rd65, %rd65, 32;
+	add.s64 	%rd64, %rd64, 32;
+	setp.lt.u32 	%p41, %r139, 248;
+	@%p41 bra 	$L__BB0_5;
+	bra.uni 	$L__BB0_8;
+$L__BB0_5:
+	.loc	1 59 51
+	mov.u32 %r103, 0x0;
+	mov.u32 %r104, 0x0;
+	mov.u32 %r105, 0x0;
+	mov.u32 %r106, 0x0;
+	@%p1 ld.global.L1::evict_last.v4.b32 { %r103, %r104, %r105, %r106 }, [ %rd64 + 0 ];
+	@!%p1 mov.u32 %r103, %r124;
+	@!%p1 mov.u32 %r104, %r124;
+	@!%p1 mov.u32 %r105, %r124;
+	@!%p1 mov.u32 %r106, %r124;
+	mov.b32 	%f39, %r103;
+	mov.b32 	%f40, %r104;
+	mov.b32 	%f41, %r105;
+	mov.b32 	%f42, %r106;
+	.loc	1 60 40
+	mov.u32 %r111, 0x0;
+	mov.u32 %r112, 0x0;
+	mov.u32 %r113, 0x0;
+	mov.u32 %r114, 0x0;
+	@%p1 ld.global.L1::evict_last.v4.b32 { %r111, %r112, %r113, %r114 }, [ %rd65 + 0 ];
+	@!%p1 mov.u32 %r111, %r124;
+	@!%p1 mov.u32 %r112, %r124;
+	@!%p1 mov.u32 %r113, %r124;
+	@!%p1 mov.u32 %r114, %r124;
+	mov.b32 	%f43, %r111;
+	mov.b32 	%f44, %r112;
+	mov.b32 	%f45, %r113;
+	mov.b32 	%f46, %r114;
+	.loc	1 64 57
+	@%p13 bra 	$L__BB0_7;
+	mov.u64 	%rd51, assertMessage_1;
+	cvta.global.u64 	%rd52, %rd51;
+	mov.u64 	%rd53, assertFile_1;
+	cvta.global.u64 	%rd54, %rd53;
+	mov.u64 	%rd55, assertFunc_1;
+	cvta.global.u64 	%rd56, %rd55;
+	{ // callseq 3, 0
+	.reg .b32 temp_param_reg;
+	.param .b64 param0;
+	st.param.b64 	[param0+0], %rd52;
+	.param .b64 param1;
+	st.param.b64 	[param1+0], %rd54;
+	.param .b32 param2;
+	st.param.b32 	[param2+0], %r137;
+	.param .b64 param3;
+	st.param.b64 	[param3+0], %rd56;
+	.param .b64 param4;
+	st.param.b64 	[param4+0], %rd61;
+	call.uni 
+	__assertfail, 
+	(
+	param0, 
+	param1, 
+	param2, 
+	param3, 
+	param4
+	);
+	} // callseq 3
+	bra.uni 	$L__BB0_7;
+$L__BB0_8:
+	.loc	1 55 4
+	ret;
+$L__tmp7:
+$L__func_end0:
+
+}
+	// .globl	__nv_rsqrtf
+.visible .func  (.param .b32 func_retval0) __nv_rsqrtf(
+	.param .b32 __nv_rsqrtf_param_0
+)
+{
+	.reg .f32 	%f<3>;
+$L__func_begin1:
+
+	ld.param.f32 	%f1, [__nv_rsqrtf_param_0];
+	rsqrt.approx.ftz.f32 	%f2, %f1;
+	st.param.f32 	[func_retval0+0], %f2;
+	ret;
+$L__func_end1:
+
+}
+	.file	1 "/tmp/torchinductor_root/gx/cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 298
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 103
+.b8 120
+.b8 53
+.b8 108
+.b8 120
+.b8 112
+.b8 117
+.b8 101
+.b8 120
+.b8 112
+.b8 105
+.b8 110
+.b8 100
+.b8 106
+.b8 52
+.b8 100
+.b8 115
+.b8 109
+.b8 106
+.b8 122
+.b8 53
+.b8 120
+.b8 52
+.b8 50
+.b8 117
+.b8 104
+.b8 121
+.b8 121
+.b8 55
+.b8 105
+.b8 115
+.b8 107
+.b8 101
+.b8 118
+.b8 113
+.b8 55
+.b8 111
+.b8 118
+.b8 122
+.b8 112
+.b8 119
+.b8 97
+.b8 103
+.b8 98
+.b8 51
+.b8 116
+.b8 53
+.b8 112
+.b8 111
+.b8 119
+.b8 106
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 103
+.b8 120
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 101
+.b8 54
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 101
+.b8 54
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp2
+.b8 2
+.b8 44
+.b8 38
+.b8 5
+.b32 125
+.b64 $L__tmp3
+.b64 $L__tmp6
+.b8 2
+.b8 50
+.b8 41
+.b8 4
+.b32 125
+.b64 $L__tmp3
+.b64 $L__tmp6
+.b8 2
+.b8 120
+.b8 46
+.b8 0
+.b8 4
+.b32 125
+.b64 $L__tmp4
+.b64 $L__tmp5
+.b8 2
+.b8 50
+.b8 41
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 302
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 101
+.b8 54
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 302
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.ttir b/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..4e0fc0ad287cdcd82b9bb9d7ed3fecc1ee2a070f
--- /dev/null
+++ b/.triton/dump/9f68cc707cb8f8bff3232abf59cbd9ec/triton_.ttir
@@ -0,0 +1,139 @@
+module {
+  tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<64x8xf32>
+    %c256_i32 = arith.constant 256 : i32
+    %c8_i32 = arith.constant 8 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_1 = arith.constant dense<256> : tensor<64x1xi64>
+    %cst_2 = arith.constant dense<0> : tensor<64x1xi64>
+    %cst_3 = arith.constant dense<50257> : tensor<64x1xi64>
+    %cst_4 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32>
+    %cst_5 = arith.constant dense<2.560000e+02> : tensor<64x1xf32>
+    %cst_6 = arith.constant dense<0.000000e+00> : tensor<1x8xf32>
+    %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
+    %cst_8 = arith.constant dense<256> : tensor<64x1xi32>
+    %cst_9 = arith.constant dense<256> : tensor<1x8xi32>
+    %cst_10 = arith.constant dense<512> : tensor<64x1xi32>
+    %c64_i32 = arith.constant 64 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c64_i32 : i32
+    %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
+    %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
+    %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
+    %5 = arith.addi %4, %3 : tensor<64x1xi32>
+    %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
+    %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
+    %8 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
+    %9 = tt.addptr %8, %5 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi32>
+    %10 = tt.load %9 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
+    %11 = arith.remsi %5, %cst_10 : tensor<64x1xi32>
+    %12 = arith.muli %11, %cst_8 : tensor<64x1xi32>
+    %13 = tt.broadcast %12 : (tensor<64x1xi32>) -> tensor<64x8xi32>
+    %14 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
+    %15 = arith.addi %10, %cst_3 : tensor<64x1xi64>
+    %16 = arith.cmpi slt, %10, %cst_2 : tensor<64x1xi64>
+    %17 = arith.select %16, %15, %10 : tensor<64x1xi1>, tensor<64x1xi64>
+    %18 = arith.cmpi sge, %17, %cst_2 : tensor<64x1xi64>
+    %19 = arith.cmpi slt, %17, %cst_3 : tensor<64x1xi64>
+    %20 = arith.andi %18, %19 : tensor<64x1xi1>
+    %21 = arith.muli %17, %cst_1 : tensor<64x1xi64>
+    %22 = tt.broadcast %21 : (tensor<64x1xi64>) -> tensor<64x8xi64>
+    %23 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
+    %24:3 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c8_i32 iter_args(%arg8 = %cst_7, %arg9 = %cst_7, %arg10 = %cst_7) -> (tensor<64x8xf32>, tensor<64x8xf32>, tensor<64x8xf32>)  : i32 {
+      %47 = tt.splat %arg7 : (i32) -> tensor<1x8xi32>
+      %48 = arith.addi %47, %7 : tensor<1x8xi32>
+      %49 = arith.cmpi slt, %48, %cst_9 : tensor<1x8xi32>
+      %50 = tt.broadcast %48 : (tensor<1x8xi32>) -> tensor<64x8xi32>
+      %51 = arith.addi %50, %13 : tensor<64x8xi32>
+      %52 = tt.addptr %14, %51 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
+      %53 = tt.broadcast %49 : (tensor<1x8xi1>) -> tensor<64x8xi1>
+      %54 = tt.load %52, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
+      tt.assert %20, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
+      %55 = arith.extsi %48 : tensor<1x8xi32> to tensor<1x8xi64>
+      %56 = tt.broadcast %55 : (tensor<1x8xi64>) -> tensor<64x8xi64>
+      %57 = arith.addi %56, %22 : tensor<64x8xi64>
+      %58 = tt.addptr %23, %57 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi64>
+      %59 = tt.load %58, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
+      %60 = arith.addf %59, %54 : tensor<64x8xf32>
+      %61 = arith.subf %60, %arg8 : tensor<64x8xf32>
+      %62 = arith.addf %arg10, %cst_0 : tensor<64x8xf32>
+      %63 = arith.divf %61, %62 : tensor<64x8xf32>
+      %64 = arith.addf %arg8, %63 : tensor<64x8xf32>
+      %65 = arith.subf %60, %64 : tensor<64x8xf32>
+      %66 = arith.mulf %61, %65 : tensor<64x8xf32>
+      %67 = arith.addf %arg9, %66 : tensor<64x8xf32>
+      %68 = arith.select %53, %64, %arg8 : tensor<64x8xi1>, tensor<64x8xf32>
+      %69 = arith.select %53, %67, %arg9 : tensor<64x8xi1>, tensor<64x8xf32>
+      %70 = arith.select %53, %62, %arg10 : tensor<64x8xi1>, tensor<64x8xf32>
+      scf.yield %68, %69, %70 : tensor<64x8xf32>, tensor<64x8xf32>, tensor<64x8xf32>
+    }
+    %25:3 = "tt.reduce"(%24#0, %24#1, %24#2) <{axis = 1 : i32}> ({
+    ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):
+      %47 = arith.subf %arg10, %arg7 : f32
+      %48 = arith.addf %arg9, %arg12 : f32
+      %49 = arith.cmpf oeq, %48, %cst : f32
+      %50 = arith.divf %arg12, %48 : f32
+      %51 = arith.select %49, %cst, %50 : f32
+      %52 = arith.mulf %47, %51 : f32
+      %53 = arith.addf %arg7, %52 : f32
+      %54 = arith.addf %arg8, %arg11 : f32
+      %55 = arith.mulf %47, %47 : f32
+      %56 = arith.mulf %55, %arg9 : f32
+      %57 = arith.mulf %56, %51 : f32
+      %58 = arith.addf %54, %57 : f32
+      tt.reduce.return %53, %58, %48 : f32, f32, f32
+    }) : (tensor<64x8xf32>, tensor<64x8xf32>, tensor<64x8xf32>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64xf32>)
+    %26 = tt.expand_dims %25#0 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
+    %27 = tt.expand_dims %25#1 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
+    %28 = arith.muli %11, %cst_8 : tensor<64x1xi32>
+    %29 = tt.broadcast %28 : (tensor<64x1xi32>) -> tensor<64x8xi32>
+    %30 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
+    %31 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>>
+    %32 = arith.addi %10, %cst_3 : tensor<64x1xi64>
+    %33 = arith.cmpi slt, %10, %cst_2 : tensor<64x1xi64>
+    %34 = arith.select %33, %32, %10 : tensor<64x1xi1>, tensor<64x1xi64>
+    %35 = arith.cmpi sge, %34, %cst_2 : tensor<64x1xi64>
+    %36 = arith.cmpi slt, %34, %cst_3 : tensor<64x1xi64>
+    %37 = arith.andi %35, %36 : tensor<64x1xi1>
+    %38 = arith.muli %34, %cst_1 : tensor<64x1xi64>
+    %39 = tt.broadcast %38 : (tensor<64x1xi64>) -> tensor<64x8xi64>
+    %40 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
+    %41 = tt.broadcast %26 : (tensor<64x1xf32>) -> tensor<64x8xf32>
+    %42 = arith.divf %27, %cst_5 : tensor<64x1xf32>
+    %43 = arith.addf %42, %cst_4 : tensor<64x1xf32>
+    %44 = arith.muli %5, %cst_8 : tensor<64x1xi32>
+    %45 = tt.broadcast %44 : (tensor<64x1xi32>) -> tensor<64x8xi32>
+    %46 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
+    scf.for %arg7 = %c0_i32 to %c256_i32 step %c8_i32  : i32 {
+      %47 = tt.splat %arg7 : (i32) -> tensor<1x8xi32>
+      %48 = arith.addi %47, %7 : tensor<1x8xi32>
+      %49 = arith.cmpi slt, %48, %cst_9 : tensor<1x8xi32>
+      %50 = tt.broadcast %48 : (tensor<1x8xi32>) -> tensor<64x8xi32>
+      %51 = arith.addi %50, %29 : tensor<64x8xi32>
+      %52 = tt.addptr %30, %51 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
+      %53 = tt.broadcast %49 : (tensor<1x8xi1>) -> tensor<64x8xi1>
+      %54 = tt.load %52, %53, %cst_7 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
+      %55 = tt.addptr %31, %48 : tensor<1x8x!tt.ptr<f32, 1>>, tensor<1x8xi32>
+      %56 = tt.load %55, %49, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x8xf32>
+      tt.assert %37, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1>
+      %57 = arith.extsi %48 : tensor<1x8xi32> to tensor<1x8xi64>
+      %58 = tt.broadcast %57 : (tensor<1x8xi64>) -> tensor<64x8xi64>
+      %59 = arith.addi %58, %39 : tensor<64x8xi64>
+      %60 = tt.addptr %40, %59 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi64>
+      %61 = tt.load %60, %53, %cst_7 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
+      %62 = arith.addf %61, %54 : tensor<64x8xf32>
+      %63 = arith.subf %62, %41 : tensor<64x8xf32>
+      %64 = tt.extern_elementwise %43 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32>) -> tensor<64x1xf32>
+      %65 = tt.broadcast %64 : (tensor<64x1xf32>) -> tensor<64x8xf32>
+      %66 = arith.mulf %63, %65 : tensor<64x8xf32>
+      %67 = tt.broadcast %56 : (tensor<1x8xf32>) -> tensor<64x8xf32>
+      %68 = arith.mulf %66, %67 : tensor<64x8xf32>
+      %69 = arith.addi %50, %45 : tensor<64x8xi32>
+      %70 = tt.addptr %46, %69 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi32>
+      %71 = arith.truncf %68 : tensor<64x8xf32> to tensor<64x8xbf16>
+      tt.store %70, %71, %53 {cache = 1 : i32, evict = 1 : i32} : tensor<64x8xbf16>
+    }
+    tt.return
+  }
+}
diff --git a/.triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.llir b/.triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..ce915ad1814bde3add5ae1e0d8adc047c22da18f
--- /dev/null
+++ b/.triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.llir
@@ -0,0 +1,53 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 { ; Each thread reads two adjacent 16-bit values from %0, converts them bf16 -> f32 and writes the two 32-bit results to %1 at the same element index. %2 is never referenced in this body.
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 ; thread index in x
+  %5 = shl i32 %4, 1, !dbg !8 ; tid * 2: two elements per thread
+  %6 = and i32 %5, 510, !dbg !8 ; even offset in 0..510 inside a 512-element tile
+  %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
+  %8 = shl i32 %7, 9, !dbg !10 ; ctaid.x * 512: first element of this block's tile
+  %9 = or i32 %8, %6, !dbg !11 ; low 9 bits of %8 are zero, so this OR combines base and offset like an add
+  %10 = sext i32 %9 to i64, !dbg !12
+  %11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !12 ; input address, indexed in 2-byte elements
+  %12 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13 ; zero the result, then one 32-bit load (two packed 16-bit values); the predicate operand is the constant true
+  %13 = trunc i32 %12 to i16, !dbg !13 ; low 16 bits of the loaded word
+  %extelt.offset = lshr i32 %12, 16, !dbg !13
+  %14 = trunc i32 %extelt.offset to i16, !dbg !13 ; high 16 bits of the loaded word
+  %15 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %13) #1, !dbg !14 ; bf16 -> f32 for the low half
+  %16 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %14) #1, !dbg !14 ; bf16 -> f32 for the high half
+  %17 = getelementptr float, ptr addrspace(1) %1, i64 %10, !dbg !15 ; output address, same element index but 4-byte elements
+  %18 = bitcast float %15 to i32, !dbg !16 ; reinterpret as i32 so the store asm can take "r" operands
+  %19 = bitcast float %16 to i32, !dbg !16
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %18, i32 %19, ptr addrspace(1) %17, i1 true) #1, !dbg !16 ; one two-word vector store; predicate is the constant true
+  ret void, !dbg !17
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "ck62k2xzbb657snfdowwanzszaij6qzw6vuc7cfidomjpkk6igcm.py", directory: "/tmp/torchinductor_root/k6")
+!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
+!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 36, scope: !5)
+!9 = !DILocation(line: 20, column: 28, scope: !5)
+!10 = !DILocation(line: 20, column: 33, scope: !5)
+!11 = !DILocation(line: 21, column: 23, scope: !5)
+!12 = !DILocation(line: 24, column: 30, scope: !5)
+!13 = !DILocation(line: 24, column: 35, scope: !5)
+!14 = !DILocation(line: 24, column: 44, scope: !5)
+!15 = !DILocation(line: 26, column: 25, scope: !5)
+!16 = !DILocation(line: 26, column: 36, scope: !5)
+!17 = !DILocation(line: 26, column: 4, scope: !5)
diff --git a/.triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ptx b/.triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..51500b602affefb96adfc3d3b1f0084310d0db0d
--- /dev/null
+++ b/.triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ptx
@@ -0,0 +1,296 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2de
+
+.visible .entry triton__0d1d2de( // Each thread loads one 32-bit word (two packed bf16 values) via param_0, converts both halves to f32 and stores them as a two-word vector via param_1.
+	.param .u64 triton__0d1d2de_param_0, // source base address (indexed below with a 2-byte stride)
+	.param .u64 triton__0d1d2de_param_1, // destination base address (indexed below with a 4-byte stride)
+	.param .u32 triton__0d1d2de_param_2 // never loaded in this body
+)
+.maxntid 256, 1, 1
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<3>;
+	.reg .b32 	%r<12>;
+	.reg .b64 	%rd<7>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd3, [triton__0d1d2de_param_0];
+	ld.param.u64 	%rd4, [triton__0d1d2de_param_1];
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r7, %tid.x;
+	shl.b32 	%r8, %r7, 1; // tid * 2: two elements per thread
+	and.b32  	%r9, %r8, 510; // even offset in 0..510
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	shl.b32 	%r10, %r1, 9; // ctaid.x * 512
+	.loc	1 21 23
+	or.b32  	%r11, %r10, %r9; // element index = block base | in-block offset
+	.loc	1 24 30
+	mul.wide.s32 	%rd5, %r11, 2; // byte offset for 2-byte elements
+	add.s64 	%rd1, %rd3, %rd5;
+	mov.pred 	%p1, -1; // predicate that is always true; guards the load and the store below
+	.loc	1 24 35
+	mov.u32 %r2, 0x0;
+	@%p1 ld.global.b32 { %r2 }, [ %rd1 + 0 ]; // one 32-bit load carrying two 16-bit values
+	cvt.u16.u32 	%rs1, %r2; // low 16 bits
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } // high 16 bits (the low half goes to the scratch register tmp)
+	.loc	1 24 44
+	cvt.f32.bf16 %r5, %rs1; // bf16 -> f32, result kept in a .b32 register
+	cvt.f32.bf16 %r6, %rs2;
+	.loc	1 26 25
+	mul.wide.s32 	%rd6, %r11, 4; // byte offset for 4-byte elements
+	add.s64 	%rd2, %rd4, %rd6;
+	.loc	1 26 36
+	@%p1 st.global.v2.b32 [ %rd2 + 0 ], { %r5, %r6 }; // store both converted values with one vector store
+	.loc	1 26 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+
+}
+	.file	1 "/tmp/torchinductor_root/k6/ck62k2xzbb657snfdowwanzszaij6qzw6vuc7cfidomjpkk6igcm.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 176
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 107
+.b8 54
+.b8 50
+.b8 107
+.b8 50
+.b8 120
+.b8 122
+.b8 98
+.b8 98
+.b8 54
+.b8 53
+.b8 55
+.b8 115
+.b8 110
+.b8 102
+.b8 100
+.b8 111
+.b8 119
+.b8 119
+.b8 97
+.b8 110
+.b8 122
+.b8 115
+.b8 122
+.b8 97
+.b8 105
+.b8 106
+.b8 54
+.b8 113
+.b8 122
+.b8 119
+.b8 54
+.b8 118
+.b8 117
+.b8 99
+.b8 55
+.b8 99
+.b8 102
+.b8 105
+.b8 100
+.b8 111
+.b8 109
+.b8 106
+.b8 112
+.b8 107
+.b8 107
+.b8 54
+.b8 105
+.b8 103
+.b8 99
+.b8 109
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 107
+.b8 54
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ttgir b/.triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..2844e7b30bcdd6d5fd4f8f7e5ee181d165252135
--- /dev/null
+++ b/.triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ttgir
@@ -0,0 +1,19 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> // 2 elements/thread * 32 threads/warp * 8 warps = 512 elements, matching the 512-wide tensors below
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // Copies a 512-element tile from %arg0 (bf16) to %arg1 (f32), widening each element; %arg2 is not used in the body.
+    %c512_i32 = arith.constant 512 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32 // first element index of this program's tile
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<512xi32, #blocked> // element indices of the tile; reused for both the load and the store
+    %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
+    %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked> // load without a mask operand
+    %8 = arith.extf %7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> // widen bf16 -> f32
+    %9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
+    %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
+    tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked> // store without a mask operand
+    tt.return
+  }
+}
diff --git a/.triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ttir b/.triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..50602f214a3439fdc49ae67bf8b80948941d601b
--- /dev/null
+++ b/.triton/dump/a37de85bdb85634924fdf498b7d8602b/triton_.ttir
@@ -0,0 +1,18 @@
+module {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // Copies a 512-element tile from %arg0 (bf16) to %arg1 (f32), widening each element; %arg2 is not used in the body.
+    %c512_i32 = arith.constant 512 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32 // first element index of this program's tile
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32>
+    %4 = arith.addi %3, %2 : tensor<512xi32> // element indices of the tile; reused for both the load and the store
+    %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
+    %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16> // load without a mask operand
+    %8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32> // widen bf16 -> f32
+    %9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
+    %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
+    tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32> // store without a mask operand
+    tt.return
+  }
+}
diff --git a/.triton/dump/a69784da01a97187168f22847465505f/triton_.ptx b/.triton/dump/a69784da01a97187168f22847465505f/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..4657ad2d45b6afaecb2ac0762c18a73ddfba83b4
--- /dev/null
+++ b/.triton/dump/a69784da01a97187168f22847465505f/triton_.ptx
@@ -0,0 +1,753 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2d3d4d5d6d7de8de
+.extern .shared .align 1 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
+
+.visible .entry triton__0d1d2d3d4d5d6d7de8de( // Per block: each of 64 threads sums 4 elements from param_1..3, the block reduces to a mean and an rsqrt of (mean squared deviation + epsilon), then writes the scaled result (bf16) via param_6 and the two per-block scalars via param_0 and param_5.
+	.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_0, // receives the rsqrt result, one f32 per block
+	.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_1, // input read as f32 (4-byte stride)
+	.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_2, // input read as bf16 (2-byte stride)
+	.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_3, // input read as bf16 (2-byte stride)
+	.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_4, // f32 multipliers indexed by in-block offset only (no ctaid term)
+	.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_5, // receives the mean, one f32 per block
+	.param .u64 triton__0d1d2d3d4d5d6d7de8de_param_6, // output written as bf16
+	.param .u32 triton__0d1d2d3d4d5d6d7de8de_param_7, // never loaded in this body
+	.param .u32 triton__0d1d2d3d4d5d6d7de8de_param_8 // never loaded in this body
+)
+.maxntid 64, 1, 1
+{
+	.reg .pred 	%p<28>;
+	.reg .b16 	%rs<13>;
+	.reg .b32 	%r<95>;
+	.reg .f32 	%f<78>;
+	.reg .b64 	%rd<19>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd8, [triton__0d1d2d3d4d5d6d7de8de_param_0];
+	ld.param.u64 	%rd9, [triton__0d1d2d3d4d5d6d7de8de_param_1];
+$L__tmp0:
+	.loc	1 26 26
+	mov.u32 	%r60, %tid.x;
+	and.b32  	%r61, %r60, 31; // lane index within the warp
+	ld.param.u64 	%rd10, [triton__0d1d2d3d4d5d6d7de8de_param_2];
+	ld.param.u64 	%rd11, [triton__0d1d2d3d4d5d6d7de8de_param_3];
+	ld.param.u64 	%rd12, [triton__0d1d2d3d4d5d6d7de8de_param_4];
+	and.b32  	%r62, %r60, 63; // thread index within the 64-thread block
+	ld.param.u64 	%rd13, [triton__0d1d2d3d4d5d6d7de8de_param_5];
+	shl.b32 	%r63, %r62, 2; // in-block element offset: 4 consecutive elements per thread
+	ld.param.u64 	%rd14, [triton__0d1d2d3d4d5d6d7de8de_param_6];
+	.loc	1 23 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 30 40
+	shl.b32 	%r64, %r1, 8; // ctaid.x * 256: first element of this block's slice
+	.loc	1 30 36
+	or.b32  	%r65, %r64, %r63; // global element index
+	.loc	1 30 30
+	mul.wide.s32 	%rd15, %r65, 4; // byte offset for 4-byte elements
+	add.s64 	%rd1, %rd9, %rd15;
+	mov.b32 	%r6, 0; // fallback value for the (never taken) @!%p1 moves
+	mov.pred 	%p1, -1; // always-true predicate; every @!%p1 instruction below is therefore inactive
+	.loc	1 30 46
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; // four f32 values from param_1
+	@!%p1 mov.u32 %r2, %r6;
+	@!%p1 mov.u32 %r3, %r6;
+	@!%p1 mov.u32 %r4, %r6;
+	@!%p1 mov.u32 %r5, %r6;
+	mov.b32 	%f1, %r4;
+	mov.b32 	%f2, %r5;
+	.loc	1 31 30
+	mul.wide.s32 	%rd16, %r65, 2; // byte offset for 2-byte elements; reused for param_2, param_3 and param_6
+	add.s64 	%rd2, %rd10, %rd16;
+	.loc	1 31 46
+	mov.u32 %r10, 0x0;
+	mov.u32 %r11, 0x0;
+	@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ]; // four bf16 values from param_2, packed two per word
+	@!%p1 mov.u32 %r10, %r6;
+	@!%p1 mov.u32 %r11, %r6;
+	cvt.u16.u32 	%rs1, %r10; // low half of each word
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; } // high half of each word
+	cvt.u16.u32 	%rs3, %r11;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
+	.loc	1 31 67
+	cvt.f32.bf16 %r14, %rs1; // widen bf16 -> f32
+	mov.b32 	%f3, %r14;
+	cvt.f32.bf16 %r15, %rs2;
+	mov.b32 	%f4, %r15;
+	cvt.f32.bf16 %r16, %rs3;
+	mov.b32 	%f5, %r16;
+	cvt.f32.bf16 %r17, %rs4;
+	mov.b32 	%f6, %r17;
+	.loc	1 32 30
+	add.s64 	%rd3, %rd11, %rd16;
+	.loc	1 32 46
+	mov.u32 %r18, 0x0;
+	mov.u32 %r19, 0x0;
+	@%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ]; // four bf16 values from param_3
+	@!%p1 mov.u32 %r18, %r6;
+	@!%p1 mov.u32 %r19, %r6;
+	cvt.u16.u32 	%rs5, %r18;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
+	cvt.u16.u32 	%rs7, %r19;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
+	.loc	1 32 67
+	cvt.f32.bf16 %r22, %rs5;
+	mov.b32 	%f7, %r22;
+	cvt.f32.bf16 %r23, %rs6;
+	mov.b32 	%f8, %r23;
+	cvt.f32.bf16 %r24, %rs7;
+	mov.b32 	%f9, %r24;
+	cvt.f32.bf16 %r25, %rs8;
+	mov.b32 	%f10, %r25;
+	.loc	1 33 31
+	mul.wide.u32 	%rd17, %r63, 4; // uses the in-block offset only, so every block reads the same param_4 range
+	add.s64 	%rd4, %rd12, %rd17;
+	.loc	1 33 36
+	mov.u32 %r26, 0x0;
+	mov.u32 %r27, 0x0;
+	mov.u32 %r28, 0x0;
+	mov.u32 %r29, 0x0;
+	@%p1 ld.global.L1::evict_last.v4.b32 { %r26, %r27, %r28, %r29 }, [ %rd4 + 0 ]; // four f32 multipliers from param_4
+	@!%p1 mov.u32 %r26, %r6;
+	@!%p1 mov.u32 %r27, %r6;
+	@!%p1 mov.u32 %r28, %r6;
+	@!%p1 mov.u32 %r29, %r6;
+	.loc	1 35 18
+	add.f32 	%f11, %f5, %f1; // param_2 value + param_1 value (elements 2 and 3)
+	add.f32 	%f12, %f6, %f2;
+	.loc	1 30 46
+	mov.b32 	%f13, %r3;
+	mov.b32 	%f14, %r2;
+	.loc	1 35 18
+	add.f32 	%f15, %f3, %f14; // same sum for elements 0 and 1
+	add.f32 	%f16, %f4, %f13;
+	.loc	1 37 18
+	add.f32 	%f17, %f16, %f8; // add the param_3 value: %f18, %f17, %f19, %f20 hold elements 0..3
+	add.f32 	%f18, %f15, %f7;
+	add.f32 	%f19, %f11, %f9;
+	add.f32 	%f20, %f12, %f10;
+$L__tmp1:
+	.loc	2 233 15
+	add.f32 	%f21, %f18, %f17; // per-thread partial sum of its four elements
+	add.f32 	%f22, %f21, %f19;
+	add.f32 	%f23, %f22, %f20;
+$L__tmp2:
+	.loc	2 243 36
+	mov.b32 	%r66, %f23;
+	shfl.sync.bfly.b32	%r67, %r66, 16, 31, -1; // butterfly exchange at lane offset 16; the rounds below use 8, 4, 2, 1 so every lane ends with its warp's sum
+	mov.b32 	%f24, %r67;
+$L__tmp3:
+	.loc	2 233 15
+	add.f32 	%f25, %f23, %f24;
+$L__tmp4:
+	.loc	2 243 36
+	mov.b32 	%r68, %f25;
+	shfl.sync.bfly.b32	%r69, %r68, 8, 31, -1;
+	mov.b32 	%f26, %r69;
+$L__tmp5:
+	.loc	2 233 15
+	add.f32 	%f27, %f25, %f26;
+$L__tmp6:
+	.loc	2 243 36
+	mov.b32 	%r70, %f27;
+	shfl.sync.bfly.b32	%r71, %r70, 4, 31, -1;
+	mov.b32 	%f28, %r71;
+$L__tmp7:
+	.loc	2 233 15
+	add.f32 	%f29, %f27, %f28;
+$L__tmp8:
+	.loc	2 243 36
+	mov.b32 	%r72, %f29;
+	shfl.sync.bfly.b32	%r73, %r72, 2, 31, -1;
+	mov.b32 	%f30, %r73;
+$L__tmp9:
+	.loc	2 233 15
+	add.f32 	%f31, %f29, %f30;
+$L__tmp10:
+	.loc	2 243 36
+	mov.b32 	%r74, %f31;
+	shfl.sync.bfly.b32	%r75, %r74, 1, 31, -1;
+	mov.b32 	%f32, %r75;
+$L__tmp11:
+	.loc	2 233 15
+	add.f32 	%f33, %f31, %f32;
+$L__tmp12:
+	.loc	2 243 36
+	setp.eq.s32 	%p17, %r61, 0; // true for lane 0 of each warp
+	shr.u32 	%r76, %r60, 3;
+	and.b32  	%r77, %r76, 4; // (tid >> 3) & 4: byte offset 0 when tid bit 5 is clear, 4 when it is set
+	mov.u32 	%r78, global_smem;
+	add.s32 	%r34, %r78, %r77;
+	mov.b32 	%r35, %f33;
+	@%p17 st.shared.b32 [ %r34 + 0 ], %r35; // lane 0 publishes its warp's sum to shared memory
+	bar.sync 	0;
+	setp.lt.s32 	%p18, %r60, 2; // tid < 2
+	shl.b32 	%r79, %r60, 2; // tid * 4 bytes
+	add.s32 	%r37, %r78, %r79;
+	@%p18 ld.shared.b32 %r36, [ %r37 + 0 ]; // threads 0 and 1 each pick up one per-warp sum
+	mov.b32 	%f34, %r36;
+	shfl.sync.bfly.b32	%r80, %r36, 1, 31, -1;
+	mov.b32 	%f35, %r80;
+$L__tmp13:
+	.loc	2 233 15
+	add.f32 	%f36, %f34, %f35;
+$L__tmp14:
+	.loc	2 243 36
+	and.b32  	%r81, %r60, 1;
+	setp.eq.b32 	%p26, %r81, 1;
+	not.pred 	%p27, %p26;
+	and.pred  	%p19, %p18, %p27; // tid < 2 and tid even
+	mov.b32 	%r39, %f36;
+	@%p19 st.shared.b32 [ %r37 + 0 ], %r39; // write the combined sum back to the first shared word
+	bar.sync 	0;
+	ld.shared.f32 	%f37, [global_smem]; // every thread reads the block-wide sum
+$L__tmp15:
+	.loc	3 8 15
+	add.f32 	%f38, %f37, 0f00000000; // adds 0.0
+$L__tmp16:
+	.loc	1 45 20
+	mov.b32 	%r41, %f38;
+	mov.b32 	%r42, 1132462080; // 0x43800000, the f32 bit pattern of 256.0
+	div.full.f32 %r59, %r41, %r42; // mean = sum / 256; %r59 is also what gets stored via param_5 at the end
+	mov.b32 	%f39, %r59;
+	.loc	1 46 19
+	sub.f32 	%f40, %f18, %f39; // deviations from the mean for elements 0..3 (%f40, %f41, %f42, %f43)
+	sub.f32 	%f41, %f17, %f39;
+	sub.f32 	%f42, %f19, %f39;
+	sub.f32 	%f43, %f20, %f39;
+	.loc	1 47 20
+	mul.f32 	%f44, %f41, %f41;
+$L__tmp17:
+	.loc	2 243 36
+	bar.sync 	0;
+$L__tmp18:
+	.loc	2 233 15
+	fma.rn.f32 	%f45, %f40, %f40, %f44; // per-thread sum of squared deviations
+	fma.rn.f32 	%f46, %f42, %f42, %f45;
+	fma.rn.f32 	%f47, %f43, %f43, %f46;
+$L__tmp19:
+	.loc	2 243 36
+	mov.b32 	%r82, %f47;
+	shfl.sync.bfly.b32	%r83, %r82, 16, 31, -1; // second reduction, same shuffle/shared-memory sequence as the first
+	mov.b32 	%f48, %r83;
+$L__tmp20:
+	.loc	2 233 15
+	add.f32 	%f49, %f47, %f48;
+$L__tmp21:
+	.loc	2 243 36
+	mov.b32 	%r84, %f49;
+	shfl.sync.bfly.b32	%r85, %r84, 8, 31, -1;
+	mov.b32 	%f50, %r85;
+$L__tmp22:
+	.loc	2 233 15
+	add.f32 	%f51, %f49, %f50;
+$L__tmp23:
+	.loc	2 243 36
+	mov.b32 	%r86, %f51;
+	shfl.sync.bfly.b32	%r87, %r86, 4, 31, -1;
+	mov.b32 	%f52, %r87;
+$L__tmp24:
+	.loc	2 233 15
+	add.f32 	%f53, %f51, %f52;
+$L__tmp25:
+	.loc	2 243 36
+	mov.b32 	%r88, %f53;
+	shfl.sync.bfly.b32	%r89, %r88, 2, 31, -1;
+	mov.b32 	%f54, %r89;
+$L__tmp26:
+	.loc	2 233 15
+	add.f32 	%f55, %f53, %f54;
+$L__tmp27:
+	.loc	2 243 36
+	mov.b32 	%r90, %f55;
+	shfl.sync.bfly.b32	%r91, %r90, 1, 31, -1;
+	mov.b32 	%f56, %r91;
+$L__tmp28:
+	.loc	2 233 15
+	add.f32 	%f57, %f55, %f56;
+$L__tmp29:
+	.loc	2 243 36
+	mov.b32 	%r44, %f57;
+	@%p17 st.shared.b32 [ %r34 + 0 ], %r44;
+	bar.sync 	0;
+	@%p18 ld.shared.b32 %r45, [ %r37 + 0 ];
+	mov.b32 	%f58, %r45;
+	shfl.sync.bfly.b32	%r92, %r45, 1, 31, -1;
+	mov.b32 	%f59, %r92;
+$L__tmp30:
+	.loc	2 233 15
+	add.f32 	%f60, %f58, %f59;
+$L__tmp31:
+	.loc	2 243 36
+	mov.b32 	%r48, %f60;
+	@%p19 st.shared.b32 [ %r37 + 0 ], %r48;
+	bar.sync 	0;
+	ld.shared.f32 	%f61, [global_smem]; // block-wide sum of squared deviations
+$L__tmp32:
+	.loc	3 8 15
+	add.f32 	%f62, %f61, 0f00000000;
+$L__tmp33:
+	.loc	1 52 20
+	mov.b32 	%r50, %f62;
+	div.full.f32 %r49, %r50, %r42; // divide by 256.0 again (%r42)
+	mov.b32 	%f63, %r49;
+	.loc	1 54 20
+	add.f32 	%f64, %f63, 0f3727C5AC; // add a small constant (about 1e-5) before the reciprocal square root
+	.loc	1 55 26
+	rsqrt.approx.ftz.f32 	%f65, %f64; // approximate 1/sqrt; stored via param_0 below
+	.loc	1 33 36
+	mov.b32 	%f66, %r29; // the four param_4 multipliers loaded earlier
+	mov.b32 	%f67, %r28;
+	mov.b32 	%f68, %r27;
+	mov.b32 	%f69, %r26;
+	.loc	1 57 20
+	mul.f32 	%f70, %f40, %f65; // deviation * rsqrt
+	mul.f32 	%f71, %f41, %f65;
+	mul.f32 	%f72, %f42, %f65;
+	mul.f32 	%f73, %f43, %f65;
+	.loc	1 58 20
+	mul.f32 	%f74, %f70, %f69; // ... * matching param_4 multiplier
+	mul.f32 	%f75, %f71, %f68;
+	mul.f32 	%f76, %f72, %f67;
+	mul.f32 	%f77, %f73, %f66;
+	.loc	1 60 4
+	bar.sync 	0;
+	.loc	1 61 28
+	mul.wide.s32 	%rd18, %r1, 4; // ctaid.x * 4 bytes: one f32 slot per block
+	add.s64 	%rd5, %rd8, %rd18;
+	.loc	1 61 40
+	setp.eq.s32 	%p23, %r62, 0; // only the thread with (tid & 63) == 0 writes the per-block scalars
+	mov.b32 	%r52, %f65;
+	@%p23 st.global.b32 [ %rd5 + 0 ], { %r52 }; // rsqrt value -> param_0[ctaid.x]
+	.loc	1 62 25
+	add.s64 	%rd6, %rd14, %rd16;
+	.loc	1 62 48
+	mov.b32 	%r53, %f74;
+	cvt.rn.bf16.f32 %rs9, %r53; // narrow f32 -> bf16 (cvt.rn rounding modifier)
+	mov.b32 	%r54, %f75;
+	cvt.rn.bf16.f32 %rs10, %r54;
+	mov.b32 	%r55, %f76;
+	cvt.rn.bf16.f32 %rs11, %r55;
+	mov.b32 	%r56, %f77;
+	cvt.rn.bf16.f32 %rs12, %r56;
+	mov.b32 	%r93, {%rs9, %rs10}; // pack two bf16 values per 32-bit word
+	mov.b32 	%r94, {%rs11, %rs12};
+	@%p1 st.global.v2.b32 [ %rd6 + 0 ], { %r93, %r94 }; // four bf16 results -> param_6
+	.loc	1 63 25
+	add.s64 	%rd7, %rd13, %rd18;
+	.loc	1 63 37
+	@%p23 st.global.b32 [ %rd7 + 0 ], { %r59 }; // mean -> param_5[ctaid.x]
+	.loc	1 63 4
+	ret;
+$L__tmp34:
+$L__func_end0:
+
+}
+	// .globl	__nv_rsqrtf
+.visible .func  (.param .b32 func_retval0) __nv_rsqrtf( // Returns rsqrt.approx.ftz.f32 of its single f32 argument. The entry kernel above issues the same instruction inline and contains no call to this function.
+	.param .b32 __nv_rsqrtf_param_0
+)
+{
+	.reg .f32 	%f<3>;
+$L__func_begin1:
+
+	ld.param.f32 	%f1, [__nv_rsqrtf_param_0]; // load the argument
+	rsqrt.approx.ftz.f32 	%f2, %f1; // approximate reciprocal square root with the .ftz modifier
+	st.param.f32 	[func_retval0+0], %f2; // write the return value
+	ret;
+$L__func_end1:
+
+}
+	.file	1 "/tmp/torchinductor_root/gy/cgyrkrvxykbeetcyfsjqxf2ni3kynf3x4qqckt4p2fyz7wetdsd2.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+	.file	3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 403
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 103
+.b8 121
+.b8 114
+.b8 107
+.b8 114
+.b8 118
+.b8 120
+.b8 121
+.b8 107
+.b8 98
+.b8 101
+.b8 101
+.b8 116
+.b8 99
+.b8 121
+.b8 102
+.b8 115
+.b8 106
+.b8 113
+.b8 120
+.b8 102
+.b8 50
+.b8 110
+.b8 105
+.b8 51
+.b8 107
+.b8 121
+.b8 110
+.b8 102
+.b8 51
+.b8 120
+.b8 52
+.b8 113
+.b8 113
+.b8 99
+.b8 107
+.b8 116
+.b8 52
+.b8 112
+.b8 50
+.b8 102
+.b8 121
+.b8 122
+.b8 55
+.b8 119
+.b8 101
+.b8 116
+.b8 100
+.b8 115
+.b8 100
+.b8 50
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 103
+.b8 121
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 101
+.b8 56
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 101
+.b8 56
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 42
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp15
+.b8 2
+.b8 42
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp15
+.b64 $L__tmp16
+.b8 3
+.b8 42
+.b8 45
+.b8 5
+.b32 125
+.b64 $L__tmp17
+.b64 $L__tmp32
+.b8 2
+.b8 50
+.b8 59
+.b8 4
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 50
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp32
+.b64 $L__tmp33
+.b8 3
+.b8 50
+.b8 45
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 407
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 101
+.b8 56
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 407
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/a69784da01a97187168f22847465505f/triton_.ttir b/.triton/dump/a69784da01a97187168f22847465505f/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..6ae21a1f3ac6cd11397e57d9d28b26f94f0b10f1
--- /dev/null
+++ b/.triton/dump/a69784da01a97187168f22847465505f/triton_.ttir
@@ -0,0 +1,71 @@
+module {
+  tt.func public @triton__0d1d2d3d4d5d6d7de8de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // Per program: v = arg1 + ext(arg2) + ext(arg3) over 256 elements; mean = sum(v)/256; r = rsqrt(sum((v-mean)^2)/256 + cst_2); stores r to arg0[pid], mean to arg5[pid] and bf16((v-mean)*r*arg4) to arg6. %arg7 and %arg8 are not referenced in the body.
+    %c256_i32 = arith.constant 256 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %cst_1 = arith.constant 2.560000e+02 : f32
+    %cst_2 = arith.constant 9.99999974E-6 : f32
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
+    %cst_4 = arith.constant dense<256> : tensor<256xi32>
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
+    %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32> // mask "index < 256"; the range above ends at 256, so it holds for every element
+    %3 = arith.muli %0, %c256_i32 : i32
+    %4 = tt.splat %3 : (i32) -> tensor<256xi32>
+    %5 = arith.addi %1, %4 : tensor<256xi32> // element indices pid*256 + [0, 256), shared by arg1, arg2, arg3 and arg6
+    %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
+    %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
+    %13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
+    %17 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32> // indexed by %1 (no program offset), so every program reads the same 256 values
+    %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
+    %20 = arith.addf %8, %12 : tensor<256xf32>
+    %21 = arith.addf %20, %16 : tensor<256xf32> // v = arg1 + arg2 + arg3, elementwise in f32
+    %22 = arith.select %2, %21, %cst_3 : tensor<256xi1>, tensor<256xf32>
+    %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
+    ^bb0(%arg9: f32, %arg10: f32):
+      %47 = arith.addf %arg9, %arg10 : f32
+      tt.reduce.return %47 : f32
+    }) : (tensor<256xf32>) -> f32 // sum of v
+    %24 = arith.addf %23, %cst_0 : f32
+    %25 = arith.divf %24, %cst_1 : f32 // mean = sum / 256
+    %26 = tt.splat %25 : (f32) -> tensor<1xf32>
+    %27 = tt.splat %25 : (f32) -> tensor<256xf32>
+    %28 = arith.subf %21, %27 : tensor<256xf32> // deviation from the mean
+    %29 = arith.mulf %28, %28 : tensor<256xf32>
+    %30 = arith.select %2, %29, %cst_3 : tensor<256xi1>, tensor<256xf32>
+    %31 = "tt.reduce"(%30) <{axis = 0 : i32}> ({
+    ^bb0(%arg9: f32, %arg10: f32):
+      %47 = arith.addf %arg9, %arg10 : f32
+      tt.reduce.return %47 : f32
+    }) : (tensor<256xf32>) -> f32 // sum of squared deviations
+    %32 = arith.addf %31, %cst_0 : f32
+    %33 = arith.divf %32, %cst_1 : f32 // mean squared deviation
+    %34 = arith.addf %33, %cst_2 : f32 // add the small constant %cst_2 before the reciprocal square root
+    %35 = tt.extern_elementwise %34 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+    %36 = tt.splat %35 : (f32) -> tensor<1xf32>
+    %37 = tt.splat %35 : (f32) -> tensor<256xf32>
+    %38 = arith.mulf %28, %37 : tensor<256xf32> // deviation * rsqrt
+    %39 = arith.mulf %38, %19 : tensor<256xf32> // ... * per-element multiplier from arg4
+    gpu.barrier
+    %40 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
+    %41 = tt.splat %40 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
+    tt.store %41, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32> // rsqrt value -> arg0[pid]
+    %42 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %43 = tt.addptr %42, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %44 = arith.truncf %39 : tensor<256xf32> to tensor<256xbf16>
+    tt.store %43, %44, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16> // bf16 result -> arg6
+    %45 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
+    %46 = tt.splat %45 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
+    tt.store %46, %26 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32> // mean -> arg5[pid]
+    tt.return
+  }
+}
diff --git a/.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.llir b/.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..13f5d26e3ad591ead425ba887ba6961019ad8d15
--- /dev/null
+++ b/.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.llir
@@ -0,0 +1,297 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+@global_smem = external addrspace(3) global [0 x i8]
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+define void @triton__0d1d2d3d4d5d6de7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i32 %6, i32 %7) local_unnamed_addr !dbg !7 { ; kernel: per block, adds the f32 values at %1 to the bf16 values at %2 (256 elements), subtracts their mean, scales by rsqrt(mean squared deviation + small constant) and by the f32 values at %3; writes bf16 results to %5, the rsqrt to %0 and the mean to %4. %6 and %7 are not referenced in this body.
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 ; %9 = thread index within the block
+  %10 = and i32 %9, 31, !dbg !10 ; %10 = tid & 31 (position inside a 32-thread group)
+  %11 = lshr i32 %9, 5, !dbg !10
+  %12 = and i32 %11, 1, !dbg !10 ; %12 = (tid >> 5) & 1 (which 32-thread group: 0 or 1)
+  %urem = and i32 %9, 63, !dbg !10
+  %13 = shl nuw nsw i32 %urem, 2, !dbg !10 ; %13 = (tid & 63) * 4: in-block offset of this thread's 4 elements
+  %14 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11 ; %14 = block index
+  %15 = shl i32 %14, 8, !dbg !12 ; block index * 256
+  %16 = or i32 %15, %13, !dbg !13 ; %16 = global element offset of this thread's first element
+  %17 = sext i32 %16 to i64, !dbg !14
+  %18 = getelementptr float, ptr addrspace(1) %1, i64 %17, !dbg !14
+  %19 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %18, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15 ; 4 x 32-bit global load from %1; every predicate is the constant true
+  %20 = extractvalue { i32, i32, i32, i32 } %19, 0, !dbg !15
+  %21 = extractvalue { i32, i32, i32, i32 } %19, 1, !dbg !15
+  %22 = extractvalue { i32, i32, i32, i32 } %19, 2, !dbg !15
+  %23 = extractvalue { i32, i32, i32, i32 } %19, 3, !dbg !15
+  %24 = bitcast i32 %20 to float, !dbg !15
+  %25 = bitcast i32 %21 to float, !dbg !15
+  %26 = bitcast i32 %22 to float, !dbg !15
+  %27 = bitcast i32 %23 to float, !dbg !15
+  %28 = getelementptr i16, ptr addrspace(1) %2, i64 %17, !dbg !16 ; same element offset, 16-bit element size
+  %29 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %28, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17 ; 2 x 32-bit global load from %2 = four packed 16-bit values
+  %30 = extractvalue { i32, i32 } %29, 0, !dbg !17
+  %31 = extractvalue { i32, i32 } %29, 1, !dbg !17
+  %32 = trunc i32 %30 to i16, !dbg !17 ; low half of the first word
+  %extelt.offset = lshr i32 %30, 16, !dbg !17
+  %33 = trunc i32 %extelt.offset to i16, !dbg !17 ; high half of the first word
+  %34 = trunc i32 %31 to i16, !dbg !17
+  %extelt.offset1 = lshr i32 %31, 16, !dbg !17
+  %35 = trunc i32 %extelt.offset1 to i16, !dbg !17
+  %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18 ; widen bf16 -> f32
+  %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #6, !dbg !18
+  %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #6, !dbg !18
+  %39 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %35) #6, !dbg !18
+  %40 = zext nneg i32 %13 to i64, !dbg !19
+  %41 = getelementptr float, ptr addrspace(1) %3, i64 %40, !dbg !19 ; %3 is indexed by the in-block offset only (no block index)
+  %42 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %41, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20 ; 4 x 32-bit global load from %3 with the L1::evict_last hint; unpacked after the rsqrt below
+  %43 = fadd float %36, %24, !dbg !21 ; elementwise sum of the two inputs (x0..x3)
+  %44 = fadd float %37, %25, !dbg !21
+  %45 = fadd float %38, %26, !dbg !21
+  %46 = fadd float %39, %27, !dbg !21
+  %47 = fadd float %43, %44, !dbg !22 ; per-thread partial sum of its 4 elements
+  %48 = fadd float %47, %45, !dbg !22
+  %49 = fadd float %48, %46, !dbg !22
+  %50 = bitcast float %49 to i32, !dbg !28
+  %51 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %50, i32 16, i32 31), !dbg !28 ; butterfly-shuffle reduction: offsets 16, 8, 4, 2, 1 combine the partial sums of a 32-thread group
+  %52 = bitcast i32 %51 to float, !dbg !28
+  %53 = fadd float %49, %52, !dbg !22
+  %54 = bitcast float %53 to i32, !dbg !28
+  %55 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %54, i32 8, i32 31), !dbg !28
+  %56 = bitcast i32 %55 to float, !dbg !28
+  %57 = fadd float %53, %56, !dbg !22
+  %58 = bitcast float %57 to i32, !dbg !28
+  %59 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %58, i32 4, i32 31), !dbg !28
+  %60 = bitcast i32 %59 to float, !dbg !28
+  %61 = fadd float %57, %60, !dbg !22
+  %62 = bitcast float %61 to i32, !dbg !28
+  %63 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %62, i32 2, i32 31), !dbg !28
+  %64 = bitcast i32 %63 to float, !dbg !28
+  %65 = fadd float %61, %64, !dbg !22
+  %66 = bitcast float %65 to i32, !dbg !28
+  %67 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %66, i32 1, i32 31), !dbg !28
+  %68 = bitcast i32 %67 to float, !dbg !28
+  %69 = fadd float %65, %68, !dbg !22 ; %69 = sum over the 32-thread group
+  %70 = icmp eq i32 %10, 0, !dbg !28 ; true for the first thread of each 32-thread group
+  %71 = zext nneg i32 %12 to i64, !dbg !28
+  %72 = getelementptr float, ptr addrspace(3) @global_smem, i64 %71, !dbg !28 ; shared-memory float slot selected by the group index
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %72, float %69, i1 %70) #6, !dbg !28 ; predicated on %70: publish the group sum
+  tail call void @llvm.nvvm.barrier0(), !dbg !28
+  %73 = icmp slt i32 %9, 2, !dbg !28 ; second stage is predicated on tid < 2
+  %74 = sext i32 %9 to i64, !dbg !28
+  %75 = getelementptr float, ptr addrspace(3) @global_smem, i64 %74, !dbg !28 ; shared-memory float slot selected by tid
+  %76 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %75, i1 %73) #6, !dbg !28
+  %77 = bitcast float %76 to i32, !dbg !28
+  %78 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %77, i32 1, i32 31), !dbg !28 ; exchange with the neighbouring thread to add the two group sums
+  %79 = bitcast i32 %78 to float, !dbg !28
+  %80 = fadd float %76, %79, !dbg !22
+  %81 = and i32 %9, 1, !dbg !28
+  %82 = icmp eq i32 %81, 0, !dbg !28
+  %83 = and i1 %73, %82, !dbg !28 ; tid < 2 and tid even
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %75, float %80, i1 %83) #6, !dbg !28 ; predicated on %83: write the combined sum back to shared memory
+  tail call void @llvm.nvvm.barrier0(), !dbg !28
+  %84 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !28 ; every thread reads the total from the first shared slot
+  %85 = fadd float %84, 0.000000e+00, !dbg !30
+  %86 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %85, float 2.560000e+02) #6, !dbg !34 ; %86 = sum / 256 (mean)
+  %87 = fsub float %43, %86, !dbg !35 ; deviations from the mean (d0..d3)
+  %88 = fsub float %44, %86, !dbg !35
+  %89 = fsub float %45, %86, !dbg !35
+  %90 = fsub float %46, %86, !dbg !35
+  %91 = fmul float %87, %87, !dbg !36 ; squared deviations
+  %92 = fmul float %88, %88, !dbg !36
+  %93 = fmul float %89, %89, !dbg !36
+  %94 = fmul float %90, %90, !dbg !36
+  tail call void @llvm.nvvm.barrier0(), !dbg !37 ; barrier before the shared slots are overwritten by the second reduction
+  %95 = fadd float %91, %92, !dbg !39 ; per-thread partial sum of squared deviations
+  %96 = fadd float %93, %95, !dbg !39
+  %97 = fadd float %94, %96, !dbg !39
+  %98 = bitcast float %97 to i32, !dbg !37
+  %99 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %98, i32 16, i32 31), !dbg !37 ; same butterfly-shuffle reduction as above, now over squared deviations
+  %100 = bitcast i32 %99 to float, !dbg !37
+  %101 = fadd float %97, %100, !dbg !39
+  %102 = bitcast float %101 to i32, !dbg !37
+  %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 8, i32 31), !dbg !37
+  %104 = bitcast i32 %103 to float, !dbg !37
+  %105 = fadd float %101, %104, !dbg !39
+  %106 = bitcast float %105 to i32, !dbg !37
+  %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 4, i32 31), !dbg !37
+  %108 = bitcast i32 %107 to float, !dbg !37
+  %109 = fadd float %105, %108, !dbg !39
+  %110 = bitcast float %109 to i32, !dbg !37
+  %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 2, i32 31), !dbg !37
+  %112 = bitcast i32 %111 to float, !dbg !37
+  %113 = fadd float %109, %112, !dbg !39
+  %114 = bitcast float %113 to i32, !dbg !37
+  %115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 1, i32 31), !dbg !37
+  %116 = bitcast i32 %115 to float, !dbg !37
+  %117 = fadd float %113, %116, !dbg !39
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %72, float %117, i1 %70) #6, !dbg !37 ; publish the group sum, reusing slot pointer %72 and predicate %70
+  tail call void @llvm.nvvm.barrier0(), !dbg !37
+  %118 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %75, i1 %73) #6, !dbg !37
+  %119 = bitcast float %118 to i32, !dbg !37
+  %120 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %119, i32 1, i32 31), !dbg !37
+  %121 = bitcast i32 %120 to float, !dbg !37
+  %122 = fadd float %118, %121, !dbg !39
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %75, float %122, i1 %83) #6, !dbg !37
+  tail call void @llvm.nvvm.barrier0(), !dbg !37
+  %123 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !37 ; total of squared deviations, read by every thread
+  %124 = fadd float %123, 0.000000e+00, !dbg !42
+  %125 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %124, float 2.560000e+02) #6, !dbg !44 ; mean squared deviation (sum / 256)
+  %126 = fadd float %125, 0x3EE4F8B580000000, !dbg !45 ; adds a small constant (hex-encoded float, approximately 1e-5) before the rsqrt
+  %127 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !46 ; query keyed by the string in @.str ("__CUDA_FTZ")
+  %.not.i = icmp eq i32 %127, 0, !dbg !46
+  br i1 %.not.i, label %130, label %128, !dbg !46 ; result 0 -> plain rsqrt (%130), non-zero -> ftz rsqrt (%128)
+
+128:                                              ; preds = %8
+  %129 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %126), !dbg !46
+  br label %__nv_rsqrtf.exit, !dbg !46
+
+130:                                              ; preds = %8
+  %131 = tail call float @llvm.nvvm.rsqrt.approx.f(float %126), !dbg !46
+  br label %__nv_rsqrtf.exit, !dbg !46
+
+__nv_rsqrtf.exit:                                 ; preds = %128, %130
+  %.0.i = phi float [ %129, %128 ], [ %131, %130 ], !dbg !46 ; %.0.i = approximate reciprocal square root from whichever branch ran
+  %132 = extractvalue { i32, i32, i32, i32 } %42, 3, !dbg !20 ; unpack the four f32 values loaded from %3 earlier
+  %133 = bitcast i32 %132 to float, !dbg !20
+  %134 = extractvalue { i32, i32, i32, i32 } %42, 2, !dbg !20
+  %135 = bitcast i32 %134 to float, !dbg !20
+  %136 = extractvalue { i32, i32, i32, i32 } %42, 1, !dbg !20
+  %137 = bitcast i32 %136 to float, !dbg !20
+  %138 = extractvalue { i32, i32, i32, i32 } %42, 0, !dbg !20
+  %139 = bitcast i32 %138 to float, !dbg !20
+  %140 = fmul float %87, %.0.i, !dbg !47 ; deviation * rsqrt
+  %141 = fmul float %88, %.0.i, !dbg !47
+  %142 = fmul float %89, %.0.i, !dbg !47
+  %143 = fmul float %90, %.0.i, !dbg !47
+  %144 = fmul float %140, %139, !dbg !48 ; times the matching value from %3
+  %145 = fmul float %141, %137, !dbg !48
+  %146 = fmul float %142, %135, !dbg !48
+  %147 = fmul float %143, %133, !dbg !48
+  tail call void @llvm.nvvm.barrier0(), !dbg !49
+  %148 = sext i32 %14 to i64, !dbg !50
+  %149 = getelementptr float, ptr addrspace(1) %0, i64 %148, !dbg !50 ; one f32 slot per block in %0
+  %150 = icmp eq i32 %urem, 0, !dbg !51 ; only the thread with (tid & 63) == 0 writes the per-block scalars
+  %151 = bitcast float %.0.i to i32, !dbg !51
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %151, ptr addrspace(1) %149, i1 %150) #6, !dbg !51 ; store the rsqrt value to %0[block]
+  %152 = getelementptr i16, ptr addrspace(1) %5, i64 %17, !dbg !52
+  %153 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %144) #6, !dbg !53 ; narrow f32 -> bf16 (cvt.rn)
+  %154 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %145) #6, !dbg !53
+  %155 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %146) #6, !dbg !53
+  %156 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %147) #6, !dbg !53
+  %157 = insertelement <2 x i16> undef, i16 %153, i64 0, !dbg !53 ; pack two bf16 values into one 32-bit word
+  %158 = insertelement <2 x i16> %157, i16 %154, i64 1, !dbg !53
+  %159 = bitcast <2 x i16> %158 to i32, !dbg !53
+  %160 = insertelement <2 x i16> undef, i16 %155, i64 0, !dbg !53
+  %161 = insertelement <2 x i16> %160, i16 %156, i64 1, !dbg !53
+  %162 = bitcast <2 x i16> %161 to i32, !dbg !53
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %159, i32 %162, ptr addrspace(1) %152, i1 true) #6, !dbg !53 ; store the four bf16 results to %5 (predicate is the constant true)
+  %163 = getelementptr float, ptr addrspace(1) %4, i64 %148, !dbg !54
+  %164 = bitcast float %86 to i32, !dbg !55
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %164, ptr addrspace(1) %163, i1 %150) #6, !dbg !55 ; store the mean to %4[block]
+  ret void, !dbg !56
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+; Function Attrs: alwaysinline nounwind
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { ; returns an approximate reciprocal square root of %x, choosing the ftz or non-ftz intrinsic from the @__nvvm_reflect result
+  %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 ; @.str holds the string "__CUDA_FTZ"
+  %.not = icmp eq i32 %1, 0
+  br i1 %.not, label %4, label %2 ; result 0 -> block 4 (non-ftz), non-zero -> block 2 (ftz)
+
+2:                                                ; preds = %0
+  %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
+  br label %6
+
+4:                                                ; preds = %0
+  %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
+  br label %6
+
+6:                                                ; preds = %4, %2
+  %.0 = phi float [ %3, %2 ], [ %5, %4 ] ; merge the value from whichever branch executed
+  ret float %.0
+}
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "cw35gljjtatzr2ztskwlxndj2nreiih7r3vg5rw4douyaxccqgij.py", directory: "/tmp/torchinductor_root/w3")
+!4 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2d3d4d5d6de7de, !"maxntidx", i32 64}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6de7de", linkageName: "triton__0d1d2d3d4d5d6de7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 26, column: 26, scope: !7)
+!11 = !DILocation(line: 23, column: 28, scope: !7)
+!12 = !DILocation(line: 30, column: 40, scope: !7)
+!13 = !DILocation(line: 30, column: 36, scope: !7)
+!14 = !DILocation(line: 30, column: 30, scope: !7)
+!15 = !DILocation(line: 30, column: 46, scope: !7)
+!16 = !DILocation(line: 31, column: 30, scope: !7)
+!17 = !DILocation(line: 31, column: 46, scope: !7)
+!18 = !DILocation(line: 31, column: 67, scope: !7)
+!19 = !DILocation(line: 32, column: 31, scope: !7)
+!20 = !DILocation(line: 32, column: 36, scope: !7)
+!21 = !DILocation(line: 34, column: 18, scope: !7)
+!22 = !DILocation(line: 233, column: 15, scope: !23, inlinedAt: !26)
+!23 = distinct !DILexicalBlockFile(scope: !25, file: !24, discriminator: 0)
+!24 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!25 = distinct !DILexicalBlockFile(scope: !7, file: !24, discriminator: 0)
+!26 = !DILocation(line: 243, column: 36, scope: !23, inlinedAt: !27)
+!27 = !DILocation(line: 39, column: 58, scope: !23)
+!28 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !29)
+!29 = !DILocation(line: 39, column: 58, scope: !25)
+!30 = !DILocation(line: 8, column: 15, scope: !31, inlinedAt: !33)
+!31 = distinct !DILexicalBlockFile(scope: !7, file: !32, discriminator: 0)
+!32 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!33 = !DILocation(line: 39, column: 45, scope: !31)
+!34 = !DILocation(line: 42, column: 20, scope: !7)
+!35 = !DILocation(line: 43, column: 19, scope: !7)
+!36 = !DILocation(line: 44, column: 20, scope: !7)
+!37 = !DILocation(line: 243, column: 36, scope: !25, inlinedAt: !38)
+!38 = !DILocation(line: 47, column: 59, scope: !25)
+!39 = !DILocation(line: 233, column: 15, scope: !23, inlinedAt: !40)
+!40 = !DILocation(line: 243, column: 36, scope: !23, inlinedAt: !41)
+!41 = !DILocation(line: 47, column: 59, scope: !23)
+!42 = !DILocation(line: 8, column: 15, scope: !31, inlinedAt: !43)
+!43 = !DILocation(line: 47, column: 45, scope: !31)
+!44 = !DILocation(line: 49, column: 20, scope: !7)
+!45 = !DILocation(line: 51, column: 20, scope: !7)
+!46 = !DILocation(line: 52, column: 26, scope: !7)
+!47 = !DILocation(line: 54, column: 20, scope: !7)
+!48 = !DILocation(line: 55, column: 20, scope: !7)
+!49 = !DILocation(line: 57, column: 4, scope: !7)
+!50 = !DILocation(line: 58, column: 28, scope: !7)
+!51 = !DILocation(line: 58, column: 40, scope: !7)
+!52 = !DILocation(line: 59, column: 25, scope: !7)
+!53 = !DILocation(line: 59, column: 48, scope: !7)
+!54 = !DILocation(line: 60, column: 25, scope: !7)
+!55 = !DILocation(line: 60, column: 37, scope: !7)
+!56 = !DILocation(line: 60, column: 4, scope: !7)
diff --git a/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.cubin b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..a83d510acd73d6117339630593ce8409d19caa76
Binary files /dev/null and b/.triton/dump/afaa4c3313357296e42c7fedd4f7e6ef/triton_.cubin differ
diff --git a/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.cubin b/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..4ffd31dfd228204640db4b3286667c288b8f7f79
Binary files /dev/null and b/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.cubin differ
diff --git a/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.ptx b/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..d40da3a1df187d05ea80aeb621ccae523925efce
--- /dev/null
+++ b/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.ptx
@@ -0,0 +1,295 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2de
+
+.visible .entry triton__0d1d2de( // kernel: each thread loads two f32 values via param_0, rounds them to bf16 and stores them via param_1
+	.param .u64 triton__0d1d2de_param_0, // source address (read with ld.global as 32-bit words)
+	.param .u64 triton__0d1d2de_param_1, // destination address (packed bf16 pairs)
+	.param .u32 triton__0d1d2de_param_2 // never loaded in this body
+)
+.maxntid 256, 1, 1
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<3>;
+	.reg .b32 	%r<13>;
+	.reg .b64 	%rd<7>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd3, [triton__0d1d2de_param_0];
+	ld.param.u64 	%rd4, [triton__0d1d2de_param_1];
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r7, %tid.x;
+	shl.b32 	%r8, %r7, 1; // tid * 2: two elements per thread
+	and.b32  	%r9, %r8, 510; // in-block element offset, even, 0..510
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	shl.b32 	%r10, %r1, 9; // block index * 512
+	.loc	1 21 23
+	or.b32  	%r11, %r10, %r9; // global element offset of this thread's first element
+	.loc	1 24 30
+	mul.wide.s32 	%rd5, %r11, 4; // byte offset for 4-byte source elements
+	add.s64 	%rd1, %rd3, %rd5;
+	mov.pred 	%p1, -1; // predicate is constant true: the load and store below always execute
+	.loc	1 24 35
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.v2.b32 { %r4, %r5 }, [ %rd1 + 0 ]; // load two 32-bit source values
+	.loc	1 26 25
+	mul.wide.s32 	%rd6, %r11, 2; // byte offset for 2-byte destination elements
+	add.s64 	%rd2, %rd4, %rd6;
+	.loc	1 26 36
+	cvt.rn.bf16.f32 %rs1, %r4; // narrow f32 -> bf16 (cvt.rn)
+	cvt.rn.bf16.f32 %rs2, %r5;
+	mov.b32 	%r12, {%rs1, %rs2}; // pack both bf16 values into one 32-bit register
+	@%p1 st.global.b32 [ %rd2 + 0 ], { %r12 }; // single 32-bit store of the packed pair
+	.loc	1 26 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+
+}
+	.file	1 "/tmp/torchinductor_root/ch/cch6kzmgbnoxqjgy3okxqs7sy2uz27atdhc4lkuwz5ajinexdurx.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 176
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 99
+.b8 104
+.b8 54
+.b8 107
+.b8 122
+.b8 109
+.b8 103
+.b8 98
+.b8 110
+.b8 111
+.b8 120
+.b8 113
+.b8 106
+.b8 103
+.b8 121
+.b8 51
+.b8 111
+.b8 107
+.b8 120
+.b8 113
+.b8 115
+.b8 55
+.b8 115
+.b8 121
+.b8 50
+.b8 117
+.b8 122
+.b8 50
+.b8 55
+.b8 97
+.b8 116
+.b8 100
+.b8 104
+.b8 99
+.b8 52
+.b8 108
+.b8 107
+.b8 117
+.b8 119
+.b8 122
+.b8 53
+.b8 97
+.b8 106
+.b8 105
+.b8 110
+.b8 101
+.b8 120
+.b8 100
+.b8 117
+.b8 114
+.b8 120
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 99
+.b8 104
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.ttgir b/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..9f2d830320a91c96caa653767fd2930a46d0fa5d
--- /dev/null
+++ b/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.ttgir
@@ -0,0 +1,19 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> // layout alias used by every tensor below: 2 elements per thread, 32 threads per warp, 8 warps
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { // copies 512 f32 elements per program from %arg0 to %arg1 as bf16; %arg2 is not used in the body
+    %c512_i32 = arith.constant 512 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32 // first element offset of this program: program_id * 512
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<512xi32, #blocked> // per-element offsets: program_id * 512 + [0, 512)
+    %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
+    %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xf32, #blocked> // load without a mask operand
+    %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
+    %9 = tt.addptr %8, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked> // same offsets applied to the bf16 destination
+    %10 = arith.truncf %7 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> // narrow f32 -> bf16
+    tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16, #blocked> // store without a mask operand
+    tt.return
+  }
+}
diff --git a/.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.llir b/.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..46f5931f0073de6802079a62695c2478beeb7c8c
--- /dev/null
+++ b/.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.llir
@@ -0,0 +1,374 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+@global_smem = external addrspace(3) global [0 x i8]
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+define void @triton__0d1d2d3d4d5d6d7d8d9d10de11de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, i32 %10, i32 %11) local_unnamed_addr !dbg !7 { ; Per CTA: sums five inputs (%1 as f32, %2-%5 as bf16) over a 256-element chunk, subtracts the chunk mean, scales by rsqrt(mean squared deviation + constant) and by %6; writes sums to %7, scaled values to %9, rsqrt to %0, mean to %8. Args %10/%11 are not referenced in the body.
+  %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 ; thread index within the CTA
+  %14 = and i32 %13, 31, !dbg !10 ; lane id within the warp (tid & 31)
+  %15 = lshr i32 %13, 5, !dbg !10
+  %16 = and i32 %15, 1, !dbg !10 ; warp index, masked to 0..1
+  %urem = and i32 %13, 63, !dbg !10
+  %17 = shl nuw nsw i32 %urem, 2, !dbg !10 ; this thread's first element offset within the chunk: (tid & 63) * 4
+  %18 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
+  %19 = shl i32 %18, 8, !dbg !12 ; chunk base = ctaid.x * 256
+  %20 = or i32 %19, %17, !dbg !13 ; global element index (base | in-chunk offset)
+  %21 = sext i32 %20 to i64, !dbg !14
+  %22 = getelementptr float, ptr addrspace(1) %1, i64 %21, !dbg !14 ; address of this thread's 4 consecutive f32 values in %1
+  %23 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %22, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
+  %24 = extractvalue { i32, i32, i32, i32 } %23, 0, !dbg !15
+  %25 = extractvalue { i32, i32, i32, i32 } %23, 1, !dbg !15
+  %26 = extractvalue { i32, i32, i32, i32 } %23, 2, !dbg !15
+  %27 = extractvalue { i32, i32, i32, i32 } %23, 3, !dbg !15
+  %28 = bitcast i32 %24 to float, !dbg !15 ; reinterpret the four loaded words as f32 (elements 0..3)
+  %29 = bitcast i32 %25 to float, !dbg !15
+  %30 = bitcast i32 %26 to float, !dbg !15
+  %31 = bitcast i32 %27 to float, !dbg !15
+  %32 = getelementptr i16, ptr addrspace(1) %2, i64 %21, !dbg !16 ; %2 is indexed in 16-bit elements; values are widened with cvt.f32.bf16 below
+  %33 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %32, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
+  %34 = extractvalue { i32, i32 } %33, 0, !dbg !17
+  %35 = extractvalue { i32, i32 } %33, 1, !dbg !17
+  %36 = trunc i32 %34 to i16, !dbg !17 ; low 16 bits of word 0 (element 0)
+  %extelt.offset = lshr i32 %34, 16, !dbg !17
+  %37 = trunc i32 %extelt.offset to i16, !dbg !17 ; high 16 bits of word 0 (element 1)
+  %38 = trunc i32 %35 to i16, !dbg !17 ; low 16 bits of word 1 (element 2)
+  %extelt.offset1 = lshr i32 %35, 16, !dbg !17
+  %39 = trunc i32 %extelt.offset1 to i16, !dbg !17 ; high 16 bits of word 1 (element 3)
+  %40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #6, !dbg !18
+  %41 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %37) #6, !dbg !18
+  %42 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %38) #6, !dbg !18
+  %43 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %39) #6, !dbg !18
+  %44 = getelementptr i16, ptr addrspace(1) %3, i64 %21, !dbg !19 ; same load / unpack / convert sequence for %3
+  %45 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %44, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
+  %46 = extractvalue { i32, i32 } %45, 0, !dbg !20
+  %47 = extractvalue { i32, i32 } %45, 1, !dbg !20
+  %48 = trunc i32 %46 to i16, !dbg !20
+  %extelt.offset2 = lshr i32 %46, 16, !dbg !20
+  %49 = trunc i32 %extelt.offset2 to i16, !dbg !20
+  %50 = trunc i32 %47 to i16, !dbg !20
+  %extelt.offset3 = lshr i32 %47, 16, !dbg !20
+  %51 = trunc i32 %extelt.offset3 to i16, !dbg !20
+  %52 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %48) #6, !dbg !21
+  %53 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #6, !dbg !21
+  %54 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %50) #6, !dbg !21
+  %55 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %51) #6, !dbg !21
+  %56 = getelementptr i16, ptr addrspace(1) %4, i64 %21, !dbg !22 ; same load / unpack / convert sequence for %4
+  %57 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %56, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
+  %58 = extractvalue { i32, i32 } %57, 0, !dbg !23
+  %59 = extractvalue { i32, i32 } %57, 1, !dbg !23
+  %60 = trunc i32 %58 to i16, !dbg !23
+  %extelt.offset4 = lshr i32 %58, 16, !dbg !23
+  %61 = trunc i32 %extelt.offset4 to i16, !dbg !23
+  %62 = trunc i32 %59 to i16, !dbg !23
+  %extelt.offset5 = lshr i32 %59, 16, !dbg !23
+  %63 = trunc i32 %extelt.offset5 to i16, !dbg !23
+  %64 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %60) #6, !dbg !24
+  %65 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %61) #6, !dbg !24
+  %66 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %62) #6, !dbg !24
+  %67 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %63) #6, !dbg !24
+  %68 = getelementptr i16, ptr addrspace(1) %5, i64 %21, !dbg !25 ; same load / unpack / convert sequence for %5
+  %69 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %68, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !26
+  %70 = extractvalue { i32, i32 } %69, 0, !dbg !26
+  %71 = extractvalue { i32, i32 } %69, 1, !dbg !26
+  %72 = trunc i32 %70 to i16, !dbg !26
+  %extelt.offset6 = lshr i32 %70, 16, !dbg !26
+  %73 = trunc i32 %extelt.offset6 to i16, !dbg !26
+  %74 = trunc i32 %71 to i16, !dbg !26
+  %extelt.offset7 = lshr i32 %71, 16, !dbg !26
+  %75 = trunc i32 %extelt.offset7 to i16, !dbg !26
+  %76 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %72) #6, !dbg !27
+  %77 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #6, !dbg !27
+  %78 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %74) #6, !dbg !27
+  %79 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %75) #6, !dbg !27
+  %80 = zext nneg i32 %17 to i64, !dbg !28 ; index uses only the in-chunk offset, so it does not depend on ctaid.x
+  %81 = getelementptr float, ptr addrspace(1) %6, i64 %80, !dbg !28 ; the four %6 values loaded next are unpacked later as %193-%200
+  %82 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %81, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
+  %83 = fadd float %40, %28, !dbg !30 ; begin summing the five inputs per element (mix of scalar and <2 x float> adds)
+  %84 = fadd float %41, %29, !dbg !30
+  %85 = fadd float %42, %30, !dbg !30
+  %86 = fadd float %83, %52, !dbg !31
+  %87 = fadd float %84, %53, !dbg !31
+  %88 = fadd float %85, %54, !dbg !31
+  %89 = fadd float %87, %65, !dbg !32
+  %90 = fadd float %88, %66, !dbg !32
+  %91 = fadd float %89, %77, !dbg !33 ; %91 = complete five-input sum for element 1
+  %92 = fadd float %90, %78, !dbg !33 ; %92 = complete five-input sum for element 2
+  %93 = insertelement <2 x float> poison, float %86, i64 0, !dbg !32 ; vector lane 0 carries element 0's running sum
+  %94 = insertelement <2 x float> %93, float %43, i64 1, !dbg !32 ; vector lane 1 carries element 3's running sum
+  %95 = insertelement <2 x float> poison, float %64, i64 0, !dbg !32
+  %96 = insertelement <2 x float> %95, float %31, i64 1, !dbg !32
+  %97 = fadd <2 x float> %94, %96, !dbg !32
+  %98 = insertelement <2 x float> poison, float %76, i64 0, !dbg !33
+  %99 = insertelement <2 x float> %98, float %55, i64 1, !dbg !33
+  %100 = fadd <2 x float> %97, %99, !dbg !33 ; %100[0] = complete five-input sum for element 0
+  %101 = insertelement <2 x float> poison, float %91, i64 0, !dbg !34 ; from here lane 0 also folds in elements 1 and 2 while lane 1 finishes element 3
+  %102 = insertelement <2 x float> %101, float %67, i64 1, !dbg !34
+  %103 = fadd <2 x float> %100, %102, !dbg !34
+  %104 = insertelement <2 x float> poison, float %92, i64 0, !dbg !34
+  %105 = insertelement <2 x float> %104, float %79, i64 1, !dbg !34
+  %106 = fadd <2 x float> %103, %105, !dbg !34
+  %107 = extractelement <2 x float> %106, i64 0, !dbg !34 ; sum of elements 0, 1 and 2
+  %108 = extractelement <2 x float> %106, i64 1, !dbg !34 ; %108 = complete five-input sum for element 3
+  %109 = fadd float %107, %108, !dbg !34 ; this thread's partial sum over its 4 elements
+  %110 = bitcast float %109 to i32, !dbg !40
+  %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 16, i32 31), !dbg !40 ; butterfly shuffle reduction across the warp (xor offsets 16, 8, 4, 2, 1)
+  %112 = bitcast i32 %111 to float, !dbg !40
+  %113 = fadd float %109, %112, !dbg !34
+  %114 = bitcast float %113 to i32, !dbg !40
+  %115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 8, i32 31), !dbg !40
+  %116 = bitcast i32 %115 to float, !dbg !40
+  %117 = fadd float %113, %116, !dbg !34
+  %118 = bitcast float %117 to i32, !dbg !40
+  %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 4, i32 31), !dbg !40
+  %120 = bitcast i32 %119 to float, !dbg !40
+  %121 = fadd float %117, %120, !dbg !34
+  %122 = bitcast float %121 to i32, !dbg !40
+  %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 2, i32 31), !dbg !40
+  %124 = bitcast i32 %123 to float, !dbg !40
+  %125 = fadd float %121, %124, !dbg !34
+  %126 = bitcast float %125 to i32, !dbg !40
+  %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 1, i32 31), !dbg !40
+  %128 = bitcast i32 %127 to float, !dbg !40
+  %129 = fadd float %125, %128, !dbg !34 ; warp-wide sum
+  %130 = icmp eq i32 %14, 0, !dbg !40 ; predicate: lane 0 of each warp
+  %131 = zext nneg i32 %16 to i64, !dbg !40
+  %132 = getelementptr float, ptr addrspace(3) @global_smem, i64 %131, !dbg !40 ; shared-memory slot indexed by warp index
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %132, float %129, i1 %130) #6, !dbg !40
+  tail call void @llvm.nvvm.barrier0(), !dbg !40 ; CTA barrier after the per-warp partials are written
+  %133 = icmp slt i32 %13, 2, !dbg !40 ; predicate: only threads 0 and 1 read back a partial
+  %134 = sext i32 %13 to i64, !dbg !40
+  %135 = getelementptr float, ptr addrspace(3) @global_smem, i64 %134, !dbg !40
+  %136 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %135, i1 %133) #6, !dbg !40
+  %137 = bitcast float %136 to i32, !dbg !40
+  %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 1, i32 31), !dbg !40 ; exchange with the neighbouring lane to combine the two warp partials
+  %139 = bitcast i32 %138 to float, !dbg !40
+  %140 = fadd float %136, %139, !dbg !34
+  %141 = and i32 %13, 1, !dbg !40
+  %142 = icmp eq i32 %141, 0, !dbg !40
+  %143 = and i1 %133, %142, !dbg !40 ; predicate: tid < 2 and tid even
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %135, float %140, i1 %143) #6, !dbg !40
+  tail call void @llvm.nvvm.barrier0(), !dbg !40
+  %144 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !40 ; reduced total read from the first shared slot
+  %145 = fadd float %144, 0.000000e+00, !dbg !42
+  %146 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %145, float 2.560000e+02) #6, !dbg !46 ; mean = total / 256
+  %147 = extractelement <2 x float> %100, i64 0, !dbg !47 ; element 0's five-input sum
+  %148 = fsub float %147, %146, !dbg !47 ; deviations from the mean for elements 0..3
+  %149 = fsub float %91, %146, !dbg !47
+  %150 = fsub float %92, %146, !dbg !47
+  %151 = fsub float %108, %146, !dbg !47
+  %152 = fmul float %148, %148, !dbg !48 ; squared deviations
+  %153 = fmul float %149, %149, !dbg !48
+  %154 = fmul float %150, %150, !dbg !48
+  %155 = fmul float %151, %151, !dbg !48
+  tail call void @llvm.nvvm.barrier0(), !dbg !49 ; presumably guards reuse of the shared slots by the second reduction
+  %156 = fadd float %152, %153, !dbg !51
+  %157 = fadd float %154, %156, !dbg !51
+  %158 = fadd float %155, %157, !dbg !51 ; this thread's sum of squared deviations
+  %159 = bitcast float %158 to i32, !dbg !49
+  %160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 16, i32 31), !dbg !49 ; second warp butterfly reduction, same shape as the first
+  %161 = bitcast i32 %160 to float, !dbg !49
+  %162 = fadd float %158, %161, !dbg !51
+  %163 = bitcast float %162 to i32, !dbg !49
+  %164 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %163, i32 8, i32 31), !dbg !49
+  %165 = bitcast i32 %164 to float, !dbg !49
+  %166 = fadd float %162, %165, !dbg !51
+  %167 = bitcast float %166 to i32, !dbg !49
+  %168 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %167, i32 4, i32 31), !dbg !49
+  %169 = bitcast i32 %168 to float, !dbg !49
+  %170 = fadd float %166, %169, !dbg !51
+  %171 = bitcast float %170 to i32, !dbg !49
+  %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 2, i32 31), !dbg !49
+  %173 = bitcast i32 %172 to float, !dbg !49
+  %174 = fadd float %170, %173, !dbg !51
+  %175 = bitcast float %174 to i32, !dbg !49
+  %176 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 1, i32 31), !dbg !49
+  %177 = bitcast i32 %176 to float, !dbg !49
+  %178 = fadd float %174, %177, !dbg !51
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %132, float %178, i1 %130) #6, !dbg !49
+  tail call void @llvm.nvvm.barrier0(), !dbg !49
+  %179 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %135, i1 %133) #6, !dbg !49
+  %180 = bitcast float %179 to i32, !dbg !49
+  %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 1, i32 31), !dbg !49
+  %182 = bitcast i32 %181 to float, !dbg !49
+  %183 = fadd float %179, %182, !dbg !51
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %135, float %183, i1 %143) #6, !dbg !49
+  tail call void @llvm.nvvm.barrier0(), !dbg !49
+  %184 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !49 ; total of squared deviations
+  %185 = fadd float %184, 0.000000e+00, !dbg !54
+  %186 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %185, float 2.560000e+02) #6, !dbg !56 ; mean squared deviation = total / 256
+  %187 = fadd float %186, 0x3EE4F8B580000000, !dbg !57 ; adds a small constant (approximately 1e-5; presumably an epsilon)
+  %188 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58 ; query "__CUDA_FTZ" to choose the rsqrt variant
+  %.not.i = icmp eq i32 %188, 0, !dbg !58
+  br i1 %.not.i, label %191, label %189, !dbg !58 ; zero -> non-ftz path (%191), nonzero -> ftz path (%189)
+
+189:                                              ; preds = %12
+  %190 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %187), !dbg !58
+  br label %__nv_rsqrtf.exit, !dbg !58
+
+191:                                              ; preds = %12
+  %192 = tail call float @llvm.nvvm.rsqrt.approx.f(float %187), !dbg !58
+  br label %__nv_rsqrtf.exit, !dbg !58
+
+__nv_rsqrtf.exit:                                 ; preds = %189, %191
+  %.0.i = phi float [ %190, %189 ], [ %192, %191 ], !dbg !58 ; approximate reciprocal square root of %187
+  %193 = extractvalue { i32, i32, i32, i32 } %82, 3, !dbg !29 ; unpack the four %6 values loaded earlier (component 3 first)
+  %194 = bitcast i32 %193 to float, !dbg !29
+  %195 = extractvalue { i32, i32, i32, i32 } %82, 2, !dbg !29
+  %196 = bitcast i32 %195 to float, !dbg !29
+  %197 = extractvalue { i32, i32, i32, i32 } %82, 1, !dbg !29
+  %198 = bitcast i32 %197 to float, !dbg !29
+  %199 = extractvalue { i32, i32, i32, i32 } %82, 0, !dbg !29
+  %200 = bitcast i32 %199 to float, !dbg !29
+  %201 = fmul float %148, %.0.i, !dbg !59 ; scale each deviation by the reciprocal square root
+  %202 = fmul float %149, %.0.i, !dbg !59
+  %203 = fmul float %150, %.0.i, !dbg !59
+  %204 = fmul float %151, %.0.i, !dbg !59
+  %205 = fmul float %201, %200, !dbg !60 ; multiply by the %6 value at the same element position
+  %206 = fmul float %202, %198, !dbg !60
+  %207 = fmul float %203, %196, !dbg !60
+  %208 = fmul float %204, %194, !dbg !60
+  %209 = getelementptr float, ptr addrspace(1) %7, i64 %21, !dbg !61 ; %7 receives the un-normalized five-input sums
+  %210 = bitcast float %147 to i32, !dbg !62
+  %211 = bitcast float %91 to i32, !dbg !62
+  %212 = bitcast float %92 to i32, !dbg !62
+  %213 = bitcast float %108 to i32, !dbg !62
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %210, i32 %211, i32 %212, i32 %213, ptr addrspace(1) %209, i1 true) #6, !dbg !62
+  tail call void @llvm.nvvm.barrier0(), !dbg !63
+  %214 = sext i32 %18 to i64, !dbg !64
+  %215 = getelementptr float, ptr addrspace(1) %0, i64 %214, !dbg !64 ; %0 is indexed by ctaid.x (one float per CTA)
+  %216 = icmp eq i32 %urem, 0, !dbg !65 ; predicate: (tid & 63) == 0
+  %217 = bitcast float %.0.i to i32, !dbg !65 ; the reciprocal square root is what gets stored to %0
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %217, ptr addrspace(1) %215, i1 %216) #6, !dbg !65
+  %218 = getelementptr float, ptr addrspace(1) %9, i64 %21, !dbg !66 ; %9 receives the normalized, scaled values
+  %219 = bitcast float %205 to i32, !dbg !67
+  %220 = bitcast float %206 to i32, !dbg !67
+  %221 = bitcast float %207 to i32, !dbg !67
+  %222 = bitcast float %208 to i32, !dbg !67
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %219, i32 %220, i32 %221, i32 %222, ptr addrspace(1) %218, i1 true) #6, !dbg !67
+  %223 = getelementptr float, ptr addrspace(1) %8, i64 %214, !dbg !68 ; %8 is indexed by ctaid.x (one float per CTA)
+  %224 = bitcast float %146 to i32, !dbg !69 ; the mean is what gets stored to %8
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %224, ptr addrspace(1) %223, i1 %216) #6, !dbg !69
+  ret void, !dbg !70
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+; Function Attrs: alwaysinline nounwind
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { ; Returns an approximate reciprocal square root of %x, choosing the ftz or non-ftz intrinsic from the "__CUDA_FTZ" reflect query.
+  %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 ; @.str holds "__CUDA_FTZ"
+  %.not = icmp eq i32 %1, 0
+  br i1 %.not, label %4, label %2 ; zero -> block %4 (non-ftz), nonzero -> block %2 (ftz)
+
+2:                                                ; preds = %0
+  %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
+  br label %6
+
+4:                                                ; preds = %0
+  %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
+  br label %6
+
+6:                                                ; preds = %4, %2
+  %.0 = phi float [ %3, %2 ], [ %5, %4 ] ; result of whichever path ran
+  ret float %.0
+}
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "csf6zcjhrl2sjepofkaaj2rwyu4vq322pi5ukcu37oynjbso2i4g.py", directory: "/tmp/torchinductor_root/sf")
+!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10de11de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10de11de, !"maxntidx", i32 64}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8d9d10de11de", linkageName: "triton__0d1d2d3d4d5d6d7d8d9d10de11de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 26, column: 26, scope: !7)
+!11 = !DILocation(line: 23, column: 28, scope: !7)
+!12 = !DILocation(line: 30, column: 40, scope: !7)
+!13 = !DILocation(line: 30, column: 36, scope: !7)
+!14 = !DILocation(line: 30, column: 30, scope: !7)
+!15 = !DILocation(line: 30, column: 46, scope: !7)
+!16 = !DILocation(line: 31, column: 30, scope: !7)
+!17 = !DILocation(line: 31, column: 46, scope: !7)
+!18 = !DILocation(line: 31, column: 67, scope: !7)
+!19 = !DILocation(line: 32, column: 30, scope: !7)
+!20 = !DILocation(line: 32, column: 46, scope: !7)
+!21 = !DILocation(line: 32, column: 67, scope: !7)
+!22 = !DILocation(line: 33, column: 30, scope: !7)
+!23 = !DILocation(line: 33, column: 46, scope: !7)
+!24 = !DILocation(line: 33, column: 67, scope: !7)
+!25 = !DILocation(line: 34, column: 31, scope: !7)
+!26 = !DILocation(line: 34, column: 47, scope: !7)
+!27 = !DILocation(line: 34, column: 68, scope: !7)
+!28 = !DILocation(line: 35, column: 31, scope: !7)
+!29 = !DILocation(line: 35, column: 36, scope: !7)
+!30 = !DILocation(line: 37, column: 18, scope: !7)
+!31 = !DILocation(line: 39, column: 18, scope: !7)
+!32 = !DILocation(line: 41, column: 18, scope: !7)
+!33 = !DILocation(line: 43, column: 19, scope: !7)
+!34 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !38)
+!35 = distinct !DILexicalBlockFile(scope: !37, file: !36, discriminator: 0)
+!36 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!37 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
+!38 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !39)
+!39 = !DILocation(line: 48, column: 59, scope: !35)
+!40 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !41)
+!41 = !DILocation(line: 48, column: 59, scope: !37)
+!42 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !45)
+!43 = distinct !DILexicalBlockFile(scope: !7, file: !44, discriminator: 0)
+!44 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!45 = !DILocation(line: 48, column: 45, scope: !43)
+!46 = !DILocation(line: 51, column: 20, scope: !7)
+!47 = !DILocation(line: 52, column: 20, scope: !7)
+!48 = !DILocation(line: 53, column: 20, scope: !7)
+!49 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !50)
+!50 = !DILocation(line: 56, column: 59, scope: !37)
+!51 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !52)
+!52 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !53)
+!53 = !DILocation(line: 56, column: 59, scope: !35)
+!54 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !55)
+!55 = !DILocation(line: 56, column: 45, scope: !43)
+!56 = !DILocation(line: 58, column: 20, scope: !7)
+!57 = !DILocation(line: 60, column: 20, scope: !7)
+!58 = !DILocation(line: 61, column: 26, scope: !7)
+!59 = !DILocation(line: 63, column: 20, scope: !7)
+!60 = !DILocation(line: 64, column: 20, scope: !7)
+!61 = !DILocation(line: 65, column: 25, scope: !7)
+!62 = !DILocation(line: 65, column: 48, scope: !7)
+!63 = !DILocation(line: 66, column: 4, scope: !7)
+!64 = !DILocation(line: 67, column: 28, scope: !7)
+!65 = !DILocation(line: 67, column: 40, scope: !7)
+!66 = !DILocation(line: 68, column: 25, scope: !7)
+!67 = !DILocation(line: 68, column: 48, scope: !7)
+!68 = !DILocation(line: 69, column: 25, scope: !7)
+!69 = !DILocation(line: 69, column: 37, scope: !7)
+!70 = !DILocation(line: 69, column: 4, scope: !7)
diff --git a/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.llir b/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..a3b189e4884e355a3b9f098bc160632ac616a9a6
--- /dev/null
+++ b/.triton/dump/be2ceddb05ebae326eeb754fc508b131/triton_.llir
@@ -0,0 +1,477 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+@assertFunc_1 = internal constant [25 x i8] c"_call_with_frames_removed"
+@assertFile_1 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
+@assertMessage_1 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp13 < 50257"
+@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
+@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
+@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp3 < 50257"
+@global_smem = external addrspace(3) global [0 x i8]
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
+
+define void @triton__0d1d2d3d4d5de6de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, i32 %5, i32 %6) local_unnamed_addr !dbg !7 {
+  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %9 = and i32 %8, 31, !dbg !10
+  %10 = lshr i32 %8, 5, !dbg !10
+  %11 = and i32 %10, 1, !dbg !10
+  %urem = shl i32 %8, 2, !dbg !10
+  %12 = and i32 %urem, 252, !dbg !10
+  %13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
+  %14 = sext i32 %13 to i64, !dbg !12
+  %15 = getelementptr i64, ptr addrspace(1) %0, i64 %14, !dbg !12
+  %16 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %15, i1 true) #6, !dbg !13
+  %17 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %15, i1 true) #6, !dbg !13
+  %18 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %15, i1 true) #6, !dbg !13
+  %19 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %15, i1 true) #6, !dbg !13
+  %20 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %15, i1 true) #6, !dbg !13
+  %21 = srem i32 %13, 512, !dbg !14
+  %22 = shl nsw i32 %21, 8, !dbg !15
+  %23 = or i32 %22, %12, !dbg !16
+  %24 = sext i32 %23 to i64, !dbg !17
+  %25 = getelementptr float, ptr addrspace(1) %2, i64 %24, !dbg !17
+  %26 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !18
+  %27 = extractvalue { i32, i32, i32, i32 } %26, 0, !dbg !18
+  %28 = extractvalue { i32, i32, i32, i32 } %26, 1, !dbg !18
+  %29 = extractvalue { i32, i32, i32, i32 } %26, 2, !dbg !18
+  %30 = extractvalue { i32, i32, i32, i32 } %26, 3, !dbg !18
+  %31 = bitcast i32 %27 to float, !dbg !18
+  %32 = bitcast i32 %28 to float, !dbg !18
+  %33 = bitcast i32 %29 to float, !dbg !18
+  %34 = bitcast i32 %30 to float, !dbg !18
+  %35 = add i64 %20, 50257, !dbg !19
+  %36 = icmp slt i64 %16, 0, !dbg !20
+  %37 = icmp slt i64 %20, 0, !dbg !20
+  %38 = select i1 %37, i64 %35, i64 %20, !dbg !21
+  %39 = icmp ugt i64 %38, 50256, !dbg !22
+  br i1 %39, label %40, label %41, !dbg !23
+
+40:                                               ; preds = %7
+  tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !23
+  br label %41, !dbg !23
+
+41:                                               ; preds = %40, %7
+  %42 = shl i64 %16, 8, !dbg !24
+  %43 = add i64 %42, 12865792, !dbg !24
+  %44 = select i1 %36, i64 %43, i64 %42, !dbg !24
+  %45 = zext nneg i32 %12 to i64
+  %46 = or i64 %44, %45, !dbg !25
+  %47 = getelementptr float, ptr addrspace(1) %1, i64 %46, !dbg !26
+  %48 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %47, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !27
+  %49 = extractvalue { i32, i32, i32, i32 } %48, 0, !dbg !27
+  %50 = extractvalue { i32, i32, i32, i32 } %48, 1, !dbg !27
+  %51 = extractvalue { i32, i32, i32, i32 } %48, 2, !dbg !27
+  %52 = extractvalue { i32, i32, i32, i32 } %48, 3, !dbg !27
+  %53 = bitcast i32 %49 to float, !dbg !27
+  %54 = bitcast i32 %50 to float, !dbg !27
+  %55 = bitcast i32 %51 to float, !dbg !27
+  %56 = bitcast i32 %52 to float, !dbg !27
+  %57 = fadd float %31, %53, !dbg !28
+  %58 = fadd float %32, %54, !dbg !28
+  %59 = fadd float %33, %55, !dbg !28
+  %60 = fadd float %34, %56, !dbg !28
+  %61 = fadd float %57, 0.000000e+00, !dbg !29
+  %62 = fadd float %58, 0.000000e+00, !dbg !29
+  %63 = fadd float %59, 0.000000e+00, !dbg !29
+  %64 = fadd float %60, 0.000000e+00, !dbg !29
+  %65 = fsub float %57, %61, !dbg !33
+  %66 = fsub float %58, %62, !dbg !33
+  %67 = fsub float %59, %63, !dbg !33
+  %68 = fsub float %60, %64, !dbg !33
+  %69 = fmul float %57, %65, !dbg !34
+  %70 = fmul float %58, %66, !dbg !34
+  %71 = fmul float %59, %67, !dbg !34
+  %72 = fmul float %60, %68, !dbg !34
+  %73 = fadd float %69, 0.000000e+00, !dbg !35
+  %74 = fadd float %70, 0.000000e+00, !dbg !35
+  %75 = fadd float %71, 0.000000e+00, !dbg !35
+  %76 = fadd float %72, 0.000000e+00, !dbg !35
+  %77 = fsub float %62, %61, !dbg !36
+  %78 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 2.000000e+00) #6, !dbg !40
+  %79 = fmul float %78, %77, !dbg !41
+  %80 = fadd float %61, %79, !dbg !42
+  %81 = fadd float %73, %74, !dbg !43
+  %82 = fmul float %77, %77, !dbg !44
+  %83 = fmul float %78, %82, !dbg !45
+  %84 = fadd float %83, %81, !dbg !46
+  %85 = fsub float %63, %80, !dbg !36
+  %86 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 3.000000e+00) #6, !dbg !40
+  %87 = fmul float %86, %85, !dbg !41
+  %88 = fadd float %80, %87, !dbg !42
+  %89 = fadd float %75, %84, !dbg !43
+  %90 = fmul float %85, %85, !dbg !44
+  %91 = fmul float %90, 2.000000e+00, !dbg !47
+  %92 = fmul float %86, %91, !dbg !45
+  %93 = fadd float %89, %92, !dbg !46
+  %94 = fsub float %64, %88, !dbg !36
+  %95 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float 1.000000e+00, float 4.000000e+00) #6, !dbg !40
+  %96 = fmul float %95, %94, !dbg !41
+  %97 = fadd float %88, %96, !dbg !42
+  %98 = fadd float %76, %93, !dbg !43
+  %99 = fmul float %94, %94, !dbg !44
+  %100 = fmul float %99, 3.000000e+00, !dbg !47
+  %101 = fmul float %95, %100, !dbg !45
+  %102 = fadd float %98, %101, !dbg !46
+  %103 = bitcast float %97 to i32, !dbg !48
+  %104 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %103, i32 16, i32 31), !dbg !48
+  %105 = bitcast i32 %104 to float, !dbg !48
+  %106 = bitcast float %102 to i32, !dbg !48
+  %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 16, i32 31), !dbg !48
+  %108 = bitcast i32 %107 to float, !dbg !48
+  %109 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 1082130432, i32 16, i32 31), !dbg !48
+  %110 = bitcast i32 %109 to float, !dbg !48
+  %111 = fsub float %105, %97, !dbg !36
+  %112 = fadd float %110, 4.000000e+00, !dbg !50
+  %113 = fcmp oeq float %112, 0.000000e+00, !dbg !51
+  %114 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %110, float %112) #6, !dbg !40
+  %115 = select i1 %113, float 0.000000e+00, float %114, !dbg !52
+  %116 = fmul float %115, %111, !dbg !41
+  %117 = fadd float %97, %116, !dbg !42
+  %118 = fadd float %102, %108, !dbg !43
+  %119 = fmul float %111, %111, !dbg !44
+  %120 = fmul float %119, 4.000000e+00, !dbg !47
+  %121 = fmul float %115, %120, !dbg !45
+  %122 = fadd float %118, %121, !dbg !46
+  %123 = bitcast float %117 to i32, !dbg !48
+  %124 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %123, i32 8, i32 31), !dbg !48
+  %125 = bitcast i32 %124 to float, !dbg !48
+  %126 = bitcast float %122 to i32, !dbg !48
+  %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 8, i32 31), !dbg !48
+  %128 = bitcast i32 %127 to float, !dbg !48
+  %129 = bitcast float %112 to i32, !dbg !48
+  %130 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %129, i32 8, i32 31), !dbg !48
+  %131 = bitcast i32 %130 to float, !dbg !48
+  %132 = fsub float %125, %117, !dbg !36
+  %133 = fadd float %112, %131, !dbg !50
+  %134 = fcmp oeq float %133, 0.000000e+00, !dbg !51
+  %135 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %131, float %133) #6, !dbg !40
+  %136 = select i1 %134, float 0.000000e+00, float %135, !dbg !52
+  %137 = fmul float %136, %132, !dbg !41
+  %138 = fadd float %117, %137, !dbg !42
+  %139 = fadd float %122, %128, !dbg !43
+  %140 = fmul float %132, %132, !dbg !44
+  %141 = fmul float %112, %140, !dbg !47
+  %142 = fmul float %136, %141, !dbg !45
+  %143 = fadd float %139, %142, !dbg !46
+  %144 = bitcast float %138 to i32, !dbg !48
+  %145 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %144, i32 4, i32 31), !dbg !48
+  %146 = bitcast i32 %145 to float, !dbg !48
+  %147 = bitcast float %143 to i32, !dbg !48
+  %148 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %147, i32 4, i32 31), !dbg !48
+  %149 = bitcast i32 %148 to float, !dbg !48
+  %150 = bitcast float %133 to i32, !dbg !48
+  %151 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %150, i32 4, i32 31), !dbg !48
+  %152 = bitcast i32 %151 to float, !dbg !48
+  %153 = fsub float %146, %138, !dbg !36
+  %154 = fadd float %133, %152, !dbg !50
+  %155 = fcmp oeq float %154, 0.000000e+00, !dbg !51
+  %156 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %152, float %154) #6, !dbg !40
+  %157 = select i1 %155, float 0.000000e+00, float %156, !dbg !52
+  %158 = fmul float %157, %153, !dbg !41
+  %159 = fadd float %138, %158, !dbg !42
+  %160 = fadd float %143, %149, !dbg !43
+  %161 = fmul float %153, %153, !dbg !44
+  %162 = fmul float %133, %161, !dbg !47
+  %163 = fmul float %157, %162, !dbg !45
+  %164 = fadd float %160, %163, !dbg !46
+  %165 = bitcast float %159 to i32, !dbg !48
+  %166 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %165, i32 2, i32 31), !dbg !48
+  %167 = bitcast i32 %166 to float, !dbg !48
+  %168 = bitcast float %164 to i32, !dbg !48
+  %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 2, i32 31), !dbg !48
+  %170 = bitcast i32 %169 to float, !dbg !48
+  %171 = bitcast float %154 to i32, !dbg !48
+  %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 2, i32 31), !dbg !48
+  %173 = bitcast i32 %172 to float, !dbg !48
+  %174 = fsub float %167, %159, !dbg !36
+  %175 = fadd float %154, %173, !dbg !50
+  %176 = fcmp oeq float %175, 0.000000e+00, !dbg !51
+  %177 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %173, float %175) #6, !dbg !40
+  %178 = select i1 %176, float 0.000000e+00, float %177, !dbg !52
+  %179 = fmul float %178, %174, !dbg !41
+  %180 = fadd float %159, %179, !dbg !42
+  %181 = fadd float %164, %170, !dbg !43
+  %182 = fmul float %174, %174, !dbg !44
+  %183 = fmul float %154, %182, !dbg !47
+  %184 = fmul float %178, %183, !dbg !45
+  %185 = fadd float %181, %184, !dbg !46
+  %186 = bitcast float %180 to i32, !dbg !48
+  %187 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %186, i32 1, i32 31), !dbg !48
+  %188 = bitcast i32 %187 to float, !dbg !48
+  %189 = bitcast float %185 to i32, !dbg !48
+  %190 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %189, i32 1, i32 31), !dbg !48
+  %191 = bitcast i32 %190 to float, !dbg !48
+  %192 = bitcast float %175 to i32, !dbg !48
+  %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 1, i32 31), !dbg !48
+  %194 = bitcast i32 %193 to float, !dbg !48
+  %195 = fsub float %188, %180, !dbg !36
+  %196 = fadd float %175, %194, !dbg !50
+  %197 = fcmp oeq float %196, 0.000000e+00, !dbg !51
+  %198 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %194, float %196) #6, !dbg !40
+  %199 = select i1 %197, float 0.000000e+00, float %198, !dbg !52
+  %200 = fmul float %195, %199, !dbg !41
+  %201 = fadd float %180, %200, !dbg !42
+  %202 = fadd float %185, %191, !dbg !43
+  %203 = fmul float %195, %195, !dbg !44
+  %204 = fmul float %175, %203, !dbg !47
+  %205 = fmul float %199, %204, !dbg !45
+  %206 = fadd float %202, %205, !dbg !46
+  %207 = icmp eq i32 %9, 0, !dbg !48
+  %208 = zext nneg i32 %11 to i64, !dbg !48
+  %209 = getelementptr float, ptr addrspace(3) @global_smem, i64 %208, !dbg !48
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %209, float %201, i1 %207) #6, !dbg !48
+  %210 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), i64 %208, !dbg !48
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %210, float %206, i1 %207) #6, !dbg !48
+  %211 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %208, !dbg !48
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %211, float %196, i1 %207) #6, !dbg !48
+  tail call void @llvm.nvvm.barrier0(), !dbg !48
+  %212 = icmp slt i32 %8, 2, !dbg !48
+  %213 = sext i32 %8 to i64, !dbg !48
+  %214 = getelementptr float, ptr addrspace(3) @global_smem, i64 %213, !dbg !48
+  %215 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %214, i1 %212) #6, !dbg !48
+  %216 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), i64 %213, !dbg !48
+  %217 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %216, i1 %212) #6, !dbg !48
+  %218 = getelementptr float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 16), i64 %213, !dbg !48
+  %219 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %218, i1 %212) #6, !dbg !48
+  %220 = bitcast float %215 to i32, !dbg !48
+  %221 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 1, i32 31), !dbg !48
+  %222 = bitcast i32 %221 to float, !dbg !48
+  %223 = bitcast float %217 to i32, !dbg !48
+  %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 1, i32 31), !dbg !48
+  %225 = bitcast i32 %224 to float, !dbg !48
+  %226 = bitcast float %219 to i32, !dbg !48
+  %227 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %226, i32 1, i32 31), !dbg !48
+  %228 = bitcast i32 %227 to float, !dbg !48
+  %229 = fsub float %222, %215, !dbg !36
+  %230 = fadd float %219, %228, !dbg !50
+  %231 = fcmp oeq float %230, 0.000000e+00, !dbg !51
+  %232 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %228, float %230) #6, !dbg !40
+  %233 = select i1 %231, float 0.000000e+00, float %232, !dbg !52
+  %234 = fmul float %229, %233, !dbg !41
+  %235 = fadd float %215, %234, !dbg !42
+  %236 = fadd float %217, %225, !dbg !43
+  %237 = fmul float %229, %229, !dbg !44
+  %238 = fmul float %219, %237, !dbg !47
+  %239 = fmul float %238, %233, !dbg !45
+  %240 = fadd float %236, %239, !dbg !46
+  %241 = and i32 %8, 1, !dbg !48
+  %242 = icmp eq i32 %241, 0, !dbg !48
+  %243 = and i1 %212, %242, !dbg !48
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %214, float %235, i1 %243) #6, !dbg !48
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %216, float %240, i1 %243) #6, !dbg !48
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %218, float %230, i1 %243) #6, !dbg !48
+  tail call void @llvm.nvvm.barrier0(), !dbg !48
+  %244 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !48
+  %245 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 8), align 4, !dbg !48
+  %246 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %25, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !53
+  %247 = getelementptr float, ptr addrspace(1) %3, i64 %45, !dbg !54
+  %248 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %247, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !55
+  br i1 %39, label %249, label %250, !dbg !56
+
+249:                                              ; preds = %41
+  tail call void @__assertfail(ptr nonnull @assertMessage_1, ptr nonnull @assertFile_1, i32 883, ptr nonnull @assertFunc_1, i64 1), !dbg !56
+  br label %250, !dbg !56
+
+250:                                              ; preds = %249, %41
+  %251 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %47, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !57
+  %252 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %245, float 2.560000e+02) #6, !dbg !58
+  %253 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %245, float 2.560000e+02) #6, !dbg !58
+  %254 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %245, float 2.560000e+02) #6, !dbg !58
+  %255 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %245, float 2.560000e+02) #6, !dbg !58
+  %256 = fadd float %252, 0x3EE4F8B580000000, !dbg !59
+  %257 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !60
+  %.not.i = icmp eq i32 %257, 0, !dbg !60
+  br i1 %.not.i, label %260, label %258, !dbg !60
+
+258:                                              ; preds = %250
+  %259 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %256), !dbg !60
+  br label %__nv_rsqrtf.exit, !dbg !60
+
+260:                                              ; preds = %250
+  %261 = tail call float @llvm.nvvm.rsqrt.approx.f(float %256), !dbg !60
+  br label %__nv_rsqrtf.exit, !dbg !60
+
+__nv_rsqrtf.exit:                                 ; preds = %258, %260
+  %.0.i = phi float [ %259, %258 ], [ %261, %260 ], !dbg !60
+  %262 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !60
+  %263 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !60
+  %264 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !60
+  %265 = extractvalue { i32, i32, i32, i32 } %251, 3, !dbg !57
+  %266 = bitcast i32 %265 to float, !dbg !57
+  %267 = extractvalue { i32, i32, i32, i32 } %246, 3, !dbg !53
+  %268 = bitcast i32 %267 to float, !dbg !53
+  %269 = fadd float %268, %266, !dbg !61
+  %270 = fsub float %269, %244, !dbg !62
+  %271 = extractvalue { i32, i32, i32, i32 } %251, 2, !dbg !57
+  %272 = bitcast i32 %271 to float, !dbg !57
+  %273 = extractvalue { i32, i32, i32, i32 } %246, 2, !dbg !53
+  %274 = bitcast i32 %273 to float, !dbg !53
+  %275 = fadd float %274, %272, !dbg !61
+  %276 = fsub float %275, %244, !dbg !62
+  %277 = extractvalue { i32, i32, i32, i32 } %251, 1, !dbg !57
+  %278 = bitcast i32 %277 to float, !dbg !57
+  %279 = extractvalue { i32, i32, i32, i32 } %246, 1, !dbg !53
+  %280 = bitcast i32 %279 to float, !dbg !53
+  %281 = fadd float %280, %278, !dbg !61
+  %282 = fsub float %281, %244, !dbg !62
+  %283 = extractvalue { i32, i32, i32, i32 } %251, 0, !dbg !57
+  %284 = bitcast i32 %283 to float, !dbg !57
+  %285 = extractvalue { i32, i32, i32, i32 } %246, 0, !dbg !53
+  %286 = bitcast i32 %285 to float, !dbg !53
+  %287 = fadd float %286, %284, !dbg !61
+  %288 = fsub float %287, %244, !dbg !62
+  %289 = extractvalue { i32, i32, i32, i32 } %248, 0, !dbg !55
+  %290 = bitcast i32 %289 to float, !dbg !55
+  %291 = extractvalue { i32, i32, i32, i32 } %248, 1, !dbg !55
+  %292 = bitcast i32 %291 to float, !dbg !55
+  %293 = extractvalue { i32, i32, i32, i32 } %248, 2, !dbg !55
+  %294 = bitcast i32 %293 to float, !dbg !55
+  %295 = extractvalue { i32, i32, i32, i32 } %248, 3, !dbg !55
+  %296 = bitcast i32 %295 to float, !dbg !55
+  %297 = fmul float %288, %.0.i, !dbg !63
+  %298 = fmul float %282, %.0.i, !dbg !63
+  %299 = fmul float %276, %.0.i, !dbg !63
+  %300 = fmul float %270, %.0.i, !dbg !63
+  %301 = fmul float %297, %290, !dbg !64
+  %302 = fmul float %298, %292, !dbg !64
+  %303 = fmul float %299, %294, !dbg !64
+  %304 = fmul float %300, %296, !dbg !64
+  %305 = shl i32 %13, 8, !dbg !65
+  %306 = or i32 %305, %12, !dbg !66
+  %307 = sext i32 %306 to i64, !dbg !67
+  %308 = getelementptr i16, ptr addrspace(1) %4, i64 %307, !dbg !67
+  %309 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %301) #6, !dbg !68
+  %310 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %302) #6, !dbg !68
+  %311 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %303) #6, !dbg !68
+  %312 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %304) #6, !dbg !68
+  %313 = insertelement <2 x i16> undef, i16 %309, i64 0, !dbg !68
+  %314 = insertelement <2 x i16> %313, i16 %310, i64 1, !dbg !68
+  %315 = bitcast <2 x i16> %314 to i32, !dbg !68
+  %316 = insertelement <2 x i16> undef, i16 %311, i64 0, !dbg !68
+  %317 = insertelement <2 x i16> %316, i16 %312, i64 1, !dbg !68
+  %318 = bitcast <2 x i16> %317 to i32, !dbg !68
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %315, i32 %318, ptr addrspace(1) %308, i1 true) #6, !dbg !68
+  ret void, !dbg !69
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+; Function Attrs: alwaysinline nounwind
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
+  %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
+  %.not = icmp eq i32 %1, 0
+  br i1 %.not, label %4, label %2
+
+2:                                                ; preds = %0
+  %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
+  br label %6
+
+4:                                                ; preds = %0
+  %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
+  br label %6
+
+6:                                                ; preds = %4, %2
+  %.0 = phi float [ %3, %2 ], [ %5, %4 ]
+  ret float %.0
+}
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "cgx5lxpuexpindj4dsmjz5x42uhyy7iskevq7ovzpwagb3t5powj.py", directory: "/tmp/torchinductor_root/gx")
+!4 = !{ptr @triton__0d1d2d3d4d5de6de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2d3d4d5de6de, !"maxntidx", i32 64}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5de6de", linkageName: "triton__0d1d2d3d4d5de6de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 24, column: 33, scope: !7)
+!11 = !DILocation(line: 21, column: 28, scope: !7)
+!12 = !DILocation(line: 26, column: 30, scope: !7)
+!13 = !DILocation(line: 26, column: 35, scope: !7)
+!14 = !DILocation(line: 27, column: 18, scope: !7)
+!15 = !DILocation(line: 35, column: 44, scope: !7)
+!16 = !DILocation(line: 35, column: 40, scope: !7)
+!17 = !DILocation(line: 35, column: 34, scope: !7)
+!18 = !DILocation(line: 35, column: 50, scope: !7)
+!19 = !DILocation(line: 36, column: 22, scope: !7)
+!20 = !DILocation(line: 37, column: 22, scope: !7)
+!21 = !DILocation(line: 38, column: 36, scope: !7)
+!22 = !DILocation(line: 39, column: 40, scope: !7)
+!23 = !DILocation(line: 39, column: 55, scope: !7)
+!24 = !DILocation(line: 40, column: 44, scope: !7)
+!25 = !DILocation(line: 40, column: 40, scope: !7)
+!26 = !DILocation(line: 40, column: 34, scope: !7)
+!27 = !DILocation(line: 40, column: 52, scope: !7)
+!28 = !DILocation(line: 41, column: 22, scope: !7)
+!29 = !DILocation(line: 98, column: 22, scope: !30, inlinedAt: !32)
+!30 = distinct !DILexicalBlockFile(scope: !7, file: !31, discriminator: 0)
+!31 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!32 = !DILocation(line: 44, column: 38, scope: !30)
+!33 = !DILocation(line: 101, column: 30, scope: !30, inlinedAt: !32)
+!34 = !DILocation(line: 101, column: 22, scope: !30, inlinedAt: !32)
+!35 = !DILocation(line: 101, column: 13, scope: !30, inlinedAt: !32)
+!36 = !DILocation(line: 108, column: 21, scope: !37, inlinedAt: !38)
+!37 = distinct !DILexicalBlockFile(scope: !30, file: !31, discriminator: 0)
+!38 = !DILocation(line: 120, column: 46, scope: !37, inlinedAt: !39)
+!39 = !DILocation(line: 50, column: 41, scope: !37)
+!40 = !DILocation(line: 110, column: 60, scope: !37, inlinedAt: !38)
+!41 = !DILocation(line: 112, column: 25, scope: !37, inlinedAt: !38)
+!42 = !DILocation(line: 112, column: 17, scope: !37, inlinedAt: !38)
+!43 = !DILocation(line: 113, column: 15, scope: !37, inlinedAt: !38)
+!44 = !DILocation(line: 113, column: 30, scope: !37, inlinedAt: !38)
+!45 = !DILocation(line: 113, column: 49, scope: !37, inlinedAt: !38)
+!46 = !DILocation(line: 113, column: 22, scope: !37, inlinedAt: !38)
+!47 = !DILocation(line: 113, column: 38, scope: !37, inlinedAt: !38)
+!48 = !DILocation(line: 120, column: 46, scope: !30, inlinedAt: !49)
+!49 = !DILocation(line: 50, column: 41, scope: !30)
+!50 = !DILocation(line: 109, column: 28, scope: !37, inlinedAt: !38)
+!51 = !DILocation(line: 110, column: 39, scope: !37, inlinedAt: !38)
+!52 = !DILocation(line: 110, column: 49, scope: !37, inlinedAt: !38)
+!53 = !DILocation(line: 59, column: 51, scope: !7)
+!54 = !DILocation(line: 60, column: 35, scope: !7)
+!55 = !DILocation(line: 60, column: 40, scope: !7)
+!56 = !DILocation(line: 64, column: 57, scope: !7)
+!57 = !DILocation(line: 65, column: 54, scope: !7)
+!58 = !DILocation(line: 69, column: 23, scope: !7)
+!59 = !DILocation(line: 71, column: 24, scope: !7)
+!60 = !DILocation(line: 72, column: 30, scope: !7)
+!61 = !DILocation(line: 66, column: 24, scope: !7)
+!62 = !DILocation(line: 67, column: 24, scope: !7)
+!63 = !DILocation(line: 73, column: 24, scope: !7)
+!64 = !DILocation(line: 74, column: 24, scope: !7)
+!65 = !DILocation(line: 76, column: 39, scope: !7)
+!66 = !DILocation(line: 76, column: 35, scope: !7)
+!67 = !DILocation(line: 76, column: 29, scope: !7)
+!68 = !DILocation(line: 76, column: 52, scope: !7)
+!69 = !DILocation(line: 55, column: 4, scope: !7)
diff --git a/.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.llir b/.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..ff4f409cd3506d8abb3c136d6d93c59e43c3d860
--- /dev/null
+++ b/.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.llir
@@ -0,0 +1,380 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+@global_smem = external addrspace(3) global [0 x i8]
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+define void @triton__0d1d2d3d4d5d6d7d8d9d10de11de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, ptr addrspace(1) %7, ptr addrspace(1) %8, ptr addrspace(1) %9, i32 %10, i32 %11) local_unnamed_addr !dbg !7 {
+  %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %14 = and i32 %13, 31, !dbg !10
+  %15 = lshr i32 %13, 5, !dbg !10
+  %16 = and i32 %15, 1, !dbg !10
+  %urem = and i32 %13, 63, !dbg !10
+  %17 = shl nuw nsw i32 %urem, 2, !dbg !10
+  %18 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
+  %19 = shl i32 %18, 8, !dbg !12
+  %20 = or i32 %19, %17, !dbg !13
+  %21 = sext i32 %20 to i64, !dbg !14
+  %22 = getelementptr float, ptr addrspace(1) %1, i64 %21, !dbg !14
+  %23 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %22, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
+  %24 = extractvalue { i32, i32, i32, i32 } %23, 0, !dbg !15
+  %25 = extractvalue { i32, i32, i32, i32 } %23, 1, !dbg !15
+  %26 = extractvalue { i32, i32, i32, i32 } %23, 2, !dbg !15
+  %27 = extractvalue { i32, i32, i32, i32 } %23, 3, !dbg !15
+  %28 = bitcast i32 %24 to float, !dbg !15
+  %29 = bitcast i32 %25 to float, !dbg !15
+  %30 = bitcast i32 %26 to float, !dbg !15
+  %31 = bitcast i32 %27 to float, !dbg !15
+  %32 = getelementptr i16, ptr addrspace(1) %2, i64 %21, !dbg !16
+  %33 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %32, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
+  %34 = extractvalue { i32, i32 } %33, 0, !dbg !17
+  %35 = extractvalue { i32, i32 } %33, 1, !dbg !17
+  %36 = trunc i32 %34 to i16, !dbg !17
+  %extelt.offset = lshr i32 %34, 16, !dbg !17
+  %37 = trunc i32 %extelt.offset to i16, !dbg !17
+  %38 = trunc i32 %35 to i16, !dbg !17
+  %extelt.offset1 = lshr i32 %35, 16, !dbg !17
+  %39 = trunc i32 %extelt.offset1 to i16, !dbg !17
+  %40 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %36) #6, !dbg !18
+  %41 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %37) #6, !dbg !18
+  %42 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %38) #6, !dbg !18
+  %43 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %39) #6, !dbg !18
+  %44 = getelementptr i16, ptr addrspace(1) %3, i64 %21, !dbg !19
+  %45 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %44, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
+  %46 = extractvalue { i32, i32 } %45, 0, !dbg !20
+  %47 = extractvalue { i32, i32 } %45, 1, !dbg !20
+  %48 = trunc i32 %46 to i16, !dbg !20
+  %extelt.offset2 = lshr i32 %46, 16, !dbg !20
+  %49 = trunc i32 %extelt.offset2 to i16, !dbg !20
+  %50 = trunc i32 %47 to i16, !dbg !20
+  %extelt.offset3 = lshr i32 %47, 16, !dbg !20
+  %51 = trunc i32 %extelt.offset3 to i16, !dbg !20
+  %52 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %48) #6, !dbg !21
+  %53 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #6, !dbg !21
+  %54 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %50) #6, !dbg !21
+  %55 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %51) #6, !dbg !21
+  %56 = getelementptr i16, ptr addrspace(1) %4, i64 %21, !dbg !22
+  %57 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %56, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
+  %58 = extractvalue { i32, i32 } %57, 0, !dbg !23
+  %59 = extractvalue { i32, i32 } %57, 1, !dbg !23
+  %60 = trunc i32 %58 to i16, !dbg !23
+  %extelt.offset4 = lshr i32 %58, 16, !dbg !23
+  %61 = trunc i32 %extelt.offset4 to i16, !dbg !23
+  %62 = trunc i32 %59 to i16, !dbg !23
+  %extelt.offset5 = lshr i32 %59, 16, !dbg !23
+  %63 = trunc i32 %extelt.offset5 to i16, !dbg !23
+  %64 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %60) #6, !dbg !24
+  %65 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %61) #6, !dbg !24
+  %66 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %62) #6, !dbg !24
+  %67 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %63) #6, !dbg !24
+  %68 = getelementptr i16, ptr addrspace(1) %5, i64 %21, !dbg !25
+  %69 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %68, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !26
+  %70 = extractvalue { i32, i32 } %69, 0, !dbg !26
+  %71 = extractvalue { i32, i32 } %69, 1, !dbg !26
+  %72 = trunc i32 %70 to i16, !dbg !26
+  %extelt.offset6 = lshr i32 %70, 16, !dbg !26
+  %73 = trunc i32 %extelt.offset6 to i16, !dbg !26
+  %74 = trunc i32 %71 to i16, !dbg !26
+  %extelt.offset7 = lshr i32 %71, 16, !dbg !26
+  %75 = trunc i32 %extelt.offset7 to i16, !dbg !26
+  %76 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %72) #6, !dbg !27
+  %77 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %73) #6, !dbg !27
+  %78 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %74) #6, !dbg !27
+  %79 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %75) #6, !dbg !27
+  %80 = zext nneg i32 %17 to i64, !dbg !28
+  %81 = getelementptr float, ptr addrspace(1) %6, i64 %80, !dbg !28
+  %82 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %81, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !29
+  %83 = fadd float %40, %28, !dbg !30
+  %84 = fadd float %41, %29, !dbg !30
+  %85 = fadd float %42, %30, !dbg !30
+  %86 = fadd float %83, %52, !dbg !31
+  %87 = fadd float %84, %53, !dbg !31
+  %88 = fadd float %85, %54, !dbg !31
+  %89 = fadd float %87, %65, !dbg !32
+  %90 = fadd float %88, %66, !dbg !32
+  %91 = fadd float %89, %77, !dbg !33
+  %92 = fadd float %90, %78, !dbg !33
+  %93 = insertelement <2 x float> poison, float %86, i64 0, !dbg !32
+  %94 = insertelement <2 x float> %93, float %43, i64 1, !dbg !32
+  %95 = insertelement <2 x float> poison, float %64, i64 0, !dbg !32
+  %96 = insertelement <2 x float> %95, float %31, i64 1, !dbg !32
+  %97 = fadd <2 x float> %94, %96, !dbg !32
+  %98 = insertelement <2 x float> poison, float %76, i64 0, !dbg !33
+  %99 = insertelement <2 x float> %98, float %55, i64 1, !dbg !33
+  %100 = fadd <2 x float> %97, %99, !dbg !33
+  %101 = insertelement <2 x float> poison, float %91, i64 0, !dbg !34
+  %102 = insertelement <2 x float> %101, float %67, i64 1, !dbg !34
+  %103 = fadd <2 x float> %100, %102, !dbg !34
+  %104 = insertelement <2 x float> poison, float %92, i64 0, !dbg !34
+  %105 = insertelement <2 x float> %104, float %79, i64 1, !dbg !34
+  %106 = fadd <2 x float> %103, %105, !dbg !34
+  %107 = extractelement <2 x float> %106, i64 0, !dbg !34
+  %108 = extractelement <2 x float> %106, i64 1, !dbg !34
+  %109 = fadd float %107, %108, !dbg !34
+  %110 = bitcast float %109 to i32, !dbg !40
+  %111 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %110, i32 16, i32 31), !dbg !40
+  %112 = bitcast i32 %111 to float, !dbg !40
+  %113 = fadd float %109, %112, !dbg !34
+  %114 = bitcast float %113 to i32, !dbg !40
+  %115 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %114, i32 8, i32 31), !dbg !40
+  %116 = bitcast i32 %115 to float, !dbg !40
+  %117 = fadd float %113, %116, !dbg !34
+  %118 = bitcast float %117 to i32, !dbg !40
+  %119 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %118, i32 4, i32 31), !dbg !40
+  %120 = bitcast i32 %119 to float, !dbg !40
+  %121 = fadd float %117, %120, !dbg !34
+  %122 = bitcast float %121 to i32, !dbg !40
+  %123 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %122, i32 2, i32 31), !dbg !40
+  %124 = bitcast i32 %123 to float, !dbg !40
+  %125 = fadd float %121, %124, !dbg !34
+  %126 = bitcast float %125 to i32, !dbg !40
+  %127 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %126, i32 1, i32 31), !dbg !40
+  %128 = bitcast i32 %127 to float, !dbg !40
+  %129 = fadd float %125, %128, !dbg !34
+  %130 = icmp eq i32 %14, 0, !dbg !40
+  %131 = zext nneg i32 %16 to i64, !dbg !40
+  %132 = getelementptr float, ptr addrspace(3) @global_smem, i64 %131, !dbg !40
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %132, float %129, i1 %130) #6, !dbg !40
+  tail call void @llvm.nvvm.barrier0(), !dbg !40
+  %133 = icmp slt i32 %13, 2, !dbg !40
+  %134 = sext i32 %13 to i64, !dbg !40
+  %135 = getelementptr float, ptr addrspace(3) @global_smem, i64 %134, !dbg !40
+  %136 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %135, i1 %133) #6, !dbg !40
+  %137 = bitcast float %136 to i32, !dbg !40
+  %138 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %137, i32 1, i32 31), !dbg !40
+  %139 = bitcast i32 %138 to float, !dbg !40
+  %140 = fadd float %136, %139, !dbg !34
+  %141 = and i32 %13, 1, !dbg !40
+  %142 = icmp eq i32 %141, 0, !dbg !40
+  %143 = and i1 %133, %142, !dbg !40
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %135, float %140, i1 %143) #6, !dbg !40
+  tail call void @llvm.nvvm.barrier0(), !dbg !40
+  %144 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !40
+  %145 = fadd float %144, 0.000000e+00, !dbg !42
+  %146 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %145, float 2.560000e+02) #6, !dbg !46
+  %147 = extractelement <2 x float> %100, i64 0, !dbg !47
+  %148 = fsub float %147, %146, !dbg !47
+  %149 = fsub float %91, %146, !dbg !47
+  %150 = fsub float %92, %146, !dbg !47
+  %151 = fsub float %108, %146, !dbg !47
+  %152 = fmul float %148, %148, !dbg !48
+  %153 = fmul float %149, %149, !dbg !48
+  %154 = fmul float %150, %150, !dbg !48
+  %155 = fmul float %151, %151, !dbg !48
+  tail call void @llvm.nvvm.barrier0(), !dbg !49
+  %156 = fadd float %152, %153, !dbg !51
+  %157 = fadd float %154, %156, !dbg !51
+  %158 = fadd float %155, %157, !dbg !51
+  %159 = bitcast float %158 to i32, !dbg !49
+  %160 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %159, i32 16, i32 31), !dbg !49
+  %161 = bitcast i32 %160 to float, !dbg !49
+  %162 = fadd float %158, %161, !dbg !51
+  %163 = bitcast float %162 to i32, !dbg !49
+  %164 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %163, i32 8, i32 31), !dbg !49
+  %165 = bitcast i32 %164 to float, !dbg !49
+  %166 = fadd float %162, %165, !dbg !51
+  %167 = bitcast float %166 to i32, !dbg !49
+  %168 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %167, i32 4, i32 31), !dbg !49
+  %169 = bitcast i32 %168 to float, !dbg !49
+  %170 = fadd float %166, %169, !dbg !51
+  %171 = bitcast float %170 to i32, !dbg !49
+  %172 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %171, i32 2, i32 31), !dbg !49
+  %173 = bitcast i32 %172 to float, !dbg !49
+  %174 = fadd float %170, %173, !dbg !51
+  %175 = bitcast float %174 to i32, !dbg !49
+  %176 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %175, i32 1, i32 31), !dbg !49
+  %177 = bitcast i32 %176 to float, !dbg !49
+  %178 = fadd float %174, %177, !dbg !51
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %132, float %178, i1 %130) #6, !dbg !49
+  tail call void @llvm.nvvm.barrier0(), !dbg !49
+  %179 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %135, i1 %133) #6, !dbg !49
+  %180 = bitcast float %179 to i32, !dbg !49
+  %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 1, i32 31), !dbg !49
+  %182 = bitcast i32 %181 to float, !dbg !49
+  %183 = fadd float %179, %182, !dbg !51
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %135, float %183, i1 %143) #6, !dbg !49
+  tail call void @llvm.nvvm.barrier0(), !dbg !49
+  %184 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !49
+  %185 = fadd float %184, 0.000000e+00, !dbg !54
+  %186 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %185, float 2.560000e+02) #6, !dbg !56
+  %187 = fadd float %186, 0x3EE4F8B580000000, !dbg !57
+  %188 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !58
+  %.not.i = icmp eq i32 %188, 0, !dbg !58
+  br i1 %.not.i, label %191, label %189, !dbg !58
+
+189:                                              ; preds = %12
+  %190 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %187), !dbg !58
+  br label %__nv_rsqrtf.exit, !dbg !58
+
+191:                                              ; preds = %12
+  %192 = tail call float @llvm.nvvm.rsqrt.approx.f(float %187), !dbg !58
+  br label %__nv_rsqrtf.exit, !dbg !58
+
+__nv_rsqrtf.exit:                                 ; preds = %189, %191
+  %.0.i = phi float [ %190, %189 ], [ %192, %191 ], !dbg !58
+  %193 = extractvalue { i32, i32, i32, i32 } %82, 3, !dbg !29
+  %194 = bitcast i32 %193 to float, !dbg !29
+  %195 = extractvalue { i32, i32, i32, i32 } %82, 2, !dbg !29
+  %196 = bitcast i32 %195 to float, !dbg !29
+  %197 = extractvalue { i32, i32, i32, i32 } %82, 1, !dbg !29
+  %198 = bitcast i32 %197 to float, !dbg !29
+  %199 = extractvalue { i32, i32, i32, i32 } %82, 0, !dbg !29
+  %200 = bitcast i32 %199 to float, !dbg !29
+  %201 = fmul float %148, %.0.i, !dbg !59
+  %202 = fmul float %149, %.0.i, !dbg !59
+  %203 = fmul float %150, %.0.i, !dbg !59
+  %204 = fmul float %151, %.0.i, !dbg !59
+  %205 = fmul float %201, %200, !dbg !60
+  %206 = fmul float %202, %198, !dbg !60
+  %207 = fmul float %203, %196, !dbg !60
+  %208 = fmul float %204, %194, !dbg !60
+  %209 = getelementptr float, ptr addrspace(1) %7, i64 %21, !dbg !61
+  %210 = bitcast float %147 to i32, !dbg !62
+  %211 = bitcast float %91 to i32, !dbg !62
+  %212 = bitcast float %92 to i32, !dbg !62
+  %213 = bitcast float %108 to i32, !dbg !62
+  tail call void asm sideeffect "@$5 st.global.v4.b32 [ $4 + 0 ], { $0, $1, $2, $3 };", "r,r,r,r,l,b"(i32 %210, i32 %211, i32 %212, i32 %213, ptr addrspace(1) %209, i1 true) #6, !dbg !62
+  tail call void @llvm.nvvm.barrier0(), !dbg !63
+  %214 = sext i32 %18 to i64, !dbg !64
+  %215 = getelementptr float, ptr addrspace(1) %0, i64 %214, !dbg !64
+  %216 = icmp eq i32 %urem, 0, !dbg !65
+  %217 = bitcast float %.0.i to i32, !dbg !65
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %217, ptr addrspace(1) %215, i1 %216) #6, !dbg !65
+  %218 = getelementptr i16, ptr addrspace(1) %9, i64 %21, !dbg !66
+  %219 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %205) #6, !dbg !67
+  %220 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %206) #6, !dbg !67
+  %221 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %207) #6, !dbg !67
+  %222 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %208) #6, !dbg !67
+  %223 = insertelement <2 x i16> undef, i16 %219, i64 0, !dbg !67
+  %224 = insertelement <2 x i16> %223, i16 %220, i64 1, !dbg !67
+  %225 = bitcast <2 x i16> %224 to i32, !dbg !67
+  %226 = insertelement <2 x i16> undef, i16 %221, i64 0, !dbg !67
+  %227 = insertelement <2 x i16> %226, i16 %222, i64 1, !dbg !67
+  %228 = bitcast <2 x i16> %227 to i32, !dbg !67
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %225, i32 %228, ptr addrspace(1) %218, i1 true) #6, !dbg !67
+  %229 = getelementptr float, ptr addrspace(1) %8, i64 %214, !dbg !68
+  %230 = bitcast float %146 to i32, !dbg !69
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %230, ptr addrspace(1) %229, i1 %216) #6, !dbg !69
+  ret void, !dbg !70
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+; Function Attrs: alwaysinline nounwind
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
+  %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
+  %.not = icmp eq i32 %1, 0
+  br i1 %.not, label %4, label %2
+
+2:                                                ; preds = %0
+  %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
+  br label %6
+
+4:                                                ; preds = %0
+  %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
+  br label %6
+
+6:                                                ; preds = %4, %2
+  %.0 = phi float [ %3, %2 ], [ %5, %4 ]
+  ret float %.0
+}
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "cilofmivtj4aqoxmz3r7fz7sc3blcxfzk3utwsuayln6lpg5jwtv.py", directory: "/tmp/torchinductor_root/il")
+!4 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10de11de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2d3d4d5d6d7d8d9d10de11de, !"maxntidx", i32 64}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7d8d9d10de11de", linkageName: "triton__0d1d2d3d4d5d6d7d8d9d10de11de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 26, column: 26, scope: !7)
+!11 = !DILocation(line: 23, column: 28, scope: !7)
+!12 = !DILocation(line: 30, column: 40, scope: !7)
+!13 = !DILocation(line: 30, column: 36, scope: !7)
+!14 = !DILocation(line: 30, column: 30, scope: !7)
+!15 = !DILocation(line: 30, column: 46, scope: !7)
+!16 = !DILocation(line: 31, column: 30, scope: !7)
+!17 = !DILocation(line: 31, column: 46, scope: !7)
+!18 = !DILocation(line: 31, column: 67, scope: !7)
+!19 = !DILocation(line: 32, column: 30, scope: !7)
+!20 = !DILocation(line: 32, column: 46, scope: !7)
+!21 = !DILocation(line: 32, column: 67, scope: !7)
+!22 = !DILocation(line: 33, column: 30, scope: !7)
+!23 = !DILocation(line: 33, column: 46, scope: !7)
+!24 = !DILocation(line: 33, column: 67, scope: !7)
+!25 = !DILocation(line: 34, column: 31, scope: !7)
+!26 = !DILocation(line: 34, column: 47, scope: !7)
+!27 = !DILocation(line: 34, column: 68, scope: !7)
+!28 = !DILocation(line: 35, column: 31, scope: !7)
+!29 = !DILocation(line: 35, column: 36, scope: !7)
+!30 = !DILocation(line: 37, column: 18, scope: !7)
+!31 = !DILocation(line: 39, column: 18, scope: !7)
+!32 = !DILocation(line: 41, column: 18, scope: !7)
+!33 = !DILocation(line: 43, column: 19, scope: !7)
+!34 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !38)
+!35 = distinct !DILexicalBlockFile(scope: !37, file: !36, discriminator: 0)
+!36 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!37 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
+!38 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !39)
+!39 = !DILocation(line: 48, column: 59, scope: !35)
+!40 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !41)
+!41 = !DILocation(line: 48, column: 59, scope: !37)
+!42 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !45)
+!43 = distinct !DILexicalBlockFile(scope: !7, file: !44, discriminator: 0)
+!44 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!45 = !DILocation(line: 48, column: 45, scope: !43)
+!46 = !DILocation(line: 51, column: 20, scope: !7)
+!47 = !DILocation(line: 52, column: 20, scope: !7)
+!48 = !DILocation(line: 53, column: 20, scope: !7)
+!49 = !DILocation(line: 243, column: 36, scope: !37, inlinedAt: !50)
+!50 = !DILocation(line: 56, column: 59, scope: !37)
+!51 = !DILocation(line: 233, column: 15, scope: !35, inlinedAt: !52)
+!52 = !DILocation(line: 243, column: 36, scope: !35, inlinedAt: !53)
+!53 = !DILocation(line: 56, column: 59, scope: !35)
+!54 = !DILocation(line: 8, column: 15, scope: !43, inlinedAt: !55)
+!55 = !DILocation(line: 56, column: 45, scope: !43)
+!56 = !DILocation(line: 58, column: 20, scope: !7)
+!57 = !DILocation(line: 60, column: 20, scope: !7)
+!58 = !DILocation(line: 61, column: 26, scope: !7)
+!59 = !DILocation(line: 63, column: 20, scope: !7)
+!60 = !DILocation(line: 64, column: 20, scope: !7)
+!61 = !DILocation(line: 66, column: 25, scope: !7)
+!62 = !DILocation(line: 66, column: 48, scope: !7)
+!63 = !DILocation(line: 67, column: 4, scope: !7)
+!64 = !DILocation(line: 68, column: 28, scope: !7)
+!65 = !DILocation(line: 68, column: 40, scope: !7)
+!66 = !DILocation(line: 69, column: 25, scope: !7)
+!67 = !DILocation(line: 69, column: 48, scope: !7)
+!68 = !DILocation(line: 70, column: 25, scope: !7)
+!69 = !DILocation(line: 70, column: 37, scope: !7)
+!70 = !DILocation(line: 70, column: 4, scope: !7)
diff --git a/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ttgir b/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..1b80454a261c05bdbafbd096b9b705bbebca392c
--- /dev/null
+++ b/.triton/dump/d7a12c0ba96f8920b8147157303ee99f/triton_.ttgir
@@ -0,0 +1,63 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
+    %cst_0 = arith.constant 9.99999974E-6 : f32
+    %cst_1 = arith.constant 2.560000e+02 : f32
+    %cst_2 = arith.constant 0.000000e+00 : f32
+    %c256_i32 = arith.constant 256 : i32
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
+    %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
+    %3 = arith.muli %0, %c256_i32 : i32
+    %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
+    %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
+    %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %11 = tt.load %10, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %12 = arith.extf %11 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %13 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %15 = tt.load %14, %2, %cst_4 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %16 = arith.extf %15 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %17 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %18 = tt.addptr %17, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %19 = tt.load %18, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    %20 = arith.addf %8, %12 : tensor<256xf32, #blocked>
+    %21 = arith.addf %20, %16 : tensor<256xf32, #blocked>
+    %22 = arith.select %2, %21, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %23 = "tt.reduce"(%22) <{axis = 0 : i32}> ({
+    ^bb0(%arg7: f32, %arg8: f32):
+      %41 = arith.addf %arg7, %arg8 : f32
+      tt.reduce.return %41 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %24 = arith.addf %23, %cst_2 : f32
+    %25 = arith.divf %24, %cst_1 : f32
+    %26 = tt.splat %25 : (f32) -> tensor<256xf32, #blocked>
+    %27 = arith.subf %21, %26 : tensor<256xf32, #blocked>
+    %28 = arith.mulf %27, %27 : tensor<256xf32, #blocked>
+    %29 = arith.select %2, %28, %cst_3 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
+    ^bb0(%arg7: f32, %arg8: f32):
+      %41 = arith.addf %arg7, %arg8 : f32
+      tt.reduce.return %41 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %31 = arith.addf %30, %cst_2 : f32
+    %32 = arith.divf %31, %cst_1 : f32
+    %33 = arith.addf %32, %cst_0 : f32
+    %34 = tt.extern_elementwise %33 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+    %35 = tt.splat %34 : (f32) -> tensor<256xf32, #blocked>
+    %36 = arith.mulf %27, %35 : tensor<256xf32, #blocked>
+    %37 = arith.mulf %36, %19 : tensor<256xf32, #blocked>
+    %38 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %39 = tt.addptr %38, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %40 = arith.truncf %37 : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
+    tt.store %39, %40, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16, #blocked>
+    tt.return
+  }
+}
diff --git a/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.llir b/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..e969370e4eb2c44ec2d36a90b9baf2c2e3926b27
--- /dev/null
+++ b/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.llir
@@ -0,0 +1,56 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %5 = shl i32 %4, 1, !dbg !8
+  %6 = and i32 %5, 510, !dbg !8
+  %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
+  %8 = shl i32 %7, 9, !dbg !10
+  %9 = or i32 %8, %6, !dbg !11
+  %10 = icmp slt i32 %9, 12865792, !dbg !12
+  %11 = sext i32 %9 to i64, !dbg !13
+  %12 = getelementptr float, ptr addrspace(1) %0, i64 %11, !dbg !13
+  %13 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %12, i1 %10) #1, !dbg !14
+  %14 = extractvalue { i32, i32 } %13, 0, !dbg !14
+  %15 = extractvalue { i32, i32 } %13, 1, !dbg !14
+  %16 = bitcast i32 %14 to float, !dbg !14
+  %17 = bitcast i32 %15 to float, !dbg !14
+  %18 = getelementptr i16, ptr addrspace(1) %1, i64 %11, !dbg !15
+  %19 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %16) #1, !dbg !16
+  %20 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %17) #1, !dbg !16
+  %21 = insertelement <2 x i16> undef, i16 %19, i64 0, !dbg !16
+  %22 = insertelement <2 x i16> %21, i16 %20, i64 1, !dbg !16
+  %23 = bitcast <2 x i16> %22 to i32, !dbg !16
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %23, ptr addrspace(1) %18, i1 %10) #1, !dbg !16
+  ret void, !dbg !17
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "c3dqs5x45k2yonlaarvhjaaf2n4okr2444cmsi5lflqvzppalavv.py", directory: "/tmp/torchinductor_root/3d")
+!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
+!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 36, scope: !5)
+!9 = !DILocation(line: 20, column: 28, scope: !5)
+!10 = !DILocation(line: 20, column: 33, scope: !5)
+!11 = !DILocation(line: 21, column: 23, scope: !5)
+!12 = !DILocation(line: 22, column: 21, scope: !5)
+!13 = !DILocation(line: 24, column: 30, scope: !5)
+!14 = !DILocation(line: 24, column: 35, scope: !5)
+!15 = !DILocation(line: 26, column: 25, scope: !5)
+!16 = !DILocation(line: 26, column: 36, scope: !5)
+!17 = !DILocation(line: 26, column: 4, scope: !5)
diff --git a/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.ptx b/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..2b1be55066211338b0e205330820de973476be4b
--- /dev/null
+++ b/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.ptx
@@ -0,0 +1,296 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2de
+
+.visible .entry triton__0d1d2de(
+	.param .u64 triton__0d1d2de_param_0,
+	.param .u64 triton__0d1d2de_param_1,
+	.param .u32 triton__0d1d2de_param_2
+)
+.maxntid 256, 1, 1
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<3>;
+	.reg .b32 	%r<13>;
+	.reg .b64 	%rd<7>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd3, [triton__0d1d2de_param_0];
+	ld.param.u64 	%rd4, [triton__0d1d2de_param_1];
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r7, %tid.x;
+	shl.b32 	%r8, %r7, 1;
+	and.b32  	%r9, %r8, 510;
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	shl.b32 	%r10, %r1, 9;
+	.loc	1 21 23
+	or.b32  	%r11, %r10, %r9;
+	.loc	1 22 21
+	setp.lt.s32 	%p1, %r11, 12865792;
+	.loc	1 24 30
+	mul.wide.s32 	%rd5, %r11, 4;
+	add.s64 	%rd1, %rd3, %rd5;
+	.loc	1 24 35
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.v2.b32 { %r4, %r5 }, [ %rd1 + 0 ];
+	.loc	1 26 25
+	mul.wide.s32 	%rd6, %r11, 2;
+	add.s64 	%rd2, %rd4, %rd6;
+	.loc	1 26 36
+	cvt.rn.bf16.f32 %rs1, %r4;
+	cvt.rn.bf16.f32 %rs2, %r5;
+	mov.b32 	%r12, {%rs1, %rs2};
+	@%p1 st.global.b32 [ %rd2 + 0 ], { %r12 };
+	.loc	1 26 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+
+}
+	.file	1 "/tmp/torchinductor_root/3d/c3dqs5x45k2yonlaarvhjaaf2n4okr2444cmsi5lflqvzppalavv.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 176
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 51
+.b8 100
+.b8 113
+.b8 115
+.b8 53
+.b8 120
+.b8 52
+.b8 53
+.b8 107
+.b8 50
+.b8 121
+.b8 111
+.b8 110
+.b8 108
+.b8 97
+.b8 97
+.b8 114
+.b8 118
+.b8 104
+.b8 106
+.b8 97
+.b8 97
+.b8 102
+.b8 50
+.b8 110
+.b8 52
+.b8 111
+.b8 107
+.b8 114
+.b8 50
+.b8 52
+.b8 52
+.b8 52
+.b8 99
+.b8 109
+.b8 115
+.b8 105
+.b8 53
+.b8 108
+.b8 102
+.b8 108
+.b8 113
+.b8 118
+.b8 122
+.b8 112
+.b8 112
+.b8 97
+.b8 108
+.b8 97
+.b8 118
+.b8 118
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 51
+.b8 100
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.ttir b/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..d7ecc07519628309aeabce4f18ef8123f313d28c
--- /dev/null
+++ b/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.ttir
@@ -0,0 +1,20 @@
+module {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<12865792> : tensor<512xi32>
+    %c512_i32 = arith.constant 512 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32>
+    %4 = arith.addi %3, %2 : tensor<512xi32>
+    %5 = arith.cmpi slt, %4, %cst : tensor<512xi32>
+    %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
+    %7 = tt.addptr %6, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
+    %8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xf32>
+    %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
+    %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
+    %11 = arith.truncf %8 : tensor<512xf32> to tensor<512xbf16>
+    tt.store %10, %11, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16>
+    tt.return
+  }
+}
diff --git a/.triton/dump/e4b4750326af484a12cc939125bd831c/triton_.cubin b/.triton/dump/e4b4750326af484a12cc939125bd831c/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..82811324984b66d78e177476bed0720af45f1a8b
Binary files /dev/null and b/.triton/dump/e4b4750326af484a12cc939125bd831c/triton_.cubin differ
diff --git a/.triton/dump/e4b4750326af484a12cc939125bd831c/triton_.llir b/.triton/dump/e4b4750326af484a12cc939125bd831c/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..aead1791fdaaf6fb7dcf67c765a5a1549549da27
--- /dev/null
+++ b/.triton/dump/e4b4750326af484a12cc939125bd831c/triton_.llir
@@ -0,0 +1,112 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+define void @triton__0d1d2d3d45e(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, i32 %4, i32 %5) local_unnamed_addr !dbg !5 {
+  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %8 = and i32 %7, 7, !dbg !8
+  %9 = zext nneg i32 %8 to i64, !dbg !9
+  %10 = getelementptr float, ptr addrspace(1) %1, i64 %9, !dbg !9
+  %11 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %10, i1 true, i32 0, i1 true) #3, !dbg !10
+  %12 = bitcast i32 %11 to float, !dbg !10
+  %13 = getelementptr i64, ptr addrspace(1) %2, i64 %9, !dbg !11
+  %14 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.b64 { $0 }, [ $1 + 0 ];\0A\09@!$3 mov.u64 $0, 0x0;", "=l,l,b,b"(ptr addrspace(1) %13, i1 true, i1 true) #3, !dbg !12
+  %15 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %11, i32 4, i32 31), !dbg !13
+  %16 = bitcast i32 %15 to float, !dbg !13
+  %17 = fadd float %12, %16, !dbg !17
+  %18 = bitcast float %17 to i32, !dbg !13
+  %19 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %18, i32 2, i32 31), !dbg !13
+  %20 = bitcast i32 %19 to float, !dbg !13
+  %21 = fadd float %17, %20, !dbg !17
+  %22 = bitcast float %21 to i32, !dbg !13
+  %23 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %22, i32 1, i32 31), !dbg !13
+  %24 = bitcast i32 %23 to float, !dbg !13
+  %25 = fadd float %21, %24, !dbg !17
+  %26 = trunc i64 %14 to i32, !dbg !21
+  %27 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %26, i32 4, i32 31), !dbg !21
+  %bc = bitcast i64 %14 to <2 x i32>, !dbg !21
+  %28 = extractelement <2 x i32> %bc, i64 1, !dbg !21
+  %29 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %28, i32 4, i32 31), !dbg !21
+  %30 = insertelement <2 x i32> undef, i32 %27, i64 0, !dbg !21
+  %31 = insertelement <2 x i32> %30, i32 %29, i64 1, !dbg !21
+  %32 = bitcast <2 x i32> %31 to i64, !dbg !21
+  %33 = add i64 %14, %32, !dbg !23
+  %34 = trunc i64 %33 to i32, !dbg !21
+  %35 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %34, i32 2, i32 31), !dbg !21
+  %bc1 = bitcast i64 %33 to <2 x i32>, !dbg !21
+  %36 = extractelement <2 x i32> %bc1, i64 1, !dbg !21
+  %37 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %36, i32 2, i32 31), !dbg !21
+  %38 = insertelement <2 x i32> undef, i32 %35, i64 0, !dbg !21
+  %39 = insertelement <2 x i32> %38, i32 %37, i64 1, !dbg !21
+  %40 = bitcast <2 x i32> %39 to i64, !dbg !21
+  %41 = add i64 %33, %40, !dbg !23
+  %42 = trunc i64 %41 to i32, !dbg !21
+  %43 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %42, i32 1, i32 31), !dbg !21
+  %bc2 = bitcast i64 %41 to <2 x i32>, !dbg !21
+  %44 = extractelement <2 x i32> %bc2, i64 1, !dbg !21
+  %45 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %44, i32 1, i32 31), !dbg !21
+  %46 = insertelement <2 x i32> undef, i32 %43, i64 0, !dbg !21
+  %47 = insertelement <2 x i32> %46, i32 %45, i64 1, !dbg !21
+  %48 = bitcast <2 x i32> %47 to i64, !dbg !21
+  %49 = add i64 %41, %48, !dbg !23
+  %50 = sitofp i64 %49 to float, !dbg !26
+  %51 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %25, float %50) #3, !dbg !27
+  %52 = and i32 %7, 63, !dbg !28
+  %53 = icmp eq i32 %52, 0, !dbg !28
+  %54 = bitcast float %50 to i32, !dbg !28
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %54, ptr addrspace(1) %3, i1 %53) #3, !dbg !28
+  tail call void @llvm.nvvm.barrier0(), !dbg !29
+  %55 = bitcast float %51 to i32, !dbg !30
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %55, ptr addrspace(1) %0, i1 %53) #3, !dbg !30
+  ret void, !dbg !31
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "c4pjg247azzp2aq7obfyfqmnzxmijq72eajmsphnpe2f5hawqzuf.py", directory: "/tmp/torchinductor_root/4p")
+!3 = !{ptr @triton__0d1d2d3d45e, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2d3d45e, !"maxntidx", i32 64}
+!5 = distinct !DISubprogram(name: "triton__0d1d2d3d45e", linkageName: "triton__0d1d2d3d45e", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 25, column: 34, scope: !5)
+!9 = !DILocation(line: 28, column: 30, scope: !5)
+!10 = !DILocation(line: 28, column: 35, scope: !5)
+!11 = !DILocation(line: 29, column: 30, scope: !5)
+!12 = !DILocation(line: 29, column: 35, scope: !5)
+!13 = !DILocation(line: 243, column: 36, scope: !14, inlinedAt: !16)
+!14 = distinct !DILexicalBlockFile(scope: !5, file: !15, discriminator: 0)
+!15 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!16 = !DILocation(line: 32, column: 24, scope: !14)
+!17 = !DILocation(line: 233, column: 15, scope: !18, inlinedAt: !19)
+!18 = distinct !DILexicalBlockFile(scope: !14, file: !15, discriminator: 0)
+!19 = !DILocation(line: 243, column: 36, scope: !18, inlinedAt: !20)
+!20 = !DILocation(line: 32, column: 24, scope: !18)
+!21 = !DILocation(line: 243, column: 36, scope: !14, inlinedAt: !22)
+!22 = !DILocation(line: 35, column: 24, scope: !14)
+!23 = !DILocation(line: 233, column: 15, scope: !18, inlinedAt: !24)
+!24 = !DILocation(line: 243, column: 36, scope: !18, inlinedAt: !25)
+!25 = !DILocation(line: 35, column: 24, scope: !18)
+!26 = !DILocation(line: 36, column: 20, scope: !5)
+!27 = !DILocation(line: 37, column: 19, scope: !5)
+!28 = !DILocation(line: 38, column: 68, scope: !5)
+!29 = !DILocation(line: 39, column: 4, scope: !5)
+!30 = !DILocation(line: 40, column: 71, scope: !5)
+!31 = !DILocation(line: 40, column: 4, scope: !5)
diff --git a/.triton/dump/e4b4750326af484a12cc939125bd831c/triton_.ttir b/.triton/dump/e4b4750326af484a12cc939125bd831c/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..c9389e1d8a2f7d20219e0196b94ac9bb7edb9074
--- /dev/null
+++ b/.triton/dump/e4b4750326af484a12cc939125bd831c/triton_.ttir
@@ -0,0 +1,41 @@
+module {
+  tt.func public @triton__0d1d2d3d45e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: i32, %arg5: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
+    %c0_i32 = arith.constant 0 : i32
+    %cst = arith.constant dense<0> : tensor<1x8xi64>
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<1x8xf32>
+    %cst_1 = arith.constant dense<8> : tensor<1x8xi32>
+    %0 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
+    %1 = tt.expand_dims %0 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
+    %2 = arith.cmpi slt, %1, %cst_1 : tensor<1x8xi32>
+    %3 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<1x8x!tt.ptr<f32, 1>>
+    %4 = tt.addptr %3, %1 : tensor<1x8x!tt.ptr<f32, 1>>, tensor<1x8xi32>
+    %5 = tt.load %4, %2, %cst_0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x8xf32>
+    %6 = tt.splat %arg2 : (!tt.ptr<i64, 1>) -> tensor<1x8x!tt.ptr<i64, 1>>
+    %7 = tt.addptr %6, %1 : tensor<1x8x!tt.ptr<i64, 1>>, tensor<1x8xi32>
+    %8 = tt.load %7, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1x8xi64>
+    %9 = arith.select %2, %5, %cst_0 : tensor<1x8xi1>, tensor<1x8xf32>
+    %10 = "tt.reduce"(%9) <{axis = 1 : i32}> ({
+    ^bb0(%arg6: f32, %arg7: f32):
+      %21 = arith.addf %arg6, %arg7 : f32
+      tt.reduce.return %21 : f32
+    }) : (tensor<1x8xf32>) -> tensor<1xf32>
+    %11 = tt.expand_dims %10 {axis = 1 : i32} : (tensor<1xf32>) -> tensor<1x1xf32>
+    %12 = arith.select %2, %8, %cst : tensor<1x8xi1>, tensor<1x8xi64>
+    %13 = "tt.reduce"(%12) <{axis = 1 : i32}> ({
+    ^bb0(%arg6: i64, %arg7: i64):
+      %21 = arith.addi %arg6, %arg7 : i64
+      tt.reduce.return %21 : i64
+    }) : (tensor<1x8xi64>) -> tensor<1xi64>
+    %14 = tt.expand_dims %13 {axis = 1 : i32} : (tensor<1xi64>) -> tensor<1x1xi64>
+    %15 = arith.sitofp %14 : tensor<1x1xi64> to tensor<1x1xf32>
+    %16 = arith.divf %11, %15 : tensor<1x1xf32>
+    %17 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
+    %18 = tt.splat %17 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>>
+    tt.store %18, %15 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32>
+    gpu.barrier
+    %19 = tt.addptr %arg0, %c0_i32 : !tt.ptr<f32, 1>, i32
+    %20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>>
+    tt.store %20, %16 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32>
+    tt.return
+  }
+}
diff --git a/.triton/dump/f5088324dcdcf6814f6743553c1321c2/triton_.cubin b/.triton/dump/f5088324dcdcf6814f6743553c1321c2/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..9b45ae143427b717fd5a0ef9c6f197c2d3761771
Binary files /dev/null and b/.triton/dump/f5088324dcdcf6814f6743553c1321c2/triton_.cubin differ
diff --git a/wandb/run-20240927_021423-clesd0p8/files/wandb-metadata.json b/wandb/run-20240927_021423-clesd0p8/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..1deca39887b9df08c314d5a0965781ce6c4af51e
--- /dev/null
+++ b/wandb/run-20240927_021423-clesd0p8/files/wandb-metadata.json
@@ -0,0 +1,60 @@
+{
+  "os": "Linux-5.15.0-113-generic-x86_64-with-glibc2.35",
+  "python": "3.10.12",
+  "startedAt": "2024-09-27T02:14:23.674273Z",
+  "args": [
+    "--batch_size=120"
+  ],
+  "program": "/root/train.py",
+  "codePath": "train.py",
+  "email": "prasadchandalada@gmail.com",
+  "root": "/root",
+  "host": "184d1c0992ce",
+  "username": "root",
+  "executable": "/usr/bin/python",
+  "codePathLocal": "train.py",
+  "cpu_count": 64,
+  "cpu_count_logical": 128,
+  "gpu": "[NVIDIA L40S, NVIDIA L40S, NVIDIA L40S, NVIDIA L40S]",
+  "gpu_count": 4,
+  "disk": {
+    "/": {
+      "total": "542239621120",
+      "used": "401116938240"
+    }
+  },
+  "memory": {
+    "total": "811327934464"
+  },
+  "cpu": {
+    "count": 64,
+    "countLogical": 128
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA L40S",
+      "memoryTotal": "48305799168",
+      "cudaCores": 18176,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L40S",
+      "memoryTotal": "48305799168",
+      "cudaCores": 18176,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L40S",
+      "memoryTotal": "48305799168",
+      "cudaCores": 18176,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L40S",
+      "memoryTotal": "48305799168",
+      "cudaCores": 18176,
+      "architecture": "Ada"
+    }
+  ],
+  "cudaVersion": "12.2"
+}
\ No newline at end of file