0-hero
/

gpt2-pos-encoding-experiment-100B

Model card Files Files and versions Community

0-hero committed on Sep 27, 2024

Commit

0eeffdd

verified ·

1 Parent(s): 00b5d9e

Add files using upload-large-folder tool

Browse files

Files changed (11) hide show

.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.cubin +0 -0
.triton/dump/4c6ad48573c74d55ed79384f6b432d50/triton_.ttgir +19 -0
.triton/dump/510522bb05917b836ed253751364fcad/triton_.ttgir +153 -0
.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ttgir +164 -0
.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ptx +277 -0
.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.llir +0 -0
.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.cubin +0 -0
.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.llir +443 -0
.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.ttgir +81 -0
wandb/run-20240926_055222-14kj2390/files/wandb-metadata.json +60 -0
wandb/run-20240926_055222-14kj2390/logs/debug-core.log +14 -0

.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.cubin ADDED Viewed

Binary file (16.8 kB). View file

.triton/dump/4c6ad48573c74d55ed79384f6b432d50/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,19 @@

+#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>  // 1-D layout: 8 elems/thread x 32 threads/warp x 4 warps = 1024 elems per CTA
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {  // Elementwise kernel: copies an f32 buffer into a bf16 buffer, 1024 elements per program
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {  // %arg0: f32 source, %arg1: bf16 destination; %arg2 is never referenced in the body (presumably the element count -- confirm against the generating Inductor source)
+    %c1024_i32 = arith.constant 1024 : i32  // elements handled by one program instance
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32  // first element index owned by this program
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
+    %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>  // per-lane element indices: pid * 1024 + [0, 1024)
+    %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
+    %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi32, #blocked>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked>  // load carries no mask operand
+    %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
+    %9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>  // destination uses the same element indices as the source
+    %10 = arith.truncf %7 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>  // narrow f32 -> bf16
+    tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>  // store carries no mask operand
+    tt.return
+  }
+}

.triton/dump/510522bb05917b836ed253751364fcad/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,153 @@

+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>  // main 2-D layout used for all loads, arithmetic and the final store
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>  // layout used only for the copy of the index column that feeds tt.assert
+#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>  // layout used for the element-count accumulator that is later converted to #blocked
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {  // Two-pass kernel: per row, gathers a 256-wide row from %arg1 by index, adds a row of %arg2, normalises by row mean/variance, scales by %arg3 and stores bf16 to %arg4
+  tt.func public @triton__0d1d2d3d4d5de6de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {  // %arg0: i64 row indices; %arg1: gathered f32 table; %arg2: f32 table indexed by (row % 512); %arg3: per-column f32 scale; %arg4: bf16 output; %arg5/%arg6 are never referenced in the body. NOTE(review): presumably token-embedding + position-embedding + LayerNorm-without-bias -- confirm against the originating graph
+    %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>  // modulus applied to the row index before addressing %arg2
+    %cst_0 = arith.constant dense<256> : tensor<1x64xi32, #blocked>  // column bound used for the load/store mask
+    %cst_1 = arith.constant dense<256> : tensor<64x1xi32, #blocked>  // row stride (elements) for %arg2 and %arg4
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked>
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x64xf32, #blocked>
+    %cst_4 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
+    %cst_5 = arith.constant dense<256> : tensor<64x1xi64, #blocked>  // row stride (elements) for the gathered table %arg1
+    %cst_6 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
+    %cst_7 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>  // added to negative indices to wrap them
+    %cst_8 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>  // exclusive upper bound checked by tt.assert
+    %cst_9 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
+    %c0_i32 = arith.constant 0 : i32
+    %c64_i32 = arith.constant 64 : i32
+    %c256_i32 = arith.constant 256 : i32
+    %cst_10 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked2>
+    %cst_11 = arith.constant 0.000000e+00 : f32
+    %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked2>
+    %cst_13 = arith.constant dense<256> : tensor<1x64xi32, #blocked2>
+    %cst_14 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>  // epsilon added to the variance before rsqrt
+    %cst_15 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>  // divisor turning the summed squared deviations into a variance
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c64_i32 : i32  // first row owned by this program (64 rows per program)
+    %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+    %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
+    %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
+    %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
+    %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
+    %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>  // row indices, #blocked layout
+    %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>  // same row indices, #blocked1 layout (for the assert path)
+    %10 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+    %11 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
+    %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x64xi32, #blocked>  // column offsets within a 64-wide tile
+    %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x64xi32, #blocked2>
+    %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
+    %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
+    %16 = tt.addptr %14, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
+    %17 = tt.addptr %15, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
+    %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>  // per-row gather index
+    %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>  // same indices, loaded again in the assert layout
+    %20 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>  // row % 512
+    %21 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked>  // (row % 512) * 256: row offset into %arg2
+    %22 = tt.broadcast %21 : (tensor<64x1xi32, #blocked>) -> tensor<64x64xi32, #blocked>
+    %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
+    %24 = arith.addi %18, %cst_7 : tensor<64x1xi64, #blocked>
+    %25 = arith.addi %19, %cst_8 : tensor<64x1xi64, #blocked1>
+    %26 = arith.cmpi slt, %18, %cst_6 : tensor<64x1xi64, #blocked>
+    %27 = arith.cmpi slt, %19, %cst_9 : tensor<64x1xi64, #blocked1>
+    %28 = arith.select %26, %24, %18 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>  // negative indices wrap: idx < 0 ? idx + 50257 : idx
+    %29 = arith.select %27, %25, %19 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
+    %30 = arith.cmpi sge, %29, %cst_9 : tensor<64x1xi64, #blocked1>
+    %31 = arith.cmpi slt, %29, %cst_8 : tensor<64x1xi64, #blocked1>
+    %32 = arith.andi %30, %31 : tensor<64x1xi1, #blocked1>  // bounds predicate 0 <= idx < 50257, consumed by tt.assert
+    %33 = arith.muli %28, %cst_5 : tensor<64x1xi64, #blocked>  // idx * 256: row offset into the gathered table %arg1
+    %34 = tt.broadcast %33 : (tensor<64x1xi64, #blocked>) -> tensor<64x64xi64, #blocked>
+    %35 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
+    %36:4 = scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg8 = %cst_2, %arg9 = %cst_2, %arg10 = %cst_12, %arg11 = %cst_2) -> (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked>)  : i32 {  // pass 1 over columns [0, 256) in 64-wide tiles; carries running mean (%arg8), sum of squared deviations (%arg9) and element count in two layouts (%arg10, %arg11)
+      %48 = tt.splat %arg7 : (i32) -> tensor<1x64xi32, #blocked>
+      %49 = tt.splat %arg7 : (i32) -> tensor<1x64xi32, #blocked2>
+      %50 = arith.addi %48, %12 : tensor<1x64xi32, #blocked>  // absolute column indices of this tile
+      %51 = arith.addi %49, %13 : tensor<1x64xi32, #blocked2>
+      %52 = arith.cmpi slt, %50, %cst_0 : tensor<1x64xi32, #blocked>  // column mask: col < 256
+      %53 = arith.cmpi slt, %51, %cst_13 : tensor<1x64xi32, #blocked2>
+      %54 = tt.broadcast %50 : (tensor<1x64xi32, #blocked>) -> tensor<64x64xi32, #blocked>
+      %55 = arith.addi %54, %22 : tensor<64x64xi32, #blocked>
+      %56 = tt.addptr %23, %55 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi32, #blocked>
+      %57 = tt.broadcast %52 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
+      %58 = tt.broadcast %53 : (tensor<1x64xi1, #blocked2>) -> tensor<64x64xi1, #blocked2>
+      %59 = tt.load %56, %57, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>  // %arg2[(row % 512) * 256 + col], 0.0 where masked
+      tt.assert %32, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>  // device-side bounds check on the gather index
+      %60 = arith.extsi %50 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
+      %61 = tt.broadcast %60 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
+      %62 = arith.addi %61, %34 : tensor<64x64xi64, #blocked>
+      %63 = tt.addptr %35, %62 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
+      %64 = tt.load %63, %57, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>  // gathered %arg1[idx * 256 + col]
+      %65 = arith.addf %64, %59 : tensor<64x64xf32, #blocked>  // x = gathered value + %arg2 value
+      %66 = arith.subf %65, %arg8 : tensor<64x64xf32, #blocked>  // delta = x - running mean
+      %67 = arith.addf %arg11, %cst_4 : tensor<64x64xf32, #blocked>  // count + 1
+      %68 = arith.addf %arg10, %cst_10 : tensor<64x64xf32, #blocked2>  // count + 1 (second layout)
+      %69 = arith.divf %66, %67 : tensor<64x64xf32, #blocked>
+      %70 = arith.addf %arg8, %69 : tensor<64x64xf32, #blocked>  // new mean = mean + delta / count
+      %71 = arith.subf %65, %70 : tensor<64x64xf32, #blocked>
+      %72 = arith.mulf %66, %71 : tensor<64x64xf32, #blocked>
+      %73 = arith.addf %arg9, %72 : tensor<64x64xf32, #blocked>  // M2 += delta * (x - new mean)  (Welford-style update)
+      %74 = arith.select %57, %70, %arg8 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>  // masked-off lanes keep their previous accumulator values
+      %75 = arith.select %57, %73, %arg9 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
+      %76 = arith.select %57, %67, %arg11 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
+      %77 = arith.select %58, %68, %arg10 : tensor<64x64xi1, #blocked2>, tensor<64x64xf32, #blocked2>
+      scf.yield %74, %75, %77, %76 : tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked>
+    }
+    %37 = triton_gpu.convert_layout %36#2 : (tensor<64x64xf32, #blocked2>) -> tensor<64x64xf32, #blocked>  // bring the count accumulator into the layout of mean/M2 before reducing
+    %38:3 = "tt.reduce"(%36#0, %36#1, %37) <{axis = 1 : i32}> ({  // merge the 64 per-lane (mean, M2, count) partials of each row along axis 1
+    ^bb0(%arg7: f32, %arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32):  // (%arg7, %arg8, %arg9) = left (mean, M2, count); (%arg10, %arg11, %arg12) = right (mean, M2, count)
+      %48 = arith.subf %arg10, %arg7 : f32  // delta = mean_b - mean_a
+      %49 = arith.addf %arg9, %arg12 : f32  // combined count
+      %50 = arith.cmpf oeq, %49, %cst_11 : f32
+      %51 = arith.divf %arg12, %49 : f32
+      %52 = arith.select %50, %cst_11, %51 : f32  // weight = count_b / count, forced to 0 when the combined count is 0
+      %53 = arith.mulf %48, %52 : f32
+      %54 = arith.addf %arg7, %53 : f32  // combined mean = mean_a + delta * weight
+      %55 = arith.addf %arg8, %arg11 : f32
+      %56 = arith.mulf %48, %48 : f32
+      %57 = arith.mulf %56, %arg9 : f32
+      %58 = arith.mulf %57, %52 : f32
+      %59 = arith.addf %55, %58 : f32  // combined M2 = M2_a + M2_b + delta^2 * count_a * weight
+      tt.reduce.return %54, %59, %49 : f32, f32, f32
+    }) : (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
+    %39 = tt.expand_dims %38#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>  // per-row mean
+    %40 = tt.expand_dims %38#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>  // per-row M2
+    %41 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<1x64x!tt.ptr<f32, 1>, #blocked>
+    %42 = tt.broadcast %39 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
+    %43 = arith.divf %40, %cst_15 : tensor<64x1xf32, #blocked>  // variance = M2 / 256
+    %44 = arith.addf %43, %cst_14 : tensor<64x1xf32, #blocked>  // variance + epsilon
+    %45 = arith.muli %8, %cst_1 : tensor<64x1xi32, #blocked>  // row * 256: row offset into the output %arg4
+    %46 = tt.broadcast %45 : (tensor<64x1xi32, #blocked>) -> tensor<64x64xi32, #blocked>
+    %47 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>, #blocked>
+    scf.for %arg7 = %c0_i32 to %c256_i32 step %c64_i32  : i32 {  // pass 2: reload the inputs, normalise, scale and store
+      %48 = tt.splat %arg7 : (i32) -> tensor<1x64xi32, #blocked>
+      %49 = arith.addi %48, %12 : tensor<1x64xi32, #blocked>
+      %50 = arith.cmpi slt, %49, %cst_0 : tensor<1x64xi32, #blocked>  // column mask: col < 256
+      %51 = tt.broadcast %49 : (tensor<1x64xi32, #blocked>) -> tensor<64x64xi32, #blocked>
+      %52 = arith.addi %51, %22 : tensor<64x64xi32, #blocked>
+      %53 = tt.addptr %23, %52 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi32, #blocked>
+      %54 = tt.broadcast %50 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
+      %55 = tt.load %53, %54, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>  // %arg2 value, as in pass 1
+      %56 = tt.addptr %41, %49 : tensor<1x64x!tt.ptr<f32, 1>, #blocked>, tensor<1x64xi32, #blocked>
+      %57 = tt.load %56, %50, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32, #blocked>  // per-column scale %arg3[col]
+      tt.assert %32, "index out of bounds: 0 <= tmp13 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>  // same bounds predicate as pass 1
+      %58 = arith.extsi %49 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
+      %59 = tt.broadcast %58 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
+      %60 = arith.addi %59, %34 : tensor<64x64xi64, #blocked>
+      %61 = tt.addptr %35, %60 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
+      %62 = tt.load %61, %54, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>  // gathered %arg1 value, as in pass 1
+      %63 = arith.addf %62, %55 : tensor<64x64xf32, #blocked>  // x, recomputed
+      %64 = arith.subf %63, %42 : tensor<64x64xf32, #blocked>  // x - mean
+      %65 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>  // rsqrt(variance + epsilon) via libdevice
+      %66 = tt.broadcast %65 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
+      %67 = arith.mulf %64, %66 : tensor<64x64xf32, #blocked>  // normalised value
+      %68 = tt.broadcast %57 : (tensor<1x64xf32, #blocked>) -> tensor<64x64xf32, #blocked>
+      %69 = arith.mulf %67, %68 : tensor<64x64xf32, #blocked>  // multiply by the per-column scale; no additive term is applied
+      %70 = arith.addi %51, %46 : tensor<64x64xi32, #blocked>  // output offset row * 256 + col
+      %71 = tt.addptr %47, %70 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi32, #blocked>
+      %72 = arith.truncf %69 : tensor<64x64xf32, #blocked> to tensor<64x64xbf16, #blocked>
+      tt.store %71, %72, %54 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16, #blocked>  // masked bf16 store to %arg4
+    }
+    tt.return
+  }
+}

.triton/dump/53075505618c3af0ef6ce61f3300cdcb/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,164 @@

+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>  // main 2-D layout used for all loads, arithmetic and the final store
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>  // layout used only for the copy of the index column that feeds tt.assert
+#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>  // layout used for the element-count accumulator that is later converted to #blocked
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {  // Two-pass kernel: per row, sums a gathered row of %arg1, a row of %arg2 and a bf16 row of %arg3, normalises by row mean/variance, scales by %arg4 and stores bf16 to %arg5
+  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {  // %arg0: i64 row indices; %arg1: gathered f32 table; %arg2: f32 table indexed by (row % 512); %arg3: bf16 addend indexed by row; %arg4: per-column f32 scale; %arg5: bf16 output; %arg6/%arg7 are never referenced in the body. NOTE(review): presumably embeddings + a residual branch followed by LayerNorm-without-bias -- confirm against the originating graph
+    %cst = arith.constant dense<512> : tensor<64x1xi32, #blocked>  // modulus applied to the row index before addressing %arg2
+    %cst_0 = arith.constant dense<256> : tensor<1x64xi32, #blocked>  // column bound used for the load/store mask
+    %cst_1 = arith.constant dense<256> : tensor<64x1xi32, #blocked>  // row stride (elements) for %arg2, %arg3 and %arg5
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked>
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<1x64xf32, #blocked>
+    %cst_4 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked>
+    %cst_5 = arith.constant dense<256> : tensor<64x1xi64, #blocked>  // row stride (elements) for the gathered table %arg1
+    %cst_6 = arith.constant dense<0> : tensor<64x1xi64, #blocked>
+    %cst_7 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>  // added to negative indices to wrap them
+    %cst_8 = arith.constant dense<50257> : tensor<64x1xi64, #blocked1>  // exclusive upper bound checked by tt.assert
+    %cst_9 = arith.constant dense<0> : tensor<64x1xi64, #blocked1>
+    %c0_i32 = arith.constant 0 : i32
+    %c64_i32 = arith.constant 64 : i32
+    %c256_i32 = arith.constant 256 : i32
+    %cst_10 = arith.constant dense<1.000000e+00> : tensor<64x64xf32, #blocked2>
+    %cst_11 = arith.constant 0.000000e+00 : f32
+    %cst_12 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked2>
+    %cst_13 = arith.constant dense<256> : tensor<1x64xi32, #blocked2>
+    %cst_14 = arith.constant dense<9.99999974E-6> : tensor<64x1xf32, #blocked>  // epsilon added to the variance before rsqrt
+    %cst_15 = arith.constant dense<2.560000e+02> : tensor<64x1xf32, #blocked>  // divisor turning the summed squared deviations into a variance
+    %cst_16 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked>  // fill value for masked bf16 loads
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c64_i32 : i32  // first row owned by this program (64 rows per program)
+    %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+    %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
+    %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<64x1xi32, #blocked1>
+    %6 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked>
+    %7 = tt.splat %1 : (i32) -> tensor<64x1xi32, #blocked1>
+    %8 = arith.addi %6, %4 : tensor<64x1xi32, #blocked>  // row indices, #blocked layout
+    %9 = arith.addi %7, %5 : tensor<64x1xi32, #blocked1>  // same row indices, #blocked1 layout (for the assert path)
+    %10 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+    %11 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>
+    %12 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x64xi32, #blocked>  // column offsets within a 64-wide tile
+    %13 = tt.expand_dims %11 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x64xi32, #blocked2>
+    %14 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
+    %15 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked1>
+    %16 = tt.addptr %14, %8 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi32, #blocked>
+    %17 = tt.addptr %15, %9 : tensor<64x1x!tt.ptr<i64, 1>, #blocked1>, tensor<64x1xi32, #blocked1>
+    %18 = tt.load %16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>  // per-row gather index
+    %19 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked1>  // same indices, loaded again in the assert layout
+    %20 = arith.remsi %8, %cst : tensor<64x1xi32, #blocked>  // row % 512
+    %21 = arith.muli %20, %cst_1 : tensor<64x1xi32, #blocked>  // (row % 512) * 256: row offset into %arg2
+    %22 = tt.broadcast %21 : (tensor<64x1xi32, #blocked>) -> tensor<64x64xi32, #blocked>
+    %23 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
+    %24 = arith.muli %8, %cst_1 : tensor<64x1xi32, #blocked>  // row * 256: row offset shared by the %arg3 input and the %arg5 output
+    %25 = tt.broadcast %24 : (tensor<64x1xi32, #blocked>) -> tensor<64x64xi32, #blocked>
+    %26 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>, #blocked>
+    %27 = arith.addi %18, %cst_7 : tensor<64x1xi64, #blocked>
+    %28 = arith.addi %19, %cst_8 : tensor<64x1xi64, #blocked1>
+    %29 = arith.cmpi slt, %18, %cst_6 : tensor<64x1xi64, #blocked>
+    %30 = arith.cmpi slt, %19, %cst_9 : tensor<64x1xi64, #blocked1>
+    %31 = arith.select %29, %27, %18 : tensor<64x1xi1, #blocked>, tensor<64x1xi64, #blocked>  // negative indices wrap: idx < 0 ? idx + 50257 : idx
+    %32 = arith.select %30, %28, %19 : tensor<64x1xi1, #blocked1>, tensor<64x1xi64, #blocked1>
+    %33 = arith.cmpi sge, %32, %cst_9 : tensor<64x1xi64, #blocked1>
+    %34 = arith.cmpi slt, %32, %cst_8 : tensor<64x1xi64, #blocked1>
+    %35 = arith.andi %33, %34 : tensor<64x1xi1, #blocked1>  // bounds predicate 0 <= idx < 50257, consumed by tt.assert
+    %36 = arith.muli %31, %cst_5 : tensor<64x1xi64, #blocked>  // idx * 256: row offset into the gathered table %arg1
+    %37 = tt.broadcast %36 : (tensor<64x1xi64, #blocked>) -> tensor<64x64xi64, #blocked>
+    %38 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
+    %39:4 = scf.for %arg8 = %c0_i32 to %c256_i32 step %c64_i32 iter_args(%arg9 = %cst_2, %arg10 = %cst_2, %arg11 = %cst_12, %arg12 = %cst_2) -> (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked>)  : i32 {  // pass 1 over columns [0, 256) in 64-wide tiles; carries running mean (%arg9), sum of squared deviations (%arg10) and element count in two layouts (%arg11, %arg12)
+      %49 = tt.splat %arg8 : (i32) -> tensor<1x64xi32, #blocked>
+      %50 = tt.splat %arg8 : (i32) -> tensor<1x64xi32, #blocked2>
+      %51 = arith.addi %49, %12 : tensor<1x64xi32, #blocked>  // absolute column indices of this tile
+      %52 = arith.addi %50, %13 : tensor<1x64xi32, #blocked2>
+      %53 = arith.cmpi slt, %51, %cst_0 : tensor<1x64xi32, #blocked>  // column mask: col < 256
+      %54 = arith.cmpi slt, %52, %cst_13 : tensor<1x64xi32, #blocked2>
+      %55 = tt.broadcast %51 : (tensor<1x64xi32, #blocked>) -> tensor<64x64xi32, #blocked>
+      %56 = arith.addi %55, %22 : tensor<64x64xi32, #blocked>
+      %57 = tt.addptr %23, %56 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi32, #blocked>
+      %58 = tt.broadcast %53 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
+      %59 = tt.broadcast %54 : (tensor<1x64xi1, #blocked2>) -> tensor<64x64xi1, #blocked2>
+      %60 = tt.load %57, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>  // %arg2[(row % 512) * 256 + col], 0.0 where masked
+      %61 = arith.addi %55, %25 : tensor<64x64xi32, #blocked>
+      %62 = tt.addptr %26, %61 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi32, #blocked>
+      %63 = tt.load %62, %58, %cst_16 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xbf16, #blocked>  // %arg3[row * 256 + col]
+      %64 = arith.extf %63 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked>  // widen bf16 -> f32 before accumulating
+      tt.assert %35, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>  // device-side bounds check on the gather index
+      %65 = arith.extsi %51 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
+      %66 = tt.broadcast %65 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
+      %67 = arith.addi %66, %37 : tensor<64x64xi64, #blocked>
+      %68 = tt.addptr %38, %67 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
+      %69 = tt.load %68, %58, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>  // gathered %arg1[idx * 256 + col]
+      %70 = arith.addf %69, %60 : tensor<64x64xf32, #blocked>
+      %71 = arith.addf %70, %64 : tensor<64x64xf32, #blocked>  // x = gathered value + %arg2 value + %arg3 value
+      %72 = arith.subf %71, %arg9 : tensor<64x64xf32, #blocked>  // delta = x - running mean
+      %73 = arith.addf %arg12, %cst_4 : tensor<64x64xf32, #blocked>  // count + 1
+      %74 = arith.addf %arg11, %cst_10 : tensor<64x64xf32, #blocked2>  // count + 1 (second layout)
+      %75 = arith.divf %72, %73 : tensor<64x64xf32, #blocked>
+      %76 = arith.addf %arg9, %75 : tensor<64x64xf32, #blocked>  // new mean = mean + delta / count
+      %77 = arith.subf %71, %76 : tensor<64x64xf32, #blocked>
+      %78 = arith.mulf %72, %77 : tensor<64x64xf32, #blocked>
+      %79 = arith.addf %arg10, %78 : tensor<64x64xf32, #blocked>  // M2 += delta * (x - new mean)  (Welford-style update)
+      %80 = arith.select %58, %76, %arg9 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>  // masked-off lanes keep their previous accumulator values
+      %81 = arith.select %58, %79, %arg10 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
+      %82 = arith.select %58, %73, %arg12 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>
+      %83 = arith.select %59, %74, %arg11 : tensor<64x64xi1, #blocked2>, tensor<64x64xf32, #blocked2>
+      scf.yield %80, %81, %83, %82 : tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked2>, tensor<64x64xf32, #blocked>
+    }
+    %40 = triton_gpu.convert_layout %39#2 : (tensor<64x64xf32, #blocked2>) -> tensor<64x64xf32, #blocked>  // bring the count accumulator into the layout of mean/M2 before reducing
+    %41:3 = "tt.reduce"(%39#0, %39#1, %40) <{axis = 1 : i32}> ({  // merge the 64 per-lane (mean, M2, count) partials of each row along axis 1
+    ^bb0(%arg8: f32, %arg9: f32, %arg10: f32, %arg11: f32, %arg12: f32, %arg13: f32):  // (%arg8, %arg9, %arg10) = left (mean, M2, count); (%arg11, %arg12, %arg13) = right (mean, M2, count)
+      %49 = arith.subf %arg11, %arg8 : f32  // delta = mean_b - mean_a
+      %50 = arith.addf %arg10, %arg13 : f32  // combined count
+      %51 = arith.cmpf oeq, %50, %cst_11 : f32
+      %52 = arith.divf %arg13, %50 : f32
+      %53 = arith.select %51, %cst_11, %52 : f32  // weight = count_b / count, forced to 0 when the combined count is 0
+      %54 = arith.mulf %49, %53 : f32
+      %55 = arith.addf %arg8, %54 : f32  // combined mean = mean_a + delta * weight
+      %56 = arith.addf %arg9, %arg12 : f32
+      %57 = arith.mulf %49, %49 : f32
+      %58 = arith.mulf %57, %arg10 : f32
+      %59 = arith.mulf %58, %53 : f32
+      %60 = arith.addf %56, %59 : f32  // combined M2 = M2_a + M2_b + delta^2 * count_a * weight
+      tt.reduce.return %55, %60, %50 : f32, f32, f32
+    }) : (tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>, tensor<64x64xf32, #blocked>) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>)
+    %42 = tt.expand_dims %41#0 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>  // per-row mean
+    %43 = tt.expand_dims %41#1 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>  // per-row M2
+    %44 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<1x64x!tt.ptr<f32, 1>, #blocked>
+    %45 = tt.broadcast %42 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
+    %46 = arith.divf %43, %cst_15 : tensor<64x1xf32, #blocked>  // variance = M2 / 256
+    %47 = arith.addf %46, %cst_14 : tensor<64x1xf32, #blocked>  // variance + epsilon
+    %48 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>, #blocked>
+    scf.for %arg8 = %c0_i32 to %c256_i32 step %c64_i32  : i32 {  // pass 2: reload the inputs, normalise, scale and store
+      %49 = tt.splat %arg8 : (i32) -> tensor<1x64xi32, #blocked>
+      %50 = arith.addi %49, %12 : tensor<1x64xi32, #blocked>
+      %51 = arith.cmpi slt, %50, %cst_0 : tensor<1x64xi32, #blocked>  // column mask: col < 256
+      %52 = tt.broadcast %50 : (tensor<1x64xi32, #blocked>) -> tensor<64x64xi32, #blocked>
+      %53 = arith.addi %52, %22 : tensor<64x64xi32, #blocked>
+      %54 = tt.addptr %23, %53 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi32, #blocked>
+      %55 = tt.broadcast %51 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
+      %56 = tt.load %54, %55, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>  // %arg2 value, as in pass 1
+      %57 = arith.addi %52, %25 : tensor<64x64xi32, #blocked>  // row * 256 + col, reused below for the output address
+      %58 = tt.addptr %26, %57 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi32, #blocked>
+      %59 = tt.load %58, %55, %cst_16 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xbf16, #blocked>  // %arg3 value, as in pass 1
+      %60 = arith.extf %59 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked>
+      %61 = tt.addptr %44, %50 : tensor<1x64x!tt.ptr<f32, 1>, #blocked>, tensor<1x64xi32, #blocked>
+      %62 = tt.load %61, %51, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x64xf32, #blocked>  // per-column scale %arg4[col]
+      tt.assert %35, "index out of bounds: 0 <= tmp16 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<64x1xi1, #blocked1>  // same bounds predicate as pass 1
+      %63 = arith.extsi %50 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
+      %64 = tt.broadcast %63 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
+      %65 = arith.addi %64, %37 : tensor<64x64xi64, #blocked>
+      %66 = tt.addptr %38, %65 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
+      %67 = tt.load %66, %55, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>  // gathered %arg1 value, as in pass 1
+      %68 = arith.addf %67, %56 : tensor<64x64xf32, #blocked>
+      %69 = arith.addf %68, %60 : tensor<64x64xf32, #blocked>  // x, recomputed
+      %70 = arith.subf %69, %45 : tensor<64x64xf32, #blocked>  // x - mean
+      %71 = tt.extern_elementwise %47 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (tensor<64x1xf32, #blocked>) -> tensor<64x1xf32, #blocked>  // rsqrt(variance + epsilon) via libdevice
+      %72 = tt.broadcast %71 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
+      %73 = arith.mulf %70, %72 : tensor<64x64xf32, #blocked>  // normalised value
+      %74 = tt.broadcast %62 : (tensor<1x64xf32, #blocked>) -> tensor<64x64xf32, #blocked>
+      %75 = arith.mulf %73, %74 : tensor<64x64xf32, #blocked>  // multiply by the per-column scale; no additive term is applied
+      %76 = tt.addptr %48, %57 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi32, #blocked>  // output address %arg5 + row * 256 + col
+      %77 = arith.truncf %75 : tensor<64x64xf32, #blocked> to tensor<64x64xbf16, #blocked>
+      tt.store %76, %77, %55 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16, #blocked>  // masked bf16 store to %arg5
+    }
+    tt.return
+  }
+}

.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ptx ADDED Viewed

	@@ -0,0 +1,277 @@

+//
+// Generated by LLVM NVPTX Back-End
+//
+.version 8.2
+.target sm_89
+.address_size 64
+	// .globl	triton__0d1de
+.visible .entry triton__0d1de(  // Kernel: each thread writes its own global index, as a 64-bit value, to param_0[index] when index < 512
+	.param .u64 triton__0d1de_param_0,  // base address of the 64-bit output buffer
+	.param .u32 triton__0d1de_param_1  // never loaded in the body (presumably the element count -- confirm against the generating source)
+)
+.maxntid 128, 1, 1  // at most 128 threads per block
+{
+	.reg .pred 	%p<2>;
+	.reg .b32 	%r<6>;
+	.reg .b64 	%rd<5>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+	ld.param.u64 	%rd3, [triton__0d1de_param_0];  // %rd3 = output base pointer
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r2, %tid.x;
+	and.b32  	%r3, %r2, 127;  // %r3 = thread index within the block, masked to [0, 128)
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	shl.b32 	%r4, %r1, 7;  // %r4 = block index * 128
+	.loc	1 21 23
+	or.b32  	%r5, %r4, %r3;  // %r5 = global element index (OR equals ADD here: the low 7 bits of %r4 are zero)
+	.loc	1 22 21
+	setp.lt.s32 	%p1, %r5, 512;  // %p1 = index < 512, guards the store
+	.loc	1 25 25
+	cvt.s64.s32 	%rd1, %r5;  // value to store: the index sign-extended to 64 bits
+	mul.wide.s32 	%rd4, %r5, 8;  // byte offset = index * 8
+	add.s64 	%rd2, %rd3, %rd4;  // destination address
+	.loc	1 25 36
+	@%p1 st.global.b64 [ %rd2 + 0 ], { %rd1 };  // predicated 64-bit global store: out[index] = index
+	.loc	1 25 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+}
+	.file	1 "/tmp/torchinductor_root/wx/cwxxgxdevnyc453z7hh4nxzgmvlhh6suwokktps3dw62btskgxt4.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 172
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 119
+.b8 120
+.b8 120
+.b8 103
+.b8 120
+.b8 100
+.b8 101
+.b8 118
+.b8 110
+.b8 121
+.b8 99
+.b8 52
+.b8 53
+.b8 51
+.b8 122
+.b8 55
+.b8 104
+.b8 104
+.b8 52
+.b8 110
+.b8 120
+.b8 122
+.b8 103
+.b8 109
+.b8 118
+.b8 108
+.b8 104
+.b8 104
+.b8 54
+.b8 115
+.b8 117
+.b8 119
+.b8 111
+.b8 107
+.b8 107
+.b8 116
+.b8 112
+.b8 115
+.b8 51
+.b8 100
+.b8 119
+.b8 54
+.b8 50
+.b8 98
+.b8 116
+.b8 115
+.b8 107
+.b8 103
+.b8 120
+.b8 116
+.b8 52
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 119
+.b8 120
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 176
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 176
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}

.triton/dump/89f8cc1079aa03024e56dc2aee42813a/triton_.llir ADDED Viewed

The diff for this file is too large to render. See raw diff

.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.cubin ADDED Viewed

Binary file (4.78 kB). View file

.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.llir ADDED Viewed

	@@ -0,0 +1,443 @@

+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+@global_smem = external addrspace(3) global [0 x i8]
+define void @triton__0d1d2d3de4(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, i64 %3, i64 %4) local_unnamed_addr !dbg !5 {
+  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %7 = lshr i32 %6, 5, !dbg !8
+  %urem = and i32 %6, 255, !dbg !8
+  %8 = or i32 %urem, 256, !dbg !8
+  %9 = or i32 %urem, 512, !dbg !8
+  %10 = or i32 %urem, 768, !dbg !8
+  %11 = or i32 %urem, 1024, !dbg !8
+  %12 = or i32 %urem, 1280, !dbg !8
+  %13 = or i32 %urem, 1536, !dbg !8
+  %14 = or i32 %urem, 1792, !dbg !8
+  %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
+  %16 = sext i32 %15 to i64, !dbg !10
+  %17 = insertelement <8 x i32> poison, i32 %urem, i64 0
+  %18 = insertelement <8 x i32> %17, i32 %8, i64 1
+  %19 = insertelement <8 x i32> %18, i32 %9, i64 2
+  %20 = insertelement <8 x i32> %19, i32 %10, i64 3
+  %21 = insertelement <8 x i32> %20, i32 %11, i64 4
+  %22 = insertelement <8 x i32> %21, i32 %12, i64 5
+  %23 = insertelement <8 x i32> %22, i32 %13, i64 6
+  %24 = insertelement <8 x i32> %23, i32 %14, i64 7
+  %25 = zext <8 x i32> %24 to <8 x i64>
+  %26 = mul nsw i64 %16, 50257, !dbg !11
+  %invariant.gep = getelementptr i16, ptr addrspace(1) %0, i64 %26, !dbg !12
+  br label %27, !dbg !12
+27:                                               ; preds = %5, %27
+  %28 = phi i32 [ 0, %5 ], [ %80, %27 ]
+  %29 = phi <8 x float> [ <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>, %5 ], [ %79, %27 ]
+  %30 = zext nneg i32 %28 to i64, !dbg !13
+  %31 = fcmp ord <8 x float> %29, zeroinitializer, !dbg !14
+  %32 = insertelement <8 x i64> poison, i64 %30, i64 0, !dbg !13
+  %33 = shufflevector <8 x i64> %32, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !13
+  %34 = or <8 x i64> %33, %25, !dbg !13
+  %35 = icmp ult <8 x i64> %34, <i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !18
+  %36 = extractelement <8 x i64> %34, i64 0, !dbg !19
+  %gep = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %36, !dbg !19
+  %37 = extractelement <8 x i64> %34, i64 1, !dbg !19
+  %gep20 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %37, !dbg !19
+  %38 = extractelement <8 x i64> %34, i64 2, !dbg !19
+  %gep22 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %38, !dbg !19
+  %39 = extractelement <8 x i64> %34, i64 3, !dbg !19
+  %gep24 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %39, !dbg !19
+  %40 = extractelement <8 x i64> %34, i64 4, !dbg !19
+  %gep26 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %40, !dbg !19
+  %41 = extractelement <8 x i64> %34, i64 5, !dbg !19
+  %gep28 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %41, !dbg !19
+  %42 = extractelement <8 x i64> %34, i64 6, !dbg !19
+  %gep30 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %42, !dbg !19
+  %43 = extractelement <8 x i64> %34, i64 7, !dbg !19
+  %gep32 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %43, !dbg !19
+  %44 = extractelement <8 x i1> %35, i64 0, !dbg !20
+  %45 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep, i1 %44, i16 0, i1 %44) #3, !dbg !20
+  %46 = extractelement <8 x i1> %35, i64 1, !dbg !20
+  %47 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep20, i1 %46, i16 0, i1 %46) #3, !dbg !20
+  %48 = extractelement <8 x i1> %35, i64 2, !dbg !20
+  %49 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep22, i1 %48, i16 0, i1 %48) #3, !dbg !20
+  %50 = extractelement <8 x i1> %35, i64 3, !dbg !20
+  %51 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep24, i1 %50, i16 0, i1 %50) #3, !dbg !20
+  %52 = extractelement <8 x i1> %35, i64 4, !dbg !20
+  %53 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep26, i1 %52, i16 0, i1 %52) #3, !dbg !20
+  %54 = extractelement <8 x i1> %35, i64 5, !dbg !20
+  %55 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep28, i1 %54, i16 0, i1 %54) #3, !dbg !20
+  %56 = extractelement <8 x i1> %35, i64 6, !dbg !20
+  %57 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep30, i1 %56, i16 0, i1 %56) #3, !dbg !20
+  %58 = extractelement <8 x i1> %35, i64 7, !dbg !20
+  %59 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep32, i1 %58, i16 0, i1 %58) #3, !dbg !20
+  %60 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #3, !dbg !21
+  %61 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %47) #3, !dbg !21
+  %62 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %49) #3, !dbg !21
+  %63 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %51) #3, !dbg !21
+  %64 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %53) #3, !dbg !21
+  %65 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %55) #3, !dbg !21
+  %66 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %57) #3, !dbg !21
+  %67 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %59) #3, !dbg !21
+  %68 = insertelement <8 x float> poison, float %60, i64 0, !dbg !22
+  %69 = insertelement <8 x float> %68, float %61, i64 1, !dbg !22
+  %70 = insertelement <8 x float> %69, float %62, i64 2, !dbg !22
+  %71 = insertelement <8 x float> %70, float %63, i64 3, !dbg !22
+  %72 = insertelement <8 x float> %71, float %64, i64 4, !dbg !22
+  %73 = insertelement <8 x float> %72, float %65, i64 5, !dbg !22
+  %74 = insertelement <8 x float> %73, float %66, i64 6, !dbg !22
+  %75 = insertelement <8 x float> %74, float %67, i64 7, !dbg !22
+  %76 = fcmp ule <8 x float> %29, %75, !dbg !22
+  %77 = and <8 x i1> %31, %76, !dbg !23
+  %78 = and <8 x i1> %35, %77, !dbg !24
+  %79 = select <8 x i1> %78, <8 x float> %75, <8 x float> %29, !dbg !24
+  %80 = add nuw nsw i32 %28, 2048, !dbg !12
+  %81 = icmp ult i32 %28, 48209, !dbg !12
+  br i1 %81, label %27, label %82, !dbg !12
+82:                                               ; preds = %27
+  %83 = and i32 %6, 31, !dbg !8
+  %84 = and i32 %7, 7, !dbg !8
+  %85 = extractelement <8 x float> %79, i64 0, !dbg !25
+  %86 = extractelement <8 x float> %79, i64 1, !dbg !25
+  %87 = fcmp ogt float %85, %86, !dbg !25
+  %88 = fcmp uno float %85, 0.000000e+00, !dbg !29
+  %89 = or i1 %87, %88, !dbg !30
+  %90 = select i1 %89, float %85, float %86, !dbg !31
+  %91 = extractelement <8 x float> %79, i64 2, !dbg !25
+  %92 = fcmp ogt float %90, %91, !dbg !25
+  %93 = fcmp uno float %90, 0.000000e+00, !dbg !29
+  %94 = or i1 %92, %93, !dbg !30
+  %95 = select i1 %94, float %90, float %91, !dbg !31
+  %96 = extractelement <8 x float> %79, i64 3, !dbg !25
+  %97 = fcmp ogt float %95, %96, !dbg !25
+  %98 = fcmp uno float %95, 0.000000e+00, !dbg !29
+  %99 = or i1 %97, %98, !dbg !30
+  %100 = select i1 %99, float %95, float %96, !dbg !31
+  %101 = extractelement <8 x float> %79, i64 4, !dbg !25
+  %102 = fcmp ogt float %100, %101, !dbg !25
+  %103 = fcmp uno float %100, 0.000000e+00, !dbg !29
+  %104 = or i1 %102, %103, !dbg !30
+  %105 = select i1 %104, float %100, float %101, !dbg !31
+  %106 = extractelement <8 x float> %79, i64 5, !dbg !25
+  %107 = fcmp ogt float %105, %106, !dbg !25
+  %108 = fcmp uno float %105, 0.000000e+00, !dbg !29
+  %109 = or i1 %107, %108, !dbg !30
+  %110 = select i1 %109, float %105, float %106, !dbg !31
+  %111 = extractelement <8 x float> %79, i64 6, !dbg !25
+  %112 = fcmp ogt float %110, %111, !dbg !25
+  %113 = fcmp uno float %110, 0.000000e+00, !dbg !29
+  %114 = or i1 %112, %113, !dbg !30
+  %115 = select i1 %114, float %110, float %111, !dbg !31
+  %116 = extractelement <8 x float> %79, i64 7, !dbg !25
+  %117 = fcmp ogt float %115, %116, !dbg !25
+  %118 = fcmp uno float %115, 0.000000e+00, !dbg !29
+  %119 = or i1 %117, %118, !dbg !30
+  %120 = select i1 %119, float %115, float %116, !dbg !31
+  %121 = bitcast float %120 to i32, !dbg !32
+  %122 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %121, i32 16, i32 31), !dbg !32
+  %123 = bitcast i32 %122 to float, !dbg !32
+  %124 = fcmp ogt float %120, %123, !dbg !25
+  %125 = fcmp uno float %120, 0.000000e+00, !dbg !29
+  %126 = or i1 %125, %124, !dbg !30
+  %127 = select i1 %126, float %120, float %123, !dbg !31
+  %128 = bitcast float %127 to i32, !dbg !32
+  %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 8, i32 31), !dbg !32
+  %130 = bitcast i32 %129 to float, !dbg !32
+  %131 = fcmp ogt float %127, %130, !dbg !25
+  %132 = fcmp uno float %127, 0.000000e+00, !dbg !29
+  %133 = or i1 %131, %132, !dbg !30
+  %134 = select i1 %133, float %127, float %130, !dbg !31
+  %135 = bitcast float %134 to i32, !dbg !32
+  %136 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %135, i32 4, i32 31), !dbg !32
+  %137 = bitcast i32 %136 to float, !dbg !32
+  %138 = fcmp ogt float %134, %137, !dbg !25
+  %139 = fcmp uno float %134, 0.000000e+00, !dbg !29
+  %140 = or i1 %138, %139, !dbg !30
+  %141 = select i1 %140, float %134, float %137, !dbg !31
+  %142 = bitcast float %141 to i32, !dbg !32
+  %143 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %142, i32 2, i32 31), !dbg !32
+  %144 = bitcast i32 %143 to float, !dbg !32
+  %145 = fcmp ogt float %141, %144, !dbg !25
+  %146 = fcmp uno float %141, 0.000000e+00, !dbg !29
+  %147 = or i1 %145, %146, !dbg !30
+  %148 = select i1 %147, float %141, float %144, !dbg !31
+  %149 = bitcast float %148 to i32, !dbg !32
+  %150 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %149, i32 1, i32 31), !dbg !32
+  %151 = bitcast i32 %150 to float, !dbg !32
+  %152 = fcmp ogt float %148, %151, !dbg !25
+  %153 = fcmp uno float %148, 0.000000e+00, !dbg !29
+  %154 = or i1 %152, %153, !dbg !30
+  %155 = select i1 %154, float %148, float %151, !dbg !31
+  %156 = icmp eq i32 %83, 0, !dbg !32
+  %157 = zext nneg i32 %84 to i64, !dbg !32
+  %158 = getelementptr float, ptr addrspace(3) @global_smem, i64 %157, !dbg !32
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %158, float %155, i1 %156) #3, !dbg !32
+  tail call void @llvm.nvvm.barrier0(), !dbg !32
+  %159 = icmp slt i32 %6, 8, !dbg !32
+  %160 = sext i32 %6 to i64, !dbg !32
+  %161 = getelementptr float, ptr addrspace(3) @global_smem, i64 %160, !dbg !32
+  %162 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %161, i1 %159) #3, !dbg !32
+  %163 = bitcast float %162 to i32, !dbg !32
+  %164 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %163, i32 4, i32 31), !dbg !32
+  %165 = bitcast i32 %164 to float, !dbg !32
+  %166 = fcmp ogt float %162, %165, !dbg !25
+  %167 = fcmp uno float %162, 0.000000e+00, !dbg !29
+  %168 = or i1 %167, %166, !dbg !30
+  %169 = select i1 %168, float %162, float %165, !dbg !31
+  %170 = bitcast float %169 to i32, !dbg !32
+  %171 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %170, i32 2, i32 31), !dbg !32
+  %172 = bitcast i32 %171 to float, !dbg !32
+  %173 = fcmp ogt float %169, %172, !dbg !25
+  %174 = fcmp uno float %169, 0.000000e+00, !dbg !29
+  %175 = or i1 %173, %174, !dbg !30
+  %176 = select i1 %175, float %169, float %172, !dbg !31
+  %177 = bitcast float %176 to i32, !dbg !32
+  %178 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %177, i32 1, i32 31), !dbg !32
+  %179 = bitcast i32 %178 to float, !dbg !32
+  %180 = fcmp ogt float %176, %179, !dbg !25
+  %181 = fcmp uno float %176, 0.000000e+00, !dbg !29
+  %182 = or i1 %180, %181, !dbg !30
+  %183 = select i1 %182, float %176, float %179, !dbg !31
+  %184 = and i32 %6, 7, !dbg !32
+  %185 = icmp eq i32 %184, 0, !dbg !32
+  %186 = and i1 %159, %185, !dbg !32
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %161, float %183, i1 %186) #3, !dbg !32
+  tail call void @llvm.nvvm.barrier0(), !dbg !32
+  %187 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
+  tail call void @llvm.nvvm.barrier0(), !dbg !34
+  %188 = insertelement <1 x float> undef, float %187, i64 0, !dbg !34
+  store <1 x float> %188, ptr addrspace(3) @global_smem, align 4, !dbg !34
+  tail call void @llvm.nvvm.barrier0(), !dbg !34
+  %189 = load i32, ptr addrspace(3) @global_smem, align 4, !dbg !34
+  %190 = getelementptr float, ptr addrspace(1) %1, i64 %16, !dbg !35
+  %191 = icmp eq i32 %urem, 0, !dbg !36
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %189, ptr addrspace(1) %190, i1 %191) #3, !dbg !36
+  br label %192, !dbg !37
+192:                                              ; preds = %82, %192
+  %193 = phi i32 [ 0, %82 ], [ %266, %192 ]
+  %194 = phi <8 x float> [ zeroinitializer, %82 ], [ %265, %192 ]
+  %195 = zext nneg i32 %193 to i64, !dbg !38
+  %196 = insertelement <8 x i64> poison, i64 %195, i64 0, !dbg !38
+  %197 = shufflevector <8 x i64> %196, <8 x i64> poison, <8 x i32> zeroinitializer, !dbg !38
+  %198 = or <8 x i64> %197, %25, !dbg !38
+  %199 = icmp ult <8 x i64> %198, <i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !39
+  %200 = extractelement <8 x i64> %198, i64 0, !dbg !40
+  %gep34 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %200, !dbg !40
+  %201 = extractelement <8 x i64> %198, i64 1, !dbg !40
+  %gep36 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %201, !dbg !40
+  %202 = extractelement <8 x i64> %198, i64 2, !dbg !40
+  %gep38 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %202, !dbg !40
+  %203 = extractelement <8 x i64> %198, i64 3, !dbg !40
+  %gep40 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %203, !dbg !40
+  %204 = extractelement <8 x i64> %198, i64 4, !dbg !40
+  %gep42 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %204, !dbg !40
+  %205 = extractelement <8 x i64> %198, i64 5, !dbg !40
+  %gep44 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %205, !dbg !40
+  %206 = extractelement <8 x i64> %198, i64 6, !dbg !40
+  %gep46 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %206, !dbg !40
+  %207 = extractelement <8 x i64> %198, i64 7, !dbg !40
+  %gep48 = getelementptr i16, ptr addrspace(1) %invariant.gep, i64 %207, !dbg !40
+  %208 = extractelement <8 x i1> %199, i64 0, !dbg !41
+  %209 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep34, i1 %208, i16 0, i1 %208) #3, !dbg !41
+  %210 = extractelement <8 x i1> %199, i64 1, !dbg !41
+  %211 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep36, i1 %210, i16 0, i1 %210) #3, !dbg !41
+  %212 = extractelement <8 x i1> %199, i64 2, !dbg !41
+  %213 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep38, i1 %212, i16 0, i1 %212) #3, !dbg !41
+  %214 = extractelement <8 x i1> %199, i64 3, !dbg !41
+  %215 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep40, i1 %214, i16 0, i1 %214) #3, !dbg !41
+  %216 = extractelement <8 x i1> %199, i64 4, !dbg !41
+  %217 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep42, i1 %216, i16 0, i1 %216) #3, !dbg !41
+  %218 = extractelement <8 x i1> %199, i64 5, !dbg !41
+  %219 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep44, i1 %218, i16 0, i1 %218) #3, !dbg !41
+  %220 = extractelement <8 x i1> %199, i64 6, !dbg !41
+  %221 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep46, i1 %220, i16 0, i1 %220) #3, !dbg !41
+  %222 = extractelement <8 x i1> %199, i64 7, !dbg !41
+  %223 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %gep48, i1 %222, i16 0, i1 %222) #3, !dbg !41
+  %224 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %209) #3, !dbg !42
+  %225 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %211) #3, !dbg !42
+  %226 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %213) #3, !dbg !42
+  %227 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %215) #3, !dbg !42
+  %228 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %217) #3, !dbg !42
+  %229 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %219) #3, !dbg !42
+  %230 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %221) #3, !dbg !42
+  %231 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %223) #3, !dbg !42
+  %232 = fsub float %224, %187, !dbg !43
+  %233 = fsub float %225, %187, !dbg !43
+  %234 = fsub float %226, %187, !dbg !43
+  %235 = fsub float %227, %187, !dbg !43
+  %236 = fsub float %228, %187, !dbg !43
+  %237 = fsub float %229, %187, !dbg !43
+  %238 = fsub float %230, %187, !dbg !43
+  %239 = fsub float %231, %187, !dbg !43
+  %240 = fmul float %232, 0x3FF7154760000000, !dbg !44
+  %241 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %240) #3, !dbg !44
+  %242 = fmul float %233, 0x3FF7154760000000, !dbg !44
+  %243 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %242) #3, !dbg !44
+  %244 = fmul float %234, 0x3FF7154760000000, !dbg !44
+  %245 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %244) #3, !dbg !44
+  %246 = fmul float %235, 0x3FF7154760000000, !dbg !44
+  %247 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %246) #3, !dbg !44
+  %248 = fmul float %236, 0x3FF7154760000000, !dbg !44
+  %249 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %248) #3, !dbg !44
+  %250 = fmul float %237, 0x3FF7154760000000, !dbg !44
+  %251 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %250) #3, !dbg !44
+  %252 = fmul float %238, 0x3FF7154760000000, !dbg !44
+  %253 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %252) #3, !dbg !44
+  %254 = fmul float %239, 0x3FF7154760000000, !dbg !44
+  %255 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %254) #3, !dbg !44
+  %256 = insertelement <8 x float> poison, float %241, i64 0, !dbg !45
+  %257 = insertelement <8 x float> %256, float %243, i64 1, !dbg !45
+  %258 = insertelement <8 x float> %257, float %245, i64 2, !dbg !45
+  %259 = insertelement <8 x float> %258, float %247, i64 3, !dbg !45
+  %260 = insertelement <8 x float> %259, float %249, i64 4, !dbg !45
+  %261 = insertelement <8 x float> %260, float %251, i64 5, !dbg !45
+  %262 = insertelement <8 x float> %261, float %253, i64 6, !dbg !45
+  %263 = insertelement <8 x float> %262, float %255, i64 7, !dbg !45
+  %264 = select <8 x i1> %199, <8 x float> %263, <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !45
+  %265 = fadd <8 x float> %194, %264, !dbg !45
+  %266 = add nuw nsw i32 %193, 2048, !dbg !37
+  %267 = icmp ult i32 %193, 48209, !dbg !37
+  br i1 %267, label %192, label %268, !dbg !37
+268:                                              ; preds = %192
+  tail call void @llvm.nvvm.barrier0(), !dbg !46
+  %shift = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
+  %269 = fadd <8 x float> %265, %shift, !dbg !50
+  %shift94 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
+  %270 = fadd <8 x float> %shift94, %269, !dbg !50
+  %shift95 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
+  %271 = fadd <8 x float> %shift95, %270, !dbg !50
+  %shift96 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
+  %272 = fadd <8 x float> %shift96, %271, !dbg !50
+  %shift97 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
+  %273 = fadd <8 x float> %shift97, %272, !dbg !50
+  %shift98 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
+  %274 = fadd <8 x float> %shift98, %273, !dbg !50
+  %shift99 = shufflevector <8 x float> %265, <8 x float> poison, <8 x i32> <i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !50
+  %275 = fadd <8 x float> %shift99, %274, !dbg !50
+  %276 = extractelement <8 x float> %275, i64 0, !dbg !50
+  %277 = bitcast float %276 to i32, !dbg !46
+  %278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 16, i32 31), !dbg !46
+  %279 = bitcast i32 %278 to float, !dbg !46
+  %280 = fadd float %276, %279, !dbg !50
+  %281 = bitcast float %280 to i32, !dbg !46
+  %282 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %281, i32 8, i32 31), !dbg !46
+  %283 = bitcast i32 %282 to float, !dbg !46
+  %284 = fadd float %280, %283, !dbg !50
+  %285 = bitcast float %284 to i32, !dbg !46
+  %286 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %285, i32 4, i32 31), !dbg !46
+  %287 = bitcast i32 %286 to float, !dbg !46
+  %288 = fadd float %284, %287, !dbg !50
+  %289 = bitcast float %288 to i32, !dbg !46
+  %290 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %289, i32 2, i32 31), !dbg !46
+  %291 = bitcast i32 %290 to float, !dbg !46
+  %292 = fadd float %288, %291, !dbg !50
+  %293 = bitcast float %292 to i32, !dbg !46
+  %294 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %293, i32 1, i32 31), !dbg !46
+  %295 = bitcast i32 %294 to float, !dbg !46
+  %296 = fadd float %292, %295, !dbg !50
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %158, float %296, i1 %156) #3, !dbg !46
+  tail call void @llvm.nvvm.barrier0(), !dbg !46
+  %297 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %161, i1 %159) #3, !dbg !46
+  %298 = bitcast float %297 to i32, !dbg !46
+  %299 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 4, i32 31), !dbg !46
+  %300 = bitcast i32 %299 to float, !dbg !46
+  %301 = fadd float %297, %300, !dbg !50
+  %302 = bitcast float %301 to i32, !dbg !46
+  %303 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %302, i32 2, i32 31), !dbg !46
+  %304 = bitcast i32 %303 to float, !dbg !46
+  %305 = fadd float %301, %304, !dbg !50
+  %306 = bitcast float %305 to i32, !dbg !46
+  %307 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %306, i32 1, i32 31), !dbg !46
+  %308 = bitcast i32 %307 to float, !dbg !46
+  %309 = fadd float %305, %308, !dbg !50
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %161, float %309, i1 %186) #3, !dbg !46
+  tail call void @llvm.nvvm.barrier0(), !dbg !46
+  %310 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !46
+  tail call void @llvm.nvvm.barrier0(), !dbg !54
+  %311 = insertelement <1 x float> undef, float %310, i64 0, !dbg !54
+  store <1 x float> %311, ptr addrspace(3) @global_smem, align 4, !dbg !54
+  tail call void @llvm.nvvm.barrier0(), !dbg !54
+  %312 = load i32, ptr addrspace(3) @global_smem, align 4, !dbg !54
+  %313 = getelementptr float, ptr addrspace(1) %2, i64 %16, !dbg !55
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %312, ptr addrspace(1) %313, i1 %191) #3, !dbg !56
+  ret void, !dbg !57
+}
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { nounwind }
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "ccyhhqogjmaiuaq7b54att75rswph7r3hvxgfmkjyupj74n77r6i.py", directory: "/tmp/torchinductor_root/cy")
+!3 = !{ptr @triton__0d1d2d3de4, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2d3de4, !"maxntidx", i32 256}
+!5 = distinct !DISubprogram(name: "triton__0d1d2d3de4", linkageName: "triton__0d1d2d3de4", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 24, column: 33, scope: !5)
+!9 = !DILocation(line: 21, column: 28, scope: !5)
+!10 = !DILocation(line: 21, column: 34, scope: !5)
+!11 = !DILocation(line: 31, column: 46, scope: !5)
+!12 = !DILocation(line: 27, column: 36, scope: !5)
+!13 = !DILocation(line: 28, column: 27, scope: !5)
+!14 = !DILocation(line: 38, column: 21, scope: !15, inlinedAt: !17)
+!15 = distinct !DILexicalBlockFile(scope: !5, file: !16, discriminator: 0)
+!16 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!17 = !DILocation(line: 34, column: 45, scope: !15)
+!18 = !DILocation(line: 29, column: 25, scope: !5)
+!19 = !DILocation(line: 31, column: 34, scope: !5)
+!20 = !DILocation(line: 31, column: 52, scope: !5)
+!21 = !DILocation(line: 31, column: 103, scope: !5)
+!22 = !DILocation(line: 36, column: 15, scope: !15, inlinedAt: !17)
+!23 = !DILocation(line: 38, column: 16, scope: !15, inlinedAt: !17)
+!24 = !DILocation(line: 0, scope: !5)
+!25 = !DILocation(line: 36, column: 15, scope: !26, inlinedAt: !27)
+!26 = distinct !DILexicalBlockFile(scope: !15, file: !16, discriminator: 0)
+!27 = !DILocation(line: 49, column: 29, scope: !26, inlinedAt: !28)
+!28 = !DILocation(line: 36, column: 38, scope: !26)
+!29 = !DILocation(line: 38, column: 21, scope: !26, inlinedAt: !27)
+!30 = !DILocation(line: 38, column: 16, scope: !26, inlinedAt: !27)
+!31 = !DILocation(line: 39, column: 29, scope: !26, inlinedAt: !27)
+!32 = !DILocation(line: 49, column: 29, scope: !15, inlinedAt: !33)
+!33 = !DILocation(line: 36, column: 38, scope: !15)
+!34 = !DILocation(line: 36, column: 41, scope: !5)
+!35 = !DILocation(line: 37, column: 25, scope: !5)
+!36 = !DILocation(line: 37, column: 36, scope: !5)
+!37 = !DILocation(line: 39, column: 36, scope: !5)
+!38 = !DILocation(line: 40, column: 27, scope: !5)
+!39 = !DILocation(line: 41, column: 25, scope: !5)
+!40 = !DILocation(line: 43, column: 34, scope: !5)
+!41 = !DILocation(line: 43, column: 52, scope: !5)
+!42 = !DILocation(line: 43, column: 104, scope: !5)
+!43 = !DILocation(line: 45, column: 22, scope: !5)
+!44 = !DILocation(line: 46, column: 22, scope: !5)
+!45 = !DILocation(line: 49, column: 40, scope: !5)
+!46 = !DILocation(line: 243, column: 36, scope: !47, inlinedAt: !49)
+!47 = distinct !DILexicalBlockFile(scope: !5, file: !48, discriminator: 0)
+!48 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!49 = !DILocation(line: 50, column: 27, scope: !47)
+!50 = !DILocation(line: 233, column: 15, scope: !51, inlinedAt: !52)
+!51 = distinct !DILexicalBlockFile(scope: !47, file: !48, discriminator: 0)
+!52 = !DILocation(line: 243, column: 36, scope: !51, inlinedAt: !53)
+!53 = !DILocation(line: 50, column: 27, scope: !51)
+!54 = !DILocation(line: 50, column: 30, scope: !5)
+!55 = !DILocation(line: 51, column: 25, scope: !5)
+!56 = !DILocation(line: 51, column: 37, scope: !5)
+!57 = !DILocation(line: 51, column: 4, scope: !5)

.triton/dump/fac03406d1136fc802dac111a1efea36/triton_.ttgir ADDED Viewed

	@@ -0,0 +1,81 @@

+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3de4(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i64) attributes {noinline = false} {
+    %c50257_i64 = arith.constant 50257 : i64
+    %cst = arith.constant dense<0.000000e+00> : tensor<1x2048xbf16, #blocked>
+    %cst_0 = arith.constant dense<true> : tensor<1x2048xi1, #blocked>
+    %c50257_i32 = arith.constant 50257 : i32
+    %c2048_i32 = arith.constant 2048 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst_1 = arith.constant dense<50257> : tensor<1x2048xi64, #blocked>
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<1x2048xf32, #blocked>
+    %cst_3 = arith.constant dense<0xFF800000> : tensor<1x2048xf32, #blocked>
+    %0 = tt.get_program_id x : i32
+    %1 = arith.extsi %0 : i32 to i64
+    %2 = tt.make_range {end = 2048 : i32, start = 0 : i32} : tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+    %3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<2048xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x2048xi32, #blocked>
+    %4 = arith.extsi %3 : tensor<1x2048xi32, #blocked> to tensor<1x2048xi64, #blocked>
+    %5 = arith.muli %1, %c50257_i64 : i64
+    %6 = tt.splat %5 : (i64) -> tensor<1x2048xi64, #blocked>
+    %7 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>
+    %8 = scf.for %arg5 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg6 = %cst_3) -> (tensor<1x2048xf32, #blocked>)  : i32 {
+      %22 = arith.extsi %arg5 : i32 to i64
+      %23 = tt.splat %22 : (i64) -> tensor<1x2048xi64, #blocked>
+      %24 = arith.addi %23, %4 : tensor<1x2048xi64, #blocked>
+      %25 = arith.cmpi slt, %24, %cst_1 : tensor<1x2048xi64, #blocked>
+      %26 = arith.addi %24, %6 : tensor<1x2048xi64, #blocked>
+      %27 = tt.addptr %7, %26 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
+      %28 = tt.load %27, %25, %cst {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1x2048xbf16, #blocked>
+      %29 = arith.extf %28 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked>
+      %30 = arith.cmpf ogt, %arg6, %29 : tensor<1x2048xf32, #blocked>
+      %31 = arith.cmpf une, %arg6, %arg6 : tensor<1x2048xf32, #blocked>
+      %32 = arith.ori %30, %31 : tensor<1x2048xi1, #blocked>
+      %33 = arith.xori %32, %cst_0 : tensor<1x2048xi1, #blocked>
+      %34 = arith.andi %25, %33 : tensor<1x2048xi1, #blocked>
+      %35 = arith.select %34, %29, %arg6 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked>
+      scf.yield %35 : tensor<1x2048xf32, #blocked>
+    }
+    %9 = "tt.reduce"(%8) <{axis = 1 : i32}> ({
+    ^bb0(%arg5: f32, %arg6: f32):
+      %22 = arith.cmpf ogt, %arg5, %arg6 : f32
+      %23 = arith.cmpf une, %arg5, %arg5 : f32
+      %24 = arith.ori %22, %23 : i1
+      %25 = arith.select %24, %arg5, %arg6 : f32
+      tt.reduce.return %25 : f32
+    }) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %10 = triton_gpu.convert_layout %9 : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+    %11 = tt.expand_dims %10 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<1x1xf32, #blocked1>
+    %12 = tt.expand_dims %9 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked>
+    %13 = tt.addptr %arg1, %1 : !tt.ptr<f32, 1>, i64
+    %14 = tt.splat %13 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>, #blocked1>
+    tt.store %14, %11 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32, #blocked1>
+    %15 = tt.broadcast %12 : (tensor<1x1xf32, #blocked>) -> tensor<1x2048xf32, #blocked>
+    %16 = scf.for %arg5 = %c0_i32 to %c50257_i32 step %c2048_i32 iter_args(%arg6 = %cst_2) -> (tensor<1x2048xf32, #blocked>)  : i32 {
+      %22 = arith.extsi %arg5 : i32 to i64
+      %23 = tt.splat %22 : (i64) -> tensor<1x2048xi64, #blocked>
+      %24 = arith.addi %23, %4 : tensor<1x2048xi64, #blocked>
+      %25 = arith.cmpi slt, %24, %cst_1 : tensor<1x2048xi64, #blocked>
+      %26 = arith.addi %24, %6 : tensor<1x2048xi64, #blocked>
+      %27 = tt.addptr %7, %26 : tensor<1x2048x!tt.ptr<bf16, 1>, #blocked>, tensor<1x2048xi64, #blocked>
+      %28 = tt.load %27, %25, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x2048xbf16, #blocked>
+      %29 = arith.extf %28 : tensor<1x2048xbf16, #blocked> to tensor<1x2048xf32, #blocked>
+      %30 = arith.subf %29, %15 : tensor<1x2048xf32, #blocked>
+      %31 = math.exp %30 : tensor<1x2048xf32, #blocked>
+      %32 = arith.addf %arg6, %31 : tensor<1x2048xf32, #blocked>
+      %33 = arith.select %25, %32, %arg6 : tensor<1x2048xi1, #blocked>, tensor<1x2048xf32, #blocked>
+      scf.yield %33 : tensor<1x2048xf32, #blocked>
+    }
+    %17 = "tt.reduce"(%16) <{axis = 1 : i32}> ({
+    ^bb0(%arg5: f32, %arg6: f32):
+      %22 = arith.addf %arg5, %arg6 : f32
+      tt.reduce.return %22 : f32
+    }) : (tensor<1x2048xf32, #blocked>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %18 = triton_gpu.convert_layout %17 : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+    %19 = tt.expand_dims %18 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<1x1xf32, #blocked1>
+    %20 = tt.addptr %arg2, %1 : !tt.ptr<f32, 1>, i64
+    %21 = tt.splat %20 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>, #blocked1>
+    tt.store %21, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32, #blocked1>
+    tt.return
+  }
+}

wandb/run-20240926_055222-14kj2390/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,60 @@

+{
+  "os": "Linux-5.15.0-113-generic-x86_64-with-glibc2.35",
+  "python": "3.10.12",
+  "startedAt": "2024-09-26T05:52:22.950984Z",
+  "args": [
+    "--batch_size=120"
+  ],
+  "program": "/root/train.py",
+  "codePath": "train.py",
+  "email": "prasadchandalada@gmail.com",
+  "root": "/root",
+  "host": "184d1c0992ce",
+  "username": "root",
+  "executable": "/usr/bin/python",
+  "codePathLocal": "train.py",
+  "cpu_count": 64,
+  "cpu_count_logical": 128,
+  "gpu": "[NVIDIA L40S, NVIDIA L40S, NVIDIA L40S, NVIDIA L40S]",
+  "gpu_count": 4,
+  "disk": {
+    "/": {
+      "total": "542239621120",
+      "used": "400404738048"
+    }
+  },
+  "memory": {
+    "total": "811327934464"
+  },
+  "cpu": {
+    "count": 64,
+    "countLogical": 128
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA L40S",
+      "memoryTotal": "48305799168",
+      "cudaCores": 18176,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L40S",
+      "memoryTotal": "48305799168",
+      "cudaCores": 18176,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L40S",
+      "memoryTotal": "48305799168",
+      "cudaCores": 18176,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L40S",
+      "memoryTotal": "48305799168",
+      "cudaCores": 18176,
+      "architecture": "Ada"
+    }
+  ],
+  "cudaVersion": "12.2"
+}

wandb/run-20240926_055222-14kj2390/logs/debug-core.log ADDED Viewed

	@@ -0,0 +1,14 @@

+{"time":"2024-09-26T05:52:22.315197362Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp0irn9n95/port-986.txt","pid":986,"debug":false,"disable-analytics":false}
+{"time":"2024-09-26T05:52:22.315233622Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
+{"time":"2024-09-26T05:52:22.316073319Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":986}
+{"time":"2024-09-26T05:52:22.316064076Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":41193,"Zone":""}}
+{"time":"2024-09-26T05:52:22.505125081Z","level":"INFO","msg":"created new connection","id":"127.0.0.1:42070"}
+{"time":"2024-09-26T05:52:22.951875072Z","level":"INFO","msg":"connection init received","streamId":"14kj2390","id":"127.0.0.1:42070"}
+{"time":"2024-09-26T05:52:22.952304043Z","level":"ERROR","msg":"error creating symlink","error":"symlink /root/.cache/wandb/logs/core-debug-20240926_055222.log /root/wandb/run-20240926_055222-14kj2390/logs/debug-core.log: file exists"}
+{"time":"2024-09-26T05:52:22.955991404Z","level":"INFO","msg":"connection init completed","streamId":"14kj2390","id":"127.0.0.1:42070"}
+{"time":"2024-09-26T12:39:40.244212691Z","level":"INFO","msg":"handle finish received","streamId":"14kj2390","id":"127.0.0.1:42070"}
+{"time":"2024-09-26T12:39:41.460220703Z","level":"INFO","msg":"connection: teardown","id":"127.0.0.1:42070"}
+{"time":"2024-09-26T12:39:41.460275234Z","level":"INFO","msg":"server is shutting down"}
+{"time":"2024-09-26T12:39:41.460350917Z","level":"INFO","msg":"closed connection","id":"127.0.0.1:42070"}
+{"time":"2024-09-26T12:39:41.460369816Z","level":"INFO","msg":"connection closed","id":"127.0.0.1:42070"}
+{"time":"2024-09-26T12:39:41.460376796Z","level":"INFO","msg":"server is closed"}