diff --git a/.local/share/Trash/info/train_002.bin.trashinfo b/.local/share/Trash/info/train_002.bin.trashinfo
new file mode 100644
index 0000000000000000000000000000000000000000..064ac10de9f571b7925072459ae27261a4f1569c
--- /dev/null
+++ b/.local/share/Trash/info/train_002.bin.trashinfo
@@ -0,0 +1,3 @@
+[Trash Info]
+Path=/root/data/fineweb/train_002.bin
+DeletionDate=2024-09-26T05:50:34
diff --git a/.local/share/Trash/info/train_004.bin.trashinfo b/.local/share/Trash/info/train_004.bin.trashinfo
new file mode 100644
index 0000000000000000000000000000000000000000..943c39fcd1c67768072e1aee2513ed61ad97c4c5
--- /dev/null
+++ b/.local/share/Trash/info/train_004.bin.trashinfo
@@ -0,0 +1,3 @@
+[Trash Info]
+Path=/root/data/fineweb/train_004.bin
+DeletionDate=2024-09-26T05:50:34
diff --git a/.local/share/jupyter/nbextensions/python-markdown/python-markdown-post.png b/.local/share/jupyter/nbextensions/python-markdown/python-markdown-post.png
new file mode 100644
index 0000000000000000000000000000000000000000..3c95198564cf58b036ce168c10fda3d7f5aba1ae
Binary files /dev/null and b/.local/share/jupyter/nbextensions/python-markdown/python-markdown-post.png differ
diff --git a/.local/share/jupyter/nbextensions/ruler/icon.png b/.local/share/jupyter/nbextensions/ruler/icon.png
new file mode 100644
index 0000000000000000000000000000000000000000..c6a8c58449d878562301c43864b75ebec90e9a3c
Binary files /dev/null and b/.local/share/jupyter/nbextensions/ruler/icon.png differ
diff --git a/.local/share/jupyter/nbextensions/ruler/main.js b/.local/share/jupyter/nbextensions/ruler/main.js
new file mode 100644
index 0000000000000000000000000000000000000000..c2d6e68649a709adb5b4e21562e356618363df49
--- /dev/null
+++ b/.local/share/jupyter/nbextensions/ruler/main.js
@@ -0,0 +1,124 @@
+// Add rulers to codecells
+define([
+    'base/js/namespace',
+    'base/js/events',
+    'services/config',
+    'notebook/js/codecell',
+    'codemirror/lib/codemirror',
+    'codemirror/addon/display/rulers'
+], function (Jupyter, events, configmod, codecell, codemirror) {
+    "use strict";
+
+    var log_prefix = '[ruler]';
+
+    // define default config parameter values
+    var params = {
+        ruler_column: [78], // columns at which a ruler is drawn
+        ruler_color: ["#ff0000"], // cycled over the ruler columns
+        ruler_linestyle: ["dashed"], // cycled over the ruler columns
+        ruler_do_css_patch: false // not read anywhere in the code visible in this file
+    };
+
+
+    var rulers = [];
+
+    var isNumber = function (n) {
+        return !isNaN(parseFloat(n)) && isFinite(n);
+    };
+
+    // updates default params with any specified in the provided config data
+    var update_params = function (config_data) {
+        for (var key in params) {
+            if (config_data.hasOwnProperty(key)) {
+                params[key] = config_data[key];
+            }
+        }
+    };
+
+    var on_config_loaded = function () {
+
+        if (Jupyter.notebook !== undefined) {
+            var i, config = Jupyter.notebook.config;
+        } else {
+            var i, config = Jupyter.editor.config;
+        }
+
+        if (config.data.hasOwnProperty('ruler_color') && config.data.ruler_color.length > 0) {
+            params.ruler_color = config.data.ruler_color;
+        }
+
+        if (config.data.hasOwnProperty('ruler_column')) {
+            var new_columns = [];
+            for (i in config.data.ruler_column) {
+                if (isNumber(config.data.ruler_column[i])) {
+                    new_columns.push(config.data.ruler_column[i]);
+                }
+            }
+            if (new_columns.length > 0) {
+                params.ruler_column = new_columns;
+            }
+        }
+
+        if (config.data.hasOwnProperty('ruler_linestyle') && config.data.ruler_linestyle.length > 0) {
+            params.ruler_linestyle = config.data.ruler_linestyle;
+        }
+
+        for (i in params.ruler_column) {
+            rulers.push({
+                color: params.ruler_color[i % params.ruler_color.length],
+                column: params.ruler_column[i],
+                lineStyle: params.ruler_linestyle[i % params.ruler_linestyle.length]
+            });
+        }
+        console.debug(log_prefix, 'ruler specs:', rulers);
+
+        if (Jupyter.notebook !== undefined) {
+            var i, config = Jupyter.notebook.config;
+
+            // Change default for new cells
+            codecell.CodeCell.options_default.cm_config.rulers = rulers;
+            // Apply to any already-existing cells
+            var cells = Jupyter.notebook.get_cells().forEach(function (cell) {
+                if (cell instanceof codecell.CodeCell) {
+                    cell.code_mirror.setOption('rulers', rulers);
+                }
+            });
+
+        }
+        else {
+            Jupyter.editor.codemirror.setOption('rulers', rulers);
+        }
+    };
+
+    var load_extension = function () {
+
+        // first, check which view we're in, in order to decide whether to load
+        var conf_sect;
+        if (Jupyter.notebook) {
+            // we're in notebook view
+            conf_sect = Jupyter.notebook.config;
+        }
+        else if (Jupyter.editor) {
+            // we're in file-editor view
+            conf_sect = Jupyter.editor.config;
+        }
+        else {
+            // we're some other view like dashboard, terminal, etc, so bail now
+            return;
+        }
+
+        conf_sect.loaded
+            .then(function () {
+                update_params(conf_sect.data);
+            })
+            .then(on_config_loaded)
+            .catch(function on_error(reason) {
+                console.warn(log_prefix, 'error:', reason);
+            });
+    };
+
+    var extension = {
+        load_ipython_extension: load_extension
+    };
+    return extension;
+});
diff --git a/.local/share/jupyter/nbextensions/runtools/readme.md b/.local/share/jupyter/nbextensions/runtools/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..424b87736efa1988e9b97bd2a8bc6ad1369a41fd
--- /dev/null
+++ b/.local/share/jupyter/nbextensions/runtools/readme.md
@@ -0,0 +1,119 @@
+Runtools
+========
+Runtools provide a number of additional functions for working with code cells in the IPython notebook:
+
+Code Cell Execution
+-------------------
+
+* Execute a single cell
+* Execute from top cell to currently selected cell
+* Execute from currently selected cell to bottom cell
+* Execute all cells
+* Execute all cells, ignore exceptions (requires [ipython/pull/6521](https://github.com/ipython/ipython/pull/6521))
+* Execute marked code cells (cells with green gutter area are marked)
+* Stop execution (duplicate to standard toolbar button)
+
+When executing marked cells, they are put in an execution list, and
+executed in order. The execution list can be modified by unmarking
+a cell not yet run. The execution list can be stopped by clicking on
+`stop execution`. Execution of the currently running cell can be stopped
+by pressing `stop execution` twice.
+
+Code Cell Marking
+-----------------
+
+* Mark one or more code cells
+
+
+Code Cell Display
+-----------------
+
+* Hide or show input (i.e. the source code) of marked code cells
+* Hide or show output of marked code cells
+
+
+Description
+-----------
+
+The *runtools* extension adds a button to turn on/off a floating toolbar:   
+![](icon.png)
+
+This adds Code execution buttons:   
+![](runtools_execute.png)
+
+Codecells can be marked by clicking on the gutter of a codecell or by clicking on the markers toolbar:   
+![](runtools_marker.png)
+
+Marked codecells can be locked to read-only mode and moved up and down:   
+![](runtools_lock.png)
+
+The input and output areas of marked codecells can be hidden:   
+![](runtools_show_hide.png)
+
+An IPython notebook with marked cells looks like this:
+![](runtools_nb.png)
+
+
+Demo
+----
+
+![](demo.gif)
+
+
+Internals
+---------
+
+New metadata elements added to each cell:
+* `cell.metadata.hide_input` - hide input field of the cell
+* `cell.metadata.hide_output` - hide output field of the cell
+* `cell.metadata.run_control.marked` - mark a codecell
+
+To export a notebook with hidden input/output fields, the custom template `hide_input_output.tpl` is required.
+It should have been installed in the `templates` folder.
+You can find the `templates` folder of `jupyter_contrib_nbextensions` from python using
+
+```python
+from jupyter_contrib_nbextensions.nbconvert_support import templates_directory
+print(templates_directory())
+```
+
+The template needs to be in a path where nbconvert can find it. This can be your local path or specified in 
+`jupyter_nbconvert_config` or `jupyter_notebook_config` as `c.Exporter.extra_template_paths`, see [Jupyter docs](https://jupyter-notebook.readthedocs.io/en/latest/config.html).
+
+For HTML export a template is provided as `nbextensions.tpl` in the `jupyter_contrib_nbextensions` templates directory. Alternatively you can create your own template:
+```
+{%- extends 'full.tpl' -%}
+
+{% block input_group -%}
+{%- if cell.metadata.hide_input -%}
+{%- else -%}
+{{ super() }}
+{%- endif -%}
+{% endblock input_group %}
+
+{% block output_group -%}
+{%- if cell.metadata.hide_output -%}
+{%- else -%}
+{{ super() }}
+{%- endif -%}
+{% endblock output_group %}
+```
+
+For LaTeX export a different template is required, which is included as `nbextensions.tplx` in the `jupyter_contrib_nbextensions` templates directory. Alternatively you can create your own template:
+```
+((*- extends 'report.tplx' -*))
+
+((* block input_group -*))
+((*- if cell.metadata.hide_input -*))
+((*- else -*))
+((( super() )))
+((*- endif -*))
+((* endblock input_group *))
+
+((* block output_group -*))
+((*- if cell.metadata.hide_output -*))
+((*- else -*))
+((( super() )))
+((*- endif -*))
+((* endblock output_group *))
+```
\ No newline at end of file
diff --git a/.local/share/jupyter/nbextensions/runtools/runtools_show_hide.png b/.local/share/jupyter/nbextensions/runtools/runtools_show_hide.png
new file mode 100644
index 0000000000000000000000000000000000000000..8ff9477aa0c17d40a79b896164ac7be718953c50
Binary files /dev/null and b/.local/share/jupyter/nbextensions/runtools/runtools_show_hide.png differ
diff --git a/.local/share/jupyter/nbextensions/scratchpad/scratchpad.yaml b/.local/share/jupyter/nbextensions/scratchpad/scratchpad.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7ccd35cfeff9ee6804496efe8f219945c328640f
--- /dev/null
+++ b/.local/share/jupyter/nbextensions/scratchpad/scratchpad.yaml
@@ -0,0 +1,6 @@
+Type: Jupyter Notebook Extension
+Name: Scratchpad
+Description: Adds a scratchpad cell to Jupyter notebook.
+Link: README.md
+Main: main.js
+Compatibility: 4.x, 5.x
diff --git a/.local/share/jupyter/nbextensions/select_keymap/README.md b/.local/share/jupyter/nbextensions/select_keymap/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0e58992c00abb7285a19cdd65e3d3ddd75add7a1
--- /dev/null
+++ b/.local/share/jupyter/nbextensions/select_keymap/README.md
@@ -0,0 +1,14 @@
+Select CodeMirror Keymap
+=======
+
+This extension lets you choose between the available CodeMirror keymaps: default, emacs, vim, and sublime.
+
+There's a chance that this might cause key conflicts, especially with other extensions.
+
+Most browsers consume some of the global keybindings like `Ctrl+n`. [The Menu Wizard add-on for Firefox](https://addons.mozilla.org/en-US/firefox/addon/s3menu-wizard/) allows you to disable some of the global key shortcuts, thus passing the keys through to CodeMirror.
+
+![Demo](select_keymap.png)
+
+Based on:
+ * [jupyter-emacskeys](https://github.com/rmcgibbo/jupyter-emacskeys)
+ * [notebook_input_mode](https://github.com/asford/notebook_input_mode)
diff --git a/.local/share/jupyter/nbextensions/skill/main.js b/.local/share/jupyter/nbextensions/skill/main.js
new file mode 100644
index 0000000000000000000000000000000000000000..93ff45950cdf69aa0add4ea582db2ca0adc7c281
--- /dev/null
+++ b/.local/share/jupyter/nbextensions/skill/main.js
@@ -0,0 +1,14 @@
+define(function() {
+	"use strict";
+	// jupyter nbextensions must export a load_ipython_extension function to
+	// avoid throwing an error. Also, loading the module should do nothing
+	// unless the function is called, so we wrap requiring the codemirror mode
+	// in the load call.
+	return {
+		load_ipython_extension: function () {
+			requirejs(['./skill'], function () {
+				console.log('[SKILL Syntax] loaded');
+			});
+		}
+	};
+});
\ No newline at end of file
diff --git a/.local/share/jupyter/nbextensions/skill/skill.yaml b/.local/share/jupyter/nbextensions/skill/skill.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7e7054d498a3055f8990a30768fd30de90f9b122
--- /dev/null
+++ b/.local/share/jupyter/nbextensions/skill/skill.yaml
@@ -0,0 +1,6 @@
+Type: IPython Notebook Extension
+Name: SKILL Syntax
+Description: Enable SKILL syntax support for CodeMirror
+Link: README.md
+Main: main.js
+Compatibility: 4.x, 5.x
diff --git a/.local/share/jupyter/nbextensions/skip-traceback/traceback.png b/.local/share/jupyter/nbextensions/skip-traceback/traceback.png
new file mode 100644
index 0000000000000000000000000000000000000000..5acf410b6742d5ed146959a4752a3cfa1a4af5be
Binary files /dev/null and b/.local/share/jupyter/nbextensions/skip-traceback/traceback.png differ
diff --git a/.local/share/jupyter/nbextensions/splitcell/splitcell.js b/.local/share/jupyter/nbextensions/splitcell/splitcell.js
new file mode 100644
index 0000000000000000000000000000000000000000..0c6522c2ac824ca14d8c6e518e3cbe58724de660
--- /dev/null
+++ b/.local/share/jupyter/nbextensions/splitcell/splitcell.js
@@ -0,0 +1,101 @@
+// Allow for split cells in jupyter notebooks
+
+define([
+	'base/js/namespace',
+		'base/js/events'
+], function (
+	Jupyter,
+	events
+) {
+	"use strict";
+
+	//define default config parameter values
+	var params = {
+		toggle_cell_style_keybinding : 'shift-s' // command-mode shortcut that setup binds to the toggle action
+	};
+
+	//updates default params with any specified in the server's config
+	var update_params = function(){
+		var config = Jupyter.notebook.config;
+		for (var key in params){
+			if (config.data.hasOwnProperty(key)){
+				params[key] = config.data[key];
+			}
+		}
+	};
+
+	 var setup = function (){
+		// update defaults
+		update_params();
+
+		//register actions with ActionHandler instance
+		var prefix = 'auto';
+		var name = 'toggle-cell-style';
+		var action = {
+			icon : 'fa-arrows-h',
+			help : 'Toggle split/centered cell style',
+			help_index : 'eb',
+			id : 'split_cells',
+			handler : toggle_cell_style
+		};
+
+		var action_full_name = Jupyter.keyboard_manager.actions.register(action, name, prefix);
+
+		//define keyboard shortucts
+		var command_mode_shortcuts = {};
+		command_mode_shortcuts[params.toggle_cell_style_keybinding] =  action_full_name;
+
+		//register keyboard shortucts with keyboard_manager
+		Jupyter.notebook.keyboard_manager.command_shortcuts.add_shortcuts(command_mode_shortcuts);
+		Jupyter.toolbar.add_buttons_group([action_full_name]);
+	};
+
+
+	var toggle_cell_style = function(){
+		var cell = Jupyter.notebook.get_selected_cell();
+		if (!("cell_style" in cell.metadata)){cell.metadata.cell_style = 'split';}
+		else if (cell.metadata.cell_style == 'center'){cell.metadata.cell_style = 'split';}
+		else {cell.metadata.cell_style = 'center';}
+
+		update_cell_style_element(cell);
+	};
+
+	var get_cell_style_html = function(cell_style){
+		console.log(cell_style);
+		if (cell_style == "split") 
+			{return "float:left; width:50%;";}
+		return "width:100%;";
+	};
+
+	var update_cell_style_element = function(cell){
+		var cell_style_html = get_cell_style_html(cell.metadata.cell_style);
+		cell.element.attr('style', cell_style_html);
+	};
+
+	function initialize () {
+		// On Load lets set the cell styles correctly
+		var cells = Jupyter.notebook.get_cells();
+		var ncells = Jupyter.notebook.ncells();
+
+		for (var i=0; i<ncells; i++){
+			var cell = cells[i];
+			if ("cell_style" in cell.metadata){
+				update_cell_style_element(cell, cell.metadata.cell_style);
+			}
+		 }
+	}
+
+	var load_extension = function() {
+		Jupyter.notebook.config.loaded.then(setup);
+
+		if (Jupyter.notebook !== undefined && Jupyter.notebook._fully_loaded) {
+			// notebook already loaded. Update directly
+			initialize();
+		}
+		events.on("notebook_loaded.Notebook", initialize);
+	};
+
+	return {
+		load_ipython_extension : load_extension
+	};
+});
diff --git a/.local/share/jupyter/nbextensions/toggle_all_line_numbers/icon.png b/.local/share/jupyter/nbextensions/toggle_all_line_numbers/icon.png
new file mode 100644
index 0000000000000000000000000000000000000000..606efcd1b5e92aa641b5c3c6178bf907da7cb2b2
Binary files /dev/null and b/.local/share/jupyter/nbextensions/toggle_all_line_numbers/icon.png differ
diff --git a/.local/share/jupyter/nbextensions/varInspector/README.md b/.local/share/jupyter/nbextensions/varInspector/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..35982ce3edd5b0997e02c57bdd697e3180ac6415
--- /dev/null
+++ b/.local/share/jupyter/nbextensions/varInspector/README.md
@@ -0,0 +1,36 @@
+# Variable Inspector
+
+## Description and main features
+
+The Variable Inspector extension, which currently supports python and R kernels, collects all defined variables and displays them in a floating window. The window displays not only the names of the variables but also their type, size in memory and content. The columns are sortable. The window is draggable, resizable and collapsible. The list of displayed variables is automatically updated at each cell execution. Variables can be deleted from the workspace by clicking a link. Position and state (displayed/collapsed) are stored in the notebook's metadata and restored at startup.
+
+The extension supports multiple kernels. To add support for a new kernel, one has to
+- provide a library which loads the required modules and defines a function which lists all variables, together with their name, type, size and content. The output of this function must be a JSON representation of a list of objects (one for each variable) with keys 'varName','varType', 'varSize', 'varContent',
+- provide the command for deleting a variable, as `delete_cmd_prefix` and `delete_cmd_postfix`, eg. for `rm(variable)`, specify `rm(` and `)`.
+- give the command to refresh the list of variables (usually this is a call to the function defined in the library above). This information can be provided either in the source file or in the yaml config file.
+
+In any case, contributions to support further kernels will be very welcome!
+
+#### Demo:
+![](demo.gif)
+
+
+## Configuration
+The initial configuration can be given using the IPython-contrib nbextensions facility. It includes:
+
+- varInspector.window_display - Display at startup or not (default: false)
+- varInspector.cols.lenName: (and .lenType, .lenVar) - Width of columns (actually the max number of characters to display in each column)
+- varInspector.kernels_config - json object defining the kernels specific code and commands.
+
+
+## Notes
+- The displayed size of variables uses the `getsizeof()` python method. This method doesn't work for all types, so the reported size is to be considered with some caution. The extension includes some code to correctly return the size of numpy arrays, pandas Series and DataFrame but the size for some other types may be incorrect.
+- The extension builds on some code provided [here](https://github.com/jupyter-widgets/ipywidgets/blob/master/docs/source/examples/Variable%20Inspector.ipynb)  (essentially the `_fill` method)
+- The extension uses Christian Bach's [table sorter jquery plugin](https://github.com/christianbach/tablesorter). License file is included.
+
+
+## History
+
+- @jfbercher march 22, 2017 -- initial release
+- @jfbercher april 03, 2017 -- multiple kernel support. added support for R kernels.
+- @jfbercher june 30, 2017 -- fixed #1014 (use of `%reset` with IPython kernel) and #1015 printing with python 2 kernel.
diff --git a/.local/share/jupyter/nbextensions/varInspector/__pycache__/var_list.cpython-310.pyc b/.local/share/jupyter/nbextensions/varInspector/__pycache__/var_list.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e895dce5773d3f6691a85fd487a5609d277e4fa6
Binary files /dev/null and b/.local/share/jupyter/nbextensions/varInspector/__pycache__/var_list.cpython-310.pyc differ
diff --git a/.local/share/jupyter/nbextensions/varInspector/demo.gif b/.local/share/jupyter/nbextensions/varInspector/demo.gif
new file mode 100644
index 0000000000000000000000000000000000000000..2bfbad6679c609421101f6cddcf9a9a3dd1083c4
Binary files /dev/null and b/.local/share/jupyter/nbextensions/varInspector/demo.gif differ
diff --git a/.local/share/jupyter/nbextensions/varInspector/icon.png b/.local/share/jupyter/nbextensions/varInspector/icon.png
new file mode 100644
index 0000000000000000000000000000000000000000..47677fb48e9deeb85caae355b39254b93278c4d1
Binary files /dev/null and b/.local/share/jupyter/nbextensions/varInspector/icon.png differ
diff --git a/.local/share/jupyter/nbextensions/varInspector/tablesorter_LICENSE.txt b/.local/share/jupyter/nbextensions/varInspector/tablesorter_LICENSE.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0623c925f1635c3bd1845cabada37e7e924f8451
--- /dev/null
+++ b/.local/share/jupyter/nbextensions/varInspector/tablesorter_LICENSE.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 Christian Bach
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/.local/share/jupyter/nbextensions/varInspector/var_list.py b/.local/share/jupyter/nbextensions/varInspector/var_list.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b975b5d86703aa4a032cdd54a9ea109d15911d6
--- /dev/null
+++ b/.local/share/jupyter/nbextensions/varInspector/var_list.py
@@ -0,0 +1,63 @@
+import json
+from sys import getsizeof
+
+from IPython import get_ipython
+from IPython.core.magics.namespace import NamespaceMagics
+_nms = NamespaceMagics()  # its who_ls() is used below to list the variable names
+_Jupyter = get_ipython()
+_nms.shell = _Jupyter.kernel.shell  # presumably the shell whose namespace who_ls() inspects - confirm against IPython
+
+try:
+    import numpy as np
+except ImportError:
+    pass    
+
+def _getsizeof(x):
+    # return the size of variable x. Amended version of sys.getsizeof
+    # which also supports ndarray, Series and DataFrame
+    if type(x).__name__ in ['ndarray', 'Series']:
+        return x.nbytes
+    elif type(x).__name__ == 'DataFrame':
+        return x.memory_usage().sum()
+    else:
+        return getsizeof(x)
+
+def _getshapeof(x):
+    #returns the shape of x if it has one
+    #returns None otherwise - might want to return an empty string for an empty column
+    try:
+        return x.shape
+    except AttributeError: #x does not have a shape
+        return None
+
+def _getcontentof(x):
+    length = 150
+    if type(x).__name__ == 'DataFrame':
+        colnames = ', '.join(x.columns.map(str))
+        content = "Column names: %s" % colnames
+    elif type(x).__name__ == 'Series':
+        content = "Series [%d rows]" % x.shape
+    elif type(x).__name__ == 'ndarray':
+        content = x.__repr__()
+    else:
+        if hasattr(x, '__len__'):
+            if len(x) > length:
+                content = str(x[:length])
+        else:
+            content = str(x)
+        if len(content) > 150:
+            return content[:150] + " ..."
+    return content
+
+def var_dic_list():
+    types_to_exclude = ['module', 'function', 'builtin_function_or_method',
+                        'instance', '_Feature', 'type', 'ufunc']
+    values = _nms.who_ls()
+    vardic = [{'varName': v, 'varType': type(eval(v)).__name__, 'varSize': str(_getsizeof(eval(v))), 'varShape': str(_getshapeof(eval(v))) if _getshapeof(eval(v)) else '', 'varContent': _getcontentof(eval(v)) }  # noqa
+    
+    for v in values if (v not in ['_html', '_nms', 'NamespaceMagics', '_Jupyter']) & (type(eval(v)).__name__ not in types_to_exclude)] # noqa 
+    return json.dumps(vardic)
+
+
+# command to refresh the list of variables
+print(var_dic_list())
diff --git a/.local/share/jupyter/nbextensions/varInspector/var_list.r b/.local/share/jupyter/nbextensions/varInspector/var_list.r
new file mode 100644
index 0000000000000000000000000000000000000000..340c4176d435490ea3097736bd84942768c9d177
--- /dev/null
+++ b/.local/share/jupyter/nbextensions/varInspector/var_list.r
@@ -0,0 +1,17 @@
+library(jsonlite)
+var_dic_list = function(){
+    # JSON list with name, class, size and a 200-character content preview
+    # of every non-function object in the global environment.
+    varList = list()
+    for (k in ls(.GlobalEnv, all.names = FALSE)){
+        # explicit .GlobalEnv lookup: locals such as `k` must not shadow user objects
+        obj = get(k, envir = .GlobalEnv)
+        # is.function() is a single TRUE/FALSE even when class(obj) has several entries
+        if (!is.function(obj)){
+            entry = list(varName = k, varType = class(obj), varSize = object.size(obj), varContent = substr(obj, 0, 200))
+            varList[[length(varList) + 1]] = entry
+        }
+    }
+    return(toJSON(varList, simplifyVector = FALSE, force=TRUE))
+}
+cat(var_dic_list())
\ No newline at end of file
diff --git a/.local/share/jupyter/nbextensions/zenmode/README.md b/.local/share/jupyter/nbextensions/zenmode/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6e790aa9bc5566cae81f4abdfd5a719b7d758c95
--- /dev/null
+++ b/.local/share/jupyter/nbextensions/zenmode/README.md
@@ -0,0 +1,4 @@
+Zenmode
+=======
+
+A little extension to give Zenmode functionality to the IPython notebook
\ No newline at end of file
diff --git a/.local/share/jupyter/nbextensions/zenmode/images/back2.jpg b/.local/share/jupyter/nbextensions/zenmode/images/back2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f433b963b0a7de48c4957f8c0e261a03ecafb5d8
Binary files /dev/null and b/.local/share/jupyter/nbextensions/zenmode/images/back2.jpg differ
diff --git a/.local/share/jupyter/nbextensions/zenmode/images/back21.jpg b/.local/share/jupyter/nbextensions/zenmode/images/back21.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0ced7d3348e6d867fcfe5d7f7e2b35ce0eb12d2e
Binary files /dev/null and b/.local/share/jupyter/nbextensions/zenmode/images/back21.jpg differ
diff --git a/.local/share/jupyter/nbextensions/zenmode/images/back3.jpg b/.local/share/jupyter/nbextensions/zenmode/images/back3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..361e689aa7b1f9741b12e18f8e6cd32dfc2e38c9
Binary files /dev/null and b/.local/share/jupyter/nbextensions/zenmode/images/back3.jpg differ
diff --git a/.local/share/jupyter/nbextensions/zenmode/images/ipynblogo0.png b/.local/share/jupyter/nbextensions/zenmode/images/ipynblogo0.png
new file mode 100644
index 0000000000000000000000000000000000000000..e56eb185168d59534c94ffff9bfd10b30902991f
Binary files /dev/null and b/.local/share/jupyter/nbextensions/zenmode/images/ipynblogo0.png differ
diff --git a/.local/share/jupyter/nbextensions/zenmode/images/ipynblogo1.png b/.local/share/jupyter/nbextensions/zenmode/images/ipynblogo1.png
new file mode 100644
index 0000000000000000000000000000000000000000..73020648efb9ca22bbadf91da2ce77f8e18cc498
Binary files /dev/null and b/.local/share/jupyter/nbextensions/zenmode/images/ipynblogo1.png differ
diff --git a/.local/share/jupyter/nbextensions/zenmode/main.css b/.local/share/jupyter/nbextensions/zenmode/main.css
new file mode 100644
index 0000000000000000000000000000000000000000..3fb688f3cb64d1839119bc2f9c7bbe800f747ad1
--- /dev/null
+++ b/.local/share/jupyter/nbextensions/zenmode/main.css
@@ -0,0 +1,34 @@
+.navbar-inner { /* navbar is half-transparent until hovered */
+    opacity: 0.5;
+    -webkit-transition: opacity 0.3s ease-in-out; /* vendor-prefixed copies of the transition below */
+    -moz-transition: opacity 0.3s ease-in-out;
+    -o-transition: opacity 0.3s ease-in-out;
+    transition: opacity 0.3s ease-in-out;
+}
+
+.navbar-inner:hover { /* fully opaque while the pointer is over it */
+    opacity: 1.0;
+}
+
+#maintoolbar .navbar-text { /* hidden; !important overrides other display rules */
+    display: none !important;
+}
+
+#notebook-container { /* white with zero alpha, i.e. a fully transparent background */
+    background-color: rgba(255, 255, 255, 0);
+}
+
+/*
+.cell {
+    background-color: rgb(255, 255, 255);
+}
+
+.CodeMirror {
+    background: #F8FCCF;
+}
+
+div.input_area {
+    margin: 2px;
+    border: none;
+}
+*/
diff --git a/.local/share/jupyter/runtime/jpserver-434.json b/.local/share/jupyter/runtime/jpserver-434.json
new file mode 100644
index 0000000000000000000000000000000000000000..827db6bb1f6650a69a0476806b815ee094429a2d
--- /dev/null
+++ b/.local/share/jupyter/runtime/jpserver-434.json
@@ -0,0 +1,13 @@
+{
+  "base_url": "/",
+  "hostname": "0.0.0.0",
+  "password": false,
+  "pid": 434,
+  "port": 8080,
+  "root_dir": "/root",
+  "secure": true,
+  "sock": "",
+  "token": "5a434251505375f2b42435914de608ef3450739f4e14b0be1cfeae3b7364239e",
+  "url": "https://184d1c0992ce:8080/",
+  "version": "2.12.5"
+}
\ No newline at end of file
diff --git a/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.cubin b/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..043c32f99120b95b4bdae1f75f759ac28d96dd66
Binary files /dev/null and b/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.cubin differ
diff --git a/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttgir b/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..4b8f38b8414ea811649605ae9ab1c4d55cd94b3d
--- /dev/null
+++ b/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttgir
@@ -0,0 +1,60 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<256> : tensor<4x1xi64, #blocked>
+    %cst_0 = arith.constant dense<0> : tensor<4x1xi64, #blocked>
+    %cst_1 = arith.constant dense<512> : tensor<4x1xi64, #blocked>
+    %cst_2 = arith.constant dense<256> : tensor<4x1xi32, #blocked>
+    %cst_3 = arith.constant dense<131072> : tensor<1x128xi32, #blocked1>
+    %cst_4 = arith.constant dense<120> : tensor<1x128xi32, #blocked1>
+    %cst_5 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked1>
+    %cst_6 = arith.constant dense<true> : tensor<4x1xi1, #blocked>
+    %c4_i32 = arith.constant 4 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c4_i32 : i32
+    %2 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+    %3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<4x1xi32, #blocked1>
+    %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<4x1xi32, #blocked>
+    %6 = tt.splat %1 : (i32) -> tensor<4x1xi32, #blocked1>
+    %7 = tt.splat %1 : (i32) -> tensor<4x1xi32, #blocked>
+    %8 = arith.addi %6, %4 : tensor<4x1xi32, #blocked1>
+    %9 = arith.addi %7, %5 : tensor<4x1xi32, #blocked>
+    %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
+    %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x128xi32, #blocked1>
+    %12 = arith.cmpi slt, %11, %cst_4 : tensor<1x128xi32, #blocked1>
+    %13 = arith.muli %11, %cst_3 : tensor<1x128xi32, #blocked1>
+    %14 = tt.broadcast %8 : (tensor<4x1xi32, #blocked1>) -> tensor<4x128xi32, #blocked1>
+    %15 = tt.broadcast %13 : (tensor<1x128xi32, #blocked1>) -> tensor<4x128xi32, #blocked1>
+    %16 = arith.addi %14, %15 : tensor<4x128xi32, #blocked1>
+    %17 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<4x128x!tt.ptr<f32, 1>, #blocked1>
+    %18 = tt.addptr %17, %16 : tensor<4x128x!tt.ptr<f32, 1>, #blocked1>, tensor<4x128xi32, #blocked1>
+    %19 = tt.broadcast %12 : (tensor<1x128xi1, #blocked1>) -> tensor<4x128xi1, #blocked1>
+    %20 = tt.load %18, %19, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<4x128xf32, #blocked1>
+    %21 = arith.addf %20, %cst_5 : tensor<4x128xf32, #blocked1>
+    %22 = arith.select %19, %21, %cst_5 : tensor<4x128xi1, #blocked1>, tensor<4x128xf32, #blocked1>
+    %23 = "tt.reduce"(%22) <{axis = 1 : i32}> ({
+    ^bb0(%arg5: f32, %arg6: f32):
+      %40 = arith.addf %arg5, %arg6 : f32
+      tt.reduce.return %40 : f32
+    }) : (tensor<4x128xf32, #blocked1>) -> tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+    %24 = triton_gpu.convert_layout %23 : (tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<4x1xf32, #blocked>
+    %26 = arith.divsi %9, %cst_2 : tensor<4x1xi32, #blocked>
+    %27 = arith.remsi %9, %cst_2 : tensor<4x1xi32, #blocked>
+    %28 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<4x1x!tt.ptr<i64, 1>, #blocked>
+    %29 = tt.addptr %28, %26 : tensor<4x1x!tt.ptr<i64, 1>, #blocked>, tensor<4x1xi32, #blocked>
+    %30 = tt.load %29 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<4x1xi64, #blocked>
+    %31 = arith.addi %30, %cst_1 : tensor<4x1xi64, #blocked>
+    %32 = arith.cmpi slt, %30, %cst_0 : tensor<4x1xi64, #blocked>
+    %33 = arith.select %32, %31, %30 : tensor<4x1xi1, #blocked>, tensor<4x1xi64, #blocked>
+    %34 = arith.muli %33, %cst : tensor<4x1xi64, #blocked>
+    %35 = arith.extsi %27 : tensor<4x1xi32, #blocked> to tensor<4x1xi64, #blocked>
+    %36 = arith.addi %35, %34 : tensor<4x1xi64, #blocked>
+    %37 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<4x1x!tt.ptr<f32, 1>, #blocked>
+    %38 = tt.addptr %37, %36 : tensor<4x1x!tt.ptr<f32, 1>, #blocked>, tensor<4x1xi64, #blocked>
+    %39 = "tt.atomic_rmw"(%38, %25, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<4x1x!tt.ptr<f32, 1>, #blocked>, tensor<4x1xf32, #blocked>, tensor<4x1xi1, #blocked>) -> tensor<4x1xf32, #blocked>
+    tt.return
+  }
+}
diff --git a/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.cubin b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..5880bb280c4de369bd2e6dc5cdeb23488e6f4a5c
Binary files /dev/null and b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.cubin differ
diff --git a/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ptx b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..12368a3de34292de00a4c7dc42d7890ab0e33358
--- /dev/null
+++ b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ptx
@@ -0,0 +1,453 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2de
+.extern .func __assertfail
+(
+	.param .b64 __assertfail_param_0,
+	.param .b64 __assertfail_param_1,
+	.param .b32 __assertfail_param_2,
+	.param .b64 __assertfail_param_3,
+	.param .b64 __assertfail_param_4
+)
+;
+.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
+.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
+.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 55, 32, 60, 32, 53, 48, 50, 53, 55};
+.extern .shared .align 1 .b8 global_smem[];
+
+.visible .entry triton__0d1d2de(
+	.param .u64 triton__0d1d2de_param_0,
+	.param .u64 triton__0d1d2de_param_1,
+	.param .u64 triton__0d1d2de_param_2
+)
+.maxntid 128, 1, 1
+{
+	.reg .pred 	%p<24>;
+	.reg .b16 	%rs<21>;
+	.reg .b32 	%r<21>;
+	.reg .b64 	%rd<58>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd9, [triton__0d1d2de_param_1];
+	ld.param.u64 	%rd16, [triton__0d1d2de_param_0];
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r4, %tid.x;
+	and.b32  	%r1, %r4, 127;
+	shl.b32 	%r2, %r1, 1;
+	or.b32  	%r5, %r2, 1;
+	or.b32  	%r6, %r2, 256;
+	.loc	1 20 28
+	mov.u32 %r3, %ctaid.x;
+	.loc	1 20 46
+	mul.wide.s32 	%rd1, %r3, 512;
+	cvt.u64.u32 	%rd17, %r2;
+	cvt.u64.u32 	%rd18, %r6;
+	.loc	1 21 23
+	or.b64  	%rd2, %rd1, %rd17;
+	or.b64  	%rd3, %rd1, %rd18;
+	.loc	1 24 30
+	shl.b64 	%rd19, %rd2, 3;
+	add.s64 	%rd12, %rd16, %rd19;
+	add.s64 	%rd15, %rd12, 2048;
+	mov.pred 	%p20, -1;
+	.loc	1 24 35
+	mov.u64 %rd10, 0x0;
+	mov.u64 %rd11, 0x0;
+	@%p20 ld.global.v2.b64 { %rd10, %rd11 }, [ %rd12 + 0 ];
+	mov.u64 %rd13, 0x0;
+	mov.u64 %rd14, 0x0;
+	@%p20 ld.global.v2.b64 { %rd13, %rd14 }, [ %rd15 + 0 ];
+	.loc	1 26 19
+	setp.eq.s64 	%p3, %rd14, -1;
+	setp.eq.s64 	%p4, %rd13, -1;
+	setp.eq.s64 	%p5, %rd11, -1;
+	setp.eq.s64 	%p6, %rd10, -1;
+	.loc	1 28 32
+	selp.b64 	%rd20, 0, %rd10, %p6;
+	selp.b64 	%rd21, 0, %rd11, %p5;
+	selp.b64 	%rd22, 0, %rd13, %p4;
+	selp.b64 	%rd23, 0, %rd14, %p3;
+	.loc	1 29 18
+	add.s64 	%rd24, %rd23, 50257;
+	add.s64 	%rd25, %rd22, 50257;
+	add.s64 	%rd26, %rd21, 50257;
+	add.s64 	%rd27, %rd20, 50257;
+	.loc	1 30 18
+	setp.lt.s64 	%p7, %rd23, 0;
+	setp.lt.s64 	%p8, %rd22, 0;
+	setp.lt.s64 	%p9, %rd21, 0;
+	setp.lt.s64 	%p10, %rd20, 0;
+	.loc	1 31 32
+	selp.b64 	%rd7, %rd27, %rd20, %p10;
+	selp.b64 	%rd6, %rd26, %rd21, %p9;
+	selp.b64 	%rd5, %rd25, %rd22, %p8;
+	selp.b64 	%rd4, %rd24, %rd23, %p7;
+	.loc	1 32 36
+	setp.lt.u64 	%p11, %rd4, 50257;
+	setp.lt.u64 	%p12, %rd5, 50257;
+	setp.lt.u64 	%p13, %rd6, 50257;
+	setp.lt.u64 	%p14, %rd7, 50257;
+	mov.u32 	%r7, global_smem;
+	add.s32 	%r8, %r7, %r2;
+	selp.u16 	%rs1, 1, 0, %p14;
+	st.shared.u8 	[%r8], %rs1;
+	cvt.u64.u32 	%rd8, %r5;
+	selp.u16 	%rs2, 1, 0, %p13;
+	st.shared.u8 	[%r8+1], %rs2;
+	bar.sync 	0;
+	add.s32 	%r9, %r7, %r1;
+	ld.shared.u8 	%rs3, [%r9];
+	ld.shared.u8 	%rs4, [%r9+128];
+	bar.sync 	0;
+	selp.u16 	%rs5, 1, 0, %p12;
+	st.shared.u8 	[%r8], %rs5;
+	selp.u16 	%rs6, 1, 0, %p11;
+	st.shared.u8 	[%r8+1], %rs6;
+	bar.sync 	0;
+	ld.shared.u8 	%rs7, [%r9];
+	ld.shared.u8 	%rs8, [%r9+128];
+	setp.eq.s16 	%p15, %rs7, 0;
+	selp.u16 	%rs9, 1, 0, %p15;
+	shl.b16 	%rs10, %rs9, 2;
+	setp.eq.s16 	%p16, %rs8, 0;
+	selp.u16 	%rs11, -1, 0, %p16;
+	shl.b16 	%rs12, %rs11, 3;
+	or.b16  	%rs13, %rs12, %rs10;
+	setp.eq.s16 	%p17, %rs4, 0;
+	selp.u16 	%rs14, 1, 0, %p17;
+	setp.eq.s16 	%p18, %rs3, 0;
+	selp.u16 	%rs15, -1, 0, %p18;
+	shl.b16 	%rs16, %rs15, 1;
+	or.b16  	%rs17, %rs14, %rs16;
+	and.b16  	%rs18, %rs17, 3;
+	or.b16  	%rs19, %rs18, %rs13;
+	.loc	1 32 51
+	and.b16  	%rs20, %rs19, 15;
+	setp.eq.s16 	%p19, %rs20, 0;
+	@%p19 bra 	$L__BB0_2;
+	mov.u64 	%rd28, assertMessage_0;
+	cvta.global.u64 	%rd29, %rd28;
+	mov.u64 	%rd30, assertFile_0;
+	cvta.global.u64 	%rd31, %rd30;
+	mov.u64 	%rd32, assertFunc_0;
+	cvta.global.u64 	%rd33, %rd32;
+	mov.b32 	%r10, 883;
+	mov.u64 	%rd34, 1;
+	{ // callseq 0, 0
+	.reg .b32 temp_param_reg;
+	.param .b64 param0;
+	st.param.b64 	[param0+0], %rd29;
+	.param .b64 param1;
+	st.param.b64 	[param1+0], %rd31;
+	.param .b32 param2;
+	st.param.b32 	[param2+0], %r10;
+	.param .b64 param3;
+	st.param.b64 	[param3+0], %rd33;
+	.param .b64 param4;
+	st.param.b64 	[param4+0], %rd34;
+	call.uni 
+	__assertfail, 
+	(
+	param0, 
+	param1, 
+	param2, 
+	param3, 
+	param4
+	);
+	} // callseq 0
+$L__BB0_2:
+	.loc	1 21 36
+	or.b32  	%r15, %r2, 257;
+	cvt.u64.u32 	%rd39, %r15;
+	.loc	1 21 23
+	or.b64  	%rd40, %rd1, %rd39;
+	or.b64  	%rd41, %rd1, %rd8;
+	.loc	1 34 25
+	shl.b64 	%rd42, %rd7, 2;
+	add.s64 	%rd43, %rd9, %rd42;
+	mul.lo.s64 	%rd44, %rd2, 201028;
+	add.s64 	%rd45, %rd43, %rd44;
+	shl.b64 	%rd46, %rd6, 2;
+	add.s64 	%rd47, %rd9, %rd46;
+	mul.lo.s64 	%rd48, %rd41, 201028;
+	add.s64 	%rd49, %rd47, %rd48;
+	shl.b64 	%rd50, %rd5, 2;
+	add.s64 	%rd51, %rd9, %rd50;
+	mul.lo.s64 	%rd52, %rd3, 201028;
+	add.s64 	%rd53, %rd51, %rd52;
+	shl.b64 	%rd54, %rd4, 2;
+	add.s64 	%rd55, %rd9, %rd54;
+	mul.lo.s64 	%rd56, %rd40, 201028;
+	add.s64 	%rd57, %rd55, %rd56;
+	.loc	1 34 51
+	bar.sync 	0;
+	shl.b32 	%r16, %r2, 3;
+	add.s32 	%r18, %r7, %r16;
+	st.shared.u64 	[%r18], %rd45;
+	st.shared.u64 	[%r18+8], %rd49;
+	bar.sync 	0;
+	shl.b32 	%r19, %r1, 3;
+	add.s32 	%r20, %r7, %r19;
+	ld.shared.u64 	%rd35, [%r20];
+	ld.shared.u64 	%rd36, [%r20+1024];
+	bar.sync 	0;
+	st.shared.u64 	[%r18], %rd53;
+	st.shared.u64 	[%r18+8], %rd57;
+	bar.sync 	0;
+	ld.shared.u64 	%rd37, [%r20];
+	ld.shared.u64 	%rd38, [%r20+1024];
+	mov.b32 	%r11, -1082130432;
+	@%p20 st.global.b32 [ %rd35 + 0 ], { %r11 };
+	@%p20 st.global.b32 [ %rd36 + 0 ], { %r11 };
+	@%p20 st.global.b32 [ %rd37 + 0 ], { %r11 };
+	@%p20 st.global.b32 [ %rd38 + 0 ], { %r11 };
+	.loc	1 34 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+
+}
+	.file	1 "/tmp/torchinductor_root/hl/chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 176
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 104
+.b8 108
+.b8 114
+.b8 107
+.b8 103
+.b8 112
+.b8 118
+.b8 118
+.b8 98
+.b8 100
+.b8 105
+.b8 122
+.b8 100
+.b8 122
+.b8 55
+.b8 115
+.b8 108
+.b8 108
+.b8 113
+.b8 117
+.b8 101
+.b8 116
+.b8 50
+.b8 106
+.b8 55
+.b8 122
+.b8 104
+.b8 116
+.b8 101
+.b8 115
+.b8 54
+.b8 109
+.b8 101
+.b8 104
+.b8 54
+.b8 107
+.b8 101
+.b8 110
+.b8 114
+.b8 113
+.b8 120
+.b8 111
+.b8 118
+.b8 50
+.b8 54
+.b8 109
+.b8 115
+.b8 119
+.b8 118
+.b8 119
+.b8 55
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 104
+.b8 108
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttgir b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..0b7d78df8ee7c3a7698074c07e6a85c89557a262
--- /dev/null
+++ b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttgir
@@ -0,0 +1,38 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<50257> : tensor<512xi64, #blocked>
+    %cst_0 = arith.constant dense<0> : tensor<512xi64, #blocked>
+    %cst_1 = arith.constant dense<-1> : tensor<512xi64, #blocked>
+    %cst_2 = arith.constant dense<-1.000000e+00> : tensor<512xf32, #blocked1>
+    %c512_i64 = arith.constant 512 : i64
+    %0 = tt.get_program_id x : i32
+    %1 = arith.extsi %0 : i32 to i64
+    %2 = arith.muli %1, %c512_i64 : i64
+    %3 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
+    %4 = arith.extsi %3 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked>
+    %5 = tt.splat %2 : (i64) -> tensor<512xi64, #blocked>
+    %6 = arith.addi %5, %4 : tensor<512xi64, #blocked>
+    %7 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<512x!tt.ptr<i64, 1>, #blocked>
+    %8 = tt.addptr %7, %6 : tensor<512x!tt.ptr<i64, 1>, #blocked>, tensor<512xi64, #blocked>
+    %9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xi64, #blocked>
+    %10 = arith.cmpi ne, %9, %cst_1 : tensor<512xi64, #blocked>
+    %11 = arith.select %10, %9, %cst_0 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked>
+    %12 = arith.addi %11, %cst : tensor<512xi64, #blocked>
+    %13 = arith.cmpi slt, %11, %cst_0 : tensor<512xi64, #blocked>
+    %14 = arith.select %13, %12, %11 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked>
+    %15 = arith.cmpi sge, %14, %cst_0 : tensor<512xi64, #blocked>
+    %16 = arith.cmpi slt, %14, %cst : tensor<512xi64, #blocked>
+    %17 = arith.andi %15, %16 : tensor<512xi1, #blocked>
+    %18 = triton_gpu.convert_layout %17 : (tensor<512xi1, #blocked>) -> tensor<512xi1, #blocked1>
+    tt.assert %18, "index out of bounds: 0 <= tmp7 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<512xi1, #blocked1>
+    %19 = arith.muli %6, %cst : tensor<512xi64, #blocked>
+    %20 = arith.addi %14, %19 : tensor<512xi64, #blocked>
+    %21 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
+    %22 = tt.addptr %21, %20 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi64, #blocked>
+    %23 = triton_gpu.convert_layout %22 : (tensor<512x!tt.ptr<f32, 1>, #blocked>) -> tensor<512x!tt.ptr<f32, 1>, #blocked1>
+    tt.store %23, %cst_2 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked1>
+    tt.return
+  }
+}
diff --git a/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttir b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..ac625daa75a7b2e0a1660cd29001f4943ddb9c7b
--- /dev/null
+++ b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttir
@@ -0,0 +1,34 @@
+module {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<50257> : tensor<512xi64>
+    %cst_0 = arith.constant dense<0> : tensor<512xi64>
+    %c512_i64 = arith.constant 512 : i64
+    %cst_1 = arith.constant dense<-1.000000e+00> : tensor<512xf32>
+    %cst_2 = arith.constant dense<-1> : tensor<512xi64>
+    %0 = tt.get_program_id x : i32
+    %1 = arith.extsi %0 : i32 to i64
+    %2 = arith.muli %1, %c512_i64 : i64
+    %3 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
+    %4 = arith.extsi %3 : tensor<512xi32> to tensor<512xi64>
+    %5 = tt.splat %2 : (i64) -> tensor<512xi64>
+    %6 = arith.addi %5, %4 : tensor<512xi64>
+    %7 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<512x!tt.ptr<i64, 1>>
+    %8 = tt.addptr %7, %6 : tensor<512x!tt.ptr<i64, 1>>, tensor<512xi64>
+    %9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xi64>
+    %10 = arith.cmpi ne, %9, %cst_2 : tensor<512xi64>
+    %11 = arith.select %10, %9, %cst_0 : tensor<512xi1>, tensor<512xi64>
+    %12 = arith.addi %11, %cst : tensor<512xi64>
+    %13 = arith.cmpi slt, %11, %cst_0 : tensor<512xi64>
+    %14 = arith.select %13, %12, %11 : tensor<512xi1>, tensor<512xi64>
+    %15 = arith.cmpi sge, %14, %cst_0 : tensor<512xi64>
+    %16 = arith.cmpi slt, %14, %cst : tensor<512xi64>
+    %17 = arith.andi %15, %16 : tensor<512xi1>
+    tt.assert %17, "index out of bounds: 0 <= tmp7 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<512xi1>
+    %18 = arith.muli %6, %cst : tensor<512xi64>
+    %19 = arith.addi %14, %18 : tensor<512xi64>
+    %20 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
+    %21 = tt.addptr %20, %19 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi64>
+    tt.store %21, %cst_1 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32>
+    tt.return
+  }
+}
diff --git a/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ptx b/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..bee0c6b92fa96062bbd3c250fff66034763e177d
--- /dev/null
+++ b/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ptx
@@ -0,0 +1,312 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2de
+
+.visible .entry triton__0d1d2de(
+	.param .u64 triton__0d1d2de_param_0,
+	.param .u64 triton__0d1d2de_param_1,
+	.param .u32 triton__0d1d2de_param_2
+)
+.maxntid 128, 1, 1
+{
+	.reg .pred 	%p<4>;
+	.reg .b16 	%rs<9>;
+	.reg .b32 	%r<31>;
+	.reg .b64 	%rd<8>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd4, [triton__0d1d2de_param_0];
+	ld.param.u64 	%rd5, [triton__0d1d2de_param_1];
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r22, %tid.x;
+	shl.b32 	%r23, %r22, 3;
+	and.b32  	%r24, %r23, 1016;
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	shl.b32 	%r25, %r1, 10;
+	.loc	1 21 23
+	or.b32  	%r26, %r25, %r24;
+	.loc	1 24 30
+	mul.wide.s32 	%rd6, %r26, 4;
+	add.s64 	%rd1, %rd4, %rd6;
+	add.s64 	%rd2, %rd1, 16;
+	mov.pred 	%p1, -1;
+	.loc	1 24 35
+	mov.u32 %r10, 0x0;
+	mov.u32 %r11, 0x0;
+	mov.u32 %r12, 0x0;
+	mov.u32 %r13, 0x0;
+	@%p1 ld.global.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd1 + 0 ];
+	mov.u32 %r14, 0x0;
+	mov.u32 %r15, 0x0;
+	mov.u32 %r16, 0x0;
+	mov.u32 %r17, 0x0;
+	@%p1 ld.global.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd2 + 0 ];
+	.loc	1 26 25
+	mul.wide.s32 	%rd7, %r26, 2;
+	add.s64 	%rd3, %rd5, %rd7;
+	.loc	1 26 36
+	cvt.rn.bf16.f32 %rs1, %r10;
+	cvt.rn.bf16.f32 %rs2, %r11;
+	cvt.rn.bf16.f32 %rs3, %r12;
+	cvt.rn.bf16.f32 %rs4, %r13;
+	cvt.rn.bf16.f32 %rs5, %r14;
+	cvt.rn.bf16.f32 %rs6, %r15;
+	cvt.rn.bf16.f32 %rs7, %r16;
+	cvt.rn.bf16.f32 %rs8, %r17;
+	mov.b32 	%r27, {%rs1, %rs2};
+	mov.b32 	%r28, {%rs3, %rs4};
+	mov.b32 	%r29, {%rs5, %rs6};
+	mov.b32 	%r30, {%rs7, %rs8};
+	@%p1 st.global.v4.b32 [ %rd3 + 0 ], { %r27, %r28, %r29, %r30 };
+	.loc	1 26 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+
+}
+	.file	1 "/tmp/torchinductor_root/5t/c5tryp5qwkhreijk7s5x327wofz54lwj4kvctuqdzv2vrf2xyons.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 176
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 53
+.b8 116
+.b8 114
+.b8 121
+.b8 112
+.b8 53
+.b8 113
+.b8 119
+.b8 107
+.b8 104
+.b8 114
+.b8 101
+.b8 105
+.b8 106
+.b8 107
+.b8 55
+.b8 115
+.b8 53
+.b8 120
+.b8 51
+.b8 50
+.b8 55
+.b8 119
+.b8 111
+.b8 102
+.b8 122
+.b8 53
+.b8 52
+.b8 108
+.b8 119
+.b8 106
+.b8 52
+.b8 107
+.b8 118
+.b8 99
+.b8 116
+.b8 117
+.b8 113
+.b8 100
+.b8 122
+.b8 118
+.b8 50
+.b8 118
+.b8 114
+.b8 102
+.b8 50
+.b8 120
+.b8 121
+.b8 111
+.b8 110
+.b8 115
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 53
+.b8 116
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttgir b/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..614e4de8b1d1d1bea9558461182cf294bdee414b
--- /dev/null
+++ b/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttgir
@@ -0,0 +1,19 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
+    %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
+    %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
+    %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi32, #blocked>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked>
+    %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
+    %9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
+    %10 = arith.truncf %7 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
+    tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
+    tt.return
+  }
+}
diff --git a/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..0d55d8e52e5055c0628c4a6cd43dddde5d53d7e4
Binary files /dev/null and b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin differ
diff --git a/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttir b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..962967057117e882cdf06f7455d5b47cc5232325
--- /dev/null
+++ b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttir
@@ -0,0 +1,56 @@
+module {
+  tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c8_i32 = arith.constant 8 : i32
+    %c128_i32 = arith.constant 128 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %cst = arith.constant dense<32768> : tensor<64x1xi32>
+    %cst_0 = arith.constant dense<256> : tensor<1x8xi32>
+    %cst_1 = arith.constant dense<128> : tensor<1x8xi32>
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
+    %cst_3 = arith.constant dense<256> : tensor<64x1xi32>
+    %c64_i32 = arith.constant 64 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c64_i32 : i32
+    %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
+    %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
+    %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
+    %5 = arith.addi %4, %3 : tensor<64x1xi32>
+    %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
+    %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
+    %8 = arith.remsi %5, %cst_3 : tensor<64x1xi32>
+    %9 = arith.divsi %5, %cst_3 : tensor<64x1xi32>
+    %10 = tt.broadcast %8 : (tensor<64x1xi32>) -> tensor<64x8xi32>
+    %11 = arith.muli %9, %cst : tensor<64x1xi32>
+    %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x8xi32>
+    %13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
+    %14 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
+    %15 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_2) -> (tensor<64x8xf32>)  : i32 {
+      %20 = tt.splat %arg5 : (i32) -> tensor<1x8xi32>
+      %21 = arith.addi %20, %7 : tensor<1x8xi32>
+      %22 = arith.cmpi slt, %21, %cst_1 : tensor<1x8xi32>
+      %23 = arith.muli %21, %cst_0 : tensor<1x8xi32>
+      %24 = tt.broadcast %23 : (tensor<1x8xi32>) -> tensor<64x8xi32>
+      %25 = arith.addi %10, %24 : tensor<64x8xi32>
+      %26 = arith.addi %25, %12 : tensor<64x8xi32>
+      %27 = tt.addptr %13, %26 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
+      %28 = tt.broadcast %22 : (tensor<1x8xi1>) -> tensor<64x8xi1>
+      %29 = tt.load %27, %28, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
+      %30 = tt.addptr %14, %26 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
+      %31 = tt.load %30, %28, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
+      %32 = arith.mulf %29, %31 : tensor<64x8xf32>
+      %33 = arith.addf %arg6, %32 : tensor<64x8xf32>
+      %34 = arith.select %28, %33, %arg6 : tensor<64x8xi1>, tensor<64x8xf32>
+      scf.yield %34 : tensor<64x8xf32>
+    }
+    %16 = "tt.reduce"(%15) <{axis = 1 : i32}> ({
+    ^bb0(%arg5: f32, %arg6: f32):
+      %20 = arith.addf %arg5, %arg6 : f32
+      tt.reduce.return %20 : f32
+    }) : (tensor<64x8xf32>) -> tensor<64xf32>
+    %17 = tt.expand_dims %16 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
+    %18 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
+    %19 = tt.addptr %18, %5 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi32>
+    tt.store %19, %17 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32>
+    tt.return
+  }
+}
diff --git a/.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ttgir b/.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..989f4b0263f466b7932aaa29efeacd8142cc4858
--- /dev/null
+++ b/.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ttgir
@@ -0,0 +1,88 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
+    %cst_0 = arith.constant dense<-1> : tensor<1xi64, #blocked>
+    %cst_1 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked>
+    %cst_2 = arith.constant dense<256> : tensor<1xi64, #blocked>
+    %cst_3 = arith.constant dense<0> : tensor<1xi64, #blocked>
+    %cst_4 = arith.constant dense<50257> : tensor<1xi64, #blocked>
+    %cst_5 = arith.constant 0.000000e+00 : f32
+    %c256_i32 = arith.constant 256 : i32
+    %cst_6 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
+    %cst_7 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked>
+    %cst_8 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
+    %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
+    %3 = arith.muli %0, %c256_i32 : i32
+    %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
+    %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
+    %6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
+    %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
+    %8 = tt.load %7, %2, %cst_8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
+    %9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
+    %10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %12 = tt.load %11, %2, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    %13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %15 = tt.load %14, %2, %cst_6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    %16 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
+    %17 = tt.splat %16 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
+    %18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
+    %19 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
+    %20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
+    %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
+    %22 = tt.addptr %arg6, %0 : !tt.ptr<i64, 1>, i32
+    %23 = tt.splat %22 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>, #blocked>
+    %24 = tt.load %23 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked>
+    %25 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %26 = tt.addptr %25, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
+    %27 = tt.load %26, %2, %cst_6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
+    %28 = arith.mulf %9, %12 : tensor<256xf32, #blocked>
+    %29 = arith.select %2, %28, %cst_6 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
+    ^bb0(%arg10: f32, %arg11: f32):
+      %63 = arith.addf %arg10, %arg11 : f32
+      tt.reduce.return %63 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %31 = arith.addf %30, %cst_5 : f32
+    %32 = tt.broadcast %18 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
+    %33 = arith.subf %15, %32 : tensor<256xf32, #blocked>
+    %34 = tt.broadcast %21 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
+    %35 = arith.mulf %33, %34 : tensor<256xf32, #blocked>
+    %36 = arith.mulf %28, %35 : tensor<256xf32, #blocked>
+    %37 = arith.select %2, %36, %cst_6 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %38 = "tt.reduce"(%37) <{axis = 0 : i32}> ({
+    ^bb0(%arg10: f32, %arg11: f32):
+      %63 = arith.addf %arg10, %arg11 : f32
+      tt.reduce.return %63 : f32
+    }) : (tensor<256xf32, #blocked>) -> f32
+    %39 = arith.addf %38, %cst_5 : f32
+    %40 = arith.cmpi eq, %24, %cst_0 : tensor<1xi64, #blocked>
+    %41 = arith.divf %21, %cst_1 : tensor<1xf32, #blocked>
+    %42 = arith.mulf %28, %cst_7 : tensor<256xf32, #blocked>
+    %43 = tt.splat %31 : (f32) -> tensor<256xf32, #blocked>
+    %44 = arith.subf %42, %43 : tensor<256xf32, #blocked>
+    %45 = tt.splat %39 : (f32) -> tensor<256xf32, #blocked>
+    %46 = arith.mulf %35, %45 : tensor<256xf32, #blocked>
+    %47 = arith.subf %44, %46 : tensor<256xf32, #blocked>
+    %48 = tt.broadcast %41 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
+    %49 = arith.mulf %48, %47 : tensor<256xf32, #blocked>
+    %50 = arith.addf %27, %49 : tensor<256xf32, #blocked>
+    %51 = tt.broadcast %40 : (tensor<1xi1, #blocked>) -> tensor<256xi1, #blocked>
+    %52 = arith.select %51, %cst_6, %50 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
+    %53 = arith.addi %24, %cst_4 : tensor<1xi64, #blocked>
+    %54 = arith.cmpi slt, %24, %cst_3 : tensor<1xi64, #blocked>
+    %55 = arith.select %54, %53, %24 : tensor<1xi1, #blocked>, tensor<1xi64, #blocked>
+    %56 = arith.muli %55, %cst_2 : tensor<1xi64, #blocked>
+    %57 = tt.broadcast %56 : (tensor<1xi64, #blocked>) -> tensor<256xi64, #blocked>
+    %58 = arith.extsi %1 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked>
+    %59 = arith.addi %58, %57 : tensor<256xi64, #blocked>
+    %60 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
+    %61 = tt.addptr %60, %59 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi64, #blocked>
+    %62 = "tt.atomic_rmw"(%61, %52, %2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xf32, #blocked>, tensor<256xi1, #blocked>) -> tensor<256xf32, #blocked>
+    tt.return
+  }
+}
diff --git a/.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ptx b/.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..620845f02dff7a50f12b12538eb505d2fb4b62ba
--- /dev/null
+++ b/.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ptx
@@ -0,0 +1,778 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2de
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
+
+.visible .entry triton__0d1d2de(
+	.param .u64 triton__0d1d2de_param_0,
+	.param .u64 triton__0d1d2de_param_1,
+	.param .u32 triton__0d1d2de_param_2
+)
+.maxntid 128, 1, 1
+{
+	.reg .pred 	%p<27>;
+	.reg .b16 	%rs<17>;
+	.reg .b32 	%r<67>;
+	.reg .f32 	%f<431>;
+	.reg .b64 	%rd<8>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd4, [triton__0d1d2de_param_0];
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r14, %tid.x;
+	shl.b32 	%r15, %r14, 3;
+	and.b32  	%r16, %r15, 1016;
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	shl.b32 	%r17, %r1, 10;
+	.loc	1 21 23
+	or.b32  	%r18, %r17, %r16;
+	.loc	1 24 30
+	mul.wide.s32 	%rd5, %r18, 2;
+	add.s64 	%rd3, %rd4, %rd5;
+	mov.pred 	%p1, -1;
+	.loc	1 24 35
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd3 + 0 ];
+	cvt.u16.u32 	%rs1, %r2;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
+	cvt.u16.u32 	%rs3, %r3;
+	.loc	1 24 44
+	cvt.f32.bf16 %r6, %rs1;
+	mov.b32 	%f1, %r6;
+	cvt.f32.bf16 %r7, %rs2;
+	mov.b32 	%f2, %r7;
+	.loc	1 29 18
+	mul.f32 	%f9, %f1, 0f3F3504F3;
+	.loc	1 30 23
+	abs.ftz.f32 	%f17, %f9;
+	setp.ge.f32 	%p2, %f17, 0f3F8060FE;
+	mov.f32 	%f365, 0f3789CA3C;
+	mov.f32 	%f364, 0fB9F560B9;
+	mov.f32 	%f363, 0f3BAC840B;
+	mov.f32 	%f362, 0fBD0C8162;
+	mov.f32 	%f361, 0f3E1CF906;
+	mov.f32 	%f360, 0f3F6A937E;
+	mov.f32 	%f359, 0f3F20D842;
+	mov.f32 	%f366, %f17;
+	@%p2 bra 	$L__BB0_2;
+	.loc	1 0 23
+	mov.f32 	%f365, 0f38B1E96A;
+	mov.f32 	%f364, 0fBA574D20;
+	mov.f32 	%f363, 0f3BAAD5EA;
+	mov.f32 	%f362, 0fBCDC1BE7;
+	mov.f32 	%f361, 0f3DE718AF;
+	mov.f32 	%f360, 0fBEC093AC;
+	mov.f32 	%f359, 0f3E0375D3;
+	.loc	1 30 23
+	mul.f32 	%f366, %f9, %f9;
+$L__BB0_2:
+	.loc	1 0 0
+	cvt.f32.bf16 %r8, %rs3;
+	mul.f32 	%f10, %f2, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p3, %f17, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f135, %f365, %f366, %f364;
+	fma.rn.ftz.f32 	%f136, %f135, %f366, %f363;
+	fma.rn.ftz.f32 	%f137, %f136, %f366, %f362;
+	fma.rn.ftz.f32 	%f138, %f137, %f366, %f361;
+	fma.rn.ftz.f32 	%f139, %f138, %f366, %f360;
+	fma.rn.ftz.f32 	%f140, %f139, %f366, %f359;
+	neg.f32 	%f141, %f366;
+	selp.f32 	%f142, %f141, %f9, %p2;
+	fma.rn.ftz.f32 	%f367, %f140, %f142, %f142;
+	mov.f32 	%f358, 0f3F800000;
+	@%p3 bra 	$L__BB0_4;
+	ex2.approx.ftz.f32 	%f143, %f367;
+	sub.f32 	%f145, %f358, %f143;
+	mov.b32 	%r19, %f145;
+	mov.b32 	%r20, %f9;
+	and.b32  	%r21, %r20, -2147483648;
+	or.b32  	%r22, %r21, %r19;
+	mov.b32 	%f367, %r22;
+$L__BB0_4:
+	.loc	1 0 0
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
+	mov.b32 	%f3, %r8;
+	.loc	1 30 23
+	abs.ftz.f32 	%f30, %f10;
+	setp.ge.f32 	%p5, %f30, 0f3F8060FE;
+	mov.f32 	%f374, 0f3789CA3C;
+	mov.f32 	%f373, 0fB9F560B9;
+	mov.f32 	%f372, 0f3BAC840B;
+	mov.f32 	%f371, 0fBD0C8162;
+	mov.f32 	%f370, 0f3E1CF906;
+	mov.f32 	%f369, 0f3F6A937E;
+	mov.f32 	%f368, 0f3F20D842;
+	mov.f32 	%f375, %f30;
+	@%p5 bra 	$L__BB0_6;
+	mul.f32 	%f375, %f10, %f10;
+	mov.f32 	%f374, 0f38B1E96A;
+	mov.f32 	%f373, 0fBA574D20;
+	mov.f32 	%f372, 0f3BAAD5EA;
+	mov.f32 	%f371, 0fBCDC1BE7;
+	mov.f32 	%f370, 0f3DE718AF;
+	mov.f32 	%f369, 0fBEC093AC;
+	mov.f32 	%f368, 0f3E0375D3;
+$L__BB0_6:
+	.loc	1 0 0
+	cvt.f32.bf16 %r9, %rs4;
+	mul.f32 	%f11, %f3, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p6, %f30, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f160, %f374, %f375, %f373;
+	fma.rn.ftz.f32 	%f161, %f160, %f375, %f372;
+	fma.rn.ftz.f32 	%f162, %f161, %f375, %f371;
+	fma.rn.ftz.f32 	%f163, %f162, %f375, %f370;
+	fma.rn.ftz.f32 	%f164, %f163, %f375, %f369;
+	fma.rn.ftz.f32 	%f165, %f164, %f375, %f368;
+	neg.f32 	%f166, %f375;
+	selp.f32 	%f167, %f166, %f10, %p5;
+	fma.rn.ftz.f32 	%f376, %f165, %f167, %f167;
+	@%p6 bra 	$L__BB0_8;
+	ex2.approx.ftz.f32 	%f168, %f376;
+	sub.f32 	%f170, %f358, %f168;
+	mov.b32 	%r23, %f170;
+	mov.b32 	%r24, %f10;
+	and.b32  	%r25, %r24, -2147483648;
+	or.b32  	%r26, %r25, %r23;
+	mov.b32 	%f376, %r26;
+$L__BB0_8:
+	.loc	1 0 0
+	cvt.u16.u32 	%rs5, %r4;
+	mov.b32 	%f4, %r9;
+	.loc	1 30 23
+	abs.ftz.f32 	%f43, %f11;
+	setp.ge.f32 	%p8, %f43, 0f3F8060FE;
+	mov.f32 	%f383, 0f3789CA3C;
+	mov.f32 	%f382, 0fB9F560B9;
+	mov.f32 	%f381, 0f3BAC840B;
+	mov.f32 	%f380, 0fBD0C8162;
+	mov.f32 	%f379, 0f3E1CF906;
+	mov.f32 	%f378, 0f3F6A937E;
+	mov.f32 	%f377, 0f3F20D842;
+	mov.f32 	%f384, %f43;
+	@%p8 bra 	$L__BB0_10;
+	mul.f32 	%f384, %f11, %f11;
+	mov.f32 	%f383, 0f38B1E96A;
+	mov.f32 	%f382, 0fBA574D20;
+	mov.f32 	%f381, 0f3BAAD5EA;
+	mov.f32 	%f380, 0fBCDC1BE7;
+	mov.f32 	%f379, 0f3DE718AF;
+	mov.f32 	%f378, 0fBEC093AC;
+	mov.f32 	%f377, 0f3E0375D3;
+$L__BB0_10:
+	.loc	1 0 0
+	cvt.f32.bf16 %r10, %rs5;
+	mul.f32 	%f12, %f4, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p9, %f43, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f185, %f383, %f384, %f382;
+	fma.rn.ftz.f32 	%f186, %f185, %f384, %f381;
+	fma.rn.ftz.f32 	%f187, %f186, %f384, %f380;
+	fma.rn.ftz.f32 	%f188, %f187, %f384, %f379;
+	fma.rn.ftz.f32 	%f189, %f188, %f384, %f378;
+	fma.rn.ftz.f32 	%f190, %f189, %f384, %f377;
+	neg.f32 	%f191, %f384;
+	selp.f32 	%f192, %f191, %f11, %p8;
+	fma.rn.ftz.f32 	%f385, %f190, %f192, %f192;
+	@%p9 bra 	$L__BB0_12;
+	ex2.approx.ftz.f32 	%f193, %f385;
+	sub.f32 	%f195, %f358, %f193;
+	mov.b32 	%r27, %f195;
+	mov.b32 	%r28, %f11;
+	and.b32  	%r29, %r28, -2147483648;
+	or.b32  	%r30, %r29, %r27;
+	mov.b32 	%f385, %r30;
+$L__BB0_12:
+	.loc	1 0 0
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
+	mov.b32 	%f5, %r10;
+	.loc	1 30 23
+	abs.ftz.f32 	%f56, %f12;
+	setp.ge.f32 	%p11, %f56, 0f3F8060FE;
+	mov.f32 	%f392, 0f3789CA3C;
+	mov.f32 	%f391, 0fB9F560B9;
+	mov.f32 	%f390, 0f3BAC840B;
+	mov.f32 	%f389, 0fBD0C8162;
+	mov.f32 	%f388, 0f3E1CF906;
+	mov.f32 	%f387, 0f3F6A937E;
+	mov.f32 	%f386, 0f3F20D842;
+	mov.f32 	%f393, %f56;
+	@%p11 bra 	$L__BB0_14;
+	mul.f32 	%f393, %f12, %f12;
+	mov.f32 	%f392, 0f38B1E96A;
+	mov.f32 	%f391, 0fBA574D20;
+	mov.f32 	%f390, 0f3BAAD5EA;
+	mov.f32 	%f389, 0fBCDC1BE7;
+	mov.f32 	%f388, 0f3DE718AF;
+	mov.f32 	%f387, 0fBEC093AC;
+	mov.f32 	%f386, 0f3E0375D3;
+$L__BB0_14:
+	.loc	1 0 0
+	cvt.f32.bf16 %r11, %rs6;
+	mul.f32 	%f13, %f5, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p12, %f56, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f210, %f392, %f393, %f391;
+	fma.rn.ftz.f32 	%f211, %f210, %f393, %f390;
+	fma.rn.ftz.f32 	%f212, %f211, %f393, %f389;
+	fma.rn.ftz.f32 	%f213, %f212, %f393, %f388;
+	fma.rn.ftz.f32 	%f214, %f213, %f393, %f387;
+	fma.rn.ftz.f32 	%f215, %f214, %f393, %f386;
+	neg.f32 	%f216, %f393;
+	selp.f32 	%f217, %f216, %f12, %p11;
+	fma.rn.ftz.f32 	%f394, %f215, %f217, %f217;
+	@%p12 bra 	$L__BB0_16;
+	ex2.approx.ftz.f32 	%f218, %f394;
+	sub.f32 	%f220, %f358, %f218;
+	mov.b32 	%r31, %f220;
+	mov.b32 	%r32, %f12;
+	and.b32  	%r33, %r32, -2147483648;
+	or.b32  	%r34, %r33, %r31;
+	mov.b32 	%f394, %r34;
+$L__BB0_16:
+	.loc	1 0 0
+	cvt.u16.u32 	%rs7, %r5;
+	mov.b32 	%f6, %r11;
+	.loc	1 30 23
+	abs.ftz.f32 	%f69, %f13;
+	setp.ge.f32 	%p14, %f69, 0f3F8060FE;
+	mov.f32 	%f401, 0f3789CA3C;
+	mov.f32 	%f400, 0fB9F560B9;
+	mov.f32 	%f399, 0f3BAC840B;
+	mov.f32 	%f398, 0fBD0C8162;
+	mov.f32 	%f397, 0f3E1CF906;
+	mov.f32 	%f396, 0f3F6A937E;
+	mov.f32 	%f395, 0f3F20D842;
+	mov.f32 	%f402, %f69;
+	@%p14 bra 	$L__BB0_18;
+	mul.f32 	%f402, %f13, %f13;
+	mov.f32 	%f401, 0f38B1E96A;
+	mov.f32 	%f400, 0fBA574D20;
+	mov.f32 	%f399, 0f3BAAD5EA;
+	mov.f32 	%f398, 0fBCDC1BE7;
+	mov.f32 	%f397, 0f3DE718AF;
+	mov.f32 	%f396, 0fBEC093AC;
+	mov.f32 	%f395, 0f3E0375D3;
+$L__BB0_18:
+	.loc	1 0 0
+	cvt.f32.bf16 %r12, %rs7;
+	mul.f32 	%f14, %f6, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p15, %f69, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f235, %f401, %f402, %f400;
+	fma.rn.ftz.f32 	%f236, %f235, %f402, %f399;
+	fma.rn.ftz.f32 	%f237, %f236, %f402, %f398;
+	fma.rn.ftz.f32 	%f238, %f237, %f402, %f397;
+	fma.rn.ftz.f32 	%f239, %f238, %f402, %f396;
+	fma.rn.ftz.f32 	%f240, %f239, %f402, %f395;
+	neg.f32 	%f241, %f402;
+	selp.f32 	%f242, %f241, %f13, %p14;
+	fma.rn.ftz.f32 	%f403, %f240, %f242, %f242;
+	@%p15 bra 	$L__BB0_20;
+	ex2.approx.ftz.f32 	%f243, %f403;
+	sub.f32 	%f245, %f358, %f243;
+	mov.b32 	%r35, %f245;
+	mov.b32 	%r36, %f13;
+	and.b32  	%r37, %r36, -2147483648;
+	or.b32  	%r38, %r37, %r35;
+	mov.b32 	%f403, %r38;
+$L__BB0_20:
+	.loc	1 0 0
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
+	mov.b32 	%f7, %r12;
+	.loc	1 30 23
+	abs.ftz.f32 	%f82, %f14;
+	setp.ge.f32 	%p17, %f82, 0f3F8060FE;
+	mov.f32 	%f410, 0f3789CA3C;
+	mov.f32 	%f409, 0fB9F560B9;
+	mov.f32 	%f408, 0f3BAC840B;
+	mov.f32 	%f407, 0fBD0C8162;
+	mov.f32 	%f406, 0f3E1CF906;
+	mov.f32 	%f405, 0f3F6A937E;
+	mov.f32 	%f404, 0f3F20D842;
+	mov.f32 	%f411, %f82;
+	@%p17 bra 	$L__BB0_22;
+	mul.f32 	%f411, %f14, %f14;
+	mov.f32 	%f410, 0f38B1E96A;
+	mov.f32 	%f409, 0fBA574D20;
+	mov.f32 	%f408, 0f3BAAD5EA;
+	mov.f32 	%f407, 0fBCDC1BE7;
+	mov.f32 	%f406, 0f3DE718AF;
+	mov.f32 	%f405, 0fBEC093AC;
+	mov.f32 	%f404, 0f3E0375D3;
+$L__BB0_22:
+	.loc	1 0 0
+	cvt.f32.bf16 %r13, %rs8;
+	mul.f32 	%f15, %f7, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p18, %f82, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f260, %f410, %f411, %f409;
+	fma.rn.ftz.f32 	%f261, %f260, %f411, %f408;
+	fma.rn.ftz.f32 	%f262, %f261, %f411, %f407;
+	fma.rn.ftz.f32 	%f263, %f262, %f411, %f406;
+	fma.rn.ftz.f32 	%f264, %f263, %f411, %f405;
+	fma.rn.ftz.f32 	%f265, %f264, %f411, %f404;
+	neg.f32 	%f266, %f411;
+	selp.f32 	%f267, %f266, %f14, %p17;
+	fma.rn.ftz.f32 	%f412, %f265, %f267, %f267;
+	@%p18 bra 	$L__BB0_24;
+	ex2.approx.ftz.f32 	%f268, %f412;
+	sub.f32 	%f270, %f358, %f268;
+	mov.b32 	%r39, %f270;
+	mov.b32 	%r40, %f14;
+	and.b32  	%r41, %r40, -2147483648;
+	or.b32  	%r42, %r41, %r39;
+	mov.b32 	%f412, %r42;
+$L__BB0_24:
+	.loc	1 0 0
+	mov.b32 	%f8, %r13;
+	.loc	1 30 23
+	abs.ftz.f32 	%f95, %f15;
+	setp.ge.f32 	%p20, %f95, 0f3F8060FE;
+	mov.f32 	%f419, 0f3789CA3C;
+	mov.f32 	%f418, 0fB9F560B9;
+	mov.f32 	%f417, 0f3BAC840B;
+	mov.f32 	%f416, 0fBD0C8162;
+	mov.f32 	%f415, 0f3E1CF906;
+	mov.f32 	%f414, 0f3F6A937E;
+	mov.f32 	%f413, 0f3F20D842;
+	mov.f32 	%f420, %f95;
+	@%p20 bra 	$L__BB0_26;
+	mul.f32 	%f420, %f15, %f15;
+	mov.f32 	%f419, 0f38B1E96A;
+	mov.f32 	%f418, 0fBA574D20;
+	mov.f32 	%f417, 0f3BAAD5EA;
+	mov.f32 	%f416, 0fBCDC1BE7;
+	mov.f32 	%f415, 0f3DE718AF;
+	mov.f32 	%f414, 0fBEC093AC;
+	mov.f32 	%f413, 0f3E0375D3;
+$L__BB0_26:
+	.loc	1 0 0
+	mul.f32 	%f16, %f8, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p21, %f95, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f285, %f419, %f420, %f418;
+	fma.rn.ftz.f32 	%f286, %f285, %f420, %f417;
+	fma.rn.ftz.f32 	%f287, %f286, %f420, %f416;
+	fma.rn.ftz.f32 	%f288, %f287, %f420, %f415;
+	fma.rn.ftz.f32 	%f289, %f288, %f420, %f414;
+	fma.rn.ftz.f32 	%f290, %f289, %f420, %f413;
+	neg.f32 	%f291, %f420;
+	selp.f32 	%f292, %f291, %f15, %p20;
+	fma.rn.ftz.f32 	%f421, %f290, %f292, %f292;
+	@%p21 bra 	$L__BB0_28;
+	ex2.approx.ftz.f32 	%f293, %f421;
+	sub.f32 	%f295, %f358, %f293;
+	mov.b32 	%r43, %f295;
+	mov.b32 	%r44, %f15;
+	and.b32  	%r45, %r44, -2147483648;
+	or.b32  	%r46, %r45, %r43;
+	mov.b32 	%f421, %r46;
+$L__BB0_28:
+	abs.ftz.f32 	%f108, %f16;
+	setp.ge.f32 	%p23, %f108, 0f3F8060FE;
+	mov.f32 	%f428, 0f3789CA3C;
+	mov.f32 	%f427, 0fB9F560B9;
+	mov.f32 	%f426, 0f3BAC840B;
+	mov.f32 	%f425, 0fBD0C8162;
+	mov.f32 	%f424, 0f3E1CF906;
+	mov.f32 	%f423, 0f3F6A937E;
+	mov.f32 	%f422, 0f3F20D842;
+	mov.f32 	%f429, %f108;
+	@%p23 bra 	$L__BB0_30;
+	mul.f32 	%f429, %f16, %f16;
+	mov.f32 	%f428, 0f38B1E96A;
+	mov.f32 	%f427, 0fBA574D20;
+	mov.f32 	%f426, 0f3BAAD5EA;
+	mov.f32 	%f425, 0fBCDC1BE7;
+	mov.f32 	%f424, 0f3DE718AF;
+	mov.f32 	%f423, 0fBEC093AC;
+	mov.f32 	%f422, 0f3E0375D3;
+$L__BB0_30:
+	.loc	1 0 23
+	ld.param.u64 	%rd2, [triton__0d1d2de_param_1];
+	cvt.s64.s32 	%rd1, %r18;
+	.loc	1 30 23
+	setp.ltu.f32 	%p24, %f108, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f310, %f428, %f429, %f427;
+	fma.rn.ftz.f32 	%f311, %f310, %f429, %f426;
+	fma.rn.ftz.f32 	%f312, %f311, %f429, %f425;
+	fma.rn.ftz.f32 	%f313, %f312, %f429, %f424;
+	fma.rn.ftz.f32 	%f314, %f313, %f429, %f423;
+	fma.rn.ftz.f32 	%f315, %f314, %f429, %f422;
+	neg.f32 	%f316, %f429;
+	selp.f32 	%f317, %f316, %f16, %p23;
+	fma.rn.ftz.f32 	%f430, %f315, %f317, %f317;
+	@%p24 bra 	$L__BB0_32;
+	ex2.approx.ftz.f32 	%f318, %f430;
+	sub.f32 	%f320, %f358, %f318;
+	mov.b32 	%r47, %f320;
+	mov.b32 	%r48, %f16;
+	and.b32  	%r49, %r48, -2147483648;
+	or.b32  	%r50, %r49, %r47;
+	mov.b32 	%f430, %r50;
+$L__BB0_32:
+	.loc	1 27 18
+	mul.f32 	%f321, %f8, 0f3F000000;
+	mul.f32 	%f322, %f7, 0f3F000000;
+	mul.f32 	%f323, %f6, 0f3F000000;
+	mul.f32 	%f324, %f5, 0f3F000000;
+	mul.f32 	%f325, %f4, 0f3F000000;
+	mul.f32 	%f326, %f3, 0f3F000000;
+	mul.f32 	%f327, %f2, 0f3F000000;
+	mul.f32 	%f328, %f1, 0f3F000000;
+	.loc	1 32 18
+	add.f32 	%f329, %f367, 0f3F800000;
+	add.f32 	%f330, %f376, 0f3F800000;
+	add.f32 	%f331, %f385, 0f3F800000;
+	add.f32 	%f332, %f394, 0f3F800000;
+	add.f32 	%f333, %f403, 0f3F800000;
+	add.f32 	%f334, %f412, 0f3F800000;
+	add.f32 	%f335, %f421, 0f3F800000;
+	add.f32 	%f336, %f430, 0f3F800000;
+	.loc	1 33 18
+	mul.f32 	%f337, %f328, %f329;
+	mul.f32 	%f338, %f327, %f330;
+	mul.f32 	%f339, %f326, %f331;
+	mul.f32 	%f340, %f325, %f332;
+	mul.f32 	%f341, %f324, %f333;
+	mul.f32 	%f342, %f323, %f334;
+	mul.f32 	%f343, %f322, %f335;
+	mul.f32 	%f344, %f321, %f336;
+	.loc	1 35 25
+	shl.b64 	%rd7, %rd1, 1;
+	add.s64 	%rd6, %rd2, %rd7;
+	.loc	1 35 37
+	mov.b32 	%r51, %f337;
+	cvt.rn.bf16.f32 %rs9, %r51;
+	mov.b32 	%r52, %f338;
+	cvt.rn.bf16.f32 %rs10, %r52;
+	mov.b32 	%r53, %f339;
+	cvt.rn.bf16.f32 %rs11, %r53;
+	mov.b32 	%r54, %f340;
+	cvt.rn.bf16.f32 %rs12, %r54;
+	mov.b32 	%r55, %f341;
+	cvt.rn.bf16.f32 %rs13, %r55;
+	mov.b32 	%r56, %f342;
+	cvt.rn.bf16.f32 %rs14, %r56;
+	mov.b32 	%r57, %f343;
+	cvt.rn.bf16.f32 %rs15, %r57;
+	mov.b32 	%r58, %f344;
+	cvt.rn.bf16.f32 %rs16, %r58;
+	mov.b32 	%r63, {%rs9, %rs10};
+	mov.b32 	%r64, {%rs11, %rs12};
+	mov.b32 	%r65, {%rs13, %rs14};
+	mov.b32 	%r66, {%rs15, %rs16};
+	@%p1 st.global.v4.b32 [ %rd6 + 0 ], { %r63, %r64, %r65, %r66 };
+	.loc	1 35 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+
+}
+	// .globl	__nv_erff
+.visible .func  (.param .b32 func_retval0) __nv_erff(
+	.param .b32 __nv_erff_param_0
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<5>;
+	.reg .f32 	%f<49>;
+$L__func_begin1:
+
+	ld.param.f32 	%f14, [__nv_erff_param_0];
+	abs.ftz.f32 	%f1, %f14;
+	setp.ge.f32 	%p1, %f1, 0f3F8060FE;
+	mov.f32 	%f46, 0f3789CA3C;
+	mov.f32 	%f45, 0fB9F560B9;
+	mov.f32 	%f44, 0f3BAC840B;
+	mov.f32 	%f43, 0fBD0C8162;
+	mov.f32 	%f42, 0f3E1CF906;
+	mov.f32 	%f41, 0f3F6A937E;
+	mov.f32 	%f40, 0f3F20D842;
+	mov.f32 	%f47, %f1;
+	@%p1 bra 	$L__BB1_2;
+	mul.f32 	%f47, %f14, %f14;
+	mov.f32 	%f46, 0f38B1E96A;
+	mov.f32 	%f45, 0fBA574D20;
+	mov.f32 	%f44, 0f3BAAD5EA;
+	mov.f32 	%f43, 0fBCDC1BE7;
+	mov.f32 	%f42, 0f3DE718AF;
+	mov.f32 	%f41, 0fBEC093AC;
+	mov.f32 	%f40, 0f3E0375D3;
+$L__BB1_2:
+	setp.ltu.f32 	%p2, %f1, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f29, %f46, %f47, %f45;
+	fma.rn.ftz.f32 	%f30, %f29, %f47, %f44;
+	fma.rn.ftz.f32 	%f31, %f30, %f47, %f43;
+	fma.rn.ftz.f32 	%f32, %f31, %f47, %f42;
+	fma.rn.ftz.f32 	%f33, %f32, %f47, %f41;
+	fma.rn.ftz.f32 	%f34, %f33, %f47, %f40;
+	neg.f32 	%f35, %f47;
+	selp.f32 	%f36, %f35, %f14, %p1;
+	fma.rn.ftz.f32 	%f48, %f34, %f36, %f36;
+	@%p2 bra 	$L__BB1_4;
+	ex2.approx.ftz.f32 	%f37, %f48;
+	mov.f32 	%f38, 0f3F800000;
+	sub.f32 	%f39, %f38, %f37;
+	mov.b32 	%r1, %f39;
+	mov.b32 	%r2, %f14;
+	and.b32  	%r3, %r2, -2147483648;
+	or.b32  	%r4, %r3, %r1;
+	mov.b32 	%f48, %r4;
+$L__BB1_4:
+	st.param.f32 	[func_retval0+0], %f48;
+	ret;
+$L__func_end1:
+
+}
+	.file	1 "/tmp/torchinductor_root/jf/cjfoqo3nutni5cmtw4brla34cz45fusadehkxfkr2fie2qgo7vwt.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 176
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 106
+.b8 102
+.b8 111
+.b8 113
+.b8 111
+.b8 51
+.b8 110
+.b8 117
+.b8 116
+.b8 110
+.b8 105
+.b8 53
+.b8 99
+.b8 109
+.b8 116
+.b8 119
+.b8 52
+.b8 98
+.b8 114
+.b8 108
+.b8 97
+.b8 51
+.b8 52
+.b8 99
+.b8 122
+.b8 52
+.b8 53
+.b8 102
+.b8 117
+.b8 115
+.b8 97
+.b8 100
+.b8 101
+.b8 104
+.b8 107
+.b8 120
+.b8 102
+.b8 107
+.b8 114
+.b8 50
+.b8 102
+.b8 105
+.b8 101
+.b8 50
+.b8 113
+.b8 103
+.b8 111
+.b8 55
+.b8 118
+.b8 119
+.b8 116
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 106
+.b8 102
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ttir b/.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..79d21fa82a44df2150549490d783a08bf37e14f5
--- /dev/null
+++ b/.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ttir
@@ -0,0 +1,27 @@
+module {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<1.000000e+00> : tensor<1024xf32>
+    %cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32>
+    %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32>
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
+    %4 = arith.addi %3, %2 : tensor<1024xi32>
+    %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
+    %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
+    %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
+    %9 = arith.mulf %8, %cst_1 : tensor<1024xf32>
+    %10 = arith.mulf %8, %cst_0 : tensor<1024xf32>
+    %11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32>
+    %12 = arith.addf %11, %cst : tensor<1024xf32>
+    %13 = arith.mulf %9, %12 : tensor<1024xf32>
+    %14 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
+    %15 = tt.addptr %14, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
+    %16 = arith.truncf %13 : tensor<1024xf32> to tensor<1024xbf16>
+    tt.store %15, %16 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
+    tt.return
+  }
+}
diff --git a/.triton/dump/51e329eae41e4ee17aa201fff8371d94/triton_.llir b/.triton/dump/51e329eae41e4ee17aa201fff8371d94/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..f67711bc6dcda8ca57c15c39f8f5a9551df2835d
--- /dev/null
+++ b/.triton/dump/51e329eae41e4ee17aa201fff8371d94/triton_.llir
@@ -0,0 +1,1473 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
+@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
+@assertMessage_0 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp11 < 50257"
+@global_smem = external addrspace(3) global [0 x i8]
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
+
+define void @triton__0d1d2d3d4d5d6e7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i64 %6, i64 %7) local_unnamed_addr !dbg !7 {
+  %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %urem = and i32 %9, 255, !dbg !10
+  %10 = or i32 %urem, 256, !dbg !10
+  %11 = or i32 %urem, 512, !dbg !10
+  %12 = or i32 %urem, 768, !dbg !10
+  %13 = shl nuw nsw i32 %urem, 2, !dbg !10
+  %14 = or i32 %13, 1, !dbg !10
+  %15 = or i32 %13, 2, !dbg !10
+  %16 = or i32 %13, 3, !dbg !10
+  %17 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #5, !dbg !11
+  %18 = sext i32 %17 to i64, !dbg !12
+  %19 = icmp slt i32 %17, 8, !dbg !13
+  %20 = mul nsw i64 %18, 7680, !dbg !14
+  %21 = mul nsw i64 %18, 385973760, !dbg !15
+  %22 = getelementptr i64, ptr addrspace(1) %0, i64 %20
+  %23 = shl nuw nsw i32 %urem, 3
+  %24 = zext nneg i32 %23 to i64
+  %25 = getelementptr float, ptr addrspace(3) @global_smem, i64 %24
+  %26 = shl nuw nsw i32 %14, 1
+  %27 = zext nneg i32 %26 to i64
+  %28 = getelementptr float, ptr addrspace(3) @global_smem, i64 %27
+  %29 = shl nuw nsw i32 %15, 1
+  %30 = zext nneg i32 %29 to i64
+  %31 = getelementptr float, ptr addrspace(3) @global_smem, i64 %30
+  %32 = shl nuw nsw i32 %16, 1
+  %33 = zext nneg i32 %32 to i64
+  %34 = getelementptr float, ptr addrspace(3) @global_smem, i64 %33
+  %35 = shl nuw nsw i32 %urem, 1
+  %36 = zext nneg i32 %35 to i64
+  %37 = getelementptr float, ptr addrspace(3) @global_smem, i64 %36
+  %38 = shl nuw nsw i32 %10, 1
+  %39 = zext nneg i32 %38 to i64
+  %40 = getelementptr float, ptr addrspace(3) @global_smem, i64 %39
+  %41 = shl nuw nsw i32 %11, 1
+  %42 = zext nneg i32 %41 to i64
+  %43 = getelementptr float, ptr addrspace(3) @global_smem, i64 %42
+  %44 = shl nuw nsw i32 %12, 1
+  %45 = zext nneg i32 %44 to i64
+  %46 = getelementptr float, ptr addrspace(3) @global_smem, i64 %45
+  %47 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %24
+  %48 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %27
+  %49 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %30
+  %50 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %33
+  %51 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %36
+  %52 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %39
+  %53 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %42
+  %54 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %45
+  %55 = zext nneg i32 %13 to i64
+  %56 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %55
+  %57 = zext nneg i32 %urem to i64
+  %58 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %57
+  %59 = zext nneg i32 %10 to i64
+  %60 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %59
+  %61 = zext nneg i32 %11 to i64
+  %62 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %61
+  %63 = zext nneg i32 %12 to i64
+  %64 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %63
+  %65 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %24
+  %66 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %27
+  %67 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %30
+  %68 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %33
+  %69 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %36
+  %70 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %39
+  %71 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %42
+  %72 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %45
+  %73 = insertelement <8 x i1> poison, i1 %19, i64 0, !dbg !16
+  br label %74, !dbg !17
+
+74:                                               ; preds = %8, %__nv_logf.exit239
+  %75 = phi i32 [ 0, %8 ], [ %774, %__nv_logf.exit239 ]
+  %76 = phi <8 x float> [ zeroinitializer, %8 ], [ %773, %__nv_logf.exit239 ]
+  %77 = phi <8 x i64> [ zeroinitializer, %8 ], [ %211, %__nv_logf.exit239 ]
+  %78 = or i32 %75, 1536, !dbg !18
+  %79 = or i32 %75, %13, !dbg !18
+  %80 = zext nneg i32 %79 to i64, !dbg !18
+  %81 = or i32 %75, %14, !dbg !18
+  %82 = zext nneg i32 %81 to i64, !dbg !18
+  %83 = or i32 %75, %15, !dbg !18
+  %84 = zext nneg i32 %83 to i64, !dbg !18
+  %85 = or i32 %75, %16, !dbg !18
+  %86 = zext nneg i32 %85 to i64, !dbg !18
+  %87 = or i32 %79, 1024, !dbg !18
+  %88 = zext nneg i32 %87 to i64, !dbg !18
+  %89 = or i32 %79, 1025, !dbg !18
+  %90 = zext nneg i32 %89 to i64, !dbg !18
+  %91 = or i32 %79, 1026, !dbg !18
+  %92 = zext nneg i32 %91 to i64, !dbg !18
+  %93 = or i32 %79, 1027, !dbg !18
+  %94 = zext nneg i32 %93 to i64, !dbg !18
+  %95 = icmp ult i32 %78, 7680, !dbg !19
+  %96 = icmp ult i32 %87, 7680, !dbg !19
+  %97 = add nsw i64 %20, %80, !dbg !20
+  %98 = add nsw i64 %20, %88, !dbg !20
+  %99 = getelementptr i64, ptr addrspace(1) %0, i64 %97, !dbg !21
+  %100 = getelementptr i64, ptr addrspace(1) %22, i64 %84, !dbg !21
+  %101 = getelementptr i64, ptr addrspace(1) %0, i64 %98, !dbg !21
+  %102 = getelementptr i64, ptr addrspace(1) %22, i64 %92, !dbg !21
+  %103 = and i1 %19, %95, !dbg !22
+  %104 = and i1 %19, %96, !dbg !22
+  %105 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b64 { $0, $1 }, [ $2 + 0 ];\0A\09@!$4 mov.u64 $0, 0x0;\0A\09@!$5 mov.u64 $1, 0x0;", "=l,=l,l,b,b,b"(ptr addrspace(1) %99, i1 %19, i1 %19, i1 %19) #5, !dbg !23
+  %106 = extractvalue { i64, i64 } %105, 0, !dbg !23
+  %107 = extractvalue { i64, i64 } %105, 1, !dbg !23
+  %108 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b64 { $0, $1 }, [ $2 + 0 ];\0A\09@!$4 mov.u64 $0, 0x0;\0A\09@!$5 mov.u64 $1, 0x0;", "=l,=l,l,b,b,b"(ptr addrspace(1) %100, i1 %19, i1 %19, i1 %19) #5, !dbg !23
+  %109 = extractvalue { i64, i64 } %108, 0, !dbg !23
+  %110 = extractvalue { i64, i64 } %108, 1, !dbg !23
+  %111 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b64 { $0, $1 }, [ $2 + 0 ];\0A\09@!$4 mov.u64 $0, 0x0;\0A\09@!$5 mov.u64 $1, 0x0;", "=l,=l,l,b,b,b"(ptr addrspace(1) %101, i1 %104, i1 %104, i1 %104) #5, !dbg !23
+  %112 = extractvalue { i64, i64 } %111, 0, !dbg !23
+  %113 = extractvalue { i64, i64 } %111, 1, !dbg !23
+  %114 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b64 { $0, $1 }, [ $2 + 0 ];\0A\09@!$4 mov.u64 $0, 0x0;\0A\09@!$5 mov.u64 $1, 0x0;", "=l,=l,l,b,b,b"(ptr addrspace(1) %102, i1 %104, i1 %104, i1 %104) #5, !dbg !23
+  %115 = extractvalue { i64, i64 } %114, 0, !dbg !23
+  %116 = extractvalue { i64, i64 } %114, 1, !dbg !23
+  %117 = getelementptr float, ptr addrspace(1) %2, i64 %97, !dbg !24
+  %118 = getelementptr float, ptr addrspace(1) %2, i64 %98, !dbg !24
+  %119 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %117, i1 %19, i32 0, i1 %19, i32 0, i1 %19, i32 0, i1 %19, i32 0, i1 %19) #5, !dbg !25
+  %120 = extractvalue { i32, i32, i32, i32 } %119, 0, !dbg !25
+  %121 = extractvalue { i32, i32, i32, i32 } %119, 1, !dbg !25
+  %122 = extractvalue { i32, i32, i32, i32 } %119, 2, !dbg !25
+  %123 = extractvalue { i32, i32, i32, i32 } %119, 3, !dbg !25
+  %124 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %118, i1 %104, i32 0, i1 %104, i32 0, i1 %104, i32 0, i1 %104, i32 0, i1 %104) #5, !dbg !25
+  %125 = extractvalue { i32, i32, i32, i32 } %124, 0, !dbg !25
+  %126 = extractvalue { i32, i32, i32, i32 } %124, 1, !dbg !25
+  %127 = extractvalue { i32, i32, i32, i32 } %124, 2, !dbg !25
+  %128 = extractvalue { i32, i32, i32, i32 } %124, 3, !dbg !25
+  tail call void @llvm.nvvm.barrier0(), !dbg !25
+  store i32 %120, ptr addrspace(3) %25, align 4, !dbg !25
+  store i32 %121, ptr addrspace(3) %28, align 4, !dbg !25
+  store i32 %122, ptr addrspace(3) %31, align 4, !dbg !25
+  store i32 %123, ptr addrspace(3) %34, align 4, !dbg !25
+  tail call void @llvm.nvvm.barrier0(), !dbg !25
+  %129 = load float, ptr addrspace(3) %37, align 4, !dbg !25
+  %130 = load float, ptr addrspace(3) %40, align 4, !dbg !25
+  %131 = load float, ptr addrspace(3) %43, align 4, !dbg !25
+  %132 = load float, ptr addrspace(3) %46, align 4, !dbg !25
+  tail call void @llvm.nvvm.barrier0(), !dbg !25
+  store i32 %125, ptr addrspace(3) %25, align 4, !dbg !25
+  store i32 %126, ptr addrspace(3) %28, align 4, !dbg !25
+  store i32 %127, ptr addrspace(3) %31, align 4, !dbg !25
+  store i32 %128, ptr addrspace(3) %34, align 4, !dbg !25
+  tail call void @llvm.nvvm.barrier0(), !dbg !25
+  %133 = load float, ptr addrspace(3) %37, align 4, !dbg !25
+  %134 = load float, ptr addrspace(3) %40, align 4, !dbg !25
+  %135 = load float, ptr addrspace(3) %43, align 4, !dbg !25
+  %136 = load float, ptr addrspace(3) %46, align 4, !dbg !25
+  %137 = getelementptr float, ptr addrspace(1) %3, i64 %97, !dbg !26
+  %138 = getelementptr float, ptr addrspace(1) %3, i64 %98, !dbg !26
+  %139 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %137, i1 %19, i32 0, i1 %19, i32 0, i1 %19, i32 0, i1 %19, i32 0, i1 %19) #5, !dbg !27
+  %140 = extractvalue { i32, i32, i32, i32 } %139, 0, !dbg !27
+  %141 = extractvalue { i32, i32, i32, i32 } %139, 1, !dbg !27
+  %142 = extractvalue { i32, i32, i32, i32 } %139, 2, !dbg !27
+  %143 = extractvalue { i32, i32, i32, i32 } %139, 3, !dbg !27
+  %144 = bitcast i32 %140 to float, !dbg !27
+  %145 = bitcast i32 %141 to float, !dbg !27
+  %146 = bitcast i32 %142 to float, !dbg !27
+  %147 = bitcast i32 %143 to float, !dbg !27
+  %148 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %138, i1 %104, i32 0, i1 %104, i32 0, i1 %104, i32 0, i1 %104, i32 0, i1 %104) #5, !dbg !27
+  %149 = extractvalue { i32, i32, i32, i32 } %148, 0, !dbg !27
+  %150 = extractvalue { i32, i32, i32, i32 } %148, 1, !dbg !27
+  %151 = extractvalue { i32, i32, i32, i32 } %148, 2, !dbg !27
+  %152 = extractvalue { i32, i32, i32, i32 } %148, 3, !dbg !27
+  %153 = bitcast i32 %149 to float, !dbg !27
+  %154 = bitcast i32 %150 to float, !dbg !27
+  %155 = bitcast i32 %151 to float, !dbg !27
+  %156 = bitcast i32 %152 to float, !dbg !27
+  tail call void @llvm.nvvm.barrier0(), !dbg !28
+  %157 = insertelement <8 x i64> poison, i64 %106, i64 0, !dbg !28
+  %158 = insertelement <8 x i64> %157, i64 %107, i64 1, !dbg !28
+  %159 = insertelement <8 x i64> %158, i64 %109, i64 2, !dbg !28
+  %160 = insertelement <8 x i64> %159, i64 %110, i64 3, !dbg !28
+  %161 = insertelement <8 x i64> %160, i64 %112, i64 4, !dbg !28
+  %162 = insertelement <8 x i64> %161, i64 %113, i64 5, !dbg !28
+  %163 = insertelement <8 x i64> %162, i64 %115, i64 6, !dbg !28
+  %164 = insertelement <8 x i64> %163, i64 %116, i64 7, !dbg !28
+  %165 = icmp ne <8 x i64> %164, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, !dbg !28
+  %166 = extractelement <8 x i1> %165, i64 0, !dbg !29
+  %167 = zext i1 %166 to i8, !dbg !28
+  %168 = insertelement <1 x i8> undef, i8 %167, i64 0, !dbg !28
+  store <1 x i8> %168, ptr addrspace(3) %47, align 1, !dbg !28
+  %169 = extractelement <8 x i1> %165, i64 1, !dbg !29
+  %170 = zext i1 %169 to i8, !dbg !28
+  %171 = insertelement <1 x i8> undef, i8 %170, i64 0, !dbg !28
+  store <1 x i8> %171, ptr addrspace(3) %48, align 1, !dbg !28
+  %172 = extractelement <8 x i1> %165, i64 2, !dbg !29
+  %173 = zext i1 %172 to i8, !dbg !28
+  %174 = insertelement <1 x i8> undef, i8 %173, i64 0, !dbg !28
+  store <1 x i8> %174, ptr addrspace(3) %49, align 1, !dbg !28
+  %175 = extractelement <8 x i1> %165, i64 3, !dbg !29
+  %176 = zext i1 %175 to i8, !dbg !28
+  %177 = insertelement <1 x i8> undef, i8 %176, i64 0, !dbg !28
+  store <1 x i8> %177, ptr addrspace(3) %50, align 1, !dbg !28
+  tail call void @llvm.nvvm.barrier0(), !dbg !28
+  %178 = load i8, ptr addrspace(3) %51, align 1, !dbg !28
+  %179 = load i8, ptr addrspace(3) %52, align 1, !dbg !28
+  %180 = load i8, ptr addrspace(3) %53, align 1, !dbg !28
+  %181 = load i8, ptr addrspace(3) %54, align 1, !dbg !28
+  tail call void @llvm.nvvm.barrier0(), !dbg !28
+  %182 = extractelement <8 x i1> %165, i64 4, !dbg !29
+  %183 = zext i1 %182 to i8, !dbg !28
+  %184 = insertelement <1 x i8> undef, i8 %183, i64 0, !dbg !28
+  store <1 x i8> %184, ptr addrspace(3) %47, align 1, !dbg !28
+  %185 = extractelement <8 x i1> %165, i64 5, !dbg !29
+  %186 = zext i1 %185 to i8, !dbg !28
+  %187 = insertelement <1 x i8> undef, i8 %186, i64 0, !dbg !28
+  store <1 x i8> %187, ptr addrspace(3) %48, align 1, !dbg !28
+  %188 = extractelement <8 x i1> %165, i64 6, !dbg !29
+  %189 = zext i1 %188 to i8, !dbg !28
+  %190 = insertelement <1 x i8> undef, i8 %189, i64 0, !dbg !28
+  store <1 x i8> %190, ptr addrspace(3) %49, align 1, !dbg !28
+  %191 = extractelement <8 x i1> %165, i64 7, !dbg !29
+  %192 = zext i1 %191 to i8, !dbg !28
+  %193 = insertelement <1 x i8> undef, i8 %192, i64 0, !dbg !28
+  store <1 x i8> %193, ptr addrspace(3) %50, align 1, !dbg !28
+  tail call void @llvm.nvvm.barrier0(), !dbg !28
+  %194 = load i8, ptr addrspace(3) %51, align 1, !dbg !28
+  %195 = load i8, ptr addrspace(3) %52, align 1, !dbg !28
+  %196 = load i8, ptr addrspace(3) %53, align 1, !dbg !28
+  %197 = load i8, ptr addrspace(3) %54, align 1, !dbg !28
+  %198 = insertelement <8 x i8> poison, i8 %178, i64 0, !dbg !28
+  %199 = insertelement <8 x i8> %198, i8 %179, i64 1, !dbg !28
+  %200 = insertelement <8 x i8> %199, i8 %180, i64 2, !dbg !28
+  %201 = insertelement <8 x i8> %200, i8 %181, i64 3, !dbg !28
+  %202 = insertelement <8 x i8> %201, i8 %194, i64 4, !dbg !28
+  %203 = insertelement <8 x i8> %202, i8 %195, i64 5, !dbg !28
+  %204 = insertelement <8 x i8> %203, i8 %196, i64 6, !dbg !28
+  %205 = insertelement <8 x i8> %204, i8 %197, i64 7, !dbg !28
+  %206 = icmp eq <8 x i8> %205, zeroinitializer, !dbg !28
+  %207 = insertelement <8 x i1> %73, i1 %104, i64 1, !dbg !30
+  %208 = shufflevector <8 x i1> %207, <8 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>, !dbg !30
+  %209 = select <8 x i1> %208, <8 x i1> %165, <8 x i1> zeroinitializer, !dbg !30
+  %210 = zext <8 x i1> %209 to <8 x i64>, !dbg !30
+  %211 = add <8 x i64> %77, %210, !dbg !30
+  tail call void @llvm.nvvm.barrier0(), !dbg !31
+  %212 = shufflevector <8 x i1> %165, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg !29
+  %213 = insertelement <4 x i64> poison, i64 %106, i64 0, !dbg !29
+  %214 = insertelement <4 x i64> %213, i64 %107, i64 1, !dbg !29
+  %215 = insertelement <4 x i64> %214, i64 %109, i64 2, !dbg !29
+  %216 = insertelement <4 x i64> %215, i64 %110, i64 3, !dbg !29
+  %217 = select <4 x i1> %212, <4 x i64> %216, <4 x i64> zeroinitializer, !dbg !29
+  %218 = add <4 x i64> %217, <i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !32
+  %219 = icmp slt <4 x i64> %217, zeroinitializer, !dbg !33
+  %220 = select <4 x i1> %219, <4 x i64> %218, <4 x i64> %217, !dbg !34
+  %221 = icmp ult <4 x i64> %220, <i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !31
+  %222 = zext <4 x i1> %221 to <4 x i8>, !dbg !31
+  store <4 x i8> %222, ptr addrspace(3) %56, align 4, !dbg !31
+  tail call void @llvm.nvvm.barrier0(), !dbg !31
+  %223 = load i8, ptr addrspace(3) %58, align 1, !dbg !31
+  %224 = load i8, ptr addrspace(3) %60, align 1, !dbg !31
+  %225 = load i8, ptr addrspace(3) %62, align 1, !dbg !31
+  %226 = load i8, ptr addrspace(3) %64, align 1, !dbg !31
+  tail call void @llvm.nvvm.barrier0(), !dbg !31
+  %227 = shufflevector <8 x i1> %165, <8 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>, !dbg !29
+  %228 = insertelement <4 x i64> poison, i64 %112, i64 0, !dbg !29
+  %229 = insertelement <4 x i64> %228, i64 %113, i64 1, !dbg !29
+  %230 = insertelement <4 x i64> %229, i64 %115, i64 2, !dbg !29
+  %231 = insertelement <4 x i64> %230, i64 %116, i64 3, !dbg !29
+  %232 = select <4 x i1> %227, <4 x i64> %231, <4 x i64> zeroinitializer, !dbg !29
+  %233 = add <4 x i64> %232, <i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !32
+  %234 = icmp slt <4 x i64> %232, zeroinitializer, !dbg !33
+  %235 = select <4 x i1> %234, <4 x i64> %233, <4 x i64> %232, !dbg !34
+  %236 = icmp ult <4 x i64> %235, <i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !31
+  %237 = zext <4 x i1> %236 to <4 x i8>, !dbg !31
+  store <4 x i8> %237, ptr addrspace(3) %56, align 4, !dbg !31
+  tail call void @llvm.nvvm.barrier0(), !dbg !31
+  %238 = load i8, ptr addrspace(3) %58, align 1, !dbg !31
+  %239 = load i8, ptr addrspace(3) %60, align 1, !dbg !31
+  %240 = load i8, ptr addrspace(3) %62, align 1, !dbg !31
+  %241 = load i8, ptr addrspace(3) %64, align 1, !dbg !31
+  %242 = insertelement <8 x i8> poison, i8 %224, i64 0, !dbg !31
+  %243 = insertelement <8 x i8> %242, i8 %223, i64 1, !dbg !31
+  %244 = insertelement <8 x i8> %243, i8 %225, i64 2, !dbg !31
+  %245 = insertelement <8 x i8> %244, i8 %226, i64 3, !dbg !31
+  %246 = insertelement <8 x i8> %245, i8 %238, i64 4, !dbg !31
+  %247 = insertelement <8 x i8> %246, i8 %239, i64 5, !dbg !31
+  %248 = insertelement <8 x i8> %247, i8 %240, i64 6, !dbg !31
+  %249 = insertelement <8 x i8> %248, i8 %241, i64 7, !dbg !31
+  %250 = icmp eq <8 x i8> %249, zeroinitializer, !dbg !31
+  %251 = bitcast <8 x i1> %250 to i8, !dbg !35
+  %.not = icmp eq i8 %251, 0, !dbg !35
+  br i1 %.not, label %253, label %252, !dbg !35
+
+252:                                              ; preds = %74
+  tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !35
+  br label %253, !dbg !35
+
+253:                                              ; preds = %252, %74
+  %254 = mul nuw nsw i64 %80, 50257, !dbg !36
+  %255 = mul nuw nsw i64 %82, 50257, !dbg !36
+  %256 = mul nuw nsw i64 %84, 50257, !dbg !36
+  %257 = mul nuw nsw i64 %86, 50257, !dbg !36
+  %258 = mul nuw nsw i64 %88, 50257, !dbg !36
+  %259 = mul nuw nsw i64 %90, 50257, !dbg !36
+  %260 = mul nuw nsw i64 %92, 50257, !dbg !36
+  %261 = mul nuw nsw i64 %94, 50257, !dbg !36
+  %262 = extractelement <4 x i64> %220, i64 0, !dbg !37
+  %263 = getelementptr i16, ptr addrspace(1) %1, i64 %262, !dbg !37
+  %264 = getelementptr i16, ptr addrspace(1) %263, i64 %254, !dbg !37
+  %265 = getelementptr i16, ptr addrspace(1) %264, i64 %21, !dbg !37
+  %266 = extractelement <4 x i64> %220, i64 1, !dbg !37
+  %267 = getelementptr i16, ptr addrspace(1) %1, i64 %266, !dbg !37
+  %268 = getelementptr i16, ptr addrspace(1) %267, i64 %255, !dbg !37
+  %269 = getelementptr i16, ptr addrspace(1) %268, i64 %21, !dbg !37
+  %270 = extractelement <4 x i64> %220, i64 2, !dbg !37
+  %271 = getelementptr i16, ptr addrspace(1) %1, i64 %270, !dbg !37
+  %272 = getelementptr i16, ptr addrspace(1) %271, i64 %256, !dbg !37
+  %273 = getelementptr i16, ptr addrspace(1) %272, i64 %21, !dbg !37
+  %274 = extractelement <4 x i64> %220, i64 3, !dbg !37
+  %275 = getelementptr i16, ptr addrspace(1) %1, i64 %274, !dbg !37
+  %276 = getelementptr i16, ptr addrspace(1) %275, i64 %257, !dbg !37
+  %277 = getelementptr i16, ptr addrspace(1) %276, i64 %21, !dbg !37
+  %278 = extractelement <4 x i64> %235, i64 0, !dbg !37
+  %279 = getelementptr i16, ptr addrspace(1) %1, i64 %278, !dbg !37
+  %280 = getelementptr i16, ptr addrspace(1) %279, i64 %258, !dbg !37
+  %281 = getelementptr i16, ptr addrspace(1) %280, i64 %21, !dbg !37
+  %282 = extractelement <4 x i64> %235, i64 1, !dbg !37
+  %283 = getelementptr i16, ptr addrspace(1) %1, i64 %282, !dbg !37
+  %284 = getelementptr i16, ptr addrspace(1) %283, i64 %259, !dbg !37
+  %285 = getelementptr i16, ptr addrspace(1) %284, i64 %21, !dbg !37
+  %286 = extractelement <4 x i64> %235, i64 2, !dbg !37
+  %287 = getelementptr i16, ptr addrspace(1) %1, i64 %286, !dbg !37
+  %288 = getelementptr i16, ptr addrspace(1) %287, i64 %260, !dbg !37
+  %289 = getelementptr i16, ptr addrspace(1) %288, i64 %21, !dbg !37
+  %290 = extractelement <4 x i64> %235, i64 3, !dbg !37
+  %291 = getelementptr i16, ptr addrspace(1) %1, i64 %290, !dbg !37
+  %292 = getelementptr i16, ptr addrspace(1) %291, i64 %261, !dbg !37
+  %293 = getelementptr i16, ptr addrspace(1) %292, i64 %21, !dbg !37
+  tail call void @llvm.nvvm.barrier0(), !dbg !38
+  %294 = ptrtoint ptr addrspace(1) %265 to i64, !dbg !38
+  %295 = insertelement <1 x i64> undef, i64 %294, i64 0, !dbg !38
+  store <1 x i64> %295, ptr addrspace(3) %65, align 8, !dbg !38
+  %296 = ptrtoint ptr addrspace(1) %269 to i64, !dbg !38
+  %297 = insertelement <1 x i64> undef, i64 %296, i64 0, !dbg !38
+  store <1 x i64> %297, ptr addrspace(3) %66, align 8, !dbg !38
+  %298 = ptrtoint ptr addrspace(1) %273 to i64, !dbg !38
+  %299 = insertelement <1 x i64> undef, i64 %298, i64 0, !dbg !38
+  store <1 x i64> %299, ptr addrspace(3) %67, align 8, !dbg !38
+  %300 = ptrtoint ptr addrspace(1) %277 to i64, !dbg !38
+  %301 = insertelement <1 x i64> undef, i64 %300, i64 0, !dbg !38
+  store <1 x i64> %301, ptr addrspace(3) %68, align 8, !dbg !38
+  tail call void @llvm.nvvm.barrier0(), !dbg !38
+  %302 = load i64, ptr addrspace(3) %69, align 8, !dbg !38
+  %303 = inttoptr i64 %302 to ptr addrspace(1), !dbg !38
+  %304 = load i64, ptr addrspace(3) %70, align 8, !dbg !38
+  %305 = inttoptr i64 %304 to ptr addrspace(1), !dbg !38
+  %306 = load i64, ptr addrspace(3) %71, align 8, !dbg !38
+  %307 = inttoptr i64 %306 to ptr addrspace(1), !dbg !38
+  %308 = load i64, ptr addrspace(3) %72, align 8, !dbg !38
+  %309 = inttoptr i64 %308 to ptr addrspace(1), !dbg !38
+  tail call void @llvm.nvvm.barrier0(), !dbg !38
+  %310 = ptrtoint ptr addrspace(1) %281 to i64, !dbg !38
+  %311 = insertelement <1 x i64> undef, i64 %310, i64 0, !dbg !38
+  store <1 x i64> %311, ptr addrspace(3) %65, align 8, !dbg !38
+  %312 = ptrtoint ptr addrspace(1) %285 to i64, !dbg !38
+  %313 = insertelement <1 x i64> undef, i64 %312, i64 0, !dbg !38
+  store <1 x i64> %313, ptr addrspace(3) %66, align 8, !dbg !38
+  %314 = ptrtoint ptr addrspace(1) %289 to i64, !dbg !38
+  %315 = insertelement <1 x i64> undef, i64 %314, i64 0, !dbg !38
+  store <1 x i64> %315, ptr addrspace(3) %67, align 8, !dbg !38
+  %316 = ptrtoint ptr addrspace(1) %293 to i64, !dbg !38
+  %317 = insertelement <1 x i64> undef, i64 %316, i64 0, !dbg !38
+  store <1 x i64> %317, ptr addrspace(3) %68, align 8, !dbg !38
+  tail call void @llvm.nvvm.barrier0(), !dbg !38
+  %318 = load i64, ptr addrspace(3) %69, align 8, !dbg !38
+  %319 = inttoptr i64 %318 to ptr addrspace(1), !dbg !38
+  %320 = load i64, ptr addrspace(3) %70, align 8, !dbg !38
+  %321 = inttoptr i64 %320 to ptr addrspace(1), !dbg !38
+  %322 = load i64, ptr addrspace(3) %71, align 8, !dbg !38
+  %323 = inttoptr i64 %322 to ptr addrspace(1), !dbg !38
+  %324 = load i64, ptr addrspace(3) %72, align 8, !dbg !38
+  %325 = inttoptr i64 %324 to ptr addrspace(1), !dbg !38
+  %326 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %303, i1 %19, i16 0, i1 %19) #5, !dbg !38
+  %327 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %305, i1 %19, i16 0, i1 %19) #5, !dbg !38
+  %328 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %307, i1 %19, i16 0, i1 %19) #5, !dbg !38
+  %329 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %309, i1 %19, i16 0, i1 %19) #5, !dbg !38
+  %330 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %319, i1 %19, i16 0, i1 %19) #5, !dbg !38
+  %331 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %321, i1 %19, i16 0, i1 %19) #5, !dbg !38
+  %332 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %323, i1 %103, i16 0, i1 %103) #5, !dbg !38
+  %333 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %325, i1 %103, i16 0, i1 %103) #5, !dbg !38
+  %334 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %326) #5, !dbg !39
+  %335 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %327) #5, !dbg !39
+  %336 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %328) #5, !dbg !39
+  %337 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %329) #5, !dbg !39
+  %338 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %330) #5, !dbg !39
+  %339 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %331) #5, !dbg !39
+  %340 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %332) #5, !dbg !39
+  %341 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %333) #5, !dbg !39
+  %342 = insertelement <8 x float> poison, float %334, i64 0, !dbg !40
+  %343 = insertelement <8 x float> %342, float %335, i64 1, !dbg !40
+  %344 = insertelement <8 x float> %343, float %336, i64 2, !dbg !40
+  %345 = insertelement <8 x float> %344, float %337, i64 3, !dbg !40
+  %346 = insertelement <8 x float> %345, float %338, i64 4, !dbg !40
+  %347 = insertelement <8 x float> %346, float %339, i64 5, !dbg !40
+  %348 = insertelement <8 x float> %347, float %340, i64 6, !dbg !40
+  %349 = insertelement <8 x float> %348, float %341, i64 7, !dbg !40
+  %350 = insertelement <8 x float> poison, float %129, i64 0, !dbg !40
+  %351 = insertelement <8 x float> %350, float %130, i64 1, !dbg !40
+  %352 = insertelement <8 x float> %351, float %131, i64 2, !dbg !40
+  %353 = insertelement <8 x float> %352, float %132, i64 3, !dbg !40
+  %354 = insertelement <8 x float> %353, float %133, i64 4, !dbg !40
+  %355 = insertelement <8 x float> %354, float %134, i64 5, !dbg !40
+  %356 = insertelement <8 x float> %355, float %135, i64 6, !dbg !40
+  %357 = insertelement <8 x float> %356, float %136, i64 7, !dbg !40
+  %358 = fsub <8 x float> %349, %357, !dbg !40
+  %359 = fcmp olt float %144, 0x3810000000000000, !dbg !41
+  %360 = fmul float %144, 0x4160000000000000, !dbg !41
+  %.02.i = select i1 %359, float %360, float %144, !dbg !41
+  %i.i.0.i = select i1 %359, float -2.300000e+01, float 0.000000e+00, !dbg !41
+  %361 = bitcast float %.02.i to i32, !dbg !41
+  %362 = add i32 %361, -1059760811, !dbg !41
+  %363 = and i32 %362, -8388608, !dbg !41
+  %364 = sub i32 %361, %363, !dbg !41
+  %365 = bitcast i32 %364 to float, !dbg !41
+  %366 = sitofp i32 %363 to float, !dbg !41
+  %367 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not.i = icmp eq i32 %367, 0, !dbg !41
+  %368 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %366, float 0x3E80000000000000, float %i.i.0.i) #5, !dbg !41
+  %369 = tail call float @llvm.nvvm.fma.rn.f(float %366, float 0x3E80000000000000, float %i.i.0.i) #5, !dbg !41
+  %.08.i = select i1 %.not.i, float %369, float %368, !dbg !41
+  %370 = fadd float %365, -1.000000e+00, !dbg !41
+  %371 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not1.i = icmp eq i32 %371, 0, !dbg !41
+  %372 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %370, float 0x3FC2073EC0000000) #5, !dbg !41
+  %373 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %370, float 0x3FC2073EC0000000) #5, !dbg !41
+  %.010.i = select i1 %.not1.i, float %373, float %372, !dbg !41
+  %374 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not2.i = icmp eq i32 %374, 0, !dbg !41
+  %375 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i, float %370, float 0xBFBF19B980000000) #5, !dbg !41
+  %376 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i, float %370, float 0xBFBF19B980000000) #5, !dbg !41
+  %.011.i = select i1 %.not2.i, float %376, float %375, !dbg !41
+  %377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not3.i = icmp eq i32 %377, 0, !dbg !41
+  %378 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i, float %370, float 0x3FC1E52AA0000000) #5, !dbg !41
+  %379 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i, float %370, float 0x3FC1E52AA0000000) #5, !dbg !41
+  %.012.i = select i1 %.not3.i, float %379, float %378, !dbg !41
+  %380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not4.i = icmp eq i32 %380, 0, !dbg !41
+  %381 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i, float %370, float 0xBFC55B1720000000) #5, !dbg !41
+  %382 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i, float %370, float 0xBFC55B1720000000) #5, !dbg !41
+  %.09.i = select i1 %.not4.i, float %382, float %381, !dbg !41
+  %383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not5.i = icmp eq i32 %383, 0, !dbg !41
+  %384 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i, float %370, float 0x3FC99DA160000000) #5, !dbg !41
+  %385 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i, float %370, float 0x3FC99DA160000000) #5, !dbg !41
+  %.05.i = select i1 %.not5.i, float %385, float %384, !dbg !41
+  %386 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not6.i = icmp eq i32 %386, 0, !dbg !41
+  %387 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %370, float 0xBFCFFFE440000000) #5, !dbg !41
+  %388 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %370, float 0xBFCFFFE440000000) #5, !dbg !41
+  %.01.i = select i1 %.not6.i, float %388, float %387, !dbg !41
+  %389 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not7.i = icmp eq i32 %389, 0, !dbg !41
+  %390 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i, float %370, float 0x3FD5554F00000000) #5, !dbg !41
+  %391 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i, float %370, float 0x3FD5554F00000000) #5, !dbg !41
+  %.0.i = select i1 %.not7.i, float %391, float %390, !dbg !41
+  %392 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not8.i = icmp eq i32 %392, 0, !dbg !41
+  %393 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i, float %370, float -5.000000e-01) #5, !dbg !41
+  %394 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i, float %370, float -5.000000e-01) #5, !dbg !41
+  %.07.i = select i1 %.not8.i, float %394, float %393, !dbg !41
+  %395 = fmul float %370, %.07.i, !dbg !41
+  %396 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not9.i = icmp eq i32 %396, 0, !dbg !41
+  %397 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %395, float %370, float %370) #5, !dbg !41
+  %398 = tail call float @llvm.nvvm.fma.rn.f(float %395, float %370, float %370) #5, !dbg !41
+  %.06.i = select i1 %.not9.i, float %398, float %397, !dbg !41
+  %399 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not10.i = icmp eq i32 %399, 0, !dbg !41
+  %400 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i, float 0x3FE62E4300000000, float %.06.i) #5, !dbg !41
+  %401 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i, float 0x3FE62E4300000000, float %.06.i) #5, !dbg !41
+  %.04.i = select i1 %.not10.i, float %401, float %400, !dbg !41
+  %402 = icmp ugt i32 %361, 2139095039, !dbg !41
+  br i1 %402, label %__nv_fmaf_rn.exit.i.i, label %__nv_logf.exit, !dbg !41
+
+__nv_fmaf_rn.exit.i.i:                            ; preds = %253
+  %403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not11.i = icmp eq i32 %403, 0, !dbg !41
+  %404 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41
+  %405 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41
+  %.03.i = select i1 %.not11.i, float %405, float %404, !dbg !41
+  br label %__nv_logf.exit, !dbg !41
+
+__nv_logf.exit:                                   ; preds = %253, %__nv_fmaf_rn.exit.i.i
+  %r.i.0.i = phi float [ %.03.i, %__nv_fmaf_rn.exit.i.i ], [ %.04.i, %253 ], !dbg !41
+  %406 = fcmp oeq float %.02.i, 0.000000e+00, !dbg !41
+  %r.i.1.i = select i1 %406, float 0xFFF0000000000000, float %r.i.0.i, !dbg !41
+  %407 = fcmp olt float %145, 0x3810000000000000, !dbg !41
+  %408 = fmul float %145, 0x4160000000000000, !dbg !41
+  %.02.i30 = select i1 %407, float %408, float %145, !dbg !41
+  %i.i.0.i31 = select i1 %407, float -2.300000e+01, float 0.000000e+00, !dbg !41
+  %409 = bitcast float %.02.i30 to i32, !dbg !41
+  %410 = add i32 %409, -1059760811, !dbg !41
+  %411 = and i32 %410, -8388608, !dbg !41
+  %412 = sub i32 %409, %411, !dbg !41
+  %413 = bitcast i32 %412 to float, !dbg !41
+  %414 = sitofp i32 %411 to float, !dbg !41
+  %415 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not.i32 = icmp eq i32 %415, 0, !dbg !41
+  %416 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %414, float 0x3E80000000000000, float %i.i.0.i31) #5, !dbg !41
+  %417 = tail call float @llvm.nvvm.fma.rn.f(float %414, float 0x3E80000000000000, float %i.i.0.i31) #5, !dbg !41
+  %.08.i33 = select i1 %.not.i32, float %417, float %416, !dbg !41
+  %418 = fadd float %413, -1.000000e+00, !dbg !41
+  %419 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not1.i34 = icmp eq i32 %419, 0, !dbg !41
+  %420 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %418, float 0x3FC2073EC0000000) #5, !dbg !41
+  %421 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %418, float 0x3FC2073EC0000000) #5, !dbg !41
+  %.010.i35 = select i1 %.not1.i34, float %421, float %420, !dbg !41
+  %422 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not2.i36 = icmp eq i32 %422, 0, !dbg !41
+  %423 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i35, float %418, float 0xBFBF19B980000000) #5, !dbg !41
+  %424 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i35, float %418, float 0xBFBF19B980000000) #5, !dbg !41
+  %.011.i37 = select i1 %.not2.i36, float %424, float %423, !dbg !41
+  %425 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not3.i38 = icmp eq i32 %425, 0, !dbg !41
+  %426 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i37, float %418, float 0x3FC1E52AA0000000) #5, !dbg !41
+  %427 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i37, float %418, float 0x3FC1E52AA0000000) #5, !dbg !41
+  %.012.i39 = select i1 %.not3.i38, float %427, float %426, !dbg !41
+  %428 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not4.i40 = icmp eq i32 %428, 0, !dbg !41
+  %429 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i39, float %418, float 0xBFC55B1720000000) #5, !dbg !41
+  %430 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i39, float %418, float 0xBFC55B1720000000) #5, !dbg !41
+  %.09.i41 = select i1 %.not4.i40, float %430, float %429, !dbg !41
+  %431 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not5.i42 = icmp eq i32 %431, 0, !dbg !41
+  %432 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i41, float %418, float 0x3FC99DA160000000) #5, !dbg !41
+  %433 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i41, float %418, float 0x3FC99DA160000000) #5, !dbg !41
+  %.05.i43 = select i1 %.not5.i42, float %433, float %432, !dbg !41
+  %434 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not6.i44 = icmp eq i32 %434, 0, !dbg !41
+  %435 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i43, float %418, float 0xBFCFFFE440000000) #5, !dbg !41
+  %436 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i43, float %418, float 0xBFCFFFE440000000) #5, !dbg !41
+  %.01.i45 = select i1 %.not6.i44, float %436, float %435, !dbg !41
+  %437 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not7.i46 = icmp eq i32 %437, 0, !dbg !41
+  %438 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i45, float %418, float 0x3FD5554F00000000) #5, !dbg !41
+  %439 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i45, float %418, float 0x3FD5554F00000000) #5, !dbg !41
+  %.0.i47 = select i1 %.not7.i46, float %439, float %438, !dbg !41
+  %440 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not8.i48 = icmp eq i32 %440, 0, !dbg !41
+  %441 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i47, float %418, float -5.000000e-01) #5, !dbg !41
+  %442 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i47, float %418, float -5.000000e-01) #5, !dbg !41
+  %.07.i49 = select i1 %.not8.i48, float %442, float %441, !dbg !41
+  %443 = fmul float %418, %.07.i49, !dbg !41
+  %444 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not9.i50 = icmp eq i32 %444, 0, !dbg !41
+  %445 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %443, float %418, float %418) #5, !dbg !41
+  %446 = tail call float @llvm.nvvm.fma.rn.f(float %443, float %418, float %418) #5, !dbg !41
+  %.06.i51 = select i1 %.not9.i50, float %446, float %445, !dbg !41
+  %447 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not10.i52 = icmp eq i32 %447, 0, !dbg !41
+  %448 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i33, float 0x3FE62E4300000000, float %.06.i51) #5, !dbg !41
+  %449 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i33, float 0x3FE62E4300000000, float %.06.i51) #5, !dbg !41
+  %.04.i53 = select i1 %.not10.i52, float %449, float %448, !dbg !41
+  %450 = icmp ugt i32 %409, 2139095039, !dbg !41
+  br i1 %450, label %__nv_fmaf_rn.exit.i.i56, label %__nv_logf.exit59, !dbg !41
+
+__nv_fmaf_rn.exit.i.i56:                          ; preds = %__nv_logf.exit
+  %451 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not11.i57 = icmp eq i32 %451, 0, !dbg !41
+  %452 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i30, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41
+  %453 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i30, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41
+  %.03.i58 = select i1 %.not11.i57, float %453, float %452, !dbg !41
+  br label %__nv_logf.exit59, !dbg !41
+
+__nv_logf.exit59:                                 ; preds = %__nv_logf.exit, %__nv_fmaf_rn.exit.i.i56
+  %r.i.0.i54 = phi float [ %.03.i58, %__nv_fmaf_rn.exit.i.i56 ], [ %.04.i53, %__nv_logf.exit ], !dbg !41
+  %454 = fcmp oeq float %.02.i30, 0.000000e+00, !dbg !41
+  %r.i.1.i55 = select i1 %454, float 0xFFF0000000000000, float %r.i.0.i54, !dbg !41
+  %455 = fcmp olt float %146, 0x3810000000000000, !dbg !41
+  %456 = fmul float %146, 0x4160000000000000, !dbg !41
+  %.02.i60 = select i1 %455, float %456, float %146, !dbg !41
+  %i.i.0.i61 = select i1 %455, float -2.300000e+01, float 0.000000e+00, !dbg !41
+  %457 = bitcast float %.02.i60 to i32, !dbg !41
+  %458 = add i32 %457, -1059760811, !dbg !41
+  %459 = and i32 %458, -8388608, !dbg !41
+  %460 = sub i32 %457, %459, !dbg !41
+  %461 = bitcast i32 %460 to float, !dbg !41
+  %462 = sitofp i32 %459 to float, !dbg !41
+  %463 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not.i62 = icmp eq i32 %463, 0, !dbg !41
+  %464 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %462, float 0x3E80000000000000, float %i.i.0.i61) #5, !dbg !41
+  %465 = tail call float @llvm.nvvm.fma.rn.f(float %462, float 0x3E80000000000000, float %i.i.0.i61) #5, !dbg !41
+  %.08.i63 = select i1 %.not.i62, float %465, float %464, !dbg !41
+  %466 = fadd float %461, -1.000000e+00, !dbg !41
+  %467 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not1.i64 = icmp eq i32 %467, 0, !dbg !41
+  %468 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %466, float 0x3FC2073EC0000000) #5, !dbg !41
+  %469 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %466, float 0x3FC2073EC0000000) #5, !dbg !41
+  %.010.i65 = select i1 %.not1.i64, float %469, float %468, !dbg !41
+  %470 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not2.i66 = icmp eq i32 %470, 0, !dbg !41
+  %471 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i65, float %466, float 0xBFBF19B980000000) #5, !dbg !41
+  %472 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i65, float %466, float 0xBFBF19B980000000) #5, !dbg !41
+  %.011.i67 = select i1 %.not2.i66, float %472, float %471, !dbg !41
+  %473 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not3.i68 = icmp eq i32 %473, 0, !dbg !41
+  %474 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i67, float %466, float 0x3FC1E52AA0000000) #5, !dbg !41
+  %475 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i67, float %466, float 0x3FC1E52AA0000000) #5, !dbg !41
+  %.012.i69 = select i1 %.not3.i68, float %475, float %474, !dbg !41
+  %476 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not4.i70 = icmp eq i32 %476, 0, !dbg !41
+  %477 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i69, float %466, float 0xBFC55B1720000000) #5, !dbg !41
+  %478 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i69, float %466, float 0xBFC55B1720000000) #5, !dbg !41
+  %.09.i71 = select i1 %.not4.i70, float %478, float %477, !dbg !41
+  %479 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not5.i72 = icmp eq i32 %479, 0, !dbg !41
+  %480 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i71, float %466, float 0x3FC99DA160000000) #5, !dbg !41
+  %481 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i71, float %466, float 0x3FC99DA160000000) #5, !dbg !41
+  %.05.i73 = select i1 %.not5.i72, float %481, float %480, !dbg !41
+  %482 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not6.i74 = icmp eq i32 %482, 0, !dbg !41
+  %483 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i73, float %466, float 0xBFCFFFE440000000) #5, !dbg !41
+  %484 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i73, float %466, float 0xBFCFFFE440000000) #5, !dbg !41
+  %.01.i75 = select i1 %.not6.i74, float %484, float %483, !dbg !41
+  %485 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not7.i76 = icmp eq i32 %485, 0, !dbg !41
+  %486 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i75, float %466, float 0x3FD5554F00000000) #5, !dbg !41
+  %487 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i75, float %466, float 0x3FD5554F00000000) #5, !dbg !41
+  %.0.i77 = select i1 %.not7.i76, float %487, float %486, !dbg !41
+  %488 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not8.i78 = icmp eq i32 %488, 0, !dbg !41
+  %489 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i77, float %466, float -5.000000e-01) #5, !dbg !41
+  %490 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i77, float %466, float -5.000000e-01) #5, !dbg !41
+  %.07.i79 = select i1 %.not8.i78, float %490, float %489, !dbg !41
+  %491 = fmul float %466, %.07.i79, !dbg !41
+  %492 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not9.i80 = icmp eq i32 %492, 0, !dbg !41
+  %493 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %491, float %466, float %466) #5, !dbg !41
+  %494 = tail call float @llvm.nvvm.fma.rn.f(float %491, float %466, float %466) #5, !dbg !41
+  %.06.i81 = select i1 %.not9.i80, float %494, float %493, !dbg !41
+  %495 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not10.i82 = icmp eq i32 %495, 0, !dbg !41
+  %496 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i63, float 0x3FE62E4300000000, float %.06.i81) #5, !dbg !41
+  %497 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i63, float 0x3FE62E4300000000, float %.06.i81) #5, !dbg !41
+  %.04.i83 = select i1 %.not10.i82, float %497, float %496, !dbg !41
+  %498 = icmp ugt i32 %457, 2139095039, !dbg !41
+  br i1 %498, label %__nv_fmaf_rn.exit.i.i86, label %__nv_logf.exit89, !dbg !41
+
+__nv_fmaf_rn.exit.i.i86:                          ; preds = %__nv_logf.exit59
+  %499 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not11.i87 = icmp eq i32 %499, 0, !dbg !41
+  %500 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i60, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41
+  %501 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i60, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41
+  %.03.i88 = select i1 %.not11.i87, float %501, float %500, !dbg !41
+  br label %__nv_logf.exit89, !dbg !41
+
+__nv_logf.exit89:                                 ; preds = %__nv_logf.exit59, %__nv_fmaf_rn.exit.i.i86
+  %r.i.0.i84 = phi float [ %.03.i88, %__nv_fmaf_rn.exit.i.i86 ], [ %.04.i83, %__nv_logf.exit59 ], !dbg !41
+  %502 = fcmp oeq float %.02.i60, 0.000000e+00, !dbg !41
+  %r.i.1.i85 = select i1 %502, float 0xFFF0000000000000, float %r.i.0.i84, !dbg !41
+  %503 = fcmp olt float %147, 0x3810000000000000, !dbg !41
+  %504 = fmul float %147, 0x4160000000000000, !dbg !41
+  %.02.i90 = select i1 %503, float %504, float %147, !dbg !41
+  %i.i.0.i91 = select i1 %503, float -2.300000e+01, float 0.000000e+00, !dbg !41
+  %505 = bitcast float %.02.i90 to i32, !dbg !41
+  %506 = add i32 %505, -1059760811, !dbg !41
+  %507 = and i32 %506, -8388608, !dbg !41
+  %508 = sub i32 %505, %507, !dbg !41
+  %509 = bitcast i32 %508 to float, !dbg !41
+  %510 = sitofp i32 %507 to float, !dbg !41
+  %511 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not.i92 = icmp eq i32 %511, 0, !dbg !41
+  %512 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %510, float 0x3E80000000000000, float %i.i.0.i91) #5, !dbg !41
+  %513 = tail call float @llvm.nvvm.fma.rn.f(float %510, float 0x3E80000000000000, float %i.i.0.i91) #5, !dbg !41
+  %.08.i93 = select i1 %.not.i92, float %513, float %512, !dbg !41
+  %514 = fadd float %509, -1.000000e+00, !dbg !41
+  %515 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not1.i94 = icmp eq i32 %515, 0, !dbg !41
+  %516 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %514, float 0x3FC2073EC0000000) #5, !dbg !41
+  %517 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %514, float 0x3FC2073EC0000000) #5, !dbg !41
+  %.010.i95 = select i1 %.not1.i94, float %517, float %516, !dbg !41
+  %518 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not2.i96 = icmp eq i32 %518, 0, !dbg !41
+  %519 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i95, float %514, float 0xBFBF19B980000000) #5, !dbg !41
+  %520 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i95, float %514, float 0xBFBF19B980000000) #5, !dbg !41
+  %.011.i97 = select i1 %.not2.i96, float %520, float %519, !dbg !41
+  %521 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not3.i98 = icmp eq i32 %521, 0, !dbg !41
+  %522 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i97, float %514, float 0x3FC1E52AA0000000) #5, !dbg !41
+  %523 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i97, float %514, float 0x3FC1E52AA0000000) #5, !dbg !41
+  %.012.i99 = select i1 %.not3.i98, float %523, float %522, !dbg !41
+  %524 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not4.i100 = icmp eq i32 %524, 0, !dbg !41
+  %525 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i99, float %514, float 0xBFC55B1720000000) #5, !dbg !41
+  %526 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i99, float %514, float 0xBFC55B1720000000) #5, !dbg !41
+  %.09.i101 = select i1 %.not4.i100, float %526, float %525, !dbg !41
+  %527 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not5.i102 = icmp eq i32 %527, 0, !dbg !41
+  %528 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i101, float %514, float 0x3FC99DA160000000) #5, !dbg !41
+  %529 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i101, float %514, float 0x3FC99DA160000000) #5, !dbg !41
+  %.05.i103 = select i1 %.not5.i102, float %529, float %528, !dbg !41
+  %530 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not6.i104 = icmp eq i32 %530, 0, !dbg !41
+  %531 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i103, float %514, float 0xBFCFFFE440000000) #5, !dbg !41
+  %532 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i103, float %514, float 0xBFCFFFE440000000) #5, !dbg !41
+  %.01.i105 = select i1 %.not6.i104, float %532, float %531, !dbg !41
+  %533 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not7.i106 = icmp eq i32 %533, 0, !dbg !41
+  %534 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i105, float %514, float 0x3FD5554F00000000) #5, !dbg !41
+  %535 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i105, float %514, float 0x3FD5554F00000000) #5, !dbg !41
+  %.0.i107 = select i1 %.not7.i106, float %535, float %534, !dbg !41
+  %536 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not8.i108 = icmp eq i32 %536, 0, !dbg !41
+  %537 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i107, float %514, float -5.000000e-01) #5, !dbg !41
+  %538 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i107, float %514, float -5.000000e-01) #5, !dbg !41
+  %.07.i109 = select i1 %.not8.i108, float %538, float %537, !dbg !41
+  %539 = fmul float %514, %.07.i109, !dbg !41
+  %540 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not9.i110 = icmp eq i32 %540, 0, !dbg !41
+  %541 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %539, float %514, float %514) #5, !dbg !41
+  %542 = tail call float @llvm.nvvm.fma.rn.f(float %539, float %514, float %514) #5, !dbg !41
+  %.06.i111 = select i1 %.not9.i110, float %542, float %541, !dbg !41
+  %543 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not10.i112 = icmp eq i32 %543, 0, !dbg !41
+  %544 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i93, float 0x3FE62E4300000000, float %.06.i111) #5, !dbg !41
+  %545 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i93, float 0x3FE62E4300000000, float %.06.i111) #5, !dbg !41
+  %.04.i113 = select i1 %.not10.i112, float %545, float %544, !dbg !41
+  %546 = icmp ugt i32 %505, 2139095039, !dbg !41
+  br i1 %546, label %__nv_fmaf_rn.exit.i.i116, label %__nv_logf.exit119, !dbg !41
+
+__nv_fmaf_rn.exit.i.i116:                         ; preds = %__nv_logf.exit89
+  %547 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not11.i117 = icmp eq i32 %547, 0, !dbg !41
+  %548 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i90, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41
+  %549 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i90, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41
+  %.03.i118 = select i1 %.not11.i117, float %549, float %548, !dbg !41
+  br label %__nv_logf.exit119, !dbg !41
+
+__nv_logf.exit119:                                ; preds = %__nv_logf.exit89, %__nv_fmaf_rn.exit.i.i116
+  %r.i.0.i114 = phi float [ %.03.i118, %__nv_fmaf_rn.exit.i.i116 ], [ %.04.i113, %__nv_logf.exit89 ], !dbg !41
+  %550 = fcmp oeq float %.02.i90, 0.000000e+00, !dbg !41
+  %r.i.1.i115 = select i1 %550, float 0xFFF0000000000000, float %r.i.0.i114, !dbg !41
+  %551 = fcmp olt float %153, 0x3810000000000000, !dbg !41
+  %552 = fmul float %153, 0x4160000000000000, !dbg !41
+  %.02.i120 = select i1 %551, float %552, float %153, !dbg !41
+  %i.i.0.i121 = select i1 %551, float -2.300000e+01, float 0.000000e+00, !dbg !41
+  %553 = bitcast float %.02.i120 to i32, !dbg !41
+  %554 = add i32 %553, -1059760811, !dbg !41
+  %555 = and i32 %554, -8388608, !dbg !41
+  %556 = sub i32 %553, %555, !dbg !41
+  %557 = bitcast i32 %556 to float, !dbg !41
+  %558 = sitofp i32 %555 to float, !dbg !41
+  %559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not.i122 = icmp eq i32 %559, 0, !dbg !41
+  %560 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %558, float 0x3E80000000000000, float %i.i.0.i121) #5, !dbg !41
+  %561 = tail call float @llvm.nvvm.fma.rn.f(float %558, float 0x3E80000000000000, float %i.i.0.i121) #5, !dbg !41
+  %.08.i123 = select i1 %.not.i122, float %561, float %560, !dbg !41
+  %562 = fadd float %557, -1.000000e+00, !dbg !41
+  %563 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not1.i124 = icmp eq i32 %563, 0, !dbg !41
+  %564 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %562, float 0x3FC2073EC0000000) #5, !dbg !41
+  %565 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %562, float 0x3FC2073EC0000000) #5, !dbg !41
+  %.010.i125 = select i1 %.not1.i124, float %565, float %564, !dbg !41
+  %566 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not2.i126 = icmp eq i32 %566, 0, !dbg !41
+  %567 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i125, float %562, float 0xBFBF19B980000000) #5, !dbg !41
+  %568 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i125, float %562, float 0xBFBF19B980000000) #5, !dbg !41
+  %.011.i127 = select i1 %.not2.i126, float %568, float %567, !dbg !41
+  %569 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not3.i128 = icmp eq i32 %569, 0, !dbg !41
+  %570 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i127, float %562, float 0x3FC1E52AA0000000) #5, !dbg !41
+  %571 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i127, float %562, float 0x3FC1E52AA0000000) #5, !dbg !41
+  %.012.i129 = select i1 %.not3.i128, float %571, float %570, !dbg !41
+  %572 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not4.i130 = icmp eq i32 %572, 0, !dbg !41
+  %573 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i129, float %562, float 0xBFC55B1720000000) #5, !dbg !41
+  %574 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i129, float %562, float 0xBFC55B1720000000) #5, !dbg !41
+  %.09.i131 = select i1 %.not4.i130, float %574, float %573, !dbg !41
+  %575 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not5.i132 = icmp eq i32 %575, 0, !dbg !41
+  %576 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i131, float %562, float 0x3FC99DA160000000) #5, !dbg !41
+  %577 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i131, float %562, float 0x3FC99DA160000000) #5, !dbg !41
+  %.05.i133 = select i1 %.not5.i132, float %577, float %576, !dbg !41
+  %578 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not6.i134 = icmp eq i32 %578, 0, !dbg !41
+  %579 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i133, float %562, float 0xBFCFFFE440000000) #5, !dbg !41
+  %580 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i133, float %562, float 0xBFCFFFE440000000) #5, !dbg !41
+  %.01.i135 = select i1 %.not6.i134, float %580, float %579, !dbg !41
+  %581 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not7.i136 = icmp eq i32 %581, 0, !dbg !41
+  %582 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i135, float %562, float 0x3FD5554F00000000) #5, !dbg !41
+  %583 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i135, float %562, float 0x3FD5554F00000000) #5, !dbg !41
+  %.0.i137 = select i1 %.not7.i136, float %583, float %582, !dbg !41
+  %584 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not8.i138 = icmp eq i32 %584, 0, !dbg !41
+  %585 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i137, float %562, float -5.000000e-01) #5, !dbg !41
+  %586 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i137, float %562, float -5.000000e-01) #5, !dbg !41
+  %.07.i139 = select i1 %.not8.i138, float %586, float %585, !dbg !41
+  %587 = fmul float %562, %.07.i139, !dbg !41
+  %588 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not9.i140 = icmp eq i32 %588, 0, !dbg !41
+  %589 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %587, float %562, float %562) #5, !dbg !41
+  %590 = tail call float @llvm.nvvm.fma.rn.f(float %587, float %562, float %562) #5, !dbg !41
+  %.06.i141 = select i1 %.not9.i140, float %590, float %589, !dbg !41
+  %591 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not10.i142 = icmp eq i32 %591, 0, !dbg !41
+  %592 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i123, float 0x3FE62E4300000000, float %.06.i141) #5, !dbg !41
+  %593 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i123, float 0x3FE62E4300000000, float %.06.i141) #5, !dbg !41
+  %.04.i143 = select i1 %.not10.i142, float %593, float %592, !dbg !41
+  %594 = icmp ugt i32 %553, 2139095039, !dbg !41
+  br i1 %594, label %__nv_fmaf_rn.exit.i.i146, label %__nv_logf.exit149, !dbg !41
+
+__nv_fmaf_rn.exit.i.i146:                         ; preds = %__nv_logf.exit119
+  %595 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not11.i147 = icmp eq i32 %595, 0, !dbg !41
+  %596 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i120, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41
+  %597 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i120, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41
+  %.03.i148 = select i1 %.not11.i147, float %597, float %596, !dbg !41
+  br label %__nv_logf.exit149, !dbg !41
+
+__nv_logf.exit149:                                ; preds = %__nv_logf.exit119, %__nv_fmaf_rn.exit.i.i146
+  %r.i.0.i144 = phi float [ %.03.i148, %__nv_fmaf_rn.exit.i.i146 ], [ %.04.i143, %__nv_logf.exit119 ], !dbg !41
+  %598 = fcmp oeq float %.02.i120, 0.000000e+00, !dbg !41
+  %r.i.1.i145 = select i1 %598, float 0xFFF0000000000000, float %r.i.0.i144, !dbg !41
+  %599 = fcmp olt float %154, 0x3810000000000000, !dbg !41
+  %600 = fmul float %154, 0x4160000000000000, !dbg !41
+  %.02.i150 = select i1 %599, float %600, float %154, !dbg !41
+  %i.i.0.i151 = select i1 %599, float -2.300000e+01, float 0.000000e+00, !dbg !41
+  %601 = bitcast float %.02.i150 to i32, !dbg !41
+  %602 = add i32 %601, -1059760811, !dbg !41
+  %603 = and i32 %602, -8388608, !dbg !41
+  %604 = sub i32 %601, %603, !dbg !41
+  %605 = bitcast i32 %604 to float, !dbg !41
+  %606 = sitofp i32 %603 to float, !dbg !41
+  %607 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not.i152 = icmp eq i32 %607, 0, !dbg !41
+  %608 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %606, float 0x3E80000000000000, float %i.i.0.i151) #5, !dbg !41
+  %609 = tail call float @llvm.nvvm.fma.rn.f(float %606, float 0x3E80000000000000, float %i.i.0.i151) #5, !dbg !41
+  %.08.i153 = select i1 %.not.i152, float %609, float %608, !dbg !41
+  %610 = fadd float %605, -1.000000e+00, !dbg !41
+  %611 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not1.i154 = icmp eq i32 %611, 0, !dbg !41
+  %612 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %610, float 0x3FC2073EC0000000) #5, !dbg !41
+  %613 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %610, float 0x3FC2073EC0000000) #5, !dbg !41
+  %.010.i155 = select i1 %.not1.i154, float %613, float %612, !dbg !41
+  %614 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not2.i156 = icmp eq i32 %614, 0, !dbg !41
+  %615 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i155, float %610, float 0xBFBF19B980000000) #5, !dbg !41
+  %616 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i155, float %610, float 0xBFBF19B980000000) #5, !dbg !41
+  %.011.i157 = select i1 %.not2.i156, float %616, float %615, !dbg !41
+  %617 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not3.i158 = icmp eq i32 %617, 0, !dbg !41
+  %618 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i157, float %610, float 0x3FC1E52AA0000000) #5, !dbg !41
+  %619 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i157, float %610, float 0x3FC1E52AA0000000) #5, !dbg !41
+  %.012.i159 = select i1 %.not3.i158, float %619, float %618, !dbg !41
+  %620 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not4.i160 = icmp eq i32 %620, 0, !dbg !41
+  %621 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i159, float %610, float 0xBFC55B1720000000) #5, !dbg !41
+  %622 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i159, float %610, float 0xBFC55B1720000000) #5, !dbg !41
+  %.09.i161 = select i1 %.not4.i160, float %622, float %621, !dbg !41
+  %623 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not5.i162 = icmp eq i32 %623, 0, !dbg !41
+  %624 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i161, float %610, float 0x3FC99DA160000000) #5, !dbg !41
+  %625 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i161, float %610, float 0x3FC99DA160000000) #5, !dbg !41
+  %.05.i163 = select i1 %.not5.i162, float %625, float %624, !dbg !41
+  %626 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not6.i164 = icmp eq i32 %626, 0, !dbg !41
+  %627 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i163, float %610, float 0xBFCFFFE440000000) #5, !dbg !41
+  %628 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i163, float %610, float 0xBFCFFFE440000000) #5, !dbg !41
+  %.01.i165 = select i1 %.not6.i164, float %628, float %627, !dbg !41
+  %629 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not7.i166 = icmp eq i32 %629, 0, !dbg !41
+  %630 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i165, float %610, float 0x3FD5554F00000000) #5, !dbg !41
+  %631 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i165, float %610, float 0x3FD5554F00000000) #5, !dbg !41
+  %.0.i167 = select i1 %.not7.i166, float %631, float %630, !dbg !41
+  %632 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not8.i168 = icmp eq i32 %632, 0, !dbg !41
+  %633 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i167, float %610, float -5.000000e-01) #5, !dbg !41
+  %634 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i167, float %610, float -5.000000e-01) #5, !dbg !41
+  %.07.i169 = select i1 %.not8.i168, float %634, float %633, !dbg !41
+  %635 = fmul float %610, %.07.i169, !dbg !41
+  %636 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not9.i170 = icmp eq i32 %636, 0, !dbg !41
+  %637 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %635, float %610, float %610) #5, !dbg !41
+  %638 = tail call float @llvm.nvvm.fma.rn.f(float %635, float %610, float %610) #5, !dbg !41
+  %.06.i171 = select i1 %.not9.i170, float %638, float %637, !dbg !41
+  %639 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not10.i172 = icmp eq i32 %639, 0, !dbg !41
+  %640 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i153, float 0x3FE62E4300000000, float %.06.i171) #5, !dbg !41
+  %641 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i153, float 0x3FE62E4300000000, float %.06.i171) #5, !dbg !41
+  %.04.i173 = select i1 %.not10.i172, float %641, float %640, !dbg !41
+  %642 = icmp ugt i32 %601, 2139095039, !dbg !41
+  br i1 %642, label %__nv_fmaf_rn.exit.i.i176, label %__nv_logf.exit179, !dbg !41
+
+__nv_fmaf_rn.exit.i.i176:                         ; preds = %__nv_logf.exit149
+  %643 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not11.i177 = icmp eq i32 %643, 0, !dbg !41
+  %644 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i150, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41
+  %645 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i150, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41
+  %.03.i178 = select i1 %.not11.i177, float %645, float %644, !dbg !41
+  br label %__nv_logf.exit179, !dbg !41
+
+__nv_logf.exit179:                                ; preds = %__nv_logf.exit149, %__nv_fmaf_rn.exit.i.i176
+  %r.i.0.i174 = phi float [ %.03.i178, %__nv_fmaf_rn.exit.i.i176 ], [ %.04.i173, %__nv_logf.exit149 ], !dbg !41
+  %646 = fcmp oeq float %.02.i150, 0.000000e+00, !dbg !41
+  %r.i.1.i175 = select i1 %646, float 0xFFF0000000000000, float %r.i.0.i174, !dbg !41
+  %647 = fcmp olt float %155, 0x3810000000000000, !dbg !41
+  %648 = fmul float %155, 0x4160000000000000, !dbg !41
+  %.02.i180 = select i1 %647, float %648, float %155, !dbg !41
+  %i.i.0.i181 = select i1 %647, float -2.300000e+01, float 0.000000e+00, !dbg !41
+  %649 = bitcast float %.02.i180 to i32, !dbg !41
+  %650 = add i32 %649, -1059760811, !dbg !41
+  %651 = and i32 %650, -8388608, !dbg !41
+  %652 = sub i32 %649, %651, !dbg !41
+  %653 = bitcast i32 %652 to float, !dbg !41
+  %654 = sitofp i32 %651 to float, !dbg !41
+  %655 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not.i182 = icmp eq i32 %655, 0, !dbg !41
+  %656 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %654, float 0x3E80000000000000, float %i.i.0.i181) #5, !dbg !41
+  %657 = tail call float @llvm.nvvm.fma.rn.f(float %654, float 0x3E80000000000000, float %i.i.0.i181) #5, !dbg !41
+  %.08.i183 = select i1 %.not.i182, float %657, float %656, !dbg !41
+  %658 = fadd float %653, -1.000000e+00, !dbg !41
+  %659 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not1.i184 = icmp eq i32 %659, 0, !dbg !41
+  %660 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %658, float 0x3FC2073EC0000000) #5, !dbg !41
+  %661 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %658, float 0x3FC2073EC0000000) #5, !dbg !41
+  %.010.i185 = select i1 %.not1.i184, float %661, float %660, !dbg !41
+  %662 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not2.i186 = icmp eq i32 %662, 0, !dbg !41
+  %663 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i185, float %658, float 0xBFBF19B980000000) #5, !dbg !41
+  %664 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i185, float %658, float 0xBFBF19B980000000) #5, !dbg !41
+  %.011.i187 = select i1 %.not2.i186, float %664, float %663, !dbg !41
+  %665 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not3.i188 = icmp eq i32 %665, 0, !dbg !41
+  %666 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i187, float %658, float 0x3FC1E52AA0000000) #5, !dbg !41
+  %667 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i187, float %658, float 0x3FC1E52AA0000000) #5, !dbg !41
+  %.012.i189 = select i1 %.not3.i188, float %667, float %666, !dbg !41
+  %668 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not4.i190 = icmp eq i32 %668, 0, !dbg !41
+  %669 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i189, float %658, float 0xBFC55B1720000000) #5, !dbg !41
+  %670 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i189, float %658, float 0xBFC55B1720000000) #5, !dbg !41
+  %.09.i191 = select i1 %.not4.i190, float %670, float %669, !dbg !41
+  %671 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not5.i192 = icmp eq i32 %671, 0, !dbg !41
+  %672 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i191, float %658, float 0x3FC99DA160000000) #5, !dbg !41
+  %673 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i191, float %658, float 0x3FC99DA160000000) #5, !dbg !41
+  %.05.i193 = select i1 %.not5.i192, float %673, float %672, !dbg !41
+  %674 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not6.i194 = icmp eq i32 %674, 0, !dbg !41
+  %675 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i193, float %658, float 0xBFCFFFE440000000) #5, !dbg !41
+  %676 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i193, float %658, float 0xBFCFFFE440000000) #5, !dbg !41
+  %.01.i195 = select i1 %.not6.i194, float %676, float %675, !dbg !41
+  %677 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not7.i196 = icmp eq i32 %677, 0, !dbg !41
+  %678 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i195, float %658, float 0x3FD5554F00000000) #5, !dbg !41
+  %679 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i195, float %658, float 0x3FD5554F00000000) #5, !dbg !41
+  %.0.i197 = select i1 %.not7.i196, float %679, float %678, !dbg !41
+  %680 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not8.i198 = icmp eq i32 %680, 0, !dbg !41
+  %681 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i197, float %658, float -5.000000e-01) #5, !dbg !41
+  %682 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i197, float %658, float -5.000000e-01) #5, !dbg !41
+  %.07.i199 = select i1 %.not8.i198, float %682, float %681, !dbg !41
+  %683 = fmul float %658, %.07.i199, !dbg !41
+  %684 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not9.i200 = icmp eq i32 %684, 0, !dbg !41
+  %685 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %683, float %658, float %658) #5, !dbg !41
+  %686 = tail call float @llvm.nvvm.fma.rn.f(float %683, float %658, float %658) #5, !dbg !41
+  %.06.i201 = select i1 %.not9.i200, float %686, float %685, !dbg !41
+  %687 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not10.i202 = icmp eq i32 %687, 0, !dbg !41
+  %688 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i183, float 0x3FE62E4300000000, float %.06.i201) #5, !dbg !41
+  %689 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i183, float 0x3FE62E4300000000, float %.06.i201) #5, !dbg !41
+  %.04.i203 = select i1 %.not10.i202, float %689, float %688, !dbg !41
+  %690 = icmp ugt i32 %649, 2139095039, !dbg !41
+  br i1 %690, label %__nv_fmaf_rn.exit.i.i206, label %__nv_logf.exit209, !dbg !41
+
+__nv_fmaf_rn.exit.i.i206:                         ; preds = %__nv_logf.exit179
+  %691 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not11.i207 = icmp eq i32 %691, 0, !dbg !41
+  %692 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i180, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41
+  %693 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i180, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41
+  %.03.i208 = select i1 %.not11.i207, float %693, float %692, !dbg !41
+  br label %__nv_logf.exit209, !dbg !41
+
+__nv_logf.exit209:                                ; preds = %__nv_logf.exit179, %__nv_fmaf_rn.exit.i.i206
+  %r.i.0.i204 = phi float [ %.03.i208, %__nv_fmaf_rn.exit.i.i206 ], [ %.04.i203, %__nv_logf.exit179 ], !dbg !41
+  %694 = fcmp oeq float %.02.i180, 0.000000e+00, !dbg !41
+  %r.i.1.i205 = select i1 %694, float 0xFFF0000000000000, float %r.i.0.i204, !dbg !41
+  %695 = fcmp olt float %156, 0x3810000000000000, !dbg !41
+  %696 = fmul float %156, 0x4160000000000000, !dbg !41
+  %.02.i210 = select i1 %695, float %696, float %156, !dbg !41
+  %i.i.0.i211 = select i1 %695, float -2.300000e+01, float 0.000000e+00, !dbg !41
+  %697 = bitcast float %.02.i210 to i32, !dbg !41
+  %698 = add i32 %697, -1059760811, !dbg !41
+  %699 = and i32 %698, -8388608, !dbg !41
+  %700 = sub i32 %697, %699, !dbg !41
+  %701 = bitcast i32 %700 to float, !dbg !41
+  %702 = sitofp i32 %699 to float, !dbg !41
+  %703 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not.i212 = icmp eq i32 %703, 0, !dbg !41
+  %704 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %702, float 0x3E80000000000000, float %i.i.0.i211) #5, !dbg !41
+  %705 = tail call float @llvm.nvvm.fma.rn.f(float %702, float 0x3E80000000000000, float %i.i.0.i211) #5, !dbg !41
+  %.08.i213 = select i1 %.not.i212, float %705, float %704, !dbg !41
+  %706 = fadd float %701, -1.000000e+00, !dbg !41
+  %707 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not1.i214 = icmp eq i32 %707, 0, !dbg !41
+  %708 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %706, float 0x3FC2073EC0000000) #5, !dbg !41
+  %709 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %706, float 0x3FC2073EC0000000) #5, !dbg !41
+  %.010.i215 = select i1 %.not1.i214, float %709, float %708, !dbg !41
+  %710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not2.i216 = icmp eq i32 %710, 0, !dbg !41
+  %711 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i215, float %706, float 0xBFBF19B980000000) #5, !dbg !41
+  %712 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i215, float %706, float 0xBFBF19B980000000) #5, !dbg !41
+  %.011.i217 = select i1 %.not2.i216, float %712, float %711, !dbg !41
+  %713 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not3.i218 = icmp eq i32 %713, 0, !dbg !41
+  %714 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i217, float %706, float 0x3FC1E52AA0000000) #5, !dbg !41
+  %715 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i217, float %706, float 0x3FC1E52AA0000000) #5, !dbg !41
+  %.012.i219 = select i1 %.not3.i218, float %715, float %714, !dbg !41
+  %716 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not4.i220 = icmp eq i32 %716, 0, !dbg !41
+  %717 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i219, float %706, float 0xBFC55B1720000000) #5, !dbg !41
+  %718 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i219, float %706, float 0xBFC55B1720000000) #5, !dbg !41
+  %.09.i221 = select i1 %.not4.i220, float %718, float %717, !dbg !41
+  %719 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not5.i222 = icmp eq i32 %719, 0, !dbg !41
+  %720 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i221, float %706, float 0x3FC99DA160000000) #5, !dbg !41
+  %721 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i221, float %706, float 0x3FC99DA160000000) #5, !dbg !41
+  %.05.i223 = select i1 %.not5.i222, float %721, float %720, !dbg !41
+  %722 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not6.i224 = icmp eq i32 %722, 0, !dbg !41
+  %723 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i223, float %706, float 0xBFCFFFE440000000) #5, !dbg !41
+  %724 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i223, float %706, float 0xBFCFFFE440000000) #5, !dbg !41
+  %.01.i225 = select i1 %.not6.i224, float %724, float %723, !dbg !41
+  %725 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not7.i226 = icmp eq i32 %725, 0, !dbg !41
+  %726 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i225, float %706, float 0x3FD5554F00000000) #5, !dbg !41
+  %727 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i225, float %706, float 0x3FD5554F00000000) #5, !dbg !41
+  %.0.i227 = select i1 %.not7.i226, float %727, float %726, !dbg !41
+  %728 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not8.i228 = icmp eq i32 %728, 0, !dbg !41
+  %729 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i227, float %706, float -5.000000e-01) #5, !dbg !41
+  %730 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i227, float %706, float -5.000000e-01) #5, !dbg !41
+  %.07.i229 = select i1 %.not8.i228, float %730, float %729, !dbg !41
+  %731 = fmul float %706, %.07.i229, !dbg !41
+  %732 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not9.i230 = icmp eq i32 %732, 0, !dbg !41
+  %733 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %731, float %706, float %706) #5, !dbg !41
+  %734 = tail call float @llvm.nvvm.fma.rn.f(float %731, float %706, float %706) #5, !dbg !41
+  %.06.i231 = select i1 %.not9.i230, float %734, float %733, !dbg !41
+  %735 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not10.i232 = icmp eq i32 %735, 0, !dbg !41
+  %736 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i213, float 0x3FE62E4300000000, float %.06.i231) #5, !dbg !41
+  %737 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i213, float 0x3FE62E4300000000, float %.06.i231) #5, !dbg !41
+  %.04.i233 = select i1 %.not10.i232, float %737, float %736, !dbg !41
+  %738 = icmp ugt i32 %697, 2139095039, !dbg !41
+  br i1 %738, label %__nv_fmaf_rn.exit.i.i236, label %__nv_logf.exit239, !dbg !41
+
+__nv_fmaf_rn.exit.i.i236:                         ; preds = %__nv_logf.exit209
+  %739 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41
+  %.not11.i237 = icmp eq i32 %739, 0, !dbg !41
+  %740 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i210, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41
+  %741 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i210, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41
+  %.03.i238 = select i1 %.not11.i237, float %741, float %740, !dbg !41
+  br label %__nv_logf.exit239, !dbg !41
+
+__nv_logf.exit239:                                ; preds = %__nv_logf.exit209, %__nv_fmaf_rn.exit.i.i236
+  %r.i.0.i234 = phi float [ %.03.i238, %__nv_fmaf_rn.exit.i.i236 ], [ %.04.i233, %__nv_logf.exit209 ], !dbg !41
+  %742 = fcmp oeq float %.02.i210, 0.000000e+00, !dbg !41
+  %r.i.1.i235 = select i1 %742, float 0xFFF0000000000000, float %r.i.0.i234, !dbg !41
+  tail call void @llvm.nvvm.barrier0(), !dbg !41
+  %743 = insertelement <1 x float> undef, float %r.i.1.i, i64 0, !dbg !41
+  store <1 x float> %743, ptr addrspace(3) %25, align 4, !dbg !41
+  %744 = insertelement <1 x float> undef, float %r.i.1.i55, i64 0, !dbg !41
+  store <1 x float> %744, ptr addrspace(3) %28, align 4, !dbg !41
+  %745 = insertelement <1 x float> undef, float %r.i.1.i85, i64 0, !dbg !41
+  store <1 x float> %745, ptr addrspace(3) %31, align 4, !dbg !41
+  %746 = insertelement <1 x float> undef, float %r.i.1.i115, i64 0, !dbg !41
+  store <1 x float> %746, ptr addrspace(3) %34, align 4, !dbg !41
+  tail call void @llvm.nvvm.barrier0(), !dbg !41
+  %747 = load float, ptr addrspace(3) %37, align 4, !dbg !41
+  %748 = load float, ptr addrspace(3) %40, align 4, !dbg !41
+  %749 = load float, ptr addrspace(3) %43, align 4, !dbg !41
+  %750 = load float, ptr addrspace(3) %46, align 4, !dbg !41
+  tail call void @llvm.nvvm.barrier0(), !dbg !41
+  %751 = insertelement <1 x float> undef, float %r.i.1.i145, i64 0, !dbg !41
+  store <1 x float> %751, ptr addrspace(3) %25, align 4, !dbg !41
+  %752 = insertelement <1 x float> undef, float %r.i.1.i175, i64 0, !dbg !41
+  store <1 x float> %752, ptr addrspace(3) %28, align 4, !dbg !41
+  %753 = insertelement <1 x float> undef, float %r.i.1.i205, i64 0, !dbg !41
+  store <1 x float> %753, ptr addrspace(3) %31, align 4, !dbg !41
+  %754 = insertelement <1 x float> undef, float %r.i.1.i235, i64 0, !dbg !41
+  store <1 x float> %754, ptr addrspace(3) %34, align 4, !dbg !41
+  tail call void @llvm.nvvm.barrier0(), !dbg !41
+  %755 = load float, ptr addrspace(3) %37, align 4, !dbg !41
+  %756 = load float, ptr addrspace(3) %40, align 4, !dbg !41
+  %757 = load float, ptr addrspace(3) %43, align 4, !dbg !41
+  %758 = load float, ptr addrspace(3) %46, align 4, !dbg !41
+  %759 = insertelement <8 x float> poison, float %747, i64 0, !dbg !42
+  %760 = insertelement <8 x float> %759, float %748, i64 1, !dbg !42
+  %761 = insertelement <8 x float> %760, float %749, i64 2, !dbg !42
+  %762 = insertelement <8 x float> %761, float %750, i64 3, !dbg !42
+  %763 = insertelement <8 x float> %762, float %755, i64 4, !dbg !42
+  %764 = insertelement <8 x float> %763, float %756, i64 5, !dbg !42
+  %765 = insertelement <8 x float> %764, float %757, i64 6, !dbg !42
+  %766 = insertelement <8 x float> %765, float %758, i64 7, !dbg !42
+  %767 = fsub <8 x float> %766, %358, !dbg !42
+  %768 = fadd <8 x float> %767, zeroinitializer, !dbg !42
+  %769 = select <8 x i1> %206, <8 x float> zeroinitializer, <8 x float> %768, !dbg !43
+  %770 = insertelement <8 x i1> %73, i1 %103, i64 1, !dbg !16
+  %771 = shufflevector <8 x i1> %770, <8 x i1> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1>, !dbg !16
+  %772 = select <8 x i1> %771, <8 x float> %769, <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !16
+  %773 = fadd <8 x float> %76, %772, !dbg !16
+  %774 = add nuw nsw i32 %75, 2048, !dbg !17
+  %775 = icmp ult i32 %75, 5632, !dbg !17
+  br i1 %775, label %74, label %776, !dbg !17
+
+776:                                              ; preds = %__nv_logf.exit239
+  %777 = lshr i32 %9, 5, !dbg !10
+  %778 = and i32 %777, 7, !dbg !10
+  %779 = and i32 %9, 31, !dbg !10
+  tail call void @llvm.nvvm.barrier0(), !dbg !44
+  %shift = shufflevector <8 x i64> %211, <8 x i64> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !48
+  %780 = add <8 x i64> %211, %shift, !dbg !48
+  %shift286 = shufflevector <8 x i64> %211, <8 x i64> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !48
+  %781 = add <8 x i64> %780, %shift286, !dbg !48
+  %shift287 = shufflevector <8 x i64> %211, <8 x i64> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !48
+  %782 = add <8 x i64> %781, %shift287, !dbg !48
+  %shift288 = shufflevector <8 x i64> %211, <8 x i64> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !48
+  %783 = add <8 x i64> %782, %shift288, !dbg !48
+  %shift289 = shufflevector <8 x i64> %211, <8 x i64> poison, <8 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !48
+  %784 = add <8 x i64> %783, %shift289, !dbg !48
+  %shift290 = shufflevector <8 x i64> %211, <8 x i64> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !48
+  %785 = add <8 x i64> %784, %shift290, !dbg !48
+  %shift291 = shufflevector <8 x i64> %211, <8 x i64> poison, <8 x i32> <i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !48
+  %786 = add <8 x i64> %785, %shift291, !dbg !48
+  %787 = extractelement <8 x i64> %786, i64 0, !dbg !48
+  %788 = trunc i64 %787 to i32, !dbg !44
+  %789 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %788, i32 16, i32 31), !dbg !44
+  %bc = bitcast i64 %787 to <2 x i32>, !dbg !44
+  %790 = extractelement <2 x i32> %bc, i64 1, !dbg !44
+  %791 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %790, i32 16, i32 31), !dbg !44
+  %792 = insertelement <2 x i32> undef, i32 %789, i64 0, !dbg !44
+  %793 = insertelement <2 x i32> %792, i32 %791, i64 1, !dbg !44
+  %794 = bitcast <2 x i32> %793 to i64, !dbg !44
+  %795 = add i64 %787, %794, !dbg !48
+  %796 = trunc i64 %795 to i32, !dbg !44
+  %797 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %796, i32 8, i32 31), !dbg !44
+  %bc1 = bitcast i64 %795 to <2 x i32>, !dbg !44
+  %798 = extractelement <2 x i32> %bc1, i64 1, !dbg !44
+  %799 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %798, i32 8, i32 31), !dbg !44
+  %800 = insertelement <2 x i32> undef, i32 %797, i64 0, !dbg !44
+  %801 = insertelement <2 x i32> %800, i32 %799, i64 1, !dbg !44
+  %802 = bitcast <2 x i32> %801 to i64, !dbg !44
+  %803 = add i64 %795, %802, !dbg !48
+  %804 = trunc i64 %803 to i32, !dbg !44
+  %805 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %804, i32 4, i32 31), !dbg !44
+  %bc2 = bitcast i64 %803 to <2 x i32>, !dbg !44
+  %806 = extractelement <2 x i32> %bc2, i64 1, !dbg !44
+  %807 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %806, i32 4, i32 31), !dbg !44
+  %808 = insertelement <2 x i32> undef, i32 %805, i64 0, !dbg !44
+  %809 = insertelement <2 x i32> %808, i32 %807, i64 1, !dbg !44
+  %810 = bitcast <2 x i32> %809 to i64, !dbg !44
+  %811 = add i64 %803, %810, !dbg !48
+  %812 = trunc i64 %811 to i32, !dbg !44
+  %813 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %812, i32 2, i32 31), !dbg !44
+  %bc3 = bitcast i64 %811 to <2 x i32>, !dbg !44
+  %814 = extractelement <2 x i32> %bc3, i64 1, !dbg !44
+  %815 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %814, i32 2, i32 31), !dbg !44
+  %816 = insertelement <2 x i32> undef, i32 %813, i64 0, !dbg !44
+  %817 = insertelement <2 x i32> %816, i32 %815, i64 1, !dbg !44
+  %818 = bitcast <2 x i32> %817 to i64, !dbg !44
+  %819 = add i64 %811, %818, !dbg !48
+  %820 = trunc i64 %819 to i32, !dbg !44
+  %821 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %820, i32 1, i32 31), !dbg !44
+  %bc4 = bitcast i64 %819 to <2 x i32>, !dbg !44
+  %822 = extractelement <2 x i32> %bc4, i64 1, !dbg !44
+  %823 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %822, i32 1, i32 31), !dbg !44
+  %824 = insertelement <2 x i32> undef, i32 %821, i64 0, !dbg !44
+  %825 = insertelement <2 x i32> %824, i32 %823, i64 1, !dbg !44
+  %826 = bitcast <2 x i32> %825 to i64, !dbg !44
+  %827 = add i64 %819, %826, !dbg !48
+  %828 = icmp eq i32 %779, 0, !dbg !44
+  %829 = zext nneg i32 %778 to i64, !dbg !44
+  %830 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %829, !dbg !44
+  tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %830, i64 %827, i1 %828) #5, !dbg !44
+  tail call void @llvm.nvvm.barrier0(), !dbg !44
+  %831 = icmp slt i32 %9, 8, !dbg !44
+  %832 = sext i32 %9 to i64, !dbg !44
+  %833 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %832, !dbg !44
+  %834 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %833, i1 %831) #5, !dbg !44
+  %835 = trunc i64 %834 to i32, !dbg !44
+  %836 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %835, i32 4, i32 31), !dbg !44
+  %bc5 = bitcast i64 %834 to <2 x i32>, !dbg !44
+  %837 = extractelement <2 x i32> %bc5, i64 1, !dbg !44
+  %838 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %837, i32 4, i32 31), !dbg !44
+  %839 = insertelement <2 x i32> undef, i32 %836, i64 0, !dbg !44
+  %840 = insertelement <2 x i32> %839, i32 %838, i64 1, !dbg !44
+  %841 = bitcast <2 x i32> %840 to i64, !dbg !44
+  %842 = add i64 %834, %841, !dbg !48
+  %843 = trunc i64 %842 to i32, !dbg !44
+  %844 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %843, i32 2, i32 31), !dbg !44
+  %bc6 = bitcast i64 %842 to <2 x i32>, !dbg !44
+  %845 = extractelement <2 x i32> %bc6, i64 1, !dbg !44
+  %846 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %845, i32 2, i32 31), !dbg !44
+  %847 = insertelement <2 x i32> undef, i32 %844, i64 0, !dbg !44
+  %848 = insertelement <2 x i32> %847, i32 %846, i64 1, !dbg !44
+  %849 = bitcast <2 x i32> %848 to i64, !dbg !44
+  %850 = add i64 %842, %849, !dbg !48
+  %851 = trunc i64 %850 to i32, !dbg !44
+  %852 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %851, i32 1, i32 31), !dbg !44
+  %bc7 = bitcast i64 %850 to <2 x i32>, !dbg !44
+  %853 = extractelement <2 x i32> %bc7, i64 1, !dbg !44
+  %854 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %853, i32 1, i32 31), !dbg !44
+  %855 = insertelement <2 x i32> undef, i32 %852, i64 0, !dbg !44
+  %856 = insertelement <2 x i32> %855, i32 %854, i64 1, !dbg !44
+  %857 = bitcast <2 x i32> %856 to i64, !dbg !44
+  %858 = add i64 %850, %857, !dbg !48
+  %859 = and i32 %9, 7, !dbg !44
+  %860 = icmp eq i32 %859, 0, !dbg !44
+  %861 = and i1 %831, %860, !dbg !44
+  tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %833, i64 %858, i1 %861) #5, !dbg !44
+  tail call void @llvm.nvvm.barrier0(), !dbg !44
+  %862 = load i64, ptr addrspace(3) @global_smem, align 4, !dbg !44
+  tail call void @llvm.nvvm.barrier0(), !dbg !52
+  %863 = insertelement <1 x i64> undef, i64 %862, i64 0, !dbg !52
+  store <1 x i64> %863, ptr addrspace(3) @global_smem, align 8, !dbg !52
+  tail call void @llvm.nvvm.barrier0(), !dbg !52
+  %864 = load i64, ptr addrspace(3) @global_smem, align 8, !dbg !52
+  %865 = getelementptr i64, ptr addrspace(1) %4, i64 %18, !dbg !53
+  %866 = icmp eq i32 %urem, 0, !dbg !54
+  %867 = and i1 %866, %19, !dbg !54
+  tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %864, ptr addrspace(1) %865, i1 %867) #5, !dbg !54
+  tail call void @llvm.nvvm.barrier0(), !dbg !55
+  %shift292 = shufflevector <8 x float> %773, <8 x float> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !57
+  %868 = fadd <8 x float> %773, %shift292, !dbg !57
+  %shift293 = shufflevector <8 x float> %773, <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !57
+  %869 = fadd <8 x float> %shift293, %868, !dbg !57
+  %shift294 = shufflevector <8 x float> %773, <8 x float> poison, <8 x i32> <i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !57
+  %870 = fadd <8 x float> %shift294, %869, !dbg !57
+  %shift295 = shufflevector <8 x float> %773, <8 x float> poison, <8 x i32> <i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !57
+  %871 = fadd <8 x float> %shift295, %870, !dbg !57
+  %shift296 = shufflevector <8 x float> %773, <8 x float> poison, <8 x i32> <i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !57
+  %872 = fadd <8 x float> %shift296, %871, !dbg !57
+  %shift297 = shufflevector <8 x float> %773, <8 x float> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !57
+  %873 = fadd <8 x float> %shift297, %872, !dbg !57
+  %shift298 = shufflevector <8 x float> %773, <8 x float> poison, <8 x i32> <i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !57
+  %874 = fadd <8 x float> %shift298, %873, !dbg !57
+  %875 = extractelement <8 x float> %874, i64 0, !dbg !57
+  %876 = bitcast float %875 to i32, !dbg !55
+  %877 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %876, i32 16, i32 31), !dbg !55
+  %878 = bitcast i32 %877 to float, !dbg !55
+  %879 = fadd float %875, %878, !dbg !57
+  %880 = bitcast float %879 to i32, !dbg !55
+  %881 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %880, i32 8, i32 31), !dbg !55
+  %882 = bitcast i32 %881 to float, !dbg !55
+  %883 = fadd float %879, %882, !dbg !57
+  %884 = bitcast float %883 to i32, !dbg !55
+  %885 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %884, i32 4, i32 31), !dbg !55
+  %886 = bitcast i32 %885 to float, !dbg !55
+  %887 = fadd float %883, %886, !dbg !57
+  %888 = bitcast float %887 to i32, !dbg !55
+  %889 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %888, i32 2, i32 31), !dbg !55
+  %890 = bitcast i32 %889 to float, !dbg !55
+  %891 = fadd float %887, %890, !dbg !57
+  %892 = bitcast float %891 to i32, !dbg !55
+  %893 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %892, i32 1, i32 31), !dbg !55
+  %894 = bitcast i32 %893 to float, !dbg !55
+  %895 = fadd float %891, %894, !dbg !57
+  %896 = getelementptr float, ptr addrspace(3) @global_smem, i64 %829, !dbg !55
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %896, float %895, i1 %828) #5, !dbg !55
+  tail call void @llvm.nvvm.barrier0(), !dbg !55
+  %897 = getelementptr float, ptr addrspace(3) @global_smem, i64 %832, !dbg !55
+  %898 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %897, i1 %831) #5, !dbg !55
+  %899 = bitcast float %898 to i32, !dbg !55
+  %900 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %899, i32 4, i32 31), !dbg !55
+  %901 = bitcast i32 %900 to float, !dbg !55
+  %902 = fadd float %898, %901, !dbg !57
+  %903 = bitcast float %902 to i32, !dbg !55
+  %904 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %903, i32 2, i32 31), !dbg !55
+  %905 = bitcast i32 %904 to float, !dbg !55
+  %906 = fadd float %902, %905, !dbg !57
+  %907 = bitcast float %906 to i32, !dbg !55
+  %908 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %907, i32 1, i32 31), !dbg !55
+  %909 = bitcast i32 %908 to float, !dbg !55
+  %910 = fadd float %906, %909, !dbg !57
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %897, float %910, i1 %861) #5, !dbg !55
+  tail call void @llvm.nvvm.barrier0(), !dbg !55
+  %911 = load i32, ptr addrspace(3) @global_smem, align 4, !dbg !55
+  %912 = getelementptr float, ptr addrspace(1) %5, i64 %18, !dbg !60
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %911, ptr addrspace(1) %912, i1 %867) #5, !dbg !61
+  ret void, !dbg !62
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #1
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2
+
+; Function Attrs: alwaysinline nounwind
+define float @__nv_logf(float %a) local_unnamed_addr #3 {
+__nv_fmaf_rn.exit10.i:
+  %0 = fcmp olt float %a, 0x3810000000000000
+  %1 = fmul float %a, 0x4160000000000000
+  %.02 = select i1 %0, float %1, float %a
+  %i.i.0 = select i1 %0, float -2.300000e+01, float 0.000000e+00
+  %2 = bitcast float %.02 to i32
+  %3 = add i32 %2, -1059760811
+  %4 = and i32 %3, -8388608
+  %5 = sub i32 %2, %4
+  %6 = bitcast i32 %5 to float
+  %7 = sitofp i32 %4 to float
+  %8 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
+  %.not = icmp eq i32 %8, 0
+  %9 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %7, float 0x3E80000000000000, float %i.i.0) #5
+  %10 = tail call float @llvm.nvvm.fma.rn.f(float %7, float 0x3E80000000000000, float %i.i.0) #5
+  %.08 = select i1 %.not, float %10, float %9
+  %11 = fadd float %6, -1.000000e+00
+  %12 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
+  %.not1 = icmp eq i32 %12, 0
+  %13 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %11, float 0x3FC2073EC0000000) #5
+  %14 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %11, float 0x3FC2073EC0000000) #5
+  %.010 = select i1 %.not1, float %14, float %13
+  %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
+  %.not2 = icmp eq i32 %15, 0
+  %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010, float %11, float 0xBFBF19B980000000) #5
+  %17 = tail call float @llvm.nvvm.fma.rn.f(float %.010, float %11, float 0xBFBF19B980000000) #5
+  %.011 = select i1 %.not2, float %17, float %16
+  %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
+  %.not3 = icmp eq i32 %18, 0
+  %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011, float %11, float 0x3FC1E52AA0000000) #5
+  %20 = tail call float @llvm.nvvm.fma.rn.f(float %.011, float %11, float 0x3FC1E52AA0000000) #5
+  %.012 = select i1 %.not3, float %20, float %19
+  %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
+  %.not4 = icmp eq i32 %21, 0
+  %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012, float %11, float 0xBFC55B1720000000) #5
+  %23 = tail call float @llvm.nvvm.fma.rn.f(float %.012, float %11, float 0xBFC55B1720000000) #5
+  %.09 = select i1 %.not4, float %23, float %22
+  %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
+  %.not5 = icmp eq i32 %24, 0
+  %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09, float %11, float 0x3FC99DA160000000) #5
+  %26 = tail call float @llvm.nvvm.fma.rn.f(float %.09, float %11, float 0x3FC99DA160000000) #5
+  %.05 = select i1 %.not5, float %26, float %25
+  %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
+  %.not6 = icmp eq i32 %27, 0
+  %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %11, float 0xBFCFFFE440000000) #5
+  %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %11, float 0xBFCFFFE440000000) #5
+  %.01 = select i1 %.not6, float %29, float %28
+  %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
+  %.not7 = icmp eq i32 %30, 0
+  %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01, float %11, float 0x3FD5554F00000000) #5
+  %32 = tail call float @llvm.nvvm.fma.rn.f(float %.01, float %11, float 0x3FD5554F00000000) #5
+  %.0 = select i1 %.not7, float %32, float %31
+  %33 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
+  %.not8 = icmp eq i32 %33, 0
+  %34 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0, float %11, float -5.000000e-01) #5
+  %35 = tail call float @llvm.nvvm.fma.rn.f(float %.0, float %11, float -5.000000e-01) #5
+  %.07 = select i1 %.not8, float %35, float %34
+  %36 = fmul float %11, %.07
+  %37 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
+  %.not9 = icmp eq i32 %37, 0
+  %38 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %36, float %11, float %11) #5
+  %39 = tail call float @llvm.nvvm.fma.rn.f(float %36, float %11, float %11) #5
+  %.06 = select i1 %.not9, float %39, float %38
+  %40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
+  %.not10 = icmp eq i32 %40, 0
+  %41 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08, float 0x3FE62E4300000000, float %.06) #5
+  %42 = tail call float @llvm.nvvm.fma.rn.f(float %.08, float 0x3FE62E4300000000, float %.06) #5
+  %.04 = select i1 %.not10, float %42, float %41
+  %43 = icmp ugt i32 %2, 2139095039
+  br i1 %43, label %__nv_fmaf_rn.exit.i, label %__internal_accurate_logf.exit
+
+__nv_fmaf_rn.exit.i:                              ; preds = %__nv_fmaf_rn.exit10.i
+  %44 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5
+  %.not11 = icmp eq i32 %44, 0
+  %45 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float 0x7FF0000000000000, float 0x7FF0000000000000) #5
+  %46 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float 0x7FF0000000000000, float 0x7FF0000000000000) #5
+  %.03 = select i1 %.not11, float %46, float %45
+  br label %__internal_accurate_logf.exit
+
+__internal_accurate_logf.exit:                    ; preds = %__nv_fmaf_rn.exit.i, %__nv_fmaf_rn.exit10.i
+  %r.i.0 = phi float [ %.03, %__nv_fmaf_rn.exit.i ], [ %.04, %__nv_fmaf_rn.exit10.i ]
+  %47 = fcmp oeq float %.02, 0.000000e+00
+  %r.i.1 = select i1 %47, float 0xFFF0000000000000, float %r.i.0
+  ret float %r.i.1
+}
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare float @llvm.nvvm.fma.rn.f(float, float, float) #0
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind }
+attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "culwqy52mqs4o2bmqocf2r5plomw2phviv5gutbxlcpdrdkc46ri.py", directory: "/tmp/torchinductor_root/ul")
+!4 = !{ptr @triton__0d1d2d3d4d5d6e7de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2d3d4d5d6e7de, !"maxntidx", i32 256}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6e7de", linkageName: "triton__0d1d2d3d4d5d6e7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 24, column: 33, scope: !7)
+!11 = !DILocation(line: 21, column: 28, scope: !7)
+!12 = !DILocation(line: 21, column: 34, scope: !7)
+!13 = !DILocation(line: 23, column: 21, scope: !7)
+!14 = !DILocation(line: 32, column: 45, scope: !7)
+!15 = !DILocation(line: 47, column: 67, scope: !7)
+!16 = !DILocation(line: 59, column: 48, scope: !7)
+!17 = !DILocation(line: 28, column: 36, scope: !7)
+!18 = !DILocation(line: 29, column: 27, scope: !7)
+!19 = !DILocation(line: 30, column: 25, scope: !7)
+!20 = !DILocation(line: 32, column: 40, scope: !7)
+!21 = !DILocation(line: 32, column: 34, scope: !7)
+!22 = !DILocation(line: 32, column: 59, scope: !7)
+!23 = !DILocation(line: 32, column: 51, scope: !7)
+!24 = !DILocation(line: 33, column: 35, scope: !7)
+!25 = !DILocation(line: 33, column: 52, scope: !7)
+!26 = !DILocation(line: 34, column: 35, scope: !7)
+!27 = !DILocation(line: 34, column: 52, scope: !7)
+!28 = !DILocation(line: 36, column: 23, scope: !7)
+!29 = !DILocation(line: 42, column: 36, scope: !7)
+!30 = !DILocation(line: 40, column: 46, scope: !7)
+!31 = !DILocation(line: 46, column: 41, scope: !7)
+!32 = !DILocation(line: 43, column: 22, scope: !7)
+!33 = !DILocation(line: 44, column: 23, scope: !7)
+!34 = !DILocation(line: 45, column: 38, scope: !7)
+!35 = !DILocation(line: 46, column: 57, scope: !7)
+!36 = !DILocation(line: 47, column: 50, scope: !7)
+!37 = !DILocation(line: 47, column: 35, scope: !7)
+!38 = !DILocation(line: 47, column: 73, scope: !7)
+!39 = !DILocation(line: 47, column: 132, scope: !7)
+!40 = !DILocation(line: 49, column: 24, scope: !7)
+!41 = !DILocation(line: 50, column: 23, scope: !7)
+!42 = !DILocation(line: 54, column: 17, scope: !7)
+!43 = !DILocation(line: 56, column: 38, scope: !7)
+!44 = !DILocation(line: 243, column: 36, scope: !45, inlinedAt: !47)
+!45 = distinct !DILexicalBlockFile(scope: !7, file: !46, discriminator: 0)
+!46 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!47 = !DILocation(line: 60, column: 25, scope: !45)
+!48 = !DILocation(line: 233, column: 15, scope: !49, inlinedAt: !50)
+!49 = distinct !DILexicalBlockFile(scope: !45, file: !46, discriminator: 0)
+!50 = !DILocation(line: 243, column: 36, scope: !49, inlinedAt: !51)
+!51 = !DILocation(line: 60, column: 25, scope: !49)
+!52 = !DILocation(line: 60, column: 28, scope: !7)
+!53 = !DILocation(line: 61, column: 25, scope: !7)
+!54 = !DILocation(line: 61, column: 36, scope: !7)
+!55 = !DILocation(line: 243, column: 36, scope: !45, inlinedAt: !56)
+!56 = !DILocation(line: 62, column: 27, scope: !45)
+!57 = !DILocation(line: 233, column: 15, scope: !49, inlinedAt: !58)
+!58 = !DILocation(line: 243, column: 36, scope: !49, inlinedAt: !59)
+!59 = !DILocation(line: 62, column: 27, scope: !49)
+!60 = !DILocation(line: 63, column: 25, scope: !7)
+!61 = !DILocation(line: 63, column: 37, scope: !7)
+!62 = !DILocation(line: 63, column: 4, scope: !7)
diff --git a/.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttir b/.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..86ef92a6245097c1c78b49c4b528cdf41bd3c2e3
--- /dev/null
+++ b/.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttir
@@ -0,0 +1,89 @@
+module {
+  tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c512_i32 = arith.constant 512 : i32
+    %c256_i32 = arith.constant 256 : i32
+    %cst = arith.constant 0.000000e+00 : f32
+    %cst_0 = arith.constant 2.560000e+02 : f32
+    %cst_1 = arith.constant 9.99999974E-6 : f32
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32>
+    %cst_3 = arith.constant dense<256> : tensor<1xi64>
+    %cst_4 = arith.constant dense<50257> : tensor<1xi64>
+    %cst_5 = arith.constant dense<0> : tensor<1xi64>
+    %cst_6 = arith.constant dense<256> : tensor<256xi32>
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
+    %2 = arith.cmpi slt, %1, %cst_6 : tensor<256xi32>
+    %3 = arith.remsi %0, %c512_i32 : i32
+    %4 = tt.addptr %arg1, %0 : !tt.ptr<i64, 1>, i32
+    %5 = tt.splat %4 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>>
+    %6 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64>
+    %7 = arith.muli %3, %c256_i32 : i32
+    %8 = tt.splat %7 : (i32) -> tensor<256xi32>
+    %9 = arith.addi %1, %8 : tensor<256xi32>
+    %10 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %11 = tt.addptr %10, %9 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    %12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
+    %13 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    %15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
+    %16 = arith.addi %6, %cst_4 : tensor<1xi64>
+    %17 = arith.cmpi slt, %6, %cst_5 : tensor<1xi64>
+    %18 = arith.select %17, %16, %6 : tensor<1xi1>, tensor<1xi64>
+    %19 = arith.cmpi sge, %18, %cst_5 : tensor<1xi64>
+    %20 = arith.cmpi slt, %18, %cst_4 : tensor<1xi64>
+    %21 = arith.andi %19, %20 : tensor<1xi1>
+    tt.assert %21, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1xi1>
+    %22 = arith.muli %18, %cst_3 : tensor<1xi64>
+    %23 = tt.broadcast %22 : (tensor<1xi64>) -> tensor<256xi64>
+    %24 = arith.extsi %1 : tensor<256xi32> to tensor<256xi64>
+    %25 = arith.addi %24, %23 : tensor<256xi64>
+    %26 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %27 = tt.addptr %26, %25 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi64>
+    %28 = tt.load %27, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
+    %29 = arith.addf %28, %12 : tensor<256xf32>
+    %30 = arith.select %2, %29, %cst_2 : tensor<256xi1>, tensor<256xf32>
+    %31 = "tt.reduce"(%30) <{axis = 0 : i32}> ({
+    ^bb0(%arg10: f32, %arg11: f32):
+      %60 = arith.addf %arg10, %arg11 : f32
+      tt.reduce.return %60 : f32
+    }) : (tensor<256xf32>) -> f32
+    %32 = arith.addf %31, %cst : f32
+    %33 = arith.divf %32, %cst_0 : f32
+    %34 = tt.splat %33 : (f32) -> tensor<1xf32>
+    %35 = tt.splat %33 : (f32) -> tensor<256xf32>
+    %36 = arith.subf %29, %35 : tensor<256xf32>
+    %37 = arith.mulf %36, %36 : tensor<256xf32>
+    %38 = arith.select %2, %37, %cst_2 : tensor<256xi1>, tensor<256xf32>
+    %39 = "tt.reduce"(%38) <{axis = 0 : i32}> ({
+    ^bb0(%arg10: f32, %arg11: f32):
+      %60 = arith.addf %arg10, %arg11 : f32
+      tt.reduce.return %60 : f32
+    }) : (tensor<256xf32>) -> f32
+    %40 = arith.addf %39, %cst : f32
+    %41 = arith.divf %40, %cst_0 : f32
+    %42 = arith.addf %41, %cst_1 : f32
+    %43 = tt.extern_elementwise %42 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+    %44 = tt.splat %43 : (f32) -> tensor<1xf32>
+    %45 = tt.splat %43 : (f32) -> tensor<256xf32>
+    %46 = arith.mulf %36, %45 : tensor<256xf32>
+    %47 = arith.mulf %46, %15 : tensor<256xf32>
+    %48 = arith.muli %0, %c256_i32 : i32
+    %49 = tt.splat %48 : (i32) -> tensor<256xi32>
+    %50 = arith.addi %1, %49 : tensor<256xi32>
+    %51 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %52 = tt.addptr %51, %50 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    tt.store %52, %29, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
+    gpu.barrier
+    %53 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
+    %54 = tt.splat %53 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
+    tt.store %54, %44 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
+    %55 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %56 = tt.addptr %55, %50 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %57 = arith.truncf %47 : tensor<256xf32> to tensor<256xbf16>
+    tt.store %56, %57, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
+    %58 = tt.addptr %arg6, %0 : !tt.ptr<f32, 1>, i32
+    %59 = tt.splat %58 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
+    tt.store %59, %34 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
+    tt.return
+  }
+}
diff --git a/.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttgir b/.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..2bd2301e9a3b5eb349019c799bcb251b88fc0250
--- /dev/null
+++ b/.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttgir
@@ -0,0 +1,18 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<512> : tensor<128xi32, #blocked>
+    %c128_i32 = arith.constant 128 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c128_i32 : i32
+    %2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked>
+    %3 = tt.splat %1 : (i32) -> tensor<128xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<128xi32, #blocked>
+    %5 = arith.cmpi slt, %4, %cst : tensor<128xi32, #blocked>
+    %6 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<128x!tt.ptr<i64, 1>, #blocked>
+    %7 = tt.addptr %6, %4 : tensor<128x!tt.ptr<i64, 1>, #blocked>, tensor<128xi32, #blocked>
+    %8 = arith.extsi %4 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked>
+    tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<128xi64, #blocked>
+    tt.return
+  }
+}
diff --git a/.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttir b/.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..a75947b0f6f2cf2e2189442beac7fe831ec6e19d
--- /dev/null
+++ b/.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttir
@@ -0,0 +1,17 @@
+module {
+  tt.func public @triton__0d1de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<512> : tensor<256xi32>
+    %c256_i32 = arith.constant 256 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c256_i32 : i32
+    %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<256xi32>
+    %4 = arith.addi %3, %2 : tensor<256xi32>
+    %5 = arith.cmpi slt, %4, %cst : tensor<256xi32>
+    %6 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<256x!tt.ptr<i64, 1>>
+    %7 = tt.addptr %6, %4 : tensor<256x!tt.ptr<i64, 1>>, tensor<256xi32>
+    %8 = arith.extsi %4 : tensor<256xi32> to tensor<256xi64>
+    tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<256xi64>
+    tt.return
+  }
+}
diff --git a/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..e7d92bdaf36a0219dbb4ae6781b8e373cb122b90
Binary files /dev/null and b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin differ
diff --git a/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ptx b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..24dcef2f99da30f8c2c81b89268719ce32694c66
--- /dev/null
+++ b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ptx
@@ -0,0 +1,861 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2de
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
+
+.visible .entry triton__0d1d2de(
+	.param .u64 triton__0d1d2de_param_0,
+	.param .u64 triton__0d1d2de_param_1,
+	.param .u32 triton__0d1d2de_param_2
+)
+.maxntid 128, 1, 1
+{
+	.reg .pred 	%p<28>;
+	.reg .b16 	%rs<25>;
+	.reg .b32 	%r<79>;
+	.reg .f32 	%f<487>;
+	.reg .b64 	%rd<8>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd4, [triton__0d1d2de_param_0];
+	ld.param.u64 	%rd5, [triton__0d1d2de_param_1];
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r26, %tid.x;
+	shl.b32 	%r27, %r26, 3;
+	and.b32  	%r28, %r27, 1016;
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	shl.b32 	%r29, %r1, 10;
+	.loc	1 21 23
+	or.b32  	%r30, %r29, %r28;
+	.loc	1 24 34
+	mul.wide.s32 	%rd6, %r30, 2;
+	add.s64 	%rd7, %rd4, %rd6;
+	mov.pred 	%p1, -1;
+	.loc	1 24 39
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd7 + 0 ];
+	.loc	1 25 30
+	add.s64 	%rd3, %rd5, %rd6;
+	.loc	1 25 35
+	mov.u32 %r14, 0x0;
+	mov.u32 %r15, 0x0;
+	mov.u32 %r16, 0x0;
+	mov.u32 %r17, 0x0;
+	@%p1 ld.global.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd3 + 0 ];
+	cvt.u16.u32 	%rs9, %r14;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r14; }
+	cvt.u16.u32 	%rs11, %r15;
+	.loc	1 25 44
+	cvt.f32.bf16 %r18, %rs9;
+	mov.b32 	%f9, %r18;
+	cvt.f32.bf16 %r19, %rs10;
+	mov.b32 	%f10, %r19;
+	.loc	1 29 18
+	mul.f32 	%f17, %f9, 0f3F3504F3;
+	.loc	1 30 23
+	abs.ftz.f32 	%f25, %f17;
+	setp.ge.f32 	%p3, %f25, 0f3F8060FE;
+	mov.f32 	%f421, 0f3789CA3C;
+	mov.f32 	%f420, 0fB9F560B9;
+	mov.f32 	%f419, 0f3BAC840B;
+	mov.f32 	%f418, 0fBD0C8162;
+	mov.f32 	%f417, 0f3E1CF906;
+	mov.f32 	%f416, 0f3F6A937E;
+	mov.f32 	%f415, 0f3F20D842;
+	mov.f32 	%f422, %f25;
+	@%p3 bra 	$L__BB0_2;
+	.loc	1 0 23
+	mov.f32 	%f421, 0f38B1E96A;
+	mov.f32 	%f420, 0fBA574D20;
+	mov.f32 	%f419, 0f3BAAD5EA;
+	mov.f32 	%f418, 0fBCDC1BE7;
+	mov.f32 	%f417, 0f3DE718AF;
+	mov.f32 	%f416, 0fBEC093AC;
+	mov.f32 	%f415, 0f3E0375D3;
+	.loc	1 30 23
+	mul.f32 	%f422, %f17, %f17;
+$L__BB0_2:
+	.loc	1 0 0
+	cvt.f32.bf16 %r20, %rs11;
+	mul.f32 	%f18, %f10, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p4, %f25, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f143, %f421, %f422, %f420;
+	fma.rn.ftz.f32 	%f144, %f143, %f422, %f419;
+	fma.rn.ftz.f32 	%f145, %f144, %f422, %f418;
+	fma.rn.ftz.f32 	%f146, %f145, %f422, %f417;
+	fma.rn.ftz.f32 	%f147, %f146, %f422, %f416;
+	fma.rn.ftz.f32 	%f148, %f147, %f422, %f415;
+	neg.f32 	%f149, %f422;
+	selp.f32 	%f150, %f149, %f17, %p3;
+	fma.rn.ftz.f32 	%f423, %f148, %f150, %f150;
+	mov.f32 	%f414, 0f3F800000;
+	@%p4 bra 	$L__BB0_4;
+	ex2.approx.ftz.f32 	%f151, %f423;
+	sub.f32 	%f153, %f414, %f151;
+	mov.b32 	%r31, %f153;
+	mov.b32 	%r32, %f17;
+	and.b32  	%r33, %r32, -2147483648;
+	or.b32  	%r34, %r33, %r31;
+	mov.b32 	%f423, %r34;
+$L__BB0_4:
+	.loc	1 0 0
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r15; }
+	mov.b32 	%f11, %r20;
+	.loc	1 30 23
+	abs.ftz.f32 	%f38, %f18;
+	setp.ge.f32 	%p6, %f38, 0f3F8060FE;
+	mov.f32 	%f430, 0f3789CA3C;
+	mov.f32 	%f429, 0fB9F560B9;
+	mov.f32 	%f428, 0f3BAC840B;
+	mov.f32 	%f427, 0fBD0C8162;
+	mov.f32 	%f426, 0f3E1CF906;
+	mov.f32 	%f425, 0f3F6A937E;
+	mov.f32 	%f424, 0f3F20D842;
+	mov.f32 	%f431, %f38;
+	@%p6 bra 	$L__BB0_6;
+	mul.f32 	%f431, %f18, %f18;
+	mov.f32 	%f430, 0f38B1E96A;
+	mov.f32 	%f429, 0fBA574D20;
+	mov.f32 	%f428, 0f3BAAD5EA;
+	mov.f32 	%f427, 0fBCDC1BE7;
+	mov.f32 	%f426, 0f3DE718AF;
+	mov.f32 	%f425, 0fBEC093AC;
+	mov.f32 	%f424, 0f3E0375D3;
+$L__BB0_6:
+	.loc	1 0 0
+	cvt.f32.bf16 %r21, %rs12;
+	mul.f32 	%f19, %f11, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p7, %f38, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f168, %f430, %f431, %f429;
+	fma.rn.ftz.f32 	%f169, %f168, %f431, %f428;
+	fma.rn.ftz.f32 	%f170, %f169, %f431, %f427;
+	fma.rn.ftz.f32 	%f171, %f170, %f431, %f426;
+	fma.rn.ftz.f32 	%f172, %f171, %f431, %f425;
+	fma.rn.ftz.f32 	%f173, %f172, %f431, %f424;
+	neg.f32 	%f174, %f431;
+	selp.f32 	%f175, %f174, %f18, %p6;
+	fma.rn.ftz.f32 	%f432, %f173, %f175, %f175;
+	@%p7 bra 	$L__BB0_8;
+	ex2.approx.ftz.f32 	%f176, %f432;
+	sub.f32 	%f178, %f414, %f176;
+	mov.b32 	%r35, %f178;
+	mov.b32 	%r36, %f18;
+	and.b32  	%r37, %r36, -2147483648;
+	or.b32  	%r38, %r37, %r35;
+	mov.b32 	%f432, %r38;
+$L__BB0_8:
+	.loc	1 0 0
+	cvt.u16.u32 	%rs13, %r16;
+	mov.b32 	%f12, %r21;
+	.loc	1 30 23
+	abs.ftz.f32 	%f51, %f19;
+	setp.ge.f32 	%p9, %f51, 0f3F8060FE;
+	mov.f32 	%f439, 0f3789CA3C;
+	mov.f32 	%f438, 0fB9F560B9;
+	mov.f32 	%f437, 0f3BAC840B;
+	mov.f32 	%f436, 0fBD0C8162;
+	mov.f32 	%f435, 0f3E1CF906;
+	mov.f32 	%f434, 0f3F6A937E;
+	mov.f32 	%f433, 0f3F20D842;
+	mov.f32 	%f440, %f51;
+	@%p9 bra 	$L__BB0_10;
+	mul.f32 	%f440, %f19, %f19;
+	mov.f32 	%f439, 0f38B1E96A;
+	mov.f32 	%f438, 0fBA574D20;
+	mov.f32 	%f437, 0f3BAAD5EA;
+	mov.f32 	%f436, 0fBCDC1BE7;
+	mov.f32 	%f435, 0f3DE718AF;
+	mov.f32 	%f434, 0fBEC093AC;
+	mov.f32 	%f433, 0f3E0375D3;
+$L__BB0_10:
+	.loc	1 0 0
+	cvt.f32.bf16 %r22, %rs13;
+	mul.f32 	%f20, %f12, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p10, %f51, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f193, %f439, %f440, %f438;
+	fma.rn.ftz.f32 	%f194, %f193, %f440, %f437;
+	fma.rn.ftz.f32 	%f195, %f194, %f440, %f436;
+	fma.rn.ftz.f32 	%f196, %f195, %f440, %f435;
+	fma.rn.ftz.f32 	%f197, %f196, %f440, %f434;
+	fma.rn.ftz.f32 	%f198, %f197, %f440, %f433;
+	neg.f32 	%f199, %f440;
+	selp.f32 	%f200, %f199, %f19, %p9;
+	fma.rn.ftz.f32 	%f441, %f198, %f200, %f200;
+	@%p10 bra 	$L__BB0_12;
+	ex2.approx.ftz.f32 	%f201, %f441;
+	sub.f32 	%f203, %f414, %f201;
+	mov.b32 	%r39, %f203;
+	mov.b32 	%r40, %f19;
+	and.b32  	%r41, %r40, -2147483648;
+	or.b32  	%r42, %r41, %r39;
+	mov.b32 	%f441, %r42;
+$L__BB0_12:
+	.loc	1 0 0
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r16; }
+	mov.b32 	%f13, %r22;
+	.loc	1 30 23
+	abs.ftz.f32 	%f64, %f20;
+	setp.ge.f32 	%p12, %f64, 0f3F8060FE;
+	mov.f32 	%f448, 0f3789CA3C;
+	mov.f32 	%f447, 0fB9F560B9;
+	mov.f32 	%f446, 0f3BAC840B;
+	mov.f32 	%f445, 0fBD0C8162;
+	mov.f32 	%f444, 0f3E1CF906;
+	mov.f32 	%f443, 0f3F6A937E;
+	mov.f32 	%f442, 0f3F20D842;
+	mov.f32 	%f449, %f64;
+	@%p12 bra 	$L__BB0_14;
+	mul.f32 	%f449, %f20, %f20;
+	mov.f32 	%f448, 0f38B1E96A;
+	mov.f32 	%f447, 0fBA574D20;
+	mov.f32 	%f446, 0f3BAAD5EA;
+	mov.f32 	%f445, 0fBCDC1BE7;
+	mov.f32 	%f444, 0f3DE718AF;
+	mov.f32 	%f443, 0fBEC093AC;
+	mov.f32 	%f442, 0f3E0375D3;
+$L__BB0_14:
+	.loc	1 0 0
+	cvt.f32.bf16 %r23, %rs14;
+	mul.f32 	%f21, %f13, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p13, %f64, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f218, %f448, %f449, %f447;
+	fma.rn.ftz.f32 	%f219, %f218, %f449, %f446;
+	fma.rn.ftz.f32 	%f220, %f219, %f449, %f445;
+	fma.rn.ftz.f32 	%f221, %f220, %f449, %f444;
+	fma.rn.ftz.f32 	%f222, %f221, %f449, %f443;
+	fma.rn.ftz.f32 	%f223, %f222, %f449, %f442;
+	neg.f32 	%f224, %f449;
+	selp.f32 	%f225, %f224, %f20, %p12;
+	fma.rn.ftz.f32 	%f450, %f223, %f225, %f225;
+	@%p13 bra 	$L__BB0_16;
+	ex2.approx.ftz.f32 	%f226, %f450;
+	sub.f32 	%f228, %f414, %f226;
+	mov.b32 	%r43, %f228;
+	mov.b32 	%r44, %f20;
+	and.b32  	%r45, %r44, -2147483648;
+	or.b32  	%r46, %r45, %r43;
+	mov.b32 	%f450, %r46;
+$L__BB0_16:
+	.loc	1 0 0
+	cvt.u16.u32 	%rs15, %r17;
+	mov.b32 	%f14, %r23;
+	.loc	1 30 23
+	abs.ftz.f32 	%f77, %f21;
+	setp.ge.f32 	%p15, %f77, 0f3F8060FE;
+	mov.f32 	%f457, 0f3789CA3C;
+	mov.f32 	%f456, 0fB9F560B9;
+	mov.f32 	%f455, 0f3BAC840B;
+	mov.f32 	%f454, 0fBD0C8162;
+	mov.f32 	%f453, 0f3E1CF906;
+	mov.f32 	%f452, 0f3F6A937E;
+	mov.f32 	%f451, 0f3F20D842;
+	mov.f32 	%f458, %f77;
+	@%p15 bra 	$L__BB0_18;
+	mul.f32 	%f458, %f21, %f21;
+	mov.f32 	%f457, 0f38B1E96A;
+	mov.f32 	%f456, 0fBA574D20;
+	mov.f32 	%f455, 0f3BAAD5EA;
+	mov.f32 	%f454, 0fBCDC1BE7;
+	mov.f32 	%f453, 0f3DE718AF;
+	mov.f32 	%f452, 0fBEC093AC;
+	mov.f32 	%f451, 0f3E0375D3;
+$L__BB0_18:
+	.loc	1 0 0
+	cvt.f32.bf16 %r24, %rs15;
+	mul.f32 	%f22, %f14, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p16, %f77, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f243, %f457, %f458, %f456;
+	fma.rn.ftz.f32 	%f244, %f243, %f458, %f455;
+	fma.rn.ftz.f32 	%f245, %f244, %f458, %f454;
+	fma.rn.ftz.f32 	%f246, %f245, %f458, %f453;
+	fma.rn.ftz.f32 	%f247, %f246, %f458, %f452;
+	fma.rn.ftz.f32 	%f248, %f247, %f458, %f451;
+	neg.f32 	%f249, %f458;
+	selp.f32 	%f250, %f249, %f21, %p15;
+	fma.rn.ftz.f32 	%f459, %f248, %f250, %f250;
+	@%p16 bra 	$L__BB0_20;
+	ex2.approx.ftz.f32 	%f251, %f459;
+	sub.f32 	%f253, %f414, %f251;
+	mov.b32 	%r47, %f253;
+	mov.b32 	%r48, %f21;
+	and.b32  	%r49, %r48, -2147483648;
+	or.b32  	%r50, %r49, %r47;
+	mov.b32 	%f459, %r50;
+$L__BB0_20:
+	.loc	1 0 0
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r17; }
+	mov.b32 	%f15, %r24;
+	.loc	1 30 23
+	abs.ftz.f32 	%f90, %f22;
+	setp.ge.f32 	%p18, %f90, 0f3F8060FE;
+	mov.f32 	%f466, 0f3789CA3C;
+	mov.f32 	%f465, 0fB9F560B9;
+	mov.f32 	%f464, 0f3BAC840B;
+	mov.f32 	%f463, 0fBD0C8162;
+	mov.f32 	%f462, 0f3E1CF906;
+	mov.f32 	%f461, 0f3F6A937E;
+	mov.f32 	%f460, 0f3F20D842;
+	mov.f32 	%f467, %f90;
+	@%p18 bra 	$L__BB0_22;
+	mul.f32 	%f467, %f22, %f22;
+	mov.f32 	%f466, 0f38B1E96A;
+	mov.f32 	%f465, 0fBA574D20;
+	mov.f32 	%f464, 0f3BAAD5EA;
+	mov.f32 	%f463, 0fBCDC1BE7;
+	mov.f32 	%f462, 0f3DE718AF;
+	mov.f32 	%f461, 0fBEC093AC;
+	mov.f32 	%f460, 0f3E0375D3;
+$L__BB0_22:
+	.loc	1 0 0
+	cvt.f32.bf16 %r25, %rs16;
+	mul.f32 	%f23, %f15, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p19, %f90, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f268, %f466, %f467, %f465;
+	fma.rn.ftz.f32 	%f269, %f268, %f467, %f464;
+	fma.rn.ftz.f32 	%f270, %f269, %f467, %f463;
+	fma.rn.ftz.f32 	%f271, %f270, %f467, %f462;
+	fma.rn.ftz.f32 	%f272, %f271, %f467, %f461;
+	fma.rn.ftz.f32 	%f273, %f272, %f467, %f460;
+	neg.f32 	%f274, %f467;
+	selp.f32 	%f275, %f274, %f22, %p18;
+	fma.rn.ftz.f32 	%f468, %f273, %f275, %f275;
+	@%p19 bra 	$L__BB0_24;
+	ex2.approx.ftz.f32 	%f276, %f468;
+	sub.f32 	%f278, %f414, %f276;
+	mov.b32 	%r51, %f278;
+	mov.b32 	%r52, %f22;
+	and.b32  	%r53, %r52, -2147483648;
+	or.b32  	%r54, %r53, %r51;
+	mov.b32 	%f468, %r54;
+$L__BB0_24:
+	.loc	1 0 0
+	mov.b32 	%f16, %r25;
+	.loc	1 30 23
+	abs.ftz.f32 	%f103, %f23;
+	setp.ge.f32 	%p21, %f103, 0f3F8060FE;
+	mov.f32 	%f475, 0f3789CA3C;
+	mov.f32 	%f474, 0fB9F560B9;
+	mov.f32 	%f473, 0f3BAC840B;
+	mov.f32 	%f472, 0fBD0C8162;
+	mov.f32 	%f471, 0f3E1CF906;
+	mov.f32 	%f470, 0f3F6A937E;
+	mov.f32 	%f469, 0f3F20D842;
+	mov.f32 	%f476, %f103;
+	@%p21 bra 	$L__BB0_26;
+	mul.f32 	%f476, %f23, %f23;
+	mov.f32 	%f475, 0f38B1E96A;
+	mov.f32 	%f474, 0fBA574D20;
+	mov.f32 	%f473, 0f3BAAD5EA;
+	mov.f32 	%f472, 0fBCDC1BE7;
+	mov.f32 	%f471, 0f3DE718AF;
+	mov.f32 	%f470, 0fBEC093AC;
+	mov.f32 	%f469, 0f3E0375D3;
+$L__BB0_26:
+	.loc	1 0 0
+	cvt.u16.u32 	%rs1, %r2;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
+	cvt.u16.u32 	%rs3, %r3;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
+	cvt.u16.u32 	%rs5, %r4;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
+	cvt.u16.u32 	%rs7, %r5;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
+	mul.f32 	%f24, %f16, 0f3F3504F3;
+	.loc	1 30 23
+	setp.ltu.f32 	%p22, %f103, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f293, %f475, %f476, %f474;
+	fma.rn.ftz.f32 	%f294, %f293, %f476, %f473;
+	fma.rn.ftz.f32 	%f295, %f294, %f476, %f472;
+	fma.rn.ftz.f32 	%f296, %f295, %f476, %f471;
+	fma.rn.ftz.f32 	%f297, %f296, %f476, %f470;
+	fma.rn.ftz.f32 	%f298, %f297, %f476, %f469;
+	neg.f32 	%f299, %f476;
+	selp.f32 	%f300, %f299, %f23, %p21;
+	fma.rn.ftz.f32 	%f477, %f298, %f300, %f300;
+	@%p22 bra 	$L__BB0_28;
+	ex2.approx.ftz.f32 	%f301, %f477;
+	sub.f32 	%f303, %f414, %f301;
+	mov.b32 	%r55, %f303;
+	mov.b32 	%r56, %f23;
+	and.b32  	%r57, %r56, -2147483648;
+	or.b32  	%r58, %r57, %r55;
+	mov.b32 	%f477, %r58;
+$L__BB0_28:
+	.loc	1 0 0
+	cvt.f32.bf16 %r6, %rs1;
+	cvt.f32.bf16 %r7, %rs2;
+	cvt.f32.bf16 %r8, %rs3;
+	cvt.f32.bf16 %r9, %rs4;
+	cvt.f32.bf16 %r10, %rs5;
+	cvt.f32.bf16 %r11, %rs6;
+	cvt.f32.bf16 %r12, %rs7;
+	cvt.f32.bf16 %r13, %rs8;
+	.loc	1 30 23
+	abs.ftz.f32 	%f116, %f24;
+	setp.ge.f32 	%p24, %f116, 0f3F8060FE;
+	mov.f32 	%f484, 0f3789CA3C;
+	mov.f32 	%f483, 0fB9F560B9;
+	mov.f32 	%f482, 0f3BAC840B;
+	mov.f32 	%f481, 0fBD0C8162;
+	mov.f32 	%f480, 0f3E1CF906;
+	mov.f32 	%f479, 0f3F6A937E;
+	mov.f32 	%f478, 0f3F20D842;
+	mov.f32 	%f485, %f116;
+	@%p24 bra 	$L__BB0_30;
+	mul.f32 	%f485, %f24, %f24;
+	mov.f32 	%f484, 0f38B1E96A;
+	mov.f32 	%f483, 0fBA574D20;
+	mov.f32 	%f482, 0f3BAAD5EA;
+	mov.f32 	%f481, 0fBCDC1BE7;
+	mov.f32 	%f480, 0f3DE718AF;
+	mov.f32 	%f479, 0fBEC093AC;
+	mov.f32 	%f478, 0f3E0375D3;
+$L__BB0_30:
+	.loc	1 0 0
+	mov.b32 	%f1, %r6;
+	mov.b32 	%f2, %r7;
+	mov.b32 	%f3, %r8;
+	mov.b32 	%f4, %r9;
+	mov.b32 	%f5, %r10;
+	mov.b32 	%f6, %r11;
+	mov.b32 	%f7, %r12;
+	mov.b32 	%f8, %r13;
+	.loc	1 30 23
+	setp.ltu.f32 	%p25, %f116, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f318, %f484, %f485, %f483;
+	fma.rn.ftz.f32 	%f319, %f318, %f485, %f482;
+	fma.rn.ftz.f32 	%f320, %f319, %f485, %f481;
+	fma.rn.ftz.f32 	%f321, %f320, %f485, %f480;
+	fma.rn.ftz.f32 	%f322, %f321, %f485, %f479;
+	fma.rn.ftz.f32 	%f323, %f322, %f485, %f478;
+	neg.f32 	%f324, %f485;
+	selp.f32 	%f325, %f324, %f24, %p24;
+	fma.rn.ftz.f32 	%f486, %f323, %f325, %f325;
+	@%p25 bra 	$L__BB0_32;
+	ex2.approx.ftz.f32 	%f326, %f486;
+	sub.f32 	%f328, %f414, %f326;
+	mov.b32 	%r59, %f328;
+	mov.b32 	%r60, %f24;
+	and.b32  	%r61, %r60, -2147483648;
+	or.b32  	%r62, %r61, %r59;
+	mov.b32 	%f486, %r62;
+$L__BB0_32:
+	.loc	1 32 18
+	add.f32 	%f345, %f423, 0f3F800000;
+	add.f32 	%f346, %f432, 0f3F800000;
+	add.f32 	%f347, %f441, 0f3F800000;
+	add.f32 	%f348, %f450, 0f3F800000;
+	add.f32 	%f349, %f459, 0f3F800000;
+	add.f32 	%f350, %f468, 0f3F800000;
+	add.f32 	%f351, %f477, 0f3F800000;
+	add.f32 	%f352, %f486, 0f3F800000;
+	.loc	1 35 19
+	mul.f32 	%f353, %f9, %f9;
+	mul.f32 	%f354, %f10, %f10;
+	mul.f32 	%f355, %f11, %f11;
+	mul.f32 	%f356, %f12, %f12;
+	mul.f32 	%f357, %f13, %f13;
+	mul.f32 	%f358, %f14, %f14;
+	mul.f32 	%f359, %f15, %f15;
+	mul.f32 	%f360, %f16, %f16;
+	.loc	1 37 20
+	mul.f32 	%f361, %f353, 0fBF000000;
+	mul.f32 	%f362, %f354, 0fBF000000;
+	mul.f32 	%f363, %f355, 0fBF000000;
+	mul.f32 	%f364, %f356, 0fBF000000;
+	mul.f32 	%f365, %f357, 0fBF000000;
+	mul.f32 	%f366, %f358, 0fBF000000;
+	mul.f32 	%f367, %f359, 0fBF000000;
+	mul.f32 	%f368, %f360, 0fBF000000;
+	.loc	1 38 19
+	mul.f32 	%f330, %f361, 0f3FB8AA3B;
+	ex2.approx.f32 %f329, %f330;
+	mul.f32 	%f332, %f362, 0f3FB8AA3B;
+	ex2.approx.f32 %f331, %f332;
+	mul.f32 	%f334, %f363, 0f3FB8AA3B;
+	ex2.approx.f32 %f333, %f334;
+	mul.f32 	%f336, %f364, 0f3FB8AA3B;
+	ex2.approx.f32 %f335, %f336;
+	mul.f32 	%f338, %f365, 0f3FB8AA3B;
+	ex2.approx.f32 %f337, %f338;
+	mul.f32 	%f340, %f366, 0f3FB8AA3B;
+	ex2.approx.f32 %f339, %f340;
+	mul.f32 	%f342, %f367, 0f3FB8AA3B;
+	ex2.approx.f32 %f341, %f342;
+	mul.f32 	%f344, %f368, 0f3FB8AA3B;
+	ex2.approx.f32 %f343, %f344;
+	.loc	1 40 20
+	mul.f32 	%f369, %f329, 0f3ECC422A;
+	mul.f32 	%f370, %f331, 0f3ECC422A;
+	mul.f32 	%f371, %f333, 0f3ECC422A;
+	mul.f32 	%f372, %f335, 0f3ECC422A;
+	mul.f32 	%f373, %f337, 0f3ECC422A;
+	mul.f32 	%f374, %f339, 0f3ECC422A;
+	mul.f32 	%f375, %f341, 0f3ECC422A;
+	mul.f32 	%f376, %f343, 0f3ECC422A;
+	.loc	1 41 19
+	mul.f32 	%f377, %f9, %f369;
+	mul.f32 	%f378, %f10, %f370;
+	mul.f32 	%f379, %f11, %f371;
+	mul.f32 	%f380, %f12, %f372;
+	mul.f32 	%f381, %f13, %f373;
+	mul.f32 	%f382, %f14, %f374;
+	mul.f32 	%f383, %f15, %f375;
+	mul.f32 	%f384, %f16, %f376;
+	.loc	1 42 20
+	fma.rn.f32 	%f385, %f345, 0f3F000000, %f377;
+	fma.rn.f32 	%f386, %f346, 0f3F000000, %f378;
+	fma.rn.f32 	%f387, %f347, 0f3F000000, %f379;
+	fma.rn.f32 	%f388, %f348, 0f3F000000, %f380;
+	fma.rn.f32 	%f389, %f349, 0f3F000000, %f381;
+	fma.rn.f32 	%f390, %f350, 0f3F000000, %f382;
+	fma.rn.f32 	%f391, %f351, 0f3F000000, %f383;
+	fma.rn.f32 	%f392, %f352, 0f3F000000, %f384;
+	.loc	1 43 19
+	mul.f32 	%f393, %f1, %f385;
+	mul.f32 	%f394, %f2, %f386;
+	mul.f32 	%f395, %f3, %f387;
+	mul.f32 	%f396, %f4, %f388;
+	mul.f32 	%f397, %f5, %f389;
+	mul.f32 	%f398, %f6, %f390;
+	mul.f32 	%f399, %f7, %f391;
+	mul.f32 	%f400, %f8, %f392;
+	.loc	1 45 40
+	mov.b32 	%r63, %f393;
+	cvt.rn.bf16.f32 %rs17, %r63;
+	mov.b32 	%r64, %f394;
+	cvt.rn.bf16.f32 %rs18, %r64;
+	mov.b32 	%r65, %f395;
+	cvt.rn.bf16.f32 %rs19, %r65;
+	mov.b32 	%r66, %f396;
+	cvt.rn.bf16.f32 %rs20, %r66;
+	mov.b32 	%r67, %f397;
+	cvt.rn.bf16.f32 %rs21, %r67;
+	mov.b32 	%r68, %f398;
+	cvt.rn.bf16.f32 %rs22, %r68;
+	mov.b32 	%r69, %f399;
+	cvt.rn.bf16.f32 %rs23, %r69;
+	mov.b32 	%r70, %f400;
+	cvt.rn.bf16.f32 %rs24, %r70;
+	mov.b32 	%r75, {%rs17, %rs18};
+	mov.b32 	%r76, {%rs19, %rs20};
+	mov.b32 	%r77, {%rs21, %rs22};
+	mov.b32 	%r78, {%rs23, %rs24};
+	@%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r75, %r76, %r77, %r78 };
+	.loc	1 45 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+
+}
+	// .globl	__nv_erff
+.visible .func  (.param .b32 func_retval0) __nv_erff(
+	.param .b32 __nv_erff_param_0
+)
+{
+	.reg .pred 	%p<4>;
+	.reg .b32 	%r<5>;
+	.reg .f32 	%f<49>;
+$L__func_begin1:
+
+	ld.param.f32 	%f14, [__nv_erff_param_0];
+	abs.ftz.f32 	%f1, %f14;
+	setp.ge.f32 	%p1, %f1, 0f3F8060FE;
+	mov.f32 	%f46, 0f3789CA3C;
+	mov.f32 	%f45, 0fB9F560B9;
+	mov.f32 	%f44, 0f3BAC840B;
+	mov.f32 	%f43, 0fBD0C8162;
+	mov.f32 	%f42, 0f3E1CF906;
+	mov.f32 	%f41, 0f3F6A937E;
+	mov.f32 	%f40, 0f3F20D842;
+	mov.f32 	%f47, %f1;
+	@%p1 bra 	$L__BB1_2;
+	mul.f32 	%f47, %f14, %f14;
+	mov.f32 	%f46, 0f38B1E96A;
+	mov.f32 	%f45, 0fBA574D20;
+	mov.f32 	%f44, 0f3BAAD5EA;
+	mov.f32 	%f43, 0fBCDC1BE7;
+	mov.f32 	%f42, 0f3DE718AF;
+	mov.f32 	%f41, 0fBEC093AC;
+	mov.f32 	%f40, 0f3E0375D3;
+$L__BB1_2:
+	setp.ltu.f32 	%p2, %f1, 0f3F8060FE;
+	fma.rn.ftz.f32 	%f29, %f46, %f47, %f45;
+	fma.rn.ftz.f32 	%f30, %f29, %f47, %f44;
+	fma.rn.ftz.f32 	%f31, %f30, %f47, %f43;
+	fma.rn.ftz.f32 	%f32, %f31, %f47, %f42;
+	fma.rn.ftz.f32 	%f33, %f32, %f47, %f41;
+	fma.rn.ftz.f32 	%f34, %f33, %f47, %f40;
+	neg.f32 	%f35, %f47;
+	selp.f32 	%f36, %f35, %f14, %p1;
+	fma.rn.ftz.f32 	%f48, %f34, %f36, %f36;
+	@%p2 bra 	$L__BB1_4;
+	ex2.approx.ftz.f32 	%f37, %f48;
+	mov.f32 	%f38, 0f3F800000;
+	sub.f32 	%f39, %f38, %f37;
+	mov.b32 	%r1, %f39;
+	mov.b32 	%r2, %f14;
+	and.b32  	%r3, %r2, -2147483648;
+	or.b32  	%r4, %r3, %r1;
+	mov.b32 	%f48, %r4;
+$L__BB1_4:
+	st.param.f32 	[func_retval0+0], %f48;
+	ret;
+$L__func_end1:
+
+}
+	.file	1 "/tmp/torchinductor_root/5j/c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 176
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 53
+.b8 106
+.b8 120
+.b8 97
+.b8 103
+.b8 117
+.b8 120
+.b8 104
+.b8 111
+.b8 51
+.b8 110
+.b8 104
+.b8 114
+.b8 108
+.b8 116
+.b8 53
+.b8 118
+.b8 99
+.b8 105
+.b8 110
+.b8 110
+.b8 122
+.b8 53
+.b8 102
+.b8 101
+.b8 118
+.b8 111
+.b8 100
+.b8 117
+.b8 109
+.b8 108
+.b8 112
+.b8 119
+.b8 110
+.b8 52
+.b8 119
+.b8 121
+.b8 98
+.b8 50
+.b8 118
+.b8 120
+.b8 51
+.b8 120
+.b8 114
+.b8 118
+.b8 101
+.b8 105
+.b8 99
+.b8 101
+.b8 114
+.b8 108
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 53
+.b8 106
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.cubin b/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..3a6f2b53568e3bb7360a3f0c2dc820c6f83697ad
Binary files /dev/null and b/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.cubin differ
diff --git a/.triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ttir b/.triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..e0db3cd413022c0143329a8eb6b0fb39287ccd28
--- /dev/null
+++ b/.triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ttir
@@ -0,0 +1,18 @@
+module {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c256_i32 = arith.constant 256 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c256_i32 : i32
+    %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<256xi32>
+    %4 = arith.addi %3, %2 : tensor<256xi32>
+    %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %6 = tt.addptr %5, %4 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %8 = arith.extf %7 : tensor<256xbf16> to tensor<256xf32>
+    %9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %10 = tt.addptr %9, %4 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
+    tt.return
+  }
+}
diff --git a/.triton/dump/a69784da01a97187168f22847465505f/triton_.llir b/.triton/dump/a69784da01a97187168f22847465505f/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..3b86f3776d3b626db1731054927f55d804770a98
--- /dev/null
+++ b/.triton/dump/a69784da01a97187168f22847465505f/triton_.llir
@@ -0,0 +1,324 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+@global_smem = external addrspace(3) global [0 x i8]
+@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1
+
+define void @triton__0d1d2d3d4d5d6d7de8de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8) local_unnamed_addr !dbg !7 {
+  %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
+  %11 = and i32 %10, 31, !dbg !10
+  %12 = lshr i32 %10, 5, !dbg !10
+  %13 = and i32 %12, 1, !dbg !10
+  %urem = and i32 %10, 63, !dbg !10
+  %14 = shl nuw nsw i32 %urem, 2, !dbg !10
+  %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11
+  %16 = shl i32 %15, 8, !dbg !12
+  %17 = or i32 %16, %14, !dbg !13
+  %18 = sext i32 %17 to i64, !dbg !14
+  %19 = getelementptr float, ptr addrspace(1) %1, i64 %18, !dbg !14
+  %20 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %19, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15
+  %21 = extractvalue { i32, i32, i32, i32 } %20, 0, !dbg !15
+  %22 = extractvalue { i32, i32, i32, i32 } %20, 1, !dbg !15
+  %23 = extractvalue { i32, i32, i32, i32 } %20, 2, !dbg !15
+  %24 = extractvalue { i32, i32, i32, i32 } %20, 3, !dbg !15
+  %25 = bitcast i32 %23 to float, !dbg !15
+  %26 = bitcast i32 %24 to float, !dbg !15
+  %27 = getelementptr i16, ptr addrspace(1) %2, i64 %18, !dbg !16
+  %28 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %27, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17
+  %29 = extractvalue { i32, i32 } %28, 0, !dbg !17
+  %30 = extractvalue { i32, i32 } %28, 1, !dbg !17
+  %31 = trunc i32 %29 to i16, !dbg !17
+  %extelt.offset = lshr i32 %29, 16, !dbg !17
+  %32 = trunc i32 %extelt.offset to i16, !dbg !17
+  %33 = trunc i32 %30 to i16, !dbg !17
+  %extelt.offset1 = lshr i32 %30, 16, !dbg !17
+  %34 = trunc i32 %extelt.offset1 to i16, !dbg !17
+  %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18
+  %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18
+  %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #6, !dbg !18
+  %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #6, !dbg !18
+  %39 = getelementptr i16, ptr addrspace(1) %3, i64 %18, !dbg !19
+  %40 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %39, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20
+  %41 = extractvalue { i32, i32 } %40, 0, !dbg !20
+  %42 = extractvalue { i32, i32 } %40, 1, !dbg !20
+  %43 = trunc i32 %41 to i16, !dbg !20
+  %extelt.offset2 = lshr i32 %41, 16, !dbg !20
+  %44 = trunc i32 %extelt.offset2 to i16, !dbg !20
+  %45 = trunc i32 %42 to i16, !dbg !20
+  %extelt.offset3 = lshr i32 %42, 16, !dbg !20
+  %46 = trunc i32 %extelt.offset3 to i16, !dbg !20
+  %47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21
+  %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21
+  %49 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #6, !dbg !21
+  %50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #6, !dbg !21
+  %51 = zext nneg i32 %14 to i64, !dbg !22
+  %52 = getelementptr float, ptr addrspace(1) %4, i64 %51, !dbg !22
+  %53 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %52, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23
+  %54 = fadd float %37, %25, !dbg !24
+  %55 = fadd float %38, %26, !dbg !24
+  %56 = insertelement <2 x i32> poison, i32 %21, i64 0, !dbg !15
+  %57 = insertelement <2 x i32> %56, i32 %22, i64 1, !dbg !15
+  %58 = bitcast <2 x i32> %57 to <2 x float>, !dbg !15
+  %59 = insertelement <2 x float> poison, float %35, i64 0, !dbg !24
+  %60 = insertelement <2 x float> %59, float %36, i64 1, !dbg !24
+  %61 = fadd <2 x float> %60, %58, !dbg !24
+  %62 = insertelement <2 x float> poison, float %47, i64 0, !dbg !25
+  %63 = insertelement <2 x float> %62, float %48, i64 1, !dbg !25
+  %64 = fadd <2 x float> %61, %63, !dbg !25
+  %65 = fadd float %54, %49, !dbg !25
+  %66 = fadd float %55, %50, !dbg !25
+  %67 = extractelement <2 x float> %64, i64 0, !dbg !26
+  %68 = extractelement <2 x float> %64, i64 1, !dbg !26
+  %69 = fadd float %67, %68, !dbg !26
+  %70 = fadd float %69, %65, !dbg !26
+  %71 = fadd float %70, %66, !dbg !26
+  %72 = bitcast float %71 to i32, !dbg !32
+  %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 16, i32 31), !dbg !32
+  %74 = bitcast i32 %73 to float, !dbg !32
+  %75 = fadd float %71, %74, !dbg !26
+  %76 = bitcast float %75 to i32, !dbg !32
+  %77 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %76, i32 8, i32 31), !dbg !32
+  %78 = bitcast i32 %77 to float, !dbg !32
+  %79 = fadd float %75, %78, !dbg !26
+  %80 = bitcast float %79 to i32, !dbg !32
+  %81 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %80, i32 4, i32 31), !dbg !32
+  %82 = bitcast i32 %81 to float, !dbg !32
+  %83 = fadd float %79, %82, !dbg !26
+  %84 = bitcast float %83 to i32, !dbg !32
+  %85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 2, i32 31), !dbg !32
+  %86 = bitcast i32 %85 to float, !dbg !32
+  %87 = fadd float %83, %86, !dbg !26
+  %88 = bitcast float %87 to i32, !dbg !32
+  %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 1, i32 31), !dbg !32
+  %90 = bitcast i32 %89 to float, !dbg !32
+  %91 = fadd float %87, %90, !dbg !26
+  %92 = icmp eq i32 %11, 0, !dbg !32
+  %93 = zext nneg i32 %13 to i64, !dbg !32
+  %94 = getelementptr float, ptr addrspace(3) @global_smem, i64 %93, !dbg !32
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %94, float %91, i1 %92) #6, !dbg !32
+  tail call void @llvm.nvvm.barrier0(), !dbg !32
+  %95 = icmp slt i32 %10, 2, !dbg !32
+  %96 = sext i32 %10 to i64, !dbg !32
+  %97 = getelementptr float, ptr addrspace(3) @global_smem, i64 %96, !dbg !32
+  %98 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %97, i1 %95) #6, !dbg !32
+  %99 = bitcast float %98 to i32, !dbg !32
+  %100 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %99, i32 1, i32 31), !dbg !32
+  %101 = bitcast i32 %100 to float, !dbg !32
+  %102 = fadd float %98, %101, !dbg !26
+  %103 = and i32 %10, 1, !dbg !32
+  %104 = icmp eq i32 %103, 0, !dbg !32
+  %105 = and i1 %95, %104, !dbg !32
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %97, float %102, i1 %105) #6, !dbg !32
+  tail call void @llvm.nvvm.barrier0(), !dbg !32
+  %106 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32
+  %107 = fadd float %106, 0.000000e+00, !dbg !34
+  %108 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %107, float 2.560000e+02) #6, !dbg !38
+  %109 = fsub float %67, %108, !dbg !39
+  %110 = fsub float %68, %108, !dbg !39
+  %111 = fsub float %65, %108, !dbg !39
+  %112 = fsub float %66, %108, !dbg !39
+  %113 = fmul float %109, %109, !dbg !40
+  %114 = fmul float %110, %110, !dbg !40
+  %115 = fmul float %111, %111, !dbg !40
+  %116 = fmul float %112, %112, !dbg !40
+  tail call void @llvm.nvvm.barrier0(), !dbg !41
+  %117 = fadd float %113, %114, !dbg !43
+  %118 = fadd float %115, %117, !dbg !43
+  %119 = fadd float %116, %118, !dbg !43
+  %120 = bitcast float %119 to i32, !dbg !41
+  %121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 16, i32 31), !dbg !41
+  %122 = bitcast i32 %121 to float, !dbg !41
+  %123 = fadd float %119, %122, !dbg !43
+  %124 = bitcast float %123 to i32, !dbg !41
+  %125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 8, i32 31), !dbg !41
+  %126 = bitcast i32 %125 to float, !dbg !41
+  %127 = fadd float %123, %126, !dbg !43
+  %128 = bitcast float %127 to i32, !dbg !41
+  %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 4, i32 31), !dbg !41
+  %130 = bitcast i32 %129 to float, !dbg !41
+  %131 = fadd float %127, %130, !dbg !43
+  %132 = bitcast float %131 to i32, !dbg !41
+  %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 2, i32 31), !dbg !41
+  %134 = bitcast i32 %133 to float, !dbg !41
+  %135 = fadd float %131, %134, !dbg !43
+  %136 = bitcast float %135 to i32, !dbg !41
+  %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 1, i32 31), !dbg !41
+  %138 = bitcast i32 %137 to float, !dbg !41
+  %139 = fadd float %135, %138, !dbg !43
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %94, float %139, i1 %92) #6, !dbg !41
+  tail call void @llvm.nvvm.barrier0(), !dbg !41
+  %140 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %97, i1 %95) #6, !dbg !41
+  %141 = bitcast float %140 to i32, !dbg !41
+  %142 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %141, i32 1, i32 31), !dbg !41
+  %143 = bitcast i32 %142 to float, !dbg !41
+  %144 = fadd float %140, %143, !dbg !43
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %97, float %144, i1 %105) #6, !dbg !41
+  tail call void @llvm.nvvm.barrier0(), !dbg !41
+  %145 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !41
+  %146 = fadd float %145, 0.000000e+00, !dbg !46
+  %147 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %146, float 2.560000e+02) #6, !dbg !48
+  %148 = fadd float %147, 0x3EE4F8B580000000, !dbg !49
+  %149 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !50
+  %.not.i = icmp eq i32 %149, 0, !dbg !50
+  br i1 %.not.i, label %152, label %150, !dbg !50
+
+150:                                              ; preds = %9
+  %151 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %148), !dbg !50
+  br label %__nv_rsqrtf.exit, !dbg !50
+
+152:                                              ; preds = %9
+  %153 = tail call float @llvm.nvvm.rsqrt.approx.f(float %148), !dbg !50
+  br label %__nv_rsqrtf.exit, !dbg !50
+
+__nv_rsqrtf.exit:                                 ; preds = %150, %152
+  %.0.i = phi float [ %151, %150 ], [ %153, %152 ], !dbg !50
+  %154 = extractvalue { i32, i32, i32, i32 } %53, 3, !dbg !23
+  %155 = bitcast i32 %154 to float, !dbg !23
+  %156 = extractvalue { i32, i32, i32, i32 } %53, 2, !dbg !23
+  %157 = bitcast i32 %156 to float, !dbg !23
+  %158 = extractvalue { i32, i32, i32, i32 } %53, 1, !dbg !23
+  %159 = bitcast i32 %158 to float, !dbg !23
+  %160 = extractvalue { i32, i32, i32, i32 } %53, 0, !dbg !23
+  %161 = bitcast i32 %160 to float, !dbg !23
+  %162 = fmul float %109, %.0.i, !dbg !51
+  %163 = fmul float %110, %.0.i, !dbg !51
+  %164 = fmul float %111, %.0.i, !dbg !51
+  %165 = fmul float %112, %.0.i, !dbg !51
+  %166 = fmul float %162, %161, !dbg !52
+  %167 = fmul float %163, %159, !dbg !52
+  %168 = fmul float %164, %157, !dbg !52
+  %169 = fmul float %165, %155, !dbg !52
+  tail call void @llvm.nvvm.barrier0(), !dbg !53
+  %170 = sext i32 %15 to i64, !dbg !54
+  %171 = getelementptr float, ptr addrspace(1) %0, i64 %170, !dbg !54
+  %172 = icmp eq i32 %urem, 0, !dbg !55
+  %173 = bitcast float %.0.i to i32, !dbg !55
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %173, ptr addrspace(1) %171, i1 %172) #6, !dbg !55
+  %174 = getelementptr i16, ptr addrspace(1) %6, i64 %18, !dbg !56
+  %175 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %166) #6, !dbg !57
+  %176 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %167) #6, !dbg !57
+  %177 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %168) #6, !dbg !57
+  %178 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %169) #6, !dbg !57
+  %179 = insertelement <2 x i16> undef, i16 %175, i64 0, !dbg !57
+  %180 = insertelement <2 x i16> %179, i16 %176, i64 1, !dbg !57
+  %181 = bitcast <2 x i16> %180 to i32, !dbg !57
+  %182 = insertelement <2 x i16> undef, i16 %177, i64 0, !dbg !57
+  %183 = insertelement <2 x i16> %182, i16 %178, i64 1, !dbg !57
+  %184 = bitcast <2 x i16> %183 to i32, !dbg !57
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %181, i32 %184, ptr addrspace(1) %174, i1 true) #6, !dbg !57
+  %185 = getelementptr float, ptr addrspace(1) %5, i64 %170, !dbg !58
+  %186 = bitcast float %108 to i32, !dbg !59
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %186, ptr addrspace(1) %185, i1 %172) #6, !dbg !59
+  ret void, !dbg !60
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+; Function Attrs: alwaysinline nounwind
+define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 {
+  %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6
+  %.not = icmp eq i32 %1, 0
+  br i1 %.not, label %4, label %2
+
+2:                                                ; preds = %0
+  %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x)
+  br label %6
+
+4:                                                ; preds = %0
+  %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x)
+  br label %6
+
+6:                                                ; preds = %4, %2
+  %.0 = phi float [ %3, %2 ], [ %5, %4 ]
+  ret float %.0
+}
+
+declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare float @llvm.nvvm.rsqrt.approx.f(float) #5
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #6 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!2}
+!nvvm.annotations = !{!4, !5, !5, !4}
+!llvm.ident = !{!6}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!3 = !DIFile(filename: "cgyrkrvxykbeetcyfsjqxf2ni3kynf3x4qqckt4p2fyz7wetdsd2.py", directory: "/tmp/torchinductor_root/gy")
+!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8de, !"kernel", i32 1}
+!5 = !{ptr @triton__0d1d2d3d4d5d6d7de8de, !"maxntidx", i32 64}
+!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8de", linkageName: "triton__0d1d2d3d4d5d6d7de8de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
+!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
+!9 = !{}
+!10 = !DILocation(line: 26, column: 26, scope: !7)
+!11 = !DILocation(line: 23, column: 28, scope: !7)
+!12 = !DILocation(line: 30, column: 40, scope: !7)
+!13 = !DILocation(line: 30, column: 36, scope: !7)
+!14 = !DILocation(line: 30, column: 30, scope: !7)
+!15 = !DILocation(line: 30, column: 46, scope: !7)
+!16 = !DILocation(line: 31, column: 30, scope: !7)
+!17 = !DILocation(line: 31, column: 46, scope: !7)
+!18 = !DILocation(line: 31, column: 67, scope: !7)
+!19 = !DILocation(line: 32, column: 30, scope: !7)
+!20 = !DILocation(line: 32, column: 46, scope: !7)
+!21 = !DILocation(line: 32, column: 67, scope: !7)
+!22 = !DILocation(line: 33, column: 31, scope: !7)
+!23 = !DILocation(line: 33, column: 36, scope: !7)
+!24 = !DILocation(line: 35, column: 18, scope: !7)
+!25 = !DILocation(line: 37, column: 18, scope: !7)
+!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30)
+!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0)
+!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!29 = distinct !DILexicalBlockFile(scope: !7, file: !28, discriminator: 0)
+!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31)
+!31 = !DILocation(line: 42, column: 59, scope: !27)
+!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33)
+!33 = !DILocation(line: 42, column: 59, scope: !29)
+!34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37)
+!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0)
+!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor")
+!37 = !DILocation(line: 42, column: 45, scope: !35)
+!38 = !DILocation(line: 45, column: 20, scope: !7)
+!39 = !DILocation(line: 46, column: 19, scope: !7)
+!40 = !DILocation(line: 47, column: 20, scope: !7)
+!41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42)
+!42 = !DILocation(line: 50, column: 59, scope: !29)
+!43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44)
+!44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45)
+!45 = !DILocation(line: 50, column: 59, scope: !27)
+!46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47)
+!47 = !DILocation(line: 50, column: 45, scope: !35)
+!48 = !DILocation(line: 52, column: 20, scope: !7)
+!49 = !DILocation(line: 54, column: 20, scope: !7)
+!50 = !DILocation(line: 55, column: 26, scope: !7)
+!51 = !DILocation(line: 57, column: 20, scope: !7)
+!52 = !DILocation(line: 58, column: 20, scope: !7)
+!53 = !DILocation(line: 60, column: 4, scope: !7)
+!54 = !DILocation(line: 61, column: 28, scope: !7)
+!55 = !DILocation(line: 61, column: 40, scope: !7)
+!56 = !DILocation(line: 62, column: 25, scope: !7)
+!57 = !DILocation(line: 62, column: 48, scope: !7)
+!58 = !DILocation(line: 63, column: 25, scope: !7)
+!59 = !DILocation(line: 63, column: 37, scope: !7)
+!60 = !DILocation(line: 63, column: 4, scope: !7)
diff --git a/.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.cubin b/.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..c0b5a6b628109108f94833fc9e53ba889d8411e2
Binary files /dev/null and b/.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.cubin differ
diff --git a/.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ttir b/.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..59013b5d7b57f0541e999a8da0ba77fe925af884
--- /dev/null
+++ b/.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ttir
@@ -0,0 +1,66 @@
+module {
+  tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c256_i32 = arith.constant 256 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %cst_1 = arith.constant 2.560000e+02 : f32
+    %cst_2 = arith.constant 9.99999974E-6 : f32
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
+    %cst_4 = arith.constant dense<256> : tensor<256xi32>
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
+    %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
+    %3 = arith.muli %0, %c256_i32 : i32
+    %4 = tt.splat %3 : (i32) -> tensor<256xi32>
+    %5 = arith.addi %1, %4 : tensor<256xi32>
+    %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
+    %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
+    %13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    %15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
+    %16 = arith.addf %8, %12 : tensor<256xf32>
+    %17 = arith.select %2, %16, %cst_3 : tensor<256xi1>, tensor<256xf32>
+    %18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32):
+      %42 = arith.addf %arg8, %arg9 : f32
+      tt.reduce.return %42 : f32
+    }) : (tensor<256xf32>) -> f32
+    %19 = arith.addf %18, %cst_0 : f32
+    %20 = arith.divf %19, %cst_1 : f32
+    %21 = tt.splat %20 : (f32) -> tensor<1xf32>
+    %22 = tt.splat %20 : (f32) -> tensor<256xf32>
+    %23 = arith.subf %16, %22 : tensor<256xf32>
+    %24 = arith.mulf %23, %23 : tensor<256xf32>
+    %25 = arith.select %2, %24, %cst_3 : tensor<256xi1>, tensor<256xf32>
+    %26 = "tt.reduce"(%25) <{axis = 0 : i32}> ({
+    ^bb0(%arg8: f32, %arg9: f32):
+      %42 = arith.addf %arg8, %arg9 : f32
+      tt.reduce.return %42 : f32
+    }) : (tensor<256xf32>) -> f32
+    %27 = arith.addf %26, %cst_0 : f32
+    %28 = arith.divf %27, %cst_1 : f32
+    %29 = arith.addf %28, %cst_2 : f32
+    %30 = tt.extern_elementwise %29 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+    %31 = tt.splat %30 : (f32) -> tensor<1xf32>
+    %32 = tt.splat %30 : (f32) -> tensor<256xf32>
+    %33 = arith.mulf %23, %32 : tensor<256xf32>
+    %34 = arith.mulf %33, %15 : tensor<256xf32>
+    gpu.barrier
+    %35 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
+    %36 = tt.splat %35 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
+    tt.store %36, %31 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
+    %37 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %38 = tt.addptr %37, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %39 = arith.truncf %34 : tensor<256xf32> to tensor<256xbf16>
+    tt.store %38, %39, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
+    %40 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
+    %41 = tt.splat %40 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
+    tt.store %41, %21 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
+    tt.return
+  }
+}
diff --git a/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.llir b/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..9af59ae363c94ccbf7b9542aad4bb488c0edacf5
--- /dev/null
+++ b/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.llir
@@ -0,0 +1,54 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %5 = shl i32 %4, 1, !dbg !8
+  %6 = and i32 %5, 510, !dbg !8
+  %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
+  %8 = shl i32 %7, 9, !dbg !10
+  %9 = or i32 %8, %6, !dbg !11
+  %10 = sext i32 %9 to i64, !dbg !12
+  %11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12
+  %12 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13
+  %13 = extractvalue { i32, i32 } %12, 0, !dbg !13
+  %14 = extractvalue { i32, i32 } %12, 1, !dbg !13
+  %15 = bitcast i32 %13 to float, !dbg !13
+  %16 = bitcast i32 %14 to float, !dbg !13
+  %17 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !14
+  %18 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %15) #1, !dbg !15
+  %19 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %16) #1, !dbg !15
+  %20 = insertelement <2 x i16> undef, i16 %18, i64 0, !dbg !15
+  %21 = insertelement <2 x i16> %20, i16 %19, i64 1, !dbg !15
+  %22 = bitcast <2 x i16> %21 to i32, !dbg !15
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %22, ptr addrspace(1) %17, i1 true) #1, !dbg !15
+  ret void, !dbg !16
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "cch6kzmgbnoxqjgy3okxqs7sy2uz27atdhc4lkuwz5ajinexdurx.py", directory: "/tmp/torchinductor_root/ch")
+!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
+!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 36, scope: !5)
+!9 = !DILocation(line: 20, column: 28, scope: !5)
+!10 = !DILocation(line: 20, column: 33, scope: !5)
+!11 = !DILocation(line: 21, column: 23, scope: !5)
+!12 = !DILocation(line: 24, column: 30, scope: !5)
+!13 = !DILocation(line: 24, column: 35, scope: !5)
+!14 = !DILocation(line: 26, column: 25, scope: !5)
+!15 = !DILocation(line: 26, column: 36, scope: !5)
+!16 = !DILocation(line: 26, column: 4, scope: !5)
diff --git a/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.ttir b/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..de8871dcaba54cb7d1b997f9009d0eeab54d091f
--- /dev/null
+++ b/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.ttir
@@ -0,0 +1,18 @@
+module {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c512_i32 = arith.constant 512 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32>
+    %4 = arith.addi %3, %2 : tensor<512xi32>
+    %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
+    %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xf32>
+    %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
+    %9 = tt.addptr %8, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
+    %10 = arith.truncf %7 : tensor<512xf32> to tensor<512xbf16>
+    tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16>
+    tt.return
+  }
+}
diff --git a/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.cubin b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..17a7eaafdd465c58e427fe6c4644bd6ead9b6bef
Binary files /dev/null and b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.cubin differ
diff --git a/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ptx b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..12dbd4fd575fd1b129e5c7ce0c0047cbd528298c
--- /dev/null
+++ b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ptx
@@ -0,0 +1,301 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2de
+
+.visible .entry triton__0d1d2de(
+	.param .u64 triton__0d1d2de_param_0,
+	.param .u64 triton__0d1d2de_param_1,
+	.param .u32 triton__0d1d2de_param_2
+)
+.maxntid 128, 1, 1
+{
+	.reg .pred 	%p<3>;
+	.reg .b16 	%rs<5>;
+	.reg .b32 	%r<17>;
+	.reg .b64 	%rd<7>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd3, [triton__0d1d2de_param_0];
+	ld.param.u64 	%rd4, [triton__0d1d2de_param_1];
+$L__tmp0:
+	.loc	1 21 36
+	mov.u32 	%r12, %tid.x;
+	shl.b32 	%r13, %r12, 2;
+	and.b32  	%r14, %r13, 508;
+	.loc	1 20 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 20 33
+	shl.b32 	%r15, %r1, 9;
+	.loc	1 21 23
+	or.b32  	%r16, %r15, %r14;
+	.loc	1 24 30
+	mul.wide.s32 	%rd5, %r16, 2;
+	add.s64 	%rd1, %rd3, %rd5;
+	mov.pred 	%p1, -1;
+	.loc	1 24 35
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	@%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ];
+	cvt.u16.u32 	%rs1, %r2;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
+	cvt.u16.u32 	%rs3, %r3;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
+	.loc	1 24 44
+	cvt.f32.bf16 %r8, %rs1;
+	cvt.f32.bf16 %r9, %rs2;
+	cvt.f32.bf16 %r10, %rs3;
+	cvt.f32.bf16 %r11, %rs4;
+	.loc	1 26 25
+	mul.wide.s32 	%rd6, %r16, 4;
+	add.s64 	%rd2, %rd4, %rd6;
+	.loc	1 26 36
+	@%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r8, %r9, %r10, %r11 };
+	.loc	1 26 4
+	ret;
+$L__tmp1:
+$L__func_end0:
+
+}
+	.file	1 "/tmp/torchinductor_root/zl/czl6nmwasl7k4ic55xowihczcooh3mhu5v6ls6w2xzqqocdc2da7.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 176
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 122
+.b8 108
+.b8 54
+.b8 110
+.b8 109
+.b8 119
+.b8 97
+.b8 115
+.b8 108
+.b8 55
+.b8 107
+.b8 52
+.b8 105
+.b8 99
+.b8 53
+.b8 53
+.b8 120
+.b8 111
+.b8 119
+.b8 105
+.b8 104
+.b8 99
+.b8 122
+.b8 99
+.b8 111
+.b8 111
+.b8 104
+.b8 51
+.b8 109
+.b8 104
+.b8 117
+.b8 53
+.b8 118
+.b8 54
+.b8 108
+.b8 115
+.b8 54
+.b8 119
+.b8 50
+.b8 120
+.b8 122
+.b8 113
+.b8 113
+.b8 111
+.b8 99
+.b8 100
+.b8 99
+.b8 50
+.b8 100
+.b8 97
+.b8 55
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 122
+.b8 108
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 180
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttgir b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..cdc64a19be9d4816eddc0cdbad88d21c48b0d233
--- /dev/null
+++ b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttgir
@@ -0,0 +1,19 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c512_i32 = arith.constant 512 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
+    %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
+    %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked>
+    %8 = arith.extf %7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked>
+    %9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
+    %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
+    tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked>
+    tt.return
+  }
+}
diff --git a/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttir b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..50602f214a3439fdc49ae67bf8b80948941d601b
--- /dev/null
+++ b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttir
@@ -0,0 +1,18 @@
+module {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c512_i32 = arith.constant 512 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32>
+    %4 = arith.addi %3, %2 : tensor<512xi32>
+    %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
+    %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16>
+    %8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32>
+    %9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
+    %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
+    tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32>
+    tt.return
+  }
+}
diff --git a/.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ptx b/.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..0f8c7f7e766e7ca52b6681a2cdb04d9a9ca281d1
--- /dev/null
+++ b/.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ptx
@@ -0,0 +1,837 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2d3d4d5d6d7d8d9d10de11de
+.extern .shared .align 1 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
+
+.visible .entry triton__0d1d2d3d4d5d6d7d8d9d10de11de(
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9,
+	.param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_10,
+	.param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_11
+)
+.maxntid 64, 1, 1
+{
+	.reg .pred 	%p<35>;
+	.reg .b16 	%rs<17>;
+	.reg .b32 	%r<111>;
+	.reg .f32 	%f<94>;
+	.reg .b64 	%rd<25>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd11, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0];
+	ld.param.u64 	%rd12, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1];
+$L__tmp0:
+	.loc	1 26 26
+	mov.u32 	%r78, %tid.x;
+	and.b32  	%r79, %r78, 31;
+	ld.param.u64 	%rd13, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2];
+	ld.param.u64 	%rd14, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3];
+	ld.param.u64 	%rd15, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4];
+	and.b32  	%r80, %r78, 63;
+	ld.param.u64 	%rd16, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5];
+	shl.b32 	%r81, %r80, 2;
+	ld.param.u64 	%rd17, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6];
+	ld.param.u64 	%rd18, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7];
+	.loc	1 23 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 30 40
+	shl.b32 	%r82, %r1, 8;
+	ld.param.u64 	%rd19, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8];
+	.loc	1 30 36
+	or.b32  	%r83, %r82, %r81;
+	ld.param.u64 	%rd20, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9];
+	.loc	1 30 30
+	mul.wide.s32 	%rd21, %r83, 4;
+	add.s64 	%rd1, %rd12, %rd21;
+	mov.b32 	%r6, 0;
+	mov.pred 	%p1, -1;
+	.loc	1 30 46
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
+	@!%p1 mov.u32 %r2, %r6;
+	@!%p1 mov.u32 %r3, %r6;
+	@!%p1 mov.u32 %r4, %r6;
+	@!%p1 mov.u32 %r5, %r6;
+	mov.b32 	%f1, %r2;
+	mov.b32 	%f2, %r3;
+	mov.b32 	%f3, %r4;
+	mov.b32 	%f4, %r5;
+	.loc	1 31 30
+	mul.wide.s32 	%rd22, %r83, 2;
+	add.s64 	%rd2, %rd13, %rd22;
+	.loc	1 31 46
+	mov.u32 %r10, 0x0;
+	mov.u32 %r11, 0x0;
+	@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
+	@!%p1 mov.u32 %r10, %r6;
+	@!%p1 mov.u32 %r11, %r6;
+	cvt.u16.u32 	%rs1, %r10;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
+	cvt.u16.u32 	%rs3, %r11;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
+	.loc	1 31 67
+	cvt.f32.bf16 %r14, %rs1;
+	mov.b32 	%f5, %r14;
+	cvt.f32.bf16 %r15, %rs2;
+	mov.b32 	%f6, %r15;
+	cvt.f32.bf16 %r16, %rs3;
+	mov.b32 	%f7, %r16;
+	cvt.f32.bf16 %r17, %rs4;
+	mov.b32 	%f8, %r17;
+	.loc	1 32 30
+	add.s64 	%rd3, %rd14, %rd22;
+	.loc	1 32 46
+	mov.u32 %r18, 0x0;
+	mov.u32 %r19, 0x0;
+	@%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
+	@!%p1 mov.u32 %r18, %r6;
+	@!%p1 mov.u32 %r19, %r6;
+	cvt.u16.u32 	%rs5, %r18;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
+	cvt.u16.u32 	%rs7, %r19;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
+	.loc	1 32 67
+	cvt.f32.bf16 %r22, %rs5;
+	mov.b32 	%f9, %r22;
+	cvt.f32.bf16 %r23, %rs6;
+	mov.b32 	%f10, %r23;
+	cvt.f32.bf16 %r24, %rs7;
+	mov.b32 	%f11, %r24;
+	cvt.f32.bf16 %r25, %rs8;
+	mov.b32 	%f12, %r25;
+	.loc	1 33 30
+	add.s64 	%rd4, %rd15, %rd22;
+	.loc	1 33 46
+	mov.u32 %r26, 0x0;
+	mov.u32 %r27, 0x0;
+	@%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ];
+	@!%p1 mov.u32 %r26, %r6;
+	@!%p1 mov.u32 %r27, %r6;
+	cvt.u16.u32 	%rs9, %r26;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; }
+	cvt.u16.u32 	%rs11, %r27;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; }
+	.loc	1 33 67
+	cvt.f32.bf16 %r30, %rs9;
+	mov.b32 	%f13, %r30;
+	cvt.f32.bf16 %r31, %rs10;
+	mov.b32 	%f14, %r31;
+	cvt.f32.bf16 %r32, %rs11;
+	mov.b32 	%f15, %r32;
+	cvt.f32.bf16 %r33, %rs12;
+	mov.b32 	%f16, %r33;
+	.loc	1 34 31
+	add.s64 	%rd5, %rd16, %rd22;
+	.loc	1 34 47
+	mov.u32 %r34, 0x0;
+	mov.u32 %r35, 0x0;
+	@%p1 ld.global.v2.b32 { %r34, %r35 }, [ %rd5 + 0 ];
+	@!%p1 mov.u32 %r34, %r6;
+	@!%p1 mov.u32 %r35, %r6;
+	cvt.u16.u32 	%rs13, %r34;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r34; }
+	cvt.u16.u32 	%rs15, %r35;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r35; }
+	.loc	1 34 68
+	cvt.f32.bf16 %r38, %rs13;
+	mov.b32 	%f17, %r38;
+	cvt.f32.bf16 %r39, %rs14;
+	mov.b32 	%f18, %r39;
+	cvt.f32.bf16 %r40, %rs15;
+	mov.b32 	%f19, %r40;
+	cvt.f32.bf16 %r41, %rs16;
+	mov.b32 	%f20, %r41;
+	.loc	1 35 31
+	mul.wide.u32 	%rd23, %r81, 4;
+	add.s64 	%rd6, %rd17, %rd23;
+	.loc	1 35 36
+	mov.u32 %r42, 0x0;
+	mov.u32 %r43, 0x0;
+	mov.u32 %r44, 0x0;
+	mov.u32 %r45, 0x0;
+	@%p1 ld.global.L1::evict_last.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd6 + 0 ];
+	@!%p1 mov.u32 %r42, %r6;
+	@!%p1 mov.u32 %r43, %r6;
+	@!%p1 mov.u32 %r44, %r6;
+	@!%p1 mov.u32 %r45, %r6;
+	.loc	1 37 18
+	add.f32 	%f21, %f5, %f1;
+	add.f32 	%f22, %f6, %f2;
+	add.f32 	%f23, %f7, %f3;
+	.loc	1 39 18
+	add.f32 	%f24, %f21, %f9;
+	add.f32 	%f25, %f22, %f10;
+	add.f32 	%f26, %f23, %f11;
+	.loc	1 41 18
+	add.f32 	%f27, %f25, %f14;
+	add.f32 	%f28, %f26, %f15;
+	.loc	1 43 19
+	add.f32 	%f29, %f27, %f18;
+	add.f32 	%f30, %f28, %f19;
+	.loc	1 41 18
+	add.f32 	%f31, %f24, %f13;
+	add.f32 	%f32, %f8, %f4;
+	.loc	1 43 19
+	add.f32 	%f33, %f32, %f12;
+	add.f32 	%f34, %f31, %f17;
+$L__tmp1:
+	.loc	2 233 15
+	add.f32 	%f35, %f34, %f29;
+	add.f32 	%f36, %f33, %f16;
+	add.f32 	%f37, %f35, %f30;
+	add.f32 	%f38, %f36, %f20;
+	mov.b32 	%r71, %f38;
+	add.f32 	%f39, %f37, %f38;
+$L__tmp2:
+	.loc	2 243 36
+	mov.b32 	%r84, %f39;
+	shfl.sync.bfly.b32	%r85, %r84, 16, 31, -1;
+	mov.b32 	%f40, %r85;
+$L__tmp3:
+	.loc	2 233 15
+	add.f32 	%f41, %f39, %f40;
+$L__tmp4:
+	.loc	2 243 36
+	mov.b32 	%r86, %f41;
+	shfl.sync.bfly.b32	%r87, %r86, 8, 31, -1;
+	mov.b32 	%f42, %r87;
+$L__tmp5:
+	.loc	2 233 15
+	add.f32 	%f43, %f41, %f42;
+$L__tmp6:
+	.loc	2 243 36
+	mov.b32 	%r88, %f43;
+	shfl.sync.bfly.b32	%r89, %r88, 4, 31, -1;
+	mov.b32 	%f44, %r89;
+$L__tmp7:
+	.loc	2 233 15
+	add.f32 	%f45, %f43, %f44;
+$L__tmp8:
+	.loc	2 243 36
+	mov.b32 	%r90, %f45;
+	shfl.sync.bfly.b32	%r91, %r90, 2, 31, -1;
+	mov.b32 	%f46, %r91;
+$L__tmp9:
+	.loc	2 233 15
+	add.f32 	%f47, %f45, %f46;
+$L__tmp10:
+	.loc	2 243 36
+	mov.b32 	%r92, %f47;
+	shfl.sync.bfly.b32	%r93, %r92, 1, 31, -1;
+	mov.b32 	%f48, %r93;
+$L__tmp11:
+	.loc	2 233 15
+	add.f32 	%f49, %f47, %f48;
+$L__tmp12:
+	.loc	2 243 36
+	setp.eq.s32 	%p23, %r79, 0;
+	shr.u32 	%r94, %r78, 3;
+	and.b32  	%r95, %r94, 4;
+	mov.u32 	%r96, global_smem;
+	add.s32 	%r50, %r96, %r95;
+	mov.b32 	%r51, %f49;
+	@%p23 st.shared.b32 [ %r50 + 0 ], %r51;
+	bar.sync 	0;
+	setp.lt.s32 	%p24, %r78, 2;
+	shl.b32 	%r97, %r78, 2;
+	add.s32 	%r53, %r96, %r97;
+	@%p24 ld.shared.b32 %r52, [ %r53 + 0 ];
+	mov.b32 	%f50, %r52;
+	shfl.sync.bfly.b32	%r98, %r52, 1, 31, -1;
+	mov.b32 	%f51, %r98;
+$L__tmp13:
+	.loc	2 233 15
+	add.f32 	%f52, %f50, %f51;
+$L__tmp14:
+	.loc	2 243 36
+	and.b32  	%r99, %r78, 1;
+	setp.eq.b32 	%p33, %r99, 1;
+	not.pred 	%p34, %p33;
+	and.pred  	%p25, %p24, %p34;
+	mov.b32 	%r55, %f52;
+	@%p25 st.shared.b32 [ %r53 + 0 ], %r55;
+	bar.sync 	0;
+	ld.shared.f32 	%f53, [global_smem];
+$L__tmp15:
+	.loc	3 8 15
+	add.f32 	%f54, %f53, 0f00000000;
+$L__tmp16:
+	.loc	1 51 20
+	mov.b32 	%r57, %f54;
+	mov.b32 	%r58, 1132462080;
+	div.full.f32 %r77, %r57, %r58;
+	mov.b32 	%f55, %r77;
+	.loc	1 52 20
+	sub.f32 	%f56, %f34, %f55;
+	sub.f32 	%f57, %f29, %f55;
+	sub.f32 	%f58, %f30, %f55;
+	sub.f32 	%f59, %f38, %f55;
+	.loc	1 53 20
+	mul.f32 	%f60, %f57, %f57;
+$L__tmp17:
+	.loc	2 243 36
+	bar.sync 	0;
+$L__tmp18:
+	.loc	2 233 15
+	fma.rn.f32 	%f61, %f56, %f56, %f60;
+	fma.rn.f32 	%f62, %f58, %f58, %f61;
+	fma.rn.f32 	%f63, %f59, %f59, %f62;
+$L__tmp19:
+	.loc	2 243 36
+	mov.b32 	%r100, %f63;
+	shfl.sync.bfly.b32	%r101, %r100, 16, 31, -1;
+	mov.b32 	%f64, %r101;
+$L__tmp20:
+	.loc	2 233 15
+	add.f32 	%f65, %f63, %f64;
+$L__tmp21:
+	.loc	2 243 36
+	mov.b32 	%r102, %f65;
+	shfl.sync.bfly.b32	%r103, %r102, 8, 31, -1;
+	mov.b32 	%f66, %r103;
+$L__tmp22:
+	.loc	2 233 15
+	add.f32 	%f67, %f65, %f66;
+$L__tmp23:
+	.loc	2 243 36
+	mov.b32 	%r104, %f67;
+	shfl.sync.bfly.b32	%r105, %r104, 4, 31, -1;
+	mov.b32 	%f68, %r105;
+$L__tmp24:
+	.loc	2 233 15
+	add.f32 	%f69, %f67, %f68;
+$L__tmp25:
+	.loc	2 243 36
+	mov.b32 	%r106, %f69;
+	shfl.sync.bfly.b32	%r107, %r106, 2, 31, -1;
+	mov.b32 	%f70, %r107;
+$L__tmp26:
+	.loc	2 233 15
+	add.f32 	%f71, %f69, %f70;
+$L__tmp27:
+	.loc	2 243 36
+	mov.b32 	%r108, %f71;
+	shfl.sync.bfly.b32	%r109, %r108, 1, 31, -1;
+	mov.b32 	%f72, %r109;
+$L__tmp28:
+	.loc	2 233 15
+	add.f32 	%f73, %f71, %f72;
+$L__tmp29:
+	.loc	2 243 36
+	mov.b32 	%r60, %f73;
+	@%p23 st.shared.b32 [ %r50 + 0 ], %r60;
+	bar.sync 	0;
+	@%p24 ld.shared.b32 %r61, [ %r53 + 0 ];
+	mov.b32 	%f74, %r61;
+	shfl.sync.bfly.b32	%r110, %r61, 1, 31, -1;
+	mov.b32 	%f75, %r110;
+$L__tmp30:
+	.loc	2 233 15
+	add.f32 	%f76, %f74, %f75;
+$L__tmp31:
+	.loc	2 243 36
+	mov.b32 	%r64, %f76;
+	@%p25 st.shared.b32 [ %r53 + 0 ], %r64;
+	bar.sync 	0;
+	ld.shared.f32 	%f77, [global_smem];
+$L__tmp32:
+	.loc	3 8 15
+	add.f32 	%f78, %f77, 0f00000000;
+$L__tmp33:
+	.loc	1 58 20
+	mov.b32 	%r66, %f78;
+	div.full.f32 %r65, %r66, %r58;
+	mov.b32 	%f79, %r65;
+	.loc	1 60 20
+	add.f32 	%f80, %f79, 0f3727C5AC;
+	.loc	1 61 26
+	rsqrt.approx.ftz.f32 	%f81, %f80;
+	.loc	1 35 36
+	mov.b32 	%f82, %r45;
+	mov.b32 	%f83, %r44;
+	mov.b32 	%f84, %r43;
+	mov.b32 	%f85, %r42;
+	.loc	1 63 20
+	mul.f32 	%f86, %f56, %f81;
+	mul.f32 	%f87, %f57, %f81;
+	mul.f32 	%f88, %f58, %f81;
+	mul.f32 	%f89, %f59, %f81;
+	.loc	1 64 20
+	mul.f32 	%f90, %f86, %f85;
+	mul.f32 	%f91, %f87, %f84;
+	mul.f32 	%f92, %f88, %f83;
+	mul.f32 	%f93, %f89, %f82;
+	.loc	1 65 25
+	add.s64 	%rd7, %rd18, %rd21;
+	.loc	1 65 48
+	mov.b32 	%r68, %f34;
+	mov.b32 	%r69, %f29;
+	mov.b32 	%r70, %f30;
+	@%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r68, %r69, %r70, %r71 };
+	.loc	1 66 4
+	bar.sync 	0;
+	.loc	1 67 28
+	mul.wide.s32 	%rd24, %r1, 4;
+	add.s64 	%rd8, %rd11, %rd24;
+	.loc	1 67 40
+	setp.eq.s32 	%p30, %r80, 0;
+	mov.b32 	%r72, %f81;
+	@%p30 st.global.b32 [ %rd8 + 0 ], { %r72 };
+	.loc	1 68 25
+	add.s64 	%rd9, %rd20, %rd21;
+	.loc	1 68 48
+	mov.b32 	%r73, %f90;
+	mov.b32 	%r74, %f91;
+	mov.b32 	%r75, %f92;
+	mov.b32 	%r76, %f93;
+	@%p1 st.global.v4.b32 [ %rd9 + 0 ], { %r73, %r74, %r75, %r76 };
+	.loc	1 69 25
+	add.s64 	%rd10, %rd19, %rd24;
+	.loc	1 69 37
+	@%p30 st.global.b32 [ %rd10 + 0 ], { %r77 };
+	.loc	1 69 4
+	ret;
+$L__tmp34:
+$L__func_end0:
+
+}
+	// .globl	__nv_rsqrtf
+.visible .func  (.param .b32 func_retval0) __nv_rsqrtf(
+	.param .b32 __nv_rsqrtf_param_0
+)
+{
+	.reg .f32 	%f<3>;
+$L__func_begin1:
+
+	ld.param.f32 	%f1, [__nv_rsqrtf_param_0];
+	rsqrt.approx.ftz.f32 	%f2, %f1;
+	st.param.f32 	[func_retval0+0], %f2;
+	ret;
+$L__func_end1:
+
+}
+	.file	1 "/tmp/torchinductor_root/sf/csf6zcjhrl2sjepofkaaj2rwyu4vq322pi5ukcu37oynjbso2i4g.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+	.file	3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 419
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 115
+.b8 102
+.b8 54
+.b8 122
+.b8 99
+.b8 106
+.b8 104
+.b8 114
+.b8 108
+.b8 50
+.b8 115
+.b8 106
+.b8 101
+.b8 112
+.b8 111
+.b8 102
+.b8 107
+.b8 97
+.b8 97
+.b8 106
+.b8 50
+.b8 114
+.b8 119
+.b8 121
+.b8 117
+.b8 52
+.b8 118
+.b8 113
+.b8 51
+.b8 50
+.b8 50
+.b8 112
+.b8 105
+.b8 53
+.b8 117
+.b8 107
+.b8 99
+.b8 117
+.b8 51
+.b8 55
+.b8 111
+.b8 121
+.b8 110
+.b8 106
+.b8 98
+.b8 115
+.b8 111
+.b8 50
+.b8 105
+.b8 52
+.b8 103
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 115
+.b8 102
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 57
+.b8 100
+.b8 49
+.b8 48
+.b8 100
+.b8 101
+.b8 49
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 57
+.b8 100
+.b8 49
+.b8 48
+.b8 100
+.b8 101
+.b8 49
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 48
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp15
+.b8 2
+.b8 48
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp15
+.b64 $L__tmp16
+.b8 3
+.b8 48
+.b8 45
+.b8 5
+.b32 125
+.b64 $L__tmp17
+.b64 $L__tmp32
+.b8 2
+.b8 56
+.b8 59
+.b8 4
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 56
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp32
+.b64 $L__tmp33
+.b8 3
+.b8 56
+.b8 45
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 423
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 57
+.b8 100
+.b8 49
+.b8 48
+.b8 100
+.b8 101
+.b8 49
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 423
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ttir b/.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..5c9eab337c61b54065c360c79eaf977b31c29c0c
--- /dev/null
+++ b/.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ttir
@@ -0,0 +1,83 @@
+module {
+  tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg9: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c256_i32 = arith.constant 256 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16>
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %cst_1 = arith.constant 2.560000e+02 : f32
+    %cst_2 = arith.constant 9.99999974E-6 : f32
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32>
+    %cst_4 = arith.constant dense<256> : tensor<256xi32>
+    %0 = tt.get_program_id x : i32
+    %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
+    %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32>
+    %3 = arith.muli %0, %c256_i32 : i32
+    %4 = tt.splat %3 : (i32) -> tensor<256xi32>
+    %5 = arith.addi %1, %4 : tensor<256xi32>
+    %6 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
+    %9 = tt.splat %arg2 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32>
+    %13 = tt.splat %arg3 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32>
+    %17 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %19 = tt.load %18, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %20 = arith.extf %19 : tensor<256xbf16> to tensor<256xf32>
+    %21 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
+    %22 = tt.addptr %21, %5 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
+    %23 = tt.load %22, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
+    %24 = arith.extf %23 : tensor<256xbf16> to tensor<256xf32>
+    %25 = tt.splat %arg6 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %26 = tt.addptr %25, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    %27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
+    %28 = arith.addf %8, %12 : tensor<256xf32>
+    %29 = arith.addf %28, %16 : tensor<256xf32>
+    %30 = arith.addf %29, %20 : tensor<256xf32>
+    %31 = arith.addf %30, %24 : tensor<256xf32>
+    %32 = arith.select %2, %31, %cst_3 : tensor<256xi1>, tensor<256xf32>
+    %33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({
+    ^bb0(%arg12: f32, %arg13: f32):
+      %58 = arith.addf %arg12, %arg13 : f32
+      tt.reduce.return %58 : f32
+    }) : (tensor<256xf32>) -> f32
+    %34 = arith.addf %33, %cst_0 : f32
+    %35 = arith.divf %34, %cst_1 : f32
+    %36 = tt.splat %35 : (f32) -> tensor<1xf32>
+    %37 = tt.splat %35 : (f32) -> tensor<256xf32>
+    %38 = arith.subf %31, %37 : tensor<256xf32>
+    %39 = arith.mulf %38, %38 : tensor<256xf32>
+    %40 = arith.select %2, %39, %cst_3 : tensor<256xi1>, tensor<256xf32>
+    %41 = "tt.reduce"(%40) <{axis = 0 : i32}> ({
+    ^bb0(%arg12: f32, %arg13: f32):
+      %58 = arith.addf %arg12, %arg13 : f32
+      tt.reduce.return %58 : f32
+    }) : (tensor<256xf32>) -> f32
+    %42 = arith.addf %41, %cst_0 : f32
+    %43 = arith.divf %42, %cst_1 : f32
+    %44 = arith.addf %43, %cst_2 : f32
+    %45 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
+    %46 = tt.splat %45 : (f32) -> tensor<1xf32>
+    %47 = tt.splat %45 : (f32) -> tensor<256xf32>
+    %48 = arith.mulf %38, %47 : tensor<256xf32>
+    %49 = arith.mulf %48, %27 : tensor<256xf32>
+    %50 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %51 = tt.addptr %50, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    tt.store %51, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
+    gpu.barrier
+    %52 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
+    %53 = tt.splat %52 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
+    tt.store %53, %46 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
+    %54 = tt.splat %arg9 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
+    %55 = tt.addptr %54, %5 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
+    tt.store %55, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
+    %56 = tt.addptr %arg8, %0 : !tt.ptr<f32, 1>, i32
+    %57 = tt.splat %56 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
+    tt.store %57, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
+    tt.return
+  }
+}
diff --git a/.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ptx b/.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..cbf8d8cb9dddc86aa2a767755bada6649eb8114a
--- /dev/null
+++ b/.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ptx
@@ -0,0 +1,843 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2d3d4d5d6d7d8d9d10de11de
+.extern .shared .align 1 .b8 global_smem[];
+.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
+
+.visible .entry triton__0d1d2d3d4d5d6d7d8d9d10de11de(
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9,
+	.param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_10,
+	.param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_11
+)
+.maxntid 64, 1, 1
+{
+	.reg .pred 	%p<35>;
+	.reg .b16 	%rs<21>;
+	.reg .b32 	%r<115>;
+	.reg .f32 	%f<94>;
+	.reg .b64 	%rd<25>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd11, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0];
+	ld.param.u64 	%rd12, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1];
+$L__tmp0:
+	.loc	1 26 26
+	mov.u32 	%r80, %tid.x;
+	and.b32  	%r81, %r80, 31;
+	ld.param.u64 	%rd13, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2];
+	ld.param.u64 	%rd14, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3];
+	ld.param.u64 	%rd15, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4];
+	and.b32  	%r82, %r80, 63;
+	ld.param.u64 	%rd16, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5];
+	shl.b32 	%r83, %r82, 2;
+	ld.param.u64 	%rd17, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6];
+	ld.param.u64 	%rd18, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7];
+	.loc	1 23 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 30 40
+	shl.b32 	%r84, %r1, 8;
+	ld.param.u64 	%rd19, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8];
+	.loc	1 30 36
+	or.b32  	%r85, %r84, %r83;
+	ld.param.u64 	%rd20, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9];
+	.loc	1 30 30
+	mul.wide.s32 	%rd21, %r85, 4;
+	add.s64 	%rd1, %rd12, %rd21;
+	mov.b32 	%r6, 0;
+	mov.pred 	%p1, -1;
+	.loc	1 30 46
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
+	@!%p1 mov.u32 %r2, %r6;
+	@!%p1 mov.u32 %r3, %r6;
+	@!%p1 mov.u32 %r4, %r6;
+	@!%p1 mov.u32 %r5, %r6;
+	mov.b32 	%f1, %r2;
+	mov.b32 	%f2, %r3;
+	mov.b32 	%f3, %r4;
+	mov.b32 	%f4, %r5;
+	.loc	1 31 30
+	mul.wide.s32 	%rd22, %r85, 2;
+	add.s64 	%rd2, %rd13, %rd22;
+	.loc	1 31 46
+	mov.u32 %r10, 0x0;
+	mov.u32 %r11, 0x0;
+	@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
+	@!%p1 mov.u32 %r10, %r6;
+	@!%p1 mov.u32 %r11, %r6;
+	cvt.u16.u32 	%rs1, %r10;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
+	cvt.u16.u32 	%rs3, %r11;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
+	.loc	1 31 67
+	cvt.f32.bf16 %r14, %rs1;
+	mov.b32 	%f5, %r14;
+	cvt.f32.bf16 %r15, %rs2;
+	mov.b32 	%f6, %r15;
+	cvt.f32.bf16 %r16, %rs3;
+	mov.b32 	%f7, %r16;
+	cvt.f32.bf16 %r17, %rs4;
+	mov.b32 	%f8, %r17;
+	.loc	1 32 30
+	add.s64 	%rd3, %rd14, %rd22;
+	.loc	1 32 46
+	mov.u32 %r18, 0x0;
+	mov.u32 %r19, 0x0;
+	@%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
+	@!%p1 mov.u32 %r18, %r6;
+	@!%p1 mov.u32 %r19, %r6;
+	cvt.u16.u32 	%rs5, %r18;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
+	cvt.u16.u32 	%rs7, %r19;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
+	.loc	1 32 67
+	cvt.f32.bf16 %r22, %rs5;
+	mov.b32 	%f9, %r22;
+	cvt.f32.bf16 %r23, %rs6;
+	mov.b32 	%f10, %r23;
+	cvt.f32.bf16 %r24, %rs7;
+	mov.b32 	%f11, %r24;
+	cvt.f32.bf16 %r25, %rs8;
+	mov.b32 	%f12, %r25;
+	.loc	1 33 30
+	add.s64 	%rd4, %rd15, %rd22;
+	.loc	1 33 46
+	mov.u32 %r26, 0x0;
+	mov.u32 %r27, 0x0;
+	@%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ];
+	@!%p1 mov.u32 %r26, %r6;
+	@!%p1 mov.u32 %r27, %r6;
+	cvt.u16.u32 	%rs9, %r26;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; }
+	cvt.u16.u32 	%rs11, %r27;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; }
+	.loc	1 33 67
+	cvt.f32.bf16 %r30, %rs9;
+	mov.b32 	%f13, %r30;
+	cvt.f32.bf16 %r31, %rs10;
+	mov.b32 	%f14, %r31;
+	cvt.f32.bf16 %r32, %rs11;
+	mov.b32 	%f15, %r32;
+	cvt.f32.bf16 %r33, %rs12;
+	mov.b32 	%f16, %r33;
+	.loc	1 34 31
+	add.s64 	%rd5, %rd16, %rd22;
+	.loc	1 34 47
+	mov.u32 %r34, 0x0;
+	mov.u32 %r35, 0x0;
+	@%p1 ld.global.v2.b32 { %r34, %r35 }, [ %rd5 + 0 ];
+	@!%p1 mov.u32 %r34, %r6;
+	@!%p1 mov.u32 %r35, %r6;
+	cvt.u16.u32 	%rs13, %r34;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r34; }
+	cvt.u16.u32 	%rs15, %r35;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r35; }
+	.loc	1 34 68
+	cvt.f32.bf16 %r38, %rs13;
+	mov.b32 	%f17, %r38;
+	cvt.f32.bf16 %r39, %rs14;
+	mov.b32 	%f18, %r39;
+	cvt.f32.bf16 %r40, %rs15;
+	mov.b32 	%f19, %r40;
+	cvt.f32.bf16 %r41, %rs16;
+	mov.b32 	%f20, %r41;
+	.loc	1 35 31
+	mul.wide.u32 	%rd23, %r83, 4;
+	add.s64 	%rd6, %rd17, %rd23;
+	.loc	1 35 36
+	mov.u32 %r42, 0x0;
+	mov.u32 %r43, 0x0;
+	mov.u32 %r44, 0x0;
+	mov.u32 %r45, 0x0;
+	@%p1 ld.global.L1::evict_last.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd6 + 0 ];
+	@!%p1 mov.u32 %r42, %r6;
+	@!%p1 mov.u32 %r43, %r6;
+	@!%p1 mov.u32 %r44, %r6;
+	@!%p1 mov.u32 %r45, %r6;
+	.loc	1 37 18
+	add.f32 	%f21, %f5, %f1;
+	add.f32 	%f22, %f6, %f2;
+	add.f32 	%f23, %f7, %f3;
+	.loc	1 39 18
+	add.f32 	%f24, %f21, %f9;
+	add.f32 	%f25, %f22, %f10;
+	add.f32 	%f26, %f23, %f11;
+	.loc	1 41 18
+	add.f32 	%f27, %f25, %f14;
+	add.f32 	%f28, %f26, %f15;
+	.loc	1 43 19
+	add.f32 	%f29, %f27, %f18;
+	add.f32 	%f30, %f28, %f19;
+	.loc	1 41 18
+	add.f32 	%f31, %f24, %f13;
+	add.f32 	%f32, %f8, %f4;
+	.loc	1 43 19
+	add.f32 	%f33, %f32, %f12;
+	add.f32 	%f34, %f31, %f17;
+$L__tmp1:
+	.loc	2 233 15
+	add.f32 	%f35, %f34, %f29;
+	add.f32 	%f36, %f33, %f16;
+	add.f32 	%f37, %f35, %f30;
+	add.f32 	%f38, %f36, %f20;
+	mov.b32 	%r71, %f38;
+	add.f32 	%f39, %f37, %f38;
+$L__tmp2:
+	.loc	2 243 36
+	mov.b32 	%r86, %f39;
+	shfl.sync.bfly.b32	%r87, %r86, 16, 31, -1;
+	mov.b32 	%f40, %r87;
+$L__tmp3:
+	.loc	2 233 15
+	add.f32 	%f41, %f39, %f40;
+$L__tmp4:
+	.loc	2 243 36
+	mov.b32 	%r88, %f41;
+	shfl.sync.bfly.b32	%r89, %r88, 8, 31, -1;
+	mov.b32 	%f42, %r89;
+$L__tmp5:
+	.loc	2 233 15
+	add.f32 	%f43, %f41, %f42;
+$L__tmp6:
+	.loc	2 243 36
+	mov.b32 	%r90, %f43;
+	shfl.sync.bfly.b32	%r91, %r90, 4, 31, -1;
+	mov.b32 	%f44, %r91;
+$L__tmp7:
+	.loc	2 233 15
+	add.f32 	%f45, %f43, %f44;
+$L__tmp8:
+	.loc	2 243 36
+	mov.b32 	%r92, %f45;
+	shfl.sync.bfly.b32	%r93, %r92, 2, 31, -1;
+	mov.b32 	%f46, %r93;
+$L__tmp9:
+	.loc	2 233 15
+	add.f32 	%f47, %f45, %f46;
+$L__tmp10:
+	.loc	2 243 36
+	mov.b32 	%r94, %f47;
+	shfl.sync.bfly.b32	%r95, %r94, 1, 31, -1;
+	mov.b32 	%f48, %r95;
+$L__tmp11:
+	.loc	2 233 15
+	add.f32 	%f49, %f47, %f48;
+$L__tmp12:
+	.loc	2 243 36
+	setp.eq.s32 	%p23, %r81, 0;
+	shr.u32 	%r96, %r80, 3;
+	and.b32  	%r97, %r96, 4;
+	mov.u32 	%r98, global_smem;
+	add.s32 	%r50, %r98, %r97;
+	mov.b32 	%r51, %f49;
+	@%p23 st.shared.b32 [ %r50 + 0 ], %r51;
+	bar.sync 	0;
+	setp.lt.s32 	%p24, %r80, 2;
+	shl.b32 	%r99, %r80, 2;
+	add.s32 	%r53, %r98, %r99;
+	@%p24 ld.shared.b32 %r52, [ %r53 + 0 ];
+	mov.b32 	%f50, %r52;
+	shfl.sync.bfly.b32	%r100, %r52, 1, 31, -1;
+	mov.b32 	%f51, %r100;
+$L__tmp13:
+	.loc	2 233 15
+	add.f32 	%f52, %f50, %f51;
+$L__tmp14:
+	.loc	2 243 36
+	and.b32  	%r101, %r80, 1;
+	setp.eq.b32 	%p33, %r101, 1;
+	not.pred 	%p34, %p33;
+	and.pred  	%p25, %p24, %p34;
+	mov.b32 	%r55, %f52;
+	@%p25 st.shared.b32 [ %r53 + 0 ], %r55;
+	bar.sync 	0;
+	ld.shared.f32 	%f53, [global_smem];
+$L__tmp15:
+	.loc	3 8 15
+	add.f32 	%f54, %f53, 0f00000000;
+$L__tmp16:
+	.loc	1 51 20
+	mov.b32 	%r57, %f54;
+	mov.b32 	%r58, 1132462080;
+	div.full.f32 %r79, %r57, %r58;
+	mov.b32 	%f55, %r79;
+	.loc	1 52 20
+	sub.f32 	%f56, %f34, %f55;
+	sub.f32 	%f57, %f29, %f55;
+	sub.f32 	%f58, %f30, %f55;
+	sub.f32 	%f59, %f38, %f55;
+	.loc	1 53 20
+	mul.f32 	%f60, %f57, %f57;
+$L__tmp17:
+	.loc	2 243 36
+	bar.sync 	0;
+$L__tmp18:
+	.loc	2 233 15
+	fma.rn.f32 	%f61, %f56, %f56, %f60;
+	fma.rn.f32 	%f62, %f58, %f58, %f61;
+	fma.rn.f32 	%f63, %f59, %f59, %f62;
+$L__tmp19:
+	.loc	2 243 36
+	mov.b32 	%r102, %f63;
+	shfl.sync.bfly.b32	%r103, %r102, 16, 31, -1;
+	mov.b32 	%f64, %r103;
+$L__tmp20:
+	.loc	2 233 15
+	add.f32 	%f65, %f63, %f64;
+$L__tmp21:
+	.loc	2 243 36
+	mov.b32 	%r104, %f65;
+	shfl.sync.bfly.b32	%r105, %r104, 8, 31, -1;
+	mov.b32 	%f66, %r105;
+$L__tmp22:
+	.loc	2 233 15
+	add.f32 	%f67, %f65, %f66;
+$L__tmp23:
+	.loc	2 243 36
+	mov.b32 	%r106, %f67;
+	shfl.sync.bfly.b32	%r107, %r106, 4, 31, -1;
+	mov.b32 	%f68, %r107;
+$L__tmp24:
+	.loc	2 233 15
+	add.f32 	%f69, %f67, %f68;
+$L__tmp25:
+	.loc	2 243 36
+	mov.b32 	%r108, %f69;
+	shfl.sync.bfly.b32	%r109, %r108, 2, 31, -1;
+	mov.b32 	%f70, %r109;
+$L__tmp26:
+	.loc	2 233 15
+	add.f32 	%f71, %f69, %f70;
+$L__tmp27:
+	.loc	2 243 36
+	mov.b32 	%r110, %f71;
+	shfl.sync.bfly.b32	%r111, %r110, 1, 31, -1;
+	mov.b32 	%f72, %r111;
+$L__tmp28:
+	.loc	2 233 15
+	add.f32 	%f73, %f71, %f72;
+$L__tmp29:
+	.loc	2 243 36
+	mov.b32 	%r60, %f73;
+	@%p23 st.shared.b32 [ %r50 + 0 ], %r60;
+	bar.sync 	0;
+	@%p24 ld.shared.b32 %r61, [ %r53 + 0 ];
+	mov.b32 	%f74, %r61;
+	shfl.sync.bfly.b32	%r112, %r61, 1, 31, -1;
+	mov.b32 	%f75, %r112;
+$L__tmp30:
+	.loc	2 233 15
+	add.f32 	%f76, %f74, %f75;
+$L__tmp31:
+	.loc	2 243 36
+	mov.b32 	%r64, %f76;
+	@%p25 st.shared.b32 [ %r53 + 0 ], %r64;
+	bar.sync 	0;
+	ld.shared.f32 	%f77, [global_smem];
+$L__tmp32:
+	.loc	3 8 15
+	add.f32 	%f78, %f77, 0f00000000;
+$L__tmp33:
+	.loc	1 58 20
+	mov.b32 	%r66, %f78;
+	div.full.f32 %r65, %r66, %r58;
+	mov.b32 	%f79, %r65;
+	.loc	1 60 20
+	add.f32 	%f80, %f79, 0f3727C5AC;
+	.loc	1 61 26
+	rsqrt.approx.ftz.f32 	%f81, %f80;
+	.loc	1 35 36
+	mov.b32 	%f82, %r45;
+	mov.b32 	%f83, %r44;
+	mov.b32 	%f84, %r43;
+	mov.b32 	%f85, %r42;
+	.loc	1 63 20
+	mul.f32 	%f86, %f56, %f81;
+	mul.f32 	%f87, %f57, %f81;
+	mul.f32 	%f88, %f58, %f81;
+	mul.f32 	%f89, %f59, %f81;
+	.loc	1 64 20
+	mul.f32 	%f90, %f86, %f85;
+	mul.f32 	%f91, %f87, %f84;
+	mul.f32 	%f92, %f88, %f83;
+	mul.f32 	%f93, %f89, %f82;
+	.loc	1 66 25
+	add.s64 	%rd7, %rd18, %rd21;
+	.loc	1 66 48
+	mov.b32 	%r68, %f34;
+	mov.b32 	%r69, %f29;
+	mov.b32 	%r70, %f30;
+	@%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r68, %r69, %r70, %r71 };
+	.loc	1 67 4
+	bar.sync 	0;
+	.loc	1 68 28
+	mul.wide.s32 	%rd24, %r1, 4;
+	add.s64 	%rd8, %rd11, %rd24;
+	.loc	1 68 40
+	setp.eq.s32 	%p30, %r82, 0;
+	mov.b32 	%r72, %f81;
+	@%p30 st.global.b32 [ %rd8 + 0 ], { %r72 };
+	.loc	1 69 25
+	add.s64 	%rd9, %rd20, %rd22;
+	.loc	1 69 48
+	mov.b32 	%r73, %f90;
+	cvt.rn.bf16.f32 %rs17, %r73;
+	mov.b32 	%r74, %f91;
+	cvt.rn.bf16.f32 %rs18, %r74;
+	mov.b32 	%r75, %f92;
+	cvt.rn.bf16.f32 %rs19, %r75;
+	mov.b32 	%r76, %f93;
+	cvt.rn.bf16.f32 %rs20, %r76;
+	mov.b32 	%r113, {%rs17, %rs18};
+	mov.b32 	%r114, {%rs19, %rs20};
+	@%p1 st.global.v2.b32 [ %rd9 + 0 ], { %r113, %r114 };
+	.loc	1 70 25
+	add.s64 	%rd10, %rd19, %rd24;
+	.loc	1 70 37
+	@%p30 st.global.b32 [ %rd10 + 0 ], { %r79 };
+	.loc	1 70 4
+	ret;
+$L__tmp34:
+$L__func_end0:
+
+}
+	// .globl	__nv_rsqrtf
+.visible .func  (.param .b32 func_retval0) __nv_rsqrtf(
+	.param .b32 __nv_rsqrtf_param_0
+)
+{
+	.reg .f32 	%f<3>;
+$L__func_begin1:
+
+	ld.param.f32 	%f1, [__nv_rsqrtf_param_0];
+	rsqrt.approx.ftz.f32 	%f2, %f1;
+	st.param.f32 	[func_retval0+0], %f2;
+	ret;
+$L__func_end1:
+
+}
+	.file	1 "/tmp/torchinductor_root/il/cilofmivtj4aqoxmz3r7fz7sc3blcxfzk3utwsuayln6lpg5jwtv.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+	.file	3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 419
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 105
+.b8 108
+.b8 111
+.b8 102
+.b8 109
+.b8 105
+.b8 118
+.b8 116
+.b8 106
+.b8 52
+.b8 97
+.b8 113
+.b8 111
+.b8 120
+.b8 109
+.b8 122
+.b8 51
+.b8 114
+.b8 55
+.b8 102
+.b8 122
+.b8 55
+.b8 115
+.b8 99
+.b8 51
+.b8 98
+.b8 108
+.b8 99
+.b8 120
+.b8 102
+.b8 122
+.b8 107
+.b8 51
+.b8 117
+.b8 116
+.b8 119
+.b8 115
+.b8 117
+.b8 97
+.b8 121
+.b8 108
+.b8 110
+.b8 54
+.b8 108
+.b8 112
+.b8 103
+.b8 53
+.b8 106
+.b8 119
+.b8 116
+.b8 118
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 105
+.b8 108
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 57
+.b8 100
+.b8 49
+.b8 48
+.b8 100
+.b8 101
+.b8 49
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 57
+.b8 100
+.b8 49
+.b8 48
+.b8 100
+.b8 101
+.b8 49
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 48
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp15
+.b8 2
+.b8 48
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp15
+.b64 $L__tmp16
+.b8 3
+.b8 48
+.b8 45
+.b8 5
+.b32 125
+.b64 $L__tmp17
+.b64 $L__tmp32
+.b8 2
+.b8 56
+.b8 59
+.b8 4
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 56
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp32
+.b64 $L__tmp33
+.b8 3
+.b8 56
+.b8 45
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 423
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 57
+.b8 100
+.b8 49
+.b8 48
+.b8 100
+.b8 101
+.b8 49
+.b8 49
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 423
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.cubin b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..900de43634ac6db616b83bfd8a8ba7423cb29829
Binary files /dev/null and b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.cubin differ
diff --git a/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.llir b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..d906f452141808f21c61f8ec5e97b1449f424a0d
--- /dev/null
+++ b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.llir
@@ -0,0 +1,53 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 {
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %5 = shl i32 %4, 1, !dbg !8
+  %6 = and i32 %5, 510, !dbg !8
+  %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
+  %8 = shl i32 %7, 9, !dbg !10
+  %9 = or i32 %8, %6, !dbg !11
+  %10 = sext i32 %9 to i64, !dbg !12
+  %11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !12
+  %12 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13
+  %13 = trunc i32 %12 to i16, !dbg !13
+  %extelt.offset = lshr i32 %12, 16, !dbg !13
+  %14 = trunc i32 %extelt.offset to i16, !dbg !13
+  %15 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %13) #1, !dbg !14
+  %16 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %14) #1, !dbg !14
+  %17 = getelementptr float, ptr addrspace(1) %1, i64 %10, !dbg !15
+  %18 = bitcast float %15 to i32, !dbg !16
+  %19 = bitcast float %16 to i32, !dbg !16
+  tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %18, i32 %19, ptr addrspace(1) %17, i1 true) #1, !dbg !16
+  ret void, !dbg !17
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "cotbhet37v6mh5samql7uxre3hprpnbhuvim3fmrjpq5fgg6lwbi.py", directory: "/tmp/torchinductor_root/ot")
+!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256}
+!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 21, column: 36, scope: !5)
+!9 = !DILocation(line: 20, column: 28, scope: !5)
+!10 = !DILocation(line: 20, column: 33, scope: !5)
+!11 = !DILocation(line: 21, column: 23, scope: !5)
+!12 = !DILocation(line: 24, column: 30, scope: !5)
+!13 = !DILocation(line: 24, column: 35, scope: !5)
+!14 = !DILocation(line: 24, column: 44, scope: !5)
+!15 = !DILocation(line: 26, column: 25, scope: !5)
+!16 = !DILocation(line: 26, column: 36, scope: !5)
+!17 = !DILocation(line: 26, column: 4, scope: !5)
diff --git a/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ttgir b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..2844e7b30bcdd6d5fd4f8f7e5ee181d165252135
--- /dev/null
+++ b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ttgir
@@ -0,0 +1,19 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c512_i32 = arith.constant 512 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
+    %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
+    %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked>
+    %8 = arith.extf %7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked>
+    %9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
+    %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
+    tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked>
+    tt.return
+  }
+}
diff --git a/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ttir b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ttir
new file mode 100644
index 0000000000000000000000000000000000000000..50602f214a3439fdc49ae67bf8b80948941d601b
--- /dev/null
+++ b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ttir
@@ -0,0 +1,18 @@
+module {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %c512_i32 = arith.constant 512 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32>
+    %4 = arith.addi %3, %2 : tensor<512xi32>
+    %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
+    %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
+    %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16>
+    %8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32>
+    %9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
+    %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
+    tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32>
+    tt.return
+  }
+}
diff --git a/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.cubin b/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..c135c9853feed1f3fd4d2fb5fb6f260aacf379a4
Binary files /dev/null and b/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.cubin differ
diff --git a/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.ttgir b/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..514d70caf1f170c1efee99b3d26aa1c6201a0397
--- /dev/null
+++ b/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.ttgir
@@ -0,0 +1,21 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<12865792> : tensor<512xi32, #blocked>
+    %c512_i32 = arith.constant 512 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c512_i32 : i32
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
+    %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked>
+    %4 = arith.addi %3, %2 : tensor<512xi32, #blocked>
+    %5 = arith.cmpi slt, %4, %cst : tensor<512xi32, #blocked>
+    %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
+    %7 = tt.addptr %6, %4 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi32, #blocked>
+    %8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xf32, #blocked>
+    %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>, #blocked>
+    %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr<bf16, 1>, #blocked>, tensor<512xi32, #blocked>
+    %11 = arith.truncf %8 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked>
+    tt.store %10, %11, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16, #blocked>
+    tt.return
+  }
+}
diff --git a/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.cubin b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..113fae93e2c87d8ed33a036760d6e263f3d9979c
Binary files /dev/null and b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.cubin differ
diff --git a/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.llir b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.llir
new file mode 100644
index 0000000000000000000000000000000000000000..8772933a947f8310058944ad460f07f04c457ece
--- /dev/null
+++ b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.llir
@@ -0,0 +1,148 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+
+@global_smem = external addrspace(3) global [0 x i8]
+
+define void @triton__0d1d2de3de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3) local_unnamed_addr !dbg !5 {
+  %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
+  %6 = and i32 %5, 31, !dbg !8
+  %7 = lshr i32 %5, 5, !dbg !8
+  %8 = and i32 %7, 3, !dbg !8
+  %urem = and i32 %5, 127, !dbg !8
+  %9 = or i32 %urem, 384, !dbg !8
+  %10 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9
+  %11 = icmp slt i32 %10, 256, !dbg !10
+  %12 = icmp ult i32 %9, 480, !dbg !11
+  %13 = shl nuw nsw i32 %urem, 8, !dbg !12
+  %14 = or i32 %13, 32768, !dbg !12
+  %15 = or i32 %13, 65536, !dbg !12
+  %16 = shl nuw nsw i32 %9, 8, !dbg !12
+  %17 = add i32 %10, %13, !dbg !13
+  %18 = add i32 %14, %10, !dbg !13
+  %19 = add i32 %15, %10, !dbg !13
+  %20 = add i32 %10, %16, !dbg !13
+  %21 = sext i32 %17 to i64, !dbg !14
+  %22 = getelementptr float, ptr addrspace(1) %0, i64 %21, !dbg !14
+  %23 = sext i32 %18 to i64, !dbg !14
+  %24 = getelementptr float, ptr addrspace(1) %0, i64 %23, !dbg !14
+  %25 = sext i32 %19 to i64, !dbg !14
+  %26 = getelementptr float, ptr addrspace(1) %0, i64 %25, !dbg !14
+  %27 = sext i32 %20 to i64, !dbg !14
+  %28 = getelementptr float, ptr addrspace(1) %0, i64 %27, !dbg !14
+  %29 = and i1 %12, %11, !dbg !15
+  %30 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %22, i1 %11, i32 0, i1 %11) #3, !dbg !16
+  %31 = bitcast i32 %30 to float, !dbg !16
+  %32 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %24, i1 %11, i32 0, i1 %11) #3, !dbg !16
+  %33 = bitcast i32 %32 to float, !dbg !16
+  %34 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %26, i1 %11, i32 0, i1 %11) #3, !dbg !16
+  %35 = bitcast i32 %34 to float, !dbg !16
+  %36 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %28, i1 %29, i32 0, i1 %29) #3, !dbg !16
+  %37 = bitcast i32 %36 to float, !dbg !16
+  %38 = fadd float %31, 0.000000e+00, !dbg !17
+  %39 = fadd float %33, 0.000000e+00, !dbg !17
+  %40 = fadd float %35, 0.000000e+00, !dbg !17
+  %41 = fadd float %37, 0.000000e+00, !dbg !17
+  %42 = select i1 %29, float %41, float 0.000000e+00, !dbg !18
+  %43 = fadd float %38, %39, !dbg !19
+  %44 = fadd float %43, %40, !dbg !19
+  %45 = select i1 %11, float %44, float 0.000000e+00, !dbg !19
+  %46 = fadd float %45, %42, !dbg !19
+  %47 = bitcast float %46 to i32, !dbg !25
+  %48 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 16, i32 31), !dbg !25
+  %49 = bitcast i32 %48 to float, !dbg !25
+  %50 = fadd float %46, %49, !dbg !19
+  %51 = bitcast float %50 to i32, !dbg !25
+  %52 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %51, i32 8, i32 31), !dbg !25
+  %53 = bitcast i32 %52 to float, !dbg !25
+  %54 = fadd float %50, %53, !dbg !19
+  %55 = bitcast float %54 to i32, !dbg !25
+  %56 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 4, i32 31), !dbg !25
+  %57 = bitcast i32 %56 to float, !dbg !25
+  %58 = fadd float %54, %57, !dbg !19
+  %59 = bitcast float %58 to i32, !dbg !25
+  %60 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 2, i32 31), !dbg !25
+  %61 = bitcast i32 %60 to float, !dbg !25
+  %62 = fadd float %58, %61, !dbg !19
+  %63 = bitcast float %62 to i32, !dbg !25
+  %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 1, i32 31), !dbg !25
+  %65 = bitcast i32 %64 to float, !dbg !25
+  %66 = fadd float %62, %65, !dbg !19
+  %67 = icmp eq i32 %6, 0, !dbg !25
+  %68 = zext nneg i32 %8 to i64, !dbg !25
+  %69 = getelementptr float, ptr addrspace(3) @global_smem, i64 %68, !dbg !25
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %69, float %66, i1 %67) #3, !dbg !25
+  tail call void @llvm.nvvm.barrier0(), !dbg !25
+  %70 = icmp slt i32 %5, 4, !dbg !25
+  %71 = sext i32 %5 to i64, !dbg !25
+  %72 = getelementptr float, ptr addrspace(3) @global_smem, i64 %71, !dbg !25
+  %73 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %72, i1 %70) #3, !dbg !25
+  %74 = bitcast float %73 to i32, !dbg !25
+  %75 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %74, i32 2, i32 31), !dbg !25
+  %76 = bitcast i32 %75 to float, !dbg !25
+  %77 = fadd float %73, %76, !dbg !19
+  %78 = bitcast float %77 to i32, !dbg !25
+  %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 1, i32 31), !dbg !25
+  %80 = bitcast i32 %79 to float, !dbg !25
+  %81 = fadd float %77, %80, !dbg !19
+  %82 = and i32 %5, 3, !dbg !25
+  %83 = icmp eq i32 %82, 0, !dbg !25
+  %84 = and i1 %70, %83, !dbg !25
+  tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %72, float %81, i1 %84) #3, !dbg !25
+  tail call void @llvm.nvvm.barrier0(), !dbg !25
+  %85 = load i32, ptr addrspace(3) @global_smem, align 4, !dbg !25
+  %86 = sext i32 %10 to i64, !dbg !27
+  %87 = getelementptr float, ptr addrspace(1) %1, i64 %86, !dbg !27
+  %88 = icmp eq i32 %urem, 0, !dbg !28
+  %89 = and i1 %88, %11, !dbg !28
+  tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %85, ptr addrspace(1) %87, i1 %89) #3, !dbg !28
+  ret void, !dbg !29
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+
+; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
+declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
+attributes #2 = { convergent nocallback nounwind }
+attributes #3 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+!nvvm.annotations = !{!3, !4, !4, !3}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!2 = !DIFile(filename: "ccizx54ebt45pqvf7it3p5t23oudtaqbed2j3uakpossm65m4cax.py", directory: "/tmp/torchinductor_root/ci")
+!3 = !{ptr @triton__0d1d2de3de, !"kernel", i32 1}
+!4 = !{ptr @triton__0d1d2de3de, !"maxntidx", i32 128}
+!5 = distinct !DISubprogram(name: "triton__0d1d2de3de", linkageName: "triton__0d1d2de3de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
+!6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
+!7 = !{}
+!8 = !DILocation(line: 24, column: 33, scope: !5)
+!9 = !DILocation(line: 21, column: 28, scope: !5)
+!10 = !DILocation(line: 23, column: 21, scope: !5)
+!11 = !DILocation(line: 29, column: 25, scope: !5)
+!12 = !DILocation(line: 31, column: 44, scope: !5)
+!13 = !DILocation(line: 31, column: 40, scope: !5)
+!14 = !DILocation(line: 31, column: 34, scope: !5)
+!15 = !DILocation(line: 31, column: 58, scope: !5)
+!16 = !DILocation(line: 31, column: 50, scope: !5)
+!17 = !DILocation(line: 33, column: 23, scope: !5)
+!18 = !DILocation(line: 34, column: 46, scope: !5)
+!19 = !DILocation(line: 233, column: 15, scope: !20, inlinedAt: !23)
+!20 = distinct !DILexicalBlockFile(scope: !22, file: !21, discriminator: 0)
+!21 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
+!22 = distinct !DILexicalBlockFile(scope: !5, file: !21, discriminator: 0)
+!23 = !DILocation(line: 243, column: 36, scope: !20, inlinedAt: !24)
+!24 = !DILocation(line: 35, column: 25, scope: !20)
+!25 = !DILocation(line: 243, column: 36, scope: !22, inlinedAt: !26)
+!26 = !DILocation(line: 35, column: 25, scope: !22)
+!27 = !DILocation(line: 36, column: 25, scope: !5)
+!28 = !DILocation(line: 36, column: 36, scope: !5)
+!29 = !DILocation(line: 36, column: 4, scope: !5)
diff --git a/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.ptx b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..f9d3ed9a556bfcc716c4d81b7b966e9fb3b9bbc3
--- /dev/null
+++ b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.ptx
@@ -0,0 +1,496 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2de3de
+.extern .shared .align 1 .b8 global_smem[];
+
+.visible .entry triton__0d1d2de3de(
+	.param .u64 triton__0d1d2de3de_param_0,
+	.param .u64 triton__0d1d2de3de_param_1,
+	.param .u32 triton__0d1d2de3de_param_2,
+	.param .u32 triton__0d1d2de3de_param_3
+)
+.maxntid 128, 1, 1
+{
+	.reg .pred 	%p<16>;
+	.reg .b32 	%r<45>;
+	.reg .f32 	%f<29>;
+	.reg .b64 	%rd<13>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd6, [triton__0d1d2de3de_param_0];
+	ld.param.u64 	%rd7, [triton__0d1d2de3de_param_1];
+$L__tmp0:
+	.loc	1 24 33
+	mov.u32 	%r17, %tid.x;
+	and.b32  	%r18, %r17, 31;
+	and.b32  	%r19, %r17, 127;
+	or.b32  	%r20, %r19, 384;
+	.loc	1 21 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 23 21
+	setp.lt.s32 	%p1, %r1, 256;
+	.loc	1 29 25
+	setp.lt.u32 	%p13, %r20, 480;
+	.loc	1 31 44
+	shl.b32 	%r21, %r19, 8;
+	shl.b32 	%r22, %r20, 8;
+	.loc	1 31 40
+	add.s32 	%r23, %r1, %r21;
+	add.s32 	%r24, %r23, 32768;
+	add.s32 	%r25, %r23, 65536;
+	add.s32 	%r26, %r1, %r22;
+	.loc	1 31 34
+	mul.wide.s32 	%rd8, %r23, 4;
+	add.s64 	%rd1, %rd6, %rd8;
+	mul.wide.s32 	%rd9, %r24, 4;
+	add.s64 	%rd2, %rd6, %rd9;
+	mul.wide.s32 	%rd10, %r25, 4;
+	add.s64 	%rd3, %rd6, %rd10;
+	mul.wide.s32 	%rd11, %r26, 4;
+	add.s64 	%rd4, %rd6, %rd11;
+	.loc	1 31 58
+	and.pred  	%p7, %p13, %p1;
+	mov.b32 	%r3, 0;
+	.loc	1 31 50
+	mov.u32 %r2, 0x0;
+	@%p1 ld.global.L1::evict_first.b32 { %r2 }, [ %rd1 + 0 ];
+	@!%p1 mov.u32 %r2, %r3;
+	mov.b32 	%f1, %r2;
+	mov.u32 %r4, 0x0;
+	@%p1 ld.global.L1::evict_first.b32 { %r4 }, [ %rd2 + 0 ];
+	@!%p1 mov.u32 %r4, %r3;
+	mov.b32 	%f2, %r4;
+	mov.u32 %r6, 0x0;
+	@%p1 ld.global.L1::evict_first.b32 { %r6 }, [ %rd3 + 0 ];
+	@!%p1 mov.u32 %r6, %r3;
+	mov.b32 	%f3, %r6;
+	mov.u32 %r8, 0x0;
+	@%p7 ld.global.L1::evict_first.b32 { %r8 }, [ %rd4 + 0 ];
+	@!%p7 mov.u32 %r8, %r3;
+	mov.b32 	%f4, %r8;
+	.loc	1 33 23
+	add.f32 	%f5, %f1, 0f00000000;
+	add.f32 	%f6, %f2, 0f00000000;
+	add.f32 	%f7, %f3, 0f00000000;
+	add.f32 	%f8, %f4, 0f00000000;
+	.loc	1 34 46
+	selp.f32 	%f9, %f8, 0f00000000, %p7;
+$L__tmp1:
+	.loc	2 233 15
+	add.f32 	%f10, %f5, %f6;
+	add.f32 	%f11, %f10, %f7;
+	selp.f32 	%f12, %f11, 0f00000000, %p1;
+	add.f32 	%f13, %f12, %f9;
+$L__tmp2:
+	.loc	2 243 36
+	mov.b32 	%r27, %f13;
+	shfl.sync.bfly.b32	%r28, %r27, 16, 31, -1;
+	mov.b32 	%f14, %r28;
+$L__tmp3:
+	.loc	2 233 15
+	add.f32 	%f15, %f13, %f14;
+$L__tmp4:
+	.loc	2 243 36
+	mov.b32 	%r29, %f15;
+	shfl.sync.bfly.b32	%r30, %r29, 8, 31, -1;
+	mov.b32 	%f16, %r30;
+$L__tmp5:
+	.loc	2 233 15
+	add.f32 	%f17, %f15, %f16;
+$L__tmp6:
+	.loc	2 243 36
+	mov.b32 	%r31, %f17;
+	shfl.sync.bfly.b32	%r32, %r31, 4, 31, -1;
+	mov.b32 	%f18, %r32;
+$L__tmp7:
+	.loc	2 233 15
+	add.f32 	%f19, %f17, %f18;
+$L__tmp8:
+	.loc	2 243 36
+	mov.b32 	%r33, %f19;
+	shfl.sync.bfly.b32	%r34, %r33, 2, 31, -1;
+	mov.b32 	%f20, %r34;
+$L__tmp9:
+	.loc	2 233 15
+	add.f32 	%f21, %f19, %f20;
+$L__tmp10:
+	.loc	2 243 36
+	mov.b32 	%r35, %f21;
+	shfl.sync.bfly.b32	%r36, %r35, 1, 31, -1;
+	mov.b32 	%f22, %r36;
+$L__tmp11:
+	.loc	2 233 15
+	add.f32 	%f23, %f21, %f22;
+$L__tmp12:
+	.loc	2 243 36
+	setp.eq.s32 	%p9, %r18, 0;
+	shr.u32 	%r37, %r17, 3;
+	and.b32  	%r38, %r37, 12;
+	mov.u32 	%r39, global_smem;
+	add.s32 	%r10, %r39, %r38;
+	mov.b32 	%r11, %f23;
+	@%p9 st.shared.b32 [ %r10 + 0 ], %r11;
+	bar.sync 	0;
+	setp.lt.s32 	%p10, %r17, 4;
+	shl.b32 	%r40, %r17, 2;
+	add.s32 	%r13, %r39, %r40;
+	@%p10 ld.shared.b32 %r12, [ %r13 + 0 ];
+	mov.b32 	%f24, %r12;
+	shfl.sync.bfly.b32	%r41, %r12, 2, 31, -1;
+	mov.b32 	%f25, %r41;
+$L__tmp13:
+	.loc	2 233 15
+	add.f32 	%f26, %f24, %f25;
+$L__tmp14:
+	.loc	2 243 36
+	mov.b32 	%r42, %f26;
+	shfl.sync.bfly.b32	%r43, %r42, 1, 31, -1;
+	mov.b32 	%f27, %r43;
+$L__tmp15:
+	.loc	2 233 15
+	add.f32 	%f28, %f26, %f27;
+$L__tmp16:
+	.loc	2 243 36
+	and.b32  	%r44, %r17, 3;
+	setp.eq.s32 	%p14, %r44, 0;
+	and.pred  	%p11, %p10, %p14;
+	mov.b32 	%r15, %f28;
+	@%p11 st.shared.b32 [ %r13 + 0 ], %r15;
+	bar.sync 	0;
+	ld.shared.u32 	%r16, [global_smem];
+$L__tmp17:
+	.loc	1 36 25
+	mul.wide.s32 	%rd12, %r1, 4;
+	add.s64 	%rd5, %rd7, %rd12;
+	.loc	1 36 36
+	setp.eq.s32 	%p15, %r19, 0;
+	and.pred  	%p12, %p15, %p1;
+	@%p12 st.global.b32 [ %rd5 + 0 ], { %r16 };
+	.loc	1 36 4
+	ret;
+$L__tmp18:
+$L__func_end0:
+
+}
+	.file	1 "/tmp/torchinductor_root/ci/ccizx54ebt45pqvf7it3p5t23oudtaqbed2j3uakpossm65m4cax.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 262
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 99
+.b8 105
+.b8 122
+.b8 120
+.b8 53
+.b8 52
+.b8 101
+.b8 98
+.b8 116
+.b8 52
+.b8 53
+.b8 112
+.b8 113
+.b8 118
+.b8 102
+.b8 55
+.b8 105
+.b8 116
+.b8 51
+.b8 112
+.b8 53
+.b8 116
+.b8 50
+.b8 51
+.b8 111
+.b8 117
+.b8 100
+.b8 116
+.b8 97
+.b8 113
+.b8 98
+.b8 101
+.b8 100
+.b8 50
+.b8 106
+.b8 51
+.b8 117
+.b8 97
+.b8 107
+.b8 112
+.b8 111
+.b8 115
+.b8 115
+.b8 109
+.b8 54
+.b8 53
+.b8 109
+.b8 52
+.b8 99
+.b8 97
+.b8 120
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 99
+.b8 105
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 51
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 51
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp16
+.b8 2
+.b8 35
+.b8 25
+.b8 5
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp16
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp17
+.b8 2
+.b8 35
+.b8 25
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 266
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 101
+.b8 51
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 266
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.ttgir b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.ttgir
new file mode 100644
index 0000000000000000000000000000000000000000..caa4a5e3e6954c37ad9630a5cb6f9d0a3d62b851
--- /dev/null
+++ b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.ttgir
@@ -0,0 +1,35 @@
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
+  tt.func public @triton__0d1d2de3de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
+    %cst = arith.constant dense<256> : tensor<1x512xi32, #blocked>
+    %cst_0 = arith.constant dense<480> : tensor<1x512xi32, #blocked>
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x512xf32, #blocked>
+    %c256_i32 = arith.constant 256 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.cmpi slt, %0, %c256_i32 : i32
+    %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
+    %3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<512xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x512xi32, #blocked>
+    %4 = arith.cmpi slt, %3, %cst_0 : tensor<1x512xi32, #blocked>
+    %5 = arith.muli %3, %cst : tensor<1x512xi32, #blocked>
+    %6 = tt.splat %0 : (i32) -> tensor<1x512xi32, #blocked>
+    %7 = arith.addi %6, %5 : tensor<1x512xi32, #blocked>
+    %8 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1x512x!tt.ptr<f32, 1>, #blocked>
+    %9 = tt.addptr %8, %7 : tensor<1x512x!tt.ptr<f32, 1>, #blocked>, tensor<1x512xi32, #blocked>
+    %10 = tt.splat %1 : (i1) -> tensor<1x512xi1, #blocked>
+    %11 = arith.andi %4, %10 : tensor<1x512xi1, #blocked>
+    %12 = tt.load %9, %11, %cst_1 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x512xf32, #blocked>
+    %13 = arith.addf %12, %cst_1 : tensor<1x512xf32, #blocked>
+    %14 = arith.select %11, %13, %cst_1 : tensor<1x512xi1, #blocked>, tensor<1x512xf32, #blocked>
+    %15 = "tt.reduce"(%14) <{axis = 1 : i32}> ({
+    ^bb0(%arg4: f32, %arg5: f32):
+      %20 = arith.addf %arg4, %arg5 : f32
+      tt.reduce.return %20 : f32
+    }) : (tensor<1x512xf32, #blocked>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %16 = tt.expand_dims %15 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked>
+    %17 = tt.addptr %arg1, %0 : !tt.ptr<f32, 1>, i32
+    %18 = tt.splat %17 : (!tt.ptr<f32, 1>) -> tensor<1x1x!tt.ptr<f32, 1>, #blocked>
+    %19 = tt.splat %1 : (i1) -> tensor<1x1xi1, #blocked>
+    tt.store %18, %16, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32, #blocked>
+    tt.return
+  }
+}
diff --git a/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ptx b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ptx
new file mode 100644
index 0000000000000000000000000000000000000000..6c93281712f9e553538000e6a6a99e22f723d4fa
--- /dev/null
+++ b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ptx
@@ -0,0 +1,1360 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 8.2
+.target sm_89
+.address_size 64
+
+	// .globl	triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de
+.extern .shared .align 1 .b8 global_smem[];
+
+.visible .entry triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de(
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_0,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_1,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_2,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_3,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_4,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_5,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_6,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_7,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_8,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_9,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_10,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_11,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_12,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_13,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_14,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_15,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_16,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_17,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_18,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_19,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_20,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_21,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_22,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_23,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_24,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_25,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_26,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_27,
+	.param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_28,
+	.param .u32 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_29,
+	.param .u32 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_30
+)
+.maxntid 64, 1, 1
+{
+	.reg .pred 	%p<96>;
+	.reg .b16 	%rs<37>;
+	.reg .b32 	%r<222>;
+	.reg .f32 	%f<186>;
+	.reg .b64 	%rd<93>;
+	.loc	1 18 0
+$L__func_begin0:
+	.loc	1 18 0
+
+	ld.param.u64 	%rd60, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_0];
+	ld.param.u64 	%rd61, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_1];
+$L__tmp0:
+	.loc	1 26 26
+	mov.u32 	%r188, %tid.x;
+	and.b32  	%r189, %r188, 31;
+	ld.param.u64 	%rd62, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_2];
+	ld.param.u64 	%rd63, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_3];
+	ld.param.u64 	%rd64, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_4];
+	shl.b32 	%r190, %r188, 2;
+	ld.param.u64 	%rd65, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_5];
+	and.b32  	%r191, %r190, 252;
+	ld.param.u64 	%rd66, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_6];
+	ld.param.u64 	%rd67, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_7];
+	.loc	1 23 28
+	mov.u32 %r1, %ctaid.x;
+	.loc	1 30 40
+	shl.b32 	%r192, %r1, 8;
+	ld.param.u64 	%rd68, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_8];
+	.loc	1 30 36
+	or.b32  	%r193, %r192, %r191;
+	ld.param.u64 	%rd69, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_9];
+	ld.param.u64 	%rd70, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_10];
+	.loc	1 30 30
+	mul.wide.s32 	%rd71, %r193, 4;
+	add.s64 	%rd1, %rd60, %rd71;
+	ld.param.u64 	%rd72, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_11];
+	mov.b32 	%r6, 0;
+	mov.pred 	%p1, -1;
+	.loc	1 30 46
+	mov.u32 %r2, 0x0;
+	mov.u32 %r3, 0x0;
+	mov.u32 %r4, 0x0;
+	mov.u32 %r5, 0x0;
+	@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ];
+	@!%p1 mov.u32 %r2, %r6;
+	@!%p1 mov.u32 %r3, %r6;
+	@!%p1 mov.u32 %r4, %r6;
+	@!%p1 mov.u32 %r5, %r6;
+	ld.param.u64 	%rd73, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_12];
+	ld.param.u64 	%rd74, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_13];
+	ld.param.u64 	%rd75, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_14];
+	ld.param.u64 	%rd76, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_15];
+	ld.param.u64 	%rd77, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_16];
+	mov.b32 	%f1, %r2;
+	ld.param.u64 	%rd78, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_17];
+	mov.b32 	%f2, %r3;
+	ld.param.u64 	%rd79, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_18];
+	mov.b32 	%f3, %r4;
+	ld.param.u64 	%rd80, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_19];
+	mov.b32 	%f4, %r5;
+	ld.param.u64 	%rd81, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_20];
+	.loc	1 31 30
+	mul.wide.s32 	%rd82, %r193, 2;
+	add.s64 	%rd2, %rd61, %rd82;
+	ld.param.u64 	%rd83, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_21];
+	.loc	1 31 46
+	mov.u32 %r10, 0x0;
+	mov.u32 %r11, 0x0;
+	@%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ];
+	@!%p1 mov.u32 %r10, %r6;
+	@!%p1 mov.u32 %r11, %r6;
+	ld.param.u64 	%rd84, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_22];
+	ld.param.u64 	%rd85, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_23];
+	ld.param.u64 	%rd86, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_24];
+	cvt.u16.u32 	%rs1, %r10;
+	ld.param.u64 	%rd87, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_25];
+	ld.param.u64 	%rd88, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_26];
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; }
+	ld.param.u64 	%rd89, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_27];
+	cvt.u16.u32 	%rs3, %r11;
+	ld.param.u64 	%rd90, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_28];
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; }
+	.loc	1 31 67
+	cvt.f32.bf16 %r14, %rs1;
+	mov.b32 	%f5, %r14;
+	cvt.f32.bf16 %r15, %rs2;
+	mov.b32 	%f6, %r15;
+	cvt.f32.bf16 %r16, %rs3;
+	mov.b32 	%f7, %r16;
+	cvt.f32.bf16 %r17, %rs4;
+	mov.b32 	%f8, %r17;
+	.loc	1 32 30
+	add.s64 	%rd3, %rd62, %rd82;
+	.loc	1 32 46
+	mov.u32 %r18, 0x0;
+	mov.u32 %r19, 0x0;
+	@%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ];
+	@!%p1 mov.u32 %r18, %r6;
+	@!%p1 mov.u32 %r19, %r6;
+	cvt.u16.u32 	%rs5, %r18;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; }
+	cvt.u16.u32 	%rs7, %r19;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; }
+	.loc	1 32 67
+	cvt.f32.bf16 %r22, %rs5;
+	mov.b32 	%f9, %r22;
+	cvt.f32.bf16 %r23, %rs6;
+	mov.b32 	%f10, %r23;
+	cvt.f32.bf16 %r24, %rs7;
+	mov.b32 	%f11, %r24;
+	cvt.f32.bf16 %r25, %rs8;
+	mov.b32 	%f12, %r25;
+	.loc	1 33 30
+	mul.wide.s32 	%rd91, %r1, 4;
+	add.s64 	%rd4, %rd63, %rd91;
+	.loc	1 33 35
+	mov.u32 %r26, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r26 }, [ %rd4 + 0 ];
+	mov.b32 	%f13, %r26;
+	mov.u32 %r27, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r27 }, [ %rd4 + 0 ];
+	mov.u32 %r28, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r28 }, [ %rd4 + 0 ];
+	mov.u32 %r29, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r29 }, [ %rd4 + 0 ];
+	.loc	1 34 30
+	add.s64 	%rd8, %rd64, %rd91;
+	.loc	1 34 35
+	mov.u32 %r30, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r30 }, [ %rd8 + 0 ];
+	mov.b32 	%f14, %r30;
+	mov.u32 %r31, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r31 }, [ %rd8 + 0 ];
+	mov.u32 %r32, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r32 }, [ %rd8 + 0 ];
+	mov.u32 %r33, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r33 }, [ %rd8 + 0 ];
+	.loc	1 35 31
+	add.s64 	%rd12, %rd65, %rd82;
+	.loc	1 35 47
+	mov.u32 %r34, 0x0;
+	mov.u32 %r35, 0x0;
+	@%p1 ld.global.v2.b32 { %r34, %r35 }, [ %rd12 + 0 ];
+	@!%p1 mov.u32 %r34, %r6;
+	@!%p1 mov.u32 %r35, %r6;
+	cvt.u16.u32 	%rs9, %r34;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r34; }
+	cvt.u16.u32 	%rs11, %r35;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r35; }
+	.loc	1 35 68
+	cvt.f32.bf16 %r38, %rs9;
+	mov.b32 	%f15, %r38;
+	cvt.f32.bf16 %r39, %rs10;
+	mov.b32 	%f16, %r39;
+	cvt.f32.bf16 %r40, %rs11;
+	mov.b32 	%f17, %r40;
+	cvt.f32.bf16 %r41, %rs12;
+	mov.b32 	%f18, %r41;
+	.loc	1 36 31
+	add.s64 	%rd13, %rd66, %rd91;
+	.loc	1 36 36
+	mov.u32 %r42, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r42 }, [ %rd13 + 0 ];
+	mov.b32 	%f19, %r42;
+	mov.u32 %r43, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r43 }, [ %rd13 + 0 ];
+	mov.u32 %r44, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r44 }, [ %rd13 + 0 ];
+	mov.u32 %r45, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r45 }, [ %rd13 + 0 ];
+	.loc	1 37 31
+	add.s64 	%rd17, %rd67, %rd91;
+	.loc	1 37 36
+	mov.u32 %r46, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r46 }, [ %rd17 + 0 ];
+	mov.b32 	%f20, %r46;
+	mov.u32 %r47, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r47 }, [ %rd17 + 0 ];
+	mov.u32 %r48, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r48 }, [ %rd17 + 0 ];
+	mov.u32 %r49, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r49 }, [ %rd17 + 0 ];
+	.loc	1 38 31
+	add.s64 	%rd21, %rd68, %rd82;
+	.loc	1 38 47
+	mov.u32 %r50, 0x0;
+	mov.u32 %r51, 0x0;
+	@%p1 ld.global.v2.b32 { %r50, %r51 }, [ %rd21 + 0 ];
+	@!%p1 mov.u32 %r50, %r6;
+	@!%p1 mov.u32 %r51, %r6;
+	cvt.u16.u32 	%rs13, %r50;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r50; }
+	cvt.u16.u32 	%rs15, %r51;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r51; }
+	.loc	1 38 68
+	cvt.f32.bf16 %r54, %rs13;
+	mov.b32 	%f21, %r54;
+	cvt.f32.bf16 %r55, %rs14;
+	mov.b32 	%f22, %r55;
+	cvt.f32.bf16 %r56, %rs15;
+	mov.b32 	%f23, %r56;
+	cvt.f32.bf16 %r57, %rs16;
+	mov.b32 	%f24, %r57;
+	.loc	1 39 31
+	add.s64 	%rd22, %rd69, %rd82;
+	.loc	1 39 47
+	mov.u32 %r58, 0x0;
+	mov.u32 %r59, 0x0;
+	@%p1 ld.global.v2.b32 { %r58, %r59 }, [ %rd22 + 0 ];
+	@!%p1 mov.u32 %r58, %r6;
+	@!%p1 mov.u32 %r59, %r6;
+	cvt.u16.u32 	%rs17, %r58;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r58; }
+	cvt.u16.u32 	%rs19, %r59;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r59; }
+	.loc	1 39 68
+	cvt.f32.bf16 %r62, %rs17;
+	mov.b32 	%f25, %r62;
+	cvt.f32.bf16 %r63, %rs18;
+	mov.b32 	%f26, %r63;
+	cvt.f32.bf16 %r64, %rs19;
+	mov.b32 	%f27, %r64;
+	cvt.f32.bf16 %r65, %rs20;
+	mov.b32 	%f28, %r65;
+	.loc	1 40 32
+	add.s64 	%rd23, %rd70, %rd82;
+	.loc	1 40 48
+	mov.u32 %r66, 0x0;
+	mov.u32 %r67, 0x0;
+	@%p1 ld.global.v2.b32 { %r66, %r67 }, [ %rd23 + 0 ];
+	@!%p1 mov.u32 %r66, %r6;
+	@!%p1 mov.u32 %r67, %r6;
+	cvt.u16.u32 	%rs21, %r66;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r66; }
+	cvt.u16.u32 	%rs23, %r67;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r67; }
+	.loc	1 40 69
+	cvt.f32.bf16 %r70, %rs21;
+	mov.b32 	%f29, %r70;
+	cvt.f32.bf16 %r71, %rs22;
+	mov.b32 	%f30, %r71;
+	cvt.f32.bf16 %r72, %rs23;
+	mov.b32 	%f31, %r72;
+	cvt.f32.bf16 %r73, %rs24;
+	mov.b32 	%f32, %r73;
+	.loc	1 41 32
+	add.s64 	%rd24, %rd72, %rd91;
+	.loc	1 41 37
+	mov.u32 %r74, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r74 }, [ %rd24 + 0 ];
+	mov.b32 	%f33, %r74;
+	mov.u32 %r75, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r75 }, [ %rd24 + 0 ];
+	mov.u32 %r76, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r76 }, [ %rd24 + 0 ];
+	mov.u32 %r77, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r77 }, [ %rd24 + 0 ];
+	.loc	1 42 32
+	add.s64 	%rd28, %rd73, %rd91;
+	.loc	1 42 37
+	mov.u32 %r78, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r78 }, [ %rd28 + 0 ];
+	mov.b32 	%f34, %r78;
+	mov.u32 %r79, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r79 }, [ %rd28 + 0 ];
+	mov.u32 %r80, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r80 }, [ %rd28 + 0 ];
+	mov.u32 %r81, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r81 }, [ %rd28 + 0 ];
+	.loc	1 43 32
+	add.s64 	%rd32, %rd74, %rd82;
+	.loc	1 43 48
+	mov.u32 %r82, 0x0;
+	mov.u32 %r83, 0x0;
+	@%p1 ld.global.v2.b32 { %r82, %r83 }, [ %rd32 + 0 ];
+	@!%p1 mov.u32 %r82, %r6;
+	@!%p1 mov.u32 %r83, %r6;
+	cvt.u16.u32 	%rs25, %r82;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs26}, %r82; }
+	cvt.u16.u32 	%rs27, %r83;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs28}, %r83; }
+	.loc	1 43 69
+	cvt.f32.bf16 %r86, %rs25;
+	mov.b32 	%f35, %r86;
+	cvt.f32.bf16 %r87, %rs26;
+	mov.b32 	%f36, %r87;
+	cvt.f32.bf16 %r88, %rs27;
+	mov.b32 	%f37, %r88;
+	cvt.f32.bf16 %r89, %rs28;
+	mov.b32 	%f38, %r89;
+	.loc	1 44 32
+	add.s64 	%rd33, %rd75, %rd91;
+	.loc	1 44 37
+	mov.u32 %r90, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r90 }, [ %rd33 + 0 ];
+	mov.b32 	%f39, %r90;
+	mov.u32 %r91, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r91 }, [ %rd33 + 0 ];
+	mov.u32 %r92, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r92 }, [ %rd33 + 0 ];
+	mov.u32 %r93, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r93 }, [ %rd33 + 0 ];
+	.loc	1 45 32
+	add.s64 	%rd37, %rd76, %rd91;
+	.loc	1 45 37
+	mov.u32 %r94, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r94 }, [ %rd37 + 0 ];
+	mov.b32 	%f40, %r94;
+	mov.u32 %r95, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r95 }, [ %rd37 + 0 ];
+	mov.u32 %r96, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r96 }, [ %rd37 + 0 ];
+	mov.u32 %r97, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r97 }, [ %rd37 + 0 ];
+	.loc	1 46 32
+	add.s64 	%rd41, %rd77, %rd82;
+	.loc	1 46 48
+	mov.u32 %r98, 0x0;
+	mov.u32 %r99, 0x0;
+	@%p1 ld.global.v2.b32 { %r98, %r99 }, [ %rd41 + 0 ];
+	@!%p1 mov.u32 %r98, %r6;
+	@!%p1 mov.u32 %r99, %r6;
+	cvt.u16.u32 	%rs29, %r98;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs30}, %r98; }
+	cvt.u16.u32 	%rs31, %r99;
+	{ .reg .b16 tmp; mov.b32 {tmp, %rs32}, %r99; }
+	.loc	1 46 69
+	cvt.f32.bf16 %r102, %rs29;
+	mov.b32 	%f41, %r102;
+	cvt.f32.bf16 %r103, %rs30;
+	mov.b32 	%f42, %r103;
+	cvt.f32.bf16 %r104, %rs31;
+	mov.b32 	%f43, %r104;
+	cvt.f32.bf16 %r105, %rs32;
+	mov.b32 	%f44, %r105;
+	.loc	1 47 32
+	add.s64 	%rd42, %rd78, %rd91;
+	.loc	1 47 37
+	mov.u32 %r106, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r106 }, [ %rd42 + 0 ];
+	mov.b32 	%f45, %r106;
+	mov.u32 %r107, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r107 }, [ %rd42 + 0 ];
+	mov.u32 %r108, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r108 }, [ %rd42 + 0 ];
+	mov.u32 %r109, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r109 }, [ %rd42 + 0 ];
+	.loc	1 48 32
+	add.s64 	%rd46, %rd79, %rd91;
+	.loc	1 48 37
+	mov.u32 %r143, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r143 }, [ %rd46 + 0 ];
+	mov.b32 	%f46, %r143;
+	mov.u32 %r111, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r111 }, [ %rd46 + 0 ];
+	mov.u32 %r112, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r112 }, [ %rd46 + 0 ];
+	mov.u32 %r113, 0x0;
+	@%p1 ld.global.L1::evict_last.b32 { %r113 }, [ %rd46 + 0 ];
+	.loc	1 49 32
+	add.s64 	%rd50, %rd80, %rd71;
+	.loc	1 49 48
+	mov.u32 %r114, 0x0;
+	mov.u32 %r115, 0x0;
+	mov.u32 %r116, 0x0;
+	mov.u32 %r117, 0x0;
+	@%p1 ld.global.v4.b32 { %r114, %r115, %r116, %r117 }, [ %rd50 + 0 ];
+	@!%p1 mov.u32 %r114, %r6;
+	@!%p1 mov.u32 %r115, %r6;
+	@!%p1 mov.u32 %r116, %r6;
+	@!%p1 mov.u32 %r117, %r6;
+	.loc	1 50 32
+	mul.wide.u32 	%rd92, %r191, 4;
+	add.s64 	%rd51, %rd81, %rd92;
+	.loc	1 50 37
+	mov.u32 %r122, 0x0;
+	mov.u32 %r123, 0x0;
+	mov.u32 %r124, 0x0;
+	mov.u32 %r125, 0x0;
+	@%p1 ld.global.L1::evict_last.v4.b32 { %r122, %r123, %r124, %r125 }, [ %rd51 + 0 ];
+	@!%p1 mov.u32 %r122, %r6;
+	@!%p1 mov.u32 %r123, %r6;
+	@!%p1 mov.u32 %r124, %r6;
+	@!%p1 mov.u32 %r125, %r6;
+	.loc	1 52 18
+	add.f32 	%f47, %f5, %f1;
+	add.f32 	%f48, %f6, %f2;
+	add.f32 	%f49, %f7, %f3;
+	add.f32 	%f50, %f8, %f4;
+	.loc	1 54 18
+	add.f32 	%f51, %f47, %f9;
+	add.f32 	%f52, %f48, %f10;
+	add.f32 	%f53, %f49, %f11;
+	add.f32 	%f54, %f50, %f12;
+	.loc	1 55 18
+	sub.f32 	%f55, %f51, %f13;
+	sub.f32 	%f56, %f52, %f13;
+	sub.f32 	%f57, %f53, %f13;
+	sub.f32 	%f58, %f54, %f13;
+	.loc	1 56 19
+	mul.f32 	%f59, %f55, %f14;
+	mul.f32 	%f60, %f56, %f14;
+	mul.f32 	%f61, %f57, %f14;
+	mul.f32 	%f62, %f58, %f14;
+	.loc	1 58 19
+	add.f32 	%f63, %f51, %f15;
+	add.f32 	%f64, %f52, %f16;
+	add.f32 	%f65, %f53, %f17;
+	add.f32 	%f66, %f54, %f18;
+	.loc	1 59 20
+	sub.f32 	%f67, %f63, %f19;
+	sub.f32 	%f68, %f64, %f19;
+	sub.f32 	%f69, %f65, %f19;
+	sub.f32 	%f70, %f66, %f19;
+	.loc	1 60 20
+	mul.f32 	%f71, %f67, %f20;
+	mul.f32 	%f72, %f68, %f20;
+	mul.f32 	%f73, %f69, %f20;
+	mul.f32 	%f74, %f70, %f20;
+	.loc	1 62 20
+	add.f32 	%f75, %f63, %f21;
+	add.f32 	%f76, %f64, %f22;
+	add.f32 	%f77, %f65, %f23;
+	add.f32 	%f78, %f66, %f24;
+	.loc	1 64 20
+	add.f32 	%f79, %f75, %f25;
+	add.f32 	%f80, %f76, %f26;
+	add.f32 	%f81, %f77, %f27;
+	add.f32 	%f82, %f78, %f28;
+	.loc	1 66 20
+	add.f32 	%f83, %f79, %f29;
+	add.f32 	%f84, %f80, %f30;
+	add.f32 	%f85, %f81, %f31;
+	add.f32 	%f86, %f82, %f32;
+	.loc	1 67 20
+	sub.f32 	%f87, %f83, %f33;
+	sub.f32 	%f88, %f84, %f33;
+	sub.f32 	%f89, %f85, %f33;
+	sub.f32 	%f90, %f86, %f33;
+	.loc	1 68 20
+	mul.f32 	%f91, %f87, %f34;
+	mul.f32 	%f92, %f88, %f34;
+	mul.f32 	%f93, %f89, %f34;
+	mul.f32 	%f94, %f90, %f34;
+	.loc	1 70 20
+	add.f32 	%f95, %f83, %f35;
+	add.f32 	%f96, %f84, %f36;
+	add.f32 	%f97, %f85, %f37;
+	add.f32 	%f98, %f86, %f38;
+	.loc	1 71 20
+	sub.f32 	%f99, %f95, %f39;
+	sub.f32 	%f100, %f96, %f39;
+	sub.f32 	%f101, %f97, %f39;
+	sub.f32 	%f102, %f98, %f39;
+	.loc	1 72 20
+	mul.f32 	%f103, %f99, %f40;
+	mul.f32 	%f104, %f100, %f40;
+	mul.f32 	%f105, %f101, %f40;
+	mul.f32 	%f106, %f102, %f40;
+	.loc	1 74 20
+	add.f32 	%f107, %f95, %f41;
+	add.f32 	%f108, %f96, %f42;
+	add.f32 	%f109, %f97, %f43;
+	add.f32 	%f110, %f98, %f44;
+	.loc	1 75 20
+	sub.f32 	%f111, %f107, %f45;
+	sub.f32 	%f112, %f108, %f45;
+	sub.f32 	%f113, %f109, %f45;
+	sub.f32 	%f114, %f110, %f45;
+	.loc	1 76 20
+	mul.f32 	%f115, %f111, %f46;
+	mul.f32 	%f116, %f112, %f46;
+	mul.f32 	%f117, %f113, %f46;
+	mul.f32 	%f118, %f114, %f46;
+	.loc	1 49 48
+	mov.b32 	%f119, %r115;
+	mov.b32 	%f120, %r114;
+	.loc	1 50 37
+	mov.b32 	%f121, %r123;
+	mov.b32 	%f122, %r122;
+	.loc	1 77 20
+	mul.f32 	%f123, %f120, %f122;
+	mul.f32 	%f124, %f119, %f121;
+	.loc	1 49 48
+	mov.b32 	%f125, %r116;
+	mov.b32 	%f126, %r117;
+	.loc	1 50 37
+	mov.b32 	%f127, %r124;
+	mov.b32 	%f128, %r125;
+	.loc	1 77 20
+	mul.f32 	%f129, %f126, %f128;
+	mul.f32 	%f130, %f125, %f127;
+$L__tmp1:
+	.loc	2 233 15
+	fma.rn.f32 	%f131, %f120, %f122, %f124;
+	fma.rn.f32 	%f132, %f125, %f127, %f131;
+	fma.rn.f32 	%f133, %f126, %f128, %f132;
+$L__tmp2:
+	.loc	2 243 36
+	mov.b32 	%r194, %f133;
+	shfl.sync.bfly.b32	%r195, %r194, 16, 31, -1;
+	mov.b32 	%f134, %r195;
+$L__tmp3:
+	.loc	2 233 15
+	add.f32 	%f135, %f133, %f134;
+$L__tmp4:
+	.loc	2 243 36
+	mov.b32 	%r196, %f135;
+	shfl.sync.bfly.b32	%r197, %r196, 8, 31, -1;
+	mov.b32 	%f136, %r197;
+$L__tmp5:
+	.loc	2 233 15
+	add.f32 	%f137, %f135, %f136;
+$L__tmp6:
+	.loc	2 243 36
+	mov.b32 	%r198, %f137;
+	shfl.sync.bfly.b32	%r199, %r198, 4, 31, -1;
+	mov.b32 	%f138, %r199;
+$L__tmp7:
+	.loc	2 233 15
+	add.f32 	%f139, %f137, %f138;
+$L__tmp8:
+	.loc	2 243 36
+	mov.b32 	%r200, %f139;
+	shfl.sync.bfly.b32	%r201, %r200, 2, 31, -1;
+	mov.b32 	%f140, %r201;
+$L__tmp9:
+	.loc	2 233 15
+	add.f32 	%f141, %f139, %f140;
+$L__tmp10:
+	.loc	2 243 36
+	mov.b32 	%r202, %f141;
+	shfl.sync.bfly.b32	%r203, %r202, 1, 31, -1;
+	mov.b32 	%f142, %r203;
+$L__tmp11:
+	.loc	2 233 15
+	add.f32 	%f143, %f141, %f142;
+$L__tmp12:
+	.loc	2 243 36
+	setp.eq.s32 	%p80, %r189, 0;
+	shr.u32 	%r204, %r188, 3;
+	and.b32  	%r205, %r204, 4;
+	mov.u32 	%r206, global_smem;
+	add.s32 	%r130, %r206, %r205;
+	mov.b32 	%r131, %f143;
+	@%p80 st.shared.b32 [ %r130 + 0 ], %r131;
+	bar.sync 	0;
+	setp.lt.s32 	%p81, %r188, 2;
+	add.s32 	%r133, %r206, %r190;
+	@%p81 ld.shared.b32 %r132, [ %r133 + 0 ];
+	mov.b32 	%f144, %r132;
+	shfl.sync.bfly.b32	%r207, %r132, 1, 31, -1;
+	mov.b32 	%f145, %r207;
+$L__tmp13:
+	.loc	2 233 15
+	add.f32 	%f146, %f144, %f145;
+$L__tmp14:
+	.loc	2 243 36
+	and.b32  	%r208, %r188, 1;
+	setp.eq.b32 	%p94, %r208, 1;
+	not.pred 	%p95, %p94;
+	and.pred  	%p82, %p81, %p95;
+	mov.b32 	%r135, %f146;
+	@%p82 st.shared.b32 [ %r133 + 0 ], %r135;
+	bar.sync 	0;
+	ld.shared.f32 	%f147, [global_smem];
+$L__tmp15:
+	.loc	3 8 15
+	add.f32 	%f148, %f147, 0f00000000;
+$L__tmp16:
+	.loc	1 81 20
+	mul.f32 	%f149, %f116, %f124;
+$L__tmp17:
+	.loc	2 243 36
+	bar.sync 	0;
+$L__tmp18:
+	.loc	2 233 15
+	fma.rn.f32 	%f150, %f115, %f123, %f149;
+	fma.rn.f32 	%f151, %f117, %f130, %f150;
+	fma.rn.f32 	%f152, %f118, %f129, %f151;
+$L__tmp19:
+	.loc	2 243 36
+	mov.b32 	%r209, %f152;
+	shfl.sync.bfly.b32	%r210, %r209, 16, 31, -1;
+	mov.b32 	%f153, %r210;
+$L__tmp20:
+	.loc	2 233 15
+	add.f32 	%f154, %f152, %f153;
+$L__tmp21:
+	.loc	2 243 36
+	mov.b32 	%r211, %f154;
+	shfl.sync.bfly.b32	%r212, %r211, 8, 31, -1;
+	mov.b32 	%f155, %r212;
+$L__tmp22:
+	.loc	2 233 15
+	add.f32 	%f156, %f154, %f155;
+$L__tmp23:
+	.loc	2 243 36
+	mov.b32 	%r213, %f156;
+	shfl.sync.bfly.b32	%r214, %r213, 4, 31, -1;
+	mov.b32 	%f157, %r214;
+$L__tmp24:
+	.loc	2 233 15
+	add.f32 	%f158, %f156, %f157;
+$L__tmp25:
+	.loc	2 243 36
+	mov.b32 	%r215, %f158;
+	shfl.sync.bfly.b32	%r216, %r215, 2, 31, -1;
+	mov.b32 	%f159, %r216;
+$L__tmp26:
+	.loc	2 233 15
+	add.f32 	%f160, %f158, %f159;
+$L__tmp27:
+	.loc	2 243 36
+	mov.b32 	%r217, %f160;
+	shfl.sync.bfly.b32	%r218, %r217, 1, 31, -1;
+	mov.b32 	%f161, %r218;
+$L__tmp28:
+	.loc	2 233 15
+	add.f32 	%f162, %f160, %f161;
+$L__tmp29:
+	.loc	2 243 36
+	mov.b32 	%r137, %f162;
+	@%p80 st.shared.b32 [ %r130 + 0 ], %r137;
+	bar.sync 	0;
+	@%p81 ld.shared.b32 %r138, [ %r133 + 0 ];
+	mov.b32 	%f163, %r138;
+	shfl.sync.bfly.b32	%r219, %r138, 1, 31, -1;
+	mov.b32 	%f164, %r219;
+$L__tmp30:
+	.loc	2 233 15
+	add.f32 	%f165, %f163, %f164;
+$L__tmp31:
+	.loc	2 243 36
+	mov.b32 	%r141, %f165;
+	@%p82 st.shared.b32 [ %r133 + 0 ], %r141;
+	bar.sync 	0;
+	ld.shared.f32 	%f166, [global_smem];
+$L__tmp32:
+	.loc	3 8 15
+	add.f32 	%f167, %f166, 0f00000000;
+	mov.b32 	%r144, 1132462080;
+$L__tmp33:
+	.loc	1 86 20
+	div.full.f32 %r142, %r143, %r144;
+	mov.b32 	%f168, %r142;
+	.loc	1 88 20
+	neg.f32 	%f169, %f148;
+	fma.rn.f32 	%f170, %f123, 0f43800000, %f169;
+	fma.rn.f32 	%f171, %f124, 0f43800000, %f169;
+	fma.rn.f32 	%f172, %f130, 0f43800000, %f169;
+	fma.rn.f32 	%f173, %f129, 0f43800000, %f169;
+	.loc	1 90 20
+	neg.f32 	%f174, %f115;
+	fma.rn.f32 	%f175, %f174, %f167, %f170;
+	neg.f32 	%f176, %f116;
+	fma.rn.f32 	%f177, %f176, %f167, %f171;
+	neg.f32 	%f178, %f117;
+	fma.rn.f32 	%f179, %f178, %f167, %f172;
+	neg.f32 	%f180, %f118;
+	fma.rn.f32 	%f181, %f180, %f167, %f173;
+	.loc	1 91 20
+	mul.f32 	%f182, %f168, %f175;
+	mul.f32 	%f183, %f168, %f177;
+	mul.f32 	%f184, %f168, %f179;
+	mul.f32 	%f185, %f168, %f181;
+	.loc	1 93 25
+	add.s64 	%rd52, %rd83, %rd71;
+	.loc	1 93 48
+	mov.b32 	%r154, %f59;
+	mov.b32 	%r155, %f60;
+	mov.b32 	%r156, %f61;
+	mov.b32 	%r157, %f62;
+	@%p1 st.global.v4.b32 [ %rd52 + 0 ], { %r154, %r155, %r156, %r157 };
+	.loc	1 94 25
+	add.s64 	%rd53, %rd84, %rd71;
+	.loc	1 94 48
+	mov.b32 	%r158, %f71;
+	mov.b32 	%r159, %f72;
+	mov.b32 	%r160, %f73;
+	mov.b32 	%r161, %f74;
+	@%p1 st.global.v4.b32 [ %rd53 + 0 ], { %r158, %r159, %r160, %r161 };
+	.loc	1 95 25
+	add.s64 	%rd54, %rd85, %rd71;
+	.loc	1 95 48
+	mov.b32 	%r162, %f75;
+	mov.b32 	%r163, %f76;
+	mov.b32 	%r164, %f77;
+	mov.b32 	%r165, %f78;
+	@%p1 st.global.v4.b32 [ %rd54 + 0 ], { %r162, %r163, %r164, %r165 };
+	.loc	1 96 25
+	add.s64 	%rd55, %rd86, %rd71;
+	.loc	1 96 48
+	mov.b32 	%r166, %f91;
+	mov.b32 	%r167, %f92;
+	mov.b32 	%r168, %f93;
+	mov.b32 	%r169, %f94;
+	@%p1 st.global.v4.b32 [ %rd55 + 0 ], { %r166, %r167, %r168, %r169 };
+	.loc	1 97 25
+	add.s64 	%rd56, %rd87, %rd71;
+	.loc	1 97 48
+	mov.b32 	%r170, %f103;
+	mov.b32 	%r171, %f104;
+	mov.b32 	%r172, %f105;
+	mov.b32 	%r173, %f106;
+	@%p1 st.global.v4.b32 [ %rd56 + 0 ], { %r170, %r171, %r172, %r173 };
+	.loc	1 98 25
+	add.s64 	%rd57, %rd88, %rd71;
+	.loc	1 98 48
+	mov.b32 	%r174, %f115;
+	mov.b32 	%r175, %f116;
+	mov.b32 	%r176, %f117;
+	mov.b32 	%r177, %f118;
+	@%p1 st.global.v4.b32 [ %rd57 + 0 ], { %r174, %r175, %r176, %r177 };
+	.loc	1 99 25
+	add.s64 	%rd58, %rd89, %rd71;
+	.loc	1 99 48
+	mov.b32 	%r178, %f182;
+	mov.b32 	%r179, %f183;
+	mov.b32 	%r180, %f184;
+	mov.b32 	%r181, %f185;
+	@%p1 st.global.v4.b32 [ %rd58 + 0 ], { %r178, %r179, %r180, %r181 };
+	.loc	1 100 25
+	add.s64 	%rd59, %rd90, %rd82;
+	.loc	1 100 48
+	cvt.rn.bf16.f32 %rs33, %r178;
+	cvt.rn.bf16.f32 %rs34, %r179;
+	cvt.rn.bf16.f32 %rs35, %r180;
+	cvt.rn.bf16.f32 %rs36, %r181;
+	mov.b32 	%r220, {%rs33, %rs34};
+	mov.b32 	%r221, {%rs35, %rs36};
+	@%p1 st.global.v2.b32 [ %rd59 + 0 ], { %r220, %r221 };
+	.loc	1 100 4
+	ret;
+$L__tmp34:
+$L__func_end0:
+
+}
+	.file	1 "/tmp/torchinductor_root/yo/cyo4ksjyladdfw6jgu5nyxbapyihb5b54nc6mogi76rx2lajsiff.py"
+	.file	2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
+	.file	3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py"
+	.section	.debug_abbrev
+	{
+.b8 1
+.b8 17
+.b8 1
+.b8 37
+.b8 8
+.b8 19
+.b8 5
+.b8 3
+.b8 8
+.b8 16
+.b8 6
+.b8 27
+.b8 8
+.b8 180
+.b8 66
+.b8 12
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 0
+.b8 0
+.b8 2
+.b8 46
+.b8 0
+.b8 135
+.b8 64
+.b8 8
+.b8 3
+.b8 8
+.b8 58
+.b8 11
+.b8 59
+.b8 11
+.b8 63
+.b8 12
+.b8 32
+.b8 11
+.b8 0
+.b8 0
+.b8 3
+.b8 46
+.b8 1
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 64
+.b8 10
+.b8 49
+.b8 19
+.b8 0
+.b8 0
+.b8 4
+.b8 29
+.b8 1
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 5
+.b8 29
+.b8 0
+.b8 49
+.b8 19
+.b8 17
+.b8 1
+.b8 18
+.b8 1
+.b8 88
+.b8 11
+.b8 89
+.b8 11
+.b8 87
+.b8 11
+.b8 0
+.b8 0
+.b8 0
+	}
+	.section	.debug_info
+	{
+.b32 533
+.b8 2
+.b8 0
+.b32 .debug_abbrev
+.b8 8
+.b8 1
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 0
+.b8 2
+.b8 0
+.b8 99
+.b8 121
+.b8 111
+.b8 52
+.b8 107
+.b8 115
+.b8 106
+.b8 121
+.b8 108
+.b8 97
+.b8 100
+.b8 100
+.b8 102
+.b8 119
+.b8 54
+.b8 106
+.b8 103
+.b8 117
+.b8 53
+.b8 110
+.b8 121
+.b8 120
+.b8 98
+.b8 97
+.b8 112
+.b8 121
+.b8 105
+.b8 104
+.b8 98
+.b8 53
+.b8 98
+.b8 53
+.b8 52
+.b8 110
+.b8 99
+.b8 54
+.b8 109
+.b8 111
+.b8 103
+.b8 105
+.b8 55
+.b8 54
+.b8 114
+.b8 120
+.b8 50
+.b8 108
+.b8 97
+.b8 106
+.b8 115
+.b8 105
+.b8 102
+.b8 102
+.b8 46
+.b8 112
+.b8 121
+.b8 0
+.b32 .debug_line
+.b8 47
+.b8 116
+.b8 109
+.b8 112
+.b8 47
+.b8 116
+.b8 111
+.b8 114
+.b8 99
+.b8 104
+.b8 105
+.b8 110
+.b8 100
+.b8 117
+.b8 99
+.b8 116
+.b8 111
+.b8 114
+.b8 95
+.b8 114
+.b8 111
+.b8 111
+.b8 116
+.b8 47
+.b8 121
+.b8 111
+.b8 0
+.b8 1
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 2
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 57
+.b8 100
+.b8 49
+.b8 48
+.b8 100
+.b8 49
+.b8 49
+.b8 100
+.b8 49
+.b8 50
+.b8 100
+.b8 49
+.b8 51
+.b8 100
+.b8 49
+.b8 52
+.b8 100
+.b8 49
+.b8 53
+.b8 100
+.b8 49
+.b8 54
+.b8 100
+.b8 49
+.b8 55
+.b8 100
+.b8 49
+.b8 56
+.b8 100
+.b8 49
+.b8 57
+.b8 100
+.b8 50
+.b8 48
+.b8 100
+.b8 50
+.b8 49
+.b8 100
+.b8 50
+.b8 50
+.b8 100
+.b8 50
+.b8 51
+.b8 100
+.b8 50
+.b8 52
+.b8 100
+.b8 50
+.b8 53
+.b8 100
+.b8 50
+.b8 54
+.b8 100
+.b8 50
+.b8 55
+.b8 100
+.b8 50
+.b8 56
+.b8 100
+.b8 50
+.b8 57
+.b8 100
+.b8 101
+.b8 51
+.b8 48
+.b8 100
+.b8 101
+.b8 0
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 57
+.b8 100
+.b8 49
+.b8 48
+.b8 100
+.b8 49
+.b8 49
+.b8 100
+.b8 49
+.b8 50
+.b8 100
+.b8 49
+.b8 51
+.b8 100
+.b8 49
+.b8 52
+.b8 100
+.b8 49
+.b8 53
+.b8 100
+.b8 49
+.b8 54
+.b8 100
+.b8 49
+.b8 55
+.b8 100
+.b8 49
+.b8 56
+.b8 100
+.b8 49
+.b8 57
+.b8 100
+.b8 50
+.b8 48
+.b8 100
+.b8 50
+.b8 49
+.b8 100
+.b8 50
+.b8 50
+.b8 100
+.b8 50
+.b8 51
+.b8 100
+.b8 50
+.b8 52
+.b8 100
+.b8 50
+.b8 53
+.b8 100
+.b8 50
+.b8 54
+.b8 100
+.b8 50
+.b8 55
+.b8 100
+.b8 50
+.b8 56
+.b8 100
+.b8 50
+.b8 57
+.b8 100
+.b8 101
+.b8 51
+.b8 48
+.b8 100
+.b8 101
+.b8 0
+.b8 1
+.b8 18
+.b8 1
+.b8 1
+.b8 3
+.b64 $L__func_begin0
+.b64 $L__func_end0
+.b8 1
+.b8 156
+.b32 125
+.b8 4
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 80
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp1
+.b64 $L__tmp14
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp2
+.b64 $L__tmp15
+.b8 2
+.b8 80
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp15
+.b64 $L__tmp16
+.b8 3
+.b8 80
+.b8 45
+.b8 5
+.b32 125
+.b64 $L__tmp17
+.b64 $L__tmp32
+.b8 2
+.b8 84
+.b8 59
+.b8 4
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 84
+.b8 59
+.b8 5
+.b32 125
+.b64 $L__tmp18
+.b64 $L__tmp31
+.b8 2
+.b8 243
+.b8 36
+.b8 0
+.b8 5
+.b32 125
+.b64 $L__tmp32
+.b64 $L__tmp33
+.b8 3
+.b8 84
+.b8 45
+.b8 0
+.b8 0
+	}
+	.section	.debug_pubnames
+	{
+.b32 $L__pubNames_end0-$L__pubNames_start0
+$L__pubNames_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 537
+.b32 125
+.b8 116
+.b8 114
+.b8 105
+.b8 116
+.b8 111
+.b8 110
+.b8 95
+.b8 95
+.b8 48
+.b8 100
+.b8 49
+.b8 100
+.b8 50
+.b8 100
+.b8 51
+.b8 100
+.b8 52
+.b8 100
+.b8 53
+.b8 100
+.b8 54
+.b8 100
+.b8 55
+.b8 100
+.b8 56
+.b8 100
+.b8 57
+.b8 100
+.b8 49
+.b8 48
+.b8 100
+.b8 49
+.b8 49
+.b8 100
+.b8 49
+.b8 50
+.b8 100
+.b8 49
+.b8 51
+.b8 100
+.b8 49
+.b8 52
+.b8 100
+.b8 49
+.b8 53
+.b8 100
+.b8 49
+.b8 54
+.b8 100
+.b8 49
+.b8 55
+.b8 100
+.b8 49
+.b8 56
+.b8 100
+.b8 49
+.b8 57
+.b8 100
+.b8 50
+.b8 48
+.b8 100
+.b8 50
+.b8 49
+.b8 100
+.b8 50
+.b8 50
+.b8 100
+.b8 50
+.b8 51
+.b8 100
+.b8 50
+.b8 52
+.b8 100
+.b8 50
+.b8 53
+.b8 100
+.b8 50
+.b8 54
+.b8 100
+.b8 50
+.b8 55
+.b8 100
+.b8 50
+.b8 56
+.b8 100
+.b8 50
+.b8 57
+.b8 100
+.b8 101
+.b8 51
+.b8 48
+.b8 100
+.b8 101
+.b8 0
+.b32 0
+$L__pubNames_end0:
+	}
+	.section	.debug_pubtypes
+	{
+.b32 $L__pubTypes_end0-$L__pubTypes_start0
+$L__pubTypes_start0:
+.b8 2
+.b8 0
+.b32 .debug_info
+.b32 537
+.b32 0
+$L__pubTypes_end0:
+	}
+	.section	.debug_loc	{	}
diff --git a/wandb/run-20240926_055222-14kj2390/files/output.log b/wandb/run-20240926_055222-14kj2390/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..7cac6cc6d1a1f6cf0f9bd9a0929eb1258c80a2e0
--- /dev/null
+++ b/wandb/run-20240926_055222-14kj2390/files/output.log
@@ -0,0 +1,701 @@
+Training learned + default:   5%|▊                | 500/10000 [20:17<5:34:45,  2.11s/it, loss=5.1549, lr=5.98e-04, mfu=9.53%, time_per_iter_ms=2114.39ms]
+
+Step 100:
+Train loss: 6.8038, Val loss: 6.7955
+wikitext-103-v1 - Train loss: 7.9064, Val loss: 7.9072
+ptb - Train loss: 7.7765, Val loss: 7.7932
+lambada - Train loss: 6.6290, Val loss: 6.6271
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 200:
+Train loss: 5.9632, Val loss: 5.9645
+wikitext-103-v1 - Train loss: 7.2751, Val loss: 7.2753
+ptb - Train loss: 7.6082, Val loss: 7.6259
+lambada - Train loss: 5.7352, Val loss: 5.7404
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 300:
+Train loss: 5.5788, Val loss: 5.5806
+wikitext-103-v1 - Train loss: 6.9806, Val loss: 6.9695
+ptb - Train loss: 7.2654, Val loss: 7.2983
+lambada - Train loss: 5.4466, Val loss: 5.4591
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 400:
+Train loss: 5.3057, Val loss: 5.2957
+wikitext-103-v1 - Train loss: 6.7475, Val loss: 6.7423
+ptb - Train loss: 7.0971, Val loss: 7.1375
+lambada - Train loss: 5.2606, Val loss: 5.2760
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 500:
+Train loss: 5.1085, Val loss: 5.1086
+wikitext-103-v1 - Train loss: 6.5747, Val loss: 6.5779
+ptb - Train loss: 6.9253, Val loss: 6.9706
+lambada - Train loss: 5.1147, Val loss: 5.1296
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 600:
+Train loss: 4.9573, Val loss: 4.9547
+wikitext-103-v1 - Train loss: 6.4305, Val loss: 6.4301
+ptb - Train loss: 6.8023, Val loss: 6.8555
+lambada - Train loss: 5.0186, Val loss: 5.0324
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 700:
+Train loss: 4.8335, Val loss: 4.8295
+wikitext-103-v1 - Train loss: 6.2667, Val loss: 6.2699
+ptb - Train loss: 6.6823, Val loss: 6.7431
+lambada - Train loss: 4.9478, Val loss: 4.9548
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 800:
+Train loss: 4.7205, Val loss: 4.7212
+wikitext-103-v1 - Train loss: 6.1274, Val loss: 6.1212
+ptb - Train loss: 6.4782, Val loss: 6.5370
+lambada - Train loss: 4.9008, Val loss: 4.9094
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 900:
+Train loss: 4.6260, Val loss: 4.6246
+wikitext-103-v1 - Train loss: 5.9350, Val loss: 5.9339
+ptb - Train loss: 6.1903, Val loss: 6.2605
+lambada - Train loss: 4.8564, Val loss: 4.8686
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 1000:
+Train loss: 4.5295, Val loss: 4.5315
+wikitext-103-v1 - Train loss: 5.7712, Val loss: 5.7584
+ptb - Train loss: 5.8422, Val loss: 5.9189
+lambada - Train loss: 4.8058, Val loss: 4.8109
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 1100:
+Train loss: 4.4763, Val loss: 4.4670
+wikitext-103-v1 - Train loss: 5.6941, Val loss: 5.6790
+ptb - Train loss: 5.6589, Val loss: 5.7596
+lambada - Train loss: 4.7676, Val loss: 4.7707
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 1200:
+Train loss: 4.4093, Val loss: 4.4050
+wikitext-103-v1 - Train loss: 5.5809, Val loss: 5.5747
+ptb - Train loss: 5.5707, Val loss: 5.6739
+lambada - Train loss: 4.7642, Val loss: 4.7673
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 1300:
+Train loss: 4.3669, Val loss: 4.3716
+wikitext-103-v1 - Train loss: 5.5322, Val loss: 5.5115
+ptb - Train loss: 5.4906, Val loss: 5.5867
+lambada - Train loss: 4.7353, Val loss: 4.7372
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 1400:
+Train loss: 4.3331, Val loss: 4.3318
+wikitext-103-v1 - Train loss: 5.4625, Val loss: 5.4471
+ptb - Train loss: 5.4163, Val loss: 5.5288
+lambada - Train loss: 4.7051, Val loss: 4.7102
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 1500:
+Train loss: 4.3095, Val loss: 4.3069
+wikitext-103-v1 - Train loss: 5.4572, Val loss: 5.4487
+ptb - Train loss: 5.4178, Val loss: 5.5298
+lambada - Train loss: 4.6914, Val loss: 4.6964
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 1600:
+Train loss: 4.2780, Val loss: 4.2778
+wikitext-103-v1 - Train loss: 5.4283, Val loss: 5.4132
+ptb - Train loss: 5.3550, Val loss: 5.4572
+lambada - Train loss: 4.6789, Val loss: 4.6788
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 1700:
+Train loss: 4.2646, Val loss: 4.2523
+wikitext-103-v1 - Train loss: 5.4240, Val loss: 5.4090
+ptb - Train loss: 5.3642, Val loss: 5.4759
+lambada - Train loss: 4.6545, Val loss: 4.6600
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 1800:
+Train loss: 4.2347, Val loss: 4.2302
+wikitext-103-v1 - Train loss: 5.3465, Val loss: 5.3261
+ptb - Train loss: 5.2994, Val loss: 5.4021
+lambada - Train loss: 4.6463, Val loss: 4.6483
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 1900:
+Train loss: 4.2068, Val loss: 4.2177
+wikitext-103-v1 - Train loss: 5.3374, Val loss: 5.3226
+ptb - Train loss: 5.2643, Val loss: 5.3607
+lambada - Train loss: 4.6296, Val loss: 4.6349
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 2000:
+Train loss: 4.1983, Val loss: 4.1948
+wikitext-103-v1 - Train loss: 5.3491, Val loss: 5.3315
+ptb - Train loss: 5.3221, Val loss: 5.4272
+lambada - Train loss: 4.6377, Val loss: 4.6344
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 2100:
+Train loss: 4.1860, Val loss: 4.1882
+wikitext-103-v1 - Train loss: 5.3103, Val loss: 5.2909
+ptb - Train loss: 5.2290, Val loss: 5.3266
+lambada - Train loss: 4.6136, Val loss: 4.6200
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 2200:
+Train loss: 4.1666, Val loss: 4.1677
+wikitext-103-v1 - Train loss: 5.2956, Val loss: 5.2775
+ptb - Train loss: 5.2349, Val loss: 5.3438
+lambada - Train loss: 4.5960, Val loss: 4.6060
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 2300:
+Train loss: 4.1573, Val loss: 4.1576
+wikitext-103-v1 - Train loss: 5.2848, Val loss: 5.2643
+ptb - Train loss: 5.2105, Val loss: 5.3189
+lambada - Train loss: 4.6056, Val loss: 4.6054
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 2400:
+Train loss: 4.1411, Val loss: 4.1447
+wikitext-103-v1 - Train loss: 5.2642, Val loss: 5.2456
+ptb - Train loss: 5.2038, Val loss: 5.3090
+lambada - Train loss: 4.5966, Val loss: 4.6040
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 2500:
+Train loss: 4.1320, Val loss: 4.1369
+wikitext-103-v1 - Train loss: 5.2598, Val loss: 5.2343
+ptb - Train loss: 5.2028, Val loss: 5.3161
+lambada - Train loss: 4.5989, Val loss: 4.6001
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 2600:
+Train loss: 4.1241, Val loss: 4.1256
+wikitext-103-v1 - Train loss: 5.2357, Val loss: 5.2125
+ptb - Train loss: 5.1644, Val loss: 5.2680
+lambada - Train loss: 4.5846, Val loss: 4.5838
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 2700:
+Train loss: 4.1183, Val loss: 4.1143
+wikitext-103-v1 - Train loss: 5.2448, Val loss: 5.2277
+ptb - Train loss: 5.1603, Val loss: 5.2628
+lambada - Train loss: 4.5798, Val loss: 4.5827
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 2800:
+Train loss: 4.1055, Val loss: 4.1096
+wikitext-103-v1 - Train loss: 5.2229, Val loss: 5.2017
+ptb - Train loss: 5.1477, Val loss: 5.2506
+lambada - Train loss: 4.5665, Val loss: 4.5677
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 2900:
+Train loss: 4.0935, Val loss: 4.0882
+wikitext-103-v1 - Train loss: 5.2075, Val loss: 5.1859
+ptb - Train loss: 5.1501, Val loss: 5.2564
+lambada - Train loss: 4.5792, Val loss: 4.5752
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 3000:
+Train loss: 4.0905, Val loss: 4.0838
+wikitext-103-v1 - Train loss: 5.2077, Val loss: 5.1879
+ptb - Train loss: 5.1382, Val loss: 5.2432
+lambada - Train loss: 4.5655, Val loss: 4.5647
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 3100:
+Train loss: 4.0829, Val loss: 4.0785
+wikitext-103-v1 - Train loss: 5.1924, Val loss: 5.1646
+ptb - Train loss: 5.1343, Val loss: 5.2375
+lambada - Train loss: 4.5594, Val loss: 4.5584
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 3200:
+Train loss: 4.0735, Val loss: 4.0756
+wikitext-103-v1 - Train loss: 5.1842, Val loss: 5.1655
+ptb - Train loss: 5.1134, Val loss: 5.2139
+lambada - Train loss: 4.5548, Val loss: 4.5578
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 3300:
+Train loss: 4.0685, Val loss: 4.0711
+wikitext-103-v1 - Train loss: 5.1913, Val loss: 5.1735
+ptb - Train loss: 5.1080, Val loss: 5.2114
+lambada - Train loss: 4.5626, Val loss: 4.5604
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 3400:
+Train loss: 4.0556, Val loss: 4.0570
+wikitext-103-v1 - Train loss: 5.1777, Val loss: 5.1480
+ptb - Train loss: 5.0920, Val loss: 5.1912
+lambada - Train loss: 4.5487, Val loss: 4.5460
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 3500:
+Train loss: 4.0569, Val loss: 4.0554
+wikitext-103-v1 - Train loss: 5.1588, Val loss: 5.1413
+ptb - Train loss: 5.0720, Val loss: 5.1675
+lambada - Train loss: 4.5377, Val loss: 4.5366
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 3600:
+Train loss: 4.0502, Val loss: 4.0463
+wikitext-103-v1 - Train loss: 5.1695, Val loss: 5.1527
+ptb - Train loss: 5.0727, Val loss: 5.1805
+lambada - Train loss: 4.5387, Val loss: 4.5359
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 3700:
+Train loss: 4.0493, Val loss: 4.0440
+wikitext-103-v1 - Train loss: 5.1483, Val loss: 5.1283
+ptb - Train loss: 5.0744, Val loss: 5.1717
+lambada - Train loss: 4.5342, Val loss: 4.5324
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 3800:
+Train loss: 4.0387, Val loss: 4.0354
+wikitext-103-v1 - Train loss: 5.1486, Val loss: 5.1196
+ptb - Train loss: 5.0400, Val loss: 5.1513
+lambada - Train loss: 4.5402, Val loss: 4.5332
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 3900:
+Train loss: 4.0341, Val loss: 4.0290
+wikitext-103-v1 - Train loss: 5.1404, Val loss: 5.1191
+ptb - Train loss: 5.0406, Val loss: 5.1401
+lambada - Train loss: 4.5226, Val loss: 4.5213
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 4000:
+Train loss: 4.0297, Val loss: 4.0303
+wikitext-103-v1 - Train loss: 5.1229, Val loss: 5.1015
+ptb - Train loss: 5.0423, Val loss: 5.1482
+lambada - Train loss: 4.5457, Val loss: 4.5434
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 4100:
+Train loss: 4.0285, Val loss: 4.0279
+wikitext-103-v1 - Train loss: 5.1410, Val loss: 5.1173
+ptb - Train loss: 5.0534, Val loss: 5.1546
+lambada - Train loss: 4.5249, Val loss: 4.5211
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 4200:
+Train loss: 4.0124, Val loss: 4.0195
+wikitext-103-v1 - Train loss: 5.1362, Val loss: 5.1201
+ptb - Train loss: 5.0534, Val loss: 5.1583
+lambada - Train loss: 4.5213, Val loss: 4.5162
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 4300:
+Train loss: 4.0193, Val loss: 4.0081
+wikitext-103-v1 - Train loss: 5.1212, Val loss: 5.1108
+ptb - Train loss: 5.0391, Val loss: 5.1401
+lambada - Train loss: 4.5240, Val loss: 4.5265
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 4400:
+Train loss: 4.0106, Val loss: 4.0069
+wikitext-103-v1 - Train loss: 5.1272, Val loss: 5.1073
+ptb - Train loss: 5.0385, Val loss: 5.1346
+lambada - Train loss: 4.5174, Val loss: 4.5123
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 4500:
+Train loss: 4.0093, Val loss: 4.0066
+wikitext-103-v1 - Train loss: 5.1077, Val loss: 5.0875
+ptb - Train loss: 5.0100, Val loss: 5.1084
+lambada - Train loss: 4.5189, Val loss: 4.5178
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 4600:
+Train loss: 4.0014, Val loss: 4.0017
+wikitext-103-v1 - Train loss: 5.1019, Val loss: 5.0889
+ptb - Train loss: 5.0120, Val loss: 5.1113
+lambada - Train loss: 4.5059, Val loss: 4.5104
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 4700:
+Train loss: 3.9949, Val loss: 3.9994
+wikitext-103-v1 - Train loss: 5.0935, Val loss: 5.0713
+ptb - Train loss: 4.9881, Val loss: 5.0865
+lambada - Train loss: 4.5065, Val loss: 4.5037
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 4800:
+Train loss: 3.9991, Val loss: 3.9934
+wikitext-103-v1 - Train loss: 5.0869, Val loss: 5.0718
+ptb - Train loss: 4.9856, Val loss: 5.0937
+lambada - Train loss: 4.5097, Val loss: 4.5079
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 4900:
+Train loss: 3.9904, Val loss: 3.9903
+wikitext-103-v1 - Train loss: 5.0945, Val loss: 5.0764
+ptb - Train loss: 5.0047, Val loss: 5.1038
+lambada - Train loss: 4.5071, Val loss: 4.5000
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 5000:
+Train loss: 3.9841, Val loss: 3.9873
+wikitext-103-v1 - Train loss: 5.0710, Val loss: 5.0561
+ptb - Train loss: 4.9981, Val loss: 5.0994
+lambada - Train loss: 4.5154, Val loss: 4.5174
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 5100:
+Train loss: 3.9910, Val loss: 3.9813
+wikitext-103-v1 - Train loss: 5.0788, Val loss: 5.0557
+ptb - Train loss: 4.9792, Val loss: 5.0777
+lambada - Train loss: 4.4983, Val loss: 4.4992
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 5200:
+Train loss: 3.9810, Val loss: 3.9722
+wikitext-103-v1 - Train loss: 5.0702, Val loss: 5.0631
+ptb - Train loss: 4.9835, Val loss: 5.0830
+lambada - Train loss: 4.4988, Val loss: 4.4999
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 5300:
+Train loss: 3.9759, Val loss: 3.9737
+wikitext-103-v1 - Train loss: 5.0629, Val loss: 5.0486
+ptb - Train loss: 4.9766, Val loss: 5.0699
+lambada - Train loss: 4.4918, Val loss: 4.4898
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 5400:
+Train loss: 3.9718, Val loss: 3.9762
+wikitext-103-v1 - Train loss: 5.0648, Val loss: 5.0488
+ptb - Train loss: 4.9808, Val loss: 5.0805
+lambada - Train loss: 4.4962, Val loss: 4.4957
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 5500:
+Train loss: 3.9740, Val loss: 3.9764
+wikitext-103-v1 - Train loss: 5.0641, Val loss: 5.0500
+ptb - Train loss: 4.9647, Val loss: 5.0674
+lambada - Train loss: 4.4849, Val loss: 4.4855
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 5600:
+Train loss: 3.9577, Val loss: 3.9633
+wikitext-103-v1 - Train loss: 5.0513, Val loss: 5.0275
+ptb - Train loss: 4.9602, Val loss: 5.0572
+lambada - Train loss: 4.4822, Val loss: 4.4827
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 5700:
+Train loss: 3.9657, Val loss: 3.9676
+wikitext-103-v1 - Train loss: 5.0498, Val loss: 5.0252
+ptb - Train loss: 4.9549, Val loss: 5.0510
+lambada - Train loss: 4.4874, Val loss: 4.4878
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 5800:
+Train loss: 3.9636, Val loss: 3.9648
+wikitext-103-v1 - Train loss: 5.0513, Val loss: 5.0333
+ptb - Train loss: 4.9587, Val loss: 5.0591
+lambada - Train loss: 4.4922, Val loss: 4.4849
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 5900:
+Train loss: 3.9576, Val loss: 3.9588
+wikitext-103-v1 - Train loss: 5.0306, Val loss: 5.0349
+ptb - Train loss: 4.9546, Val loss: 5.0537
+lambada - Train loss: 4.4707, Val loss: 4.4656
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 6000:
+Train loss: 3.9571, Val loss: 3.9567
+wikitext-103-v1 - Train loss: 5.0335, Val loss: 5.0150
+ptb - Train loss: 4.9552, Val loss: 5.0535
+lambada - Train loss: 4.4833, Val loss: 4.4809
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 6100:
+Train loss: 3.9572, Val loss: 3.9515
+wikitext-103-v1 - Train loss: 5.0369, Val loss: 5.0118
+ptb - Train loss: 4.9523, Val loss: 5.0533
+lambada - Train loss: 4.4831, Val loss: 4.4803
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 6200:
+Train loss: 3.9477, Val loss: 3.9533
+wikitext-103-v1 - Train loss: 5.0323, Val loss: 5.0173
+ptb - Train loss: 4.9542, Val loss: 5.0520
+lambada - Train loss: 4.4807, Val loss: 4.4803
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 6300:
+Train loss: 3.9529, Val loss: 3.9482
+wikitext-103-v1 - Train loss: 5.0296, Val loss: 5.0053
+ptb - Train loss: 4.9409, Val loss: 5.0413
+lambada - Train loss: 4.4767, Val loss: 4.4752
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 6400:
+Train loss: 3.9437, Val loss: 3.9420
+wikitext-103-v1 - Train loss: 5.0380, Val loss: 5.0141
+ptb - Train loss: 4.9432, Val loss: 5.0490
+lambada - Train loss: 4.4715, Val loss: 4.4718
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 6500:
+Train loss: 3.9498, Val loss: 3.9390
+wikitext-103-v1 - Train loss: 5.0437, Val loss: 5.0206
+ptb - Train loss: 4.9500, Val loss: 5.0495
+lambada - Train loss: 4.4766, Val loss: 4.4775
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 6600:
+Train loss: 3.9457, Val loss: 3.9436
+wikitext-103-v1 - Train loss: 5.0214, Val loss: 5.0075
+ptb - Train loss: 4.9380, Val loss: 5.0405
+lambada - Train loss: 4.4751, Val loss: 4.4734
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 6700:
+Train loss: 3.9463, Val loss: 3.9377
+wikitext-103-v1 - Train loss: 5.0262, Val loss: 5.0050
+ptb - Train loss: 4.9359, Val loss: 5.0361
+lambada - Train loss: 4.4685, Val loss: 4.4638
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 6800:
+Train loss: 3.9393, Val loss: 3.9393
+wikitext-103-v1 - Train loss: 5.0274, Val loss: 5.0019
+ptb - Train loss: 4.9270, Val loss: 5.0256
+lambada - Train loss: 4.4650, Val loss: 4.4645
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 6900:
+Train loss: 3.9324, Val loss: 3.9347
+wikitext-103-v1 - Train loss: 5.0093, Val loss: 4.9990
+ptb - Train loss: 4.9319, Val loss: 5.0297
+lambada - Train loss: 4.4679, Val loss: 4.4681
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 7000:
+Train loss: 3.9386, Val loss: 3.9279
+wikitext-103-v1 - Train loss: 5.0184, Val loss: 5.0018
+ptb - Train loss: 4.9223, Val loss: 5.0256
+lambada - Train loss: 4.4654, Val loss: 4.4593
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 7100:
+Train loss: 3.9333, Val loss: 3.9322
+wikitext-103-v1 - Train loss: 5.0186, Val loss: 4.9935
+ptb - Train loss: 4.9186, Val loss: 5.0169
+lambada - Train loss: 4.4588, Val loss: 4.4566
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 7200:
+Train loss: 3.9342, Val loss: 3.9300
+wikitext-103-v1 - Train loss: 5.0002, Val loss: 4.9939
+ptb - Train loss: 4.9140, Val loss: 5.0205
+lambada - Train loss: 4.4645, Val loss: 4.4640
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 7300:
+Train loss: 3.9329, Val loss: 3.9321
+wikitext-103-v1 - Train loss: 5.0134, Val loss: 4.9943
+ptb - Train loss: 4.9130, Val loss: 5.0149
+lambada - Train loss: 4.4556, Val loss: 4.4552
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 7400:
+Train loss: 3.9271, Val loss: 3.9309
+wikitext-103-v1 - Train loss: 5.0068, Val loss: 4.9918
+ptb - Train loss: 4.9191, Val loss: 5.0184
+lambada - Train loss: 4.4619, Val loss: 4.4547
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 7500:
+Train loss: 3.9322, Val loss: 3.9292
+wikitext-103-v1 - Train loss: 5.0121, Val loss: 4.9903
+ptb - Train loss: 4.9147, Val loss: 5.0180
+lambada - Train loss: 4.4603, Val loss: 4.4608
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 7600:
+Train loss: 3.9216, Val loss: 3.9253
+wikitext-103-v1 - Train loss: 5.0079, Val loss: 4.9907
+ptb - Train loss: 4.9166, Val loss: 5.0171
+lambada - Train loss: 4.4605, Val loss: 4.4594
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 7700:
+Train loss: 3.9283, Val loss: 3.9211
+wikitext-103-v1 - Train loss: 5.0071, Val loss: 4.9817
+ptb - Train loss: 4.9177, Val loss: 5.0144
+lambada - Train loss: 4.4571, Val loss: 4.4570
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 7800:
+Train loss: 3.9205, Val loss: 3.9184
+wikitext-103-v1 - Train loss: 5.0049, Val loss: 4.9808
+ptb - Train loss: 4.9114, Val loss: 5.0158
+lambada - Train loss: 4.4650, Val loss: 4.4661
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 7900:
+Train loss: 3.9243, Val loss: 3.9222
+wikitext-103-v1 - Train loss: 5.0040, Val loss: 4.9802
+ptb - Train loss: 4.9160, Val loss: 5.0118
+lambada - Train loss: 4.4549, Val loss: 4.4529
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 8000:
+Train loss: 3.9250, Val loss: 3.9242
+wikitext-103-v1 - Train loss: 4.9941, Val loss: 4.9855
+ptb - Train loss: 4.9191, Val loss: 5.0123
+lambada - Train loss: 4.4539, Val loss: 4.4552
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 8100:
+Train loss: 3.9191, Val loss: 3.9206
+wikitext-103-v1 - Train loss: 5.0038, Val loss: 4.9899
+ptb - Train loss: 4.9185, Val loss: 5.0285
+lambada - Train loss: 4.4547, Val loss: 4.4544
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 8200:
+Train loss: 3.9230, Val loss: 3.9179
+wikitext-103-v1 - Train loss: 4.9961, Val loss: 4.9745
+ptb - Train loss: 4.9024, Val loss: 5.0016
+lambada - Train loss: 4.4549, Val loss: 4.4528
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 8300:
+Train loss: 3.9202, Val loss: 3.9221
+wikitext-103-v1 - Train loss: 4.9916, Val loss: 4.9718
+ptb - Train loss: 4.8936, Val loss: 4.9952
+lambada - Train loss: 4.4566, Val loss: 4.4566
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 8400:
+Train loss: 3.9179, Val loss: 3.9154
+wikitext-103-v1 - Train loss: 4.9929, Val loss: 4.9820
+ptb - Train loss: 4.9103, Val loss: 5.0113
+lambada - Train loss: 4.4497, Val loss: 4.4471
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 8500:
+Train loss: 3.9177, Val loss: 3.9206
+wikitext-103-v1 - Train loss: 4.9873, Val loss: 4.9718
+ptb - Train loss: 4.8961, Val loss: 5.0103
+lambada - Train loss: 4.4538, Val loss: 4.4548
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 8600:
+Train loss: 3.9126, Val loss: 3.9091
+wikitext-103-v1 - Train loss: 4.9969, Val loss: 4.9774
+ptb - Train loss: 4.9036, Val loss: 5.0020
+lambada - Train loss: 4.4524, Val loss: 4.4483
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 8700:
+Train loss: 3.9183, Val loss: 3.9143
+wikitext-103-v1 - Train loss: 4.9846, Val loss: 4.9672
+ptb - Train loss: 4.9017, Val loss: 5.0107
+lambada - Train loss: 4.4507, Val loss: 4.4508
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 8800:
+Train loss: 3.9141, Val loss: 3.9113
+wikitext-103-v1 - Train loss: 4.9931, Val loss: 4.9725
+ptb - Train loss: 4.8923, Val loss: 4.9876
+lambada - Train loss: 4.4501, Val loss: 4.4474
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 8900:
+Train loss: 3.9139, Val loss: 3.9112
+wikitext-103-v1 - Train loss: 4.9853, Val loss: 4.9735
+ptb - Train loss: 4.8939, Val loss: 4.9961
+lambada - Train loss: 4.4520, Val loss: 4.4532
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 9000:
+Train loss: 3.9111, Val loss: 3.9007
+wikitext-103-v1 - Train loss: 4.9977, Val loss: 4.9627
+ptb - Train loss: 4.8913, Val loss: 4.9936
+lambada - Train loss: 4.4460, Val loss: 4.4430
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 9100:
+Train loss: 3.9107, Val loss: 3.9115
+wikitext-103-v1 - Train loss: 4.9896, Val loss: 4.9761
+ptb - Train loss: 4.8987, Val loss: 5.0039
+lambada - Train loss: 4.4509, Val loss: 4.4508
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 9200:
+Train loss: 3.9089, Val loss: 3.9111
+wikitext-103-v1 - Train loss: 4.9739, Val loss: 4.9647
+ptb - Train loss: 4.8953, Val loss: 4.9993
+lambada - Train loss: 4.4479, Val loss: 4.4462
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 9300:
+Train loss: 3.9058, Val loss: 3.9133
+wikitext-103-v1 - Train loss: 4.9842, Val loss: 4.9689
+ptb - Train loss: 4.8892, Val loss: 4.9937
+lambada - Train loss: 4.4449, Val loss: 4.4449
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 9400:
+Train loss: 3.9127, Val loss: 3.9036
+wikitext-103-v1 - Train loss: 4.9826, Val loss: 4.9736
+ptb - Train loss: 4.8906, Val loss: 4.9918
+lambada - Train loss: 4.4485, Val loss: 4.4455
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 9500:
+Train loss: 3.9034, Val loss: 3.9067
+wikitext-103-v1 - Train loss: 4.9902, Val loss: 4.9696
+ptb - Train loss: 4.9035, Val loss: 5.0067
+lambada - Train loss: 4.4472, Val loss: 4.4489
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 9600:
+Train loss: 3.9037, Val loss: 3.8991
+wikitext-103-v1 - Train loss: 4.9837, Val loss: 4.9578
+ptb - Train loss: 4.8968, Val loss: 5.0015
+lambada - Train loss: 4.4497, Val loss: 4.4449
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 9700:
+Train loss: 3.9029, Val loss: 3.9092
+wikitext-103-v1 - Train loss: 4.9835, Val loss: 4.9616
+ptb - Train loss: 4.8896, Val loss: 4.9964
+lambada - Train loss: 4.4453, Val loss: 4.4417
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 9800:
+Train loss: 3.9027, Val loss: 3.9042
+wikitext-103-v1 - Train loss: 4.9806, Val loss: 4.9510
+ptb - Train loss: 4.8925, Val loss: 4.9939
+lambada - Train loss: 4.4443, Val loss: 4.4423
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 9900:
+Train loss: 3.9021, Val loss: 3.9027
+wikitext-103-v1 - Train loss: 4.9817, Val loss: 4.9592
+ptb - Train loss: 4.8927, Val loss: 4.9979
+lambada - Train loss: 4.4439, Val loss: 4.4361
+Saving checkpoint to out/ckpt_learned_default.pt
+
+Step 10000:
+Train loss: 3.9086, Val loss: 3.9058
+wikitext-103-v1 - Train loss: 4.9778, Val loss: 4.9618
+ptb - Train loss: 4.8921, Val loss: 4.9924
+lambada - Train loss: 4.4409, Val loss: 4.4380
+Saving checkpoint to out/ckpt_learned_default.pt
diff --git a/wandb/run-20240926_192831-378lr5yg/files/config.yaml b/wandb/run-20240926_192831-378lr5yg/files/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e23fc017ee27b71f35139c71a97d8b5ddb712ab1
--- /dev/null
+++ b/wandb/run-20240926_192831-378lr5yg/files/config.yaml
@@ -0,0 +1,113 @@
+_wandb:
+    value:
+        cli_version: 0.18.1
+        m: []
+        python_version: 3.10.12
+        t:
+            "1":
+                - 1
+                - 55
+            "2":
+                - 1
+                - 55
+            "3":
+                - 2
+                - 13
+                - 16
+                - 23
+                - 55
+                - 61
+            "4": 3.10.12
+            "5": 0.18.1
+            "8":
+                - 5
+            "12": 0.18.1
+            "13": linux-x86_64
+always_save_checkpoint:
+    value: true
+attention_types:
+    value:
+        - default
+backend:
+    value: nccl
+batch_size:
+    value: 120
+beta1:
+    value: 0.9
+beta2:
+    value: 0.95
+bias:
+    value: false
+block_size:
+    value: 512
+checkpoint_path:
+    value: ""
+collect_activations:
+    value: false
+collect_attention_patterns:
+    value: false
+compile:
+    value: true
+dataset:
+    value: fineweb
+decay_lr:
+    value: true
+device:
+    value: cuda
+dropout:
+    value: 0
+dtype:
+    value: bfloat16
+embedding_types:
+    value:
+        - polynomial_legendre
+        - polynomial_chebyshev
+        - random_fourier
+        - wavelet
+eval_datasets:
+    value:
+        - wikitext-103-v1
+        - ptb
+        - lambada
+eval_interval:
+    value: 100
+eval_iters:
+    value: 100
+eval_only:
+    value: false
+grad_clip:
+    value: 1
+gradient_accumulation_steps:
+    value: 40
+init_from:
+    value: scratch
+learning_rate:
+    value: 0.0006
+log_interval:
+    value: 1
+lr_decay_iters:
+    value: 10000
+max_iters:
+    value: 10000
+min_lr:
+    value: 6e-05
+n_embd:
+    value: 256
+n_head:
+    value: 4
+n_layer:
+    value: 4
+out_dir:
+    value: out
+seed:
+    value: 1337
+wandb_log:
+    value: true
+wandb_project:
+    value: gpt2_positional_encodings_100B
+wandb_run_name:
+    value: experiment
+warmup_iters:
+    value: 100
+weight_decay:
+    value: 0.1