diff --git a/.local/share/Trash/info/train_002.bin.trashinfo b/.local/share/Trash/info/train_002.bin.trashinfo new file mode 100644 index 0000000000000000000000000000000000000000..064ac10de9f571b7925072459ae27261a4f1569c --- /dev/null +++ b/.local/share/Trash/info/train_002.bin.trashinfo @@ -0,0 +1,3 @@ +[Trash Info] +Path=/root/data/fineweb/train_002.bin +DeletionDate=2024-09-26T05:50:34 diff --git a/.local/share/Trash/info/train_004.bin.trashinfo b/.local/share/Trash/info/train_004.bin.trashinfo new file mode 100644 index 0000000000000000000000000000000000000000..943c39fcd1c67768072e1aee2513ed61ad97c4c5 --- /dev/null +++ b/.local/share/Trash/info/train_004.bin.trashinfo @@ -0,0 +1,3 @@ +[Trash Info] +Path=/root/data/fineweb/train_004.bin +DeletionDate=2024-09-26T05:50:34 diff --git a/.local/share/jupyter/nbextensions/python-markdown/python-markdown-post.png b/.local/share/jupyter/nbextensions/python-markdown/python-markdown-post.png new file mode 100644 index 0000000000000000000000000000000000000000..3c95198564cf58b036ce168c10fda3d7f5aba1ae Binary files /dev/null and b/.local/share/jupyter/nbextensions/python-markdown/python-markdown-post.png differ diff --git a/.local/share/jupyter/nbextensions/ruler/icon.png b/.local/share/jupyter/nbextensions/ruler/icon.png new file mode 100644 index 0000000000000000000000000000000000000000..c6a8c58449d878562301c43864b75ebec90e9a3c Binary files /dev/null and b/.local/share/jupyter/nbextensions/ruler/icon.png differ diff --git a/.local/share/jupyter/nbextensions/ruler/main.js b/.local/share/jupyter/nbextensions/ruler/main.js new file mode 100644 index 0000000000000000000000000000000000000000..c2d6e68649a709adb5b4e21562e356618363df49 --- /dev/null +++ b/.local/share/jupyter/nbextensions/ruler/main.js @@ -0,0 +1,124 @@ +// Add rulers to codecells +define([ + 'base/js/namespace', + 'base/js/events', + 'services/config', + 'notebook/js/codecell', + 'codemirror/lib/codemirror', + 'codemirror/addon/display/rulers' +], function (Jupyter, events, configmod, codecell, codemirror) { + "use strict"; + + var log_prefix = '[ruler]'; + + // define default config parameter values + var params = { + ruler_column: [78], + ruler_color: ["#ff0000"], + ruler_linestyle: ["dashed"], + ruler_do_css_patch: false + }; + + + var rulers = []; + + var isNumber = function (n) { + return !isNaN(parseFloat(n)) && isFinite(n); + }; + + // updates default params with any specified in the provided config data + var update_params = function (config_data) { + for (var key in params) { + if (config_data.hasOwnProperty(key)) { + params[key] = config_data[key]; + } + } + }; + + var on_config_loaded = function () { + + if (Jupyter.notebook !== undefined) { + var i, config = Jupyter.notebook.config; + } else { + var i, config = Jupyter.editor.config; + } + + if (config.data.hasOwnProperty('ruler_color') && config.data.ruler_color.length > 0) { + params.ruler_color = config.data.ruler_color; + } + + if (config.data.hasOwnProperty('ruler_column')) { + var new_columns = []; + for (i in config.data.ruler_column) { + if (isNumber(config.data.ruler_column[i])) { + new_columns.push(config.data.ruler_column[i]); + } + } + if (new_columns.length > 0) { + params.ruler_column = new_columns; + } + } + + if (config.data.hasOwnProperty('ruler_linestyle') && config.data.ruler_linestyle.length > 0) { + params.ruler_linestyle = config.data.ruler_linestyle; + } + + for (i in params.ruler_column) { + rulers.push({ + color: params.ruler_color[i % params.ruler_color.length], + column: params.ruler_column[i], + lineStyle: params.ruler_linestyle[i % params.ruler_linestyle.length] + }); + } + console.debug(log_prefix, 'ruler specs:', rulers); + + if (Jupyter.notebook !== undefined) { + var i, config = Jupyter.notebook.config; + + // Change default for new cells + codecell.CodeCell.options_default.cm_config.rulers = rulers; + // Apply to any already-existing cells + var cells = Jupyter.notebook.get_cells().forEach(function (cell) { + if (cell instanceof codecell.CodeCell) { + cell.code_mirror.setOption('rulers', rulers); + } + }); + + } + else { + Jupyter.editor.codemirror.setOption('rulers', rulers); + } + }; + + var load_extension = function () { + + // first, check which view we're in, in order to decide whether to load + var conf_sect; + if (Jupyter.notebook) { + // we're in notebook view + conf_sect = Jupyter.notebook.config; + } + else if (Jupyter.editor) { + // we're in file-editor view + conf_sect = Jupyter.editor.config; + } + else { + // we're some other view like dashboard, terminal, etc, so bail now + return; + } + + conf_sect.loaded + .then(function () { + update_params(conf_sect.data); + }) + .then(on_config_loaded) + .catch(function on_error(reason) { + console.warn(log_prefix, 'error:', reason); + }); + }; + + var extension = { + load_ipython_extension: load_extension + }; + return extension; +}); diff --git a/.local/share/jupyter/nbextensions/runtools/readme.md b/.local/share/jupyter/nbextensions/runtools/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..424b87736efa1988e9b97bd2a8bc6ad1369a41fd --- /dev/null +++ b/.local/share/jupyter/nbextensions/runtools/readme.md @@ -0,0 +1,119 @@ +Runtools +======== +Runtools provide a number of additional functions for working with code cells in the IPython notebook: + +Code Cell Execution +------------------- + +* Execute a single cell +* Execute from top cell to currently selected cell +* Execute from currently selected cell to bottom cell +* Execute all cells +* Execute all cells, ignore exceptions (requires [ipython/pull/6521](https://github.com/ipython/ipython/pull/6521)) +* Execute marked code cells (cells with green gutter area are marked) +* Stop execution (duplicate to standard toolbar button) + +When executing marked cells, they are put in a execution list, and +executed in order. The execution list can be modified by unmarking +a cell not yet run. The execution list can be stopped by clicking on +`stop execution`. Execution of the currently running cell can be stopped +by pressing `stop execution` twice. + +Code Cell Marking +----------------- + +* Mark one or more code cell + + +Code Cell Display +----------------- + +* Hide or show input (i.e. the source code) of marked code cells +* Hide or show output of marked code cells + + +Description +----------- + +The *runtools* extension adds a button to turn on/off a floating toolbar: +![](icon.png) + +This adds Code execution buttons: +![](runtools_execute.png) + +Codecells can be marked by clicking on the gutter of a codecell or by clicking on the markers toolbar: +![](runtools_marker.png) + +Marked codecells can be locked to read-only mode and moved upd and down: +![](runtools_lock.png) + +The input and output areas of marked codecells can be hidden: +![](runtools_show_hide.png) + +A IPython notebook with marked cells looks like this: +![](runtools_nb.png) + + +Demo +---- + +![](demo.gif) + + +Internals +--------- + +New metadata elements added to each cell: +* `cell.metadata.hide_input` - hide input field of the cell +* `cell.metadata.hide_output` - hide output field of the cell +* `cell.metadata.run_control.marked` - mark a codecell + +To export a notebook with hidden input/output fields, the custom template `hide_input_output.tpl` is required. +It should have been installed in the `templates` folder. +You can find the `templates` folder of `jupyter_contrib_nbextensions` from python using + +```python +from jupyter_contrib_nbextensions.nbconvert_support import templates_directory +print(templates_directory()) +``` + +The template needs to be in a path where nbconvert can find it. This can be your local path or specified in +`jupyter_nbconvert_config` or `jupyter_notebook_config` as `c.Exporter.extra_template_paths`, see [Jupyter docs](https://jupyter-notebook.readthedocs.io/en/latest/config.html). + +For HTML export a template is provided as `nbextensions.tpl` in the `jupyter_contrib_nbextensions` templates directory. Alternatively you can create your own template: +``` +{%- extends 'full.tpl' -%} + +{% block input_group -%} +{%- if cell.metadata.hide_input -%} +{%- else -%} +{{ super() }} +{%- endif -%} +{% endblock input_group %} + +{% block output_group -%} +{%- if cell.metadata.hide_output -%} +{%- else -%} +{{ super() }} +{%- endif -%} +{% endblock output_group %} +``` + +For LaTeX export a different template is required, which is included as `nbextensions.tplx` in the `jupyter_contrib_nbextensions` templates directory. Alternatively you can create your own template: +``` +((- extends 'report.tplx' -)) + +((* block input_group -)) +((- if cell.metadata.hide_input -)) +((- else -)) +((( super() ))) +((- endif -)) +(( endblock input_group *)) + +((* block output_group -)) +((- if cell.metadata.hide_output -)) +((- else -)) +((( super() ))) +((- endif -)) +(( endblock output_group *)) +``` \ No newline at end of file diff --git a/.local/share/jupyter/nbextensions/runtools/runtools_show_hide.png b/.local/share/jupyter/nbextensions/runtools/runtools_show_hide.png new file mode 100644 index 0000000000000000000000000000000000000000..8ff9477aa0c17d40a79b896164ac7be718953c50 Binary files /dev/null and b/.local/share/jupyter/nbextensions/runtools/runtools_show_hide.png differ diff --git a/.local/share/jupyter/nbextensions/scratchpad/scratchpad.yaml b/.local/share/jupyter/nbextensions/scratchpad/scratchpad.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ccd35cfeff9ee6804496efe8f219945c328640f --- /dev/null +++ b/.local/share/jupyter/nbextensions/scratchpad/scratchpad.yaml @@ -0,0 +1,6 @@ +Type: Jupyter Notebook Extension +Name: Scratchpad +Description: Adds a scratchpad cell to Jupyter notebook. +Link: README.md +Main: main.js +Compatibility: 4.x, 5.x diff --git a/.local/share/jupyter/nbextensions/select_keymap/README.md b/.local/share/jupyter/nbextensions/select_keymap/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0e58992c00abb7285a19cdd65e3d3ddd75add7a1 --- /dev/null +++ b/.local/share/jupyter/nbextensions/select_keymap/README.md @@ -0,0 +1,14 @@ +Select CodeMirror Keymap +======= + +This extension lets you choose between the available CodeMirror keymaps: default, emacs, vim, and sublime. + +There's a chance that this might cause key conflicts, especially with other extensions. + +Most browsers consume some of the global keybindings like `Ctrl+n`. [The Menu Wizard add-on for Firefox](https://addons.mozilla.org/en-US/firefox/addon/s3menu-wizard/) allows you to disable some of the global key shortcuts, thus passing the keys through to CodeMirror. + +![Demo](select_keymap.png) + +Based on: + * [jupyter-emacskeys](https://github.com/rmcgibbo/jupyter-emacskeys) + * [notebook_input_mode](https://github.com/asford/notebook_input_mode) diff --git a/.local/share/jupyter/nbextensions/skill/main.js b/.local/share/jupyter/nbextensions/skill/main.js new file mode 100644 index 0000000000000000000000000000000000000000..93ff45950cdf69aa0add4ea582db2ca0adc7c281 --- /dev/null +++ b/.local/share/jupyter/nbextensions/skill/main.js @@ -0,0 +1,14 @@ +define(function() { + "use strict"; + // jupyter nbextensions must export a load_ipython_extension function to + // avoid throwing an error. Also, loading the module should do nothing + // unless the function is called, so we wrap requiring the codemirror mode + // in the load call. + return { + load_ipython_extension: function () { + requirejs(['./skill'], function () { + console.log('[SKILL Syntax] loaded'); + }); + } + }; +}); \ No newline at end of file diff --git a/.local/share/jupyter/nbextensions/skill/skill.yaml b/.local/share/jupyter/nbextensions/skill/skill.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e7054d498a3055f8990a30768fd30de90f9b122 --- /dev/null +++ b/.local/share/jupyter/nbextensions/skill/skill.yaml @@ -0,0 +1,6 @@ +Type: IPython Notebook Extension +Name: SKILL Syntax +Description: Enable SKILL syntax support for CodeMirror +Link: README.md +Main: main.js +Compatibility: 4.x, 5.x diff --git a/.local/share/jupyter/nbextensions/skip-traceback/traceback.png b/.local/share/jupyter/nbextensions/skip-traceback/traceback.png new file mode 100644 index 0000000000000000000000000000000000000000..5acf410b6742d5ed146959a4752a3cfa1a4af5be Binary files /dev/null and b/.local/share/jupyter/nbextensions/skip-traceback/traceback.png differ diff --git a/.local/share/jupyter/nbextensions/splitcell/splitcell.js b/.local/share/jupyter/nbextensions/splitcell/splitcell.js new file mode 100644 index 0000000000000000000000000000000000000000..0c6522c2ac824ca14d8c6e518e3cbe58724de660 --- /dev/null +++ b/.local/share/jupyter/nbextensions/splitcell/splitcell.js @@ -0,0 +1,101 @@ +// Allow for split cells in jupyter notebooks + +define([ + 'base/js/namespace', + 'base/js/events' +], function ( + Jupyter, + events +) { + "use strict"; + + //define default config parameter values + var params = { + toggle_cell_style_keybinding : 'shift-s' + }; + + //updates default params with any specified in the server's config + var update_params = function(){ + var config = Jupyter.notebook.config; + for (var key in params){ + if (config.data.hasOwnProperty(key)){ + params[key] = config.data[key]; + } + } + }; + + var setup = function (){ + // update defaults + update_params(); + + //register actions with ActionHandler instance + var prefix = 'auto'; + var name = 'toggle-cell-style'; + var action = { + icon : 'fa-arrows-h', + help : 'Toggle split/centered cell style', + help_index : 'eb', + id : 'split_cells', + handler : toggle_cell_style + }; + + var action_full_name = Jupyter.keyboard_manager.actions.register(action, name, prefix); + + //define keyboard shortucts + var command_mode_shortcuts = {}; + command_mode_shortcuts[params.toggle_cell_style_keybinding] = action_full_name; + + //register keyboard shortucts with keyboard_manager + Jupyter.notebook.keyboard_manager.command_shortcuts.add_shortcuts(command_mode_shortcuts); + Jupyter.toolbar.add_buttons_group([action_full_name]); + }; + + + var toggle_cell_style = function(){ + var cell = Jupyter.notebook.get_selected_cell(); + if (!("cell_style" in cell.metadata)){cell.metadata.cell_style = 'split';} + else if (cell.metadata.cell_style == 'center'){cell.metadata.cell_style = 'split';} + else {cell.metadata.cell_style = 'center';} + + update_cell_style_element(cell); + }; + + var get_cell_style_html = function(cell_style){ + console.log(cell_style); + if (cell_style == "split") + {return "float:left; width:50%;";} + return "width:100%;"; + }; + + var update_cell_style_element = function(cell){ + var cell_style_html = get_cell_style_html(cell.metadata.cell_style); + cell.element.attr('style', cell_style_html); + }; + + function initialize () { + // On Load lets set the cell styles correctly + var cells = Jupyter.notebook.get_cells(); + var ncells = Jupyter.notebook.ncells(); + + for (var i=0; i length: + content = str(x[:length]) + else: + content = str(x) + if len(content) > 150: + return content[:150] + " ..." + return content + +def var_dic_list(): + types_to_exclude = ['module', 'function', 'builtin_function_or_method', + 'instance', '_Feature', 'type', 'ufunc'] + values = _nms.who_ls() + vardic = [{'varName': v, 'varType': type(eval(v)).__name__, 'varSize': str(_getsizeof(eval(v))), 'varShape': str(_getshapeof(eval(v))) if _getshapeof(eval(v)) else '', 'varContent': _getcontentof(eval(v)) } # noqa + + for v in values if (v not in ['_html', '_nms', 'NamespaceMagics', '_Jupyter']) & (type(eval(v)).__name__ not in types_to_exclude)] # noqa + return json.dumps(vardic) + + +# command to refresh the list of variables +print(var_dic_list()) diff --git a/.local/share/jupyter/nbextensions/varInspector/var_list.r b/.local/share/jupyter/nbextensions/varInspector/var_list.r new file mode 100644 index 0000000000000000000000000000000000000000..340c4176d435490ea3097736bd84942768c9d177 --- /dev/null +++ b/.local/share/jupyter/nbextensions/varInspector/var_list.r @@ -0,0 +1,17 @@ +library(jsonlite) +var_dic_list = function(){ + ll = ls(.GlobalEnv, all.names = FALSE) + varList=list() + iter = 1 + for (k in ll){ + if (class(get(k))!='function'){ + class = class(get(k)); rk = capture.output(str(get(k))); size = object.size(get(k)); sk = substr(get(k),0, 200); + # [{'varName':v, 'varType': type(eval(v)).__name__, 'varSize': _getsizeof(eval(v)), 'varContent': str(eval(v))[:200]} + l = list(varName = k, varType = class, varSize = size, varContent = sk) + varList[[iter]] = l + # print(l) + iter = iter + 1} + } +return(toJSON(varList, simplifyVector = FALSE, force=TRUE)) + } +cat(var_dic_list()) \ No newline at end of file diff --git a/.local/share/jupyter/nbextensions/zenmode/README.md b/.local/share/jupyter/nbextensions/zenmode/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6e790aa9bc5566cae81f4abdfd5a719b7d758c95 --- /dev/null +++ b/.local/share/jupyter/nbextensions/zenmode/README.md @@ -0,0 +1,4 @@ +Zenmode +======= + +A little extension to give Zenmode functionality to the IPython notebook \ No newline at end of file diff --git a/.local/share/jupyter/nbextensions/zenmode/images/back2.jpg b/.local/share/jupyter/nbextensions/zenmode/images/back2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f433b963b0a7de48c4957f8c0e261a03ecafb5d8 Binary files /dev/null and b/.local/share/jupyter/nbextensions/zenmode/images/back2.jpg differ diff --git a/.local/share/jupyter/nbextensions/zenmode/images/back21.jpg b/.local/share/jupyter/nbextensions/zenmode/images/back21.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0ced7d3348e6d867fcfe5d7f7e2b35ce0eb12d2e Binary files /dev/null and b/.local/share/jupyter/nbextensions/zenmode/images/back21.jpg differ diff --git a/.local/share/jupyter/nbextensions/zenmode/images/back3.jpg b/.local/share/jupyter/nbextensions/zenmode/images/back3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..361e689aa7b1f9741b12e18f8e6cd32dfc2e38c9 Binary files /dev/null and b/.local/share/jupyter/nbextensions/zenmode/images/back3.jpg differ diff --git a/.local/share/jupyter/nbextensions/zenmode/images/ipynblogo0.png b/.local/share/jupyter/nbextensions/zenmode/images/ipynblogo0.png new file mode 100644 index 0000000000000000000000000000000000000000..e56eb185168d59534c94ffff9bfd10b30902991f Binary files /dev/null and b/.local/share/jupyter/nbextensions/zenmode/images/ipynblogo0.png differ diff --git a/.local/share/jupyter/nbextensions/zenmode/images/ipynblogo1.png b/.local/share/jupyter/nbextensions/zenmode/images/ipynblogo1.png new file mode 100644 index 0000000000000000000000000000000000000000..73020648efb9ca22bbadf91da2ce77f8e18cc498 Binary files /dev/null and b/.local/share/jupyter/nbextensions/zenmode/images/ipynblogo1.png differ diff --git a/.local/share/jupyter/nbextensions/zenmode/main.css b/.local/share/jupyter/nbextensions/zenmode/main.css new file mode 100644 index 0000000000000000000000000000000000000000..3fb688f3cb64d1839119bc2f9c7bbe800f747ad1 --- /dev/null +++ b/.local/share/jupyter/nbextensions/zenmode/main.css @@ -0,0 +1,34 @@ +.navbar-inner { + opacity: 0.5; + -webkit-transition: opacity 0.3s ease-in-out; + -moz-transition: opacity 0.3s ease-in-out; + -o-transition: opacity 0.3s ease-in-out; + transition: opacity 0.3s ease-in-out; +} + +.navbar-inner:hover { + opacity: 1.0; +} + +#maintoolbar .navbar-text { + display: none !important; +} + +#notebook-container { + background-color: rgba(255, 255, 255, 0); +} + +/* +.cell { + background-color: rgb(255, 255, 255); +} + +.CodeMirror { + background: #F8FCCF; +} + +div.input_area { + margin: 2px; + border: none; +} +*/ diff --git a/.local/share/jupyter/runtime/jpserver-434.json b/.local/share/jupyter/runtime/jpserver-434.json new file mode 100644 index 0000000000000000000000000000000000000000..827db6bb1f6650a69a0476806b815ee094429a2d --- /dev/null +++ b/.local/share/jupyter/runtime/jpserver-434.json @@ -0,0 +1,13 @@ +{ + "base_url": "/", + "hostname": "0.0.0.0", + "password": false, + "pid": 434, + "port": 8080, + "root_dir": "/root", + "secure": true, + "sock": "", + "token": "5a434251505375f2b42435914de608ef3450739f4e14b0be1cfeae3b7364239e", + "url": "https://184d1c0992ce:8080/", + "version": "2.12.5" +} \ No newline at end of file diff --git a/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.cubin b/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..043c32f99120b95b4bdae1f75f759ac28d96dd66 Binary files /dev/null and b/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.cubin differ diff --git a/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttgir b/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..4b8f38b8414ea811649605ae9ab1c4d55cd94b3d --- /dev/null +++ b/.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttgir @@ -0,0 +1,60 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<4x1xi64, #blocked> + %cst_0 = arith.constant dense<0> : tensor<4x1xi64, #blocked> + %cst_1 = arith.constant dense<512> : tensor<4x1xi64, #blocked> + %cst_2 = arith.constant dense<256> : tensor<4x1xi32, #blocked> + %cst_3 = arith.constant dense<131072> : tensor<1x128xi32, #blocked1> + %cst_4 = arith.constant dense<120> : tensor<1x128xi32, #blocked1> + %cst_5 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked1> + %cst_6 = arith.constant dense : tensor<4x1xi1, #blocked> + %c4_i32 = arith.constant 4 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c4_i32 : i32 + %2 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<4x1xi32, #blocked1> + %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<4x1xi32, #blocked> + %6 = tt.splat %1 : (i32) -> tensor<4x1xi32, #blocked1> + %7 = tt.splat %1 : (i32) -> tensor<4x1xi32, #blocked> + %8 = arith.addi %6, %4 : tensor<4x1xi32, #blocked1> + %9 = arith.addi %7, %5 : tensor<4x1xi32, #blocked> + %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x128xi32, #blocked1> + %12 = arith.cmpi slt, %11, %cst_4 : tensor<1x128xi32, #blocked1> + %13 = arith.muli %11, %cst_3 : tensor<1x128xi32, #blocked1> + %14 = tt.broadcast %8 : (tensor<4x1xi32, #blocked1>) -> tensor<4x128xi32, #blocked1> + %15 = tt.broadcast %13 : (tensor<1x128xi32, #blocked1>) -> tensor<4x128xi32, #blocked1> + %16 = arith.addi %14, %15 : tensor<4x128xi32, #blocked1> + %17 = tt.splat %arg0 : (!tt.ptr) -> tensor<4x128x!tt.ptr, #blocked1> + %18 = tt.addptr %17, %16 : tensor<4x128x!tt.ptr, #blocked1>, tensor<4x128xi32, #blocked1> + %19 = tt.broadcast %12 : (tensor<1x128xi1, #blocked1>) -> tensor<4x128xi1, #blocked1> + %20 = tt.load %18, %19, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<4x128xf32, #blocked1> + %21 = arith.addf %20, %cst_5 : tensor<4x128xf32, #blocked1> + %22 = arith.select %19, %21, %cst_5 : tensor<4x128xi1, #blocked1>, tensor<4x128xf32, #blocked1> + %23 = "tt.reduce"(%22) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %40 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %40 : f32 + }) : (tensor<4x128xf32, #blocked1>) -> tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %24 = triton_gpu.convert_layout %23 : (tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<4x1xf32, #blocked> + %26 = arith.divsi %9, %cst_2 : tensor<4x1xi32, #blocked> + %27 = arith.remsi %9, %cst_2 : tensor<4x1xi32, #blocked> + %28 = tt.splat %arg1 : (!tt.ptr) -> tensor<4x1x!tt.ptr, #blocked> + %29 = tt.addptr %28, %26 : tensor<4x1x!tt.ptr, #blocked>, tensor<4x1xi32, #blocked> + %30 = tt.load %29 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<4x1xi64, #blocked> + %31 = arith.addi %30, %cst_1 : tensor<4x1xi64, #blocked> + %32 = arith.cmpi slt, %30, %cst_0 : tensor<4x1xi64, #blocked> + %33 = arith.select %32, %31, %30 : tensor<4x1xi1, #blocked>, tensor<4x1xi64, #blocked> + %34 = arith.muli %33, %cst : tensor<4x1xi64, #blocked> + %35 = arith.extsi %27 : tensor<4x1xi32, #blocked> to tensor<4x1xi64, #blocked> + %36 = arith.addi %35, %34 : tensor<4x1xi64, #blocked> + %37 = tt.splat %arg2 : (!tt.ptr) -> tensor<4x1x!tt.ptr, #blocked> + %38 = tt.addptr %37, %36 : tensor<4x1x!tt.ptr, #blocked>, tensor<4x1xi64, #blocked> + %39 = "tt.atomic_rmw"(%38, %25, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<4x1x!tt.ptr, #blocked>, tensor<4x1xf32, #blocked>, tensor<4x1xi1, #blocked>) -> tensor<4x1xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.cubin b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..5880bb280c4de369bd2e6dc5cdeb23488e6f4a5c Binary files /dev/null and b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.cubin differ diff --git a/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ptx b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..12368a3de34292de00a4c7dc42d7890ab0e33358 --- /dev/null +++ b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ptx @@ -0,0 +1,453 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2de +.extern .func __assertfail +( + .param .b64 __assertfail_param_0, + .param .b64 __assertfail_param_1, + .param .b32 __assertfail_param_2, + .param .b64 __assertfail_param_3, + .param .b64 __assertfail_param_4 +) +; +.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100}; +.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62}; +.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 55, 32, 60, 32, 53, 48, 50, 53, 55}; +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2de( + .param .u64 triton__0d1d2de_param_0, + .param .u64 triton__0d1d2de_param_1, + .param .u64 triton__0d1d2de_param_2 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<24>; + .reg .b16 %rs<21>; + .reg .b32 %r<21>; + .reg .b64 %rd<58>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd9, [triton__0d1d2de_param_1]; + ld.param.u64 %rd16, [triton__0d1d2de_param_0]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r4, %tid.x; + and.b32 %r1, %r4, 127; + shl.b32 %r2, %r1, 1; + or.b32 %r5, %r2, 1; + or.b32 %r6, %r2, 256; + .loc 1 20 28 + mov.u32 %r3, %ctaid.x; + .loc 1 20 46 + mul.wide.s32 %rd1, %r3, 512; + cvt.u64.u32 %rd17, %r2; + cvt.u64.u32 %rd18, %r6; + .loc 1 21 23 + or.b64 %rd2, %rd1, %rd17; + or.b64 %rd3, %rd1, %rd18; + .loc 1 24 30 + shl.b64 %rd19, %rd2, 3; + add.s64 %rd12, %rd16, %rd19; + add.s64 %rd15, %rd12, 2048; + mov.pred %p20, -1; + .loc 1 24 35 + mov.u64 %rd10, 0x0; + mov.u64 %rd11, 0x0; + @%p20 ld.global.v2.b64 { %rd10, %rd11 }, [ %rd12 + 0 ]; + mov.u64 %rd13, 0x0; + mov.u64 %rd14, 0x0; + @%p20 ld.global.v2.b64 { %rd13, %rd14 }, [ %rd15 + 0 ]; + .loc 1 26 19 + setp.eq.s64 %p3, %rd14, -1; + setp.eq.s64 %p4, %rd13, -1; + setp.eq.s64 %p5, %rd11, -1; + setp.eq.s64 %p6, %rd10, -1; + .loc 1 28 32 + selp.b64 %rd20, 0, %rd10, %p6; + selp.b64 %rd21, 0, %rd11, %p5; + selp.b64 %rd22, 0, %rd13, %p4; + selp.b64 %rd23, 0, %rd14, %p3; + .loc 1 29 18 + add.s64 %rd24, %rd23, 50257; + add.s64 %rd25, %rd22, 50257; + add.s64 %rd26, %rd21, 50257; + add.s64 %rd27, %rd20, 50257; + .loc 1 30 18 + setp.lt.s64 %p7, %rd23, 0; + setp.lt.s64 %p8, %rd22, 0; + setp.lt.s64 %p9, %rd21, 0; + setp.lt.s64 %p10, %rd20, 0; + .loc 1 31 32 + selp.b64 %rd7, %rd27, %rd20, %p10; + selp.b64 %rd6, %rd26, %rd21, %p9; + selp.b64 %rd5, %rd25, %rd22, %p8; + selp.b64 %rd4, %rd24, %rd23, %p7; + .loc 1 32 36 + setp.lt.u64 %p11, %rd4, 50257; + setp.lt.u64 %p12, %rd5, 50257; + setp.lt.u64 %p13, %rd6, 50257; + setp.lt.u64 %p14, %rd7, 50257; + mov.u32 %r7, global_smem; + add.s32 %r8, %r7, %r2; + selp.u16 %rs1, 1, 0, %p14; + st.shared.u8 [%r8], %rs1; + cvt.u64.u32 %rd8, %r5; + selp.u16 %rs2, 1, 0, %p13; + st.shared.u8 [%r8+1], %rs2; + bar.sync 0; + add.s32 %r9, %r7, %r1; + ld.shared.u8 %rs3, [%r9]; + ld.shared.u8 %rs4, [%r9+128]; + bar.sync 0; + selp.u16 %rs5, 1, 0, %p12; + st.shared.u8 [%r8], %rs5; + selp.u16 %rs6, 1, 0, %p11; + st.shared.u8 [%r8+1], %rs6; + bar.sync 0; + ld.shared.u8 %rs7, [%r9]; + ld.shared.u8 %rs8, [%r9+128]; + setp.eq.s16 %p15, %rs7, 0; + selp.u16 %rs9, 1, 0, %p15; + shl.b16 %rs10, %rs9, 2; + setp.eq.s16 %p16, %rs8, 0; + selp.u16 %rs11, -1, 0, %p16; + shl.b16 %rs12, %rs11, 3; + or.b16 %rs13, %rs12, %rs10; + setp.eq.s16 %p17, %rs4, 0; + selp.u16 %rs14, 1, 0, %p17; + setp.eq.s16 %p18, %rs3, 0; + selp.u16 %rs15, -1, 0, %p18; + shl.b16 %rs16, %rs15, 1; + or.b16 %rs17, %rs14, %rs16; + and.b16 %rs18, %rs17, 3; + or.b16 %rs19, %rs18, %rs13; + .loc 1 32 51 + and.b16 %rs20, %rs19, 15; + setp.eq.s16 %p19, %rs20, 0; + @%p19 bra $L__BB0_2; + mov.u64 %rd28, assertMessage_0; + cvta.global.u64 %rd29, %rd28; + mov.u64 %rd30, assertFile_0; + cvta.global.u64 %rd31, %rd30; + mov.u64 %rd32, assertFunc_0; + cvta.global.u64 %rd33, %rd32; + mov.b32 %r10, 883; + mov.u64 %rd34, 1; + { // callseq 0, 0 + .reg .b32 temp_param_reg; + .param .b64 param0; + st.param.b64 [param0+0], %rd29; + .param .b64 param1; + st.param.b64 [param1+0], %rd31; + .param .b32 param2; + st.param.b32 [param2+0], %r10; + .param .b64 param3; + st.param.b64 [param3+0], %rd33; + .param .b64 param4; + st.param.b64 [param4+0], %rd34; + call.uni + __assertfail, + ( + param0, + param1, + param2, + param3, + param4 + ); + } // callseq 0 +$L__BB0_2: + .loc 1 21 36 + or.b32 %r15, %r2, 257; + cvt.u64.u32 %rd39, %r15; + .loc 1 21 23 + or.b64 %rd40, %rd1, %rd39; + or.b64 %rd41, %rd1, %rd8; + .loc 1 34 25 + shl.b64 %rd42, %rd7, 2; + add.s64 %rd43, %rd9, %rd42; + mul.lo.s64 %rd44, %rd2, 201028; + add.s64 %rd45, %rd43, %rd44; + shl.b64 %rd46, %rd6, 2; + add.s64 %rd47, %rd9, %rd46; + mul.lo.s64 %rd48, %rd41, 201028; + add.s64 %rd49, %rd47, %rd48; + shl.b64 %rd50, %rd5, 2; + add.s64 %rd51, %rd9, %rd50; + mul.lo.s64 %rd52, %rd3, 201028; + add.s64 %rd53, %rd51, %rd52; + shl.b64 %rd54, %rd4, 2; + add.s64 %rd55, %rd9, %rd54; + mul.lo.s64 %rd56, %rd40, 201028; + add.s64 %rd57, %rd55, %rd56; + .loc 1 34 51 + bar.sync 0; + shl.b32 %r16, %r2, 3; + add.s32 %r18, %r7, %r16; + st.shared.u64 [%r18], %rd45; + st.shared.u64 [%r18+8], %rd49; + bar.sync 0; + shl.b32 %r19, %r1, 3; + add.s32 %r20, %r7, %r19; + ld.shared.u64 %rd35, [%r20]; + ld.shared.u64 %rd36, [%r20+1024]; + bar.sync 0; + st.shared.u64 [%r18], %rd53; + st.shared.u64 [%r18+8], %rd57; + bar.sync 0; + ld.shared.u64 %rd37, [%r20]; + ld.shared.u64 %rd38, [%r20+1024]; + mov.b32 %r11, -1082130432; + @%p20 st.global.b32 [ %rd35 + 0 ], { %r11 }; + @%p20 st.global.b32 [ %rd36 + 0 ], { %r11 }; + @%p20 st.global.b32 [ %rd37 + 0 ], { %r11 }; + @%p20 st.global.b32 [ %rd38 + 0 ], { %r11 }; + .loc 1 34 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/hl/chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 176 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 104 +.b8 108 +.b8 114 +.b8 107 +.b8 103 +.b8 112 +.b8 118 +.b8 118 +.b8 98 +.b8 100 +.b8 105 +.b8 122 +.b8 100 +.b8 122 +.b8 55 +.b8 115 +.b8 108 +.b8 108 +.b8 113 +.b8 117 +.b8 101 +.b8 116 +.b8 50 +.b8 106 +.b8 55 +.b8 122 +.b8 104 +.b8 116 +.b8 101 +.b8 115 +.b8 54 +.b8 109 +.b8 101 +.b8 104 +.b8 54 +.b8 107 +.b8 101 +.b8 110 +.b8 114 +.b8 113 +.b8 120 +.b8 111 +.b8 118 +.b8 50 +.b8 54 +.b8 109 +.b8 115 +.b8 119 +.b8 118 +.b8 119 +.b8 55 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 104 +.b8 108 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttgir b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..0b7d78df8ee7c3a7698074c07e6a85c89557a262 --- /dev/null +++ b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttgir @@ -0,0 +1,38 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<50257> : tensor<512xi64, #blocked> + %cst_0 = arith.constant dense<0> : tensor<512xi64, #blocked> + %cst_1 = arith.constant dense<-1> : tensor<512xi64, #blocked> + %cst_2 = arith.constant dense<-1.000000e+00> : tensor<512xf32, #blocked1> + %c512_i64 = arith.constant 512 : i64 + %0 = tt.get_program_id x : i32 + %1 = arith.extsi %0 : i32 to i64 + %2 = arith.muli %1, %c512_i64 : i64 + %3 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> + %4 = arith.extsi %3 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked> + %5 = tt.splat %2 : (i64) -> tensor<512xi64, #blocked> + %6 = arith.addi %5, %4 : tensor<512xi64, #blocked> + %7 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr, #blocked> + %8 = tt.addptr %7, %6 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> + %9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xi64, #blocked> + %10 = arith.cmpi ne, %9, %cst_1 : tensor<512xi64, #blocked> + %11 = arith.select %10, %9, %cst_0 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> + %12 = arith.addi %11, %cst : tensor<512xi64, #blocked> + %13 = arith.cmpi slt, %11, %cst_0 : tensor<512xi64, #blocked> + %14 = arith.select %13, %12, %11 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked> + %15 = arith.cmpi sge, %14, %cst_0 : tensor<512xi64, #blocked> + %16 = arith.cmpi slt, %14, %cst : tensor<512xi64, #blocked> + %17 = arith.andi %15, %16 : tensor<512xi1, #blocked> + %18 = triton_gpu.convert_layout %17 : (tensor<512xi1, #blocked>) -> tensor<512xi1, #blocked1> + tt.assert %18, "index out of bounds: 0 <= tmp7 < 50257", "", "_call_with_frames_removed", 883 : tensor<512xi1, #blocked1> + %19 = arith.muli %6, %cst : tensor<512xi64, #blocked> + %20 = arith.addi %14, %19 : tensor<512xi64, #blocked> + %21 = tt.splat %arg1 : (!tt.ptr) -> tensor<512x!tt.ptr, #blocked> + %22 = tt.addptr %21, %20 : tensor<512x!tt.ptr, #blocked>, tensor<512xi64, #blocked> + %23 = triton_gpu.convert_layout %22 : (tensor<512x!tt.ptr, #blocked>) -> tensor<512x!tt.ptr, #blocked1> + tt.store %23, %cst_2 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked1> + tt.return + } +} diff --git a/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttir b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..ac625daa75a7b2e0a1660cd29001f4943ddb9c7b --- /dev/null +++ b/.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttir @@ -0,0 +1,34 @@ +module { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<50257> : tensor<512xi64> + %cst_0 = arith.constant dense<0> : tensor<512xi64> + %c512_i64 = arith.constant 512 : i64 + %cst_1 = arith.constant dense<-1.000000e+00> : tensor<512xf32> + %cst_2 = arith.constant dense<-1> : tensor<512xi64> + %0 = tt.get_program_id x : i32 + %1 = arith.extsi %0 : i32 to i64 + %2 = arith.muli %1, %c512_i64 : i64 + %3 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> + %4 = arith.extsi %3 : tensor<512xi32> to tensor<512xi64> + %5 = tt.splat %2 : (i64) -> tensor<512xi64> + %6 = arith.addi %5, %4 : tensor<512xi64> + %7 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr> + %8 = tt.addptr %7, %6 : tensor<512x!tt.ptr>, tensor<512xi64> + %9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xi64> + %10 = arith.cmpi ne, %9, %cst_2 : tensor<512xi64> + %11 = arith.select %10, %9, %cst_0 : tensor<512xi1>, tensor<512xi64> + %12 = arith.addi %11, %cst : tensor<512xi64> + %13 = arith.cmpi slt, %11, %cst_0 : tensor<512xi64> + %14 = arith.select %13, %12, %11 : tensor<512xi1>, tensor<512xi64> + %15 = arith.cmpi sge, %14, %cst_0 : tensor<512xi64> + %16 = arith.cmpi slt, %14, %cst : tensor<512xi64> + %17 = arith.andi %15, %16 : tensor<512xi1> + tt.assert %17, "index out of bounds: 0 <= tmp7 < 50257", "", "_call_with_frames_removed", 883 : tensor<512xi1> + %18 = arith.muli %6, %cst : tensor<512xi64> + %19 = arith.addi %14, %18 : tensor<512xi64> + %20 = tt.splat %arg1 : (!tt.ptr) -> tensor<512x!tt.ptr> + %21 = tt.addptr %20, %19 : tensor<512x!tt.ptr>, tensor<512xi64> + tt.store %21, %cst_1 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32> + tt.return + } +} diff --git a/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ptx b/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..bee0c6b92fa96062bbd3c250fff66034763e177d --- /dev/null +++ b/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ptx @@ -0,0 +1,312 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2de + +.visible .entry triton__0d1d2de( + .param .u64 triton__0d1d2de_param_0, + .param .u64 triton__0d1d2de_param_1, + .param .u32 triton__0d1d2de_param_2 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<4>; + .reg .b16 %rs<9>; + .reg .b32 %r<31>; + .reg .b64 %rd<8>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd4, [triton__0d1d2de_param_0]; + ld.param.u64 %rd5, [triton__0d1d2de_param_1]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r22, %tid.x; + shl.b32 %r23, %r22, 3; + and.b32 %r24, %r23, 1016; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r25, %r1, 10; + .loc 1 21 23 + or.b32 %r26, %r25, %r24; + .loc 1 24 30 + mul.wide.s32 %rd6, %r26, 4; + add.s64 %rd1, %rd4, %rd6; + add.s64 %rd2, %rd1, 16; + mov.pred %p1, -1; + .loc 1 24 35 + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + mov.u32 %r12, 0x0; + mov.u32 %r13, 0x0; + @%p1 ld.global.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd1 + 0 ]; + mov.u32 %r14, 0x0; + mov.u32 %r15, 0x0; + mov.u32 %r16, 0x0; + mov.u32 %r17, 0x0; + @%p1 ld.global.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd2 + 0 ]; + .loc 1 26 25 + mul.wide.s32 %rd7, %r26, 2; + add.s64 %rd3, %rd5, %rd7; + .loc 1 26 36 + cvt.rn.bf16.f32 %rs1, %r10; + cvt.rn.bf16.f32 %rs2, %r11; + cvt.rn.bf16.f32 %rs3, %r12; + cvt.rn.bf16.f32 %rs4, %r13; + cvt.rn.bf16.f32 %rs5, %r14; + cvt.rn.bf16.f32 %rs6, %r15; + cvt.rn.bf16.f32 %rs7, %r16; + cvt.rn.bf16.f32 %rs8, %r17; + mov.b32 %r27, {%rs1, %rs2}; + mov.b32 %r28, {%rs3, %rs4}; + mov.b32 %r29, {%rs5, %rs6}; + mov.b32 %r30, {%rs7, %rs8}; + @%p1 st.global.v4.b32 [ %rd3 + 0 ], { %r27, %r28, %r29, %r30 }; + .loc 1 26 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/5t/c5tryp5qwkhreijk7s5x327wofz54lwj4kvctuqdzv2vrf2xyons.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 176 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 53 +.b8 116 +.b8 114 +.b8 121 +.b8 112 +.b8 53 +.b8 113 +.b8 119 +.b8 107 +.b8 104 +.b8 114 +.b8 101 +.b8 105 +.b8 106 +.b8 107 +.b8 55 +.b8 115 +.b8 53 +.b8 120 +.b8 51 +.b8 50 +.b8 55 +.b8 119 +.b8 111 +.b8 102 +.b8 122 +.b8 53 +.b8 52 +.b8 108 +.b8 119 +.b8 106 +.b8 52 +.b8 107 +.b8 118 +.b8 99 +.b8 116 +.b8 117 +.b8 113 +.b8 100 +.b8 122 +.b8 118 +.b8 50 +.b8 118 +.b8 114 +.b8 102 +.b8 50 +.b8 120 +.b8 121 +.b8 111 +.b8 110 +.b8 115 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 53 +.b8 116 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttgir b/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..614e4de8b1d1d1bea9558461182cf294bdee414b --- /dev/null +++ b/.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttgir @@ -0,0 +1,19 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked> + %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked> + %8 = tt.splat %arg1 : (!tt.ptr) -> tensor<1024x!tt.ptr, #blocked> + %9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %10 = arith.truncf %7 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked> + tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..0d55d8e52e5055c0628c4a6cd43dddde5d53d7e4 Binary files /dev/null and b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin differ diff --git a/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttir b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..962967057117e882cdf06f7455d5b47cc5232325 --- /dev/null +++ b/.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttir @@ -0,0 +1,56 @@ +module { + tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c8_i32 = arith.constant 8 : i32 + %c128_i32 = arith.constant 128 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst = arith.constant dense<32768> : tensor<64x1xi32> + %cst_0 = arith.constant dense<256> : tensor<1x8xi32> + %cst_1 = arith.constant dense<128> : tensor<1x8xi32> + %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x8xf32> + %cst_3 = arith.constant dense<256> : tensor<64x1xi32> + %c64_i32 = arith.constant 64 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32> + %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32> + %4 = tt.splat %1 : (i32) -> tensor<64x1xi32> + %5 = arith.addi %4, %3 : tensor<64x1xi32> + %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32> + %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32> + %8 = arith.remsi %5, %cst_3 : tensor<64x1xi32> + %9 = arith.divsi %5, %cst_3 : tensor<64x1xi32> + %10 = tt.broadcast %8 : (tensor<64x1xi32>) -> tensor<64x8xi32> + %11 = arith.muli %9, %cst : tensor<64x1xi32> + %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x8xi32> + %13 = tt.splat %arg0 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %14 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x8x!tt.ptr> + %15 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_2) -> (tensor<64x8xf32>) : i32 { + %20 = tt.splat %arg5 : (i32) -> tensor<1x8xi32> + %21 = arith.addi %20, %7 : tensor<1x8xi32> + %22 = arith.cmpi slt, %21, %cst_1 : tensor<1x8xi32> + %23 = arith.muli %21, %cst_0 : tensor<1x8xi32> + %24 = tt.broadcast %23 : (tensor<1x8xi32>) -> tensor<64x8xi32> + %25 = arith.addi %10, %24 : tensor<64x8xi32> + %26 = arith.addi %25, %12 : tensor<64x8xi32> + %27 = tt.addptr %13, %26 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %28 = tt.broadcast %22 : (tensor<1x8xi1>) -> tensor<64x8xi1> + %29 = tt.load %27, %28, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32> + %30 = tt.addptr %14, %26 : tensor<64x8x!tt.ptr>, tensor<64x8xi32> + %31 = tt.load %30, %28, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32> + %32 = arith.mulf %29, %31 : tensor<64x8xf32> + %33 = arith.addf %arg6, %32 : tensor<64x8xf32> + %34 = arith.select %28, %33, %arg6 : tensor<64x8xi1>, tensor<64x8xf32> + scf.yield %34 : tensor<64x8xf32> + } + %16 = "tt.reduce"(%15) <{axis = 1 : i32}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %20 = arith.addf %arg5, %arg6 : f32 + tt.reduce.return %20 : f32 + }) : (tensor<64x8xf32>) -> tensor<64xf32> + %17 = tt.expand_dims %16 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32> + %18 = tt.splat %arg2 : (!tt.ptr) -> tensor<64x1x!tt.ptr> + %19 = tt.addptr %18, %5 : tensor<64x1x!tt.ptr>, tensor<64x1xi32> + tt.store %19, %17 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32> + tt.return + } +} diff --git a/.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ttgir b/.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..989f4b0263f466b7932aaa29efeacd8142cc4858 --- /dev/null +++ b/.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ttgir @@ -0,0 +1,88 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<256xi32, #blocked> + %cst_0 = arith.constant dense<-1> : tensor<1xi64, #blocked> + %cst_1 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked> + %cst_2 = arith.constant dense<256> : tensor<1xi64, #blocked> + %cst_3 = arith.constant dense<0> : tensor<1xi64, #blocked> + %cst_4 = arith.constant dense<50257> : tensor<1xi64, #blocked> + %cst_5 = arith.constant 0.000000e+00 : f32 + %c256_i32 = arith.constant 256 : i32 + %cst_6 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked> + %cst_7 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked> + %cst_8 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked> + %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked> + %5 = arith.addi %1, %4 : tensor<256xi32, #blocked> + %6 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %8 = tt.load %7, %2, %cst_8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked> + %9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked> + %10 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %12 = tt.load %11, %2, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %13 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %15 = tt.load %14, %2, %cst_6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %16 = tt.addptr %arg4, %0 : !tt.ptr, i32 + %17 = tt.splat %16 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %19 = tt.addptr %arg5, %0 : !tt.ptr, i32 + %20 = tt.splat %19 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked> + %22 = tt.addptr %arg6, %0 : !tt.ptr, i32 + %23 = tt.splat %22 : (!tt.ptr) -> tensor<1x!tt.ptr, #blocked> + %24 = tt.load %23 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked> + %25 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %26 = tt.addptr %25, %5 : tensor<256x!tt.ptr, #blocked>, tensor<256xi32, #blocked> + %27 = tt.load %26, %2, %cst_6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked> + %28 = arith.mulf %9, %12 : tensor<256xf32, #blocked> + %29 = arith.select %2, %28, %cst_6 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %63 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %63 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %31 = arith.addf %30, %cst_5 : f32 + %32 = tt.broadcast %18 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %33 = arith.subf %15, %32 : tensor<256xf32, #blocked> + %34 = tt.broadcast %21 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %35 = arith.mulf %33, %34 : tensor<256xf32, #blocked> + %36 = arith.mulf %28, %35 : tensor<256xf32, #blocked> + %37 = arith.select %2, %36, %cst_6 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %38 = "tt.reduce"(%37) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %63 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %63 : f32 + }) : (tensor<256xf32, #blocked>) -> f32 + %39 = arith.addf %38, %cst_5 : f32 + %40 = arith.cmpi eq, %24, %cst_0 : tensor<1xi64, #blocked> + %41 = arith.divf %21, %cst_1 : tensor<1xf32, #blocked> + %42 = arith.mulf %28, %cst_7 : tensor<256xf32, #blocked> + %43 = tt.splat %31 : (f32) -> tensor<256xf32, #blocked> + %44 = arith.subf %42, %43 : tensor<256xf32, #blocked> + %45 = tt.splat %39 : (f32) -> tensor<256xf32, #blocked> + %46 = arith.mulf %35, %45 : tensor<256xf32, #blocked> + %47 = arith.subf %44, %46 : tensor<256xf32, #blocked> + %48 = tt.broadcast %41 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked> + %49 = arith.mulf %48, %47 : tensor<256xf32, #blocked> + %50 = arith.addf %27, %49 : tensor<256xf32, #blocked> + %51 = tt.broadcast %40 : (tensor<1xi1, #blocked>) -> tensor<256xi1, #blocked> + %52 = arith.select %51, %cst_6, %50 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked> + %53 = arith.addi %24, %cst_4 : tensor<1xi64, #blocked> + %54 = arith.cmpi slt, %24, %cst_3 : tensor<1xi64, #blocked> + %55 = arith.select %54, %53, %24 : tensor<1xi1, #blocked>, tensor<1xi64, #blocked> + %56 = arith.muli %55, %cst_2 : tensor<1xi64, #blocked> + %57 = tt.broadcast %56 : (tensor<1xi64, #blocked>) -> tensor<256xi64, #blocked> + %58 = arith.extsi %1 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked> + %59 = arith.addi %58, %57 : tensor<256xi64, #blocked> + %60 = tt.splat %arg7 : (!tt.ptr) -> tensor<256x!tt.ptr, #blocked> + %61 = tt.addptr %60, %59 : tensor<256x!tt.ptr, #blocked>, tensor<256xi64, #blocked> + %62 = "tt.atomic_rmw"(%61, %52, %2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<256x!tt.ptr, #blocked>, tensor<256xf32, #blocked>, tensor<256xi1, #blocked>) -> tensor<256xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ptx b/.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..620845f02dff7a50f12b12538eb505d2fb4b62ba --- /dev/null +++ b/.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ptx @@ -0,0 +1,778 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2de +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2de( + .param .u64 triton__0d1d2de_param_0, + .param .u64 triton__0d1d2de_param_1, + .param .u32 triton__0d1d2de_param_2 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<27>; + .reg .b16 %rs<17>; + .reg .b32 %r<67>; + .reg .f32 %f<431>; + .reg .b64 %rd<8>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd4, [triton__0d1d2de_param_0]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r14, %tid.x; + shl.b32 %r15, %r14, 3; + and.b32 %r16, %r15, 1016; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r17, %r1, 10; + .loc 1 21 23 + or.b32 %r18, %r17, %r16; + .loc 1 24 30 + mul.wide.s32 %rd5, %r18, 2; + add.s64 %rd3, %rd4, %rd5; + mov.pred %p1, -1; + .loc 1 24 35 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd3 + 0 ]; + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + cvt.u16.u32 %rs3, %r3; + .loc 1 24 44 + cvt.f32.bf16 %r6, %rs1; + mov.b32 %f1, %r6; + cvt.f32.bf16 %r7, %rs2; + mov.b32 %f2, %r7; + .loc 1 29 18 + mul.f32 %f9, %f1, 0f3F3504F3; + .loc 1 30 23 + abs.ftz.f32 %f17, %f9; + setp.ge.f32 %p2, %f17, 0f3F8060FE; + mov.f32 %f365, 0f3789CA3C; + mov.f32 %f364, 0fB9F560B9; + mov.f32 %f363, 0f3BAC840B; + mov.f32 %f362, 0fBD0C8162; + mov.f32 %f361, 0f3E1CF906; + mov.f32 %f360, 0f3F6A937E; + mov.f32 %f359, 0f3F20D842; + mov.f32 %f366, %f17; + @%p2 bra $L__BB0_2; + .loc 1 0 23 + mov.f32 %f365, 0f38B1E96A; + mov.f32 %f364, 0fBA574D20; + mov.f32 %f363, 0f3BAAD5EA; + mov.f32 %f362, 0fBCDC1BE7; + mov.f32 %f361, 0f3DE718AF; + mov.f32 %f360, 0fBEC093AC; + mov.f32 %f359, 0f3E0375D3; + .loc 1 30 23 + mul.f32 %f366, %f9, %f9; +$L__BB0_2: + .loc 1 0 0 + cvt.f32.bf16 %r8, %rs3; + mul.f32 %f10, %f2, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p3, %f17, 0f3F8060FE; + fma.rn.ftz.f32 %f135, %f365, %f366, %f364; + fma.rn.ftz.f32 %f136, %f135, %f366, %f363; + fma.rn.ftz.f32 %f137, %f136, %f366, %f362; + fma.rn.ftz.f32 %f138, %f137, %f366, %f361; + fma.rn.ftz.f32 %f139, %f138, %f366, %f360; + fma.rn.ftz.f32 %f140, %f139, %f366, %f359; + neg.f32 %f141, %f366; + selp.f32 %f142, %f141, %f9, %p2; + fma.rn.ftz.f32 %f367, %f140, %f142, %f142; + mov.f32 %f358, 0f3F800000; + @%p3 bra $L__BB0_4; + ex2.approx.ftz.f32 %f143, %f367; + sub.f32 %f145, %f358, %f143; + mov.b32 %r19, %f145; + mov.b32 %r20, %f9; + and.b32 %r21, %r20, -2147483648; + or.b32 %r22, %r21, %r19; + mov.b32 %f367, %r22; +$L__BB0_4: + .loc 1 0 0 + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; } + mov.b32 %f3, %r8; + .loc 1 30 23 + abs.ftz.f32 %f30, %f10; + setp.ge.f32 %p5, %f30, 0f3F8060FE; + mov.f32 %f374, 0f3789CA3C; + mov.f32 %f373, 0fB9F560B9; + mov.f32 %f372, 0f3BAC840B; + mov.f32 %f371, 0fBD0C8162; + mov.f32 %f370, 0f3E1CF906; + mov.f32 %f369, 0f3F6A937E; + mov.f32 %f368, 0f3F20D842; + mov.f32 %f375, %f30; + @%p5 bra $L__BB0_6; + mul.f32 %f375, %f10, %f10; + mov.f32 %f374, 0f38B1E96A; + mov.f32 %f373, 0fBA574D20; + mov.f32 %f372, 0f3BAAD5EA; + mov.f32 %f371, 0fBCDC1BE7; + mov.f32 %f370, 0f3DE718AF; + mov.f32 %f369, 0fBEC093AC; + mov.f32 %f368, 0f3E0375D3; +$L__BB0_6: + .loc 1 0 0 + cvt.f32.bf16 %r9, %rs4; + mul.f32 %f11, %f3, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p6, %f30, 0f3F8060FE; + fma.rn.ftz.f32 %f160, %f374, %f375, %f373; + fma.rn.ftz.f32 %f161, %f160, %f375, %f372; + fma.rn.ftz.f32 %f162, %f161, %f375, %f371; + fma.rn.ftz.f32 %f163, %f162, %f375, %f370; + fma.rn.ftz.f32 %f164, %f163, %f375, %f369; + fma.rn.ftz.f32 %f165, %f164, %f375, %f368; + neg.f32 %f166, %f375; + selp.f32 %f167, %f166, %f10, %p5; + fma.rn.ftz.f32 %f376, %f165, %f167, %f167; + @%p6 bra $L__BB0_8; + ex2.approx.ftz.f32 %f168, %f376; + sub.f32 %f170, %f358, %f168; + mov.b32 %r23, %f170; + mov.b32 %r24, %f10; + and.b32 %r25, %r24, -2147483648; + or.b32 %r26, %r25, %r23; + mov.b32 %f376, %r26; +$L__BB0_8: + .loc 1 0 0 + cvt.u16.u32 %rs5, %r4; + mov.b32 %f4, %r9; + .loc 1 30 23 + abs.ftz.f32 %f43, %f11; + setp.ge.f32 %p8, %f43, 0f3F8060FE; + mov.f32 %f383, 0f3789CA3C; + mov.f32 %f382, 0fB9F560B9; + mov.f32 %f381, 0f3BAC840B; + mov.f32 %f380, 0fBD0C8162; + mov.f32 %f379, 0f3E1CF906; + mov.f32 %f378, 0f3F6A937E; + mov.f32 %f377, 0f3F20D842; + mov.f32 %f384, %f43; + @%p8 bra $L__BB0_10; + mul.f32 %f384, %f11, %f11; + mov.f32 %f383, 0f38B1E96A; + mov.f32 %f382, 0fBA574D20; + mov.f32 %f381, 0f3BAAD5EA; + mov.f32 %f380, 0fBCDC1BE7; + mov.f32 %f379, 0f3DE718AF; + mov.f32 %f378, 0fBEC093AC; + mov.f32 %f377, 0f3E0375D3; +$L__BB0_10: + .loc 1 0 0 + cvt.f32.bf16 %r10, %rs5; + mul.f32 %f12, %f4, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p9, %f43, 0f3F8060FE; + fma.rn.ftz.f32 %f185, %f383, %f384, %f382; + fma.rn.ftz.f32 %f186, %f185, %f384, %f381; + fma.rn.ftz.f32 %f187, %f186, %f384, %f380; + fma.rn.ftz.f32 %f188, %f187, %f384, %f379; + fma.rn.ftz.f32 %f189, %f188, %f384, %f378; + fma.rn.ftz.f32 %f190, %f189, %f384, %f377; + neg.f32 %f191, %f384; + selp.f32 %f192, %f191, %f11, %p8; + fma.rn.ftz.f32 %f385, %f190, %f192, %f192; + @%p9 bra $L__BB0_12; + ex2.approx.ftz.f32 %f193, %f385; + sub.f32 %f195, %f358, %f193; + mov.b32 %r27, %f195; + mov.b32 %r28, %f11; + and.b32 %r29, %r28, -2147483648; + or.b32 %r30, %r29, %r27; + mov.b32 %f385, %r30; +$L__BB0_12: + .loc 1 0 0 + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; } + mov.b32 %f5, %r10; + .loc 1 30 23 + abs.ftz.f32 %f56, %f12; + setp.ge.f32 %p11, %f56, 0f3F8060FE; + mov.f32 %f392, 0f3789CA3C; + mov.f32 %f391, 0fB9F560B9; + mov.f32 %f390, 0f3BAC840B; + mov.f32 %f389, 0fBD0C8162; + mov.f32 %f388, 0f3E1CF906; + mov.f32 %f387, 0f3F6A937E; + mov.f32 %f386, 0f3F20D842; + mov.f32 %f393, %f56; + @%p11 bra $L__BB0_14; + mul.f32 %f393, %f12, %f12; + mov.f32 %f392, 0f38B1E96A; + mov.f32 %f391, 0fBA574D20; + mov.f32 %f390, 0f3BAAD5EA; + mov.f32 %f389, 0fBCDC1BE7; + mov.f32 %f388, 0f3DE718AF; + mov.f32 %f387, 0fBEC093AC; + mov.f32 %f386, 0f3E0375D3; +$L__BB0_14: + .loc 1 0 0 + cvt.f32.bf16 %r11, %rs6; + mul.f32 %f13, %f5, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p12, %f56, 0f3F8060FE; + fma.rn.ftz.f32 %f210, %f392, %f393, %f391; + fma.rn.ftz.f32 %f211, %f210, %f393, %f390; + fma.rn.ftz.f32 %f212, %f211, %f393, %f389; + fma.rn.ftz.f32 %f213, %f212, %f393, %f388; + fma.rn.ftz.f32 %f214, %f213, %f393, %f387; + fma.rn.ftz.f32 %f215, %f214, %f393, %f386; + neg.f32 %f216, %f393; + selp.f32 %f217, %f216, %f12, %p11; + fma.rn.ftz.f32 %f394, %f215, %f217, %f217; + @%p12 bra $L__BB0_16; + ex2.approx.ftz.f32 %f218, %f394; + sub.f32 %f220, %f358, %f218; + mov.b32 %r31, %f220; + mov.b32 %r32, %f12; + and.b32 %r33, %r32, -2147483648; + or.b32 %r34, %r33, %r31; + mov.b32 %f394, %r34; +$L__BB0_16: + .loc 1 0 0 + cvt.u16.u32 %rs7, %r5; + mov.b32 %f6, %r11; + .loc 1 30 23 + abs.ftz.f32 %f69, %f13; + setp.ge.f32 %p14, %f69, 0f3F8060FE; + mov.f32 %f401, 0f3789CA3C; + mov.f32 %f400, 0fB9F560B9; + mov.f32 %f399, 0f3BAC840B; + mov.f32 %f398, 0fBD0C8162; + mov.f32 %f397, 0f3E1CF906; + mov.f32 %f396, 0f3F6A937E; + mov.f32 %f395, 0f3F20D842; + mov.f32 %f402, %f69; + @%p14 bra $L__BB0_18; + mul.f32 %f402, %f13, %f13; + mov.f32 %f401, 0f38B1E96A; + mov.f32 %f400, 0fBA574D20; + mov.f32 %f399, 0f3BAAD5EA; + mov.f32 %f398, 0fBCDC1BE7; + mov.f32 %f397, 0f3DE718AF; + mov.f32 %f396, 0fBEC093AC; + mov.f32 %f395, 0f3E0375D3; +$L__BB0_18: + .loc 1 0 0 + cvt.f32.bf16 %r12, %rs7; + mul.f32 %f14, %f6, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p15, %f69, 0f3F8060FE; + fma.rn.ftz.f32 %f235, %f401, %f402, %f400; + fma.rn.ftz.f32 %f236, %f235, %f402, %f399; + fma.rn.ftz.f32 %f237, %f236, %f402, %f398; + fma.rn.ftz.f32 %f238, %f237, %f402, %f397; + fma.rn.ftz.f32 %f239, %f238, %f402, %f396; + fma.rn.ftz.f32 %f240, %f239, %f402, %f395; + neg.f32 %f241, %f402; + selp.f32 %f242, %f241, %f13, %p14; + fma.rn.ftz.f32 %f403, %f240, %f242, %f242; + @%p15 bra $L__BB0_20; + ex2.approx.ftz.f32 %f243, %f403; + sub.f32 %f245, %f358, %f243; + mov.b32 %r35, %f245; + mov.b32 %r36, %f13; + and.b32 %r37, %r36, -2147483648; + or.b32 %r38, %r37, %r35; + mov.b32 %f403, %r38; +$L__BB0_20: + .loc 1 0 0 + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; } + mov.b32 %f7, %r12; + .loc 1 30 23 + abs.ftz.f32 %f82, %f14; + setp.ge.f32 %p17, %f82, 0f3F8060FE; + mov.f32 %f410, 0f3789CA3C; + mov.f32 %f409, 0fB9F560B9; + mov.f32 %f408, 0f3BAC840B; + mov.f32 %f407, 0fBD0C8162; + mov.f32 %f406, 0f3E1CF906; + mov.f32 %f405, 0f3F6A937E; + mov.f32 %f404, 0f3F20D842; + mov.f32 %f411, %f82; + @%p17 bra $L__BB0_22; + mul.f32 %f411, %f14, %f14; + mov.f32 %f410, 0f38B1E96A; + mov.f32 %f409, 0fBA574D20; + mov.f32 %f408, 0f3BAAD5EA; + mov.f32 %f407, 0fBCDC1BE7; + mov.f32 %f406, 0f3DE718AF; + mov.f32 %f405, 0fBEC093AC; + mov.f32 %f404, 0f3E0375D3; +$L__BB0_22: + .loc 1 0 0 + cvt.f32.bf16 %r13, %rs8; + mul.f32 %f15, %f7, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p18, %f82, 0f3F8060FE; + fma.rn.ftz.f32 %f260, %f410, %f411, %f409; + fma.rn.ftz.f32 %f261, %f260, %f411, %f408; + fma.rn.ftz.f32 %f262, %f261, %f411, %f407; + fma.rn.ftz.f32 %f263, %f262, %f411, %f406; + fma.rn.ftz.f32 %f264, %f263, %f411, %f405; + fma.rn.ftz.f32 %f265, %f264, %f411, %f404; + neg.f32 %f266, %f411; + selp.f32 %f267, %f266, %f14, %p17; + fma.rn.ftz.f32 %f412, %f265, %f267, %f267; + @%p18 bra $L__BB0_24; + ex2.approx.ftz.f32 %f268, %f412; + sub.f32 %f270, %f358, %f268; + mov.b32 %r39, %f270; + mov.b32 %r40, %f14; + and.b32 %r41, %r40, -2147483648; + or.b32 %r42, %r41, %r39; + mov.b32 %f412, %r42; +$L__BB0_24: + .loc 1 0 0 + mov.b32 %f8, %r13; + .loc 1 30 23 + abs.ftz.f32 %f95, %f15; + setp.ge.f32 %p20, %f95, 0f3F8060FE; + mov.f32 %f419, 0f3789CA3C; + mov.f32 %f418, 0fB9F560B9; + mov.f32 %f417, 0f3BAC840B; + mov.f32 %f416, 0fBD0C8162; + mov.f32 %f415, 0f3E1CF906; + mov.f32 %f414, 0f3F6A937E; + mov.f32 %f413, 0f3F20D842; + mov.f32 %f420, %f95; + @%p20 bra $L__BB0_26; + mul.f32 %f420, %f15, %f15; + mov.f32 %f419, 0f38B1E96A; + mov.f32 %f418, 0fBA574D20; + mov.f32 %f417, 0f3BAAD5EA; + mov.f32 %f416, 0fBCDC1BE7; + mov.f32 %f415, 0f3DE718AF; + mov.f32 %f414, 0fBEC093AC; + mov.f32 %f413, 0f3E0375D3; +$L__BB0_26: + .loc 1 0 0 + mul.f32 %f16, %f8, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p21, %f95, 0f3F8060FE; + fma.rn.ftz.f32 %f285, %f419, %f420, %f418; + fma.rn.ftz.f32 %f286, %f285, %f420, %f417; + fma.rn.ftz.f32 %f287, %f286, %f420, %f416; + fma.rn.ftz.f32 %f288, %f287, %f420, %f415; + fma.rn.ftz.f32 %f289, %f288, %f420, %f414; + fma.rn.ftz.f32 %f290, %f289, %f420, %f413; + neg.f32 %f291, %f420; + selp.f32 %f292, %f291, %f15, %p20; + fma.rn.ftz.f32 %f421, %f290, %f292, %f292; + @%p21 bra $L__BB0_28; + ex2.approx.ftz.f32 %f293, %f421; + sub.f32 %f295, %f358, %f293; + mov.b32 %r43, %f295; + mov.b32 %r44, %f15; + and.b32 %r45, %r44, -2147483648; + or.b32 %r46, %r45, %r43; + mov.b32 %f421, %r46; +$L__BB0_28: + abs.ftz.f32 %f108, %f16; + setp.ge.f32 %p23, %f108, 0f3F8060FE; + mov.f32 %f428, 0f3789CA3C; + mov.f32 %f427, 0fB9F560B9; + mov.f32 %f426, 0f3BAC840B; + mov.f32 %f425, 0fBD0C8162; + mov.f32 %f424, 0f3E1CF906; + mov.f32 %f423, 0f3F6A937E; + mov.f32 %f422, 0f3F20D842; + mov.f32 %f429, %f108; + @%p23 bra $L__BB0_30; + mul.f32 %f429, %f16, %f16; + mov.f32 %f428, 0f38B1E96A; + mov.f32 %f427, 0fBA574D20; + mov.f32 %f426, 0f3BAAD5EA; + mov.f32 %f425, 0fBCDC1BE7; + mov.f32 %f424, 0f3DE718AF; + mov.f32 %f423, 0fBEC093AC; + mov.f32 %f422, 0f3E0375D3; +$L__BB0_30: + .loc 1 0 23 + ld.param.u64 %rd2, [triton__0d1d2de_param_1]; + cvt.s64.s32 %rd1, %r18; + .loc 1 30 23 + setp.ltu.f32 %p24, %f108, 0f3F8060FE; + fma.rn.ftz.f32 %f310, %f428, %f429, %f427; + fma.rn.ftz.f32 %f311, %f310, %f429, %f426; + fma.rn.ftz.f32 %f312, %f311, %f429, %f425; + fma.rn.ftz.f32 %f313, %f312, %f429, %f424; + fma.rn.ftz.f32 %f314, %f313, %f429, %f423; + fma.rn.ftz.f32 %f315, %f314, %f429, %f422; + neg.f32 %f316, %f429; + selp.f32 %f317, %f316, %f16, %p23; + fma.rn.ftz.f32 %f430, %f315, %f317, %f317; + @%p24 bra $L__BB0_32; + ex2.approx.ftz.f32 %f318, %f430; + sub.f32 %f320, %f358, %f318; + mov.b32 %r47, %f320; + mov.b32 %r48, %f16; + and.b32 %r49, %r48, -2147483648; + or.b32 %r50, %r49, %r47; + mov.b32 %f430, %r50; +$L__BB0_32: + .loc 1 27 18 + mul.f32 %f321, %f8, 0f3F000000; + mul.f32 %f322, %f7, 0f3F000000; + mul.f32 %f323, %f6, 0f3F000000; + mul.f32 %f324, %f5, 0f3F000000; + mul.f32 %f325, %f4, 0f3F000000; + mul.f32 %f326, %f3, 0f3F000000; + mul.f32 %f327, %f2, 0f3F000000; + mul.f32 %f328, %f1, 0f3F000000; + .loc 1 32 18 + add.f32 %f329, %f367, 0f3F800000; + add.f32 %f330, %f376, 0f3F800000; + add.f32 %f331, %f385, 0f3F800000; + add.f32 %f332, %f394, 0f3F800000; + add.f32 %f333, %f403, 0f3F800000; + add.f32 %f334, %f412, 0f3F800000; + add.f32 %f335, %f421, 0f3F800000; + add.f32 %f336, %f430, 0f3F800000; + .loc 1 33 18 + mul.f32 %f337, %f328, %f329; + mul.f32 %f338, %f327, %f330; + mul.f32 %f339, %f326, %f331; + mul.f32 %f340, %f325, %f332; + mul.f32 %f341, %f324, %f333; + mul.f32 %f342, %f323, %f334; + mul.f32 %f343, %f322, %f335; + mul.f32 %f344, %f321, %f336; + .loc 1 35 25 + shl.b64 %rd7, %rd1, 1; + add.s64 %rd6, %rd2, %rd7; + .loc 1 35 37 + mov.b32 %r51, %f337; + cvt.rn.bf16.f32 %rs9, %r51; + mov.b32 %r52, %f338; + cvt.rn.bf16.f32 %rs10, %r52; + mov.b32 %r53, %f339; + cvt.rn.bf16.f32 %rs11, %r53; + mov.b32 %r54, %f340; + cvt.rn.bf16.f32 %rs12, %r54; + mov.b32 %r55, %f341; + cvt.rn.bf16.f32 %rs13, %r55; + mov.b32 %r56, %f342; + cvt.rn.bf16.f32 %rs14, %r56; + mov.b32 %r57, %f343; + cvt.rn.bf16.f32 %rs15, %r57; + mov.b32 %r58, %f344; + cvt.rn.bf16.f32 %rs16, %r58; + mov.b32 %r63, {%rs9, %rs10}; + mov.b32 %r64, {%rs11, %rs12}; + mov.b32 %r65, {%rs13, %rs14}; + mov.b32 %r66, {%rs15, %rs16}; + @%p1 st.global.v4.b32 [ %rd6 + 0 ], { %r63, %r64, %r65, %r66 }; + .loc 1 35 4 + ret; +$L__tmp1: +$L__func_end0: + +} + // .globl __nv_erff +.visible .func (.param .b32 func_retval0) __nv_erff( + .param .b32 __nv_erff_param_0 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<5>; + .reg .f32 %f<49>; +$L__func_begin1: + + ld.param.f32 %f14, [__nv_erff_param_0]; + abs.ftz.f32 %f1, %f14; + setp.ge.f32 %p1, %f1, 0f3F8060FE; + mov.f32 %f46, 0f3789CA3C; + mov.f32 %f45, 0fB9F560B9; + mov.f32 %f44, 0f3BAC840B; + mov.f32 %f43, 0fBD0C8162; + mov.f32 %f42, 0f3E1CF906; + mov.f32 %f41, 0f3F6A937E; + mov.f32 %f40, 0f3F20D842; + mov.f32 %f47, %f1; + @%p1 bra $L__BB1_2; + mul.f32 %f47, %f14, %f14; + mov.f32 %f46, 0f38B1E96A; + mov.f32 %f45, 0fBA574D20; + mov.f32 %f44, 0f3BAAD5EA; + mov.f32 %f43, 0fBCDC1BE7; + mov.f32 %f42, 0f3DE718AF; + mov.f32 %f41, 0fBEC093AC; + mov.f32 %f40, 0f3E0375D3; +$L__BB1_2: + setp.ltu.f32 %p2, %f1, 0f3F8060FE; + fma.rn.ftz.f32 %f29, %f46, %f47, %f45; + fma.rn.ftz.f32 %f30, %f29, %f47, %f44; + fma.rn.ftz.f32 %f31, %f30, %f47, %f43; + fma.rn.ftz.f32 %f32, %f31, %f47, %f42; + fma.rn.ftz.f32 %f33, %f32, %f47, %f41; + fma.rn.ftz.f32 %f34, %f33, %f47, %f40; + neg.f32 %f35, %f47; + selp.f32 %f36, %f35, %f14, %p1; + fma.rn.ftz.f32 %f48, %f34, %f36, %f36; + @%p2 bra $L__BB1_4; + ex2.approx.ftz.f32 %f37, %f48; + mov.f32 %f38, 0f3F800000; + sub.f32 %f39, %f38, %f37; + mov.b32 %r1, %f39; + mov.b32 %r2, %f14; + and.b32 %r3, %r2, -2147483648; + or.b32 %r4, %r3, %r1; + mov.b32 %f48, %r4; +$L__BB1_4: + st.param.f32 [func_retval0+0], %f48; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/jf/cjfoqo3nutni5cmtw4brla34cz45fusadehkxfkr2fie2qgo7vwt.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 176 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 106 +.b8 102 +.b8 111 +.b8 113 +.b8 111 +.b8 51 +.b8 110 +.b8 117 +.b8 116 +.b8 110 +.b8 105 +.b8 53 +.b8 99 +.b8 109 +.b8 116 +.b8 119 +.b8 52 +.b8 98 +.b8 114 +.b8 108 +.b8 97 +.b8 51 +.b8 52 +.b8 99 +.b8 122 +.b8 52 +.b8 53 +.b8 102 +.b8 117 +.b8 115 +.b8 97 +.b8 100 +.b8 101 +.b8 104 +.b8 107 +.b8 120 +.b8 102 +.b8 107 +.b8 114 +.b8 50 +.b8 102 +.b8 105 +.b8 101 +.b8 50 +.b8 113 +.b8 103 +.b8 111 +.b8 55 +.b8 118 +.b8 119 +.b8 116 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 106 +.b8 102 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ttir b/.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..79d21fa82a44df2150549490d783a08bf37e14f5 --- /dev/null +++ b/.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ttir @@ -0,0 +1,27 @@ +module { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<1.000000e+00> : tensor<1024xf32> + %cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32> + %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32> + %c1024_i32 = arith.constant 1024 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1024_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> + %3 = tt.splat %1 : (i32) -> tensor<1024xi32> + %4 = arith.addi %3, %2 : tensor<1024xi32> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<1024x!tt.ptr> + %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16> + %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32> + %9 = arith.mulf %8, %cst_1 : tensor<1024xf32> + %10 = arith.mulf %8, %cst_0 : tensor<1024xf32> + %11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32> + %12 = arith.addf %11, %cst : tensor<1024xf32> + %13 = arith.mulf %9, %12 : tensor<1024xf32> + %14 = tt.splat %arg1 : (!tt.ptr) -> tensor<1024x!tt.ptr> + %15 = tt.addptr %14, %4 : tensor<1024x!tt.ptr>, tensor<1024xi32> + %16 = arith.truncf %13 : tensor<1024xf32> to tensor<1024xbf16> + tt.store %15, %16 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16> + tt.return + } +} diff --git a/.triton/dump/51e329eae41e4ee17aa201fff8371d94/triton_.llir b/.triton/dump/51e329eae41e4ee17aa201fff8371d94/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..f67711bc6dcda8ca57c15c39f8f5a9551df2835d --- /dev/null +++ b/.triton/dump/51e329eae41e4ee17aa201fff8371d94/triton_.llir @@ -0,0 +1,1473 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed" +@assertFile_0 = internal constant [38 x i8] c"" +@assertMessage_0 = internal constant [39 x i8] c"index out of bounds: 0 <= tmp11 < 50257" +@global_smem = external addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr + +define void @triton__0d1d2d3d4d5d6e7de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, i64 %6, i64 %7) local_unnamed_addr !dbg !7 { + %9 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %urem = and i32 %9, 255, !dbg !10 + %10 = or i32 %urem, 256, !dbg !10 + %11 = or i32 %urem, 512, !dbg !10 + %12 = or i32 %urem, 768, !dbg !10 + %13 = shl nuw nsw i32 %urem, 2, !dbg !10 + %14 = or i32 %13, 1, !dbg !10 + %15 = or i32 %13, 2, !dbg !10 + %16 = or i32 %13, 3, !dbg !10 + %17 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #5, !dbg !11 + %18 = sext i32 %17 to i64, !dbg !12 + %19 = icmp slt i32 %17, 8, !dbg !13 + %20 = mul nsw i64 %18, 7680, !dbg !14 + %21 = mul nsw i64 %18, 385973760, !dbg !15 + %22 = getelementptr i64, ptr addrspace(1) %0, i64 %20 + %23 = shl nuw nsw i32 %urem, 3 + %24 = zext nneg i32 %23 to i64 + %25 = getelementptr float, ptr addrspace(3) @global_smem, i64 %24 + %26 = shl nuw nsw i32 %14, 1 + %27 = zext nneg i32 %26 to i64 + %28 = getelementptr float, ptr addrspace(3) @global_smem, i64 %27 + %29 = shl nuw nsw i32 %15, 1 + %30 = zext nneg i32 %29 to i64 + %31 = getelementptr float, ptr addrspace(3) @global_smem, i64 %30 + %32 = shl nuw nsw i32 %16, 1 + %33 = zext nneg i32 %32 to i64 + %34 = getelementptr float, ptr addrspace(3) @global_smem, i64 %33 + %35 = shl nuw nsw i32 %urem, 1 + %36 = zext nneg i32 %35 to i64 + %37 = getelementptr float, ptr addrspace(3) @global_smem, i64 %36 + %38 = shl nuw nsw i32 %10, 1 + %39 = zext nneg i32 %38 to i64 + %40 = getelementptr float, ptr addrspace(3) @global_smem, i64 %39 + %41 = shl nuw nsw i32 %11, 1 + %42 = zext nneg i32 %41 to i64 + %43 = getelementptr float, ptr addrspace(3) @global_smem, i64 %42 + %44 = shl nuw nsw i32 %12, 1 + %45 = zext nneg i32 %44 to i64 + %46 = getelementptr float, ptr addrspace(3) @global_smem, i64 %45 + %47 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %24 + %48 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %27 + %49 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %30 + %50 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %33 + %51 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %36 + %52 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %39 + %53 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %42 + %54 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %45 + %55 = zext nneg i32 %13 to i64 + %56 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %55 + %57 = zext nneg i32 %urem to i64 + %58 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %57 + %59 = zext nneg i32 %10 to i64 + %60 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %59 + %61 = zext nneg i32 %11 to i64 + %62 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %61 + %63 = zext nneg i32 %12 to i64 + %64 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %63 + %65 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %24 + %66 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %27 + %67 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %30 + %68 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %33 + %69 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %36 + %70 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %39 + %71 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %42 + %72 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %45 + %73 = insertelement <8 x i1> poison, i1 %19, i64 0, !dbg !16 + br label %74, !dbg !17 + +74: ; preds = %8, %__nv_logf.exit239 + %75 = phi i32 [ 0, %8 ], [ %774, %__nv_logf.exit239 ] + %76 = phi <8 x float> [ zeroinitializer, %8 ], [ %773, %__nv_logf.exit239 ] + %77 = phi <8 x i64> [ zeroinitializer, %8 ], [ %211, %__nv_logf.exit239 ] + %78 = or i32 %75, 1536, !dbg !18 + %79 = or i32 %75, %13, !dbg !18 + %80 = zext nneg i32 %79 to i64, !dbg !18 + %81 = or i32 %75, %14, !dbg !18 + %82 = zext nneg i32 %81 to i64, !dbg !18 + %83 = or i32 %75, %15, !dbg !18 + %84 = zext nneg i32 %83 to i64, !dbg !18 + %85 = or i32 %75, %16, !dbg !18 + %86 = zext nneg i32 %85 to i64, !dbg !18 + %87 = or i32 %79, 1024, !dbg !18 + %88 = zext nneg i32 %87 to i64, !dbg !18 + %89 = or i32 %79, 1025, !dbg !18 + %90 = zext nneg i32 %89 to i64, !dbg !18 + %91 = or i32 %79, 1026, !dbg !18 + %92 = zext nneg i32 %91 to i64, !dbg !18 + %93 = or i32 %79, 1027, !dbg !18 + %94 = zext nneg i32 %93 to i64, !dbg !18 + %95 = icmp ult i32 %78, 7680, !dbg !19 + %96 = icmp ult i32 %87, 7680, !dbg !19 + %97 = add nsw i64 %20, %80, !dbg !20 + %98 = add nsw i64 %20, %88, !dbg !20 + %99 = getelementptr i64, ptr addrspace(1) %0, i64 %97, !dbg !21 + %100 = getelementptr i64, ptr addrspace(1) %22, i64 %84, !dbg !21 + %101 = getelementptr i64, ptr addrspace(1) %0, i64 %98, !dbg !21 + %102 = getelementptr i64, ptr addrspace(1) %22, i64 %92, !dbg !21 + %103 = and i1 %19, %95, !dbg !22 + %104 = and i1 %19, %96, !dbg !22 + %105 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b64 { $0, $1 }, [ $2 + 0 ];\0A\09@!$4 mov.u64 $0, 0x0;\0A\09@!$5 mov.u64 $1, 0x0;", "=l,=l,l,b,b,b"(ptr addrspace(1) %99, i1 %19, i1 %19, i1 %19) #5, !dbg !23 + %106 = extractvalue { i64, i64 } %105, 0, !dbg !23 + %107 = extractvalue { i64, i64 } %105, 1, !dbg !23 + %108 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b64 { $0, $1 }, [ $2 + 0 ];\0A\09@!$4 mov.u64 $0, 0x0;\0A\09@!$5 mov.u64 $1, 0x0;", "=l,=l,l,b,b,b"(ptr addrspace(1) %100, i1 %19, i1 %19, i1 %19) #5, !dbg !23 + %109 = extractvalue { i64, i64 } %108, 0, !dbg !23 + %110 = extractvalue { i64, i64 } %108, 1, !dbg !23 + %111 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b64 { $0, $1 }, [ $2 + 0 ];\0A\09@!$4 mov.u64 $0, 0x0;\0A\09@!$5 mov.u64 $1, 0x0;", "=l,=l,l,b,b,b"(ptr addrspace(1) %101, i1 %104, i1 %104, i1 %104) #5, !dbg !23 + %112 = extractvalue { i64, i64 } %111, 0, !dbg !23 + %113 = extractvalue { i64, i64 } %111, 1, !dbg !23 + %114 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.L1::evict_first.v2.b64 { $0, $1 }, [ $2 + 0 ];\0A\09@!$4 mov.u64 $0, 0x0;\0A\09@!$5 mov.u64 $1, 0x0;", "=l,=l,l,b,b,b"(ptr addrspace(1) %102, i1 %104, i1 %104, i1 %104) #5, !dbg !23 + %115 = extractvalue { i64, i64 } %114, 0, !dbg !23 + %116 = extractvalue { i64, i64 } %114, 1, !dbg !23 + %117 = getelementptr float, ptr addrspace(1) %2, i64 %97, !dbg !24 + %118 = getelementptr float, ptr addrspace(1) %2, i64 %98, !dbg !24 + %119 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %117, i1 %19, i32 0, i1 %19, i32 0, i1 %19, i32 0, i1 %19, i32 0, i1 %19) #5, !dbg !25 + %120 = extractvalue { i32, i32, i32, i32 } %119, 0, !dbg !25 + %121 = extractvalue { i32, i32, i32, i32 } %119, 1, !dbg !25 + %122 = extractvalue { i32, i32, i32, i32 } %119, 2, !dbg !25 + %123 = extractvalue { i32, i32, i32, i32 } %119, 3, !dbg !25 + %124 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %118, i1 %104, i32 0, i1 %104, i32 0, i1 %104, i32 0, i1 %104, i32 0, i1 %104) #5, !dbg !25 + %125 = extractvalue { i32, i32, i32, i32 } %124, 0, !dbg !25 + %126 = extractvalue { i32, i32, i32, i32 } %124, 1, !dbg !25 + %127 = extractvalue { i32, i32, i32, i32 } %124, 2, !dbg !25 + %128 = extractvalue { i32, i32, i32, i32 } %124, 3, !dbg !25 + tail call void @llvm.nvvm.barrier0(), !dbg !25 + store i32 %120, ptr addrspace(3) %25, align 4, !dbg !25 + store i32 %121, ptr addrspace(3) %28, align 4, !dbg !25 + store i32 %122, ptr addrspace(3) %31, align 4, !dbg !25 + store i32 %123, ptr addrspace(3) %34, align 4, !dbg !25 + tail call void @llvm.nvvm.barrier0(), !dbg !25 + %129 = load float, ptr addrspace(3) %37, align 4, !dbg !25 + %130 = load float, ptr addrspace(3) %40, align 4, !dbg !25 + %131 = load float, ptr addrspace(3) %43, align 4, !dbg !25 + %132 = load float, ptr addrspace(3) %46, align 4, !dbg !25 + tail call void @llvm.nvvm.barrier0(), !dbg !25 + store i32 %125, ptr addrspace(3) %25, align 4, !dbg !25 + store i32 %126, ptr addrspace(3) %28, align 4, !dbg !25 + store i32 %127, ptr addrspace(3) %31, align 4, !dbg !25 + store i32 %128, ptr addrspace(3) %34, align 4, !dbg !25 + tail call void @llvm.nvvm.barrier0(), !dbg !25 + %133 = load float, ptr addrspace(3) %37, align 4, !dbg !25 + %134 = load float, ptr addrspace(3) %40, align 4, !dbg !25 + %135 = load float, ptr addrspace(3) %43, align 4, !dbg !25 + %136 = load float, ptr addrspace(3) %46, align 4, !dbg !25 + %137 = getelementptr float, ptr addrspace(1) %3, i64 %97, !dbg !26 + %138 = getelementptr float, ptr addrspace(1) %3, i64 %98, !dbg !26 + %139 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %137, i1 %19, i32 0, i1 %19, i32 0, i1 %19, i32 0, i1 %19, i32 0, i1 %19) #5, !dbg !27 + %140 = extractvalue { i32, i32, i32, i32 } %139, 0, !dbg !27 + %141 = extractvalue { i32, i32, i32, i32 } %139, 1, !dbg !27 + %142 = extractvalue { i32, i32, i32, i32 } %139, 2, !dbg !27 + %143 = extractvalue { i32, i32, i32, i32 } %139, 3, !dbg !27 + %144 = bitcast i32 %140 to float, !dbg !27 + %145 = bitcast i32 %141 to float, !dbg !27 + %146 = bitcast i32 %142 to float, !dbg !27 + %147 = bitcast i32 %143 to float, !dbg !27 + %148 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_first.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %138, i1 %104, i32 0, i1 %104, i32 0, i1 %104, i32 0, i1 %104, i32 0, i1 %104) #5, !dbg !27 + %149 = extractvalue { i32, i32, i32, i32 } %148, 0, !dbg !27 + %150 = extractvalue { i32, i32, i32, i32 } %148, 1, !dbg !27 + %151 = extractvalue { i32, i32, i32, i32 } %148, 2, !dbg !27 + %152 = extractvalue { i32, i32, i32, i32 } %148, 3, !dbg !27 + %153 = bitcast i32 %149 to float, !dbg !27 + %154 = bitcast i32 %150 to float, !dbg !27 + %155 = bitcast i32 %151 to float, !dbg !27 + %156 = bitcast i32 %152 to float, !dbg !27 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %157 = insertelement <8 x i64> poison, i64 %106, i64 0, !dbg !28 + %158 = insertelement <8 x i64> %157, i64 %107, i64 1, !dbg !28 + %159 = insertelement <8 x i64> %158, i64 %109, i64 2, !dbg !28 + %160 = insertelement <8 x i64> %159, i64 %110, i64 3, !dbg !28 + %161 = insertelement <8 x i64> %160, i64 %112, i64 4, !dbg !28 + %162 = insertelement <8 x i64> %161, i64 %113, i64 5, !dbg !28 + %163 = insertelement <8 x i64> %162, i64 %115, i64 6, !dbg !28 + %164 = insertelement <8 x i64> %163, i64 %116, i64 7, !dbg !28 + %165 = icmp ne <8 x i64> %164, , !dbg !28 + %166 = extractelement <8 x i1> %165, i64 0, !dbg !29 + %167 = zext i1 %166 to i8, !dbg !28 + %168 = insertelement <1 x i8> undef, i8 %167, i64 0, !dbg !28 + store <1 x i8> %168, ptr addrspace(3) %47, align 1, !dbg !28 + %169 = extractelement <8 x i1> %165, i64 1, !dbg !29 + %170 = zext i1 %169 to i8, !dbg !28 + %171 = insertelement <1 x i8> undef, i8 %170, i64 0, !dbg !28 + store <1 x i8> %171, ptr addrspace(3) %48, align 1, !dbg !28 + %172 = extractelement <8 x i1> %165, i64 2, !dbg !29 + %173 = zext i1 %172 to i8, !dbg !28 + %174 = insertelement <1 x i8> undef, i8 %173, i64 0, !dbg !28 + store <1 x i8> %174, ptr addrspace(3) %49, align 1, !dbg !28 + %175 = extractelement <8 x i1> %165, i64 3, !dbg !29 + %176 = zext i1 %175 to i8, !dbg !28 + %177 = insertelement <1 x i8> undef, i8 %176, i64 0, !dbg !28 + store <1 x i8> %177, ptr addrspace(3) %50, align 1, !dbg !28 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %178 = load i8, ptr addrspace(3) %51, align 1, !dbg !28 + %179 = load i8, ptr addrspace(3) %52, align 1, !dbg !28 + %180 = load i8, ptr addrspace(3) %53, align 1, !dbg !28 + %181 = load i8, ptr addrspace(3) %54, align 1, !dbg !28 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %182 = extractelement <8 x i1> %165, i64 4, !dbg !29 + %183 = zext i1 %182 to i8, !dbg !28 + %184 = insertelement <1 x i8> undef, i8 %183, i64 0, !dbg !28 + store <1 x i8> %184, ptr addrspace(3) %47, align 1, !dbg !28 + %185 = extractelement <8 x i1> %165, i64 5, !dbg !29 + %186 = zext i1 %185 to i8, !dbg !28 + %187 = insertelement <1 x i8> undef, i8 %186, i64 0, !dbg !28 + store <1 x i8> %187, ptr addrspace(3) %48, align 1, !dbg !28 + %188 = extractelement <8 x i1> %165, i64 6, !dbg !29 + %189 = zext i1 %188 to i8, !dbg !28 + %190 = insertelement <1 x i8> undef, i8 %189, i64 0, !dbg !28 + store <1 x i8> %190, ptr addrspace(3) %49, align 1, !dbg !28 + %191 = extractelement <8 x i1> %165, i64 7, !dbg !29 + %192 = zext i1 %191 to i8, !dbg !28 + %193 = insertelement <1 x i8> undef, i8 %192, i64 0, !dbg !28 + store <1 x i8> %193, ptr addrspace(3) %50, align 1, !dbg !28 + tail call void @llvm.nvvm.barrier0(), !dbg !28 + %194 = load i8, ptr addrspace(3) %51, align 1, !dbg !28 + %195 = load i8, ptr addrspace(3) %52, align 1, !dbg !28 + %196 = load i8, ptr addrspace(3) %53, align 1, !dbg !28 + %197 = load i8, ptr addrspace(3) %54, align 1, !dbg !28 + %198 = insertelement <8 x i8> poison, i8 %178, i64 0, !dbg !28 + %199 = insertelement <8 x i8> %198, i8 %179, i64 1, !dbg !28 + %200 = insertelement <8 x i8> %199, i8 %180, i64 2, !dbg !28 + %201 = insertelement <8 x i8> %200, i8 %181, i64 3, !dbg !28 + %202 = insertelement <8 x i8> %201, i8 %194, i64 4, !dbg !28 + %203 = insertelement <8 x i8> %202, i8 %195, i64 5, !dbg !28 + %204 = insertelement <8 x i8> %203, i8 %196, i64 6, !dbg !28 + %205 = insertelement <8 x i8> %204, i8 %197, i64 7, !dbg !28 + %206 = icmp eq <8 x i8> %205, zeroinitializer, !dbg !28 + %207 = insertelement <8 x i1> %73, i1 %104, i64 1, !dbg !30 + %208 = shufflevector <8 x i1> %207, <8 x i1> poison, <8 x i32> , !dbg !30 + %209 = select <8 x i1> %208, <8 x i1> %165, <8 x i1> zeroinitializer, !dbg !30 + %210 = zext <8 x i1> %209 to <8 x i64>, !dbg !30 + %211 = add <8 x i64> %77, %210, !dbg !30 + tail call void @llvm.nvvm.barrier0(), !dbg !31 + %212 = shufflevector <8 x i1> %165, <8 x i1> poison, <4 x i32> , !dbg !29 + %213 = insertelement <4 x i64> poison, i64 %106, i64 0, !dbg !29 + %214 = insertelement <4 x i64> %213, i64 %107, i64 1, !dbg !29 + %215 = insertelement <4 x i64> %214, i64 %109, i64 2, !dbg !29 + %216 = insertelement <4 x i64> %215, i64 %110, i64 3, !dbg !29 + %217 = select <4 x i1> %212, <4 x i64> %216, <4 x i64> zeroinitializer, !dbg !29 + %218 = add <4 x i64> %217, , !dbg !32 + %219 = icmp slt <4 x i64> %217, zeroinitializer, !dbg !33 + %220 = select <4 x i1> %219, <4 x i64> %218, <4 x i64> %217, !dbg !34 + %221 = icmp ult <4 x i64> %220, , !dbg !31 + %222 = zext <4 x i1> %221 to <4 x i8>, !dbg !31 + store <4 x i8> %222, ptr addrspace(3) %56, align 4, !dbg !31 + tail call void @llvm.nvvm.barrier0(), !dbg !31 + %223 = load i8, ptr addrspace(3) %58, align 1, !dbg !31 + %224 = load i8, ptr addrspace(3) %60, align 1, !dbg !31 + %225 = load i8, ptr addrspace(3) %62, align 1, !dbg !31 + %226 = load i8, ptr addrspace(3) %64, align 1, !dbg !31 + tail call void @llvm.nvvm.barrier0(), !dbg !31 + %227 = shufflevector <8 x i1> %165, <8 x i1> poison, <4 x i32> , !dbg !29 + %228 = insertelement <4 x i64> poison, i64 %112, i64 0, !dbg !29 + %229 = insertelement <4 x i64> %228, i64 %113, i64 1, !dbg !29 + %230 = insertelement <4 x i64> %229, i64 %115, i64 2, !dbg !29 + %231 = insertelement <4 x i64> %230, i64 %116, i64 3, !dbg !29 + %232 = select <4 x i1> %227, <4 x i64> %231, <4 x i64> zeroinitializer, !dbg !29 + %233 = add <4 x i64> %232, , !dbg !32 + %234 = icmp slt <4 x i64> %232, zeroinitializer, !dbg !33 + %235 = select <4 x i1> %234, <4 x i64> %233, <4 x i64> %232, !dbg !34 + %236 = icmp ult <4 x i64> %235, , !dbg !31 + %237 = zext <4 x i1> %236 to <4 x i8>, !dbg !31 + store <4 x i8> %237, ptr addrspace(3) %56, align 4, !dbg !31 + tail call void @llvm.nvvm.barrier0(), !dbg !31 + %238 = load i8, ptr addrspace(3) %58, align 1, !dbg !31 + %239 = load i8, ptr addrspace(3) %60, align 1, !dbg !31 + %240 = load i8, ptr addrspace(3) %62, align 1, !dbg !31 + %241 = load i8, ptr addrspace(3) %64, align 1, !dbg !31 + %242 = insertelement <8 x i8> poison, i8 %224, i64 0, !dbg !31 + %243 = insertelement <8 x i8> %242, i8 %223, i64 1, !dbg !31 + %244 = insertelement <8 x i8> %243, i8 %225, i64 2, !dbg !31 + %245 = insertelement <8 x i8> %244, i8 %226, i64 3, !dbg !31 + %246 = insertelement <8 x i8> %245, i8 %238, i64 4, !dbg !31 + %247 = insertelement <8 x i8> %246, i8 %239, i64 5, !dbg !31 + %248 = insertelement <8 x i8> %247, i8 %240, i64 6, !dbg !31 + %249 = insertelement <8 x i8> %248, i8 %241, i64 7, !dbg !31 + %250 = icmp eq <8 x i8> %249, zeroinitializer, !dbg !31 + %251 = bitcast <8 x i1> %250 to i8, !dbg !35 + %.not = icmp eq i8 %251, 0, !dbg !35 + br i1 %.not, label %253, label %252, !dbg !35 + +252: ; preds = %74 + tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !35 + br label %253, !dbg !35 + +253: ; preds = %252, %74 + %254 = mul nuw nsw i64 %80, 50257, !dbg !36 + %255 = mul nuw nsw i64 %82, 50257, !dbg !36 + %256 = mul nuw nsw i64 %84, 50257, !dbg !36 + %257 = mul nuw nsw i64 %86, 50257, !dbg !36 + %258 = mul nuw nsw i64 %88, 50257, !dbg !36 + %259 = mul nuw nsw i64 %90, 50257, !dbg !36 + %260 = mul nuw nsw i64 %92, 50257, !dbg !36 + %261 = mul nuw nsw i64 %94, 50257, !dbg !36 + %262 = extractelement <4 x i64> %220, i64 0, !dbg !37 + %263 = getelementptr i16, ptr addrspace(1) %1, i64 %262, !dbg !37 + %264 = getelementptr i16, ptr addrspace(1) %263, i64 %254, !dbg !37 + %265 = getelementptr i16, ptr addrspace(1) %264, i64 %21, !dbg !37 + %266 = extractelement <4 x i64> %220, i64 1, !dbg !37 + %267 = getelementptr i16, ptr addrspace(1) %1, i64 %266, !dbg !37 + %268 = getelementptr i16, ptr addrspace(1) %267, i64 %255, !dbg !37 + %269 = getelementptr i16, ptr addrspace(1) %268, i64 %21, !dbg !37 + %270 = extractelement <4 x i64> %220, i64 2, !dbg !37 + %271 = getelementptr i16, ptr addrspace(1) %1, i64 %270, !dbg !37 + %272 = getelementptr i16, ptr addrspace(1) %271, i64 %256, !dbg !37 + %273 = getelementptr i16, ptr addrspace(1) %272, i64 %21, !dbg !37 + %274 = extractelement <4 x i64> %220, i64 3, !dbg !37 + %275 = getelementptr i16, ptr addrspace(1) %1, i64 %274, !dbg !37 + %276 = getelementptr i16, ptr addrspace(1) %275, i64 %257, !dbg !37 + %277 = getelementptr i16, ptr addrspace(1) %276, i64 %21, !dbg !37 + %278 = extractelement <4 x i64> %235, i64 0, !dbg !37 + %279 = getelementptr i16, ptr addrspace(1) %1, i64 %278, !dbg !37 + %280 = getelementptr i16, ptr addrspace(1) %279, i64 %258, !dbg !37 + %281 = getelementptr i16, ptr addrspace(1) %280, i64 %21, !dbg !37 + %282 = extractelement <4 x i64> %235, i64 1, !dbg !37 + %283 = getelementptr i16, ptr addrspace(1) %1, i64 %282, !dbg !37 + %284 = getelementptr i16, ptr addrspace(1) %283, i64 %259, !dbg !37 + %285 = getelementptr i16, ptr addrspace(1) %284, i64 %21, !dbg !37 + %286 = extractelement <4 x i64> %235, i64 2, !dbg !37 + %287 = getelementptr i16, ptr addrspace(1) %1, i64 %286, !dbg !37 + %288 = getelementptr i16, ptr addrspace(1) %287, i64 %260, !dbg !37 + %289 = getelementptr i16, ptr addrspace(1) %288, i64 %21, !dbg !37 + %290 = extractelement <4 x i64> %235, i64 3, !dbg !37 + %291 = getelementptr i16, ptr addrspace(1) %1, i64 %290, !dbg !37 + %292 = getelementptr i16, ptr addrspace(1) %291, i64 %261, !dbg !37 + %293 = getelementptr i16, ptr addrspace(1) %292, i64 %21, !dbg !37 + tail call void @llvm.nvvm.barrier0(), !dbg !38 + %294 = ptrtoint ptr addrspace(1) %265 to i64, !dbg !38 + %295 = insertelement <1 x i64> undef, i64 %294, i64 0, !dbg !38 + store <1 x i64> %295, ptr addrspace(3) %65, align 8, !dbg !38 + %296 = ptrtoint ptr addrspace(1) %269 to i64, !dbg !38 + %297 = insertelement <1 x i64> undef, i64 %296, i64 0, !dbg !38 + store <1 x i64> %297, ptr addrspace(3) %66, align 8, !dbg !38 + %298 = ptrtoint ptr addrspace(1) %273 to i64, !dbg !38 + %299 = insertelement <1 x i64> undef, i64 %298, i64 0, !dbg !38 + store <1 x i64> %299, ptr addrspace(3) %67, align 8, !dbg !38 + %300 = ptrtoint ptr addrspace(1) %277 to i64, !dbg !38 + %301 = insertelement <1 x i64> undef, i64 %300, i64 0, !dbg !38 + store <1 x i64> %301, ptr addrspace(3) %68, align 8, !dbg !38 + tail call void @llvm.nvvm.barrier0(), !dbg !38 + %302 = load i64, ptr addrspace(3) %69, align 8, !dbg !38 + %303 = inttoptr i64 %302 to ptr addrspace(1), !dbg !38 + %304 = load i64, ptr addrspace(3) %70, align 8, !dbg !38 + %305 = inttoptr i64 %304 to ptr addrspace(1), !dbg !38 + %306 = load i64, ptr addrspace(3) %71, align 8, !dbg !38 + %307 = inttoptr i64 %306 to ptr addrspace(1), !dbg !38 + %308 = load i64, ptr addrspace(3) %72, align 8, !dbg !38 + %309 = inttoptr i64 %308 to ptr addrspace(1), !dbg !38 + tail call void @llvm.nvvm.barrier0(), !dbg !38 + %310 = ptrtoint ptr addrspace(1) %281 to i64, !dbg !38 + %311 = insertelement <1 x i64> undef, i64 %310, i64 0, !dbg !38 + store <1 x i64> %311, ptr addrspace(3) %65, align 8, !dbg !38 + %312 = ptrtoint ptr addrspace(1) %285 to i64, !dbg !38 + %313 = insertelement <1 x i64> undef, i64 %312, i64 0, !dbg !38 + store <1 x i64> %313, ptr addrspace(3) %66, align 8, !dbg !38 + %314 = ptrtoint ptr addrspace(1) %289 to i64, !dbg !38 + %315 = insertelement <1 x i64> undef, i64 %314, i64 0, !dbg !38 + store <1 x i64> %315, ptr addrspace(3) %67, align 8, !dbg !38 + %316 = ptrtoint ptr addrspace(1) %293 to i64, !dbg !38 + %317 = insertelement <1 x i64> undef, i64 %316, i64 0, !dbg !38 + store <1 x i64> %317, ptr addrspace(3) %68, align 8, !dbg !38 + tail call void @llvm.nvvm.barrier0(), !dbg !38 + %318 = load i64, ptr addrspace(3) %69, align 8, !dbg !38 + %319 = inttoptr i64 %318 to ptr addrspace(1), !dbg !38 + %320 = load i64, ptr addrspace(3) %70, align 8, !dbg !38 + %321 = inttoptr i64 %320 to ptr addrspace(1), !dbg !38 + %322 = load i64, ptr addrspace(3) %71, align 8, !dbg !38 + %323 = inttoptr i64 %322 to ptr addrspace(1), !dbg !38 + %324 = load i64, ptr addrspace(3) %72, align 8, !dbg !38 + %325 = inttoptr i64 %324 to ptr addrspace(1), !dbg !38 + %326 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %303, i1 %19, i16 0, i1 %19) #5, !dbg !38 + %327 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %305, i1 %19, i16 0, i1 %19) #5, !dbg !38 + %328 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %307, i1 %19, i16 0, i1 %19) #5, !dbg !38 + %329 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %309, i1 %19, i16 0, i1 %19) #5, !dbg !38 + %330 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %319, i1 %19, i16 0, i1 %19) #5, !dbg !38 + %331 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %321, i1 %19, i16 0, i1 %19) #5, !dbg !38 + %332 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %323, i1 %103, i16 0, i1 %103) #5, !dbg !38 + %333 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %325, i1 %103, i16 0, i1 %103) #5, !dbg !38 + %334 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %326) #5, !dbg !39 + %335 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %327) #5, !dbg !39 + %336 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %328) #5, !dbg !39 + %337 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %329) #5, !dbg !39 + %338 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %330) #5, !dbg !39 + %339 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %331) #5, !dbg !39 + %340 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %332) #5, !dbg !39 + %341 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %333) #5, !dbg !39 + %342 = insertelement <8 x float> poison, float %334, i64 0, !dbg !40 + %343 = insertelement <8 x float> %342, float %335, i64 1, !dbg !40 + %344 = insertelement <8 x float> %343, float %336, i64 2, !dbg !40 + %345 = insertelement <8 x float> %344, float %337, i64 3, !dbg !40 + %346 = insertelement <8 x float> %345, float %338, i64 4, !dbg !40 + %347 = insertelement <8 x float> %346, float %339, i64 5, !dbg !40 + %348 = insertelement <8 x float> %347, float %340, i64 6, !dbg !40 + %349 = insertelement <8 x float> %348, float %341, i64 7, !dbg !40 + %350 = insertelement <8 x float> poison, float %129, i64 0, !dbg !40 + %351 = insertelement <8 x float> %350, float %130, i64 1, !dbg !40 + %352 = insertelement <8 x float> %351, float %131, i64 2, !dbg !40 + %353 = insertelement <8 x float> %352, float %132, i64 3, !dbg !40 + %354 = insertelement <8 x float> %353, float %133, i64 4, !dbg !40 + %355 = insertelement <8 x float> %354, float %134, i64 5, !dbg !40 + %356 = insertelement <8 x float> %355, float %135, i64 6, !dbg !40 + %357 = insertelement <8 x float> %356, float %136, i64 7, !dbg !40 + %358 = fsub <8 x float> %349, %357, !dbg !40 + %359 = fcmp olt float %144, 0x3810000000000000, !dbg !41 + %360 = fmul float %144, 0x4160000000000000, !dbg !41 + %.02.i = select i1 %359, float %360, float %144, !dbg !41 + %i.i.0.i = select i1 %359, float -2.300000e+01, float 0.000000e+00, !dbg !41 + %361 = bitcast float %.02.i to i32, !dbg !41 + %362 = add i32 %361, -1059760811, !dbg !41 + %363 = and i32 %362, -8388608, !dbg !41 + %364 = sub i32 %361, %363, !dbg !41 + %365 = bitcast i32 %364 to float, !dbg !41 + %366 = sitofp i32 %363 to float, !dbg !41 + %367 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not.i = icmp eq i32 %367, 0, !dbg !41 + %368 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %366, float 0x3E80000000000000, float %i.i.0.i) #5, !dbg !41 + %369 = tail call float @llvm.nvvm.fma.rn.f(float %366, float 0x3E80000000000000, float %i.i.0.i) #5, !dbg !41 + %.08.i = select i1 %.not.i, float %369, float %368, !dbg !41 + %370 = fadd float %365, -1.000000e+00, !dbg !41 + %371 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not1.i = icmp eq i32 %371, 0, !dbg !41 + %372 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %370, float 0x3FC2073EC0000000) #5, !dbg !41 + %373 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %370, float 0x3FC2073EC0000000) #5, !dbg !41 + %.010.i = select i1 %.not1.i, float %373, float %372, !dbg !41 + %374 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not2.i = icmp eq i32 %374, 0, !dbg !41 + %375 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i, float %370, float 0xBFBF19B980000000) #5, !dbg !41 + %376 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i, float %370, float 0xBFBF19B980000000) #5, !dbg !41 + %.011.i = select i1 %.not2.i, float %376, float %375, !dbg !41 + %377 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not3.i = icmp eq i32 %377, 0, !dbg !41 + %378 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i, float %370, float 0x3FC1E52AA0000000) #5, !dbg !41 + %379 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i, float %370, float 0x3FC1E52AA0000000) #5, !dbg !41 + %.012.i = select i1 %.not3.i, float %379, float %378, !dbg !41 + %380 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not4.i = icmp eq i32 %380, 0, !dbg !41 + %381 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i, float %370, float 0xBFC55B1720000000) #5, !dbg !41 + %382 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i, float %370, float 0xBFC55B1720000000) #5, !dbg !41 + %.09.i = select i1 %.not4.i, float %382, float %381, !dbg !41 + %383 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not5.i = icmp eq i32 %383, 0, !dbg !41 + %384 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i, float %370, float 0x3FC99DA160000000) #5, !dbg !41 + %385 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i, float %370, float 0x3FC99DA160000000) #5, !dbg !41 + %.05.i = select i1 %.not5.i, float %385, float %384, !dbg !41 + %386 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not6.i = icmp eq i32 %386, 0, !dbg !41 + %387 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i, float %370, float 0xBFCFFFE440000000) #5, !dbg !41 + %388 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i, float %370, float 0xBFCFFFE440000000) #5, !dbg !41 + %.01.i = select i1 %.not6.i, float %388, float %387, !dbg !41 + %389 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not7.i = icmp eq i32 %389, 0, !dbg !41 + %390 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i, float %370, float 0x3FD5554F00000000) #5, !dbg !41 + %391 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i, float %370, float 0x3FD5554F00000000) #5, !dbg !41 + %.0.i = select i1 %.not7.i, float %391, float %390, !dbg !41 + %392 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not8.i = icmp eq i32 %392, 0, !dbg !41 + %393 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i, float %370, float -5.000000e-01) #5, !dbg !41 + %394 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i, float %370, float -5.000000e-01) #5, !dbg !41 + %.07.i = select i1 %.not8.i, float %394, float %393, !dbg !41 + %395 = fmul float %370, %.07.i, !dbg !41 + %396 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not9.i = icmp eq i32 %396, 0, !dbg !41 + %397 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %395, float %370, float %370) #5, !dbg !41 + %398 = tail call float @llvm.nvvm.fma.rn.f(float %395, float %370, float %370) #5, !dbg !41 + %.06.i = select i1 %.not9.i, float %398, float %397, !dbg !41 + %399 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not10.i = icmp eq i32 %399, 0, !dbg !41 + %400 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i, float 0x3FE62E4300000000, float %.06.i) #5, !dbg !41 + %401 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i, float 0x3FE62E4300000000, float %.06.i) #5, !dbg !41 + %.04.i = select i1 %.not10.i, float %401, float %400, !dbg !41 + %402 = icmp ugt i32 %361, 2139095039, !dbg !41 + br i1 %402, label %__nv_fmaf_rn.exit.i.i, label %__nv_logf.exit, !dbg !41 + +__nv_fmaf_rn.exit.i.i: ; preds = %253 + %403 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not11.i = icmp eq i32 %403, 0, !dbg !41 + %404 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41 + %405 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41 + %.03.i = select i1 %.not11.i, float %405, float %404, !dbg !41 + br label %__nv_logf.exit, !dbg !41 + +__nv_logf.exit: ; preds = %253, %__nv_fmaf_rn.exit.i.i + %r.i.0.i = phi float [ %.03.i, %__nv_fmaf_rn.exit.i.i ], [ %.04.i, %253 ], !dbg !41 + %406 = fcmp oeq float %.02.i, 0.000000e+00, !dbg !41 + %r.i.1.i = select i1 %406, float 0xFFF0000000000000, float %r.i.0.i, !dbg !41 + %407 = fcmp olt float %145, 0x3810000000000000, !dbg !41 + %408 = fmul float %145, 0x4160000000000000, !dbg !41 + %.02.i30 = select i1 %407, float %408, float %145, !dbg !41 + %i.i.0.i31 = select i1 %407, float -2.300000e+01, float 0.000000e+00, !dbg !41 + %409 = bitcast float %.02.i30 to i32, !dbg !41 + %410 = add i32 %409, -1059760811, !dbg !41 + %411 = and i32 %410, -8388608, !dbg !41 + %412 = sub i32 %409, %411, !dbg !41 + %413 = bitcast i32 %412 to float, !dbg !41 + %414 = sitofp i32 %411 to float, !dbg !41 + %415 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not.i32 = icmp eq i32 %415, 0, !dbg !41 + %416 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %414, float 0x3E80000000000000, float %i.i.0.i31) #5, !dbg !41 + %417 = tail call float @llvm.nvvm.fma.rn.f(float %414, float 0x3E80000000000000, float %i.i.0.i31) #5, !dbg !41 + %.08.i33 = select i1 %.not.i32, float %417, float %416, !dbg !41 + %418 = fadd float %413, -1.000000e+00, !dbg !41 + %419 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not1.i34 = icmp eq i32 %419, 0, !dbg !41 + %420 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %418, float 0x3FC2073EC0000000) #5, !dbg !41 + %421 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %418, float 0x3FC2073EC0000000) #5, !dbg !41 + %.010.i35 = select i1 %.not1.i34, float %421, float %420, !dbg !41 + %422 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not2.i36 = icmp eq i32 %422, 0, !dbg !41 + %423 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i35, float %418, float 0xBFBF19B980000000) #5, !dbg !41 + %424 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i35, float %418, float 0xBFBF19B980000000) #5, !dbg !41 + %.011.i37 = select i1 %.not2.i36, float %424, float %423, !dbg !41 + %425 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not3.i38 = icmp eq i32 %425, 0, !dbg !41 + %426 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i37, float %418, float 0x3FC1E52AA0000000) #5, !dbg !41 + %427 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i37, float %418, float 0x3FC1E52AA0000000) #5, !dbg !41 + %.012.i39 = select i1 %.not3.i38, float %427, float %426, !dbg !41 + %428 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not4.i40 = icmp eq i32 %428, 0, !dbg !41 + %429 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i39, float %418, float 0xBFC55B1720000000) #5, !dbg !41 + %430 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i39, float %418, float 0xBFC55B1720000000) #5, !dbg !41 + %.09.i41 = select i1 %.not4.i40, float %430, float %429, !dbg !41 + %431 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not5.i42 = icmp eq i32 %431, 0, !dbg !41 + %432 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i41, float %418, float 0x3FC99DA160000000) #5, !dbg !41 + %433 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i41, float %418, float 0x3FC99DA160000000) #5, !dbg !41 + %.05.i43 = select i1 %.not5.i42, float %433, float %432, !dbg !41 + %434 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not6.i44 = icmp eq i32 %434, 0, !dbg !41 + %435 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i43, float %418, float 0xBFCFFFE440000000) #5, !dbg !41 + %436 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i43, float %418, float 0xBFCFFFE440000000) #5, !dbg !41 + %.01.i45 = select i1 %.not6.i44, float %436, float %435, !dbg !41 + %437 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not7.i46 = icmp eq i32 %437, 0, !dbg !41 + %438 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i45, float %418, float 0x3FD5554F00000000) #5, !dbg !41 + %439 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i45, float %418, float 0x3FD5554F00000000) #5, !dbg !41 + %.0.i47 = select i1 %.not7.i46, float %439, float %438, !dbg !41 + %440 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not8.i48 = icmp eq i32 %440, 0, !dbg !41 + %441 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i47, float %418, float -5.000000e-01) #5, !dbg !41 + %442 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i47, float %418, float -5.000000e-01) #5, !dbg !41 + %.07.i49 = select i1 %.not8.i48, float %442, float %441, !dbg !41 + %443 = fmul float %418, %.07.i49, !dbg !41 + %444 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not9.i50 = icmp eq i32 %444, 0, !dbg !41 + %445 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %443, float %418, float %418) #5, !dbg !41 + %446 = tail call float @llvm.nvvm.fma.rn.f(float %443, float %418, float %418) #5, !dbg !41 + %.06.i51 = select i1 %.not9.i50, float %446, float %445, !dbg !41 + %447 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not10.i52 = icmp eq i32 %447, 0, !dbg !41 + %448 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i33, float 0x3FE62E4300000000, float %.06.i51) #5, !dbg !41 + %449 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i33, float 0x3FE62E4300000000, float %.06.i51) #5, !dbg !41 + %.04.i53 = select i1 %.not10.i52, float %449, float %448, !dbg !41 + %450 = icmp ugt i32 %409, 2139095039, !dbg !41 + br i1 %450, label %__nv_fmaf_rn.exit.i.i56, label %__nv_logf.exit59, !dbg !41 + +__nv_fmaf_rn.exit.i.i56: ; preds = %__nv_logf.exit + %451 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not11.i57 = icmp eq i32 %451, 0, !dbg !41 + %452 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i30, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41 + %453 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i30, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41 + %.03.i58 = select i1 %.not11.i57, float %453, float %452, !dbg !41 + br label %__nv_logf.exit59, !dbg !41 + +__nv_logf.exit59: ; preds = %__nv_logf.exit, %__nv_fmaf_rn.exit.i.i56 + %r.i.0.i54 = phi float [ %.03.i58, %__nv_fmaf_rn.exit.i.i56 ], [ %.04.i53, %__nv_logf.exit ], !dbg !41 + %454 = fcmp oeq float %.02.i30, 0.000000e+00, !dbg !41 + %r.i.1.i55 = select i1 %454, float 0xFFF0000000000000, float %r.i.0.i54, !dbg !41 + %455 = fcmp olt float %146, 0x3810000000000000, !dbg !41 + %456 = fmul float %146, 0x4160000000000000, !dbg !41 + %.02.i60 = select i1 %455, float %456, float %146, !dbg !41 + %i.i.0.i61 = select i1 %455, float -2.300000e+01, float 0.000000e+00, !dbg !41 + %457 = bitcast float %.02.i60 to i32, !dbg !41 + %458 = add i32 %457, -1059760811, !dbg !41 + %459 = and i32 %458, -8388608, !dbg !41 + %460 = sub i32 %457, %459, !dbg !41 + %461 = bitcast i32 %460 to float, !dbg !41 + %462 = sitofp i32 %459 to float, !dbg !41 + %463 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not.i62 = icmp eq i32 %463, 0, !dbg !41 + %464 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %462, float 0x3E80000000000000, float %i.i.0.i61) #5, !dbg !41 + %465 = tail call float @llvm.nvvm.fma.rn.f(float %462, float 0x3E80000000000000, float %i.i.0.i61) #5, !dbg !41 + %.08.i63 = select i1 %.not.i62, float %465, float %464, !dbg !41 + %466 = fadd float %461, -1.000000e+00, !dbg !41 + %467 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not1.i64 = icmp eq i32 %467, 0, !dbg !41 + %468 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %466, float 0x3FC2073EC0000000) #5, !dbg !41 + %469 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %466, float 0x3FC2073EC0000000) #5, !dbg !41 + %.010.i65 = select i1 %.not1.i64, float %469, float %468, !dbg !41 + %470 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not2.i66 = icmp eq i32 %470, 0, !dbg !41 + %471 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i65, float %466, float 0xBFBF19B980000000) #5, !dbg !41 + %472 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i65, float %466, float 0xBFBF19B980000000) #5, !dbg !41 + %.011.i67 = select i1 %.not2.i66, float %472, float %471, !dbg !41 + %473 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not3.i68 = icmp eq i32 %473, 0, !dbg !41 + %474 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i67, float %466, float 0x3FC1E52AA0000000) #5, !dbg !41 + %475 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i67, float %466, float 0x3FC1E52AA0000000) #5, !dbg !41 + %.012.i69 = select i1 %.not3.i68, float %475, float %474, !dbg !41 + %476 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not4.i70 = icmp eq i32 %476, 0, !dbg !41 + %477 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i69, float %466, float 0xBFC55B1720000000) #5, !dbg !41 + %478 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i69, float %466, float 0xBFC55B1720000000) #5, !dbg !41 + %.09.i71 = select i1 %.not4.i70, float %478, float %477, !dbg !41 + %479 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not5.i72 = icmp eq i32 %479, 0, !dbg !41 + %480 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i71, float %466, float 0x3FC99DA160000000) #5, !dbg !41 + %481 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i71, float %466, float 0x3FC99DA160000000) #5, !dbg !41 + %.05.i73 = select i1 %.not5.i72, float %481, float %480, !dbg !41 + %482 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not6.i74 = icmp eq i32 %482, 0, !dbg !41 + %483 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i73, float %466, float 0xBFCFFFE440000000) #5, !dbg !41 + %484 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i73, float %466, float 0xBFCFFFE440000000) #5, !dbg !41 + %.01.i75 = select i1 %.not6.i74, float %484, float %483, !dbg !41 + %485 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not7.i76 = icmp eq i32 %485, 0, !dbg !41 + %486 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i75, float %466, float 0x3FD5554F00000000) #5, !dbg !41 + %487 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i75, float %466, float 0x3FD5554F00000000) #5, !dbg !41 + %.0.i77 = select i1 %.not7.i76, float %487, float %486, !dbg !41 + %488 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not8.i78 = icmp eq i32 %488, 0, !dbg !41 + %489 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i77, float %466, float -5.000000e-01) #5, !dbg !41 + %490 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i77, float %466, float -5.000000e-01) #5, !dbg !41 + %.07.i79 = select i1 %.not8.i78, float %490, float %489, !dbg !41 + %491 = fmul float %466, %.07.i79, !dbg !41 + %492 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not9.i80 = icmp eq i32 %492, 0, !dbg !41 + %493 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %491, float %466, float %466) #5, !dbg !41 + %494 = tail call float @llvm.nvvm.fma.rn.f(float %491, float %466, float %466) #5, !dbg !41 + %.06.i81 = select i1 %.not9.i80, float %494, float %493, !dbg !41 + %495 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not10.i82 = icmp eq i32 %495, 0, !dbg !41 + %496 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i63, float 0x3FE62E4300000000, float %.06.i81) #5, !dbg !41 + %497 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i63, float 0x3FE62E4300000000, float %.06.i81) #5, !dbg !41 + %.04.i83 = select i1 %.not10.i82, float %497, float %496, !dbg !41 + %498 = icmp ugt i32 %457, 2139095039, !dbg !41 + br i1 %498, label %__nv_fmaf_rn.exit.i.i86, label %__nv_logf.exit89, !dbg !41 + +__nv_fmaf_rn.exit.i.i86: ; preds = %__nv_logf.exit59 + %499 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not11.i87 = icmp eq i32 %499, 0, !dbg !41 + %500 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i60, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41 + %501 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i60, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41 + %.03.i88 = select i1 %.not11.i87, float %501, float %500, !dbg !41 + br label %__nv_logf.exit89, !dbg !41 + +__nv_logf.exit89: ; preds = %__nv_logf.exit59, %__nv_fmaf_rn.exit.i.i86 + %r.i.0.i84 = phi float [ %.03.i88, %__nv_fmaf_rn.exit.i.i86 ], [ %.04.i83, %__nv_logf.exit59 ], !dbg !41 + %502 = fcmp oeq float %.02.i60, 0.000000e+00, !dbg !41 + %r.i.1.i85 = select i1 %502, float 0xFFF0000000000000, float %r.i.0.i84, !dbg !41 + %503 = fcmp olt float %147, 0x3810000000000000, !dbg !41 + %504 = fmul float %147, 0x4160000000000000, !dbg !41 + %.02.i90 = select i1 %503, float %504, float %147, !dbg !41 + %i.i.0.i91 = select i1 %503, float -2.300000e+01, float 0.000000e+00, !dbg !41 + %505 = bitcast float %.02.i90 to i32, !dbg !41 + %506 = add i32 %505, -1059760811, !dbg !41 + %507 = and i32 %506, -8388608, !dbg !41 + %508 = sub i32 %505, %507, !dbg !41 + %509 = bitcast i32 %508 to float, !dbg !41 + %510 = sitofp i32 %507 to float, !dbg !41 + %511 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not.i92 = icmp eq i32 %511, 0, !dbg !41 + %512 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %510, float 0x3E80000000000000, float %i.i.0.i91) #5, !dbg !41 + %513 = tail call float @llvm.nvvm.fma.rn.f(float %510, float 0x3E80000000000000, float %i.i.0.i91) #5, !dbg !41 + %.08.i93 = select i1 %.not.i92, float %513, float %512, !dbg !41 + %514 = fadd float %509, -1.000000e+00, !dbg !41 + %515 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not1.i94 = icmp eq i32 %515, 0, !dbg !41 + %516 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %514, float 0x3FC2073EC0000000) #5, !dbg !41 + %517 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %514, float 0x3FC2073EC0000000) #5, !dbg !41 + %.010.i95 = select i1 %.not1.i94, float %517, float %516, !dbg !41 + %518 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not2.i96 = icmp eq i32 %518, 0, !dbg !41 + %519 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i95, float %514, float 0xBFBF19B980000000) #5, !dbg !41 + %520 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i95, float %514, float 0xBFBF19B980000000) #5, !dbg !41 + %.011.i97 = select i1 %.not2.i96, float %520, float %519, !dbg !41 + %521 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not3.i98 = icmp eq i32 %521, 0, !dbg !41 + %522 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i97, float %514, float 0x3FC1E52AA0000000) #5, !dbg !41 + %523 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i97, float %514, float 0x3FC1E52AA0000000) #5, !dbg !41 + %.012.i99 = select i1 %.not3.i98, float %523, float %522, !dbg !41 + %524 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not4.i100 = icmp eq i32 %524, 0, !dbg !41 + %525 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i99, float %514, float 0xBFC55B1720000000) #5, !dbg !41 + %526 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i99, float %514, float 0xBFC55B1720000000) #5, !dbg !41 + %.09.i101 = select i1 %.not4.i100, float %526, float %525, !dbg !41 + %527 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not5.i102 = icmp eq i32 %527, 0, !dbg !41 + %528 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i101, float %514, float 0x3FC99DA160000000) #5, !dbg !41 + %529 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i101, float %514, float 0x3FC99DA160000000) #5, !dbg !41 + %.05.i103 = select i1 %.not5.i102, float %529, float %528, !dbg !41 + %530 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not6.i104 = icmp eq i32 %530, 0, !dbg !41 + %531 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i103, float %514, float 0xBFCFFFE440000000) #5, !dbg !41 + %532 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i103, float %514, float 0xBFCFFFE440000000) #5, !dbg !41 + %.01.i105 = select i1 %.not6.i104, float %532, float %531, !dbg !41 + %533 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not7.i106 = icmp eq i32 %533, 0, !dbg !41 + %534 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i105, float %514, float 0x3FD5554F00000000) #5, !dbg !41 + %535 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i105, float %514, float 0x3FD5554F00000000) #5, !dbg !41 + %.0.i107 = select i1 %.not7.i106, float %535, float %534, !dbg !41 + %536 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not8.i108 = icmp eq i32 %536, 0, !dbg !41 + %537 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i107, float %514, float -5.000000e-01) #5, !dbg !41 + %538 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i107, float %514, float -5.000000e-01) #5, !dbg !41 + %.07.i109 = select i1 %.not8.i108, float %538, float %537, !dbg !41 + %539 = fmul float %514, %.07.i109, !dbg !41 + %540 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not9.i110 = icmp eq i32 %540, 0, !dbg !41 + %541 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %539, float %514, float %514) #5, !dbg !41 + %542 = tail call float @llvm.nvvm.fma.rn.f(float %539, float %514, float %514) #5, !dbg !41 + %.06.i111 = select i1 %.not9.i110, float %542, float %541, !dbg !41 + %543 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not10.i112 = icmp eq i32 %543, 0, !dbg !41 + %544 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i93, float 0x3FE62E4300000000, float %.06.i111) #5, !dbg !41 + %545 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i93, float 0x3FE62E4300000000, float %.06.i111) #5, !dbg !41 + %.04.i113 = select i1 %.not10.i112, float %545, float %544, !dbg !41 + %546 = icmp ugt i32 %505, 2139095039, !dbg !41 + br i1 %546, label %__nv_fmaf_rn.exit.i.i116, label %__nv_logf.exit119, !dbg !41 + +__nv_fmaf_rn.exit.i.i116: ; preds = %__nv_logf.exit89 + %547 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not11.i117 = icmp eq i32 %547, 0, !dbg !41 + %548 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i90, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41 + %549 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i90, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41 + %.03.i118 = select i1 %.not11.i117, float %549, float %548, !dbg !41 + br label %__nv_logf.exit119, !dbg !41 + +__nv_logf.exit119: ; preds = %__nv_logf.exit89, %__nv_fmaf_rn.exit.i.i116 + %r.i.0.i114 = phi float [ %.03.i118, %__nv_fmaf_rn.exit.i.i116 ], [ %.04.i113, %__nv_logf.exit89 ], !dbg !41 + %550 = fcmp oeq float %.02.i90, 0.000000e+00, !dbg !41 + %r.i.1.i115 = select i1 %550, float 0xFFF0000000000000, float %r.i.0.i114, !dbg !41 + %551 = fcmp olt float %153, 0x3810000000000000, !dbg !41 + %552 = fmul float %153, 0x4160000000000000, !dbg !41 + %.02.i120 = select i1 %551, float %552, float %153, !dbg !41 + %i.i.0.i121 = select i1 %551, float -2.300000e+01, float 0.000000e+00, !dbg !41 + %553 = bitcast float %.02.i120 to i32, !dbg !41 + %554 = add i32 %553, -1059760811, !dbg !41 + %555 = and i32 %554, -8388608, !dbg !41 + %556 = sub i32 %553, %555, !dbg !41 + %557 = bitcast i32 %556 to float, !dbg !41 + %558 = sitofp i32 %555 to float, !dbg !41 + %559 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not.i122 = icmp eq i32 %559, 0, !dbg !41 + %560 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %558, float 0x3E80000000000000, float %i.i.0.i121) #5, !dbg !41 + %561 = tail call float @llvm.nvvm.fma.rn.f(float %558, float 0x3E80000000000000, float %i.i.0.i121) #5, !dbg !41 + %.08.i123 = select i1 %.not.i122, float %561, float %560, !dbg !41 + %562 = fadd float %557, -1.000000e+00, !dbg !41 + %563 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not1.i124 = icmp eq i32 %563, 0, !dbg !41 + %564 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %562, float 0x3FC2073EC0000000) #5, !dbg !41 + %565 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %562, float 0x3FC2073EC0000000) #5, !dbg !41 + %.010.i125 = select i1 %.not1.i124, float %565, float %564, !dbg !41 + %566 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not2.i126 = icmp eq i32 %566, 0, !dbg !41 + %567 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i125, float %562, float 0xBFBF19B980000000) #5, !dbg !41 + %568 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i125, float %562, float 0xBFBF19B980000000) #5, !dbg !41 + %.011.i127 = select i1 %.not2.i126, float %568, float %567, !dbg !41 + %569 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not3.i128 = icmp eq i32 %569, 0, !dbg !41 + %570 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i127, float %562, float 0x3FC1E52AA0000000) #5, !dbg !41 + %571 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i127, float %562, float 0x3FC1E52AA0000000) #5, !dbg !41 + %.012.i129 = select i1 %.not3.i128, float %571, float %570, !dbg !41 + %572 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not4.i130 = icmp eq i32 %572, 0, !dbg !41 + %573 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i129, float %562, float 0xBFC55B1720000000) #5, !dbg !41 + %574 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i129, float %562, float 0xBFC55B1720000000) #5, !dbg !41 + %.09.i131 = select i1 %.not4.i130, float %574, float %573, !dbg !41 + %575 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not5.i132 = icmp eq i32 %575, 0, !dbg !41 + %576 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i131, float %562, float 0x3FC99DA160000000) #5, !dbg !41 + %577 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i131, float %562, float 0x3FC99DA160000000) #5, !dbg !41 + %.05.i133 = select i1 %.not5.i132, float %577, float %576, !dbg !41 + %578 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not6.i134 = icmp eq i32 %578, 0, !dbg !41 + %579 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i133, float %562, float 0xBFCFFFE440000000) #5, !dbg !41 + %580 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i133, float %562, float 0xBFCFFFE440000000) #5, !dbg !41 + %.01.i135 = select i1 %.not6.i134, float %580, float %579, !dbg !41 + %581 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not7.i136 = icmp eq i32 %581, 0, !dbg !41 + %582 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i135, float %562, float 0x3FD5554F00000000) #5, !dbg !41 + %583 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i135, float %562, float 0x3FD5554F00000000) #5, !dbg !41 + %.0.i137 = select i1 %.not7.i136, float %583, float %582, !dbg !41 + %584 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not8.i138 = icmp eq i32 %584, 0, !dbg !41 + %585 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i137, float %562, float -5.000000e-01) #5, !dbg !41 + %586 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i137, float %562, float -5.000000e-01) #5, !dbg !41 + %.07.i139 = select i1 %.not8.i138, float %586, float %585, !dbg !41 + %587 = fmul float %562, %.07.i139, !dbg !41 + %588 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not9.i140 = icmp eq i32 %588, 0, !dbg !41 + %589 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %587, float %562, float %562) #5, !dbg !41 + %590 = tail call float @llvm.nvvm.fma.rn.f(float %587, float %562, float %562) #5, !dbg !41 + %.06.i141 = select i1 %.not9.i140, float %590, float %589, !dbg !41 + %591 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not10.i142 = icmp eq i32 %591, 0, !dbg !41 + %592 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i123, float 0x3FE62E4300000000, float %.06.i141) #5, !dbg !41 + %593 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i123, float 0x3FE62E4300000000, float %.06.i141) #5, !dbg !41 + %.04.i143 = select i1 %.not10.i142, float %593, float %592, !dbg !41 + %594 = icmp ugt i32 %553, 2139095039, !dbg !41 + br i1 %594, label %__nv_fmaf_rn.exit.i.i146, label %__nv_logf.exit149, !dbg !41 + +__nv_fmaf_rn.exit.i.i146: ; preds = %__nv_logf.exit119 + %595 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not11.i147 = icmp eq i32 %595, 0, !dbg !41 + %596 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i120, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41 + %597 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i120, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41 + %.03.i148 = select i1 %.not11.i147, float %597, float %596, !dbg !41 + br label %__nv_logf.exit149, !dbg !41 + +__nv_logf.exit149: ; preds = %__nv_logf.exit119, %__nv_fmaf_rn.exit.i.i146 + %r.i.0.i144 = phi float [ %.03.i148, %__nv_fmaf_rn.exit.i.i146 ], [ %.04.i143, %__nv_logf.exit119 ], !dbg !41 + %598 = fcmp oeq float %.02.i120, 0.000000e+00, !dbg !41 + %r.i.1.i145 = select i1 %598, float 0xFFF0000000000000, float %r.i.0.i144, !dbg !41 + %599 = fcmp olt float %154, 0x3810000000000000, !dbg !41 + %600 = fmul float %154, 0x4160000000000000, !dbg !41 + %.02.i150 = select i1 %599, float %600, float %154, !dbg !41 + %i.i.0.i151 = select i1 %599, float -2.300000e+01, float 0.000000e+00, !dbg !41 + %601 = bitcast float %.02.i150 to i32, !dbg !41 + %602 = add i32 %601, -1059760811, !dbg !41 + %603 = and i32 %602, -8388608, !dbg !41 + %604 = sub i32 %601, %603, !dbg !41 + %605 = bitcast i32 %604 to float, !dbg !41 + %606 = sitofp i32 %603 to float, !dbg !41 + %607 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not.i152 = icmp eq i32 %607, 0, !dbg !41 + %608 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %606, float 0x3E80000000000000, float %i.i.0.i151) #5, !dbg !41 + %609 = tail call float @llvm.nvvm.fma.rn.f(float %606, float 0x3E80000000000000, float %i.i.0.i151) #5, !dbg !41 + %.08.i153 = select i1 %.not.i152, float %609, float %608, !dbg !41 + %610 = fadd float %605, -1.000000e+00, !dbg !41 + %611 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not1.i154 = icmp eq i32 %611, 0, !dbg !41 + %612 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %610, float 0x3FC2073EC0000000) #5, !dbg !41 + %613 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %610, float 0x3FC2073EC0000000) #5, !dbg !41 + %.010.i155 = select i1 %.not1.i154, float %613, float %612, !dbg !41 + %614 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not2.i156 = icmp eq i32 %614, 0, !dbg !41 + %615 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i155, float %610, float 0xBFBF19B980000000) #5, !dbg !41 + %616 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i155, float %610, float 0xBFBF19B980000000) #5, !dbg !41 + %.011.i157 = select i1 %.not2.i156, float %616, float %615, !dbg !41 + %617 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not3.i158 = icmp eq i32 %617, 0, !dbg !41 + %618 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i157, float %610, float 0x3FC1E52AA0000000) #5, !dbg !41 + %619 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i157, float %610, float 0x3FC1E52AA0000000) #5, !dbg !41 + %.012.i159 = select i1 %.not3.i158, float %619, float %618, !dbg !41 + %620 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not4.i160 = icmp eq i32 %620, 0, !dbg !41 + %621 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i159, float %610, float 0xBFC55B1720000000) #5, !dbg !41 + %622 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i159, float %610, float 0xBFC55B1720000000) #5, !dbg !41 + %.09.i161 = select i1 %.not4.i160, float %622, float %621, !dbg !41 + %623 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not5.i162 = icmp eq i32 %623, 0, !dbg !41 + %624 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i161, float %610, float 0x3FC99DA160000000) #5, !dbg !41 + %625 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i161, float %610, float 0x3FC99DA160000000) #5, !dbg !41 + %.05.i163 = select i1 %.not5.i162, float %625, float %624, !dbg !41 + %626 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not6.i164 = icmp eq i32 %626, 0, !dbg !41 + %627 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i163, float %610, float 0xBFCFFFE440000000) #5, !dbg !41 + %628 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i163, float %610, float 0xBFCFFFE440000000) #5, !dbg !41 + %.01.i165 = select i1 %.not6.i164, float %628, float %627, !dbg !41 + %629 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not7.i166 = icmp eq i32 %629, 0, !dbg !41 + %630 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i165, float %610, float 0x3FD5554F00000000) #5, !dbg !41 + %631 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i165, float %610, float 0x3FD5554F00000000) #5, !dbg !41 + %.0.i167 = select i1 %.not7.i166, float %631, float %630, !dbg !41 + %632 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not8.i168 = icmp eq i32 %632, 0, !dbg !41 + %633 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i167, float %610, float -5.000000e-01) #5, !dbg !41 + %634 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i167, float %610, float -5.000000e-01) #5, !dbg !41 + %.07.i169 = select i1 %.not8.i168, float %634, float %633, !dbg !41 + %635 = fmul float %610, %.07.i169, !dbg !41 + %636 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not9.i170 = icmp eq i32 %636, 0, !dbg !41 + %637 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %635, float %610, float %610) #5, !dbg !41 + %638 = tail call float @llvm.nvvm.fma.rn.f(float %635, float %610, float %610) #5, !dbg !41 + %.06.i171 = select i1 %.not9.i170, float %638, float %637, !dbg !41 + %639 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not10.i172 = icmp eq i32 %639, 0, !dbg !41 + %640 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i153, float 0x3FE62E4300000000, float %.06.i171) #5, !dbg !41 + %641 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i153, float 0x3FE62E4300000000, float %.06.i171) #5, !dbg !41 + %.04.i173 = select i1 %.not10.i172, float %641, float %640, !dbg !41 + %642 = icmp ugt i32 %601, 2139095039, !dbg !41 + br i1 %642, label %__nv_fmaf_rn.exit.i.i176, label %__nv_logf.exit179, !dbg !41 + +__nv_fmaf_rn.exit.i.i176: ; preds = %__nv_logf.exit149 + %643 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not11.i177 = icmp eq i32 %643, 0, !dbg !41 + %644 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i150, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41 + %645 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i150, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41 + %.03.i178 = select i1 %.not11.i177, float %645, float %644, !dbg !41 + br label %__nv_logf.exit179, !dbg !41 + +__nv_logf.exit179: ; preds = %__nv_logf.exit149, %__nv_fmaf_rn.exit.i.i176 + %r.i.0.i174 = phi float [ %.03.i178, %__nv_fmaf_rn.exit.i.i176 ], [ %.04.i173, %__nv_logf.exit149 ], !dbg !41 + %646 = fcmp oeq float %.02.i150, 0.000000e+00, !dbg !41 + %r.i.1.i175 = select i1 %646, float 0xFFF0000000000000, float %r.i.0.i174, !dbg !41 + %647 = fcmp olt float %155, 0x3810000000000000, !dbg !41 + %648 = fmul float %155, 0x4160000000000000, !dbg !41 + %.02.i180 = select i1 %647, float %648, float %155, !dbg !41 + %i.i.0.i181 = select i1 %647, float -2.300000e+01, float 0.000000e+00, !dbg !41 + %649 = bitcast float %.02.i180 to i32, !dbg !41 + %650 = add i32 %649, -1059760811, !dbg !41 + %651 = and i32 %650, -8388608, !dbg !41 + %652 = sub i32 %649, %651, !dbg !41 + %653 = bitcast i32 %652 to float, !dbg !41 + %654 = sitofp i32 %651 to float, !dbg !41 + %655 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not.i182 = icmp eq i32 %655, 0, !dbg !41 + %656 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %654, float 0x3E80000000000000, float %i.i.0.i181) #5, !dbg !41 + %657 = tail call float @llvm.nvvm.fma.rn.f(float %654, float 0x3E80000000000000, float %i.i.0.i181) #5, !dbg !41 + %.08.i183 = select i1 %.not.i182, float %657, float %656, !dbg !41 + %658 = fadd float %653, -1.000000e+00, !dbg !41 + %659 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not1.i184 = icmp eq i32 %659, 0, !dbg !41 + %660 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %658, float 0x3FC2073EC0000000) #5, !dbg !41 + %661 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %658, float 0x3FC2073EC0000000) #5, !dbg !41 + %.010.i185 = select i1 %.not1.i184, float %661, float %660, !dbg !41 + %662 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not2.i186 = icmp eq i32 %662, 0, !dbg !41 + %663 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i185, float %658, float 0xBFBF19B980000000) #5, !dbg !41 + %664 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i185, float %658, float 0xBFBF19B980000000) #5, !dbg !41 + %.011.i187 = select i1 %.not2.i186, float %664, float %663, !dbg !41 + %665 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not3.i188 = icmp eq i32 %665, 0, !dbg !41 + %666 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i187, float %658, float 0x3FC1E52AA0000000) #5, !dbg !41 + %667 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i187, float %658, float 0x3FC1E52AA0000000) #5, !dbg !41 + %.012.i189 = select i1 %.not3.i188, float %667, float %666, !dbg !41 + %668 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not4.i190 = icmp eq i32 %668, 0, !dbg !41 + %669 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i189, float %658, float 0xBFC55B1720000000) #5, !dbg !41 + %670 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i189, float %658, float 0xBFC55B1720000000) #5, !dbg !41 + %.09.i191 = select i1 %.not4.i190, float %670, float %669, !dbg !41 + %671 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not5.i192 = icmp eq i32 %671, 0, !dbg !41 + %672 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i191, float %658, float 0x3FC99DA160000000) #5, !dbg !41 + %673 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i191, float %658, float 0x3FC99DA160000000) #5, !dbg !41 + %.05.i193 = select i1 %.not5.i192, float %673, float %672, !dbg !41 + %674 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not6.i194 = icmp eq i32 %674, 0, !dbg !41 + %675 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i193, float %658, float 0xBFCFFFE440000000) #5, !dbg !41 + %676 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i193, float %658, float 0xBFCFFFE440000000) #5, !dbg !41 + %.01.i195 = select i1 %.not6.i194, float %676, float %675, !dbg !41 + %677 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not7.i196 = icmp eq i32 %677, 0, !dbg !41 + %678 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i195, float %658, float 0x3FD5554F00000000) #5, !dbg !41 + %679 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i195, float %658, float 0x3FD5554F00000000) #5, !dbg !41 + %.0.i197 = select i1 %.not7.i196, float %679, float %678, !dbg !41 + %680 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not8.i198 = icmp eq i32 %680, 0, !dbg !41 + %681 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i197, float %658, float -5.000000e-01) #5, !dbg !41 + %682 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i197, float %658, float -5.000000e-01) #5, !dbg !41 + %.07.i199 = select i1 %.not8.i198, float %682, float %681, !dbg !41 + %683 = fmul float %658, %.07.i199, !dbg !41 + %684 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not9.i200 = icmp eq i32 %684, 0, !dbg !41 + %685 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %683, float %658, float %658) #5, !dbg !41 + %686 = tail call float @llvm.nvvm.fma.rn.f(float %683, float %658, float %658) #5, !dbg !41 + %.06.i201 = select i1 %.not9.i200, float %686, float %685, !dbg !41 + %687 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not10.i202 = icmp eq i32 %687, 0, !dbg !41 + %688 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i183, float 0x3FE62E4300000000, float %.06.i201) #5, !dbg !41 + %689 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i183, float 0x3FE62E4300000000, float %.06.i201) #5, !dbg !41 + %.04.i203 = select i1 %.not10.i202, float %689, float %688, !dbg !41 + %690 = icmp ugt i32 %649, 2139095039, !dbg !41 + br i1 %690, label %__nv_fmaf_rn.exit.i.i206, label %__nv_logf.exit209, !dbg !41 + +__nv_fmaf_rn.exit.i.i206: ; preds = %__nv_logf.exit179 + %691 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not11.i207 = icmp eq i32 %691, 0, !dbg !41 + %692 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i180, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41 + %693 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i180, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41 + %.03.i208 = select i1 %.not11.i207, float %693, float %692, !dbg !41 + br label %__nv_logf.exit209, !dbg !41 + +__nv_logf.exit209: ; preds = %__nv_logf.exit179, %__nv_fmaf_rn.exit.i.i206 + %r.i.0.i204 = phi float [ %.03.i208, %__nv_fmaf_rn.exit.i.i206 ], [ %.04.i203, %__nv_logf.exit179 ], !dbg !41 + %694 = fcmp oeq float %.02.i180, 0.000000e+00, !dbg !41 + %r.i.1.i205 = select i1 %694, float 0xFFF0000000000000, float %r.i.0.i204, !dbg !41 + %695 = fcmp olt float %156, 0x3810000000000000, !dbg !41 + %696 = fmul float %156, 0x4160000000000000, !dbg !41 + %.02.i210 = select i1 %695, float %696, float %156, !dbg !41 + %i.i.0.i211 = select i1 %695, float -2.300000e+01, float 0.000000e+00, !dbg !41 + %697 = bitcast float %.02.i210 to i32, !dbg !41 + %698 = add i32 %697, -1059760811, !dbg !41 + %699 = and i32 %698, -8388608, !dbg !41 + %700 = sub i32 %697, %699, !dbg !41 + %701 = bitcast i32 %700 to float, !dbg !41 + %702 = sitofp i32 %699 to float, !dbg !41 + %703 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not.i212 = icmp eq i32 %703, 0, !dbg !41 + %704 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %702, float 0x3E80000000000000, float %i.i.0.i211) #5, !dbg !41 + %705 = tail call float @llvm.nvvm.fma.rn.f(float %702, float 0x3E80000000000000, float %i.i.0.i211) #5, !dbg !41 + %.08.i213 = select i1 %.not.i212, float %705, float %704, !dbg !41 + %706 = fadd float %701, -1.000000e+00, !dbg !41 + %707 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not1.i214 = icmp eq i32 %707, 0, !dbg !41 + %708 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %706, float 0x3FC2073EC0000000) #5, !dbg !41 + %709 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %706, float 0x3FC2073EC0000000) #5, !dbg !41 + %.010.i215 = select i1 %.not1.i214, float %709, float %708, !dbg !41 + %710 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not2.i216 = icmp eq i32 %710, 0, !dbg !41 + %711 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010.i215, float %706, float 0xBFBF19B980000000) #5, !dbg !41 + %712 = tail call float @llvm.nvvm.fma.rn.f(float %.010.i215, float %706, float 0xBFBF19B980000000) #5, !dbg !41 + %.011.i217 = select i1 %.not2.i216, float %712, float %711, !dbg !41 + %713 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not3.i218 = icmp eq i32 %713, 0, !dbg !41 + %714 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011.i217, float %706, float 0x3FC1E52AA0000000) #5, !dbg !41 + %715 = tail call float @llvm.nvvm.fma.rn.f(float %.011.i217, float %706, float 0x3FC1E52AA0000000) #5, !dbg !41 + %.012.i219 = select i1 %.not3.i218, float %715, float %714, !dbg !41 + %716 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not4.i220 = icmp eq i32 %716, 0, !dbg !41 + %717 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012.i219, float %706, float 0xBFC55B1720000000) #5, !dbg !41 + %718 = tail call float @llvm.nvvm.fma.rn.f(float %.012.i219, float %706, float 0xBFC55B1720000000) #5, !dbg !41 + %.09.i221 = select i1 %.not4.i220, float %718, float %717, !dbg !41 + %719 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not5.i222 = icmp eq i32 %719, 0, !dbg !41 + %720 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09.i221, float %706, float 0x3FC99DA160000000) #5, !dbg !41 + %721 = tail call float @llvm.nvvm.fma.rn.f(float %.09.i221, float %706, float 0x3FC99DA160000000) #5, !dbg !41 + %.05.i223 = select i1 %.not5.i222, float %721, float %720, !dbg !41 + %722 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not6.i224 = icmp eq i32 %722, 0, !dbg !41 + %723 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05.i223, float %706, float 0xBFCFFFE440000000) #5, !dbg !41 + %724 = tail call float @llvm.nvvm.fma.rn.f(float %.05.i223, float %706, float 0xBFCFFFE440000000) #5, !dbg !41 + %.01.i225 = select i1 %.not6.i224, float %724, float %723, !dbg !41 + %725 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not7.i226 = icmp eq i32 %725, 0, !dbg !41 + %726 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01.i225, float %706, float 0x3FD5554F00000000) #5, !dbg !41 + %727 = tail call float @llvm.nvvm.fma.rn.f(float %.01.i225, float %706, float 0x3FD5554F00000000) #5, !dbg !41 + %.0.i227 = select i1 %.not7.i226, float %727, float %726, !dbg !41 + %728 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not8.i228 = icmp eq i32 %728, 0, !dbg !41 + %729 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0.i227, float %706, float -5.000000e-01) #5, !dbg !41 + %730 = tail call float @llvm.nvvm.fma.rn.f(float %.0.i227, float %706, float -5.000000e-01) #5, !dbg !41 + %.07.i229 = select i1 %.not8.i228, float %730, float %729, !dbg !41 + %731 = fmul float %706, %.07.i229, !dbg !41 + %732 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not9.i230 = icmp eq i32 %732, 0, !dbg !41 + %733 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %731, float %706, float %706) #5, !dbg !41 + %734 = tail call float @llvm.nvvm.fma.rn.f(float %731, float %706, float %706) #5, !dbg !41 + %.06.i231 = select i1 %.not9.i230, float %734, float %733, !dbg !41 + %735 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not10.i232 = icmp eq i32 %735, 0, !dbg !41 + %736 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08.i213, float 0x3FE62E4300000000, float %.06.i231) #5, !dbg !41 + %737 = tail call float @llvm.nvvm.fma.rn.f(float %.08.i213, float 0x3FE62E4300000000, float %.06.i231) #5, !dbg !41 + %.04.i233 = select i1 %.not10.i232, float %737, float %736, !dbg !41 + %738 = icmp ugt i32 %697, 2139095039, !dbg !41 + br i1 %738, label %__nv_fmaf_rn.exit.i.i236, label %__nv_logf.exit239, !dbg !41 + +__nv_fmaf_rn.exit.i.i236: ; preds = %__nv_logf.exit209 + %739 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5, !dbg !41 + %.not11.i237 = icmp eq i32 %739, 0, !dbg !41 + %740 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02.i210, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41 + %741 = tail call float @llvm.nvvm.fma.rn.f(float %.02.i210, float 0x7FF0000000000000, float 0x7FF0000000000000) #5, !dbg !41 + %.03.i238 = select i1 %.not11.i237, float %741, float %740, !dbg !41 + br label %__nv_logf.exit239, !dbg !41 + +__nv_logf.exit239: ; preds = %__nv_logf.exit209, %__nv_fmaf_rn.exit.i.i236 + %r.i.0.i234 = phi float [ %.03.i238, %__nv_fmaf_rn.exit.i.i236 ], [ %.04.i233, %__nv_logf.exit209 ], !dbg !41 + %742 = fcmp oeq float %.02.i210, 0.000000e+00, !dbg !41 + %r.i.1.i235 = select i1 %742, float 0xFFF0000000000000, float %r.i.0.i234, !dbg !41 + tail call void @llvm.nvvm.barrier0(), !dbg !41 + %743 = insertelement <1 x float> undef, float %r.i.1.i, i64 0, !dbg !41 + store <1 x float> %743, ptr addrspace(3) %25, align 4, !dbg !41 + %744 = insertelement <1 x float> undef, float %r.i.1.i55, i64 0, !dbg !41 + store <1 x float> %744, ptr addrspace(3) %28, align 4, !dbg !41 + %745 = insertelement <1 x float> undef, float %r.i.1.i85, i64 0, !dbg !41 + store <1 x float> %745, ptr addrspace(3) %31, align 4, !dbg !41 + %746 = insertelement <1 x float> undef, float %r.i.1.i115, i64 0, !dbg !41 + store <1 x float> %746, ptr addrspace(3) %34, align 4, !dbg !41 + tail call void @llvm.nvvm.barrier0(), !dbg !41 + %747 = load float, ptr addrspace(3) %37, align 4, !dbg !41 + %748 = load float, ptr addrspace(3) %40, align 4, !dbg !41 + %749 = load float, ptr addrspace(3) %43, align 4, !dbg !41 + %750 = load float, ptr addrspace(3) %46, align 4, !dbg !41 + tail call void @llvm.nvvm.barrier0(), !dbg !41 + %751 = insertelement <1 x float> undef, float %r.i.1.i145, i64 0, !dbg !41 + store <1 x float> %751, ptr addrspace(3) %25, align 4, !dbg !41 + %752 = insertelement <1 x float> undef, float %r.i.1.i175, i64 0, !dbg !41 + store <1 x float> %752, ptr addrspace(3) %28, align 4, !dbg !41 + %753 = insertelement <1 x float> undef, float %r.i.1.i205, i64 0, !dbg !41 + store <1 x float> %753, ptr addrspace(3) %31, align 4, !dbg !41 + %754 = insertelement <1 x float> undef, float %r.i.1.i235, i64 0, !dbg !41 + store <1 x float> %754, ptr addrspace(3) %34, align 4, !dbg !41 + tail call void @llvm.nvvm.barrier0(), !dbg !41 + %755 = load float, ptr addrspace(3) %37, align 4, !dbg !41 + %756 = load float, ptr addrspace(3) %40, align 4, !dbg !41 + %757 = load float, ptr addrspace(3) %43, align 4, !dbg !41 + %758 = load float, ptr addrspace(3) %46, align 4, !dbg !41 + %759 = insertelement <8 x float> poison, float %747, i64 0, !dbg !42 + %760 = insertelement <8 x float> %759, float %748, i64 1, !dbg !42 + %761 = insertelement <8 x float> %760, float %749, i64 2, !dbg !42 + %762 = insertelement <8 x float> %761, float %750, i64 3, !dbg !42 + %763 = insertelement <8 x float> %762, float %755, i64 4, !dbg !42 + %764 = insertelement <8 x float> %763, float %756, i64 5, !dbg !42 + %765 = insertelement <8 x float> %764, float %757, i64 6, !dbg !42 + %766 = insertelement <8 x float> %765, float %758, i64 7, !dbg !42 + %767 = fsub <8 x float> %766, %358, !dbg !42 + %768 = fadd <8 x float> %767, zeroinitializer, !dbg !42 + %769 = select <8 x i1> %206, <8 x float> zeroinitializer, <8 x float> %768, !dbg !43 + %770 = insertelement <8 x i1> %73, i1 %103, i64 1, !dbg !16 + %771 = shufflevector <8 x i1> %770, <8 x i1> poison, <8 x i32> , !dbg !16 + %772 = select <8 x i1> %771, <8 x float> %769, <8 x float> , !dbg !16 + %773 = fadd <8 x float> %76, %772, !dbg !16 + %774 = add nuw nsw i32 %75, 2048, !dbg !17 + %775 = icmp ult i32 %75, 5632, !dbg !17 + br i1 %775, label %74, label %776, !dbg !17 + +776: ; preds = %__nv_logf.exit239 + %777 = lshr i32 %9, 5, !dbg !10 + %778 = and i32 %777, 7, !dbg !10 + %779 = and i32 %9, 31, !dbg !10 + tail call void @llvm.nvvm.barrier0(), !dbg !44 + %shift = shufflevector <8 x i64> %211, <8 x i64> poison, <8 x i32> , !dbg !48 + %780 = add <8 x i64> %211, %shift, !dbg !48 + %shift286 = shufflevector <8 x i64> %211, <8 x i64> poison, <8 x i32> , !dbg !48 + %781 = add <8 x i64> %780, %shift286, !dbg !48 + %shift287 = shufflevector <8 x i64> %211, <8 x i64> poison, <8 x i32> , !dbg !48 + %782 = add <8 x i64> %781, %shift287, !dbg !48 + %shift288 = shufflevector <8 x i64> %211, <8 x i64> poison, <8 x i32> , !dbg !48 + %783 = add <8 x i64> %782, %shift288, !dbg !48 + %shift289 = shufflevector <8 x i64> %211, <8 x i64> poison, <8 x i32> , !dbg !48 + %784 = add <8 x i64> %783, %shift289, !dbg !48 + %shift290 = shufflevector <8 x i64> %211, <8 x i64> poison, <8 x i32> , !dbg !48 + %785 = add <8 x i64> %784, %shift290, !dbg !48 + %shift291 = shufflevector <8 x i64> %211, <8 x i64> poison, <8 x i32> , !dbg !48 + %786 = add <8 x i64> %785, %shift291, !dbg !48 + %787 = extractelement <8 x i64> %786, i64 0, !dbg !48 + %788 = trunc i64 %787 to i32, !dbg !44 + %789 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %788, i32 16, i32 31), !dbg !44 + %bc = bitcast i64 %787 to <2 x i32>, !dbg !44 + %790 = extractelement <2 x i32> %bc, i64 1, !dbg !44 + %791 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %790, i32 16, i32 31), !dbg !44 + %792 = insertelement <2 x i32> undef, i32 %789, i64 0, !dbg !44 + %793 = insertelement <2 x i32> %792, i32 %791, i64 1, !dbg !44 + %794 = bitcast <2 x i32> %793 to i64, !dbg !44 + %795 = add i64 %787, %794, !dbg !48 + %796 = trunc i64 %795 to i32, !dbg !44 + %797 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %796, i32 8, i32 31), !dbg !44 + %bc1 = bitcast i64 %795 to <2 x i32>, !dbg !44 + %798 = extractelement <2 x i32> %bc1, i64 1, !dbg !44 + %799 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %798, i32 8, i32 31), !dbg !44 + %800 = insertelement <2 x i32> undef, i32 %797, i64 0, !dbg !44 + %801 = insertelement <2 x i32> %800, i32 %799, i64 1, !dbg !44 + %802 = bitcast <2 x i32> %801 to i64, !dbg !44 + %803 = add i64 %795, %802, !dbg !48 + %804 = trunc i64 %803 to i32, !dbg !44 + %805 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %804, i32 4, i32 31), !dbg !44 + %bc2 = bitcast i64 %803 to <2 x i32>, !dbg !44 + %806 = extractelement <2 x i32> %bc2, i64 1, !dbg !44 + %807 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %806, i32 4, i32 31), !dbg !44 + %808 = insertelement <2 x i32> undef, i32 %805, i64 0, !dbg !44 + %809 = insertelement <2 x i32> %808, i32 %807, i64 1, !dbg !44 + %810 = bitcast <2 x i32> %809 to i64, !dbg !44 + %811 = add i64 %803, %810, !dbg !48 + %812 = trunc i64 %811 to i32, !dbg !44 + %813 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %812, i32 2, i32 31), !dbg !44 + %bc3 = bitcast i64 %811 to <2 x i32>, !dbg !44 + %814 = extractelement <2 x i32> %bc3, i64 1, !dbg !44 + %815 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %814, i32 2, i32 31), !dbg !44 + %816 = insertelement <2 x i32> undef, i32 %813, i64 0, !dbg !44 + %817 = insertelement <2 x i32> %816, i32 %815, i64 1, !dbg !44 + %818 = bitcast <2 x i32> %817 to i64, !dbg !44 + %819 = add i64 %811, %818, !dbg !48 + %820 = trunc i64 %819 to i32, !dbg !44 + %821 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %820, i32 1, i32 31), !dbg !44 + %bc4 = bitcast i64 %819 to <2 x i32>, !dbg !44 + %822 = extractelement <2 x i32> %bc4, i64 1, !dbg !44 + %823 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %822, i32 1, i32 31), !dbg !44 + %824 = insertelement <2 x i32> undef, i32 %821, i64 0, !dbg !44 + %825 = insertelement <2 x i32> %824, i32 %823, i64 1, !dbg !44 + %826 = bitcast <2 x i32> %825 to i64, !dbg !44 + %827 = add i64 %819, %826, !dbg !48 + %828 = icmp eq i32 %779, 0, !dbg !44 + %829 = zext nneg i32 %778 to i64, !dbg !44 + %830 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %829, !dbg !44 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %830, i64 %827, i1 %828) #5, !dbg !44 + tail call void @llvm.nvvm.barrier0(), !dbg !44 + %831 = icmp slt i32 %9, 8, !dbg !44 + %832 = sext i32 %9 to i64, !dbg !44 + %833 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %832, !dbg !44 + %834 = tail call i64 asm sideeffect "@$2 ld.shared.b64 $0, [ $1 + 0 ];", "=l,r,b"(ptr addrspace(3) %833, i1 %831) #5, !dbg !44 + %835 = trunc i64 %834 to i32, !dbg !44 + %836 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %835, i32 4, i32 31), !dbg !44 + %bc5 = bitcast i64 %834 to <2 x i32>, !dbg !44 + %837 = extractelement <2 x i32> %bc5, i64 1, !dbg !44 + %838 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %837, i32 4, i32 31), !dbg !44 + %839 = insertelement <2 x i32> undef, i32 %836, i64 0, !dbg !44 + %840 = insertelement <2 x i32> %839, i32 %838, i64 1, !dbg !44 + %841 = bitcast <2 x i32> %840 to i64, !dbg !44 + %842 = add i64 %834, %841, !dbg !48 + %843 = trunc i64 %842 to i32, !dbg !44 + %844 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %843, i32 2, i32 31), !dbg !44 + %bc6 = bitcast i64 %842 to <2 x i32>, !dbg !44 + %845 = extractelement <2 x i32> %bc6, i64 1, !dbg !44 + %846 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %845, i32 2, i32 31), !dbg !44 + %847 = insertelement <2 x i32> undef, i32 %844, i64 0, !dbg !44 + %848 = insertelement <2 x i32> %847, i32 %846, i64 1, !dbg !44 + %849 = bitcast <2 x i32> %848 to i64, !dbg !44 + %850 = add i64 %842, %849, !dbg !48 + %851 = trunc i64 %850 to i32, !dbg !44 + %852 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %851, i32 1, i32 31), !dbg !44 + %bc7 = bitcast i64 %850 to <2 x i32>, !dbg !44 + %853 = extractelement <2 x i32> %bc7, i64 1, !dbg !44 + %854 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %853, i32 1, i32 31), !dbg !44 + %855 = insertelement <2 x i32> undef, i32 %852, i64 0, !dbg !44 + %856 = insertelement <2 x i32> %855, i32 %854, i64 1, !dbg !44 + %857 = bitcast <2 x i32> %856 to i64, !dbg !44 + %858 = add i64 %850, %857, !dbg !48 + %859 = and i32 %9, 7, !dbg !44 + %860 = icmp eq i32 %859, 0, !dbg !44 + %861 = and i1 %831, %860, !dbg !44 + tail call void asm sideeffect "@$2 st.shared.b64 [ $0 + 0 ], $1;", "r,l,b"(ptr addrspace(3) %833, i64 %858, i1 %861) #5, !dbg !44 + tail call void @llvm.nvvm.barrier0(), !dbg !44 + %862 = load i64, ptr addrspace(3) @global_smem, align 4, !dbg !44 + tail call void @llvm.nvvm.barrier0(), !dbg !52 + %863 = insertelement <1 x i64> undef, i64 %862, i64 0, !dbg !52 + store <1 x i64> %863, ptr addrspace(3) @global_smem, align 8, !dbg !52 + tail call void @llvm.nvvm.barrier0(), !dbg !52 + %864 = load i64, ptr addrspace(3) @global_smem, align 8, !dbg !52 + %865 = getelementptr i64, ptr addrspace(1) %4, i64 %18, !dbg !53 + %866 = icmp eq i32 %urem, 0, !dbg !54 + %867 = and i1 %866, %19, !dbg !54 + tail call void asm sideeffect "@$2 st.global.b64 [ $1 + 0 ], { $0 };", "l,l,b"(i64 %864, ptr addrspace(1) %865, i1 %867) #5, !dbg !54 + tail call void @llvm.nvvm.barrier0(), !dbg !55 + %shift292 = shufflevector <8 x float> %773, <8 x float> poison, <8 x i32> , !dbg !57 + %868 = fadd <8 x float> %773, %shift292, !dbg !57 + %shift293 = shufflevector <8 x float> %773, <8 x float> poison, <8 x i32> , !dbg !57 + %869 = fadd <8 x float> %shift293, %868, !dbg !57 + %shift294 = shufflevector <8 x float> %773, <8 x float> poison, <8 x i32> , !dbg !57 + %870 = fadd <8 x float> %shift294, %869, !dbg !57 + %shift295 = shufflevector <8 x float> %773, <8 x float> poison, <8 x i32> , !dbg !57 + %871 = fadd <8 x float> %shift295, %870, !dbg !57 + %shift296 = shufflevector <8 x float> %773, <8 x float> poison, <8 x i32> , !dbg !57 + %872 = fadd <8 x float> %shift296, %871, !dbg !57 + %shift297 = shufflevector <8 x float> %773, <8 x float> poison, <8 x i32> , !dbg !57 + %873 = fadd <8 x float> %shift297, %872, !dbg !57 + %shift298 = shufflevector <8 x float> %773, <8 x float> poison, <8 x i32> , !dbg !57 + %874 = fadd <8 x float> %shift298, %873, !dbg !57 + %875 = extractelement <8 x float> %874, i64 0, !dbg !57 + %876 = bitcast float %875 to i32, !dbg !55 + %877 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %876, i32 16, i32 31), !dbg !55 + %878 = bitcast i32 %877 to float, !dbg !55 + %879 = fadd float %875, %878, !dbg !57 + %880 = bitcast float %879 to i32, !dbg !55 + %881 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %880, i32 8, i32 31), !dbg !55 + %882 = bitcast i32 %881 to float, !dbg !55 + %883 = fadd float %879, %882, !dbg !57 + %884 = bitcast float %883 to i32, !dbg !55 + %885 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %884, i32 4, i32 31), !dbg !55 + %886 = bitcast i32 %885 to float, !dbg !55 + %887 = fadd float %883, %886, !dbg !57 + %888 = bitcast float %887 to i32, !dbg !55 + %889 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %888, i32 2, i32 31), !dbg !55 + %890 = bitcast i32 %889 to float, !dbg !55 + %891 = fadd float %887, %890, !dbg !57 + %892 = bitcast float %891 to i32, !dbg !55 + %893 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %892, i32 1, i32 31), !dbg !55 + %894 = bitcast i32 %893 to float, !dbg !55 + %895 = fadd float %891, %894, !dbg !57 + %896 = getelementptr float, ptr addrspace(3) @global_smem, i64 %829, !dbg !55 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %896, float %895, i1 %828) #5, !dbg !55 + tail call void @llvm.nvvm.barrier0(), !dbg !55 + %897 = getelementptr float, ptr addrspace(3) @global_smem, i64 %832, !dbg !55 + %898 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %897, i1 %831) #5, !dbg !55 + %899 = bitcast float %898 to i32, !dbg !55 + %900 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %899, i32 4, i32 31), !dbg !55 + %901 = bitcast i32 %900 to float, !dbg !55 + %902 = fadd float %898, %901, !dbg !57 + %903 = bitcast float %902 to i32, !dbg !55 + %904 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %903, i32 2, i32 31), !dbg !55 + %905 = bitcast i32 %904 to float, !dbg !55 + %906 = fadd float %902, %905, !dbg !57 + %907 = bitcast float %906 to i32, !dbg !55 + %908 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %907, i32 1, i32 31), !dbg !55 + %909 = bitcast i32 %908 to float, !dbg !55 + %910 = fadd float %906, %909, !dbg !57 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %897, float %910, i1 %861) #5, !dbg !55 + tail call void @llvm.nvvm.barrier0(), !dbg !55 + %911 = load i32, ptr addrspace(3) @global_smem, align 4, !dbg !55 + %912 = getelementptr float, ptr addrspace(1) %5, i64 %18, !dbg !60 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %911, ptr addrspace(1) %912, i1 %867) #5, !dbg !61 + ret void, !dbg !62 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #1 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_logf(float %a) local_unnamed_addr #3 { +__nv_fmaf_rn.exit10.i: + %0 = fcmp olt float %a, 0x3810000000000000 + %1 = fmul float %a, 0x4160000000000000 + %.02 = select i1 %0, float %1, float %a + %i.i.0 = select i1 %0, float -2.300000e+01, float 0.000000e+00 + %2 = bitcast float %.02 to i32 + %3 = add i32 %2, -1059760811 + %4 = and i32 %3, -8388608 + %5 = sub i32 %2, %4 + %6 = bitcast i32 %5 to float + %7 = sitofp i32 %4 to float + %8 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5 + %.not = icmp eq i32 %8, 0 + %9 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %7, float 0x3E80000000000000, float %i.i.0) #5 + %10 = tail call float @llvm.nvvm.fma.rn.f(float %7, float 0x3E80000000000000, float %i.i.0) #5 + %.08 = select i1 %.not, float %10, float %9 + %11 = fadd float %6, -1.000000e+00 + %12 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5 + %.not1 = icmp eq i32 %12, 0 + %13 = tail call float @llvm.nvvm.fma.rn.ftz.f(float 0xBFC0AA04E0000000, float %11, float 0x3FC2073EC0000000) #5 + %14 = tail call float @llvm.nvvm.fma.rn.f(float 0xBFC0AA04E0000000, float %11, float 0x3FC2073EC0000000) #5 + %.010 = select i1 %.not1, float %14, float %13 + %15 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5 + %.not2 = icmp eq i32 %15, 0 + %16 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.010, float %11, float 0xBFBF19B980000000) #5 + %17 = tail call float @llvm.nvvm.fma.rn.f(float %.010, float %11, float 0xBFBF19B980000000) #5 + %.011 = select i1 %.not2, float %17, float %16 + %18 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5 + %.not3 = icmp eq i32 %18, 0 + %19 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.011, float %11, float 0x3FC1E52AA0000000) #5 + %20 = tail call float @llvm.nvvm.fma.rn.f(float %.011, float %11, float 0x3FC1E52AA0000000) #5 + %.012 = select i1 %.not3, float %20, float %19 + %21 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5 + %.not4 = icmp eq i32 %21, 0 + %22 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.012, float %11, float 0xBFC55B1720000000) #5 + %23 = tail call float @llvm.nvvm.fma.rn.f(float %.012, float %11, float 0xBFC55B1720000000) #5 + %.09 = select i1 %.not4, float %23, float %22 + %24 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5 + %.not5 = icmp eq i32 %24, 0 + %25 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.09, float %11, float 0x3FC99DA160000000) #5 + %26 = tail call float @llvm.nvvm.fma.rn.f(float %.09, float %11, float 0x3FC99DA160000000) #5 + %.05 = select i1 %.not5, float %26, float %25 + %27 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5 + %.not6 = icmp eq i32 %27, 0 + %28 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.05, float %11, float 0xBFCFFFE440000000) #5 + %29 = tail call float @llvm.nvvm.fma.rn.f(float %.05, float %11, float 0xBFCFFFE440000000) #5 + %.01 = select i1 %.not6, float %29, float %28 + %30 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5 + %.not7 = icmp eq i32 %30, 0 + %31 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.01, float %11, float 0x3FD5554F00000000) #5 + %32 = tail call float @llvm.nvvm.fma.rn.f(float %.01, float %11, float 0x3FD5554F00000000) #5 + %.0 = select i1 %.not7, float %32, float %31 + %33 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5 + %.not8 = icmp eq i32 %33, 0 + %34 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.0, float %11, float -5.000000e-01) #5 + %35 = tail call float @llvm.nvvm.fma.rn.f(float %.0, float %11, float -5.000000e-01) #5 + %.07 = select i1 %.not8, float %35, float %34 + %36 = fmul float %11, %.07 + %37 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5 + %.not9 = icmp eq i32 %37, 0 + %38 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %36, float %11, float %11) #5 + %39 = tail call float @llvm.nvvm.fma.rn.f(float %36, float %11, float %11) #5 + %.06 = select i1 %.not9, float %39, float %38 + %40 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5 + %.not10 = icmp eq i32 %40, 0 + %41 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.08, float 0x3FE62E4300000000, float %.06) #5 + %42 = tail call float @llvm.nvvm.fma.rn.f(float %.08, float 0x3FE62E4300000000, float %.06) #5 + %.04 = select i1 %.not10, float %42, float %41 + %43 = icmp ugt i32 %2, 2139095039 + br i1 %43, label %__nv_fmaf_rn.exit.i, label %__internal_accurate_logf.exit + +__nv_fmaf_rn.exit.i: ; preds = %__nv_fmaf_rn.exit10.i + %44 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #5 + %.not11 = icmp eq i32 %44, 0 + %45 = tail call float @llvm.nvvm.fma.rn.ftz.f(float %.02, float 0x7FF0000000000000, float 0x7FF0000000000000) #5 + %46 = tail call float @llvm.nvvm.fma.rn.f(float %.02, float 0x7FF0000000000000, float 0x7FF0000000000000) #5 + %.03 = select i1 %.not11, float %46, float %45 + br label %__internal_accurate_logf.exit + +__internal_accurate_logf.exit: ; preds = %__nv_fmaf_rn.exit.i, %__nv_fmaf_rn.exit10.i + %r.i.0 = phi float [ %.03, %__nv_fmaf_rn.exit.i ], [ %.04, %__nv_fmaf_rn.exit10.i ] + %47 = fcmp oeq float %.02, 0.000000e+00 + %r.i.1 = select i1 %47, float 0xFFF0000000000000, float %r.i.0 + ret float %r.i.1 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.ftz.f(float, float, float) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.nvvm.fma.rn.f(float, float, float) #0 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind } +attributes #2 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "culwqy52mqs4o2bmqocf2r5plomw2phviv5gutbxlcpdrdkc46ri.py", directory: "/tmp/torchinductor_root/ul") +!4 = !{ptr @triton__0d1d2d3d4d5d6e7de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5d6e7de, !"maxntidx", i32 256} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6e7de", linkageName: "triton__0d1d2d3d4d5d6e7de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 24, column: 33, scope: !7) +!11 = !DILocation(line: 21, column: 28, scope: !7) +!12 = !DILocation(line: 21, column: 34, scope: !7) +!13 = !DILocation(line: 23, column: 21, scope: !7) +!14 = !DILocation(line: 32, column: 45, scope: !7) +!15 = !DILocation(line: 47, column: 67, scope: !7) +!16 = !DILocation(line: 59, column: 48, scope: !7) +!17 = !DILocation(line: 28, column: 36, scope: !7) +!18 = !DILocation(line: 29, column: 27, scope: !7) +!19 = !DILocation(line: 30, column: 25, scope: !7) +!20 = !DILocation(line: 32, column: 40, scope: !7) +!21 = !DILocation(line: 32, column: 34, scope: !7) +!22 = !DILocation(line: 32, column: 59, scope: !7) +!23 = !DILocation(line: 32, column: 51, scope: !7) +!24 = !DILocation(line: 33, column: 35, scope: !7) +!25 = !DILocation(line: 33, column: 52, scope: !7) +!26 = !DILocation(line: 34, column: 35, scope: !7) +!27 = !DILocation(line: 34, column: 52, scope: !7) +!28 = !DILocation(line: 36, column: 23, scope: !7) +!29 = !DILocation(line: 42, column: 36, scope: !7) +!30 = !DILocation(line: 40, column: 46, scope: !7) +!31 = !DILocation(line: 46, column: 41, scope: !7) +!32 = !DILocation(line: 43, column: 22, scope: !7) +!33 = !DILocation(line: 44, column: 23, scope: !7) +!34 = !DILocation(line: 45, column: 38, scope: !7) +!35 = !DILocation(line: 46, column: 57, scope: !7) +!36 = !DILocation(line: 47, column: 50, scope: !7) +!37 = !DILocation(line: 47, column: 35, scope: !7) +!38 = !DILocation(line: 47, column: 73, scope: !7) +!39 = !DILocation(line: 47, column: 132, scope: !7) +!40 = !DILocation(line: 49, column: 24, scope: !7) +!41 = !DILocation(line: 50, column: 23, scope: !7) +!42 = !DILocation(line: 54, column: 17, scope: !7) +!43 = !DILocation(line: 56, column: 38, scope: !7) +!44 = !DILocation(line: 243, column: 36, scope: !45, inlinedAt: !47) +!45 = distinct !DILexicalBlockFile(scope: !7, file: !46, discriminator: 0) +!46 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!47 = !DILocation(line: 60, column: 25, scope: !45) +!48 = !DILocation(line: 233, column: 15, scope: !49, inlinedAt: !50) +!49 = distinct !DILexicalBlockFile(scope: !45, file: !46, discriminator: 0) +!50 = !DILocation(line: 243, column: 36, scope: !49, inlinedAt: !51) +!51 = !DILocation(line: 60, column: 25, scope: !49) +!52 = !DILocation(line: 60, column: 28, scope: !7) +!53 = !DILocation(line: 61, column: 25, scope: !7) +!54 = !DILocation(line: 61, column: 36, scope: !7) +!55 = !DILocation(line: 243, column: 36, scope: !45, inlinedAt: !56) +!56 = !DILocation(line: 62, column: 27, scope: !45) +!57 = !DILocation(line: 233, column: 15, scope: !49, inlinedAt: !58) +!58 = !DILocation(line: 243, column: 36, scope: !49, inlinedAt: !59) +!59 = !DILocation(line: 62, column: 27, scope: !49) +!60 = !DILocation(line: 63, column: 25, scope: !7) +!61 = !DILocation(line: 63, column: 37, scope: !7) +!62 = !DILocation(line: 63, column: 4, scope: !7) diff --git a/.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttir b/.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..86ef92a6245097c1c78b49c4b528cdf41bd3c2e3 --- /dev/null +++ b/.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttir @@ -0,0 +1,89 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c512_i32 = arith.constant 512 : i32 + %c256_i32 = arith.constant 256 : i32 + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 2.560000e+02 : f32 + %cst_1 = arith.constant 9.99999974E-6 : f32 + %cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32> + %cst_3 = arith.constant dense<256> : tensor<1xi64> + %cst_4 = arith.constant dense<50257> : tensor<1xi64> + %cst_5 = arith.constant dense<0> : tensor<1xi64> + %cst_6 = arith.constant dense<256> : tensor<256xi32> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %2 = arith.cmpi slt, %1, %cst_6 : tensor<256xi32> + %3 = arith.remsi %0, %c512_i32 : i32 + %4 = tt.addptr %arg1, %0 : !tt.ptr, i32 + %5 = tt.splat %4 : (!tt.ptr) -> tensor<1x!tt.ptr> + %6 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64> + %7 = arith.muli %3, %c256_i32 : i32 + %8 = tt.splat %7 : (i32) -> tensor<256xi32> + %9 = arith.addi %1, %8 : tensor<256xi32> + %10 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr> + %11 = tt.addptr %10, %9 : tensor<256x!tt.ptr>, tensor<256xi32> + %12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> + %13 = tt.splat %arg4 : (!tt.ptr) -> tensor<256x!tt.ptr> + %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr>, tensor<256xi32> + %15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> + %16 = arith.addi %6, %cst_4 : tensor<1xi64> + %17 = arith.cmpi slt, %6, %cst_5 : tensor<1xi64> + %18 = arith.select %17, %16, %6 : tensor<1xi1>, tensor<1xi64> + %19 = arith.cmpi sge, %18, %cst_5 : tensor<1xi64> + %20 = arith.cmpi slt, %18, %cst_4 : tensor<1xi64> + %21 = arith.andi %19, %20 : tensor<1xi1> + tt.assert %21, "index out of bounds: 0 <= tmp3 < 50257", "", "_call_with_frames_removed", 883 : tensor<1xi1> + %22 = arith.muli %18, %cst_3 : tensor<1xi64> + %23 = tt.broadcast %22 : (tensor<1xi64>) -> tensor<256xi64> + %24 = arith.extsi %1 : tensor<256xi32> to tensor<256xi64> + %25 = arith.addi %24, %23 : tensor<256xi64> + %26 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr> + %27 = tt.addptr %26, %25 : tensor<256x!tt.ptr>, tensor<256xi64> + %28 = tt.load %27, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %29 = arith.addf %28, %12 : tensor<256xf32> + %30 = arith.select %2, %29, %cst_2 : tensor<256xi1>, tensor<256xf32> + %31 = "tt.reduce"(%30) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %60 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %60 : f32 + }) : (tensor<256xf32>) -> f32 + %32 = arith.addf %31, %cst : f32 + %33 = arith.divf %32, %cst_0 : f32 + %34 = tt.splat %33 : (f32) -> tensor<1xf32> + %35 = tt.splat %33 : (f32) -> tensor<256xf32> + %36 = arith.subf %29, %35 : tensor<256xf32> + %37 = arith.mulf %36, %36 : tensor<256xf32> + %38 = arith.select %2, %37, %cst_2 : tensor<256xi1>, tensor<256xf32> + %39 = "tt.reduce"(%38) <{axis = 0 : i32}> ({ + ^bb0(%arg10: f32, %arg11: f32): + %60 = arith.addf %arg10, %arg11 : f32 + tt.reduce.return %60 : f32 + }) : (tensor<256xf32>) -> f32 + %40 = arith.addf %39, %cst : f32 + %41 = arith.divf %40, %cst_0 : f32 + %42 = arith.addf %41, %cst_1 : f32 + %43 = tt.extern_elementwise %42 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %44 = tt.splat %43 : (f32) -> tensor<1xf32> + %45 = tt.splat %43 : (f32) -> tensor<256xf32> + %46 = arith.mulf %36, %45 : tensor<256xf32> + %47 = arith.mulf %46, %15 : tensor<256xf32> + %48 = arith.muli %0, %c256_i32 : i32 + %49 = tt.splat %48 : (i32) -> tensor<256xi32> + %50 = arith.addi %1, %49 : tensor<256xi32> + %51 = tt.splat %arg5 : (!tt.ptr) -> tensor<256x!tt.ptr> + %52 = tt.addptr %51, %50 : tensor<256x!tt.ptr>, tensor<256xi32> + tt.store %52, %29, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + gpu.barrier + %53 = tt.addptr %arg0, %0 : !tt.ptr, i32 + %54 = tt.splat %53 : (!tt.ptr) -> tensor<1x!tt.ptr> + tt.store %54, %44 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32> + %55 = tt.splat %arg7 : (!tt.ptr) -> tensor<256x!tt.ptr> + %56 = tt.addptr %55, %50 : tensor<256x!tt.ptr>, tensor<256xi32> + %57 = arith.truncf %47 : tensor<256xf32> to tensor<256xbf16> + tt.store %56, %57, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16> + %58 = tt.addptr %arg6, %0 : !tt.ptr, i32 + %59 = tt.splat %58 : (!tt.ptr) -> tensor<1x!tt.ptr> + tt.store %59, %34 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32> + tt.return + } +} diff --git a/.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttgir b/.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..2bd2301e9a3b5eb349019c799bcb251b88fc0250 --- /dev/null +++ b/.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttgir @@ -0,0 +1,18 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<512> : tensor<128xi32, #blocked> + %c128_i32 = arith.constant 128 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c128_i32 : i32 + %2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked> + %3 = tt.splat %1 : (i32) -> tensor<128xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<128xi32, #blocked> + %5 = arith.cmpi slt, %4, %cst : tensor<128xi32, #blocked> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<128x!tt.ptr, #blocked> + %7 = tt.addptr %6, %4 : tensor<128x!tt.ptr, #blocked>, tensor<128xi32, #blocked> + %8 = arith.extsi %4 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked> + tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<128xi64, #blocked> + tt.return + } +} diff --git a/.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttir b/.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..a75947b0f6f2cf2e2189442beac7fe831ec6e19d --- /dev/null +++ b/.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttir @@ -0,0 +1,17 @@ +module { + tt.func public @triton__0d1de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<512> : tensor<256xi32> + %c256_i32 = arith.constant 256 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c256_i32 : i32 + %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %3 = tt.splat %1 : (i32) -> tensor<256xi32> + %4 = arith.addi %3, %2 : tensor<256xi32> + %5 = arith.cmpi slt, %4, %cst : tensor<256xi32> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr> + %7 = tt.addptr %6, %4 : tensor<256x!tt.ptr>, tensor<256xi32> + %8 = arith.extsi %4 : tensor<256xi32> to tensor<256xi64> + tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<256xi64> + tt.return + } +} diff --git a/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..e7d92bdaf36a0219dbb4ae6781b8e373cb122b90 Binary files /dev/null and b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin differ diff --git a/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ptx b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..24dcef2f99da30f8c2c81b89268719ce32694c66 --- /dev/null +++ b/.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ptx @@ -0,0 +1,861 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2de +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2de( + .param .u64 triton__0d1d2de_param_0, + .param .u64 triton__0d1d2de_param_1, + .param .u32 triton__0d1d2de_param_2 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<28>; + .reg .b16 %rs<25>; + .reg .b32 %r<79>; + .reg .f32 %f<487>; + .reg .b64 %rd<8>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd4, [triton__0d1d2de_param_0]; + ld.param.u64 %rd5, [triton__0d1d2de_param_1]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r26, %tid.x; + shl.b32 %r27, %r26, 3; + and.b32 %r28, %r27, 1016; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r29, %r1, 10; + .loc 1 21 23 + or.b32 %r30, %r29, %r28; + .loc 1 24 34 + mul.wide.s32 %rd6, %r30, 2; + add.s64 %rd7, %rd4, %rd6; + mov.pred %p1, -1; + .loc 1 24 39 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd7 + 0 ]; + .loc 1 25 30 + add.s64 %rd3, %rd5, %rd6; + .loc 1 25 35 + mov.u32 %r14, 0x0; + mov.u32 %r15, 0x0; + mov.u32 %r16, 0x0; + mov.u32 %r17, 0x0; + @%p1 ld.global.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd3 + 0 ]; + cvt.u16.u32 %rs9, %r14; + { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r14; } + cvt.u16.u32 %rs11, %r15; + .loc 1 25 44 + cvt.f32.bf16 %r18, %rs9; + mov.b32 %f9, %r18; + cvt.f32.bf16 %r19, %rs10; + mov.b32 %f10, %r19; + .loc 1 29 18 + mul.f32 %f17, %f9, 0f3F3504F3; + .loc 1 30 23 + abs.ftz.f32 %f25, %f17; + setp.ge.f32 %p3, %f25, 0f3F8060FE; + mov.f32 %f421, 0f3789CA3C; + mov.f32 %f420, 0fB9F560B9; + mov.f32 %f419, 0f3BAC840B; + mov.f32 %f418, 0fBD0C8162; + mov.f32 %f417, 0f3E1CF906; + mov.f32 %f416, 0f3F6A937E; + mov.f32 %f415, 0f3F20D842; + mov.f32 %f422, %f25; + @%p3 bra $L__BB0_2; + .loc 1 0 23 + mov.f32 %f421, 0f38B1E96A; + mov.f32 %f420, 0fBA574D20; + mov.f32 %f419, 0f3BAAD5EA; + mov.f32 %f418, 0fBCDC1BE7; + mov.f32 %f417, 0f3DE718AF; + mov.f32 %f416, 0fBEC093AC; + mov.f32 %f415, 0f3E0375D3; + .loc 1 30 23 + mul.f32 %f422, %f17, %f17; +$L__BB0_2: + .loc 1 0 0 + cvt.f32.bf16 %r20, %rs11; + mul.f32 %f18, %f10, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p4, %f25, 0f3F8060FE; + fma.rn.ftz.f32 %f143, %f421, %f422, %f420; + fma.rn.ftz.f32 %f144, %f143, %f422, %f419; + fma.rn.ftz.f32 %f145, %f144, %f422, %f418; + fma.rn.ftz.f32 %f146, %f145, %f422, %f417; + fma.rn.ftz.f32 %f147, %f146, %f422, %f416; + fma.rn.ftz.f32 %f148, %f147, %f422, %f415; + neg.f32 %f149, %f422; + selp.f32 %f150, %f149, %f17, %p3; + fma.rn.ftz.f32 %f423, %f148, %f150, %f150; + mov.f32 %f414, 0f3F800000; + @%p4 bra $L__BB0_4; + ex2.approx.ftz.f32 %f151, %f423; + sub.f32 %f153, %f414, %f151; + mov.b32 %r31, %f153; + mov.b32 %r32, %f17; + and.b32 %r33, %r32, -2147483648; + or.b32 %r34, %r33, %r31; + mov.b32 %f423, %r34; +$L__BB0_4: + .loc 1 0 0 + { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r15; } + mov.b32 %f11, %r20; + .loc 1 30 23 + abs.ftz.f32 %f38, %f18; + setp.ge.f32 %p6, %f38, 0f3F8060FE; + mov.f32 %f430, 0f3789CA3C; + mov.f32 %f429, 0fB9F560B9; + mov.f32 %f428, 0f3BAC840B; + mov.f32 %f427, 0fBD0C8162; + mov.f32 %f426, 0f3E1CF906; + mov.f32 %f425, 0f3F6A937E; + mov.f32 %f424, 0f3F20D842; + mov.f32 %f431, %f38; + @%p6 bra $L__BB0_6; + mul.f32 %f431, %f18, %f18; + mov.f32 %f430, 0f38B1E96A; + mov.f32 %f429, 0fBA574D20; + mov.f32 %f428, 0f3BAAD5EA; + mov.f32 %f427, 0fBCDC1BE7; + mov.f32 %f426, 0f3DE718AF; + mov.f32 %f425, 0fBEC093AC; + mov.f32 %f424, 0f3E0375D3; +$L__BB0_6: + .loc 1 0 0 + cvt.f32.bf16 %r21, %rs12; + mul.f32 %f19, %f11, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p7, %f38, 0f3F8060FE; + fma.rn.ftz.f32 %f168, %f430, %f431, %f429; + fma.rn.ftz.f32 %f169, %f168, %f431, %f428; + fma.rn.ftz.f32 %f170, %f169, %f431, %f427; + fma.rn.ftz.f32 %f171, %f170, %f431, %f426; + fma.rn.ftz.f32 %f172, %f171, %f431, %f425; + fma.rn.ftz.f32 %f173, %f172, %f431, %f424; + neg.f32 %f174, %f431; + selp.f32 %f175, %f174, %f18, %p6; + fma.rn.ftz.f32 %f432, %f173, %f175, %f175; + @%p7 bra $L__BB0_8; + ex2.approx.ftz.f32 %f176, %f432; + sub.f32 %f178, %f414, %f176; + mov.b32 %r35, %f178; + mov.b32 %r36, %f18; + and.b32 %r37, %r36, -2147483648; + or.b32 %r38, %r37, %r35; + mov.b32 %f432, %r38; +$L__BB0_8: + .loc 1 0 0 + cvt.u16.u32 %rs13, %r16; + mov.b32 %f12, %r21; + .loc 1 30 23 + abs.ftz.f32 %f51, %f19; + setp.ge.f32 %p9, %f51, 0f3F8060FE; + mov.f32 %f439, 0f3789CA3C; + mov.f32 %f438, 0fB9F560B9; + mov.f32 %f437, 0f3BAC840B; + mov.f32 %f436, 0fBD0C8162; + mov.f32 %f435, 0f3E1CF906; + mov.f32 %f434, 0f3F6A937E; + mov.f32 %f433, 0f3F20D842; + mov.f32 %f440, %f51; + @%p9 bra $L__BB0_10; + mul.f32 %f440, %f19, %f19; + mov.f32 %f439, 0f38B1E96A; + mov.f32 %f438, 0fBA574D20; + mov.f32 %f437, 0f3BAAD5EA; + mov.f32 %f436, 0fBCDC1BE7; + mov.f32 %f435, 0f3DE718AF; + mov.f32 %f434, 0fBEC093AC; + mov.f32 %f433, 0f3E0375D3; +$L__BB0_10: + .loc 1 0 0 + cvt.f32.bf16 %r22, %rs13; + mul.f32 %f20, %f12, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p10, %f51, 0f3F8060FE; + fma.rn.ftz.f32 %f193, %f439, %f440, %f438; + fma.rn.ftz.f32 %f194, %f193, %f440, %f437; + fma.rn.ftz.f32 %f195, %f194, %f440, %f436; + fma.rn.ftz.f32 %f196, %f195, %f440, %f435; + fma.rn.ftz.f32 %f197, %f196, %f440, %f434; + fma.rn.ftz.f32 %f198, %f197, %f440, %f433; + neg.f32 %f199, %f440; + selp.f32 %f200, %f199, %f19, %p9; + fma.rn.ftz.f32 %f441, %f198, %f200, %f200; + @%p10 bra $L__BB0_12; + ex2.approx.ftz.f32 %f201, %f441; + sub.f32 %f203, %f414, %f201; + mov.b32 %r39, %f203; + mov.b32 %r40, %f19; + and.b32 %r41, %r40, -2147483648; + or.b32 %r42, %r41, %r39; + mov.b32 %f441, %r42; +$L__BB0_12: + .loc 1 0 0 + { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r16; } + mov.b32 %f13, %r22; + .loc 1 30 23 + abs.ftz.f32 %f64, %f20; + setp.ge.f32 %p12, %f64, 0f3F8060FE; + mov.f32 %f448, 0f3789CA3C; + mov.f32 %f447, 0fB9F560B9; + mov.f32 %f446, 0f3BAC840B; + mov.f32 %f445, 0fBD0C8162; + mov.f32 %f444, 0f3E1CF906; + mov.f32 %f443, 0f3F6A937E; + mov.f32 %f442, 0f3F20D842; + mov.f32 %f449, %f64; + @%p12 bra $L__BB0_14; + mul.f32 %f449, %f20, %f20; + mov.f32 %f448, 0f38B1E96A; + mov.f32 %f447, 0fBA574D20; + mov.f32 %f446, 0f3BAAD5EA; + mov.f32 %f445, 0fBCDC1BE7; + mov.f32 %f444, 0f3DE718AF; + mov.f32 %f443, 0fBEC093AC; + mov.f32 %f442, 0f3E0375D3; +$L__BB0_14: + .loc 1 0 0 + cvt.f32.bf16 %r23, %rs14; + mul.f32 %f21, %f13, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p13, %f64, 0f3F8060FE; + fma.rn.ftz.f32 %f218, %f448, %f449, %f447; + fma.rn.ftz.f32 %f219, %f218, %f449, %f446; + fma.rn.ftz.f32 %f220, %f219, %f449, %f445; + fma.rn.ftz.f32 %f221, %f220, %f449, %f444; + fma.rn.ftz.f32 %f222, %f221, %f449, %f443; + fma.rn.ftz.f32 %f223, %f222, %f449, %f442; + neg.f32 %f224, %f449; + selp.f32 %f225, %f224, %f20, %p12; + fma.rn.ftz.f32 %f450, %f223, %f225, %f225; + @%p13 bra $L__BB0_16; + ex2.approx.ftz.f32 %f226, %f450; + sub.f32 %f228, %f414, %f226; + mov.b32 %r43, %f228; + mov.b32 %r44, %f20; + and.b32 %r45, %r44, -2147483648; + or.b32 %r46, %r45, %r43; + mov.b32 %f450, %r46; +$L__BB0_16: + .loc 1 0 0 + cvt.u16.u32 %rs15, %r17; + mov.b32 %f14, %r23; + .loc 1 30 23 + abs.ftz.f32 %f77, %f21; + setp.ge.f32 %p15, %f77, 0f3F8060FE; + mov.f32 %f457, 0f3789CA3C; + mov.f32 %f456, 0fB9F560B9; + mov.f32 %f455, 0f3BAC840B; + mov.f32 %f454, 0fBD0C8162; + mov.f32 %f453, 0f3E1CF906; + mov.f32 %f452, 0f3F6A937E; + mov.f32 %f451, 0f3F20D842; + mov.f32 %f458, %f77; + @%p15 bra $L__BB0_18; + mul.f32 %f458, %f21, %f21; + mov.f32 %f457, 0f38B1E96A; + mov.f32 %f456, 0fBA574D20; + mov.f32 %f455, 0f3BAAD5EA; + mov.f32 %f454, 0fBCDC1BE7; + mov.f32 %f453, 0f3DE718AF; + mov.f32 %f452, 0fBEC093AC; + mov.f32 %f451, 0f3E0375D3; +$L__BB0_18: + .loc 1 0 0 + cvt.f32.bf16 %r24, %rs15; + mul.f32 %f22, %f14, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p16, %f77, 0f3F8060FE; + fma.rn.ftz.f32 %f243, %f457, %f458, %f456; + fma.rn.ftz.f32 %f244, %f243, %f458, %f455; + fma.rn.ftz.f32 %f245, %f244, %f458, %f454; + fma.rn.ftz.f32 %f246, %f245, %f458, %f453; + fma.rn.ftz.f32 %f247, %f246, %f458, %f452; + fma.rn.ftz.f32 %f248, %f247, %f458, %f451; + neg.f32 %f249, %f458; + selp.f32 %f250, %f249, %f21, %p15; + fma.rn.ftz.f32 %f459, %f248, %f250, %f250; + @%p16 bra $L__BB0_20; + ex2.approx.ftz.f32 %f251, %f459; + sub.f32 %f253, %f414, %f251; + mov.b32 %r47, %f253; + mov.b32 %r48, %f21; + and.b32 %r49, %r48, -2147483648; + or.b32 %r50, %r49, %r47; + mov.b32 %f459, %r50; +$L__BB0_20: + .loc 1 0 0 + { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r17; } + mov.b32 %f15, %r24; + .loc 1 30 23 + abs.ftz.f32 %f90, %f22; + setp.ge.f32 %p18, %f90, 0f3F8060FE; + mov.f32 %f466, 0f3789CA3C; + mov.f32 %f465, 0fB9F560B9; + mov.f32 %f464, 0f3BAC840B; + mov.f32 %f463, 0fBD0C8162; + mov.f32 %f462, 0f3E1CF906; + mov.f32 %f461, 0f3F6A937E; + mov.f32 %f460, 0f3F20D842; + mov.f32 %f467, %f90; + @%p18 bra $L__BB0_22; + mul.f32 %f467, %f22, %f22; + mov.f32 %f466, 0f38B1E96A; + mov.f32 %f465, 0fBA574D20; + mov.f32 %f464, 0f3BAAD5EA; + mov.f32 %f463, 0fBCDC1BE7; + mov.f32 %f462, 0f3DE718AF; + mov.f32 %f461, 0fBEC093AC; + mov.f32 %f460, 0f3E0375D3; +$L__BB0_22: + .loc 1 0 0 + cvt.f32.bf16 %r25, %rs16; + mul.f32 %f23, %f15, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p19, %f90, 0f3F8060FE; + fma.rn.ftz.f32 %f268, %f466, %f467, %f465; + fma.rn.ftz.f32 %f269, %f268, %f467, %f464; + fma.rn.ftz.f32 %f270, %f269, %f467, %f463; + fma.rn.ftz.f32 %f271, %f270, %f467, %f462; + fma.rn.ftz.f32 %f272, %f271, %f467, %f461; + fma.rn.ftz.f32 %f273, %f272, %f467, %f460; + neg.f32 %f274, %f467; + selp.f32 %f275, %f274, %f22, %p18; + fma.rn.ftz.f32 %f468, %f273, %f275, %f275; + @%p19 bra $L__BB0_24; + ex2.approx.ftz.f32 %f276, %f468; + sub.f32 %f278, %f414, %f276; + mov.b32 %r51, %f278; + mov.b32 %r52, %f22; + and.b32 %r53, %r52, -2147483648; + or.b32 %r54, %r53, %r51; + mov.b32 %f468, %r54; +$L__BB0_24: + .loc 1 0 0 + mov.b32 %f16, %r25; + .loc 1 30 23 + abs.ftz.f32 %f103, %f23; + setp.ge.f32 %p21, %f103, 0f3F8060FE; + mov.f32 %f475, 0f3789CA3C; + mov.f32 %f474, 0fB9F560B9; + mov.f32 %f473, 0f3BAC840B; + mov.f32 %f472, 0fBD0C8162; + mov.f32 %f471, 0f3E1CF906; + mov.f32 %f470, 0f3F6A937E; + mov.f32 %f469, 0f3F20D842; + mov.f32 %f476, %f103; + @%p21 bra $L__BB0_26; + mul.f32 %f476, %f23, %f23; + mov.f32 %f475, 0f38B1E96A; + mov.f32 %f474, 0fBA574D20; + mov.f32 %f473, 0f3BAAD5EA; + mov.f32 %f472, 0fBCDC1BE7; + mov.f32 %f471, 0f3DE718AF; + mov.f32 %f470, 0fBEC093AC; + mov.f32 %f469, 0f3E0375D3; +$L__BB0_26: + .loc 1 0 0 + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + cvt.u16.u32 %rs3, %r3; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; } + cvt.u16.u32 %rs5, %r4; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; } + cvt.u16.u32 %rs7, %r5; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; } + mul.f32 %f24, %f16, 0f3F3504F3; + .loc 1 30 23 + setp.ltu.f32 %p22, %f103, 0f3F8060FE; + fma.rn.ftz.f32 %f293, %f475, %f476, %f474; + fma.rn.ftz.f32 %f294, %f293, %f476, %f473; + fma.rn.ftz.f32 %f295, %f294, %f476, %f472; + fma.rn.ftz.f32 %f296, %f295, %f476, %f471; + fma.rn.ftz.f32 %f297, %f296, %f476, %f470; + fma.rn.ftz.f32 %f298, %f297, %f476, %f469; + neg.f32 %f299, %f476; + selp.f32 %f300, %f299, %f23, %p21; + fma.rn.ftz.f32 %f477, %f298, %f300, %f300; + @%p22 bra $L__BB0_28; + ex2.approx.ftz.f32 %f301, %f477; + sub.f32 %f303, %f414, %f301; + mov.b32 %r55, %f303; + mov.b32 %r56, %f23; + and.b32 %r57, %r56, -2147483648; + or.b32 %r58, %r57, %r55; + mov.b32 %f477, %r58; +$L__BB0_28: + .loc 1 0 0 + cvt.f32.bf16 %r6, %rs1; + cvt.f32.bf16 %r7, %rs2; + cvt.f32.bf16 %r8, %rs3; + cvt.f32.bf16 %r9, %rs4; + cvt.f32.bf16 %r10, %rs5; + cvt.f32.bf16 %r11, %rs6; + cvt.f32.bf16 %r12, %rs7; + cvt.f32.bf16 %r13, %rs8; + .loc 1 30 23 + abs.ftz.f32 %f116, %f24; + setp.ge.f32 %p24, %f116, 0f3F8060FE; + mov.f32 %f484, 0f3789CA3C; + mov.f32 %f483, 0fB9F560B9; + mov.f32 %f482, 0f3BAC840B; + mov.f32 %f481, 0fBD0C8162; + mov.f32 %f480, 0f3E1CF906; + mov.f32 %f479, 0f3F6A937E; + mov.f32 %f478, 0f3F20D842; + mov.f32 %f485, %f116; + @%p24 bra $L__BB0_30; + mul.f32 %f485, %f24, %f24; + mov.f32 %f484, 0f38B1E96A; + mov.f32 %f483, 0fBA574D20; + mov.f32 %f482, 0f3BAAD5EA; + mov.f32 %f481, 0fBCDC1BE7; + mov.f32 %f480, 0f3DE718AF; + mov.f32 %f479, 0fBEC093AC; + mov.f32 %f478, 0f3E0375D3; +$L__BB0_30: + .loc 1 0 0 + mov.b32 %f1, %r6; + mov.b32 %f2, %r7; + mov.b32 %f3, %r8; + mov.b32 %f4, %r9; + mov.b32 %f5, %r10; + mov.b32 %f6, %r11; + mov.b32 %f7, %r12; + mov.b32 %f8, %r13; + .loc 1 30 23 + setp.ltu.f32 %p25, %f116, 0f3F8060FE; + fma.rn.ftz.f32 %f318, %f484, %f485, %f483; + fma.rn.ftz.f32 %f319, %f318, %f485, %f482; + fma.rn.ftz.f32 %f320, %f319, %f485, %f481; + fma.rn.ftz.f32 %f321, %f320, %f485, %f480; + fma.rn.ftz.f32 %f322, %f321, %f485, %f479; + fma.rn.ftz.f32 %f323, %f322, %f485, %f478; + neg.f32 %f324, %f485; + selp.f32 %f325, %f324, %f24, %p24; + fma.rn.ftz.f32 %f486, %f323, %f325, %f325; + @%p25 bra $L__BB0_32; + ex2.approx.ftz.f32 %f326, %f486; + sub.f32 %f328, %f414, %f326; + mov.b32 %r59, %f328; + mov.b32 %r60, %f24; + and.b32 %r61, %r60, -2147483648; + or.b32 %r62, %r61, %r59; + mov.b32 %f486, %r62; +$L__BB0_32: + .loc 1 32 18 + add.f32 %f345, %f423, 0f3F800000; + add.f32 %f346, %f432, 0f3F800000; + add.f32 %f347, %f441, 0f3F800000; + add.f32 %f348, %f450, 0f3F800000; + add.f32 %f349, %f459, 0f3F800000; + add.f32 %f350, %f468, 0f3F800000; + add.f32 %f351, %f477, 0f3F800000; + add.f32 %f352, %f486, 0f3F800000; + .loc 1 35 19 + mul.f32 %f353, %f9, %f9; + mul.f32 %f354, %f10, %f10; + mul.f32 %f355, %f11, %f11; + mul.f32 %f356, %f12, %f12; + mul.f32 %f357, %f13, %f13; + mul.f32 %f358, %f14, %f14; + mul.f32 %f359, %f15, %f15; + mul.f32 %f360, %f16, %f16; + .loc 1 37 20 + mul.f32 %f361, %f353, 0fBF000000; + mul.f32 %f362, %f354, 0fBF000000; + mul.f32 %f363, %f355, 0fBF000000; + mul.f32 %f364, %f356, 0fBF000000; + mul.f32 %f365, %f357, 0fBF000000; + mul.f32 %f366, %f358, 0fBF000000; + mul.f32 %f367, %f359, 0fBF000000; + mul.f32 %f368, %f360, 0fBF000000; + .loc 1 38 19 + mul.f32 %f330, %f361, 0f3FB8AA3B; + ex2.approx.f32 %f329, %f330; + mul.f32 %f332, %f362, 0f3FB8AA3B; + ex2.approx.f32 %f331, %f332; + mul.f32 %f334, %f363, 0f3FB8AA3B; + ex2.approx.f32 %f333, %f334; + mul.f32 %f336, %f364, 0f3FB8AA3B; + ex2.approx.f32 %f335, %f336; + mul.f32 %f338, %f365, 0f3FB8AA3B; + ex2.approx.f32 %f337, %f338; + mul.f32 %f340, %f366, 0f3FB8AA3B; + ex2.approx.f32 %f339, %f340; + mul.f32 %f342, %f367, 0f3FB8AA3B; + ex2.approx.f32 %f341, %f342; + mul.f32 %f344, %f368, 0f3FB8AA3B; + ex2.approx.f32 %f343, %f344; + .loc 1 40 20 + mul.f32 %f369, %f329, 0f3ECC422A; + mul.f32 %f370, %f331, 0f3ECC422A; + mul.f32 %f371, %f333, 0f3ECC422A; + mul.f32 %f372, %f335, 0f3ECC422A; + mul.f32 %f373, %f337, 0f3ECC422A; + mul.f32 %f374, %f339, 0f3ECC422A; + mul.f32 %f375, %f341, 0f3ECC422A; + mul.f32 %f376, %f343, 0f3ECC422A; + .loc 1 41 19 + mul.f32 %f377, %f9, %f369; + mul.f32 %f378, %f10, %f370; + mul.f32 %f379, %f11, %f371; + mul.f32 %f380, %f12, %f372; + mul.f32 %f381, %f13, %f373; + mul.f32 %f382, %f14, %f374; + mul.f32 %f383, %f15, %f375; + mul.f32 %f384, %f16, %f376; + .loc 1 42 20 + fma.rn.f32 %f385, %f345, 0f3F000000, %f377; + fma.rn.f32 %f386, %f346, 0f3F000000, %f378; + fma.rn.f32 %f387, %f347, 0f3F000000, %f379; + fma.rn.f32 %f388, %f348, 0f3F000000, %f380; + fma.rn.f32 %f389, %f349, 0f3F000000, %f381; + fma.rn.f32 %f390, %f350, 0f3F000000, %f382; + fma.rn.f32 %f391, %f351, 0f3F000000, %f383; + fma.rn.f32 %f392, %f352, 0f3F000000, %f384; + .loc 1 43 19 + mul.f32 %f393, %f1, %f385; + mul.f32 %f394, %f2, %f386; + mul.f32 %f395, %f3, %f387; + mul.f32 %f396, %f4, %f388; + mul.f32 %f397, %f5, %f389; + mul.f32 %f398, %f6, %f390; + mul.f32 %f399, %f7, %f391; + mul.f32 %f400, %f8, %f392; + .loc 1 45 40 + mov.b32 %r63, %f393; + cvt.rn.bf16.f32 %rs17, %r63; + mov.b32 %r64, %f394; + cvt.rn.bf16.f32 %rs18, %r64; + mov.b32 %r65, %f395; + cvt.rn.bf16.f32 %rs19, %r65; + mov.b32 %r66, %f396; + cvt.rn.bf16.f32 %rs20, %r66; + mov.b32 %r67, %f397; + cvt.rn.bf16.f32 %rs21, %r67; + mov.b32 %r68, %f398; + cvt.rn.bf16.f32 %rs22, %r68; + mov.b32 %r69, %f399; + cvt.rn.bf16.f32 %rs23, %r69; + mov.b32 %r70, %f400; + cvt.rn.bf16.f32 %rs24, %r70; + mov.b32 %r75, {%rs17, %rs18}; + mov.b32 %r76, {%rs19, %rs20}; + mov.b32 %r77, {%rs21, %rs22}; + mov.b32 %r78, {%rs23, %rs24}; + @%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r75, %r76, %r77, %r78 }; + .loc 1 45 4 + ret; +$L__tmp1: +$L__func_end0: + +} + // .globl __nv_erff +.visible .func (.param .b32 func_retval0) __nv_erff( + .param .b32 __nv_erff_param_0 +) +{ + .reg .pred %p<4>; + .reg .b32 %r<5>; + .reg .f32 %f<49>; +$L__func_begin1: + + ld.param.f32 %f14, [__nv_erff_param_0]; + abs.ftz.f32 %f1, %f14; + setp.ge.f32 %p1, %f1, 0f3F8060FE; + mov.f32 %f46, 0f3789CA3C; + mov.f32 %f45, 0fB9F560B9; + mov.f32 %f44, 0f3BAC840B; + mov.f32 %f43, 0fBD0C8162; + mov.f32 %f42, 0f3E1CF906; + mov.f32 %f41, 0f3F6A937E; + mov.f32 %f40, 0f3F20D842; + mov.f32 %f47, %f1; + @%p1 bra $L__BB1_2; + mul.f32 %f47, %f14, %f14; + mov.f32 %f46, 0f38B1E96A; + mov.f32 %f45, 0fBA574D20; + mov.f32 %f44, 0f3BAAD5EA; + mov.f32 %f43, 0fBCDC1BE7; + mov.f32 %f42, 0f3DE718AF; + mov.f32 %f41, 0fBEC093AC; + mov.f32 %f40, 0f3E0375D3; +$L__BB1_2: + setp.ltu.f32 %p2, %f1, 0f3F8060FE; + fma.rn.ftz.f32 %f29, %f46, %f47, %f45; + fma.rn.ftz.f32 %f30, %f29, %f47, %f44; + fma.rn.ftz.f32 %f31, %f30, %f47, %f43; + fma.rn.ftz.f32 %f32, %f31, %f47, %f42; + fma.rn.ftz.f32 %f33, %f32, %f47, %f41; + fma.rn.ftz.f32 %f34, %f33, %f47, %f40; + neg.f32 %f35, %f47; + selp.f32 %f36, %f35, %f14, %p1; + fma.rn.ftz.f32 %f48, %f34, %f36, %f36; + @%p2 bra $L__BB1_4; + ex2.approx.ftz.f32 %f37, %f48; + mov.f32 %f38, 0f3F800000; + sub.f32 %f39, %f38, %f37; + mov.b32 %r1, %f39; + mov.b32 %r2, %f14; + and.b32 %r3, %r2, -2147483648; + or.b32 %r4, %r3, %r1; + mov.b32 %f48, %r4; +$L__BB1_4: + st.param.f32 [func_retval0+0], %f48; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/5j/c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 176 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 53 +.b8 106 +.b8 120 +.b8 97 +.b8 103 +.b8 117 +.b8 120 +.b8 104 +.b8 111 +.b8 51 +.b8 110 +.b8 104 +.b8 114 +.b8 108 +.b8 116 +.b8 53 +.b8 118 +.b8 99 +.b8 105 +.b8 110 +.b8 110 +.b8 122 +.b8 53 +.b8 102 +.b8 101 +.b8 118 +.b8 111 +.b8 100 +.b8 117 +.b8 109 +.b8 108 +.b8 112 +.b8 119 +.b8 110 +.b8 52 +.b8 119 +.b8 121 +.b8 98 +.b8 50 +.b8 118 +.b8 120 +.b8 51 +.b8 120 +.b8 114 +.b8 118 +.b8 101 +.b8 105 +.b8 99 +.b8 101 +.b8 114 +.b8 108 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 53 +.b8 106 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.cubin b/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..3a6f2b53568e3bb7360a3f0c2dc820c6f83697ad Binary files /dev/null and b/.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.cubin differ diff --git a/.triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ttir b/.triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..e0db3cd413022c0143329a8eb6b0fb39287ccd28 --- /dev/null +++ b/.triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ttir @@ -0,0 +1,18 @@ +module { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c256_i32 : i32 + %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %3 = tt.splat %1 : (i32) -> tensor<256xi32> + %4 = arith.addi %3, %2 : tensor<256xi32> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr> + %6 = tt.addptr %5, %4 : tensor<256x!tt.ptr>, tensor<256xi32> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %8 = arith.extf %7 : tensor<256xbf16> to tensor<256xf32> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr> + %10 = tt.addptr %9, %4 : tensor<256x!tt.ptr>, tensor<256xi32> + tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + tt.return + } +} diff --git a/.triton/dump/a69784da01a97187168f22847465505f/triton_.llir b/.triton/dump/a69784da01a97187168f22847465505f/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..3b86f3776d3b626db1731054927f55d804770a98 --- /dev/null +++ b/.triton/dump/a69784da01a97187168f22847465505f/triton_.llir @@ -0,0 +1,324 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] +@.str = private unnamed_addr constant [11 x i8] c"__CUDA_FTZ\00", align 1 + +define void @triton__0d1d2d3d4d5d6d7de8de(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i32 %7, i32 %8) local_unnamed_addr !dbg !7 { + %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10 + %11 = and i32 %10, 31, !dbg !10 + %12 = lshr i32 %10, 5, !dbg !10 + %13 = and i32 %12, 1, !dbg !10 + %urem = and i32 %10, 63, !dbg !10 + %14 = shl nuw nsw i32 %urem, 2, !dbg !10 + %15 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #6, !dbg !11 + %16 = shl i32 %15, 8, !dbg !12 + %17 = or i32 %16, %14, !dbg !13 + %18 = sext i32 %17 to i64, !dbg !14 + %19 = getelementptr float, ptr addrspace(1) %1, i64 %18, !dbg !14 + %20 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %19, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !15 + %21 = extractvalue { i32, i32, i32, i32 } %20, 0, !dbg !15 + %22 = extractvalue { i32, i32, i32, i32 } %20, 1, !dbg !15 + %23 = extractvalue { i32, i32, i32, i32 } %20, 2, !dbg !15 + %24 = extractvalue { i32, i32, i32, i32 } %20, 3, !dbg !15 + %25 = bitcast i32 %23 to float, !dbg !15 + %26 = bitcast i32 %24 to float, !dbg !15 + %27 = getelementptr i16, ptr addrspace(1) %2, i64 %18, !dbg !16 + %28 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %27, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !17 + %29 = extractvalue { i32, i32 } %28, 0, !dbg !17 + %30 = extractvalue { i32, i32 } %28, 1, !dbg !17 + %31 = trunc i32 %29 to i16, !dbg !17 + %extelt.offset = lshr i32 %29, 16, !dbg !17 + %32 = trunc i32 %extelt.offset to i16, !dbg !17 + %33 = trunc i32 %30 to i16, !dbg !17 + %extelt.offset1 = lshr i32 %30, 16, !dbg !17 + %34 = trunc i32 %extelt.offset1 to i16, !dbg !17 + %35 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %31) #6, !dbg !18 + %36 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %32) #6, !dbg !18 + %37 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %33) #6, !dbg !18 + %38 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %34) #6, !dbg !18 + %39 = getelementptr i16, ptr addrspace(1) %3, i64 %18, !dbg !19 + %40 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];\0A\09@!$5 mov.u32 $0, $4;\0A\09@!$7 mov.u32 $1, $6;", "=r,=r,l,b,r,b,r,b"(ptr addrspace(1) %39, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !20 + %41 = extractvalue { i32, i32 } %40, 0, !dbg !20 + %42 = extractvalue { i32, i32 } %40, 1, !dbg !20 + %43 = trunc i32 %41 to i16, !dbg !20 + %extelt.offset2 = lshr i32 %41, 16, !dbg !20 + %44 = trunc i32 %extelt.offset2 to i16, !dbg !20 + %45 = trunc i32 %42 to i16, !dbg !20 + %extelt.offset3 = lshr i32 %42, 16, !dbg !20 + %46 = trunc i32 %extelt.offset3 to i16, !dbg !20 + %47 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %43) #6, !dbg !21 + %48 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %44) #6, !dbg !21 + %49 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %45) #6, !dbg !21 + %50 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %46) #6, !dbg !21 + %51 = zext nneg i32 %14 to i64, !dbg !22 + %52 = getelementptr float, ptr addrspace(1) %4, i64 %51, !dbg !22 + %53 = tail call { i32, i32, i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09mov.u32 $2, 0x0;\0A\09mov.u32 $3, 0x0;\0A\09@$5 ld.global.L1::evict_last.v4.b32 { $0, $1, $2, $3 }, [ $4 + 0 ];\0A\09@!$7 mov.u32 $0, $6;\0A\09@!$9 mov.u32 $1, $8;\0A\09@!$11 mov.u32 $2, $10;\0A\09@!$13 mov.u32 $3, $12;", "=r,=r,=r,=r,l,b,r,b,r,b,r,b,r,b"(ptr addrspace(1) %52, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true, i32 0, i1 true) #6, !dbg !23 + %54 = fadd float %37, %25, !dbg !24 + %55 = fadd float %38, %26, !dbg !24 + %56 = insertelement <2 x i32> poison, i32 %21, i64 0, !dbg !15 + %57 = insertelement <2 x i32> %56, i32 %22, i64 1, !dbg !15 + %58 = bitcast <2 x i32> %57 to <2 x float>, !dbg !15 + %59 = insertelement <2 x float> poison, float %35, i64 0, !dbg !24 + %60 = insertelement <2 x float> %59, float %36, i64 1, !dbg !24 + %61 = fadd <2 x float> %60, %58, !dbg !24 + %62 = insertelement <2 x float> poison, float %47, i64 0, !dbg !25 + %63 = insertelement <2 x float> %62, float %48, i64 1, !dbg !25 + %64 = fadd <2 x float> %61, %63, !dbg !25 + %65 = fadd float %54, %49, !dbg !25 + %66 = fadd float %55, %50, !dbg !25 + %67 = extractelement <2 x float> %64, i64 0, !dbg !26 + %68 = extractelement <2 x float> %64, i64 1, !dbg !26 + %69 = fadd float %67, %68, !dbg !26 + %70 = fadd float %69, %65, !dbg !26 + %71 = fadd float %70, %66, !dbg !26 + %72 = bitcast float %71 to i32, !dbg !32 + %73 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %72, i32 16, i32 31), !dbg !32 + %74 = bitcast i32 %73 to float, !dbg !32 + %75 = fadd float %71, %74, !dbg !26 + %76 = bitcast float %75 to i32, !dbg !32 + %77 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %76, i32 8, i32 31), !dbg !32 + %78 = bitcast i32 %77 to float, !dbg !32 + %79 = fadd float %75, %78, !dbg !26 + %80 = bitcast float %79 to i32, !dbg !32 + %81 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %80, i32 4, i32 31), !dbg !32 + %82 = bitcast i32 %81 to float, !dbg !32 + %83 = fadd float %79, %82, !dbg !26 + %84 = bitcast float %83 to i32, !dbg !32 + %85 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %84, i32 2, i32 31), !dbg !32 + %86 = bitcast i32 %85 to float, !dbg !32 + %87 = fadd float %83, %86, !dbg !26 + %88 = bitcast float %87 to i32, !dbg !32 + %89 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %88, i32 1, i32 31), !dbg !32 + %90 = bitcast i32 %89 to float, !dbg !32 + %91 = fadd float %87, %90, !dbg !26 + %92 = icmp eq i32 %11, 0, !dbg !32 + %93 = zext nneg i32 %13 to i64, !dbg !32 + %94 = getelementptr float, ptr addrspace(3) @global_smem, i64 %93, !dbg !32 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %94, float %91, i1 %92) #6, !dbg !32 + tail call void @llvm.nvvm.barrier0(), !dbg !32 + %95 = icmp slt i32 %10, 2, !dbg !32 + %96 = sext i32 %10 to i64, !dbg !32 + %97 = getelementptr float, ptr addrspace(3) @global_smem, i64 %96, !dbg !32 + %98 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %97, i1 %95) #6, !dbg !32 + %99 = bitcast float %98 to i32, !dbg !32 + %100 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %99, i32 1, i32 31), !dbg !32 + %101 = bitcast i32 %100 to float, !dbg !32 + %102 = fadd float %98, %101, !dbg !26 + %103 = and i32 %10, 1, !dbg !32 + %104 = icmp eq i32 %103, 0, !dbg !32 + %105 = and i1 %95, %104, !dbg !32 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %97, float %102, i1 %105) #6, !dbg !32 + tail call void @llvm.nvvm.barrier0(), !dbg !32 + %106 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !32 + %107 = fadd float %106, 0.000000e+00, !dbg !34 + %108 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %107, float 2.560000e+02) #6, !dbg !38 + %109 = fsub float %67, %108, !dbg !39 + %110 = fsub float %68, %108, !dbg !39 + %111 = fsub float %65, %108, !dbg !39 + %112 = fsub float %66, %108, !dbg !39 + %113 = fmul float %109, %109, !dbg !40 + %114 = fmul float %110, %110, !dbg !40 + %115 = fmul float %111, %111, !dbg !40 + %116 = fmul float %112, %112, !dbg !40 + tail call void @llvm.nvvm.barrier0(), !dbg !41 + %117 = fadd float %113, %114, !dbg !43 + %118 = fadd float %115, %117, !dbg !43 + %119 = fadd float %116, %118, !dbg !43 + %120 = bitcast float %119 to i32, !dbg !41 + %121 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %120, i32 16, i32 31), !dbg !41 + %122 = bitcast i32 %121 to float, !dbg !41 + %123 = fadd float %119, %122, !dbg !43 + %124 = bitcast float %123 to i32, !dbg !41 + %125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 8, i32 31), !dbg !41 + %126 = bitcast i32 %125 to float, !dbg !41 + %127 = fadd float %123, %126, !dbg !43 + %128 = bitcast float %127 to i32, !dbg !41 + %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 4, i32 31), !dbg !41 + %130 = bitcast i32 %129 to float, !dbg !41 + %131 = fadd float %127, %130, !dbg !43 + %132 = bitcast float %131 to i32, !dbg !41 + %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 2, i32 31), !dbg !41 + %134 = bitcast i32 %133 to float, !dbg !41 + %135 = fadd float %131, %134, !dbg !43 + %136 = bitcast float %135 to i32, !dbg !41 + %137 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %136, i32 1, i32 31), !dbg !41 + %138 = bitcast i32 %137 to float, !dbg !41 + %139 = fadd float %135, %138, !dbg !43 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %94, float %139, i1 %92) #6, !dbg !41 + tail call void @llvm.nvvm.barrier0(), !dbg !41 + %140 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %97, i1 %95) #6, !dbg !41 + %141 = bitcast float %140 to i32, !dbg !41 + %142 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %141, i32 1, i32 31), !dbg !41 + %143 = bitcast i32 %142 to float, !dbg !41 + %144 = fadd float %140, %143, !dbg !43 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %97, float %144, i1 %105) #6, !dbg !41 + tail call void @llvm.nvvm.barrier0(), !dbg !41 + %145 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !41 + %146 = fadd float %145, 0.000000e+00, !dbg !46 + %147 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %146, float 2.560000e+02) #6, !dbg !48 + %148 = fadd float %147, 0x3EE4F8B580000000, !dbg !49 + %149 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6, !dbg !50 + %.not.i = icmp eq i32 %149, 0, !dbg !50 + br i1 %.not.i, label %152, label %150, !dbg !50 + +150: ; preds = %9 + %151 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %148), !dbg !50 + br label %__nv_rsqrtf.exit, !dbg !50 + +152: ; preds = %9 + %153 = tail call float @llvm.nvvm.rsqrt.approx.f(float %148), !dbg !50 + br label %__nv_rsqrtf.exit, !dbg !50 + +__nv_rsqrtf.exit: ; preds = %150, %152 + %.0.i = phi float [ %151, %150 ], [ %153, %152 ], !dbg !50 + %154 = extractvalue { i32, i32, i32, i32 } %53, 3, !dbg !23 + %155 = bitcast i32 %154 to float, !dbg !23 + %156 = extractvalue { i32, i32, i32, i32 } %53, 2, !dbg !23 + %157 = bitcast i32 %156 to float, !dbg !23 + %158 = extractvalue { i32, i32, i32, i32 } %53, 1, !dbg !23 + %159 = bitcast i32 %158 to float, !dbg !23 + %160 = extractvalue { i32, i32, i32, i32 } %53, 0, !dbg !23 + %161 = bitcast i32 %160 to float, !dbg !23 + %162 = fmul float %109, %.0.i, !dbg !51 + %163 = fmul float %110, %.0.i, !dbg !51 + %164 = fmul float %111, %.0.i, !dbg !51 + %165 = fmul float %112, %.0.i, !dbg !51 + %166 = fmul float %162, %161, !dbg !52 + %167 = fmul float %163, %159, !dbg !52 + %168 = fmul float %164, %157, !dbg !52 + %169 = fmul float %165, %155, !dbg !52 + tail call void @llvm.nvvm.barrier0(), !dbg !53 + %170 = sext i32 %15 to i64, !dbg !54 + %171 = getelementptr float, ptr addrspace(1) %0, i64 %170, !dbg !54 + %172 = icmp eq i32 %urem, 0, !dbg !55 + %173 = bitcast float %.0.i to i32, !dbg !55 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %173, ptr addrspace(1) %171, i1 %172) #6, !dbg !55 + %174 = getelementptr i16, ptr addrspace(1) %6, i64 %18, !dbg !56 + %175 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %166) #6, !dbg !57 + %176 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %167) #6, !dbg !57 + %177 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %168) #6, !dbg !57 + %178 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %169) #6, !dbg !57 + %179 = insertelement <2 x i16> undef, i16 %175, i64 0, !dbg !57 + %180 = insertelement <2 x i16> %179, i16 %176, i64 1, !dbg !57 + %181 = bitcast <2 x i16> %180 to i32, !dbg !57 + %182 = insertelement <2 x i16> undef, i16 %177, i64 0, !dbg !57 + %183 = insertelement <2 x i16> %182, i16 %178, i64 1, !dbg !57 + %184 = bitcast <2 x i16> %183 to i32, !dbg !57 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %181, i32 %184, ptr addrspace(1) %174, i1 true) #6, !dbg !57 + %185 = getelementptr float, ptr addrspace(1) %5, i64 %170, !dbg !58 + %186 = bitcast float %108 to i32, !dbg !59 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %186, ptr addrspace(1) %185, i1 %172) #6, !dbg !59 + ret void, !dbg !60 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +; Function Attrs: alwaysinline nounwind +define float @__nv_rsqrtf(float %x) local_unnamed_addr #3 { + %1 = tail call i32 @__nvvm_reflect(ptr nonnull @.str) #6 + %.not = icmp eq i32 %1, 0 + br i1 %.not, label %4, label %2 + +2: ; preds = %0 + %3 = tail call float @llvm.nvvm.rsqrt.approx.ftz.f(float %x) + br label %6 + +4: ; preds = %0 + %5 = tail call float @llvm.nvvm.rsqrt.approx.f(float %x) + br label %6 + +6: ; preds = %4, %2 + %.0 = phi float [ %3, %2 ], [ %5, %4 ] + ret float %.0 +} + +declare i32 @__nvvm_reflect(ptr) local_unnamed_addr #4 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.ftz.f(float) #5 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare float @llvm.nvvm.rsqrt.approx.f(float) #5 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { alwaysinline nounwind "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #6 = { nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} +!nvvm.annotations = !{!4, !5, !5, !4} +!llvm.ident = !{!6} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1} +!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!3 = !DIFile(filename: "cgyrkrvxykbeetcyfsjqxf2ni3kynf3x4qqckt4p2fyz7wetdsd2.py", directory: "/tmp/torchinductor_root/gy") +!4 = !{ptr @triton__0d1d2d3d4d5d6d7de8de, !"kernel", i32 1} +!5 = !{ptr @triton__0d1d2d3d4d5d6d7de8de, !"maxntidx", i32 64} +!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!7 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8de", linkageName: "triton__0d1d2d3d4d5d6d7de8de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2) +!8 = !DISubroutineType(cc: DW_CC_normal, types: !9) +!9 = !{} +!10 = !DILocation(line: 26, column: 26, scope: !7) +!11 = !DILocation(line: 23, column: 28, scope: !7) +!12 = !DILocation(line: 30, column: 40, scope: !7) +!13 = !DILocation(line: 30, column: 36, scope: !7) +!14 = !DILocation(line: 30, column: 30, scope: !7) +!15 = !DILocation(line: 30, column: 46, scope: !7) +!16 = !DILocation(line: 31, column: 30, scope: !7) +!17 = !DILocation(line: 31, column: 46, scope: !7) +!18 = !DILocation(line: 31, column: 67, scope: !7) +!19 = !DILocation(line: 32, column: 30, scope: !7) +!20 = !DILocation(line: 32, column: 46, scope: !7) +!21 = !DILocation(line: 32, column: 67, scope: !7) +!22 = !DILocation(line: 33, column: 31, scope: !7) +!23 = !DILocation(line: 33, column: 36, scope: !7) +!24 = !DILocation(line: 35, column: 18, scope: !7) +!25 = !DILocation(line: 37, column: 18, scope: !7) +!26 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !30) +!27 = distinct !DILexicalBlockFile(scope: !29, file: !28, discriminator: 0) +!28 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!29 = distinct !DILexicalBlockFile(scope: !7, file: !28, discriminator: 0) +!30 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !31) +!31 = !DILocation(line: 42, column: 59, scope: !27) +!32 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !33) +!33 = !DILocation(line: 42, column: 59, scope: !29) +!34 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !37) +!35 = distinct !DILexicalBlockFile(scope: !7, file: !36, discriminator: 0) +!36 = !DIFile(filename: "triton_helpers.py", directory: "/usr/local/lib/python3.10/dist-packages/torch/_inductor") +!37 = !DILocation(line: 42, column: 45, scope: !35) +!38 = !DILocation(line: 45, column: 20, scope: !7) +!39 = !DILocation(line: 46, column: 19, scope: !7) +!40 = !DILocation(line: 47, column: 20, scope: !7) +!41 = !DILocation(line: 243, column: 36, scope: !29, inlinedAt: !42) +!42 = !DILocation(line: 50, column: 59, scope: !29) +!43 = !DILocation(line: 233, column: 15, scope: !27, inlinedAt: !44) +!44 = !DILocation(line: 243, column: 36, scope: !27, inlinedAt: !45) +!45 = !DILocation(line: 50, column: 59, scope: !27) +!46 = !DILocation(line: 8, column: 15, scope: !35, inlinedAt: !47) +!47 = !DILocation(line: 50, column: 45, scope: !35) +!48 = !DILocation(line: 52, column: 20, scope: !7) +!49 = !DILocation(line: 54, column: 20, scope: !7) +!50 = !DILocation(line: 55, column: 26, scope: !7) +!51 = !DILocation(line: 57, column: 20, scope: !7) +!52 = !DILocation(line: 58, column: 20, scope: !7) +!53 = !DILocation(line: 60, column: 4, scope: !7) +!54 = !DILocation(line: 61, column: 28, scope: !7) +!55 = !DILocation(line: 61, column: 40, scope: !7) +!56 = !DILocation(line: 62, column: 25, scope: !7) +!57 = !DILocation(line: 62, column: 48, scope: !7) +!58 = !DILocation(line: 63, column: 25, scope: !7) +!59 = !DILocation(line: 63, column: 37, scope: !7) +!60 = !DILocation(line: 63, column: 4, scope: !7) diff --git a/.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.cubin b/.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..c0b5a6b628109108f94833fc9e53ba889d8411e2 Binary files /dev/null and b/.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.cubin differ diff --git a/.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ttir b/.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..59013b5d7b57f0541e999a8da0ba77fe925af884 --- /dev/null +++ b/.triton/dump/ab89f85e55d5ddd9676325b49df9419f/triton_.ttir @@ -0,0 +1,66 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6de7de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 2.560000e+02 : f32 + %cst_2 = arith.constant 9.99999974E-6 : f32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> + %cst_4 = arith.constant dense<256> : tensor<256xi32> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32> + %5 = arith.addi %1, %4 : tensor<256xi32> + %6 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %9 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32> + %13 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr> + %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr>, tensor<256xi32> + %15 = tt.load %14, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> + %16 = arith.addf %8, %12 : tensor<256xf32> + %17 = arith.select %2, %16, %cst_3 : tensor<256xi1>, tensor<256xf32> + %18 = "tt.reduce"(%17) <{axis = 0 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32): + %42 = arith.addf %arg8, %arg9 : f32 + tt.reduce.return %42 : f32 + }) : (tensor<256xf32>) -> f32 + %19 = arith.addf %18, %cst_0 : f32 + %20 = arith.divf %19, %cst_1 : f32 + %21 = tt.splat %20 : (f32) -> tensor<1xf32> + %22 = tt.splat %20 : (f32) -> tensor<256xf32> + %23 = arith.subf %16, %22 : tensor<256xf32> + %24 = arith.mulf %23, %23 : tensor<256xf32> + %25 = arith.select %2, %24, %cst_3 : tensor<256xi1>, tensor<256xf32> + %26 = "tt.reduce"(%25) <{axis = 0 : i32}> ({ + ^bb0(%arg8: f32, %arg9: f32): + %42 = arith.addf %arg8, %arg9 : f32 + tt.reduce.return %42 : f32 + }) : (tensor<256xf32>) -> f32 + %27 = arith.addf %26, %cst_0 : f32 + %28 = arith.divf %27, %cst_1 : f32 + %29 = arith.addf %28, %cst_2 : f32 + %30 = tt.extern_elementwise %29 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %31 = tt.splat %30 : (f32) -> tensor<1xf32> + %32 = tt.splat %30 : (f32) -> tensor<256xf32> + %33 = arith.mulf %23, %32 : tensor<256xf32> + %34 = arith.mulf %33, %15 : tensor<256xf32> + gpu.barrier + %35 = tt.addptr %arg0, %0 : !tt.ptr, i32 + %36 = tt.splat %35 : (!tt.ptr) -> tensor<1x!tt.ptr> + tt.store %36, %31 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32> + %37 = tt.splat %arg5 : (!tt.ptr) -> tensor<256x!tt.ptr> + %38 = tt.addptr %37, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %39 = arith.truncf %34 : tensor<256xf32> to tensor<256xbf16> + tt.store %38, %39, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16> + %40 = tt.addptr %arg4, %0 : !tt.ptr, i32 + %41 = tt.splat %40 : (!tt.ptr) -> tensor<1x!tt.ptr> + tt.store %41, %21 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32> + tt.return + } +} diff --git a/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.llir b/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..9af59ae363c94ccbf7b9542aad4bb488c0edacf5 --- /dev/null +++ b/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.llir @@ -0,0 +1,54 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 { + %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %5 = shl i32 %4, 1, !dbg !8 + %6 = and i32 %5, 510, !dbg !8 + %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9 + %8 = shl i32 %7, 9, !dbg !10 + %9 = or i32 %8, %6, !dbg !11 + %10 = sext i32 %9 to i64, !dbg !12 + %11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !12 + %12 = tail call { i32, i32 } asm sideeffect "mov.u32 $0, 0x0;\0A\09mov.u32 $1, 0x0;\0A\09@$3 ld.global.v2.b32 { $0, $1 }, [ $2 + 0 ];", "=r,=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13 + %13 = extractvalue { i32, i32 } %12, 0, !dbg !13 + %14 = extractvalue { i32, i32 } %12, 1, !dbg !13 + %15 = bitcast i32 %13 to float, !dbg !13 + %16 = bitcast i32 %14 to float, !dbg !13 + %17 = getelementptr i16, ptr addrspace(1) %1, i64 %10, !dbg !14 + %18 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %15) #1, !dbg !15 + %19 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %16) #1, !dbg !15 + %20 = insertelement <2 x i16> undef, i16 %18, i64 0, !dbg !15 + %21 = insertelement <2 x i16> %20, i16 %19, i64 1, !dbg !15 + %22 = bitcast <2 x i16> %21 to i32, !dbg !15 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %22, ptr addrspace(1) %17, i1 true) #1, !dbg !15 + ret void, !dbg !16 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "cch6kzmgbnoxqjgy3okxqs7sy2uz27atdhc4lkuwz5ajinexdurx.py", directory: "/tmp/torchinductor_root/ch") +!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256} +!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 36, scope: !5) +!9 = !DILocation(line: 20, column: 28, scope: !5) +!10 = !DILocation(line: 20, column: 33, scope: !5) +!11 = !DILocation(line: 21, column: 23, scope: !5) +!12 = !DILocation(line: 24, column: 30, scope: !5) +!13 = !DILocation(line: 24, column: 35, scope: !5) +!14 = !DILocation(line: 26, column: 25, scope: !5) +!15 = !DILocation(line: 26, column: 36, scope: !5) +!16 = !DILocation(line: 26, column: 4, scope: !5) diff --git a/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.ttir b/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..de8871dcaba54cb7d1b997f9009d0eeab54d091f --- /dev/null +++ b/.triton/dump/b9f6ef1f5ddf337922c3695aabb2c1ac/triton_.ttir @@ -0,0 +1,18 @@ +module { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> + %3 = tt.splat %1 : (i32) -> tensor<512xi32> + %4 = arith.addi %3, %2 : tensor<512xi32> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr> + %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr>, tensor<512xi32> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xf32> + %8 = tt.splat %arg1 : (!tt.ptr) -> tensor<512x!tt.ptr> + %9 = tt.addptr %8, %4 : tensor<512x!tt.ptr>, tensor<512xi32> + %10 = arith.truncf %7 : tensor<512xf32> to tensor<512xbf16> + tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16> + tt.return + } +} diff --git a/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.cubin b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..17a7eaafdd465c58e427fe6c4644bd6ead9b6bef Binary files /dev/null and b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.cubin differ diff --git a/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ptx b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..12dbd4fd575fd1b129e5c7ce0c0047cbd528298c --- /dev/null +++ b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ptx @@ -0,0 +1,301 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2de + +.visible .entry triton__0d1d2de( + .param .u64 triton__0d1d2de_param_0, + .param .u64 triton__0d1d2de_param_1, + .param .u32 triton__0d1d2de_param_2 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<3>; + .reg .b16 %rs<5>; + .reg .b32 %r<17>; + .reg .b64 %rd<7>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd3, [triton__0d1d2de_param_0]; + ld.param.u64 %rd4, [triton__0d1d2de_param_1]; +$L__tmp0: + .loc 1 21 36 + mov.u32 %r12, %tid.x; + shl.b32 %r13, %r12, 2; + and.b32 %r14, %r13, 508; + .loc 1 20 28 + mov.u32 %r1, %ctaid.x; + .loc 1 20 33 + shl.b32 %r15, %r1, 9; + .loc 1 21 23 + or.b32 %r16, %r15, %r14; + .loc 1 24 30 + mul.wide.s32 %rd5, %r16, 2; + add.s64 %rd1, %rd3, %rd5; + mov.pred %p1, -1; + .loc 1 24 35 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + @%p1 ld.global.v2.b32 { %r2, %r3 }, [ %rd1 + 0 ]; + cvt.u16.u32 %rs1, %r2; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; } + cvt.u16.u32 %rs3, %r3; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; } + .loc 1 24 44 + cvt.f32.bf16 %r8, %rs1; + cvt.f32.bf16 %r9, %rs2; + cvt.f32.bf16 %r10, %rs3; + cvt.f32.bf16 %r11, %rs4; + .loc 1 26 25 + mul.wide.s32 %rd6, %r16, 4; + add.s64 %rd2, %rd4, %rd6; + .loc 1 26 36 + @%p1 st.global.v4.b32 [ %rd2 + 0 ], { %r8, %r9, %r10, %r11 }; + .loc 1 26 4 + ret; +$L__tmp1: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/zl/czl6nmwasl7k4ic55xowihczcooh3mhu5v6ls6w2xzqqocdc2da7.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 176 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 122 +.b8 108 +.b8 54 +.b8 110 +.b8 109 +.b8 119 +.b8 97 +.b8 115 +.b8 108 +.b8 55 +.b8 107 +.b8 52 +.b8 105 +.b8 99 +.b8 53 +.b8 53 +.b8 120 +.b8 111 +.b8 119 +.b8 105 +.b8 104 +.b8 99 +.b8 122 +.b8 99 +.b8 111 +.b8 111 +.b8 104 +.b8 51 +.b8 109 +.b8 104 +.b8 117 +.b8 53 +.b8 118 +.b8 54 +.b8 108 +.b8 115 +.b8 54 +.b8 119 +.b8 50 +.b8 120 +.b8 122 +.b8 113 +.b8 113 +.b8 111 +.b8 99 +.b8 100 +.b8 99 +.b8 50 +.b8 100 +.b8 97 +.b8 55 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 122 +.b8 108 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 180 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttgir b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..cdc64a19be9d4816eddc0cdbad88d21c48b0d233 --- /dev/null +++ b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttgir @@ -0,0 +1,19 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> + %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<512xi32, #blocked> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr, #blocked> + %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked> + %8 = arith.extf %7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<512x!tt.ptr, #blocked> + %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> + tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttir b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..50602f214a3439fdc49ae67bf8b80948941d601b --- /dev/null +++ b/.triton/dump/bd83577d36184f8e720cea5389ce1557/triton_.ttir @@ -0,0 +1,18 @@ +module { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> + %3 = tt.splat %1 : (i32) -> tensor<512xi32> + %4 = arith.addi %3, %2 : tensor<512xi32> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr> + %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr>, tensor<512xi32> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16> + %8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<512x!tt.ptr> + %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr>, tensor<512xi32> + tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32> + tt.return + } +} diff --git a/.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ptx b/.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..0f8c7f7e766e7ca52b6681a2cdb04d9a9ca281d1 --- /dev/null +++ b/.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ptx @@ -0,0 +1,837 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6d7d8d9d10de11de +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5d6d7d8d9d10de11de( + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9, + .param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_10, + .param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_11 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<35>; + .reg .b16 %rs<17>; + .reg .b32 %r<111>; + .reg .f32 %f<94>; + .reg .b64 %rd<25>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0]; + ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r78, %tid.x; + and.b32 %r79, %r78, 31; + ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2]; + ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3]; + ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4]; + and.b32 %r80, %r78, 63; + ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5]; + shl.b32 %r81, %r80, 2; + ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6]; + ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7]; + .loc 1 23 28 + mov.u32 %r1, %ctaid.x; + .loc 1 30 40 + shl.b32 %r82, %r1, 8; + ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8]; + .loc 1 30 36 + or.b32 %r83, %r82, %r81; + ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9]; + .loc 1 30 30 + mul.wide.s32 %rd21, %r83, 4; + add.s64 %rd1, %rd12, %rd21; + mov.b32 %r6, 0; + mov.pred %p1, -1; + .loc 1 30 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + mov.b32 %f1, %r2; + mov.b32 %f2, %r3; + mov.b32 %f3, %r4; + mov.b32 %f4, %r5; + .loc 1 31 30 + mul.wide.s32 %rd22, %r83, 2; + add.s64 %rd2, %rd13, %rd22; + .loc 1 31 46 + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r6; + @!%p1 mov.u32 %r11, %r6; + cvt.u16.u32 %rs1, %r10; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; } + cvt.u16.u32 %rs3, %r11; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; } + .loc 1 31 67 + cvt.f32.bf16 %r14, %rs1; + mov.b32 %f5, %r14; + cvt.f32.bf16 %r15, %rs2; + mov.b32 %f6, %r15; + cvt.f32.bf16 %r16, %rs3; + mov.b32 %f7, %r16; + cvt.f32.bf16 %r17, %rs4; + mov.b32 %f8, %r17; + .loc 1 32 30 + add.s64 %rd3, %rd14, %rd22; + .loc 1 32 46 + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + @%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r6; + @!%p1 mov.u32 %r19, %r6; + cvt.u16.u32 %rs5, %r18; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; } + cvt.u16.u32 %rs7, %r19; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; } + .loc 1 32 67 + cvt.f32.bf16 %r22, %rs5; + mov.b32 %f9, %r22; + cvt.f32.bf16 %r23, %rs6; + mov.b32 %f10, %r23; + cvt.f32.bf16 %r24, %rs7; + mov.b32 %f11, %r24; + cvt.f32.bf16 %r25, %rs8; + mov.b32 %f12, %r25; + .loc 1 33 30 + add.s64 %rd4, %rd15, %rd22; + .loc 1 33 46 + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + @%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ]; + @!%p1 mov.u32 %r26, %r6; + @!%p1 mov.u32 %r27, %r6; + cvt.u16.u32 %rs9, %r26; + { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; } + cvt.u16.u32 %rs11, %r27; + { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; } + .loc 1 33 67 + cvt.f32.bf16 %r30, %rs9; + mov.b32 %f13, %r30; + cvt.f32.bf16 %r31, %rs10; + mov.b32 %f14, %r31; + cvt.f32.bf16 %r32, %rs11; + mov.b32 %f15, %r32; + cvt.f32.bf16 %r33, %rs12; + mov.b32 %f16, %r33; + .loc 1 34 31 + add.s64 %rd5, %rd16, %rd22; + .loc 1 34 47 + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + @%p1 ld.global.v2.b32 { %r34, %r35 }, [ %rd5 + 0 ]; + @!%p1 mov.u32 %r34, %r6; + @!%p1 mov.u32 %r35, %r6; + cvt.u16.u32 %rs13, %r34; + { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r34; } + cvt.u16.u32 %rs15, %r35; + { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r35; } + .loc 1 34 68 + cvt.f32.bf16 %r38, %rs13; + mov.b32 %f17, %r38; + cvt.f32.bf16 %r39, %rs14; + mov.b32 %f18, %r39; + cvt.f32.bf16 %r40, %rs15; + mov.b32 %f19, %r40; + cvt.f32.bf16 %r41, %rs16; + mov.b32 %f20, %r41; + .loc 1 35 31 + mul.wide.u32 %rd23, %r81, 4; + add.s64 %rd6, %rd17, %rd23; + .loc 1 35 36 + mov.u32 %r42, 0x0; + mov.u32 %r43, 0x0; + mov.u32 %r44, 0x0; + mov.u32 %r45, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd6 + 0 ]; + @!%p1 mov.u32 %r42, %r6; + @!%p1 mov.u32 %r43, %r6; + @!%p1 mov.u32 %r44, %r6; + @!%p1 mov.u32 %r45, %r6; + .loc 1 37 18 + add.f32 %f21, %f5, %f1; + add.f32 %f22, %f6, %f2; + add.f32 %f23, %f7, %f3; + .loc 1 39 18 + add.f32 %f24, %f21, %f9; + add.f32 %f25, %f22, %f10; + add.f32 %f26, %f23, %f11; + .loc 1 41 18 + add.f32 %f27, %f25, %f14; + add.f32 %f28, %f26, %f15; + .loc 1 43 19 + add.f32 %f29, %f27, %f18; + add.f32 %f30, %f28, %f19; + .loc 1 41 18 + add.f32 %f31, %f24, %f13; + add.f32 %f32, %f8, %f4; + .loc 1 43 19 + add.f32 %f33, %f32, %f12; + add.f32 %f34, %f31, %f17; +$L__tmp1: + .loc 2 233 15 + add.f32 %f35, %f34, %f29; + add.f32 %f36, %f33, %f16; + add.f32 %f37, %f35, %f30; + add.f32 %f38, %f36, %f20; + mov.b32 %r71, %f38; + add.f32 %f39, %f37, %f38; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r84, %f39; + shfl.sync.bfly.b32 %r85, %r84, 16, 31, -1; + mov.b32 %f40, %r85; +$L__tmp3: + .loc 2 233 15 + add.f32 %f41, %f39, %f40; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r86, %f41; + shfl.sync.bfly.b32 %r87, %r86, 8, 31, -1; + mov.b32 %f42, %r87; +$L__tmp5: + .loc 2 233 15 + add.f32 %f43, %f41, %f42; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r88, %f43; + shfl.sync.bfly.b32 %r89, %r88, 4, 31, -1; + mov.b32 %f44, %r89; +$L__tmp7: + .loc 2 233 15 + add.f32 %f45, %f43, %f44; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r90, %f45; + shfl.sync.bfly.b32 %r91, %r90, 2, 31, -1; + mov.b32 %f46, %r91; +$L__tmp9: + .loc 2 233 15 + add.f32 %f47, %f45, %f46; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r92, %f47; + shfl.sync.bfly.b32 %r93, %r92, 1, 31, -1; + mov.b32 %f48, %r93; +$L__tmp11: + .loc 2 233 15 + add.f32 %f49, %f47, %f48; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p23, %r79, 0; + shr.u32 %r94, %r78, 3; + and.b32 %r95, %r94, 4; + mov.u32 %r96, global_smem; + add.s32 %r50, %r96, %r95; + mov.b32 %r51, %f49; + @%p23 st.shared.b32 [ %r50 + 0 ], %r51; + bar.sync 0; + setp.lt.s32 %p24, %r78, 2; + shl.b32 %r97, %r78, 2; + add.s32 %r53, %r96, %r97; + @%p24 ld.shared.b32 %r52, [ %r53 + 0 ]; + mov.b32 %f50, %r52; + shfl.sync.bfly.b32 %r98, %r52, 1, 31, -1; + mov.b32 %f51, %r98; +$L__tmp13: + .loc 2 233 15 + add.f32 %f52, %f50, %f51; +$L__tmp14: + .loc 2 243 36 + and.b32 %r99, %r78, 1; + setp.eq.b32 %p33, %r99, 1; + not.pred %p34, %p33; + and.pred %p25, %p24, %p34; + mov.b32 %r55, %f52; + @%p25 st.shared.b32 [ %r53 + 0 ], %r55; + bar.sync 0; + ld.shared.f32 %f53, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f54, %f53, 0f00000000; +$L__tmp16: + .loc 1 51 20 + mov.b32 %r57, %f54; + mov.b32 %r58, 1132462080; + div.full.f32 %r77, %r57, %r58; + mov.b32 %f55, %r77; + .loc 1 52 20 + sub.f32 %f56, %f34, %f55; + sub.f32 %f57, %f29, %f55; + sub.f32 %f58, %f30, %f55; + sub.f32 %f59, %f38, %f55; + .loc 1 53 20 + mul.f32 %f60, %f57, %f57; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f61, %f56, %f56, %f60; + fma.rn.f32 %f62, %f58, %f58, %f61; + fma.rn.f32 %f63, %f59, %f59, %f62; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r100, %f63; + shfl.sync.bfly.b32 %r101, %r100, 16, 31, -1; + mov.b32 %f64, %r101; +$L__tmp20: + .loc 2 233 15 + add.f32 %f65, %f63, %f64; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r102, %f65; + shfl.sync.bfly.b32 %r103, %r102, 8, 31, -1; + mov.b32 %f66, %r103; +$L__tmp22: + .loc 2 233 15 + add.f32 %f67, %f65, %f66; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r104, %f67; + shfl.sync.bfly.b32 %r105, %r104, 4, 31, -1; + mov.b32 %f68, %r105; +$L__tmp24: + .loc 2 233 15 + add.f32 %f69, %f67, %f68; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r106, %f69; + shfl.sync.bfly.b32 %r107, %r106, 2, 31, -1; + mov.b32 %f70, %r107; +$L__tmp26: + .loc 2 233 15 + add.f32 %f71, %f69, %f70; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r108, %f71; + shfl.sync.bfly.b32 %r109, %r108, 1, 31, -1; + mov.b32 %f72, %r109; +$L__tmp28: + .loc 2 233 15 + add.f32 %f73, %f71, %f72; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r60, %f73; + @%p23 st.shared.b32 [ %r50 + 0 ], %r60; + bar.sync 0; + @%p24 ld.shared.b32 %r61, [ %r53 + 0 ]; + mov.b32 %f74, %r61; + shfl.sync.bfly.b32 %r110, %r61, 1, 31, -1; + mov.b32 %f75, %r110; +$L__tmp30: + .loc 2 233 15 + add.f32 %f76, %f74, %f75; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r64, %f76; + @%p25 st.shared.b32 [ %r53 + 0 ], %r64; + bar.sync 0; + ld.shared.f32 %f77, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f78, %f77, 0f00000000; +$L__tmp33: + .loc 1 58 20 + mov.b32 %r66, %f78; + div.full.f32 %r65, %r66, %r58; + mov.b32 %f79, %r65; + .loc 1 60 20 + add.f32 %f80, %f79, 0f3727C5AC; + .loc 1 61 26 + rsqrt.approx.ftz.f32 %f81, %f80; + .loc 1 35 36 + mov.b32 %f82, %r45; + mov.b32 %f83, %r44; + mov.b32 %f84, %r43; + mov.b32 %f85, %r42; + .loc 1 63 20 + mul.f32 %f86, %f56, %f81; + mul.f32 %f87, %f57, %f81; + mul.f32 %f88, %f58, %f81; + mul.f32 %f89, %f59, %f81; + .loc 1 64 20 + mul.f32 %f90, %f86, %f85; + mul.f32 %f91, %f87, %f84; + mul.f32 %f92, %f88, %f83; + mul.f32 %f93, %f89, %f82; + .loc 1 65 25 + add.s64 %rd7, %rd18, %rd21; + .loc 1 65 48 + mov.b32 %r68, %f34; + mov.b32 %r69, %f29; + mov.b32 %r70, %f30; + @%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r68, %r69, %r70, %r71 }; + .loc 1 66 4 + bar.sync 0; + .loc 1 67 28 + mul.wide.s32 %rd24, %r1, 4; + add.s64 %rd8, %rd11, %rd24; + .loc 1 67 40 + setp.eq.s32 %p30, %r80, 0; + mov.b32 %r72, %f81; + @%p30 st.global.b32 [ %rd8 + 0 ], { %r72 }; + .loc 1 68 25 + add.s64 %rd9, %rd20, %rd21; + .loc 1 68 48 + mov.b32 %r73, %f90; + mov.b32 %r74, %f91; + mov.b32 %r75, %f92; + mov.b32 %r76, %f93; + @%p1 st.global.v4.b32 [ %rd9 + 0 ], { %r73, %r74, %r75, %r76 }; + .loc 1 69 25 + add.s64 %rd10, %rd19, %rd24; + .loc 1 69 37 + @%p30 st.global.b32 [ %rd10 + 0 ], { %r77 }; + .loc 1 69 4 + ret; +$L__tmp34: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/sf/csf6zcjhrl2sjepofkaaj2rwyu4vq322pi5ukcu37oynjbso2i4g.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 419 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 115 +.b8 102 +.b8 54 +.b8 122 +.b8 99 +.b8 106 +.b8 104 +.b8 114 +.b8 108 +.b8 50 +.b8 115 +.b8 106 +.b8 101 +.b8 112 +.b8 111 +.b8 102 +.b8 107 +.b8 97 +.b8 97 +.b8 106 +.b8 50 +.b8 114 +.b8 119 +.b8 121 +.b8 117 +.b8 52 +.b8 118 +.b8 113 +.b8 51 +.b8 50 +.b8 50 +.b8 112 +.b8 105 +.b8 53 +.b8 117 +.b8 107 +.b8 99 +.b8 117 +.b8 51 +.b8 55 +.b8 111 +.b8 121 +.b8 110 +.b8 106 +.b8 98 +.b8 115 +.b8 111 +.b8 50 +.b8 105 +.b8 52 +.b8 103 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 115 +.b8 102 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 57 +.b8 100 +.b8 49 +.b8 48 +.b8 100 +.b8 101 +.b8 49 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 57 +.b8 100 +.b8 49 +.b8 48 +.b8 100 +.b8 101 +.b8 49 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 48 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 48 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 48 +.b8 45 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 56 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 56 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 56 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 423 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 57 +.b8 100 +.b8 49 +.b8 48 +.b8 100 +.b8 101 +.b8 49 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 423 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ttir b/.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..5c9eab337c61b54065c360c79eaf977b31c29c0c --- /dev/null +++ b/.triton/dump/be28ee3793470d9803546f03e49c3edf/triton_.ttir @@ -0,0 +1,83 @@ +module { + tt.func public @triton__0d1d2d3d4d5d6d7d8d9d10de11de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: !tt.ptr {tt.divisibility = 16 : i32}, %arg9: !tt.ptr {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c256_i32 = arith.constant 256 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<256xbf16> + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 2.560000e+02 : f32 + %cst_2 = arith.constant 9.99999974E-6 : f32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<256xf32> + %cst_4 = arith.constant dense<256> : tensor<256xi32> + %0 = tt.get_program_id x : i32 + %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32> + %2 = arith.cmpi slt, %1, %cst_4 : tensor<256xi32> + %3 = arith.muli %0, %c256_i32 : i32 + %4 = tt.splat %3 : (i32) -> tensor<256xi32> + %5 = arith.addi %1, %4 : tensor<256xi32> + %6 = tt.splat %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr> + %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %8 = tt.load %7, %2, %cst_3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32> + %9 = tt.splat %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr> + %10 = tt.addptr %9, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %11 = tt.load %10, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %12 = arith.extf %11 : tensor<256xbf16> to tensor<256xf32> + %13 = tt.splat %arg3 : (!tt.ptr) -> tensor<256x!tt.ptr> + %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %15 = tt.load %14, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %16 = arith.extf %15 : tensor<256xbf16> to tensor<256xf32> + %17 = tt.splat %arg4 : (!tt.ptr) -> tensor<256x!tt.ptr> + %18 = tt.addptr %17, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %19 = tt.load %18, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %20 = arith.extf %19 : tensor<256xbf16> to tensor<256xf32> + %21 = tt.splat %arg5 : (!tt.ptr) -> tensor<256x!tt.ptr> + %22 = tt.addptr %21, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + %23 = tt.load %22, %2, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16> + %24 = arith.extf %23 : tensor<256xbf16> to tensor<256xf32> + %25 = tt.splat %arg6 : (!tt.ptr) -> tensor<256x!tt.ptr> + %26 = tt.addptr %25, %1 : tensor<256x!tt.ptr>, tensor<256xi32> + %27 = tt.load %26, %2, %cst_3 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32> + %28 = arith.addf %8, %12 : tensor<256xf32> + %29 = arith.addf %28, %16 : tensor<256xf32> + %30 = arith.addf %29, %20 : tensor<256xf32> + %31 = arith.addf %30, %24 : tensor<256xf32> + %32 = arith.select %2, %31, %cst_3 : tensor<256xi1>, tensor<256xf32> + %33 = "tt.reduce"(%32) <{axis = 0 : i32}> ({ + ^bb0(%arg12: f32, %arg13: f32): + %58 = arith.addf %arg12, %arg13 : f32 + tt.reduce.return %58 : f32 + }) : (tensor<256xf32>) -> f32 + %34 = arith.addf %33, %cst_0 : f32 + %35 = arith.divf %34, %cst_1 : f32 + %36 = tt.splat %35 : (f32) -> tensor<1xf32> + %37 = tt.splat %35 : (f32) -> tensor<256xf32> + %38 = arith.subf %31, %37 : tensor<256xf32> + %39 = arith.mulf %38, %38 : tensor<256xf32> + %40 = arith.select %2, %39, %cst_3 : tensor<256xi1>, tensor<256xf32> + %41 = "tt.reduce"(%40) <{axis = 0 : i32}> ({ + ^bb0(%arg12: f32, %arg13: f32): + %58 = arith.addf %arg12, %arg13 : f32 + tt.reduce.return %58 : f32 + }) : (tensor<256xf32>) -> f32 + %42 = arith.addf %41, %cst_0 : f32 + %43 = arith.divf %42, %cst_1 : f32 + %44 = arith.addf %43, %cst_2 : f32 + %45 = tt.extern_elementwise %44 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32 + %46 = tt.splat %45 : (f32) -> tensor<1xf32> + %47 = tt.splat %45 : (f32) -> tensor<256xf32> + %48 = arith.mulf %38, %47 : tensor<256xf32> + %49 = arith.mulf %48, %27 : tensor<256xf32> + %50 = tt.splat %arg7 : (!tt.ptr) -> tensor<256x!tt.ptr> + %51 = tt.addptr %50, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + tt.store %51, %31, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + gpu.barrier + %52 = tt.addptr %arg0, %0 : !tt.ptr, i32 + %53 = tt.splat %52 : (!tt.ptr) -> tensor<1x!tt.ptr> + tt.store %53, %46 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32> + %54 = tt.splat %arg9 : (!tt.ptr) -> tensor<256x!tt.ptr> + %55 = tt.addptr %54, %5 : tensor<256x!tt.ptr>, tensor<256xi32> + tt.store %55, %49, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32> + %56 = tt.addptr %arg8, %0 : !tt.ptr, i32 + %57 = tt.splat %56 : (!tt.ptr) -> tensor<1x!tt.ptr> + tt.store %57, %36 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32> + tt.return + } +} diff --git a/.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ptx b/.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..cbf8d8cb9dddc86aa2a767755bada6649eb8114a --- /dev/null +++ b/.triton/dump/c0c34db8f5ff22b79fc971c63187477a/triton_.ptx @@ -0,0 +1,843 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6d7d8d9d10de11de +.extern .shared .align 1 .b8 global_smem[]; +.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0}; + +.visible .entry triton__0d1d2d3d4d5d6d7d8d9d10de11de( + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9, + .param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_10, + .param .u32 triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_11 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<35>; + .reg .b16 %rs<21>; + .reg .b32 %r<115>; + .reg .f32 %f<94>; + .reg .b64 %rd<25>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd11, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_0]; + ld.param.u64 %rd12, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_1]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r80, %tid.x; + and.b32 %r81, %r80, 31; + ld.param.u64 %rd13, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_2]; + ld.param.u64 %rd14, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_3]; + ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_4]; + and.b32 %r82, %r80, 63; + ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_5]; + shl.b32 %r83, %r82, 2; + ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_6]; + ld.param.u64 %rd18, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_7]; + .loc 1 23 28 + mov.u32 %r1, %ctaid.x; + .loc 1 30 40 + shl.b32 %r84, %r1, 8; + ld.param.u64 %rd19, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_8]; + .loc 1 30 36 + or.b32 %r85, %r84, %r83; + ld.param.u64 %rd20, [triton__0d1d2d3d4d5d6d7d8d9d10de11de_param_9]; + .loc 1 30 30 + mul.wide.s32 %rd21, %r85, 4; + add.s64 %rd1, %rd12, %rd21; + mov.b32 %r6, 0; + mov.pred %p1, -1; + .loc 1 30 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + mov.b32 %f1, %r2; + mov.b32 %f2, %r3; + mov.b32 %f3, %r4; + mov.b32 %f4, %r5; + .loc 1 31 30 + mul.wide.s32 %rd22, %r85, 2; + add.s64 %rd2, %rd13, %rd22; + .loc 1 31 46 + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r6; + @!%p1 mov.u32 %r11, %r6; + cvt.u16.u32 %rs1, %r10; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; } + cvt.u16.u32 %rs3, %r11; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; } + .loc 1 31 67 + cvt.f32.bf16 %r14, %rs1; + mov.b32 %f5, %r14; + cvt.f32.bf16 %r15, %rs2; + mov.b32 %f6, %r15; + cvt.f32.bf16 %r16, %rs3; + mov.b32 %f7, %r16; + cvt.f32.bf16 %r17, %rs4; + mov.b32 %f8, %r17; + .loc 1 32 30 + add.s64 %rd3, %rd14, %rd22; + .loc 1 32 46 + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + @%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r6; + @!%p1 mov.u32 %r19, %r6; + cvt.u16.u32 %rs5, %r18; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; } + cvt.u16.u32 %rs7, %r19; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; } + .loc 1 32 67 + cvt.f32.bf16 %r22, %rs5; + mov.b32 %f9, %r22; + cvt.f32.bf16 %r23, %rs6; + mov.b32 %f10, %r23; + cvt.f32.bf16 %r24, %rs7; + mov.b32 %f11, %r24; + cvt.f32.bf16 %r25, %rs8; + mov.b32 %f12, %r25; + .loc 1 33 30 + add.s64 %rd4, %rd15, %rd22; + .loc 1 33 46 + mov.u32 %r26, 0x0; + mov.u32 %r27, 0x0; + @%p1 ld.global.v2.b32 { %r26, %r27 }, [ %rd4 + 0 ]; + @!%p1 mov.u32 %r26, %r6; + @!%p1 mov.u32 %r27, %r6; + cvt.u16.u32 %rs9, %r26; + { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r26; } + cvt.u16.u32 %rs11, %r27; + { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r27; } + .loc 1 33 67 + cvt.f32.bf16 %r30, %rs9; + mov.b32 %f13, %r30; + cvt.f32.bf16 %r31, %rs10; + mov.b32 %f14, %r31; + cvt.f32.bf16 %r32, %rs11; + mov.b32 %f15, %r32; + cvt.f32.bf16 %r33, %rs12; + mov.b32 %f16, %r33; + .loc 1 34 31 + add.s64 %rd5, %rd16, %rd22; + .loc 1 34 47 + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + @%p1 ld.global.v2.b32 { %r34, %r35 }, [ %rd5 + 0 ]; + @!%p1 mov.u32 %r34, %r6; + @!%p1 mov.u32 %r35, %r6; + cvt.u16.u32 %rs13, %r34; + { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r34; } + cvt.u16.u32 %rs15, %r35; + { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r35; } + .loc 1 34 68 + cvt.f32.bf16 %r38, %rs13; + mov.b32 %f17, %r38; + cvt.f32.bf16 %r39, %rs14; + mov.b32 %f18, %r39; + cvt.f32.bf16 %r40, %rs15; + mov.b32 %f19, %r40; + cvt.f32.bf16 %r41, %rs16; + mov.b32 %f20, %r41; + .loc 1 35 31 + mul.wide.u32 %rd23, %r83, 4; + add.s64 %rd6, %rd17, %rd23; + .loc 1 35 36 + mov.u32 %r42, 0x0; + mov.u32 %r43, 0x0; + mov.u32 %r44, 0x0; + mov.u32 %r45, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r42, %r43, %r44, %r45 }, [ %rd6 + 0 ]; + @!%p1 mov.u32 %r42, %r6; + @!%p1 mov.u32 %r43, %r6; + @!%p1 mov.u32 %r44, %r6; + @!%p1 mov.u32 %r45, %r6; + .loc 1 37 18 + add.f32 %f21, %f5, %f1; + add.f32 %f22, %f6, %f2; + add.f32 %f23, %f7, %f3; + .loc 1 39 18 + add.f32 %f24, %f21, %f9; + add.f32 %f25, %f22, %f10; + add.f32 %f26, %f23, %f11; + .loc 1 41 18 + add.f32 %f27, %f25, %f14; + add.f32 %f28, %f26, %f15; + .loc 1 43 19 + add.f32 %f29, %f27, %f18; + add.f32 %f30, %f28, %f19; + .loc 1 41 18 + add.f32 %f31, %f24, %f13; + add.f32 %f32, %f8, %f4; + .loc 1 43 19 + add.f32 %f33, %f32, %f12; + add.f32 %f34, %f31, %f17; +$L__tmp1: + .loc 2 233 15 + add.f32 %f35, %f34, %f29; + add.f32 %f36, %f33, %f16; + add.f32 %f37, %f35, %f30; + add.f32 %f38, %f36, %f20; + mov.b32 %r71, %f38; + add.f32 %f39, %f37, %f38; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r86, %f39; + shfl.sync.bfly.b32 %r87, %r86, 16, 31, -1; + mov.b32 %f40, %r87; +$L__tmp3: + .loc 2 233 15 + add.f32 %f41, %f39, %f40; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r88, %f41; + shfl.sync.bfly.b32 %r89, %r88, 8, 31, -1; + mov.b32 %f42, %r89; +$L__tmp5: + .loc 2 233 15 + add.f32 %f43, %f41, %f42; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r90, %f43; + shfl.sync.bfly.b32 %r91, %r90, 4, 31, -1; + mov.b32 %f44, %r91; +$L__tmp7: + .loc 2 233 15 + add.f32 %f45, %f43, %f44; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r92, %f45; + shfl.sync.bfly.b32 %r93, %r92, 2, 31, -1; + mov.b32 %f46, %r93; +$L__tmp9: + .loc 2 233 15 + add.f32 %f47, %f45, %f46; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r94, %f47; + shfl.sync.bfly.b32 %r95, %r94, 1, 31, -1; + mov.b32 %f48, %r95; +$L__tmp11: + .loc 2 233 15 + add.f32 %f49, %f47, %f48; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p23, %r81, 0; + shr.u32 %r96, %r80, 3; + and.b32 %r97, %r96, 4; + mov.u32 %r98, global_smem; + add.s32 %r50, %r98, %r97; + mov.b32 %r51, %f49; + @%p23 st.shared.b32 [ %r50 + 0 ], %r51; + bar.sync 0; + setp.lt.s32 %p24, %r80, 2; + shl.b32 %r99, %r80, 2; + add.s32 %r53, %r98, %r99; + @%p24 ld.shared.b32 %r52, [ %r53 + 0 ]; + mov.b32 %f50, %r52; + shfl.sync.bfly.b32 %r100, %r52, 1, 31, -1; + mov.b32 %f51, %r100; +$L__tmp13: + .loc 2 233 15 + add.f32 %f52, %f50, %f51; +$L__tmp14: + .loc 2 243 36 + and.b32 %r101, %r80, 1; + setp.eq.b32 %p33, %r101, 1; + not.pred %p34, %p33; + and.pred %p25, %p24, %p34; + mov.b32 %r55, %f52; + @%p25 st.shared.b32 [ %r53 + 0 ], %r55; + bar.sync 0; + ld.shared.f32 %f53, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f54, %f53, 0f00000000; +$L__tmp16: + .loc 1 51 20 + mov.b32 %r57, %f54; + mov.b32 %r58, 1132462080; + div.full.f32 %r79, %r57, %r58; + mov.b32 %f55, %r79; + .loc 1 52 20 + sub.f32 %f56, %f34, %f55; + sub.f32 %f57, %f29, %f55; + sub.f32 %f58, %f30, %f55; + sub.f32 %f59, %f38, %f55; + .loc 1 53 20 + mul.f32 %f60, %f57, %f57; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f61, %f56, %f56, %f60; + fma.rn.f32 %f62, %f58, %f58, %f61; + fma.rn.f32 %f63, %f59, %f59, %f62; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r102, %f63; + shfl.sync.bfly.b32 %r103, %r102, 16, 31, -1; + mov.b32 %f64, %r103; +$L__tmp20: + .loc 2 233 15 + add.f32 %f65, %f63, %f64; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r104, %f65; + shfl.sync.bfly.b32 %r105, %r104, 8, 31, -1; + mov.b32 %f66, %r105; +$L__tmp22: + .loc 2 233 15 + add.f32 %f67, %f65, %f66; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r106, %f67; + shfl.sync.bfly.b32 %r107, %r106, 4, 31, -1; + mov.b32 %f68, %r107; +$L__tmp24: + .loc 2 233 15 + add.f32 %f69, %f67, %f68; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r108, %f69; + shfl.sync.bfly.b32 %r109, %r108, 2, 31, -1; + mov.b32 %f70, %r109; +$L__tmp26: + .loc 2 233 15 + add.f32 %f71, %f69, %f70; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r110, %f71; + shfl.sync.bfly.b32 %r111, %r110, 1, 31, -1; + mov.b32 %f72, %r111; +$L__tmp28: + .loc 2 233 15 + add.f32 %f73, %f71, %f72; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r60, %f73; + @%p23 st.shared.b32 [ %r50 + 0 ], %r60; + bar.sync 0; + @%p24 ld.shared.b32 %r61, [ %r53 + 0 ]; + mov.b32 %f74, %r61; + shfl.sync.bfly.b32 %r112, %r61, 1, 31, -1; + mov.b32 %f75, %r112; +$L__tmp30: + .loc 2 233 15 + add.f32 %f76, %f74, %f75; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r64, %f76; + @%p25 st.shared.b32 [ %r53 + 0 ], %r64; + bar.sync 0; + ld.shared.f32 %f77, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f78, %f77, 0f00000000; +$L__tmp33: + .loc 1 58 20 + mov.b32 %r66, %f78; + div.full.f32 %r65, %r66, %r58; + mov.b32 %f79, %r65; + .loc 1 60 20 + add.f32 %f80, %f79, 0f3727C5AC; + .loc 1 61 26 + rsqrt.approx.ftz.f32 %f81, %f80; + .loc 1 35 36 + mov.b32 %f82, %r45; + mov.b32 %f83, %r44; + mov.b32 %f84, %r43; + mov.b32 %f85, %r42; + .loc 1 63 20 + mul.f32 %f86, %f56, %f81; + mul.f32 %f87, %f57, %f81; + mul.f32 %f88, %f58, %f81; + mul.f32 %f89, %f59, %f81; + .loc 1 64 20 + mul.f32 %f90, %f86, %f85; + mul.f32 %f91, %f87, %f84; + mul.f32 %f92, %f88, %f83; + mul.f32 %f93, %f89, %f82; + .loc 1 66 25 + add.s64 %rd7, %rd18, %rd21; + .loc 1 66 48 + mov.b32 %r68, %f34; + mov.b32 %r69, %f29; + mov.b32 %r70, %f30; + @%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r68, %r69, %r70, %r71 }; + .loc 1 67 4 + bar.sync 0; + .loc 1 68 28 + mul.wide.s32 %rd24, %r1, 4; + add.s64 %rd8, %rd11, %rd24; + .loc 1 68 40 + setp.eq.s32 %p30, %r82, 0; + mov.b32 %r72, %f81; + @%p30 st.global.b32 [ %rd8 + 0 ], { %r72 }; + .loc 1 69 25 + add.s64 %rd9, %rd20, %rd22; + .loc 1 69 48 + mov.b32 %r73, %f90; + cvt.rn.bf16.f32 %rs17, %r73; + mov.b32 %r74, %f91; + cvt.rn.bf16.f32 %rs18, %r74; + mov.b32 %r75, %f92; + cvt.rn.bf16.f32 %rs19, %r75; + mov.b32 %r76, %f93; + cvt.rn.bf16.f32 %rs20, %r76; + mov.b32 %r113, {%rs17, %rs18}; + mov.b32 %r114, {%rs19, %rs20}; + @%p1 st.global.v2.b32 [ %rd9 + 0 ], { %r113, %r114 }; + .loc 1 70 25 + add.s64 %rd10, %rd19, %rd24; + .loc 1 70 37 + @%p30 st.global.b32 [ %rd10 + 0 ], { %r79 }; + .loc 1 70 4 + ret; +$L__tmp34: +$L__func_end0: + +} + // .globl __nv_rsqrtf +.visible .func (.param .b32 func_retval0) __nv_rsqrtf( + .param .b32 __nv_rsqrtf_param_0 +) +{ + .reg .f32 %f<3>; +$L__func_begin1: + + ld.param.f32 %f1, [__nv_rsqrtf_param_0]; + rsqrt.approx.ftz.f32 %f2, %f1; + st.param.f32 [func_retval0+0], %f2; + ret; +$L__func_end1: + +} + .file 1 "/tmp/torchinductor_root/il/cilofmivtj4aqoxmz3r7fz7sc3blcxfzk3utwsuayln6lpg5jwtv.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 419 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 105 +.b8 108 +.b8 111 +.b8 102 +.b8 109 +.b8 105 +.b8 118 +.b8 116 +.b8 106 +.b8 52 +.b8 97 +.b8 113 +.b8 111 +.b8 120 +.b8 109 +.b8 122 +.b8 51 +.b8 114 +.b8 55 +.b8 102 +.b8 122 +.b8 55 +.b8 115 +.b8 99 +.b8 51 +.b8 98 +.b8 108 +.b8 99 +.b8 120 +.b8 102 +.b8 122 +.b8 107 +.b8 51 +.b8 117 +.b8 116 +.b8 119 +.b8 115 +.b8 117 +.b8 97 +.b8 121 +.b8 108 +.b8 110 +.b8 54 +.b8 108 +.b8 112 +.b8 103 +.b8 53 +.b8 106 +.b8 119 +.b8 116 +.b8 118 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 105 +.b8 108 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 57 +.b8 100 +.b8 49 +.b8 48 +.b8 100 +.b8 101 +.b8 49 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 57 +.b8 100 +.b8 49 +.b8 48 +.b8 100 +.b8 101 +.b8 49 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 48 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 48 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 48 +.b8 45 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 56 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 56 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 56 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 423 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 57 +.b8 100 +.b8 49 +.b8 48 +.b8 100 +.b8 101 +.b8 49 +.b8 49 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 423 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.cubin b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..900de43634ac6db616b83bfd8a8ba7423cb29829 Binary files /dev/null and b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.cubin differ diff --git a/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.llir b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..d906f452141808f21c61f8ec5e97b1449f424a0d --- /dev/null +++ b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.llir @@ -0,0 +1,53 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2) local_unnamed_addr !dbg !5 { + %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %5 = shl i32 %4, 1, !dbg !8 + %6 = and i32 %5, 510, !dbg !8 + %7 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9 + %8 = shl i32 %7, 9, !dbg !10 + %9 = or i32 %8, %6, !dbg !11 + %10 = sext i32 %9 to i64, !dbg !12 + %11 = getelementptr i16, ptr addrspace(1) %0, i64 %10, !dbg !12 + %12 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %11, i1 true) #1, !dbg !13 + %13 = trunc i32 %12 to i16, !dbg !13 + %extelt.offset = lshr i32 %12, 16, !dbg !13 + %14 = trunc i32 %extelt.offset to i16, !dbg !13 + %15 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %13) #1, !dbg !14 + %16 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %14) #1, !dbg !14 + %17 = getelementptr float, ptr addrspace(1) %1, i64 %10, !dbg !15 + %18 = bitcast float %15 to i32, !dbg !16 + %19 = bitcast float %16 to i32, !dbg !16 + tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 %18, i32 %19, ptr addrspace(1) %17, i1 true) #1, !dbg !16 + ret void, !dbg !17 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "cotbhet37v6mh5samql7uxre3hprpnbhuvim3fmrjpq5fgg6lwbi.py", directory: "/tmp/torchinductor_root/ot") +!3 = !{ptr @triton__0d1d2de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2de, !"maxntidx", i32 256} +!5 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 21, column: 36, scope: !5) +!9 = !DILocation(line: 20, column: 28, scope: !5) +!10 = !DILocation(line: 20, column: 33, scope: !5) +!11 = !DILocation(line: 21, column: 23, scope: !5) +!12 = !DILocation(line: 24, column: 30, scope: !5) +!13 = !DILocation(line: 24, column: 35, scope: !5) +!14 = !DILocation(line: 24, column: 44, scope: !5) +!15 = !DILocation(line: 26, column: 25, scope: !5) +!16 = !DILocation(line: 26, column: 36, scope: !5) +!17 = !DILocation(line: 26, column: 4, scope: !5) diff --git a/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ttgir b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..2844e7b30bcdd6d5fd4f8f7e5ee181d165252135 --- /dev/null +++ b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ttgir @@ -0,0 +1,19 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> + %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<512xi32, #blocked> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr, #blocked> + %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16, #blocked> + %8 = arith.extf %7 : tensor<512xbf16, #blocked> to tensor<512xf32, #blocked> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<512x!tt.ptr, #blocked> + %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> + tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ttir b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ttir new file mode 100644 index 0000000000000000000000000000000000000000..50602f214a3439fdc49ae67bf8b80948941d601b --- /dev/null +++ b/.triton/dump/ce93b2263681d57e69e2c7bae9ea3b76/triton_.ttir @@ -0,0 +1,18 @@ +module { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32> + %3 = tt.splat %1 : (i32) -> tensor<512xi32> + %4 = arith.addi %3, %2 : tensor<512xi32> + %5 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr> + %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr>, tensor<512xi32> + %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xbf16> + %8 = arith.extf %7 : tensor<512xbf16> to tensor<512xf32> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<512x!tt.ptr> + %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr>, tensor<512xi32> + tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32> + tt.return + } +} diff --git a/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.cubin b/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..c135c9853feed1f3fd4d2fb5fb6f260aacf379a4 Binary files /dev/null and b/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.cubin differ diff --git a/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.ttgir b/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..514d70caf1f170c1efee99b3d26aa1c6201a0397 --- /dev/null +++ b/.triton/dump/d9fd3a6f8185ff3a7c7960cca0bc8b8b/triton_.ttgir @@ -0,0 +1,21 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<12865792> : tensor<512xi32, #blocked> + %c512_i32 = arith.constant 512 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c512_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked> + %3 = tt.splat %1 : (i32) -> tensor<512xi32, #blocked> + %4 = arith.addi %3, %2 : tensor<512xi32, #blocked> + %5 = arith.cmpi slt, %4, %cst : tensor<512xi32, #blocked> + %6 = tt.splat %arg0 : (!tt.ptr) -> tensor<512x!tt.ptr, #blocked> + %7 = tt.addptr %6, %4 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> + %8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xf32, #blocked> + %9 = tt.splat %arg1 : (!tt.ptr) -> tensor<512x!tt.ptr, #blocked> + %10 = tt.addptr %9, %4 : tensor<512x!tt.ptr, #blocked>, tensor<512xi32, #blocked> + %11 = arith.truncf %8 : tensor<512xf32, #blocked> to tensor<512xbf16, #blocked> + tt.store %10, %11, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16, #blocked> + tt.return + } +} diff --git a/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.cubin b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.cubin new file mode 100644 index 0000000000000000000000000000000000000000..113fae93e2c87d8ed33a036760d6e263f3d9979c Binary files /dev/null and b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.cubin differ diff --git a/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.llir b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.llir new file mode 100644 index 0000000000000000000000000000000000000000..8772933a947f8310058944ad460f07f04c457ece --- /dev/null +++ b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.llir @@ -0,0 +1,148 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" + +@global_smem = external addrspace(3) global [0 x i8] + +define void @triton__0d1d2de3de(ptr addrspace(1) %0, ptr addrspace(1) %1, i32 %2, i32 %3) local_unnamed_addr !dbg !5 { + %5 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8 + %6 = and i32 %5, 31, !dbg !8 + %7 = lshr i32 %5, 5, !dbg !8 + %8 = and i32 %7, 3, !dbg !8 + %urem = and i32 %5, 127, !dbg !8 + %9 = or i32 %urem, 384, !dbg !8 + %10 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !9 + %11 = icmp slt i32 %10, 256, !dbg !10 + %12 = icmp ult i32 %9, 480, !dbg !11 + %13 = shl nuw nsw i32 %urem, 8, !dbg !12 + %14 = or i32 %13, 32768, !dbg !12 + %15 = or i32 %13, 65536, !dbg !12 + %16 = shl nuw nsw i32 %9, 8, !dbg !12 + %17 = add i32 %10, %13, !dbg !13 + %18 = add i32 %14, %10, !dbg !13 + %19 = add i32 %15, %10, !dbg !13 + %20 = add i32 %10, %16, !dbg !13 + %21 = sext i32 %17 to i64, !dbg !14 + %22 = getelementptr float, ptr addrspace(1) %0, i64 %21, !dbg !14 + %23 = sext i32 %18 to i64, !dbg !14 + %24 = getelementptr float, ptr addrspace(1) %0, i64 %23, !dbg !14 + %25 = sext i32 %19 to i64, !dbg !14 + %26 = getelementptr float, ptr addrspace(1) %0, i64 %25, !dbg !14 + %27 = sext i32 %20 to i64, !dbg !14 + %28 = getelementptr float, ptr addrspace(1) %0, i64 %27, !dbg !14 + %29 = and i1 %12, %11, !dbg !15 + %30 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %22, i1 %11, i32 0, i1 %11) #3, !dbg !16 + %31 = bitcast i32 %30 to float, !dbg !16 + %32 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %24, i1 %11, i32 0, i1 %11) #3, !dbg !16 + %33 = bitcast i32 %32 to float, !dbg !16 + %34 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %26, i1 %11, i32 0, i1 %11) #3, !dbg !16 + %35 = bitcast i32 %34 to float, !dbg !16 + %36 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %28, i1 %29, i32 0, i1 %29) #3, !dbg !16 + %37 = bitcast i32 %36 to float, !dbg !16 + %38 = fadd float %31, 0.000000e+00, !dbg !17 + %39 = fadd float %33, 0.000000e+00, !dbg !17 + %40 = fadd float %35, 0.000000e+00, !dbg !17 + %41 = fadd float %37, 0.000000e+00, !dbg !17 + %42 = select i1 %29, float %41, float 0.000000e+00, !dbg !18 + %43 = fadd float %38, %39, !dbg !19 + %44 = fadd float %43, %40, !dbg !19 + %45 = select i1 %11, float %44, float 0.000000e+00, !dbg !19 + %46 = fadd float %45, %42, !dbg !19 + %47 = bitcast float %46 to i32, !dbg !25 + %48 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %47, i32 16, i32 31), !dbg !25 + %49 = bitcast i32 %48 to float, !dbg !25 + %50 = fadd float %46, %49, !dbg !19 + %51 = bitcast float %50 to i32, !dbg !25 + %52 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %51, i32 8, i32 31), !dbg !25 + %53 = bitcast i32 %52 to float, !dbg !25 + %54 = fadd float %50, %53, !dbg !19 + %55 = bitcast float %54 to i32, !dbg !25 + %56 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %55, i32 4, i32 31), !dbg !25 + %57 = bitcast i32 %56 to float, !dbg !25 + %58 = fadd float %54, %57, !dbg !19 + %59 = bitcast float %58 to i32, !dbg !25 + %60 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %59, i32 2, i32 31), !dbg !25 + %61 = bitcast i32 %60 to float, !dbg !25 + %62 = fadd float %58, %61, !dbg !19 + %63 = bitcast float %62 to i32, !dbg !25 + %64 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %63, i32 1, i32 31), !dbg !25 + %65 = bitcast i32 %64 to float, !dbg !25 + %66 = fadd float %62, %65, !dbg !19 + %67 = icmp eq i32 %6, 0, !dbg !25 + %68 = zext nneg i32 %8 to i64, !dbg !25 + %69 = getelementptr float, ptr addrspace(3) @global_smem, i64 %68, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %69, float %66, i1 %67) #3, !dbg !25 + tail call void @llvm.nvvm.barrier0(), !dbg !25 + %70 = icmp slt i32 %5, 4, !dbg !25 + %71 = sext i32 %5 to i64, !dbg !25 + %72 = getelementptr float, ptr addrspace(3) @global_smem, i64 %71, !dbg !25 + %73 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %72, i1 %70) #3, !dbg !25 + %74 = bitcast float %73 to i32, !dbg !25 + %75 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %74, i32 2, i32 31), !dbg !25 + %76 = bitcast i32 %75 to float, !dbg !25 + %77 = fadd float %73, %76, !dbg !19 + %78 = bitcast float %77 to i32, !dbg !25 + %79 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %78, i32 1, i32 31), !dbg !25 + %80 = bitcast i32 %79 to float, !dbg !25 + %81 = fadd float %77, %80, !dbg !19 + %82 = and i32 %5, 3, !dbg !25 + %83 = icmp eq i32 %82, 0, !dbg !25 + %84 = and i1 %70, %83, !dbg !25 + tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %72, float %81, i1 %84) #3, !dbg !25 + tail call void @llvm.nvvm.barrier0(), !dbg !25 + %85 = load i32, ptr addrspace(3) @global_smem, align 4, !dbg !25 + %86 = sext i32 %10 to i64, !dbg !27 + %87 = getelementptr float, ptr addrspace(1) %1, i64 %86, !dbg !27 + %88 = icmp eq i32 %urem, 0, !dbg !28 + %89 = and i1 %88, %11, !dbg !28 + tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 %85, ptr addrspace(1) %87, i1 %89) #3, !dbg !28 + ret void, !dbg !29 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0 + +; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite) +declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1 + +; Function Attrs: convergent nocallback nounwind +declare void @llvm.nvvm.barrier0() #2 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent nocallback nounwind } +attributes #3 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} +!nvvm.annotations = !{!3, !4, !4, !3} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "ccizx54ebt45pqvf7it3p5t23oudtaqbed2j3uakpossm65m4cax.py", directory: "/tmp/torchinductor_root/ci") +!3 = !{ptr @triton__0d1d2de3de, !"kernel", i32 1} +!4 = !{ptr @triton__0d1d2de3de, !"maxntidx", i32 128} +!5 = distinct !DISubprogram(name: "triton__0d1d2de3de", linkageName: "triton__0d1d2de3de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1) +!6 = !DISubroutineType(cc: DW_CC_normal, types: !7) +!7 = !{} +!8 = !DILocation(line: 24, column: 33, scope: !5) +!9 = !DILocation(line: 21, column: 28, scope: !5) +!10 = !DILocation(line: 23, column: 21, scope: !5) +!11 = !DILocation(line: 29, column: 25, scope: !5) +!12 = !DILocation(line: 31, column: 44, scope: !5) +!13 = !DILocation(line: 31, column: 40, scope: !5) +!14 = !DILocation(line: 31, column: 34, scope: !5) +!15 = !DILocation(line: 31, column: 58, scope: !5) +!16 = !DILocation(line: 31, column: 50, scope: !5) +!17 = !DILocation(line: 33, column: 23, scope: !5) +!18 = !DILocation(line: 34, column: 46, scope: !5) +!19 = !DILocation(line: 233, column: 15, scope: !20, inlinedAt: !23) +!20 = distinct !DILexicalBlockFile(scope: !22, file: !21, discriminator: 0) +!21 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language") +!22 = distinct !DILexicalBlockFile(scope: !5, file: !21, discriminator: 0) +!23 = !DILocation(line: 243, column: 36, scope: !20, inlinedAt: !24) +!24 = !DILocation(line: 35, column: 25, scope: !20) +!25 = !DILocation(line: 243, column: 36, scope: !22, inlinedAt: !26) +!26 = !DILocation(line: 35, column: 25, scope: !22) +!27 = !DILocation(line: 36, column: 25, scope: !5) +!28 = !DILocation(line: 36, column: 36, scope: !5) +!29 = !DILocation(line: 36, column: 4, scope: !5) diff --git a/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.ptx b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..f9d3ed9a556bfcc716c4d81b7b966e9fb3b9bbc3 --- /dev/null +++ b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.ptx @@ -0,0 +1,496 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2de3de +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2de3de( + .param .u64 triton__0d1d2de3de_param_0, + .param .u64 triton__0d1d2de3de_param_1, + .param .u32 triton__0d1d2de3de_param_2, + .param .u32 triton__0d1d2de3de_param_3 +) +.maxntid 128, 1, 1 +{ + .reg .pred %p<16>; + .reg .b32 %r<45>; + .reg .f32 %f<29>; + .reg .b64 %rd<13>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd6, [triton__0d1d2de3de_param_0]; + ld.param.u64 %rd7, [triton__0d1d2de3de_param_1]; +$L__tmp0: + .loc 1 24 33 + mov.u32 %r17, %tid.x; + and.b32 %r18, %r17, 31; + and.b32 %r19, %r17, 127; + or.b32 %r20, %r19, 384; + .loc 1 21 28 + mov.u32 %r1, %ctaid.x; + .loc 1 23 21 + setp.lt.s32 %p1, %r1, 256; + .loc 1 29 25 + setp.lt.u32 %p13, %r20, 480; + .loc 1 31 44 + shl.b32 %r21, %r19, 8; + shl.b32 %r22, %r20, 8; + .loc 1 31 40 + add.s32 %r23, %r1, %r21; + add.s32 %r24, %r23, 32768; + add.s32 %r25, %r23, 65536; + add.s32 %r26, %r1, %r22; + .loc 1 31 34 + mul.wide.s32 %rd8, %r23, 4; + add.s64 %rd1, %rd6, %rd8; + mul.wide.s32 %rd9, %r24, 4; + add.s64 %rd2, %rd6, %rd9; + mul.wide.s32 %rd10, %r25, 4; + add.s64 %rd3, %rd6, %rd10; + mul.wide.s32 %rd11, %r26, 4; + add.s64 %rd4, %rd6, %rd11; + .loc 1 31 58 + and.pred %p7, %p13, %p1; + mov.b32 %r3, 0; + .loc 1 31 50 + mov.u32 %r2, 0x0; + @%p1 ld.global.L1::evict_first.b32 { %r2 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r3; + mov.b32 %f1, %r2; + mov.u32 %r4, 0x0; + @%p1 ld.global.L1::evict_first.b32 { %r4 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r4, %r3; + mov.b32 %f2, %r4; + mov.u32 %r6, 0x0; + @%p1 ld.global.L1::evict_first.b32 { %r6 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r6, %r3; + mov.b32 %f3, %r6; + mov.u32 %r8, 0x0; + @%p7 ld.global.L1::evict_first.b32 { %r8 }, [ %rd4 + 0 ]; + @!%p7 mov.u32 %r8, %r3; + mov.b32 %f4, %r8; + .loc 1 33 23 + add.f32 %f5, %f1, 0f00000000; + add.f32 %f6, %f2, 0f00000000; + add.f32 %f7, %f3, 0f00000000; + add.f32 %f8, %f4, 0f00000000; + .loc 1 34 46 + selp.f32 %f9, %f8, 0f00000000, %p7; +$L__tmp1: + .loc 2 233 15 + add.f32 %f10, %f5, %f6; + add.f32 %f11, %f10, %f7; + selp.f32 %f12, %f11, 0f00000000, %p1; + add.f32 %f13, %f12, %f9; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r27, %f13; + shfl.sync.bfly.b32 %r28, %r27, 16, 31, -1; + mov.b32 %f14, %r28; +$L__tmp3: + .loc 2 233 15 + add.f32 %f15, %f13, %f14; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r29, %f15; + shfl.sync.bfly.b32 %r30, %r29, 8, 31, -1; + mov.b32 %f16, %r30; +$L__tmp5: + .loc 2 233 15 + add.f32 %f17, %f15, %f16; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r31, %f17; + shfl.sync.bfly.b32 %r32, %r31, 4, 31, -1; + mov.b32 %f18, %r32; +$L__tmp7: + .loc 2 233 15 + add.f32 %f19, %f17, %f18; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r33, %f19; + shfl.sync.bfly.b32 %r34, %r33, 2, 31, -1; + mov.b32 %f20, %r34; +$L__tmp9: + .loc 2 233 15 + add.f32 %f21, %f19, %f20; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r35, %f21; + shfl.sync.bfly.b32 %r36, %r35, 1, 31, -1; + mov.b32 %f22, %r36; +$L__tmp11: + .loc 2 233 15 + add.f32 %f23, %f21, %f22; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p9, %r18, 0; + shr.u32 %r37, %r17, 3; + and.b32 %r38, %r37, 12; + mov.u32 %r39, global_smem; + add.s32 %r10, %r39, %r38; + mov.b32 %r11, %f23; + @%p9 st.shared.b32 [ %r10 + 0 ], %r11; + bar.sync 0; + setp.lt.s32 %p10, %r17, 4; + shl.b32 %r40, %r17, 2; + add.s32 %r13, %r39, %r40; + @%p10 ld.shared.b32 %r12, [ %r13 + 0 ]; + mov.b32 %f24, %r12; + shfl.sync.bfly.b32 %r41, %r12, 2, 31, -1; + mov.b32 %f25, %r41; +$L__tmp13: + .loc 2 233 15 + add.f32 %f26, %f24, %f25; +$L__tmp14: + .loc 2 243 36 + mov.b32 %r42, %f26; + shfl.sync.bfly.b32 %r43, %r42, 1, 31, -1; + mov.b32 %f27, %r43; +$L__tmp15: + .loc 2 233 15 + add.f32 %f28, %f26, %f27; +$L__tmp16: + .loc 2 243 36 + and.b32 %r44, %r17, 3; + setp.eq.s32 %p14, %r44, 0; + and.pred %p11, %p10, %p14; + mov.b32 %r15, %f28; + @%p11 st.shared.b32 [ %r13 + 0 ], %r15; + bar.sync 0; + ld.shared.u32 %r16, [global_smem]; +$L__tmp17: + .loc 1 36 25 + mul.wide.s32 %rd12, %r1, 4; + add.s64 %rd5, %rd7, %rd12; + .loc 1 36 36 + setp.eq.s32 %p15, %r19, 0; + and.pred %p12, %p15, %p1; + @%p12 st.global.b32 [ %rd5 + 0 ], { %r16 }; + .loc 1 36 4 + ret; +$L__tmp18: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/ci/ccizx54ebt45pqvf7it3p5t23oudtaqbed2j3uakpossm65m4cax.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 262 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 99 +.b8 105 +.b8 122 +.b8 120 +.b8 53 +.b8 52 +.b8 101 +.b8 98 +.b8 116 +.b8 52 +.b8 53 +.b8 112 +.b8 113 +.b8 118 +.b8 102 +.b8 55 +.b8 105 +.b8 116 +.b8 51 +.b8 112 +.b8 53 +.b8 116 +.b8 50 +.b8 51 +.b8 111 +.b8 117 +.b8 100 +.b8 116 +.b8 97 +.b8 113 +.b8 98 +.b8 101 +.b8 100 +.b8 50 +.b8 106 +.b8 51 +.b8 117 +.b8 97 +.b8 107 +.b8 112 +.b8 111 +.b8 115 +.b8 115 +.b8 109 +.b8 54 +.b8 53 +.b8 109 +.b8 52 +.b8 99 +.b8 97 +.b8 120 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 99 +.b8 105 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 51 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 51 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp16 +.b8 2 +.b8 35 +.b8 25 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp16 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp17 +.b8 2 +.b8 35 +.b8 25 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 266 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 101 +.b8 51 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 266 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.ttgir b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.ttgir new file mode 100644 index 0000000000000000000000000000000000000000..caa4a5e3e6954c37ad9630a5cb6f9d0a3d62b851 --- /dev/null +++ b/.triton/dump/dd5d4cf2a4d17b28a806beb81895fb2d/triton_.ttgir @@ -0,0 +1,35 @@ +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @triton__0d1d2de3de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<256> : tensor<1x512xi32, #blocked> + %cst_0 = arith.constant dense<480> : tensor<1x512xi32, #blocked> + %cst_1 = arith.constant dense<0.000000e+00> : tensor<1x512xf32, #blocked> + %c256_i32 = arith.constant 256 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.cmpi slt, %0, %c256_i32 : i32 + %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %3 = tt.expand_dims %2 {axis = 0 : i32} : (tensor<512xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x512xi32, #blocked> + %4 = arith.cmpi slt, %3, %cst_0 : tensor<1x512xi32, #blocked> + %5 = arith.muli %3, %cst : tensor<1x512xi32, #blocked> + %6 = tt.splat %0 : (i32) -> tensor<1x512xi32, #blocked> + %7 = arith.addi %6, %5 : tensor<1x512xi32, #blocked> + %8 = tt.splat %arg0 : (!tt.ptr) -> tensor<1x512x!tt.ptr, #blocked> + %9 = tt.addptr %8, %7 : tensor<1x512x!tt.ptr, #blocked>, tensor<1x512xi32, #blocked> + %10 = tt.splat %1 : (i1) -> tensor<1x512xi1, #blocked> + %11 = arith.andi %4, %10 : tensor<1x512xi1, #blocked> + %12 = tt.load %9, %11, %cst_1 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<1x512xf32, #blocked> + %13 = arith.addf %12, %cst_1 : tensor<1x512xf32, #blocked> + %14 = arith.select %11, %13, %cst_1 : tensor<1x512xi1, #blocked>, tensor<1x512xf32, #blocked> + %15 = "tt.reduce"(%14) <{axis = 1 : i32}> ({ + ^bb0(%arg4: f32, %arg5: f32): + %20 = arith.addf %arg4, %arg5 : f32 + tt.reduce.return %20 : f32 + }) : (tensor<1x512xf32, #blocked>) -> tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %16 = tt.expand_dims %15 {axis = 1 : i32} : (tensor<1xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<1x1xf32, #blocked> + %17 = tt.addptr %arg1, %0 : !tt.ptr, i32 + %18 = tt.splat %17 : (!tt.ptr) -> tensor<1x1x!tt.ptr, #blocked> + %19 = tt.splat %1 : (i1) -> tensor<1x1xi1, #blocked> + tt.store %18, %16, %19 {cache = 1 : i32, evict = 1 : i32} : tensor<1x1xf32, #blocked> + tt.return + } +} diff --git a/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ptx b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ptx new file mode 100644 index 0000000000000000000000000000000000000000..6c93281712f9e553538000e6a6a99e22f723d4fa --- /dev/null +++ b/.triton/dump/eb437f65020ef3b5ef28e6f8fcc9380a/triton_.ptx @@ -0,0 +1,1360 @@ +// +// Generated by LLVM NVPTX Back-End +// + +.version 8.2 +.target sm_89 +.address_size 64 + + // .globl triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de +.extern .shared .align 1 .b8 global_smem[]; + +.visible .entry triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de( + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_0, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_1, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_2, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_3, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_4, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_5, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_6, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_7, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_8, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_9, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_10, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_11, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_12, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_13, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_14, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_15, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_16, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_17, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_18, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_19, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_20, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_21, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_22, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_23, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_24, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_25, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_26, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_27, + .param .u64 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_28, + .param .u32 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_29, + .param .u32 triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_30 +) +.maxntid 64, 1, 1 +{ + .reg .pred %p<96>; + .reg .b16 %rs<37>; + .reg .b32 %r<222>; + .reg .f32 %f<186>; + .reg .b64 %rd<93>; + .loc 1 18 0 +$L__func_begin0: + .loc 1 18 0 + + ld.param.u64 %rd60, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_0]; + ld.param.u64 %rd61, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_1]; +$L__tmp0: + .loc 1 26 26 + mov.u32 %r188, %tid.x; + and.b32 %r189, %r188, 31; + ld.param.u64 %rd62, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_2]; + ld.param.u64 %rd63, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_3]; + ld.param.u64 %rd64, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_4]; + shl.b32 %r190, %r188, 2; + ld.param.u64 %rd65, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_5]; + and.b32 %r191, %r190, 252; + ld.param.u64 %rd66, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_6]; + ld.param.u64 %rd67, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_7]; + .loc 1 23 28 + mov.u32 %r1, %ctaid.x; + .loc 1 30 40 + shl.b32 %r192, %r1, 8; + ld.param.u64 %rd68, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_8]; + .loc 1 30 36 + or.b32 %r193, %r192, %r191; + ld.param.u64 %rd69, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_9]; + ld.param.u64 %rd70, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_10]; + .loc 1 30 30 + mul.wide.s32 %rd71, %r193, 4; + add.s64 %rd1, %rd60, %rd71; + ld.param.u64 %rd72, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_11]; + mov.b32 %r6, 0; + mov.pred %p1, -1; + .loc 1 30 46 + mov.u32 %r2, 0x0; + mov.u32 %r3, 0x0; + mov.u32 %r4, 0x0; + mov.u32 %r5, 0x0; + @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd1 + 0 ]; + @!%p1 mov.u32 %r2, %r6; + @!%p1 mov.u32 %r3, %r6; + @!%p1 mov.u32 %r4, %r6; + @!%p1 mov.u32 %r5, %r6; + ld.param.u64 %rd73, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_12]; + ld.param.u64 %rd74, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_13]; + ld.param.u64 %rd75, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_14]; + ld.param.u64 %rd76, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_15]; + ld.param.u64 %rd77, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_16]; + mov.b32 %f1, %r2; + ld.param.u64 %rd78, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_17]; + mov.b32 %f2, %r3; + ld.param.u64 %rd79, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_18]; + mov.b32 %f3, %r4; + ld.param.u64 %rd80, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_19]; + mov.b32 %f4, %r5; + ld.param.u64 %rd81, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_20]; + .loc 1 31 30 + mul.wide.s32 %rd82, %r193, 2; + add.s64 %rd2, %rd61, %rd82; + ld.param.u64 %rd83, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_21]; + .loc 1 31 46 + mov.u32 %r10, 0x0; + mov.u32 %r11, 0x0; + @%p1 ld.global.v2.b32 { %r10, %r11 }, [ %rd2 + 0 ]; + @!%p1 mov.u32 %r10, %r6; + @!%p1 mov.u32 %r11, %r6; + ld.param.u64 %rd84, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_22]; + ld.param.u64 %rd85, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_23]; + ld.param.u64 %rd86, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_24]; + cvt.u16.u32 %rs1, %r10; + ld.param.u64 %rd87, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_25]; + ld.param.u64 %rd88, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_26]; + { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r10; } + ld.param.u64 %rd89, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_27]; + cvt.u16.u32 %rs3, %r11; + ld.param.u64 %rd90, [triton__0d1d2d3d4d5d6d7d8d9d10d11d12d13d14d15d16d17d18d19d20d21d22d23d24d25d26d27d28d29de30de_param_28]; + { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r11; } + .loc 1 31 67 + cvt.f32.bf16 %r14, %rs1; + mov.b32 %f5, %r14; + cvt.f32.bf16 %r15, %rs2; + mov.b32 %f6, %r15; + cvt.f32.bf16 %r16, %rs3; + mov.b32 %f7, %r16; + cvt.f32.bf16 %r17, %rs4; + mov.b32 %f8, %r17; + .loc 1 32 30 + add.s64 %rd3, %rd62, %rd82; + .loc 1 32 46 + mov.u32 %r18, 0x0; + mov.u32 %r19, 0x0; + @%p1 ld.global.v2.b32 { %r18, %r19 }, [ %rd3 + 0 ]; + @!%p1 mov.u32 %r18, %r6; + @!%p1 mov.u32 %r19, %r6; + cvt.u16.u32 %rs5, %r18; + { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r18; } + cvt.u16.u32 %rs7, %r19; + { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r19; } + .loc 1 32 67 + cvt.f32.bf16 %r22, %rs5; + mov.b32 %f9, %r22; + cvt.f32.bf16 %r23, %rs6; + mov.b32 %f10, %r23; + cvt.f32.bf16 %r24, %rs7; + mov.b32 %f11, %r24; + cvt.f32.bf16 %r25, %rs8; + mov.b32 %f12, %r25; + .loc 1 33 30 + mul.wide.s32 %rd91, %r1, 4; + add.s64 %rd4, %rd63, %rd91; + .loc 1 33 35 + mov.u32 %r26, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r26 }, [ %rd4 + 0 ]; + mov.b32 %f13, %r26; + mov.u32 %r27, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r27 }, [ %rd4 + 0 ]; + mov.u32 %r28, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r28 }, [ %rd4 + 0 ]; + mov.u32 %r29, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r29 }, [ %rd4 + 0 ]; + .loc 1 34 30 + add.s64 %rd8, %rd64, %rd91; + .loc 1 34 35 + mov.u32 %r30, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r30 }, [ %rd8 + 0 ]; + mov.b32 %f14, %r30; + mov.u32 %r31, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r31 }, [ %rd8 + 0 ]; + mov.u32 %r32, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r32 }, [ %rd8 + 0 ]; + mov.u32 %r33, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r33 }, [ %rd8 + 0 ]; + .loc 1 35 31 + add.s64 %rd12, %rd65, %rd82; + .loc 1 35 47 + mov.u32 %r34, 0x0; + mov.u32 %r35, 0x0; + @%p1 ld.global.v2.b32 { %r34, %r35 }, [ %rd12 + 0 ]; + @!%p1 mov.u32 %r34, %r6; + @!%p1 mov.u32 %r35, %r6; + cvt.u16.u32 %rs9, %r34; + { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r34; } + cvt.u16.u32 %rs11, %r35; + { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r35; } + .loc 1 35 68 + cvt.f32.bf16 %r38, %rs9; + mov.b32 %f15, %r38; + cvt.f32.bf16 %r39, %rs10; + mov.b32 %f16, %r39; + cvt.f32.bf16 %r40, %rs11; + mov.b32 %f17, %r40; + cvt.f32.bf16 %r41, %rs12; + mov.b32 %f18, %r41; + .loc 1 36 31 + add.s64 %rd13, %rd66, %rd91; + .loc 1 36 36 + mov.u32 %r42, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r42 }, [ %rd13 + 0 ]; + mov.b32 %f19, %r42; + mov.u32 %r43, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r43 }, [ %rd13 + 0 ]; + mov.u32 %r44, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r44 }, [ %rd13 + 0 ]; + mov.u32 %r45, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r45 }, [ %rd13 + 0 ]; + .loc 1 37 31 + add.s64 %rd17, %rd67, %rd91; + .loc 1 37 36 + mov.u32 %r46, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r46 }, [ %rd17 + 0 ]; + mov.b32 %f20, %r46; + mov.u32 %r47, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r47 }, [ %rd17 + 0 ]; + mov.u32 %r48, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r48 }, [ %rd17 + 0 ]; + mov.u32 %r49, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r49 }, [ %rd17 + 0 ]; + .loc 1 38 31 + add.s64 %rd21, %rd68, %rd82; + .loc 1 38 47 + mov.u32 %r50, 0x0; + mov.u32 %r51, 0x0; + @%p1 ld.global.v2.b32 { %r50, %r51 }, [ %rd21 + 0 ]; + @!%p1 mov.u32 %r50, %r6; + @!%p1 mov.u32 %r51, %r6; + cvt.u16.u32 %rs13, %r50; + { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r50; } + cvt.u16.u32 %rs15, %r51; + { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r51; } + .loc 1 38 68 + cvt.f32.bf16 %r54, %rs13; + mov.b32 %f21, %r54; + cvt.f32.bf16 %r55, %rs14; + mov.b32 %f22, %r55; + cvt.f32.bf16 %r56, %rs15; + mov.b32 %f23, %r56; + cvt.f32.bf16 %r57, %rs16; + mov.b32 %f24, %r57; + .loc 1 39 31 + add.s64 %rd22, %rd69, %rd82; + .loc 1 39 47 + mov.u32 %r58, 0x0; + mov.u32 %r59, 0x0; + @%p1 ld.global.v2.b32 { %r58, %r59 }, [ %rd22 + 0 ]; + @!%p1 mov.u32 %r58, %r6; + @!%p1 mov.u32 %r59, %r6; + cvt.u16.u32 %rs17, %r58; + { .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r58; } + cvt.u16.u32 %rs19, %r59; + { .reg .b16 tmp; mov.b32 {tmp, %rs20}, %r59; } + .loc 1 39 68 + cvt.f32.bf16 %r62, %rs17; + mov.b32 %f25, %r62; + cvt.f32.bf16 %r63, %rs18; + mov.b32 %f26, %r63; + cvt.f32.bf16 %r64, %rs19; + mov.b32 %f27, %r64; + cvt.f32.bf16 %r65, %rs20; + mov.b32 %f28, %r65; + .loc 1 40 32 + add.s64 %rd23, %rd70, %rd82; + .loc 1 40 48 + mov.u32 %r66, 0x0; + mov.u32 %r67, 0x0; + @%p1 ld.global.v2.b32 { %r66, %r67 }, [ %rd23 + 0 ]; + @!%p1 mov.u32 %r66, %r6; + @!%p1 mov.u32 %r67, %r6; + cvt.u16.u32 %rs21, %r66; + { .reg .b16 tmp; mov.b32 {tmp, %rs22}, %r66; } + cvt.u16.u32 %rs23, %r67; + { .reg .b16 tmp; mov.b32 {tmp, %rs24}, %r67; } + .loc 1 40 69 + cvt.f32.bf16 %r70, %rs21; + mov.b32 %f29, %r70; + cvt.f32.bf16 %r71, %rs22; + mov.b32 %f30, %r71; + cvt.f32.bf16 %r72, %rs23; + mov.b32 %f31, %r72; + cvt.f32.bf16 %r73, %rs24; + mov.b32 %f32, %r73; + .loc 1 41 32 + add.s64 %rd24, %rd72, %rd91; + .loc 1 41 37 + mov.u32 %r74, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r74 }, [ %rd24 + 0 ]; + mov.b32 %f33, %r74; + mov.u32 %r75, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r75 }, [ %rd24 + 0 ]; + mov.u32 %r76, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r76 }, [ %rd24 + 0 ]; + mov.u32 %r77, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r77 }, [ %rd24 + 0 ]; + .loc 1 42 32 + add.s64 %rd28, %rd73, %rd91; + .loc 1 42 37 + mov.u32 %r78, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r78 }, [ %rd28 + 0 ]; + mov.b32 %f34, %r78; + mov.u32 %r79, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r79 }, [ %rd28 + 0 ]; + mov.u32 %r80, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r80 }, [ %rd28 + 0 ]; + mov.u32 %r81, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r81 }, [ %rd28 + 0 ]; + .loc 1 43 32 + add.s64 %rd32, %rd74, %rd82; + .loc 1 43 48 + mov.u32 %r82, 0x0; + mov.u32 %r83, 0x0; + @%p1 ld.global.v2.b32 { %r82, %r83 }, [ %rd32 + 0 ]; + @!%p1 mov.u32 %r82, %r6; + @!%p1 mov.u32 %r83, %r6; + cvt.u16.u32 %rs25, %r82; + { .reg .b16 tmp; mov.b32 {tmp, %rs26}, %r82; } + cvt.u16.u32 %rs27, %r83; + { .reg .b16 tmp; mov.b32 {tmp, %rs28}, %r83; } + .loc 1 43 69 + cvt.f32.bf16 %r86, %rs25; + mov.b32 %f35, %r86; + cvt.f32.bf16 %r87, %rs26; + mov.b32 %f36, %r87; + cvt.f32.bf16 %r88, %rs27; + mov.b32 %f37, %r88; + cvt.f32.bf16 %r89, %rs28; + mov.b32 %f38, %r89; + .loc 1 44 32 + add.s64 %rd33, %rd75, %rd91; + .loc 1 44 37 + mov.u32 %r90, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r90 }, [ %rd33 + 0 ]; + mov.b32 %f39, %r90; + mov.u32 %r91, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r91 }, [ %rd33 + 0 ]; + mov.u32 %r92, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r92 }, [ %rd33 + 0 ]; + mov.u32 %r93, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r93 }, [ %rd33 + 0 ]; + .loc 1 45 32 + add.s64 %rd37, %rd76, %rd91; + .loc 1 45 37 + mov.u32 %r94, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r94 }, [ %rd37 + 0 ]; + mov.b32 %f40, %r94; + mov.u32 %r95, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r95 }, [ %rd37 + 0 ]; + mov.u32 %r96, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r96 }, [ %rd37 + 0 ]; + mov.u32 %r97, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r97 }, [ %rd37 + 0 ]; + .loc 1 46 32 + add.s64 %rd41, %rd77, %rd82; + .loc 1 46 48 + mov.u32 %r98, 0x0; + mov.u32 %r99, 0x0; + @%p1 ld.global.v2.b32 { %r98, %r99 }, [ %rd41 + 0 ]; + @!%p1 mov.u32 %r98, %r6; + @!%p1 mov.u32 %r99, %r6; + cvt.u16.u32 %rs29, %r98; + { .reg .b16 tmp; mov.b32 {tmp, %rs30}, %r98; } + cvt.u16.u32 %rs31, %r99; + { .reg .b16 tmp; mov.b32 {tmp, %rs32}, %r99; } + .loc 1 46 69 + cvt.f32.bf16 %r102, %rs29; + mov.b32 %f41, %r102; + cvt.f32.bf16 %r103, %rs30; + mov.b32 %f42, %r103; + cvt.f32.bf16 %r104, %rs31; + mov.b32 %f43, %r104; + cvt.f32.bf16 %r105, %rs32; + mov.b32 %f44, %r105; + .loc 1 47 32 + add.s64 %rd42, %rd78, %rd91; + .loc 1 47 37 + mov.u32 %r106, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r106 }, [ %rd42 + 0 ]; + mov.b32 %f45, %r106; + mov.u32 %r107, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r107 }, [ %rd42 + 0 ]; + mov.u32 %r108, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r108 }, [ %rd42 + 0 ]; + mov.u32 %r109, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r109 }, [ %rd42 + 0 ]; + .loc 1 48 32 + add.s64 %rd46, %rd79, %rd91; + .loc 1 48 37 + mov.u32 %r143, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r143 }, [ %rd46 + 0 ]; + mov.b32 %f46, %r143; + mov.u32 %r111, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r111 }, [ %rd46 + 0 ]; + mov.u32 %r112, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r112 }, [ %rd46 + 0 ]; + mov.u32 %r113, 0x0; + @%p1 ld.global.L1::evict_last.b32 { %r113 }, [ %rd46 + 0 ]; + .loc 1 49 32 + add.s64 %rd50, %rd80, %rd71; + .loc 1 49 48 + mov.u32 %r114, 0x0; + mov.u32 %r115, 0x0; + mov.u32 %r116, 0x0; + mov.u32 %r117, 0x0; + @%p1 ld.global.v4.b32 { %r114, %r115, %r116, %r117 }, [ %rd50 + 0 ]; + @!%p1 mov.u32 %r114, %r6; + @!%p1 mov.u32 %r115, %r6; + @!%p1 mov.u32 %r116, %r6; + @!%p1 mov.u32 %r117, %r6; + .loc 1 50 32 + mul.wide.u32 %rd92, %r191, 4; + add.s64 %rd51, %rd81, %rd92; + .loc 1 50 37 + mov.u32 %r122, 0x0; + mov.u32 %r123, 0x0; + mov.u32 %r124, 0x0; + mov.u32 %r125, 0x0; + @%p1 ld.global.L1::evict_last.v4.b32 { %r122, %r123, %r124, %r125 }, [ %rd51 + 0 ]; + @!%p1 mov.u32 %r122, %r6; + @!%p1 mov.u32 %r123, %r6; + @!%p1 mov.u32 %r124, %r6; + @!%p1 mov.u32 %r125, %r6; + .loc 1 52 18 + add.f32 %f47, %f5, %f1; + add.f32 %f48, %f6, %f2; + add.f32 %f49, %f7, %f3; + add.f32 %f50, %f8, %f4; + .loc 1 54 18 + add.f32 %f51, %f47, %f9; + add.f32 %f52, %f48, %f10; + add.f32 %f53, %f49, %f11; + add.f32 %f54, %f50, %f12; + .loc 1 55 18 + sub.f32 %f55, %f51, %f13; + sub.f32 %f56, %f52, %f13; + sub.f32 %f57, %f53, %f13; + sub.f32 %f58, %f54, %f13; + .loc 1 56 19 + mul.f32 %f59, %f55, %f14; + mul.f32 %f60, %f56, %f14; + mul.f32 %f61, %f57, %f14; + mul.f32 %f62, %f58, %f14; + .loc 1 58 19 + add.f32 %f63, %f51, %f15; + add.f32 %f64, %f52, %f16; + add.f32 %f65, %f53, %f17; + add.f32 %f66, %f54, %f18; + .loc 1 59 20 + sub.f32 %f67, %f63, %f19; + sub.f32 %f68, %f64, %f19; + sub.f32 %f69, %f65, %f19; + sub.f32 %f70, %f66, %f19; + .loc 1 60 20 + mul.f32 %f71, %f67, %f20; + mul.f32 %f72, %f68, %f20; + mul.f32 %f73, %f69, %f20; + mul.f32 %f74, %f70, %f20; + .loc 1 62 20 + add.f32 %f75, %f63, %f21; + add.f32 %f76, %f64, %f22; + add.f32 %f77, %f65, %f23; + add.f32 %f78, %f66, %f24; + .loc 1 64 20 + add.f32 %f79, %f75, %f25; + add.f32 %f80, %f76, %f26; + add.f32 %f81, %f77, %f27; + add.f32 %f82, %f78, %f28; + .loc 1 66 20 + add.f32 %f83, %f79, %f29; + add.f32 %f84, %f80, %f30; + add.f32 %f85, %f81, %f31; + add.f32 %f86, %f82, %f32; + .loc 1 67 20 + sub.f32 %f87, %f83, %f33; + sub.f32 %f88, %f84, %f33; + sub.f32 %f89, %f85, %f33; + sub.f32 %f90, %f86, %f33; + .loc 1 68 20 + mul.f32 %f91, %f87, %f34; + mul.f32 %f92, %f88, %f34; + mul.f32 %f93, %f89, %f34; + mul.f32 %f94, %f90, %f34; + .loc 1 70 20 + add.f32 %f95, %f83, %f35; + add.f32 %f96, %f84, %f36; + add.f32 %f97, %f85, %f37; + add.f32 %f98, %f86, %f38; + .loc 1 71 20 + sub.f32 %f99, %f95, %f39; + sub.f32 %f100, %f96, %f39; + sub.f32 %f101, %f97, %f39; + sub.f32 %f102, %f98, %f39; + .loc 1 72 20 + mul.f32 %f103, %f99, %f40; + mul.f32 %f104, %f100, %f40; + mul.f32 %f105, %f101, %f40; + mul.f32 %f106, %f102, %f40; + .loc 1 74 20 + add.f32 %f107, %f95, %f41; + add.f32 %f108, %f96, %f42; + add.f32 %f109, %f97, %f43; + add.f32 %f110, %f98, %f44; + .loc 1 75 20 + sub.f32 %f111, %f107, %f45; + sub.f32 %f112, %f108, %f45; + sub.f32 %f113, %f109, %f45; + sub.f32 %f114, %f110, %f45; + .loc 1 76 20 + mul.f32 %f115, %f111, %f46; + mul.f32 %f116, %f112, %f46; + mul.f32 %f117, %f113, %f46; + mul.f32 %f118, %f114, %f46; + .loc 1 49 48 + mov.b32 %f119, %r115; + mov.b32 %f120, %r114; + .loc 1 50 37 + mov.b32 %f121, %r123; + mov.b32 %f122, %r122; + .loc 1 77 20 + mul.f32 %f123, %f120, %f122; + mul.f32 %f124, %f119, %f121; + .loc 1 49 48 + mov.b32 %f125, %r116; + mov.b32 %f126, %r117; + .loc 1 50 37 + mov.b32 %f127, %r124; + mov.b32 %f128, %r125; + .loc 1 77 20 + mul.f32 %f129, %f126, %f128; + mul.f32 %f130, %f125, %f127; +$L__tmp1: + .loc 2 233 15 + fma.rn.f32 %f131, %f120, %f122, %f124; + fma.rn.f32 %f132, %f125, %f127, %f131; + fma.rn.f32 %f133, %f126, %f128, %f132; +$L__tmp2: + .loc 2 243 36 + mov.b32 %r194, %f133; + shfl.sync.bfly.b32 %r195, %r194, 16, 31, -1; + mov.b32 %f134, %r195; +$L__tmp3: + .loc 2 233 15 + add.f32 %f135, %f133, %f134; +$L__tmp4: + .loc 2 243 36 + mov.b32 %r196, %f135; + shfl.sync.bfly.b32 %r197, %r196, 8, 31, -1; + mov.b32 %f136, %r197; +$L__tmp5: + .loc 2 233 15 + add.f32 %f137, %f135, %f136; +$L__tmp6: + .loc 2 243 36 + mov.b32 %r198, %f137; + shfl.sync.bfly.b32 %r199, %r198, 4, 31, -1; + mov.b32 %f138, %r199; +$L__tmp7: + .loc 2 233 15 + add.f32 %f139, %f137, %f138; +$L__tmp8: + .loc 2 243 36 + mov.b32 %r200, %f139; + shfl.sync.bfly.b32 %r201, %r200, 2, 31, -1; + mov.b32 %f140, %r201; +$L__tmp9: + .loc 2 233 15 + add.f32 %f141, %f139, %f140; +$L__tmp10: + .loc 2 243 36 + mov.b32 %r202, %f141; + shfl.sync.bfly.b32 %r203, %r202, 1, 31, -1; + mov.b32 %f142, %r203; +$L__tmp11: + .loc 2 233 15 + add.f32 %f143, %f141, %f142; +$L__tmp12: + .loc 2 243 36 + setp.eq.s32 %p80, %r189, 0; + shr.u32 %r204, %r188, 3; + and.b32 %r205, %r204, 4; + mov.u32 %r206, global_smem; + add.s32 %r130, %r206, %r205; + mov.b32 %r131, %f143; + @%p80 st.shared.b32 [ %r130 + 0 ], %r131; + bar.sync 0; + setp.lt.s32 %p81, %r188, 2; + add.s32 %r133, %r206, %r190; + @%p81 ld.shared.b32 %r132, [ %r133 + 0 ]; + mov.b32 %f144, %r132; + shfl.sync.bfly.b32 %r207, %r132, 1, 31, -1; + mov.b32 %f145, %r207; +$L__tmp13: + .loc 2 233 15 + add.f32 %f146, %f144, %f145; +$L__tmp14: + .loc 2 243 36 + and.b32 %r208, %r188, 1; + setp.eq.b32 %p94, %r208, 1; + not.pred %p95, %p94; + and.pred %p82, %p81, %p95; + mov.b32 %r135, %f146; + @%p82 st.shared.b32 [ %r133 + 0 ], %r135; + bar.sync 0; + ld.shared.f32 %f147, [global_smem]; +$L__tmp15: + .loc 3 8 15 + add.f32 %f148, %f147, 0f00000000; +$L__tmp16: + .loc 1 81 20 + mul.f32 %f149, %f116, %f124; +$L__tmp17: + .loc 2 243 36 + bar.sync 0; +$L__tmp18: + .loc 2 233 15 + fma.rn.f32 %f150, %f115, %f123, %f149; + fma.rn.f32 %f151, %f117, %f130, %f150; + fma.rn.f32 %f152, %f118, %f129, %f151; +$L__tmp19: + .loc 2 243 36 + mov.b32 %r209, %f152; + shfl.sync.bfly.b32 %r210, %r209, 16, 31, -1; + mov.b32 %f153, %r210; +$L__tmp20: + .loc 2 233 15 + add.f32 %f154, %f152, %f153; +$L__tmp21: + .loc 2 243 36 + mov.b32 %r211, %f154; + shfl.sync.bfly.b32 %r212, %r211, 8, 31, -1; + mov.b32 %f155, %r212; +$L__tmp22: + .loc 2 233 15 + add.f32 %f156, %f154, %f155; +$L__tmp23: + .loc 2 243 36 + mov.b32 %r213, %f156; + shfl.sync.bfly.b32 %r214, %r213, 4, 31, -1; + mov.b32 %f157, %r214; +$L__tmp24: + .loc 2 233 15 + add.f32 %f158, %f156, %f157; +$L__tmp25: + .loc 2 243 36 + mov.b32 %r215, %f158; + shfl.sync.bfly.b32 %r216, %r215, 2, 31, -1; + mov.b32 %f159, %r216; +$L__tmp26: + .loc 2 233 15 + add.f32 %f160, %f158, %f159; +$L__tmp27: + .loc 2 243 36 + mov.b32 %r217, %f160; + shfl.sync.bfly.b32 %r218, %r217, 1, 31, -1; + mov.b32 %f161, %r218; +$L__tmp28: + .loc 2 233 15 + add.f32 %f162, %f160, %f161; +$L__tmp29: + .loc 2 243 36 + mov.b32 %r137, %f162; + @%p80 st.shared.b32 [ %r130 + 0 ], %r137; + bar.sync 0; + @%p81 ld.shared.b32 %r138, [ %r133 + 0 ]; + mov.b32 %f163, %r138; + shfl.sync.bfly.b32 %r219, %r138, 1, 31, -1; + mov.b32 %f164, %r219; +$L__tmp30: + .loc 2 233 15 + add.f32 %f165, %f163, %f164; +$L__tmp31: + .loc 2 243 36 + mov.b32 %r141, %f165; + @%p82 st.shared.b32 [ %r133 + 0 ], %r141; + bar.sync 0; + ld.shared.f32 %f166, [global_smem]; +$L__tmp32: + .loc 3 8 15 + add.f32 %f167, %f166, 0f00000000; + mov.b32 %r144, 1132462080; +$L__tmp33: + .loc 1 86 20 + div.full.f32 %r142, %r143, %r144; + mov.b32 %f168, %r142; + .loc 1 88 20 + neg.f32 %f169, %f148; + fma.rn.f32 %f170, %f123, 0f43800000, %f169; + fma.rn.f32 %f171, %f124, 0f43800000, %f169; + fma.rn.f32 %f172, %f130, 0f43800000, %f169; + fma.rn.f32 %f173, %f129, 0f43800000, %f169; + .loc 1 90 20 + neg.f32 %f174, %f115; + fma.rn.f32 %f175, %f174, %f167, %f170; + neg.f32 %f176, %f116; + fma.rn.f32 %f177, %f176, %f167, %f171; + neg.f32 %f178, %f117; + fma.rn.f32 %f179, %f178, %f167, %f172; + neg.f32 %f180, %f118; + fma.rn.f32 %f181, %f180, %f167, %f173; + .loc 1 91 20 + mul.f32 %f182, %f168, %f175; + mul.f32 %f183, %f168, %f177; + mul.f32 %f184, %f168, %f179; + mul.f32 %f185, %f168, %f181; + .loc 1 93 25 + add.s64 %rd52, %rd83, %rd71; + .loc 1 93 48 + mov.b32 %r154, %f59; + mov.b32 %r155, %f60; + mov.b32 %r156, %f61; + mov.b32 %r157, %f62; + @%p1 st.global.v4.b32 [ %rd52 + 0 ], { %r154, %r155, %r156, %r157 }; + .loc 1 94 25 + add.s64 %rd53, %rd84, %rd71; + .loc 1 94 48 + mov.b32 %r158, %f71; + mov.b32 %r159, %f72; + mov.b32 %r160, %f73; + mov.b32 %r161, %f74; + @%p1 st.global.v4.b32 [ %rd53 + 0 ], { %r158, %r159, %r160, %r161 }; + .loc 1 95 25 + add.s64 %rd54, %rd85, %rd71; + .loc 1 95 48 + mov.b32 %r162, %f75; + mov.b32 %r163, %f76; + mov.b32 %r164, %f77; + mov.b32 %r165, %f78; + @%p1 st.global.v4.b32 [ %rd54 + 0 ], { %r162, %r163, %r164, %r165 }; + .loc 1 96 25 + add.s64 %rd55, %rd86, %rd71; + .loc 1 96 48 + mov.b32 %r166, %f91; + mov.b32 %r167, %f92; + mov.b32 %r168, %f93; + mov.b32 %r169, %f94; + @%p1 st.global.v4.b32 [ %rd55 + 0 ], { %r166, %r167, %r168, %r169 }; + .loc 1 97 25 + add.s64 %rd56, %rd87, %rd71; + .loc 1 97 48 + mov.b32 %r170, %f103; + mov.b32 %r171, %f104; + mov.b32 %r172, %f105; + mov.b32 %r173, %f106; + @%p1 st.global.v4.b32 [ %rd56 + 0 ], { %r170, %r171, %r172, %r173 }; + .loc 1 98 25 + add.s64 %rd57, %rd88, %rd71; + .loc 1 98 48 + mov.b32 %r174, %f115; + mov.b32 %r175, %f116; + mov.b32 %r176, %f117; + mov.b32 %r177, %f118; + @%p1 st.global.v4.b32 [ %rd57 + 0 ], { %r174, %r175, %r176, %r177 }; + .loc 1 99 25 + add.s64 %rd58, %rd89, %rd71; + .loc 1 99 48 + mov.b32 %r178, %f182; + mov.b32 %r179, %f183; + mov.b32 %r180, %f184; + mov.b32 %r181, %f185; + @%p1 st.global.v4.b32 [ %rd58 + 0 ], { %r178, %r179, %r180, %r181 }; + .loc 1 100 25 + add.s64 %rd59, %rd90, %rd82; + .loc 1 100 48 + cvt.rn.bf16.f32 %rs33, %r178; + cvt.rn.bf16.f32 %rs34, %r179; + cvt.rn.bf16.f32 %rs35, %r180; + cvt.rn.bf16.f32 %rs36, %r181; + mov.b32 %r220, {%rs33, %rs34}; + mov.b32 %r221, {%rs35, %rs36}; + @%p1 st.global.v2.b32 [ %rd59 + 0 ], { %r220, %r221 }; + .loc 1 100 4 + ret; +$L__tmp34: +$L__func_end0: + +} + .file 1 "/tmp/torchinductor_root/yo/cyo4ksjyladdfw6jgu5nyxbapyihb5b54nc6mogi76rx2lajsiff.py" + .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py" + .file 3 "/usr/local/lib/python3.10/dist-packages/torch/_inductor/triton_helpers.py" + .section .debug_abbrev + { +.b8 1 +.b8 17 +.b8 1 +.b8 37 +.b8 8 +.b8 19 +.b8 5 +.b8 3 +.b8 8 +.b8 16 +.b8 6 +.b8 27 +.b8 8 +.b8 180 +.b8 66 +.b8 12 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 0 +.b8 0 +.b8 2 +.b8 46 +.b8 0 +.b8 135 +.b8 64 +.b8 8 +.b8 3 +.b8 8 +.b8 58 +.b8 11 +.b8 59 +.b8 11 +.b8 63 +.b8 12 +.b8 32 +.b8 11 +.b8 0 +.b8 0 +.b8 3 +.b8 46 +.b8 1 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 64 +.b8 10 +.b8 49 +.b8 19 +.b8 0 +.b8 0 +.b8 4 +.b8 29 +.b8 1 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 5 +.b8 29 +.b8 0 +.b8 49 +.b8 19 +.b8 17 +.b8 1 +.b8 18 +.b8 1 +.b8 88 +.b8 11 +.b8 89 +.b8 11 +.b8 87 +.b8 11 +.b8 0 +.b8 0 +.b8 0 + } + .section .debug_info + { +.b32 533 +.b8 2 +.b8 0 +.b32 .debug_abbrev +.b8 8 +.b8 1 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 0 +.b8 2 +.b8 0 +.b8 99 +.b8 121 +.b8 111 +.b8 52 +.b8 107 +.b8 115 +.b8 106 +.b8 121 +.b8 108 +.b8 97 +.b8 100 +.b8 100 +.b8 102 +.b8 119 +.b8 54 +.b8 106 +.b8 103 +.b8 117 +.b8 53 +.b8 110 +.b8 121 +.b8 120 +.b8 98 +.b8 97 +.b8 112 +.b8 121 +.b8 105 +.b8 104 +.b8 98 +.b8 53 +.b8 98 +.b8 53 +.b8 52 +.b8 110 +.b8 99 +.b8 54 +.b8 109 +.b8 111 +.b8 103 +.b8 105 +.b8 55 +.b8 54 +.b8 114 +.b8 120 +.b8 50 +.b8 108 +.b8 97 +.b8 106 +.b8 115 +.b8 105 +.b8 102 +.b8 102 +.b8 46 +.b8 112 +.b8 121 +.b8 0 +.b32 .debug_line +.b8 47 +.b8 116 +.b8 109 +.b8 112 +.b8 47 +.b8 116 +.b8 111 +.b8 114 +.b8 99 +.b8 104 +.b8 105 +.b8 110 +.b8 100 +.b8 117 +.b8 99 +.b8 116 +.b8 111 +.b8 114 +.b8 95 +.b8 114 +.b8 111 +.b8 111 +.b8 116 +.b8 47 +.b8 121 +.b8 111 +.b8 0 +.b8 1 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 2 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 57 +.b8 100 +.b8 49 +.b8 48 +.b8 100 +.b8 49 +.b8 49 +.b8 100 +.b8 49 +.b8 50 +.b8 100 +.b8 49 +.b8 51 +.b8 100 +.b8 49 +.b8 52 +.b8 100 +.b8 49 +.b8 53 +.b8 100 +.b8 49 +.b8 54 +.b8 100 +.b8 49 +.b8 55 +.b8 100 +.b8 49 +.b8 56 +.b8 100 +.b8 49 +.b8 57 +.b8 100 +.b8 50 +.b8 48 +.b8 100 +.b8 50 +.b8 49 +.b8 100 +.b8 50 +.b8 50 +.b8 100 +.b8 50 +.b8 51 +.b8 100 +.b8 50 +.b8 52 +.b8 100 +.b8 50 +.b8 53 +.b8 100 +.b8 50 +.b8 54 +.b8 100 +.b8 50 +.b8 55 +.b8 100 +.b8 50 +.b8 56 +.b8 100 +.b8 50 +.b8 57 +.b8 100 +.b8 101 +.b8 51 +.b8 48 +.b8 100 +.b8 101 +.b8 0 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 57 +.b8 100 +.b8 49 +.b8 48 +.b8 100 +.b8 49 +.b8 49 +.b8 100 +.b8 49 +.b8 50 +.b8 100 +.b8 49 +.b8 51 +.b8 100 +.b8 49 +.b8 52 +.b8 100 +.b8 49 +.b8 53 +.b8 100 +.b8 49 +.b8 54 +.b8 100 +.b8 49 +.b8 55 +.b8 100 +.b8 49 +.b8 56 +.b8 100 +.b8 49 +.b8 57 +.b8 100 +.b8 50 +.b8 48 +.b8 100 +.b8 50 +.b8 49 +.b8 100 +.b8 50 +.b8 50 +.b8 100 +.b8 50 +.b8 51 +.b8 100 +.b8 50 +.b8 52 +.b8 100 +.b8 50 +.b8 53 +.b8 100 +.b8 50 +.b8 54 +.b8 100 +.b8 50 +.b8 55 +.b8 100 +.b8 50 +.b8 56 +.b8 100 +.b8 50 +.b8 57 +.b8 100 +.b8 101 +.b8 51 +.b8 48 +.b8 100 +.b8 101 +.b8 0 +.b8 1 +.b8 18 +.b8 1 +.b8 1 +.b8 3 +.b64 $L__func_begin0 +.b64 $L__func_end0 +.b8 1 +.b8 156 +.b32 125 +.b8 4 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 80 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp1 +.b64 $L__tmp14 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp2 +.b64 $L__tmp15 +.b8 2 +.b8 80 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp15 +.b64 $L__tmp16 +.b8 3 +.b8 80 +.b8 45 +.b8 5 +.b32 125 +.b64 $L__tmp17 +.b64 $L__tmp32 +.b8 2 +.b8 84 +.b8 59 +.b8 4 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 84 +.b8 59 +.b8 5 +.b32 125 +.b64 $L__tmp18 +.b64 $L__tmp31 +.b8 2 +.b8 243 +.b8 36 +.b8 0 +.b8 5 +.b32 125 +.b64 $L__tmp32 +.b64 $L__tmp33 +.b8 3 +.b8 84 +.b8 45 +.b8 0 +.b8 0 + } + .section .debug_pubnames + { +.b32 $L__pubNames_end0-$L__pubNames_start0 +$L__pubNames_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 537 +.b32 125 +.b8 116 +.b8 114 +.b8 105 +.b8 116 +.b8 111 +.b8 110 +.b8 95 +.b8 95 +.b8 48 +.b8 100 +.b8 49 +.b8 100 +.b8 50 +.b8 100 +.b8 51 +.b8 100 +.b8 52 +.b8 100 +.b8 53 +.b8 100 +.b8 54 +.b8 100 +.b8 55 +.b8 100 +.b8 56 +.b8 100 +.b8 57 +.b8 100 +.b8 49 +.b8 48 +.b8 100 +.b8 49 +.b8 49 +.b8 100 +.b8 49 +.b8 50 +.b8 100 +.b8 49 +.b8 51 +.b8 100 +.b8 49 +.b8 52 +.b8 100 +.b8 49 +.b8 53 +.b8 100 +.b8 49 +.b8 54 +.b8 100 +.b8 49 +.b8 55 +.b8 100 +.b8 49 +.b8 56 +.b8 100 +.b8 49 +.b8 57 +.b8 100 +.b8 50 +.b8 48 +.b8 100 +.b8 50 +.b8 49 +.b8 100 +.b8 50 +.b8 50 +.b8 100 +.b8 50 +.b8 51 +.b8 100 +.b8 50 +.b8 52 +.b8 100 +.b8 50 +.b8 53 +.b8 100 +.b8 50 +.b8 54 +.b8 100 +.b8 50 +.b8 55 +.b8 100 +.b8 50 +.b8 56 +.b8 100 +.b8 50 +.b8 57 +.b8 100 +.b8 101 +.b8 51 +.b8 48 +.b8 100 +.b8 101 +.b8 0 +.b32 0 +$L__pubNames_end0: + } + .section .debug_pubtypes + { +.b32 $L__pubTypes_end0-$L__pubTypes_start0 +$L__pubTypes_start0: +.b8 2 +.b8 0 +.b32 .debug_info +.b32 537 +.b32 0 +$L__pubTypes_end0: + } + .section .debug_loc { } diff --git a/wandb/run-20240926_055222-14kj2390/files/output.log b/wandb/run-20240926_055222-14kj2390/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..7cac6cc6d1a1f6cf0f9bd9a0929eb1258c80a2e0 --- /dev/null +++ b/wandb/run-20240926_055222-14kj2390/files/output.log @@ -0,0 +1,701 @@ +Training learned + default: 5%|▊ | 500/10000 [20:17<5:34:45, 2.11s/it, loss=5.1549, lr=5.98e-04, mfu=9.53%, time_per_iter_ms=2114.39ms] + +Step 100: +Train loss: 6.8038, Val loss: 6.7955 +wikitext-103-v1 - Train loss: 7.9064, Val loss: 7.9072 +ptb - Train loss: 7.7765, Val loss: 7.7932 +lambada - Train loss: 6.6290, Val loss: 6.6271 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 200: +Train loss: 5.9632, Val loss: 5.9645 +wikitext-103-v1 - Train loss: 7.2751, Val loss: 7.2753 +ptb - Train loss: 7.6082, Val loss: 7.6259 +lambada - Train loss: 5.7352, Val loss: 5.7404 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 300: +Train loss: 5.5788, Val loss: 5.5806 +wikitext-103-v1 - Train loss: 6.9806, Val loss: 6.9695 +ptb - Train loss: 7.2654, Val loss: 7.2983 +lambada - Train loss: 5.4466, Val loss: 5.4591 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 400: +Train loss: 5.3057, Val loss: 5.2957 +wikitext-103-v1 - Train loss: 6.7475, Val loss: 6.7423 +ptb - Train loss: 7.0971, Val loss: 7.1375 +lambada - Train loss: 5.2606, Val loss: 5.2760 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 500: +Train loss: 5.1085, Val loss: 5.1086 +wikitext-103-v1 - Train loss: 6.5747, Val loss: 6.5779 +ptb - Train loss: 6.9253, Val loss: 6.9706 +lambada - Train loss: 5.1147, Val loss: 5.1296 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 600: +Train loss: 4.9573, Val loss: 4.9547 +wikitext-103-v1 - Train loss: 6.4305, Val loss: 6.4301 +ptb - Train loss: 6.8023, Val loss: 6.8555 +lambada - Train loss: 5.0186, Val loss: 5.0324 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 700: +Train loss: 4.8335, Val loss: 4.8295 +wikitext-103-v1 - Train loss: 6.2667, Val loss: 6.2699 +ptb - Train loss: 6.6823, Val loss: 6.7431 +lambada - Train loss: 4.9478, Val loss: 4.9548 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 800: +Train loss: 4.7205, Val loss: 4.7212 +wikitext-103-v1 - Train loss: 6.1274, Val loss: 6.1212 +ptb - Train loss: 6.4782, Val loss: 6.5370 +lambada - Train loss: 4.9008, Val loss: 4.9094 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 900: +Train loss: 4.6260, Val loss: 4.6246 +wikitext-103-v1 - Train loss: 5.9350, Val loss: 5.9339 +ptb - Train loss: 6.1903, Val loss: 6.2605 +lambada - Train loss: 4.8564, Val loss: 4.8686 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 1000: +Train loss: 4.5295, Val loss: 4.5315 +wikitext-103-v1 - Train loss: 5.7712, Val loss: 5.7584 +ptb - Train loss: 5.8422, Val loss: 5.9189 +lambada - Train loss: 4.8058, Val loss: 4.8109 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 1100: +Train loss: 4.4763, Val loss: 4.4670 +wikitext-103-v1 - Train loss: 5.6941, Val loss: 5.6790 +ptb - Train loss: 5.6589, Val loss: 5.7596 +lambada - Train loss: 4.7676, Val loss: 4.7707 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 1200: +Train loss: 4.4093, Val loss: 4.4050 +wikitext-103-v1 - Train loss: 5.5809, Val loss: 5.5747 +ptb - Train loss: 5.5707, Val loss: 5.6739 +lambada - Train loss: 4.7642, Val loss: 4.7673 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 1300: +Train loss: 4.3669, Val loss: 4.3716 +wikitext-103-v1 - Train loss: 5.5322, Val loss: 5.5115 +ptb - Train loss: 5.4906, Val loss: 5.5867 +lambada - Train loss: 4.7353, Val loss: 4.7372 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 1400: +Train loss: 4.3331, Val loss: 4.3318 +wikitext-103-v1 - Train loss: 5.4625, Val loss: 5.4471 +ptb - Train loss: 5.4163, Val loss: 5.5288 +lambada - Train loss: 4.7051, Val loss: 4.7102 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 1500: +Train loss: 4.3095, Val loss: 4.3069 +wikitext-103-v1 - Train loss: 5.4572, Val loss: 5.4487 +ptb - Train loss: 5.4178, Val loss: 5.5298 +lambada - Train loss: 4.6914, Val loss: 4.6964 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 1600: +Train loss: 4.2780, Val loss: 4.2778 +wikitext-103-v1 - Train loss: 5.4283, Val loss: 5.4132 +ptb - Train loss: 5.3550, Val loss: 5.4572 +lambada - Train loss: 4.6789, Val loss: 4.6788 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 1700: +Train loss: 4.2646, Val loss: 4.2523 +wikitext-103-v1 - Train loss: 5.4240, Val loss: 5.4090 +ptb - Train loss: 5.3642, Val loss: 5.4759 +lambada - Train loss: 4.6545, Val loss: 4.6600 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 1800: +Train loss: 4.2347, Val loss: 4.2302 +wikitext-103-v1 - Train loss: 5.3465, Val loss: 5.3261 +ptb - Train loss: 5.2994, Val loss: 5.4021 +lambada - Train loss: 4.6463, Val loss: 4.6483 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 1900: +Train loss: 4.2068, Val loss: 4.2177 +wikitext-103-v1 - Train loss: 5.3374, Val loss: 5.3226 +ptb - Train loss: 5.2643, Val loss: 5.3607 +lambada - Train loss: 4.6296, Val loss: 4.6349 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 2000: +Train loss: 4.1983, Val loss: 4.1948 +wikitext-103-v1 - Train loss: 5.3491, Val loss: 5.3315 +ptb - Train loss: 5.3221, Val loss: 5.4272 +lambada - Train loss: 4.6377, Val loss: 4.6344 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 2100: +Train loss: 4.1860, Val loss: 4.1882 +wikitext-103-v1 - Train loss: 5.3103, Val loss: 5.2909 +ptb - Train loss: 5.2290, Val loss: 5.3266 +lambada - Train loss: 4.6136, Val loss: 4.6200 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 2200: +Train loss: 4.1666, Val loss: 4.1677 +wikitext-103-v1 - Train loss: 5.2956, Val loss: 5.2775 +ptb - Train loss: 5.2349, Val loss: 5.3438 +lambada - Train loss: 4.5960, Val loss: 4.6060 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 2300: +Train loss: 4.1573, Val loss: 4.1576 +wikitext-103-v1 - Train loss: 5.2848, Val loss: 5.2643 +ptb - Train loss: 5.2105, Val loss: 5.3189 +lambada - Train loss: 4.6056, Val loss: 4.6054 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 2400: +Train loss: 4.1411, Val loss: 4.1447 +wikitext-103-v1 - Train loss: 5.2642, Val loss: 5.2456 +ptb - Train loss: 5.2038, Val loss: 5.3090 +lambada - Train loss: 4.5966, Val loss: 4.6040 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 2500: +Train loss: 4.1320, Val loss: 4.1369 +wikitext-103-v1 - Train loss: 5.2598, Val loss: 5.2343 +ptb - Train loss: 5.2028, Val loss: 5.3161 +lambada - Train loss: 4.5989, Val loss: 4.6001 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 2600: +Train loss: 4.1241, Val loss: 4.1256 +wikitext-103-v1 - Train loss: 5.2357, Val loss: 5.2125 +ptb - Train loss: 5.1644, Val loss: 5.2680 +lambada - Train loss: 4.5846, Val loss: 4.5838 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 2700: +Train loss: 4.1183, Val loss: 4.1143 +wikitext-103-v1 - Train loss: 5.2448, Val loss: 5.2277 +ptb - Train loss: 5.1603, Val loss: 5.2628 +lambada - Train loss: 4.5798, Val loss: 4.5827 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 2800: +Train loss: 4.1055, Val loss: 4.1096 +wikitext-103-v1 - Train loss: 5.2229, Val loss: 5.2017 +ptb - Train loss: 5.1477, Val loss: 5.2506 +lambada - Train loss: 4.5665, Val loss: 4.5677 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 2900: +Train loss: 4.0935, Val loss: 4.0882 +wikitext-103-v1 - Train loss: 5.2075, Val loss: 5.1859 +ptb - Train loss: 5.1501, Val loss: 5.2564 +lambada - Train loss: 4.5792, Val loss: 4.5752 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 3000: +Train loss: 4.0905, Val loss: 4.0838 +wikitext-103-v1 - Train loss: 5.2077, Val loss: 5.1879 +ptb - Train loss: 5.1382, Val loss: 5.2432 +lambada - Train loss: 4.5655, Val loss: 4.5647 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 3100: +Train loss: 4.0829, Val loss: 4.0785 +wikitext-103-v1 - Train loss: 5.1924, Val loss: 5.1646 +ptb - Train loss: 5.1343, Val loss: 5.2375 +lambada - Train loss: 4.5594, Val loss: 4.5584 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 3200: +Train loss: 4.0735, Val loss: 4.0756 +wikitext-103-v1 - Train loss: 5.1842, Val loss: 5.1655 +ptb - Train loss: 5.1134, Val loss: 5.2139 +lambada - Train loss: 4.5548, Val loss: 4.5578 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 3300: +Train loss: 4.0685, Val loss: 4.0711 +wikitext-103-v1 - Train loss: 5.1913, Val loss: 5.1735 +ptb - Train loss: 5.1080, Val loss: 5.2114 +lambada - Train loss: 4.5626, Val loss: 4.5604 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 3400: +Train loss: 4.0556, Val loss: 4.0570 +wikitext-103-v1 - Train loss: 5.1777, Val loss: 5.1480 +ptb - Train loss: 5.0920, Val loss: 5.1912 +lambada - Train loss: 4.5487, Val loss: 4.5460 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 3500: +Train loss: 4.0569, Val loss: 4.0554 +wikitext-103-v1 - Train loss: 5.1588, Val loss: 5.1413 +ptb - Train loss: 5.0720, Val loss: 5.1675 +lambada - Train loss: 4.5377, Val loss: 4.5366 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 3600: +Train loss: 4.0502, Val loss: 4.0463 +wikitext-103-v1 - Train loss: 5.1695, Val loss: 5.1527 +ptb - Train loss: 5.0727, Val loss: 5.1805 +lambada - Train loss: 4.5387, Val loss: 4.5359 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 3700: +Train loss: 4.0493, Val loss: 4.0440 +wikitext-103-v1 - Train loss: 5.1483, Val loss: 5.1283 +ptb - Train loss: 5.0744, Val loss: 5.1717 +lambada - Train loss: 4.5342, Val loss: 4.5324 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 3800: +Train loss: 4.0387, Val loss: 4.0354 +wikitext-103-v1 - Train loss: 5.1486, Val loss: 5.1196 +ptb - Train loss: 5.0400, Val loss: 5.1513 +lambada - Train loss: 4.5402, Val loss: 4.5332 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 3900: +Train loss: 4.0341, Val loss: 4.0290 +wikitext-103-v1 - Train loss: 5.1404, Val loss: 5.1191 +ptb - Train loss: 5.0406, Val loss: 5.1401 +lambada - Train loss: 4.5226, Val loss: 4.5213 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 4000: +Train loss: 4.0297, Val loss: 4.0303 +wikitext-103-v1 - Train loss: 5.1229, Val loss: 5.1015 +ptb - Train loss: 5.0423, Val loss: 5.1482 +lambada - Train loss: 4.5457, Val loss: 4.5434 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 4100: +Train loss: 4.0285, Val loss: 4.0279 +wikitext-103-v1 - Train loss: 5.1410, Val loss: 5.1173 +ptb - Train loss: 5.0534, Val loss: 5.1546 +lambada - Train loss: 4.5249, Val loss: 4.5211 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 4200: +Train loss: 4.0124, Val loss: 4.0195 +wikitext-103-v1 - Train loss: 5.1362, Val loss: 5.1201 +ptb - Train loss: 5.0534, Val loss: 5.1583 +lambada - Train loss: 4.5213, Val loss: 4.5162 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 4300: +Train loss: 4.0193, Val loss: 4.0081 +wikitext-103-v1 - Train loss: 5.1212, Val loss: 5.1108 +ptb - Train loss: 5.0391, Val loss: 5.1401 +lambada - Train loss: 4.5240, Val loss: 4.5265 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 4400: +Train loss: 4.0106, Val loss: 4.0069 +wikitext-103-v1 - Train loss: 5.1272, Val loss: 5.1073 +ptb - Train loss: 5.0385, Val loss: 5.1346 +lambada - Train loss: 4.5174, Val loss: 4.5123 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 4500: +Train loss: 4.0093, Val loss: 4.0066 +wikitext-103-v1 - Train loss: 5.1077, Val loss: 5.0875 +ptb - Train loss: 5.0100, Val loss: 5.1084 +lambada - Train loss: 4.5189, Val loss: 4.5178 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 4600: +Train loss: 4.0014, Val loss: 4.0017 +wikitext-103-v1 - Train loss: 5.1019, Val loss: 5.0889 +ptb - Train loss: 5.0120, Val loss: 5.1113 +lambada - Train loss: 4.5059, Val loss: 4.5104 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 4700: +Train loss: 3.9949, Val loss: 3.9994 +wikitext-103-v1 - Train loss: 5.0935, Val loss: 5.0713 +ptb - Train loss: 4.9881, Val loss: 5.0865 +lambada - Train loss: 4.5065, Val loss: 4.5037 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 4800: +Train loss: 3.9991, Val loss: 3.9934 +wikitext-103-v1 - Train loss: 5.0869, Val loss: 5.0718 +ptb - Train loss: 4.9856, Val loss: 5.0937 +lambada - Train loss: 4.5097, Val loss: 4.5079 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 4900: +Train loss: 3.9904, Val loss: 3.9903 +wikitext-103-v1 - Train loss: 5.0945, Val loss: 5.0764 +ptb - Train loss: 5.0047, Val loss: 5.1038 +lambada - Train loss: 4.5071, Val loss: 4.5000 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 5000: +Train loss: 3.9841, Val loss: 3.9873 +wikitext-103-v1 - Train loss: 5.0710, Val loss: 5.0561 +ptb - Train loss: 4.9981, Val loss: 5.0994 +lambada - Train loss: 4.5154, Val loss: 4.5174 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 5100: +Train loss: 3.9910, Val loss: 3.9813 +wikitext-103-v1 - Train loss: 5.0788, Val loss: 5.0557 +ptb - Train loss: 4.9792, Val loss: 5.0777 +lambada - Train loss: 4.4983, Val loss: 4.4992 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 5200: +Train loss: 3.9810, Val loss: 3.9722 +wikitext-103-v1 - Train loss: 5.0702, Val loss: 5.0631 +ptb - Train loss: 4.9835, Val loss: 5.0830 +lambada - Train loss: 4.4988, Val loss: 4.4999 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 5300: +Train loss: 3.9759, Val loss: 3.9737 +wikitext-103-v1 - Train loss: 5.0629, Val loss: 5.0486 +ptb - Train loss: 4.9766, Val loss: 5.0699 +lambada - Train loss: 4.4918, Val loss: 4.4898 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 5400: +Train loss: 3.9718, Val loss: 3.9762 +wikitext-103-v1 - Train loss: 5.0648, Val loss: 5.0488 +ptb - Train loss: 4.9808, Val loss: 5.0805 +lambada - Train loss: 4.4962, Val loss: 4.4957 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 5500: +Train loss: 3.9740, Val loss: 3.9764 +wikitext-103-v1 - Train loss: 5.0641, Val loss: 5.0500 +ptb - Train loss: 4.9647, Val loss: 5.0674 +lambada - Train loss: 4.4849, Val loss: 4.4855 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 5600: +Train loss: 3.9577, Val loss: 3.9633 +wikitext-103-v1 - Train loss: 5.0513, Val loss: 5.0275 +ptb - Train loss: 4.9602, Val loss: 5.0572 +lambada - Train loss: 4.4822, Val loss: 4.4827 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 5700: +Train loss: 3.9657, Val loss: 3.9676 +wikitext-103-v1 - Train loss: 5.0498, Val loss: 5.0252 +ptb - Train loss: 4.9549, Val loss: 5.0510 +lambada - Train loss: 4.4874, Val loss: 4.4878 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 5800: +Train loss: 3.9636, Val loss: 3.9648 +wikitext-103-v1 - Train loss: 5.0513, Val loss: 5.0333 +ptb - Train loss: 4.9587, Val loss: 5.0591 +lambada - Train loss: 4.4922, Val loss: 4.4849 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 5900: +Train loss: 3.9576, Val loss: 3.9588 +wikitext-103-v1 - Train loss: 5.0306, Val loss: 5.0349 +ptb - Train loss: 4.9546, Val loss: 5.0537 +lambada - Train loss: 4.4707, Val loss: 4.4656 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 6000: +Train loss: 3.9571, Val loss: 3.9567 +wikitext-103-v1 - Train loss: 5.0335, Val loss: 5.0150 +ptb - Train loss: 4.9552, Val loss: 5.0535 +lambada - Train loss: 4.4833, Val loss: 4.4809 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 6100: +Train loss: 3.9572, Val loss: 3.9515 +wikitext-103-v1 - Train loss: 5.0369, Val loss: 5.0118 +ptb - Train loss: 4.9523, Val loss: 5.0533 +lambada - Train loss: 4.4831, Val loss: 4.4803 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 6200: +Train loss: 3.9477, Val loss: 3.9533 +wikitext-103-v1 - Train loss: 5.0323, Val loss: 5.0173 +ptb - Train loss: 4.9542, Val loss: 5.0520 +lambada - Train loss: 4.4807, Val loss: 4.4803 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 6300: +Train loss: 3.9529, Val loss: 3.9482 +wikitext-103-v1 - Train loss: 5.0296, Val loss: 5.0053 +ptb - Train loss: 4.9409, Val loss: 5.0413 +lambada - Train loss: 4.4767, Val loss: 4.4752 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 6400: +Train loss: 3.9437, Val loss: 3.9420 +wikitext-103-v1 - Train loss: 5.0380, Val loss: 5.0141 +ptb - Train loss: 4.9432, Val loss: 5.0490 +lambada - Train loss: 4.4715, Val loss: 4.4718 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 6500: +Train loss: 3.9498, Val loss: 3.9390 +wikitext-103-v1 - Train loss: 5.0437, Val loss: 5.0206 +ptb - Train loss: 4.9500, Val loss: 5.0495 +lambada - Train loss: 4.4766, Val loss: 4.4775 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 6600: +Train loss: 3.9457, Val loss: 3.9436 +wikitext-103-v1 - Train loss: 5.0214, Val loss: 5.0075 +ptb - Train loss: 4.9380, Val loss: 5.0405 +lambada - Train loss: 4.4751, Val loss: 4.4734 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 6700: +Train loss: 3.9463, Val loss: 3.9377 +wikitext-103-v1 - Train loss: 5.0262, Val loss: 5.0050 +ptb - Train loss: 4.9359, Val loss: 5.0361 +lambada - Train loss: 4.4685, Val loss: 4.4638 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 6800: +Train loss: 3.9393, Val loss: 3.9393 +wikitext-103-v1 - Train loss: 5.0274, Val loss: 5.0019 +ptb - Train loss: 4.9270, Val loss: 5.0256 +lambada - Train loss: 4.4650, Val loss: 4.4645 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 6900: +Train loss: 3.9324, Val loss: 3.9347 +wikitext-103-v1 - Train loss: 5.0093, Val loss: 4.9990 +ptb - Train loss: 4.9319, Val loss: 5.0297 +lambada - Train loss: 4.4679, Val loss: 4.4681 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 7000: +Train loss: 3.9386, Val loss: 3.9279 +wikitext-103-v1 - Train loss: 5.0184, Val loss: 5.0018 +ptb - Train loss: 4.9223, Val loss: 5.0256 +lambada - Train loss: 4.4654, Val loss: 4.4593 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 7100: +Train loss: 3.9333, Val loss: 3.9322 +wikitext-103-v1 - Train loss: 5.0186, Val loss: 4.9935 +ptb - Train loss: 4.9186, Val loss: 5.0169 +lambada - Train loss: 4.4588, Val loss: 4.4566 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 7200: +Train loss: 3.9342, Val loss: 3.9300 +wikitext-103-v1 - Train loss: 5.0002, Val loss: 4.9939 +ptb - Train loss: 4.9140, Val loss: 5.0205 +lambada - Train loss: 4.4645, Val loss: 4.4640 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 7300: +Train loss: 3.9329, Val loss: 3.9321 +wikitext-103-v1 - Train loss: 5.0134, Val loss: 4.9943 +ptb - Train loss: 4.9130, Val loss: 5.0149 +lambada - Train loss: 4.4556, Val loss: 4.4552 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 7400: +Train loss: 3.9271, Val loss: 3.9309 +wikitext-103-v1 - Train loss: 5.0068, Val loss: 4.9918 +ptb - Train loss: 4.9191, Val loss: 5.0184 +lambada - Train loss: 4.4619, Val loss: 4.4547 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 7500: +Train loss: 3.9322, Val loss: 3.9292 +wikitext-103-v1 - Train loss: 5.0121, Val loss: 4.9903 +ptb - Train loss: 4.9147, Val loss: 5.0180 +lambada - Train loss: 4.4603, Val loss: 4.4608 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 7600: +Train loss: 3.9216, Val loss: 3.9253 +wikitext-103-v1 - Train loss: 5.0079, Val loss: 4.9907 +ptb - Train loss: 4.9166, Val loss: 5.0171 +lambada - Train loss: 4.4605, Val loss: 4.4594 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 7700: +Train loss: 3.9283, Val loss: 3.9211 +wikitext-103-v1 - Train loss: 5.0071, Val loss: 4.9817 +ptb - Train loss: 4.9177, Val loss: 5.0144 +lambada - Train loss: 4.4571, Val loss: 4.4570 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 7800: +Train loss: 3.9205, Val loss: 3.9184 +wikitext-103-v1 - Train loss: 5.0049, Val loss: 4.9808 +ptb - Train loss: 4.9114, Val loss: 5.0158 +lambada - Train loss: 4.4650, Val loss: 4.4661 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 7900: +Train loss: 3.9243, Val loss: 3.9222 +wikitext-103-v1 - Train loss: 5.0040, Val loss: 4.9802 +ptb - Train loss: 4.9160, Val loss: 5.0118 +lambada - Train loss: 4.4549, Val loss: 4.4529 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 8000: +Train loss: 3.9250, Val loss: 3.9242 +wikitext-103-v1 - Train loss: 4.9941, Val loss: 4.9855 +ptb - Train loss: 4.9191, Val loss: 5.0123 +lambada - Train loss: 4.4539, Val loss: 4.4552 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 8100: +Train loss: 3.9191, Val loss: 3.9206 +wikitext-103-v1 - Train loss: 5.0038, Val loss: 4.9899 +ptb - Train loss: 4.9185, Val loss: 5.0285 +lambada - Train loss: 4.4547, Val loss: 4.4544 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 8200: +Train loss: 3.9230, Val loss: 3.9179 +wikitext-103-v1 - Train loss: 4.9961, Val loss: 4.9745 +ptb - Train loss: 4.9024, Val loss: 5.0016 +lambada - Train loss: 4.4549, Val loss: 4.4528 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 8300: +Train loss: 3.9202, Val loss: 3.9221 +wikitext-103-v1 - Train loss: 4.9916, Val loss: 4.9718 +ptb - Train loss: 4.8936, Val loss: 4.9952 +lambada - Train loss: 4.4566, Val loss: 4.4566 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 8400: +Train loss: 3.9179, Val loss: 3.9154 +wikitext-103-v1 - Train loss: 4.9929, Val loss: 4.9820 +ptb - Train loss: 4.9103, Val loss: 5.0113 +lambada - Train loss: 4.4497, Val loss: 4.4471 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 8500: +Train loss: 3.9177, Val loss: 3.9206 +wikitext-103-v1 - Train loss: 4.9873, Val loss: 4.9718 +ptb - Train loss: 4.8961, Val loss: 5.0103 +lambada - Train loss: 4.4538, Val loss: 4.4548 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 8600: +Train loss: 3.9126, Val loss: 3.9091 +wikitext-103-v1 - Train loss: 4.9969, Val loss: 4.9774 +ptb - Train loss: 4.9036, Val loss: 5.0020 +lambada - Train loss: 4.4524, Val loss: 4.4483 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 8700: +Train loss: 3.9183, Val loss: 3.9143 +wikitext-103-v1 - Train loss: 4.9846, Val loss: 4.9672 +ptb - Train loss: 4.9017, Val loss: 5.0107 +lambada - Train loss: 4.4507, Val loss: 4.4508 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 8800: +Train loss: 3.9141, Val loss: 3.9113 +wikitext-103-v1 - Train loss: 4.9931, Val loss: 4.9725 +ptb - Train loss: 4.8923, Val loss: 4.9876 +lambada - Train loss: 4.4501, Val loss: 4.4474 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 8900: +Train loss: 3.9139, Val loss: 3.9112 +wikitext-103-v1 - Train loss: 4.9853, Val loss: 4.9735 +ptb - Train loss: 4.8939, Val loss: 4.9961 +lambada - Train loss: 4.4520, Val loss: 4.4532 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 9000: +Train loss: 3.9111, Val loss: 3.9007 +wikitext-103-v1 - Train loss: 4.9977, Val loss: 4.9627 +ptb - Train loss: 4.8913, Val loss: 4.9936 +lambada - Train loss: 4.4460, Val loss: 4.4430 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 9100: +Train loss: 3.9107, Val loss: 3.9115 +wikitext-103-v1 - Train loss: 4.9896, Val loss: 4.9761 +ptb - Train loss: 4.8987, Val loss: 5.0039 +lambada - Train loss: 4.4509, Val loss: 4.4508 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 9200: +Train loss: 3.9089, Val loss: 3.9111 +wikitext-103-v1 - Train loss: 4.9739, Val loss: 4.9647 +ptb - Train loss: 4.8953, Val loss: 4.9993 +lambada - Train loss: 4.4479, Val loss: 4.4462 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 9300: +Train loss: 3.9058, Val loss: 3.9133 +wikitext-103-v1 - Train loss: 4.9842, Val loss: 4.9689 +ptb - Train loss: 4.8892, Val loss: 4.9937 +lambada - Train loss: 4.4449, Val loss: 4.4449 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 9400: +Train loss: 3.9127, Val loss: 3.9036 +wikitext-103-v1 - Train loss: 4.9826, Val loss: 4.9736 +ptb - Train loss: 4.8906, Val loss: 4.9918 +lambada - Train loss: 4.4485, Val loss: 4.4455 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 9500: +Train loss: 3.9034, Val loss: 3.9067 +wikitext-103-v1 - Train loss: 4.9902, Val loss: 4.9696 +ptb - Train loss: 4.9035, Val loss: 5.0067 +lambada - Train loss: 4.4472, Val loss: 4.4489 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 9600: +Train loss: 3.9037, Val loss: 3.8991 +wikitext-103-v1 - Train loss: 4.9837, Val loss: 4.9578 +ptb - Train loss: 4.8968, Val loss: 5.0015 +lambada - Train loss: 4.4497, Val loss: 4.4449 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 9700: +Train loss: 3.9029, Val loss: 3.9092 +wikitext-103-v1 - Train loss: 4.9835, Val loss: 4.9616 +ptb - Train loss: 4.8896, Val loss: 4.9964 +lambada - Train loss: 4.4453, Val loss: 4.4417 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 9800: +Train loss: 3.9027, Val loss: 3.9042 +wikitext-103-v1 - Train loss: 4.9806, Val loss: 4.9510 +ptb - Train loss: 4.8925, Val loss: 4.9939 +lambada - Train loss: 4.4443, Val loss: 4.4423 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 9900: +Train loss: 3.9021, Val loss: 3.9027 +wikitext-103-v1 - Train loss: 4.9817, Val loss: 4.9592 +ptb - Train loss: 4.8927, Val loss: 4.9979 +lambada - Train loss: 4.4439, Val loss: 4.4361 +Saving checkpoint to out/ckpt_learned_default.pt + +Step 10000: +Train loss: 3.9086, Val loss: 3.9058 +wikitext-103-v1 - Train loss: 4.9778, Val loss: 4.9618 +ptb - Train loss: 4.8921, Val loss: 4.9924 +lambada - Train loss: 4.4409, Val loss: 4.4380 +Saving checkpoint to out/ckpt_learned_default.pt diff --git a/wandb/run-20240926_192831-378lr5yg/files/config.yaml b/wandb/run-20240926_192831-378lr5yg/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e23fc017ee27b71f35139c71a97d8b5ddb712ab1 --- /dev/null +++ b/wandb/run-20240926_192831-378lr5yg/files/config.yaml @@ -0,0 +1,113 @@ +_wandb: + value: + cli_version: 0.18.1 + m: [] + python_version: 3.10.12 + t: + "1": + - 1 + - 55 + "2": + - 1 + - 55 + "3": + - 2 + - 13 + - 16 + - 23 + - 55 + - 61 + "4": 3.10.12 + "5": 0.18.1 + "8": + - 5 + "12": 0.18.1 + "13": linux-x86_64 +always_save_checkpoint: + value: true +attention_types: + value: + - default +backend: + value: nccl +batch_size: + value: 120 +beta1: + value: 0.9 +beta2: + value: 0.95 +bias: + value: false +block_size: + value: 512 +checkpoint_path: + value: "" +collect_activations: + value: false +collect_attention_patterns: + value: false +compile: + value: true +dataset: + value: fineweb +decay_lr: + value: true +device: + value: cuda +dropout: + value: 0 +dtype: + value: bfloat16 +embedding_types: + value: + - polynomial_legendre + - polynomial_chebyshev + - random_fourier + - wavelet +eval_datasets: + value: + - wikitext-103-v1 + - ptb + - lambada +eval_interval: + value: 100 +eval_iters: + value: 100 +eval_only: + value: false +grad_clip: + value: 1 +gradient_accumulation_steps: + value: 40 +init_from: + value: scratch +learning_rate: + value: 0.0006 +log_interval: + value: 1 +lr_decay_iters: + value: 10000 +max_iters: + value: 10000 +min_lr: + value: 6e-05 +n_embd: + value: 256 +n_head: + value: 4 +n_layer: + value: 4 +out_dir: + value: out +seed: + value: 1337 +wandb_log: + value: true +wandb_project: + value: gpt2_positional_encodings_100B +wandb_run_name: + value: experiment +warmup_iters: + value: 100 +weight_decay: + value: 0.1