0-hero commited on
Commit
71c6277
·
verified ·
1 Parent(s): 79f9b39

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .local/share/Trash/info/train_002.bin.trashinfo +3 -0
  2. .local/share/Trash/info/train_004.bin.trashinfo +3 -0
  3. .local/share/jupyter/nbextensions/python-markdown/python-markdown-post.png +0 -0
  4. .local/share/jupyter/nbextensions/ruler/icon.png +0 -0
  5. .local/share/jupyter/nbextensions/ruler/main.js +124 -0
  6. .local/share/jupyter/nbextensions/runtools/readme.md +119 -0
  7. .local/share/jupyter/nbextensions/runtools/runtools_show_hide.png +0 -0
  8. .local/share/jupyter/nbextensions/scratchpad/scratchpad.yaml +6 -0
  9. .local/share/jupyter/nbextensions/select_keymap/README.md +14 -0
  10. .local/share/jupyter/nbextensions/skill/main.js +14 -0
  11. .local/share/jupyter/nbextensions/skill/skill.yaml +6 -0
  12. .local/share/jupyter/nbextensions/skip-traceback/traceback.png +0 -0
  13. .local/share/jupyter/nbextensions/splitcell/splitcell.js +101 -0
  14. .local/share/jupyter/nbextensions/toggle_all_line_numbers/icon.png +0 -0
  15. .local/share/jupyter/nbextensions/varInspector/README.md +36 -0
  16. .local/share/jupyter/nbextensions/varInspector/__pycache__/var_list.cpython-310.pyc +0 -0
  17. .local/share/jupyter/nbextensions/varInspector/demo.gif +0 -0
  18. .local/share/jupyter/nbextensions/varInspector/icon.png +0 -0
  19. .local/share/jupyter/nbextensions/varInspector/tablesorter_LICENSE.txt +21 -0
  20. .local/share/jupyter/nbextensions/varInspector/var_list.py +63 -0
  21. .local/share/jupyter/nbextensions/varInspector/var_list.r +17 -0
  22. .local/share/jupyter/nbextensions/zenmode/README.md +4 -0
  23. .local/share/jupyter/nbextensions/zenmode/images/back2.jpg +0 -0
  24. .local/share/jupyter/nbextensions/zenmode/images/back21.jpg +0 -0
  25. .local/share/jupyter/nbextensions/zenmode/images/back3.jpg +0 -0
  26. .local/share/jupyter/nbextensions/zenmode/images/ipynblogo0.png +0 -0
  27. .local/share/jupyter/nbextensions/zenmode/images/ipynblogo1.png +0 -0
  28. .local/share/jupyter/nbextensions/zenmode/main.css +34 -0
  29. .local/share/jupyter/runtime/jpserver-434.json +13 -0
  30. .triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.cubin +0 -0
  31. .triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttgir +60 -0
  32. .triton/dump/199215289adb100508718a5a762ba4d7/triton_.cubin +0 -0
  33. .triton/dump/199215289adb100508718a5a762ba4d7/triton_.ptx +453 -0
  34. .triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttgir +38 -0
  35. .triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttir +34 -0
  36. .triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ptx +312 -0
  37. .triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttgir +19 -0
  38. .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin +0 -0
  39. .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttir +56 -0
  40. .triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ttgir +88 -0
  41. .triton/dump/415aac87553b7d064f52694fa7254686/triton_.ptx +778 -0
  42. .triton/dump/415aac87553b7d064f52694fa7254686/triton_.ttir +27 -0
  43. .triton/dump/51e329eae41e4ee17aa201fff8371d94/triton_.llir +0 -0
  44. .triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttir +89 -0
  45. .triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttgir +18 -0
  46. .triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttir +17 -0
  47. .triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin +0 -0
  48. .triton/dump/93e5abc5363b9438178c618128714f73/triton_.ptx +861 -0
  49. .triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.cubin +0 -0
  50. .triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ttir +18 -0
.local/share/Trash/info/train_002.bin.trashinfo ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [Trash Info]
2
+ Path=/root/data/fineweb/train_002.bin
3
+ DeletionDate=2024-09-26T05:50:34
.local/share/Trash/info/train_004.bin.trashinfo ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [Trash Info]
2
+ Path=/root/data/fineweb/train_004.bin
3
+ DeletionDate=2024-09-26T05:50:34
.local/share/jupyter/nbextensions/python-markdown/python-markdown-post.png ADDED
.local/share/jupyter/nbextensions/ruler/icon.png ADDED
.local/share/jupyter/nbextensions/ruler/main.js ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Add rulers to codecells
2
+ define([
3
+ 'base/js/namespace',
4
+ 'base/js/events',
5
+ 'services/config',
6
+ 'notebook/js/codecell',
7
+ 'codemirror/lib/codemirror',
8
+ 'codemirror/addon/display/rulers'
9
+ ], function (Jupyter, events, configmod, codecell, codemirror) {
10
+ "use strict";
11
+
12
+ var log_prefix = '[ruler]';
13
+
14
+ // define default config parameter values
15
+ var params = {
16
+ ruler_column: [78],
17
+ ruler_color: ["#ff0000"],
18
+ ruler_linestyle: ["dashed"],
19
+ ruler_do_css_patch: false
20
+ };
21
+
22
+
23
+ var rulers = [];
24
+
25
+ var isNumber = function (n) {
26
+ return !isNaN(parseFloat(n)) && isFinite(n);
27
+ };
28
+
29
+ // updates default params with any specified in the provided config data
30
+ var update_params = function (config_data) {
31
+ for (var key in params) {
32
+ if (config_data.hasOwnProperty(key)) {
33
+ params[key] = config_data[key];
34
+ }
35
+ }
36
+ };
37
+
38
+ var on_config_loaded = function () {
39
+
40
+ if (Jupyter.notebook !== undefined) {
41
+ var i, config = Jupyter.notebook.config;
42
+ } else {
43
+ var i, config = Jupyter.editor.config;
44
+ }
45
+
46
+ if (config.data.hasOwnProperty('ruler_color') && config.data.ruler_color.length > 0) {
47
+ params.ruler_color = config.data.ruler_color;
48
+ }
49
+
50
+ if (config.data.hasOwnProperty('ruler_column')) {
51
+ var new_columns = [];
52
+ for (i in config.data.ruler_column) {
53
+ if (isNumber(config.data.ruler_column[i])) {
54
+ new_columns.push(config.data.ruler_column[i]);
55
+ }
56
+ }
57
+ if (new_columns.length > 0) {
58
+ params.ruler_column = new_columns;
59
+ }
60
+ }
61
+
62
+ if (config.data.hasOwnProperty('ruler_linestyle') && config.data.ruler_linestyle.length > 0) {
63
+ params.ruler_linestyle = config.data.ruler_linestyle;
64
+ }
65
+
66
+ for (i in params.ruler_column) {
67
+ rulers.push({
68
+ color: params.ruler_color[i % params.ruler_color.length],
69
+ column: params.ruler_column[i],
70
+ lineStyle: params.ruler_linestyle[i % params.ruler_linestyle.length]
71
+ });
72
+ }
73
+ console.debug(log_prefix, 'ruler specs:', rulers);
74
+
75
+ if (Jupyter.notebook !== undefined) {
76
+ var i, config = Jupyter.notebook.config;
77
+
78
+ // Change default for new cells
79
+ codecell.CodeCell.options_default.cm_config.rulers = rulers;
80
+ // Apply to any already-existing cells
81
+ var cells = Jupyter.notebook.get_cells().forEach(function (cell) {
82
+ if (cell instanceof codecell.CodeCell) {
83
+ cell.code_mirror.setOption('rulers', rulers);
84
+ }
85
+ });
86
+
87
+ }
88
+ else {
89
+ Jupyter.editor.codemirror.setOption('rulers', rulers);
90
+ }
91
+ };
92
+
93
+ var load_extension = function () {
94
+
95
+ // first, check which view we're in, in order to decide whether to load
96
+ var conf_sect;
97
+ if (Jupyter.notebook) {
98
+ // we're in notebook view
99
+ conf_sect = Jupyter.notebook.config;
100
+ }
101
+ else if (Jupyter.editor) {
102
+ // we're in file-editor view
103
+ conf_sect = Jupyter.editor.config;
104
+ }
105
+ else {
106
+ // we're some other view like dashboard, terminal, etc, so bail now
107
+ return;
108
+ }
109
+
110
+ conf_sect.loaded
111
+ .then(function () {
112
+ update_params(conf_sect.data);
113
+ })
114
+ .then(on_config_loaded)
115
+ .catch(function on_error(reason) {
116
+ console.warn(log_prefix, 'error:', reason);
117
+ });
118
+ };
119
+
120
+ var extension = {
121
+ load_ipython_extension: load_extension
122
+ };
123
+ return extension;
124
+ });
.local/share/jupyter/nbextensions/runtools/readme.md ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Runtools
2
+ ========
3
+ Runtools provide a number of additional functions for working with code cells in the IPython notebook:
4
+
5
+ Code Cell Execution
6
+ -------------------
7
+
8
+ * Execute a single cell
9
+ * Execute from top cell to currently selected cell
10
+ * Execute from currently selected cell to bottom cell
11
+ * Execute all cells
12
+ * Execute all cells, ignore exceptions (requires [ipython/pull/6521](https://github.com/ipython/ipython/pull/6521))
13
+ * Execute marked code cells (cells with green gutter area are marked)
14
+ * Stop execution (duplicate to standard toolbar button)
15
+
16
+ When executing marked cells, they are put in an execution list, and
17
+ executed in order. The execution list can be modified by unmarking
18
+ a cell not yet run. The execution list can be stopped by clicking on
19
+ `stop execution`. Execution of the currently running cell can be stopped
20
+ by pressing `stop execution` twice.
21
+
22
+ Code Cell Marking
23
+ -----------------
24
+
25
+ * Mark one or more code cells
26
+
27
+
28
+ Code Cell Display
29
+ -----------------
30
+
31
+ * Hide or show input (i.e. the source code) of marked code cells
32
+ * Hide or show output of marked code cells
33
+
34
+
35
+ Description
36
+ -----------
37
+
38
+ The *runtools* extension adds a button to turn on/off a floating toolbar:
39
+ ![](icon.png)
40
+
41
+ This adds Code execution buttons:
42
+ ![](runtools_execute.png)
43
+
44
+ Codecells can be marked by clicking on the gutter of a codecell or by clicking on the markers toolbar:
45
+ ![](runtools_marker.png)
46
+
47
+ Marked codecells can be locked to read-only mode and moved up and down:
48
+ ![](runtools_lock.png)
49
+
50
+ The input and output areas of marked codecells can be hidden:
51
+ ![](runtools_show_hide.png)
52
+
53
+ An IPython notebook with marked cells looks like this:
54
+ ![](runtools_nb.png)
55
+
56
+
57
+ Demo
58
+ ----
59
+
60
+ ![](demo.gif)
61
+
62
+
63
+ Internals
64
+ ---------
65
+
66
+ New metadata elements added to each cell:
67
+ * `cell.metadata.hide_input` - hide input field of the cell
68
+ * `cell.metadata.hide_output` - hide output field of the cell
69
+ * `cell.metadata.run_control.marked` - mark a codecell
70
+
71
+ To export a notebook with hidden input/output fields, the custom template `hide_input_output.tpl` is required.
72
+ It should have been installed in the `templates` folder.
73
+ You can find the `templates` folder of `jupyter_contrib_nbextensions` from python using
74
+
75
+ ```python
76
+ from jupyter_contrib_nbextensions.nbconvert_support import templates_directory
77
+ print(templates_directory())
78
+ ```
79
+
80
+ The template needs to be in a path where nbconvert can find it. This can be your local path or specified in
81
+ `jupyter_nbconvert_config` or `jupyter_notebook_config` as `c.Exporter.extra_template_paths`, see [Jupyter docs](https://jupyter-notebook.readthedocs.io/en/latest/config.html).
82
+
83
+ For HTML export a template is provided as `nbextensions.tpl` in the `jupyter_contrib_nbextensions` templates directory. Alternatively you can create your own template:
84
+ ```
85
+ {%- extends 'full.tpl' -%}
86
+
87
+ {% block input_group -%}
88
+ {%- if cell.metadata.hide_input -%}
89
+ {%- else -%}
90
+ {{ super() }}
91
+ {%- endif -%}
92
+ {% endblock input_group %}
93
+
94
+ {% block output_group -%}
95
+ {%- if cell.metadata.hide_output -%}
96
+ {%- else -%}
97
+ {{ super() }}
98
+ {%- endif -%}
99
+ {% endblock output_group %}
100
+ ```
101
+
102
+ For LaTeX export a different template is required, which is included as `nbextensions.tplx` in the `jupyter_contrib_nbextensions` templates directory. Alternatively you can create your own template:
103
+ ```
104
+ ((- extends 'report.tplx' -))
105
+
106
+ ((* block input_group -))
107
+ ((- if cell.metadata.hide_input -))
108
+ ((- else -))
109
+ ((( super() )))
110
+ ((- endif -))
111
+ (( endblock input_group *))
112
+
113
+ ((* block output_group -))
114
+ ((- if cell.metadata.hide_output -))
115
+ ((- else -))
116
+ ((( super() )))
117
+ ((- endif -))
118
+ (( endblock output_group *))
119
+ ```
.local/share/jupyter/nbextensions/runtools/runtools_show_hide.png ADDED
.local/share/jupyter/nbextensions/scratchpad/scratchpad.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Type: Jupyter Notebook Extension
2
+ Name: Scratchpad
3
+ Description: Adds a scratchpad cell to Jupyter notebook.
4
+ Link: README.md
5
+ Main: main.js
6
+ Compatibility: 4.x, 5.x
.local/share/jupyter/nbextensions/select_keymap/README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Select CodeMirror Keymap
2
+ =======
3
+
4
+ This extension lets you choose between the available CodeMirror keymaps: default, emacs, vim, and sublime.
5
+
6
+ There's a chance that this might cause key conflicts, especially with other extensions.
7
+
8
+ Most browsers consume some of the global keybindings like `Ctrl+n`. [The Menu Wizard add-on for Firefox](https://addons.mozilla.org/en-US/firefox/addon/s3menu-wizard/) allows you to disable some of the global key shortcuts, thus passing the keys through to CodeMirror.
9
+
10
+ ![Demo](select_keymap.png)
11
+
12
+ Based on:
13
+ * [jupyter-emacskeys](https://github.com/rmcgibbo/jupyter-emacskeys)
14
+ * [notebook_input_mode](https://github.com/asford/notebook_input_mode)
.local/share/jupyter/nbextensions/skill/main.js ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ define(function() {
2
+ "use strict";
3
+ // jupyter nbextensions must export a load_ipython_extension function to
4
+ // avoid throwing an error. Also, loading the module should do nothing
5
+ // unless the function is called, so we wrap requiring the codemirror mode
6
+ // in the load call.
7
+ return {
8
+ load_ipython_extension: function () {
9
+ requirejs(['./skill'], function () {
10
+ console.log('[SKILL Syntax] loaded');
11
+ });
12
+ }
13
+ };
14
+ });
.local/share/jupyter/nbextensions/skill/skill.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Type: IPython Notebook Extension
2
+ Name: SKILL Syntax
3
+ Description: Enable SKILL syntax support for CodeMirror
4
+ Link: README.md
5
+ Main: main.js
6
+ Compatibility: 4.x, 5.x
.local/share/jupyter/nbextensions/skip-traceback/traceback.png ADDED
.local/share/jupyter/nbextensions/splitcell/splitcell.js ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Allow for split cells in jupyter notebooks
2
+
3
+ define([
4
+ 'base/js/namespace',
5
+ 'base/js/events'
6
+ ], function (
7
+ Jupyter,
8
+ events
9
+ ) {
10
+ "use strict";
11
+
12
+ //define default config parameter values
13
+ var params = {
14
+ toggle_cell_style_keybinding : 'shift-s'
15
+ };
16
+
17
+ //updates default params with any specified in the server's config
18
+ var update_params = function(){
19
+ var config = Jupyter.notebook.config;
20
+ for (var key in params){
21
+ if (config.data.hasOwnProperty(key)){
22
+ params[key] = config.data[key];
23
+ }
24
+ }
25
+ };
26
+
27
+ var setup = function (){
28
+ // update defaults
29
+ update_params();
30
+
31
+ //register actions with ActionHandler instance
32
+ var prefix = 'auto';
33
+ var name = 'toggle-cell-style';
34
+ var action = {
35
+ icon : 'fa-arrows-h',
36
+ help : 'Toggle split/centered cell style',
37
+ help_index : 'eb',
38
+ id : 'split_cells',
39
+ handler : toggle_cell_style
40
+ };
41
+
42
+ var action_full_name = Jupyter.keyboard_manager.actions.register(action, name, prefix);
43
+
44
+ //define keyboard shortucts
45
+ var command_mode_shortcuts = {};
46
+ command_mode_shortcuts[params.toggle_cell_style_keybinding] = action_full_name;
47
+
48
+ //register keyboard shortucts with keyboard_manager
49
+ Jupyter.notebook.keyboard_manager.command_shortcuts.add_shortcuts(command_mode_shortcuts);
50
+ Jupyter.toolbar.add_buttons_group([action_full_name]);
51
+ };
52
+
53
+
54
+ var toggle_cell_style = function(){
55
+ var cell = Jupyter.notebook.get_selected_cell();
56
+ if (!("cell_style" in cell.metadata)){cell.metadata.cell_style = 'split';}
57
+ else if (cell.metadata.cell_style == 'center'){cell.metadata.cell_style = 'split';}
58
+ else {cell.metadata.cell_style = 'center';}
59
+
60
+ update_cell_style_element(cell);
61
+ };
62
+
63
+ var get_cell_style_html = function(cell_style){
64
+ console.log(cell_style);
65
+ if (cell_style == "split")
66
+ {return "float:left; width:50%;";}
67
+ return "width:100%;";
68
+ };
69
+
70
+ var update_cell_style_element = function(cell){
71
+ var cell_style_html = get_cell_style_html(cell.metadata.cell_style);
72
+ cell.element.attr('style', cell_style_html);
73
+ };
74
+
75
+ function initialize () {
76
+ // On Load lets set the cell styles correctly
77
+ var cells = Jupyter.notebook.get_cells();
78
+ var ncells = Jupyter.notebook.ncells();
79
+
80
+ for (var i=0; i<ncells; i++){
81
+ var cell = cells[i];
82
+ if ("cell_style" in cell.metadata){
83
+ update_cell_style_element(cell, cell.metadata.cell_style);
84
+ }
85
+ }
86
+ }
87
+
88
+ var load_extension = function() {
89
+ Jupyter.notebook.config.loaded.then(setup);
90
+
91
+ if (Jupyter.notebook !== undefined && Jupyter.notebook._fully_loaded) {
92
+ // notebook already loaded. Update directly
93
+ initialize();
94
+ }
95
+ events.on("notebook_loaded.Notebook", initialize);
96
+ };
97
+
98
+ return {
99
+ load_ipython_extension : load_extension
100
+ };
101
+ });
.local/share/jupyter/nbextensions/toggle_all_line_numbers/icon.png ADDED
.local/share/jupyter/nbextensions/varInspector/README.md ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Variable Inspector
2
+
3
+ ## Description and main features
4
+
5
+ The Variable Inspector extension, which currently supports python and R kernels, collects all defined variables and displays them in a floating window. The window displays not only the names of variables but also their type, size in memory and content. The columns are sortable. The window is draggable, resizable and collapsible. The list of displayed variables is automatically updated at each cell execution. Variables can be deleted from the workspace by clicking a link. Position and state (displayed/collapsed) are stored in the notebook's metadata and restored at startup.
6
+
7
+ The extension supports multiple kernels. To add support for a new kernel, one has to
8
+ - provide a library which loads required modules and defines a function which lists all variables, together with their name, type, size and content. The output of this function must be a JSON representation of a list of objects (one for each variable) with keys 'varName', 'varType', 'varSize', 'varContent',
9
+ - provide the command for deleting a variable, as `delete_cmd_prefix` and `delete_cmd_postfix`, eg. for `rm(variable)`, specify `rm(` and `)`.
10
+ - give the command to refresh the list of variables (usually this is a call to the function defined in the library above). This information can be provided either in the source file or in the yaml config file.
11
+
12
+ In any case, contributions to support further kernels will be very welcome!
13
+
14
+ #### Demo:
15
+ ![](demo.gif)
16
+
17
+
18
+ ## Configuration
19
+ The initial configuration can be given using the IPython-contrib nbextensions facility. It includes:
20
+
21
+ - varInspector.window_display - Display at startup or not (default: false)
22
+ - varInspector.cols.lenName: (and .lenType, .lenVar) - Width of columns (actually the max number of characters to display in each column)
23
+ - varInspector.kernels_config - json object defining the kernels specific code and commands.
24
+
25
+
26
+ ## Notes
27
+ - The displayed size of variables uses the `getsizeof()` python function. This function doesn't work for all types, so the reported size is to be considered with some caution. The extension includes some code to correctly return the size of numpy arrays, pandas Series and DataFrame but the size for some other types may be incorrect.
28
+ - The extension builds on some code provided [here](https://github.com/jupyter-widgets/ipywidgets/blob/master/docs/source/examples/Variable%20Inspector.ipynb) (essentially the `_fill` method)
29
+ - The extension uses Christian Bach's [table sorter jquery plugin](https://github.com/christianbach/tablesorter). License file is included.
30
+
31
+
32
+ ## History
33
+
34
+ - @jfbercher march 22, 2017 -- initial release
35
+ - @jfbercher april 03, 2017 -- multiple kernel support. added support for R kernels.
36
+ - @jfbercher june 30, 2017 -- fixed #1014 (use of `%reset` with IPython kernel) and #1015 printing with python 2 kernel.
.local/share/jupyter/nbextensions/varInspector/__pycache__/var_list.cpython-310.pyc ADDED
Binary file (2.02 kB). View file
 
.local/share/jupyter/nbextensions/varInspector/demo.gif ADDED
.local/share/jupyter/nbextensions/varInspector/icon.png ADDED
.local/share/jupyter/nbextensions/varInspector/tablesorter_LICENSE.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 Christian Bach
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
.local/share/jupyter/nbextensions/varInspector/var_list.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from sys import getsizeof
3
+
4
+ from IPython import get_ipython
5
+ from IPython.core.magics.namespace import NamespaceMagics
6
+ _nms = NamespaceMagics()
7
+ _Jupyter = get_ipython()
8
+ _nms.shell = _Jupyter.kernel.shell
9
+
10
+ try:
11
+ import numpy as np
12
+ except ImportError:
13
+ pass
14
+
15
+ def _getsizeof(x):
16
+ # return the size of variable x. Amended version of sys.getsizeof
17
+ # which also supports ndarray, Series and DataFrame
18
+ if type(x).__name__ in ['ndarray', 'Series']:
19
+ return x.nbytes
20
+ elif type(x).__name__ == 'DataFrame':
21
+ return x.memory_usage().sum()
22
+ else:
23
+ return getsizeof(x)
24
+
25
+ def _getshapeof(x):
26
+ #returns the shape of x if it has one
27
+ #returns None otherwise - might want to return an empty string for an empty column
28
+ try:
29
+ return x.shape
30
+ except AttributeError: #x does not have a shape
31
+ return None
32
+
33
+ def _getcontentof(x):
34
+ length = 150
35
+ if type(x).__name__ == 'DataFrame':
36
+ colnames = ', '.join(x.columns.map(str))
37
+ content = "Column names: %s" % colnames
38
+ elif type(x).__name__ == 'Series':
39
+ content = "Series [%d rows]" % x.shape
40
+ elif type(x).__name__ == 'ndarray':
41
+ content = x.__repr__()
42
+ else:
43
+ if hasattr(x, '__len__'):
44
+ if len(x) > length:
45
+ content = str(x[:length])
46
+ else:
47
+ content = str(x)
48
+ if len(content) > 150:
49
+ return content[:150] + " ..."
50
+ return content
51
+
52
+ def var_dic_list():
53
+ types_to_exclude = ['module', 'function', 'builtin_function_or_method',
54
+ 'instance', '_Feature', 'type', 'ufunc']
55
+ values = _nms.who_ls()
56
+ vardic = [{'varName': v, 'varType': type(eval(v)).__name__, 'varSize': str(_getsizeof(eval(v))), 'varShape': str(_getshapeof(eval(v))) if _getshapeof(eval(v)) else '', 'varContent': _getcontentof(eval(v)) } # noqa
57
+
58
+ for v in values if (v not in ['_html', '_nms', 'NamespaceMagics', '_Jupyter']) & (type(eval(v)).__name__ not in types_to_exclude)] # noqa
59
+ return json.dumps(vardic)
60
+
61
+
62
+ # command to refresh the list of variables
63
+ print(var_dic_list())
.local/share/jupyter/nbextensions/varInspector/var_list.r ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ library(jsonlite)
2
+ var_dic_list = function(){
3
+ ll = ls(.GlobalEnv, all.names = FALSE)
4
+ varList=list()
5
+ iter = 1
6
+ for (k in ll){
7
+ if (class(get(k))!='function'){
8
+ class = class(get(k)); rk = capture.output(str(get(k))); size = object.size(get(k)); sk = substr(get(k),0, 200);
9
+ # [{'varName':v, 'varType': type(eval(v)).__name__, 'varSize': _getsizeof(eval(v)), 'varContent': str(eval(v))[:200]}
10
+ l = list(varName = k, varType = class, varSize = size, varContent = sk)
11
+ varList[[iter]] = l
12
+ # print(l)
13
+ iter = iter + 1}
14
+ }
15
+ return(toJSON(varList, simplifyVector = FALSE, force=TRUE))
16
+ }
17
+ cat(var_dic_list())
.local/share/jupyter/nbextensions/zenmode/README.md ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Zenmode
2
+ =======
3
+
4
+ A little extension to give Zenmode functionality to the IPython notebook
.local/share/jupyter/nbextensions/zenmode/images/back2.jpg ADDED
.local/share/jupyter/nbextensions/zenmode/images/back21.jpg ADDED
.local/share/jupyter/nbextensions/zenmode/images/back3.jpg ADDED
.local/share/jupyter/nbextensions/zenmode/images/ipynblogo0.png ADDED
.local/share/jupyter/nbextensions/zenmode/images/ipynblogo1.png ADDED
.local/share/jupyter/nbextensions/zenmode/main.css ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .navbar-inner {
2
+ opacity: 0.5;
3
+ -webkit-transition: opacity 0.3s ease-in-out;
4
+ -moz-transition: opacity 0.3s ease-in-out;
5
+ -o-transition: opacity 0.3s ease-in-out;
6
+ transition: opacity 0.3s ease-in-out;
7
+ }
8
+
9
+ .navbar-inner:hover {
10
+ opacity: 1.0;
11
+ }
12
+
13
+ #maintoolbar .navbar-text {
14
+ display: none !important;
15
+ }
16
+
17
+ #notebook-container {
18
+ background-color: rgba(255, 255, 255, 0);
19
+ }
20
+
21
+ /*
22
+ .cell {
23
+ background-color: rgb(255, 255, 255);
24
+ }
25
+
26
+ .CodeMirror {
27
+ background: #F8FCCF;
28
+ }
29
+
30
+ div.input_area {
31
+ margin: 2px;
32
+ border: none;
33
+ }
34
+ */
.local/share/jupyter/runtime/jpserver-434.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_url": "/",
3
+ "hostname": "0.0.0.0",
4
+ "password": false,
5
+ "pid": 434,
6
+ "port": 8080,
7
+ "root_dir": "/root",
8
+ "secure": true,
9
+ "sock": "",
10
+ "token": "5a434251505375f2b42435914de608ef3450739f4e14b0be1cfeae3b7364239e",
11
+ "url": "https://184d1c0992ce:8080/",
12
+ "version": "2.12.5"
13
+ }
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.cubin ADDED
Binary file (13.3 kB). View file
 
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttgir ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<256> : tensor<4x1xi64, #blocked>
6
+ %cst_0 = arith.constant dense<0> : tensor<4x1xi64, #blocked>
7
+ %cst_1 = arith.constant dense<512> : tensor<4x1xi64, #blocked>
8
+ %cst_2 = arith.constant dense<256> : tensor<4x1xi32, #blocked>
9
+ %cst_3 = arith.constant dense<131072> : tensor<1x128xi32, #blocked1>
10
+ %cst_4 = arith.constant dense<120> : tensor<1x128xi32, #blocked1>
11
+ %cst_5 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked1>
12
+ %cst_6 = arith.constant dense<true> : tensor<4x1xi1, #blocked>
13
+ %c4_i32 = arith.constant 4 : i32
14
+ %0 = tt.get_program_id x : i32
15
+ %1 = arith.muli %0, %c4_i32 : i32
16
+ %2 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
17
+ %3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
18
+ %4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<4x1xi32, #blocked1>
19
+ %5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<4x1xi32, #blocked>
20
+ %6 = tt.splat %1 : (i32) -> tensor<4x1xi32, #blocked1>
21
+ %7 = tt.splat %1 : (i32) -> tensor<4x1xi32, #blocked>
22
+ %8 = arith.addi %6, %4 : tensor<4x1xi32, #blocked1>
23
+ %9 = arith.addi %7, %5 : tensor<4x1xi32, #blocked>
24
+ %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
25
+ %11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x128xi32, #blocked1>
26
+ %12 = arith.cmpi slt, %11, %cst_4 : tensor<1x128xi32, #blocked1>
27
+ %13 = arith.muli %11, %cst_3 : tensor<1x128xi32, #blocked1>
28
+ %14 = tt.broadcast %8 : (tensor<4x1xi32, #blocked1>) -> tensor<4x128xi32, #blocked1>
29
+ %15 = tt.broadcast %13 : (tensor<1x128xi32, #blocked1>) -> tensor<4x128xi32, #blocked1>
30
+ %16 = arith.addi %14, %15 : tensor<4x128xi32, #blocked1>
31
+ %17 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<4x128x!tt.ptr<f32, 1>, #blocked1>
32
+ %18 = tt.addptr %17, %16 : tensor<4x128x!tt.ptr<f32, 1>, #blocked1>, tensor<4x128xi32, #blocked1>
33
+ %19 = tt.broadcast %12 : (tensor<1x128xi1, #blocked1>) -> tensor<4x128xi1, #blocked1>
34
+ %20 = tt.load %18, %19, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<4x128xf32, #blocked1>
35
+ %21 = arith.addf %20, %cst_5 : tensor<4x128xf32, #blocked1>
36
+ %22 = arith.select %19, %21, %cst_5 : tensor<4x128xi1, #blocked1>, tensor<4x128xf32, #blocked1>
37
+ %23 = "tt.reduce"(%22) <{axis = 1 : i32}> ({
38
+ ^bb0(%arg5: f32, %arg6: f32):
39
+ %40 = arith.addf %arg5, %arg6 : f32
40
+ tt.reduce.return %40 : f32
41
+ }) : (tensor<4x128xf32, #blocked1>) -> tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
42
+ %24 = triton_gpu.convert_layout %23 : (tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
43
+ %25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<4x1xf32, #blocked>
44
+ %26 = arith.divsi %9, %cst_2 : tensor<4x1xi32, #blocked>
45
+ %27 = arith.remsi %9, %cst_2 : tensor<4x1xi32, #blocked>
46
+ %28 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<4x1x!tt.ptr<i64, 1>, #blocked>
47
+ %29 = tt.addptr %28, %26 : tensor<4x1x!tt.ptr<i64, 1>, #blocked>, tensor<4x1xi32, #blocked>
48
+ %30 = tt.load %29 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<4x1xi64, #blocked>
49
+ %31 = arith.addi %30, %cst_1 : tensor<4x1xi64, #blocked>
50
+ %32 = arith.cmpi slt, %30, %cst_0 : tensor<4x1xi64, #blocked>
51
+ %33 = arith.select %32, %31, %30 : tensor<4x1xi1, #blocked>, tensor<4x1xi64, #blocked>
52
+ %34 = arith.muli %33, %cst : tensor<4x1xi64, #blocked>
53
+ %35 = arith.extsi %27 : tensor<4x1xi32, #blocked> to tensor<4x1xi64, #blocked>
54
+ %36 = arith.addi %35, %34 : tensor<4x1xi64, #blocked>
55
+ %37 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<4x1x!tt.ptr<f32, 1>, #blocked>
56
+ %38 = tt.addptr %37, %36 : tensor<4x1x!tt.ptr<f32, 1>, #blocked>, tensor<4x1xi64, #blocked>
57
+ %39 = "tt.atomic_rmw"(%38, %25, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<4x1x!tt.ptr<f32, 1>, #blocked>, tensor<4x1xf32, #blocked>, tensor<4x1xi1, #blocked>) -> tensor<4x1xf32, #blocked>
58
+ tt.return
59
+ }
60
+ }
.triton/dump/199215289adb100508718a5a762ba4d7/triton_.cubin ADDED
Binary file (13 kB). View file
 
.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ptx ADDED
@@ -0,0 +1,453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+ .extern .func __assertfail
11
+ (
12
+ .param .b64 __assertfail_param_0,
13
+ .param .b64 __assertfail_param_1,
14
+ .param .b32 __assertfail_param_2,
15
+ .param .b64 __assertfail_param_3,
16
+ .param .b64 __assertfail_param_4
17
+ )
18
+ ;
19
+ .global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
20
+ .global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
21
+ .global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 55, 32, 60, 32, 53, 48, 50, 53, 55};
22
+ .extern .shared .align 1 .b8 global_smem[];
23
+
24
+ .visible .entry triton__0d1d2de(
25
+ .param .u64 triton__0d1d2de_param_0,
26
+ .param .u64 triton__0d1d2de_param_1,
27
+ .param .u64 triton__0d1d2de_param_2
28
+ )
29
+ .maxntid 128, 1, 1
30
+ {
31
+ .reg .pred %p<24>;
32
+ .reg .b16 %rs<21>;
33
+ .reg .b32 %r<21>;
34
+ .reg .b64 %rd<58>;
35
+ .loc 1 18 0
36
+ $L__func_begin0:
37
+ .loc 1 18 0
38
+
39
+ ld.param.u64 %rd9, [triton__0d1d2de_param_1];
40
+ ld.param.u64 %rd16, [triton__0d1d2de_param_0];
41
+ $L__tmp0:
42
+ .loc 1 21 36
43
+ mov.u32 %r4, %tid.x;
44
+ and.b32 %r1, %r4, 127;
45
+ shl.b32 %r2, %r1, 1;
46
+ or.b32 %r5, %r2, 1;
47
+ or.b32 %r6, %r2, 256;
48
+ .loc 1 20 28
49
+ mov.u32 %r3, %ctaid.x;
50
+ .loc 1 20 46
51
+ mul.wide.s32 %rd1, %r3, 512;
52
+ cvt.u64.u32 %rd17, %r2;
53
+ cvt.u64.u32 %rd18, %r6;
54
+ .loc 1 21 23
55
+ or.b64 %rd2, %rd1, %rd17;
56
+ or.b64 %rd3, %rd1, %rd18;
57
+ .loc 1 24 30
58
+ shl.b64 %rd19, %rd2, 3;
59
+ add.s64 %rd12, %rd16, %rd19;
60
+ add.s64 %rd15, %rd12, 2048;
61
+ mov.pred %p20, -1;
62
+ .loc 1 24 35
63
+ mov.u64 %rd10, 0x0;
64
+ mov.u64 %rd11, 0x0;
65
+ @%p20 ld.global.v2.b64 { %rd10, %rd11 }, [ %rd12 + 0 ];
66
+ mov.u64 %rd13, 0x0;
67
+ mov.u64 %rd14, 0x0;
68
+ @%p20 ld.global.v2.b64 { %rd13, %rd14 }, [ %rd15 + 0 ];
69
+ .loc 1 26 19
70
+ setp.eq.s64 %p3, %rd14, -1;
71
+ setp.eq.s64 %p4, %rd13, -1;
72
+ setp.eq.s64 %p5, %rd11, -1;
73
+ setp.eq.s64 %p6, %rd10, -1;
74
+ .loc 1 28 32
75
+ selp.b64 %rd20, 0, %rd10, %p6;
76
+ selp.b64 %rd21, 0, %rd11, %p5;
77
+ selp.b64 %rd22, 0, %rd13, %p4;
78
+ selp.b64 %rd23, 0, %rd14, %p3;
79
+ .loc 1 29 18
80
+ add.s64 %rd24, %rd23, 50257;
81
+ add.s64 %rd25, %rd22, 50257;
82
+ add.s64 %rd26, %rd21, 50257;
83
+ add.s64 %rd27, %rd20, 50257;
84
+ .loc 1 30 18
85
+ setp.lt.s64 %p7, %rd23, 0;
86
+ setp.lt.s64 %p8, %rd22, 0;
87
+ setp.lt.s64 %p9, %rd21, 0;
88
+ setp.lt.s64 %p10, %rd20, 0;
89
+ .loc 1 31 32
90
+ selp.b64 %rd7, %rd27, %rd20, %p10;
91
+ selp.b64 %rd6, %rd26, %rd21, %p9;
92
+ selp.b64 %rd5, %rd25, %rd22, %p8;
93
+ selp.b64 %rd4, %rd24, %rd23, %p7;
94
+ .loc 1 32 36
95
+ setp.lt.u64 %p11, %rd4, 50257;
96
+ setp.lt.u64 %p12, %rd5, 50257;
97
+ setp.lt.u64 %p13, %rd6, 50257;
98
+ setp.lt.u64 %p14, %rd7, 50257;
99
+ mov.u32 %r7, global_smem;
100
+ add.s32 %r8, %r7, %r2;
101
+ selp.u16 %rs1, 1, 0, %p14;
102
+ st.shared.u8 [%r8], %rs1;
103
+ cvt.u64.u32 %rd8, %r5;
104
+ selp.u16 %rs2, 1, 0, %p13;
105
+ st.shared.u8 [%r8+1], %rs2;
106
+ bar.sync 0;
107
+ add.s32 %r9, %r7, %r1;
108
+ ld.shared.u8 %rs3, [%r9];
109
+ ld.shared.u8 %rs4, [%r9+128];
110
+ bar.sync 0;
111
+ selp.u16 %rs5, 1, 0, %p12;
112
+ st.shared.u8 [%r8], %rs5;
113
+ selp.u16 %rs6, 1, 0, %p11;
114
+ st.shared.u8 [%r8+1], %rs6;
115
+ bar.sync 0;
116
+ ld.shared.u8 %rs7, [%r9];
117
+ ld.shared.u8 %rs8, [%r9+128];
118
+ setp.eq.s16 %p15, %rs7, 0;
119
+ selp.u16 %rs9, 1, 0, %p15;
120
+ shl.b16 %rs10, %rs9, 2;
121
+ setp.eq.s16 %p16, %rs8, 0;
122
+ selp.u16 %rs11, -1, 0, %p16;
123
+ shl.b16 %rs12, %rs11, 3;
124
+ or.b16 %rs13, %rs12, %rs10;
125
+ setp.eq.s16 %p17, %rs4, 0;
126
+ selp.u16 %rs14, 1, 0, %p17;
127
+ setp.eq.s16 %p18, %rs3, 0;
128
+ selp.u16 %rs15, -1, 0, %p18;
129
+ shl.b16 %rs16, %rs15, 1;
130
+ or.b16 %rs17, %rs14, %rs16;
131
+ and.b16 %rs18, %rs17, 3;
132
+ or.b16 %rs19, %rs18, %rs13;
133
+ .loc 1 32 51
134
+ and.b16 %rs20, %rs19, 15;
135
+ setp.eq.s16 %p19, %rs20, 0;
136
+ @%p19 bra $L__BB0_2;
137
+ mov.u64 %rd28, assertMessage_0;
138
+ cvta.global.u64 %rd29, %rd28;
139
+ mov.u64 %rd30, assertFile_0;
140
+ cvta.global.u64 %rd31, %rd30;
141
+ mov.u64 %rd32, assertFunc_0;
142
+ cvta.global.u64 %rd33, %rd32;
143
+ mov.b32 %r10, 883;
144
+ mov.u64 %rd34, 1;
145
+ { // callseq 0, 0
146
+ .reg .b32 temp_param_reg;
147
+ .param .b64 param0;
148
+ st.param.b64 [param0+0], %rd29;
149
+ .param .b64 param1;
150
+ st.param.b64 [param1+0], %rd31;
151
+ .param .b32 param2;
152
+ st.param.b32 [param2+0], %r10;
153
+ .param .b64 param3;
154
+ st.param.b64 [param3+0], %rd33;
155
+ .param .b64 param4;
156
+ st.param.b64 [param4+0], %rd34;
157
+ call.uni
158
+ __assertfail,
159
+ (
160
+ param0,
161
+ param1,
162
+ param2,
163
+ param3,
164
+ param4
165
+ );
166
+ } // callseq 0
167
+ $L__BB0_2:
168
+ .loc 1 21 36
169
+ or.b32 %r15, %r2, 257;
170
+ cvt.u64.u32 %rd39, %r15;
171
+ .loc 1 21 23
172
+ or.b64 %rd40, %rd1, %rd39;
173
+ or.b64 %rd41, %rd1, %rd8;
174
+ .loc 1 34 25
175
+ shl.b64 %rd42, %rd7, 2;
176
+ add.s64 %rd43, %rd9, %rd42;
177
+ mul.lo.s64 %rd44, %rd2, 201028;
178
+ add.s64 %rd45, %rd43, %rd44;
179
+ shl.b64 %rd46, %rd6, 2;
180
+ add.s64 %rd47, %rd9, %rd46;
181
+ mul.lo.s64 %rd48, %rd41, 201028;
182
+ add.s64 %rd49, %rd47, %rd48;
183
+ shl.b64 %rd50, %rd5, 2;
184
+ add.s64 %rd51, %rd9, %rd50;
185
+ mul.lo.s64 %rd52, %rd3, 201028;
186
+ add.s64 %rd53, %rd51, %rd52;
187
+ shl.b64 %rd54, %rd4, 2;
188
+ add.s64 %rd55, %rd9, %rd54;
189
+ mul.lo.s64 %rd56, %rd40, 201028;
190
+ add.s64 %rd57, %rd55, %rd56;
191
+ .loc 1 34 51
192
+ bar.sync 0;
193
+ shl.b32 %r16, %r2, 3;
194
+ add.s32 %r18, %r7, %r16;
195
+ st.shared.u64 [%r18], %rd45;
196
+ st.shared.u64 [%r18+8], %rd49;
197
+ bar.sync 0;
198
+ shl.b32 %r19, %r1, 3;
199
+ add.s32 %r20, %r7, %r19;
200
+ ld.shared.u64 %rd35, [%r20];
201
+ ld.shared.u64 %rd36, [%r20+1024];
202
+ bar.sync 0;
203
+ st.shared.u64 [%r18], %rd53;
204
+ st.shared.u64 [%r18+8], %rd57;
205
+ bar.sync 0;
206
+ ld.shared.u64 %rd37, [%r20];
207
+ ld.shared.u64 %rd38, [%r20+1024];
208
+ mov.b32 %r11, -1082130432;
209
+ @%p20 st.global.b32 [ %rd35 + 0 ], { %r11 };
210
+ @%p20 st.global.b32 [ %rd36 + 0 ], { %r11 };
211
+ @%p20 st.global.b32 [ %rd37 + 0 ], { %r11 };
212
+ @%p20 st.global.b32 [ %rd38 + 0 ], { %r11 };
213
+ .loc 1 34 4
214
+ ret;
215
+ $L__tmp1:
216
+ $L__func_end0:
217
+
218
+ }
219
+ .file 1 "/tmp/torchinductor_root/hl/chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py"
220
+ .section .debug_abbrev
221
+ {
222
+ .b8 1
223
+ .b8 17
224
+ .b8 1
225
+ .b8 37
226
+ .b8 8
227
+ .b8 19
228
+ .b8 5
229
+ .b8 3
230
+ .b8 8
231
+ .b8 16
232
+ .b8 6
233
+ .b8 27
234
+ .b8 8
235
+ .b8 180
236
+ .b8 66
237
+ .b8 12
238
+ .b8 17
239
+ .b8 1
240
+ .b8 18
241
+ .b8 1
242
+ .b8 0
243
+ .b8 0
244
+ .b8 2
245
+ .b8 46
246
+ .b8 0
247
+ .b8 17
248
+ .b8 1
249
+ .b8 18
250
+ .b8 1
251
+ .b8 64
252
+ .b8 10
253
+ .b8 135
254
+ .b8 64
255
+ .b8 8
256
+ .b8 3
257
+ .b8 8
258
+ .b8 58
259
+ .b8 11
260
+ .b8 59
261
+ .b8 11
262
+ .b8 63
263
+ .b8 12
264
+ .b8 0
265
+ .b8 0
266
+ .b8 0
267
+ }
268
+ .section .debug_info
269
+ {
270
+ .b32 176
271
+ .b8 2
272
+ .b8 0
273
+ .b32 .debug_abbrev
274
+ .b8 8
275
+ .b8 1
276
+ .b8 116
277
+ .b8 114
278
+ .b8 105
279
+ .b8 116
280
+ .b8 111
281
+ .b8 110
282
+ .b8 0
283
+ .b8 2
284
+ .b8 0
285
+ .b8 99
286
+ .b8 104
287
+ .b8 108
288
+ .b8 114
289
+ .b8 107
290
+ .b8 103
291
+ .b8 112
292
+ .b8 118
293
+ .b8 118
294
+ .b8 98
295
+ .b8 100
296
+ .b8 105
297
+ .b8 122
298
+ .b8 100
299
+ .b8 122
300
+ .b8 55
301
+ .b8 115
302
+ .b8 108
303
+ .b8 108
304
+ .b8 113
305
+ .b8 117
306
+ .b8 101
307
+ .b8 116
308
+ .b8 50
309
+ .b8 106
310
+ .b8 55
311
+ .b8 122
312
+ .b8 104
313
+ .b8 116
314
+ .b8 101
315
+ .b8 115
316
+ .b8 54
317
+ .b8 109
318
+ .b8 101
319
+ .b8 104
320
+ .b8 54
321
+ .b8 107
322
+ .b8 101
323
+ .b8 110
324
+ .b8 114
325
+ .b8 113
326
+ .b8 120
327
+ .b8 111
328
+ .b8 118
329
+ .b8 50
330
+ .b8 54
331
+ .b8 109
332
+ .b8 115
333
+ .b8 119
334
+ .b8 118
335
+ .b8 119
336
+ .b8 55
337
+ .b8 46
338
+ .b8 112
339
+ .b8 121
340
+ .b8 0
341
+ .b32 .debug_line
342
+ .b8 47
343
+ .b8 116
344
+ .b8 109
345
+ .b8 112
346
+ .b8 47
347
+ .b8 116
348
+ .b8 111
349
+ .b8 114
350
+ .b8 99
351
+ .b8 104
352
+ .b8 105
353
+ .b8 110
354
+ .b8 100
355
+ .b8 117
356
+ .b8 99
357
+ .b8 116
358
+ .b8 111
359
+ .b8 114
360
+ .b8 95
361
+ .b8 114
362
+ .b8 111
363
+ .b8 111
364
+ .b8 116
365
+ .b8 47
366
+ .b8 104
367
+ .b8 108
368
+ .b8 0
369
+ .b8 1
370
+ .b64 $L__func_begin0
371
+ .b64 $L__func_end0
372
+ .b8 2
373
+ .b64 $L__func_begin0
374
+ .b64 $L__func_end0
375
+ .b8 1
376
+ .b8 156
377
+ .b8 116
378
+ .b8 114
379
+ .b8 105
380
+ .b8 116
381
+ .b8 111
382
+ .b8 110
383
+ .b8 95
384
+ .b8 95
385
+ .b8 48
386
+ .b8 100
387
+ .b8 49
388
+ .b8 100
389
+ .b8 50
390
+ .b8 100
391
+ .b8 101
392
+ .b8 0
393
+ .b8 116
394
+ .b8 114
395
+ .b8 105
396
+ .b8 116
397
+ .b8 111
398
+ .b8 110
399
+ .b8 95
400
+ .b8 95
401
+ .b8 48
402
+ .b8 100
403
+ .b8 49
404
+ .b8 100
405
+ .b8 50
406
+ .b8 100
407
+ .b8 101
408
+ .b8 0
409
+ .b8 1
410
+ .b8 18
411
+ .b8 1
412
+ .b8 0
413
+ }
414
+ .section .debug_pubnames
415
+ {
416
+ .b32 $L__pubNames_end0-$L__pubNames_start0
417
+ $L__pubNames_start0:
418
+ .b8 2
419
+ .b8 0
420
+ .b32 .debug_info
421
+ .b32 180
422
+ .b32 125
423
+ .b8 116
424
+ .b8 114
425
+ .b8 105
426
+ .b8 116
427
+ .b8 111
428
+ .b8 110
429
+ .b8 95
430
+ .b8 95
431
+ .b8 48
432
+ .b8 100
433
+ .b8 49
434
+ .b8 100
435
+ .b8 50
436
+ .b8 100
437
+ .b8 101
438
+ .b8 0
439
+ .b32 0
440
+ $L__pubNames_end0:
441
+ }
442
+ .section .debug_pubtypes
443
+ {
444
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
445
+ $L__pubTypes_start0:
446
+ .b8 2
447
+ .b8 0
448
+ .b32 .debug_info
449
+ .b32 180
450
+ .b32 0
451
+ $L__pubTypes_end0:
452
+ }
453
+ .section .debug_loc { }
.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttgir ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ #blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
3
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
4
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
5
+ %cst = arith.constant dense<50257> : tensor<512xi64, #blocked>
6
+ %cst_0 = arith.constant dense<0> : tensor<512xi64, #blocked>
7
+ %cst_1 = arith.constant dense<-1> : tensor<512xi64, #blocked>
8
+ %cst_2 = arith.constant dense<-1.000000e+00> : tensor<512xf32, #blocked1>
9
+ %c512_i64 = arith.constant 512 : i64
10
+ %0 = tt.get_program_id x : i32
11
+ %1 = arith.extsi %0 : i32 to i64
12
+ %2 = arith.muli %1, %c512_i64 : i64
13
+ %3 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
14
+ %4 = arith.extsi %3 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked>
15
+ %5 = tt.splat %2 : (i64) -> tensor<512xi64, #blocked>
16
+ %6 = arith.addi %5, %4 : tensor<512xi64, #blocked>
17
+ %7 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<512x!tt.ptr<i64, 1>, #blocked>
18
+ %8 = tt.addptr %7, %6 : tensor<512x!tt.ptr<i64, 1>, #blocked>, tensor<512xi64, #blocked>
19
+ %9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xi64, #blocked>
20
+ %10 = arith.cmpi ne, %9, %cst_1 : tensor<512xi64, #blocked>
21
+ %11 = arith.select %10, %9, %cst_0 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked>
22
+ %12 = arith.addi %11, %cst : tensor<512xi64, #blocked>
23
+ %13 = arith.cmpi slt, %11, %cst_0 : tensor<512xi64, #blocked>
24
+ %14 = arith.select %13, %12, %11 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked>
25
+ %15 = arith.cmpi sge, %14, %cst_0 : tensor<512xi64, #blocked>
26
+ %16 = arith.cmpi slt, %14, %cst : tensor<512xi64, #blocked>
27
+ %17 = arith.andi %15, %16 : tensor<512xi1, #blocked>
28
+ %18 = triton_gpu.convert_layout %17 : (tensor<512xi1, #blocked>) -> tensor<512xi1, #blocked1>
29
+ tt.assert %18, "index out of bounds: 0 <= tmp7 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<512xi1, #blocked1>
30
+ %19 = arith.muli %6, %cst : tensor<512xi64, #blocked>
31
+ %20 = arith.addi %14, %19 : tensor<512xi64, #blocked>
32
+ %21 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
33
+ %22 = tt.addptr %21, %20 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi64, #blocked>
34
+ %23 = triton_gpu.convert_layout %22 : (tensor<512x!tt.ptr<f32, 1>, #blocked>) -> tensor<512x!tt.ptr<f32, 1>, #blocked1>
35
+ tt.store %23, %cst_2 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked1>
36
+ tt.return
37
+ }
38
+ }
.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttir ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<50257> : tensor<512xi64>
4
+ %cst_0 = arith.constant dense<0> : tensor<512xi64>
5
+ %c512_i64 = arith.constant 512 : i64
6
+ %cst_1 = arith.constant dense<-1.000000e+00> : tensor<512xf32>
7
+ %cst_2 = arith.constant dense<-1> : tensor<512xi64>
8
+ %0 = tt.get_program_id x : i32
9
+ %1 = arith.extsi %0 : i32 to i64
10
+ %2 = arith.muli %1, %c512_i64 : i64
11
+ %3 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
12
+ %4 = arith.extsi %3 : tensor<512xi32> to tensor<512xi64>
13
+ %5 = tt.splat %2 : (i64) -> tensor<512xi64>
14
+ %6 = arith.addi %5, %4 : tensor<512xi64>
15
+ %7 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<512x!tt.ptr<i64, 1>>
16
+ %8 = tt.addptr %7, %6 : tensor<512x!tt.ptr<i64, 1>>, tensor<512xi64>
17
+ %9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xi64>
18
+ %10 = arith.cmpi ne, %9, %cst_2 : tensor<512xi64>
19
+ %11 = arith.select %10, %9, %cst_0 : tensor<512xi1>, tensor<512xi64>
20
+ %12 = arith.addi %11, %cst : tensor<512xi64>
21
+ %13 = arith.cmpi slt, %11, %cst_0 : tensor<512xi64>
22
+ %14 = arith.select %13, %12, %11 : tensor<512xi1>, tensor<512xi64>
23
+ %15 = arith.cmpi sge, %14, %cst_0 : tensor<512xi64>
24
+ %16 = arith.cmpi slt, %14, %cst : tensor<512xi64>
25
+ %17 = arith.andi %15, %16 : tensor<512xi1>
26
+ tt.assert %17, "index out of bounds: 0 <= tmp7 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<512xi1>
27
+ %18 = arith.muli %6, %cst : tensor<512xi64>
28
+ %19 = arith.addi %14, %18 : tensor<512xi64>
29
+ %20 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
30
+ %21 = tt.addptr %20, %19 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi64>
31
+ tt.store %21, %cst_1 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32>
32
+ tt.return
33
+ }
34
+ }
.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ptx ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+
11
+ .visible .entry triton__0d1d2de(
12
+ .param .u64 triton__0d1d2de_param_0,
13
+ .param .u64 triton__0d1d2de_param_1,
14
+ .param .u32 triton__0d1d2de_param_2
15
+ )
16
+ .maxntid 128, 1, 1
17
+ {
18
+ .reg .pred %p<4>;
19
+ .reg .b16 %rs<9>;
20
+ .reg .b32 %r<31>;
21
+ .reg .b64 %rd<8>;
22
+ .loc 1 18 0
23
+ $L__func_begin0:
24
+ .loc 1 18 0
25
+
26
+ ld.param.u64 %rd4, [triton__0d1d2de_param_0];
27
+ ld.param.u64 %rd5, [triton__0d1d2de_param_1];
28
+ $L__tmp0:
29
+ .loc 1 21 36
30
+ mov.u32 %r22, %tid.x;
31
+ shl.b32 %r23, %r22, 3;
32
+ and.b32 %r24, %r23, 1016;
33
+ .loc 1 20 28
34
+ mov.u32 %r1, %ctaid.x;
35
+ .loc 1 20 33
36
+ shl.b32 %r25, %r1, 10;
37
+ .loc 1 21 23
38
+ or.b32 %r26, %r25, %r24;
39
+ .loc 1 24 30
40
+ mul.wide.s32 %rd6, %r26, 4;
41
+ add.s64 %rd1, %rd4, %rd6;
42
+ add.s64 %rd2, %rd1, 16;
43
+ mov.pred %p1, -1;
44
+ .loc 1 24 35
45
+ mov.u32 %r10, 0x0;
46
+ mov.u32 %r11, 0x0;
47
+ mov.u32 %r12, 0x0;
48
+ mov.u32 %r13, 0x0;
49
+ @%p1 ld.global.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd1 + 0 ];
50
+ mov.u32 %r14, 0x0;
51
+ mov.u32 %r15, 0x0;
52
+ mov.u32 %r16, 0x0;
53
+ mov.u32 %r17, 0x0;
54
+ @%p1 ld.global.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd2 + 0 ];
55
+ .loc 1 26 25
56
+ mul.wide.s32 %rd7, %r26, 2;
57
+ add.s64 %rd3, %rd5, %rd7;
58
+ .loc 1 26 36
59
+ cvt.rn.bf16.f32 %rs1, %r10;
60
+ cvt.rn.bf16.f32 %rs2, %r11;
61
+ cvt.rn.bf16.f32 %rs3, %r12;
62
+ cvt.rn.bf16.f32 %rs4, %r13;
63
+ cvt.rn.bf16.f32 %rs5, %r14;
64
+ cvt.rn.bf16.f32 %rs6, %r15;
65
+ cvt.rn.bf16.f32 %rs7, %r16;
66
+ cvt.rn.bf16.f32 %rs8, %r17;
67
+ mov.b32 %r27, {%rs1, %rs2};
68
+ mov.b32 %r28, {%rs3, %rs4};
69
+ mov.b32 %r29, {%rs5, %rs6};
70
+ mov.b32 %r30, {%rs7, %rs8};
71
+ @%p1 st.global.v4.b32 [ %rd3 + 0 ], { %r27, %r28, %r29, %r30 };
72
+ .loc 1 26 4
73
+ ret;
74
+ $L__tmp1:
75
+ $L__func_end0:
76
+
77
+ }
78
+ .file 1 "/tmp/torchinductor_root/5t/c5tryp5qwkhreijk7s5x327wofz54lwj4kvctuqdzv2vrf2xyons.py"
79
+ .section .debug_abbrev
80
+ {
81
+ .b8 1
82
+ .b8 17
83
+ .b8 1
84
+ .b8 37
85
+ .b8 8
86
+ .b8 19
87
+ .b8 5
88
+ .b8 3
89
+ .b8 8
90
+ .b8 16
91
+ .b8 6
92
+ .b8 27
93
+ .b8 8
94
+ .b8 180
95
+ .b8 66
96
+ .b8 12
97
+ .b8 17
98
+ .b8 1
99
+ .b8 18
100
+ .b8 1
101
+ .b8 0
102
+ .b8 0
103
+ .b8 2
104
+ .b8 46
105
+ .b8 0
106
+ .b8 17
107
+ .b8 1
108
+ .b8 18
109
+ .b8 1
110
+ .b8 64
111
+ .b8 10
112
+ .b8 135
113
+ .b8 64
114
+ .b8 8
115
+ .b8 3
116
+ .b8 8
117
+ .b8 58
118
+ .b8 11
119
+ .b8 59
120
+ .b8 11
121
+ .b8 63
122
+ .b8 12
123
+ .b8 0
124
+ .b8 0
125
+ .b8 0
126
+ }
127
+ .section .debug_info
128
+ {
129
+ .b32 176
130
+ .b8 2
131
+ .b8 0
132
+ .b32 .debug_abbrev
133
+ .b8 8
134
+ .b8 1
135
+ .b8 116
136
+ .b8 114
137
+ .b8 105
138
+ .b8 116
139
+ .b8 111
140
+ .b8 110
141
+ .b8 0
142
+ .b8 2
143
+ .b8 0
144
+ .b8 99
145
+ .b8 53
146
+ .b8 116
147
+ .b8 114
148
+ .b8 121
149
+ .b8 112
150
+ .b8 53
151
+ .b8 113
152
+ .b8 119
153
+ .b8 107
154
+ .b8 104
155
+ .b8 114
156
+ .b8 101
157
+ .b8 105
158
+ .b8 106
159
+ .b8 107
160
+ .b8 55
161
+ .b8 115
162
+ .b8 53
163
+ .b8 120
164
+ .b8 51
165
+ .b8 50
166
+ .b8 55
167
+ .b8 119
168
+ .b8 111
169
+ .b8 102
170
+ .b8 122
171
+ .b8 53
172
+ .b8 52
173
+ .b8 108
174
+ .b8 119
175
+ .b8 106
176
+ .b8 52
177
+ .b8 107
178
+ .b8 118
179
+ .b8 99
180
+ .b8 116
181
+ .b8 117
182
+ .b8 113
183
+ .b8 100
184
+ .b8 122
185
+ .b8 118
186
+ .b8 50
187
+ .b8 118
188
+ .b8 114
189
+ .b8 102
190
+ .b8 50
191
+ .b8 120
192
+ .b8 121
193
+ .b8 111
194
+ .b8 110
195
+ .b8 115
196
+ .b8 46
197
+ .b8 112
198
+ .b8 121
199
+ .b8 0
200
+ .b32 .debug_line
201
+ .b8 47
202
+ .b8 116
203
+ .b8 109
204
+ .b8 112
205
+ .b8 47
206
+ .b8 116
207
+ .b8 111
208
+ .b8 114
209
+ .b8 99
210
+ .b8 104
211
+ .b8 105
212
+ .b8 110
213
+ .b8 100
214
+ .b8 117
215
+ .b8 99
216
+ .b8 116
217
+ .b8 111
218
+ .b8 114
219
+ .b8 95
220
+ .b8 114
221
+ .b8 111
222
+ .b8 111
223
+ .b8 116
224
+ .b8 47
225
+ .b8 53
226
+ .b8 116
227
+ .b8 0
228
+ .b8 1
229
+ .b64 $L__func_begin0
230
+ .b64 $L__func_end0
231
+ .b8 2
232
+ .b64 $L__func_begin0
233
+ .b64 $L__func_end0
234
+ .b8 1
235
+ .b8 156
236
+ .b8 116
237
+ .b8 114
238
+ .b8 105
239
+ .b8 116
240
+ .b8 111
241
+ .b8 110
242
+ .b8 95
243
+ .b8 95
244
+ .b8 48
245
+ .b8 100
246
+ .b8 49
247
+ .b8 100
248
+ .b8 50
249
+ .b8 100
250
+ .b8 101
251
+ .b8 0
252
+ .b8 116
253
+ .b8 114
254
+ .b8 105
255
+ .b8 116
256
+ .b8 111
257
+ .b8 110
258
+ .b8 95
259
+ .b8 95
260
+ .b8 48
261
+ .b8 100
262
+ .b8 49
263
+ .b8 100
264
+ .b8 50
265
+ .b8 100
266
+ .b8 101
267
+ .b8 0
268
+ .b8 1
269
+ .b8 18
270
+ .b8 1
271
+ .b8 0
272
+ }
273
+ .section .debug_pubnames
274
+ {
275
+ .b32 $L__pubNames_end0-$L__pubNames_start0
276
+ $L__pubNames_start0:
277
+ .b8 2
278
+ .b8 0
279
+ .b32 .debug_info
280
+ .b32 180
281
+ .b32 125
282
+ .b8 116
283
+ .b8 114
284
+ .b8 105
285
+ .b8 116
286
+ .b8 111
287
+ .b8 110
288
+ .b8 95
289
+ .b8 95
290
+ .b8 48
291
+ .b8 100
292
+ .b8 49
293
+ .b8 100
294
+ .b8 50
295
+ .b8 100
296
+ .b8 101
297
+ .b8 0
298
+ .b32 0
299
+ $L__pubNames_end0:
300
+ }
301
+ .section .debug_pubtypes
302
+ {
303
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
304
+ $L__pubTypes_start0:
305
+ .b8 2
306
+ .b8 0
307
+ .b32 .debug_info
308
+ .b32 180
309
+ .b32 0
310
+ $L__pubTypes_end0:
311
+ }
312
+ .section .debug_loc { }
.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttgir ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %c1024_i32 = arith.constant 1024 : i32
5
+ %0 = tt.get_program_id x : i32
6
+ %1 = arith.muli %0, %c1024_i32 : i32
7
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
8
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
9
+ %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
10
+ %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
11
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi32, #blocked>
12
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked>
13
+ %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
14
+ %9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
15
+ %10 = arith.truncf %7 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
16
+ tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
17
+ tt.return
18
+ }
19
+ }
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin ADDED
Binary file (14.1 kB). View file
 
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttir ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c8_i32 = arith.constant 8 : i32
4
+ %c128_i32 = arith.constant 128 : i32
5
+ %c0_i32 = arith.constant 0 : i32
6
+ %cst = arith.constant dense<32768> : tensor<64x1xi32>
7
+ %cst_0 = arith.constant dense<256> : tensor<1x8xi32>
8
+ %cst_1 = arith.constant dense<128> : tensor<1x8xi32>
9
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
10
+ %cst_3 = arith.constant dense<256> : tensor<64x1xi32>
11
+ %c64_i32 = arith.constant 64 : i32
12
+ %0 = tt.get_program_id x : i32
13
+ %1 = arith.muli %0, %c64_i32 : i32
14
+ %2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
15
+ %3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
16
+ %4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
17
+ %5 = arith.addi %4, %3 : tensor<64x1xi32>
18
+ %6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
19
+ %7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
20
+ %8 = arith.remsi %5, %cst_3 : tensor<64x1xi32>
21
+ %9 = arith.divsi %5, %cst_3 : tensor<64x1xi32>
22
+ %10 = tt.broadcast %8 : (tensor<64x1xi32>) -> tensor<64x8xi32>
23
+ %11 = arith.muli %9, %cst : tensor<64x1xi32>
24
+ %12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x8xi32>
25
+ %13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
26
+ %14 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
27
+ %15 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_2) -> (tensor<64x8xf32>) : i32 {
28
+ %20 = tt.splat %arg5 : (i32) -> tensor<1x8xi32>
29
+ %21 = arith.addi %20, %7 : tensor<1x8xi32>
30
+ %22 = arith.cmpi slt, %21, %cst_1 : tensor<1x8xi32>
31
+ %23 = arith.muli %21, %cst_0 : tensor<1x8xi32>
32
+ %24 = tt.broadcast %23 : (tensor<1x8xi32>) -> tensor<64x8xi32>
33
+ %25 = arith.addi %10, %24 : tensor<64x8xi32>
34
+ %26 = arith.addi %25, %12 : tensor<64x8xi32>
35
+ %27 = tt.addptr %13, %26 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
36
+ %28 = tt.broadcast %22 : (tensor<1x8xi1>) -> tensor<64x8xi1>
37
+ %29 = tt.load %27, %28, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
38
+ %30 = tt.addptr %14, %26 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
39
+ %31 = tt.load %30, %28, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
40
+ %32 = arith.mulf %29, %31 : tensor<64x8xf32>
41
+ %33 = arith.addf %arg6, %32 : tensor<64x8xf32>
42
+ %34 = arith.select %28, %33, %arg6 : tensor<64x8xi1>, tensor<64x8xf32>
43
+ scf.yield %34 : tensor<64x8xf32>
44
+ }
45
+ %16 = "tt.reduce"(%15) <{axis = 1 : i32}> ({
46
+ ^bb0(%arg5: f32, %arg6: f32):
47
+ %20 = arith.addf %arg5, %arg6 : f32
48
+ tt.reduce.return %20 : f32
49
+ }) : (tensor<64x8xf32>) -> tensor<64xf32>
50
+ %17 = tt.expand_dims %16 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
51
+ %18 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
52
+ %19 = tt.addptr %18, %5 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi32>
53
+ tt.store %19, %17 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32>
54
+ tt.return
55
+ }
56
+ }
.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ttgir ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<256> : tensor<256xi32, #blocked>
5
+ %cst_0 = arith.constant dense<-1> : tensor<1xi64, #blocked>
6
+ %cst_1 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked>
7
+ %cst_2 = arith.constant dense<256> : tensor<1xi64, #blocked>
8
+ %cst_3 = arith.constant dense<0> : tensor<1xi64, #blocked>
9
+ %cst_4 = arith.constant dense<50257> : tensor<1xi64, #blocked>
10
+ %cst_5 = arith.constant 0.000000e+00 : f32
11
+ %c256_i32 = arith.constant 256 : i32
12
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
13
+ %cst_7 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked>
14
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
15
+ %0 = tt.get_program_id x : i32
16
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
17
+ %2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
18
+ %3 = arith.muli %0, %c256_i32 : i32
19
+ %4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
20
+ %5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
21
+ %6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
22
+ %7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
23
+ %8 = tt.load %7, %2, %cst_8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
24
+ %9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
25
+ %10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
26
+ %11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
27
+ %12 = tt.load %11, %2, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
28
+ %13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
29
+ %14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
30
+ %15 = tt.load %14, %2, %cst_6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
31
+ %16 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
32
+ %17 = tt.splat %16 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
33
+ %18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
34
+ %19 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
35
+ %20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
36
+ %21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
37
+ %22 = tt.addptr %arg6, %0 : !tt.ptr<i64, 1>, i32
38
+ %23 = tt.splat %22 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>, #blocked>
39
+ %24 = tt.load %23 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked>
40
+ %25 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
41
+ %26 = tt.addptr %25, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
42
+ %27 = tt.load %26, %2, %cst_6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
43
+ %28 = arith.mulf %9, %12 : tensor<256xf32, #blocked>
44
+ %29 = arith.select %2, %28, %cst_6 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
45
+ %30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
46
+ ^bb0(%arg10: f32, %arg11: f32):
47
+ %63 = arith.addf %arg10, %arg11 : f32
48
+ tt.reduce.return %63 : f32
49
+ }) : (tensor<256xf32, #blocked>) -> f32
50
+ %31 = arith.addf %30, %cst_5 : f32
51
+ %32 = tt.broadcast %18 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
52
+ %33 = arith.subf %15, %32 : tensor<256xf32, #blocked>
53
+ %34 = tt.broadcast %21 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
54
+ %35 = arith.mulf %33, %34 : tensor<256xf32, #blocked>
55
+ %36 = arith.mulf %28, %35 : tensor<256xf32, #blocked>
56
+ %37 = arith.select %2, %36, %cst_6 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
57
+ %38 = "tt.reduce"(%37) <{axis = 0 : i32}> ({
58
+ ^bb0(%arg10: f32, %arg11: f32):
59
+ %63 = arith.addf %arg10, %arg11 : f32
60
+ tt.reduce.return %63 : f32
61
+ }) : (tensor<256xf32, #blocked>) -> f32
62
+ %39 = arith.addf %38, %cst_5 : f32
63
+ %40 = arith.cmpi eq, %24, %cst_0 : tensor<1xi64, #blocked>
64
+ %41 = arith.divf %21, %cst_1 : tensor<1xf32, #blocked>
65
+ %42 = arith.mulf %28, %cst_7 : tensor<256xf32, #blocked>
66
+ %43 = tt.splat %31 : (f32) -> tensor<256xf32, #blocked>
67
+ %44 = arith.subf %42, %43 : tensor<256xf32, #blocked>
68
+ %45 = tt.splat %39 : (f32) -> tensor<256xf32, #blocked>
69
+ %46 = arith.mulf %35, %45 : tensor<256xf32, #blocked>
70
+ %47 = arith.subf %44, %46 : tensor<256xf32, #blocked>
71
+ %48 = tt.broadcast %41 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
72
+ %49 = arith.mulf %48, %47 : tensor<256xf32, #blocked>
73
+ %50 = arith.addf %27, %49 : tensor<256xf32, #blocked>
74
+ %51 = tt.broadcast %40 : (tensor<1xi1, #blocked>) -> tensor<256xi1, #blocked>
75
+ %52 = arith.select %51, %cst_6, %50 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
76
+ %53 = arith.addi %24, %cst_4 : tensor<1xi64, #blocked>
77
+ %54 = arith.cmpi slt, %24, %cst_3 : tensor<1xi64, #blocked>
78
+ %55 = arith.select %54, %53, %24 : tensor<1xi1, #blocked>, tensor<1xi64, #blocked>
79
+ %56 = arith.muli %55, %cst_2 : tensor<1xi64, #blocked>
80
+ %57 = tt.broadcast %56 : (tensor<1xi64, #blocked>) -> tensor<256xi64, #blocked>
81
+ %58 = arith.extsi %1 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked>
82
+ %59 = arith.addi %58, %57 : tensor<256xi64, #blocked>
83
+ %60 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
84
+ %61 = tt.addptr %60, %59 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi64, #blocked>
85
+ %62 = "tt.atomic_rmw"(%61, %52, %2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xf32, #blocked>, tensor<256xi1, #blocked>) -> tensor<256xf32, #blocked>
86
+ tt.return
87
+ }
88
+ }
.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ptx ADDED
@@ -0,0 +1,778 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
11
+
12
+ .visible .entry triton__0d1d2de(
13
+ .param .u64 triton__0d1d2de_param_0,
14
+ .param .u64 triton__0d1d2de_param_1,
15
+ .param .u32 triton__0d1d2de_param_2
16
+ )
17
+ .maxntid 128, 1, 1
18
+ {
19
+ .reg .pred %p<27>;
20
+ .reg .b16 %rs<17>;
21
+ .reg .b32 %r<67>;
22
+ .reg .f32 %f<431>;
23
+ .reg .b64 %rd<8>;
24
+ .loc 1 18 0
25
+ $L__func_begin0:
26
+ .loc 1 18 0
27
+
28
+ ld.param.u64 %rd4, [triton__0d1d2de_param_0];
29
+ $L__tmp0:
30
+ .loc 1 21 36
31
+ mov.u32 %r14, %tid.x;
32
+ shl.b32 %r15, %r14, 3;
33
+ and.b32 %r16, %r15, 1016;
34
+ .loc 1 20 28
35
+ mov.u32 %r1, %ctaid.x;
36
+ .loc 1 20 33
37
+ shl.b32 %r17, %r1, 10;
38
+ .loc 1 21 23
39
+ or.b32 %r18, %r17, %r16;
40
+ .loc 1 24 30
41
+ mul.wide.s32 %rd5, %r18, 2;
42
+ add.s64 %rd3, %rd4, %rd5;
43
+ mov.pred %p1, -1;
44
+ .loc 1 24 35
45
+ mov.u32 %r2, 0x0;
46
+ mov.u32 %r3, 0x0;
47
+ mov.u32 %r4, 0x0;
48
+ mov.u32 %r5, 0x0;
49
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd3 + 0 ];
50
+ cvt.u16.u32 %rs1, %r2;
51
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
52
+ cvt.u16.u32 %rs3, %r3;
53
+ .loc 1 24 44
54
+ cvt.f32.bf16 %r6, %rs1;
55
+ mov.b32 %f1, %r6;
56
+ cvt.f32.bf16 %r7, %rs2;
57
+ mov.b32 %f2, %r7;
58
+ .loc 1 29 18
59
+ mul.f32 %f9, %f1, 0f3F3504F3;
60
+ .loc 1 30 23
61
+ abs.ftz.f32 %f17, %f9;
62
+ setp.ge.f32 %p2, %f17, 0f3F8060FE;
63
+ mov.f32 %f365, 0f3789CA3C;
64
+ mov.f32 %f364, 0fB9F560B9;
65
+ mov.f32 %f363, 0f3BAC840B;
66
+ mov.f32 %f362, 0fBD0C8162;
67
+ mov.f32 %f361, 0f3E1CF906;
68
+ mov.f32 %f360, 0f3F6A937E;
69
+ mov.f32 %f359, 0f3F20D842;
70
+ mov.f32 %f366, %f17;
71
+ @%p2 bra $L__BB0_2;
72
+ .loc 1 0 23
73
+ mov.f32 %f365, 0f38B1E96A;
74
+ mov.f32 %f364, 0fBA574D20;
75
+ mov.f32 %f363, 0f3BAAD5EA;
76
+ mov.f32 %f362, 0fBCDC1BE7;
77
+ mov.f32 %f361, 0f3DE718AF;
78
+ mov.f32 %f360, 0fBEC093AC;
79
+ mov.f32 %f359, 0f3E0375D3;
80
+ .loc 1 30 23
81
+ mul.f32 %f366, %f9, %f9;
82
+ $L__BB0_2:
83
+ .loc 1 0 0
84
+ cvt.f32.bf16 %r8, %rs3;
85
+ mul.f32 %f10, %f2, 0f3F3504F3;
86
+ .loc 1 30 23
87
+ setp.ltu.f32 %p3, %f17, 0f3F8060FE;
88
+ fma.rn.ftz.f32 %f135, %f365, %f366, %f364;
89
+ fma.rn.ftz.f32 %f136, %f135, %f366, %f363;
90
+ fma.rn.ftz.f32 %f137, %f136, %f366, %f362;
91
+ fma.rn.ftz.f32 %f138, %f137, %f366, %f361;
92
+ fma.rn.ftz.f32 %f139, %f138, %f366, %f360;
93
+ fma.rn.ftz.f32 %f140, %f139, %f366, %f359;
94
+ neg.f32 %f141, %f366;
95
+ selp.f32 %f142, %f141, %f9, %p2;
96
+ fma.rn.ftz.f32 %f367, %f140, %f142, %f142;
97
+ mov.f32 %f358, 0f3F800000;
98
+ @%p3 bra $L__BB0_4;
99
+ ex2.approx.ftz.f32 %f143, %f367;
100
+ sub.f32 %f145, %f358, %f143;
101
+ mov.b32 %r19, %f145;
102
+ mov.b32 %r20, %f9;
103
+ and.b32 %r21, %r20, -2147483648;
104
+ or.b32 %r22, %r21, %r19;
105
+ mov.b32 %f367, %r22;
106
+ $L__BB0_4:
107
+ .loc 1 0 0
108
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
109
+ mov.b32 %f3, %r8;
110
+ .loc 1 30 23
111
+ abs.ftz.f32 %f30, %f10;
112
+ setp.ge.f32 %p5, %f30, 0f3F8060FE;
113
+ mov.f32 %f374, 0f3789CA3C;
114
+ mov.f32 %f373, 0fB9F560B9;
115
+ mov.f32 %f372, 0f3BAC840B;
116
+ mov.f32 %f371, 0fBD0C8162;
117
+ mov.f32 %f370, 0f3E1CF906;
118
+ mov.f32 %f369, 0f3F6A937E;
119
+ mov.f32 %f368, 0f3F20D842;
120
+ mov.f32 %f375, %f30;
121
+ @%p5 bra $L__BB0_6;
122
+ mul.f32 %f375, %f10, %f10;
123
+ mov.f32 %f374, 0f38B1E96A;
124
+ mov.f32 %f373, 0fBA574D20;
125
+ mov.f32 %f372, 0f3BAAD5EA;
126
+ mov.f32 %f371, 0fBCDC1BE7;
127
+ mov.f32 %f370, 0f3DE718AF;
128
+ mov.f32 %f369, 0fBEC093AC;
129
+ mov.f32 %f368, 0f3E0375D3;
130
+ $L__BB0_6:
131
+ .loc 1 0 0
132
+ cvt.f32.bf16 %r9, %rs4;
133
+ mul.f32 %f11, %f3, 0f3F3504F3;
134
+ .loc 1 30 23
135
+ setp.ltu.f32 %p6, %f30, 0f3F8060FE;
136
+ fma.rn.ftz.f32 %f160, %f374, %f375, %f373;
137
+ fma.rn.ftz.f32 %f161, %f160, %f375, %f372;
138
+ fma.rn.ftz.f32 %f162, %f161, %f375, %f371;
139
+ fma.rn.ftz.f32 %f163, %f162, %f375, %f370;
140
+ fma.rn.ftz.f32 %f164, %f163, %f375, %f369;
141
+ fma.rn.ftz.f32 %f165, %f164, %f375, %f368;
142
+ neg.f32 %f166, %f375;
143
+ selp.f32 %f167, %f166, %f10, %p5;
144
+ fma.rn.ftz.f32 %f376, %f165, %f167, %f167;
145
+ @%p6 bra $L__BB0_8;
146
+ ex2.approx.ftz.f32 %f168, %f376;
147
+ sub.f32 %f170, %f358, %f168;
148
+ mov.b32 %r23, %f170;
149
+ mov.b32 %r24, %f10;
150
+ and.b32 %r25, %r24, -2147483648;
151
+ or.b32 %r26, %r25, %r23;
152
+ mov.b32 %f376, %r26;
153
+ $L__BB0_8:
154
+ .loc 1 0 0
155
+ cvt.u16.u32 %rs5, %r4;
156
+ mov.b32 %f4, %r9;
157
+ .loc 1 30 23
158
+ abs.ftz.f32 %f43, %f11;
159
+ setp.ge.f32 %p8, %f43, 0f3F8060FE;
160
+ mov.f32 %f383, 0f3789CA3C;
161
+ mov.f32 %f382, 0fB9F560B9;
162
+ mov.f32 %f381, 0f3BAC840B;
163
+ mov.f32 %f380, 0fBD0C8162;
164
+ mov.f32 %f379, 0f3E1CF906;
165
+ mov.f32 %f378, 0f3F6A937E;
166
+ mov.f32 %f377, 0f3F20D842;
167
+ mov.f32 %f384, %f43;
168
+ @%p8 bra $L__BB0_10;
169
+ mul.f32 %f384, %f11, %f11;
170
+ mov.f32 %f383, 0f38B1E96A;
171
+ mov.f32 %f382, 0fBA574D20;
172
+ mov.f32 %f381, 0f3BAAD5EA;
173
+ mov.f32 %f380, 0fBCDC1BE7;
174
+ mov.f32 %f379, 0f3DE718AF;
175
+ mov.f32 %f378, 0fBEC093AC;
176
+ mov.f32 %f377, 0f3E0375D3;
177
+ $L__BB0_10:
178
+ .loc 1 0 0
179
+ cvt.f32.bf16 %r10, %rs5;
180
+ mul.f32 %f12, %f4, 0f3F3504F3;
181
+ .loc 1 30 23
182
+ setp.ltu.f32 %p9, %f43, 0f3F8060FE;
183
+ fma.rn.ftz.f32 %f185, %f383, %f384, %f382;
184
+ fma.rn.ftz.f32 %f186, %f185, %f384, %f381;
185
+ fma.rn.ftz.f32 %f187, %f186, %f384, %f380;
186
+ fma.rn.ftz.f32 %f188, %f187, %f384, %f379;
187
+ fma.rn.ftz.f32 %f189, %f188, %f384, %f378;
188
+ fma.rn.ftz.f32 %f190, %f189, %f384, %f377;
189
+ neg.f32 %f191, %f384;
190
+ selp.f32 %f192, %f191, %f11, %p8;
191
+ fma.rn.ftz.f32 %f385, %f190, %f192, %f192;
192
+ @%p9 bra $L__BB0_12;
193
+ ex2.approx.ftz.f32 %f193, %f385;
194
+ sub.f32 %f195, %f358, %f193;
195
+ mov.b32 %r27, %f195;
196
+ mov.b32 %r28, %f11;
197
+ and.b32 %r29, %r28, -2147483648;
198
+ or.b32 %r30, %r29, %r27;
199
+ mov.b32 %f385, %r30;
200
+ $L__BB0_12:
201
+ .loc 1 0 0
202
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
203
+ mov.b32 %f5, %r10;
204
+ .loc 1 30 23
205
+ abs.ftz.f32 %f56, %f12;
206
+ setp.ge.f32 %p11, %f56, 0f3F8060FE;
207
+ mov.f32 %f392, 0f3789CA3C;
208
+ mov.f32 %f391, 0fB9F560B9;
209
+ mov.f32 %f390, 0f3BAC840B;
210
+ mov.f32 %f389, 0fBD0C8162;
211
+ mov.f32 %f388, 0f3E1CF906;
212
+ mov.f32 %f387, 0f3F6A937E;
213
+ mov.f32 %f386, 0f3F20D842;
214
+ mov.f32 %f393, %f56;
215
+ @%p11 bra $L__BB0_14;
216
+ mul.f32 %f393, %f12, %f12;
217
+ mov.f32 %f392, 0f38B1E96A;
218
+ mov.f32 %f391, 0fBA574D20;
219
+ mov.f32 %f390, 0f3BAAD5EA;
220
+ mov.f32 %f389, 0fBCDC1BE7;
221
+ mov.f32 %f388, 0f3DE718AF;
222
+ mov.f32 %f387, 0fBEC093AC;
223
+ mov.f32 %f386, 0f3E0375D3;
224
+ $L__BB0_14:
225
+ .loc 1 0 0
226
+ cvt.f32.bf16 %r11, %rs6;
227
+ mul.f32 %f13, %f5, 0f3F3504F3;
228
+ .loc 1 30 23
229
+ setp.ltu.f32 %p12, %f56, 0f3F8060FE;
230
+ fma.rn.ftz.f32 %f210, %f392, %f393, %f391;
231
+ fma.rn.ftz.f32 %f211, %f210, %f393, %f390;
232
+ fma.rn.ftz.f32 %f212, %f211, %f393, %f389;
233
+ fma.rn.ftz.f32 %f213, %f212, %f393, %f388;
234
+ fma.rn.ftz.f32 %f214, %f213, %f393, %f387;
235
+ fma.rn.ftz.f32 %f215, %f214, %f393, %f386;
236
+ neg.f32 %f216, %f393;
237
+ selp.f32 %f217, %f216, %f12, %p11;
238
+ fma.rn.ftz.f32 %f394, %f215, %f217, %f217;
239
+ @%p12 bra $L__BB0_16;
240
+ ex2.approx.ftz.f32 %f218, %f394;
241
+ sub.f32 %f220, %f358, %f218;
242
+ mov.b32 %r31, %f220;
243
+ mov.b32 %r32, %f12;
244
+ and.b32 %r33, %r32, -2147483648;
245
+ or.b32 %r34, %r33, %r31;
246
+ mov.b32 %f394, %r34;
247
+ $L__BB0_16:
248
+ .loc 1 0 0
249
+ cvt.u16.u32 %rs7, %r5;
250
+ mov.b32 %f6, %r11;
251
+ .loc 1 30 23
252
+ abs.ftz.f32 %f69, %f13;
253
+ setp.ge.f32 %p14, %f69, 0f3F8060FE;
254
+ mov.f32 %f401, 0f3789CA3C;
255
+ mov.f32 %f400, 0fB9F560B9;
256
+ mov.f32 %f399, 0f3BAC840B;
257
+ mov.f32 %f398, 0fBD0C8162;
258
+ mov.f32 %f397, 0f3E1CF906;
259
+ mov.f32 %f396, 0f3F6A937E;
260
+ mov.f32 %f395, 0f3F20D842;
261
+ mov.f32 %f402, %f69;
262
+ @%p14 bra $L__BB0_18;
263
+ mul.f32 %f402, %f13, %f13;
264
+ mov.f32 %f401, 0f38B1E96A;
265
+ mov.f32 %f400, 0fBA574D20;
266
+ mov.f32 %f399, 0f3BAAD5EA;
267
+ mov.f32 %f398, 0fBCDC1BE7;
268
+ mov.f32 %f397, 0f3DE718AF;
269
+ mov.f32 %f396, 0fBEC093AC;
270
+ mov.f32 %f395, 0f3E0375D3;
271
+ $L__BB0_18:
272
+ .loc 1 0 0
273
+ cvt.f32.bf16 %r12, %rs7;
274
+ mul.f32 %f14, %f6, 0f3F3504F3;
275
+ .loc 1 30 23
276
+ setp.ltu.f32 %p15, %f69, 0f3F8060FE;
277
+ fma.rn.ftz.f32 %f235, %f401, %f402, %f400;
278
+ fma.rn.ftz.f32 %f236, %f235, %f402, %f399;
279
+ fma.rn.ftz.f32 %f237, %f236, %f402, %f398;
280
+ fma.rn.ftz.f32 %f238, %f237, %f402, %f397;
281
+ fma.rn.ftz.f32 %f239, %f238, %f402, %f396;
282
+ fma.rn.ftz.f32 %f240, %f239, %f402, %f395;
283
+ neg.f32 %f241, %f402;
284
+ selp.f32 %f242, %f241, %f13, %p14;
285
+ fma.rn.ftz.f32 %f403, %f240, %f242, %f242;
286
+ @%p15 bra $L__BB0_20;
287
+ ex2.approx.ftz.f32 %f243, %f403;
288
+ sub.f32 %f245, %f358, %f243;
289
+ mov.b32 %r35, %f245;
290
+ mov.b32 %r36, %f13;
291
+ and.b32 %r37, %r36, -2147483648;
292
+ or.b32 %r38, %r37, %r35;
293
+ mov.b32 %f403, %r38;
294
+ $L__BB0_20:
295
+ .loc 1 0 0
296
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
297
+ mov.b32 %f7, %r12;
298
+ .loc 1 30 23
299
+ abs.ftz.f32 %f82, %f14;
300
+ setp.ge.f32 %p17, %f82, 0f3F8060FE;
301
+ mov.f32 %f410, 0f3789CA3C;
302
+ mov.f32 %f409, 0fB9F560B9;
303
+ mov.f32 %f408, 0f3BAC840B;
304
+ mov.f32 %f407, 0fBD0C8162;
305
+ mov.f32 %f406, 0f3E1CF906;
306
+ mov.f32 %f405, 0f3F6A937E;
307
+ mov.f32 %f404, 0f3F20D842;
308
+ mov.f32 %f411, %f82;
309
+ @%p17 bra $L__BB0_22;
310
+ mul.f32 %f411, %f14, %f14;
311
+ mov.f32 %f410, 0f38B1E96A;
312
+ mov.f32 %f409, 0fBA574D20;
313
+ mov.f32 %f408, 0f3BAAD5EA;
314
+ mov.f32 %f407, 0fBCDC1BE7;
315
+ mov.f32 %f406, 0f3DE718AF;
316
+ mov.f32 %f405, 0fBEC093AC;
317
+ mov.f32 %f404, 0f3E0375D3;
318
+ $L__BB0_22:
319
+ .loc 1 0 0
320
+ cvt.f32.bf16 %r13, %rs8;
321
+ mul.f32 %f15, %f7, 0f3F3504F3;
322
+ .loc 1 30 23
323
+ setp.ltu.f32 %p18, %f82, 0f3F8060FE;
324
+ fma.rn.ftz.f32 %f260, %f410, %f411, %f409;
325
+ fma.rn.ftz.f32 %f261, %f260, %f411, %f408;
326
+ fma.rn.ftz.f32 %f262, %f261, %f411, %f407;
327
+ fma.rn.ftz.f32 %f263, %f262, %f411, %f406;
328
+ fma.rn.ftz.f32 %f264, %f263, %f411, %f405;
329
+ fma.rn.ftz.f32 %f265, %f264, %f411, %f404;
330
+ neg.f32 %f266, %f411;
331
+ selp.f32 %f267, %f266, %f14, %p17;
332
+ fma.rn.ftz.f32 %f412, %f265, %f267, %f267;
333
+ @%p18 bra $L__BB0_24;
334
+ ex2.approx.ftz.f32 %f268, %f412;
335
+ sub.f32 %f270, %f358, %f268;
336
+ mov.b32 %r39, %f270;
337
+ mov.b32 %r40, %f14;
338
+ and.b32 %r41, %r40, -2147483648;
339
+ or.b32 %r42, %r41, %r39;
340
+ mov.b32 %f412, %r42;
341
+ $L__BB0_24:
342
+ .loc 1 0 0
343
+ mov.b32 %f8, %r13;
344
+ .loc 1 30 23
345
+ abs.ftz.f32 %f95, %f15;
346
+ setp.ge.f32 %p20, %f95, 0f3F8060FE;
347
+ mov.f32 %f419, 0f3789CA3C;
348
+ mov.f32 %f418, 0fB9F560B9;
349
+ mov.f32 %f417, 0f3BAC840B;
350
+ mov.f32 %f416, 0fBD0C8162;
351
+ mov.f32 %f415, 0f3E1CF906;
352
+ mov.f32 %f414, 0f3F6A937E;
353
+ mov.f32 %f413, 0f3F20D842;
354
+ mov.f32 %f420, %f95;
355
+ @%p20 bra $L__BB0_26;
356
+ mul.f32 %f420, %f15, %f15;
357
+ mov.f32 %f419, 0f38B1E96A;
358
+ mov.f32 %f418, 0fBA574D20;
359
+ mov.f32 %f417, 0f3BAAD5EA;
360
+ mov.f32 %f416, 0fBCDC1BE7;
361
+ mov.f32 %f415, 0f3DE718AF;
362
+ mov.f32 %f414, 0fBEC093AC;
363
+ mov.f32 %f413, 0f3E0375D3;
364
+ $L__BB0_26:
365
+ .loc 1 0 0
366
+ mul.f32 %f16, %f8, 0f3F3504F3;
367
+ .loc 1 30 23
368
+ setp.ltu.f32 %p21, %f95, 0f3F8060FE;
369
+ fma.rn.ftz.f32 %f285, %f419, %f420, %f418;
370
+ fma.rn.ftz.f32 %f286, %f285, %f420, %f417;
371
+ fma.rn.ftz.f32 %f287, %f286, %f420, %f416;
372
+ fma.rn.ftz.f32 %f288, %f287, %f420, %f415;
373
+ fma.rn.ftz.f32 %f289, %f288, %f420, %f414;
374
+ fma.rn.ftz.f32 %f290, %f289, %f420, %f413;
375
+ neg.f32 %f291, %f420;
376
+ selp.f32 %f292, %f291, %f15, %p20;
377
+ fma.rn.ftz.f32 %f421, %f290, %f292, %f292;
378
+ @%p21 bra $L__BB0_28;
379
+ ex2.approx.ftz.f32 %f293, %f421;
380
+ sub.f32 %f295, %f358, %f293;
381
+ mov.b32 %r43, %f295;
382
+ mov.b32 %r44, %f15;
383
+ and.b32 %r45, %r44, -2147483648;
384
+ or.b32 %r46, %r45, %r43;
385
+ mov.b32 %f421, %r46;
386
+ $L__BB0_28:
387
+ abs.ftz.f32 %f108, %f16;
388
+ setp.ge.f32 %p23, %f108, 0f3F8060FE;
389
+ mov.f32 %f428, 0f3789CA3C;
390
+ mov.f32 %f427, 0fB9F560B9;
391
+ mov.f32 %f426, 0f3BAC840B;
392
+ mov.f32 %f425, 0fBD0C8162;
393
+ mov.f32 %f424, 0f3E1CF906;
394
+ mov.f32 %f423, 0f3F6A937E;
395
+ mov.f32 %f422, 0f3F20D842;
396
+ mov.f32 %f429, %f108;
397
+ @%p23 bra $L__BB0_30;
398
+ mul.f32 %f429, %f16, %f16;
399
+ mov.f32 %f428, 0f38B1E96A;
400
+ mov.f32 %f427, 0fBA574D20;
401
+ mov.f32 %f426, 0f3BAAD5EA;
402
+ mov.f32 %f425, 0fBCDC1BE7;
403
+ mov.f32 %f424, 0f3DE718AF;
404
+ mov.f32 %f423, 0fBEC093AC;
405
+ mov.f32 %f422, 0f3E0375D3;
406
+ $L__BB0_30:
407
+ .loc 1 0 23
408
+ ld.param.u64 %rd2, [triton__0d1d2de_param_1];
409
+ cvt.s64.s32 %rd1, %r18;
410
+ .loc 1 30 23
411
+ setp.ltu.f32 %p24, %f108, 0f3F8060FE;
412
+ fma.rn.ftz.f32 %f310, %f428, %f429, %f427;
413
+ fma.rn.ftz.f32 %f311, %f310, %f429, %f426;
414
+ fma.rn.ftz.f32 %f312, %f311, %f429, %f425;
415
+ fma.rn.ftz.f32 %f313, %f312, %f429, %f424;
416
+ fma.rn.ftz.f32 %f314, %f313, %f429, %f423;
417
+ fma.rn.ftz.f32 %f315, %f314, %f429, %f422;
418
+ neg.f32 %f316, %f429;
419
+ selp.f32 %f317, %f316, %f16, %p23;
420
+ fma.rn.ftz.f32 %f430, %f315, %f317, %f317;
421
+ @%p24 bra $L__BB0_32;
422
+ ex2.approx.ftz.f32 %f318, %f430;
423
+ sub.f32 %f320, %f358, %f318;
424
+ mov.b32 %r47, %f320;
425
+ mov.b32 %r48, %f16;
426
+ and.b32 %r49, %r48, -2147483648;
427
+ or.b32 %r50, %r49, %r47;
428
+ mov.b32 %f430, %r50;
429
+ $L__BB0_32:
430
+ .loc 1 27 18
431
+ mul.f32 %f321, %f8, 0f3F000000;
432
+ mul.f32 %f322, %f7, 0f3F000000;
433
+ mul.f32 %f323, %f6, 0f3F000000;
434
+ mul.f32 %f324, %f5, 0f3F000000;
435
+ mul.f32 %f325, %f4, 0f3F000000;
436
+ mul.f32 %f326, %f3, 0f3F000000;
437
+ mul.f32 %f327, %f2, 0f3F000000;
438
+ mul.f32 %f328, %f1, 0f3F000000;
439
+ .loc 1 32 18
440
+ add.f32 %f329, %f367, 0f3F800000;
441
+ add.f32 %f330, %f376, 0f3F800000;
442
+ add.f32 %f331, %f385, 0f3F800000;
443
+ add.f32 %f332, %f394, 0f3F800000;
444
+ add.f32 %f333, %f403, 0f3F800000;
445
+ add.f32 %f334, %f412, 0f3F800000;
446
+ add.f32 %f335, %f421, 0f3F800000;
447
+ add.f32 %f336, %f430, 0f3F800000;
448
+ .loc 1 33 18
449
+ mul.f32 %f337, %f328, %f329;
450
+ mul.f32 %f338, %f327, %f330;
451
+ mul.f32 %f339, %f326, %f331;
452
+ mul.f32 %f340, %f325, %f332;
453
+ mul.f32 %f341, %f324, %f333;
454
+ mul.f32 %f342, %f323, %f334;
455
+ mul.f32 %f343, %f322, %f335;
456
+ mul.f32 %f344, %f321, %f336;
457
+ .loc 1 35 25
458
+ shl.b64 %rd7, %rd1, 1;
459
+ add.s64 %rd6, %rd2, %rd7;
460
+ .loc 1 35 37
461
+ mov.b32 %r51, %f337;
462
+ cvt.rn.bf16.f32 %rs9, %r51;
463
+ mov.b32 %r52, %f338;
464
+ cvt.rn.bf16.f32 %rs10, %r52;
465
+ mov.b32 %r53, %f339;
466
+ cvt.rn.bf16.f32 %rs11, %r53;
467
+ mov.b32 %r54, %f340;
468
+ cvt.rn.bf16.f32 %rs12, %r54;
469
+ mov.b32 %r55, %f341;
470
+ cvt.rn.bf16.f32 %rs13, %r55;
471
+ mov.b32 %r56, %f342;
472
+ cvt.rn.bf16.f32 %rs14, %r56;
473
+ mov.b32 %r57, %f343;
474
+ cvt.rn.bf16.f32 %rs15, %r57;
475
+ mov.b32 %r58, %f344;
476
+ cvt.rn.bf16.f32 %rs16, %r58;
477
+ mov.b32 %r63, {%rs9, %rs10};
478
+ mov.b32 %r64, {%rs11, %rs12};
479
+ mov.b32 %r65, {%rs13, %rs14};
480
+ mov.b32 %r66, {%rs15, %rs16};
481
+ @%p1 st.global.v4.b32 [ %rd6 + 0 ], { %r63, %r64, %r65, %r66 };
482
+ .loc 1 35 4
483
+ ret;
484
+ $L__tmp1:
485
+ $L__func_end0:
486
+
487
+ }
488
+ // .globl __nv_erff
489
+ .visible .func (.param .b32 func_retval0) __nv_erff(
490
+ .param .b32 __nv_erff_param_0
491
+ )
492
+ {
493
+ .reg .pred %p<4>;
494
+ .reg .b32 %r<5>;
495
+ .reg .f32 %f<49>;
496
+ $L__func_begin1:
497
+
498
+ ld.param.f32 %f14, [__nv_erff_param_0];
499
+ abs.ftz.f32 %f1, %f14;
500
+ setp.ge.f32 %p1, %f1, 0f3F8060FE;
501
+ mov.f32 %f46, 0f3789CA3C;
502
+ mov.f32 %f45, 0fB9F560B9;
503
+ mov.f32 %f44, 0f3BAC840B;
504
+ mov.f32 %f43, 0fBD0C8162;
505
+ mov.f32 %f42, 0f3E1CF906;
506
+ mov.f32 %f41, 0f3F6A937E;
507
+ mov.f32 %f40, 0f3F20D842;
508
+ mov.f32 %f47, %f1;
509
+ @%p1 bra $L__BB1_2;
510
+ mul.f32 %f47, %f14, %f14;
511
+ mov.f32 %f46, 0f38B1E96A;
512
+ mov.f32 %f45, 0fBA574D20;
513
+ mov.f32 %f44, 0f3BAAD5EA;
514
+ mov.f32 %f43, 0fBCDC1BE7;
515
+ mov.f32 %f42, 0f3DE718AF;
516
+ mov.f32 %f41, 0fBEC093AC;
517
+ mov.f32 %f40, 0f3E0375D3;
518
+ $L__BB1_2:
519
+ setp.ltu.f32 %p2, %f1, 0f3F8060FE;
520
+ fma.rn.ftz.f32 %f29, %f46, %f47, %f45;
521
+ fma.rn.ftz.f32 %f30, %f29, %f47, %f44;
522
+ fma.rn.ftz.f32 %f31, %f30, %f47, %f43;
523
+ fma.rn.ftz.f32 %f32, %f31, %f47, %f42;
524
+ fma.rn.ftz.f32 %f33, %f32, %f47, %f41;
525
+ fma.rn.ftz.f32 %f34, %f33, %f47, %f40;
526
+ neg.f32 %f35, %f47;
527
+ selp.f32 %f36, %f35, %f14, %p1;
528
+ fma.rn.ftz.f32 %f48, %f34, %f36, %f36;
529
+ @%p2 bra $L__BB1_4;
530
+ ex2.approx.ftz.f32 %f37, %f48;
531
+ mov.f32 %f38, 0f3F800000;
532
+ sub.f32 %f39, %f38, %f37;
533
+ mov.b32 %r1, %f39;
534
+ mov.b32 %r2, %f14;
535
+ and.b32 %r3, %r2, -2147483648;
536
+ or.b32 %r4, %r3, %r1;
537
+ mov.b32 %f48, %r4;
538
+ $L__BB1_4:
539
+ st.param.f32 [func_retval0+0], %f48;
540
+ ret;
541
+ $L__func_end1:
542
+
543
+ }
544
+ .file 1 "/tmp/torchinductor_root/jf/cjfoqo3nutni5cmtw4brla34cz45fusadehkxfkr2fie2qgo7vwt.py"
545
+ .section .debug_abbrev
546
+ {
547
+ .b8 1
548
+ .b8 17
549
+ .b8 1
550
+ .b8 37
551
+ .b8 8
552
+ .b8 19
553
+ .b8 5
554
+ .b8 3
555
+ .b8 8
556
+ .b8 16
557
+ .b8 6
558
+ .b8 27
559
+ .b8 8
560
+ .b8 180
561
+ .b8 66
562
+ .b8 12
563
+ .b8 17
564
+ .b8 1
565
+ .b8 18
566
+ .b8 1
567
+ .b8 0
568
+ .b8 0
569
+ .b8 2
570
+ .b8 46
571
+ .b8 0
572
+ .b8 17
573
+ .b8 1
574
+ .b8 18
575
+ .b8 1
576
+ .b8 64
577
+ .b8 10
578
+ .b8 135
579
+ .b8 64
580
+ .b8 8
581
+ .b8 3
582
+ .b8 8
583
+ .b8 58
584
+ .b8 11
585
+ .b8 59
586
+ .b8 11
587
+ .b8 63
588
+ .b8 12
589
+ .b8 0
590
+ .b8 0
591
+ .b8 0
592
+ }
593
+ .section .debug_info
594
+ {
595
+ .b32 176
596
+ .b8 2
597
+ .b8 0
598
+ .b32 .debug_abbrev
599
+ .b8 8
600
+ .b8 1
601
+ .b8 116
602
+ .b8 114
603
+ .b8 105
604
+ .b8 116
605
+ .b8 111
606
+ .b8 110
607
+ .b8 0
608
+ .b8 2
609
+ .b8 0
610
+ .b8 99
611
+ .b8 106
612
+ .b8 102
613
+ .b8 111
614
+ .b8 113
615
+ .b8 111
616
+ .b8 51
617
+ .b8 110
618
+ .b8 117
619
+ .b8 116
620
+ .b8 110
621
+ .b8 105
622
+ .b8 53
623
+ .b8 99
624
+ .b8 109
625
+ .b8 116
626
+ .b8 119
627
+ .b8 52
628
+ .b8 98
629
+ .b8 114
630
+ .b8 108
631
+ .b8 97
632
+ .b8 51
633
+ .b8 52
634
+ .b8 99
635
+ .b8 122
636
+ .b8 52
637
+ .b8 53
638
+ .b8 102
639
+ .b8 117
640
+ .b8 115
641
+ .b8 97
642
+ .b8 100
643
+ .b8 101
644
+ .b8 104
645
+ .b8 107
646
+ .b8 120
647
+ .b8 102
648
+ .b8 107
649
+ .b8 114
650
+ .b8 50
651
+ .b8 102
652
+ .b8 105
653
+ .b8 101
654
+ .b8 50
655
+ .b8 113
656
+ .b8 103
657
+ .b8 111
658
+ .b8 55
659
+ .b8 118
660
+ .b8 119
661
+ .b8 116
662
+ .b8 46
663
+ .b8 112
664
+ .b8 121
665
+ .b8 0
666
+ .b32 .debug_line
667
+ .b8 47
668
+ .b8 116
669
+ .b8 109
670
+ .b8 112
671
+ .b8 47
672
+ .b8 116
673
+ .b8 111
674
+ .b8 114
675
+ .b8 99
676
+ .b8 104
677
+ .b8 105
678
+ .b8 110
679
+ .b8 100
680
+ .b8 117
681
+ .b8 99
682
+ .b8 116
683
+ .b8 111
684
+ .b8 114
685
+ .b8 95
686
+ .b8 114
687
+ .b8 111
688
+ .b8 111
689
+ .b8 116
690
+ .b8 47
691
+ .b8 106
692
+ .b8 102
693
+ .b8 0
694
+ .b8 1
695
+ .b64 $L__func_begin0
696
+ .b64 $L__func_end0
697
+ .b8 2
698
+ .b64 $L__func_begin0
699
+ .b64 $L__func_end0
700
+ .b8 1
701
+ .b8 156
702
+ .b8 116
703
+ .b8 114
704
+ .b8 105
705
+ .b8 116
706
+ .b8 111
707
+ .b8 110
708
+ .b8 95
709
+ .b8 95
710
+ .b8 48
711
+ .b8 100
712
+ .b8 49
713
+ .b8 100
714
+ .b8 50
715
+ .b8 100
716
+ .b8 101
717
+ .b8 0
718
+ .b8 116
719
+ .b8 114
720
+ .b8 105
721
+ .b8 116
722
+ .b8 111
723
+ .b8 110
724
+ .b8 95
725
+ .b8 95
726
+ .b8 48
727
+ .b8 100
728
+ .b8 49
729
+ .b8 100
730
+ .b8 50
731
+ .b8 100
732
+ .b8 101
733
+ .b8 0
734
+ .b8 1
735
+ .b8 18
736
+ .b8 1
737
+ .b8 0
738
+ }
739
+ .section .debug_pubnames
740
+ {
741
+ .b32 $L__pubNames_end0-$L__pubNames_start0
742
+ $L__pubNames_start0:
743
+ .b8 2
744
+ .b8 0
745
+ .b32 .debug_info
746
+ .b32 180
747
+ .b32 125
748
+ .b8 116
749
+ .b8 114
750
+ .b8 105
751
+ .b8 116
752
+ .b8 111
753
+ .b8 110
754
+ .b8 95
755
+ .b8 95
756
+ .b8 48
757
+ .b8 100
758
+ .b8 49
759
+ .b8 100
760
+ .b8 50
761
+ .b8 100
762
+ .b8 101
763
+ .b8 0
764
+ .b32 0
765
+ $L__pubNames_end0:
766
+ }
767
+ .section .debug_pubtypes
768
+ {
769
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
770
+ $L__pubTypes_start0:
771
+ .b8 2
772
+ .b8 0
773
+ .b32 .debug_info
774
+ .b32 180
775
+ .b32 0
776
+ $L__pubTypes_end0:
777
+ }
778
+ .section .debug_loc { }
.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ttir ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<1.000000e+00> : tensor<1024xf32>
4
+ %cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32>
5
+ %cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32>
6
+ %c1024_i32 = arith.constant 1024 : i32
7
+ %0 = tt.get_program_id x : i32
8
+ %1 = arith.muli %0, %c1024_i32 : i32
9
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
10
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32>
11
+ %4 = arith.addi %3, %2 : tensor<1024xi32>
12
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
13
+ %6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
14
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
15
+ %8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
16
+ %9 = arith.mulf %8, %cst_1 : tensor<1024xf32>
17
+ %10 = arith.mulf %8, %cst_0 : tensor<1024xf32>
18
+ %11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32>
19
+ %12 = arith.addf %11, %cst : tensor<1024xf32>
20
+ %13 = arith.mulf %9, %12 : tensor<1024xf32>
21
+ %14 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
22
+ %15 = tt.addptr %14, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
23
+ %16 = arith.truncf %13 : tensor<1024xf32> to tensor<1024xbf16>
24
+ tt.store %15, %16 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
25
+ tt.return
26
+ }
27
+ }
.triton/dump/51e329eae41e4ee17aa201fff8371d94/triton_.llir ADDED
The diff for this file is too large to render. See raw diff
 
.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttir ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c512_i32 = arith.constant 512 : i32
4
+ %c256_i32 = arith.constant 256 : i32
5
+ %cst = arith.constant 0.000000e+00 : f32
6
+ %cst_0 = arith.constant 2.560000e+02 : f32
7
+ %cst_1 = arith.constant 9.99999974E-6 : f32
8
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32>
9
+ %cst_3 = arith.constant dense<256> : tensor<1xi64>
10
+ %cst_4 = arith.constant dense<50257> : tensor<1xi64>
11
+ %cst_5 = arith.constant dense<0> : tensor<1xi64>
12
+ %cst_6 = arith.constant dense<256> : tensor<256xi32>
13
+ %0 = tt.get_program_id x : i32
14
+ %1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
15
+ %2 = arith.cmpi slt, %1, %cst_6 : tensor<256xi32>
16
+ %3 = arith.remsi %0, %c512_i32 : i32
17
+ %4 = tt.addptr %arg1, %0 : !tt.ptr<i64, 1>, i32
18
+ %5 = tt.splat %4 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>>
19
+ %6 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64>
20
+ %7 = arith.muli %3, %c256_i32 : i32
21
+ %8 = tt.splat %7 : (i32) -> tensor<256xi32>
22
+ %9 = arith.addi %1, %8 : tensor<256xi32>
23
+ %10 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
24
+ %11 = tt.addptr %10, %9 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
25
+ %12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
26
+ %13 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
27
+ %14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
28
+ %15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
29
+ %16 = arith.addi %6, %cst_4 : tensor<1xi64>
30
+ %17 = arith.cmpi slt, %6, %cst_5 : tensor<1xi64>
31
+ %18 = arith.select %17, %16, %6 : tensor<1xi1>, tensor<1xi64>
32
+ %19 = arith.cmpi sge, %18, %cst_5 : tensor<1xi64>
33
+ %20 = arith.cmpi slt, %18, %cst_4 : tensor<1xi64>
34
+ %21 = arith.andi %19, %20 : tensor<1xi1>
35
+ tt.assert %21, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1xi1>
36
+ %22 = arith.muli %18, %cst_3 : tensor<1xi64>
37
+ %23 = tt.broadcast %22 : (tensor<1xi64>) -> tensor<256xi64>
38
+ %24 = arith.extsi %1 : tensor<256xi32> to tensor<256xi64>
39
+ %25 = arith.addi %24, %23 : tensor<256xi64>
40
+ %26 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
41
+ %27 = tt.addptr %26, %25 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi64>
42
+ %28 = tt.load %27, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
43
+ %29 = arith.addf %28, %12 : tensor<256xf32>
44
+ %30 = arith.select %2, %29, %cst_2 : tensor<256xi1>, tensor<256xf32>
45
+ %31 = "tt.reduce"(%30) <{axis = 0 : i32}> ({
46
+ ^bb0(%arg10: f32, %arg11: f32):
47
+ %60 = arith.addf %arg10, %arg11 : f32
48
+ tt.reduce.return %60 : f32
49
+ }) : (tensor<256xf32>) -> f32
50
+ %32 = arith.addf %31, %cst : f32
51
+ %33 = arith.divf %32, %cst_0 : f32
52
+ %34 = tt.splat %33 : (f32) -> tensor<1xf32>
53
+ %35 = tt.splat %33 : (f32) -> tensor<256xf32>
54
+ %36 = arith.subf %29, %35 : tensor<256xf32>
55
+ %37 = arith.mulf %36, %36 : tensor<256xf32>
56
+ %38 = arith.select %2, %37, %cst_2 : tensor<256xi1>, tensor<256xf32>
57
+ %39 = "tt.reduce"(%38) <{axis = 0 : i32}> ({
58
+ ^bb0(%arg10: f32, %arg11: f32):
59
+ %60 = arith.addf %arg10, %arg11 : f32
60
+ tt.reduce.return %60 : f32
61
+ }) : (tensor<256xf32>) -> f32
62
+ %40 = arith.addf %39, %cst : f32
63
+ %41 = arith.divf %40, %cst_0 : f32
64
+ %42 = arith.addf %41, %cst_1 : f32
65
+ %43 = tt.extern_elementwise %42 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
66
+ %44 = tt.splat %43 : (f32) -> tensor<1xf32>
67
+ %45 = tt.splat %43 : (f32) -> tensor<256xf32>
68
+ %46 = arith.mulf %36, %45 : tensor<256xf32>
69
+ %47 = arith.mulf %46, %15 : tensor<256xf32>
70
+ %48 = arith.muli %0, %c256_i32 : i32
71
+ %49 = tt.splat %48 : (i32) -> tensor<256xi32>
72
+ %50 = arith.addi %1, %49 : tensor<256xi32>
73
+ %51 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
74
+ %52 = tt.addptr %51, %50 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
75
+ tt.store %52, %29, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
76
+ gpu.barrier
77
+ %53 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
78
+ %54 = tt.splat %53 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
79
+ tt.store %54, %44 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
80
+ %55 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
81
+ %56 = tt.addptr %55, %50 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
82
+ %57 = arith.truncf %47 : tensor<256xf32> to tensor<256xbf16>
83
+ tt.store %56, %57, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
84
+ %58 = tt.addptr %arg6, %0 : !tt.ptr<f32, 1>, i32
85
+ %59 = tt.splat %58 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
86
+ tt.store %59, %34 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
87
+ tt.return
88
+ }
89
+ }
.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttgir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<512> : tensor<128xi32, #blocked>
5
+ %c128_i32 = arith.constant 128 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c128_i32 : i32
8
+ %2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked>
9
+ %3 = tt.splat %1 : (i32) -> tensor<128xi32, #blocked>
10
+ %4 = arith.addi %3, %2 : tensor<128xi32, #blocked>
11
+ %5 = arith.cmpi slt, %4, %cst : tensor<128xi32, #blocked>
12
+ %6 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<128x!tt.ptr<i64, 1>, #blocked>
13
+ %7 = tt.addptr %6, %4 : tensor<128x!tt.ptr<i64, 1>, #blocked>, tensor<128xi32, #blocked>
14
+ %8 = arith.extsi %4 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked>
15
+ tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<128xi64, #blocked>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttir ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %cst = arith.constant dense<512> : tensor<256xi32>
4
+ %c256_i32 = arith.constant 256 : i32
5
+ %0 = tt.get_program_id x : i32
6
+ %1 = arith.muli %0, %c256_i32 : i32
7
+ %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
8
+ %3 = tt.splat %1 : (i32) -> tensor<256xi32>
9
+ %4 = arith.addi %3, %2 : tensor<256xi32>
10
+ %5 = arith.cmpi slt, %4, %cst : tensor<256xi32>
11
+ %6 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<256x!tt.ptr<i64, 1>>
12
+ %7 = tt.addptr %6, %4 : tensor<256x!tt.ptr<i64, 1>>, tensor<256xi32>
13
+ %8 = arith.extsi %4 : tensor<256xi32> to tensor<256xi64>
14
+ tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<256xi64>
15
+ tt.return
16
+ }
17
+ }
.triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin ADDED
Binary file (28.6 kB). View file
 
.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ptx ADDED
@@ -0,0 +1,861 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+ .global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
11
+
12
+ .visible .entry triton__0d1d2de(
13
+ .param .u64 triton__0d1d2de_param_0,
14
+ .param .u64 triton__0d1d2de_param_1,
15
+ .param .u32 triton__0d1d2de_param_2
16
+ )
17
+ .maxntid 128, 1, 1
18
+ {
19
+ .reg .pred %p<28>;
20
+ .reg .b16 %rs<25>;
21
+ .reg .b32 %r<79>;
22
+ .reg .f32 %f<487>;
23
+ .reg .b64 %rd<8>;
24
+ .loc 1 18 0
25
+ $L__func_begin0:
26
+ .loc 1 18 0
27
+
28
+ ld.param.u64 %rd4, [triton__0d1d2de_param_0];
29
+ ld.param.u64 %rd5, [triton__0d1d2de_param_1];
30
+ $L__tmp0:
31
+ .loc 1 21 36
32
+ mov.u32 %r26, %tid.x;
33
+ shl.b32 %r27, %r26, 3;
34
+ and.b32 %r28, %r27, 1016;
35
+ .loc 1 20 28
36
+ mov.u32 %r1, %ctaid.x;
37
+ .loc 1 20 33
38
+ shl.b32 %r29, %r1, 10;
39
+ .loc 1 21 23
40
+ or.b32 %r30, %r29, %r28;
41
+ .loc 1 24 34
42
+ mul.wide.s32 %rd6, %r30, 2;
43
+ add.s64 %rd7, %rd4, %rd6;
44
+ mov.pred %p1, -1;
45
+ .loc 1 24 39
46
+ mov.u32 %r2, 0x0;
47
+ mov.u32 %r3, 0x0;
48
+ mov.u32 %r4, 0x0;
49
+ mov.u32 %r5, 0x0;
50
+ @%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd7 + 0 ];
51
+ .loc 1 25 30
52
+ add.s64 %rd3, %rd5, %rd6;
53
+ .loc 1 25 35
54
+ mov.u32 %r14, 0x0;
55
+ mov.u32 %r15, 0x0;
56
+ mov.u32 %r16, 0x0;
57
+ mov.u32 %r17, 0x0;
58
+ @%p1 ld.global.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd3 + 0 ];
59
+ cvt.u16.u32 %rs9, %r14;
60
+ { .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r14; }
61
+ cvt.u16.u32 %rs11, %r15;
62
+ .loc 1 25 44
63
+ cvt.f32.bf16 %r18, %rs9;
64
+ mov.b32 %f9, %r18;
65
+ cvt.f32.bf16 %r19, %rs10;
66
+ mov.b32 %f10, %r19;
67
+ .loc 1 29 18
68
+ mul.f32 %f17, %f9, 0f3F3504F3;
69
+ .loc 1 30 23
70
+ abs.ftz.f32 %f25, %f17;
71
+ setp.ge.f32 %p3, %f25, 0f3F8060FE;
72
+ mov.f32 %f421, 0f3789CA3C;
73
+ mov.f32 %f420, 0fB9F560B9;
74
+ mov.f32 %f419, 0f3BAC840B;
75
+ mov.f32 %f418, 0fBD0C8162;
76
+ mov.f32 %f417, 0f3E1CF906;
77
+ mov.f32 %f416, 0f3F6A937E;
78
+ mov.f32 %f415, 0f3F20D842;
79
+ mov.f32 %f422, %f25;
80
+ @%p3 bra $L__BB0_2;
81
+ .loc 1 0 23
82
+ mov.f32 %f421, 0f38B1E96A;
83
+ mov.f32 %f420, 0fBA574D20;
84
+ mov.f32 %f419, 0f3BAAD5EA;
85
+ mov.f32 %f418, 0fBCDC1BE7;
86
+ mov.f32 %f417, 0f3DE718AF;
87
+ mov.f32 %f416, 0fBEC093AC;
88
+ mov.f32 %f415, 0f3E0375D3;
89
+ .loc 1 30 23
90
+ mul.f32 %f422, %f17, %f17;
91
+ $L__BB0_2:
92
+ .loc 1 0 0
93
+ cvt.f32.bf16 %r20, %rs11;
94
+ mul.f32 %f18, %f10, 0f3F3504F3;
95
+ .loc 1 30 23
96
+ setp.ltu.f32 %p4, %f25, 0f3F8060FE;
97
+ fma.rn.ftz.f32 %f143, %f421, %f422, %f420;
98
+ fma.rn.ftz.f32 %f144, %f143, %f422, %f419;
99
+ fma.rn.ftz.f32 %f145, %f144, %f422, %f418;
100
+ fma.rn.ftz.f32 %f146, %f145, %f422, %f417;
101
+ fma.rn.ftz.f32 %f147, %f146, %f422, %f416;
102
+ fma.rn.ftz.f32 %f148, %f147, %f422, %f415;
103
+ neg.f32 %f149, %f422;
104
+ selp.f32 %f150, %f149, %f17, %p3;
105
+ fma.rn.ftz.f32 %f423, %f148, %f150, %f150;
106
+ mov.f32 %f414, 0f3F800000;
107
+ @%p4 bra $L__BB0_4;
108
+ ex2.approx.ftz.f32 %f151, %f423;
109
+ sub.f32 %f153, %f414, %f151;
110
+ mov.b32 %r31, %f153;
111
+ mov.b32 %r32, %f17;
112
+ and.b32 %r33, %r32, -2147483648;
113
+ or.b32 %r34, %r33, %r31;
114
+ mov.b32 %f423, %r34;
115
+ $L__BB0_4:
116
+ .loc 1 0 0
117
+ { .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r15; }
118
+ mov.b32 %f11, %r20;
119
+ .loc 1 30 23
120
+ abs.ftz.f32 %f38, %f18;
121
+ setp.ge.f32 %p6, %f38, 0f3F8060FE;
122
+ mov.f32 %f430, 0f3789CA3C;
123
+ mov.f32 %f429, 0fB9F560B9;
124
+ mov.f32 %f428, 0f3BAC840B;
125
+ mov.f32 %f427, 0fBD0C8162;
126
+ mov.f32 %f426, 0f3E1CF906;
127
+ mov.f32 %f425, 0f3F6A937E;
128
+ mov.f32 %f424, 0f3F20D842;
129
+ mov.f32 %f431, %f38;
130
+ @%p6 bra $L__BB0_6;
131
+ mul.f32 %f431, %f18, %f18;
132
+ mov.f32 %f430, 0f38B1E96A;
133
+ mov.f32 %f429, 0fBA574D20;
134
+ mov.f32 %f428, 0f3BAAD5EA;
135
+ mov.f32 %f427, 0fBCDC1BE7;
136
+ mov.f32 %f426, 0f3DE718AF;
137
+ mov.f32 %f425, 0fBEC093AC;
138
+ mov.f32 %f424, 0f3E0375D3;
139
+ $L__BB0_6:
140
+ .loc 1 0 0
141
+ cvt.f32.bf16 %r21, %rs12;
142
+ mul.f32 %f19, %f11, 0f3F3504F3;
143
+ .loc 1 30 23
144
+ setp.ltu.f32 %p7, %f38, 0f3F8060FE;
145
+ fma.rn.ftz.f32 %f168, %f430, %f431, %f429;
146
+ fma.rn.ftz.f32 %f169, %f168, %f431, %f428;
147
+ fma.rn.ftz.f32 %f170, %f169, %f431, %f427;
148
+ fma.rn.ftz.f32 %f171, %f170, %f431, %f426;
149
+ fma.rn.ftz.f32 %f172, %f171, %f431, %f425;
150
+ fma.rn.ftz.f32 %f173, %f172, %f431, %f424;
151
+ neg.f32 %f174, %f431;
152
+ selp.f32 %f175, %f174, %f18, %p6;
153
+ fma.rn.ftz.f32 %f432, %f173, %f175, %f175;
154
+ @%p7 bra $L__BB0_8;
155
+ ex2.approx.ftz.f32 %f176, %f432;
156
+ sub.f32 %f178, %f414, %f176;
157
+ mov.b32 %r35, %f178;
158
+ mov.b32 %r36, %f18;
159
+ and.b32 %r37, %r36, -2147483648;
160
+ or.b32 %r38, %r37, %r35;
161
+ mov.b32 %f432, %r38;
162
+ $L__BB0_8:
163
+ .loc 1 0 0
164
+ cvt.u16.u32 %rs13, %r16;
165
+ mov.b32 %f12, %r21;
166
+ .loc 1 30 23
167
+ abs.ftz.f32 %f51, %f19;
168
+ setp.ge.f32 %p9, %f51, 0f3F8060FE;
169
+ mov.f32 %f439, 0f3789CA3C;
170
+ mov.f32 %f438, 0fB9F560B9;
171
+ mov.f32 %f437, 0f3BAC840B;
172
+ mov.f32 %f436, 0fBD0C8162;
173
+ mov.f32 %f435, 0f3E1CF906;
174
+ mov.f32 %f434, 0f3F6A937E;
175
+ mov.f32 %f433, 0f3F20D842;
176
+ mov.f32 %f440, %f51;
177
+ @%p9 bra $L__BB0_10;
178
+ mul.f32 %f440, %f19, %f19;
179
+ mov.f32 %f439, 0f38B1E96A;
180
+ mov.f32 %f438, 0fBA574D20;
181
+ mov.f32 %f437, 0f3BAAD5EA;
182
+ mov.f32 %f436, 0fBCDC1BE7;
183
+ mov.f32 %f435, 0f3DE718AF;
184
+ mov.f32 %f434, 0fBEC093AC;
185
+ mov.f32 %f433, 0f3E0375D3;
186
+ $L__BB0_10:
187
+ .loc 1 0 0
188
+ cvt.f32.bf16 %r22, %rs13;
189
+ mul.f32 %f20, %f12, 0f3F3504F3;
190
+ .loc 1 30 23
191
+ setp.ltu.f32 %p10, %f51, 0f3F8060FE;
192
+ fma.rn.ftz.f32 %f193, %f439, %f440, %f438;
193
+ fma.rn.ftz.f32 %f194, %f193, %f440, %f437;
194
+ fma.rn.ftz.f32 %f195, %f194, %f440, %f436;
195
+ fma.rn.ftz.f32 %f196, %f195, %f440, %f435;
196
+ fma.rn.ftz.f32 %f197, %f196, %f440, %f434;
197
+ fma.rn.ftz.f32 %f198, %f197, %f440, %f433;
198
+ neg.f32 %f199, %f440;
199
+ selp.f32 %f200, %f199, %f19, %p9;
200
+ fma.rn.ftz.f32 %f441, %f198, %f200, %f200;
201
+ @%p10 bra $L__BB0_12;
202
+ ex2.approx.ftz.f32 %f201, %f441;
203
+ sub.f32 %f203, %f414, %f201;
204
+ mov.b32 %r39, %f203;
205
+ mov.b32 %r40, %f19;
206
+ and.b32 %r41, %r40, -2147483648;
207
+ or.b32 %r42, %r41, %r39;
208
+ mov.b32 %f441, %r42;
209
+ $L__BB0_12:
210
+ .loc 1 0 0
211
+ { .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r16; }
212
+ mov.b32 %f13, %r22;
213
+ .loc 1 30 23
214
+ abs.ftz.f32 %f64, %f20;
215
+ setp.ge.f32 %p12, %f64, 0f3F8060FE;
216
+ mov.f32 %f448, 0f3789CA3C;
217
+ mov.f32 %f447, 0fB9F560B9;
218
+ mov.f32 %f446, 0f3BAC840B;
219
+ mov.f32 %f445, 0fBD0C8162;
220
+ mov.f32 %f444, 0f3E1CF906;
221
+ mov.f32 %f443, 0f3F6A937E;
222
+ mov.f32 %f442, 0f3F20D842;
223
+ mov.f32 %f449, %f64;
224
+ @%p12 bra $L__BB0_14;
225
+ mul.f32 %f449, %f20, %f20;
226
+ mov.f32 %f448, 0f38B1E96A;
227
+ mov.f32 %f447, 0fBA574D20;
228
+ mov.f32 %f446, 0f3BAAD5EA;
229
+ mov.f32 %f445, 0fBCDC1BE7;
230
+ mov.f32 %f444, 0f3DE718AF;
231
+ mov.f32 %f443, 0fBEC093AC;
232
+ mov.f32 %f442, 0f3E0375D3;
233
+ $L__BB0_14:
234
+ .loc 1 0 0
235
+ cvt.f32.bf16 %r23, %rs14;
236
+ mul.f32 %f21, %f13, 0f3F3504F3;
237
+ .loc 1 30 23
238
+ setp.ltu.f32 %p13, %f64, 0f3F8060FE;
239
+ fma.rn.ftz.f32 %f218, %f448, %f449, %f447;
240
+ fma.rn.ftz.f32 %f219, %f218, %f449, %f446;
241
+ fma.rn.ftz.f32 %f220, %f219, %f449, %f445;
242
+ fma.rn.ftz.f32 %f221, %f220, %f449, %f444;
243
+ fma.rn.ftz.f32 %f222, %f221, %f449, %f443;
244
+ fma.rn.ftz.f32 %f223, %f222, %f449, %f442;
245
+ neg.f32 %f224, %f449;
246
+ selp.f32 %f225, %f224, %f20, %p12;
247
+ fma.rn.ftz.f32 %f450, %f223, %f225, %f225;
248
+ @%p13 bra $L__BB0_16;
249
+ ex2.approx.ftz.f32 %f226, %f450;
250
+ sub.f32 %f228, %f414, %f226;
251
+ mov.b32 %r43, %f228;
252
+ mov.b32 %r44, %f20;
253
+ and.b32 %r45, %r44, -2147483648;
254
+ or.b32 %r46, %r45, %r43;
255
+ mov.b32 %f450, %r46;
256
+ $L__BB0_16:
257
+ .loc 1 0 0
258
+ cvt.u16.u32 %rs15, %r17;
259
+ mov.b32 %f14, %r23;
260
+ .loc 1 30 23
261
+ abs.ftz.f32 %f77, %f21;
262
+ setp.ge.f32 %p15, %f77, 0f3F8060FE;
263
+ mov.f32 %f457, 0f3789CA3C;
264
+ mov.f32 %f456, 0fB9F560B9;
265
+ mov.f32 %f455, 0f3BAC840B;
266
+ mov.f32 %f454, 0fBD0C8162;
267
+ mov.f32 %f453, 0f3E1CF906;
268
+ mov.f32 %f452, 0f3F6A937E;
269
+ mov.f32 %f451, 0f3F20D842;
270
+ mov.f32 %f458, %f77;
271
+ @%p15 bra $L__BB0_18;
272
+ mul.f32 %f458, %f21, %f21;
273
+ mov.f32 %f457, 0f38B1E96A;
274
+ mov.f32 %f456, 0fBA574D20;
275
+ mov.f32 %f455, 0f3BAAD5EA;
276
+ mov.f32 %f454, 0fBCDC1BE7;
277
+ mov.f32 %f453, 0f3DE718AF;
278
+ mov.f32 %f452, 0fBEC093AC;
279
+ mov.f32 %f451, 0f3E0375D3;
280
+ $L__BB0_18:
281
+ .loc 1 0 0
282
+ cvt.f32.bf16 %r24, %rs15;
283
+ mul.f32 %f22, %f14, 0f3F3504F3;
284
+ .loc 1 30 23
285
+ setp.ltu.f32 %p16, %f77, 0f3F8060FE;
286
+ fma.rn.ftz.f32 %f243, %f457, %f458, %f456;
287
+ fma.rn.ftz.f32 %f244, %f243, %f458, %f455;
288
+ fma.rn.ftz.f32 %f245, %f244, %f458, %f454;
289
+ fma.rn.ftz.f32 %f246, %f245, %f458, %f453;
290
+ fma.rn.ftz.f32 %f247, %f246, %f458, %f452;
291
+ fma.rn.ftz.f32 %f248, %f247, %f458, %f451;
292
+ neg.f32 %f249, %f458;
293
+ selp.f32 %f250, %f249, %f21, %p15;
294
+ fma.rn.ftz.f32 %f459, %f248, %f250, %f250;
295
+ @%p16 bra $L__BB0_20;
296
+ ex2.approx.ftz.f32 %f251, %f459;
297
+ sub.f32 %f253, %f414, %f251;
298
+ mov.b32 %r47, %f253;
299
+ mov.b32 %r48, %f21;
300
+ and.b32 %r49, %r48, -2147483648;
301
+ or.b32 %r50, %r49, %r47;
302
+ mov.b32 %f459, %r50;
303
+ $L__BB0_20:
304
+ .loc 1 0 0
305
+ { .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r17; }
306
+ mov.b32 %f15, %r24;
307
+ .loc 1 30 23
308
+ abs.ftz.f32 %f90, %f22;
309
+ setp.ge.f32 %p18, %f90, 0f3F8060FE;
310
+ mov.f32 %f466, 0f3789CA3C;
311
+ mov.f32 %f465, 0fB9F560B9;
312
+ mov.f32 %f464, 0f3BAC840B;
313
+ mov.f32 %f463, 0fBD0C8162;
314
+ mov.f32 %f462, 0f3E1CF906;
315
+ mov.f32 %f461, 0f3F6A937E;
316
+ mov.f32 %f460, 0f3F20D842;
317
+ mov.f32 %f467, %f90;
318
+ @%p18 bra $L__BB0_22;
319
+ mul.f32 %f467, %f22, %f22;
320
+ mov.f32 %f466, 0f38B1E96A;
321
+ mov.f32 %f465, 0fBA574D20;
322
+ mov.f32 %f464, 0f3BAAD5EA;
323
+ mov.f32 %f463, 0fBCDC1BE7;
324
+ mov.f32 %f462, 0f3DE718AF;
325
+ mov.f32 %f461, 0fBEC093AC;
326
+ mov.f32 %f460, 0f3E0375D3;
327
+ $L__BB0_22:
328
+ .loc 1 0 0
329
+ cvt.f32.bf16 %r25, %rs16;
330
+ mul.f32 %f23, %f15, 0f3F3504F3;
331
+ .loc 1 30 23
332
+ setp.ltu.f32 %p19, %f90, 0f3F8060FE;
333
+ fma.rn.ftz.f32 %f268, %f466, %f467, %f465;
334
+ fma.rn.ftz.f32 %f269, %f268, %f467, %f464;
335
+ fma.rn.ftz.f32 %f270, %f269, %f467, %f463;
336
+ fma.rn.ftz.f32 %f271, %f270, %f467, %f462;
337
+ fma.rn.ftz.f32 %f272, %f271, %f467, %f461;
338
+ fma.rn.ftz.f32 %f273, %f272, %f467, %f460;
339
+ neg.f32 %f274, %f467;
340
+ selp.f32 %f275, %f274, %f22, %p18;
341
+ fma.rn.ftz.f32 %f468, %f273, %f275, %f275;
342
+ @%p19 bra $L__BB0_24;
343
+ ex2.approx.ftz.f32 %f276, %f468;
344
+ sub.f32 %f278, %f414, %f276;
345
+ mov.b32 %r51, %f278;
346
+ mov.b32 %r52, %f22;
347
+ and.b32 %r53, %r52, -2147483648;
348
+ or.b32 %r54, %r53, %r51;
349
+ mov.b32 %f468, %r54;
350
+ $L__BB0_24:
351
+ .loc 1 0 0
352
+ mov.b32 %f16, %r25;
353
+ .loc 1 30 23
354
+ abs.ftz.f32 %f103, %f23;
355
+ setp.ge.f32 %p21, %f103, 0f3F8060FE;
356
+ mov.f32 %f475, 0f3789CA3C;
357
+ mov.f32 %f474, 0fB9F560B9;
358
+ mov.f32 %f473, 0f3BAC840B;
359
+ mov.f32 %f472, 0fBD0C8162;
360
+ mov.f32 %f471, 0f3E1CF906;
361
+ mov.f32 %f470, 0f3F6A937E;
362
+ mov.f32 %f469, 0f3F20D842;
363
+ mov.f32 %f476, %f103;
364
+ @%p21 bra $L__BB0_26;
365
+ mul.f32 %f476, %f23, %f23;
366
+ mov.f32 %f475, 0f38B1E96A;
367
+ mov.f32 %f474, 0fBA574D20;
368
+ mov.f32 %f473, 0f3BAAD5EA;
369
+ mov.f32 %f472, 0fBCDC1BE7;
370
+ mov.f32 %f471, 0f3DE718AF;
371
+ mov.f32 %f470, 0fBEC093AC;
372
+ mov.f32 %f469, 0f3E0375D3;
373
+ $L__BB0_26:
374
+ .loc 1 0 0
375
+ cvt.u16.u32 %rs1, %r2;
376
+ { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
377
+ cvt.u16.u32 %rs3, %r3;
378
+ { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
379
+ cvt.u16.u32 %rs5, %r4;
380
+ { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
381
+ cvt.u16.u32 %rs7, %r5;
382
+ { .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
383
+ mul.f32 %f24, %f16, 0f3F3504F3;
384
+ .loc 1 30 23
385
+ setp.ltu.f32 %p22, %f103, 0f3F8060FE;
386
+ fma.rn.ftz.f32 %f293, %f475, %f476, %f474;
387
+ fma.rn.ftz.f32 %f294, %f293, %f476, %f473;
388
+ fma.rn.ftz.f32 %f295, %f294, %f476, %f472;
389
+ fma.rn.ftz.f32 %f296, %f295, %f476, %f471;
390
+ fma.rn.ftz.f32 %f297, %f296, %f476, %f470;
391
+ fma.rn.ftz.f32 %f298, %f297, %f476, %f469;
392
+ neg.f32 %f299, %f476;
393
+ selp.f32 %f300, %f299, %f23, %p21;
394
+ fma.rn.ftz.f32 %f477, %f298, %f300, %f300;
395
+ @%p22 bra $L__BB0_28;
396
+ ex2.approx.ftz.f32 %f301, %f477;
397
+ sub.f32 %f303, %f414, %f301;
398
+ mov.b32 %r55, %f303;
399
+ mov.b32 %r56, %f23;
400
+ and.b32 %r57, %r56, -2147483648;
401
+ or.b32 %r58, %r57, %r55;
402
+ mov.b32 %f477, %r58;
403
+ $L__BB0_28:
404
+ .loc 1 0 0
405
+ cvt.f32.bf16 %r6, %rs1;
406
+ cvt.f32.bf16 %r7, %rs2;
407
+ cvt.f32.bf16 %r8, %rs3;
408
+ cvt.f32.bf16 %r9, %rs4;
409
+ cvt.f32.bf16 %r10, %rs5;
410
+ cvt.f32.bf16 %r11, %rs6;
411
+ cvt.f32.bf16 %r12, %rs7;
412
+ cvt.f32.bf16 %r13, %rs8;
413
+ .loc 1 30 23
414
+ abs.ftz.f32 %f116, %f24;
415
+ setp.ge.f32 %p24, %f116, 0f3F8060FE;
416
+ mov.f32 %f484, 0f3789CA3C;
417
+ mov.f32 %f483, 0fB9F560B9;
418
+ mov.f32 %f482, 0f3BAC840B;
419
+ mov.f32 %f481, 0fBD0C8162;
420
+ mov.f32 %f480, 0f3E1CF906;
421
+ mov.f32 %f479, 0f3F6A937E;
422
+ mov.f32 %f478, 0f3F20D842;
423
+ mov.f32 %f485, %f116;
424
+ @%p24 bra $L__BB0_30;
425
+ mul.f32 %f485, %f24, %f24;
426
+ mov.f32 %f484, 0f38B1E96A;
427
+ mov.f32 %f483, 0fBA574D20;
428
+ mov.f32 %f482, 0f3BAAD5EA;
429
+ mov.f32 %f481, 0fBCDC1BE7;
430
+ mov.f32 %f480, 0f3DE718AF;
431
+ mov.f32 %f479, 0fBEC093AC;
432
+ mov.f32 %f478, 0f3E0375D3;
433
+ $L__BB0_30:
434
+ .loc 1 0 0
435
+ mov.b32 %f1, %r6;
436
+ mov.b32 %f2, %r7;
437
+ mov.b32 %f3, %r8;
438
+ mov.b32 %f4, %r9;
439
+ mov.b32 %f5, %r10;
440
+ mov.b32 %f6, %r11;
441
+ mov.b32 %f7, %r12;
442
+ mov.b32 %f8, %r13;
443
+ .loc 1 30 23
444
+ setp.ltu.f32 %p25, %f116, 0f3F8060FE;
445
+ fma.rn.ftz.f32 %f318, %f484, %f485, %f483;
446
+ fma.rn.ftz.f32 %f319, %f318, %f485, %f482;
447
+ fma.rn.ftz.f32 %f320, %f319, %f485, %f481;
448
+ fma.rn.ftz.f32 %f321, %f320, %f485, %f480;
449
+ fma.rn.ftz.f32 %f322, %f321, %f485, %f479;
450
+ fma.rn.ftz.f32 %f323, %f322, %f485, %f478;
451
+ neg.f32 %f324, %f485;
452
+ selp.f32 %f325, %f324, %f24, %p24;
453
+ fma.rn.ftz.f32 %f486, %f323, %f325, %f325;
454
+ @%p25 bra $L__BB0_32;
455
+ ex2.approx.ftz.f32 %f326, %f486;
456
+ sub.f32 %f328, %f414, %f326;
457
+ mov.b32 %r59, %f328;
458
+ mov.b32 %r60, %f24;
459
+ and.b32 %r61, %r60, -2147483648;
460
+ or.b32 %r62, %r61, %r59;
461
+ mov.b32 %f486, %r62;
462
+ $L__BB0_32:
463
+ .loc 1 32 18
464
+ add.f32 %f345, %f423, 0f3F800000;
465
+ add.f32 %f346, %f432, 0f3F800000;
466
+ add.f32 %f347, %f441, 0f3F800000;
467
+ add.f32 %f348, %f450, 0f3F800000;
468
+ add.f32 %f349, %f459, 0f3F800000;
469
+ add.f32 %f350, %f468, 0f3F800000;
470
+ add.f32 %f351, %f477, 0f3F800000;
471
+ add.f32 %f352, %f486, 0f3F800000;
472
+ .loc 1 35 19
473
+ mul.f32 %f353, %f9, %f9;
474
+ mul.f32 %f354, %f10, %f10;
475
+ mul.f32 %f355, %f11, %f11;
476
+ mul.f32 %f356, %f12, %f12;
477
+ mul.f32 %f357, %f13, %f13;
478
+ mul.f32 %f358, %f14, %f14;
479
+ mul.f32 %f359, %f15, %f15;
480
+ mul.f32 %f360, %f16, %f16;
481
+ .loc 1 37 20
482
+ mul.f32 %f361, %f353, 0fBF000000;
483
+ mul.f32 %f362, %f354, 0fBF000000;
484
+ mul.f32 %f363, %f355, 0fBF000000;
485
+ mul.f32 %f364, %f356, 0fBF000000;
486
+ mul.f32 %f365, %f357, 0fBF000000;
487
+ mul.f32 %f366, %f358, 0fBF000000;
488
+ mul.f32 %f367, %f359, 0fBF000000;
489
+ mul.f32 %f368, %f360, 0fBF000000;
490
+ .loc 1 38 19
491
+ mul.f32 %f330, %f361, 0f3FB8AA3B;
492
+ ex2.approx.f32 %f329, %f330;
493
+ mul.f32 %f332, %f362, 0f3FB8AA3B;
494
+ ex2.approx.f32 %f331, %f332;
495
+ mul.f32 %f334, %f363, 0f3FB8AA3B;
496
+ ex2.approx.f32 %f333, %f334;
497
+ mul.f32 %f336, %f364, 0f3FB8AA3B;
498
+ ex2.approx.f32 %f335, %f336;
499
+ mul.f32 %f338, %f365, 0f3FB8AA3B;
500
+ ex2.approx.f32 %f337, %f338;
501
+ mul.f32 %f340, %f366, 0f3FB8AA3B;
502
+ ex2.approx.f32 %f339, %f340;
503
+ mul.f32 %f342, %f367, 0f3FB8AA3B;
504
+ ex2.approx.f32 %f341, %f342;
505
+ mul.f32 %f344, %f368, 0f3FB8AA3B;
506
+ ex2.approx.f32 %f343, %f344;
507
+ .loc 1 40 20
508
+ mul.f32 %f369, %f329, 0f3ECC422A;
509
+ mul.f32 %f370, %f331, 0f3ECC422A;
510
+ mul.f32 %f371, %f333, 0f3ECC422A;
511
+ mul.f32 %f372, %f335, 0f3ECC422A;
512
+ mul.f32 %f373, %f337, 0f3ECC422A;
513
+ mul.f32 %f374, %f339, 0f3ECC422A;
514
+ mul.f32 %f375, %f341, 0f3ECC422A;
515
+ mul.f32 %f376, %f343, 0f3ECC422A;
516
+ .loc 1 41 19
517
+ mul.f32 %f377, %f9, %f369;
518
+ mul.f32 %f378, %f10, %f370;
519
+ mul.f32 %f379, %f11, %f371;
520
+ mul.f32 %f380, %f12, %f372;
521
+ mul.f32 %f381, %f13, %f373;
522
+ mul.f32 %f382, %f14, %f374;
523
+ mul.f32 %f383, %f15, %f375;
524
+ mul.f32 %f384, %f16, %f376;
525
+ .loc 1 42 20
526
+ fma.rn.f32 %f385, %f345, 0f3F000000, %f377;
527
+ fma.rn.f32 %f386, %f346, 0f3F000000, %f378;
528
+ fma.rn.f32 %f387, %f347, 0f3F000000, %f379;
529
+ fma.rn.f32 %f388, %f348, 0f3F000000, %f380;
530
+ fma.rn.f32 %f389, %f349, 0f3F000000, %f381;
531
+ fma.rn.f32 %f390, %f350, 0f3F000000, %f382;
532
+ fma.rn.f32 %f391, %f351, 0f3F000000, %f383;
533
+ fma.rn.f32 %f392, %f352, 0f3F000000, %f384;
534
+ .loc 1 43 19
535
+ mul.f32 %f393, %f1, %f385;
536
+ mul.f32 %f394, %f2, %f386;
537
+ mul.f32 %f395, %f3, %f387;
538
+ mul.f32 %f396, %f4, %f388;
539
+ mul.f32 %f397, %f5, %f389;
540
+ mul.f32 %f398, %f6, %f390;
541
+ mul.f32 %f399, %f7, %f391;
542
+ mul.f32 %f400, %f8, %f392;
543
+ .loc 1 45 40
544
+ mov.b32 %r63, %f393;
545
+ cvt.rn.bf16.f32 %rs17, %r63;
546
+ mov.b32 %r64, %f394;
547
+ cvt.rn.bf16.f32 %rs18, %r64;
548
+ mov.b32 %r65, %f395;
549
+ cvt.rn.bf16.f32 %rs19, %r65;
550
+ mov.b32 %r66, %f396;
551
+ cvt.rn.bf16.f32 %rs20, %r66;
552
+ mov.b32 %r67, %f397;
553
+ cvt.rn.bf16.f32 %rs21, %r67;
554
+ mov.b32 %r68, %f398;
555
+ cvt.rn.bf16.f32 %rs22, %r68;
556
+ mov.b32 %r69, %f399;
557
+ cvt.rn.bf16.f32 %rs23, %r69;
558
+ mov.b32 %r70, %f400;
559
+ cvt.rn.bf16.f32 %rs24, %r70;
560
+ mov.b32 %r75, {%rs17, %rs18};
561
+ mov.b32 %r76, {%rs19, %rs20};
562
+ mov.b32 %r77, {%rs21, %rs22};
563
+ mov.b32 %r78, {%rs23, %rs24};
564
+ @%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r75, %r76, %r77, %r78 };
565
+ .loc 1 45 4
566
+ ret;
567
+ $L__tmp1:
568
+ $L__func_end0:
569
+
570
+ }
571
+ // .globl __nv_erff
572
+ .visible .func (.param .b32 func_retval0) __nv_erff(
573
+ .param .b32 __nv_erff_param_0
574
+ )
575
+ {
576
+ .reg .pred %p<4>;
577
+ .reg .b32 %r<5>;
578
+ .reg .f32 %f<49>;
579
+ $L__func_begin1:
580
+
581
+ ld.param.f32 %f14, [__nv_erff_param_0];
582
+ abs.ftz.f32 %f1, %f14;
583
+ setp.ge.f32 %p1, %f1, 0f3F8060FE;
584
+ mov.f32 %f46, 0f3789CA3C;
585
+ mov.f32 %f45, 0fB9F560B9;
586
+ mov.f32 %f44, 0f3BAC840B;
587
+ mov.f32 %f43, 0fBD0C8162;
588
+ mov.f32 %f42, 0f3E1CF906;
589
+ mov.f32 %f41, 0f3F6A937E;
590
+ mov.f32 %f40, 0f3F20D842;
591
+ mov.f32 %f47, %f1;
592
+ @%p1 bra $L__BB1_2;
593
+ mul.f32 %f47, %f14, %f14;
594
+ mov.f32 %f46, 0f38B1E96A;
595
+ mov.f32 %f45, 0fBA574D20;
596
+ mov.f32 %f44, 0f3BAAD5EA;
597
+ mov.f32 %f43, 0fBCDC1BE7;
598
+ mov.f32 %f42, 0f3DE718AF;
599
+ mov.f32 %f41, 0fBEC093AC;
600
+ mov.f32 %f40, 0f3E0375D3;
601
+ $L__BB1_2:
602
+ setp.ltu.f32 %p2, %f1, 0f3F8060FE;
603
+ fma.rn.ftz.f32 %f29, %f46, %f47, %f45;
604
+ fma.rn.ftz.f32 %f30, %f29, %f47, %f44;
605
+ fma.rn.ftz.f32 %f31, %f30, %f47, %f43;
606
+ fma.rn.ftz.f32 %f32, %f31, %f47, %f42;
607
+ fma.rn.ftz.f32 %f33, %f32, %f47, %f41;
608
+ fma.rn.ftz.f32 %f34, %f33, %f47, %f40;
609
+ neg.f32 %f35, %f47;
610
+ selp.f32 %f36, %f35, %f14, %p1;
611
+ fma.rn.ftz.f32 %f48, %f34, %f36, %f36;
612
+ @%p2 bra $L__BB1_4;
613
+ ex2.approx.ftz.f32 %f37, %f48;
614
+ mov.f32 %f38, 0f3F800000;
615
+ sub.f32 %f39, %f38, %f37;
616
+ mov.b32 %r1, %f39;
617
+ mov.b32 %r2, %f14;
618
+ and.b32 %r3, %r2, -2147483648;
619
+ or.b32 %r4, %r3, %r1;
620
+ mov.b32 %f48, %r4;
621
+ $L__BB1_4:
622
+ st.param.f32 [func_retval0+0], %f48;
623
+ ret;
624
+ $L__func_end1:
625
+
626
+ }
627
+ .file 1 "/tmp/torchinductor_root/5j/c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py"
628
+ .section .debug_abbrev
629
+ {
630
+ .b8 1
631
+ .b8 17
632
+ .b8 1
633
+ .b8 37
634
+ .b8 8
635
+ .b8 19
636
+ .b8 5
637
+ .b8 3
638
+ .b8 8
639
+ .b8 16
640
+ .b8 6
641
+ .b8 27
642
+ .b8 8
643
+ .b8 180
644
+ .b8 66
645
+ .b8 12
646
+ .b8 17
647
+ .b8 1
648
+ .b8 18
649
+ .b8 1
650
+ .b8 0
651
+ .b8 0
652
+ .b8 2
653
+ .b8 46
654
+ .b8 0
655
+ .b8 17
656
+ .b8 1
657
+ .b8 18
658
+ .b8 1
659
+ .b8 64
660
+ .b8 10
661
+ .b8 135
662
+ .b8 64
663
+ .b8 8
664
+ .b8 3
665
+ .b8 8
666
+ .b8 58
667
+ .b8 11
668
+ .b8 59
669
+ .b8 11
670
+ .b8 63
671
+ .b8 12
672
+ .b8 0
673
+ .b8 0
674
+ .b8 0
675
+ }
676
+ .section .debug_info
677
+ {
678
+ .b32 176
679
+ .b8 2
680
+ .b8 0
681
+ .b32 .debug_abbrev
682
+ .b8 8
683
+ .b8 1
684
+ .b8 116
685
+ .b8 114
686
+ .b8 105
687
+ .b8 116
688
+ .b8 111
689
+ .b8 110
690
+ .b8 0
691
+ .b8 2
692
+ .b8 0
693
+ .b8 99
694
+ .b8 53
695
+ .b8 106
696
+ .b8 120
697
+ .b8 97
698
+ .b8 103
699
+ .b8 117
700
+ .b8 120
701
+ .b8 104
702
+ .b8 111
703
+ .b8 51
704
+ .b8 110
705
+ .b8 104
706
+ .b8 114
707
+ .b8 108
708
+ .b8 116
709
+ .b8 53
710
+ .b8 118
711
+ .b8 99
712
+ .b8 105
713
+ .b8 110
714
+ .b8 110
715
+ .b8 122
716
+ .b8 53
717
+ .b8 102
718
+ .b8 101
719
+ .b8 118
720
+ .b8 111
721
+ .b8 100
722
+ .b8 117
723
+ .b8 109
724
+ .b8 108
725
+ .b8 112
726
+ .b8 119
727
+ .b8 110
728
+ .b8 52
729
+ .b8 119
730
+ .b8 121
731
+ .b8 98
732
+ .b8 50
733
+ .b8 118
734
+ .b8 120
735
+ .b8 51
736
+ .b8 120
737
+ .b8 114
738
+ .b8 118
739
+ .b8 101
740
+ .b8 105
741
+ .b8 99
742
+ .b8 101
743
+ .b8 114
744
+ .b8 108
745
+ .b8 46
746
+ .b8 112
747
+ .b8 121
748
+ .b8 0
749
+ .b32 .debug_line
750
+ .b8 47
751
+ .b8 116
752
+ .b8 109
753
+ .b8 112
754
+ .b8 47
755
+ .b8 116
756
+ .b8 111
757
+ .b8 114
758
+ .b8 99
759
+ .b8 104
760
+ .b8 105
761
+ .b8 110
762
+ .b8 100
763
+ .b8 117
764
+ .b8 99
765
+ .b8 116
766
+ .b8 111
767
+ .b8 114
768
+ .b8 95
769
+ .b8 114
770
+ .b8 111
771
+ .b8 111
772
+ .b8 116
773
+ .b8 47
774
+ .b8 53
775
+ .b8 106
776
+ .b8 0
777
+ .b8 1
778
+ .b64 $L__func_begin0
779
+ .b64 $L__func_end0
780
+ .b8 2
781
+ .b64 $L__func_begin0
782
+ .b64 $L__func_end0
783
+ .b8 1
784
+ .b8 156
785
+ .b8 116
786
+ .b8 114
787
+ .b8 105
788
+ .b8 116
789
+ .b8 111
790
+ .b8 110
791
+ .b8 95
792
+ .b8 95
793
+ .b8 48
794
+ .b8 100
795
+ .b8 49
796
+ .b8 100
797
+ .b8 50
798
+ .b8 100
799
+ .b8 101
800
+ .b8 0
801
+ .b8 116
802
+ .b8 114
803
+ .b8 105
804
+ .b8 116
805
+ .b8 111
806
+ .b8 110
807
+ .b8 95
808
+ .b8 95
809
+ .b8 48
810
+ .b8 100
811
+ .b8 49
812
+ .b8 100
813
+ .b8 50
814
+ .b8 100
815
+ .b8 101
816
+ .b8 0
817
+ .b8 1
818
+ .b8 18
819
+ .b8 1
820
+ .b8 0
821
+ }
822
+ .section .debug_pubnames
823
+ {
824
+ .b32 $L__pubNames_end0-$L__pubNames_start0
825
+ $L__pubNames_start0:
826
+ .b8 2
827
+ .b8 0
828
+ .b32 .debug_info
829
+ .b32 180
830
+ .b32 125
831
+ .b8 116
832
+ .b8 114
833
+ .b8 105
834
+ .b8 116
835
+ .b8 111
836
+ .b8 110
837
+ .b8 95
838
+ .b8 95
839
+ .b8 48
840
+ .b8 100
841
+ .b8 49
842
+ .b8 100
843
+ .b8 50
844
+ .b8 100
845
+ .b8 101
846
+ .b8 0
847
+ .b32 0
848
+ $L__pubNames_end0:
849
+ }
850
+ .section .debug_pubtypes
851
+ {
852
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
853
+ $L__pubTypes_start0:
854
+ .b8 2
855
+ .b8 0
856
+ .b32 .debug_info
857
+ .b32 180
858
+ .b32 0
859
+ $L__pubTypes_end0:
860
+ }
861
+ .section .debug_loc { }
.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.cubin ADDED
Binary file (7.33 kB). View file
 
.triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ttir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c256_i32 = arith.constant 256 : i32
4
+ %0 = tt.get_program_id x : i32
5
+ %1 = arith.muli %0, %c256_i32 : i32
6
+ %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
7
+ %3 = tt.splat %1 : (i32) -> tensor<256xi32>
8
+ %4 = arith.addi %3, %2 : tensor<256xi32>
9
+ %5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
10
+ %6 = tt.addptr %5, %4 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
11
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
12
+ %8 = arith.extf %7 : tensor<256xbf16> to tensor<256xf32>
13
+ %9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
14
+ %10 = tt.addptr %9, %4 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
15
+ tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
16
+ tt.return
17
+ }
18
+ }