Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .local/share/Trash/info/train_002.bin.trashinfo +3 -0
- .local/share/Trash/info/train_004.bin.trashinfo +3 -0
- .local/share/jupyter/nbextensions/python-markdown/python-markdown-post.png +0 -0
- .local/share/jupyter/nbextensions/ruler/icon.png +0 -0
- .local/share/jupyter/nbextensions/ruler/main.js +124 -0
- .local/share/jupyter/nbextensions/runtools/readme.md +119 -0
- .local/share/jupyter/nbextensions/runtools/runtools_show_hide.png +0 -0
- .local/share/jupyter/nbextensions/scratchpad/scratchpad.yaml +6 -0
- .local/share/jupyter/nbextensions/select_keymap/README.md +14 -0
- .local/share/jupyter/nbextensions/skill/main.js +14 -0
- .local/share/jupyter/nbextensions/skill/skill.yaml +6 -0
- .local/share/jupyter/nbextensions/skip-traceback/traceback.png +0 -0
- .local/share/jupyter/nbextensions/splitcell/splitcell.js +101 -0
- .local/share/jupyter/nbextensions/toggle_all_line_numbers/icon.png +0 -0
- .local/share/jupyter/nbextensions/varInspector/README.md +36 -0
- .local/share/jupyter/nbextensions/varInspector/__pycache__/var_list.cpython-310.pyc +0 -0
- .local/share/jupyter/nbextensions/varInspector/demo.gif +0 -0
- .local/share/jupyter/nbextensions/varInspector/icon.png +0 -0
- .local/share/jupyter/nbextensions/varInspector/tablesorter_LICENSE.txt +21 -0
- .local/share/jupyter/nbextensions/varInspector/var_list.py +63 -0
- .local/share/jupyter/nbextensions/varInspector/var_list.r +17 -0
- .local/share/jupyter/nbextensions/zenmode/README.md +4 -0
- .local/share/jupyter/nbextensions/zenmode/images/back2.jpg +0 -0
- .local/share/jupyter/nbextensions/zenmode/images/back21.jpg +0 -0
- .local/share/jupyter/nbextensions/zenmode/images/back3.jpg +0 -0
- .local/share/jupyter/nbextensions/zenmode/images/ipynblogo0.png +0 -0
- .local/share/jupyter/nbextensions/zenmode/images/ipynblogo1.png +0 -0
- .local/share/jupyter/nbextensions/zenmode/main.css +34 -0
- .local/share/jupyter/runtime/jpserver-434.json +13 -0
- .triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.cubin +0 -0
- .triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttgir +60 -0
- .triton/dump/199215289adb100508718a5a762ba4d7/triton_.cubin +0 -0
- .triton/dump/199215289adb100508718a5a762ba4d7/triton_.ptx +453 -0
- .triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttgir +38 -0
- .triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttir +34 -0
- .triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ptx +312 -0
- .triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttgir +19 -0
- .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin +0 -0
- .triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttir +56 -0
- .triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ttgir +88 -0
- .triton/dump/415aac87553b7d064f52694fa7254686/triton_.ptx +778 -0
- .triton/dump/415aac87553b7d064f52694fa7254686/triton_.ttir +27 -0
- .triton/dump/51e329eae41e4ee17aa201fff8371d94/triton_.llir +0 -0
- .triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttir +89 -0
- .triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttgir +18 -0
- .triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttir +17 -0
- .triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin +0 -0
- .triton/dump/93e5abc5363b9438178c618128714f73/triton_.ptx +861 -0
- .triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.cubin +0 -0
- .triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ttir +18 -0
.local/share/Trash/info/train_002.bin.trashinfo
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
[Trash Info]
|
2 |
+
Path=/root/data/fineweb/train_002.bin
|
3 |
+
DeletionDate=2024-09-26T05:50:34
|
.local/share/Trash/info/train_004.bin.trashinfo
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
[Trash Info]
|
2 |
+
Path=/root/data/fineweb/train_004.bin
|
3 |
+
DeletionDate=2024-09-26T05:50:34
|
.local/share/jupyter/nbextensions/python-markdown/python-markdown-post.png
ADDED
.local/share/jupyter/nbextensions/ruler/icon.png
ADDED
.local/share/jupyter/nbextensions/ruler/main.js
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Add rulers to codecells
|
2 |
+
define([
|
3 |
+
'base/js/namespace',
|
4 |
+
'base/js/events',
|
5 |
+
'services/config',
|
6 |
+
'notebook/js/codecell',
|
7 |
+
'codemirror/lib/codemirror',
|
8 |
+
'codemirror/addon/display/rulers'
|
9 |
+
], function (Jupyter, events, configmod, codecell, codemirror) {
|
10 |
+
"use strict";
|
11 |
+
|
12 |
+
var log_prefix = '[ruler]';
|
13 |
+
|
14 |
+
// define default config parameter values
|
15 |
+
var params = {
|
16 |
+
ruler_column: [78],
|
17 |
+
ruler_color: ["#ff0000"],
|
18 |
+
ruler_linestyle: ["dashed"],
|
19 |
+
ruler_do_css_patch: false
|
20 |
+
};
|
21 |
+
|
22 |
+
|
23 |
+
var rulers = [];
|
24 |
+
|
25 |
+
// Tell whether a config value can be used as a ruler column.
// Finite numbers and numeric strings (e.g. 78 or "78") are accepted;
// everything else is rejected.
var isNumber = function (n) {
    // parseFloat() and isFinite() coerce their argument, so without this
    // guard a one-element array such as [78] would be reported as a number.
    if (typeof n !== 'number' && typeof n !== 'string') {
        return false;
    }
    return !isNaN(parseFloat(n)) && isFinite(n);
};
|
28 |
+
|
29 |
+
// Overlay the default values in `params` with the ones found in
// `config_data`. Only keys that already exist in `params` are copied.
var update_params = function (config_data) {
    for (var name in params) {
        if (!config_data.hasOwnProperty(name)) {
            continue;
        }
        params[name] = config_data[name];
    }
};
|
37 |
+
|
38 |
+
// Build the ruler specs from the loaded config and apply them, either to
// every code cell of the notebook (and to the defaults used for new cells)
// or to the single CodeMirror instance of the file-editor view.
var on_config_loaded = function () {
    var config = (Jupyter.notebook !== undefined) ?
        Jupyter.notebook.config : Jupyter.editor.config;
    var data = config.data;

    // A lone value (e.g. "#ff0000" or 80) is treated as a one-element
    // list, so that a string is never indexed character by character.
    var as_list = function (value) {
        if (value === undefined || value === null) {
            return [];
        }
        return Array.isArray(value) ? value : [value];
    };

    // Cycle through `list`; an empty list yields undefined.
    var pick = function (list, idx) {
        return list.length > 0 ? list[idx % list.length] : undefined;
    };

    if (data.hasOwnProperty('ruler_color') && as_list(data.ruler_color).length > 0) {
        params.ruler_color = as_list(data.ruler_color);
    }

    if (data.hasOwnProperty('ruler_column')) {
        // keep only the entries that are usable as a column number
        var new_columns = as_list(data.ruler_column).filter(isNumber).map(Number);
        if (new_columns.length > 0) {
            params.ruler_column = new_columns;
        }
    }

    if (data.hasOwnProperty('ruler_linestyle') && as_list(data.ruler_linestyle).length > 0) {
        params.ruler_linestyle = as_list(data.ruler_linestyle);
    }

    var colors = as_list(params.ruler_color);
    var linestyles = as_list(params.ruler_linestyle);

    // Empty the shared array in place rather than appending, so calling
    // this function again does not duplicate the rulers.
    rulers.length = 0;
    as_list(params.ruler_column).filter(isNumber).forEach(function (column, idx) {
        rulers.push({
            color: pick(colors, idx),
            column: Number(column),
            lineStyle: pick(linestyles, idx)
        });
    });
    console.debug(log_prefix, 'ruler specs:', rulers);

    if (Jupyter.notebook !== undefined) {
        // Change default for new cells
        codecell.CodeCell.options_default.cm_config.rulers = rulers;
        // Apply to any already-existing cells
        Jupyter.notebook.get_cells().forEach(function (cell) {
            if (cell instanceof codecell.CodeCell) {
                cell.code_mirror.setOption('rulers', rulers);
            }
        });
    }
    else {
        Jupyter.editor.codemirror.setOption('rulers', rulers);
    }
};
|
92 |
+
|
93 |
+
// Entry point of the extension: wait for the config of the current view,
// merge it into `params`, then apply the rulers.
var load_extension = function () {
    // The notebook view is preferred, then the file-editor view. Any other
    // view (dashboard, terminal, ...) has neither, so there is nothing to do.
    var view = Jupyter.notebook || Jupyter.editor;
    if (!view) {
        return;
    }
    var conf_sect = view.config;

    var merge_config = function () {
        update_params(conf_sect.data);
    };
    var report = function on_error(reason) {
        console.warn(log_prefix, 'error:', reason);
    };

    conf_sect.loaded
        .then(merge_config)
        .then(on_config_loaded)
        .catch(report);
};
|
119 |
+
|
120 |
+
var extension = {
|
121 |
+
load_ipython_extension: load_extension
|
122 |
+
};
|
123 |
+
return extension;
|
124 |
+
});
|
.local/share/jupyter/nbextensions/runtools/readme.md
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Runtools
|
2 |
+
========
|
3 |
+
Runtools provide a number of additional functions for working with code cells in the IPython notebook:
|
4 |
+
|
5 |
+
Code Cell Execution
|
6 |
+
-------------------
|
7 |
+
|
8 |
+
* Execute a single cell
|
9 |
+
* Execute from top cell to currently selected cell
|
10 |
+
* Execute from currently selected cell to bottom cell
|
11 |
+
* Execute all cells
|
12 |
+
* Execute all cells, ignore exceptions (requires [ipython/pull/6521](https://github.com/ipython/ipython/pull/6521))
|
13 |
+
* Execute marked code cells (cells with green gutter area are marked)
|
14 |
+
* Stop execution (duplicate to standard toolbar button)
|
15 |
+
|
16 |
+
When executing marked cells, they are put in an execution list, and
|
17 |
+
executed in order. The execution list can be modified by unmarking
|
18 |
+
a cell not yet run. The execution list can be stopped by clicking on
|
19 |
+
`stop execution`. Execution of the currently running cell can be stopped
|
20 |
+
by pressing `stop execution` twice.
|
21 |
+
|
22 |
+
Code Cell Marking
|
23 |
+
-----------------
|
24 |
+
|
25 |
+
* Mark one or more code cells
|
26 |
+
|
27 |
+
|
28 |
+
Code Cell Display
|
29 |
+
-----------------
|
30 |
+
|
31 |
+
* Hide or show input (i.e. the source code) of marked code cells
|
32 |
+
* Hide or show output of marked code cells
|
33 |
+
|
34 |
+
|
35 |
+
Description
|
36 |
+
-----------
|
37 |
+
|
38 |
+
The *runtools* extension adds a button to turn on/off a floating toolbar:
|
39 |
+
![](icon.png)
|
40 |
+
|
41 |
+
This adds Code execution buttons:
|
42 |
+
![](runtools_execute.png)
|
43 |
+
|
44 |
+
Codecells can be marked by clicking on the gutter of a codecell or by clicking on the markers toolbar:
|
45 |
+
![](runtools_marker.png)
|
46 |
+
|
47 |
+
Marked codecells can be locked to read-only mode and moved up and down:
|
48 |
+
![](runtools_lock.png)
|
49 |
+
|
50 |
+
The input and output areas of marked codecells can be hidden:
|
51 |
+
![](runtools_show_hide.png)
|
52 |
+
|
53 |
+
An IPython notebook with marked cells looks like this:
|
54 |
+
![](runtools_nb.png)
|
55 |
+
|
56 |
+
|
57 |
+
Demo
|
58 |
+
----
|
59 |
+
|
60 |
+
![](demo.gif)
|
61 |
+
|
62 |
+
|
63 |
+
Internals
|
64 |
+
---------
|
65 |
+
|
66 |
+
New metadata elements added to each cell:
|
67 |
+
* `cell.metadata.hide_input` - hide input field of the cell
|
68 |
+
* `cell.metadata.hide_output` - hide output field of the cell
|
69 |
+
* `cell.metadata.run_control.marked` - mark a codecell
|
70 |
+
|
71 |
+
To export a notebook with hidden input/output fields, the custom template `hide_input_output.tpl` is required.
|
72 |
+
It should have been installed in the `templates` folder.
|
73 |
+
You can find the `templates` folder of `jupyter_contrib_nbextensions` from python using
|
74 |
+
|
75 |
+
```python
|
76 |
+
from jupyter_contrib_nbextensions.nbconvert_support import templates_directory
|
77 |
+
print(templates_directory())
|
78 |
+
```
|
79 |
+
|
80 |
+
The template needs to be in a path where nbconvert can find it. This can be your local path or specified in
|
81 |
+
`jupyter_nbconvert_config` or `jupyter_notebook_config` as `c.Exporter.extra_template_paths`, see [Jupyter docs](https://jupyter-notebook.readthedocs.io/en/latest/config.html).
|
82 |
+
|
83 |
+
For HTML export a template is provided as `nbextensions.tpl` in the `jupyter_contrib_nbextensions` templates directory. Alternatively you can create your own template:
|
84 |
+
```
|
85 |
+
{%- extends 'full.tpl' -%}
|
86 |
+
|
87 |
+
{% block input_group -%}
|
88 |
+
{%- if cell.metadata.hide_input -%}
|
89 |
+
{%- else -%}
|
90 |
+
{{ super() }}
|
91 |
+
{%- endif -%}
|
92 |
+
{% endblock input_group %}
|
93 |
+
|
94 |
+
{% block output_group -%}
|
95 |
+
{%- if cell.metadata.hide_output -%}
|
96 |
+
{%- else -%}
|
97 |
+
{{ super() }}
|
98 |
+
{%- endif -%}
|
99 |
+
{% endblock output_group %}
|
100 |
+
```
|
101 |
+
|
102 |
+
For LaTeX export a different template is required, which is included as `nbextensions.tplx` in the `jupyter_contrib_nbextensions` templates directory. Alternatively you can create your own template:
|
103 |
+
```
|
104 |
+
((- extends 'report.tplx' -))
|
105 |
+
|
106 |
+
((* block input_group -))
|
107 |
+
((- if cell.metadata.hide_input -))
|
108 |
+
((- else -))
|
109 |
+
((( super() )))
|
110 |
+
((- endif -))
|
111 |
+
(( endblock input_group *))
|
112 |
+
|
113 |
+
((* block output_group -))
|
114 |
+
((- if cell.metadata.hide_output -))
|
115 |
+
((- else -))
|
116 |
+
((( super() )))
|
117 |
+
((- endif -))
|
118 |
+
(( endblock output_group *))
|
119 |
+
```
|
.local/share/jupyter/nbextensions/runtools/runtools_show_hide.png
ADDED
.local/share/jupyter/nbextensions/scratchpad/scratchpad.yaml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Type: Jupyter Notebook Extension
|
2 |
+
Name: Scratchpad
|
3 |
+
Description: Adds a scratchpad cell to Jupyter notebook.
|
4 |
+
Link: README.md
|
5 |
+
Main: main.js
|
6 |
+
Compatibility: 4.x, 5.x
|
.local/share/jupyter/nbextensions/select_keymap/README.md
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Select CodeMirror Keymap
|
2 |
+
=======
|
3 |
+
|
4 |
+
This extension lets you choose between the available CodeMirror keymaps: default, emacs, vim, and sublime.
|
5 |
+
|
6 |
+
There's a chance that this might cause key conflicts, especially with other extensions.
|
7 |
+
|
8 |
+
Most browsers consume some of the global keybindings like `Ctrl+n`. [The Menu Wizard add-on for Firefox](https://addons.mozilla.org/en-US/firefox/addon/s3menu-wizard/) allows you to disable some of the global key shortcuts, thus passing the keys through to CodeMirror.
|
9 |
+
|
10 |
+
![Demo](select_keymap.png)
|
11 |
+
|
12 |
+
Based on:
|
13 |
+
* [jupyter-emacskeys](https://github.com/rmcgibbo/jupyter-emacskeys)
|
14 |
+
* [notebook_input_mode](https://github.com/asford/notebook_input_mode)
|
.local/share/jupyter/nbextensions/skill/main.js
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
define(function() {
    "use strict";

    // Called once the codemirror mode module has been required.
    var announce = function () {
        console.log('[SKILL Syntax] loaded');
    };

    // A jupyter nbextension has to export load_ipython_extension, otherwise
    // an error is thrown. Merely loading this module must have no effect, so
    // the codemirror mode is only required when that function is called.
    var load_ipython_extension = function () {
        requirejs(['./skill'], announce);
    };

    return {
        load_ipython_extension: load_ipython_extension
    };
});
|
.local/share/jupyter/nbextensions/skill/skill.yaml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Type: IPython Notebook Extension
|
2 |
+
Name: SKILL Syntax
|
3 |
+
Description: Enable SKILL syntax support for CodeMirror
|
4 |
+
Link: README.md
|
5 |
+
Main: main.js
|
6 |
+
Compatibility: 4.x, 5.x
|
.local/share/jupyter/nbextensions/skip-traceback/traceback.png
ADDED
.local/share/jupyter/nbextensions/splitcell/splitcell.js
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Allow for split cells in jupyter notebooks
|
2 |
+
|
3 |
+
define([
|
4 |
+
'base/js/namespace',
|
5 |
+
'base/js/events'
|
6 |
+
], function (
|
7 |
+
Jupyter,
|
8 |
+
events
|
9 |
+
) {
|
10 |
+
"use strict";
|
11 |
+
|
12 |
+
//define default config parameter values
|
13 |
+
var params = {
|
14 |
+
toggle_cell_style_keybinding : 'shift-s'
|
15 |
+
};
|
16 |
+
|
17 |
+
// Overlay the defaults in `params` with any value of the same name found
// in the notebook's config data.
var update_params = function(){
    var data = Jupyter.notebook.config.data;
    for (var name in params){
        if (!data.hasOwnProperty(name)){
            continue;
        }
        params[name] = data[name];
    }
};
|
26 |
+
|
27 |
+
// Register the toggle action, bind its command-mode keyboard shortcut and
// add the matching toolbar button.
var setup = function (){
    // pick up user overrides of the defaults first
    update_params();

    // register the action with the ActionHandler instance
    var action_full_name = Jupyter.keyboard_manager.actions.register({
        icon       : 'fa-arrows-h',
        help       : 'Toggle split/centered cell style',
        help_index : 'eb',
        id         : 'split_cells',
        handler    : toggle_cell_style
    }, 'toggle-cell-style', 'auto');

    // define the keyboard shortcut and register it with the keyboard_manager
    var command_mode_shortcuts = {};
    command_mode_shortcuts[params.toggle_cell_style_keybinding] = action_full_name;
    Jupyter.notebook.keyboard_manager.command_shortcuts.add_shortcuts(command_mode_shortcuts);

    // expose the same action as a toolbar button
    Jupyter.toolbar.add_buttons_group([action_full_name]);
};
|
52 |
+
|
53 |
+
|
54 |
+
// Flip the selected cell between the 'split' and 'center' styles and
// refresh its element. A cell without a recorded style becomes 'split'.
var toggle_cell_style = function(){
    var cell = Jupyter.notebook.get_selected_cell();
    var meta = cell.metadata;

    // any recorded style other than 'center' counts as split
    var currently_split = ("cell_style" in meta) && meta.cell_style != 'center';
    meta.cell_style = currently_split ? 'center' : 'split';

    update_cell_style_element(cell);
};
|
62 |
+
|
63 |
+
// Return the inline CSS for a cell style: half-width and floated left for
// "split", full width for anything else (including a missing style).
var get_cell_style_html = function(cell_style){
    if (cell_style === "split"){
        return "float:left; width:50%;";
    }
    return "width:100%;";
};
|
69 |
+
|
70 |
+
// Set the cell element's `style` attribute from the style recorded in the
// cell's metadata.
var update_cell_style_element = function(cell){
    cell.element.attr('style', get_cell_style_html(cell.metadata.cell_style));
};
|
74 |
+
|
75 |
+
// Apply the recorded style to every cell that has one. Runs when the
// notebook is loaded, or immediately if it already is.
function initialize () {
    // Iterate the array we actually fetched rather than a separately
    // queried cell count.
    Jupyter.notebook.get_cells().forEach(function (cell) {
        if ("cell_style" in cell.metadata){
            update_cell_style_element(cell);
        }
    });
}
|
87 |
+
|
88 |
+
// Entry point of the extension: set up the action once the config is
// loaded and style the cells once the notebook is loaded.
var load_extension = function() {
    // Check for the notebook before touching it; outside the notebook view
    // there is nothing to set up.
    if (Jupyter.notebook === undefined) {
        return;
    }

    Jupyter.notebook.config.loaded
        .then(setup)
        .catch(function (reason) {
            // report a failed setup instead of leaving the rejection unhandled
            console.warn('[splitcell]', 'error:', reason);
        });

    if (Jupyter.notebook._fully_loaded) {
        // notebook already loaded. Update directly
        initialize();
    }
    events.on("notebook_loaded.Notebook", initialize);
};
|
97 |
+
|
98 |
+
return {
|
99 |
+
load_ipython_extension : load_extension
|
100 |
+
};
|
101 |
+
});
|
.local/share/jupyter/nbextensions/toggle_all_line_numbers/icon.png
ADDED
.local/share/jupyter/nbextensions/varInspector/README.md
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Variable Inspector
|
2 |
+
|
3 |
+
## Description and main features
|
4 |
+
|
5 |
+
The Variable Inspector extension, which currently supports python and R kernels, makes it possible to collect all defined variables and display them in a floating window. The window not only displays the names of variables but also their type, size in memory and content. The columns are sortable. The window is draggable, resizable and collapsible. The list of displayed variables is automatically updated at each cell execution. Variables can be deleted from the workspace by clicking a link. Position and state (displayed/collapsed) are stored in the notebook's metadata and restored at startup.
|
6 |
+
|
7 |
+
The extension supports multiple kernels. To add support for a new kernel, one has to
|
8 |
+
- provide a library which loads required modules and define a function which lists all variables, together with their name, type, size and content. The output of this function must be a JSON representation of a list of objects (one for each variable) with keys 'varName','varType', 'varSize', 'varContent',
|
9 |
+
- provide the command for deleting a variable, as `delete_cmd_prefix` and `delete_cmd_postfix`, eg. for `rm(variable)`, specify `rm(` and `)`.
|
10 |
+
- give the command to refresh the list of variables (usually this is a call to the function defined in the library above). This information can be provided either in the source file or in the yaml config file.
|
11 |
+
|
12 |
+
In any case, contributions to support further kernels will be very welcome!
|
13 |
+
|
14 |
+
#### Demo:
|
15 |
+
![](demo.gif)
|
16 |
+
|
17 |
+
|
18 |
+
## Configuration
|
19 |
+
The initial configuration can be given using the IPython-contrib nbextensions facility. It includes:
|
20 |
+
|
21 |
+
- varInspector.window_display - Display at startup or not (default: false)
|
22 |
+
- varInspector.cols.lenName: (and .lenType, .lenVar) - Width of columns (actually the max number of characters to display in each column)
|
23 |
+
- varInspector.kernels_config - json object defining the kernels specific code and commands.
|
24 |
+
|
25 |
+
|
26 |
+
## Notes
|
27 |
+
- The displayed size of variables uses the `getsizeof()` python method. This method doesn't work for all types, so the reported size is to be considered with some caution. The extension includes some code to correctly return the size of numpy arrays, pandas Series and DataFrame but the size for some other types may be incorrect.
|
28 |
+
- The extension builds on some code provided [here](https://github.com/jupyter-widgets/ipywidgets/blob/master/docs/source/examples/Variable%20Inspector.ipynb) (essentially the `_fill` method)
|
29 |
+
- The extension uses Christian Bach's [table sorter jquery plugin](https://github.com/christianbach/tablesorter). License file is included.
|
30 |
+
|
31 |
+
|
32 |
+
## History
|
33 |
+
|
34 |
+
- @jfbercher march 22, 2017 -- initial release
|
35 |
+
- @jfbercher april 03, 2017 -- multiple kernel support. added support for R kernels.
|
36 |
+
- @jfbercher june 30, 2017 -- fixed #1014 (use of `%reset` with IPython kernel) and #1015 printing with python 2 kernel.
|
.local/share/jupyter/nbextensions/varInspector/__pycache__/var_list.cpython-310.pyc
ADDED
Binary file (2.02 kB). View file
|
|
.local/share/jupyter/nbextensions/varInspector/demo.gif
ADDED
.local/share/jupyter/nbextensions/varInspector/icon.png
ADDED
.local/share/jupyter/nbextensions/varInspector/tablesorter_LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
The MIT License (MIT)
|
2 |
+
|
3 |
+
Copyright (c) 2014 Christian Bach
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
.local/share/jupyter/nbextensions/varInspector/var_list.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from sys import getsizeof
|
3 |
+
|
4 |
+
from IPython import get_ipython
|
5 |
+
from IPython.core.magics.namespace import NamespaceMagics
|
6 |
+
_nms = NamespaceMagics()
|
7 |
+
_Jupyter = get_ipython()
|
8 |
+
_nms.shell = _Jupyter.kernel.shell
|
9 |
+
|
10 |
+
try:
|
11 |
+
import numpy as np
|
12 |
+
except ImportError:
|
13 |
+
pass
|
14 |
+
|
15 |
+
def _getsizeof(x):
|
16 |
+
# return the size of variable x. Amended version of sys.getsizeof
|
17 |
+
# which also supports ndarray, Series and DataFrame
|
18 |
+
if type(x).__name__ in ['ndarray', 'Series']:
|
19 |
+
return x.nbytes
|
20 |
+
elif type(x).__name__ == 'DataFrame':
|
21 |
+
return x.memory_usage().sum()
|
22 |
+
else:
|
23 |
+
return getsizeof(x)
|
24 |
+
|
25 |
+
def _getshapeof(x):
|
26 |
+
#returns the shape of x if it has one
|
27 |
+
#returns None otherwise - might want to return an empty string for an empty column
|
28 |
+
try:
|
29 |
+
return x.shape
|
30 |
+
except AttributeError: #x does not have a shape
|
31 |
+
return None
|
32 |
+
|
33 |
+
def _getcontentof(x):
|
34 |
+
length = 150
|
35 |
+
if type(x).__name__ == 'DataFrame':
|
36 |
+
colnames = ', '.join(x.columns.map(str))
|
37 |
+
content = "Column names: %s" % colnames
|
38 |
+
elif type(x).__name__ == 'Series':
|
39 |
+
content = "Series [%d rows]" % x.shape
|
40 |
+
elif type(x).__name__ == 'ndarray':
|
41 |
+
content = x.__repr__()
|
42 |
+
else:
|
43 |
+
if hasattr(x, '__len__'):
|
44 |
+
if len(x) > length:
|
45 |
+
content = str(x[:length])
|
46 |
+
else:
|
47 |
+
content = str(x)
|
48 |
+
if len(content) > 150:
|
49 |
+
return content[:150] + " ..."
|
50 |
+
return content
|
51 |
+
|
52 |
+
def var_dic_list():
    """Return a JSON list describing the variables reported by ``who_ls``.

    Each entry is a dict with the keys 'varName', 'varType', 'varSize',
    'varShape' and 'varContent'. The helper names of this script and
    objects whose type name is in ``types_to_exclude`` are left out.
    """
    types_to_exclude = ['module', 'function', 'builtin_function_or_method',
                        'instance', '_Feature', 'type', 'ufunc']
    names_to_exclude = ['_html', '_nms', 'NamespaceMagics', '_Jupyter']

    vardic = []
    for name in _nms.who_ls():
        if name in names_to_exclude:
            continue
        # Resolve each name once, against the globals only, so that a user
        # variable that happens to share a name with one of this function's
        # locals is not shadowed.
        # NOTE(review): eval() is applied to names returned by who_ls().
        obj = eval(name, globals())  # noqa
        type_name = type(obj).__name__
        if type_name in types_to_exclude:
            continue
        shape = _getshapeof(obj)
        vardic.append({
            'varName': name,
            'varType': type_name,
            'varSize': str(_getsizeof(obj)),
            'varShape': str(shape) if shape else '',
            'varContent': _getcontentof(obj),
        })
    return json.dumps(vardic)
|
60 |
+
|
61 |
+
|
62 |
+
# command to refresh the list of variables
|
63 |
+
print(var_dic_list())
|
.local/share/jupyter/nbextensions/varInspector/var_list.r
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
library(jsonlite)
|
2 |
+
# Return a JSON list describing every non-function object of the global
# environment, with the fields varName, varType, varSize and varContent.
var_dic_list = function(){
  ll = ls(.GlobalEnv, all.names = FALSE)
  varList = list()
  iter = 1
  for (k in ll){
    # Look the name up in the global environment only, so that a user
    # variable called e.g. `k` or `ll` is not hidden by this function's
    # own locals.
    obj = get(k, envir = .GlobalEnv)
    # is.function() always gives a single TRUE/FALSE; comparing class()
    # to a string gives a vector for objects that have several classes.
    if (!is.function(obj)){
      l = list(varName = k, varType = class(obj), varSize = object.size(obj), varContent = substr(obj, 0, 200))
      varList[[iter]] = l
      iter = iter + 1
    }
  }
  return(toJSON(varList, simplifyVector = FALSE, force=TRUE))
}
|
17 |
+
cat(var_dic_list())
|
.local/share/jupyter/nbextensions/zenmode/README.md
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Zenmode
|
2 |
+
=======
|
3 |
+
|
4 |
+
A little extension to give Zenmode functionality to the IPython notebook
|
.local/share/jupyter/nbextensions/zenmode/images/back2.jpg
ADDED
.local/share/jupyter/nbextensions/zenmode/images/back21.jpg
ADDED
.local/share/jupyter/nbextensions/zenmode/images/back3.jpg
ADDED
.local/share/jupyter/nbextensions/zenmode/images/ipynblogo0.png
ADDED
.local/share/jupyter/nbextensions/zenmode/images/ipynblogo1.png
ADDED
.local/share/jupyter/nbextensions/zenmode/main.css
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.navbar-inner {
|
2 |
+
opacity: 0.5;
|
3 |
+
-webkit-transition: opacity 0.3s ease-in-out;
|
4 |
+
-moz-transition: opacity 0.3s ease-in-out;
|
5 |
+
-o-transition: opacity 0.3s ease-in-out;
|
6 |
+
transition: opacity 0.3s ease-in-out;
|
7 |
+
}
|
8 |
+
|
9 |
+
.navbar-inner:hover {
|
10 |
+
opacity: 1.0;
|
11 |
+
}
|
12 |
+
|
13 |
+
#maintoolbar .navbar-text {
|
14 |
+
display: none !important;
|
15 |
+
}
|
16 |
+
|
17 |
+
#notebook-container {
|
18 |
+
background-color: rgba(255, 255, 255, 0);
|
19 |
+
}
|
20 |
+
|
21 |
+
/*
|
22 |
+
.cell {
|
23 |
+
background-color: rgb(255, 255, 255);
|
24 |
+
}
|
25 |
+
|
26 |
+
.CodeMirror {
|
27 |
+
background: #F8FCCF;
|
28 |
+
}
|
29 |
+
|
30 |
+
div.input_area {
|
31 |
+
margin: 2px;
|
32 |
+
border: none;
|
33 |
+
}
|
34 |
+
*/
|
.local/share/jupyter/runtime/jpserver-434.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_url": "/",
|
3 |
+
"hostname": "0.0.0.0",
|
4 |
+
"password": false,
|
5 |
+
"pid": 434,
|
6 |
+
"port": 8080,
|
7 |
+
"root_dir": "/root",
|
8 |
+
"secure": true,
|
9 |
+
"sock": "",
|
10 |
+
"token": "5a434251505375f2b42435914de608ef3450739f4e14b0be1cfeae3b7364239e",
|
11 |
+
"url": "https://184d1c0992ce:8080/",
|
12 |
+
"version": "2.12.5"
|
13 |
+
}
|
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.cubin
ADDED
Binary file (13.3 kB). View file
|
|
.triton/dump/0f43b9f3b1f9407355f6ad39f7d56744/triton_.ttgir
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [0, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2d3de4e(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<256> : tensor<4x1xi64, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<0> : tensor<4x1xi64, #blocked>
|
7 |
+
%cst_1 = arith.constant dense<512> : tensor<4x1xi64, #blocked>
|
8 |
+
%cst_2 = arith.constant dense<256> : tensor<4x1xi32, #blocked>
|
9 |
+
%cst_3 = arith.constant dense<131072> : tensor<1x128xi32, #blocked1>
|
10 |
+
%cst_4 = arith.constant dense<120> : tensor<1x128xi32, #blocked1>
|
11 |
+
%cst_5 = arith.constant dense<0.000000e+00> : tensor<4x128xf32, #blocked1>
|
12 |
+
%cst_6 = arith.constant dense<true> : tensor<4x1xi1, #blocked>
|
13 |
+
%c4_i32 = arith.constant 4 : i32
|
14 |
+
%0 = tt.get_program_id x : i32
|
15 |
+
%1 = arith.muli %0, %c4_i32 : i32
|
16 |
+
%2 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
17 |
+
%3 = tt.make_range {end = 4 : i32, start = 0 : i32} : tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
18 |
+
%4 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<4x1xi32, #blocked1>
|
19 |
+
%5 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<4xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<4x1xi32, #blocked>
|
20 |
+
%6 = tt.splat %1 : (i32) -> tensor<4x1xi32, #blocked1>
|
21 |
+
%7 = tt.splat %1 : (i32) -> tensor<4x1xi32, #blocked>
|
22 |
+
%8 = arith.addi %6, %4 : tensor<4x1xi32, #blocked1>
|
23 |
+
%9 = arith.addi %7, %5 : tensor<4x1xi32, #blocked>
|
24 |
+
%10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
|
25 |
+
%11 = tt.expand_dims %10 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x128xi32, #blocked1>
|
26 |
+
%12 = arith.cmpi slt, %11, %cst_4 : tensor<1x128xi32, #blocked1>
|
27 |
+
%13 = arith.muli %11, %cst_3 : tensor<1x128xi32, #blocked1>
|
28 |
+
%14 = tt.broadcast %8 : (tensor<4x1xi32, #blocked1>) -> tensor<4x128xi32, #blocked1>
|
29 |
+
%15 = tt.broadcast %13 : (tensor<1x128xi32, #blocked1>) -> tensor<4x128xi32, #blocked1>
|
30 |
+
%16 = arith.addi %14, %15 : tensor<4x128xi32, #blocked1>
|
31 |
+
%17 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<4x128x!tt.ptr<f32, 1>, #blocked1>
|
32 |
+
%18 = tt.addptr %17, %16 : tensor<4x128x!tt.ptr<f32, 1>, #blocked1>, tensor<4x128xi32, #blocked1>
|
33 |
+
%19 = tt.broadcast %12 : (tensor<1x128xi1, #blocked1>) -> tensor<4x128xi1, #blocked1>
|
34 |
+
%20 = tt.load %18, %19, %cst_5 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<4x128xf32, #blocked1>
|
35 |
+
%21 = arith.addf %20, %cst_5 : tensor<4x128xf32, #blocked1>
|
36 |
+
%22 = arith.select %19, %21, %cst_5 : tensor<4x128xi1, #blocked1>, tensor<4x128xf32, #blocked1>
|
37 |
+
%23 = "tt.reduce"(%22) <{axis = 1 : i32}> ({
|
38 |
+
^bb0(%arg5: f32, %arg6: f32):
|
39 |
+
%40 = arith.addf %arg5, %arg6 : f32
|
40 |
+
tt.reduce.return %40 : f32
|
41 |
+
}) : (tensor<4x128xf32, #blocked1>) -> tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
|
42 |
+
%24 = triton_gpu.convert_layout %23 : (tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
|
43 |
+
%25 = tt.expand_dims %24 {axis = 1 : i32} : (tensor<4xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<4x1xf32, #blocked>
|
44 |
+
%26 = arith.divsi %9, %cst_2 : tensor<4x1xi32, #blocked>
|
45 |
+
%27 = arith.remsi %9, %cst_2 : tensor<4x1xi32, #blocked>
|
46 |
+
%28 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<4x1x!tt.ptr<i64, 1>, #blocked>
|
47 |
+
%29 = tt.addptr %28, %26 : tensor<4x1x!tt.ptr<i64, 1>, #blocked>, tensor<4x1xi32, #blocked>
|
48 |
+
%30 = tt.load %29 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<4x1xi64, #blocked>
|
49 |
+
%31 = arith.addi %30, %cst_1 : tensor<4x1xi64, #blocked>
|
50 |
+
%32 = arith.cmpi slt, %30, %cst_0 : tensor<4x1xi64, #blocked>
|
51 |
+
%33 = arith.select %32, %31, %30 : tensor<4x1xi1, #blocked>, tensor<4x1xi64, #blocked>
|
52 |
+
%34 = arith.muli %33, %cst : tensor<4x1xi64, #blocked>
|
53 |
+
%35 = arith.extsi %27 : tensor<4x1xi32, #blocked> to tensor<4x1xi64, #blocked>
|
54 |
+
%36 = arith.addi %35, %34 : tensor<4x1xi64, #blocked>
|
55 |
+
%37 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<4x1x!tt.ptr<f32, 1>, #blocked>
|
56 |
+
%38 = tt.addptr %37, %36 : tensor<4x1x!tt.ptr<f32, 1>, #blocked>, tensor<4x1xi64, #blocked>
|
57 |
+
%39 = "tt.atomic_rmw"(%38, %25, %cst_6) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<4x1x!tt.ptr<f32, 1>, #blocked>, tensor<4x1xf32, #blocked>, tensor<4x1xi1, #blocked>) -> tensor<4x1xf32, #blocked>
|
58 |
+
tt.return
|
59 |
+
}
|
60 |
+
}
|
.triton/dump/199215289adb100508718a5a762ba4d7/triton_.cubin
ADDED
Binary file (13 kB). View file
|
|
.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ptx
ADDED
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2de
|
10 |
+
.extern .func __assertfail
|
11 |
+
(
|
12 |
+
.param .b64 __assertfail_param_0,
|
13 |
+
.param .b64 __assertfail_param_1,
|
14 |
+
.param .b32 __assertfail_param_2,
|
15 |
+
.param .b64 __assertfail_param_3,
|
16 |
+
.param .b64 __assertfail_param_4
|
17 |
+
)
|
18 |
+
;
|
19 |
+
.global .align 1 .b8 assertFunc_0[25] = {95, 99, 97, 108, 108, 95, 119, 105, 116, 104, 95, 102, 114, 97, 109, 101, 115, 95, 114, 101, 109, 111, 118, 101, 100};
|
20 |
+
.global .align 1 .b8 assertFile_0[38] = {60, 102, 114, 111, 122, 101, 110, 32, 105, 109, 112, 111, 114, 116, 108, 105, 98, 46, 95, 98, 111, 111, 116, 115, 116, 114, 97, 112, 95, 101, 120, 116, 101, 114, 110, 97, 108, 62};
|
21 |
+
.global .align 1 .b8 assertMessage_0[38] = {105, 110, 100, 101, 120, 32, 111, 117, 116, 32, 111, 102, 32, 98, 111, 117, 110, 100, 115, 58, 32, 48, 32, 60, 61, 32, 116, 109, 112, 55, 32, 60, 32, 53, 48, 50, 53, 55};
|
22 |
+
.extern .shared .align 1 .b8 global_smem[];
|
23 |
+
|
24 |
+
.visible .entry triton__0d1d2de(
|
25 |
+
.param .u64 triton__0d1d2de_param_0,
|
26 |
+
.param .u64 triton__0d1d2de_param_1,
|
27 |
+
.param .u64 triton__0d1d2de_param_2
|
28 |
+
)
|
29 |
+
.maxntid 128, 1, 1
|
30 |
+
{
|
31 |
+
.reg .pred %p<24>;
|
32 |
+
.reg .b16 %rs<21>;
|
33 |
+
.reg .b32 %r<21>;
|
34 |
+
.reg .b64 %rd<58>;
|
35 |
+
.loc 1 18 0
|
36 |
+
$L__func_begin0:
|
37 |
+
.loc 1 18 0
|
38 |
+
|
39 |
+
ld.param.u64 %rd9, [triton__0d1d2de_param_1];
|
40 |
+
ld.param.u64 %rd16, [triton__0d1d2de_param_0];
|
41 |
+
$L__tmp0:
|
42 |
+
.loc 1 21 36
|
43 |
+
mov.u32 %r4, %tid.x;
|
44 |
+
and.b32 %r1, %r4, 127;
|
45 |
+
shl.b32 %r2, %r1, 1;
|
46 |
+
or.b32 %r5, %r2, 1;
|
47 |
+
or.b32 %r6, %r2, 256;
|
48 |
+
.loc 1 20 28
|
49 |
+
mov.u32 %r3, %ctaid.x;
|
50 |
+
.loc 1 20 46
|
51 |
+
mul.wide.s32 %rd1, %r3, 512;
|
52 |
+
cvt.u64.u32 %rd17, %r2;
|
53 |
+
cvt.u64.u32 %rd18, %r6;
|
54 |
+
.loc 1 21 23
|
55 |
+
or.b64 %rd2, %rd1, %rd17;
|
56 |
+
or.b64 %rd3, %rd1, %rd18;
|
57 |
+
.loc 1 24 30
|
58 |
+
shl.b64 %rd19, %rd2, 3;
|
59 |
+
add.s64 %rd12, %rd16, %rd19;
|
60 |
+
add.s64 %rd15, %rd12, 2048;
|
61 |
+
mov.pred %p20, -1;
|
62 |
+
.loc 1 24 35
|
63 |
+
mov.u64 %rd10, 0x0;
|
64 |
+
mov.u64 %rd11, 0x0;
|
65 |
+
@%p20 ld.global.v2.b64 { %rd10, %rd11 }, [ %rd12 + 0 ];
|
66 |
+
mov.u64 %rd13, 0x0;
|
67 |
+
mov.u64 %rd14, 0x0;
|
68 |
+
@%p20 ld.global.v2.b64 { %rd13, %rd14 }, [ %rd15 + 0 ];
|
69 |
+
.loc 1 26 19
|
70 |
+
setp.eq.s64 %p3, %rd14, -1;
|
71 |
+
setp.eq.s64 %p4, %rd13, -1;
|
72 |
+
setp.eq.s64 %p5, %rd11, -1;
|
73 |
+
setp.eq.s64 %p6, %rd10, -1;
|
74 |
+
.loc 1 28 32
|
75 |
+
selp.b64 %rd20, 0, %rd10, %p6;
|
76 |
+
selp.b64 %rd21, 0, %rd11, %p5;
|
77 |
+
selp.b64 %rd22, 0, %rd13, %p4;
|
78 |
+
selp.b64 %rd23, 0, %rd14, %p3;
|
79 |
+
.loc 1 29 18
|
80 |
+
add.s64 %rd24, %rd23, 50257;
|
81 |
+
add.s64 %rd25, %rd22, 50257;
|
82 |
+
add.s64 %rd26, %rd21, 50257;
|
83 |
+
add.s64 %rd27, %rd20, 50257;
|
84 |
+
.loc 1 30 18
|
85 |
+
setp.lt.s64 %p7, %rd23, 0;
|
86 |
+
setp.lt.s64 %p8, %rd22, 0;
|
87 |
+
setp.lt.s64 %p9, %rd21, 0;
|
88 |
+
setp.lt.s64 %p10, %rd20, 0;
|
89 |
+
.loc 1 31 32
|
90 |
+
selp.b64 %rd7, %rd27, %rd20, %p10;
|
91 |
+
selp.b64 %rd6, %rd26, %rd21, %p9;
|
92 |
+
selp.b64 %rd5, %rd25, %rd22, %p8;
|
93 |
+
selp.b64 %rd4, %rd24, %rd23, %p7;
|
94 |
+
.loc 1 32 36
|
95 |
+
setp.lt.u64 %p11, %rd4, 50257;
|
96 |
+
setp.lt.u64 %p12, %rd5, 50257;
|
97 |
+
setp.lt.u64 %p13, %rd6, 50257;
|
98 |
+
setp.lt.u64 %p14, %rd7, 50257;
|
99 |
+
mov.u32 %r7, global_smem;
|
100 |
+
add.s32 %r8, %r7, %r2;
|
101 |
+
selp.u16 %rs1, 1, 0, %p14;
|
102 |
+
st.shared.u8 [%r8], %rs1;
|
103 |
+
cvt.u64.u32 %rd8, %r5;
|
104 |
+
selp.u16 %rs2, 1, 0, %p13;
|
105 |
+
st.shared.u8 [%r8+1], %rs2;
|
106 |
+
bar.sync 0;
|
107 |
+
add.s32 %r9, %r7, %r1;
|
108 |
+
ld.shared.u8 %rs3, [%r9];
|
109 |
+
ld.shared.u8 %rs4, [%r9+128];
|
110 |
+
bar.sync 0;
|
111 |
+
selp.u16 %rs5, 1, 0, %p12;
|
112 |
+
st.shared.u8 [%r8], %rs5;
|
113 |
+
selp.u16 %rs6, 1, 0, %p11;
|
114 |
+
st.shared.u8 [%r8+1], %rs6;
|
115 |
+
bar.sync 0;
|
116 |
+
ld.shared.u8 %rs7, [%r9];
|
117 |
+
ld.shared.u8 %rs8, [%r9+128];
|
118 |
+
setp.eq.s16 %p15, %rs7, 0;
|
119 |
+
selp.u16 %rs9, 1, 0, %p15;
|
120 |
+
shl.b16 %rs10, %rs9, 2;
|
121 |
+
setp.eq.s16 %p16, %rs8, 0;
|
122 |
+
selp.u16 %rs11, -1, 0, %p16;
|
123 |
+
shl.b16 %rs12, %rs11, 3;
|
124 |
+
or.b16 %rs13, %rs12, %rs10;
|
125 |
+
setp.eq.s16 %p17, %rs4, 0;
|
126 |
+
selp.u16 %rs14, 1, 0, %p17;
|
127 |
+
setp.eq.s16 %p18, %rs3, 0;
|
128 |
+
selp.u16 %rs15, -1, 0, %p18;
|
129 |
+
shl.b16 %rs16, %rs15, 1;
|
130 |
+
or.b16 %rs17, %rs14, %rs16;
|
131 |
+
and.b16 %rs18, %rs17, 3;
|
132 |
+
or.b16 %rs19, %rs18, %rs13;
|
133 |
+
.loc 1 32 51
|
134 |
+
and.b16 %rs20, %rs19, 15;
|
135 |
+
setp.eq.s16 %p19, %rs20, 0;
|
136 |
+
@%p19 bra $L__BB0_2;
|
137 |
+
mov.u64 %rd28, assertMessage_0;
|
138 |
+
cvta.global.u64 %rd29, %rd28;
|
139 |
+
mov.u64 %rd30, assertFile_0;
|
140 |
+
cvta.global.u64 %rd31, %rd30;
|
141 |
+
mov.u64 %rd32, assertFunc_0;
|
142 |
+
cvta.global.u64 %rd33, %rd32;
|
143 |
+
mov.b32 %r10, 883;
|
144 |
+
mov.u64 %rd34, 1;
|
145 |
+
{ // callseq 0, 0
|
146 |
+
.reg .b32 temp_param_reg;
|
147 |
+
.param .b64 param0;
|
148 |
+
st.param.b64 [param0+0], %rd29;
|
149 |
+
.param .b64 param1;
|
150 |
+
st.param.b64 [param1+0], %rd31;
|
151 |
+
.param .b32 param2;
|
152 |
+
st.param.b32 [param2+0], %r10;
|
153 |
+
.param .b64 param3;
|
154 |
+
st.param.b64 [param3+0], %rd33;
|
155 |
+
.param .b64 param4;
|
156 |
+
st.param.b64 [param4+0], %rd34;
|
157 |
+
call.uni
|
158 |
+
__assertfail,
|
159 |
+
(
|
160 |
+
param0,
|
161 |
+
param1,
|
162 |
+
param2,
|
163 |
+
param3,
|
164 |
+
param4
|
165 |
+
);
|
166 |
+
} // callseq 0
|
167 |
+
$L__BB0_2:
|
168 |
+
.loc 1 21 36
|
169 |
+
or.b32 %r15, %r2, 257;
|
170 |
+
cvt.u64.u32 %rd39, %r15;
|
171 |
+
.loc 1 21 23
|
172 |
+
or.b64 %rd40, %rd1, %rd39;
|
173 |
+
or.b64 %rd41, %rd1, %rd8;
|
174 |
+
.loc 1 34 25
|
175 |
+
shl.b64 %rd42, %rd7, 2;
|
176 |
+
add.s64 %rd43, %rd9, %rd42;
|
177 |
+
mul.lo.s64 %rd44, %rd2, 201028;
|
178 |
+
add.s64 %rd45, %rd43, %rd44;
|
179 |
+
shl.b64 %rd46, %rd6, 2;
|
180 |
+
add.s64 %rd47, %rd9, %rd46;
|
181 |
+
mul.lo.s64 %rd48, %rd41, 201028;
|
182 |
+
add.s64 %rd49, %rd47, %rd48;
|
183 |
+
shl.b64 %rd50, %rd5, 2;
|
184 |
+
add.s64 %rd51, %rd9, %rd50;
|
185 |
+
mul.lo.s64 %rd52, %rd3, 201028;
|
186 |
+
add.s64 %rd53, %rd51, %rd52;
|
187 |
+
shl.b64 %rd54, %rd4, 2;
|
188 |
+
add.s64 %rd55, %rd9, %rd54;
|
189 |
+
mul.lo.s64 %rd56, %rd40, 201028;
|
190 |
+
add.s64 %rd57, %rd55, %rd56;
|
191 |
+
.loc 1 34 51
|
192 |
+
bar.sync 0;
|
193 |
+
shl.b32 %r16, %r2, 3;
|
194 |
+
add.s32 %r18, %r7, %r16;
|
195 |
+
st.shared.u64 [%r18], %rd45;
|
196 |
+
st.shared.u64 [%r18+8], %rd49;
|
197 |
+
bar.sync 0;
|
198 |
+
shl.b32 %r19, %r1, 3;
|
199 |
+
add.s32 %r20, %r7, %r19;
|
200 |
+
ld.shared.u64 %rd35, [%r20];
|
201 |
+
ld.shared.u64 %rd36, [%r20+1024];
|
202 |
+
bar.sync 0;
|
203 |
+
st.shared.u64 [%r18], %rd53;
|
204 |
+
st.shared.u64 [%r18+8], %rd57;
|
205 |
+
bar.sync 0;
|
206 |
+
ld.shared.u64 %rd37, [%r20];
|
207 |
+
ld.shared.u64 %rd38, [%r20+1024];
|
208 |
+
mov.b32 %r11, -1082130432;
|
209 |
+
@%p20 st.global.b32 [ %rd35 + 0 ], { %r11 };
|
210 |
+
@%p20 st.global.b32 [ %rd36 + 0 ], { %r11 };
|
211 |
+
@%p20 st.global.b32 [ %rd37 + 0 ], { %r11 };
|
212 |
+
@%p20 st.global.b32 [ %rd38 + 0 ], { %r11 };
|
213 |
+
.loc 1 34 4
|
214 |
+
ret;
|
215 |
+
$L__tmp1:
|
216 |
+
$L__func_end0:
|
217 |
+
|
218 |
+
}
|
219 |
+
.file 1 "/tmp/torchinductor_root/hl/chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py"
|
220 |
+
.section .debug_abbrev
|
221 |
+
{
|
222 |
+
.b8 1
|
223 |
+
.b8 17
|
224 |
+
.b8 1
|
225 |
+
.b8 37
|
226 |
+
.b8 8
|
227 |
+
.b8 19
|
228 |
+
.b8 5
|
229 |
+
.b8 3
|
230 |
+
.b8 8
|
231 |
+
.b8 16
|
232 |
+
.b8 6
|
233 |
+
.b8 27
|
234 |
+
.b8 8
|
235 |
+
.b8 180
|
236 |
+
.b8 66
|
237 |
+
.b8 12
|
238 |
+
.b8 17
|
239 |
+
.b8 1
|
240 |
+
.b8 18
|
241 |
+
.b8 1
|
242 |
+
.b8 0
|
243 |
+
.b8 0
|
244 |
+
.b8 2
|
245 |
+
.b8 46
|
246 |
+
.b8 0
|
247 |
+
.b8 17
|
248 |
+
.b8 1
|
249 |
+
.b8 18
|
250 |
+
.b8 1
|
251 |
+
.b8 64
|
252 |
+
.b8 10
|
253 |
+
.b8 135
|
254 |
+
.b8 64
|
255 |
+
.b8 8
|
256 |
+
.b8 3
|
257 |
+
.b8 8
|
258 |
+
.b8 58
|
259 |
+
.b8 11
|
260 |
+
.b8 59
|
261 |
+
.b8 11
|
262 |
+
.b8 63
|
263 |
+
.b8 12
|
264 |
+
.b8 0
|
265 |
+
.b8 0
|
266 |
+
.b8 0
|
267 |
+
}
|
268 |
+
.section .debug_info
|
269 |
+
{
|
270 |
+
.b32 176
|
271 |
+
.b8 2
|
272 |
+
.b8 0
|
273 |
+
.b32 .debug_abbrev
|
274 |
+
.b8 8
|
275 |
+
.b8 1
|
276 |
+
.b8 116
|
277 |
+
.b8 114
|
278 |
+
.b8 105
|
279 |
+
.b8 116
|
280 |
+
.b8 111
|
281 |
+
.b8 110
|
282 |
+
.b8 0
|
283 |
+
.b8 2
|
284 |
+
.b8 0
|
285 |
+
.b8 99
|
286 |
+
.b8 104
|
287 |
+
.b8 108
|
288 |
+
.b8 114
|
289 |
+
.b8 107
|
290 |
+
.b8 103
|
291 |
+
.b8 112
|
292 |
+
.b8 118
|
293 |
+
.b8 118
|
294 |
+
.b8 98
|
295 |
+
.b8 100
|
296 |
+
.b8 105
|
297 |
+
.b8 122
|
298 |
+
.b8 100
|
299 |
+
.b8 122
|
300 |
+
.b8 55
|
301 |
+
.b8 115
|
302 |
+
.b8 108
|
303 |
+
.b8 108
|
304 |
+
.b8 113
|
305 |
+
.b8 117
|
306 |
+
.b8 101
|
307 |
+
.b8 116
|
308 |
+
.b8 50
|
309 |
+
.b8 106
|
310 |
+
.b8 55
|
311 |
+
.b8 122
|
312 |
+
.b8 104
|
313 |
+
.b8 116
|
314 |
+
.b8 101
|
315 |
+
.b8 115
|
316 |
+
.b8 54
|
317 |
+
.b8 109
|
318 |
+
.b8 101
|
319 |
+
.b8 104
|
320 |
+
.b8 54
|
321 |
+
.b8 107
|
322 |
+
.b8 101
|
323 |
+
.b8 110
|
324 |
+
.b8 114
|
325 |
+
.b8 113
|
326 |
+
.b8 120
|
327 |
+
.b8 111
|
328 |
+
.b8 118
|
329 |
+
.b8 50
|
330 |
+
.b8 54
|
331 |
+
.b8 109
|
332 |
+
.b8 115
|
333 |
+
.b8 119
|
334 |
+
.b8 118
|
335 |
+
.b8 119
|
336 |
+
.b8 55
|
337 |
+
.b8 46
|
338 |
+
.b8 112
|
339 |
+
.b8 121
|
340 |
+
.b8 0
|
341 |
+
.b32 .debug_line
|
342 |
+
.b8 47
|
343 |
+
.b8 116
|
344 |
+
.b8 109
|
345 |
+
.b8 112
|
346 |
+
.b8 47
|
347 |
+
.b8 116
|
348 |
+
.b8 111
|
349 |
+
.b8 114
|
350 |
+
.b8 99
|
351 |
+
.b8 104
|
352 |
+
.b8 105
|
353 |
+
.b8 110
|
354 |
+
.b8 100
|
355 |
+
.b8 117
|
356 |
+
.b8 99
|
357 |
+
.b8 116
|
358 |
+
.b8 111
|
359 |
+
.b8 114
|
360 |
+
.b8 95
|
361 |
+
.b8 114
|
362 |
+
.b8 111
|
363 |
+
.b8 111
|
364 |
+
.b8 116
|
365 |
+
.b8 47
|
366 |
+
.b8 104
|
367 |
+
.b8 108
|
368 |
+
.b8 0
|
369 |
+
.b8 1
|
370 |
+
.b64 $L__func_begin0
|
371 |
+
.b64 $L__func_end0
|
372 |
+
.b8 2
|
373 |
+
.b64 $L__func_begin0
|
374 |
+
.b64 $L__func_end0
|
375 |
+
.b8 1
|
376 |
+
.b8 156
|
377 |
+
.b8 116
|
378 |
+
.b8 114
|
379 |
+
.b8 105
|
380 |
+
.b8 116
|
381 |
+
.b8 111
|
382 |
+
.b8 110
|
383 |
+
.b8 95
|
384 |
+
.b8 95
|
385 |
+
.b8 48
|
386 |
+
.b8 100
|
387 |
+
.b8 49
|
388 |
+
.b8 100
|
389 |
+
.b8 50
|
390 |
+
.b8 100
|
391 |
+
.b8 101
|
392 |
+
.b8 0
|
393 |
+
.b8 116
|
394 |
+
.b8 114
|
395 |
+
.b8 105
|
396 |
+
.b8 116
|
397 |
+
.b8 111
|
398 |
+
.b8 110
|
399 |
+
.b8 95
|
400 |
+
.b8 95
|
401 |
+
.b8 48
|
402 |
+
.b8 100
|
403 |
+
.b8 49
|
404 |
+
.b8 100
|
405 |
+
.b8 50
|
406 |
+
.b8 100
|
407 |
+
.b8 101
|
408 |
+
.b8 0
|
409 |
+
.b8 1
|
410 |
+
.b8 18
|
411 |
+
.b8 1
|
412 |
+
.b8 0
|
413 |
+
}
|
414 |
+
.section .debug_pubnames
|
415 |
+
{
|
416 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
417 |
+
$L__pubNames_start0:
|
418 |
+
.b8 2
|
419 |
+
.b8 0
|
420 |
+
.b32 .debug_info
|
421 |
+
.b32 180
|
422 |
+
.b32 125
|
423 |
+
.b8 116
|
424 |
+
.b8 114
|
425 |
+
.b8 105
|
426 |
+
.b8 116
|
427 |
+
.b8 111
|
428 |
+
.b8 110
|
429 |
+
.b8 95
|
430 |
+
.b8 95
|
431 |
+
.b8 48
|
432 |
+
.b8 100
|
433 |
+
.b8 49
|
434 |
+
.b8 100
|
435 |
+
.b8 50
|
436 |
+
.b8 100
|
437 |
+
.b8 101
|
438 |
+
.b8 0
|
439 |
+
.b32 0
|
440 |
+
$L__pubNames_end0:
|
441 |
+
}
|
442 |
+
.section .debug_pubtypes
|
443 |
+
{
|
444 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
445 |
+
$L__pubTypes_start0:
|
446 |
+
.b8 2
|
447 |
+
.b8 0
|
448 |
+
.b32 .debug_info
|
449 |
+
.b32 180
|
450 |
+
.b32 0
|
451 |
+
$L__pubTypes_end0:
|
452 |
+
}
|
453 |
+
.section .debug_loc { }
|
.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttgir
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
3 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
4 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
5 |
+
%cst = arith.constant dense<50257> : tensor<512xi64, #blocked>
|
6 |
+
%cst_0 = arith.constant dense<0> : tensor<512xi64, #blocked>
|
7 |
+
%cst_1 = arith.constant dense<-1> : tensor<512xi64, #blocked>
|
8 |
+
%cst_2 = arith.constant dense<-1.000000e+00> : tensor<512xf32, #blocked1>
|
9 |
+
%c512_i64 = arith.constant 512 : i64
|
10 |
+
%0 = tt.get_program_id x : i32
|
11 |
+
%1 = arith.extsi %0 : i32 to i64
|
12 |
+
%2 = arith.muli %1, %c512_i64 : i64
|
13 |
+
%3 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
|
14 |
+
%4 = arith.extsi %3 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked>
|
15 |
+
%5 = tt.splat %2 : (i64) -> tensor<512xi64, #blocked>
|
16 |
+
%6 = arith.addi %5, %4 : tensor<512xi64, #blocked>
|
17 |
+
%7 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<512x!tt.ptr<i64, 1>, #blocked>
|
18 |
+
%8 = tt.addptr %7, %6 : tensor<512x!tt.ptr<i64, 1>, #blocked>, tensor<512xi64, #blocked>
|
19 |
+
%9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xi64, #blocked>
|
20 |
+
%10 = arith.cmpi ne, %9, %cst_1 : tensor<512xi64, #blocked>
|
21 |
+
%11 = arith.select %10, %9, %cst_0 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked>
|
22 |
+
%12 = arith.addi %11, %cst : tensor<512xi64, #blocked>
|
23 |
+
%13 = arith.cmpi slt, %11, %cst_0 : tensor<512xi64, #blocked>
|
24 |
+
%14 = arith.select %13, %12, %11 : tensor<512xi1, #blocked>, tensor<512xi64, #blocked>
|
25 |
+
%15 = arith.cmpi sge, %14, %cst_0 : tensor<512xi64, #blocked>
|
26 |
+
%16 = arith.cmpi slt, %14, %cst : tensor<512xi64, #blocked>
|
27 |
+
%17 = arith.andi %15, %16 : tensor<512xi1, #blocked>
|
28 |
+
%18 = triton_gpu.convert_layout %17 : (tensor<512xi1, #blocked>) -> tensor<512xi1, #blocked1>
|
29 |
+
tt.assert %18, "index out of bounds: 0 <= tmp7 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<512xi1, #blocked1>
|
30 |
+
%19 = arith.muli %6, %cst : tensor<512xi64, #blocked>
|
31 |
+
%20 = arith.addi %14, %19 : tensor<512xi64, #blocked>
|
32 |
+
%21 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
|
33 |
+
%22 = tt.addptr %21, %20 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi64, #blocked>
|
34 |
+
%23 = triton_gpu.convert_layout %22 : (tensor<512x!tt.ptr<f32, 1>, #blocked>) -> tensor<512x!tt.ptr<f32, 1>, #blocked1>
|
35 |
+
tt.store %23, %cst_2 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked1>
|
36 |
+
tt.return
|
37 |
+
}
|
38 |
+
}
|
.triton/dump/199215289adb100508718a5a762ba4d7/triton_.ttir
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<50257> : tensor<512xi64>
|
4 |
+
%cst_0 = arith.constant dense<0> : tensor<512xi64>
|
5 |
+
%c512_i64 = arith.constant 512 : i64
|
6 |
+
%cst_1 = arith.constant dense<-1.000000e+00> : tensor<512xf32>
|
7 |
+
%cst_2 = arith.constant dense<-1> : tensor<512xi64>
|
8 |
+
%0 = tt.get_program_id x : i32
|
9 |
+
%1 = arith.extsi %0 : i32 to i64
|
10 |
+
%2 = arith.muli %1, %c512_i64 : i64
|
11 |
+
%3 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
|
12 |
+
%4 = arith.extsi %3 : tensor<512xi32> to tensor<512xi64>
|
13 |
+
%5 = tt.splat %2 : (i64) -> tensor<512xi64>
|
14 |
+
%6 = arith.addi %5, %4 : tensor<512xi64>
|
15 |
+
%7 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<512x!tt.ptr<i64, 1>>
|
16 |
+
%8 = tt.addptr %7, %6 : tensor<512x!tt.ptr<i64, 1>>, tensor<512xi64>
|
17 |
+
%9 = tt.load %8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xi64>
|
18 |
+
%10 = arith.cmpi ne, %9, %cst_2 : tensor<512xi64>
|
19 |
+
%11 = arith.select %10, %9, %cst_0 : tensor<512xi1>, tensor<512xi64>
|
20 |
+
%12 = arith.addi %11, %cst : tensor<512xi64>
|
21 |
+
%13 = arith.cmpi slt, %11, %cst_0 : tensor<512xi64>
|
22 |
+
%14 = arith.select %13, %12, %11 : tensor<512xi1>, tensor<512xi64>
|
23 |
+
%15 = arith.cmpi sge, %14, %cst_0 : tensor<512xi64>
|
24 |
+
%16 = arith.cmpi slt, %14, %cst : tensor<512xi64>
|
25 |
+
%17 = arith.andi %15, %16 : tensor<512xi1>
|
26 |
+
tt.assert %17, "index out of bounds: 0 <= tmp7 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<512xi1>
|
27 |
+
%18 = arith.muli %6, %cst : tensor<512xi64>
|
28 |
+
%19 = arith.addi %14, %18 : tensor<512xi64>
|
29 |
+
%20 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
|
30 |
+
%21 = tt.addptr %20, %19 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi64>
|
31 |
+
tt.store %21, %cst_1 {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32>
|
32 |
+
tt.return
|
33 |
+
}
|
34 |
+
}
|
.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ptx
ADDED
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2de
|
10 |
+
|
11 |
+
.visible .entry triton__0d1d2de(
|
12 |
+
.param .u64 triton__0d1d2de_param_0,
|
13 |
+
.param .u64 triton__0d1d2de_param_1,
|
14 |
+
.param .u32 triton__0d1d2de_param_2
|
15 |
+
)
|
16 |
+
.maxntid 128, 1, 1
|
17 |
+
{
|
18 |
+
.reg .pred %p<4>;
|
19 |
+
.reg .b16 %rs<9>;
|
20 |
+
.reg .b32 %r<31>;
|
21 |
+
.reg .b64 %rd<8>;
|
22 |
+
.loc 1 18 0
|
23 |
+
$L__func_begin0:
|
24 |
+
.loc 1 18 0
|
25 |
+
|
26 |
+
ld.param.u64 %rd4, [triton__0d1d2de_param_0];
|
27 |
+
ld.param.u64 %rd5, [triton__0d1d2de_param_1];
|
28 |
+
$L__tmp0:
|
29 |
+
.loc 1 21 36
|
30 |
+
mov.u32 %r22, %tid.x;
|
31 |
+
shl.b32 %r23, %r22, 3;
|
32 |
+
and.b32 %r24, %r23, 1016;
|
33 |
+
.loc 1 20 28
|
34 |
+
mov.u32 %r1, %ctaid.x;
|
35 |
+
.loc 1 20 33
|
36 |
+
shl.b32 %r25, %r1, 10;
|
37 |
+
.loc 1 21 23
|
38 |
+
or.b32 %r26, %r25, %r24;
|
39 |
+
.loc 1 24 30
|
40 |
+
mul.wide.s32 %rd6, %r26, 4;
|
41 |
+
add.s64 %rd1, %rd4, %rd6;
|
42 |
+
add.s64 %rd2, %rd1, 16;
|
43 |
+
mov.pred %p1, -1;
|
44 |
+
.loc 1 24 35
|
45 |
+
mov.u32 %r10, 0x0;
|
46 |
+
mov.u32 %r11, 0x0;
|
47 |
+
mov.u32 %r12, 0x0;
|
48 |
+
mov.u32 %r13, 0x0;
|
49 |
+
@%p1 ld.global.v4.b32 { %r10, %r11, %r12, %r13 }, [ %rd1 + 0 ];
|
50 |
+
mov.u32 %r14, 0x0;
|
51 |
+
mov.u32 %r15, 0x0;
|
52 |
+
mov.u32 %r16, 0x0;
|
53 |
+
mov.u32 %r17, 0x0;
|
54 |
+
@%p1 ld.global.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd2 + 0 ];
|
55 |
+
.loc 1 26 25
|
56 |
+
mul.wide.s32 %rd7, %r26, 2;
|
57 |
+
add.s64 %rd3, %rd5, %rd7;
|
58 |
+
.loc 1 26 36
|
59 |
+
cvt.rn.bf16.f32 %rs1, %r10;
|
60 |
+
cvt.rn.bf16.f32 %rs2, %r11;
|
61 |
+
cvt.rn.bf16.f32 %rs3, %r12;
|
62 |
+
cvt.rn.bf16.f32 %rs4, %r13;
|
63 |
+
cvt.rn.bf16.f32 %rs5, %r14;
|
64 |
+
cvt.rn.bf16.f32 %rs6, %r15;
|
65 |
+
cvt.rn.bf16.f32 %rs7, %r16;
|
66 |
+
cvt.rn.bf16.f32 %rs8, %r17;
|
67 |
+
mov.b32 %r27, {%rs1, %rs2};
|
68 |
+
mov.b32 %r28, {%rs3, %rs4};
|
69 |
+
mov.b32 %r29, {%rs5, %rs6};
|
70 |
+
mov.b32 %r30, {%rs7, %rs8};
|
71 |
+
@%p1 st.global.v4.b32 [ %rd3 + 0 ], { %r27, %r28, %r29, %r30 };
|
72 |
+
.loc 1 26 4
|
73 |
+
ret;
|
74 |
+
$L__tmp1:
|
75 |
+
$L__func_end0:
|
76 |
+
|
77 |
+
}
|
78 |
+
.file 1 "/tmp/torchinductor_root/5t/c5tryp5qwkhreijk7s5x327wofz54lwj4kvctuqdzv2vrf2xyons.py"
|
79 |
+
.section .debug_abbrev
|
80 |
+
{
|
81 |
+
.b8 1
|
82 |
+
.b8 17
|
83 |
+
.b8 1
|
84 |
+
.b8 37
|
85 |
+
.b8 8
|
86 |
+
.b8 19
|
87 |
+
.b8 5
|
88 |
+
.b8 3
|
89 |
+
.b8 8
|
90 |
+
.b8 16
|
91 |
+
.b8 6
|
92 |
+
.b8 27
|
93 |
+
.b8 8
|
94 |
+
.b8 180
|
95 |
+
.b8 66
|
96 |
+
.b8 12
|
97 |
+
.b8 17
|
98 |
+
.b8 1
|
99 |
+
.b8 18
|
100 |
+
.b8 1
|
101 |
+
.b8 0
|
102 |
+
.b8 0
|
103 |
+
.b8 2
|
104 |
+
.b8 46
|
105 |
+
.b8 0
|
106 |
+
.b8 17
|
107 |
+
.b8 1
|
108 |
+
.b8 18
|
109 |
+
.b8 1
|
110 |
+
.b8 64
|
111 |
+
.b8 10
|
112 |
+
.b8 135
|
113 |
+
.b8 64
|
114 |
+
.b8 8
|
115 |
+
.b8 3
|
116 |
+
.b8 8
|
117 |
+
.b8 58
|
118 |
+
.b8 11
|
119 |
+
.b8 59
|
120 |
+
.b8 11
|
121 |
+
.b8 63
|
122 |
+
.b8 12
|
123 |
+
.b8 0
|
124 |
+
.b8 0
|
125 |
+
.b8 0
|
126 |
+
}
|
127 |
+
.section .debug_info
|
128 |
+
{
|
129 |
+
.b32 176
|
130 |
+
.b8 2
|
131 |
+
.b8 0
|
132 |
+
.b32 .debug_abbrev
|
133 |
+
.b8 8
|
134 |
+
.b8 1
|
135 |
+
.b8 116
|
136 |
+
.b8 114
|
137 |
+
.b8 105
|
138 |
+
.b8 116
|
139 |
+
.b8 111
|
140 |
+
.b8 110
|
141 |
+
.b8 0
|
142 |
+
.b8 2
|
143 |
+
.b8 0
|
144 |
+
.b8 99
|
145 |
+
.b8 53
|
146 |
+
.b8 116
|
147 |
+
.b8 114
|
148 |
+
.b8 121
|
149 |
+
.b8 112
|
150 |
+
.b8 53
|
151 |
+
.b8 113
|
152 |
+
.b8 119
|
153 |
+
.b8 107
|
154 |
+
.b8 104
|
155 |
+
.b8 114
|
156 |
+
.b8 101
|
157 |
+
.b8 105
|
158 |
+
.b8 106
|
159 |
+
.b8 107
|
160 |
+
.b8 55
|
161 |
+
.b8 115
|
162 |
+
.b8 53
|
163 |
+
.b8 120
|
164 |
+
.b8 51
|
165 |
+
.b8 50
|
166 |
+
.b8 55
|
167 |
+
.b8 119
|
168 |
+
.b8 111
|
169 |
+
.b8 102
|
170 |
+
.b8 122
|
171 |
+
.b8 53
|
172 |
+
.b8 52
|
173 |
+
.b8 108
|
174 |
+
.b8 119
|
175 |
+
.b8 106
|
176 |
+
.b8 52
|
177 |
+
.b8 107
|
178 |
+
.b8 118
|
179 |
+
.b8 99
|
180 |
+
.b8 116
|
181 |
+
.b8 117
|
182 |
+
.b8 113
|
183 |
+
.b8 100
|
184 |
+
.b8 122
|
185 |
+
.b8 118
|
186 |
+
.b8 50
|
187 |
+
.b8 118
|
188 |
+
.b8 114
|
189 |
+
.b8 102
|
190 |
+
.b8 50
|
191 |
+
.b8 120
|
192 |
+
.b8 121
|
193 |
+
.b8 111
|
194 |
+
.b8 110
|
195 |
+
.b8 115
|
196 |
+
.b8 46
|
197 |
+
.b8 112
|
198 |
+
.b8 121
|
199 |
+
.b8 0
|
200 |
+
.b32 .debug_line
|
201 |
+
.b8 47
|
202 |
+
.b8 116
|
203 |
+
.b8 109
|
204 |
+
.b8 112
|
205 |
+
.b8 47
|
206 |
+
.b8 116
|
207 |
+
.b8 111
|
208 |
+
.b8 114
|
209 |
+
.b8 99
|
210 |
+
.b8 104
|
211 |
+
.b8 105
|
212 |
+
.b8 110
|
213 |
+
.b8 100
|
214 |
+
.b8 117
|
215 |
+
.b8 99
|
216 |
+
.b8 116
|
217 |
+
.b8 111
|
218 |
+
.b8 114
|
219 |
+
.b8 95
|
220 |
+
.b8 114
|
221 |
+
.b8 111
|
222 |
+
.b8 111
|
223 |
+
.b8 116
|
224 |
+
.b8 47
|
225 |
+
.b8 53
|
226 |
+
.b8 116
|
227 |
+
.b8 0
|
228 |
+
.b8 1
|
229 |
+
.b64 $L__func_begin0
|
230 |
+
.b64 $L__func_end0
|
231 |
+
.b8 2
|
232 |
+
.b64 $L__func_begin0
|
233 |
+
.b64 $L__func_end0
|
234 |
+
.b8 1
|
235 |
+
.b8 156
|
236 |
+
.b8 116
|
237 |
+
.b8 114
|
238 |
+
.b8 105
|
239 |
+
.b8 116
|
240 |
+
.b8 111
|
241 |
+
.b8 110
|
242 |
+
.b8 95
|
243 |
+
.b8 95
|
244 |
+
.b8 48
|
245 |
+
.b8 100
|
246 |
+
.b8 49
|
247 |
+
.b8 100
|
248 |
+
.b8 50
|
249 |
+
.b8 100
|
250 |
+
.b8 101
|
251 |
+
.b8 0
|
252 |
+
.b8 116
|
253 |
+
.b8 114
|
254 |
+
.b8 105
|
255 |
+
.b8 116
|
256 |
+
.b8 111
|
257 |
+
.b8 110
|
258 |
+
.b8 95
|
259 |
+
.b8 95
|
260 |
+
.b8 48
|
261 |
+
.b8 100
|
262 |
+
.b8 49
|
263 |
+
.b8 100
|
264 |
+
.b8 50
|
265 |
+
.b8 100
|
266 |
+
.b8 101
|
267 |
+
.b8 0
|
268 |
+
.b8 1
|
269 |
+
.b8 18
|
270 |
+
.b8 1
|
271 |
+
.b8 0
|
272 |
+
}
|
273 |
+
.section .debug_pubnames
|
274 |
+
{
|
275 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
276 |
+
$L__pubNames_start0:
|
277 |
+
.b8 2
|
278 |
+
.b8 0
|
279 |
+
.b32 .debug_info
|
280 |
+
.b32 180
|
281 |
+
.b32 125
|
282 |
+
.b8 116
|
283 |
+
.b8 114
|
284 |
+
.b8 105
|
285 |
+
.b8 116
|
286 |
+
.b8 111
|
287 |
+
.b8 110
|
288 |
+
.b8 95
|
289 |
+
.b8 95
|
290 |
+
.b8 48
|
291 |
+
.b8 100
|
292 |
+
.b8 49
|
293 |
+
.b8 100
|
294 |
+
.b8 50
|
295 |
+
.b8 100
|
296 |
+
.b8 101
|
297 |
+
.b8 0
|
298 |
+
.b32 0
|
299 |
+
$L__pubNames_end0:
|
300 |
+
}
|
301 |
+
.section .debug_pubtypes
|
302 |
+
{
|
303 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
304 |
+
$L__pubTypes_start0:
|
305 |
+
.b8 2
|
306 |
+
.b8 0
|
307 |
+
.b32 .debug_info
|
308 |
+
.b32 180
|
309 |
+
.b32 0
|
310 |
+
$L__pubTypes_end0:
|
311 |
+
}
|
312 |
+
.section .debug_loc { }
|
.triton/dump/1c14bdb6903aa6825e214bbdf57fd077/triton_.ttgir
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%c1024_i32 = arith.constant 1024 : i32
|
5 |
+
%0 = tt.get_program_id x : i32
|
6 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
7 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
|
8 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
|
9 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
|
10 |
+
%5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
|
11 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi32, #blocked>
|
12 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked>
|
13 |
+
%8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
|
14 |
+
%9 = tt.addptr %8, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
|
15 |
+
%10 = arith.truncf %7 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
|
16 |
+
tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
|
17 |
+
tt.return
|
18 |
+
}
|
19 |
+
}
|
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.cubin
ADDED
Binary file (14.1 kB). View file
|
|
.triton/dump/1c188b233fcb854770e6a3cf1802c844/triton_.ttir
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3de4de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c8_i32 = arith.constant 8 : i32
|
4 |
+
%c128_i32 = arith.constant 128 : i32
|
5 |
+
%c0_i32 = arith.constant 0 : i32
|
6 |
+
%cst = arith.constant dense<32768> : tensor<64x1xi32>
|
7 |
+
%cst_0 = arith.constant dense<256> : tensor<1x8xi32>
|
8 |
+
%cst_1 = arith.constant dense<128> : tensor<1x8xi32>
|
9 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
|
10 |
+
%cst_3 = arith.constant dense<256> : tensor<64x1xi32>
|
11 |
+
%c64_i32 = arith.constant 64 : i32
|
12 |
+
%0 = tt.get_program_id x : i32
|
13 |
+
%1 = arith.muli %0, %c64_i32 : i32
|
14 |
+
%2 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
|
15 |
+
%3 = tt.expand_dims %2 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
|
16 |
+
%4 = tt.splat %1 : (i32) -> tensor<64x1xi32>
|
17 |
+
%5 = arith.addi %4, %3 : tensor<64x1xi32>
|
18 |
+
%6 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
|
19 |
+
%7 = tt.expand_dims %6 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
|
20 |
+
%8 = arith.remsi %5, %cst_3 : tensor<64x1xi32>
|
21 |
+
%9 = arith.divsi %5, %cst_3 : tensor<64x1xi32>
|
22 |
+
%10 = tt.broadcast %8 : (tensor<64x1xi32>) -> tensor<64x8xi32>
|
23 |
+
%11 = arith.muli %9, %cst : tensor<64x1xi32>
|
24 |
+
%12 = tt.broadcast %11 : (tensor<64x1xi32>) -> tensor<64x8xi32>
|
25 |
+
%13 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
26 |
+
%14 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
|
27 |
+
%15 = scf.for %arg5 = %c0_i32 to %c128_i32 step %c8_i32 iter_args(%arg6 = %cst_2) -> (tensor<64x8xf32>) : i32 {
|
28 |
+
%20 = tt.splat %arg5 : (i32) -> tensor<1x8xi32>
|
29 |
+
%21 = arith.addi %20, %7 : tensor<1x8xi32>
|
30 |
+
%22 = arith.cmpi slt, %21, %cst_1 : tensor<1x8xi32>
|
31 |
+
%23 = arith.muli %21, %cst_0 : tensor<1x8xi32>
|
32 |
+
%24 = tt.broadcast %23 : (tensor<1x8xi32>) -> tensor<64x8xi32>
|
33 |
+
%25 = arith.addi %10, %24 : tensor<64x8xi32>
|
34 |
+
%26 = arith.addi %25, %12 : tensor<64x8xi32>
|
35 |
+
%27 = tt.addptr %13, %26 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
|
36 |
+
%28 = tt.broadcast %22 : (tensor<1x8xi1>) -> tensor<64x8xi1>
|
37 |
+
%29 = tt.load %27, %28, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
|
38 |
+
%30 = tt.addptr %14, %26 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi32>
|
39 |
+
%31 = tt.load %30, %28, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
|
40 |
+
%32 = arith.mulf %29, %31 : tensor<64x8xf32>
|
41 |
+
%33 = arith.addf %arg6, %32 : tensor<64x8xf32>
|
42 |
+
%34 = arith.select %28, %33, %arg6 : tensor<64x8xi1>, tensor<64x8xf32>
|
43 |
+
scf.yield %34 : tensor<64x8xf32>
|
44 |
+
}
|
45 |
+
%16 = "tt.reduce"(%15) <{axis = 1 : i32}> ({
|
46 |
+
^bb0(%arg5: f32, %arg6: f32):
|
47 |
+
%20 = arith.addf %arg5, %arg6 : f32
|
48 |
+
tt.reduce.return %20 : f32
|
49 |
+
}) : (tensor<64x8xf32>) -> tensor<64xf32>
|
50 |
+
%17 = tt.expand_dims %16 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
|
51 |
+
%18 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<64x1x!tt.ptr<f32, 1>>
|
52 |
+
%19 = tt.addptr %18, %5 : tensor<64x1x!tt.ptr<f32, 1>>, tensor<64x1xi32>
|
53 |
+
tt.store %19, %17 {cache = 1 : i32, evict = 1 : i32} : tensor<64x1xf32>
|
54 |
+
tt.return
|
55 |
+
}
|
56 |
+
}
|
.triton/dump/21d0195c63fb062bfc567b79c9bb2771/triton_.ttgir
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [2], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<256> : tensor<256xi32, #blocked>
|
5 |
+
%cst_0 = arith.constant dense<-1> : tensor<1xi64, #blocked>
|
6 |
+
%cst_1 = arith.constant dense<2.560000e+02> : tensor<1xf32, #blocked>
|
7 |
+
%cst_2 = arith.constant dense<256> : tensor<1xi64, #blocked>
|
8 |
+
%cst_3 = arith.constant dense<0> : tensor<1xi64, #blocked>
|
9 |
+
%cst_4 = arith.constant dense<50257> : tensor<1xi64, #blocked>
|
10 |
+
%cst_5 = arith.constant 0.000000e+00 : f32
|
11 |
+
%c256_i32 = arith.constant 256 : i32
|
12 |
+
%cst_6 = arith.constant dense<0.000000e+00> : tensor<256xf32, #blocked>
|
13 |
+
%cst_7 = arith.constant dense<2.560000e+02> : tensor<256xf32, #blocked>
|
14 |
+
%cst_8 = arith.constant dense<0.000000e+00> : tensor<256xbf16, #blocked>
|
15 |
+
%0 = tt.get_program_id x : i32
|
16 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #blocked>
|
17 |
+
%2 = arith.cmpi slt, %1, %cst : tensor<256xi32, #blocked>
|
18 |
+
%3 = arith.muli %0, %c256_i32 : i32
|
19 |
+
%4 = tt.splat %3 : (i32) -> tensor<256xi32, #blocked>
|
20 |
+
%5 = arith.addi %1, %4 : tensor<256xi32, #blocked>
|
21 |
+
%6 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>, #blocked>
|
22 |
+
%7 = tt.addptr %6, %5 : tensor<256x!tt.ptr<bf16, 1>, #blocked>, tensor<256xi32, #blocked>
|
23 |
+
%8 = tt.load %7, %2, %cst_8 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16, #blocked>
|
24 |
+
%9 = arith.extf %8 : tensor<256xbf16, #blocked> to tensor<256xf32, #blocked>
|
25 |
+
%10 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
26 |
+
%11 = tt.addptr %10, %1 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
27 |
+
%12 = tt.load %11, %2, %cst_6 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
28 |
+
%13 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
29 |
+
%14 = tt.addptr %13, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
30 |
+
%15 = tt.load %14, %2, %cst_6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
31 |
+
%16 = tt.addptr %arg4, %0 : !tt.ptr<f32, 1>, i32
|
32 |
+
%17 = tt.splat %16 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
|
33 |
+
%18 = tt.load %17 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
|
34 |
+
%19 = tt.addptr %arg5, %0 : !tt.ptr<f32, 1>, i32
|
35 |
+
%20 = tt.splat %19 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>, #blocked>
|
36 |
+
%21 = tt.load %20 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xf32, #blocked>
|
37 |
+
%22 = tt.addptr %arg6, %0 : !tt.ptr<i64, 1>, i32
|
38 |
+
%23 = tt.splat %22 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>, #blocked>
|
39 |
+
%24 = tt.load %23 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64, #blocked>
|
40 |
+
%25 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
41 |
+
%26 = tt.addptr %25, %5 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi32, #blocked>
|
42 |
+
%27 = tt.load %26, %2, %cst_6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32, #blocked>
|
43 |
+
%28 = arith.mulf %9, %12 : tensor<256xf32, #blocked>
|
44 |
+
%29 = arith.select %2, %28, %cst_6 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
45 |
+
%30 = "tt.reduce"(%29) <{axis = 0 : i32}> ({
|
46 |
+
^bb0(%arg10: f32, %arg11: f32):
|
47 |
+
%63 = arith.addf %arg10, %arg11 : f32
|
48 |
+
tt.reduce.return %63 : f32
|
49 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
50 |
+
%31 = arith.addf %30, %cst_5 : f32
|
51 |
+
%32 = tt.broadcast %18 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
|
52 |
+
%33 = arith.subf %15, %32 : tensor<256xf32, #blocked>
|
53 |
+
%34 = tt.broadcast %21 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
|
54 |
+
%35 = arith.mulf %33, %34 : tensor<256xf32, #blocked>
|
55 |
+
%36 = arith.mulf %28, %35 : tensor<256xf32, #blocked>
|
56 |
+
%37 = arith.select %2, %36, %cst_6 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
57 |
+
%38 = "tt.reduce"(%37) <{axis = 0 : i32}> ({
|
58 |
+
^bb0(%arg10: f32, %arg11: f32):
|
59 |
+
%63 = arith.addf %arg10, %arg11 : f32
|
60 |
+
tt.reduce.return %63 : f32
|
61 |
+
}) : (tensor<256xf32, #blocked>) -> f32
|
62 |
+
%39 = arith.addf %38, %cst_5 : f32
|
63 |
+
%40 = arith.cmpi eq, %24, %cst_0 : tensor<1xi64, #blocked>
|
64 |
+
%41 = arith.divf %21, %cst_1 : tensor<1xf32, #blocked>
|
65 |
+
%42 = arith.mulf %28, %cst_7 : tensor<256xf32, #blocked>
|
66 |
+
%43 = tt.splat %31 : (f32) -> tensor<256xf32, #blocked>
|
67 |
+
%44 = arith.subf %42, %43 : tensor<256xf32, #blocked>
|
68 |
+
%45 = tt.splat %39 : (f32) -> tensor<256xf32, #blocked>
|
69 |
+
%46 = arith.mulf %35, %45 : tensor<256xf32, #blocked>
|
70 |
+
%47 = arith.subf %44, %46 : tensor<256xf32, #blocked>
|
71 |
+
%48 = tt.broadcast %41 : (tensor<1xf32, #blocked>) -> tensor<256xf32, #blocked>
|
72 |
+
%49 = arith.mulf %48, %47 : tensor<256xf32, #blocked>
|
73 |
+
%50 = arith.addf %27, %49 : tensor<256xf32, #blocked>
|
74 |
+
%51 = tt.broadcast %40 : (tensor<1xi1, #blocked>) -> tensor<256xi1, #blocked>
|
75 |
+
%52 = arith.select %51, %cst_6, %50 : tensor<256xi1, #blocked>, tensor<256xf32, #blocked>
|
76 |
+
%53 = arith.addi %24, %cst_4 : tensor<1xi64, #blocked>
|
77 |
+
%54 = arith.cmpi slt, %24, %cst_3 : tensor<1xi64, #blocked>
|
78 |
+
%55 = arith.select %54, %53, %24 : tensor<1xi1, #blocked>, tensor<1xi64, #blocked>
|
79 |
+
%56 = arith.muli %55, %cst_2 : tensor<1xi64, #blocked>
|
80 |
+
%57 = tt.broadcast %56 : (tensor<1xi64, #blocked>) -> tensor<256xi64, #blocked>
|
81 |
+
%58 = arith.extsi %1 : tensor<256xi32, #blocked> to tensor<256xi64, #blocked>
|
82 |
+
%59 = arith.addi %58, %57 : tensor<256xi64, #blocked>
|
83 |
+
%60 = tt.splat %arg7 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>, #blocked>
|
84 |
+
%61 = tt.addptr %60, %59 : tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xi64, #blocked>
|
85 |
+
%62 = "tt.atomic_rmw"(%61, %52, %2) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<256x!tt.ptr<f32, 1>, #blocked>, tensor<256xf32, #blocked>, tensor<256xi1, #blocked>) -> tensor<256xf32, #blocked>
|
86 |
+
tt.return
|
87 |
+
}
|
88 |
+
}
|
.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ptx
ADDED
@@ -0,0 +1,778 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2de
|
10 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2de(
|
13 |
+
.param .u64 triton__0d1d2de_param_0,
|
14 |
+
.param .u64 triton__0d1d2de_param_1,
|
15 |
+
.param .u32 triton__0d1d2de_param_2
|
16 |
+
)
|
17 |
+
.maxntid 128, 1, 1
|
18 |
+
{
|
19 |
+
.reg .pred %p<27>;
|
20 |
+
.reg .b16 %rs<17>;
|
21 |
+
.reg .b32 %r<67>;
|
22 |
+
.reg .f32 %f<431>;
|
23 |
+
.reg .b64 %rd<8>;
|
24 |
+
.loc 1 18 0
|
25 |
+
$L__func_begin0:
|
26 |
+
.loc 1 18 0
|
27 |
+
|
28 |
+
ld.param.u64 %rd4, [triton__0d1d2de_param_0];
|
29 |
+
$L__tmp0:
|
30 |
+
.loc 1 21 36
|
31 |
+
mov.u32 %r14, %tid.x;
|
32 |
+
shl.b32 %r15, %r14, 3;
|
33 |
+
and.b32 %r16, %r15, 1016;
|
34 |
+
.loc 1 20 28
|
35 |
+
mov.u32 %r1, %ctaid.x;
|
36 |
+
.loc 1 20 33
|
37 |
+
shl.b32 %r17, %r1, 10;
|
38 |
+
.loc 1 21 23
|
39 |
+
or.b32 %r18, %r17, %r16;
|
40 |
+
.loc 1 24 30
|
41 |
+
mul.wide.s32 %rd5, %r18, 2;
|
42 |
+
add.s64 %rd3, %rd4, %rd5;
|
43 |
+
mov.pred %p1, -1;
|
44 |
+
.loc 1 24 35
|
45 |
+
mov.u32 %r2, 0x0;
|
46 |
+
mov.u32 %r3, 0x0;
|
47 |
+
mov.u32 %r4, 0x0;
|
48 |
+
mov.u32 %r5, 0x0;
|
49 |
+
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd3 + 0 ];
|
50 |
+
cvt.u16.u32 %rs1, %r2;
|
51 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
52 |
+
cvt.u16.u32 %rs3, %r3;
|
53 |
+
.loc 1 24 44
|
54 |
+
cvt.f32.bf16 %r6, %rs1;
|
55 |
+
mov.b32 %f1, %r6;
|
56 |
+
cvt.f32.bf16 %r7, %rs2;
|
57 |
+
mov.b32 %f2, %r7;
|
58 |
+
.loc 1 29 18
|
59 |
+
mul.f32 %f9, %f1, 0f3F3504F3;
|
60 |
+
.loc 1 30 23
|
61 |
+
abs.ftz.f32 %f17, %f9;
|
62 |
+
setp.ge.f32 %p2, %f17, 0f3F8060FE;
|
63 |
+
mov.f32 %f365, 0f3789CA3C;
|
64 |
+
mov.f32 %f364, 0fB9F560B9;
|
65 |
+
mov.f32 %f363, 0f3BAC840B;
|
66 |
+
mov.f32 %f362, 0fBD0C8162;
|
67 |
+
mov.f32 %f361, 0f3E1CF906;
|
68 |
+
mov.f32 %f360, 0f3F6A937E;
|
69 |
+
mov.f32 %f359, 0f3F20D842;
|
70 |
+
mov.f32 %f366, %f17;
|
71 |
+
@%p2 bra $L__BB0_2;
|
72 |
+
.loc 1 0 23
|
73 |
+
mov.f32 %f365, 0f38B1E96A;
|
74 |
+
mov.f32 %f364, 0fBA574D20;
|
75 |
+
mov.f32 %f363, 0f3BAAD5EA;
|
76 |
+
mov.f32 %f362, 0fBCDC1BE7;
|
77 |
+
mov.f32 %f361, 0f3DE718AF;
|
78 |
+
mov.f32 %f360, 0fBEC093AC;
|
79 |
+
mov.f32 %f359, 0f3E0375D3;
|
80 |
+
.loc 1 30 23
|
81 |
+
mul.f32 %f366, %f9, %f9;
|
82 |
+
$L__BB0_2:
|
83 |
+
.loc 1 0 0
|
84 |
+
cvt.f32.bf16 %r8, %rs3;
|
85 |
+
mul.f32 %f10, %f2, 0f3F3504F3;
|
86 |
+
.loc 1 30 23
|
87 |
+
setp.ltu.f32 %p3, %f17, 0f3F8060FE;
|
88 |
+
fma.rn.ftz.f32 %f135, %f365, %f366, %f364;
|
89 |
+
fma.rn.ftz.f32 %f136, %f135, %f366, %f363;
|
90 |
+
fma.rn.ftz.f32 %f137, %f136, %f366, %f362;
|
91 |
+
fma.rn.ftz.f32 %f138, %f137, %f366, %f361;
|
92 |
+
fma.rn.ftz.f32 %f139, %f138, %f366, %f360;
|
93 |
+
fma.rn.ftz.f32 %f140, %f139, %f366, %f359;
|
94 |
+
neg.f32 %f141, %f366;
|
95 |
+
selp.f32 %f142, %f141, %f9, %p2;
|
96 |
+
fma.rn.ftz.f32 %f367, %f140, %f142, %f142;
|
97 |
+
mov.f32 %f358, 0f3F800000;
|
98 |
+
@%p3 bra $L__BB0_4;
|
99 |
+
ex2.approx.ftz.f32 %f143, %f367;
|
100 |
+
sub.f32 %f145, %f358, %f143;
|
101 |
+
mov.b32 %r19, %f145;
|
102 |
+
mov.b32 %r20, %f9;
|
103 |
+
and.b32 %r21, %r20, -2147483648;
|
104 |
+
or.b32 %r22, %r21, %r19;
|
105 |
+
mov.b32 %f367, %r22;
|
106 |
+
$L__BB0_4:
|
107 |
+
.loc 1 0 0
|
108 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
|
109 |
+
mov.b32 %f3, %r8;
|
110 |
+
.loc 1 30 23
|
111 |
+
abs.ftz.f32 %f30, %f10;
|
112 |
+
setp.ge.f32 %p5, %f30, 0f3F8060FE;
|
113 |
+
mov.f32 %f374, 0f3789CA3C;
|
114 |
+
mov.f32 %f373, 0fB9F560B9;
|
115 |
+
mov.f32 %f372, 0f3BAC840B;
|
116 |
+
mov.f32 %f371, 0fBD0C8162;
|
117 |
+
mov.f32 %f370, 0f3E1CF906;
|
118 |
+
mov.f32 %f369, 0f3F6A937E;
|
119 |
+
mov.f32 %f368, 0f3F20D842;
|
120 |
+
mov.f32 %f375, %f30;
|
121 |
+
@%p5 bra $L__BB0_6;
|
122 |
+
mul.f32 %f375, %f10, %f10;
|
123 |
+
mov.f32 %f374, 0f38B1E96A;
|
124 |
+
mov.f32 %f373, 0fBA574D20;
|
125 |
+
mov.f32 %f372, 0f3BAAD5EA;
|
126 |
+
mov.f32 %f371, 0fBCDC1BE7;
|
127 |
+
mov.f32 %f370, 0f3DE718AF;
|
128 |
+
mov.f32 %f369, 0fBEC093AC;
|
129 |
+
mov.f32 %f368, 0f3E0375D3;
|
130 |
+
$L__BB0_6:
|
131 |
+
.loc 1 0 0
|
132 |
+
cvt.f32.bf16 %r9, %rs4;
|
133 |
+
mul.f32 %f11, %f3, 0f3F3504F3;
|
134 |
+
.loc 1 30 23
|
135 |
+
setp.ltu.f32 %p6, %f30, 0f3F8060FE;
|
136 |
+
fma.rn.ftz.f32 %f160, %f374, %f375, %f373;
|
137 |
+
fma.rn.ftz.f32 %f161, %f160, %f375, %f372;
|
138 |
+
fma.rn.ftz.f32 %f162, %f161, %f375, %f371;
|
139 |
+
fma.rn.ftz.f32 %f163, %f162, %f375, %f370;
|
140 |
+
fma.rn.ftz.f32 %f164, %f163, %f375, %f369;
|
141 |
+
fma.rn.ftz.f32 %f165, %f164, %f375, %f368;
|
142 |
+
neg.f32 %f166, %f375;
|
143 |
+
selp.f32 %f167, %f166, %f10, %p5;
|
144 |
+
fma.rn.ftz.f32 %f376, %f165, %f167, %f167;
|
145 |
+
@%p6 bra $L__BB0_8;
|
146 |
+
ex2.approx.ftz.f32 %f168, %f376;
|
147 |
+
sub.f32 %f170, %f358, %f168;
|
148 |
+
mov.b32 %r23, %f170;
|
149 |
+
mov.b32 %r24, %f10;
|
150 |
+
and.b32 %r25, %r24, -2147483648;
|
151 |
+
or.b32 %r26, %r25, %r23;
|
152 |
+
mov.b32 %f376, %r26;
|
153 |
+
$L__BB0_8:
|
154 |
+
.loc 1 0 0
|
155 |
+
cvt.u16.u32 %rs5, %r4;
|
156 |
+
mov.b32 %f4, %r9;
|
157 |
+
.loc 1 30 23
|
158 |
+
abs.ftz.f32 %f43, %f11;
|
159 |
+
setp.ge.f32 %p8, %f43, 0f3F8060FE;
|
160 |
+
mov.f32 %f383, 0f3789CA3C;
|
161 |
+
mov.f32 %f382, 0fB9F560B9;
|
162 |
+
mov.f32 %f381, 0f3BAC840B;
|
163 |
+
mov.f32 %f380, 0fBD0C8162;
|
164 |
+
mov.f32 %f379, 0f3E1CF906;
|
165 |
+
mov.f32 %f378, 0f3F6A937E;
|
166 |
+
mov.f32 %f377, 0f3F20D842;
|
167 |
+
mov.f32 %f384, %f43;
|
168 |
+
@%p8 bra $L__BB0_10;
|
169 |
+
mul.f32 %f384, %f11, %f11;
|
170 |
+
mov.f32 %f383, 0f38B1E96A;
|
171 |
+
mov.f32 %f382, 0fBA574D20;
|
172 |
+
mov.f32 %f381, 0f3BAAD5EA;
|
173 |
+
mov.f32 %f380, 0fBCDC1BE7;
|
174 |
+
mov.f32 %f379, 0f3DE718AF;
|
175 |
+
mov.f32 %f378, 0fBEC093AC;
|
176 |
+
mov.f32 %f377, 0f3E0375D3;
|
177 |
+
$L__BB0_10:
|
178 |
+
.loc 1 0 0
|
179 |
+
cvt.f32.bf16 %r10, %rs5;
|
180 |
+
mul.f32 %f12, %f4, 0f3F3504F3;
|
181 |
+
.loc 1 30 23
|
182 |
+
setp.ltu.f32 %p9, %f43, 0f3F8060FE;
|
183 |
+
fma.rn.ftz.f32 %f185, %f383, %f384, %f382;
|
184 |
+
fma.rn.ftz.f32 %f186, %f185, %f384, %f381;
|
185 |
+
fma.rn.ftz.f32 %f187, %f186, %f384, %f380;
|
186 |
+
fma.rn.ftz.f32 %f188, %f187, %f384, %f379;
|
187 |
+
fma.rn.ftz.f32 %f189, %f188, %f384, %f378;
|
188 |
+
fma.rn.ftz.f32 %f190, %f189, %f384, %f377;
|
189 |
+
neg.f32 %f191, %f384;
|
190 |
+
selp.f32 %f192, %f191, %f11, %p8;
|
191 |
+
fma.rn.ftz.f32 %f385, %f190, %f192, %f192;
|
192 |
+
@%p9 bra $L__BB0_12;
|
193 |
+
ex2.approx.ftz.f32 %f193, %f385;
|
194 |
+
sub.f32 %f195, %f358, %f193;
|
195 |
+
mov.b32 %r27, %f195;
|
196 |
+
mov.b32 %r28, %f11;
|
197 |
+
and.b32 %r29, %r28, -2147483648;
|
198 |
+
or.b32 %r30, %r29, %r27;
|
199 |
+
mov.b32 %f385, %r30;
|
200 |
+
$L__BB0_12:
|
201 |
+
.loc 1 0 0
|
202 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
|
203 |
+
mov.b32 %f5, %r10;
|
204 |
+
.loc 1 30 23
|
205 |
+
abs.ftz.f32 %f56, %f12;
|
206 |
+
setp.ge.f32 %p11, %f56, 0f3F8060FE;
|
207 |
+
mov.f32 %f392, 0f3789CA3C;
|
208 |
+
mov.f32 %f391, 0fB9F560B9;
|
209 |
+
mov.f32 %f390, 0f3BAC840B;
|
210 |
+
mov.f32 %f389, 0fBD0C8162;
|
211 |
+
mov.f32 %f388, 0f3E1CF906;
|
212 |
+
mov.f32 %f387, 0f3F6A937E;
|
213 |
+
mov.f32 %f386, 0f3F20D842;
|
214 |
+
mov.f32 %f393, %f56;
|
215 |
+
@%p11 bra $L__BB0_14;
|
216 |
+
mul.f32 %f393, %f12, %f12;
|
217 |
+
mov.f32 %f392, 0f38B1E96A;
|
218 |
+
mov.f32 %f391, 0fBA574D20;
|
219 |
+
mov.f32 %f390, 0f3BAAD5EA;
|
220 |
+
mov.f32 %f389, 0fBCDC1BE7;
|
221 |
+
mov.f32 %f388, 0f3DE718AF;
|
222 |
+
mov.f32 %f387, 0fBEC093AC;
|
223 |
+
mov.f32 %f386, 0f3E0375D3;
|
224 |
+
$L__BB0_14:
|
225 |
+
.loc 1 0 0
|
226 |
+
cvt.f32.bf16 %r11, %rs6;
|
227 |
+
mul.f32 %f13, %f5, 0f3F3504F3;
|
228 |
+
.loc 1 30 23
|
229 |
+
setp.ltu.f32 %p12, %f56, 0f3F8060FE;
|
230 |
+
fma.rn.ftz.f32 %f210, %f392, %f393, %f391;
|
231 |
+
fma.rn.ftz.f32 %f211, %f210, %f393, %f390;
|
232 |
+
fma.rn.ftz.f32 %f212, %f211, %f393, %f389;
|
233 |
+
fma.rn.ftz.f32 %f213, %f212, %f393, %f388;
|
234 |
+
fma.rn.ftz.f32 %f214, %f213, %f393, %f387;
|
235 |
+
fma.rn.ftz.f32 %f215, %f214, %f393, %f386;
|
236 |
+
neg.f32 %f216, %f393;
|
237 |
+
selp.f32 %f217, %f216, %f12, %p11;
|
238 |
+
fma.rn.ftz.f32 %f394, %f215, %f217, %f217;
|
239 |
+
@%p12 bra $L__BB0_16;
|
240 |
+
ex2.approx.ftz.f32 %f218, %f394;
|
241 |
+
sub.f32 %f220, %f358, %f218;
|
242 |
+
mov.b32 %r31, %f220;
|
243 |
+
mov.b32 %r32, %f12;
|
244 |
+
and.b32 %r33, %r32, -2147483648;
|
245 |
+
or.b32 %r34, %r33, %r31;
|
246 |
+
mov.b32 %f394, %r34;
|
247 |
+
$L__BB0_16:
|
248 |
+
.loc 1 0 0
|
249 |
+
cvt.u16.u32 %rs7, %r5;
|
250 |
+
mov.b32 %f6, %r11;
|
251 |
+
.loc 1 30 23
|
252 |
+
abs.ftz.f32 %f69, %f13;
|
253 |
+
setp.ge.f32 %p14, %f69, 0f3F8060FE;
|
254 |
+
mov.f32 %f401, 0f3789CA3C;
|
255 |
+
mov.f32 %f400, 0fB9F560B9;
|
256 |
+
mov.f32 %f399, 0f3BAC840B;
|
257 |
+
mov.f32 %f398, 0fBD0C8162;
|
258 |
+
mov.f32 %f397, 0f3E1CF906;
|
259 |
+
mov.f32 %f396, 0f3F6A937E;
|
260 |
+
mov.f32 %f395, 0f3F20D842;
|
261 |
+
mov.f32 %f402, %f69;
|
262 |
+
@%p14 bra $L__BB0_18;
|
263 |
+
mul.f32 %f402, %f13, %f13;
|
264 |
+
mov.f32 %f401, 0f38B1E96A;
|
265 |
+
mov.f32 %f400, 0fBA574D20;
|
266 |
+
mov.f32 %f399, 0f3BAAD5EA;
|
267 |
+
mov.f32 %f398, 0fBCDC1BE7;
|
268 |
+
mov.f32 %f397, 0f3DE718AF;
|
269 |
+
mov.f32 %f396, 0fBEC093AC;
|
270 |
+
mov.f32 %f395, 0f3E0375D3;
|
271 |
+
$L__BB0_18:
|
272 |
+
.loc 1 0 0
|
273 |
+
cvt.f32.bf16 %r12, %rs7;
|
274 |
+
mul.f32 %f14, %f6, 0f3F3504F3;
|
275 |
+
.loc 1 30 23
|
276 |
+
setp.ltu.f32 %p15, %f69, 0f3F8060FE;
|
277 |
+
fma.rn.ftz.f32 %f235, %f401, %f402, %f400;
|
278 |
+
fma.rn.ftz.f32 %f236, %f235, %f402, %f399;
|
279 |
+
fma.rn.ftz.f32 %f237, %f236, %f402, %f398;
|
280 |
+
fma.rn.ftz.f32 %f238, %f237, %f402, %f397;
|
281 |
+
fma.rn.ftz.f32 %f239, %f238, %f402, %f396;
|
282 |
+
fma.rn.ftz.f32 %f240, %f239, %f402, %f395;
|
283 |
+
neg.f32 %f241, %f402;
|
284 |
+
selp.f32 %f242, %f241, %f13, %p14;
|
285 |
+
fma.rn.ftz.f32 %f403, %f240, %f242, %f242;
|
286 |
+
@%p15 bra $L__BB0_20;
|
287 |
+
ex2.approx.ftz.f32 %f243, %f403;
|
288 |
+
sub.f32 %f245, %f358, %f243;
|
289 |
+
mov.b32 %r35, %f245;
|
290 |
+
mov.b32 %r36, %f13;
|
291 |
+
and.b32 %r37, %r36, -2147483648;
|
292 |
+
or.b32 %r38, %r37, %r35;
|
293 |
+
mov.b32 %f403, %r38;
|
294 |
+
$L__BB0_20:
|
295 |
+
.loc 1 0 0
|
296 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
|
297 |
+
mov.b32 %f7, %r12;
|
298 |
+
.loc 1 30 23
|
299 |
+
abs.ftz.f32 %f82, %f14;
|
300 |
+
setp.ge.f32 %p17, %f82, 0f3F8060FE;
|
301 |
+
mov.f32 %f410, 0f3789CA3C;
|
302 |
+
mov.f32 %f409, 0fB9F560B9;
|
303 |
+
mov.f32 %f408, 0f3BAC840B;
|
304 |
+
mov.f32 %f407, 0fBD0C8162;
|
305 |
+
mov.f32 %f406, 0f3E1CF906;
|
306 |
+
mov.f32 %f405, 0f3F6A937E;
|
307 |
+
mov.f32 %f404, 0f3F20D842;
|
308 |
+
mov.f32 %f411, %f82;
|
309 |
+
@%p17 bra $L__BB0_22;
|
310 |
+
mul.f32 %f411, %f14, %f14;
|
311 |
+
mov.f32 %f410, 0f38B1E96A;
|
312 |
+
mov.f32 %f409, 0fBA574D20;
|
313 |
+
mov.f32 %f408, 0f3BAAD5EA;
|
314 |
+
mov.f32 %f407, 0fBCDC1BE7;
|
315 |
+
mov.f32 %f406, 0f3DE718AF;
|
316 |
+
mov.f32 %f405, 0fBEC093AC;
|
317 |
+
mov.f32 %f404, 0f3E0375D3;
|
318 |
+
$L__BB0_22:
|
319 |
+
.loc 1 0 0
|
320 |
+
cvt.f32.bf16 %r13, %rs8;
|
321 |
+
mul.f32 %f15, %f7, 0f3F3504F3;
|
322 |
+
.loc 1 30 23
|
323 |
+
setp.ltu.f32 %p18, %f82, 0f3F8060FE;
|
324 |
+
fma.rn.ftz.f32 %f260, %f410, %f411, %f409;
|
325 |
+
fma.rn.ftz.f32 %f261, %f260, %f411, %f408;
|
326 |
+
fma.rn.ftz.f32 %f262, %f261, %f411, %f407;
|
327 |
+
fma.rn.ftz.f32 %f263, %f262, %f411, %f406;
|
328 |
+
fma.rn.ftz.f32 %f264, %f263, %f411, %f405;
|
329 |
+
fma.rn.ftz.f32 %f265, %f264, %f411, %f404;
|
330 |
+
neg.f32 %f266, %f411;
|
331 |
+
selp.f32 %f267, %f266, %f14, %p17;
|
332 |
+
fma.rn.ftz.f32 %f412, %f265, %f267, %f267;
|
333 |
+
@%p18 bra $L__BB0_24;
|
334 |
+
ex2.approx.ftz.f32 %f268, %f412;
|
335 |
+
sub.f32 %f270, %f358, %f268;
|
336 |
+
mov.b32 %r39, %f270;
|
337 |
+
mov.b32 %r40, %f14;
|
338 |
+
and.b32 %r41, %r40, -2147483648;
|
339 |
+
or.b32 %r42, %r41, %r39;
|
340 |
+
mov.b32 %f412, %r42;
|
341 |
+
$L__BB0_24:
|
342 |
+
.loc 1 0 0
|
343 |
+
mov.b32 %f8, %r13;
|
344 |
+
.loc 1 30 23
|
345 |
+
abs.ftz.f32 %f95, %f15;
|
346 |
+
setp.ge.f32 %p20, %f95, 0f3F8060FE;
|
347 |
+
mov.f32 %f419, 0f3789CA3C;
|
348 |
+
mov.f32 %f418, 0fB9F560B9;
|
349 |
+
mov.f32 %f417, 0f3BAC840B;
|
350 |
+
mov.f32 %f416, 0fBD0C8162;
|
351 |
+
mov.f32 %f415, 0f3E1CF906;
|
352 |
+
mov.f32 %f414, 0f3F6A937E;
|
353 |
+
mov.f32 %f413, 0f3F20D842;
|
354 |
+
mov.f32 %f420, %f95;
|
355 |
+
@%p20 bra $L__BB0_26;
|
356 |
+
mul.f32 %f420, %f15, %f15;
|
357 |
+
mov.f32 %f419, 0f38B1E96A;
|
358 |
+
mov.f32 %f418, 0fBA574D20;
|
359 |
+
mov.f32 %f417, 0f3BAAD5EA;
|
360 |
+
mov.f32 %f416, 0fBCDC1BE7;
|
361 |
+
mov.f32 %f415, 0f3DE718AF;
|
362 |
+
mov.f32 %f414, 0fBEC093AC;
|
363 |
+
mov.f32 %f413, 0f3E0375D3;
|
364 |
+
$L__BB0_26:
|
365 |
+
.loc 1 0 0
|
366 |
+
mul.f32 %f16, %f8, 0f3F3504F3;
|
367 |
+
.loc 1 30 23
|
368 |
+
setp.ltu.f32 %p21, %f95, 0f3F8060FE;
|
369 |
+
fma.rn.ftz.f32 %f285, %f419, %f420, %f418;
|
370 |
+
fma.rn.ftz.f32 %f286, %f285, %f420, %f417;
|
371 |
+
fma.rn.ftz.f32 %f287, %f286, %f420, %f416;
|
372 |
+
fma.rn.ftz.f32 %f288, %f287, %f420, %f415;
|
373 |
+
fma.rn.ftz.f32 %f289, %f288, %f420, %f414;
|
374 |
+
fma.rn.ftz.f32 %f290, %f289, %f420, %f413;
|
375 |
+
neg.f32 %f291, %f420;
|
376 |
+
selp.f32 %f292, %f291, %f15, %p20;
|
377 |
+
fma.rn.ftz.f32 %f421, %f290, %f292, %f292;
|
378 |
+
@%p21 bra $L__BB0_28;
|
379 |
+
ex2.approx.ftz.f32 %f293, %f421;
|
380 |
+
sub.f32 %f295, %f358, %f293;
|
381 |
+
mov.b32 %r43, %f295;
|
382 |
+
mov.b32 %r44, %f15;
|
383 |
+
and.b32 %r45, %r44, -2147483648;
|
384 |
+
or.b32 %r46, %r45, %r43;
|
385 |
+
mov.b32 %f421, %r46;
|
386 |
+
$L__BB0_28:
|
387 |
+
abs.ftz.f32 %f108, %f16;
|
388 |
+
setp.ge.f32 %p23, %f108, 0f3F8060FE;
|
389 |
+
mov.f32 %f428, 0f3789CA3C;
|
390 |
+
mov.f32 %f427, 0fB9F560B9;
|
391 |
+
mov.f32 %f426, 0f3BAC840B;
|
392 |
+
mov.f32 %f425, 0fBD0C8162;
|
393 |
+
mov.f32 %f424, 0f3E1CF906;
|
394 |
+
mov.f32 %f423, 0f3F6A937E;
|
395 |
+
mov.f32 %f422, 0f3F20D842;
|
396 |
+
mov.f32 %f429, %f108;
|
397 |
+
@%p23 bra $L__BB0_30;
|
398 |
+
mul.f32 %f429, %f16, %f16;
|
399 |
+
mov.f32 %f428, 0f38B1E96A;
|
400 |
+
mov.f32 %f427, 0fBA574D20;
|
401 |
+
mov.f32 %f426, 0f3BAAD5EA;
|
402 |
+
mov.f32 %f425, 0fBCDC1BE7;
|
403 |
+
mov.f32 %f424, 0f3DE718AF;
|
404 |
+
mov.f32 %f423, 0fBEC093AC;
|
405 |
+
mov.f32 %f422, 0f3E0375D3;
|
406 |
+
$L__BB0_30:
|
407 |
+
.loc 1 0 23
|
408 |
+
ld.param.u64 %rd2, [triton__0d1d2de_param_1];
|
409 |
+
cvt.s64.s32 %rd1, %r18;
|
410 |
+
.loc 1 30 23
|
411 |
+
setp.ltu.f32 %p24, %f108, 0f3F8060FE;
|
412 |
+
fma.rn.ftz.f32 %f310, %f428, %f429, %f427;
|
413 |
+
fma.rn.ftz.f32 %f311, %f310, %f429, %f426;
|
414 |
+
fma.rn.ftz.f32 %f312, %f311, %f429, %f425;
|
415 |
+
fma.rn.ftz.f32 %f313, %f312, %f429, %f424;
|
416 |
+
fma.rn.ftz.f32 %f314, %f313, %f429, %f423;
|
417 |
+
fma.rn.ftz.f32 %f315, %f314, %f429, %f422;
|
418 |
+
neg.f32 %f316, %f429;
|
419 |
+
selp.f32 %f317, %f316, %f16, %p23;
|
420 |
+
fma.rn.ftz.f32 %f430, %f315, %f317, %f317;
|
421 |
+
@%p24 bra $L__BB0_32;
|
422 |
+
ex2.approx.ftz.f32 %f318, %f430;
|
423 |
+
sub.f32 %f320, %f358, %f318;
|
424 |
+
mov.b32 %r47, %f320;
|
425 |
+
mov.b32 %r48, %f16;
|
426 |
+
and.b32 %r49, %r48, -2147483648;
|
427 |
+
or.b32 %r50, %r49, %r47;
|
428 |
+
mov.b32 %f430, %r50;
|
429 |
+
$L__BB0_32:
|
430 |
+
.loc 1 27 18
|
431 |
+
mul.f32 %f321, %f8, 0f3F000000;
|
432 |
+
mul.f32 %f322, %f7, 0f3F000000;
|
433 |
+
mul.f32 %f323, %f6, 0f3F000000;
|
434 |
+
mul.f32 %f324, %f5, 0f3F000000;
|
435 |
+
mul.f32 %f325, %f4, 0f3F000000;
|
436 |
+
mul.f32 %f326, %f3, 0f3F000000;
|
437 |
+
mul.f32 %f327, %f2, 0f3F000000;
|
438 |
+
mul.f32 %f328, %f1, 0f3F000000;
|
439 |
+
.loc 1 32 18
|
440 |
+
add.f32 %f329, %f367, 0f3F800000;
|
441 |
+
add.f32 %f330, %f376, 0f3F800000;
|
442 |
+
add.f32 %f331, %f385, 0f3F800000;
|
443 |
+
add.f32 %f332, %f394, 0f3F800000;
|
444 |
+
add.f32 %f333, %f403, 0f3F800000;
|
445 |
+
add.f32 %f334, %f412, 0f3F800000;
|
446 |
+
add.f32 %f335, %f421, 0f3F800000;
|
447 |
+
add.f32 %f336, %f430, 0f3F800000;
|
448 |
+
.loc 1 33 18
|
449 |
+
mul.f32 %f337, %f328, %f329;
|
450 |
+
mul.f32 %f338, %f327, %f330;
|
451 |
+
mul.f32 %f339, %f326, %f331;
|
452 |
+
mul.f32 %f340, %f325, %f332;
|
453 |
+
mul.f32 %f341, %f324, %f333;
|
454 |
+
mul.f32 %f342, %f323, %f334;
|
455 |
+
mul.f32 %f343, %f322, %f335;
|
456 |
+
mul.f32 %f344, %f321, %f336;
|
457 |
+
.loc 1 35 25
|
458 |
+
shl.b64 %rd7, %rd1, 1;
|
459 |
+
add.s64 %rd6, %rd2, %rd7;
|
460 |
+
.loc 1 35 37
|
461 |
+
mov.b32 %r51, %f337;
|
462 |
+
cvt.rn.bf16.f32 %rs9, %r51;
|
463 |
+
mov.b32 %r52, %f338;
|
464 |
+
cvt.rn.bf16.f32 %rs10, %r52;
|
465 |
+
mov.b32 %r53, %f339;
|
466 |
+
cvt.rn.bf16.f32 %rs11, %r53;
|
467 |
+
mov.b32 %r54, %f340;
|
468 |
+
cvt.rn.bf16.f32 %rs12, %r54;
|
469 |
+
mov.b32 %r55, %f341;
|
470 |
+
cvt.rn.bf16.f32 %rs13, %r55;
|
471 |
+
mov.b32 %r56, %f342;
|
472 |
+
cvt.rn.bf16.f32 %rs14, %r56;
|
473 |
+
mov.b32 %r57, %f343;
|
474 |
+
cvt.rn.bf16.f32 %rs15, %r57;
|
475 |
+
mov.b32 %r58, %f344;
|
476 |
+
cvt.rn.bf16.f32 %rs16, %r58;
|
477 |
+
mov.b32 %r63, {%rs9, %rs10};
|
478 |
+
mov.b32 %r64, {%rs11, %rs12};
|
479 |
+
mov.b32 %r65, {%rs13, %rs14};
|
480 |
+
mov.b32 %r66, {%rs15, %rs16};
|
481 |
+
@%p1 st.global.v4.b32 [ %rd6 + 0 ], { %r63, %r64, %r65, %r66 };
|
482 |
+
.loc 1 35 4
|
483 |
+
ret;
|
484 |
+
$L__tmp1:
|
485 |
+
$L__func_end0:
|
486 |
+
|
487 |
+
}
|
488 |
+
// .globl __nv_erff
|
489 |
+
.visible .func (.param .b32 func_retval0) __nv_erff(
|
490 |
+
.param .b32 __nv_erff_param_0
|
491 |
+
)
|
492 |
+
{
|
493 |
+
.reg .pred %p<4>;
|
494 |
+
.reg .b32 %r<5>;
|
495 |
+
.reg .f32 %f<49>;
|
496 |
+
$L__func_begin1:
|
497 |
+
|
498 |
+
ld.param.f32 %f14, [__nv_erff_param_0];
|
499 |
+
abs.ftz.f32 %f1, %f14;
|
500 |
+
setp.ge.f32 %p1, %f1, 0f3F8060FE;
|
501 |
+
mov.f32 %f46, 0f3789CA3C;
|
502 |
+
mov.f32 %f45, 0fB9F560B9;
|
503 |
+
mov.f32 %f44, 0f3BAC840B;
|
504 |
+
mov.f32 %f43, 0fBD0C8162;
|
505 |
+
mov.f32 %f42, 0f3E1CF906;
|
506 |
+
mov.f32 %f41, 0f3F6A937E;
|
507 |
+
mov.f32 %f40, 0f3F20D842;
|
508 |
+
mov.f32 %f47, %f1;
|
509 |
+
@%p1 bra $L__BB1_2;
|
510 |
+
mul.f32 %f47, %f14, %f14;
|
511 |
+
mov.f32 %f46, 0f38B1E96A;
|
512 |
+
mov.f32 %f45, 0fBA574D20;
|
513 |
+
mov.f32 %f44, 0f3BAAD5EA;
|
514 |
+
mov.f32 %f43, 0fBCDC1BE7;
|
515 |
+
mov.f32 %f42, 0f3DE718AF;
|
516 |
+
mov.f32 %f41, 0fBEC093AC;
|
517 |
+
mov.f32 %f40, 0f3E0375D3;
|
518 |
+
$L__BB1_2:
|
519 |
+
setp.ltu.f32 %p2, %f1, 0f3F8060FE;
|
520 |
+
fma.rn.ftz.f32 %f29, %f46, %f47, %f45;
|
521 |
+
fma.rn.ftz.f32 %f30, %f29, %f47, %f44;
|
522 |
+
fma.rn.ftz.f32 %f31, %f30, %f47, %f43;
|
523 |
+
fma.rn.ftz.f32 %f32, %f31, %f47, %f42;
|
524 |
+
fma.rn.ftz.f32 %f33, %f32, %f47, %f41;
|
525 |
+
fma.rn.ftz.f32 %f34, %f33, %f47, %f40;
|
526 |
+
neg.f32 %f35, %f47;
|
527 |
+
selp.f32 %f36, %f35, %f14, %p1;
|
528 |
+
fma.rn.ftz.f32 %f48, %f34, %f36, %f36;
|
529 |
+
@%p2 bra $L__BB1_4;
|
530 |
+
ex2.approx.ftz.f32 %f37, %f48;
|
531 |
+
mov.f32 %f38, 0f3F800000;
|
532 |
+
sub.f32 %f39, %f38, %f37;
|
533 |
+
mov.b32 %r1, %f39;
|
534 |
+
mov.b32 %r2, %f14;
|
535 |
+
and.b32 %r3, %r2, -2147483648;
|
536 |
+
or.b32 %r4, %r3, %r1;
|
537 |
+
mov.b32 %f48, %r4;
|
538 |
+
$L__BB1_4:
|
539 |
+
st.param.f32 [func_retval0+0], %f48;
|
540 |
+
ret;
|
541 |
+
$L__func_end1:
|
542 |
+
|
543 |
+
}
|
544 |
+
.file 1 "/tmp/torchinductor_root/jf/cjfoqo3nutni5cmtw4brla34cz45fusadehkxfkr2fie2qgo7vwt.py"
|
545 |
+
.section .debug_abbrev
|
546 |
+
{
|
547 |
+
.b8 1
|
548 |
+
.b8 17
|
549 |
+
.b8 1
|
550 |
+
.b8 37
|
551 |
+
.b8 8
|
552 |
+
.b8 19
|
553 |
+
.b8 5
|
554 |
+
.b8 3
|
555 |
+
.b8 8
|
556 |
+
.b8 16
|
557 |
+
.b8 6
|
558 |
+
.b8 27
|
559 |
+
.b8 8
|
560 |
+
.b8 180
|
561 |
+
.b8 66
|
562 |
+
.b8 12
|
563 |
+
.b8 17
|
564 |
+
.b8 1
|
565 |
+
.b8 18
|
566 |
+
.b8 1
|
567 |
+
.b8 0
|
568 |
+
.b8 0
|
569 |
+
.b8 2
|
570 |
+
.b8 46
|
571 |
+
.b8 0
|
572 |
+
.b8 17
|
573 |
+
.b8 1
|
574 |
+
.b8 18
|
575 |
+
.b8 1
|
576 |
+
.b8 64
|
577 |
+
.b8 10
|
578 |
+
.b8 135
|
579 |
+
.b8 64
|
580 |
+
.b8 8
|
581 |
+
.b8 3
|
582 |
+
.b8 8
|
583 |
+
.b8 58
|
584 |
+
.b8 11
|
585 |
+
.b8 59
|
586 |
+
.b8 11
|
587 |
+
.b8 63
|
588 |
+
.b8 12
|
589 |
+
.b8 0
|
590 |
+
.b8 0
|
591 |
+
.b8 0
|
592 |
+
}
|
593 |
+
.section .debug_info
|
594 |
+
{
|
595 |
+
.b32 176
|
596 |
+
.b8 2
|
597 |
+
.b8 0
|
598 |
+
.b32 .debug_abbrev
|
599 |
+
.b8 8
|
600 |
+
.b8 1
|
601 |
+
.b8 116
|
602 |
+
.b8 114
|
603 |
+
.b8 105
|
604 |
+
.b8 116
|
605 |
+
.b8 111
|
606 |
+
.b8 110
|
607 |
+
.b8 0
|
608 |
+
.b8 2
|
609 |
+
.b8 0
|
610 |
+
.b8 99
|
611 |
+
.b8 106
|
612 |
+
.b8 102
|
613 |
+
.b8 111
|
614 |
+
.b8 113
|
615 |
+
.b8 111
|
616 |
+
.b8 51
|
617 |
+
.b8 110
|
618 |
+
.b8 117
|
619 |
+
.b8 116
|
620 |
+
.b8 110
|
621 |
+
.b8 105
|
622 |
+
.b8 53
|
623 |
+
.b8 99
|
624 |
+
.b8 109
|
625 |
+
.b8 116
|
626 |
+
.b8 119
|
627 |
+
.b8 52
|
628 |
+
.b8 98
|
629 |
+
.b8 114
|
630 |
+
.b8 108
|
631 |
+
.b8 97
|
632 |
+
.b8 51
|
633 |
+
.b8 52
|
634 |
+
.b8 99
|
635 |
+
.b8 122
|
636 |
+
.b8 52
|
637 |
+
.b8 53
|
638 |
+
.b8 102
|
639 |
+
.b8 117
|
640 |
+
.b8 115
|
641 |
+
.b8 97
|
642 |
+
.b8 100
|
643 |
+
.b8 101
|
644 |
+
.b8 104
|
645 |
+
.b8 107
|
646 |
+
.b8 120
|
647 |
+
.b8 102
|
648 |
+
.b8 107
|
649 |
+
.b8 114
|
650 |
+
.b8 50
|
651 |
+
.b8 102
|
652 |
+
.b8 105
|
653 |
+
.b8 101
|
654 |
+
.b8 50
|
655 |
+
.b8 113
|
656 |
+
.b8 103
|
657 |
+
.b8 111
|
658 |
+
.b8 55
|
659 |
+
.b8 118
|
660 |
+
.b8 119
|
661 |
+
.b8 116
|
662 |
+
.b8 46
|
663 |
+
.b8 112
|
664 |
+
.b8 121
|
665 |
+
.b8 0
|
666 |
+
.b32 .debug_line
|
667 |
+
.b8 47
|
668 |
+
.b8 116
|
669 |
+
.b8 109
|
670 |
+
.b8 112
|
671 |
+
.b8 47
|
672 |
+
.b8 116
|
673 |
+
.b8 111
|
674 |
+
.b8 114
|
675 |
+
.b8 99
|
676 |
+
.b8 104
|
677 |
+
.b8 105
|
678 |
+
.b8 110
|
679 |
+
.b8 100
|
680 |
+
.b8 117
|
681 |
+
.b8 99
|
682 |
+
.b8 116
|
683 |
+
.b8 111
|
684 |
+
.b8 114
|
685 |
+
.b8 95
|
686 |
+
.b8 114
|
687 |
+
.b8 111
|
688 |
+
.b8 111
|
689 |
+
.b8 116
|
690 |
+
.b8 47
|
691 |
+
.b8 106
|
692 |
+
.b8 102
|
693 |
+
.b8 0
|
694 |
+
.b8 1
|
695 |
+
.b64 $L__func_begin0
|
696 |
+
.b64 $L__func_end0
|
697 |
+
.b8 2
|
698 |
+
.b64 $L__func_begin0
|
699 |
+
.b64 $L__func_end0
|
700 |
+
.b8 1
|
701 |
+
.b8 156
|
702 |
+
.b8 116
|
703 |
+
.b8 114
|
704 |
+
.b8 105
|
705 |
+
.b8 116
|
706 |
+
.b8 111
|
707 |
+
.b8 110
|
708 |
+
.b8 95
|
709 |
+
.b8 95
|
710 |
+
.b8 48
|
711 |
+
.b8 100
|
712 |
+
.b8 49
|
713 |
+
.b8 100
|
714 |
+
.b8 50
|
715 |
+
.b8 100
|
716 |
+
.b8 101
|
717 |
+
.b8 0
|
718 |
+
.b8 116
|
719 |
+
.b8 114
|
720 |
+
.b8 105
|
721 |
+
.b8 116
|
722 |
+
.b8 111
|
723 |
+
.b8 110
|
724 |
+
.b8 95
|
725 |
+
.b8 95
|
726 |
+
.b8 48
|
727 |
+
.b8 100
|
728 |
+
.b8 49
|
729 |
+
.b8 100
|
730 |
+
.b8 50
|
731 |
+
.b8 100
|
732 |
+
.b8 101
|
733 |
+
.b8 0
|
734 |
+
.b8 1
|
735 |
+
.b8 18
|
736 |
+
.b8 1
|
737 |
+
.b8 0
|
738 |
+
}
|
739 |
+
.section .debug_pubnames
|
740 |
+
{
|
741 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
742 |
+
$L__pubNames_start0:
|
743 |
+
.b8 2
|
744 |
+
.b8 0
|
745 |
+
.b32 .debug_info
|
746 |
+
.b32 180
|
747 |
+
.b32 125
|
748 |
+
.b8 116
|
749 |
+
.b8 114
|
750 |
+
.b8 105
|
751 |
+
.b8 116
|
752 |
+
.b8 111
|
753 |
+
.b8 110
|
754 |
+
.b8 95
|
755 |
+
.b8 95
|
756 |
+
.b8 48
|
757 |
+
.b8 100
|
758 |
+
.b8 49
|
759 |
+
.b8 100
|
760 |
+
.b8 50
|
761 |
+
.b8 100
|
762 |
+
.b8 101
|
763 |
+
.b8 0
|
764 |
+
.b32 0
|
765 |
+
$L__pubNames_end0:
|
766 |
+
}
|
767 |
+
.section .debug_pubtypes
|
768 |
+
{
|
769 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
770 |
+
$L__pubTypes_start0:
|
771 |
+
.b8 2
|
772 |
+
.b8 0
|
773 |
+
.b32 .debug_info
|
774 |
+
.b32 180
|
775 |
+
.b32 0
|
776 |
+
$L__pubTypes_end0:
|
777 |
+
}
|
778 |
+
.section .debug_loc { }
|
.triton/dump/415aac87553b7d064f52694fa7254686/triton_.ttir
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<1.000000e+00> : tensor<1024xf32>
|
4 |
+
%cst_0 = arith.constant dense<0.707106769> : tensor<1024xf32>
|
5 |
+
%cst_1 = arith.constant dense<5.000000e-01> : tensor<1024xf32>
|
6 |
+
%c1024_i32 = arith.constant 1024 : i32
|
7 |
+
%0 = tt.get_program_id x : i32
|
8 |
+
%1 = arith.muli %0, %c1024_i32 : i32
|
9 |
+
%2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
|
10 |
+
%3 = tt.splat %1 : (i32) -> tensor<1024xi32>
|
11 |
+
%4 = arith.addi %3, %2 : tensor<1024xi32>
|
12 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
|
13 |
+
%6 = tt.addptr %5, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
|
14 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xbf16>
|
15 |
+
%8 = arith.extf %7 : tensor<1024xbf16> to tensor<1024xf32>
|
16 |
+
%9 = arith.mulf %8, %cst_1 : tensor<1024xf32>
|
17 |
+
%10 = arith.mulf %8, %cst_0 : tensor<1024xf32>
|
18 |
+
%11 = tt.extern_elementwise %10 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_erff"} : (tensor<1024xf32>) -> tensor<1024xf32>
|
19 |
+
%12 = arith.addf %11, %cst : tensor<1024xf32>
|
20 |
+
%13 = arith.mulf %9, %12 : tensor<1024xf32>
|
21 |
+
%14 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>>
|
22 |
+
%15 = tt.addptr %14, %4 : tensor<1024x!tt.ptr<bf16, 1>>, tensor<1024xi32>
|
23 |
+
%16 = arith.truncf %13 : tensor<1024xf32> to tensor<1024xbf16>
|
24 |
+
tt.store %15, %16 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16>
|
25 |
+
tt.return
|
26 |
+
}
|
27 |
+
}
|
.triton/dump/51e329eae41e4ee17aa201fff8371d94/triton_.llir
ADDED
The diff for this file is too large to render.
See raw diff
|
|
.triton/dump/645565eaba0a18dd23ef200fe9abb0c0/triton_.ttir
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2d3d4d5d6d7d8de9de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg7: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c512_i32 = arith.constant 512 : i32
|
4 |
+
%c256_i32 = arith.constant 256 : i32
|
5 |
+
%cst = arith.constant 0.000000e+00 : f32
|
6 |
+
%cst_0 = arith.constant 2.560000e+02 : f32
|
7 |
+
%cst_1 = arith.constant 9.99999974E-6 : f32
|
8 |
+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<256xf32>
|
9 |
+
%cst_3 = arith.constant dense<256> : tensor<1xi64>
|
10 |
+
%cst_4 = arith.constant dense<50257> : tensor<1xi64>
|
11 |
+
%cst_5 = arith.constant dense<0> : tensor<1xi64>
|
12 |
+
%cst_6 = arith.constant dense<256> : tensor<256xi32>
|
13 |
+
%0 = tt.get_program_id x : i32
|
14 |
+
%1 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
15 |
+
%2 = arith.cmpi slt, %1, %cst_6 : tensor<256xi32>
|
16 |
+
%3 = arith.remsi %0, %c512_i32 : i32
|
17 |
+
%4 = tt.addptr %arg1, %0 : !tt.ptr<i64, 1>, i32
|
18 |
+
%5 = tt.splat %4 : (!tt.ptr<i64, 1>) -> tensor<1x!tt.ptr<i64, 1>>
|
19 |
+
%6 = tt.load %5 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<1xi64>
|
20 |
+
%7 = arith.muli %3, %c256_i32 : i32
|
21 |
+
%8 = tt.splat %7 : (i32) -> tensor<256xi32>
|
22 |
+
%9 = arith.addi %1, %8 : tensor<256xi32>
|
23 |
+
%10 = tt.splat %arg3 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
24 |
+
%11 = tt.addptr %10, %9 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
25 |
+
%12 = tt.load %11, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
|
26 |
+
%13 = tt.splat %arg4 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
27 |
+
%14 = tt.addptr %13, %1 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
28 |
+
%15 = tt.load %14, %2, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<256xf32>
|
29 |
+
%16 = arith.addi %6, %cst_4 : tensor<1xi64>
|
30 |
+
%17 = arith.cmpi slt, %6, %cst_5 : tensor<1xi64>
|
31 |
+
%18 = arith.select %17, %16, %6 : tensor<1xi1>, tensor<1xi64>
|
32 |
+
%19 = arith.cmpi sge, %18, %cst_5 : tensor<1xi64>
|
33 |
+
%20 = arith.cmpi slt, %18, %cst_4 : tensor<1xi64>
|
34 |
+
%21 = arith.andi %19, %20 : tensor<1xi1>
|
35 |
+
tt.assert %21, "index out of bounds: 0 <= tmp3 < 50257", "<frozen importlib._bootstrap_external>", "_call_with_frames_removed", 883 : tensor<1xi1>
|
36 |
+
%22 = arith.muli %18, %cst_3 : tensor<1xi64>
|
37 |
+
%23 = tt.broadcast %22 : (tensor<1xi64>) -> tensor<256xi64>
|
38 |
+
%24 = arith.extsi %1 : tensor<256xi32> to tensor<256xi64>
|
39 |
+
%25 = arith.addi %24, %23 : tensor<256xi64>
|
40 |
+
%26 = tt.splat %arg2 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
41 |
+
%27 = tt.addptr %26, %25 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi64>
|
42 |
+
%28 = tt.load %27, %2, %cst_2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xf32>
|
43 |
+
%29 = arith.addf %28, %12 : tensor<256xf32>
|
44 |
+
%30 = arith.select %2, %29, %cst_2 : tensor<256xi1>, tensor<256xf32>
|
45 |
+
%31 = "tt.reduce"(%30) <{axis = 0 : i32}> ({
|
46 |
+
^bb0(%arg10: f32, %arg11: f32):
|
47 |
+
%60 = arith.addf %arg10, %arg11 : f32
|
48 |
+
tt.reduce.return %60 : f32
|
49 |
+
}) : (tensor<256xf32>) -> f32
|
50 |
+
%32 = arith.addf %31, %cst : f32
|
51 |
+
%33 = arith.divf %32, %cst_0 : f32
|
52 |
+
%34 = tt.splat %33 : (f32) -> tensor<1xf32>
|
53 |
+
%35 = tt.splat %33 : (f32) -> tensor<256xf32>
|
54 |
+
%36 = arith.subf %29, %35 : tensor<256xf32>
|
55 |
+
%37 = arith.mulf %36, %36 : tensor<256xf32>
|
56 |
+
%38 = arith.select %2, %37, %cst_2 : tensor<256xi1>, tensor<256xf32>
|
57 |
+
%39 = "tt.reduce"(%38) <{axis = 0 : i32}> ({
|
58 |
+
^bb0(%arg10: f32, %arg11: f32):
|
59 |
+
%60 = arith.addf %arg10, %arg11 : f32
|
60 |
+
tt.reduce.return %60 : f32
|
61 |
+
}) : (tensor<256xf32>) -> f32
|
62 |
+
%40 = arith.addf %39, %cst : f32
|
63 |
+
%41 = arith.divf %40, %cst_0 : f32
|
64 |
+
%42 = arith.addf %41, %cst_1 : f32
|
65 |
+
%43 = tt.extern_elementwise %42 {libname = "libdevice", libpath = "/usr/local/lib/python3.10/dist-packages/triton/language/../third_party/cuda/lib/libdevice.10.bc", pure = true, symbol = "__nv_rsqrtf"} : (f32) -> f32
|
66 |
+
%44 = tt.splat %43 : (f32) -> tensor<1xf32>
|
67 |
+
%45 = tt.splat %43 : (f32) -> tensor<256xf32>
|
68 |
+
%46 = arith.mulf %36, %45 : tensor<256xf32>
|
69 |
+
%47 = arith.mulf %46, %15 : tensor<256xf32>
|
70 |
+
%48 = arith.muli %0, %c256_i32 : i32
|
71 |
+
%49 = tt.splat %48 : (i32) -> tensor<256xi32>
|
72 |
+
%50 = arith.addi %1, %49 : tensor<256xi32>
|
73 |
+
%51 = tt.splat %arg5 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
74 |
+
%52 = tt.addptr %51, %50 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
75 |
+
tt.store %52, %29, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
|
76 |
+
gpu.barrier
|
77 |
+
%53 = tt.addptr %arg0, %0 : !tt.ptr<f32, 1>, i32
|
78 |
+
%54 = tt.splat %53 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
|
79 |
+
tt.store %54, %44 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
|
80 |
+
%55 = tt.splat %arg7 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
81 |
+
%56 = tt.addptr %55, %50 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
82 |
+
%57 = arith.truncf %47 : tensor<256xf32> to tensor<256xbf16>
|
83 |
+
tt.store %56, %57, %2 {cache = 1 : i32, evict = 1 : i32} : tensor<256xbf16>
|
84 |
+
%58 = tt.addptr %arg6, %0 : !tt.ptr<f32, 1>, i32
|
85 |
+
%59 = tt.splat %58 : (!tt.ptr<f32, 1>) -> tensor<1x!tt.ptr<f32, 1>>
|
86 |
+
tt.store %59, %34 {cache = 1 : i32, evict = 1 : i32} : tensor<1xf32>
|
87 |
+
tt.return
|
88 |
+
}
|
89 |
+
}
|
.triton/dump/7dc5bb3e5c2bb99527fff34c6fba7810/triton_.ttgir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
|
2 |
+
module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
|
3 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
4 |
+
%cst = arith.constant dense<512> : tensor<128xi32, #blocked>
|
5 |
+
%c128_i32 = arith.constant 128 : i32
|
6 |
+
%0 = tt.get_program_id x : i32
|
7 |
+
%1 = arith.muli %0, %c128_i32 : i32
|
8 |
+
%2 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked>
|
9 |
+
%3 = tt.splat %1 : (i32) -> tensor<128xi32, #blocked>
|
10 |
+
%4 = arith.addi %3, %2 : tensor<128xi32, #blocked>
|
11 |
+
%5 = arith.cmpi slt, %4, %cst : tensor<128xi32, #blocked>
|
12 |
+
%6 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<128x!tt.ptr<i64, 1>, #blocked>
|
13 |
+
%7 = tt.addptr %6, %4 : tensor<128x!tt.ptr<i64, 1>, #blocked>, tensor<128xi32, #blocked>
|
14 |
+
%8 = arith.extsi %4 : tensor<128xi32, #blocked> to tensor<128xi64, #blocked>
|
15 |
+
tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<128xi64, #blocked>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|
.triton/dump/8c4bac4d904709a8b7e8c698132d974c/triton_.ttir
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1de(%arg0: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%cst = arith.constant dense<512> : tensor<256xi32>
|
4 |
+
%c256_i32 = arith.constant 256 : i32
|
5 |
+
%0 = tt.get_program_id x : i32
|
6 |
+
%1 = arith.muli %0, %c256_i32 : i32
|
7 |
+
%2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
8 |
+
%3 = tt.splat %1 : (i32) -> tensor<256xi32>
|
9 |
+
%4 = arith.addi %3, %2 : tensor<256xi32>
|
10 |
+
%5 = arith.cmpi slt, %4, %cst : tensor<256xi32>
|
11 |
+
%6 = tt.splat %arg0 : (!tt.ptr<i64, 1>) -> tensor<256x!tt.ptr<i64, 1>>
|
12 |
+
%7 = tt.addptr %6, %4 : tensor<256x!tt.ptr<i64, 1>>, tensor<256xi32>
|
13 |
+
%8 = arith.extsi %4 : tensor<256xi32> to tensor<256xi64>
|
14 |
+
tt.store %7, %8, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<256xi64>
|
15 |
+
tt.return
|
16 |
+
}
|
17 |
+
}
|
.triton/dump/93e5abc5363b9438178c618128714f73/triton_.cubin
ADDED
Binary file (28.6 kB). View file
|
|
.triton/dump/93e5abc5363b9438178c618128714f73/triton_.ptx
ADDED
@@ -0,0 +1,861 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//
|
2 |
+
// Generated by LLVM NVPTX Back-End
|
3 |
+
//
|
4 |
+
|
5 |
+
.version 8.2
|
6 |
+
.target sm_89
|
7 |
+
.address_size 64
|
8 |
+
|
9 |
+
// .globl triton__0d1d2de
|
10 |
+
.global .align 1 .b8 _$_str[11] = {95, 95, 67, 85, 68, 65, 95, 70, 84, 90, 0};
|
11 |
+
|
12 |
+
.visible .entry triton__0d1d2de(
|
13 |
+
.param .u64 triton__0d1d2de_param_0,
|
14 |
+
.param .u64 triton__0d1d2de_param_1,
|
15 |
+
.param .u32 triton__0d1d2de_param_2
|
16 |
+
)
|
17 |
+
.maxntid 128, 1, 1
|
18 |
+
{
|
19 |
+
.reg .pred %p<28>;
|
20 |
+
.reg .b16 %rs<25>;
|
21 |
+
.reg .b32 %r<79>;
|
22 |
+
.reg .f32 %f<487>;
|
23 |
+
.reg .b64 %rd<8>;
|
24 |
+
.loc 1 18 0
|
25 |
+
$L__func_begin0:
|
26 |
+
.loc 1 18 0
|
27 |
+
|
28 |
+
ld.param.u64 %rd4, [triton__0d1d2de_param_0];
|
29 |
+
ld.param.u64 %rd5, [triton__0d1d2de_param_1];
|
30 |
+
$L__tmp0:
|
31 |
+
.loc 1 21 36
|
32 |
+
mov.u32 %r26, %tid.x;
|
33 |
+
shl.b32 %r27, %r26, 3;
|
34 |
+
and.b32 %r28, %r27, 1016;
|
35 |
+
.loc 1 20 28
|
36 |
+
mov.u32 %r1, %ctaid.x;
|
37 |
+
.loc 1 20 33
|
38 |
+
shl.b32 %r29, %r1, 10;
|
39 |
+
.loc 1 21 23
|
40 |
+
or.b32 %r30, %r29, %r28;
|
41 |
+
.loc 1 24 34
|
42 |
+
mul.wide.s32 %rd6, %r30, 2;
|
43 |
+
add.s64 %rd7, %rd4, %rd6;
|
44 |
+
mov.pred %p1, -1;
|
45 |
+
.loc 1 24 39
|
46 |
+
mov.u32 %r2, 0x0;
|
47 |
+
mov.u32 %r3, 0x0;
|
48 |
+
mov.u32 %r4, 0x0;
|
49 |
+
mov.u32 %r5, 0x0;
|
50 |
+
@%p1 ld.global.v4.b32 { %r2, %r3, %r4, %r5 }, [ %rd7 + 0 ];
|
51 |
+
.loc 1 25 30
|
52 |
+
add.s64 %rd3, %rd5, %rd6;
|
53 |
+
.loc 1 25 35
|
54 |
+
mov.u32 %r14, 0x0;
|
55 |
+
mov.u32 %r15, 0x0;
|
56 |
+
mov.u32 %r16, 0x0;
|
57 |
+
mov.u32 %r17, 0x0;
|
58 |
+
@%p1 ld.global.v4.b32 { %r14, %r15, %r16, %r17 }, [ %rd3 + 0 ];
|
59 |
+
cvt.u16.u32 %rs9, %r14;
|
60 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs10}, %r14; }
|
61 |
+
cvt.u16.u32 %rs11, %r15;
|
62 |
+
.loc 1 25 44
|
63 |
+
cvt.f32.bf16 %r18, %rs9;
|
64 |
+
mov.b32 %f9, %r18;
|
65 |
+
cvt.f32.bf16 %r19, %rs10;
|
66 |
+
mov.b32 %f10, %r19;
|
67 |
+
.loc 1 29 18
|
68 |
+
mul.f32 %f17, %f9, 0f3F3504F3;
|
69 |
+
.loc 1 30 23
|
70 |
+
abs.ftz.f32 %f25, %f17;
|
71 |
+
setp.ge.f32 %p3, %f25, 0f3F8060FE;
|
72 |
+
mov.f32 %f421, 0f3789CA3C;
|
73 |
+
mov.f32 %f420, 0fB9F560B9;
|
74 |
+
mov.f32 %f419, 0f3BAC840B;
|
75 |
+
mov.f32 %f418, 0fBD0C8162;
|
76 |
+
mov.f32 %f417, 0f3E1CF906;
|
77 |
+
mov.f32 %f416, 0f3F6A937E;
|
78 |
+
mov.f32 %f415, 0f3F20D842;
|
79 |
+
mov.f32 %f422, %f25;
|
80 |
+
@%p3 bra $L__BB0_2;
|
81 |
+
.loc 1 0 23
|
82 |
+
mov.f32 %f421, 0f38B1E96A;
|
83 |
+
mov.f32 %f420, 0fBA574D20;
|
84 |
+
mov.f32 %f419, 0f3BAAD5EA;
|
85 |
+
mov.f32 %f418, 0fBCDC1BE7;
|
86 |
+
mov.f32 %f417, 0f3DE718AF;
|
87 |
+
mov.f32 %f416, 0fBEC093AC;
|
88 |
+
mov.f32 %f415, 0f3E0375D3;
|
89 |
+
.loc 1 30 23
|
90 |
+
mul.f32 %f422, %f17, %f17;
|
91 |
+
$L__BB0_2:
|
92 |
+
.loc 1 0 0
|
93 |
+
cvt.f32.bf16 %r20, %rs11;
|
94 |
+
mul.f32 %f18, %f10, 0f3F3504F3;
|
95 |
+
.loc 1 30 23
|
96 |
+
setp.ltu.f32 %p4, %f25, 0f3F8060FE;
|
97 |
+
fma.rn.ftz.f32 %f143, %f421, %f422, %f420;
|
98 |
+
fma.rn.ftz.f32 %f144, %f143, %f422, %f419;
|
99 |
+
fma.rn.ftz.f32 %f145, %f144, %f422, %f418;
|
100 |
+
fma.rn.ftz.f32 %f146, %f145, %f422, %f417;
|
101 |
+
fma.rn.ftz.f32 %f147, %f146, %f422, %f416;
|
102 |
+
fma.rn.ftz.f32 %f148, %f147, %f422, %f415;
|
103 |
+
neg.f32 %f149, %f422;
|
104 |
+
selp.f32 %f150, %f149, %f17, %p3;
|
105 |
+
fma.rn.ftz.f32 %f423, %f148, %f150, %f150;
|
106 |
+
mov.f32 %f414, 0f3F800000;
|
107 |
+
@%p4 bra $L__BB0_4;
|
108 |
+
ex2.approx.ftz.f32 %f151, %f423;
|
109 |
+
sub.f32 %f153, %f414, %f151;
|
110 |
+
mov.b32 %r31, %f153;
|
111 |
+
mov.b32 %r32, %f17;
|
112 |
+
and.b32 %r33, %r32, -2147483648;
|
113 |
+
or.b32 %r34, %r33, %r31;
|
114 |
+
mov.b32 %f423, %r34;
|
115 |
+
$L__BB0_4:
|
116 |
+
.loc 1 0 0
|
117 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs12}, %r15; }
|
118 |
+
mov.b32 %f11, %r20;
|
119 |
+
.loc 1 30 23
|
120 |
+
abs.ftz.f32 %f38, %f18;
|
121 |
+
setp.ge.f32 %p6, %f38, 0f3F8060FE;
|
122 |
+
mov.f32 %f430, 0f3789CA3C;
|
123 |
+
mov.f32 %f429, 0fB9F560B9;
|
124 |
+
mov.f32 %f428, 0f3BAC840B;
|
125 |
+
mov.f32 %f427, 0fBD0C8162;
|
126 |
+
mov.f32 %f426, 0f3E1CF906;
|
127 |
+
mov.f32 %f425, 0f3F6A937E;
|
128 |
+
mov.f32 %f424, 0f3F20D842;
|
129 |
+
mov.f32 %f431, %f38;
|
130 |
+
@%p6 bra $L__BB0_6;
|
131 |
+
mul.f32 %f431, %f18, %f18;
|
132 |
+
mov.f32 %f430, 0f38B1E96A;
|
133 |
+
mov.f32 %f429, 0fBA574D20;
|
134 |
+
mov.f32 %f428, 0f3BAAD5EA;
|
135 |
+
mov.f32 %f427, 0fBCDC1BE7;
|
136 |
+
mov.f32 %f426, 0f3DE718AF;
|
137 |
+
mov.f32 %f425, 0fBEC093AC;
|
138 |
+
mov.f32 %f424, 0f3E0375D3;
|
139 |
+
$L__BB0_6:
|
140 |
+
.loc 1 0 0
|
141 |
+
cvt.f32.bf16 %r21, %rs12;
|
142 |
+
mul.f32 %f19, %f11, 0f3F3504F3;
|
143 |
+
.loc 1 30 23
|
144 |
+
setp.ltu.f32 %p7, %f38, 0f3F8060FE;
|
145 |
+
fma.rn.ftz.f32 %f168, %f430, %f431, %f429;
|
146 |
+
fma.rn.ftz.f32 %f169, %f168, %f431, %f428;
|
147 |
+
fma.rn.ftz.f32 %f170, %f169, %f431, %f427;
|
148 |
+
fma.rn.ftz.f32 %f171, %f170, %f431, %f426;
|
149 |
+
fma.rn.ftz.f32 %f172, %f171, %f431, %f425;
|
150 |
+
fma.rn.ftz.f32 %f173, %f172, %f431, %f424;
|
151 |
+
neg.f32 %f174, %f431;
|
152 |
+
selp.f32 %f175, %f174, %f18, %p6;
|
153 |
+
fma.rn.ftz.f32 %f432, %f173, %f175, %f175;
|
154 |
+
@%p7 bra $L__BB0_8;
|
155 |
+
ex2.approx.ftz.f32 %f176, %f432;
|
156 |
+
sub.f32 %f178, %f414, %f176;
|
157 |
+
mov.b32 %r35, %f178;
|
158 |
+
mov.b32 %r36, %f18;
|
159 |
+
and.b32 %r37, %r36, -2147483648;
|
160 |
+
or.b32 %r38, %r37, %r35;
|
161 |
+
mov.b32 %f432, %r38;
|
162 |
+
$L__BB0_8:
|
163 |
+
.loc 1 0 0
|
164 |
+
cvt.u16.u32 %rs13, %r16;
|
165 |
+
mov.b32 %f12, %r21;
|
166 |
+
.loc 1 30 23
|
167 |
+
abs.ftz.f32 %f51, %f19;
|
168 |
+
setp.ge.f32 %p9, %f51, 0f3F8060FE;
|
169 |
+
mov.f32 %f439, 0f3789CA3C;
|
170 |
+
mov.f32 %f438, 0fB9F560B9;
|
171 |
+
mov.f32 %f437, 0f3BAC840B;
|
172 |
+
mov.f32 %f436, 0fBD0C8162;
|
173 |
+
mov.f32 %f435, 0f3E1CF906;
|
174 |
+
mov.f32 %f434, 0f3F6A937E;
|
175 |
+
mov.f32 %f433, 0f3F20D842;
|
176 |
+
mov.f32 %f440, %f51;
|
177 |
+
@%p9 bra $L__BB0_10;
|
178 |
+
mul.f32 %f440, %f19, %f19;
|
179 |
+
mov.f32 %f439, 0f38B1E96A;
|
180 |
+
mov.f32 %f438, 0fBA574D20;
|
181 |
+
mov.f32 %f437, 0f3BAAD5EA;
|
182 |
+
mov.f32 %f436, 0fBCDC1BE7;
|
183 |
+
mov.f32 %f435, 0f3DE718AF;
|
184 |
+
mov.f32 %f434, 0fBEC093AC;
|
185 |
+
mov.f32 %f433, 0f3E0375D3;
|
186 |
+
$L__BB0_10:
|
187 |
+
.loc 1 0 0
|
188 |
+
cvt.f32.bf16 %r22, %rs13;
|
189 |
+
mul.f32 %f20, %f12, 0f3F3504F3;
|
190 |
+
.loc 1 30 23
|
191 |
+
setp.ltu.f32 %p10, %f51, 0f3F8060FE;
|
192 |
+
fma.rn.ftz.f32 %f193, %f439, %f440, %f438;
|
193 |
+
fma.rn.ftz.f32 %f194, %f193, %f440, %f437;
|
194 |
+
fma.rn.ftz.f32 %f195, %f194, %f440, %f436;
|
195 |
+
fma.rn.ftz.f32 %f196, %f195, %f440, %f435;
|
196 |
+
fma.rn.ftz.f32 %f197, %f196, %f440, %f434;
|
197 |
+
fma.rn.ftz.f32 %f198, %f197, %f440, %f433;
|
198 |
+
neg.f32 %f199, %f440;
|
199 |
+
selp.f32 %f200, %f199, %f19, %p9;
|
200 |
+
fma.rn.ftz.f32 %f441, %f198, %f200, %f200;
|
201 |
+
@%p10 bra $L__BB0_12;
|
202 |
+
ex2.approx.ftz.f32 %f201, %f441;
|
203 |
+
sub.f32 %f203, %f414, %f201;
|
204 |
+
mov.b32 %r39, %f203;
|
205 |
+
mov.b32 %r40, %f19;
|
206 |
+
and.b32 %r41, %r40, -2147483648;
|
207 |
+
or.b32 %r42, %r41, %r39;
|
208 |
+
mov.b32 %f441, %r42;
|
209 |
+
$L__BB0_12:
|
210 |
+
.loc 1 0 0
|
211 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs14}, %r16; }
|
212 |
+
mov.b32 %f13, %r22;
|
213 |
+
.loc 1 30 23
|
214 |
+
abs.ftz.f32 %f64, %f20;
|
215 |
+
setp.ge.f32 %p12, %f64, 0f3F8060FE;
|
216 |
+
mov.f32 %f448, 0f3789CA3C;
|
217 |
+
mov.f32 %f447, 0fB9F560B9;
|
218 |
+
mov.f32 %f446, 0f3BAC840B;
|
219 |
+
mov.f32 %f445, 0fBD0C8162;
|
220 |
+
mov.f32 %f444, 0f3E1CF906;
|
221 |
+
mov.f32 %f443, 0f3F6A937E;
|
222 |
+
mov.f32 %f442, 0f3F20D842;
|
223 |
+
mov.f32 %f449, %f64;
|
224 |
+
@%p12 bra $L__BB0_14;
|
225 |
+
mul.f32 %f449, %f20, %f20;
|
226 |
+
mov.f32 %f448, 0f38B1E96A;
|
227 |
+
mov.f32 %f447, 0fBA574D20;
|
228 |
+
mov.f32 %f446, 0f3BAAD5EA;
|
229 |
+
mov.f32 %f445, 0fBCDC1BE7;
|
230 |
+
mov.f32 %f444, 0f3DE718AF;
|
231 |
+
mov.f32 %f443, 0fBEC093AC;
|
232 |
+
mov.f32 %f442, 0f3E0375D3;
|
233 |
+
$L__BB0_14:
|
234 |
+
.loc 1 0 0
|
235 |
+
cvt.f32.bf16 %r23, %rs14;
|
236 |
+
mul.f32 %f21, %f13, 0f3F3504F3;
|
237 |
+
.loc 1 30 23
|
238 |
+
setp.ltu.f32 %p13, %f64, 0f3F8060FE;
|
239 |
+
fma.rn.ftz.f32 %f218, %f448, %f449, %f447;
|
240 |
+
fma.rn.ftz.f32 %f219, %f218, %f449, %f446;
|
241 |
+
fma.rn.ftz.f32 %f220, %f219, %f449, %f445;
|
242 |
+
fma.rn.ftz.f32 %f221, %f220, %f449, %f444;
|
243 |
+
fma.rn.ftz.f32 %f222, %f221, %f449, %f443;
|
244 |
+
fma.rn.ftz.f32 %f223, %f222, %f449, %f442;
|
245 |
+
neg.f32 %f224, %f449;
|
246 |
+
selp.f32 %f225, %f224, %f20, %p12;
|
247 |
+
fma.rn.ftz.f32 %f450, %f223, %f225, %f225;
|
248 |
+
@%p13 bra $L__BB0_16;
|
249 |
+
ex2.approx.ftz.f32 %f226, %f450;
|
250 |
+
sub.f32 %f228, %f414, %f226;
|
251 |
+
mov.b32 %r43, %f228;
|
252 |
+
mov.b32 %r44, %f20;
|
253 |
+
and.b32 %r45, %r44, -2147483648;
|
254 |
+
or.b32 %r46, %r45, %r43;
|
255 |
+
mov.b32 %f450, %r46;
|
256 |
+
$L__BB0_16:
|
257 |
+
.loc 1 0 0
|
258 |
+
cvt.u16.u32 %rs15, %r17;
|
259 |
+
mov.b32 %f14, %r23;
|
260 |
+
.loc 1 30 23
|
261 |
+
abs.ftz.f32 %f77, %f21;
|
262 |
+
setp.ge.f32 %p15, %f77, 0f3F8060FE;
|
263 |
+
mov.f32 %f457, 0f3789CA3C;
|
264 |
+
mov.f32 %f456, 0fB9F560B9;
|
265 |
+
mov.f32 %f455, 0f3BAC840B;
|
266 |
+
mov.f32 %f454, 0fBD0C8162;
|
267 |
+
mov.f32 %f453, 0f3E1CF906;
|
268 |
+
mov.f32 %f452, 0f3F6A937E;
|
269 |
+
mov.f32 %f451, 0f3F20D842;
|
270 |
+
mov.f32 %f458, %f77;
|
271 |
+
@%p15 bra $L__BB0_18;
|
272 |
+
mul.f32 %f458, %f21, %f21;
|
273 |
+
mov.f32 %f457, 0f38B1E96A;
|
274 |
+
mov.f32 %f456, 0fBA574D20;
|
275 |
+
mov.f32 %f455, 0f3BAAD5EA;
|
276 |
+
mov.f32 %f454, 0fBCDC1BE7;
|
277 |
+
mov.f32 %f453, 0f3DE718AF;
|
278 |
+
mov.f32 %f452, 0fBEC093AC;
|
279 |
+
mov.f32 %f451, 0f3E0375D3;
|
280 |
+
$L__BB0_18:
|
281 |
+
.loc 1 0 0
|
282 |
+
cvt.f32.bf16 %r24, %rs15;
|
283 |
+
mul.f32 %f22, %f14, 0f3F3504F3;
|
284 |
+
.loc 1 30 23
|
285 |
+
setp.ltu.f32 %p16, %f77, 0f3F8060FE;
|
286 |
+
fma.rn.ftz.f32 %f243, %f457, %f458, %f456;
|
287 |
+
fma.rn.ftz.f32 %f244, %f243, %f458, %f455;
|
288 |
+
fma.rn.ftz.f32 %f245, %f244, %f458, %f454;
|
289 |
+
fma.rn.ftz.f32 %f246, %f245, %f458, %f453;
|
290 |
+
fma.rn.ftz.f32 %f247, %f246, %f458, %f452;
|
291 |
+
fma.rn.ftz.f32 %f248, %f247, %f458, %f451;
|
292 |
+
neg.f32 %f249, %f458;
|
293 |
+
selp.f32 %f250, %f249, %f21, %p15;
|
294 |
+
fma.rn.ftz.f32 %f459, %f248, %f250, %f250;
|
295 |
+
@%p16 bra $L__BB0_20;
|
296 |
+
ex2.approx.ftz.f32 %f251, %f459;
|
297 |
+
sub.f32 %f253, %f414, %f251;
|
298 |
+
mov.b32 %r47, %f253;
|
299 |
+
mov.b32 %r48, %f21;
|
300 |
+
and.b32 %r49, %r48, -2147483648;
|
301 |
+
or.b32 %r50, %r49, %r47;
|
302 |
+
mov.b32 %f459, %r50;
|
303 |
+
$L__BB0_20:
|
304 |
+
.loc 1 0 0
|
305 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs16}, %r17; }
|
306 |
+
mov.b32 %f15, %r24;
|
307 |
+
.loc 1 30 23
|
308 |
+
abs.ftz.f32 %f90, %f22;
|
309 |
+
setp.ge.f32 %p18, %f90, 0f3F8060FE;
|
310 |
+
mov.f32 %f466, 0f3789CA3C;
|
311 |
+
mov.f32 %f465, 0fB9F560B9;
|
312 |
+
mov.f32 %f464, 0f3BAC840B;
|
313 |
+
mov.f32 %f463, 0fBD0C8162;
|
314 |
+
mov.f32 %f462, 0f3E1CF906;
|
315 |
+
mov.f32 %f461, 0f3F6A937E;
|
316 |
+
mov.f32 %f460, 0f3F20D842;
|
317 |
+
mov.f32 %f467, %f90;
|
318 |
+
@%p18 bra $L__BB0_22;
|
319 |
+
mul.f32 %f467, %f22, %f22;
|
320 |
+
mov.f32 %f466, 0f38B1E96A;
|
321 |
+
mov.f32 %f465, 0fBA574D20;
|
322 |
+
mov.f32 %f464, 0f3BAAD5EA;
|
323 |
+
mov.f32 %f463, 0fBCDC1BE7;
|
324 |
+
mov.f32 %f462, 0f3DE718AF;
|
325 |
+
mov.f32 %f461, 0fBEC093AC;
|
326 |
+
mov.f32 %f460, 0f3E0375D3;
|
327 |
+
$L__BB0_22:
|
328 |
+
.loc 1 0 0
|
329 |
+
cvt.f32.bf16 %r25, %rs16;
|
330 |
+
mul.f32 %f23, %f15, 0f3F3504F3;
|
331 |
+
.loc 1 30 23
|
332 |
+
setp.ltu.f32 %p19, %f90, 0f3F8060FE;
|
333 |
+
fma.rn.ftz.f32 %f268, %f466, %f467, %f465;
|
334 |
+
fma.rn.ftz.f32 %f269, %f268, %f467, %f464;
|
335 |
+
fma.rn.ftz.f32 %f270, %f269, %f467, %f463;
|
336 |
+
fma.rn.ftz.f32 %f271, %f270, %f467, %f462;
|
337 |
+
fma.rn.ftz.f32 %f272, %f271, %f467, %f461;
|
338 |
+
fma.rn.ftz.f32 %f273, %f272, %f467, %f460;
|
339 |
+
neg.f32 %f274, %f467;
|
340 |
+
selp.f32 %f275, %f274, %f22, %p18;
|
341 |
+
fma.rn.ftz.f32 %f468, %f273, %f275, %f275;
|
342 |
+
@%p19 bra $L__BB0_24;
|
343 |
+
ex2.approx.ftz.f32 %f276, %f468;
|
344 |
+
sub.f32 %f278, %f414, %f276;
|
345 |
+
mov.b32 %r51, %f278;
|
346 |
+
mov.b32 %r52, %f22;
|
347 |
+
and.b32 %r53, %r52, -2147483648;
|
348 |
+
or.b32 %r54, %r53, %r51;
|
349 |
+
mov.b32 %f468, %r54;
|
350 |
+
$L__BB0_24:
|
351 |
+
.loc 1 0 0
|
352 |
+
mov.b32 %f16, %r25;
|
353 |
+
.loc 1 30 23
|
354 |
+
abs.ftz.f32 %f103, %f23;
|
355 |
+
setp.ge.f32 %p21, %f103, 0f3F8060FE;
|
356 |
+
mov.f32 %f475, 0f3789CA3C;
|
357 |
+
mov.f32 %f474, 0fB9F560B9;
|
358 |
+
mov.f32 %f473, 0f3BAC840B;
|
359 |
+
mov.f32 %f472, 0fBD0C8162;
|
360 |
+
mov.f32 %f471, 0f3E1CF906;
|
361 |
+
mov.f32 %f470, 0f3F6A937E;
|
362 |
+
mov.f32 %f469, 0f3F20D842;
|
363 |
+
mov.f32 %f476, %f103;
|
364 |
+
@%p21 bra $L__BB0_26;
|
365 |
+
mul.f32 %f476, %f23, %f23;
|
366 |
+
mov.f32 %f475, 0f38B1E96A;
|
367 |
+
mov.f32 %f474, 0fBA574D20;
|
368 |
+
mov.f32 %f473, 0f3BAAD5EA;
|
369 |
+
mov.f32 %f472, 0fBCDC1BE7;
|
370 |
+
mov.f32 %f471, 0f3DE718AF;
|
371 |
+
mov.f32 %f470, 0fBEC093AC;
|
372 |
+
mov.f32 %f469, 0f3E0375D3;
|
373 |
+
$L__BB0_26:
|
374 |
+
.loc 1 0 0
|
375 |
+
cvt.u16.u32 %rs1, %r2;
|
376 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r2; }
|
377 |
+
cvt.u16.u32 %rs3, %r3;
|
378 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r3; }
|
379 |
+
cvt.u16.u32 %rs5, %r4;
|
380 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r4; }
|
381 |
+
cvt.u16.u32 %rs7, %r5;
|
382 |
+
{ .reg .b16 tmp; mov.b32 {tmp, %rs8}, %r5; }
|
383 |
+
mul.f32 %f24, %f16, 0f3F3504F3;
|
384 |
+
.loc 1 30 23
|
385 |
+
setp.ltu.f32 %p22, %f103, 0f3F8060FE;
|
386 |
+
fma.rn.ftz.f32 %f293, %f475, %f476, %f474;
|
387 |
+
fma.rn.ftz.f32 %f294, %f293, %f476, %f473;
|
388 |
+
fma.rn.ftz.f32 %f295, %f294, %f476, %f472;
|
389 |
+
fma.rn.ftz.f32 %f296, %f295, %f476, %f471;
|
390 |
+
fma.rn.ftz.f32 %f297, %f296, %f476, %f470;
|
391 |
+
fma.rn.ftz.f32 %f298, %f297, %f476, %f469;
|
392 |
+
neg.f32 %f299, %f476;
|
393 |
+
selp.f32 %f300, %f299, %f23, %p21;
|
394 |
+
fma.rn.ftz.f32 %f477, %f298, %f300, %f300;
|
395 |
+
@%p22 bra $L__BB0_28;
|
396 |
+
ex2.approx.ftz.f32 %f301, %f477;
|
397 |
+
sub.f32 %f303, %f414, %f301;
|
398 |
+
mov.b32 %r55, %f303;
|
399 |
+
mov.b32 %r56, %f23;
|
400 |
+
and.b32 %r57, %r56, -2147483648;
|
401 |
+
or.b32 %r58, %r57, %r55;
|
402 |
+
mov.b32 %f477, %r58;
|
403 |
+
$L__BB0_28:
|
404 |
+
.loc 1 0 0
|
405 |
+
cvt.f32.bf16 %r6, %rs1;
|
406 |
+
cvt.f32.bf16 %r7, %rs2;
|
407 |
+
cvt.f32.bf16 %r8, %rs3;
|
408 |
+
cvt.f32.bf16 %r9, %rs4;
|
409 |
+
cvt.f32.bf16 %r10, %rs5;
|
410 |
+
cvt.f32.bf16 %r11, %rs6;
|
411 |
+
cvt.f32.bf16 %r12, %rs7;
|
412 |
+
cvt.f32.bf16 %r13, %rs8;
|
413 |
+
.loc 1 30 23
|
414 |
+
abs.ftz.f32 %f116, %f24;
|
415 |
+
setp.ge.f32 %p24, %f116, 0f3F8060FE;
|
416 |
+
mov.f32 %f484, 0f3789CA3C;
|
417 |
+
mov.f32 %f483, 0fB9F560B9;
|
418 |
+
mov.f32 %f482, 0f3BAC840B;
|
419 |
+
mov.f32 %f481, 0fBD0C8162;
|
420 |
+
mov.f32 %f480, 0f3E1CF906;
|
421 |
+
mov.f32 %f479, 0f3F6A937E;
|
422 |
+
mov.f32 %f478, 0f3F20D842;
|
423 |
+
mov.f32 %f485, %f116;
|
424 |
+
@%p24 bra $L__BB0_30;
|
425 |
+
mul.f32 %f485, %f24, %f24;
|
426 |
+
mov.f32 %f484, 0f38B1E96A;
|
427 |
+
mov.f32 %f483, 0fBA574D20;
|
428 |
+
mov.f32 %f482, 0f3BAAD5EA;
|
429 |
+
mov.f32 %f481, 0fBCDC1BE7;
|
430 |
+
mov.f32 %f480, 0f3DE718AF;
|
431 |
+
mov.f32 %f479, 0fBEC093AC;
|
432 |
+
mov.f32 %f478, 0f3E0375D3;
|
433 |
+
$L__BB0_30:
|
434 |
+
.loc 1 0 0
|
435 |
+
mov.b32 %f1, %r6;
|
436 |
+
mov.b32 %f2, %r7;
|
437 |
+
mov.b32 %f3, %r8;
|
438 |
+
mov.b32 %f4, %r9;
|
439 |
+
mov.b32 %f5, %r10;
|
440 |
+
mov.b32 %f6, %r11;
|
441 |
+
mov.b32 %f7, %r12;
|
442 |
+
mov.b32 %f8, %r13;
|
443 |
+
.loc 1 30 23
|
444 |
+
setp.ltu.f32 %p25, %f116, 0f3F8060FE;
|
445 |
+
fma.rn.ftz.f32 %f318, %f484, %f485, %f483;
|
446 |
+
fma.rn.ftz.f32 %f319, %f318, %f485, %f482;
|
447 |
+
fma.rn.ftz.f32 %f320, %f319, %f485, %f481;
|
448 |
+
fma.rn.ftz.f32 %f321, %f320, %f485, %f480;
|
449 |
+
fma.rn.ftz.f32 %f322, %f321, %f485, %f479;
|
450 |
+
fma.rn.ftz.f32 %f323, %f322, %f485, %f478;
|
451 |
+
neg.f32 %f324, %f485;
|
452 |
+
selp.f32 %f325, %f324, %f24, %p24;
|
453 |
+
fma.rn.ftz.f32 %f486, %f323, %f325, %f325;
|
454 |
+
@%p25 bra $L__BB0_32;
|
455 |
+
ex2.approx.ftz.f32 %f326, %f486;
|
456 |
+
sub.f32 %f328, %f414, %f326;
|
457 |
+
mov.b32 %r59, %f328;
|
458 |
+
mov.b32 %r60, %f24;
|
459 |
+
and.b32 %r61, %r60, -2147483648;
|
460 |
+
or.b32 %r62, %r61, %r59;
|
461 |
+
mov.b32 %f486, %r62;
|
462 |
+
$L__BB0_32:
|
463 |
+
.loc 1 32 18
|
464 |
+
add.f32 %f345, %f423, 0f3F800000;
|
465 |
+
add.f32 %f346, %f432, 0f3F800000;
|
466 |
+
add.f32 %f347, %f441, 0f3F800000;
|
467 |
+
add.f32 %f348, %f450, 0f3F800000;
|
468 |
+
add.f32 %f349, %f459, 0f3F800000;
|
469 |
+
add.f32 %f350, %f468, 0f3F800000;
|
470 |
+
add.f32 %f351, %f477, 0f3F800000;
|
471 |
+
add.f32 %f352, %f486, 0f3F800000;
|
472 |
+
.loc 1 35 19
|
473 |
+
mul.f32 %f353, %f9, %f9;
|
474 |
+
mul.f32 %f354, %f10, %f10;
|
475 |
+
mul.f32 %f355, %f11, %f11;
|
476 |
+
mul.f32 %f356, %f12, %f12;
|
477 |
+
mul.f32 %f357, %f13, %f13;
|
478 |
+
mul.f32 %f358, %f14, %f14;
|
479 |
+
mul.f32 %f359, %f15, %f15;
|
480 |
+
mul.f32 %f360, %f16, %f16;
|
481 |
+
.loc 1 37 20
|
482 |
+
mul.f32 %f361, %f353, 0fBF000000;
|
483 |
+
mul.f32 %f362, %f354, 0fBF000000;
|
484 |
+
mul.f32 %f363, %f355, 0fBF000000;
|
485 |
+
mul.f32 %f364, %f356, 0fBF000000;
|
486 |
+
mul.f32 %f365, %f357, 0fBF000000;
|
487 |
+
mul.f32 %f366, %f358, 0fBF000000;
|
488 |
+
mul.f32 %f367, %f359, 0fBF000000;
|
489 |
+
mul.f32 %f368, %f360, 0fBF000000;
|
490 |
+
.loc 1 38 19
|
491 |
+
mul.f32 %f330, %f361, 0f3FB8AA3B;
|
492 |
+
ex2.approx.f32 %f329, %f330;
|
493 |
+
mul.f32 %f332, %f362, 0f3FB8AA3B;
|
494 |
+
ex2.approx.f32 %f331, %f332;
|
495 |
+
mul.f32 %f334, %f363, 0f3FB8AA3B;
|
496 |
+
ex2.approx.f32 %f333, %f334;
|
497 |
+
mul.f32 %f336, %f364, 0f3FB8AA3B;
|
498 |
+
ex2.approx.f32 %f335, %f336;
|
499 |
+
mul.f32 %f338, %f365, 0f3FB8AA3B;
|
500 |
+
ex2.approx.f32 %f337, %f338;
|
501 |
+
mul.f32 %f340, %f366, 0f3FB8AA3B;
|
502 |
+
ex2.approx.f32 %f339, %f340;
|
503 |
+
mul.f32 %f342, %f367, 0f3FB8AA3B;
|
504 |
+
ex2.approx.f32 %f341, %f342;
|
505 |
+
mul.f32 %f344, %f368, 0f3FB8AA3B;
|
506 |
+
ex2.approx.f32 %f343, %f344;
|
507 |
+
.loc 1 40 20
|
508 |
+
mul.f32 %f369, %f329, 0f3ECC422A;
|
509 |
+
mul.f32 %f370, %f331, 0f3ECC422A;
|
510 |
+
mul.f32 %f371, %f333, 0f3ECC422A;
|
511 |
+
mul.f32 %f372, %f335, 0f3ECC422A;
|
512 |
+
mul.f32 %f373, %f337, 0f3ECC422A;
|
513 |
+
mul.f32 %f374, %f339, 0f3ECC422A;
|
514 |
+
mul.f32 %f375, %f341, 0f3ECC422A;
|
515 |
+
mul.f32 %f376, %f343, 0f3ECC422A;
|
516 |
+
.loc 1 41 19
|
517 |
+
mul.f32 %f377, %f9, %f369;
|
518 |
+
mul.f32 %f378, %f10, %f370;
|
519 |
+
mul.f32 %f379, %f11, %f371;
|
520 |
+
mul.f32 %f380, %f12, %f372;
|
521 |
+
mul.f32 %f381, %f13, %f373;
|
522 |
+
mul.f32 %f382, %f14, %f374;
|
523 |
+
mul.f32 %f383, %f15, %f375;
|
524 |
+
mul.f32 %f384, %f16, %f376;
|
525 |
+
.loc 1 42 20
|
526 |
+
fma.rn.f32 %f385, %f345, 0f3F000000, %f377;
|
527 |
+
fma.rn.f32 %f386, %f346, 0f3F000000, %f378;
|
528 |
+
fma.rn.f32 %f387, %f347, 0f3F000000, %f379;
|
529 |
+
fma.rn.f32 %f388, %f348, 0f3F000000, %f380;
|
530 |
+
fma.rn.f32 %f389, %f349, 0f3F000000, %f381;
|
531 |
+
fma.rn.f32 %f390, %f350, 0f3F000000, %f382;
|
532 |
+
fma.rn.f32 %f391, %f351, 0f3F000000, %f383;
|
533 |
+
fma.rn.f32 %f392, %f352, 0f3F000000, %f384;
|
534 |
+
.loc 1 43 19
|
535 |
+
mul.f32 %f393, %f1, %f385;
|
536 |
+
mul.f32 %f394, %f2, %f386;
|
537 |
+
mul.f32 %f395, %f3, %f387;
|
538 |
+
mul.f32 %f396, %f4, %f388;
|
539 |
+
mul.f32 %f397, %f5, %f389;
|
540 |
+
mul.f32 %f398, %f6, %f390;
|
541 |
+
mul.f32 %f399, %f7, %f391;
|
542 |
+
mul.f32 %f400, %f8, %f392;
|
543 |
+
.loc 1 45 40
|
544 |
+
mov.b32 %r63, %f393;
|
545 |
+
cvt.rn.bf16.f32 %rs17, %r63;
|
546 |
+
mov.b32 %r64, %f394;
|
547 |
+
cvt.rn.bf16.f32 %rs18, %r64;
|
548 |
+
mov.b32 %r65, %f395;
|
549 |
+
cvt.rn.bf16.f32 %rs19, %r65;
|
550 |
+
mov.b32 %r66, %f396;
|
551 |
+
cvt.rn.bf16.f32 %rs20, %r66;
|
552 |
+
mov.b32 %r67, %f397;
|
553 |
+
cvt.rn.bf16.f32 %rs21, %r67;
|
554 |
+
mov.b32 %r68, %f398;
|
555 |
+
cvt.rn.bf16.f32 %rs22, %r68;
|
556 |
+
mov.b32 %r69, %f399;
|
557 |
+
cvt.rn.bf16.f32 %rs23, %r69;
|
558 |
+
mov.b32 %r70, %f400;
|
559 |
+
cvt.rn.bf16.f32 %rs24, %r70;
|
560 |
+
mov.b32 %r75, {%rs17, %rs18};
|
561 |
+
mov.b32 %r76, {%rs19, %rs20};
|
562 |
+
mov.b32 %r77, {%rs21, %rs22};
|
563 |
+
mov.b32 %r78, {%rs23, %rs24};
|
564 |
+
@%p1 st.global.v4.b32 [ %rd7 + 0 ], { %r75, %r76, %r77, %r78 };
|
565 |
+
.loc 1 45 4
|
566 |
+
ret;
|
567 |
+
$L__tmp1:
|
568 |
+
$L__func_end0:
|
569 |
+
|
570 |
+
}
|
571 |
+
// .globl __nv_erff
|
572 |
+
.visible .func (.param .b32 func_retval0) __nv_erff(
|
573 |
+
.param .b32 __nv_erff_param_0
|
574 |
+
)
|
575 |
+
{
|
576 |
+
.reg .pred %p<4>;
|
577 |
+
.reg .b32 %r<5>;
|
578 |
+
.reg .f32 %f<49>;
|
579 |
+
$L__func_begin1:
|
580 |
+
|
581 |
+
ld.param.f32 %f14, [__nv_erff_param_0];
|
582 |
+
abs.ftz.f32 %f1, %f14;
|
583 |
+
setp.ge.f32 %p1, %f1, 0f3F8060FE;
|
584 |
+
mov.f32 %f46, 0f3789CA3C;
|
585 |
+
mov.f32 %f45, 0fB9F560B9;
|
586 |
+
mov.f32 %f44, 0f3BAC840B;
|
587 |
+
mov.f32 %f43, 0fBD0C8162;
|
588 |
+
mov.f32 %f42, 0f3E1CF906;
|
589 |
+
mov.f32 %f41, 0f3F6A937E;
|
590 |
+
mov.f32 %f40, 0f3F20D842;
|
591 |
+
mov.f32 %f47, %f1;
|
592 |
+
@%p1 bra $L__BB1_2;
|
593 |
+
mul.f32 %f47, %f14, %f14;
|
594 |
+
mov.f32 %f46, 0f38B1E96A;
|
595 |
+
mov.f32 %f45, 0fBA574D20;
|
596 |
+
mov.f32 %f44, 0f3BAAD5EA;
|
597 |
+
mov.f32 %f43, 0fBCDC1BE7;
|
598 |
+
mov.f32 %f42, 0f3DE718AF;
|
599 |
+
mov.f32 %f41, 0fBEC093AC;
|
600 |
+
mov.f32 %f40, 0f3E0375D3;
|
601 |
+
$L__BB1_2:
|
602 |
+
setp.ltu.f32 %p2, %f1, 0f3F8060FE;
|
603 |
+
fma.rn.ftz.f32 %f29, %f46, %f47, %f45;
|
604 |
+
fma.rn.ftz.f32 %f30, %f29, %f47, %f44;
|
605 |
+
fma.rn.ftz.f32 %f31, %f30, %f47, %f43;
|
606 |
+
fma.rn.ftz.f32 %f32, %f31, %f47, %f42;
|
607 |
+
fma.rn.ftz.f32 %f33, %f32, %f47, %f41;
|
608 |
+
fma.rn.ftz.f32 %f34, %f33, %f47, %f40;
|
609 |
+
neg.f32 %f35, %f47;
|
610 |
+
selp.f32 %f36, %f35, %f14, %p1;
|
611 |
+
fma.rn.ftz.f32 %f48, %f34, %f36, %f36;
|
612 |
+
@%p2 bra $L__BB1_4;
|
613 |
+
ex2.approx.ftz.f32 %f37, %f48;
|
614 |
+
mov.f32 %f38, 0f3F800000;
|
615 |
+
sub.f32 %f39, %f38, %f37;
|
616 |
+
mov.b32 %r1, %f39;
|
617 |
+
mov.b32 %r2, %f14;
|
618 |
+
and.b32 %r3, %r2, -2147483648;
|
619 |
+
or.b32 %r4, %r3, %r1;
|
620 |
+
mov.b32 %f48, %r4;
|
621 |
+
$L__BB1_4:
|
622 |
+
st.param.f32 [func_retval0+0], %f48;
|
623 |
+
ret;
|
624 |
+
$L__func_end1:
|
625 |
+
|
626 |
+
}
|
627 |
+
.file 1 "/tmp/torchinductor_root/5j/c5jxaguxho3nhrlt5vcinnz5fevodumlpwn4wyb2vx3xrveicerl.py"
|
628 |
+
.section .debug_abbrev
|
629 |
+
{
|
630 |
+
.b8 1
|
631 |
+
.b8 17
|
632 |
+
.b8 1
|
633 |
+
.b8 37
|
634 |
+
.b8 8
|
635 |
+
.b8 19
|
636 |
+
.b8 5
|
637 |
+
.b8 3
|
638 |
+
.b8 8
|
639 |
+
.b8 16
|
640 |
+
.b8 6
|
641 |
+
.b8 27
|
642 |
+
.b8 8
|
643 |
+
.b8 180
|
644 |
+
.b8 66
|
645 |
+
.b8 12
|
646 |
+
.b8 17
|
647 |
+
.b8 1
|
648 |
+
.b8 18
|
649 |
+
.b8 1
|
650 |
+
.b8 0
|
651 |
+
.b8 0
|
652 |
+
.b8 2
|
653 |
+
.b8 46
|
654 |
+
.b8 0
|
655 |
+
.b8 17
|
656 |
+
.b8 1
|
657 |
+
.b8 18
|
658 |
+
.b8 1
|
659 |
+
.b8 64
|
660 |
+
.b8 10
|
661 |
+
.b8 135
|
662 |
+
.b8 64
|
663 |
+
.b8 8
|
664 |
+
.b8 3
|
665 |
+
.b8 8
|
666 |
+
.b8 58
|
667 |
+
.b8 11
|
668 |
+
.b8 59
|
669 |
+
.b8 11
|
670 |
+
.b8 63
|
671 |
+
.b8 12
|
672 |
+
.b8 0
|
673 |
+
.b8 0
|
674 |
+
.b8 0
|
675 |
+
}
|
676 |
+
.section .debug_info
|
677 |
+
{
|
678 |
+
.b32 176
|
679 |
+
.b8 2
|
680 |
+
.b8 0
|
681 |
+
.b32 .debug_abbrev
|
682 |
+
.b8 8
|
683 |
+
.b8 1
|
684 |
+
.b8 116
|
685 |
+
.b8 114
|
686 |
+
.b8 105
|
687 |
+
.b8 116
|
688 |
+
.b8 111
|
689 |
+
.b8 110
|
690 |
+
.b8 0
|
691 |
+
.b8 2
|
692 |
+
.b8 0
|
693 |
+
.b8 99
|
694 |
+
.b8 53
|
695 |
+
.b8 106
|
696 |
+
.b8 120
|
697 |
+
.b8 97
|
698 |
+
.b8 103
|
699 |
+
.b8 117
|
700 |
+
.b8 120
|
701 |
+
.b8 104
|
702 |
+
.b8 111
|
703 |
+
.b8 51
|
704 |
+
.b8 110
|
705 |
+
.b8 104
|
706 |
+
.b8 114
|
707 |
+
.b8 108
|
708 |
+
.b8 116
|
709 |
+
.b8 53
|
710 |
+
.b8 118
|
711 |
+
.b8 99
|
712 |
+
.b8 105
|
713 |
+
.b8 110
|
714 |
+
.b8 110
|
715 |
+
.b8 122
|
716 |
+
.b8 53
|
717 |
+
.b8 102
|
718 |
+
.b8 101
|
719 |
+
.b8 118
|
720 |
+
.b8 111
|
721 |
+
.b8 100
|
722 |
+
.b8 117
|
723 |
+
.b8 109
|
724 |
+
.b8 108
|
725 |
+
.b8 112
|
726 |
+
.b8 119
|
727 |
+
.b8 110
|
728 |
+
.b8 52
|
729 |
+
.b8 119
|
730 |
+
.b8 121
|
731 |
+
.b8 98
|
732 |
+
.b8 50
|
733 |
+
.b8 118
|
734 |
+
.b8 120
|
735 |
+
.b8 51
|
736 |
+
.b8 120
|
737 |
+
.b8 114
|
738 |
+
.b8 118
|
739 |
+
.b8 101
|
740 |
+
.b8 105
|
741 |
+
.b8 99
|
742 |
+
.b8 101
|
743 |
+
.b8 114
|
744 |
+
.b8 108
|
745 |
+
.b8 46
|
746 |
+
.b8 112
|
747 |
+
.b8 121
|
748 |
+
.b8 0
|
749 |
+
.b32 .debug_line
|
750 |
+
.b8 47
|
751 |
+
.b8 116
|
752 |
+
.b8 109
|
753 |
+
.b8 112
|
754 |
+
.b8 47
|
755 |
+
.b8 116
|
756 |
+
.b8 111
|
757 |
+
.b8 114
|
758 |
+
.b8 99
|
759 |
+
.b8 104
|
760 |
+
.b8 105
|
761 |
+
.b8 110
|
762 |
+
.b8 100
|
763 |
+
.b8 117
|
764 |
+
.b8 99
|
765 |
+
.b8 116
|
766 |
+
.b8 111
|
767 |
+
.b8 114
|
768 |
+
.b8 95
|
769 |
+
.b8 114
|
770 |
+
.b8 111
|
771 |
+
.b8 111
|
772 |
+
.b8 116
|
773 |
+
.b8 47
|
774 |
+
.b8 53
|
775 |
+
.b8 106
|
776 |
+
.b8 0
|
777 |
+
.b8 1
|
778 |
+
.b64 $L__func_begin0
|
779 |
+
.b64 $L__func_end0
|
780 |
+
.b8 2
|
781 |
+
.b64 $L__func_begin0
|
782 |
+
.b64 $L__func_end0
|
783 |
+
.b8 1
|
784 |
+
.b8 156
|
785 |
+
.b8 116
|
786 |
+
.b8 114
|
787 |
+
.b8 105
|
788 |
+
.b8 116
|
789 |
+
.b8 111
|
790 |
+
.b8 110
|
791 |
+
.b8 95
|
792 |
+
.b8 95
|
793 |
+
.b8 48
|
794 |
+
.b8 100
|
795 |
+
.b8 49
|
796 |
+
.b8 100
|
797 |
+
.b8 50
|
798 |
+
.b8 100
|
799 |
+
.b8 101
|
800 |
+
.b8 0
|
801 |
+
.b8 116
|
802 |
+
.b8 114
|
803 |
+
.b8 105
|
804 |
+
.b8 116
|
805 |
+
.b8 111
|
806 |
+
.b8 110
|
807 |
+
.b8 95
|
808 |
+
.b8 95
|
809 |
+
.b8 48
|
810 |
+
.b8 100
|
811 |
+
.b8 49
|
812 |
+
.b8 100
|
813 |
+
.b8 50
|
814 |
+
.b8 100
|
815 |
+
.b8 101
|
816 |
+
.b8 0
|
817 |
+
.b8 1
|
818 |
+
.b8 18
|
819 |
+
.b8 1
|
820 |
+
.b8 0
|
821 |
+
}
|
822 |
+
.section .debug_pubnames
|
823 |
+
{
|
824 |
+
.b32 $L__pubNames_end0-$L__pubNames_start0
|
825 |
+
$L__pubNames_start0:
|
826 |
+
.b8 2
|
827 |
+
.b8 0
|
828 |
+
.b32 .debug_info
|
829 |
+
.b32 180
|
830 |
+
.b32 125
|
831 |
+
.b8 116
|
832 |
+
.b8 114
|
833 |
+
.b8 105
|
834 |
+
.b8 116
|
835 |
+
.b8 111
|
836 |
+
.b8 110
|
837 |
+
.b8 95
|
838 |
+
.b8 95
|
839 |
+
.b8 48
|
840 |
+
.b8 100
|
841 |
+
.b8 49
|
842 |
+
.b8 100
|
843 |
+
.b8 50
|
844 |
+
.b8 100
|
845 |
+
.b8 101
|
846 |
+
.b8 0
|
847 |
+
.b32 0
|
848 |
+
$L__pubNames_end0:
|
849 |
+
}
|
850 |
+
.section .debug_pubtypes
|
851 |
+
{
|
852 |
+
.b32 $L__pubTypes_end0-$L__pubTypes_start0
|
853 |
+
$L__pubTypes_start0:
|
854 |
+
.b8 2
|
855 |
+
.b8 0
|
856 |
+
.b32 .debug_info
|
857 |
+
.b32 180
|
858 |
+
.b32 0
|
859 |
+
$L__pubTypes_end0:
|
860 |
+
}
|
861 |
+
.section .debug_loc { }
|
.triton/dump/94361ae8a918b76700c87078e3d5a751/triton_.cubin
ADDED
Binary file (7.33 kB). View file
|
|
.triton/dump/a4652f539404a11e3c068d96115a7427/triton_.ttir
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
module {
|
2 |
+
tt.func public @triton__0d1d2de(%arg0: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
|
3 |
+
%c256_i32 = arith.constant 256 : i32
|
4 |
+
%0 = tt.get_program_id x : i32
|
5 |
+
%1 = arith.muli %0, %c256_i32 : i32
|
6 |
+
%2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32>
|
7 |
+
%3 = tt.splat %1 : (i32) -> tensor<256xi32>
|
8 |
+
%4 = arith.addi %3, %2 : tensor<256xi32>
|
9 |
+
%5 = tt.splat %arg0 : (!tt.ptr<bf16, 1>) -> tensor<256x!tt.ptr<bf16, 1>>
|
10 |
+
%6 = tt.addptr %5, %4 : tensor<256x!tt.ptr<bf16, 1>>, tensor<256xi32>
|
11 |
+
%7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256xbf16>
|
12 |
+
%8 = arith.extf %7 : tensor<256xbf16> to tensor<256xf32>
|
13 |
+
%9 = tt.splat %arg1 : (!tt.ptr<f32, 1>) -> tensor<256x!tt.ptr<f32, 1>>
|
14 |
+
%10 = tt.addptr %9, %4 : tensor<256x!tt.ptr<f32, 1>>, tensor<256xi32>
|
15 |
+
tt.store %10, %8 {cache = 1 : i32, evict = 1 : i32} : tensor<256xf32>
|
16 |
+
tt.return
|
17 |
+
}
|
18 |
+
}
|