0-hero committed on
Commit
f67f72f
·
verified ·
1 Parent(s): 1656e39

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .cache/pip/http-v2/f/b/c/e/7/fbce770ef113d7303a71734c8a71c6ac1b2cbc4e832a8e70832c37ca.body +0 -0
  2. .cache/wandb/logs/core-debug-20240926_124123.log +14 -0
  3. .local/share/jupyter/nbextensions/addbefore/icon.png +0 -0
  4. .local/share/jupyter/nbextensions/addbefore/readme.md +12 -0
  5. .local/share/jupyter/nbextensions/autosavetime/main.js +81 -0
  6. .local/share/jupyter/nbextensions/autoscroll/README.md +22 -0
  7. .local/share/jupyter/nbextensions/autoscroll/icon.png +0 -0
  8. .local/share/jupyter/nbextensions/cell_filter/README.md +4 -0
  9. .local/share/jupyter/nbextensions/cell_filter/cell_filter.yml +7 -0
  10. .local/share/jupyter/nbextensions/code_prettify/2to3.js +49 -0
  11. .local/share/jupyter/nbextensions/contrib_nbextensions_help_item/main.js +35 -0
  12. .local/share/jupyter/nbextensions/datestamper/main.js +42 -0
  13. .local/share/jupyter/nbextensions/equation-numbering/icon.png +0 -0
  14. .local/share/jupyter/nbextensions/execute_time/ExecuteTime.yaml +88 -0
  15. .local/share/jupyter/nbextensions/execution_dependencies/README.md +43 -0
  16. .local/share/jupyter/nbextensions/execution_dependencies/execution_dependencies.js +139 -0
  17. .local/share/jupyter/nbextensions/execution_dependencies/execution_dependencies.yml +9 -0
  18. .local/share/jupyter/nbextensions/exercise/exercise.yaml +28 -0
  19. .local/share/jupyter/nbextensions/exercise/icon.png +0 -0
  20. .local/share/jupyter/nbextensions/exercise/main.css +13 -0
  21. .local/share/jupyter/nbextensions/exercise2/icon.png +0 -0
  22. .local/share/jupyter/nbextensions/exercise2/image.gif +0 -0
  23. .local/share/jupyter/nbextensions/exercise2/main.css +60 -0
  24. .local/share/jupyter/nbextensions/exercise2/readme.md +52 -0
  25. .local/share/jupyter/nbextensions/export_embedded/export_embedded.yaml +7 -0
  26. .local/share/jupyter/nbextensions/export_embedded/icon.png +0 -0
  27. .local/share/jupyter/nbextensions/export_embedded/main.js +56 -0
  28. .local/share/jupyter/nbextensions/export_embedded/readme.md +7 -0
  29. .local/share/jupyter/nbextensions/freeze/readme.md +24 -0
  30. .local/share/jupyter/nbextensions/help_panel/help_panel.css +56 -0
  31. .triton/dump/0359b089f02b5ddabaef8985c60f3daf/triton_.ttgir +21 -0
  32. .triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.llir +300 -0
  33. .triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.ptx +734 -0
  34. .triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.ttgir +92 -0
  35. .triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.ttir +99 -0
  36. .triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.cubin +0 -0
  37. .triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.llir +43 -0
  38. .triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.ttgir +18 -0
  39. .triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.ttir +17 -0
  40. .triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.cubin +0 -0
  41. .triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.llir +793 -0
  42. .triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ptx +1517 -0
  43. .triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ttgir +92 -0
  44. .triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.cubin +0 -0
  45. .triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.llir +1108 -0
  46. .triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.ptx +1927 -0
  47. .triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.ttgir +92 -0
  48. .triton/dump/4993935f9a0e5939755cfb42600362cf/triton_.ptx +295 -0
  49. .triton/dump/4993935f9a0e5939755cfb42600362cf/triton_.ttir +18 -0
  50. .triton/dump/4ce9eb7fe63f19e54893f0c74df91471/triton_.cubin +0 -0
.cache/pip/http-v2/f/b/c/e/7/fbce770ef113d7303a71734c8a71c6ac1b2cbc4e832a8e70832c37ca.body ADDED
Binary file (78.4 kB). View file
 
.cache/wandb/logs/core-debug-20240926_124123.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2024-09-26T12:41:23.048835354Z","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmpvaxwxqk7/port-6230.txt","pid":6230,"debug":false,"disable-analytics":false}
2
+ {"time":"2024-09-26T12:41:23.048873271Z","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false}
3
+ {"time":"2024-09-26T12:41:23.049626557Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":6230}
4
+ {"time":"2024-09-26T12:41:23.049598776Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":41101,"Zone":""}}
5
+ {"time":"2024-09-26T12:41:23.244733652Z","level":"INFO","msg":"created new connection","id":"127.0.0.1:48216"}
6
+ {"time":"2024-09-26T12:41:23.685497123Z","level":"INFO","msg":"connection init received","streamId":"zc6s8e8w","id":"127.0.0.1:48216"}
7
+ {"time":"2024-09-26T12:41:23.68571591Z","level":"ERROR","msg":"error creating symlink","error":"symlink /root/.cache/wandb/logs/core-debug-20240926_124123.log /root/wandb/run-20240926_124123-zc6s8e8w/logs/debug-core.log: file exists"}
8
+ {"time":"2024-09-26T12:41:23.687506497Z","level":"INFO","msg":"connection init completed","streamId":"zc6s8e8w","id":"127.0.0.1:48216"}
9
+ {"time":"2024-09-26T19:27:20.668791042Z","level":"INFO","msg":"handle finish received","streamId":"zc6s8e8w","id":"127.0.0.1:48216"}
10
+ {"time":"2024-09-26T19:27:22.057519583Z","level":"INFO","msg":"connection: teardown","id":"127.0.0.1:48216"}
11
+ {"time":"2024-09-26T19:27:22.057561736Z","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2024-09-26T19:27:22.057679612Z","level":"INFO","msg":"closed connection","id":"127.0.0.1:48216"}
13
+ {"time":"2024-09-26T19:27:22.057729867Z","level":"INFO","msg":"connection closed","id":"127.0.0.1:48216"}
14
+ {"time":"2024-09-26T19:27:22.057743237Z","level":"INFO","msg":"server is closed"}
.local/share/jupyter/nbextensions/addbefore/icon.png ADDED
.local/share/jupyter/nbextensions/addbefore/readme.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Addbefore
2
+ =========
3
+
4
+ This extension adds a button to add an empty cell before the currently active cell.
5
+ As was present in IPython 1.0.
6
+ Its icon is a circled up arrow.
7
+ The plus-signed "Add Cell After" button is moved to be next to Add Cell Before,
8
+ and given a matching circled down arrow icon.
9
+
10
+ The functionality of the buttons is the same as in the Insert Menu,
11
+ for Insert Cell Above, and Insert Cell Below.
12
+ An empty cell is added, and it takes the cursor focus.
.local/share/jupyter/nbextensions/autosavetime/main.js ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ define([
2
+ 'jquery',
3
+ 'base/js/namespace',
4
+ 'base/js/events'
5
+ ], function(
6
+ $,
7
+ IPython,
8
+ events
9
+ ) {
10
+ "use strict";
11
+
12
+ // define default values for config parameters
13
+ var params = {
14
+ autosavetime_set_starting_interval : false,
15
+ autosavetime_starting_interval : 2,
16
+ autosavetime_show_selector : true
17
+ };
18
+
19
+ // update params with any specified in the server's config file
20
+ var update_params = function() {
21
+ var config = IPython.notebook.config;
22
+ for (var key in params) {
23
+ if (config.data.hasOwnProperty(key))
24
+ params[key] = config.data[key];
25
+ }
26
+ };
27
+
28
+ var initialize = function () {
29
+ update_params();
30
+
31
+ var si = params.autosavetime_starting_interval;
32
+ var set_si = params.autosavetime_set_starting_interval;
33
+
34
+ if (params.autosavetime_show_selector) {
35
+ var select = $('<select class="ui-widget-content"/>');
36
+ select.change(function() {
37
+ var interval = parseInt($(this).val(), 10) * 60 * 1000;
38
+ IPython.notebook.set_autosave_interval(interval);
39
+ });
40
+
41
+ var thresholds = [0,2,5,10,15,20,30,60];
42
+
43
+ if (set_si && thresholds.indexOf(si) < 0) thresholds.push(si);
44
+
45
+ thresholds.sort(function(a, b) { return a-b; });
46
+
47
+ for (var i in thresholds) {
48
+ var thr = thresholds[i];
49
+ select.append($('<option/>').attr('value', thr).text(thr));
50
+ }
51
+
52
+ select.find('option[value="2"]').text('2 (default)');
53
+ select.find('option[value="0"]').text('off');
54
+
55
+ if (set_si) select.val(si);
56
+
57
+ IPython.toolbar.element.append(
58
+ $('<label class="navbar-text"/>').text('Autosave interval (min):')
59
+ ).append(select);
60
+ }
61
+
62
+ events.on("autosave_enabled.Notebook", function(event, value) {
63
+ if (set_si) {
64
+ IPython.notebook.set_autosave_interval(si * 60 * 1000);
65
+ }
66
+ else {
67
+ if (params.autosavetime_show_selector) {
68
+ select.val(parseInt(value, 10) / 60 / 1000);
69
+ }
70
+ }
71
+ });
72
+ };
73
+
74
+ var load_ipython_extension = function() {
75
+ return IPython.notebook.config.loaded.then(initialize);
76
+ };
77
+
78
+ return {
79
+ load_ipython_extension : load_ipython_extension
80
+ };
81
+ });
.local/share/jupyter/nbextensions/autoscroll/README.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ autoscroll
2
+ ==========
3
+
4
+
5
+ Description
6
+ -----------
7
+
8
+ Optionally set the output autoscroll threshold, and/or add a selector to the
9
+ toolbar to set it, and/or add a toolbar button to enable/disable it.
10
+
11
+
12
+ Parameters
13
+ ----------
14
+
15
+ * `autoscroll_set_on_load` -
16
+ Set an autoscroll threshold on notebook load. If false, the default is unchanged.
17
+ * `autoscroll_starting_threshold` -
18
+ Autoscroll threshold which would be set on notebook load. `-1` disables autoscrolling.
19
+ * `autoscroll_show_selector` -
20
+ Add a selector to the toolbar to change the autoscroll threshold
21
+ * `autoscroll_show_button` -
22
+ Add a button to the toolbar to disable/enable autoscrolling
.local/share/jupyter/nbextensions/autoscroll/icon.png ADDED
.local/share/jupyter/nbextensions/cell_filter/README.md ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Cell filter
2
+ ===========
3
+
4
+ An extension that allows you to filter cells by tags. Keywords entered into the search bar, separated by spaces, are combined with logical AND.
.local/share/jupyter/nbextensions/cell_filter/cell_filter.yml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Type: Jupyter Notebook Extension
2
+ Compatibility: 4.x, 5.x
3
+ Name: Cell Filter
4
+ Main: cell_filter.js
5
+ Link: README.md
6
+ Description: |
7
+ An extension that allows you to filter cells by tags. Keywords entered into the search bar, separated by spaces, are combined with logical AND.
.local/share/jupyter/nbextensions/code_prettify/2to3.js ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) Jupyter-Contrib Team.
2
+ // Distributed under the terms of the Modified BSD License.
3
+ // Authors: @EWouters, @jfbercher and @jcb91
4
+ // Based on: https://github.com/jfbercher/code_prettify and
5
+ // https://gist.github.com/takluyver/c8839593c615bb2f6e80
6
+
7
+ define(['./kernel_exec_on_cell'], function(kernel_exec_on_cell) {
8
+ 'use strict';
9
+
10
+ var mod_name = '2to3';
11
+
12
+ // gives default settings
13
+ var cfg = {
14
+ add_toolbar_button: true,
15
+ hotkeys: {
16
+ process_selected: 'Ctrl-M',
17
+ process_all: 'Ctrl-Shift-M',
18
+ },
19
+ register_hotkey: true,
20
+ show_alerts_for_errors: true,
21
+ button_icon: 'fa-space-shuttle',
22
+ button_label: 'Convert Python 2 to 3',
23
+ kbd_shortcut_text: 'Convert Python 2 to 3 in' // ' current cell(s)'
24
+ };
25
+
26
+ cfg.kernel_config_map = { // map of parameters for supported kernels
27
+ "python": {
28
+ "library": [
29
+ "import lib2to3.refactor, json",
30
+ "_2to3_refactoring_tool = lib2to3.refactor.RefactoringTool(",
31
+ " set(lib2to3.refactor.get_fixers_from_package('lib2to3.fixes')))",
32
+ "def _2to3_refactor_cell(src):",
33
+ " try:",
34
+ " tree = _2to3_refactoring_tool.refactor_string(src+'\\n', '<dummy_name>')",
35
+ " except (lib2to3.pgen2.parse.ParseError, lib2to3.pgen2.tokenize.TokenError):",
36
+ " return src ",
37
+ " else:",
38
+ " return str(tree)[:-1]",
39
+ ].join('\n'),
40
+ "prefix": "print(json.dumps(_2to3_refactor_cell(u",
41
+ "postfix": ")))"
42
+ }
43
+ };
44
+
45
+ var converter = new kernel_exec_on_cell.define_plugin(mod_name, cfg);
46
+ converter.load_ipython_extension = converter.initialize_plugin;
47
+ return converter;
48
+
49
+ });
.local/share/jupyter/nbextensions/contrib_nbextensions_help_item/main.js ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Small extension to add a help menu item pointing
2
+ // to jupyter_contrib_nbextensions at readthedocs.
3
+
4
+ define(['jquery', 'base/js/namespace'], function($, Jupyter) {
5
+ "use strict";
6
+
7
+ function add_help_menu_item() {
8
+
9
+ if ($('#jupyter_contrib_nbextensions_help').length > 0) {
10
+ return;
11
+ }
12
+ var menu_item = $('<li/>')
13
+ .append(
14
+ $('<a/>')
15
+ .html('Jupyter-contrib <br> nbextensions')
16
+ .attr('title', 'Jupyter_contrib_nbextensions documentation')
17
+ .attr('id', "jupyter_contrib_nbextensions_help")
18
+ .attr('href', 'http://jupyter-contrib-nbextensions.readthedocs.io/en/latest/')
19
+ .attr('target', "_blank")
20
+ .append(
21
+ $('<i/>')
22
+ .addClass('fa fa-external-link menu-icon pull-right')
23
+ ))
24
+ menu_item.insertBefore($($("#help_menu > .divider")[1]))
25
+ }
26
+
27
+
28
+ var load_ipython_extension = function() {
29
+ add_help_menu_item();
30
+ };
31
+
32
+ return {
33
+ load_ipython_extension: load_ipython_extension
34
+ };
35
+ });
.local/share/jupyter/nbextensions/datestamper/main.js ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ define([
2
+ "base/js/namespace",
3
+ "jquery"
4
+ ], function (IPython, $) {
5
+ "use strict";
6
+
7
+ var padZero = function(val){
8
+ return ("0" + val).slice(-2);
9
+ };
10
+
11
+ var datestring = function(){
12
+ var d = new Date();
13
+ return (
14
+ d.getFullYear() + "-" + padZero(d.getMonth() + 1) + "-" + padZero(d.getDate()) +
15
+ " " + padZero(d.getHours()) + ":" + padZero(d.getMinutes()) + ":" + padZero(d.getSeconds())
16
+ );
17
+ };
18
+
19
+ var datestamp = function(){
20
+ var cell = IPython.notebook.get_selected_cell();
21
+ var do_render = !((cell.cell_type === "raw") || (cell.cell_type === "code"));
22
+ if(do_render) cell.unrender();
23
+ cell.code_mirror.focus();
24
+ cell.code_mirror.doc.replaceSelection(datestring() + " ", "end");
25
+ if(do_render) cell.edit_mode();
26
+ };
27
+
28
+ var load_ipython_extension = function () {
29
+ IPython.toolbar.add_buttons_group([
30
+ IPython.keyboard_manager.actions.register ({
31
+ help : 'insert datestamp',
32
+ icon : 'fa-calendar',
33
+ handler: datestamp
34
+ }, 'insert-datestamp', 'datestamp')
35
+ ]);
36
+ };
37
+
38
+ var extension = {
39
+ load_ipython_extension : load_ipython_extension,
40
+ };
41
+ return extension;
42
+ });
.local/share/jupyter/nbextensions/equation-numbering/icon.png ADDED
.local/share/jupyter/nbextensions/execute_time/ExecuteTime.yaml ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Type: IPython Notebook Extension
2
+ Name: ExecuteTime
3
+ Description: Display when each cell has been executed and how long it took
4
+ Link: readme.md
5
+ Icon: icon.png
6
+ Main: ExecuteTime.js
7
+ Compatibility: 4.x, 5.x
8
+ Parameters:
9
+
10
+ - name: ExecuteTime.clear_timings_on_clear_output
11
+ description: |
12
+ When cells' outputs are cleared, also clear their timing data, e.g. when
13
+ using the "Kernel > Restart & Clear Output" menu item
14
+ input_type: checkbox
15
+ default: false
16
+
17
+ - name: ExecuteTime.clear_timings_on_kernel_restart
18
+ description: |
19
+ Clear all cells' execution timing data on any kernel restart event
20
+ input_type: checkbox
21
+ default: false
22
+
23
+ - name: ExecuteTime.display_absolute_timings
24
+ description: |
25
+ Display absolute timings for the start time of execution.
26
+ Setting false will display a relative timestamp like 'a few seconds ago'
27
+ default: true
28
+ input_type: checkbox
29
+
30
+ - name: ExecuteTime.display_absolute_format
31
+ description: |
32
+ The format to use when displaying absolute timings (see above)
33
+ default: 'YYYY-MM-DD HH:mm:ss'
34
+ input_type: text
35
+
36
+ - name: ExecuteTime.relative_timing_update_period
37
+ description: |
38
+ Seconds to wait between updating the relative timestamps, if using them
39
+ (see above)
40
+ default: 10
41
+ input_type: number
42
+ step: 1
43
+ min: 1
44
+ max: 600
45
+
46
+ - name: ExecuteTime.display_in_utc
47
+ description: |
48
+ Display times in UTC, rather than in the local timezone set by the browser
49
+ default: false
50
+ input_type: checkbox
51
+
52
+ - name: ExecuteTime.default_kernel_to_utc
53
+ description: |
54
+ For kernel timestamps which do not specify a timezone, assume UTC
55
+ default: true
56
+ input_type: checkbox
57
+
58
+ - name: ExecuteTime.display_right_aligned
59
+ description: |
60
+ Right-align the text in the timing area under each cell
61
+ default: false
62
+ input_type: checkbox
63
+
64
+ - name: ExecuteTime.highlight.use
65
+ description: |
66
+ Highlight the displayed execution time on completion of execution
67
+ default: true
68
+ input_type: checkbox
69
+
70
+ - name: ExecuteTime.highlight.color
71
+ description: |
72
+ Color to use for highlighting the displayed execution time
73
+ default: '#00BB00'
74
+ input_type: color
75
+
76
+ - name: ExecuteTime.template.executed
77
+ description: |
78
+ Template for the timing message for executed cells. See readme for
79
+ replacement tokens.
80
+ default: 'executed in ${duration}, finished ${end_time}'
81
+ input_type: text
82
+
83
+ - name: ExecuteTime.template.queued
84
+ description: |
85
+ Template for the timing message for queued cells. See readme for
86
+ replacement tokens.
87
+ default: 'execution queued ${start_time}'
88
+ input_type: text
.local/share/jupyter/nbextensions/execution_dependencies/README.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ execution_dependencies
2
+ ======================
3
+
4
+ Writing extensive notebooks can become very complicated since many cells act as stepping stones to produce intermediate results for later cells. Thus, it becomes tedious to
5
+ keep track of the cells that have to be run in order to run a certain cell. This extension simplifies handling the execution dependencies by introducing tag annotations to
6
+ identify each cell and indicate a dependency on others. This improves on the current state which requires remembering all dependencies by heart or annotating the cells in the comments.
7
+
8
+ If a cell with dependencies is run, the extension checks recursively for all dependencies of the cell, then executes them before executing the cell after all the dependencies have finished.
9
+ Dependencies are always executed on every run, not just once per kernel session.
10
+
11
+ The two annotations are added to the tags of a cell and are as follows:
12
+
13
+ * add a hashmark (#) and an identification tag to the tags to identify a cell (e.g. #initializer-cell). The #identifiers must be unique among all cells.
14
+ * add an arrow (=>) and an identification tag to the tags to add a dependency on a certain cell (e.g. =>initializer-cell).
15
+
16
+ Based on these dependencies, the kernel will now execute the dependencies before the cell that depends on them. If the cell's dependencies have further dependencies, these will in turn
17
+ be executed before them. In conclusion, the kernel looks through the tree of dependencies of the cell executed by the user and executes its dependencies in their appropriate order,
18
+ then executes the cell.
19
+
20
+ A more extensive example is described below:
21
+
22
+ A cell A has the identifier #A.
23
+
24
+ | Cell A [tags: #A] |
25
+ | ------------- |
26
+ | Content Cell |
27
+ | Content Cell |
28
+
29
+
30
+ A cell B has the identifier #B and depends on A (=>A).
31
+
32
+
33
+ | Cell B [tags: #B, =>A] |
34
+ | ------------- |
35
+ | Content Cell |
36
+ | Content Cell |
37
+
38
+ If the user runs A, only A is executed, since it has no dependencies. On the other hand, if the user runs B, the kernel finds the dependency on A, and thus first runs A and then runs B.
39
+
40
+ When running a cell C that depends on both B and A, the kernel first runs A, then runs B, and then runs C, which avoids running cell A twice.
41
+
42
+
43
+ If you are missing anything, open up an issue at the repository prepending [execute_dependencies] to the title.
.local/share/jupyter/nbextensions/execution_dependencies/execution_dependencies.js ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * execution_dependencies.js
3
+ * Introduce tag annotations to identify each cell and indicate a dependency on others.
4
+ * Upon running a cell, its dependencies are run first to prepare all dependencies.
5
+ * Then the cell triggered by the user is run as soon as all its dependencies are met.
6
+ *
7
+ *
8
+ * @version 0.1.0
9
+ * @author Benjamin Ellenberger, https://github.com/benelot
10
+ * @updated 2018-01-31
11
+ *
12
+ *
13
+ */
14
+ define([
15
+ 'jquery',
16
+ 'base/js/dialog',
17
+ 'base/js/namespace',
18
+ 'notebook/js/codecell'
19
+ ], function (
20
+ $,
21
+ dialog,
22
+ Jupyter,
23
+ codecell
24
+ ) {
25
+ "use strict";
26
+
27
+ var CodeCell = codecell.CodeCell;
28
+
29
+ return {
30
+ load_ipython_extension: function () {
31
+ console.log('[execution_dependencies] patching CodeCell.execute');
32
+ var orig_execute = codecell.CodeCell.prototype.execute; // keep original cell execute function
33
+ CodeCell.prototype.execute = function (stop_on_error) {
34
+ var root_tags = this.metadata.tags || []; // get tags of the cell executed by the user (root cell)
35
+ if(root_tags.some(tag => /=>.*/.test(tag))) { // if the root cell contains any dependencies, resolve dependency tree
36
+ var root_cell = this;
37
+ var root_cell_id = root_cell.cell_id;
38
+ var cells_with_id = Jupyter.notebook.get_cells().filter(function (cell, idx, cells) { // ...get all cells which have at least one id (these are the only ones we could have in deps)
39
+ var tags = cell.metadata.tags || [];
40
+ return (cell === root_cell || tags.some(tag => /#.*/.test(tag)));
41
+ });
42
+
43
+ console.log('[execution_dependencies] collecting ids and dependencies...');
44
+ var cell_map = {}
45
+ var dep_graph = {}
46
+ cells_with_id.forEach(function (cell) { // ...get all identified cells (the ones that have at least one #tag)
47
+ var tags = cell.metadata.tags || [];
48
+ var cell_ids = tags.filter(tag => /#.*/.test(tag)).map(tag => tag.substring(1)); // ...get all identifiers of the current cell and drop the #
49
+ if(cell === root_cell){
50
+ if(cell_ids.length < 1) {
51
+ cell_ids.push(root_cell.cell_id); // ...use internal root cell id for internal usage
52
+ }
53
+ else {
54
+ root_cell_id = cell_ids[0]; // get any of the root cell ids
55
+ }
56
+ }
57
+
58
+ var dep_ids = tags.filter(tag => /=>.*/.test(tag)).map(tag => tag.substring(2)); // ...get all dependencies and drop the =>
59
+
60
+ cell_ids.forEach(function (id) {
61
+ //console.log('ID:', id, 'deps: ', dep_ids.toString())
62
+ cell_map[id] = cell;
63
+ dep_graph[id] = dep_ids;
64
+
65
+ });
66
+ });
67
+
68
+ if(dep_graph[root_cell_id].length > 0) {
69
+ console.log('[execution_dependencies] collecting depdendency graph in-degrees...');
70
+ var processing_queue = [root_cell_id];
71
+ var processed_nodes = 0;
72
+ var in_degree = {}; // ...collect in-degrees of nodes
73
+ while(processing_queue.length > 0 && processed_nodes < Object.keys(dep_graph).length) {// ...stay processing deps while the queue contains nodes and the processed nodes are below total node quantity
74
+ var id = processing_queue.shift(); // .....pop front of queue and front-push it to the processing order
75
+ //console.log("ID: ", id);
76
+ for(var i=0, dep_qty=dep_graph[id].length; i < dep_qty; i++) {
77
+ var dep = dep_graph[id][i];
78
+ // console.log(' dep: ', dep);
79
+ in_degree[id] = in_degree[id] || 0;
80
+ in_degree[dep] = in_degree[dep] === undefined ? 1 : ++in_degree[dep];
81
+ processing_queue.unshift(dep);
82
+ }
83
+ processed_nodes++;
84
+ }
85
+
86
+ console.log('[execution_dependencies] starting topological sort...');
87
+ processing_queue = [root_cell_id]; // ...add root node with in-degree 0 to queue (this excludes all disconnected subgraphs)
88
+ processed_nodes = 0; // ...number of processed nodes (to detect circular dependencies)
89
+ var processing_order = [];
90
+ while(processing_queue.length > 0 && processed_nodes < Object.keys(dep_graph).length) {// ...stay processing deps while the queue contains nodes and the processed nodes are below total node quantity
91
+ var id = processing_queue.shift(); // .....pop front of queue and front-push it to the processing order
92
+ processing_order.unshift(id);
93
+ //console.log("ID: ", id);
94
+ for(var i=0, dep_qty=dep_graph[id].length; i < dep_qty; i++) { // ......iterate over dependent nodes of current id and decrease their in-degree by 1
95
+ var dep = dep_graph[id][i];
96
+ // console.log(' dep: ', dep);
97
+ in_degree[dep]--;
98
+ if(in_degree[dep] == 0) { // ......queue dependency if in-degree is 0
99
+ processing_queue.unshift(dep);
100
+ }
101
+ }
102
+ processed_nodes++;
103
+ }
104
+
105
+ console.log('[execution_dependencies] checking for circular dependencies...');
106
+ if(processed_nodes > Object.keys(dep_graph).length) { // ...if more nodes where processed than the number of graph nodes, there is a circular dependency
107
+ dialog.modal({
108
+ title : 'Circular dependencies in the execute dependencies of this cell',
109
+ body : 'There is a circular dependency in this cell\'s execute dependencies. The cell will be run without dependencies. If this does not work, fix the dependencies and rerun the cell.',
110
+ buttons: {'OK': {'class' : 'btn-primary'}},
111
+ notebook: Jupyter.notebook,
112
+ keyboard_manager: Jupyter.keyboard_manager,
113
+ });
114
+ }
115
+ else if(!Jupyter.notebook.trusted) { // ...if the notebook is not trusted, we do not execute dependencies, but only print them out to the user
116
+ dialog.modal({
117
+ title : 'Execute dependencies in untrusted notebook',
118
+ body : 'This notebook is not trusted, so execute dependencies will not be automatically run. You can still run them manually, though. Run in order (the last one is the cell you wanted to execute): ' + processing_order,
119
+ buttons: {'OK': {'class' : 'btn-primary'}},
120
+ notebook: Jupyter.notebook,
121
+ keyboard_manager: Jupyter.keyboard_manager,
122
+ });
123
+ }
124
+ else{
125
+ processing_order.pop()
126
+ console.log('[execution_dependencies] executing dependency cells in order ', processing_order ,'...');
127
+ var dependency_cells = processing_order.map(id =>cell_map[id]); // ...get dependent cells by their id
128
+ //console.log("Execute cells..", dependency_cells)
129
+ dependency_cells.forEach(cell => orig_execute.call(cell, stop_on_error)); // ...execute all dependent cells in sequence using the original execute method
130
+ }
131
+ }
132
+ }
133
+ console.log('[execution_dependencies] executing requested cell...');
134
+ orig_execute.call(this, stop_on_error); // execute original cell execute function
135
+ };
136
+ console.log('[execution_dependencies] loaded');
137
+ }
138
+ };
139
+ });
.local/share/jupyter/nbextensions/execution_dependencies/execution_dependencies.yml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ Type: Jupyter Notebook Extension
2
+ Compatibility: 4.x, 5.x
3
+ Name: Execution Dependencies
4
+ Main: execution_dependencies.js
5
+ Link: README.md
6
+ Description: |
7
+ Introduce tag annotations to identify each cell and indicate a dependency on others.
8
+ Upon running a cell, its dependencies are run first to prepare all dependencies.
9
+ Then the cell triggered by the user is run as soon as all its dependencies are met.
.local/share/jupyter/nbextensions/exercise/exercise.yaml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Type: Jupyter Notebook Extension
2
+ Name: Exercise
3
+ Description: |
4
+ Define a group of cells as an "exercise".
5
+ The first cell is the question,
6
+ while the rest of the group form the answer or solution.
7
+ The solution can be hidden/shown by clicking on a widget added to the
8
+ question cell.
9
+ Link: readme.md
10
+ Icon: icon.png
11
+ Main: main.js
12
+ Compatibility: 4.x, 5.x
13
+ Parameters:
14
+
15
+ - name: add_button
16
+ description: Add a toolbar button to create/remove an exercise
17
+ input_type: checkbox
18
+ default: true
19
+
20
+ - name: use_hotkey
21
+ description: Add a keyboard shortcut to create/remove an exercise
22
+ input_type: checkbox
23
+ default: true
24
+
25
+ - name: hotkey
26
+ description: Keyboard shortcut optionally used to create/remove an exercise
27
+ input_type: hotkey
28
+ default: 'Alt-D'
.local/share/jupyter/nbextensions/exercise/icon.png ADDED
.local/share/jupyter/nbextensions/exercise/main.css ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .highlight-mask
2
+ {
3
+ background: transparent url('../images/theme/transBlack75.png') repeat 0 0;
4
+ display: none;
5
+ position: absolute;
6
+ }
7
+ .highlight-drag
8
+ {
9
+ background-color: transparent;
10
+ border: dashed #ff3333 3px;
11
+ position: absolute;
12
+ display: none;
13
+ }
.local/share/jupyter/nbextensions/exercise2/icon.png ADDED
.local/share/jupyter/nbextensions/exercise2/image.gif ADDED
.local/share/jupyter/nbextensions/exercise2/main.css ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .exercise2 {
2
+ display: flex;
3
+ width: 100%;
4
+ flex-direction: row;
5
+ align-content: flex-end;
6
+ }
7
+
8
+ .onoffswitch {
9
+ display: inline;
10
+ position: relative; width: 167px;
11
+ margin-top:8px;
12
+ -webkit-user-select:none; -moz-user-select:none; -ms-user-select: none;
13
+ }
14
+ .onoffswitch-checkbox {
15
+ display: none;
16
+ }
17
+ .onoffswitch-label {
18
+ display: block; overflow: hidden; cursor: pointer;
19
+ border: 2px solid #999999; border-radius: 20px;
20
+ margin:0;
21
+ }
22
+ .onoffswitch-inner {
23
+ display: block; width: 200%; margin-left: -100%;
24
+ transition: margin 0.3s ease-in 0s;
25
+ }
26
+ .onoffswitch-inner:before, .onoffswitch-inner:after {
27
+ display: block; float: left; width: 50%; height: 30px; padding: 0; line-height: 30px;
28
+ font-size: 15px; color: white; font-family: Trebuchet, Arial, sans-serif; font-weight: bold;
29
+ box-sizing: border-box;
30
+ }
31
+ .onoffswitch-inner:before {
32
+ content: "Hide Solution";
33
+ padding-left: 10px;
34
+ background-color: #34A7C1; color: #FFFFFF;
35
+ }
36
+
37
+ .onoffswitch-inner:after {
38
+ content: "Show Solution";
39
+ padding-right: 10px;
40
+ background-color: #73FA7E; color: #999999;
41
+ text-align: right;
42
+ }
43
+ .onoffswitch-switch {
44
+ display: block; width: 14px; margin: 6px;
45
+ padding-top: 0px;
46
+ background: #FFFFFF;
47
+ position: absolute; top: 0; bottom: 0;
48
+ text-align: center;
49
+ vertical-align: middle;
50
+ right: 133px;
51
+ border: 2px solid #999999; border-radius: 20px;
52
+ transition: all 0.25s ease-in 0s;
53
+ }
54
+ .onoffswitch-checkbox:checked + .onoffswitch-label .onoffswitch-inner {
55
+ margin-left: 0;
56
+ }
57
+ .onoffswitch-checkbox:checked + .onoffswitch-label .onoffswitch-switch {
58
+ right: 0px;
59
+ }
60
+
.local/share/jupyter/nbextensions/exercise2/readme.md ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Exercise2
2
+ =========
3
+
4
+ These are two extensions for Jupyter, for hiding/showing solution cells.
5
+ They use the same approach and codebase and differ only by the type of
6
+ `cell widget` used to show/hide the solutions. The two extensions can be used
7
+ simultaneously. They require the `rubberband` extension to be installed and
8
+ enabled.
9
+
10
+ The example below demonstrates some of the features of the exercise extensions.
11
+
12
+ - First, a solution or "details" cell is created by (a) selecting two cells with the rubberband and (b) clicking on the menu-button [exercise extension]
13
+ - Second, the two next cells are selected using a keyboard shortcut, and a solution is created using the shortcut Alt-D [exercise2 extension]
14
+ - Third, the two solutions are expanded by clicking on the corresponding widgets
15
+ - Fourth, the solutions are removed by selecting them and clicking on the buttons in the toolbar.
16
+
17
+ ![](image.gif)
18
+
19
+
20
+ The extensions provide
21
+ ----------------------
22
+
23
+ - a menubar button
24
+ - a cell widget -- A plus/minus button in `exercise` and a sliding checkbox in `exercise2`.
25
+
26
+ The menubar button is devoted to the creation or removal of the solution. The solution consists of several consecutive cells that can be selected by the usual notebook multicell selection methods (e.g. *Shift-down* (select next) or *Shift-up* (select previous) keyboard shortcuts, or using the rubberband extension).
27
+
28
+
29
+ ### Creating a solution
30
+
31
+ With several cells selected, pressing the menubar button adds a `cell widget` and hides all the cells except the first one, which serves as a heading cell. *Do not forget to keep the Shift key pressed down while clicking on the menu button
32
+ (otherwise selected cells will be lost)*. It is also possible to use a keyboard shortcut for creating the solution from selected cells: Alt-S for exercise extension and Alt-D for exercise2.
33
+
34
+
35
+ ### Removing a solution
36
+
37
+ If a solution heading (first) cell is selected, then clicking the menu bar button removes this solution and its solution cells are shown. Using the keyboard shortcut has the same effect.
38
+
39
+
40
+ ### Showing/hiding solution
41
+
42
+ At creation of the solution, the solution cells are hidden. Clicking the `cell widget` toggles the hidden/shown state of the solution.
43
+
44
+
45
+ ### Persistence
46
+
47
+ The state of solutions, hidden or shown, is preserved and automatically restored at startup and on reload.
48
+
49
+
50
+ ### Internals
51
+
52
+ exercise and exercise2 add, respectively, a solution and a solution2 metadata entry to solution cells, whose value is the current hidden/shown state of the solution. For exercise, a div with the plus/minus character is prepended to the solution heading cell. For exercise2, a flex-wrap style is added to the solution heading cell and a checkbox widget, with some css styling, is appended to the cell. A solution[.2]_first metadata entry is also added to enable an easy detection of the first cell in an "exercise" and thus allow several consecutive exercises.
.local/share/jupyter/nbextensions/export_embedded/export_embedded.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Type: Jupyter Notebook Extension
2
+ Compatibility: 5.x
3
+ Main: main.js
4
+ Name: Export Embedded HTML
5
+ Description: Export to HTML with images embedded
6
+ Icon: icon.png
7
+ Link: readme.md
.local/share/jupyter/nbextensions/export_embedded/icon.png ADDED
.local/share/jupyter/nbextensions/export_embedded/main.js ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // add an "HTML Embedded (.html)" entry to the notebook's download menu
2
+
3
+ define([
4
+ 'jquery',
5
+ 'base/js/namespace',
6
+ 'base/js/events'
7
+ ], function(
8
+ $,
9
+ Jupyter,
10
+ events
11
+ ) {
12
+ "use strict";
13
+
14
+ function initialize () {
15
+ }
16
+
17
+ var load_ipython_extension = function() {
18
+
19
+ var v = Jupyter.version.split(".")
20
+ if(Number(v[0])*10+ Number(v[1]) < 51)
21
+ {
22
+ console.log('Notebook version 5.1.0 or higher required for this extension')
23
+ return
24
+ }
25
+
26
+ /* Add an entry in the download menu */
27
+ var dwm = $("#download_menu")
28
+ var downloadEntry = $('<li id="download_html_embed"><a href="#">HTML Embedded (.html)</a></li>')
29
+ dwm.append(downloadEntry)
30
+ downloadEntry.click(function () {
31
+ Jupyter.menubar._nbconvert('html_embed', true);
32
+ });
33
+
34
+ /* Add also a Button, currently disabled */
35
+ /*
36
+ Jupyter.toolbar.add_buttons_group([
37
+ Jupyter.keyboard_manager.actions.register ({
38
+ help : 'Embedded HTML Export',
39
+ icon : 'fa-save',
40
+ handler: function() {
41
+ Jupyter.menubar._nbconvert('html_embed', true);
42
+ }
43
+ }, 'export-embedded-html', 'export_embedded')
44
+ ]);
45
+ */
46
+ if (Jupyter.notebook !== undefined && Jupyter.notebook._fully_loaded) {
47
+ // notebook_loaded.Notebook event has already happened
48
+ initialize();
49
+ }
50
+ events.on('notebook_loaded.Notebook', initialize);
51
+ };
52
+
53
+ return {
54
+ load_ipython_extension : load_ipython_extension
55
+ };
56
+ });
.local/share/jupyter/nbextensions/export_embedded/readme.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Export HTML With Embedded Images
2
+ ================================
3
+ This extension allows exporting an embedded HTML file through an additional download option, File -> Download -> HTML Embedded (works like: jupyter nbconvert --to html_embed notebook.ipynb).
4
+
5
+ **Note**: This extension can so far only successfully read relative image paths in the markdown cells (e.g. `![](graphics/pic.png)`) when jupyter is started in the same folder (working directory) where the relative paths can be resolved!
6
+
7
+
.local/share/jupyter/nbextensions/freeze/readme.md ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Freeze
2
+
3
+ This extension allows cells to be made read-only or frozen. It provides three buttons:
4
+ * unlock
5
+ * read-only
6
+ * frozen
7
+
8
+
9
+ For **code-cells**:<br>
10
+ _read-only_: it can be executed, but its input cannot be changed.<br>
11
+ _frozen_: It cannot be either altered or executed.
12
+
13
+ For **markdown-cells**:<br>
14
+ _read-only_: Its input can be viewed by double-clicking on it, but cannot be changed.<br>
15
+ _frozen_: Input cannot be viewed by double-clicking.
16
+
17
+ To change the state of a selected cell, press the corresponding button.
18
+
19
+ The individual cell's state is stored in its metadata and is applied to the cell if the extension is loaded.
20
+
21
+ ## Internals
22
+
23
+ The _read-only_ state is stored in the `cell.metadata.editable` attribute. Cells are editable by default.
24
+ The _frozen_ state is stored in the `cell.metadata.run_control.frozen` attribute.
.local/share/jupyter/nbextensions/help_panel/help_panel.css ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #side_panel {
2
+ position: relative;
3
+ top: 0;
4
+ right: 0;
5
+ height: 100%;
6
+ font-size: 0.9em;
7
+ overflow: auto;
8
+ background-color: #FFFFFF;
9
+ }
10
+
11
+ @media print {
12
+ /* print-previews can't handle 100%-height element as main part of page */
13
+ #side_panel {
14
+ height: auto;
15
+ }
16
+
17
+ /*
18
+ see
19
+ stackoverflow.com/questions/19948474
20
+ stackoverflow.com/questions/20243767
21
+ */
22
+ .col-md-6 {
23
+ float: left;
24
+ width: 50%;
25
+ }
26
+
27
+ .quickhelp {
28
+ page-break-inside: avoid;
29
+ }
30
+ }
31
+
32
+ .side_panel_splitbar {
33
+ position: absolute;
34
+ left: 0;
35
+ top: 0;
36
+ cursor: col-resize;
37
+ height: 100%;
38
+ width: 8px;
39
+ background: url(./img/handle-v.png) 2px 50% no-repeat;
40
+ background-color: #F6F6F6;
41
+ }
42
+
43
+ .side_panel_inner {
44
+ overflow: auto;
45
+ height: inherit;
46
+ margin-left: 8px;
47
+ }
48
+
49
+ .side_panel_inner > div:not(.alert) {
50
+ padding: 0.5em;
51
+ }
52
+
53
+ .help_panel_hide .modal-backdrop,
54
+ .help_panel_hide .modal {
55
+ display: none !important;
56
+ }
.triton/dump/0359b089f02b5ddabaef8985c60f3daf/triton_.ttgir ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [8], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<12865792> : tensor<1024xi32, #blocked>
5
+ %c1024_i32 = arith.constant 1024 : i32
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.muli %0, %c1024_i32 : i32
8
+ %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked>
9
+ %3 = tt.splat %1 : (i32) -> tensor<1024xi32, #blocked>
10
+ %4 = arith.addi %3, %2 : tensor<1024xi32, #blocked>
11
+ %5 = arith.cmpi slt, %4, %cst : tensor<1024xi32, #blocked>
12
+ %6 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<1024x!tt.ptr<f32, 1>, #blocked>
13
+ %7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr<f32, 1>, #blocked>, tensor<1024xi32, #blocked>
14
+ %8 = tt.load %7, %5 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<1024xf32, #blocked>
15
+ %9 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<1024x!tt.ptr<bf16, 1>, #blocked>
16
+ %10 = tt.addptr %9, %4 : tensor<1024x!tt.ptr<bf16, 1>, #blocked>, tensor<1024xi32, #blocked>
17
+ %11 = arith.truncf %8 : tensor<1024xf32, #blocked> to tensor<1024xbf16, #blocked>
18
+ tt.store %10, %11, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<1024xbf16, #blocked>
19
+ tt.return
20
+ }
21
+ }
.triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.llir ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 {
5
+ %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %11 = lshr i32 %10, 3, !dbg !8
7
+ %12 = and i32 %11, 15, !dbg !8
8
+ %13 = or i32 %12, 16, !dbg !8
9
+ %14 = or i32 %12, 32, !dbg !8
10
+ %15 = or i32 %12, 48, !dbg !8
11
+ %16 = and i32 %10, 7, !dbg !9
12
+ %17 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !10
13
+ %18 = sext i32 %17 to i64, !dbg !11
14
+ %19 = shl nsw i64 %18, 6, !dbg !12
15
+ %20 = zext nneg i32 %12 to i64
16
+ %21 = zext nneg i32 %13 to i64
17
+ %22 = zext nneg i32 %14 to i64
18
+ %23 = zext nneg i32 %15 to i64
19
+ %24 = or i64 %19, %20, !dbg !13
20
+ %25 = or i64 %19, %21, !dbg !13
21
+ %26 = or i64 %19, %22, !dbg !13
22
+ %27 = or i64 %19, %23, !dbg !13
23
+ %28 = getelementptr i64, ptr addrspace(1) %1, i64 %24, !dbg !14
24
+ %29 = getelementptr i64, ptr addrspace(1) %1, i64 %25, !dbg !14
25
+ %30 = getelementptr i64, ptr addrspace(1) %1, i64 %26, !dbg !14
26
+ %31 = getelementptr i64, ptr addrspace(1) %1, i64 %27, !dbg !14
27
+ %32 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %28, i1 true) #2, !dbg !15
28
+ %33 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %29, i1 true) #2, !dbg !15
29
+ %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %30, i1 true) #2, !dbg !15
30
+ %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %31, i1 true) #2, !dbg !15
31
+ %36 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #2, !dbg !16
32
+ %37 = bitcast i32 %36 to float, !dbg !16
33
+ %38 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #2, !dbg !17
34
+ %39 = bitcast i32 %38 to float, !dbg !17
35
+ %40 = mul nsw i64 %24, 50257, !dbg !18
36
+ %41 = mul nsw i64 %25, 50257, !dbg !18
37
+ %42 = mul nsw i64 %26, 50257, !dbg !18
38
+ %43 = mul nsw i64 %27, 50257, !dbg !18
39
+ %44 = insertelement <4 x i64> poison, i64 %32, i64 0, !dbg !19
40
+ %45 = insertelement <4 x i64> %44, i64 %33, i64 1, !dbg !19
41
+ %46 = insertelement <4 x i64> %45, i64 %34, i64 2, !dbg !19
42
+ %47 = insertelement <4 x i64> %46, i64 %35, i64 3, !dbg !19
43
+ %48 = icmp eq <4 x i64> %47, <i64 -1, i64 -1, i64 -1, i64 -1>, !dbg !19
44
+ %49 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %37, float %39) #2, !dbg !20
45
+ %50 = insertelement <4 x float> poison, float %49, i64 0, !dbg !21
46
+ %51 = shufflevector <4 x float> %50, <4 x float> poison, <4 x i32> zeroinitializer, !dbg !21
47
+ %52 = select <4 x i1> %48, <4 x float> zeroinitializer, <4 x float> %51, !dbg !21
48
+ %53 = getelementptr float, ptr addrspace(1) %0, i64 %40
49
+ %54 = getelementptr float, ptr addrspace(1) %0, i64 %41
50
+ %55 = getelementptr float, ptr addrspace(1) %0, i64 %42
51
+ %56 = getelementptr float, ptr addrspace(1) %0, i64 %43
52
+ br label %57, !dbg !22
53
+
54
+ 57: ; preds = %9, %57
55
+ %58 = phi i32 [ 0, %9 ], [ %81, %57 ]
56
+ %59 = phi <4 x float> [ zeroinitializer, %9 ], [ %80, %57 ]
57
+ %60 = or i32 %58, %16, !dbg !23
58
+ %61 = zext nneg i32 %60 to i64, !dbg !23
59
+ %62 = icmp ult i32 %60, 50257, !dbg !24
60
+ %63 = getelementptr float, ptr addrspace(1) %53, i64 %61, !dbg !25
61
+ %64 = getelementptr float, ptr addrspace(1) %54, i64 %61, !dbg !25
62
+ %65 = getelementptr float, ptr addrspace(1) %55, i64 %61, !dbg !25
63
+ %66 = getelementptr float, ptr addrspace(1) %56, i64 %61, !dbg !25
64
+ %67 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %63, i1 %62, i32 0, i1 %62) #2, !dbg !26
65
+ %68 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %64, i1 %62, i32 0, i1 %62) #2, !dbg !26
66
+ %69 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %65, i1 %62, i32 0, i1 %62) #2, !dbg !26
67
+ %70 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %66, i1 %62, i32 0, i1 %62) #2, !dbg !26
68
+ %71 = insertelement <4 x i32> poison, i32 %67, i64 0, !dbg !26
69
+ %72 = insertelement <4 x i32> %71, i32 %68, i64 1, !dbg !26
70
+ %73 = insertelement <4 x i32> %72, i32 %69, i64 2, !dbg !26
71
+ %74 = insertelement <4 x i32> %73, i32 %70, i64 3, !dbg !26
72
+ %75 = bitcast <4 x i32> %74 to <4 x float>, !dbg !26
73
+ %76 = fmul <4 x float> %52, %75, !dbg !27
74
+ %77 = insertelement <4 x i1> poison, i1 %62, i64 0, !dbg !28
75
+ %78 = shufflevector <4 x i1> %77, <4 x i1> poison, <4 x i32> zeroinitializer, !dbg !28
76
+ %79 = select <4 x i1> %78, <4 x float> %76, <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !28
77
+ %80 = fadd <4 x float> %59, %79, !dbg !28
78
+ %81 = add nuw nsw i32 %58, 8, !dbg !22
79
+ %82 = icmp ult i32 %58, 50249, !dbg !22
80
+ br i1 %82, label %57, label %83, !dbg !22
81
+
82
+ 83: ; preds = %57
83
+ %84 = extractelement <4 x float> %80, i64 0, !dbg !29
84
+ %85 = bitcast float %84 to i32, !dbg !29
85
+ %86 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %85, i32 4, i32 31), !dbg !29
86
+ %87 = bitcast i32 %86 to float, !dbg !29
87
+ %88 = fadd float %84, %87, !dbg !33
88
+ %89 = bitcast float %88 to i32, !dbg !29
89
+ %90 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %89, i32 2, i32 31), !dbg !29
90
+ %91 = bitcast i32 %90 to float, !dbg !29
91
+ %92 = fadd float %88, %91, !dbg !33
92
+ %93 = bitcast float %92 to i32, !dbg !29
93
+ %94 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %93, i32 1, i32 31), !dbg !29
94
+ %95 = bitcast i32 %94 to float, !dbg !29
95
+ %96 = fadd float %92, %95, !dbg !33
96
+ %97 = extractelement <4 x float> %80, i64 1, !dbg !29
97
+ %98 = bitcast float %97 to i32, !dbg !29
98
+ %99 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %98, i32 4, i32 31), !dbg !29
99
+ %100 = bitcast i32 %99 to float, !dbg !29
100
+ %101 = fadd float %97, %100, !dbg !33
101
+ %102 = bitcast float %101 to i32, !dbg !29
102
+ %103 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %102, i32 2, i32 31), !dbg !29
103
+ %104 = bitcast i32 %103 to float, !dbg !29
104
+ %105 = fadd float %101, %104, !dbg !33
105
+ %106 = bitcast float %105 to i32, !dbg !29
106
+ %107 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %106, i32 1, i32 31), !dbg !29
107
+ %108 = bitcast i32 %107 to float, !dbg !29
108
+ %109 = fadd float %105, %108, !dbg !33
109
+ %110 = extractelement <4 x float> %80, i64 2, !dbg !29
110
+ %111 = bitcast float %110 to i32, !dbg !29
111
+ %112 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %111, i32 4, i32 31), !dbg !29
112
+ %113 = bitcast i32 %112 to float, !dbg !29
113
+ %114 = fadd float %110, %113, !dbg !33
114
+ %115 = bitcast float %114 to i32, !dbg !29
115
+ %116 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %115, i32 2, i32 31), !dbg !29
116
+ %117 = bitcast i32 %116 to float, !dbg !29
117
+ %118 = fadd float %114, %117, !dbg !33
118
+ %119 = bitcast float %118 to i32, !dbg !29
119
+ %120 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %119, i32 1, i32 31), !dbg !29
120
+ %121 = bitcast i32 %120 to float, !dbg !29
121
+ %122 = fadd float %118, %121, !dbg !33
122
+ %123 = extractelement <4 x float> %80, i64 3, !dbg !29
123
+ %124 = bitcast float %123 to i32, !dbg !29
124
+ %125 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %124, i32 4, i32 31), !dbg !29
125
+ %126 = bitcast i32 %125 to float, !dbg !29
126
+ %127 = fadd float %123, %126, !dbg !33
127
+ %128 = bitcast float %127 to i32, !dbg !29
128
+ %129 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %128, i32 2, i32 31), !dbg !29
129
+ %130 = bitcast i32 %129 to float, !dbg !29
130
+ %131 = fadd float %127, %130, !dbg !33
131
+ %132 = bitcast float %131 to i32, !dbg !29
132
+ %133 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %132, i32 1, i32 31), !dbg !29
133
+ %134 = bitcast i32 %133 to float, !dbg !29
134
+ %135 = fadd float %131, %134, !dbg !33
135
+ %136 = extractelement <4 x float> %52, i64 0, !dbg !37
136
+ %137 = extractelement <4 x float> %52, i64 1, !dbg !37
137
+ %138 = extractelement <4 x float> %52, i64 2, !dbg !37
138
+ %139 = extractelement <4 x float> %52, i64 3, !dbg !37
139
+ br label %140, !dbg !38
140
+
141
+ 140: ; preds = %83, %140
142
+ %141 = phi i32 [ 0, %83 ], [ %217, %140 ]
143
+ %142 = or i32 %141, %16, !dbg !39
144
+ %143 = zext nneg i32 %142 to i64, !dbg !39
145
+ %144 = icmp ult i32 %142, 50257, !dbg !40
146
+ %145 = add nsw i64 %40, %143, !dbg !41
147
+ %146 = add nsw i64 %41, %143, !dbg !41
148
+ %147 = add nsw i64 %42, %143, !dbg !41
149
+ %148 = add nsw i64 %43, %143, !dbg !41
150
+ %149 = getelementptr i16, ptr addrspace(1) %4, i64 %145, !dbg !42
151
+ %150 = getelementptr i16, ptr addrspace(1) %4, i64 %146, !dbg !42
152
+ %151 = getelementptr i16, ptr addrspace(1) %4, i64 %147, !dbg !42
153
+ %152 = getelementptr i16, ptr addrspace(1) %4, i64 %148, !dbg !42
154
+ %153 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %149, i1 %144, i16 0, i1 %144) #2, !dbg !43
155
+ %154 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %150, i1 %144, i16 0, i1 %144) #2, !dbg !43
156
+ %155 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %151, i1 %144, i16 0, i1 %144) #2, !dbg !43
157
+ %156 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %152, i1 %144, i16 0, i1 %144) #2, !dbg !43
158
+ %157 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %153) #2, !dbg !44
159
+ %158 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %154) #2, !dbg !44
160
+ %159 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %155) #2, !dbg !44
161
+ %160 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %156) #2, !dbg !44
162
+ %161 = getelementptr float, ptr addrspace(1) %0, i64 %145, !dbg !45
163
+ %162 = getelementptr float, ptr addrspace(1) %0, i64 %146, !dbg !45
164
+ %163 = getelementptr float, ptr addrspace(1) %0, i64 %147, !dbg !45
165
+ %164 = getelementptr float, ptr addrspace(1) %0, i64 %148, !dbg !45
166
+ %165 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %161, i1 %144, i32 0, i1 %144) #2, !dbg !46
167
+ %166 = bitcast i32 %165 to float, !dbg !46
168
+ %167 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %162, i1 %144, i32 0, i1 %144) #2, !dbg !46
169
+ %168 = bitcast i32 %167 to float, !dbg !46
170
+ %169 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %163, i1 %144, i32 0, i1 %144) #2, !dbg !46
171
+ %170 = bitcast i32 %169 to float, !dbg !46
172
+ %171 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %164, i1 %144, i32 0, i1 %144) #2, !dbg !46
173
+ %172 = bitcast i32 %171 to float, !dbg !46
174
+ %173 = getelementptr i16, ptr addrspace(1) %5, i64 %145, !dbg !47
175
+ %174 = getelementptr i16, ptr addrspace(1) %5, i64 %146, !dbg !47
176
+ %175 = getelementptr i16, ptr addrspace(1) %5, i64 %147, !dbg !47
177
+ %176 = getelementptr i16, ptr addrspace(1) %5, i64 %148, !dbg !47
178
+ %177 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %173, i1 %144, i16 0, i1 %144) #2, !dbg !48
179
+ %178 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %174, i1 %144, i16 0, i1 %144) #2, !dbg !48
180
+ %179 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %175, i1 %144, i16 0, i1 %144) #2, !dbg !48
181
+ %180 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %176, i1 %144, i16 0, i1 %144) #2, !dbg !48
182
+ %181 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %177) #2, !dbg !49
183
+ %182 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %178) #2, !dbg !49
184
+ %183 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %179) #2, !dbg !49
185
+ %184 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %180) #2, !dbg !49
186
+ %185 = fmul float %136, %166, !dbg !37
187
+ %186 = fmul float %137, %168, !dbg !37
188
+ %187 = fmul float %138, %170, !dbg !37
189
+ %188 = fmul float %139, %172, !dbg !37
190
+ %189 = fmul float %181, 0x3FF7154760000000, !dbg !50
191
+ %190 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %189) #2, !dbg !50
192
+ %191 = fmul float %182, 0x3FF7154760000000, !dbg !50
193
+ %192 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %191) #2, !dbg !50
194
+ %193 = fmul float %183, 0x3FF7154760000000, !dbg !50
195
+ %194 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %193) #2, !dbg !50
196
+ %195 = fmul float %184, 0x3FF7154760000000, !dbg !50
197
+ %196 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %195) #2, !dbg !50
198
+ %197 = fmul float %96, %190, !dbg !51
199
+ %198 = fmul float %109, %192, !dbg !51
200
+ %199 = fmul float %122, %194, !dbg !51
201
+ %200 = fmul float %135, %196, !dbg !51
202
+ %201 = fsub float %185, %197, !dbg !52
203
+ %202 = fsub float %186, %198, !dbg !52
204
+ %203 = fsub float %187, %199, !dbg !52
205
+ %204 = fsub float %188, %200, !dbg !52
206
+ %205 = fadd float %157, %201, !dbg !53
207
+ %206 = fadd float %158, %202, !dbg !53
208
+ %207 = fadd float %159, %203, !dbg !53
209
+ %208 = fadd float %160, %204, !dbg !53
210
+ %209 = getelementptr i16, ptr addrspace(1) %6, i64 %145, !dbg !54
211
+ %210 = getelementptr i16, ptr addrspace(1) %6, i64 %146, !dbg !54
212
+ %211 = getelementptr i16, ptr addrspace(1) %6, i64 %147, !dbg !54
213
+ %212 = getelementptr i16, ptr addrspace(1) %6, i64 %148, !dbg !54
214
+ %213 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %205) #2, !dbg !55
215
+ %214 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %206) #2, !dbg !55
216
+ %215 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %207) #2, !dbg !55
217
+ %216 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %208) #2, !dbg !55
218
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %213, ptr addrspace(1) %209, i1 %144) #2, !dbg !55
219
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %214, ptr addrspace(1) %210, i1 %144) #2, !dbg !55
220
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %215, ptr addrspace(1) %211, i1 %144) #2, !dbg !55
221
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %216, ptr addrspace(1) %212, i1 %144) #2, !dbg !55
222
+ %217 = add nuw nsw i32 %141, 8, !dbg !38
223
+ %218 = icmp ult i32 %141, 50249, !dbg !38
224
+ br i1 %218, label %140, label %219, !dbg !38
225
+
226
+ 219: ; preds = %140
227
+ ret void, !dbg !56
228
+ }
229
+
230
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
231
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
232
+
233
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
234
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
235
+
236
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
237
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
238
+ attributes #2 = { nounwind }
239
+
240
+ !llvm.module.flags = !{!0}
241
+ !llvm.dbg.cu = !{!1}
242
+ !nvvm.annotations = !{!3, !4, !4, !3}
243
+
244
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
245
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
246
+ !2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")
247
+ !3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
248
+ !4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 128}
249
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
250
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
251
+ !7 = !{}
252
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
253
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
254
+ !10 = !DILocation(line: 21, column: 28, scope: !5)
255
+ !11 = !DILocation(line: 21, column: 34, scope: !5)
256
+ !12 = !DILocation(line: 21, column: 46, scope: !5)
257
+ !13 = !DILocation(line: 22, column: 23, scope: !5)
258
+ !14 = !DILocation(line: 26, column: 30, scope: !5)
259
+ !15 = !DILocation(line: 26, column: 35, scope: !5)
260
+ !16 = !DILocation(line: 27, column: 19, scope: !5)
261
+ !17 = !DILocation(line: 29, column: 19, scope: !5)
262
+ !18 = !DILocation(line: 36, column: 46, scope: !5)
263
+ !19 = !DILocation(line: 38, column: 23, scope: !5)
264
+ !20 = !DILocation(line: 39, column: 22, scope: !5)
265
+ !21 = !DILocation(line: 41, column: 37, scope: !5)
266
+ !22 = !DILocation(line: 32, column: 36, scope: !5)
267
+ !23 = !DILocation(line: 33, column: 27, scope: !5)
268
+ !24 = !DILocation(line: 34, column: 25, scope: !5)
269
+ !25 = !DILocation(line: 36, column: 34, scope: !5)
270
+ !26 = !DILocation(line: 36, column: 52, scope: !5)
271
+ !27 = !DILocation(line: 42, column: 23, scope: !5)
272
+ !28 = !DILocation(line: 45, column: 40, scope: !5)
273
+ !29 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !32)
274
+ !30 = distinct !DILexicalBlockFile(scope: !5, file: !31, discriminator: 0)
275
+ !31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
276
+ !32 = !DILocation(line: 46, column: 27, scope: !30)
277
+ !33 = !DILocation(line: 233, column: 15, scope: !34, inlinedAt: !35)
278
+ !34 = distinct !DILexicalBlockFile(scope: !30, file: !31, discriminator: 0)
279
+ !35 = !DILocation(line: 243, column: 36, scope: !34, inlinedAt: !36)
280
+ !36 = !DILocation(line: 46, column: 27, scope: !34)
281
+ !37 = !DILocation(line: 63, column: 24, scope: !5)
282
+ !38 = !DILocation(line: 51, column: 36, scope: !5)
283
+ !39 = !DILocation(line: 52, column: 27, scope: !5)
284
+ !40 = !DILocation(line: 53, column: 25, scope: !5)
285
+ !41 = !DILocation(line: 55, column: 41, scope: !5)
286
+ !42 = !DILocation(line: 55, column: 35, scope: !5)
287
+ !43 = !DILocation(line: 55, column: 53, scope: !5)
288
+ !44 = !DILocation(line: 55, column: 105, scope: !5)
289
+ !45 = !DILocation(line: 56, column: 35, scope: !5)
290
+ !46 = !DILocation(line: 56, column: 53, scope: !5)
291
+ !47 = !DILocation(line: 57, column: 35, scope: !5)
292
+ !48 = !DILocation(line: 57, column: 53, scope: !5)
293
+ !49 = !DILocation(line: 57, column: 105, scope: !5)
294
+ !50 = !DILocation(line: 65, column: 23, scope: !5)
295
+ !51 = !DILocation(line: 66, column: 24, scope: !5)
296
+ !52 = !DILocation(line: 67, column: 24, scope: !5)
297
+ !53 = !DILocation(line: 69, column: 24, scope: !5)
298
+ !54 = !DILocation(line: 70, column: 29, scope: !5)
299
+ !55 = !DILocation(line: 70, column: 54, scope: !5)
300
+ !56 = !DILocation(line: 51, column: 4, scope: !5)
.triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.ptx ADDED
@@ -0,0 +1,734 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7de8
10
+
11
+ .visible .entry triton__0d1d2d3d4d5d6d7de8(
12
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
13
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
14
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
20
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
21
+ )
22
+ .maxntid 128, 1, 1
23
+ {
24
+ .reg .pred %p<49>;
25
+ .reg .b16 %rs<33>;
26
+ .reg .b32 %r<72>;
27
+ .reg .f32 %f<98>;
28
+ .reg .b64 %rd<66>;
29
+ .loc 1 18 0
30
+ $L__func_begin0:
31
+ .loc 1 18 0
32
+
33
+ ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7de8_param_6];
34
+ ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7de8_param_5];
35
+ ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7de8_param_4];
36
+ ld.param.u64 %rd28, [triton__0d1d2d3d4d5d6d7de8_param_0];
37
+ ld.param.u64 %rd29, [triton__0d1d2d3d4d5d6d7de8_param_1];
38
+ $L__tmp0:
39
+ .loc 1 22 44
40
+ mov.u32 %r13, %tid.x;
41
+ ld.param.u64 %rd26, [triton__0d1d2d3d4d5d6d7de8_param_2];
42
+ bfe.u32 %r14, %r13, 3, 4;
43
+ ld.param.u64 %rd27, [triton__0d1d2d3d4d5d6d7de8_param_3];
44
+ .loc 1 24 33
45
+ and.b32 %r1, %r13, 7;
46
+ .loc 1 21 28
47
+ mov.u32 %r6, %ctaid.x;
48
+ .loc 1 21 34
49
+ cvt.s64.s32 %rd1, %r6;
50
+ .loc 1 21 46
51
+ mul.wide.s32 %rd30, %r6, 64;
52
+ cvt.u64.u32 %rd2, %r14;
53
+ .loc 1 22 23
54
+ or.b64 %rd31, %rd30, %rd2;
55
+ .loc 1 26 30
56
+ shl.b64 %rd32, %rd31, 3;
57
+ add.s64 %rd19, %rd29, %rd32;
58
+ add.s64 %rd21, %rd19, 128;
59
+ add.s64 %rd23, %rd19, 256;
60
+ add.s64 %rd25, %rd19, 384;
61
+ mov.pred %p1, -1;
62
+ .loc 1 26 35
63
+ mov.u64 %rd18, 0x0;
64
+ @%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd19 + 0 ];
65
+ mov.u64 %rd20, 0x0;
66
+ @%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ];
67
+ mov.u64 %rd22, 0x0;
68
+ @%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd23 + 0 ];
69
+ mov.u64 %rd24, 0x0;
70
+ @%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd25 + 0 ];
71
+ .loc 1 27 19
72
+ mov.u32 %r10, 0x0;
73
+ @%p1 ld.global.b32 { %r10 }, [ %rd26 + 0 ];
74
+ .loc 1 29 19
75
+ mov.u32 %r11, 0x0;
76
+ @%p1 ld.global.b32 { %r11 }, [ %rd27 + 0 ];
77
+ .loc 1 38 23
78
+ setp.eq.s64 %p7, %rd18, -1;
79
+ setp.eq.s64 %p8, %rd20, -1;
80
+ setp.eq.s64 %p9, %rd22, -1;
81
+ setp.eq.s64 %p10, %rd24, -1;
82
+ .loc 1 39 22
83
+ div.full.f32 %r9, %r10, %r11;
84
+ mov.b32 %f25, %r9;
85
+ .loc 1 41 37
86
+ selp.f32 %f4, 0f00000000, %f25, %p10;
87
+ selp.f32 %f3, 0f00000000, %f25, %p9;
88
+ selp.f32 %f2, 0f00000000, %f25, %p8;
89
+ selp.f32 %f1, 0f00000000, %f25, %p7;
90
+ .loc 1 32 36
91
+ mul.wide.s32 %rd33, %r6, 12865792;
92
+ mul.wide.u32 %rd34, %r14, 201028;
93
+ add.s64 %rd35, %rd33, %rd34;
94
+ cvt.u64.u32 %rd36, %r13;
95
+ and.b64 %rd3, %rd36, 7;
96
+ mul.wide.u32 %rd37, %r1, 4;
97
+ add.s64 %rd38, %rd35, %rd37;
98
+ add.s64 %rd39, %rd38, %rd28;
99
+ add.s64 %rd65, %rd39, 9649344;
100
+ mov.f32 %f94, 0f00000000;
101
+ mov.b32 %r70, -8;
102
+ mov.u64 %rd63, %rd65;
103
+ mov.f32 %f95, %f94;
104
+ mov.f32 %f96, %f94;
105
+ mov.f32 %f97, %f94;
106
+ $L__BB0_1:
107
+ add.s32 %r70, %r70, 8;
108
+ .loc 1 33 27
109
+ add.s32 %r23, %r70, %r1;
110
+ .loc 1 34 25
111
+ setp.lt.u32 %p11, %r23, 50257;
112
+ .loc 1 36 34
113
+ add.s64 %rd40, %rd63, -9649344;
114
+ add.s64 %rd41, %rd63, -6432896;
115
+ add.s64 %rd42, %rd63, -3216448;
116
+ mov.b32 %r54, 0;
117
+ .loc 1 36 52
118
+ mov.u32 %r15, 0x0;
119
+ @%p11 ld.global.L1::evict_last.b32 { %r15 }, [ %rd40 + 0 ];
120
+ @!%p11 mov.u32 %r15, %r54;
121
+ mov.u32 %r17, 0x0;
122
+ @%p11 ld.global.L1::evict_last.b32 { %r17 }, [ %rd41 + 0 ];
123
+ @!%p11 mov.u32 %r17, %r54;
124
+ mov.u32 %r19, 0x0;
125
+ @%p11 ld.global.L1::evict_last.b32 { %r19 }, [ %rd42 + 0 ];
126
+ @!%p11 mov.u32 %r19, %r54;
127
+ mov.u32 %r21, 0x0;
128
+ @%p11 ld.global.L1::evict_last.b32 { %r21 }, [ %rd63 + 0 ];
129
+ @!%p11 mov.u32 %r21, %r54;
130
+ mov.b32 %f26, %r21;
131
+ mov.b32 %f27, %r19;
132
+ mov.b32 %f28, %r17;
133
+ mov.b32 %f29, %r15;
134
+ .loc 1 42 23
135
+ mul.f32 %f30, %f1, %f29;
136
+ mul.f32 %f31, %f2, %f28;
137
+ mul.f32 %f32, %f3, %f27;
138
+ mul.f32 %f33, %f4, %f26;
139
+ .loc 1 45 40
140
+ selp.f32 %f34, %f33, 0f80000000, %p11;
141
+ selp.f32 %f35, %f32, 0f80000000, %p11;
142
+ selp.f32 %f36, %f31, 0f80000000, %p11;
143
+ selp.f32 %f37, %f30, 0f80000000, %p11;
144
+ add.f32 %f94, %f94, %f37;
145
+ add.f32 %f95, %f95, %f36;
146
+ add.f32 %f96, %f96, %f35;
147
+ add.f32 %f97, %f97, %f34;
148
+ .loc 1 32 36
149
+ add.s64 %rd63, %rd63, 32;
150
+ setp.lt.u32 %p19, %r70, 50249;
151
+ @%p19 bra $L__BB0_1;
152
+ $L__tmp1:
153
+ .loc 2 243 36
154
+ mov.b32 %r25, %f94;
155
+ shfl.sync.bfly.b32 %r26, %r25, 4, 31, -1;
156
+ mov.b32 %f38, %r26;
157
+ $L__tmp2:
158
+ .loc 2 233 15
159
+ add.f32 %f39, %f94, %f38;
160
+ $L__tmp3:
161
+ .loc 2 243 36
162
+ mov.b32 %r27, %f39;
163
+ shfl.sync.bfly.b32 %r28, %r27, 2, 31, -1;
164
+ mov.b32 %f40, %r28;
165
+ $L__tmp4:
166
+ .loc 2 233 15
167
+ add.f32 %f41, %f39, %f40;
168
+ $L__tmp5:
169
+ .loc 2 243 36
170
+ mov.b32 %r29, %f41;
171
+ shfl.sync.bfly.b32 %r30, %r29, 1, 31, -1;
172
+ mov.b32 %f42, %r30;
173
+ $L__tmp6:
174
+ .loc 2 233 15
175
+ add.f32 %f13, %f41, %f42;
176
+ $L__tmp7:
177
+ .loc 2 243 36
178
+ mov.b32 %r31, %f95;
179
+ shfl.sync.bfly.b32 %r32, %r31, 4, 31, -1;
180
+ mov.b32 %f43, %r32;
181
+ $L__tmp8:
182
+ .loc 2 233 15
183
+ add.f32 %f44, %f95, %f43;
184
+ $L__tmp9:
185
+ .loc 2 243 36
186
+ mov.b32 %r33, %f44;
187
+ shfl.sync.bfly.b32 %r34, %r33, 2, 31, -1;
188
+ mov.b32 %f45, %r34;
189
+ $L__tmp10:
190
+ .loc 2 233 15
191
+ add.f32 %f46, %f44, %f45;
192
+ $L__tmp11:
193
+ .loc 2 243 36
194
+ mov.b32 %r35, %f46;
195
+ shfl.sync.bfly.b32 %r36, %r35, 1, 31, -1;
196
+ mov.b32 %f47, %r36;
197
+ $L__tmp12:
198
+ .loc 2 233 15
199
+ add.f32 %f14, %f46, %f47;
200
+ $L__tmp13:
201
+ .loc 2 243 36
202
+ mov.b32 %r37, %f96;
203
+ shfl.sync.bfly.b32 %r38, %r37, 4, 31, -1;
204
+ mov.b32 %f48, %r38;
205
+ $L__tmp14:
206
+ .loc 2 233 15
207
+ add.f32 %f49, %f96, %f48;
208
+ $L__tmp15:
209
+ .loc 2 243 36
210
+ mov.b32 %r39, %f49;
211
+ shfl.sync.bfly.b32 %r40, %r39, 2, 31, -1;
212
+ mov.b32 %f50, %r40;
213
+ $L__tmp16:
214
+ .loc 2 233 15
215
+ add.f32 %f51, %f49, %f50;
216
+ $L__tmp17:
217
+ .loc 2 243 36
218
+ mov.b32 %r41, %f51;
219
+ shfl.sync.bfly.b32 %r42, %r41, 1, 31, -1;
220
+ mov.b32 %f52, %r42;
221
+ $L__tmp18:
222
+ .loc 2 233 15
223
+ add.f32 %f15, %f51, %f52;
224
+ $L__tmp19:
225
+ .loc 2 243 36
226
+ mov.b32 %r43, %f97;
227
+ shfl.sync.bfly.b32 %r44, %r43, 4, 31, -1;
228
+ mov.b32 %f53, %r44;
229
+ $L__tmp20:
230
+ .loc 2 233 15
231
+ add.f32 %f54, %f97, %f53;
232
+ $L__tmp21:
233
+ .loc 2 243 36
234
+ mov.b32 %r45, %f54;
235
+ shfl.sync.bfly.b32 %r46, %r45, 2, 31, -1;
236
+ mov.b32 %f55, %r46;
237
+ $L__tmp22:
238
+ .loc 2 233 15
239
+ add.f32 %f56, %f54, %f55;
240
+ $L__tmp23:
241
+ .loc 2 243 36
242
+ mov.b32 %r47, %f56;
243
+ shfl.sync.bfly.b32 %r48, %r47, 1, 31, -1;
244
+ mov.b32 %f57, %r48;
245
+ $L__tmp24:
246
+ .loc 2 233 15
247
+ add.f32 %f16, %f56, %f57;
248
+ $L__tmp25:
249
+ .loc 1 51 36
250
+ shl.b64 %rd44, %rd3, 1;
251
+ add.s64 %rd7, %rd17, %rd44;
252
+ mul.lo.s64 %rd45, %rd1, 6432896;
253
+ mul.lo.s64 %rd46, %rd2, 100514;
254
+ add.s64 %rd64, %rd45, %rd46;
255
+ add.s64 %rd9, %rd16, %rd44;
256
+ add.s64 %rd10, %rd15, %rd44;
257
+ mov.b32 %r71, -8;
258
+ mov.u16 %rs2, 0;
259
+ $L__BB0_3:
260
+ add.s32 %r71, %r71, 8;
261
+ .loc 1 52 27
262
+ add.s32 %r69, %r71, %r1;
263
+ .loc 1 53 25
264
+ setp.lt.u32 %p20, %r69, 50257;
265
+ .loc 1 55 35
266
+ add.s64 %rd47, %rd10, %rd64;
267
+ add.s64 %rd48, %rd47, 1608224;
268
+ add.s64 %rd49, %rd47, 3216448;
269
+ .loc 1 55 53
270
+ add.s64 %rd50, %rd47, 4824672;
271
+ mov.u16 %rs1, 0x0;
272
+ @%p20 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd47 + 0 ];
273
+ @!%p20 mov.u16 %rs1, %rs2;
274
+ mov.u16 %rs3, 0x0;
275
+ @%p20 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd48 + 0 ];
276
+ @!%p20 mov.u16 %rs3, %rs2;
277
+ mov.u16 %rs5, 0x0;
278
+ @%p20 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd49 + 0 ];
279
+ @!%p20 mov.u16 %rs5, %rs2;
280
+ mov.u16 %rs7, 0x0;
281
+ @%p20 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd50 + 0 ];
282
+ @!%p20 mov.u16 %rs7, %rs2;
283
+ .loc 1 55 105
284
+ cvt.f32.bf16 %r49, %rs1;
285
+ mov.b32 %f66, %r49;
286
+ cvt.f32.bf16 %r50, %rs3;
287
+ mov.b32 %f67, %r50;
288
+ cvt.f32.bf16 %r51, %rs5;
289
+ mov.b32 %f68, %r51;
290
+ cvt.f32.bf16 %r52, %rs7;
291
+ mov.b32 %f69, %r52;
292
+ .loc 1 56 35
293
+ add.s64 %rd51, %rd65, -9649344;
294
+ add.s64 %rd52, %rd65, -6432896;
295
+ add.s64 %rd53, %rd65, -3216448;
296
+ .loc 1 56 53
297
+ mov.u32 %r53, 0x0;
298
+ @%p20 ld.global.L1::evict_first.b32 { %r53 }, [ %rd51 + 0 ];
299
+ @!%p20 mov.u32 %r53, %r54;
300
+ mov.b32 %f70, %r53;
301
+ mov.u32 %r55, 0x0;
302
+ @%p20 ld.global.L1::evict_first.b32 { %r55 }, [ %rd52 + 0 ];
303
+ @!%p20 mov.u32 %r55, %r54;
304
+ mov.b32 %f71, %r55;
305
+ mov.u32 %r57, 0x0;
306
+ @%p20 ld.global.L1::evict_first.b32 { %r57 }, [ %rd53 + 0 ];
307
+ @!%p20 mov.u32 %r57, %r54;
308
+ mov.b32 %f72, %r57;
309
+ mov.u32 %r59, 0x0;
310
+ @%p20 ld.global.L1::evict_first.b32 { %r59 }, [ %rd65 + 0 ];
311
+ @!%p20 mov.u32 %r59, %r54;
312
+ mov.b32 %f73, %r59;
313
+ .loc 1 57 35
314
+ add.s64 %rd55, %rd9, %rd64;
315
+ add.s64 %rd56, %rd55, 1608224;
316
+ add.s64 %rd57, %rd55, 3216448;
317
+ .loc 1 57 53
318
+ add.s64 %rd58, %rd55, 4824672;
319
+ mov.u16 %rs13, 0x0;
320
+ @%p20 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd55 + 0 ];
321
+ @!%p20 mov.u16 %rs13, %rs2;
322
+ mov.u16 %rs15, 0x0;
323
+ @%p20 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd56 + 0 ];
324
+ @!%p20 mov.u16 %rs15, %rs2;
325
+ mov.u16 %rs17, 0x0;
326
+ @%p20 ld.global.L1::evict_first.b16 { %rs17 }, [ %rd57 + 0 ];
327
+ @!%p20 mov.u16 %rs17, %rs2;
328
+ mov.u16 %rs19, 0x0;
329
+ @%p20 ld.global.L1::evict_first.b16 { %rs19 }, [ %rd58 + 0 ];
330
+ @!%p20 mov.u16 %rs19, %rs2;
331
+ .loc 1 57 105
332
+ cvt.f32.bf16 %r61, %rs13;
333
+ mov.b32 %f74, %r61;
334
+ cvt.f32.bf16 %r62, %rs15;
335
+ mov.b32 %f75, %r62;
336
+ cvt.f32.bf16 %r63, %rs17;
337
+ mov.b32 %f76, %r63;
338
+ cvt.f32.bf16 %r64, %rs19;
339
+ mov.b32 %f77, %r64;
340
+ .loc 1 65 23
341
+ mul.f32 %f59, %f74, 0f3FB8AA3B;
342
+ ex2.approx.f32 %f58, %f59;
343
+ mul.f32 %f61, %f75, 0f3FB8AA3B;
344
+ ex2.approx.f32 %f60, %f61;
345
+ mul.f32 %f63, %f76, 0f3FB8AA3B;
346
+ ex2.approx.f32 %f62, %f63;
347
+ mul.f32 %f65, %f77, 0f3FB8AA3B;
348
+ ex2.approx.f32 %f64, %f65;
349
+ .loc 1 66 24
350
+ mul.f32 %f78, %f13, %f58;
351
+ mul.f32 %f79, %f14, %f60;
352
+ mul.f32 %f80, %f15, %f62;
353
+ mul.f32 %f81, %f16, %f64;
354
+ .loc 1 67 24
355
+ neg.f32 %f82, %f78;
356
+ fma.rn.f32 %f83, %f1, %f70, %f82;
357
+ neg.f32 %f84, %f79;
358
+ fma.rn.f32 %f85, %f2, %f71, %f84;
359
+ neg.f32 %f86, %f80;
360
+ fma.rn.f32 %f87, %f3, %f72, %f86;
361
+ neg.f32 %f88, %f81;
362
+ fma.rn.f32 %f89, %f4, %f73, %f88;
363
+ .loc 1 69 24
364
+ add.f32 %f90, %f66, %f83;
365
+ add.f32 %f91, %f67, %f85;
366
+ add.f32 %f92, %f68, %f87;
367
+ add.f32 %f93, %f69, %f89;
368
+ .loc 1 70 29
369
+ add.s64 %rd59, %rd7, %rd64;
370
+ add.s64 %rd60, %rd59, 1608224;
371
+ add.s64 %rd61, %rd59, 3216448;
372
+ .loc 1 70 54
373
+ add.s64 %rd62, %rd59, 4824672;
374
+ mov.b32 %r65, %f90;
375
+ cvt.rn.bf16.f32 %rs25, %r65;
376
+ mov.b32 %r66, %f91;
377
+ cvt.rn.bf16.f32 %rs26, %r66;
378
+ mov.b32 %r67, %f92;
379
+ cvt.rn.bf16.f32 %rs27, %r67;
380
+ mov.b32 %r68, %f93;
381
+ cvt.rn.bf16.f32 %rs28, %r68;
382
+ @%p20 st.global.b16 [ %rd59 + 0 ], { %rs25 };
383
+ @%p20 st.global.b16 [ %rd60 + 0 ], { %rs26 };
384
+ @%p20 st.global.b16 [ %rd61 + 0 ], { %rs27 };
385
+ @%p20 st.global.b16 [ %rd62 + 0 ], { %rs28 };
386
+ .loc 1 51 36
387
+ add.s64 %rd65, %rd65, 32;
388
+ add.s64 %rd64, %rd64, 16;
389
+ setp.lt.u32 %p48, %r71, 50249;
390
+ @%p48 bra $L__BB0_3;
391
+ .loc 1 51 4
392
+ ret;
393
+ $L__tmp26:
394
+ $L__func_end0:
395
+
396
+ }
397
+ .file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
398
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
399
+ .section .debug_abbrev
400
+ {
401
+ .b8 1
402
+ .b8 17
403
+ .b8 1
404
+ .b8 37
405
+ .b8 8
406
+ .b8 19
407
+ .b8 5
408
+ .b8 3
409
+ .b8 8
410
+ .b8 16
411
+ .b8 6
412
+ .b8 27
413
+ .b8 8
414
+ .b8 180
415
+ .b8 66
416
+ .b8 12
417
+ .b8 17
418
+ .b8 1
419
+ .b8 18
420
+ .b8 1
421
+ .b8 0
422
+ .b8 0
423
+ .b8 2
424
+ .b8 46
425
+ .b8 0
426
+ .b8 135
427
+ .b8 64
428
+ .b8 8
429
+ .b8 3
430
+ .b8 8
431
+ .b8 58
432
+ .b8 11
433
+ .b8 59
434
+ .b8 11
435
+ .b8 63
436
+ .b8 12
437
+ .b8 32
438
+ .b8 11
439
+ .b8 0
440
+ .b8 0
441
+ .b8 3
442
+ .b8 46
443
+ .b8 1
444
+ .b8 17
445
+ .b8 1
446
+ .b8 18
447
+ .b8 1
448
+ .b8 64
449
+ .b8 10
450
+ .b8 49
451
+ .b8 19
452
+ .b8 0
453
+ .b8 0
454
+ .b8 4
455
+ .b8 29
456
+ .b8 0
457
+ .b8 49
458
+ .b8 19
459
+ .b8 17
460
+ .b8 1
461
+ .b8 18
462
+ .b8 1
463
+ .b8 88
464
+ .b8 11
465
+ .b8 89
466
+ .b8 11
467
+ .b8 87
468
+ .b8 11
469
+ .b8 0
470
+ .b8 0
471
+ .b8 5
472
+ .b8 29
473
+ .b8 1
474
+ .b8 49
475
+ .b8 19
476
+ .b8 17
477
+ .b8 1
478
+ .b8 18
479
+ .b8 1
480
+ .b8 88
481
+ .b8 11
482
+ .b8 89
483
+ .b8 11
484
+ .b8 87
485
+ .b8 11
486
+ .b8 0
487
+ .b8 0
488
+ .b8 0
489
+ }
490
+ .section .debug_info
491
+ {
492
+ .b32 278
493
+ .b8 2
494
+ .b8 0
495
+ .b32 .debug_abbrev
496
+ .b8 8
497
+ .b8 1
498
+ .b8 116
499
+ .b8 114
500
+ .b8 105
501
+ .b8 116
502
+ .b8 111
503
+ .b8 110
504
+ .b8 0
505
+ .b8 2
506
+ .b8 0
507
+ .b8 99
508
+ .b8 107
509
+ .b8 122
510
+ .b8 103
511
+ .b8 108
512
+ .b8 55
513
+ .b8 116
514
+ .b8 104
515
+ .b8 98
516
+ .b8 52
517
+ .b8 120
518
+ .b8 100
519
+ .b8 102
520
+ .b8 107
521
+ .b8 102
522
+ .b8 110
523
+ .b8 100
524
+ .b8 50
525
+ .b8 116
526
+ .b8 105
527
+ .b8 100
528
+ .b8 107
529
+ .b8 115
530
+ .b8 54
531
+ .b8 109
532
+ .b8 116
533
+ .b8 53
534
+ .b8 102
535
+ .b8 51
536
+ .b8 104
537
+ .b8 97
538
+ .b8 117
539
+ .b8 119
540
+ .b8 102
541
+ .b8 121
542
+ .b8 106
543
+ .b8 102
544
+ .b8 108
545
+ .b8 98
546
+ .b8 116
547
+ .b8 122
548
+ .b8 121
549
+ .b8 101
550
+ .b8 112
551
+ .b8 111
552
+ .b8 53
553
+ .b8 111
554
+ .b8 120
555
+ .b8 107
556
+ .b8 118
557
+ .b8 104
558
+ .b8 107
559
+ .b8 46
560
+ .b8 112
561
+ .b8 121
562
+ .b8 0
563
+ .b32 .debug_line
564
+ .b8 47
565
+ .b8 116
566
+ .b8 109
567
+ .b8 112
568
+ .b8 47
569
+ .b8 116
570
+ .b8 111
571
+ .b8 114
572
+ .b8 99
573
+ .b8 104
574
+ .b8 105
575
+ .b8 110
576
+ .b8 100
577
+ .b8 117
578
+ .b8 99
579
+ .b8 116
580
+ .b8 111
581
+ .b8 114
582
+ .b8 95
583
+ .b8 114
584
+ .b8 111
585
+ .b8 111
586
+ .b8 116
587
+ .b8 47
588
+ .b8 107
589
+ .b8 122
590
+ .b8 0
591
+ .b8 1
592
+ .b64 $L__func_begin0
593
+ .b64 $L__func_end0
594
+ .b8 2
595
+ .b8 116
596
+ .b8 114
597
+ .b8 105
598
+ .b8 116
599
+ .b8 111
600
+ .b8 110
601
+ .b8 95
602
+ .b8 95
603
+ .b8 48
604
+ .b8 100
605
+ .b8 49
606
+ .b8 100
607
+ .b8 50
608
+ .b8 100
609
+ .b8 51
610
+ .b8 100
611
+ .b8 52
612
+ .b8 100
613
+ .b8 53
614
+ .b8 100
615
+ .b8 54
616
+ .b8 100
617
+ .b8 55
618
+ .b8 100
619
+ .b8 101
620
+ .b8 56
621
+ .b8 0
622
+ .b8 116
623
+ .b8 114
624
+ .b8 105
625
+ .b8 116
626
+ .b8 111
627
+ .b8 110
628
+ .b8 95
629
+ .b8 95
630
+ .b8 48
631
+ .b8 100
632
+ .b8 49
633
+ .b8 100
634
+ .b8 50
635
+ .b8 100
636
+ .b8 51
637
+ .b8 100
638
+ .b8 52
639
+ .b8 100
640
+ .b8 53
641
+ .b8 100
642
+ .b8 54
643
+ .b8 100
644
+ .b8 55
645
+ .b8 100
646
+ .b8 101
647
+ .b8 56
648
+ .b8 0
649
+ .b8 1
650
+ .b8 18
651
+ .b8 1
652
+ .b8 1
653
+ .b8 3
654
+ .b64 $L__func_begin0
655
+ .b64 $L__func_end0
656
+ .b8 1
657
+ .b8 156
658
+ .b32 125
659
+ .b8 4
660
+ .b32 125
661
+ .b64 $L__tmp1
662
+ .b64 $L__tmp24
663
+ .b8 2
664
+ .b8 46
665
+ .b8 27
666
+ .b8 5
667
+ .b32 125
668
+ .b64 $L__tmp2
669
+ .b64 $L__tmp25
670
+ .b8 2
671
+ .b8 46
672
+ .b8 27
673
+ .b8 4
674
+ .b32 125
675
+ .b64 $L__tmp2
676
+ .b64 $L__tmp25
677
+ .b8 2
678
+ .b8 243
679
+ .b8 36
680
+ .b8 0
681
+ .b8 0
682
+ .b8 0
683
+ }
684
+ .section .debug_pubnames
685
+ {
686
+ .b32 $L__pubNames_end0-$L__pubNames_start0
687
+ $L__pubNames_start0:
688
+ .b8 2
689
+ .b8 0
690
+ .b32 .debug_info
691
+ .b32 282
692
+ .b32 125
693
+ .b8 116
694
+ .b8 114
695
+ .b8 105
696
+ .b8 116
697
+ .b8 111
698
+ .b8 110
699
+ .b8 95
700
+ .b8 95
701
+ .b8 48
702
+ .b8 100
703
+ .b8 49
704
+ .b8 100
705
+ .b8 50
706
+ .b8 100
707
+ .b8 51
708
+ .b8 100
709
+ .b8 52
710
+ .b8 100
711
+ .b8 53
712
+ .b8 100
713
+ .b8 54
714
+ .b8 100
715
+ .b8 55
716
+ .b8 100
717
+ .b8 101
718
+ .b8 56
719
+ .b8 0
720
+ .b32 0
721
+ $L__pubNames_end0:
722
+ }
723
+ .section .debug_pubtypes
724
+ {
725
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
726
+ $L__pubTypes_start0:
727
+ .b8 2
728
+ .b8 0
729
+ .b32 .debug_info
730
+ .b32 282
731
+ .b32 0
732
+ $L__pubTypes_end0:
733
+ }
734
+ .section .debug_loc { }
.triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.ttgir ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x1xf32, #blocked>
5
+ %cst_0 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>
6
+ %cst_1 = arith.constant dense<-1> : tensor<64x1xi64, #blocked>
7
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x8xf32, #blocked>
8
+ %c64_i64 = arith.constant 64 : i64
9
+ %cst_3 = arith.constant dense<50257> : tensor<1x8xi64, #blocked>
10
+ %c0_i32 = arith.constant 0 : i32
11
+ %c8_i32 = arith.constant 8 : i32
12
+ %c50257_i32 = arith.constant 50257 : i32
13
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x8xbf16, #blocked>
14
+ %0 = tt.get_program_id x : i32
15
+ %1 = arith.extsi %0 : i32 to i64
16
+ %2 = arith.muli %1, %c64_i64 : i64
17
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
18
+ %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
19
+ %5 = arith.extsi %4 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked>
20
+ %6 = tt.splat %2 : (i64) -> tensor<64x1xi64, #blocked>
21
+ %7 = arith.addi %6, %5 : tensor<64x1xi64, #blocked>
22
+ %8 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
23
+ %9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x8xi32, #blocked>
24
+ %10 = arith.extsi %9 : tensor<1x8xi32, #blocked> to tensor<1x8xi64, #blocked>
25
+ %11 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
26
+ %12 = tt.addptr %11, %7 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi64, #blocked>
27
+ %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>
28
+ %14 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
29
+ %15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
30
+ %16 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
31
+ %17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
32
+ %18 = arith.muli %7, %cst_0 : tensor<64x1xi64, #blocked>
33
+ %19 = tt.broadcast %18 : (tensor<64x1xi64, #blocked>) -> tensor<64x8xi64, #blocked>
34
+ %20 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>, #blocked>
35
+ %21 = arith.cmpi ne, %13, %cst_1 : tensor<64x1xi64, #blocked>
36
+ %22 = arith.divf %15, %17 : f32
37
+ %23 = tt.splat %22 : (f32) -> tensor<64x1xf32, #blocked>
38
+ %24 = arith.select %21, %23, %cst : tensor<64x1xi1, #blocked>, tensor<64x1xf32, #blocked>
39
+ %25 = tt.broadcast %24 : (tensor<64x1xf32, #blocked>) -> tensor<64x8xf32, #blocked>
40
+ %26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c8_i32 iter_args(%arg10 = %cst_2) -> (tensor<64x8xf32, #blocked>) : i32 {
41
+ %33 = arith.extsi %arg9 : i32 to i64
42
+ %34 = tt.splat %33 : (i64) -> tensor<1x8xi64, #blocked>
43
+ %35 = arith.addi %34, %10 : tensor<1x8xi64, #blocked>
44
+ %36 = arith.cmpi slt, %35, %cst_3 : tensor<1x8xi64, #blocked>
45
+ %37 = tt.broadcast %35 : (tensor<1x8xi64, #blocked>) -> tensor<64x8xi64, #blocked>
46
+ %38 = arith.addi %37, %19 : tensor<64x8xi64, #blocked>
47
+ %39 = tt.addptr %20, %38 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi64, #blocked>
48
+ %40 = tt.broadcast %36 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
49
+ %41 = tt.load %39, %40, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
50
+ %42 = arith.mulf %41, %25 : tensor<64x8xf32, #blocked>
51
+ %43 = arith.addf %arg10, %42 : tensor<64x8xf32, #blocked>
52
+ %44 = arith.select %40, %43, %arg10 : tensor<64x8xi1, #blocked>, tensor<64x8xf32, #blocked>
53
+ scf.yield %44 : tensor<64x8xf32, #blocked>
54
+ }
55
+ %27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({
56
+ ^bb0(%arg9: f32, %arg10: f32):
57
+ %33 = arith.addf %arg9, %arg10 : f32
58
+ tt.reduce.return %33 : f32
59
+ }) : (tensor<64x8xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
60
+ %28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
61
+ %29 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
62
+ %30 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
63
+ %31 = tt.broadcast %28 : (tensor<64x1xf32, #blocked>) -> tensor<64x8xf32, #blocked>
64
+ %32 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>, #blocked>
65
+ scf.for %arg9 = %c0_i32 to %c50257_i32 step %c8_i32 : i32 {
66
+ %33 = arith.extsi %arg9 : i32 to i64
67
+ %34 = tt.splat %33 : (i64) -> tensor<1x8xi64, #blocked>
68
+ %35 = arith.addi %34, %10 : tensor<1x8xi64, #blocked>
69
+ %36 = arith.cmpi slt, %35, %cst_3 : tensor<1x8xi64, #blocked>
70
+ %37 = tt.broadcast %35 : (tensor<1x8xi64, #blocked>) -> tensor<64x8xi64, #blocked>
71
+ %38 = arith.addi %37, %19 : tensor<64x8xi64, #blocked>
72
+ %39 = tt.addptr %29, %38 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi64, #blocked>
73
+ %40 = tt.broadcast %36 : (tensor<1x8xi1, #blocked>) -> tensor<64x8xi1, #blocked>
74
+ %41 = tt.load %39, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
75
+ %42 = arith.extf %41 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
76
+ %43 = tt.addptr %20, %38 : tensor<64x8x!tt.ptr<f32, 1>, #blocked>, tensor<64x8xi64, #blocked>
77
+ %44 = tt.load %43, %40, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32, #blocked>
78
+ %45 = tt.addptr %30, %38 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi64, #blocked>
79
+ %46 = tt.load %45, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16, #blocked>
80
+ %47 = arith.extf %46 : tensor<64x8xbf16, #blocked> to tensor<64x8xf32, #blocked>
81
+ %48 = arith.mulf %44, %25 : tensor<64x8xf32, #blocked>
82
+ %49 = math.exp %47 : tensor<64x8xf32, #blocked>
83
+ %50 = arith.mulf %49, %31 : tensor<64x8xf32, #blocked>
84
+ %51 = arith.subf %48, %50 : tensor<64x8xf32, #blocked>
85
+ %52 = arith.addf %42, %51 : tensor<64x8xf32, #blocked>
86
+ %53 = tt.addptr %32, %38 : tensor<64x8x!tt.ptr<bf16, 1>, #blocked>, tensor<64x8xi64, #blocked>
87
+ %54 = arith.truncf %52 : tensor<64x8xf32, #blocked> to tensor<64x8xbf16, #blocked>
88
+ tt.store %53, %54, %40 {cache = 1 : i32, evict = 1 : i32} : tensor<64x8xbf16, #blocked>
89
+ }
90
+ tt.return
91
+ }
92
+ }
.triton/dump/0ef13ec90cf21db4d33a072ff09ec2d4/triton_.ttir ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
3
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x8xbf16>
4
+ %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x1xf32>
5
+ %c50257_i32 = arith.constant 50257 : i32
6
+ %c8_i32 = arith.constant 8 : i32
7
+ %c0_i32 = arith.constant 0 : i32
8
+ %cst_1 = arith.constant dense<50257> : tensor<64x1xi64>
9
+ %cst_2 = arith.constant dense<50257> : tensor<1x8xi64>
10
+ %c64_i64 = arith.constant 64 : i64
11
+ %cst_3 = arith.constant dense<-1> : tensor<64x1xi64>
12
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x8xf32>
13
+ %0 = tt.get_program_id x : i32
14
+ %1 = arith.extsi %0 : i32 to i64
15
+ %2 = arith.muli %1, %c64_i64 : i64
16
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
17
+ %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32>) -> tensor<64x1xi32>
18
+ %5 = arith.extsi %4 : tensor<64x1xi32> to tensor<64x1xi64>
19
+ %6 = tt.splat %2 : (i64) -> tensor<64x1xi64>
20
+ %7 = arith.addi %6, %5 : tensor<64x1xi64>
21
+ %8 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32>
22
+ %9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<8xi32>) -> tensor<1x8xi32>
23
+ %10 = arith.extsi %9 : tensor<1x8xi32> to tensor<1x8xi64>
24
+ %11 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>>
25
+ %12 = tt.addptr %11, %7 : tensor<64x1x!tt.ptr<i64, 1>>, tensor<64x1xi64>
26
+ %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64>
27
+ %14 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
28
+ %15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
29
+ %16 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
30
+ %17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
31
+ %18 = arith.muli %7, %cst_1 : tensor<64x1xi64>
32
+ %19 = tt.broadcast %18 : (tensor<64x1xi64>) -> tensor<64x8xi64>
33
+ %20 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
34
+ %21 = arith.cmpi ne, %13, %cst_3 : tensor<64x1xi64>
35
+ %22 = arith.divf %15, %17 : f32
36
+ %23 = tt.splat %22 : (f32) -> tensor<64x1xf32>
37
+ %24 = arith.select %21, %23, %cst_0 : tensor<64x1xi1>, tensor<64x1xf32>
38
+ %25 = tt.broadcast %24 : (tensor<64x1xf32>) -> tensor<64x8xf32>
39
+ %26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c8_i32 iter_args(%arg10 = %cst_4) -> (tensor<64x8xf32>) : i32 {
40
+ %41 = arith.extsi %arg9 : i32 to i64
41
+ %42 = tt.splat %41 : (i64) -> tensor<1x8xi64>
42
+ %43 = arith.addi %42, %10 : tensor<1x8xi64>
43
+ %44 = arith.cmpi slt, %43, %cst_2 : tensor<1x8xi64>
44
+ %45 = tt.broadcast %43 : (tensor<1x8xi64>) -> tensor<64x8xi64>
45
+ %46 = arith.addi %45, %19 : tensor<64x8xi64>
46
+ %47 = tt.addptr %20, %46 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi64>
47
+ %48 = tt.broadcast %44 : (tensor<1x8xi1>) -> tensor<64x8xi1>
48
+ %49 = tt.load %47, %48, %cst_4 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x8xf32>
49
+ %50 = arith.mulf %49, %25 : tensor<64x8xf32>
50
+ %51 = arith.addf %arg10, %50 : tensor<64x8xf32>
51
+ %52 = arith.select %48, %51, %arg10 : tensor<64x8xi1>, tensor<64x8xf32>
52
+ scf.yield %52 : tensor<64x8xf32>
53
+ }
54
+ %27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({
55
+ ^bb0(%arg9: f32, %arg10: f32):
56
+ %41 = arith.addf %arg9, %arg10 : f32
57
+ tt.reduce.return %41 : f32
58
+ }) : (tensor<64x8xf32>) -> tensor<64xf32>
59
+ %28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<64xf32>) -> tensor<64x1xf32>
60
+ %29 = arith.muli %7, %cst_1 : tensor<64x1xi64>
61
+ %30 = tt.broadcast %29 : (tensor<64x1xi64>) -> tensor<64x8xi64>
62
+ %31 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
63
+ %32 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x8x!tt.ptr<f32, 1>>
64
+ %33 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
65
+ %34 = arith.cmpi ne, %13, %cst_3 : tensor<64x1xi64>
66
+ %35 = arith.divf %15, %17 : f32
67
+ %36 = tt.splat %35 : (f32) -> tensor<64x1xf32>
68
+ %37 = arith.select %34, %36, %cst_0 : tensor<64x1xi1>, tensor<64x1xf32>
69
+ %38 = tt.broadcast %37 : (tensor<64x1xf32>) -> tensor<64x8xf32>
70
+ %39 = tt.broadcast %28 : (tensor<64x1xf32>) -> tensor<64x8xf32>
71
+ %40 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<64x8x!tt.ptr<bf16, 1>>
72
+ scf.for %arg9 = %c0_i32 to %c50257_i32 step %c8_i32 : i32 {
73
+ %41 = arith.extsi %arg9 : i32 to i64
74
+ %42 = tt.splat %41 : (i64) -> tensor<1x8xi64>
75
+ %43 = arith.addi %42, %10 : tensor<1x8xi64>
76
+ %44 = arith.cmpi slt, %43, %cst_2 : tensor<1x8xi64>
77
+ %45 = tt.broadcast %43 : (tensor<1x8xi64>) -> tensor<64x8xi64>
78
+ %46 = arith.addi %45, %30 : tensor<64x8xi64>
79
+ %47 = tt.addptr %31, %46 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi64>
80
+ %48 = tt.broadcast %44 : (tensor<1x8xi1>) -> tensor<64x8xi1>
81
+ %49 = tt.load %47, %48, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
82
+ %50 = arith.extf %49 : tensor<64x8xbf16> to tensor<64x8xf32>
83
+ %51 = tt.addptr %32, %46 : tensor<64x8x!tt.ptr<f32, 1>>, tensor<64x8xi64>
84
+ %52 = tt.load %51, %48, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xf32>
85
+ %53 = tt.addptr %33, %46 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi64>
86
+ %54 = tt.load %53, %48, %cst {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x8xbf16>
87
+ %55 = arith.extf %54 : tensor<64x8xbf16> to tensor<64x8xf32>
88
+ %56 = arith.mulf %52, %38 : tensor<64x8xf32>
89
+ %57 = math.exp %55 : tensor<64x8xf32>
90
+ %58 = arith.mulf %57, %39 : tensor<64x8xf32>
91
+ %59 = arith.subf %56, %58 : tensor<64x8xf32>
92
+ %60 = arith.addf %50, %59 : tensor<64x8xf32>
93
+ %61 = tt.addptr %40, %46 : tensor<64x8x!tt.ptr<bf16, 1>>, tensor<64x8xi64>
94
+ %62 = arith.truncf %60 : tensor<64x8xf32> to tensor<64x8xbf16>
95
+ tt.store %61, %62, %48 {cache = 1 : i32, evict = 1 : i32} : tensor<64x8xbf16>
96
+ }
97
+ tt.return
98
+ }
99
+ }
.triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.cubin ADDED
Binary file (4.52 kB). View file
 
.triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.llir ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ define void @triton__0d1de(ptr addrspace(1) %0, i64 %1) local_unnamed_addr !dbg !5 {
5
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
6
+ %4 = shl i32 %3, 1, !dbg !8
7
+ %5 = and i32 %4, 510, !dbg !8
8
+ %6 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #1, !dbg !9
9
+ %7 = sext i32 %6 to i64, !dbg !10
10
+ %8 = shl nsw i64 %7, 9, !dbg !11
11
+ %9 = zext nneg i32 %5 to i64
12
+ %10 = or i64 %8, %9, !dbg !12
13
+ %11 = getelementptr float, ptr addrspace(1) %0, i64 %10, !dbg !13
14
+ tail call void asm sideeffect "@$3 st.global.v2.b32 [ $2 + 0 ], { $0, $1 };", "r,r,l,b"(i32 0, i32 0, ptr addrspace(1) %11, i1 true) #1, !dbg !14
15
+ ret void, !dbg !15
16
+ }
17
+
18
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
19
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
20
+
21
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
22
+ attributes #1 = { nounwind }
23
+
24
+ !llvm.module.flags = !{!0}
25
+ !llvm.dbg.cu = !{!1}
26
+ !nvvm.annotations = !{!3, !4, !4, !3}
27
+
28
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
29
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
30
+ !2 = !DIFile(filename: "cpkw3bdoamlgzvqjeyuk34b3jcjf57htisara7lukflexo3t22ew.py", directory: "/tmp/torchinductor_root/pk")
31
+ !3 = !{ptr @triton__0d1de, !"kernel", i32 1}
32
+ !4 = !{ptr @triton__0d1de, !"maxntidx", i32 256}
33
+ !5 = distinct !DISubprogram(name: "triton__0d1de", linkageName: "triton__0d1de", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
34
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
35
+ !7 = !{}
36
+ !8 = !DILocation(line: 21, column: 36, scope: !5)
37
+ !9 = !DILocation(line: 20, column: 28, scope: !5)
38
+ !10 = !DILocation(line: 20, column: 34, scope: !5)
39
+ !11 = !DILocation(line: 20, column: 46, scope: !5)
40
+ !12 = !DILocation(line: 21, column: 23, scope: !5)
41
+ !13 = !DILocation(line: 25, column: 25, scope: !5)
42
+ !14 = !DILocation(line: 25, column: 36, scope: !5)
43
+ !15 = !DILocation(line: 25, column: 4, scope: !5)
.triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.ttgir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [2], threadsPerWarp = [32], warpsPerCTA = [8], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<512xf32, #blocked>
5
+ %c512_i64 = arith.constant 512 : i64
6
+ %0 = tt.get_program_id x : i32
7
+ %1 = arith.extsi %0 : i32 to i64
8
+ %2 = arith.muli %1, %c512_i64 : i64
9
+ %3 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #blocked>
10
+ %4 = arith.extsi %3 : tensor<512xi32, #blocked> to tensor<512xi64, #blocked>
11
+ %5 = tt.splat %2 : (i64) -> tensor<512xi64, #blocked>
12
+ %6 = arith.addi %5, %4 : tensor<512xi64, #blocked>
13
+ %7 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>, #blocked>
14
+ %8 = tt.addptr %7, %6 : tensor<512x!tt.ptr<f32, 1>, #blocked>, tensor<512xi64, #blocked>
15
+ tt.store %8, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32, #blocked>
16
+ tt.return
17
+ }
18
+ }
.triton/dump/246118bec10f09cdce32d0be7c22b5ae/triton_.ttir ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {
2
+ tt.func public @triton__0d1de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {
3
+ %c512_i64 = arith.constant 512 : i64
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<512xf32>
5
+ %0 = tt.get_program_id x : i32
6
+ %1 = arith.extsi %0 : i32 to i64
7
+ %2 = arith.muli %1, %c512_i64 : i64
8
+ %3 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
9
+ %4 = arith.extsi %3 : tensor<512xi32> to tensor<512xi64>
10
+ %5 = tt.splat %2 : (i64) -> tensor<512xi64>
11
+ %6 = arith.addi %5, %4 : tensor<512xi64>
12
+ %7 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
13
+ %8 = tt.addptr %7, %6 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi64>
14
+ tt.store %8, %cst {cache = 1 : i32, evict = 1 : i32} : tensor<512xf32>
15
+ tt.return
16
+ }
17
+ }
.triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.cubin ADDED
Binary file (52.1 kB). View file
 
.triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.llir ADDED
@@ -0,0 +1,793 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 {
7
+ %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %11 = lshr i32 %10, 5, !dbg !8
9
+ %urem = and i32 %10, 255, !dbg !9
10
+ %12 = or i32 %urem, 256, !dbg !9
11
+ %13 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
12
+ %14 = sext i32 %13 to i64, !dbg !11
13
+ %15 = shl nsw i64 %14, 3, !dbg !12
14
+ %16 = or i64 %15, 1, !dbg !13
15
+ %17 = or i64 %15, 2, !dbg !13
16
+ %18 = or i64 %15, 3, !dbg !13
17
+ %19 = or i64 %15, 4, !dbg !13
18
+ %20 = or i64 %15, 5, !dbg !13
19
+ %21 = or i64 %15, 6, !dbg !13
20
+ %22 = or i64 %15, 7, !dbg !13
21
+ %23 = insertelement <2 x i32> poison, i32 %urem, i64 0
22
+ %24 = insertelement <2 x i32> %23, i32 %12, i64 1
23
+ %25 = zext nneg <2 x i32> %24 to <2 x i64>
24
+ %26 = getelementptr i64, ptr addrspace(1) %1, i64 %15, !dbg !14
25
+ %27 = getelementptr i64, ptr addrspace(1) %1, i64 %16, !dbg !14
26
+ %28 = getelementptr i64, ptr addrspace(1) %1, i64 %17, !dbg !14
27
+ %29 = getelementptr i64, ptr addrspace(1) %1, i64 %18, !dbg !14
28
+ %30 = getelementptr i64, ptr addrspace(1) %1, i64 %19, !dbg !14
29
+ %31 = getelementptr i64, ptr addrspace(1) %1, i64 %20, !dbg !14
30
+ %32 = getelementptr i64, ptr addrspace(1) %1, i64 %21, !dbg !14
31
+ %33 = getelementptr i64, ptr addrspace(1) %1, i64 %22, !dbg !14
32
+ %34 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %26, i1 true) #3, !dbg !15
33
+ %35 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %27, i1 true) #3, !dbg !15
34
+ %36 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %28, i1 true) #3, !dbg !15
35
+ %37 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %29, i1 true) #3, !dbg !15
36
+ %38 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %30, i1 true) #3, !dbg !15
37
+ %39 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %31, i1 true) #3, !dbg !15
38
+ %40 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %32, i1 true) #3, !dbg !15
39
+ %41 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %33, i1 true) #3, !dbg !15
40
+ %42 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #3, !dbg !16
41
+ %43 = bitcast i32 %42 to float, !dbg !16
42
+ %44 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #3, !dbg !17
43
+ %45 = bitcast i32 %44 to float, !dbg !17
44
+ %46 = mul nsw i64 %14, 402056, !dbg !18
45
+ %47 = mul nsw i64 %16, 50257, !dbg !18
46
+ %48 = mul nsw i64 %17, 50257, !dbg !18
47
+ %49 = mul nsw i64 %18, 50257, !dbg !18
48
+ %50 = mul nsw i64 %19, 50257, !dbg !18
49
+ %51 = mul nsw i64 %20, 50257, !dbg !18
50
+ %52 = mul nsw i64 %21, 50257, !dbg !18
51
+ %53 = mul nsw i64 %22, 50257, !dbg !18
52
+ %54 = insertelement <8 x i64> poison, i64 %34, i64 0, !dbg !19
53
+ %55 = insertelement <8 x i64> %54, i64 %35, i64 1, !dbg !19
54
+ %56 = insertelement <8 x i64> %55, i64 %36, i64 2, !dbg !19
55
+ %57 = insertelement <8 x i64> %56, i64 %37, i64 3, !dbg !19
56
+ %58 = insertelement <8 x i64> %57, i64 %38, i64 4, !dbg !19
57
+ %59 = insertelement <8 x i64> %58, i64 %39, i64 5, !dbg !19
58
+ %60 = insertelement <8 x i64> %59, i64 %40, i64 6, !dbg !19
59
+ %61 = insertelement <8 x i64> %60, i64 %41, i64 7, !dbg !19
60
+ %62 = icmp eq <8 x i64> %61, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, !dbg !19
61
+ %63 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %43, float %45) #3, !dbg !20
62
+ %64 = insertelement <8 x float> poison, float %63, i64 0, !dbg !21
63
+ %65 = shufflevector <8 x float> %64, <8 x float> poison, <8 x i32> zeroinitializer, !dbg !21
64
+ %66 = select <8 x i1> %62, <8 x float> zeroinitializer, <8 x float> %65, !dbg !21
65
+ %67 = shufflevector <8 x float> %66, <8 x float> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>, !dbg !21
66
+ br label %68, !dbg !22
67
+
68
+ 68: ; preds = %9, %68
69
+ %69 = phi i32 [ 0, %9 ], [ %135, %68 ]
70
+ %70 = phi <16 x float> [ zeroinitializer, %9 ], [ %134, %68 ]
71
+ %71 = zext nneg i32 %69 to i64, !dbg !23
72
+ %72 = insertelement <2 x i64> poison, i64 %71, i64 0, !dbg !23
73
+ %73 = shufflevector <2 x i64> %72, <2 x i64> poison, <2 x i32> zeroinitializer, !dbg !23
74
+ %74 = or <2 x i64> %73, %25, !dbg !23
75
+ %75 = icmp ult <2 x i64> %74, <i64 50257, i64 50257>, !dbg !24
76
+ %76 = shufflevector <2 x i1> %75, <2 x i1> poison, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, !dbg !24
77
+ %77 = extractelement <2 x i64> %74, i64 0, !dbg !25
78
+ %78 = getelementptr float, ptr addrspace(1) %0, i64 %77, !dbg !25
79
+ %79 = getelementptr float, ptr addrspace(1) %78, i64 %46, !dbg !25
80
+ %80 = extractelement <2 x i64> %74, i64 1, !dbg !25
81
+ %81 = getelementptr float, ptr addrspace(1) %0, i64 %80, !dbg !25
82
+ %82 = getelementptr float, ptr addrspace(1) %81, i64 %46, !dbg !25
83
+ %83 = getelementptr float, ptr addrspace(1) %78, i64 %47, !dbg !25
84
+ %84 = getelementptr float, ptr addrspace(1) %81, i64 %47, !dbg !25
85
+ %85 = getelementptr float, ptr addrspace(1) %78, i64 %48, !dbg !25
86
+ %86 = getelementptr float, ptr addrspace(1) %81, i64 %48, !dbg !25
87
+ %87 = getelementptr float, ptr addrspace(1) %78, i64 %49, !dbg !25
88
+ %88 = getelementptr float, ptr addrspace(1) %81, i64 %49, !dbg !25
89
+ %89 = getelementptr float, ptr addrspace(1) %78, i64 %50, !dbg !25
90
+ %90 = getelementptr float, ptr addrspace(1) %81, i64 %50, !dbg !25
91
+ %91 = getelementptr float, ptr addrspace(1) %78, i64 %51, !dbg !25
92
+ %92 = getelementptr float, ptr addrspace(1) %81, i64 %51, !dbg !25
93
+ %93 = getelementptr float, ptr addrspace(1) %78, i64 %52, !dbg !25
94
+ %94 = getelementptr float, ptr addrspace(1) %81, i64 %52, !dbg !25
95
+ %95 = getelementptr float, ptr addrspace(1) %78, i64 %53, !dbg !25
96
+ %96 = getelementptr float, ptr addrspace(1) %81, i64 %53, !dbg !25
97
+ %97 = extractelement <2 x i1> %75, i64 0, !dbg !26
98
+ %98 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %79, i1 %97, i32 0, i1 %97) #3, !dbg !26
99
+ %99 = extractelement <2 x i1> %75, i64 1, !dbg !26
100
+ %100 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %82, i1 %99, i32 0, i1 %99) #3, !dbg !26
101
+ %101 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %83, i1 %97, i32 0, i1 %97) #3, !dbg !26
102
+ %102 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %84, i1 %99, i32 0, i1 %99) #3, !dbg !26
103
+ %103 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %85, i1 %97, i32 0, i1 %97) #3, !dbg !26
104
+ %104 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %86, i1 %99, i32 0, i1 %99) #3, !dbg !26
105
+ %105 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %87, i1 %97, i32 0, i1 %97) #3, !dbg !26
106
+ %106 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %88, i1 %99, i32 0, i1 %99) #3, !dbg !26
107
+ %107 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %89, i1 %97, i32 0, i1 %97) #3, !dbg !26
108
+ %108 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %90, i1 %99, i32 0, i1 %99) #3, !dbg !26
109
+ %109 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %91, i1 %97, i32 0, i1 %97) #3, !dbg !26
110
+ %110 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %92, i1 %99, i32 0, i1 %99) #3, !dbg !26
111
+ %111 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %93, i1 %97, i32 0, i1 %97) #3, !dbg !26
112
+ %112 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %94, i1 %99, i32 0, i1 %99) #3, !dbg !26
113
+ %113 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %95, i1 %97, i32 0, i1 %97) #3, !dbg !26
114
+ %114 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %96, i1 %99, i32 0, i1 %99) #3, !dbg !26
115
+ %115 = insertelement <16 x i32> poison, i32 %98, i64 0, !dbg !26
116
+ %116 = insertelement <16 x i32> %115, i32 %100, i64 1, !dbg !26
117
+ %117 = insertelement <16 x i32> %116, i32 %101, i64 2, !dbg !26
118
+ %118 = insertelement <16 x i32> %117, i32 %102, i64 3, !dbg !26
119
+ %119 = insertelement <16 x i32> %118, i32 %103, i64 4, !dbg !26
120
+ %120 = insertelement <16 x i32> %119, i32 %104, i64 5, !dbg !26
121
+ %121 = insertelement <16 x i32> %120, i32 %105, i64 6, !dbg !26
122
+ %122 = insertelement <16 x i32> %121, i32 %106, i64 7, !dbg !26
123
+ %123 = insertelement <16 x i32> %122, i32 %107, i64 8, !dbg !26
124
+ %124 = insertelement <16 x i32> %123, i32 %108, i64 9, !dbg !26
125
+ %125 = insertelement <16 x i32> %124, i32 %109, i64 10, !dbg !26
126
+ %126 = insertelement <16 x i32> %125, i32 %110, i64 11, !dbg !26
127
+ %127 = insertelement <16 x i32> %126, i32 %111, i64 12, !dbg !26
128
+ %128 = insertelement <16 x i32> %127, i32 %112, i64 13, !dbg !26
129
+ %129 = insertelement <16 x i32> %128, i32 %113, i64 14, !dbg !26
130
+ %130 = insertelement <16 x i32> %129, i32 %114, i64 15, !dbg !26
131
+ %131 = bitcast <16 x i32> %130 to <16 x float>, !dbg !26
132
+ %132 = fmul <16 x float> %67, %131, !dbg !27
133
+ %133 = select <16 x i1> %76, <16 x float> %132, <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !28
134
+ %134 = fadd <16 x float> %70, %133, !dbg !28
135
+ %135 = add nuw nsw i32 %69, 512, !dbg !22
136
+ %136 = icmp ult i32 %69, 49745, !dbg !22
137
+ br i1 %136, label %68, label %137, !dbg !22
138
+
139
+ 137: ; preds = %68
140
+ %138 = and i32 %10, 31, !dbg !8
141
+ %139 = and i32 %11, 7, !dbg !9
142
+ %shift = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
143
+ %140 = fadd <16 x float> %134, %shift, !dbg !29
144
+ %141 = extractelement <16 x float> %140, i64 0, !dbg !29
145
+ %shift54 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
146
+ %142 = fadd <16 x float> %134, %shift54, !dbg !29
147
+ %143 = extractelement <16 x float> %142, i64 2, !dbg !29
148
+ %shift55 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
149
+ %144 = fadd <16 x float> %134, %shift55, !dbg !29
150
+ %145 = extractelement <16 x float> %144, i64 4, !dbg !29
151
+ %shift56 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
152
+ %146 = fadd <16 x float> %134, %shift56, !dbg !29
153
+ %147 = extractelement <16 x float> %146, i64 6, !dbg !29
154
+ %shift57 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
155
+ %148 = fadd <16 x float> %134, %shift57, !dbg !29
156
+ %149 = extractelement <16 x float> %148, i64 8, !dbg !29
157
+ %shift58 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, !dbg !29
158
+ %150 = fadd <16 x float> %134, %shift58, !dbg !29
159
+ %151 = extractelement <16 x float> %150, i64 10, !dbg !29
160
+ %shift59 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 13, i32 poison, i32 poison, i32 poison>, !dbg !29
161
+ %152 = fadd <16 x float> %134, %shift59, !dbg !29
162
+ %153 = extractelement <16 x float> %152, i64 12, !dbg !29
163
+ %shift60 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 15, i32 poison>, !dbg !29
164
+ %154 = fadd <16 x float> %134, %shift60, !dbg !29
165
+ %155 = extractelement <16 x float> %154, i64 14, !dbg !29
166
+ %156 = bitcast float %141 to i32, !dbg !35
167
+ %157 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %156, i32 16, i32 31), !dbg !35
168
+ %158 = bitcast i32 %157 to float, !dbg !35
169
+ %159 = fadd float %141, %158, !dbg !29
170
+ %160 = bitcast float %159 to i32, !dbg !35
171
+ %161 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %160, i32 8, i32 31), !dbg !35
172
+ %162 = bitcast i32 %161 to float, !dbg !35
173
+ %163 = fadd float %159, %162, !dbg !29
174
+ %164 = bitcast float %163 to i32, !dbg !35
175
+ %165 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %164, i32 4, i32 31), !dbg !35
176
+ %166 = bitcast i32 %165 to float, !dbg !35
177
+ %167 = fadd float %163, %166, !dbg !29
178
+ %168 = bitcast float %167 to i32, !dbg !35
179
+ %169 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %168, i32 2, i32 31), !dbg !35
180
+ %170 = bitcast i32 %169 to float, !dbg !35
181
+ %171 = fadd float %167, %170, !dbg !29
182
+ %172 = bitcast float %171 to i32, !dbg !35
183
+ %173 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %172, i32 1, i32 31), !dbg !35
184
+ %174 = bitcast i32 %173 to float, !dbg !35
185
+ %175 = fadd float %171, %174, !dbg !29
186
+ %176 = bitcast float %143 to i32, !dbg !35
187
+ %177 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %176, i32 16, i32 31), !dbg !35
188
+ %178 = bitcast i32 %177 to float, !dbg !35
189
+ %179 = fadd float %143, %178, !dbg !29
190
+ %180 = bitcast float %179 to i32, !dbg !35
191
+ %181 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %180, i32 8, i32 31), !dbg !35
192
+ %182 = bitcast i32 %181 to float, !dbg !35
193
+ %183 = fadd float %179, %182, !dbg !29
194
+ %184 = bitcast float %183 to i32, !dbg !35
195
+ %185 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %184, i32 4, i32 31), !dbg !35
196
+ %186 = bitcast i32 %185 to float, !dbg !35
197
+ %187 = fadd float %183, %186, !dbg !29
198
+ %188 = bitcast float %187 to i32, !dbg !35
199
+ %189 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %188, i32 2, i32 31), !dbg !35
200
+ %190 = bitcast i32 %189 to float, !dbg !35
201
+ %191 = fadd float %187, %190, !dbg !29
202
+ %192 = bitcast float %191 to i32, !dbg !35
203
+ %193 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %192, i32 1, i32 31), !dbg !35
204
+ %194 = bitcast i32 %193 to float, !dbg !35
205
+ %195 = fadd float %191, %194, !dbg !29
206
+ %196 = bitcast float %145 to i32, !dbg !35
207
+ %197 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %196, i32 16, i32 31), !dbg !35
208
+ %198 = bitcast i32 %197 to float, !dbg !35
209
+ %199 = fadd float %145, %198, !dbg !29
210
+ %200 = bitcast float %199 to i32, !dbg !35
211
+ %201 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %200, i32 8, i32 31), !dbg !35
212
+ %202 = bitcast i32 %201 to float, !dbg !35
213
+ %203 = fadd float %199, %202, !dbg !29
214
+ %204 = bitcast float %203 to i32, !dbg !35
215
+ %205 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %204, i32 4, i32 31), !dbg !35
216
+ %206 = bitcast i32 %205 to float, !dbg !35
217
+ %207 = fadd float %203, %206, !dbg !29
218
+ %208 = bitcast float %207 to i32, !dbg !35
219
+ %209 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %208, i32 2, i32 31), !dbg !35
220
+ %210 = bitcast i32 %209 to float, !dbg !35
221
+ %211 = fadd float %207, %210, !dbg !29
222
+ %212 = bitcast float %211 to i32, !dbg !35
223
+ %213 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %212, i32 1, i32 31), !dbg !35
224
+ %214 = bitcast i32 %213 to float, !dbg !35
225
+ %215 = fadd float %211, %214, !dbg !29
226
+ %216 = bitcast float %147 to i32, !dbg !35
227
+ %217 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %216, i32 16, i32 31), !dbg !35
228
+ %218 = bitcast i32 %217 to float, !dbg !35
229
+ %219 = fadd float %147, %218, !dbg !29
230
+ %220 = bitcast float %219 to i32, !dbg !35
231
+ %221 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %220, i32 8, i32 31), !dbg !35
232
+ %222 = bitcast i32 %221 to float, !dbg !35
233
+ %223 = fadd float %219, %222, !dbg !29
234
+ %224 = bitcast float %223 to i32, !dbg !35
235
+ %225 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %224, i32 4, i32 31), !dbg !35
236
+ %226 = bitcast i32 %225 to float, !dbg !35
237
+ %227 = fadd float %223, %226, !dbg !29
238
+ %228 = bitcast float %227 to i32, !dbg !35
239
+ %229 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %228, i32 2, i32 31), !dbg !35
240
+ %230 = bitcast i32 %229 to float, !dbg !35
241
+ %231 = fadd float %227, %230, !dbg !29
242
+ %232 = bitcast float %231 to i32, !dbg !35
243
+ %233 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %232, i32 1, i32 31), !dbg !35
244
+ %234 = bitcast i32 %233 to float, !dbg !35
245
+ %235 = fadd float %231, %234, !dbg !29
246
+ %236 = bitcast float %149 to i32, !dbg !35
247
+ %237 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %236, i32 16, i32 31), !dbg !35
248
+ %238 = bitcast i32 %237 to float, !dbg !35
249
+ %239 = fadd float %149, %238, !dbg !29
250
+ %240 = bitcast float %239 to i32, !dbg !35
251
+ %241 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 8, i32 31), !dbg !35
252
+ %242 = bitcast i32 %241 to float, !dbg !35
253
+ %243 = fadd float %239, %242, !dbg !29
254
+ %244 = bitcast float %243 to i32, !dbg !35
255
+ %245 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %244, i32 4, i32 31), !dbg !35
256
+ %246 = bitcast i32 %245 to float, !dbg !35
257
+ %247 = fadd float %243, %246, !dbg !29
258
+ %248 = bitcast float %247 to i32, !dbg !35
259
+ %249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 2, i32 31), !dbg !35
260
+ %250 = bitcast i32 %249 to float, !dbg !35
261
+ %251 = fadd float %247, %250, !dbg !29
262
+ %252 = bitcast float %251 to i32, !dbg !35
263
+ %253 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 1, i32 31), !dbg !35
264
+ %254 = bitcast i32 %253 to float, !dbg !35
265
+ %255 = fadd float %251, %254, !dbg !29
266
+ %256 = bitcast float %151 to i32, !dbg !35
267
+ %257 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %256, i32 16, i32 31), !dbg !35
268
+ %258 = bitcast i32 %257 to float, !dbg !35
269
+ %259 = fadd float %151, %258, !dbg !29
270
+ %260 = bitcast float %259 to i32, !dbg !35
271
+ %261 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %260, i32 8, i32 31), !dbg !35
272
+ %262 = bitcast i32 %261 to float, !dbg !35
273
+ %263 = fadd float %259, %262, !dbg !29
274
+ %264 = bitcast float %263 to i32, !dbg !35
275
+ %265 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %264, i32 4, i32 31), !dbg !35
276
+ %266 = bitcast i32 %265 to float, !dbg !35
277
+ %267 = fadd float %263, %266, !dbg !29
278
+ %268 = bitcast float %267 to i32, !dbg !35
279
+ %269 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %268, i32 2, i32 31), !dbg !35
280
+ %270 = bitcast i32 %269 to float, !dbg !35
281
+ %271 = fadd float %267, %270, !dbg !29
282
+ %272 = bitcast float %271 to i32, !dbg !35
283
+ %273 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %272, i32 1, i32 31), !dbg !35
284
+ %274 = bitcast i32 %273 to float, !dbg !35
285
+ %275 = fadd float %271, %274, !dbg !29
286
+ %276 = bitcast float %153 to i32, !dbg !35
287
+ %277 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %276, i32 16, i32 31), !dbg !35
288
+ %278 = bitcast i32 %277 to float, !dbg !35
289
+ %279 = fadd float %153, %278, !dbg !29
290
+ %280 = bitcast float %279 to i32, !dbg !35
291
+ %281 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %280, i32 8, i32 31), !dbg !35
292
+ %282 = bitcast i32 %281 to float, !dbg !35
293
+ %283 = fadd float %279, %282, !dbg !29
294
+ %284 = bitcast float %283 to i32, !dbg !35
295
+ %285 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %284, i32 4, i32 31), !dbg !35
296
+ %286 = bitcast i32 %285 to float, !dbg !35
297
+ %287 = fadd float %283, %286, !dbg !29
298
+ %288 = bitcast float %287 to i32, !dbg !35
299
+ %289 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %288, i32 2, i32 31), !dbg !35
300
+ %290 = bitcast i32 %289 to float, !dbg !35
301
+ %291 = fadd float %287, %290, !dbg !29
302
+ %292 = bitcast float %291 to i32, !dbg !35
303
+ %293 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %292, i32 1, i32 31), !dbg !35
304
+ %294 = bitcast i32 %293 to float, !dbg !35
305
+ %295 = fadd float %291, %294, !dbg !29
306
+ %296 = bitcast float %155 to i32, !dbg !35
307
+ %297 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %296, i32 16, i32 31), !dbg !35
308
+ %298 = bitcast i32 %297 to float, !dbg !35
309
+ %299 = fadd float %155, %298, !dbg !29
310
+ %300 = bitcast float %299 to i32, !dbg !35
311
+ %301 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %300, i32 8, i32 31), !dbg !35
312
+ %302 = bitcast i32 %301 to float, !dbg !35
313
+ %303 = fadd float %299, %302, !dbg !29
314
+ %304 = bitcast float %303 to i32, !dbg !35
315
+ %305 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %304, i32 4, i32 31), !dbg !35
316
+ %306 = bitcast i32 %305 to float, !dbg !35
317
+ %307 = fadd float %303, %306, !dbg !29
318
+ %308 = bitcast float %307 to i32, !dbg !35
319
+ %309 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %308, i32 2, i32 31), !dbg !35
320
+ %310 = bitcast i32 %309 to float, !dbg !35
321
+ %311 = fadd float %307, %310, !dbg !29
322
+ %312 = bitcast float %311 to i32, !dbg !35
323
+ %313 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %312, i32 1, i32 31), !dbg !35
324
+ %314 = bitcast i32 %313 to float, !dbg !35
325
+ %315 = fadd float %311, %314, !dbg !29
326
+ %316 = icmp eq i32 %138, 0, !dbg !35
327
+ %317 = zext nneg i32 %139 to i64, !dbg !35
328
+ %318 = getelementptr float, ptr addrspace(3) @global_smem, i64 %317, !dbg !35
329
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %318, float %175, i1 %316) #3, !dbg !35
330
+ %319 = or i32 %139, 8, !dbg !35
331
+ %320 = zext nneg i32 %319 to i64, !dbg !35
332
+ %321 = getelementptr float, ptr addrspace(3) @global_smem, i64 %320, !dbg !35
333
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %321, float %195, i1 %316) #3, !dbg !35
334
+ %322 = or i32 %139, 16, !dbg !35
335
+ %323 = zext nneg i32 %322 to i64, !dbg !35
336
+ %324 = getelementptr float, ptr addrspace(3) @global_smem, i64 %323, !dbg !35
337
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %324, float %215, i1 %316) #3, !dbg !35
338
+ %325 = or i32 %139, 24, !dbg !35
339
+ %326 = zext nneg i32 %325 to i64, !dbg !35
340
+ %327 = getelementptr float, ptr addrspace(3) @global_smem, i64 %326, !dbg !35
341
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %327, float %235, i1 %316) #3, !dbg !35
342
+ %328 = or i32 %139, 32, !dbg !35
343
+ %329 = zext nneg i32 %328 to i64, !dbg !35
344
+ %330 = getelementptr float, ptr addrspace(3) @global_smem, i64 %329, !dbg !35
345
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %330, float %255, i1 %316) #3, !dbg !35
346
+ %331 = or i32 %139, 40, !dbg !35
347
+ %332 = zext nneg i32 %331 to i64, !dbg !35
348
+ %333 = getelementptr float, ptr addrspace(3) @global_smem, i64 %332, !dbg !35
349
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %333, float %275, i1 %316) #3, !dbg !35
350
+ %334 = or i32 %139, 48, !dbg !35
351
+ %335 = zext nneg i32 %334 to i64, !dbg !35
352
+ %336 = getelementptr float, ptr addrspace(3) @global_smem, i64 %335, !dbg !35
353
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %336, float %295, i1 %316) #3, !dbg !35
354
+ %337 = or i32 %139, 56, !dbg !35
355
+ %338 = zext nneg i32 %337 to i64, !dbg !35
356
+ %339 = getelementptr float, ptr addrspace(3) @global_smem, i64 %338, !dbg !35
357
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %339, float %315, i1 %316) #3, !dbg !35
358
+ tail call void @llvm.nvvm.barrier0(), !dbg !35
359
+ %340 = icmp slt i32 %10, 64, !dbg !35
360
+ %341 = sext i32 %10 to i64, !dbg !35
361
+ %342 = getelementptr float, ptr addrspace(3) @global_smem, i64 %341, !dbg !35
362
+ %343 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %342, i1 %340) #3, !dbg !35
363
+ %344 = bitcast float %343 to i32, !dbg !35
364
+ %345 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %344, i32 4, i32 31), !dbg !35
365
+ %346 = bitcast i32 %345 to float, !dbg !35
366
+ %347 = fadd float %343, %346, !dbg !29
367
+ %348 = bitcast float %347 to i32, !dbg !35
368
+ %349 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %348, i32 2, i32 31), !dbg !35
369
+ %350 = bitcast i32 %349 to float, !dbg !35
370
+ %351 = fadd float %347, %350, !dbg !29
371
+ %352 = bitcast float %351 to i32, !dbg !35
372
+ %353 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %352, i32 1, i32 31), !dbg !35
373
+ %354 = bitcast i32 %353 to float, !dbg !35
374
+ %355 = fadd float %351, %354, !dbg !29
375
+ %356 = and i32 %10, 7, !dbg !35
376
+ %357 = icmp eq i32 %356, 0, !dbg !35
377
+ %358 = and i1 %340, %357, !dbg !35
378
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %342, float %355, i1 %358) #3, !dbg !35
379
+ tail call void @llvm.nvvm.barrier0(), !dbg !35
380
+ %359 = load float, ptr addrspace(3) @global_smem, align 4, !dbg !35
381
+ %360 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 32), align 4, !dbg !35
382
+ %361 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 64), align 4, !dbg !35
383
+ %362 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 96), align 4, !dbg !35
384
+ %363 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 128), align 4, !dbg !35
385
+ %364 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 160), align 4, !dbg !35
386
+ %365 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 192), align 4, !dbg !35
387
+ %366 = load float, ptr addrspace(3) getelementptr ([0 x i8], ptr addrspace(3) @global_smem, i64 0, i64 224), align 4, !dbg !35
388
+ %367 = extractelement <2 x i64> %25, i64 0, !dbg !37
389
+ %368 = extractelement <2 x i64> %25, i64 1, !dbg !37
390
+ %369 = extractelement <8 x float> %66, i64 0, !dbg !38
391
+ %370 = extractelement <8 x float> %66, i64 1, !dbg !38
392
+ %371 = extractelement <8 x float> %66, i64 2, !dbg !38
393
+ %372 = extractelement <8 x float> %66, i64 3, !dbg !38
394
+ %373 = extractelement <8 x float> %66, i64 4, !dbg !38
395
+ %374 = extractelement <8 x float> %66, i64 5, !dbg !38
396
+ %375 = extractelement <8 x float> %66, i64 6, !dbg !38
397
+ %376 = extractelement <8 x float> %66, i64 7, !dbg !38
398
+ br label %377, !dbg !39
399
+
400
+ 377: ; preds = %137, %377
401
+ %378 = phi i32 [ 0, %137 ], [ %672, %377 ]
402
+ %379 = zext nneg i32 %378 to i64, !dbg !37
403
+ %380 = or i64 %367, %379, !dbg !37
404
+ %381 = or i64 %368, %379, !dbg !37
405
+ %382 = icmp ult i64 %380, 50257, !dbg !40
406
+ %383 = icmp ult i64 %381, 50257, !dbg !40
407
+ %384 = add nsw i64 %380, %46, !dbg !41
408
+ %385 = add nsw i64 %381, %46, !dbg !41
409
+ %386 = add nsw i64 %380, %47, !dbg !41
410
+ %387 = add nsw i64 %381, %47, !dbg !41
411
+ %388 = add nsw i64 %380, %48, !dbg !41
412
+ %389 = add nsw i64 %381, %48, !dbg !41
413
+ %390 = add nsw i64 %380, %49, !dbg !41
414
+ %391 = add nsw i64 %381, %49, !dbg !41
415
+ %392 = add nsw i64 %380, %50, !dbg !41
416
+ %393 = add nsw i64 %381, %50, !dbg !41
417
+ %394 = add nsw i64 %380, %51, !dbg !41
418
+ %395 = add nsw i64 %381, %51, !dbg !41
419
+ %396 = add nsw i64 %380, %52, !dbg !41
420
+ %397 = add nsw i64 %381, %52, !dbg !41
421
+ %398 = add nsw i64 %380, %53, !dbg !41
422
+ %399 = add nsw i64 %381, %53, !dbg !41
423
+ %400 = getelementptr i16, ptr addrspace(1) %4, i64 %384, !dbg !42
424
+ %401 = getelementptr i16, ptr addrspace(1) %4, i64 %385, !dbg !42
425
+ %402 = getelementptr i16, ptr addrspace(1) %4, i64 %386, !dbg !42
426
+ %403 = getelementptr i16, ptr addrspace(1) %4, i64 %387, !dbg !42
427
+ %404 = getelementptr i16, ptr addrspace(1) %4, i64 %388, !dbg !42
428
+ %405 = getelementptr i16, ptr addrspace(1) %4, i64 %389, !dbg !42
429
+ %406 = getelementptr i16, ptr addrspace(1) %4, i64 %390, !dbg !42
430
+ %407 = getelementptr i16, ptr addrspace(1) %4, i64 %391, !dbg !42
431
+ %408 = getelementptr i16, ptr addrspace(1) %4, i64 %392, !dbg !42
432
+ %409 = getelementptr i16, ptr addrspace(1) %4, i64 %393, !dbg !42
433
+ %410 = getelementptr i16, ptr addrspace(1) %4, i64 %394, !dbg !42
434
+ %411 = getelementptr i16, ptr addrspace(1) %4, i64 %395, !dbg !42
435
+ %412 = getelementptr i16, ptr addrspace(1) %4, i64 %396, !dbg !42
436
+ %413 = getelementptr i16, ptr addrspace(1) %4, i64 %397, !dbg !42
437
+ %414 = getelementptr i16, ptr addrspace(1) %4, i64 %398, !dbg !42
438
+ %415 = getelementptr i16, ptr addrspace(1) %4, i64 %399, !dbg !42
439
+ %416 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %400, i1 %382, i16 0, i1 %382) #3, !dbg !43
440
+ %417 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %401, i1 %383, i16 0, i1 %383) #3, !dbg !43
441
+ %418 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %402, i1 %382, i16 0, i1 %382) #3, !dbg !43
442
+ %419 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %403, i1 %383, i16 0, i1 %383) #3, !dbg !43
443
+ %420 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %404, i1 %382, i16 0, i1 %382) #3, !dbg !43
444
+ %421 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %405, i1 %383, i16 0, i1 %383) #3, !dbg !43
445
+ %422 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %406, i1 %382, i16 0, i1 %382) #3, !dbg !43
446
+ %423 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %407, i1 %383, i16 0, i1 %383) #3, !dbg !43
447
+ %424 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %408, i1 %382, i16 0, i1 %382) #3, !dbg !43
448
+ %425 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %409, i1 %383, i16 0, i1 %383) #3, !dbg !43
449
+ %426 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %410, i1 %382, i16 0, i1 %382) #3, !dbg !43
450
+ %427 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %411, i1 %383, i16 0, i1 %383) #3, !dbg !43
451
+ %428 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %412, i1 %382, i16 0, i1 %382) #3, !dbg !43
452
+ %429 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %413, i1 %383, i16 0, i1 %383) #3, !dbg !43
453
+ %430 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %414, i1 %382, i16 0, i1 %382) #3, !dbg !43
454
+ %431 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %415, i1 %383, i16 0, i1 %383) #3, !dbg !43
455
+ %432 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %416) #3, !dbg !44
456
+ %433 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %417) #3, !dbg !44
457
+ %434 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %418) #3, !dbg !44
458
+ %435 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %419) #3, !dbg !44
459
+ %436 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %420) #3, !dbg !44
460
+ %437 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %421) #3, !dbg !44
461
+ %438 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %422) #3, !dbg !44
462
+ %439 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %423) #3, !dbg !44
463
+ %440 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %424) #3, !dbg !44
464
+ %441 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %425) #3, !dbg !44
465
+ %442 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %426) #3, !dbg !44
466
+ %443 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %427) #3, !dbg !44
467
+ %444 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %428) #3, !dbg !44
468
+ %445 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %429) #3, !dbg !44
469
+ %446 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %430) #3, !dbg !44
470
+ %447 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %431) #3, !dbg !44
471
+ %448 = getelementptr float, ptr addrspace(1) %0, i64 %384, !dbg !45
472
+ %449 = getelementptr float, ptr addrspace(1) %0, i64 %385, !dbg !45
473
+ %450 = getelementptr float, ptr addrspace(1) %0, i64 %386, !dbg !45
474
+ %451 = getelementptr float, ptr addrspace(1) %0, i64 %387, !dbg !45
475
+ %452 = getelementptr float, ptr addrspace(1) %0, i64 %388, !dbg !45
476
+ %453 = getelementptr float, ptr addrspace(1) %0, i64 %389, !dbg !45
477
+ %454 = getelementptr float, ptr addrspace(1) %0, i64 %390, !dbg !45
478
+ %455 = getelementptr float, ptr addrspace(1) %0, i64 %391, !dbg !45
479
+ %456 = getelementptr float, ptr addrspace(1) %0, i64 %392, !dbg !45
480
+ %457 = getelementptr float, ptr addrspace(1) %0, i64 %393, !dbg !45
481
+ %458 = getelementptr float, ptr addrspace(1) %0, i64 %394, !dbg !45
482
+ %459 = getelementptr float, ptr addrspace(1) %0, i64 %395, !dbg !45
483
+ %460 = getelementptr float, ptr addrspace(1) %0, i64 %396, !dbg !45
484
+ %461 = getelementptr float, ptr addrspace(1) %0, i64 %397, !dbg !45
485
+ %462 = getelementptr float, ptr addrspace(1) %0, i64 %398, !dbg !45
486
+ %463 = getelementptr float, ptr addrspace(1) %0, i64 %399, !dbg !45
487
+ %464 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %448, i1 %382, i32 0, i1 %382) #3, !dbg !46
488
+ %465 = bitcast i32 %464 to float, !dbg !46
489
+ %466 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %449, i1 %383, i32 0, i1 %383) #3, !dbg !46
490
+ %467 = bitcast i32 %466 to float, !dbg !46
491
+ %468 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %450, i1 %382, i32 0, i1 %382) #3, !dbg !46
492
+ %469 = bitcast i32 %468 to float, !dbg !46
493
+ %470 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %451, i1 %383, i32 0, i1 %383) #3, !dbg !46
494
+ %471 = bitcast i32 %470 to float, !dbg !46
495
+ %472 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %452, i1 %382, i32 0, i1 %382) #3, !dbg !46
496
+ %473 = bitcast i32 %472 to float, !dbg !46
497
+ %474 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %453, i1 %383, i32 0, i1 %383) #3, !dbg !46
498
+ %475 = bitcast i32 %474 to float, !dbg !46
499
+ %476 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %454, i1 %382, i32 0, i1 %382) #3, !dbg !46
500
+ %477 = bitcast i32 %476 to float, !dbg !46
501
+ %478 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %455, i1 %383, i32 0, i1 %383) #3, !dbg !46
502
+ %479 = bitcast i32 %478 to float, !dbg !46
503
+ %480 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %456, i1 %382, i32 0, i1 %382) #3, !dbg !46
504
+ %481 = bitcast i32 %480 to float, !dbg !46
505
+ %482 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %457, i1 %383, i32 0, i1 %383) #3, !dbg !46
506
+ %483 = bitcast i32 %482 to float, !dbg !46
507
+ %484 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %458, i1 %382, i32 0, i1 %382) #3, !dbg !46
508
+ %485 = bitcast i32 %484 to float, !dbg !46
509
+ %486 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %459, i1 %383, i32 0, i1 %383) #3, !dbg !46
510
+ %487 = bitcast i32 %486 to float, !dbg !46
511
+ %488 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %460, i1 %382, i32 0, i1 %382) #3, !dbg !46
512
+ %489 = bitcast i32 %488 to float, !dbg !46
513
+ %490 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %461, i1 %383, i32 0, i1 %383) #3, !dbg !46
514
+ %491 = bitcast i32 %490 to float, !dbg !46
515
+ %492 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %462, i1 %382, i32 0, i1 %382) #3, !dbg !46
516
+ %493 = bitcast i32 %492 to float, !dbg !46
517
+ %494 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %463, i1 %383, i32 0, i1 %383) #3, !dbg !46
518
+ %495 = bitcast i32 %494 to float, !dbg !46
519
+ %496 = getelementptr i16, ptr addrspace(1) %5, i64 %384, !dbg !47
520
+ %497 = getelementptr i16, ptr addrspace(1) %5, i64 %385, !dbg !47
521
+ %498 = getelementptr i16, ptr addrspace(1) %5, i64 %386, !dbg !47
522
+ %499 = getelementptr i16, ptr addrspace(1) %5, i64 %387, !dbg !47
523
+ %500 = getelementptr i16, ptr addrspace(1) %5, i64 %388, !dbg !47
524
+ %501 = getelementptr i16, ptr addrspace(1) %5, i64 %389, !dbg !47
525
+ %502 = getelementptr i16, ptr addrspace(1) %5, i64 %390, !dbg !47
526
+ %503 = getelementptr i16, ptr addrspace(1) %5, i64 %391, !dbg !47
527
+ %504 = getelementptr i16, ptr addrspace(1) %5, i64 %392, !dbg !47
528
+ %505 = getelementptr i16, ptr addrspace(1) %5, i64 %393, !dbg !47
529
+ %506 = getelementptr i16, ptr addrspace(1) %5, i64 %394, !dbg !47
530
+ %507 = getelementptr i16, ptr addrspace(1) %5, i64 %395, !dbg !47
531
+ %508 = getelementptr i16, ptr addrspace(1) %5, i64 %396, !dbg !47
532
+ %509 = getelementptr i16, ptr addrspace(1) %5, i64 %397, !dbg !47
533
+ %510 = getelementptr i16, ptr addrspace(1) %5, i64 %398, !dbg !47
534
+ %511 = getelementptr i16, ptr addrspace(1) %5, i64 %399, !dbg !47
535
+ %512 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %496, i1 %382, i16 0, i1 %382) #3, !dbg !48
536
+ %513 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %497, i1 %383, i16 0, i1 %383) #3, !dbg !48
537
+ %514 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %498, i1 %382, i16 0, i1 %382) #3, !dbg !48
538
+ %515 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %499, i1 %383, i16 0, i1 %383) #3, !dbg !48
539
+ %516 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %500, i1 %382, i16 0, i1 %382) #3, !dbg !48
540
+ %517 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %501, i1 %383, i16 0, i1 %383) #3, !dbg !48
541
+ %518 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %502, i1 %382, i16 0, i1 %382) #3, !dbg !48
542
+ %519 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %503, i1 %383, i16 0, i1 %383) #3, !dbg !48
543
+ %520 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %504, i1 %382, i16 0, i1 %382) #3, !dbg !48
544
+ %521 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %505, i1 %383, i16 0, i1 %383) #3, !dbg !48
545
+ %522 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %506, i1 %382, i16 0, i1 %382) #3, !dbg !48
546
+ %523 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %507, i1 %383, i16 0, i1 %383) #3, !dbg !48
547
+ %524 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %508, i1 %382, i16 0, i1 %382) #3, !dbg !48
548
+ %525 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %509, i1 %383, i16 0, i1 %383) #3, !dbg !48
549
+ %526 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %510, i1 %382, i16 0, i1 %382) #3, !dbg !48
550
+ %527 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %511, i1 %383, i16 0, i1 %383) #3, !dbg !48
551
+ %528 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %512) #3, !dbg !49
552
+ %529 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %513) #3, !dbg !49
553
+ %530 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %514) #3, !dbg !49
554
+ %531 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %515) #3, !dbg !49
555
+ %532 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %516) #3, !dbg !49
556
+ %533 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %517) #3, !dbg !49
557
+ %534 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %518) #3, !dbg !49
558
+ %535 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %519) #3, !dbg !49
559
+ %536 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %520) #3, !dbg !49
560
+ %537 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %521) #3, !dbg !49
561
+ %538 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %522) #3, !dbg !49
562
+ %539 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %523) #3, !dbg !49
563
+ %540 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %524) #3, !dbg !49
564
+ %541 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %525) #3, !dbg !49
565
+ %542 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %526) #3, !dbg !49
566
+ %543 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %527) #3, !dbg !49
567
+ %544 = fmul float %369, %465, !dbg !38
568
+ %545 = fmul float %369, %467, !dbg !38
569
+ %546 = fmul float %370, %469, !dbg !38
570
+ %547 = fmul float %370, %471, !dbg !38
571
+ %548 = fmul float %371, %473, !dbg !38
572
+ %549 = fmul float %371, %475, !dbg !38
573
+ %550 = fmul float %372, %477, !dbg !38
574
+ %551 = fmul float %372, %479, !dbg !38
575
+ %552 = fmul float %373, %481, !dbg !38
576
+ %553 = fmul float %373, %483, !dbg !38
577
+ %554 = fmul float %374, %485, !dbg !38
578
+ %555 = fmul float %374, %487, !dbg !38
579
+ %556 = fmul float %375, %489, !dbg !38
580
+ %557 = fmul float %375, %491, !dbg !38
581
+ %558 = fmul float %376, %493, !dbg !38
582
+ %559 = fmul float %376, %495, !dbg !38
583
+ %560 = fmul float %528, 0x3FF7154760000000, !dbg !50
584
+ %561 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %560) #3, !dbg !50
585
+ %562 = fmul float %529, 0x3FF7154760000000, !dbg !50
586
+ %563 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %562) #3, !dbg !50
587
+ %564 = fmul float %530, 0x3FF7154760000000, !dbg !50
588
+ %565 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %564) #3, !dbg !50
589
+ %566 = fmul float %531, 0x3FF7154760000000, !dbg !50
590
+ %567 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %566) #3, !dbg !50
591
+ %568 = fmul float %532, 0x3FF7154760000000, !dbg !50
592
+ %569 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %568) #3, !dbg !50
593
+ %570 = fmul float %533, 0x3FF7154760000000, !dbg !50
594
+ %571 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %570) #3, !dbg !50
595
+ %572 = fmul float %534, 0x3FF7154760000000, !dbg !50
596
+ %573 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %572) #3, !dbg !50
597
+ %574 = fmul float %535, 0x3FF7154760000000, !dbg !50
598
+ %575 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %574) #3, !dbg !50
599
+ %576 = fmul float %536, 0x3FF7154760000000, !dbg !50
600
+ %577 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %576) #3, !dbg !50
601
+ %578 = fmul float %537, 0x3FF7154760000000, !dbg !50
602
+ %579 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %578) #3, !dbg !50
603
+ %580 = fmul float %538, 0x3FF7154760000000, !dbg !50
604
+ %581 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %580) #3, !dbg !50
605
+ %582 = fmul float %539, 0x3FF7154760000000, !dbg !50
606
+ %583 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %582) #3, !dbg !50
607
+ %584 = fmul float %540, 0x3FF7154760000000, !dbg !50
608
+ %585 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %584) #3, !dbg !50
609
+ %586 = fmul float %541, 0x3FF7154760000000, !dbg !50
610
+ %587 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %586) #3, !dbg !50
611
+ %588 = fmul float %542, 0x3FF7154760000000, !dbg !50
612
+ %589 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %588) #3, !dbg !50
613
+ %590 = fmul float %543, 0x3FF7154760000000, !dbg !50
614
+ %591 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %590) #3, !dbg !50
615
+ %592 = fmul float %359, %561, !dbg !51
616
+ %593 = fmul float %359, %563, !dbg !51
617
+ %594 = fmul float %360, %565, !dbg !51
618
+ %595 = fmul float %360, %567, !dbg !51
619
+ %596 = fmul float %361, %569, !dbg !51
620
+ %597 = fmul float %361, %571, !dbg !51
621
+ %598 = fmul float %362, %573, !dbg !51
622
+ %599 = fmul float %362, %575, !dbg !51
623
+ %600 = fmul float %363, %577, !dbg !51
624
+ %601 = fmul float %363, %579, !dbg !51
625
+ %602 = fmul float %364, %581, !dbg !51
626
+ %603 = fmul float %364, %583, !dbg !51
627
+ %604 = fmul float %365, %585, !dbg !51
628
+ %605 = fmul float %365, %587, !dbg !51
629
+ %606 = fmul float %366, %589, !dbg !51
630
+ %607 = fmul float %366, %591, !dbg !51
631
+ %608 = fsub float %544, %592, !dbg !52
632
+ %609 = fsub float %545, %593, !dbg !52
633
+ %610 = fsub float %546, %594, !dbg !52
634
+ %611 = fsub float %547, %595, !dbg !52
635
+ %612 = fsub float %548, %596, !dbg !52
636
+ %613 = fsub float %549, %597, !dbg !52
637
+ %614 = fsub float %550, %598, !dbg !52
638
+ %615 = fsub float %551, %599, !dbg !52
639
+ %616 = fsub float %552, %600, !dbg !52
640
+ %617 = fsub float %553, %601, !dbg !52
641
+ %618 = fsub float %554, %602, !dbg !52
642
+ %619 = fsub float %555, %603, !dbg !52
643
+ %620 = fsub float %556, %604, !dbg !52
644
+ %621 = fsub float %557, %605, !dbg !52
645
+ %622 = fsub float %558, %606, !dbg !52
646
+ %623 = fsub float %559, %607, !dbg !52
647
+ %624 = fadd float %432, %608, !dbg !53
648
+ %625 = fadd float %433, %609, !dbg !53
649
+ %626 = fadd float %434, %610, !dbg !53
650
+ %627 = fadd float %435, %611, !dbg !53
651
+ %628 = fadd float %436, %612, !dbg !53
652
+ %629 = fadd float %437, %613, !dbg !53
653
+ %630 = fadd float %438, %614, !dbg !53
654
+ %631 = fadd float %439, %615, !dbg !53
655
+ %632 = fadd float %440, %616, !dbg !53
656
+ %633 = fadd float %441, %617, !dbg !53
657
+ %634 = fadd float %442, %618, !dbg !53
658
+ %635 = fadd float %443, %619, !dbg !53
659
+ %636 = fadd float %444, %620, !dbg !53
660
+ %637 = fadd float %445, %621, !dbg !53
661
+ %638 = fadd float %446, %622, !dbg !53
662
+ %639 = fadd float %447, %623, !dbg !53
663
+ %640 = getelementptr i16, ptr addrspace(1) %6, i64 %384, !dbg !54
664
+ %641 = getelementptr i16, ptr addrspace(1) %6, i64 %385, !dbg !54
665
+ %642 = getelementptr i16, ptr addrspace(1) %6, i64 %386, !dbg !54
666
+ %643 = getelementptr i16, ptr addrspace(1) %6, i64 %387, !dbg !54
667
+ %644 = getelementptr i16, ptr addrspace(1) %6, i64 %388, !dbg !54
668
+ %645 = getelementptr i16, ptr addrspace(1) %6, i64 %389, !dbg !54
669
+ %646 = getelementptr i16, ptr addrspace(1) %6, i64 %390, !dbg !54
670
+ %647 = getelementptr i16, ptr addrspace(1) %6, i64 %391, !dbg !54
671
+ %648 = getelementptr i16, ptr addrspace(1) %6, i64 %392, !dbg !54
672
+ %649 = getelementptr i16, ptr addrspace(1) %6, i64 %393, !dbg !54
673
+ %650 = getelementptr i16, ptr addrspace(1) %6, i64 %394, !dbg !54
674
+ %651 = getelementptr i16, ptr addrspace(1) %6, i64 %395, !dbg !54
675
+ %652 = getelementptr i16, ptr addrspace(1) %6, i64 %396, !dbg !54
676
+ %653 = getelementptr i16, ptr addrspace(1) %6, i64 %397, !dbg !54
677
+ %654 = getelementptr i16, ptr addrspace(1) %6, i64 %398, !dbg !54
678
+ %655 = getelementptr i16, ptr addrspace(1) %6, i64 %399, !dbg !54
679
+ %656 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %624) #3, !dbg !55
680
+ %657 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %625) #3, !dbg !55
681
+ %658 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %626) #3, !dbg !55
682
+ %659 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %627) #3, !dbg !55
683
+ %660 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %628) #3, !dbg !55
684
+ %661 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %629) #3, !dbg !55
685
+ %662 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %630) #3, !dbg !55
686
+ %663 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %631) #3, !dbg !55
687
+ %664 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %632) #3, !dbg !55
688
+ %665 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %633) #3, !dbg !55
689
+ %666 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %634) #3, !dbg !55
690
+ %667 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %635) #3, !dbg !55
691
+ %668 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %636) #3, !dbg !55
692
+ %669 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %637) #3, !dbg !55
693
+ %670 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %638) #3, !dbg !55
694
+ %671 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %639) #3, !dbg !55
695
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %656, ptr addrspace(1) %640, i1 %382) #3, !dbg !55
696
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %657, ptr addrspace(1) %641, i1 %383) #3, !dbg !55
697
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %658, ptr addrspace(1) %642, i1 %382) #3, !dbg !55
698
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %659, ptr addrspace(1) %643, i1 %383) #3, !dbg !55
699
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %660, ptr addrspace(1) %644, i1 %382) #3, !dbg !55
700
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %661, ptr addrspace(1) %645, i1 %383) #3, !dbg !55
701
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %662, ptr addrspace(1) %646, i1 %382) #3, !dbg !55
702
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %663, ptr addrspace(1) %647, i1 %383) #3, !dbg !55
703
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %664, ptr addrspace(1) %648, i1 %382) #3, !dbg !55
704
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %665, ptr addrspace(1) %649, i1 %383) #3, !dbg !55
705
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %666, ptr addrspace(1) %650, i1 %382) #3, !dbg !55
706
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %667, ptr addrspace(1) %651, i1 %383) #3, !dbg !55
707
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %668, ptr addrspace(1) %652, i1 %382) #3, !dbg !55
708
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %669, ptr addrspace(1) %653, i1 %383) #3, !dbg !55
709
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %670, ptr addrspace(1) %654, i1 %382) #3, !dbg !55
710
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %671, ptr addrspace(1) %655, i1 %383) #3, !dbg !55
711
+ %672 = add nuw nsw i32 %378, 512, !dbg !39
712
+ %673 = icmp ult i32 %378, 49745, !dbg !39
713
+ br i1 %673, label %377, label %674, !dbg !39
714
+
715
+ 674: ; preds = %377
716
+ ret void, !dbg !56
717
+ }
718
+
719
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
720
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
721
+
722
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
723
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
724
+
725
+ ; Function Attrs: convergent nocallback nounwind
726
+ declare void @llvm.nvvm.barrier0() #2
727
+
728
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
729
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
730
+ attributes #2 = { convergent nocallback nounwind }
731
+ attributes #3 = { nounwind }
732
+
733
+ !llvm.module.flags = !{!0}
734
+ !llvm.dbg.cu = !{!1}
735
+ !nvvm.annotations = !{!3, !4, !4, !3}
736
+
737
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
738
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
739
+ !2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")
740
+ !3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
741
+ !4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256}
742
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
743
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
744
+ !7 = !{}
745
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
746
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
747
+ !10 = !DILocation(line: 21, column: 28, scope: !5)
748
+ !11 = !DILocation(line: 21, column: 34, scope: !5)
749
+ !12 = !DILocation(line: 21, column: 46, scope: !5)
750
+ !13 = !DILocation(line: 22, column: 23, scope: !5)
751
+ !14 = !DILocation(line: 26, column: 30, scope: !5)
752
+ !15 = !DILocation(line: 26, column: 35, scope: !5)
753
+ !16 = !DILocation(line: 27, column: 19, scope: !5)
754
+ !17 = !DILocation(line: 29, column: 19, scope: !5)
755
+ !18 = !DILocation(line: 36, column: 46, scope: !5)
756
+ !19 = !DILocation(line: 38, column: 23, scope: !5)
757
+ !20 = !DILocation(line: 39, column: 22, scope: !5)
758
+ !21 = !DILocation(line: 41, column: 37, scope: !5)
759
+ !22 = !DILocation(line: 32, column: 36, scope: !5)
760
+ !23 = !DILocation(line: 33, column: 27, scope: !5)
761
+ !24 = !DILocation(line: 34, column: 25, scope: !5)
762
+ !25 = !DILocation(line: 36, column: 34, scope: !5)
763
+ !26 = !DILocation(line: 36, column: 52, scope: !5)
764
+ !27 = !DILocation(line: 42, column: 23, scope: !5)
765
+ !28 = !DILocation(line: 45, column: 40, scope: !5)
766
+ !29 = !DILocation(line: 233, column: 15, scope: !30, inlinedAt: !33)
767
+ !30 = distinct !DILexicalBlockFile(scope: !32, file: !31, discriminator: 0)
768
+ !31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
769
+ !32 = distinct !DILexicalBlockFile(scope: !5, file: !31, discriminator: 0)
770
+ !33 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !34)
771
+ !34 = !DILocation(line: 46, column: 27, scope: !30)
772
+ !35 = !DILocation(line: 243, column: 36, scope: !32, inlinedAt: !36)
773
+ !36 = !DILocation(line: 46, column: 27, scope: !32)
774
+ !37 = !DILocation(line: 52, column: 27, scope: !5)
775
+ !38 = !DILocation(line: 63, column: 24, scope: !5)
776
+ !39 = !DILocation(line: 51, column: 36, scope: !5)
777
+ !40 = !DILocation(line: 53, column: 25, scope: !5)
778
+ !41 = !DILocation(line: 55, column: 41, scope: !5)
779
+ !42 = !DILocation(line: 55, column: 35, scope: !5)
780
+ !43 = !DILocation(line: 55, column: 53, scope: !5)
781
+ !44 = !DILocation(line: 55, column: 105, scope: !5)
782
+ !45 = !DILocation(line: 56, column: 35, scope: !5)
783
+ !46 = !DILocation(line: 56, column: 53, scope: !5)
784
+ !47 = !DILocation(line: 57, column: 35, scope: !5)
785
+ !48 = !DILocation(line: 57, column: 53, scope: !5)
786
+ !49 = !DILocation(line: 57, column: 105, scope: !5)
787
+ !50 = !DILocation(line: 65, column: 23, scope: !5)
788
+ !51 = !DILocation(line: 66, column: 24, scope: !5)
789
+ !52 = !DILocation(line: 67, column: 24, scope: !5)
790
+ !53 = !DILocation(line: 69, column: 24, scope: !5)
791
+ !54 = !DILocation(line: 70, column: 29, scope: !5)
792
+ !55 = !DILocation(line: 70, column: 54, scope: !5)
793
+ !56 = !DILocation(line: 51, column: 4, scope: !5)
.triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ptx ADDED
@@ -0,0 +1,1517 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7de8
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3d4d5d6d7de8(
13
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
14
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
20
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
21
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
22
+ )
23
+ .maxntid 256, 1, 1
24
+ {
25
+ .reg .pred %p<176>;
26
+ .reg .b16 %rs<129>;
27
+ .reg .b32 %r<238>;
28
+ .reg .f32 %f<393>;
29
+ .reg .b64 %rd<166>;
30
+ .loc 1 18 0
31
+ $L__func_begin0:
32
+ .loc 1 18 0
33
+
34
+ ld.param.u64 %rd39, [triton__0d1d2d3d4d5d6d7de8_param_6];
35
+ ld.param.u64 %rd38, [triton__0d1d2d3d4d5d6d7de8_param_5];
36
+ ld.param.u64 %rd37, [triton__0d1d2d3d4d5d6d7de8_param_4];
37
+ ld.param.u64 %rd36, [triton__0d1d2d3d4d5d6d7de8_param_0];
38
+ $L__tmp0:
39
+ .loc 1 22 44
40
+ mov.u32 %r1, %tid.x;
41
+ ld.param.u64 %rd59, [triton__0d1d2d3d4d5d6d7de8_param_1];
42
+ shr.u32 %r2, %r1, 5;
43
+ ld.param.u64 %rd56, [triton__0d1d2d3d4d5d6d7de8_param_2];
44
+ .loc 1 24 33
45
+ and.b32 %r9, %r1, 255;
46
+ ld.param.u64 %rd57, [triton__0d1d2d3d4d5d6d7de8_param_3];
47
+ or.b32 %r10, %r9, 256;
48
+ .loc 1 21 28
49
+ mov.u32 %r3, %ctaid.x;
50
+ .loc 1 21 34
51
+ cvt.s64.s32 %rd1, %r3;
52
+ .loc 1 21 46
53
+ mul.wide.s32 %rd60, %r3, 8;
54
+ .loc 1 22 23
55
+ or.b64 %rd61, %rd60, 1;
56
+ cvt.u64.u32 %rd2, %r9;
57
+ cvt.u64.u32 %rd3, %r10;
58
+ .loc 1 26 30
59
+ shl.b64 %rd62, %rd60, 3;
60
+ add.s64 %rd41, %rd59, %rd62;
61
+ add.s64 %rd43, %rd41, 8;
62
+ add.s64 %rd45, %rd41, 16;
63
+ add.s64 %rd47, %rd41, 24;
64
+ add.s64 %rd49, %rd41, 32;
65
+ add.s64 %rd51, %rd41, 40;
66
+ add.s64 %rd53, %rd41, 48;
67
+ add.s64 %rd55, %rd41, 56;
68
+ mov.pred %p1, -1;
69
+ .loc 1 26 35
70
+ mov.u64 %rd40, 0x0;
71
+ @%p1 ld.global.L1::evict_last.b64 { %rd40 }, [ %rd41 + 0 ];
72
+ mov.u64 %rd42, 0x0;
73
+ @%p1 ld.global.L1::evict_last.b64 { %rd42 }, [ %rd43 + 0 ];
74
+ mov.u64 %rd44, 0x0;
75
+ @%p1 ld.global.L1::evict_last.b64 { %rd44 }, [ %rd45 + 0 ];
76
+ mov.u64 %rd46, 0x0;
77
+ @%p1 ld.global.L1::evict_last.b64 { %rd46 }, [ %rd47 + 0 ];
78
+ mov.u64 %rd48, 0x0;
79
+ @%p1 ld.global.L1::evict_last.b64 { %rd48 }, [ %rd49 + 0 ];
80
+ mov.u64 %rd50, 0x0;
81
+ @%p1 ld.global.L1::evict_last.b64 { %rd50 }, [ %rd51 + 0 ];
82
+ mov.u64 %rd52, 0x0;
83
+ @%p1 ld.global.L1::evict_last.b64 { %rd52 }, [ %rd53 + 0 ];
84
+ mov.u64 %rd54, 0x0;
85
+ @%p1 ld.global.L1::evict_last.b64 { %rd54 }, [ %rd55 + 0 ];
86
+ .loc 1 27 19
87
+ mov.u32 %r7, 0x0;
88
+ @%p1 ld.global.b32 { %r7 }, [ %rd56 + 0 ];
89
+ .loc 1 29 19
90
+ mov.u32 %r8, 0x0;
91
+ @%p1 ld.global.b32 { %r8 }, [ %rd57 + 0 ];
92
+ .loc 1 36 46
93
+ mul.wide.s32 %rd4, %r3, 402056;
94
+ mul.lo.s64 %rd5, %rd61, 50257;
95
+ .loc 1 38 23
96
+ setp.eq.s64 %p11, %rd40, -1;
97
+ setp.eq.s64 %p12, %rd42, -1;
98
+ setp.eq.s64 %p13, %rd44, -1;
99
+ setp.eq.s64 %p14, %rd46, -1;
100
+ setp.eq.s64 %p15, %rd48, -1;
101
+ setp.eq.s64 %p16, %rd50, -1;
102
+ setp.eq.s64 %p17, %rd52, -1;
103
+ setp.eq.s64 %p18, %rd54, -1;
104
+ .loc 1 39 22
105
+ div.full.f32 %r6, %r7, %r8;
106
+ mov.b32 %f89, %r6;
107
+ .loc 1 41 37
108
+ selp.f32 %f8, 0f00000000, %f89, %p18;
109
+ selp.f32 %f7, 0f00000000, %f89, %p17;
110
+ selp.f32 %f6, 0f00000000, %f89, %p16;
111
+ selp.f32 %f5, 0f00000000, %f89, %p15;
112
+ selp.f32 %f4, 0f00000000, %f89, %p14;
113
+ selp.f32 %f3, 0f00000000, %f89, %p13;
114
+ selp.f32 %f2, 0f00000000, %f89, %p12;
115
+ selp.f32 %f1, 0f00000000, %f89, %p11;
116
+ mov.f32 %f377, 0f00000000;
117
+ mov.u64 %rd157, 0;
118
+ shl.b64 %rd83, %rd4, 2;
119
+ shl.b64 %rd86, %rd5, 2;
120
+ mov.f32 %f378, %f377;
121
+ mov.f32 %f379, %f377;
122
+ mov.f32 %f380, %f377;
123
+ mov.f32 %f381, %f377;
124
+ mov.f32 %f382, %f377;
125
+ mov.f32 %f383, %f377;
126
+ mov.f32 %f384, %f377;
127
+ mov.f32 %f385, %f377;
128
+ mov.f32 %f386, %f377;
129
+ mov.f32 %f387, %f377;
130
+ mov.f32 %f388, %f377;
131
+ mov.f32 %f389, %f377;
132
+ mov.f32 %f390, %f377;
133
+ mov.f32 %f391, %f377;
134
+ mov.f32 %f392, %f377;
135
+ $L__BB0_1:
136
+ .loc 1 33 27
137
+ or.b64 %rd79, %rd157, %rd2;
138
+ or.b64 %rd80, %rd157, %rd3;
139
+ .loc 1 34 25
140
+ setp.lt.u64 %p22, %rd80, 50257;
141
+ setp.lt.u64 %p20, %rd79, 50257;
142
+ .loc 1 36 34
143
+ shl.b64 %rd81, %rd79, 2;
144
+ add.s64 %rd82, %rd36, %rd81;
145
+ add.s64 %rd63, %rd82, %rd83;
146
+ shl.b64 %rd84, %rd80, 2;
147
+ add.s64 %rd85, %rd36, %rd84;
148
+ add.s64 %rd64, %rd85, %rd83;
149
+ add.s64 %rd65, %rd82, %rd86;
150
+ add.s64 %rd66, %rd85, %rd86;
151
+ add.s64 %rd67, %rd65, 201028;
152
+ add.s64 %rd68, %rd66, 201028;
153
+ add.s64 %rd69, %rd65, 402056;
154
+ add.s64 %rd70, %rd66, 402056;
155
+ add.s64 %rd71, %rd65, 603084;
156
+ add.s64 %rd72, %rd66, 603084;
157
+ add.s64 %rd73, %rd65, 804112;
158
+ add.s64 %rd74, %rd66, 804112;
159
+ add.s64 %rd75, %rd65, 1005140;
160
+ add.s64 %rd76, %rd66, 1005140;
161
+ add.s64 %rd77, %rd65, 1206168;
162
+ add.s64 %rd78, %rd66, 1206168;
163
+ mov.b32 %r173, 0;
164
+ .loc 1 36 52
165
+ mov.u32 %r11, 0x0;
166
+ @%p20 ld.global.L1::evict_last.b32 { %r11 }, [ %rd63 + 0 ];
167
+ @!%p20 mov.u32 %r11, %r173;
168
+ mov.u32 %r13, 0x0;
169
+ @%p22 ld.global.L1::evict_last.b32 { %r13 }, [ %rd64 + 0 ];
170
+ @!%p22 mov.u32 %r13, %r173;
171
+ mov.u32 %r15, 0x0;
172
+ @%p20 ld.global.L1::evict_last.b32 { %r15 }, [ %rd65 + 0 ];
173
+ @!%p20 mov.u32 %r15, %r173;
174
+ mov.u32 %r17, 0x0;
175
+ @%p22 ld.global.L1::evict_last.b32 { %r17 }, [ %rd66 + 0 ];
176
+ @!%p22 mov.u32 %r17, %r173;
177
+ mov.u32 %r19, 0x0;
178
+ @%p20 ld.global.L1::evict_last.b32 { %r19 }, [ %rd67 + 0 ];
179
+ @!%p20 mov.u32 %r19, %r173;
180
+ mov.u32 %r21, 0x0;
181
+ @%p22 ld.global.L1::evict_last.b32 { %r21 }, [ %rd68 + 0 ];
182
+ @!%p22 mov.u32 %r21, %r173;
183
+ mov.u32 %r23, 0x0;
184
+ @%p20 ld.global.L1::evict_last.b32 { %r23 }, [ %rd69 + 0 ];
185
+ @!%p20 mov.u32 %r23, %r173;
186
+ mov.u32 %r25, 0x0;
187
+ @%p22 ld.global.L1::evict_last.b32 { %r25 }, [ %rd70 + 0 ];
188
+ @!%p22 mov.u32 %r25, %r173;
189
+ mov.u32 %r27, 0x0;
190
+ @%p20 ld.global.L1::evict_last.b32 { %r27 }, [ %rd71 + 0 ];
191
+ @!%p20 mov.u32 %r27, %r173;
192
+ mov.u32 %r29, 0x0;
193
+ @%p22 ld.global.L1::evict_last.b32 { %r29 }, [ %rd72 + 0 ];
194
+ @!%p22 mov.u32 %r29, %r173;
195
+ mov.u32 %r31, 0x0;
196
+ @%p20 ld.global.L1::evict_last.b32 { %r31 }, [ %rd73 + 0 ];
197
+ @!%p20 mov.u32 %r31, %r173;
198
+ mov.u32 %r33, 0x0;
199
+ @%p22 ld.global.L1::evict_last.b32 { %r33 }, [ %rd74 + 0 ];
200
+ @!%p22 mov.u32 %r33, %r173;
201
+ mov.u32 %r35, 0x0;
202
+ @%p20 ld.global.L1::evict_last.b32 { %r35 }, [ %rd75 + 0 ];
203
+ @!%p20 mov.u32 %r35, %r173;
204
+ mov.u32 %r37, 0x0;
205
+ @%p22 ld.global.L1::evict_last.b32 { %r37 }, [ %rd76 + 0 ];
206
+ @!%p22 mov.u32 %r37, %r173;
207
+ mov.u32 %r39, 0x0;
208
+ @%p20 ld.global.L1::evict_last.b32 { %r39 }, [ %rd77 + 0 ];
209
+ @!%p20 mov.u32 %r39, %r173;
210
+ mov.u32 %r41, 0x0;
211
+ @%p22 ld.global.L1::evict_last.b32 { %r41 }, [ %rd78 + 0 ];
212
+ @!%p22 mov.u32 %r41, %r173;
213
+ mov.b32 %f90, %r41;
214
+ mov.b32 %f91, %r39;
215
+ mov.b32 %f92, %r37;
216
+ mov.b32 %f93, %r35;
217
+ mov.b32 %f94, %r33;
218
+ mov.b32 %f95, %r31;
219
+ mov.b32 %f96, %r29;
220
+ mov.b32 %f97, %r27;
221
+ mov.b32 %f98, %r25;
222
+ mov.b32 %f99, %r23;
223
+ mov.b32 %f100, %r21;
224
+ mov.b32 %f101, %r19;
225
+ mov.b32 %f102, %r17;
226
+ mov.b32 %f103, %r15;
227
+ mov.b32 %f104, %r13;
228
+ mov.b32 %f105, %r11;
229
+ .loc 1 42 23
230
+ mul.f32 %f106, %f1, %f105;
231
+ mul.f32 %f107, %f1, %f104;
232
+ mul.f32 %f108, %f2, %f103;
233
+ mul.f32 %f109, %f2, %f102;
234
+ mul.f32 %f110, %f3, %f101;
235
+ mul.f32 %f111, %f3, %f100;
236
+ mul.f32 %f112, %f4, %f99;
237
+ mul.f32 %f113, %f4, %f98;
238
+ mul.f32 %f114, %f5, %f97;
239
+ mul.f32 %f115, %f5, %f96;
240
+ mul.f32 %f116, %f6, %f95;
241
+ mul.f32 %f117, %f6, %f94;
242
+ mul.f32 %f118, %f7, %f93;
243
+ mul.f32 %f119, %f7, %f92;
244
+ mul.f32 %f120, %f8, %f91;
245
+ mul.f32 %f121, %f8, %f90;
246
+ .loc 1 45 40
247
+ selp.f32 %f122, %f121, 0f80000000, %p22;
248
+ selp.f32 %f123, %f120, 0f80000000, %p20;
249
+ selp.f32 %f124, %f119, 0f80000000, %p22;
250
+ selp.f32 %f125, %f118, 0f80000000, %p20;
251
+ selp.f32 %f126, %f117, 0f80000000, %p22;
252
+ selp.f32 %f127, %f116, 0f80000000, %p20;
253
+ selp.f32 %f128, %f115, 0f80000000, %p22;
254
+ selp.f32 %f129, %f114, 0f80000000, %p20;
255
+ selp.f32 %f130, %f113, 0f80000000, %p22;
256
+ selp.f32 %f131, %f112, 0f80000000, %p20;
257
+ selp.f32 %f132, %f111, 0f80000000, %p22;
258
+ selp.f32 %f133, %f110, 0f80000000, %p20;
259
+ selp.f32 %f134, %f109, 0f80000000, %p22;
260
+ selp.f32 %f135, %f108, 0f80000000, %p20;
261
+ selp.f32 %f136, %f107, 0f80000000, %p22;
262
+ selp.f32 %f137, %f106, 0f80000000, %p20;
263
+ add.f32 %f377, %f377, %f137;
264
+ add.f32 %f378, %f378, %f136;
265
+ add.f32 %f379, %f379, %f135;
266
+ add.f32 %f380, %f380, %f134;
267
+ add.f32 %f381, %f381, %f133;
268
+ add.f32 %f382, %f382, %f132;
269
+ add.f32 %f383, %f383, %f131;
270
+ add.f32 %f384, %f384, %f130;
271
+ add.f32 %f385, %f385, %f129;
272
+ add.f32 %f386, %f386, %f128;
273
+ add.f32 %f387, %f387, %f127;
274
+ add.f32 %f388, %f388, %f126;
275
+ add.f32 %f389, %f389, %f125;
276
+ add.f32 %f390, %f390, %f124;
277
+ add.f32 %f391, %f391, %f123;
278
+ add.f32 %f392, %f392, %f122;
279
+ .loc 1 32 36
280
+ add.s64 %rd157, %rd157, 512;
281
+ cvt.u32.u64 %r43, %rd157;
282
+ add.s32 %r44, %r43, -512;
283
+ setp.lt.u32 %p51, %r44, 49745;
284
+ @%p51 bra $L__BB0_1;
285
+ .loc 1 22 44
286
+ and.b32 %r65, %r1, 31;
287
+ .loc 1 24 33
288
+ and.b32 %r66, %r2, 7;
289
+ $L__tmp1:
290
+ .loc 2 233 15
291
+ add.f32 %f138, %f377, %f378;
292
+ add.f32 %f139, %f379, %f380;
293
+ add.f32 %f140, %f381, %f382;
294
+ add.f32 %f141, %f383, %f384;
295
+ add.f32 %f142, %f385, %f386;
296
+ add.f32 %f143, %f387, %f388;
297
+ add.f32 %f144, %f389, %f390;
298
+ add.f32 %f145, %f391, %f392;
299
+ $L__tmp2:
300
+ .loc 2 243 36
301
+ mov.b32 %r67, %f138;
302
+ shfl.sync.bfly.b32 %r68, %r67, 16, 31, -1;
303
+ mov.b32 %f146, %r68;
304
+ $L__tmp3:
305
+ .loc 2 233 15
306
+ add.f32 %f147, %f138, %f146;
307
+ $L__tmp4:
308
+ .loc 2 243 36
309
+ mov.b32 %r69, %f147;
310
+ shfl.sync.bfly.b32 %r70, %r69, 8, 31, -1;
311
+ mov.b32 %f148, %r70;
312
+ $L__tmp5:
313
+ .loc 2 233 15
314
+ add.f32 %f149, %f147, %f148;
315
+ $L__tmp6:
316
+ .loc 2 243 36
317
+ mov.b32 %r71, %f149;
318
+ shfl.sync.bfly.b32 %r72, %r71, 4, 31, -1;
319
+ mov.b32 %f150, %r72;
320
+ $L__tmp7:
321
+ .loc 2 233 15
322
+ add.f32 %f151, %f149, %f150;
323
+ $L__tmp8:
324
+ .loc 2 243 36
325
+ mov.b32 %r73, %f151;
326
+ shfl.sync.bfly.b32 %r74, %r73, 2, 31, -1;
327
+ mov.b32 %f152, %r74;
328
+ $L__tmp9:
329
+ .loc 2 233 15
330
+ add.f32 %f153, %f151, %f152;
331
+ $L__tmp10:
332
+ .loc 2 243 36
333
+ mov.b32 %r75, %f153;
334
+ shfl.sync.bfly.b32 %r76, %r75, 1, 31, -1;
335
+ mov.b32 %f154, %r76;
336
+ $L__tmp11:
337
+ .loc 2 233 15
338
+ add.f32 %f155, %f153, %f154;
339
+ $L__tmp12:
340
+ .loc 2 243 36
341
+ mov.b32 %r77, %f139;
342
+ shfl.sync.bfly.b32 %r78, %r77, 16, 31, -1;
343
+ mov.b32 %f156, %r78;
344
+ $L__tmp13:
345
+ .loc 2 233 15
346
+ add.f32 %f157, %f139, %f156;
347
+ $L__tmp14:
348
+ .loc 2 243 36
349
+ mov.b32 %r79, %f157;
350
+ shfl.sync.bfly.b32 %r80, %r79, 8, 31, -1;
351
+ mov.b32 %f158, %r80;
352
+ $L__tmp15:
353
+ .loc 2 233 15
354
+ add.f32 %f159, %f157, %f158;
355
+ $L__tmp16:
356
+ .loc 2 243 36
357
+ mov.b32 %r81, %f159;
358
+ shfl.sync.bfly.b32 %r82, %r81, 4, 31, -1;
359
+ mov.b32 %f160, %r82;
360
+ $L__tmp17:
361
+ .loc 2 233 15
362
+ add.f32 %f161, %f159, %f160;
363
+ $L__tmp18:
364
+ .loc 2 243 36
365
+ mov.b32 %r83, %f161;
366
+ shfl.sync.bfly.b32 %r84, %r83, 2, 31, -1;
367
+ mov.b32 %f162, %r84;
368
+ $L__tmp19:
369
+ .loc 2 233 15
370
+ add.f32 %f163, %f161, %f162;
371
+ $L__tmp20:
372
+ .loc 2 243 36
373
+ mov.b32 %r85, %f163;
374
+ shfl.sync.bfly.b32 %r86, %r85, 1, 31, -1;
375
+ mov.b32 %f164, %r86;
376
+ $L__tmp21:
377
+ .loc 2 233 15
378
+ add.f32 %f165, %f163, %f164;
379
+ $L__tmp22:
380
+ .loc 2 243 36
381
+ mov.b32 %r87, %f140;
382
+ shfl.sync.bfly.b32 %r88, %r87, 16, 31, -1;
383
+ mov.b32 %f166, %r88;
384
+ $L__tmp23:
385
+ .loc 2 233 15
386
+ add.f32 %f167, %f140, %f166;
387
+ $L__tmp24:
388
+ .loc 2 243 36
389
+ mov.b32 %r89, %f167;
390
+ shfl.sync.bfly.b32 %r90, %r89, 8, 31, -1;
391
+ mov.b32 %f168, %r90;
392
+ $L__tmp25:
393
+ .loc 2 233 15
394
+ add.f32 %f169, %f167, %f168;
395
+ $L__tmp26:
396
+ .loc 2 243 36
397
+ mov.b32 %r91, %f169;
398
+ shfl.sync.bfly.b32 %r92, %r91, 4, 31, -1;
399
+ mov.b32 %f170, %r92;
400
+ $L__tmp27:
401
+ .loc 2 233 15
402
+ add.f32 %f171, %f169, %f170;
403
+ $L__tmp28:
404
+ .loc 2 243 36
405
+ mov.b32 %r93, %f171;
406
+ shfl.sync.bfly.b32 %r94, %r93, 2, 31, -1;
407
+ mov.b32 %f172, %r94;
408
+ $L__tmp29:
409
+ .loc 2 233 15
410
+ add.f32 %f173, %f171, %f172;
411
+ $L__tmp30:
412
+ .loc 2 243 36
413
+ mov.b32 %r95, %f173;
414
+ shfl.sync.bfly.b32 %r96, %r95, 1, 31, -1;
415
+ mov.b32 %f174, %r96;
416
+ $L__tmp31:
417
+ .loc 2 233 15
418
+ add.f32 %f175, %f173, %f174;
419
+ $L__tmp32:
420
+ .loc 2 243 36
421
+ mov.b32 %r97, %f141;
422
+ shfl.sync.bfly.b32 %r98, %r97, 16, 31, -1;
423
+ mov.b32 %f176, %r98;
424
+ $L__tmp33:
425
+ .loc 2 233 15
426
+ add.f32 %f177, %f141, %f176;
427
+ $L__tmp34:
428
+ .loc 2 243 36
429
+ mov.b32 %r99, %f177;
430
+ shfl.sync.bfly.b32 %r100, %r99, 8, 31, -1;
431
+ mov.b32 %f178, %r100;
432
+ $L__tmp35:
433
+ .loc 2 233 15
434
+ add.f32 %f179, %f177, %f178;
435
+ $L__tmp36:
436
+ .loc 2 243 36
437
+ mov.b32 %r101, %f179;
438
+ shfl.sync.bfly.b32 %r102, %r101, 4, 31, -1;
439
+ mov.b32 %f180, %r102;
440
+ $L__tmp37:
441
+ .loc 2 233 15
442
+ add.f32 %f181, %f179, %f180;
443
+ $L__tmp38:
444
+ .loc 2 243 36
445
+ mov.b32 %r103, %f181;
446
+ shfl.sync.bfly.b32 %r104, %r103, 2, 31, -1;
447
+ mov.b32 %f182, %r104;
448
+ $L__tmp39:
449
+ .loc 2 233 15
450
+ add.f32 %f183, %f181, %f182;
451
+ $L__tmp40:
452
+ .loc 2 243 36
453
+ mov.b32 %r105, %f183;
454
+ shfl.sync.bfly.b32 %r106, %r105, 1, 31, -1;
455
+ mov.b32 %f184, %r106;
456
+ $L__tmp41:
457
+ .loc 2 233 15
458
+ add.f32 %f185, %f183, %f184;
459
+ $L__tmp42:
460
+ .loc 2 243 36
461
+ mov.b32 %r107, %f142;
462
+ shfl.sync.bfly.b32 %r108, %r107, 16, 31, -1;
463
+ mov.b32 %f186, %r108;
464
+ $L__tmp43:
465
+ .loc 2 233 15
466
+ add.f32 %f187, %f142, %f186;
467
+ $L__tmp44:
468
+ .loc 2 243 36
469
+ mov.b32 %r109, %f187;
470
+ shfl.sync.bfly.b32 %r110, %r109, 8, 31, -1;
471
+ mov.b32 %f188, %r110;
472
+ $L__tmp45:
473
+ .loc 2 233 15
474
+ add.f32 %f189, %f187, %f188;
475
+ $L__tmp46:
476
+ .loc 2 243 36
477
+ mov.b32 %r111, %f189;
478
+ shfl.sync.bfly.b32 %r112, %r111, 4, 31, -1;
479
+ mov.b32 %f190, %r112;
480
+ $L__tmp47:
481
+ .loc 2 233 15
482
+ add.f32 %f191, %f189, %f190;
483
+ $L__tmp48:
484
+ .loc 2 243 36
485
+ mov.b32 %r113, %f191;
486
+ shfl.sync.bfly.b32 %r114, %r113, 2, 31, -1;
487
+ mov.b32 %f192, %r114;
488
+ $L__tmp49:
489
+ .loc 2 233 15
490
+ add.f32 %f193, %f191, %f192;
491
+ $L__tmp50:
492
+ .loc 2 243 36
493
+ mov.b32 %r115, %f193;
494
+ shfl.sync.bfly.b32 %r116, %r115, 1, 31, -1;
495
+ mov.b32 %f194, %r116;
496
+ $L__tmp51:
497
+ .loc 2 233 15
498
+ add.f32 %f195, %f193, %f194;
499
+ $L__tmp52:
500
+ .loc 2 243 36
501
+ mov.b32 %r117, %f143;
502
+ shfl.sync.bfly.b32 %r118, %r117, 16, 31, -1;
503
+ mov.b32 %f196, %r118;
504
+ $L__tmp53:
505
+ .loc 2 233 15
506
+ add.f32 %f197, %f143, %f196;
507
+ $L__tmp54:
508
+ .loc 2 243 36
509
+ mov.b32 %r119, %f197;
510
+ shfl.sync.bfly.b32 %r120, %r119, 8, 31, -1;
511
+ mov.b32 %f198, %r120;
512
+ $L__tmp55:
513
+ .loc 2 233 15
514
+ add.f32 %f199, %f197, %f198;
515
+ $L__tmp56:
516
+ .loc 2 243 36
517
+ mov.b32 %r121, %f199;
518
+ shfl.sync.bfly.b32 %r122, %r121, 4, 31, -1;
519
+ mov.b32 %f200, %r122;
520
+ $L__tmp57:
521
+ .loc 2 233 15
522
+ add.f32 %f201, %f199, %f200;
523
+ $L__tmp58:
524
+ .loc 2 243 36
525
+ mov.b32 %r123, %f201;
526
+ shfl.sync.bfly.b32 %r124, %r123, 2, 31, -1;
527
+ mov.b32 %f202, %r124;
528
+ $L__tmp59:
529
+ .loc 2 233 15
530
+ add.f32 %f203, %f201, %f202;
531
+ $L__tmp60:
532
+ .loc 2 243 36
533
+ mov.b32 %r125, %f203;
534
+ shfl.sync.bfly.b32 %r126, %r125, 1, 31, -1;
535
+ mov.b32 %f204, %r126;
536
+ $L__tmp61:
537
+ .loc 2 233 15
538
+ add.f32 %f205, %f203, %f204;
539
+ $L__tmp62:
540
+ .loc 2 243 36
541
+ mov.b32 %r127, %f144;
542
+ shfl.sync.bfly.b32 %r128, %r127, 16, 31, -1;
543
+ mov.b32 %f206, %r128;
544
+ $L__tmp63:
545
+ .loc 2 233 15
546
+ add.f32 %f207, %f144, %f206;
547
+ $L__tmp64:
548
+ .loc 2 243 36
549
+ mov.b32 %r129, %f207;
550
+ shfl.sync.bfly.b32 %r130, %r129, 8, 31, -1;
551
+ mov.b32 %f208, %r130;
552
+ $L__tmp65:
553
+ .loc 2 233 15
554
+ add.f32 %f209, %f207, %f208;
555
+ $L__tmp66:
556
+ .loc 2 243 36
557
+ mov.b32 %r131, %f209;
558
+ shfl.sync.bfly.b32 %r132, %r131, 4, 31, -1;
559
+ mov.b32 %f210, %r132;
560
+ $L__tmp67:
561
+ .loc 2 233 15
562
+ add.f32 %f211, %f209, %f210;
563
+ $L__tmp68:
564
+ .loc 2 243 36
565
+ mov.b32 %r133, %f211;
566
+ shfl.sync.bfly.b32 %r134, %r133, 2, 31, -1;
567
+ mov.b32 %f212, %r134;
568
+ $L__tmp69:
569
+ .loc 2 233 15
570
+ add.f32 %f213, %f211, %f212;
571
+ $L__tmp70:
572
+ .loc 2 243 36
573
+ mov.b32 %r135, %f213;
574
+ shfl.sync.bfly.b32 %r136, %r135, 1, 31, -1;
575
+ mov.b32 %f214, %r136;
576
+ $L__tmp71:
577
+ .loc 2 233 15
578
+ add.f32 %f215, %f213, %f214;
579
+ $L__tmp72:
580
+ .loc 2 243 36
581
+ mov.b32 %r137, %f145;
582
+ shfl.sync.bfly.b32 %r138, %r137, 16, 31, -1;
583
+ mov.b32 %f216, %r138;
584
+ $L__tmp73:
585
+ .loc 2 233 15
586
+ add.f32 %f217, %f145, %f216;
587
+ $L__tmp74:
588
+ .loc 2 243 36
589
+ mov.b32 %r139, %f217;
590
+ shfl.sync.bfly.b32 %r140, %r139, 8, 31, -1;
591
+ mov.b32 %f218, %r140;
592
+ $L__tmp75:
593
+ .loc 2 233 15
594
+ add.f32 %f219, %f217, %f218;
595
+ $L__tmp76:
596
+ .loc 2 243 36
597
+ mov.b32 %r141, %f219;
598
+ shfl.sync.bfly.b32 %r142, %r141, 4, 31, -1;
599
+ mov.b32 %f220, %r142;
600
+ $L__tmp77:
601
+ .loc 2 233 15
602
+ add.f32 %f221, %f219, %f220;
603
+ $L__tmp78:
604
+ .loc 2 243 36
605
+ mov.b32 %r143, %f221;
606
+ shfl.sync.bfly.b32 %r144, %r143, 2, 31, -1;
607
+ mov.b32 %f222, %r144;
608
+ $L__tmp79:
609
+ .loc 2 233 15
610
+ add.f32 %f223, %f221, %f222;
611
+ $L__tmp80:
612
+ .loc 2 243 36
613
+ mov.b32 %r145, %f223;
614
+ shfl.sync.bfly.b32 %r146, %r145, 1, 31, -1;
615
+ mov.b32 %f224, %r146;
616
+ $L__tmp81:
617
+ .loc 2 233 15
618
+ add.f32 %f225, %f223, %f224;
619
+ $L__tmp82:
620
+ .loc 2 243 36
621
+ setp.eq.s32 %p52, %r65, 0;
622
+ shl.b32 %r147, %r66, 2;
623
+ mov.u32 %r148, global_smem;
624
+ add.s32 %r45, %r148, %r147;
625
+ mov.b32 %r46, %f155;
626
+ @%p52 st.shared.b32 [ %r45 + 0 ], %r46;
627
+ add.s32 %r47, %r45, 32;
628
+ mov.b32 %r48, %f165;
629
+ @%p52 st.shared.b32 [ %r47 + 0 ], %r48;
630
+ add.s32 %r49, %r45, 64;
631
+ mov.b32 %r50, %f175;
632
+ @%p52 st.shared.b32 [ %r49 + 0 ], %r50;
633
+ add.s32 %r51, %r45, 96;
634
+ mov.b32 %r52, %f185;
635
+ @%p52 st.shared.b32 [ %r51 + 0 ], %r52;
636
+ add.s32 %r53, %r45, 128;
637
+ mov.b32 %r54, %f195;
638
+ @%p52 st.shared.b32 [ %r53 + 0 ], %r54;
639
+ add.s32 %r55, %r45, 160;
640
+ mov.b32 %r56, %f205;
641
+ @%p52 st.shared.b32 [ %r55 + 0 ], %r56;
642
+ add.s32 %r57, %r45, 192;
643
+ mov.b32 %r58, %f215;
644
+ @%p52 st.shared.b32 [ %r57 + 0 ], %r58;
645
+ add.s32 %r59, %r45, 224;
646
+ mov.b32 %r60, %f225;
647
+ @%p52 st.shared.b32 [ %r59 + 0 ], %r60;
648
+ bar.sync 0;
649
+ setp.lt.s32 %p60, %r1, 64;
650
+ shl.b32 %r149, %r1, 2;
651
+ add.s32 %r62, %r148, %r149;
652
+ @%p60 ld.shared.b32 %r61, [ %r62 + 0 ];
653
+ mov.b32 %f226, %r61;
654
+ shfl.sync.bfly.b32 %r150, %r61, 4, 31, -1;
655
+ mov.b32 %f227, %r150;
656
+ $L__tmp83:
657
+ .loc 2 233 15
658
+ add.f32 %f228, %f226, %f227;
659
+ $L__tmp84:
660
+ .loc 2 243 36
661
+ mov.b32 %r151, %f228;
662
+ shfl.sync.bfly.b32 %r152, %r151, 2, 31, -1;
663
+ mov.b32 %f229, %r152;
664
+ $L__tmp85:
665
+ .loc 2 233 15
666
+ add.f32 %f230, %f228, %f229;
667
+ $L__tmp86:
668
+ .loc 2 243 36
669
+ mov.b32 %r153, %f230;
670
+ shfl.sync.bfly.b32 %r154, %r153, 1, 31, -1;
671
+ mov.b32 %f231, %r154;
672
+ $L__tmp87:
673
+ .loc 2 233 15
674
+ add.f32 %f232, %f230, %f231;
675
+ $L__tmp88:
676
+ .loc 2 243 36
677
+ and.b32 %r155, %r1, 7;
678
+ setp.eq.s32 %p62, %r155, 0;
679
+ and.pred %p61, %p60, %p62;
680
+ mov.b32 %r64, %f232;
681
+ @%p61 st.shared.b32 [ %r62 + 0 ], %r64;
682
+ bar.sync 0;
683
+ ld.shared.f32 %f57, [global_smem];
684
+ ld.shared.f32 %f58, [global_smem+32];
685
+ ld.shared.f32 %f59, [global_smem+64];
686
+ ld.shared.f32 %f60, [global_smem+96];
687
+ ld.shared.f32 %f61, [global_smem+128];
688
+ ld.shared.f32 %f62, [global_smem+160];
689
+ ld.shared.f32 %f63, [global_smem+192];
690
+ ld.shared.f32 %f64, [global_smem+224];
691
+ $L__tmp89:
692
+ .loc 1 51 36
693
+ mul.lo.s64 %rd10, %rd1, 804112;
694
+ shl.b64 %rd88, %rd3, 1;
695
+ add.s64 %rd164, %rd39, %rd88;
696
+ add.s64 %rd163, %rd38, %rd88;
697
+ shl.b64 %rd13, %rd3, 2;
698
+ mul.lo.s64 %rd89, %rd1, 1608224;
699
+ add.s64 %rd162, %rd36, %rd89;
700
+ add.s64 %rd161, %rd37, %rd88;
701
+ shl.b64 %rd90, %rd2, 1;
702
+ add.s64 %rd160, %rd39, %rd90;
703
+ add.s64 %rd159, %rd38, %rd90;
704
+ shl.b64 %rd18, %rd2, 2;
705
+ add.s64 %rd158, %rd37, %rd90;
706
+ mov.u64 %rd165, 0;
707
+ mov.u16 %rs2, 0;
708
+ $L__BB0_3:
709
+ .loc 1 52 27
710
+ add.s64 %rd155, %rd2, %rd165;
711
+ .loc 1 53 25
712
+ add.s64 %rd156, %rd3, %rd165;
713
+ setp.lt.u64 %p63, %rd155, 50257;
714
+ setp.lt.u64 %p65, %rd156, 50257;
715
+ .loc 1 55 35
716
+ add.s64 %rd91, %rd158, %rd10;
717
+ add.s64 %rd92, %rd161, %rd10;
718
+ add.s64 %rd93, %rd91, 100514;
719
+ add.s64 %rd94, %rd92, 100514;
720
+ add.s64 %rd95, %rd91, 201028;
721
+ add.s64 %rd96, %rd92, 201028;
722
+ add.s64 %rd97, %rd91, 301542;
723
+ add.s64 %rd98, %rd92, 301542;
724
+ add.s64 %rd99, %rd91, 402056;
725
+ add.s64 %rd100, %rd92, 402056;
726
+ add.s64 %rd101, %rd91, 502570;
727
+ add.s64 %rd102, %rd92, 502570;
728
+ add.s64 %rd103, %rd91, 603084;
729
+ add.s64 %rd104, %rd92, 603084;
730
+ add.s64 %rd105, %rd91, 703598;
731
+ .loc 1 55 53
732
+ add.s64 %rd106, %rd92, 703598;
733
+ mov.u16 %rs1, 0x0;
734
+ @%p63 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd91 + 0 ];
735
+ @!%p63 mov.u16 %rs1, %rs2;
736
+ mov.u16 %rs3, 0x0;
737
+ @%p65 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd92 + 0 ];
738
+ @!%p65 mov.u16 %rs3, %rs2;
739
+ mov.u16 %rs5, 0x0;
740
+ @%p63 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd93 + 0 ];
741
+ @!%p63 mov.u16 %rs5, %rs2;
742
+ mov.u16 %rs7, 0x0;
743
+ @%p65 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd94 + 0 ];
744
+ @!%p65 mov.u16 %rs7, %rs2;
745
+ mov.u16 %rs9, 0x0;
746
+ @%p63 ld.global.L1::evict_first.b16 { %rs9 }, [ %rd95 + 0 ];
747
+ @!%p63 mov.u16 %rs9, %rs2;
748
+ mov.u16 %rs11, 0x0;
749
+ @%p65 ld.global.L1::evict_first.b16 { %rs11 }, [ %rd96 + 0 ];
750
+ @!%p65 mov.u16 %rs11, %rs2;
751
+ mov.u16 %rs13, 0x0;
752
+ @%p63 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd97 + 0 ];
753
+ @!%p63 mov.u16 %rs13, %rs2;
754
+ mov.u16 %rs15, 0x0;
755
+ @%p65 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd98 + 0 ];
756
+ @!%p65 mov.u16 %rs15, %rs2;
757
+ mov.u16 %rs17, 0x0;
758
+ @%p63 ld.global.L1::evict_first.b16 { %rs17 }, [ %rd99 + 0 ];
759
+ @!%p63 mov.u16 %rs17, %rs2;
760
+ mov.u16 %rs19, 0x0;
761
+ @%p65 ld.global.L1::evict_first.b16 { %rs19 }, [ %rd100 + 0 ];
762
+ @!%p65 mov.u16 %rs19, %rs2;
763
+ mov.u16 %rs21, 0x0;
764
+ @%p63 ld.global.L1::evict_first.b16 { %rs21 }, [ %rd101 + 0 ];
765
+ @!%p63 mov.u16 %rs21, %rs2;
766
+ mov.u16 %rs23, 0x0;
767
+ @%p65 ld.global.L1::evict_first.b16 { %rs23 }, [ %rd102 + 0 ];
768
+ @!%p65 mov.u16 %rs23, %rs2;
769
+ mov.u16 %rs25, 0x0;
770
+ @%p63 ld.global.L1::evict_first.b16 { %rs25 }, [ %rd103 + 0 ];
771
+ @!%p63 mov.u16 %rs25, %rs2;
772
+ mov.u16 %rs27, 0x0;
773
+ @%p65 ld.global.L1::evict_first.b16 { %rs27 }, [ %rd104 + 0 ];
774
+ @!%p65 mov.u16 %rs27, %rs2;
775
+ mov.u16 %rs29, 0x0;
776
+ @%p63 ld.global.L1::evict_first.b16 { %rs29 }, [ %rd105 + 0 ];
777
+ @!%p63 mov.u16 %rs29, %rs2;
778
+ mov.u16 %rs31, 0x0;
779
+ @%p65 ld.global.L1::evict_first.b16 { %rs31 }, [ %rd106 + 0 ];
780
+ @!%p65 mov.u16 %rs31, %rs2;
781
+ .loc 1 55 105
782
+ cvt.f32.bf16 %r156, %rs1;
783
+ mov.b32 %f265, %r156;
784
+ cvt.f32.bf16 %r157, %rs3;
785
+ mov.b32 %f266, %r157;
786
+ cvt.f32.bf16 %r158, %rs5;
787
+ mov.b32 %f267, %r158;
788
+ cvt.f32.bf16 %r159, %rs7;
789
+ mov.b32 %f268, %r159;
790
+ cvt.f32.bf16 %r160, %rs9;
791
+ mov.b32 %f269, %r160;
792
+ cvt.f32.bf16 %r161, %rs11;
793
+ mov.b32 %f270, %r161;
794
+ cvt.f32.bf16 %r162, %rs13;
795
+ mov.b32 %f271, %r162;
796
+ cvt.f32.bf16 %r163, %rs15;
797
+ mov.b32 %f272, %r163;
798
+ cvt.f32.bf16 %r164, %rs17;
799
+ mov.b32 %f273, %r164;
800
+ cvt.f32.bf16 %r165, %rs19;
801
+ mov.b32 %f274, %r165;
802
+ cvt.f32.bf16 %r166, %rs21;
803
+ mov.b32 %f275, %r166;
804
+ cvt.f32.bf16 %r167, %rs23;
805
+ mov.b32 %f276, %r167;
806
+ cvt.f32.bf16 %r168, %rs25;
807
+ mov.b32 %f277, %r168;
808
+ cvt.f32.bf16 %r169, %rs27;
809
+ mov.b32 %f278, %r169;
810
+ cvt.f32.bf16 %r170, %rs29;
811
+ mov.b32 %f279, %r170;
812
+ cvt.f32.bf16 %r171, %rs31;
813
+ mov.b32 %f280, %r171;
814
+ .loc 1 56 35
815
+ add.s64 %rd107, %rd162, %rd18;
816
+ add.s64 %rd108, %rd162, %rd13;
817
+ add.s64 %rd109, %rd107, 201028;
818
+ add.s64 %rd110, %rd108, 201028;
819
+ add.s64 %rd111, %rd107, 402056;
820
+ add.s64 %rd112, %rd108, 402056;
821
+ add.s64 %rd113, %rd107, 603084;
822
+ add.s64 %rd114, %rd108, 603084;
823
+ add.s64 %rd115, %rd107, 804112;
824
+ add.s64 %rd116, %rd108, 804112;
825
+ add.s64 %rd117, %rd107, 1005140;
826
+ add.s64 %rd118, %rd108, 1005140;
827
+ add.s64 %rd119, %rd107, 1206168;
828
+ add.s64 %rd120, %rd108, 1206168;
829
+ add.s64 %rd121, %rd107, 1407196;
830
+ .loc 1 56 53
831
+ add.s64 %rd122, %rd108, 1407196;
832
+ mov.u32 %r172, 0x0;
833
+ @%p63 ld.global.L1::evict_first.b32 { %r172 }, [ %rd107 + 0 ];
834
+ @!%p63 mov.u32 %r172, %r173;
835
+ mov.b32 %f281, %r172;
836
+ mov.u32 %r174, 0x0;
837
+ @%p65 ld.global.L1::evict_first.b32 { %r174 }, [ %rd108 + 0 ];
838
+ @!%p65 mov.u32 %r174, %r173;
839
+ mov.b32 %f282, %r174;
840
+ mov.u32 %r176, 0x0;
841
+ @%p63 ld.global.L1::evict_first.b32 { %r176 }, [ %rd109 + 0 ];
842
+ @!%p63 mov.u32 %r176, %r173;
843
+ mov.b32 %f283, %r176;
844
+ mov.u32 %r178, 0x0;
845
+ @%p65 ld.global.L1::evict_first.b32 { %r178 }, [ %rd110 + 0 ];
846
+ @!%p65 mov.u32 %r178, %r173;
847
+ mov.b32 %f284, %r178;
848
+ mov.u32 %r180, 0x0;
849
+ @%p63 ld.global.L1::evict_first.b32 { %r180 }, [ %rd111 + 0 ];
850
+ @!%p63 mov.u32 %r180, %r173;
851
+ mov.b32 %f285, %r180;
852
+ mov.u32 %r182, 0x0;
853
+ @%p65 ld.global.L1::evict_first.b32 { %r182 }, [ %rd112 + 0 ];
854
+ @!%p65 mov.u32 %r182, %r173;
855
+ mov.b32 %f286, %r182;
856
+ mov.u32 %r184, 0x0;
857
+ @%p63 ld.global.L1::evict_first.b32 { %r184 }, [ %rd113 + 0 ];
858
+ @!%p63 mov.u32 %r184, %r173;
859
+ mov.b32 %f287, %r184;
860
+ mov.u32 %r186, 0x0;
861
+ @%p65 ld.global.L1::evict_first.b32 { %r186 }, [ %rd114 + 0 ];
862
+ @!%p65 mov.u32 %r186, %r173;
863
+ mov.b32 %f288, %r186;
864
+ mov.u32 %r188, 0x0;
865
+ @%p63 ld.global.L1::evict_first.b32 { %r188 }, [ %rd115 + 0 ];
866
+ @!%p63 mov.u32 %r188, %r173;
867
+ mov.b32 %f289, %r188;
868
+ mov.u32 %r190, 0x0;
869
+ @%p65 ld.global.L1::evict_first.b32 { %r190 }, [ %rd116 + 0 ];
870
+ @!%p65 mov.u32 %r190, %r173;
871
+ mov.b32 %f290, %r190;
872
+ mov.u32 %r192, 0x0;
873
+ @%p63 ld.global.L1::evict_first.b32 { %r192 }, [ %rd117 + 0 ];
874
+ @!%p63 mov.u32 %r192, %r173;
875
+ mov.b32 %f291, %r192;
876
+ mov.u32 %r194, 0x0;
877
+ @%p65 ld.global.L1::evict_first.b32 { %r194 }, [ %rd118 + 0 ];
878
+ @!%p65 mov.u32 %r194, %r173;
879
+ mov.b32 %f292, %r194;
880
+ mov.u32 %r196, 0x0;
881
+ @%p63 ld.global.L1::evict_first.b32 { %r196 }, [ %rd119 + 0 ];
882
+ @!%p63 mov.u32 %r196, %r173;
883
+ mov.b32 %f293, %r196;
884
+ mov.u32 %r198, 0x0;
885
+ @%p65 ld.global.L1::evict_first.b32 { %r198 }, [ %rd120 + 0 ];
886
+ @!%p65 mov.u32 %r198, %r173;
887
+ mov.b32 %f294, %r198;
888
+ mov.u32 %r200, 0x0;
889
+ @%p63 ld.global.L1::evict_first.b32 { %r200 }, [ %rd121 + 0 ];
890
+ @!%p63 mov.u32 %r200, %r173;
891
+ mov.b32 %f295, %r200;
892
+ mov.u32 %r202, 0x0;
893
+ @%p65 ld.global.L1::evict_first.b32 { %r202 }, [ %rd122 + 0 ];
894
+ @!%p65 mov.u32 %r202, %r173;
895
+ mov.b32 %f296, %r202;
896
+ .loc 1 57 35
897
+ add.s64 %rd123, %rd159, %rd10;
898
+ add.s64 %rd124, %rd163, %rd10;
899
+ add.s64 %rd125, %rd123, 100514;
900
+ add.s64 %rd126, %rd124, 100514;
901
+ add.s64 %rd127, %rd123, 201028;
902
+ add.s64 %rd128, %rd124, 201028;
903
+ add.s64 %rd129, %rd123, 301542;
904
+ add.s64 %rd130, %rd124, 301542;
905
+ add.s64 %rd131, %rd123, 402056;
906
+ add.s64 %rd132, %rd124, 402056;
907
+ add.s64 %rd133, %rd123, 502570;
908
+ add.s64 %rd134, %rd124, 502570;
909
+ add.s64 %rd135, %rd123, 603084;
910
+ add.s64 %rd136, %rd124, 603084;
911
+ add.s64 %rd137, %rd123, 703598;
912
+ .loc 1 57 53
913
+ add.s64 %rd138, %rd124, 703598;
914
+ mov.u16 %rs49, 0x0;
915
+ @%p63 ld.global.L1::evict_first.b16 { %rs49 }, [ %rd123 + 0 ];
916
+ @!%p63 mov.u16 %rs49, %rs2;
917
+ mov.u16 %rs51, 0x0;
918
+ @%p65 ld.global.L1::evict_first.b16 { %rs51 }, [ %rd124 + 0 ];
919
+ @!%p65 mov.u16 %rs51, %rs2;
920
+ mov.u16 %rs53, 0x0;
921
+ @%p63 ld.global.L1::evict_first.b16 { %rs53 }, [ %rd125 + 0 ];
922
+ @!%p63 mov.u16 %rs53, %rs2;
923
+ mov.u16 %rs55, 0x0;
924
+ @%p65 ld.global.L1::evict_first.b16 { %rs55 }, [ %rd126 + 0 ];
925
+ @!%p65 mov.u16 %rs55, %rs2;
926
+ mov.u16 %rs57, 0x0;
927
+ @%p63 ld.global.L1::evict_first.b16 { %rs57 }, [ %rd127 + 0 ];
928
+ @!%p63 mov.u16 %rs57, %rs2;
929
+ mov.u16 %rs59, 0x0;
930
+ @%p65 ld.global.L1::evict_first.b16 { %rs59 }, [ %rd128 + 0 ];
931
+ @!%p65 mov.u16 %rs59, %rs2;
932
+ mov.u16 %rs61, 0x0;
933
+ @%p63 ld.global.L1::evict_first.b16 { %rs61 }, [ %rd129 + 0 ];
934
+ @!%p63 mov.u16 %rs61, %rs2;
935
+ mov.u16 %rs63, 0x0;
936
+ @%p65 ld.global.L1::evict_first.b16 { %rs63 }, [ %rd130 + 0 ];
937
+ @!%p65 mov.u16 %rs63, %rs2;
938
+ mov.u16 %rs65, 0x0;
939
+ @%p63 ld.global.L1::evict_first.b16 { %rs65 }, [ %rd131 + 0 ];
940
+ @!%p63 mov.u16 %rs65, %rs2;
941
+ mov.u16 %rs67, 0x0;
942
+ @%p65 ld.global.L1::evict_first.b16 { %rs67 }, [ %rd132 + 0 ];
943
+ @!%p65 mov.u16 %rs67, %rs2;
944
+ mov.u16 %rs69, 0x0;
945
+ @%p63 ld.global.L1::evict_first.b16 { %rs69 }, [ %rd133 + 0 ];
946
+ @!%p63 mov.u16 %rs69, %rs2;
947
+ mov.u16 %rs71, 0x0;
948
+ @%p65 ld.global.L1::evict_first.b16 { %rs71 }, [ %rd134 + 0 ];
949
+ @!%p65 mov.u16 %rs71, %rs2;
950
+ mov.u16 %rs73, 0x0;
951
+ @%p63 ld.global.L1::evict_first.b16 { %rs73 }, [ %rd135 + 0 ];
952
+ @!%p63 mov.u16 %rs73, %rs2;
953
+ mov.u16 %rs75, 0x0;
954
+ @%p65 ld.global.L1::evict_first.b16 { %rs75 }, [ %rd136 + 0 ];
955
+ @!%p65 mov.u16 %rs75, %rs2;
956
+ mov.u16 %rs77, 0x0;
957
+ @%p63 ld.global.L1::evict_first.b16 { %rs77 }, [ %rd137 + 0 ];
958
+ @!%p63 mov.u16 %rs77, %rs2;
959
+ mov.u16 %rs79, 0x0;
960
+ @%p65 ld.global.L1::evict_first.b16 { %rs79 }, [ %rd138 + 0 ];
961
+ @!%p65 mov.u16 %rs79, %rs2;
962
+ .loc 1 57 105
963
+ cvt.f32.bf16 %r204, %rs49;
964
+ mov.b32 %f297, %r204;
965
+ cvt.f32.bf16 %r205, %rs51;
966
+ mov.b32 %f298, %r205;
967
+ cvt.f32.bf16 %r206, %rs53;
968
+ mov.b32 %f299, %r206;
969
+ cvt.f32.bf16 %r207, %rs55;
970
+ mov.b32 %f300, %r207;
971
+ cvt.f32.bf16 %r208, %rs57;
972
+ mov.b32 %f301, %r208;
973
+ cvt.f32.bf16 %r209, %rs59;
974
+ mov.b32 %f302, %r209;
975
+ cvt.f32.bf16 %r210, %rs61;
976
+ mov.b32 %f303, %r210;
977
+ cvt.f32.bf16 %r211, %rs63;
978
+ mov.b32 %f304, %r211;
979
+ cvt.f32.bf16 %r212, %rs65;
980
+ mov.b32 %f305, %r212;
981
+ cvt.f32.bf16 %r213, %rs67;
982
+ mov.b32 %f306, %r213;
983
+ cvt.f32.bf16 %r214, %rs69;
984
+ mov.b32 %f307, %r214;
985
+ cvt.f32.bf16 %r215, %rs71;
986
+ mov.b32 %f308, %r215;
987
+ cvt.f32.bf16 %r216, %rs73;
988
+ mov.b32 %f309, %r216;
989
+ cvt.f32.bf16 %r217, %rs75;
990
+ mov.b32 %f310, %r217;
991
+ cvt.f32.bf16 %r218, %rs77;
992
+ mov.b32 %f311, %r218;
993
+ cvt.f32.bf16 %r219, %rs79;
994
+ mov.b32 %f312, %r219;
995
+ .loc 1 65 23
996
+ mul.f32 %f234, %f297, 0f3FB8AA3B;
997
+ ex2.approx.f32 %f233, %f234;
998
+ mul.f32 %f236, %f298, 0f3FB8AA3B;
999
+ ex2.approx.f32 %f235, %f236;
1000
+ mul.f32 %f238, %f299, 0f3FB8AA3B;
1001
+ ex2.approx.f32 %f237, %f238;
1002
+ mul.f32 %f240, %f300, 0f3FB8AA3B;
1003
+ ex2.approx.f32 %f239, %f240;
1004
+ mul.f32 %f242, %f301, 0f3FB8AA3B;
1005
+ ex2.approx.f32 %f241, %f242;
1006
+ mul.f32 %f244, %f302, 0f3FB8AA3B;
1007
+ ex2.approx.f32 %f243, %f244;
1008
+ mul.f32 %f246, %f303, 0f3FB8AA3B;
1009
+ ex2.approx.f32 %f245, %f246;
1010
+ mul.f32 %f248, %f304, 0f3FB8AA3B;
1011
+ ex2.approx.f32 %f247, %f248;
1012
+ mul.f32 %f250, %f305, 0f3FB8AA3B;
1013
+ ex2.approx.f32 %f249, %f250;
1014
+ mul.f32 %f252, %f306, 0f3FB8AA3B;
1015
+ ex2.approx.f32 %f251, %f252;
1016
+ mul.f32 %f254, %f307, 0f3FB8AA3B;
1017
+ ex2.approx.f32 %f253, %f254;
1018
+ mul.f32 %f256, %f308, 0f3FB8AA3B;
1019
+ ex2.approx.f32 %f255, %f256;
1020
+ mul.f32 %f258, %f309, 0f3FB8AA3B;
1021
+ ex2.approx.f32 %f257, %f258;
1022
+ mul.f32 %f260, %f310, 0f3FB8AA3B;
1023
+ ex2.approx.f32 %f259, %f260;
1024
+ mul.f32 %f262, %f311, 0f3FB8AA3B;
1025
+ ex2.approx.f32 %f261, %f262;
1026
+ mul.f32 %f264, %f312, 0f3FB8AA3B;
1027
+ ex2.approx.f32 %f263, %f264;
1028
+ .loc 1 66 24
1029
+ mul.f32 %f313, %f57, %f233;
1030
+ mul.f32 %f314, %f57, %f235;
1031
+ mul.f32 %f315, %f58, %f237;
1032
+ mul.f32 %f316, %f58, %f239;
1033
+ mul.f32 %f317, %f59, %f241;
1034
+ mul.f32 %f318, %f59, %f243;
1035
+ mul.f32 %f319, %f60, %f245;
1036
+ mul.f32 %f320, %f60, %f247;
1037
+ mul.f32 %f321, %f61, %f249;
1038
+ mul.f32 %f322, %f61, %f251;
1039
+ mul.f32 %f323, %f62, %f253;
1040
+ mul.f32 %f324, %f62, %f255;
1041
+ mul.f32 %f325, %f63, %f257;
1042
+ mul.f32 %f326, %f63, %f259;
1043
+ mul.f32 %f327, %f64, %f261;
1044
+ mul.f32 %f328, %f64, %f263;
1045
+ .loc 1 67 24
1046
+ neg.f32 %f329, %f313;
1047
+ fma.rn.f32 %f330, %f1, %f281, %f329;
1048
+ neg.f32 %f331, %f314;
1049
+ fma.rn.f32 %f332, %f1, %f282, %f331;
1050
+ neg.f32 %f333, %f315;
1051
+ fma.rn.f32 %f334, %f2, %f283, %f333;
1052
+ neg.f32 %f335, %f316;
1053
+ fma.rn.f32 %f336, %f2, %f284, %f335;
1054
+ neg.f32 %f337, %f317;
1055
+ fma.rn.f32 %f338, %f3, %f285, %f337;
1056
+ neg.f32 %f339, %f318;
1057
+ fma.rn.f32 %f340, %f3, %f286, %f339;
1058
+ neg.f32 %f341, %f319;
1059
+ fma.rn.f32 %f342, %f4, %f287, %f341;
1060
+ neg.f32 %f343, %f320;
1061
+ fma.rn.f32 %f344, %f4, %f288, %f343;
1062
+ neg.f32 %f345, %f321;
1063
+ fma.rn.f32 %f346, %f5, %f289, %f345;
1064
+ neg.f32 %f347, %f322;
1065
+ fma.rn.f32 %f348, %f5, %f290, %f347;
1066
+ neg.f32 %f349, %f323;
1067
+ fma.rn.f32 %f350, %f6, %f291, %f349;
1068
+ neg.f32 %f351, %f324;
1069
+ fma.rn.f32 %f352, %f6, %f292, %f351;
1070
+ neg.f32 %f353, %f325;
1071
+ fma.rn.f32 %f354, %f7, %f293, %f353;
1072
+ neg.f32 %f355, %f326;
1073
+ fma.rn.f32 %f356, %f7, %f294, %f355;
1074
+ neg.f32 %f357, %f327;
1075
+ fma.rn.f32 %f358, %f8, %f295, %f357;
1076
+ neg.f32 %f359, %f328;
1077
+ fma.rn.f32 %f360, %f8, %f296, %f359;
1078
+ .loc 1 69 24
1079
+ add.f32 %f361, %f265, %f330;
1080
+ add.f32 %f362, %f266, %f332;
1081
+ add.f32 %f363, %f267, %f334;
1082
+ add.f32 %f364, %f268, %f336;
1083
+ add.f32 %f365, %f269, %f338;
1084
+ add.f32 %f366, %f270, %f340;
1085
+ add.f32 %f367, %f271, %f342;
1086
+ add.f32 %f368, %f272, %f344;
1087
+ add.f32 %f369, %f273, %f346;
1088
+ add.f32 %f370, %f274, %f348;
1089
+ add.f32 %f371, %f275, %f350;
1090
+ add.f32 %f372, %f276, %f352;
1091
+ add.f32 %f373, %f277, %f354;
1092
+ add.f32 %f374, %f278, %f356;
1093
+ add.f32 %f375, %f279, %f358;
1094
+ add.f32 %f376, %f280, %f360;
1095
+ .loc 1 70 29
1096
+ add.s64 %rd139, %rd160, %rd10;
1097
+ add.s64 %rd140, %rd164, %rd10;
1098
+ add.s64 %rd141, %rd139, 100514;
1099
+ add.s64 %rd142, %rd140, 100514;
1100
+ add.s64 %rd143, %rd139, 201028;
1101
+ add.s64 %rd144, %rd140, 201028;
1102
+ add.s64 %rd145, %rd139, 301542;
1103
+ add.s64 %rd146, %rd140, 301542;
1104
+ add.s64 %rd147, %rd139, 402056;
1105
+ add.s64 %rd148, %rd140, 402056;
1106
+ add.s64 %rd149, %rd139, 502570;
1107
+ add.s64 %rd150, %rd140, 502570;
1108
+ add.s64 %rd151, %rd139, 603084;
1109
+ add.s64 %rd152, %rd140, 603084;
1110
+ add.s64 %rd153, %rd139, 703598;
1111
+ .loc 1 70 54
1112
+ add.s64 %rd154, %rd140, 703598;
1113
+ mov.b32 %r220, %f361;
1114
+ cvt.rn.bf16.f32 %rs97, %r220;
1115
+ mov.b32 %r221, %f362;
1116
+ cvt.rn.bf16.f32 %rs98, %r221;
1117
+ mov.b32 %r222, %f363;
1118
+ cvt.rn.bf16.f32 %rs99, %r222;
1119
+ mov.b32 %r223, %f364;
1120
+ cvt.rn.bf16.f32 %rs100, %r223;
1121
+ mov.b32 %r224, %f365;
1122
+ cvt.rn.bf16.f32 %rs101, %r224;
1123
+ mov.b32 %r225, %f366;
1124
+ cvt.rn.bf16.f32 %rs102, %r225;
1125
+ mov.b32 %r226, %f367;
1126
+ cvt.rn.bf16.f32 %rs103, %r226;
1127
+ mov.b32 %r227, %f368;
1128
+ cvt.rn.bf16.f32 %rs104, %r227;
1129
+ mov.b32 %r228, %f369;
1130
+ cvt.rn.bf16.f32 %rs105, %r228;
1131
+ mov.b32 %r229, %f370;
1132
+ cvt.rn.bf16.f32 %rs106, %r229;
1133
+ mov.b32 %r230, %f371;
1134
+ cvt.rn.bf16.f32 %rs107, %r230;
1135
+ mov.b32 %r231, %f372;
1136
+ cvt.rn.bf16.f32 %rs108, %r231;
1137
+ mov.b32 %r232, %f373;
1138
+ cvt.rn.bf16.f32 %rs109, %r232;
1139
+ mov.b32 %r233, %f374;
1140
+ cvt.rn.bf16.f32 %rs110, %r233;
1141
+ mov.b32 %r234, %f375;
1142
+ cvt.rn.bf16.f32 %rs111, %r234;
1143
+ mov.b32 %r235, %f376;
1144
+ cvt.rn.bf16.f32 %rs112, %r235;
1145
+ @%p63 st.global.b16 [ %rd139 + 0 ], { %rs97 };
1146
+ @%p65 st.global.b16 [ %rd140 + 0 ], { %rs98 };
1147
+ @%p63 st.global.b16 [ %rd141 + 0 ], { %rs99 };
1148
+ @%p65 st.global.b16 [ %rd142 + 0 ], { %rs100 };
1149
+ @%p63 st.global.b16 [ %rd143 + 0 ], { %rs101 };
1150
+ @%p65 st.global.b16 [ %rd144 + 0 ], { %rs102 };
1151
+ @%p63 st.global.b16 [ %rd145 + 0 ], { %rs103 };
1152
+ @%p65 st.global.b16 [ %rd146 + 0 ], { %rs104 };
1153
+ @%p63 st.global.b16 [ %rd147 + 0 ], { %rs105 };
1154
+ @%p65 st.global.b16 [ %rd148 + 0 ], { %rs106 };
1155
+ @%p63 st.global.b16 [ %rd149 + 0 ], { %rs107 };
1156
+ @%p65 st.global.b16 [ %rd150 + 0 ], { %rs108 };
1157
+ @%p63 st.global.b16 [ %rd151 + 0 ], { %rs109 };
1158
+ @%p65 st.global.b16 [ %rd152 + 0 ], { %rs110 };
1159
+ @%p63 st.global.b16 [ %rd153 + 0 ], { %rs111 };
1160
+ @%p65 st.global.b16 [ %rd154 + 0 ], { %rs112 };
1161
+ .loc 1 51 36
1162
+ add.s64 %rd165, %rd165, 512;
1163
+ cvt.u32.u64 %r236, %rd165;
1164
+ add.s32 %r237, %r236, -512;
1165
+ add.s64 %rd164, %rd164, 1024;
1166
+ add.s64 %rd163, %rd163, 1024;
1167
+ add.s64 %rd162, %rd162, 2048;
1168
+ add.s64 %rd161, %rd161, 1024;
1169
+ add.s64 %rd160, %rd160, 1024;
1170
+ add.s64 %rd159, %rd159, 1024;
1171
+ add.s64 %rd158, %rd158, 1024;
1172
+ setp.lt.u32 %p175, %r237, 49745;
1173
+ @%p175 bra $L__BB0_3;
1174
+ .loc 1 51 4
1175
+ ret;
1176
+ $L__tmp90:
1177
+ $L__func_end0:
1178
+
1179
+ }
1180
+ .file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
1181
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
1182
+ .section .debug_abbrev
1183
+ {
1184
+ .b8 1
1185
+ .b8 17
1186
+ .b8 1
1187
+ .b8 37
1188
+ .b8 8
1189
+ .b8 19
1190
+ .b8 5
1191
+ .b8 3
1192
+ .b8 8
1193
+ .b8 16
1194
+ .b8 6
1195
+ .b8 27
1196
+ .b8 8
1197
+ .b8 180
1198
+ .b8 66
1199
+ .b8 12
1200
+ .b8 17
1201
+ .b8 1
1202
+ .b8 18
1203
+ .b8 1
1204
+ .b8 0
1205
+ .b8 0
1206
+ .b8 2
1207
+ .b8 46
1208
+ .b8 0
1209
+ .b8 135
1210
+ .b8 64
1211
+ .b8 8
1212
+ .b8 3
1213
+ .b8 8
1214
+ .b8 58
1215
+ .b8 11
1216
+ .b8 59
1217
+ .b8 11
1218
+ .b8 63
1219
+ .b8 12
1220
+ .b8 32
1221
+ .b8 11
1222
+ .b8 0
1223
+ .b8 0
1224
+ .b8 3
1225
+ .b8 46
1226
+ .b8 1
1227
+ .b8 17
1228
+ .b8 1
1229
+ .b8 18
1230
+ .b8 1
1231
+ .b8 64
1232
+ .b8 10
1233
+ .b8 49
1234
+ .b8 19
1235
+ .b8 0
1236
+ .b8 0
1237
+ .b8 4
1238
+ .b8 29
1239
+ .b8 1
1240
+ .b8 49
1241
+ .b8 19
1242
+ .b8 17
1243
+ .b8 1
1244
+ .b8 18
1245
+ .b8 1
1246
+ .b8 88
1247
+ .b8 11
1248
+ .b8 89
1249
+ .b8 11
1250
+ .b8 87
1251
+ .b8 11
1252
+ .b8 0
1253
+ .b8 0
1254
+ .b8 5
1255
+ .b8 29
1256
+ .b8 0
1257
+ .b8 49
1258
+ .b8 19
1259
+ .b8 17
1260
+ .b8 1
1261
+ .b8 18
1262
+ .b8 1
1263
+ .b8 88
1264
+ .b8 11
1265
+ .b8 89
1266
+ .b8 11
1267
+ .b8 87
1268
+ .b8 11
1269
+ .b8 0
1270
+ .b8 0
1271
+ .b8 0
1272
+ }
1273
+ .section .debug_info
1274
+ {
1275
+ .b32 278
1276
+ .b8 2
1277
+ .b8 0
1278
+ .b32 .debug_abbrev
1279
+ .b8 8
1280
+ .b8 1
1281
+ .b8 116
1282
+ .b8 114
1283
+ .b8 105
1284
+ .b8 116
1285
+ .b8 111
1286
+ .b8 110
1287
+ .b8 0
1288
+ .b8 2
1289
+ .b8 0
1290
+ .b8 99
1291
+ .b8 107
1292
+ .b8 122
1293
+ .b8 103
1294
+ .b8 108
1295
+ .b8 55
1296
+ .b8 116
1297
+ .b8 104
1298
+ .b8 98
1299
+ .b8 52
1300
+ .b8 120
1301
+ .b8 100
1302
+ .b8 102
1303
+ .b8 107
1304
+ .b8 102
1305
+ .b8 110
1306
+ .b8 100
1307
+ .b8 50
1308
+ .b8 116
1309
+ .b8 105
1310
+ .b8 100
1311
+ .b8 107
1312
+ .b8 115
1313
+ .b8 54
1314
+ .b8 109
1315
+ .b8 116
1316
+ .b8 53
1317
+ .b8 102
1318
+ .b8 51
1319
+ .b8 104
1320
+ .b8 97
1321
+ .b8 117
1322
+ .b8 119
1323
+ .b8 102
1324
+ .b8 121
1325
+ .b8 106
1326
+ .b8 102
1327
+ .b8 108
1328
+ .b8 98
1329
+ .b8 116
1330
+ .b8 122
1331
+ .b8 121
1332
+ .b8 101
1333
+ .b8 112
1334
+ .b8 111
1335
+ .b8 53
1336
+ .b8 111
1337
+ .b8 120
1338
+ .b8 107
1339
+ .b8 118
1340
+ .b8 104
1341
+ .b8 107
1342
+ .b8 46
1343
+ .b8 112
1344
+ .b8 121
1345
+ .b8 0
1346
+ .b32 .debug_line
1347
+ .b8 47
1348
+ .b8 116
1349
+ .b8 109
1350
+ .b8 112
1351
+ .b8 47
1352
+ .b8 116
1353
+ .b8 111
1354
+ .b8 114
1355
+ .b8 99
1356
+ .b8 104
1357
+ .b8 105
1358
+ .b8 110
1359
+ .b8 100
1360
+ .b8 117
1361
+ .b8 99
1362
+ .b8 116
1363
+ .b8 111
1364
+ .b8 114
1365
+ .b8 95
1366
+ .b8 114
1367
+ .b8 111
1368
+ .b8 111
1369
+ .b8 116
1370
+ .b8 47
1371
+ .b8 107
1372
+ .b8 122
1373
+ .b8 0
1374
+ .b8 1
1375
+ .b64 $L__func_begin0
1376
+ .b64 $L__func_end0
1377
+ .b8 2
1378
+ .b8 116
1379
+ .b8 114
1380
+ .b8 105
1381
+ .b8 116
1382
+ .b8 111
1383
+ .b8 110
1384
+ .b8 95
1385
+ .b8 95
1386
+ .b8 48
1387
+ .b8 100
1388
+ .b8 49
1389
+ .b8 100
1390
+ .b8 50
1391
+ .b8 100
1392
+ .b8 51
1393
+ .b8 100
1394
+ .b8 52
1395
+ .b8 100
1396
+ .b8 53
1397
+ .b8 100
1398
+ .b8 54
1399
+ .b8 100
1400
+ .b8 55
1401
+ .b8 100
1402
+ .b8 101
1403
+ .b8 56
1404
+ .b8 0
1405
+ .b8 116
1406
+ .b8 114
1407
+ .b8 105
1408
+ .b8 116
1409
+ .b8 111
1410
+ .b8 110
1411
+ .b8 95
1412
+ .b8 95
1413
+ .b8 48
1414
+ .b8 100
1415
+ .b8 49
1416
+ .b8 100
1417
+ .b8 50
1418
+ .b8 100
1419
+ .b8 51
1420
+ .b8 100
1421
+ .b8 52
1422
+ .b8 100
1423
+ .b8 53
1424
+ .b8 100
1425
+ .b8 54
1426
+ .b8 100
1427
+ .b8 55
1428
+ .b8 100
1429
+ .b8 101
1430
+ .b8 56
1431
+ .b8 0
1432
+ .b8 1
1433
+ .b8 18
1434
+ .b8 1
1435
+ .b8 1
1436
+ .b8 3
1437
+ .b64 $L__func_begin0
1438
+ .b64 $L__func_end0
1439
+ .b8 1
1440
+ .b8 156
1441
+ .b32 125
1442
+ .b8 4
1443
+ .b32 125
1444
+ .b64 $L__tmp1
1445
+ .b64 $L__tmp88
1446
+ .b8 2
1447
+ .b8 46
1448
+ .b8 27
1449
+ .b8 5
1450
+ .b32 125
1451
+ .b64 $L__tmp1
1452
+ .b64 $L__tmp88
1453
+ .b8 2
1454
+ .b8 243
1455
+ .b8 36
1456
+ .b8 0
1457
+ .b8 5
1458
+ .b32 125
1459
+ .b64 $L__tmp2
1460
+ .b64 $L__tmp89
1461
+ .b8 2
1462
+ .b8 46
1463
+ .b8 27
1464
+ .b8 0
1465
+ .b8 0
1466
+ }
1467
+ .section .debug_pubnames
1468
+ {
1469
+ .b32 $L__pubNames_end0-$L__pubNames_start0
1470
+ $L__pubNames_start0:
1471
+ .b8 2
1472
+ .b8 0
1473
+ .b32 .debug_info
1474
+ .b32 282
1475
+ .b32 125
1476
+ .b8 116
1477
+ .b8 114
1478
+ .b8 105
1479
+ .b8 116
1480
+ .b8 111
1481
+ .b8 110
1482
+ .b8 95
1483
+ .b8 95
1484
+ .b8 48
1485
+ .b8 100
1486
+ .b8 49
1487
+ .b8 100
1488
+ .b8 50
1489
+ .b8 100
1490
+ .b8 51
1491
+ .b8 100
1492
+ .b8 52
1493
+ .b8 100
1494
+ .b8 53
1495
+ .b8 100
1496
+ .b8 54
1497
+ .b8 100
1498
+ .b8 55
1499
+ .b8 100
1500
+ .b8 101
1501
+ .b8 56
1502
+ .b8 0
1503
+ .b32 0
1504
+ $L__pubNames_end0:
1505
+ }
1506
+ .section .debug_pubtypes
1507
+ {
1508
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1509
+ $L__pubTypes_start0:
1510
+ .b8 2
1511
+ .b8 0
1512
+ .b32 .debug_info
1513
+ .b32 282
1514
+ .b32 0
1515
+ $L__pubTypes_end0:
1516
+ }
1517
+ .section .debug_loc { }
.triton/dump/33dcd7dc40e8b1089e9a4c61a9c826b5/triton_.ttgir ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 8], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
3
+ tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<8x1xf32, #blocked>
5
+ %cst_0 = arith.constant dense<50257> : tensor<8x1xi64, #blocked>
6
+ %cst_1 = arith.constant dense<-1> : tensor<8x1xi64, #blocked>
7
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<8x512xf32, #blocked>
8
+ %c8_i64 = arith.constant 8 : i64
9
+ %cst_3 = arith.constant dense<50257> : tensor<1x512xi64, #blocked>
10
+ %c0_i32 = arith.constant 0 : i32
11
+ %c512_i32 = arith.constant 512 : i32
12
+ %c50257_i32 = arith.constant 50257 : i32
13
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<8x512xbf16, #blocked>
14
+ %0 = tt.get_program_id x : i32
15
+ %1 = arith.extsi %0 : i32 to i64
16
+ %2 = arith.muli %1, %c8_i64 : i64
17
+ %3 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
18
+ %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<8xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<8x1xi32, #blocked>
19
+ %5 = arith.extsi %4 : tensor<8x1xi32, #blocked> to tensor<8x1xi64, #blocked>
20
+ %6 = tt.splat %2 : (i64) -> tensor<8x1xi64, #blocked>
21
+ %7 = arith.addi %6, %5 : tensor<8x1xi64, #blocked>
22
+ %8 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
23
+ %9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<512xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x512xi32, #blocked>
24
+ %10 = arith.extsi %9 : tensor<1x512xi32, #blocked> to tensor<1x512xi64, #blocked>
25
+ %11 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<8x1x!tt.ptr<i64, 1>, #blocked>
26
+ %12 = tt.addptr %11, %7 : tensor<8x1x!tt.ptr<i64, 1>, #blocked>, tensor<8x1xi64, #blocked>
27
+ %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<8x1xi64, #blocked>
28
+ %14 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
29
+ %15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
30
+ %16 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
31
+ %17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
32
+ %18 = arith.muli %7, %cst_0 : tensor<8x1xi64, #blocked>
33
+ %19 = tt.broadcast %18 : (tensor<8x1xi64, #blocked>) -> tensor<8x512xi64, #blocked>
34
+ %20 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<8x512x!tt.ptr<f32, 1>, #blocked>
35
+ %21 = arith.cmpi ne, %13, %cst_1 : tensor<8x1xi64, #blocked>
36
+ %22 = arith.divf %15, %17 : f32
37
+ %23 = tt.splat %22 : (f32) -> tensor<8x1xf32, #blocked>
38
+ %24 = arith.select %21, %23, %cst : tensor<8x1xi1, #blocked>, tensor<8x1xf32, #blocked>
39
+ %25 = tt.broadcast %24 : (tensor<8x1xf32, #blocked>) -> tensor<8x512xf32, #blocked>
40
+ %26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c512_i32 iter_args(%arg10 = %cst_2) -> (tensor<8x512xf32, #blocked>) : i32 {
41
+ %33 = arith.extsi %arg9 : i32 to i64
42
+ %34 = tt.splat %33 : (i64) -> tensor<1x512xi64, #blocked>
43
+ %35 = arith.addi %34, %10 : tensor<1x512xi64, #blocked>
44
+ %36 = arith.cmpi slt, %35, %cst_3 : tensor<1x512xi64, #blocked>
45
+ %37 = tt.broadcast %35 : (tensor<1x512xi64, #blocked>) -> tensor<8x512xi64, #blocked>
46
+ %38 = arith.addi %37, %19 : tensor<8x512xi64, #blocked>
47
+ %39 = tt.addptr %20, %38 : tensor<8x512x!tt.ptr<f32, 1>, #blocked>, tensor<8x512xi64, #blocked>
48
+ %40 = tt.broadcast %36 : (tensor<1x512xi1, #blocked>) -> tensor<8x512xi1, #blocked>
49
+ %41 = tt.load %39, %40, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<8x512xf32, #blocked>
50
+ %42 = arith.mulf %41, %25 : tensor<8x512xf32, #blocked>
51
+ %43 = arith.addf %arg10, %42 : tensor<8x512xf32, #blocked>
52
+ %44 = arith.select %40, %43, %arg10 : tensor<8x512xi1, #blocked>, tensor<8x512xf32, #blocked>
53
+ scf.yield %44 : tensor<8x512xf32, #blocked>
54
+ }
55
+ %27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({
56
+ ^bb0(%arg9: f32, %arg10: f32):
57
+ %33 = arith.addf %arg9, %arg10 : f32
58
+ tt.reduce.return %33 : f32
59
+ }) : (tensor<8x512xf32, #blocked>) -> tensor<8xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
60
+ %28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<8xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<8x1xf32, #blocked>
61
+ %29 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<8x512x!tt.ptr<bf16, 1>, #blocked>
62
+ %30 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<8x512x!tt.ptr<bf16, 1>, #blocked>
63
+ %31 = tt.broadcast %28 : (tensor<8x1xf32, #blocked>) -> tensor<8x512xf32, #blocked>
64
+ %32 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<8x512x!tt.ptr<bf16, 1>, #blocked>
65
+ scf.for %arg9 = %c0_i32 to %c50257_i32 step %c512_i32 : i32 {
66
+ %33 = arith.extsi %arg9 : i32 to i64
67
+ %34 = tt.splat %33 : (i64) -> tensor<1x512xi64, #blocked>
68
+ %35 = arith.addi %34, %10 : tensor<1x512xi64, #blocked>
69
+ %36 = arith.cmpi slt, %35, %cst_3 : tensor<1x512xi64, #blocked>
70
+ %37 = tt.broadcast %35 : (tensor<1x512xi64, #blocked>) -> tensor<8x512xi64, #blocked>
71
+ %38 = arith.addi %37, %19 : tensor<8x512xi64, #blocked>
72
+ %39 = tt.addptr %29, %38 : tensor<8x512x!tt.ptr<bf16, 1>, #blocked>, tensor<8x512xi64, #blocked>
73
+ %40 = tt.broadcast %36 : (tensor<1x512xi1, #blocked>) -> tensor<8x512xi1, #blocked>
74
+ %41 = tt.load %39, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xbf16, #blocked>
75
+ %42 = arith.extf %41 : tensor<8x512xbf16, #blocked> to tensor<8x512xf32, #blocked>
76
+ %43 = tt.addptr %20, %38 : tensor<8x512x!tt.ptr<f32, 1>, #blocked>, tensor<8x512xi64, #blocked>
77
+ %44 = tt.load %43, %40, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xf32, #blocked>
78
+ %45 = tt.addptr %30, %38 : tensor<8x512x!tt.ptr<bf16, 1>, #blocked>, tensor<8x512xi64, #blocked>
79
+ %46 = tt.load %45, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<8x512xbf16, #blocked>
80
+ %47 = arith.extf %46 : tensor<8x512xbf16, #blocked> to tensor<8x512xf32, #blocked>
81
+ %48 = arith.mulf %44, %25 : tensor<8x512xf32, #blocked>
82
+ %49 = math.exp %47 : tensor<8x512xf32, #blocked>
83
+ %50 = arith.mulf %49, %31 : tensor<8x512xf32, #blocked>
84
+ %51 = arith.subf %48, %50 : tensor<8x512xf32, #blocked>
85
+ %52 = arith.addf %42, %51 : tensor<8x512xf32, #blocked>
86
+ %53 = tt.addptr %32, %38 : tensor<8x512x!tt.ptr<bf16, 1>, #blocked>, tensor<8x512xi64, #blocked>
87
+ %54 = arith.truncf %52 : tensor<8x512xf32, #blocked> to tensor<8x512xbf16, #blocked>
88
+ tt.store %53, %54, %40 {cache = 1 : i32, evict = 1 : i32} : tensor<8x512xbf16, #blocked>
89
+ }
90
+ tt.return
91
+ }
92
+ }
.triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.cubin ADDED
Binary file (63.9 kB). View file
 
.triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.llir ADDED
@@ -0,0 +1,1108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ; ModuleID = 'LLVMDialectModule'
2
+ source_filename = "LLVMDialectModule"
3
+
4
+ @global_smem = external addrspace(3) global [0 x i8]
5
+
6
+ define void @triton__0d1d2d3d4d5d6d7de8(ptr addrspace(1) %0, ptr addrspace(1) %1, ptr addrspace(1) %2, ptr addrspace(1) %3, ptr addrspace(1) %4, ptr addrspace(1) %5, ptr addrspace(1) %6, i64 %7, i64 %8) local_unnamed_addr !dbg !5 {
7
+ %10 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !8
8
+ %11 = and i32 %10, 31, !dbg !8
9
+ %12 = lshr i32 %10, 6, !dbg !8
10
+ %13 = and i32 %12, 3, !dbg !8
11
+ %14 = or i32 %13, 4, !dbg !8
12
+ %15 = or i32 %13, 8, !dbg !8
13
+ %16 = or i32 %13, 12, !dbg !8
14
+ %17 = or i32 %13, 16, !dbg !8
15
+ %18 = or i32 %13, 20, !dbg !8
16
+ %19 = or i32 %13, 24, !dbg !8
17
+ %20 = or i32 %13, 28, !dbg !8
18
+ %21 = or i32 %13, 32, !dbg !8
19
+ %22 = or i32 %13, 36, !dbg !8
20
+ %23 = or i32 %13, 40, !dbg !8
21
+ %24 = or i32 %13, 44, !dbg !8
22
+ %25 = or i32 %13, 48, !dbg !8
23
+ %26 = or i32 %13, 52, !dbg !8
24
+ %27 = or i32 %13, 56, !dbg !8
25
+ %28 = or i32 %13, 60, !dbg !8
26
+ %urem = and i32 %10, 63, !dbg !9
27
+ %29 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #3, !dbg !10
28
+ %30 = sext i32 %29 to i64, !dbg !11
29
+ %31 = shl nsw i64 %30, 6, !dbg !12
30
+ %32 = zext nneg i32 %13 to i64
31
+ %33 = zext nneg i32 %14 to i64
32
+ %34 = zext nneg i32 %15 to i64
33
+ %35 = zext nneg i32 %16 to i64
34
+ %36 = zext nneg i32 %17 to i64
35
+ %37 = zext nneg i32 %18 to i64
36
+ %38 = zext nneg i32 %19 to i64
37
+ %39 = zext nneg i32 %20 to i64
38
+ %40 = zext nneg i32 %21 to i64
39
+ %41 = zext nneg i32 %22 to i64
40
+ %42 = zext nneg i32 %23 to i64
41
+ %43 = zext nneg i32 %24 to i64
42
+ %44 = zext nneg i32 %25 to i64
43
+ %45 = zext nneg i32 %26 to i64
44
+ %46 = zext nneg i32 %27 to i64
45
+ %47 = zext nneg i32 %28 to i64
46
+ %48 = or i64 %31, %32, !dbg !13
47
+ %49 = or i64 %31, %33, !dbg !13
48
+ %50 = or i64 %31, %34, !dbg !13
49
+ %51 = or i64 %31, %35, !dbg !13
50
+ %52 = or i64 %31, %36, !dbg !13
51
+ %53 = or i64 %31, %37, !dbg !13
52
+ %54 = or i64 %31, %38, !dbg !13
53
+ %55 = or i64 %31, %39, !dbg !13
54
+ %56 = or i64 %31, %40, !dbg !13
55
+ %57 = or i64 %31, %41, !dbg !13
56
+ %58 = or i64 %31, %42, !dbg !13
57
+ %59 = or i64 %31, %43, !dbg !13
58
+ %60 = or i64 %31, %44, !dbg !13
59
+ %61 = or i64 %31, %45, !dbg !13
60
+ %62 = or i64 %31, %46, !dbg !13
61
+ %63 = or i64 %31, %47, !dbg !13
62
+ %64 = getelementptr i64, ptr addrspace(1) %1, i64 %48, !dbg !14
63
+ %65 = getelementptr i64, ptr addrspace(1) %1, i64 %49, !dbg !14
64
+ %66 = getelementptr i64, ptr addrspace(1) %1, i64 %50, !dbg !14
65
+ %67 = getelementptr i64, ptr addrspace(1) %1, i64 %51, !dbg !14
66
+ %68 = getelementptr i64, ptr addrspace(1) %1, i64 %52, !dbg !14
67
+ %69 = getelementptr i64, ptr addrspace(1) %1, i64 %53, !dbg !14
68
+ %70 = getelementptr i64, ptr addrspace(1) %1, i64 %54, !dbg !14
69
+ %71 = getelementptr i64, ptr addrspace(1) %1, i64 %55, !dbg !14
70
+ %72 = getelementptr i64, ptr addrspace(1) %1, i64 %56, !dbg !14
71
+ %73 = getelementptr i64, ptr addrspace(1) %1, i64 %57, !dbg !14
72
+ %74 = getelementptr i64, ptr addrspace(1) %1, i64 %58, !dbg !14
73
+ %75 = getelementptr i64, ptr addrspace(1) %1, i64 %59, !dbg !14
74
+ %76 = getelementptr i64, ptr addrspace(1) %1, i64 %60, !dbg !14
75
+ %77 = getelementptr i64, ptr addrspace(1) %1, i64 %61, !dbg !14
76
+ %78 = getelementptr i64, ptr addrspace(1) %1, i64 %62, !dbg !14
77
+ %79 = getelementptr i64, ptr addrspace(1) %1, i64 %63, !dbg !14
78
+ %80 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %64, i1 true) #3, !dbg !15
79
+ %81 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %65, i1 true) #3, !dbg !15
80
+ %82 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %66, i1 true) #3, !dbg !15
81
+ %83 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %67, i1 true) #3, !dbg !15
82
+ %84 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %68, i1 true) #3, !dbg !15
83
+ %85 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %69, i1 true) #3, !dbg !15
84
+ %86 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %70, i1 true) #3, !dbg !15
85
+ %87 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %71, i1 true) #3, !dbg !15
86
+ %88 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %72, i1 true) #3, !dbg !15
87
+ %89 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %73, i1 true) #3, !dbg !15
88
+ %90 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %74, i1 true) #3, !dbg !15
89
+ %91 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %75, i1 true) #3, !dbg !15
90
+ %92 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %76, i1 true) #3, !dbg !15
91
+ %93 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %77, i1 true) #3, !dbg !15
92
+ %94 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %78, i1 true) #3, !dbg !15
93
+ %95 = tail call i64 asm sideeffect "mov.u64 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b64 { $0 }, [ $1 + 0 ];", "=l,l,b"(ptr addrspace(1) %79, i1 true) #3, !dbg !15
94
+ %96 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %2, i1 true) #3, !dbg !16
95
+ %97 = bitcast i32 %96 to float, !dbg !16
96
+ %98 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.b32 { $0 }, [ $1 + 0 ];", "=r,l,b"(ptr addrspace(1) %3, i1 true) #3, !dbg !17
97
+ %99 = bitcast i32 %98 to float, !dbg !17
98
+ %100 = mul nsw i64 %48, 50257, !dbg !18
99
+ %101 = mul nsw i64 %49, 50257, !dbg !18
100
+ %102 = mul nsw i64 %50, 50257, !dbg !18
101
+ %103 = mul nsw i64 %51, 50257, !dbg !18
102
+ %104 = mul nsw i64 %52, 50257, !dbg !18
103
+ %105 = mul nsw i64 %53, 50257, !dbg !18
104
+ %106 = mul nsw i64 %54, 50257, !dbg !18
105
+ %107 = mul nsw i64 %55, 50257, !dbg !18
106
+ %108 = mul nsw i64 %56, 50257, !dbg !18
107
+ %109 = mul nsw i64 %57, 50257, !dbg !18
108
+ %110 = mul nsw i64 %58, 50257, !dbg !18
109
+ %111 = mul nsw i64 %59, 50257, !dbg !18
110
+ %112 = mul nsw i64 %60, 50257, !dbg !18
111
+ %113 = mul nsw i64 %61, 50257, !dbg !18
112
+ %114 = mul nsw i64 %62, 50257, !dbg !18
113
+ %115 = mul nsw i64 %63, 50257, !dbg !18
114
+ %116 = insertelement <16 x i64> poison, i64 %80, i64 0, !dbg !19
115
+ %117 = insertelement <16 x i64> %116, i64 %81, i64 1, !dbg !19
116
+ %118 = insertelement <16 x i64> %117, i64 %82, i64 2, !dbg !19
117
+ %119 = insertelement <16 x i64> %118, i64 %83, i64 3, !dbg !19
118
+ %120 = insertelement <16 x i64> %119, i64 %84, i64 4, !dbg !19
119
+ %121 = insertelement <16 x i64> %120, i64 %85, i64 5, !dbg !19
120
+ %122 = insertelement <16 x i64> %121, i64 %86, i64 6, !dbg !19
121
+ %123 = insertelement <16 x i64> %122, i64 %87, i64 7, !dbg !19
122
+ %124 = insertelement <16 x i64> %123, i64 %88, i64 8, !dbg !19
123
+ %125 = insertelement <16 x i64> %124, i64 %89, i64 9, !dbg !19
124
+ %126 = insertelement <16 x i64> %125, i64 %90, i64 10, !dbg !19
125
+ %127 = insertelement <16 x i64> %126, i64 %91, i64 11, !dbg !19
126
+ %128 = insertelement <16 x i64> %127, i64 %92, i64 12, !dbg !19
127
+ %129 = insertelement <16 x i64> %128, i64 %93, i64 13, !dbg !19
128
+ %130 = insertelement <16 x i64> %129, i64 %94, i64 14, !dbg !19
129
+ %131 = insertelement <16 x i64> %130, i64 %95, i64 15, !dbg !19
130
+ %132 = icmp eq <16 x i64> %131, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, !dbg !19
131
+ %133 = tail call float asm "div.full.f32 $0, $1, $2;", "=r,r,r"(float %97, float %99) #3, !dbg !20
132
+ %134 = insertelement <16 x float> poison, float %133, i64 0, !dbg !21
133
+ %135 = shufflevector <16 x float> %134, <16 x float> poison, <16 x i32> zeroinitializer, !dbg !21
134
+ %136 = select <16 x i1> %132, <16 x float> zeroinitializer, <16 x float> %135, !dbg !21
135
+ %137 = getelementptr float, ptr addrspace(1) %0, i64 %100
136
+ %138 = getelementptr float, ptr addrspace(1) %0, i64 %101
137
+ %139 = getelementptr float, ptr addrspace(1) %0, i64 %102
138
+ %140 = getelementptr float, ptr addrspace(1) %0, i64 %103
139
+ %141 = getelementptr float, ptr addrspace(1) %0, i64 %104
140
+ %142 = getelementptr float, ptr addrspace(1) %0, i64 %105
141
+ %143 = getelementptr float, ptr addrspace(1) %0, i64 %106
142
+ %144 = getelementptr float, ptr addrspace(1) %0, i64 %107
143
+ %145 = getelementptr float, ptr addrspace(1) %0, i64 %108
144
+ %146 = getelementptr float, ptr addrspace(1) %0, i64 %109
145
+ %147 = getelementptr float, ptr addrspace(1) %0, i64 %110
146
+ %148 = getelementptr float, ptr addrspace(1) %0, i64 %111
147
+ %149 = getelementptr float, ptr addrspace(1) %0, i64 %112
148
+ %150 = getelementptr float, ptr addrspace(1) %0, i64 %113
149
+ %151 = getelementptr float, ptr addrspace(1) %0, i64 %114
150
+ %152 = getelementptr float, ptr addrspace(1) %0, i64 %115
151
+ br label %153, !dbg !22
152
+
153
+ 153: ; preds = %9, %153
154
+ %154 = phi i32 [ 0, %9 ], [ %213, %153 ]
155
+ %155 = phi <16 x float> [ zeroinitializer, %9 ], [ %212, %153 ]
156
+ %156 = or i32 %154, %urem, !dbg !23
157
+ %157 = zext nneg i32 %156 to i64, !dbg !23
158
+ %158 = icmp ult i32 %156, 50257, !dbg !24
159
+ %159 = getelementptr float, ptr addrspace(1) %137, i64 %157, !dbg !25
160
+ %160 = getelementptr float, ptr addrspace(1) %138, i64 %157, !dbg !25
161
+ %161 = getelementptr float, ptr addrspace(1) %139, i64 %157, !dbg !25
162
+ %162 = getelementptr float, ptr addrspace(1) %140, i64 %157, !dbg !25
163
+ %163 = getelementptr float, ptr addrspace(1) %141, i64 %157, !dbg !25
164
+ %164 = getelementptr float, ptr addrspace(1) %142, i64 %157, !dbg !25
165
+ %165 = getelementptr float, ptr addrspace(1) %143, i64 %157, !dbg !25
166
+ %166 = getelementptr float, ptr addrspace(1) %144, i64 %157, !dbg !25
167
+ %167 = getelementptr float, ptr addrspace(1) %145, i64 %157, !dbg !25
168
+ %168 = getelementptr float, ptr addrspace(1) %146, i64 %157, !dbg !25
169
+ %169 = getelementptr float, ptr addrspace(1) %147, i64 %157, !dbg !25
170
+ %170 = getelementptr float, ptr addrspace(1) %148, i64 %157, !dbg !25
171
+ %171 = getelementptr float, ptr addrspace(1) %149, i64 %157, !dbg !25
172
+ %172 = getelementptr float, ptr addrspace(1) %150, i64 %157, !dbg !25
173
+ %173 = getelementptr float, ptr addrspace(1) %151, i64 %157, !dbg !25
174
+ %174 = getelementptr float, ptr addrspace(1) %152, i64 %157, !dbg !25
175
+ %175 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %159, i1 %158, i32 0, i1 %158) #3, !dbg !26
176
+ %176 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %160, i1 %158, i32 0, i1 %158) #3, !dbg !26
177
+ %177 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %161, i1 %158, i32 0, i1 %158) #3, !dbg !26
178
+ %178 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %162, i1 %158, i32 0, i1 %158) #3, !dbg !26
179
+ %179 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %163, i1 %158, i32 0, i1 %158) #3, !dbg !26
180
+ %180 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %164, i1 %158, i32 0, i1 %158) #3, !dbg !26
181
+ %181 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %165, i1 %158, i32 0, i1 %158) #3, !dbg !26
182
+ %182 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %166, i1 %158, i32 0, i1 %158) #3, !dbg !26
183
+ %183 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %167, i1 %158, i32 0, i1 %158) #3, !dbg !26
184
+ %184 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %168, i1 %158, i32 0, i1 %158) #3, !dbg !26
185
+ %185 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %169, i1 %158, i32 0, i1 %158) #3, !dbg !26
186
+ %186 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %170, i1 %158, i32 0, i1 %158) #3, !dbg !26
187
+ %187 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %171, i1 %158, i32 0, i1 %158) #3, !dbg !26
188
+ %188 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %172, i1 %158, i32 0, i1 %158) #3, !dbg !26
189
+ %189 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %173, i1 %158, i32 0, i1 %158) #3, !dbg !26
190
+ %190 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_last.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %174, i1 %158, i32 0, i1 %158) #3, !dbg !26
191
+ %191 = insertelement <16 x i32> poison, i32 %175, i64 0, !dbg !26
192
+ %192 = insertelement <16 x i32> %191, i32 %176, i64 1, !dbg !26
193
+ %193 = insertelement <16 x i32> %192, i32 %177, i64 2, !dbg !26
194
+ %194 = insertelement <16 x i32> %193, i32 %178, i64 3, !dbg !26
195
+ %195 = insertelement <16 x i32> %194, i32 %179, i64 4, !dbg !26
196
+ %196 = insertelement <16 x i32> %195, i32 %180, i64 5, !dbg !26
197
+ %197 = insertelement <16 x i32> %196, i32 %181, i64 6, !dbg !26
198
+ %198 = insertelement <16 x i32> %197, i32 %182, i64 7, !dbg !26
199
+ %199 = insertelement <16 x i32> %198, i32 %183, i64 8, !dbg !26
200
+ %200 = insertelement <16 x i32> %199, i32 %184, i64 9, !dbg !26
201
+ %201 = insertelement <16 x i32> %200, i32 %185, i64 10, !dbg !26
202
+ %202 = insertelement <16 x i32> %201, i32 %186, i64 11, !dbg !26
203
+ %203 = insertelement <16 x i32> %202, i32 %187, i64 12, !dbg !26
204
+ %204 = insertelement <16 x i32> %203, i32 %188, i64 13, !dbg !26
205
+ %205 = insertelement <16 x i32> %204, i32 %189, i64 14, !dbg !26
206
+ %206 = insertelement <16 x i32> %205, i32 %190, i64 15, !dbg !26
207
+ %207 = bitcast <16 x i32> %206 to <16 x float>, !dbg !26
208
+ %208 = fmul <16 x float> %136, %207, !dbg !27
209
+ %209 = insertelement <16 x i1> poison, i1 %158, i64 0, !dbg !28
210
+ %210 = shufflevector <16 x i1> %209, <16 x i1> poison, <16 x i32> zeroinitializer, !dbg !28
211
+ %211 = select <16 x i1> %210, <16 x float> %208, <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, !dbg !28
212
+ %212 = fadd <16 x float> %155, %211, !dbg !28
213
+ %213 = add nuw nsw i32 %154, 64, !dbg !22
214
+ %214 = icmp ult i32 %154, 50193, !dbg !22
215
+ br i1 %214, label %153, label %215, !dbg !22
216
+
217
+ 215: ; preds = %153
218
+ %216 = lshr i32 %10, 5, !dbg !8
219
+ %217 = and i32 %216, 1, !dbg !9
220
+ %218 = extractelement <16 x float> %212, i64 0, !dbg !29
221
+ %219 = bitcast float %218 to i32, !dbg !29
222
+ %220 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %219, i32 16, i32 31), !dbg !29
223
+ %221 = bitcast i32 %220 to float, !dbg !29
224
+ %222 = fadd float %218, %221, !dbg !33
225
+ %223 = bitcast float %222 to i32, !dbg !29
226
+ %224 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %223, i32 8, i32 31), !dbg !29
227
+ %225 = bitcast i32 %224 to float, !dbg !29
228
+ %226 = fadd float %222, %225, !dbg !33
229
+ %227 = bitcast float %226 to i32, !dbg !29
230
+ %228 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %227, i32 4, i32 31), !dbg !29
231
+ %229 = bitcast i32 %228 to float, !dbg !29
232
+ %230 = fadd float %226, %229, !dbg !33
233
+ %231 = bitcast float %230 to i32, !dbg !29
234
+ %232 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %231, i32 2, i32 31), !dbg !29
235
+ %233 = bitcast i32 %232 to float, !dbg !29
236
+ %234 = fadd float %230, %233, !dbg !33
237
+ %235 = bitcast float %234 to i32, !dbg !29
238
+ %236 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %235, i32 1, i32 31), !dbg !29
239
+ %237 = bitcast i32 %236 to float, !dbg !29
240
+ %238 = fadd float %234, %237, !dbg !33
241
+ %239 = extractelement <16 x float> %212, i64 1, !dbg !29
242
+ %240 = bitcast float %239 to i32, !dbg !29
243
+ %241 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %240, i32 16, i32 31), !dbg !29
244
+ %242 = bitcast i32 %241 to float, !dbg !29
245
+ %243 = fadd float %239, %242, !dbg !33
246
+ %244 = bitcast float %243 to i32, !dbg !29
247
+ %245 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %244, i32 8, i32 31), !dbg !29
248
+ %246 = bitcast i32 %245 to float, !dbg !29
249
+ %247 = fadd float %243, %246, !dbg !33
250
+ %248 = bitcast float %247 to i32, !dbg !29
251
+ %249 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %248, i32 4, i32 31), !dbg !29
252
+ %250 = bitcast i32 %249 to float, !dbg !29
253
+ %251 = fadd float %247, %250, !dbg !33
254
+ %252 = bitcast float %251 to i32, !dbg !29
255
+ %253 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %252, i32 2, i32 31), !dbg !29
256
+ %254 = bitcast i32 %253 to float, !dbg !29
257
+ %255 = fadd float %251, %254, !dbg !33
258
+ %256 = bitcast float %255 to i32, !dbg !29
259
+ %257 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %256, i32 1, i32 31), !dbg !29
260
+ %258 = bitcast i32 %257 to float, !dbg !29
261
+ %259 = fadd float %255, %258, !dbg !33
262
+ %260 = extractelement <16 x float> %212, i64 2, !dbg !29
263
+ %261 = bitcast float %260 to i32, !dbg !29
264
+ %262 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %261, i32 16, i32 31), !dbg !29
265
+ %263 = bitcast i32 %262 to float, !dbg !29
266
+ %264 = fadd float %260, %263, !dbg !33
267
+ %265 = bitcast float %264 to i32, !dbg !29
268
+ %266 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %265, i32 8, i32 31), !dbg !29
269
+ %267 = bitcast i32 %266 to float, !dbg !29
270
+ %268 = fadd float %264, %267, !dbg !33
271
+ %269 = bitcast float %268 to i32, !dbg !29
272
+ %270 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %269, i32 4, i32 31), !dbg !29
273
+ %271 = bitcast i32 %270 to float, !dbg !29
274
+ %272 = fadd float %268, %271, !dbg !33
275
+ %273 = bitcast float %272 to i32, !dbg !29
276
+ %274 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %273, i32 2, i32 31), !dbg !29
277
+ %275 = bitcast i32 %274 to float, !dbg !29
278
+ %276 = fadd float %272, %275, !dbg !33
279
+ %277 = bitcast float %276 to i32, !dbg !29
280
+ %278 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %277, i32 1, i32 31), !dbg !29
281
+ %279 = bitcast i32 %278 to float, !dbg !29
282
+ %280 = fadd float %276, %279, !dbg !33
283
+ %281 = extractelement <16 x float> %212, i64 3, !dbg !29
284
+ %282 = bitcast float %281 to i32, !dbg !29
285
+ %283 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %282, i32 16, i32 31), !dbg !29
286
+ %284 = bitcast i32 %283 to float, !dbg !29
287
+ %285 = fadd float %281, %284, !dbg !33
288
+ %286 = bitcast float %285 to i32, !dbg !29
289
+ %287 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %286, i32 8, i32 31), !dbg !29
290
+ %288 = bitcast i32 %287 to float, !dbg !29
291
+ %289 = fadd float %285, %288, !dbg !33
292
+ %290 = bitcast float %289 to i32, !dbg !29
293
+ %291 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %290, i32 4, i32 31), !dbg !29
294
+ %292 = bitcast i32 %291 to float, !dbg !29
295
+ %293 = fadd float %289, %292, !dbg !33
296
+ %294 = bitcast float %293 to i32, !dbg !29
297
+ %295 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %294, i32 2, i32 31), !dbg !29
298
+ %296 = bitcast i32 %295 to float, !dbg !29
299
+ %297 = fadd float %293, %296, !dbg !33
300
+ %298 = bitcast float %297 to i32, !dbg !29
301
+ %299 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %298, i32 1, i32 31), !dbg !29
302
+ %300 = bitcast i32 %299 to float, !dbg !29
303
+ %301 = fadd float %297, %300, !dbg !33
304
+ %302 = extractelement <16 x float> %212, i64 4, !dbg !29
305
+ %303 = bitcast float %302 to i32, !dbg !29
306
+ %304 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %303, i32 16, i32 31), !dbg !29
307
+ %305 = bitcast i32 %304 to float, !dbg !29
308
+ %306 = fadd float %302, %305, !dbg !33
309
+ %307 = bitcast float %306 to i32, !dbg !29
310
+ %308 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %307, i32 8, i32 31), !dbg !29
311
+ %309 = bitcast i32 %308 to float, !dbg !29
312
+ %310 = fadd float %306, %309, !dbg !33
313
+ %311 = bitcast float %310 to i32, !dbg !29
314
+ %312 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %311, i32 4, i32 31), !dbg !29
315
+ %313 = bitcast i32 %312 to float, !dbg !29
316
+ %314 = fadd float %310, %313, !dbg !33
317
+ %315 = bitcast float %314 to i32, !dbg !29
318
+ %316 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %315, i32 2, i32 31), !dbg !29
319
+ %317 = bitcast i32 %316 to float, !dbg !29
320
+ %318 = fadd float %314, %317, !dbg !33
321
+ %319 = bitcast float %318 to i32, !dbg !29
322
+ %320 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %319, i32 1, i32 31), !dbg !29
323
+ %321 = bitcast i32 %320 to float, !dbg !29
324
+ %322 = fadd float %318, %321, !dbg !33
325
+ %323 = extractelement <16 x float> %212, i64 5, !dbg !29
326
+ %324 = bitcast float %323 to i32, !dbg !29
327
+ %325 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %324, i32 16, i32 31), !dbg !29
328
+ %326 = bitcast i32 %325 to float, !dbg !29
329
+ %327 = fadd float %323, %326, !dbg !33
330
+ %328 = bitcast float %327 to i32, !dbg !29
331
+ %329 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %328, i32 8, i32 31), !dbg !29
332
+ %330 = bitcast i32 %329 to float, !dbg !29
333
+ %331 = fadd float %327, %330, !dbg !33
334
+ %332 = bitcast float %331 to i32, !dbg !29
335
+ %333 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %332, i32 4, i32 31), !dbg !29
336
+ %334 = bitcast i32 %333 to float, !dbg !29
337
+ %335 = fadd float %331, %334, !dbg !33
338
+ %336 = bitcast float %335 to i32, !dbg !29
339
+ %337 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %336, i32 2, i32 31), !dbg !29
340
+ %338 = bitcast i32 %337 to float, !dbg !29
341
+ %339 = fadd float %335, %338, !dbg !33
342
+ %340 = bitcast float %339 to i32, !dbg !29
343
+ %341 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %340, i32 1, i32 31), !dbg !29
344
+ %342 = bitcast i32 %341 to float, !dbg !29
345
+ %343 = fadd float %339, %342, !dbg !33
346
+ %344 = extractelement <16 x float> %212, i64 6, !dbg !29
347
+ %345 = bitcast float %344 to i32, !dbg !29
348
+ %346 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %345, i32 16, i32 31), !dbg !29
349
+ %347 = bitcast i32 %346 to float, !dbg !29
350
+ %348 = fadd float %344, %347, !dbg !33
351
+ %349 = bitcast float %348 to i32, !dbg !29
352
+ %350 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %349, i32 8, i32 31), !dbg !29
353
+ %351 = bitcast i32 %350 to float, !dbg !29
354
+ %352 = fadd float %348, %351, !dbg !33
355
+ %353 = bitcast float %352 to i32, !dbg !29
356
+ %354 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %353, i32 4, i32 31), !dbg !29
357
+ %355 = bitcast i32 %354 to float, !dbg !29
358
+ %356 = fadd float %352, %355, !dbg !33
359
+ %357 = bitcast float %356 to i32, !dbg !29
360
+ %358 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %357, i32 2, i32 31), !dbg !29
361
+ %359 = bitcast i32 %358 to float, !dbg !29
362
+ %360 = fadd float %356, %359, !dbg !33
363
+ %361 = bitcast float %360 to i32, !dbg !29
364
+ %362 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %361, i32 1, i32 31), !dbg !29
365
+ %363 = bitcast i32 %362 to float, !dbg !29
366
+ %364 = fadd float %360, %363, !dbg !33
367
+ %365 = extractelement <16 x float> %212, i64 7, !dbg !29
368
+ %366 = bitcast float %365 to i32, !dbg !29
369
+ %367 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %366, i32 16, i32 31), !dbg !29
370
+ %368 = bitcast i32 %367 to float, !dbg !29
371
+ %369 = fadd float %365, %368, !dbg !33
372
+ %370 = bitcast float %369 to i32, !dbg !29
373
+ %371 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %370, i32 8, i32 31), !dbg !29
374
+ %372 = bitcast i32 %371 to float, !dbg !29
375
+ %373 = fadd float %369, %372, !dbg !33
376
+ %374 = bitcast float %373 to i32, !dbg !29
377
+ %375 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %374, i32 4, i32 31), !dbg !29
378
+ %376 = bitcast i32 %375 to float, !dbg !29
379
+ %377 = fadd float %373, %376, !dbg !33
380
+ %378 = bitcast float %377 to i32, !dbg !29
381
+ %379 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %378, i32 2, i32 31), !dbg !29
382
+ %380 = bitcast i32 %379 to float, !dbg !29
383
+ %381 = fadd float %377, %380, !dbg !33
384
+ %382 = bitcast float %381 to i32, !dbg !29
385
+ %383 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %382, i32 1, i32 31), !dbg !29
386
+ %384 = bitcast i32 %383 to float, !dbg !29
387
+ %385 = fadd float %381, %384, !dbg !33
388
+ %386 = extractelement <16 x float> %212, i64 8, !dbg !29
389
+ %387 = bitcast float %386 to i32, !dbg !29
390
+ %388 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %387, i32 16, i32 31), !dbg !29
391
+ %389 = bitcast i32 %388 to float, !dbg !29
392
+ %390 = fadd float %386, %389, !dbg !33
393
+ %391 = bitcast float %390 to i32, !dbg !29
394
+ %392 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %391, i32 8, i32 31), !dbg !29
395
+ %393 = bitcast i32 %392 to float, !dbg !29
396
+ %394 = fadd float %390, %393, !dbg !33
397
+ %395 = bitcast float %394 to i32, !dbg !29
398
+ %396 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %395, i32 4, i32 31), !dbg !29
399
+ %397 = bitcast i32 %396 to float, !dbg !29
400
+ %398 = fadd float %394, %397, !dbg !33
401
+ %399 = bitcast float %398 to i32, !dbg !29
402
+ %400 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %399, i32 2, i32 31), !dbg !29
403
+ %401 = bitcast i32 %400 to float, !dbg !29
404
+ %402 = fadd float %398, %401, !dbg !33
405
+ %403 = bitcast float %402 to i32, !dbg !29
406
+ %404 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %403, i32 1, i32 31), !dbg !29
407
+ %405 = bitcast i32 %404 to float, !dbg !29
408
+ %406 = fadd float %402, %405, !dbg !33
409
+ %407 = extractelement <16 x float> %212, i64 9, !dbg !29
410
+ %408 = bitcast float %407 to i32, !dbg !29
411
+ %409 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %408, i32 16, i32 31), !dbg !29
412
+ %410 = bitcast i32 %409 to float, !dbg !29
413
+ %411 = fadd float %407, %410, !dbg !33
414
+ %412 = bitcast float %411 to i32, !dbg !29
415
+ %413 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %412, i32 8, i32 31), !dbg !29
416
+ %414 = bitcast i32 %413 to float, !dbg !29
417
+ %415 = fadd float %411, %414, !dbg !33
418
+ %416 = bitcast float %415 to i32, !dbg !29
419
+ %417 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %416, i32 4, i32 31), !dbg !29
420
+ %418 = bitcast i32 %417 to float, !dbg !29
421
+ %419 = fadd float %415, %418, !dbg !33
422
+ %420 = bitcast float %419 to i32, !dbg !29
423
+ %421 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %420, i32 2, i32 31), !dbg !29
424
+ %422 = bitcast i32 %421 to float, !dbg !29
425
+ %423 = fadd float %419, %422, !dbg !33
426
+ %424 = bitcast float %423 to i32, !dbg !29
427
+ %425 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %424, i32 1, i32 31), !dbg !29
428
+ %426 = bitcast i32 %425 to float, !dbg !29
429
+ %427 = fadd float %423, %426, !dbg !33
430
+ %428 = extractelement <16 x float> %212, i64 10, !dbg !29
431
+ %429 = bitcast float %428 to i32, !dbg !29
432
+ %430 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %429, i32 16, i32 31), !dbg !29
433
+ %431 = bitcast i32 %430 to float, !dbg !29
434
+ %432 = fadd float %428, %431, !dbg !33
435
+ %433 = bitcast float %432 to i32, !dbg !29
436
+ %434 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %433, i32 8, i32 31), !dbg !29
437
+ %435 = bitcast i32 %434 to float, !dbg !29
438
+ %436 = fadd float %432, %435, !dbg !33
439
+ %437 = bitcast float %436 to i32, !dbg !29
440
+ %438 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %437, i32 4, i32 31), !dbg !29
441
+ %439 = bitcast i32 %438 to float, !dbg !29
442
+ %440 = fadd float %436, %439, !dbg !33
443
+ %441 = bitcast float %440 to i32, !dbg !29
444
+ %442 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %441, i32 2, i32 31), !dbg !29
445
+ %443 = bitcast i32 %442 to float, !dbg !29
446
+ %444 = fadd float %440, %443, !dbg !33
447
+ %445 = bitcast float %444 to i32, !dbg !29
448
+ %446 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %445, i32 1, i32 31), !dbg !29
449
+ %447 = bitcast i32 %446 to float, !dbg !29
450
+ %448 = fadd float %444, %447, !dbg !33
451
+ %449 = extractelement <16 x float> %212, i64 11, !dbg !29
452
+ %450 = bitcast float %449 to i32, !dbg !29
453
+ %451 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %450, i32 16, i32 31), !dbg !29
454
+ %452 = bitcast i32 %451 to float, !dbg !29
455
+ %453 = fadd float %449, %452, !dbg !33
456
+ %454 = bitcast float %453 to i32, !dbg !29
457
+ %455 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %454, i32 8, i32 31), !dbg !29
458
+ %456 = bitcast i32 %455 to float, !dbg !29
459
+ %457 = fadd float %453, %456, !dbg !33
460
+ %458 = bitcast float %457 to i32, !dbg !29
461
+ %459 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %458, i32 4, i32 31), !dbg !29
462
+ %460 = bitcast i32 %459 to float, !dbg !29
463
+ %461 = fadd float %457, %460, !dbg !33
464
+ %462 = bitcast float %461 to i32, !dbg !29
465
+ %463 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %462, i32 2, i32 31), !dbg !29
466
+ %464 = bitcast i32 %463 to float, !dbg !29
467
+ %465 = fadd float %461, %464, !dbg !33
468
+ %466 = bitcast float %465 to i32, !dbg !29
469
+ %467 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %466, i32 1, i32 31), !dbg !29
470
+ %468 = bitcast i32 %467 to float, !dbg !29
471
+ %469 = fadd float %465, %468, !dbg !33
472
+ %470 = extractelement <16 x float> %212, i64 12, !dbg !29
473
+ %471 = bitcast float %470 to i32, !dbg !29
474
+ %472 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %471, i32 16, i32 31), !dbg !29
475
+ %473 = bitcast i32 %472 to float, !dbg !29
476
+ %474 = fadd float %470, %473, !dbg !33
477
+ %475 = bitcast float %474 to i32, !dbg !29
478
+ %476 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %475, i32 8, i32 31), !dbg !29
479
+ %477 = bitcast i32 %476 to float, !dbg !29
480
+ %478 = fadd float %474, %477, !dbg !33
481
+ %479 = bitcast float %478 to i32, !dbg !29
482
+ %480 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %479, i32 4, i32 31), !dbg !29
483
+ %481 = bitcast i32 %480 to float, !dbg !29
484
+ %482 = fadd float %478, %481, !dbg !33
485
+ %483 = bitcast float %482 to i32, !dbg !29
486
+ %484 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %483, i32 2, i32 31), !dbg !29
487
+ %485 = bitcast i32 %484 to float, !dbg !29
488
+ %486 = fadd float %482, %485, !dbg !33
489
+ %487 = bitcast float %486 to i32, !dbg !29
490
+ %488 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %487, i32 1, i32 31), !dbg !29
491
+ %489 = bitcast i32 %488 to float, !dbg !29
492
+ %490 = fadd float %486, %489, !dbg !33
493
+ %491 = extractelement <16 x float> %212, i64 13, !dbg !29
494
+ %492 = bitcast float %491 to i32, !dbg !29
495
+ %493 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %492, i32 16, i32 31), !dbg !29
496
+ %494 = bitcast i32 %493 to float, !dbg !29
497
+ %495 = fadd float %491, %494, !dbg !33
498
+ %496 = bitcast float %495 to i32, !dbg !29
499
+ %497 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %496, i32 8, i32 31), !dbg !29
500
+ %498 = bitcast i32 %497 to float, !dbg !29
501
+ %499 = fadd float %495, %498, !dbg !33
502
+ %500 = bitcast float %499 to i32, !dbg !29
503
+ %501 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %500, i32 4, i32 31), !dbg !29
504
+ %502 = bitcast i32 %501 to float, !dbg !29
505
+ %503 = fadd float %499, %502, !dbg !33
506
+ %504 = bitcast float %503 to i32, !dbg !29
507
+ %505 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %504, i32 2, i32 31), !dbg !29
508
+ %506 = bitcast i32 %505 to float, !dbg !29
509
+ %507 = fadd float %503, %506, !dbg !33
510
+ %508 = bitcast float %507 to i32, !dbg !29
511
+ %509 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %508, i32 1, i32 31), !dbg !29
512
+ %510 = bitcast i32 %509 to float, !dbg !29
513
+ %511 = fadd float %507, %510, !dbg !33
514
+ %512 = extractelement <16 x float> %212, i64 14, !dbg !29
515
+ %513 = bitcast float %512 to i32, !dbg !29
516
+ %514 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %513, i32 16, i32 31), !dbg !29
517
+ %515 = bitcast i32 %514 to float, !dbg !29
518
+ %516 = fadd float %512, %515, !dbg !33
519
+ %517 = bitcast float %516 to i32, !dbg !29
520
+ %518 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %517, i32 8, i32 31), !dbg !29
521
+ %519 = bitcast i32 %518 to float, !dbg !29
522
+ %520 = fadd float %516, %519, !dbg !33
523
+ %521 = bitcast float %520 to i32, !dbg !29
524
+ %522 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %521, i32 4, i32 31), !dbg !29
525
+ %523 = bitcast i32 %522 to float, !dbg !29
526
+ %524 = fadd float %520, %523, !dbg !33
527
+ %525 = bitcast float %524 to i32, !dbg !29
528
+ %526 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %525, i32 2, i32 31), !dbg !29
529
+ %527 = bitcast i32 %526 to float, !dbg !29
530
+ %528 = fadd float %524, %527, !dbg !33
531
+ %529 = bitcast float %528 to i32, !dbg !29
532
+ %530 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %529, i32 1, i32 31), !dbg !29
533
+ %531 = bitcast i32 %530 to float, !dbg !29
534
+ %532 = fadd float %528, %531, !dbg !33
535
+ %533 = extractelement <16 x float> %212, i64 15, !dbg !29
536
+ %534 = bitcast float %533 to i32, !dbg !29
537
+ %535 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %534, i32 16, i32 31), !dbg !29
538
+ %536 = bitcast i32 %535 to float, !dbg !29
539
+ %537 = fadd float %533, %536, !dbg !33
540
+ %538 = bitcast float %537 to i32, !dbg !29
541
+ %539 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %538, i32 8, i32 31), !dbg !29
542
+ %540 = bitcast i32 %539 to float, !dbg !29
543
+ %541 = fadd float %537, %540, !dbg !33
544
+ %542 = bitcast float %541 to i32, !dbg !29
545
+ %543 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %542, i32 4, i32 31), !dbg !29
546
+ %544 = bitcast i32 %543 to float, !dbg !29
547
+ %545 = fadd float %541, %544, !dbg !33
548
+ %546 = bitcast float %545 to i32, !dbg !29
549
+ %547 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %546, i32 2, i32 31), !dbg !29
550
+ %548 = bitcast i32 %547 to float, !dbg !29
551
+ %549 = fadd float %545, %548, !dbg !33
552
+ %550 = bitcast float %549 to i32, !dbg !29
553
+ %551 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %550, i32 1, i32 31), !dbg !29
554
+ %552 = bitcast i32 %551 to float, !dbg !29
555
+ %553 = fadd float %549, %552, !dbg !33
556
+ %554 = icmp eq i32 %11, 0, !dbg !29
557
+ %555 = shl nuw nsw i32 %13, 1, !dbg !29
558
+ %556 = or i32 %555, %217, !dbg !29
559
+ %557 = zext nneg i32 %556 to i64, !dbg !29
560
+ %558 = getelementptr float, ptr addrspace(3) @global_smem, i64 %557, !dbg !29
561
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %558, float %238, i1 %554) #3, !dbg !29
562
+ %559 = shl nuw nsw i32 %14, 1, !dbg !29
563
+ %560 = or i32 %559, %217, !dbg !29
564
+ %561 = zext nneg i32 %560 to i64, !dbg !29
565
+ %562 = getelementptr float, ptr addrspace(3) @global_smem, i64 %561, !dbg !29
566
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %562, float %259, i1 %554) #3, !dbg !29
567
+ %563 = shl nuw nsw i32 %15, 1, !dbg !29
568
+ %564 = or i32 %563, %217, !dbg !29
569
+ %565 = zext nneg i32 %564 to i64, !dbg !29
570
+ %566 = getelementptr float, ptr addrspace(3) @global_smem, i64 %565, !dbg !29
571
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %566, float %280, i1 %554) #3, !dbg !29
572
+ %567 = shl nuw nsw i32 %16, 1, !dbg !29
573
+ %568 = or i32 %567, %217, !dbg !29
574
+ %569 = zext nneg i32 %568 to i64, !dbg !29
575
+ %570 = getelementptr float, ptr addrspace(3) @global_smem, i64 %569, !dbg !29
576
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %570, float %301, i1 %554) #3, !dbg !29
577
+ %571 = shl nuw nsw i32 %17, 1, !dbg !29
578
+ %572 = or i32 %571, %217, !dbg !29
579
+ %573 = zext nneg i32 %572 to i64, !dbg !29
580
+ %574 = getelementptr float, ptr addrspace(3) @global_smem, i64 %573, !dbg !29
581
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %574, float %322, i1 %554) #3, !dbg !29
582
+ %575 = shl nuw nsw i32 %18, 1, !dbg !29
583
+ %576 = or i32 %575, %217, !dbg !29
584
+ %577 = zext nneg i32 %576 to i64, !dbg !29
585
+ %578 = getelementptr float, ptr addrspace(3) @global_smem, i64 %577, !dbg !29
586
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %578, float %343, i1 %554) #3, !dbg !29
587
+ %579 = shl nuw nsw i32 %19, 1, !dbg !29
588
+ %580 = or i32 %579, %217, !dbg !29
589
+ %581 = zext nneg i32 %580 to i64, !dbg !29
590
+ %582 = getelementptr float, ptr addrspace(3) @global_smem, i64 %581, !dbg !29
591
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %582, float %364, i1 %554) #3, !dbg !29
592
+ %583 = shl nuw nsw i32 %20, 1, !dbg !29
593
+ %584 = or i32 %583, %217, !dbg !29
594
+ %585 = zext nneg i32 %584 to i64, !dbg !29
595
+ %586 = getelementptr float, ptr addrspace(3) @global_smem, i64 %585, !dbg !29
596
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %586, float %385, i1 %554) #3, !dbg !29
597
+ %587 = shl nuw nsw i32 %21, 1, !dbg !29
598
+ %588 = or i32 %587, %217, !dbg !29
599
+ %589 = zext nneg i32 %588 to i64, !dbg !29
600
+ %590 = getelementptr float, ptr addrspace(3) @global_smem, i64 %589, !dbg !29
601
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %590, float %406, i1 %554) #3, !dbg !29
602
+ %591 = shl nuw nsw i32 %22, 1, !dbg !29
603
+ %592 = or i32 %591, %217, !dbg !29
604
+ %593 = zext nneg i32 %592 to i64, !dbg !29
605
+ %594 = getelementptr float, ptr addrspace(3) @global_smem, i64 %593, !dbg !29
606
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %594, float %427, i1 %554) #3, !dbg !29
607
+ %595 = shl nuw nsw i32 %23, 1, !dbg !29
608
+ %596 = or i32 %595, %217, !dbg !29
609
+ %597 = zext nneg i32 %596 to i64, !dbg !29
610
+ %598 = getelementptr float, ptr addrspace(3) @global_smem, i64 %597, !dbg !29
611
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %598, float %448, i1 %554) #3, !dbg !29
612
+ %599 = shl nuw nsw i32 %24, 1, !dbg !29
613
+ %600 = or i32 %599, %217, !dbg !29
614
+ %601 = zext nneg i32 %600 to i64, !dbg !29
615
+ %602 = getelementptr float, ptr addrspace(3) @global_smem, i64 %601, !dbg !29
616
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %602, float %469, i1 %554) #3, !dbg !29
617
+ %603 = shl nuw nsw i32 %25, 1, !dbg !29
618
+ %604 = or i32 %603, %217, !dbg !29
619
+ %605 = zext nneg i32 %604 to i64, !dbg !29
620
+ %606 = getelementptr float, ptr addrspace(3) @global_smem, i64 %605, !dbg !29
621
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %606, float %490, i1 %554) #3, !dbg !29
622
+ %607 = shl nuw nsw i32 %26, 1, !dbg !29
623
+ %608 = or i32 %607, %217, !dbg !29
624
+ %609 = zext nneg i32 %608 to i64, !dbg !29
625
+ %610 = getelementptr float, ptr addrspace(3) @global_smem, i64 %609, !dbg !29
626
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %610, float %511, i1 %554) #3, !dbg !29
627
+ %611 = shl nuw nsw i32 %27, 1, !dbg !29
628
+ %612 = or i32 %611, %217, !dbg !29
629
+ %613 = zext nneg i32 %612 to i64, !dbg !29
630
+ %614 = getelementptr float, ptr addrspace(3) @global_smem, i64 %613, !dbg !29
631
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %614, float %532, i1 %554) #3, !dbg !29
632
+ %615 = shl nuw nsw i32 %28, 1, !dbg !29
633
+ %616 = or i32 %615, %217, !dbg !29
634
+ %617 = zext nneg i32 %616 to i64, !dbg !29
635
+ %618 = getelementptr float, ptr addrspace(3) @global_smem, i64 %617, !dbg !29
636
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %618, float %553, i1 %554) #3, !dbg !29
637
+ tail call void @llvm.nvvm.barrier0(), !dbg !29
638
+ %619 = icmp slt i32 %10, 128, !dbg !29
639
+ %620 = sext i32 %10 to i64, !dbg !29
640
+ %621 = getelementptr float, ptr addrspace(3) @global_smem, i64 %620, !dbg !29
641
+ %622 = tail call float asm sideeffect "@$2 ld.shared.b32 $0, [ $1 + 0 ];", "=r,r,b"(ptr addrspace(3) %621, i1 %619) #3, !dbg !29
642
+ %623 = bitcast float %622 to i32, !dbg !29
643
+ %624 = tail call i32 @llvm.nvvm.shfl.sync.bfly.i32(i32 -1, i32 %623, i32 1, i32 31), !dbg !29
644
+ %625 = bitcast i32 %624 to float, !dbg !29
645
+ %626 = fadd float %622, %625, !dbg !33
646
+ %627 = and i32 %10, 1, !dbg !29
647
+ %628 = icmp eq i32 %627, 0, !dbg !29
648
+ %629 = and i1 %619, %628, !dbg !29
649
+ tail call void asm sideeffect "@$2 st.shared.b32 [ $0 + 0 ], $1;", "r,r,b"(ptr addrspace(3) %621, float %626, i1 %629) #3, !dbg !29
650
+ tail call void @llvm.nvvm.barrier0(), !dbg !29
651
+ %630 = zext nneg i32 %555 to i64, !dbg !29
652
+ %631 = getelementptr float, ptr addrspace(3) @global_smem, i64 %630, !dbg !29
653
+ %632 = load float, ptr addrspace(3) %631, align 4, !dbg !29
654
+ %633 = zext nneg i32 %559 to i64, !dbg !29
655
+ %634 = getelementptr float, ptr addrspace(3) @global_smem, i64 %633, !dbg !29
656
+ %635 = load float, ptr addrspace(3) %634, align 4, !dbg !29
657
+ %636 = zext nneg i32 %563 to i64, !dbg !29
658
+ %637 = getelementptr float, ptr addrspace(3) @global_smem, i64 %636, !dbg !29
659
+ %638 = load float, ptr addrspace(3) %637, align 4, !dbg !29
660
+ %639 = zext nneg i32 %567 to i64, !dbg !29
661
+ %640 = getelementptr float, ptr addrspace(3) @global_smem, i64 %639, !dbg !29
662
+ %641 = load float, ptr addrspace(3) %640, align 4, !dbg !29
663
+ %642 = zext nneg i32 %571 to i64, !dbg !29
664
+ %643 = getelementptr float, ptr addrspace(3) @global_smem, i64 %642, !dbg !29
665
+ %644 = load float, ptr addrspace(3) %643, align 4, !dbg !29
666
+ %645 = zext nneg i32 %575 to i64, !dbg !29
667
+ %646 = getelementptr float, ptr addrspace(3) @global_smem, i64 %645, !dbg !29
668
+ %647 = load float, ptr addrspace(3) %646, align 4, !dbg !29
669
+ %648 = zext nneg i32 %579 to i64, !dbg !29
670
+ %649 = getelementptr float, ptr addrspace(3) @global_smem, i64 %648, !dbg !29
671
+ %650 = load float, ptr addrspace(3) %649, align 4, !dbg !29
672
+ %651 = zext nneg i32 %583 to i64, !dbg !29
673
+ %652 = getelementptr float, ptr addrspace(3) @global_smem, i64 %651, !dbg !29
674
+ %653 = load float, ptr addrspace(3) %652, align 4, !dbg !29
675
+ %654 = zext nneg i32 %587 to i64, !dbg !29
676
+ %655 = getelementptr float, ptr addrspace(3) @global_smem, i64 %654, !dbg !29
677
+ %656 = load float, ptr addrspace(3) %655, align 4, !dbg !29
678
+ %657 = zext nneg i32 %591 to i64, !dbg !29
679
+ %658 = getelementptr float, ptr addrspace(3) @global_smem, i64 %657, !dbg !29
680
+ %659 = load float, ptr addrspace(3) %658, align 4, !dbg !29
681
+ %660 = zext nneg i32 %595 to i64, !dbg !29
682
+ %661 = getelementptr float, ptr addrspace(3) @global_smem, i64 %660, !dbg !29
683
+ %662 = load float, ptr addrspace(3) %661, align 4, !dbg !29
684
+ %663 = zext nneg i32 %599 to i64, !dbg !29
685
+ %664 = getelementptr float, ptr addrspace(3) @global_smem, i64 %663, !dbg !29
686
+ %665 = load float, ptr addrspace(3) %664, align 4, !dbg !29
687
+ %666 = zext nneg i32 %603 to i64, !dbg !29
688
+ %667 = getelementptr float, ptr addrspace(3) @global_smem, i64 %666, !dbg !29
689
+ %668 = load float, ptr addrspace(3) %667, align 4, !dbg !29
690
+ %669 = zext nneg i32 %607 to i64, !dbg !29
691
+ %670 = getelementptr float, ptr addrspace(3) @global_smem, i64 %669, !dbg !29
692
+ %671 = load float, ptr addrspace(3) %670, align 4, !dbg !29
693
+ %672 = zext nneg i32 %611 to i64, !dbg !29
694
+ %673 = getelementptr float, ptr addrspace(3) @global_smem, i64 %672, !dbg !29
695
+ %674 = load float, ptr addrspace(3) %673, align 4, !dbg !29
696
+ %675 = zext nneg i32 %615 to i64, !dbg !29
697
+ %676 = getelementptr float, ptr addrspace(3) @global_smem, i64 %675, !dbg !29
698
+ %677 = load float, ptr addrspace(3) %676, align 4, !dbg !29
699
+ %678 = extractelement <16 x float> %136, i64 0, !dbg !37
700
+ %679 = extractelement <16 x float> %136, i64 1, !dbg !37
701
+ %680 = extractelement <16 x float> %136, i64 2, !dbg !37
702
+ %681 = extractelement <16 x float> %136, i64 3, !dbg !37
703
+ %682 = extractelement <16 x float> %136, i64 4, !dbg !37
704
+ %683 = extractelement <16 x float> %136, i64 5, !dbg !37
705
+ %684 = extractelement <16 x float> %136, i64 6, !dbg !37
706
+ %685 = extractelement <16 x float> %136, i64 7, !dbg !37
707
+ %686 = extractelement <16 x float> %136, i64 8, !dbg !37
708
+ %687 = extractelement <16 x float> %136, i64 9, !dbg !37
709
+ %688 = extractelement <16 x float> %136, i64 10, !dbg !37
710
+ %689 = extractelement <16 x float> %136, i64 11, !dbg !37
711
+ %690 = extractelement <16 x float> %136, i64 12, !dbg !37
712
+ %691 = extractelement <16 x float> %136, i64 13, !dbg !37
713
+ %692 = extractelement <16 x float> %136, i64 14, !dbg !37
714
+ %693 = extractelement <16 x float> %136, i64 15, !dbg !37
715
+ br label %694, !dbg !38
716
+
717
+ 694: ; preds = %215, %694
718
+ %695 = phi i32 [ 0, %215 ], [ %987, %694 ]
719
+ %696 = or i32 %695, %urem, !dbg !39
720
+ %697 = zext nneg i32 %696 to i64, !dbg !39
721
+ %698 = icmp ult i32 %696, 50257, !dbg !40
722
+ %699 = add nsw i64 %100, %697, !dbg !41
723
+ %700 = add nsw i64 %101, %697, !dbg !41
724
+ %701 = add nsw i64 %102, %697, !dbg !41
725
+ %702 = add nsw i64 %103, %697, !dbg !41
726
+ %703 = add nsw i64 %104, %697, !dbg !41
727
+ %704 = add nsw i64 %105, %697, !dbg !41
728
+ %705 = add nsw i64 %106, %697, !dbg !41
729
+ %706 = add nsw i64 %107, %697, !dbg !41
730
+ %707 = add nsw i64 %108, %697, !dbg !41
731
+ %708 = add nsw i64 %109, %697, !dbg !41
732
+ %709 = add nsw i64 %110, %697, !dbg !41
733
+ %710 = add nsw i64 %111, %697, !dbg !41
734
+ %711 = add nsw i64 %112, %697, !dbg !41
735
+ %712 = add nsw i64 %113, %697, !dbg !41
736
+ %713 = add nsw i64 %114, %697, !dbg !41
737
+ %714 = add nsw i64 %115, %697, !dbg !41
738
+ %715 = getelementptr i16, ptr addrspace(1) %4, i64 %699, !dbg !42
739
+ %716 = getelementptr i16, ptr addrspace(1) %4, i64 %700, !dbg !42
740
+ %717 = getelementptr i16, ptr addrspace(1) %4, i64 %701, !dbg !42
741
+ %718 = getelementptr i16, ptr addrspace(1) %4, i64 %702, !dbg !42
742
+ %719 = getelementptr i16, ptr addrspace(1) %4, i64 %703, !dbg !42
743
+ %720 = getelementptr i16, ptr addrspace(1) %4, i64 %704, !dbg !42
744
+ %721 = getelementptr i16, ptr addrspace(1) %4, i64 %705, !dbg !42
745
+ %722 = getelementptr i16, ptr addrspace(1) %4, i64 %706, !dbg !42
746
+ %723 = getelementptr i16, ptr addrspace(1) %4, i64 %707, !dbg !42
747
+ %724 = getelementptr i16, ptr addrspace(1) %4, i64 %708, !dbg !42
748
+ %725 = getelementptr i16, ptr addrspace(1) %4, i64 %709, !dbg !42
749
+ %726 = getelementptr i16, ptr addrspace(1) %4, i64 %710, !dbg !42
750
+ %727 = getelementptr i16, ptr addrspace(1) %4, i64 %711, !dbg !42
751
+ %728 = getelementptr i16, ptr addrspace(1) %4, i64 %712, !dbg !42
752
+ %729 = getelementptr i16, ptr addrspace(1) %4, i64 %713, !dbg !42
753
+ %730 = getelementptr i16, ptr addrspace(1) %4, i64 %714, !dbg !42
754
+ %731 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %715, i1 %698, i16 0, i1 %698) #3, !dbg !43
755
+ %732 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %716, i1 %698, i16 0, i1 %698) #3, !dbg !43
756
+ %733 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %717, i1 %698, i16 0, i1 %698) #3, !dbg !43
757
+ %734 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %718, i1 %698, i16 0, i1 %698) #3, !dbg !43
758
+ %735 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %719, i1 %698, i16 0, i1 %698) #3, !dbg !43
759
+ %736 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %720, i1 %698, i16 0, i1 %698) #3, !dbg !43
760
+ %737 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %721, i1 %698, i16 0, i1 %698) #3, !dbg !43
761
+ %738 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %722, i1 %698, i16 0, i1 %698) #3, !dbg !43
762
+ %739 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %723, i1 %698, i16 0, i1 %698) #3, !dbg !43
763
+ %740 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %724, i1 %698, i16 0, i1 %698) #3, !dbg !43
764
+ %741 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %725, i1 %698, i16 0, i1 %698) #3, !dbg !43
765
+ %742 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %726, i1 %698, i16 0, i1 %698) #3, !dbg !43
766
+ %743 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %727, i1 %698, i16 0, i1 %698) #3, !dbg !43
767
+ %744 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %728, i1 %698, i16 0, i1 %698) #3, !dbg !43
768
+ %745 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %729, i1 %698, i16 0, i1 %698) #3, !dbg !43
769
+ %746 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %730, i1 %698, i16 0, i1 %698) #3, !dbg !43
770
+ %747 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %731) #3, !dbg !44
771
+ %748 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %732) #3, !dbg !44
772
+ %749 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %733) #3, !dbg !44
773
+ %750 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %734) #3, !dbg !44
774
+ %751 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %735) #3, !dbg !44
775
+ %752 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %736) #3, !dbg !44
776
+ %753 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %737) #3, !dbg !44
777
+ %754 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %738) #3, !dbg !44
778
+ %755 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %739) #3, !dbg !44
779
+ %756 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %740) #3, !dbg !44
780
+ %757 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %741) #3, !dbg !44
781
+ %758 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %742) #3, !dbg !44
782
+ %759 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %743) #3, !dbg !44
783
+ %760 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %744) #3, !dbg !44
784
+ %761 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %745) #3, !dbg !44
785
+ %762 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %746) #3, !dbg !44
786
+ %763 = getelementptr float, ptr addrspace(1) %0, i64 %699, !dbg !45
787
+ %764 = getelementptr float, ptr addrspace(1) %0, i64 %700, !dbg !45
788
+ %765 = getelementptr float, ptr addrspace(1) %0, i64 %701, !dbg !45
789
+ %766 = getelementptr float, ptr addrspace(1) %0, i64 %702, !dbg !45
790
+ %767 = getelementptr float, ptr addrspace(1) %0, i64 %703, !dbg !45
791
+ %768 = getelementptr float, ptr addrspace(1) %0, i64 %704, !dbg !45
792
+ %769 = getelementptr float, ptr addrspace(1) %0, i64 %705, !dbg !45
793
+ %770 = getelementptr float, ptr addrspace(1) %0, i64 %706, !dbg !45
794
+ %771 = getelementptr float, ptr addrspace(1) %0, i64 %707, !dbg !45
795
+ %772 = getelementptr float, ptr addrspace(1) %0, i64 %708, !dbg !45
796
+ %773 = getelementptr float, ptr addrspace(1) %0, i64 %709, !dbg !45
797
+ %774 = getelementptr float, ptr addrspace(1) %0, i64 %710, !dbg !45
798
+ %775 = getelementptr float, ptr addrspace(1) %0, i64 %711, !dbg !45
799
+ %776 = getelementptr float, ptr addrspace(1) %0, i64 %712, !dbg !45
800
+ %777 = getelementptr float, ptr addrspace(1) %0, i64 %713, !dbg !45
801
+ %778 = getelementptr float, ptr addrspace(1) %0, i64 %714, !dbg !45
802
+ %779 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %763, i1 %698, i32 0, i1 %698) #3, !dbg !46
803
+ %780 = bitcast i32 %779 to float, !dbg !46
804
+ %781 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %764, i1 %698, i32 0, i1 %698) #3, !dbg !46
805
+ %782 = bitcast i32 %781 to float, !dbg !46
806
+ %783 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %765, i1 %698, i32 0, i1 %698) #3, !dbg !46
807
+ %784 = bitcast i32 %783 to float, !dbg !46
808
+ %785 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %766, i1 %698, i32 0, i1 %698) #3, !dbg !46
809
+ %786 = bitcast i32 %785 to float, !dbg !46
810
+ %787 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %767, i1 %698, i32 0, i1 %698) #3, !dbg !46
811
+ %788 = bitcast i32 %787 to float, !dbg !46
812
+ %789 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %768, i1 %698, i32 0, i1 %698) #3, !dbg !46
813
+ %790 = bitcast i32 %789 to float, !dbg !46
814
+ %791 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %769, i1 %698, i32 0, i1 %698) #3, !dbg !46
815
+ %792 = bitcast i32 %791 to float, !dbg !46
816
+ %793 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %770, i1 %698, i32 0, i1 %698) #3, !dbg !46
817
+ %794 = bitcast i32 %793 to float, !dbg !46
818
+ %795 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %771, i1 %698, i32 0, i1 %698) #3, !dbg !46
819
+ %796 = bitcast i32 %795 to float, !dbg !46
820
+ %797 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %772, i1 %698, i32 0, i1 %698) #3, !dbg !46
821
+ %798 = bitcast i32 %797 to float, !dbg !46
822
+ %799 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %773, i1 %698, i32 0, i1 %698) #3, !dbg !46
823
+ %800 = bitcast i32 %799 to float, !dbg !46
824
+ %801 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %774, i1 %698, i32 0, i1 %698) #3, !dbg !46
825
+ %802 = bitcast i32 %801 to float, !dbg !46
826
+ %803 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %775, i1 %698, i32 0, i1 %698) #3, !dbg !46
827
+ %804 = bitcast i32 %803 to float, !dbg !46
828
+ %805 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %776, i1 %698, i32 0, i1 %698) #3, !dbg !46
829
+ %806 = bitcast i32 %805 to float, !dbg !46
830
+ %807 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %777, i1 %698, i32 0, i1 %698) #3, !dbg !46
831
+ %808 = bitcast i32 %807 to float, !dbg !46
832
+ %809 = tail call i32 asm sideeffect "mov.u32 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b32 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u32 $0, $3;", "=r,l,b,r,b"(ptr addrspace(1) %778, i1 %698, i32 0, i1 %698) #3, !dbg !46
833
+ %810 = bitcast i32 %809 to float, !dbg !46
834
+ %811 = getelementptr i16, ptr addrspace(1) %5, i64 %699, !dbg !47
835
+ %812 = getelementptr i16, ptr addrspace(1) %5, i64 %700, !dbg !47
836
+ %813 = getelementptr i16, ptr addrspace(1) %5, i64 %701, !dbg !47
837
+ %814 = getelementptr i16, ptr addrspace(1) %5, i64 %702, !dbg !47
838
+ %815 = getelementptr i16, ptr addrspace(1) %5, i64 %703, !dbg !47
839
+ %816 = getelementptr i16, ptr addrspace(1) %5, i64 %704, !dbg !47
840
+ %817 = getelementptr i16, ptr addrspace(1) %5, i64 %705, !dbg !47
841
+ %818 = getelementptr i16, ptr addrspace(1) %5, i64 %706, !dbg !47
842
+ %819 = getelementptr i16, ptr addrspace(1) %5, i64 %707, !dbg !47
843
+ %820 = getelementptr i16, ptr addrspace(1) %5, i64 %708, !dbg !47
844
+ %821 = getelementptr i16, ptr addrspace(1) %5, i64 %709, !dbg !47
845
+ %822 = getelementptr i16, ptr addrspace(1) %5, i64 %710, !dbg !47
846
+ %823 = getelementptr i16, ptr addrspace(1) %5, i64 %711, !dbg !47
847
+ %824 = getelementptr i16, ptr addrspace(1) %5, i64 %712, !dbg !47
848
+ %825 = getelementptr i16, ptr addrspace(1) %5, i64 %713, !dbg !47
849
+ %826 = getelementptr i16, ptr addrspace(1) %5, i64 %714, !dbg !47
850
+ %827 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %811, i1 %698, i16 0, i1 %698) #3, !dbg !48
851
+ %828 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %812, i1 %698, i16 0, i1 %698) #3, !dbg !48
852
+ %829 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %813, i1 %698, i16 0, i1 %698) #3, !dbg !48
853
+ %830 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %814, i1 %698, i16 0, i1 %698) #3, !dbg !48
854
+ %831 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %815, i1 %698, i16 0, i1 %698) #3, !dbg !48
855
+ %832 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %816, i1 %698, i16 0, i1 %698) #3, !dbg !48
856
+ %833 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %817, i1 %698, i16 0, i1 %698) #3, !dbg !48
857
+ %834 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %818, i1 %698, i16 0, i1 %698) #3, !dbg !48
858
+ %835 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %819, i1 %698, i16 0, i1 %698) #3, !dbg !48
859
+ %836 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %820, i1 %698, i16 0, i1 %698) #3, !dbg !48
860
+ %837 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %821, i1 %698, i16 0, i1 %698) #3, !dbg !48
861
+ %838 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %822, i1 %698, i16 0, i1 %698) #3, !dbg !48
862
+ %839 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %823, i1 %698, i16 0, i1 %698) #3, !dbg !48
863
+ %840 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %824, i1 %698, i16 0, i1 %698) #3, !dbg !48
864
+ %841 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %825, i1 %698, i16 0, i1 %698) #3, !dbg !48
865
+ %842 = tail call i16 asm sideeffect "mov.u16 $0, 0x0;\0A\09@$2 ld.global.L1::evict_first.b16 { $0 }, [ $1 + 0 ];\0A\09@!$4 mov.u16 $0, $3;", "=c,l,b,c,b"(ptr addrspace(1) %826, i1 %698, i16 0, i1 %698) #3, !dbg !48
866
+ %843 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %827) #3, !dbg !49
867
+ %844 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %828) #3, !dbg !49
868
+ %845 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %829) #3, !dbg !49
869
+ %846 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %830) #3, !dbg !49
870
+ %847 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %831) #3, !dbg !49
871
+ %848 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %832) #3, !dbg !49
872
+ %849 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %833) #3, !dbg !49
873
+ %850 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %834) #3, !dbg !49
874
+ %851 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %835) #3, !dbg !49
875
+ %852 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %836) #3, !dbg !49
876
+ %853 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %837) #3, !dbg !49
877
+ %854 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %838) #3, !dbg !49
878
+ %855 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %839) #3, !dbg !49
879
+ %856 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %840) #3, !dbg !49
880
+ %857 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %841) #3, !dbg !49
881
+ %858 = tail call float asm "cvt.f32.bf16 $0, $1;", "=r,h"(i16 %842) #3, !dbg !49
882
+ %859 = fmul float %678, %780, !dbg !37
883
+ %860 = fmul float %679, %782, !dbg !37
884
+ %861 = fmul float %680, %784, !dbg !37
885
+ %862 = fmul float %681, %786, !dbg !37
886
+ %863 = fmul float %682, %788, !dbg !37
887
+ %864 = fmul float %683, %790, !dbg !37
888
+ %865 = fmul float %684, %792, !dbg !37
889
+ %866 = fmul float %685, %794, !dbg !37
890
+ %867 = fmul float %686, %796, !dbg !37
891
+ %868 = fmul float %687, %798, !dbg !37
892
+ %869 = fmul float %688, %800, !dbg !37
893
+ %870 = fmul float %689, %802, !dbg !37
894
+ %871 = fmul float %690, %804, !dbg !37
895
+ %872 = fmul float %691, %806, !dbg !37
896
+ %873 = fmul float %692, %808, !dbg !37
897
+ %874 = fmul float %693, %810, !dbg !37
898
+ %875 = fmul float %843, 0x3FF7154760000000, !dbg !50
899
+ %876 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %875) #3, !dbg !50
900
+ %877 = fmul float %844, 0x3FF7154760000000, !dbg !50
901
+ %878 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %877) #3, !dbg !50
902
+ %879 = fmul float %845, 0x3FF7154760000000, !dbg !50
903
+ %880 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %879) #3, !dbg !50
904
+ %881 = fmul float %846, 0x3FF7154760000000, !dbg !50
905
+ %882 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %881) #3, !dbg !50
906
+ %883 = fmul float %847, 0x3FF7154760000000, !dbg !50
907
+ %884 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %883) #3, !dbg !50
908
+ %885 = fmul float %848, 0x3FF7154760000000, !dbg !50
909
+ %886 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %885) #3, !dbg !50
910
+ %887 = fmul float %849, 0x3FF7154760000000, !dbg !50
911
+ %888 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %887) #3, !dbg !50
912
+ %889 = fmul float %850, 0x3FF7154760000000, !dbg !50
913
+ %890 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %889) #3, !dbg !50
914
+ %891 = fmul float %851, 0x3FF7154760000000, !dbg !50
915
+ %892 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %891) #3, !dbg !50
916
+ %893 = fmul float %852, 0x3FF7154760000000, !dbg !50
917
+ %894 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %893) #3, !dbg !50
918
+ %895 = fmul float %853, 0x3FF7154760000000, !dbg !50
919
+ %896 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %895) #3, !dbg !50
920
+ %897 = fmul float %854, 0x3FF7154760000000, !dbg !50
921
+ %898 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %897) #3, !dbg !50
922
+ %899 = fmul float %855, 0x3FF7154760000000, !dbg !50
923
+ %900 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %899) #3, !dbg !50
924
+ %901 = fmul float %856, 0x3FF7154760000000, !dbg !50
925
+ %902 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %901) #3, !dbg !50
926
+ %903 = fmul float %857, 0x3FF7154760000000, !dbg !50
927
+ %904 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %903) #3, !dbg !50
928
+ %905 = fmul float %858, 0x3FF7154760000000, !dbg !50
929
+ %906 = tail call float asm "ex2.approx.f32 $0, $1;", "=f,f"(float %905) #3, !dbg !50
930
+ %907 = fmul float %632, %876, !dbg !51
931
+ %908 = fmul float %635, %878, !dbg !51
932
+ %909 = fmul float %638, %880, !dbg !51
933
+ %910 = fmul float %641, %882, !dbg !51
934
+ %911 = fmul float %644, %884, !dbg !51
935
+ %912 = fmul float %647, %886, !dbg !51
936
+ %913 = fmul float %650, %888, !dbg !51
937
+ %914 = fmul float %653, %890, !dbg !51
938
+ %915 = fmul float %656, %892, !dbg !51
939
+ %916 = fmul float %659, %894, !dbg !51
940
+ %917 = fmul float %662, %896, !dbg !51
941
+ %918 = fmul float %665, %898, !dbg !51
942
+ %919 = fmul float %668, %900, !dbg !51
943
+ %920 = fmul float %671, %902, !dbg !51
944
+ %921 = fmul float %674, %904, !dbg !51
945
+ %922 = fmul float %677, %906, !dbg !51
946
+ %923 = fsub float %859, %907, !dbg !52
947
+ %924 = fsub float %860, %908, !dbg !52
948
+ %925 = fsub float %861, %909, !dbg !52
949
+ %926 = fsub float %862, %910, !dbg !52
950
+ %927 = fsub float %863, %911, !dbg !52
951
+ %928 = fsub float %864, %912, !dbg !52
952
+ %929 = fsub float %865, %913, !dbg !52
953
+ %930 = fsub float %866, %914, !dbg !52
954
+ %931 = fsub float %867, %915, !dbg !52
955
+ %932 = fsub float %868, %916, !dbg !52
956
+ %933 = fsub float %869, %917, !dbg !52
957
+ %934 = fsub float %870, %918, !dbg !52
958
+ %935 = fsub float %871, %919, !dbg !52
959
+ %936 = fsub float %872, %920, !dbg !52
960
+ %937 = fsub float %873, %921, !dbg !52
961
+ %938 = fsub float %874, %922, !dbg !52
962
+ %939 = fadd float %747, %923, !dbg !53
963
+ %940 = fadd float %748, %924, !dbg !53
964
+ %941 = fadd float %749, %925, !dbg !53
965
+ %942 = fadd float %750, %926, !dbg !53
966
+ %943 = fadd float %751, %927, !dbg !53
967
+ %944 = fadd float %752, %928, !dbg !53
968
+ %945 = fadd float %753, %929, !dbg !53
969
+ %946 = fadd float %754, %930, !dbg !53
970
+ %947 = fadd float %755, %931, !dbg !53
971
+ %948 = fadd float %756, %932, !dbg !53
972
+ %949 = fadd float %757, %933, !dbg !53
973
+ %950 = fadd float %758, %934, !dbg !53
974
+ %951 = fadd float %759, %935, !dbg !53
975
+ %952 = fadd float %760, %936, !dbg !53
976
+ %953 = fadd float %761, %937, !dbg !53
977
+ %954 = fadd float %762, %938, !dbg !53
978
+ %955 = getelementptr i16, ptr addrspace(1) %6, i64 %699, !dbg !54
979
+ %956 = getelementptr i16, ptr addrspace(1) %6, i64 %700, !dbg !54
980
+ %957 = getelementptr i16, ptr addrspace(1) %6, i64 %701, !dbg !54
981
+ %958 = getelementptr i16, ptr addrspace(1) %6, i64 %702, !dbg !54
982
+ %959 = getelementptr i16, ptr addrspace(1) %6, i64 %703, !dbg !54
983
+ %960 = getelementptr i16, ptr addrspace(1) %6, i64 %704, !dbg !54
984
+ %961 = getelementptr i16, ptr addrspace(1) %6, i64 %705, !dbg !54
985
+ %962 = getelementptr i16, ptr addrspace(1) %6, i64 %706, !dbg !54
986
+ %963 = getelementptr i16, ptr addrspace(1) %6, i64 %707, !dbg !54
987
+ %964 = getelementptr i16, ptr addrspace(1) %6, i64 %708, !dbg !54
988
+ %965 = getelementptr i16, ptr addrspace(1) %6, i64 %709, !dbg !54
989
+ %966 = getelementptr i16, ptr addrspace(1) %6, i64 %710, !dbg !54
990
+ %967 = getelementptr i16, ptr addrspace(1) %6, i64 %711, !dbg !54
991
+ %968 = getelementptr i16, ptr addrspace(1) %6, i64 %712, !dbg !54
992
+ %969 = getelementptr i16, ptr addrspace(1) %6, i64 %713, !dbg !54
993
+ %970 = getelementptr i16, ptr addrspace(1) %6, i64 %714, !dbg !54
994
+ %971 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %939) #3, !dbg !55
995
+ %972 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %940) #3, !dbg !55
996
+ %973 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %941) #3, !dbg !55
997
+ %974 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %942) #3, !dbg !55
998
+ %975 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %943) #3, !dbg !55
999
+ %976 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %944) #3, !dbg !55
1000
+ %977 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %945) #3, !dbg !55
1001
+ %978 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %946) #3, !dbg !55
1002
+ %979 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %947) #3, !dbg !55
1003
+ %980 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %948) #3, !dbg !55
1004
+ %981 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %949) #3, !dbg !55
1005
+ %982 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %950) #3, !dbg !55
1006
+ %983 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %951) #3, !dbg !55
1007
+ %984 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %952) #3, !dbg !55
1008
+ %985 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %953) #3, !dbg !55
1009
+ %986 = tail call i16 asm "cvt.rn.bf16.f32 $0, $1;", "=h,r"(float %954) #3, !dbg !55
1010
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %971, ptr addrspace(1) %955, i1 %698) #3, !dbg !55
1011
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %972, ptr addrspace(1) %956, i1 %698) #3, !dbg !55
1012
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %973, ptr addrspace(1) %957, i1 %698) #3, !dbg !55
1013
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %974, ptr addrspace(1) %958, i1 %698) #3, !dbg !55
1014
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %975, ptr addrspace(1) %959, i1 %698) #3, !dbg !55
1015
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %976, ptr addrspace(1) %960, i1 %698) #3, !dbg !55
1016
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %977, ptr addrspace(1) %961, i1 %698) #3, !dbg !55
1017
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %978, ptr addrspace(1) %962, i1 %698) #3, !dbg !55
1018
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %979, ptr addrspace(1) %963, i1 %698) #3, !dbg !55
1019
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %980, ptr addrspace(1) %964, i1 %698) #3, !dbg !55
1020
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %981, ptr addrspace(1) %965, i1 %698) #3, !dbg !55
1021
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %982, ptr addrspace(1) %966, i1 %698) #3, !dbg !55
1022
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %983, ptr addrspace(1) %967, i1 %698) #3, !dbg !55
1023
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %984, ptr addrspace(1) %968, i1 %698) #3, !dbg !55
1024
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %985, ptr addrspace(1) %969, i1 %698) #3, !dbg !55
1025
+ tail call void asm sideeffect "@$2 st.global.b16 [ $1 + 0 ], { $0 };", "c,l,b"(i16 %986, ptr addrspace(1) %970, i1 %698) #3, !dbg !55
1026
+ %987 = add nuw nsw i32 %695, 64, !dbg !38
1027
+ %988 = icmp ult i32 %695, 50193, !dbg !38
1028
+ br i1 %988, label %694, label %989, !dbg !38
1029
+
1030
+ 989: ; preds = %694
1031
+ ret void, !dbg !56
1032
+ }
1033
+
1034
+ ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
1035
+ declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
1036
+
1037
+ ; Function Attrs: convergent nocallback nounwind memory(inaccessiblemem: readwrite)
1038
+ declare i32 @llvm.nvvm.shfl.sync.bfly.i32(i32, i32, i32, i32) #1
1039
+
1040
+ ; Function Attrs: convergent nocallback nounwind
1041
+ declare void @llvm.nvvm.barrier0() #2
1042
+
1043
+ attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
1044
+ attributes #1 = { convergent nocallback nounwind memory(inaccessiblemem: readwrite) }
1045
+ attributes #2 = { convergent nocallback nounwind }
1046
+ attributes #3 = { nounwind }
1047
+
1048
+ !llvm.module.flags = !{!0}
1049
+ !llvm.dbg.cu = !{!1}
1050
+ !nvvm.annotations = !{!3, !4, !4, !3}
1051
+
1052
+ !0 = !{i32 2, !"Debug Info Version", i32 3}
1053
+ !1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
1054
+ !2 = !DIFile(filename: "ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py", directory: "/tmp/torchinductor_root/kz")
1055
+ !3 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"kernel", i32 1}
1056
+ !4 = !{ptr @triton__0d1d2d3d4d5d6d7de8, !"maxntidx", i32 256}
1057
+ !5 = distinct !DISubprogram(name: "triton__0d1d2d3d4d5d6d7de8", linkageName: "triton__0d1d2d3d4d5d6d7de8", scope: !2, file: !2, line: 18, type: !6, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1)
1058
+ !6 = !DISubroutineType(cc: DW_CC_normal, types: !7)
1059
+ !7 = !{}
1060
+ !8 = !DILocation(line: 22, column: 44, scope: !5)
1061
+ !9 = !DILocation(line: 24, column: 33, scope: !5)
1062
+ !10 = !DILocation(line: 21, column: 28, scope: !5)
1063
+ !11 = !DILocation(line: 21, column: 34, scope: !5)
1064
+ !12 = !DILocation(line: 21, column: 46, scope: !5)
1065
+ !13 = !DILocation(line: 22, column: 23, scope: !5)
1066
+ !14 = !DILocation(line: 26, column: 30, scope: !5)
1067
+ !15 = !DILocation(line: 26, column: 35, scope: !5)
1068
+ !16 = !DILocation(line: 27, column: 19, scope: !5)
1069
+ !17 = !DILocation(line: 29, column: 19, scope: !5)
1070
+ !18 = !DILocation(line: 36, column: 46, scope: !5)
1071
+ !19 = !DILocation(line: 38, column: 23, scope: !5)
1072
+ !20 = !DILocation(line: 39, column: 22, scope: !5)
1073
+ !21 = !DILocation(line: 41, column: 37, scope: !5)
1074
+ !22 = !DILocation(line: 32, column: 36, scope: !5)
1075
+ !23 = !DILocation(line: 33, column: 27, scope: !5)
1076
+ !24 = !DILocation(line: 34, column: 25, scope: !5)
1077
+ !25 = !DILocation(line: 36, column: 34, scope: !5)
1078
+ !26 = !DILocation(line: 36, column: 52, scope: !5)
1079
+ !27 = !DILocation(line: 42, column: 23, scope: !5)
1080
+ !28 = !DILocation(line: 45, column: 40, scope: !5)
1081
+ !29 = !DILocation(line: 243, column: 36, scope: !30, inlinedAt: !32)
1082
+ !30 = distinct !DILexicalBlockFile(scope: !5, file: !31, discriminator: 0)
1083
+ !31 = !DIFile(filename: "standard.py", directory: "/usr/local/lib/python3.10/dist-packages/triton/language")
1084
+ !32 = !DILocation(line: 46, column: 27, scope: !30)
1085
+ !33 = !DILocation(line: 233, column: 15, scope: !34, inlinedAt: !35)
1086
+ !34 = distinct !DILexicalBlockFile(scope: !30, file: !31, discriminator: 0)
1087
+ !35 = !DILocation(line: 243, column: 36, scope: !34, inlinedAt: !36)
1088
+ !36 = !DILocation(line: 46, column: 27, scope: !34)
1089
+ !37 = !DILocation(line: 63, column: 24, scope: !5)
1090
+ !38 = !DILocation(line: 51, column: 36, scope: !5)
1091
+ !39 = !DILocation(line: 52, column: 27, scope: !5)
1092
+ !40 = !DILocation(line: 53, column: 25, scope: !5)
1093
+ !41 = !DILocation(line: 55, column: 41, scope: !5)
1094
+ !42 = !DILocation(line: 55, column: 35, scope: !5)
1095
+ !43 = !DILocation(line: 55, column: 53, scope: !5)
1096
+ !44 = !DILocation(line: 55, column: 105, scope: !5)
1097
+ !45 = !DILocation(line: 56, column: 35, scope: !5)
1098
+ !46 = !DILocation(line: 56, column: 53, scope: !5)
1099
+ !47 = !DILocation(line: 57, column: 35, scope: !5)
1100
+ !48 = !DILocation(line: 57, column: 53, scope: !5)
1101
+ !49 = !DILocation(line: 57, column: 105, scope: !5)
1102
+ !50 = !DILocation(line: 65, column: 23, scope: !5)
1103
+ !51 = !DILocation(line: 66, column: 24, scope: !5)
1104
+ !52 = !DILocation(line: 67, column: 24, scope: !5)
1105
+ !53 = !DILocation(line: 69, column: 24, scope: !5)
1106
+ !54 = !DILocation(line: 70, column: 29, scope: !5)
1107
+ !55 = !DILocation(line: 70, column: 54, scope: !5)
1108
+ !56 = !DILocation(line: 51, column: 4, scope: !5)
.triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.ptx ADDED
@@ -0,0 +1,1927 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2d3d4d5d6d7de8
10
+ .extern .shared .align 1 .b8 global_smem[];
11
+
12
+ .visible .entry triton__0d1d2d3d4d5d6d7de8(
13
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_0,
14
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_1,
15
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_2,
16
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_3,
17
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_4,
18
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_5,
19
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_6,
20
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_7,
21
+ .param .u64 triton__0d1d2d3d4d5d6d7de8_param_8
22
+ )
23
+ .maxntid 256, 1, 1
24
+ {
25
+ .reg .pred %p<201>;
26
+ .reg .b16 %rs<129>;
27
+ .reg .b32 %r<399>;
28
+ .reg .f32 %f<469>;
29
+ .reg .b64 %rd<150>;
30
+ .loc 1 18 0
31
+ $L__func_begin0:
32
+ .loc 1 18 0
33
+
34
+ ld.param.u64 %rd17, [triton__0d1d2d3d4d5d6d7de8_param_6];
35
+ ld.param.u64 %rd16, [triton__0d1d2d3d4d5d6d7de8_param_5];
36
+ ld.param.u64 %rd15, [triton__0d1d2d3d4d5d6d7de8_param_4];
37
+ ld.param.u64 %rd52, [triton__0d1d2d3d4d5d6d7de8_param_0];
38
+ $L__tmp0:
39
+ .loc 1 22 44
40
+ mov.u32 %r1, %tid.x;
41
+ ld.param.u64 %rd53, [triton__0d1d2d3d4d5d6d7de8_param_1];
42
+ and.b32 %r2, %r1, 31;
43
+ ld.param.u64 %rd50, [triton__0d1d2d3d4d5d6d7de8_param_2];
44
+ ld.param.u64 %rd51, [triton__0d1d2d3d4d5d6d7de8_param_3];
45
+ bfe.u32 %r30, %r1, 6, 2;
46
+ or.b32 %r3, %r30, 4;
47
+ or.b32 %r4, %r30, 8;
48
+ or.b32 %r5, %r30, 12;
49
+ or.b32 %r6, %r30, 16;
50
+ or.b32 %r7, %r30, 20;
51
+ or.b32 %r8, %r30, 24;
52
+ or.b32 %r9, %r30, 28;
53
+ or.b32 %r10, %r30, 32;
54
+ or.b32 %r11, %r30, 36;
55
+ or.b32 %r12, %r30, 40;
56
+ or.b32 %r13, %r30, 44;
57
+ or.b32 %r14, %r30, 48;
58
+ or.b32 %r15, %r30, 52;
59
+ or.b32 %r16, %r30, 56;
60
+ or.b32 %r17, %r30, 60;
61
+ .loc 1 24 33
62
+ and.b32 %r18, %r1, 63;
63
+ .loc 1 21 28
64
+ mov.u32 %r23, %ctaid.x;
65
+ .loc 1 21 34
66
+ cvt.s64.s32 %rd1, %r23;
67
+ .loc 1 21 46
68
+ mul.wide.s32 %rd54, %r23, 64;
69
+ cvt.u64.u32 %rd2, %r30;
70
+ .loc 1 22 23
71
+ or.b64 %rd55, %rd54, %rd2;
72
+ .loc 1 26 30
73
+ shl.b64 %rd56, %rd55, 3;
74
+ add.s64 %rd19, %rd53, %rd56;
75
+ add.s64 %rd21, %rd19, 32;
76
+ add.s64 %rd23, %rd19, 64;
77
+ add.s64 %rd25, %rd19, 96;
78
+ add.s64 %rd27, %rd19, 128;
79
+ add.s64 %rd29, %rd19, 160;
80
+ add.s64 %rd31, %rd19, 192;
81
+ add.s64 %rd33, %rd19, 224;
82
+ add.s64 %rd35, %rd19, 256;
83
+ add.s64 %rd37, %rd19, 288;
84
+ add.s64 %rd39, %rd19, 320;
85
+ add.s64 %rd41, %rd19, 352;
86
+ add.s64 %rd43, %rd19, 384;
87
+ add.s64 %rd45, %rd19, 416;
88
+ add.s64 %rd47, %rd19, 448;
89
+ add.s64 %rd49, %rd19, 480;
90
+ mov.pred %p1, -1;
91
+ .loc 1 26 35
92
+ mov.u64 %rd18, 0x0;
93
+ @%p1 ld.global.L1::evict_last.b64 { %rd18 }, [ %rd19 + 0 ];
94
+ mov.u64 %rd20, 0x0;
95
+ @%p1 ld.global.L1::evict_last.b64 { %rd20 }, [ %rd21 + 0 ];
96
+ mov.u64 %rd22, 0x0;
97
+ @%p1 ld.global.L1::evict_last.b64 { %rd22 }, [ %rd23 + 0 ];
98
+ mov.u64 %rd24, 0x0;
99
+ @%p1 ld.global.L1::evict_last.b64 { %rd24 }, [ %rd25 + 0 ];
100
+ mov.u64 %rd26, 0x0;
101
+ @%p1 ld.global.L1::evict_last.b64 { %rd26 }, [ %rd27 + 0 ];
102
+ mov.u64 %rd28, 0x0;
103
+ @%p1 ld.global.L1::evict_last.b64 { %rd28 }, [ %rd29 + 0 ];
104
+ mov.u64 %rd30, 0x0;
105
+ @%p1 ld.global.L1::evict_last.b64 { %rd30 }, [ %rd31 + 0 ];
106
+ mov.u64 %rd32, 0x0;
107
+ @%p1 ld.global.L1::evict_last.b64 { %rd32 }, [ %rd33 + 0 ];
108
+ mov.u64 %rd34, 0x0;
109
+ @%p1 ld.global.L1::evict_last.b64 { %rd34 }, [ %rd35 + 0 ];
110
+ mov.u64 %rd36, 0x0;
111
+ @%p1 ld.global.L1::evict_last.b64 { %rd36 }, [ %rd37 + 0 ];
112
+ mov.u64 %rd38, 0x0;
113
+ @%p1 ld.global.L1::evict_last.b64 { %rd38 }, [ %rd39 + 0 ];
114
+ mov.u64 %rd40, 0x0;
115
+ @%p1 ld.global.L1::evict_last.b64 { %rd40 }, [ %rd41 + 0 ];
116
+ mov.u64 %rd42, 0x0;
117
+ @%p1 ld.global.L1::evict_last.b64 { %rd42 }, [ %rd43 + 0 ];
118
+ mov.u64 %rd44, 0x0;
119
+ @%p1 ld.global.L1::evict_last.b64 { %rd44 }, [ %rd45 + 0 ];
120
+ mov.u64 %rd46, 0x0;
121
+ @%p1 ld.global.L1::evict_last.b64 { %rd46 }, [ %rd47 + 0 ];
122
+ mov.u64 %rd48, 0x0;
123
+ @%p1 ld.global.L1::evict_last.b64 { %rd48 }, [ %rd49 + 0 ];
124
+ .loc 1 27 19
125
+ mov.u32 %r27, 0x0;
126
+ @%p1 ld.global.b32 { %r27 }, [ %rd50 + 0 ];
127
+ .loc 1 29 19
128
+ mov.u32 %r28, 0x0;
129
+ @%p1 ld.global.b32 { %r28 }, [ %rd51 + 0 ];
130
+ .loc 1 38 23
131
+ setp.eq.s64 %p19, %rd18, -1;
132
+ setp.eq.s64 %p20, %rd20, -1;
133
+ setp.eq.s64 %p21, %rd22, -1;
134
+ setp.eq.s64 %p22, %rd24, -1;
135
+ setp.eq.s64 %p23, %rd26, -1;
136
+ setp.eq.s64 %p24, %rd28, -1;
137
+ setp.eq.s64 %p25, %rd30, -1;
138
+ setp.eq.s64 %p26, %rd32, -1;
139
+ setp.eq.s64 %p27, %rd34, -1;
140
+ setp.eq.s64 %p28, %rd36, -1;
141
+ setp.eq.s64 %p29, %rd38, -1;
142
+ setp.eq.s64 %p30, %rd40, -1;
143
+ setp.eq.s64 %p31, %rd42, -1;
144
+ setp.eq.s64 %p32, %rd44, -1;
145
+ setp.eq.s64 %p33, %rd46, -1;
146
+ setp.eq.s64 %p34, %rd48, -1;
147
+ .loc 1 39 22
148
+ div.full.f32 %r26, %r27, %r28;
149
+ mov.b32 %f97, %r26;
150
+ .loc 1 41 37
151
+ selp.f32 %f16, 0f00000000, %f97, %p34;
152
+ selp.f32 %f15, 0f00000000, %f97, %p33;
153
+ selp.f32 %f14, 0f00000000, %f97, %p32;
154
+ selp.f32 %f13, 0f00000000, %f97, %p31;
155
+ selp.f32 %f12, 0f00000000, %f97, %p30;
156
+ selp.f32 %f11, 0f00000000, %f97, %p29;
157
+ selp.f32 %f10, 0f00000000, %f97, %p28;
158
+ selp.f32 %f9, 0f00000000, %f97, %p27;
159
+ selp.f32 %f8, 0f00000000, %f97, %p26;
160
+ selp.f32 %f7, 0f00000000, %f97, %p25;
161
+ selp.f32 %f6, 0f00000000, %f97, %p24;
162
+ selp.f32 %f5, 0f00000000, %f97, %p23;
163
+ selp.f32 %f4, 0f00000000, %f97, %p22;
164
+ selp.f32 %f3, 0f00000000, %f97, %p21;
165
+ selp.f32 %f2, 0f00000000, %f97, %p20;
166
+ selp.f32 %f1, 0f00000000, %f97, %p19;
167
+ .loc 1 32 36
168
+ mul.wide.s32 %rd57, %r23, 12865792;
169
+ mul.wide.u32 %rd58, %r30, 201028;
170
+ add.s64 %rd59, %rd57, %rd58;
171
+ cvt.u64.u32 %rd60, %r1;
172
+ and.b64 %rd3, %rd60, 63;
173
+ mul.wide.u32 %rd61, %r18, 4;
174
+ add.s64 %rd62, %rd59, %rd61;
175
+ add.s64 %rd63, %rd62, %rd52;
176
+ add.s64 %rd149, %rd63, 12061680;
177
+ mov.f32 %f453, 0f00000000;
178
+ mov.b32 %r397, -64;
179
+ mov.u64 %rd147, %rd149;
180
+ mov.f32 %f454, %f453;
181
+ mov.f32 %f455, %f453;
182
+ mov.f32 %f456, %f453;
183
+ mov.f32 %f457, %f453;
184
+ mov.f32 %f458, %f453;
185
+ mov.f32 %f459, %f453;
186
+ mov.f32 %f460, %f453;
187
+ mov.f32 %f461, %f453;
188
+ mov.f32 %f462, %f453;
189
+ mov.f32 %f463, %f453;
190
+ mov.f32 %f464, %f453;
191
+ mov.f32 %f465, %f453;
192
+ mov.f32 %f466, %f453;
193
+ mov.f32 %f467, %f453;
194
+ mov.f32 %f468, %f453;
195
+ $L__BB0_1:
196
+ add.s32 %r397, %r397, 64;
197
+ .loc 1 33 27
198
+ add.s32 %r63, %r397, %r18;
199
+ .loc 1 34 25
200
+ setp.lt.u32 %p35, %r63, 50257;
201
+ .loc 1 36 34
202
+ add.s64 %rd64, %rd147, -12061680;
203
+ add.s64 %rd65, %rd147, -11257568;
204
+ add.s64 %rd66, %rd147, -10453456;
205
+ add.s64 %rd67, %rd147, -9649344;
206
+ add.s64 %rd68, %rd147, -8845232;
207
+ add.s64 %rd69, %rd147, -8041120;
208
+ add.s64 %rd70, %rd147, -7237008;
209
+ add.s64 %rd71, %rd147, -6432896;
210
+ add.s64 %rd72, %rd147, -5628784;
211
+ add.s64 %rd73, %rd147, -4824672;
212
+ add.s64 %rd74, %rd147, -4020560;
213
+ add.s64 %rd75, %rd147, -3216448;
214
+ add.s64 %rd76, %rd147, -2412336;
215
+ add.s64 %rd77, %rd147, -1608224;
216
+ add.s64 %rd78, %rd147, -804112;
217
+ mov.b32 %r333, 0;
218
+ .loc 1 36 52
219
+ mov.u32 %r31, 0x0;
220
+ @%p35 ld.global.L1::evict_last.b32 { %r31 }, [ %rd64 + 0 ];
221
+ @!%p35 mov.u32 %r31, %r333;
222
+ mov.u32 %r33, 0x0;
223
+ @%p35 ld.global.L1::evict_last.b32 { %r33 }, [ %rd65 + 0 ];
224
+ @!%p35 mov.u32 %r33, %r333;
225
+ mov.u32 %r35, 0x0;
226
+ @%p35 ld.global.L1::evict_last.b32 { %r35 }, [ %rd66 + 0 ];
227
+ @!%p35 mov.u32 %r35, %r333;
228
+ mov.u32 %r37, 0x0;
229
+ @%p35 ld.global.L1::evict_last.b32 { %r37 }, [ %rd67 + 0 ];
230
+ @!%p35 mov.u32 %r37, %r333;
231
+ mov.u32 %r39, 0x0;
232
+ @%p35 ld.global.L1::evict_last.b32 { %r39 }, [ %rd68 + 0 ];
233
+ @!%p35 mov.u32 %r39, %r333;
234
+ mov.u32 %r41, 0x0;
235
+ @%p35 ld.global.L1::evict_last.b32 { %r41 }, [ %rd69 + 0 ];
236
+ @!%p35 mov.u32 %r41, %r333;
237
+ mov.u32 %r43, 0x0;
238
+ @%p35 ld.global.L1::evict_last.b32 { %r43 }, [ %rd70 + 0 ];
239
+ @!%p35 mov.u32 %r43, %r333;
240
+ mov.u32 %r45, 0x0;
241
+ @%p35 ld.global.L1::evict_last.b32 { %r45 }, [ %rd71 + 0 ];
242
+ @!%p35 mov.u32 %r45, %r333;
243
+ mov.u32 %r47, 0x0;
244
+ @%p35 ld.global.L1::evict_last.b32 { %r47 }, [ %rd72 + 0 ];
245
+ @!%p35 mov.u32 %r47, %r333;
246
+ mov.u32 %r49, 0x0;
247
+ @%p35 ld.global.L1::evict_last.b32 { %r49 }, [ %rd73 + 0 ];
248
+ @!%p35 mov.u32 %r49, %r333;
249
+ mov.u32 %r51, 0x0;
250
+ @%p35 ld.global.L1::evict_last.b32 { %r51 }, [ %rd74 + 0 ];
251
+ @!%p35 mov.u32 %r51, %r333;
252
+ mov.u32 %r53, 0x0;
253
+ @%p35 ld.global.L1::evict_last.b32 { %r53 }, [ %rd75 + 0 ];
254
+ @!%p35 mov.u32 %r53, %r333;
255
+ mov.u32 %r55, 0x0;
256
+ @%p35 ld.global.L1::evict_last.b32 { %r55 }, [ %rd76 + 0 ];
257
+ @!%p35 mov.u32 %r55, %r333;
258
+ mov.u32 %r57, 0x0;
259
+ @%p35 ld.global.L1::evict_last.b32 { %r57 }, [ %rd77 + 0 ];
260
+ @!%p35 mov.u32 %r57, %r333;
261
+ mov.u32 %r59, 0x0;
262
+ @%p35 ld.global.L1::evict_last.b32 { %r59 }, [ %rd78 + 0 ];
263
+ @!%p35 mov.u32 %r59, %r333;
264
+ mov.u32 %r61, 0x0;
265
+ @%p35 ld.global.L1::evict_last.b32 { %r61 }, [ %rd147 + 0 ];
266
+ @!%p35 mov.u32 %r61, %r333;
267
+ mov.b32 %f98, %r61;
268
+ mov.b32 %f99, %r59;
269
+ mov.b32 %f100, %r57;
270
+ mov.b32 %f101, %r55;
271
+ mov.b32 %f102, %r53;
272
+ mov.b32 %f103, %r51;
273
+ mov.b32 %f104, %r49;
274
+ mov.b32 %f105, %r47;
275
+ mov.b32 %f106, %r45;
276
+ mov.b32 %f107, %r43;
277
+ mov.b32 %f108, %r41;
278
+ mov.b32 %f109, %r39;
279
+ mov.b32 %f110, %r37;
280
+ mov.b32 %f111, %r35;
281
+ mov.b32 %f112, %r33;
282
+ mov.b32 %f113, %r31;
283
+ .loc 1 42 23
284
+ mul.f32 %f114, %f1, %f113;
285
+ mul.f32 %f115, %f2, %f112;
286
+ mul.f32 %f116, %f3, %f111;
287
+ mul.f32 %f117, %f4, %f110;
288
+ mul.f32 %f118, %f5, %f109;
289
+ mul.f32 %f119, %f6, %f108;
290
+ mul.f32 %f120, %f7, %f107;
291
+ mul.f32 %f121, %f8, %f106;
292
+ mul.f32 %f122, %f9, %f105;
293
+ mul.f32 %f123, %f10, %f104;
294
+ mul.f32 %f124, %f11, %f103;
295
+ mul.f32 %f125, %f12, %f102;
296
+ mul.f32 %f126, %f13, %f101;
297
+ mul.f32 %f127, %f14, %f100;
298
+ mul.f32 %f128, %f15, %f99;
299
+ mul.f32 %f129, %f16, %f98;
300
+ .loc 1 45 40
301
+ selp.f32 %f130, %f129, 0f80000000, %p35;
302
+ selp.f32 %f131, %f128, 0f80000000, %p35;
303
+ selp.f32 %f132, %f127, 0f80000000, %p35;
304
+ selp.f32 %f133, %f126, 0f80000000, %p35;
305
+ selp.f32 %f134, %f125, 0f80000000, %p35;
306
+ selp.f32 %f135, %f124, 0f80000000, %p35;
307
+ selp.f32 %f136, %f123, 0f80000000, %p35;
308
+ selp.f32 %f137, %f122, 0f80000000, %p35;
309
+ selp.f32 %f138, %f121, 0f80000000, %p35;
310
+ selp.f32 %f139, %f120, 0f80000000, %p35;
311
+ selp.f32 %f140, %f119, 0f80000000, %p35;
312
+ selp.f32 %f141, %f118, 0f80000000, %p35;
313
+ selp.f32 %f142, %f117, 0f80000000, %p35;
314
+ selp.f32 %f143, %f116, 0f80000000, %p35;
315
+ selp.f32 %f144, %f115, 0f80000000, %p35;
316
+ selp.f32 %f145, %f114, 0f80000000, %p35;
317
+ add.f32 %f453, %f453, %f145;
318
+ add.f32 %f454, %f454, %f144;
319
+ add.f32 %f455, %f455, %f143;
320
+ add.f32 %f456, %f456, %f142;
321
+ add.f32 %f457, %f457, %f141;
322
+ add.f32 %f458, %f458, %f140;
323
+ add.f32 %f459, %f459, %f139;
324
+ add.f32 %f460, %f460, %f138;
325
+ add.f32 %f461, %f461, %f137;
326
+ add.f32 %f462, %f462, %f136;
327
+ add.f32 %f463, %f463, %f135;
328
+ add.f32 %f464, %f464, %f134;
329
+ add.f32 %f465, %f465, %f133;
330
+ add.f32 %f466, %f466, %f132;
331
+ add.f32 %f467, %f467, %f131;
332
+ add.f32 %f468, %f468, %f130;
333
+ .loc 1 32 36
334
+ add.s64 %rd147, %rd147, 256;
335
+ setp.lt.u32 %p67, %r397, 50193;
336
+ @%p67 bra $L__BB0_1;
337
+ .loc 1 0 36
338
+ cvt.u32.u64 %r101, %rd2;
339
+ $L__tmp1:
340
+ .loc 2 243 36
341
+ mov.b32 %r102, %f453;
342
+ shfl.sync.bfly.b32 %r103, %r102, 16, 31, -1;
343
+ mov.b32 %f146, %r103;
344
+ $L__tmp2:
345
+ .loc 2 233 15
346
+ add.f32 %f147, %f453, %f146;
347
+ $L__tmp3:
348
+ .loc 2 243 36
349
+ mov.b32 %r104, %f147;
350
+ shfl.sync.bfly.b32 %r105, %r104, 8, 31, -1;
351
+ mov.b32 %f148, %r105;
352
+ $L__tmp4:
353
+ .loc 2 233 15
354
+ add.f32 %f149, %f147, %f148;
355
+ $L__tmp5:
356
+ .loc 2 243 36
357
+ mov.b32 %r106, %f149;
358
+ shfl.sync.bfly.b32 %r107, %r106, 4, 31, -1;
359
+ mov.b32 %f150, %r107;
360
+ $L__tmp6:
361
+ .loc 2 233 15
362
+ add.f32 %f151, %f149, %f150;
363
+ $L__tmp7:
364
+ .loc 2 243 36
365
+ mov.b32 %r108, %f151;
366
+ shfl.sync.bfly.b32 %r109, %r108, 2, 31, -1;
367
+ mov.b32 %f152, %r109;
368
+ $L__tmp8:
369
+ .loc 2 233 15
370
+ add.f32 %f153, %f151, %f152;
371
+ $L__tmp9:
372
+ .loc 2 243 36
373
+ mov.b32 %r110, %f153;
374
+ shfl.sync.bfly.b32 %r111, %r110, 1, 31, -1;
375
+ mov.b32 %f154, %r111;
376
+ $L__tmp10:
377
+ .loc 2 233 15
378
+ add.f32 %f155, %f153, %f154;
379
+ $L__tmp11:
380
+ .loc 2 243 36
381
+ mov.b32 %r112, %f454;
382
+ shfl.sync.bfly.b32 %r113, %r112, 16, 31, -1;
383
+ mov.b32 %f156, %r113;
384
+ $L__tmp12:
385
+ .loc 2 233 15
386
+ add.f32 %f157, %f454, %f156;
387
+ $L__tmp13:
388
+ .loc 2 243 36
389
+ mov.b32 %r114, %f157;
390
+ shfl.sync.bfly.b32 %r115, %r114, 8, 31, -1;
391
+ mov.b32 %f158, %r115;
392
+ $L__tmp14:
393
+ .loc 2 233 15
394
+ add.f32 %f159, %f157, %f158;
395
+ $L__tmp15:
396
+ .loc 2 243 36
397
+ mov.b32 %r116, %f159;
398
+ shfl.sync.bfly.b32 %r117, %r116, 4, 31, -1;
399
+ mov.b32 %f160, %r117;
400
+ $L__tmp16:
401
+ .loc 2 233 15
402
+ add.f32 %f161, %f159, %f160;
403
+ $L__tmp17:
404
+ .loc 2 243 36
405
+ mov.b32 %r118, %f161;
406
+ shfl.sync.bfly.b32 %r119, %r118, 2, 31, -1;
407
+ mov.b32 %f162, %r119;
408
+ $L__tmp18:
409
+ .loc 2 233 15
410
+ add.f32 %f163, %f161, %f162;
411
+ $L__tmp19:
412
+ .loc 2 243 36
413
+ mov.b32 %r120, %f163;
414
+ shfl.sync.bfly.b32 %r121, %r120, 1, 31, -1;
415
+ mov.b32 %f164, %r121;
416
+ $L__tmp20:
417
+ .loc 2 233 15
418
+ add.f32 %f165, %f163, %f164;
419
+ $L__tmp21:
420
+ .loc 2 243 36
421
+ mov.b32 %r122, %f455;
422
+ shfl.sync.bfly.b32 %r123, %r122, 16, 31, -1;
423
+ mov.b32 %f166, %r123;
424
+ $L__tmp22:
425
+ .loc 2 233 15
426
+ add.f32 %f167, %f455, %f166;
427
+ $L__tmp23:
428
+ .loc 2 243 36
429
+ mov.b32 %r124, %f167;
430
+ shfl.sync.bfly.b32 %r125, %r124, 8, 31, -1;
431
+ mov.b32 %f168, %r125;
432
+ $L__tmp24:
433
+ .loc 2 233 15
434
+ add.f32 %f169, %f167, %f168;
435
+ $L__tmp25:
436
+ .loc 2 243 36
437
+ mov.b32 %r126, %f169;
438
+ shfl.sync.bfly.b32 %r127, %r126, 4, 31, -1;
439
+ mov.b32 %f170, %r127;
440
+ $L__tmp26:
441
+ .loc 2 233 15
442
+ add.f32 %f171, %f169, %f170;
443
+ $L__tmp27:
444
+ .loc 2 243 36
445
+ mov.b32 %r128, %f171;
446
+ shfl.sync.bfly.b32 %r129, %r128, 2, 31, -1;
447
+ mov.b32 %f172, %r129;
448
+ $L__tmp28:
449
+ .loc 2 233 15
450
+ add.f32 %f173, %f171, %f172;
451
+ $L__tmp29:
452
+ .loc 2 243 36
453
+ mov.b32 %r130, %f173;
454
+ shfl.sync.bfly.b32 %r131, %r130, 1, 31, -1;
455
+ mov.b32 %f174, %r131;
456
+ $L__tmp30:
457
+ .loc 2 233 15
458
+ add.f32 %f175, %f173, %f174;
459
+ $L__tmp31:
460
+ .loc 2 243 36
461
+ mov.b32 %r132, %f456;
462
+ shfl.sync.bfly.b32 %r133, %r132, 16, 31, -1;
463
+ mov.b32 %f176, %r133;
464
+ $L__tmp32:
465
+ .loc 2 233 15
466
+ add.f32 %f177, %f456, %f176;
467
+ $L__tmp33:
468
+ .loc 2 243 36
469
+ mov.b32 %r134, %f177;
470
+ shfl.sync.bfly.b32 %r135, %r134, 8, 31, -1;
471
+ mov.b32 %f178, %r135;
472
+ $L__tmp34:
473
+ .loc 2 233 15
474
+ add.f32 %f179, %f177, %f178;
475
+ $L__tmp35:
476
+ .loc 2 243 36
477
+ mov.b32 %r136, %f179;
478
+ shfl.sync.bfly.b32 %r137, %r136, 4, 31, -1;
479
+ mov.b32 %f180, %r137;
480
+ $L__tmp36:
481
+ .loc 2 233 15
482
+ add.f32 %f181, %f179, %f180;
483
+ $L__tmp37:
484
+ .loc 2 243 36
485
+ mov.b32 %r138, %f181;
486
+ shfl.sync.bfly.b32 %r139, %r138, 2, 31, -1;
487
+ mov.b32 %f182, %r139;
488
+ $L__tmp38:
489
+ .loc 2 233 15
490
+ add.f32 %f183, %f181, %f182;
491
+ $L__tmp39:
492
+ .loc 2 243 36
493
+ mov.b32 %r140, %f183;
494
+ shfl.sync.bfly.b32 %r141, %r140, 1, 31, -1;
495
+ mov.b32 %f184, %r141;
496
+ $L__tmp40:
497
+ .loc 2 233 15
498
+ add.f32 %f185, %f183, %f184;
499
+ $L__tmp41:
500
+ .loc 2 243 36
501
+ mov.b32 %r142, %f457;
502
+ shfl.sync.bfly.b32 %r143, %r142, 16, 31, -1;
503
+ mov.b32 %f186, %r143;
504
+ $L__tmp42:
505
+ .loc 2 233 15
506
+ add.f32 %f187, %f457, %f186;
507
+ $L__tmp43:
508
+ .loc 2 243 36
509
+ mov.b32 %r144, %f187;
510
+ shfl.sync.bfly.b32 %r145, %r144, 8, 31, -1;
511
+ mov.b32 %f188, %r145;
512
+ $L__tmp44:
513
+ .loc 2 233 15
514
+ add.f32 %f189, %f187, %f188;
515
+ $L__tmp45:
516
+ .loc 2 243 36
517
+ mov.b32 %r146, %f189;
518
+ shfl.sync.bfly.b32 %r147, %r146, 4, 31, -1;
519
+ mov.b32 %f190, %r147;
520
+ $L__tmp46:
521
+ .loc 2 233 15
522
+ add.f32 %f191, %f189, %f190;
523
+ $L__tmp47:
524
+ .loc 2 243 36
525
+ mov.b32 %r148, %f191;
526
+ shfl.sync.bfly.b32 %r149, %r148, 2, 31, -1;
527
+ mov.b32 %f192, %r149;
528
+ $L__tmp48:
529
+ .loc 2 233 15
530
+ add.f32 %f193, %f191, %f192;
531
+ $L__tmp49:
532
+ .loc 2 243 36
533
+ mov.b32 %r150, %f193;
534
+ shfl.sync.bfly.b32 %r151, %r150, 1, 31, -1;
535
+ mov.b32 %f194, %r151;
536
+ $L__tmp50:
537
+ .loc 2 233 15
538
+ add.f32 %f195, %f193, %f194;
539
+ $L__tmp51:
540
+ .loc 2 243 36
541
+ mov.b32 %r152, %f458;
542
+ shfl.sync.bfly.b32 %r153, %r152, 16, 31, -1;
543
+ mov.b32 %f196, %r153;
544
+ $L__tmp52:
545
+ .loc 2 233 15
546
+ add.f32 %f197, %f458, %f196;
547
+ $L__tmp53:
548
+ .loc 2 243 36
549
+ mov.b32 %r154, %f197;
550
+ shfl.sync.bfly.b32 %r155, %r154, 8, 31, -1;
551
+ mov.b32 %f198, %r155;
552
+ $L__tmp54:
553
+ .loc 2 233 15
554
+ add.f32 %f199, %f197, %f198;
555
+ $L__tmp55:
556
+ .loc 2 243 36
557
+ mov.b32 %r156, %f199;
558
+ shfl.sync.bfly.b32 %r157, %r156, 4, 31, -1;
559
+ mov.b32 %f200, %r157;
560
+ $L__tmp56:
561
+ .loc 2 233 15
562
+ add.f32 %f201, %f199, %f200;
563
+ $L__tmp57:
564
+ .loc 2 243 36
565
+ mov.b32 %r158, %f201;
566
+ shfl.sync.bfly.b32 %r159, %r158, 2, 31, -1;
567
+ mov.b32 %f202, %r159;
568
+ $L__tmp58:
569
+ .loc 2 233 15
570
+ add.f32 %f203, %f201, %f202;
571
+ $L__tmp59:
572
+ .loc 2 243 36
573
+ mov.b32 %r160, %f203;
574
+ shfl.sync.bfly.b32 %r161, %r160, 1, 31, -1;
575
+ mov.b32 %f204, %r161;
576
+ $L__tmp60:
577
+ .loc 2 233 15
578
+ add.f32 %f205, %f203, %f204;
579
+ $L__tmp61:
580
+ .loc 2 243 36
581
+ mov.b32 %r162, %f459;
582
+ shfl.sync.bfly.b32 %r163, %r162, 16, 31, -1;
583
+ mov.b32 %f206, %r163;
584
+ $L__tmp62:
585
+ .loc 2 233 15
586
+ add.f32 %f207, %f459, %f206;
587
+ $L__tmp63:
588
+ .loc 2 243 36
589
+ mov.b32 %r164, %f207;
590
+ shfl.sync.bfly.b32 %r165, %r164, 8, 31, -1;
591
+ mov.b32 %f208, %r165;
592
+ $L__tmp64:
593
+ .loc 2 233 15
594
+ add.f32 %f209, %f207, %f208;
595
+ $L__tmp65:
596
+ .loc 2 243 36
597
+ mov.b32 %r166, %f209;
598
+ shfl.sync.bfly.b32 %r167, %r166, 4, 31, -1;
599
+ mov.b32 %f210, %r167;
600
+ $L__tmp66:
601
+ .loc 2 233 15
602
+ add.f32 %f211, %f209, %f210;
603
+ $L__tmp67:
604
+ .loc 2 243 36
605
+ mov.b32 %r168, %f211;
606
+ shfl.sync.bfly.b32 %r169, %r168, 2, 31, -1;
607
+ mov.b32 %f212, %r169;
608
+ $L__tmp68:
609
+ .loc 2 233 15
610
+ add.f32 %f213, %f211, %f212;
611
+ $L__tmp69:
612
+ .loc 2 243 36
613
+ mov.b32 %r170, %f213;
614
+ shfl.sync.bfly.b32 %r171, %r170, 1, 31, -1;
615
+ mov.b32 %f214, %r171;
616
+ $L__tmp70:
617
+ .loc 2 233 15
618
+ add.f32 %f215, %f213, %f214;
619
+ $L__tmp71:
620
+ .loc 2 243 36
621
+ mov.b32 %r172, %f460;
622
+ shfl.sync.bfly.b32 %r173, %r172, 16, 31, -1;
623
+ mov.b32 %f216, %r173;
624
+ $L__tmp72:
625
+ .loc 2 233 15
626
+ add.f32 %f217, %f460, %f216;
627
+ $L__tmp73:
628
+ .loc 2 243 36
629
+ mov.b32 %r174, %f217;
630
+ shfl.sync.bfly.b32 %r175, %r174, 8, 31, -1;
631
+ mov.b32 %f218, %r175;
632
+ $L__tmp74:
633
+ .loc 2 233 15
634
+ add.f32 %f219, %f217, %f218;
635
+ $L__tmp75:
636
+ .loc 2 243 36
637
+ mov.b32 %r176, %f219;
638
+ shfl.sync.bfly.b32 %r177, %r176, 4, 31, -1;
639
+ mov.b32 %f220, %r177;
640
+ $L__tmp76:
641
+ .loc 2 233 15
642
+ add.f32 %f221, %f219, %f220;
643
+ $L__tmp77:
644
+ .loc 2 243 36
645
+ mov.b32 %r178, %f221;
646
+ shfl.sync.bfly.b32 %r179, %r178, 2, 31, -1;
647
+ mov.b32 %f222, %r179;
648
+ $L__tmp78:
649
+ .loc 2 233 15
650
+ add.f32 %f223, %f221, %f222;
651
+ $L__tmp79:
652
+ .loc 2 243 36
653
+ mov.b32 %r180, %f223;
654
+ shfl.sync.bfly.b32 %r181, %r180, 1, 31, -1;
655
+ mov.b32 %f224, %r181;
656
+ $L__tmp80:
657
+ .loc 2 233 15
658
+ add.f32 %f225, %f223, %f224;
659
+ $L__tmp81:
660
+ .loc 2 243 36
661
+ mov.b32 %r182, %f461;
662
+ shfl.sync.bfly.b32 %r183, %r182, 16, 31, -1;
663
+ mov.b32 %f226, %r183;
664
+ $L__tmp82:
665
+ .loc 2 233 15
666
+ add.f32 %f227, %f461, %f226;
667
+ $L__tmp83:
668
+ .loc 2 243 36
669
+ mov.b32 %r184, %f227;
670
+ shfl.sync.bfly.b32 %r185, %r184, 8, 31, -1;
671
+ mov.b32 %f228, %r185;
672
+ $L__tmp84:
673
+ .loc 2 233 15
674
+ add.f32 %f229, %f227, %f228;
675
+ $L__tmp85:
676
+ .loc 2 243 36
677
+ mov.b32 %r186, %f229;
678
+ shfl.sync.bfly.b32 %r187, %r186, 4, 31, -1;
679
+ mov.b32 %f230, %r187;
680
+ $L__tmp86:
681
+ .loc 2 233 15
682
+ add.f32 %f231, %f229, %f230;
683
+ $L__tmp87:
684
+ .loc 2 243 36
685
+ mov.b32 %r188, %f231;
686
+ shfl.sync.bfly.b32 %r189, %r188, 2, 31, -1;
687
+ mov.b32 %f232, %r189;
688
+ $L__tmp88:
689
+ .loc 2 233 15
690
+ add.f32 %f233, %f231, %f232;
691
+ $L__tmp89:
692
+ .loc 2 243 36
693
+ mov.b32 %r190, %f233;
694
+ shfl.sync.bfly.b32 %r191, %r190, 1, 31, -1;
695
+ mov.b32 %f234, %r191;
696
+ $L__tmp90:
697
+ .loc 2 233 15
698
+ add.f32 %f235, %f233, %f234;
699
+ $L__tmp91:
700
+ .loc 2 243 36
701
+ mov.b32 %r192, %f462;
702
+ shfl.sync.bfly.b32 %r193, %r192, 16, 31, -1;
703
+ mov.b32 %f236, %r193;
704
+ $L__tmp92:
705
+ .loc 2 233 15
706
+ add.f32 %f237, %f462, %f236;
707
+ $L__tmp93:
708
+ .loc 2 243 36
709
+ mov.b32 %r194, %f237;
710
+ shfl.sync.bfly.b32 %r195, %r194, 8, 31, -1;
711
+ mov.b32 %f238, %r195;
712
+ $L__tmp94:
713
+ .loc 2 233 15
714
+ add.f32 %f239, %f237, %f238;
715
+ $L__tmp95:
716
+ .loc 2 243 36
717
+ mov.b32 %r196, %f239;
718
+ shfl.sync.bfly.b32 %r197, %r196, 4, 31, -1;
719
+ mov.b32 %f240, %r197;
720
+ $L__tmp96:
721
+ .loc 2 233 15
722
+ add.f32 %f241, %f239, %f240;
723
+ $L__tmp97:
724
+ .loc 2 243 36
725
+ mov.b32 %r198, %f241;
726
+ shfl.sync.bfly.b32 %r199, %r198, 2, 31, -1;
727
+ mov.b32 %f242, %r199;
728
+ $L__tmp98:
729
+ .loc 2 233 15
730
+ add.f32 %f243, %f241, %f242;
731
+ $L__tmp99:
732
+ .loc 2 243 36
733
+ mov.b32 %r200, %f243;
734
+ shfl.sync.bfly.b32 %r201, %r200, 1, 31, -1;
735
+ mov.b32 %f244, %r201;
736
+ $L__tmp100:
737
+ .loc 2 233 15
738
+ add.f32 %f245, %f243, %f244;
739
+ $L__tmp101:
740
+ .loc 2 243 36
741
+ mov.b32 %r202, %f463;
742
+ shfl.sync.bfly.b32 %r203, %r202, 16, 31, -1;
743
+ mov.b32 %f246, %r203;
744
+ $L__tmp102:
745
+ .loc 2 233 15
746
+ add.f32 %f247, %f463, %f246;
747
+ $L__tmp103:
748
+ .loc 2 243 36
749
+ mov.b32 %r204, %f247;
750
+ shfl.sync.bfly.b32 %r205, %r204, 8, 31, -1;
751
+ mov.b32 %f248, %r205;
752
+ $L__tmp104:
753
+ .loc 2 233 15
754
+ add.f32 %f249, %f247, %f248;
755
+ $L__tmp105:
756
+ .loc 2 243 36
757
+ mov.b32 %r206, %f249;
758
+ shfl.sync.bfly.b32 %r207, %r206, 4, 31, -1;
759
+ mov.b32 %f250, %r207;
760
+ $L__tmp106:
761
+ .loc 2 233 15
762
+ add.f32 %f251, %f249, %f250;
763
+ $L__tmp107:
764
+ .loc 2 243 36
765
+ mov.b32 %r208, %f251;
766
+ shfl.sync.bfly.b32 %r209, %r208, 2, 31, -1;
767
+ mov.b32 %f252, %r209;
768
+ $L__tmp108:
769
+ .loc 2 233 15
770
+ add.f32 %f253, %f251, %f252;
771
+ $L__tmp109:
772
+ .loc 2 243 36
773
+ mov.b32 %r210, %f253;
774
+ shfl.sync.bfly.b32 %r211, %r210, 1, 31, -1;
775
+ mov.b32 %f254, %r211;
776
+ $L__tmp110:
777
+ .loc 2 233 15
778
+ add.f32 %f255, %f253, %f254;
779
+ $L__tmp111:
780
+ .loc 2 243 36
781
+ mov.b32 %r212, %f464;
782
+ shfl.sync.bfly.b32 %r213, %r212, 16, 31, -1;
783
+ mov.b32 %f256, %r213;
784
+ $L__tmp112:
785
+ .loc 2 233 15
786
+ add.f32 %f257, %f464, %f256;
787
+ $L__tmp113:
788
+ .loc 2 243 36
789
+ mov.b32 %r214, %f257;
790
+ shfl.sync.bfly.b32 %r215, %r214, 8, 31, -1;
791
+ mov.b32 %f258, %r215;
792
+ $L__tmp114:
793
+ .loc 2 233 15
794
+ add.f32 %f259, %f257, %f258;
795
+ $L__tmp115:
796
+ .loc 2 243 36
797
+ mov.b32 %r216, %f259;
798
+ shfl.sync.bfly.b32 %r217, %r216, 4, 31, -1;
799
+ mov.b32 %f260, %r217;
800
+ $L__tmp116:
801
+ .loc 2 233 15
802
+ add.f32 %f261, %f259, %f260;
803
+ $L__tmp117:
804
+ .loc 2 243 36
805
+ mov.b32 %r218, %f261;
806
+ shfl.sync.bfly.b32 %r219, %r218, 2, 31, -1;
807
+ mov.b32 %f262, %r219;
808
+ $L__tmp118:
809
+ .loc 2 233 15
810
+ add.f32 %f263, %f261, %f262;
811
+ $L__tmp119:
812
+ .loc 2 243 36
813
+ mov.b32 %r220, %f263;
814
+ shfl.sync.bfly.b32 %r221, %r220, 1, 31, -1;
815
+ mov.b32 %f264, %r221;
816
+ $L__tmp120:
817
+ .loc 2 233 15
818
+ add.f32 %f265, %f263, %f264;
819
+ $L__tmp121:
820
+ .loc 2 243 36
821
+ mov.b32 %r222, %f465;
822
+ shfl.sync.bfly.b32 %r223, %r222, 16, 31, -1;
823
+ mov.b32 %f266, %r223;
824
+ $L__tmp122:
825
+ .loc 2 233 15
826
+ add.f32 %f267, %f465, %f266;
827
+ $L__tmp123:
828
+ .loc 2 243 36
829
+ mov.b32 %r224, %f267;
830
+ shfl.sync.bfly.b32 %r225, %r224, 8, 31, -1;
831
+ mov.b32 %f268, %r225;
832
+ $L__tmp124:
833
+ .loc 2 233 15
834
+ add.f32 %f269, %f267, %f268;
835
+ $L__tmp125:
836
+ .loc 2 243 36
837
+ mov.b32 %r226, %f269;
838
+ shfl.sync.bfly.b32 %r227, %r226, 4, 31, -1;
839
+ mov.b32 %f270, %r227;
840
+ $L__tmp126:
841
+ .loc 2 233 15
842
+ add.f32 %f271, %f269, %f270;
843
+ $L__tmp127:
844
+ .loc 2 243 36
845
+ mov.b32 %r228, %f271;
846
+ shfl.sync.bfly.b32 %r229, %r228, 2, 31, -1;
847
+ mov.b32 %f272, %r229;
848
+ $L__tmp128:
849
+ .loc 2 233 15
850
+ add.f32 %f273, %f271, %f272;
851
+ $L__tmp129:
852
+ .loc 2 243 36
853
+ mov.b32 %r230, %f273;
854
+ shfl.sync.bfly.b32 %r231, %r230, 1, 31, -1;
855
+ mov.b32 %f274, %r231;
856
+ $L__tmp130:
857
+ .loc 2 233 15
858
+ add.f32 %f275, %f273, %f274;
859
+ $L__tmp131:
860
+ .loc 2 243 36
861
+ mov.b32 %r232, %f466;
862
+ shfl.sync.bfly.b32 %r233, %r232, 16, 31, -1;
863
+ mov.b32 %f276, %r233;
864
+ $L__tmp132:
865
+ .loc 2 233 15
866
+ add.f32 %f277, %f466, %f276;
867
+ $L__tmp133:
868
+ .loc 2 243 36
869
+ mov.b32 %r234, %f277;
870
+ shfl.sync.bfly.b32 %r235, %r234, 8, 31, -1;
871
+ mov.b32 %f278, %r235;
872
+ $L__tmp134:
873
+ .loc 2 233 15
874
+ add.f32 %f279, %f277, %f278;
875
+ $L__tmp135:
876
+ .loc 2 243 36
877
+ mov.b32 %r236, %f279;
878
+ shfl.sync.bfly.b32 %r237, %r236, 4, 31, -1;
879
+ mov.b32 %f280, %r237;
880
+ $L__tmp136:
881
+ .loc 2 233 15
882
+ add.f32 %f281, %f279, %f280;
883
+ $L__tmp137:
884
+ .loc 2 243 36
885
+ mov.b32 %r238, %f281;
886
+ shfl.sync.bfly.b32 %r239, %r238, 2, 31, -1;
887
+ mov.b32 %f282, %r239;
888
+ $L__tmp138:
889
+ .loc 2 233 15
890
+ add.f32 %f283, %f281, %f282;
891
+ $L__tmp139:
892
+ .loc 2 243 36
893
+ mov.b32 %r240, %f283;
894
+ shfl.sync.bfly.b32 %r241, %r240, 1, 31, -1;
895
+ mov.b32 %f284, %r241;
896
+ $L__tmp140:
897
+ .loc 2 233 15
898
+ add.f32 %f285, %f283, %f284;
899
+ $L__tmp141:
900
+ .loc 2 243 36
901
+ mov.b32 %r242, %f467;
902
+ shfl.sync.bfly.b32 %r243, %r242, 16, 31, -1;
903
+ mov.b32 %f286, %r243;
904
+ $L__tmp142:
905
+ .loc 2 233 15
906
+ add.f32 %f287, %f467, %f286;
907
+ $L__tmp143:
908
+ .loc 2 243 36
909
+ mov.b32 %r244, %f287;
910
+ shfl.sync.bfly.b32 %r245, %r244, 8, 31, -1;
911
+ mov.b32 %f288, %r245;
912
+ $L__tmp144:
913
+ .loc 2 233 15
914
+ add.f32 %f289, %f287, %f288;
915
+ $L__tmp145:
916
+ .loc 2 243 36
917
+ mov.b32 %r246, %f289;
918
+ shfl.sync.bfly.b32 %r247, %r246, 4, 31, -1;
919
+ mov.b32 %f290, %r247;
920
+ $L__tmp146:
921
+ .loc 2 233 15
922
+ add.f32 %f291, %f289, %f290;
923
+ $L__tmp147:
924
+ .loc 2 243 36
925
+ mov.b32 %r248, %f291;
926
+ shfl.sync.bfly.b32 %r249, %r248, 2, 31, -1;
927
+ mov.b32 %f292, %r249;
928
+ $L__tmp148:
929
+ .loc 2 233 15
930
+ add.f32 %f293, %f291, %f292;
931
+ $L__tmp149:
932
+ .loc 2 243 36
933
+ mov.b32 %r250, %f293;
934
+ shfl.sync.bfly.b32 %r251, %r250, 1, 31, -1;
935
+ mov.b32 %f294, %r251;
936
+ $L__tmp150:
937
+ .loc 2 233 15
938
+ add.f32 %f295, %f293, %f294;
939
+ $L__tmp151:
940
+ .loc 2 243 36
941
+ mov.b32 %r252, %f468;
942
+ shfl.sync.bfly.b32 %r253, %r252, 16, 31, -1;
943
+ mov.b32 %f296, %r253;
944
+ $L__tmp152:
945
+ .loc 2 233 15
946
+ add.f32 %f297, %f468, %f296;
947
+ $L__tmp153:
948
+ .loc 2 243 36
949
+ mov.b32 %r254, %f297;
950
+ shfl.sync.bfly.b32 %r255, %r254, 8, 31, -1;
951
+ mov.b32 %f298, %r255;
952
+ $L__tmp154:
953
+ .loc 2 233 15
954
+ add.f32 %f299, %f297, %f298;
955
+ $L__tmp155:
956
+ .loc 2 243 36
957
+ mov.b32 %r256, %f299;
958
+ shfl.sync.bfly.b32 %r257, %r256, 4, 31, -1;
959
+ mov.b32 %f300, %r257;
960
+ $L__tmp156:
961
+ .loc 2 233 15
962
+ add.f32 %f301, %f299, %f300;
963
+ $L__tmp157:
964
+ .loc 2 243 36
965
+ mov.b32 %r258, %f301;
966
+ shfl.sync.bfly.b32 %r259, %r258, 2, 31, -1;
967
+ mov.b32 %f302, %r259;
968
+ $L__tmp158:
969
+ .loc 2 233 15
970
+ add.f32 %f303, %f301, %f302;
971
+ $L__tmp159:
972
+ .loc 2 243 36
973
+ mov.b32 %r260, %f303;
974
+ shfl.sync.bfly.b32 %r261, %r260, 1, 31, -1;
975
+ mov.b32 %f304, %r261;
976
+ $L__tmp160:
977
+ .loc 2 233 15
978
+ add.f32 %f305, %f303, %f304;
979
+ $L__tmp161:
980
+ .loc 2 243 36
981
+ setp.eq.s32 %p68, %r2, 0;
982
+ shr.u32 %r262, %r1, 3;
983
+ and.b32 %r263, %r262, 4;
984
+ shl.b32 %r264, %r101, 3;
985
+ or.b32 %r265, %r264, %r263;
986
+ mov.u32 %r266, global_smem;
987
+ add.s32 %r64, %r266, %r265;
988
+ mov.b32 %r65, %f155;
989
+ @%p68 st.shared.b32 [ %r64 + 0 ], %r65;
990
+ shl.b32 %r267, %r3, 3;
991
+ or.b32 %r268, %r267, %r263;
992
+ add.s32 %r66, %r266, %r268;
993
+ mov.b32 %r67, %f165;
994
+ @%p68 st.shared.b32 [ %r66 + 0 ], %r67;
995
+ shl.b32 %r269, %r4, 3;
996
+ or.b32 %r270, %r269, %r263;
997
+ add.s32 %r68, %r266, %r270;
998
+ mov.b32 %r69, %f175;
999
+ @%p68 st.shared.b32 [ %r68 + 0 ], %r69;
1000
+ shl.b32 %r271, %r5, 3;
1001
+ or.b32 %r272, %r271, %r263;
1002
+ add.s32 %r70, %r266, %r272;
1003
+ mov.b32 %r71, %f185;
1004
+ @%p68 st.shared.b32 [ %r70 + 0 ], %r71;
1005
+ shl.b32 %r273, %r6, 3;
1006
+ or.b32 %r274, %r273, %r263;
1007
+ add.s32 %r72, %r266, %r274;
1008
+ mov.b32 %r73, %f195;
1009
+ @%p68 st.shared.b32 [ %r72 + 0 ], %r73;
1010
+ shl.b32 %r275, %r7, 3;
1011
+ or.b32 %r276, %r275, %r263;
1012
+ add.s32 %r74, %r266, %r276;
1013
+ mov.b32 %r75, %f205;
1014
+ @%p68 st.shared.b32 [ %r74 + 0 ], %r75;
1015
+ shl.b32 %r277, %r8, 3;
1016
+ or.b32 %r278, %r277, %r263;
1017
+ add.s32 %r76, %r266, %r278;
1018
+ mov.b32 %r77, %f215;
1019
+ @%p68 st.shared.b32 [ %r76 + 0 ], %r77;
1020
+ shl.b32 %r279, %r9, 3;
1021
+ or.b32 %r280, %r279, %r263;
1022
+ add.s32 %r78, %r266, %r280;
1023
+ mov.b32 %r79, %f225;
1024
+ @%p68 st.shared.b32 [ %r78 + 0 ], %r79;
1025
+ shl.b32 %r281, %r10, 3;
1026
+ or.b32 %r282, %r281, %r263;
1027
+ add.s32 %r80, %r266, %r282;
1028
+ mov.b32 %r81, %f235;
1029
+ @%p68 st.shared.b32 [ %r80 + 0 ], %r81;
1030
+ shl.b32 %r283, %r11, 3;
1031
+ or.b32 %r284, %r283, %r263;
1032
+ add.s32 %r82, %r266, %r284;
1033
+ mov.b32 %r83, %f245;
1034
+ @%p68 st.shared.b32 [ %r82 + 0 ], %r83;
1035
+ shl.b32 %r285, %r12, 3;
1036
+ or.b32 %r286, %r285, %r263;
1037
+ add.s32 %r84, %r266, %r286;
1038
+ mov.b32 %r85, %f255;
1039
+ @%p68 st.shared.b32 [ %r84 + 0 ], %r85;
1040
+ shl.b32 %r287, %r13, 3;
1041
+ or.b32 %r288, %r287, %r263;
1042
+ add.s32 %r86, %r266, %r288;
1043
+ mov.b32 %r87, %f265;
1044
+ @%p68 st.shared.b32 [ %r86 + 0 ], %r87;
1045
+ shl.b32 %r289, %r14, 3;
1046
+ or.b32 %r290, %r289, %r263;
1047
+ add.s32 %r88, %r266, %r290;
1048
+ mov.b32 %r89, %f275;
1049
+ @%p68 st.shared.b32 [ %r88 + 0 ], %r89;
1050
+ shl.b32 %r291, %r15, 3;
1051
+ or.b32 %r292, %r291, %r263;
1052
+ add.s32 %r90, %r266, %r292;
1053
+ mov.b32 %r91, %f285;
1054
+ @%p68 st.shared.b32 [ %r90 + 0 ], %r91;
1055
+ shl.b32 %r293, %r16, 3;
1056
+ or.b32 %r294, %r293, %r263;
1057
+ add.s32 %r92, %r266, %r294;
1058
+ mov.b32 %r93, %f295;
1059
+ @%p68 st.shared.b32 [ %r92 + 0 ], %r93;
1060
+ shl.b32 %r295, %r17, 3;
1061
+ or.b32 %r296, %r295, %r263;
1062
+ add.s32 %r94, %r266, %r296;
1063
+ mov.b32 %r95, %f305;
1064
+ @%p68 st.shared.b32 [ %r94 + 0 ], %r95;
1065
+ bar.sync 0;
1066
+ setp.lt.s32 %p84, %r1, 128;
1067
+ shl.b32 %r297, %r1, 2;
1068
+ add.s32 %r97, %r266, %r297;
1069
+ @%p84 ld.shared.b32 %r96, [ %r97 + 0 ];
1070
+ mov.b32 %f306, %r96;
1071
+ shfl.sync.bfly.b32 %r298, %r96, 1, 31, -1;
1072
+ mov.b32 %f307, %r298;
1073
+ $L__tmp162:
1074
+ .loc 2 233 15
1075
+ add.f32 %f308, %f306, %f307;
1076
+ $L__tmp163:
1077
+ .loc 2 243 36
1078
+ and.b32 %r299, %r1, 1;
1079
+ setp.eq.b32 %p86, %r299, 1;
1080
+ not.pred %p87, %p86;
1081
+ and.pred %p85, %p84, %p87;
1082
+ mov.b32 %r99, %f308;
1083
+ @%p85 st.shared.b32 [ %r97 + 0 ], %r99;
1084
+ bar.sync 0;
1085
+ add.s32 %r300, %r266, %r264;
1086
+ ld.shared.f32 %f49, [%r300];
1087
+ add.s32 %r301, %r266, %r267;
1088
+ ld.shared.f32 %f50, [%r301];
1089
+ add.s32 %r302, %r266, %r269;
1090
+ ld.shared.f32 %f51, [%r302];
1091
+ add.s32 %r303, %r266, %r271;
1092
+ ld.shared.f32 %f52, [%r303];
1093
+ add.s32 %r304, %r266, %r273;
1094
+ ld.shared.f32 %f53, [%r304];
1095
+ add.s32 %r305, %r266, %r275;
1096
+ ld.shared.f32 %f54, [%r305];
1097
+ add.s32 %r306, %r266, %r277;
1098
+ ld.shared.f32 %f55, [%r306];
1099
+ add.s32 %r307, %r266, %r279;
1100
+ ld.shared.f32 %f56, [%r307];
1101
+ add.s32 %r308, %r266, %r281;
1102
+ ld.shared.f32 %f57, [%r308];
1103
+ add.s32 %r309, %r266, %r283;
1104
+ ld.shared.f32 %f58, [%r309];
1105
+ add.s32 %r310, %r266, %r285;
1106
+ ld.shared.f32 %f59, [%r310];
1107
+ add.s32 %r311, %r266, %r287;
1108
+ ld.shared.f32 %f60, [%r311];
1109
+ add.s32 %r312, %r266, %r289;
1110
+ ld.shared.f32 %f61, [%r312];
1111
+ add.s32 %r313, %r266, %r291;
1112
+ ld.shared.f32 %f62, [%r313];
1113
+ add.s32 %r314, %r266, %r293;
1114
+ ld.shared.f32 %f63, [%r314];
1115
+ add.s32 %r315, %r266, %r295;
1116
+ ld.shared.f32 %f64, [%r315];
1117
+ $L__tmp164:
1118
+ .loc 1 51 36
1119
+ shl.b64 %rd80, %rd3, 1;
1120
+ add.s64 %rd7, %rd17, %rd80;
1121
+ mul.lo.s64 %rd81, %rd1, 6432896;
1122
+ mul.lo.s64 %rd82, %rd2, 100514;
1123
+ add.s64 %rd148, %rd81, %rd82;
1124
+ add.s64 %rd9, %rd16, %rd80;
1125
+ add.s64 %rd10, %rd15, %rd80;
1126
+ mov.b32 %r398, -64;
1127
+ mov.u16 %rs2, 0;
1128
+ $L__BB0_3:
1129
+ add.s32 %r398, %r398, 64;
1130
+ .loc 1 52 27
1131
+ add.s32 %r396, %r398, %r18;
1132
+ .loc 1 53 25
1133
+ setp.lt.u32 %p88, %r396, 50257;
1134
+ .loc 1 55 35
1135
+ add.s64 %rd83, %rd10, %rd148;
1136
+ add.s64 %rd84, %rd83, 402056;
1137
+ add.s64 %rd85, %rd83, 804112;
1138
+ add.s64 %rd86, %rd83, 1206168;
1139
+ add.s64 %rd87, %rd83, 1608224;
1140
+ add.s64 %rd88, %rd83, 2010280;
1141
+ add.s64 %rd89, %rd83, 2412336;
1142
+ add.s64 %rd90, %rd83, 2814392;
1143
+ add.s64 %rd91, %rd83, 3216448;
1144
+ add.s64 %rd92, %rd83, 3618504;
1145
+ add.s64 %rd93, %rd83, 4020560;
1146
+ add.s64 %rd94, %rd83, 4422616;
1147
+ add.s64 %rd95, %rd83, 4824672;
1148
+ add.s64 %rd96, %rd83, 5226728;
1149
+ add.s64 %rd97, %rd83, 5628784;
1150
+ .loc 1 55 53
1151
+ add.s64 %rd98, %rd83, 6030840;
1152
+ mov.u16 %rs1, 0x0;
1153
+ @%p88 ld.global.L1::evict_first.b16 { %rs1 }, [ %rd83 + 0 ];
1154
+ @!%p88 mov.u16 %rs1, %rs2;
1155
+ mov.u16 %rs3, 0x0;
1156
+ @%p88 ld.global.L1::evict_first.b16 { %rs3 }, [ %rd84 + 0 ];
1157
+ @!%p88 mov.u16 %rs3, %rs2;
1158
+ mov.u16 %rs5, 0x0;
1159
+ @%p88 ld.global.L1::evict_first.b16 { %rs5 }, [ %rd85 + 0 ];
1160
+ @!%p88 mov.u16 %rs5, %rs2;
1161
+ mov.u16 %rs7, 0x0;
1162
+ @%p88 ld.global.L1::evict_first.b16 { %rs7 }, [ %rd86 + 0 ];
1163
+ @!%p88 mov.u16 %rs7, %rs2;
1164
+ mov.u16 %rs9, 0x0;
1165
+ @%p88 ld.global.L1::evict_first.b16 { %rs9 }, [ %rd87 + 0 ];
1166
+ @!%p88 mov.u16 %rs9, %rs2;
1167
+ mov.u16 %rs11, 0x0;
1168
+ @%p88 ld.global.L1::evict_first.b16 { %rs11 }, [ %rd88 + 0 ];
1169
+ @!%p88 mov.u16 %rs11, %rs2;
1170
+ mov.u16 %rs13, 0x0;
1171
+ @%p88 ld.global.L1::evict_first.b16 { %rs13 }, [ %rd89 + 0 ];
1172
+ @!%p88 mov.u16 %rs13, %rs2;
1173
+ mov.u16 %rs15, 0x0;
1174
+ @%p88 ld.global.L1::evict_first.b16 { %rs15 }, [ %rd90 + 0 ];
1175
+ @!%p88 mov.u16 %rs15, %rs2;
1176
+ mov.u16 %rs17, 0x0;
1177
+ @%p88 ld.global.L1::evict_first.b16 { %rs17 }, [ %rd91 + 0 ];
1178
+ @!%p88 mov.u16 %rs17, %rs2;
1179
+ mov.u16 %rs19, 0x0;
1180
+ @%p88 ld.global.L1::evict_first.b16 { %rs19 }, [ %rd92 + 0 ];
1181
+ @!%p88 mov.u16 %rs19, %rs2;
1182
+ mov.u16 %rs21, 0x0;
1183
+ @%p88 ld.global.L1::evict_first.b16 { %rs21 }, [ %rd93 + 0 ];
1184
+ @!%p88 mov.u16 %rs21, %rs2;
1185
+ mov.u16 %rs23, 0x0;
1186
+ @%p88 ld.global.L1::evict_first.b16 { %rs23 }, [ %rd94 + 0 ];
1187
+ @!%p88 mov.u16 %rs23, %rs2;
1188
+ mov.u16 %rs25, 0x0;
1189
+ @%p88 ld.global.L1::evict_first.b16 { %rs25 }, [ %rd95 + 0 ];
1190
+ @!%p88 mov.u16 %rs25, %rs2;
1191
+ mov.u16 %rs27, 0x0;
1192
+ @%p88 ld.global.L1::evict_first.b16 { %rs27 }, [ %rd96 + 0 ];
1193
+ @!%p88 mov.u16 %rs27, %rs2;
1194
+ mov.u16 %rs29, 0x0;
1195
+ @%p88 ld.global.L1::evict_first.b16 { %rs29 }, [ %rd97 + 0 ];
1196
+ @!%p88 mov.u16 %rs29, %rs2;
1197
+ mov.u16 %rs31, 0x0;
1198
+ @%p88 ld.global.L1::evict_first.b16 { %rs31 }, [ %rd98 + 0 ];
1199
+ @!%p88 mov.u16 %rs31, %rs2;
1200
+ .loc 1 55 105
1201
+ cvt.f32.bf16 %r316, %rs1;
1202
+ mov.b32 %f341, %r316;
1203
+ cvt.f32.bf16 %r317, %rs3;
1204
+ mov.b32 %f342, %r317;
1205
+ cvt.f32.bf16 %r318, %rs5;
1206
+ mov.b32 %f343, %r318;
1207
+ cvt.f32.bf16 %r319, %rs7;
1208
+ mov.b32 %f344, %r319;
1209
+ cvt.f32.bf16 %r320, %rs9;
1210
+ mov.b32 %f345, %r320;
1211
+ cvt.f32.bf16 %r321, %rs11;
1212
+ mov.b32 %f346, %r321;
1213
+ cvt.f32.bf16 %r322, %rs13;
1214
+ mov.b32 %f347, %r322;
1215
+ cvt.f32.bf16 %r323, %rs15;
1216
+ mov.b32 %f348, %r323;
1217
+ cvt.f32.bf16 %r324, %rs17;
1218
+ mov.b32 %f349, %r324;
1219
+ cvt.f32.bf16 %r325, %rs19;
1220
+ mov.b32 %f350, %r325;
1221
+ cvt.f32.bf16 %r326, %rs21;
1222
+ mov.b32 %f351, %r326;
1223
+ cvt.f32.bf16 %r327, %rs23;
1224
+ mov.b32 %f352, %r327;
1225
+ cvt.f32.bf16 %r328, %rs25;
1226
+ mov.b32 %f353, %r328;
1227
+ cvt.f32.bf16 %r329, %rs27;
1228
+ mov.b32 %f354, %r329;
1229
+ cvt.f32.bf16 %r330, %rs29;
1230
+ mov.b32 %f355, %r330;
1231
+ cvt.f32.bf16 %r331, %rs31;
1232
+ mov.b32 %f356, %r331;
1233
+ .loc 1 56 35
1234
+ add.s64 %rd99, %rd149, -12061680;
1235
+ add.s64 %rd100, %rd149, -11257568;
1236
+ add.s64 %rd101, %rd149, -10453456;
1237
+ add.s64 %rd102, %rd149, -9649344;
1238
+ add.s64 %rd103, %rd149, -8845232;
1239
+ add.s64 %rd104, %rd149, -8041120;
1240
+ add.s64 %rd105, %rd149, -7237008;
1241
+ add.s64 %rd106, %rd149, -6432896;
1242
+ add.s64 %rd107, %rd149, -5628784;
1243
+ add.s64 %rd108, %rd149, -4824672;
1244
+ add.s64 %rd109, %rd149, -4020560;
1245
+ add.s64 %rd110, %rd149, -3216448;
1246
+ add.s64 %rd111, %rd149, -2412336;
1247
+ add.s64 %rd112, %rd149, -1608224;
1248
+ add.s64 %rd113, %rd149, -804112;
1249
+ .loc 1 56 53
1250
+ mov.u32 %r332, 0x0;
1251
+ @%p88 ld.global.L1::evict_first.b32 { %r332 }, [ %rd99 + 0 ];
1252
+ @!%p88 mov.u32 %r332, %r333;
1253
+ mov.b32 %f357, %r332;
1254
+ mov.u32 %r334, 0x0;
1255
+ @%p88 ld.global.L1::evict_first.b32 { %r334 }, [ %rd100 + 0 ];
1256
+ @!%p88 mov.u32 %r334, %r333;
1257
+ mov.b32 %f358, %r334;
1258
+ mov.u32 %r336, 0x0;
1259
+ @%p88 ld.global.L1::evict_first.b32 { %r336 }, [ %rd101 + 0 ];
1260
+ @!%p88 mov.u32 %r336, %r333;
1261
+ mov.b32 %f359, %r336;
1262
+ mov.u32 %r338, 0x0;
1263
+ @%p88 ld.global.L1::evict_first.b32 { %r338 }, [ %rd102 + 0 ];
1264
+ @!%p88 mov.u32 %r338, %r333;
1265
+ mov.b32 %f360, %r338;
1266
+ mov.u32 %r340, 0x0;
1267
+ @%p88 ld.global.L1::evict_first.b32 { %r340 }, [ %rd103 + 0 ];
1268
+ @!%p88 mov.u32 %r340, %r333;
1269
+ mov.b32 %f361, %r340;
1270
+ mov.u32 %r342, 0x0;
1271
+ @%p88 ld.global.L1::evict_first.b32 { %r342 }, [ %rd104 + 0 ];
1272
+ @!%p88 mov.u32 %r342, %r333;
1273
+ mov.b32 %f362, %r342;
1274
+ mov.u32 %r344, 0x0;
1275
+ @%p88 ld.global.L1::evict_first.b32 { %r344 }, [ %rd105 + 0 ];
1276
+ @!%p88 mov.u32 %r344, %r333;
1277
+ mov.b32 %f363, %r344;
1278
+ mov.u32 %r346, 0x0;
1279
+ @%p88 ld.global.L1::evict_first.b32 { %r346 }, [ %rd106 + 0 ];
1280
+ @!%p88 mov.u32 %r346, %r333;
1281
+ mov.b32 %f364, %r346;
1282
+ mov.u32 %r348, 0x0;
1283
+ @%p88 ld.global.L1::evict_first.b32 { %r348 }, [ %rd107 + 0 ];
1284
+ @!%p88 mov.u32 %r348, %r333;
1285
+ mov.b32 %f365, %r348;
1286
+ mov.u32 %r350, 0x0;
1287
+ @%p88 ld.global.L1::evict_first.b32 { %r350 }, [ %rd108 + 0 ];
1288
+ @!%p88 mov.u32 %r350, %r333;
1289
+ mov.b32 %f366, %r350;
1290
+ mov.u32 %r352, 0x0;
1291
+ @%p88 ld.global.L1::evict_first.b32 { %r352 }, [ %rd109 + 0 ];
1292
+ @!%p88 mov.u32 %r352, %r333;
1293
+ mov.b32 %f367, %r352;
1294
+ mov.u32 %r354, 0x0;
1295
+ @%p88 ld.global.L1::evict_first.b32 { %r354 }, [ %rd110 + 0 ];
1296
+ @!%p88 mov.u32 %r354, %r333;
1297
+ mov.b32 %f368, %r354;
1298
+ mov.u32 %r356, 0x0;
1299
+ @%p88 ld.global.L1::evict_first.b32 { %r356 }, [ %rd111 + 0 ];
1300
+ @!%p88 mov.u32 %r356, %r333;
1301
+ mov.b32 %f369, %r356;
1302
+ mov.u32 %r358, 0x0;
1303
+ @%p88 ld.global.L1::evict_first.b32 { %r358 }, [ %rd112 + 0 ];
1304
+ @!%p88 mov.u32 %r358, %r333;
1305
+ mov.b32 %f370, %r358;
1306
+ mov.u32 %r360, 0x0;
1307
+ @%p88 ld.global.L1::evict_first.b32 { %r360 }, [ %rd113 + 0 ];
1308
+ @!%p88 mov.u32 %r360, %r333;
1309
+ mov.b32 %f371, %r360;
1310
+ mov.u32 %r362, 0x0;
1311
+ @%p88 ld.global.L1::evict_first.b32 { %r362 }, [ %rd149 + 0 ];
1312
+ @!%p88 mov.u32 %r362, %r333;
1313
+ mov.b32 %f372, %r362;
1314
+ .loc 1 57 35
1315
+ add.s64 %rd115, %rd9, %rd148;
1316
+ add.s64 %rd116, %rd115, 402056;
1317
+ add.s64 %rd117, %rd115, 804112;
1318
+ add.s64 %rd118, %rd115, 1206168;
1319
+ add.s64 %rd119, %rd115, 1608224;
1320
+ add.s64 %rd120, %rd115, 2010280;
1321
+ add.s64 %rd121, %rd115, 2412336;
1322
+ add.s64 %rd122, %rd115, 2814392;
1323
+ add.s64 %rd123, %rd115, 3216448;
1324
+ add.s64 %rd124, %rd115, 3618504;
1325
+ add.s64 %rd125, %rd115, 4020560;
1326
+ add.s64 %rd126, %rd115, 4422616;
1327
+ add.s64 %rd127, %rd115, 4824672;
1328
+ add.s64 %rd128, %rd115, 5226728;
1329
+ add.s64 %rd129, %rd115, 5628784;
1330
+ .loc 1 57 53
1331
+ add.s64 %rd130, %rd115, 6030840;
1332
+ mov.u16 %rs49, 0x0;
1333
+ @%p88 ld.global.L1::evict_first.b16 { %rs49 }, [ %rd115 + 0 ];
1334
+ @!%p88 mov.u16 %rs49, %rs2;
1335
+ mov.u16 %rs51, 0x0;
1336
+ @%p88 ld.global.L1::evict_first.b16 { %rs51 }, [ %rd116 + 0 ];
1337
+ @!%p88 mov.u16 %rs51, %rs2;
1338
+ mov.u16 %rs53, 0x0;
1339
+ @%p88 ld.global.L1::evict_first.b16 { %rs53 }, [ %rd117 + 0 ];
1340
+ @!%p88 mov.u16 %rs53, %rs2;
1341
+ mov.u16 %rs55, 0x0;
1342
+ @%p88 ld.global.L1::evict_first.b16 { %rs55 }, [ %rd118 + 0 ];
1343
+ @!%p88 mov.u16 %rs55, %rs2;
1344
+ mov.u16 %rs57, 0x0;
1345
+ @%p88 ld.global.L1::evict_first.b16 { %rs57 }, [ %rd119 + 0 ];
1346
+ @!%p88 mov.u16 %rs57, %rs2;
1347
+ mov.u16 %rs59, 0x0;
1348
+ @%p88 ld.global.L1::evict_first.b16 { %rs59 }, [ %rd120 + 0 ];
1349
+ @!%p88 mov.u16 %rs59, %rs2;
1350
+ mov.u16 %rs61, 0x0;
1351
+ @%p88 ld.global.L1::evict_first.b16 { %rs61 }, [ %rd121 + 0 ];
1352
+ @!%p88 mov.u16 %rs61, %rs2;
1353
+ mov.u16 %rs63, 0x0;
1354
+ @%p88 ld.global.L1::evict_first.b16 { %rs63 }, [ %rd122 + 0 ];
1355
+ @!%p88 mov.u16 %rs63, %rs2;
1356
+ mov.u16 %rs65, 0x0;
1357
+ @%p88 ld.global.L1::evict_first.b16 { %rs65 }, [ %rd123 + 0 ];
1358
+ @!%p88 mov.u16 %rs65, %rs2;
1359
+ mov.u16 %rs67, 0x0;
1360
+ @%p88 ld.global.L1::evict_first.b16 { %rs67 }, [ %rd124 + 0 ];
1361
+ @!%p88 mov.u16 %rs67, %rs2;
1362
+ mov.u16 %rs69, 0x0;
1363
+ @%p88 ld.global.L1::evict_first.b16 { %rs69 }, [ %rd125 + 0 ];
1364
+ @!%p88 mov.u16 %rs69, %rs2;
1365
+ mov.u16 %rs71, 0x0;
1366
+ @%p88 ld.global.L1::evict_first.b16 { %rs71 }, [ %rd126 + 0 ];
1367
+ @!%p88 mov.u16 %rs71, %rs2;
1368
+ mov.u16 %rs73, 0x0;
1369
+ @%p88 ld.global.L1::evict_first.b16 { %rs73 }, [ %rd127 + 0 ];
1370
+ @!%p88 mov.u16 %rs73, %rs2;
1371
+ mov.u16 %rs75, 0x0;
1372
+ @%p88 ld.global.L1::evict_first.b16 { %rs75 }, [ %rd128 + 0 ];
1373
+ @!%p88 mov.u16 %rs75, %rs2;
1374
+ mov.u16 %rs77, 0x0;
1375
+ @%p88 ld.global.L1::evict_first.b16 { %rs77 }, [ %rd129 + 0 ];
1376
+ @!%p88 mov.u16 %rs77, %rs2;
1377
+ mov.u16 %rs79, 0x0;
1378
+ @%p88 ld.global.L1::evict_first.b16 { %rs79 }, [ %rd130 + 0 ];
1379
+ @!%p88 mov.u16 %rs79, %rs2;
1380
+ .loc 1 57 105
1381
+ cvt.f32.bf16 %r364, %rs49;
1382
+ mov.b32 %f373, %r364;
1383
+ cvt.f32.bf16 %r365, %rs51;
1384
+ mov.b32 %f374, %r365;
1385
+ cvt.f32.bf16 %r366, %rs53;
1386
+ mov.b32 %f375, %r366;
1387
+ cvt.f32.bf16 %r367, %rs55;
1388
+ mov.b32 %f376, %r367;
1389
+ cvt.f32.bf16 %r368, %rs57;
1390
+ mov.b32 %f377, %r368;
1391
+ cvt.f32.bf16 %r369, %rs59;
1392
+ mov.b32 %f378, %r369;
1393
+ cvt.f32.bf16 %r370, %rs61;
1394
+ mov.b32 %f379, %r370;
1395
+ cvt.f32.bf16 %r371, %rs63;
1396
+ mov.b32 %f380, %r371;
1397
+ cvt.f32.bf16 %r372, %rs65;
1398
+ mov.b32 %f381, %r372;
1399
+ cvt.f32.bf16 %r373, %rs67;
1400
+ mov.b32 %f382, %r373;
1401
+ cvt.f32.bf16 %r374, %rs69;
1402
+ mov.b32 %f383, %r374;
1403
+ cvt.f32.bf16 %r375, %rs71;
1404
+ mov.b32 %f384, %r375;
1405
+ cvt.f32.bf16 %r376, %rs73;
1406
+ mov.b32 %f385, %r376;
1407
+ cvt.f32.bf16 %r377, %rs75;
1408
+ mov.b32 %f386, %r377;
1409
+ cvt.f32.bf16 %r378, %rs77;
1410
+ mov.b32 %f387, %r378;
1411
+ cvt.f32.bf16 %r379, %rs79;
1412
+ mov.b32 %f388, %r379;
1413
+ .loc 1 65 23
1414
+ mul.f32 %f310, %f373, 0f3FB8AA3B;
1415
+ ex2.approx.f32 %f309, %f310;
1416
+ mul.f32 %f312, %f374, 0f3FB8AA3B;
1417
+ ex2.approx.f32 %f311, %f312;
1418
+ mul.f32 %f314, %f375, 0f3FB8AA3B;
1419
+ ex2.approx.f32 %f313, %f314;
1420
+ mul.f32 %f316, %f376, 0f3FB8AA3B;
1421
+ ex2.approx.f32 %f315, %f316;
1422
+ mul.f32 %f318, %f377, 0f3FB8AA3B;
1423
+ ex2.approx.f32 %f317, %f318;
1424
+ mul.f32 %f320, %f378, 0f3FB8AA3B;
1425
+ ex2.approx.f32 %f319, %f320;
1426
+ mul.f32 %f322, %f379, 0f3FB8AA3B;
1427
+ ex2.approx.f32 %f321, %f322;
1428
+ mul.f32 %f324, %f380, 0f3FB8AA3B;
1429
+ ex2.approx.f32 %f323, %f324;
1430
+ mul.f32 %f326, %f381, 0f3FB8AA3B;
1431
+ ex2.approx.f32 %f325, %f326;
1432
+ mul.f32 %f328, %f382, 0f3FB8AA3B;
1433
+ ex2.approx.f32 %f327, %f328;
1434
+ mul.f32 %f330, %f383, 0f3FB8AA3B;
1435
+ ex2.approx.f32 %f329, %f330;
1436
+ mul.f32 %f332, %f384, 0f3FB8AA3B;
1437
+ ex2.approx.f32 %f331, %f332;
1438
+ mul.f32 %f334, %f385, 0f3FB8AA3B;
1439
+ ex2.approx.f32 %f333, %f334;
1440
+ mul.f32 %f336, %f386, 0f3FB8AA3B;
1441
+ ex2.approx.f32 %f335, %f336;
1442
+ mul.f32 %f338, %f387, 0f3FB8AA3B;
1443
+ ex2.approx.f32 %f337, %f338;
1444
+ mul.f32 %f340, %f388, 0f3FB8AA3B;
1445
+ ex2.approx.f32 %f339, %f340;
1446
+ .loc 1 66 24
1447
+ mul.f32 %f389, %f49, %f309;
1448
+ mul.f32 %f390, %f50, %f311;
1449
+ mul.f32 %f391, %f51, %f313;
1450
+ mul.f32 %f392, %f52, %f315;
1451
+ mul.f32 %f393, %f53, %f317;
1452
+ mul.f32 %f394, %f54, %f319;
1453
+ mul.f32 %f395, %f55, %f321;
1454
+ mul.f32 %f396, %f56, %f323;
1455
+ mul.f32 %f397, %f57, %f325;
1456
+ mul.f32 %f398, %f58, %f327;
1457
+ mul.f32 %f399, %f59, %f329;
1458
+ mul.f32 %f400, %f60, %f331;
1459
+ mul.f32 %f401, %f61, %f333;
1460
+ mul.f32 %f402, %f62, %f335;
1461
+ mul.f32 %f403, %f63, %f337;
1462
+ mul.f32 %f404, %f64, %f339;
1463
+ .loc 1 67 24
1464
+ neg.f32 %f405, %f389;
1465
+ fma.rn.f32 %f406, %f1, %f357, %f405;
1466
+ neg.f32 %f407, %f390;
1467
+ fma.rn.f32 %f408, %f2, %f358, %f407;
1468
+ neg.f32 %f409, %f391;
1469
+ fma.rn.f32 %f410, %f3, %f359, %f409;
1470
+ neg.f32 %f411, %f392;
1471
+ fma.rn.f32 %f412, %f4, %f360, %f411;
1472
+ neg.f32 %f413, %f393;
1473
+ fma.rn.f32 %f414, %f5, %f361, %f413;
1474
+ neg.f32 %f415, %f394;
1475
+ fma.rn.f32 %f416, %f6, %f362, %f415;
1476
+ neg.f32 %f417, %f395;
1477
+ fma.rn.f32 %f418, %f7, %f363, %f417;
1478
+ neg.f32 %f419, %f396;
1479
+ fma.rn.f32 %f420, %f8, %f364, %f419;
1480
+ neg.f32 %f421, %f397;
1481
+ fma.rn.f32 %f422, %f9, %f365, %f421;
1482
+ neg.f32 %f423, %f398;
1483
+ fma.rn.f32 %f424, %f10, %f366, %f423;
1484
+ neg.f32 %f425, %f399;
1485
+ fma.rn.f32 %f426, %f11, %f367, %f425;
1486
+ neg.f32 %f427, %f400;
1487
+ fma.rn.f32 %f428, %f12, %f368, %f427;
1488
+ neg.f32 %f429, %f401;
1489
+ fma.rn.f32 %f430, %f13, %f369, %f429;
1490
+ neg.f32 %f431, %f402;
1491
+ fma.rn.f32 %f432, %f14, %f370, %f431;
1492
+ neg.f32 %f433, %f403;
1493
+ fma.rn.f32 %f434, %f15, %f371, %f433;
1494
+ neg.f32 %f435, %f404;
1495
+ fma.rn.f32 %f436, %f16, %f372, %f435;
1496
+ .loc 1 69 24
1497
+ add.f32 %f437, %f341, %f406;
1498
+ add.f32 %f438, %f342, %f408;
1499
+ add.f32 %f439, %f343, %f410;
1500
+ add.f32 %f440, %f344, %f412;
1501
+ add.f32 %f441, %f345, %f414;
1502
+ add.f32 %f442, %f346, %f416;
1503
+ add.f32 %f443, %f347, %f418;
1504
+ add.f32 %f444, %f348, %f420;
1505
+ add.f32 %f445, %f349, %f422;
1506
+ add.f32 %f446, %f350, %f424;
1507
+ add.f32 %f447, %f351, %f426;
1508
+ add.f32 %f448, %f352, %f428;
1509
+ add.f32 %f449, %f353, %f430;
1510
+ add.f32 %f450, %f354, %f432;
1511
+ add.f32 %f451, %f355, %f434;
1512
+ add.f32 %f452, %f356, %f436;
1513
+ .loc 1 70 29
1514
+ add.s64 %rd131, %rd7, %rd148;
1515
+ add.s64 %rd132, %rd131, 402056;
1516
+ add.s64 %rd133, %rd131, 804112;
1517
+ add.s64 %rd134, %rd131, 1206168;
1518
+ add.s64 %rd135, %rd131, 1608224;
1519
+ add.s64 %rd136, %rd131, 2010280;
1520
+ add.s64 %rd137, %rd131, 2412336;
1521
+ add.s64 %rd138, %rd131, 2814392;
1522
+ add.s64 %rd139, %rd131, 3216448;
1523
+ add.s64 %rd140, %rd131, 3618504;
1524
+ add.s64 %rd141, %rd131, 4020560;
1525
+ add.s64 %rd142, %rd131, 4422616;
1526
+ add.s64 %rd143, %rd131, 4824672;
1527
+ add.s64 %rd144, %rd131, 5226728;
1528
+ add.s64 %rd145, %rd131, 5628784;
1529
+ .loc 1 70 54
1530
+ add.s64 %rd146, %rd131, 6030840;
1531
+ mov.b32 %r380, %f437;
1532
+ cvt.rn.bf16.f32 %rs97, %r380;
1533
+ mov.b32 %r381, %f438;
1534
+ cvt.rn.bf16.f32 %rs98, %r381;
1535
+ mov.b32 %r382, %f439;
1536
+ cvt.rn.bf16.f32 %rs99, %r382;
1537
+ mov.b32 %r383, %f440;
1538
+ cvt.rn.bf16.f32 %rs100, %r383;
1539
+ mov.b32 %r384, %f441;
1540
+ cvt.rn.bf16.f32 %rs101, %r384;
1541
+ mov.b32 %r385, %f442;
1542
+ cvt.rn.bf16.f32 %rs102, %r385;
1543
+ mov.b32 %r386, %f443;
1544
+ cvt.rn.bf16.f32 %rs103, %r386;
1545
+ mov.b32 %r387, %f444;
1546
+ cvt.rn.bf16.f32 %rs104, %r387;
1547
+ mov.b32 %r388, %f445;
1548
+ cvt.rn.bf16.f32 %rs105, %r388;
1549
+ mov.b32 %r389, %f446;
1550
+ cvt.rn.bf16.f32 %rs106, %r389;
1551
+ mov.b32 %r390, %f447;
1552
+ cvt.rn.bf16.f32 %rs107, %r390;
1553
+ mov.b32 %r391, %f448;
1554
+ cvt.rn.bf16.f32 %rs108, %r391;
1555
+ mov.b32 %r392, %f449;
1556
+ cvt.rn.bf16.f32 %rs109, %r392;
1557
+ mov.b32 %r393, %f450;
1558
+ cvt.rn.bf16.f32 %rs110, %r393;
1559
+ mov.b32 %r394, %f451;
1560
+ cvt.rn.bf16.f32 %rs111, %r394;
1561
+ mov.b32 %r395, %f452;
1562
+ cvt.rn.bf16.f32 %rs112, %r395;
1563
+ @%p88 st.global.b16 [ %rd131 + 0 ], { %rs97 };
1564
+ @%p88 st.global.b16 [ %rd132 + 0 ], { %rs98 };
1565
+ @%p88 st.global.b16 [ %rd133 + 0 ], { %rs99 };
1566
+ @%p88 st.global.b16 [ %rd134 + 0 ], { %rs100 };
1567
+ @%p88 st.global.b16 [ %rd135 + 0 ], { %rs101 };
1568
+ @%p88 st.global.b16 [ %rd136 + 0 ], { %rs102 };
1569
+ @%p88 st.global.b16 [ %rd137 + 0 ], { %rs103 };
1570
+ @%p88 st.global.b16 [ %rd138 + 0 ], { %rs104 };
1571
+ @%p88 st.global.b16 [ %rd139 + 0 ], { %rs105 };
1572
+ @%p88 st.global.b16 [ %rd140 + 0 ], { %rs106 };
1573
+ @%p88 st.global.b16 [ %rd141 + 0 ], { %rs107 };
1574
+ @%p88 st.global.b16 [ %rd142 + 0 ], { %rs108 };
1575
+ @%p88 st.global.b16 [ %rd143 + 0 ], { %rs109 };
1576
+ @%p88 st.global.b16 [ %rd144 + 0 ], { %rs110 };
1577
+ @%p88 st.global.b16 [ %rd145 + 0 ], { %rs111 };
1578
+ @%p88 st.global.b16 [ %rd146 + 0 ], { %rs112 };
1579
+ .loc 1 51 36
1580
+ add.s64 %rd149, %rd149, 256;
1581
+ add.s64 %rd148, %rd148, 128;
1582
+ setp.lt.u32 %p200, %r398, 50193;
1583
+ @%p200 bra $L__BB0_3;
1584
+ .loc 1 51 4
1585
+ ret;
1586
+ $L__tmp165:
1587
+ $L__func_end0:
1588
+
1589
+ }
1590
+ .file 1 "/tmp/torchinductor_root/kz/ckzgl7thb4xdfkfnd2tidks6mt5f3hauwfyjflbtzyepo5oxkvhk.py"
1591
+ .file 2 "/usr/local/lib/python3.10/dist-packages/triton/language/standard.py"
1592
+ .section .debug_abbrev
1593
+ {
1594
+ .b8 1
1595
+ .b8 17
1596
+ .b8 1
1597
+ .b8 37
1598
+ .b8 8
1599
+ .b8 19
1600
+ .b8 5
1601
+ .b8 3
1602
+ .b8 8
1603
+ .b8 16
1604
+ .b8 6
1605
+ .b8 27
1606
+ .b8 8
1607
+ .b8 180
1608
+ .b8 66
1609
+ .b8 12
1610
+ .b8 17
1611
+ .b8 1
1612
+ .b8 18
1613
+ .b8 1
1614
+ .b8 0
1615
+ .b8 0
1616
+ .b8 2
1617
+ .b8 46
1618
+ .b8 0
1619
+ .b8 135
1620
+ .b8 64
1621
+ .b8 8
1622
+ .b8 3
1623
+ .b8 8
1624
+ .b8 58
1625
+ .b8 11
1626
+ .b8 59
1627
+ .b8 11
1628
+ .b8 63
1629
+ .b8 12
1630
+ .b8 32
1631
+ .b8 11
1632
+ .b8 0
1633
+ .b8 0
1634
+ .b8 3
1635
+ .b8 46
1636
+ .b8 1
1637
+ .b8 17
1638
+ .b8 1
1639
+ .b8 18
1640
+ .b8 1
1641
+ .b8 64
1642
+ .b8 10
1643
+ .b8 49
1644
+ .b8 19
1645
+ .b8 0
1646
+ .b8 0
1647
+ .b8 4
1648
+ .b8 29
1649
+ .b8 0
1650
+ .b8 49
1651
+ .b8 19
1652
+ .b8 17
1653
+ .b8 1
1654
+ .b8 18
1655
+ .b8 1
1656
+ .b8 88
1657
+ .b8 11
1658
+ .b8 89
1659
+ .b8 11
1660
+ .b8 87
1661
+ .b8 11
1662
+ .b8 0
1663
+ .b8 0
1664
+ .b8 5
1665
+ .b8 29
1666
+ .b8 1
1667
+ .b8 49
1668
+ .b8 19
1669
+ .b8 17
1670
+ .b8 1
1671
+ .b8 18
1672
+ .b8 1
1673
+ .b8 88
1674
+ .b8 11
1675
+ .b8 89
1676
+ .b8 11
1677
+ .b8 87
1678
+ .b8 11
1679
+ .b8 0
1680
+ .b8 0
1681
+ .b8 0
1682
+ }
1683
+ .section .debug_info
1684
+ {
1685
+ .b32 278
1686
+ .b8 2
1687
+ .b8 0
1688
+ .b32 .debug_abbrev
1689
+ .b8 8
1690
+ .b8 1
1691
+ .b8 116
1692
+ .b8 114
1693
+ .b8 105
1694
+ .b8 116
1695
+ .b8 111
1696
+ .b8 110
1697
+ .b8 0
1698
+ .b8 2
1699
+ .b8 0
1700
+ .b8 99
1701
+ .b8 107
1702
+ .b8 122
1703
+ .b8 103
1704
+ .b8 108
1705
+ .b8 55
1706
+ .b8 116
1707
+ .b8 104
1708
+ .b8 98
1709
+ .b8 52
1710
+ .b8 120
1711
+ .b8 100
1712
+ .b8 102
1713
+ .b8 107
1714
+ .b8 102
1715
+ .b8 110
1716
+ .b8 100
1717
+ .b8 50
1718
+ .b8 116
1719
+ .b8 105
1720
+ .b8 100
1721
+ .b8 107
1722
+ .b8 115
1723
+ .b8 54
1724
+ .b8 109
1725
+ .b8 116
1726
+ .b8 53
1727
+ .b8 102
1728
+ .b8 51
1729
+ .b8 104
1730
+ .b8 97
1731
+ .b8 117
1732
+ .b8 119
1733
+ .b8 102
1734
+ .b8 121
1735
+ .b8 106
1736
+ .b8 102
1737
+ .b8 108
1738
+ .b8 98
1739
+ .b8 116
1740
+ .b8 122
1741
+ .b8 121
1742
+ .b8 101
1743
+ .b8 112
1744
+ .b8 111
1745
+ .b8 53
1746
+ .b8 111
1747
+ .b8 120
1748
+ .b8 107
1749
+ .b8 118
1750
+ .b8 104
1751
+ .b8 107
1752
+ .b8 46
1753
+ .b8 112
1754
+ .b8 121
1755
+ .b8 0
1756
+ .b32 .debug_line
1757
+ .b8 47
1758
+ .b8 116
1759
+ .b8 109
1760
+ .b8 112
1761
+ .b8 47
1762
+ .b8 116
1763
+ .b8 111
1764
+ .b8 114
1765
+ .b8 99
1766
+ .b8 104
1767
+ .b8 105
1768
+ .b8 110
1769
+ .b8 100
1770
+ .b8 117
1771
+ .b8 99
1772
+ .b8 116
1773
+ .b8 111
1774
+ .b8 114
1775
+ .b8 95
1776
+ .b8 114
1777
+ .b8 111
1778
+ .b8 111
1779
+ .b8 116
1780
+ .b8 47
1781
+ .b8 107
1782
+ .b8 122
1783
+ .b8 0
1784
+ .b8 1
1785
+ .b64 $L__func_begin0
1786
+ .b64 $L__func_end0
1787
+ .b8 2
1788
+ .b8 116
1789
+ .b8 114
1790
+ .b8 105
1791
+ .b8 116
1792
+ .b8 111
1793
+ .b8 110
1794
+ .b8 95
1795
+ .b8 95
1796
+ .b8 48
1797
+ .b8 100
1798
+ .b8 49
1799
+ .b8 100
1800
+ .b8 50
1801
+ .b8 100
1802
+ .b8 51
1803
+ .b8 100
1804
+ .b8 52
1805
+ .b8 100
1806
+ .b8 53
1807
+ .b8 100
1808
+ .b8 54
1809
+ .b8 100
1810
+ .b8 55
1811
+ .b8 100
1812
+ .b8 101
1813
+ .b8 56
1814
+ .b8 0
1815
+ .b8 116
1816
+ .b8 114
1817
+ .b8 105
1818
+ .b8 116
1819
+ .b8 111
1820
+ .b8 110
1821
+ .b8 95
1822
+ .b8 95
1823
+ .b8 48
1824
+ .b8 100
1825
+ .b8 49
1826
+ .b8 100
1827
+ .b8 50
1828
+ .b8 100
1829
+ .b8 51
1830
+ .b8 100
1831
+ .b8 52
1832
+ .b8 100
1833
+ .b8 53
1834
+ .b8 100
1835
+ .b8 54
1836
+ .b8 100
1837
+ .b8 55
1838
+ .b8 100
1839
+ .b8 101
1840
+ .b8 56
1841
+ .b8 0
1842
+ .b8 1
1843
+ .b8 18
1844
+ .b8 1
1845
+ .b8 1
1846
+ .b8 3
1847
+ .b64 $L__func_begin0
1848
+ .b64 $L__func_end0
1849
+ .b8 1
1850
+ .b8 156
1851
+ .b32 125
1852
+ .b8 4
1853
+ .b32 125
1854
+ .b64 $L__tmp1
1855
+ .b64 $L__tmp164
1856
+ .b8 2
1857
+ .b8 46
1858
+ .b8 27
1859
+ .b8 5
1860
+ .b32 125
1861
+ .b64 $L__tmp2
1862
+ .b64 $L__tmp163
1863
+ .b8 2
1864
+ .b8 46
1865
+ .b8 27
1866
+ .b8 4
1867
+ .b32 125
1868
+ .b64 $L__tmp2
1869
+ .b64 $L__tmp163
1870
+ .b8 2
1871
+ .b8 243
1872
+ .b8 36
1873
+ .b8 0
1874
+ .b8 0
1875
+ .b8 0
1876
+ }
1877
+ .section .debug_pubnames
1878
+ {
1879
+ .b32 $L__pubNames_end0-$L__pubNames_start0
1880
+ $L__pubNames_start0:
1881
+ .b8 2
1882
+ .b8 0
1883
+ .b32 .debug_info
1884
+ .b32 282
1885
+ .b32 125
1886
+ .b8 116
1887
+ .b8 114
1888
+ .b8 105
1889
+ .b8 116
1890
+ .b8 111
1891
+ .b8 110
1892
+ .b8 95
1893
+ .b8 95
1894
+ .b8 48
1895
+ .b8 100
1896
+ .b8 49
1897
+ .b8 100
1898
+ .b8 50
1899
+ .b8 100
1900
+ .b8 51
1901
+ .b8 100
1902
+ .b8 52
1903
+ .b8 100
1904
+ .b8 53
1905
+ .b8 100
1906
+ .b8 54
1907
+ .b8 100
1908
+ .b8 55
1909
+ .b8 100
1910
+ .b8 101
1911
+ .b8 56
1912
+ .b8 0
1913
+ .b32 0
1914
+ $L__pubNames_end0:
1915
+ }
1916
+ .section .debug_pubtypes
1917
+ {
1918
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
1919
+ $L__pubTypes_start0:
1920
+ .b8 2
1921
+ .b8 0
1922
+ .b32 .debug_info
1923
+ .b32 282
1924
+ .b32 0
1925
+ $L__pubTypes_end0:
1926
+ }
1927
+ .section .debug_loc { }
.triton/dump/3a1c03243d4f9adf7326739f5f7e7c9b/triton_.ttgir ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>  // layout shared by every tensor below: 1x1 elements per thread, 32 lanes along dim 1, 4x2 warps per CTA
2
+ module attributes {"triton_gpu.compute-capability" = 89 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {  // Two-pass kernel: each program handles 64 rows of 50257 columns; pass 1 builds a per-row scaled sum of %arg0, pass 2 writes %arg4 + (%arg0 * scale - exp(%arg5) * row_sum) to %arg6. NOTE(review): the arithmetic resembles a fused log-softmax / NLL-loss backward with -1 as the ignored label - inferred from the ops only, confirm against the generating Python file
3
+ tt.func public @triton__0d1d2d3d4d5d6d7de8(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<i64, 1> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg6: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg7: i64 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg8: i64) attributes {noinline = false} {  // %arg0: f32 data indexed as row * 50257 + col; %arg1: one i64 per row; %arg2, %arg3: f32 scalars (element 0 is read); %arg4, %arg5: bf16 inputs; %arg6: bf16 output; %arg7 and %arg8 are never referenced in this body
4
+ %cst = arith.constant dense<0.000000e+00> : tensor<64x1xf32, #blocked>  // scale used for rows whose %arg1 value is -1
5
+ %cst_0 = arith.constant dense<50257> : tensor<64x1xi64, #blocked>  // row stride in elements
6
+ %cst_1 = arith.constant dense<-1> : tensor<64x1xi64, #blocked>  // sentinel compared against the %arg1 values
7
+ %cst_2 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #blocked>  // initial accumulator and fill value for masked f32 loads
8
+ %c64_i64 = arith.constant 64 : i64
9
+ %cst_3 = arith.constant dense<50257> : tensor<1x64xi64, #blocked>  // column bound used to build the mask
10
+ %c0_i32 = arith.constant 0 : i32
11
+ %c64_i32 = arith.constant 64 : i32
12
+ %c50257_i32 = arith.constant 50257 : i32
13
+ %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x64xbf16, #blocked>  // fill value for masked bf16 loads
14
+ %0 = tt.get_program_id x : i32
15
+ %1 = arith.extsi %0 : i32 to i64
16
+ %2 = arith.muli %1, %c64_i64 : i64  // first row owned by this program = program_id * 64
17
+ %3 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
18
+ %4 = tt.expand_dims %3 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xi32, #blocked>
19
+ %5 = arith.extsi %4 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked>
20
+ %6 = tt.splat %2 : (i64) -> tensor<64x1xi64, #blocked>
21
+ %7 = arith.addi %6, %5 : tensor<64x1xi64, #blocked>  // 64x1 column of row indices
22
+ %8 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
23
+ %9 = tt.expand_dims %8 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) -> tensor<1x64xi32, #blocked>
24
+ %10 = arith.extsi %9 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>  // 1x64 row of in-tile column offsets 0..63
25
+ %11 = tt.splat %arg1 : (!tt.ptr<i64, 1>) -> tensor<64x1x!tt.ptr<i64, 1>, #blocked>
26
+ %12 = tt.addptr %11, %7 : tensor<64x1x!tt.ptr<i64, 1>, #blocked>, tensor<64x1xi64, #blocked>
27
+ %13 = tt.load %12 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x1xi64, #blocked>  // %arg1[row], loaded without a mask
28
+ %14 = tt.addptr %arg2, %c0_i32 : !tt.ptr<f32, 1>, i32
29
+ %15 = tt.load %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32  // scalar %arg2[0]
30
+ %16 = tt.addptr %arg3, %c0_i32 : !tt.ptr<f32, 1>, i32
31
+ %17 = tt.load %16 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32  // scalar %arg3[0]
32
+ %18 = arith.muli %7, %cst_0 : tensor<64x1xi64, #blocked>  // row * 50257 = flat offset of each row start
33
+ %19 = tt.broadcast %18 : (tensor<64x1xi64, #blocked>) -> tensor<64x64xi64, #blocked>
34
+ %20 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<64x64x!tt.ptr<f32, 1>, #blocked>
35
+ %21 = arith.cmpi ne, %13, %cst_1 : tensor<64x1xi64, #blocked>  // true where %arg1[row] != -1
36
+ %22 = arith.divf %15, %17 : f32  // %arg2[0] / %arg3[0]
37
+ %23 = tt.splat %22 : (f32) -> tensor<64x1xf32, #blocked>
38
+ %24 = arith.select %21, %23, %cst : tensor<64x1xi1, #blocked>, tensor<64x1xf32, #blocked>  // per-row scale: the quotient, or 0.0 where %arg1[row] == -1
39
+ %25 = tt.broadcast %24 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>
40
+ %26 = scf.for %arg9 = %c0_i32 to %c50257_i32 step %c64_i32 iter_args(%arg10 = %cst_2) -> (tensor<64x64xf32, #blocked>) : i32 {  // pass 1: walk the columns in 64-wide tiles, accumulating %arg0 * scale
41
+ %33 = arith.extsi %arg9 : i32 to i64
42
+ %34 = tt.splat %33 : (i64) -> tensor<1x64xi64, #blocked>
43
+ %35 = arith.addi %34, %10 : tensor<1x64xi64, #blocked>  // absolute column index of each lane in this tile
44
+ %36 = arith.cmpi slt, %35, %cst_3 : tensor<1x64xi64, #blocked>  // column mask: col < 50257 (guards the final partial tile)
45
+ %37 = tt.broadcast %35 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
46
+ %38 = arith.addi %37, %19 : tensor<64x64xi64, #blocked>  // flat element offset = row * 50257 + col
47
+ %39 = tt.addptr %20, %38 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
48
+ %40 = tt.broadcast %36 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
49
+ %41 = tt.load %39, %40, %cst_2 {cache = 1 : i32, evict = 3 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>  // masked load of %arg0; masked-off lanes read 0.0
50
+ %42 = arith.mulf %41, %25 : tensor<64x64xf32, #blocked>
51
+ %43 = arith.addf %arg10, %42 : tensor<64x64xf32, #blocked>
52
+ %44 = arith.select %40, %43, %arg10 : tensor<64x64xi1, #blocked>, tensor<64x64xf32, #blocked>  // masked-off lanes keep their previous accumulator value
53
+ scf.yield %44 : tensor<64x64xf32, #blocked>
54
+ }
55
+ %27 = "tt.reduce"(%26) <{axis = 1 : i32}> ({  // sum the 64x64 accumulator along axis 1 -> one total per row
56
+ ^bb0(%arg9: f32, %arg10: f32):
57
+ %33 = arith.addf %arg9, %arg10 : f32
58
+ tt.reduce.return %33 : f32
59
+ }) : (tensor<64x64xf32, #blocked>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
60
+ %28 = tt.expand_dims %27 {axis = 1 : i32} : (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64x1xf32, #blocked>
61
+ %29 = tt.splat %arg4 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>, #blocked>
62
+ %30 = tt.splat %arg5 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>, #blocked>
63
+ %31 = tt.broadcast %28 : (tensor<64x1xf32, #blocked>) -> tensor<64x64xf32, #blocked>  // per-row total from pass 1, broadcast across the tile
64
+ %32 = tt.splat %arg6 : (!tt.ptr<bf16, 1>) -> tensor<64x64x!tt.ptr<bf16, 1>, #blocked>
65
+ scf.for %arg9 = %c0_i32 to %c50257_i32 step %c64_i32 : i32 {  // pass 2: revisit every tile and store the combined result to %arg6
66
+ %33 = arith.extsi %arg9 : i32 to i64
67
+ %34 = tt.splat %33 : (i64) -> tensor<1x64xi64, #blocked>
68
+ %35 = arith.addi %34, %10 : tensor<1x64xi64, #blocked>
69
+ %36 = arith.cmpi slt, %35, %cst_3 : tensor<1x64xi64, #blocked>  // same column mask as pass 1
70
+ %37 = tt.broadcast %35 : (tensor<1x64xi64, #blocked>) -> tensor<64x64xi64, #blocked>
71
+ %38 = arith.addi %37, %19 : tensor<64x64xi64, #blocked>  // same flat offset; reused for %arg0, %arg4, %arg5 and %arg6
72
+ %39 = tt.addptr %29, %38 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi64, #blocked>
73
+ %40 = tt.broadcast %36 : (tensor<1x64xi1, #blocked>) -> tensor<64x64xi1, #blocked>
74
+ %41 = tt.load %39, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xbf16, #blocked>  // %arg4 tile
75
+ %42 = arith.extf %41 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked>
76
+ %43 = tt.addptr %20, %38 : tensor<64x64x!tt.ptr<f32, 1>, #blocked>, tensor<64x64xi64, #blocked>
77
+ %44 = tt.load %43, %40, %cst_2 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xf32, #blocked>  // %arg0 tile (second read of the same data)
78
+ %45 = tt.addptr %30, %38 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi64, #blocked>
79
+ %46 = tt.load %45, %40, %cst_4 {cache = 1 : i32, evict = 2 : i32, isVolatile = false} : tensor<64x64xbf16, #blocked>  // %arg5 tile
80
+ %47 = arith.extf %46 : tensor<64x64xbf16, #blocked> to tensor<64x64xf32, #blocked>
81
+ %48 = arith.mulf %44, %25 : tensor<64x64xf32, #blocked>  // %arg0 * per-row scale
82
+ %49 = math.exp %47 : tensor<64x64xf32, #blocked>  // exp(%arg5)
83
+ %50 = arith.mulf %49, %31 : tensor<64x64xf32, #blocked>  // exp(%arg5) * per-row total
84
+ %51 = arith.subf %48, %50 : tensor<64x64xf32, #blocked>
85
+ %52 = arith.addf %42, %51 : tensor<64x64xf32, #blocked>  // %arg4 + (%arg0 * scale - exp(%arg5) * total), computed in f32
86
+ %53 = tt.addptr %32, %38 : tensor<64x64x!tt.ptr<bf16, 1>, #blocked>, tensor<64x64xi64, #blocked>
87
+ %54 = arith.truncf %52 : tensor<64x64xf32, #blocked> to tensor<64x64xbf16, #blocked>
88
+ tt.store %53, %54, %40 {cache = 1 : i32, evict = 1 : i32} : tensor<64x64xbf16, #blocked>  // store to %arg6 as bf16, only where col < 50257
89
+ }
90
+ tt.return
91
+ }
92
+ }
.triton/dump/4993935f9a0e5939755cfb42600362cf/triton_.ptx ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Generated by LLVM NVPTX Back-End
3
+ //
4
+
5
+ .version 8.2
6
+ .target sm_89
7
+ .address_size 64
8
+
9
+ // .globl triton__0d1d2de
10
+
11
+ .visible .entry triton__0d1d2de( // f32 -> bf16 copy kernel: each thread converts two adjacent elements, each CTA covers 512
12
+ .param .u64 triton__0d1d2de_param_0, // address of the f32 source (loaded into %rd3)
13
+ .param .u64 triton__0d1d2de_param_1, // address of the bf16 destination (loaded into %rd4)
14
+ .param .u32 triton__0d1d2de_param_2 // declared but never loaded in this body
15
+ )
16
+ .maxntid 256, 1, 1
17
+ {
18
+ .reg .pred %p<3>;
19
+ .reg .b16 %rs<3>;
20
+ .reg .b32 %r<13>;
21
+ .reg .b64 %rd<7>;
22
+ .loc 1 18 0
23
+ $L__func_begin0:
24
+ .loc 1 18 0
25
+
26
+ ld.param.u64 %rd3, [triton__0d1d2de_param_0];
27
+ ld.param.u64 %rd4, [triton__0d1d2de_param_1];
28
+ $L__tmp0:
29
+ .loc 1 21 36
30
+ mov.u32 %r7, %tid.x;
31
+ shl.b32 %r8, %r7, 1; // tid.x * 2: two elements per thread
32
+ and.b32 %r9, %r8, 510; // even in-block offset, 0..510
33
+ .loc 1 20 28
34
+ mov.u32 %r1, %ctaid.x;
35
+ .loc 1 20 33
36
+ shl.b32 %r10, %r1, 9; // ctaid.x * 512: first element of this CTA
37
+ .loc 1 21 23
38
+ or.b32 %r11, %r10, %r9; // element index; OR acts as ADD because the low 9 bits of %r10 are zero
39
+ .loc 1 24 30
40
+ mul.wide.s32 %rd5, %r11, 4; // source byte offset (4 bytes per element)
41
+ add.s64 %rd1, %rd3, %rd5;
42
+ mov.pred %p1, -1; // predicate is constant true: the load and store below are effectively unguarded
43
+ .loc 1 24 35
44
+ mov.u32 %r4, 0x0;
45
+ mov.u32 %r5, 0x0;
46
+ @%p1 ld.global.v2.b32 { %r4, %r5 }, [ %rd1 + 0 ]; // one 64-bit vector load fetches both 32-bit source values
47
+ .loc 1 26 25
48
+ mul.wide.s32 %rd6, %r11, 2; // destination byte offset (2 bytes per element)
49
+ add.s64 %rd2, %rd4, %rd6;
50
+ .loc 1 26 36
51
+ cvt.rn.bf16.f32 %rs1, %r4; // f32 -> bf16 conversion with the .rn rounding modifier
52
+ cvt.rn.bf16.f32 %rs2, %r5;
53
+ mov.b32 %r12, {%rs1, %rs2}; // pack both bf16 results into one 32-bit register
54
+ @%p1 st.global.b32 [ %rd2 + 0 ], { %r12 }; // single 32-bit store writes the two bf16 outputs
55
+ .loc 1 26 4
56
+ ret;
57
+ $L__tmp1:
58
+ $L__func_end0:
59
+
60
+ }
61
+ .file 1 "/tmp/torchinductor_root/zj/czjxjqxojsyyr4zmce6q6twysnucw6p4l5ujgp6ts2ecrm3ue3ex.py"
62
+ .section .debug_abbrev
63
+ {
64
+ .b8 1
65
+ .b8 17
66
+ .b8 1
67
+ .b8 37
68
+ .b8 8
69
+ .b8 19
70
+ .b8 5
71
+ .b8 3
72
+ .b8 8
73
+ .b8 16
74
+ .b8 6
75
+ .b8 27
76
+ .b8 8
77
+ .b8 180
78
+ .b8 66
79
+ .b8 12
80
+ .b8 17
81
+ .b8 1
82
+ .b8 18
83
+ .b8 1
84
+ .b8 0
85
+ .b8 0
86
+ .b8 2
87
+ .b8 46
88
+ .b8 0
89
+ .b8 17
90
+ .b8 1
91
+ .b8 18
92
+ .b8 1
93
+ .b8 64
94
+ .b8 10
95
+ .b8 135
96
+ .b8 64
97
+ .b8 8
98
+ .b8 3
99
+ .b8 8
100
+ .b8 58
101
+ .b8 11
102
+ .b8 59
103
+ .b8 11
104
+ .b8 63
105
+ .b8 12
106
+ .b8 0
107
+ .b8 0
108
+ .b8 0
109
+ }
110
+ .section .debug_info
111
+ {
112
+ .b32 176
113
+ .b8 2
114
+ .b8 0
115
+ .b32 .debug_abbrev
116
+ .b8 8
117
+ .b8 1
118
+ .b8 116
119
+ .b8 114
120
+ .b8 105
121
+ .b8 116
122
+ .b8 111
123
+ .b8 110
124
+ .b8 0
125
+ .b8 2
126
+ .b8 0
127
+ .b8 99
128
+ .b8 122
129
+ .b8 106
130
+ .b8 120
131
+ .b8 106
132
+ .b8 113
133
+ .b8 120
134
+ .b8 111
135
+ .b8 106
136
+ .b8 115
137
+ .b8 121
138
+ .b8 121
139
+ .b8 114
140
+ .b8 52
141
+ .b8 122
142
+ .b8 109
143
+ .b8 99
144
+ .b8 101
145
+ .b8 54
146
+ .b8 113
147
+ .b8 54
148
+ .b8 116
149
+ .b8 119
150
+ .b8 121
151
+ .b8 115
152
+ .b8 110
153
+ .b8 117
154
+ .b8 99
155
+ .b8 119
156
+ .b8 54
157
+ .b8 112
158
+ .b8 52
159
+ .b8 108
160
+ .b8 53
161
+ .b8 117
162
+ .b8 106
163
+ .b8 103
164
+ .b8 112
165
+ .b8 54
166
+ .b8 116
167
+ .b8 115
168
+ .b8 50
169
+ .b8 101
170
+ .b8 99
171
+ .b8 114
172
+ .b8 109
173
+ .b8 51
174
+ .b8 117
175
+ .b8 101
176
+ .b8 51
177
+ .b8 101
178
+ .b8 120
179
+ .b8 46
180
+ .b8 112
181
+ .b8 121
182
+ .b8 0
183
+ .b32 .debug_line
184
+ .b8 47
185
+ .b8 116
186
+ .b8 109
187
+ .b8 112
188
+ .b8 47
189
+ .b8 116
190
+ .b8 111
191
+ .b8 114
192
+ .b8 99
193
+ .b8 104
194
+ .b8 105
195
+ .b8 110
196
+ .b8 100
197
+ .b8 117
198
+ .b8 99
199
+ .b8 116
200
+ .b8 111
201
+ .b8 114
202
+ .b8 95
203
+ .b8 114
204
+ .b8 111
205
+ .b8 111
206
+ .b8 116
207
+ .b8 47
208
+ .b8 122
209
+ .b8 106
210
+ .b8 0
211
+ .b8 1
212
+ .b64 $L__func_begin0
213
+ .b64 $L__func_end0
214
+ .b8 2
215
+ .b64 $L__func_begin0
216
+ .b64 $L__func_end0
217
+ .b8 1
218
+ .b8 156
219
+ .b8 116
220
+ .b8 114
221
+ .b8 105
222
+ .b8 116
223
+ .b8 111
224
+ .b8 110
225
+ .b8 95
226
+ .b8 95
227
+ .b8 48
228
+ .b8 100
229
+ .b8 49
230
+ .b8 100
231
+ .b8 50
232
+ .b8 100
233
+ .b8 101
234
+ .b8 0
235
+ .b8 116
236
+ .b8 114
237
+ .b8 105
238
+ .b8 116
239
+ .b8 111
240
+ .b8 110
241
+ .b8 95
242
+ .b8 95
243
+ .b8 48
244
+ .b8 100
245
+ .b8 49
246
+ .b8 100
247
+ .b8 50
248
+ .b8 100
249
+ .b8 101
250
+ .b8 0
251
+ .b8 1
252
+ .b8 18
253
+ .b8 1
254
+ .b8 0
255
+ }
256
+ .section .debug_pubnames
257
+ {
258
+ .b32 $L__pubNames_end0-$L__pubNames_start0
259
+ $L__pubNames_start0:
260
+ .b8 2
261
+ .b8 0
262
+ .b32 .debug_info
263
+ .b32 180
264
+ .b32 125
265
+ .b8 116
266
+ .b8 114
267
+ .b8 105
268
+ .b8 116
269
+ .b8 111
270
+ .b8 110
271
+ .b8 95
272
+ .b8 95
273
+ .b8 48
274
+ .b8 100
275
+ .b8 49
276
+ .b8 100
277
+ .b8 50
278
+ .b8 100
279
+ .b8 101
280
+ .b8 0
281
+ .b32 0
282
+ $L__pubNames_end0:
283
+ }
284
+ .section .debug_pubtypes
285
+ {
286
+ .b32 $L__pubTypes_end0-$L__pubTypes_start0
287
+ $L__pubTypes_start0:
288
+ .b8 2
289
+ .b8 0
290
+ .b32 .debug_info
291
+ .b32 180
292
+ .b32 0
293
+ $L__pubTypes_end0:
294
+ }
295
+ .section .debug_loc { }
.triton/dump/4993935f9a0e5939755cfb42600362cf/triton_.ttir ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ module {  // Triton IR for an element-wise f32 -> bf16 conversion; each program instance handles one 512-element block
2
+ tt.func public @triton__0d1d2de(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<bf16, 1> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} {  // %arg0: f32 source; %arg1: bf16 destination; %arg2 is never referenced in this body
3
+ %c512_i32 = arith.constant 512 : i32  // elements per program instance
4
+ %0 = tt.get_program_id x : i32
5
+ %1 = arith.muli %0, %c512_i32 : i32  // first element index of this block = program_id * 512
6
+ %2 = tt.make_range {end = 512 : i32, start = 0 : i32} : tensor<512xi32>
7
+ %3 = tt.splat %1 : (i32) -> tensor<512xi32>
8
+ %4 = arith.addi %3, %2 : tensor<512xi32>  // element indices covered by this block; shared by the load and the store
9
+ %5 = tt.splat %arg0 : (!tt.ptr<f32, 1>) -> tensor<512x!tt.ptr<f32, 1>>
10
+ %6 = tt.addptr %5, %4 : tensor<512x!tt.ptr<f32, 1>>, tensor<512xi32>
11
+ %7 = tt.load %6 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<512xf32>  // load has no mask operand, so all 512 lanes are read
12
+ %8 = tt.splat %arg1 : (!tt.ptr<bf16, 1>) -> tensor<512x!tt.ptr<bf16, 1>>
13
+ %9 = tt.addptr %8, %4 : tensor<512x!tt.ptr<bf16, 1>>, tensor<512xi32>
14
+ %10 = arith.truncf %7 : tensor<512xf32> to tensor<512xbf16>  // narrow f32 -> bf16
15
+ tt.store %9, %10 {cache = 1 : i32, evict = 1 : i32} : tensor<512xbf16>  // store has no mask operand either
16
+ tt.return
17
+ }
18
+ }
.triton/dump/4ce9eb7fe63f19e54893f0c74df91471/triton_.cubin ADDED
Binary file (10.8 kB). View file