; 0-hero's picture
; Add files using upload-large-folder tool
; f9d5f95 verified
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
; Strings handed to @__assertfail by the kernel below. They are fixed-size
; byte arrays whose lengths equal the text lengths, so none of them carries a
; trailing NUL terminator.
; Function-name string passed as the 4th argument of @__assertfail.
@assertFunc_0 = internal constant [25 x i8] c"_call_with_frames_removed"
; File-name string passed as the 2nd argument of @__assertfail.
@assertFile_0 = internal constant [38 x i8] c"<frozen importlib._bootstrap_external>"
; Message string passed as the 1st argument of @__assertfail.
@assertMessage_0 = internal constant [38 x i8] c"index out of bounds: 0 <= tmp7 < 50257"
; Zero-length byte array in address space 3, defined outside this module.
; The kernel uses it as scratch storage for exchanging values between threads.
; NOTE(review): address space 3 is shared memory on the NVPTX target; the
; actual size is presumably supplied at launch time - confirm with the launcher.
@global_smem = external local_unnamed_addr addrspace(3) global [0 x i8]
; Assertion-failure handler, defined outside this module. The single call site
; passes (message, file, line number, function name, i64 1).
declare void @__assertfail(ptr, ptr, i32, ptr, i64) local_unnamed_addr
; Kernel @triton__0d1d2de.
;
; Arguments:
;   %0 - global (addrspace 1) pointer, read as i64 elements (the indices).
;   %1 - global (addrspace 1) pointer, written as 32-bit elements addressed
;        with float-sized strides.
;   %2 - i64; not referenced anywhere in the body.
;
; Work split: each block (ctaid.x) covers 512 consecutive i64 elements of %0.
; With t = tid.x & 127, a thread reads the four elements at offsets
; 2t, 2t+1, 2t+256 and 2t+257 inside that 512-element tile.
;
; Per element the body does the following:
;   1. replaces an index equal to -1 with 0,
;   2. adds 50257 to an index that is negative,
;   3. checks (unsigned) index < 50257 and calls @__assertfail on failure,
;   4. stores the 32-bit pattern 0xBF800000 (float -1.0) at
;      %1[element_position * 50257 + index].
; The check flags and the destination pointers are passed through @global_smem
; so that the thread doing the check/store is not the thread that loaded the
; index.
define void @triton__0d1d2de(ptr addrspace(1) %0, ptr addrspace(1) %1, i64 %2) local_unnamed_addr !dbg !7 {
; --- Element offsets for this thread -----------------------------------------
; %5 = t = tid.x & 127, %6 = 2t, %7 = 2t+1, %8 = 2t+256 (the OR acts as an add
; because 2t is even and below 256).
%4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !10
%5 = and i32 %4, 127, !dbg !10
%6 = shl nuw nsw i32 %5, 1, !dbg !10
%7 = or i32 %6, 1, !dbg !10
%8 = or i32 %6, 256, !dbg !10
; %11 = ctaid.x * 512: first element of this block's tile.
%9 = tail call i32 asm "mov.u32 $0, %ctaid.x;", "=r"() #2, !dbg !11
%10 = sext i32 %9 to i64, !dbg !12
%11 = shl nsw i64 %10, 9, !dbg !13
%12 = zext nneg i32 %6 to i64
%13 = zext nneg i32 %8 to i64
; %14 / %15: global element positions tile+2t and tile+2t+256.
%14 = or i64 %11, %12, !dbg !14
%15 = or i64 %11, %13, !dbg !14
%16 = getelementptr i64, ptr addrspace(1) %0, i64 %14, !dbg !15
%17 = getelementptr i64, ptr addrspace(1) %0, i64 %15, !dbg !15
; --- Load four indices -------------------------------------------------------
; Two 2 x i64 vector loads (ld.global.v2.b64). The outputs are zeroed first and
; the load is predicated, but the predicate operand is the constant true.
%18 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %16, i1 true) #2, !dbg !16
%19 = extractvalue { i64, i64 } %18, 0, !dbg !16
%20 = extractvalue { i64, i64 } %18, 1, !dbg !16
%21 = tail call { i64, i64 } asm sideeffect "mov.u64 $0, 0x0;\0A\09mov.u64 $1, 0x0;\0A\09@$3 ld.global.v2.b64 { $0, $1 }, [ $2 + 0 ];", "=l,=l,l,b"(ptr addrspace(1) %17, i1 true) #2, !dbg !16
%22 = extractvalue { i64, i64 } %21, 0, !dbg !16
%23 = extractvalue { i64, i64 } %21, 1, !dbg !16
; Pack the four indices in reverse order:
;   lane 0 = element 2t+257, lane 1 = element 2t+256,
;   lane 2 = element 2t+1,   lane 3 = element 2t.
%24 = insertelement <4 x i64> poison, i64 %23, i64 0, !dbg !17
%25 = insertelement <4 x i64> %24, i64 %22, i64 1, !dbg !17
%26 = insertelement <4 x i64> %25, i64 %20, i64 2, !dbg !17
%27 = insertelement <4 x i64> %26, i64 %19, i64 3, !dbg !17
; --- Normalise indices -------------------------------------------------------
; An index equal to -1 becomes 0.
%28 = icmp eq <4 x i64> %27, <i64 -1, i64 -1, i64 -1, i64 -1>, !dbg !17
%29 = select <4 x i1> %28, <4 x i64> zeroinitializer, <4 x i64> %27, !dbg !18
; Remaining negative indices are shifted up by 50257.
%30 = add <4 x i64> %29, <i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !19
%31 = icmp slt <4 x i64> %29, zeroinitializer, !dbg !20
%32 = select <4 x i1> %31, <4 x i64> %30, <4 x i64> %29, !dbg !21
; %33: per-lane flag, true when the index is in [0, 50257). The unsigned compare
; also rejects indices that are still negative.
%33 = icmp ult <4 x i64> %32, <i64 50257, i64 50257, i64 50257, i64 50257>, !dbg !22
; --- Exchange in-bounds flags through @global_smem (one byte per flag) --------
; Round 1: write the lane 3 flag to byte 2t and the lane 2 flag to byte 2t+1.
%34 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %12, !dbg !22
%35 = extractelement <4 x i1> %33, i64 3, !dbg !22
%36 = zext i1 %35 to i8, !dbg !22
%37 = insertelement <1 x i8> undef, i8 %36, i64 0, !dbg !22
store <1 x i8> %37, ptr addrspace(3) %34, align 1, !dbg !22
%38 = zext nneg i32 %7 to i64, !dbg !22
%39 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %38, !dbg !22
%40 = extractelement <4 x i1> %33, i64 2, !dbg !22
%41 = zext i1 %40 to i8, !dbg !22
%42 = insertelement <1 x i8> undef, i8 %41, i64 0, !dbg !22
store <1 x i8> %42, ptr addrspace(3) %39, align 1, !dbg !22
; Barrier: all round-1 writes are done before any thread reads.
tail call void @llvm.nvvm.barrier0(), !dbg !22
; Read back bytes t and t+128 (written by other threads).
%43 = zext nneg i32 %5 to i64, !dbg !22
%44 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %43, !dbg !22
%45 = load i8, ptr addrspace(3) %44, align 1, !dbg !22
%46 = or i32 %5, 128, !dbg !22
%47 = zext nneg i32 %46 to i64, !dbg !22
%48 = getelementptr i8, ptr addrspace(3) @global_smem, i64 %47, !dbg !22
%49 = load i8, ptr addrspace(3) %48, align 1, !dbg !22
; Barrier: all round-1 reads are done before the same bytes are overwritten.
tail call void @llvm.nvvm.barrier0(), !dbg !22
; Round 2: same byte slots, now carrying the lane 1 and lane 0 flags.
%50 = extractelement <4 x i1> %33, i64 1, !dbg !22
%51 = zext i1 %50 to i8, !dbg !22
%52 = insertelement <1 x i8> undef, i8 %51, i64 0, !dbg !22
store <1 x i8> %52, ptr addrspace(3) %34, align 1, !dbg !22
%53 = extractelement <4 x i1> %33, i64 0, !dbg !22
%54 = zext i1 %53 to i8, !dbg !22
%55 = insertelement <1 x i8> undef, i8 %54, i64 0, !dbg !22
store <1 x i8> %55, ptr addrspace(3) %39, align 1, !dbg !22
tail call void @llvm.nvvm.barrier0(), !dbg !22
%56 = load i8, ptr addrspace(3) %44, align 1, !dbg !22
%57 = load i8, ptr addrspace(3) %48, align 1, !dbg !22
; --- Assert that every received flag is set ----------------------------------
; %62 marks lanes whose flag byte is 0 (out of bounds). Bitcasting the four i1
; lanes to an i4 lets a single compare test "any lane failed".
%58 = insertelement <4 x i8> poison, i8 %49, i64 0, !dbg !22
%59 = insertelement <4 x i8> %58, i8 %45, i64 1, !dbg !22
%60 = insertelement <4 x i8> %59, i8 %56, i64 2, !dbg !22
%61 = insertelement <4 x i8> %60, i8 %57, i64 3, !dbg !22
%62 = icmp eq <4 x i8> %61, zeroinitializer, !dbg !22
%63 = bitcast <4 x i1> %62 to i4, !dbg !23
; %.not is true when no lane failed; in that case skip the failure block.
%.not = icmp eq i4 %63, 0, !dbg !23
br i1 %.not, label %65, label %64, !dbg !23
; Failure path: report the out-of-bounds index (line number 883, last arg 1).
; In the IR this block falls through to %65, so the stores below still follow
; if the call returns.
; NOTE(review): @__assertfail presumably does not return - confirm.
64: ; preds = %3
tail call void @__assertfail(ptr nonnull @assertMessage_0, ptr nonnull @assertFile_0, i32 883, ptr nonnull @assertFunc_0, i64 1), !dbg !23
br label %65, !dbg !23
65: ; preds = %64, %3
; --- Destination addresses ---------------------------------------------------
; %68 / %69: global element positions tile+2t+257 and tile+2t+1.
%66 = or i32 %6, 257, !dbg !10
%67 = zext nneg i32 %66 to i64
%68 = or i64 %11, %67, !dbg !14
%69 = or i64 %11, %38, !dbg !14
; Element position * 50257: start of that element's 50257-wide row in %1.
%70 = mul nsw i64 %14, 50257, !dbg !24
%71 = mul nsw i64 %69, 50257, !dbg !24
%72 = mul nsw i64 %15, 50257, !dbg !24
%73 = mul nsw i64 %68, 50257, !dbg !24
; Pointer = %1 + index + row start, pairing each lane with the element position
; it was loaded from (lane 3 <-> %14, lane 2 <-> %69, lane 1 <-> %15,
; lane 0 <-> %68).
%74 = extractelement <4 x i64> %32, i64 3, !dbg !25
%75 = getelementptr float, ptr addrspace(1) %1, i64 %74, !dbg !25
%76 = getelementptr float, ptr addrspace(1) %75, i64 %70, !dbg !25
%77 = extractelement <4 x i64> %32, i64 2, !dbg !25
%78 = getelementptr float, ptr addrspace(1) %1, i64 %77, !dbg !25
%79 = getelementptr float, ptr addrspace(1) %78, i64 %71, !dbg !25
%80 = extractelement <4 x i64> %32, i64 1, !dbg !25
%81 = getelementptr float, ptr addrspace(1) %1, i64 %80, !dbg !25
%82 = getelementptr float, ptr addrspace(1) %81, i64 %72, !dbg !25
%83 = extractelement <4 x i64> %32, i64 0, !dbg !25
%84 = getelementptr float, ptr addrspace(1) %1, i64 %83, !dbg !25
%85 = getelementptr float, ptr addrspace(1) %84, i64 %73, !dbg !25
; --- Exchange destination pointers through @global_smem (8 bytes per slot) ----
; Barrier before reusing the scratch area that previously held the flag bytes.
tail call void @llvm.nvvm.barrier0(), !dbg !26
; Round 1: i64 slot 2t <- pointer for lane 3, slot 2t+1 <- pointer for lane 2.
%86 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %12, !dbg !26
%87 = ptrtoint ptr addrspace(1) %76 to i64, !dbg !26
%88 = insertelement <1 x i64> undef, i64 %87, i64 0, !dbg !26
store <1 x i64> %88, ptr addrspace(3) %86, align 8, !dbg !26
%89 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %38, !dbg !26
%90 = ptrtoint ptr addrspace(1) %79 to i64, !dbg !26
%91 = insertelement <1 x i64> undef, i64 %90, i64 0, !dbg !26
store <1 x i64> %91, ptr addrspace(3) %89, align 8, !dbg !26
tail call void @llvm.nvvm.barrier0(), !dbg !26
; Read back i64 slots t and t+128.
%92 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %43, !dbg !26
%93 = load i64, ptr addrspace(3) %92, align 8, !dbg !26
%94 = inttoptr i64 %93 to ptr addrspace(1), !dbg !26
%95 = getelementptr i64, ptr addrspace(3) @global_smem, i64 %47, !dbg !26
%96 = load i64, ptr addrspace(3) %95, align 8, !dbg !26
%97 = inttoptr i64 %96 to ptr addrspace(1), !dbg !26
; Barrier: round-1 reads finish before the slots are overwritten.
tail call void @llvm.nvvm.barrier0(), !dbg !26
; Round 2: same slots, now carrying the pointers for lanes 1 and 0.
%98 = ptrtoint ptr addrspace(1) %82 to i64, !dbg !26
%99 = insertelement <1 x i64> undef, i64 %98, i64 0, !dbg !26
store <1 x i64> %99, ptr addrspace(3) %86, align 8, !dbg !26
%100 = ptrtoint ptr addrspace(1) %85 to i64, !dbg !26
%101 = insertelement <1 x i64> undef, i64 %100, i64 0, !dbg !26
store <1 x i64> %101, ptr addrspace(3) %89, align 8, !dbg !26
tail call void @llvm.nvvm.barrier0(), !dbg !26
%102 = load i64, ptr addrspace(3) %92, align 8, !dbg !26
%103 = inttoptr i64 %102 to ptr addrspace(1), !dbg !26
%104 = load i64, ptr addrspace(3) %95, align 8, !dbg !26
%105 = inttoptr i64 %104 to ptr addrspace(1), !dbg !26
; --- Scatter stores ----------------------------------------------------------
; Write the 32-bit constant -1082130432 (0xBF800000, the bit pattern of float
; -1.0) to each of the four received addresses. The store predicate is the
; constant true, so the stores are unconditional.
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %94, i1 true) #2, !dbg !26
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %97, i1 true) #2, !dbg !26
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %103, i1 true) #2, !dbg !26
tail call void asm sideeffect "@$2 st.global.b32 [ $1 + 0 ], { $0 };", "r,l,b"(i32 -1082130432, ptr addrspace(1) %105, i1 true) #2, !dbg !26
ret void, !dbg !27
}
; Intrinsic returning the thread index in x; the kernel masks it with 127.
; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Block-wide barrier intrinsic. It is marked convergent, so calls to it must
; not be made control-dependent on new conditions by transformations.
; Function Attrs: convergent nocallback nounwind
declare void @llvm.nvvm.barrier0() #1
; #0: attributes of the tid.x intrinsic (reads no memory, speculatable).
attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; #1: attributes of the barrier intrinsic.
attributes #1 = { convergent nocallback nounwind }
; #2: attached to every inline-asm call site in the kernel.
attributes #2 = { nounwind }
; --- Named metadata ----------------------------------------------------------
!llvm.module.flags = !{!0, !1}
!llvm.dbg.cu = !{!2}
; Each annotation node is listed twice (!4 and !5 both appear two times).
!nvvm.annotations = !{!4, !5, !5, !4}
!llvm.ident = !{!6}
; --- Module flags ------------------------------------------------------------
!0 = !{i32 2, !"Debug Info Version", i32 3}
; Sets the "nvvm-reflect-ftz" flag to 1.
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
; --- Debug info: compile unit and originating file ----------------------------
!2 = distinct !DICompileUnit(language: DW_LANG_C, file: !3, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
!3 = !DIFile(filename: "chlrkgpvvbdizdz7sllquet2j7zhtes6meh6kenrqxov26mswvw7.py", directory: "/tmp/torchinductor_root/hl")
; --- NVVM annotations --------------------------------------------------------
; Marks @triton__0d1d2de as a kernel entry point.
!4 = !{ptr @triton__0d1d2de, !"kernel", i32 1}
; Caps the x dimension of the thread block at 128 threads for this kernel.
!5 = !{ptr @triton__0d1d2de, !"maxntidx", i32 128}
; Identification string recorded in !llvm.ident (differs from the "triton"
; producer recorded in the compile unit !2).
!6 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
; --- Debug info: subprogram for the kernel (source line 18 of !3) -------------
!7 = distinct !DISubprogram(name: "triton__0d1d2de", linkageName: "triton__0d1d2de", scope: !3, file: !3, line: 18, type: !8, scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
!8 = !DISubroutineType(cc: DW_CC_normal, types: !9)
!9 = !{}
; --- Debug locations (line/column in !3) referenced by the kernel's !dbg tags --
; !10-!14: thread/block index and element-offset arithmetic.
!10 = !DILocation(line: 21, column: 36, scope: !7)
!11 = !DILocation(line: 20, column: 28, scope: !7)
!12 = !DILocation(line: 20, column: 34, scope: !7)
!13 = !DILocation(line: 20, column: 46, scope: !7)
!14 = !DILocation(line: 21, column: 23, scope: !7)
; !15-!16: address computation and load of the i64 indices.
!15 = !DILocation(line: 24, column: 30, scope: !7)
!16 = !DILocation(line: 24, column: 35, scope: !7)
; !17-!21: -1 replacement and negative-index wrap by 50257.
!17 = !DILocation(line: 26, column: 19, scope: !7)
!18 = !DILocation(line: 28, column: 32, scope: !7)
!19 = !DILocation(line: 29, column: 18, scope: !7)
!20 = !DILocation(line: 30, column: 18, scope: !7)
!21 = !DILocation(line: 31, column: 32, scope: !7)
; !22-!23: bounds check, flag exchange and the assertion branch/call.
!22 = !DILocation(line: 32, column: 36, scope: !7)
!23 = !DILocation(line: 32, column: 51, scope: !7)
; !24-!27: row offset, destination pointers, pointer exchange/stores, return.
!24 = !DILocation(line: 34, column: 39, scope: !7)
!25 = !DILocation(line: 34, column: 25, scope: !7)
!26 = !DILocation(line: 34, column: 51, scope: !7)
!27 = !DILocation(line: 34, column: 4, scope: !7)