; RUN: llc < %s -mcpu=sm_20 | FileCheck %s

target triple = "nvptx64-nvidia-cuda"

%struct.ham = type { [4 x i32] }

; // Verify that load with static offset into parameter is done directly.
; CHECK-LABEL: .visible .entry static_offset
; CHECK-NOT: .local
; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
; CHECK: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
; CHECK: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_addr1]]+12];
; Note: fixed a missing ':' after CHECK below; the directive was previously
; ignored by FileCheck, so the global store was never actually verified.
; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @static_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
bb:
  %tmp = icmp eq i32 %arg2, 3
  br i1 %tmp, label %bb3, label %bb6

bb3:                                              ; preds = %bb
  %tmp4 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 3
  %tmp5 = load i32, i32* %tmp4, align 4
  store i32 %tmp5, i32* %arg, align 4
  br label %bb6

bb6:                                              ; preds = %bb3, %bb
  ret void
}

; // Verify that load with dynamic offset into parameter is also done directly.
; CHECK-LABEL: .visible .entry dynamic_offset
; CHECK-NOT: .local
; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
; CHECK: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
; CHECK: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
; CHECK: add.s64 %[[param_w_offset:rd[0-9]+]], %[[param_addr1]],
; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_w_offset]]];
; Note: fixed a missing ':' after CHECK below; the directive was previously
; ignored by FileCheck, so the global store was never actually verified.
; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];

; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @dynamic_offset(i32* nocapture %arg, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
bb:
  %tmp = sext i32 %arg2 to i64
  %tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %tmp
  %tmp4 = load i32, i32* %tmp3, align 4
  store i32 %tmp4, i32* %arg, align 4
  ret void
}

; Same as above, but with a bitcast present in the chain
; CHECK-LABEL:.visible .entry gep_bitcast
; CHECK-NOT: .local
; CHECK-DAG: ld.param.u64 [[out:%rd[0-9]+]], [gep_bitcast_param_0]
; CHECK-DAG: mov.b64 {{%rd[0-9]+}}, gep_bitcast_param_1
; CHECK-DAG: ld.param.u32 {{%r[0-9]+}}, [gep_bitcast_param_2]
; CHECK: ld.param.u8 [[value:%rs[0-9]+]], [{{%rd[0-9]+}}]
; CHECK: st.global.u8 [{{%rd[0-9]+}}], [[value]];
;
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @gep_bitcast(i8* nocapture %out, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
bb:
  %n64 = sext i32 %n to i64
  %gep = getelementptr inbounds %struct.ham, %struct.ham* %in, i64 0, i32 0, i64 %n64
  %bc = bitcast i32* %gep to i8*
  %load = load i8, i8* %bc, align 4
  store i8 %load, i8* %out, align 4
  ret void
}

; Same as above, but with an ASC(101) present in the chain
; CHECK-LABEL:.visible .entry gep_bitcast_asc
; CHECK-NOT: .local
; CHECK-DAG: ld.param.u64 [[out:%rd[0-9]+]], [gep_bitcast_asc_param_0]
; CHECK-DAG: mov.b64 {{%rd[0-9]+}}, gep_bitcast_asc_param_1
; CHECK-DAG: ld.param.u32 {{%r[0-9]+}}, [gep_bitcast_asc_param_2]
; CHECK: ld.param.u8 [[value:%rs[0-9]+]], [{{%rd[0-9]+}}]
; CHECK: st.global.u8 [{{%rd[0-9]+}}], [[value]];
;
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @gep_bitcast_asc(i8* nocapture %out, %struct.ham* nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
bb:
  %n64 = sext i32 %n to i64
  %gep = getelementptr inbounds %struct.ham, %struct.ham* %in, i64 0, i32 0, i64 %n64
  %bc = bitcast i32* %gep to i8*
  %asc = addrspacecast i8* %bc to i8 addrspace(101)*
  %load = load i8, i8 addrspace(101)* %asc, align 4
  store i8 %load, i8* %out, align 4
  ret void
}


; Verify that if the pointer escapes, then we do fall back onto using a temp copy.
; CHECK-LABEL: .visible .entry pointer_escapes
; CHECK: .local .align 8 .b8 __local_depot{{.*}}
; CHECK: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
; CHECK: add.u64 %[[copy_addr:rd[0-9]+]], %SPL, 0;
; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+12];
; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+8];
; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+4];
; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1];
; CHECK-DAG: st.local.u32 [%[[copy_addr]]+12],
; CHECK-DAG: st.local.u32 [%[[copy_addr]]+8],
; CHECK-DAG: st.local.u32 [%[[copy_addr]]+4],
; CHECK-DAG: st.local.u32 [%[[copy_addr]]],
; CHECK: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
; CHECK: add.s64 %[[copy_w_offset:rd[0-9]+]], %[[copy_addr]],
; CHECK: ld.local.u32 [[value:%r[0-9]+]], [%[[copy_w_offset]]];
; Note: fixed a missing ':' after CHECK below; the directive was previously
; ignored by FileCheck, so the global store was never actually verified.
; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];

; Function Attrs: convergent norecurse nounwind mustprogress
define dso_local void @pointer_escapes(i32* nocapture %arg, %struct.ham* byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 {
bb:
  ; Index the byval struct with a runtime offset, then let the element
  ; pointer escape through an external call, forcing a local temp copy.
  %idx = sext i32 %arg2 to i64
  %elt = getelementptr inbounds %struct.ham, %struct.ham* %arg1, i64 0, i32 0, i64 %idx
  %val = load i32, i32* %elt, align 4
  store i32 %val, i32* %arg, align 4
  %escaped = call i32* @escape(i32* nonnull %elt) #3
  ret void
}

; Function Attrs: convergent nounwind
declare dso_local i32* @escape(i32*) local_unnamed_addr


!llvm.module.flags = !{!0, !1, !2}
!nvvm.annotations = !{!3, !4, !5, !6, !7}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 9, i32 1]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{void (i32*, %struct.ham*, i32)* @static_offset, !"kernel", i32 1}
!4 = !{void (i32*, %struct.ham*, i32)* @dynamic_offset, !"kernel", i32 1}
!5 = !{void (i32*, %struct.ham*, i32)* @pointer_escapes, !"kernel", i32 1}
!6 = !{void (i8*, %struct.ham*, i32)* @gep_bitcast, !"kernel", i32 1}
!7 = !{void (i8*, %struct.ham*, i32)* @gep_bitcast_asc, !"kernel", i32 1}