1; RUN: opt -S -mtriple=amdgcn--  -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s
2
3; DESCRIPTION:
4;
5; There are three lds globals defined here, and each of them is used within exactly one of
6; three non-kernel functions. There are three kernels, each of which *indirectly* calls (through
7; a function pointer loaded from a global) two of the three non-kernel functions. Hence pointer
8; replacement should take place for all three lds, and pointer initialization within each
9; kernel should happen only for those lds that are reachable from that kernel.
10;
11
12; Original LDS should exist.
; The three expected lines below are textually identical to the three definitions that follow
; them, i.e. the original LDS globals are expected to appear unchanged in the output. Each one
; is an addrspace(3) array of four i32, and in this file each is referenced by exactly one of
; @function_1, @function_2 and @function_3.
13; CHECK: @lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4
14; CHECK: @lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4
15; CHECK: @lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4
16@lds_used_within_function_1 = internal addrspace(3) global [4 x i32] undef, align 4
17@lds_used_within_function_2 = internal addrspace(3) global [4 x i32] undef, align 4
18@lds_used_within_function_3 = internal addrspace(3) global [4 x i32] undef, align 4
19
20; Function pointers should exist.
; Each @ptr_to_funcN global is initialized with the address of @function_N. The kernels in this
; file load their callees from these globals, so every call to a non-kernel function here is an
; indirect call. The expected lines are identical to the definitions, i.e. these globals are
; expected to appear unchanged in the output.
21; CHECK: @ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (float)* @function_1, align 8
22; CHECK: @ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8
23; CHECK: @ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i8)* @function_3, align 8
24@ptr_to_func1 = internal local_unnamed_addr externally_initialized global void (float)* @function_1, align 8
25@ptr_to_func2 = internal local_unnamed_addr externally_initialized global void (i16)* @function_2, align 8
26@ptr_to_func3 = internal local_unnamed_addr externally_initialized global void (i8)* @function_3, align 8
27
28; Pointers should be created: one i16 addrspace(3) global named <lds>.ptr per original LDS global.
29; CHECK: @lds_used_within_function_1.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
30; CHECK: @lds_used_within_function_2.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
31; CHECK: @lds_used_within_function_3.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
32
33; Pointer replacement code should be added.
; In this file @function_3 is referenced only as the initializer of @ptr_to_func3. Its body
; does nothing but form the address of element 0 of @lds_used_within_function_3; the result
; %gep is unused.
; Expected output: the direct reference to the LDS global is replaced by a value rebuilt from
; @lds_used_within_function_3.ptr -- load the i16 stored there, apply it as a byte offset to a
; null addrspace(3) pointer, and bitcast the result back to [4 x i32] addrspace(3)* -- and the
; original getelementptr then uses that value (%2) as its base.
; NOTE(review): the label pattern below ("entry:") is the same text in every function of this
; file, so the match relies on FileCheck taking labels in file order; anchoring on the function
; name would be more robust -- confirm against the pass output before changing.
34define internal void @function_3(i8 %c) {
35; CHECK-LABEL: entry:
36; CHECK:   %0 = load i16, i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2
37; CHECK:   %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
38; CHECK:   %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
39; CHECK:   %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0
40; CHECK:   ret void
41entry:
42  %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_3, i32 0, i32 0
43  ret void
44}
45
46; Pointer replacement code should be added.
; In this file @function_2 is referenced only as the initializer of @ptr_to_func2. Its body
; does nothing but form the address of element 0 of @lds_used_within_function_2; the result
; %gep is unused.
; Expected output: the direct reference to the LDS global is replaced by a value rebuilt from
; @lds_used_within_function_2.ptr -- load the i16 stored there, apply it as a byte offset to a
; null addrspace(3) pointer, and bitcast the result back to [4 x i32] addrspace(3)* -- and the
; original getelementptr then uses that value (%2) as its base.
47define internal void @function_2(i16 %i) {
48; CHECK-LABEL: entry:
49; CHECK:   %0 = load i16, i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2
50; CHECK:   %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
51; CHECK:   %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
52; CHECK:   %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0
53; CHECK:   ret void
54entry:
55  %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_2, i32 0, i32 0
56  ret void
57}
58
59; Pointer replacement code should be added.
; In this file @function_1 is referenced only as the initializer of @ptr_to_func1. Its body
; does nothing but form the address of element 0 of @lds_used_within_function_1; the result
; %gep is unused.
; Expected output: the direct reference to the LDS global is replaced by a value rebuilt from
; @lds_used_within_function_1.ptr -- load the i16 stored there, apply it as a byte offset to a
; null addrspace(3) pointer, and bitcast the result back to [4 x i32] addrspace(3)* -- and the
; original getelementptr then uses that value (%2) as its base.
60define internal void @function_1(float %f) {
61; CHECK-LABEL: entry:
62; CHECK:   %0 = load i16, i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2
63; CHECK:   %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
64; CHECK:   %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
65; CHECK:   %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0
66; CHECK:   ret void
67entry:
68  %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds_used_within_function_1, i32 0, i32 0
69  ret void
70}
71
72; Pointer initialization code should be added
; This kernel loads two callees from @ptr_to_func3 and @ptr_to_func1 and calls them indirectly.
; Expected output, in order:
;   entry:   compare the result of llvm.amdgcn.mbcnt.lo(-1, 0) with 0 and branch on it;
;   block 2: (taken when the compare is true) store the i16 ptrtoint of
;            @lds_used_within_function_3 and of @lds_used_within_function_1 into their
;            respective .ptr globals, then fall into block 3;
;   block 3: llvm.amdgcn.wave.barrier, followed by the original kernel body unchanged.
; NOTE(review): no store for @lds_used_within_function_2 is listed, but the patterns do not
; forbid one either (there is no negative pattern) -- confirm whether that is intended.
73define protected amdgpu_kernel void @kernel_calls_function_3_and_1() {
74; CHECK-LABEL: entry:
75; CHECK:   %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
76; CHECK:   %1 = icmp eq i32 %0, 0
77; CHECK:   br i1 %1, label %2, label %3
78;
79; CHECK-LABEL: 2:
80; CHECK:   store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2
81; CHECK:   store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2
82; CHECK:   br label %3
83;
84; CHECK-LABEL: 3:
85; CHECK:   call void @llvm.amdgcn.wave.barrier()
86; CHECK:   %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8
87; CHECK:   %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8
88; CHECK:   call void %fptr3(i8 1)
89; CHECK:   call void %fptr1(float 2.000000e+00)
90; CHECK:   ret void
91entry:
92  %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8
93  %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8
94  call void %fptr3(i8 1)
95  call void %fptr1(float 2.0)
96  ret void
97}
98
99; Pointer initialization code should be added
; This kernel loads two callees from @ptr_to_func2 and @ptr_to_func3 and calls them indirectly.
; Expected output, in order:
;   entry:   compare the result of llvm.amdgcn.mbcnt.lo(-1, 0) with 0 and branch on it;
;   block 2: (taken when the compare is true) store the i16 ptrtoint of
;            @lds_used_within_function_3 and of @lds_used_within_function_2 into their
;            respective .ptr globals, then fall into block 3;
;   block 3: llvm.amdgcn.wave.barrier, followed by the original kernel body unchanged.
; NOTE(review): no store for @lds_used_within_function_1 is listed, but the patterns do not
; forbid one either (there is no negative pattern) -- confirm whether that is intended.
100define protected amdgpu_kernel void @kernel_calls_function_2_and_3() {
101; CHECK-LABEL: entry:
102; CHECK:   %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
103; CHECK:   %1 = icmp eq i32 %0, 0
104; CHECK:   br i1 %1, label %2, label %3
105;
106; CHECK-LABEL: 2:
107; CHECK:   store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_3 to i16), i16 addrspace(3)* @lds_used_within_function_3.ptr, align 2
108; CHECK:   store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2
109; CHECK:   br label %3
110;
111; CHECK-LABEL: 3:
112; CHECK:   call void @llvm.amdgcn.wave.barrier()
113; CHECK:   %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8
114; CHECK:   %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8
115; CHECK:   call void %fptr2(i16 3)
116; CHECK:   call void %fptr3(i8 4)
117; CHECK:   ret void
118entry:
119  %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8
120  %fptr3 = load void (i8)*, void (i8)** @ptr_to_func3, align 8
121  call void %fptr2(i16 3)
122  call void %fptr3(i8 4)
123  ret void
124}
125
126; Pointer initialization code should be added
; This kernel loads two callees from @ptr_to_func1 and @ptr_to_func2 and calls them indirectly.
; Expected output, in order:
;   entry:   compare the result of llvm.amdgcn.mbcnt.lo(-1, 0) with 0 and branch on it;
;   block 2: (taken when the compare is true) store the i16 ptrtoint of
;            @lds_used_within_function_2 and of @lds_used_within_function_1 into their
;            respective .ptr globals, then fall into block 3;
;   block 3: llvm.amdgcn.wave.barrier, followed by the original kernel body unchanged.
; NOTE(review): no store for @lds_used_within_function_3 is listed, but the patterns do not
; forbid one either (there is no negative pattern) -- confirm whether that is intended.
127define protected amdgpu_kernel void @kernel_calls_function_1_and_2() {
128; CHECK-LABEL: entry:
129; CHECK:   %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
130; CHECK:   %1 = icmp eq i32 %0, 0
131; CHECK:   br i1 %1, label %2, label %3
132;
133; CHECK-LABEL: 2:
134; CHECK:   store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_2 to i16), i16 addrspace(3)* @lds_used_within_function_2.ptr, align 2
135; CHECK:   store i16 ptrtoint ([4 x i32] addrspace(3)* @lds_used_within_function_1 to i16), i16 addrspace(3)* @lds_used_within_function_1.ptr, align 2
136; CHECK:   br label %3
137;
138; CHECK-LABEL: 3:
139; CHECK:   call void @llvm.amdgcn.wave.barrier()
140; CHECK:   %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8
141; CHECK:   %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8
142; CHECK:   call void %fptr1(float 5.000000e+00)
143; CHECK:   call void %fptr2(i16 6)
144; CHECK:   ret void
145entry:
146  %fptr1 = load void (float)*, void (float)** @ptr_to_func1, align 8
147  %fptr2 = load void (i16)*, void (i16)** @ptr_to_func2, align 8
148  call void %fptr1(float 5.0)
149  call void %fptr2(i16 6)
150  ret void
151}
152