; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s


; There is no dependence between the store and the two loads, so we can
; combine the loads and schedule them freely.

; GCN-LABEL: {{^}}ds_combine_nodep

; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8
; GCN: s_waitcnt lgkmcnt({{[0-9]+}})
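; Note: %load0 covers floats 6-8 but only element 2 (float 8, byte offset 32)
; is used; the scalar load reads float 7 (byte offset 28). The store writes
; floats 26-27 (byte offsets 104-108), overlapping neither load, so the two
; loads combine into read2 offset0:7 offset1:8.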
define amdgpu_kernel void @ds_combine_nodep(float addrspace(1)* %out, float addrspace(3)* %inptr) {

  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24
  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
  %v0 = extractelement <3 x float> %load0, i32 2

  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1

  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4

  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7
  %v1 = load float, float addrspace(3)* %vaddr1, align 4

  %sum = fadd float %v0, %v1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}


; The store depends on the first load (WAR), so we cannot move the first load
; down to combine it with the second load directly. We can, however, move the
; store below the combined load.

; GCN-LABEL: {{^}}ds_combine_WAR

; GCN:      ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27
; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
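; Note: %load0 covers floats 25-27 (byte offset 100 onward), and the store
; writes floats 26-27, so the read of float 27 must execute before the store.
; It combines with the independent read of float 7 into read2 offset0:7
; offset1:27, scheduled ahead of the write2.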
define amdgpu_kernel void @ds_combine_WAR(float addrspace(1)* %out, float addrspace(3)* %inptr) {

  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100
  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
  %v0 = extractelement <3 x float> %load0, i32 2

  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1

  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4

  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7
  %v1 = load float, float addrspace(3)* %vaddr1, align 4

  %sum = fadd float %v0, %v1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}


; The second load depends on the store (RAW), so the two loads can still be
; combined, with the combined load placed at the original position of the
; second load.

; GCN-LABEL: {{^}}ds_combine_RAW

; GCN:      ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:26
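; Note: the store writes floats 26-27 and %v1 reads float 26 back, so the
; combined read2 offset0:8 offset1:26 must remain after the write2. %v0
; (float 8) does not alias the store.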
define amdgpu_kernel void @ds_combine_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {

  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24
  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
  %v0 = extractelement <3 x float> %load0, i32 2

  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1

  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4

  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %v1 = load float, float addrspace(3)* %vaddr1, align 4

  %sum = fadd float %v0, %v1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}


; The store depends on the first load (WAR), and the second load depends on
; the store (RAW), so the two loads cannot be combined.

; GCN-LABEL: {{^}}ds_combine_WAR_RAW

; GCN:      ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:108
; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104
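; Note: %v0 reads float 27 (byte offset 108), which the store then overwrites
; (WAR), and %v1 re-reads float 26 (byte offset 104) after the store (RAW),
; so the two loads stay split around the write2.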
define amdgpu_kernel void @ds_combine_WAR_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {

  %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
  %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100
  %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
  %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
  %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
  %v0 = extractelement <3 x float> %load0, i32 2

  %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  %data = insertelement <2 x float> %tmp1, float 2.0, i32 1

  %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
  store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4

  %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26
  %v1 = load float, float addrspace(3)* %vaddr1, align 4

  %sum = fadd float %v0, %v1
  store float %sum, float addrspace(1)* %out, align 4
  ret void
}