1; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
2; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VIGFX9 %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VIGFX9 %s
4
5declare half @llvm.fma.f16(half %a, half %b, half %c)
6declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
7declare <4 x half> @llvm.fma.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
8
9; GCN-LABEL: {{^}}fma_f16
10; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
11; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
12; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
13; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
14; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
15; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI:  v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]], v[[C_F32]]
17; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
18; VIGFX9:  v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
19; GCN: buffer_store_short v[[R_F16]]
20; GCN: s_endpgm
; Scalar f16 fma: r = fma(a, b, c). Loads stay in a, b, c order so the
; buffer_load_ushort sequence checked above is unchanged.
define amdgpu_kernel void @fma_f16(half addrspace(1)* %r, half addrspace(1)* %a,
                                   half addrspace(1)* %b, half addrspace(1)* %c) {
  %lhs = load half, half addrspace(1)* %a
  %rhs = load half, half addrspace(1)* %b
  %acc = load half, half addrspace(1)* %c
  %res = call half @llvm.fma.f16(half %lhs, half %rhs, half %acc)
  store half %res, half addrspace(1)* %r
  ret void
}
33
34; GCN-LABEL: {{^}}fma_f16_imm_a
35; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
36; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
37
38; SI:  s_mov_b32 s[[A_F32:[0-9]+]], 0x40400000{{$}}
39; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
40; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI:  v_fma_f32 v[[R_F32:[0-9]+]], v[[B_F32]], s[[A_F32]], v[[C_F32]]
42; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
43; VIGFX9:  s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
44; VIGFX9:  v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], s[[A_F16]], v[[C_F16]]
45; GCN: buffer_store_short v[[R_F16]]
46; GCN: s_endpgm
; Scalar f16 fma with an immediate first operand: r = fma(3.0, b, c).
; Only b and c are loaded; 3.0 must be materialized by the backend.
define amdgpu_kernel void @fma_f16_imm_a(half addrspace(1)* %r, half addrspace(1)* %b,
                                         half addrspace(1)* %c) {
  %mul = load half, half addrspace(1)* %b
  %acc = load half, half addrspace(1)* %c
  %res = call half @llvm.fma.f16(half 3.0, half %mul, half %acc)
  store half %res, half addrspace(1)* %r
  ret void
}
57
58; GCN-LABEL: {{^}}fma_f16_imm_b
59; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
60; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
61; SI:  s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}}
62; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
63; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI:  v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32]], s[[B_F32]], v[[C_F32]]
65; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
66; VIGFX9:  s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
67; VIGFX9:  v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], s[[B_F16]], v[[C_F16]]
68; GCN: buffer_store_short v[[R_F16]]
69; GCN: s_endpgm
; Scalar f16 fma with an immediate second operand: r = fma(a, 3.0, c).
define amdgpu_kernel void @fma_f16_imm_b(half addrspace(1)* %r, half addrspace(1)* %a,
                                         half addrspace(1)* %c) {
  %mul = load half, half addrspace(1)* %a
  %acc = load half, half addrspace(1)* %c
  %res = call half @llvm.fma.f16(half %mul, half 3.0, half %acc)
  store half %res, half addrspace(1)* %r
  ret void
}
80
81; GCN-LABEL: {{^}}fma_f16_imm_c
82; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
83; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
84; SI:  s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}}
85; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
86; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]], s[[C_F32]]
88; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
89; VIGFX9:  s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
90; VIGFX9:  v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], s[[C_F16]]
91; GCN: buffer_store_short v[[R_F16]]
92; GCN: s_endpgm
; Scalar f16 fma with an immediate addend: r = fma(a, b, 3.0).
define amdgpu_kernel void @fma_f16_imm_c(half addrspace(1)* %r, half addrspace(1)* %a,
                                         half addrspace(1)* %b) {
  %lhs = load half, half addrspace(1)* %a
  %rhs = load half, half addrspace(1)* %b
  %res = call half @llvm.fma.f16(half %lhs, half %rhs, half 3.0)
  store half %res, half addrspace(1)* %r
  ret void
}
103
104; GCN-LABEL: {{^}}fma_v2f16
105; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
106; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
107; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
108
109; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
110; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
111; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
112; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
113; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
114; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
115; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
116
117; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
118; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
119
120
121; SI-DAG:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
122; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
123; SI-DAG:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
124; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
125
126; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
127; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
128; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
129; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
130; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
131
132; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
133
134; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
135; GCN-NOT: and
136; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
137; GCN: buffer_store_dword v[[R_V2_F16]]
138; GCN: s_endpgm
; <2 x half> fma: exercises v_pk_fma_f16 on GFX9 and per-lane scalarization
; (lshr/or repack) on SI/VI. Load order a, b, c matches the dword loads
; checked above.
define amdgpu_kernel void @fma_v2f16(<2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a,
                                     <2 x half> addrspace(1)* %b, <2 x half> addrspace(1)* %c) {
  %va = load <2 x half>, <2 x half> addrspace(1)* %a
  %vb = load <2 x half>, <2 x half> addrspace(1)* %b
  %vc = load <2 x half>, <2 x half> addrspace(1)* %c
  %vr = call <2 x half> @llvm.fma.v2f16(<2 x half> %va, <2 x half> %vb, <2 x half> %vc)
  store <2 x half> %vr, <2 x half> addrspace(1)* %r
  ret void
}
151
152; GCN-LABEL: {{^}}fma_v2f16_imm_a:
153; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
154; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
155
156
157; VIGFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]]
158; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
159
160
161; SI:  s_mov_b32 s[[A_F32:[0-9]+]], 0x40400000{{$}}
162; VIGFX9:  s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
163; SIVI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
164; SIVI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
165
166; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
167; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
168; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
169; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
170
171; SI:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], s[[A_F32]], v[[C_F32_1]]
172; SI-DAG:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], s[[A_F32]], v[[C_F32_0]]
173; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
174; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
175
176; VI-DAG:  v_fma_f16 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], s[[A_F16]], v[[C_F16_1]]
177; VI-DAG:  v_fma_f16 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], s[[A_F16]], v[[C_V2_F16]]
178
179; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]]
180
181; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
182; GCN-NOT: and
183; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
184; GCN: buffer_store_dword v[[R_V2_F16]]
185; GCN: s_endpgm
; <2 x half> fma with a splat-immediate first operand: r = fma(<3.0, 3.0>, b, c).
define amdgpu_kernel void @fma_v2f16_imm_a(<2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b,
                                           <2 x half> addrspace(1)* %c) {
  %vmul = load <2 x half>, <2 x half> addrspace(1)* %b
  %vacc = load <2 x half>, <2 x half> addrspace(1)* %c
  %vres = call <2 x half> @llvm.fma.v2f16(<2 x half> <half 3.0, half 3.0>, <2 x half> %vmul, <2 x half> %vacc)
  store <2 x half> %vres, <2 x half> addrspace(1)* %r
  ret void
}
196
197; GCN-LABEL: {{^}}fma_v2f16_imm_b:
198; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
199; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
200
201; VI:      buffer_load_dword v[[C_V2_F16:[0-9]+]]
202; VIGFX9:  buffer_load_dword v[[A_V2_F16:[0-9]+]]
203; GFX9:    buffer_load_dword v[[C_V2_F16:[0-9]+]]
204
205; SI:  s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}}
206; VIGFX9:  s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
207
208; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
209; SI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
210; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
211; SI-DAG:  v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
212
213; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
214; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
215; SI-DAG:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], s[[B_F32]], v[[C_F32_0]]
216; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
217; SI-DAG:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], s[[B_F32]], v[[C_F32_1]]
218; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
219
220; VI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
221; VI-DAG:  v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
222; VI-DAG:  v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], s[[B_F16]], v[[C_V2_F16]]
223; VI-DAG:  v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], s[[B_F16]], v[[C_F16_1]]
224
225; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], s[[B_F16]], v[[C_V2_F16]]
226
227; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
228; GCN-NOT: and
229; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
230; GCN: buffer_store_dword v[[R_V2_F16]]
231; GCN: s_endpgm
; <2 x half> fma with a splat-immediate second operand: r = fma(a, <3.0, 3.0>, c).
define amdgpu_kernel void @fma_v2f16_imm_b(<2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a,
                                           <2 x half> addrspace(1)* %c) {
  %vmul = load <2 x half>, <2 x half> addrspace(1)* %a
  %vacc = load <2 x half>, <2 x half> addrspace(1)* %c
  %vres = call <2 x half> @llvm.fma.v2f16(<2 x half> %vmul, <2 x half> <half 3.0, half 3.0>, <2 x half> %vacc)
  store <2 x half> %vres, <2 x half> addrspace(1)* %r
  ret void
}
242
243; GCN-LABEL: {{^}}fma_v2f16_imm_c:
244; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
245; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
246
247; GFX9:   buffer_load_dword v[[A_V2_F16:[0-9]+]]
248; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
249; VI:     buffer_load_dword v[[A_V2_F16:[0-9]+]]
250
251; SI:  s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}}
252; VIGFX9:  s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
253
254; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
255; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
256
257; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
258; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
259
260; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
261; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
262
263; SI:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], s[[C_F32]]
264; SI-DAG:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], s[[C_F32]]
265; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
266; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
267; SI:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
268; GCN-NOT: and
269; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
270
271; VI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
272; VI-DAG:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
273; VI-DAG:  v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], s[[C_F16]]
274; VI-DAG:  v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], s[[C_F16]]
275; GCN-NOT: and
276; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
277
278; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], s[[C_F16]]
279
280; GCN: buffer_store_dword v[[R_V2_F16]]
281; GCN: s_endpgm
; <2 x half> fma with a splat-immediate addend: r = fma(a, b, <3.0, 3.0>).
define amdgpu_kernel void @fma_v2f16_imm_c(<2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a,
                                           <2 x half> addrspace(1)* %b) {
  %vlhs = load <2 x half>, <2 x half> addrspace(1)* %a
  %vrhs = load <2 x half>, <2 x half> addrspace(1)* %b
  %vres = call <2 x half> @llvm.fma.v2f16(<2 x half> %vlhs, <2 x half> %vrhs, <2 x half> <half 3.0, half 3.0>)
  store <2 x half> %vres, <2 x half> addrspace(1)* %r
  ret void
}
292
293; GCN-LABEL: {{^}}fma_v4f16
294; GCN: buffer_load_dwordx2 v{{\[}}[[A_V4_F16_LO:[0-9]+]]:[[A_V4_F16_HI:[0-9]+]]{{\]}}
295; GCN: buffer_load_dwordx2 v{{\[}}[[B_V4_F16_LO:[0-9]+]]:[[B_V4_F16_HI:[0-9]+]]{{\]}}
296; GCN: buffer_load_dwordx2 v{{\[}}[[C_V4_F16_LO:[0-9]+]]:[[C_V4_F16_HI:[0-9]+]]{{\]}}
297
298; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V4_F16_LO]]
299; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_0:[0-9]+]], 16, v[[A_V4_F16_LO]]
300; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V4_F16_HI]]
301; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_2:[0-9]+]], 16, v[[A_V4_F16_HI]]
302; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V4_F16_LO]]
303; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V4_F16_LO]]
304; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_0:[0-9]+]], 16, v[[B_V4_F16_LO]]
305; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_0:[0-9]+]], 16, v[[C_V4_F16_LO]]
306; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_V4_F16_HI]]
307; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_V4_F16_HI]]
308; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V4_F16_HI]]
309; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V4_F16_HI]]
310; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_2:[0-9]+]], v[[A_V4_F16_LO]]
311; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_3:[0-9]+]], v[[A_V4_F16_HI]]
312; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_2:[0-9]+]], v[[B_V4_F16_LO]]
313; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_3:[0-9]+]], v[[B_V4_F16_HI]]
314; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_2:[0-9]+]], v[[C_V4_F16_LO]]
315; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_3:[0-9]+]], v[[C_V4_F16_HI]]
316
317; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
318; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
319; SI-DAG: v_fma_f32 v[[R_F32_2:[0-9]+]], v[[A_F32_2]], v[[B_F32_2]], v[[C_F32_2]]
320; SI-DAG: v_fma_f32 v[[R_F32_3:[0-9]+]], v[[A_F32_3]], v[[B_F32_3]], v[[C_F32_3]]
321
322; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
323; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
324; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_2:[0-9]+]], v[[R_F32_2]]
325; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_3:[0-9]+]], v[[R_F32_3]]
326
; SI-DAG: v_lshlrev_b32_e32 v[[R1_F16_0:[0-9]+]], 16, v[[R_F16_2]]
; SI-DAG: v_lshlrev_b32_e32 v[[R1_F16_1:[0-9]+]], 16, v[[R_F16_3]]
329
330; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_0:[0-9]+]], 16, v[[A_V4_F16_LO]]
331; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V4_F16_HI]]
332; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_0:[0-9]+]], 16, v[[B_V4_F16_LO]]
333; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V4_F16_HI]]
334; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_0:[0-9]+]], 16, v[[C_V4_F16_LO]]
335; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V4_F16_HI]]
336
337; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V4_F16_LO]], v[[B_V4_F16_LO]], v[[C_V4_F16_LO]]
338; VI-DAG: v_fma_f16 v[[R1_F16_0:[0-9]+]], v[[A_F16_0]], v[[B_F16_0]], v[[C_F16_0]]
339; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_V4_F16_HI]], v[[B_V4_F16_HI]], v[[C_V4_F16_HI]]
340; VI-DAG: v_fma_f16 v[[R1_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
341
342; SIVI-DAG: v_or_b32_e32 v[[R_V4_F16_LO:[0-9]+]], v[[R_F16_0]], v[[R1_F16_0]]
343; SIVI-DAG: v_or_b32_e32 v[[R_V4_F16_HI:[0-9]+]], v[[R_F16_1]], v[[R1_F16_1]]
344
345; GFX9-DAG: v_pk_fma_f16 v[[R_V4_F16_LO:[0-9]+]], v[[A_V4_F16_LO]], v[[B_V4_F16_LO]], v[[C_V4_F16_LO]]
346; GFX9-DAG: v_pk_fma_f16 v[[R_V4_F16_HI:[0-9]+]], v[[A_V4_F16_HI]], v[[B_V4_F16_HI]], v[[C_V4_F16_HI]]
347
348; GCN: buffer_store_dwordx2 v{{\[}}[[R_V4_F16_LO]]:[[R_V4_F16_HI]]{{\]}}
349; GCN: s_endpgm
350
; <4 x half> fma: split into two dwords, so GFX9 emits two v_pk_fma_f16 and
; SI/VI scalarize all four lanes. Load order a, b, c matches the dwordx2
; loads checked above.
define amdgpu_kernel void @fma_v4f16(<4 x half> addrspace(1)* %r, <4 x half> addrspace(1)* %a,
                                     <4 x half> addrspace(1)* %b, <4 x half> addrspace(1)* %c) {
  %vlhs = load <4 x half>, <4 x half> addrspace(1)* %a
  %vrhs = load <4 x half>, <4 x half> addrspace(1)* %b
  %vacc = load <4 x half>, <4 x half> addrspace(1)* %c
  %vres = call <4 x half> @llvm.fma.v4f16(<4 x half> %vlhs, <4 x half> %vrhs, <4 x half> %vacc)
  store <4 x half> %vres, <4 x half> addrspace(1)* %r
  ret void
}
363