; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

; Scalar half multiply of two values loaded from global memory.
; The tahiti run expects both operands to be widened to f32, multiplied, and
; the product narrowed back to f16. The fiji and gfx900 runs expect a single
; v_mul_f16_e32. All runs then expect a 16-bit store of the result.
; The label pattern ends in a colon so it cannot match the longer
; fmul_f16_imm_* symbols that share this prefix.
; GCN-LABEL: {{^}}fmul_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmul_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  ; Both loads are volatile; the checks above expect the %a load first.
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fmul half %a.val, %b.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar half multiply with the constant 3.0 as the first fmul operand.
; The tahiti run expects the constant as the f32 literal 0x40400000 (3.0) in
; a v_mul_f32_e32; the fiji and gfx900 runs expect the f16 literal 0x4200
; (3.0) in a v_mul_f16_e32.
; The label pattern ends in a colon so it only matches this exact symbol.
; GCN-LABEL: {{^}}fmul_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_mul_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89: v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmul_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b) {
entry:
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fmul half 3.0, %b.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar half multiply with the constant 4.0 as the second fmul operand.
; In every expected multiply the constant is printed as the literal 4.0 and
; appears as the first source operand, ahead of the loaded value.
; The label pattern ends in a colon so it only matches this exact symbol.
; GCN-LABEL: {{^}}fmul_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_mul_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]

; GFX89: v_mul_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmul_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %r.val = fmul half %a.val, 4.0
  store half %r.val, half addrspace(1)* %r
  ret void
}

; <2 x half> multiply of two vectors loaded from global memory.
; tahiti run: each 16-bit lane is extracted, widened to f32, multiplied and
;   narrowed again, then the two halves are recombined with a shift and an or.
; fiji run: one v_mul_f16_e32 for the low half, one v_mul_f16_sdwa writing
;   WORD_1 for the high half, recombined with an or.
; gfx900 run: a single v_pk_mul_f16.
; The tahiti and fiji runs expect the %b load first; the gfx900 run expects
; the %a load first.
; GCN-LABEL: {{^}}fmul_v2f16:
; SIVI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SIVI: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG:  v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG:  v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]

; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmul_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fmul <2 x half> %a.val, %b.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <2 x half> multiply with the constant vector <3.0, 4.0> as the first operand.
; tahiti run: lane 0 is multiplied by the f32 literal 0x40400000 (3.0) and
;   lane 1 by 4.0, each after widening to f32.
; fiji run: 0x4400 (half 4.0) is first moved into a VGPR for the sdwa
;   multiply of the high half; the low half uses the literal 0x4200 (half 3.0).
; gfx900 run: both constants are expected packed in one SGPR as 0x44004200
;   (4.0 in the high 16 bits, 3.0 in the low 16 bits) feeding v_pk_mul_f16.
; GCN-LABEL: {{^}}fmul_v2f16_imm_a:
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG:  v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG:  v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]


; VI-DAG:  v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400
; VI-DAG:  v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG:  v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]

; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200
; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]

; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmul_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fmul <2 x half> <half 3.0, half 4.0>, %b.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <2 x half> multiply with the constant vector <4.0, 3.0> as the second
; operand; the lane constants are swapped relative to fmul_v2f16_imm_a.
; tahiti run: lane 0 is multiplied by 4.0 and lane 1 by the f32 literal
;   0x40400000 (3.0), each after widening to f32.
; fiji run: 0x4200 (half 3.0) is first moved into a VGPR for the sdwa
;   multiply of the high half; the low half is multiplied by the literal 4.0.
; gfx900 run: both constants are expected packed in one SGPR as 0x42004400
;   (3.0 in the high 16 bits, 4.0 in the low 16 bits) feeding v_pk_mul_f16.
; GCN-LABEL: {{^}}fmul_v2f16_imm_b:
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG:  v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG:  v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]

; VI-DAG:  v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200
; VI-DAG:  v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG:  v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]

; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]

; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmul_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = fmul <2 x half> %a.val, <half 4.0, half 3.0>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <4 x half> multiply of two vectors loaded from global memory.
; gfx900 run: two 64-bit loads, one v_pk_mul_f16 per 32-bit half, and a
;   64-bit store of the two products.
; fiji run: two 64-bit loads followed by two sdwa/e32 multiply pairs and two
;   or instructions; register operands are not pinned for this run.
; The tahiti run is only checked for the function label here.
; GCN-LABEL: {{^}}fmul_v4f16:
; GFX9: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
; GFX9: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}

; GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[MUL_HI]]{{\]}}

; VI: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; VI: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
; VI: v_mul_f16_sdwa
; VI: v_mul_f16_e32
; VI: v_mul_f16_sdwa
; VI: v_mul_f16_e32
; VI: v_or_b32
; VI: v_or_b32
define amdgpu_kernel void @fmul_v4f16(
    <4 x half> addrspace(1)* %r,
    <4 x half> addrspace(1)* %a,
    <4 x half> addrspace(1)* %b) {
entry:
  %a.val = load <4 x half>, <4 x half> addrspace(1)* %a
  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  %r.val = fmul <4 x half> %a.val, %b.val
  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
  ret void
}

; <4 x half> multiply with the constant vector <8.0, 2.0, 3.0, 4.0> as the
; first operand. The check variables are named A_* although the loaded IR
; value is %b.val.
; gfx900 run: the constants are expected packed pairwise into two SGPRs,
;   0x40004800 (2.0 high, 8.0 low) for the low dword and 0x44004200
;   (4.0 high, 3.0 low) for the high dword, each feeding a v_pk_mul_f16.
; fiji run: the lane multiplied by 2.0 is expected as a v_add_f16_sdwa of the
;   value with itself; the 8.0 and 3.0 lanes use the literals 0x4800 and
;   0x4200, and 0x4400 (4.0) is moved into a VGPR for the sdwa multiply.
; The tahiti run is only checked for the function label here.
; GCN-LABEL: {{^}}fmul_v4f16_imm_a:
; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200
; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800

; GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], [[K0]]
; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], [[K1]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[MUL_HI]]{{\]}}

; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400

; VI-DAG: v_mul_f16_sdwa v[[MUL_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_mul_f16_e32 v[[MUL_HI_LO:[0-9]+]], 0x4200, v[[A_HI]]
; VI-DAG: v_add_f16_sdwa v[[MUL_LO_HI:[0-9]+]], v[[A_LO]], v[[A_LO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mul_f16_e32 v[[MUL_LO_LO:[0-9]+]], 0x4800, v[[A_LO]]

; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MUL_LO_LO]], v[[MUL_LO_HI]]
; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MUL_HI_LO]], v[[MUL_HI_HI]]

; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}}
define amdgpu_kernel void @fmul_v4f16_imm_a(
    <4 x half> addrspace(1)* %r,
    <4 x half> addrspace(1)* %b) {
entry:
  %b.val = load <4 x half>, <4 x half> addrspace(1)* %b
  %r.val = fmul <4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, %b.val
  store <4 x half> %r.val, <4 x half> addrspace(1)* %r
  ret void
}
