; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s

; GCN-LABEL: {{^}}fsub_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_sub_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89:  v_sub_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
; Scalar half-precision subtraction kernel: *%r = *%a - *%b.
; The operand loads are volatile so codegen must emit two separate
; buffer_load_ushort instructions (matched by the CHECK lines above).
define amdgpu_kernel void @fsub_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fsub half %a.val, %b.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fsub_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_sub_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89:  v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
; *%r = 1.0 - *%b: constant left-hand operand, expected to fold into the
; subtract as an inline immediate (see CHECK lines above).
define amdgpu_kernel void @fsub_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b) {
entry:
  %b.val = load volatile half, half addrspace(1)* %b
  %r.val = fsub half 1.0, %b.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fsub_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_add_f32_e32 v[[R_F32:[0-9]+]], -2.0, v[[A_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; GFX89:  v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
; *%r = *%a - 2.0: constant right-hand operand, expected to be lowered as
; an add of -2.0 (see the v_add checks above).
define amdgpu_kernel void @fsub_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %r.val = fsub half %a.val, 2.0
  store half %r.val, half addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fsub_v2f16:
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]

; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG:  v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG:  v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]


; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]

; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm

; Vector <2 x half> subtraction: *%r = *%a - *%b. SI scalarizes through f32,
; VI uses f16 + SDWA for the high half, GFX9 uses packed v_pk_add_f16 with
; negated second operand (see the per-prefix CHECK lines above).
define amdgpu_kernel void @fsub_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fsub <2 x half> %a.val, %b.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fsub_v2f16_imm_a:
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]

; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] neg_lo:[1,0] neg_hi:[1,0]

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm

; *%r = <1.0, 2.0> - *%b: vector constant on the left-hand side.
; GFX9 materializes the pair as the packed constant 0x40003c00 and negates
; the variable operand (see CHECK lines above).
define amdgpu_kernel void @fsub_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = fsub <2 x half> <half 1.0, half 2.0>, %b.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; GCN-LABEL: {{^}}fsub_v2f16_imm_b:
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]{{$}}

; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm

; *%r = *%a - <2.0, 1.0>: vector constant on the right-hand side.
; Expected to be lowered as an add of the negated constants (packed
; 0xbc00c000 on GFX9 — see CHECK lines above).
define amdgpu_kernel void @fsub_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %r.val = fsub <2 x half> %a.val, <half 2.0, half 1.0>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}