1 /*
2 * Copyright © 2020 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24 #include "helpers.h"
25
26 using namespace aco;
27
/* Checks that fneg()/fabs() helpers are folded into VOP source modifiers by
 * the optimizer, including interaction with DPP, literals and add->sub
 * rewrites. The specially-prefixed comment lines below are match patterns
 * consumed by the test harness, not ordinary comments — do not edit them. */
BEGIN_TEST(optimize.neg)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm
      if (!setup_cs("v1 v1 s1 s1", (chip_class)i))
         continue;

      /* the fneg becomes a source modifier on the multiply */
      //! v1: %res0 = v_mul_f32 %a, -%b
      //! p_unit_test 0, %res0
      Temp neg_b = fneg(inputs[1]);
      writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_b));

      /* with a literal operand the fold only happens on GFX10 (see the
       * per-gfx patterns); GFX9 keeps the fneg as a separate multiply */
      //~gfx9! v1: %neg_a = v_mul_f32 -1.0, %a
      //~gfx9! v1: %res1 = v_mul_f32 0x123456, %neg_a
      //~gfx10! v1: %res1 = v_mul_f32 0x123456, -%a
      //! p_unit_test 1, %res1
      Temp neg_a = fneg(inputs[0]);
      writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x123456u), neg_a));

      /* double negation cancels */
      //! v1: %res2 = v_mul_f32 %a, %b
      //! p_unit_test 2, %res2
      Temp neg_neg_a = fneg(neg_a);
      writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_neg_a, inputs[1]));

      /* abs of a negated value: only |a| remains */
      //! v1: %res3 = v_mul_f32 |%a|, %b
      //! p_unit_test 3, %res3
      Temp abs_neg_a = fabs(neg_a);
      writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_a, inputs[1]));

      /* neg of abs keeps both modifiers, in -|a| order */
      //! v1: %res4 = v_mul_f32 -|%a|, %b
      //! p_unit_test 4, %res4
      Temp abs_a = fabs(inputs[0]);
      Temp neg_abs_a = fneg(abs_a);
      writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));

      /* the modifier also folds into a DPP-encoded multiply */
      //! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
      //! p_unit_test 5, %res5
      writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));

      /* (-a) + b is rewritten as a reverse subtract */
      //! v1: %res6 = v_subrev_f32 %a, %b
      //! p_unit_test 6, %res6
      writeout(6, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), neg_a, inputs[1]));

      /* b + (-a) is rewritten as a subtract */
      //! v1: %res7 = v_sub_f32 %b, %a
      //! p_unit_test 7, %res7
      writeout(7, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[1], neg_a));

      /* fneg applied to a VGPR copy of an SGPR still folds */
      //! v1: %res8 = v_mul_f32 %a, -%c
      //! p_unit_test 8, %res8
      Temp neg_c = fneg(bld.copy(bld.def(v1), inputs[2]));
      writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_c));

      /* abs(-|a|) case: the pattern checks are deliberately commented out
       * (disabled), but the IR is still built and run through the optimizer */
      // //! v1: %res9 = v_mul_f32 |%neg_a|, %b
      // //! p_unit_test 9, %res9
      Temp abs_neg_abs_a = fabs(neg_abs_a);
      writeout(9, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_abs_a, inputs[1]));

      finish_opt_test();
   }
END_TEST
87
88 BEGIN_TEST(optimize.output_modifiers)
89 //>> v1: %a, v1: %b = p_startpgm
90 if (!setup_cs("v1 v1", GFX9))
91 return;
92
93 program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
94
95 /* 32-bit modifiers */
96
97 //! v1: %res0 = v_add_f32 %a, %b *0.5
98 //! p_unit_test 0, %res0
99 Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
100 writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f000000u), tmp));
101
102 //! v1: %res1 = v_add_f32 %a, %b *2
103 //! p_unit_test 1, %res1
104 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
105 writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
106
107 //! v1: %res2 = v_add_f32 %a, %b *4
108 //! p_unit_test 2, %res2
109 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
110 writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40800000u), tmp));
111
112 //! v1: %res3 = v_add_f32 %a, %b clamp
113 //! p_unit_test 3, %res3
114 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
115 writeout(3, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
116 Operand::c32(0x3f800000u), tmp));
117
118 //! v1: %res4 = v_add_f32 %a, %b *2 clamp
119 //! p_unit_test 4, %res4
120 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
121 tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
122 writeout(4, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
123 Operand::c32(0x3f800000u), tmp));
124
125 /* 16-bit modifiers */
126
127 //! v2b: %res5 = v_add_f16 %a, %b *0.5
128 //! p_unit_test 5, %res5
129 tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
130 writeout(5, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x3800u), tmp));
131
132 //! v2b: %res6 = v_add_f16 %a, %b *2
133 //! p_unit_test 6, %res6
134 tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
135 writeout(6, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
136
137 //! v2b: %res7 = v_add_f16 %a, %b *4
138 //! p_unit_test 7, %res7
139 tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
140 writeout(7, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4400u), tmp));
141
142 //! v2b: %res8 = v_add_f16 %a, %b clamp
143 //! p_unit_test 8, %res8
144 tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
145 writeout(8, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
146 Operand::c16(0x3c00u), tmp));
147
148 //! v2b: %res9 = v_add_f16 %a, %b *2 clamp
149 //! p_unit_test 9, %res9
150 tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
151 tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000), tmp);
152 writeout(9, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
153 Operand::c16(0x3c00u), tmp));
154
155 /* clamping is done after omod */
156
157 //! v1: %res10_tmp = v_add_f32 %a, %b clamp
158 //! v1: %res10 = v_mul_f32 2.0, %res10_tmp
159 //! p_unit_test 10, %res10
160 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
161 tmp = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), Operand::c32(0x3f800000u),
162 tmp);
163 writeout(10, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
164
165 /* unsupported instructions */
166
167 //! v1: %res11_tmp = v_xor_b32 %a, %b
168 //! v1: %res11 = v_mul_f32 2.0, %res11_tmp
169 //! p_unit_test 11, %res11
170 tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], inputs[1]);
171 writeout(11, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
172
173 /* several users */
174
175 //! v1: %res12_tmp = v_add_f32 %a, %b
176 //! p_unit_test %res12_tmp
177 //! v1: %res12 = v_mul_f32 2.0, %res12_tmp
178 //! p_unit_test 12, %res12
179 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
180 bld.pseudo(aco_opcode::p_unit_test, tmp);
181 writeout(12, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
182
183 //! v1: %res13 = v_add_f32 %a, %b
184 //! p_unit_test 13, %res13
185 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
186 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
187 writeout(13, tmp);
188
189 /* omod has no effect if denormals are enabled but clamp is fine */
190
191 //>> BB1
192 //! /* logical preds: / linear preds: / kind: uniform, */
193 program->next_fp_mode.denorm32 = fp_denorm_keep;
194 program->next_fp_mode.denorm16_64 = fp_denorm_flush;
195 bld.reset(program->create_and_insert_block());
196
197 //! v1: %res14_tmp = v_add_f32 %a, %b
198 //! v1: %res14 = v_mul_f32 2.0, %res13_tmp
199 //! p_unit_test 14, %res14
200 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
201 writeout(14, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
202
203 //! v1: %res15 = v_add_f32 %a, %b clamp
204 //! p_unit_test 15, %res15
205 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
206 writeout(15, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
207 Operand::c32(0x3f800000u), tmp));
208
209 //>> BB2
210 //! /* logical preds: / linear preds: / kind: uniform, */
211 program->next_fp_mode.denorm32 = fp_denorm_flush;
212 program->next_fp_mode.denorm16_64 = fp_denorm_keep;
213 bld.reset(program->create_and_insert_block());
214
215 //! v2b: %res16_tmp = v_add_f16 %a, %b
216 //! v2b: %res16 = v_mul_f16 2.0, %res15_tmp
217 //! p_unit_test 16, %res16
218 tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
219 writeout(16, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
220
221 //! v2b: %res17 = v_add_f16 %a, %b clamp
222 //! p_unit_test 17, %res17
223 tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
224 writeout(17, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
225 Operand::c16(0x3c00u), tmp));
226
227 /* omod flushes -0.0 to +0.0 */
228
229 //>> BB3
230 //! /* logical preds: / linear preds: / kind: uniform, */
231 program->next_fp_mode.denorm32 = fp_denorm_keep;
232 program->next_fp_mode.denorm16_64 = fp_denorm_keep;
233 program->next_fp_mode.preserve_signed_zero_inf_nan32 = true;
234 program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
235 bld.reset(program->create_and_insert_block());
236
237 //! v1: %res18_tmp = v_add_f32 %a, %b
238 //! v1: %res18 = v_mul_f32 2.0, %res18_tmp
239 //! p_unit_test 18, %res18
240 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
241 writeout(18, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
242 //! v1: %res19 = v_add_f32 %a, %b clamp
243 //! p_unit_test 19, %res19
244 tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
245 writeout(19, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
246 Operand::c32(0x3f800000u), tmp));
247
248 //>> BB4
249 //! /* logical preds: / linear preds: / kind: uniform, */
250 program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
251 program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = true;
252 bld.reset(program->create_and_insert_block());
253 //! v2b: %res20_tmp = v_add_f16 %a, %b
254 //! v2b: %res20 = v_mul_f16 2.0, %res20_tmp
255 //! p_unit_test 20, %res20
256 tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
257 writeout(20, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
258 //! v2b: %res21 = v_add_f16 %a, %b clamp
259 //! p_unit_test 21, %res21
260 tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
261 writeout(21, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
262 Operand::c16(0x3c00u), tmp));
263
264 finish_opt_test();
265 END_TEST
266
create_subbrev_co(Operand op0,Operand op1,Operand op2)267 Temp create_subbrev_co(Operand op0, Operand op1, Operand op2)
268 {
269 return bld.vop2_e64(aco_opcode::v_subbrev_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), op0, op1, op2);
270 }
271
/* Checks folding of v_and_b32 with an all-zeros/all-ones per-lane mask
 * (produced by v_subbrev_co_u32 0, 0, mask) into v_cndmask_b32. The
 * specially-prefixed comment lines are match patterns consumed by the
 * test harness. */
BEGIN_TEST(optimize.cndmask)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, s1: %b, s2: %c = p_startpgm
      if (!setup_cs("v1 s1 s2", (chip_class)i))
         continue;

      Temp subbrev;

      /* and(a, subbrev(0,0,c)) becomes a per-lane select on c */
      //! v1: %res0 = v_cndmask_b32 0, %a, %c
      //! p_unit_test 0, %res0
      subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
      writeout(0, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[0], subbrev));

      /* also works with an inline constant as the other AND operand */
      //! v1: %res1 = v_cndmask_b32 0, 42, %c
      //! p_unit_test 1, %res1
      subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
      writeout(1, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(42u), subbrev));

      /* SGPR operand %b: the fold only happens on GFX10 (see per-gfx patterns) */
      //~gfx9! v1: %subbrev, s2: %_ = v_subbrev_co_u32 0, 0, %c
      //~gfx9! v1: %res2 = v_and_b32 %b, %subbrev
      //~gfx10! v1: %res2 = v_cndmask_b32 0, %b, %c
      //! p_unit_test 2, %res2
      subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
      writeout(2, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[1], subbrev));

      /* the subbrev has a second user (the xor), so it must survive while
       * the AND is still turned into a cndmask */
      //! v1: %subbrev1, s2: %_ = v_subbrev_co_u32 0, 0, %c
      //! v1: %xor = v_xor_b32 %a, %subbrev1
      //! v1: %res3 = v_cndmask_b32 0, %xor, %c
      //! p_unit_test 3, %res3
      subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
      Temp xor_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], subbrev);
      writeout(3, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), xor_a, subbrev));

      /* 0 - (c ? 1 : 0) produces the same 0/0xffffffff mask and is folded too */
      //! v1: %res4 = v_cndmask_b32 0, %a, %c
      //! p_unit_test 4, %res4
      Temp cndmask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
                                  Operand::c32(1u), Operand(inputs[2]));
      Temp sub = bld.vsub32(bld.def(v1), Operand::zero(), cndmask);
      writeout(4, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(inputs[0]), sub));

      finish_opt_test();
   }
END_TEST
315
/* Checks combining of shift+add sequences: s_lshl_b32 + s_add_u32 into
 * s_lshlN_add_u32 (GFX9+), and scalar shifts feeding VALU adds into
 * v_lshl_add_u32 (GFX9+) or v_mad_u32_u24 (GFX8, when operands are known
 * 24/16-bit). The specially-prefixed comment lines are match patterns
 * consumed by the test harness. */
BEGIN_TEST(optimize.add_lshl)
   for (unsigned i = GFX8; i <= GFX10; i++) {
      //>> s1: %a, v1: %b = p_startpgm
      if (!setup_cs("s1 v1", (chip_class)i))
         continue;

      Temp shift;

      /* scalar shift-by-3 + add fuses into s_lshl3_add_u32 on GFX9+ */
      //~gfx8! s1: %lshl0, s1: %_:scc = s_lshl_b32 %a, 3
      //~gfx8! s1: %res0, s1: %_:scc = s_add_u32 %lshl0, 4
      //~gfx(9|10)! s1: %res0, s1: %_:scc = s_lshl3_add_u32 %a, 4
      //! p_unit_test 0, %res0
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
                       Operand::c32(3u));
      writeout(0, bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift,
                           Operand::c32(4u)));

      /* the shift has several users: each add still combines independently */
      //~gfx8! s1: %lshl1, s1: %_:scc = s_lshl_b32 %a, 3
      //~gfx8! s1: %add1, s1: %_:scc = s_add_u32 %lshl1, 4
      //~gfx8! v1: %add_co1, s2: %_ = v_add_co_u32 %lshl1, %b
      //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %add1, %add_co1
      //~gfx(9|10)! s1: %lshl1, s1: %_:scc = s_lshl3_add_u32 %a, 4
      //~gfx(9|10)! v1: %lshl_add = v_lshl_add_u32 %a, 3, %b
      //~gfx(9|10)! v1: %res1 = v_add_u32 %lshl1, %lshl_add
      //! p_unit_test 1, %res1
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
                       Operand::c32(3u));
      Temp sadd =
         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift, Operand::c32(4u));
      Temp vadd = bld.vadd32(bld.def(v1), shift, Operand(inputs[1]));
      writeout(1, bld.vadd32(bld.def(v1), sadd, vadd));

      /* scalar shift feeding a VALU add becomes v_lshl_add_u32 on GFX9+ */
      //~gfx8! s1: %lshl2 = s_lshl_b32 %a, 3
      //~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
      //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 %a, 3, %b
      //! p_unit_test 2, %res2
      Temp lshl =
         bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), Operand(inputs[0]), Operand::c32(3u));
      writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* shift amount 7 exceeds the mad_u32_u24 range on GFX8 (24-bit input
       * shifted by 7 can overflow 32 bits only via the add), so GFX8 keeps
       * the plain sequence here — see per-gfx patterns */
      //~gfx8! s1: %lshl3 = s_lshl_b32 (is24bit)%a, 7
      //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %lshl3, %b
      //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 7, %b
      //! p_unit_test 3, %res3
      Operand a_24bit = Operand(inputs[0]);
      a_24bit.set24bit(true);
      lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(7u));
      writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* the carry-out is used, so no combining may happen */
      //! s1: %lshl4 = s_lshl_b32 (is24bit)%a, 3
      //~gfx(8|9)! v1: %res4, s2: %carry = v_add_co_u32 %lshl4, %b
      //~gfx10! v1: %res4, s2: %carry = v_add_co_u32_e64 %lshl4, %b
      //! p_unit_test 4, %carry
      lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(3u));
      Temp carry = bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]), true).def(1).getTemp();
      writeout(4, carry);

      /* non-constant (but 24-bit) shift amount */
      //~gfx8! s1: %lshl5 = s_lshl_b32 (is24bit)%a, (is24bit)%a
      //~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %lshl5, %b
      //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%a, (is24bit)%a, %b
      //! p_unit_test 5, %res5
      lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, a_24bit);
      writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* 24-bit source shifted by 3: GFX8 can express it as a*8 + b via
       * v_mad_u32_u24 */
      //~gfx8! v1: %res6 = v_mad_u32_u24 (is24bit)%a, 8, %b
      //~gfx(9|10)! v1: %res6 = v_lshl_add_u32 (is24bit)%a, 3, %b
      //! p_unit_test 6, %res6
      lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(3u));
      writeout(6, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* 16-bit source shifted by 4: a*16 + b fits the u24 multiply too */
      //~gfx8! v1: %res7 = v_mad_u32_u24 (is16bit)%a, 16, %b
      //~gfx(9|10)! v1: %res7 = v_lshl_add_u32 (is16bit)%a, 4, %b
      //! p_unit_test 7, %res7
      Operand a_16bit = Operand(inputs[0]);
      a_16bit.set16bit(true);
      lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_16bit, Operand::c32(4u));
      writeout(7, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      finish_opt_test();
   }
END_TEST
397
398 BEGIN_TEST(optimize.bcnt)
399 for (unsigned i = GFX8; i <= GFX10; i++) {
400 //>> v1: %a, s1: %b = p_startpgm
401 if (!setup_cs("v1 s1", (chip_class)i))
402 continue;
403
404 Temp bcnt;
405
406 //! v1: %res0 = v_bcnt_u32_b32 %a, %a
407 //! p_unit_test 0, %res0
408 bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
409 writeout(0, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
410
411 //! v1: %res1 = v_bcnt_u32_b32 %a, %b
412 //! p_unit_test 1, %res1
413 bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
414 writeout(1, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[1])));
415
416 //! v1: %res2 = v_bcnt_u32_b32 %a, 42
417 //! p_unit_test 2, %res2
418 bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
419 writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand::c32(42u)));
420
421 //! v1: %bnct3 = v_bcnt_u32_b32 %b, 0
422 //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a
423 //~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a
424 //! p_unit_test 3, %res3
425 bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand::zero());
426 writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
427
428 //! v1: %bnct4 = v_bcnt_u32_b32 %a, 0
429 //~gfx(8|9)! v1: %add4, s2: %carry = v_add_co_u32 %bcnt4, %a
430 //~gfx10! v1: %add4, s2: %carry = v_add_co_u32_e64 %bcnt4, %a
431 //! p_unit_test 4, %carry
432 bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
433 Temp carry = bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]), true).def(1).getTemp();
434 writeout(4, carry);
435
436 finish_opt_test();
437 }
438 END_TEST
439
/* One min/max opcode family exercised by optimize.clamp, together with the
 * lower/upper bound constants for that type. Field order matters: the
 * clamp_configs[] table below initializes these positionally. */
struct clamp_config {
   const char *name;        /* test-variant suffix passed to setup_cs() */
   aco_opcode min, max, med3;
   Operand lb, ub;          /* lower and upper clamp bounds */
};
445
/* Bound pairs per opcode family (f32/f16/u32/u16/i32/i16); the block
 * comments give the numeric range encoded by each lb/ub constant pair. */
static const clamp_config clamp_configs[] = {
   /* 0.0, 4.0 */
   {"_0,4f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
    Operand::zero(), Operand::c32(0x40800000u)},
   {"_0,4f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
    Operand::c16(0u), Operand::c16(0x4400)},
   /* -1.0, 0.0 */
   {"_-1,0f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
    Operand::c32(0xbf800000u), Operand::zero()},
   {"_-1,0f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
    Operand::c16(0xBC00), Operand::c16(0u)},
   /* 0, 3 */
   {"_0,3u32", aco_opcode::v_min_u32, aco_opcode::v_max_u32, aco_opcode::v_med3_u32,
    Operand::zero(), Operand::c32(3u)},
   {"_0,3u16", aco_opcode::v_min_u16, aco_opcode::v_max_u16, aco_opcode::v_med3_u16,
    Operand::c16(0u), Operand::c16(3u)},
   {"_0,3i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
    Operand::zero(), Operand::c32(3u)},
   {"_0,3i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
    Operand::c16(0u), Operand::c16(3u)},
   /* -5, 0 */
   {"_-5,0i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
    Operand::c32(0xfffffffbu), Operand::zero()},
   {"_-5,0i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
    Operand::c16(0xfffbu), Operand::c16(0u)},
};
472
/* Checks that min(ub, max(lb, x)) / max(lb, min(ub, x)) pairs are combined
 * into a single med3, for each opcode family in clamp_configs. The
 * specially-prefixed comment lines are match patterns consumed by the
 * test harness; @min/@max/@med3/@lb/@ub are bound from the "cfg:" line
 * printed at the top of each variant. */
BEGIN_TEST(optimize.clamp)
   for (clamp_config cfg : clamp_configs) {
      if (!setup_cs("v1 v1 v1", GFX9, CHIP_UNKNOWN, cfg.name))
         continue;

      /* emit the per-config opcode names and bounds so the patterns below
       * can reference them symbolically */
      //! cfg: @match_func(min max med3 lb ub)
      fprintf(output, "cfg: %s ", instr_info.name[(int)cfg.min]);
      fprintf(output, "%s ", instr_info.name[(int)cfg.max]);
      fprintf(output, "%s ", instr_info.name[(int)cfg.med3]);
      aco_print_operand(&cfg.lb, output);
      fprintf(output, " ");
      aco_print_operand(&cfg.ub, output);
      fprintf(output, "\n");

      //>> v1: %a, v1: %b, v1: %c = p_startpgm

      /* min-of-max combines into med3 */
      //! v1: %res0 = @med3 @ub, @lb, %a
      //! p_unit_test 0, %res0
      writeout(0, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
                           bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));

      /* max-of-min combines too */
      //! v1: %res1 = @med3 @lb, @ub, %a
      //! p_unit_test 1, %res1
      writeout(1, bld.vop2(cfg.max, bld.def(v1), cfg.lb,
                           bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0])));

      /* min constant must be greater than max constant */
      //! v1: %res2_tmp = @min @lb, %a
      //! v1: %res2 = @max @ub, %res2_tmp
      //! p_unit_test 2, %res2
      writeout(2, bld.vop2(cfg.max, bld.def(v1), cfg.ub,
                           bld.vop2(cfg.min, bld.def(v1), cfg.lb, inputs[0])));

      //! v1: %res3_tmp = @max @ub, %a
      //! v1: %res3 = @min @lb, %res3_tmp
      //! p_unit_test 3, %res3
      writeout(3, bld.vop2(cfg.min, bld.def(v1), cfg.lb,
                           bld.vop2(cfg.max, bld.def(v1), cfg.ub, inputs[0])));

      /* needs two constants */

      //! v1: %res4_tmp = @max @lb, %a
      //! v1: %res4 = @min %b, %res4_tmp
      //! p_unit_test 4, %res4
      writeout(4, bld.vop2(cfg.min, bld.def(v1), inputs[1],
                           bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));

      //! v1: %res5_tmp = @max %b, %a
      //! v1: %res5 = @min @ub, %res5_tmp
      //! p_unit_test 5, %res5
      writeout(5, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
                           bld.vop2(cfg.max, bld.def(v1), inputs[1], inputs[0])));

      //! v1: %res6_tmp = @max %c, %a
      //! v1: %res6 = @min %b, %res6_tmp
      //! p_unit_test 6, %res6
      writeout(6, bld.vop2(cfg.min, bld.def(v1), inputs[1],
                           bld.vop2(cfg.max, bld.def(v1), inputs[2], inputs[0])));

      /* correct NaN behaviour with precise */

      //! v1: %res7 = @med3 @ub, @lb, %a
      //! p_unit_test 7, %res7
      Builder::Result max = bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0]);
      max.def(0).setPrecise(true);
      Builder::Result min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, max);
      /* NOTE(review): this sets precise on `max` a second time (a no-op);
       * it looks like a copy-paste slip for `min.def(0).setPrecise(true);`
       * — confirm against the intended precise semantics and the printed
       * output before changing, since the pattern above has no (precise)
       * marker on %res7. */
      max.def(0).setPrecise(true);
      writeout(7, min);

      /* a precise min alone must NOT be combined */
      //! v1: (precise)%res8_tmp = @min @ub, %a
      //! v1: %res8 = @max @lb, %res8_tmp
      //! p_unit_test 8, %res8
      min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0]);
      min.def(0).setPrecise(true);
      writeout(8, bld.vop2(cfg.max, bld.def(v1), cfg.lb, min));

      finish_opt_test();
   }
END_TEST
552
/* Checks rewriting of (isnan(a) || cmp) into an unordered comparison and
 * (!isnan(a) && cmp) into an ordered one, for f16/f32/f64, and that the
 * rewrite is refused when operands differ or a constant could be NaN.
 * The specially-prefixed comment lines are match patterns consumed by the
 * test harness. */
BEGIN_TEST(optimize.const_comparison_ordering)
   //>> v1: %a, v1: %b, v2: %c, v1: %d = p_startpgm
   if (!setup_cs("v1 v1 v2 v1", GFX9))
      return;

   /* optimize to unordered comparison */
   //! s2: %res0 = v_cmp_nge_f32 4.0, %a
   //! p_unit_test 0, %res0
   writeout(0, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
                        bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
                        bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
                                 Operand::c32(0x40800000u), inputs[0])));

   /* an already-unordered comparison stays unordered */
   //! s2: %res1 = v_cmp_nge_f32 4.0, %a
   //! p_unit_test 1, %res1
   writeout(1, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
                        bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
                        bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
                                 Operand::c32(0x40800000u), inputs[0])));

   /* also works when the constant arrives through a VGPR copy */
   //! s2: %res2 = v_cmp_nge_f32 0x40a00000, %a
   //! p_unit_test 2, %res2
   writeout(2, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
                        bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
                        bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
                                 bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));

   /* optimize to ordered comparison */
   //! s2: %res3 = v_cmp_lt_f32 4.0, %a
   //! p_unit_test 3, %res3
   writeout(3, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
                        bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
                        bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
                                 Operand::c32(0x40800000u), inputs[0])));

   //! s2: %res4 = v_cmp_lt_f32 4.0, %a
   //! p_unit_test 4, %res4
   writeout(4, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
                        bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
                        bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
                                 Operand::c32(0x40800000u), inputs[0])));

   //! s2: %res5 = v_cmp_lt_f32 0x40a00000, %a
   //! p_unit_test 5, %res5
   writeout(5, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
                        bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
                        bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
                                 bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));

   /* similar but unoptimizable expressions */
   /* AND of NaN-check with ordered compare: wrong combination, keep as-is */
   //! s2: %tmp6_0 = v_cmp_lt_f32 4.0, %a
   //! s2: %tmp6_1 = v_cmp_neq_f32 %a, %a
   //! s2: %res6, s1: %_:scc = s_and_b64 %tmp6_1, %tmp6_0
   //! p_unit_test 6, %res6
   Temp src1 =
      bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
   Temp src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
   writeout(6, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));

   /* OR of not-NaN-check with unordered compare: also kept as-is */
   //! s2: %tmp7_0 = v_cmp_nge_f32 4.0, %a
   //! s2: %tmp7_1 = v_cmp_eq_f32 %a, %a
   //! s2: %res7, s1: %_:scc = s_or_b64 %tmp7_1, %tmp7_0
   //! p_unit_test 7, %res7
   src1 =
      bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
   src0 = bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
   writeout(7, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));

   /* the NaN check and the comparison use different variables (%a vs %d) */
   //! s2: %tmp8_0 = v_cmp_lt_f32 4.0, %d
   //! s2: %tmp8_1 = v_cmp_neq_f32 %a, %a
   //! s2: %res8, s1: %_:scc = s_or_b64 %tmp8_1, %tmp8_0
   //! p_unit_test 8, %res8
   src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[3]);
   src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
   writeout(8, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));

   /* neq with two different operands is not an isnan() check */
   //! s2: %tmp9_0 = v_cmp_lt_f32 4.0, %a
   //! s2: %tmp9_1 = v_cmp_neq_f32 %a, %d
   //! s2: %res9, s1: %_:scc = s_or_b64 %tmp9_1, %tmp9_0
   //! p_unit_test 9, %res9
   src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
   src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[3]);
   writeout(9, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));

   /* bit sizes */
   //! s2: %res10 = v_cmp_nge_f16 4.0, %b
   //! p_unit_test 10, %res10
   Temp input1_16 =
      bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), inputs[1], Operand::zero());
   writeout(10, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
                         bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), input1_16, input1_16),
                         bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(0x4400u),
                                  input1_16)));

   //! s2: %res11 = v_cmp_nge_f64 4.0, %c
   //! p_unit_test 11, %res11
   writeout(11, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
                         bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[2], inputs[2]),
                         bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm),
                                  Operand::c64(0x4010000000000000u), inputs[2])));

   /* NaN */
   /* a NaN constant makes the rewrite invalid — the pair must stay */
   uint16_t nan16 = 0x7e00;
   uint32_t nan32 = 0x7fc00000;
   uint64_t nan64 = 0xffffffffffffffffllu;

   //! s2: %tmp12_0 = v_cmp_lt_f16 0x7e00, %a
   //! s2: %tmp12_1 = v_cmp_neq_f16 %a, %a
   //! s2: %res12, s1: %_:scc = s_or_b64 %tmp12_1, %tmp12_0
   //! p_unit_test 12, %res12
   src1 = bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(nan16), inputs[0]);
   src0 = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), inputs[0], inputs[0]);
   writeout(12, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));

   //! s2: %tmp13_0 = v_cmp_lt_f32 0x7fc00000, %a
   //! s2: %tmp13_1 = v_cmp_neq_f32 %a, %a
   //! s2: %res13, s1: %_:scc = s_or_b64 %tmp13_1, %tmp13_0
   //! p_unit_test 13, %res13
   src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(nan32), inputs[0]);
   src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
   writeout(13, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));

   /* the all-ones f64 NaN prints as the inline constant -1 */
   //! s2: %tmp14_0 = v_cmp_lt_f64 -1, %a
   //! s2: %tmp14_1 = v_cmp_neq_f64 %a, %a
   //! s2: %res14, s1: %_:scc = s_or_b64 %tmp14_1, %tmp14_0
   //! p_unit_test 14, %res14
   src1 = bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm), Operand::c64(nan64), inputs[0]);
   src0 = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[0], inputs[0]);
   writeout(14, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));

   finish_opt_test();
END_TEST
685
/* Checks combining of two v_add_u32 into v_add3_u32, and that the combine
 * is refused when either add carries a clamp modifier. The specially-
 * prefixed comment lines are match patterns consumed by the test harness. */
BEGIN_TEST(optimize.add3)
   //>> v1: %a, v1: %b, v1: %c = p_startpgm
   if (!setup_cs("v1 v1 v1", GFX9))
      return;

   /* plain add-of-add fuses into v_add3_u32 */
   //! v1: %res0 = v_add3_u32 %a, %b, %c
   //! p_unit_test 0, %res0
   Builder::Result tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
   writeout(0, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));

   /* clamp on the inner add blocks the combine */
   //! v1: %tmp1 = v_add_u32 %b, %c clamp
   //! v1: %res1 = v_add_u32 %a, %tmp1
   //! p_unit_test 1, %res1
   tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
   tmp.instr->vop3().clamp = true;
   writeout(1, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));

   /* clamp on the outer add blocks it too */
   //! v1: %tmp2 = v_add_u32 %b, %c
   //! v1: %res2 = v_add_u32 %a, %tmp2 clamp
   //! p_unit_test 2, %res2
   tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
   tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp);
   tmp.instr->vop3().clamp = true;
   writeout(2, tmp);

   finish_opt_test();
END_TEST
713
/* Checks min/max -> v_max3 combining through fneg: -min(x, y) == max(-x, -y),
 * so max(0, -min(0, ...)) can become a single v_max3_f32 with negated inner
 * operands. The specially-prefixed comment lines are match patterns consumed
 * by the test harness. */
BEGIN_TEST(optimize.minmax)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a = p_startpgm
      if (!setup_cs("v1", (chip_class)i))
         continue;

      /* max(0, -min(0, -a)) => max3(0, -0, a) */
      //! v1: %res0 = v_max3_f32 0, -0, %a
      //! p_unit_test 0, %res0
      Temp xor0 = fneg(inputs[0]);
      Temp min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), xor0);
      Temp xor1 = fneg(min);
      writeout(0, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));

      /* max(0, -min(0, a)) => max3(0, -0, -a) */
      //! v1: %res1 = v_max3_f32 0, -0, -%a
      //! p_unit_test 1, %res1
      min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), Operand(inputs[0]));
      xor1 = fneg(min);
      writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));

      finish_opt_test();
   }
END_TEST
736
/* Checks combining of v_mul_u32_u24 + add into v_mad_u32_u24 on GFX8/9,
 * and that it is skipped when the add's carry-out is needed. The specially-
 * prefixed comment lines are match patterns consumed by the test harness. */
BEGIN_TEST(optimize.mad_32_24)
   for (unsigned i = GFX8; i <= GFX9; i++) {
      //>> v1: %a, v1: %b, v1: %c = p_startpgm
      if (!setup_cs("v1 v1 v1", (chip_class)i))
         continue;

      /* mul_u24 feeding an add becomes mad_u32_u24 */
      //! v1: %res0 = v_mad_u32_u24 %b, %c, %a
      //! p_unit_test 0, %res0
      Temp mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
      writeout(0, bld.vadd32(bld.def(v1), inputs[0], mul));

      /* the carry-out is used, so the mul/add pair must be preserved */
      //! v1: %res1_tmp = v_mul_u32_u24 %b, %c
      //! v1: %_, s2: %res1 = v_add_co_u32 %a, %res1_tmp
      //! p_unit_test 1, %res1
      mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
      writeout(1, bld.vadd32(bld.def(v1), inputs[0], mul, true).def(1).getTemp());

      finish_opt_test();
   }
END_TEST
757
BEGIN_TEST(optimize.add_lshlrev)
   for (unsigned i = GFX8; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, s1: %c = p_startpgm
      if (!setup_cs("v1 v1 s1", (chip_class)i))
         continue;

      Temp lshl;

      /* Plain shift+add: combined into v_lshl_add_u32 on GFX9+ only. */
      //~gfx8! v1: %lshl0 = v_lshlrev_b32 3, %a
      //~gfx8! v1: %res0, s2: %_ = v_add_co_u32 %lshl0, %b
      //~gfx(9|10)! v1: %res0 = v_lshl_add_u32 %a, 3, %b
      //! p_unit_test 0, %res0
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), Operand(inputs[0]));
      writeout(0, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* 24-bit operand, shift amount 7: still no v_mad_u32_u24 on GFX8
       * (7-bit shift would need a multiply by 128). */
      //~gfx8! v1: %lshl1 = v_lshlrev_b32 7, (is24bit)%a
      //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %lshl1, %b
      //~gfx(9|10)! v1: %res1 = v_lshl_add_u32 (is24bit)%a, 7, %b
      //! p_unit_test 1, %res1
      Operand a_24bit = Operand(inputs[0]);
      a_24bit.set24bit(true);
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), a_24bit);
      writeout(1, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* Variable shift amount: GFX9+ swaps the operands into v_lshl_add_u32. */
      //~gfx8! v1: %lshl2 = v_lshlrev_b32 (is24bit)%a, (is24bit)%b
      //~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
      //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 (is24bit)%b, (is24bit)%a, %b
      //! p_unit_test 2, %res2
      Operand b_24bit = Operand(inputs[1]);
      b_24bit.set24bit(true);
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), a_24bit, b_24bit);
      writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* 24-bit operand shifted by 3: on GFX8 this becomes a multiply-add
       * by 8 (1 << 3) instead of the shift+add pair. */
      //~gfx8! v1: %res3 = v_mad_u32_u24 (is24bit)%a, 8, %b
      //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 3, %b
      //! p_unit_test 3, %res3
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), a_24bit);
      writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* 16-bit operand works too: mad by 16 (1 << 4) on GFX8. */
      //~gfx8! v1: %res4 = v_mad_u32_u24 (is16bit)%a, 16, %b
      //~gfx(9|10)! v1: %res4 = v_lshl_add_u32 (is16bit)%a, 4, %b
      //! p_unit_test 4, %res4
      Operand a_16bit = Operand(inputs[0]);
      a_16bit.set16bit(true);
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), a_16bit);
      writeout(4, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* Same combine with an SGPR source (built through vop2_e64). */
      //~gfx8! v1: %res5 = v_mad_u32_u24 (is24bit)%c, 16, %c
      //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%c, 4, %c
      //! p_unit_test 5, %res5
      Operand c_24bit = Operand(inputs[2]);
      c_24bit.set24bit(true);
      lshl = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), c_24bit);
      writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[2])));

      finish_opt_test();
   }
END_TEST
816
/* Modifier operation inserted between the source and destination
 * instructions in the denorm-propagation test below. */
enum denorm_op {
   denorm_mul1 = 0,    /* multiply by 1.0 */
   denorm_fneg = 1,    /* negation */
   denorm_fabs = 2,    /* absolute value */
   denorm_fnegabs = 3, /* negated absolute value */
};
823
/* Printable names for the subvariant string; indexed by denorm_op. */
static const char *denorm_op_names[] = {
   "mul1",
   "fneg",
   "fabs",
   "fnegabs",
};
830
/* One variant of the denorm-propagation test: the denormal mode, the
 * modifier op, and the optional surrounding instructions
 * (aco_opcode::num_opcodes means "no instruction"). */
struct denorm_config {
   bool flush;     /* flush (fp_denorm_flush) vs keep (fp_denorm_keep) fp32 denormals */
   unsigned op;    /* a denorm_op value */
   aco_opcode src; /* instruction producing the op's source, or num_opcodes */
   aco_opcode dest; /* instruction consuming the op's result, or num_opcodes */
};
837
srcdest_op_name(aco_opcode op)838 static const char *srcdest_op_name(aco_opcode op)
839 {
840 switch (op) {
841 case aco_opcode::v_cndmask_b32:
842 return "cndmask";
843 case aco_opcode::v_min_f32:
844 return "min";
845 case aco_opcode::v_rcp_f32:
846 return "rcp";
847 default:
848 return "none";
849 }
850 }
851
emit_denorm_srcdest(aco_opcode op,Temp val)852 static Temp emit_denorm_srcdest(aco_opcode op, Temp val)
853 {
854 switch (op) {
855 case aco_opcode::v_cndmask_b32:
856 return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]);
857 case aco_opcode::v_min_f32:
858 return bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), val);
859 case aco_opcode::v_rcp_f32:
860 return bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), val);
861 default:
862 return val;
863 }
864 }
865
866 BEGIN_TEST(optimize.denorm_propagation)
867 for (unsigned i = GFX8; i <= GFX9; i++) {
868 std::vector<denorm_config> configs;
869 for (bool flush : {false, true}) {
870 for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
871 configs.push_back({flush, op, aco_opcode::num_opcodes, aco_opcode::num_opcodes});
872
873 for (aco_opcode dest : {aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
874 for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
875 configs.push_back({flush, op, aco_opcode::num_opcodes, dest});
876 }
877
878 for (aco_opcode src : {aco_opcode::v_cndmask_b32, aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
879 for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
880 configs.push_back({flush, op, src, aco_opcode::num_opcodes});
881 }
882 }
883
884 for (denorm_config cfg : configs) {
885 char subvariant[128];
886 sprintf(subvariant, "_%s_%s_%s_%s",
887 cfg.flush ? "flush" : "keep", srcdest_op_name(cfg.src),
888 denorm_op_names[(int)cfg.op], srcdest_op_name(cfg.dest));
889 if (!setup_cs("v1 s2", (chip_class)i, CHIP_UNKNOWN, subvariant))
890 continue;
891
892 bool can_propagate = cfg.src == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.src == aco_opcode::v_min_f32) ||
893 cfg.dest == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.dest == aco_opcode::v_min_f32) ||
894 !cfg.flush;
895
896 fprintf(output, "src, dest, op: %s %s %s\n",
897 srcdest_op_name(cfg.src), srcdest_op_name(cfg.dest), denorm_op_names[(int)cfg.op]);
898 fprintf(output, "can_propagate: %u\n", can_propagate);
899 //! src, dest, op: $src $dest $op
900 //! can_propagate: #can_propagate
901 //>> v1: %a, s2: %b = p_startpgm
902
903 //; patterns = {'cndmask': 'v1: %{} = v_cndmask_b32 0, {}, %b',
904 //; 'min': 'v1: %{} = v_min_f32 0, {}',
905 //; 'rcp': 'v1: %{} = v_rcp_f32 {}'}
906 //; ops = {'mul1': 'v1: %{} = v_mul_f32 1.0, %{}',
907 //; 'fneg': 'v1: %{} = v_mul_f32 -1.0, %{}',
908 //; 'fabs': 'v1: %{} = v_mul_f32 1.0, |%{}|',
909 //; 'fnegabs': 'v1: %{} = v_mul_f32 -1.0, |%{}|'}
910 //; inline_ops = {'mul1': '%{}', 'fneg': '-%{}', 'fabs': '|%{}|', 'fnegabs': '-|%{}|'}
911
912 //; name = 'a'
913 //; if src != 'none':
914 //; insert_pattern(patterns[src].format('src_res', '%'+name))
915 //; name = 'src_res'
916
917 //; if can_propagate:
918 //; name = inline_ops[op].format(name)
919 //; else:
920 //; insert_pattern(ops[op].format('op_res', name))
921 //; name = '%op_res'
922
923 //; if dest != 'none':
924 //; insert_pattern(patterns[dest].format('dest_res', name))
925 //; name = '%dest_res'
926
927 //; insert_pattern('v1: %res = v_cndmask_b32 0, {}, %b'.format(name))
928 //! p_unit_test 0, %res
929
930 program->blocks[0].fp_mode.denorm32 = cfg.flush ? fp_denorm_flush : fp_denorm_keep;
931
932 Temp val = emit_denorm_srcdest(cfg.src, inputs[0]);
933 switch (cfg.op) {
934 case denorm_mul1:
935 val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f800000u), val);
936 break;
937 case denorm_fneg:
938 val = fneg(val);
939 break;
940 case denorm_fabs:
941 val = fabs(val);
942 break;
943 case denorm_fnegabs:
944 val = fneg(fabs(val));
945 break;
946 }
947 val = emit_denorm_srcdest(cfg.dest, val);
948 writeout(
949 0, bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]));
950
951 finish_opt_test();
952 }
953 }
954 END_TEST
955
BEGIN_TEST(optimizer.dpp)
   //>> v1: %a, v1: %b, s2: %c, s1: %d = p_startpgm
   if (!setup_cs("v1 v1 s2 s1", GFX10_3))
      return;

   Operand a(inputs[0]);
   Operand b(inputs[1]);
   Operand c(inputs[2]);
   Operand d(inputs[3]);

   /* basic optimization: the DPP mov folds into the add's first operand */
   //! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1
   //! p_unit_test 0, %res0
   Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp0, b);
   writeout(0, res0);

   /* operand swapping: v_sub becomes v_subrev so the DPP value can be src0 */
   //! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1
   //! p_unit_test 1, %res1
   Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), b, tmp1);
   writeout(1, res1);

   /* the consumer already uses DPP itself, so the mov cannot be folded */
   //! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1
   //! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1
   //! p_unit_test 2, %res2
   Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), b, tmp2, dpp_row_half_mirror);
   writeout(2, res2);

   /* modifiers: a neg on the DPP mov survives the fold */
   //! v1: %res3 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
   //! p_unit_test 3, %res3
   auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   tmp3.instr->dpp().neg[0] = true;
   Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp3, b);
   writeout(3, res3);

   /* a neg on the VOP3 consumer is folded together with the DPP mov */
   //! v1: %res4 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
   //! p_unit_test 4, %res4
   Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp4, b);
   res4.instr->vop3().neg[0] = true;
   writeout(4, res4);

   /* clamp on the consumer prevents the fold (DPP has no clamp) */
   //! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1
   //! v1: %res5 = v_add_f32 %tmp5, %b clamp
   //! p_unit_test 5, %res5
   Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   auto res5 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp5, b);
   res5.instr->vop3().clamp = true;
   writeout(5, res5);

   /* abs on the consumer overrides the neg from the DPP mov */
   //! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1
   //! p_unit_test 6, %res6
   auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   tmp6.instr->dpp().neg[0] = true;
   auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp6, b);
   res6.instr->vop3().abs[0] = true;
   writeout(6, res6);

   /* abs stays on the non-DPP operand after the subrev swap */
   //! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1
   //! p_unit_test 7, %res7
   Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), b, tmp7);
   res7.instr->vop3().abs[0] = true;
   writeout(7, res7);

   /* vcc: cndmask with a vcc operand still allows the fold */
   //! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1
   //! p_unit_test 8, %res8
   Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res8 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp8, b, c);
   writeout(8, res8);

   /* sgprs: an SGPR operand on the consumer blocks the fold */
   //! v1: %tmp9 = v_mov_b32 %a row_mirror bound_ctrl:1
   //! v1: %res9 = v_add_f32 %tmp9, %d
   //! p_unit_test 9, %res9
   Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res9 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp9, d);
   writeout(9, res9);

   /* same with the SGPR in the other operand slot */
   //! v1: %tmp10 = v_mov_b32 %a row_mirror bound_ctrl:1
   //! v1: %res10 = v_add_f32 %d, %tmp10
   //! p_unit_test 10, %res10
   Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res10 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), d, tmp10);
   writeout(10, res10);

   finish_opt_test();
END_TEST
1049
1050 BEGIN_TEST(optimize.dpp_prop)
1051 //>> v1: %a, s1: %b = p_startpgm
1052 if (!setup_cs("v1 s1", GFX10))
1053 return;
1054
1055 //! v1: %one = p_parallelcopy 1
1056 //! v1: %res0 = v_mul_f32 1, %a
1057 //! p_unit_test 0, %res0
1058 Temp one = bld.copy(bld.def(v1), Operand::c32(1));
1059 writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_sl(1)));
1060
1061 //! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1
1062 //! p_unit_test 1, %res1
1063 writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_sl(1)));
1064
1065 //! v1: %res2 = v_mul_f32 0x12345678, %a
1066 //! p_unit_test 2, %res2
1067 Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
1068 writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1)));
1069
1070 //! v1: %literal2 = p_parallelcopy 0x12345679
1071 //! v1: %res3 = v_mul_f32 %a, %literal row_shl:1 bound_ctrl:1
1072 //! p_unit_test 3, %res3
1073 Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
1074 writeout(3, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1)));
1075
1076 //! v1: %b_v = p_parallelcopy %b
1077 //! v1: %res4 = v_mul_f32 %b, %a
1078 //! p_unit_test 4, %res4
1079 Temp b_v = bld.copy(bld.def(v1), inputs[1]);
1080 writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));
1081
1082 //! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1
1083 //! p_unit_test 5, %res5
1084 writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_sl(1)));
1085
1086 //! v1: %res6 = v_rcp_f32 %b
1087 //! p_unit_test 6, %res6
1088 writeout(6, bld.vop1_dpp(aco_opcode::v_rcp_f32, bld.def(v1), b_v, dpp_row_sl(1)));
1089
1090 finish_opt_test();
1091 END_TEST
1092
1093