/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */
24 #include "helpers.h"
25 
26 using namespace aco;
27 
28 BEGIN_TEST(optimize.neg)
29    for (unsigned i = GFX9; i <= GFX10; i++) {
30       //>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm
31       if (!setup_cs("v1 v1 s1 s1", (chip_class)i))
32          continue;
33 
34       //! v1: %res0 = v_mul_f32 %a, -%b
35       //! p_unit_test 0, %res0
36       Temp neg_b = fneg(inputs[1]);
37       writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_b));
38 
39       //~gfx9! v1: %neg_a = v_mul_f32 -1.0, %a
40       //~gfx9! v1: %res1 = v_mul_f32 0x123456, %neg_a
41       //~gfx10! v1: %res1 = v_mul_f32 0x123456, -%a
42       //! p_unit_test 1, %res1
43       Temp neg_a = fneg(inputs[0]);
44       writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x123456u), neg_a));
45 
46       //! v1: %res2 = v_mul_f32 %a, %b
47       //! p_unit_test 2, %res2
48       Temp neg_neg_a = fneg(neg_a);
49       writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_neg_a, inputs[1]));
50 
51       //! v1: %res3 = v_mul_f32 |%a|, %b
52       //! p_unit_test 3, %res3
53       Temp abs_neg_a = fabs(neg_a);
54       writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_a, inputs[1]));
55 
56       //! v1: %res4 = v_mul_f32 -|%a|, %b
57       //! p_unit_test 4, %res4
58       Temp abs_a = fabs(inputs[0]);
59       Temp neg_abs_a = fneg(abs_a);
60       writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));
61 
62       //! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
63       //! p_unit_test 5, %res5
64       writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));
65 
66       //! v1: %res6 = v_subrev_f32 %a, %b
67       //! p_unit_test 6, %res6
68       writeout(6, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), neg_a, inputs[1]));
69 
70       //! v1: %res7 = v_sub_f32 %b, %a
71       //! p_unit_test 7, %res7
72       writeout(7, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[1], neg_a));
73 
74       //! v1: %res8 = v_mul_f32 %a, -%c
75       //! p_unit_test 8, %res8
76       Temp neg_c = fneg(bld.copy(bld.def(v1), inputs[2]));
77       writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_c));
78 
79       // //! v1: %res9 = v_mul_f32 |%neg_a|, %b
80       // //! p_unit_test 9, %res9
81       Temp abs_neg_abs_a = fabs(neg_abs_a);
82       writeout(9, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_abs_a, inputs[1]));
83 
84       finish_opt_test();
85    }
86 END_TEST
87 
88 BEGIN_TEST(optimize.output_modifiers)
89    //>> v1: %a, v1: %b = p_startpgm
90    if (!setup_cs("v1 v1", GFX9))
91       return;
92 
93    program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
94 
95    /* 32-bit modifiers */
96 
97    //! v1: %res0 = v_add_f32 %a, %b *0.5
98    //! p_unit_test 0, %res0
99    Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
100    writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f000000u), tmp));
101 
102    //! v1: %res1 = v_add_f32 %a, %b *2
103    //! p_unit_test 1, %res1
104    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
105    writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
106 
107    //! v1: %res2 = v_add_f32 %a, %b *4
108    //! p_unit_test 2, %res2
109    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
110    writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40800000u), tmp));
111 
112    //! v1: %res3 = v_add_f32 %a, %b clamp
113    //! p_unit_test 3, %res3
114    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
115    writeout(3, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
116                         Operand::c32(0x3f800000u), tmp));
117 
118    //! v1: %res4 = v_add_f32 %a, %b *2 clamp
119    //! p_unit_test 4, %res4
120    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
121    tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
122    writeout(4, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
123                         Operand::c32(0x3f800000u), tmp));
124 
125    /* 16-bit modifiers */
126 
127    //! v2b: %res5 = v_add_f16 %a, %b *0.5
128    //! p_unit_test 5, %res5
129    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
130    writeout(5, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x3800u), tmp));
131 
132    //! v2b: %res6 = v_add_f16 %a, %b *2
133    //! p_unit_test 6, %res6
134    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
135    writeout(6, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
136 
137    //! v2b: %res7 = v_add_f16 %a, %b *4
138    //! p_unit_test 7, %res7
139    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
140    writeout(7, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4400u), tmp));
141 
142    //! v2b: %res8 = v_add_f16 %a, %b clamp
143    //! p_unit_test 8, %res8
144    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
145    writeout(8, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
146                         Operand::c16(0x3c00u), tmp));
147 
148    //! v2b: %res9 = v_add_f16 %a, %b *2 clamp
149    //! p_unit_test 9, %res9
150    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
151    tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000), tmp);
152    writeout(9, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
153                         Operand::c16(0x3c00u), tmp));
154 
155    /* clamping is done after omod */
156 
157    //! v1: %res10_tmp = v_add_f32 %a, %b clamp
158    //! v1: %res10 = v_mul_f32 2.0, %res10_tmp
159    //! p_unit_test 10, %res10
160    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
161    tmp = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), Operand::c32(0x3f800000u),
162                   tmp);
163    writeout(10, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
164 
165    /* unsupported instructions */
166 
167    //! v1: %res11_tmp = v_xor_b32 %a, %b
168    //! v1: %res11 = v_mul_f32 2.0, %res11_tmp
169    //! p_unit_test 11, %res11
170    tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], inputs[1]);
171    writeout(11, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
172 
173    /* several users */
174 
175    //! v1: %res12_tmp = v_add_f32 %a, %b
176    //! p_unit_test %res12_tmp
177    //! v1: %res12 = v_mul_f32 2.0, %res12_tmp
178    //! p_unit_test 12, %res12
179    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
180    bld.pseudo(aco_opcode::p_unit_test, tmp);
181    writeout(12, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
182 
183    //! v1: %res13 = v_add_f32 %a, %b
184    //! p_unit_test 13, %res13
185    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
186    bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
187    writeout(13, tmp);
188 
189    /* omod has no effect if denormals are enabled but clamp is fine */
190 
191    //>> BB1
192    //! /* logical preds: / linear preds: / kind: uniform, */
193    program->next_fp_mode.denorm32 = fp_denorm_keep;
194    program->next_fp_mode.denorm16_64 = fp_denorm_flush;
195    bld.reset(program->create_and_insert_block());
196 
197    //! v1: %res14_tmp = v_add_f32 %a, %b
198    //! v1: %res14 = v_mul_f32 2.0, %res13_tmp
199    //! p_unit_test 14, %res14
200    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
201    writeout(14, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
202 
203    //! v1: %res15 = v_add_f32 %a, %b clamp
204    //! p_unit_test 15, %res15
205    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
206    writeout(15, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
207                          Operand::c32(0x3f800000u), tmp));
208 
209    //>> BB2
210    //! /* logical preds: / linear preds: / kind: uniform, */
211    program->next_fp_mode.denorm32 = fp_denorm_flush;
212    program->next_fp_mode.denorm16_64 = fp_denorm_keep;
213    bld.reset(program->create_and_insert_block());
214 
215    //! v2b: %res16_tmp = v_add_f16 %a, %b
216    //! v2b: %res16 = v_mul_f16 2.0, %res15_tmp
217    //! p_unit_test 16, %res16
218    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
219    writeout(16, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
220 
221    //! v2b: %res17 = v_add_f16 %a, %b clamp
222    //! p_unit_test 17, %res17
223    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
224    writeout(17, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
225                          Operand::c16(0x3c00u), tmp));
226 
227    /* omod flushes -0.0 to +0.0 */
228 
229    //>> BB3
230    //! /* logical preds: / linear preds: / kind: uniform, */
231    program->next_fp_mode.denorm32 = fp_denorm_keep;
232    program->next_fp_mode.denorm16_64 = fp_denorm_keep;
233    program->next_fp_mode.preserve_signed_zero_inf_nan32 = true;
234    program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
235    bld.reset(program->create_and_insert_block());
236 
237    //! v1: %res18_tmp = v_add_f32 %a, %b
238    //! v1: %res18 = v_mul_f32 2.0, %res18_tmp
239    //! p_unit_test 18, %res18
240    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
241    writeout(18, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
242    //! v1: %res19 = v_add_f32 %a, %b clamp
243    //! p_unit_test 19, %res19
244    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
245    writeout(19, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
246                          Operand::c32(0x3f800000u), tmp));
247 
248    //>> BB4
249    //! /* logical preds: / linear preds: / kind: uniform, */
250    program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
251    program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = true;
252    bld.reset(program->create_and_insert_block());
253    //! v2b: %res20_tmp = v_add_f16 %a, %b
254    //! v2b: %res20 = v_mul_f16 2.0, %res20_tmp
255    //! p_unit_test 20, %res20
256    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
257    writeout(20, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
258    //! v2b: %res21 = v_add_f16 %a, %b clamp
259    //! p_unit_test 21, %res21
260    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
261    writeout(21, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
262                          Operand::c16(0x3c00u), tmp));
263 
264    finish_opt_test();
265 END_TEST
266 
create_subbrev_co(Operand op0,Operand op1,Operand op2)267 Temp create_subbrev_co(Operand op0, Operand op1, Operand op2)
268 {
269    return bld.vop2_e64(aco_opcode::v_subbrev_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), op0, op1, op2);
270 }
271 
272 BEGIN_TEST(optimize.cndmask)
273    for (unsigned i = GFX9; i <= GFX10; i++) {
274       //>> v1: %a, s1: %b, s2: %c = p_startpgm
275       if (!setup_cs("v1 s1 s2", (chip_class)i))
276          continue;
277 
278       Temp subbrev;
279 
280       //! v1: %res0 = v_cndmask_b32 0, %a, %c
281       //! p_unit_test 0, %res0
282       subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
283       writeout(0, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[0], subbrev));
284 
285       //! v1: %res1 = v_cndmask_b32 0, 42, %c
286       //! p_unit_test 1, %res1
287       subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
288       writeout(1, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(42u), subbrev));
289 
290       //~gfx9! v1: %subbrev, s2: %_ = v_subbrev_co_u32 0, 0, %c
291       //~gfx9! v1: %res2 = v_and_b32 %b, %subbrev
292       //~gfx10! v1: %res2 = v_cndmask_b32 0, %b, %c
293       //! p_unit_test 2, %res2
294       subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
295       writeout(2, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[1], subbrev));
296 
297       //! v1: %subbrev1, s2: %_ = v_subbrev_co_u32 0, 0, %c
298       //! v1: %xor = v_xor_b32 %a, %subbrev1
299       //! v1: %res3 = v_cndmask_b32 0, %xor, %c
300       //! p_unit_test 3, %res3
301       subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
302       Temp xor_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], subbrev);
303       writeout(3, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), xor_a, subbrev));
304 
305       //! v1: %res4 = v_cndmask_b32 0, %a, %c
306       //! p_unit_test 4, %res4
307       Temp cndmask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
308                                   Operand::c32(1u), Operand(inputs[2]));
309       Temp sub = bld.vsub32(bld.def(v1), Operand::zero(), cndmask);
310       writeout(4, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(inputs[0]), sub));
311 
312       finish_opt_test();
313    }
314 END_TEST
315 
316 BEGIN_TEST(optimize.add_lshl)
317    for (unsigned i = GFX8; i <= GFX10; i++) {
318       //>> s1: %a, v1: %b = p_startpgm
319       if (!setup_cs("s1 v1", (chip_class)i))
320          continue;
321 
322       Temp shift;
323 
324       //~gfx8! s1: %lshl0, s1: %_:scc = s_lshl_b32 %a, 3
325       //~gfx8! s1: %res0, s1: %_:scc = s_add_u32 %lshl0, 4
326       //~gfx(9|10)! s1: %res0, s1: %_:scc = s_lshl3_add_u32 %a, 4
327       //! p_unit_test 0, %res0
328       shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
329                        Operand::c32(3u));
330       writeout(0, bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift,
331                            Operand::c32(4u)));
332 
333       //~gfx8! s1: %lshl1, s1: %_:scc = s_lshl_b32 %a, 3
334       //~gfx8! s1: %add1, s1: %_:scc = s_add_u32 %lshl1, 4
335       //~gfx8! v1: %add_co1, s2: %_ = v_add_co_u32 %lshl1, %b
336       //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %add1, %add_co1
337       //~gfx(9|10)! s1: %lshl1, s1: %_:scc = s_lshl3_add_u32 %a, 4
338       //~gfx(9|10)! v1: %lshl_add = v_lshl_add_u32 %a, 3, %b
339       //~gfx(9|10)! v1: %res1 = v_add_u32 %lshl1, %lshl_add
340       //! p_unit_test 1, %res1
341       shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
342                        Operand::c32(3u));
343       Temp sadd =
344          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift, Operand::c32(4u));
345       Temp vadd = bld.vadd32(bld.def(v1), shift, Operand(inputs[1]));
346       writeout(1, bld.vadd32(bld.def(v1), sadd, vadd));
347 
348       //~gfx8! s1: %lshl2 = s_lshl_b32 %a, 3
349       //~gfx8! v1: %res2,  s2: %_ = v_add_co_u32 %lshl2, %b
350       //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 %a, 3, %b
351       //! p_unit_test 2, %res2
352       Temp lshl =
353          bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), Operand(inputs[0]), Operand::c32(3u));
354       writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
355 
356       //~gfx8! s1: %lshl3 = s_lshl_b32 (is24bit)%a, 7
357       //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %lshl3, %b
358       //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 7, %b
359       //! p_unit_test 3, %res3
360       Operand a_24bit = Operand(inputs[0]);
361       a_24bit.set24bit(true);
362       lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(7u));
363       writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
364 
365       //! s1: %lshl4 = s_lshl_b32 (is24bit)%a, 3
366       //~gfx(8|9)! v1: %res4, s2: %carry = v_add_co_u32 %lshl4, %b
367       //~gfx10! v1: %res4, s2: %carry = v_add_co_u32_e64 %lshl4, %b
368       //! p_unit_test 4, %carry
369       lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(3u));
370       Temp carry = bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]), true).def(1).getTemp();
371       writeout(4, carry);
372 
373       //~gfx8! s1: %lshl5 = s_lshl_b32 (is24bit)%a, (is24bit)%a
374       //~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %lshl5, %b
375       //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%a, (is24bit)%a, %b
376       //! p_unit_test 5, %res5
377       lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, a_24bit);
378       writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
379 
380       //~gfx8! v1: %res6 = v_mad_u32_u24 (is24bit)%a, 8, %b
381       //~gfx(9|10)! v1: %res6 = v_lshl_add_u32 (is24bit)%a, 3, %b
382       //! p_unit_test 6, %res6
383       lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(3u));
384       writeout(6, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
385 
386       //~gfx8! v1: %res7 = v_mad_u32_u24 (is16bit)%a, 16, %b
387       //~gfx(9|10)! v1: %res7 = v_lshl_add_u32 (is16bit)%a, 4, %b
388       //! p_unit_test 7, %res7
389       Operand a_16bit = Operand(inputs[0]);
390       a_16bit.set16bit(true);
391       lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_16bit, Operand::c32(4u));
392       writeout(7, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
393 
394       finish_opt_test();
395    }
396 END_TEST
397 
398 BEGIN_TEST(optimize.bcnt)
399    for (unsigned i = GFX8; i <= GFX10; i++) {
400       //>> v1: %a, s1: %b = p_startpgm
401       if (!setup_cs("v1 s1", (chip_class)i))
402          continue;
403 
404       Temp bcnt;
405 
406       //! v1: %res0 = v_bcnt_u32_b32 %a, %a
407       //! p_unit_test 0, %res0
408       bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
409       writeout(0, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
410 
411       //! v1: %res1 = v_bcnt_u32_b32 %a, %b
412       //! p_unit_test 1, %res1
413       bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
414       writeout(1, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[1])));
415 
416       //! v1: %res2 = v_bcnt_u32_b32 %a, 42
417       //! p_unit_test 2, %res2
418       bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
419       writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand::c32(42u)));
420 
421       //! v1: %bnct3 = v_bcnt_u32_b32 %b, 0
422       //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a
423       //~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a
424       //! p_unit_test 3, %res3
425       bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand::zero());
426       writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
427 
428       //! v1: %bnct4 = v_bcnt_u32_b32 %a, 0
429       //~gfx(8|9)! v1: %add4, s2: %carry = v_add_co_u32 %bcnt4, %a
430       //~gfx10! v1: %add4, s2: %carry = v_add_co_u32_e64 %bcnt4, %a
431       //! p_unit_test 4, %carry
432       bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
433       Temp carry = bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]), true).def(1).getTemp();
434       writeout(4, carry);
435 
436       finish_opt_test();
437    }
438 END_TEST
439 
440 struct clamp_config {
441    const char *name;
442    aco_opcode min, max, med3;
443    Operand lb, ub;
444 };
445 
446 static const clamp_config clamp_configs[] = {
447    /* 0.0, 4.0 */
448    {"_0,4f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
449     Operand::zero(), Operand::c32(0x40800000u)},
450    {"_0,4f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
451     Operand::c16(0u), Operand::c16(0x4400)},
452    /* -1.0, 0.0 */
453    {"_-1,0f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
454     Operand::c32(0xbf800000u), Operand::zero()},
455    {"_-1,0f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
456     Operand::c16(0xBC00), Operand::c16(0u)},
457    /* 0, 3 */
458    {"_0,3u32", aco_opcode::v_min_u32, aco_opcode::v_max_u32, aco_opcode::v_med3_u32,
459     Operand::zero(), Operand::c32(3u)},
460    {"_0,3u16", aco_opcode::v_min_u16, aco_opcode::v_max_u16, aco_opcode::v_med3_u16,
461     Operand::c16(0u), Operand::c16(3u)},
462    {"_0,3i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
463     Operand::zero(), Operand::c32(3u)},
464    {"_0,3i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
465     Operand::c16(0u), Operand::c16(3u)},
466    /* -5, 0 */
467    {"_-5,0i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
468     Operand::c32(0xfffffffbu), Operand::zero()},
469    {"_-5,0i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
470     Operand::c16(0xfffbu), Operand::c16(0u)},
471 };
472 
473 BEGIN_TEST(optimize.clamp)
474    for (clamp_config cfg : clamp_configs) {
475       if (!setup_cs("v1 v1 v1", GFX9, CHIP_UNKNOWN, cfg.name))
476          continue;
477 
478       //! cfg: @match_func(min max med3 lb ub)
479       fprintf(output, "cfg: %s ", instr_info.name[(int)cfg.min]);
480       fprintf(output, "%s ", instr_info.name[(int)cfg.max]);
481       fprintf(output, "%s ", instr_info.name[(int)cfg.med3]);
482       aco_print_operand(&cfg.lb, output);
483       fprintf(output, " ");
484       aco_print_operand(&cfg.ub, output);
485       fprintf(output, "\n");
486 
487       //>> v1: %a, v1: %b, v1: %c = p_startpgm
488 
489       //! v1: %res0 = @med3 @ub, @lb, %a
490       //! p_unit_test 0, %res0
491       writeout(0, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
492                            bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));
493 
494       //! v1: %res1 = @med3 @lb, @ub, %a
495       //! p_unit_test 1, %res1
496       writeout(1, bld.vop2(cfg.max, bld.def(v1), cfg.lb,
497                            bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0])));
498 
499       /* min constant must be greater than max constant */
500       //! v1: %res2_tmp = @min @lb, %a
501       //! v1: %res2 = @max @ub, %res2_tmp
502       //! p_unit_test 2, %res2
503       writeout(2, bld.vop2(cfg.max, bld.def(v1), cfg.ub,
504                            bld.vop2(cfg.min, bld.def(v1), cfg.lb, inputs[0])));
505 
506       //! v1: %res3_tmp = @max @ub, %a
507       //! v1: %res3 = @min @lb, %res3_tmp
508       //! p_unit_test 3, %res3
509       writeout(3, bld.vop2(cfg.min, bld.def(v1), cfg.lb,
510                            bld.vop2(cfg.max, bld.def(v1), cfg.ub, inputs[0])));
511 
512       /* needs two constants */
513 
514       //! v1: %res4_tmp = @max @lb, %a
515       //! v1: %res4 = @min %b, %res4_tmp
516       //! p_unit_test 4, %res4
517       writeout(4, bld.vop2(cfg.min, bld.def(v1), inputs[1],
518                            bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));
519 
520       //! v1: %res5_tmp = @max %b, %a
521       //! v1: %res5 = @min @ub, %res5_tmp
522       //! p_unit_test 5, %res5
523       writeout(5, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
524                            bld.vop2(cfg.max, bld.def(v1), inputs[1], inputs[0])));
525 
526       //! v1: %res6_tmp = @max %c, %a
527       //! v1: %res6 = @min %b, %res6_tmp
528       //! p_unit_test 6, %res6
529       writeout(6, bld.vop2(cfg.min, bld.def(v1), inputs[1],
530                            bld.vop2(cfg.max, bld.def(v1), inputs[2], inputs[0])));
531 
532       /* correct NaN behaviour with precise */
533 
534       //! v1: %res7 = @med3 @ub, @lb, %a
535       //! p_unit_test 7, %res7
536       Builder::Result max = bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0]);
537       max.def(0).setPrecise(true);
538       Builder::Result min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, max);
539       max.def(0).setPrecise(true);
540       writeout(7, min);
541 
542       //! v1: (precise)%res8_tmp = @min @ub, %a
543       //! v1: %res8 = @max @lb, %res8_tmp
544       //! p_unit_test 8, %res8
545       min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0]);
546       min.def(0).setPrecise(true);
547       writeout(8, bld.vop2(cfg.max, bld.def(v1), cfg.lb, min));
548 
549       finish_opt_test();
550    }
551 END_TEST
552 
553 BEGIN_TEST(optimize.const_comparison_ordering)
554    //>> v1: %a, v1: %b, v2: %c, v1: %d = p_startpgm
555    if (!setup_cs("v1 v1 v2 v1", GFX9))
556       return;
557 
558    /* optimize to unordered comparison */
559    //! s2: %res0 = v_cmp_nge_f32 4.0, %a
560    //! p_unit_test 0, %res0
561    writeout(0, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
562                         bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
563                         bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
564                                  Operand::c32(0x40800000u), inputs[0])));
565 
566    //! s2: %res1 = v_cmp_nge_f32 4.0, %a
567    //! p_unit_test 1, %res1
568    writeout(1, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
569                         bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
570                         bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
571                                  Operand::c32(0x40800000u), inputs[0])));
572 
573    //! s2: %res2 = v_cmp_nge_f32 0x40a00000, %a
574    //! p_unit_test 2, %res2
575    writeout(2, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
576                         bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
577                         bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
578                                  bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));
579 
580    /* optimize to ordered comparison */
581    //! s2: %res3 = v_cmp_lt_f32 4.0, %a
582    //! p_unit_test 3, %res3
583    writeout(3, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
584                         bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
585                         bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
586                                  Operand::c32(0x40800000u), inputs[0])));
587 
588    //! s2: %res4 = v_cmp_lt_f32 4.0, %a
589    //! p_unit_test 4, %res4
590    writeout(4, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
591                         bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
592                         bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
593                                  Operand::c32(0x40800000u), inputs[0])));
594 
595    //! s2: %res5 = v_cmp_lt_f32 0x40a00000, %a
596    //! p_unit_test 5, %res5
597    writeout(5, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
598                         bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
599                         bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
600                                  bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));
601 
602    /* similar but unoptimizable expressions */
603    //! s2: %tmp6_0 = v_cmp_lt_f32 4.0, %a
604    //! s2: %tmp6_1 = v_cmp_neq_f32 %a, %a
605    //! s2: %res6, s1: %_:scc = s_and_b64 %tmp6_1, %tmp6_0
606    //! p_unit_test 6, %res6
607    Temp src1 =
608       bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
609    Temp src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
610    writeout(6, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
611 
612    //! s2: %tmp7_0 = v_cmp_nge_f32 4.0, %a
613    //! s2: %tmp7_1 = v_cmp_eq_f32 %a, %a
614    //! s2: %res7, s1: %_:scc = s_or_b64 %tmp7_1, %tmp7_0
615    //! p_unit_test 7, %res7
616    src1 =
617       bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
618    src0 = bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
619    writeout(7, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
620 
621    //! s2: %tmp8_0 = v_cmp_lt_f32 4.0, %d
622    //! s2: %tmp8_1 = v_cmp_neq_f32 %a, %a
623    //! s2: %res8, s1: %_:scc = s_or_b64 %tmp8_1, %tmp8_0
624    //! p_unit_test 8, %res8
625    src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[3]);
626    src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
627    writeout(8, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
628 
629    //! s2: %tmp9_0 = v_cmp_lt_f32 4.0, %a
630    //! s2: %tmp9_1 = v_cmp_neq_f32 %a, %d
631    //! s2: %res9, s1: %_:scc = s_or_b64 %tmp9_1, %tmp9_0
632    //! p_unit_test 9, %res9
633    src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
634    src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[3]);
635    writeout(9, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
636 
637    /* bit sizes */
638    //! s2: %res10 = v_cmp_nge_f16 4.0, %b
639    //! p_unit_test 10, %res10
640    Temp input1_16 =
641       bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), inputs[1], Operand::zero());
642    writeout(10, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
643                          bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), input1_16, input1_16),
644                          bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(0x4400u),
645                                   input1_16)));
646 
647    //! s2: %res11 = v_cmp_nge_f64 4.0, %c
648    //! p_unit_test 11, %res11
649    writeout(11, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
650                          bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[2], inputs[2]),
651                          bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm),
652                                   Operand::c64(0x4010000000000000u), inputs[2])));
653 
654    /* NaN */
655    uint16_t nan16 = 0x7e00;
656    uint32_t nan32 = 0x7fc00000;
657    uint64_t nan64 = 0xffffffffffffffffllu;
658 
659    //! s2: %tmp12_0 = v_cmp_lt_f16 0x7e00, %a
660    //! s2: %tmp12_1 = v_cmp_neq_f16 %a, %a
661    //! s2: %res12, s1: %_:scc = s_or_b64 %tmp12_1, %tmp12_0
662    //! p_unit_test 12, %res12
663    src1 = bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(nan16), inputs[0]);
664    src0 = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), inputs[0], inputs[0]);
665    writeout(12, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
666 
667    //! s2: %tmp13_0 = v_cmp_lt_f32 0x7fc00000, %a
668    //! s2: %tmp13_1 = v_cmp_neq_f32 %a, %a
669    //! s2: %res13, s1: %_:scc = s_or_b64 %tmp13_1, %tmp13_0
670    //! p_unit_test 13, %res13
671    src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(nan32), inputs[0]);
672    src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
673    writeout(13, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
674 
675    //! s2: %tmp14_0 = v_cmp_lt_f64 -1, %a
676    //! s2: %tmp14_1 = v_cmp_neq_f64 %a, %a
677    //! s2: %res14, s1: %_:scc = s_or_b64 %tmp14_1, %tmp14_0
678    //! p_unit_test 14, %res14
679    src1 = bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm), Operand::c64(nan64), inputs[0]);
680    src0 = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[0], inputs[0]);
681    writeout(14, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
682 
683    finish_opt_test();
684 END_TEST
685 
/* Checks fusion of two chained v_add_u32 into a single v_add3_u32 on GFX9,
 * and that output modifiers on either add prevent the fusion. */
BEGIN_TEST(optimize.add3)
   //>> v1: %a, v1: %b, v1: %c = p_startpgm
   if (!setup_cs("v1 v1 v1", GFX9))
      return;

   /* Plain chained adds are combined into one v_add3_u32. */
   //! v1: %res0 = v_add3_u32 %a, %b, %c
   //! p_unit_test 0, %res0
   Builder::Result tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
   writeout(0, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));

   /* A clamp modifier on the inner add prevents combining. */
   //! v1: %tmp1 = v_add_u32 %b, %c clamp
   //! v1: %res1 = v_add_u32 %a, %tmp1
   //! p_unit_test 1, %res1
   tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
   tmp.instr->vop3().clamp = true;
   writeout(1, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));

   /* A clamp modifier on the outer add prevents combining as well. */
   //! v1: %tmp2 = v_add_u32 %b, %c
   //! v1: %res2 = v_add_u32 %a, %tmp2 clamp
   //! p_unit_test 2, %res2
   tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
   tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp);
   tmp.instr->vop3().clamp = true;
   writeout(2, tmp);

   finish_opt_test();
END_TEST
713 
/* Checks that min/max chains with fneg in between are merged into v_max3_f32,
 * with leftover negations folded into source modifiers. */
BEGIN_TEST(optimize.minmax)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a = p_startpgm
      if (!setup_cs("v1", (chip_class)i))
         continue;

      /* max(0, -min(0, -a)): both negations cancel out and the min/max pair
       * becomes a single v_max3_f32 of a. */
      //! v1: %res0 = v_max3_f32 0, -0, %a
      //! p_unit_test 0, %res0
      Temp xor0 = fneg(inputs[0]);
      Temp min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), xor0);
      Temp xor1 = fneg(min);
      writeout(0, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));

      /* max(0, -min(0, a)): only one negation, which survives as a neg
       * source modifier on the merged v_max3_f32. */
      //! v1: %res1 = v_max3_f32 0, -0, -%a
      //! p_unit_test 1, %res1
      min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), Operand(inputs[0]));
      xor1 = fneg(min);
      writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));

      finish_opt_test();
   }
END_TEST
736 
/* Checks fusion of v_mul_u32_u24 followed by an add into v_mad_u32_u24. */
BEGIN_TEST(optimize.mad_32_24)
   for (unsigned i = GFX8; i <= GFX9; i++) {
      //>> v1: %a, v1: %b, v1: %c = p_startpgm
      if (!setup_cs("v1 v1 v1", (chip_class)i))
         continue;

      /* 24-bit multiply + add is fused into v_mad_u32_u24. */
      //! v1: %res0 = v_mad_u32_u24 %b, %c, %a
      //! p_unit_test 0, %res0
      Temp mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
      writeout(0, bld.vadd32(bld.def(v1), inputs[0], mul));

      /* When the carry-out of the add is used, the multiply and add must
       * stay separate. */
      //! v1: %res1_tmp = v_mul_u32_u24 %b, %c
      //! v1: %_, s2: %res1 = v_add_co_u32 %a, %res1_tmp
      //! p_unit_test 1, %res1
      mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
      writeout(1, bld.vadd32(bld.def(v1), inputs[0], mul, true).def(1).getTemp());

      finish_opt_test();
   }
END_TEST
757 
/* Checks fusion of a left shift followed by an add: into v_lshl_add_u32 on
 * GFX9+, and into v_mad_u32_u24 on GFX8 for some small shifts of operands
 * known to be 16/24-bit. */
BEGIN_TEST(optimize.add_lshlrev)
   for (unsigned i = GFX8; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, s1: %c = p_startpgm
      if (!setup_cs("v1 v1 s1", (chip_class)i))
         continue;

      Temp lshl;

      /* GFX9+ fuses shift+add into v_lshl_add_u32; GFX8 keeps them separate. */
      //~gfx8! v1: %lshl0 = v_lshlrev_b32 3, %a
      //~gfx8! v1: %res0, s2: %_ = v_add_co_u32 %lshl0, %b
      //~gfx(9|10)! v1: %res0 = v_lshl_add_u32 %a, 3, %b
      //! p_unit_test 0, %res0
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), Operand(inputs[0]));
      writeout(0, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* Same with a 24-bit operand and shift amount 7: still no fusion on
       * GFX8 (compare with test 3 below, where shift 3 is fused). */
      //~gfx8! v1: %lshl1 = v_lshlrev_b32 7, (is24bit)%a
      //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %lshl1, %b
      //~gfx(9|10)! v1: %res1 = v_lshl_add_u32 (is24bit)%a, 7, %b
      //! p_unit_test 1, %res1
      Operand a_24bit = Operand(inputs[0]);
      a_24bit.set24bit(true);
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), a_24bit);
      writeout(1, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* Variable shift amount: GFX9+ still fuses into v_lshl_add_u32. */
      //~gfx8! v1: %lshl2 = v_lshlrev_b32 (is24bit)%a, (is24bit)%b
      //~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
      //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 (is24bit)%b, (is24bit)%a, %b
      //! p_unit_test 2, %res2
      Operand b_24bit = Operand(inputs[1]);
      b_24bit.set24bit(true);
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), a_24bit, b_24bit);
      writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* On GFX8, shifting a 24-bit value by 3 is instead expressed as a
       * v_mad_u32_u24 with the power-of-two multiplier 8. */
      //~gfx8! v1: %res3 = v_mad_u32_u24 (is24bit)%a, 8, %b
      //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 3, %b
      //! p_unit_test 3, %res3
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), a_24bit);
      writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* The same works for a 16-bit operand shifted by 4 (multiplier 16). */
      //~gfx8! v1: %res4 = v_mad_u32_u24 (is16bit)%a, 16, %b
      //~gfx(9|10)! v1: %res4 = v_lshl_add_u32 (is16bit)%a, 4, %b
      //! p_unit_test 4, %res4
      Operand a_16bit = Operand(inputs[0]);
      a_16bit.set16bit(true);
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), a_16bit);
      writeout(4, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* And for a 24-bit SGPR operand shifted with the VOP3 encoding. */
      //~gfx8! v1: %res5 = v_mad_u32_u24 (is24bit)%c, 16, %c
      //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%c, 4, %c
      //! p_unit_test 5, %res5
      Operand c_24bit = Operand(inputs[2]);
      c_24bit.set24bit(true);
      lshl = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), c_24bit);
      writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[2])));

      finish_opt_test();
   }
END_TEST
816 
/* Operation applied to the value in the denorm propagation tests. The values
 * are explicit because they index denorm_op_names below. */
enum denorm_op {
   denorm_mul1 = 0,    /* multiply by 1.0 */
   denorm_fneg = 1,    /* negation */
   denorm_fabs = 2,    /* absolute value */
   denorm_fnegabs = 3, /* negated absolute value */
};
823 
/* Printable names for denorm_op, indexed by the enum value; used to build the
 * test subvariant string and checker variables. */
static const char *denorm_op_names[] = {
   "mul1",
   "fneg",
   "fabs",
   "fnegabs",
};
830 
/* One configuration of the optimize.denorm_propagation test. */
struct denorm_config {
   bool flush;     /* whether the block's fp_mode flushes 32-bit denormals */
   unsigned op;    /* denorm_op applied to the value */
   aco_opcode src; /* producer of the value, or num_opcodes for none */
   aco_opcode dest; /* consumer of the value, or num_opcodes for none */
};
837 
srcdest_op_name(aco_opcode op)838 static const char *srcdest_op_name(aco_opcode op)
839 {
840    switch (op) {
841    case aco_opcode::v_cndmask_b32:
842       return "cndmask";
843    case aco_opcode::v_min_f32:
844       return "min";
845    case aco_opcode::v_rcp_f32:
846       return "rcp";
847    default:
848       return "none";
849    }
850 }
851 
emit_denorm_srcdest(aco_opcode op,Temp val)852 static Temp emit_denorm_srcdest(aco_opcode op, Temp val)
853 {
854    switch (op) {
855    case aco_opcode::v_cndmask_b32:
856       return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]);
857    case aco_opcode::v_min_f32:
858       return bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), val);
859    case aco_opcode::v_rcp_f32:
860       return bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), val);
861    default:
862       return val;
863    }
864 }
865 
/* Checks in which configurations a mul-by-1.0/fneg/fabs/fnegabs can be folded
 * away or turned into source modifiers, depending on the block's 32-bit
 * denormal mode and on the producing/consuming instruction. The expected
 * output is built dynamically by the embedded checker script below. */
BEGIN_TEST(optimize.denorm_propagation)
   for (unsigned i = GFX8; i <= GFX9; i++) {
      /* Build all combinations: op alone, op followed by a dest instruction,
       * and a src instruction followed by op. */
      std::vector<denorm_config> configs;
      for (bool flush : {false, true}) {
         for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
            configs.push_back({flush, op, aco_opcode::num_opcodes, aco_opcode::num_opcodes});

         for (aco_opcode dest : {aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
            for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
               configs.push_back({flush, op, aco_opcode::num_opcodes, dest});
         }

         for (aco_opcode src : {aco_opcode::v_cndmask_b32, aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
            for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
               configs.push_back({flush, op, src, aco_opcode::num_opcodes});
         }
      }

      for (denorm_config cfg : configs) {
         char subvariant[128];
         sprintf(subvariant, "_%s_%s_%s_%s",
                 cfg.flush ? "flush" : "keep", srcdest_op_name(cfg.src),
                 denorm_op_names[(int)cfg.op], srcdest_op_name(cfg.dest));
         if (!setup_cs("v1 s2", (chip_class)i, CHIP_UNKNOWN, subvariant))
            continue;

         /* The combinations for which the optimizer is expected to fold the
          * operation into its neighbors; mirrored by the checker script. */
         bool can_propagate = cfg.src == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.src == aco_opcode::v_min_f32) ||
                              cfg.dest == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.dest == aco_opcode::v_min_f32) ||
                              !cfg.flush;

         /* Export the configuration to the checker via the $src/$dest/$op and
          * #can_propagate pattern variables. */
         fprintf(output, "src, dest, op: %s %s %s\n",
                 srcdest_op_name(cfg.src), srcdest_op_name(cfg.dest), denorm_op_names[(int)cfg.op]);
         fprintf(output, "can_propagate: %u\n", can_propagate);
         //! src, dest, op: $src $dest $op
         //! can_propagate: #can_propagate
         //>> v1: %a, s2: %b = p_startpgm

         /* Checker script: assembles the expected instruction sequence from
          * the exported variables. */
         //; patterns = {'cndmask': 'v1: %{} = v_cndmask_b32 0, {}, %b',
         //;             'min': 'v1: %{} = v_min_f32 0, {}',
         //;             'rcp': 'v1: %{} = v_rcp_f32 {}'}
         //; ops = {'mul1': 'v1: %{} = v_mul_f32 1.0, %{}',
         //;        'fneg': 'v1: %{} = v_mul_f32 -1.0, %{}',
         //;        'fabs': 'v1: %{} = v_mul_f32 1.0, |%{}|',
         //;        'fnegabs': 'v1: %{} = v_mul_f32 -1.0, |%{}|'}
         //; inline_ops = {'mul1': '%{}', 'fneg': '-%{}', 'fabs': '|%{}|', 'fnegabs': '-|%{}|'}

         //; name = 'a'
         //; if src != 'none':
         //;    insert_pattern(patterns[src].format('src_res', '%'+name))
         //;    name = 'src_res'

         //; if can_propagate:
         //;    name = inline_ops[op].format(name)
         //; else:
         //;    insert_pattern(ops[op].format('op_res', name))
         //;    name = '%op_res'

         //; if dest != 'none':
         //;    insert_pattern(patterns[dest].format('dest_res', name))
         //;    name = '%dest_res'

         //; insert_pattern('v1: %res = v_cndmask_b32 0, {}, %b'.format(name))
         //! p_unit_test 0, %res

         /* Set the denormal mode under test for this block. */
         program->blocks[0].fp_mode.denorm32 = cfg.flush ? fp_denorm_flush : fp_denorm_keep;

         Temp val = emit_denorm_srcdest(cfg.src, inputs[0]);
         switch (cfg.op) {
         case denorm_mul1:
            val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f800000u), val);
            break;
         case denorm_fneg:
            val = fneg(val);
            break;
         case denorm_fabs:
            val = fabs(val);
            break;
         case denorm_fnegabs:
            val = fneg(fabs(val));
            break;
         }
         val = emit_denorm_srcdest(cfg.dest, val);
         writeout(
            0, bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]));

         finish_opt_test();
      }
   }
END_TEST
955 
/* Checks that a DPP v_mov_b32 is folded into the VALU instruction that uses
 * its result, including operand swapping and modifier handling, and that the
 * fold is skipped when it would be invalid. */
BEGIN_TEST(optimizer.dpp)
   //>> v1: %a, v1: %b, s2: %c, s1: %d = p_startpgm
   if (!setup_cs("v1 v1 s2 s1", GFX10_3))
      return;

   Operand a(inputs[0]);
   Operand b(inputs[1]);
   Operand c(inputs[2]);
   Operand d(inputs[3]);

   /* basic optimization */
   //! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1
   //! p_unit_test 0, %res0
   Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp0, b);
   writeout(0, res0);

   /* operand swapping */
   /* v_sub_f32 with the DPP value as second operand becomes v_subrev_f32 so
    * the DPP operand can be first. */
   //! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1
   //! p_unit_test 1, %res1
   Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), b, tmp1);
   writeout(1, res1);

   /* No fold when the user already has its own DPP control. */
   //! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1
   //! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1
   //! p_unit_test 2, %res2
   Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), b, tmp2, dpp_row_half_mirror);
   writeout(2, res2);

   /* modifiers */
   /* A neg modifier on the DPP mov is carried into the folded instruction. */
   //! v1: %res3 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
   //! p_unit_test 3, %res3
   auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   tmp3.instr->dpp().neg[0] = true;
   Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp3, b);
   writeout(3, res3);

   /* A neg modifier on the user is preserved when folding. */
   //! v1: %res4 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
   //! p_unit_test 4, %res4
   Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp4, b);
   res4.instr->vop3().neg[0] = true;
   writeout(4, res4);

   /* A clamp modifier on the user prevents the fold. */
   //! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1
   //! v1: %res5 = v_add_f32 %tmp5, %b clamp
   //! p_unit_test 5, %res5
   Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   auto res5 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp5, b);
   res5.instr->vop3().clamp = true;
   writeout(5, res5);

   /* abs on the user overrides the mov's neg modifier after folding. */
   //! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1
   //! p_unit_test 6, %res6
   auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   tmp6.instr->dpp().neg[0] = true;
   auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp6, b);
   res6.instr->vop3().abs[0] = true;
   writeout(6, res6);

   /* Swapping to v_subrev_f32 keeps the abs modifier on the right operand. */
   //! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1
   //! p_unit_test 7, %res7
   Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), b, tmp7);
   res7.instr->vop3().abs[0] = true;
   writeout(7, res7);

   /* vcc */
   /* v_cndmask_b32 reading vcc can still take the DPP fold. */
   //! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1
   //! p_unit_test 8, %res8
   Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res8 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp8, b, c);
   writeout(8, res8);

   /* sgprs */
   /* An SGPR operand in the user prevents the fold. */
   //! v1: %tmp9 = v_mov_b32 %a row_mirror bound_ctrl:1
   //! v1: %res9 = v_add_f32 %tmp9, %d
   //! p_unit_test 9, %res9
   Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res9 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp9, d);
   writeout(9, res9);

   //! v1: %tmp10 = v_mov_b32 %a row_mirror bound_ctrl:1
   //! v1: %res10 = v_add_f32 %d, %tmp10
   //! p_unit_test 10, %res10
   Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res10 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), d, tmp10);
   writeout(10, res10);

   finish_opt_test();
END_TEST
1049 
/* Checks copy propagation of constants and SGPRs into DPP instructions: when
 * the propagated value replaces the DPP-selected operand, the DPP control is
 * dropped; when it replaces the other operand, the DPP control stays. */
BEGIN_TEST(optimize.dpp_prop)
   //>> v1: %a, s1: %b = p_startpgm
   if (!setup_cs("v1 s1", GFX10))
      return;

   /* An inline constant is propagated into the DPP operand; the DPP control
    * is dropped since the operand no longer comes from another lane. */
   //! v1: %one = p_parallelcopy 1
   //! v1: %res0 = v_mul_f32 1, %a
   //! p_unit_test 0, %res0
   Temp one = bld.copy(bld.def(v1), Operand::c32(1));
   writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_sl(1)));

   /* The copy is not propagated into the non-DPP operand here. */
   //! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1
   //! p_unit_test 1, %res1
   writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_sl(1)));

   /* A literal is also propagated into the DPP operand, dropping the DPP. */
   //! v1: %res2 = v_mul_f32 0x12345678, %a
   //! p_unit_test 2, %res2
   Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
   writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1)));

   /* But not into the non-DPP operand. */
   //! v1: %literal2 = p_parallelcopy 0x12345679
   //! v1: %res3 = v_mul_f32 %a, %literal row_shl:1 bound_ctrl:1
   //! p_unit_test 3, %res3
   Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
   writeout(3, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1)));

   /* An SGPR copy is propagated into the DPP operand, dropping the DPP. */
   //! v1: %b_v = p_parallelcopy %b
   //! v1: %res4 = v_mul_f32 %b, %a
   //! p_unit_test 4, %res4
   Temp b_v = bld.copy(bld.def(v1), inputs[1]);
   writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));

   /* But not into the non-DPP operand. */
   //! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1
   //! p_unit_test 5, %res5
   writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_sl(1)));

   /* VOP1 DPP: the SGPR is propagated and the DPP control is dropped. */
   //! v1: %res6 = v_rcp_f32 %b
   //! p_unit_test 6, %res6
   writeout(6, bld.vop1_dpp(aco_opcode::v_rcp_f32, bld.def(v1), b_v, dpp_row_sl(1)));

   finish_opt_test();
END_TEST
1092 
1093