/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include "common/sid.h"

#include <map>
#include <vector>

namespace aco {

struct lower_context {
   Program* program;
   Block* block;
   std::vector<aco_ptr<Instruction>> instructions;
};

/* used by handle_operands() indirectly through Builder::copy */
uint8_t int8_mul_table[512] = {
   0, 20, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9,
   1, 10, 1, 11, 1, 12, 1, 13, 1, 14, 1, 15, 1, 16, 1, 17, 1, 18, 1, 19,
   1, 20, 1, 21, 1, 22, 1, 23, 1, 24, 1, 25, 1, 26, 1, 27, 1, 28, 1, 29,
   1, 30, 1, 31, 1, 32, 1, 33, 1, 34, 1, 35, 1, 36, 1, 37, 1, 38, 1, 39,
   1, 40, 1, 41, 1, 42, 1, 43, 1, 44, 1, 45, 1, 46, 1, 47, 1, 48, 1, 49,
   1, 50, 1, 51, 1, 52, 1, 53, 1, 54, 1, 55, 1, 56, 1, 57, 1, 58, 1, 59,
   1, 60, 1, 61, 1, 62, 1, 63, 1, 64, 5, 13, 2, 33, 17, 19, 2, 34, 3, 23,
   2, 35, 11, 53, 2, 36, 7, 47, 2, 37, 3, 25, 2, 38, 7, 11, 2, 39, 53, 243,
   2, 40, 3, 27, 2, 41, 17, 35, 2, 42, 5, 17, 2, 43, 3, 29, 2, 44, 15, 23,
   2, 45, 7, 13, 2, 46, 3, 31, 2, 47, 5, 19, 2, 48, 19, 59, 2, 49, 3, 33,
   2, 50, 7, 51, 2, 51, 15, 41, 2, 52, 3, 35, 2, 53, 11, 33, 2, 54, 23, 27,
   2, 55, 3, 37, 2, 56, 9, 41, 2, 57, 5, 23, 2, 58, 3, 39, 2, 59, 7, 17,
   2, 60, 9, 241, 2, 61, 3, 41, 2, 62, 5, 25, 2, 63, 35, 245, 2, 64, 3, 43,
   5, 26, 9, 43, 3, 44, 7, 19, 10, 39, 3, 45, 4, 34, 11, 59, 3, 46, 9, 243,
   4, 35, 3, 47, 22, 53, 7, 57, 3, 48, 5, 29, 10, 245, 3, 49, 4, 37, 9, 45,
   3, 50, 7, 241, 4, 38, 3, 51, 7, 22, 5, 31, 3, 52, 7, 59, 7, 242, 3, 53,
   4, 40, 7, 23, 3, 54, 15, 45, 4, 41, 3, 55, 6, 241, 9, 47, 3, 56, 13, 13,
   5, 34, 3, 57, 4, 43, 11, 39, 3, 58, 5, 35, 4, 44, 3, 59, 6, 243, 7, 245,
   3, 60, 5, 241, 7, 26, 3, 61, 4, 46, 5, 37, 3, 62, 11, 17, 4, 47, 3, 63,
   5, 38, 5, 243, 3, 64, 7, 247, 9, 50, 5, 39, 4, 241, 33, 37, 6, 33, 13, 35,
   4, 242, 5, 245, 6, 247, 7, 29, 4, 51, 5, 41, 5, 246, 7, 249, 3, 240, 11, 19,
   5, 42, 3, 241, 4, 245, 25, 29, 3, 242, 5, 43, 4, 246, 3, 243, 17, 58, 17, 43,
   3, 244, 5, 249, 6, 37, 3, 245, 2, 240, 5, 45, 2, 241, 21, 23, 2, 242, 3, 247,
   2, 243, 5, 251, 2, 244, 29, 61, 2, 245, 3, 249, 2, 246, 17, 29, 2, 247, 9, 55,
   1, 240, 1, 241, 1, 242, 1, 243, 1, 244, 1, 245, 1, 246, 1, 247, 1, 248, 1, 249,
   1, 250, 1, 251, 1, 252, 1, 253, 1, 254, 1, 255};
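
/* How the table is used (see copy_constant() below): for a byte value that is
 * not representable as an inline constant, int8_mul_table[val * 2] and
 * int8_mul_table[val * 2 + 1] give two factors that are both inline constants
 * once entries with the high bit set are sign-extended (240..255 encode
 * -16..-1). Their v_mul_u32_u24 product then reproduces val in the low byte,
 * e.g. for val == 79: 53 * sext(243) = 53 * -13 = -689 = 0xfffffd4f, whose
 * low byte is 0x4f == 79. */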

aco_opcode
get_reduce_opcode(chip_class chip, ReduceOp op)
{
   /* Because some 16-bit instructions are already VOP3 on GFX10, we use the
    * 32-bit opcodes (VOP2) instead, which lets us remove the temporary VGPR
    * and use DPP with the arithmetic instructions. This requires the inputs
    * to be sign-extended.
    */
   switch (op) {
   case iadd8:
   case iadd16:
      if (chip >= GFX10) {
         return aco_opcode::v_add_u32;
      } else if (chip >= GFX8) {
         return aco_opcode::v_add_u16;
      } else {
         return aco_opcode::v_add_co_u32;
      }
      break;
   case imul8:
   case imul16:
      if (chip >= GFX10) {
         return aco_opcode::v_mul_lo_u16_e64;
      } else if (chip >= GFX8) {
         return aco_opcode::v_mul_lo_u16;
      } else {
         return aco_opcode::v_mul_u32_u24;
      }
      break;
   case fadd16: return aco_opcode::v_add_f16;
   case fmul16: return aco_opcode::v_mul_f16;
   case imax8:
   case imax16:
      if (chip >= GFX10) {
         return aco_opcode::v_max_i32;
      } else if (chip >= GFX8) {
         return aco_opcode::v_max_i16;
      } else {
         return aco_opcode::v_max_i32;
      }
      break;
   case imin8:
   case imin16:
      if (chip >= GFX10) {
         return aco_opcode::v_min_i32;
      } else if (chip >= GFX8) {
         return aco_opcode::v_min_i16;
      } else {
         return aco_opcode::v_min_i32;
      }
      break;
   case umin8:
   case umin16:
      if (chip >= GFX10) {
         return aco_opcode::v_min_u32;
      } else if (chip >= GFX8) {
         return aco_opcode::v_min_u16;
      } else {
         return aco_opcode::v_min_u32;
      }
      break;
   case umax8:
   case umax16:
      if (chip >= GFX10) {
         return aco_opcode::v_max_u32;
      } else if (chip >= GFX8) {
         return aco_opcode::v_max_u16;
      } else {
         return aco_opcode::v_max_u32;
      }
      break;
   case fmin16: return aco_opcode::v_min_f16;
   case fmax16: return aco_opcode::v_max_f16;
   case iadd32: return chip >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;
   case imul32: return aco_opcode::v_mul_lo_u32;
   case fadd32: return aco_opcode::v_add_f32;
   case fmul32: return aco_opcode::v_mul_f32;
   case imax32: return aco_opcode::v_max_i32;
   case imin32: return aco_opcode::v_min_i32;
   case umin32: return aco_opcode::v_min_u32;
   case umax32: return aco_opcode::v_max_u32;
   case fmin32: return aco_opcode::v_min_f32;
   case fmax32: return aco_opcode::v_max_f32;
   case iand8:
   case iand16:
   case iand32: return aco_opcode::v_and_b32;
   case ixor8:
   case ixor16:
   case ixor32: return aco_opcode::v_xor_b32;
   case ior8:
   case ior16:
   case ior32: return aco_opcode::v_or_b32;
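   /* The 64-bit integer reductions below have no single VALU opcode;
    * aco_opcode::num_opcodes is returned as a sentinel and the operation is
    * expanded by emit_int64_op()/emit_int64_dpp_op() instead. */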
   case iadd64: return aco_opcode::num_opcodes;
   case imul64: return aco_opcode::num_opcodes;
   case fadd64: return aco_opcode::v_add_f64;
   case fmul64: return aco_opcode::v_mul_f64;
   case imin64: return aco_opcode::num_opcodes;
   case imax64: return aco_opcode::num_opcodes;
   case umin64: return aco_opcode::num_opcodes;
   case umax64: return aco_opcode::num_opcodes;
   case fmin64: return aco_opcode::v_min_f64;
   case fmax64: return aco_opcode::v_max_f64;
   case iand64: return aco_opcode::num_opcodes;
   case ior64: return aco_opcode::num_opcodes;
   case ixor64: return aco_opcode::num_opcodes;
   default: return aco_opcode::num_opcodes;
   }
}

bool
is_vop3_reduce_opcode(aco_opcode opcode)
{
   /* 64-bit reductions are VOP3. */
   if (opcode == aco_opcode::num_opcodes)
      return true;

   return instr_info.format[(int)opcode] == Format::VOP3;
}

void
emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)
{
   Instruction* instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);
   if (instr->definitions.size() >= 2) {
      assert(instr->definitions[1].regClass() == bld.lm);
      instr->definitions[1].setFixed(vcc);
   }
}

void
emit_int64_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
                  PhysReg vtmp_reg, ReduceOp op, unsigned dpp_ctrl, unsigned row_mask,
                  unsigned bank_mask, bool bound_ctrl, Operand* identity = NULL)
{
   Builder bld(ctx->program, &ctx->instructions);
   Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
   Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg + 1}, v1)};
   Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg + 1}, v1)};
   Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
   Operand src1_64 = Operand(src1_reg, v2);
   Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg + 1}, v1)};
   Operand vtmp_op64 = Operand(vtmp_reg, v2);
   if (op == iadd64) {
      if (ctx->program->chip_class >= GFX10) {
         if (identity)
            bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
         bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                      bound_ctrl);
         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]);
      } else {
         bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0],
                      dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      }
      bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],
                   Operand(vcc, bld.lm), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   } else if (op == iand64) {
      bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
   } else if (op == ior64) {
      bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
   } else if (op == ixor64) {
      bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
   } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
      aco_opcode cmp = aco_opcode::num_opcodes;
      switch (op) {
      case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
      case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
      case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
      case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
      default: break;
      }

      if (identity) {
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]);
      }
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);

      bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64);
      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));
      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));
   } else if (op == imul64) {
      /* t4 = dpp(x_hi)
       * t1 = umul_lo(t4, y_lo)
       * t3 = dpp(x_lo)
       * t0 = umul_lo(t3, y_hi)
       * t2 = iadd(t0, t1)
       * t5 = umul_hi(t3, y_lo)
       * res_hi = iadd(t2, t5)
       * res_lo = umul_lo(t3, y_lo)
       * Requires that res_hi != src0[0] and res_hi != src1[0]
       * and that vtmp[0] != res_hi.
       */
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]);
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]);
      emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]);
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]);
      emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]);
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]);
   }
}

void
emit_int64_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
              ReduceOp op)
{
   Builder bld(ctx->program, &ctx->instructions);
   Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
   RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1;
   Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg + 1}, src0_rc)};
   Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
   Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2);
   Operand src1_64 = Operand(src1_reg, v2);

   if (src0_rc == s1 &&
       (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) {
      assert(vtmp.reg() != 0);
      bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]);
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
      src0_reg = vtmp;
      src0[0] = Operand(vtmp, v1);
      src0[1] = Operand(PhysReg{vtmp + 1}, v1);
      src0_64 = Operand(vtmp, v2);
   } else if (src0_rc == s1 && op == iadd64) {
      assert(vtmp.reg() != 0);
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
      src0[1] = Operand(PhysReg{vtmp + 1}, v1);
   }

   if (op == iadd64) {
      if (ctx->program->chip_class >= GFX10) {
         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
      } else {
         bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
      }
      bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],
               Operand(vcc, bld.lm));
   } else if (op == iand64) {
      bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);
      bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);
   } else if (op == ior64) {
      bld.vop2(aco_opcode::v_or_b32, dst[0], src0[0], src1[0]);
      bld.vop2(aco_opcode::v_or_b32, dst[1], src0[1], src1[1]);
   } else if (op == ixor64) {
      bld.vop2(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0]);
      bld.vop2(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1]);
   } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
      aco_opcode cmp = aco_opcode::num_opcodes;
      switch (op) {
      case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
      case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
      case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
      case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
      default: break;
      }

      bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64);
      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));
      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));
   } else if (op == imul64) {
      if (src1_reg == dst_reg) {
         /* it's fine if src0==dst but not if src1==dst */
         std::swap(src0_reg, src1_reg);
         std::swap(src0[0], src1[0]);
         std::swap(src0[1], src1[1]);
         std::swap(src0_64, src1_64);
      }
      assert(!(src0_reg == src1_reg));
      /* t1 = umul_lo(x_hi, y_lo)
       * t0 = umul_lo(x_lo, y_hi)
       * t2 = iadd(t0, t1)
       * t5 = umul_hi(x_lo, y_lo)
       * res_hi = iadd(t2, t5)
       * res_lo = umul_lo(x_lo, y_lo)
       * assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp
       */
      Definition tmp0_def(PhysReg{src0_reg + 1}, v1);
      Definition tmp1_def(PhysReg{src1_reg + 1}, v1);
      Operand tmp0_op = src0[1];
      Operand tmp1_op = src1[1];
      bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]);
      bld.vop3(aco_opcode::v_mul_lo_u32, tmp1_def, src0[0], src1[1]);
      emit_vadd32(bld, tmp0_def, tmp1_op, tmp0_op);
      bld.vop3(aco_opcode::v_mul_hi_u32, tmp1_def, src0[0], src1[0]);
      emit_vadd32(bld, dst[1], tmp0_op, tmp1_op);
      bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], src0[0], src1[0]);
   }
}

void
emit_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
            ReduceOp op, unsigned size, unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask,
            bool bound_ctrl, Operand* identity = NULL) /* for VOP3 with sparse writes */
{
   Builder bld(ctx->program, &ctx->instructions);
   RegClass rc = RegClass(RegType::vgpr, size);
   Definition dst(dst_reg, rc);
   Operand src0(src0_reg, rc);
   Operand src1(src1_reg, rc);

   aco_opcode opcode = get_reduce_opcode(ctx->program->chip_class, op);
   bool vop3 = is_vop3_reduce_opcode(opcode);

   if (!vop3) {
      if (opcode == aco_opcode::v_add_co_u32)
         bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask,
                      bound_ctrl);
      else
         bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      return;
   }

   if (opcode == aco_opcode::num_opcodes) {
      emit_int64_dpp_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op, dpp_ctrl, row_mask, bank_mask,
                        bound_ctrl, identity);
      return;
   }

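   /* The remaining opcodes are VOP3, which cannot encode DPP directly: move
    * src0 into vtmp with DPP movs first, then run the plain VOP3 on vtmp. The
    * optional identity pre-fill covers lanes that the masked/bound_ctrl DPP
    * movs leave unwritten (the "sparse writes" mentioned above). */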
   if (identity)
      bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]);
   if (identity && size >= 2)
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), identity[1]);

   for (unsigned i = 0; i < size; i++)
      bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
                   Operand(PhysReg{src0_reg + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);

   bld.vop3(opcode, dst, Operand(vtmp, rc), src1);
}

void
emit_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
        ReduceOp op, unsigned size)
{
   Builder bld(ctx->program, &ctx->instructions);
   RegClass rc = RegClass(RegType::vgpr, size);
   Definition dst(dst_reg, rc);
   Operand src0(src0_reg, RegClass(src0_reg.reg() >= 256 ? RegType::vgpr : RegType::sgpr, size));
   Operand src1(src1_reg, rc);

   aco_opcode opcode = get_reduce_opcode(ctx->program->chip_class, op);
   bool vop3 = is_vop3_reduce_opcode(opcode);

   if (opcode == aco_opcode::num_opcodes) {
      emit_int64_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op);
      return;
   }

   if (vop3) {
      bld.vop3(opcode, dst, src0, src1);
   } else if (opcode == aco_opcode::v_add_co_u32) {
      bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1);
   } else {
      bld.vop2(opcode, dst, src0, src1);
   }
}

void
emit_dpp_mov(lower_context* ctx, PhysReg dst, PhysReg src0, unsigned size, unsigned dpp_ctrl,
             unsigned row_mask, unsigned bank_mask, bool bound_ctrl)
{
   Builder bld(ctx->program, &ctx->instructions);
   for (unsigned i = 0; i < size; i++) {
      bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst + i}, v1),
                   Operand(PhysReg{src0 + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   }
}

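/* A sketch of the lane swizzles used below, assuming the usual DS_SWIZZLE
 * encoding: ds_pattern_bitmode(and_mask, or_mask, xor_mask) gives lane i the
 * data of lane ((i & and_mask) | or_mask) ^ xor_mask within each group of 32
 * lanes, so e.g. ds_pattern_bitmode(0x1f, 0x00, 0x04) exchanges data between
 * lanes i and i ^ 4. Setting bit 15 selects quad-perm mode instead, where
 * dpp_quad_perm(a, b, c, d) permutes every group of four lanes. */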
void
emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern)
{
   for (unsigned i = 0; i < size; i++) {
      bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst + i}, v1),
             Operand(PhysReg{src + i}, v1), ds_pattern);
   }
}

void
emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size,
               PhysReg tmp, PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
{
   assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce);
   assert(cluster_size <= ctx->program->wave_size);

   Builder bld(ctx->program, &ctx->instructions);

   Operand identity[2];
   identity[0] = Operand::c32(get_reduction_identity(reduce_op, 0));
   identity[1] = Operand::c32(get_reduction_identity(reduce_op, 1));
   Operand vcndmask_identity[2] = {identity[0], identity[1]};

   /* First, copy the source to tmp and set inactive lanes to the identity */
   bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1),
            Definition(exec, bld.lm), Operand::c64(UINT64_MAX), Operand(exec, bld.lm));

   for (unsigned i = 0; i < src.size(); i++) {
      /* p_exclusive_scan needs it to be an SGPR or inline constant for the v_writelane_b32,
       * except on GFX10, where v_writelane_b32 can take a literal. */
      if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan &&
          ctx->program->chip_class < GFX10) {
         bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp + i}, s1), identity[i]);
         identity[i] = Operand(PhysReg{sitmp + i}, s1);

         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
         vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
      } else if (identity[i].isLiteral()) {
         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
         vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
      }
   }

   for (unsigned i = 0; i < src.size(); i++) {
      bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
                   vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
                   Operand(stmp, bld.lm));
   }

   if (src.regClass() == v1b) {
      if (ctx->program->chip_class >= GFX8) {
         aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(
            aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
         sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
         sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
         bool sext = reduce_op == imin8 || reduce_op == imax8;
         sdwa->sel[0] = SubdwordSel(1, 0, sext);
         sdwa->dst_sel = SubdwordSel::dword;
         bld.insert(std::move(sdwa));
      } else {
         aco_opcode opcode;

         if (reduce_op == imin8 || reduce_op == imax8)
            opcode = aco_opcode::v_bfe_i32;
         else
            opcode = aco_opcode::v_bfe_u32;

         bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
                  Operand::c32(8u));
      }
   } else if (src.regClass() == v2b) {
      if (ctx->program->chip_class >= GFX10 &&
          (reduce_op == iadd16 || reduce_op == imax16 || reduce_op == imin16 ||
           reduce_op == umin16 || reduce_op == umax16)) {
         aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(
            aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
         sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
         sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
         bool sext = reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16;
         sdwa->sel[0] = SubdwordSel(2, 0, sext);
         sdwa->dst_sel = SubdwordSel::dword;
         bld.insert(std::move(sdwa));
      } else if (ctx->program->chip_class == GFX6 || ctx->program->chip_class == GFX7) {
         aco_opcode opcode;

         if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16)
            opcode = aco_opcode::v_bfe_i32;
         else
            opcode = aco_opcode::v_bfe_u32;

         bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
                  Operand::c32(16u));
      }
   }

   bool reduction_needs_last_op = false;
   switch (op) {
   case aco_opcode::p_reduce:
      if (cluster_size == 1)
         break;

      if (ctx->program->chip_class <= GFX7) {
         reduction_needs_last_op = true;
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2));
         if (cluster_size == 2)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1));
         if (cluster_size == 4)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04));
         if (cluster_size == 8)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08));
         if (cluster_size == 16)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
         if (cluster_size == 32)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         for (unsigned i = 0; i < src.size(); i++)
            bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
                         Operand::zero());
         // TODO: it would be more efficient to do the last reduction step on SALU
         emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
         reduction_needs_last_op = false;
         break;
      }

      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf,
                  0xf, false);
      if (cluster_size == 2)
         break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf,
                  0xf, false);
      if (cluster_size == 4)
         break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf,
                  false);
      if (cluster_size == 8)
         break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false);
      if (cluster_size == 16)
         break;

      if (ctx->program->chip_class >= GFX10) {
         /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
                     Operand(PhysReg{tmp + i}, v1), Operand::zero(), Operand::zero());

         if (cluster_size == 32) {
            reduction_needs_last_op = true;
            break;
         }

         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
         for (unsigned i = 0; i < src.size(); i++)
            bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
                         Operand::zero());
         // TODO: it would be more efficient to do the last reduction step on SALU
         emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
         break;
      }

      if (cluster_size == 32) {
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
         reduction_needs_last_op = true;
         break;
      }
      assert(cluster_size == 64);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
                  false);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
                  false);
      break;
   case aco_opcode::p_exclusive_scan:
      if (ctx->program->chip_class >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
         /* shift rows right */
         emit_dpp_mov(ctx, vtmp, tmp, src.size(), dpp_row_sr(1), 0xf, 0xf, true);

         /* fill in the gaps in rows 1 and 3 */
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x10000u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand::c32(0x10000u));
         for (unsigned i = 0; i < src.size(); i++) {
            Instruction* perm =
               bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
                        Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
                        Operand::c32(0xffffffffu))
                  .instr;
            perm->vop3().opsel = 1; /* FI (Fetch Inactive) */
         }
         bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand::c64(UINT64_MAX));

         if (ctx->program->wave_size == 64) {
            /* fill in the gap in row 2 */
            for (unsigned i = 0; i < src.size(); i++) {
               bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
                            Operand::c32(31u));
               bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
                             Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
            }
         }
         std::swap(tmp, vtmp);
      } else if (ctx->program->chip_class >= GFX8) {
         emit_dpp_mov(ctx, tmp, tmp, src.size(), dpp_wf_sr1, 0xf, 0xf, true);
      } else {
         // TODO: use LDS on CS with a single write and shifted read
         /* wavefront shift_right by 1 on SI/CI */
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
         emit_ds_swizzle(bld, tmp, tmp, src.size(),
                         ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x10101010u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
                     Operand(PhysReg{tmp + i}, v1));

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, tmp, tmp, src.size(),
                         ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x01000100u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
                     Operand(PhysReg{tmp + i}, v1));

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, tmp, tmp, src.size(),
                         ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(1u),
                  Operand::c32(16u));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(1u),
                  Operand::c32(16u));
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
                     Operand(PhysReg{tmp + i}, v1));

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         for (unsigned i = 0; i < src.size(); i++) {
            bld.writelane(Definition(PhysReg{vtmp + i}, v1), identity[i], Operand::zero(),
                          Operand(PhysReg{vtmp + i}, v1));
            bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
                         Operand::zero());
            bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
                          Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
            identity[i] = Operand::zero(); /* prevent further uses of identity */
         }
         std::swap(tmp, vtmp);
      }

      for (unsigned i = 0; i < src.size(); i++) {
         if (!identity[i].isConstant() ||
             identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
            if (ctx->program->chip_class < GFX10)
               assert((identity[i].isConstant() && !identity[i].isLiteral()) ||
                      identity[i].physReg() == PhysReg{sitmp + i});
            bld.writelane(Definition(PhysReg{tmp + i}, v1), identity[i], Operand::zero(),
                          Operand(PhysReg{tmp + i}, v1));
         }
      }
      FALLTHROUGH;
   case aco_opcode::p_inclusive_scan:
      assert(cluster_size == ctx->program->wave_size);
      if (ctx->program->chip_class <= GFX7) {
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1e, 0x00, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xAAAAAAAAu));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1c, 0x01, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xCCCCCCCCu));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x18, 0x03, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xF0F0F0F0u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x10, 0x07, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xFF00FF00u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x00, 0x0f, 0x00));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(16u),
                  Operand::c32(16u));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(16u),
                  Operand::c32(16u));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         for (unsigned i = 0; i < src.size(); i++)
            bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
                         Operand::c32(31u));
         bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u),
                  Operand::c32(32u));
         emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
         break;
      }

      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(1), 0xf, 0xf, false,
                  identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(2), 0xf, 0xf, false,
                  identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(4), 0xf, 0xf, false,
                  identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(8), 0xf, 0xf, false,
                  identity);
      if (ctx->program->chip_class >= GFX10) {
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(16u),
                  Operand::c32(16u));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(16u),
                  Operand::c32(16u));
         for (unsigned i = 0; i < src.size(); i++) {
            Instruction* perm =
               bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
                        Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
                        Operand::c32(0xffffffffu))
                  .instr;
            perm->vop3().opsel = 1; /* FI (Fetch Inactive) */
         }
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         if (ctx->program->wave_size == 64) {
            bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u),
                     Operand::c32(32u));
            for (unsigned i = 0; i < src.size(); i++)
               bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
                            Operand::c32(31u));
            emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
         }
      } else {
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
                     false, identity);
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
                     false, identity);
      }
      break;
   default: unreachable("Invalid reduction mode");
   }

   if (op == aco_opcode::p_reduce) {
      if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) {
         bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
         emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size());
         return;
      }

      if (reduction_needs_last_op)
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
   }

   /* restore exec */
   bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));

   if (dst.regClass().type() == RegType::sgpr) {
      for (unsigned k = 0; k < src.size(); k++) {
         bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1), Operand(PhysReg{tmp + k}, v1),
                      Operand::c32(ctx->program->wave_size - 1));
      }
   } else if (dst.physReg() != tmp) {
      for (unsigned k = 0; k < src.size(); k++) {
         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1),
                  Operand(PhysReg{tmp + k}, v1));
      }
   }
}

void
emit_gfx10_wave64_bpermute(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
{
   /* Emulates proper bpermute on GFX10 in wave64 mode.
    *
    * This is necessary because on GFX10 the bpermute instruction only works
    * on half waves (you can think of it as having a cluster size of 32), so we
    * manually swap the data between the two halves using two shared VGPRs.
    */

   assert(program->chip_class >= GFX10);
   assert(program->wave_size == 64);

   unsigned shared_vgpr_reg_0 = align(program->config->num_vgprs, 4) + 256;
   Definition dst = instr->definitions[0];
   Definition tmp_exec = instr->definitions[1];
   Definition clobber_scc = instr->definitions[2];
   Operand index_x4 = instr->operands[0];
   Operand input_data = instr->operands[1];
   Operand same_half = instr->operands[2];

   assert(dst.regClass() == v1);
   assert(tmp_exec.regClass() == bld.lm);
   assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
   assert(same_half.regClass() == bld.lm);
   assert(index_x4.regClass() == v1);
   assert(input_data.regClass().type() == RegType::vgpr);
   assert(input_data.bytes() <= 4);
   assert(dst.physReg() != index_x4.physReg());
   assert(dst.physReg() != input_data.physReg());
   assert(tmp_exec.physReg() != same_half.physReg());

   PhysReg shared_vgpr_lo(shared_vgpr_reg_0);
   PhysReg shared_vgpr_hi(shared_vgpr_reg_0 + 1);

   /* Permute the input within the same half-wave */
   bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);

   /* HI: Copy data from high lanes 32-63 to shared vgpr */
   bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data,
                dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
   /* Save EXEC */
   bld.sop1(aco_opcode::s_mov_b64, tmp_exec, Operand(exec, s2));
   /* Set EXEC to enable LO lanes only */
   bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u), Operand::zero());
   /* LO: Copy data from low lanes 0-31 to shared vgpr */
   bld.vop1(aco_opcode::v_mov_b32, Definition(shared_vgpr_lo, v1), input_data);
   /* LO: bpermute shared vgpr (high lanes' data) */
   bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4,
          Operand(shared_vgpr_hi, v1));
   /* Set EXEC to enable HI lanes only */
   bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u), Operand::c32(32u));
   /* HI: bpermute shared vgpr (low lanes' data) */
   bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4,
          Operand(shared_vgpr_lo, v1));

   /* Only enable lanes which use the other half's data */
   bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc,
            Operand(tmp_exec.physReg(), s2), same_half);
   /* LO: Copy shared vgpr (high lanes' bpermuted data) to output vgpr */
   bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3),
                0x3, 0xf, false);
   /* HI: Copy shared vgpr (low lanes' bpermuted data) to output vgpr */
   bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3),
                0xc, 0xf, false);

   /* Restore saved EXEC */
   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));

   /* RA assumes that the result is always in the low part of the register, so we have to shift
    * if it's not there already. */
   if (input_data.physReg().byte()) {
      unsigned right_shift = input_data.physReg().byte() * 8;
      bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand::c32(right_shift),
               Operand(dst.physReg(), v1));
   }
}

void
emit_gfx6_bpermute(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
{
   /* Emulates bpermute using readlane instructions */

   Operand index = instr->operands[0];
   Operand input = instr->operands[1];
   Definition dst = instr->definitions[0];
   Definition temp_exec = instr->definitions[1];
   Definition clobber_vcc = instr->definitions[2];

   assert(dst.regClass() == v1);
   assert(temp_exec.regClass() == bld.lm);
   assert(clobber_vcc.regClass() == bld.lm);
   assert(clobber_vcc.physReg() == vcc);
   assert(index.regClass() == v1);
   assert(index.physReg() != dst.physReg());
   assert(input.regClass().type() == RegType::vgpr);
   assert(input.bytes() <= 4);
   assert(input.physReg() != dst.physReg());

   /* Save original EXEC */
   bld.sop1(aco_opcode::s_mov_b64, temp_exec, Operand(exec, s2));

   /* An "unrolled loop" that is executed once for each lane.
    * This takes only a few instructions per lane, as opposed to a "real" loop
    * with branching, where the branch instruction alone would take 16+ cycles.
    */
   for (unsigned n = 0; n < program->wave_size; ++n) {
      /* Activate the lane which has N for its source index */
      bld.vopc(aco_opcode::v_cmpx_eq_u32, Definition(exec, bld.lm), clobber_vcc, Operand::c32(n),
               index);
      /* Read the data from lane N */
      bld.readlane(Definition(vcc, s1), input, Operand::c32(n));
      /* On the active lane, move the data we read from lane N to the destination VGPR */
      bld.vop1(aco_opcode::v_mov_b32, dst, Operand(vcc, s1));
      /* Restore original EXEC */
      bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(temp_exec.physReg(), s2));
   }
}

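/* Judging from the uses in do_copy()/do_swap() below, uses[i] counts how many
 * pending copies still read byte i of this copy's definition, so a byte may
 * only be overwritten once its count drops to zero; is_used aliases the whole
 * array for a quick "any byte still in use" test. */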
struct copy_operation {
   Operand op;
   Definition def;
   unsigned bytes;
   union {
      uint8_t uses[8];
      uint64_t is_used = 0;
   };
};

void
split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op,
           const copy_operation& src, bool ignore_uses, unsigned max_size)
{
   PhysReg def_reg = src.def.physReg();
   PhysReg op_reg = src.op.physReg();
   def_reg.reg_b += offset;
   op_reg.reg_b += offset;

   /* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10 */
   if (ctx->program->chip_class < GFX10 && src.def.regClass().type() == RegType::vgpr)
      max_size = MIN2(max_size, 4);
   unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;

   /* make sure the size is a power of two and reg % bytes == 0 */
   unsigned bytes = 1;
   for (; bytes <= max_size; bytes *= 2) {
      unsigned next = bytes * 2u;
      bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 &&
                          offset + next <= src.bytes && next <= max_size;
      if (!src.op.isConstant() && can_increase)
         can_increase = op_reg.reg_b % MIN2(next, max_align) == 0;
      for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
         can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
      if (!can_increase)
         break;
   }

   *def = Definition(src.def.tempId(), def_reg, src.def.regClass().resize(bytes));
   if (src.op.isConstant()) {
      assert(bytes >= 1 && bytes <= 8);
      uint64_t val = src.op.constantValue64() >> (offset * 8u);
      *op = Operand::get_const(ctx->program->chip_class, val, bytes);
   } else {
      RegClass op_cls = src.op.regClass().resize(bytes);
      *op = Operand(op_reg, op_cls);
      op->setTemp(Temp(src.op.tempId(), op_cls));
   }
}

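/* Worked example: byte ranges a = [4, 8) and b = [6, 10) overlap in bytes
 * 6..7, which are bytes 2..3 relative to a, so the function returns 0b1100. */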
uint32_t
get_intersection_mask(int a_start, int a_size, int b_start, int b_size)
{
   int intersection_start = MAX2(b_start - a_start, 0);
   int intersection_end = MAX2(b_start + b_size - a_start, 0);
   if (intersection_start >= a_size || intersection_end == 0)
      return 0;

   uint32_t mask = u_bit_consecutive(0, a_size);
   return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
}

void
copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
{
   assert(op.bytes() == dst.bytes());

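   /* For 32-bit literals, first try to materialize the constant without a
    * literal dword: s_movk_i32 covers sign-extended 16-bit values (e.g.
    * 0xffff8123), s_brev_b32/v_bfrev_b32 cover bit-reversed inline constants
    * (e.g. 0x40000000 == brev(2)), and s_bfm_b32 covers contiguous bit masks
    * (e.g. 0x00ffff00 == bfm(16, 8)). */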
   if (dst.bytes() == 4 && op.isLiteral()) {
      uint32_t imm = op.constantValue();
      if (dst.regClass() == s1 && (imm >= 0xffff8000 || imm <= 0x7fff)) {
         bld.sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu);
         return;
      } else if (util_bitreverse(imm) <= 64 || util_bitreverse(imm) >= 0xFFFFFFF0) {
         uint32_t rev = util_bitreverse(imm);
         if (dst.regClass() == s1)
            bld.sop1(aco_opcode::s_brev_b32, dst, Operand::c32(rev));
         else
            bld.vop1(aco_opcode::v_bfrev_b32, dst, Operand::c32(rev));
         return;
      } else if (dst.regClass() == s1 && imm != 0) {
         unsigned start = (ffs(imm) - 1) & 0x1f;
         unsigned size = util_bitcount(imm) & 0x1f;
         if ((((1u << size) - 1u) << start) == imm) {
            bld.sop2(aco_opcode::s_bfm_b32, dst, Operand::c32(size), Operand::c32(start));
            return;
         }
      }
   }

   if (op.bytes() == 4 && op.constantEquals(0x3e22f983) && ctx->program->chip_class >= GFX8)
      op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */

   if (dst.regClass() == s1) {
      bld.sop1(aco_opcode::s_mov_b32, dst, op);
   } else if (dst.regClass() == s2) {
      /* s_ashr_i64 writes SCC, so we can't use it */
      assert(Operand::is_constant_representable(op.constantValue64(), 8, true, false));
      bld.sop1(aco_opcode::s_mov_b64, dst, op);
   } else if (dst.regClass() == v2) {
      if (Operand::is_constant_representable(op.constantValue64(), 8, true, false)) {
         bld.vop3(aco_opcode::v_lshrrev_b64, dst, Operand::zero(), op);
      } else {
         assert(Operand::is_constant_representable(op.constantValue64(), 8, false, true));
         bld.vop3(aco_opcode::v_ashrrev_i64, dst, Operand::zero(), op);
      }
   } else if (dst.regClass() == v1) {
      bld.vop1(aco_opcode::v_mov_b32, dst, op);
   } else {
      assert(dst.regClass() == v1b || dst.regClass() == v2b);

      if (dst.regClass() == v1b && ctx->program->chip_class >= GFX9) {
         uint8_t val = op.constantValue();
         Operand op32 = Operand::c32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u));
         if (op32.isLiteral()) {
            uint32_t a = (uint32_t)int8_mul_table[val * 2];
            uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1];
            bld.vop2_sdwa(aco_opcode::v_mul_u32_u24, dst,
                          Operand::c32(a | (a & 0x80u ? 0xffffff00u : 0x0u)),
                          Operand::c32(b | (b & 0x80u ? 0xffffff00u : 0x0u)));
         } else {
            bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op32);
         }
      } else if (dst.regClass() == v2b && ctx->program->chip_class >= GFX9 && !op.isLiteral()) {
         if (op.constantValue() >= 0xfff0 || op.constantValue() <= 64) {
            /* use v_mov_b32 to avoid possible issues with denormal flushing or
             * NaN. v_add_f16 is still needed for float constants. */
            uint32_t val32 = (int32_t)(int16_t)op.constantValue();
            bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, Operand::c32(val32));
         } else {
            bld.vop2_sdwa(aco_opcode::v_add_f16, dst, op, Operand::zero());
         }
      } else if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10 &&
                 (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in)) {
         if (dst.physReg().byte() == 2) {
            Operand def_lo(dst.physReg().advance(-2), v2b);
            Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, def_lo, op);
            instr->vop3().opsel = 0;
         } else {
            assert(dst.physReg().byte() == 0);
            Operand def_hi(dst.physReg().advance(2), v2b);
            Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, op, def_hi);
            instr->vop3().opsel = 2;
         }
      } else {
         uint32_t offset = dst.physReg().byte() * 8u;
         uint32_t mask = ((1u << (dst.bytes() * 8)) - 1) << offset;
         uint32_t val = (op.constantValue() << offset) & mask;
         dst = Definition(PhysReg(dst.physReg().reg()), v1);
         Operand def_op(dst.physReg(), v1);
         if (val != mask)
            bld.vop2(aco_opcode::v_and_b32, dst, Operand::c32(~mask), def_op);
         if (val != 0)
            bld.vop2(aco_opcode::v_or_b32, dst, Operand::c32(val), def_op);
      }
   }
}

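/* Linear VGPRs need their value in every lane, so the copy below is performed
 * twice: once under the current exec mask and once under its complement (the
 * s_not flips exec each iteration and restores it after the second one). */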
void
copy_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr)
{
   if (preserve_scc)
      bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));

   for (unsigned i = 0; i < 2; i++) {
      if (def.size() == 2)
         bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
      else
         bld.vop1(aco_opcode::v_mov_b32, def, op);

      bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
               Operand(exec, bld.lm));
   }

   if (preserve_scc)
      bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
               Operand::zero());
}

void
swap_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr)
{
   if (preserve_scc)
      bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));

   Operand def_as_op = Operand(def.physReg(), def.regClass());
   Definition op_as_def = Definition(op.physReg(), op.regClass());

   for (unsigned i = 0; i < 2; i++) {
      if (bld.program->chip_class >= GFX9) {
         bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
      } else {
         bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
         bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
         bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
      }

      bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
               Operand(exec, bld.lm));
   }

   if (preserve_scc)
      bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
               Operand::zero());
}

bool
do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc,
        PhysReg scratch_sgpr)
{
   bool did_copy = false;
   for (unsigned offset = 0; offset < copy.bytes;) {
      if (copy.uses[offset]) {
         offset++;
         continue;
      }

      Definition def;
      Operand op;
      split_copy(ctx, offset, &def, &op, copy, false, 8);

      if (def.physReg() == scc) {
         bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand::zero());
         *preserve_scc = true;
      } else if (op.isConstant()) {
         copy_constant(ctx, bld, def, op);
      } else if (def.regClass().is_linear_vgpr()) {
         copy_linear_vgpr(bld, def, op, *preserve_scc, scratch_sgpr);
      } else if (def.regClass() == v1) {
         bld.vop1(aco_opcode::v_mov_b32, def, op);
      } else if (def.regClass() == v2) {
         bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
      } else if (def.regClass() == s1) {
         bld.sop1(aco_opcode::s_mov_b32, def, op);
      } else if (def.regClass() == s2) {
         bld.sop1(aco_opcode::s_mov_b64, def, op);
      } else if (def.regClass().is_subdword() && ctx->program->chip_class < GFX8) {
         if (op.physReg().byte()) {
            assert(def.physReg().byte() == 0);
            bld.vop2(aco_opcode::v_lshrrev_b32, def, Operand::c32(op.physReg().byte() * 8), op);
         } else if (def.physReg().byte()) {
            assert(op.physReg().byte() == 0);
            /* preserve the target's lower half */
            uint32_t bits = def.physReg().byte() * 8;
            PhysReg lo_reg = PhysReg(def.physReg().reg());
            Definition lo_half =
               Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte()));
            Definition dst =
               Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes()));

            if (def.physReg().reg() == op.physReg().reg()) {
               bld.vop2(aco_opcode::v_and_b32, lo_half, Operand::c32((1 << bits) - 1u),
                        Operand(lo_reg, lo_half.regClass()));
               if (def.physReg().byte() == 1) {
                  bld.vop2(aco_opcode::v_mul_u32_u24, dst, Operand::c32((1 << bits) + 1u), op);
               } else if (def.physReg().byte() == 2) {
                  bld.vop2(aco_opcode::v_cvt_pk_u16_u32, dst, Operand(lo_reg, v2b), op);
               } else if (def.physReg().byte() == 3) {
                  bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1),
                           Operand::c32((1 << bits) + 1u));
                  bld.vop3(aco_opcode::v_mul_lo_u32, dst, Operand(scratch_sgpr, s1), op);
               }
            } else {
               lo_half.setFixed(lo_half.physReg().advance(4 - def.physReg().byte()));
               bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand::c32(32 - bits),
                        Operand(lo_reg, lo_half.regClass()));
               bld.vop3(aco_opcode::v_alignbyte_b32, dst, op,
                        Operand(lo_half.physReg(), lo_half.regClass()),
                        Operand::c32(4 - def.physReg().byte()));
            }
         } else {
            bld.vop1(aco_opcode::v_mov_b32, def, op);
         }
      } else if (def.regClass().is_subdword()) {
         bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);
      } else {
         unreachable("unsupported copy");
      }

      did_copy = true;
      offset += def.bytes();
   }
   return did_copy;
}

void
do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool preserve_scc,
        Pseudo_instruction* pi)
{
   unsigned offset = 0;

   if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) &&
       (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) {
      /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fix it up with a
       * 1-byte swap */
      PhysReg op = copy.op.physReg();
      PhysReg def = copy.def.physReg();
      op.reg_b &= ~0x3;
      def.reg_b &= ~0x3;

      copy_operation tmp;
      tmp.op = Operand(op, v1);
      tmp.def = Definition(def, v1);
      tmp.bytes = 4;
      memset(tmp.uses, 1, 4);
      do_swap(ctx, bld, tmp, preserve_scc, pi);

      op.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
      def.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
      tmp.op = Operand(op, v1b);
      tmp.def = Definition(def, v1b);
      tmp.bytes = 1;
      tmp.uses[0] = 1;
      do_swap(ctx, bld, tmp, preserve_scc, pi);

      offset = copy.bytes;
   }

   for (; offset < copy.bytes;) {
      Definition def;
      Operand op;
      unsigned max_size = copy.def.regClass().type() == RegType::vgpr ? 4 : 8;
      split_copy(ctx, offset, &def, &op, copy, true, max_size);

      assert(op.regClass() == def.regClass());
      Operand def_as_op = Operand(def.physReg(), def.regClass());
      Definition op_as_def = Definition(op.physReg(), op.regClass());
      if (def.regClass().is_linear_vgpr()) {
         swap_linear_vgpr(bld, def, op, preserve_scc, pi->scratch_sgpr);
      } else if (ctx->program->chip_class >= GFX9 && def.regClass() == v1) {
         bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
      } else if (def.regClass() == v1) {
         assert(def.physReg().byte() == 0 && op.physReg().byte() == 0);
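         /* classic triple-XOR swap: after a ^= b, b ^= a, a ^= b the two
          * registers have exchanged their values without needing a temporary */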
1291 bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1292 bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
1293 bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1294 } else if (op.physReg() == scc || def.physReg() == scc) {
1295 /* we need to swap scc and another sgpr */
1296 assert(!preserve_scc);
1297
1298 PhysReg other = op.physReg() == scc ? def.physReg() : op.physReg();
1299
1300 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1301 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1),
1302 Operand::zero());
1303 bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
1304 } else if (def.regClass() == s1) {
1305 if (preserve_scc) {
1306 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), op);
1307 bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
1308 bld.sop1(aco_opcode::s_mov_b32, def, Operand(pi->scratch_sgpr, s1));
1309 } else {
1310 bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1311 bld.sop2(aco_opcode::s_xor_b32, def, Definition(scc, s1), op, def_as_op);
1312 bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1313 }
1314 } else if (def.regClass() == s2) {
1315 if (preserve_scc)
1316 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1317 bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1318 bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op);
1319 bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1320 if (preserve_scc)
1321 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1),
1322 Operand::zero());
1323 } else if (def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {
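         /* both 2-byte values live in the same VGPR: v_alignbyte_b32 with a
          * shift amount of 2 bytes rotates the dword by 16 bits, which
          * exchanges its low and high halves */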
1324 bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op,
1325 Operand::c32(2u));
1326 } else {
1327 assert(def.regClass().is_subdword());
1328 bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1329 bld.vop2_sdwa(aco_opcode::v_xor_b32, def, op, def_as_op);
1330 bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1331 }
1332
1333 offset += def.bytes();
1334 }
1335
1336 if (ctx->program->chip_class <= GFX7)
1337 return;
1338
1339 /* fixup in case we swapped bytes we shouldn't have */
1340 copy_operation tmp_copy = copy;
1341 tmp_copy.op.setFixed(copy.def.physReg());
1342 tmp_copy.def.setFixed(copy.op.physReg());
1343 do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr);
1344 }
1345
1346 void
1347 do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Operand hi)
1348 {
1349 if (lo.isConstant() && hi.isConstant()) {
1350 copy_constant(ctx, bld, def, Operand::c32(lo.constantValue() | (hi.constantValue() << 16)));
1351 return;
1352 }
1353
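   /* the non-constant cases below try, in order: v_pack_b32_f16 (GFX9+, when
    * fp16 denormals are kept), a single v_alignbyte_b32, and finally
    * shift/and/or or SDWA sequences depending on the chip generation */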
1354 bool can_use_pack = (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in) &&
1355 (ctx->program->chip_class >= GFX10 ||
1356 (ctx->program->chip_class >= GFX9 && !lo.isLiteral() && !hi.isLiteral()));
1357
1358 if (can_use_pack) {
1359 Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
1360 /* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */
1361 instr->vop3().opsel = hi.physReg().byte() | (lo.physReg().byte() >> 1);
1362 return;
1363 }
1364
1365 /* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */
1366 if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 &&
1367 (!hi.isConstant() || !Operand::c32(hi.constantValue()).isLiteral() ||
1368 ctx->program->chip_class >= GFX10)) {
1369 bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
1370 return;
1371 }
1372
1373 Definition def_lo = Definition(def.physReg(), v2b);
1374 Definition def_hi = Definition(def.physReg().advance(2), v2b);
1375
1376 if (lo.isConstant()) {
1377 /* move hi and zero low bits */
1378 if (hi.physReg().byte() == 0)
1379 bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
1380 else
1381 bld.vop2(aco_opcode::v_and_b32, def_hi, Operand::c32(~0xFFFFu), hi);
1382 bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),
1383 Operand(def.physReg(), v1));
1384 return;
1385 }
1386 if (hi.isConstant()) {
1387 /* move lo and zero high bits */
1388 if (lo.physReg().byte() == 2)
1389 bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
1390 else
1391 bld.vop2(aco_opcode::v_and_b32, def_lo, Operand::c32(0xFFFFu), lo);
1392 bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),
1393 Operand(def.physReg(), v1));
1394 return;
1395 }
1396
1397 if (lo.physReg().reg() == def.physReg().reg()) {
1398 /* lo is in the high bits of def */
1399 assert(lo.physReg().byte() == 2);
1400 bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
1401 lo.setFixed(def.physReg());
1402 } else if (hi.physReg() == def.physReg()) {
1403 /* hi is in the low bits of def */
1404 assert(hi.physReg().byte() == 0);
1405 bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
1406 hi.setFixed(def.physReg().advance(2));
1407 } else if (ctx->program->chip_class >= GFX8) {
1408 /* either lo or hi can be placed with just a v_mov */
1409 assert(lo.physReg().byte() == 0 || hi.physReg().byte() == 2);
1410 Operand& op = lo.physReg().byte() == 0 ? lo : hi;
1411 PhysReg reg = def.physReg().advance(op.physReg().byte());
1412 bld.vop1(aco_opcode::v_mov_b32, Definition(reg, v2b), op);
1413 op.setFixed(reg);
1414 }
1415
1416 if (ctx->program->chip_class >= GFX8) {
1417 /* either hi or lo is already placed correctly */
1418 if (lo.physReg().reg() == def.physReg().reg())
1419 bld.vop1_sdwa(aco_opcode::v_mov_b32, def_hi, hi);
1420 else
1421 bld.vop1_sdwa(aco_opcode::v_mov_b32, def_lo, lo);
1422 return;
1423 }
1424
1425 /* alignbyte needs the operands in the following way:
1426 * | xx hi | lo xx | >> 2 bytes */
1427 if (lo.physReg().byte() != hi.physReg().byte()) {
1428 /* | xx lo | hi xx | => | lo hi | lo hi | */
1429 assert(lo.physReg().byte() == 0 && hi.physReg().byte() == 2);
1430 bld.vop3(aco_opcode::v_alignbyte_b32, def, lo, hi, Operand::c32(2u));
1431 lo = Operand(def_hi.physReg(), v2b);
1432 hi = Operand(def_lo.physReg(), v2b);
1433 } else if (lo.physReg().byte() == 0) {
1434 /* | xx hi | xx lo | => | xx hi | lo 00 | */
1435 bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), lo);
1436 lo = Operand(def_hi.physReg(), v2b);
1437 } else {
1438 /* | hi xx | lo xx | => | 00 hi | lo xx | */
1439 assert(hi.physReg().byte() == 2);
1440 bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), hi);
1441 hi = Operand(def_lo.physReg(), v2b);
1442 }
1443 /* perform the alignbyte */
1444 bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
1445 }
1446
1447 void
1448 try_coalesce_copies(lower_context* ctx, std::map<PhysReg, copy_operation>& copy_map,
1449 copy_operation& copy)
1450 {
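   /* try to merge this copy with the copy of the immediately following
    * registers, e.g. v[0] = v[4] and v[1] = v[5] can be combined into
    * v[0:1] = v[4:5]; constant operands are merged by concatenating bits */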
1451 // TODO try more relaxed alignment for subdword copies
1452 unsigned next_def_align = util_next_power_of_two(copy.bytes + 1);
1453 unsigned next_op_align = next_def_align;
1454 if (copy.def.regClass().type() == RegType::vgpr)
1455 next_def_align = MIN2(next_def_align, 4);
1456 if (copy.op.regClass().type() == RegType::vgpr)
1457 next_op_align = MIN2(next_op_align, 4);
1458
1459 if (copy.bytes >= 8 || copy.def.physReg().reg_b % next_def_align ||
1460 (!copy.op.isConstant() && copy.op.physReg().reg_b % next_op_align))
1461 return;
1462
1463 auto other = copy_map.find(copy.def.physReg().advance(copy.bytes));
1464 if (other == copy_map.end() || copy.bytes + other->second.bytes > 8 ||
1465 copy.op.isConstant() != other->second.op.isConstant())
1466 return;
1467
1468 /* don't create 64-bit copies before GFX10 */
1469 if (copy.bytes >= 4 && copy.def.regClass().type() == RegType::vgpr &&
1470 ctx->program->chip_class < GFX10)
1471 return;
1472
1473 unsigned new_size = copy.bytes + other->second.bytes;
1474 if (copy.op.isConstant()) {
1475 uint64_t val =
1476 copy.op.constantValue64() | (other->second.op.constantValue64() << (copy.bytes * 8u));
1477 if (!util_is_power_of_two_or_zero(new_size))
1478 return;
1479 if (!Operand::is_constant_representable(val, new_size, true,
1480 copy.def.regClass().type() == RegType::vgpr))
1481 return;
1482 copy.op = Operand::get_const(ctx->program->chip_class, val, new_size);
1483 } else {
1484 if (other->second.op.physReg() != copy.op.physReg().advance(copy.bytes))
1485 return;
1486 copy.op = Operand(copy.op.physReg(), copy.op.regClass().resize(new_size));
1487 }
1488
1489 copy.bytes = new_size;
1490 copy.def = Definition(copy.def.physReg(), copy.def.regClass().resize(copy.bytes));
1491 copy_map.erase(other);
1492 }
1493
1494 void
1495 handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
1496 chip_class chip_class, Pseudo_instruction* pi)
1497 {
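   /* copy_map describes a location transfer graph: each entry moves its
    * operand register into its definition register. First emit copies whose
    * definition bytes are not read by any other copy (paths), then resolve
    * whatever remains (cycles) with swaps. */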
1498 Builder bld(ctx->program, &ctx->instructions);
1499 unsigned num_instructions_before = ctx->instructions.size();
1500 aco_ptr<Instruction> mov;
1501 bool writes_scc = false;
1502
1503 /* count the number of uses for each dst reg */
1504 for (auto it = copy_map.begin(); it != copy_map.end();) {
1505
1506 if (it->second.def.physReg() == scc)
1507 writes_scc = true;
1508
1509 assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr));
1510
1511 /* if src and dst reg are the same, remove operation */
1512 if (it->first == it->second.op.physReg()) {
1513 it = copy_map.erase(it);
1514 continue;
1515 }
1516
1517 /* split large copies */
1518 if (it->second.bytes > 8) {
1519 assert(!it->second.op.isConstant());
1520 assert(!it->second.def.regClass().is_subdword());
1521 RegClass rc = RegClass(it->second.def.regClass().type(), it->second.def.size() - 2);
1522 Definition hi_def = Definition(PhysReg{it->first + 2}, rc);
1523 rc = RegClass(it->second.op.regClass().type(), it->second.op.size() - 2);
1524 Operand hi_op = Operand(PhysReg{it->second.op.physReg() + 2}, rc);
1525 copy_operation copy = {hi_op, hi_def, it->second.bytes - 8};
1526 copy_map[hi_def.physReg()] = copy;
1527 assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0);
1528 it->second.op = Operand(it->second.op.physReg(),
1529 it->second.op.regClass().type() == RegType::sgpr ? s2 : v2);
1530 it->second.def = Definition(it->second.def.physReg(),
1531 it->second.def.regClass().type() == RegType::sgpr ? s2 : v2);
1532 it->second.bytes = 8;
1533 }
1534
1535 try_coalesce_copies(ctx, copy_map, it->second);
1536
1537 /* check if the definition reg is used by another copy operation */
1538 for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1539 if (copy.second.op.isConstant())
1540 continue;
1541 for (uint16_t i = 0; i < it->second.bytes; i++) {
1542 /* distance might underflow */
1543 unsigned distance = it->first.reg_b + i - copy.second.op.physReg().reg_b;
1544 if (distance < copy.second.bytes)
1545 it->second.uses[i] += 1;
1546 }
1547 }
1548
1549 ++it;
1550 }
1551
1552 /* first, handle paths in the location transfer graph */
1553 bool preserve_scc = pi->tmp_in_scc && !writes_scc;
1554 bool skip_partial_copies = true;
1555 for (auto it = copy_map.begin();;) {
1556 if (copy_map.empty()) {
1557 ctx->program->statistics[statistic_copies] +=
1558 ctx->instructions.size() - num_instructions_before;
1559 return;
1560 }
1561 if (it == copy_map.end()) {
1562 if (!skip_partial_copies)
1563 break;
1564 skip_partial_copies = false;
1565 it = copy_map.begin();
1566 }
1567
1568 /* check if we can pack one register at once */
1569 if (it->first.byte() == 0 && it->second.bytes == 2) {
1570 PhysReg reg_hi = it->first.advance(2);
1571 std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);
1572 if (other != copy_map.end() && other->second.bytes == 2) {
1573 /* check if the target register is otherwise unused */
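         /* is_used aliases the per-byte uses[] counters, so 0x0101 means both
          * bytes are read exactly once, namely by the copy of the other half */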
1574 bool unused_lo = !it->second.is_used || (it->second.is_used == 0x0101 &&
1575 other->second.op.physReg() == it->first);
1576 bool unused_hi = !other->second.is_used ||
1577 (other->second.is_used == 0x0101 && it->second.op.physReg() == reg_hi);
1578 if (unused_lo && unused_hi) {
1579 Operand lo = it->second.op;
1580 Operand hi = other->second.op;
1581 do_pack_2x16(ctx, bld, Definition(it->first, v1), lo, hi);
1582 copy_map.erase(it);
1583 copy_map.erase(other);
1584
1585 for (std::pair<const PhysReg, copy_operation>& other2 : copy_map) {
1586 for (uint16_t i = 0; i < other2.second.bytes; i++) {
1587 /* distance might underflow */
1588 unsigned distance_lo = other2.first.reg_b + i - lo.physReg().reg_b;
1589 unsigned distance_hi = other2.first.reg_b + i - hi.physReg().reg_b;
1590 if (distance_lo < 2 || distance_hi < 2)
1591 other2.second.uses[i] -= 1;
1592 }
1593 }
1594 it = copy_map.begin();
1595 continue;
1596 }
1597 }
1598 }
1599
1600 /* on GFX6/7, we need some small workarounds as there is no
1601 * SDWA instruction to do partial register writes */
1602 if (ctx->program->chip_class < GFX8 && it->second.bytes < 4) {
1603 if (it->first.byte() == 0 && it->second.op.physReg().byte() == 0 && !it->second.is_used &&
1604 pi->opcode == aco_opcode::p_split_vector) {
1605 /* Other operations might overwrite the high bits, so change all users
1606 * of the high bits to the new target where they are still available.
1607 * This mechanism depends on also emitting dead definitions. */
1608 PhysReg reg_hi = it->second.op.physReg().advance(it->second.bytes);
1609 while (reg_hi != PhysReg(it->second.op.physReg().reg() + 1)) {
1610 std::map<PhysReg, copy_operation>::iterator other = copy_map.begin();
1611 for (other = copy_map.begin(); other != copy_map.end(); other++) {
1612 /* on GFX6/7, if the high bits are used as operand, they cannot be a target */
1613 if (other->second.op.physReg() == reg_hi) {
1614 other->second.op.setFixed(it->first.advance(reg_hi.byte()));
1615 break; /* break because an operand can only be used once */
1616 }
1617 }
1618 reg_hi = reg_hi.advance(it->second.bytes);
1619 }
1620 } else if (it->first.byte()) {
1621 assert(pi->opcode == aco_opcode::p_create_vector);
1622 /* on GFX6/7, if we target an upper half where the lower half hasn't yet been handled,
1623 * move to the target operand's high bits. This is safe to do as it cannot be an operand
1624 */
1625 PhysReg lo = PhysReg(it->first.reg());
1626 std::map<PhysReg, copy_operation>::iterator other = copy_map.find(lo);
1627 if (other != copy_map.end()) {
1628 assert(other->second.bytes == it->first.byte());
1629 PhysReg new_reg_hi = other->second.op.physReg().advance(it->first.byte());
1630 it->second.def = Definition(new_reg_hi, it->second.def.regClass());
1631 it->second.is_used = 0;
1632 other->second.bytes += it->second.bytes;
1633 other->second.def.setTemp(Temp(other->second.def.tempId(),
1634 RegClass::get(RegType::vgpr, other->second.bytes)));
1635 other->second.op.setTemp(Temp(other->second.op.tempId(),
1636 RegClass::get(RegType::vgpr, other->second.bytes)));
1637 /* if the new target's high bits are also a target, change uses */
1638 std::map<PhysReg, copy_operation>::iterator target = copy_map.find(new_reg_hi);
1639 if (target != copy_map.end()) {
1640 for (unsigned i = 0; i < it->second.bytes; i++)
1641 target->second.uses[i]++;
1642 }
1643 }
1644 }
1645 }
1646
1647 /* find portions where the target reg is not used as operand for any other copy */
1648 if (it->second.is_used) {
1649 if (it->second.op.isConstant() || skip_partial_copies) {
1650 /* we have to skip constants until is_used=0.
1651 * we also skip partial copies at the beginning to help coalescing */
1652 ++it;
1653 continue;
1654 }
1655
1656 unsigned has_zero_use_bytes = 0;
1657 for (unsigned i = 0; i < it->second.bytes; i++)
1658 has_zero_use_bytes |= (it->second.uses[i] == 0) << i;
1659
1660 if (has_zero_use_bytes) {
1661 /* Skipping the partial copy and doing a v_swap_b32 plus fixup copies
1662 * is usually beneficial for sub-dword copies, but if doing
1663 * a partial copy allows further copies, it should be done instead. */
1664 bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);
1665 for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1666 /* on GFX6/7, we can only do copies with full registers */
1667 if (partial_copy || ctx->program->chip_class <= GFX7)
1668 break;
1669 for (uint16_t i = 0; i < copy.second.bytes; i++) {
1670 /* distance might underflow */
1671 unsigned distance = copy.first.reg_b + i - it->second.op.physReg().reg_b;
1672 if (distance < it->second.bytes && copy.second.uses[i] == 1 &&
1673 !it->second.uses[distance])
1674 partial_copy = true;
1675 }
1676 }
1677
1678 if (!partial_copy) {
1679 ++it;
1680 continue;
1681 }
1682 } else {
1683 /* full target reg is used: register swapping needed */
1684 ++it;
1685 continue;
1686 }
1687 }
1688
1689 bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc, pi->scratch_sgpr);
1690 skip_partial_copies = did_copy;
1691 std::pair<PhysReg, copy_operation> copy = *it;
1692
1693 if (it->second.is_used == 0) {
1694 /* the target reg is not used as operand for any other copy, so we
1695 * copied to all of it */
1696 copy_map.erase(it);
1697 it = copy_map.begin();
1698 } else {
1699 /* we only performed some portions of this copy, so split it to only
1700 * leave the portions that still need to be done */
1701 copy_operation original = it->second; /* the map insertion below can overwrite this */
1702 copy_map.erase(it);
1703 for (unsigned offset = 0; offset < original.bytes;) {
1704 if (original.uses[offset] == 0) {
1705 offset++;
1706 continue;
1707 }
1708 Definition def;
1709 Operand op;
1710 split_copy(ctx, offset, &def, &op, original, false, 8);
1711
1712 copy_operation new_copy = {op, def, def.bytes()};
1713 for (unsigned i = 0; i < new_copy.bytes; i++)
1714 new_copy.uses[i] = original.uses[i + offset];
1715 copy_map[def.physReg()] = new_copy;
1716
1717 offset += def.bytes();
1718 }
1719
1720 it = copy_map.begin();
1721 }
1722
1723 /* Reduce the number of uses of the operand reg by one. Do this after
1724 * splitting the copy or removing it in case the copy writes to its own
1725 * operand (for example, v[7:8] = v[8:9]) */
1726 if (did_copy && !copy.second.op.isConstant()) {
1727 for (std::pair<const PhysReg, copy_operation>& other : copy_map) {
1728 for (uint16_t i = 0; i < other.second.bytes; i++) {
1729 /* distance might underflow */
1730 unsigned distance = other.first.reg_b + i - copy.second.op.physReg().reg_b;
1731 if (distance < copy.second.bytes && !copy.second.uses[distance])
1732 other.second.uses[i] -= 1;
1733 }
1734 }
1735 }
1736 }
1737
1738 /* all target regs are needed as an operand somewhere, which means all entries are part of a cycle */
1739 unsigned largest = 0;
1740 for (const std::pair<const PhysReg, copy_operation>& op : copy_map)
1741 largest = MAX2(largest, op.second.bytes);
1742
1743 while (!copy_map.empty()) {
1744
1745 /* Perform larger swaps first, because larger swaps can make other
1746 * swaps unnecessary. */
1747 auto it = copy_map.begin();
1748 for (auto it2 = copy_map.begin(); it2 != copy_map.end(); ++it2) {
1749 if (it2->second.bytes > it->second.bytes) {
1750 it = it2;
1751 if (it->second.bytes == largest)
1752 break;
1753 }
1754 }
1755
1756 /* constants should already have been copied (a constant cannot be part of a cycle) */
1757 assert(!it->second.op.isConstant());
1758
1759 assert(it->second.op.isFixed());
1760 assert(it->second.def.regClass() == it->second.op.regClass());
1761
1762 if (it->first == it->second.op.physReg()) {
1763 copy_map.erase(it);
1764 continue;
1765 }
1766
1767 if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr)
1768 assert(!(it->second.def.physReg() == pi->scratch_sgpr));
1769
1770 /* to resolve the cycle, we have to swap the src reg with the dst reg */
1771 copy_operation swap = it->second;
1772
1773 /* if this is self-intersecting, we have to split it because
1774 * self-intersecting swaps don't make sense */
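      /* e.g. an 8-byte swap v[0:1] <-> v[1:2] overlaps itself in v1: it is
       * split into a 4-byte swap v0 <-> v1 done now and a re-queued swap
       * v1 <-> v2 */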
1775 PhysReg src = swap.op.physReg(), dst = swap.def.physReg();
1776 if (abs((int)src.reg_b - (int)dst.reg_b) < (int)swap.bytes) {
1777 unsigned offset = abs((int)src.reg_b - (int)dst.reg_b);
1778
1779 copy_operation remaining;
1780 src.reg_b += offset;
1781 dst.reg_b += offset;
1782 remaining.bytes = swap.bytes - offset;
1783 memcpy(remaining.uses, swap.uses + offset, remaining.bytes);
1784 remaining.op = Operand(src, swap.def.regClass().resize(remaining.bytes));
1785 remaining.def = Definition(dst, swap.def.regClass().resize(remaining.bytes));
1786 copy_map[dst] = remaining;
1787
1788 memset(swap.uses + offset, 0, swap.bytes - offset);
1789 swap.bytes = offset;
1790 }
1791
1792 /* GFX6-7 can only swap full registers */
1793 if (ctx->program->chip_class <= GFX7)
1794 swap.bytes = align(swap.bytes, 4);
1795
1796 do_swap(ctx, bld, swap, preserve_scc, pi);
1797
1798 /* remove from map */
1799 copy_map.erase(it);
1800
1801 /* change the operand reg of the target's uses and split uses if needed */
1802 uint32_t bytes_left = u_bit_consecutive(0, swap.bytes);
1803 for (auto target = copy_map.begin(); target != copy_map.end(); ++target) {
1804 if (target->second.op.physReg() == swap.def.physReg() &&
1805 swap.bytes == target->second.bytes) {
1806 target->second.op.setFixed(swap.op.physReg());
1807 break;
1808 }
1809
1810 uint32_t imask =
1811 get_intersection_mask(swap.def.physReg().reg_b, swap.bytes,
1812 target->second.op.physReg().reg_b, target->second.bytes);
1813
1814 if (!imask)
1815 continue;
1816
1817 int offset = (int)target->second.op.physReg().reg_b - (int)swap.def.physReg().reg_b;
1818
1819 /* split and update the middle (the portion that reads the swap's
1820 * definition) to read the swap's operand instead */
1821 int target_op_end = target->second.op.physReg().reg_b + target->second.bytes;
1822 int swap_def_end = swap.def.physReg().reg_b + swap.bytes;
1823 int before_bytes = MAX2(-offset, 0);
1824 int after_bytes = MAX2(target_op_end - swap_def_end, 0);
1825 int middle_bytes = target->second.bytes - before_bytes - after_bytes;
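         /* the target's operand range is cut into [before][middle][after]:
          * only the middle overlaps this swap's definition and is redirected
          * to read the swap's operand instead; before and after keep their
          * original source */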
1826
1827 if (after_bytes) {
1828 unsigned after_offset = before_bytes + middle_bytes;
1829 assert(after_offset > 0);
1830 copy_operation copy;
1831 copy.bytes = after_bytes;
1832 memcpy(copy.uses, target->second.uses + after_offset, copy.bytes);
1833 RegClass rc = target->second.op.regClass().resize(after_bytes);
1834 copy.op = Operand(target->second.op.physReg().advance(after_offset), rc);
1835 copy.def = Definition(target->second.def.physReg().advance(after_offset), rc);
1836 copy_map[copy.def.physReg()] = copy;
1837 }
1838
1839 if (middle_bytes) {
1840 copy_operation copy;
1841 copy.bytes = middle_bytes;
1842 memcpy(copy.uses, target->second.uses + before_bytes, copy.bytes);
1843 RegClass rc = target->second.op.regClass().resize(middle_bytes);
1844 copy.op = Operand(swap.op.physReg().advance(MAX2(offset, 0)), rc);
1845 copy.def = Definition(target->second.def.physReg().advance(before_bytes), rc);
1846 copy_map[copy.def.physReg()] = copy;
1847 }
1848
1849 if (before_bytes) {
1850 copy_operation copy;
1851 target->second.bytes = before_bytes;
1852 RegClass rc = target->second.op.regClass().resize(before_bytes);
1853 target->second.op = Operand(target->second.op.physReg(), rc);
1854 target->second.def = Definition(target->second.def.physReg(), rc);
1855 memset(target->second.uses + target->second.bytes, 0, 8 - target->second.bytes);
1856 }
1857
1858 /* break early since we know each byte of the swap's definition is used
1859 * at most once */
1860 bytes_left &= ~imask;
1861 if (!bytes_left)
1862 break;
1863 }
1864 }
1865 ctx->program->statistics[statistic_copies] += ctx->instructions.size() - num_instructions_before;
1866 }
1867
1868 void
1869 emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm)
1870 {
1871 if (bld.program->chip_class >= GFX10) {
1872 if (set_round)
1873 bld.sopp(aco_opcode::s_round_mode, -1, new_mode.round);
1874 if (set_denorm)
1875 bld.sopp(aco_opcode::s_denorm_mode, -1, new_mode.denorm);
1876 } else if (set_round || set_denorm) {
1877 /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */
1878 Instruction* instr =
1879 bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::c8(new_mode.val), (7 << 11) | 1).instr;
1880 /* has to be a literal */
1881 instr->operands[0].setFixed(PhysReg{255});
1882 }
1883 }
1884
1885 void
1886 emit_set_mode_from_block(Builder& bld, Program& program, Block* block, bool always_set)
1887 {
1888 float_mode config_mode;
1889 config_mode.val = program.config->float_mode;
1890
1891 bool set_round = always_set && block->fp_mode.round != config_mode.round;
1892 bool set_denorm = always_set && block->fp_mode.denorm != config_mode.denorm;
1893 if (block->kind & block_kind_top_level) {
1894 for (unsigned pred : block->linear_preds) {
1895 if (program.blocks[pred].fp_mode.round != block->fp_mode.round)
1896 set_round = true;
1897 if (program.blocks[pred].fp_mode.denorm != block->fp_mode.denorm)
1898 set_denorm = true;
1899 }
1900 }
1901 /* only allow changing modes at top-level blocks so this doesn't break
1902 * the "jump over empty blocks" optimization */
1903 assert((!set_round && !set_denorm) || (block->kind & block_kind_top_level));
1904 emit_set_mode(bld, block->fp_mode, set_round, set_denorm);
1905 }
1906
1907 void
1908 lower_to_hw_instr(Program* program)
1909 {
1910 Block* discard_block = NULL;
1911
1912 for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) {
1913 Block* block = &program->blocks[block_idx];
1914 lower_context ctx;
1915 ctx.program = program;
1916 ctx.block = block;
1917 Builder bld(program, &ctx.instructions);
1918
1919 emit_set_mode_from_block(bld, *program, block, (block_idx == 0));
1920
1921 for (size_t instr_idx = 0; instr_idx < block->instructions.size(); instr_idx++) {
1922 aco_ptr<Instruction>& instr = block->instructions[instr_idx];
1923 aco_ptr<Instruction> mov;
1924 if (instr->isPseudo() && instr->opcode != aco_opcode::p_unit_test) {
1925 Pseudo_instruction* pi = &instr->pseudo();
1926
1927 switch (instr->opcode) {
1928 case aco_opcode::p_extract_vector: {
1929 PhysReg reg = instr->operands[0].physReg();
1930 Definition& def = instr->definitions[0];
1931 reg.reg_b += instr->operands[1].constantValue() * def.bytes();
1932
1933 if (reg == def.physReg())
1934 break;
1935
1936 RegClass op_rc = def.regClass().is_subdword()
1937 ? def.regClass()
1938 : RegClass(instr->operands[0].getTemp().type(), def.size());
1939 std::map<PhysReg, copy_operation> copy_operations;
1940 copy_operations[def.physReg()] = {Operand(reg, op_rc), def, def.bytes()};
1941 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1942 break;
1943 }
1944 case aco_opcode::p_create_vector: {
1945 std::map<PhysReg, copy_operation> copy_operations;
1946 PhysReg reg = instr->definitions[0].physReg();
1947
1948 for (const Operand& op : instr->operands) {
1949 if (op.isConstant()) {
1950 const Definition def = Definition(
1951 reg, instr->definitions[0].getTemp().regClass().resize(op.bytes()));
1952 copy_operations[reg] = {op, def, op.bytes()};
1953 reg.reg_b += op.bytes();
1954 continue;
1955 }
1956 if (op.isUndefined()) {
1957 // TODO: coalesce subdword copies if dst byte is 0
1958 reg.reg_b += op.bytes();
1959 continue;
1960 }
1961
1962 RegClass rc_def =
1963 op.regClass().is_subdword()
1964 ? op.regClass()
1965 : instr->definitions[0].getTemp().regClass().resize(op.bytes());
1966 const Definition def = Definition(reg, rc_def);
1967 copy_operations[def.physReg()] = {op, def, op.bytes()};
1968 reg.reg_b += op.bytes();
1969 }
1970 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1971 break;
1972 }
1973 case aco_opcode::p_split_vector: {
1974 std::map<PhysReg, copy_operation> copy_operations;
1975 PhysReg reg = instr->operands[0].physReg();
1976
1977 for (const Definition& def : instr->definitions) {
1978 RegClass rc_op = def.regClass().is_subdword()
1979 ? def.regClass()
1980 : instr->operands[0].getTemp().regClass().resize(def.bytes());
1981 const Operand op = Operand(reg, rc_op);
1982 copy_operations[def.physReg()] = {op, def, def.bytes()};
1983 reg.reg_b += def.bytes();
1984 }
1985 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1986 break;
1987 }
1988 case aco_opcode::p_parallelcopy:
1989 case aco_opcode::p_wqm: {
1990 std::map<PhysReg, copy_operation> copy_operations;
1991 for (unsigned j = 0; j < instr->operands.size(); j++) {
1992 assert(instr->definitions[j].bytes() == instr->operands[j].bytes());
1993 copy_operations[instr->definitions[j].physReg()] = {
1994 instr->operands[j], instr->definitions[j], instr->operands[j].bytes()};
1995 }
1996 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1997 break;
1998 }
1999 case aco_opcode::p_exit_early_if: {
2000 /* don't bother with an early exit near the end of the program */
2001 if ((block->instructions.size() - 1 - instr_idx) <= 4 &&
2002 block->instructions.back()->opcode == aco_opcode::s_endpgm) {
2003 unsigned null_exp_dest =
2004 (ctx.program->stage.hw == HWStage::FS) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
2005 bool ignore_early_exit = true;
2006
2007 for (unsigned k = instr_idx + 1; k < block->instructions.size(); ++k) {
2008 const aco_ptr<Instruction>& instr2 = block->instructions[k];
2009 if (instr2->opcode == aco_opcode::s_endpgm ||
2010 instr2->opcode == aco_opcode::p_logical_end)
2011 continue;
2012 else if (instr2->opcode == aco_opcode::exp &&
2013 instr2->exp().dest == null_exp_dest)
2014 continue;
2015 else if (instr2->opcode == aco_opcode::p_parallelcopy &&
2016 instr2->definitions[0].isFixed() &&
2017 instr2->definitions[0].physReg() == exec)
2018 continue;
2019
2020 ignore_early_exit = false;
2021 }
2022
2023 if (ignore_early_exit)
2024 break;
2025 }
2026
2027 if (!discard_block) {
2028 discard_block = program->create_and_insert_block();
2029 block = &program->blocks[block_idx];
2030
2031 bld.reset(discard_block);
2032 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,
2033 V_008DFC_SQ_EXP_NULL, false, true, true);
2034 bld.sopp(aco_opcode::s_endpgm);
2035
2036 bld.reset(&ctx.instructions);
2037 }
2038
2039 // TODO: exec can be zero here with block_kind_discard
2040
2041 assert(instr->operands[0].physReg() == scc);
2042 bld.sopp(aco_opcode::s_cbranch_scc0, Definition(exec, s2), instr->operands[0],
2043 discard_block->index);
2044
2045 discard_block->linear_preds.push_back(block->index);
2046 block->linear_succs.push_back(discard_block->index);
2047 break;
2048 }
2049 case aco_opcode::p_spill: {
2050 assert(instr->operands[0].regClass() == v1.as_linear());
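            /* each 32-bit half of the spilled value (SGPR or constant) is
             * written into a lane of the linear VGPR; operand 1 holds the
             * base lane index of the spill slot */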
2051 for (unsigned i = 0; i < instr->operands[2].size(); i++) {
2052 Operand src =
2053 instr->operands[2].isConstant()
2054 ? Operand::c32(uint32_t(instr->operands[2].constantValue64() >> (32 * i)))
2055 : Operand(PhysReg{instr->operands[2].physReg() + i}, s1);
2056 bld.writelane(bld.def(v1, instr->operands[0].physReg()), src,
2057 Operand::c32(instr->operands[1].constantValue() + i),
2058 instr->operands[0]);
2059 }
2060 break;
2061 }
2062 case aco_opcode::p_reload: {
2063 assert(instr->operands[0].regClass() == v1.as_linear());
2064 for (unsigned i = 0; i < instr->definitions[0].size(); i++)
2065 bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
2066 instr->operands[0],
2067 Operand::c32(instr->operands[1].constantValue() + i));
2068 break;
2069 }
2070 case aco_opcode::p_as_uniform: {
2071 if (instr->operands[0].isConstant() ||
2072 instr->operands[0].regClass().type() == RegType::sgpr) {
2073 std::map<PhysReg, copy_operation> copy_operations;
2074 copy_operations[instr->definitions[0].physReg()] = {
2075 instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()};
2076 handle_operands(copy_operations, &ctx, program->chip_class, pi);
2077 } else {
2078 assert(instr->operands[0].regClass().type() == RegType::vgpr);
2079 assert(instr->definitions[0].regClass().type() == RegType::sgpr);
2080 assert(instr->operands[0].size() == instr->definitions[0].size());
2081 for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
2082 bld.vop1(aco_opcode::v_readfirstlane_b32,
2083 bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
2084 Operand(PhysReg{instr->operands[0].physReg() + i}, v1));
2085 }
2086 }
2087 break;
2088 }
2089 case aco_opcode::p_bpermute: {
2090 if (ctx.program->chip_class <= GFX7)
2091 emit_gfx6_bpermute(program, instr, bld);
2092 else if (ctx.program->chip_class >= GFX10 && ctx.program->wave_size == 64)
2093 emit_gfx10_wave64_bpermute(program, instr, bld);
2094 else
2095 unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute.");
2096 break;
2097 }
2098 case aco_opcode::p_constaddr: {
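            /* compute a 64-bit PC-relative address: the definition's temp id
             * is carried as a constant operand so the actual offset can be
             * patched in later, and s_addc_u32 propagates the carry into the
             * high half */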
2099 unsigned id = instr->definitions[0].tempId();
2100 PhysReg reg = instr->definitions[0].physReg();
2101 bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand::c32(id));
2102 bld.sop2(aco_opcode::p_constaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
2103 Operand(reg, s1), Operand::c32(id));
2104 bld.sop2(aco_opcode::s_addc_u32, Definition(reg.advance(4), s1), bld.def(s1, scc),
2105 Operand(reg.advance(4), s1), Operand::zero(), Operand(scc, s1));
2106 break;
2107 }
2108 case aco_opcode::p_extract: {
2109 assert(instr->operands[1].isConstant());
2110 assert(instr->operands[2].isConstant());
2111 assert(instr->operands[3].isConstant());
2112 if (instr->definitions[0].regClass() == s1)
2113 assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
2114 Definition dst = instr->definitions[0];
2115 Operand op = instr->operands[0];
2116 unsigned bits = instr->operands[2].constantValue();
2117 unsigned index = instr->operands[1].constantValue();
2118 unsigned offset = index * bits;
2119 bool signext = !instr->operands[3].constantEquals(0);
2120
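               /* lowering strategy: a plain shift when the field ends at the
                * MSB, sign-extension opcodes for bytes/words at offset 0,
                * bitfield extract (BFE) otherwise, and SDWA for subdword
                * destinations */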
2121 if (dst.regClass() == s1) {
2122 if (offset == (32 - bits)) {
2123 bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst,
2124 bld.def(s1, scc), op, Operand::c32(offset));
2125 } else if (offset == 0 && signext && (bits == 8 || bits == 16)) {
2126 bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16,
2127 dst, op);
2128 } else {
2129 bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst,
2130 bld.def(s1, scc), op, Operand::c32((bits << 16) | offset));
2131 }
2132 } else if ((dst.regClass() == v1 && op.regClass() == v1) ||
2133 ctx.program->chip_class <= GFX7) {
2134 assert(op.physReg().byte() == 0 && dst.physReg().byte() == 0);
2135 if (offset == (32 - bits) && op.regClass() != s1) {
2136 bld.vop2(signext ? aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, dst,
2137 Operand::c32(offset), op);
2138 } else {
2139 bld.vop3(signext ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32, dst, op,
2140 Operand::c32(offset), Operand::c32(bits));
2141 }
2142 } else {
2143 assert(dst.regClass() == v2b || dst.regClass() == v1b || op.regClass() == v2b ||
2144 op.regClass() == v1b);
2145 SDWA_instruction& sdwa =
2146 bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op).instr->sdwa();
2147 sdwa.sel[0] = SubdwordSel(bits / 8, offset / 8, signext);
2148 }
2149 break;
2150 }
2151 case aco_opcode::p_insert: {
2152 assert(instr->operands[1].isConstant());
2153 assert(instr->operands[2].isConstant());
2154 if (instr->definitions[0].regClass() == s1)
2155 assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
2156 Definition dst = instr->definitions[0];
2157 Operand op = instr->operands[0];
2158 unsigned bits = instr->operands[2].constantValue();
2159 unsigned index = instr->operands[1].constantValue();
2160 unsigned offset = index * bits;
2161
2162 if (dst.regClass() == s1) {
2163 if (offset == (32 - bits)) {
2164 bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), op,
2165 Operand::c32(offset));
2166 } else if (offset == 0) {
2167 bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,
2168 Operand::c32(bits << 16));
2169 } else {
2170 bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,
2171 Operand::c32(bits << 16));
2172 bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc),
2173 Operand(dst.physReg(), s1), Operand::c32(offset));
2174 }
2175 } else if (dst.regClass() == v1 || ctx.program->chip_class <= GFX7) {
2176 if (offset == (dst.bytes() * 8u - bits)) {
2177 bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op);
2178 } else if (offset == 0) {
2179 bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
2180 } else if (program->chip_class >= GFX9 ||
2181 (op.regClass() != s1 && program->chip_class >= GFX8)) {
2182 bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op).instr->sdwa().dst_sel =
2183 SubdwordSel(bits / 8, offset / 8, false);
2184 } else {
2185 bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
2186 bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset),
2187 Operand(dst.physReg(), v1));
2188 }
2189 } else {
2190 assert(dst.regClass() == v2b);
2191 bld.vop2_sdwa(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op)
2192 .instr->sdwa()
2193 .sel[1] = SubdwordSel::ubyte;
2194 }
2195 break;
2196 }
2197 default: break;
2198 }
2199 } else if (instr->isBranch()) {
2200 Pseudo_branch_instruction* branch = &instr->branch();
2201 uint32_t target = branch->target[0];
2202
2203 /* check if all blocks from current to target are empty */
2204 /* an exec-zero p_cbranch_z can also be removed if the skipped blocks contain at most 4 SALU-equivalent instructions (a VALU counts as two SALU) */
2205 bool can_remove = block->index < target;
2206 unsigned num_scalar = 0;
2207 unsigned num_vector = 0;
2208 for (unsigned i = block->index + 1; can_remove && i < branch->target[0]; i++) {
2209 /* uniform branches must not be ignored if they
2210 * are about to jump over actual instructions */
2211 if (!program->blocks[i].instructions.empty() &&
2212 (branch->opcode != aco_opcode::p_cbranch_z ||
2213 branch->operands[0].physReg() != exec)) {
2214 can_remove = false;
2215 break;
2216 }
2217
2218 for (aco_ptr<Instruction>& inst : program->blocks[i].instructions) {
2219 if (inst->isSOPP()) {
2220 can_remove = false;
2221 } else if (inst->isSALU()) {
2222 num_scalar++;
2223 } else if (inst->isVALU()) {
2224 num_vector++;
2225 } else {
2226 can_remove = false;
2227 }
2228
2229 if (num_scalar + num_vector * 2 > 4)
2230 can_remove = false;
2231
2232 if (!can_remove)
2233 break;
2234 }
2235 }
2236
2237 if (can_remove)
2238 continue;
2239
2240 switch (instr->opcode) {
2241 case aco_opcode::p_branch:
2242 assert(block->linear_succs[0] == target);
2243 bld.sopp(aco_opcode::s_branch, branch->definitions[0], target);
2244 break;
2245 case aco_opcode::p_cbranch_nz:
2246 assert(block->linear_succs[1] == target);
2247 if (branch->operands[0].physReg() == exec)
2248 bld.sopp(aco_opcode::s_cbranch_execnz, branch->definitions[0], target);
2249 else if (branch->operands[0].physReg() == vcc)
2250 bld.sopp(aco_opcode::s_cbranch_vccnz, branch->definitions[0], target);
2251 else {
2252 assert(branch->operands[0].physReg() == scc);
2253 bld.sopp(aco_opcode::s_cbranch_scc1, branch->definitions[0], target);
2254 }
2255 break;
2256 case aco_opcode::p_cbranch_z:
2257 assert(block->linear_succs[1] == target);
2258 if (branch->operands[0].physReg() == exec)
2259 bld.sopp(aco_opcode::s_cbranch_execz, branch->definitions[0], target);
2260 else if (branch->operands[0].physReg() == vcc)
2261 bld.sopp(aco_opcode::s_cbranch_vccz, branch->definitions[0], target);
2262 else {
2263 assert(branch->operands[0].physReg() == scc);
2264 bld.sopp(aco_opcode::s_cbranch_scc0, branch->definitions[0], target);
2265 }
2266 break;
2267 default: unreachable("Unknown Pseudo branch instruction!");
2268 }
2269
2270 } else if (instr->isReduction()) {
2271 Pseudo_reduction_instruction& reduce = instr->reduction();
2272 emit_reduction(&ctx, reduce.opcode, reduce.reduce_op, reduce.cluster_size,
2273 reduce.operands[1].physReg(), // tmp
2274 reduce.definitions[1].physReg(), // stmp
2275 reduce.operands[2].physReg(), // vtmp
2276 reduce.definitions[2].physReg(), // sitmp
2277 reduce.operands[0], reduce.definitions[0]);
2278 } else if (instr->isBarrier()) {
2279 Pseudo_barrier_instruction& barrier = instr->barrier();
2280
2281 /* Anything larger than a workgroup isn't possible. Anything
2282 * smaller requires no instructions and this pseudo instruction
2283 * would only be included to control optimizations. */
2284 bool emit_s_barrier = barrier.exec_scope == scope_workgroup &&
2285 program->workgroup_size > program->wave_size;
2286
2287 bld.insert(std::move(instr));
2288 if (emit_s_barrier)
2289 bld.sopp(aco_opcode::s_barrier);
2290 } else if (instr->opcode == aco_opcode::p_cvt_f16_f32_rtne) {
2291 float_mode new_mode = block->fp_mode;
2292 new_mode.round16_64 = fp_round_ne;
2293 bool set_round = new_mode.round != block->fp_mode.round;
2294
2295 emit_set_mode(bld, new_mode, set_round, false);
2296
2297 instr->opcode = aco_opcode::v_cvt_f16_f32;
2298 ctx.instructions.emplace_back(std::move(instr));
2299
2300 emit_set_mode(bld, block->fp_mode, set_round, false);
2301 } else {
2302 ctx.instructions.emplace_back(std::move(instr));
2303 }
2304 }
2305 block->instructions.swap(ctx.instructions);
2306 }
2307 }
2308
2309 } // namespace aco
2310