/*
 * Copyright © 2018 Valve Corporation
 * Copyright © 2018 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_instruction_selection.h"

#include "aco_builder.h"
#include "aco_ir.h"

#include "common/ac_exp_param.h"
#include "common/sid.h"
#include "vulkan/radv_descriptor_set.h"

#include "util/fast_idiv_by_const.h"
#include "util/memstream.h"

#include <array>
#include <functional>
#include <map>
#include <numeric>
#include <stack>
#include <utility>
#include <vector>

namespace aco {
namespace {

#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)

static void
_isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
          const char* msg)
{
   char* out;
   size_t outsize;
   struct u_memstream mem;
   u_memstream_open(&mem, &out, &outsize);
   FILE* const memf = u_memstream_get(&mem);

   fprintf(memf, "%s: ", msg);
   nir_print_instr(instr, memf);
   u_memstream_close(&mem);

   _aco_err(ctx->program, file, line, out);
   free(out);
}

struct if_context {
   Temp cond;

   bool divergent_old;
   bool exec_potentially_empty_discard_old;
   bool exec_potentially_empty_break_old;
   uint16_t exec_potentially_empty_break_depth_old;

   unsigned BB_if_idx;
   unsigned invert_idx;
   bool uniform_has_then_branch;
   bool then_branch_divergent;
   Block BB_invert;
   Block BB_endif;
};

struct loop_context {
   Block loop_exit;

   unsigned header_idx_old;
   Block* exit_old;
   bool divergent_cont_old;
   bool divergent_branch_old;
   bool divergent_if_old;
};

static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list);

static void
add_logical_edge(unsigned pred_idx, Block* succ)
{
   succ->logical_preds.emplace_back(pred_idx);
}

static void
add_linear_edge(unsigned pred_idx, Block* succ)
{
   succ->linear_preds.emplace_back(pred_idx);
}

static void
add_edge(unsigned pred_idx, Block* succ)
{
   add_logical_edge(pred_idx, succ);
   add_linear_edge(pred_idx, succ);
}

static void
append_logical_start(Block* b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
}

static void
append_logical_end(Block* b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
}

Temp
get_ssa_temp(struct isel_context* ctx, nir_ssa_def* def)
{
   uint32_t id = ctx->first_temp_id + def->index;
   return Temp(id, ctx->program->temp_rc[id]);
}

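/* Emits a masked bit count: for each lane, counts the set bits of `mask` (or
 * of the whole wave if `mask` is undefined) at positions below the lane's
 * index, added to `base`. With an all-ones mask and a zero base this yields
 * the lane's index within the wave. Wave64 needs the v_mbcnt_lo/v_mbcnt_hi
 * pair; wave32 only needs v_mbcnt_lo. */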
Temp
emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())
{
   Builder bld(ctx->program, ctx->block);
   assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
   assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());

   if (ctx->program->wave_size == 32) {
      Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask;
      return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
   }

   Operand mask_lo = Operand::c32(-1u);
   Operand mask_hi = Operand::c32(-1u);

   if (mask.isTemp()) {
      RegClass rc = RegClass(mask.regClass().type(), 1);
      Builder::Result mask_split =
         bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);
      mask_lo = Operand(mask_split.def(0).getTemp());
      mask_hi = Operand(mask_split.def(1).getTemp());
   } else if (mask.physReg() == exec) {
      mask_lo = Operand(exec_lo, s1);
      mask_hi = Operand(exec_hi, s1);
   }

   Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);

   if (ctx->program->chip_class <= GFX7)
      return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);
   else
      return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
}

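/* Applies whole-quad-mode to `src` via p_wqm so that helper lanes also hold
 * valid data. Outside of fragment shaders this degenerates to a plain copy
 * (or returns `src` unchanged when no destination was requested). */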
Temp
emit_wqm(Builder& bld, Temp src, Temp dst = Temp(0, s1), bool program_needs_wqm = false)
{
   if (!dst.id())
      dst = bld.tmp(src.regClass());

   assert(src.size() == dst.size());

   if (bld.program->stage != fragment_fs) {
      if (!dst.id())
         return src;

      bld.copy(Definition(dst), src);
      return dst;
   }

   bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
   bld.program->needs_wqm |= program_needs_wqm;
   return dst;
}

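/* Lane shuffle: each lane reads `data` from the lane selected by `index`.
 * Uniform indices use v_readlane; GFX8+ uses ds_bpermute_b32, which takes a
 * byte address (hence the shift by 2); GFX6-7 (no bpermute at all) and
 * GFX10 wave64 (where ds_bpermute only works within a half-wave) go through
 * the p_bpermute pseudo instruction, which is lowered later. */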
static Temp
emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
{
   if (index.regClass() == s1)
      return bld.readlane(bld.def(s1), data, index);

   if (ctx->options->chip_class <= GFX7) {
      /* GFX6-7: there is no bpermute instruction */
      Operand index_op(index);
      Operand input_data(data);
      index_op.setLateKill(true);
      input_data.setLateKill(true);

      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc),
                        index_op, input_data);
   } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {

      /* GFX10 wave64 mode: emulate full-wave bpermute */
      Temp index_is_lo =
         bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);
      Builder::Result index_is_lo_split =
         bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
      Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc),
                                     index_is_lo_split.def(1).getTemp());
      Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
                                     index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
      Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
      Operand input_data(data);

      index_x4.setLateKill(true);
      input_data.setLateKill(true);
      same_half.setLateKill(true);

      /* We need one pair of shared VGPRs:
       * Note that these have twice the allocation granularity of normal VGPRs */
      ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;

      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
                        index_x4, input_data, same_half);
   } else {
      /* GFX8-9 or GFX10 wave32: bpermute works normally */
      Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
      return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
   }
}

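/* Implements ds_swizzle-style masked swizzles with the (and, or, xor)
 * encoding: new_lane = ((lane & and_mask) | or_mask) ^ xor_mask. On GFX8+
 * the common patterns that stay within a row are emitted as cheaper DPP
 * moves instead, e.g. and_mask 0x1f with xor_mask 0xf maps to
 * dpp_row_mirror. */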
static Temp
emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)
{
   if (ctx->options->chip_class >= GFX8) {
      unsigned and_mask = mask & 0x1f;
      unsigned or_mask = (mask >> 5) & 0x1f;
      unsigned xor_mask = (mask >> 10) & 0x1f;

      uint16_t dpp_ctrl = 0xffff;

      // TODO: we could use DPP8 for some swizzles
      if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
         unsigned res[4] = {0, 1, 2, 3};
         for (unsigned i = 0; i < 4; i++)
            res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
         dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
         dpp_ctrl = dpp_row_rr(8);
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
         dpp_ctrl = dpp_row_mirror;
      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
         dpp_ctrl = dpp_row_half_mirror;
      }

      if (dpp_ctrl != 0xffff)
         return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
   }

   return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
}

Temp
as_vgpr(isel_context* ctx, Temp val)
{
   if (val.type() == RegType::sgpr) {
      Builder bld(ctx->program, ctx->block);
      return bld.copy(bld.def(RegType::vgpr, val.size()), val);
   }
   assert(val.type() == RegType::vgpr);
   return val;
}

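/* Unsigned division by a constant, lowered to a plain shift for powers of
 * two and otherwise to the usual pre-shift/increment/multiply-high/post-shift
 * sequence derived by util_compute_fast_udiv_info. For example, dividing by
 * 6 roughly becomes a v_mul_hi_u32 with the precomputed reciprocal followed
 * by a right shift. */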
// assumes a != 0xffffffff
void
emit_v_div_u32(isel_context* ctx, Temp dst, Temp a, uint32_t b)
{
   assert(b != 0);
   Builder bld(ctx->program, ctx->block);

   if (util_is_power_of_two_or_zero(b)) {
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(util_logbase2(b)), a);
      return;
   }

   util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);

   assert(info.multiplier <= 0xffffffff);

   bool pre_shift = info.pre_shift != 0;
   bool increment = info.increment != 0;
   bool multiply = true;
   bool post_shift = info.post_shift != 0;

   if (!pre_shift && !increment && !multiply && !post_shift) {
      bld.copy(Definition(dst), a);
      return;
   }

   Temp pre_shift_dst = a;
   if (pre_shift) {
      pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand::c32(info.pre_shift),
               a);
   }

   Temp increment_dst = pre_shift_dst;
   if (increment) {
      increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
      bld.vadd32(Definition(increment_dst), Operand::c32(info.increment), pre_shift_dst);
   }

   Temp multiply_dst = increment_dst;
   if (multiply) {
      multiply_dst = post_shift ? bld.tmp(v1) : dst;
      bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
               bld.copy(bld.def(v1), Operand::c32(info.multiplier)));
   }

   if (post_shift) {
      bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(info.post_shift),
               multiply_dst);
   }
}

void
emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
}

Temp
emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
{
   /* no need to extract the whole vector */
   if (src.regClass() == dst_rc) {
      assert(idx == 0);
      return src;
   }

   assert(src.bytes() > (idx * dst_rc.bytes()));
   Builder bld(ctx->program, ctx->block);
   auto it = ctx->allocated_vec.find(src.id());
   if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
      if (it->second[idx].regClass() == dst_rc) {
         return it->second[idx];
      } else {
         assert(!dst_rc.is_subdword());
         assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
         return bld.copy(bld.def(dst_rc), it->second[idx]);
      }
   }

   if (dst_rc.is_subdword())
      src = as_vgpr(ctx, src);

   if (src.bytes() == dst_rc.bytes()) {
      assert(idx == 0);
      return bld.copy(bld.def(dst_rc), src);
   } else {
      Temp dst = bld.tmp(dst_rc);
      emit_extract_vector(ctx, src, idx, dst);
      return dst;
   }
}

void
emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
{
   if (num_components == 1)
      return;
   if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
      return;
   RegClass rc;
   if (num_components > vec_src.size()) {
      if (vec_src.type() == RegType::sgpr) {
         /* should still help get_alu_src() */
         emit_split_vector(ctx, vec_src, vec_src.size());
         return;
      }
      /* sub-dword split */
      rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
   } else {
      rc = RegClass(vec_src.type(), vec_src.size() / num_components);
   }
   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
      aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
   split->operands[0] = Operand(vec_src);
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
   for (unsigned i = 0; i < num_components; i++) {
      elems[i] = ctx->program->allocateTmp(rc);
      split->definitions[i] = Definition(elems[i]);
   }
   ctx->block->instructions.emplace_back(std::move(split));
   ctx->allocated_vec.emplace(vec_src.id(), elems);
}

/* This vector expansion uses a mask to determine which elements in the new vector
 * come from the original vector. The other elements are undefined. */
void
expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
{
   emit_split_vector(ctx, vec_src, util_bitcount(mask));

   if (vec_src == dst)
      return;

   Builder bld(ctx->program, ctx->block);
   if (num_components == 1) {
      if (dst.type() == RegType::sgpr)
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
      else
         bld.copy(Definition(dst), vec_src);
      return;
   }

   unsigned component_size = dst.size() / num_components;
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;

   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
      aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
   vec->definitions[0] = Definition(dst);
   unsigned k = 0;
   for (unsigned i = 0; i < num_components; i++) {
      if (mask & (1 << i)) {
         Temp src =
            emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
         if (dst.type() == RegType::sgpr)
            src = bld.as_uniform(src);
         vec->operands[i] = Operand(src);
      } else {
         vec->operands[i] = Operand::zero(component_size == 2 ? 8 : 4);
      }
      elems[i] = vec->operands[i].getTemp();
   }
   ctx->block->instructions.emplace_back(std::move(vec));
   ctx->allocated_vec.emplace(dst.id(), elems);
}

/* adjust misaligned small bit size loads */
void
byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Operand shift;
   Temp select = Temp();
   if (offset.isConstant()) {
      assert(offset.constantValue() && offset.constantValue() < 4);
      shift = Operand::c32(offset.constantValue() * 8);
   } else {
      /* bit_offset = 8 * (offset & 0x3) */
      Temp tmp =
         bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand::c32(3u));
      select = bld.tmp(s1);
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp,
                       Operand::c32(3u));
   }

   if (vec.size() == 1) {
      bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
   } else if (vec.size() == 2) {
      Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
      bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
      if (tmp == dst)
         emit_split_vector(ctx, dst, 2);
      else
         emit_extract_vector(ctx, tmp, 0, dst);
   } else if (vec.size() == 3 || vec.size() == 4) {
      Temp lo = bld.tmp(s2), hi;
      if (vec.size() == 3) {
         /* this can happen if we use VMEM for a uniform load */
         hi = bld.tmp(s1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
      } else {
         hi = bld.tmp(s2);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
         hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand::zero());
      }
      if (select != Temp())
         hi =
            bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand::zero(), bld.scc(select));
      lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
      Temp mid = bld.tmp(s1);
      lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
      hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
      mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
      emit_split_vector(ctx, dst, 2);
   }
}

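/* Shifts a byte-misaligned loaded vector so that the wanted bytes start at
 * component 0 of dst. A dynamic (Temp) offset is handled with
 * v_alignbyte_b32 across adjacent dwords; a constant offset simply skips
 * whole components when re-creating the vector. */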
void
byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
{
   Builder bld(ctx->program, ctx->block);
   if (offset.isTemp()) {
      Temp tmp[4] = {vec, vec, vec, vec};

      if (vec.size() == 4) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
                    Definition(tmp[2]), Definition(tmp[3]), vec);
      } else if (vec.size() == 3) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
                    Definition(tmp[2]), vec);
      } else if (vec.size() == 2) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
      }
      for (unsigned i = 0; i < dst.size(); i++)
         tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);

      vec = tmp[0];
      if (dst.size() == 2)
         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);

      offset = Operand::zero();
   }

   unsigned num_components = vec.bytes() / component_size;
   if (vec.regClass() == dst.regClass()) {
      assert(offset.constantValue() == 0);
      bld.copy(Definition(dst), vec);
      emit_split_vector(ctx, dst, num_components);
      return;
   }

   emit_split_vector(ctx, vec, num_components);
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
   RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();

   assert(offset.constantValue() % component_size == 0);
   unsigned skip = offset.constantValue() / component_size;
   for (unsigned i = skip; i < num_components; i++)
      elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);

   if (dst.type() == RegType::vgpr) {
      /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
      num_components = dst.bytes() / component_size;
      aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
      for (unsigned i = 0; i < num_components; i++)
         create_vec->operands[i] = Operand(elems[i]);
      create_vec->definitions[0] = Definition(dst);
      bld.insert(std::move(create_vec));

   } else if (skip) {
      /* if dst is sgpr - split the src, but move the original to sgpr. */
      vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
      byte_align_scalar(ctx, vec, offset, dst);
   } else {
      assert(dst.size() == vec.size());
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
   }

   ctx->allocated_vec.emplace(dst.id(), elems);
}

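/* Converts an SCC-style boolean (s1, 0 or 1) into a lane mask with all bits
 * set or cleared; bool_to_scalar_condition below performs the inverse by
 * ANDing the mask with exec. */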
Temp
bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(bld.lm);

   assert(val.regClass() == s1);
   assert(dst.regClass() == bld.lm);

   return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),
                   bld.scc(val));
}

Temp
bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(s1);

   assert(val.regClass() == bld.lm);
   assert(dst.regClass() == s1);

   /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
   Temp tmp = bld.tmp(s1);
   bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
   return emit_wqm(bld, tmp, dst);
}

/**
 * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
 * src_bits and dst_bits are truncated.
 *
 * Sign extension may be applied using the sign_extend parameter. The position of the input sign
 * bit is indicated by src_bits in this case.
 *
 * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
 */
Temp
convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
            bool sign_extend, Temp dst = Temp())
{
   assert(!(sign_extend && dst_bits < src_bits) &&
          "Shrinking integers is not supported for signed inputs");

   if (!dst.id()) {
      if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
         dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
      else
         dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
   }

   assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
   assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);

   if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
      /* Copy the raw value, leaving an undefined value in the upper bits for
       * the caller to handle appropriately */
      return bld.copy(Definition(dst), src);
   } else if (dst.bytes() < src.bytes()) {
      return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());
   }

   Temp tmp = dst;
   if (dst_bits == 64)
      tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);

   if (tmp == src) {
   } else if (src.regClass() == s1) {
      assert(src_bits < 32);
      bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),
                 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
   } else {
      assert(src_bits < 32);
      bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(), Operand::c32(src_bits),
                 Operand::c32((unsigned)sign_extend));
   }

   if (dst_bits == 64) {
      if (sign_extend && dst.regClass() == s2) {
         Temp high =
            bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else if (sign_extend && dst.regClass() == v2) {
         Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else {
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
      }
   }

   return dst;
}

enum sgpr_extract_mode {
   sgpr_extract_sext,
   sgpr_extract_zext,
   sgpr_extract_undef,
};

Temp
extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)
{
   Temp vec = get_ssa_temp(ctx, src->src.ssa);
   unsigned src_size = src->src.ssa->bit_size;
   unsigned swizzle = src->swizzle[0];

   if (vec.size() > 1) {
      assert(src_size == 16);
      vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
      swizzle = swizzle & 1;
   }

   Builder bld(ctx->program, ctx->block);
   Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;

   if (mode == sgpr_extract_undef && swizzle == 0)
      bld.copy(Definition(tmp), vec);
   else
      bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),
                 Operand::c32(swizzle), Operand::c32(src_size),
                 Operand::c32((mode == sgpr_extract_sext)));

   if (dst.regClass() == s2)
      convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);

   return dst;
}

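/* Fetches a NIR ALU source as a Temp of `size` components, resolving
 * swizzles. An identity swizzle is served straight from the source vector;
 * 8/16-bit sgpr elements go through extract_8_16_bit_sgpr_element; anything
 * else is re-assembled with p_create_vector from the swizzled elements. */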
Temp
get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
{
   if (src.src.ssa->num_components == 1 && size == 1)
      return get_ssa_temp(ctx, src.src.ssa);

   Temp vec = get_ssa_temp(ctx, src.src.ssa);
   unsigned elem_size = src.src.ssa->bit_size / 8u;
   bool identity_swizzle = true;

   for (unsigned i = 0; identity_swizzle && i < size; i++) {
      if (src.swizzle[i] != i)
         identity_swizzle = false;
   }
   if (identity_swizzle)
      return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));

   assert(elem_size > 0);
   assert(vec.bytes() % elem_size == 0);

   if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) {
      assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
      return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,
                                           sgpr_extract_undef);
   }

   bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr;
   if (as_uniform)
      vec = as_vgpr(ctx, vec);

   RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword()
                                    : RegClass(vec.type(), elem_size / 4);
   if (size == 1) {
      return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
   } else {
      assert(size <= 4);
      std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
      aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
      for (unsigned i = 0; i < size; ++i) {
         elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
         vec_instr->operands[i] = Operand{elems[i]};
      }
      Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
      vec_instr->definitions[0] = Definition(dst);
      ctx->block->instructions.emplace_back(std::move(vec_instr));
      ctx->allocated_vec.emplace(dst.id(), elems);
      return vec.type() == RegType::sgpr ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst;
   }
}

Temp
get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
{
   /* returns v2b or v1 for vop3p usage.
    * The source expects exactly 2 16bit components
    * which are within the same dword
    */
   assert(src.src.ssa->bit_size == 16);
   assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);

   Temp tmp = get_ssa_temp(ctx, src.src.ssa);
   if (tmp.size() == 1)
      return tmp;

   /* the size is larger than 1 dword: check the swizzle */
   unsigned dword = src.swizzle[0] >> 1;

   /* extract a full dword if possible */
   if (tmp.bytes() >= (dword + 1) * 4) {
      return emit_extract_vector(ctx, tmp, dword, RegClass(tmp.type(), 1));
   } else {
      /* This must be a swizzled access to %a.zz where %a is v6b */
      assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
      assert(tmp.regClass() == v6b && dword == 1);
      return emit_extract_vector(ctx, tmp, dword * 2, v2b);
   }
}

uint32_t
get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)
{
   nir_ssa_scalar scalar =
      nir_ssa_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]};
   return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
}

Temp
convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)
{
   if (ptr.size() == 2)
      return ptr;
   Builder bld(ctx->program, ctx->block);
   if (ptr.type() == RegType::vgpr && !non_uniform)
      ptr = bld.as_uniform(ptr);
   return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,
                     Operand::c32((unsigned)ctx->options->address32_hi));
}

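/* Builds an SOP2 instruction from a NIR ALU instruction, optionally defining
 * SCC. `uses_ub` is a per-operand bitmask: when set and NIR's upper-bound
 * analysis proves the source fits in 16 or 24 bits, the operand is flagged
 * accordingly so later passes can exploit it. */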
void
emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                      bool writes_scc, uint8_t uses_ub = 0)
{
   aco_ptr<SOP2_instruction> sop2{
      create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
   sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
   sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
   sop2->definitions[0] = Definition(dst);
   if (instr->no_unsigned_wrap)
      sop2->definitions[0].setNUW(true);
   if (writes_scc)
      sop2->definitions[1] = Definition(ctx->program->allocateId(s1), scc, s1);

   for (int i = 0; i < 2; i++) {
      if (uses_ub & (1 << i)) {
         uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
         if (src_ub <= 0xffff)
            sop2->operands[i].set16bit(true);
         else if (src_ub <= 0xffffff)
            sop2->operands[i].set24bit(true);
      }
   }

   ctx->block->instructions.emplace_back(std::move(sop2));
}

void
emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst,
                      bool commutative, bool swap_srcs = false, bool flush_denorms = false,
                      bool nuw = false, uint8_t uses_ub = 0)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;

   Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
   Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
   if (src1.type() == RegType::sgpr) {
      if (commutative && src0.type() == RegType::vgpr) {
         Temp t = src0;
         src0 = src1;
         src1 = t;
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   Operand op[2] = {Operand(src0), Operand(src1)};

   for (int i = 0; i < 2; i++) {
      if (uses_ub & (1 << i)) {
         uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i);
         if (src_ub <= 0xffff)
            op[i].set16bit(true);
         else if (src_ub <= 0xffffff)
            op[i].set24bit(true);
      }
   }

   if (flush_denorms && ctx->program->chip_class < GFX9) {
      assert(dst.size() == 1);
      Temp tmp = bld.vop2(opc, bld.def(v1), op[0], op[1]);
      bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
   } else {
      if (nuw) {
         bld.nuw().vop2(opc, Definition(dst), op[0], op[1]);
      } else {
         bld.vop2(opc, Definition(dst), op[0], op[1]);
      }
   }
}

void
emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;

   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   if (src1.type() == RegType::sgpr) {
      assert(src0.type() == RegType::vgpr);
      std::swap(src0, src1);
   }

   Temp src00 = bld.tmp(src0.type(), 1);
   Temp src01 = bld.tmp(src0.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
   Temp src10 = bld.tmp(v1);
   Temp src11 = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
   Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
   Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
}

void
emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                       bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)
{
   assert(num_sources == 2 || num_sources == 3);
   Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
   bool has_sgpr = false;
   for (unsigned i = 0; i < num_sources; i++) {
      src[i] = get_alu_src(ctx, instr->src[swap_srcs ? 1 - i : i]);
      if (has_sgpr)
         src[i] = as_vgpr(ctx, src[i]);
      else
         has_sgpr = src[i].type() == RegType::sgpr;
   }

   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   if (flush_denorms && ctx->program->chip_class < GFX9) {
      Temp tmp;
      if (num_sources == 3)
         tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);
      else
         tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);
      if (dst.size() == 1)
         bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
      else
         bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand::c64(0x3FF0000000000000), tmp);
   } else if (num_sources == 3) {
      bld.vop3(op, Definition(dst), src[0], src[1], src[2]);
   } else {
      bld.vop3(op, Definition(dst), src[0], src[1]);
   }
}

Builder::Result
emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                       bool swap_srcs = false)
{
   Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
   Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
   if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
      src1 = as_vgpr(ctx, src1);
   assert(instr->dest.dest.ssa.num_components == 2);

   /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
   unsigned opsel_lo =
      (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
   unsigned opsel_hi =
      (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);

   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
   emit_split_vector(ctx, dst, 2);
   return res;
}

void
emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp)
{
   Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
   bool has_sgpr = false;
   for (unsigned i = 0; i < 3; i++) {
      src[i] = get_alu_src(ctx, instr->src[i]);
      if (has_sgpr)
         src[i] = as_vgpr(ctx, src[i]);
      else
         has_sgpr = src[i].type() == RegType::sgpr;
   }

   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7).instr->vop3p().clamp = clamp;
}

void
emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   if (dst.type() == RegType::sgpr)
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
                 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
   else
      bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
}

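/* Builds a VOPC comparison. VOPC can't take an sgpr second operand, so
 * either the operands are swapped (mirroring the opcode, e.g. v_cmp_lt ->
 * v_cmp_gt) or the sgpr source is copied to a vgpr. */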
void
emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   assert(src0.size() == src1.size());

   aco_ptr<Instruction> vopc;
   if (src1.type() == RegType::sgpr) {
      if (src0.type() == RegType::vgpr) {
         /* to swap the operands, we might also have to change the opcode */
         switch (op) {
         case aco_opcode::v_cmp_lt_f16: op = aco_opcode::v_cmp_gt_f16; break;
         case aco_opcode::v_cmp_ge_f16: op = aco_opcode::v_cmp_le_f16; break;
         case aco_opcode::v_cmp_lt_i16: op = aco_opcode::v_cmp_gt_i16; break;
         case aco_opcode::v_cmp_ge_i16: op = aco_opcode::v_cmp_le_i16; break;
         case aco_opcode::v_cmp_lt_u16: op = aco_opcode::v_cmp_gt_u16; break;
         case aco_opcode::v_cmp_ge_u16: op = aco_opcode::v_cmp_le_u16; break;
         case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break;
         case aco_opcode::v_cmp_ge_f32: op = aco_opcode::v_cmp_le_f32; break;
         case aco_opcode::v_cmp_lt_i32: op = aco_opcode::v_cmp_gt_i32; break;
         case aco_opcode::v_cmp_ge_i32: op = aco_opcode::v_cmp_le_i32; break;
         case aco_opcode::v_cmp_lt_u32: op = aco_opcode::v_cmp_gt_u32; break;
         case aco_opcode::v_cmp_ge_u32: op = aco_opcode::v_cmp_le_u32; break;
         case aco_opcode::v_cmp_lt_f64: op = aco_opcode::v_cmp_gt_f64; break;
         case aco_opcode::v_cmp_ge_f64: op = aco_opcode::v_cmp_le_f64; break;
         case aco_opcode::v_cmp_lt_i64: op = aco_opcode::v_cmp_gt_i64; break;
         case aco_opcode::v_cmp_ge_i64: op = aco_opcode::v_cmp_le_i64; break;
         case aco_opcode::v_cmp_lt_u64: op = aco_opcode::v_cmp_gt_u64; break;
         case aco_opcode::v_cmp_ge_u64: op = aco_opcode::v_cmp_le_u64; break;
         default: /* eq and ne are commutative */ break;
         }
         Temp t = src0;
         src0 = src1;
         src1 = t;
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   Builder bld(ctx->program, ctx->block);
   bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
}

void
emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   Builder bld(ctx->program, ctx->block);

   assert(dst.regClass() == bld.lm);
   assert(src0.type() == RegType::sgpr);
   assert(src1.type() == RegType::sgpr);
   assert(src0.regClass() == src1.regClass());

   /* Emit the SALU comparison instruction */
   Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
   /* Turn the result into a per-lane bool */
   bool_to_vector_condition(ctx, cmp, dst);
}

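/* Selects between the SALU and VALU variant of a comparison based on source
 * bit size and divergence: uniform 32/64-bit inputs use the s32/s64 opcode
 * with the SCC result broadcast to a lane mask, everything else uses VOPC. */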
void
emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,
                aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes,
                aco_opcode s64_op = aco_opcode::num_opcodes)
{
   aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64   ? s64_op
                     : instr->src[0].src.ssa->bit_size == 32 ? s32_op
                                                             : aco_opcode::num_opcodes;
   aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64   ? v64_op
                     : instr->src[0].src.ssa->bit_size == 32 ? v32_op
                                                             : v16_op;
   bool use_valu = s_op == aco_opcode::num_opcodes || nir_dest_is_divergent(instr->dest.dest) ||
                   get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
                   get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
   aco_opcode op = use_valu ? v_op : s_op;
   assert(op != aco_opcode::num_opcodes);
   assert(dst.regClass() == ctx->program->lane_mask);

   if (use_valu)
      emit_vopc_instruction(ctx, instr, op, dst);
   else
      emit_sopc_instruction(ctx, instr, op, dst);
}

void
emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,
                   Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   assert(dst.regClass() == bld.lm);
   assert(src0.regClass() == bld.lm);
   assert(src1.regClass() == bld.lm);

   bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
}

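/* Lowers nir_op_bcsel. VGPR destinations use v_cndmask_b32 (per 32-bit half
 * for 64-bit values), uniform selections use s_cselect, and divergent
 * boolean selections are expressed with lane-mask arithmetic. */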
void
emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp cond = get_alu_src(ctx, instr->src[0]);
   Temp then = get_alu_src(ctx, instr->src[1]);
   Temp els = get_alu_src(ctx, instr->src[2]);

   assert(cond.regClass() == bld.lm);

   if (dst.type() == RegType::vgpr) {
      aco_ptr<Instruction> bcsel;
      if (dst.size() == 1) {
         then = as_vgpr(ctx, then);
         els = as_vgpr(ctx, els);

         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
      } else if (dst.size() == 2) {
         Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
         Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);

         Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
         Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);

         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      return;
   }

   if (instr->dest.dest.ssa.bit_size == 1) {
      assert(dst.regClass() == bld.lm);
      assert(then.regClass() == bld.lm);
      assert(els.regClass() == bld.lm);
   }

   if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
      if (dst.regClass() == s1 || dst.regClass() == s2) {
         assert((then.regClass() == s1 || then.regClass() == s2) &&
                els.regClass() == then.regClass());
         assert(dst.size() == then.size());
         aco_opcode op =
            dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
         bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
      } else {
         isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
      }
      return;
   }

   /* divergent boolean bcsel
    * this implements bcsel on bools: dst = s0 ? s1 : s2,
    * which is computed as: dst = (s0 & s1) | (~s0 & s2) */
   assert(instr->dest.dest.ssa.bit_size == 1);

   if (cond.id() != then.id())
      then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);

   if (cond.id() == els.id())
      bld.copy(Definition(dst), then);
   else
      bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
               bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
}

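/* Computes `op` on a value that may be a 32-bit denormal: the input is
 * multiplied by 2^24 first (so the transcendental instruction, which flushes
 * denormals, sees a normal number), the result is rescaled with `undo`, and
 * the scaled path is only selected when v_cmp_class reports a denormal
 * input. */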
void
emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op,
               uint32_t undo)
{
   /* multiply by 16777216 to handle denormals */
   Temp is_denormal =
      bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), as_vgpr(ctx, val),
               bld.copy(bld.def(v1), Operand::c32((1u << 7) | (1u << 4))));
   Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x4b800000u), val);
   scaled = bld.vop1(op, bld.def(v1), scaled);
   scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(undo), scaled);

   Temp not_scaled = bld.vop1(op, bld.def(v1), val);

   bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
}

void
emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_rcp_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
}

void
emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_rsq_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
}

void
emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
}

void
emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_log_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
}

Temp
emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->chip_class >= GFX7)
      return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);

   /* GFX6 doesn't support V_TRUNC_F64, lower it. */
   /* TODO: create more efficient code! */
   if (val.type() == RegType::sgpr)
      val = as_vgpr(ctx, val);

   /* Split the input value. */
   Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);

   /* Extract the exponent and compute the unbiased value. */
   Temp exponent =
      bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u));
   exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u));

   /* Extract the fractional part. */
   Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
                                Operand::c32(0x000fffffu));
   fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);

   Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi),
              fract_mask);

   Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
   Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
   fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
   tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
   fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);

   /* Get the sign bit. */
   Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi);

   /* Decide the operation to apply depending on the unbiased exponent. */
   Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent,
                               Operand::zero());
   Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo,
                          bld.copy(bld.def(v1), Operand::zero()), exp_lt0);
   Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
   Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u));
   dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
   dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);

   return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
}

Temp
emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->chip_class >= GFX7)
      return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);

   /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
    * lowered at NIR level for precision reasons). */
   Temp src0 = as_vgpr(ctx, val);

   Temp mask = bld.copy(bld.def(s1), Operand::c32(3u)); /* isnan */
   Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u),
                             Operand::c32(0x3fefffffu));

   Temp isnan =
      bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
   Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
   Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);

   Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
   Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);

   Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
   Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);

   Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);

   Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
   add->vop3().neg[1] = true;

   return add->definitions[0].getTemp();
}

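/* Saturating 32-bit unsigned add. Before GFX8 there is no clamp bit on the
 * VALU add, so the carry output selects 0xffffffff instead; GFX8+ sets
 * clamp=1 on v_add_co_u32 (or v_add_u32 on GFX9+). */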
1294 Temp
uadd32_sat(Builder & bld,Definition dst,Temp src0,Temp src1)1295 uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1296 {
1297    if (bld.program->chip_class < GFX8) {
1298       Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);
1299       return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1),
1300                           add.def(1).getTemp());
1301    }
1302 
1303    Builder::Result add(NULL);
1304    if (bld.program->chip_class >= GFX9) {
1305       add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);
1306    } else {
1307       add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.hint_vcc(bld.def(bld.lm)), src0, src1);
1308    }
1309    add.instr->vop3().clamp = 1;
1310    return dst.getTemp();
1311 }
1312 
1313 void
1314 visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
1315 {
1316    if (!instr->dest.dest.is_ssa) {
1317       isel_err(&instr->instr, "nir alu dst not in ssa");
1318       abort();
1319    }
1320    Builder bld(ctx->program, ctx->block);
1321    bld.is_precise = instr->exact;
1322    Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
1323    switch (instr->op) {
1324    case nir_op_vec2:
1325    case nir_op_vec3:
1326    case nir_op_vec4:
1327    case nir_op_vec5: {
1328       std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
1329       unsigned num = instr->dest.dest.ssa.num_components;
1330       for (unsigned i = 0; i < num; ++i)
1331          elems[i] = get_alu_src(ctx, instr->src[i]);
1332 
1333       if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
1334          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
1335             aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
1336          RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
1337          for (unsigned i = 0; i < num; ++i) {
1338             if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1339                elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);
1340             vec->operands[i] = Operand{elems[i]};
1341          }
1342          vec->definitions[0] = Definition(dst);
1343          ctx->block->instructions.emplace_back(std::move(vec));
1344          ctx->allocated_vec.emplace(dst.id(), elems);
1345       } else {
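         /* Sub-dword SGPR destination: pack the elements manually. Variable
          * elements are masked and shifted into place, constant elements are
          * accumulated in const_vals and OR'd in afterwards. On GFX9+ two
          * 16-bit halves can be combined with s_pack_ll_b32_b16. */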
1346          bool use_s_pack = ctx->program->chip_class >= GFX9;
1347          Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->dest.dest.ssa.bit_size) - 1));
1348 
1349          std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed;
1350          uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};
1351          for (unsigned i = 0; i < num; i++) {
1352             unsigned packed_size = use_s_pack ? 16 : 32;
1353             unsigned idx = i * instr->dest.dest.ssa.bit_size / packed_size;
1354             unsigned offset = i * instr->dest.dest.ssa.bit_size % packed_size;
1355             if (nir_src_is_const(instr->src[i].src)) {
1356                const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;
1357                continue;
1358             }
1359 
1360             if (offset != packed_size - instr->dest.dest.ssa.bit_size)
1361                elems[i] =
1362                   bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1363 
1364             if (offset)
1365                elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1366                                    Operand::c32(offset));
1367 
1368             if (packed[idx].id())
1369                packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1370                                       packed[idx]);
1371             else
1372                packed[idx] = elems[i];
1373          }
1374 
1375          if (use_s_pack) {
1376             for (unsigned i = 0; i < dst.size(); i++) {
1377                bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id();
1378 
1379                if (packed[i * 2].id() && packed[i * 2 + 1].id())
1380                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1381                                        packed[i * 2 + 1]);
1382                else if (packed[i * 2 + 1].id())
1383                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1),
1384                                        Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]);
1385                else if (packed[i * 2].id())
1386                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1387                                        Operand::c32(const_vals[i * 2 + 1]));
1388 
1389                if (same)
1390                   const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16);
1391                else
1392                   const_vals[i] = 0;
1393             }
1394          }
1395 
1396          for (unsigned i = 0; i < dst.size(); i++) {
1397             if (const_vals[i] && packed[i].id())
1398                packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
1399                                     Operand::c32(const_vals[i]), packed[i]);
1400             else if (!packed[i].id())
1401                packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i]));
1402          }
1403 
1404          if (dst.size() == 1)
1405             bld.copy(Definition(dst), packed[0]);
1406          else if (dst.size() == 2)
1407             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1]);
1408          else
1409             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1],
1410                        packed[2]);
1411       }
1412       break;
1413    }
1414    case nir_op_mov: {
1415       Temp src = get_alu_src(ctx, instr->src[0]);
1416       if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {
1417          /* use size() instead of bytes() for 8/16-bit */
1418          assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");
1419          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1420       } else {
1421          assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");
1422          bld.copy(Definition(dst), src);
1423       }
1424       break;
1425    }
1426    case nir_op_inot: {
1427       Temp src = get_alu_src(ctx, instr->src[0]);
1428       if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1429          emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1430       } else if (dst.regClass() == v2) {
1431          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1432          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1433          lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1434          hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1435          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1436       } else if (dst.type() == RegType::sgpr) {
1437          aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1438          bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1439       } else {
1440          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1441       }
1442       break;
1443    }
1444    case nir_op_iabs: {
1445       Temp src = get_alu_src(ctx, instr->src[0]);
1446       if (dst.regClass() == s1) {
1447          bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);
1448       } else if (dst.regClass() == v1) {
1449          bld.vop2(aco_opcode::v_max_i32, Definition(dst), src,
1450                   bld.vsub32(bld.def(v1), Operand::zero(), src));
1451       } else {
1452          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1453       }
1454       break;
1455    }
1456    case nir_op_isign: {
1457       Temp src = get_alu_src(ctx, instr->src[0]);
1458       if (dst.regClass() == s1) {
1459          Temp tmp =
1460             bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1));
1461          bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u));
1462       } else if (dst.regClass() == s2) {
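         /* 64-bit sign: an arithmetic shift by 63 yields -1 for negative
          * inputs and 0 otherwise; OR-ing in (src != 0) turns that into +1
          * for positive inputs. */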
1463          Temp neg =
1464             bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u));
1465          Temp neqz;
1466          if (ctx->program->chip_class >= GFX8)
1467             neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero());
1468          else
1469             neqz =
1470                bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero())
1471                   .def(1)
1472                   .getTemp();
1473          /* SCC gets zero-extended to 64 bit */
1474          bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1475       } else if (dst.regClass() == v1) {
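         /* med3(-1, x, 1) clamps x into [-1, 1], which for integers is
          * exactly sign(x). */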
1476          bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u));
1477       } else if (dst.regClass() == v2) {
1478          Temp upper = emit_extract_vector(ctx, src, 1, v1);
1479          Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper);
1480          Temp gtz =
1481             bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), src);
1482          Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz);
1483          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz);
1484          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1485       } else {
1486          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1487       }
1488       break;
1489    }
1490    case nir_op_imax: {
1491       if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1492          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
1493       } else if (dst.regClass() == v2b) {
1494          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
1495       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1496          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
1497       } else if (dst.regClass() == v1) {
1498          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1499       } else if (dst.regClass() == s1) {
1500          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1501       } else {
1502          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1503       }
1504       break;
1505    }
1506    case nir_op_umax: {
1507       if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1508          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
1509       } else if (dst.regClass() == v2b) {
1510          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
1511       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1512          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
1513       } else if (dst.regClass() == v1) {
1514          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1515       } else if (dst.regClass() == s1) {
1516          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1517       } else {
1518          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1519       }
1520       break;
1521    }
1522    case nir_op_imin: {
1523       if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1524          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
1525       } else if (dst.regClass() == v2b) {
1526          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
1527       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1528          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
1529       } else if (dst.regClass() == v1) {
1530          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1531       } else if (dst.regClass() == s1) {
1532          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1533       } else {
1534          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1535       }
1536       break;
1537    }
1538    case nir_op_umin: {
1539       if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1540          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
1541       } else if (dst.regClass() == v2b) {
1542          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
1543       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1544          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
1545       } else if (dst.regClass() == v1) {
1546          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1547       } else if (dst.regClass() == s1) {
1548          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1549       } else {
1550          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1551       }
1552       break;
1553    }
1554    case nir_op_ior: {
1555       if (instr->dest.dest.ssa.bit_size == 1) {
1556          emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1557       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1558          emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1559       } else if (dst.regClass() == v2) {
1560          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1561       } else if (dst.regClass() == s1) {
1562          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1563       } else if (dst.regClass() == s2) {
1564          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1565       } else {
1566          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1567       }
1568       break;
1569    }
1570    case nir_op_iand: {
1571       if (instr->dest.dest.ssa.bit_size == 1) {
1572          emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1573       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1574          emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1575       } else if (dst.regClass() == v2) {
1576          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1577       } else if (dst.regClass() == s1) {
1578          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1579       } else if (dst.regClass() == s2) {
1580          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1581       } else {
1582          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1583       }
1584       break;
1585    }
1586    case nir_op_ixor: {
1587       if (instr->dest.dest.ssa.bit_size == 1) {
1588          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1589       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1590          emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1591       } else if (dst.regClass() == v2) {
1592          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1593       } else if (dst.regClass() == s1) {
1594          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1595       } else if (dst.regClass() == s2) {
1596          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1597       } else {
1598          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1599       }
1600       break;
1601    }
1602    case nir_op_ushr: {
1603       if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1604          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);
1605       } else if (dst.regClass() == v2b) {
1606          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);
1607       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1608          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);
1609       } else if (dst.regClass() == v1) {
1610          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1611       } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
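         /* The "*rev" shift opcodes take the shift amount in src0, hence the
          * swapped operand order. */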
1612          bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1613                   get_alu_src(ctx, instr->src[0]));
1614       } else if (dst.regClass() == v2) {
1615          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);
1616       } else if (dst.regClass() == s2) {
1617          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1618       } else if (dst.regClass() == s1) {
1619          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1620       } else {
1621          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1622       }
1623       break;
1624    }
1625    case nir_op_ishl: {
1626       if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1627          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);
1628       } else if (dst.regClass() == v2b) {
1629          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
1630       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1631          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);
1632       } else if (dst.regClass() == v1) {
1633          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,
1634                                false, 2);
1635       } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1636          bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1637                   get_alu_src(ctx, instr->src[0]));
1638       } else if (dst.regClass() == v2) {
1639          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
1640       } else if (dst.regClass() == s1) {
1641          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
1642       } else if (dst.regClass() == s2) {
1643          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1644       } else {
1645          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1646       }
1647       break;
1648    }
1649    case nir_op_ishr: {
1650       if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1651          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);
1652       } else if (dst.regClass() == v2b) {
1653          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);
1654       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1655          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);
1656       } else if (dst.regClass() == v1) {
1657          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1658       } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1659          bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1660                   get_alu_src(ctx, instr->src[0]));
1661       } else if (dst.regClass() == v2) {
1662          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);
1663       } else if (dst.regClass() == s1) {
1664          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1665       } else if (dst.regClass() == s2) {
1666          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1667       } else {
1668          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1669       }
1670       break;
1671    }
1672    case nir_op_find_lsb: {
1673       Temp src = get_alu_src(ctx, instr->src[0]);
1674       if (src.regClass() == s1) {
1675          bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1676       } else if (src.regClass() == v1) {
1677          emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1678       } else if (src.regClass() == s2) {
1679          bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1680       } else {
1681          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1682       }
1683       break;
1684    }
1685    case nir_op_ufind_msb:
1686    case nir_op_ifind_msb: {
1687       Temp src = get_alu_src(ctx, instr->src[0]);
1688       if (src.regClass() == s1 || src.regClass() == s2) {
1689          aco_opcode op = src.regClass() == s2
1690                             ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64
1691                                                              : aco_opcode::s_flbit_i32_i64)
1692                             : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32
1693                                                              : aco_opcode::s_flbit_i32);
1694          Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1695 
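         /* s_flbit returns the leading zero (or, for the signed variants,
          * leading sign-bit) count, and ~0u when no bit is found. The MSB
          * index is therefore (bits - 1) - count; the subtraction's borrow
          * flags the "no bit set" case, for which -1 is selected below. */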
1696          Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1697                                         Operand::c32(src.size() * 32u - 1u), msb_rev);
1698          Temp msb = sub.def(0).getTemp();
1699          Temp carry = sub.def(1).getTemp();
1700 
1701          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb,
1702                   bld.scc(carry));
1703       } else if (src.regClass() == v1) {
1704          aco_opcode op =
1705             instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1706          Temp msb_rev = bld.tmp(v1);
1707          emit_vop1_instruction(ctx, instr, op, msb_rev);
1708          Temp msb = bld.tmp(v1);
1709          Temp carry =
1710             bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp();
1711          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand::c32(-1), carry);
1712       } else if (src.regClass() == v2) {
1713          aco_opcode op =
1714             instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1715 
1716          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1717          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1718 
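         /* Search the high dword first; if it contains no set bit (ffbh
          * yields ~0u), fall back to 32 + ffbh(lo). The saturating add keeps
          * the "not found" result at ~0u. */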
1719          lo = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)),
1720                          bld.vop1(op, bld.def(v1), lo));
1721          hi = bld.vop1(op, bld.def(v1), hi);
1722          Temp found_hi = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::c32(-1), hi);
1723 
1724          Temp msb_rev = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lo, hi, found_hi);
1725 
1726          Temp msb = bld.tmp(v1);
1727          Temp carry =
1728             bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp();
1729          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand::c32(-1), carry);
1730       } else {
1731          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1732       }
1733       break;
1734    }
1735    case nir_op_bitfield_reverse: {
1736       if (dst.regClass() == s1) {
1737          bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1738       } else if (dst.regClass() == v1) {
1739          bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1740       } else {
1741          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1742       }
1743       break;
1744    }
1745    case nir_op_iadd: {
1746       if (dst.regClass() == s1) {
1747          emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1748          break;
1749       } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
1750          emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
1751          break;
1752       } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
1753          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
1754          break;
1755       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1756          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1757          break;
1758       }
1759 
1760       Temp src0 = get_alu_src(ctx, instr->src[0]);
1761       Temp src1 = get_alu_src(ctx, instr->src[1]);
1762       if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
1763          bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1764          break;
1765       }
1766 
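      /* 64-bit addition is split into dword halves: the SALU chains the
       * carry through SCC, the VALU through a carry-out/carry-in pair. */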
1767       assert(src0.size() == 2 && src1.size() == 2);
1768       Temp src00 = bld.tmp(src0.type(), 1);
1769       Temp src01 = bld.tmp(dst.type(), 1);
1770       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1771       Temp src10 = bld.tmp(src1.type(), 1);
1772       Temp src11 = bld.tmp(dst.type(), 1);
1773       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1774 
1775       if (dst.regClass() == s2) {
1776          Temp carry = bld.tmp(s1);
1777          Temp dst0 =
1778             bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1779          Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1780                               bld.scc(carry));
1781          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1782       } else if (dst.regClass() == v2) {
1783          Temp dst0 = bld.tmp(v1);
1784          Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1785          Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1786          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1787       } else {
1788          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1789       }
1790       break;
1791    }
1792    case nir_op_uadd_sat: {
1793       Temp src0 = get_alu_src(ctx, instr->src[0]);
1794       Temp src1 = get_alu_src(ctx, instr->src[1]);
1795       if (dst.regClass() == s1) {
1796          Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1797          bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
1798          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp,
1799                   bld.scc(carry));
1800       } else if (dst.regClass() == v2b) {
1801          Instruction* add_instr;
1802          if (ctx->program->chip_class >= GFX10) {
1803             add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr;
1804          } else {
1805             if (src1.type() == RegType::sgpr)
1806                std::swap(src0, src1);
1807             add_instr =
1808                bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
1809          }
1810          add_instr->vop3().clamp = 1;
1811       } else if (dst.regClass() == v1) {
1812          uadd32_sat(bld, Definition(dst), src0, src1);
1813       } else {
1814          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1815       }
1816       break;
1817    }
1818    case nir_op_iadd_sat: {
1819       Temp src0 = get_alu_src(ctx, instr->src[0]);
1820       Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1821       if (dst.regClass() == v2b) {
1822          Instruction* add_instr =
1823             bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr;
1824          add_instr->vop3().clamp = 1;
1825       } else if (dst.regClass() == v1) {
1826          Instruction* add_instr =
1827             bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr;
1828          add_instr->vop3().clamp = 1;
1829       } else {
1830          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1831       }
1832       break;
1833    }
1834    case nir_op_uadd_carry: {
1835       Temp src0 = get_alu_src(ctx, instr->src[0]);
1836       Temp src1 = get_alu_src(ctx, instr->src[1]);
1837       if (dst.regClass() == s1) {
1838          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1839          break;
1840       }
1841       if (dst.regClass() == v1) {
1842          Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1843          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
1844                       carry);
1845          break;
1846       }
1847 
1848       Temp src00 = bld.tmp(src0.type(), 1);
1849       Temp src01 = bld.tmp(dst.type(), 1);
1850       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1851       Temp src10 = bld.tmp(src1.type(), 1);
1852       Temp src11 = bld.tmp(dst.type(), 1);
1853       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1854       if (dst.regClass() == s2) {
1855          Temp carry = bld.tmp(s1);
1856          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1857          carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
1858                           bld.scc(carry))
1859                     .def(1)
1860                     .getTemp();
1861          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
1862       } else if (dst.regClass() == v2) {
1863          Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1864          carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1865          carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
1866                               Operand::c32(1u), carry);
1867          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
1868       } else {
1869          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1870       }
1871       break;
1872    }
1873    case nir_op_isub: {
1874       if (dst.regClass() == s1) {
1875          emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1876          break;
1877       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1878          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
1879          break;
1880       }
1881 
1882       Temp src0 = get_alu_src(ctx, instr->src[0]);
1883       Temp src1 = get_alu_src(ctx, instr->src[1]);
1884       if (dst.regClass() == v1) {
1885          bld.vsub32(Definition(dst), src0, src1);
1886          break;
1887       } else if (dst.bytes() <= 2) {
1888          if (ctx->program->chip_class >= GFX10)
1889             bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
1890          else if (src1.type() == RegType::sgpr)
1891             bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
1892          else if (ctx->program->chip_class >= GFX8)
1893             bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
1894          else
1895             bld.vsub32(Definition(dst), src0, src1);
1896          break;
1897       }
1898 
1899       Temp src00 = bld.tmp(src0.type(), 1);
1900       Temp src01 = bld.tmp(dst.type(), 1);
1901       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1902       Temp src10 = bld.tmp(src1.type(), 1);
1903       Temp src11 = bld.tmp(dst.type(), 1);
1904       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1905       if (dst.regClass() == s2) {
1906          Temp borrow = bld.tmp(s1);
1907          Temp dst0 =
1908             bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1909          Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1910                               bld.scc(borrow));
1911          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1912       } else if (dst.regClass() == v2) {
1913          Temp lower = bld.tmp(v1);
1914          Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1915          Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1916          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1917       } else {
1918          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1919       }
1920       break;
1921    }
1922    case nir_op_usub_borrow: {
1923       Temp src0 = get_alu_src(ctx, instr->src[0]);
1924       Temp src1 = get_alu_src(ctx, instr->src[1]);
1925       if (dst.regClass() == s1) {
1926          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1927          break;
1928       } else if (dst.regClass() == v1) {
1929          Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1930          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
1931                       borrow);
1932          break;
1933       }
1934 
1935       Temp src00 = bld.tmp(src0.type(), 1);
1936       Temp src01 = bld.tmp(dst.type(), 1);
1937       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1938       Temp src10 = bld.tmp(src1.type(), 1);
1939       Temp src11 = bld.tmp(dst.type(), 1);
1940       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1941       if (dst.regClass() == s2) {
1942          Temp borrow = bld.tmp(s1);
1943          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1944          borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
1945                            bld.scc(borrow))
1946                      .def(1)
1947                      .getTemp();
1948          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
1949       } else if (dst.regClass() == v2) {
1950          Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1951          borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1952          borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
1953                                Operand::c32(1u), borrow);
1954          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
1955       } else {
1956          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1957       }
1958       break;
1959    }
1960    case nir_op_imul: {
1961       if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
1962          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
1963       } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
1964          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
1965       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1966          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
1967       } else if (dst.type() == RegType::vgpr) {
1968          uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
1969          uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
1970 
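         /* If both sources provably fit in 24 bits, v_mul_u32_u24 (typically
          * full-rate) is preferred over the slower v_mul_lo_u32. Multiplies
          * by a constant can instead be strength-reduced by v_mul_imm. */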
1971          if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
1972             bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff;
1973             emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst,
1974                                   true /* commutative */, false, false, nuw_16bit);
1975          } else if (nir_src_is_const(instr->src[0].src)) {
1976             bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
1977                           nir_src_as_uint(instr->src[0].src), false);
1978          } else if (nir_src_is_const(instr->src[1].src)) {
1979             bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),
1980                           nir_src_as_uint(instr->src[1].src), false);
1981          } else {
1982             emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
1983          }
1984       } else if (dst.regClass() == s1) {
1985          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1986       } else {
1987          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1988       }
1989       break;
1990    }
1991    case nir_op_umul_high: {
1992       if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1993          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);
1994       } else if (dst.bytes() == 4) {
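         /* There is no scalar mul-hi before GFX9, so compute on the VALU and
          * copy the (uniform) result back to an SGPR if needed. */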
1995          uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
1996          uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
1997 
1998          Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst;
1999          if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
2000             emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);
2001          } else {
2002             emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);
2003          }
2004 
2005          if (dst.regClass() == s1)
2006             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2007       } else {
2008          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2009       }
2010       break;
2011    }
2012    case nir_op_imul_high: {
2013       if (dst.regClass() == v1) {
2014          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);
2015       } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
2016          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);
2017       } else if (dst.regClass() == s1) {
2018          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
2019                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
2020          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2021       } else {
2022          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2023       }
2024       break;
2025    }
2026    case nir_op_fmul: {
2027       if (dst.regClass() == v2b) {
2028          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
2029       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2030          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);
2031       } else if (dst.regClass() == v1) {
2032          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
2033       } else if (dst.regClass() == v2) {
2034          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64, dst);
2035       } else {
2036          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2037       }
2038       break;
2039    }
2040    case nir_op_fadd: {
2041       if (dst.regClass() == v2b) {
2042          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
2043       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2044          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2045       } else if (dst.regClass() == v1) {
2046          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
2047       } else if (dst.regClass() == v2) {
2048          emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64, dst);
2049       } else {
2050          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2051       }
2052       break;
2053    }
2054    case nir_op_fsub: {
2055       if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2056          Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2057          VOP3P_instruction& sub = add->vop3p();
2058          sub.neg_lo[1] = true;
2059          sub.neg_hi[1] = true;
2060          break;
2061       }
2062 
2063       Temp src0 = get_alu_src(ctx, instr->src[0]);
2064       Temp src1 = get_alu_src(ctx, instr->src[1]);
2065       if (dst.regClass() == v2b) {
2066          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2067             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
2068          else
2069             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
2070       } else if (dst.regClass() == v1) {
2071          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2072             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
2073          else
2074             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
2075       } else if (dst.regClass() == v2) {
2076          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0),
2077                                      as_vgpr(ctx, src1));
2078          add->vop3().neg[1] = true;
2079       } else {
2080          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2081       }
2082       break;
2083    }
2084    case nir_op_fmax: {
2085       if (dst.regClass() == v2b) {
2086          // TODO: check fp_mode.must_flush_denorms16_64
2087          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
2088       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2089          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);
2090       } else if (dst.regClass() == v1) {
2091          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,
2092                                ctx->block->fp_mode.must_flush_denorms32);
2093       } else if (dst.regClass() == v2) {
2094          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst,
2095                                 ctx->block->fp_mode.must_flush_denorms16_64);
2096       } else {
2097          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2098       }
2099       break;
2100    }
2101    case nir_op_fmin: {
2102       if (dst.regClass() == v2b) {
2103          // TODO: check fp_mode.must_flush_denorms16_64
2104          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
2105       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2106          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);
2107       } else if (dst.regClass() == v1) {
2108          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,
2109                                ctx->block->fp_mode.must_flush_denorms32);
2110       } else if (dst.regClass() == v2) {
2111          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst,
2112                                 ctx->block->fp_mode.must_flush_denorms16_64);
2113       } else {
2114          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2115       }
2116       break;
2117    }
2118    case nir_op_sdot_4x8_iadd: {
2119       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false);
2120       break;
2121    }
2122    case nir_op_sdot_4x8_iadd_sat: {
2123       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true);
2124       break;
2125    }
2126    case nir_op_udot_4x8_uadd: {
2127       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false);
2128       break;
2129    }
2130    case nir_op_udot_4x8_uadd_sat: {
2131       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true);
2132       break;
2133    }
2134    case nir_op_sdot_2x16_iadd: {
2135       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false);
2136       break;
2137    }
2138    case nir_op_sdot_2x16_iadd_sat: {
2139       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true);
2140       break;
2141    }
2142    case nir_op_udot_2x16_uadd: {
2143       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false);
2144       break;
2145    }
2146    case nir_op_udot_2x16_uadd_sat: {
2147       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true);
2148       break;
2149    }
2150    case nir_op_cube_face_coord_amd: {
2151       Temp in = get_alu_src(ctx, instr->src[0], 3);
2152       Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2153                      emit_extract_vector(ctx, in, 2, v1)};
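      /* v_cubema returns twice the major-axis coordinate, so multiplying
       * sc/tc by its reciprocal and adding 0.5 remaps the face coordinates
       * into [0, 1]. */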
2154       Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
2155       ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
2156       Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
2157       Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
2158       sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/),
2159                     bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, ma));
2160       tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/),
2161                     bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, ma));
2162       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
2163       break;
2164    }
2165    case nir_op_cube_face_index_amd: {
2166       Temp in = get_alu_src(ctx, instr->src[0], 3);
2167       Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2168                      emit_extract_vector(ctx, in, 2, v1)};
2169       bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
2170       break;
2171    }
2172    case nir_op_bcsel: {
2173       emit_bcsel(ctx, instr, dst);
2174       break;
2175    }
2176    case nir_op_frsq: {
2177       if (dst.regClass() == v2b) {
2178          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
2179       } else if (dst.regClass() == v1) {
2180          Temp src = get_alu_src(ctx, instr->src[0]);
2181          emit_rsq(ctx, bld, Definition(dst), src);
2182       } else if (dst.regClass() == v2) {
2183          /* Lowered at NIR level for precision reasons. */
2184          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
2185       } else {
2186          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2187       }
2188       break;
2189    }
2190    case nir_op_fneg: {
2191       if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2192          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2193          bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0xBC00),
2194                    instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2195          emit_split_vector(ctx, dst, 2);
2196          break;
2197       }
2198       Temp src = get_alu_src(ctx, instr->src[0]);
2199       if (dst.regClass() == v2b) {
2200          bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));
2201       } else if (dst.regClass() == v1) {
2202          bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u),
2203                   as_vgpr(ctx, src));
2204       } else if (dst.regClass() == v2) {
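         /* Negate by flipping the sign bit of the high dword. If denormals
          * must be flushed, multiply by 1.0 first, since the bit trick alone
          * would not flush them. */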
2205          if (ctx->block->fp_mode.must_flush_denorms16_64)
2206             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2207                            as_vgpr(ctx, src));
2208          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2209          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2210          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper);
2211          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2212       } else {
2213          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2214       }
2215       break;
2216    }
2217    case nir_op_fabs: {
2218       Temp src = get_alu_src(ctx, instr->src[0]);
2219       if (dst.regClass() == v2b) {
2220          Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst),
2221                                          Operand::c16(0x3c00), as_vgpr(ctx, src))
2222                                .instr;
2223          mul->vop3().abs[1] = true;
2224       } else if (dst.regClass() == v1) {
2225          Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst),
2226                                          Operand::c32(0x3f800000u), as_vgpr(ctx, src))
2227                                .instr;
2228          mul->vop3().abs[1] = true;
2229       } else if (dst.regClass() == v2) {
2230          if (ctx->block->fp_mode.must_flush_denorms16_64)
2231             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2232                            as_vgpr(ctx, src));
2233          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2234          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2235          upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper);
2236          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2237       } else {
2238          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2239       }
2240       break;
2241    }
2242    case nir_op_fsat: {
2243       if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2244          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2245          Instruction* vop3p =
2246             bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2247                       instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2248          vop3p->vop3p().clamp = true;
2249          emit_split_vector(ctx, dst, 2);
2250          break;
2251       }
2252       Temp src = get_alu_src(ctx, instr->src[0]);
2253       if (dst.regClass() == v2b) {
2254          bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00),
2255                   src);
2256       } else if (dst.regClass() == v1) {
2257          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(),
2258                   Operand::c32(0x3f800000u), src);
2259          /* apparently, it is not necessary to flush denorms if this instruction is used with these
2260           * operands */
2261          // TODO: confirm that this holds under any circumstances
2262       } else if (dst.regClass() == v2) {
2263          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand::zero());
2264          add->vop3().clamp = true;
2265       } else {
2266          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2267       }
2268       break;
2269    }
2270    case nir_op_flog2: {
2271       if (dst.regClass() == v2b) {
2272          emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2273       } else if (dst.regClass() == v1) {
2274          Temp src = get_alu_src(ctx, instr->src[0]);
2275          emit_log2(ctx, bld, Definition(dst), src);
2276       } else {
2277          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2278       }
2279       break;
2280    }
2281    case nir_op_frcp: {
2282       if (dst.regClass() == v2b) {
2283          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2284       } else if (dst.regClass() == v1) {
2285          Temp src = get_alu_src(ctx, instr->src[0]);
2286          emit_rcp(ctx, bld, Definition(dst), src);
2287       } else if (dst.regClass() == v2) {
2288          /* Lowered at NIR level for precision reasons. */
2289          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2290       } else {
2291          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2292       }
2293       break;
2294    }
2295    case nir_op_fexp2: {
2296       if (dst.regClass() == v2b) {
2297          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2298       } else if (dst.regClass() == v1) {
2299          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2300       } else {
2301          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2302       }
2303       break;
2304    }
2305    case nir_op_fsqrt: {
2306       if (dst.regClass() == v2b) {
2307          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2308       } else if (dst.regClass() == v1) {
2309          Temp src = get_alu_src(ctx, instr->src[0]);
2310          emit_sqrt(ctx, bld, Definition(dst), src);
2311       } else if (dst.regClass() == v2) {
2312          /* Lowered at NIR level for precision reasons. */
2313          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2314       } else {
2315          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2316       }
2317       break;
2318    }
2319    case nir_op_ffract: {
2320       if (dst.regClass() == v2b) {
2321          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2322       } else if (dst.regClass() == v1) {
2323          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2324       } else if (dst.regClass() == v2) {
2325          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2326       } else {
2327          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2328       }
2329       break;
2330    }
2331    case nir_op_ffloor: {
2332       if (dst.regClass() == v2b) {
2333          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2334       } else if (dst.regClass() == v1) {
2335          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2336       } else if (dst.regClass() == v2) {
2337          Temp src = get_alu_src(ctx, instr->src[0]);
2338          emit_floor_f64(ctx, bld, Definition(dst), src);
2339       } else {
2340          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2341       }
2342       break;
2343    }
2344    case nir_op_fceil: {
2345       if (dst.regClass() == v2b) {
2346          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2347       } else if (dst.regClass() == v1) {
2348          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2349       } else if (dst.regClass() == v2) {
2350          if (ctx->options->chip_class >= GFX7) {
2351             emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2352          } else {
2353             /* GFX6 doesn't support V_CEIL_F64, lower it. */
2354             /* trunc = trunc(src0)
2355              * if (src0 > 0.0 && src0 != trunc)
2356              *    trunc += 1.0
2357              */
2358             Temp src0 = get_alu_src(ctx, instr->src[0]);
2359             Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2360             Temp tmp0 =
2361                bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero());
2362             Temp tmp1 =
2363                bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
2364             Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc),
2365                                  tmp0, tmp1);
            Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
                                bld.copy(bld.def(v1), Operand::zero()),
                                bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond);
            add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
                             bld.copy(bld.def(v1), Operand::zero()), add);
            bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
         }
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ftrunc: {
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
      } else if (dst.regClass() == v2) {
         Temp src = get_alu_src(ctx, instr->src[0]);
         emit_trunc_f64(ctx, bld, Definition(dst), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fround_even: {
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
      } else if (dst.regClass() == v2) {
         if (ctx->options->chip_class >= GFX7) {
            emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
         } else {
            /* GFX6 doesn't support V_RNDNE_F64, lower it. */
            Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
            Temp src0 = get_alu_src(ctx, instr->src[0]);
            bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);

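            /* Round via the 2^52 trick: v_bfi_b32 copies the source's sign
             * onto 2^52 (high dword 0x43300000), and adding then subtracting
             * this value makes the FPU round the result to an integer in the
             * round-to-nearest-even mode. Inputs with
             * |src0| > 0x432fffffffffffff (just below 2^52) are already
             * integral and are passed through unchanged. */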
            Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1),
                                    bld.copy(bld.def(s1), Operand::c32(-2u)));
            Temp bfi =
               bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,
                        bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));
            Temp tmp =
               bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0,
                        bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
            Instruction* sub =
               bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp,
                        bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
            sub->vop3().neg[1] = true;
            tmp = sub->definitions[0].getTemp();

            Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
                                Operand::c32(0x432fffffu));
            Instruction* vop3 =
               bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
            vop3->vop3().abs[0] = true;
            Temp cond = vop3->definitions[0].getTemp();

            Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
            bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
            Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo,
                                     as_vgpr(ctx, src0_lo), cond);
            Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi,
                                     as_vgpr(ctx, src0_hi), cond);

            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
         }
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fsin:
   case nir_op_fcos: {
      Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
      aco_ptr<Instruction> norm;
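      /* v_sin/v_cos take the angle in revolutions rather than radians, so
       * scale the input by 1/(2*pi) first: 0x3118 is 1/(2*pi) in f16 and
       * 0x3e22f983 in f32, despite the "half_pi" variable name. */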
      if (dst.regClass() == v2b) {
         Temp half_pi = bld.copy(bld.def(s1), Operand::c32(0x3118u));
         Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
         aco_opcode opcode =
            instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
         bld.vop1(opcode, Definition(dst), tmp);
      } else if (dst.regClass() == v1) {
         Temp half_pi = bld.copy(bld.def(s1), Operand::c32(0x3e22f983u));
         Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);

         /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
         if (ctx->options->chip_class < GFX9)
            tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);

         aco_opcode opcode =
            instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
         bld.vop1(opcode, Definition(dst), tmp);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ldexp: {
      if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
      } else if (dst.regClass() == v1) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);
      } else if (dst.regClass() == v2) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_frexp_sig: {
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);
      } else if (dst.regClass() == v2) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_frexp_exp: {
      if (instr->src[0].src.ssa->bit_size == 16) {
         Temp src = get_alu_src(ctx, instr->src[0]);
         Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
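         /* The f16 frexp exponent always fits in a signed byte, so only the
          * low 8 bits of the i16 result need to be sign-extended. */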
         tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero());
         convert_int(ctx, bld, tmp, 8, 32, true, dst);
      } else if (instr->src[0].src.ssa->bit_size == 32) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);
      } else if (instr->src[0].src.ssa->bit_size == 64) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_fsign: {
      Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
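      /* For 16/32-bit, clamp the value's bit pattern, interpreted as a
       * signed integer, to [-1, 1]: any negative float has its sign bit set
       * and so compares below zero as an integer. Converting the clamped
       * integer back to float yields -1.0, 0.0 or +1.0. */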
      if (dst.regClass() == v2b) {
         assert(ctx->program->chip_class >= GFX9);
         /* replace negative zero with positive zero */
         src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), src);
         src =
            bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src, Operand::c16(1u));
         bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
      } else if (dst.regClass() == v1) {
         src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::zero(), src);
         src =
            bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, Operand::c32(1u));
         bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
      } else if (dst.regClass() == v2) {
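         /* For f64, only the high dword needs to be computed: +1.0
          * (0x3ff00000) if the source is greater than zero, the source's own
          * high dword if it equals zero (so ±0.0 is preserved), and -1.0
          * (0xbff00000) if it is less than zero. The low dword is zero in
          * every case. */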
         Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)),
                              Operand::zero(), src);
         Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
         Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp,
                                   emit_extract_vector(ctx, src, 1, v1), cond);

         cond =
            bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), src);
         tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u));
         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);

         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_f2f16:
   case nir_op_f2f16_rtne: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 64)
         src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
      if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
         /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
          * keep value numbering and the scheduler simpler.
          */
         bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
      else
         bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
      break;
   }
   case nir_op_f2f16_rtz: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 64)
         src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
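      /* v_cvt_pkrtz rounds toward zero regardless of the current hardware
       * rounding mode, so it is used unless the mode is already
       * round-toward-zero. */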
      if (ctx->block->fp_mode.round16_64 == fp_round_tz)
         bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
      else if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9)
         bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero());
      else
         bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));
      break;
   }
   case nir_op_f2f32: {
      if (instr->src[0].src.ssa->bit_size == 16) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
      } else if (instr->src[0].src.ssa->bit_size == 64) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_f2f64: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 16)
         src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
      bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
      break;
   }
   case nir_op_i2f16: {
      assert(dst.regClass() == v2b);
      Temp src = get_alu_src(ctx, instr->src[0]);
      const unsigned input_size = instr->src[0].src.ssa->bit_size;
      if (input_size <= 16) {
         /* Expand the integer to the size expected by the int→float converter used below */
         unsigned target_size = (ctx->program->chip_class >= GFX8 ? 16 : 32);
         if (input_size != target_size) {
            src = convert_int(ctx, bld, src, input_size, target_size, true);
         }
      } else if (input_size == 64) {
         /* Truncate down to 32 bits; if any of the upper bits are relevant,
          * the value does not fall into the single-precision float range
          * anyway. SPIR-V does not mandate any specific behavior for such
          * large inputs.
          */
         src = convert_int(ctx, bld, src, 64, 32, false);
      }

      if (ctx->program->chip_class >= GFX8 && input_size <= 16) {
         bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
      } else {
         /* Convert to f32 and then down to f16. This is needed to handle
          * inputs slightly outside the range [INT16_MIN, INT16_MAX],
          * which are representable via f16 but wouldn't be converted
          * correctly by v_cvt_f16_i16.
          *
          * This is also the fallback path taken on GFX7 and earlier, which
          * do not support direct f16⟷i16 conversions.
          */
         src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src);
         bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
      }
      break;
   }
   case nir_op_i2f32: {
      assert(dst.size() == 1);
      Temp src = get_alu_src(ctx, instr->src[0]);
      const unsigned input_size = instr->src[0].src.ssa->bit_size;
      if (input_size <= 32) {
         if (input_size <= 16) {
            /* Sign-extend to 32-bits */
            src = convert_int(ctx, bld, src, input_size, 32, true);
         }
         bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
      } else {
         assert(input_size == 64);
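         /* Split the 64-bit integer and convert exactly through f64:
          * result = (double)(int32_t)hi * 2^32 + (double)(uint32_t)lo,
          * rounded to f32 only at the very end. */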
         RegClass rc = RegClass(src.type(), 1);
         Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
         lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
         upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
         upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
         upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper);
         bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper);
      }

      break;
   }
   case nir_op_i2f64: {
      if (instr->src[0].src.ssa->bit_size <= 32) {
         Temp src = get_alu_src(ctx, instr->src[0]);
         if (instr->src[0].src.ssa->bit_size <= 16)
            src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
         bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
      } else if (instr->src[0].src.ssa->bit_size == 64) {
         Temp src = get_alu_src(ctx, instr->src[0]);
         RegClass rc = RegClass(src.type(), 1);
         Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
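         /* Same split as the 64-bit i2f32 path above, but the f64 sum is the
          * final result. */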
         lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
         upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
         upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
         bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);

      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_u2f16: {
      assert(dst.regClass() == v2b);
      Temp src = get_alu_src(ctx, instr->src[0]);
      const unsigned input_size = instr->src[0].src.ssa->bit_size;
      if (input_size <= 16) {
         /* Expand integer to the size expected by the uint→float converter used below */
         unsigned target_size = (ctx->program->chip_class >= GFX8 ? 16 : 32);
         if (input_size != target_size) {
            src = convert_int(ctx, bld, src, input_size, target_size, false);
         }
      } else if (input_size == 64) {
         /* Truncate down to 32 bits; if any of the upper bits are non-zero,
          * the value does not fall into the single-precision float range
          * anyway. SPIR-V does not mandate any specific behavior for such
          * large inputs.
          */
         src = convert_int(ctx, bld, src, 64, 32, false);
      }

      if (ctx->program->chip_class >= GFX8) {
         /* float16 has a range of [0, 65519]. Converting from larger
          * inputs is UB, so we just need to consider the lower 16 bits */
         bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
      } else {
         /* GFX7 and earlier do not support direct f16⟷u16 conversions */
         src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src);
         bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
      }
      break;
   }
   case nir_op_u2f32: {
      assert(dst.size() == 1);
      Temp src = get_alu_src(ctx, instr->src[0]);
      const unsigned input_size = instr->src[0].src.ssa->bit_size;
      if (input_size == 8) {
         bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
      } else if (input_size <= 32) {
         if (input_size == 16)
            src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
         bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
      } else {
         assert(input_size == 64);
         RegClass rc = RegClass(src.type(), 1);
         Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
         lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
         upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
         upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
         upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper);
         bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper);
      }
      break;
   }
   case nir_op_u2f64: {
      if (instr->src[0].src.ssa->bit_size <= 32) {
         Temp src = get_alu_src(ctx, instr->src[0]);
         if (instr->src[0].src.ssa->bit_size <= 16)
            src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
         bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
      } else if (instr->src[0].src.ssa->bit_size == 64) {
         Temp src = get_alu_src(ctx, instr->src[0]);
         RegClass rc = RegClass(src.type(), 1);
         Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
         lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
         upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
         upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
         bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_f2i8:
   case nir_op_f2i16: {
      if (instr->src[0].src.ssa->bit_size == 16) {
         if (ctx->program->chip_class >= GFX8) {
            emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
         } else {
            /* GFX7 and earlier do not support direct f16⟷i16 conversions */
            Temp tmp = bld.tmp(v1);
            emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
            tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp);
            tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false,
                              (dst.type() == RegType::sgpr) ? Temp() : dst);
            if (dst.type() == RegType::sgpr) {
               bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
            }
         }
      } else if (instr->src[0].src.ssa->bit_size == 32) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
      } else {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
      }
      break;
   }
   case nir_op_f2u8:
   case nir_op_f2u16: {
      if (instr->src[0].src.ssa->bit_size == 16) {
         if (ctx->program->chip_class >= GFX8) {
            emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
         } else {
            /* GFX7 and earlier do not support direct f16⟷u16 conversions */
            Temp tmp = bld.tmp(v1);
            emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
            tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp);
            tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false,
                              (dst.type() == RegType::sgpr) ? Temp() : dst);
            if (dst.type() == RegType::sgpr) {
               bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
            }
         }
      } else if (instr->src[0].src.ssa->bit_size == 32) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
      } else {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
      }
      break;
   }
   case nir_op_f2i32: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 16) {
         Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
         if (dst.type() == RegType::vgpr) {
            bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
         } else {
            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
                       bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
         }
      } else if (instr->src[0].src.ssa->bit_size == 32) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
      } else if (instr->src[0].src.ssa->bit_size == 64) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_f2u32: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 16) {
         Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
         if (dst.type() == RegType::vgpr) {
            bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
         } else {
            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
                       bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
         }
      } else if (instr->src[0].src.ssa->bit_size == 32) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
      } else if (instr->src[0].src.ssa->bit_size == 64) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_f2i64: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 16)
         src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);

      if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
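         /* Software f32->i64: clamp the exponent to [0, 64], place the
          * 24-bit mantissa (with its implicit leading one) at the top of a
          * 64-bit value, shift it right by 63-exp, saturate when the
          * exponent is out of range, then apply the sign via xor/subtract
          * (two's complement negation). */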
         Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
         exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::zero(), exponent,
                             Operand::c32(64u));
         Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src);
         Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), src);
         mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa);
         mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), mantissa);
         mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa);
         Temp new_exponent = bld.tmp(v1);
         Temp borrow =
            bld.vsub32(Definition(new_exponent), Operand::c32(63u), exponent, true).def(1).getTemp();
         if (ctx->program->chip_class >= GFX8)
            mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
         else
            mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
         Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand::c32(0xfffffffeu));
         Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
         lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower,
                              Operand::c32(0xffffffffu), borrow);
         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
         lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
         upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
         Temp new_lower = bld.tmp(v1);
         borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
         Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);

      } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
         if (src.type() == RegType::vgpr)
            src = bld.as_uniform(src);
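         /* Scalar variant of the software conversion above. The s_bfe_u32
          * control constant 0x80017 encodes width 8 (bits [22:16]) and
          * offset 23 (bits [4:0]), extracting the f32 exponent field. */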
         Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src,
                                  Operand::c32(0x80017u));
         exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent,
                             Operand::c32(126u));
         exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(),
                             exponent);
         exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc),
                             Operand::c32(64u), exponent);
         Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
                                  Operand::c32(0x7fffffu), src);
         Temp sign =
            bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(31u));
         mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
                             Operand::c32(0x800000u), mantissa);
         mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa,
                             Operand::c32(7u));
         mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa);
         exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
                             Operand::c32(63u), exponent);
         mantissa =
            bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
         Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent,
                              Operand::c32(0xffffffffu)); // exp >= 64
         Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand::c32(0xfffffffeu));
         mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
         Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
         lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
         upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
         Temp borrow = bld.tmp(s1);
         lower =
            bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
         upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign,
                          bld.scc(borrow));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);

      } else if (instr->src[0].src.ssa->bit_size == 64) {
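         /* f64->i64: split trunc(src) into two halves that each fit a
          * 32-bit hardware conversion: hi = floor(trunc * 2^-32) (high dword
          * 0x3df00000 is 2^-32) and lo = fma(hi, -2^32, trunc) (high dword
          * 0xc1f00000 is -2^32). */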
         Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
                               Operand::c32(0x3df00000u));
         Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
         Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
                          Operand::c32(0xc1f00000u));
         Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
         Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
         Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
         Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
         if (dst.type() == RegType::sgpr) {
            lower = bld.as_uniform(lower);
            upper = bld.as_uniform(upper);
         }
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);

      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_f2u64: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (instr->src[0].src.ssa->bit_size == 16)
         src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);

      if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
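         /* Software f32->u64: for exponents <= 24 the mantissa is shifted
          * right (the "small" path), otherwise the 64-bit mantissa is
          * shifted left; exponents above 64 saturate the result to
          * UINT64_MAX. */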
         Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
         Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)),
                                           Operand::c32(64u), exponent);
         exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::zero(), exponent);
         Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src);
         mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa);
         Temp exponent_small = bld.vsub32(bld.def(v1), Operand::c32(24u), exponent);
         Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
         mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa);
         Temp new_exponent = bld.tmp(v1);
         Temp cond_small =
            bld.vsub32(Definition(new_exponent), exponent, Operand::c32(24u), true).def(1).getTemp();
         if (ctx->program->chip_class >= GFX8)
            mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
         else
            mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
         Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
         lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
         upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand::zero(),
                              cond_small);
         lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), lower,
                          exponent_in_range);
         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), upper,
                          exponent_in_range);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);

      } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
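         /* Scalar variant of the f32->u64 conversion above. */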
         if (src.type() == RegType::vgpr)
            src = bld.as_uniform(src);
         Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src,
                                  Operand::c32(0x80017u));
         exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent,
                             Operand::c32(126u));
         exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(),
                             exponent);
         Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
                                  Operand::c32(0x7fffffu), src);
         mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
                             Operand::c32(0x800000u), mantissa);
         Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
                                        Operand::c32(24u), exponent);
         Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa,
                               exponent_small);
         mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa);
         Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
                                        exponent, Operand::c32(24u));
         mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa,
                             exponent_large);
         Temp cond =
            bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand::c32(64u), exponent);
         mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa,
                             Operand::c32(0xffffffffu), cond);
         Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
         Temp cond_small =
            bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand::c32(24u));
         lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
         upper =
            bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::zero(), upper, cond_small);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);

      } else if (instr->src[0].src.ssa->bit_size == 64) {
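         /* Same double-splitting trick as the 64-bit f2i64 path above, with
          * both halves converted as unsigned. */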
         Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
                               Operand::c32(0x3df00000u));
         Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
         Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
                          Operand::c32(0xc1f00000u));
         Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
         Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
         Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
         Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
         if (dst.type() == RegType::sgpr) {
            lower = bld.as_uniform(lower);
            upper = bld.as_uniform(upper);
         }
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);

      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_b2f16: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      assert(src.regClass() == bld.lm);

      if (dst.regClass() == s1) {
         src = bool_to_scalar_condition(ctx, src);
         bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src);
      } else if (dst.regClass() == v2b) {
         Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u));
         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src);
      } else {
         unreachable("Wrong destination register class for nir_op_b2f16.");
      }
      break;
   }
   case nir_op_b2f32: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      assert(src.regClass() == bld.lm);

      if (dst.regClass() == s1) {
         src = bool_to_scalar_condition(ctx, src);
         bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src);
      } else if (dst.regClass() == v1) {
         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(),
                      Operand::c32(0x3f800000u), src);
      } else {
         unreachable("Wrong destination register class for nir_op_b2f32.");
      }
      break;
   }
   case nir_op_b2f64: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      assert(src.regClass() == bld.lm);

      if (dst.regClass() == s2) {
         src = bool_to_scalar_condition(ctx, src);
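         /* 0x3f800000 is recognized as the inline constant 1.0, which the
          * 64-bit select materializes as the f64 1.0
          * (0x3ff0000000000000). */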
         bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u),
                  Operand::zero(), bld.scc(src));
      } else if (dst.regClass() == v2) {
         Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
         Temp upper =
            bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
      } else {
         unreachable("Wrong destination register class for nir_op_b2f64.");
      }
      break;
   }
   case nir_op_i2i8:
   case nir_op_i2i16:
   case nir_op_i2i32:
   case nir_op_i2i64: {
      if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
         /* no need to do the extract in get_alu_src() */
         sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size
                                     ? sgpr_extract_sext
                                     : sgpr_extract_undef;
         extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
      } else {
         const unsigned input_bitsize = instr->src[0].src.ssa->bit_size;
         const unsigned output_bitsize = instr->dest.dest.ssa.bit_size;
         convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,
                     output_bitsize > input_bitsize, dst);
      }
      break;
   }
   case nir_op_u2u8:
   case nir_op_u2u16:
   case nir_op_u2u32:
   case nir_op_u2u64: {
      if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
         /* no need to do the extract in get_alu_src() */
         sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size
                                     ? sgpr_extract_zext
                                     : sgpr_extract_undef;
         extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
      } else {
         convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,
                     instr->dest.dest.ssa.bit_size, false, dst);
      }
      break;
   }
   case nir_op_b2b32:
   case nir_op_b2i8:
   case nir_op_b2i16:
   case nir_op_b2i32:
   case nir_op_b2i64: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      assert(src.regClass() == bld.lm);

      Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst;
      if (tmp.regClass() == s1) {
         bool_to_scalar_condition(ctx, src, tmp);
      } else if (tmp.type() == RegType::vgpr) {
         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand::zero(), Operand::c32(1u),
                      src);
      } else {
         unreachable("Invalid register class for b2i32");
      }

      if (tmp != dst)
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
      break;
   }
   case nir_op_b2b1:
   case nir_op_i2b1: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      assert(dst.regClass() == bld.lm);

      if (src.type() == RegType::vgpr) {
         assert(src.regClass() == v1 || src.regClass() == v2);
         assert(dst.regClass() == bld.lm);
         bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
                  Definition(dst), Operand::zero(), src)
            .def(0)
            .setHint(vcc);
      } else {
         assert(src.regClass() == s1 || src.regClass() == s2);
         Temp tmp;
         if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {
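            /* GFX7 has no s_cmp_lg_u64; OR the value with zero instead and
             * use the SCC definition, which is set iff the result is
             * non-zero. */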
            tmp =
               bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src)
                  .def(1)
                  .getTemp();
         } else {
            tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
                           bld.scc(bld.def(s1)), Operand::zero(), src);
         }
         bool_to_vector_condition(ctx, tmp, dst);
      }
      break;
   }
   case nir_op_unpack_64_2x32:
   case nir_op_unpack_32_2x16:
   case nir_op_unpack_64_4x16:
      bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
      emit_split_vector(ctx, dst, instr->op == nir_op_unpack_64_4x16 ? 4 : 2);
      break;
   case nir_op_pack_64_2x32_split: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);

      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
      break;
   }
   case nir_op_unpack_64_2x32_split_x:
      bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
                 get_alu_src(ctx, instr->src[0]));
      break;
   case nir_op_unpack_64_2x32_split_y:
      bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
                 get_alu_src(ctx, instr->src[0]));
      break;
   case nir_op_unpack_32_2x16_split_x:
      if (dst.type() == RegType::vgpr) {
         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
                    get_alu_src(ctx, instr->src[0]));
      } else {
         bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
      }
      break;
   case nir_op_unpack_32_2x16_split_y:
      if (dst.type() == RegType::vgpr) {
         bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
                    get_alu_src(ctx, instr->src[0]));
      } else {
         bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
                    get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),
                    Operand::zero());
      }
      break;
   case nir_op_pack_32_2x16_split: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == v1) {
         src0 = emit_extract_vector(ctx, src0, 0, v2b);
         src1 = emit_extract_vector(ctx, src1, 0, v2b);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
      } else {
         src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0,
                         Operand::c32(0xFFFFu));
         src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1,
                         Operand::c32(16u));
         bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
      }
      break;
   }
   case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break;
   case nir_op_pack_half_2x16_split: {
      if (dst.regClass() == v1) {
         if (!ctx->block->fp_mode.care_about_round16_64 ||
             ctx->block->fp_mode.round16_64 == fp_round_tz) {
            if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9)
               emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
            else
               emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
         } else {
            Temp src0 =
               bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[0]));
            Temp src1 =
               bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[1]));
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
         }
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_unpack_half_2x16_split_x_flush_to_zero:
   case nir_op_unpack_half_2x16_split_x: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.regClass() == v1)
         src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src);
      if (dst.regClass() == v1) {
         assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
                (instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero));
         bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_unpack_half_2x16_split_y_flush_to_zero:
   case nir_op_unpack_half_2x16_split_y: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.regClass() == s1)
         src =
            bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(16u));
      else
         src =
            bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp();
      if (dst.regClass() == v1) {
         assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
                (instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero));
         bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_sad_u8x4: {
      assert(dst.regClass() == v1);
      emit_vop3a_instruction(ctx, instr, aco_opcode::v_sad_u8, dst, false, 3u, false);
      break;
   }
   case nir_op_fquantize2f16: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
      Temp f32, cmp_res;

      if (ctx->program->chip_class >= GFX8) {
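         /* The v_cmp_class mask 0x36f has every class bit set except the two
          * denormal classes, so cmp_res is true iff f16 is not a denormal. */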
         Temp mask = bld.copy(
            bld.def(s1), Operand::c32(0x36Fu)); /* value is NOT negative/positive denormal value */
         cmp_res =
            bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask);
         f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
      } else {
         /* 0x38800000 is the smallest normal half-float value (2^-14),
          * represented as a 32-bit float; compare |result| against it and
          * flush to zero if it's smaller.
          */
         f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
         Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
         Instruction* tmp0 = bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest);
         tmp0->vop3().abs[0] = true;
         Temp tmp1 =
            bld.vopc(aco_opcode::v_cmp_lg_f32, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), f32);
         cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc),
                            tmp0->definitions[0].getTemp(), tmp1);
      }

      if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) {
         Temp copysign_0 =
            bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));
         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
      } else {
         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), f32, cmp_res);
      }
      break;
   }
   case nir_op_bfm: {
      Temp bits = get_alu_src(ctx, instr->src[0]);
      Temp offset = get_alu_src(ctx, instr->src[1]);

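      /* bfm computes ((1 << bits) - 1) << offset, i.e. a mask of `bits` ones
       * starting at `offset`. */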
      if (dst.regClass() == s1) {
         bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
      } else if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_bitfield_select: {
      /* dst = (insert & bitmask) | (base & ~bitmask) */
      if (dst.regClass() == s1) {
         Temp bitmask = get_alu_src(ctx, instr->src[0]);
         Temp insert = get_alu_src(ctx, instr->src[1]);
         Temp base = get_alu_src(ctx, instr->src[2]);
         aco_ptr<Instruction> sop2;
         nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
         nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
         Operand lhs;
         if (const_insert && const_bitmask) {
            lhs = Operand::c32(const_insert->u32 & const_bitmask->u32);
         } else {
            insert =
               bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
            lhs = Operand(insert);
         }

         Operand rhs;
         nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
         if (const_base && const_bitmask) {
            rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32);
         } else {
            base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
            rhs = Operand(base);
         }

         bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);

      } else if (dst.regClass() == v1) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ubfe:
   case nir_op_ibfe: {
      if (dst.bytes() != 4)
         unreachable("Unsupported BFE bit size");

      if (dst.type() == RegType::sgpr) {
         Temp base = get_alu_src(ctx, instr->src[0]);

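         /* s_bfe takes a packed control operand: the field width goes in
          * bits [22:16] and the offset in bits [4:0]. */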
3339          nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
3340          nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
3341          if (const_offset && const_bits) {
3342             uint32_t extract = (const_bits->u32 << 16) | (const_offset->u32 & 0x1f);
3343             aco_opcode opcode =
3344                instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
3345             bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract));
3346             break;
3347          }
3348 
3349          Temp offset = get_alu_src(ctx, instr->src[1]);
3350          Temp bits = get_alu_src(ctx, instr->src[2]);
3351          if (instr->op == nir_op_ubfe) {
3352             Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset);
3353             Temp masked =
3354                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask);
3355             bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset);
3356          } else {
3357             Operand bits_op = const_bits ? Operand::c32(const_bits->u32 << 16)
3358                                          : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1),
3359                                                     bld.def(s1, scc), bits, Operand::c32(16u));
3360             Operand offset_op = const_offset
3361                                    ? Operand::c32(const_offset->u32 & 0x1fu)
3362                                    : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3363                                               offset, Operand::c32(0x1fu));
3364 
3365             Temp extract =
3366                bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op);
3367             bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract);
3368          }
3369 
3370       } else {
3371          aco_opcode opcode =
3372             instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32;
3373          emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);
3374       }
3375       break;
3376    }
3377    case nir_op_extract_u8:
3378    case nir_op_extract_i8:
3379    case nir_op_extract_u16:
3380    case nir_op_extract_i16: {
      bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;
      unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2;
      uint32_t bits = comp == 4 ? 8 : 16;
      unsigned index = nir_src_as_uint(instr->src[1].src);
      if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) {
         assert(index == 0);
         bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == s1 && instr->dest.dest.ssa.bit_size == 16) {
         Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);
         unsigned swizzle = instr->src[0].swizzle[0];
         if (vec.size() > 1) {
            vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
            swizzle = swizzle & 1;
         }
         index += swizzle * instr->dest.dest.ssa.bit_size / bits;
         bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec),
                    Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
      } else {
         Temp src = get_alu_src(ctx, instr->src[0]);
         Definition def(dst);
         if (dst.bytes() == 8) {
            src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1));
            index %= comp;
            def = bld.def(src.type(), 1);
         }
         assert(def.bytes() <= 4);
         if (def.regClass() == s1) {
            bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src),
                       Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
         } else {
            src = emit_extract_vector(ctx, src, 0, def.regClass());
            bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index),
                       Operand::c32(bits), Operand::c32(is_signed));
         }
         if (dst.size() == 2)
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
                       Operand::zero());
      }
      break;
   }
   case nir_op_insert_u8:
   case nir_op_insert_u16: {
      unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2;
      uint32_t bits = comp == 4 ? 8 : 16;
      unsigned index = nir_src_as_uint(instr->src[1].src);
      if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) {
         assert(index == 0);
         bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
      } else {
         Temp src = get_alu_src(ctx, instr->src[0]);
         Definition def(dst);
         bool swap = false;
         if (dst.bytes() == 8) {
            src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));
            swap = index >= comp;
            index %= comp;
            def = bld.def(src.type(), 1);
         }
         if (def.regClass() == s1) {
            bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src),
                       Operand::c32(index), Operand::c32(bits));
         } else {
            src = emit_extract_vector(ctx, src, 0, def.regClass());
            bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index),
                       Operand::c32(bits));
         }
         if (dst.size() == 2 && swap)
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(),
                       def.getTemp());
         else if (dst.size() == 2)
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
                       Operand::zero());
      }
      break;
   }
   case nir_op_bit_count: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.regClass() == s1) {
         bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
      } else if (src.regClass() == v1) {
         bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero());
      } else if (src.regClass() == v2) {
         bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),
                  bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
                           emit_extract_vector(ctx, src, 0, v1), Operand::zero()));
      } else if (src.regClass() == s2) {
         bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_flt: {
      emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,
                      aco_opcode::v_cmp_lt_f64);
      break;
   }
   case nir_op_fge: {
      emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,
                      aco_opcode::v_cmp_ge_f64);
      break;
   }
   case nir_op_feq: {
      emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,
                      aco_opcode::v_cmp_eq_f64);
      break;
   }
   case nir_op_fneu: {
      emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,
                      aco_opcode::v_cmp_neq_f64);
      break;
   }
   case nir_op_ilt: {
      emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,
                      aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
      break;
   }
   case nir_op_ige: {
      emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,
                      aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
      break;
   }
   case nir_op_ieq: {
      if (instr->src[0].src.ssa->bit_size == 1)
         emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
      else
         emit_comparison(
            ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,
            aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
            ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
      break;
   }
   case nir_op_ine: {
      if (instr->src[0].src.ssa->bit_size == 1)
         emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
      else
         emit_comparison(
            ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,
            aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
            ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
      break;
   }
   case nir_op_ult: {
      emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,
                      aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
      break;
   }
   case nir_op_uge: {
      emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,
                      aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
      break;
   }
   case nir_op_fddx:
   case nir_op_fddy:
   case nir_op_fddx_fine:
   case nir_op_fddy_fine:
   case nir_op_fddx_coarse:
   case nir_op_fddy_coarse: {
      if (!nir_src_is_divergent(instr->src[0].src)) {
         /* Source is the same in all lanes, so the derivative is zero.
          * This also avoids emitting invalid IR.
          */
         bld.copy(Definition(dst), Operand::zero());
         break;
      }

      Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
      uint16_t dpp_ctrl1, dpp_ctrl2;
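      /* Each hardware quad covers a 2x2 pixel block (lanes: top-left,
       * top-right, bottom-left, bottom-right). dpp_ctrl1 selects the "base"
       * pixel and dpp_ctrl2 its neighbour in x or y, so subtracting the two
       * swizzled values yields the screen-space derivative.
       */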
      if (instr->op == nir_op_fddx_fine) {
         dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
         dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
      } else if (instr->op == nir_op_fddy_fine) {
         dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
         dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
      } else {
         dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
         if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
            dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
         else
            dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
      }

      Temp tmp;
      if (ctx->program->chip_class >= GFX8) {
         Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
         tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
      } else {
         Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
         Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
         tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
      }
      emit_wqm(bld, tmp, dst, true);
      break;
   }
   default: isel_err(&instr->instr, "Unknown NIR ALU instr");
   }
}

void
visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
{
   Temp dst = get_ssa_temp(ctx, &instr->def);

   // TODO: we really want to have the resulting type as this would allow for 64-bit literals,
   // which currently get truncated (the lsb if double, the msb if int);
   // for now, we only use s_mov_b64 with 64-bit inline constants
   assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
   assert(dst.type() == RegType::sgpr);

   Builder bld(ctx->program, ctx->block);

   if (instr->def.bit_size == 1) {
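      /* ACO represents 1-bit NIR values as wave-sized lane masks, so a
       * uniform "true" is materialized as an all-ones mask (-1).
       */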
      assert(dst.regClass() == bld.lm);
      int val = instr->value[0].b ? -1 : 0;
      Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);
      bld.copy(Definition(dst), op);
   } else if (instr->def.bit_size == 8) {
      bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));
   } else if (instr->def.bit_size == 16) {
      /* sign-extend to use s_movk_i32 instead of a literal */
      bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));
   } else if (dst.size() == 1) {
      bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
   } else {
      assert(dst.size() != 1);
      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
      if (instr->def.bit_size == 64)
         for (unsigned i = 0; i < dst.size(); i++)
            vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);
      else {
         for (unsigned i = 0; i < dst.size(); i++)
            vec->operands[i] = Operand::c32(instr->value[i].u32);
      }
      vec->definitions[0] = Definition(dst);
      ctx->block->instructions.emplace_back(std::move(vec));
   }
}

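/* Expands each set bit of a writemask to `multiplier` consecutive bits,
 * e.g. widen_mask(0b101, 2) == 0b110011. Used to turn a per-component mask
 * into a finer-grained (e.g. per-byte) mask.
 */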
uint32_t
widen_mask(uint32_t mask, unsigned multiplier)
{
   uint32_t new_mask = 0;
   for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
      if (mask & (1u << i))
         new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
   return new_mask;
}

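/* Describes one logical load for emit_load(): where to load from (offset,
 * resource, soffset, const_offset), what to produce (dst, num_components,
 * component_size) and how (alignment, cache bits, swizzling, stride).
 */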
struct LoadEmitInfo {
   Operand offset;
   Temp dst;
   unsigned num_components;
   unsigned component_size;
   Temp resource = Temp(0, s1);
   unsigned component_stride = 0;
   unsigned const_offset = 0;
   unsigned align_mul = 0;
   unsigned align_offset = 0;

   bool glc = false;
   bool slc = false;
   unsigned swizzle_component_size = 0;
   memory_sync_info sync;
   Temp soffset = Temp(0, s1);
};

struct EmitLoadParameters {
   using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset,
                             unsigned bytes_needed, unsigned align, unsigned const_offset,
                             Temp dst_hint);

   Callback callback;
   bool byte_align_loads;
   bool supports_8bit_16bit_loads;
   unsigned max_const_offset_plus_one;
};

void
emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
          const EmitLoadParameters& params)
{
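   /* Overall strategy: repeatedly ask the memory-type-specific callback for
    * the largest load it can perform for the remaining bytes (honoring
    * alignment and the callback's constant-offset range), collect the partial
    * results, then recombine them into `num_components` values of
    * `component_size` bytes each.
    */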
   unsigned load_size = info.num_components * info.component_size;
   unsigned component_size = info.component_size;

   unsigned num_vals = 0;
   Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp));

   unsigned const_offset = info.const_offset;

   const unsigned align_mul = info.align_mul ? info.align_mul : component_size;
   unsigned align_offset = (info.align_offset + const_offset) % align_mul;

   unsigned bytes_read = 0;
   while (bytes_read < load_size) {
      unsigned bytes_needed = load_size - bytes_read;

      /* add buffer for unaligned loads */
      int byte_align = 0;
      if (params.byte_align_loads) {
         byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
      }

      if (byte_align) {
         if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
             !params.supports_8bit_16bit_loads) {
            if (info.component_stride) {
               assert(params.supports_8bit_16bit_loads && "unimplemented");
               bytes_needed = 2;
               byte_align = 0;
            } else {
               bytes_needed += byte_align == -1 ? 4 - info.align_mul : byte_align;
               bytes_needed = align(bytes_needed, 4);
            }
         } else {
            byte_align = 0;
         }
      }

      if (info.swizzle_component_size)
         bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
      if (info.component_stride)
         bytes_needed = MIN2(bytes_needed, info.component_size);

      bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);

      /* reduce constant offset */
      Operand offset = info.offset;
      unsigned reduced_const_offset = const_offset;
      bool remove_const_offset_completely = need_to_align_offset;
      if (const_offset &&
          (remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) {
         unsigned to_add = const_offset;
         if (remove_const_offset_completely) {
            reduced_const_offset = 0;
         } else {
            to_add =
               const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one;
            reduced_const_offset %= params.max_const_offset_plus_one;
         }
         Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
         if (offset.isConstant()) {
            offset = Operand::c32(offset.constantValue() + to_add);
         } else if (offset_tmp.regClass() == s1) {
            offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,
                              Operand::c32(to_add));
         } else if (offset_tmp.regClass() == v1) {
            offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add));
         } else {
            Temp lo = bld.tmp(offset_tmp.type(), 1);
            Temp hi = bld.tmp(offset_tmp.type(), 1);
            bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);

            if (offset_tmp.regClass() == s2) {
               Temp carry = bld.tmp(s1);
               lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo,
                             Operand::c32(to_add));
               hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
               offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
            } else {
               Temp new_lo = bld.tmp(v1);
               Temp carry =
                  bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp();
               hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry);
               offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
            }
         }
      }

      /* align offset down if needed */
      Operand aligned_offset = offset;
      unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
      if (need_to_align_offset) {
         align = 4;
         Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
         if (offset.isConstant()) {
            aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu);
         } else if (offset_tmp.regClass() == s1) {
            aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
                                      Operand::c32(0xfffffffcu), offset_tmp);
         } else if (offset_tmp.regClass() == s2) {
            aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
                                      Operand::c64(0xfffffffffffffffcllu), offset_tmp);
         } else if (offset_tmp.regClass() == v1) {
            aligned_offset =
               bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), offset_tmp);
         } else if (offset_tmp.regClass() == v2) {
            Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
            bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
            lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), lo);
            aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
         }
      }
      Temp aligned_offset_tmp =
         aligned_offset.isTemp() ? aligned_offset.getTemp() : bld.copy(bld.def(s1), aligned_offset);

      Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align,
                                 reduced_const_offset, byte_align ? Temp() : info.dst);

      /* the callback wrote directly to dst */
      if (val == info.dst) {
         assert(num_vals == 0);
         emit_split_vector(ctx, info.dst, info.num_components);
         return;
      }

      /* shift result right if needed */
      if (params.byte_align_loads && info.component_size < 4) {
         Operand byte_align_off = Operand::c32(byte_align);
         if (byte_align == -1) {
            if (offset.isConstant())
               byte_align_off = Operand::c32(offset.constantValue() % 4u);
            else if (offset.size() == 2)
               byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0,
                                                            RegClass(offset.getTemp().type(), 1)));
            else
               byte_align_off = offset;
         }

         assert(val.bytes() >= load_size && "unimplemented");
         if (val.type() == RegType::sgpr)
            byte_align_scalar(ctx, val, byte_align_off, info.dst);
         else
            byte_align_vector(ctx, val, byte_align_off, info.dst, component_size);
         return;
      }

      /* add result to list and advance */
      if (info.component_stride) {
         assert(val.bytes() == info.component_size && "unimplemented");
         const_offset += info.component_stride;
         align_offset = (align_offset + info.component_stride) % align_mul;
      } else {
         const_offset += val.bytes();
         align_offset = (align_offset + val.bytes()) % align_mul;
      }
      bytes_read += val.bytes();
      vals[num_vals++] = val;
   }

   /* create array of components */
   unsigned components_split = 0;
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
   bool has_vgprs = false;
   for (unsigned i = 0; i < num_vals;) {
      Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp));
      unsigned num_tmps = 0;
      unsigned tmp_size = 0;
      RegType reg_type = RegType::sgpr;
      while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
         if (vals[i].type() == RegType::vgpr)
            reg_type = RegType::vgpr;
         tmp_size += vals[i].bytes();
         tmp[num_tmps++] = vals[i++];
      }
      if (num_tmps > 1) {
         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
            aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
         for (unsigned j = 0; j < num_tmps; j++)
            vec->operands[j] = Operand(tmp[j]);
         tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
         vec->definitions[0] = Definition(tmp[0]);
         bld.insert(std::move(vec));
      }

      if (tmp[0].bytes() % component_size) {
         /* trim tmp[0] */
         assert(i == num_vals);
         RegClass new_rc =
            RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
         tmp[0] =
            bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero());
      }

      RegClass elem_rc = RegClass::get(reg_type, component_size);

      unsigned start = components_split;

      if (tmp_size == elem_rc.bytes()) {
         allocated_vec[components_split++] = tmp[0];
      } else {
         assert(tmp_size % elem_rc.bytes() == 0);
         aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
            aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
         for (auto& def : split->definitions) {
            Temp component = bld.tmp(elem_rc);
            allocated_vec[components_split++] = component;
            def = Definition(component);
         }
         split->operands[0] = Operand(tmp[0]);
         bld.insert(std::move(split));
      }

      /* try to p_as_uniform early so we can create more optimizable code and
       * also update allocated_vec */
      for (unsigned j = start; j < components_split; j++) {
         if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr)
            allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
         has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
      }
   }

   /* concatenate components and p_as_uniform() result if needed */
   if (info.dst.type() == RegType::vgpr || !has_vgprs)
      ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);

   int padding_bytes =
      MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);

   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
      aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)};
   for (unsigned i = 0; i < info.num_components; i++)
      vec->operands[i] = Operand(allocated_vec[i]);
   if (padding_bytes)
      vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
   if (info.dst.type() == RegType::sgpr && has_vgprs) {
      Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
      vec->definitions[0] = Definition(tmp);
      bld.insert(std::move(vec));
      bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
   } else {
      vec->definitions[0] = Definition(info.dst);
      bld.insert(std::move(vec));
   }
}

Operand
load_lds_size_m0(Builder& bld)
{
   /* m0 does not need to be initialized on GFX9+ */
   if (bld.program->chip_class >= GFX9)
      return Operand(s1);

   return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));
}

Temp
lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
                  unsigned align, unsigned const_offset, Temp dst_hint)
{
   offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;

   Operand m = load_lds_size_m0(bld);

   bool large_ds_read = bld.program->chip_class >= GFX7;
   bool usable_read2 = bld.program->chip_class >= GFX7;

   bool read2 = false;
   unsigned size = 0;
   aco_opcode op;
   if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
      size = 16;
      op = aco_opcode::ds_read_b128;
   } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
      size = 16;
      read2 = true;
      op = aco_opcode::ds_read2_b64;
   } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
      size = 12;
      op = aco_opcode::ds_read_b96;
   } else if (bytes_needed >= 8 && align % 8 == 0) {
      size = 8;
      op = aco_opcode::ds_read_b64;
   } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {
      size = 8;
      read2 = true;
      op = aco_opcode::ds_read2_b32;
   } else if (bytes_needed >= 4 && align % 4 == 0) {
      size = 4;
      op = aco_opcode::ds_read_b32;
   } else if (bytes_needed >= 2 && align % 2 == 0) {
      size = 2;
      op = bld.program->chip_class >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
   } else {
      size = 1;
      op = bld.program->chip_class >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
   }

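   /* ds_read2* encodes two independent 8-bit offsets in units of the element
    * size (4 or 8 bytes); a single ds_read* uses one 16-bit byte offset.
    * Scale and clamp const_offset accordingly.
    */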
   unsigned const_offset_unit = read2 ? size / 2u : 1u;
   unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;

   if (const_offset > (const_offset_range - const_offset_unit)) {
      unsigned excess = const_offset - (const_offset % const_offset_range);
      offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));
      const_offset -= excess;
   }

   const_offset /= const_offset_unit;

   RegClass rc = RegClass::get(RegType::vgpr, size);
   Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
   Instruction* instr;
   if (read2)
      instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
   else
      instr = bld.ds(op, Definition(val), offset, m, const_offset);
   instr->ds().sync = info.sync;

   if (m.isUndefined())
      instr->operands.pop_back();

   return val;
}

const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};

Temp
smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
                   unsigned align, unsigned const_offset, Temp dst_hint)
{
   unsigned size = 0;
   aco_opcode op;
   if (bytes_needed <= 4) {
      size = 1;
      op = info.resource.id() ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
   } else if (bytes_needed <= 8) {
      size = 2;
      op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
   } else if (bytes_needed <= 16) {
      size = 4;
      op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
   } else if (bytes_needed <= 32) {
      size = 8;
      op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
   } else {
      size = 16;
      op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
   }
   aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
   if (info.resource.id()) {
      load->operands[0] = Operand(info.resource);
      load->operands[1] = Operand(offset);
   } else {
      load->operands[0] = Operand(offset);
      load->operands[1] = Operand::zero();
   }
   RegClass rc(RegType::sgpr, size);
   Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
   load->definitions[0] = Definition(val);
   load->glc = info.glc;
   load->dlc = info.glc && bld.program->chip_class >= GFX10;
   load->sync = info.sync;
   bld.insert(std::move(load));
   return val;
}

const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024};

Temp
mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
                    unsigned align_, unsigned const_offset, Temp dst_hint)
{
   Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
   Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);

   if (info.soffset.id()) {
      if (soffset.isTemp())
         vaddr = bld.copy(bld.def(v1), soffset);
      soffset = Operand(info.soffset);
   }

   unsigned bytes_size = 0;
   aco_opcode op;
   if (bytes_needed == 1 || align_ % 2) {
      bytes_size = 1;
      op = aco_opcode::buffer_load_ubyte;
   } else if (bytes_needed == 2 || align_ % 4) {
      bytes_size = 2;
      op = aco_opcode::buffer_load_ushort;
   } else if (bytes_needed <= 4) {
      bytes_size = 4;
      op = aco_opcode::buffer_load_dword;
   } else if (bytes_needed <= 8) {
      bytes_size = 8;
      op = aco_opcode::buffer_load_dwordx2;
   } else if (bytes_needed <= 12 && bld.program->chip_class > GFX6) {
      bytes_size = 12;
      op = aco_opcode::buffer_load_dwordx3;
   } else {
      bytes_size = 16;
      op = aco_opcode::buffer_load_dwordx4;
   }
   aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
   mubuf->operands[0] = Operand(info.resource);
   mubuf->operands[1] = vaddr;
   mubuf->operands[2] = soffset;
   mubuf->offen = (offset.type() == RegType::vgpr);
   mubuf->glc = info.glc;
   mubuf->dlc = info.glc && bld.program->chip_class >= GFX10;
   mubuf->slc = info.slc;
   mubuf->sync = info.sync;
   mubuf->offset = const_offset;
   mubuf->swizzled = info.swizzle_component_size != 0;
   RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
   Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
   mubuf->definitions[0] = Definition(val);
   bld.insert(std::move(mubuf));

   return val;
}

const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096};
const EmitLoadParameters scratch_load_params{mubuf_load_callback, false, true, 4096};

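/* GFX6 lacks FLAT/GLOBAL instructions, so global memory is accessed through
 * MUBUF with a synthetic buffer descriptor: base address zero (or the SGPR
 * address), num_records = -1 and a raw 32-bit data format, which effectively
 * disables range checking.
 */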
Temp
get_gfx6_global_rsrc(Builder& bld, Temp addr)
{
   uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                        S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

   if (addr.type() == RegType::vgpr)
      return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(),
                        Operand::c32(-1u), Operand::c32(rsrc_conf));
   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(-1u),
                     Operand::c32(rsrc_conf));
}

Temp
global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
                     unsigned align_, unsigned const_offset, Temp dst_hint)
{
   unsigned bytes_size = 0;
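   /* GFX6 uses MUBUF with addr64 (no FLAT support), GFX7-8 use FLAT, and
    * GFX9+ use the dedicated GLOBAL encoding.
    */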
   bool use_mubuf = bld.program->chip_class == GFX6;
   bool global = bld.program->chip_class >= GFX9;
   aco_opcode op;
   if (bytes_needed == 1) {
      bytes_size = 1;
      op = use_mubuf ? aco_opcode::buffer_load_ubyte
           : global  ? aco_opcode::global_load_ubyte
                     : aco_opcode::flat_load_ubyte;
   } else if (bytes_needed == 2) {
      bytes_size = 2;
      op = use_mubuf ? aco_opcode::buffer_load_ushort
           : global  ? aco_opcode::global_load_ushort
                     : aco_opcode::flat_load_ushort;
   } else if (bytes_needed <= 4) {
      bytes_size = 4;
      op = use_mubuf ? aco_opcode::buffer_load_dword
           : global  ? aco_opcode::global_load_dword
                     : aco_opcode::flat_load_dword;
   } else if (bytes_needed <= 8) {
      bytes_size = 8;
      op = use_mubuf ? aco_opcode::buffer_load_dwordx2
           : global  ? aco_opcode::global_load_dwordx2
                     : aco_opcode::flat_load_dwordx2;
   } else if (bytes_needed <= 12 && !use_mubuf) {
      bytes_size = 12;
      op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
   } else {
      bytes_size = 16;
      op = use_mubuf ? aco_opcode::buffer_load_dwordx4
           : global  ? aco_opcode::global_load_dwordx4
                     : aco_opcode::flat_load_dwordx4;
   }
   RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
   Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
   if (use_mubuf) {
      aco_ptr<MUBUF_instruction> mubuf{
         create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
      mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset));
      mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
      mubuf->operands[2] = Operand::zero();
      mubuf->glc = info.glc;
      mubuf->dlc = false;
      mubuf->offset = 0;
      mubuf->addr64 = offset.type() == RegType::vgpr;
      mubuf->disable_wqm = false;
      mubuf->sync = info.sync;
      mubuf->definitions[0] = Definition(val);
      bld.insert(std::move(mubuf));
   } else {
      offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset;

      aco_ptr<FLAT_instruction> flat{
         create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
      flat->operands[0] = Operand(offset);
      flat->operands[1] = Operand(s1);
      flat->glc = info.glc;
      flat->dlc = info.glc && bld.program->chip_class >= GFX10;
      flat->sync = info.sync;
      flat->offset = 0u;
      flat->definitions[0] = Definition(val);
      bld.insert(std::move(flat));
   }

   return val;
}

const EmitLoadParameters global_load_params{global_load_callback, true, true, 1};

Temp
load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
         Temp address, unsigned base_offset, unsigned align)
{
   assert(util_is_power_of_two_nonzero(align));

   Builder bld(ctx->program, ctx->block);

   LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
   info.align_mul = align;
   info.align_offset = 0;
   info.sync = memory_sync_info(storage_shared);
   info.const_offset = base_offset;
   emit_load(ctx, bld, info, lds_load_params);

   return dst;
}

void
split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
                 Temp src)
{
   if (!count)
      return;

   Builder bld(ctx->program, ctx->block);

   /* count == 1 fast path */
   if (count == 1) {
      if (dst_type == RegType::sgpr)
         dst[0] = bld.as_uniform(src);
      else
         dst[0] = as_vgpr(ctx, src);
      return;
   }

   /* elem_size_bytes is the greatest common divisor which is a power of 2,
    * e.g. bytes = {4, 2, 2}: 8 | 4 | 2 | 2 = 0b1110, so the split is done
    * in 2-byte elements */
   unsigned elem_size_bytes =
      1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1);

   ASSERTED bool is_subdword = elem_size_bytes < 4;
   assert(!is_subdword || dst_type == RegType::vgpr);

   for (unsigned i = 0; i < count; i++)
      dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i]));

   std::vector<Temp> temps;
   /* use allocated_vec if possible */
   auto it = ctx->allocated_vec.find(src.id());
   if (it != ctx->allocated_vec.end()) {
      if (!it->second[0].id())
         goto split;
      unsigned elem_size = it->second[0].bytes();
      assert(src.bytes() % elem_size == 0);

      for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
         if (!it->second[i].id())
            goto split;
      }
      if (elem_size_bytes % elem_size)
         goto split;

      temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size);
      elem_size_bytes = elem_size;
   }

split:
   /* split src if necessary */
   if (temps.empty()) {
      if (is_subdword && src.type() == RegType::sgpr)
         src = as_vgpr(ctx, src);
      if (dst_type == RegType::sgpr)
         src = bld.as_uniform(src);

      unsigned num_elems = src.bytes() / elem_size_bytes;
      aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
         aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)};
      split->operands[0] = Operand(src);
      for (unsigned i = 0; i < num_elems; i++) {
         temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes)));
         split->definitions[i] = Definition(temps.back());
      }
      bld.insert(std::move(split));
   }

   unsigned idx = 0;
   for (unsigned i = 0; i < count; i++) {
      unsigned op_count = dst[i].bytes() / elem_size_bytes;
      if (op_count == 1) {
         if (dst_type == RegType::sgpr)
            dst[i] = bld.as_uniform(temps[idx++]);
         else
            dst[i] = as_vgpr(ctx, temps[idx++]);
         continue;
      }

      aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
                                                                      Format::PSEUDO, op_count, 1)};
      for (unsigned j = 0; j < op_count; j++) {
         Temp tmp = temps[idx++];
         if (dst_type == RegType::sgpr)
            tmp = bld.as_uniform(tmp);
         vec->operands[j] = Operand(tmp);
      }
      vec->definitions[0] = Definition(dst[i]);
      bld.insert(std::move(vec));
   }
   return;
}

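/* Finds the next run of bytes to write: a consecutive range of enabled (or,
 * if the range starts disabled, skipped) bytes in `mask`, restricted to
 * `todo_mask`. E.g. mask = 0b00111100, todo = 0b11111111 first yields
 * start = 0, count = 2 and returns false (a skip), then start = 2,
 * count = 4 and returns true.
 */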
bool
scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count)
{
   unsigned start_elem = ffs(todo_mask) - 1;
   bool skip = !(mask & (1 << start_elem));
   if (skip)
      mask = ~mask & todo_mask;

   mask &= todo_mask;

   u_bit_scan_consecutive_range(&mask, start, count);

   return !skip;
}

void
advance_write_mask(uint32_t* todo_mask, int start, int count)
{
   *todo_mask &= ~u_bit_consecutive(0, count) << start;
}

void
store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,
          unsigned base_offset, unsigned align)
{
   assert(util_is_power_of_two_nonzero(align));
   assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);

   Builder bld(ctx->program, ctx->block);
   bool large_ds_write = ctx->options->chip_class >= GFX7;
   bool usable_write2 = ctx->options->chip_class >= GFX7;

   unsigned write_count = 0;
   Temp write_datas[32];
   unsigned offsets[32];
   unsigned bytes[32];
   aco_opcode opcodes[32];

   wrmask = widen_mask(wrmask, elem_size_bytes);

   uint32_t todo = u_bit_consecutive(0, data.bytes());
   while (todo) {
      int offset, byte;
      if (!scan_write_mask(wrmask, todo, &offset, &byte)) {
         offsets[write_count] = offset;
         bytes[write_count] = byte;
         opcodes[write_count] = aco_opcode::num_opcodes;
         write_count++;
         advance_write_mask(&todo, offset, byte);
         continue;
      }

      bool aligned2 = offset % 2 == 0 && align % 2 == 0;
      bool aligned4 = offset % 4 == 0 && align % 4 == 0;
      bool aligned8 = offset % 8 == 0 && align % 8 == 0;
      bool aligned16 = offset % 16 == 0 && align % 16 == 0;

      // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
      aco_opcode op = aco_opcode::num_opcodes;
      if (byte >= 16 && aligned16 && large_ds_write) {
         op = aco_opcode::ds_write_b128;
         byte = 16;
      } else if (byte >= 12 && aligned16 && large_ds_write) {
         op = aco_opcode::ds_write_b96;
         byte = 12;
      } else if (byte >= 8 && aligned8) {
         op = aco_opcode::ds_write_b64;
         byte = 8;
      } else if (byte >= 4 && aligned4) {
         op = aco_opcode::ds_write_b32;
         byte = 4;
      } else if (byte >= 2 && aligned2) {
         op = aco_opcode::ds_write_b16;
         byte = 2;
      } else if (byte >= 1) {
         op = aco_opcode::ds_write_b8;
         byte = 1;
      } else {
         assert(false);
      }

      offsets[write_count] = offset;
      bytes[write_count] = byte;
      opcodes[write_count] = op;
      write_count++;
      advance_write_mask(&todo, offset, byte);
   }

   Operand m = load_lds_size_m0(bld);

   split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);

   for (unsigned i = 0; i < write_count; i++) {
      aco_opcode op = opcodes[i];
      if (op == aco_opcode::num_opcodes)
         continue;

      Temp split_data = write_datas[i];

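      /* Try to combine this write with a later one of the same size into
       * ds_write2, which stores two values at two independent 8-bit offsets
       * (in units of the data size), halving the instruction count.
       */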
      unsigned second = write_count;
      if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
         for (second = i + 1; second < write_count; second++) {
            if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) {
               op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
               opcodes[second] = aco_opcode::num_opcodes;
               break;
            }
         }
      }

      bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
      unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes();

      unsigned inline_offset = base_offset + offsets[i];
      unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535;
      Temp address_offset = address;
      if (inline_offset > max_offset) {
         address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset);
         inline_offset = offsets[i];
      }

      /* offsets[i] shouldn't be large enough for this to happen */
      assert(inline_offset <= max_offset);

      Instruction* instr;
      if (write2) {
         Temp second_data = write_datas[second];
         inline_offset /= split_data.bytes();
         instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset,
                        inline_offset + write2_off);
      } else {
         instr = bld.ds(op, address_offset, split_data, m, inline_offset);
      }
      instr->ds().sync = memory_sync_info(storage_shared);

      if (m.isUndefined())
         instr->operands.pop_back();
   }
}

aco_opcode
get_buffer_store_op(unsigned bytes)
{
   switch (bytes) {
   case 1: return aco_opcode::buffer_store_byte;
   case 2: return aco_opcode::buffer_store_short;
   case 4: return aco_opcode::buffer_store_dword;
   case 8: return aco_opcode::buffer_store_dwordx2;
   case 12: return aco_opcode::buffer_store_dwordx3;
   case 16: return aco_opcode::buffer_store_dwordx4;
   }
   unreachable("Unexpected store size");
   return aco_opcode::num_opcodes;
}

void
split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
                   Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
                   Temp* write_datas, unsigned* offsets)
{
   unsigned write_count_with_skips = 0;
   bool skips[16];
   unsigned bytes[16];

   /* determine how to split the data */
   unsigned todo = u_bit_consecutive(0, data.bytes());
   while (todo) {
      int offset, byte;
      skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte);
      offsets[write_count_with_skips] = offset;
      if (skips[write_count_with_skips]) {
         bytes[write_count_with_skips] = byte;
         advance_write_mask(&todo, offset, byte);
         write_count_with_skips++;
         continue;
      }

      /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be
       * larger than swizzle_element_size */
      byte = MIN2(byte, swizzle_element_size);
      if (byte % 4)
         byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2);

      /* SMEM and GFX6 VMEM can't emit 12-byte stores */
      if ((ctx->program->chip_class == GFX6 || smem) && byte == 12)
         byte = 8;

      /* dword or larger stores have to be dword-aligned */
      unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
      unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
      bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
      if (!dword_aligned)
         byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);

      bytes[write_count_with_skips] = byte;
      advance_write_mask(&todo, offset, byte);
      write_count_with_skips++;
   }

   /* actually split data */
   split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);

   /* remove skips */
   for (unsigned i = 0; i < write_count_with_skips; i++) {
      if (skips[i])
         continue;
      write_datas[*write_count] = write_datas[i];
      offsets[*write_count] = offsets[i];
      (*write_count)++;
   }
}

Temp
create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
                      unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp())
{
   Builder bld(ctx->program, ctx->block);
   unsigned dword_size = elem_size_bytes / 4;

   if (!dst.id())
      dst = bld.tmp(RegClass(reg_type, cnt * dword_size));

   std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
   aco_ptr<Pseudo_instruction> instr{
      create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
   instr->definitions[0] = Definition(dst);

   for (unsigned i = 0; i < cnt; ++i) {
      if (arr[i].id()) {
         assert(arr[i].size() == dword_size);
         allocated_vec[i] = arr[i];
         instr->operands[i] = Operand(arr[i]);
      } else {
         Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),
                              Operand::zero(dword_size == 2 ? 8 : 4));
         allocated_vec[i] = zero;
         instr->operands[i] = Operand(zero);
      }
   }

   bld.insert(std::move(instr));

   if (split_cnt)
      emit_split_vector(ctx, dst, split_cnt);
   else
      ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */

   return dst;
}

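/* MUBUF instructions can encode only a 12-bit unsigned constant offset
 * (0..4095); anything larger has to be folded into the VGPR address first.
 */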
inline unsigned
resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)
{
   if (const_offset >= 4096) {
      unsigned excess_const_offset = const_offset / 4096u * 4096u;
      const_offset %= 4096u;

      if (!voffset.id())
         voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));
      else if (unlikely(voffset.regClass() == s1))
         voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
                            Operand::c32(excess_const_offset), Operand(voffset));
      else if (likely(voffset.regClass() == v1))
         voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset));
      else
         unreachable("Unsupported register class of voffset");
   }

   return const_offset;
}

void
emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
                        unsigned const_offset = 0u, memory_sync_info sync = memory_sync_info(),
                        bool slc = false, bool swizzled = false)
{
   assert(vdata.id());
   assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
   assert(vdata.size() >= 1 && vdata.size() <= 4);

   Builder bld(ctx->program, ctx->block);
   aco_opcode op = get_buffer_store_op(vdata.bytes());
   const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);

   Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
   Operand soffset_op = soffset.id() ? Operand(soffset) : Operand::zero();
   Builder::Result r =
      bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
                /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
                /* idxen */ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
                /* dlc */ false, /* slc */ slc);

   r.instr->mubuf().sync = sync;
}

void
store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
                 unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
                 bool allow_combining = true, memory_sync_info sync = memory_sync_info(),
                 bool slc = false)
{
   Builder bld(ctx->program, ctx->block);
   assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
   assert(write_mask);
   write_mask = widen_mask(write_mask, elem_size_bytes);

   unsigned write_count = 0;
   Temp write_datas[32];
   unsigned offsets[32];
   split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask, allow_combining ? 16 : 4,
                      &write_count, write_datas, offsets);

   for (unsigned i = 0; i < write_count; i++) {
      unsigned const_offset = offsets[i] + base_const_offset;
      emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync,
                              slc, !allow_combining);
   }
}

void
load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
                unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
                unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true,
                bool slc = false)
{
   assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
   assert((num_components * elem_size_bytes) == dst.bytes());
   assert(!!stride != allow_combining);

   Builder bld(ctx->program, ctx->block);

   LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
   info.component_stride = allow_combining ? 0 : stride;
   info.glc = true;
   info.slc = slc;
   info.swizzle_component_size = allow_combining ? 0 : 4;
   info.align_mul = MIN2(elem_size_bytes, 4);
   info.align_offset = 0;
   info.soffset = soffset;
   info.const_offset = base_const_offset;
   emit_load(ctx, bld, info, mubuf_load_params);
}

Temp
wave_id_in_threadgroup(isel_context* ctx)
{
   Builder bld(ctx->program, ctx->block);
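   /* The second s_bfe_u32 operand packs offset | (size << 16): extract the
    * 4 bits starting at bit 24 of merged_wave_info, which hold the wave's
    * index within the threadgroup.
    */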
4628    return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
4629                    get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(24u | (4u << 16)));
4630 }
4631 
4632 Temp
thread_id_in_threadgroup(isel_context * ctx)4633 thread_id_in_threadgroup(isel_context* ctx)
4634 {
4635    /* tid_in_tg = wave_id * wave_size + tid_in_wave */
4636 
4637    Builder bld(ctx->program, ctx->block);
4638    Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1));
4639 
4640    if (ctx->program->workgroup_size <= ctx->program->wave_size)
4641       return tid_in_wave;
4642 
4643    Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
4644    Temp num_pre_threads =
4645       bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg,
4646                Operand::c32(ctx->program->wave_size == 64 ? 6u : 5u));
4647    return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));
4648 }
4649 
4650 Temp
get_tess_rel_patch_id(isel_context * ctx)4651 get_tess_rel_patch_id(isel_context* ctx)
4652 {
4653    Builder bld(ctx->program, ctx->block);
4654 
4655    switch (ctx->shader->info.stage) {
4656    case MESA_SHADER_TESS_CTRL:
4657       return bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
4658                         Operand::zero(), Operand::c32(8u), Operand::zero());
4659    case MESA_SHADER_TESS_EVAL: return get_arg(ctx, ctx->args->ac.tes_rel_patch_id);
4660    default: unreachable("Unsupported stage in get_tess_rel_patch_id");
4661    }
4662 }
4663 
4664 bool
store_output_to_temps(isel_context * ctx,nir_intrinsic_instr * instr)4665 store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
4666 {
4667    unsigned write_mask = nir_intrinsic_write_mask(instr);
4668    unsigned component = nir_intrinsic_component(instr);
4669    unsigned idx = nir_intrinsic_base(instr) * 4u + component;
4670    nir_src offset = *nir_get_io_offset_src(instr);
4671 
4672    if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
4673       return false;
4674 
4675    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
4676 
4677    if (instr->src[0].ssa->bit_size == 64)
4678       write_mask = widen_mask(write_mask, 2);
4679 
4680    RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
4681 
4682    for (unsigned i = 0; i < 8; ++i) {
4683       if (write_mask & (1 << i)) {
4684          ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
4685          ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
4686       }
4687       idx++;
4688    }
4689 
4690    return true;
4691 }
4692 
4693 bool
load_input_from_temps(isel_context * ctx,nir_intrinsic_instr * instr,Temp dst)4694 load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)
4695 {
4696    /* Only TCS per-vertex inputs are supported by this function.
4697     * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations
4698     * is the same.
4699     */
4700    if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
4701       return false;
4702 
4703    nir_src* off_src = nir_get_io_offset_src(instr);
4704    nir_src* vertex_index_src = nir_get_io_vertex_index_src(instr);
4705    nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr;
4706    bool can_use_temps =
4707       nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic &&
4708       nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
4709 
4710    if (!can_use_temps)
4711       return false;
4712 
4713    unsigned idx = nir_intrinsic_base(instr) * 4u + nir_intrinsic_component(instr) +
4714                   4 * nir_src_as_uint(*off_src);
4715    Temp* src = &ctx->inputs.temps[idx];
4716    create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
4717 
4718    return true;
4719 }
4720 
4721 static void export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos);
4722 
4723 void
4724 visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
4725 {
4726    if (ctx->stage == vertex_vs || ctx->stage == tess_eval_vs || ctx->stage == fragment_fs ||
4727        ctx->stage == vertex_ngg || ctx->stage == tess_eval_ngg ||
4728        (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
4729        ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
4730       bool stored_to_temps = store_output_to_temps(ctx, instr);
4731       if (!stored_to_temps) {
4732          isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
4733          abort();
4734       }
4735    } else {
4736       unreachable("Shader stage not implemented");
4737    }
4738 
4739    /* For NGG VS and TES shaders the primitive ID is exported manually after
4740     * the other exports, so we have to emit an extra export here. */
4741    if (ctx->stage.hw == HWStage::NGG &&
4742        (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::TES)) &&
4743        nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PRIMITIVE_ID)
4744       export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, NULL);
4745 }
4746 
4747 void
4748 emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
4749                   Temp prim_mask)
4750 {
4751    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
4752    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
4753 
4754    Builder bld(ctx->program, ctx->block);
4755 
4756    if (dst.regClass() == v2b) {
4757       if (ctx->program->dev.has_16bank_lds) {
4758          assert(ctx->options->chip_class <= GFX8);
4759          Builder::Result interp_p1 =
4760             bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,
4761                        bld.m0(prim_mask), idx, component);
4762          interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), coord1,
4763                                 bld.m0(prim_mask), interp_p1, idx, component);
4764          bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),
4765                     interp_p1, idx, component);
4766       } else {
4767          aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
4768 
4769          if (ctx->options->chip_class == GFX8)
4770             interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
4771 
4772          Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,
4773                                                 bld.m0(prim_mask), idx, component);
4774          bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,
4775                     component);
4776       }
4777    } else {
4778       Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
4779                                              bld.m0(prim_mask), idx, component);
4780 
4781       if (ctx->program->dev.has_16bank_lds)
4782          interp_p1.instr->operands[0].setLateKill(true);
4783 
4784       bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,
4785                  idx, component);
4786    }
4787 }
4788 
4789 void
4790 emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components)
4791 {
4792    Builder bld(ctx->program, ctx->block);
4793 
4794    aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
4795       aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
4796    for (unsigned i = 0; i < num_components; i++) {
4797       if (ctx->args->ac.frag_pos[i].used)
4798          vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
4799       else
4800          vec->operands[i] = Operand(v1);
4801    }
4802    if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
4803       assert(num_components == 4);
4804       vec->operands[3] =
4805          bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
4806    }
4807 
4808    if (ctx->options->adjust_frag_coord_z &&
4809        G_0286CC_POS_Z_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
4810       /* Adjust gl_FragCoord.z for VRS due to a hw bug on some GFX10.3 chips. */
4811       Operand frag_z = vec->operands[2];
4812       Temp adjusted_frag_z = bld.tmp(v1);
4813       Temp tmp;
4814 
4815       /* dFdx fine */
4816       Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), frag_z, dpp_quad_perm(0, 0, 2, 2));
4817       tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), frag_z, tl, dpp_quad_perm(1, 1, 3, 3));
4818       emit_wqm(bld, tmp, adjusted_frag_z, true);
4819 
4820       /* adjusted_frag_z * 0.0625 + frag_z */
4821       adjusted_frag_z = bld.vop3(aco_opcode::v_fma_f32, bld.def(v1), adjusted_frag_z,
4822                                  Operand::c32(0x3d800000u /* 0.0625 */), frag_z);
4823 
4824       /* VRS Rate X = Ancillary[2:3] */
4825       Temp x_rate =
4826          bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4827                   Operand::c32(2u), Operand::c32(2u));
4828 
4829       /* xRate = xRate == 0x1 ? adjusted_frag_z : frag_z. */
4830       Temp cond =
4831          bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
4832       vec->operands[2] =
4833          bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), frag_z, adjusted_frag_z, cond);
4834    }
4835 
4836    for (Operand& op : vec->operands)
4837       op = op.isUndefined() ? Operand::zero() : op;
4838 
4839    vec->definitions[0] = Definition(dst);
4840    ctx->block->instructions.emplace_back(std::move(vec));
4841    emit_split_vector(ctx, dst, num_components);
4842    return;
4843 }
4844 
4845 void
4846 emit_load_frag_shading_rate(isel_context* ctx, Temp dst)
4847 {
4848    Builder bld(ctx->program, ctx->block);
4849    Temp cond;
4850 
4851    /* VRS Rate X = Ancillary[2:3]
4852     * VRS Rate Y = Ancillary[4:5]
4853     */
4854    Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4855                           Operand::c32(2u), Operand::c32(2u));
4856    Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4857                           Operand::c32(4u), Operand::c32(2u));
4858 
4859    /* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */
4860    cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
4861    x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
4862                      bld.copy(bld.def(v1), Operand::c32(4u)), cond);
4863 
4864    /* yRate = yRate == 0x1 ? Vertical2Pixels : None. */
4865    cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(y_rate));
4866    y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
4867                      bld.copy(bld.def(v1), Operand::c32(1u)), cond);
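        /* The OR below packs the rates in the usual HW encoding, i.e.
         * (log2(x_rate) << 2) | log2(y_rate): x_rate contributes 4 (= 1 << 2)
         * and y_rate contributes 1 when 2-pixel coarse shading is enabled on
         * the respective axis. */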
4868 
4869    bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate));
4870 }
4871 
4872 void
4873 visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
4874 {
4875    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4876    Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
4877    unsigned idx = nir_intrinsic_base(instr);
4878    unsigned component = nir_intrinsic_component(instr);
4879    Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
4880 
4881    assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));
4882 
4883    if (instr->dest.ssa.num_components == 1) {
4884       emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
4885    } else {
4886       aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
4887          aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
4888       for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) {
4889          Temp tmp = ctx->program->allocateTmp(instr->dest.ssa.bit_size == 16 ? v2b : v1);
4890          emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask);
4891          vec->operands[i] = Operand(tmp);
4892       }
4893       vec->definitions[0] = Definition(dst);
4894       ctx->block->instructions.emplace_back(std::move(vec));
4895    }
4896 }
4897 
4898 bool
4899 check_vertex_fetch_size(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,
4900                         unsigned binding_align, unsigned channels)
4901 {
4902    unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
4903    if (vtx_info->chan_byte_size != 4 && channels == 3)
4904       return false;
4905 
4906    /* Split typed vertex buffer loads on GFX6 and GFX10+ to avoid any
4907     * alignment issues that trigger memory violations and eventually a GPU
4908     * hang. This can happen if the stride (static or dynamic) is unaligned,
4909     * or if the VBO offset is only scalar-aligned (e.g. stride is 8 and the
4910     * VBO offset is 2 for R16G16B16A16_SNORM).
4911     */
4912    return (ctx->options->chip_class >= GFX7 && ctx->options->chip_class <= GFX9) ||
4913           (offset % vertex_byte_size == 0 && MAX2(binding_align, 1) % vertex_byte_size == 0);
4914 }
4915 
4916 uint8_t
4917 get_fetch_data_format(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,
4918                       unsigned* channels, unsigned max_channels, unsigned binding_align)
4919 {
4920    if (!vtx_info->chan_byte_size) {
4921       *channels = vtx_info->num_channels;
4922       return vtx_info->chan_format;
4923    }
4924 
4925    unsigned num_channels = *channels;
4926    if (!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, *channels)) {
4927       unsigned new_channels = num_channels + 1;
4928       /* first, assume more loads is worse and try using a larger data format */
4929       while (new_channels <= max_channels &&
4930              !check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels)) {
4931          new_channels++;
4932       }
4933 
4934       if (new_channels > max_channels) {
4935          /* then try decreasing load size (at the cost of more loads) */
4936          new_channels = *channels;
4937          while (new_channels > 1 &&
4938                 !check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels))
4939             new_channels--;
4940       }
4941 
4942       if (new_channels < *channels)
4943          *channels = new_channels;
4944       num_channels = new_channels;
4945    }
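        /* Worked example (illustrative): a 3-channel 16-bit attribute always
         * fails check_vertex_fetch_size (there is no 16_16_16 format), so we
         * first try widening to 16_16_16_16; if offset/stride alignment rules
         * out the 8-byte format too, we narrow to 16_16 or 16 and emit more
         * fetches instead. */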
4946 
4947    switch (vtx_info->chan_format) {
4948    case V_008F0C_BUF_DATA_FORMAT_8:
4949       return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,
4950                                     V_008F0C_BUF_DATA_FORMAT_INVALID,
4951                                     V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];
4952    case V_008F0C_BUF_DATA_FORMAT_16:
4953       return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,
4954                                     V_008F0C_BUF_DATA_FORMAT_INVALID,
4955                                     V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];
4956    case V_008F0C_BUF_DATA_FORMAT_32:
4957       return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
4958                                     V_008F0C_BUF_DATA_FORMAT_32_32_32,
4959                                     V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];
4960    }
4961    unreachable("shouldn't reach here");
4962    return V_008F0C_BUF_DATA_FORMAT_INVALID;
4963 }
4964 
4965 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
4966  * so we may need to fix it up. */
4967 Temp
4968 adjust_vertex_fetch_alpha(isel_context* ctx, enum radv_vs_input_alpha_adjust adjustment, Temp alpha)
4969 {
4970    Builder bld(ctx->program, ctx->block);
4971 
4972    if (adjustment == ALPHA_ADJUST_SSCALED)
4973       alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
4974 
4975    /* For the integer-like cases, do a natural sign extension.
4976     *
4977     * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
4978     * and happen to contain 0, 1, 2, 3 as the two LSBs of the
4979     * exponent.
4980     */
4981    unsigned offset = adjustment == ALPHA_ADJUST_SNORM ? 23u : 0u;
4982    alpha =
4983       bld.vop3(aco_opcode::v_bfe_i32, bld.def(v1), alpha, Operand::c32(offset), Operand::c32(2u));
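        /* v_bfe_i32(alpha, offset, 2) sign-extends the selected 2-bit field:
         * 0 -> 0, 1 -> 1, 2 -> -2, 3 -> -1, which is exactly the natural sign
         * extension wanted for the integer-like cases. */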
4984 
4985    /* Convert back to the right type. */
4986    if (adjustment == ALPHA_ADJUST_SNORM) {
4987       alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4988       alpha = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::c32(0xbf800000u), alpha);
4989    } else if (adjustment == ALPHA_ADJUST_SSCALED) {
4990       alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4991    }
4992 
4993    return alpha;
4994 }
4995 
4996 void
4997 visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
4998 {
4999    Builder bld(ctx->program, ctx->block);
5000    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5001    nir_src offset = *nir_get_io_offset_src(instr);
5002 
5003    if (ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.dynamic_inputs) {
5004       if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5005          isel_err(offset.ssa->parent_instr,
5006                   "Unimplemented non-zero nir_intrinsic_load_input offset");
5007 
5008       unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0;
5009       unsigned component = nir_intrinsic_component(instr);
5010       unsigned bitsize = instr->dest.ssa.bit_size;
5011       unsigned num_components = instr->dest.ssa.num_components;
5012 
5013       Temp input = get_arg(ctx, ctx->args->vs_inputs[location]);
5014 
5015       aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
5016          aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5017       std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5018       for (unsigned i = 0; i < num_components; i++) {
5019          elems[i] = emit_extract_vector(ctx, input, component + i, bitsize == 64 ? v2 : v1);
5020          if (bitsize == 16) {
5021             if (nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr)) == nir_type_float)
5022                elems[i] = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), elems[i]);
5023             else
5024                elems[i] = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), elems[i],
5025                                      Operand::c32(0u));
5026          }
5027          vec->operands[i] = Operand(elems[i]);
5028       }
5029       vec->definitions[0] = Definition(dst);
5030       ctx->block->instructions.emplace_back(std::move(vec));
5031       ctx->allocated_vec.emplace(dst.id(), elems);
5032    } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
5033 
5034       if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5035          isel_err(offset.ssa->parent_instr,
5036                   "Unimplemented non-zero nir_intrinsic_load_input offset");
5037 
5038       Temp vertex_buffers =
5039          convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.vertex_buffers));
5040 
5041       unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0;
5042       unsigned component = nir_intrinsic_component(instr);
5043       unsigned bitsize = instr->dest.ssa.bit_size;
5044       unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
5045       uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
5046       uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
5047       unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
5048       unsigned binding_align = ctx->options->key.vs.vertex_binding_align[attrib_binding];
5049       enum radv_vs_input_alpha_adjust alpha_adjust =
5050          ctx->options->key.vs.vertex_alpha_adjust[location];
5051 
5052       unsigned dfmt = attrib_format & 0xf;
5053       unsigned nfmt = (attrib_format >> 4) & 0x7;
5054       const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt);
5055 
5056       unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
5057       unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
5058       bool post_shuffle = ctx->options->key.vs.vertex_post_shuffle & (1 << location);
5059       if (post_shuffle)
5060          num_channels = MAX2(num_channels, 3);
5061 
5062       unsigned desc_index =
5063          ctx->program->info->vs.use_per_attribute_vb_descs ? location : attrib_binding;
5064       desc_index = util_bitcount(ctx->program->info->vs.vb_desc_usage_mask &
5065                                  u_bit_consecutive(0, desc_index));
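           /* vb_desc_usage_mask compacts the descriptor array: count how many
            * used slots precede this one. E.g. with a usage mask of 0b1011,
            * slot 3 maps to compacted index 2 (bits 0 and 1 are set below
            * bit 3). */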
5066       Operand off = bld.copy(bld.def(s1), Operand::c32(desc_index * 16u));
5067       Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off);
5068 
5069       Temp index;
5070       if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
5071          uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
5072          Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
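              /* Instance-rate divisor semantics: divisor == 0 means every
               * instance reads the same element (index = start_instance only);
               * otherwise index = start_instance + instance_id / divisor. */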
5073          if (divisor) {
5074             Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
5075             if (divisor != 1) {
5076                Temp divided = bld.tmp(v1);
5077                emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
5078                index = bld.vadd32(bld.def(v1), start_instance, divided);
5079             } else {
5080                index = bld.vadd32(bld.def(v1), start_instance, instance_id);
5081             }
5082          } else {
5083             index = bld.copy(bld.def(v1), start_instance);
5084          }
5085       } else {
5086          index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.base_vertex),
5087                             get_arg(ctx, ctx->args->ac.vertex_id));
5088       }
5089 
5090       Temp* const channels = (Temp*)alloca(num_channels * sizeof(Temp));
5091       unsigned channel_start = 0;
5092       bool direct_fetch = false;
5093 
5094       /* skip unused channels at the start */
5095       if (vtx_info->chan_byte_size && !post_shuffle) {
5096          channel_start = ffs(mask) - 1;
5097          for (unsigned i = 0; i < MIN2(channel_start, num_channels); i++)
5098             channels[i] = Temp(0, s1);
5099       } else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) {
5100          num_channels = 3 - (ffs(mask) - 1);
5101       }
5102 
5103       /* load channels */
5104       while (channel_start < num_channels) {
5105          unsigned fetch_component = num_channels - channel_start;
5106          unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
5107          bool expanded = false;
5108 
5109          /* use MUBUF when possible to avoid possible alignment issues */
5110          /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */
5111          bool use_mubuf =
5112             (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT || nfmt == V_008F0C_BUF_NUM_FORMAT_UINT ||
5113              nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) &&
5114             vtx_info->chan_byte_size == 4;
5115          unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
5116          if (!use_mubuf) {
5117             fetch_dfmt =
5118                get_fetch_data_format(ctx, vtx_info, fetch_offset, &fetch_component,
5119                                      vtx_info->num_channels - channel_start, binding_align);
5120          } else {
5121             if (fetch_component == 3 && ctx->options->chip_class == GFX6) {
5122                /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */
5123                fetch_component = 4;
5124                expanded = true;
5125             }
5126          }
5127 
5128          unsigned fetch_bytes = fetch_component * bitsize / 8;
5129 
5130          Temp fetch_index = index;
5131          if (attrib_stride != 0 && fetch_offset > attrib_stride) {
5132             fetch_index =
5133                bld.vadd32(bld.def(v1), Operand::c32(fetch_offset / attrib_stride), fetch_index);
5134             fetch_offset = fetch_offset % attrib_stride;
5135          }
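              /* Whole multiples of the stride were folded into the per-vertex
               * index above so the remaining constant offset stays below the
               * stride. */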
5136 
5137          Operand soffset = Operand::zero();
5138          if (fetch_offset >= 4096) {
5139             soffset = bld.copy(bld.def(s1), Operand::c32(fetch_offset / 4096 * 4096));
5140             fetch_offset %= 4096;
5141          }
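              /* The MUBUF/MTBUF immediate offset field is only 12 bits
               * (0..4095), so larger constant offsets are moved into soffset
               * in 4096-byte chunks. */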
5142 
5143          aco_opcode opcode;
5144          switch (fetch_bytes) {
5145          case 2:
5146             assert(!use_mubuf && bitsize == 16);
5147             opcode = aco_opcode::tbuffer_load_format_d16_x;
5148             break;
5149          case 4:
5150             if (bitsize == 16) {
5151                assert(!use_mubuf);
5152                opcode = aco_opcode::tbuffer_load_format_d16_xy;
5153             } else {
5154                opcode =
5155                   use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
5156             }
5157             break;
5158          case 6:
5159             assert(!use_mubuf && bitsize == 16);
5160             opcode = aco_opcode::tbuffer_load_format_d16_xyz;
5161             break;
5162          case 8:
5163             if (bitsize == 16) {
5164                assert(!use_mubuf);
5165                opcode = aco_opcode::tbuffer_load_format_d16_xyzw;
5166             } else {
5167                opcode =
5168                   use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
5169             }
5170             break;
5171          case 12:
5172             assert(ctx->options->chip_class >= GFX7 ||
5173                    (!use_mubuf && ctx->options->chip_class == GFX6));
5174             opcode =
5175                use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
5176             break;
5177          case 16:
5178             opcode =
5179                use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
5180             break;
5181          default: unreachable("Unimplemented load_input vector size");
5182          }
5183 
5184          Temp fetch_dst;
5185          if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && !expanded &&
5186              (alpha_adjust == ALPHA_ADJUST_NONE || num_channels <= 3)) {
5187             direct_fetch = true;
5188             fetch_dst = dst;
5189          } else {
5190             fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes));
5191          }
5192 
5193          if (use_mubuf) {
5194             Instruction* mubuf = bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index,
5195                                            soffset, fetch_offset, false, false, true)
5196                                     .instr;
5197             mubuf->mubuf().vtx_binding = attrib_binding + 1;
5198          } else {
5199             Instruction* mtbuf = bld.mtbuf(opcode, Definition(fetch_dst), list, fetch_index,
5200                                            soffset, fetch_dfmt, nfmt, fetch_offset, false, true)
5201                                     .instr;
5202             mtbuf->mtbuf().vtx_binding = attrib_binding + 1;
5203          }
5204 
5205          emit_split_vector(ctx, fetch_dst, fetch_dst.size());
5206 
5207          if (fetch_component == 1) {
5208             channels[channel_start] = fetch_dst;
5209          } else {
5210             for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)
5211                channels[channel_start + i] =
5212                   emit_extract_vector(ctx, fetch_dst, i, bitsize == 16 ? v2b : v1);
5213          }
5214 
5215          channel_start += fetch_component;
5216       }
5217 
5218       if (!direct_fetch) {
5219          bool is_float =
5220             nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
5221 
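              /* post_shuffle is set for BGRA-ordered vertex formats (e.g.
               * VK_FORMAT_B8G8R8A8_*): the {2, 1, 0, 3} swizzle swaps the red
               * and blue channels back into RGBA order. */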
5222          static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
5223          static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
5224          const unsigned* swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
5225          unsigned num_components = instr->dest.ssa.num_components;
5226 
5227          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
5228             aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5229          std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5230          unsigned num_temp = 0;
5231          for (unsigned i = 0; i < num_components; i++) {
5232             unsigned idx = i + component;
5233             if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) {
5234                Temp channel = channels[swizzle[idx]];
5235                if (idx == 3 && alpha_adjust != ALPHA_ADJUST_NONE)
5236                   channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel);
5237                vec->operands[i] = Operand(channel);
5238 
5239                num_temp++;
5240                elems[i] = channel;
5241             } else if (is_float && idx == 3) {
5242                vec->operands[i] = Operand::c32(0x3f800000u);
5243             } else if (!is_float && idx == 3) {
5244                vec->operands[i] = Operand::c32(1u);
5245             } else {
5246                vec->operands[i] = Operand::zero();
5247             }
5248          }
5249          vec->definitions[0] = Definition(dst);
5250          ctx->block->instructions.emplace_back(std::move(vec));
5251          emit_split_vector(ctx, dst, num_components);
5252 
5253          if (num_temp == num_components)
5254             ctx->allocated_vec.emplace(dst.id(), elems);
5255       }
5256    } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
5257       if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5258          isel_err(offset.ssa->parent_instr,
5259                   "Unimplemented non-zero nir_intrinsic_load_input offset");
5260 
5261       Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
5262 
5263       unsigned idx = nir_intrinsic_base(instr);
5264       unsigned component = nir_intrinsic_component(instr);
5265       unsigned vertex_id = 2; /* P0 */
5266 
5267       if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
5268          nir_const_value* src0 = nir_src_as_const_value(instr->src[0]);
5269          switch (src0->u32) {
5270          case 0:
5271             vertex_id = 2; /* P0 */
5272             break;
5273          case 1:
5274             vertex_id = 0; /* P10 */
5275             break;
5276          case 2:
5277             vertex_id = 1; /* P20 */
5278             break;
5279          default: unreachable("invalid vertex index");
5280          }
5281       }
5282 
5283       if (instr->dest.ssa.num_components == 1 &&
5284           instr->dest.ssa.bit_size != 64) {
5285          bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id),
5286                     bld.m0(prim_mask), idx, component);
5287       } else {
5288          unsigned num_components = instr->dest.ssa.num_components;
5289          if (instr->dest.ssa.bit_size == 64)
5290             num_components *= 2;
5291          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5292             aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5293          for (unsigned i = 0; i < num_components; i++) {
5294             unsigned chan_component = (component + i) % 4;
5295             unsigned chan_idx = idx + (component + i) / 4;
5296             vec->operands[i] = bld.vintrp(
5297                aco_opcode::v_interp_mov_f32, bld.def(instr->dest.ssa.bit_size == 16 ? v2b : v1),
5298                Operand::c32(vertex_id), bld.m0(prim_mask), chan_idx, chan_component);
5299          }
5300          vec->definitions[0] = Definition(dst);
5301          bld.insert(std::move(vec));
5302       }
5303    } else {
5304       unreachable("Shader stage not implemented");
5305    }
5306 }
5307 
5308 void
5309 visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5310 {
5311    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5312 
5313    Builder bld(ctx->program, ctx->block);
5314    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5315 
5316    if (load_input_from_temps(ctx, instr, dst))
5317       return;
5318 
5319    unreachable("LDS-based TCS input should have been lowered in NIR.");
5320 }
5321 
5322 void
5323 visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5324 {
5325    switch (ctx->shader->info.stage) {
5326    case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;
5327    default: unreachable("Unimplemented shader stage");
5328    }
5329 }
5330 
5331 void
5332 visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr)
5333 {
5334    assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5335 
5336    Builder bld(ctx->program, ctx->block);
5337    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5338 
5339    Operand tes_u(get_arg(ctx, ctx->args->ac.tes_u));
5340    Operand tes_v(get_arg(ctx, ctx->args->ac.tes_v));
5341    Operand tes_w = Operand::zero();
5342 
5343    if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) {
5344       Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5345       tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::c32(0x3f800000u /* 1.0f */), tmp);
5346       tes_w = Operand(tmp);
5347    }
5348 
5349    Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5350    emit_split_vector(ctx, tess_coord, 3);
5351 }
5352 
5353 Temp
5354 load_desc_ptr(isel_context* ctx, unsigned desc_set)
5355 {
5356    const struct radv_userdata_locations *user_sgprs_locs = &ctx->program->info->user_sgprs_locs;
5357 
5358    if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) {
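           /* Indirect descriptor sets: a single user SGPR points to an array
            * of 32-bit set addresses; load the desc_set-th entry (byte offset
            * desc_set * 4). */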
5359       Builder bld(ctx->program, ctx->block);
5360       Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
5361       Operand off = bld.copy(bld.def(s1), Operand::c32(desc_set << 2));
5362       return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off);
5363    }
5364 
5365    return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
5366 }
5367 
5368 void
5369 visit_load_resource(isel_context* ctx, nir_intrinsic_instr* instr)
5370 {
5371    Builder bld(ctx->program, ctx->block);
5372    Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
5373    if (!nir_dest_is_divergent(instr->dest))
5374       index = bld.as_uniform(index);
5375    unsigned desc_set = nir_intrinsic_desc_set(instr);
5376    unsigned binding = nir_intrinsic_binding(instr);
5377 
5378    Temp desc_ptr;
5379    radv_pipeline_layout* pipeline_layout = ctx->options->layout;
5380    radv_descriptor_set_layout* layout = pipeline_layout->set[desc_set].layout;
5381    unsigned offset = layout->binding[binding].offset;
5382    unsigned stride;
5383    if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
5384        layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
5385       unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
5386                      layout->binding[binding].dynamic_offset_offset;
5387       desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);
5388       offset = pipeline_layout->push_constant_size + 16 * idx;
5389       stride = 16;
5390    } else {
5391       desc_ptr = load_desc_ptr(ctx, desc_set);
5392       stride = layout->binding[binding].size;
5393    }
5394 
5395    if (nir_src_is_const(instr->src[0])) {
5396       index =
5397          bld.copy(bld.def(s1), Operand::c32((offset + nir_src_as_uint(instr->src[0]) * stride)));
5398    } else if (index.type() == RegType::vgpr) {
5399       if (stride != 1) {
5400          bool index24bit = layout->binding[binding].array_size <= 0x1000000;
5401          index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
5402       }
5403       if (offset)
5404          index = bld.vadd32(bld.def(v1), Operand::c32(offset), index);
5405    } else {
5406       if (stride != 1)
5407          index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index);
5408       if (offset)
5409          index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5410                           Operand::c32(offset), index);
5411    }
5412 
5413    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5414    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5415    elems[0] = desc_ptr;
5416    elems[1] = index;
5417    ctx->allocated_vec.emplace(dst.id(), elems);
5418    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), desc_ptr, index, Operand::zero());
5419 }
5420 
5421 void
5422 load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
5423             Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false,
5424             bool allow_smem = true, memory_sync_info sync = memory_sync_info())
5425 {
5426    Builder bld(ctx->program, ctx->block);
5427 
5428    bool use_smem =
5429       dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem;
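        /* SMEM requires a uniform (SGPR) result, and coherent (glc) scalar
         * loads only exist on GFX8+; everything else goes through VMEM
         * (MUBUF). */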
5430    if (use_smem)
5431       offset = bld.as_uniform(offset);
5432    else {
5433       /* GFX6-7 are affected by a hw bug that prevents address clamping from
5434        * working correctly when the SGPR offset is used.
5435        */
5436       if (offset.type() == RegType::sgpr && ctx->options->chip_class < GFX8)
5437          offset = as_vgpr(ctx, offset);
5438    }
5439 
5440    LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5441    info.glc = glc;
5442    info.sync = sync;
5443    info.align_mul = align_mul;
5444    info.align_offset = align_offset;
5445    if (use_smem)
5446       emit_load(ctx, bld, info, smem_load_params);
5447    else
5448       emit_load(ctx, bld, info, mubuf_load_params);
5449 }
5450 
5451 Temp
5452 load_buffer_rsrc(isel_context* ctx, Temp rsrc)
5453 {
5454    Builder bld(ctx->program, ctx->block);
5455    Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1));
5456    Temp binding = bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1)));
5457    set_ptr = convert_pointer_to_64_bit(ctx, set_ptr);
5458    return bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), set_ptr, binding);
5459 }
5460 
5461 bool
5462 is_inline_ubo(isel_context* ctx, nir_src rsrc)
5463 {
5464    nir_binding binding = nir_chase_binding(rsrc);
5465    if (!binding.success)
5466       return false;
5467 
5468    radv_descriptor_set_layout* layout = ctx->options->layout->set[binding.desc_set].layout;
5469    return layout->binding[binding.binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT;
5470 }
5471 
5472 void
5473 visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
5474 {
5475    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5476    Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
5477 
5478    Builder bld(ctx->program, ctx->block);
5479 
5480    if (is_inline_ubo(ctx, instr->src[0])) {
5481       Temp set_ptr = bld.as_uniform(emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1)));
5482       Temp binding_off =
5483          bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1)));
5484       rsrc = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), set_ptr, binding_off);
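           /* rsrc is now the 32-bit address of the inline uniform data; wrap
            * it in a raw buffer descriptor below (base lo/hi, size, type
            * dword) so the regular buffer-load path can consume it. */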
5485 
5486       uint32_t desc_type =
5487          S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5488          S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5489       if (ctx->options->chip_class >= GFX10) {
5490          desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5491                       S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5492       } else {
5493          desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5494                       S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5495       }
5496       rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), rsrc,
5497                         Operand::c32(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
5498                         Operand::c32(0xFFFFFFFFu), Operand::c32(desc_type));
5499    } else {
5500       rsrc = load_buffer_rsrc(ctx, rsrc);
5501    }
5502    unsigned size = instr->dest.ssa.bit_size / 8;
5503    load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5504                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
5505 }
5506 
5507 void
5508 visit_load_sbt_amd(isel_context* ctx, nir_intrinsic_instr* instr)
5509 {
5510    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5511    unsigned binding = nir_intrinsic_binding(instr);
5512 
5513    Builder bld(ctx->program, ctx->block);
5514    Temp desc_base = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.sbt_descriptors));
5515    Operand desc_off = bld.copy(bld.def(s1), Operand::c32(binding * 16u));
5516    bld.smem(aco_opcode::s_load_dwordx4, Definition(dst), desc_base, desc_off);
5517 }
5518 
5519 void
5520 visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5521 {
5522    Builder bld(ctx->program, ctx->block);
5523    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5524    unsigned offset = nir_intrinsic_base(instr);
5525    unsigned count = instr->dest.ssa.num_components;
5526    nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]);
5527 
5528    if (index_cv && instr->dest.ssa.bit_size == 32) {
5529       struct radv_userdata_info *loc =
5530          &ctx->args->shader_info->user_sgprs_locs.shader_data[AC_UD_INLINE_PUSH_CONSTANTS];
5531       unsigned start = (offset + index_cv->u32) / 4u;
5532       unsigned num_inline_push_consts = loc->sgpr_idx != -1 ? loc->num_sgprs : 0;
5533 
5534       start -= ctx->args->shader_info->min_push_constant_used / 4;
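           /* radv may preload part of the push-constant range into user SGPRs;
            * if the whole load falls inside that window, assemble the result
            * directly from the inline SGPR args and skip the memory load. */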
5535       if (start + count <= num_inline_push_consts) {
5536          std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5537          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5538             aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5539          for (unsigned i = 0; i < count; ++i) {
5540             elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
5541             vec->operands[i] = Operand{elems[i]};
5542          }
5543          vec->definitions[0] = Definition(dst);
5544          ctx->block->instructions.emplace_back(std::move(vec));
5545          ctx->allocated_vec.emplace(dst.id(), elems);
5546          return;
5547       }
5548    }
5549 
5550    Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5551    if (offset != 0) // TODO check if index != 0 as well
5552       index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5553                              Operand::c32(offset), index);
5554    Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
5555    Temp vec = dst;
5556    bool trim = false;
5557    bool aligned = true;
5558 
5559    if (instr->dest.ssa.bit_size == 8) {
5560       aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5561       bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
5562       if (!aligned)
5563          vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
5564    } else if (instr->dest.ssa.bit_size == 16) {
5565       aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5566       if (!aligned)
5567          vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
5568    }
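        /* For sub-dword loads that aren't dword-aligned, whole dwords are
         * loaded into a temporary and the requested bytes are shifted into
         * place afterwards via byte_align_scalar(). */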
5569 
5570    aco_opcode op;
5571 
5572    switch (vec.size()) {
5573    case 1: op = aco_opcode::s_load_dword; break;
5574    case 2: op = aco_opcode::s_load_dwordx2; break;
5575    case 3:
5576       vec = bld.tmp(s4);
5577       trim = true;
5578       FALLTHROUGH;
5579    case 4: op = aco_opcode::s_load_dwordx4; break;
5580    case 6:
5581       vec = bld.tmp(s8);
5582       trim = true;
5583       FALLTHROUGH;
5584    case 8: op = aco_opcode::s_load_dwordx8; break;
5585    default: unreachable("unimplemented or forbidden load_push_constant.");
5586    }
5587 
5588    bld.smem(op, Definition(vec), ptr, index).instr->smem().prevent_overflow = true;
5589 
5590    if (!aligned) {
5591       Operand byte_offset = index_cv ? Operand::c32((offset + index_cv->u32) % 4) : Operand(index);
5592       byte_align_scalar(ctx, vec, byte_offset, dst);
5593       return;
5594    }
5595 
5596    if (trim) {
5597       emit_split_vector(ctx, vec, 4);
5598       RegClass rc = dst.size() == 3 ? s1 : s2;
5599       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc),
5600                  emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc));
5601    }
5602    emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
5603 }
5604 
5605 void
5606 visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5607 {
5608    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5609 
5610    Builder bld(ctx->program, ctx->block);
5611 
5612    uint32_t desc_type =
5613       S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5614       S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5615    if (ctx->options->chip_class >= GFX10) {
5616       desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5617                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5618    } else {
5619       desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5620                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5621    }
5622 
5623    unsigned base = nir_intrinsic_base(instr);
5624    unsigned range = nir_intrinsic_range(instr);
5625 
5626    Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5627    if (base && offset.type() == RegType::sgpr)
5628       offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
5629                               Operand::c32(base));
5630    else if (base && offset.type() == RegType::vgpr)
5631       offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset);
5632 
5633    Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5634                           bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc),
5635                                      Operand::c32(ctx->constant_data_offset)),
5636                           Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),
5637                           Operand::c32(desc_type));
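        /* p_constaddr materializes the address of the constant data appended
         * to the shader binary; clamping num_records to constant_data_size
         * means out-of-bounds accesses are bounded by the buffer descriptor. */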
5638    unsigned size = instr->dest.ssa.bit_size / 8;
5639    // TODO: get alignment information for subdword constants
5640    load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
5641 }
5642 
5643 void
5644 visit_discard_if(isel_context* ctx, nir_intrinsic_instr* instr)
5645 {
5646    if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
5647       ctx->cf_info.exec_potentially_empty_discard = true;
5648 
5649    ctx->program->needs_exact = true;
5650 
5651    // TODO: optimize uniform conditions
5652    Builder bld(ctx->program, ctx->block);
5653    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5654    assert(src.regClass() == bld.lm);
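        /* AND the condition with exec so that only currently active lanes can
         * request a discard; inactive lanes must keep their state for outer
         * exec masks. */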
5655    src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
5656    bld.pseudo(aco_opcode::p_discard_if, src);
5657    ctx->block->kind |= block_kind_uses_discard_if;
5658    return;
5659 }
5660 
5661 void
5662 visit_discard(isel_context* ctx, nir_intrinsic_instr* instr)
5663 {
5664    Builder bld(ctx->program, ctx->block);
5665 
5666    if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
5667       ctx->cf_info.exec_potentially_empty_discard = true;
5668 
5669    bool divergent =
5670       ctx->cf_info.parent_if.is_divergent || ctx->cf_info.parent_loop.has_divergent_continue;
5671 
5672    if (ctx->block->loop_nest_depth && (nir_instr_is_last(&instr->instr) && !divergent)) {
5673       /* we handle discards the same way as jump instructions */
5674       append_logical_end(ctx->block);
5675 
5676       /* in loops, discard behaves like break */
5677       Block* linear_target = ctx->cf_info.parent_loop.exit;
5678       ctx->block->kind |= block_kind_discard;
5679 
5680       /* uniform discard - loop ends here */
5681       assert(nir_instr_is_last(&instr->instr));
5682       ctx->block->kind |= block_kind_uniform;
5683       ctx->cf_info.has_branch = true;
5684       bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
5685       add_linear_edge(ctx->block->index, linear_target);
5686       return;
5687    }
5688 
5689    /* it can currently happen that NIR doesn't remove the unreachable code */
5690    if (!nir_instr_is_last(&instr->instr)) {
5691       ctx->program->needs_exact = true;
5692       /* save exec somewhere temporarily so that it doesn't get
5693        * overwritten before the discard from outer exec masks */
5694       Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc),
5695                            Operand::c32(0xFFFFFFFF), Operand(exec, bld.lm));
5696       bld.pseudo(aco_opcode::p_discard_if, cond);
5697       ctx->block->kind |= block_kind_uses_discard_if;
5698       return;
5699    }
5700 
5701    /* This condition is incorrect for uniformly branched discards in a loop
5702     * predicated by a divergent condition, but the above code catches that case
5703     * and the discard would end up turning into a discard_if.
5704     * For example:
5705     * if (divergent) {
5706     *    while (...) {
5707     *       if (uniform) {
5708     *          discard;
5709     *       }
5710     *    }
5711     * }
5712     */
5713    if (!ctx->cf_info.parent_if.is_divergent) {
5714       /* program just ends here */
5715       ctx->block->kind |= block_kind_uses_discard_if;
5716       bld.pseudo(aco_opcode::p_discard_if, Operand::c32(0xFFFFFFFFu));
5717       // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
5718    } else {
5719       ctx->block->kind |= block_kind_discard;
5720       /* branch and linear edge is added by visit_if() */
5721    }
5722 }
5723 
5724 enum aco_descriptor_type {
5725    ACO_DESC_IMAGE,
5726    ACO_DESC_FMASK,
5727    ACO_DESC_SAMPLER,
5728    ACO_DESC_BUFFER,
5729    ACO_DESC_PLANE_0,
5730    ACO_DESC_PLANE_1,
5731    ACO_DESC_PLANE_2,
5732 };
5733 
5734 static bool
5735 should_declare_array(isel_context* ctx, enum glsl_sampler_dim sampler_dim, bool is_array)
5736 {
5737    if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
5738       return false;
5739    ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
5740    return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray ||
5741           dim == ac_image_2darraymsaa;
5742 }
5743 
5744 Temp
5745 get_sampler_desc(isel_context* ctx, nir_deref_instr* deref_instr,
5746                  enum aco_descriptor_type desc_type, const nir_tex_instr* tex_instr, bool write)
5747 {
5748    /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc:
5749     *    auto it = ctx->tex_desc.find((uint64_t)desc_type << 32 | deref_instr->dest.ssa.index);
5750     *    if (it != ctx->tex_desc.end()) return it->second;
5751     */
5752    Temp index = Temp();
5753    bool index_set = false;
5754    unsigned constant_index = 0;
5755    unsigned descriptor_set;
5756    unsigned base_index;
5757    Builder bld(ctx->program, ctx->block);
5758 
5759    if (!deref_instr) {
5760       assert(tex_instr);
5761       descriptor_set = 0;
5762       base_index = tex_instr->sampler_index;
5763    } else {
5764       while (deref_instr->deref_type != nir_deref_type_var) {
5765          unsigned array_size = glsl_get_aoa_size(deref_instr->type);
5766          if (!array_size)
5767             array_size = 1;
5768 
5769          assert(deref_instr->deref_type == nir_deref_type_array);
5770          nir_const_value* const_value = nir_src_as_const_value(deref_instr->arr.index);
5771          if (const_value) {
5772             constant_index += array_size * const_value->u32;
5773          } else {
5774             Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
5775             if (indirect.type() == RegType::vgpr)
5776                indirect = bld.as_uniform(indirect);
5777 
5778             if (array_size != 1)
5779                indirect =
5780                   bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(array_size), indirect);
5781 
5782             if (!index_set) {
5783                index = indirect;
5784                index_set = true;
5785             } else {
5786                index =
5787                   bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
5788             }
5789          }
5790 
5791          deref_instr = nir_src_as_deref(deref_instr->parent);
5792       }
5793       descriptor_set = deref_instr->var->data.descriptor_set;
5794       base_index = deref_instr->var->data.binding;
5795    }
5796 
5797    Temp list = load_desc_ptr(ctx, descriptor_set);
5798    list = convert_pointer_to_64_bit(ctx, list);
5799 
5800    struct radv_descriptor_set_layout* layout = ctx->options->layout->set[descriptor_set].layout;
5801    struct radv_descriptor_set_binding_layout* binding = layout->binding + base_index;
5802    unsigned offset = binding->offset;
5803    unsigned stride = binding->size;
5804    aco_opcode opcode;
5805    RegClass type;
5806 
5807    assert(base_index < layout->binding_count);
5808 
5809    switch (desc_type) {
5810    case ACO_DESC_IMAGE:
5811       type = s8;
5812       opcode = aco_opcode::s_load_dwordx8;
5813       break;
5814    case ACO_DESC_FMASK:
5815       type = s8;
5816       opcode = aco_opcode::s_load_dwordx8;
5817       offset += 32;
5818       break;
5819    case ACO_DESC_SAMPLER:
5820       type = s4;
5821       opcode = aco_opcode::s_load_dwordx4;
5822       if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
5823          offset += radv_combined_image_descriptor_sampler_offset(binding);
5824       break;
5825    case ACO_DESC_BUFFER:
5826       type = s4;
5827       opcode = aco_opcode::s_load_dwordx4;
5828       break;
5829    case ACO_DESC_PLANE_0:
5830    case ACO_DESC_PLANE_1:
5831       type = s8;
5832       opcode = aco_opcode::s_load_dwordx8;
5833       offset += 32 * (desc_type - ACO_DESC_PLANE_0);
5834       break;
5835    case ACO_DESC_PLANE_2:
5836       type = s4;
5837       opcode = aco_opcode::s_load_dwordx4;
5838       offset += 64;
5839       break;
5840    default: unreachable("invalid desc_type\n");
5841    }
5842 
5843    offset += constant_index * stride;
5844 
5845    if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
5846        (!index_set || binding->immutable_samplers_equal)) {
5847       if (binding->immutable_samplers_equal)
5848          constant_index = 0;
5849 
5850       const uint32_t* samplers = radv_immutable_samplers(layout, binding);
5851       uint32_t dword0_mask = tex_instr->op == nir_texop_tg4 ? C_008F30_TRUNC_COORD : 0xffffffffu;
5852       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5853                         Operand::c32(samplers[constant_index * 4 + 0] & dword0_mask),
5854                         Operand::c32(samplers[constant_index * 4 + 1]),
5855                         Operand::c32(samplers[constant_index * 4 + 2]),
5856                         Operand::c32(samplers[constant_index * 4 + 3]));
5857    }
5858 
5859    Operand off;
5860    if (!index_set) {
5861       off = bld.copy(bld.def(s1), Operand::c32(offset));
5862    } else {
5863       off = Operand(
5864          (Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand::c32(offset),
5865                         bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index)));
5866    }
5867 
5868    Temp res = bld.smem(opcode, bld.def(type), list, off);
5869 
5870    if (desc_type == ACO_DESC_PLANE_2) {
5871       Temp components[8];
5872       for (unsigned i = 0; i < 8; i++)
5873          components[i] = bld.tmp(s1);
5874       bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5875                  Definition(components[2]), Definition(components[3]), res);
5876 
5877       Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, write);
5878       bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
5879                  Definition(components[4]), Definition(components[5]), Definition(components[6]),
5880                  Definition(components[7]), desc2);
5881 
5882       res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1],
5883                        components[2], components[3], components[4], components[5], components[6],
5884                        components[7]);
5885    } else if (desc_type == ACO_DESC_IMAGE && ctx->options->has_image_load_dcc_bug && !tex_instr &&
5886               !write) {
5887       Temp components[8];
5888       for (unsigned i = 0; i < 8; i++)
5889          components[i] = bld.tmp(s1);
5890 
5891       bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5892                  Definition(components[2]), Definition(components[3]), Definition(components[4]),
5893                  Definition(components[5]), Definition(components[6]), Definition(components[7]),
5894                  res);
5895 
5896       /* WRITE_COMPRESS_ENABLE must be 0 for all image loads to work around a
5897        * hardware bug.
5898        */
5899       components[6] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[6],
5900                                bld.copy(bld.def(s1), Operand::c32(C_00A018_WRITE_COMPRESS_ENABLE)));
5901 
5902       res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1],
5903                        components[2], components[3], components[4], components[5], components[6],
5904                        components[7]);
5905    } else if (desc_type == ACO_DESC_SAMPLER && tex_instr->op == nir_texop_tg4) {
5906       Temp components[4];
5907       for (unsigned i = 0; i < 4; i++)
5908          components[i] = bld.tmp(s1);
5909 
5910       bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5911                  Definition(components[2]), Definition(components[3]), res);
5912 
5913       /* We want to always use the linear filtering truncation behaviour for
5914        * nir_texop_tg4, even if the sampler uses nearest/point filtering.
5915        */
5916       components[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[0],
5917                                Operand::c32(C_008F30_TRUNC_COORD));
5918 
5919       res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), components[0], components[1],
5920                        components[2], components[3]);
5921    }
5922 
5923    return res;
5924 }
5925 
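/* Number of coordinate components used to address an image of the given
 * dimensionality, counting the layer index for arrays and the fragment/sample
 * index for multisampled images.
 */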
5926 static int
5927 image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
5928 {
5929    switch (dim) {
5930    case GLSL_SAMPLER_DIM_BUF: return 1;
5931    case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1;
5932    case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2;
5933    case GLSL_SAMPLER_DIM_MS: return array ? 4 : 3;
5934    case GLSL_SAMPLER_DIM_3D:
5935    case GLSL_SAMPLER_DIM_CUBE: return 3;
5936    case GLSL_SAMPLER_DIM_RECT:
5937    case GLSL_SAMPLER_DIM_SUBPASS: return 2;
5938    case GLSL_SAMPLER_DIM_SUBPASS_MS: return 3;
5939    default: break;
5940    }
5941    return 0;
5942 }
5943 
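/* Assembles a MIMG instruction from resource, sampler and coordinates. On
 * GFX10+, the NSA (non-sequential address) encoding lets each coordinate stay
 * in its own VGPR, up to a chip-dependent limit; on older chips (or when the
 * limit is exceeded) the coordinates are first packed into a single vector.
 * Coordinates selected by wqm_mask are transitioned to WQM.
 */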
5944 static MIMG_instruction*
5945 emit_mimg(Builder& bld, aco_opcode op, Definition dst, Temp rsrc, Operand samp,
5946           std::vector<Temp> coords, unsigned wqm_mask = 0, Operand vdata = Operand(v1))
5947 {
5948    /* Limit NSA instructions to 3 dwords on GFX10 to avoid stability issues. */
5949    unsigned max_nsa_size = bld.program->chip_class >= GFX10_3 ? 13 : 5;
5950    bool use_nsa = bld.program->chip_class >= GFX10 && coords.size() <= max_nsa_size;
5951 
5952    if (!use_nsa) {
5953       Temp coord = coords[0];
5954       if (coords.size() > 1) {
5955          coord = bld.tmp(RegType::vgpr, coords.size());
5956 
5957          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5958             aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
5959          for (unsigned i = 0; i < coords.size(); i++)
5960             vec->operands[i] = Operand(coords[i]);
5961          vec->definitions[0] = Definition(coord);
5962          bld.insert(std::move(vec));
5963       } else if (coord.type() == RegType::sgpr) {
5964          coord = bld.copy(bld.def(v1), coord);
5965       }
5966 
5967       if (wqm_mask) {
5968       /* We don't need the bias, sample index, compare value or offset to be
5969        * computed in WQM, but if the p_create_vector copies the coordinates, the
5970        * copy needs to execute in WQM. */
5971          coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true);
5972       }
5973 
5974       coords[0] = coord;
5975       coords.resize(1);
5976    } else {
5977       for (unsigned i = 0; i < coords.size(); i++) {
5978          if (wqm_mask & (1u << i))
5979             coords[i] = emit_wqm(bld, coords[i], bld.tmp(coords[i].regClass()), true);
5980       }
5981 
5982       for (Temp& coord : coords) {
5983          if (coord.type() == RegType::sgpr)
5984             coord = bld.copy(bld.def(v1), coord);
5985       }
5986    }
5987 
5988    aco_ptr<MIMG_instruction> mimg{
5989       create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), dst.isTemp())};
5990    if (dst.isTemp())
5991       mimg->definitions[0] = dst;
5992    mimg->operands[0] = Operand(rsrc);
5993    mimg->operands[1] = samp;
5994    mimg->operands[2] = vdata;
5995    for (unsigned i = 0; i < coords.size(); i++)
5996       mimg->operands[3 + i] = Operand(coords[i]);
5997 
5998    MIMG_instruction* res = mimg.get();
5999    bld.insert(std::move(mimg));
6000    return res;
6001 }
6002 
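/* Lowers the ray-tracing BVH intersection intrinsic to
 * image_bvh64_intersect_ray: the 64-bit node pointer, the ray extent, origin,
 * direction and inverse direction are flattened into twelve VGPR "coordinate"
 * operands.
 */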
6003 void
6004 visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
6005 {
6006    Builder bld(ctx->program, ctx->block);
6007    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6008    Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
6009    Temp node = get_ssa_temp(ctx, instr->src[1].ssa);
6010    Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);
6011    Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);
6012    Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);
6013    Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);
6014 
6015    std::vector<Temp> args;
6016    args.push_back(emit_extract_vector(ctx, node, 0, v1));
6017    args.push_back(emit_extract_vector(ctx, node, 1, v1));
6018    args.push_back(as_vgpr(ctx, tmax));
6019    args.push_back(emit_extract_vector(ctx, origin, 0, v1));
6020    args.push_back(emit_extract_vector(ctx, origin, 1, v1));
6021    args.push_back(emit_extract_vector(ctx, origin, 2, v1));
6022    args.push_back(emit_extract_vector(ctx, dir, 0, v1));
6023    args.push_back(emit_extract_vector(ctx, dir, 1, v1));
6024    args.push_back(emit_extract_vector(ctx, dir, 2, v1));
6025    args.push_back(emit_extract_vector(ctx, inv_dir, 0, v1));
6026    args.push_back(emit_extract_vector(ctx, inv_dir, 1, v1));
6027    args.push_back(emit_extract_vector(ctx, inv_dir, 2, v1));
6028 
6029    MIMG_instruction* mimg = emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, Definition(dst),
6030                                       resource, Operand(s4), args);
6031    mimg->dim = ac_image_1d;
6032    mimg->dmask = 0xf;
6033    mimg->unrm = true;
6034    mimg->r128 = true;
6035 }
6036 
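/* Gathers the VGPR coordinates for an image intrinsic: the base coordinates,
 * the sample index for multisampled images, and the LOD for loads/stores that
 * don't use level zero. GFX9 addresses 1D images as 2D, so a zero Y
 * coordinate is inserted there.
 */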
6037 static std::vector<Temp>
6038 get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr)
6039 {
6041    Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
6042    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6043    bool is_array = nir_intrinsic_image_array(instr);
6044    ASSERTED bool add_frag_pos =
6045       (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6046    assert(!add_frag_pos && "Input attachments should be lowered.");
6047    bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6048    bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
6049    int count = image_type_to_components_count(dim, is_array);
6050    std::vector<Temp> coords(count);
6051    Builder bld(ctx->program, ctx->block);
6052 
6053    if (is_ms)
6054       coords[--count] = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1);
6055 
6056    if (gfx9_1d) {
6057       coords[0] = emit_extract_vector(ctx, src0, 0, v1);
6058       coords.resize(coords.size() + 1);
6059       coords[1] = bld.copy(bld.def(v1), Operand::zero());
6060       if (is_array)
6061          coords[2] = emit_extract_vector(ctx, src0, 1, v1);
6062    } else {
6063       for (int i = 0; i < count; i++)
6064          coords[i] = emit_extract_vector(ctx, src0, i, v1);
6065    }
6066 
6067    if (instr->intrinsic == nir_intrinsic_image_deref_load ||
6068        instr->intrinsic == nir_intrinsic_image_deref_sparse_load ||
6069        instr->intrinsic == nir_intrinsic_image_deref_store) {
6070       int lod_index = instr->intrinsic == nir_intrinsic_image_deref_store ? 4 : 3;
6071       bool level_zero =
6072          nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
6073 
6074       if (!level_zero)
6075          coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa));
6076    }
6077 
6078    return coords;
6079 }
6080 
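/* Derives the synchronization info for a memory intrinsic from its NIR access
 * flags; atomics are handled up front because they might lack
 * NIR_INTRINSIC_ACCESS.
 */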
6081 memory_sync_info
6082 get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics)
6083 {
6084    /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
6085    if (semantics & semantic_atomicrmw)
6086       return memory_sync_info(storage, semantics);
6087 
6088    unsigned access = nir_intrinsic_access(instr);
6089 
6090    if (access & ACCESS_VOLATILE)
6091       semantics |= semantic_volatile;
6092    if (access & ACCESS_CAN_REORDER)
6093       semantics |= semantic_can_reorder | semantic_private;
6094 
6095    return memory_sync_info(storage, semantics);
6096 }
6097 
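/* TFE (texture-fail-enable) loads only overwrite the components the hardware
 * actually fetches, so the destination is pre-initialized with zeroes.
 */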
6098 Operand
6099 emit_tfe_init(Builder& bld, Temp dst)
6100 {
6101    Temp tmp = bld.tmp(dst.regClass());
6102 
6103    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
6104       aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6105    for (unsigned i = 0; i < dst.size(); i++)
6106       vec->operands[i] = Operand::zero();
6107    vec->definitions[0] = Definition(tmp);
6108    /* Since this is fixed to an instruction's definition register, any CSE will
6109     * just create copies. Copying costs about the same as zero-initialization,
6110     * but these copies can break up clauses.
6111     */
6112    vec->definitions[0].setNoCSE(true);
6113    bld.insert(std::move(vec));
6114 
6115    return Operand(tmp);
6116 }
6117 
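/* Image loads: buffer images go through MUBUF buffer_load_format_*, all other
 * dimensionalities through MIMG image_load(_mip). The dmask is trimmed to the
 * components actually read, with special handling for 64-bit formats (only
 * R64_UINT/R64_SINT exist) and for the sparse residency code.
 */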
6118 void
6119 visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
6120 {
6121    Builder bld(ctx->program, ctx->block);
6122    const nir_variable* var =
6123       nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6124    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6125    bool is_array = nir_intrinsic_image_array(instr);
6126    bool is_sparse = instr->intrinsic == nir_intrinsic_image_deref_sparse_load;
6127    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6128 
6129    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6130    unsigned access = var->data.access | nir_intrinsic_access(instr);
6131 
6132    unsigned result_size = instr->dest.ssa.num_components - is_sparse;
6133    unsigned expand_mask =
6134       nir_ssa_def_components_read(&instr->dest.ssa) & u_bit_consecutive(0, result_size);
6135    expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */
6136    if (dim == GLSL_SAMPLER_DIM_BUF)
6137       expand_mask = (1u << util_last_bit(expand_mask)) - 1u;
6138    unsigned dmask = expand_mask;
6139    if (instr->dest.ssa.bit_size == 64) {
6140       expand_mask &= 0x9;
6141       /* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */
6142       dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);
6143    }
6144    if (is_sparse)
6145       expand_mask |= 1 << result_size;
6146    unsigned num_components = util_bitcount(dmask) + is_sparse;
6147 
6148    Temp tmp;
6149    if (num_components == dst.size() && dst.type() == RegType::vgpr)
6150       tmp = dst;
6151    else
6152       tmp = ctx->program->allocateTmp(RegClass(RegType::vgpr, num_components));
6153 
6154    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6155                                     dim == GLSL_SAMPLER_DIM_BUF ? ACO_DESC_BUFFER : ACO_DESC_IMAGE,
6156                                     nullptr, false);
6157 
6158    if (dim == GLSL_SAMPLER_DIM_BUF) {
6159       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6160 
6161       aco_opcode opcode;
6162       switch (util_bitcount(dmask)) {
6163       case 1: opcode = aco_opcode::buffer_load_format_x; break;
6164       case 2: opcode = aco_opcode::buffer_load_format_xy; break;
6165       case 3: opcode = aco_opcode::buffer_load_format_xyz; break;
6166       case 4: opcode = aco_opcode::buffer_load_format_xyzw; break;
6167       default: unreachable(">4 channel buffer image load");
6168       }
6169       aco_ptr<MUBUF_instruction> load{
6170          create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3 + is_sparse, 1)};
6171       load->operands[0] = Operand(resource);
6172       load->operands[1] = Operand(vindex);
6173       load->operands[2] = Operand::c32(0);
6174       load->definitions[0] = Definition(tmp);
6175       load->idxen = true;
6176       load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6177       load->dlc = load->glc && ctx->options->chip_class >= GFX10;
6178       load->sync = sync;
6179       load->tfe = is_sparse;
6180       if (load->tfe)
6181          load->operands[3] = emit_tfe_init(bld, tmp);
6182       ctx->block->instructions.emplace_back(std::move(load));
6183    } else {
6184       std::vector<Temp> coords = get_image_coords(ctx, instr);
6185 
6186       bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
6187       aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
6188 
6189       Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
6190       MIMG_instruction* load =
6191          emit_mimg(bld, opcode, Definition(tmp), resource, Operand(s4), coords, 0, vdata);
6192       load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
6193       load->dlc = load->glc && ctx->options->chip_class >= GFX10;
6194       load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6195       load->dmask = dmask;
6196       load->unrm = true;
6197       load->da = should_declare_array(ctx, dim, is_array);
6198       load->sync = sync;
6199       load->tfe = is_sparse;
6200    }
6201 
6202    if (is_sparse && instr->dest.ssa.bit_size == 64) {
6203       /* The result components are 64-bit but the sparse residency code is
6204        * 32-bit. So add a zero to the end so expand_vector() works correctly.
6205        */
6206       tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp,
6207                        Operand::zero());
6208    }
6209 
6210    expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, expand_mask);
6211 }
6212 
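/* Image stores mirror the load path: MUBUF buffer_store_format_* for buffer
 * images, MIMG image_store(_mip) otherwise. Stores disable WQM and require
 * exact execution so that helper invocations don't write memory.
 */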
6213 void
6214 visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
6215 {
6216    const nir_variable* var =
6217       nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6218    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6219    bool is_array = nir_intrinsic_image_array(instr);
6220    Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
6221 
6222    /* only R64_UINT and R64_SINT supported */
6223    if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)
6224       data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
6225    data = as_vgpr(ctx, data);
6226 
6227    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6228    unsigned access = var->data.access | nir_intrinsic_access(instr);
6229    bool glc = ctx->options->chip_class == GFX6 ||
6230               (access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE));
6233 
6234    if (dim == GLSL_SAMPLER_DIM_BUF) {
6235       Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6236                                    ACO_DESC_BUFFER, nullptr, true);
6237       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6238       aco_opcode opcode;
6239       switch (data.size()) {
6240       case 1: opcode = aco_opcode::buffer_store_format_x; break;
6241       case 2: opcode = aco_opcode::buffer_store_format_xy; break;
6242       case 3: opcode = aco_opcode::buffer_store_format_xyz; break;
6243       case 4: opcode = aco_opcode::buffer_store_format_xyzw; break;
6244       default: unreachable(">4 channel buffer image store");
6245       }
6246       aco_ptr<MUBUF_instruction> store{
6247          create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
6248       store->operands[0] = Operand(rsrc);
6249       store->operands[1] = Operand(vindex);
6250       store->operands[2] = Operand::c32(0);
6251       store->operands[3] = Operand(data);
6252       store->idxen = true;
6253       store->glc = glc;
6254       store->dlc = false;
6255       store->disable_wqm = true;
6256       store->sync = sync;
6257       ctx->program->needs_exact = true;
6258       ctx->block->instructions.emplace_back(std::move(store));
6259       return;
6260    }
6261 
6262    assert(data.type() == RegType::vgpr);
6263    std::vector<Temp> coords = get_image_coords(ctx, instr);
6264    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6265                                     ACO_DESC_IMAGE, nullptr, true);
6266 
6267    bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
6268    aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
6269 
6270    Builder bld(ctx->program, ctx->block);
6271    MIMG_instruction* store =
6272       emit_mimg(bld, opcode, Definition(), resource, Operand(s4), coords, 0, Operand(data));
6273    store->glc = glc;
6274    store->dlc = false;
6275    store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6276    store->dmask = (1 << data.size()) - 1;
6277    store->unrm = true;
6278    store->da = should_declare_array(ctx, dim, is_array);
6279    store->disable_wqm = true;
6280    store->sync = sync;
6281    ctx->program->needs_exact = true;
6283 }
6284 
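/* Maps nir_intrinsic_image_deref_atomic_* to the corresponding buffer or
 * image atomic opcode, with _x2 variants for 64-bit operands. glc is only set
 * when the previous value is actually used.
 */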
6285 void
6286 visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6287 {
6288    bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6289    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6290    bool is_array = nir_intrinsic_image_array(instr);
6291    Builder bld(ctx->program, ctx->block);
6292 
6293    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6294    bool is_64bit = data.bytes() == 8;
6295    assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
6296 
6297    if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
6298       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),
6299                         get_ssa_temp(ctx, instr->src[4].ssa), data);
6300 
6301    aco_opcode buf_op, buf_op64, image_op;
6302    switch (instr->intrinsic) {
6303    case nir_intrinsic_image_deref_atomic_add:
6304       buf_op = aco_opcode::buffer_atomic_add;
6305       buf_op64 = aco_opcode::buffer_atomic_add_x2;
6306       image_op = aco_opcode::image_atomic_add;
6307       break;
6308    case nir_intrinsic_image_deref_atomic_umin:
6309       buf_op = aco_opcode::buffer_atomic_umin;
6310       buf_op64 = aco_opcode::buffer_atomic_umin_x2;
6311       image_op = aco_opcode::image_atomic_umin;
6312       break;
6313    case nir_intrinsic_image_deref_atomic_imin:
6314       buf_op = aco_opcode::buffer_atomic_smin;
6315       buf_op64 = aco_opcode::buffer_atomic_smin_x2;
6316       image_op = aco_opcode::image_atomic_smin;
6317       break;
6318    case nir_intrinsic_image_deref_atomic_umax:
6319       buf_op = aco_opcode::buffer_atomic_umax;
6320       buf_op64 = aco_opcode::buffer_atomic_umax_x2;
6321       image_op = aco_opcode::image_atomic_umax;
6322       break;
6323    case nir_intrinsic_image_deref_atomic_imax:
6324       buf_op = aco_opcode::buffer_atomic_smax;
6325       buf_op64 = aco_opcode::buffer_atomic_smax_x2;
6326       image_op = aco_opcode::image_atomic_smax;
6327       break;
6328    case nir_intrinsic_image_deref_atomic_and:
6329       buf_op = aco_opcode::buffer_atomic_and;
6330       buf_op64 = aco_opcode::buffer_atomic_and_x2;
6331       image_op = aco_opcode::image_atomic_and;
6332       break;
6333    case nir_intrinsic_image_deref_atomic_or:
6334       buf_op = aco_opcode::buffer_atomic_or;
6335       buf_op64 = aco_opcode::buffer_atomic_or_x2;
6336       image_op = aco_opcode::image_atomic_or;
6337       break;
6338    case nir_intrinsic_image_deref_atomic_xor:
6339       buf_op = aco_opcode::buffer_atomic_xor;
6340       buf_op64 = aco_opcode::buffer_atomic_xor_x2;
6341       image_op = aco_opcode::image_atomic_xor;
6342       break;
6343    case nir_intrinsic_image_deref_atomic_exchange:
6344       buf_op = aco_opcode::buffer_atomic_swap;
6345       buf_op64 = aco_opcode::buffer_atomic_swap_x2;
6346       image_op = aco_opcode::image_atomic_swap;
6347       break;
6348    case nir_intrinsic_image_deref_atomic_comp_swap:
6349       buf_op = aco_opcode::buffer_atomic_cmpswap;
6350       buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6351       image_op = aco_opcode::image_atomic_cmpswap;
6352       break;
6353    case nir_intrinsic_image_deref_atomic_fmin:
6354       buf_op = aco_opcode::buffer_atomic_fmin;
6355       buf_op64 = aco_opcode::buffer_atomic_fmin_x2;
6356       image_op = aco_opcode::image_atomic_fmin;
6357       break;
6358    case nir_intrinsic_image_deref_atomic_fmax:
6359       buf_op = aco_opcode::buffer_atomic_fmax;
6360       buf_op64 = aco_opcode::buffer_atomic_fmax_x2;
6361       image_op = aco_opcode::image_atomic_fmax;
6362       break;
6363    default:
6364       unreachable("visit_image_atomic should only be called with "
6365                   "nir_intrinsic_image_deref_atomic_* instructions.");
6366    }
6367 
6368    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6369    memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
6370 
6371    if (dim == GLSL_SAMPLER_DIM_BUF) {
6372       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6373       Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6374                                        ACO_DESC_BUFFER, nullptr, true);
6375       // assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
6377       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(
6378          is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6379       mubuf->operands[0] = Operand(resource);
6380       mubuf->operands[1] = Operand(vindex);
6381       mubuf->operands[2] = Operand::c32(0);
6382       mubuf->operands[3] = Operand(data);
6383       if (return_previous)
6384          mubuf->definitions[0] = Definition(dst);
6385       mubuf->offset = 0;
6386       mubuf->idxen = true;
6387       mubuf->glc = return_previous;
6388       mubuf->dlc = false; /* Not needed for atomics */
6389       mubuf->disable_wqm = true;
6390       mubuf->sync = sync;
6391       ctx->program->needs_exact = true;
6392       ctx->block->instructions.emplace_back(std::move(mubuf));
6393       return;
6394    }
6395 
6396    std::vector<Temp> coords = get_image_coords(ctx, instr);
6397    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6398                                     ACO_DESC_IMAGE, nullptr, true);
6399    Definition def = return_previous ? Definition(dst) : Definition();
6400    MIMG_instruction* mimg =
6401       emit_mimg(bld, image_op, def, resource, Operand(s4), coords, 0, Operand(data));
6402    mimg->glc = return_previous;
6403    mimg->dlc = false; /* Not needed for atomics */
6404    mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6405    mimg->dmask = (1 << data.size()) - 1;
6406    mimg->unrm = true;
6407    mimg->da = should_declare_array(ctx, dim, is_array);
6408    mimg->disable_wqm = true;
6409    mimg->sync = sync;
6410    ctx->program->needs_exact = true;
6412 }
6413 
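/* Returns the element count of a buffer resource. On GFX8, dword 2 of the
 * descriptor holds the size in bytes, so it is divided by the stride (a power
 * of two, or 12); other generations store the element count directly.
 */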
6414 void
6415 get_buffer_size(isel_context* ctx, Temp desc, Temp dst)
6416 {
6417    if (ctx->options->chip_class == GFX8) {
6418       /* we only have to divide by 1, 2, 4, 8, 12 or 16 */
6419       Builder bld(ctx->program, ctx->block);
6420 
6421       Temp size = emit_extract_vector(ctx, desc, 2, s1);
6422 
6423       Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1),
6424                                 bld.copy(bld.def(v1), Operand::c32(0xaaaaaaabu)), size);
6425       size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
6426                            bld.as_uniform(size_div3), Operand::c32(1u));
6427 
6428       Temp stride = emit_extract_vector(ctx, desc, 1, s1);
6429       stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride,
6430                         Operand::c32((5u << 16) | 16u));
6431 
6432       Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand::c32(12u));
6433       size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12));
6434 
6435       Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst;
6436       bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc), size,
6437                bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride));
6438       if (dst.type() == RegType::vgpr)
6439          bld.copy(Definition(dst), shr_dst);
6440 
6441       /* TODO: we can probably calculate this faster with v_skip when stride != 12 */
6442    } else {
6443       emit_extract_vector(ctx, desc, 2, dst);
6444    }
6445 }
6446 
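/* image_size: buffer images read the element count from the descriptor, all
 * other images issue image_get_resinfo at LOD 0. On GFX9, 1D arrays are laid
 * out as 2D arrays, so dmask 0x5 skips the padded height component.
 */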
6447 void
6448 visit_image_size(isel_context* ctx, nir_intrinsic_instr* instr)
6449 {
6450    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6451    bool is_array = nir_intrinsic_image_array(instr);
6452    Builder bld(ctx->program, ctx->block);
6453 
6454    if (dim == GLSL_SAMPLER_DIM_BUF) {
6455       Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6456                                    ACO_DESC_BUFFER, NULL, false);
6457       return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa));
6458    }
6459 
6460    /* LOD */
6461    assert(nir_src_as_uint(instr->src[1]) == 0);
6462    std::vector<Temp> lod{bld.copy(bld.def(v1), Operand::zero())};
6463 
6464    /* Resource */
6465    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6466                                     ACO_DESC_IMAGE, NULL, false);
6467 
6468    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6469 
6470    MIMG_instruction* mimg =
6471       emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(dst), resource, Operand(s4), lod);
6472    uint8_t& dmask = mimg->dmask;
6473    mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6474    mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
6475    mimg->da = is_array;
6476 
6477    if (ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) {
6478       assert(instr->dest.ssa.num_components == 2);
6479       dmask = 0x5;
6480    }
6481 
6482    emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
6483 }
6484 
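/* Extracts the sample count from dword 3 of the image descriptor, where it is
 * stored as log2 at bits 16..19. Non-MSAA resource types (type < 14) report a
 * single sample, or zero for null descriptors when robustBufferAccess is
 * enabled.
 */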
6485 void
6486 get_image_samples(isel_context* ctx, Definition dst, Temp resource)
6487 {
6488    Builder bld(ctx->program, ctx->block);
6489 
6490    Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
6491    Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3,
6492                                 Operand::c32(16u | 4u << 16));
6493    Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand::c32(1u),
6494                            samples_log2);
6495    Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3,
6496                         Operand::c32(28u | 4u << 16 /* offset=28, width=4 */));
6497 
6498    Operand default_sample = Operand::c32(1u);
6499    if (ctx->options->robust_buffer_access) {
6500       /* Extract the second dword of the descriptor, if it's
6501        * all zero, then it's a null descriptor.
6502        */
6503       Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
6504       Temp is_non_null_descriptor =
6505          bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand::zero());
6506       default_sample = Operand(is_non_null_descriptor);
6507    }
6508 
6509    Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand::c32(14u));
6510    bld.sop2(aco_opcode::s_cselect_b32, dst, samples, default_sample, bld.scc(is_msaa));
6511 }
6512 
6513 void
6514 visit_image_samples(isel_context* ctx, nir_intrinsic_instr* instr)
6515 {
6516    Builder bld(ctx->program, ctx->block);
6517    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6518    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6519                                     ACO_DESC_IMAGE, NULL, false);
6520    get_image_samples(ctx, Definition(dst), resource);
6521 }
6522 
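/* SSBO loads: reorderable (read-only) loads with a uniform result may be
 * emitted as SMEM loads; everything else goes through VMEM in load_buffer().
 */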
6523 void
6524 visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6525 {
6526    Builder bld(ctx->program, ctx->block);
6527    unsigned num_components = instr->num_components;
6528 
6529    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6530    Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6531 
6532    unsigned access = nir_intrinsic_access(instr);
6533    bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6534    unsigned size = instr->dest.ssa.bit_size / 8;
6535 
6536    bool allow_smem = access & ACCESS_CAN_REORDER;
6537 
6538    load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6539                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
6540                get_memory_sync_info(instr, storage_buffer, 0));
6541 }
6542 
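/* SSBO stores: the write mask is split into chunks of at most 16 bytes, each
 * emitted as a separate MUBUF store.
 */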
6543 void
6544 visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6545 {
6546    Builder bld(ctx->program, ctx->block);
6547    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6548    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6549    unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6550    Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6551 
6552    Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6553 
6554    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6555    bool glc =
6556       nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6557 
6558    unsigned write_count = 0;
6559    Temp write_datas[32];
6560    unsigned offsets[32];
6561    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6562                       write_datas, offsets);
6563 
6564    /* GFX6-7 are affected by a hw bug that prevents address clamping from
6565     * working correctly when the SGPR offset is used.
6566     */
6567    if (offset.type() == RegType::sgpr && ctx->options->chip_class < GFX8)
6568       offset = as_vgpr(ctx, offset);
6569 
6570    for (unsigned i = 0; i < write_count; i++) {
6571       aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6572 
6573       aco_ptr<MUBUF_instruction> store{
6574          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6575       store->operands[0] = Operand(rsrc);
6576       store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6577       store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6578       store->operands[3] = Operand(write_datas[i]);
6579       store->offset = offsets[i];
6580       store->offen = (offset.type() == RegType::vgpr);
6581       store->glc = glc;
6582       store->dlc = false;
6583       store->disable_wqm = true;
6584       store->sync = sync;
6585       ctx->program->needs_exact = true;
6586       ctx->block->instructions.emplace_back(std::move(store));
6587    }
6588 }
6589 
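/* Maps nir_intrinsic_ssbo_atomic_* to MUBUF buffer atomics; comp_swap packs
 * the compare value and the data into a single vector operand.
 */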
6590 void
6591 visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6592 {
6593    Builder bld(ctx->program, ctx->block);
6594    bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6595    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6596 
6597    if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
6598       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6599                         get_ssa_temp(ctx, instr->src[3].ssa), data);
6600 
6601    Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6602    Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6603 
6604    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6605 
6606    aco_opcode op32, op64;
6607    switch (instr->intrinsic) {
6608    case nir_intrinsic_ssbo_atomic_add:
6609       op32 = aco_opcode::buffer_atomic_add;
6610       op64 = aco_opcode::buffer_atomic_add_x2;
6611       break;
6612    case nir_intrinsic_ssbo_atomic_imin:
6613       op32 = aco_opcode::buffer_atomic_smin;
6614       op64 = aco_opcode::buffer_atomic_smin_x2;
6615       break;
6616    case nir_intrinsic_ssbo_atomic_umin:
6617       op32 = aco_opcode::buffer_atomic_umin;
6618       op64 = aco_opcode::buffer_atomic_umin_x2;
6619       break;
6620    case nir_intrinsic_ssbo_atomic_imax:
6621       op32 = aco_opcode::buffer_atomic_smax;
6622       op64 = aco_opcode::buffer_atomic_smax_x2;
6623       break;
6624    case nir_intrinsic_ssbo_atomic_umax:
6625       op32 = aco_opcode::buffer_atomic_umax;
6626       op64 = aco_opcode::buffer_atomic_umax_x2;
6627       break;
6628    case nir_intrinsic_ssbo_atomic_and:
6629       op32 = aco_opcode::buffer_atomic_and;
6630       op64 = aco_opcode::buffer_atomic_and_x2;
6631       break;
6632    case nir_intrinsic_ssbo_atomic_or:
6633       op32 = aco_opcode::buffer_atomic_or;
6634       op64 = aco_opcode::buffer_atomic_or_x2;
6635       break;
6636    case nir_intrinsic_ssbo_atomic_xor:
6637       op32 = aco_opcode::buffer_atomic_xor;
6638       op64 = aco_opcode::buffer_atomic_xor_x2;
6639       break;
6640    case nir_intrinsic_ssbo_atomic_exchange:
6641       op32 = aco_opcode::buffer_atomic_swap;
6642       op64 = aco_opcode::buffer_atomic_swap_x2;
6643       break;
6644    case nir_intrinsic_ssbo_atomic_comp_swap:
6645       op32 = aco_opcode::buffer_atomic_cmpswap;
6646       op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6647       break;
6648    case nir_intrinsic_ssbo_atomic_fmin:
6649       op32 = aco_opcode::buffer_atomic_fmin;
6650       op64 = aco_opcode::buffer_atomic_fmin_x2;
6651       break;
6652    case nir_intrinsic_ssbo_atomic_fmax:
6653       op32 = aco_opcode::buffer_atomic_fmax;
6654       op64 = aco_opcode::buffer_atomic_fmax_x2;
6655       break;
6656    default:
6657       unreachable(
6658          "visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
6659    }
6660    aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6661    aco_ptr<MUBUF_instruction> mubuf{
6662       create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6663    mubuf->operands[0] = Operand(rsrc);
6664    mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6665    mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6666    mubuf->operands[3] = Operand(data);
6667    if (return_previous)
6668       mubuf->definitions[0] = Definition(dst);
6669    mubuf->offset = 0;
6670    mubuf->offen = (offset.type() == RegType::vgpr);
6671    mubuf->glc = return_previous;
6672    mubuf->dlc = false; /* Not needed for atomics */
6673    mubuf->disable_wqm = true;
6674    mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6675    ctx->program->needs_exact = true;
6676    ctx->block->instructions.emplace_back(std::move(mubuf));
6677 }
6678 
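/* get_ssbo_size: with a uniform resource, the size is read from dword 2 of
 * the buffer descriptor. With a divergent resource index, the descriptor
 * can't be built as a whole, so the size dword is loaded directly from
 * descriptor memory (at offset 8 within the descriptor).
 */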
6679 void
6680 visit_get_ssbo_size(isel_context* ctx, nir_intrinsic_instr* instr)
6681 {
6683    Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
6684    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6685    bool non_uniform = dst.type() == RegType::vgpr;
6686 
6687    Builder bld(ctx->program, ctx->block);
6688    if (non_uniform) {
6689       Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1));
6690       Temp binding = emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1));
6691       Temp index = bld.vadd32(bld.def(v1), set_ptr, binding);
6692       index = convert_pointer_to_64_bit(ctx, index, non_uniform);
6693 
6694       LoadEmitInfo info = {Operand(index), dst, 1, 4};
6695       info.align_mul = 4;
6696       info.const_offset = 8;
6697       emit_load(ctx, bld, info, global_load_params);
6698    } else {
6699       emit_extract_vector(ctx, load_buffer_rsrc(ctx, rsrc), 2, dst);
6700    }
6701 }
6702 
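/* Global loads use SMEM only when the address is uniform and the data is
 * known to be read-only (VMEM stores don't update the scalar cache);
 * otherwise they are emitted as FLAT/GLOBAL VMEM loads.
 */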
6703 void
6704 visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
6705 {
6706    Builder bld(ctx->program, ctx->block);
6707    unsigned num_components = instr->num_components;
6708    unsigned component_size = instr->dest.ssa.bit_size / 8;
6709 
6710    LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)),
6711                         get_ssa_temp(ctx, &instr->dest.ssa), num_components, component_size};
6712    info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
6713    info.align_mul = nir_intrinsic_align_mul(instr);
6714    info.align_offset = nir_intrinsic_align_offset(instr);
6715    info.sync = get_memory_sync_info(instr, storage_buffer, 0);
6716    /* VMEM stores don't update the SMEM cache and it's difficult to prove that
6717     * it's safe to use SMEM */
6718    bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
6719    if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) ||
6720        !can_use_smem) {
6721       emit_load(ctx, bld, info, global_load_params);
6722    } else {
6723       info.offset = Operand(bld.as_uniform(info.offset));
6724       emit_load(ctx, bld, info, smem_load_params);
6725    }
6726 }
6727 
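/* Global stores: GFX7+ use FLAT/GLOBAL instructions; immediate offsets are
 * only supported on GFX9+, so older chips add the offset to the address
 * instead. GFX6 lacks FLAT entirely and falls back to MUBUF with an addr64
 * resource.
 */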
6728 void
6729 visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
6730 {
6731    Builder bld(ctx->program, ctx->block);
6732    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6733    unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6734 
6735    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6736    Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
6737    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6738    bool glc =
6739       nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6740 
6741    if (ctx->options->chip_class >= GFX7)
6742       addr = as_vgpr(ctx, addr);
6743 
6744    unsigned write_count = 0;
6745    Temp write_datas[32];
6746    unsigned offsets[32];
6747    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6748                       write_datas, offsets);
6749 
6750    for (unsigned i = 0; i < write_count; i++) {
6751       if (ctx->options->chip_class >= GFX7) {
6752          unsigned offset = offsets[i];
6753          Temp store_addr = addr;
6754          if (offset > 0 && ctx->options->chip_class < GFX9) {
6755             Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
6756             Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
6757             Temp carry = bld.tmp(bld.lm);
6758             bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
6759 
6760             bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0),
6761                      bld.hint_vcc(Definition(carry)), Operand::c32(offset), addr0);
6762             bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
6763                      Operand::zero(), addr1, carry)
6764                .def(1)
6765                .setHint(vcc);
6766 
6767             store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
6768 
6769             offset = 0;
6770          }
6771 
6772          bool global = ctx->options->chip_class >= GFX9;
6773          aco_opcode op;
6774          switch (write_datas[i].bytes()) {
6775          case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break;
6776          case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break;
6777          case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break;
6778          case 8:
6779             op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6780             break;
6781          case 12:
6782             op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6783             break;
6784          case 16:
6785             op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6786             break;
6787          default: unreachable("store_global not implemented for this size.");
6788          }
6789 
6790          aco_ptr<FLAT_instruction> flat{
6791             create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6792          flat->operands[0] = Operand(store_addr);
6793          flat->operands[1] = Operand(s1);
6794          flat->operands[2] = Operand(write_datas[i]);
6795          flat->glc = glc;
6796          flat->dlc = false;
6797          flat->offset = offset;
6798          flat->disable_wqm = true;
6799          flat->sync = sync;
6800          ctx->program->needs_exact = true;
6801          ctx->block->instructions.emplace_back(std::move(flat));
6802       } else {
6803          assert(ctx->options->chip_class == GFX6);
6804 
6805          aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6806 
6807          Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6808 
6809          aco_ptr<MUBUF_instruction> mubuf{
6810             create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6811          mubuf->operands[0] = Operand(rsrc);
6812          mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6813          mubuf->operands[2] = Operand::zero();
6814          mubuf->operands[3] = Operand(write_datas[i]);
6815          mubuf->glc = glc;
6816          mubuf->dlc = false;
6817          mubuf->offset = offsets[i];
6818          mubuf->addr64 = addr.type() == RegType::vgpr;
6819          mubuf->disable_wqm = true;
6820          mubuf->sync = sync;
6821          ctx->program->needs_exact = true;
6822          ctx->block->instructions.emplace_back(std::move(mubuf));
6823       }
6824    }
6825 }
6826 
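/* Global atomics follow the same split as global stores: FLAT/GLOBAL on
 * GFX7+, MUBUF with an addr64 resource on GFX6.
 */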
6827 void
6828 visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6829 {
6830    Builder bld(ctx->program, ctx->block);
6831    bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6832    Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
6833    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6834 
6835    if (ctx->options->chip_class >= GFX7)
6836       addr = as_vgpr(ctx, addr);
6837 
6838    if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)
6839       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6840                         get_ssa_temp(ctx, instr->src[2].ssa), data);
6841 
6842    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6843 
6844    aco_opcode op32, op64;
6845 
6846    if (ctx->options->chip_class >= GFX7) {
6847       bool global = ctx->options->chip_class >= GFX9;
6848       switch (instr->intrinsic) {
6849       case nir_intrinsic_global_atomic_add:
6850          op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6851          op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6852          break;
6853       case nir_intrinsic_global_atomic_imin:
6854          op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6855          op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6856          break;
6857       case nir_intrinsic_global_atomic_umin:
6858          op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6859          op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6860          break;
6861       case nir_intrinsic_global_atomic_imax:
6862          op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6863          op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6864          break;
6865       case nir_intrinsic_global_atomic_umax:
6866          op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6867          op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6868          break;
6869       case nir_intrinsic_global_atomic_and:
6870          op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6871          op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6872          break;
6873       case nir_intrinsic_global_atomic_or:
6874          op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6875          op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
6876          break;
6877       case nir_intrinsic_global_atomic_xor:
6878          op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
6879          op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
6880          break;
6881       case nir_intrinsic_global_atomic_exchange:
6882          op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
6883          op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
6884          break;
6885       case nir_intrinsic_global_atomic_comp_swap:
6886          op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
6887          op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
6888          break;
6889       case nir_intrinsic_global_atomic_fmin:
6890          op32 = global ? aco_opcode::global_atomic_fmin : aco_opcode::flat_atomic_fmin;
6891          op64 = global ? aco_opcode::global_atomic_fmin_x2 : aco_opcode::flat_atomic_fmin_x2;
6892          break;
6893       case nir_intrinsic_global_atomic_fmax:
6894          op32 = global ? aco_opcode::global_atomic_fmax : aco_opcode::flat_atomic_fmax;
6895          op64 = global ? aco_opcode::global_atomic_fmax_x2 : aco_opcode::flat_atomic_fmax_x2;
6896          break;
6897       default:
6898          unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* "
6899                      "instructions.");
6900       }
6901 
6902       aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6903       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(
6904          op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
6905       flat->operands[0] = Operand(addr);
6906       flat->operands[1] = Operand(s1);
6907       flat->operands[2] = Operand(data);
6908       if (return_previous)
6909          flat->definitions[0] = Definition(dst);
6910       flat->glc = return_previous;
6911       flat->dlc = false; /* Not needed for atomics */
6912       flat->offset = 0;
6913       flat->disable_wqm = true;
6914       flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6915       ctx->program->needs_exact = true;
6916       ctx->block->instructions.emplace_back(std::move(flat));
6917    } else {
6918       assert(ctx->options->chip_class == GFX6);
6919 
6920       switch (instr->intrinsic) {
6921       case nir_intrinsic_global_atomic_add:
6922          op32 = aco_opcode::buffer_atomic_add;
6923          op64 = aco_opcode::buffer_atomic_add_x2;
6924          break;
6925       case nir_intrinsic_global_atomic_imin:
6926          op32 = aco_opcode::buffer_atomic_smin;
6927          op64 = aco_opcode::buffer_atomic_smin_x2;
6928          break;
6929       case nir_intrinsic_global_atomic_umin:
6930          op32 = aco_opcode::buffer_atomic_umin;
6931          op64 = aco_opcode::buffer_atomic_umin_x2;
6932          break;
6933       case nir_intrinsic_global_atomic_imax:
6934          op32 = aco_opcode::buffer_atomic_smax;
6935          op64 = aco_opcode::buffer_atomic_smax_x2;
6936          break;
6937       case nir_intrinsic_global_atomic_umax:
6938          op32 = aco_opcode::buffer_atomic_umax;
6939          op64 = aco_opcode::buffer_atomic_umax_x2;
6940          break;
6941       case nir_intrinsic_global_atomic_and:
6942          op32 = aco_opcode::buffer_atomic_and;
6943          op64 = aco_opcode::buffer_atomic_and_x2;
6944          break;
6945       case nir_intrinsic_global_atomic_or:
6946          op32 = aco_opcode::buffer_atomic_or;
6947          op64 = aco_opcode::buffer_atomic_or_x2;
6948          break;
6949       case nir_intrinsic_global_atomic_xor:
6950          op32 = aco_opcode::buffer_atomic_xor;
6951          op64 = aco_opcode::buffer_atomic_xor_x2;
6952          break;
6953       case nir_intrinsic_global_atomic_exchange:
6954          op32 = aco_opcode::buffer_atomic_swap;
6955          op64 = aco_opcode::buffer_atomic_swap_x2;
6956          break;
6957       case nir_intrinsic_global_atomic_comp_swap:
6958          op32 = aco_opcode::buffer_atomic_cmpswap;
6959          op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6960          break;
6961       case nir_intrinsic_global_atomic_fmin:
6962          op32 = aco_opcode::buffer_atomic_fmin;
6963          op64 = aco_opcode::buffer_atomic_fmin_x2;
6964          break;
6965       case nir_intrinsic_global_atomic_fmax:
6966          op32 = aco_opcode::buffer_atomic_fmax;
6967          op64 = aco_opcode::buffer_atomic_fmax_x2;
6968          break;
6969       default:
6970          unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* "
6971                      "instructions.");
6972       }
6973 
6974       Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6975 
6976       aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6977 
6978       aco_ptr<MUBUF_instruction> mubuf{
6979          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6980       mubuf->operands[0] = Operand(rsrc);
6981       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6982       mubuf->operands[2] = Operand::zero();
6983       mubuf->operands[3] = Operand(data);
6984       if (return_previous)
6985          mubuf->definitions[0] = Definition(dst);
6986       mubuf->glc = return_previous;
6987       mubuf->dlc = false;
6988       mubuf->offset = 0;
6989       mubuf->addr64 = addr.type() == RegType::vgpr;
6990       mubuf->disable_wqm = true;
6991       mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6992       ctx->program->needs_exact = true;
6993       ctx->block->instructions.emplace_back(std::move(mubuf));
6994    }
6995 }
6996 
6997 void
6998 visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
6999 {
7000    Builder bld(ctx->program, ctx->block);
7001 
7002    Temp dst = get_ssa_temp(ctx, &intrin->dest.ssa);
7003    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
7004    Temp v_offset = as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
7005    Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
7006 
7007    bool swizzled = nir_intrinsic_is_swizzled(intrin);
7008    bool reorder = nir_intrinsic_can_reorder(intrin);
7009    bool slc = nir_intrinsic_slc_amd(intrin);
7010 
7011    unsigned const_offset = nir_intrinsic_base(intrin);
7012    unsigned elem_size_bytes = intrin->dest.ssa.bit_size / 8u;
7013    unsigned num_components = intrin->dest.ssa.num_components;
7014    unsigned swizzle_element_size = swizzled ? (ctx->program->chip_class <= GFX8 ? 4 : 16) : 0;
7015 
7016    load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
7017                    num_components, swizzle_element_size, !swizzled, reorder, slc);
7018 }
7019 
7020 void
7021 visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7022 {
7023    Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
7024    Temp descriptor = get_ssa_temp(ctx, intrin->src[1].ssa);
7025    Temp v_offset = get_ssa_temp(ctx, intrin->src[2].ssa);
7026    Temp s_offset = get_ssa_temp(ctx, intrin->src[3].ssa);
7027 
7028    bool swizzled = nir_intrinsic_is_swizzled(intrin);
7029    bool slc = nir_intrinsic_slc_amd(intrin);
7030 
7031    unsigned const_offset = nir_intrinsic_base(intrin);
7032    unsigned write_mask = nir_intrinsic_write_mask(intrin);
7033    unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;
7034 
7035    nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
7036    memory_sync_info sync(mem_mode == nir_var_shader_out ? storage_vmem_output : storage_none);
7037 
7038    store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
7039                     write_mask, !swizzled, sync, slc);
7040 }
7041 
7042 sync_scope
7043 translate_nir_scope(nir_scope scope)
7044 {
7045    switch (scope) {
7046    case NIR_SCOPE_NONE:
7047    case NIR_SCOPE_INVOCATION: return scope_invocation;
7048    case NIR_SCOPE_SUBGROUP: return scope_subgroup;
7049    case NIR_SCOPE_WORKGROUP: return scope_workgroup;
7050    case NIR_SCOPE_QUEUE_FAMILY: return scope_queuefamily;
7051    case NIR_SCOPE_DEVICE: return scope_device;
7052    case NIR_SCOPE_SHADER_CALL: return scope_invocation;
7053    }
7054    unreachable("invalid scope");
7055 }
7056 
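/* Translates a NIR scoped barrier into p_barrier with the matching storage
 * classes, acquire/release semantics and scopes. Shared storage only matters
 * for stages that actually use LDS (see the list below).
 */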
7057 void
7058 emit_scoped_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
7059 {
7060    Builder bld(ctx->program, ctx->block);
7061 
7062    unsigned semantics = 0;
7063    unsigned storage = 0;
7064    sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
7065    sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
7066 
7067    /* We use shared storage for the following:
7068     * - compute shaders expose it in their API
7069     * - when tessellation is used, TCS and VS I/O is lowered to shared memory
7070     * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory
7071     * - additionally, when NGG is used on GFX10+, shared memory is used for certain features
7072     */
7073    bool shared_storage_used = ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::LS ||
7074                               ctx->stage.hw == HWStage::HS ||
7075                               (ctx->stage.hw == HWStage::GS && ctx->program->chip_class >= GFX9) ||
7076                               ctx->stage.hw == HWStage::NGG;
7077 
7078    /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half.
7079     * They are allowed in CS, TCS, and in any NGG shader.
7080     */
7081    ASSERTED bool workgroup_scope_allowed =
7082       ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::HS || ctx->stage.hw == HWStage::NGG;
7083 
7084    unsigned nir_storage = nir_intrinsic_memory_modes(instr);
7085    if (nir_storage & (nir_var_mem_ssbo | nir_var_mem_global))
7086       storage |= storage_buffer | storage_image; // TODO: split this when NIR gets nir_var_mem_image
7087    if (shared_storage_used && (nir_storage & nir_var_mem_shared))
7088       storage |= storage_shared;
7089 
7090    unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
7091    if (nir_semantics & NIR_MEMORY_ACQUIRE)
7092       semantics |= semantic_acquire | semantic_release;
7093    if (nir_semantics & NIR_MEMORY_RELEASE)
7094       semantics |= semantic_acquire | semantic_release;
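   /* Note: NIR acquire and release semantics are both widened to a full
    * acquire-release pair above; that is stronger than strictly necessary,
    * but always correct. */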
7095 
7096    assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
7097    assert(exec_scope != scope_workgroup || workgroup_scope_allowed);
7098 
7099    bld.barrier(aco_opcode::p_barrier,
7100                memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
7101                exec_scope);
7102 }
7103 
7104 void
7105 visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7106 {
7107    // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
7108    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7109    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7110    Builder bld(ctx->program, ctx->block);
7111 
7112    unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
7113    unsigned num_components = instr->dest.ssa.num_components;
7114    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7115    load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
7116 }
7117 
7118 void
7119 visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7120 {
7121    unsigned writemask = nir_intrinsic_write_mask(instr);
7122    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7123    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7124    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7125 
7126    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7127    store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
7128 }
7129 
7130 void
7131 visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7132 {
7133    unsigned offset = nir_intrinsic_base(instr);
7134    Builder bld(ctx->program, ctx->block);
7135    Operand m = load_lds_size_m0(bld);
7136    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7137    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7138 
7139    unsigned num_operands = 3;
7140    aco_opcode op32, op64, op32_rtn, op64_rtn;
7141    switch (instr->intrinsic) {
7142    case nir_intrinsic_shared_atomic_add:
7143       op32 = aco_opcode::ds_add_u32;
7144       op64 = aco_opcode::ds_add_u64;
7145       op32_rtn = aco_opcode::ds_add_rtn_u32;
7146       op64_rtn = aco_opcode::ds_add_rtn_u64;
7147       break;
7148    case nir_intrinsic_shared_atomic_imin:
7149       op32 = aco_opcode::ds_min_i32;
7150       op64 = aco_opcode::ds_min_i64;
7151       op32_rtn = aco_opcode::ds_min_rtn_i32;
7152       op64_rtn = aco_opcode::ds_min_rtn_i64;
7153       break;
7154    case nir_intrinsic_shared_atomic_umin:
7155       op32 = aco_opcode::ds_min_u32;
7156       op64 = aco_opcode::ds_min_u64;
7157       op32_rtn = aco_opcode::ds_min_rtn_u32;
7158       op64_rtn = aco_opcode::ds_min_rtn_u64;
7159       break;
7160    case nir_intrinsic_shared_atomic_imax:
7161       op32 = aco_opcode::ds_max_i32;
7162       op64 = aco_opcode::ds_max_i64;
7163       op32_rtn = aco_opcode::ds_max_rtn_i32;
7164       op64_rtn = aco_opcode::ds_max_rtn_i64;
7165       break;
7166    case nir_intrinsic_shared_atomic_umax:
7167       op32 = aco_opcode::ds_max_u32;
7168       op64 = aco_opcode::ds_max_u64;
7169       op32_rtn = aco_opcode::ds_max_rtn_u32;
7170       op64_rtn = aco_opcode::ds_max_rtn_u64;
7171       break;
7172    case nir_intrinsic_shared_atomic_and:
7173       op32 = aco_opcode::ds_and_b32;
7174       op64 = aco_opcode::ds_and_b64;
7175       op32_rtn = aco_opcode::ds_and_rtn_b32;
7176       op64_rtn = aco_opcode::ds_and_rtn_b64;
7177       break;
7178    case nir_intrinsic_shared_atomic_or:
7179       op32 = aco_opcode::ds_or_b32;
7180       op64 = aco_opcode::ds_or_b64;
7181       op32_rtn = aco_opcode::ds_or_rtn_b32;
7182       op64_rtn = aco_opcode::ds_or_rtn_b64;
7183       break;
7184    case nir_intrinsic_shared_atomic_xor:
7185       op32 = aco_opcode::ds_xor_b32;
7186       op64 = aco_opcode::ds_xor_b64;
7187       op32_rtn = aco_opcode::ds_xor_rtn_b32;
7188       op64_rtn = aco_opcode::ds_xor_rtn_b64;
7189       break;
7190    case nir_intrinsic_shared_atomic_exchange:
7191       op32 = aco_opcode::ds_write_b32;
7192       op64 = aco_opcode::ds_write_b64;
7193       op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
7194       op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
7195       break;
7196    case nir_intrinsic_shared_atomic_comp_swap:
7197       op32 = aco_opcode::ds_cmpst_b32;
7198       op64 = aco_opcode::ds_cmpst_b64;
7199       op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
7200       op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
7201       num_operands = 4;
7202       break;
7203    case nir_intrinsic_shared_atomic_fadd:
7204       op32 = aco_opcode::ds_add_f32;
7205       op32_rtn = aco_opcode::ds_add_rtn_f32;
7206       op64 = aco_opcode::num_opcodes;
7207       op64_rtn = aco_opcode::num_opcodes;
7208       break;
7209    case nir_intrinsic_shared_atomic_fmin:
7210       op32 = aco_opcode::ds_min_f32;
7211       op32_rtn = aco_opcode::ds_min_rtn_f32;
7212       op64 = aco_opcode::ds_min_f64;
7213       op64_rtn = aco_opcode::ds_min_rtn_f64;
7214       break;
7215    case nir_intrinsic_shared_atomic_fmax:
7216       op32 = aco_opcode::ds_max_f32;
7217       op32_rtn = aco_opcode::ds_max_rtn_f32;
7218       op64 = aco_opcode::ds_max_f64;
7219       op64_rtn = aco_opcode::ds_max_rtn_f64;
7220       break;
7221    default: unreachable("Unhandled shared atomic intrinsic");
7222    }
7223 
7224    bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
7225 
7226    aco_opcode op;
7227    if (data.size() == 1) {
7228       assert(instr->dest.ssa.bit_size == 32);
7229       op = return_previous ? op32_rtn : op32;
7230    } else {
7231       assert(instr->dest.ssa.bit_size == 64);
7232       op = return_previous ? op64_rtn : op64;
7233    }
7234 
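   /* The DS immediate offset is an unsigned 16-bit field (hence the 65535
    * check below); fold anything larger into the address VGPR instead. */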
7235    if (offset > 65535) {
7236       address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);
7237       offset = 0;
7238    }
7239 
7240    aco_ptr<DS_instruction> ds;
7241    ds.reset(
7242       create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
7243    ds->operands[0] = Operand(address);
7244    ds->operands[1] = Operand(data);
7245    if (num_operands == 4) {
7246       Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
7247       ds->operands[2] = Operand(data2);
7248    }
7249    ds->operands[num_operands - 1] = m;
7250    ds->offset0 = offset;
7251    if (return_previous)
7252       ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
7253    ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
7254 
7255    if (m.isUndefined())
7256       ds->operands.pop_back();
7257 
7258    ctx->block->instructions.emplace_back(std::move(ds));
7259 }
7260 
7261 Temp
7262 get_scratch_resource(isel_context* ctx)
7263 {
7264    Builder bld(ctx->program, ctx->block);
7265    Temp scratch_addr = ctx->program->private_segment_buffer;
7266    if (ctx->stage != compute_cs)
7267       scratch_addr =
7268          bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());
7269 
7270    uint32_t rsrc_conf =
7271       S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
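   /* ADD_TID_ENABLE folds the lane id into the swizzled address calculation;
    * the index stride (encoding 3 = 64 elements, 2 = 32 elements) is chosen to
    * match the wave size so that each lane addresses its own scratch slot. */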
7272 
7273    if (ctx->program->chip_class >= GFX10) {
7274       rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
7275                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
7276    } else if (ctx->program->chip_class <=
7277               GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
7278       rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
7279                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
7280    }
7281 
7282    /* Older generations need an element size of 4 bytes (encoding 1); the element size field was removed in GFX9. */
7283    if (ctx->program->chip_class <= GFX8)
7284       rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
7285 
7286    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(-1u),
7287                      Operand::c32(rsrc_conf));
7288 }
7289 
7290 void
7291 visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7292 {
7293    Builder bld(ctx->program, ctx->block);
7294    Temp rsrc = get_scratch_resource(ctx);
7295    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7296    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7297 
7298    LoadEmitInfo info = {Operand(offset), dst, instr->dest.ssa.num_components,
7299                         instr->dest.ssa.bit_size / 8u, rsrc};
7300    info.align_mul = nir_intrinsic_align_mul(instr);
7301    info.align_offset = nir_intrinsic_align_offset(instr);
7302    info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
7303    info.sync = memory_sync_info(storage_scratch, semantic_private);
7304    info.soffset = ctx->program->scratch_offset;
7305    emit_load(ctx, bld, info, scratch_load_params);
7306 }
7307 
7308 void
7309 visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7310 {
7311    Builder bld(ctx->program, ctx->block);
7312    Temp rsrc = get_scratch_resource(ctx);
7313    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7314    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7315 
7316    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7317    unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
7318 
7319    unsigned write_count = 0;
7320    Temp write_datas[32];
7321    unsigned offsets[32];
7322    unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16;
7323    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
7324                       &write_count, write_datas, offsets);
7325 
7326    for (unsigned i = 0; i < write_count; i++) {
7327       aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
7328       Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i],
7329                                      offsets[i], true, true);
7330       mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
7331    }
7332 }
7333 
7334 void
7335 visit_load_sample_mask_in(isel_context* ctx, nir_intrinsic_instr* instr)
7336 {
7337    uint8_t log2_ps_iter_samples;
7338    if (ctx->program->info->ps.uses_sample_shading) {
7339       log2_ps_iter_samples = util_logbase2(ctx->options->key.ps.num_samples);
7340    } else {
7341       log2_ps_iter_samples = ctx->options->key.ps.log2_ps_iter_samples;
7342    }
7343 
7344    Builder bld(ctx->program, ctx->block);
7345 
7346    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7347 
7348    if (log2_ps_iter_samples) {
7349       /* gl_SampleMaskIn[0] = (SampleCoverage & (1 << gl_SampleID)). */
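      /* The sample ID lives in bits [11:8] of the ancillary VGPR, hence the
       * v_bfe with offset 8 and width 4 below. */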
7350       Temp sample_id =
7351          bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
7352                   Operand::c32(8u), Operand::c32(4u));
7353       Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id,
7354                            bld.copy(bld.def(v1), Operand::c32(1u)));
7355       bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask,
7356                get_arg(ctx, ctx->args->ac.sample_coverage));
7357    } else {
7358       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.sample_coverage));
7359    }
7360 }
7361 
7362 void
7363 visit_emit_vertex_with_counter(isel_context* ctx, nir_intrinsic_instr* instr)
7364 {
7365    Builder bld(ctx->program, ctx->block);
7366 
7367    unsigned stream = nir_intrinsic_stream_id(instr);
7368    Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7369    next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
7370    nir_const_value* next_vertex_cv = nir_src_as_const_value(instr->src[0]);
7371 
7372    /* get GSVS ring */
7373    Temp gsvs_ring =
7374       bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer,
7375                Operand::c32(RING_GSVS_GS * 16u));
7376 
7377    unsigned num_components = ctx->program->info->gs.num_stream_output_components[stream];
7378 
7379    unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
7380    unsigned stream_offset = 0;
7381    for (unsigned i = 0; i < stream; i++) {
7382       unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] *
7383                              ctx->shader->info.gs.vertices_out;
7384       stream_offset += prev_stride * ctx->program->wave_size;
7385    }
7386 
7387    /* Limit on the stride field for <= GFX7. */
7388    assert(stride < (1 << 14));
7389 
7390    Temp gsvs_dwords[4];
7391    for (unsigned i = 0; i < 4; i++)
7392       gsvs_dwords[i] = bld.tmp(s1);
7393    bld.pseudo(aco_opcode::p_split_vector, Definition(gsvs_dwords[0]), Definition(gsvs_dwords[1]),
7394               Definition(gsvs_dwords[2]), Definition(gsvs_dwords[3]), gsvs_ring);
7395 
7396    if (stream_offset) {
7397       Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand::c32(stream_offset));
7398 
7399       Temp carry = bld.tmp(s1);
7400       gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)),
7401                                 gsvs_dwords[0], stream_offset_tmp);
7402       gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc),
7403                                 gsvs_dwords[1], Operand::zero(), bld.scc(carry));
7404    }
7405 
7406    gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1],
7407                              Operand::c32(S_008F04_STRIDE(stride)));
7408    gsvs_dwords[2] = bld.copy(bld.def(s1), Operand::c32(ctx->program->wave_size));
7409 
7410    gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), gsvs_dwords[0], gsvs_dwords[1],
7411                           gsvs_dwords[2], gsvs_dwords[3]);
7412 
7413    unsigned offset = 0;
7414    for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
7415       if (ctx->program->info->gs.output_streams[i] != stream)
7416          continue;
7417 
7418       for (unsigned j = 0; j < 4; j++) {
7419          if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j)))
7420             continue;
7421 
7422          if (ctx->outputs.mask[i] & (1 << j)) {
7423             Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
7424             unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;
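            /* The MTBUF immediate offset is a 12-bit field, so anything >= 4096
             * has to be carried in the VGPR address instead. */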
7425             if (const_offset >= 4096u) {
7426                if (vaddr_offset.isUndefined())
7427                   vaddr_offset = bld.copy(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u));
7428                else
7429                   vaddr_offset = bld.vadd32(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u),
7430                                             vaddr_offset);
7431                const_offset %= 4096u;
7432             }
7433 
7434             aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(
7435                aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
7436             mtbuf->operands[0] = Operand(gsvs_ring);
7437             mtbuf->operands[1] = vaddr_offset;
7438             mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->ac.gs2vs_offset));
7439             mtbuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]);
7440             mtbuf->offen = !vaddr_offset.isUndefined();
7441             mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
7442             mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
7443             mtbuf->offset = const_offset;
7444             mtbuf->glc = true;
7445             mtbuf->slc = true;
7446             mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder);
7447             bld.insert(std::move(mtbuf));
7448          }
7449 
7450          offset += ctx->shader->info.gs.vertices_out;
7451       }
7452 
7453       /* The outputs for the next vertex are undefined, and keeping them around can
7454        * create invalid IR in the presence of control flow. */
7455       ctx->outputs.mask[i] = 0;
7456    }
7457 
7458    bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));
7459 }
7460 
7461 Temp
7462 emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src)
7463 {
7464    Builder bld(ctx->program, ctx->block);
7465 
7466    if (cluster_size == 1) {
7467       return src;
7468    }
7469    if (op == nir_op_iand && cluster_size == 4) {
7470       /* subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) */
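      /* s_wqm sets all four bits of a quad if any bit in that quad is set, i.e. it
       * computes a per-quad OR; combined with the and-not and the final negation,
       * this yields a per-quad AND. */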
7471       Temp tmp =
7472          bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7473       return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
7474                       bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
7475    } else if (op == nir_op_ior && cluster_size == 4) {
7476       /* subgroupClusteredOr(val, 4) -> wqm(val & exec) */
7477       return bld.sop1(
7478          Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
7479          bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
7480    } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
7481       /* subgroupAnd(val) -> (exec & ~val) == 0 */
7482       Temp tmp =
7483          bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src)
7484             .def(1)
7485             .getTemp();
7486       Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
7487       return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
7488    } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
7489       /* subgroupOr(val) -> (val & exec) != 0 */
7490       Temp tmp =
7491          bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))
7492             .def(1)
7493             .getTemp();
7494       return bool_to_vector_condition(ctx, tmp);
7495    } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
7496       /* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */
7497       Temp tmp =
7498          bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7499       tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
7500       tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(1u))
7501                .def(1)
7502                .getTemp();
7503       return bool_to_vector_condition(ctx, tmp);
7504    } else {
7505       /* subgroupClustered{And,Or,Xor}(val, n):
7506        *   lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32)
7507        *   cluster_offset = ~(n - 1) & lane_id; cluster_mask = ((1 << n) - 1)
7508        * subgroupClusteredAnd():
7509        *   return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
7510        * subgroupClusteredOr():
7511        *   return ((val & exec) >> cluster_offset) & cluster_mask != 0
7512        * subgroupClusteredXor():
7513        *   return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
7514        */
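      /* Worked example (n = 8, lane_id = 13): cluster_offset = ~7 & 13 = 8, so the
       * ballot is shifted down by 8 lanes and masked with cluster_mask = 0xff
       * before the comparison. */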
7515       Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1));
7516       Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1),
7517                                      Operand::c32(~uint32_t(cluster_size - 1)), lane_id);
7518 
7519       Temp tmp;
7520       if (op == nir_op_iand)
7521          tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src,
7522                         Operand(exec, bld.lm));
7523       else
7524          tmp =
7525             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7526 
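      /* cluster_size == 32 needs a special case: 1u << 32 would be undefined
       * behaviour in C++. */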
7527       uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
7528 
7529       if (ctx->program->chip_class <= GFX7)
7530          tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
7531       else if (ctx->program->wave_size == 64)
7532          tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
7533       else
7534          tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
7535       tmp = emit_extract_vector(ctx, tmp, 0, v1);
7536       if (cluster_mask != 0xffffffff)
7537          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(cluster_mask), tmp);
7538 
7539       if (op == nir_op_iand) {
7540          return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand::c32(cluster_mask),
7541                          tmp);
7542       } else if (op == nir_op_ior) {
7543          return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), tmp);
7544       } else if (op == nir_op_ixor) {
7545          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u),
7546                         bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand::zero()));
7547          return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), tmp);
7548       }
7549       assert(false);
7550       return Temp();
7551    }
7552 }
7553 
7554 Temp
7555 emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src)
7556 {
7557    Builder bld(ctx->program, ctx->block);
7558    assert(src.regClass() == bld.lm);
7559 
7560    /* subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
7561     * subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
7562     * subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
7563     */
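   /* mbcnt counts the set bits of its mask strictly below the current lane, so
    * e.g. the exclusive AND is true iff no lower active lane had val == 0. */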
7564    Temp tmp;
7565    if (op == nir_op_iand)
7566       tmp =
7567          bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7568    else
7569       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7570 
7571    Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp));
7572 
7573    if (op == nir_op_iand)
7574       return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand::zero(), mbcnt);
7575    else if (op == nir_op_ior)
7576       return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), mbcnt);
7577    else if (op == nir_op_ixor)
7578       return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(),
7579                       bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), mbcnt));
7580 
7581    assert(false);
7582    return Temp();
7583 }
7584 
7585 Temp
7586 emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src)
7587 {
7588    Builder bld(ctx->program, ctx->block);
7589 
7590    /* subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
7591     * subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
7592     * subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
7593     */
7594    Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
7595    if (op == nir_op_iand)
7596       return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7597    else if (op == nir_op_ior)
7598       return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7599    else if (op == nir_op_ixor)
7600       return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7601 
7602    assert(false);
7603    return Temp();
7604 }
7605 
7606 ReduceOp
7607 get_reduce_op(nir_op op, unsigned bit_size)
7608 {
7609    switch (op) {
7610 #define CASEI(name)                                                                                \
7611    case nir_op_##name:                                                                             \
7612       return (bit_size == 32)   ? name##32                                                         \
7613              : (bit_size == 16) ? name##16                                                         \
7614              : (bit_size == 8)  ? name##8                                                          \
7615                                 : name##64;
7616 #define CASEF(name)                                                                                \
7617    case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
7618       CASEI(iadd)
7619       CASEI(imul)
7620       CASEI(imin)
7621       CASEI(umin)
7622       CASEI(imax)
7623       CASEI(umax)
7624       CASEI(iand)
7625       CASEI(ior)
7626       CASEI(ixor)
7627       CASEF(fadd)
7628       CASEF(fmul)
7629       CASEF(fmin)
7630       CASEF(fmax)
7631    default: unreachable("unknown reduction op");
7632 #undef CASEI
7633 #undef CASEF
7634    }
7635 }
7636 
7637 void
7638 emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
7639 {
7640    Builder bld(ctx->program, ctx->block);
7641    Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7642    assert(dst.regClass().type() != RegType::vgpr);
7643    if (src.regClass().type() == RegType::vgpr)
7644       bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7645    else
7646       bld.copy(dst, src);
7647 }
7648 
7649 void
7650 emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
7651 {
7652    Builder bld(ctx->program, ctx->block);
7653    Temp src_tmp = get_ssa_temp(ctx, src.ssa);
7654 
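   /* With a subgroup-uniform source, an add-reduction collapses to count * value
    * and a xor-reduction to (count & 1) * value, where count is the number of
    * participating lanes. */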
7655    if (op == nir_op_fadd) {
7656       src_tmp = as_vgpr(ctx, src_tmp);
7657       Temp tmp = dst.regClass() == s1 ? bld.tmp(src_tmp.regClass()) : dst.getTemp();
7658 
7659       if (src.ssa->bit_size == 16) {
7660          count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
7661          bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
7662       } else {
7663          assert(src.ssa->bit_size == 32);
7664          count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
7665          bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
7666       }
7667 
7668       if (tmp != dst.getTemp())
7669          bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);
7670 
7671       return;
7672    }
7673 
7674    if (dst.regClass() == s1)
7675       src_tmp = bld.as_uniform(src_tmp);
7676 
7677    if (op == nir_op_ixor && count.type() == RegType::sgpr)
7678       count =
7679          bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
7680    else if (op == nir_op_ixor)
7681       count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);
7682 
7683    assert(dst.getTemp().type() == count.type());
7684 
7685    if (nir_src_is_const(src)) {
7686       if (nir_src_as_uint(src) == 1 && dst.bytes() <= 2)
7687          bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
7688       else if (nir_src_as_uint(src) == 1)
7689          bld.copy(dst, count);
7690       else if (nir_src_as_uint(src) == 0 && dst.bytes() <= 2)
7691          bld.vop1(aco_opcode::v_mov_b32, dst, Operand::zero()); /* RA will use SDWA if possible */
7692       else if (nir_src_as_uint(src) == 0)
7693          bld.copy(dst, Operand::zero());
7694       else if (count.type() == RegType::vgpr)
7695          bld.v_mul_imm(dst, count, nir_src_as_uint(src));
7696       else
7697          bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7698    } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
7699       bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
7700    } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
7701       bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
7702    } else if (dst.getTemp().type() == RegType::vgpr) {
7703       bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
7704    } else {
7705       bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7706    }
7707 }
7708 
7709 bool
7710 emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
7711 {
7712    nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7713    if (op == nir_op_imul || op == nir_op_fmul)
7714       return false;
7715 
7716    if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7717       Builder bld(ctx->program, ctx->block);
7718       Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7719       unsigned bit_size = instr->src[0].ssa->bit_size;
7720       if (bit_size > 32)
7721          return false;
7722 
7723       Temp thread_count =
7724          bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
7725 
7726       emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
7727    } else {
7728       emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7729    }
7730 
7731    return true;
7732 }
7733 
7734 bool
7735 emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
7736 {
7737    Builder bld(ctx->program, ctx->block);
7738    Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7739    nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7740    bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;
7741 
7742    if (op == nir_op_imul || op == nir_op_fmul)
7743       return false;
7744 
7745    if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7746       if (instr->src[0].ssa->bit_size > 32)
7747          return false;
7748 
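      /* mbcnt yields the number of active lanes below the current one; the extra
       * base operand of 1 turns that exclusive count into an inclusive one. */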
7749       Temp packed_tid;
7750       if (inc)
7751          packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
7752       else
7753          packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
7754 
7755       emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
7756       return true;
7757    }
7758 
7759    assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
7760           op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);
7761 
7762    if (inc) {
7763       emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7764       return true;
7765    }
7766 
7767    /* Copy the source and write the reduction operation identity to the first lane. */
7768    Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
7769    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7770    ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
7771    if (dst.bytes() == 8) {
7772       Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7773       bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7774       uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
7775       uint32_t identity_hi = get_reduction_identity(reduce_op, 1);
7776 
7777       lo =
7778          bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand::c32(identity_lo)), lane, lo);
7779       hi =
7780          bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand::c32(identity_hi)), lane, hi);
7781       bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
7782    } else {
7783       uint32_t identity = get_reduction_identity(reduce_op, 0);
7784       bld.writelane(dst, bld.copy(bld.hint_m0(s1), Operand::c32(identity)), lane,
7785                     as_vgpr(ctx, src));
7786    }
7787 
7788    return true;
7789 }
7790 
7791 Temp
7792 emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
7793                      Definition dst, Temp src)
7794 {
7795    assert(src.bytes() <= 8);
7796    assert(src.type() == RegType::vgpr);
7797 
7798    Builder bld(ctx->program, ctx->block);
7799 
7800    unsigned num_defs = 0;
7801    Definition defs[5];
7802    defs[num_defs++] = dst;
7803    defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */
7804 
7805    /* scalar identity temporary */
7806    bool need_sitmp = (ctx->program->chip_class <= GFX7 || ctx->program->chip_class >= GFX10) &&
7807                      aco_op != aco_opcode::p_reduce;
7808    if (aco_op == aco_opcode::p_exclusive_scan) {
7809       need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||
7810                      op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||
7811                      op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||
7812                      op == fmul64);
7813    }
7814    if (need_sitmp)
7815       defs[num_defs++] = bld.def(RegType::sgpr, dst.size());
7816 
7817    /* scc clobber */
7818    defs[num_defs++] = bld.def(s1, scc);
7819 
7820    /* vcc clobber */
7821    bool clobber_vcc = false;
7822    if ((op == iadd32 || op == imul64) && ctx->program->chip_class < GFX9)
7823       clobber_vcc = true;
7824    if ((op == iadd8 || op == iadd16) && ctx->program->chip_class < GFX8)
7825       clobber_vcc = true;
7826    if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)
7827       clobber_vcc = true;
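   /* These ops lower to sequences that write VCC (carry outs, or compares feeding
    * v_cndmask), so it must be declared clobbered. */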
7828 
7829    if (clobber_vcc)
7830       defs[num_defs++] = bld.def(bld.lm, vcc);
7831 
7832    Pseudo_reduction_instruction* reduce = create_instruction<Pseudo_reduction_instruction>(
7833       aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
7834    reduce->operands[0] = Operand(src);
7835    /* setup_reduce_temp will update these undef operands if needed */
7836    reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7837    reduce->operands[2] = Operand(v1.as_linear());
7838    std::copy(defs, defs + num_defs, reduce->definitions.begin());
7839 
7840    reduce->reduce_op = op;
7841    reduce->cluster_size = cluster_size;
7842    bld.insert(std::move(reduce));
7843 
7844    return dst.getTemp();
7845 }
7846 
7847 void
7848 emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
7849 {
7850    Builder bld(ctx->program, ctx->block);
7851    Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7852    Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7853 
7854    Temp ddx_1, ddx_2, ddy_1, ddy_2;
7855    uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
7856    uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
7857    uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
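   /* Within a pixel quad, lane 0 is the top-left pixel, lane 1 the top-right and
    * lane 2 the bottom-left; broadcasting those lanes lets us form the x and y
    * derivatives of each barycentric component. */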
7858 
7859    /* Build DD X/Y */
7860    if (ctx->program->chip_class >= GFX8) {
7861       Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
7862       ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
7863       ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
7864       Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
7865       ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
7866       ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
7867    } else {
7868       Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
7869       ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
7870       ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
7871       ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
7872       ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_1);
7873       Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
7874       ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
7875       ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_2);
7876       ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
7877       ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
7878    }
7879 
7880    /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
7881    aco_opcode mad =
7882       ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
7883    Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);
7884    Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
7885    tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
7886    tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
7887    Temp wqm1 = bld.tmp(v1);
7888    emit_wqm(bld, tmp1, wqm1, true);
7889    Temp wqm2 = bld.tmp(v1);
7890    emit_wqm(bld, tmp2, wqm2, true);
7891    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
7892    return;
7893 }
7894 
7895 Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
7896 void ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt);
7897 static void create_vs_exports(isel_context* ctx);
7898 
7899 Temp
7900 get_interp_param(isel_context* ctx, nir_intrinsic_op intrin,
7901                  enum glsl_interp_mode interp)
7902 {
7903    bool linear = interp == INTERP_MODE_NOPERSPECTIVE;
7904    if (intrin == nir_intrinsic_load_barycentric_pixel ||
7905        intrin == nir_intrinsic_load_barycentric_at_sample ||
7906        intrin == nir_intrinsic_load_barycentric_at_offset) {
7907       return get_arg(ctx, linear ? ctx->args->ac.linear_center : ctx->args->ac.persp_center);
7908    } else if (intrin == nir_intrinsic_load_barycentric_centroid) {
7909       return linear ? ctx->linear_centroid : ctx->persp_centroid;
7910    } else {
7911       assert(intrin == nir_intrinsic_load_barycentric_sample);
7912       return get_arg(ctx, linear ? ctx->args->ac.linear_sample : ctx->args->ac.persp_sample);
7913    }
7914 }
7915 
7916 void
7917 visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
7918 {
7919    Builder bld(ctx->program, ctx->block);
7920    switch (instr->intrinsic) {
7921    case nir_intrinsic_load_barycentric_sample:
7922    case nir_intrinsic_load_barycentric_pixel:
7923    case nir_intrinsic_load_barycentric_centroid: {
7924       glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
7925       Temp bary = get_interp_param(ctx, instr->intrinsic, mode);
7926       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7927       Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7928       Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7929       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2));
7930       emit_split_vector(ctx, dst, 2);
7931       break;
7932    }
7933    case nir_intrinsic_load_barycentric_model: {
7934       Temp model = get_arg(ctx, ctx->args->ac.pull_model);
7935 
7936       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7937       Temp p1 = emit_extract_vector(ctx, model, 0, v1);
7938       Temp p2 = emit_extract_vector(ctx, model, 1, v1);
7939       Temp p3 = emit_extract_vector(ctx, model, 2, v1);
7940       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2),
7941                  Operand(p3));
7942       emit_split_vector(ctx, dst, 3);
7943       break;
7944    }
7945    case nir_intrinsic_load_barycentric_at_sample: {
7946       uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
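      /* Each sample position is two floats (8 bytes) and the 1x/2x/4x/8x tables are
       * stored back-to-back, so skip the 1, 1+2 or 1+2+4 preceding entries to reach
       * the table for the current sample count. */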
7947       switch (ctx->options->key.ps.num_samples) {
7948       case 2: sample_pos_offset += 1 << 3; break;
7949       case 4: sample_pos_offset += 3 << 3; break;
7950       case 8: sample_pos_offset += 7 << 3; break;
7951       default: break;
7952       }
7953       Temp sample_pos;
7954       Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
7955       nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
7956       Temp private_segment_buffer = ctx->program->private_segment_buffer;
7957       // TODO: bounds checking?
7958       if (addr.type() == RegType::sgpr) {
7959          Operand offset;
7960          if (const_addr) {
7961             sample_pos_offset += const_addr->u32 << 3;
7962             offset = Operand::c32(sample_pos_offset);
7963          } else if (ctx->options->chip_class >= GFX9) {
7964             offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr,
7965                               Operand::c32(sample_pos_offset));
7966          } else {
7967             offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr,
7968                               Operand::c32(3u));
7969             offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
7970                               Operand::c32(sample_pos_offset));
7971          }
7972 
7973          Operand off = bld.copy(bld.def(s1), Operand(offset));
7974          sample_pos =
7975             bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off);
7976 
7977       } else if (ctx->options->chip_class >= GFX9) {
7978          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
7979          sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr,
7980                                  private_segment_buffer, sample_pos_offset);
7981       } else if (ctx->options->chip_class >= GFX7) {
7982          /* addr += private_segment_buffer + sample_pos_offset */
7983          Temp tmp0 = bld.tmp(s1);
7984          Temp tmp1 = bld.tmp(s1);
7985          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1),
7986                     private_segment_buffer);
7987          Definition scc_tmp = bld.def(s1, scc);
7988          tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0,
7989                          Operand::c32(sample_pos_offset));
7990          tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1,
7991                          Operand::zero(), bld.scc(scc_tmp.getTemp()));
7992          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
7993          Temp pck0 = bld.tmp(v1);
7994          Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
7995          tmp1 = as_vgpr(ctx, tmp1);
7996          Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1),
7997                                   bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand::zero(), carry);
7998          addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
7999 
8000          /* sample_pos = flat_load_dwordx2 addr */
8001          sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
8002       } else {
8003          assert(ctx->options->chip_class == GFX6);
8004 
8005          uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
8006                               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
8007          Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer,
8008                                 Operand::zero(), Operand::c32(rsrc_conf));
8009 
8010          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
8011          addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand::zero());
8012 
8013          sample_pos = bld.tmp(v2);
8014 
8015          aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(
8016             aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)};
8017          load->definitions[0] = Definition(sample_pos);
8018          load->operands[0] = Operand(rsrc);
8019          load->operands[1] = Operand(addr);
8020          load->operands[2] = Operand::zero();
8021          load->offset = sample_pos_offset;
8022          load->offen = 0;
8023          load->addr64 = true;
8024          load->glc = false;
8025          load->dlc = false;
8026          load->disable_wqm = false;
8027          ctx->block->instructions.emplace_back(std::move(load));
8028       }
8029 
8030       /* sample_pos -= 0.5 */
8031       Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
8032       Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
8033       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
8034       pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand::c32(0x3f000000u));
8035       pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand::c32(0x3f000000u));
8036 
8037       Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
8038       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), bary, pos1, pos2);
8039       break;
8040    }
8041    case nir_intrinsic_load_barycentric_at_offset: {
8042       Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
8043       RegClass rc = RegClass(offset.type(), 1);
8044       Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
8045       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
8046       Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
8047       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), bary, pos1, pos2);
8048       break;
8049    }
8050    case nir_intrinsic_load_front_face: {
8051       bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8052                Operand::zero(), get_arg(ctx, ctx->args->ac.front_face))
8053          .def(0)
8054          .setHint(vcc);
8055       break;
8056    }
8057    case nir_intrinsic_load_view_index: {
8058       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8059       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
8060       break;
8061    }
8062    case nir_intrinsic_load_frag_coord: {
8063       emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
8064       break;
8065    }
8066    case nir_intrinsic_load_frag_shading_rate:
8067       emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->dest.ssa));
8068       break;
8069    case nir_intrinsic_load_sample_pos: {
8070       Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
8071       Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
8072       bld.pseudo(
8073          aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8074          posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand::zero(),
8075          posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand::zero());
8076       break;
8077    }
8078    case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break;
8079    case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;
8080    case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;
8081    case nir_intrinsic_load_input:
8082    case nir_intrinsic_load_input_vertex: visit_load_input(ctx, instr); break;
8083    case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;
8084    case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;
8085    case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break;
8086    case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;
8087    case nir_intrinsic_vulkan_resource_index: visit_load_resource(ctx, instr); break;
8088    case nir_intrinsic_terminate:
8089    case nir_intrinsic_discard: visit_discard(ctx, instr); break;
8090    case nir_intrinsic_terminate_if:
8091    case nir_intrinsic_discard_if: visit_discard_if(ctx, instr); break;
8092    case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;
8093    case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;
8094    case nir_intrinsic_shared_atomic_add:
8095    case nir_intrinsic_shared_atomic_imin:
8096    case nir_intrinsic_shared_atomic_umin:
8097    case nir_intrinsic_shared_atomic_imax:
8098    case nir_intrinsic_shared_atomic_umax:
8099    case nir_intrinsic_shared_atomic_and:
8100    case nir_intrinsic_shared_atomic_or:
8101    case nir_intrinsic_shared_atomic_xor:
8102    case nir_intrinsic_shared_atomic_exchange:
8103    case nir_intrinsic_shared_atomic_comp_swap:
8104    case nir_intrinsic_shared_atomic_fadd:
8105    case nir_intrinsic_shared_atomic_fmin:
8106    case nir_intrinsic_shared_atomic_fmax: visit_shared_atomic(ctx, instr); break;
8107    case nir_intrinsic_image_deref_load:
8108    case nir_intrinsic_image_deref_sparse_load: visit_image_load(ctx, instr); break;
8109    case nir_intrinsic_image_deref_store: visit_image_store(ctx, instr); break;
8110    case nir_intrinsic_image_deref_atomic_add:
8111    case nir_intrinsic_image_deref_atomic_umin:
8112    case nir_intrinsic_image_deref_atomic_imin:
8113    case nir_intrinsic_image_deref_atomic_umax:
8114    case nir_intrinsic_image_deref_atomic_imax:
8115    case nir_intrinsic_image_deref_atomic_and:
8116    case nir_intrinsic_image_deref_atomic_or:
8117    case nir_intrinsic_image_deref_atomic_xor:
8118    case nir_intrinsic_image_deref_atomic_exchange:
8119    case nir_intrinsic_image_deref_atomic_comp_swap:
8120    case nir_intrinsic_image_deref_atomic_fmin:
8121    case nir_intrinsic_image_deref_atomic_fmax: visit_image_atomic(ctx, instr); break;
8122    case nir_intrinsic_image_deref_size: visit_image_size(ctx, instr); break;
8123    case nir_intrinsic_image_deref_samples: visit_image_samples(ctx, instr); break;
8124    case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
8125    case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
8126    case nir_intrinsic_load_global_constant:
8127    case nir_intrinsic_load_global: visit_load_global(ctx, instr); break;
8128    case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
8129    case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
8130    case nir_intrinsic_store_global: visit_store_global(ctx, instr); break;
8131    case nir_intrinsic_global_atomic_add:
8132    case nir_intrinsic_global_atomic_imin:
8133    case nir_intrinsic_global_atomic_umin:
8134    case nir_intrinsic_global_atomic_imax:
8135    case nir_intrinsic_global_atomic_umax:
8136    case nir_intrinsic_global_atomic_and:
8137    case nir_intrinsic_global_atomic_or:
8138    case nir_intrinsic_global_atomic_xor:
8139    case nir_intrinsic_global_atomic_exchange:
8140    case nir_intrinsic_global_atomic_comp_swap:
8141    case nir_intrinsic_global_atomic_fmin:
8142    case nir_intrinsic_global_atomic_fmax: visit_global_atomic(ctx, instr); break;
8143    case nir_intrinsic_ssbo_atomic_add:
8144    case nir_intrinsic_ssbo_atomic_imin:
8145    case nir_intrinsic_ssbo_atomic_umin:
8146    case nir_intrinsic_ssbo_atomic_imax:
8147    case nir_intrinsic_ssbo_atomic_umax:
8148    case nir_intrinsic_ssbo_atomic_and:
8149    case nir_intrinsic_ssbo_atomic_or:
8150    case nir_intrinsic_ssbo_atomic_xor:
8151    case nir_intrinsic_ssbo_atomic_exchange:
8152    case nir_intrinsic_ssbo_atomic_comp_swap:
8153    case nir_intrinsic_ssbo_atomic_fmin:
8154    case nir_intrinsic_ssbo_atomic_fmax: visit_atomic_ssbo(ctx, instr); break;
8155    case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
8156    case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
8157    case nir_intrinsic_get_ssbo_size: visit_get_ssbo_size(ctx, instr); break;
8158    case nir_intrinsic_scoped_barrier: emit_scoped_barrier(ctx, instr); break;
8159    case nir_intrinsic_load_num_workgroups: {
8160       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8161       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
8162       emit_split_vector(ctx, dst, 3);
8163       break;
8164    }
8165    case nir_intrinsic_load_ray_launch_size: {
8166       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8167       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.ray_launch_size)));
8168       emit_split_vector(ctx, dst, 3);
8169       break;
8170    }
8171    case nir_intrinsic_load_local_invocation_id: {
8172       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8173       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
8174       emit_split_vector(ctx, dst, 3);
8175       break;
8176    }
8177    case nir_intrinsic_load_workgroup_id: {
8178       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8179       const struct ac_arg* args = ctx->args->ac.workgroup_ids;
8180       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8181                  args[0].used ? Operand(get_arg(ctx, args[0])) : Operand::zero(),
8182                  args[1].used ? Operand(get_arg(ctx, args[1])) : Operand::zero(),
8183                  args[2].used ? Operand(get_arg(ctx, args[2])) : Operand::zero());
8184       emit_split_vector(ctx, dst, 3);
8185       break;
8186    }
8187    case nir_intrinsic_load_local_invocation_index: {
8188       if (ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::HS) {
8189          bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8190                   get_arg(ctx, ctx->args->ac.vs_rel_patch_id));
8191          break;
8192       } else if (ctx->stage.hw == HWStage::GS || ctx->stage.hw == HWStage::NGG) {
8193          bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), thread_id_in_threadgroup(ctx));
8194          break;
8195       }
8196 
8197       Temp id = emit_mbcnt(ctx, bld.tmp(v1));
8198 
8199       /* The tg_size bits [6:11] contain the subgroup id; multiply it by the
8200        * wave size and OR the thread id into the result.
8201        */
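      /* Example (wave64): subgroup id 2 occupies bits [6:11], so the masked value
       * is 2 << 6 = 128 = 2 * 64, already scaled by the wave size. */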
8202       if (ctx->program->wave_size == 64) {
8203          /* After the s_and, the bits are already multiplied by 64 (left-shifted by 6),
8204           * so they can be fed straight to v_or. */
8205          Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8206                                 Operand::c32(0xfc0u), get_arg(ctx, ctx->args->ac.tg_size));
8207          bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num,
8208                   id);
8209       } else {
8210          /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */
8211          Temp tg_num =
8212             bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8213                      get_arg(ctx, ctx->args->ac.tg_size), Operand::c32(0x6u | (0x6u << 16)));
8214          bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8215                   tg_num, Operand::c32(0x5u), id);
8216       }
8217       break;
8218    }
8219    case nir_intrinsic_load_subgroup_id: {
8220       if (ctx->stage == compute_cs) {
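         /* tg_size bits [6:11] contain the wave id within the threadgroup. */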
         bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
                  bld.def(s1, scc), get_arg(ctx, ctx->args->ac.tg_size),
                  Operand::c32(0x6u | (0x6u << 16)));
      } else if (ctx->stage.hw == HWStage::NGG) {
         /* Get the id of the current wave within the threadgroup (workgroup) */
         bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
                  bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info),
                  Operand::c32(24u | (4u << 16)));
      } else {
         bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::zero());
      }
      break;
   }
   case nir_intrinsic_load_subgroup_invocation: {
      emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->dest.ssa));
      break;
   }
   case nir_intrinsic_load_num_subgroups: {
      if (ctx->stage == compute_cs)
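         /* tg_size bits [0:5] contain the number of waves in the threadgroup. */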
         bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
                  bld.def(s1, scc), Operand::c32(0x3fu), get_arg(ctx, ctx->args->ac.tg_size));
      else if (ctx->stage.hw == HWStage::NGG)
         bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
                  bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info),
                  Operand::c32(28u | (4u << 16)));
      else
         bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::c32(0x1u));
      break;
   }
   case nir_intrinsic_ballot: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);

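      /* 1-bit sources are already lane masks; 32/64-bit sources are turned into
       * one by comparing against zero. */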
      if (instr->src[0].ssa->bit_size == 1) {
         assert(src.regClass() == bld.lm);
      } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
         src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
      } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
         src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }

      /* Make sure that all inactive lanes return zero.
       * Value-numbering might remove the comparison above */
      src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
      if (dst.size() != bld.lm.size()) {
         /* Wave32 with ballot size set to 64 */
         src =
            bld.pseudo(aco_opcode::p_create_vector, bld.def(dst.regClass()), src, Operand::zero());
      }

      emit_wqm(bld, src, dst);
      break;
   }
   case nir_intrinsic_shuffle:
   case nir_intrinsic_read_invocation: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      if (!nir_src_is_divergent(instr->src[0])) {
         emit_uniform_subgroup(ctx, instr, src);
      } else {
         Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
         if (instr->intrinsic == nir_intrinsic_read_invocation ||
             !nir_src_is_divergent(instr->src[1]))
            tid = bld.as_uniform(tid);
         Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);

         if (instr->dest.ssa.bit_size != 1)
            src = as_vgpr(ctx, src);

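         /* Sub-dword sources go through the 32-bit bpermute path; the result is
          * narrowed back down afterwards. */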
         if (src.regClass() == v1b || src.regClass() == v2b) {
            Temp tmp = bld.tmp(v1);
            tmp = emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), tmp);
            if (dst.type() == RegType::vgpr)
               bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
                          bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
            else
               bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
         } else if (src.regClass() == v1) {
            emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), dst);
         } else if (src.regClass() == v2) {
            Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
            bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
            lo = emit_wqm(bld, emit_bpermute(ctx, bld, tid, lo));
            hi = emit_wqm(bld, emit_bpermute(ctx, bld, tid, hi));
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
            emit_split_vector(ctx, dst, 2);
         } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
            assert(src.regClass() == bld.lm);
            Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
            bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
         } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
            assert(src.regClass() == bld.lm);
            Temp tmp;
            if (ctx->program->chip_class <= GFX7)
               tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
            else if (ctx->program->wave_size == 64)
               tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
            else
               tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
            tmp = emit_extract_vector(ctx, tmp, 0, v1);
            tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), tmp);
            emit_wqm(bld, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp),
                     dst);
         } else {
            isel_err(&instr->instr, "Unimplemented NIR instr bit size");
         }
      }
      break;
   }
   case nir_intrinsic_load_sample_id: {
      bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               get_arg(ctx, ctx->args->ac.ancillary), Operand::c32(8u), Operand::c32(4u));
      break;
   }
   case nir_intrinsic_load_sample_mask_in: {
      visit_load_sample_mask_in(ctx, instr);
      break;
   }
   case nir_intrinsic_read_first_invocation: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
         emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst);
      } else if (src.regClass() == v2) {
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
         lo = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
         hi = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
         emit_split_vector(ctx, dst, 2);
      } else if (instr->dest.ssa.bit_size == 1) {
         assert(src.regClass() == bld.lm);
         Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
                             bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
         bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
      } else {
         bld.copy(Definition(dst), src);
      }
      break;
   }
   case nir_intrinsic_vote_all: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      assert(src.regClass() == bld.lm);
      assert(dst.regClass() == bld.lm);

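      /* vote_all is true iff no active lane has a false source:
       * check (exec & ~src) == 0 via scc, then invert the result. */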
      Temp tmp =
         bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src)
            .def(1)
            .getTemp();
      Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
      bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
      break;
   }
   case nir_intrinsic_vote_any: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      assert(src.regClass() == bld.lm);
      assert(dst.regClass() == bld.lm);

      Temp tmp = bool_to_scalar_condition(ctx, src);
      bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
      break;
   }
   case nir_intrinsic_reduce:
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
      unsigned cluster_size =
         instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;
      cluster_size = util_next_power_of_two(
         MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));

      if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size &&
          instr->dest.ssa.bit_size != 1) {
         /* We use divergence analysis to assign the regclass, so check if it's
          * working as expected */
         ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
         if (instr->intrinsic == nir_intrinsic_inclusive_scan)
            expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor;
         assert(nir_dest_is_divergent(instr->dest) == expected_divergent);

         if (instr->intrinsic == nir_intrinsic_reduce) {
            if (emit_uniform_reduce(ctx, instr))
               break;
         } else if (emit_uniform_scan(ctx, instr)) {
            break;
         }
      }

      if (instr->dest.ssa.bit_size == 1) {
         if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
            op = nir_op_iand;
         else if (op == nir_op_iadd)
            op = nir_op_ixor;
         else if (op == nir_op_umax || op == nir_op_imax)
            op = nir_op_ior;
         assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);

         switch (instr->intrinsic) {
         case nir_intrinsic_reduce:
            emit_wqm(bld, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
            break;
         case nir_intrinsic_exclusive_scan:
            emit_wqm(bld, emit_boolean_exclusive_scan(ctx, op, src), dst);
            break;
         case nir_intrinsic_inclusive_scan:
            emit_wqm(bld, emit_boolean_inclusive_scan(ctx, op, src), dst);
            break;
         default: assert(false);
         }
      } else if (cluster_size == 1) {
         bld.copy(Definition(dst), src);
      } else {
         unsigned bit_size = instr->src[0].ssa->bit_size;

         src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));

         ReduceOp reduce_op = get_reduce_op(op, bit_size);

         aco_opcode aco_op;
         switch (instr->intrinsic) {
         case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
         case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
         case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
         default: unreachable("unknown reduce intrinsic");
         }

         Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size,
                                             bld.def(dst.regClass()), src);
         emit_wqm(bld, tmp_dst, dst);
      }
      break;
   }
   case nir_intrinsic_quad_broadcast:
   case nir_intrinsic_quad_swap_horizontal:
   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal:
   case nir_intrinsic_quad_swizzle_amd: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);

      if (!nir_dest_is_divergent(instr->dest)) {
         emit_uniform_subgroup(ctx, instr, src);
         break;
      }

      /* Quad broadcast lane. */
      unsigned lane = 0;
      /* Use VALU for the bool instructions that don't have a SALU-only special case. */
      bool bool_use_valu = instr->dest.ssa.bit_size == 1;

      uint16_t dpp_ctrl = 0;

      switch (instr->intrinsic) {
      case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break;
      case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break;
      case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break;
      case nir_intrinsic_quad_swizzle_amd: dpp_ctrl = nir_intrinsic_swizzle_mask(instr); break;
      case nir_intrinsic_quad_broadcast:
         lane = nir_src_as_const_value(instr->src[1])->u32;
         dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
         bool_use_valu = false;
         break;
      default: break;
      }

      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      Temp tmp(dst);

      /* Setup source. */
      if (bool_use_valu)
         src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
                            Operand::c32(-1), src);
      else if (instr->dest.ssa.bit_size != 1)
         src = as_vgpr(ctx, src);

      /* Setup temporary destination. */
      if (bool_use_valu)
         tmp = bld.tmp(v1);
      else if (ctx->program->stage == fragment_fs)
         tmp = bld.tmp(dst.regClass());

      if (instr->dest.ssa.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) {
         /* Special case for quad broadcast using SALU only. */
         assert(src.regClass() == bld.lm && tmp.regClass() == bld.lm);

         uint32_t half_mask = 0x11111111u << lane;
         Operand mask_tmp = bld.lm.bytes() == 4
                               ? Operand::c32(half_mask)
                               : bld.pseudo(aco_opcode::p_create_vector, bld.def(bld.lm),
                                            Operand::c32(half_mask), Operand::c32(half_mask));

         src =
            bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
         src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src);
         bld.sop1(Builder::s_wqm, Definition(tmp), src);
      } else if (instr->dest.ssa.bit_size <= 32 || bool_use_valu) {
         unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->dest.ssa.bit_size / 8;
         Definition def = excess_bytes ? bld.def(v1) : Definition(tmp);

         if (ctx->program->chip_class >= GFX8)
            bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl);
         else
            bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl);

         if (excess_bytes)
            bld.pseudo(aco_opcode::p_split_vector, Definition(tmp),
                       bld.def(RegClass::get(tmp.type(), excess_bytes)), def.getTemp());
      } else if (instr->dest.ssa.bit_size == 64) {
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);

         if (ctx->program->chip_class >= GFX8) {
            lo = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl);
            hi = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl);
         } else {
            lo = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl);
            hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl);
         }

         bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lo, hi);
         emit_split_vector(ctx, tmp, 2);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size.");
      }

      if (tmp.id() != dst.id()) {
         if (bool_use_valu)
            tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp);

         /* Vulkan spec 9.25: Helper invocations must be active for quad group instructions. */
         emit_wqm(bld, tmp, dst, true);
      }

      break;
   }
   case nir_intrinsic_masked_swizzle_amd: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      if (!nir_dest_is_divergent(instr->dest)) {
         emit_uniform_subgroup(ctx, instr, src);
         break;
      }
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      uint32_t mask = nir_intrinsic_swizzle_mask(instr);

      if (instr->dest.ssa.bit_size != 1)
         src = as_vgpr(ctx, src);

      if (instr->dest.ssa.bit_size == 1) {
         assert(src.regClass() == bld.lm);
         src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
                            Operand::c32(-1), src);
         src = emit_masked_swizzle(ctx, bld, src, mask);
         Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
         emit_wqm(bld, tmp, dst);
      } else if (dst.regClass() == v1b) {
         Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
         emit_extract_vector(ctx, tmp, 0, dst);
      } else if (dst.regClass() == v2b) {
         Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
         emit_extract_vector(ctx, tmp, 0, dst);
      } else if (dst.regClass() == v1) {
         emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask), dst);
      } else if (dst.regClass() == v2) {
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
         lo = emit_wqm(bld, emit_masked_swizzle(ctx, bld, lo, mask));
         hi = emit_wqm(bld, emit_masked_swizzle(ctx, bld, hi, mask));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
         emit_split_vector(ctx, dst, 2);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_intrinsic_write_invocation_amd: {
      Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
      Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
      Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      if (dst.regClass() == v1) {
         /* src2 is ignored for writelane. RA assigns the same reg for dst */
         emit_wqm(bld, bld.writelane(bld.def(v1), val, lane, src), dst);
      } else if (dst.regClass() == v2) {
         Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
         Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
         bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
         Temp lo = emit_wqm(bld, bld.writelane(bld.def(v1), val_lo, lane, src_lo));
         Temp hi = emit_wqm(bld, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
         emit_split_vector(ctx, dst, 2);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_intrinsic_mbcnt_amd: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      /* Fit 64-bit mask for wave32 */
      src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
      Temp wqm_tmp = emit_mbcnt(ctx, bld.tmp(v1), Operand(src), Operand(add_src));
      emit_wqm(bld, wqm_tmp, dst);
      break;
   }
   case nir_intrinsic_byte_permute_amd: {
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      assert(dst.regClass() == v1);
      assert(ctx->program->chip_class >= GFX8);
      bld.vop3(aco_opcode::v_perm_b32, Definition(dst), get_ssa_temp(ctx, instr->src[0].ssa),
               as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)),
               as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));
      break;
   }
   case nir_intrinsic_lane_permute_16_amd: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      assert(ctx->program->chip_class >= GFX10);

      if (src.regClass() == s1) {
         bld.copy(Definition(dst), src);
      } else if (dst.regClass() == v1 && src.regClass() == v1) {
         bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
                  bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
                  bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
      } else {
         isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
      }
      break;
   }
   case nir_intrinsic_load_helper_invocation:
   case nir_intrinsic_is_helper_invocation: {
      /* load_helper() after demote() gets lowered to is_helper().
       * Otherwise, these two behave the same. */
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      bld.pseudo(aco_opcode::p_is_helper, Definition(dst), Operand(exec, bld.lm));
      ctx->block->kind |= block_kind_needs_lowering;
      ctx->program->needs_exact = true;
      break;
   }
   case nir_intrinsic_demote:
      bld.pseudo(aco_opcode::p_demote_to_helper, Operand::c32(-1u));

      if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
         ctx->cf_info.exec_potentially_empty_discard = true;
      ctx->block->kind |= block_kind_uses_demote;
      ctx->program->needs_exact = true;
      break;
   case nir_intrinsic_demote_if: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      assert(src.regClass() == bld.lm);
      Temp cond =
         bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
      bld.pseudo(aco_opcode::p_demote_to_helper, cond);

      if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
         ctx->cf_info.exec_potentially_empty_discard = true;
      ctx->block->kind |= block_kind_uses_demote;
      ctx->program->needs_exact = true;
      break;
   }
   case nir_intrinsic_first_invocation: {
      emit_wqm(bld, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
               get_ssa_temp(ctx, &instr->dest.ssa));
      break;
   }
   case nir_intrinsic_last_invocation: {
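      /* The last active lane is wave_size - 1 - clz(exec), with clz computed by s_flbit. */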
      Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
      Temp last = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc),
                           Operand::c32(ctx->program->wave_size - 1u), flbit);
      emit_wqm(bld, last, get_ssa_temp(ctx, &instr->dest.ssa));
      break;
   }
   case nir_intrinsic_elect: {
      /* p_elect is lowered in aco_insert_exec_mask.
       * Use exec as an operand so value numbering and the pre-RA optimizer won't recognize
       * two p_elect with different exec masks as the same.
       */
      Temp elected = bld.pseudo(aco_opcode::p_elect, bld.def(bld.lm), Operand(exec, bld.lm));
      emit_wqm(bld, elected, get_ssa_temp(ctx, &instr->dest.ssa));
      ctx->block->kind |= block_kind_needs_lowering;
      break;
   }
   case nir_intrinsic_shader_clock: {
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      if (nir_intrinsic_memory_scope(instr) == NIR_SCOPE_SUBGROUP &&
          ctx->options->chip_class >= GFX10_3) {
         /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
         Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero());
      } else {
         aco_opcode opcode = nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE
                                ? aco_opcode::s_memrealtime
                                : aco_opcode::s_memtime;
         bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
      }
      emit_split_vector(ctx, dst, 2);
      break;
   }
   case nir_intrinsic_load_vertex_id_zero_base: {
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
      break;
   }
   case nir_intrinsic_load_first_vertex: {
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
      break;
   }
   case nir_intrinsic_load_base_instance: {
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
      break;
   }
   case nir_intrinsic_load_instance_id: {
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
      break;
   }
   case nir_intrinsic_load_draw_id: {
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
      break;
   }
   case nir_intrinsic_load_invocation_id: {
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);

      if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
         if (ctx->options->chip_class >= GFX10)
            bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand::c32(127u),
                         get_arg(ctx, ctx->args->ac.gs_invocation_id));
         else
            bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));
      } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
         bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
                  Operand::c32(8u), Operand::c32(5u));
      } else {
         unreachable("Unsupported stage for load_invocation_id");
      }

      break;
   }
   case nir_intrinsic_load_primitive_id: {
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);

      switch (ctx->shader->info.stage) {
      case MESA_SHADER_GEOMETRY:
         bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
         break;
      case MESA_SHADER_TESS_CTRL:
         bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id));
         break;
      case MESA_SHADER_TESS_EVAL:
         bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id));
         break;
      default:
         if (ctx->stage.hw == HWStage::NGG && !ctx->stage.has(SWStage::GS)) {
            /* In case of NGG, the GS threads always have the primitive ID
             * even if there is no SW GS. */
            bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
            break;
         }
         unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
      }

      break;
   }
   case nir_intrinsic_load_patch_vertices_in: {
      assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL ||
             ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);

      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      bld.copy(Definition(dst), Operand::c32(ctx->args->options->key.tcs.tess_input_vertices));
      break;
   }
   case nir_intrinsic_emit_vertex_with_counter: {
      assert(ctx->stage.hw == HWStage::GS);
      visit_emit_vertex_with_counter(ctx, instr);
      break;
   }
   case nir_intrinsic_end_primitive_with_counter: {
      if (ctx->stage.hw != HWStage::NGG) {
         unsigned stream = nir_intrinsic_stream_id(instr);
         bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1,
                  sendmsg_gs(true, false, stream));
      }
      break;
   }
   case nir_intrinsic_set_vertex_and_primitive_count: {
      assert(ctx->stage.hw == HWStage::GS);
      /* unused in the legacy pipeline; the HW keeps track of this for us */
      break;
   }
   case nir_intrinsic_load_tess_rel_patch_id_amd: {
      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_tess_rel_patch_id(ctx));
      break;
   }
   case nir_intrinsic_load_ring_tess_factors_amd: {
      bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               ctx->program->private_segment_buffer, Operand::c32(RING_HS_TESS_FACTOR * 16u));
      break;
   }
   case nir_intrinsic_load_ring_tess_factors_offset_amd: {
      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               get_arg(ctx, ctx->args->ac.tcs_factor_offset));
      break;
   }
   case nir_intrinsic_load_ring_tess_offchip_amd: {
      bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               ctx->program->private_segment_buffer, Operand::c32(RING_HS_TESS_OFFCHIP * 16u));
      break;
   }
   case nir_intrinsic_load_ring_tess_offchip_offset_amd: {
      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               get_arg(ctx, ctx->args->ac.tess_offchip_offset));
      break;
   }
   case nir_intrinsic_load_ring_esgs_amd: {
      unsigned ring = ctx->stage.hw == HWStage::ES ? RING_ESGS_VS : RING_ESGS_GS;
      bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               ctx->program->private_segment_buffer, Operand::c32(ring * 16u));
      break;
   }
   case nir_intrinsic_load_ring_es2gs_offset_amd: {
      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               get_arg(ctx, ctx->args->ac.es2gs_offset));
      break;
   }
   case nir_intrinsic_load_gs_vertex_offset_amd: {
      /* GFX6-8 uses 6 separate args, while GFX9+ packs these into only 3 args. */
      unsigned b = nir_intrinsic_base(instr);
      assert(b <= (ctx->program->chip_class >= GFX9 ? 2 : 5));
      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               get_arg(ctx, ctx->args->ac.gs_vtx_offset[b]));
      break;
   }
   case nir_intrinsic_has_input_vertex_amd:
   case nir_intrinsic_has_input_primitive_amd: {
      assert(ctx->stage.hw == HWStage::NGG);
      unsigned i = instr->intrinsic == nir_intrinsic_has_input_vertex_amd ? 0 : 1;
      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), merged_wave_info_to_mask(ctx, i));
      break;
   }
   case nir_intrinsic_load_workgroup_num_input_vertices_amd:
   case nir_intrinsic_load_workgroup_num_input_primitives_amd: {
      assert(ctx->stage.hw == HWStage::NGG);
      unsigned pos =
         instr->intrinsic == nir_intrinsic_load_workgroup_num_input_vertices_amd ? 12 : 22;
      bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               bld.def(s1, scc), get_arg(ctx, ctx->args->ac.gs_tg_info),
               Operand::c32(pos | (9u << 16u)));
      break;
   }
   case nir_intrinsic_load_initial_edgeflags_amd: {
      assert(ctx->stage.hw == HWStage::NGG);

      Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
      /* Get initial edgeflags for each vertex at bits 8, 9, 10 of gs_invocation_id. */
      Temp flags =
         bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x700u), gs_invocation_id);
      /* Move the bits to their desired position: 8->9, 9->19, 10->29. */
      flags = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), Operand::c32(0x80402u), flags);
      /* Remove garbage bits that are a byproduct of the multiplication. */
      bld.vop2(aco_opcode::v_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               Operand::c32(0x20080200), flags);
      break;
   }
   case nir_intrinsic_load_packed_passthrough_primitive_amd: {
      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]));
      break;
   }
   case nir_intrinsic_export_vertex_amd: {
      ctx->block->kind |= block_kind_export_end;
      create_vs_exports(ctx);
      break;
   }
   case nir_intrinsic_export_primitive_amd: {
      assert(ctx->stage.hw == HWStage::NGG);
      Temp prim_exp_arg = get_ssa_temp(ctx, instr->src[0].ssa);
      bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),
              1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */,
              true /* done */, false /* valid mask */);
      break;
   }
   case nir_intrinsic_alloc_vertices_and_primitives_amd: {
      assert(ctx->stage.hw == HWStage::NGG);
      Temp num_vertices = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp num_primitives = get_ssa_temp(ctx, instr->src[1].ssa);
      ngg_emit_sendmsg_gs_alloc_req(ctx, num_vertices, num_primitives);
      break;
   }
   case nir_intrinsic_gds_atomic_add_amd: {
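      /* The trailing 'true' sets the DS instruction's gds flag so the add targets
       * GDS instead of LDS; m0 has to be initialized before the GDS access. */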
      Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);
      Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);
      Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val)));
      bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,
             true);
      break;
   }
   case nir_intrinsic_load_shader_query_enabled_amd: {
      unsigned cmp_bit = 0;
      Temp shader_query_enabled =
         bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc),
                  get_arg(ctx, ctx->args->ngg_gs_state), Operand::c32(cmp_bit));
      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               bool_to_vector_condition(ctx, shader_query_enabled));
      break;
   }
   case nir_intrinsic_load_cull_front_face_enabled_amd:
   case nir_intrinsic_load_cull_back_face_enabled_amd:
   case nir_intrinsic_load_cull_ccw_amd:
   case nir_intrinsic_load_cull_small_primitives_enabled_amd: {
      unsigned cmp_bit;
      if (instr->intrinsic == nir_intrinsic_load_cull_front_face_enabled_amd)
         cmp_bit = 0;
      else if (instr->intrinsic == nir_intrinsic_load_cull_back_face_enabled_amd)
         cmp_bit = 1;
      else if (instr->intrinsic == nir_intrinsic_load_cull_ccw_amd)
         cmp_bit = 2;
      else if (instr->intrinsic == nir_intrinsic_load_cull_small_primitives_enabled_amd)
         cmp_bit = 3;
      else
         unreachable("unimplemented culling intrinsic");

      Builder::Result enabled =
         bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc),
                  get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(cmp_bit));
      enabled.instr->definitions[0].setNoCSE(true);
      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               bool_to_vector_condition(ctx, enabled));
      break;
   }
   case nir_intrinsic_load_sbt_amd: visit_load_sbt_amd(ctx, instr); break;
   case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
   case nir_intrinsic_load_cull_any_enabled_amd: {
      Builder::Result cull_any_enabled =
         bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
                  get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(0xbu));
      cull_any_enabled.instr->definitions[1].setNoCSE(true);
      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               bool_to_vector_condition(ctx, cull_any_enabled.def(1).getTemp()));
      break;
   }
   case nir_intrinsic_load_cull_small_prim_precision_amd: {
      /* The exponent is an 8-bit signed int; move it into a signed 32-bit int. */
      Temp exponent = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc),
                               get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(24u));
      /* small_prim_precision = 1.0 * 2^X */
      bld.vop3(aco_opcode::v_ldexp_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               Operand::c32(0x3f800000u), Operand(exponent));
      break;
   }
   case nir_intrinsic_load_viewport_x_scale: {
      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               get_arg(ctx, ctx->args->ngg_viewport_scale[0]));
      break;
   }
   case nir_intrinsic_load_viewport_y_scale: {
      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               get_arg(ctx, ctx->args->ngg_viewport_scale[1]));
      break;
   }
   case nir_intrinsic_load_viewport_x_offset: {
      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               get_arg(ctx, ctx->args->ngg_viewport_translate[0]));
      break;
   }
   case nir_intrinsic_load_viewport_y_offset: {
      bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               get_arg(ctx, ctx->args->ngg_viewport_translate[1]));
      break;
   }
   case nir_intrinsic_overwrite_vs_arguments_amd: {
      ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
      ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
      break;
   }
   case nir_intrinsic_overwrite_tes_arguments_amd: {
      ctx->arg_temps[ctx->args->ac.tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
      ctx->arg_temps[ctx->args->ac.tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
      ctx->arg_temps[ctx->args->ac.tes_rel_patch_id.arg_index] =
         get_ssa_temp(ctx, instr->src[2].ssa);
      ctx->arg_temps[ctx->args->ac.tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa);
      break;
   }
   default:
      isel_err(&instr->instr, "Unimplemented intrinsic instr");
      abort();

      break;
   }
}

void
tex_fetch_ptrs(isel_context* ctx, nir_tex_instr* instr, Temp* res_ptr, Temp* samp_ptr,
               enum glsl_base_type* stype)
{
   nir_deref_instr* texture_deref_instr = NULL;
   nir_deref_instr* sampler_deref_instr = NULL;
   int plane = -1;

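   /* Gather the texture/sampler derefs and the plane index from the instruction sources. */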
   for (unsigned i = 0; i < instr->num_srcs; i++) {
      switch (instr->src[i].src_type) {
      case nir_tex_src_texture_deref:
         texture_deref_instr = nir_src_as_deref(instr->src[i].src);
         break;
      case nir_tex_src_sampler_deref:
         sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
         break;
      case nir_tex_src_plane: plane = nir_src_as_int(instr->src[i].src); break;
      default: break;
      }
   }

   *stype = glsl_get_sampler_result_type(texture_deref_instr->type);

   if (!sampler_deref_instr)
      sampler_deref_instr = texture_deref_instr;

   if (plane >= 0) {
      assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
      *res_ptr = get_sampler_desc(ctx, texture_deref_instr,
                                  (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false);
   } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
      *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false);
   } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
      *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false);
   } else {
      *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false);
   }
   if (samp_ptr) {
      *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false);

      if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
         /* fix sampler aniso on SI/CI: samp[0] = samp[0] & img[7] */
         Builder bld(ctx->program, ctx->block);

         /* to avoid unnecessary moves, we split and recombine sampler and image */
         Temp img[8] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1),
                        bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
         Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
         bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]),
                    Definition(img[2]), Definition(img[3]), Definition(img[4]), Definition(img[5]),
                    Definition(img[6]), Definition(img[7]), *res_ptr);
         bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]),
                    Definition(samp[2]), Definition(samp[3]), *samp_ptr);

         samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]);
         *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), img[0], img[1], img[2],
                               img[3], img[4], img[5], img[6], img[7]);
         *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), samp[0], samp[1], samp[2],
                                samp[3]);
      }
   }
}

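/* Select the derivative components matching the chosen cube face: sc/tc get
 * the in-plane components and ma the major-axis component. */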
void
build_cube_select(isel_context* ctx, Temp ma, Temp id, Temp deriv, Temp* out_ma, Temp* out_sc,
                  Temp* out_tc)
{
   Builder bld(ctx->program, ctx->block);

   Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
   Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
   Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);

   Operand neg_one = Operand::c32(0xbf800000u);
   Operand one = Operand::c32(0x3f800000u);
   Operand two = Operand::c32(0x40000000u);
   Operand four = Operand::c32(0x40800000u);

   Temp is_ma_positive =
      bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), ma);
   Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
   Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::zero(), sgn_ma);

   Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
   Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
   is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);
   Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)),
                               bld.def(s1, scc), is_ma_z, is_ma_y);

   /* select sc */
   Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
   Temp sgn = bld.vop2_e64(
      aco_opcode::v_cndmask_b32, bld.def(v1),
      bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z), one, is_ma_y);
   *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);

   /* select tc */
   tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
   sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
   *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);

   /* select ma */
   tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
                  bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
                  deriv_z, is_ma_z);
   tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffffu), tmp);
   *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
}

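/* Turn cube(-array) direction vectors into face coordinates (sc, tc) plus a
 * face id, also adjusting explicit derivatives (txd) when present. */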
void
prepare_cube_coords(isel_context* ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy,
                    bool is_deriv, bool is_array)
{
   Builder bld(ctx->program, ctx->block);
   Temp ma, tc, sc, id;
   aco_opcode madak =
      ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32;
   aco_opcode madmk =
      ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32;

   if (is_array) {
      coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);

      /* see comment in ac_prepare_cube_coords() */
      if (ctx->options->chip_class <= GFX8)
         coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), coords[3]);
   }

   ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]);

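   /* invma = 1.0 / |ma|; the absolute value comes from the VOP3 abs modifier. */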
   aco_ptr<VOP3_instruction> vop3a{
      create_instruction<VOP3_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
   vop3a->operands[0] = Operand(ma);
   vop3a->abs[0] = true;
   Temp invma = bld.tmp(v1);
   vop3a->definitions[0] = Definition(invma);
   ctx->block->instructions.emplace_back(std::move(vop3a));

   sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
   if (!is_deriv)
      sc = bld.vop2(madak, bld.def(v1), sc, invma, Operand::c32(0x3fc00000u /*1.5*/));

   tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
   if (!is_deriv)
      tc = bld.vop2(madak, bld.def(v1), tc, invma, Operand::c32(0x3fc00000u /*1.5*/));

   id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);

   if (is_deriv) {
      sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
      tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);

      for (unsigned i = 0; i < 2; i++) {
         /* see comment in ac_prepare_cube_coords() */
         Temp deriv_ma;
         Temp deriv_sc, deriv_tc;
         build_cube_select(ctx, ma, id, i ? *ddy : *ddx, &deriv_ma, &deriv_sc, &deriv_tc);

         deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);

         Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
                           bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
                           bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
         Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
                           bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
                           bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
         *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
      }

      sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), sc);
      tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), tc);
   }

   if (is_array)
      id = bld.vop2(madmk, bld.def(v1), coords[3], id, Operand::c32(0x41000000u /*8.0*/));
   coords.resize(3);
   coords[0] = sc;
   coords[1] = tc;
   coords[2] = id;
}

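/* Fill cv[] with the constant value of each component of a NIR vec, or NULL
 * for components that aren't known constants. */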
void
get_const_vec(nir_ssa_def* vec, nir_const_value* cv[4])
{
   if (vec->parent_instr->type != nir_instr_type_alu)
      return;
   nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr);
   if (vec_instr->op != nir_op_vec(vec->num_components))
      return;

   for (unsigned i = 0; i < vec->num_components; i++) {
      cv[i] =
         vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;
   }
}

void
visit_tex(isel_context* ctx, nir_tex_instr* instr)
{
   assert(instr->op != nir_texop_txf_ms && instr->op != nir_texop_samples_identical);

   Builder bld(ctx->program, ctx->block);
   bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
        has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
        has_sample_index = false, has_clamped_lod = false;
   Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(),
                           offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp();
   std::vector<Temp> coords;
   std::vector<Temp> derivs;
   nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};
   enum glsl_base_type stype;
   tex_fetch_ptrs(ctx, instr, &resource, &sampler, &stype);

   bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
                                  (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
   bool tg4_integer_cube_workaround =
      tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;

   for (unsigned i = 0; i < instr->num_srcs; i++) {
      switch (instr->src[i].src_type) {
      case nir_tex_src_coord: {
         Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
         for (unsigned j = 0; j < coord.size(); j++)
            coords.emplace_back(emit_extract_vector(ctx, coord, j, v1));
         break;
      }
      case nir_tex_src_bias:
         bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
         has_bias = true;
         break;
      case nir_tex_src_lod: {
         if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
            level_zero = true;
         } else {
            lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
            has_lod = true;
         }
         break;
      }
      case nir_tex_src_min_lod:
         clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
         has_clamped_lod = true;
         break;
      case nir_tex_src_comparator:
         if (instr->is_shadow) {
            compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
            has_compare = true;
         }
         break;
      case nir_tex_src_offset:
         offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
         get_const_vec(instr->src[i].src.ssa, const_offset);
         has_offset = true;
         break;
      case nir_tex_src_ddx:
         ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
         has_ddx = true;
         break;
      case nir_tex_src_ddy:
         ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
         has_ddy = true;
         break;
      case nir_tex_src_ms_index:
         sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
         has_sample_index = true;
         break;
      case nir_tex_src_texture_offset:
      case nir_tex_src_sampler_offset:
      default: break;
      }
   }

   if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
      return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa));

   if (instr->op == nir_texop_texture_samples) {
      get_image_samples(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), resource);
      return;
   }

   if (has_offset && instr->op != nir_texop_txf) {
      aco_ptr<Instruction> tmp_instr;
      Temp acc, pack = Temp();

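      /* Pack up to four 6-bit texel offsets into a single dword, with constant
       * components folded into pack_const and the rest ORed in at runtime. */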
      uint32_t pack_const = 0;
      for (unsigned i = 0; i < offset.size(); i++) {
         if (!const_offset[i])
            continue;
         pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
      }

      if (offset.type() == RegType::sgpr) {
         for (unsigned i = 0; i < offset.size(); i++) {
            if (const_offset[i])
               continue;

            acc = emit_extract_vector(ctx, offset, i, s1);
            acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
                           Operand::c32(0x3Fu));

            if (i) {
               acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
                              Operand::c32(8u * i));
            }

            if (pack == Temp()) {
               pack = acc;
            } else {
               pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
            }
         }

         if (pack_const && pack != Temp())
            pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
                            Operand::c32(pack_const), pack);
      } else {
         for (unsigned i = 0; i < offset.size(); i++) {
            if (const_offset[i])
               continue;

            acc = emit_extract_vector(ctx, offset, i, v1);
            acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);

            if (i) {
               acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
            }

            if (pack == Temp()) {
               pack = acc;
            } else {
               pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
            }
         }

         if (pack_const && pack != Temp())
            pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
      }
      if (pack_const && pack == Temp())
         offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
      else if (pack == Temp())
         has_offset = false;
      else
         offset = pack;
   }

   if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
      prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd,
                          instr->is_array && instr->op != nir_texop_lod);

   /* pack derivatives */
   if (has_ddx || has_ddy) {
      if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
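         /* GFX9 samples 1D textures as 2D, so pad each gradient with an
          * explicit zero for the second component. */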
         assert(has_ddx && has_ddy && ddx.size() == 1 && ddy.size() == 1);
         Temp zero = bld.copy(bld.def(v1), Operand::zero());
         derivs = {ddx, zero, ddy, zero};
      } else {
         for (unsigned i = 0; has_ddx && i < ddx.size(); i++)
            derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1));
         for (unsigned i = 0; has_ddy && i < ddy.size(); i++)
            derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1));
      }
      has_derivs = true;
   }

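   /* NIR keeps the array layer as a float; sampled ops need it rounded to the
    * nearest integer (txf and the fragment fetch ops already use integers). */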
   if (instr->coord_components > 1 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
       instr->is_array && instr->op != nir_texop_txf)
      coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]);

   if (instr->coord_components > 2 &&
       (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
       instr->is_array && instr->op != nir_texop_txf && instr->op != nir_texop_fragment_fetch_amd &&
       instr->op != nir_texop_fragment_mask_fetch_amd)
      coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]);

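   /* GFX9 has no real 1D image ops: address the texture as 2D by inserting a
    * second coordinate, 0 for txf and 0.5 (the texel center) for sampled ops. */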
   if (ctx->options->chip_class == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
       instr->op != nir_texop_lod && instr->coord_components) {
      assert(coords.size() > 0 && coords.size() < 3);

      coords.insert(std::next(coords.begin()),
                    bld.copy(bld.def(v1), instr->op == nir_texop_txf ? Operand::c32(0)
                                                                     : Operand::c32(0x3f000000)));
   }

   bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);

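   /* For txf, texel offsets can simply be added to the integer coordinates up
    * front instead of selecting an *_o opcode variant. */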
   if (has_offset && instr->op == nir_texop_txf) {
      for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) {
         Temp off = emit_extract_vector(ctx, offset, i, v1);
         coords[i] = bld.vadd32(bld.def(v1), coords[i], off);
      }
      has_offset = false;
   }

   /* Build tex instruction */
   unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa) & 0xf;
   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
      dmask = u_bit_consecutive(0, util_last_bit(dmask));
   if (instr->is_sparse)
      dmask = MAX2(dmask, 1) | 0x10;
   unsigned dim =
      ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
         ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
         : 0;
   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
   Temp tmp_dst = dst;

   /* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */
   if (instr->op == nir_texop_tg4) {
      assert(instr->dest.ssa.num_components == (4 + instr->is_sparse));
      if (instr->is_shadow)
         dmask = 1;
      else
         dmask = 1 << instr->component;
      if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
         tmp_dst = bld.tmp(instr->is_sparse ? v5 : v4);
   } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
      tmp_dst = bld.tmp(v1);
   } else if (util_bitcount(dmask) != instr->dest.ssa.num_components ||
              dst.type() == RegType::sgpr) {
      tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
   }

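   /* txs and query_levels both map to image_get_resinfo; query_levels only
    * needs the last dword of the result (dmask = 1 << 3). */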
   if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
      if (!has_lod)
         lod = bld.copy(bld.def(v1), Operand::zero());

      MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(tmp_dst),
                                        resource, Operand(s4), std::vector<Temp>{lod});
      if (ctx->options->chip_class == GFX9 && instr->op == nir_texop_txs &&
          instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array) {
         tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
      } else if (instr->op == nir_texop_query_levels) {
         tex->dmask = 1 << 3;
      } else {
         tex->dmask = dmask;
      }
      tex->da = da;
      tex->dim = dim;

      expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
      return;
   }

   Temp tg4_compare_cube_wa64 = Temp();

   if (tg4_integer_workarounds) {
      Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
      Temp size = bld.tmp(v2);
      MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(size),
                                        resource, Operand(s4), std::vector<Temp>{tg4_lod});
      tex->dim = dim;
      tex->dmask = 0x3;
      tex->da = da;
      emit_split_vector(ctx, size, size.size());

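      /* half_texel[i] = -0.5 / size[i]: used to shift the coordinates down by
       * half a texel so gather4 on integer formats picks the intended texels. */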
      Temp half_texel[2];
      for (unsigned i = 0; i < 2; i++) {
         half_texel[i] = emit_extract_vector(ctx, size, i, v1);
         half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
         half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
         half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),
                                  Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);
      }

      if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
         /* In Vulkan, whether the sampler uses unnormalized coordinates or not
          * is a dynamic property of the sampler. Hence, to figure out whether
          * or not we need to divide by the texture size, we need to test the
          * sampler at runtime. This tests the bit set by radv_init_sampler().
          */
         unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;
         Temp not_needed =
            bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), sampler, Operand::c32(bit_idx));

         not_needed = bool_to_vector_condition(ctx, not_needed);
         half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
                                  Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);
         half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
                                  Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);
      }

      Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
                            bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};

      if (tg4_integer_cube_workaround) {
         /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */
         Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));
         aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
            aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};
         split->operands[0] = Operand(resource);
         for (unsigned i = 0; i < resource.size(); i++) {
            desc[i] = bld.tmp(s1);
            split->definitions[i] = Definition(desc[i]);
         }
         ctx->block->instructions.emplace_back(std::move(split));

         Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],
                              Operand::c32(20u | (6u << 16)));
         Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
                                         Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));

         Temp nfmt;
         if (stype == GLSL_TYPE_UINT) {
            nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
                            Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),
                            Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));
         } else {
            nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
                            Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),
                            Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));
         }
         tg4_compare_cube_wa64 = bld.tmp(bld.lm);
         bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);

         nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,
                         Operand::c32(26u));

         desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
                            Operand::c32(C_008F14_NUM_FORMAT));
         desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);

         aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
            aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};
         for (unsigned i = 0; i < resource.size(); i++)
            vec->operands[i] = Operand(desc[i]);
         resource = bld.tmp(resource.regClass());
         vec->definitions[0] = Definition(resource);
         ctx->block->instructions.emplace_back(std::move(vec));

         new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],
                                  tg4_compare_cube_wa64);
         new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],
                                  tg4_compare_cube_wa64);
      }
      coords[0] = new_coords[0];
      coords[1] = new_coords[1];
   }

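   /* Buffer textures don't go through MIMG; they're loaded with MUBUF format
    * loads instead. */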
   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
      // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
      // ac_build_buffer_load_format_gfx9_safe()

      assert(coords.size() == 1);
      aco_opcode op;
      switch (util_last_bit(dmask & 0xf)) {
      case 1: op = aco_opcode::buffer_load_format_x; break;
      case 2: op = aco_opcode::buffer_load_format_xy; break;
      case 3: op = aco_opcode::buffer_load_format_xyz; break;
      case 4: op = aco_opcode::buffer_load_format_xyzw; break;
      default: unreachable("Tex instruction loads more than 4 components.");
      }

      aco_ptr<MUBUF_instruction> mubuf{
         create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
      mubuf->operands[0] = Operand(resource);
      mubuf->operands[1] = Operand(coords[0]);
      mubuf->operands[2] = Operand::c32(0);
      mubuf->definitions[0] = Definition(tmp_dst);
      mubuf->idxen = true;
      mubuf->tfe = instr->is_sparse;
      if (mubuf->tfe)
         mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
      ctx->block->instructions.emplace_back(std::move(mubuf));

      expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
      return;
   }

   /* gather MIMG address components */
   std::vector<Temp> args;
   unsigned wqm_mask = 0;
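   /* Arguments consumed by implicit-derivative sampling must also be valid in
    * helper lanes, so they're computed in WQM; wqm_mask records which ones. */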
   if (has_offset) {
      wqm_mask |= u_bit_consecutive(args.size(), 1);
      args.emplace_back(offset);
   }
   if (has_bias)
      args.emplace_back(bias);
   if (has_compare)
      args.emplace_back(compare);
   if (has_derivs)
      args.insert(args.end(), derivs.begin(), derivs.end());

   wqm_mask |= u_bit_consecutive(args.size(), coords.size());
   args.insert(args.end(), coords.begin(), coords.end());

   if (has_sample_index)
      args.emplace_back(sample_index);
   if (has_lod)
      args.emplace_back(lod);
   if (has_clamped_lod)
      args.emplace_back(clamped_lod);

   if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd ||
       instr->op == nir_texop_fragment_mask_fetch_amd) {
      aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
                            instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS
                         ? aco_opcode::image_load
                         : aco_opcode::image_load_mip;
      Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
      MIMG_instruction* tex =
         emit_mimg(bld, op, Definition(tmp_dst), resource, Operand(s4), args, 0, vdata);
      if (instr->op == nir_texop_fragment_mask_fetch_amd)
         tex->dim = da ? ac_image_2darray : ac_image_2d;
      else
         tex->dim = dim;
      tex->dmask = dmask & 0xf;
      tex->unrm = true;
      tex->da = da;
      tex->tfe = instr->is_sparse;

      if (instr->op == nir_texop_fragment_mask_fetch_amd) {
         /* Use 0x76543210 if the image doesn't have FMASK. */
         assert(dmask == 1 && dst.bytes() == 4);
         assert(dst.id() != tmp_dst.id());

         if (dst.regClass() == s1) {
            Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(),
                                        emit_extract_vector(ctx, resource, 1, s1));
            bld.sop2(aco_opcode::s_cselect_b32, Definition(dst),
                     bld.as_uniform(tmp_dst), Operand::c32(0x76543210),
                     bld.scc(is_not_null));
         } else {
            Temp is_not_null = bld.tmp(bld.lm);
            bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(),
                         emit_extract_vector(ctx, resource, 1, s1))
               .def(0)
               .setHint(vcc);
            bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst),
                     bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null);
         }
      } else {
         expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
      }
      return;
   }

   // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
   aco_opcode opcode = aco_opcode::image_sample;
   if (has_offset) { /* image_sample_*_o */
      if (has_clamped_lod) {
         if (has_compare) {
            opcode = aco_opcode::image_sample_c_cl_o;
            if (has_derivs)
               opcode = aco_opcode::image_sample_c_d_cl_o;
            if (has_bias)
               opcode = aco_opcode::image_sample_c_b_cl_o;
         } else {
            opcode = aco_opcode::image_sample_cl_o;
            if (has_derivs)
               opcode = aco_opcode::image_sample_d_cl_o;
            if (has_bias)
               opcode = aco_opcode::image_sample_b_cl_o;
         }
      } else if (has_compare) {
         opcode = aco_opcode::image_sample_c_o;
         if (has_derivs)
            opcode = aco_opcode::image_sample_c_d_o;
         if (has_bias)
            opcode = aco_opcode::image_sample_c_b_o;
         if (level_zero)
            opcode = aco_opcode::image_sample_c_lz_o;
         if (has_lod)
            opcode = aco_opcode::image_sample_c_l_o;
      } else {
         opcode = aco_opcode::image_sample_o;
         if (has_derivs)
            opcode = aco_opcode::image_sample_d_o;
         if (has_bias)
            opcode = aco_opcode::image_sample_b_o;
         if (level_zero)
            opcode = aco_opcode::image_sample_lz_o;
         if (has_lod)
            opcode = aco_opcode::image_sample_l_o;
      }
   } else if (has_clamped_lod) { /* image_sample_*_cl */
      if (has_compare) {
         opcode = aco_opcode::image_sample_c_cl;
         if (has_derivs)
            opcode = aco_opcode::image_sample_c_d_cl;
         if (has_bias)
            opcode = aco_opcode::image_sample_c_b_cl;
      } else {
         opcode = aco_opcode::image_sample_cl;
         if (has_derivs)
            opcode = aco_opcode::image_sample_d_cl;
         if (has_bias)
            opcode = aco_opcode::image_sample_b_cl;
      }
   } else { /* no offset */
      if (has_compare) {
         opcode = aco_opcode::image_sample_c;
         if (has_derivs)
            opcode = aco_opcode::image_sample_c_d;
         if (has_bias)
            opcode = aco_opcode::image_sample_c_b;
         if (level_zero)
            opcode = aco_opcode::image_sample_c_lz;
         if (has_lod)
            opcode = aco_opcode::image_sample_c_l;
      } else {
         opcode = aco_opcode::image_sample;
         if (has_derivs)
            opcode = aco_opcode::image_sample_d;
         if (has_bias)
            opcode = aco_opcode::image_sample_b;
         if (level_zero)
            opcode = aco_opcode::image_sample_lz;
         if (has_lod)
            opcode = aco_opcode::image_sample_l;
      }
   }

   if (instr->op == nir_texop_tg4) {
      if (has_offset) { /* image_gather4_*_o */
         if (has_compare) {
            opcode = aco_opcode::image_gather4_c_lz_o;
            if (has_lod)
               opcode = aco_opcode::image_gather4_c_l_o;
            if (has_bias)
               opcode = aco_opcode::image_gather4_c_b_o;
         } else {
            opcode = aco_opcode::image_gather4_lz_o;
            if (has_lod)
               opcode = aco_opcode::image_gather4_l_o;
            if (has_bias)
               opcode = aco_opcode::image_gather4_b_o;
         }
      } else {
         if (has_compare) {
            opcode = aco_opcode::image_gather4_c_lz;
            if (has_lod)
               opcode = aco_opcode::image_gather4_c_l;
            if (has_bias)
               opcode = aco_opcode::image_gather4_c_b;
         } else {
            opcode = aco_opcode::image_gather4_lz;
            if (has_lod)
               opcode = aco_opcode::image_gather4_l;
            if (has_bias)
               opcode = aco_opcode::image_gather4_b;
         }
      }
   } else if (instr->op == nir_texop_lod) {
      opcode = aco_opcode::image_get_lod;
   }

   bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&
                          !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
                          instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;

   Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
   MIMG_instruction* tex = emit_mimg(bld, opcode, Definition(tmp_dst), resource, Operand(sampler),
                                     args, implicit_derivs ? wqm_mask : 0, vdata);
   tex->dim = dim;
   tex->dmask = dmask & 0xf;
   tex->da = da;
   tex->tfe = instr->is_sparse;

   if (tg4_integer_cube_workaround) {
      assert(tmp_dst.id() != dst.id());
      assert(tmp_dst.size() == dst.size());

      emit_split_vector(ctx, tmp_dst, tmp_dst.size());
      Temp val[4];
      for (unsigned i = 0; i < 4; i++) {
         val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
         Temp cvt_val;
         if (stype == GLSL_TYPE_UINT)
            cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
         else
            cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
         val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,
                           tg4_compare_cube_wa64);
      }

      Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());
      if (instr->is_sparse)
         tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
                              val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
      else
         tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
                              val[3]);
   }
   unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;
   expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
}

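/* Phi operands for undefined SSA defs become undef Operands; for logical
 * boolean phis, constants become inline lane-mask constants instead of
 * temporaries. */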
Operand
get_phi_operand(isel_context* ctx, nir_ssa_def* ssa, RegClass rc, bool logical)
{
   Temp tmp = get_ssa_temp(ctx, ssa);
   if (ssa->parent_instr->type == nir_instr_type_ssa_undef) {
      return Operand(rc);
   } else if (logical && ssa->bit_size == 1 &&
              ssa->parent_instr->type == nir_instr_type_load_const) {
      if (ctx->program->wave_size == 64)
         return Operand::c64(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX
                                                                                    : 0u);
      else
         return Operand::c32(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT32_MAX
                                                                                    : 0u);
   } else {
      return Operand(tmp);
   }
}

void
visit_phi(isel_context* ctx, nir_phi_instr* instr)
{
   aco_ptr<Pseudo_instruction> phi;
   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
   assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);

   bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest);
   logical |= (ctx->block->kind & block_kind_merge) != 0;
   aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;

   /* we want a sorted list of sources, since the predecessor list is also sorted */
   std::map<unsigned, nir_ssa_def*> phi_src;
   nir_foreach_phi_src (src, instr)
      phi_src[src->pred->index] = src->src.ssa;

   std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
   unsigned num_operands = 0;
   Operand* const operands = (Operand*)alloca(
      (std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand));
   unsigned num_defined = 0;
   unsigned cur_pred_idx = 0;
   for (std::pair<unsigned, nir_ssa_def*> src : phi_src) {
      if (cur_pred_idx < preds.size()) {
         /* handle missing preds (IF merges with discard/break) and extra preds
          * (loop exit with discard) */
         unsigned block = ctx->cf_info.nir_to_aco[src.first];
         unsigned skipped = 0;
         while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
            skipped++;
         if (cur_pred_idx + skipped < preds.size()) {
            for (unsigned i = 0; i < skipped; i++)
               operands[num_operands++] = Operand(dst.regClass());
            cur_pred_idx += skipped;
         } else {
            continue;
         }
      }
      /* Handle missing predecessors at the end. This shouldn't happen with loop
       * headers and we can't ignore these sources for loop header phis. */
      if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
         continue;
      cur_pred_idx++;
      Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
      operands[num_operands++] = op;
      num_defined += !op.isUndefined();
   }
   /* handle block_kind_continue_or_break at loop exit blocks */
   while (cur_pred_idx++ < preds.size())
      operands[num_operands++] = Operand(dst.regClass());

   /* If the loop ends with a break, still add a linear continue edge in case
    * that break is divergent or continue_or_break is used. We'll either remove
    * this operand later in visit_loop() if it's not necessary or replace the
    * undef with something correct. */
   if (!logical && ctx->block->kind & block_kind_loop_header) {
      nir_loop* loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
      nir_block* last = nir_loop_last_block(loop);
      if (last->successors[0] != instr->instr.block)
         operands[num_operands++] = Operand(RegClass());
   }

   /* we can use a linear phi in some cases if one src is undef */
   if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
      phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO,
                                                       num_operands, 1));

      Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
      Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]];
      assert(invert->kind & block_kind_invert);

      unsigned then_block = invert->linear_preds[0];

      Block* insert_block = NULL;
      for (unsigned i = 0; i < num_operands; i++) {
         Operand op = operands[i];
         if (op.isUndefined())
            continue;
         insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
         phi->operands[0] = op;
         break;
      }
      assert(insert_block); /* should be handled by the "num_defined == 0" case above */
      phi->operands[1] = Operand(dst.regClass());
      phi->definitions[0] = Definition(dst);
      insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
      return;
   }

   phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
   for (unsigned i = 0; i < num_operands; i++)
      phi->operands[i] = operands[i];
   phi->definitions[0] = Definition(dst);
   ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
}

void
visit_undef(isel_context* ctx, nir_ssa_undef_instr* instr)
{
   Temp dst = get_ssa_temp(ctx, &instr->def);

   assert(dst.type() == RegType::sgpr);

   if (dst.size() == 1) {
      Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
   } else {
      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
      for (unsigned i = 0; i < dst.size(); i++)
         vec->operands[i] = Operand::zero();
      vec->definitions[0] = Definition(dst);
      ctx->block->instructions.emplace_back(std::move(vec));
   }
}

void
begin_loop(isel_context* ctx, loop_context* lc)
{
   // TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true
   append_logical_end(ctx->block);
   ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
   Builder bld(ctx->program, ctx->block);
   bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
   unsigned loop_preheader_idx = ctx->block->index;

   lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));

   ctx->program->next_loop_depth++;

   Block* loop_header = ctx->program->create_and_insert_block();
   loop_header->kind |= block_kind_loop_header;
   add_edge(loop_preheader_idx, loop_header);
   ctx->block = loop_header;

   append_logical_start(ctx->block);

   lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);
   lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);
   lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
   lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
   lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
}

void
end_loop(isel_context* ctx, loop_context* lc)
{
   // TODO: what if a loop ends with an unconditional or uniformly branched continue
   //       and this branch is never taken?
   if (!ctx->cf_info.has_branch) {
      unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
      Builder bld(ctx->program, ctx->block);
      append_logical_end(ctx->block);

      if (ctx->cf_info.exec_potentially_empty_discard ||
          ctx->cf_info.exec_potentially_empty_break) {
         /* Discards can result in code running with an empty exec mask.
          * This would result in divergent breaks not ever being taken. As a
          * workaround, break the loop when the loop mask is empty instead of
          * always continuing. */
         ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
         unsigned block_idx = ctx->block->index;

         /* create helper blocks to avoid critical edges */
         Block* break_block = ctx->program->create_and_insert_block();
         break_block->kind = block_kind_uniform;
         bld.reset(break_block);
         bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
         add_linear_edge(block_idx, break_block);
         add_linear_edge(break_block->index, &lc->loop_exit);

         Block* continue_block = ctx->program->create_and_insert_block();
         continue_block->kind = block_kind_uniform;
         bld.reset(continue_block);
         bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
         add_linear_edge(block_idx, continue_block);
         add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);

         if (!ctx->cf_info.parent_loop.has_divergent_branch)
            add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
         ctx->block = &ctx->program->blocks[block_idx];
      } else {
         ctx->block->kind |= (block_kind_continue | block_kind_uniform);
         if (!ctx->cf_info.parent_loop.has_divergent_branch)
            add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
         else
            add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
      }

      bld.reset(ctx->block);
      bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
   }

   ctx->cf_info.has_branch = false;
   ctx->program->next_loop_depth--;

   // TODO: if the loop does not have a single exit, we must add one
   /* emit loop successor block */
   ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
   append_logical_start(ctx->block);

#if 0
   // TODO: check if it is beneficial to not branch on continues
   /* trim linear phis in loop header */
   for (auto&& instr : loop_entry->instructions) {
      if (instr->opcode == aco_opcode::p_linear_phi) {
         aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
         new_phi->definitions[0] = instr->definitions[0];
         for (unsigned i = 0; i < new_phi->operands.size(); i++)
            new_phi->operands[i] = instr->operands[i];
         /* check that the remaining operands are all the same */
         for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
            assert(instr->operands[i].tempId() == instr->operands.back().tempId());
         instr.swap(new_phi);
      } else if (instr->opcode == aco_opcode::p_phi) {
         continue;
      } else {
         break;
      }
   }
#endif

   ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;
   ctx->cf_info.parent_loop.exit = lc->exit_old;
   ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;
   ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;
   ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;
   if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
      ctx->cf_info.exec_potentially_empty_discard = false;
}

void
emit_loop_jump(isel_context* ctx, bool is_break)
{
   Builder bld(ctx->program, ctx->block);
   Block* logical_target;
   append_logical_end(ctx->block);
   unsigned idx = ctx->block->index;

   if (is_break) {
      logical_target = ctx->cf_info.parent_loop.exit;
      add_logical_edge(idx, logical_target);
      ctx->block->kind |= block_kind_break;

      if (!ctx->cf_info.parent_if.is_divergent &&
          !ctx->cf_info.parent_loop.has_divergent_continue) {
         /* uniform break - directly jump out of the loop */
         ctx->block->kind |= block_kind_uniform;
         ctx->cf_info.has_branch = true;
         bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
         add_linear_edge(idx, logical_target);
         return;
      }
      ctx->cf_info.parent_loop.has_divergent_branch = true;
   } else {
      logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
      add_logical_edge(idx, logical_target);
      ctx->block->kind |= block_kind_continue;

      if (!ctx->cf_info.parent_if.is_divergent) {
         /* uniform continue - directly jump to the loop header */
         ctx->block->kind |= block_kind_uniform;
         ctx->cf_info.has_branch = true;
         bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
         add_linear_edge(idx, logical_target);
         return;
      }

      /* for potential uniform breaks after this continue,
         we must ensure that they are handled correctly */
      ctx->cf_info.parent_loop.has_divergent_continue = true;
      ctx->cf_info.parent_loop.has_divergent_branch = true;
   }

   if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
      ctx->cf_info.exec_potentially_empty_break = true;
      ctx->cf_info.exec_potentially_empty_break_depth = ctx->block->loop_nest_depth;
   }

   /* remove critical edges from linear CFG */
   bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
   Block* break_block = ctx->program->create_and_insert_block();
   break_block->kind |= block_kind_uniform;
   add_linear_edge(idx, break_block);
   /* the loop_header pointer might be invalidated by this point */
   if (!is_break)
      logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
   add_linear_edge(break_block->index, logical_target);
   bld.reset(break_block);
   bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));

   Block* continue_block = ctx->program->create_and_insert_block();
   add_linear_edge(idx, continue_block);
   append_logical_start(continue_block);
   ctx->block = continue_block;
}

void
emit_loop_break(isel_context* ctx)
{
   emit_loop_jump(ctx, true);
}

void
emit_loop_continue(isel_context* ctx)
{
   emit_loop_jump(ctx, false);
}

void
visit_jump(isel_context* ctx, nir_jump_instr* instr)
{
   /* visit_block() would usually do this but divergent jumps update ctx->block */
   ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;

   switch (instr->type) {
   case nir_jump_break: emit_loop_break(ctx); break;
   case nir_jump_continue: emit_loop_continue(ctx); break;
   default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();
   }
}

void
visit_block(isel_context* ctx, nir_block* block)
{
   nir_foreach_instr (instr, block) {
      switch (instr->type) {
      case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
      case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
      case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
      case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
      case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;
      case nir_instr_type_ssa_undef: visit_undef(ctx, nir_instr_as_ssa_undef(instr)); break;
      case nir_instr_type_deref: break;
      case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
      default: isel_err(instr, "Unknown NIR instr type");
      }
   }

   if (!ctx->cf_info.parent_loop.has_divergent_branch)
      ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
}

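/* Builds linear phis through the loop body so the loop-header phi's continue
 * operand carries the value live at the final continue edge; vals[i] tracks
 * the value at block (first + i). */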
static Operand
create_continue_phis(isel_context* ctx, unsigned first, unsigned last,
                     aco_ptr<Instruction>& header_phi, Operand* vals)
{
   vals[0] = Operand(header_phi->definitions[0].getTemp());
   RegClass rc = vals[0].regClass();

   unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;

   unsigned next_pred = 1;

   for (unsigned idx = first + 1; idx <= last; idx++) {
      Block& block = ctx->program->blocks[idx];
      if (block.loop_nest_depth != loop_nest_depth) {
         vals[idx - first] = vals[idx - 1 - first];
         continue;
      }

      if ((block.kind & block_kind_continue) && block.index != last) {
         vals[idx - first] = header_phi->operands[next_pred];
         next_pred++;
         continue;
      }

      bool all_same = true;
      for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
         all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];

      Operand val;
      if (all_same) {
         val = vals[block.linear_preds[0] - first];
      } else {
         aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
            aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
         for (unsigned i = 0; i < block.linear_preds.size(); i++)
            phi->operands[i] = vals[block.linear_preds[i] - first];
         val = Operand(ctx->program->allocateTmp(rc));
         phi->definitions[0] = Definition(val.getTemp());
         block.instructions.emplace(block.instructions.begin(), std::move(phi));
      }
      vals[idx - first] = val;
   }

   return vals[last - first];
}

static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
static void begin_uniform_if_else(isel_context* ctx, if_context* ic);
static void end_uniform_if(isel_context* ctx, if_context* ic);

static void
visit_loop(isel_context* ctx, nir_loop* loop)
{
   loop_context lc;
   begin_loop(ctx, &lc);

   /* NIR seems to allow this, and even though the loop exit has no predecessors, SSA defs from the
    * loop header are live. Handle this without complicating the ACO IR by creating a dummy break.
    */
   if (nir_cf_node_cf_tree_next(&loop->cf_node)->predecessors->entries == 0) {
      Builder bld(ctx->program, ctx->block);
      Temp cond = bld.copy(bld.def(s1, scc), Operand::zero());
      if_context ic;
      begin_uniform_if_then(ctx, &ic, cond);
      emit_loop_break(ctx);
      begin_uniform_if_else(ctx, &ic);
      end_uniform_if(ctx, &ic);
   }

   bool unreachable = visit_cf_list(ctx, &loop->body);

   unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;

   /* Fixup phis in loop header from unreachable blocks.
    * has_branch/has_divergent_branch also indicates if the loop ends with a
    * break/continue instruction, but we don't emit those if unreachable=true */
   if (unreachable) {
      assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
      bool linear = ctx->cf_info.has_branch;
      bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
      for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
         if ((logical && instr->opcode == aco_opcode::p_phi) ||
             (linear && instr->opcode == aco_opcode::p_linear_phi)) {
            /* the last operand should be the one that needs to be removed */
            instr->operands.pop_back();
         } else if (!is_phi(instr)) {
            break;
         }
      }
   }

   /* Fixup linear phis in loop header from expecting a continue. This fixup
    * and the previous one shouldn't both happen at once because a break in the
    * merge block would get CSE'd */
   if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
      unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
      Operand* const vals = (Operand*)alloca(num_vals * sizeof(Operand));
      for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
         if (instr->opcode == aco_opcode::p_linear_phi) {
            if (ctx->cf_info.has_branch)
               instr->operands.pop_back();
            else
               instr->operands.back() =
                  create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
         } else if (!is_phi(instr)) {
            break;
         }
      }
   }

   end_loop(ctx, &lc);
}

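/* Divergent ifs get a logical and a linear block for each side, joined through
 * an invert block which flips exec; the extra linear blocks keep the linear
 * CFG free of critical edges. */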
static void
begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond)
{
   ic->cond = cond;

   append_logical_end(ctx->block);
   ctx->block->kind |= block_kind_branch;

   /* branch to linear then block */
   assert(cond.regClass() == ctx->program->lane_mask);
   aco_ptr<Pseudo_branch_instruction> branch;
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z,
                                                              Format::PSEUDO_BRANCH, 1, 1));
   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
   branch->definitions[0].setHint(vcc);
   branch->operands[0] = Operand(cond);
   ctx->block->instructions.push_back(std::move(branch));

   ic->BB_if_idx = ctx->block->index;
   ic->BB_invert = Block();
   /* Invert blocks are intentionally not marked as top level because they
    * are not part of the logical cfg. */
   ic->BB_invert.kind |= block_kind_invert;
   ic->BB_endif = Block();
   ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));

   ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
   ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
   ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
   ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
   ctx->cf_info.parent_if.is_divergent = true;

   /* divergent branches use cbranch_execz */
   ctx->cf_info.exec_potentially_empty_discard = false;
   ctx->cf_info.exec_potentially_empty_break = false;
   ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;

   /** emit logical then block */
   ctx->program->next_divergent_if_logical_depth++;
   Block* BB_then_logical = ctx->program->create_and_insert_block();
   add_edge(ic->BB_if_idx, BB_then_logical);
   ctx->block = BB_then_logical;
   append_logical_start(BB_then_logical);
}

static void
begin_divergent_if_else(isel_context* ctx, if_context* ic)
{
   Block* BB_then_logical = ctx->block;
   append_logical_end(BB_then_logical);
   /* branch from logical then block to invert block */
   aco_ptr<Pseudo_branch_instruction> branch;
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
                                                              Format::PSEUDO_BRANCH, 0, 1));
   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
   branch->definitions[0].setHint(vcc);
   BB_then_logical->instructions.emplace_back(std::move(branch));
   add_linear_edge(BB_then_logical->index, &ic->BB_invert);
   if (!ctx->cf_info.parent_loop.has_divergent_branch)
      add_logical_edge(BB_then_logical->index, &ic->BB_endif);
   BB_then_logical->kind |= block_kind_uniform;
   assert(!ctx->cf_info.has_branch);
   ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
   ctx->cf_info.parent_loop.has_divergent_branch = false;
   ctx->program->next_divergent_if_logical_depth--;

   /** emit linear then block */
   Block* BB_then_linear = ctx->program->create_and_insert_block();
   BB_then_linear->kind |= block_kind_uniform;
   add_linear_edge(ic->BB_if_idx, BB_then_linear);
   /* branch from linear then block to invert block */
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
                                                              Format::PSEUDO_BRANCH, 0, 1));
   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
   branch->definitions[0].setHint(vcc);
   BB_then_linear->instructions.emplace_back(std::move(branch));
   add_linear_edge(BB_then_linear->index, &ic->BB_invert);

   /** emit invert merge block */
   ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
   ic->invert_idx = ctx->block->index;

   /* branch to linear else block (skip else) */
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
                                                              Format::PSEUDO_BRANCH, 0, 1));
   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
   branch->definitions[0].setHint(vcc);
   ctx->block->instructions.push_back(std::move(branch));

   ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
   ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
   ic->exec_potentially_empty_break_depth_old = std::min(
      ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
   /* divergent branches use cbranch_execz */
   ctx->cf_info.exec_potentially_empty_discard = false;
   ctx->cf_info.exec_potentially_empty_break = false;
   ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;

   /** emit logical else block */
   ctx->program->next_divergent_if_logical_depth++;
   Block* BB_else_logical = ctx->program->create_and_insert_block();
   add_logical_edge(ic->BB_if_idx, BB_else_logical);
   add_linear_edge(ic->invert_idx, BB_else_logical);
   ctx->block = BB_else_logical;
   append_logical_start(BB_else_logical);
}

static void
end_divergent_if(isel_context* ctx, if_context* ic)
{
   Block* BB_else_logical = ctx->block;
   append_logical_end(BB_else_logical);

   /* branch from logical else block to endif block */
   aco_ptr<Pseudo_branch_instruction> branch;
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
                                                              Format::PSEUDO_BRANCH, 0, 1));
   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
   branch->definitions[0].setHint(vcc);
   BB_else_logical->instructions.emplace_back(std::move(branch));
   add_linear_edge(BB_else_logical->index, &ic->BB_endif);
   if (!ctx->cf_info.parent_loop.has_divergent_branch)
      add_logical_edge(BB_else_logical->index, &ic->BB_endif);
   BB_else_logical->kind |= block_kind_uniform;
   ctx->program->next_divergent_if_logical_depth--;

   assert(!ctx->cf_info.has_branch);
   ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;

   /** emit linear else block */
   Block* BB_else_linear = ctx->program->create_and_insert_block();
   BB_else_linear->kind |= block_kind_uniform;
   add_linear_edge(ic->invert_idx, BB_else_linear);

   /* branch from linear else block to endif block */
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
                                                              Format::PSEUDO_BRANCH, 0, 1));
   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
   branch->definitions[0].setHint(vcc);
   BB_else_linear->instructions.emplace_back(std::move(branch));
   add_linear_edge(BB_else_linear->index, &ic->BB_endif);

   /** emit endif merge block */
   ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
   append_logical_start(ctx->block);

   ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
   ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
   ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
   ctx->cf_info.exec_potentially_empty_break_depth = std::min(
      ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
   if (ctx->block->loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
       !ctx->cf_info.parent_if.is_divergent) {
      ctx->cf_info.exec_potentially_empty_break = false;
      ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
   }
   /* uniform control flow never has an empty exec-mask */
   if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
      ctx->cf_info.exec_potentially_empty_discard = false;
      ctx->cf_info.exec_potentially_empty_break = false;
      ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
   }
}

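/* Uniform ifs branch on SCC and need no exec-mask manipulation, just a plain
 * two-sided CFG. */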
10450 static void
begin_uniform_if_then(isel_context * ctx,if_context * ic,Temp cond)10451 begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
10452 {
10453    assert(cond.regClass() == s1);
10454 
10455    append_logical_end(ctx->block);
10456    ctx->block->kind |= block_kind_uniform;
10457 
10458    aco_ptr<Pseudo_branch_instruction> branch;
10459    aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
10460    branch.reset(
10461       create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));
10462    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10463    branch->definitions[0].setHint(vcc);
10464    branch->operands[0] = Operand(cond);
10465    branch->operands[0].setFixed(scc);
10466    ctx->block->instructions.emplace_back(std::move(branch));
10467 
10468    ic->BB_if_idx = ctx->block->index;
10469    ic->BB_endif = Block();
10470    ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
10471 
10472    ctx->cf_info.has_branch = false;
10473    ctx->cf_info.parent_loop.has_divergent_branch = false;
10474 
10475    /** emit then block */
10476    ctx->program->next_uniform_if_depth++;
10477    Block* BB_then = ctx->program->create_and_insert_block();
10478    add_edge(ic->BB_if_idx, BB_then);
10479    append_logical_start(BB_then);
10480    ctx->block = BB_then;
10481 }
10482 
10483 static void
begin_uniform_if_else(isel_context * ctx,if_context * ic)10484 begin_uniform_if_else(isel_context* ctx, if_context* ic)
10485 {
10486    Block* BB_then = ctx->block;
10487 
10488    ic->uniform_has_then_branch = ctx->cf_info.has_branch;
10489    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10490 
10491    if (!ic->uniform_has_then_branch) {
10492       append_logical_end(BB_then);
10493       /* branch from then block to endif block */
10494       aco_ptr<Pseudo_branch_instruction> branch;
10495       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10496                                                                  Format::PSEUDO_BRANCH, 0, 1));
10497       branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10498       branch->definitions[0].setHint(vcc);
10499       BB_then->instructions.emplace_back(std::move(branch));
10500       add_linear_edge(BB_then->index, &ic->BB_endif);
10501       if (!ic->then_branch_divergent)
10502          add_logical_edge(BB_then->index, &ic->BB_endif);
10503       BB_then->kind |= block_kind_uniform;
10504    }
10505 
10506    ctx->cf_info.has_branch = false;
10507    ctx->cf_info.parent_loop.has_divergent_branch = false;
10508 
10509    /** emit else block */
10510    Block* BB_else = ctx->program->create_and_insert_block();
10511    add_edge(ic->BB_if_idx, BB_else);
10512    append_logical_start(BB_else);
10513    ctx->block = BB_else;
10514 }
10515 
10516 static void
10517 end_uniform_if(isel_context* ctx, if_context* ic)
10518 {
10519    Block* BB_else = ctx->block;
10520 
10521    if (!ctx->cf_info.has_branch) {
10522       append_logical_end(BB_else);
10523       /* branch from else block to endif block */
10524       aco_ptr<Pseudo_branch_instruction> branch;
10525       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10526                                                                  Format::PSEUDO_BRANCH, 0, 1));
10527       branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10528       branch->definitions[0].setHint(vcc);
10529       BB_else->instructions.emplace_back(std::move(branch));
10530       add_linear_edge(BB_else->index, &ic->BB_endif);
10531       if (!ctx->cf_info.parent_loop.has_divergent_branch)
10532          add_logical_edge(BB_else->index, &ic->BB_endif);
10533       BB_else->kind |= block_kind_uniform;
10534    }
10535 
10536    ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
10537    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10538 
10539    /** emit endif merge block */
10540    ctx->program->next_uniform_if_depth--;
10541    if (!ctx->cf_info.has_branch) {
10542       ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10543       append_logical_start(ctx->block);
10544    }
10545 }
10546 
10547 static bool
10548 visit_if(isel_context* ctx, nir_if* if_stmt)
10549 {
10550    Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
10551    Builder bld(ctx->program, ctx->block);
10552    aco_ptr<Pseudo_branch_instruction> branch;
10553    if_context ic;
10554 
10555    if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
10556       /**
10557        * Uniform conditionals are represented in the following way*) :
10558        *
10559        * The linear and logical CFG:
10560        *                        BB_IF
10561        *                        /    \
10562        *       BB_THEN (logical)      BB_ELSE (logical)
10563        *                        \    /
10564        *                        BB_ENDIF
10565        *
10566        * *) Exceptions may be due to break and continue statements within loops
10567        *    If a break/continue happens within uniform control flow, it branches
10568        *    to the loop exit/entry block. Otherwise, it branches to the next
10569        *    merge block.
10570        **/
10571 
10572       assert(cond.regClass() == ctx->program->lane_mask);
10573       cond = bool_to_scalar_condition(ctx, cond);
10574 
10575       begin_uniform_if_then(ctx, &ic, cond);
10576       visit_cf_list(ctx, &if_stmt->then_list);
10577 
10578       begin_uniform_if_else(ctx, &ic);
10579       visit_cf_list(ctx, &if_stmt->else_list);
10580 
10581       end_uniform_if(ctx, &ic);
10582    } else { /* non-uniform condition */
10583       /**
10584        * To maintain a logical and linear CFG without critical edges,
10585        * non-uniform conditionals are represented in the following way*) :
10586        *
10587        * The linear CFG:
10588        *                        BB_IF
10589        *                        /    \
10590        *       BB_THEN (logical)      BB_THEN (linear)
10591        *                        \    /
10592        *                        BB_INVERT (linear)
10593        *                        /    \
10594        *       BB_ELSE (logical)      BB_ELSE (linear)
10595        *                        \    /
10596        *                        BB_ENDIF
10597        *
10598        * The logical CFG:
10599        *                        BB_IF
10600        *                        /    \
10601        *       BB_THEN (logical)      BB_ELSE (logical)
10602        *                        \    /
10603        *                        BB_ENDIF
10604        *
10605        * *) Exceptions may be due to break and continue statements within loops
10606        **/
10607 
10608       begin_divergent_if_then(ctx, &ic, cond);
10609       visit_cf_list(ctx, &if_stmt->then_list);
10610 
10611       begin_divergent_if_else(ctx, &ic);
10612       visit_cf_list(ctx, &if_stmt->else_list);
10613 
10614       end_divergent_if(ctx, &ic);
10615    }
10616 
10617    return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
10618 }
10619 
10620 static bool
10621 visit_cf_list(isel_context* ctx, struct exec_list* list)
10622 {
10623    foreach_list_typed (nir_cf_node, node, node, list) {
10624       switch (node->type) {
10625       case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
10626       case nir_cf_node_if:
10627          if (!visit_if(ctx, nir_cf_node_as_if(node)))
10628             return true;
10629          break;
10630       case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
10631       default: unreachable("unimplemented cf list type");
10632       }
10633    }
10634    return false;
10635 }
10636 
10637 static void
10638 export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos)
10639 {
10640    assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG);
10641 
10642    int offset = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
10643                    ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
10644                    : ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
10645    unsigned mask = ctx->outputs.mask[slot];
10646    if (!is_pos && !mask)
10647       return;
10648    if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
10649       return;
10650    aco_ptr<Export_instruction> exp{
10651       create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
10652    exp->enabled_mask = mask;
10653    for (unsigned i = 0; i < 4; ++i) {
10654       if (mask & (1 << i))
10655          exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10656       else
10657          exp->operands[i] = Operand(v1);
10658    }
10659    /* GFX10 (Navi1x) skips POS0 exports if EXEC=0 and DONE=0, causing a hang.
10660     * Setting valid_mask=1 prevents it and has no other effect.
10661     */
10662    exp->valid_mask = ctx->options->chip_class == GFX10 && is_pos && *next_pos == 0;
10663    exp->done = false;
10664    exp->compressed = false;
10665    if (is_pos)
10666       exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
10667    else
10668       exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
10669    ctx->block->instructions.emplace_back(std::move(exp));
10670 }
10671 
10672 static void
10673 export_vs_psiz_layer_viewport_vrs(isel_context* ctx, int* next_pos)
10674 {
10675    aco_ptr<Export_instruction> exp{
10676       create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
10677    exp->enabled_mask = 0;
10678    for (unsigned i = 0; i < 4; ++i)
10679       exp->operands[i] = Operand(v1);
10680    if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) {
10681       exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]);
10682       exp->enabled_mask |= 0x1;
10683    }
10684    if (ctx->outputs.mask[VARYING_SLOT_LAYER]) {
10685       exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]);
10686       exp->enabled_mask |= 0x4;
10687    }
10688    if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) {
10689       if (ctx->options->chip_class < GFX9) {
10690          exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]);
10691          exp->enabled_mask |= 0x8;
10692       } else {
10693          Builder bld(ctx->program, ctx->block);
10694 
10695          Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u),
10696                              Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]));
10697          if (exp->operands[2].isTemp())
10698             out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
10699 
10700          exp->operands[2] = Operand(out);
10701          exp->enabled_mask |= 0x4;
10702       }
10703    }
10704    if (ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_SHADING_RATE]) {
10705       exp->operands[1] = Operand(ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_SHADING_RATE * 4u]);
10706       exp->enabled_mask |= 0x2;
10707    } else if (ctx->options->force_vrs_rates) {
10708       /* Bits [2:3] = VRS rate X
10709        * Bits [4:5] = VRS rate Y
10710        *
10711        * The range is [-2, 1]. Values:
10712        *   1: 2x coarser shading rate in that direction.
10713        *   0: normal shading rate
10714        *  -1: 2x finer shading rate (sample shading, not directional)
10715        *  -2: 4x finer shading rate (sample shading, not directional)
10716        *
10717        * Sample shading can't go above 8 samples, so both numbers can't be -2
10718        * at the same time.
10719        */
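      /* For example, force_vrs_rates == ((1 << 2) | (1 << 4)) = 0x14 requests
       * 2x2 coarse shading (rate X = 1, rate Y = 1).
       */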
10720       Builder bld(ctx->program, ctx->block);
10721       Temp rates = bld.copy(bld.def(v1), Operand::c32((unsigned)ctx->options->force_vrs_rates));
10722 
10723       /* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */
10724       Temp cond = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), Operand::c32(0x3f800000u),
10725                            Operand(ctx->outputs.temps[VARYING_SLOT_POS + 3]));
10726       rates = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10727                        bld.copy(bld.def(v1), Operand::zero()), rates, cond);
10728 
10729       exp->operands[1] = Operand(rates);
10730       exp->enabled_mask |= 0x2;
10731    }
10732 
10733    exp->valid_mask = ctx->options->chip_class == GFX10 && *next_pos == 0;
10734    exp->done = false;
10735    exp->compressed = false;
10736    exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
10737    ctx->block->instructions.emplace_back(std::move(exp));
10738 }
10739 
10740 static void
10741 create_vs_exports(isel_context* ctx)
10742 {
10743    assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG);
10744 
10745    const radv_vs_output_info* outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
10746                                         ? &ctx->program->info->tes.outinfo
10747                                         : &ctx->program->info->vs.outinfo;
10748 
10749    ctx->block->kind |= block_kind_export_end;
10750 
10751    if (outinfo->export_prim_id && ctx->stage.hw != HWStage::NGG) {
10752       ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
10753       if (ctx->stage.has(SWStage::TES))
10754          ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] =
10755             get_arg(ctx, ctx->args->ac.tes_patch_id);
10756       else
10757          ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] =
10758             get_arg(ctx, ctx->args->ac.vs_prim_id);
10759    }
10760 
10761    if (ctx->options->key.has_multiview_view_index) {
10762       ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1;
10763       ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] =
10764          as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
10765    }
10766 
10767    /* Hardware requires position data to always be exported, even if the
10768     * application did not write gl_Position.
10769     */
10770    ctx->outputs.mask[VARYING_SLOT_POS] = 0xf;
10771 
10772    /* the order these position exports are created is important */
10773    int next_pos = 0;
10774    export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
10775 
10776    bool writes_primitive_shading_rate =
10777       outinfo->writes_primitive_shading_rate || ctx->options->force_vrs_rates;
10778    if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index ||
10779        writes_primitive_shading_rate) {
10780       export_vs_psiz_layer_viewport_vrs(ctx, &next_pos);
10781    }
10782    if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
10783       export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
10784    if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
10785       export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
10786 
10787    if (ctx->export_clip_dists) {
10788       if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
10789          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
10790       if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
10791          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
10792    }
10793 
10794    for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
10795       if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER && i != VARYING_SLOT_PRIMITIVE_ID &&
10796           i != VARYING_SLOT_VIEWPORT)
10797          continue;
10798 
10799       export_vs_varying(ctx, i, false, NULL);
10800    }
10801 }
10802 
10803 static bool
10804 export_fs_mrt_z(isel_context* ctx)
10805 {
10806    Builder bld(ctx->program, ctx->block);
10807    unsigned enabled_channels = 0;
10808    bool compr = false;
10809    Operand values[4];
10810 
10811    for (unsigned i = 0; i < 4; ++i) {
10812       values[i] = Operand(v1);
10813    }
10814 
10815    /* Both stencil and sample mask only need 16-bits. */
10816    if (!ctx->program->info->ps.writes_z &&
10817        (ctx->program->info->ps.writes_stencil || ctx->program->info->ps.writes_sample_mask)) {
10818       compr = true; /* COMPR flag */
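      /* In a compressed export each operand holds two 16-bit channels, so
       * enabled_channels is set in pairs: 0x3 covers the first operand and
       * 0xc the second.
       */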
10819 
10820       if (ctx->program->info->ps.writes_stencil) {
10821          /* Stencil should be in X[23:16]. */
10822          values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10823          values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), values[0]);
10824          enabled_channels |= 0x3;
10825       }
10826 
10827       if (ctx->program->info->ps.writes_sample_mask) {
10828          /* SampleMask should be in Y[15:0]. */
10829          values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10830          enabled_channels |= 0xc;
10831       }
10832    } else {
10833       if (ctx->program->info->ps.writes_z) {
10834          values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]);
10835          enabled_channels |= 0x1;
10836       }
10837 
10838       if (ctx->program->info->ps.writes_stencil) {
10839          values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10840          enabled_channels |= 0x2;
10841       }
10842 
10843       if (ctx->program->info->ps.writes_sample_mask) {
10844          values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10845          enabled_channels |= 0x4;
10846       }
10847    }
10848 
10849    /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X
10850     * writemask component.
10851     */
10852    if (ctx->options->chip_class == GFX6 && ctx->options->family != CHIP_OLAND &&
10853        ctx->options->family != CHIP_HAINAN) {
10854       enabled_channels |= 0x1;
10855    }
10856 
10857    bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels,
10858            V_008DFC_SQ_EXP_MRTZ, compr);
10859 
10860    return true;
10861 }
10862 
10863 static bool
10864 export_fs_mrt_color(isel_context* ctx, int slot)
10865 {
10866    Builder bld(ctx->program, ctx->block);
10867    unsigned write_mask = ctx->outputs.mask[slot];
10868    Operand values[4];
10869 
10870    for (unsigned i = 0; i < 4; ++i) {
10871       if (write_mask & (1 << i)) {
10872          values[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10873       } else {
10874          values[i] = Operand(v1);
10875       }
10876    }
10877 
10878    unsigned target, col_format;
10879    unsigned enabled_channels = 0;
10880    aco_opcode compr_op = (aco_opcode)0;
10881    bool compr = false;
10882 
10883    slot -= FRAG_RESULT_DATA0;
10884    target = V_008DFC_SQ_EXP_MRT + slot;
10885    col_format = (ctx->options->key.ps.col_format >> (4 * slot)) & 0xf;
10886 
10887    bool is_int8 = (ctx->options->key.ps.is_int8 >> slot) & 1;
10888    bool is_int10 = (ctx->options->key.ps.is_int10 >> slot) & 1;
10889    bool is_16bit = values[0].regClass() == v2b;
10890 
10891    /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10892    if (ctx->options->enable_mrt_output_nan_fixup && !is_16bit &&
10893        (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
10894         col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
10895         col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10896       for (int i = 0; i < 4; i++) {
10897          if (!(write_mask & (1 << i)))
10898             continue;
10899 
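         /* v_cmp_class_f32 with a class mask of 3 (bits 0-1) matches both
          * signaling and quiet NaNs; the cndmask then replaces NaN lanes
          * with zero.
          */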
10900          Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
10901                                values[i], bld.copy(bld.def(v1), Operand::c32(3u)));
10902          values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
10903                               bld.copy(bld.def(v1), Operand::zero()), isnan);
10904       }
10905    }
10906 
10907    switch (col_format) {
10908    case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
10909 
10910    case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break;
10911 
10912    case V_028714_SPI_SHADER_32_AR:
10913       if (ctx->options->chip_class >= GFX10) {
10914          /* Special case: on GFX10, the outputs are different for 32_AR */
10915          enabled_channels = 0x3;
10916          values[1] = values[3];
10917          values[3] = Operand(v1);
10918       } else {
10919          enabled_channels = 0x9;
10920       }
10921       break;
10922 
10923    case V_028714_SPI_SHADER_FP16_ABGR:
10924       for (int i = 0; i < 2; i++) {
10925          bool enabled = (write_mask >> (i * 2)) & 0x3;
10926          if (enabled) {
10927             enabled_channels |= 0x3 << (i * 2);
10928             if (is_16bit) {
10929                values[i] =
10930                   bld.pseudo(aco_opcode::p_create_vector, bld.def(v1),
10931                              values[i * 2].isUndefined() ? Operand(v2b) : values[i * 2],
10932                              values[i * 2 + 1].isUndefined() ? Operand(v2b) : values[i * 2 + 1]);
10933             } else if (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9) {
10934                values[i] =
10935                   bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1),
10936                            values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
10937                            values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
10938             } else {
10939                values[i] =
10940                   bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1),
10941                            values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2],
10942                            values[i * 2 + 1].isUndefined() ? values[i * 2] : values[i * 2 + 1]);
10943             }
10944          } else {
10945             values[i] = Operand(v1);
10946          }
10947       }
10948       values[2] = Operand(v1);
10949       values[3] = Operand(v1);
10950       compr = true;
10951       break;
10952 
10953    case V_028714_SPI_SHADER_UNORM16_ABGR:
10954       if (is_16bit && ctx->options->chip_class >= GFX9) {
10955          compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
10956       } else {
10957          compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
10958       }
10959       break;
10960 
10961    case V_028714_SPI_SHADER_SNORM16_ABGR:
10962       if (is_16bit && ctx->options->chip_class >= GFX9) {
10963          compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
10964       } else {
10965          compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
10966       }
10967       break;
10968 
10969    case V_028714_SPI_SHADER_UINT16_ABGR: {
10970       compr_op = aco_opcode::v_cvt_pk_u16_u32;
10971       if (is_int8 || is_int10) {
10972          /* clamp */
10973          uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
10974          Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));
10975 
10976          for (unsigned i = 0; i < 4; i++) {
10977             if ((write_mask >> i) & 1) {
10978                values[i] =
10979                   bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
10980                            i == 3 && is_int10 ? Operand::c32(3u) : Operand(max_rgb_val), values[i]);
10981             }
10982          }
10983       } else if (is_16bit) {
10984          for (unsigned i = 0; i < 4; i++) {
10985             if ((write_mask >> i) & 1) {
10986                Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
10987                values[i] = Operand(tmp);
10988             }
10989          }
10990       }
10991       break;
10992    }
10993 
10994    case V_028714_SPI_SHADER_SINT16_ABGR:
10995       compr_op = aco_opcode::v_cvt_pk_i16_i32;
10996       if (is_int8 || is_int10) {
10997          /* clamp */
10998          uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
10999          uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
11000          Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));
11001          Temp min_rgb_val = bld.copy(bld.def(s1), Operand::c32(min_rgb));
11002 
11003          for (unsigned i = 0; i < 4; i++) {
11004             if ((write_mask >> i) & 1) {
11005                values[i] =
11006                   bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
11007                            i == 3 && is_int10 ? Operand::c32(1u) : Operand(max_rgb_val), values[i]);
11008                values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
11009                                     i == 3 && is_int10 ? Operand::c32(-2u) : Operand(min_rgb_val),
11010                                     values[i]);
11011             }
11012          }
11013       } else if (is_16bit) {
11014          for (unsigned i = 0; i < 4; i++) {
11015             if ((write_mask >> i) & 1) {
11016                Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
11017                values[i] = Operand(tmp);
11018             }
11019          }
11020       }
11021       break;
11022 
11023    case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
11024 
11025    case V_028714_SPI_SHADER_ZERO:
11026    default: return false;
11027    }
11028 
11029    if ((bool)compr_op) {
11030       for (int i = 0; i < 2; i++) {
11031          /* check if at least one of the values to be compressed is enabled */
11032          bool enabled = (write_mask >> (i * 2)) & 0x3;
11033          if (enabled) {
11034             enabled_channels |= 0x3 << (i * 2);
11035             values[i] = bld.vop3(
11036                compr_op, bld.def(v1), values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
11037                values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
11038          } else {
11039             values[i] = Operand(v1);
11040          }
11041       }
11042       values[2] = Operand(v1);
11043       values[3] = Operand(v1);
11044       compr = true;
11045    } else if (!compr) {
11046       for (int i = 0; i < 4; i++)
11047          values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
11048    }
11049 
11050    bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, target,
11051            compr);
11052    return true;
11053 }
11054 
11055 static void
11056 create_fs_null_export(isel_context* ctx)
11057 {
11058    /* The hardware requires an FS to always have at least one export,
11059     * so when the shader wrote none we add a null export.
11060     */
11061 
11062    Builder bld(ctx->program, ctx->block);
11063    unsigned dest = V_008DFC_SQ_EXP_NULL;
11064    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
11065            /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true);
11066 }
11067 
11068 static void
11069 create_fs_exports(isel_context* ctx)
11070 {
11071    bool exported = false;
11072 
11073    /* Export depth, stencil and sample mask. */
11074    if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || ctx->outputs.mask[FRAG_RESULT_STENCIL] ||
11075        ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
11076       exported |= export_fs_mrt_z(ctx);
11077 
11078    /* Export all color render targets. */
11079    for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i)
11080       if (ctx->outputs.mask[i])
11081          exported |= export_fs_mrt_color(ctx, i);
11082 
11083    if (!exported)
11084       create_fs_null_export(ctx);
11085 
11086    ctx->block->kind |= block_kind_export_end;
11087 }
11088 
11089 static void
11090 create_workgroup_barrier(Builder& bld)
11091 {
11092    bld.barrier(aco_opcode::p_barrier,
11093                memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup), scope_workgroup);
11094 }
11095 
11096 static void
11097 emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_write_offset,
11098                    const struct radv_stream_output* output)
11099 {
11100    unsigned num_comps = util_bitcount(output->component_mask);
11101    unsigned writemask = (1 << num_comps) - 1;
11102    unsigned loc = output->location;
11103    unsigned buf = output->buffer;
11104 
11105    assert(num_comps && num_comps <= 4);
11106    if (!num_comps || num_comps > 4)
11107       return;
11108 
11109    unsigned first_comp = ffs(output->component_mask) - 1;
11110 
11111    Temp out[4];
11112    bool all_undef = true;
11113    assert(ctx->stage.hw == HWStage::VS);
11114    for (unsigned i = 0; i < num_comps; i++) {
11115       out[i] = ctx->outputs.temps[loc * 4 + first_comp + i];
11116       all_undef = all_undef && !out[i].id();
11117    }
11118    if (all_undef)
11119       return;
11120 
11121    while (writemask) {
11122       int start, count;
11123       u_bit_scan_consecutive_range(&writemask, &start, &count);
11124       if (count == 3 && ctx->options->chip_class == GFX6) {
11125          /* GFX6 doesn't support storing vec3, split it. */
11126          writemask |= 1u << (start + 2);
11127          count = 2;
11128       }
11129 
11130       unsigned offset = output->offset + start * 4;
11131 
11132       Temp write_data = ctx->program->allocateTmp(RegClass(RegType::vgpr, count));
11133       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
11134          aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
11135       for (int i = 0; i < count; ++i)
11136          vec->operands[i] =
11137             (ctx->outputs.mask[loc] & 1 << (start + first_comp + i)) ? Operand(out[start + i]) : Operand::zero();
11138       vec->definitions[0] = Definition(write_data);
11139       ctx->block->instructions.emplace_back(std::move(vec));
11140 
11141       aco_opcode opcode;
11142       switch (count) {
11143       case 1: opcode = aco_opcode::buffer_store_dword; break;
11144       case 2: opcode = aco_opcode::buffer_store_dwordx2; break;
11145       case 3: opcode = aco_opcode::buffer_store_dwordx3; break;
11146       case 4: opcode = aco_opcode::buffer_store_dwordx4; break;
11147       default: unreachable("Unsupported dword count.");
11148       }
11149 
11150       aco_ptr<MUBUF_instruction> store{
11151          create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
11152       store->operands[0] = Operand(so_buffers[buf]);
11153       store->operands[1] = Operand(so_write_offset[buf]);
11154       store->operands[2] = Operand::c32(0);
11155       store->operands[3] = Operand(write_data);
11156       if (offset > 4095) {
11157          /* The MUBUF immediate offset field is only 12 bits, so fold larger
             * offsets into the vector offset. This doesn't seem reachable in
             * RADV, but maybe in GL? It's easy to handle anyway. */
11158          Builder bld(ctx->program, ctx->block);
11159          store->operands[1] =
11160             bld.vadd32(bld.def(v1), Operand::c32(offset), Operand(so_write_offset[buf]));
11161       } else {
11162          store->offset = offset;
11163       }
11164       store->offen = true;
11165       store->glc = true;
11166       store->dlc = false;
11167       store->slc = true;
11168       ctx->block->instructions.emplace_back(std::move(store));
11169    }
11170 }
11171 
11172 static void
11173 emit_streamout(isel_context* ctx, unsigned stream)
11174 {
11175    Builder bld(ctx->program, ctx->block);
11176 
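   /* streamout_config[22:16] holds the streamout vertex count; s_bfe_u32 with
    * 0x70010 (width 7, offset 16) extracts it, and only threads below that
    * count emit outputs.
    */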
11177    Temp so_vtx_count =
11178       bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11179                get_arg(ctx, ctx->args->ac.streamout_config), Operand::c32(0x70010u));
11180 
11181    Temp tid = emit_mbcnt(ctx, bld.tmp(v1));
11182 
11183    Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);
11184 
11185    if_context ic;
11186    begin_divergent_if_then(ctx, &ic, can_emit);
11187 
11188    bld.reset(ctx->block);
11189 
11190    Temp so_write_index =
11191       bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid);
11192 
11193    Temp so_buffers[4];
11194    Temp so_write_offset[4];
11195    Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
11196 
11197    for (unsigned i = 0; i < 4; i++) {
11198       unsigned stride = ctx->program->info->so.strides[i];
11199       if (!stride)
11200          continue;
11201 
11202       so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr,
11203                                bld.copy(bld.def(s1), Operand::c32(i * 16u)));
11204 
11205       if (stride == 1) {
11206          Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
11207                                 get_arg(ctx, ctx->args->ac.streamout_write_index),
11208                                 get_arg(ctx, ctx->args->ac.streamout_offset[i]));
11209          Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
11210 
11211          so_write_offset[i] =
11212             bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), new_offset);
11213       } else {
11214          Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
11215          Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(4u),
11216                                  get_arg(ctx, ctx->args->ac.streamout_offset[i]));
11217          so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
11218       }
11219    }
11220 
11221    for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
11222       const struct radv_stream_output* output = &ctx->program->info->so.outputs[i];
11223       if (stream != output->stream)
11224          continue;
11225 
11226       emit_stream_output(ctx, so_buffers, so_write_offset, output);
11227    }
11228 
11229    begin_divergent_if_else(ctx, &ic);
11230    end_divergent_if(ctx, &ic);
11231 }
11232 
11233 Pseudo_instruction*
11234 add_startpgm(struct isel_context* ctx)
11235 {
11236    aco_ptr<Pseudo_instruction> startpgm{
11237       create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, ctx->args->ac.arg_count)};
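   /* Each non-skipped argument becomes a p_startpgm definition fixed to its
    * ABI register. In ACO's PhysReg numbering SGPRs occupy 0..255 and VGPRs
    * start at 256, hence "reg + 256" for VGPR arguments below.
    */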
11238    for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) {
11239       if (ctx->args->ac.args[i].skip)
11240          continue;
11241 
11242       enum ac_arg_regfile file = ctx->args->ac.args[i].file;
11243       unsigned size = ctx->args->ac.args[i].size;
11244       unsigned reg = ctx->args->ac.args[i].offset;
11245       RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11246       Temp dst = ctx->program->allocateTmp(type);
11247       ctx->arg_temps[i] = dst;
11248       startpgm->definitions[arg] = Definition(dst);
11249       startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11250       arg++;
11251    }
11252    Pseudo_instruction* instr = startpgm.get();
11253    ctx->block->instructions.push_back(std::move(startpgm));
11254 
11255    /* Stash these in the program so that they can be accessed later when
11256     * handling spilling.
11257     */
11258    ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
11259    ctx->program->scratch_offset = get_arg(ctx, ctx->args->ac.scratch_offset);
11260 
11261    if (ctx->stage.has(SWStage::VS) && ctx->program->info->vs.dynamic_inputs) {
11262       unsigned num_attributes = util_last_bit(ctx->program->info->vs.vb_desc_usage_mask);
11263       for (unsigned i = 0; i < num_attributes; i++) {
11264          Definition def(get_arg(ctx, ctx->args->vs_inputs[i]));
11265 
11266          unsigned idx = ctx->args->vs_inputs[i].arg_index;
11267          def.setFixed(PhysReg(256 + ctx->args->ac.args[idx].offset));
11268 
11269          ctx->program->vs_inputs.push_back(def);
11270       }
11271    }
11272 
11273    return instr;
11274 }
11275 
11276 void
11277 fix_ls_vgpr_init_bug(isel_context* ctx, Pseudo_instruction* startpgm)
11278 {
11279    assert(ctx->shader->info.stage == MESA_SHADER_VERTEX);
11280    Builder bld(ctx->program, ctx->block);
11281    constexpr unsigned hs_idx = 1u;
11282    Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11283                                               get_arg(ctx, ctx->args->ac.merged_wave_info),
11284                                               Operand::c32((8u << 16) | (hs_idx * 8u)));
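   /* merged_wave_info packs one byte of thread count per stage; the s_bfe
    * above extracts byte 1 (the HS thread count), and its SCC result is set
    * when that count is non-zero.
    */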
11285    Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
11286 
11287    /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
11288 
11289    Temp instance_id =
11290       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.vertex_id),
11291                get_arg(ctx, ctx->args->ac.instance_id), ls_has_nonzero_hs_threads);
11292    Temp vs_rel_patch_id =
11293       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
11294                get_arg(ctx, ctx->args->ac.vs_rel_patch_id), ls_has_nonzero_hs_threads);
11295    Temp vertex_id =
11296       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_patch_id),
11297                get_arg(ctx, ctx->args->ac.vertex_id), ls_has_nonzero_hs_threads);
11298 
11299    ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id;
11300    ctx->arg_temps[ctx->args->ac.vs_rel_patch_id.arg_index] = vs_rel_patch_id;
11301    ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id;
11302 }
11303 
11304 void
11305 split_arguments(isel_context* ctx, Pseudo_instruction* startpgm)
11306 {
11307    /* Split all arguments except for the first (ring_offsets) and the last
11308     * (exec) so that the dead channels don't stay live throughout the program.
11309     */
11310    for (int i = 1; i < startpgm->definitions.size(); i++) {
11311       if (startpgm->definitions[i].regClass().size() > 1) {
11312          emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
11313                            startpgm->definitions[i].regClass().size());
11314       }
11315    }
11316 }
11317 
11318 void
11319 handle_bc_optimize(isel_context* ctx)
11320 {
11321    /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
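   /* When BC_OPTIMIZE is enabled, the hardware presumably sets the MSB of
    * prim_mask for waves where centroid and center coincide (fully covered
    * pixels); the sign-bit compare below then substitutes the center
    * interpolants for the centroid ones.
    */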
11322    Builder bld(ctx->program, ctx->block);
11323    uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
11324    bool uses_center =
11325       G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
11326    bool uses_persp_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena);
11327    bool uses_linear_centroid = G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
11328 
11329    if (uses_persp_centroid)
11330       ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
11331    if (uses_linear_centroid)
11332       ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
11333 
11334    if (uses_center && (uses_persp_centroid || uses_linear_centroid)) {
11335       Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)),
11336                               get_arg(ctx, ctx->args->ac.prim_mask), Operand::zero());
11337 
11338       if (uses_persp_centroid) {
11339          Temp new_coord[2];
11340          for (unsigned i = 0; i < 2; i++) {
11341             Temp persp_centroid =
11342                emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
11343             Temp persp_center =
11344                emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
11345             new_coord[i] =
11346                bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), persp_centroid, persp_center, sel);
11347          }
11348          ctx->persp_centroid = bld.tmp(v2);
11349          bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
11350                     Operand(new_coord[0]), Operand(new_coord[1]));
11351          emit_split_vector(ctx, ctx->persp_centroid, 2);
11352       }
11353 
11354       if (uses_linear_centroid) {
11355          Temp new_coord[2];
11356          for (unsigned i = 0; i < 2; i++) {
11357             Temp linear_centroid =
11358                emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
11359             Temp linear_center =
11360                emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
11361             new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), linear_centroid,
11362                                     linear_center, sel);
11363          }
11364          ctx->linear_centroid = bld.tmp(v2);
11365          bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
11366                     Operand(new_coord[0]), Operand(new_coord[1]));
11367          emit_split_vector(ctx, ctx->linear_centroid, 2);
11368       }
11369    }
11370 }
11371 
11372 void
11373 setup_fp_mode(isel_context* ctx, nir_shader* shader)
11374 {
11375    Program* program = ctx->program;
11376 
11377    unsigned float_controls = shader->info.float_controls_execution_mode;
11378 
11379    program->next_fp_mode.preserve_signed_zero_inf_nan32 =
11380       float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
11381    program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
11382       float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
11383                         FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
11384 
11385    program->next_fp_mode.must_flush_denorms32 =
11386       float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
11387    program->next_fp_mode.must_flush_denorms16_64 =
11388       float_controls &
11389       (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
11390 
11391    program->next_fp_mode.care_about_round32 =
11392       float_controls &
11393       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
11394 
11395    program->next_fp_mode.care_about_round16_64 =
11396       float_controls &
11397       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
11398        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
11399 
11400    /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
11401     * the precision seems needed for Wolfenstein: Youngblood to render correctly */
11402    if (program->next_fp_mode.must_flush_denorms16_64)
11403       program->next_fp_mode.denorm16_64 = 0;
11404    else
11405       program->next_fp_mode.denorm16_64 = fp_denorm_keep;
11406 
11407    /* preserving fp32 denorms is expensive, so only do it if asked */
11408    if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
11409       program->next_fp_mode.denorm32 = fp_denorm_keep;
11410    else
11411       program->next_fp_mode.denorm32 = 0;
11412 
11413    if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
11414       program->next_fp_mode.round32 = fp_round_tz;
11415    else
11416       program->next_fp_mode.round32 = fp_round_ne;
11417 
11418    if (float_controls &
11419        (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
11420       program->next_fp_mode.round16_64 = fp_round_tz;
11421    else
11422       program->next_fp_mode.round16_64 = fp_round_ne;
11423 
11424    ctx->block->fp_mode = program->next_fp_mode;
11425 }
11426 
11427 void
11428 cleanup_cfg(Program* program)
11429 {
11430    /* create linear_succs/logical_succs */
11431    for (Block& BB : program->blocks) {
11432       for (unsigned idx : BB.linear_preds)
11433          program->blocks[idx].linear_succs.emplace_back(BB.index);
11434       for (unsigned idx : BB.logical_preds)
11435          program->blocks[idx].logical_succs.emplace_back(BB.index);
11436    }
11437 }
11438 
11439 Temp
11440 lanecount_to_mask(isel_context* ctx, Temp count, bool allow64 = true)
11441 {
11442    assert(count.regClass() == s1);
11443 
11444    Builder bld(ctx->program, ctx->block);
11445    Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
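   /* s_bfm_b64 computes ((1 << count[5:0]) - 1), so a count of 64 wraps to an
    * empty mask; that case is handled separately below.
    */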
11446    Temp cond;
11447 
11448    if (ctx->program->wave_size == 64) {
11449       /* If we know that all 64 threads can't be active at a time, we just use the mask as-is */
11450       if (!allow64)
11451          return mask;
11452 
11453       /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
11454       Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count,
11455                                 Operand::c32(6u /* log2(64) */));
11456       cond =
11457          bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand::c32(-1u), mask, bld.scc(active_64));
11458    } else {
11459       /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of
11460        * the register */
11461       cond = emit_extract_vector(ctx, mask, 0, bld.lm);
11462    }
11463 
11464    return cond;
11465 }
11466 
11467 Temp
11468 merged_wave_info_to_mask(isel_context* ctx, unsigned i)
11469 {
11470    Builder bld(ctx->program, ctx->block);
11471 
11472    /* lanecount_to_mask() only cares about s0.u[6:0], so we need neither s_bfe nor s_and here */
11473    Temp count = i == 0
11474                    ? get_arg(ctx, ctx->args->ac.merged_wave_info)
11475                    : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
11476                               get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(i * 8u));
11477 
11478    return lanecount_to_mask(ctx, count);
11479 }
11480 
11481 void
11482 ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt)
11483 {
11484    assert(vtx_cnt.id() && prm_cnt.id());
11485 
11486    Builder bld(ctx->program, ctx->block);
11487    Temp prm_cnt_0;
11488 
11489    if (ctx->program->chip_class == GFX10 &&
11490        (ctx->stage.has(SWStage::GS) || ctx->program->info->has_ngg_culling)) {
11491       /* Navi 1x workaround: check whether the workgroup has no output.
11492        * If so, change the number of exported vertices and primitives to 1.
11493        */
11494       prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand::zero());
11495       prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), prm_cnt,
11496                          bld.scc(prm_cnt_0));
11497       vtx_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), vtx_cnt,
11498                          bld.scc(prm_cnt_0));
11499    }
11500 
11501    /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
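   /* The message payload packs the vertex count into m0[11:0] and the
    * primitive count starting at bit 12, which is what the s_lshl/s_or below
    * construct.
    */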
11502    Temp tmp =
11503       bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand::c32(12u));
11504    tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);
11505 
11506    /* Request the SPI to allocate space for the primitives and vertices
11507     * that will be exported by the threadgroup.
11508     */
11509    bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);
11510 
11511    if (prm_cnt_0.id()) {
11512       /* Navi 1x workaround: export a triangle with NaN coordinates when NGG has no output.
11513        * It can't have all-zero positions because that would render an undesired pixel with
11514        * conservative rasterization.
11515        */
11516       Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
11517       Temp cond = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc),
11518                            Operand::c32_or_c64(1u, ctx->program->wave_size == 64), first_lane);
11519       cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), cond,
11520                       Operand::zero(ctx->program->wave_size == 64 ? 8 : 4), bld.scc(prm_cnt_0));
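      /* cond now has exactly one bit set (the first active lane) when the
       * workgroup produced no output, and is zero otherwise, so at most one
       * thread runs the dummy export below.
       */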
11521 
11522       if_context ic_prim_0;
11523       begin_divergent_if_then(ctx, &ic_prim_0, cond);
11524       bld.reset(ctx->block);
11525       ctx->block->kind |= block_kind_export_end;
11526 
11527       /* Use zero: a triangle whose vertex indices are all 0. */
11528       Temp zero = bld.copy(bld.def(v1), Operand::zero());
11529       /* Use NaN for the coordinates, so that the rasterizer always culls it. */
11530       Temp nan_coord = bld.copy(bld.def(v1), Operand::c32(-1u));
11531 
11532       bld.exp(aco_opcode::exp, zero, Operand(v1), Operand(v1), Operand(v1), 1 /* enabled mask */,
11533               V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */, true /* done */,
11534               false /* valid mask */);
11535       bld.exp(aco_opcode::exp, nan_coord, nan_coord, nan_coord, nan_coord, 0xf /* enabled mask */,
11536               V_008DFC_SQ_EXP_POS /* dest */, false /* compressed */, true /* done */,
11537               true /* valid mask */);
11538 
11539       begin_divergent_if_else(ctx, &ic_prim_0);
11540       end_divergent_if(ctx, &ic_prim_0);
11541       bld.reset(ctx->block);
11542    }
11543 }
11544 
11545 } /* end namespace */
11546 
11547 void
11548 select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
11549                ac_shader_config* config, const struct radv_shader_args* args)
11550 {
11551    isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
11552    if_context ic_merged_wave_info;
11553    bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);
11554 
11555    for (unsigned i = 0; i < shader_count; i++) {
11556       nir_shader* nir = shaders[i];
11557       init_context(&ctx, nir);
11558 
11559       setup_fp_mode(&ctx, nir);
11560 
11561       if (!i) {
11562          /* needs to be after init_context() for FS */
11563          Pseudo_instruction* startpgm = add_startpgm(&ctx);
11564          append_logical_start(ctx.block);
11565 
11566          if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
11567             fix_ls_vgpr_init_bug(&ctx, startpgm);
11568 
11569          split_arguments(&ctx, startpgm);
11570 
11571          if (!args->shader_info->vs.has_prolog &&
11572              (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) {
11573             Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u);
11574          }
11575       }
11576 
11577       /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
11578       nir_function_impl* func = nir_shader_get_entrypoint(nir);
11579       bool empty_shader =
11580          nir_cf_list_is_empty_block(&func->body) &&
11581          ((nir->info.stage == MESA_SHADER_VERTEX &&
11582            (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
11583           (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));
11584 
11585       bool check_merged_wave_info =
11586          ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
11587       bool endif_merged_wave_info =
11588          ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
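      /* Waves of merged shaders may contain threads that belong to only one of
       * the two stages; the divergent if opened below masks off lanes that are
       * not part of stage i. When tcs_in_out_eq, the same lanes run both
       * stages, so the if opened before the VS half is only closed after the
       * TCS half.
       */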
11589 
11590       if (program->chip_class == GFX10 && program->stage.hw == HWStage::NGG &&
11591           program->stage.num_sw_stages() == 1) {
11592          /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
11593           * s_sendmsg(GS_ALLOC_REQ). */
11594          Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u);
11595       }
11596 
11597       if (check_merged_wave_info) {
11598          Temp cond = merged_wave_info_to_mask(&ctx, i);
11599          begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);
11600       }
11601 
11602       if (i) {
11603          Builder bld(ctx.program, ctx.block);
11604 
11605          /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
11606          bool tcs_skip_barrier = ctx.stage == vertex_tess_control_hs &&
11607                                  ctx.tcs_temp_only_inputs == nir->info.inputs_read;
11608 
11609          if (!ngg_gs && !tcs_skip_barrier)
11610             create_workgroup_barrier(bld);
11611 
11612          if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
11613             ctx.gs_wave_id = bld.pseudo(aco_opcode::p_extract, bld.def(s1, m0), bld.def(s1, scc),
11614                                         get_arg(&ctx, args->ac.merged_wave_info), Operand::c32(2u),
11615                                         Operand::c32(8u), Operand::zero());
11616          }
11617       } else if (ctx.stage == geometry_gs)
11618          ctx.gs_wave_id = get_arg(&ctx, args->ac.gs_wave_id);
11619 
11620       if (ctx.stage == fragment_fs)
11621          handle_bc_optimize(&ctx);
11622 
11623       visit_cf_list(&ctx, &func->body);
11624 
11625       if (ctx.program->info->so.num_outputs && ctx.stage.hw == HWStage::VS)
11626          emit_streamout(&ctx, 0);
11627 
11628       if (ctx.stage.hw == HWStage::VS) {
11629          create_vs_exports(&ctx);
11630       } else if (nir->info.stage == MESA_SHADER_GEOMETRY && !ngg_gs) {
11631          Builder bld(ctx.program, ctx.block);
11632          bld.barrier(aco_opcode::p_barrier,
11633                      memory_sync_info(storage_vmem_output, semantic_release, scope_device));
11634          bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1,
11635                   sendmsg_gs_done(false, false, 0));
11636       }
11637 
11638       if (ctx.stage == fragment_fs) {
11639          create_fs_exports(&ctx);
11640       }
11641 
11642       if (endif_merged_wave_info) {
11643          begin_divergent_if_else(&ctx, &ic_merged_wave_info);
11644          end_divergent_if(&ctx, &ic_merged_wave_info);
11645       }
11646 
11647       if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
11648          /* Outputs of the previous stage are inputs to the next stage */
11649          ctx.inputs = ctx.outputs;
11650          ctx.outputs = shader_io_state();
11651       }
11652 
11653       cleanup_context(&ctx);
11654    }
11655 
11656    program->config->float_mode = program->blocks[0].fp_mode.val;
11657 
11658    append_logical_end(ctx.block);
11659    ctx.block->kind |= block_kind_uniform;
11660    Builder bld(ctx.program, ctx.block);
11661    bld.sopp(aco_opcode::s_endpgm);
11662 
11663    cleanup_cfg(program);
11664 }
11665 
11666 void
11667 select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
11668                       const struct radv_shader_args* args)
11669 {
11670    isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
11671 
11672    ctx.block->fp_mode = program->next_fp_mode;
11673 
11674    add_startpgm(&ctx);
11675    append_logical_start(ctx.block);
11676 
11677    Builder bld(ctx.program, ctx.block);
11678 
11679    Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4),
11680                              program->private_segment_buffer, Operand::c32(RING_GSVS_VS * 16u));
11681 
11682    Operand stream_id = Operand::zero();
11683    if (args->shader_info->so.num_outputs)
11684       stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11685                            get_arg(&ctx, ctx.args->ac.streamout_config), Operand::c32(0x20018u));
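   /* The s_bfe_u32 above extracts the 2-bit field at bit 24 of
    * streamout_config: the ID of the stream selected for streamout.
    */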
11686 
11687    Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u),
11688                               get_arg(&ctx, ctx.args->ac.vertex_id));
11689 
11690    std::stack<if_context, std::vector<if_context>> if_contexts;
11691 
11692    for (unsigned stream = 0; stream < 4; stream++) {
11693       if (stream_id.isConstant() && stream != stream_id.constantValue())
11694          continue;
11695 
11696       unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
11697       if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
11698          continue;
11699 
11700       memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));
11701 
11702       if (!stream_id.isConstant()) {
11703          Temp cond =
11704             bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand::c32(stream));
11705          if_contexts.emplace();
11706          begin_uniform_if_then(&ctx, &if_contexts.top(), cond);
11707          bld.reset(ctx.block);
11708       }
11709 
11710       unsigned offset = 0;
11711       for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
11712          if (args->shader_info->gs.output_streams[i] != stream)
11713             continue;
11714 
11715          unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
11716          unsigned length = util_last_bit(output_usage_mask);
11717          for (unsigned j = 0; j < length; ++j) {
11718             if (!(output_usage_mask & (1 << j)))
11719                continue;
11720 
            Temp val = bld.tmp(v1);
            unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
            load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, 0u, true,
                            true, true);

            ctx.outputs.mask[i] |= 1 << j;
            ctx.outputs.temps[i * 4u + j] = val;

            offset++;
         }
      }

      if (args->shader_info->so.num_outputs) {
         emit_streamout(&ctx, stream);
         bld.reset(ctx.block);
      }

      if (stream == 0) {
         create_vs_exports(&ctx);
      }

      if (!stream_id.isConstant()) {
         begin_uniform_if_else(&ctx, &if_contexts.top());
         bld.reset(ctx.block);
      }
   }

   while (!if_contexts.empty()) {
      end_uniform_if(&ctx, &if_contexts.top());
      if_contexts.pop();
   }

   program->config->float_mode = program->blocks[0].fp_mode.val;

   append_logical_end(ctx.block);
   ctx.block->kind |= block_kind_uniform;
   bld.reset(ctx.block);
   bld.sopp(aco_opcode::s_endpgm);

   cleanup_cfg(program);
}

void
select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config,
                           const struct radv_shader_args* args)
{
   assert(args->options->chip_class == GFX8);

   init_program(program, compute_cs, args->shader_info, args->options->chip_class,
                args->options->family, args->options->wgp_mode, config);

   isel_context ctx = {};
   ctx.program = program;
   ctx.args = args;
   ctx.options = args->options;
   ctx.stage = program->stage;

   ctx.block = ctx.program->create_and_insert_block();
   ctx.block->kind = block_kind_top_level;

   program->workgroup_size = 1; /* XXX */

   add_startpgm(&ctx);
   append_logical_start(ctx.block);

   Builder bld(ctx.program, ctx.block);

   /* Load the buffer descriptor from TMA. */
   bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),
            Operand::zero());

   /* Store TTMP0-TTMP1. */
   bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
            Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);

   uint32_t hw_regs_idx[] = {
      2, /* HW_REG_STATUS */
      3, /* HW_REG_TRAP_STS */
      4, /* HW_REG_HW_ID */
      7, /* HW_REG_IB_STS */
   };

   /* Store some hardware registers. */
   for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
      /* "((size - 1) << 11) | register" */
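      /* s_getreg_b32's simm16 packs the register id in [5:0], the bit offset
       * in [10:6] and (size - 1) in [15:11]; e.g. ((20 - 1) << 11) | 3 ==
       * 0x9803 reads the low 20 bits of HW_REG_TRAP_STS.
       */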
      bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1),
               ((20 - 1) << 11) | hw_regs_idx[i]);

      bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
               Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);
   }

   program->config->float_mode = program->blocks[0].fp_mode.val;

   append_logical_end(ctx.block);
   ctx.block->kind |= block_kind_uniform;
   bld.sopp(aco_opcode::s_endpgm);

   cleanup_cfg(program);
}

Operand
get_arg_fixed(const struct radv_shader_args* args, struct ac_arg arg)
{
   assert(arg.used);

   enum ac_arg_regfile file = args->ac.args[arg.arg_index].file;
   unsigned size = args->ac.args[arg.arg_index].size;
   unsigned reg = args->ac.args[arg.arg_index].offset;

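   /* ACO numbers physical registers so that v0 starts at 256, hence the +256
    * for VGPR arguments.
    */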
   return Operand(PhysReg(file == AC_ARG_SGPR ? reg : reg + 256),
                  RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size));
}

unsigned
load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max)
{
   unsigned count = MIN2((bld.program->dev.sgpr_limit - dest.reg()) / 4u, max);

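   /* count / 4 dwordx16 loads plus one load per set bit of the remainder
    * (e.g. a remainder of 3 needs a dwordx8 and a dwordx4, and
    * util_bitcount(3) == 2). On GFX10+ the loads are fused into a single
    * clause so the descriptors arrive back-to-back.
    */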
   unsigned num_loads = (count / 4u) + util_bitcount(count & 0x3);
   if (bld.program->chip_class >= GFX10 && num_loads > 1)
      bld.sopp(aco_opcode::s_clause, -1, num_loads - 1);

   for (unsigned i = 0; i < count;) {
      unsigned size = 1u << util_logbase2(MIN2(count - i, 4));

      if (size == 4)
         bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base,
                  Operand::c32((start + i) * 16u));
      else if (size == 2)
         bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base,
                  Operand::c32((start + i) * 16u));
      else
         bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base,
                  Operand::c32((start + i) * 16u));

      dest = dest.advance(size * 16u);
      i += size;
   }

   return count;
}

Operand
calc_nontrivial_instance_id(Builder& bld, const struct radv_shader_args* args, unsigned index,
                            Operand instance_id, Operand start_instance, PhysReg tmp_sgpr,
                            PhysReg tmp_vgpr0, PhysReg tmp_vgpr1)
{
   bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2),
            get_arg_fixed(args, args->prolog_inputs), Operand::c32(8u + index * 8u));

   wait_imm lgkm_imm;
   lgkm_imm.lgkm = 0;
   bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(bld.program->chip_class));

   Definition fetch_index_def(tmp_vgpr0, v1);
   Operand fetch_index(tmp_vgpr0, v1);

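   /* Divide instance_id by the attribute's divisor using the precomputed
    * magic constants (see util/fast_idiv_by_const):
    *    q = mul_hi(multiplier, (instance_id >> pre_shift) + increment) >> post_shift
    * The dwordx2 loaded above holds div_info (pre_shift in byte 0, the
    * increment in byte 1, post_shift in byte 2) followed by the 32-bit
    * multiplier.
    */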
   Operand div_info(tmp_sgpr, s1);
   if (bld.program->chip_class >= GFX8) {
      /* use SDWA */
      if (bld.program->chip_class < GFX9) {
         bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info);
         div_info = Operand(tmp_vgpr1, v1);
      }

      bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);

      Instruction* instr;
      if (bld.program->chip_class >= GFX9)
         instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr;
      else
         instr = bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm),
                               div_info, fetch_index)
                    .instr;
      instr->sdwa().sel[0] = SubdwordSel::ubyte1;

      bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1),
               fetch_index);

      instr =
         bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr;
      instr->sdwa().sel[0] = SubdwordSel::ubyte2;
   } else {
      Operand tmp_op(tmp_vgpr1, v1);
      Definition tmp_def(tmp_vgpr1, v1);

      bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);

      bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u));
      bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true);

      bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index,
               Operand(tmp_sgpr.advance(4), s1));

      bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u));
      bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index);
   }

   bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true);

   return fetch_index;
}

void
select_vs_prolog(Program* program, const struct radv_vs_prolog_key* key, ac_shader_config* config,
                 const struct radv_shader_args* args, unsigned* num_preserved_sgprs)
{
   assert(key->num_attributes > 0);

   /* This should be enough for any shader/stage. */
   unsigned max_user_sgprs = args->options->chip_class >= GFX9 ? 32 : 16;
   *num_preserved_sgprs = max_user_sgprs + 14;

   init_program(program, compute_cs, args->shader_info, args->options->chip_class,
                args->options->family, args->options->wgp_mode, config);

   Block* block = program->create_and_insert_block();
   block->kind = block_kind_top_level;

   program->workgroup_size = 64;
   calc_min_waves(program);

   Builder bld(program, block);

   block->instructions.reserve(16 + key->num_attributes * 4);

   bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);

   uint32_t attrib_mask = BITFIELD_MASK(key->num_attributes);
   bool has_nontrivial_divisors = key->state->nontrivial_divisors & attrib_mask;

   wait_imm lgkm_imm;
   lgkm_imm.lgkm = 0;

   /* choose sgprs */
   PhysReg vertex_buffers(align(*num_preserved_sgprs, 2));
   PhysReg prolog_input = vertex_buffers.advance(8);
   PhysReg desc(
      align((has_nontrivial_divisors ? prolog_input : vertex_buffers).advance(8).reg(), 4));

   Operand start_instance = get_arg_fixed(args, args->ac.start_instance);
   Operand instance_id = get_arg_fixed(args, args->ac.instance_id);

   PhysReg attributes_start(256 + args->ac.num_vgprs_used);
   /* choose vgprs that won't be used for anything else until the last attribute load */
   PhysReg vertex_index(attributes_start.reg() + key->num_attributes * 4 - 1);
   PhysReg instance_index(attributes_start.reg() + key->num_attributes * 4 - 2);
   PhysReg start_instance_vgpr(attributes_start.reg() + key->num_attributes * 4 - 3);
   PhysReg nontrivial_tmp_vgpr0(attributes_start.reg() + key->num_attributes * 4 - 4);
   PhysReg nontrivial_tmp_vgpr1(attributes_start.reg() + key->num_attributes * 4);

   bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
            get_arg_fixed(args, args->ac.vertex_buffers));
   bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1),
            Operand::c32((unsigned)args->options->address32_hi));

   /* calculate vgpr requirements */
   unsigned num_vgprs = attributes_start.reg() - 256;
   num_vgprs += key->num_attributes * 4;
   if (has_nontrivial_divisors && program->chip_class <= GFX8)
      num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */
   unsigned num_sgprs = 0;

   for (unsigned loc = 0; loc < key->num_attributes;) {
      unsigned num_descs =
         load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, key->num_attributes - loc);
      num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg());

      if (loc == 0) {
         /* perform setup while we load the descriptors */
         if (key->is_ngg || key->next_stage != MESA_SHADER_VERTEX) {
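            /* In merged/NGG shaders only the low lanes run the VS prolog;
             * the lane count comes from merged_wave_info and
             * s_bfm_b64(count, 0) builds the matching exec mask as
             * (1 << count[5:0]) - 1. For a full wave64 the shift wraps to a
             * zero mask, so bit 6 of the count selects all-ones instead.
             */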
            Operand count = get_arg_fixed(args, args->ac.merged_wave_info);
            bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u));
            if (program->wave_size == 64) {
               bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count,
                        Operand::c32(6u /* log2(64) */));
               bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX),
                        Operand(exec, s2), Operand(scc, s1));
            }
         }

         bool needs_instance_index = false;
         bool needs_start_instance = false;
         u_foreach_bit(i, key->state->instance_rate_inputs & attrib_mask)
         {
            needs_instance_index |= key->state->divisors[i] == 1;
            needs_start_instance |= key->state->divisors[i] == 0;
         }
         bool needs_vertex_index = ~key->state->instance_rate_inputs & attrib_mask;
         if (needs_vertex_index)
            bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->ac.base_vertex),
                       get_arg_fixed(args, args->ac.vertex_id), false, Operand(s2), true);
         if (needs_instance_index)
            bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false,
                       Operand(s2), true);
         if (needs_start_instance)
            bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance);
      }

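      /* wait for the descriptor loads above before using them */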
      bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->chip_class));

      for (unsigned i = 0; i < num_descs; i++, loc++) {
         PhysReg dest(attributes_start.reg() + loc * 4u);

         /* calculate index */
         Operand fetch_index = Operand(vertex_index, v1);
         if (key->state->instance_rate_inputs & (1u << loc)) {
            uint32_t divisor = key->state->divisors[loc];
            if (divisor) {
               fetch_index = instance_id;
               if (key->state->nontrivial_divisors & (1u << loc)) {
                  unsigned index =
                     util_bitcount(key->state->nontrivial_divisors & BITFIELD_MASK(loc));
                  fetch_index = calc_nontrivial_instance_id(
                     bld, args, index, instance_id, start_instance, prolog_input,
                     nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1);
               } else {
                  fetch_index = Operand(instance_index, v1);
               }
            } else {
               fetch_index = Operand(start_instance_vgpr, v1);
            }
         }

         /* perform load */
         PhysReg cur_desc = desc.advance(i * 16);
         if ((key->misaligned_mask & (1u << loc))) {
            unsigned dfmt = key->state->formats[loc] & 0xf;
            unsigned nfmt = key->state->formats[loc] >> 4;
            const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt);
            for (unsigned j = 0; j < vtx_info->num_channels; j++) {
               bool post_shuffle = key->state->post_shuffle & (1u << loc);
               unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);

               /* Use MUBUF to work around hangs for byte-aligned dword loads. The Vulkan spec
                * doesn't require this to work, but some GL CTS tests over Zink do this anyway.
                * MTBUF can hang, but MUBUF doesn't (it probably returns garbage, which the GL CTS
                * doesn't care about).
                */
               if (vtx_info->chan_format == V_008F0C_BUF_DATA_FORMAT_32)
                  bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1),
                            Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false,
                            false, true);
               else
                  bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
                            Operand(cur_desc, s4), fetch_index, Operand::c32(0u),
                            vtx_info->chan_format, nfmt, offset, false, true);
            }
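            /* fill the unused trailing channels with (0, 0, 0, 1); "one" is an
             * integer 1 for UINT/SINT formats and 1.0f (0x3f800000) otherwise */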
            uint32_t one =
               nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
                  ? 1u
                  : 0x3f800000u;
            for (unsigned j = vtx_info->num_channels; j < 4; j++) {
               bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1),
                        Operand::c32(j == 3 ? one : 0u));
            }
         } else {
            bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
                      Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true);
         }
      }
   }

   if (key->state->alpha_adjust_lo | key->state->alpha_adjust_hi) {
      wait_imm vm_imm;
      vm_imm.vm = 0;
      bld.sopp(aco_opcode::s_waitcnt, -1, vm_imm.pack(program->chip_class));
   }

   /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
    * so we may need to fix it up. */
   u_foreach_bit(loc, (key->state->alpha_adjust_lo | key->state->alpha_adjust_hi))
   {
      PhysReg alpha(attributes_start.reg() + loc * 4u + 3);

      unsigned alpha_adjust = (key->state->alpha_adjust_lo >> loc) & 0x1;
      alpha_adjust |= ((key->state->alpha_adjust_hi >> loc) & 0x1) << 1;

      if (alpha_adjust == ALPHA_ADJUST_SSCALED)
         bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1));

      /* For the integer-like cases, do a natural sign extension.
       *
       * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
       * and happen to contain 0, 1, 2, 3 as the two LSBs of the
       * exponent.
       */
      unsigned offset = alpha_adjust == ALPHA_ADJUST_SNORM ? 23u : 0u;
      bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1),
               Operand::c32(offset), Operand::c32(2u));
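      /* After sign extension the 2-bit values 0, 1, 2, 3 read as 0, 1, -2
       * and -1; for SNORM the v_max_f32 with -1.0f below clamps the
       * out-of-range -2 to -1. */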

      /* Convert back to the right type. */
      if (alpha_adjust == ALPHA_ADJUST_SNORM) {
         bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
         bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u),
                  Operand(alpha, v1));
      } else if (alpha_adjust == ALPHA_ADJUST_SSCALED) {
         bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
      }
   }

   block->kind |= block_kind_uniform;

   /* continue on to the main shader */
   Operand continue_pc = get_arg_fixed(args, args->prolog_inputs);
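   /* With nontrivial divisors, prolog_inputs points to a driver-filled
    * buffer whose first 8 bytes hold the main shader's entry address, with
    * the per-attribute divisor constants following at offset 8 (see
    * calc_nontrivial_instance_id), so reload the continuation PC from
    * memory.
    */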
   if (has_nontrivial_divisors) {
      bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2),
               get_arg_fixed(args, args->prolog_inputs), Operand::c32(0u));
      bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->chip_class));
      continue_pc = Operand(prolog_input, s2);
   }

   bld.sop1(aco_opcode::s_setpc_b64, continue_pc);

   program->config->float_mode = program->blocks[0].fp_mode.val;
   /* addition on GFX6-8 requires a carry-out (we use VCC) */
   program->needs_vcc = program->chip_class <= GFX8;
   program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
   program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
}
} // namespace aco