1 /*
2 * Copyright © 2019 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include "aco_builder.h"
26 #include "aco_ir.h"
27
28 #include "util/u_math.h"
29
30 #include <set>
31 #include <vector>
32
33 namespace aco {
34
35 namespace {
36
/* Requirements an instruction or block can place on the exec mask.
 * The values are distinct bits so that several of them can be OR'ed together
 * into a uint8_t. */
enum WQMState : uint8_t {
   Unspecified = 0x0,
   Exact = 0x1,
   WQM = 0x2, /* with control flow applied */
   Preserve_WQM = 0x4,
   Exact_Branch = 0x8,
};
44
/* Flags stored next to every entry of a block's exec mask stack. */
enum mask_type : uint8_t {
   mask_type_global = 0x1,
   mask_type_exact = 0x2,
   mask_type_wqm = 0x4,
   mask_type_loop = 0x8, /* active lanes of a loop */
};
51
52 struct wqm_ctx {
53 Program* program;
54 /* state for WQM propagation */
55 std::set<unsigned> worklist;
56 std::vector<uint16_t> defined_in;
57 std::vector<bool> needs_wqm;
58 std::vector<bool> branch_wqm; /* true if the branch condition in this block should be in wqm */
wqm_ctxaco::__anon3705a59e0111::wqm_ctx59 wqm_ctx(Program* program_)
60 : program(program_), defined_in(program->peekAllocationId(), 0xFFFF),
61 needs_wqm(program->peekAllocationId()), branch_wqm(program->blocks.size())
62 {
63 for (unsigned i = 0; i < program->blocks.size(); i++)
64 worklist.insert(i);
65 }
66 };
67
68 struct loop_info {
69 Block* loop_header;
70 uint16_t num_exec_masks;
71 uint8_t needs;
72 bool has_divergent_break;
73 bool has_divergent_continue;
74 bool has_discard; /* has a discard or demote */
loop_infoaco::__anon3705a59e0111::loop_info75 loop_info(Block* b, uint16_t num, uint8_t needs_, bool breaks, bool cont, bool discard)
76 : loop_header(b), num_exec_masks(num), needs(needs_), has_divergent_break(breaks),
77 has_divergent_continue(cont), has_discard(discard)
78 {}
79 };
80
81 struct block_info {
82 std::vector<std::pair<Operand, uint8_t>>
83 exec; /* Vector of exec masks. Either a temporary or const -1. */
84 std::vector<WQMState> instr_needs;
85 uint8_t block_needs;
86 uint8_t ever_again_needs;
87 bool logical_end_wqm;
88 /* more... */
89 };
90
91 struct exec_ctx {
92 Program* program;
93 std::vector<block_info> info;
94 std::vector<loop_info> loop;
95 bool handle_wqm = false;
exec_ctxaco::__anon3705a59e0111::exec_ctx96 exec_ctx(Program* program_) : program(program_), info(program->blocks.size()) {}
97 };
98
99 bool
needs_exact(aco_ptr<Instruction> & instr)100 needs_exact(aco_ptr<Instruction>& instr)
101 {
102 if (instr->isMUBUF()) {
103 return instr->mubuf().disable_wqm;
104 } else if (instr->isMTBUF()) {
105 return instr->mtbuf().disable_wqm;
106 } else if (instr->isMIMG()) {
107 return instr->mimg().disable_wqm;
108 } else if (instr->isFlatLike()) {
109 return instr->flatlike().disable_wqm;
110 } else {
111 return instr->isEXP();
112 }
113 }
114
115 void
set_needs_wqm(wqm_ctx & ctx,Temp tmp)116 set_needs_wqm(wqm_ctx& ctx, Temp tmp)
117 {
118 if (!ctx.needs_wqm[tmp.id()]) {
119 ctx.needs_wqm[tmp.id()] = true;
120 if (ctx.defined_in[tmp.id()] != 0xFFFF)
121 ctx.worklist.insert(ctx.defined_in[tmp.id()]);
122 }
123 }
124
125 void
mark_block_wqm(wqm_ctx & ctx,unsigned block_idx)126 mark_block_wqm(wqm_ctx& ctx, unsigned block_idx)
127 {
128 if (ctx.branch_wqm[block_idx])
129 return;
130
131 ctx.branch_wqm[block_idx] = true;
132 ctx.worklist.insert(block_idx);
133
134 Block& block = ctx.program->blocks[block_idx];
135
136 /* TODO: this sets more branch conditions to WQM than it needs to
137 * it should be enough to stop at the "exec mask top level" */
138 if (block.kind & block_kind_top_level)
139 return;
140
141 for (unsigned pred_idx : block.logical_preds)
142 mark_block_wqm(ctx, pred_idx);
143 }
144
145 void
get_block_needs(wqm_ctx & ctx,exec_ctx & exec_ctx,Block * block)146 get_block_needs(wqm_ctx& ctx, exec_ctx& exec_ctx, Block* block)
147 {
148 block_info& info = exec_ctx.info[block->index];
149
150 std::vector<WQMState> instr_needs(block->instructions.size());
151
152 for (int i = block->instructions.size() - 1; i >= 0; --i) {
153 aco_ptr<Instruction>& instr = block->instructions[i];
154
155 WQMState needs = needs_exact(instr) ? Exact : Unspecified;
156 bool propagate_wqm =
157 instr->opcode == aco_opcode::p_wqm || instr->opcode == aco_opcode::p_as_uniform;
158 bool preserve_wqm = instr->opcode == aco_opcode::p_discard_if;
159 bool pred_by_exec = needs_exec_mask(instr.get());
160 for (const Definition& definition : instr->definitions) {
161 if (!definition.isTemp())
162 continue;
163 const unsigned def = definition.tempId();
164 ctx.defined_in[def] = block->index;
165 if (needs == Unspecified && ctx.needs_wqm[def]) {
166 needs = pred_by_exec ? WQM : Unspecified;
167 propagate_wqm = true;
168 }
169 }
170
171 if (instr->isBranch() && ctx.branch_wqm[block->index]) {
172 assert(!(info.block_needs & Exact_Branch));
173 needs = WQM;
174 propagate_wqm = true;
175 }
176
177 if (propagate_wqm) {
178 for (const Operand& op : instr->operands) {
179 if (op.isTemp()) {
180 set_needs_wqm(ctx, op.getTemp());
181 }
182 }
183 } else if (preserve_wqm && info.block_needs & WQM) {
184 needs = Preserve_WQM;
185 }
186
187 /* ensure the condition controlling the control flow for this phi is in WQM */
188 if (needs == WQM && instr->opcode == aco_opcode::p_phi) {
189 for (unsigned pred_idx : block->logical_preds) {
190 mark_block_wqm(ctx, pred_idx);
191 exec_ctx.info[pred_idx].logical_end_wqm = true;
192 ctx.worklist.insert(pred_idx);
193 }
194 }
195
196 if ((instr->opcode == aco_opcode::p_logical_end && info.logical_end_wqm) ||
197 instr->opcode == aco_opcode::p_wqm) {
198 assert(needs != Exact);
199 needs = WQM;
200 }
201
202 instr_needs[i] = needs;
203 info.block_needs |= needs;
204 }
205
206 info.instr_needs = instr_needs;
207
208 /* for "if (<cond>) <wqm code>" or "while (<cond>) <wqm code>",
209 * <cond> should be computed in WQM */
210 if (info.block_needs & WQM && !(block->kind & block_kind_top_level)) {
211 for (unsigned pred_idx : block->logical_preds)
212 mark_block_wqm(ctx, pred_idx);
213 }
214 }
215
216 /* If an outer loop needs WQM but a nested loop does not, we have to ensure that
217 * the nested loop is done in WQM so that the exec is not empty upon entering
218 * the nested loop.
219 *
220 * TODO: This could be fixed with slightly better code (for loops with divergent
221 * breaks, which might benefit from being in exact) by adding Exact_Branch to a
222 * divergent branch surrounding the nested loop, if such a branch exists.
223 */
224 void
handle_wqm_loops(wqm_ctx & ctx,exec_ctx & exec_ctx,unsigned preheader)225 handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
226 {
227 for (unsigned idx = preheader + 1; idx < exec_ctx.program->blocks.size(); idx++) {
228 Block& block = exec_ctx.program->blocks[idx];
229 if (block.kind & block_kind_break)
230 mark_block_wqm(ctx, idx);
231
232 if ((block.kind & block_kind_loop_exit) && block.loop_nest_depth == 0)
233 break;
234 }
235 }
236
237 /* If an outer loop and it's nested loops does not need WQM,
238 * add_branch_code() will ensure that it enters in Exact. We have to
239 * ensure that the exact exec mask is not empty by adding Exact_Branch to
240 * the outer divergent branch.
241 */
242 void
handle_exact_loops(wqm_ctx & ctx,exec_ctx & exec_ctx,unsigned preheader)243 handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
244 {
245 assert(exec_ctx.program->blocks[preheader + 1].kind & block_kind_loop_header);
246
247 int parent_branch = preheader;
248 unsigned rel_branch_depth = 0;
249 for (; parent_branch >= 0; parent_branch--) {
250 Block& branch = exec_ctx.program->blocks[parent_branch];
251 if (branch.kind & block_kind_branch) {
252 if (rel_branch_depth == 0)
253 break;
254 rel_branch_depth--;
255 }
256
257 /* top-level blocks should never have empty exact exec masks */
258 if (branch.kind & block_kind_top_level)
259 return;
260
261 if (branch.kind & block_kind_merge)
262 rel_branch_depth++;
263 }
264 assert(parent_branch >= 0);
265
266 ASSERTED Block& branch = exec_ctx.program->blocks[parent_branch];
267 assert(branch.kind & block_kind_branch);
268 if (ctx.branch_wqm[parent_branch]) {
269 /* The branch can't be done in Exact because some other blocks in it
270 * are in WQM. So instead, ensure that the loop is done in WQM. */
271 handle_wqm_loops(ctx, exec_ctx, preheader);
272 } else {
273 exec_ctx.info[parent_branch].block_needs |= Exact_Branch;
274 }
275 }
276
277 void
calculate_wqm_needs(exec_ctx & exec_ctx)278 calculate_wqm_needs(exec_ctx& exec_ctx)
279 {
280 wqm_ctx ctx(exec_ctx.program);
281
282 while (!ctx.worklist.empty()) {
283 unsigned block_index = *std::prev(ctx.worklist.end());
284 ctx.worklist.erase(std::prev(ctx.worklist.end()));
285
286 Block& block = exec_ctx.program->blocks[block_index];
287 get_block_needs(ctx, exec_ctx, &block);
288
289 /* handle_exact_loops() needs information on outer branches, so don't
290 * handle loops until a top-level block.
291 */
292 if (block.kind & block_kind_top_level && block.index != exec_ctx.program->blocks.size() - 1) {
293 unsigned preheader = block.index;
294 do {
295 Block& preheader_block = exec_ctx.program->blocks[preheader];
296 if ((preheader_block.kind & block_kind_loop_preheader) &&
297 preheader_block.loop_nest_depth == 0) {
298 /* If the loop or a nested loop needs WQM, branch_wqm will be true for the
299 * preheader.
300 */
301 if (ctx.branch_wqm[preheader])
302 handle_wqm_loops(ctx, exec_ctx, preheader);
303 else
304 handle_exact_loops(ctx, exec_ctx, preheader);
305 }
306 preheader++;
307 } while (!(exec_ctx.program->blocks[preheader].kind & block_kind_top_level));
308 }
309 }
310
311 uint8_t ever_again_needs = 0;
312 for (int i = exec_ctx.program->blocks.size() - 1; i >= 0; i--) {
313 exec_ctx.info[i].ever_again_needs = ever_again_needs;
314 Block& block = exec_ctx.program->blocks[i];
315
316 if (block.kind & block_kind_needs_lowering)
317 exec_ctx.info[i].block_needs |= Exact;
318
319 /* if discard is used somewhere in nested CF, we need to preserve the WQM mask */
320 if ((block.kind & block_kind_discard || block.kind & block_kind_uses_discard_if) &&
321 ever_again_needs & WQM)
322 exec_ctx.info[i].block_needs |= Preserve_WQM;
323
324 ever_again_needs |= exec_ctx.info[i].block_needs & ~Exact_Branch;
325 if (block.kind & block_kind_discard || block.kind & block_kind_uses_discard_if ||
326 block.kind & block_kind_uses_demote)
327 ever_again_needs |= Exact;
328
329 /* don't propagate WQM preservation further than the next top_level block */
330 if (block.kind & block_kind_top_level)
331 ever_again_needs &= ~Preserve_WQM;
332 else
333 exec_ctx.info[i].block_needs &= ~Preserve_WQM;
334 }
335 exec_ctx.handle_wqm = true;
336 }
337
338 Operand
get_exec_op(Operand t)339 get_exec_op(Operand t)
340 {
341 if (t.isUndefined())
342 return Operand(exec, t.regClass());
343 else
344 return t;
345 }
346
347 void
transition_to_WQM(exec_ctx & ctx,Builder bld,unsigned idx)348 transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
349 {
350 if (ctx.info[idx].exec.back().second & mask_type_wqm)
351 return;
352 if (ctx.info[idx].exec.back().second & mask_type_global) {
353 Operand exec_mask = ctx.info[idx].exec.back().first;
354 if (exec_mask.isUndefined()) {
355 exec_mask = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm), Operand(exec, bld.lm));
356 ctx.info[idx].exec.back().first = exec_mask;
357 }
358
359 exec_mask = bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc),
360 get_exec_op(exec_mask));
361 ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm);
362 return;
363 }
364 /* otherwise, the WQM mask should be one below the current mask */
365 ctx.info[idx].exec.pop_back();
366 assert(ctx.info[idx].exec.back().second & mask_type_wqm);
367 assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
368 assert(ctx.info[idx].exec.back().first.isTemp());
369 ctx.info[idx].exec.back().first = bld.pseudo(
370 aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
371 }
372
373 void
transition_to_Exact(exec_ctx & ctx,Builder bld,unsigned idx)374 transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
375 {
376 if (ctx.info[idx].exec.back().second & mask_type_exact)
377 return;
378 /* We can't remove the loop exec mask, because that can cause exec.size() to
379 * be less than num_exec_masks. The loop exec mask also needs to be kept
380 * around for various uses. */
381 if ((ctx.info[idx].exec.back().second & mask_type_global) &&
382 !(ctx.info[idx].exec.back().second & mask_type_loop)) {
383 ctx.info[idx].exec.pop_back();
384 assert(ctx.info[idx].exec.back().second & mask_type_exact);
385 assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
386 assert(ctx.info[idx].exec.back().first.isTemp());
387 ctx.info[idx].exec.back().first = bld.pseudo(
388 aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
389 return;
390 }
391 /* otherwise, we create an exact mask and push to the stack */
392 Operand wqm = ctx.info[idx].exec.back().first;
393 if (wqm.isUndefined()) {
394 wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
395 Definition(exec, bld.lm), ctx.info[idx].exec[0].first, Operand(exec, bld.lm));
396 } else {
397 bld.sop2(Builder::s_and, Definition(exec, bld.lm), bld.def(s1, scc),
398 ctx.info[idx].exec[0].first, wqm);
399 }
400 ctx.info[idx].exec.back().first = Operand(wqm);
401 ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type_exact);
402 }
403
/* Sets up the exec mask stack of `block` (ctx.info[idx].exec) from the stacks of
 * its linear predecessors and appends the phis, copies and WQM/Exact
 * transitions this requires to `instructions`.
 *
 * The leading instructions of the block (p_startpgm, the phis, or everything in
 * front of p_logical_start, depending on the kind of block) are moved into
 * `instructions` as well.
 *
 * Returns the index of the first instruction of `block` that was not moved.
 */
404 unsigned
add_coupling_code(exec_ctx & ctx,Block * block,std::vector<aco_ptr<Instruction>> & instructions)405 add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>& instructions)
406 {
407 unsigned idx = block->index;
408 Builder bld(ctx.program, &instructions);
409 std::vector<unsigned>& preds = block->linear_preds;
410
411 /* start block */
412 if (idx == 0) {
413 aco_ptr<Instruction>& startpgm = block->instructions[0];
414 assert(startpgm->opcode == aco_opcode::p_startpgm);
415 bld.insert(std::move(startpgm));
416
/* NOTE(review): presumably an undefined operand, which get_exec_op() maps to
 * the exec register - confirm against the Operand constructors */
417 Operand start_exec(bld.lm);
418
419 /* exec seems to need to be manually initialized with combined shaders */
420 if (ctx.program->stage.num_sw_stages() > 1 || ctx.program->stage.hw == HWStage::NGG) {
421 start_exec = Operand::c32_or_c64(-1u, bld.lm == s2);
422 bld.copy(Definition(exec, bld.lm), start_exec);
423 }
424
425 if (ctx.handle_wqm) {
426 ctx.info[0].exec.emplace_back(start_exec, mask_type_global | mask_type_exact);
427 /* if this block only needs WQM, initialize already */
428 if (ctx.info[0].block_needs == WQM)
429 transition_to_WQM(ctx, bld, 0);
430 } else {
431 uint8_t mask = mask_type_global;
432 if (ctx.program->needs_wqm) {
433 bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc),
434 Operand(exec, bld.lm));
435 mask |= mask_type_wqm;
436 } else {
437 mask |= mask_type_exact;
438 }
439 ctx.info[0].exec.emplace_back(start_exec, mask);
440 }
441
/* p_startpgm (instruction 0) has been moved already */
442 return 1;
443 }
444
445 /* loop entry block */
446 if (block->kind & block_kind_loop_header) {
447 assert(preds[0] == idx - 1);
448 ctx.info[idx].exec = ctx.info[idx - 1].exec;
449 loop_info& info = ctx.loop.back();
/* drop the masks on top of the ones recorded for this loop */
450 while (ctx.info[idx].exec.size() > info.num_exec_masks)
451 ctx.info[idx].exec.pop_back();
452
/* All phis created here only get operand 0 (the value coming from preds[0]);
 * the operands for the remaining predecessors are filled in when the loop exit
 * block is processed. */
453 /* create ssa names for outer exec masks */
454 if (info.has_discard) {
455 aco_ptr<Pseudo_instruction> phi;
456 for (int i = 0; i < info.num_exec_masks - 1; i++) {
457 phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi,
458 Format::PSEUDO, preds.size(), 1));
459 phi->definitions[0] = bld.def(bld.lm);
460 phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[i].first);
461 ctx.info[idx].exec[i].first = bld.insert(std::move(phi));
462 }
463 }
464
465 /* create ssa name for restore mask */
466 if (info.has_divergent_break) {
467 /* this phi might be trivial but ensures a parallelcopy on the loop header */
468 aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
469 aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
470 phi->definitions[0] = bld.def(bld.lm);
471 phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
472 ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
473 }
474
475 /* create ssa name for loop active mask */
476 aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
477 aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
/* with divergent continues the phi defines a temporary that is copied to exec
 * further down; otherwise the phi writes exec directly */
478 if (info.has_divergent_continue)
479 phi->definitions[0] = bld.def(bld.lm);
480 else
481 phi->definitions[0] = Definition(exec, bld.lm);
482 phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec.back().first);
483 Temp loop_active = bld.insert(std::move(phi));
484
/* with a divergent break the active mask becomes an additional stack entry on
 * top of the restore mask, otherwise it replaces the top entry */
485 if (info.has_divergent_break) {
486 uint8_t mask_type =
487 (ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop;
488 ctx.info[idx].exec.emplace_back(loop_active, mask_type);
489 } else {
490 ctx.info[idx].exec.back().first = Operand(loop_active);
491 ctx.info[idx].exec.back().second |= mask_type_loop;
492 }
493
494 /* create a parallelcopy to move the active mask to exec */
495 unsigned i = 0;
496 if (info.has_divergent_continue) {
/* the copy is placed in front of p_logical_start */
497 while (block->instructions[i]->opcode != aco_opcode::p_logical_start) {
498 bld.insert(std::move(block->instructions[i]));
499 i++;
500 }
501 uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
502 assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
503 ctx.info[idx].exec.emplace_back(
504 bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
505 ctx.info[idx].exec.back().first),
506 mask_type);
507 }
508
509 return i;
510 }
511
512 /* loop exit block */
513 if (block->kind & block_kind_loop_exit) {
514 Block* header = ctx.loop.back().loop_header;
515 loop_info& info = ctx.loop.back();
516
517 for (ASSERTED unsigned pred : preds)
518 assert(ctx.info[pred].exec.size() >= info.num_exec_masks);
519
/* instr_idx walks the header's phis in the order in which they were created
 * when the loop header was processed */
520 /* fill the loop header phis */
521 std::vector<unsigned>& header_preds = header->linear_preds;
522 int instr_idx = 0;
523 if (info.has_discard) {
524 while (instr_idx < info.num_exec_masks - 1) {
525 aco_ptr<Instruction>& phi = header->instructions[instr_idx];
526 assert(phi->opcode == aco_opcode::p_linear_phi);
527 for (unsigned i = 1; i < phi->operands.size(); i++)
528 phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[instr_idx].first);
529 instr_idx++;
530 }
531 }
532
/* next phi: the restore mask if the loop has a divergent break, otherwise the
 * loop active mask */
533 {
534 aco_ptr<Instruction>& phi = header->instructions[instr_idx++];
535 assert(phi->opcode == aco_opcode::p_linear_phi);
536 for (unsigned i = 1; i < phi->operands.size(); i++)
537 phi->operands[i] =
538 get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first);
539 }
540
/* with a divergent break the loop active mask has its own phi, one stack entry
 * above the restore mask */
541 if (info.has_divergent_break) {
542 aco_ptr<Instruction>& phi = header->instructions[instr_idx];
543 assert(phi->opcode == aco_opcode::p_linear_phi);
544 for (unsigned i = 1; i < phi->operands.size(); i++)
545 phi->operands[i] =
546 get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
547 }
548
549 assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2);
550
551 /* create the loop exit phis if not trivial */
552 for (unsigned exec_idx = 0; exec_idx < info.num_exec_masks; exec_idx++) {
553 Operand same = ctx.info[preds[0]].exec[exec_idx].first;
554 uint8_t type = ctx.info[header_preds[0]].exec[exec_idx].second;
555 bool trivial = true;
556
/* a phi is trivial when all predecessors provide the same operand */
557 for (unsigned i = 1; i < preds.size() && trivial; i++) {
558 if (ctx.info[preds[i]].exec[exec_idx].first != same)
559 trivial = false;
560 }
561
562 if (trivial) {
563 ctx.info[idx].exec.emplace_back(same, type);
564 } else {
565 /* create phi for loop footer */
566 aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
567 aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
568 phi->definitions[0] = bld.def(bld.lm);
/* the phi for the topmost mask writes exec directly */
569 if (exec_idx == info.num_exec_masks - 1u) {
570 phi->definitions[0] = Definition(exec, bld.lm);
571 }
572 for (unsigned i = 0; i < phi->operands.size(); i++)
573 phi->operands[i] = get_exec_op(ctx.info[preds[i]].exec[exec_idx].first);
574 ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
575 }
576 }
577 assert(ctx.info[idx].exec.size() == info.num_exec_masks);
578
579 /* create a parallelcopy to move the live mask to exec */
580 unsigned i = 0;
581 while (block->instructions[i]->opcode != aco_opcode::p_logical_start) {
582 bld.insert(std::move(block->instructions[i]));
583 i++;
584 }
585
586 if (ctx.handle_wqm) {
/* neither this block nor any later one needs WQM: switch to Exact for good
 * and stop handling WQM */
587 if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) {
588 if ((ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == 0 ||
589 (ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == Exact) {
590 ctx.info[idx].exec.back().second |= mask_type_global;
591 transition_to_Exact(ctx, bld, idx);
592 ctx.handle_wqm = false;
593 }
594 }
595 if (ctx.info[idx].block_needs == WQM)
596 transition_to_WQM(ctx, bld, idx);
597 else if (ctx.info[idx].block_needs == Exact)
598 transition_to_Exact(ctx, bld, idx);
599 }
600
601 assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
602 if (get_exec_op(ctx.info[idx].exec.back().first).isTemp()) {
603 /* move current exec mask into exec register */
604 ctx.info[idx].exec.back().first = bld.pseudo(
605 aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
606 }
607
/* this loop is done */
608 ctx.loop.pop_back();
609 return i;
610 }
611
/* all remaining blocks: take over the stack of a single predecessor, or merge
 * the stacks of two predecessors */
612 if (preds.size() == 1) {
613 ctx.info[idx].exec = ctx.info[preds[0]].exec;
614 } else {
615 assert(preds.size() == 2);
616 /* if one of the predecessors ends in exact mask, we pop it from stack */
617 unsigned num_exec_masks =
618 std::min(ctx.info[preds[0]].exec.size(), ctx.info[preds[1]].exec.size());
619
620 if (block->kind & block_kind_merge)
621 num_exec_masks--;
622 if (block->kind & block_kind_top_level)
623 num_exec_masks = std::min(num_exec_masks, 2u);
624
625 /* create phis for diverged exec masks */
626 for (unsigned i = 0; i < num_exec_masks; i++) {
627 /* skip trivial phis */
628 if (ctx.info[preds[0]].exec[i].first == ctx.info[preds[1]].exec[i].first) {
629 Operand t = ctx.info[preds[0]].exec[i].first;
630 /* discard/demote can change the state of the current exec mask */
631 assert(!t.isTemp() ||
632 ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second);
633 uint8_t mask = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
634 ctx.info[idx].exec.emplace_back(t, mask);
635 continue;
636 }
637
/* the phi of the topmost mask writes exec directly, except in merge blocks */
638 bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge);
639 Temp phi = bld.pseudo(aco_opcode::p_linear_phi,
640 in_exec ? Definition(exec, bld.lm) : bld.def(bld.lm),
641 get_exec_op(ctx.info[preds[0]].exec[i].first),
642 get_exec_op(ctx.info[preds[1]].exec[i].first));
643 uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
644 ctx.info[idx].exec.emplace_back(phi, mask_type);
645 }
646 }
647
/* move the block's own phis behind the exec mask phis created above */
648 unsigned i = 0;
649 while (block->instructions[i]->opcode == aco_opcode::p_phi ||
650 block->instructions[i]->opcode == aco_opcode::p_linear_phi) {
651 bld.insert(std::move(block->instructions[i]));
652 i++;
653 }
654
655 /* try to satisfy the block's needs */
656 if (ctx.handle_wqm) {
/* same check as in the loop exit case: if WQM is never needed again, switch to
 * Exact for good and stop handling WQM */
657 if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) {
658 if ((ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == 0 ||
659 (ctx.info[idx].block_needs | ctx.info[idx].ever_again_needs) == Exact) {
660 ctx.info[idx].exec.back().second |= mask_type_global;
661 transition_to_Exact(ctx, bld, idx);
662 ctx.handle_wqm = false;
663 }
664 }
665 if (ctx.info[idx].block_needs == WQM)
666 transition_to_WQM(ctx, bld, idx);
667 else if (ctx.info[idx].block_needs == Exact)
668 transition_to_Exact(ctx, bld, idx);
669 }
670
/* in a merge block the top of the stack is copied back into exec */
671 if (block->kind & block_kind_merge && !ctx.info[idx].exec.back().first.isUndefined()) {
672 Operand restore = ctx.info[idx].exec.back().first;
673 assert(restore.size() == bld.lm.size());
674 bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), restore);
675 if (!restore.isConstant())
676 ctx.info[idx].exec.back().first = Operand(bld.lm);
677 }
678
679 return i;
680 }
681
/* Moves the instructions of `block`, starting at index `idx`, into
 * `instructions`.
 *
 * If the block needs both WQM and Exact, uses discard_if or demote, or needs
 * lowering, every instruction is looked at: transitions between WQM and Exact
 * are inserted according to the block's instr_needs, and p_discard_if,
 * p_is_helper, p_demote_to_helper and p_elect are rewritten. Otherwise the
 * instructions are moved over unchanged.
 */
682 void
process_instructions(exec_ctx & ctx,Block * block,std::vector<aco_ptr<Instruction>> & instructions,unsigned idx)683 process_instructions(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>& instructions,
684 unsigned idx)
685 {
/* the state the block starts in is given by the top of its exec mask stack */
686 WQMState state;
687 if (ctx.info[block->index].exec.back().second & mask_type_wqm)
688 state = WQM;
689 else {
690 assert(!ctx.handle_wqm || ctx.info[block->index].exec.back().second & mask_type_exact);
691 state = Exact;
692 }
693
694 /* if the block doesn't need both, WQM and Exact, we can skip processing the instructions */
695 bool process = (ctx.handle_wqm && (ctx.info[block->index].block_needs & state) !=
696 (ctx.info[block->index].block_needs & (WQM | Exact))) ||
697 block->kind & block_kind_uses_discard_if ||
698 block->kind & block_kind_uses_demote || block->kind & block_kind_needs_lowering;
699 if (!process) {
/* fast path: move all remaining instructions in one go */
700 std::vector<aco_ptr<Instruction>>::iterator it = std::next(block->instructions.begin(), idx);
701 instructions.insert(instructions.end(),
702 std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(it),
703 std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(
704 block->instructions.end()));
705 return;
706 }
707
708 Builder bld(ctx.program, &instructions);
709
710 for (; idx < block->instructions.size(); idx++) {
711 aco_ptr<Instruction> instr = std::move(block->instructions[idx]);
712
713 WQMState needs = ctx.handle_wqm ? ctx.info[block->index].instr_needs[idx] : Unspecified;
714
715 if (instr->opcode == aco_opcode::p_discard_if) {
716 if (ctx.info[block->index].block_needs & Preserve_WQM) {
717 assert(block->kind & block_kind_top_level);
718 transition_to_WQM(ctx, bld, block->index);
719 ctx.info[block->index].exec.back().second &= ~mask_type_global;
720 }
721 int num = ctx.info[block->index].exec.size();
722 assert(num);
723
724 /* discard from current exec */
725 const Operand cond = instr->operands[0];
726 Temp exit_cond = bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc),
727 Operand(exec, bld.lm), cond)
728 .def(1)
729 .getTemp();
730
731 /* discard from inner to outer exec mask on stack */
732 for (int i = num - 2; i >= 0; i--) {
733 Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
734 ctx.info[block->index].exec[i].first, cond);
735 ctx.info[block->index].exec[i].first = Operand(andn2->definitions[0].getTemp());
736 exit_cond = andn2->definitions[1].getTemp();
737 }
738
/* the instruction becomes an early exit on the scc result of the last s_andn2 */
739 instr->opcode = aco_opcode::p_exit_early_if;
740 instr->operands[0] = bld.scc(exit_cond);
741 assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
742
743 } else if (needs == WQM && state != WQM) {
744 transition_to_WQM(ctx, bld, block->index);
745 state = WQM;
746 } else if (needs == Exact && state != Exact) {
747 transition_to_Exact(ctx, bld, block->index);
748 state = Exact;
749 }
750
751 if (instr->opcode == aco_opcode::p_is_helper) {
752 Definition dst = instr->definitions[0];
753 assert(dst.size() == bld.lm.size());
754 if (state == Exact) {
/* in Exact the result is the constant zero */
755 instr.reset(create_instruction<SOP1_instruction>(bld.w64or32(Builder::s_mov),
756 Format::SOP1, 1, 1));
757 instr->operands[0] = Operand::zero();
758 instr->definitions[0] = dst;
759 } else {
/* otherwise the result is the current exec with the bits of the exact mask
 * (bottom of the stack) removed */
760 std::pair<Operand, uint8_t>& exact_mask = ctx.info[block->index].exec[0];
761 assert(exact_mask.second & mask_type_exact);
762
763 instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2),
764 Format::SOP2, 2, 2));
765 instr->operands[0] = Operand(exec, bld.lm); /* current exec */
766 instr->operands[1] = Operand(exact_mask.first);
767 instr->definitions[0] = dst;
768 instr->definitions[1] = bld.def(s1, scc);
769 }
770 } else if (instr->opcode == aco_opcode::p_demote_to_helper) {
771 /* turn demote into discard_if with only exact masks */
772 assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) ==
773 (mask_type_exact | mask_type_global));
774
/* num is the index of the topmost stack entry the loop below looks at */
775 int num;
776 Temp cond, exit_cond;
777 if (instr->operands[0].isConstant()) {
778 assert(instr->operands[0].constantValue() == -1u);
779 /* transition to exact and set exec to zero */
780 exit_cond = bld.tmp(s1);
781 cond =
782 bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)),
783 Definition(exec, bld.lm), Operand::zero(), Operand(exec, bld.lm));
784
785 num = ctx.info[block->index].exec.size() - 2;
786 if (!(ctx.info[block->index].exec.back().second & mask_type_exact)) {
787 ctx.info[block->index].exec.back().first = Operand(cond);
788 ctx.info[block->index].exec.emplace_back(Operand(bld.lm), mask_type_exact);
789 }
790 } else {
791 /* demote_if: transition to exact */
792 transition_to_Exact(ctx, bld, block->index);
793 assert(instr->operands[0].isTemp());
794 cond = instr->operands[0].getTemp();
795 num = ctx.info[block->index].exec.size() - 1;
796 }
797
/* remove the demoted lanes from every exact mask on the stack, from top to bottom */
798 for (int i = num; i >= 0; i--) {
799 if (ctx.info[block->index].exec[i].second & mask_type_exact) {
800 Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
801 ctx.info[block->index].exec[i].first, cond);
/* the topmost entry is the current mask: operate on the exec register itself */
802 if (i == (int)ctx.info[block->index].exec.size() - 1) {
803 andn2->operands[0] = Operand(exec, bld.lm);
804 andn2->definitions[0] = Definition(exec, bld.lm);
805 }
806
807 ctx.info[block->index].exec[i].first = Operand(andn2->definitions[0].getTemp());
808 exit_cond = andn2->definitions[1].getTemp();
809 } else {
810 assert(i != 0);
811 }
812 }
813 instr->opcode = aco_opcode::p_exit_early_if;
814 instr->operands[0] = bld.scc(exit_cond);
815 state = Exact;
816
817 } else if (instr->opcode == aco_opcode::p_elect) {
818 bool all_lanes_enabled = ctx.info[block->index].exec.back().first.constantEquals(-1u);
819 Definition dst = instr->definitions[0];
820
/* the result is a mask with a single bit set: bit 0 if exec is known to be
 * all ones, otherwise the bit at the index returned by s_ff1_i32 on exec */
821 if (all_lanes_enabled) {
822 bld.copy(Definition(dst), Operand::c32_or_c64(1u, dst.size() == 2));
823 } else {
824 Temp first_lane_idx = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
825 bld.sop2(Builder::s_lshl, Definition(dst), bld.def(s1, scc),
826 Operand::c32_or_c64(1u, dst.size() == 2), Operand(first_lane_idx));
827 }
/* p_elect has been fully replaced; do not insert the original instruction */
828 instr.reset();
829 continue;
830 }
831
832 bld.insert(std::move(instr));
833 }
834 }
835
836 void
add_branch_code(exec_ctx & ctx,Block * block)837 add_branch_code(exec_ctx& ctx, Block* block)
838 {
839 unsigned idx = block->index;
840 Builder bld(ctx.program, block);
841
842 if (idx == ctx.program->blocks.size() - 1)
843 return;
844
845 /* try to disable wqm handling */
846 if (ctx.handle_wqm && block->kind & block_kind_top_level) {
847 if (ctx.info[idx].exec.size() == 3) {
848 assert(ctx.info[idx].exec[1].second == mask_type_wqm);
849 ctx.info[idx].exec.pop_back();
850 }
851 assert(ctx.info[idx].exec.size() <= 2);
852
853 if (ctx.info[idx].ever_again_needs == 0 || ctx.info[idx].ever_again_needs == Exact) {
854 /* transition to Exact */
855 aco_ptr<Instruction> branch = std::move(block->instructions.back());
856 block->instructions.pop_back();
857 ctx.info[idx].exec.back().second |= mask_type_global;
858 transition_to_Exact(ctx, bld, idx);
859 bld.insert(std::move(branch));
860 ctx.handle_wqm = false;
861
862 } else if (ctx.info[idx].block_needs & Preserve_WQM) {
863 /* transition to WQM and remove global flag */
864 aco_ptr<Instruction> branch = std::move(block->instructions.back());
865 block->instructions.pop_back();
866 transition_to_WQM(ctx, bld, idx);
867 ctx.info[idx].exec.back().second &= ~mask_type_global;
868 bld.insert(std::move(branch));
869 }
870 }
871
872 if (block->kind & block_kind_loop_preheader) {
873 /* collect information about the succeeding loop */
874 bool has_divergent_break = false;
875 bool has_divergent_continue = false;
876 bool has_discard = false;
877 uint8_t needs = 0;
878 unsigned loop_nest_depth = ctx.program->blocks[idx + 1].loop_nest_depth;
879
880 for (unsigned i = idx + 1; ctx.program->blocks[i].loop_nest_depth >= loop_nest_depth; i++) {
881 Block& loop_block = ctx.program->blocks[i];
882 needs |= ctx.info[i].block_needs;
883
884 if (loop_block.kind & block_kind_uses_discard_if || loop_block.kind & block_kind_discard ||
885 loop_block.kind & block_kind_uses_demote)
886 has_discard = true;
887 if (loop_block.loop_nest_depth != loop_nest_depth)
888 continue;
889
890 if (loop_block.kind & block_kind_uniform)
891 continue;
892 else if (loop_block.kind & block_kind_break)
893 has_divergent_break = true;
894 else if (loop_block.kind & block_kind_continue)
895 has_divergent_continue = true;
896 }
897
898 if (ctx.handle_wqm) {
899 if (needs & WQM) {
900 aco_ptr<Instruction> branch = std::move(block->instructions.back());
901 block->instructions.pop_back();
902 transition_to_WQM(ctx, bld, idx);
903 bld.insert(std::move(branch));
904 } else {
905 aco_ptr<Instruction> branch = std::move(block->instructions.back());
906 block->instructions.pop_back();
907 transition_to_Exact(ctx, bld, idx);
908 bld.insert(std::move(branch));
909 }
910 }
911
912 unsigned num_exec_masks = ctx.info[idx].exec.size();
913 if (block->kind & block_kind_top_level)
914 num_exec_masks = std::min(num_exec_masks, 2u);
915
916 ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], num_exec_masks, needs,
917 has_divergent_break, has_divergent_continue, has_discard);
918 }
919
920 /* For normal breaks, this is the exec mask. For discard+break, it's the
921 * old exec mask before it was zero'd.
922 */
923 Operand break_cond = Operand(exec, bld.lm);
924
925 if (block->kind & block_kind_discard) {
926
927 assert(block->instructions.back()->isBranch());
928 aco_ptr<Instruction> branch = std::move(block->instructions.back());
929 block->instructions.pop_back();
930
931 /* create a discard_if() instruction with the exec mask as condition */
932 unsigned num = 0;
933 if (ctx.loop.size()) {
934 /* if we're in a loop, only discard from the outer exec masks */
935 num = ctx.loop.back().num_exec_masks;
936 } else {
937 num = ctx.info[idx].exec.size() - 1;
938 }
939
940 Temp cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
941 Definition(exec, bld.lm), Operand::zero(), Operand(exec, bld.lm));
942
943 for (int i = num - 1; i >= 0; i--) {
944 Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
945 get_exec_op(ctx.info[block->index].exec[i].first), cond);
946 if (i == (int)ctx.info[idx].exec.size() - 1)
947 andn2->definitions[0] = Definition(exec, bld.lm);
948 if (i == 0)
949 bld.pseudo(aco_opcode::p_exit_early_if, bld.scc(andn2->definitions[1].getTemp()));
950 ctx.info[block->index].exec[i].first = Operand(andn2->definitions[0].getTemp());
951 }
952 assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
953
954 break_cond = Operand(cond);
955 bld.insert(std::move(branch));
956 /* no return here as it can be followed by a divergent break */
957 }
958
959 if (block->kind & block_kind_continue_or_break) {
960 assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind &
961 block_kind_loop_header);
962 assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind &
963 block_kind_loop_exit);
964 assert(block->instructions.back()->opcode == aco_opcode::p_branch);
965 block->instructions.pop_back();
966
967 bool need_parallelcopy = false;
968 while (!(ctx.info[idx].exec.back().second & mask_type_loop)) {
969 ctx.info[idx].exec.pop_back();
970 need_parallelcopy = true;
971 }
972
973 if (need_parallelcopy)
974 ctx.info[idx].exec.back().first = bld.pseudo(
975 aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
976 bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm),
977 block->linear_succs[1], block->linear_succs[0]);
978 return;
979 }
980
981 if (block->kind & block_kind_uniform) {
982 Pseudo_branch_instruction& branch = block->instructions.back()->branch();
983 if (branch.opcode == aco_opcode::p_branch) {
984 branch.target[0] = block->linear_succs[0];
985 } else {
986 branch.target[0] = block->linear_succs[1];
987 branch.target[1] = block->linear_succs[0];
988 }
989 return;
990 }
991
992 if (block->kind & block_kind_branch) {
993
994 if (ctx.handle_wqm && ctx.info[idx].exec.size() >= 2 &&
995 ctx.info[idx].exec.back().second == mask_type_exact &&
996 !(ctx.info[idx].block_needs & Exact_Branch) &&
997 ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].second & mask_type_wqm) {
998 /* return to wqm before branching */
999 ctx.info[idx].exec.pop_back();
1000 }
1001
1002 // orig = s_and_saveexec_b64
1003 assert(block->linear_succs.size() == 2);
1004 assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_z);
1005 Temp cond = block->instructions.back()->operands[0].getTemp();
1006 block->instructions.pop_back();
1007
1008 if (ctx.info[idx].block_needs & Exact_Branch)
1009 transition_to_Exact(ctx, bld, idx);
1010
1011 uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
1012 if (ctx.info[idx].exec.back().first.constantEquals(-1u)) {
1013 bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), cond);
1014 } else {
1015 Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
1016 Definition(exec, bld.lm), cond, Operand(exec, bld.lm));
1017
1018 ctx.info[idx].exec.back().first = Operand(old_exec);
1019 }
1020
1021 /* add next current exec to the stack */
1022 ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type);
1023
1024 bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm),
1025 block->linear_succs[1], block->linear_succs[0]);
1026 return;
1027 }
1028
1029 if (block->kind & block_kind_invert) {
1030 // exec = s_andn2_b64 (original_exec, exec)
1031 assert(block->instructions.back()->opcode == aco_opcode::p_branch);
1032 block->instructions.pop_back();
1033 assert(ctx.info[idx].exec.size() >= 2);
1034 Operand orig_exec = ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].first;
1035 bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec,
1036 Operand(exec, bld.lm));
1037
1038 bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm),
1039 block->linear_succs[1], block->linear_succs[0]);
1040 return;
1041 }
1042
1043 if (block->kind & block_kind_break) {
1044 // loop_mask = s_andn2_b64 (loop_mask, exec)
1045 assert(block->instructions.back()->opcode == aco_opcode::p_branch);
1046 block->instructions.pop_back();
1047
1048 Temp cond = Temp();
1049 for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
1050 cond = bld.tmp(s1);
1051 Operand exec_mask = ctx.info[idx].exec[exec_idx].first;
1052 exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
1053 exec_mask, break_cond);
1054 ctx.info[idx].exec[exec_idx].first = exec_mask;
1055 if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
1056 break;
1057 }
1058
1059 /* check if the successor is the merge block, otherwise set exec to 0 */
1060 // TODO: this could be done better by directly branching to the merge block
1061 unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
1062 Block& succ = ctx.program->blocks[succ_idx];
1063 if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
1064 bld.copy(Definition(exec, bld.lm), Operand::zero(bld.lm.bytes()));
1065 }
1066
1067 bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond),
1068 block->linear_succs[1], block->linear_succs[0]);
1069 return;
1070 }
1071
1072 if (block->kind & block_kind_continue) {
1073 assert(block->instructions.back()->opcode == aco_opcode::p_branch);
1074 block->instructions.pop_back();
1075
1076 Temp cond = Temp();
1077 for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
1078 if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
1079 break;
1080 cond = bld.tmp(s1);
1081 Operand exec_mask = ctx.info[idx].exec[exec_idx].first;
1082 exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
1083 exec_mask, Operand(exec, bld.lm));
1084 ctx.info[idx].exec[exec_idx].first = exec_mask;
1085 }
1086 assert(cond != Temp());
1087
1088 /* check if the successor is the merge block, otherwise set exec to 0 */
1089 // TODO: this could be done better by directly branching to the merge block
1090 unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
1091 Block& succ = ctx.program->blocks[succ_idx];
1092 if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
1093 bld.copy(Definition(exec, bld.lm), Operand::zero(bld.lm.bytes()));
1094 }
1095
1096 bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond),
1097 block->linear_succs[1], block->linear_succs[0]);
1098 return;
1099 }
1100 }
1101
1102 void
process_block(exec_ctx & ctx,Block * block)1103 process_block(exec_ctx& ctx, Block* block)
1104 {
1105 std::vector<aco_ptr<Instruction>> instructions;
1106 instructions.reserve(block->instructions.size());
1107
1108 unsigned idx = add_coupling_code(ctx, block, instructions);
1109
1110 assert(block->index != ctx.program->blocks.size() - 1 ||
1111 ctx.info[block->index].exec.size() <= 2);
1112
1113 process_instructions(ctx, block, instructions, idx);
1114
1115 block->instructions = std::move(instructions);
1116
1117 add_branch_code(ctx, block);
1118 }
1119
1120 } /* end namespace */
1121
1122 void
insert_exec_mask(Program * program)1123 insert_exec_mask(Program* program)
1124 {
1125 exec_ctx ctx(program);
1126
1127 if (program->needs_wqm && program->needs_exact)
1128 calculate_wqm_needs(ctx);
1129
1130 for (Block& block : program->blocks)
1131 process_block(ctx, &block);
1132 }
1133
1134 } // namespace aco
1135