/*
 * Copyright © 2019 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include <algorithm>
#include <map>
#include <unordered_map>
#include <vector>

/*
 * Implements an algorithm to lower the IR to Conventional SSA Form (CSSA),
 * after "Revisiting Out-of-SSA Translation for Correctness, Code Quality, and Efficiency"
 * by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin, C. Guillon.
 *
 * By lowering the IR to CSSA, the insertion of parallelcopies is separated from
 * the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling.
 * The algorithm coalesces non-interfering phi-resources while taking value-equality
 * into account. It also re-indexes the SSA definitions.
 */
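
/*
 * Illustrative sketch (temporary names are made up): a phi such as
 *
 *    BB3: %res = p_phi %a (from BB1), %b (from BB2)
 *
 * is rewritten by collect_parallelcopies() so that each operand is first
 * copied into a fresh temporary at the end of its predecessor block:
 *
 *    BB1: %a' = p_parallelcopy %a
 *    BB2: %b' = p_parallelcopy %b
 *    BB3: %res = p_phi %a', %b'
 *
 * and {%res, %a', %b'} forms one merge set. emit_parallelcopies() then either
 * coalesces these copies away or materializes them.
 */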

namespace aco {
namespace {

typedef std::vector<Temp> merge_set;

struct copy {
   Definition def;
   Operand op;
};

struct merge_node {
   Operand value = Operand(); /* original value: can be an SSA-def or constant value */
   uint32_t index = -1u;      /* index into the vector of merge sets */
   uint32_t defined_at = -1u; /* defining block */

   /* we also remember two dominating defs with the same value: */
   Temp equal_anc_in = Temp();  /* within the same merge set */
   Temp equal_anc_out = Temp(); /* from a different set */
};

struct cssa_ctx {
   Program* program;
   std::vector<IDSet>& live_out;                  /* live-out sets per block */
   std::vector<std::vector<copy>> parallelcopies; /* copies per block */
   std::vector<merge_set> merge_sets;             /* each vector is one (ordered) merge set */
   std::unordered_map<uint32_t, merge_node> merge_node_table; /* tempid -> merge node */
};

/* create (virtual) parallelcopies for each phi instruction and
 * already merge copy-definitions with phi-defs into merge sets */
void
collect_parallelcopies(cssa_ctx& ctx)
{
   ctx.parallelcopies.resize(ctx.program->blocks.size());
   Builder bld(ctx.program);
   for (Block& block : ctx.program->blocks) {
      for (aco_ptr<Instruction>& phi : block.instructions) {
         if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
            break;

         const Definition& def = phi->definitions[0];

         /* if the definition is not temp, it is the exec mask.
          * We can reload the exec mask directly from the spill slot.
          */
         if (!def.isTemp())
            continue;

         std::vector<unsigned>& preds =
            phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
         uint32_t index = ctx.merge_sets.size();
         merge_set set;

         bool has_preheader_copy = false;
         for (unsigned i = 0; i < phi->operands.size(); i++) {
            Operand op = phi->operands[i];
            if (op.isUndefined())
               continue;

            if (def.regClass().type() == RegType::sgpr && !op.isTemp()) {
               /* SGPR inline constants and literals on GFX10+ can be spilled
                * and reloaded directly (without intermediate register) */
               if (op.isConstant()) {
                  if (ctx.program->chip_class >= GFX10)
                     continue;
                  if (op.size() == 1 && !op.isLiteral())
                     continue;
               } else {
                  assert(op.isFixed() && op.physReg() == exec);
                  continue;
               }
            }

            /* create new temporary and rename operands */
            Temp tmp = bld.tmp(def.regClass());
            ctx.parallelcopies[preds[i]].emplace_back(copy{Definition(tmp), op});
            phi->operands[i] = Operand(tmp);

            /* place the new operands in the same merge set */
            set.emplace_back(tmp);
            ctx.merge_node_table[tmp.id()] = {op, index, preds[i]};

            /* update the liveness information */
            if (op.isKill())
               ctx.live_out[preds[i]].erase(op.tempId());
            ctx.live_out[preds[i]].insert(tmp.id());

            has_preheader_copy |= i == 0 && block.kind & block_kind_loop_header;
         }

         if (set.empty())
            continue;

         /* place the definition in dominance-order */
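         /* Note: for loop headers, the phi definition dominates the copies on
          * the back-edges but is itself dominated by the preheader copy, so it
          * goes after the preheader copy (if any) and before the rest. */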
         if (def.isTemp()) {
            if (has_preheader_copy)
               set.emplace(std::next(set.begin()), def.getTemp());
            else if (block.kind & block_kind_loop_header)
               set.emplace(set.begin(), def.getTemp());
            else
               set.emplace_back(def.getTemp());
            ctx.merge_node_table[def.tempId()] = {Operand(def.getTemp()), index, block.index};
         }
         ctx.merge_sets.emplace_back(set);
      }
   }
}

/* check whether the definition of a comes after b. */
inline bool
defined_after(cssa_ctx& ctx, Temp a, Temp b)
{
   merge_node& node_a = ctx.merge_node_table[a.id()];
   merge_node& node_b = ctx.merge_node_table[b.id()];
   if (node_a.defined_at == node_b.defined_at)
      return a.id() > b.id();

   return node_a.defined_at > node_b.defined_at;
}

/* check whether a dominates b where b is defined after a */
inline bool
dominates(cssa_ctx& ctx, Temp a, Temp b)
{
   assert(defined_after(ctx, b, a));
   merge_node& node_a = ctx.merge_node_table[a.id()];
   merge_node& node_b = ctx.merge_node_table[b.id()];
   unsigned idom = node_b.defined_at;
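   /* walk up the dominator tree from b's defining block until we reach a block
    * at or above a's defining block; a dominates b iff we land exactly on
    * a's defining block */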
   while (idom > node_a.defined_at)
      idom = b.regClass().type() == RegType::vgpr ? ctx.program->blocks[idom].logical_idom
                                                  : ctx.program->blocks[idom].linear_idom;

   return idom == node_a.defined_at;
}

/* check intersection between var and parent:
 * We already know that parent dominates var. */
inline bool
intersects(cssa_ctx& ctx, Temp var, Temp parent)
{
   merge_node& node_var = ctx.merge_node_table[var.id()];
   merge_node& node_parent = ctx.merge_node_table[parent.id()];
   assert(node_var.index != node_parent.index);
   uint32_t block_idx = node_var.defined_at;

   /* if the parent is live-out at the definition block of var, they intersect */
   bool parent_live = ctx.live_out[block_idx].count(parent.id());
   if (parent_live)
      return true;

   /* parent is defined in a different block than var */
   if (node_parent.defined_at < node_var.defined_at) {
      /* if the parent is not live-in, they don't interfere */
      std::vector<uint32_t>& preds = var.type() == RegType::vgpr
                                        ? ctx.program->blocks[block_idx].logical_preds
                                        : ctx.program->blocks[block_idx].linear_preds;
      for (uint32_t pred : preds) {
         if (!ctx.live_out[pred].count(parent.id()))
            return false;
      }
   }

   for (const copy& cp : ctx.parallelcopies[block_idx]) {
      /* if var is defined at the edge, they don't intersect */
      if (cp.def.getTemp() == var)
         return false;
      if (cp.op.isTemp() && cp.op.getTemp() == parent)
         parent_live = true;
   }
   /* if the parent is live at the edge, they intersect */
   if (parent_live)
      return true;

   /* at this point, both parent and var are present in the same block */
   const Block& block = ctx.program->blocks[block_idx];
   for (auto it = block.instructions.crbegin(); it != block.instructions.crend(); ++it) {
      /* if the parent was not encountered yet, it can only be used by a phi */
      if (is_phi(it->get()))
         break;

      for (const Definition& def : (*it)->definitions) {
         if (!def.isTemp())
            continue;
         /* if parent was not found yet, they don't intersect */
         if (def.getTemp() == var)
            return false;
      }

      for (const Operand& op : (*it)->operands) {
         if (!op.isTemp())
            continue;
         /* if the var was defined before this point, they intersect */
         if (op.getTemp() == parent)
            return true;
      }
   }

   return false;
}

/* check interference between var and parent:
 * i.e. they have different values and intersect.
 * If parent and var share the same value, also updates the equal ancestor. */
inline bool
interference(cssa_ctx& ctx, Temp var, Temp parent)
{
   assert(var != parent);
   merge_node& node_var = ctx.merge_node_table[var.id()];
   node_var.equal_anc_out = Temp();

   if (node_var.index == ctx.merge_node_table[parent.id()].index) {
      /* check/update in other set */
      parent = ctx.merge_node_table[parent.id()].equal_anc_out;
   }

   Temp tmp = parent;
   /* check if var intersects with parent or any equal-valued ancestor */
   while (tmp != Temp() && !intersects(ctx, var, tmp)) {
      merge_node& node_tmp = ctx.merge_node_table[tmp.id()];
      tmp = node_tmp.equal_anc_in;
   }

   /* no intersection found */
   if (tmp == Temp())
      return false;

   /* var and parent, same value, but in different sets */
   if (node_var.value == ctx.merge_node_table[parent.id()].value) {
      node_var.equal_anc_out = tmp;
      return false;
   }

   /* var and parent, different values and intersect */
   return true;
}

/* tries to merge set_b into set_a of given temporary and
 * drops that temporary as it is being coalesced */
bool
try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b)
{
   auto def_node_it = ctx.merge_node_table.find(dst.id());
   uint32_t index = def_node_it->second.index;
   merge_set& set_a = ctx.merge_sets[index];
   std::vector<Temp> dom; /* stack of the traversal */
   merge_set union_set;   /* the new merged merge-set */
   uint32_t i_a = 0;
   uint32_t i_b = 0;

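   /* Interference test from the paper: iterate over both (dominance-ordered)
    * sets simultaneously while maintaining a stack of dominating definitions.
    * Each element only has to be checked against its nearest dominating
    * ancestor on the stack. */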
   while (i_a < set_a.size() || i_b < set_b.size()) {
      Temp current;
      if (i_a == set_a.size())
         current = set_b[i_b++];
      else if (i_b == set_b.size())
         current = set_a[i_a++];
      /* else pick the one defined first */
      else if (defined_after(ctx, set_a[i_a], set_b[i_b]))
         current = set_b[i_b++];
      else
         current = set_a[i_a++];

      while (!dom.empty() && !dominates(ctx, dom.back(), current))
         dom.pop_back(); /* not the desired parent, remove */

      if (!dom.empty() && interference(ctx, current, dom.back()))
         return false; /* intersection detected */

      dom.emplace_back(current); /* otherwise, keep checking */
      if (current != dst)
         union_set.emplace_back(current); /* maintain the new merge-set sorted */
   }

   /* update hashmap */
   for (Temp t : union_set) {
      merge_node& node = ctx.merge_node_table[t.id()];
      /* update the equal ancestors:
       * i.e. the 'closest' dominating def with the same value */
      Temp in = node.equal_anc_in;
      Temp out = node.equal_anc_out;
      if (in == Temp() || (out != Temp() && defined_after(ctx, out, in)))
         node.equal_anc_in = out;
      node.equal_anc_out = Temp();
      /* update merge-set index */
      node.index = index;
   }
   set_b = merge_set(); /* free the old set_b */
   ctx.merge_sets[index] = union_set;
   ctx.merge_node_table.erase(dst.id()); /* remove the temporary */

   return true;
}

/* returns true if the copy can safely be omitted */
bool
try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx)
{
   /* we can only coalesce temporaries */
   if (!copy.op.isTemp())
      return false;

   /* try to emplace a merge_node for the copy operand */
   merge_node& op_node = ctx.merge_node_table[copy.op.tempId()];
   if (op_node.defined_at == -1u) {
      /* find defining block of operand */
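      /* walk up the dominator tree as long as the operand is live-out of the
       * immediate dominator: the first block whose idom does not have the
       * operand live-out must contain the definition */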
      uint32_t pred = block_idx;
      do {
         block_idx = pred;
         pred = copy.op.regClass().type() == RegType::vgpr ? ctx.program->blocks[pred].logical_idom
                                                           : ctx.program->blocks[pred].linear_idom;
      } while (block_idx != pred && ctx.live_out[pred].count(copy.op.tempId()));
      op_node.defined_at = block_idx;
      op_node.value = copy.op;
   }

   /* we can only coalesce copies of the same register class */
   if (copy.op.regClass() != copy.def.regClass())
      return false;

   /* check if this operand has not yet been coalesced */
   if (op_node.index == -1u) {
      merge_set op_set = merge_set{copy.op.getTemp()};
      return try_merge_merge_set(ctx, copy.def.getTemp(), op_set);
   }

   /* check if this operand has been coalesced into the same set */
   assert(ctx.merge_node_table.count(copy.def.tempId()));
   if (op_node.index == ctx.merge_node_table[copy.def.tempId()].index)
      return true;

   /* otherwise, try to coalesce both merge sets */
   return try_merge_merge_set(ctx, copy.def.getTemp(), ctx.merge_sets[op_node.index]);
}

/* node in the location-transfer-graph */
struct ltg_node {
   copy cp;
   uint32_t read_idx;     /* merge-set index of the copy's operand */
   uint32_t num_uses = 0; /* number of pending copies that still read the register this copy writes */
};

/* emit the copies in an order that does not
 * create interferences within a merge-set */
void
emit_copies_block(Builder& bld, std::map<uint32_t, ltg_node>& ltg, RegType type)
{
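   /* greedily emit every copy whose destination is no longer read by another
    * pending copy, then re-scan from the beginning since emitting a copy may
    * have unblocked earlier entries */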
   auto&& it = ltg.begin();
   while (it != ltg.end()) {
      const copy& cp = it->second.cp;
      /* wrong regclass or still needed as operand */
      if (cp.def.regClass().type() != type || it->second.num_uses > 0) {
         ++it;
         continue;
      }

      /* emit the copy */
      bld.copy(cp.def, it->second.cp.op);

      /* update the location transfer graph */
      if (it->second.read_idx != -1u) {
         auto&& other = ltg.find(it->second.read_idx);
         if (other != ltg.end())
            other->second.num_uses--;
      }
      ltg.erase(it);
      it = ltg.begin();
   }

   /* count the number of remaining circular dependencies */
   unsigned num = std::count_if(ltg.begin(), ltg.end(),
                                [&](auto& n) { return n.second.cp.def.regClass().type() == type; });

   /* if there are circular dependencies, we just emit them as a single parallelcopy */
   if (num) {
      // TODO: this should be restricted to a feasible number of registers
      // and otherwise use a temporary to avoid having to reload more (spilled)
      // variables than we have registers.
      aco_ptr<Pseudo_instruction> copy{create_instruction<Pseudo_instruction>(
         aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)};
      it = ltg.begin();
      for (unsigned i = 0; i < num; i++) {
         while (it->second.cp.def.regClass().type() != type)
            ++it;

         copy->definitions[i] = it->second.cp.def;
         copy->operands[i] = it->second.cp.op;
         it = ltg.erase(it);
      }
      bld.insert(std::move(copy));
   }
}

/* either emits or coalesces all parallelcopies and
 * renames the phi-operands accordingly. */
void
emit_parallelcopies(cssa_ctx& ctx)
{
   std::unordered_map<uint32_t, Operand> renames;

   /* we iterate backwards to prioritize coalescing in else-blocks */
   for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) {
      if (ctx.parallelcopies[i].empty())
         continue;

      std::map<uint32_t, ltg_node> ltg;
      bool has_vgpr_copy = false;
      bool has_sgpr_copy = false;

      /* first, try to coalesce all parallelcopies */
      for (const copy& cp : ctx.parallelcopies[i]) {
         if (try_coalesce_copy(ctx, cp, i)) {
            renames.emplace(cp.def.tempId(), cp.op);
            /* update liveness info */
            ctx.live_out[i].erase(cp.def.tempId());
            ctx.live_out[i].insert(cp.op.tempId());
         } else {
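            /* the copy could not be coalesced away: record it in the
             * location-transfer-graph, keyed by the merge-set index of its
             * definition, so that emit_copies_block() can order it correctly */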
            uint32_t read_idx = -1u;
            if (cp.op.isTemp())
               read_idx = ctx.merge_node_table[cp.op.tempId()].index;
            uint32_t write_idx = ctx.merge_node_table[cp.def.tempId()].index;
            assert(write_idx != -1u);
            ltg[write_idx] = {cp, read_idx};

            bool is_vgpr = cp.def.regClass().type() == RegType::vgpr;
            has_vgpr_copy |= is_vgpr;
            has_sgpr_copy |= !is_vgpr;
         }
      }

      /* build location-transfer-graph */
      for (auto& pair : ltg) {
         if (pair.second.read_idx == -1u)
            continue;
         auto&& it = ltg.find(pair.second.read_idx);
         if (it != ltg.end())
            it->second.num_uses++;
      }

      /* emit parallelcopies ordered */
      Builder bld(ctx.program);
      Block& block = ctx.program->blocks[i];

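      /* VGPR copies belong to the logical CFG, so they are inserted right
       * before p_logical_end; SGPR copies only belong to the linear CFG and
       * can go at the very end of the block, just before the branch. */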
      if (has_vgpr_copy) {
         /* emit VGPR copies */
         auto IsLogicalEnd = [](const aco_ptr<Instruction>& inst) -> bool
         { return inst->opcode == aco_opcode::p_logical_end; };
         auto it =
            std::find_if(block.instructions.rbegin(), block.instructions.rend(), IsLogicalEnd);
         bld.reset(&block.instructions, std::prev(it.base()));
         emit_copies_block(bld, ltg, RegType::vgpr);
      }

      if (has_sgpr_copy) {
         /* emit SGPR copies */
         aco_ptr<Instruction> branch = std::move(block.instructions.back());
         block.instructions.pop_back();
         bld.reset(&block.instructions);
         emit_copies_block(bld, ltg, RegType::sgpr);
         bld.insert(std::move(branch));
      }
   }

   /* finally, rename coalesced phi operands */
   for (Block& block : ctx.program->blocks) {
      for (aco_ptr<Instruction>& phi : block.instructions) {
         if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
            break;

         for (Operand& op : phi->operands) {
            if (!op.isTemp())
               continue;
            auto&& it = renames.find(op.tempId());
            if (it != renames.end()) {
               op = it->second;
               renames.erase(it);
            }
         }
      }
   }
   assert(renames.empty());
}

} /* end namespace */

void
lower_to_cssa(Program* program, live& live_vars)
{
   reindex_ssa(program, live_vars.live_out);
   cssa_ctx ctx = {program, live_vars.live_out};
   collect_parallelcopies(ctx);
   emit_parallelcopies(ctx);

   /* update live variable information */
   live_vars = live_var_analysis(program);
}
} // namespace aco