/*
 * Copyright (C) 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_compiler.h"
#include "ir3_ra.h"
#include "ralloc.h"
/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch
 * of parallelcopies to trivially turn the program into CSSA form. Then we
 * try to "merge" SSA defs into "merge sets" which could be allocated to a
 * single register in order to eliminate copies. First we merge phi nodes,
 * which should always succeed because of the parallelcopies we inserted, and
 * then we try to coalesce the copies we introduced.
 *
 * The merged registers are used for three purposes:
 *
 * 1. We always use the same pvtmem slot for spilling all SSA defs in each
 * merge set. This prevents us from having to insert memory-to-memory copies
 * in the spiller and makes sure we don't insert unnecessary copies.
 * 2. When two values are live at the same time, part of the same merge
 * set, and they overlap each other in the merge set, they always occupy
 * overlapping physical registers in RA. This reduces register pressure and
 * copies in several important scenarios:
 *	- When sources of a collect are used later by something else, we don't
 *	have to introduce copies.
 *	- We can handle sequences of extracts that "explode" a vector into its
 *	components without any additional copying.
 * 3. We use the merge sets for affinities in register allocation: that is, we
 * try to allocate all the definitions in the same merge set to the
 * same/compatible registers. This helps us e.g. allocate sources of a collect
 * to contiguous registers without too much special code in RA.
 *
 * In a "normal" register allocator, or when spilling, we'd just merge
 * registers in the same merge set to the same register, but with SSA-based
 * register allocation we may have to split the live interval.
 *
 * The implementation is based on "Revisiting Out-of-SSA Translation for
 * Correctness, Code Quality, and Efficiency," and is broadly similar to the
 * implementation in nir_from_ssa, with the twist that we also try to coalesce
 * META_SPLIT and META_COLLECT. This makes this pass more complicated but
 * prevents us from needing to handle these specially in RA and the spiller,
 * which are already complicated enough. This also forces us to implement the
 * value-comparison optimization the paper explains, as without it we wouldn't
 * be able to coalesce META_SPLIT even in the simplest of cases.
 */
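
/* For illustration: CSSA conversion rewrites each phi to read fresh defs
 * written by a parallelcopy at the end of each predecessor. Sketch in
 * pseudo-IR (the ssa_* names are made up):
 *
 *    pred0: ssa_1' = parallel.copy ssa_1
 *    pred1: ssa_2' = parallel.copy ssa_2
 *    succ:  ssa_3  = phi(ssa_1', ssa_2')
 *
 * Each primed def is live only on its own incoming edge, so the set
 * {ssa_1', ssa_2', ssa_3} never self-interferes and phi merging always
 * succeeds.
 */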

/* In order to dynamically reconstruct the dominance forest, we need the
 * instructions ordered by a preorder traversal of the dominance tree:
 */

static unsigned
index_instrs(struct ir3_block *block, unsigned index)
{
   foreach_instr (instr, &block->instr_list)
      instr->ip = index++;

   for (unsigned i = 0; i < block->dom_children_count; i++)
      index = index_instrs(block->dom_children[i], index);

   return index;
}

/* Definitions within a merge set are ordered by instr->ip as set above: */

static bool
def_after(struct ir3_register *a, struct ir3_register *b)
{
   return a->instr->ip > b->instr->ip;
}

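/* Whether a's definition dominates b's: a must come strictly earlier within
 * the same block, or a's block must dominate b's block.
 */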
static bool
def_dominates(struct ir3_register *a, struct ir3_register *b)
{
   if (def_after(a, b)) {
      return false;
   } else if (a->instr->block == b->instr->block) {
      return def_after(b, a);
   } else {
      return ir3_block_dominates(a->instr->block, b->instr->block);
   }
}

/* This represents a region inside a register. The offset is relative to the
 * start of the register, and offset + size <= size(reg).
 */
struct def_value {
   struct ir3_register *reg;
   unsigned offset, size;
};

/* Chase any copies to get the source of a region inside a register. This is
 * Value(a) in the paper.
 */
static struct def_value
chase_copies(struct def_value value)
{
   while (true) {
      struct ir3_instruction *instr = value.reg->instr;
      if (instr->opc == OPC_META_SPLIT) {
         value.offset += instr->split.off * reg_elem_size(value.reg);
         value.reg = instr->srcs[0]->def;
      } else if (instr->opc == OPC_META_COLLECT) {
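         /* We can only chase through the collect if the region lines up
          * with exactly one of its sources: element-aligned, no larger
          * than one element, and within bounds.
          */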
         if (value.offset % reg_elem_size(value.reg) != 0 ||
             value.size > reg_elem_size(value.reg) ||
             value.offset + value.size > reg_size(value.reg))
            break;
         struct ir3_register *src =
            instr->srcs[value.offset / reg_elem_size(value.reg)];
         if (!src->def)
            break;
         value.offset = 0;
         value.reg = src->def;
      } else {
         /* TODO: parallelcopy */
         break;
      }
   }

   return value;
}

/* This represents an entry in the merge set, and consists of a register +
 * offset from the merge set base.
 */
struct merge_def {
   struct ir3_register *reg;
   unsigned offset;
};

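/* Example of the value-comparison optimization at work: if two defs are
 * both splits of the same component of the same vector, chase_copies()
 * maps both to the same (reg, offset), so they trivially don't interfere
 * even though their live ranges overlap.
 */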
static bool
can_skip_interference(const struct merge_def *a, const struct merge_def *b)
{
   unsigned a_start = a->offset;
   unsigned b_start = b->offset;
   unsigned a_end = a_start + reg_size(a->reg);
   unsigned b_end = b_start + reg_size(b->reg);

   /* Registers that don't overlap never interfere */
   if (a_end <= b_start || b_end <= a_start)
      return true;

   /* Disallow skipping interference unless one definition contains the
    * other. This restriction is important for register allocation, because
    * it means that at any given point in the program, the live values in a
    * given merge set will form a tree. If they didn't, then one live value
    * would partially overlap another, and they would have overlapping live
    * ranges because they're live at the same point. This simplifies register
    * allocation and spilling.
    */
   if (!((a_start <= b_start && a_end >= b_end) ||
         (b_start <= a_start && b_end >= a_end)))
      return false;

   /* For each register, chase the intersection of a and b to find the
    * ultimate source.
    */
   unsigned start = MAX2(a_start, b_start);
   unsigned end = MIN2(a_end, b_end);
   struct def_value a_value = chase_copies((struct def_value){
      .reg = a->reg,
      .offset = start - a_start,
      .size = end - start,
   });
   struct def_value b_value = chase_copies((struct def_value){
      .reg = b->reg,
      .offset = start - b_start,
      .size = end - start,
   });
   return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
}

static struct ir3_merge_set *
get_merge_set(struct ir3_register *def)
{
   if (def->merge_set)
      return def->merge_set;

   struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
   set->preferred_reg = ~0;
   set->interval_start = ~0;
   set->spill_slot = ~0;
   set->size = reg_size(def);
   set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
   set->regs_count = 1;
   set->regs = ralloc(set, struct ir3_register *);
   set->regs[0] = def;

   return set;
}

/* Merges b into a */
static struct ir3_merge_set *
merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_merge_sets(b, a, -b_offset);

   struct ir3_register **new_regs =
      rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);

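   /* Merge the two ip-sorted register lists so that the combined list stays
    * in dominance pre-order, which merge_sets_interfere() relies on.
    */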
   unsigned a_index = 0, b_index = 0, new_index = 0;
   for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
      if (b_index < b->regs_count &&
          (a_index == a->regs_count ||
           def_after(a->regs[a_index], b->regs[b_index]))) {
         new_regs[new_index] = b->regs[b_index++];
         new_regs[new_index]->merge_set_offset += b_offset;
      } else {
         new_regs[new_index] = a->regs[a_index++];
      }
      new_regs[new_index]->merge_set = a;
   }

   assert(new_index == a->regs_count + b->regs_count);

   /* Technically this should be the lcm, but since alignment is only ever
    * 1 or 2 so far, the max works out the same.
    */
   a->alignment = MAX2(a->alignment, b->alignment);
   a->regs_count += b->regs_count;
   ralloc_free(a->regs);
   a->regs = new_regs;
   a->size = MAX2(a->size, b->size + b_offset);

   return a;
}

static bool
merge_sets_interfere(struct ir3_liveness *live, struct ir3_merge_set *a,
                     struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_sets_interfere(live, b, a, -b_offset);

   struct merge_def dom[a->regs_count + b->regs_count];
   unsigned a_index = 0, b_index = 0;
   int dom_index = -1;

   /* Reject trying to merge the sets if the alignment doesn't work out */
   if (b_offset % a->alignment != 0)
      return true;

   while (a_index < a->regs_count || b_index < b->regs_count) {
      struct merge_def current;
      if (a_index == a->regs_count) {
         current.reg = b->regs[b_index];
         current.offset = current.reg->merge_set_offset + b_offset;
         b_index++;
      } else if (b_index == b->regs_count) {
         current.reg = a->regs[a_index];
         current.offset = current.reg->merge_set_offset;
         a_index++;
      } else {
         if (def_after(b->regs[b_index], a->regs[a_index])) {
            current.reg = a->regs[a_index];
            current.offset = current.reg->merge_set_offset;
            a_index++;
         } else {
            current.reg = b->regs[b_index];
            current.offset = current.reg->merge_set_offset + b_offset;
            b_index++;
         }
      }

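      /* Defs arrive in dominance pre-order, so popping everything that
       * doesn't dominate current leaves the stack holding exactly
       * current's dominators from both sets.
       */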
      while (dom_index >= 0 &&
             !def_dominates(dom[dom_index].reg, current.reg)) {
         dom_index--;
      }

      /* TODO: in the original paper, just dom[dom_index] needs to be
       * checked for interference. We implement the value-chasing extension
       * as well as support for sub-registers, which complicates this
       * significantly because it's no longer the case that if a dominates b
       * dominates c and a and b don't interfere then we only need to check
       * interference between b and c to be sure a and c don't interfere --
       * this means we may have to check for interference against values
       * higher in the stack than dom[dom_index]. In the paper there's a
       * description of a way to do fewer interference tests with the
       * value-chasing extension, but we'd have to come up with something
       * ourselves for handling the similar problems that come up with
       * allowing values to contain subregisters. For now we just test
       * everything in the stack.
       */
      for (int i = 0; i <= dom_index; i++) {
         if (can_skip_interference(&current, &dom[i]))
            continue;

         /* Ok, now we actually have to check interference. Since we know
          * that dom[i] dominates current, this boils down to checking
          * whether dom[i] is live after current.
          */
         if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
            return true;
      }

      dom[++dom_index] = current;
   }

   return false;
}

static void
try_merge_defs(struct ir3_liveness *live, struct ir3_register *a,
               struct ir3_register *b, unsigned b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set) {
      /* Note: even in this case we may not manage to coalesce the copy if
       * the offsets don't line up, but either way there's nothing more we
       * can do here.
       */
      return;
   }

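   /* We want b to land at offset a->merge_set_offset + b_offset within the
    * merged set, and b already sits at b->merge_set_offset within its own
    * set, so shift b's whole set by the difference.
    */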
   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;

   if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
      merge_merge_sets(a_set, b_set, b_set_offset);
}

void
ir3_force_merge(struct ir3_register *a, struct ir3_register *b, int b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set)
      return;

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
   merge_merge_sets(a_set, b_set, b_set_offset);
}

static void
coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi)
{
   for (unsigned i = 0; i < phi->srcs_count; i++) {
      if (phi->srcs[i]->def)
         try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
   }
}

static void
aggressive_coalesce_parallel_copy(struct ir3_liveness *live,
                                  struct ir3_instruction *pcopy)
{
   for (unsigned i = 0; i < pcopy->dsts_count; i++) {
      if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
   }
}

static void
aggressive_coalesce_split(struct ir3_liveness *live,
                          struct ir3_instruction *split)
{
   try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
                  split->split.off * reg_elem_size(split->dsts[0]));
}

static void
aggressive_coalesce_collect(struct ir3_liveness *live,
                            struct ir3_instruction *collect)
{
   for (unsigned i = 0, offset = 0; i < collect->srcs_count;
        offset += reg_elem_size(collect->srcs[i]), i++) {
      if (!(collect->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
   }
}

static void
create_parallel_copy(struct ir3_block *block)
{
   for (unsigned i = 0; i < 2; i++) {
      if (!block->successors[i])
         continue;

      struct ir3_block *succ = block->successors[i];

      unsigned pred_idx = ir3_block_get_pred_index(succ, block);

      unsigned phi_count = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;

         /* Avoid undef */
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;

         /* We don't support critical edges. If we were to support them,
          * we'd need to insert parallel copies after the phi node to solve
          * the lost-copy problem.
          */
         assert(i == 0 && !block->successors[1]);
         phi_count++;
      }

      if (phi_count == 0)
         continue;

      struct ir3_register *src[phi_count];
      unsigned j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         src[j++] = phi->srcs[pred_idx];
      }
      assert(j == phi_count);

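      /* A single parallelcopy at the end of the predecessor defines all of
       * the new phi sources at once, so the copies happen "in parallel"
       * and can't clobber one another; this is what makes the result CSSA.
       */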
      struct ir3_instruction *pcopy =
         ir3_instr_create(block, OPC_META_PARALLEL_COPY, phi_count, phi_count);

      for (j = 0; j < phi_count; j++) {
         struct ir3_register *reg = __ssa_dst(pcopy);
         reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
         reg->size = src[j]->size;
         reg->wrmask = src[j]->wrmask;
      }

      for (j = 0; j < phi_count; j++) {
         pcopy->srcs[pcopy->srcs_count++] =
            ir3_reg_clone(block->shader, src[j]);
      }

      j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         phi->srcs[pred_idx]->def = pcopy->dsts[j];
         phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
         j++;
      }
      assert(j == phi_count);
   }
}

void
ir3_create_parallel_copies(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      create_parallel_copy(block);
   }
}

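/* Assign each def an interval in a virtual index space such that all defs
 * in a merge set share one contiguous interval, with each def placed at its
 * merge_set_offset within it. This lets later passes track liveness of a
 * whole merge set as a single range of indices.
 */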
static void
index_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
{
   unsigned offset = 0;
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            unsigned dst_offset;
            struct ir3_merge_set *merge_set = dst->merge_set;
            unsigned size = reg_size(dst);
            if (merge_set) {
               if (merge_set->interval_start == ~0) {
                  merge_set->interval_start = offset;
                  offset += merge_set->size;
               }
               dst_offset = merge_set->interval_start + dst->merge_set_offset;
            } else {
               dst_offset = offset;
               offset += size;
            }

            dst->interval_start = dst_offset;
            dst->interval_end = dst_offset + size;
         }
      }
   }

   live->interval_offset = offset;
}

#define RESET      "\x1b[0m"
#define BLUE       "\x1b[0;34m"
#define SYN_SSA(x) BLUE x RESET

static void
dump_merge_sets(struct ir3 *ir)
{
   d("merge sets:");
   struct set *merge_sets = _mesa_pointer_set_create(NULL);
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            struct ir3_merge_set *merge_set = dst->merge_set;
            if (!merge_set || _mesa_set_search(merge_sets, merge_set))
               continue;

            d("merge set, size %u, align %u:", merge_set->size,
              merge_set->alignment);
            for (unsigned j = 0; j < merge_set->regs_count; j++) {
               struct ir3_register *reg = merge_set->regs[j];
               d("\t" SYN_SSA("ssa_%u") ":%u, offset %u",
                 reg->instr->serialno, reg->name, reg->merge_set_offset);
            }

            _mesa_set_add(merge_sets, merge_set);
         }
      }
   }

   ralloc_free(merge_sets);
}

void
ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir)
{
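   /* Number instructions in dominance-tree pre-order first; both coalescing
    * passes below depend on def_after()/def_dominates() comparing these
    * indices.
    */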
   index_instrs(ir3_start_block(ir), 0);

   /* First pass: coalesce phis, which must be together. */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (instr->opc != OPC_META_PHI)
            break;

         coalesce_phi(live, instr);
      }
   }

   /* Second pass: aggressively coalesce parallelcopy, split, collect */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         switch (instr->opc) {
         case OPC_META_SPLIT:
            aggressive_coalesce_split(live, instr);
            break;
         case OPC_META_COLLECT:
            aggressive_coalesce_collect(live, instr);
            break;
         case OPC_META_PARALLEL_COPY:
            aggressive_coalesce_parallel_copy(live, instr);
            break;
         default:
            break;
         }
      }
   }

   index_merge_sets(live, ir);

   if (ir3_shader_debug & IR3_DBG_RAMSGS)
      dump_merge_sets(ir);
}