/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/dag.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"
#include "ir3_context.h"

#ifdef DEBUG
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif
#define d(fmt, ...)                                                            \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         mesa_logi("PSCHED: " fmt, ##__VA_ARGS__);                             \
      }                                                                        \
   } while (0)

#define di(instr, fmt, ...)                                                    \
   do {                                                                        \
      if (SCHED_DEBUG) {                                                       \
         struct log_stream *stream = mesa_log_streami();                       \
         mesa_log_stream_printf(stream, "PSCHED: " fmt ": ", ##__VA_ARGS__);   \
         ir3_print_instr_stream(stream, instr);                                \
         mesa_log_stream_destroy(stream);                                      \
      }                                                                        \
   } while (0)

/*
 * Post RA Instruction Scheduling
 */

struct ir3_postsched_ctx {
   struct ir3 *ir;

   struct ir3_shader_variant *v;

   void *mem_ctx;
   struct ir3_block *block; /* the current block */
   struct dag *dag;

   struct list_head unscheduled_list; /* unscheduled instructions */

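   /* Approximate countdown (in scheduled instructions) since the last SFU
    * or tex instruction was scheduled, used to guess whether consuming one
    * of their results now would require a costly (ss)/(sy) sync:
    */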
   int sfu_delay;
   int tex_delay;
};

struct ir3_postsched_node {
   struct dag_node dag; /* must be first for util_dynarray_foreach */
   struct ir3_instruction *instr;
   bool partially_evaluated_path;

   bool has_tex_src, has_sfu_src;

   unsigned delay;
   unsigned max_delay;
};

#define foreach_sched_node(__n, __list)                                        \
   list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link)

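/* Helpers to read the per-node flags (computed during dependency
 * calculation) recording whether any of an instruction's sources are
 * produced by a tex or SFU instruction:
 */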
static bool
has_tex_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_tex_src;
}

static bool
has_sfu_src(struct ir3_instruction *instr)
{
   struct ir3_postsched_node *node = instr->data;
   return node->has_sfu_src;
}

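/* Move an instruction from the unscheduled list to the tail of the block's
 * instruction list, prune it from the DAG, and update the sfu/tex delay
 * counters:
 */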
static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   debug_assert(ctx->block == instr->block);

   /* remove from unscheduled_list:
    */
   list_delinit(&instr->node);

   di(instr, "schedule");

   list_addtail(&instr->node, &instr->block->instr_list);

   struct ir3_postsched_node *n = instr->data;
   dag_prune_head(ctx->dag, &n->dag);

   if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
      return;

   if (is_sfu(instr)) {
      ctx->sfu_delay = 8;
   } else if (has_sfu_src(instr)) {
      ctx->sfu_delay = 0;
   } else if (ctx->sfu_delay > 0) {
      ctx->sfu_delay--;
   }

   if (is_tex_or_prefetch(instr)) {
      ctx->tex_delay = 10;
   } else if (has_tex_src(instr)) {
      ctx->tex_delay = 0;
   } else if (ctx->tex_delay > 0) {
      ctx->tex_delay--;
   }
}

static void
dump_state(struct ir3_postsched_ctx *ctx)
{
   if (!SCHED_DEBUG)
      return;

   foreach_sched_node (n, &ctx->dag->heads) {
      di(n->instr, "maxdel=%3d    ", n->max_delay);

      util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
         struct ir3_postsched_node *child =
            (struct ir3_postsched_node *)edge->child;

         di(child->instr, " -> (%d parents) ", child->dag.parent_count);
      }
   }
}

/* Determine if this is an instruction that we'd prefer not to schedule
 * yet, in order to avoid an (ss)/(sy) sync.  This is limited by the
 * sfu_delay/tex_delay counters, ie. the more cycles it has been since
 * the last SFU or tex fetch, the less costly a sync would be.
 */
static bool
would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   if (ctx->sfu_delay) {
      if (has_sfu_src(instr))
         return true;
   }

   if (ctx->tex_delay) {
      if (has_tex_src(instr))
         return true;
   }

   return false;
}

/* find instruction to schedule: */
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_node *chosen = NULL;

   dump_state(ctx);

   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_meta(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (meta)");
      return chosen->instr;
   }

   /* Try to schedule inputs with a higher priority, if possible, as
    * the last bary.f unlocks varying storage to unblock more VS
    * warps.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_input(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (input)");
      return chosen->instr;
   }

   /* Next prioritize discards: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!is_kill_or_demote(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (kill, hard ready)");
      return chosen->instr;
   }

   /* Next prioritize expensive instructions: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!(is_sfu(n->instr) || is_tex(n->instr)))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
      return chosen->instr;
   }

   /*
    * Sometimes it is better to take a nop, rather than scheduling an
    * instruction that would require an (ss) shortly after another
    * SFU..  ie. if the last SFU was just one or two instr ago, and we
    * could choose between taking a nop and then scheduling
    * something else, vs scheduling the immed avail instruction that
    * would require (ss), we are better off with the nop.
    */
   for (unsigned delay = 0; delay < 4; delay++) {
      foreach_sched_node (n, &ctx->dag->heads) {
         if (would_sync(ctx, n->instr))
            continue;

         unsigned d = ir3_delay_calc_postra(ctx->block, n->instr, true,
                                            ctx->v->mergedregs);

         if (d > delay)
            continue;

         if (!chosen || (chosen->max_delay < n->max_delay))
            chosen = n;
      }

      if (chosen) {
         di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
         return chosen->instr;
      }
   }

   /* Next try to find a ready leader w/ soft delay (ie. including extra
    * delay for things like tex fetch which can be synchronized w/ sync
    * bit, but we probably do want to schedule some other instructions
    * while we wait):
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader that can be scheduled without nop's,
    * which in the case of things that need (sy)/(ss) could result in
    * stalls.. but we've already decided there is not a better option.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (hard ready)");
      return chosen->instr;
   }

   /* Otherwise choose leader with maximum cost:
    *
    * TODO should we try to balance cost and delays?  I guess it is
    * a balance between now-nop's and future-nop's?
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (leader)");
      return chosen->instr;
   }

   return NULL;
}

struct ir3_postsched_deps_state {
   struct ir3_postsched_ctx *ctx;

   enum { F, R } direction;

   bool merged;

   /* Track, for each register, the sched node (instruction) that last
    * wrote it (in whichever direction we are iterating the block).
    *
    * Note, this table is twice as big as the # of regs, to deal with
    * half-precision regs.  The approach differs depending on whether
    * the half and full precision register files are "merged" (conflict,
    * ie. a6xx+) in which case we consider each full precision dep
    * as two half-precision dependencies, vs older separate (non-
    * conflicting) in which case the first half of the table is used
    * for full precision and 2nd half for half-precision.
    */
   struct ir3_postsched_node *regs[2 * 256];
};

/* bounds checking read/write accessors, since OoB access to stuff on
 * the stack is gonna cause a bad day.
 */
#define dep_reg(state, idx)                                                    \
   *({                                                                         \
      assert((idx) < ARRAY_SIZE((state)->regs));                               \
      &(state)->regs[(idx)];                                                   \
   })

static void
add_dep(struct ir3_postsched_deps_state *state,
        struct ir3_postsched_node *before, struct ir3_postsched_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->direction == F) {
      dag_add_edge(&before->dag, &after->dag, NULL);
   } else {
      dag_add_edge(&after->dag, &before->dag, NULL);
   }
}

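/* Add a dependency against the most recent writer of register 'num' (in
 * the current iteration direction).  For a source in the forward pass,
 * also accumulate the delay and note whether the producer is a tex or SFU
 * instruction.  For a destination (src_n < 0), record this node as the
 * register's new last writer:
 */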
static void
add_single_reg_dep(struct ir3_postsched_deps_state *state,
                   struct ir3_postsched_node *node, unsigned num, int src_n)
{
   struct ir3_postsched_node *dep = dep_reg(state, num);

   if (src_n >= 0 && dep && state->direction == F) {
      unsigned d = ir3_delayslots(dep->instr, node->instr, src_n, true);
      node->delay = MAX2(node->delay, d);
      if (is_tex_or_prefetch(dep->instr))
         node->has_tex_src = true;
      if (is_sfu(dep->instr))
         node->has_sfu_src = true;
   }

   add_dep(state, dep, node);
   if (src_n < 0) {
      dep_reg(state, num) = node;
   }
}

/* This is where we handle full vs half-precision, and potential conflicts
 * between half and full precision that result in additional dependencies.
 * The 'reg' arg is really just to know half vs full precision.
 *
 * If src_n is non-negative, then this adds a dependency on a source
 * register, and src_n is the index passed into ir3_delayslots() for
 * calculating the delay (ie. it corresponds to the src_n'th source of
 * node->instr).  If negative, then this is for a destination register.
 */
static void
add_reg_dep(struct ir3_postsched_deps_state *state,
            struct ir3_postsched_node *node, const struct ir3_register *reg,
            unsigned num, int src_n)
{
   if (state->merged) {
      /* Make sure that special registers like a0.x that are written as
       * half-registers don't alias random full registers by pretending that
       * they're full registers:
       */
      if ((reg->flags & IR3_REG_HALF) && !is_reg_special(reg)) {
         /* single conflict in half-reg space: */
         add_single_reg_dep(state, node, num, src_n);
      } else {
         /* two conflicts in half-reg space: */
         add_single_reg_dep(state, node, 2 * num + 0, src_n);
         add_single_reg_dep(state, node, 2 * num + 1, src_n);
      }
   } else {
      if (reg->flags & IR3_REG_HALF)
         num += ARRAY_SIZE(state->regs) / 2;
      add_single_reg_dep(state, node, num, src_n);
   }
}

static void
calculate_deps(struct ir3_postsched_deps_state *state,
               struct ir3_postsched_node *node)
{
   /* Add dependencies on instructions that previously (or next,
    * in the reverse direction) wrote any of our src registers:
    */
   foreach_src_n (reg, i, node->instr) {
      if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
         continue;

      if (reg->flags & IR3_REG_RELATIV) {
         /* mark entire array as read: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, i);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, i);
         }
      }
   }

   /* And then update the state for what this instruction wrote:
    */
   foreach_dst (reg, node->instr) {
      if (reg->wrmask == 0)
         continue;
      if (reg->flags & IR3_REG_RELATIV) {
         /* mark the entire array as written: */
         for (unsigned i = 0; i < reg->size; i++) {
            add_reg_dep(state, node, reg, reg->array.base + i, -1);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, -1);
         }
      }
   }
}

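/* The dependency information is collected in two passes: a forward pass
 * over the block adds read-after-write and write-after-write edges, and a
 * reverse pass adds write-after-read edges.  add_dep() flips the edge
 * direction when iterating in reverse, so DAG edges always point from the
 * instruction that must be scheduled first to the one that depends on it:
 */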
static void
calculate_forward_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = F,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_deps_state state = {
      .ctx = ctx,
      .direction = R,
      .merged = ctx->v->mergedregs,
   };

   foreach_instr_rev (instr, &ctx->unscheduled_list) {
      calculate_deps(&state, instr->data);
   }
}

static void
sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   struct ir3_postsched_node *n =
      rzalloc(ctx->mem_ctx, struct ir3_postsched_node);

   dag_init_node(ctx->dag, &n->dag);

   n->instr = instr;
   instr->data = n;
}

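/* Bottom-up DAG traversal callback: a node's max_delay is the largest
 * accumulated delay along any path to a leaf, which choose_instr() uses
 * as the scheduling priority:
 */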
static void
sched_dag_max_delay_cb(struct dag_node *node, void *state)
{
   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
      struct ir3_postsched_node *child =
         (struct ir3_postsched_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

static void
sched_dag_init(struct ir3_postsched_ctx *ctx)
{
   ctx->mem_ctx = ralloc_context(NULL);

   ctx->dag = dag_create(ctx->mem_ctx);

   foreach_instr (instr, &ctx->unscheduled_list)
      sched_node_init(ctx, instr);

   calculate_forward_deps(ctx);
   calculate_reverse_deps(ctx);

   /*
    * To prevent expensive texture fetches, etc, from being moved ahead
    * of kills, track the kills we've seen so far, so we can add an
    * extra dependency on them for tex/mem instructions
    */
   struct util_dynarray kills;
   util_dynarray_init(&kills, ctx->mem_ctx);

   /* The last bary.f with the (ei) flag must be scheduled before any kills,
    * or the hw gets angry. Keep track of inputs here so we can add the
    * false dep on the kill instruction.
    */
   struct util_dynarray inputs;
   util_dynarray_init(&inputs, ctx->mem_ctx);

   /*
    * Normal srcs won't be in SSA at this point, those are dealt with in
    * calculate_forward_deps() and calculate_reverse_deps().  But we still
    * have the false-dep information in SSA form, so go ahead and add
    * dependencies for that here:
    */
   foreach_instr (instr, &ctx->unscheduled_list) {
      struct ir3_postsched_node *n = instr->data;

      foreach_ssa_src_n (src, i, instr) {
         /* don't consider dependencies in other blocks: */
         if (src->block != instr->block)
            continue;

         /* we can end up with unused false-deps.. just skip them: */
         if (src->flags & IR3_INSTR_UNUSED)
            continue;

         struct ir3_postsched_node *sn = src->data;

         dag_add_edge(&sn->dag, &n->dag, NULL);
      }

      if (is_input(instr)) {
         util_dynarray_append(&inputs, struct ir3_instruction *, instr);
      } else if (is_kill_or_demote(instr)) {
         util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) {
            struct ir3_instruction *input = *instrp;
            struct ir3_postsched_node *in = input->data;
            dag_add_edge(&in->dag, &n->dag, NULL);
         }
         util_dynarray_append(&kills, struct ir3_instruction *, instr);
      } else if (is_tex(instr) || is_mem(instr)) {
         util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) {
            struct ir3_instruction *kill = *instrp;
            struct ir3_postsched_node *kn = kill->data;
            dag_add_edge(&kn->dag, &n->dag, NULL);
         }
      }
   }

   // TODO do we want to do this after reverse-dependencies?
   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}

static void
sched_dag_destroy(struct ir3_postsched_ctx *ctx)
{
   ralloc_free(ctx->mem_ctx);
   ctx->mem_ctx = NULL;
   ctx->dag = NULL;
}

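/* Schedule a single block: drop any pre-existing nop/branch instructions,
 * build the dependency DAG, schedule meta:input and tex-prefetch
 * instructions first, then repeatedly pick the best ready instruction,
 * inserting nops to cover any remaining delay slots:
 */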
static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;
   ctx->tex_delay = 0;
   ctx->sfu_delay = 0;

   /* move all instructions to the unscheduled list, and
    * empty the block's instruction list (to which we will
    * be inserting).
    */
   list_replace(&block->instr_list, &ctx->unscheduled_list);
   list_inithead(&block->instr_list);

   // TODO once we are using post-sched for everything we can
   // just not stick in NOP's prior to post-sched, and drop this.
   // for now keep this, since it makes post-sched optional:
   foreach_instr_safe (instr, &ctx->unscheduled_list) {
      switch (instr->opc) {
      case OPC_NOP:
      case OPC_B:
      case OPC_JUMP:
         list_delinit(&instr->node);
         break;
      default:
         break;
      }
   }

   sched_dag_init(ctx);

   /* First schedule all meta:input instructions, followed by
    * tex-prefetch.  We want all of the instructions that load
    * values into registers before the shader starts to go
    * before any other instructions.  But in particular we
    * want inputs to come before prefetches.  This is because
    * a FS's bary_ij input may not actually be live in the
    * shader, but it should not be scheduled on top of any
    * other input (but can be overwritten by a tex prefetch)
    */
   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_INPUT)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_TEX_PREFETCH)
         schedule(ctx, instr);

   while (!list_is_empty(&ctx->unscheduled_list)) {
      struct ir3_instruction *instr = choose_instr(ctx);

      unsigned delay =
         ir3_delay_calc_postra(ctx->block, instr, false, ctx->v->mergedregs);
      d("delay=%u", delay);

      /* and if we run out of instructions that can be scheduled,
       * then it is time for nop's:
       */
      debug_assert(delay <= 6);
      while (delay > 0) {
         ir3_NOP(block);
         delay--;
      }

      schedule(ctx, instr);
   }

   sched_dag_destroy(ctx);
}

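/* Match a same-type mov whose src and dst are the same register with no
 * modifiers, ie. a copy with no effect:
 */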
static bool
is_self_mov(struct ir3_instruction *instr)
{
   if (!is_same_type_mov(instr))
      return false;

   if (instr->dsts[0]->num != instr->srcs[0]->num)
      return false;

   if (instr->dsts[0]->flags & IR3_REG_RELATIV)
      return false;

   if (instr->cat1.round != ROUND_ZERO)
      return false;

   if (instr->srcs[0]->flags &
       (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_FNEG |
        IR3_REG_FABS | IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))
      return false;

   return true;
}

/* sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y
 * as a result of places where, before RA, we were not sure that it
 * was safe to eliminate them.  We could eliminate these earlier, but
 * sometimes they are tangled up in false-dep's, etc, so it is easier
 * just to let them exist until after RA
 */
static void
cleanup_self_movs(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->deps_count; i++) {
            if (instr->deps[i] && is_self_mov(instr->deps[i])) {
               instr->deps[i] = NULL;
            }
         }

         if (is_self_mov(instr))
            list_delinit(&instr->node);
      }
   }
}

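/* Post-RA scheduling entry point: remove nops and leftover self-movs, then
 * schedule each block:
 */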
bool
ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
{
   struct ir3_postsched_ctx ctx = {
      .ir = ir,
      .v = v,
   };

   ir3_remove_nops(ir);
   cleanup_self_movs(ir);

   foreach_block (block, &ir->block_list) {
      sched_block(&ctx, block);
   }

   return true;
}