1 /*
2  * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors:
24  *    Rob Clark <robclark@freedesktop.org>
25  */
26 
27 #include "util/dag.h"
28 #include "util/u_math.h"
29 
30 #include "ir3.h"
31 #include "ir3_compiler.h"
32 
33 #ifdef DEBUG
34 #define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
35 #else
36 #define SCHED_DEBUG 0
37 #endif
38 #define d(fmt, ...)                                                            \
39    do {                                                                        \
40       if (SCHED_DEBUG) {                                                       \
41          mesa_logi("SCHED: " fmt, ##__VA_ARGS__);                              \
42       }                                                                        \
43    } while (0)
44 
45 #define di(instr, fmt, ...)                                                    \
46    do {                                                                        \
47       if (SCHED_DEBUG) {                                                       \
48          struct log_stream *stream = mesa_log_streami();                       \
49          mesa_log_stream_printf(stream, "SCHED: " fmt ": ", ##__VA_ARGS__);    \
50          ir3_print_instr_stream(stream, instr);                                \
51          mesa_log_stream_destroy(stream);                                      \
52       }                                                                        \
53    } while (0)
54 
55 /*
56  * Instruction Scheduling:
57  *
58  * A block-level pre-RA scheduler, which works by creating a DAG of
59  * instruction dependencies, and heuristically picking a DAG head
60  * (instruction with no unscheduled dependencies).
61  *
62  * Where possible, it tries to pick instructions that avoid nop delay
63  * slots, but it will prefer to pick instructions that reduce (or do
64  * not increase) the number of live values.
65  *
66  * If the only possible choices are instructions that increase the
67  * number of live values, it will try to pick the one with the earliest
68  * consumer (based on pre-sched program order).
69  *
70  * There are a few special cases that need to be handled, since sched
71  * is currently independent of register allocation.  Usages of address
72  * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
73  * if you have two pairs of instructions that write the same special
74  * register and then read it, then those pairs cannot be interleaved.
75  * To solve this, when we are in such a scheduling "critical section",
76  * and we encounter a conflicting write to a special register, we try
77  * to schedule any remaining instructions that use that value first.
78  *
79  * TODO we can detect too-large live_values here.. would be a good place
80  * to "spill" cheap things, like move from uniform/immed.  (Constructing
81  * list of ssa def consumers before sched pass would make this easier.)
82  * Also, in general it might be best not to re-use load_immed
83  * across blocks.
84  *
85  * TODO we can use (abs)/(neg) src modifiers in a lot of cases to reduce
86  * the # of immediates in play (or at least that would help with
87  * dEQP-GLES31.functional.ubo.random.all_per_block_buffers.*).. probably
88  * do this in a nir pass that inserts fneg/etc?  The cp pass should fold
89  * these into src modifiers..
90  */
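
/*
 * To illustrate the a0.x/p0.x "critical section" constraint above with a
 * hypothetical snippet (pseudo-IR, for explanation only):
 *
 *    A: mov a0.x, ...
 *    B: ... r[a0.x] ...     ; reads the value written by A
 *    C: mov a0.x, ...
 *    D: ... r[a0.x] ...     ; reads the value written by C
 *
 * C must not be scheduled between A and B: C is rejected by check_instr()
 * while A is still the tracked a0.x writer, and if that blocks all
 * progress, split_addr()/split_pred() clone the writer to resolve the
 * conflict.
 */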
91 
92 struct ir3_sched_ctx {
93    struct ir3_block *block; /* the current block */
94    struct dag *dag;
95 
96    struct list_head unscheduled_list; /* unscheduled instructions */
97    struct ir3_instruction *scheduled; /* last scheduled instr */
98    struct ir3_instruction *addr0;     /* current a0.x user, if any */
99    struct ir3_instruction *addr1;     /* current a1.x user, if any */
100    struct ir3_instruction *pred;      /* current p0.x user, if any */
101 
102    struct ir3_instruction *split; /* most-recently-split a0/a1/p0 producer */
103 
104    int remaining_kills;
105    int remaining_tex;
106 
107    bool error;
108 
109    int sfu_delay;
110    int tex_delay;
111 
112    /* We order the scheduled tex/SFU instructions, and keep track of the
113     * index of the last waited on instruction, so we can know which
114     * instructions are still outstanding (and therefore would require us to
115     * wait for all outstanding instructions before scheduling a use).
116     */
117    int tex_index, first_outstanding_tex_index;
118    int sfu_index, first_outstanding_sfu_index;
119 };
120 
121 struct ir3_sched_node {
122    struct dag_node dag; /* must be first for util_dynarray_foreach */
123    struct ir3_instruction *instr;
124 
125    unsigned delay;
126    unsigned max_delay;
127 
128    unsigned tex_index;
129    unsigned sfu_index;
130 
131    /* For instructions that are a meta:collect src, once we schedule
132     * the first src of the collect, the entire vecN is live (at least
133     * from the PoV of the first RA pass.. the 2nd scalar pass can fill
134     * in some of the gaps, but often not all).  So we want to help out
135     * RA, and realize that as soon as we schedule the first collect
136     * src, there is no penalty to schedule the remainder (ie. they
137     * don't make additional values live).  In fact we'd prefer to
138     * schedule the rest ASAP to minimize the live range of the vecN.
139     *
140     * For instructions that are the src of a collect, we track the
141     * corresponding collect, and mark them as partially live as soon
142     * as any one of the src's is scheduled.
143     */
144    struct ir3_instruction *collect;
145    bool partially_live;
146 
147    /* Is this instruction a direct or indirect dependency for a kill?
148     * If so, we should prioritize it when possible
149     */
150    bool kill_path;
151 
152    /* This node represents a shader output.  A semi-common pattern in
153     * shaders is something along the lines of:
154     *
155     *    fragcolor.w = 1.0
156     *
157     * Which we'd prefer to schedule as late as possible, since it
158     * produces a live value that is never killed/consumed.  So detect
159     * outputs up-front, and avoid scheduling them unless they reduce
160     * register pressure (or at least are neutral).
161     */
162    bool output;
163 };
164 
165 #define foreach_sched_node(__n, __list)                                        \
166    list_for_each_entry (struct ir3_sched_node, __n, __list, dag.link)
167 
168 static void sched_node_init(struct ir3_sched_ctx *ctx,
169                             struct ir3_instruction *instr);
170 static void sched_node_add_dep(struct ir3_instruction *instr,
171                                struct ir3_instruction *src, int i);
172 
173 static bool
174 is_scheduled(struct ir3_instruction *instr)
175 {
176    return !!(instr->flags & IR3_INSTR_MARK);
177 }
178 
179 /* check_src_cond() passing an ir3_sched_ctx. */
180 static bool
181 sched_check_src_cond(struct ir3_instruction *instr,
182                      bool (*cond)(struct ir3_instruction *,
183                                   struct ir3_sched_ctx *),
184                      struct ir3_sched_ctx *ctx)
185 {
186    foreach_ssa_src (src, instr) {
187       /* meta:split/collect aren't real instructions, the thing that
188        * we actually care about is *their* srcs
189        */
190       if ((src->opc == OPC_META_SPLIT) || (src->opc == OPC_META_COLLECT)) {
191          if (sched_check_src_cond(src, cond, ctx))
192             return true;
193       } else {
194          if (cond(src, ctx))
195             return true;
196       }
197    }
198 
199    return false;
200 }
201 
202 /* Is this a prefetch or tex that hasn't been waited on yet? */
203 
204 static bool
205 is_outstanding_tex_or_prefetch(struct ir3_instruction *instr,
206                                struct ir3_sched_ctx *ctx)
207 {
208    if (!is_tex_or_prefetch(instr))
209       return false;
210 
211    /* The sched node is only valid within the same block, we cannot
212     * really say anything about src's from other blocks
213     */
214    if (instr->block != ctx->block)
215       return true;
216 
217    struct ir3_sched_node *n = instr->data;
218    return n->tex_index >= ctx->first_outstanding_tex_index;
219 }
220 
221 static bool
222 is_outstanding_sfu(struct ir3_instruction *instr, struct ir3_sched_ctx *ctx)
223 {
224    if (!is_sfu(instr))
225       return false;
226 
227    /* The sched node is only valid within the same block, we cannot
228     * really say anything about src's from other blocks
229     */
230    if (instr->block != ctx->block)
231       return true;
232 
233    struct ir3_sched_node *n = instr->data;
234    return n->sfu_index >= ctx->first_outstanding_sfu_index;
235 }
236 
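/* Rough estimate of how many "real" instructions this will amount to (a
 * collect is assumed to become one mov per immed/const src, other meta
 * instructions become nothing); used only to decay the sfu_delay/tex_delay
 * counters in schedule().
 */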
237 static unsigned
238 cycle_count(struct ir3_instruction *instr)
239 {
240    if (instr->opc == OPC_META_COLLECT) {
241       /* Assume that only immed/const sources produce moves */
242       unsigned n = 0;
243       foreach_src (src, instr) {
244          if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
245             n++;
246       }
247       return n;
248    } else if (is_meta(instr)) {
249       return 0;
250    } else {
251       return 1;
252    }
253 }
254 
255 static void
256 schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
257 {
258    debug_assert(ctx->block == instr->block);
259 
260    /* remove from depth list:
261     */
262    list_delinit(&instr->node);
263 
264    if (writes_addr0(instr)) {
265       debug_assert(ctx->addr0 == NULL);
266       ctx->addr0 = instr;
267    }
268 
269    if (writes_addr1(instr)) {
270       debug_assert(ctx->addr1 == NULL);
271       ctx->addr1 = instr;
272    }
273 
274    if (writes_pred(instr)) {
275       debug_assert(ctx->pred == NULL);
276       ctx->pred = instr;
277    }
278 
279    instr->flags |= IR3_INSTR_MARK;
280 
281    di(instr, "schedule");
282 
283    list_addtail(&instr->node, &instr->block->instr_list);
284    ctx->scheduled = instr;
285 
286    if (is_kill_or_demote(instr)) {
287       assert(ctx->remaining_kills > 0);
288       ctx->remaining_kills--;
289    }
290 
291    struct ir3_sched_node *n = instr->data;
292 
293    /* If this instruction is a meta:collect src, mark the remaining
294     * collect srcs as partially live.
295     */
296    if (n->collect) {
297       foreach_ssa_src (src, n->collect) {
298          if (src->block != instr->block)
299             continue;
300          struct ir3_sched_node *sn = src->data;
301          sn->partially_live = true;
302       }
303    }
304 
305    dag_prune_head(ctx->dag, &n->dag);
306 
307    unsigned cycles = cycle_count(instr);
308 
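   /* Book-keeping for the sync-avoidance heuristics: scheduling an SFU (or
    * tex/prefetch) restarts its delay counter and records its index;
    * scheduling a non-meta consumer of a still-outstanding SFU/tex means
    * legalize will need an (ss)/(sy) there anyway, so everything
    * outstanding is considered waited-on; otherwise the delay counter
    * simply decays by this instruction's approximate cycle count.
    */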
309    if (is_sfu(instr)) {
310       ctx->sfu_delay = 8;
311       n->sfu_index = ctx->sfu_index++;
312    } else if (!is_meta(instr) &&
313               sched_check_src_cond(instr, is_outstanding_sfu, ctx)) {
314       ctx->sfu_delay = 0;
315       ctx->first_outstanding_sfu_index = ctx->sfu_index;
316    } else if (ctx->sfu_delay > 0) {
317       ctx->sfu_delay -= MIN2(cycles, ctx->sfu_delay);
318    }
319 
320    if (is_tex_or_prefetch(instr)) {
321       /* NOTE that this isn't an attempt to hide texture fetch latency,
322        * but an attempt to hide the cost of switching to another warp.
323        * If we can, we'd like to try to schedule another texture fetch
324        * before scheduling something that would sync.
325        */
326       ctx->tex_delay = 10;
327       assert(ctx->remaining_tex > 0);
328       ctx->remaining_tex--;
329       n->tex_index = ctx->tex_index++;
330    } else if (!is_meta(instr) &&
331               sched_check_src_cond(instr, is_outstanding_tex_or_prefetch,
332                                    ctx)) {
333       ctx->tex_delay = 0;
334       ctx->first_outstanding_tex_index = ctx->tex_index;
335    } else if (ctx->tex_delay > 0) {
336       ctx->tex_delay -= MIN2(cycles, ctx->tex_delay);
337    }
338 }
339 
340 struct ir3_sched_notes {
341    /* there is at least one kill which could be scheduled, except
342     * for unscheduled bary.f's:
343     */
344    bool blocked_kill;
345    /* there is at least one instruction that could be scheduled,
346     * except for conflicting address/predicate register usage:
347     */
348    bool addr0_conflict, addr1_conflict, pred_conflict;
349 };
350 
351 /* could an instruction be scheduled if specified ssa src was scheduled? */
352 static bool
353 could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
354 {
355    foreach_ssa_src (other_src, instr) {
356       /* if dependency not scheduled, we aren't ready yet: */
357       if ((src != other_src) && !is_scheduled(other_src)) {
358          return false;
359       }
360    }
361    return true;
362 }
363 
364 /* Check if instruction is ok to schedule.  Make sure it is not blocked
365  * by use of addr/predicate register, etc.
366  */
367 static bool
368 check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
369             struct ir3_instruction *instr)
370 {
371    debug_assert(!is_scheduled(instr));
372 
373    if (instr == ctx->split) {
374       /* Don't schedule instructions created by splitting an a0.x/a1.x/p0.x
375        * write until another "normal" instruction has been scheduled.
376        */
377       return false;
378    }
379 
380    if (ctx->remaining_kills && (is_tex(instr) || is_mem(instr))) {
381       /* avoid texture/memory access if we have unscheduled kills
382        * that could make the expensive operation unnecessary.  By
383        * definition, if there are remaining kills, and this instr
384        * is not a dependency of a kill, there are other instructions
385        * that we can choose from.
386        */
387       struct ir3_sched_node *n = instr->data;
388       if (!n->kill_path)
389          return false;
390    }
391 
392    /* For instructions that write address register we need to
393     * make sure there is at least one instruction that uses the
394     * addr value which is otherwise ready.
395     *
396     * NOTE if any instructions use pred register and have other
397     * src args, we would need to do the same for writes_pred()..
398     */
399    if (writes_addr0(instr)) {
400       struct ir3 *ir = instr->block->shader;
401       bool ready = false;
402       for (unsigned i = 0; (i < ir->a0_users_count) && !ready; i++) {
403          struct ir3_instruction *indirect = ir->a0_users[i];
404          if (!indirect)
405             continue;
406          if (indirect->address->def != instr->dsts[0])
407             continue;
408          ready = could_sched(indirect, instr);
409       }
410 
411       /* nothing could be scheduled, so keep looking: */
412       if (!ready)
413          return false;
414    }
415 
416    if (writes_addr1(instr)) {
417       struct ir3 *ir = instr->block->shader;
418       bool ready = false;
419       for (unsigned i = 0; (i < ir->a1_users_count) && !ready; i++) {
420          struct ir3_instruction *indirect = ir->a1_users[i];
421          if (!indirect)
422             continue;
423          if (indirect->address->def != instr->dsts[0])
424             continue;
425          ready = could_sched(indirect, instr);
426       }
427 
428       /* nothing could be scheduled, so keep looking: */
429       if (!ready)
430          return false;
431    }
432 
433    /* if this is a write to address/predicate register, and that
434     * register is currently in use, we need to defer until it is
435     * free:
436     */
437    if (writes_addr0(instr) && ctx->addr0) {
438       debug_assert(ctx->addr0 != instr);
439       notes->addr0_conflict = true;
440       return false;
441    }
442 
443    if (writes_addr1(instr) && ctx->addr1) {
444       debug_assert(ctx->addr1 != instr);
445       notes->addr1_conflict = true;
446       return false;
447    }
448 
449    if (writes_pred(instr) && ctx->pred) {
450       debug_assert(ctx->pred != instr);
451       notes->pred_conflict = true;
452       return false;
453    }
454 
455    /* if the instruction is a kill, we need to ensure *every*
456     * bary.f is scheduled.  The hw seems unhappy if the thread
457     * gets killed before the end-input (ei) flag is hit.
458     *
459     * We could do this by adding each bary.f instruction as
460     * virtual ssa src for the kill instruction.  But we have
461     * fixed length instr->srcs[].
462     *
463     * TODO we could handle this by false-deps now, probably.
464     */
465    if (is_kill_or_demote(instr)) {
466       struct ir3 *ir = instr->block->shader;
467 
468       for (unsigned i = 0; i < ir->baryfs_count; i++) {
469          struct ir3_instruction *baryf = ir->baryfs[i];
470          if (baryf->flags & IR3_INSTR_UNUSED)
471             continue;
472          if (!is_scheduled(baryf)) {
473             notes->blocked_kill = true;
474             return false;
475          }
476       }
477    }
478 
479    return true;
480 }
481 
482 /* Find the instr->ip of the closest use of an instruction, in
483  * pre-sched order.  This isn't going to be the same as post-sched
484  * order, but it is a reasonable approximation to limit scheduling
485  * instructions *too* early.  This is mostly to prevent bad behavior
486  * in cases where we have a large number of possible instructions
487  * to choose from, to avoid creating too much parallelism (ie. blowing
488  * up register pressure).
489  *
490  * See
491  * dEQP-GLES31.functional.atomic_counter.layout.reverse_offset.inc_dec.8_counters_5_calls_1_thread
492  */
493 static int
494 nearest_use(struct ir3_instruction *instr)
495 {
496    unsigned nearest = ~0;
497    foreach_ssa_use (use, instr)
498       if (!is_scheduled(use))
499          nearest = MIN2(nearest, use->ip);
500 
501    /* slight hack.. this heuristic tends to push bary.f's to later
502     * in the shader, closer to their uses.  But we actually would
503     * prefer to get these scheduled earlier, to unlock varying
504     * storage for more VS jobs:
505     */
506    if (is_input(instr))
507       nearest /= 2;
508 
509    return nearest;
510 }
511 
512 static bool
513 is_only_nonscheduled_use(struct ir3_instruction *instr,
514                          struct ir3_instruction *use)
515 {
516    foreach_ssa_use (other_use, instr) {
517       if (other_use != use && !is_scheduled(other_use))
518          return false;
519    }
520 
521    return true;
522 }
523 
524 /* find net change to live values if instruction were scheduled: */
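/* For example, a scalar ALU instruction whose result has unscheduled uses
 * makes one new value live (+1); if each of its two sources has no other
 * unscheduled use, scheduling it also frees two values (-2), for a net
 * effect of -1.
 */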
525 static int
526 live_effect(struct ir3_instruction *instr)
527 {
528    struct ir3_sched_node *n = instr->data;
529    int new_live =
530       (n->partially_live || !instr->uses || instr->uses->entries == 0)
531          ? 0
532          : dest_regs(instr);
533    int freed_live = 0;
534 
535    /* if we schedule something that causes a vecN to be live,
536     * then count all its other components too:
537     */
538    if (n->collect)
539       new_live *= n->collect->srcs_count;
540 
541    foreach_ssa_src_n (src, n, instr) {
542       if (__is_false_dep(instr, n))
543          continue;
544 
545       if (instr->block != src->block)
546          continue;
547 
548       if (is_only_nonscheduled_use(src, instr))
549          freed_live += dest_regs(src);
550    }
551 
552    return new_live - freed_live;
553 }
554 
555 /* Determine if this is an instruction that we'd prefer not to schedule
556  * yet, in order to avoid an (ss)/(sy) sync.  This is limited by the
557  * sfu_delay/tex_delay counters, ie. the more cycles it has been since
558  * the last SFU/tex, the less costly a sync would be, and the number of
559  * outstanding SFU/tex instructions to prevent a blowup in register pressure.
560  */
561 static bool
562 should_defer(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
563 {
564    if (ctx->sfu_delay) {
565       if (sched_check_src_cond(instr, is_outstanding_sfu, ctx))
566          return true;
567    }
568 
569    /* We mostly just want to try to schedule another texture fetch
570     * before scheduling something that would (sy) sync, so we can
571     * limit this rule to cases where there are remaining texture
572     * fetches
573     */
574    if (ctx->tex_delay && ctx->remaining_tex) {
575       if (sched_check_src_cond(instr, is_outstanding_tex_or_prefetch, ctx))
576          return true;
577    }
578 
579    /* Avoid scheduling too many outstanding texture or sfu instructions at
580     * once by deferring further tex/SFU instructions. This both prevents
581     * stalls when the queue of texture/sfu instructions becomes too large,
582     * and prevents unacceptably large increases in register pressure from too
583     * many outstanding texture instructions.
584     */
585    if (ctx->tex_index - ctx->first_outstanding_tex_index >= 8 && is_tex(instr))
586       return true;
587 
588    if (ctx->sfu_index - ctx->first_outstanding_sfu_index >= 8 && is_sfu(instr))
589       return true;
590 
591    return false;
592 }
593 
594 static struct ir3_sched_node *choose_instr_inc(struct ir3_sched_ctx *ctx,
595                                                struct ir3_sched_notes *notes,
596                                                bool defer, bool avoid_output);
597 
598 enum choose_instr_dec_rank {
599    DEC_NEUTRAL,
600    DEC_NEUTRAL_READY,
601    DEC_FREED,
602    DEC_FREED_READY,
603 };
604 
605 static const char *
606 dec_rank_name(enum choose_instr_dec_rank rank)
607 {
608    switch (rank) {
609    case DEC_NEUTRAL:
610       return "neutral";
611    case DEC_NEUTRAL_READY:
612       return "neutral+ready";
613    case DEC_FREED:
614       return "freed";
615    case DEC_FREED_READY:
616       return "freed+ready";
617    default:
618       return NULL;
619    }
620 }
621 
622 /**
623  * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
624  * Scheduling for Register pressure) heuristic.
625  *
626  * Only handles the case of choosing instructions that reduce register pressure
627  * or are even.
628  */
629 static struct ir3_sched_node *
630 choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
631                  bool defer)
632 {
633    const char *mode = defer ? "-d" : "";
634    struct ir3_sched_node *chosen = NULL;
635    enum choose_instr_dec_rank chosen_rank = DEC_NEUTRAL;
636 
637    foreach_sched_node (n, &ctx->dag->heads) {
638       if (defer && should_defer(ctx, n->instr))
639          continue;
640 
641       /* how many nop cycles would still be needed before this instr could issue: */
642       unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
643 
644       int live = live_effect(n->instr);
645       if (live > 0)
646          continue;
647 
648       if (!check_instr(ctx, notes, n->instr))
649          continue;
650 
651       enum choose_instr_dec_rank rank;
652       if (live < 0) {
653          /* Prioritize instrs which free up regs and can be scheduled with no
654           * delay.
655           */
656          if (d == 0)
657             rank = DEC_FREED_READY;
658          else
659             rank = DEC_FREED;
660       } else {
661          /* Contra the paper, pick a leader with no effect on used regs.  This
662           * may open up new opportunities, as otherwise a single-operand instr
663           * consuming a value will tend to block us from finding an instruction that frees that value.
664           * This had a massive effect on reducing spilling on V3D.
665           *
666           * XXX: Should this prioritize ready?
667           */
668          if (d == 0)
669             rank = DEC_NEUTRAL_READY;
670          else
671             rank = DEC_NEUTRAL;
672       }
673 
674       /* Prefer higher-ranked instructions, or in the case of a rank tie, the
675        * highest latency-to-end-of-program instruction.
676        */
677       if (!chosen || rank > chosen_rank ||
678           (rank == chosen_rank && chosen->max_delay < n->max_delay)) {
679          chosen = n;
680          chosen_rank = rank;
681       }
682    }
683 
684    if (chosen) {
685       di(chosen->instr, "dec%s: chose (%s)", mode, dec_rank_name(chosen_rank));
686       return chosen;
687    }
688 
689    return choose_instr_inc(ctx, notes, defer, true);
690 }
691 
692 enum choose_instr_inc_rank {
693    INC_DISTANCE,
694    INC_DISTANCE_READY,
695 };
696 
697 static const char *
698 inc_rank_name(enum choose_instr_inc_rank rank)
699 {
700    switch (rank) {
701    case INC_DISTANCE:
702       return "distance";
703    case INC_DISTANCE_READY:
704       return "distance+ready";
705    default:
706       return NULL;
707    }
708 }
709 
710 /**
711  * When we can't choose an instruction that reduces register pressure or
712  * is neutral, we end up here to try and pick the least bad option.
713  */
714 static struct ir3_sched_node *
715 choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
716                  bool defer, bool avoid_output)
717 {
718    const char *mode = defer ? "-d" : "";
719    struct ir3_sched_node *chosen = NULL;
720    enum choose_instr_inc_rank chosen_rank = INC_DISTANCE;
721 
722    /*
723     * From here on out, we are picking something that increases
724     * register pressure.  So try to pick something which will
725     * be consumed soon:
726     */
727    unsigned chosen_distance = 0;
728 
729    /* Pick the max delay of the remaining ready set. */
730    foreach_sched_node (n, &ctx->dag->heads) {
731       if (avoid_output && n->output)
732          continue;
733 
734       if (defer && should_defer(ctx, n->instr))
735          continue;
736 
737       if (!check_instr(ctx, notes, n->instr))
738          continue;
739 
740       unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
741 
742       enum choose_instr_inc_rank rank;
743       if (d == 0)
744          rank = INC_DISTANCE_READY;
745       else
746          rank = INC_DISTANCE;
747 
748       unsigned distance = nearest_use(n->instr);
749 
750       if (!chosen || rank > chosen_rank ||
751           (rank == chosen_rank && distance < chosen_distance)) {
752          chosen = n;
753          chosen_distance = distance;
754          chosen_rank = rank;
755       }
756    }
757 
758    if (chosen) {
759       di(chosen->instr, "inc%s: chose (%s)", mode, inc_rank_name(chosen_rank));
760       return chosen;
761    }
762 
763    return NULL;
764 }
765 
766 /* Handles instruction selection for instructions we want to prioritize
767  * even if csp/csr would not pick them.
768  */
769 static struct ir3_sched_node *
770 choose_instr_prio(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
771 {
772    struct ir3_sched_node *chosen = NULL;
773 
774    foreach_sched_node (n, &ctx->dag->heads) {
775       /*
776        * - phi nodes and inputs must be scheduled first
777        * - split should be scheduled first, so that the vector value is
778        *   killed as soon as possible. RA cannot split up the vector and
779        *   reuse components that have been killed until the whole vector is dead.
780        * - collect, on the other hand, should be treated as a "normal"
781        *   instruction, and may add to register pressure if its sources are
782        *   part of another vector or immediates.
783        */
784       if (!is_meta(n->instr) || n->instr->opc == OPC_META_COLLECT)
785          continue;
786 
787       if (!chosen || (chosen->max_delay < n->max_delay))
788          chosen = n;
789    }
790 
791    if (chosen) {
792       di(chosen->instr, "prio: chose (meta)");
793       return chosen;
794    }
795 
796    return NULL;
797 }
798 
799 static void
800 dump_state(struct ir3_sched_ctx *ctx)
801 {
802    if (!SCHED_DEBUG)
803       return;
804 
805    foreach_sched_node (n, &ctx->dag->heads) {
806       di(n->instr, "maxdel=%3d le=%d del=%u ", n->max_delay,
807          live_effect(n->instr), ir3_delay_calc_prera(ctx->block, n->instr));
808 
809       util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
810          struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
811 
812          di(child->instr, " -> (%d parents) ", child->dag.parent_count);
813       }
814    }
815 }
816 
817 /* find instruction to schedule: */
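/* The fallback order is: (1) "meta" instructions other than collect
 * (mostly split, since inputs/phis/prefetches are scheduled up front),
 * (2) pressure-reducing or neutral instructions while deferring anything
 * that would force an (ss)/(sy) sync, (3) the same but allowing syncs,
 * and finally (4) pressure-increasing instructions, preferring the one
 * with the nearest use.
 */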
818 static struct ir3_instruction *
819 choose_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
820 {
821    struct ir3_sched_node *chosen;
822 
823    dump_state(ctx);
824 
825    chosen = choose_instr_prio(ctx, notes);
826    if (chosen)
827       return chosen->instr;
828 
829    chosen = choose_instr_dec(ctx, notes, true);
830    if (chosen)
831       return chosen->instr;
832 
833    chosen = choose_instr_dec(ctx, notes, false);
834    if (chosen)
835       return chosen->instr;
836 
837    chosen = choose_instr_inc(ctx, notes, false, false);
838    if (chosen)
839       return chosen->instr;
840 
841    return NULL;
842 }
843 
844 static struct ir3_instruction *
845 split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
846 {
847    struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
848    di(new_instr, "split instruction");
849    sched_node_init(ctx, new_instr);
850    return new_instr;
851 }
852 
853 /* "spill" the address registers by remapping any unscheduled
854  * instructions which depend on the current address register
855  * to a clone of the instruction which wrote the address reg.
856  */
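/* The caller (sched_block()) puts the returned clone back on the
 * unscheduled list, so it gets picked up by a later iteration of the
 * scheduling loop like any other instruction.
 */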
857 static struct ir3_instruction *
858 split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr,
859            struct ir3_instruction **users, unsigned users_count)
860 {
861    struct ir3_instruction *new_addr = NULL;
862    unsigned i;
863 
864    debug_assert(*addr);
865 
866    for (i = 0; i < users_count; i++) {
867       struct ir3_instruction *indirect = users[i];
868 
869       if (!indirect)
870          continue;
871 
872       /* skip instructions already scheduled: */
873       if (is_scheduled(indirect))
874          continue;
875 
876       /* remap remaining instructions using current addr
877        * to new addr:
878        */
879       if (indirect->address->def == (*addr)->dsts[0]) {
880          if (!new_addr) {
881             new_addr = split_instr(ctx, *addr);
882             /* original addr is scheduled, but new one isn't: */
883             new_addr->flags &= ~IR3_INSTR_MARK;
884          }
885          indirect->address->def = new_addr->dsts[0];
886          /* don't need to remove old dag edge since old addr is
887           * already scheduled:
888           */
889          sched_node_add_dep(indirect, new_addr, 0);
890          di(indirect, "new address");
891       }
892    }
893 
894    /* all remaining indirects remapped to new addr: */
895    *addr = NULL;
896 
897    return new_addr;
898 }
899 
900 /* "spill" the predicate register by remapping any unscheduled
901  * instructions which depend on the current predicate register
902  * to a clone of the instruction which wrote the predicate reg.
903  */
904 static struct ir3_instruction *
905 split_pred(struct ir3_sched_ctx *ctx)
906 {
907    struct ir3 *ir;
908    struct ir3_instruction *new_pred = NULL;
909    unsigned i;
910 
911    debug_assert(ctx->pred);
912 
913    ir = ctx->pred->block->shader;
914 
915    for (i = 0; i < ir->predicates_count; i++) {
916       struct ir3_instruction *predicated = ir->predicates[i];
917 
918       if (!predicated)
919          continue;
920 
921       /* skip instructions already scheduled: */
922       if (is_scheduled(predicated))
923          continue;
924 
925       /* remap remaining instructions using current pred
926        * to new pred:
927        *
928        * TODO is there ever a case when pred isn't first
929        * (and only) src?
930        */
931       if (ssa(predicated->srcs[0]) == ctx->pred) {
932          if (!new_pred) {
933             new_pred = split_instr(ctx, ctx->pred);
934             /* original pred is scheduled, but new one isn't: */
935             new_pred->flags &= ~IR3_INSTR_MARK;
936          }
937          predicated->srcs[0]->instr = new_pred;
938          /* don't need to remove old dag edge since old pred is
939           * already scheduled:
940           */
941          sched_node_add_dep(predicated, new_pred, 0);
942          di(predicated, "new predicate");
943       }
944    }
945 
946    if (ctx->block->condition == ctx->pred) {
947       if (!new_pred) {
948          new_pred = split_instr(ctx, ctx->pred);
949          /* original pred is scheduled, but new one isn't: */
950          new_pred->flags &= ~IR3_INSTR_MARK;
951       }
952       ctx->block->condition = new_pred;
953       d("new branch condition");
954    }
955 
956    /* all remaining predicated remapped to new pred: */
957    ctx->pred = NULL;
958 
959    return new_pred;
960 }
961 
962 static void
963 sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
964 {
965    struct ir3_sched_node *n = rzalloc(ctx->dag, struct ir3_sched_node);
966 
967    dag_init_node(ctx->dag, &n->dag);
968 
969    n->instr = instr;
970    instr->data = n;
971 }
972 
973 static void
974 sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src,
975                    int i)
976 {
977    /* don't consider dependencies in other blocks: */
978    if (src->block != instr->block)
979       return;
980 
981    /* we could have false-dep's that end up unused: */
982    if (src->flags & IR3_INSTR_UNUSED) {
983       debug_assert(__is_false_dep(instr, i));
984       return;
985    }
986 
987    struct ir3_sched_node *n = instr->data;
988    struct ir3_sched_node *sn = src->data;
989 
990    /* If src is consumed by a collect, track that to realize that once
991     * any of the collect srcs are live, we should hurry up and schedule
992     * the rest.
993     */
994    if (instr->opc == OPC_META_COLLECT)
995       sn->collect = instr;
996 
997    dag_add_edge(&sn->dag, &n->dag, NULL);
998 
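   /* Track the worst-case number of cycles needed between src and instr;
    * this feeds the max_delay critical-path estimate used as a tie-breaker
    * by the choose_instr_*() heuristics.
    */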
999    unsigned d = ir3_delayslots(src, instr, i, true);
1000 
1001    n->delay = MAX2(n->delay, d);
1002 }
1003 
1004 static void
1005 mark_kill_path(struct ir3_instruction *instr)
1006 {
1007    struct ir3_sched_node *n = instr->data;
1008 
1009    if (n->kill_path) {
1010       return;
1011    }
1012 
1013    n->kill_path = true;
1014 
1015    foreach_ssa_src (src, instr) {
1016       if (src->block != instr->block)
1017          continue;
1018       mark_kill_path(src);
1019    }
1020 }
1021 
1022 /* Is this a collect whose value is only used as a shader output (end/chmask)? */
1023 static bool
1024 is_output_collect(struct ir3_instruction *instr)
1025 {
1026    if (instr->opc != OPC_META_COLLECT)
1027       return false;
1028 
1029    foreach_ssa_use (use, instr) {
1030       if (use->opc != OPC_END && use->opc != OPC_CHMASK)
1031          return false;
1032    }
1033 
1034    return true;
1035 }
1036 
1037 /* Is its only use as an output? */
1038 static bool
1039 is_output_only(struct ir3_instruction *instr)
1040 {
1041    if (!writes_gpr(instr))
1042       return false;
1043 
1044    if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1045       return false;
1046 
1047    foreach_ssa_use (use, instr)
1048       if (!is_output_collect(use))
1049          return false;
1050 
1051    return true;
1052 }
1053 
1054 static void
1055 sched_node_add_deps(struct ir3_instruction *instr)
1056 {
1057    /* There's nothing to do for phi nodes, since they always go first. And
1058     * phi nodes can reference sources later in the same block, so handling
1059     * sources is not only unnecessary but could cause problems.
1060     */
1061    if (instr->opc == OPC_META_PHI)
1062       return;
1063 
1064    /* Since foreach_ssa_src() already handles false-dep's we can construct
1065     * the DAG easily in a single pass.
1066     */
1067    foreach_ssa_src_n (src, i, instr) {
1068       sched_node_add_dep(instr, src, i);
1069    }
1070 
1071    /* NOTE that all inputs must be scheduled before a kill, so
1072     * mark these to be prioritized as well:
1073     */
1074    if (is_kill_or_demote(instr) || is_input(instr)) {
1075       mark_kill_path(instr);
1076    }
1077 
1078    if (is_output_only(instr)) {
1079       struct ir3_sched_node *n = instr->data;
1080       n->output = true;
1081    }
1082 }
1083 
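/* Computed bottom-up over the DAG, max_delay approximates the length (in
 * delay cycles) of the longest path from this node to the end of the
 * block, so that long critical paths can be started early.
 */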
1084 static void
1085 sched_dag_max_delay_cb(struct dag_node *node, void *state)
1086 {
1087    struct ir3_sched_node *n = (struct ir3_sched_node *)node;
1088    uint32_t max_delay = 0;
1089 
1090    util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
1091       struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
1092       max_delay = MAX2(child->max_delay, max_delay);
1093    }
1094 
1095    n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
1096 }
1097 
1098 static void
1099 sched_dag_init(struct ir3_sched_ctx *ctx)
1100 {
1101    ctx->dag = dag_create(ctx);
1102 
1103    foreach_instr (instr, &ctx->unscheduled_list) {
1104       sched_node_init(ctx, instr);
1105       sched_node_add_deps(instr);
1106    }
1107 
1108    dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
1109 }
1110 
1111 static void
1112 sched_dag_destroy(struct ir3_sched_ctx *ctx)
1113 {
1114    ralloc_free(ctx->dag);
1115    ctx->dag = NULL;
1116 }
1117 
1118 static void
1119 sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
1120 {
1121    ctx->block = block;
1122 
1123    /* addr/pred writes are per-block: */
1124    ctx->addr0 = NULL;
1125    ctx->addr1 = NULL;
1126    ctx->pred = NULL;
1127    ctx->tex_delay = 0;
1128    ctx->sfu_delay = 0;
1129    ctx->tex_index = ctx->first_outstanding_tex_index = 0;
1130    ctx->sfu_index = ctx->first_outstanding_sfu_index = 0;
1131 
1132    /* move all instructions to the unscheduled list, and
1133     * empty the block's instruction list (to which we will
1134     * be inserting).
1135     */
1136    list_replace(&block->instr_list, &ctx->unscheduled_list);
1137    list_inithead(&block->instr_list);
1138 
1139    sched_dag_init(ctx);
1140 
1141    ctx->remaining_kills = 0;
1142    ctx->remaining_tex = 0;
1143    foreach_instr_safe (instr, &ctx->unscheduled_list) {
1144       if (is_kill_or_demote(instr))
1145          ctx->remaining_kills++;
1146       if (is_tex_or_prefetch(instr))
1147          ctx->remaining_tex++;
1148    }
1149 
1150    /* First schedule all meta:input and meta:phi instructions, followed by
1151     * tex-prefetch.  We want all of the instructions that load values into
1152     * registers before the shader starts to go before any other instructions.
1153     * But in particular we want inputs to come before prefetches.  This is
1154     * because a FS's bary_ij input may not actually be live in the shader,
1155     * but it should not be scheduled on top of any other input (but can be
1156     * overwritten by a tex prefetch)
1157     *
1158     * Note: Because the first block cannot have predecessors, meta:input and
1159     * meta:phi cannot exist in the same block.
1160     */
1161    foreach_instr_safe (instr, &ctx->unscheduled_list)
1162       if (instr->opc == OPC_META_INPUT || instr->opc == OPC_META_PHI)
1163          schedule(ctx, instr);
1164 
1165    foreach_instr_safe (instr, &ctx->unscheduled_list)
1166       if (instr->opc == OPC_META_TEX_PREFETCH)
1167          schedule(ctx, instr);
1168 
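   /* Main scheduling loop: pick an instruction, pad with nop's for any
    * remaining delay, and schedule it.  If nothing can be picked due to an
    * address/predicate register conflict, clone the a0.x/a1.x/p0.x writer
    * to break the deadlock and try again.
    */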
1169    while (!list_is_empty(&ctx->unscheduled_list)) {
1170       struct ir3_sched_notes notes = {0};
1171       struct ir3_instruction *instr;
1172 
1173       instr = choose_instr(ctx, &notes);
1174       if (instr) {
1175          unsigned delay = ir3_delay_calc_prera(ctx->block, instr);
1176          d("delay=%u", delay);
1177 
1178          /* and if we run out of instructions that can be scheduled,
1179           * then it is time for nop's:
1180           */
1181          debug_assert(delay <= 6);
1182          while (delay > 0) {
1183             ir3_NOP(block);
1184             delay--;
1185          }
1186 
1187          schedule(ctx, instr);
1188 
1189          /* Since we've scheduled a "real" instruction, we can now
1190           * schedule any split instruction created by the scheduler again.
1191           */
1192          ctx->split = NULL;
1193       } else {
1194          struct ir3_instruction *new_instr = NULL;
1195          struct ir3 *ir = block->shader;
1196 
1197          /* nothing available to schedule.. if we are blocked on
1198           * address/predicate register conflict, then break the
1199           * deadlock by cloning the instruction that wrote that
1200           * reg:
1201           */
1202          if (notes.addr0_conflict) {
1203             new_instr =
1204                split_addr(ctx, &ctx->addr0, ir->a0_users, ir->a0_users_count);
1205          } else if (notes.addr1_conflict) {
1206             new_instr =
1207                split_addr(ctx, &ctx->addr1, ir->a1_users, ir->a1_users_count);
1208          } else if (notes.pred_conflict) {
1209             new_instr = split_pred(ctx);
1210          } else {
1211             d("unscheduled_list:");
1212             foreach_instr (instr, &ctx->unscheduled_list)
1213                di(instr, "unscheduled: ");
1214             debug_assert(0);
1215             ctx->error = true;
1216             return;
1217          }
1218 
1219          if (new_instr) {
1220             list_delinit(&new_instr->node);
1221             list_addtail(&new_instr->node, &ctx->unscheduled_list);
1222          }
1223 
1224          /* If we produced a new instruction, do not schedule it next to
1225           * guarantee progress.
1226           */
1227          ctx->split = new_instr;
1228       }
1229    }
1230 
1231    sched_dag_destroy(ctx);
1232 }
1233 
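/* Pass entry point: number the instructions (for nearest_use()), clear the
 * scheduled marks, rebuild the ssa use lists, then schedule each block
 * independently.
 */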
1234 int
1235 ir3_sched(struct ir3 *ir)
1236 {
1237    struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx);
1238 
1239    foreach_block (block, &ir->block_list) {
1240       foreach_instr (instr, &block->instr_list) {
1241          instr->data = NULL;
1242       }
1243    }
1244 
1245    ir3_count_instructions(ir);
1246    ir3_clear_mark(ir);
1247    ir3_find_ssa_uses(ir, ctx, false);
1248 
1249    foreach_block (block, &ir->block_list) {
1250       sched_block(ctx, block);
1251    }
1252 
1253    int ret = ctx->error ? -1 : 0;
1254 
1255    ralloc_free(ctx);
1256 
1257    return ret;
1258 }
1259 
1260 static unsigned
1261 get_array_id(struct ir3_instruction *instr)
1262 {
1263    /* The expectation is that there is only a single array
1264     * src or dst, ir3_cp should enforce this.
1265     */
1266 
1267    foreach_dst (dst, instr)
1268       if (dst->flags & IR3_REG_ARRAY)
1269          return dst->array.id;
1270    foreach_src (src, instr)
1271       if (src->flags & IR3_REG_ARRAY)
1272          return src->array.id;
1273 
1274    unreachable("this was unexpected");
1275 }
1276 
1277 /* does instruction 'prior' need to be scheduled before 'instr'? */
1278 static bool
1279 depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
1280 {
1281    /* TODO for dependencies that are related to a specific object, ie
1282     * a specific SSBO/image/array, we could relax this constraint to
1283     * make accesses to unrelated objects not depend on each other (at
1284     * least as long as not declared coherent)
1285     */
1286    if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) &&
1287         prior->barrier_class) ||
1288        ((prior->barrier_class & IR3_BARRIER_EVERYTHING) &&
1289         instr->barrier_class))
1290       return true;
1291 
1292    if (instr->barrier_class & prior->barrier_conflict) {
1293       if (!(instr->barrier_class &
1294             ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
1295          /* if only array barrier, then we can further limit false-deps
1296           * by considering the array-id, ie reads/writes to different
1297           * arrays do not depend on each other (no aliasing)
1298           */
1299          if (get_array_id(instr) != get_array_id(prior)) {
1300             return false;
1301          }
1302       }
1303 
1304       return true;
1305    }
1306 
1307    return false;
1308 }
1309 
1310 static void
1311 add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
1312 {
1313    struct list_head *prev = instr->node.prev;
1314    struct list_head *next = instr->node.next;
1315 
1316    /* add dependencies on previous instructions that must be scheduled
1317     * prior to the current instruction
1318     */
1319    while (prev != &block->instr_list) {
1320       struct ir3_instruction *pi =
1321          LIST_ENTRY(struct ir3_instruction, prev, node);
1322 
1323       prev = prev->prev;
1324 
1325       if (is_meta(pi))
1326          continue;
1327 
1328       if (instr->barrier_class == pi->barrier_class) {
1329          ir3_instr_add_dep(instr, pi);
1330          break;
1331       }
1332 
1333       if (depends_on(instr, pi))
1334          ir3_instr_add_dep(instr, pi);
1335    }
1336 
1337    /* add dependencies on this instruction to following instructions
1338     * that must be scheduled after the current instruction:
1339     */
1340    while (next != &block->instr_list) {
1341       struct ir3_instruction *ni =
1342          LIST_ENTRY(struct ir3_instruction, next, node);
1343 
1344       next = next->next;
1345 
1346       if (is_meta(ni))
1347          continue;
1348 
1349       if (instr->barrier_class == ni->barrier_class) {
1350          ir3_instr_add_dep(ni, instr);
1351          break;
1352       }
1353 
1354       if (depends_on(ni, instr))
1355          ir3_instr_add_dep(ni, instr);
1356    }
1357 }
1358 
1359 /* before scheduling a block, we need to add any necessary false-dependencies
1360  * to ensure that:
1361  *
1362  *  (1) barriers are scheduled in the right order wrt instructions related
1363  *      to the barrier
1364  *
1365  *  (2) reads that come before a write actually get scheduled before the
1366  *      write
1367  */
1368 bool
1369 ir3_sched_add_deps(struct ir3 *ir)
1370 {
1371    bool progress = false;
1372 
1373    foreach_block (block, &ir->block_list) {
1374       foreach_instr (instr, &block->instr_list) {
1375          if (instr->barrier_class) {
1376             add_barrier_deps(block, instr);
1377             progress = true;
1378          }
1379       }
1380    }
1381 
1382    return progress;
1383 }
1384