1 /*
2 * Copyright © 2019 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "nir_schedule.h"
25 #include "util/dag.h"
26 #include "util/u_dynarray.h"
27
28 /** @file
29 *
30 * Implements basic-block-level prepass instruction scheduling in NIR to
31 * manage register pressure.
32 *
33 * This is based on the Goodman/Hsu paper (1988, cached copy at
34 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We
35 * build the DDG (data dependency graph) for NIR, which can mostly be done
36 * using the NIR def/use chains for SSA instructions, plus some edges for
37 * ordering register writes vs. reads, and some more for ordering
38 * intrinsics. Then we pick heads off of the DDG using their heuristic to
39 * emit the NIR instructions back into the block in their new order.
40 *
41 * The hard case for prepass scheduling on GPUs seems to always be consuming
42 * texture/ubo results. The register pressure heuristic doesn't want to pick
43 * an instr that starts consuming texture results because it usually won't be
44 * the only usage, so that instruction will increase pressure.
45 *
46 * If you try to always force consumption of tex results, then in a case where
47 * a single sample is used for many outputs, you'll end up picking every other
48 * user and expanding register pressure. The partially_evaluated_path flag
49 * helps tremendously, in that if you happen for whatever reason to pick a
50 * texture sample's output, then you'll try to finish off that sample. Future
51 * work may include doing some local search before locking in a choice, to try
52 * to more reliably find the case where just a few choices going against the
53 * heuristic can manage to free the whole vector.
54 */
55
56 static bool debug;
57
58 /**
59 * Represents a node in the DDG for a NIR instruction.
60 */
61 typedef struct {
62 struct dag_node dag; /* must be first for our u_dynarray_foreach */
63 nir_instr *instr;
64 bool partially_evaluated_path;
65
66 /* Approximate estimate of the delay between starting this instruction and
67 * its results being available.
68 *
69 * Accuracy is not too important, given that we're prepass scheduling here
70 * and just trying to reduce excess dependencies introduced by a register
71 * allocator by stretching out the live intervals of expensive
72 * instructions.
73 */
74 uint32_t delay;
75
76 /* Cost of the maximum-delay path from this node to the leaves. */
77 uint32_t max_delay;
78
79 /* scoreboard->time value when this instruction can be scheduled without
80 * any stalls expected.
81 */
82 uint32_t ready_time;
83 } nir_schedule_node;
84
85 typedef struct {
86 struct dag *dag;
87
88 nir_shader *shader;
89
90 /* Mapping from nir_register * or nir_ssa_def * to a struct set of
91 * instructions remaining to be scheduled that use that value.
92 */
93 struct hash_table *remaining_uses;
94
95 /* Map from nir_instr to nir_schedule_node * */
96 struct hash_table *instr_map;
97
98 /* Set of nir_register * or nir_ssa_def * that have had any instruction
99 * scheduled on them.
100 */
101 struct set *live_values;
102
103 /* An abstract approximation of the number of nir_schedule_node->delay
104 * units since the start of the shader.
105 */
106 uint32_t time;
107
108 /* Number of channels currently used by the NIR instructions that have been
109 * scheduled.
110 */
111 int pressure;
112
113 /* Options specified by the backend */
114 const nir_schedule_options *options;
115 } nir_schedule_scoreboard;
116
117 /* When walking the instructions in reverse, we use this flag to swap
118 * before/after in add_dep().
119 */
120 enum direction { F, R };
121
122 struct nir_schedule_class_dep {
123 int klass;
124 nir_schedule_node *node;
125 struct nir_schedule_class_dep *next;
126 };
127
128 typedef struct {
129 nir_schedule_scoreboard *scoreboard;
130
131 /* Map from nir_register to nir_schedule_node * */
132 struct hash_table *reg_map;
133
134 /* Scheduler nodes for the last instruction involved in each class of dependency.
135 */
136 nir_schedule_node *load_input;
137 nir_schedule_node *store_shared;
138 nir_schedule_node *unknown_intrinsic;
139 nir_schedule_node *discard;
140 nir_schedule_node *jump;
141
142 struct nir_schedule_class_dep *class_deps;
143
144 enum direction dir;
145 } nir_deps_state;
146
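/* Returns the data pointer stored for @key in the hash table, or NULL if the
 * key is not present.
 */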
147 static void *
148 _mesa_hash_table_search_data(struct hash_table *ht, void *key)
149 {
150 struct hash_entry *entry = _mesa_hash_table_search(ht, key);
151 if (!entry)
152 return NULL;
153 return entry->data;
154 }
155
156 static nir_schedule_node *
157 nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
158 {
159 return _mesa_hash_table_search_data(instr_map, instr);
160 }
161
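/* Returns the set of not-yet-scheduled instructions that use the value (SSA
 * def or register) read by this source.
 */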
162 static struct set *
163 nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
164 {
165 if (src->is_ssa) {
166 return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
167 } else {
168 return _mesa_hash_table_search_data(scoreboard->remaining_uses,
169 src->reg.reg);
170 }
171 }
172
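/* The register pressure cost of a value is approximated by its number of
 * channels; the helpers below cover SSA defs, sources, and destinations.
 */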
173 static int
174 nir_schedule_def_pressure(nir_ssa_def *def)
175 {
176 return def->num_components;
177 }
178
179 static int
180 nir_schedule_src_pressure(nir_src *src)
181 {
182 if (src->is_ssa)
183 return nir_schedule_def_pressure(src->ssa);
184 else
185 return src->reg.reg->num_components;
186 }
187
188 static int
189 nir_schedule_dest_pressure(nir_dest *dest)
190 {
191 if (dest->is_ssa)
192 return nir_schedule_def_pressure(&dest->ssa);
193 else
194 return dest->reg.reg->num_components;
195 }
196
197 /**
198 * Adds a dependency such that @after must appear in the final program after
199 * @before.
200 *
201 * We add @before as a child of @after, so that DAG heads are the outputs of
202 * the program and we make our scheduling decisions bottom to top.
203 */
204 static void
205 add_dep(nir_deps_state *state,
206 nir_schedule_node *before,
207 nir_schedule_node *after)
208 {
209 if (!before || !after)
210 return;
211
212 assert(before != after);
213
214 if (state->dir == F)
215 dag_add_edge(&before->dag, &after->dag, NULL);
216 else
217 dag_add_edge(&after->dag, &before->dag, NULL);
218 }
219
220
221 static void
222 add_read_dep(nir_deps_state *state,
223 nir_schedule_node *before,
224 nir_schedule_node *after)
225 {
226 add_dep(state, before, after);
227 }
228
229 static void
230 add_write_dep(nir_deps_state *state,
231 nir_schedule_node **before,
232 nir_schedule_node *after)
233 {
234 add_dep(state, *before, after);
235 *before = after;
236 }
237
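/* For a non-SSA register source, adds a dependency on the most recent writer
 * of that register so the read cannot be reordered across it.
 */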
238 static bool
239 nir_schedule_reg_src_deps(nir_src *src, void *in_state)
240 {
241 nir_deps_state *state = in_state;
242
243 if (src->is_ssa)
244 return true;
245
246 struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
247 src->reg.reg);
248 if (!entry)
249 return true;
250 nir_schedule_node *dst_n = entry->data;
251
252 nir_schedule_node *src_n =
253 nir_schedule_get_node(state->scoreboard->instr_map,
254 src->parent_instr);
255
256 add_dep(state, dst_n, src_n);
257
258 return true;
259 }
260
261 static bool
262 nir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
263 {
264 nir_deps_state *state = in_state;
265
266 if (dest->is_ssa)
267 return true;
268
269 nir_schedule_node *dest_n =
270 nir_schedule_get_node(state->scoreboard->instr_map,
271 dest->reg.parent_instr);
272
273 struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
274 dest->reg.reg);
275 if (!entry) {
276 _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
277 return true;
278 }
279 nir_schedule_node **before = (nir_schedule_node **)&entry->data;
280
281 add_write_dep(state, before, dest_n);
282
283 return true;
284 }
285
286 static bool
287 nir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
288 {
289 nir_deps_state *state = in_state;
290 struct hash_table *instr_map = state->scoreboard->instr_map;
291 nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);
292
293 nir_foreach_use(src, def) {
294 nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
295 src->parent_instr);
296
297 add_read_dep(state, def_n, use_n);
298 }
299
300 return true;
301 }
302
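/* Returns the dependency-tracking entry for a backend-defined dependency
 * class, creating it on first use.
 */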
303 static struct nir_schedule_class_dep *
304 nir_schedule_get_class_dep(nir_deps_state *state,
305 int klass)
306 {
307 for (struct nir_schedule_class_dep *class_dep = state->class_deps;
308 class_dep != NULL;
309 class_dep = class_dep->next) {
310 if (class_dep->klass == klass)
311 return class_dep;
312 }
313
314 struct nir_schedule_class_dep *class_dep =
315 ralloc(state->reg_map, struct nir_schedule_class_dep);
316
317 class_dep->klass = klass;
318 class_dep->node = NULL;
319 class_dep->next = state->class_deps;
320
321 state->class_deps = class_dep;
322
323 return class_dep;
324 }
325
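/* Adds the ordering edges needed for an intrinsic: any backend-specific class
 * dependency reported by the intrinsic_cb, plus built-in rules for discards,
 * input/output access, and shared memory.
 */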
326 static void
327 nir_schedule_intrinsic_deps(nir_deps_state *state,
328 nir_intrinsic_instr *instr)
329 {
330 nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
331 &instr->instr);
332 const nir_schedule_options *options = state->scoreboard->options;
333 nir_schedule_dependency dep;
334
335 if (options->intrinsic_cb &&
336 options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) {
337 struct nir_schedule_class_dep *class_dep =
338 nir_schedule_get_class_dep(state, dep.klass);
339
340 switch (dep.type) {
341 case NIR_SCHEDULE_READ_DEPENDENCY:
342 add_read_dep(state, class_dep->node, n);
343 break;
344 case NIR_SCHEDULE_WRITE_DEPENDENCY:
345 add_write_dep(state, &class_dep->node, n);
346 break;
347 }
348 }
349
350 switch (instr->intrinsic) {
351 case nir_intrinsic_load_uniform:
352 case nir_intrinsic_load_ubo:
353 case nir_intrinsic_load_front_face:
354 break;
355
356 case nir_intrinsic_discard:
357 case nir_intrinsic_discard_if:
358 case nir_intrinsic_demote:
359 case nir_intrinsic_demote_if:
360 /* We are adding two dependencies:
361 *
362 * * An individual one that we can use to add a read_dep while handling
363 * nir_instr_type_tex
364 *
365 * * One on the unknown intrinsic set, as we want discards to be
366 * serialized in the same order relative to intervening stores or
367 * atomic accesses to SSBOs and images
368 */
369 add_write_dep(state, &state->discard, n);
370 add_write_dep(state, &state->unknown_intrinsic, n);
371 break;
372
373 case nir_intrinsic_store_output:
374 /* For some hardware and stages, output stores affect the same shared
375 * memory as input loads.
376 */
377 if ((state->scoreboard->options->stages_with_shared_io_memory &
378 (1 << state->scoreboard->shader->info.stage)))
379 add_write_dep(state, &state->load_input, n);
380
381 /* Make sure that preceding discards stay before the store_output */
382 add_read_dep(state, state->discard, n);
383
384 break;
385
386 case nir_intrinsic_load_input:
387 case nir_intrinsic_load_per_vertex_input:
388 add_read_dep(state, state->load_input, n);
389 break;
390
391 case nir_intrinsic_load_shared:
392 /* Don't move load_shared beyond a following store_shared, as that could
393 * change the loaded value
394 */
395 add_read_dep(state, state->store_shared, n);
396 break;
397
398 case nir_intrinsic_store_shared:
399 add_write_dep(state, &state->store_shared, n);
400 break;
401
402 case nir_intrinsic_control_barrier:
403 case nir_intrinsic_memory_barrier_shared:
404 add_write_dep(state, &state->store_shared, n);
405
406 /* Serialize against ssbos/atomics/etc. */
407 add_write_dep(state, &state->unknown_intrinsic, n);
408 break;
409
410 default:
411 /* Attempt to handle other intrinsics that we haven't individually
412 * categorized by serializing them in the same order relative to each
413 * other.
414 */
415 add_write_dep(state, &state->unknown_intrinsic, n);
416 break;
417 }
418 }
419
420 /**
421 * Common code for dependencies that need to be tracked both forward and
422 * backward.
423 *
424 * This is for things like "all reads of r4 have to happen between the r4
425 * writes that surround them".
426 */
427 static void
428 nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
429 {
430 nir_instr *instr = n->instr;
431
432 /* For NIR SSA defs, we only need to do a single pass of making the uses
433 * depend on the def.
434 */
435 if (state->dir == F)
436 nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);
437
438 /* For NIR regs, track the last writer in the scheduler state so that we
439 * can keep the writes in order and let reads get reordered only between
440 * each write.
441 */
442 nir_foreach_src(instr, nir_schedule_reg_src_deps, state);
443
444 nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);
445
446 /* Make sure any other instructions keep their positions relative to
447 * jumps.
448 */
449 if (instr->type != nir_instr_type_jump)
450 add_read_dep(state, state->jump, n);
451
452 switch (instr->type) {
453 case nir_instr_type_ssa_undef:
454 case nir_instr_type_load_const:
455 case nir_instr_type_alu:
456 case nir_instr_type_deref:
457 break;
458
459 case nir_instr_type_tex:
460 /* Don't move texture ops before a discard, as that could increase
461 * memory bandwidth for reading the discarded samples.
462 */
463 add_read_dep(state, state->discard, n);
464 break;
465
466 case nir_instr_type_jump:
467 add_write_dep(state, &state->jump, n);
468 break;
469
470 case nir_instr_type_call:
471 unreachable("Calls should have been lowered");
472 break;
473
474 case nir_instr_type_parallel_copy:
475 unreachable("Parallel copies should have been lowered");
476 break;
477
478 case nir_instr_type_phi:
479 unreachable("nir_schedule() should be called after lowering from SSA");
480 break;
481
482 case nir_instr_type_intrinsic:
483 nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
484 break;
485 }
486 }
487
488 static void
489 calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
490 {
491 nir_deps_state state = {
492 .scoreboard = scoreboard,
493 .dir = F,
494 .reg_map = _mesa_pointer_hash_table_create(NULL),
495 };
496
497 nir_foreach_instr(instr, block) {
498 nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
499 instr);
500 nir_schedule_calculate_deps(&state, node);
501 }
502
503 ralloc_free(state.reg_map);
504 }
505
506 static void
507 calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
508 {
509 nir_deps_state state = {
510 .scoreboard = scoreboard,
511 .dir = R,
512 .reg_map = _mesa_pointer_hash_table_create(NULL),
513 };
514
515 nir_foreach_instr_reverse(instr, block) {
516 nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
517 instr);
518 nir_schedule_calculate_deps(&state, node);
519 }
520
521 ralloc_free(state.reg_map);
522 }
523
524 typedef struct {
525 nir_schedule_scoreboard *scoreboard;
526 int regs_freed;
527 } nir_schedule_regs_freed_state;
528
529 static bool
530 nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
531 {
532 nir_schedule_regs_freed_state *state = in_state;
533 nir_schedule_scoreboard *scoreboard = state->scoreboard;
534 struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
535
536 if (remaining_uses->entries == 1 &&
537 _mesa_set_search(remaining_uses, src->parent_instr)) {
538 state->regs_freed += nir_schedule_src_pressure(src);
539 }
540
541 return true;
542 }
543
544 static bool
545 nir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
546 {
547 nir_schedule_regs_freed_state *state = in_state;
548
549 state->regs_freed -= nir_schedule_def_pressure(def);
550
551 return true;
552 }
553
554 static bool
555 nir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
556 {
557 nir_schedule_regs_freed_state *state = in_state;
558 nir_schedule_scoreboard *scoreboard = state->scoreboard;
559
560 if (dest->is_ssa)
561 return true;
562
563 nir_register *reg = dest->reg.reg;
564
565 /* Only the first def of a reg counts against register pressure. */
566 if (!_mesa_set_search(scoreboard->live_values, reg))
567 state->regs_freed -= nir_schedule_dest_pressure(dest);
568
569 return true;
570 }
571
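/* Estimates the net number of register channels freed by scheduling this
 * instruction next: channels whose last use this is, minus channels newly
 * made live by its defs.
 */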
572 static int
573 nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
574 {
575 nir_schedule_regs_freed_state state = {
576 .scoreboard = scoreboard,
577 };
578
579 nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);
580
581 nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);
582
583 nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);
584
585 return state.regs_freed;
586 }
587
588 /**
589 * Chooses an instruction that will minimise the register pressure as much as
590 * possible. This should only be used as a fallback when the regular scheduling
591 * generates a shader whose register allocation fails.
592 */
593 static nir_schedule_node *
594 nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard)
595 {
596 nir_schedule_node *chosen = NULL;
597
598 /* Find the leader in the ready (shouldn't-stall) set with the minimum
599 * cost.
600 */
601 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
602 if (scoreboard->time < n->ready_time)
603 continue;
604
605 if (!chosen || chosen->max_delay > n->max_delay)
606 chosen = n;
607 }
608 if (chosen) {
609 if (debug) {
610 fprintf(stderr, "chose (ready fallback): ");
611 nir_print_instr(chosen->instr, stderr);
612 fprintf(stderr, "\n");
613 }
614
615 return chosen;
616 }
617
618 /* Otherwise, choose the leader with the minimum cost. */
619 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
620 if (!chosen || chosen->max_delay > n->max_delay)
621 chosen = n;
622 }
623 if (debug) {
624 fprintf(stderr, "chose (leader fallback): ");
625 nir_print_instr(chosen->instr, stderr);
626 fprintf(stderr, "\n");
627 }
628
629 return chosen;
630 }
631
632 /**
633 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
634 * Scheduling for Parallelism) heuristic.
635 *
636 * Picks an instruction on the critical path that's ready to execute without
637 * stalls, if possible; otherwise picks the instruction on the critical path.
638 */
639 static nir_schedule_node *
640 nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
641 {
642 nir_schedule_node *chosen = NULL;
643
644 /* Find the leader in the ready (shouldn't-stall) set with the maximum
645 * cost.
646 */
647 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
648 if (scoreboard->time < n->ready_time)
649 continue;
650
651 if (!chosen || chosen->max_delay < n->max_delay)
652 chosen = n;
653 }
654 if (chosen) {
655 if (debug) {
656 fprintf(stderr, "chose (ready): ");
657 nir_print_instr(chosen->instr, stderr);
658 fprintf(stderr, "\n");
659 }
660
661 return chosen;
662 }
663
664 /* Otherwise, choose the leader with the maximum cost. */
665 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
666 if (!chosen || chosen->max_delay < n->max_delay)
667 chosen = n;
668 }
669 if (debug) {
670 fprintf(stderr, "chose (leader): ");
671 nir_print_instr(chosen->instr, stderr);
672 fprintf(stderr, "\n");
673 }
674
675 return chosen;
676 }
677
678 /**
679 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
680 * Scheduling for Register pressure) heuristic.
681 */
682 static nir_schedule_node *
683 nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
684 {
685 nir_schedule_node *chosen = NULL;
686
687 /* Find a ready inst with regs freed and pick the one with max cost. */
688 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
689 if (n->ready_time > scoreboard->time)
690 continue;
691
692 int regs_freed = nir_schedule_regs_freed(scoreboard, n);
693
694 if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
695 chosen = n;
696 }
697 }
698 if (chosen) {
699 if (debug) {
700 fprintf(stderr, "chose (freed+ready): ");
701 nir_print_instr(chosen->instr, stderr);
702 fprintf(stderr, "\n");
703 }
704
705 return chosen;
706 }
707
708 /* Find a leader with regs freed and pick the one with max cost. */
709 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
710 int regs_freed = nir_schedule_regs_freed(scoreboard, n);
711
712 if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
713 chosen = n;
714 }
715 }
716 if (chosen) {
717 if (debug) {
718 fprintf(stderr, "chose (regs freed): ");
719 nir_print_instr(chosen->instr, stderr);
720 fprintf(stderr, "\n");
721 }
722
723 return chosen;
724 }
725
726 /* Find a partially evaluated path and try to finish it off */
727 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
728 if (n->partially_evaluated_path &&
729 (!chosen || chosen->max_delay < n->max_delay)) {
730 chosen = n;
731 }
732 }
733 if (chosen) {
734 if (debug) {
735 fprintf(stderr, "chose (partial path): ");
736 nir_print_instr(chosen->instr, stderr);
737 fprintf(stderr, "\n");
738 }
739
740 return chosen;
741 }
742
743 /* Contra the paper, pick a leader with no effect on used regs. This may
744 * open up new opportunities, as otherwise a single-operand instr consuming
745 * a value will tend to block the freeing of that value. This had a
746 * massive effect on reducing spilling on V3D.
747 *
748 * XXX: Should this prioritize ready?
749 */
750 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
751 if (nir_schedule_regs_freed(scoreboard, n) != 0)
752 continue;
753
754 if (!chosen || chosen->max_delay < n->max_delay)
755 chosen = n;
756 }
757 if (chosen) {
758 if (debug) {
759 fprintf(stderr, "chose (regs no-op): ");
760 nir_print_instr(chosen->instr, stderr);
761 fprintf(stderr, "\n");
762 }
763
764 return chosen;
765 }
766
767 /* Pick the max delay of the remaining ready set. */
768 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
769 if (n->ready_time > scoreboard->time)
770 continue;
771 if (!chosen || chosen->max_delay < n->max_delay)
772 chosen = n;
773 }
774 if (chosen) {
775 if (debug) {
776 fprintf(stderr, "chose (ready max delay): ");
777 nir_print_instr(chosen->instr, stderr);
778 fprintf(stderr, "\n");
779 }
780 return chosen;
781 }
782
783 /* Pick the max delay of the remaining leaders. */
784 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
785 if (!chosen || chosen->max_delay < n->max_delay)
786 chosen = n;
787 }
788
789 if (debug) {
790 fprintf(stderr, "chose (max delay): ");
791 nir_print_instr(chosen->instr, stderr);
792 fprintf(stderr, "\n");
793 }
794
795 return chosen;
796 }
797
798 static void
799 dump_state(nir_schedule_scoreboard *scoreboard)
800 {
801 list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
802 fprintf(stderr, "maxdel %5d ", n->max_delay);
803 nir_print_instr(n->instr, stderr);
804 fprintf(stderr, "\n");
805
806 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
807 nir_schedule_node *child = (nir_schedule_node *)edge->child;
808
809 fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
810 nir_print_instr(child->instr, stderr);
811 fprintf(stderr, "\n");
812 }
813 }
814 }
815
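/* Updates pressure tracking for one value: marks it live on its first
 * scheduled use or def, and drops its pressure once the last remaining use
 * has been scheduled.
 */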
816 static void
817 nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
818 void *reg_or_def,
819 nir_instr *reg_or_def_parent,
820 int pressure)
821 {
822 /* Make the value live if it's the first time it's been used. */
823 if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
824 _mesa_set_add(scoreboard->live_values, reg_or_def);
825 scoreboard->pressure += pressure;
826 }
827
828 /* Make the value dead if it's the last remaining use. Be careful when one
829 * instruction uses a value twice to not decrement pressure twice.
830 */
831 struct set *remaining_uses =
832 _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
833 struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
834 if (entry) {
835 _mesa_set_remove(remaining_uses, entry);
836
837 if (remaining_uses->entries == 0)
838 scoreboard->pressure -= pressure;
839 }
840 }
841
842 static bool
843 nir_schedule_mark_src_scheduled(nir_src *src, void *state)
844 {
845 nir_schedule_scoreboard *scoreboard = state;
846 struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
847
848 struct set_entry *entry = _mesa_set_search(remaining_uses,
849 src->parent_instr);
850 if (entry) {
851 /* Once we've used an SSA value in one instruction, bump the priority of
852 * the other uses so the SSA value can get fully consumed.
853 *
854 * We don't do this for registers, as it would be a hassle and it's
855 * unclear if that would help or not. Also, skip it for constants, as
856 * they're often folded as immediates into backend instructions and have
857 * many unrelated instructions all referencing the same value (0).
858 */
859 if (src->is_ssa &&
860 src->ssa->parent_instr->type != nir_instr_type_load_const) {
861 nir_foreach_use(other_src, src->ssa) {
862 if (other_src->parent_instr == src->parent_instr)
863 continue;
864
865 nir_schedule_node *n =
866 nir_schedule_get_node(scoreboard->instr_map,
867 other_src->parent_instr);
868
869 if (n && !n->partially_evaluated_path) {
870 if (debug) {
871 fprintf(stderr, " New partially evaluated path: ");
872 nir_print_instr(n->instr, stderr);
873 fprintf(stderr, "\n");
874 }
875
876 n->partially_evaluated_path = true;
877 }
878 }
879 }
880 }
881
882 nir_schedule_mark_use(scoreboard,
883 src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
884 src->parent_instr,
885 nir_schedule_src_pressure(src));
886
887 return true;
888 }
889
890 static bool
891 nir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
892 {
893 nir_schedule_scoreboard *scoreboard = state;
894
895 nir_schedule_mark_use(scoreboard, def, def->parent_instr,
896 nir_schedule_def_pressure(def));
897
898 return true;
899 }
900
901 static bool
902 nir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
903 {
904 nir_schedule_scoreboard *scoreboard = state;
905
906 /* SSA defs were handled in nir_schedule_mark_def_scheduled()
907 */
908 if (dest->is_ssa)
909 return true;
910
911 /* XXX: This is not actually accurate for regs -- the last use of a reg may
912 * have a live interval that extends across control flow. We should
913 * calculate the live ranges of regs, and have scheduler nodes for the CF
914 * nodes that also "use" the reg.
915 */
916 nir_schedule_mark_use(scoreboard, dest->reg.reg,
917 dest->reg.parent_instr,
918 nir_schedule_dest_pressure(dest));
919
920 return true;
921 }
922
923 static void
924 nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
925 nir_schedule_node *n)
926 {
927 nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
928 nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
929 nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);
930
931 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
932 nir_schedule_node *child = (nir_schedule_node *)edge->child;
933
934 child->ready_time = MAX2(child->ready_time,
935 scoreboard->time + n->delay);
936
937 if (child->dag.parent_count == 1) {
938 if (debug) {
939 fprintf(stderr, " New DAG head: ");
940 nir_print_instr(child->instr, stderr);
941 fprintf(stderr, "\n");
942 }
943 }
944 }
945
946 dag_prune_head(scoreboard->dag, &n->dag);
947
948 scoreboard->time = MAX2(n->ready_time, scoreboard->time);
949 scoreboard->time++;
950 }
951
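/* Main scheduling loop for a block: repeatedly picks a DAG head using the
 * heuristic selected by the current register pressure (or the fallback), and
 * appends the chosen instruction to the end of the block.
 */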
952 static void
953 nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
954 {
955 while (!list_is_empty(&scoreboard->dag->heads)) {
956 if (debug) {
957 fprintf(stderr, "current list:\n");
958 dump_state(scoreboard);
959 }
960
961 nir_schedule_node *chosen;
962 if (scoreboard->options->fallback)
963 chosen = nir_schedule_choose_instruction_fallback(scoreboard);
964 else if (scoreboard->pressure < scoreboard->options->threshold)
965 chosen = nir_schedule_choose_instruction_csp(scoreboard);
966 else
967 chosen = nir_schedule_choose_instruction_csr(scoreboard);
968
969 /* Now that we've scheduled a new instruction, some of its children may
970 * be promoted to the list of instructions ready to be scheduled.
971 */
972 nir_schedule_mark_node_scheduled(scoreboard, chosen);
973
974 /* Move the instruction to the end (so our first chosen instructions are
975 * the start of the program).
976 */
977 exec_node_remove(&chosen->instr->node);
978 exec_list_push_tail(&block->instr_list, &chosen->instr->node);
979
980 if (debug)
981 fprintf(stderr, "\n");
982 }
983 }
984
985 static uint32_t
986 nir_schedule_get_delay(nir_instr *instr)
987 {
988 switch (instr->type) {
989 case nir_instr_type_ssa_undef:
990 case nir_instr_type_load_const:
991 case nir_instr_type_alu:
992 case nir_instr_type_deref:
993 case nir_instr_type_jump:
994 case nir_instr_type_parallel_copy:
995 case nir_instr_type_call:
996 case nir_instr_type_phi:
997 return 1;
998
999 case nir_instr_type_intrinsic:
1000 /* XXX: Pick a large number for UBO/SSBO/image/shared loads */
1001 return 1;
1002
1003 case nir_instr_type_tex:
1004 /* Pick some large number to try to issue texture fetches early and
1005 * consume their results late.
1006 */
1007 return 100;
1008 }
1009
1010 return 0;
1011 }
1012
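/* Bottom-up DAG callback: propagates the maximum-delay (critical path) cost
 * from an instruction's children up into its own max_delay.
 */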
1013 static void
1014 nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
1015 {
1016 nir_schedule_node *n = (nir_schedule_node *)node;
1017 uint32_t max_delay = 0;
1018
1019 util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
1020 nir_schedule_node *child = (nir_schedule_node *)edge->child;
1021 max_delay = MAX2(child->max_delay, max_delay);
1022 }
1023
1024 n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
1025 }
1026
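/* Schedules one basic block: creates the DAG nodes, computes forward and
 * reverse dependencies and critical-path delays, then re-emits the
 * instructions in their new order.
 */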
1027 static void
1028 nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
1029 {
1030 void *mem_ctx = ralloc_context(NULL);
1031 scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);
1032
1033 scoreboard->dag = dag_create(mem_ctx);
1034
1035 nir_foreach_instr(instr, block) {
1036 nir_schedule_node *n =
1037 rzalloc(mem_ctx, nir_schedule_node);
1038
1039 n->instr = instr;
1040 n->delay = nir_schedule_get_delay(instr);
1041 dag_init_node(scoreboard->dag, &n->dag);
1042
1043 _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
1044 }
1045
1046 calculate_forward_deps(scoreboard, block);
1047 calculate_reverse_deps(scoreboard, block);
1048
1049 dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);
1050
1051 nir_schedule_instructions(scoreboard, block);
1052
1053 ralloc_free(mem_ctx);
1054 scoreboard->instr_map = NULL;
1055 }
1056
1057 static bool
1058 nir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
1059 {
1060 nir_schedule_scoreboard *scoreboard = state;
1061 struct set *def_uses = _mesa_pointer_set_create(scoreboard);
1062
1063 _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);
1064
1065 _mesa_set_add(def_uses, def->parent_instr);
1066
1067 nir_foreach_use(src, def) {
1068 _mesa_set_add(def_uses, src->parent_instr);
1069 }
1070
1071 /* XXX: Handle if uses */
1072
1073 return true;
1074 }
1075
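/* Allocates the scoreboard and pre-populates the remaining-uses sets for
 * every register and SSA def in the shader.
 */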
1076 static nir_schedule_scoreboard *
1077 nir_schedule_get_scoreboard(nir_shader *shader,
1078 const nir_schedule_options *options)
1079 {
1080 nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);
1081
1082 scoreboard->shader = shader;
1083 scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
1084 scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
1085 scoreboard->options = options;
1086 scoreboard->pressure = 0;
1087
1088 nir_foreach_function(function, shader) {
1089 nir_foreach_register(reg, &function->impl->registers) {
1090 struct set *register_uses =
1091 _mesa_pointer_set_create(scoreboard);
1092
1093 _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);
1094
1095 nir_foreach_use(src, reg) {
1096 _mesa_set_add(register_uses, src->parent_instr);
1097 }
1098
1099 /* XXX: Handle if uses */
1100
1101 nir_foreach_def(dest, reg) {
1102 _mesa_set_add(register_uses, dest->reg.parent_instr);
1103 }
1104 }
1105
1106 nir_foreach_block(block, function->impl) {
1107 nir_foreach_instr(instr, block) {
1108 nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
1109 scoreboard);
1110 }
1111
1112 /* XXX: We're ignoring if uses, which may prioritize scheduling other
1113 * uses of the if src even when it doesn't help. That's not many
1114 * values, though, so meh.
1115 */
1116 }
1117 }
1118
1119 return scoreboard;
1120 }
1121
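/* Debug check (skipped in NDEBUG builds) that no tracked uses remain after
 * scheduling; leftovers indicate a bookkeeping bug.
 */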
1122 static void
1123 nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
1124 {
1125 #ifdef NDEBUG
1126 return;
1127 #endif
1128
1129 bool any_uses = false;
1130
1131 hash_table_foreach(scoreboard->remaining_uses, entry) {
1132 struct set *remaining_uses = entry->data;
1133
1134 set_foreach(remaining_uses, instr_entry) {
1135 if (!any_uses) {
1136 fprintf(stderr, "Tracked uses remain after scheduling. "
1137 "Affected instructions: \n");
1138 any_uses = true;
1139 }
1140 nir_print_instr(instr_entry->key, stderr);
1141 fprintf(stderr, "\n");
1142 }
1143 }
1144
1145 assert(!any_uses);
1146 }
1147
1148 /**
1149 * Schedules the NIR instructions to try to decrease stalls (for example,
1150 * delaying texture reads) while managing register pressure.
1151 *
1152 * The threshold represents "number of NIR register/SSA def channels live
1153 * before switching the scheduling heuristic to reduce register pressure",
1154 * since most of our GPU architectures are scalar (extending to vector with a
1155 * flag wouldn't be hard). This number should be a bit below the number of
1156 * registers available (accounting for any that may be occupied by system
1157 * value payloads, for example), since the heuristic may not always be able to
1158 * free a register immediately. The amount below the limit is up to you to
1159 * tune.
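 *
 * A minimal illustrative call, with a hypothetical threshold value that a
 * backend would tune for its register file:
 *
 *    nir_schedule_options options = {
 *       .threshold = 24,
 *    };
 *    nir_schedule(shader, &options);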
1160 */
1161 void
1162 nir_schedule(nir_shader *shader,
1163 const nir_schedule_options *options)
1164 {
1165 nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
1166 options);
1167
1168 if (debug) {
1169 fprintf(stderr, "NIR shader before scheduling:\n");
1170 nir_print_shader(shader, stderr);
1171 }
1172
1173 nir_foreach_function(function, shader) {
1174 if (!function->impl)
1175 continue;
1176
1177 nir_foreach_block(block, function->impl) {
1178 nir_schedule_block(scoreboard, block);
1179 }
1180 }
1181
1182 nir_schedule_validate_uses(scoreboard);
1183
1184 ralloc_free(scoreboard);
1185 }
1186