1 /*
2 * Copyright (C) 2020 Collabora Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors (Collabora):
24 * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25 */
26
27 #include "compiler.h"
28 #include "bi_builder.h"
29
30 /* Arguments common to worklist, passed by value for convenience */
31
32 struct bi_worklist {
33 /* # of instructions in the block */
34 unsigned count;
35
36 /* Instructions in the block */
37 bi_instr **instructions;
38
39 /* Bitset of instructions in the block ready for scheduling */
40 BITSET_WORD *worklist;
41
/* The backwards dependency graph. dep_counts[i] is the number of
 * unscheduled instructions that must still be scheduled after (before)
 * instruction i. dependents[i] is the set of instructions that must be
 * scheduled before (after) instruction i. */
46 unsigned *dep_counts;
47 BITSET_WORD **dependents;
48 };
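
/* An illustrative walk-through (not taken from the original sources): if
 * instruction #2 writes r0 and instruction #5 later reads it, building the
 * graph sets bit 2 in dependents[5] and bumps dep_counts[2]. Since scheduling
 * proceeds bottom-up, #5 must be picked first; bi_update_worklist(5) then
 * drops dep_counts[2] to zero, putting #2 on the worklist. */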
49
50 /* State of a single tuple and clause under construction */
51
52 struct bi_reg_state {
53 /* Number of register writes */
54 unsigned nr_writes;
55
56 /* Register reads, expressed as (equivalence classes of)
57 * sources. Only 3 reads are allowed, but up to 2 may spill as
58 * "forced" for the next scheduled tuple, provided such a tuple
59 * can be constructed */
60 bi_index reads[5];
61 unsigned nr_reads;
62
63 /* The previous tuple scheduled (= the next tuple executed in the
64 * program) may require certain writes, in order to bypass the register
65 * file and use a temporary passthrough for the value. Up to 2 such
66 * constraints are architecturally satisfiable */
67 unsigned forced_count;
68 bi_index forceds[2];
69 };
70
71 struct bi_tuple_state {
72 /* Is this the last tuple in the clause */
73 bool last;
74
75 /* Scheduled ADD instruction, or null if none */
76 bi_instr *add;
77
78 /* Reads for previous (succeeding) tuple */
79 bi_index prev_reads[5];
80 unsigned nr_prev_reads;
81 bi_tuple *prev;
82
83 /* Register slot state for current tuple */
84 struct bi_reg_state reg;
85
/* Constants are shared in the tuple. If constant_count is nonzero, it
 * is the number of entries of constants[] in use. Otherwise, fau is the
 * slot read from FAU, or zero if none is assigned. Ordinarily FAU slot 0
 * reads zero, but within a tuple, that should be encoded as
 * constant_count != 0 and constants[0] = constants[1] = 0 */
91 unsigned constant_count;
92
93 union {
94 uint32_t constants[2];
95 enum bir_fau fau;
96 };
97
98 unsigned pcrel_idx;
99 };
100
101 struct bi_const_state {
102 unsigned constant_count;
103 bool pcrel; /* applies to first const */
104 uint32_t constants[2];
105
106 /* Index of the constant into the clause */
107 unsigned word_idx;
108 };
109
110 struct bi_clause_state {
111 /* Has a message-passing instruction already been assigned? */
112 bool message;
113
/* Indices already accessed; this needs to be tracked to avoid hazards
 * around message-passing instructions */
116 unsigned access_count;
117 bi_index accesses[(BI_MAX_SRCS + 2) * 16];
118
119 unsigned tuple_count;
120 struct bi_const_state consts[8];
121 };
122
/* Determines the message type by checking the table and a few special cases.
 * The only missing case is tilebuffer instructions that access depth/stencil,
 * which require a Z_STENCIL message (to implement
 * ARM_shader_framebuffer_fetch_depth_stencil) */
127
128 static enum bifrost_message_type
bi_message_type_for_instr(bi_instr *ins)
130 {
131 enum bifrost_message_type msg = bi_opcode_props[ins->op].message;
132 bool ld_var_special = (ins->op == BI_OPCODE_LD_VAR_SPECIAL);
133
134 if (ld_var_special && ins->varying_name == BI_VARYING_NAME_FRAG_Z)
135 return BIFROST_MESSAGE_Z_STENCIL;
136
137 if (msg == BIFROST_MESSAGE_LOAD && ins->seg == BI_SEG_UBO)
138 return BIFROST_MESSAGE_ATTRIBUTE;
139
140 return msg;
141 }
142
143 /* Attribute, texture, and UBO load (attribute message) instructions support
144 * bindless, so just check the message type */
145
146 ASSERTED static bool
bi_supports_dtsel(bi_instr *ins)
148 {
149 switch (bi_message_type_for_instr(ins)) {
150 case BIFROST_MESSAGE_ATTRIBUTE:
151 return ins->op != BI_OPCODE_LD_GCLK_U64;
152 case BIFROST_MESSAGE_TEX:
153 return true;
154 default:
155 return false;
156 }
157 }
158
159 /* Adds an edge to the dependency graph */
160
161 static void
bi_push_dependency(unsigned parent, unsigned child,
                   BITSET_WORD **dependents, unsigned *dep_counts)
164 {
165 if (!BITSET_TEST(dependents[parent], child)) {
166 BITSET_SET(dependents[parent], child);
167 dep_counts[child]++;
168 }
169 }
170
171 static void
add_dependency(struct util_dynarray *table, unsigned index, unsigned child,
               BITSET_WORD **dependents, unsigned *dep_counts)
174 {
175 assert(index < 64);
176 util_dynarray_foreach(table + index, unsigned, parent)
177 bi_push_dependency(*parent, child, dependents, dep_counts);
178 }
179
180 static void
mark_access(struct util_dynarray *table, unsigned index, unsigned parent)
182 {
183 assert(index < 64);
184 util_dynarray_append(&table[index], unsigned, parent);
185 }
186
187 static bool
bi_is_sched_barrier(bi_instr *I)
189 {
190 switch (I->op) {
191 case BI_OPCODE_BARRIER:
192 case BI_OPCODE_DISCARD_F32:
193 return true;
194 default:
195 return false;
196 }
197 }
198
199 static void
bi_create_dependency_graph(struct bi_worklist st, bool inorder, bool is_blend)
201 {
202 struct util_dynarray last_read[64], last_write[64];
203
204 for (unsigned i = 0; i < 64; ++i) {
205 util_dynarray_init(&last_read[i], NULL);
206 util_dynarray_init(&last_write[i], NULL);
207 }
208
209 /* Initialize dependency graph */
210 for (unsigned i = 0; i < st.count; ++i) {
211 st.dependents[i] =
212 calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
213
214 st.dep_counts[i] = 0;
215 }
216
217 unsigned prev_msg = ~0;
218
219 /* Populate dependency graph */
220 for (signed i = st.count - 1; i >= 0; --i) {
221 bi_instr *ins = st.instructions[i];
222
223 bi_foreach_src(ins, s) {
224 if (ins->src[s].type != BI_INDEX_REGISTER) continue;
225 unsigned count = bi_count_read_registers(ins, s);
226
227 for (unsigned c = 0; c < count; ++c)
228 add_dependency(last_write, ins->src[s].value + c, i, st.dependents, st.dep_counts);
229 }
230
231 /* Keep message-passing ops in order. (This pass only cares
232 * about bundling; reordering of message-passing instructions
233 * happens during earlier scheduling.) */
234
235 if (bi_message_type_for_instr(ins)) {
236 if (prev_msg != ~0)
237 bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts);
238
239 prev_msg = i;
240 }
241
/* Handle schedule barriers, adding all the dependencies */
243 if (inorder || bi_is_sched_barrier(ins)) {
244 for (unsigned j = 0; j < st.count; ++j) {
245 if (i == j) continue;
246
247 bi_push_dependency(MAX2(i, j), MIN2(i, j),
248 st.dependents, st.dep_counts);
249 }
250 }
251
252 bi_foreach_dest(ins, d) {
253 if (ins->dest[d].type != BI_INDEX_REGISTER) continue;
254 unsigned dest = ins->dest[d].value;
255
256 unsigned count = bi_count_write_registers(ins, d);
257
258 for (unsigned c = 0; c < count; ++c) {
259 add_dependency(last_read, dest + c, i, st.dependents, st.dep_counts);
260 add_dependency(last_write, dest + c, i, st.dependents, st.dep_counts);
261 mark_access(last_write, dest + c, i);
262 }
263 }
264
265 /* Blend shaders are allowed to clobber R0-R15. Treat these
266 * registers like extra destinations for scheduling purposes.
267 */
268 if (ins->op == BI_OPCODE_BLEND && !is_blend) {
269 for (unsigned c = 0; c < 16; ++c) {
270 add_dependency(last_read, c, i, st.dependents, st.dep_counts);
271 add_dependency(last_write, c, i, st.dependents, st.dep_counts);
272 mark_access(last_write, c, i);
273 }
274 }
275
276 bi_foreach_src(ins, s) {
277 if (ins->src[s].type != BI_INDEX_REGISTER) continue;
278
279 unsigned count = bi_count_read_registers(ins, s);
280
281 for (unsigned c = 0; c < count; ++c)
282 mark_access(last_read, ins->src[s].value + c, i);
283 }
284 }
285
286 /* If there is a branch, all instructions depend on it, as interblock
287 * execution must be purely in-order */
288
289 bi_instr *last = st.instructions[st.count - 1];
290 if (last->branch_target || last->op == BI_OPCODE_JUMP) {
291 for (signed i = st.count - 2; i >= 0; --i)
292 bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts);
293 }
294
295 /* Free the intermediate structures */
296 for (unsigned i = 0; i < 64; ++i) {
297 util_dynarray_fini(&last_read[i]);
298 util_dynarray_fini(&last_write[i]);
299 }
300 }
301
/* Scheduler pseudoinstruction lowerings to enable instruction pairings.
 * Currently only CUBEFACE -> *CUBEFACE1/+CUBEFACE2 is supported.
 */
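
/* Illustrative sketch of that split (informal assembly-like notation, not an
 * excerpt from real compiler output): the pseudo instruction
 *
 *    CUBEFACE d0, d1, s0, s1, s2
 *
 * is rewritten into the pair
 *
 *    *CUBEFACE1 d0, s0, s1, s2
 *    +CUBEFACE2 d1, d0
 *
 * so both the FMA and ADD slots of one tuple can be filled. */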
305
306 static bi_instr *
bi_lower_cubeface(bi_context *ctx,
                  struct bi_clause_state *clause, struct bi_tuple_state *tuple)
309 {
310 bi_instr *pinstr = tuple->add;
311 bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
312 bi_instr *cubeface1 = bi_cubeface1_to(&b, pinstr->dest[0],
313 pinstr->src[0], pinstr->src[1], pinstr->src[2]);
314
315 pinstr->op = BI_OPCODE_CUBEFACE2;
316 pinstr->dest[0] = pinstr->dest[1];
317 pinstr->dest[1] = bi_null();
318 pinstr->src[0] = cubeface1->dest[0];
319 pinstr->src[1] = bi_null();
320 pinstr->src[2] = bi_null();
321
322 return cubeface1;
323 }
324
/* Pseudo arguments are (rbase, address lo, address hi). We need *ATOM_C.i32 to
 * have the arguments (address lo, address hi, rbase), and +ATOM_CX to have the
 * arguments (rbase, address lo, address hi, rbase) */
328
329 static bi_instr *
bi_lower_atom_c(bi_context *ctx, struct bi_clause_state *clause,
                struct bi_tuple_state *tuple)
332 {
333 bi_instr *pinstr = tuple->add;
334 bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
335 bi_instr *atom_c = bi_atom_c_return_i32(&b,
336 pinstr->src[1], pinstr->src[2], pinstr->src[0],
337 pinstr->atom_opc);
338
339 if (bi_is_null(pinstr->dest[0]))
340 atom_c->op = BI_OPCODE_ATOM_C_I32;
341
342 pinstr->op = BI_OPCODE_ATOM_CX;
343 pinstr->src[3] = atom_c->src[2];
344
345 return atom_c;
346 }
347
348 static bi_instr *
bi_lower_atom_c1(bi_context *ctx, struct bi_clause_state *clause,
                 struct bi_tuple_state *tuple)
351 {
352 bi_instr *pinstr = tuple->add;
353 bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
354 bi_instr *atom_c = bi_atom_c1_return_i32(&b,
355 pinstr->src[0], pinstr->src[1], pinstr->atom_opc);
356
357 if (bi_is_null(pinstr->dest[0]))
358 atom_c->op = BI_OPCODE_ATOM_C1_I32;
359
360 pinstr->op = BI_OPCODE_ATOM_CX;
361 pinstr->src[2] = pinstr->src[1];
362 pinstr->src[1] = pinstr->src[0];
363 pinstr->src[3] = bi_dontcare();
364 pinstr->src[0] = bi_null();
365
366 return atom_c;
367 }
368
369 static bi_instr *
bi_lower_seg_add(bi_context *ctx,
                 struct bi_clause_state *clause, struct bi_tuple_state *tuple)
372 {
373 bi_instr *pinstr = tuple->add;
374 bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
375
376 bi_instr *fma = bi_seg_add_to(&b, pinstr->dest[0], pinstr->src[0],
377 pinstr->preserve_null, pinstr->seg);
378
379 pinstr->op = BI_OPCODE_SEG_ADD;
380 pinstr->src[0] = pinstr->src[1];
381 pinstr->src[1] = bi_null();
382
383 assert(pinstr->dest[0].type == BI_INDEX_REGISTER);
384 pinstr->dest[0].value += 1;
385
386 return fma;
387 }
388
389 static bi_instr *
bi_lower_dtsel(bi_context *ctx,
               struct bi_clause_state *clause, struct bi_tuple_state *tuple)
392 {
393 bi_instr *add = tuple->add;
394 bi_builder b = bi_init_builder(ctx, bi_before_instr(add));
395
396 bi_instr *dtsel = bi_dtsel_imm_to(&b, bi_temp(b.shader),
397 add->src[0], add->table);
398 add->src[0] = dtsel->dest[0];
399
400 assert(bi_supports_dtsel(add));
401 return dtsel;
402 }
403
404 /* Flatten linked list to array for O(1) indexing */
405
406 static bi_instr **
bi_flatten_block(bi_block *block, unsigned *len)
408 {
409 if (list_is_empty(&block->instructions))
410 return NULL;
411
412 *len = list_length(&block->instructions);
413 bi_instr **instructions = malloc(sizeof(bi_instr *) * (*len));
414
415 unsigned i = 0;
416
417 bi_foreach_instr_in_block(block, ins)
418 instructions[i++] = ins;
419
420 return instructions;
421 }
422
/* The worklist tracks instructions without outstanding dependencies. For
 * debug, in-order scheduling may be forced, in which case the dependency
 * graph serializes every instruction against every other.
 */
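
/* Rough usage sketch (informal pseudocode, assuming the block-level driver
 * further down follows the usual shape): flatten the block, build the graph,
 * then repeatedly carve off clauses until nothing remains, e.g.
 *
 *    struct bi_worklist st = bi_initialize_worklist(block, inorder, is_blend);
 *    while (instructions remain unscheduled)
 *       bi_schedule_clause(ctx, block, st, &live);
 *    bi_free_worklist(st);
 */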
426
427 static struct bi_worklist
bi_initialize_worklist(bi_block *block, bool inorder, bool is_blend)
429 {
430 struct bi_worklist st = { };
431 st.instructions = bi_flatten_block(block, &st.count);
432
433 if (!st.count)
434 return st;
435
436 st.dependents = calloc(st.count, sizeof(st.dependents[0]));
437 st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0]));
438
439 bi_create_dependency_graph(st, inorder, is_blend);
440 st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
441
442 for (unsigned i = 0; i < st.count; ++i) {
443 if (st.dep_counts[i] == 0)
444 BITSET_SET(st.worklist, i);
445 }
446
447 return st;
448 }
449
450 static void
bi_free_worklist(struct bi_worklist st)
452 {
453 free(st.dep_counts);
454 free(st.dependents);
455 free(st.instructions);
456 free(st.worklist);
457 }
458
459 static void
bi_update_worklist(struct bi_worklist st, unsigned idx)
461 {
462 assert(st.dep_counts[idx] == 0);
463
464 if (!st.dependents[idx])
465 return;
466
467 /* Iterate each dependent to remove one dependency (`done`),
468 * adding dependents to the worklist where possible. */
469
470 unsigned i;
471 BITSET_FOREACH_SET(i, st.dependents[idx], st.count) {
472 assert(st.dep_counts[i] != 0);
473 unsigned new_deps = --st.dep_counts[i];
474
475 if (new_deps == 0)
476 BITSET_SET(st.worklist, i);
477 }
478
479 free(st.dependents[idx]);
480 }
481
482 /* Scheduler predicates */
483
484 /* IADDC.i32 can implement IADD.u32 if no saturation or swizzling is in use */
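/* Illustrative example: +IADD.u32 d, a, b placed on the FMA unit becomes
 * *IADDC.i32 d, a, b, #0 (zero carry-in); the rewrite itself happens later,
 * in bi_take_instr */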
485 static bool
bi_can_iaddc(bi_instr *ins)
487 {
488 return (ins->op == BI_OPCODE_IADD_U32 && !ins->saturate &&
489 ins->src[0].swizzle == BI_SWIZZLE_H01 &&
490 ins->src[1].swizzle == BI_SWIZZLE_H01);
491 }
492
493 bool
bi_can_fma(bi_instr *ins)
495 {
496 /* +IADD.i32 -> *IADDC.i32 */
497 if (bi_can_iaddc(ins))
498 return true;
499
500 /* TODO: some additional fp16 constraints */
501 return bi_opcode_props[ins->op].fma;
502 }
503
504 static bool
bi_impacted_fadd_widens(bi_instr *I)
506 {
507 enum bi_swizzle swz0 = I->src[0].swizzle;
508 enum bi_swizzle swz1 = I->src[1].swizzle;
509
510 return (swz0 == BI_SWIZZLE_H00 && swz1 == BI_SWIZZLE_H11) ||
511 (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H11) ||
512 (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H00);
513 }
514
515 bool
bi_can_add(bi_instr *ins)
517 {
518 /* +FADD.v2f16 lacks clamp modifier, use *FADD.v2f16 instead */
519 if (ins->op == BI_OPCODE_FADD_V2F16 && ins->clamp)
520 return false;
521
522 /* +FCMP.v2f16 lacks abs modifier, use *FCMP.v2f16 instead */
523 if (ins->op == BI_OPCODE_FCMP_V2F16 && (ins->src[0].abs || ins->src[1].abs))
524 return false;
525
/* +FADD.f32 has restricted widens, use *FADD.f32 for the full set */
527 if (ins->op == BI_OPCODE_FADD_F32 && bi_impacted_fadd_widens(ins))
528 return false;
529
530 /* TODO: some additional fp16 constraints */
531 return bi_opcode_props[ins->op].add;
532 }
533
/* Architecturally, no single instruction has a "not last" constraint. However,
 * pseudoinstructions writing multiple destinations (expanding to multiple
 * paired instructions) can run afoul of the "only one write in the last tuple
 * of a clause" constraint, so we check for that here.
 */
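
/* For example, the CUBEFACE pseudoinstruction still writes two destinations
 * before bi_lower_cubeface splits it, so it must not land in the last tuple
 * of a clause. */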
539
540 static bool
bi_must_not_last(bi_instr *ins)
542 {
543 return !bi_is_null(ins->dest[0]) && !bi_is_null(ins->dest[1]);
544 }
545
/* Check for a message-passing instruction. +DISCARD.f32 is special-cased; we
 * treat it as a message-passing instruction for the purpose of scheduling
 * despite passing no logical message. Otherwise invalid encoding faults may
 * be raised for unknown reasons (possibly an erratum).
 */
551
552 bool
bi_must_message(bi_instr *ins)
554 {
555 return (bi_opcode_props[ins->op].message != BIFROST_MESSAGE_NONE) ||
556 (ins->op == BI_OPCODE_DISCARD_F32);
557 }
558
559 static bool
bi_fma_atomic(enum bi_opcode op)
561 {
562 switch (op) {
563 case BI_OPCODE_ATOM_C_I32:
564 case BI_OPCODE_ATOM_C_I64:
565 case BI_OPCODE_ATOM_C1_I32:
566 case BI_OPCODE_ATOM_C1_I64:
567 case BI_OPCODE_ATOM_C1_RETURN_I32:
568 case BI_OPCODE_ATOM_C1_RETURN_I64:
569 case BI_OPCODE_ATOM_C_RETURN_I32:
570 case BI_OPCODE_ATOM_C_RETURN_I64:
571 case BI_OPCODE_ATOM_POST_I32:
572 case BI_OPCODE_ATOM_POST_I64:
573 case BI_OPCODE_ATOM_PRE_I64:
574 return true;
575 default:
576 return false;
577 }
578 }
579
580 bool
bi_reads_zero(bi_instr *ins)
582 {
583 return !(bi_fma_atomic(ins->op) || ins->op == BI_OPCODE_IMULD);
584 }
585
586 bool
bi_reads_temps(bi_instr *ins, unsigned src)
588 {
589 switch (ins->op) {
590 /* Cannot permute a temporary */
591 case BI_OPCODE_CLPER_I32:
592 case BI_OPCODE_CLPER_V6_I32:
593 return src != 0;
594 case BI_OPCODE_IMULD:
595 return false;
596 default:
597 return true;
598 }
599 }
600
601 static bool
bi_impacted_t_modifiers(bi_instr *I, unsigned src)
603 {
604 enum bi_swizzle swizzle = I->src[src].swizzle;
605
606 switch (I->op) {
607 case BI_OPCODE_F16_TO_F32:
608 case BI_OPCODE_F16_TO_S32:
609 case BI_OPCODE_F16_TO_U32:
610 case BI_OPCODE_MKVEC_V2I16:
611 case BI_OPCODE_S16_TO_F32:
612 case BI_OPCODE_S16_TO_S32:
613 case BI_OPCODE_U16_TO_F32:
614 case BI_OPCODE_U16_TO_U32:
615 return (swizzle != BI_SWIZZLE_H00);
616
617 case BI_OPCODE_BRANCH_F32:
618 case BI_OPCODE_LOGB_F32:
619 case BI_OPCODE_ILOGB_F32:
620 case BI_OPCODE_FADD_F32:
621 case BI_OPCODE_FCMP_F32:
622 case BI_OPCODE_FREXPE_F32:
623 case BI_OPCODE_FREXPM_F32:
624 case BI_OPCODE_FROUND_F32:
625 return (swizzle != BI_SWIZZLE_H01);
626
627 case BI_OPCODE_IADD_S32:
628 case BI_OPCODE_IADD_U32:
629 case BI_OPCODE_ISUB_S32:
630 case BI_OPCODE_ISUB_U32:
631 case BI_OPCODE_IADD_V4S8:
632 case BI_OPCODE_IADD_V4U8:
633 case BI_OPCODE_ISUB_V4S8:
634 case BI_OPCODE_ISUB_V4U8:
635 return (src == 1) && (swizzle != BI_SWIZZLE_H01);
636
637 case BI_OPCODE_S8_TO_F32:
638 case BI_OPCODE_S8_TO_S32:
639 case BI_OPCODE_U8_TO_F32:
640 case BI_OPCODE_U8_TO_U32:
641 return (swizzle != BI_SWIZZLE_B0000);
642
643 case BI_OPCODE_V2S8_TO_V2F16:
644 case BI_OPCODE_V2S8_TO_V2S16:
645 case BI_OPCODE_V2U8_TO_V2F16:
646 case BI_OPCODE_V2U8_TO_V2U16:
647 return (swizzle != BI_SWIZZLE_B0022);
648
649 case BI_OPCODE_IADD_V2S16:
650 case BI_OPCODE_IADD_V2U16:
651 case BI_OPCODE_ISUB_V2S16:
652 case BI_OPCODE_ISUB_V2U16:
653 return (src == 1) && (swizzle >= BI_SWIZZLE_H11);
654
655 #if 0
656 /* Restriction on IADD in 64-bit clauses on G72 */
657 case BI_OPCODE_IADD_S64:
658 case BI_OPCODE_IADD_U64:
659 return (src == 1) && (swizzle != BI_SWIZZLE_D0);
660 #endif
661
662 default:
663 return false;
664 }
665 }
666
667 bool
bi_reads_t(bi_instr *ins, unsigned src)
669 {
670 /* Branch offset cannot come from passthrough */
671 if (bi_opcode_props[ins->op].branch)
672 return src != 2;
673
674 /* Table can never read passthrough */
675 if (bi_opcode_props[ins->op].table)
676 return false;
677
678 /* Staging register reads may happen before the succeeding register
679 * block encodes a write, so effectively there is no passthrough */
680 if (src == 0 && bi_opcode_props[ins->op].sr_read)
681 return false;
682
683 /* Bifrost cores newer than Mali G71 have restrictions on swizzles on
684 * same-cycle temporaries. Check the list for these hazards. */
685 if (bi_impacted_t_modifiers(ins, src))
686 return false;
687
688 /* Descriptor must not come from a passthrough */
689 switch (ins->op) {
690 case BI_OPCODE_LD_CVT:
691 case BI_OPCODE_LD_TILE:
692 case BI_OPCODE_ST_CVT:
693 case BI_OPCODE_ST_TILE:
694 case BI_OPCODE_TEXC:
695 return src != 2;
696 case BI_OPCODE_BLEND:
697 return src != 2 && src != 3;
698
699 /* Else, just check if we can read any temps */
700 default:
701 return bi_reads_temps(ins, src);
702 }
703 }
704
705 /* Counts the number of 64-bit constants required by a clause. TODO: We
706 * might want to account for merging, right now we overestimate, but
707 * that's probably fine most of the time */
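/* For example (illustrative): tuples using 1, 2 and 2 32-bit constants give
 * count_32 = 5, so DIV_ROUND_UP(5, 2) = 3 64-bit constant words */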
708
709 static unsigned
bi_nconstants(struct bi_clause_state *clause)
711 {
712 unsigned count_32 = 0;
713
714 for (unsigned i = 0; i < ARRAY_SIZE(clause->consts); ++i)
715 count_32 += clause->consts[i].constant_count;
716
717 return DIV_ROUND_UP(count_32, 2);
718 }
719
720 /* Would there be space for constants if we added one tuple? */
721
722 static bool
bi_space_for_more_constants(struct bi_clause_state *clause)
724 {
725 return (bi_nconstants(clause) < 13 - (clause->tuple_count + 1));
726 }
727
/* Updates the FAU assignment for a tuple. A valid FAU assignment must be
 * possible (as a precondition), though not necessarily on the selected unit;
 * this is guaranteed per-instruction by bi_lower_fau and per-tuple by
 * bi_instr_schedulable */
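
/* For instance (illustrative): an instruction reading a uniform FAU slot
 * cannot join a tuple that already embeds an inline constant, since both
 * would compete for the tuple's single FAU read */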
732
733 static bool
bi_update_fau(struct bi_clause_state *clause,
              struct bi_tuple_state *tuple,
              bi_instr *instr, bool fma, bool destructive)
737 {
738 /* Maintain our own constants, for nondestructive mode */
739 uint32_t copied_constants[2], copied_count;
740 unsigned *constant_count = &tuple->constant_count;
741 uint32_t *constants = tuple->constants;
742 enum bir_fau fau = tuple->fau;
743
744 if (!destructive) {
745 memcpy(copied_constants, tuple->constants,
746 (*constant_count) * sizeof(constants[0]));
747 copied_count = tuple->constant_count;
748
749 constant_count = &copied_count;
750 constants = copied_constants;
751 }
752
753 bi_foreach_src(instr, s) {
754 bi_index src = instr->src[s];
755
756 if (src.type == BI_INDEX_FAU) {
757 bool no_constants = *constant_count == 0;
758 bool no_other_fau = (fau == src.value) || !fau;
759 bool mergable = no_constants && no_other_fau;
760
761 if (destructive) {
762 assert(mergable);
763 tuple->fau = src.value;
764 } else if (!mergable) {
765 return false;
766 }
767
768 fau = src.value;
769 } else if (src.type == BI_INDEX_CONSTANT) {
770 /* No need to reserve space if we have a fast 0 */
771 if (src.value == 0 && fma && bi_reads_zero(instr))
772 continue;
773
774 /* If there is a branch target, #0 by convention is the
775 * PC-relative offset to the target */
776 bool pcrel = instr->branch_target && src.value == 0;
777 bool found = false;
778
779 for (unsigned i = 0; i < *constant_count; ++i) {
780 found |= (constants[i] == src.value) &&
781 (i != tuple->pcrel_idx);
782 }
783
784 /* pcrel constants are unique, so don't match */
785 if (found && !pcrel)
786 continue;
787
788 bool no_fau = (*constant_count > 0) || !fau;
789 bool mergable = no_fau && ((*constant_count) < 2);
790
791 if (destructive) {
792 assert(mergable);
793
794 if (pcrel)
795 tuple->pcrel_idx = *constant_count;
796 } else if (!mergable)
797 return false;
798
799 constants[(*constant_count)++] = src.value;
800 }
801 }
802
803 /* Constants per clause may be limited by tuple count */
804 bool room_for_constants = (*constant_count == 0) ||
805 bi_space_for_more_constants(clause);
806
807 if (destructive)
808 assert(room_for_constants);
809 else if (!room_for_constants)
810 return false;
811
812 return true;
813 }
814
815 /* Given an in-progress tuple, a candidate new instruction to add to the tuple,
816 * and a source (index) from that candidate, determine whether this source is
817 * "new", in the sense of requiring an additional read slot. That is, checks
818 * whether the specified source reads from the register file via a read slot
819 * (determined by its type and placement) and whether the source was already
820 * specified by a prior read slot (to avoid double counting) */
821
822 static bool
bi_tuple_is_new_src(bi_instr *instr, struct bi_reg_state *reg, unsigned src_idx)
824 {
825 bi_index src = instr->src[src_idx];
826
827 /* Only consider sources which come from the register file */
828 if (!(src.type == BI_INDEX_NORMAL || src.type == BI_INDEX_REGISTER))
829 return false;
830
831 /* Staging register reads bypass the usual register file mechanism */
832 if (src_idx == 0 && bi_opcode_props[instr->op].sr_read)
833 return false;
834
835 /* If a source is already read in the tuple, it is already counted */
836 for (unsigned t = 0; t < reg->nr_reads; ++t)
837 if (bi_is_word_equiv(src, reg->reads[t]))
838 return false;
839
840 /* If a source is read in _this instruction_, it is already counted */
841 for (unsigned t = 0; t < src_idx; ++t)
842 if (bi_is_word_equiv(src, instr->src[t]))
843 return false;
844
845 return true;
846 }
847
848 /* Given two tuples in source order, count the number of register reads of the
849 * successor, determined as the number of unique words accessed that aren't
850 * written by the predecessor (since those are tempable).
851 */
852
853 static unsigned
bi_count_succ_reads(bi_index t0, bi_index t1,
                    bi_index *succ_reads, unsigned nr_succ_reads)
856 {
857 unsigned reads = 0;
858
859 for (unsigned i = 0; i < nr_succ_reads; ++i) {
860 bool unique = true;
861
862 for (unsigned j = 0; j < i; ++j)
863 if (bi_is_word_equiv(succ_reads[i], succ_reads[j]))
864 unique = false;
865
866 if (!unique)
867 continue;
868
869 if (bi_is_word_equiv(succ_reads[i], t0))
870 continue;
871
872 if (bi_is_word_equiv(succ_reads[i], t1))
873 continue;
874
875 reads++;
876 }
877
878 return reads;
879 }
880
/* Not all instructions can read from the staging passthrough (as determined
 * by reads_t); check whether a given pair of instructions has such a
 * restriction. Note we also use this mechanism to prevent data races around
 * staging register reads, so we allow the input source to potentially be
 * vector-valued */
885
886 static bool
bi_has_staging_passthrough_hazard(bi_index fma, bi_instr *add)
888 {
889 bi_foreach_src(add, s) {
890 bi_index src = add->src[s];
891
892 if (src.type != BI_INDEX_REGISTER)
893 continue;
894
895 unsigned count = bi_count_read_registers(add, s);
896 bool read = false;
897
898 for (unsigned d = 0; d < count; ++d)
899 read |= bi_is_equiv(fma, bi_register(src.value + d));
900
901 if (read && !bi_reads_t(add, s))
902 return true;
903 }
904
905 return false;
906 }
907
908 /* Likewise for cross-tuple passthrough (reads_temps) */
909
910 static bool
bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins)
912 {
913 bi_foreach_instr_in_tuple(succ, pins) {
914 bi_foreach_src(pins, s) {
915 if (bi_is_word_equiv(ins->dest[0], pins->src[s]) &&
916 !bi_reads_temps(pins, s))
917 return true;
918 }
919 }
920
921 return false;
922 }
923
/* Is a register written other than through the staging mechanism? ATEST is
 * special, writing to both a staging register and a regular register (fixed
 * packing). BLEND is special since it has to write r48 the normal way even
 * if it never gets read. This depends on liveness analysis, since a register
 * is not needed for a write that will be discarded after one tuple. */
929
930 static unsigned
bi_write_count(bi_instr *instr, uint64_t live_after_temp)
932 {
933 if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND)
934 return 1;
935
936 unsigned count = 0;
937
938 bi_foreach_dest(instr, d) {
939 if (d == 0 && bi_opcode_props[instr->op].sr_write)
940 continue;
941
942 if (bi_is_null(instr->dest[d]))
943 continue;
944
assert(instr->dest[d].type == BI_INDEX_REGISTER);
if (live_after_temp & BITFIELD64_BIT(instr->dest[d].value))
947 count++;
948 }
949
950 return count;
951 }
952
953 /* Instruction placement entails two questions: what subset of instructions in
954 * the block can legally be scheduled? and of those which is the best? That is,
955 * we seek to maximize a cost function on a subset of the worklist satisfying a
956 * particular predicate. The necessary predicate is determined entirely by
957 * Bifrost's architectural limitations and is described in the accompanying
958 * whitepaper. The cost function is a heuristic. */
959
960 static bool
bi_instr_schedulable(bi_instr *instr,
                     struct bi_clause_state *clause,
                     struct bi_tuple_state *tuple,
                     uint64_t live_after_temp,
                     bool fma)
966 {
967 /* The units must match */
968 if ((fma && !bi_can_fma(instr)) || (!fma && !bi_can_add(instr)))
969 return false;
970
971 /* There can only be one message-passing instruction per clause */
972 if (bi_must_message(instr) && clause->message)
973 return false;
974
975 /* Some instructions have placement requirements */
976 if (bi_opcode_props[instr->op].last && !tuple->last)
977 return false;
978
979 if (bi_must_not_last(instr) && tuple->last)
980 return false;
981
/* Message-passing instructions are not guaranteed to write within the
 * same clause (most likely they will not), so if a later instruction
 * in the clause accesses the destination, the message-passing
 * instruction can't be scheduled */
986 if (bi_opcode_props[instr->op].sr_write && !bi_is_null(instr->dest[0])) {
987 unsigned nr = bi_count_write_registers(instr, 0);
988 assert(instr->dest[0].type == BI_INDEX_REGISTER);
989 unsigned reg = instr->dest[0].value;
990
991 for (unsigned i = 0; i < clause->access_count; ++i) {
992 bi_index idx = clause->accesses[i];
993 for (unsigned d = 0; d < nr; ++d) {
994 if (bi_is_equiv(bi_register(reg + d), idx))
995 return false;
996 }
997 }
998 }
999
1000 if (bi_opcode_props[instr->op].sr_read && !bi_is_null(instr->src[0])) {
1001 unsigned nr = bi_count_read_registers(instr, 0);
1002 assert(instr->src[0].type == BI_INDEX_REGISTER);
1003 unsigned reg = instr->src[0].value;
1004
1005 for (unsigned i = 0; i < clause->access_count; ++i) {
1006 bi_index idx = clause->accesses[i];
1007 for (unsigned d = 0; d < nr; ++d) {
1008 if (bi_is_equiv(bi_register(reg + d), idx))
1009 return false;
1010 }
1011 }
1012 }
1013
1014 /* If FAU is already assigned, we may not disrupt that. Do a
1015 * non-disruptive test update */
1016 if (!bi_update_fau(clause, tuple, instr, fma, false))
1017 return false;
1018
1019 /* If this choice of FMA would force a staging passthrough, the ADD
1020 * instruction must support such a passthrough */
1021 if (tuple->add && bi_has_staging_passthrough_hazard(instr->dest[0], tuple->add))
1022 return false;
1023
1024 /* If this choice of destination would force a cross-tuple passthrough, the next tuple must support that */
1025 if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr))
1026 return false;
1027
1028 /* Register file writes are limited */
1029 unsigned total_writes = tuple->reg.nr_writes;
1030 total_writes += bi_write_count(instr, live_after_temp);
1031
1032 /* Last tuple in a clause can only write a single value */
1033 if (tuple->last && total_writes > 1)
1034 return false;
1035
1036 /* Register file reads are limited, so count unique */
1037
1038 unsigned unique_new_srcs = 0;
1039
1040 bi_foreach_src(instr, s) {
1041 if (bi_tuple_is_new_src(instr, &tuple->reg, s))
1042 unique_new_srcs++;
1043 }
1044
1045 unsigned total_srcs = tuple->reg.nr_reads + unique_new_srcs;
1046
1047 bool can_spill_to_moves = (!tuple->add);
1048 can_spill_to_moves &= (bi_nconstants(clause) < 13 - (clause->tuple_count + 2));
1049 can_spill_to_moves &= (clause->tuple_count < 7);
1050
1051 /* However, we can get an extra 1 or 2 sources by inserting moves */
1052 if (total_srcs > (can_spill_to_moves ? 4 : 3))
1053 return false;
1054
1055 /* Count effective reads for the successor */
1056 unsigned succ_reads = bi_count_succ_reads(instr->dest[0],
1057 tuple->add ? tuple->add->dest[0] : bi_null(),
1058 tuple->prev_reads, tuple->nr_prev_reads);
1059
1060 /* Successor must satisfy R+W <= 4, so we require W <= 4-R */
1061 if ((signed) total_writes > (4 - (signed) succ_reads))
1062 return false;
1063
1064 return true;
1065 }
1066
1067 static signed
bi_instr_cost(bi_instr *instr, struct bi_tuple_state *tuple)
1069 {
1070 signed cost = 0;
1071
1072 /* Instructions that can schedule to either FMA or to ADD should be
1073 * deprioritized since they're easier to reschedule elsewhere */
1074 if (bi_can_fma(instr) && bi_can_add(instr))
1075 cost++;
1076
1077 /* Message-passing instructions impose constraints on the registers
1078 * later in the clause, so schedule them as late within a clause as
1079 * possible (<==> prioritize them since we're backwards <==> decrease
1080 * cost) */
1081 if (bi_must_message(instr))
1082 cost--;
1083
1084 /* Last instructions are big constraints (XXX: no effect on shader-db) */
1085 if (bi_opcode_props[instr->op].last)
1086 cost -= 2;
1087
1088 return cost;
1089 }
1090
1091 static unsigned
bi_choose_index(struct bi_worklist st,
                struct bi_clause_state *clause,
                struct bi_tuple_state *tuple,
                uint64_t live_after_temp,
                bool fma)
1097 {
1098 unsigned i, best_idx = ~0;
1099 signed best_cost = INT_MAX;
1100
1101 BITSET_FOREACH_SET(i, st.worklist, st.count) {
1102 bi_instr *instr = st.instructions[i];
1103
1104 if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma))
1105 continue;
1106
1107 signed cost = bi_instr_cost(instr, tuple);
1108
1109 /* Tie break in favour of later instructions, under the
1110 * assumption this promotes temporary usage (reducing pressure
1111 * on the register file). This is a side effect of a prepass
1112 * scheduling for pressure. */
1113
1114 if (cost <= best_cost) {
1115 best_idx = i;
1116 best_cost = cost;
1117 }
1118 }
1119
1120 return best_idx;
1121 }
1122
1123 static void
bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
             bi_instr *instr, uint64_t live_after_temp, bool fma)
1126 {
1127 bi_update_fau(clause, tuple, instr, fma, true);
1128
1129 /* TODO: maybe opt a bit? or maybe doesn't matter */
1130 assert(clause->access_count + BI_MAX_SRCS + BI_MAX_DESTS <= ARRAY_SIZE(clause->accesses));
1131 memcpy(clause->accesses + clause->access_count, instr->src, sizeof(instr->src));
1132 clause->access_count += BI_MAX_SRCS;
1133 memcpy(clause->accesses + clause->access_count, instr->dest, sizeof(instr->dest));
1134 clause->access_count += BI_MAX_DESTS;
1135 tuple->reg.nr_writes += bi_write_count(instr, live_after_temp);
1136
1137 bi_foreach_src(instr, s) {
1138 if (bi_tuple_is_new_src(instr, &tuple->reg, s))
1139 tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s];
1140 }
1141 }
1142
1143 /* Choose the best instruction and pop it off the worklist. Returns NULL if no
1144 * instruction is available. This function is destructive. */
1145
1146 static bi_instr *
bi_take_instr(bi_context *ctx, struct bi_worklist st,
              struct bi_clause_state *clause,
              struct bi_tuple_state *tuple,
              uint64_t live_after_temp,
              bool fma)
1152 {
1153 if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE)
1154 return bi_lower_cubeface(ctx, clause, tuple);
1155 else if (tuple->add && tuple->add->op == BI_OPCODE_PATOM_C_I32)
1156 return bi_lower_atom_c(ctx, clause, tuple);
1157 else if (tuple->add && tuple->add->op == BI_OPCODE_PATOM_C1_I32)
1158 return bi_lower_atom_c1(ctx, clause, tuple);
1159 else if (tuple->add && tuple->add->op == BI_OPCODE_SEG_ADD_I64)
1160 return bi_lower_seg_add(ctx, clause, tuple);
1161 else if (tuple->add && tuple->add->table)
1162 return bi_lower_dtsel(ctx, clause, tuple);
1163
1164 /* TODO: Optimize these moves */
1165 if (!fma && tuple->nr_prev_reads > 3) {
1166 /* Only spill by one source for now */
1167 assert(tuple->nr_prev_reads == 4);
1168
1169 /* Pick a source to spill */
1170 bi_index src = tuple->prev_reads[0];
1171
1172 /* Schedule the spill */
1173 bi_builder b = bi_init_builder(ctx, bi_before_tuple(tuple->prev));
1174 bi_instr *mov = bi_mov_i32_to(&b, src, src);
1175 bi_pop_instr(clause, tuple, mov, live_after_temp, fma);
1176 return mov;
1177 }
1178
1179 #ifndef NDEBUG
1180 /* Don't pair instructions if debugging */
1181 if ((bifrost_debug & BIFROST_DBG_NOSCHED) && tuple->add)
1182 return NULL;
1183 #endif
1184
1185 unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma);
1186
1187 if (idx >= st.count)
1188 return NULL;
1189
1190 /* Update state to reflect taking the instruction */
1191 bi_instr *instr = st.instructions[idx];
1192
1193 BITSET_CLEAR(st.worklist, idx);
1194 bi_update_worklist(st, idx);
1195 bi_pop_instr(clause, tuple, instr, live_after_temp, fma);
1196
1197 /* Fixups */
1198 if (instr->op == BI_OPCODE_IADD_U32 && fma) {
1199 assert(bi_can_iaddc(instr));
1200 instr->op = BI_OPCODE_IADDC_I32;
1201 instr->src[2] = bi_zero();
1202 }
1203
1204 return instr;
1205 }
1206
1207 /* Variant of bi_rewrite_index_src_single that uses word-equivalence, rewriting
1208 * to a passthrough register. If except_zero is true, the zeroth (first) source
1209 * is skipped, so staging register reads are not accidentally encoded as
1210 * passthrough (which is impossible) */
1211
1212 static void
bi_use_passthrough(bi_instr *ins, bi_index old,
                   enum bifrost_packed_src new,
                   bool except_zero)
1216 {
1217 /* Optional for convenience */
1218 if (!ins || bi_is_null(old))
1219 return;
1220
1221 bi_foreach_src(ins, i) {
1222 if (i == 0 && except_zero)
1223 continue;
1224
1225 if (bi_is_word_equiv(ins->src[i], old)) {
1226 ins->src[i].type = BI_INDEX_PASS;
1227 ins->src[i].value = new;
1228 ins->src[i].reg = false;
1229 ins->src[i].offset = 0;
1230 }
1231 }
1232 }
1233
1234 /* Rewrites an adjacent pair of tuples _prec_eding and _succ_eding to use
1235 * intertuple passthroughs where necessary. Passthroughs are allowed as a
1236 * post-condition of scheduling. Note we rewrite ADD first, FMA second --
1237 * opposite the order of execution. This is deliberate -- if both FMA and ADD
1238 * write to the same logical register, the next executed tuple will get the
1239 * latter result. There's no interference issue under the assumption of correct
1240 * register allocation. */
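
/* Illustrative example: if the preceding tuple's FMA writes r4 and the
 * succeeding ADD reads r4 (other than as a staging source, which cannot use
 * passthrough), that read is rewritten to the PASS_FMA slot instead of going
 * through the register file. */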
1241
1242 static void
bi_rewrite_passthrough(bi_tuple prec, bi_tuple succ)
1244 {
1245 bool sr_read = succ.add ? bi_opcode_props[succ.add->op].sr_read : false;
1246
1247 if (prec.add) {
1248 bi_use_passthrough(succ.fma, prec.add->dest[0], BIFROST_SRC_PASS_ADD, false);
1249 bi_use_passthrough(succ.add, prec.add->dest[0], BIFROST_SRC_PASS_ADD, sr_read);
1250 }
1251
1252 if (prec.fma) {
1253 bi_use_passthrough(succ.fma, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, false);
1254 bi_use_passthrough(succ.add, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, sr_read);
1255 }
1256 }
1257
1258 static void
bi_rewrite_fau_to_pass(bi_tuple *tuple)
1260 {
1261 bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
1262 if (ins->src[s].type != BI_INDEX_FAU) continue;
1263
1264 bi_index pass = bi_passthrough(ins->src[s].offset ?
1265 BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO);
1266
1267 ins->src[s] = bi_replace_index(ins->src[s], pass);
1268 }
1269 }
1270
1271 static void
bi_rewrite_zero(bi_instr *ins, bool fma)
1273 {
1274 bi_index zero = bi_passthrough(fma ? BIFROST_SRC_STAGE : BIFROST_SRC_FAU_LO);
1275
1276 bi_foreach_src(ins, s) {
1277 bi_index src = ins->src[s];
1278
1279 if (src.type == BI_INDEX_CONSTANT && src.value == 0)
1280 ins->src[s] = bi_replace_index(src, zero);
1281 }
1282 }
1283
1284 /* Assumes #0 to {T, FAU} rewrite has already occurred */
1285
1286 static void
bi_rewrite_constants_to_pass(bi_tuple *tuple, uint64_t constant, bool pcrel)
1288 {
1289 bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
1290 if (ins->src[s].type != BI_INDEX_CONSTANT) continue;
1291
1292 uint32_t cons = ins->src[s].value;
1293
1294 ASSERTED bool lo = (cons == (constant & 0xffffffff));
1295 bool hi = (cons == (constant >> 32ull));
1296
1297 /* PC offsets always live in the upper half, set to zero by
1298 * convention before pack time. (This is safe, since if you
1299 * wanted to compare against zero, you would use a BRANCHZ
1300 * instruction instead.) */
1301 if (cons == 0 && ins->branch_target != NULL) {
1302 assert(pcrel);
1303 hi = true;
1304 lo = false;
1305 } else if (pcrel) {
1306 hi = false;
1307 }
1308
1309 assert(lo || hi);
1310
1311 ins->src[s] = bi_replace_index(ins->src[s],
1312 bi_passthrough(hi ? BIFROST_SRC_FAU_HI :
1313 BIFROST_SRC_FAU_LO));
1314 }
1315 }
1316
1317 /* Constructs a constant state given a tuple state. This has the
1318 * postcondition that pcrel applies to the first constant by convention,
1319 * and PC-relative constants will be #0 by convention here, so swap to
1320 * match if needed */
1321
1322 static struct bi_const_state
bi_get_const_state(struct bi_tuple_state *tuple)
1324 {
1325 struct bi_const_state consts = {
1326 .constant_count = tuple->constant_count,
1327 .constants[0] = tuple->constants[0],
1328 .constants[1] = tuple->constants[1],
1329 .pcrel = tuple->add && tuple->add->branch_target,
1330 };
1331
1332 /* pcrel applies to the first constant by convention, and
1333 * PC-relative constants will be #0 by convention here, so swap
1334 * to match if needed */
1335 if (consts.pcrel && consts.constants[0]) {
1336 assert(consts.constant_count == 2);
1337 assert(consts.constants[1] == 0);
1338
1339 consts.constants[1] = consts.constants[0];
1340 consts.constants[0] = 0;
1341 }
1342
1343 return consts;
1344 }
1345
/* Merges constants in a clause, satisfying the following rules, assuming no
 * more than one tuple has pcrel:
 *
 * 1. If a tuple has two constants, they must be packed together. If one is
 * pcrel, it must be the high constant to use the M1=4 modification [sx64(E0) +
 * (PC << 32)]. Otherwise choose an arbitrary order.
 *
 * 2. If a tuple has one constant, it may be shared with an existing
 * pair that already contains that constant, or it may be combined with another
 * (distinct) tuple of a single constant.
 *
 * This guarantees a packing is possible. The next routine handles modification
 * related swapping, to satisfy format 12 and the lack of modification for
 * tuple count 5/8 in EC0.
 */
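
/* Worked example (illustrative): suppose three tuples use constants
 * {0x11, 0x22}, {0x22} and {0x33}, none PC-relative. bi_merge_pairs packs
 * {0x11, 0x22} into 64-bit word 0; bi_merge_singles then points the lone
 * 0x22 at word 0, leaves 0x33 pending, and finally emits it as word 1 with
 * a zero low half. */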
1361
1362 static uint64_t
bi_merge_u32(uint32_t c0, uint32_t c1, bool pcrel)
1364 {
/* At this point in the constant merge algorithm, pcrel constants are
 * treated as zero, so pcrel implies at least one constant is zero */
1367 assert(!pcrel || (c0 == 0 || c1 == 0));
1368
1369 /* Order: pcrel, maximum non-pcrel, minimum non-pcrel */
1370 uint32_t hi = pcrel ? 0 : MAX2(c0, c1);
1371 uint32_t lo = (c0 == hi) ? c1 : c0;
1372
1373 /* Merge in the selected order */
1374 return lo | (((uint64_t) hi) << 32ull);
1375 }
1376
1377 static unsigned
bi_merge_pairs(struct bi_const_state *consts, unsigned tuple_count,
               uint64_t *merged, unsigned *pcrel_pair)
1380 {
1381 unsigned merge_count = 0;
1382
1383 for (unsigned t = 0; t < tuple_count; ++t) {
1384 if (consts[t].constant_count != 2) continue;
1385
1386 unsigned idx = ~0;
1387 uint64_t val = bi_merge_u32(consts[t].constants[0],
1388 consts[t].constants[1], consts[t].pcrel);
1389
1390 /* Skip the pcrel pair if assigned, because if one is assigned,
1391 * this one is not pcrel by uniqueness so it's a mismatch */
1392 for (unsigned s = 0; s < merge_count; ++s) {
1393 if (merged[s] == val && (*pcrel_pair) != s) {
1394 idx = s;
1395 break;
1396 }
1397 }
1398
1399 if (idx == ~0) {
1400 idx = merge_count++;
1401 merged[idx] = val;
1402
1403 if (consts[t].pcrel)
1404 (*pcrel_pair) = idx;
1405 }
1406
1407 consts[t].word_idx = idx;
1408 }
1409
1410 return merge_count;
1411 }
1412
1413 static unsigned
bi_merge_singles(struct bi_const_state *consts, unsigned tuple_count,
                 uint64_t *pairs, unsigned pair_count, unsigned *pcrel_pair)
1416 {
1417 bool pending = false, pending_pcrel = false;
1418 uint32_t pending_single = 0;
1419
1420 for (unsigned t = 0; t < tuple_count; ++t) {
1421 if (consts[t].constant_count != 1) continue;
1422
1423 uint32_t val = consts[t].constants[0];
1424 unsigned idx = ~0;
1425
1426 /* Try to match, but don't match pcrel with non-pcrel, even
1427 * though we can merge a pcrel with a non-pcrel single */
1428 for (unsigned i = 0; i < pair_count; ++i) {
1429 bool lo = ((pairs[i] & 0xffffffff) == val);
1430 bool hi = ((pairs[i] >> 32) == val);
1431 bool match = (lo || hi);
1432 match &= ((*pcrel_pair) != i);
1433 if (match && !consts[t].pcrel) {
1434 idx = i;
1435 break;
1436 }
1437 }
1438
1439 if (idx == ~0) {
1440 idx = pair_count;
1441
1442 if (pending && pending_single != val) {
1443 assert(!(pending_pcrel && consts[t].pcrel));
1444 bool pcrel = pending_pcrel || consts[t].pcrel;
1445
1446 if (pcrel)
1447 *pcrel_pair = idx;
1448
1449 pairs[pair_count++] = bi_merge_u32(pending_single, val, pcrel);
1450
1451 pending = pending_pcrel = false;
1452 } else {
1453 pending = true;
1454 pending_pcrel = consts[t].pcrel;
1455 pending_single = val;
1456 }
1457 }
1458
1459 consts[t].word_idx = idx;
1460 }
1461
1462 /* Shift so it works whether pending_pcrel is set or not */
1463 if (pending) {
1464 if (pending_pcrel)
1465 *pcrel_pair = pair_count;
1466
1467 pairs[pair_count++] = ((uint64_t) pending_single) << 32ull;
1468 }
1469
1470 return pair_count;
1471 }
1472
1473 static unsigned
bi_merge_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned *pcrel_idx)
1475 {
1476 unsigned pair_count = bi_merge_pairs(consts, 8, pairs, pcrel_idx);
1477 return bi_merge_singles(consts, 8, pairs, pair_count, pcrel_idx);
1478 }
1479
1480 /* Swap two constants at word i and i+1 by swapping their actual positions and
1481 * swapping all references so the meaning of the clause is preserved */
1482
1483 static void
bi_swap_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned i)
1485 {
1486 uint64_t tmp_pair = pairs[i + 0];
1487 pairs[i + 0] = pairs[i + 1];
1488 pairs[i + 1] = tmp_pair;
1489
1490 for (unsigned t = 0; t < 8; ++t) {
1491 if (consts[t].word_idx == i)
1492 consts[t].word_idx = (i + 1);
1493 else if (consts[t].word_idx == (i + 1))
1494 consts[t].word_idx = i;
1495 }
1496 }
1497
1498 /* Given merged constants, one of which might be PC-relative, fix up the M
1499 * values so the PC-relative constant (if it exists) has the M1=4 modification
1500 * and other constants are used as-is (which might require swapping) */
1501
1502 static unsigned
bi_apply_constant_modifiers(struct bi_const_state *consts,
                            uint64_t *pairs, unsigned *pcrel_idx,
                            unsigned tuple_count, unsigned constant_count)
1506 {
1507 unsigned start = bi_ec0_packed(tuple_count) ? 1 : 0;
1508
1509 /* Clauses with these tuple counts lack an M field for the packed EC0,
1510 * so EC0 cannot be PC-relative, which might require swapping (and
1511 * possibly adding an unused constant) to fit */
1512
1513 if (*pcrel_idx == 0 && (tuple_count == 5 || tuple_count == 8)) {
1514 constant_count = MAX2(constant_count, 2);
1515 *pcrel_idx = 1;
1516 bi_swap_constants(consts, pairs, 0);
1517 }
1518
1519 /* EC0 might be packed free, after that constants are packed in pairs
1520 * (with clause format 12), with M1 values computed from the pair */
1521
1522 for (unsigned i = start; i < constant_count; i += 2) {
1523 bool swap = false;
1524 bool last = (i + 1) == constant_count;
1525
1526 unsigned A1 = (pairs[i] >> 60);
1527 unsigned B1 = (pairs[i + 1] >> 60);
1528
1529 if (*pcrel_idx == i || *pcrel_idx == (i + 1)) {
1530 /* PC-relative constant must be E0, not E1 */
1531 swap = (*pcrel_idx == (i + 1));
1532
1533 /* Set M1 = 4 by noting (A - B) mod 16 = 4 is
1534 * equivalent to A = (B + 4) mod 16 and that we can
1535 * control A */
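/* e.g. (illustrative) if B's top nibble is 0x9, A becomes
 * (0x9 + 4) & 0xF = 0xD, so (A - B) mod 16 = 4 as required */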
1536 unsigned B = swap ? A1 : B1;
1537 unsigned A = (B + 4) & 0xF;
1538 pairs[*pcrel_idx] |= ((uint64_t) A) << 60;
1539
1540 /* Swapped if swap set, identity if swap not set */
1541 *pcrel_idx = i;
1542 } else {
1543 /* Compute M1 value if we don't swap */
1544 unsigned M1 = (16 + A1 - B1) & 0xF;
1545
1546 /* For M1 = 0 or M1 >= 8, the constants are unchanged,
1547 * we have 0 < (A1 - B1) % 16 < 8, which implies (B1 -
1548 * A1) % 16 >= 8, so swapping will let them be used
1549 * unchanged */
1550 swap = (M1 != 0) && (M1 < 8);
1551
1552 /* However, we can't swap the last constant, so we
1553 * force M1 = 0 instead for this case */
1554 if (last && swap) {
1555 pairs[i + 1] |= pairs[i] & (0xfull << 60);
1556 swap = false;
1557 }
1558 }
1559
1560 if (swap) {
1561 assert(!last);
1562 bi_swap_constants(consts, pairs, i);
1563 }
1564 }
1565
1566 return constant_count;
1567 }
1568
1569 /* Schedule a single clause. If no instructions remain, return NULL. */
1570
1571 static bi_clause *
bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint64_t *live)
1573 {
1574 struct bi_clause_state clause_state = { 0 };
1575 bi_clause *clause = rzalloc(ctx, bi_clause);
1576 bi_tuple *tuple = NULL;
1577
1578 const unsigned max_tuples = ARRAY_SIZE(clause->tuples);
1579
1580 /* TODO: Decide flow control better */
1581 clause->flow_control = BIFROST_FLOW_NBTB;
1582
/* The last tuple in a clause can only write a single register, so initialize the register state used to track that */
1584 struct bi_reg_state reg_state = {};
1585 bi_index prev_reads[5] = { bi_null() };
1586 unsigned nr_prev_reads = 0;
1587
/* We need to track future liveness. The main *live set tracks what is
 * live at the current point in the program we are scheduling, but to
 * determine temp eligibility, we instead want what will be live after
 * the next tuple in the program. If we scheduled forwards, we'd need a
 * crystal ball for this. Luckily we schedule backwards, so we just
 * delay updates to live_after_temp by an extra tuple. */
1594 uint64_t live_after_temp = *live;
1595 uint64_t live_next_tuple = live_after_temp;
1596
1597 do {
1598 struct bi_tuple_state tuple_state = {
1599 .last = (clause->tuple_count == 0),
1600 .reg = reg_state,
1601 .nr_prev_reads = nr_prev_reads,
1602 .prev = tuple,
1603 .pcrel_idx = ~0,
1604 };
1605
1606 assert(nr_prev_reads < ARRAY_SIZE(prev_reads));
1607 memcpy(tuple_state.prev_reads, prev_reads, sizeof(prev_reads));
1608
1609 unsigned idx = max_tuples - clause->tuple_count - 1;
1610
1611 tuple = &clause->tuples[idx];
1612
1613 if (clause->message && bi_opcode_props[clause->message->op].sr_read && !bi_is_null(clause->message->src[0])) {
1614 unsigned nr = bi_count_read_registers(clause->message, 0);
1615 live_after_temp |= (BITFIELD64_MASK(nr) << clause->message->src[0].value);
1616 }
1617
1618 /* Since we schedule backwards, we schedule ADD first */
1619 tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, false);
1620 tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, true);
1621 tuple->add = tuple_state.add;
1622
1623 /* Update liveness from the new instructions */
1624 if (tuple->add)
1625 *live = bi_postra_liveness_ins(*live, tuple->add);
1626
1627 if (tuple->fma)
1628 *live = bi_postra_liveness_ins(*live, tuple->fma);
1629
1630 /* Rotate in the new per-tuple liveness */
1631 live_after_temp = live_next_tuple;
1632 live_next_tuple = *live;
1633
1634 /* We may have a message, but only one per clause */
1635 if (tuple->add && bi_must_message(tuple->add)) {
1636 assert(!clause_state.message);
1637 clause_state.message = true;
1638
1639 clause->message_type =
1640 bi_message_type_for_instr(tuple->add);
1641 clause->message = tuple->add;
1642
1643 /* We don't need to set dependencies for blend shaders
1644 * because the BLEND instruction in the fragment
1645 * shader should have already done the wait */
1646 if (!ctx->inputs->is_blend) {
1647 switch (tuple->add->op) {
1648 case BI_OPCODE_ATEST:
1649 clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
1650 break;
1651 case BI_OPCODE_LD_TILE:
1652 case BI_OPCODE_ST_TILE:
1653 clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
1654 break;
1655 case BI_OPCODE_BLEND:
1656 clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
1657 clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
1658 break;
1659 default:
1660 break;
1661 }
1662 }
1663 }
1664
1665 clause_state.consts[idx] = bi_get_const_state(&tuple_state);
1666
1667 /* Before merging constants, eliminate zeroes, otherwise the
1668 * merging will fight over the #0 that never gets read (and is
1669 * never marked as read by update_fau) */
1670 if (tuple->fma && bi_reads_zero(tuple->fma))
1671 bi_rewrite_zero(tuple->fma, true);
1672
1673 /* Rewrite away FAU, constant write is deferred */
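/* (If the tuple read inline constants instead, fau_idx is left unset
 * here and assigned after clause-wide constant merging below.) */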
1674 if (!tuple_state.constant_count) {
1675 tuple->fau_idx = tuple_state.fau;
1676 bi_rewrite_fau_to_pass(tuple);
1677 }
1678
1679 /* Use passthrough register for cross-stage accesses. Since
1680 * there are just FMA and ADD stages, that means we rewrite to
1681 * passthrough the sources of the ADD that read from the
1682 * destination of the FMA */
1683
1684 if (tuple->fma) {
1685 bi_use_passthrough(tuple->add, tuple->fma->dest[0],
1686 BIFROST_SRC_STAGE, false);
1687 }
1688
1689 /* Don't add an empty tuple, unless the worklist has nothing
1690 * but a (pseudo)instruction failing to schedule due to a "not
1691 * last instruction" constraint */
1692
1693 int some_instruction = __bitset_ffs(st.worklist, BITSET_WORDS(st.count));
1694 bool not_last = (some_instruction > 0) &&
1695 bi_must_not_last(st.instructions[some_instruction - 1]);
1696
1697 bool insert_empty = tuple_state.last && not_last;
1698
1699 if (!(tuple->fma || tuple->add || insert_empty))
1700 break;
1701
1702 clause->tuple_count++;
1703
1704 /* Adding enough tuples might overflow constants */
1705 if (!bi_space_for_more_constants(&clause_state))
1706 break;
1707
1708 #ifndef NDEBUG
1709 /* Don't schedule more than 1 tuple if debugging */
1710 if ((bifrost_debug & BIFROST_DBG_NOSCHED) && !insert_empty)
1711 break;
1712 #endif
1713
1714 /* Link through the register state */
1715 STATIC_ASSERT(sizeof(prev_reads) == sizeof(tuple_state.reg.reads));
1716 memcpy(prev_reads, tuple_state.reg.reads, sizeof(prev_reads));
1717 nr_prev_reads = tuple_state.reg.nr_reads;
1718 clause_state.tuple_count++;
1719 } while(clause->tuple_count < 8);
1720
1721 /* Don't schedule an empty clause */
1722 if (!clause->tuple_count)
1723 return NULL;
1724
1725 /* Before merging, rewrite away any tuples that read only zero */
1726 for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
1727 bi_tuple *tuple = &clause->tuples[i];
1728 struct bi_const_state *st = &clause_state.consts[i];
1729
1730 if (st->constant_count == 0 || st->constants[0] || st->constants[1] || st->pcrel)
1731 continue;
1732
1733 bi_foreach_instr_in_tuple(tuple, ins)
1734 bi_rewrite_zero(ins, false);
1735
1736 /* Constant has been demoted to FAU, so don't pack it separately */
1737 st->constant_count = 0;
1738
1739 /* Default */
1740 assert(tuple->fau_idx == BIR_FAU_ZERO);
1741 }
1742
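/* Merge the per-tuple constants into the clause's embedded constant
 * words: up to eight 64-bit entries, each packing two 32-bit constants,
 * with pcrel_idx recording which word (if any) holds the PC-relative
 * constant. */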
1743 uint64_t constant_pairs[8] = { 0 };
1744 unsigned pcrel_idx = ~0;
1745 unsigned constant_words =
1746 bi_merge_constants(clause_state.consts, constant_pairs, &pcrel_idx);
1747
1748 constant_words = bi_apply_constant_modifiers(clause_state.consts,
1749 constant_pairs, &pcrel_idx, clause->tuple_count,
1750 constant_words);
1751
1752 clause->pcrel_idx = pcrel_idx;
1753
1754 for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
1755 bi_tuple *tuple = &clause->tuples[i];
1756
1757 /* If no constants, leave FAU as it is, possibly defaulting to 0 */
1758 if (clause_state.consts[i].constant_count == 0)
1759 continue;
1760
1761 /* FAU is already handled */
1762 assert(!tuple->fau_idx);
1763
1764 unsigned word_idx = clause_state.consts[i].word_idx;
1765 assert(word_idx <= 8);
1766
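/* The bottom four bits of the constant are also encoded in the tuple's
 * FAU index, so tuples whose constants differ only in those bits could
 * in principle share a constant word. */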
1767 /* We could try to merge regardless of bottom bits as well, but
1768 * that's probably diminishing returns */
1769 uint64_t pair = constant_pairs[word_idx];
1770 unsigned lo = pair & 0xF;
1771
1772 tuple->fau_idx = bi_constant_field(word_idx) | lo;
1773 bi_rewrite_constants_to_pass(tuple, pair, word_idx == pcrel_idx);
1774 }
1775
1776 clause->constant_count = constant_words;
1777 memcpy(clause->constants, constant_pairs, sizeof(constant_pairs));
1778
1779 /* Branches must be last, so this can be factored out */
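/* (Prefetching the next sequential clause is useless after a JUMP, since
 * the clause that follows in memory need not be the one executed next.) */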
1780 bi_instr *last = clause->tuples[max_tuples - 1].add;
1781 clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
1782 clause->block = block;
1783
1784 /* TODO: scoreboard assignment post-sched */
1785 clause->dependencies |= (1 << 0);
1786
1787 /* We emit in reverse and emitted to the back of the tuples array, so
1788 * move it up front for easy indexing */
1789 memmove(clause->tuples,
1790 clause->tuples + (max_tuples - clause->tuple_count),
1791 clause->tuple_count * sizeof(clause->tuples[0]));
1792
1793 /* Use passthrough register for cross-tuple accesses. Note this is
1794 * after the memmove, so this is forwards. Skip the first tuple since
1795 * there is nothing before it to passthrough */
1796
1797 for (unsigned t = 1; t < clause->tuple_count; ++t)
1798 bi_rewrite_passthrough(clause->tuples[t - 1], clause->tuples[t]);
1799
1800 return clause;
1801 }
1802
1803 static void
1804 bi_schedule_block(bi_context *ctx, bi_block *block)
1805 {
1806 list_inithead(&block->clauses);
1807
1808 /* Copy list to dynamic array */
1809 struct bi_worklist st = bi_initialize_worklist(block,
1810 bifrost_debug & BIFROST_DBG_INORDER,
1811 ctx->inputs->is_blend);
1812
1813 if (!st.count) {
1814 bi_free_worklist(st);
1815 return;
1816 }
1817
1818 /* We need to track liveness during scheduling in order to determine whether we can use temporary (passthrough) registers */
1819 uint64_t live = block->reg_live_out;
1820
1821 /* Schedule as many clauses as needed to fill the block */
1822 bi_clause *u = NULL;
1823 while((u = bi_schedule_clause(ctx, block, st, &live)))
1824 list_add(&u->link, &block->clauses);
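/* Clauses are produced back-to-front since we schedule backwards, and
 * list_add prepends, so the finished list is in forward program order. */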
1825
1826 /* The back-to-back bit affects only the last clause of a block;
1827 * for the rest it is implicitly true */
1828 if (!list_is_empty(&block->clauses)) {
1829 bi_clause *last_clause = list_last_entry(&block->clauses, bi_clause, link);
1830 if (bi_reconverge_branches(block))
1831 last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL;
1832 }
1833
1834 /* Reorder instructions to match the new schedule. First remove
1835 * existing instructions and then recreate the list */
1836
1837 bi_foreach_instr_in_block_safe(block, ins) {
1838 list_del(&ins->link);
1839 }
1840
1841 bi_foreach_clause_in_block(block, clause) {
1842 for (unsigned i = 0; i < clause->tuple_count; ++i) {
1843 bi_foreach_instr_in_tuple(&clause->tuples[i], ins) {
1844 list_addtail(&ins->link, &block->instructions);
1845 }
1846 }
1847 }
1848
1849 block->scheduled = true;
1850
1851 #ifndef NDEBUG
1852 unsigned i;
1853 bool incomplete = false;
1854
1855 BITSET_FOREACH_SET(i, st.worklist, st.count) {
1856 bi_print_instr(st.instructions[i], stderr);
1857 incomplete = true;
1858 }
1859
1860 if (incomplete)
1861 unreachable("The above instructions failed to schedule.");
1862 #endif
1863
1864 bi_free_worklist(st);
1865 }
1866
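/* Returns whether source s of ins fits the tuple's FAU/constant budget:
 * at most two distinct 32-bit inline constants or a single FAU slot,
 * never both, and staging sources (src[0] of sr_read ops) may not read
 * constants or FAU at all. On success, the constant/FAU trackers passed
 * in are updated as a side effect. */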
1867 static bool
1868 bi_check_fau_src(bi_instr *ins, unsigned s, uint32_t *constants, unsigned *cwords, bi_index *fau)
1869 {
1870 bi_index src = ins->src[s];
1871
1872 /* Staging registers can't have FAU accesses */
1873 if (s == 0 && bi_opcode_props[ins->op].sr_read)
1874 return (src.type != BI_INDEX_CONSTANT) && (src.type != BI_INDEX_FAU);
1875
1876 if (src.type == BI_INDEX_CONSTANT) {
1877 /* Allow fast zero */
1878 if (src.value == 0 && bi_opcode_props[ins->op].fma && bi_reads_zero(ins))
1879 return true;
1880
1881 if (!bi_is_null(*fau))
1882 return false;
1883
1884 /* Else, try to inline a constant */
1885 for (unsigned i = 0; i < *cwords; ++i) {
1886 if (src.value == constants[i])
1887 return true;
1888 }
1889
1890 if (*cwords >= 2)
1891 return false;
1892
1893 constants[(*cwords)++] = src.value;
1894 } else if (src.type == BI_INDEX_FAU) {
1895 if (*cwords != 0)
1896 return false;
1897
1898 /* Can only read from one pair of FAU words */
1899 if (!bi_is_null(*fau) && (src.value != fau->value))
1900 return false;
1901
1902 /* If there is a target, we'll need a PC-relative constant */
1903 if (ins->branch_target)
1904 return false;
1905
1906 *fau = src;
1907 }
1908
1909 return true;
1910 }
1911
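/* Rewrite instructions so each one, on its own, satisfies the
 * FAU/constant constraints checked above: any source that would exceed
 * the budget is copied to a register by a MOV inserted just before the
 * instruction. */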
1912 void
1913 bi_lower_fau(bi_context *ctx)
1914 {
1915 bi_foreach_instr_global_safe(ctx, ins) {
1916 bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));
1917
1918 uint32_t constants[2];
1919 unsigned cwords = 0;
1920 bi_index fau = bi_null();
1921
1922 /* ATEST must have the ATEST datum encoded, not any other
1923 * uniform. See to it this is the case. */
1924 if (ins->op == BI_OPCODE_ATEST)
1925 fau = ins->src[2];
1926
1927 bi_foreach_src(ins, s) {
1928 if (bi_check_fau_src(ins, s, constants, &cwords, &fau)) continue;
1929
1930 bi_index copy = bi_mov_i32(&b, ins->src[s]);
1931 ins->src[s] = bi_replace_index(ins->src[s], copy);
1932 }
1933 }
1934 }
1935
1936 /* Only v7 allows specifying a dependency on the tilebuffer for the first
1937 * clause of a shader. v6 requires adding a NOP clause with the dependency. */
1938
1939 static void
1940 bi_add_nop_for_atest(bi_context *ctx)
1941 {
1942 /* Only needed on v6 */
1943 if (ctx->arch >= 7)
1944 return;
1945
1946 if (list_is_empty(&ctx->blocks))
1947 return;
1948
1949 /* Fetch the first clause of the shader */
1950 bi_block *block = list_first_entry(&ctx->blocks, bi_block, link);
1951 bi_clause *clause = bi_next_clause(ctx, block, NULL);
1952
1953 if (!clause || !(clause->dependencies & ((1 << BIFROST_SLOT_ELDEST_DEPTH) |
1954 (1 << BIFROST_SLOT_ELDEST_COLOUR))))
1955 return;
1956
1957 /* Add a NOP so we can wait for the dependencies required by the first
1958 * clause */
1959
1960 bi_instr *I = rzalloc(ctx, bi_instr);
1961 I->op = BI_OPCODE_NOP;
1962 I->dest[0] = bi_null();
1963
1964 bi_clause *new_clause = ralloc(ctx, bi_clause);
1965 *new_clause = (bi_clause) {
1966 .flow_control = BIFROST_FLOW_NBTB,
1967 .next_clause_prefetch = true,
1968 .block = clause->block,
1969
1970 .tuple_count = 1,
1971 .tuples[0] = { .fma = I, },
1972 };
1973
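/* Prepend so the NOP becomes the first clause of the shader; the
 * original clause, now second, may legally wait on its tilebuffer
 * dependencies. */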
1974 list_add(&new_clause->link, &clause->block->clauses);
1975 }
1976
1977 void
1978 bi_schedule(bi_context *ctx)
1979 {
1980 /* Fed into both scheduling and DCE */
1981 bi_postra_liveness(ctx);
1982
1983 bi_foreach_block(ctx, block) {
1984 bi_schedule_block(ctx, block);
1985 }
1986
1987 bi_opt_dce_post_ra(ctx);
1988 bi_add_nop_for_atest(ctx);
1989 }
1990