1 /*
2  * Copyright (C) 2020 Collabora Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors (Collabora):
24  *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25  */
26 
27 #include "compiler.h"
28 #include "bi_builder.h"
29 
30 /* Arguments common to worklist, passed by value for convenience */
31 
32 struct bi_worklist {
33         /* # of instructions in the block */
34         unsigned count;
35 
36         /* Instructions in the block */
37         bi_instr **instructions;
38 
39         /* Bitset of instructions in the block ready for scheduling */
40         BITSET_WORD *worklist;
41 
        /* The backwards dependency graph. dep_counts[i] is the number of
         * unscheduled instructions that must still be scheduled after (before)
         * instruction i. dependents[i] is the set of instructions that need to
         * be scheduled before (after) instruction i. */
46         unsigned *dep_counts;
47         BITSET_WORD **dependents;
48 };
49 
50 /* State of a single tuple and clause under construction */
51 
52 struct bi_reg_state {
53         /* Number of register writes */
54         unsigned nr_writes;
55 
56         /* Register reads, expressed as (equivalence classes of)
57          * sources. Only 3 reads are allowed, but up to 2 may spill as
58          * "forced" for the next scheduled tuple, provided such a tuple
59          * can be constructed */
60         bi_index reads[5];
61         unsigned nr_reads;
62 
63         /* The previous tuple scheduled (= the next tuple executed in the
64          * program) may require certain writes, in order to bypass the register
65          * file and use a temporary passthrough for the value. Up to 2 such
66          * constraints are architecturally satisfiable */
67         unsigned forced_count;
68         bi_index forceds[2];
69 };
70 
71 struct bi_tuple_state {
72         /* Is this the last tuple in the clause */
73         bool last;
74 
75         /* Scheduled ADD instruction, or null if none */
76         bi_instr *add;
77 
78         /* Reads for previous (succeeding) tuple */
79         bi_index prev_reads[5];
80         unsigned nr_prev_reads;
81         bi_tuple *prev;
82 
83         /* Register slot state for current tuple */
84         struct bi_reg_state reg;
85 
        /* Constants are shared in the tuple. If constant_count is nonzero, it
         * is the number of entries of constants[] in use. Otherwise, fau is
         * the slot read from FAU, or zero if none is assigned. Ordinarily FAU
         * slot 0 reads zero, but within a tuple, that should be encoded as
         * constant_count != 0 and constants[0] = constants[1] = 0 */
91         unsigned constant_count;
92 
93         union {
94                 uint32_t constants[2];
95                 enum bir_fau fau;
96         };
97 
98         unsigned pcrel_idx;
99 };
100 
101 struct bi_const_state {
102         unsigned constant_count;
103         bool pcrel; /* applies to first const */
104         uint32_t constants[2];
105 
106         /* Index of the constant into the clause */
107         unsigned word_idx;
108 };
109 
110 struct bi_clause_state {
111         /* Has a message-passing instruction already been assigned? */
112         bool message;
113 
        /* Indices already accessed; this needs to be tracked to avoid hazards
         * around message-passing instructions */
116         unsigned access_count;
117         bi_index accesses[(BI_MAX_SRCS + 2) * 16];
118 
119         unsigned tuple_count;
120         struct bi_const_state consts[8];
121 };
122 
/* Determines message type by checking the table and a few special cases. Only
124  * case missing is tilebuffer instructions that access depth/stencil, which
125  * require a Z_STENCIL message (to implement
126  * ARM_shader_framebuffer_fetch_depth_stencil) */
127 
128 static enum bifrost_message_type
bi_message_type_for_instr(bi_instr *ins)
130 {
131         enum bifrost_message_type msg = bi_opcode_props[ins->op].message;
132         bool ld_var_special = (ins->op == BI_OPCODE_LD_VAR_SPECIAL);
133 
134         if (ld_var_special && ins->varying_name == BI_VARYING_NAME_FRAG_Z)
135                 return BIFROST_MESSAGE_Z_STENCIL;
136 
137         if (msg == BIFROST_MESSAGE_LOAD && ins->seg == BI_SEG_UBO)
138                 return BIFROST_MESSAGE_ATTRIBUTE;
139 
140         return msg;
141 }
142 
143 /* Attribute, texture, and UBO load (attribute message) instructions support
144  * bindless, so just check the message type */
145 
146 ASSERTED static bool
bi_supports_dtsel(bi_instr *ins)
148 {
149         switch (bi_message_type_for_instr(ins)) {
150         case BIFROST_MESSAGE_ATTRIBUTE:
151                 return ins->op != BI_OPCODE_LD_GCLK_U64;
152         case BIFROST_MESSAGE_TEX:
153                 return true;
154         default:
155                 return false;
156         }
157 }
158 
159 /* Adds an edge to the dependency graph */
160 
161 static void
bi_push_dependency(unsigned parent, unsigned child,
163                 BITSET_WORD **dependents, unsigned *dep_counts)
164 {
165         if (!BITSET_TEST(dependents[parent], child)) {
166                 BITSET_SET(dependents[parent], child);
167                 dep_counts[child]++;
168         }
169 }
170 
171 static void
add_dependency(struct util_dynarray *table, unsigned index, unsigned child,
173                 BITSET_WORD **dependents, unsigned *dep_counts)
174 {
175         assert(index < 64);
176         util_dynarray_foreach(table + index, unsigned, parent)
177                 bi_push_dependency(*parent, child, dependents, dep_counts);
178 }
179 
180 static void
mark_access(struct util_dynarray *table, unsigned index, unsigned parent)
182 {
183         assert(index < 64);
184         util_dynarray_append(&table[index], unsigned, parent);
185 }
186 
187 static bool
bi_is_sched_barrier(bi_instr *I)
189 {
190         switch (I->op) {
191         case BI_OPCODE_BARRIER:
192         case BI_OPCODE_DISCARD_F32:
193                 return true;
194         default:
195                 return false;
196         }
197 }
198 
199 static void
bi_create_dependency_graph(struct bi_worklist st, bool inorder, bool is_blend)
201 {
202         struct util_dynarray last_read[64], last_write[64];
203 
204         for (unsigned i = 0; i < 64; ++i) {
205                 util_dynarray_init(&last_read[i], NULL);
206                 util_dynarray_init(&last_write[i], NULL);
207         }
208 
209         /* Initialize dependency graph */
210         for (unsigned i = 0; i < st.count; ++i) {
211                 st.dependents[i] =
212                         calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
213 
214                 st.dep_counts[i] = 0;
215         }
216 
217         unsigned prev_msg = ~0;
218 
219         /* Populate dependency graph */
220         for (signed i = st.count - 1; i >= 0; --i) {
221                 bi_instr *ins = st.instructions[i];
222 
223                 bi_foreach_src(ins, s) {
224                         if (ins->src[s].type != BI_INDEX_REGISTER) continue;
225                         unsigned count = bi_count_read_registers(ins, s);
226 
227                         for (unsigned c = 0; c < count; ++c)
228                                 add_dependency(last_write, ins->src[s].value + c, i, st.dependents, st.dep_counts);
229                 }
230 
231                 /* Keep message-passing ops in order. (This pass only cares
232                  * about bundling; reordering of message-passing instructions
233                  * happens during earlier scheduling.) */
234 
235                 if (bi_message_type_for_instr(ins)) {
236                         if (prev_msg != ~0)
237                                 bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts);
238 
239                         prev_msg = i;
240                 }
241 
                /* Handle schedule barriers, adding all the deps */
243                 if (inorder || bi_is_sched_barrier(ins)) {
244                         for (unsigned j = 0; j < st.count; ++j) {
245                                 if (i == j) continue;
246 
247                                 bi_push_dependency(MAX2(i, j), MIN2(i, j),
248                                                 st.dependents, st.dep_counts);
249                         }
250                 }
251 
252                 bi_foreach_dest(ins, d) {
253                         if (ins->dest[d].type != BI_INDEX_REGISTER) continue;
254                         unsigned dest = ins->dest[d].value;
255 
256                         unsigned count = bi_count_write_registers(ins, d);
257 
258                         for (unsigned c = 0; c < count; ++c) {
259                                 add_dependency(last_read, dest + c, i, st.dependents, st.dep_counts);
260                                 add_dependency(last_write, dest + c, i, st.dependents, st.dep_counts);
261                                 mark_access(last_write, dest + c, i);
262                         }
263                 }
264 
265                 /* Blend shaders are allowed to clobber R0-R15. Treat these
266                  * registers like extra destinations for scheduling purposes.
267                  */
268                 if (ins->op == BI_OPCODE_BLEND && !is_blend) {
269                         for (unsigned c = 0; c < 16; ++c) {
270                                 add_dependency(last_read, c, i, st.dependents, st.dep_counts);
271                                 add_dependency(last_write, c, i, st.dependents, st.dep_counts);
272                                 mark_access(last_write, c, i);
273                         }
274                 }
275 
276                 bi_foreach_src(ins, s) {
277                         if (ins->src[s].type != BI_INDEX_REGISTER) continue;
278 
279                         unsigned count = bi_count_read_registers(ins, s);
280 
281                         for (unsigned c = 0; c < count; ++c)
282                                 mark_access(last_read, ins->src[s].value + c, i);
283                 }
284         }
285 
286         /* If there is a branch, all instructions depend on it, as interblock
287          * execution must be purely in-order */
288 
289         bi_instr *last = st.instructions[st.count - 1];
290         if (last->branch_target || last->op == BI_OPCODE_JUMP) {
291                 for (signed i = st.count - 2; i >= 0; --i)
292                         bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts);
293         }
294 
295         /* Free the intermediate structures */
296         for (unsigned i = 0; i < 64; ++i) {
297                 util_dynarray_fini(&last_read[i]);
298                 util_dynarray_fini(&last_write[i]);
299         }
300 }
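
/* Worked example (illustrative opcodes and register numbers): for a block
 *
 *    I0: LOAD   r0, ...
 *    I1: FADD   r1, r0, r2
 *    I2: BLEND  ..., r1, ...
 *
 * the backwards walk visits I2 first; when I1's write of r1 is seen, I1 gains
 * a dependency on I2 (dep_counts[I1] = 1), and likewise I0 gains one on I1 via
 * r0. Only I2 starts on the worklist, so clauses are filled from the end of
 * the block backwards. */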
301 
/* Scheduler pseudoinstruction lowerings to enable instruction pairings.
 * Currently we only support CUBEFACE -> *CUBEFACE1/+CUBEFACE2
 */
305 
306 static bi_instr *
bi_lower_cubeface(bi_context *ctx,
308                 struct bi_clause_state *clause, struct bi_tuple_state *tuple)
309 {
310         bi_instr *pinstr = tuple->add;
311         bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
312         bi_instr *cubeface1 = bi_cubeface1_to(&b, pinstr->dest[0],
313                         pinstr->src[0], pinstr->src[1], pinstr->src[2]);
314 
315         pinstr->op = BI_OPCODE_CUBEFACE2;
316         pinstr->dest[0] = pinstr->dest[1];
317         pinstr->dest[1] = bi_null();
318         pinstr->src[0] = cubeface1->dest[0];
319         pinstr->src[1] = bi_null();
320         pinstr->src[2] = bi_null();
321 
322         return cubeface1;
323 }
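
/* For example, a pseudo CUBEFACE with destinations (d0, d1) and sources
 * (s0, s1, s2) is split by the lowering above into
 *
 *    *CUBEFACE1 d0, s0, s1, s2
 *    +CUBEFACE2 d1, d0
 *
 * with the ADD half consuming the FMA half's result. */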
324 
/* Pseudo arguments are (rbase, address lo, address hi). We need *ATOM_C.i32 to
326  * have the arguments (address lo, address hi, rbase), and +ATOM_CX to have the
327  * arguments (rbase, address lo, address hi, rbase) */
328 
329 static bi_instr *
bi_lower_atom_c(bi_context *ctx, struct bi_clause_state *clause, struct
331                 bi_tuple_state *tuple)
332 {
333         bi_instr *pinstr = tuple->add;
334         bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
335         bi_instr *atom_c = bi_atom_c_return_i32(&b,
336                         pinstr->src[1], pinstr->src[2], pinstr->src[0],
337                         pinstr->atom_opc);
338 
339         if (bi_is_null(pinstr->dest[0]))
340                 atom_c->op = BI_OPCODE_ATOM_C_I32;
341 
342         pinstr->op = BI_OPCODE_ATOM_CX;
343         pinstr->src[3] = atom_c->src[2];
344 
345         return atom_c;
346 }
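
/* For example, a pseudo PATOM_C.i32 with sources (rbase, addr_lo, addr_hi)
 * becomes
 *
 *    *ATOM_C.RETURN.i32 addr_lo, addr_hi, rbase
 *    +ATOM_CX           rbase, addr_lo, addr_hi, rbase
 *
 * where the FMA half drops RETURN when the pseudoinstruction's destination is
 * null. */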
347 
348 static bi_instr *
bi_lower_atom_c1(bi_context *ctx, struct bi_clause_state *clause, struct
350                 bi_tuple_state *tuple)
351 {
352         bi_instr *pinstr = tuple->add;
353         bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
354         bi_instr *atom_c = bi_atom_c1_return_i32(&b,
355                         pinstr->src[0], pinstr->src[1], pinstr->atom_opc);
356 
357         if (bi_is_null(pinstr->dest[0]))
358                 atom_c->op = BI_OPCODE_ATOM_C1_I32;
359 
360         pinstr->op = BI_OPCODE_ATOM_CX;
361         pinstr->src[2] = pinstr->src[1];
362         pinstr->src[1] = pinstr->src[0];
363         pinstr->src[3] = bi_dontcare();
364         pinstr->src[0] = bi_null();
365 
366         return atom_c;
367 }
368 
369 static bi_instr *
bi_lower_seg_add(bi_context *ctx,
371                 struct bi_clause_state *clause, struct bi_tuple_state *tuple)
372 {
373         bi_instr *pinstr = tuple->add;
374         bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
375 
376         bi_instr *fma = bi_seg_add_to(&b, pinstr->dest[0], pinstr->src[0],
377                         pinstr->preserve_null, pinstr->seg);
378 
379         pinstr->op = BI_OPCODE_SEG_ADD;
380         pinstr->src[0] = pinstr->src[1];
381         pinstr->src[1] = bi_null();
382 
383         assert(pinstr->dest[0].type == BI_INDEX_REGISTER);
384         pinstr->dest[0].value += 1;
385 
386         return fma;
387 }
388 
389 static bi_instr *
bi_lower_dtsel(bi_context *ctx,
391                 struct bi_clause_state *clause, struct bi_tuple_state *tuple)
392 {
393         bi_instr *add = tuple->add;
394         bi_builder b = bi_init_builder(ctx, bi_before_instr(add));
395 
396         bi_instr *dtsel = bi_dtsel_imm_to(&b, bi_temp(b.shader),
397                         add->src[0], add->table);
398         add->src[0] = dtsel->dest[0];
399 
400         assert(bi_supports_dtsel(add));
401         return dtsel;
402 }
403 
404 /* Flatten linked list to array for O(1) indexing */
405 
406 static bi_instr **
bi_flatten_block(bi_block *block, unsigned *len)
408 {
409         if (list_is_empty(&block->instructions))
410                 return NULL;
411 
412         *len = list_length(&block->instructions);
413         bi_instr **instructions = malloc(sizeof(bi_instr *) * (*len));
414 
415         unsigned i = 0;
416 
417         bi_foreach_instr_in_block(block, ins)
418                 instructions[i++] = ins;
419 
420         return instructions;
421 }
422 
/* The worklist tracks instructions without outstanding dependencies. For
 * debug, in-order scheduling can be forced, which adds a total order of
 * dependencies between the instructions of the block.
 */
426 
427 static struct bi_worklist
bi_initialize_worklist(bi_block *block, bool inorder, bool is_blend)
429 {
430         struct bi_worklist st = { };
431         st.instructions = bi_flatten_block(block, &st.count);
432 
433         if (!st.count)
434                 return st;
435 
436         st.dependents = calloc(st.count, sizeof(st.dependents[0]));
437         st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0]));
438 
439         bi_create_dependency_graph(st, inorder, is_blend);
440         st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
441 
442         for (unsigned i = 0; i < st.count; ++i) {
443                 if (st.dep_counts[i] == 0)
444                         BITSET_SET(st.worklist, i);
445         }
446 
447         return st;
448 }
449 
450 static void
bi_free_worklist(struct bi_worklist st)
452 {
453         free(st.dep_counts);
454         free(st.dependents);
455         free(st.instructions);
456         free(st.worklist);
457 }
458 
459 static void
bi_update_worklist(struct bi_worklist st, unsigned idx)
461 {
462         assert(st.dep_counts[idx] == 0);
463 
464         if (!st.dependents[idx])
465                 return;
466 
467         /* Iterate each dependent to remove one dependency (`done`),
468          * adding dependents to the worklist where possible. */
469 
470         unsigned i;
471         BITSET_FOREACH_SET(i, st.dependents[idx], st.count) {
472                 assert(st.dep_counts[i] != 0);
473                 unsigned new_deps = --st.dep_counts[i];
474 
475                 if (new_deps == 0)
476                         BITSET_SET(st.worklist, i);
477         }
478 
479         free(st.dependents[idx]);
480 }
481 
482 /* Scheduler predicates */
483 
484 /* IADDC.i32 can implement IADD.u32 if no saturation or swizzling is in use */
485 static bool
bi_can_iaddc(bi_instr *ins)
487 {
488         return (ins->op == BI_OPCODE_IADD_U32 && !ins->saturate &&
489                 ins->src[0].swizzle == BI_SWIZZLE_H01 &&
490                 ins->src[1].swizzle == BI_SWIZZLE_H01);
491 }
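
/* For example, IADD.u32 r0, r1, r2 with no saturate and identity (H01)
 * swizzles may be placed on FMA; bi_take_instr then rewrites it to
 * IADDC.i32 r0, r1, r2 with a constant-zero carry-in as the third source. */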
492 
493 bool
bi_can_fma(bi_instr *ins)
495 {
496         /* +IADD.i32 -> *IADDC.i32 */
497         if (bi_can_iaddc(ins))
498                 return true;
499 
500         /* TODO: some additional fp16 constraints */
501         return bi_opcode_props[ins->op].fma;
502 }
503 
504 static bool
bi_impacted_fadd_widens(bi_instr *I)
506 {
507         enum bi_swizzle swz0 = I->src[0].swizzle;
508         enum bi_swizzle swz1 = I->src[1].swizzle;
509 
510         return (swz0 == BI_SWIZZLE_H00 && swz1 == BI_SWIZZLE_H11) ||
511                 (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H11) ||
512                 (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H00);
513 }
514 
515 bool
bi_can_add(bi_instr *ins)
517 {
518         /* +FADD.v2f16 lacks clamp modifier, use *FADD.v2f16 instead */
519         if (ins->op == BI_OPCODE_FADD_V2F16 && ins->clamp)
520                 return false;
521 
522         /* +FCMP.v2f16 lacks abs modifier, use *FCMP.v2f16 instead */
523         if (ins->op == BI_OPCODE_FCMP_V2F16 && (ins->src[0].abs || ins->src[1].abs))
524                 return false;
525 
        /* +FADD.f32 has restricted widens, use *FADD.f32 for the full set */
        if (ins->op == BI_OPCODE_FADD_F32 && bi_impacted_fadd_widens(ins))
                return false;
529 
530         /* TODO: some additional fp16 constraints */
531         return bi_opcode_props[ins->op].add;
532 }
533 
/* Architecturally, no single instruction has a "not last" constraint. However,
 * pseudoinstructions writing multiple destinations (expanding to multiple
 * paired instructions) can run afoul of the "no two writes in the last tuple"
 * constraint, so we check for that here.
 */
539 
540 static bool
bi_must_not_last(bi_instr *ins)
542 {
543         return !bi_is_null(ins->dest[0]) && !bi_is_null(ins->dest[1]);
544 }
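
/* For example, the CUBEFACE pseudoinstruction writes both dest[0] and dest[1]
 * until bi_lower_cubeface splits it, so it is kept out of the last tuple. */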
545 
546 /* Check for a message-passing instruction. +DISCARD.f32 is special-cased; we
547  * treat it as a message-passing instruction for the purpose of scheduling
 * despite passing no logical message. Otherwise invalid encoding faults may
549  * be raised for unknown reasons (possibly an errata).
550  */
551 
552 bool
bi_must_message(bi_instr *ins)
554 {
555         return (bi_opcode_props[ins->op].message != BIFROST_MESSAGE_NONE) ||
556                 (ins->op == BI_OPCODE_DISCARD_F32);
557 }
558 
559 static bool
bi_fma_atomic(enum bi_opcode op)
561 {
562         switch (op) {
563         case BI_OPCODE_ATOM_C_I32:
564         case BI_OPCODE_ATOM_C_I64:
565         case BI_OPCODE_ATOM_C1_I32:
566         case BI_OPCODE_ATOM_C1_I64:
567         case BI_OPCODE_ATOM_C1_RETURN_I32:
568         case BI_OPCODE_ATOM_C1_RETURN_I64:
569         case BI_OPCODE_ATOM_C_RETURN_I32:
570         case BI_OPCODE_ATOM_C_RETURN_I64:
571         case BI_OPCODE_ATOM_POST_I32:
572         case BI_OPCODE_ATOM_POST_I64:
573         case BI_OPCODE_ATOM_PRE_I64:
574                 return true;
575         default:
576                 return false;
577         }
578 }
579 
580 bool
bi_reads_zero(bi_instr *ins)
582 {
583         return !(bi_fma_atomic(ins->op) || ins->op == BI_OPCODE_IMULD);
584 }
585 
586 bool
bi_reads_temps(bi_instr *ins, unsigned src)
588 {
589         switch (ins->op) {
590         /* Cannot permute a temporary */
591         case BI_OPCODE_CLPER_I32:
592         case BI_OPCODE_CLPER_V6_I32:
593                 return src != 0;
594         case BI_OPCODE_IMULD:
595                 return false;
596         default:
597                 return true;
598         }
599 }
600 
601 static bool
bi_impacted_t_modifiers(bi_instr *I, unsigned src)
603 {
604         enum bi_swizzle swizzle = I->src[src].swizzle;
605 
606         switch (I->op) {
607         case BI_OPCODE_F16_TO_F32:
608         case BI_OPCODE_F16_TO_S32:
609         case BI_OPCODE_F16_TO_U32:
610         case BI_OPCODE_MKVEC_V2I16:
611         case BI_OPCODE_S16_TO_F32:
612         case BI_OPCODE_S16_TO_S32:
613         case BI_OPCODE_U16_TO_F32:
614         case BI_OPCODE_U16_TO_U32:
615                 return (swizzle != BI_SWIZZLE_H00);
616 
617         case BI_OPCODE_BRANCH_F32:
618         case BI_OPCODE_LOGB_F32:
619         case BI_OPCODE_ILOGB_F32:
620         case BI_OPCODE_FADD_F32:
621         case BI_OPCODE_FCMP_F32:
622         case BI_OPCODE_FREXPE_F32:
623         case BI_OPCODE_FREXPM_F32:
624         case BI_OPCODE_FROUND_F32:
625                 return (swizzle != BI_SWIZZLE_H01);
626 
627         case BI_OPCODE_IADD_S32:
628         case BI_OPCODE_IADD_U32:
629         case BI_OPCODE_ISUB_S32:
630         case BI_OPCODE_ISUB_U32:
631         case BI_OPCODE_IADD_V4S8:
632         case BI_OPCODE_IADD_V4U8:
633         case BI_OPCODE_ISUB_V4S8:
634         case BI_OPCODE_ISUB_V4U8:
635                 return (src == 1) && (swizzle != BI_SWIZZLE_H01);
636 
637         case BI_OPCODE_S8_TO_F32:
638         case BI_OPCODE_S8_TO_S32:
639         case BI_OPCODE_U8_TO_F32:
640         case BI_OPCODE_U8_TO_U32:
641                 return (swizzle != BI_SWIZZLE_B0000);
642 
643         case BI_OPCODE_V2S8_TO_V2F16:
644         case BI_OPCODE_V2S8_TO_V2S16:
645         case BI_OPCODE_V2U8_TO_V2F16:
646         case BI_OPCODE_V2U8_TO_V2U16:
647                 return (swizzle != BI_SWIZZLE_B0022);
648 
649         case BI_OPCODE_IADD_V2S16:
650         case BI_OPCODE_IADD_V2U16:
651         case BI_OPCODE_ISUB_V2S16:
652         case BI_OPCODE_ISUB_V2U16:
653                 return (src == 1) && (swizzle >= BI_SWIZZLE_H11);
654 
655 #if 0
656         /* Restriction on IADD in 64-bit clauses on G72 */
657         case BI_OPCODE_IADD_S64:
658         case BI_OPCODE_IADD_U64:
659                 return (src == 1) && (swizzle != BI_SWIZZLE_D0);
660 #endif
661 
662         default:
663                 return false;
664         }
665 }
666 
667 bool
bi_reads_t(bi_instr *ins, unsigned src)
669 {
670         /* Branch offset cannot come from passthrough */
671         if (bi_opcode_props[ins->op].branch)
672                 return src != 2;
673 
674         /* Table can never read passthrough */
675         if (bi_opcode_props[ins->op].table)
676                 return false;
677 
678         /* Staging register reads may happen before the succeeding register
679          * block encodes a write, so effectively there is no passthrough */
680         if (src == 0 && bi_opcode_props[ins->op].sr_read)
681                 return false;
682 
683         /* Bifrost cores newer than Mali G71 have restrictions on swizzles on
684          * same-cycle temporaries. Check the list for these hazards. */
685         if (bi_impacted_t_modifiers(ins, src))
686                 return false;
687 
688         /* Descriptor must not come from a passthrough */
689         switch (ins->op) {
690         case BI_OPCODE_LD_CVT:
691         case BI_OPCODE_LD_TILE:
692         case BI_OPCODE_ST_CVT:
693         case BI_OPCODE_ST_TILE:
694         case BI_OPCODE_TEXC:
695                 return src != 2;
696         case BI_OPCODE_BLEND:
697                 return src != 2 && src != 3;
698 
699         /* Else, just check if we can read any temps */
700         default:
701                 return bi_reads_temps(ins, src);
702         }
703 }
704 
705 /* Counts the number of 64-bit constants required by a clause. TODO: We
706  * might want to account for merging, right now we overestimate, but
707  * that's probably fine most of the time */
708 
709 static unsigned
bi_nconstants(struct bi_clause_state *clause)
711 {
712         unsigned count_32 = 0;
713 
714         for (unsigned i = 0; i < ARRAY_SIZE(clause->consts); ++i)
715                 count_32 += clause->consts[i].constant_count;
716 
717         return DIV_ROUND_UP(count_32, 2);
718 }
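
/* For example, tuples carrying 1, 2, 0 and 1 32-bit constants contribute
 * DIV_ROUND_UP(1 + 2 + 0 + 1, 2) = 2 64-bit constants to the clause. */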
719 
720 /* Would there be space for constants if we added one tuple? */
721 
722 static bool
bi_space_for_more_constants(struct bi_clause_state *clause)
724 {
725         return (bi_nconstants(clause) < 13 - (clause->tuple_count + 1));
726 }
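
/* For example, a clause that already holds 3 tuples may grow to 4 tuples only
 * if fewer than 13 - 4 = 9 64-bit constants are in use, since constants and
 * tuples share space in the clause encoding. */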
727 
728 /* Updates the FAU assignment for a tuple. A valid FAU assignment must be
729  * possible (as a precondition), though not necessarily on the selected unit;
 * this is guaranteed per-instruction by bi_lower_fau and per-tuple by
731  * bi_instr_schedulable */
732 
733 static bool
bi_update_fau(struct bi_clause_state *clause,
735                 struct bi_tuple_state *tuple,
736                 bi_instr *instr, bool fma, bool destructive)
737 {
738         /* Maintain our own constants, for nondestructive mode */
739         uint32_t copied_constants[2], copied_count;
740         unsigned *constant_count = &tuple->constant_count;
741         uint32_t *constants = tuple->constants;
742         enum bir_fau fau = tuple->fau;
743 
744         if (!destructive) {
745                 memcpy(copied_constants, tuple->constants,
746                                 (*constant_count) * sizeof(constants[0]));
747                 copied_count = tuple->constant_count;
748 
749                 constant_count = &copied_count;
750                 constants = copied_constants;
751         }
752 
753         bi_foreach_src(instr, s) {
754                 bi_index src = instr->src[s];
755 
756                 if (src.type == BI_INDEX_FAU) {
757                         bool no_constants = *constant_count == 0;
758                         bool no_other_fau = (fau == src.value) || !fau;
759                         bool mergable = no_constants && no_other_fau;
760 
761                         if (destructive) {
762                                 assert(mergable);
763                                 tuple->fau = src.value;
764                         } else if (!mergable) {
765                                 return false;
766                         }
767 
768                         fau = src.value;
769                 } else if (src.type == BI_INDEX_CONSTANT) {
770                         /* No need to reserve space if we have a fast 0 */
771                         if (src.value == 0 && fma && bi_reads_zero(instr))
772                                 continue;
773 
774                         /* If there is a branch target, #0 by convention is the
775                          * PC-relative offset to the target */
776                         bool pcrel = instr->branch_target && src.value == 0;
777                         bool found = false;
778 
779                         for (unsigned i = 0; i < *constant_count; ++i) {
780                                 found |= (constants[i] == src.value) &&
781                                         (i != tuple->pcrel_idx);
782                         }
783 
784                         /* pcrel constants are unique, so don't match */
785                         if (found && !pcrel)
786                                 continue;
787 
788                         bool no_fau = (*constant_count > 0) || !fau;
789                         bool mergable = no_fau && ((*constant_count) < 2);
790 
791                         if (destructive) {
792                                 assert(mergable);
793 
794                                 if (pcrel)
795                                         tuple->pcrel_idx = *constant_count;
796                         } else if (!mergable)
797                                 return false;
798 
799                         constants[(*constant_count)++] = src.value;
800                 }
801         }
802 
803         /* Constants per clause may be limited by tuple count */
804         bool room_for_constants = (*constant_count == 0) ||
805                 bi_space_for_more_constants(clause);
806 
807         if (destructive)
808                 assert(room_for_constants);
809         else if (!room_for_constants)
810                 return false;
811 
812         return true;
813 }
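
/* For example, a candidate instruction reading a FAU slot (say, a uniform)
 * cannot join a tuple that already embeds an inline constant, and a tuple may
 * embed at most two distinct 32-bit constants. bi_instr_schedulable uses the
 * nondestructive mode above to reject such candidates up front. */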
814 
815 /* Given an in-progress tuple, a candidate new instruction to add to the tuple,
816  * and a source (index) from that candidate, determine whether this source is
817  * "new", in the sense of requiring an additional read slot. That is, checks
818  * whether the specified source reads from the register file via a read slot
819  * (determined by its type and placement) and whether the source was already
820  * specified by a prior read slot (to avoid double counting) */
821 
822 static bool
bi_tuple_is_new_src(bi_instr *instr, struct bi_reg_state *reg, unsigned src_idx)
824 {
825         bi_index src = instr->src[src_idx];
826 
827         /* Only consider sources which come from the register file */
828         if (!(src.type == BI_INDEX_NORMAL || src.type == BI_INDEX_REGISTER))
829                 return false;
830 
831         /* Staging register reads bypass the usual register file mechanism */
832         if (src_idx == 0 && bi_opcode_props[instr->op].sr_read)
833                 return false;
834 
835         /* If a source is already read in the tuple, it is already counted */
836         for (unsigned t = 0; t < reg->nr_reads; ++t)
837                 if (bi_is_word_equiv(src, reg->reads[t]))
838                         return false;
839 
840         /* If a source is read in _this instruction_, it is already counted */
841         for (unsigned t = 0; t < src_idx; ++t)
842                 if (bi_is_word_equiv(src, instr->src[t]))
843                         return false;
844 
845         return true;
846 }
847 
848 /* Given two tuples in source order, count the number of register reads of the
849  * successor, determined as the number of unique words accessed that aren't
850  * written by the predecessor (since those are tempable).
851  */
852 
853 static unsigned
bi_count_succ_reads(bi_index t0, bi_index t1,
855                 bi_index *succ_reads, unsigned nr_succ_reads)
856 {
857         unsigned reads = 0;
858 
859         for (unsigned i = 0; i < nr_succ_reads; ++i) {
860                 bool unique = true;
861 
862                 for (unsigned j = 0; j < i; ++j)
863                         if (bi_is_word_equiv(succ_reads[i], succ_reads[j]))
864                                 unique = false;
865 
866                 if (!unique)
867                         continue;
868 
869                 if (bi_is_word_equiv(succ_reads[i], t0))
870                         continue;
871 
872                 if (bi_is_word_equiv(succ_reads[i], t1))
873                         continue;
874 
875                 reads++;
876         }
877 
878         return reads;
879 }
880 
881 /* Not all instructions can read from the staging passthrough (as determined by
882  * reads_t), check if a given pair of instructions has such a restriction. Note
883  * we also use this mechanism to prevent data races around staging register
884  * reads, so we allow the input source to potentially be vector-valued */
885 
886 static bool
bi_has_staging_passthrough_hazard(bi_index fma, bi_instr *add)
888 {
889         bi_foreach_src(add, s) {
890                 bi_index src = add->src[s];
891 
892                 if (src.type != BI_INDEX_REGISTER)
893                         continue;
894 
895                 unsigned count = bi_count_read_registers(add, s);
896                 bool read = false;
897 
898                 for (unsigned d = 0; d < count; ++d)
899                         read |= bi_is_equiv(fma, bi_register(src.value + d));
900 
901                 if (read && !bi_reads_t(add, s))
902                         return true;
903         }
904 
905         return false;
906 }
907 
908 /* Likewise for cross-tuple passthrough (reads_temps) */
909 
910 static bool
bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins)
912 {
913         bi_foreach_instr_in_tuple(succ, pins) {
914                 bi_foreach_src(pins, s) {
915                         if (bi_is_word_equiv(ins->dest[0], pins->src[s]) &&
916                                         !bi_reads_temps(pins, s))
917                                 return true;
918                 }
919         }
920 
921         return false;
922 }
923 
/* Is a register written other than via the staging mechanism? ATEST is special,
925  * writing to both a staging register and a regular register (fixed packing).
926  * BLEND is special since it has to write r48 the normal way even if it never
927  * gets read. This depends on liveness analysis, as a register is not needed
928  * for a write that will be discarded after one tuple. */
929 
930 static unsigned
bi_write_count(bi_instr *instr, uint64_t live_after_temp)
932 {
933         if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND)
934                 return 1;
935 
936         unsigned count = 0;
937 
938         bi_foreach_dest(instr, d) {
939                 if (d == 0 && bi_opcode_props[instr->op].sr_write)
940                         continue;
941 
942                 if (bi_is_null(instr->dest[d]))
943                         continue;
944 
945                 assert(instr->dest[0].type == BI_INDEX_REGISTER);
946                 if (live_after_temp & BITFIELD64_BIT(instr->dest[0].value))
947                         count++;
948         }
949 
950         return count;
951 }
952 
953 /* Instruction placement entails two questions: what subset of instructions in
954  * the block can legally be scheduled? and of those which is the best? That is,
955  * we seek to maximize a cost function on a subset of the worklist satisfying a
956  * particular predicate. The necessary predicate is determined entirely by
957  * Bifrost's architectural limitations and is described in the accompanying
958  * whitepaper. The cost function is a heuristic. */
959 
960 static bool
bi_instr_schedulable(bi_instr *instr,
962                 struct bi_clause_state *clause,
963                 struct bi_tuple_state *tuple,
964                 uint64_t live_after_temp,
965                 bool fma)
966 {
967         /* The units must match */
968         if ((fma && !bi_can_fma(instr)) || (!fma && !bi_can_add(instr)))
969                 return false;
970 
971         /* There can only be one message-passing instruction per clause */
972         if (bi_must_message(instr) && clause->message)
973                 return false;
974 
975         /* Some instructions have placement requirements */
976         if (bi_opcode_props[instr->op].last && !tuple->last)
977                 return false;
978 
979         if (bi_must_not_last(instr) && tuple->last)
980                 return false;
981 
        /* Message-passing instructions are not guaranteed to write within the
983          * same clause (most likely they will not), so if a later instruction
984          * in the clause accesses the destination, the message-passing
985          * instruction can't be scheduled */
986         if (bi_opcode_props[instr->op].sr_write && !bi_is_null(instr->dest[0])) {
987                 unsigned nr = bi_count_write_registers(instr, 0);
988                 assert(instr->dest[0].type == BI_INDEX_REGISTER);
989                 unsigned reg = instr->dest[0].value;
990 
991                 for (unsigned i = 0; i < clause->access_count; ++i) {
992                         bi_index idx = clause->accesses[i];
993                         for (unsigned d = 0; d < nr; ++d) {
994                                 if (bi_is_equiv(bi_register(reg + d), idx))
995                                         return false;
996                         }
997                 }
998         }
999 
1000         if (bi_opcode_props[instr->op].sr_read && !bi_is_null(instr->src[0])) {
1001                 unsigned nr = bi_count_read_registers(instr, 0);
1002                 assert(instr->src[0].type == BI_INDEX_REGISTER);
1003                 unsigned reg = instr->src[0].value;
1004 
1005                 for (unsigned i = 0; i < clause->access_count; ++i) {
1006                         bi_index idx = clause->accesses[i];
1007                         for (unsigned d = 0; d < nr; ++d) {
1008                                 if (bi_is_equiv(bi_register(reg + d), idx))
1009                                         return false;
1010                         }
1011                 }
1012         }
1013 
1014         /* If FAU is already assigned, we may not disrupt that. Do a
1015          * non-disruptive test update */
1016         if (!bi_update_fau(clause, tuple, instr, fma, false))
1017                 return false;
1018 
1019         /* If this choice of FMA would force a staging passthrough, the ADD
1020          * instruction must support such a passthrough */
1021         if (tuple->add && bi_has_staging_passthrough_hazard(instr->dest[0], tuple->add))
1022                 return false;
1023 
1024         /* If this choice of destination would force a cross-tuple passthrough, the next tuple must support that */
1025         if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr))
1026                 return false;
1027 
1028         /* Register file writes are limited */
1029         unsigned total_writes = tuple->reg.nr_writes;
1030         total_writes += bi_write_count(instr, live_after_temp);
1031 
1032         /* Last tuple in a clause can only write a single value */
1033         if (tuple->last && total_writes > 1)
1034                 return false;
1035 
1036         /* Register file reads are limited, so count unique */
1037 
1038         unsigned unique_new_srcs = 0;
1039 
1040         bi_foreach_src(instr, s) {
1041                 if (bi_tuple_is_new_src(instr, &tuple->reg, s))
1042                         unique_new_srcs++;
1043         }
1044 
1045         unsigned total_srcs = tuple->reg.nr_reads + unique_new_srcs;
1046 
1047         bool can_spill_to_moves = (!tuple->add);
1048         can_spill_to_moves &= (bi_nconstants(clause) < 13 - (clause->tuple_count + 2));
1049         can_spill_to_moves &= (clause->tuple_count < 7);
1050 
1051         /* However, we can get an extra 1 or 2 sources by inserting moves */
1052         if (total_srcs > (can_spill_to_moves ? 4 : 3))
1053                 return false;
1054 
1055         /* Count effective reads for the successor */
1056         unsigned succ_reads = bi_count_succ_reads(instr->dest[0],
1057                         tuple->add ? tuple->add->dest[0] : bi_null(),
1058                         tuple->prev_reads, tuple->nr_prev_reads);
1059 
1060         /* Successor must satisfy R+W <= 4, so we require W <= 4-R */
1061         if ((signed) total_writes > (4 - (signed) succ_reads))
1062                 return false;
1063 
1064         return true;
1065 }
1066 
1067 static signed
bi_instr_cost(bi_instr *instr, struct bi_tuple_state *tuple)
1069 {
1070         signed cost = 0;
1071 
1072         /* Instructions that can schedule to either FMA or to ADD should be
1073          * deprioritized since they're easier to reschedule elsewhere */
1074         if (bi_can_fma(instr) && bi_can_add(instr))
1075                 cost++;
1076 
1077         /* Message-passing instructions impose constraints on the registers
1078          * later in the clause, so schedule them as late within a clause as
1079          * possible (<==> prioritize them since we're backwards <==> decrease
1080          * cost) */
1081         if (bi_must_message(instr))
1082                 cost--;
1083 
1084         /* Last instructions are big constraints (XXX: no effect on shader-db) */
1085         if (bi_opcode_props[instr->op].last)
1086                 cost -= 2;
1087 
1088         return cost;
1089 }
1090 
1091 static unsigned
bi_choose_index(struct bi_worklist st,
1093                 struct bi_clause_state *clause,
1094                 struct bi_tuple_state *tuple,
1095                 uint64_t live_after_temp,
1096                 bool fma)
1097 {
1098         unsigned i, best_idx = ~0;
1099         signed best_cost = INT_MAX;
1100 
1101         BITSET_FOREACH_SET(i, st.worklist, st.count) {
1102                 bi_instr *instr = st.instructions[i];
1103 
1104                 if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma))
1105                         continue;
1106 
1107                 signed cost = bi_instr_cost(instr, tuple);
1108 
1109                 /* Tie break in favour of later instructions, under the
1110                  * assumption this promotes temporary usage (reducing pressure
1111                  * on the register file). This is a side effect of a prepass
1112                  * scheduling for pressure. */
1113 
1114                 if (cost <= best_cost) {
1115                         best_idx = i;
1116                         best_cost = cost;
1117                 }
1118         }
1119 
1120         return best_idx;
1121 }
1122 
1123 static void
bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
1125                 bi_instr *instr, uint64_t live_after_temp, bool fma)
1126 {
1127         bi_update_fau(clause, tuple, instr, fma, true);
1128 
1129         /* TODO: maybe opt a bit? or maybe doesn't matter */
1130         assert(clause->access_count + BI_MAX_SRCS + BI_MAX_DESTS <= ARRAY_SIZE(clause->accesses));
1131         memcpy(clause->accesses + clause->access_count, instr->src, sizeof(instr->src));
1132         clause->access_count += BI_MAX_SRCS;
1133         memcpy(clause->accesses + clause->access_count, instr->dest, sizeof(instr->dest));
1134         clause->access_count += BI_MAX_DESTS;
1135         tuple->reg.nr_writes += bi_write_count(instr, live_after_temp);
1136 
1137         bi_foreach_src(instr, s) {
1138                 if (bi_tuple_is_new_src(instr, &tuple->reg, s))
1139                         tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s];
1140         }
1141 }
1142 
1143 /* Choose the best instruction and pop it off the worklist. Returns NULL if no
1144  * instruction is available. This function is destructive. */
1145 
1146 static bi_instr *
bi_take_instr(bi_context *ctx, struct bi_worklist st,
1148                 struct bi_clause_state *clause,
1149                 struct bi_tuple_state *tuple,
1150                 uint64_t live_after_temp,
1151                 bool fma)
1152 {
1153         if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE)
1154                 return bi_lower_cubeface(ctx, clause, tuple);
1155         else if (tuple->add && tuple->add->op == BI_OPCODE_PATOM_C_I32)
1156                 return bi_lower_atom_c(ctx, clause, tuple);
1157         else if (tuple->add && tuple->add->op == BI_OPCODE_PATOM_C1_I32)
1158                 return bi_lower_atom_c1(ctx, clause, tuple);
1159         else if (tuple->add && tuple->add->op == BI_OPCODE_SEG_ADD_I64)
1160                 return bi_lower_seg_add(ctx, clause, tuple);
1161         else if (tuple->add && tuple->add->table)
1162                 return bi_lower_dtsel(ctx, clause, tuple);
1163 
1164         /* TODO: Optimize these moves */
1165         if (!fma && tuple->nr_prev_reads > 3) {
1166                 /* Only spill by one source for now */
1167                 assert(tuple->nr_prev_reads == 4);
1168 
1169                 /* Pick a source to spill */
1170                 bi_index src = tuple->prev_reads[0];
1171 
1172                 /* Schedule the spill */
1173                 bi_builder b = bi_init_builder(ctx, bi_before_tuple(tuple->prev));
1174                 bi_instr *mov = bi_mov_i32_to(&b, src, src);
1175                 bi_pop_instr(clause, tuple, mov, live_after_temp, fma);
1176                 return mov;
1177         }
1178 
1179 #ifndef NDEBUG
1180         /* Don't pair instructions if debugging */
1181         if ((bifrost_debug & BIFROST_DBG_NOSCHED) && tuple->add)
1182                 return NULL;
1183 #endif
1184 
1185         unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma);
1186 
1187         if (idx >= st.count)
1188                 return NULL;
1189 
1190         /* Update state to reflect taking the instruction */
1191         bi_instr *instr = st.instructions[idx];
1192 
1193         BITSET_CLEAR(st.worklist, idx);
1194         bi_update_worklist(st, idx);
1195         bi_pop_instr(clause, tuple, instr, live_after_temp, fma);
1196 
1197         /* Fixups */
1198         if (instr->op == BI_OPCODE_IADD_U32 && fma) {
1199                 assert(bi_can_iaddc(instr));
1200                 instr->op = BI_OPCODE_IADDC_I32;
1201                 instr->src[2] = bi_zero();
1202         }
1203 
1204         return instr;
1205 }
1206 
1207 /* Variant of bi_rewrite_index_src_single that uses word-equivalence, rewriting
1208  * to a passthrough register. If except_zero is true, the zeroth (first) source
1209  * is skipped, so staging register reads are not accidentally encoded as
1210  * passthrough (which is impossible) */
1211 
1212 static void
bi_use_passthrough(bi_instr *ins, bi_index old,
1214                 enum bifrost_packed_src new,
1215                 bool except_zero)
1216 {
1217         /* Optional for convenience */
1218         if (!ins || bi_is_null(old))
1219                 return;
1220 
1221         bi_foreach_src(ins, i) {
1222                 if (i == 0 && except_zero)
1223                         continue;
1224 
1225                 if (bi_is_word_equiv(ins->src[i], old)) {
1226                         ins->src[i].type = BI_INDEX_PASS;
1227                         ins->src[i].value = new;
1228                         ins->src[i].reg = false;
1229                         ins->src[i].offset = 0;
1230                 }
1231         }
1232 }
1233 
1234 /* Rewrites an adjacent pair of tuples _prec_eding and _succ_eding to use
1235  * intertuple passthroughs where necessary. Passthroughs are allowed as a
1236  * post-condition of scheduling. Note we rewrite ADD first, FMA second --
1237  * opposite the order of execution. This is deliberate -- if both FMA and ADD
1238  * write to the same logical register, the next executed tuple will get the
1239  * latter result. There's no interference issue under the assumption of correct
1240  * register allocation. */
1241 
1242 static void
bi_rewrite_passthrough(bi_tuple prec, bi_tuple succ)
1244 {
1245         bool sr_read = succ.add ? bi_opcode_props[succ.add->op].sr_read : false;
1246 
1247         if (prec.add) {
1248                 bi_use_passthrough(succ.fma, prec.add->dest[0], BIFROST_SRC_PASS_ADD, false);
1249                 bi_use_passthrough(succ.add, prec.add->dest[0], BIFROST_SRC_PASS_ADD, sr_read);
1250         }
1251 
1252         if (prec.fma) {
1253                 bi_use_passthrough(succ.fma, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, false);
1254                 bi_use_passthrough(succ.add, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, sr_read);
1255         }
1256 }
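
/* For example, if the preceding tuple's FMA wrote (say) r4 and the succeeding
 * ADD reads r4, that source is rewritten to the BIFROST_SRC_PASS_FMA
 * passthrough instead of re-reading the register file. */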
1257 
1258 static void
bi_rewrite_fau_to_pass(bi_tuple *tuple)
1260 {
1261         bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
1262                 if (ins->src[s].type != BI_INDEX_FAU) continue;
1263 
1264                 bi_index pass = bi_passthrough(ins->src[s].offset ?
1265                                 BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO);
1266 
1267                 ins->src[s] = bi_replace_index(ins->src[s], pass);
1268         }
1269 }
1270 
1271 static void
bi_rewrite_zero(bi_instr *ins, bool fma)
1273 {
1274         bi_index zero = bi_passthrough(fma ? BIFROST_SRC_STAGE : BIFROST_SRC_FAU_LO);
1275 
1276         bi_foreach_src(ins, s) {
1277                 bi_index src = ins->src[s];
1278 
1279                 if (src.type == BI_INDEX_CONSTANT && src.value == 0)
1280                         ins->src[s] = bi_replace_index(src, zero);
1281         }
1282 }
1283 
1284 /* Assumes #0 to {T, FAU} rewrite has already occurred */
1285 
1286 static void
bi_rewrite_constants_to_pass(bi_tuple *tuple, uint64_t constant, bool pcrel)
1288 {
1289         bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
1290                 if (ins->src[s].type != BI_INDEX_CONSTANT) continue;
1291 
1292                 uint32_t cons = ins->src[s].value;
1293 
1294                 ASSERTED bool lo = (cons == (constant & 0xffffffff));
1295                 bool hi = (cons == (constant >> 32ull));
1296 
1297                 /* PC offsets always live in the upper half, set to zero by
1298                  * convention before pack time. (This is safe, since if you
1299                  * wanted to compare against zero, you would use a BRANCHZ
1300                  * instruction instead.) */
1301                 if (cons == 0 && ins->branch_target != NULL) {
1302                         assert(pcrel);
1303                         hi = true;
1304                         lo = false;
1305                 } else if (pcrel) {
1306                         hi = false;
1307                 }
1308 
1309                 assert(lo || hi);
1310 
1311                 ins->src[s] = bi_replace_index(ins->src[s],
1312                                 bi_passthrough(hi ?  BIFROST_SRC_FAU_HI :
1313                                         BIFROST_SRC_FAU_LO));
1314         }
1315 }
1316 
1317 /* Constructs a constant state given a tuple state. This has the
1318  * postcondition that pcrel applies to the first constant by convention,
1319  * and PC-relative constants will be #0 by convention here, so swap to
1320  * match if needed */
1321 
1322 static struct bi_const_state
bi_get_const_state(struct bi_tuple_state *tuple)
1324 {
1325         struct bi_const_state consts = {
1326                 .constant_count = tuple->constant_count,
1327                 .constants[0] = tuple->constants[0],
1328                 .constants[1] = tuple->constants[1],
1329                 .pcrel = tuple->add && tuple->add->branch_target,
1330         };
1331 
1332         /* pcrel applies to the first constant by convention, and
1333          * PC-relative constants will be #0 by convention here, so swap
1334          * to match if needed */
1335         if (consts.pcrel && consts.constants[0]) {
1336                 assert(consts.constant_count == 2);
1337                 assert(consts.constants[1] == 0);
1338 
1339                 consts.constants[1] = consts.constants[0];
1340                 consts.constants[0] = 0;
1341         }
1342 
1343         return consts;
1344 }
1345 
1346 /* Merges constants in a clause, satisfying the following rules, assuming no
1347  * more than one tuple has pcrel:
1348  *
1349  * 1. If a tuple has two constants, they must be packed together. If one is
1350  * pcrel, it must be the high constant to use the M1=4 modification [sx64(E0) +
1351  * (PC << 32)]. Otherwise choose an arbitrary order.
1352  *
 * 2. If a tuple has one constant, it may be shared with an existing
1354  * pair that already contains that constant, or it may be combined with another
1355  * (distinct) tuple of a single constant.
1356  *
 * This guarantees a packing is possible. The next routine handles modification
1358  * related swapping, to satisfy format 12 and the lack of modification for
1359  * tuple count 5/8 in EC0.
1360  */
1361 
1362 static uint64_t
bi_merge_u32(uint32_t c0, uint32_t c1, bool pcrel)
1364 {
1365         /* At this point in the constant merge algorithm, pcrel constants are
         * treated as zero, so pcrel implies at least one constant is zero */
1367         assert(!pcrel || (c0 == 0 || c1 == 0));
1368 
1369         /* Order: pcrel, maximum non-pcrel, minimum non-pcrel */
1370         uint32_t hi = pcrel ? 0 : MAX2(c0, c1);
1371         uint32_t lo = (c0 == hi) ? c1 : c0;
1372 
1373         /* Merge in the selected order */
1374         return lo | (((uint64_t) hi) << 32ull);
1375 }
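
/* For instance, bi_merge_u32(3, 7, false) yields 0x0000000700000003, while
 * bi_merge_u32(0, 5, true) yields 0x0000000000000005, leaving the zero pcrel
 * placeholder in the upper half. */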
1376 
1377 static unsigned
1378 bi_merge_pairs(struct bi_const_state *consts, unsigned tuple_count,
1379                 uint64_t *merged, unsigned *pcrel_pair)
1380 {
1381         unsigned merge_count = 0;
1382 
1383         for (unsigned t = 0; t < tuple_count; ++t) {
1384                 if (consts[t].constant_count != 2) continue;
1385 
1386                 unsigned idx = ~0;
1387                 uint64_t val = bi_merge_u32(consts[t].constants[0],
1388                                 consts[t].constants[1], consts[t].pcrel);
1389 
1390                 /* Skip the pcrel pair if assigned, because if one is assigned,
1391                  * this one is not pcrel by uniqueness so it's a mismatch */
1392                 for (unsigned s = 0; s < merge_count; ++s) {
1393                         if (merged[s] == val && (*pcrel_pair) != s) {
1394                                 idx = s;
1395                                 break;
1396                         }
1397                 }
1398 
1399                 if (idx == ~0) {
1400                         idx = merge_count++;
1401                         merged[idx] = val;
1402 
1403                         if (consts[t].pcrel)
1404                                 (*pcrel_pair) = idx;
1405                 }
1406 
1407                 consts[t].word_idx = idx;
1408         }
1409 
1410         return merge_count;
1411 }
1412 
1413 static unsigned
1414 bi_merge_singles(struct bi_const_state *consts, unsigned tuple_count,
1415                 uint64_t *pairs, unsigned pair_count, unsigned *pcrel_pair)
1416 {
1417         bool pending = false, pending_pcrel = false;
1418         uint32_t pending_single = 0;
1419 
1420         for (unsigned t = 0; t < tuple_count; ++t) {
1421                 if (consts[t].constant_count != 1) continue;
1422 
1423                 uint32_t val = consts[t].constants[0];
1424                 unsigned idx = ~0;
1425 
1426                 /* Try to match, but don't match pcrel with non-pcrel, even
1427                  * though we can merge a pcrel with a non-pcrel single */
1428                 for (unsigned i = 0; i < pair_count; ++i) {
1429                         bool lo = ((pairs[i] & 0xffffffff) == val);
1430                         bool hi = ((pairs[i] >> 32) == val);
1431                         bool match = (lo || hi);
1432                         match &= ((*pcrel_pair) != i);
1433                         if (match && !consts[t].pcrel) {
1434                                 idx = i;
1435                                 break;
1436                         }
1437                 }
1438 
1439                 if (idx == ~0) {
1440                         idx = pair_count;
1441 
1442                         if (pending && pending_single != val) {
1443                                 assert(!(pending_pcrel && consts[t].pcrel));
1444                                 bool pcrel = pending_pcrel || consts[t].pcrel;
1445 
1446                                 if (pcrel)
1447                                         *pcrel_pair = idx;
1448 
1449                                 pairs[pair_count++] = bi_merge_u32(pending_single, val, pcrel);
1450 
1451                                 pending = pending_pcrel = false;
1452                         } else {
1453                                 pending = true;
1454                                 pending_pcrel = consts[t].pcrel;
1455                                 pending_single = val;
1456                         }
1457                 }
1458 
1459                 consts[t].word_idx = idx;
1460         }
1461 
1462         /* Shift into the upper half (where a pcrel constant must live) so this works whether pending_pcrel is set or not */
1463         if (pending) {
1464                 if (pending_pcrel)
1465                         *pcrel_pair = pair_count;
1466 
1467                 pairs[pair_count++] = ((uint64_t) pending_single) << 32ull;
1468         }
1469 
1470         return pair_count;
1471 }
1472 
1473 static unsigned
1474 bi_merge_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned *pcrel_idx)
1475 {
1476         unsigned pair_count = bi_merge_pairs(consts, 8, pairs, pcrel_idx);
1477         return bi_merge_singles(consts, 8, pairs, pair_count, pcrel_idx);
1478 }
1479 
1480 /* Swap two constants at word i and i+1 by swapping their actual positions and
1481  * swapping all references so the meaning of the clause is preserved */
1482 
1483 static void
1484 bi_swap_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned i)
1485 {
1486         uint64_t tmp_pair = pairs[i + 0];
1487         pairs[i + 0] = pairs[i + 1];
1488         pairs[i + 1] = tmp_pair;
1489 
1490         for (unsigned t = 0; t < 8; ++t) {
1491                 if (consts[t].word_idx == i)
1492                         consts[t].word_idx = (i + 1);
1493                 else if (consts[t].word_idx == (i + 1))
1494                         consts[t].word_idx = i;
1495         }
1496 }
1497 
1498 /* Given merged constants, one of which might be PC-relative, fix up the M
1499  * values so the PC-relative constant (if it exists) has the M1=4 modification
1500  * and other constants are used as-is (which might require swapping) */
1501 
1502 static unsigned
1503 bi_apply_constant_modifiers(struct bi_const_state *consts,
1504                 uint64_t *pairs, unsigned *pcrel_idx,
1505                 unsigned tuple_count, unsigned constant_count)
1506 {
1507         unsigned start = bi_ec0_packed(tuple_count) ? 1 : 0;
1508 
1509         /* Clauses with these tuple counts lack an M field for the packed EC0,
1510          * so EC0 cannot be PC-relative, which might require swapping (and
1511          * possibly adding an unused constant) to fit */
1512 
1513         if (*pcrel_idx == 0 && (tuple_count == 5 || tuple_count == 8)) {
1514                 constant_count = MAX2(constant_count, 2);
1515                 *pcrel_idx = 1;
1516                 bi_swap_constants(consts, pairs, 0);
1517         }
1518 
1519         /* EC0 might be packed free, after that constants are packed in pairs
1520          * (with clause format 12), with M1 values computed from the pair */
1521 
1522         for (unsigned i = start; i < constant_count; i += 2) {
1523                 bool swap = false;
1524                 bool last = (i + 1) == constant_count;
1525 
1526                 unsigned A1 = (pairs[i] >> 60);
1527                 unsigned B1 = (pairs[i + 1] >> 60);
1528 
1529                 if (*pcrel_idx == i || *pcrel_idx == (i + 1)) {
1530                         /* PC-relative constant must be E0, not E1 */
1531                         swap = (*pcrel_idx == (i + 1));
1532 
1533                         /* Set M1 = 4 by noting (A - B) mod 16 = 4 is
1534                          * equivalent to A = (B + 4) mod 16 and that we can
1535                          * control A */
1536                         unsigned B = swap ? A1 : B1;
1537                         unsigned A = (B + 4) & 0xF;
1538                         pairs[*pcrel_idx] |= ((uint64_t) A) << 60;
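                        /* E.g. if the other constant's top nibble B is 0xE,
                         * then A = (0xE + 4) & 0xF = 0x2 and
                         * M1 = (0x2 - 0xE) mod 16 = 4 as required */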
1539 
1540                         /* Swapped if swap set, identity if swap not set */
1541                         *pcrel_idx = i;
1542                 } else {
1543                         /* Compute M1 value if we don't swap */
1544                         unsigned M1 = (16 + A1 - B1) & 0xF;
1545 
1546                         /* For M1 = 0 or M1 >= 8, the constants are used
1547                          * unchanged. Otherwise 0 < (A1 - B1) % 16 < 8, which
1548                          * implies (B1 - A1) % 16 >= 8, so swapping will let
1549                          * them be used unchanged */
1550                         swap = (M1 != 0) && (M1 < 8);
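                        /* E.g. A1 = 0x9, B1 = 0x6 gives M1 = 3, so we swap;
                         * the swapped order has M1 = (16 + 6 - 9) & 0xF = 13,
                         * which needs no modification */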
1551 
1552                         /* However, we can't swap the last constant, so force
1553                          * M1 = 0 by copying E0's top nibble into the unused trailing slot */
1554                         if (last && swap) {
1555                                 pairs[i + 1] |= pairs[i] & (0xfull << 60);
1556                                 swap = false;
1557                         }
1558                 }
1559 
1560                 if (swap) {
1561                         assert(!last);
1562                         bi_swap_constants(consts, pairs, i);
1563                 }
1564         }
1565 
1566         return constant_count;
1567 }
1568 
1569 /* Schedule a single clause. If no instructions remain, return NULL. */
1570 
1571 static bi_clause *
1572 bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint64_t *live)
1573 {
1574         struct bi_clause_state clause_state = { 0 };
1575         bi_clause *clause = rzalloc(ctx, bi_clause);
1576         bi_tuple *tuple = NULL;
1577 
1578         const unsigned max_tuples = ARRAY_SIZE(clause->tuples);
1579 
1580         /* TODO: Decide flow control better */
1581         clause->flow_control = BIFROST_FLOW_NBTB;
1582 
1583         /* The last clause can only write one instruction, so initialize that */
1584         struct bi_reg_state reg_state = {};
1585         bi_index prev_reads[5] = { bi_null() };
1586         unsigned nr_prev_reads = 0;
1587 
1588         /* We need to track future liveness. The main *live set tracks what is
1589          * live at the current point in the program we are scheduling, but to
1590          * determine temp eligibility, we instead want what will be live after
1591          * the next tuple in the program. If you scheduled forwards, you'd need
1592          * a crystal ball for this. Luckily we schedule backwards, so we just
1593          * delay updates to the live_after_temp by an extra tuple. */
1594         uint64_t live_after_temp = *live;
1595         uint64_t live_next_tuple = live_after_temp;
1596 
1597         do {
1598                 struct bi_tuple_state tuple_state = {
1599                         .last = (clause->tuple_count == 0),
1600                         .reg = reg_state,
1601                         .nr_prev_reads = nr_prev_reads,
1602                         .prev = tuple,
1603                         .pcrel_idx = ~0,
1604                 };
1605 
1606                 assert(nr_prev_reads < ARRAY_SIZE(prev_reads));
1607                 memcpy(tuple_state.prev_reads, prev_reads, sizeof(prev_reads));
1608 
1609                 unsigned idx = max_tuples - clause->tuple_count - 1;
1610 
1611                 tuple = &clause->tuples[idx];
1612 
1613                 if (clause->message && bi_opcode_props[clause->message->op].sr_read && !bi_is_null(clause->message->src[0])) {
1614                         unsigned nr = bi_count_read_registers(clause->message, 0);
1615                         live_after_temp |= (BITFIELD64_MASK(nr) << clause->message->src[0].value);
1616                 }
1617 
1618                 /* Since we schedule backwards, we schedule ADD first */
1619                 tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, false);
1620                 tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, true);
1621                 tuple->add = tuple_state.add;
1622 
1623                 /* Update liveness from the new instructions */
1624                 if (tuple->add)
1625                         *live = bi_postra_liveness_ins(*live, tuple->add);
1626 
1627                 if (tuple->fma)
1628                         *live = bi_postra_liveness_ins(*live, tuple->fma);
1629 
1630                 /* Rotate in the new per-tuple liveness */
1631                 live_after_temp = live_next_tuple;
1632                 live_next_tuple = *live;
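                /* After this rotation, with tuple T just scheduled,
                 * live_next_tuple holds the liveness before T and
                 * live_after_temp the liveness after T, which is what the
                 * next iteration (scheduling the tuple preceding T) needs */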
1633 
1634                 /* We may have a message, but only one per clause */
1635                 if (tuple->add && bi_must_message(tuple->add)) {
1636                         assert(!clause_state.message);
1637                         clause_state.message = true;
1638 
1639                         clause->message_type =
1640                                 bi_message_type_for_instr(tuple->add);
1641                         clause->message = tuple->add;
1642 
1643                         /* We don't need to set dependencies for blend shaders
1644                          * because the BLEND instruction in the fragment
1645                          * shader should have already done the wait */
1646                         if (!ctx->inputs->is_blend) {
1647                                 switch (tuple->add->op) {
1648                                 case BI_OPCODE_ATEST:
1649                                         clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
1650                                         break;
1651                                 case BI_OPCODE_LD_TILE:
1652                                 case BI_OPCODE_ST_TILE:
1653                                         clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
1654                                         break;
1655                                 case BI_OPCODE_BLEND:
1656                                         clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
1657                                         clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
1658                                         break;
1659                                 default:
1660                                         break;
1661                                 }
1662                         }
1663                 }
1664 
1665                 clause_state.consts[idx] = bi_get_const_state(&tuple_state);
1666 
1667                 /* Before merging constants, eliminate zeroes, otherwise the
1668                  * merging will fight over the #0 that never gets read (and is
1669                  * never marked as read by update_fau) */
1670                 if (tuple->fma && bi_reads_zero(tuple->fma))
1671                         bi_rewrite_zero(tuple->fma, true);
1672 
1673                 /* Rewrite away FAU, constant write is deferred */
1674                 if (!tuple_state.constant_count) {
1675                         tuple->fau_idx = tuple_state.fau;
1676                         bi_rewrite_fau_to_pass(tuple);
1677                 }
1678 
1679                 /* Use passthrough register for cross-stage accesses. Since
1680                  * there are just FMA and ADD stages, that means we rewrite to
1681                  * passthrough the sources of the ADD that read from the
1682                  * destination of the FMA */
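                /* E.g. if the FMA writes r4 and the ADD reads r4, that ADD
                 * source becomes the STAGE passthrough rather than a register
                 * file read of r4 */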
1683 
1684                 if (tuple->fma) {
1685                         bi_use_passthrough(tuple->add, tuple->fma->dest[0],
1686                                         BIFROST_SRC_STAGE, false);
1687                 }
1688 
1689                 /* Don't add an empty tuple, unless the worklist has nothing
1690                  * but a (pseudo)instruction failing to schedule due to a "not
1691                  * last instruction" constraint */
1692 
1693                 int some_instruction = __bitset_ffs(st.worklist, BITSET_WORDS(st.count));
1694                 bool not_last = (some_instruction > 0) &&
1695                         bi_must_not_last(st.instructions[some_instruction - 1]);
1696 
1697                 bool insert_empty = tuple_state.last && not_last;
1698 
1699                 if (!(tuple->fma || tuple->add || insert_empty))
1700                         break;
1701 
1702                 clause->tuple_count++;
1703 
1704                 /* Adding enough tuples might overflow the constant slots */
1705                 if (!bi_space_for_more_constants(&clause_state))
1706                         break;
1707 
1708 #ifndef NDEBUG
1709                 /* Don't schedule more than 1 tuple if debugging */
1710                 if ((bifrost_debug & BIFROST_DBG_NOSCHED) && !insert_empty)
1711                         break;
1712 #endif
1713 
1714                 /* Link through the register state */
1715                 STATIC_ASSERT(sizeof(prev_reads) == sizeof(tuple_state.reg.reads));
1716                 memcpy(prev_reads, tuple_state.reg.reads, sizeof(prev_reads));
1717                 nr_prev_reads = tuple_state.reg.nr_reads;
1718                 clause_state.tuple_count++;
1719         } while(clause->tuple_count < 8);
1720 
1721         /* Don't schedule an empty clause */
1722         if (!clause->tuple_count)
1723                 return NULL;
1724 
1725         /* Before merging, rewrite away any tuples that read only zero */
1726         for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
1727                 bi_tuple *tuple = &clause->tuples[i];
1728                 struct bi_const_state *st = &clause_state.consts[i];
1729 
1730                 if (st->constant_count == 0 || st->constants[0] || st->constants[1] || st->pcrel)
1731                         continue;
1732 
1733                 bi_foreach_instr_in_tuple(tuple, ins)
1734                         bi_rewrite_zero(ins, false);
1735 
1736                 /* Constant has been demoted to FAU, so don't pack it separately */
1737                 st->constant_count = 0;
1738 
1739                 /* Default */
1740                 assert(tuple->fau_idx == BIR_FAU_ZERO);
1741         }
1742 
1743         uint64_t constant_pairs[8] = { 0 };
1744         unsigned pcrel_idx = ~0;
1745         unsigned constant_words =
1746                 bi_merge_constants(clause_state.consts, constant_pairs, &pcrel_idx);
1747 
1748         constant_words = bi_apply_constant_modifiers(clause_state.consts,
1749                         constant_pairs, &pcrel_idx, clause->tuple_count,
1750                         constant_words);
1751 
1752         clause->pcrel_idx = pcrel_idx;
1753 
1754         for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
1755                 bi_tuple *tuple = &clause->tuples[i];
1756 
1757                 /* If no constants, leave FAU as it is, possibly defaulting to 0 */
1758                 if (clause_state.consts[i].constant_count == 0)
1759                         continue;
1760 
1761                 /* FAU is already handled */
1762                 assert(!tuple->fau_idx);
1763 
1764                 unsigned word_idx = clause_state.consts[i].word_idx;
1765                 assert(word_idx <= 8);
1766 
1767                 /* We could try to merge regardless of bottom bits as well, but
1768                  * that's probably diminishing returns */
1769                 uint64_t pair = constant_pairs[word_idx];
1770                 unsigned lo = pair & 0xF;
1771 
1772                 tuple->fau_idx = bi_constant_field(word_idx) | lo;
1773                 bi_rewrite_constants_to_pass(tuple, pair, word_idx == pcrel_idx);
1774         }
1775 
1776         clause->constant_count = constant_words;
1777         memcpy(clause->constants, constant_pairs, sizeof(constant_pairs));
1778 
1779         /* Branches must be last, so this can be factored out */
1780         bi_instr *last = clause->tuples[max_tuples - 1].add;
1781         clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
1782         clause->block = block;
1783 
1784         /* TODO: scoreboard assignment post-sched */
1785         clause->dependencies |= (1 << 0);
1786 
1787         /* We scheduled in reverse, emitting into the back of the tuples array,
1788          * so move the tuples up front for easy indexing */
1789         memmove(clause->tuples,
1790                        clause->tuples + (max_tuples - clause->tuple_count),
1791                        clause->tuple_count * sizeof(clause->tuples[0]));
1792 
1793         /* Use passthrough register for cross-tuple accesses. Note this is
1794          * after the memmove, so this is forwards. Skip the first tuple since
1795          * there is nothing before it to passthrough */
1796 
1797         for (unsigned t = 1; t < clause->tuple_count; ++t)
1798                 bi_rewrite_passthrough(clause->tuples[t - 1], clause->tuples[t]);
1799 
1800         return clause;
1801 }
1802 
1803 static void
1804 bi_schedule_block(bi_context *ctx, bi_block *block)
1805 {
1806         list_inithead(&block->clauses);
1807 
1808         /* Copy list to dynamic array */
1809         struct bi_worklist st = bi_initialize_worklist(block,
1810                         bifrost_debug & BIFROST_DBG_INORDER,
1811                         ctx->inputs->is_blend);
1812 
1813         if (!st.count) {
1814                 bi_free_worklist(st);
1815                 return;
1816         }
1817 
1818         /* We need to track liveness during scheduling in order to determine whether we can use temporary (passthrough) registers */
1819         uint64_t live = block->reg_live_out;
1820 
1821         /* Schedule as many clauses as needed to fill the block */
1822         bi_clause *u = NULL;
1823         while((u = bi_schedule_clause(ctx, block, st, &live)))
1824                 list_add(&u->link, &block->clauses);
1825 
1826         /* Back-to-back bit affects only the last clause of a block,
1827          * the rest are implicitly true */
1828         if (!list_is_empty(&block->clauses)) {
1829                 bi_clause *last_clause = list_last_entry(&block->clauses, bi_clause, link);
1830                 if (bi_reconverge_branches(block))
1831                         last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL;
1832         }
1833 
1834         /* Reorder instructions to match the new schedule. First remove
1835          * existing instructions and then recreate the list */
1836 
1837         bi_foreach_instr_in_block_safe(block, ins) {
1838                 list_del(&ins->link);
1839         }
1840 
1841         bi_foreach_clause_in_block(block, clause) {
1842                 for (unsigned i = 0; i < clause->tuple_count; ++i)  {
1843                         bi_foreach_instr_in_tuple(&clause->tuples[i], ins) {
1844                                 list_addtail(&ins->link, &block->instructions);
1845                         }
1846                 }
1847         }
1848 
1849         block->scheduled = true;
1850 
1851 #ifndef NDEBUG
1852         unsigned i;
1853         bool incomplete = false;
1854 
1855         BITSET_FOREACH_SET(i, st.worklist, st.count) {
1856                 bi_print_instr(st.instructions[i], stderr);
1857                 incomplete = true;
1858         }
1859 
1860         if (incomplete)
1861                 unreachable("The above instructions failed to schedule.");
1862 #endif
1863 
1864         bi_free_worklist(st);
1865 }
1866 
1867 static bool
1868 bi_check_fau_src(bi_instr *ins, unsigned s, uint32_t *constants, unsigned *cwords, bi_index *fau)
1869 {
1870         bi_index src = ins->src[s];
1871 
1872         /* Staging registers can't have FAU accesses */
1873         if (s == 0 && bi_opcode_props[ins->op].sr_read)
1874                 return (src.type != BI_INDEX_CONSTANT) && (src.type != BI_INDEX_FAU);
1875 
1876         if (src.type == BI_INDEX_CONSTANT) {
1877                 /* Allow fast zero */
1878                 if (src.value == 0 && bi_opcode_props[ins->op].fma && bi_reads_zero(ins))
1879                         return true;
1880 
1881                 if (!bi_is_null(*fau))
1882                         return false;
1883 
1884                 /* Else, try to inline a constant */
1885                 for (unsigned i = 0; i < *cwords; ++i) {
1886                         if (src.value == constants[i])
1887                                 return true;
1888                 }
1889 
1890                 if (*cwords >= 2)
1891                         return false;
1892 
1893                 constants[(*cwords)++] = src.value;
1894         } else if (src.type == BI_INDEX_FAU) {
1895                 if (*cwords != 0)
1896                         return false;
1897 
1898                 /* Can only read from one pair of FAU words */
1899                 if (!bi_is_null(*fau) && (src.value != fau->value))
1900                         return false;
1901 
1902                 /* If there is a target, we'll need a PC-relative constant */
1903                 if (ins->branch_target)
1904                         return false;
1905 
1906                 *fau = src;
1907         }
1908 
1909         return true;
1910 }
1911 
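/* Lower sources so each instruction satisfies the limits checked above: it may
 * read at most two 32-bit constant words, or read a single FAU slot, but not
 * both. For example (hypothetical), an FMA reading three distinct constants
 * keeps two of them inline and receives the third through a MOV.i32 inserted
 * just before it. */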
1912 void
1913 bi_lower_fau(bi_context *ctx)
1914 {
1915         bi_foreach_instr_global_safe(ctx, ins) {
1916                 bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));
1917 
1918                 uint32_t constants[2];
1919                 unsigned cwords = 0;
1920                 bi_index fau = bi_null();
1921 
1922                 /* ATEST must have the ATEST datum encoded, not any other
1923                  * uniform. See to it this is the case. */
1924                 if (ins->op == BI_OPCODE_ATEST)
1925                         fau = ins->src[2];
1926 
1927                 bi_foreach_src(ins, s) {
1928                         if (bi_check_fau_src(ins, s, constants, &cwords, &fau)) continue;
1929 
1930                         bi_index copy = bi_mov_i32(&b, ins->src[s]);
1931                         ins->src[s] = bi_replace_index(ins->src[s], copy);
1932                 }
1933         }
1934 }
1935 
1936 /* Only v7 allows specifying a dependency on the tilebuffer for the first
1937  * clause of a shader. v6 requires adding a NOP clause with the dependency. */
1938 
1939 static void
1940 bi_add_nop_for_atest(bi_context *ctx)
1941 {
1942         /* Only needed on v6 */
1943         if (ctx->arch >= 7)
1944                 return;
1945 
1946         if (list_is_empty(&ctx->blocks))
1947                 return;
1948 
1949         /* Fetch the first clause of the shader */
1950         bi_block *block = list_first_entry(&ctx->blocks, bi_block, link);
1951         bi_clause *clause = bi_next_clause(ctx, block, NULL);
1952 
1953         if (!clause || !(clause->dependencies & ((1 << BIFROST_SLOT_ELDEST_DEPTH) |
1954                                                  (1 << BIFROST_SLOT_ELDEST_COLOUR))))
1955                 return;
1956 
1957         /* Add a NOP so we can wait for the dependencies required by the first
1958          * clause */
1959 
1960         bi_instr *I = rzalloc(ctx, bi_instr);
1961         I->op = BI_OPCODE_NOP;
1962         I->dest[0] = bi_null();
1963 
1964         bi_clause *new_clause = ralloc(ctx, bi_clause);
1965         *new_clause = (bi_clause) {
1966                 .flow_control = BIFROST_FLOW_NBTB,
1967                 .next_clause_prefetch = true,
1968                 .block = clause->block,
1969 
1970                 .tuple_count = 1,
1971                 .tuples[0] = { .fma = I, },
1972         };
1973 
1974         list_add(&new_clause->link, &clause->block->clauses);
1975 }
1976 
1977 void
1978 bi_schedule(bi_context *ctx)
1979 {
1980         /* Fed into both scheduling and DCE */
1981         bi_postra_liveness(ctx);
1982 
1983         bi_foreach_block(ctx, block) {
1984                 bi_schedule_block(ctx, block);
1985         }
1986 
1987         bi_opt_dce_post_ra(ctx);
1988         bi_add_nop_for_atest(ctx);
1989 }
1990