1 /*
2  * Copyright (C) 2020 Collabora, Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "compiler.h"
25 
26 /* This file contains the final passes of the compiler. Running after
27  * scheduling and RA, the IR is now finalized, so we need to emit it to actual
28  * bits on the wire (as well as fixup branches) */
29 
30 static uint64_t
bi_pack_header(bi_clause * clause,bi_clause * next_1,bi_clause * next_2)31 bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2)
32 {
33         /* next_dependencies are the union of the dependencies of successors'
34          * dependencies */
35 
36         unsigned dependency_wait = next_1 ? next_1->dependencies : 0;
37         dependency_wait |= next_2 ? next_2->dependencies : 0;
38 
39         bool staging_barrier = next_1 ? next_1->staging_barrier : false;
40         staging_barrier |= next_2 ? next_2->staging_barrier : 0;
41 
42         struct bifrost_header header = {
43                 .flow_control =
44                         (next_1 == NULL && next_2 == NULL) ?
45                         BIFROST_FLOW_END :  clause->flow_control,
46                 .terminate_discarded_threads = clause->td,
47                 .next_clause_prefetch = clause->next_clause_prefetch && next_1,
48                 .staging_barrier = staging_barrier,
49                 .staging_register = clause->staging_register,
50                 .dependency_wait = dependency_wait,
51                 .dependency_slot = clause->scoreboard_id,
52                 .message_type = clause->message_type,
53                 .next_message_type = next_1 ? next_1->message_type : 0,
54         };
55 
56         uint64_t u = 0;
57         memcpy(&u, &header, sizeof(header));
58         return u;
59 }
60 
61 /* Assigns a slot for reading, before anything is written */
62 
63 static void
bi_assign_slot_read(bi_registers * regs,bi_index src)64 bi_assign_slot_read(bi_registers *regs, bi_index src)
65 {
66         /* We only assign for registers */
67         if (src.type != BI_INDEX_REGISTER)
68                 return;
69 
70         /* Check if we already assigned the slot */
71         for (unsigned i = 0; i <= 1; ++i) {
72                 if (regs->slot[i] == src.value && regs->enabled[i])
73                         return;
74         }
75 
76         if (regs->slot[2] == src.value && regs->slot23.slot2 == BIFROST_OP_READ)
77                 return;
78 
79         /* Assign it now */
80 
81         for (unsigned i = 0; i <= 1; ++i) {
82                 if (!regs->enabled[i]) {
83                         regs->slot[i] = src.value;
84                         regs->enabled[i] = true;
85                         return;
86                 }
87         }
88 
89         if (!regs->slot23.slot3) {
90                 regs->slot[2] = src.value;
91                 regs->slot23.slot2 = BIFROST_OP_READ;
92                 return;
93         }
94 
95         bi_print_slots(regs, stderr);
96         unreachable("Failed to find a free slot for src");
97 }
98 
99 static bi_registers
bi_assign_slots(bi_tuple * now,bi_tuple * prev)100 bi_assign_slots(bi_tuple *now, bi_tuple *prev)
101 {
102         /* We assign slots for the main register mechanism. Special ops
103          * use the data registers, which has its own mechanism entirely
104          * and thus gets skipped over here. */
105 
106         bool read_dreg = now->add && bi_opcode_props[now->add->op].sr_read;
107         bool write_dreg = prev->add && bi_opcode_props[prev->add->op].sr_write;
108 
109         /* First, assign reads */
110 
111         if (now->fma)
112                 bi_foreach_src(now->fma, src)
113                         bi_assign_slot_read(&now->regs, (now->fma)->src[src]);
114 
115         if (now->add) {
116                 bi_foreach_src(now->add, src) {
117                         if (!(src == 0 && read_dreg))
118                                 bi_assign_slot_read(&now->regs, (now->add)->src[src]);
119                 }
120         }
121 
122         /* Next, assign writes. Staging writes are assigned separately, but
123          * +ATEST wants its destination written to both a staging register
124          * _and_ a regular write, because it may not generate a message */
125 
126         if (prev->add && (!write_dreg || prev->add->op == BI_OPCODE_ATEST)) {
127                 bi_index idx = prev->add->dest[0];
128 
129                 if (idx.type == BI_INDEX_REGISTER) {
130                         now->regs.slot[3] = idx.value;
131                         now->regs.slot23.slot3 = BIFROST_OP_WRITE;
132                 }
133         }
134 
135         if (prev->fma) {
136                 bi_index idx = (prev->fma)->dest[0];
137 
138                 if (idx.type == BI_INDEX_REGISTER) {
139                         if (now->regs.slot23.slot3) {
140                                 /* Scheduler constraint: cannot read 3 and write 2 */
141                                 assert(!now->regs.slot23.slot2);
142                                 now->regs.slot[2] = idx.value;
143                                 now->regs.slot23.slot2 = BIFROST_OP_WRITE;
144                         } else {
145                                 now->regs.slot[3] = idx.value;
146                                 now->regs.slot23.slot3 = BIFROST_OP_WRITE;
147                                 now->regs.slot23.slot3_fma = true;
148                         }
149                 }
150         }
151 
152         return now->regs;
153 }
154 
155 static enum bifrost_reg_mode
bi_pack_register_mode(bi_registers r)156 bi_pack_register_mode(bi_registers r)
157 {
158         /* Handle idle as a special case */
159         if (!(r.slot23.slot2 | r.slot23.slot3))
160                 return r.first_instruction ? BIFROST_IDLE_1 : BIFROST_IDLE;
161 
162         /* Otherwise, use the LUT */
163         for (unsigned i = 0; i < ARRAY_SIZE(bifrost_reg_ctrl_lut); ++i) {
164                 if (memcmp(bifrost_reg_ctrl_lut + i, &r.slot23, sizeof(r.slot23)) == 0)
165                         return i;
166         }
167 
168         bi_print_slots(&r, stderr);
169         unreachable("Invalid slot assignment");
170 }
171 
172 static uint64_t
bi_pack_registers(bi_registers regs)173 bi_pack_registers(bi_registers regs)
174 {
175         enum bifrost_reg_mode mode = bi_pack_register_mode(regs);
176         struct bifrost_regs s = { 0 };
177         uint64_t packed = 0;
178 
179         /* Need to pack 5-bit mode as a 4-bit field. The decoder moves bit 3 to bit 4 for
180          * first instruction and adds 16 when reg 2 == reg 3 */
181 
182         unsigned ctrl;
183         bool r2_equals_r3 = false;
184 
185         if (regs.first_instruction) {
186                 /* Bit 3 implicitly must be clear for first instructions.
187                  * The affected patterns all write both ADD/FMA, but that
188                  * is forbidden for the last instruction (whose writes are
189                  * encoded by the first), so this does not add additional
190                  * encoding constraints */
191                 assert(!(mode & 0x8));
192 
193                 /* Move bit 4 to bit 3, since bit 3 is clear */
194                 ctrl = (mode & 0x7) | ((mode & 0x10) >> 1);
195 
196                 /* If we can let r2 equal r3, we have to or the hardware raises
197                  * INSTR_INVALID_ENC (it's unclear why). */
198                 if (!(regs.slot23.slot2 && regs.slot23.slot3))
199                         r2_equals_r3 = true;
200         } else {
201                 /* We force r2=r3 or not for the upper bit */
202                 ctrl = (mode & 0xF);
203                 r2_equals_r3 = (mode & 0x10);
204         }
205 
206         if (regs.enabled[1]) {
207                 /* Gotta save that bit!~ Required by the 63-x trick */
208                 assert(regs.slot[1] > regs.slot[0]);
209                 assert(regs.enabled[0]);
210 
211                 /* Do the 63-x trick, see docs/disasm */
212                 if (regs.slot[0] > 31) {
213                         regs.slot[0] = 63 - regs.slot[0];
214                         regs.slot[1] = 63 - regs.slot[1];
215                 }
216 
217                 assert(regs.slot[0] <= 31);
218                 assert(regs.slot[1] <= 63);
219 
220                 s.ctrl = ctrl;
221                 s.reg1 = regs.slot[1];
222                 s.reg0 = regs.slot[0];
223         } else {
224                 /* slot 1 disabled, so set to zero and use slot 1 for ctrl */
225                 s.ctrl = 0;
226                 s.reg1 = ctrl << 2;
227 
228                 if (regs.enabled[0]) {
229                         /* Bit 0 upper bit of slot 0 */
230                         s.reg1 |= (regs.slot[0] >> 5);
231 
232                         /* Rest of slot 0 in usual spot */
233                         s.reg0 = (regs.slot[0] & 0b11111);
234                 } else {
235                         /* Bit 1 set if slot 0 also disabled */
236                         s.reg1 |= (1 << 1);
237                 }
238         }
239 
240         /* Force r2 =/!= r3 as needed */
241         if (r2_equals_r3) {
242                 assert(regs.slot[3] == regs.slot[2] || !(regs.slot23.slot2 && regs.slot23.slot3));
243 
244                 if (regs.slot23.slot2)
245                         regs.slot[3] = regs.slot[2];
246                 else
247                         regs.slot[2] = regs.slot[3];
248         } else if (!regs.first_instruction) {
249                 /* Enforced by the encoding anyway */
250                 assert(regs.slot[2] != regs.slot[3]);
251         }
252 
253         s.reg2 = regs.slot[2];
254         s.reg3 = regs.slot[3];
255         s.fau_idx = regs.fau_idx;
256 
257         memcpy(&packed, &s, sizeof(s));
258         return packed;
259 }
260 
261 /* We must ensure slot 1 > slot 0 for the 63-x trick to function, so we fix
262  * this up at pack time. (Scheduling doesn't care.) */
263 
264 static void
bi_flip_slots(bi_registers * regs)265 bi_flip_slots(bi_registers *regs)
266 {
267         if (regs->enabled[0] && regs->enabled[1] && regs->slot[1] < regs->slot[0]) {
268                 unsigned temp = regs->slot[0];
269                 regs->slot[0] = regs->slot[1];
270                 regs->slot[1] = temp;
271         }
272 
273 }
274 
275 static inline enum bifrost_packed_src
bi_get_src_slot(bi_registers * regs,unsigned reg)276 bi_get_src_slot(bi_registers *regs, unsigned reg)
277 {
278         if (regs->slot[0] == reg && regs->enabled[0])
279                 return BIFROST_SRC_PORT0;
280         else if (regs->slot[1] == reg && regs->enabled[1])
281                 return BIFROST_SRC_PORT1;
282         else if (regs->slot[2] == reg && regs->slot23.slot2 == BIFROST_OP_READ)
283                 return BIFROST_SRC_PORT2;
284         else
285                 unreachable("Tried to access register with no port");
286 }
287 
288 static inline enum bifrost_packed_src
bi_get_src_new(bi_instr * ins,bi_registers * regs,unsigned s)289 bi_get_src_new(bi_instr *ins, bi_registers *regs, unsigned s)
290 {
291         if (!ins)
292                 return 0;
293 
294         bi_index src = ins->src[s];
295 
296         if (src.type == BI_INDEX_REGISTER)
297                 return bi_get_src_slot(regs, src.value);
298         else if (src.type == BI_INDEX_PASS)
299                 return src.value;
300         else if (bi_is_null(src) && ins->op == BI_OPCODE_ZS_EMIT && s < 2)
301                 return BIFROST_SRC_STAGE;
302         else {
303                 /* TODO make safer */
304                 return BIFROST_SRC_STAGE;
305         }
306 }
307 
308 static struct bi_packed_tuple
bi_pack_tuple(bi_clause * clause,bi_tuple * tuple,bi_tuple * prev,bool first_tuple,gl_shader_stage stage)309 bi_pack_tuple(bi_clause *clause, bi_tuple *tuple, bi_tuple *prev, bool first_tuple, gl_shader_stage stage)
310 {
311         bi_assign_slots(tuple, prev);
312         tuple->regs.fau_idx = tuple->fau_idx;
313         tuple->regs.first_instruction = first_tuple;
314 
315         bi_flip_slots(&tuple->regs);
316 
317         bool sr_read = tuple->add &&
318                 bi_opcode_props[(tuple->add)->op].sr_read;
319 
320         uint64_t reg = bi_pack_registers(tuple->regs);
321         uint64_t fma = bi_pack_fma(tuple->fma,
322                         bi_get_src_new(tuple->fma, &tuple->regs, 0),
323                         bi_get_src_new(tuple->fma, &tuple->regs, 1),
324                         bi_get_src_new(tuple->fma, &tuple->regs, 2),
325                         bi_get_src_new(tuple->fma, &tuple->regs, 3));
326 
327         uint64_t add = bi_pack_add(tuple->add,
328                         bi_get_src_new(tuple->add, &tuple->regs, sr_read + 0),
329                         bi_get_src_new(tuple->add, &tuple->regs, sr_read + 1),
330                         bi_get_src_new(tuple->add, &tuple->regs, sr_read + 2),
331                         0);
332 
333         if (tuple->add) {
334                 bi_instr *add = tuple->add;
335 
336                 bool sr_write = bi_opcode_props[add->op].sr_write &&
337                         !bi_is_null(add->dest[0]);
338 
339                 if (sr_read && !bi_is_null(add->src[0])) {
340                         assert(add->src[0].type == BI_INDEX_REGISTER);
341                         clause->staging_register = add->src[0].value;
342 
343                         if (sr_write)
344                                 assert(bi_is_equiv(add->src[0], add->dest[0]));
345                 } else if (sr_write) {
346                         assert(add->dest[0].type == BI_INDEX_REGISTER);
347                         clause->staging_register = add->dest[0].value;
348                 }
349         }
350 
351         struct bi_packed_tuple packed = {
352                 .lo = reg | (fma << 35) | ((add & 0b111111) << 58),
353                 .hi = add >> 6
354         };
355 
356         return packed;
357 }
358 
359 /* A block contains at most one PC-relative constant, from a terminal branch.
360  * Find the last instruction and if it is a relative branch, fix up the
361  * PC-relative constant to contain the absolute offset. This occurs at pack
362  * time instead of schedule time because the number of quadwords between each
363  * block is not known until after all other passes have finished.
364  */
365 
366 static void
bi_assign_branch_offset(bi_context * ctx,bi_block * block)367 bi_assign_branch_offset(bi_context *ctx, bi_block *block)
368 {
369         if (list_is_empty(&block->clauses))
370                 return;
371 
372         bi_clause *clause = list_last_entry(&block->clauses, bi_clause, link);
373         bi_instr *br = bi_last_instr_in_clause(clause);
374 
375         if (!br->branch_target)
376                 return;
377 
378         /* Put it in the high place */
379         int32_t qwords = bi_block_offset(ctx, clause, br->branch_target);
380         int32_t bytes = qwords * 16;
381 
382         /* Copy so we can toy with the sign without undefined behaviour */
383         uint32_t raw = 0;
384         memcpy(&raw, &bytes, sizeof(raw));
385 
386         /* Clear off top bits for A1/B1 bits */
387         raw &= ~0xF0000000;
388 
389         /* Put in top 32-bits */
390         assert(clause->pcrel_idx < 8);
391         clause->constants[clause->pcrel_idx] |= ((uint64_t) raw) << 32ull;
392 }
393 
394 static void
bi_pack_constants(unsigned tuple_count,uint64_t * constants,unsigned word_idx,unsigned constant_words,bool ec0_packed,struct util_dynarray * emission)395 bi_pack_constants(unsigned tuple_count, uint64_t *constants,
396                 unsigned word_idx, unsigned constant_words, bool ec0_packed,
397                 struct util_dynarray *emission)
398 {
399         unsigned index = (word_idx << 1) + ec0_packed;
400 
401         /* Do more constants follow */
402         bool more = (word_idx + 1) < constant_words;
403 
404         /* Indexed first by tuple count and second by constant word number,
405          * indicates the position in the clause */
406         unsigned pos_lookup[8][3] = {
407                 { 0 },
408                 { 1 },
409                 { 3 },
410                 { 2, 5 },
411                 { 4, 8 },
412                 { 7, 11, 14 },
413                 { 6, 10, 13 },
414                 { 9, 12 }
415         };
416 
417         /* Compute the pos, and check everything is reasonable */
418         assert((tuple_count - 1) < 8);
419         assert(word_idx < 3);
420         unsigned pos = pos_lookup[tuple_count - 1][word_idx];
421         assert(pos != 0 || (tuple_count == 1 && word_idx == 0));
422 
423         struct bifrost_fmt_constant quad = {
424                 .pos = pos,
425                 .tag = more ? BIFROST_FMTC_CONSTANTS : BIFROST_FMTC_FINAL,
426                 .imm_1 = constants[index + 0] >> 4,
427                 .imm_2 = constants[index + 1] >> 4,
428         };
429 
430         util_dynarray_append(emission, struct bifrost_fmt_constant, quad);
431 }
432 
433 uint8_t
bi_pack_literal(enum bi_clause_subword literal)434 bi_pack_literal(enum bi_clause_subword literal)
435 {
436         assert(literal >= BI_CLAUSE_SUBWORD_LITERAL_0);
437         assert(literal <= BI_CLAUSE_SUBWORD_LITERAL_7);
438 
439         return (literal - BI_CLAUSE_SUBWORD_LITERAL_0);
440 }
441 
442 static inline uint8_t
bi_clause_upper(unsigned val,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count)443 bi_clause_upper(unsigned val,
444                 struct bi_packed_tuple *tuples,
445                 ASSERTED unsigned tuple_count)
446 {
447         assert(val < tuple_count);
448 
449         /* top 3-bits of 78-bits is tuple >> 75 == (tuple >> 64) >> 11 */
450         struct bi_packed_tuple tuple = tuples[val];
451         return (tuple.hi >> 11);
452 }
453 
454 uint8_t
bi_pack_upper(enum bi_clause_subword upper,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count)455 bi_pack_upper(enum bi_clause_subword upper,
456                 struct bi_packed_tuple *tuples,
457                 ASSERTED unsigned tuple_count)
458 {
459         assert(upper >= BI_CLAUSE_SUBWORD_UPPER_0);
460         assert(upper <= BI_CLAUSE_SUBWORD_UPPER_7);
461 
462         return bi_clause_upper(upper - BI_CLAUSE_SUBWORD_UPPER_0, tuples,
463                         tuple_count);
464 }
465 
466 uint64_t
bi_pack_tuple_bits(enum bi_clause_subword idx,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count,unsigned offset,unsigned nbits)467 bi_pack_tuple_bits(enum bi_clause_subword idx,
468                 struct bi_packed_tuple *tuples,
469                 ASSERTED unsigned tuple_count,
470                 unsigned offset, unsigned nbits)
471 {
472         assert(idx >= BI_CLAUSE_SUBWORD_TUPLE_0);
473         assert(idx <= BI_CLAUSE_SUBWORD_TUPLE_7);
474 
475         unsigned val = (idx - BI_CLAUSE_SUBWORD_TUPLE_0);
476         assert(val < tuple_count);
477 
478         struct bi_packed_tuple tuple = tuples[val];
479 
480         assert(offset + nbits < 78);
481         assert(nbits <= 64);
482 
483         /* (X >> start) & m
484          * = (((hi << 64) | lo) >> start) & m
485          * = (((hi << 64) >> start) | (lo >> start)) & m
486          * = { ((hi << (64 - start)) | (lo >> start)) & m if start <= 64
487          *   { ((hi >> (start - 64)) | (lo >> start)) & m if start >= 64
488          * = { ((hi << (64 - start)) & m) | ((lo >> start) & m) if start <= 64
489          *   { ((hi >> (start - 64)) & m) | ((lo >> start) & m) if start >= 64
490          *
491          * By setting m = 2^64 - 1, we justify doing the respective shifts as
492          * 64-bit integers. Zero special cased to avoid undefined behaviour.
493          */
494 
495         uint64_t lo = (tuple.lo >> offset);
496         uint64_t hi = (offset == 0) ? 0
497                 : (offset > 64) ? (tuple.hi >> (offset - 64))
498                 : (tuple.hi << (64 - offset));
499 
500         return (lo | hi) & ((1ULL << nbits) - 1);
501 }
502 
503 static inline uint16_t
bi_pack_lu(enum bi_clause_subword word,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count)504 bi_pack_lu(enum bi_clause_subword word,
505                 struct bi_packed_tuple *tuples,
506                 ASSERTED unsigned tuple_count)
507 {
508         return (word >= BI_CLAUSE_SUBWORD_UPPER_0) ?
509                 bi_pack_upper(word, tuples, tuple_count) :
510                 bi_pack_literal(word);
511 }
512 
513 uint8_t
bi_pack_sync(enum bi_clause_subword t1,enum bi_clause_subword t2,enum bi_clause_subword t3,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count,bool z)514 bi_pack_sync(enum bi_clause_subword t1,
515              enum bi_clause_subword t2,
516              enum bi_clause_subword t3,
517              struct bi_packed_tuple *tuples,
518              ASSERTED unsigned tuple_count,
519              bool z)
520 {
521         uint8_t sync =
522                 (bi_pack_lu(t3, tuples, tuple_count) << 0) |
523                 (bi_pack_lu(t2, tuples, tuple_count) << 3);
524 
525         if (t1 == BI_CLAUSE_SUBWORD_Z)
526                 sync |= z << 6;
527         else
528                 sync |= bi_pack_literal(t1) << 6;
529 
530         return sync;
531 }
532 
533 static inline uint64_t
bi_pack_t_ec(enum bi_clause_subword word,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count,uint64_t ec0)534 bi_pack_t_ec(enum bi_clause_subword word,
535                 struct bi_packed_tuple *tuples,
536                 ASSERTED unsigned tuple_count,
537                 uint64_t ec0)
538 {
539         if (word == BI_CLAUSE_SUBWORD_CONSTANT)
540                 return ec0;
541         else
542                 return bi_pack_tuple_bits(word, tuples, tuple_count, 0, 60);
543 }
544 
545 static uint32_t
bi_pack_subwords_56(enum bi_clause_subword t,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count,uint64_t header,uint64_t ec0,unsigned tuple_subword)546 bi_pack_subwords_56(enum bi_clause_subword t,
547                 struct bi_packed_tuple *tuples,
548                 ASSERTED unsigned tuple_count,
549                 uint64_t header, uint64_t ec0,
550                 unsigned tuple_subword)
551 {
552         switch (t) {
553         case BI_CLAUSE_SUBWORD_HEADER:
554                 return (header & ((1 << 30) - 1));
555         case BI_CLAUSE_SUBWORD_RESERVED:
556                 return 0;
557         case BI_CLAUSE_SUBWORD_CONSTANT:
558                 return (ec0 >> 15) & ((1 << 30) - 1);
559         default:
560                 return bi_pack_tuple_bits(t, tuples, tuple_count, tuple_subword * 15, 30);
561         }
562 }
563 
564 static uint16_t
bi_pack_subword(enum bi_clause_subword t,unsigned format,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count,uint64_t header,uint64_t ec0,unsigned m0,unsigned tuple_subword)565 bi_pack_subword(enum bi_clause_subword t, unsigned format,
566                 struct bi_packed_tuple *tuples,
567                 ASSERTED unsigned tuple_count,
568                 uint64_t header, uint64_t ec0, unsigned m0,
569                 unsigned tuple_subword)
570 {
571         switch (t) {
572         case BI_CLAUSE_SUBWORD_HEADER:
573                 return header >> 30;
574         case BI_CLAUSE_SUBWORD_M:
575                 return m0;
576         case BI_CLAUSE_SUBWORD_CONSTANT:
577                 return (format == 5 || format == 10) ?
578                         (ec0 & ((1 << 15) - 1)) :
579                         (ec0 >> (15 + 30));
580         case BI_CLAUSE_SUBWORD_UPPER_23:
581                 return (bi_clause_upper(2, tuples, tuple_count) << 12) |
582                         (bi_clause_upper(3, tuples, tuple_count) << 9);
583         case BI_CLAUSE_SUBWORD_UPPER_56:
584                 return (bi_clause_upper(5, tuples, tuple_count) << 12) |
585                         (bi_clause_upper(6, tuples, tuple_count) << 9);
586         case BI_CLAUSE_SUBWORD_UPPER_0 ... BI_CLAUSE_SUBWORD_UPPER_7:
587                 return bi_pack_upper(t, tuples, tuple_count) << 12;
588         default:
589                 return bi_pack_tuple_bits(t, tuples, tuple_count, tuple_subword * 15, 15);
590         }
591 }
592 
593 /* EC0 is 60-bits (bottom 4 already shifted off) */
594 void
bi_pack_format(struct util_dynarray * emission,unsigned index,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count,uint64_t header,uint64_t ec0,unsigned m0,bool z)595 bi_pack_format(struct util_dynarray *emission,
596                 unsigned index,
597                 struct bi_packed_tuple *tuples,
598                 ASSERTED unsigned tuple_count,
599                 uint64_t header, uint64_t ec0,
600                 unsigned m0, bool z)
601 {
602         struct bi_clause_format format = bi_clause_formats[index];
603 
604         uint8_t sync = bi_pack_sync(format.tag_1, format.tag_2, format.tag_3,
605                         tuples, tuple_count, z);
606 
607         uint64_t s0_s3 = bi_pack_t_ec(format.s0_s3, tuples, tuple_count, ec0);
608 
609         uint16_t s4 = bi_pack_subword(format.s4, format.format, tuples, tuple_count, header, ec0, m0, 4);
610 
611         uint32_t s5_s6 = bi_pack_subwords_56(format.s5_s6,
612                         tuples, tuple_count, header, ec0,
613                         (format.format == 2 || format.format == 7) ? 0 : 3);
614 
615         uint64_t s7 = bi_pack_subword(format.s7, format.format, tuples, tuple_count, header, ec0, m0, 2);
616 
617         /* Now that subwords are packed, split into 64-bit halves and emit */
618         uint64_t lo = sync | ((s0_s3 & ((1ull << 56) - 1)) << 8);
619         uint64_t hi = (s0_s3 >> 56) | ((uint64_t) s4 << 4) | ((uint64_t) s5_s6 << 19) | ((uint64_t) s7 << 49);
620 
621         util_dynarray_append(emission, uint64_t, lo);
622         util_dynarray_append(emission, uint64_t, hi);
623 }
624 
625 static void
bi_pack_clause(bi_context * ctx,bi_clause * clause,bi_clause * next_1,bi_clause * next_2,struct util_dynarray * emission,gl_shader_stage stage)626 bi_pack_clause(bi_context *ctx, bi_clause *clause,
627                 bi_clause *next_1, bi_clause *next_2,
628                 struct util_dynarray *emission, gl_shader_stage stage)
629 {
630         struct bi_packed_tuple ins[8] = { 0 };
631 
632         for (unsigned i = 0; i < clause->tuple_count; ++i) {
633                 unsigned prev = ((i == 0) ? clause->tuple_count : i) - 1;
634                 ins[i] = bi_pack_tuple(clause, &clause->tuples[i],
635                                 &clause->tuples[prev], i == 0, stage);
636         }
637 
638         bool ec0_packed = bi_ec0_packed(clause->tuple_count);
639 
640         if (ec0_packed)
641                 clause->constant_count = MAX2(clause->constant_count, 1);
642 
643         unsigned constant_quads =
644                 DIV_ROUND_UP(clause->constant_count - (ec0_packed ? 1 : 0), 2);
645 
646         uint64_t header = bi_pack_header(clause, next_1, next_2);
647         uint64_t ec0 = (clause->constants[0] >> 4);
648         unsigned m0 = (clause->pcrel_idx == 0) ? 4 : 0;
649 
650         unsigned counts[8] = {
651                 1, 2, 3, 3, 4, 5, 5, 6
652         };
653 
654         unsigned indices[8][6] = {
655                 { 1 },
656                 { 0, 2 },
657                 { 0, 3, 4 },
658                 { 0, 3, 6 },
659                 { 0, 3, 7, 8 },
660                 { 0, 3, 5, 9, 10 },
661                 { 0, 3, 5, 9, 11 },
662                 { 0, 3, 5, 9, 12, 13 },
663         };
664 
665         unsigned count = counts[clause->tuple_count - 1];
666 
667         for (unsigned pos = 0; pos < count; ++pos) {
668                 ASSERTED unsigned idx = indices[clause->tuple_count - 1][pos];
669                 assert(bi_clause_formats[idx].pos == pos);
670                 assert((bi_clause_formats[idx].tag_1 == BI_CLAUSE_SUBWORD_Z) ==
671                                 (pos == count - 1));
672 
673                 /* Whether to end the clause immediately after the last tuple */
674                 bool z = (constant_quads == 0);
675 
676                 bi_pack_format(emission, indices[clause->tuple_count - 1][pos],
677                                 ins, clause->tuple_count, header, ec0, m0,
678                                 z);
679         }
680 
681         /* Pack the remaining constants */
682 
683         for (unsigned pos = 0; pos < constant_quads; ++pos) {
684                 bi_pack_constants(clause->tuple_count, clause->constants,
685                                 pos, constant_quads, ec0_packed, emission);
686         }
687 }
688 
689 static void
bi_collect_blend_ret_addr(bi_context * ctx,struct util_dynarray * emission,const bi_clause * clause)690 bi_collect_blend_ret_addr(bi_context *ctx, struct util_dynarray *emission,
691                           const bi_clause *clause)
692 {
693         /* No need to collect return addresses when we're in a blend shader. */
694         if (ctx->inputs->is_blend)
695                 return;
696 
697         const bi_tuple *tuple = &clause->tuples[clause->tuple_count - 1];
698         const bi_instr *ins = tuple->add;
699 
700         if (!ins || ins->op != BI_OPCODE_BLEND)
701                 return;
702 
703 
704         unsigned loc = tuple->regs.fau_idx - BIR_FAU_BLEND_0;
705         assert(loc < ARRAY_SIZE(ctx->info->bifrost.blend));
706         assert(!ctx->info->bifrost.blend[loc].return_offset);
707         ctx->info->bifrost.blend[loc].return_offset =
708                 util_dynarray_num_elements(emission, uint8_t);
709         assert(!(ctx->info->bifrost.blend[loc].return_offset & 0x7));
710 }
711 
712 unsigned
bi_pack(bi_context * ctx,struct util_dynarray * emission)713 bi_pack(bi_context *ctx, struct util_dynarray *emission)
714 {
715         unsigned previous_size = emission->size;
716 
717         bi_foreach_block(ctx, block) {
718                 bi_assign_branch_offset(ctx, block);
719 
720                 bi_foreach_clause_in_block(block, clause) {
721                         bool is_last = (clause->link.next == &block->clauses);
722 
723                         /* Get the succeeding clauses, either two successors of
724                          * the block for the last clause in the block or just
725                          * the next clause within the block */
726 
727                         bi_clause *next = NULL, *next_2 = NULL;
728 
729                         if (is_last) {
730                                 next = bi_next_clause(ctx, block->successors[0], NULL);
731                                 next_2 = bi_next_clause(ctx, block->successors[1], NULL);
732                         } else {
733                                 next = bi_next_clause(ctx, block, clause);
734                         }
735 
736 
737                         previous_size = emission->size;
738 
739                         bi_pack_clause(ctx, clause, next, next_2, emission, ctx->stage);
740 
741                         if (!is_last)
742                                 bi_collect_blend_ret_addr(ctx, emission, clause);
743                 }
744         }
745 
746         return emission->size - previous_size;
747 }
748