1 /*
2  * Copyright (C) 2020 Collabora, Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "compiler.h"
25 #include "bi_print.h"
26 
/* Copies the bytes of the (bitfield) struct `str` into a zero-initialized
 * 64-bit temporary and returns that temporary from the enclosing function.
 * Using memcpy avoids type-punning through a pointer cast. The struct must not
 * be larger than 64 bits; this is asserted so an oversized struct cannot
 * overflow the temporary.
 *
 * This deliberately expands to a plain braced block, so existing uses keep
 * compiling regardless of how they are terminated. */
#define RETURN_PACKED(str) { \
        uint64_t temp = 0; \
        assert(sizeof(str) <= sizeof(temp)); \
        memcpy(&temp, &(str), sizeof(str)); \
        return temp; \
}
32 
33 /* This file contains the final passes of the compiler. Running after
34  * scheduling and RA, the IR is now finalized, so we need to emit it to actual
35  * bits on the wire (as well as fixup branches) */
36 
37 static uint64_t
bi_pack_header(bi_clause * clause,bi_clause * next_1,bi_clause * next_2,bool is_fragment)38 bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2, bool is_fragment)
39 {
40         /* next_dependencies are the union of the dependencies of successors'
41          * dependencies */
42 
43         unsigned scoreboard_deps = next_1 ? next_1->dependencies : 0;
44         scoreboard_deps |= next_2 ? next_2->dependencies : 0;
45 
46         struct bifrost_header header = {
47                 .back_to_back = clause->back_to_back,
48                 .no_end_of_shader = (next_1 != NULL),
49                 .elide_writes = is_fragment,
50                 .branch_cond = clause->branch_conditional || clause->back_to_back,
51                 .datareg_writebarrier = clause->data_register_write_barrier,
52                 .datareg = clause->data_register,
53                 .scoreboard_deps = scoreboard_deps,
54                 .scoreboard_index = clause->scoreboard_id,
55                 .clause_type = clause->clause_type,
56                 .next_clause_type = next_1 ? next_1->clause_type : 0,
57                 .suppress_inf = true,
58                 .suppress_nan = true,
59         };
60 
61         header.branch_cond |= header.back_to_back;
62 
63         uint64_t u = 0;
64         memcpy(&u, &header, sizeof(header));
65         return u;
66 }
67 
68 /* The uniform/constant slot allows loading a contiguous 64-bit immediate or
69  * pushed uniform per bundle. Figure out which one we need in the bundle (the
70  * scheduler needs to ensure we only have one type per bundle), validate
71  * everything, and rewrite away the register/uniform indices to use 3-bit
72  * sources directly. */
73 
74 static unsigned
bi_lookup_constant(bi_clause * clause,uint64_t cons,bool * hi,bool b64)75 bi_lookup_constant(bi_clause *clause, uint64_t cons, bool *hi, bool b64)
76 {
77         uint64_t want = (cons >> 4);
78 
79         for (unsigned i = 0; i < clause->constant_count; ++i) {
80                 /* Only check top 60-bits since that's what's actually embedded
81                  * in the clause, the bottom 4-bits are bundle-inline */
82 
83                 uint64_t candidates[2] = {
84                         clause->constants[i] >> 4,
85                         clause->constants[i] >> 36
86                 };
87 
88                 /* For <64-bit mode, we treat lo/hi separately */
89 
90                 if (!b64)
91                         candidates[0] &= (0xFFFFFFFF >> 4);
92 
93                 if (candidates[0] == want)
94                         return i;
95 
96                 if (candidates[1] == want && !b64) {
97                         *hi = true;
98                         return i;
99                 }
100         }
101 
102         unreachable("Invalid constant accessed");
103 }
104 
/* Translates a clause constant index (0 through 5) into the upper bits of the
 * uniform/const field. The low 4 bits of the result are zero; callers OR the
 * bundle-inline bits into them. */

static unsigned
bi_constant_field(unsigned idx)
{
        static const unsigned slot_encoding[6] = { 4, 5, 6, 7, 2, 3 };

        assert(idx < 6);

        return slot_encoding[idx] << 4;
}
116 
117 static bool
bi_assign_uniform_constant_single(bi_registers * regs,bi_clause * clause,bi_instruction * ins,bool assigned,bool fast_zero)118 bi_assign_uniform_constant_single(
119                 bi_registers *regs,
120                 bi_clause *clause,
121                 bi_instruction *ins, bool assigned, bool fast_zero)
122 {
123         if (!ins)
124                 return assigned;
125 
126         if (ins->type == BI_BLEND) {
127                 assert(!assigned);
128                 regs->uniform_constant = 0x8;
129                 return true;
130         }
131 
132         if (ins->type == BI_BRANCH && clause->branch_constant) {
133                 /* By convention branch constant is last */
134                 unsigned idx = clause->constant_count - 1;
135 
136                 /* We can only jump to clauses which are qword aligned so the
137                  * bottom 4-bits of the offset are necessarily 0 */
138                 unsigned lo = 0;
139 
140                 /* Build the constant */
141                 unsigned C = bi_constant_field(idx) | lo;
142 
143                 if (assigned && regs->uniform_constant != C)
144                         unreachable("Mismatched uniform/const field: branch");
145 
146                 regs->uniform_constant = C;
147                 return true;
148         }
149 
150         bi_foreach_src(ins, s) {
151                 if (s == 0 && (ins->type == BI_LOAD_VAR_ADDRESS || ins->type == BI_LOAD_ATTR)) continue;
152                 if (s == 1 && (ins->type == BI_BRANCH)) continue;
153 
154                 if (ins->src[s] & BIR_INDEX_CONSTANT) {
155                         /* Let direct addresses through */
156                         if (ins->type == BI_LOAD_VAR)
157                                 continue;
158 
159                         bool hi = false;
160                         bool b64 = nir_alu_type_get_type_size(ins->src_types[s]) > 32;
161                         uint64_t cons = bi_get_immediate(ins, s);
162                         unsigned idx = bi_lookup_constant(clause, cons, &hi, b64);
163                         unsigned lo = clause->constants[idx] & 0xF;
164                         unsigned f = bi_constant_field(idx) | lo;
165 
166                         if (assigned && regs->uniform_constant != f)
167                                 unreachable("Mismatched uniform/const field: imm");
168 
169                         regs->uniform_constant = f;
170                         ins->src[s] = BIR_INDEX_PASS | (hi ? BIFROST_SRC_CONST_HI : BIFROST_SRC_CONST_LO);
171                         assigned = true;
172                 } else if (ins->src[s] & BIR_INDEX_ZERO && (ins->type == BI_LOAD_UNIFORM || ins->type == BI_LOAD_VAR)) {
173                         /* XXX: HACK UNTIL WE HAVE HI MATCHING DUE TO OVERFLOW XXX */
174                         ins->src[s] = BIR_INDEX_PASS | BIFROST_SRC_CONST_HI;
175                 } else if (ins->src[s] & BIR_INDEX_ZERO && !fast_zero) {
176                         /* FMAs have a fast zero port, ADD needs to use the
177                          * uniform/const port's special 0 mode handled here */
178                         unsigned f = 0;
179 
180                         if (assigned && regs->uniform_constant != f)
181                                 unreachable("Mismatched uniform/const field: 0");
182 
183                         regs->uniform_constant = f;
184                         ins->src[s] = BIR_INDEX_PASS | BIFROST_SRC_CONST_LO;
185                         assigned = true;
186                 } else if (ins->src[s] & BIR_INDEX_ZERO && fast_zero) {
187                         ins->src[s] = BIR_INDEX_PASS | BIFROST_SRC_STAGE;
188                 } else if (s & BIR_INDEX_UNIFORM) {
189                         unreachable("Push uniforms not implemented yet");
190                 }
191         }
192 
193         return assigned;
194 }
195 
196 static void
bi_assign_uniform_constant(bi_clause * clause,bi_registers * regs,bi_bundle bundle)197 bi_assign_uniform_constant(
198                 bi_clause *clause,
199                 bi_registers *regs,
200                 bi_bundle bundle)
201 {
202         bool assigned =
203                 bi_assign_uniform_constant_single(regs, clause, bundle.fma, false, true);
204 
205         bi_assign_uniform_constant_single(regs, clause, bundle.add, assigned, false);
206 }
207 
208 /* Assigns a port for reading, before anything is written */
209 
210 static void
bi_assign_port_read(bi_registers * regs,unsigned src)211 bi_assign_port_read(bi_registers *regs, unsigned src)
212 {
213         /* We only assign for registers */
214         if (!(src & BIR_INDEX_REGISTER))
215                 return;
216 
217         unsigned reg = src & ~BIR_INDEX_REGISTER;
218 
219         /* Check if we already assigned the port */
220         for (unsigned i = 0; i <= 1; ++i) {
221                 if (regs->port[i] == reg && regs->enabled[i])
222                         return;
223         }
224 
225         if (regs->port[3] == reg && regs->read_port3)
226                 return;
227 
228         /* Assign it now */
229 
230         for (unsigned i = 0; i <= 1; ++i) {
231                 if (!regs->enabled[i]) {
232                         regs->port[i] = reg;
233                         regs->enabled[i] = true;
234                         return;
235                 }
236         }
237 
238         if (!regs->read_port3) {
239                 regs->port[3] = reg;
240                 regs->read_port3 = true;
241                 return;
242         }
243 
244         bi_print_ports(regs, stderr);
245         unreachable("Failed to find a free port for src");
246 }
247 
248 static bi_registers
bi_assign_ports(bi_bundle * now,bi_bundle * prev)249 bi_assign_ports(bi_bundle *now, bi_bundle *prev)
250 {
251         /* We assign ports for the main register mechanism. Special ops
252          * use the data registers, which has its own mechanism entirely
253          * and thus gets skipped over here. */
254 
255         unsigned read_dreg = now->add &&
256                 bi_class_props[now->add->type] & BI_DATA_REG_SRC;
257 
258         unsigned write_dreg = prev->add &&
259                 bi_class_props[prev->add->type] & BI_DATA_REG_DEST;
260 
261         /* First, assign reads */
262 
263         if (now->fma)
264                 bi_foreach_src(now->fma, src)
265                         bi_assign_port_read(&now->regs, now->fma->src[src]);
266 
267         if (now->add) {
268                 bi_foreach_src(now->add, src) {
269                         if (!(src == 0 && read_dreg))
270                                 bi_assign_port_read(&now->regs, now->add->src[src]);
271                 }
272         }
273 
274         /* Next, assign writes */
275 
276         if (prev->add && prev->add->dest & BIR_INDEX_REGISTER && !write_dreg) {
277                 now->regs.port[2] = prev->add->dest & ~BIR_INDEX_REGISTER;
278                 now->regs.write_add = true;
279         }
280 
281         if (prev->fma && prev->fma->dest & BIR_INDEX_REGISTER) {
282                 unsigned r = prev->fma->dest & ~BIR_INDEX_REGISTER;
283 
284                 if (now->regs.write_add) {
285                         /* Scheduler constraint: cannot read 3 and write 2 */
286                         assert(!now->regs.read_port3);
287                         now->regs.port[3] = r;
288                 } else {
289                         now->regs.port[2] = r;
290                 }
291 
292                 now->regs.write_fma = true;
293         }
294 
295         return now->regs;
296 }
297 
298 /* Determines the register control field, ignoring the first? flag */
299 
300 static enum bifrost_reg_control
bi_pack_register_ctrl_lo(bi_registers r)301 bi_pack_register_ctrl_lo(bi_registers r)
302 {
303         if (r.write_fma) {
304                 if (r.write_add) {
305                         assert(!r.read_port3);
306                         return BIFROST_WRITE_ADD_P2_FMA_P3;
307                 } else {
308                         if (r.read_port3)
309                                 return BIFROST_WRITE_FMA_P2_READ_P3;
310                         else
311                                 return BIFROST_WRITE_FMA_P2;
312                 }
313         } else if (r.write_add) {
314                 if (r.read_port3)
315                         return BIFROST_WRITE_ADD_P2_READ_P3;
316                 else
317                         return BIFROST_WRITE_ADD_P2;
318         } else if (r.read_port3)
319                 return BIFROST_READ_P3;
320         else
321                 return BIFROST_REG_NONE;
322 }
323 
324 /* Ditto but account for the first? flag this time */
325 
326 static enum bifrost_reg_control
bi_pack_register_ctrl(bi_registers r)327 bi_pack_register_ctrl(bi_registers r)
328 {
329         enum bifrost_reg_control ctrl = bi_pack_register_ctrl_lo(r);
330 
331         if (r.first_instruction) {
332                 if (ctrl == BIFROST_REG_NONE)
333                         ctrl = BIFROST_FIRST_NONE;
334                 else if (ctrl == BIFROST_WRITE_FMA_P2_READ_P3)
335                         ctrl = BIFROST_FIRST_WRITE_FMA_P2_READ_P3;
336                 else
337                         ctrl |= BIFROST_FIRST_NONE;
338         }
339 
340         return ctrl;
341 }
342 
343 static uint64_t
bi_pack_registers(bi_registers regs)344 bi_pack_registers(bi_registers regs)
345 {
346         enum bifrost_reg_control ctrl = bi_pack_register_ctrl(regs);
347         struct bifrost_regs s = { 0 };
348         uint64_t packed = 0;
349 
350         if (regs.enabled[1]) {
351                 /* Gotta save that bit!~ Required by the 63-x trick */
352                 assert(regs.port[1] > regs.port[0]);
353                 assert(regs.enabled[0]);
354 
355                 /* Do the 63-x trick, see docs/disasm */
356                 if (regs.port[0] > 31) {
357                         regs.port[0] = 63 - regs.port[0];
358                         regs.port[1] = 63 - regs.port[1];
359                 }
360 
361                 assert(regs.port[0] <= 31);
362                 assert(regs.port[1] <= 63);
363 
364                 s.ctrl = ctrl;
365                 s.reg1 = regs.port[1];
366                 s.reg0 = regs.port[0];
367         } else {
368                 /* Port 1 disabled, so set to zero and use port 1 for ctrl */
369                 s.ctrl = 0;
370                 s.reg1 = ctrl << 2;
371 
372                 if (regs.enabled[0]) {
373                         /* Bit 0 upper bit of port 0 */
374                         s.reg1 |= (regs.port[0] >> 5);
375 
376                         /* Rest of port 0 in usual spot */
377                         s.reg0 = (regs.port[0] & 0b11111);
378                 } else {
379                         /* Bit 1 set if port 0 also disabled */
380                         s.reg1 |= (1 << 1);
381                 }
382         }
383 
384         /* When port 3 isn't used, we have to set it to port 2, and vice versa,
385          * or INSTR_INVALID_ENC is raised. The reason is unknown. */
386 
387         bool has_port2 = regs.write_fma || regs.write_add;
388         bool has_port3 = regs.read_port3 || (regs.write_fma && regs.write_add);
389 
390         if (!has_port3)
391                 regs.port[3] = regs.port[2];
392 
393         if (!has_port2)
394                 regs.port[2] = regs.port[3];
395 
396         s.reg3 = regs.port[3];
397         s.reg2 = regs.port[2];
398         s.uniform_const = regs.uniform_constant;
399 
400         memcpy(&packed, &s, sizeof(s));
401         return packed;
402 }
403 
404 static void
bi_set_data_register(bi_clause * clause,unsigned idx)405 bi_set_data_register(bi_clause *clause, unsigned idx)
406 {
407         assert(idx & BIR_INDEX_REGISTER);
408         unsigned reg = idx & ~BIR_INDEX_REGISTER;
409         assert(reg <= 63);
410         clause->data_register = reg;
411 }
412 
413 static void
bi_read_data_register(bi_clause * clause,bi_instruction * ins)414 bi_read_data_register(bi_clause *clause, bi_instruction *ins)
415 {
416         bi_set_data_register(clause, ins->src[0]);
417 }
418 
419 static void
bi_write_data_register(bi_clause * clause,bi_instruction * ins)420 bi_write_data_register(bi_clause *clause, bi_instruction *ins)
421 {
422         bi_set_data_register(clause, ins->dest);
423 }
424 
425 static enum bifrost_packed_src
bi_get_src_reg_port(bi_registers * regs,unsigned src)426 bi_get_src_reg_port(bi_registers *regs, unsigned src)
427 {
428         unsigned reg = src & ~BIR_INDEX_REGISTER;
429 
430         if (regs->port[0] == reg && regs->enabled[0])
431                 return BIFROST_SRC_PORT0;
432         else if (regs->port[1] == reg && regs->enabled[1])
433                 return BIFROST_SRC_PORT1;
434         else if (regs->port[3] == reg && regs->read_port3)
435                 return BIFROST_SRC_PORT3;
436         else
437                 unreachable("Tried to access register with no port");
438 }
439 
440 static enum bifrost_packed_src
bi_get_src(bi_instruction * ins,bi_registers * regs,unsigned s)441 bi_get_src(bi_instruction *ins, bi_registers *regs, unsigned s)
442 {
443         unsigned src = ins->src[s];
444 
445         if (src & BIR_INDEX_REGISTER)
446                 return bi_get_src_reg_port(regs, src);
447         else if (src & BIR_INDEX_PASS)
448                 return src & ~BIR_INDEX_PASS;
449         else {
450                 bi_print_instruction(ins, stderr);
451                 unreachable("Unknown src in above instruction");
452         }
453 }
454 
455 /* Constructs a packed 2-bit swizzle for a 16-bit vec2 source. Source must be
456  * 16-bit and written components must correspond to valid swizzles (component x
457  * or y). */
458 
459 static unsigned
bi_swiz16(bi_instruction * ins,unsigned src)460 bi_swiz16(bi_instruction *ins, unsigned src)
461 {
462         assert(nir_alu_type_get_type_size(ins->src_types[src]) == 16);
463         unsigned swizzle = 0;
464 
465         for (unsigned c = 0; c < 2; ++c) {
466                 if (!bi_writes_component(ins, src)) continue;
467 
468                 unsigned k = ins->swizzle[src][c];
469                 assert(k <= 1);
470                 swizzle |= (k << c);
471         }
472 
473         return swizzle;
474 }
475 
476 static unsigned
bi_pack_fma_fma(bi_instruction * ins,bi_registers * regs)477 bi_pack_fma_fma(bi_instruction *ins, bi_registers *regs)
478 {
479         /* (-a)(-b) = ab, so we only need one negate bit */
480         bool negate_mul = ins->src_neg[0] ^ ins->src_neg[1];
481 
482         if (ins->op.mscale) {
483                 assert(!(ins->src_abs[0] && ins->src_abs[1]));
484                 assert(!ins->src_abs[2] || !ins->src_neg[3] || !ins->src_abs[3]);
485 
486                 /* We can have exactly one abs, and can flip the multiplication
487                  * to make it fit if we have to */
488                 bool flip_ab = ins->src_abs[1];
489 
490                 struct bifrost_fma_mscale pack = {
491                         .src0 = bi_get_src(ins, regs, flip_ab ? 1 : 0),
492                         .src1 = bi_get_src(ins, regs, flip_ab ? 0 : 1),
493                         .src2 = bi_get_src(ins, regs, 2),
494                         .src3 = bi_get_src(ins, regs, 3),
495                         .mscale_mode = 0,
496                         .mode = ins->outmod,
497                         .src0_abs = ins->src_abs[0] || ins->src_abs[1],
498                         .src1_neg = negate_mul,
499                         .src2_neg = ins->src_neg[2],
500                         .op = BIFROST_FMA_OP_MSCALE,
501                 };
502 
503                 RETURN_PACKED(pack);
504         } else if (ins->dest_type == nir_type_float32) {
505                 struct bifrost_fma_fma pack = {
506                         .src0 = bi_get_src(ins, regs, 0),
507                         .src1 = bi_get_src(ins, regs, 1),
508                         .src2 = bi_get_src(ins, regs, 2),
509                         .src0_abs = ins->src_abs[0],
510                         .src1_abs = ins->src_abs[1],
511                         .src2_abs = ins->src_abs[2],
512                         .src0_neg = negate_mul,
513                         .src2_neg = ins->src_neg[2],
514                         .outmod = ins->outmod,
515                         .roundmode = ins->roundmode,
516                         .op = BIFROST_FMA_OP_FMA
517                 };
518 
519                 RETURN_PACKED(pack);
520         } else if (ins->dest_type == nir_type_float16) {
521                 struct bifrost_fma_fma16 pack = {
522                         .src0 = bi_get_src(ins, regs, 0),
523                         .src1 = bi_get_src(ins, regs, 1),
524                         .src2 = bi_get_src(ins, regs, 2),
525                         .swizzle_0 = bi_swiz16(ins, 0),
526                         .swizzle_1 = bi_swiz16(ins, 1),
527                         .swizzle_2 = bi_swiz16(ins, 2),
528                         .src0_neg = negate_mul,
529                         .src2_neg = ins->src_neg[2],
530                         .outmod = ins->outmod,
531                         .roundmode = ins->roundmode,
532                         .op = BIFROST_FMA_OP_FMA16
533                 };
534 
535                 RETURN_PACKED(pack);
536         } else {
537                 unreachable("Invalid fma dest type");
538         }
539 }
540 
541 static unsigned
bi_pack_fma_addmin_f32(bi_instruction * ins,bi_registers * regs)542 bi_pack_fma_addmin_f32(bi_instruction *ins, bi_registers *regs)
543 {
544         unsigned op =
545                 (ins->type == BI_ADD) ? BIFROST_FMA_OP_FADD32 :
546                 (ins->op.minmax == BI_MINMAX_MIN) ? BIFROST_FMA_OP_FMIN32 :
547                 BIFROST_FMA_OP_FMAX32;
548 
549         struct bifrost_fma_add pack = {
550                 .src0 = bi_get_src(ins, regs, 0),
551                 .src1 = bi_get_src(ins, regs, 1),
552                 .src0_abs = ins->src_abs[0],
553                 .src1_abs = ins->src_abs[1],
554                 .src0_neg = ins->src_neg[0],
555                 .src1_neg = ins->src_neg[1],
556                 .unk = 0x0,
557                 .outmod = ins->outmod,
558                 .roundmode = (ins->type == BI_ADD) ? ins->roundmode : ins->minmax,
559                 .op = op
560         };
561 
562         RETURN_PACKED(pack);
563 }
564 
565 static bool
bi_pack_fp16_abs(bi_instruction * ins,bi_registers * regs,bool * flip)566 bi_pack_fp16_abs(bi_instruction *ins, bi_registers *regs, bool *flip)
567 {
568         /* Absolute values are packed in a quirky way. Let k = src1 < src0. Let
569          * l be an auxiliary bit we encode. Then the hardware determines:
570          *
571          *      abs0 = l || k
572          *      abs1 = l && k
573          *
574          * Since add/min/max are commutative, this saves a bit by using the
575          * order of the operands as a bit (k). To pack this, first note:
576          *
577          *      (l && k) implies (l || k).
578          *
579          * That is, if the second argument is abs'd, then the first argument
580          * also has abs. So there are three cases:
581          *
582          * Case 0: Neither src has absolute value. Then we have l = k = 0.
583          *
584          * Case 1: Exactly one src has absolute value. Assign that source to
585          * src0 and the other source to src1. Compute k = src1 < src0 based on
586          * that assignment. Then l = ~k.
587          *
588          * Case 2: Both sources have absolute value. Then we have l = k = 1.
589          * Note to force k = 1 requires that (src1 < src0) OR (src0 < src1).
590          * That is, this encoding is only valid if src1 and src0 are distinct.
591          * This is a scheduling restriction (XXX); if an op of this type
592          * requires both identical sources to have abs value, then we must
593          * schedule to ADD (which does not use this ordering trick).
594          */
595 
596         unsigned abs_0 = ins->src_abs[0], abs_1 = ins->src_abs[1];
597         unsigned src_0 = bi_get_src(ins, regs, 0);
598         unsigned src_1 = bi_get_src(ins, regs, 1);
599 
600         assert(!(abs_0 && abs_1 && src_0 == src_1));
601 
602         if (!abs_0 && !abs_1) {
603                 /* Force k = 0 <===> NOT(src1 < src0) */
604                 *flip = (src_1 < src_0);
605                 return false;
606         } else if (abs_0 && !abs_1) {
607                 return src_1 >= src_0;
608         } else if (abs_1 && !abs_0) {
609                 *flip = true;
610                 return src_0 >= src_1;
611         } else {
612                 *flip = !(src_1 < src_0);
613                 return true;
614         }
615 }
616 
617 static unsigned
bi_pack_fmadd_min_f16(bi_instruction * ins,bi_registers * regs,bool FMA)618 bi_pack_fmadd_min_f16(bi_instruction *ins, bi_registers *regs, bool FMA)
619 {
620         unsigned op =
621                 (!FMA) ? ((ins->op.minmax == BI_MINMAX_MIN) ?
622                         BIFROST_ADD_OP_FMIN16 : BIFROST_ADD_OP_FMAX16) :
623                 (ins->type == BI_ADD) ? BIFROST_FMA_OP_FADD16 :
624                 (ins->op.minmax == BI_MINMAX_MIN) ? BIFROST_FMA_OP_FMIN16 :
625                 BIFROST_FMA_OP_FMAX16;
626 
627         bool flip = false;
628         bool l = bi_pack_fp16_abs(ins, regs, &flip);
629         unsigned src_0 = bi_get_src(ins, regs, 0);
630         unsigned src_1 = bi_get_src(ins, regs, 1);
631 
632         if (FMA) {
633                 struct bifrost_fma_add_minmax16 pack = {
634                         .src0 = flip ? src_1 : src_0,
635                         .src1 = flip ? src_0 : src_1,
636                         .src0_neg = ins->src_neg[flip ? 1 : 0],
637                         .src1_neg = ins->src_neg[flip ? 0 : 1],
638                         .src0_swizzle = bi_swiz16(ins, flip ? 1 : 0),
639                         .src1_swizzle = bi_swiz16(ins, flip ? 0 : 1),
640                         .abs1 = l,
641                         .outmod = ins->outmod,
642                         .mode = (ins->type == BI_ADD) ? ins->roundmode : ins->minmax,
643                         .op = op
644                 };
645 
646                 RETURN_PACKED(pack);
647         } else {
648                 /* Can't have modes for fp16 */
649                 assert(ins->outmod == 0);
650 
651                 struct bifrost_add_fmin16 pack = {
652                         .src0 = flip ? src_1 : src_0,
653                         .src1 = flip ? src_0 : src_1,
654                         .src0_neg = ins->src_neg[flip ? 1 : 0],
655                         .src1_neg = ins->src_neg[flip ? 0 : 1],
656                         .abs1 = l,
657                         .src0_swizzle = bi_swiz16(ins, flip ? 1 : 0),
658                         .src1_swizzle = bi_swiz16(ins, flip ? 0 : 1),
659                         .mode = ins->minmax,
660                         .op = op
661                 };
662 
663                 RETURN_PACKED(pack);
664         }
665 }
666 
667 static unsigned
bi_pack_fma_addmin(bi_instruction * ins,bi_registers * regs)668 bi_pack_fma_addmin(bi_instruction *ins, bi_registers *regs)
669 {
670         if (ins->dest_type == nir_type_float32)
671                 return bi_pack_fma_addmin_f32(ins, regs);
672         else if(ins->dest_type == nir_type_float16)
673                 return bi_pack_fmadd_min_f16(ins, regs, true);
674         else
675                 unreachable("Unknown FMA/ADD type");
676 }
677 
678 static unsigned
bi_pack_fma_1src(bi_instruction * ins,bi_registers * regs,unsigned op)679 bi_pack_fma_1src(bi_instruction *ins, bi_registers *regs, unsigned op)
680 {
681         struct bifrost_fma_inst pack = {
682                 .src0 = bi_get_src(ins, regs, 0),
683                 .op = op
684         };
685 
686         RETURN_PACKED(pack);
687 }
688 
689 static unsigned
bi_pack_fma_2src(bi_instruction * ins,bi_registers * regs,unsigned op)690 bi_pack_fma_2src(bi_instruction *ins, bi_registers *regs, unsigned op)
691 {
692         struct bifrost_fma_2src pack = {
693                 .src0 = bi_get_src(ins, regs, 0),
694                 .src1 = bi_get_src(ins, regs, 1),
695                 .op = op
696         };
697 
698         RETURN_PACKED(pack);
699 }
700 
701 static unsigned
bi_pack_add_1src(bi_instruction * ins,bi_registers * regs,unsigned op)702 bi_pack_add_1src(bi_instruction *ins, bi_registers *regs, unsigned op)
703 {
704         struct bifrost_add_inst pack = {
705                 .src0 = bi_get_src(ins, regs, 0),
706                 .op = op
707         };
708 
709         RETURN_PACKED(pack);
710 }
711 
712 static enum bifrost_csel_cond
bi_cond_to_csel(enum bi_cond cond,bool * flip,bool * invert,nir_alu_type T)713 bi_cond_to_csel(enum bi_cond cond, bool *flip, bool *invert, nir_alu_type T)
714 {
715         nir_alu_type B = nir_alu_type_get_base_type(T);
716         unsigned idx = (B == nir_type_float) ? 0 :
717                 ((B == nir_type_int) ? 1 : 2);
718 
719         switch (cond){
720         case BI_COND_LT:
721                 *flip = true;
722                 /* fallthrough */
723         case BI_COND_GT: {
724                 const enum bifrost_csel_cond ops[] = {
725                         BIFROST_FGT_F,
726                         BIFROST_IGT_I,
727                         BIFROST_UGT_I
728                 };
729 
730                 return ops[idx];
731         }
732         case BI_COND_LE:
733                 *flip = true;
734                 /* fallthrough */
735         case BI_COND_GE: {
736                 const enum bifrost_csel_cond ops[] = {
737                         BIFROST_FGE_F,
738                         BIFROST_IGE_I,
739                         BIFROST_UGE_I
740                 };
741 
742                 return ops[idx];
743         }
744         case BI_COND_NE:
745                 *invert = true;
746                 /* fallthrough */
747         case BI_COND_EQ: {
748                 const enum bifrost_csel_cond ops[] = {
749                         BIFROST_FEQ_F,
750                         BIFROST_IEQ_F,
751                         BIFROST_IEQ_F /* sign is irrelevant */
752                 };
753 
754                 return ops[idx];
755         }
756         default:
757                 unreachable("Invalid op for csel");
758         }
759 }
760 
761 static unsigned
bi_pack_fma_csel(bi_instruction * ins,bi_registers * regs)762 bi_pack_fma_csel(bi_instruction *ins, bi_registers *regs)
763 {
764         /* TODO: Use csel3 as well */
765         bool flip = false, invert = false;
766 
767         enum bifrost_csel_cond cond =
768                 bi_cond_to_csel(ins->cond, &flip, &invert, ins->src_types[0]);
769 
770         unsigned size = nir_alu_type_get_type_size(ins->dest_type);
771 
772         unsigned cmp_0 = (flip ? 1 : 0);
773         unsigned cmp_1 = (flip ? 0 : 1);
774         unsigned res_0 = (invert ? 3 : 2);
775         unsigned res_1 = (invert ? 2 : 3);
776 
777         struct bifrost_csel4 pack = {
778                 .src0 = bi_get_src(ins, regs, cmp_0),
779                 .src1 = bi_get_src(ins, regs, cmp_1),
780                 .src2 = bi_get_src(ins, regs, res_0),
781                 .src3 = bi_get_src(ins, regs, res_1),
782                 .cond = cond,
783                 .op = (size == 16) ? BIFROST_FMA_OP_CSEL4_V16 :
784                         BIFROST_FMA_OP_CSEL4
785         };
786 
787         RETURN_PACKED(pack);
788 }
789 
790 static unsigned
bi_pack_fma_frexp(bi_instruction * ins,bi_registers * regs)791 bi_pack_fma_frexp(bi_instruction *ins, bi_registers *regs)
792 {
793         unsigned op = BIFROST_FMA_OP_FREXPE_LOG;
794         return bi_pack_fma_1src(ins, regs, op);
795 }
796 
797 static unsigned
bi_pack_fma_reduce(bi_instruction * ins,bi_registers * regs)798 bi_pack_fma_reduce(bi_instruction *ins, bi_registers *regs)
799 {
800         if (ins->op.reduce == BI_REDUCE_ADD_FREXPM) {
801                 return bi_pack_fma_2src(ins, regs, BIFROST_FMA_OP_ADD_FREXPM);
802         } else {
803                 unreachable("Invalid reduce op");
804         }
805 }
806 
807 /* We have a single convert opcode in the IR but a number of opcodes that could
808  * come out. In particular we have native opcodes for:
809  *
810  * [ui]16 --> [fui]32           -- int16_to_32
811  * f16     --> f32              -- float16_to_32
812  * f32     --> f16              -- float32_to_16
813  * f32     --> [ui]32           -- float32_to_int
814  * [ui]32  --> f32              -- int_to_float32
815  * [fui]16 --> [fui]16          -- f2i_i2f16
816  */
817 
818 static unsigned
bi_pack_convert(bi_instruction * ins,bi_registers * regs,bool FMA)819 bi_pack_convert(bi_instruction *ins, bi_registers *regs, bool FMA)
820 {
821         nir_alu_type from_base = nir_alu_type_get_base_type(ins->src_types[0]);
822         unsigned from_size = nir_alu_type_get_type_size(ins->src_types[0]);
823         bool from_unsigned = from_base == nir_type_uint;
824 
825         nir_alu_type to_base = nir_alu_type_get_base_type(ins->dest_type);
826         unsigned to_size = nir_alu_type_get_type_size(ins->dest_type);
827         bool to_unsigned = to_base == nir_type_uint;
828         bool to_float = to_base == nir_type_float;
829 
830         /* Sanity check */
831         assert((from_base != to_base) || (from_size != to_size));
832         assert((MAX2(from_size, to_size) / MIN2(from_size, to_size)) <= 2);
833 
834         /* f32 to f16 is special */
835         if (from_size == 32 && to_size == 16 && from_base == to_base) {
836                 /* TODO uint/int */
837                 assert(from_base == nir_type_float);
838 
839                 struct bifrost_fma_2src pfma = {
840                         .src0 = bi_get_src(ins, regs, 0),
841                         .src1 = bi_get_src(ins, regs, 1),
842                         .op = BIFROST_FMA_FLOAT32_TO_16
843                 };
844 
845                 struct bifrost_add_2src padd = {
846                         .src0 = bi_get_src(ins, regs, 0),
847                         .src1 = bi_get_src(ins, regs, 1),
848                         .op = BIFROST_ADD_FLOAT32_TO_16
849                 };
850 
851                 if (FMA) {
852                         RETURN_PACKED(pfma);
853                 } else {
854                         RETURN_PACKED(padd);
855                 }
856         }
857 
858         /* Otherwise, figure out the mode */
859         unsigned op = 0;
860 
861         if (from_size == 16 && to_size == 32) {
862                 unsigned component = ins->swizzle[0][0];
863                 assert(component <= 1);
864 
865                 if (from_base == nir_type_float)
866                         op = BIFROST_CONVERT_5(component);
867                 else
868                         op = BIFROST_CONVERT_4(from_unsigned, component, to_float);
869         } else {
870                 unsigned mode = 0;
871                 unsigned swizzle = (from_size == 16) ? bi_swiz16(ins, 0) : 0;
872                 bool is_unsigned = from_unsigned;
873 
874                 if (from_base == nir_type_float) {
875                         assert(to_base != nir_type_float);
876                         is_unsigned = to_unsigned;
877 
878                         if (from_size == 32 && to_size == 32)
879                                 mode = BIFROST_CONV_F32_TO_I32;
880                         else if (from_size == 16 && to_size == 16)
881                                 mode = BIFROST_CONV_F16_TO_I16;
882                         else
883                                 unreachable("Invalid float conversion");
884                 } else {
885                         assert(to_base == nir_type_float);
886                         assert(from_size == to_size);
887 
888                         if (to_size == 32)
889                                 mode = BIFROST_CONV_I32_TO_F32;
890                         else if (to_size == 16)
891                                 mode = BIFROST_CONV_I16_TO_F16;
892                         else
893                                 unreachable("Invalid int conversion");
894                 }
895 
896                 /* Fixup swizzle for 32-bit only modes */
897 
898                 if (mode == BIFROST_CONV_I32_TO_F32)
899                         swizzle = 0b11;
900                 else if (mode == BIFROST_CONV_F32_TO_I32)
901                         swizzle = 0b10;
902 
903                 op = BIFROST_CONVERT(is_unsigned, ins->roundmode, swizzle, mode);
904 
905                 /* Unclear what the top bit is for... maybe 16-bit related */
906                 bool mode2 = mode == BIFROST_CONV_F16_TO_I16;
907                 bool mode6 = mode == BIFROST_CONV_I16_TO_F16;
908 
909                 if (!(mode2 || mode6))
910                         op |= 0x100;
911         }
912 
913         if (FMA)
914                 return bi_pack_fma_1src(ins, regs, BIFROST_FMA_CONVERT | op);
915         else
916                 return bi_pack_add_1src(ins, regs, BIFROST_ADD_CONVERT | op);
917 }
918 
919 static unsigned
bi_pack_fma_select(bi_instruction * ins,bi_registers * regs)920 bi_pack_fma_select(bi_instruction *ins, bi_registers *regs)
921 {
922         unsigned size = nir_alu_type_get_type_size(ins->src_types[0]);
923 
924         if (size == 16) {
925                 unsigned swiz = (ins->swizzle[0][0] | (ins->swizzle[1][0] << 1));
926                 unsigned op = BIFROST_FMA_SEL_16(swiz);
927                 return bi_pack_fma_2src(ins, regs, op);
928         } else if (size == 8) {
929                 unsigned swiz = 0;
930 
931                 for (unsigned c = 0; c < 4; ++c) {
932                         if (ins->swizzle[c][0]) {
933                                 /* Ensure lowering restriction is met */
934                                 assert(ins->swizzle[c][0] == 2);
935                                 swiz |= (1 << c);
936                         }
937                 }
938 
939                 struct bifrost_fma_sel8 pack = {
940                         .src0 = bi_get_src(ins, regs, 0),
941                         .src1 = bi_get_src(ins, regs, 1),
942                         .src2 = bi_get_src(ins, regs, 2),
943                         .src3 = bi_get_src(ins, regs, 3),
944                         .swizzle = swiz,
945                         .op = BIFROST_FMA_OP_SEL8
946                 };
947 
948                 RETURN_PACKED(pack);
949         } else {
950                 unreachable("Unimplemented");
951         }
952 }
953 
954 static enum bifrost_fcmp_cond
bi_fcmp_cond(enum bi_cond cond)955 bi_fcmp_cond(enum bi_cond cond)
956 {
957         switch (cond) {
958         case BI_COND_LT: return BIFROST_OLT;
959         case BI_COND_LE: return BIFROST_OLE;
960         case BI_COND_GE: return BIFROST_OGE;
961         case BI_COND_GT: return BIFROST_OGT;
962         case BI_COND_EQ: return BIFROST_OEQ;
963         case BI_COND_NE: return BIFROST_UNE;
964         default:         unreachable("Unknown bi_cond");
965         }
966 }
967 
968 /* a <?> b <==> b <flip(?)> a (TODO: NaN behaviour?) */
969 
970 static enum bifrost_fcmp_cond
bi_flip_fcmp(enum bifrost_fcmp_cond cond)971 bi_flip_fcmp(enum bifrost_fcmp_cond cond)
972 {
973         switch (cond) {
974         case BIFROST_OGT:
975                 return BIFROST_OLT;
976         case BIFROST_OGE:
977                 return BIFROST_OLE;
978         case BIFROST_OLT:
979                 return BIFROST_OGT;
980         case BIFROST_OLE:
981                 return BIFROST_OGE;
982         case BIFROST_OEQ:
983         case BIFROST_UNE:
984                 return cond;
985         default:
986                 unreachable("Unknown fcmp cond");
987         }
988 }
989 
990 static unsigned
bi_pack_fma_cmp(bi_instruction * ins,bi_registers * regs)991 bi_pack_fma_cmp(bi_instruction *ins, bi_registers *regs)
992 {
993         nir_alu_type Tl = ins->src_types[0];
994         nir_alu_type Tr = ins->src_types[1];
995 
996         if (Tl == nir_type_float32 || Tr == nir_type_float32) {
997                 /* TODO: Mixed 32/16 cmp */
998                 assert(Tl == Tr);
999 
1000                 enum bifrost_fcmp_cond cond = bi_fcmp_cond(ins->cond);
1001 
1002                 /* Only src1 has neg, so we arrange:
1003                  *      a < b   --- native
1004                  *      a < -b  --- native
1005                  *      -a < -b <===> a > b
1006                  *      -a < b  <===> a > -b
1007                  * TODO: Is this NaN-precise?
1008                  */
1009 
1010                 bool flip = ins->src_neg[0];
1011                 bool neg =  ins->src_neg[0] ^ ins->src_neg[1];
1012 
1013                 if (flip)
1014                         cond = bi_flip_fcmp(cond);
1015 
1016                 struct bifrost_fma_fcmp pack = {
1017                         .src0 = bi_get_src(ins, regs, 0),
1018                         .src1 = bi_get_src(ins, regs, 1),
1019                         .src0_abs = ins->src_abs[0],
1020                         .src1_abs = ins->src_abs[1],
1021                         .src1_neg = neg,
1022                         .src_expand = 0,
1023                         .unk1 = 0,
1024                         .cond = cond,
1025                         .op = BIFROST_FMA_OP_FCMP_D3D
1026                 };
1027 
1028                 RETURN_PACKED(pack);
1029         } else if (Tl == nir_type_float16 && Tr == nir_type_float16) {
1030                 bool flip = false;
1031                 bool l = bi_pack_fp16_abs(ins, regs, &flip);
1032                 enum bifrost_fcmp_cond cond = bi_fcmp_cond(ins->cond);
1033 
1034                 if (flip)
1035                         cond = bi_flip_fcmp(cond);
1036 
1037                 struct bifrost_fma_fcmp16 pack = {
1038                         .src0 = bi_get_src(ins, regs, flip ? 1 : 0),
1039                         .src1 = bi_get_src(ins, regs, flip ? 0 : 1),
1040                         .src0_swizzle = bi_swiz16(ins, flip ? 1 : 0),
1041                         .src1_swizzle = bi_swiz16(ins, flip ? 0 : 1),
1042                         .abs1 = l,
1043                         .unk = 0,
1044                         .cond = cond,
1045                         .op = BIFROST_FMA_OP_FCMP_D3D_16,
1046                 };
1047 
1048                 RETURN_PACKED(pack);
1049         } else {
1050                 unreachable("Unknown cmp type");
1051         }
1052 }
1053 
1054 static unsigned
bi_fma_bitwise_op(enum bi_bitwise_op op,bool rshift)1055 bi_fma_bitwise_op(enum bi_bitwise_op op, bool rshift)
1056 {
1057         switch (op) {
1058         case BI_BITWISE_OR:
1059                 /* Via De Morgan's */
1060                 return rshift ?
1061                         BIFROST_FMA_OP_RSHIFT_NAND :
1062                         BIFROST_FMA_OP_LSHIFT_NAND;
1063         case BI_BITWISE_AND:
1064                 return rshift ?
1065                         BIFROST_FMA_OP_RSHIFT_AND :
1066                         BIFROST_FMA_OP_LSHIFT_AND;
1067         case BI_BITWISE_XOR:
1068                 /* Shift direction handled out of band */
1069                 return BIFROST_FMA_OP_RSHIFT_XOR;
1070         default:
1071                 unreachable("Unknown op");
1072         }
1073 }
1074 
1075 static unsigned
bi_pack_fma_bitwise(bi_instruction * ins,bi_registers * regs)1076 bi_pack_fma_bitwise(bi_instruction *ins, bi_registers *regs)
1077 {
1078         unsigned size = nir_alu_type_get_type_size(ins->dest_type);
1079         assert(size <= 32);
1080 
1081         bool invert_0 = ins->bitwise.src_invert[0];
1082         bool invert_1 = ins->bitwise.src_invert[1];
1083 
1084         if (ins->op.bitwise == BI_BITWISE_OR) {
1085                 /* Becomes NAND, so via De Morgan's:
1086                  *      f(A) | f(B) = ~(~f(A) & ~f(B))
1087                  *                  = NAND(~f(A), ~f(B))
1088                  */
1089 
1090                 invert_0 = !invert_0;
1091                 invert_1 = !invert_1;
1092         } else if (ins->op.bitwise == BI_BITWISE_XOR) {
1093                 /* ~A ^ ~B = ~(A ^ ~B) = ~(~(A ^ B)) = A ^ B
1094                  * ~A ^  B = ~(A ^ B) = A ^ ~B
1095                  */
1096 
1097                 invert_0 ^= invert_1;
1098                 invert_1 = false;
1099 
1100                 /* invert_1 ends up specifying shift direction */
1101                 invert_1 = !ins->bitwise.rshift;
1102         }
1103 
1104         struct bifrost_shift_fma pack = {
1105                 .src0 = bi_get_src(ins, regs, 0),
1106                 .src1 = bi_get_src(ins, regs, 1),
1107                 .src2 = bi_get_src(ins, regs, 2),
1108                 .half = (size == 32) ? 0 : (size == 16) ? 0x7 : (size == 8) ? 0x4 : 0,
1109                 .unk = 1, /* XXX */
1110                 .invert_1 = invert_0,
1111                 .invert_2 = invert_1,
1112                 .op = bi_fma_bitwise_op(ins->op.bitwise, ins->bitwise.rshift)
1113         };
1114 
1115         RETURN_PACKED(pack);
1116 }
1117 
1118 static unsigned
bi_pack_fma_round(bi_instruction * ins,bi_registers * regs)1119 bi_pack_fma_round(bi_instruction *ins, bi_registers *regs)
1120 {
1121         bool fp16 = ins->dest_type == nir_type_float16;
1122         assert(fp16 || ins->dest_type == nir_type_float32);
1123 
1124         unsigned op = fp16
1125                 ? BIFROST_FMA_ROUND_16(ins->roundmode, bi_swiz16(ins, 0))
1126                 : BIFROST_FMA_ROUND_32(ins->roundmode);
1127 
1128         return bi_pack_fma_1src(ins, regs, op);
1129 }
1130 
1131 static unsigned
bi_pack_fma_imath(bi_instruction * ins,bi_registers * regs)1132 bi_pack_fma_imath(bi_instruction *ins, bi_registers *regs)
1133 {
1134         /* Scheduler: only ADD can have 8/16-bit imath */
1135         assert(ins->dest_type == nir_type_int32 || ins->dest_type == nir_type_uint32);
1136 
1137         unsigned op = ins->op.imath == BI_IMATH_ADD
1138                 ? BIFROST_FMA_IADD_32
1139                 : BIFROST_FMA_ISUB_32;
1140 
1141         return bi_pack_fma_2src(ins, regs, op);
1142 }
1143 
1144 static unsigned
bi_pack_fma_imul(bi_instruction * ins,bi_registers * regs)1145 bi_pack_fma_imul(bi_instruction *ins, bi_registers *regs)
1146 {
1147         assert(ins->op.imul == BI_IMUL_IMUL);
1148         unsigned op = BIFROST_FMA_IMUL_32;
1149         return bi_pack_fma_2src(ins, regs, op);
1150 }
1151 
1152 static unsigned
bi_pack_fma(bi_clause * clause,bi_bundle bundle,bi_registers * regs)1153 bi_pack_fma(bi_clause *clause, bi_bundle bundle, bi_registers *regs)
1154 {
1155         if (!bundle.fma)
1156                 return BIFROST_FMA_NOP;
1157 
1158         switch (bundle.fma->type) {
1159         case BI_ADD:
1160                 return bi_pack_fma_addmin(bundle.fma, regs);
1161         case BI_CMP:
1162                 return bi_pack_fma_cmp(bundle.fma, regs);
1163         case BI_BITWISE:
1164                 return bi_pack_fma_bitwise(bundle.fma, regs);
1165         case BI_CONVERT:
1166 		return bi_pack_convert(bundle.fma, regs, true);
1167         case BI_CSEL:
1168 		return bi_pack_fma_csel(bundle.fma, regs);
1169         case BI_FMA:
1170                 return bi_pack_fma_fma(bundle.fma, regs);
1171         case BI_FREXP:
1172                 return bi_pack_fma_frexp(bundle.fma, regs);
1173         case BI_IMATH:
1174                 return bi_pack_fma_imath(bundle.fma, regs);
1175         case BI_MINMAX:
1176                 return bi_pack_fma_addmin(bundle.fma, regs);
1177         case BI_MOV:
1178                 return bi_pack_fma_1src(bundle.fma, regs, BIFROST_FMA_OP_MOV);
1179         case BI_SELECT:
1180                 return bi_pack_fma_select(bundle.fma, regs);
1181         case BI_ROUND:
1182                 return bi_pack_fma_round(bundle.fma, regs);
1183         case BI_REDUCE_FMA:
1184                 return bi_pack_fma_reduce(bundle.fma, regs);
1185         case BI_IMUL:
1186                 return bi_pack_fma_imul(bundle.fma, regs);
1187         default:
1188                 unreachable("Cannot encode class as FMA");
1189         }
1190 }
1191 
1192 static unsigned
bi_pack_add_ld_vary(bi_clause * clause,bi_instruction * ins,bi_registers * regs)1193 bi_pack_add_ld_vary(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
1194 {
1195         unsigned size = nir_alu_type_get_type_size(ins->dest_type);
1196         assert(size == 32 || size == 16);
1197 
1198         unsigned op = (size == 32) ?
1199                 BIFROST_ADD_OP_LD_VAR_32 :
1200                 BIFROST_ADD_OP_LD_VAR_16;
1201 
1202         unsigned packed_addr = 0;
1203 
1204         if (ins->src[0] & BIR_INDEX_CONSTANT) {
1205                 /* Direct uses address field directly */
1206                 packed_addr = bi_get_immediate(ins, 0);
1207         } else {
1208                 /* Indirect gets an extra source */
1209                 packed_addr = bi_get_src(ins, regs, 0) | 0b11000;
1210         }
1211 
1212         /* The destination is thrown in the data register */
1213         assert(ins->dest & BIR_INDEX_REGISTER);
1214         clause->data_register = ins->dest & ~BIR_INDEX_REGISTER;
1215 
1216         unsigned channels = ins->vector_channels;
1217         assert(channels >= 1 && channels <= 4);
1218 
1219         struct bifrost_ld_var pack = {
1220                 .src0 = bi_get_src(ins, regs, 1),
1221                 .addr = packed_addr,
1222                 .channels = MALI_POSITIVE(channels),
1223                 .interp_mode = ins->load_vary.interp_mode,
1224                 .reuse = ins->load_vary.reuse,
1225                 .flat = ins->load_vary.flat,
1226                 .op = op
1227         };
1228 
1229         RETURN_PACKED(pack);
1230 }
1231 
1232 static unsigned
bi_pack_add_2src(bi_instruction * ins,bi_registers * regs,unsigned op)1233 bi_pack_add_2src(bi_instruction *ins, bi_registers *regs, unsigned op)
1234 {
1235         struct bifrost_add_2src pack = {
1236                 .src0 = bi_get_src(ins, regs, 0),
1237                 .src1 = bi_get_src(ins, regs, 1),
1238                 .op = op
1239         };
1240 
1241         RETURN_PACKED(pack);
1242 }
1243 
1244 static unsigned
bi_pack_add_addmin_f32(bi_instruction * ins,bi_registers * regs)1245 bi_pack_add_addmin_f32(bi_instruction *ins, bi_registers *regs)
1246 {
1247         unsigned op =
1248                 (ins->type == BI_ADD) ? BIFROST_ADD_OP_FADD32 :
1249                 (ins->op.minmax == BI_MINMAX_MIN) ? BIFROST_ADD_OP_FMIN32 :
1250                 BIFROST_ADD_OP_FMAX32;
1251 
1252         struct bifrost_add_faddmin pack = {
1253                 .src0 = bi_get_src(ins, regs, 0),
1254                 .src1 = bi_get_src(ins, regs, 1),
1255                 .src0_abs = ins->src_abs[0],
1256                 .src1_abs = ins->src_abs[1],
1257                 .src0_neg = ins->src_neg[0],
1258                 .src1_neg = ins->src_neg[1],
1259                 .outmod = ins->outmod,
1260                 .mode = (ins->type == BI_ADD) ? ins->roundmode : ins->minmax,
1261                 .op = op
1262         };
1263 
1264         RETURN_PACKED(pack);
1265 }
1266 
1267 static unsigned
bi_pack_add_add_f16(bi_instruction * ins,bi_registers * regs)1268 bi_pack_add_add_f16(bi_instruction *ins, bi_registers *regs)
1269 {
1270         /* ADD.v2f16 can't have outmod */
1271         assert(ins->outmod == BIFROST_NONE);
1272 
1273         struct bifrost_add_faddmin pack = {
1274                 .src0 = bi_get_src(ins, regs, 0),
1275                 .src1 = bi_get_src(ins, regs, 1),
1276                 .src0_abs = ins->src_abs[0],
1277                 .src1_abs = ins->src_abs[1],
1278                 .src0_neg = ins->src_neg[0],
1279                 .src1_neg = ins->src_neg[1],
1280                 .select = bi_swiz16(ins, 0), /* swizzle_0 */
1281                 .outmod = bi_swiz16(ins, 1), /* swizzle_1 */
1282                 .mode = ins->roundmode,
1283                 .op = BIFROST_ADD_OP_FADD16
1284         };
1285 
1286         RETURN_PACKED(pack);
1287 }
1288 
1289 static unsigned
bi_pack_add_addmin(bi_instruction * ins,bi_registers * regs)1290 bi_pack_add_addmin(bi_instruction *ins, bi_registers *regs)
1291 {
1292         if (ins->dest_type == nir_type_float32)
1293                 return bi_pack_add_addmin_f32(ins, regs);
1294         else if (ins->dest_type == nir_type_float16) {
1295                 if (ins->type == BI_ADD)
1296                         return bi_pack_add_add_f16(ins, regs);
1297                 else
1298                         return bi_pack_fmadd_min_f16(ins, regs, false);
1299         } else
1300                 unreachable("Unknown FMA/ADD type");
1301 }
1302 
1303 static unsigned
bi_pack_add_ld_ubo(bi_clause * clause,bi_instruction * ins,bi_registers * regs)1304 bi_pack_add_ld_ubo(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
1305 {
1306         assert(ins->vector_channels >= 1 && ins->vector_channels <= 4);
1307 
1308         const unsigned ops[4] = {
1309                 BIFROST_ADD_OP_LD_UBO_1,
1310                 BIFROST_ADD_OP_LD_UBO_2,
1311                 BIFROST_ADD_OP_LD_UBO_3,
1312                 BIFROST_ADD_OP_LD_UBO_4
1313         };
1314 
1315         bi_write_data_register(clause, ins);
1316         return bi_pack_add_2src(ins, regs, ops[ins->vector_channels - 1]);
1317 }
1318 
1319 static enum bifrost_ldst_type
bi_pack_ldst_type(nir_alu_type T)1320 bi_pack_ldst_type(nir_alu_type T)
1321 {
1322         switch (T) {
1323         case nir_type_float16: return BIFROST_LDST_F16;
1324         case nir_type_float32: return BIFROST_LDST_F32;
1325         case nir_type_int32:   return BIFROST_LDST_I32;
1326         case nir_type_uint32:  return BIFROST_LDST_U32;
1327         default: unreachable("Invalid type loaded");
1328         }
1329 }
1330 
1331 static unsigned
bi_pack_add_ld_var_addr(bi_clause * clause,bi_instruction * ins,bi_registers * regs)1332 bi_pack_add_ld_var_addr(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
1333 {
1334         struct bifrost_ld_var_addr pack = {
1335                 .src0 = bi_get_src(ins, regs, 1),
1336                 .src1 = bi_get_src(ins, regs, 2),
1337                 .location = bi_get_immediate(ins, 0),
1338                 .type = bi_pack_ldst_type(ins->src_types[3]),
1339                 .op = BIFROST_ADD_OP_LD_VAR_ADDR
1340         };
1341 
1342         bi_write_data_register(clause, ins);
1343         RETURN_PACKED(pack);
1344 }
1345 
1346 static unsigned
bi_pack_add_ld_attr(bi_clause * clause,bi_instruction * ins,bi_registers * regs)1347 bi_pack_add_ld_attr(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
1348 {
1349         assert(ins->vector_channels >= 0 && ins->vector_channels <= 4);
1350 
1351         struct bifrost_ld_attr pack = {
1352                 .src0 = bi_get_src(ins, regs, 1),
1353                 .src1 = bi_get_src(ins, regs, 2),
1354                 .location = bi_get_immediate(ins, 0),
1355                 .channels = MALI_POSITIVE(ins->vector_channels),
1356                 .type = bi_pack_ldst_type(ins->dest_type),
1357                 .op = BIFROST_ADD_OP_LD_ATTR
1358         };
1359 
1360         bi_write_data_register(clause, ins);
1361         RETURN_PACKED(pack);
1362 }
1363 
1364 static unsigned
bi_pack_add_st_vary(bi_clause * clause,bi_instruction * ins,bi_registers * regs)1365 bi_pack_add_st_vary(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
1366 {
1367         assert(ins->vector_channels >= 1 && ins->vector_channels <= 4);
1368 
1369         struct bifrost_st_vary pack = {
1370                 .src0 = bi_get_src(ins, regs, 1),
1371                 .src1 = bi_get_src(ins, regs, 2),
1372                 .src2 = bi_get_src(ins, regs, 3),
1373                 .channels = MALI_POSITIVE(ins->vector_channels),
1374                 .op = BIFROST_ADD_OP_ST_VAR
1375         };
1376 
1377         bi_read_data_register(clause, ins);
1378         RETURN_PACKED(pack);
1379 }
1380 
1381 static unsigned
bi_pack_add_atest(bi_clause * clause,bi_instruction * ins,bi_registers * regs)1382 bi_pack_add_atest(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
1383 {
1384         bool fp16 = (ins->src_types[1] == nir_type_float16);
1385 
1386         struct bifrost_add_atest pack = {
1387                 .src0 = bi_get_src(ins, regs, 0),
1388                 .src1 = bi_get_src(ins, regs, 1),
1389                 .half = fp16,
1390                 .component = fp16 ? ins->swizzle[1][0] : 1, /* Set for fp32 */
1391                 .op = BIFROST_ADD_OP_ATEST,
1392         };
1393 
1394         /* Despite *also* writing with the usual mechanism... quirky and
1395          * perhaps unnecessary, but let's match the blob */
1396         clause->data_register = ins->dest & ~BIR_INDEX_REGISTER;
1397 
1398         RETURN_PACKED(pack);
1399 }
1400 
1401 static unsigned
bi_pack_add_blend(bi_clause * clause,bi_instruction * ins,bi_registers * regs)1402 bi_pack_add_blend(bi_clause *clause, bi_instruction *ins, bi_registers *regs)
1403 {
1404         struct bifrost_add_inst pack = {
1405                 .src0 = bi_get_src(ins, regs, 1),
1406                 .op = BIFROST_ADD_OP_BLEND
1407         };
1408 
1409         /* TODO: Pack location in uniform_const */
1410         assert(ins->blend_location == 0);
1411 
1412         bi_read_data_register(clause, ins);
1413         RETURN_PACKED(pack);
1414 }
1415 
1416 static unsigned
bi_pack_add_special(bi_instruction * ins,bi_registers * regs)1417 bi_pack_add_special(bi_instruction *ins, bi_registers *regs)
1418 {
1419         unsigned op = 0;
1420         bool fp16 = ins->dest_type == nir_type_float16;
1421         bool Y = ins->swizzle[0][0];
1422 
1423         if (ins->op.special == BI_SPECIAL_FRCP) {
1424                 op = fp16 ?
1425                         (Y ? BIFROST_ADD_OP_FRCP_FAST_F16_Y :
1426                         BIFROST_ADD_OP_FRCP_FAST_F16_X) :
1427                         BIFROST_ADD_OP_FRCP_FAST_F32;
1428         } else if (ins->op.special == BI_SPECIAL_FRSQ) {
1429                 op = fp16 ?
1430                         (Y ? BIFROST_ADD_OP_FRSQ_FAST_F16_Y :
1431                         BIFROST_ADD_OP_FRSQ_FAST_F16_X) :
1432                         BIFROST_ADD_OP_FRSQ_FAST_F32;
1433 
1434         } else if (ins->op.special == BI_SPECIAL_EXP2_LOW) {
1435                 assert(!fp16);
1436                 return bi_pack_add_2src(ins, regs, BIFROST_ADD_OP_FEXP2_FAST);
1437         } else if (ins->op.special == BI_SPECIAL_IABS) {
1438                 assert(ins->src_types[0] == nir_type_int32);
1439                 op = BIFROST_ADD_OP_IABS_32;
1440         } else {
1441                 unreachable("Unknown special op");
1442         }
1443 
1444         return bi_pack_add_1src(ins, regs, op);
1445 }
1446 
1447 static unsigned
bi_pack_add_table(bi_instruction * ins,bi_registers * regs)1448 bi_pack_add_table(bi_instruction *ins, bi_registers *regs)
1449 {
1450         unsigned op = 0;
1451         assert(ins->dest_type == nir_type_float32);
1452 
1453         op = BIFROST_ADD_OP_LOG2_HELP;
1454         return bi_pack_add_1src(ins, regs, op);
1455 }
1456 static unsigned
bi_pack_add_tex_compact(bi_clause * clause,bi_instruction * ins,bi_registers * regs,gl_shader_stage stage)1457 bi_pack_add_tex_compact(bi_clause *clause, bi_instruction *ins, bi_registers *regs, gl_shader_stage stage)
1458 {
1459         bool f16 = ins->dest_type == nir_type_float16;
1460         bool vtx = stage != MESA_SHADER_FRAGMENT;
1461 
1462         struct bifrost_tex_compact pack = {
1463                 .src0 = bi_get_src(ins, regs, 0),
1464                 .src1 = bi_get_src(ins, regs, 1),
1465                 .op = f16 ? BIFROST_ADD_OP_TEX_COMPACT_F16(vtx) :
1466                         BIFROST_ADD_OP_TEX_COMPACT_F32(vtx),
1467                 .compute_lod = !vtx,
1468                 .tex_index = ins->texture.texture_index,
1469                 .sampler_index = ins->texture.sampler_index
1470         };
1471 
1472         bi_write_data_register(clause, ins);
1473         RETURN_PACKED(pack);
1474 }
1475 
1476 static unsigned
bi_pack_add_select(bi_instruction * ins,bi_registers * regs)1477 bi_pack_add_select(bi_instruction *ins, bi_registers *regs)
1478 {
1479         unsigned size = nir_alu_type_get_type_size(ins->src_types[0]);
1480         assert(size == 16);
1481 
1482         unsigned swiz = (ins->swizzle[0][0] | (ins->swizzle[1][0] << 1));
1483         unsigned op = BIFROST_ADD_SEL_16(swiz);
1484         return bi_pack_add_2src(ins, regs, op);
1485 }
1486 
1487 static enum bifrost_discard_cond
bi_cond_to_discard(enum bi_cond cond,bool * flip)1488 bi_cond_to_discard(enum bi_cond cond, bool *flip)
1489 {
1490         switch (cond){
1491         case BI_COND_GT:
1492                 *flip = true;
1493                 /* fallthrough */
1494         case BI_COND_LT:
1495                 return BIFROST_DISCARD_FLT;
1496         case BI_COND_GE:
1497                 *flip = true;
1498                 /* fallthrough */
1499         case BI_COND_LE:
1500                 return BIFROST_DISCARD_FLE;
1501         case BI_COND_NE:
1502                 return BIFROST_DISCARD_FNE;
1503         case BI_COND_EQ:
1504                 return BIFROST_DISCARD_FEQ;
1505         default:
1506                 unreachable("Invalid op for discard");
1507         }
1508 }
1509 
1510 static unsigned
bi_pack_add_discard(bi_instruction * ins,bi_registers * regs)1511 bi_pack_add_discard(bi_instruction *ins, bi_registers *regs)
1512 {
1513         bool fp16 = ins->src_types[0] == nir_type_float16;
1514         assert(fp16 || ins->src_types[0] == nir_type_float32);
1515 
1516         bool flip = false;
1517         enum bifrost_discard_cond cond = bi_cond_to_discard(ins->cond, &flip);
1518 
1519         struct bifrost_add_discard pack = {
1520                 .src0 = bi_get_src(ins, regs, flip ? 1 : 0),
1521                 .src1 = bi_get_src(ins, regs, flip ? 0 : 1),
1522                 .cond = cond,
1523                 .src0_select = fp16 ? ins->swizzle[0][0] : 0,
1524                 .src1_select = fp16 ? ins->swizzle[1][0] : 0,
1525                 .fp32 = fp16 ? 0 : 1,
1526                 .op = BIFROST_ADD_OP_DISCARD
1527         };
1528 
1529         RETURN_PACKED(pack);
1530 }
1531 
1532 static enum bifrost_icmp_cond
bi_cond_to_icmp(enum bi_cond cond,bool * flip,bool is_unsigned,bool is_16)1533 bi_cond_to_icmp(enum bi_cond cond, bool *flip, bool is_unsigned, bool is_16)
1534 {
1535         switch (cond){
1536         case BI_COND_LT:
1537                 *flip = true;
1538                 /* fallthrough */
1539         case BI_COND_GT:
1540                 return is_unsigned ? (is_16 ? BIFROST_ICMP_IGE : BIFROST_ICMP_UGT)
1541                         : BIFROST_ICMP_IGT;
1542         case BI_COND_LE:
1543                 *flip = true;
1544                 /* fallthrough */
1545         case BI_COND_GE:
1546                 return is_unsigned ? BIFROST_ICMP_UGE :
1547                         (is_16 ? BIFROST_ICMP_UGT : BIFROST_ICMP_IGE);
1548         case BI_COND_NE:
1549                 return BIFROST_ICMP_NEQ;
1550         case BI_COND_EQ:
1551                 return BIFROST_ICMP_EQ;
1552         default:
1553                 unreachable("Invalid op for icmp");
1554         }
1555 }
1556 
1557 static unsigned
bi_pack_add_icmp32(bi_instruction * ins,bi_registers * regs,bool flip,enum bifrost_icmp_cond cond)1558 bi_pack_add_icmp32(bi_instruction *ins, bi_registers *regs, bool flip,
1559                 enum bifrost_icmp_cond cond)
1560 {
1561         struct bifrost_add_icmp pack = {
1562                 .src0 = bi_get_src(ins, regs, flip ? 1 : 0),
1563                 .src1 = bi_get_src(ins, regs, flip ? 0 : 1),
1564                 .cond = cond,
1565                 .sz = 1,
1566                 .d3d = true,
1567                 .op = BIFROST_ADD_OP_ICMP_32
1568         };
1569 
1570         RETURN_PACKED(pack);
1571 }
1572 
1573 static unsigned
bi_pack_add_icmp16(bi_instruction * ins,bi_registers * regs,bool flip,enum bifrost_icmp_cond cond)1574 bi_pack_add_icmp16(bi_instruction *ins, bi_registers *regs, bool flip,
1575                 enum bifrost_icmp_cond cond)
1576 {
1577         struct bifrost_add_icmp16 pack = {
1578                 .src0 = bi_get_src(ins, regs, flip ? 1 : 0),
1579                 .src1 = bi_get_src(ins, regs, flip ? 0 : 1),
1580                 .src0_swizzle = bi_swiz16(ins, flip ? 1 : 0),
1581                 .src1_swizzle = bi_swiz16(ins, flip ? 0 : 1),
1582                 .cond = cond,
1583                 .d3d = true,
1584                 .op = BIFROST_ADD_OP_ICMP_16
1585         };
1586 
1587         RETURN_PACKED(pack);
1588 }
1589 
1590 static unsigned
bi_pack_add_cmp(bi_instruction * ins,bi_registers * regs)1591 bi_pack_add_cmp(bi_instruction *ins, bi_registers *regs)
1592 {
1593         nir_alu_type Tl = ins->src_types[0];
1594         nir_alu_type Tr = ins->src_types[1];
1595         nir_alu_type Bl = nir_alu_type_get_base_type(Tl);
1596 
1597         if (Bl == nir_type_uint || Bl == nir_type_int) {
1598                 assert(Tl == Tr);
1599                 unsigned sz = nir_alu_type_get_type_size(Tl);
1600 
1601                 bool flip = false;
1602 
1603                 enum bifrost_icmp_cond cond = bi_cond_to_icmp(
1604                                 sz == 16 ? /*bi_invert_cond*/(ins->cond) : ins->cond,
1605                                 &flip, Bl == nir_type_uint, sz == 16);
1606 
1607                 if (sz == 32)
1608                         return bi_pack_add_icmp32(ins, regs, flip, cond);
1609                 else if (sz == 16)
1610                         return bi_pack_add_icmp16(ins, regs, flip, cond);
1611                 else
1612                         unreachable("TODO");
1613         } else {
1614                 unreachable("TODO");
1615         }
1616 }
1617 
1618 static unsigned
bi_pack_add_imath(bi_instruction * ins,bi_registers * regs)1619 bi_pack_add_imath(bi_instruction *ins, bi_registers *regs)
1620 {
1621         /* TODO: 32+16 add */
1622         assert(ins->src_types[0] == ins->src_types[1]);
1623         unsigned sz = nir_alu_type_get_type_size(ins->src_types[0]);
1624         enum bi_imath_op p = ins->op.imath;
1625 
1626         unsigned op = 0;
1627 
1628         if (sz == 8) {
1629                 op = (p == BI_IMATH_ADD) ? BIFROST_ADD_IADD_8 :
1630                         BIFROST_ADD_ISUB_8;
1631         } else if (sz == 16) {
1632                 op = (p == BI_IMATH_ADD) ? BIFROST_ADD_IADD_16 :
1633                         BIFROST_ADD_ISUB_16;
1634         } else if (sz == 32) {
1635                 op = (p == BI_IMATH_ADD) ? BIFROST_ADD_IADD_32 :
1636                         BIFROST_ADD_ISUB_32;
1637         } else {
1638                 unreachable("64-bit todo");
1639         }
1640 
1641         return bi_pack_add_2src(ins, regs, op);
1642 }
1643 
1644 static unsigned
bi_pack_add_branch_cond(bi_instruction * ins,bi_registers * regs)1645 bi_pack_add_branch_cond(bi_instruction *ins, bi_registers *regs)
1646 {
1647         assert(ins->cond == BI_COND_EQ);
1648         assert(ins->src[1] == BIR_INDEX_ZERO);
1649 
1650         unsigned zero_ctrl = 0;
1651         unsigned size = nir_alu_type_get_type_size(ins->src_types[0]);
1652 
1653         if (size == 16) {
1654                 /* See BR_SIZE_ZERO swizzle disassembly */
1655                 zero_ctrl = ins->swizzle[0][0] ? 1 : 2;
1656         } else {
1657                 assert(size == 32);
1658         }
1659 
1660         /* EQ swap to NE */
1661         bool port_swapped = false;
1662 
1663         /* We assigned the constant port to fetch the branch offset so we can
1664          * just passthrough here. We put in the HI slot to match the blob since
1665          * that's where the magic flags end up */
1666         struct bifrost_branch pack = {
1667                 .src0 = bi_get_src(ins, regs, 0),
1668                 .src1 = (zero_ctrl << 1) | !port_swapped,
1669                 .src2 = BIFROST_SRC_CONST_HI,
1670                 .cond = BR_COND_EQ,
1671                 .size = BR_SIZE_ZERO,
1672                 .op = BIFROST_ADD_OP_BRANCH
1673         };
1674 
1675         RETURN_PACKED(pack);
1676 }
1677 
1678 static unsigned
bi_pack_add_branch_uncond(bi_instruction * ins,bi_registers * regs)1679 bi_pack_add_branch_uncond(bi_instruction *ins, bi_registers *regs)
1680 {
1681         struct bifrost_branch pack = {
1682                 /* It's unclear what these bits actually mean */
1683                 .src0 = BIFROST_SRC_CONST_LO,
1684                 .src1 = BIFROST_SRC_PASS_FMA,
1685 
1686                 /* Offset, see above */
1687                 .src2 = BIFROST_SRC_CONST_HI,
1688 
1689                 /* All ones in fact */
1690                 .cond = (BR_ALWAYS & 0x7),
1691                 .size = (BR_ALWAYS >> 3),
1692                 .op = BIFROST_ADD_OP_BRANCH
1693         };
1694 
1695         RETURN_PACKED(pack);
1696 }
1697 
1698 static unsigned
bi_pack_add_branch(bi_instruction * ins,bi_registers * regs)1699 bi_pack_add_branch(bi_instruction *ins, bi_registers *regs)
1700 {
1701         if (ins->cond == BI_COND_ALWAYS)
1702                 return bi_pack_add_branch_uncond(ins, regs);
1703         else
1704                 return bi_pack_add_branch_cond(ins, regs);
1705 }
1706 
1707 static unsigned
bi_pack_add(bi_clause * clause,bi_bundle bundle,bi_registers * regs,gl_shader_stage stage)1708 bi_pack_add(bi_clause *clause, bi_bundle bundle, bi_registers *regs, gl_shader_stage stage)
1709 {
1710         if (!bundle.add)
1711                 return BIFROST_ADD_NOP;
1712 
1713         switch (bundle.add->type) {
1714         case BI_ADD:
1715                 return bi_pack_add_addmin(bundle.add, regs);
1716         case BI_ATEST:
1717                 return bi_pack_add_atest(clause, bundle.add, regs);
1718         case BI_BRANCH:
1719                 return bi_pack_add_branch(bundle.add, regs);
1720         case BI_CMP:
1721                 return bi_pack_add_cmp(bundle.add, regs);
1722         case BI_BLEND:
1723                 return bi_pack_add_blend(clause, bundle.add, regs);
1724         case BI_BITWISE:
1725                 unreachable("Packing todo");
1726         case BI_CONVERT:
1727 		return bi_pack_convert(bundle.add, regs, false);
1728         case BI_DISCARD:
1729                 return bi_pack_add_discard(bundle.add, regs);
1730         case BI_FREXP:
1731                 unreachable("Packing todo");
1732         case BI_IMATH:
1733                 return bi_pack_add_imath(bundle.add, regs);
1734         case BI_LOAD:
1735                 unreachable("Packing todo");
1736         case BI_LOAD_ATTR:
1737                 return bi_pack_add_ld_attr(clause, bundle.add, regs);
1738         case BI_LOAD_UNIFORM:
1739                 return bi_pack_add_ld_ubo(clause, bundle.add, regs);
1740         case BI_LOAD_VAR:
1741                 return bi_pack_add_ld_vary(clause, bundle.add, regs);
1742         case BI_LOAD_VAR_ADDRESS:
1743                 return bi_pack_add_ld_var_addr(clause, bundle.add, regs);
1744         case BI_MINMAX:
1745                 return bi_pack_add_addmin(bundle.add, regs);
1746         case BI_MOV:
1747         case BI_STORE:
1748                 unreachable("Packing todo");
1749         case BI_STORE_VAR:
1750                 return bi_pack_add_st_vary(clause, bundle.add, regs);
1751         case BI_SPECIAL:
1752                 return bi_pack_add_special(bundle.add, regs);
1753         case BI_TABLE:
1754                 return bi_pack_add_table(bundle.add, regs);
1755         case BI_SELECT:
1756                 return bi_pack_add_select(bundle.add, regs);
1757         case BI_TEX:
1758                 if (bundle.add->op.texture == BI_TEX_COMPACT)
1759                         return bi_pack_add_tex_compact(clause, bundle.add, regs, stage);
1760                 else
1761                         unreachable("Unknown tex type");
1762         case BI_ROUND:
1763                 unreachable("Packing todo");
1764         default:
1765                 unreachable("Cannot encode class as ADD");
1766         }
1767 }
1768 
/* A packed bundle, wider than 64 bits and therefore split into two words. */
struct bi_packed_bundle {
        /* Low 64 bits of the encoding */
        uint64_t lo;

        /* Remaining high bits of the encoding */
        uint64_t hi;
};
1773 
1774 /* We must ensure port 1 > port 0 for the 63-x trick to function, so we fix
1775  * this up at pack time. (Scheduling doesn't care.) */
1776 
1777 static void
bi_flip_ports(bi_registers * regs)1778 bi_flip_ports(bi_registers *regs)
1779 {
1780         if (regs->enabled[0] && regs->enabled[1] && regs->port[1] < regs->port[0]) {
1781                 unsigned temp = regs->port[0];
1782                 regs->port[0] = regs->port[1];
1783                 regs->port[1] = temp;
1784         }
1785 
1786 }
1787 
1788 static struct bi_packed_bundle
bi_pack_bundle(bi_clause * clause,bi_bundle bundle,bi_bundle prev,bool first_bundle,gl_shader_stage stage)1789 bi_pack_bundle(bi_clause *clause, bi_bundle bundle, bi_bundle prev, bool first_bundle, gl_shader_stage stage)
1790 {
1791         bi_assign_ports(&bundle, &prev);
1792         bi_assign_uniform_constant(clause, &bundle.regs, bundle);
1793         bundle.regs.first_instruction = first_bundle;
1794 
1795         bi_flip_ports(&bundle.regs);
1796 
1797         uint64_t reg = bi_pack_registers(bundle.regs);
1798         uint64_t fma = bi_pack_fma(clause, bundle, &bundle.regs);
1799         uint64_t add = bi_pack_add(clause, bundle, &bundle.regs, stage);
1800 
1801         struct bi_packed_bundle packed = {
1802                 .lo = reg | (fma << 35) | ((add & 0b111111) << 58),
1803                 .hi = add >> 6
1804         };
1805 
1806         return packed;
1807 }
1808 
1809 /* Packs the next two constants as a dedicated constant quadword at the end of
1810  * the clause, returning the number packed. There are two cases to consider:
1811  *
1812  * Case #1: Branching is not used. For a single constant copy the upper nibble
1813  * over, easy.
1814  *
1815  * Case #2: Branching is used. For a single constant, it suffices to set the
1816  * upper nibble to 4 and leave the latter constant 0, which matches what the
1817  * blob does.
1818  *
1819  * Extending to multiple constants is considerably more tricky and left for
1820  * future work.
1821  */
1822 
1823 static unsigned
bi_pack_constants(bi_context * ctx,bi_clause * clause,unsigned index,struct util_dynarray * emission)1824 bi_pack_constants(bi_context *ctx, bi_clause *clause,
1825                 unsigned index,
1826                 struct util_dynarray *emission)
1827 {
1828         /* After these two, are we done? Determines tag */
1829         bool done = clause->constant_count <= (index + 2);
1830         bool only = clause->constant_count <= (index + 1);
1831 
1832         /* Is the constant we're packing for a branch? */
1833         bool branches = clause->branch_constant && done;
1834 
1835         /* TODO: Pos */
1836         assert(index == 0 && clause->bundle_count == 1);
1837         assert(only);
1838 
1839         /* Compute branch offset instead of a dummy 0 */
1840         if (branches) {
1841                 bi_instruction *br = clause->bundles[clause->bundle_count - 1].add;
1842                 assert(br && br->type == BI_BRANCH && br->branch_target);
1843 
1844                 /* Put it in the high place */
1845                 int32_t qwords = bi_block_offset(ctx, clause, br->branch_target);
1846                 int32_t bytes = qwords * 16;
1847 
1848                 /* Copy so we get proper sign behaviour */
1849                 uint32_t raw = 0;
1850                 memcpy(&raw, &bytes, sizeof(raw));
1851 
1852                 /* Clear off top bits for the magic bits */
1853                 raw &= ~0xF0000000;
1854 
1855                 /* Put in top 32-bits */
1856                 clause->constants[index + 0] = ((uint64_t) raw) << 32ull;
1857         }
1858 
1859         uint64_t hi = clause->constants[index + 0] >> 60ull;
1860 
1861         struct bifrost_fmt_constant quad = {
1862                 .pos = 0, /* TODO */
1863                 .tag = done ? BIFROST_FMTC_FINAL : BIFROST_FMTC_CONSTANTS,
1864                 .imm_1 = clause->constants[index + 0] >> 4,
1865                 .imm_2 = ((hi < 8) ? (hi << 60ull) : 0) >> 4,
1866         };
1867 
1868         if (branches) {
1869                 /* Branch offsets are less than 60-bits so this should work at
1870                  * least for now */
1871                 quad.imm_1 |= (4ull << 60ull) >> 4;
1872                 assert (hi == 0);
1873         }
1874 
1875         /* XXX: On G71, Connor observed that the difference of the top 4 bits
1876          * of the second constant with the first must be less than 8, otherwise
1877          * we have to swap them. On G52, I'm able to reproduce a similar issue
1878          * but with a different workaround (modeled above with a single
1879          * constant, unclear how to workaround for multiple constants.) Further
1880          * investigation needed. Possibly an errata. XXX */
1881 
1882         util_dynarray_append(emission, struct bifrost_fmt_constant, quad);
1883 
1884         return 2;
1885 }
1886 
1887 static void
bi_pack_clause(bi_context * ctx,bi_clause * clause,bi_clause * next_1,bi_clause * next_2,struct util_dynarray * emission,gl_shader_stage stage)1888 bi_pack_clause(bi_context *ctx, bi_clause *clause,
1889                 bi_clause *next_1, bi_clause *next_2,
1890                 struct util_dynarray *emission, gl_shader_stage stage)
1891 {
1892         struct bi_packed_bundle ins_1 = bi_pack_bundle(clause, clause->bundles[0], clause->bundles[0], true, stage);
1893         assert(clause->bundle_count == 1);
1894 
1895         /* Used to decide if we elide writes */
1896         bool is_fragment = ctx->stage == MESA_SHADER_FRAGMENT;
1897 
1898         /* State for packing constants throughout */
1899         unsigned constant_index = 0;
1900 
1901         struct bifrost_fmt1 quad_1 = {
1902                 .tag = clause->constant_count ? BIFROST_FMT1_CONSTANTS : BIFROST_FMT1_FINAL,
1903                 .header = bi_pack_header(clause, next_1, next_2, is_fragment),
1904                 .ins_1 = ins_1.lo,
1905                 .ins_2 = ins_1.hi & ((1 << 11) - 1),
1906                 .ins_0 = (ins_1.hi >> 11) & 0b111,
1907         };
1908 
1909         util_dynarray_append(emission, struct bifrost_fmt1, quad_1);
1910 
1911         /* Pack the remaining constants */
1912 
1913         while (constant_index < clause->constant_count) {
1914                 constant_index += bi_pack_constants(ctx, clause,
1915                                 constant_index, emission);
1916         }
1917 }
1918 
1919 static bi_clause *
bi_next_clause(bi_context * ctx,pan_block * block,bi_clause * clause)1920 bi_next_clause(bi_context *ctx, pan_block *block, bi_clause *clause)
1921 {
1922         /* Try the first clause in this block if we're starting from scratch */
1923         if (!clause && !list_is_empty(&((bi_block *) block)->clauses))
1924                 return list_first_entry(&((bi_block *) block)->clauses, bi_clause, link);
1925 
1926         /* Try the next clause in this block */
1927         if (clause && clause->link.next != &((bi_block *) block)->clauses)
1928                 return list_first_entry(&(clause->link), bi_clause, link);
1929 
1930         /* Try the next block, or the one after that if it's empty, etc .*/
1931         pan_block *next_block = pan_next_block(block);
1932 
1933         bi_foreach_block_from(ctx, next_block, block) {
1934                 bi_block *blk = (bi_block *) block;
1935 
1936                 if (!list_is_empty(&blk->clauses))
1937                         return list_first_entry(&(blk->clauses), bi_clause, link);
1938         }
1939 
1940         return NULL;
1941 }
1942 
1943 void
bi_pack(bi_context * ctx,struct util_dynarray * emission)1944 bi_pack(bi_context *ctx, struct util_dynarray *emission)
1945 {
1946         util_dynarray_init(emission, NULL);
1947 
1948         bi_foreach_block(ctx, _block) {
1949                 bi_block *block = (bi_block *) _block;
1950 
1951                 /* Passthrough the first clause of where we're branching to for
1952                  * the last clause of the block (the clause with the branch) */
1953 
1954                 bi_clause *succ_clause = block->base.successors[1] ?
1955                         bi_next_clause(ctx, block->base.successors[0], NULL) : NULL;
1956 
1957                 bi_foreach_clause_in_block(block, clause) {
1958                         bool is_last = clause->link.next == &block->clauses;
1959 
1960                         bi_clause *next = bi_next_clause(ctx, _block, clause);
1961                         bi_clause *next_2 = is_last ? succ_clause : NULL;
1962 
1963                         bi_pack_clause(ctx, clause, next, next_2, emission, ctx->stage);
1964                 }
1965         }
1966 }
1967