1 /*
2  * Copyright © 2016 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <inttypes.h>
25 #include "util/format/u_format.h"
26 #include "util/u_helpers.h"
27 #include "util/u_math.h"
28 #include "util/u_memory.h"
29 #include "util/ralloc.h"
30 #include "util/hash_table.h"
31 #include "compiler/nir/nir.h"
32 #include "compiler/nir/nir_builder.h"
33 #include "common/v3d_device_info.h"
34 #include "v3d_compiler.h"
35 
36 /* We don't do any address packing. */
37 #define __gen_user_data void
38 #define __gen_address_type uint32_t
39 #define __gen_address_offset(reloc) (*reloc)
40 #define __gen_emit_reloc(cl, reloc)
41 #include "cle/v3d_packet_v41_pack.h"
42 
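/* Fields of the low byte of the config word for TMU general lookups: the
 * per-quad vs. per-pixel flag lives in bit 7 and the return data type in the
 * low bits (see the config word assembled in ntq_emit_tmu_general()).
 */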
43 #define GENERAL_TMU_LOOKUP_PER_QUAD                 (0 << 7)
44 #define GENERAL_TMU_LOOKUP_PER_PIXEL                (1 << 7)
45 #define GENERAL_TMU_LOOKUP_TYPE_8BIT_I              (0 << 0)
46 #define GENERAL_TMU_LOOKUP_TYPE_16BIT_I             (1 << 0)
47 #define GENERAL_TMU_LOOKUP_TYPE_VEC2                (2 << 0)
48 #define GENERAL_TMU_LOOKUP_TYPE_VEC3                (3 << 0)
49 #define GENERAL_TMU_LOOKUP_TYPE_VEC4                (4 << 0)
50 #define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI             (5 << 0)
51 #define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI            (6 << 0)
52 #define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI            (7 << 0)
53 
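/* Thread synchronization (TSY) operation codes. */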
54 #define V3D_TSY_SET_QUORUM          0
55 #define V3D_TSY_INC_WAITERS         1
56 #define V3D_TSY_DEC_WAITERS         2
57 #define V3D_TSY_INC_QUORUM          3
58 #define V3D_TSY_DEC_QUORUM          4
59 #define V3D_TSY_FREE_ALL            5
60 #define V3D_TSY_RELEASE             6
61 #define V3D_TSY_ACQUIRE             7
62 #define V3D_TSY_WAIT                8
63 #define V3D_TSY_WAIT_INC            9
64 #define V3D_TSY_WAIT_CHECK          10
65 #define V3D_TSY_WAIT_INC_CHECK      11
66 #define V3D_TSY_WAIT_CV             12
67 #define V3D_TSY_INC_SEMAPHORE       13
68 #define V3D_TSY_DEC_SEMAPHORE       14
69 #define V3D_TSY_SET_QUORUM_FREE_ALL 15
70 
71 enum v3d_tmu_op_type
72 {
73         V3D_TMU_OP_TYPE_REGULAR,
74         V3D_TMU_OP_TYPE_ATOMIC,
75         V3D_TMU_OP_TYPE_CACHE
76 };
77 
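/* Classifies a TMU operation as a regular access, an atomic, or a cache
 * operation; for some ops this depends on whether it is used as a write.
 */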
78 static enum v3d_tmu_op_type
79 v3d_tmu_get_type_from_op(uint32_t tmu_op, bool is_write)
80 {
81         switch(tmu_op) {
82         case V3D_TMU_OP_WRITE_ADD_READ_PREFETCH:
83         case V3D_TMU_OP_WRITE_SUB_READ_CLEAR:
84         case V3D_TMU_OP_WRITE_XCHG_READ_FLUSH:
85         case V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH:
86         case V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR:
87                 return is_write ? V3D_TMU_OP_TYPE_ATOMIC : V3D_TMU_OP_TYPE_CACHE;
88         case V3D_TMU_OP_WRITE_UMAX:
89         case V3D_TMU_OP_WRITE_SMIN:
90         case V3D_TMU_OP_WRITE_SMAX:
91                 assert(is_write);
92                 FALLTHROUGH;
93         case V3D_TMU_OP_WRITE_AND_READ_INC:
94         case V3D_TMU_OP_WRITE_OR_READ_DEC:
95         case V3D_TMU_OP_WRITE_XOR_READ_NOT:
96                 return V3D_TMU_OP_TYPE_ATOMIC;
97         case V3D_TMU_OP_REGULAR:
98                 return V3D_TMU_OP_TYPE_REGULAR;
99 
100         default:
101                 unreachable("Unknown tmu_op\n");
102         }
103 }
104 static void
105 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
106 
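/* Grows *regs to at least decl_size entries, doubling the allocation each
 * time and initializing any new entries to c->undef.
 */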
107 static void
108 resize_qreg_array(struct v3d_compile *c,
109                   struct qreg **regs,
110                   uint32_t *size,
111                   uint32_t decl_size)
112 {
113         if (*size >= decl_size)
114                 return;
115 
116         uint32_t old_size = *size;
117         *size = MAX2(*size * 2, decl_size);
118         *regs = reralloc(c, *regs, struct qreg, *size);
119         if (!*regs) {
120                 fprintf(stderr, "Malloc failure\n");
121                 abort();
122         }
123 
124         for (uint32_t i = old_size; i < *size; i++)
125                 (*regs)[i] = c->undef;
126 }
127 
128 static void
129 resize_interp_array(struct v3d_compile *c,
130                     struct v3d_interp_input **regs,
131                     uint32_t *size,
132                     uint32_t decl_size)
133 {
134         if (*size >= decl_size)
135                 return;
136 
137         uint32_t old_size = *size;
138         *size = MAX2(*size * 2, decl_size);
139         *regs = reralloc(c, *regs, struct v3d_interp_input, *size);
140         if (!*regs) {
141                 fprintf(stderr, "Malloc failure\n");
142                 abort();
143         }
144 
145         for (uint32_t i = old_size; i < *size; i++) {
146                 (*regs)[i].vp = c->undef;
147                 (*regs)[i].C = c->undef;
148         }
149 }
150 
151 void
152 vir_emit_thrsw(struct v3d_compile *c)
153 {
154         if (c->threads == 1)
155                 return;
156 
157         /* Always thread switch after each texture operation for now.
158          *
159          * We could do better by batching a bunch of texture fetches up and
160          * then doing one thread switch and collecting all their results
161          * afterward.
162          */
163         c->last_thrsw = vir_NOP(c);
164         c->last_thrsw->qpu.sig.thrsw = true;
165         c->last_thrsw_at_top_level = !c->in_control_flow;
166 
167         /* We need to lock the scoreboard before any TLB access happens. If this
168          * thread switch comes after we have emitted a TLB load, then it means
169          * that we can't lock on the last thread switch any more.
170          */
171         if (c->emitted_tlb_load)
172                 c->lock_scoreboard_on_first_thrsw = true;
173 }
174 
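/* If an atomic add's operand is a constant +1/-1 we can use the TMU INC/DEC
 * ops instead, which don't need a data (TMUD) write; anything else uses the
 * generic ADD op.
 */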
175 uint32_t
176 v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
177 {
178         if (nir_src_is_const(instr->src[src])) {
179                 int64_t add_val = nir_src_as_int(instr->src[src]);
180                 if (add_val == 1)
181                         return V3D_TMU_OP_WRITE_AND_READ_INC;
182                 else if (add_val == -1)
183                         return V3D_TMU_OP_WRITE_OR_READ_DEC;
184         }
185 
186         return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
187 }
188 
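/* Maps a NIR intrinsic to the TMU operation that implements it. */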
189 static uint32_t
190 v3d_general_tmu_op(nir_intrinsic_instr *instr)
191 {
192         switch (instr->intrinsic) {
193         case nir_intrinsic_load_ssbo:
194         case nir_intrinsic_load_ubo:
195         case nir_intrinsic_load_uniform:
196         case nir_intrinsic_load_shared:
197         case nir_intrinsic_load_scratch:
198         case nir_intrinsic_store_ssbo:
199         case nir_intrinsic_store_shared:
200         case nir_intrinsic_store_scratch:
201                 return V3D_TMU_OP_REGULAR;
202         case nir_intrinsic_ssbo_atomic_add:
203                 return v3d_get_op_for_atomic_add(instr, 2);
204         case nir_intrinsic_shared_atomic_add:
205                 return v3d_get_op_for_atomic_add(instr, 1);
206         case nir_intrinsic_ssbo_atomic_imin:
207         case nir_intrinsic_shared_atomic_imin:
208                 return V3D_TMU_OP_WRITE_SMIN;
209         case nir_intrinsic_ssbo_atomic_umin:
210         case nir_intrinsic_shared_atomic_umin:
211                 return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
212         case nir_intrinsic_ssbo_atomic_imax:
213         case nir_intrinsic_shared_atomic_imax:
214                 return V3D_TMU_OP_WRITE_SMAX;
215         case nir_intrinsic_ssbo_atomic_umax:
216         case nir_intrinsic_shared_atomic_umax:
217                 return V3D_TMU_OP_WRITE_UMAX;
218         case nir_intrinsic_ssbo_atomic_and:
219         case nir_intrinsic_shared_atomic_and:
220                 return V3D_TMU_OP_WRITE_AND_READ_INC;
221         case nir_intrinsic_ssbo_atomic_or:
222         case nir_intrinsic_shared_atomic_or:
223                 return V3D_TMU_OP_WRITE_OR_READ_DEC;
224         case nir_intrinsic_ssbo_atomic_xor:
225         case nir_intrinsic_shared_atomic_xor:
226                 return V3D_TMU_OP_WRITE_XOR_READ_NOT;
227         case nir_intrinsic_ssbo_atomic_exchange:
228         case nir_intrinsic_shared_atomic_exchange:
229                 return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
230         case nir_intrinsic_ssbo_atomic_comp_swap:
231         case nir_intrinsic_shared_atomic_comp_swap:
232                 return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
233         default:
234                 unreachable("unknown intrinsic op");
235         }
236 }
237 
238 /**
239  * Checks if pipelining a new TMU operation requiring 'components' LDTMUs
240  * would overflow the Output TMU fifo.
241  *
242  * It is not allowed to overflow the Output fifo; however, we can overflow
243  * the Input and Config fifos. Doing that makes the shader stall, but only
244  * for as long as it needs to be able to continue, so it is better for
245  * pipelining to let the QPU stall on these if needed than to try to emit
246  * TMU flushes in the driver.
247  */
248 bool
249 ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components)
250 {
251         if (c->tmu.flush_count >= MAX_TMU_QUEUE_SIZE)
252                 return true;
253 
254         return components > 0 &&
255                c->tmu.output_fifo_size + components > 16 / c->threads;
256 }
257 
258 /**
259  * Emits the thread switch and LDTMU/TMUWT for all outstanding TMU operations,
260  * popping all TMU fifo entries.
261  */
262 void
263 ntq_flush_tmu(struct v3d_compile *c)
264 {
265         if (c->tmu.flush_count == 0)
266                 return;
267 
268         vir_emit_thrsw(c);
269 
270         bool emitted_tmuwt = false;
271         for (int i = 0; i < c->tmu.flush_count; i++) {
272                 if (c->tmu.flush[i].component_mask > 0) {
273                         nir_dest *dest = c->tmu.flush[i].dest;
274                         assert(dest);
275 
276                         for (int j = 0; j < 4; j++) {
277                                 if (c->tmu.flush[i].component_mask & (1 << j)) {
278                                         ntq_store_dest(c, dest, j,
279                                                        vir_MOV(c, vir_LDTMU(c)));
280                                 }
281                         }
282                 } else if (!emitted_tmuwt) {
283                         vir_TMUWT(c);
284                         emitted_tmuwt = true;
285                 }
286         }
287 
288         c->tmu.output_fifo_size = 0;
289         c->tmu.flush_count = 0;
290         _mesa_set_clear(c->tmu.outstanding_regs, NULL);
291 }
292 
293 /**
294  * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller
295  * is responsible for ensuring that doing this doesn't overflow the TMU fifos,
296  * and more specifically, the output fifo, since that can't stall.
297  */
298 void
299 ntq_add_pending_tmu_flush(struct v3d_compile *c,
300                           nir_dest *dest,
301                           uint32_t component_mask)
302 {
303         const uint32_t num_components = util_bitcount(component_mask);
304         assert(!ntq_tmu_fifo_overflow(c, num_components));
305 
306         if (num_components > 0) {
307                 c->tmu.output_fifo_size += num_components;
308                 if (!dest->is_ssa)
309                         _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg);
310         }
311 
312         c->tmu.flush[c->tmu.flush_count].dest = dest;
313         c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
314         c->tmu.flush_count++;
315 
316         if (c->disable_tmu_pipelining)
317                 ntq_flush_tmu(c);
318         else if (c->tmu.flush_count > 1)
319                 c->pipelined_any_tmu = true;
320 }
321 
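/* The TMU general access emission below runs in two passes: MODE_COUNT counts
 * the TMU register writes required (flushing outstanding TMU work if a source
 * depends on it), and MODE_EMIT emits the actual register writes.
 */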
322 enum emit_mode {
323     MODE_COUNT = 0,
324     MODE_EMIT,
325     MODE_LAST,
326 };
327 
328 /**
329  * For a TMU general store instruction:
330  *
331  * In MODE_COUNT mode, records the number of TMU writes required and flushes
332  * any outstanding TMU operations the instruction depends on, but it doesn't
333  * emit any actual register writes.
334  *
335  * In MODE_EMIT mode, emits the data register writes required by the
336  * instruction.
337  */
338 static void
339 emit_tmu_general_store_writes(struct v3d_compile *c,
340                               enum emit_mode mode,
341                               nir_intrinsic_instr *instr,
342                               uint32_t base_const_offset,
343                               uint32_t *writemask,
344                               uint32_t *const_offset,
345                               uint32_t *type_size,
346                               uint32_t *tmu_writes)
347 {
348         struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
349 
350         /* Find the first set of consecutive components that
351          * are enabled in the writemask and emit the TMUD
352          * instructions for them.
353          */
354         assert(*writemask != 0);
355         uint32_t first_component = ffs(*writemask) - 1;
356         uint32_t last_component = first_component;
357         while (*writemask & BITFIELD_BIT(last_component + 1))
358                 last_component++;
359 
360         assert(first_component <= last_component &&
361                last_component < instr->num_components);
362 
363         for (int i = first_component; i <= last_component; i++) {
364                 struct qreg data = ntq_get_src(c, instr->src[0], i);
365                 if (mode == MODE_COUNT)
366                         (*tmu_writes)++;
367                 else
368                         vir_MOV_dest(c, tmud, data);
369         }
370 
371         if (mode == MODE_EMIT) {
372                 /* Update the offset for the TMU write based on the
373                  * first component we are writing.
374                  */
375                 *type_size = nir_src_bit_size(instr->src[0]) / 8;
376                 *const_offset =
377                         base_const_offset + first_component * (*type_size);
378 
379                 /* Clear these components from the writemask */
380                 uint32_t written_mask =
381                         BITFIELD_RANGE(first_component, *tmu_writes);
382                 (*writemask) &= ~written_mask;
383         }
384 }
385 
386 /**
387  * For a TMU general atomic instruction:
388  *
389  * In MODE_COUNT mode, records the number of TMU writes required and flushes
390  * any outstanding TMU operations the instruction depends on, but it doesn't
391  * emit any actual register writes.
392  *
393  * In MODE_EMIT mode, emits the data register writes required by the
394  * instruction.
395  */
396 static void
397 emit_tmu_general_atomic_writes(struct v3d_compile *c,
398                                enum emit_mode mode,
399                                nir_intrinsic_instr *instr,
400                                uint32_t tmu_op,
401                                bool has_index,
402                                uint32_t *tmu_writes)
403 {
404         struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
405 
406         struct qreg data = ntq_get_src(c, instr->src[1 + has_index], 0);
407         if (mode == MODE_COUNT)
408                 (*tmu_writes)++;
409         else
410                 vir_MOV_dest(c, tmud, data);
411 
412         if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
413                 data = ntq_get_src(c, instr->src[2 + has_index], 0);
414                 if (mode == MODE_COUNT)
415                         (*tmu_writes)++;
416                 else
417                         vir_MOV_dest(c, tmud, data);
418         }
419 }
420 
421 /**
422  * For any TMU general instruction:
423  *
424  * In MODE_COUNT mode, records the number of TMU writes required to emit the
425  * address parameter and flushes any outstanding TMU operations the instruction
426  * depends on, but it doesn't emit any actual register writes.
427  *
428  * In MODE_EMIT mode, emits register writes required to emit the address.
429  */
430 static void
431 emit_tmu_general_address_write(struct v3d_compile *c,
432                                enum emit_mode mode,
433                                nir_intrinsic_instr *instr,
434                                uint32_t config,
435                                bool dynamic_src,
436                                int offset_src,
437                                struct qreg base_offset,
438                                uint32_t const_offset,
439                                uint32_t *tmu_writes)
440 {
441         if (mode == MODE_COUNT) {
442                 (*tmu_writes)++;
443                 if (dynamic_src)
444                         ntq_get_src(c, instr->src[offset_src], 0);
445                 return;
446         }
447 
448         if (vir_in_nonuniform_control_flow(c)) {
449                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
450                            V3D_QPU_PF_PUSHZ);
451         }
452 
453         struct qreg tmua;
454         if (config == ~0)
455                 tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
456         else
457                 tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
458 
459         struct qinst *tmu;
460         if (dynamic_src) {
461                 struct qreg offset = base_offset;
462                 if (const_offset != 0) {
463                         offset = vir_ADD(c, offset,
464                                          vir_uniform_ui(c, const_offset));
465                 }
466                 struct qreg data = ntq_get_src(c, instr->src[offset_src], 0);
467                 tmu = vir_ADD_dest(c, tmua, offset, data);
468         } else {
469                 if (const_offset != 0) {
470                         tmu = vir_ADD_dest(c, tmua, base_offset,
471                                            vir_uniform_ui(c, const_offset));
472                 } else {
473                         tmu = vir_MOV_dest(c, tmua, base_offset);
474                 }
475         }
476 
477         if (config != ~0) {
478                 tmu->uniform =
479                         vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
480         }
481 
482         if (vir_in_nonuniform_control_flow(c))
483                 vir_set_cond(tmu, V3D_QPU_COND_IFA);
484 }
485 
486 /**
487  * Implements indirect uniform loads and SSBO accesses through the TMU general
488  * memory access interface.
489  */
490 static void
491 ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
492                      bool is_shared_or_scratch)
493 {
494         uint32_t tmu_op = v3d_general_tmu_op(instr);
495 
496         /* If we were able to replace atomic_add with an inc/dec, then we
497          * need to (and can) do things slightly differently, like not loading
498          * the amount to add/sub, as that is implicit.
499          */
500         bool atomic_add_replaced =
501                 ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
502                   instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
503                  (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
504                   tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));
505 
506         bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
507                          instr->intrinsic == nir_intrinsic_store_scratch ||
508                          instr->intrinsic == nir_intrinsic_store_shared);
509 
510         bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
511                         instr->intrinsic == nir_intrinsic_load_ubo ||
512                         instr->intrinsic == nir_intrinsic_load_ssbo ||
513                         instr->intrinsic == nir_intrinsic_load_scratch ||
514                         instr->intrinsic == nir_intrinsic_load_shared);
515 
516         if (!is_load)
517                 c->tmu_dirty_rcl = true;
518 
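        /* UBO and SSBO accesses carry a buffer index in their first source;
         * shared and scratch accesses don't, which shifts where the offset
         * source lives.
         */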
519         bool has_index = !is_shared_or_scratch;
520 
521         int offset_src;
522         if (instr->intrinsic == nir_intrinsic_load_uniform) {
523                 offset_src = 0;
524         } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
525                    instr->intrinsic == nir_intrinsic_load_ubo ||
526                    instr->intrinsic == nir_intrinsic_load_scratch ||
527                    instr->intrinsic == nir_intrinsic_load_shared ||
528                    atomic_add_replaced) {
529                 offset_src = 0 + has_index;
530         } else if (is_store) {
531                 offset_src = 1 + has_index;
532         } else {
533                 offset_src = 0 + has_index;
534         }
535 
536         bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
537         uint32_t const_offset = 0;
538         if (!dynamic_src)
539                 const_offset = nir_src_as_uint(instr->src[offset_src]);
540 
541         struct qreg base_offset;
542         if (instr->intrinsic == nir_intrinsic_load_uniform) {
543                 const_offset += nir_intrinsic_base(instr);
544                 base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
545                                           v3d_unit_data_create(0, const_offset));
546                 const_offset = 0;
547         } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
548                 uint32_t index = nir_src_as_uint(instr->src[0]);
549                 /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index
550                  * shifted up by 1 (0 is gallium's constant buffer 0).
551                  */
552                 if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
553                         index++;
554 
555                 base_offset =
556                         vir_uniform(c, QUNIFORM_UBO_ADDR,
557                                     v3d_unit_data_create(index, const_offset));
558                 const_offset = 0;
559         } else if (is_shared_or_scratch) {
560                 /* Shared and scratch variables have no buffer index, and all
561                  * start from a common base that we set up at the start of
562                  * dispatch.
563                  */
564                 if (instr->intrinsic == nir_intrinsic_load_scratch ||
565                     instr->intrinsic == nir_intrinsic_store_scratch) {
566                         base_offset = c->spill_base;
567                 } else {
568                         base_offset = c->cs_shared_offset;
569                         const_offset += nir_intrinsic_base(instr);
570                 }
571         } else {
572                 base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
573                                           nir_src_as_uint(instr->src[is_store ?
574                                                                       1 : 0]));
575         }
576 
577         /* We are ready to emit TMU register writes now, but before we actually
578          * emit them we need to flush outstanding TMU operations if any of our
579          * writes read from the result of an outstanding TMU operation before
580          * we start the TMU sequence for this operation, since otherwise the
581          * flush could happen in the middle of the TMU sequence we are about to
582          * emit, which is illegal. To do this we run this logic twice: the
583          * first pass counts the required register writes and flushes pending
584          * TMU requests if necessary due to a dependency, and the second pass
585          * emits the actual TMU writes.
586          */
587         const uint32_t dest_components = nir_intrinsic_dest_components(instr);
588         uint32_t base_const_offset = const_offset;
589         uint32_t writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
590         uint32_t tmu_writes = 0;
591         for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
592                 assert(mode == MODE_COUNT || tmu_writes > 0);
593 
594                 uint32_t type_size = 4;
595 
596                 if (is_store) {
597                         emit_tmu_general_store_writes(c, mode, instr,
598                                                       base_const_offset,
599                                                       &writemask,
600                                                       &const_offset,
601                                                       &type_size,
602                                                       &tmu_writes);
603                 } else if (!is_load && !atomic_add_replaced) {
604                         emit_tmu_general_atomic_writes(c, mode, instr,
605                                                        tmu_op, has_index,
606                                                        &tmu_writes);
607                 } else if (is_load) {
608                         type_size = nir_dest_bit_size(instr->dest) / 8;
609                 }
610 
611                 /* For atomics we use 32bit, except for CMPXCHG, which needs
612                  * VEC2. For the rest of the cases we use the number of TMUD
613                  * writes we did to decide the type. For cache operations the
614                  * type is ignored.
615                  */
616                 uint32_t config = 0;
617                 if (mode == MODE_EMIT) {
618                         uint32_t num_components;
619                         if (is_load || atomic_add_replaced) {
620                                 num_components = instr->num_components;
621                         } else {
622                                 assert(tmu_writes > 0);
623                                 num_components = tmu_writes - 1;
624                         }
625                         bool is_atomic =
626                                 v3d_tmu_get_type_from_op(tmu_op, !is_load) ==
627                                 V3D_TMU_OP_TYPE_ATOMIC;
628 
629                         uint32_t perquad =
630                                 is_load && !vir_in_nonuniform_control_flow(c)
631                                 ? GENERAL_TMU_LOOKUP_PER_QUAD
632                                 : GENERAL_TMU_LOOKUP_PER_PIXEL;
633                         config = 0xffffff00 | tmu_op << 3 | perquad;
634 
635                         if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
636                                 config |= GENERAL_TMU_LOOKUP_TYPE_VEC2;
637                         } else if (is_atomic || num_components == 1) {
638                                 switch (type_size) {
639                                 case 4:
640                                         config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
641                                         break;
642                                 case 2:
643                                         config |= GENERAL_TMU_LOOKUP_TYPE_16BIT_UI;
644                                         break;
645                                 case 1:
646                                         config |= GENERAL_TMU_LOOKUP_TYPE_8BIT_UI;
647                                         break;
648                                 default:
649                                         unreachable("Unsupported bitsize");
650                                 }
651                         } else {
652                                 assert(type_size == 4);
653                                 config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
654                                           num_components - 2;
655                         }
656                 }
657 
658                 emit_tmu_general_address_write(c, mode, instr, config,
659                                                dynamic_src, offset_src,
660                                                base_offset, const_offset,
661                                                &tmu_writes);
662 
663                 assert(tmu_writes > 0);
664                 if (mode == MODE_COUNT) {
665                         /* Make sure we won't exceed the 16-entry TMU
666                          * fifo if each thread is storing at the same
667                          * time.
668                          */
669                         while (tmu_writes > 16 / c->threads)
670                                 c->threads /= 2;
671 
672                         /* If pipelining this TMU operation would
673                          * overflow TMU fifos, we need to flush.
674                          */
675                         if (ntq_tmu_fifo_overflow(c, dest_components))
676                                 ntq_flush_tmu(c);
677                 } else {
678                         /* Delay emission of the thread switch and
679                          * LDTMU/TMUWT until we really need to do it to
680                          * improve pipelining.
681                          */
682                         const uint32_t component_mask =
683                                 (1 << dest_components) - 1;
684                         ntq_add_pending_tmu_flush(c, &instr->dest,
685                                                   component_mask);
686                 }
687         }
688 
689         /* nir_lower_wrmasks should've ensured that any writemask on a store
690          * operation only has consecutive bits set, in which case we should've
691          * processed the full writemask above.
692          */
693         assert(writemask == 0);
694 }
695 
696 static struct qreg *
697 ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
698 {
699         struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
700                                           def->num_components);
701         _mesa_hash_table_insert(c->def_ht, def, qregs);
702         return qregs;
703 }
704 
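/* Whether the instruction carries a signal that implicitly loads a register.
 * ntq_store_dest() can't predicate such instructions for a conditional
 * nir_register store and emits a MOV instead.
 */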
705 static bool
706 is_ld_signal(const struct v3d_qpu_sig *sig)
707 {
708         return (sig->ldunif ||
709                 sig->ldunifa ||
710                 sig->ldunifrf ||
711                 sig->ldunifarf ||
712                 sig->ldtmu ||
713                 sig->ldvary ||
714                 sig->ldvpm ||
715                 sig->ldtlb ||
716                 sig->ldtlbu);
717 }
718 
719 static inline bool
720 is_ldunif_signal(const struct v3d_qpu_sig *sig)
721 {
722         return sig->ldunif || sig->ldunifrf;
723 }
724 
725 /**
726  * This function is responsible for getting VIR results into the associated
727  * storage for a NIR instruction.
728  *
729  * If it's a NIR SSA def, then we just set the associated hash table entry to
730  * the new result.
731  *
732  * If it's a NIR reg, then we need to update the existing qreg assigned to the
733  * NIR destination with the incoming value.  To do that without introducing
734  * new MOVs, we require that the incoming qreg either be a uniform, or be
735  * SSA-defined by the previous VIR instruction in the block and rewritable by
736  * this function.  That lets us sneak ahead and insert the SF flag beforehand
737  * (knowing that the previous instruction doesn't depend on flags) and rewrite
738  * its destination to be the NIR reg's destination.
739  */
740 void
741 ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
742                struct qreg result)
743 {
744         struct qinst *last_inst = NULL;
745         if (!list_is_empty(&c->cur_block->instructions))
746                 last_inst = (struct qinst *)c->cur_block->instructions.prev;
747 
748         bool is_reused_uniform =
749                 is_ldunif_signal(&c->defs[result.index]->qpu.sig) &&
750                 last_inst != c->defs[result.index];
751 
752         assert(result.file == QFILE_TEMP && last_inst &&
753                (last_inst == c->defs[result.index] || is_reused_uniform));
754 
755         if (dest->is_ssa) {
756                 assert(chan < dest->ssa.num_components);
757 
758                 struct qreg *qregs;
759                 struct hash_entry *entry =
760                         _mesa_hash_table_search(c->def_ht, &dest->ssa);
761 
762                 if (entry)
763                         qregs = entry->data;
764                 else
765                         qregs = ntq_init_ssa_def(c, &dest->ssa);
766 
767                 qregs[chan] = result;
768         } else {
769                 nir_register *reg = dest->reg.reg;
770                 assert(dest->reg.base_offset == 0);
771                 assert(reg->num_array_elems == 0);
772                 struct hash_entry *entry =
773                         _mesa_hash_table_search(c->def_ht, reg);
774                 struct qreg *qregs = entry->data;
775 
776                 /* If the previous instruction can't be predicated for
777                  * the store into the nir_register, then emit a MOV
778                  * that can be.
779                  */
780                 if (is_reused_uniform ||
781                     (vir_in_nonuniform_control_flow(c) &&
782                      is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig))) {
783                         result = vir_MOV(c, result);
784                         last_inst = c->defs[result.index];
785                 }
786 
787                 /* We know they're both temps, so just rewrite index. */
788                 c->defs[last_inst->dst.index] = NULL;
789                 last_inst->dst.index = qregs[chan].index;
790 
791                 /* If we're in control flow, then make this update of the reg
792                  * conditional on the execution mask.
793                  */
794                 if (vir_in_nonuniform_control_flow(c)) {
795                         last_inst->dst.index = qregs[chan].index;
796 
797                         /* Set the flags to the current exec mask.
798                          */
799                         c->cursor = vir_before_inst(last_inst);
800                         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
801                                    V3D_QPU_PF_PUSHZ);
802                         c->cursor = vir_after_inst(last_inst);
803 
804                         vir_set_cond(last_inst, V3D_QPU_COND_IFA);
805                 }
806         }
807 }
808 
809 /**
810  * This looks up the qreg associated with a particular ssa/reg used as a source
811  * in any instruction.
812  *
813  * It is expected that the definition for any NIR value read as a source has
814  * been emitted by a previous instruction. However, for TMU operations we may
815  * have postponed emission of the thread switch and the LDTMUs required to
816  * read the TMU results until the results are actually used, to improve
817  * pipelining. That can lead to us not finding them here (for SSA defs), or
818  * finding them in the list of registers awaiting a TMU flush (for registers),
819  * meaning that we need to flush outstanding TMU operations to read the
820  * correct value.
821  */
822 struct qreg
823 ntq_get_src(struct v3d_compile *c, nir_src src, int i)
824 {
825         struct hash_entry *entry;
826         if (src.is_ssa) {
827                 assert(i < src.ssa->num_components);
828 
829                 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
830                 if (!entry) {
831                         ntq_flush_tmu(c);
832                         entry = _mesa_hash_table_search(c->def_ht, src.ssa);
833                 }
834         } else {
835                 nir_register *reg = src.reg.reg;
836                 assert(reg->num_array_elems == 0);
837                 assert(src.reg.base_offset == 0);
838                 assert(i < reg->num_components);
839 
840                 if (_mesa_set_search(c->tmu.outstanding_regs, reg))
841                         ntq_flush_tmu(c);
842                 entry = _mesa_hash_table_search(c->def_ht, reg);
843         }
844         assert(entry);
845 
846         struct qreg *qregs = entry->data;
847         return qregs[i];
848 }
849 
850 static struct qreg
851 ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
852                 unsigned src)
853 {
854         assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
855         unsigned chan = ffs(instr->dest.write_mask) - 1;
856         struct qreg r = ntq_get_src(c, instr->src[src].src,
857                                     instr->src[src].swizzle[chan]);
858 
859         assert(!instr->src[src].abs);
860         assert(!instr->src[src].negate);
861 
862         return r;
863 }
864 
865 static struct qreg
866 ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
867 {
868         return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
869 }
870 
871 static void
872 ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
873 {
874         unsigned unit = instr->texture_index;
875         int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
876         int dest_size = nir_tex_instr_dest_size(instr);
877 
878         struct qreg lod = c->undef;
879         if (lod_index != -1)
880                 lod = ntq_get_src(c, instr->src[lod_index].src, 0);
881 
882         for (int i = 0; i < dest_size; i++) {
883                 assert(i < 3);
884                 enum quniform_contents contents;
885 
886                 if (instr->is_array && i == dest_size - 1)
887                         contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
888                 else
889                         contents = QUNIFORM_TEXTURE_WIDTH + i;
890 
891                 struct qreg size = vir_uniform(c, contents, unit);
892 
893                 switch (instr->sampler_dim) {
894                 case GLSL_SAMPLER_DIM_1D:
895                 case GLSL_SAMPLER_DIM_2D:
896                 case GLSL_SAMPLER_DIM_MS:
897                 case GLSL_SAMPLER_DIM_3D:
898                 case GLSL_SAMPLER_DIM_CUBE:
899                 case GLSL_SAMPLER_DIM_BUF:
900                         /* Don't minify the array size. */
901                         if (!(instr->is_array && i == dest_size - 1)) {
902                                 size = ntq_minify(c, size, lod);
903                         }
904                         break;
905 
906                 case GLSL_SAMPLER_DIM_RECT:
907                         /* There's no LOD field for rects */
908                         break;
909 
910                 default:
911                         unreachable("Bad sampler type");
912                 }
913 
914                 ntq_store_dest(c, &instr->dest, i, size);
915         }
916 }
917 
918 static void
919 ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
920 {
921         unsigned unit = instr->texture_index;
922 
923         /* Since each texture sampling op requires uploading uniforms to
924          * reference the texture, there's no HW support for texture size
925          * queries; you just upload uniforms containing the size.
926          */
927         switch (instr->op) {
928         case nir_texop_query_levels:
929                 ntq_store_dest(c, &instr->dest, 0,
930                                vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
931                 return;
932         case nir_texop_texture_samples:
933                 ntq_store_dest(c, &instr->dest, 0,
934                                vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit));
935                 return;
936         case nir_texop_txs:
937                 ntq_emit_txs(c, instr);
938                 return;
939         default:
940                 break;
941         }
942 
943         if (c->devinfo->ver >= 40)
944                 v3d40_vir_emit_tex(c, instr);
945         else
946                 v3d33_vir_emit_tex(c, instr);
947 }
948 
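/* Implements sin/cos on top of the SFU SIN op, which computes sin(pi * x):
 * scale the argument to units of pi (adding 0.5 for cosine), reduce by the
 * rounded number of half-turns, and flip the result's sign bit for odd
 * half-turns via the final XOR.
 */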
949 static struct qreg
950 ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
951 {
952         struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
953         if (is_cos)
954                 input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
955 
956         struct qreg periods = vir_FROUND(c, input);
957         struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods));
958         return vir_XOR(c, sin_output, vir_SHL(c,
959                                               vir_FTOIN(c, periods),
960                                               vir_uniform_ui(c, -1)));
961 }
962 
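/* Computes sign(x) as a float: start from 0.0, set 1.0 if x is non-zero and
 * then overwrite with -1.0 if x is negative, using the Z and N flags.
 */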
963 static struct qreg
964 ntq_fsign(struct v3d_compile *c, struct qreg src)
965 {
966         struct qreg t = vir_get_temp(c);
967 
968         vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
969         vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);
970         vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
971         vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);
972         vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
973         return vir_MOV(c, t);
974 }
975 
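/* gl_FragCoord: X/Y come from the FXCD/FYCD magic registers, Z from the Z
 * payload, and W is the reciprocal of the W payload.
 */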
976 static void
977 emit_fragcoord_input(struct v3d_compile *c, int attr)
978 {
979         c->inputs[attr * 4 + 0] = vir_FXCD(c);
980         c->inputs[attr * 4 + 1] = vir_FYCD(c);
981         c->inputs[attr * 4 + 2] = c->payload_z;
982         c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
983 }
984 
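/* Varying interpolation: the ldvary signal produces the per-pixel varying
 * coefficient in 'vary' and leaves the C coefficient in r5.  Smooth
 * (perspective-correct) varyings compute vary * W + C, noperspective ones
 * compute vary + C, and flat ones just take C.
 */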
985 static struct qreg
986 emit_smooth_varying(struct v3d_compile *c,
987                     struct qreg vary, struct qreg w, struct qreg r5)
988 {
989         return vir_FADD(c, vir_FMUL(c, vary, w), r5);
990 }
991 
992 static struct qreg
993 emit_noperspective_varying(struct v3d_compile *c,
994                            struct qreg vary, struct qreg r5)
995 {
996         return vir_FADD(c, vir_MOV(c, vary), r5);
997 }
998 
999 static struct qreg
1000 emit_flat_varying(struct v3d_compile *c,
1001                   struct qreg vary, struct qreg r5)
1002 {
1003         vir_MOV_dest(c, c->undef, vary);
1004         return vir_MOV(c, r5);
1005 }
1006 
1007 static struct qreg
1008 emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
1009                       int8_t input_idx, uint8_t swizzle, int array_index)
1010 {
1011         struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
1012         struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
1013 
1014         struct qinst *ldvary = NULL;
1015         struct qreg vary;
1016         if (c->devinfo->ver >= 41) {
1017                 ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
1018                                       c->undef, c->undef);
1019                 ldvary->qpu.sig.ldvary = true;
1020                 vary = vir_emit_def(c, ldvary);
1021         } else {
1022                 vir_NOP(c)->qpu.sig.ldvary = true;
1023                 vary = r3;
1024         }
1025 
1026         /* Store the input value before interpolation so we can implement
1027          * GLSL's interpolateAt functions if the shader uses them.
1028          */
1029         if (input_idx >= 0) {
1030                 assert(var);
1031                 c->interp[input_idx].vp = vary;
1032                 c->interp[input_idx].C = vir_MOV(c, r5);
1033                 c->interp[input_idx].mode = var->data.interpolation;
1034         }
1035 
1036         /* For gl_PointCoord input or distance along a line, we'll be called
1037          * with no nir_variable, and these don't count toward VPM size so we
1038          * don't track an input slot.
1039          */
1040         if (!var) {
1041                 assert(input_idx < 0);
1042                 return emit_smooth_varying(c, vary, c->payload_w, r5);
1043         }
1044 
1045         int i = c->num_inputs++;
1046         c->input_slots[i] =
1047                 v3d_slot_from_slot_and_component(var->data.location +
1048                                                  array_index, swizzle);
1049 
1050         struct qreg result;
1051         switch (var->data.interpolation) {
1052         case INTERP_MODE_NONE:
1053         case INTERP_MODE_SMOOTH:
1054                 if (var->data.centroid) {
1055                         BITSET_SET(c->centroid_flags, i);
1056                         result = emit_smooth_varying(c, vary,
1057                                                      c->payload_w_centroid, r5);
1058                 } else {
1059                         result = emit_smooth_varying(c, vary, c->payload_w, r5);
1060                 }
1061                 break;
1062 
1063         case INTERP_MODE_NOPERSPECTIVE:
1064                 BITSET_SET(c->noperspective_flags, i);
1065                 result = emit_noperspective_varying(c, vary, r5);
1066                 break;
1067 
1068         case INTERP_MODE_FLAT:
1069                 BITSET_SET(c->flat_shade_flags, i);
1070                 result = emit_flat_varying(c, vary, r5);
1071                 break;
1072 
1073         default:
1074                 unreachable("Bad interp mode");
1075         }
1076 
1077         if (input_idx >= 0)
1078                 c->inputs[input_idx] = result;
1079         return result;
1080 }
1081 
1082 static void
1083 emit_fragment_input(struct v3d_compile *c, int base_attr, nir_variable *var,
1084                     int array_index, unsigned nelem)
1085 {
1086         for (int i = 0; i < nelem ; i++) {
1087                 int chan = var->data.location_frac + i;
1088                 int input_idx = (base_attr + array_index) * 4 + chan;
1089                 emit_fragment_varying(c, var, input_idx, chan, array_index);
1090         }
1091 }
1092 
1093 static void
1094 emit_compact_fragment_input(struct v3d_compile *c, int attr, nir_variable *var,
1095                             int array_index)
1096 {
1097         /* Compact variables are scalar arrays where each set of 4 elements
1098          * consumes a single location.
1099          */
1100         int loc_offset = array_index / 4;
1101         int chan = var->data.location_frac + array_index % 4;
1102         int input_idx = (attr + loc_offset) * 4  + chan;
1103         emit_fragment_varying(c, var, input_idx, chan, loc_offset);
1104 }
1105 
1106 static void
1107 add_output(struct v3d_compile *c,
1108            uint32_t decl_offset,
1109            uint8_t slot,
1110            uint8_t swizzle)
1111 {
1112         uint32_t old_array_size = c->outputs_array_size;
1113         resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
1114                           decl_offset + 1);
1115 
1116         if (old_array_size != c->outputs_array_size) {
1117                 c->output_slots = reralloc(c,
1118                                            c->output_slots,
1119                                            struct v3d_varying_slot,
1120                                            c->outputs_array_size);
1121         }
1122 
1123         c->output_slots[decl_offset] =
1124                 v3d_slot_from_slot_and_component(slot, swizzle);
1125 }
1126 
1127 /**
1128  * If compare_instr is a supported comparison instruction, emits the
1129  * comparison, returns true, and stores the condition to predicate on in
1130  * *out_cond.  Otherwise returns false.
1131  */
1132 static bool
1133 ntq_emit_comparison(struct v3d_compile *c,
1134                     nir_alu_instr *compare_instr,
1135                     enum v3d_qpu_cond *out_cond)
1136 {
1137         struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
1138         struct qreg src1;
1139         if (nir_op_infos[compare_instr->op].num_inputs > 1)
1140                 src1 = ntq_get_alu_src(c, compare_instr, 1);
1141         bool cond_invert = false;
1142         struct qreg nop = vir_nop_reg();
1143 
1144         switch (compare_instr->op) {
1145         case nir_op_feq32:
1146         case nir_op_seq:
1147                 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1148                 break;
1149         case nir_op_ieq32:
1150                 vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1151                 break;
1152 
1153         case nir_op_fneu32:
1154         case nir_op_sne:
1155                 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1156                 cond_invert = true;
1157                 break;
1158         case nir_op_ine32:
1159                 vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1160                 cond_invert = true;
1161                 break;
1162 
1163         case nir_op_fge32:
1164         case nir_op_sge:
1165                 vir_set_pf(c, vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1166                 break;
1167         case nir_op_ige32:
1168                 vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1169                 cond_invert = true;
1170                 break;
1171         case nir_op_uge32:
1172                 vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
1173                 cond_invert = true;
1174                 break;
1175 
1176         case nir_op_slt:
1177         case nir_op_flt32:
1178                 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);
1179                 break;
1180         case nir_op_ilt32:
1181                 vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1182                 break;
1183         case nir_op_ult32:
1184                 vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
1185                 break;
1186 
1187         case nir_op_i2b32:
1188                 vir_set_pf(c, vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
1189                 cond_invert = true;
1190                 break;
1191 
1192         case nir_op_f2b32:
1193                 vir_set_pf(c, vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
1194                 cond_invert = true;
1195                 break;
1196 
1197         default:
1198                 return false;
1199         }
1200 
1201         *out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;
1202 
1203         return true;
1204 }
1205 
1206 /* Finds the ALU instruction that generates our src value, so that it could
1207  * (potentially) be greedily emitted in the consuming instruction.
1208  */
1209 static struct nir_alu_instr *
1210 ntq_get_alu_parent(nir_src src)
1211 {
1212         if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu)
1213                 return NULL;
1214         nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
1215         if (!instr)
1216                 return NULL;
1217 
1218         /* If the ALU instr's srcs are non-SSA, then we would have to avoid
1219          * moving emission of the ALU instr down past another write of the
1220          * src.
1221          */
1222         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1223                 if (!instr->src[i].src.is_ssa)
1224                         return NULL;
1225         }
1226 
1227         return instr;
1228 }
1229 
1230 /* Turns a NIR bool into a condition code to predicate on. */
1231 static enum v3d_qpu_cond
1232 ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)
1233 {
1234         struct qreg qsrc = ntq_get_src(c, src, 0);
1235         /* skip if we already have src in the flags */
1236         if (qsrc.file == QFILE_TEMP && c->flags_temp == qsrc.index)
1237                 return c->flags_cond;
1238 
1239         nir_alu_instr *compare = ntq_get_alu_parent(src);
1240         if (!compare)
1241                 goto out;
1242 
1243         enum v3d_qpu_cond cond;
1244         if (ntq_emit_comparison(c, compare, &cond))
1245                 return cond;
1246 
1247 out:
1248 
1249         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),
1250                    V3D_QPU_PF_PUSHZ);
1251         return V3D_QPU_COND_IFNA;
1252 }
1253 
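/* Materializes a condition as a NIR-style boolean (~0 / 0) and records which
 * temp holds it so ntq_emit_bool_to_cond() can reuse the flags later.
 */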
1254 static struct qreg
1255 ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond)
1256 {
1257         struct qreg result =
1258                 vir_MOV(c, vir_SEL(c, cond,
1259                                    vir_uniform_ui(c, ~0),
1260                                    vir_uniform_ui(c, 0)));
1261         c->flags_temp = result.index;
1262         c->flags_cond = cond;
1263         return result;
1264 }
1265 
1266 static struct qreg
1267 f2f16_rtz(struct v3d_compile *c, struct qreg f32)
1268 {
1269         /* The GPU doesn't provide a mechanism to modify the f32->f16 rounding
1270          * method and seems to be using RTE by default, so we need to implement
1271          * RTZ rounding in software :-(
1272          *
1273          * The implementation identifies the cases where RTZ applies and
1274          * returns the correct result and for everything else, it just uses
1275          * the default RTE conversion.
1276          */
1277         static bool _first = true;
1278         if (_first && unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) {
1279                 fprintf(stderr, "Shader uses round-toward-zero f32->f16 "
1280                         "conversion which is not supported in hardware.\n");
1281                 _first = false;
1282         }
1283 
1284         struct qinst *inst;
1285         struct qreg tmp;
1286 
1287         struct qreg result = vir_get_temp(c);
1288 
1289         struct qreg mantissa32 = vir_AND(c, f32, vir_uniform_ui(c, 0x007fffff));
1290 
1291         /* Compute sign bit of result */
1292         struct qreg sign = vir_AND(c, vir_SHR(c, f32, vir_uniform_ui(c, 16)),
1293                                    vir_uniform_ui(c, 0x8000));
1294 
1295         /* Check the cases where RTZ rounding is relevant based on exponent */
1296         struct qreg exp32 = vir_AND(c, vir_SHR(c, f32, vir_uniform_ui(c, 23)),
1297                                     vir_uniform_ui(c, 0xff));
1298         struct qreg exp16 = vir_ADD(c, exp32, vir_uniform_ui(c, -127 + 15));
1299 
1300         /* if (exp16 > 30) */
1301         inst = vir_MIN_dest(c, vir_nop_reg(), exp16, vir_uniform_ui(c, 30));
1302         vir_set_pf(c, inst, V3D_QPU_PF_PUSHC);
1303         inst = vir_OR_dest(c, result, sign, vir_uniform_ui(c, 0x7bff));
1304         vir_set_cond(inst, V3D_QPU_COND_IFA);
1305 
1306         /* if (exp16 <= 30) */
1307         inst = vir_OR_dest(c, result,
1308                            vir_OR(c, sign,
1309                                   vir_SHL(c, exp16, vir_uniform_ui(c, 10))),
1310                            vir_SHR(c, mantissa32, vir_uniform_ui(c, 13)));
1311         vir_set_cond(inst, V3D_QPU_COND_IFNA);
1312 
1313         /* if (exp16 <= 0) */
1314         inst = vir_MIN_dest(c, vir_nop_reg(), exp16, vir_uniform_ui(c, 0));
1315         vir_set_pf(c, inst, V3D_QPU_PF_PUSHC);
1316 
1317         tmp = vir_OR(c, mantissa32, vir_uniform_ui(c, 0x800000));
1318         tmp = vir_SHR(c, tmp, vir_SUB(c, vir_uniform_ui(c, 14), exp16));
1319         inst = vir_OR_dest(c, result, sign, tmp);
1320         vir_set_cond(inst, V3D_QPU_COND_IFNA);
1321 
1322         /* Cases where RTZ mode is not relevant: use default RTE conversion.
1323          *
1324          * The cases that are not affected by RTZ are:
1325          *
1326          *  exp16 < -10 || exp32 == 0 || exp32 == 0xff
1327          *
1328          * In V3D we can implement this condition as:
1329          *
1330          * !((exp16 >= -10) && !(exp32 == 0) && !(exp32 == 0xff))
1331          */
1332 
1333         /* exp16 >= -10 */
1334         inst = vir_MIN_dest(c, vir_nop_reg(), exp16, vir_uniform_ui(c, -10));
1335         vir_set_pf(c, inst, V3D_QPU_PF_PUSHC);
1336 
1337         /* && !(exp32 == 0) */
1338         inst = vir_MOV_dest(c, vir_nop_reg(), exp32);
1339         vir_set_uf(c, inst, V3D_QPU_UF_ANDNZ);
1340 
1341         /* && !(exp32 == 0xff) */
1342         inst = vir_XOR_dest(c, vir_nop_reg(), exp32, vir_uniform_ui(c, 0xff));
1343         vir_set_uf(c, inst, V3D_QPU_UF_ANDNZ);
1344 
1345         /* Use regular RTE conversion if condition is False */
1346         inst = vir_FMOV_dest(c, result, f32);
1347         vir_set_pack(inst, V3D_QPU_PACK_L);
1348         vir_set_cond(inst, V3D_QPU_COND_IFNA);
1349 
1350         return vir_MOV(c, result);
1351 }
1352 
1353 /**
1354  * Takes the result value of a signed integer width conversion from a smaller
1355  * type to a larger type and, if needed, applies sign extension to it.
1356  */
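/*
 * For example (illustrative): extending an 8-bit value to 16 bits, an input
 * of 0x80 has its sign bit (1 << 7) set, so the bits 0xff00 are ORed in and
 * the result is 0xff80, while an input of 0x7f is left untouched.
 */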
1357 static struct qreg
1358 sign_extend(struct v3d_compile *c,
1359             struct qreg value,
1360             uint32_t src_bit_size,
1361             uint32_t dst_bit_size)
1362 {
1363         assert(src_bit_size < dst_bit_size);
1364 
1365         struct qreg tmp = vir_MOV(c, value);
1366 
1367         /* Do we need to sign-extend? */
1368         uint32_t sign_mask = 1 << (src_bit_size - 1);
1369         struct qinst *sign_check =
1370                 vir_AND_dest(c, vir_nop_reg(),
1371                              tmp, vir_uniform_ui(c, sign_mask));
1372         vir_set_pf(c, sign_check, V3D_QPU_PF_PUSHZ);
1373 
1374         /* If so, fill in leading sign bits */
1375         uint32_t extend_bits = ~(((1 << src_bit_size) - 1)) &
1376                                ((1ull << dst_bit_size) - 1);
1377         struct qinst *extend_inst =
1378                 vir_OR_dest(c, tmp, tmp,
1379                             vir_uniform_ui(c, extend_bits));
1380         vir_set_cond(extend_inst, V3D_QPU_COND_IFNA);
1381 
1382         return tmp;
1383 }
1384 
1385 static void
1386 ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
1387 {
1388         /* This should always be lowered to ALU operations for V3D. */
1389         assert(!instr->dest.saturate);
1390 
1391         /* Vectors are special in that they have non-scalarized writemasks,
1392          * and just take the first swizzle channel of each argument, in order,
1393          * into each writemask channel.
1394          */
1395         if (instr->op == nir_op_vec2 ||
1396             instr->op == nir_op_vec3 ||
1397             instr->op == nir_op_vec4) {
1398                 struct qreg srcs[4];
1399                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1400                         srcs[i] = ntq_get_src(c, instr->src[i].src,
1401                                               instr->src[i].swizzle[0]);
1402                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1403                         ntq_store_dest(c, &instr->dest.dest, i,
1404                                        vir_MOV(c, srcs[i]));
1405                 return;
1406         }
1407 
1408         /* General case: We can just grab the one used channel per src. */
1409         struct qreg src[nir_op_infos[instr->op].num_inputs];
1410         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1411                 src[i] = ntq_get_alu_src(c, instr, i);
1412         }
1413 
1414         struct qreg result;
1415 
1416         switch (instr->op) {
1417         case nir_op_mov:
1418                 result = vir_MOV(c, src[0]);
1419                 break;
1420 
1421         case nir_op_fneg:
1422                 result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31));
1423                 break;
1424         case nir_op_ineg:
1425                 result = vir_NEG(c, src[0]);
1426                 break;
1427 
1428         case nir_op_fmul:
1429                 result = vir_FMUL(c, src[0], src[1]);
1430                 break;
1431         case nir_op_fadd:
1432                 result = vir_FADD(c, src[0], src[1]);
1433                 break;
1434         case nir_op_fsub:
1435                 result = vir_FSUB(c, src[0], src[1]);
1436                 break;
1437         case nir_op_fmin:
1438                 result = vir_FMIN(c, src[0], src[1]);
1439                 break;
1440         case nir_op_fmax:
1441                 result = vir_FMAX(c, src[0], src[1]);
1442                 break;
1443 
1444         case nir_op_f2i32: {
1445                 nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src);
1446                 if (src0_alu && src0_alu->op == nir_op_fround_even) {
1447                         result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0));
1448                 } else {
1449                         result = vir_FTOIZ(c, src[0]);
1450                 }
1451                 break;
1452         }
1453 
1454         case nir_op_f2u32:
1455                 result = vir_FTOUZ(c, src[0]);
1456                 break;
1457         case nir_op_i2f32:
1458                 result = vir_ITOF(c, src[0]);
1459                 break;
1460         case nir_op_u2f32:
1461                 result = vir_UTOF(c, src[0]);
1462                 break;
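        /* NIR 32-bit booleans are 0 for false and ~0 (all ones) for true, so
         * ANDing with the bit pattern of 1.0f (or with integer 1) yields the
         * expected 0.0/1.0 (or 0/1) result.
         */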
1463         case nir_op_b2f32:
1464                 result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
1465                 break;
1466         case nir_op_b2i32:
1467                 result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
1468                 break;
1469 
1470         case nir_op_f2f16:
1471         case nir_op_f2f16_rtne:
1472                 assert(nir_src_bit_size(instr->src[0].src) == 32);
1473                 result = vir_FMOV(c, src[0]);
1474                 vir_set_pack(c->defs[result.index], V3D_QPU_PACK_L);
1475                 break;
1476 
1477         case nir_op_f2f16_rtz:
1478                 assert(nir_src_bit_size(instr->src[0].src) == 32);
1479                 result = f2f16_rtz(c, src[0]);
1480                 break;
1481 
1482         case nir_op_f2f32:
1483                 assert(nir_src_bit_size(instr->src[0].src) == 16);
1484                 result = vir_FMOV(c, src[0]);
1485                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1486                 break;
1487 
1488         case nir_op_i2i16: {
1489                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1490                 assert(bit_size == 32 || bit_size == 8);
1491                 if (bit_size == 32) {
1492                         /* We don't have integer pack/unpack methods for
1493                          * converting between 16-bit and 32-bit, so we implement
1494                          * the conversion manually by truncating the src.
1495                          */
1496                         result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
1497                 } else {
1498                         struct qreg tmp = vir_AND(c, src[0],
1499                                                   vir_uniform_ui(c, 0xff));
1500                         result = vir_MOV(c, sign_extend(c, tmp, bit_size, 16));
1501                 }
1502                 break;
1503         }
1504 
1505         case nir_op_u2u16: {
1506                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1507                 assert(bit_size == 32 || bit_size == 8);
1508 
1509                 /* We don't have integer pack/unpack methods for converting
1510                  * between 16-bit and 32-bit, so we implement the conversion
1511                  * manually by truncating the src. For the 8-bit case, we
1512                  * want to make sure we don't copy garbage from any of the
1513                  * upper 24 bits.
1514                  */
1515                 if (bit_size == 32)
1516                         result = vir_AND(c, src[0], vir_uniform_ui(c, 0xffff));
1517                 else
1518                         result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
1519                 break;
1520         }
1521 
1522         case nir_op_i2i8:
1523         case nir_op_u2u8:
1524                 assert(nir_src_bit_size(instr->src[0].src) == 32 ||
1525                        nir_src_bit_size(instr->src[0].src) == 16);
1526                 /* We don't have integer pack/unpack methods for converting
1527                  * between 8-bit and 32-bit, so we implement the conversion
1528                  * manually by truncating the src.
1529                  */
1530                 result = vir_AND(c, src[0], vir_uniform_ui(c, 0xff));
1531                 break;
1532 
1533         case nir_op_u2u32: {
1534                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1535                 assert(bit_size == 16 || bit_size == 8);
1536 
1537                 /* We don't have a native 8-bit/16-bit MOV, so we copy all 32 bits
1538                  * from the src but make sure to clear any garbage that may be
1539                  * present in the unused upper bits.
1540                  */
1541                 uint32_t mask = (1 << bit_size) - 1;
1542                 result = vir_AND(c, src[0], vir_uniform_ui(c, mask));
1543                 break;
1544         }
1545 
1546         case nir_op_i2i32: {
1547                 uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1548                 assert(bit_size == 16 || bit_size == 8);
1549 
1550                 uint32_t mask = (1 << bit_size) - 1;
1551                 struct qreg tmp = vir_AND(c, src[0],
1552                                           vir_uniform_ui(c, mask));
1553 
1554                 result = vir_MOV(c, sign_extend(c, tmp, bit_size, 32));
1555                 break;
1556         }
1557 
1558         case nir_op_iadd:
1559                 result = vir_ADD(c, src[0], src[1]);
1560                 break;
1561         case nir_op_ushr:
1562                 result = vir_SHR(c, src[0], src[1]);
1563                 break;
1564         case nir_op_isub:
1565                 result = vir_SUB(c, src[0], src[1]);
1566                 break;
1567         case nir_op_ishr:
1568                 result = vir_ASR(c, src[0], src[1]);
1569                 break;
1570         case nir_op_ishl:
1571                 result = vir_SHL(c, src[0], src[1]);
1572                 break;
1573         case nir_op_imin:
1574                 result = vir_MIN(c, src[0], src[1]);
1575                 break;
1576         case nir_op_umin:
1577                 result = vir_UMIN(c, src[0], src[1]);
1578                 break;
1579         case nir_op_imax:
1580                 result = vir_MAX(c, src[0], src[1]);
1581                 break;
1582         case nir_op_umax:
1583                 result = vir_UMAX(c, src[0], src[1]);
1584                 break;
1585         case nir_op_iand:
1586                 result = vir_AND(c, src[0], src[1]);
1587                 break;
1588         case nir_op_ior:
1589                 result = vir_OR(c, src[0], src[1]);
1590                 break;
1591         case nir_op_ixor:
1592                 result = vir_XOR(c, src[0], src[1]);
1593                 break;
1594         case nir_op_inot:
1595                 result = vir_NOT(c, src[0]);
1596                 break;
1597 
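        /* CLZ counts leading zeros, so the index of the most significant set
         * bit is 31 - CLZ(x).  For example (illustrative), x = 0x10 has 27
         * leading zeros and ufind_msb returns 31 - 27 = 4.
         */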
1598         case nir_op_ufind_msb:
1599                 result = vir_SUB(c, vir_uniform_ui(c, 31), vir_CLZ(c, src[0]));
1600                 break;
1601 
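        /* The low 32 bits of a product are the same whether the operands are
         * interpreted as signed or unsigned, so a single UMUL covers
         * nir_op_imul.
         */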
1602         case nir_op_imul:
1603                 result = vir_UMUL(c, src[0], src[1]);
1604                 break;
1605 
1606         case nir_op_seq:
1607         case nir_op_sne:
1608         case nir_op_sge:
1609         case nir_op_slt: {
1610                 enum v3d_qpu_cond cond;
1611                 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1612                 assert(ok);
1613                 result = vir_MOV(c, vir_SEL(c, cond,
1614                                             vir_uniform_f(c, 1.0),
1615                                             vir_uniform_f(c, 0.0)));
1616                 c->flags_temp = result.index;
1617                 c->flags_cond = cond;
1618                 break;
1619         }
1620 
1621         case nir_op_i2b32:
1622         case nir_op_f2b32:
1623         case nir_op_feq32:
1624         case nir_op_fneu32:
1625         case nir_op_fge32:
1626         case nir_op_flt32:
1627         case nir_op_ieq32:
1628         case nir_op_ine32:
1629         case nir_op_ige32:
1630         case nir_op_uge32:
1631         case nir_op_ilt32:
1632         case nir_op_ult32: {
1633                 enum v3d_qpu_cond cond;
1634                 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1635                 assert(ok);
1636                 result = ntq_emit_cond_to_bool(c, cond);
1637                 break;
1638         }
1639 
1640         case nir_op_b32csel:
1641                 result = vir_MOV(c,
1642                                  vir_SEL(c,
1643                                          ntq_emit_bool_to_cond(c, instr->src[0].src),
1644                                          src[1], src[2]));
1645                 break;
1646 
1647         case nir_op_fcsel:
1648                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), src[0]),
1649                            V3D_QPU_PF_PUSHZ);
1650                 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
1651                                             src[1], src[2]));
1652                 break;
1653 
1654         case nir_op_frcp:
1655                 result = vir_RECIP(c, src[0]);
1656                 break;
1657         case nir_op_frsq:
1658                 result = vir_RSQRT(c, src[0]);
1659                 break;
1660         case nir_op_fexp2:
1661                 result = vir_EXP(c, src[0]);
1662                 break;
1663         case nir_op_flog2:
1664                 result = vir_LOG(c, src[0]);
1665                 break;
1666 
1667         case nir_op_fceil:
1668                 result = vir_FCEIL(c, src[0]);
1669                 break;
1670         case nir_op_ffloor:
1671                 result = vir_FFLOOR(c, src[0]);
1672                 break;
1673         case nir_op_fround_even:
1674                 result = vir_FROUND(c, src[0]);
1675                 break;
1676         case nir_op_ftrunc:
1677                 result = vir_FTRUNC(c, src[0]);
1678                 break;
1679 
1680         case nir_op_fsin:
1681                 result = ntq_fsincos(c, src[0], false);
1682                 break;
1683         case nir_op_fcos:
1684                 result = ntq_fsincos(c, src[0], true);
1685                 break;
1686 
1687         case nir_op_fsign:
1688                 result = ntq_fsign(c, src[0]);
1689                 break;
1690 
1691         case nir_op_fabs: {
1692                 result = vir_FMOV(c, src[0]);
1693                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
1694                 break;
1695         }
1696 
1697         case nir_op_iabs:
1698                 result = vir_MAX(c, src[0], vir_NEG(c, src[0]));
1699                 break;
1700 
1701         case nir_op_fddx:
1702         case nir_op_fddx_coarse:
1703         case nir_op_fddx_fine:
1704                 result = vir_FDX(c, src[0]);
1705                 break;
1706 
1707         case nir_op_fddy:
1708         case nir_op_fddy_coarse:
1709         case nir_op_fddy_fine:
1710                 result = vir_FDY(c, src[0]);
1711                 break;
1712 
1713         case nir_op_uadd_carry:
1714                 vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
1715                            V3D_QPU_PF_PUSHC);
1716                 result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
1717                 break;
1718 
1719         case nir_op_pack_half_2x16_split:
1720                 result = vir_VFPACK(c, src[0], src[1]);
1721                 break;
1722 
1723         case nir_op_unpack_half_2x16_split_x:
1724                 result = vir_FMOV(c, src[0]);
1725                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1726                 break;
1727 
1728         case nir_op_unpack_half_2x16_split_y:
1729                 result = vir_FMOV(c, src[0]);
1730                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);
1731                 break;
1732 
1733         case nir_op_fquantize2f16: {
1734                 /* F32 -> F16 -> F32 conversion */
1735                 struct qreg tmp = vir_FMOV(c, src[0]);
1736                 vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L);
1737                 tmp = vir_FMOV(c, tmp);
1738                 vir_set_unpack(c->defs[tmp.index], 0, V3D_QPU_UNPACK_L);
1739 
1740                 /* Check for denorm */
1741                 struct qreg abs_src = vir_FMOV(c, src[0]);
1742                 vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS);
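                /* ldexpf(1.0, -14) = 2^-14 is the smallest normal half-float
                 * magnitude; anything with a smaller absolute value is a
                 * denorm in f16 and is flushed to signed zero below.
                 */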
1743                 struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14));
1744                 vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold),
1745                                          V3D_QPU_PF_PUSHC);
1746 
1747                 /* Return +/-0 for denorms */
1748                 struct qreg zero =
1749                         vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000));
1750                 result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero));
1751                 break;
1752         }
1753 
1754         default:
1755                 fprintf(stderr, "unknown NIR ALU inst: ");
1756                 nir_print_instr(&instr->instr, stderr);
1757                 fprintf(stderr, "\n");
1758                 abort();
1759         }
1760 
1761         /* We have a scalar result, so the instruction should only have a
1762          * single channel written to.
1763          */
1764         assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
1765         ntq_store_dest(c, &instr->dest.dest,
1766                        ffs(instr->dest.write_mask) - 1, result);
1767 }
1768 
1769 /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
1770  * specifier.  They come from a register that's preloaded with 0xffffffff
1771  * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
1772  * 8 bits are shifted off the bottom and 0xff shifted in from the top.
1773  */
1774 #define TLB_TYPE_F16_COLOR         (3 << 6)
1775 #define TLB_TYPE_I32_COLOR         (1 << 6)
1776 #define TLB_TYPE_F32_COLOR         (0 << 6)
1777 #define TLB_RENDER_TARGET_SHIFT    3 /* Reversed!  7 = RT 0, 0 = RT 7. */
1778 #define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
1779 #define TLB_SAMPLE_MODE_PER_PIXEL  (1 << 2)
1780 #define TLB_F16_SWAP_HI_LO         (1 << 1)
1781 #define TLB_VEC_SIZE_4_F16         (1 << 0)
1782 #define TLB_VEC_SIZE_2_F16         (0 << 0)
1783 #define TLB_VEC_SIZE_MINUS_1_SHIFT 0
1784 
1785 /* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
1786  * flag is set.
1787  */
1788 #define TLB_TYPE_DEPTH             ((2 << 6) | (0 << 4))
1789 #define TLB_DEPTH_TYPE_INVARIANT   (0 << 2) /* Unmodified sideband input used */
1790 #define TLB_DEPTH_TYPE_PER_PIXEL   (1 << 2) /* QPU result used */
1791 #define TLB_V42_DEPTH_TYPE_INVARIANT   (0 << 3) /* Unmodified sideband input used */
1792 #define TLB_V42_DEPTH_TYPE_PER_PIXEL   (1 << 3) /* QPU result used */
1793 
1794 /* Stencil is a single 32-bit write. */
1795 #define TLB_TYPE_STENCIL_ALPHA     ((2 << 6) | (1 << 4))
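/* For example (illustrative), for a 4-component f16 write to render target 0
 * with per-pixel sampling, the color-write code below builds the specifier
 *
 *     TLB_TYPE_F16_COLOR | TLB_F16_SWAP_HI_LO | TLB_VEC_SIZE_4_F16 |
 *     TLB_SAMPLE_MODE_PER_PIXEL | ((7 - 0) << TLB_RENDER_TARGET_SHIFT)
 *
 * which works out to 0xff, matching the "normal vec4 f16 RT0 writes" default
 * noted above.
 */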
1796 
1797 static void
1798 vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
1799 {
1800         if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
1801                 return;
1802 
1803         struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
1804         struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1805 
1806         nir_variable *var = c->output_color_var[rt];
1807         int num_components = glsl_get_vector_elements(var->type);
1808         uint32_t conf = 0xffffff00;
1809         struct qinst *inst;
1810 
1811         conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :
1812                                             TLB_SAMPLE_MODE_PER_PIXEL;
1813         conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
1814 
1815         if (c->fs_key->swap_color_rb & (1 << rt))
1816                 num_components = MAX2(num_components, 3);
1817         assert(num_components != 0);
1818 
1819         enum glsl_base_type type = glsl_get_base_type(var->type);
1820         bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT;
1821         bool is_32b_tlb_format = is_int_format ||
1822                                  (c->fs_key->f32_color_rb & (1 << rt));
1823 
1824         if (is_int_format) {
1825                 /* The F32 vs I32 distinction was dropped in 4.2. */
1826                 if (c->devinfo->ver < 42)
1827                         conf |= TLB_TYPE_I32_COLOR;
1828                 else
1829                         conf |= TLB_TYPE_F32_COLOR;
1830                 conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT);
1831         } else {
1832                 if (c->fs_key->f32_color_rb & (1 << rt)) {
1833                         conf |= TLB_TYPE_F32_COLOR;
1834                         conf |= ((num_components - 1) <<
1835                                 TLB_VEC_SIZE_MINUS_1_SHIFT);
1836                 } else {
1837                         conf |= TLB_TYPE_F16_COLOR;
1838                         conf |= TLB_F16_SWAP_HI_LO;
1839                         if (num_components >= 3)
1840                                 conf |= TLB_VEC_SIZE_4_F16;
1841                         else
1842                                 conf |= TLB_VEC_SIZE_2_F16;
1843                 }
1844         }
1845 
1846         int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1;
1847         for (int i = 0; i < num_samples; i++) {
1848                 struct qreg *color = c->msaa_per_sample_output ?
1849                         &c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] :
1850                         &c->outputs[var->data.driver_location * 4];
1851 
1852                 struct qreg r = color[0];
1853                 struct qreg g = color[1];
1854                 struct qreg b = color[2];
1855                 struct qreg a = color[3];
1856 
1857                 if (c->fs_key->swap_color_rb & (1 << rt))  {
1858                         r = color[2];
1859                         b = color[0];
1860                 }
1861 
1862                 if (c->fs_key->sample_alpha_to_one)
1863                         a = vir_uniform_f(c, 1.0);
1864 
1865                 if (is_32b_tlb_format) {
1866                         if (i == 0) {
1867                                 inst = vir_MOV_dest(c, tlbu_reg, r);
1868                                 inst->uniform =
1869                                         vir_get_uniform_index(c,
1870                                                               QUNIFORM_CONSTANT,
1871                                                               conf);
1872                         } else {
1873                                 vir_MOV_dest(c, tlb_reg, r);
1874                         }
1875 
1876                         if (num_components >= 2)
1877                                 vir_MOV_dest(c, tlb_reg, g);
1878                         if (num_components >= 3)
1879                                 vir_MOV_dest(c, tlb_reg, b);
1880                         if (num_components >= 4)
1881                                 vir_MOV_dest(c, tlb_reg, a);
1882                 } else {
1883                         inst = vir_VFPACK_dest(c, tlb_reg, r, g);
1884                         if (conf != ~0 && i == 0) {
1885                                 inst->dst = tlbu_reg;
1886                                 inst->uniform =
1887                                         vir_get_uniform_index(c,
1888                                                               QUNIFORM_CONSTANT,
1889                                                               conf);
1890                         }
1891 
1892                         if (num_components >= 3)
1893                                 vir_VFPACK_dest(c, tlb_reg, b, a);
1894                 }
1895         }
1896 }
1897 
1898 static void
1899 emit_frag_end(struct v3d_compile *c)
1900 {
1901         if (c->output_sample_mask_index != -1) {
1902                 vir_SETMSF_dest(c, vir_nop_reg(),
1903                                 vir_AND(c,
1904                                         vir_MSF(c),
1905                                         c->outputs[c->output_sample_mask_index]));
1906         }
1907 
1908         bool has_any_tlb_color_write = false;
1909         for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
1910                 if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt])
1911                         has_any_tlb_color_write = true;
1912         }
1913 
1914         if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) {
1915                 struct nir_variable *var = c->output_color_var[0];
1916                 struct qreg *color = &c->outputs[var->data.driver_location * 4];
1917 
1918                 vir_SETMSF_dest(c, vir_nop_reg(),
1919                                 vir_AND(c,
1920                                         vir_MSF(c),
1921                                         vir_FTOC(c, color[3])));
1922         }
1923 
1924         struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1925 
1926         /* If the shader has no non-TLB side effects and doesn't write Z
1927          * we can promote it to enabling early_fragment_tests even
1928          * if the user didn't.
1929          */
1930         if (c->output_position_index == -1 &&
1931             !(c->s->info.num_images || c->s->info.num_ssbos) &&
1932             !c->s->info.fs.uses_discard &&
1933             !c->fs_key->sample_alpha_to_coverage &&
1934             has_any_tlb_color_write) {
1935                 c->s->info.fs.early_fragment_tests = true;
1936         }
1937 
1938         /* By default, Z buffer writes are implicit using the Z values produced
1939          * from FEP (Z value produced from rasterization). When this is not
1940          * desirable (shader writes Z explicitly, has discards, etc.) we need
1941          * to let the hardware know by setting c->writes_z to true, in which
1942          * case we always need to write a Z value from the QPU, even if it is
1943          * just the passthrough Z value produced from FEP.
1944          *
1945          * Also, from the V3D 4.2 spec:
1946          *
1947          * "If a shader performs a Z read the “Fragment shader does Z writes”
1948          *  bit in the shader record must be enabled to ensure deterministic
1949          *  results"
1950          *
1951          * So if c->reads_z is set we always need to write Z, even if it is
1952          * a passthrough from the Z value produced from FEP.
1953          */
1954         if (!c->s->info.fs.early_fragment_tests || c->reads_z) {
1955                 c->writes_z = true;
1956                 uint8_t tlb_specifier = TLB_TYPE_DEPTH;
1957                 struct qinst *inst;
1958 
1959                 if (c->output_position_index != -1) {
1960                         /* Shader writes to gl_FragDepth, use that */
1961                         inst = vir_MOV_dest(c, tlbu_reg,
1962                                             c->outputs[c->output_position_index]);
1963 
1964                         if (c->devinfo->ver >= 42) {
1965                                 tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
1966                                                   TLB_SAMPLE_MODE_PER_PIXEL);
1967                         } else {
1968                                 tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL;
1969                         }
1970                 } else {
1971                         /* Shader doesn't write to gl_FragDepth, take Z from
1972                          * FEP.
1973                          */
1974                         c->writes_z_from_fep = true;
1975                         inst = vir_MOV_dest(c, tlbu_reg, vir_nop_reg());
1976 
1977                         if (c->devinfo->ver >= 42) {
1978                                 /* The spec says the PER_PIXEL flag is ignored
1979                                  * for invariant writes, but the simulator
1980                                  * demands it.
1981                                  */
1982                                 tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |
1983                                                   TLB_SAMPLE_MODE_PER_PIXEL);
1984                         } else {
1985                                 tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT;
1986                         }
1987 
1988                         /* Since (single-threaded) fragment shaders always need
1989                          * a TLB write, if we don't have any we emit a
1990                          * passthrough Z and flag the shader as potentially
1991                          * discarding, so that we can use Z as the required TLB write.
1992                          */
1993                         if (!has_any_tlb_color_write)
1994                                 c->s->info.fs.uses_discard = true;
1995                 }
1996 
1997                 inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
1998                                                       tlb_specifier |
1999                                                       0xffffff00);
2000                 inst->is_tlb_z_write = true;
2001         }
2002 
2003         /* XXX: Performance improvement: Merge Z write and color writes TLB
2004          * uniform setup
2005          */
2006         for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++)
2007                 vir_emit_tlb_color_write(c, rt);
2008 }
2009 
2010 static inline void
2011 vir_VPM_WRITE_indirect(struct v3d_compile *c,
2012                        struct qreg val,
2013                        struct qreg vpm_index,
2014                        bool uniform_vpm_index)
2015 {
2016         assert(c->devinfo->ver >= 40);
2017         if (uniform_vpm_index)
2018                 vir_STVPMV(c, vpm_index, val);
2019         else
2020                 vir_STVPMD(c, vpm_index, val);
2021 }
2022 
2023 static void
2024 vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
2025 {
2026         if (c->devinfo->ver >= 40) {
2027                 vir_VPM_WRITE_indirect(c, val,
2028                                        vir_uniform_ui(c, vpm_index), true);
2029         } else {
2030                 /* XXX: v3d33_vir_vpm_write_setup(c); */
2031                 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
2032         }
2033 }
2034 
2035 static void
2036 emit_vert_end(struct v3d_compile *c)
2037 {
2038         /* GFXH-1684: VPM writes need to be complete by the end of the shader.
2039          */
2040         if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
2041                 vir_VPMWT(c);
2042 }
2043 
2044 static void
2045 emit_geom_end(struct v3d_compile *c)
2046 {
2047         /* GFXH-1684: VPM writes need to be complete by the end of the shader.
2048          */
2049         if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
2050                 vir_VPMWT(c);
2051 }
2052 
2053 static bool
2054 mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
2055                        unsigned bit_size,
2056                        unsigned num_components,
2057                        nir_intrinsic_instr *low,
2058                        nir_intrinsic_instr *high,
2059                        void *data)
2060 {
2061         /* TMU general access only supports 32-bit vectors */
2062         if (bit_size > 32)
2063                 return false;
2064 
2065         if ((bit_size == 8 || bit_size == 16) && num_components > 1)
2066                 return false;
2067 
2068         if (align_mul % 4 != 0 || align_offset % 4 != 0)
2069                 return false;
2070 
2071         /* Vector accesses wrap at 16-byte boundaries so we can't vectorize
2072          * if the resulting vector crosses a 16-byte boundary.
2073          */
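        /* For example (illustrative): with align_mul = 4, align_offset = 0
         * and num_components = 4 (16 bytes), 16 - 4 + 0 + 16 = 28 > 16, so
         * the merge is rejected because the vector could start as far as 12
         * bytes into a 16-byte line.  With align_mul = 16 and align_offset = 0
         * it is known to start on a line boundary, 16 - 16 + 0 + 16 = 16, and
         * the merge is allowed.
         */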
2074         assert(util_is_power_of_two_nonzero(align_mul));
2075         align_mul = MIN2(align_mul, 16);
2076         align_offset &= 0xf;
2077         if (16 - align_mul + align_offset + num_components * 4 > 16)
2078                 return false;
2079 
2080         return true;
2081 }
2082 
2083 void
2084 v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
2085 {
2086         bool progress;
2087         unsigned lower_flrp =
2088                 (s->options->lower_flrp16 ? 16 : 0) |
2089                 (s->options->lower_flrp32 ? 32 : 0) |
2090                 (s->options->lower_flrp64 ? 64 : 0);
2091 
2092         do {
2093                 progress = false;
2094 
2095                 NIR_PASS_V(s, nir_lower_vars_to_ssa);
2096                 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
2097                 NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
2098                 NIR_PASS(progress, s, nir_copy_prop);
2099                 NIR_PASS(progress, s, nir_opt_remove_phis);
2100                 NIR_PASS(progress, s, nir_opt_dce);
2101                 NIR_PASS(progress, s, nir_opt_dead_cf);
2102                 NIR_PASS(progress, s, nir_opt_cse);
2103                 NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
2104                 NIR_PASS(progress, s, nir_opt_algebraic);
2105                 NIR_PASS(progress, s, nir_opt_constant_folding);
2106 
2107                 /* Note that vectorization may undo the load/store scalarization
2108                  * pass we run for non-32-bit TMU general load/store by
2109                  * converting, for example, 2 consecutive 16-bit loads into a
2110                  * single 32-bit load. This is fine (and desirable) as long as
2111                  * the resulting 32-bit load meets 32-bit alignment requirements,
2112                  * which mem_vectorize_callback() should be enforcing.
2113                  */
2114                 nir_load_store_vectorize_options vectorize_opts = {
2115                         .modes = nir_var_mem_ssbo | nir_var_mem_ubo |
2116                                  nir_var_mem_push_const | nir_var_mem_shared |
2117                                  nir_var_mem_global,
2118                         .callback = mem_vectorize_callback,
2119                         .robust_modes = 0,
2120                 };
2121                 bool vectorize_progress = false;
2122                 NIR_PASS(vectorize_progress, s, nir_opt_load_store_vectorize,
2123                          &vectorize_opts);
2124                 if (vectorize_progress) {
2125                         NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
2126                         NIR_PASS(progress, s, nir_lower_pack);
2127                         progress = true;
2128                 }
2129 
2130                 if (lower_flrp != 0) {
2131                         bool lower_flrp_progress = false;
2132 
2133                         NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
2134                                  lower_flrp,
2135                                  false /* always_precise */);
2136                         if (lower_flrp_progress) {
2137                                 NIR_PASS(progress, s, nir_opt_constant_folding);
2138                                 progress = true;
2139                         }
2140 
2141                         /* Nothing should rematerialize any flrps, so we only
2142                          * need to do this lowering once.
2143                          */
2144                         lower_flrp = 0;
2145                 }
2146 
2147                 NIR_PASS(progress, s, nir_opt_undef);
2148                 NIR_PASS(progress, s, nir_lower_undef_to_zero);
2149 
2150                 if (c && !c->disable_loop_unrolling &&
2151                     s->options->max_unroll_iterations > 0) {
2152                        bool local_progress = false;
2153                        NIR_PASS(local_progress, s, nir_opt_loop_unroll);
2154                        c->unrolled_any_loops |= local_progress;
2155                        progress |= local_progress;
2156                 }
2157         } while (progress);
2158 
2159         nir_move_options sink_opts =
2160                 nir_move_const_undef | nir_move_comparisons | nir_move_copies |
2161                 nir_move_load_ubo | nir_move_load_ssbo | nir_move_load_uniform;
2162         NIR_PASS(progress, s, nir_opt_sink, sink_opts);
2163 }
2164 
2165 static int
2166 driver_location_compare(const nir_variable *a, const nir_variable *b)
2167 {
2168         return a->data.driver_location == b->data.driver_location ?
2169                a->data.location_frac - b->data.location_frac :
2170                a->data.driver_location - b->data.driver_location;
2171 }
2172 
2173 static struct qreg
2174 ntq_emit_vpm_read(struct v3d_compile *c,
2175                   uint32_t *num_components_queued,
2176                   uint32_t *remaining,
2177                   uint32_t vpm_index)
2178 {
2179         struct qreg vpm = vir_reg(QFILE_VPM, vpm_index);
2180 
2181         if (c->devinfo->ver >= 40) {
2182                 return vir_LDVPMV_IN(c,
2183                                      vir_uniform_ui(c,
2184                                                     (*num_components_queued)++));
2185         }
2186 
2187         if (*num_components_queued != 0) {
2188                 (*num_components_queued)--;
2189                 return vir_MOV(c, vpm);
2190         }
2191 
2192         uint32_t num_components = MIN2(*remaining, 32);
2193 
2194         v3d33_vir_vpm_read_setup(c, num_components);
2195 
2196         *num_components_queued = num_components - 1;
2197         *remaining -= num_components;
2198 
2199         return vir_MOV(c, vpm);
2200 }
2201 
2202 static void
2203 ntq_setup_vs_inputs(struct v3d_compile *c)
2204 {
2205         /* Figure out how many components of each vertex attribute the shader
2206          * uses.  Each variable should have been split into individual
2207          * components and unused ones DCEed.  The vertex fetcher will load
2208          * from the start of the attribute to the number of components we
2209          * declare we need in c->vattr_sizes[].
2210          *
2211          * BGRA vertex attributes are a bit special: since we implement these
2212          * as RGBA with the R/B components swapped, we always need at least 3
2213          * components if component 0 is read.
2214          */
2215         nir_foreach_shader_in_variable(var, c->s) {
2216                 /* No VS attribute array support. */
2217                 assert(MAX2(glsl_get_length(var->type), 1) == 1);
2218 
2219                 unsigned loc = var->data.driver_location;
2220                 int start_component = var->data.location_frac;
2221                 int num_components = glsl_get_components(var->type);
2222 
2223                 c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc],
2224                                            start_component + num_components);
2225 
2226                 /* Handle BGRA inputs */
2227                 if (start_component == 0 &&
2228                     c->vs_key->va_swap_rb_mask & (1 << var->data.location)) {
2229                         c->vattr_sizes[loc] = MAX2(3, c->vattr_sizes[loc]);
2230                 }
2231         }
2232 
2233         unsigned num_components = 0;
2234         uint32_t vpm_components_queued = 0;
2235         bool uses_iid = BITSET_TEST(c->s->info.system_values_read,
2236                                     SYSTEM_VALUE_INSTANCE_ID) ||
2237                         BITSET_TEST(c->s->info.system_values_read,
2238                                     SYSTEM_VALUE_INSTANCE_INDEX);
2239         bool uses_biid = BITSET_TEST(c->s->info.system_values_read,
2240                                      SYSTEM_VALUE_BASE_INSTANCE);
2241         bool uses_vid = BITSET_TEST(c->s->info.system_values_read,
2242                                     SYSTEM_VALUE_VERTEX_ID) ||
2243                         BITSET_TEST(c->s->info.system_values_read,
2244                                     SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
2245 
2246         num_components += uses_iid;
2247         num_components += uses_biid;
2248         num_components += uses_vid;
2249 
2250         for (int i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++)
2251                 num_components += c->vattr_sizes[i];
2252 
2253         if (uses_iid) {
2254                 c->iid = ntq_emit_vpm_read(c, &vpm_components_queued,
2255                                            &num_components, ~0);
2256         }
2257 
2258         if (uses_biid) {
2259                 c->biid = ntq_emit_vpm_read(c, &vpm_components_queued,
2260                                             &num_components, ~0);
2261         }
2262 
2263         if (uses_vid) {
2264                 c->vid = ntq_emit_vpm_read(c, &vpm_components_queued,
2265                                            &num_components, ~0);
2266         }
2267 
2268         /* The actual loads will happen directly in nir_intrinsic_load_input
2269          * on newer versions.
2270          */
2271         if (c->devinfo->ver >= 40)
2272                 return;
2273 
2274         for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) {
2275                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2276                                   (loc + 1) * 4);
2277 
2278                 for (int i = 0; i < c->vattr_sizes[loc]; i++) {
2279                         c->inputs[loc * 4 + i] =
2280                                 ntq_emit_vpm_read(c,
2281                                                   &vpm_components_queued,
2282                                                   &num_components,
2283                                                   loc * 4 + i);
2284 
2285                 }
2286         }
2287 
2288         if (c->devinfo->ver >= 40) {
2289                 assert(vpm_components_queued == num_components);
2290         } else {
2291                 assert(vpm_components_queued == 0);
2292                 assert(num_components == 0);
2293         }
2294 }
2295 
2296 static bool
2297 program_reads_point_coord(struct v3d_compile *c)
2298 {
2299         nir_foreach_shader_in_variable(var, c->s) {
2300                 if (util_varying_is_point_coord(var->data.location,
2301                                                 c->fs_key->point_sprite_mask)) {
2302                         return true;
2303                 }
2304         }
2305 
2306         return false;
2307 }
2308 
2309 static void
2310 ntq_setup_gs_inputs(struct v3d_compile *c)
2311 {
2312         nir_sort_variables_with_modes(c->s, driver_location_compare,
2313                                       nir_var_shader_in);
2314 
2315         nir_foreach_shader_in_variable(var, c->s) {
2316                 /* All GS inputs are arrays with as many entries as vertices
2317                  * in the input primitive, but here we only care about the
2318                  * per-vertex input type.
2319                  */
2320                 assert(glsl_type_is_array(var->type));
2321                 const struct glsl_type *type = glsl_get_array_element(var->type);
2322                 unsigned var_len = glsl_count_vec4_slots(type, false, false);
2323                 unsigned loc = var->data.driver_location;
2324 
2325                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2326                                   (loc + var_len) * 4);
2327 
2328                 if (var->data.compact) {
2329                         for (unsigned j = 0; j < var_len; j++) {
2330                                 unsigned input_idx = c->num_inputs++;
2331                                 unsigned loc_frac = var->data.location_frac + j;
2332                                 unsigned loc = var->data.location + loc_frac / 4;
2333                                 unsigned comp = loc_frac % 4;
2334                                 c->input_slots[input_idx] =
2335                                         v3d_slot_from_slot_and_component(loc, comp);
2336                         }
2337                        continue;
2338                 }
2339 
2340                 for (unsigned j = 0; j < var_len; j++) {
2341                         unsigned num_elements =
2342                                 glsl_type_is_struct(glsl_without_array(type)) ?
2343                                 4 : glsl_get_vector_elements(type);
2344                         for (unsigned k = 0; k < num_elements; k++) {
2345                                 unsigned chan = var->data.location_frac + k;
2346                                 unsigned input_idx = c->num_inputs++;
2347                                 struct v3d_varying_slot slot =
2348                                         v3d_slot_from_slot_and_component(var->data.location + j, chan);
2349                                 c->input_slots[input_idx] = slot;
2350                         }
2351                 }
2352         }
2353 }
2354 
2355 
2356 static void
2357 ntq_setup_fs_inputs(struct v3d_compile *c)
2358 {
2359         nir_sort_variables_with_modes(c->s, driver_location_compare,
2360                                       nir_var_shader_in);
2361 
2362         nir_foreach_shader_in_variable(var, c->s) {
2363                 unsigned var_len = glsl_count_vec4_slots(var->type, false, false);
2364                 unsigned loc = var->data.driver_location;
2365 
2366                 uint32_t inputs_array_size = c->inputs_array_size;
2367                 uint32_t inputs_array_required_size = (loc + var_len) * 4;
2368                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2369                                   inputs_array_required_size);
2370                 resize_interp_array(c, &c->interp, &inputs_array_size,
2371                                     inputs_array_required_size);
2372 
2373                 if (var->data.location == VARYING_SLOT_POS) {
2374                         emit_fragcoord_input(c, loc);
2375                 } else if (var->data.location == VARYING_SLOT_PRIMITIVE_ID &&
2376                            !c->fs_key->has_gs) {
2377                         /* If the fragment shader reads gl_PrimitiveID and we
2378                          * don't have a geometry shader in the pipeline to write
2379                          * it then we program the hardware to inject it as
2380                          * an implicit varying. Take it from there.
2381                          */
2382                         c->inputs[loc * 4] = c->primitive_id;
2383                 } else if (util_varying_is_point_coord(var->data.location,
2384                                                        c->fs_key->point_sprite_mask)) {
2385                         c->inputs[loc * 4 + 0] = c->point_x;
2386                         c->inputs[loc * 4 + 1] = c->point_y;
2387                 } else if (var->data.compact) {
2388                         for (int j = 0; j < var_len; j++)
2389                                 emit_compact_fragment_input(c, loc, var, j);
2390                 } else if (glsl_type_is_struct(glsl_without_array(var->type))) {
2391                         for (int j = 0; j < var_len; j++) {
2392                            emit_fragment_input(c, loc, var, j, 4);
2393                         }
2394                 } else {
2395                         for (int j = 0; j < var_len; j++) {
2396                                 emit_fragment_input(c, loc, var, j, glsl_get_vector_elements(var->type));
2397                         }
2398                 }
2399         }
2400 }
2401 
2402 static void
2403 ntq_setup_outputs(struct v3d_compile *c)
2404 {
2405         if (c->s->info.stage != MESA_SHADER_FRAGMENT)
2406                 return;
2407 
2408         nir_foreach_shader_out_variable(var, c->s) {
2409                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
2410                 unsigned loc = var->data.driver_location * 4;
2411 
2412                 assert(array_len == 1);
2413                 (void)array_len;
2414 
2415                 for (int i = 0; i < 4 - var->data.location_frac; i++) {
2416                         add_output(c, loc + var->data.location_frac + i,
2417                                    var->data.location,
2418                                    var->data.location_frac + i);
2419                 }
2420 
2421                 switch (var->data.location) {
2422                 case FRAG_RESULT_COLOR:
2423                         c->output_color_var[0] = var;
2424                         c->output_color_var[1] = var;
2425                         c->output_color_var[2] = var;
2426                         c->output_color_var[3] = var;
2427                         break;
2428                 case FRAG_RESULT_DATA0:
2429                 case FRAG_RESULT_DATA1:
2430                 case FRAG_RESULT_DATA2:
2431                 case FRAG_RESULT_DATA3:
2432                         c->output_color_var[var->data.location -
2433                                             FRAG_RESULT_DATA0] = var;
2434                         break;
2435                 case FRAG_RESULT_DEPTH:
2436                         c->output_position_index = loc;
2437                         break;
2438                 case FRAG_RESULT_SAMPLE_MASK:
2439                         c->output_sample_mask_index = loc;
2440                         break;
2441                 }
2442         }
2443 }
2444 
2445 /**
2446  * Sets up the mapping from nir_register to struct qreg *.
2447  *
2448  * Each nir_register gets a struct qreg per 32-bit component being stored.
2449  */
2450 static void
2451 ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
2452 {
2453         foreach_list_typed(nir_register, nir_reg, node, list) {
2454                 unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
2455                 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
2456                                                   array_len *
2457                                                   nir_reg->num_components);
2458 
2459                 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
2460 
2461                 for (int i = 0; i < array_len * nir_reg->num_components; i++)
2462                         qregs[i] = vir_get_temp(c);
2463         }
2464 }
2465 
2466 static void
2467 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
2468 {
2469         /* XXX perf: Experiment with using immediate loads to avoid having
2470          * these end up in the uniform stream.  Watch out for breaking the
2471          * small immediates optimization in the process!
2472          */
2473         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
2474         for (int i = 0; i < instr->def.num_components; i++)
2475                 qregs[i] = vir_uniform_ui(c, instr->value[i].u32);
2476 
2477         _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
2478 }
2479 
2480 static void
2481 ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
2482 {
2483         unsigned image_index = nir_src_as_uint(instr->src[0]);
2484         bool is_array = nir_intrinsic_image_array(instr);
2485 
2486         assert(nir_src_as_uint(instr->src[1]) == 0);
2487 
2488         ntq_store_dest(c, &instr->dest, 0,
2489                        vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
2490         if (instr->num_components > 1) {
2491                 ntq_store_dest(c, &instr->dest, 1,
2492                                vir_uniform(c,
2493                                            instr->num_components == 2 && is_array ?
2494                                                    QUNIFORM_IMAGE_ARRAY_SIZE :
2495                                                    QUNIFORM_IMAGE_HEIGHT,
2496                                            image_index));
2497         }
2498         if (instr->num_components > 2) {
2499                 ntq_store_dest(c, &instr->dest, 2,
2500                                vir_uniform(c,
2501                                            is_array ?
2502                                            QUNIFORM_IMAGE_ARRAY_SIZE :
2503                                            QUNIFORM_IMAGE_DEPTH,
2504                                            image_index));
2505         }
2506 }
2507 
2508 static void
2509 vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
2510 {
2511         assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
2512 
2513         int rt = nir_src_as_uint(instr->src[0]);
2514         assert(rt < V3D_MAX_DRAW_BUFFERS);
2515 
2516         int sample_index = nir_intrinsic_base(instr);
2517         assert(sample_index < V3D_MAX_SAMPLES);
2518 
2519         int component = nir_intrinsic_component(instr);
2520         assert(component < 4);
2521 
2522         /* We need to emit our TLB reads after we have acquired the scoreboard
2523          * lock, or the GPU will hang. Usually, we do our scoreboard locking on
2524          * the last thread switch to improve parallelism; however, that is only
2525          * guaranteed to happen before the tlb color writes.
2526          *
2527          * To fix that, we make sure we always emit a thread switch before the
2528          * first tlb color read. If that happens to be the last thread switch
2529          * we emit, then everything is fine, but otherwise, if any code after
2530          * this point needs to emit additional thread switches, then we will
2531          * switch the strategy to locking the scoreboard on the first thread
2532          * switch instead -- see vir_emit_thrsw().
2533          */
2534         if (!c->emitted_tlb_load) {
2535                 if (!c->last_thrsw_at_top_level) {
2536                         assert(c->devinfo->ver >= 41);
2537                         vir_emit_thrsw(c);
2538                 }
2539 
2540                 c->emitted_tlb_load = true;
2541         }
2542 
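        /* Layout note: c->color_reads stores 4 scalar components per
         * (render target, sample) pair, so the block for this sample starts
         * at (rt * V3D_MAX_SAMPLES + sample_index) * 4.
         */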
2543         struct qreg *color_reads_for_sample =
2544                 &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
2545 
2546         if (color_reads_for_sample[component].file == QFILE_NULL) {
2547                 enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
2548                 int num_components =
2549                         util_format_get_nr_components(rt_format);
2550 
2551                 const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
2552                 if (swap_rb)
2553                         num_components = MAX2(num_components, 3);
2554 
2555                 nir_variable *var = c->output_color_var[rt];
2556                 enum glsl_base_type type = glsl_get_base_type(var->type);
2557 
2558                 bool is_int_format = type == GLSL_TYPE_INT ||
2559                                      type == GLSL_TYPE_UINT;
2560 
2561                 bool is_32b_tlb_format = is_int_format ||
2562                                          (c->fs_key->f32_color_rb & (1 << rt));
2563 
2564                 int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1;
2565 
2566                 uint32_t conf = 0xffffff00;
2567                 conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE :
2568                                           TLB_SAMPLE_MODE_PER_PIXEL;
2569                 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
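                /* The per-RT configuration assembled here is only written to
                 * the TLB once, on the first read of sample 0 in the loop
                 * below (the conf != 0xffffffff && i == 0 checks); all the
                 * remaining reads use plain vir_TLB_COLOR_READ.
                 */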
2570 
2571                 if (is_32b_tlb_format) {
2572                         /* The F32 vs I32 distinction was dropped in 4.2. */
2573                         conf |= (c->devinfo->ver < 42 && is_int_format) ?
2574                                 TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR;
2575 
2576                         conf |= ((num_components - 1) <<
2577                                  TLB_VEC_SIZE_MINUS_1_SHIFT);
2578                 } else {
2579                         conf |= TLB_TYPE_F16_COLOR;
2580                         conf |= TLB_F16_SWAP_HI_LO;
2581 
2582                         if (num_components >= 3)
2583                                 conf |= TLB_VEC_SIZE_4_F16;
2584                         else
2585                                 conf |= TLB_VEC_SIZE_2_F16;
2586                 }
2587 
2588 
2589                 for (int i = 0; i < num_samples; i++) {
2590                         struct qreg r, g, b, a;
2591                         if (is_32b_tlb_format) {
2592                                 r = conf != 0xffffffff && i == 0 ?
2593                                         vir_TLBU_COLOR_READ(c, conf) :
2594                                         vir_TLB_COLOR_READ(c);
2595                                 if (num_components >= 2)
2596                                         g = vir_TLB_COLOR_READ(c);
2597                                 if (num_components >= 3)
2598                                         b = vir_TLB_COLOR_READ(c);
2599                                 if (num_components >= 4)
2600                                         a = vir_TLB_COLOR_READ(c);
2601                         } else {
2602                                 struct qreg rg = conf != 0xffffffff && i == 0 ?
2603                                         vir_TLBU_COLOR_READ(c, conf) :
2604                                         vir_TLB_COLOR_READ(c);
2605                                 r = vir_FMOV(c, rg);
2606                                 vir_set_unpack(c->defs[r.index], 0,
2607                                                V3D_QPU_UNPACK_L);
2608                                 g = vir_FMOV(c, rg);
2609                                 vir_set_unpack(c->defs[g.index], 0,
2610                                                V3D_QPU_UNPACK_H);
2611 
2612                                 if (num_components > 2) {
2613                                     struct qreg ba = vir_TLB_COLOR_READ(c);
2614                                     b = vir_FMOV(c, ba);
2615                                     vir_set_unpack(c->defs[b.index], 0,
2616                                                    V3D_QPU_UNPACK_L);
2617                                     a = vir_FMOV(c, ba);
2618                                     vir_set_unpack(c->defs[a.index], 0,
2619                                                    V3D_QPU_UNPACK_H);
2620                                 }
2621                         }
2622 
2623                         struct qreg *color_reads =
2624                                 &c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4];
2625 
2626                         color_reads[0] = swap_rb ? b : r;
2627                         if (num_components >= 2)
2628                                 color_reads[1] = g;
2629                         if (num_components >= 3)
2630                                 color_reads[2] = swap_rb ? r : b;
2631                         if (num_components >= 4)
2632                                 color_reads[3] = a;
2633                 }
2634         }
2635 
2636         assert(color_reads_for_sample[component].file != QFILE_NULL);
2637         ntq_store_dest(c, &instr->dest, 0,
2638                        vir_MOV(c, color_reads_for_sample[component]));
2639 }
2640 
2641 static bool
2642 ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr);
2643 
2644 static bool
2645 try_emit_uniform(struct v3d_compile *c,
2646                  int offset,
2647                  int num_components,
2648                  nir_dest *dest,
2649                  enum quniform_contents contents)
2650 {
2651         /* Even though ldunif is strictly 32-bit we can still use it
2652          * to load scalar 8-bit/16-bit uniforms so long as their offset
2653          * is 32-bit aligned. In this case, ldunif would still load
2654          * 32-bit into the destination with the 8-bit/16-bit uniform
2655          * data in the LSB and garbage in the MSB, but that is fine
2656          * because we should only be accessing the valid bits of the
2657          * destination.
2658          *
2659          * FIXME: if in the future we improve our register allocator to
2660          * pack 2 16-bit variables in the MSB and LSB of the same
2661          * register then this optimization would not be valid as is,
2662          * since the load clobbers the MSB.
2663          */
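        /* Illustrative example: a scalar 16-bit uniform at byte offset 8
         * passes the check below and is fetched as dword 2 (8 / 4), with the
         * valid data in the low 16 bits; one at byte offset 6 fails the
         * check and the caller falls back to another load path.
         */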
2664         if (offset % 4 != 0)
2665                 return false;
2666 
2667         /* We need dwords */
2668         offset = offset / 4;
2669 
2670         for (int i = 0; i < num_components; i++) {
2671                 ntq_store_dest(c, dest, i,
2672                                vir_uniform(c, contents, offset + i));
2673         }
2674 
2675         return true;
2676 }
2677 
2678 static void
2679 ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
2680 {
2681         /* We scalarize general TMU access for anything that is not 32-bit. */
2682         assert(nir_dest_bit_size(instr->dest) == 32 ||
2683                instr->num_components == 1);
2684 
2685         /* Try to emit ldunif if possible, otherwise fallback to general TMU */
2686         if (nir_src_is_const(instr->src[0])) {
2687                 int offset = (nir_intrinsic_base(instr) +
2688                              nir_src_as_uint(instr->src[0]));
2689 
2690                 if (try_emit_uniform(c, offset, instr->num_components,
2691                                      &instr->dest, QUNIFORM_UNIFORM)) {
2692                         return;
2693                 }
2694         }
2695 
2696         if (!ntq_emit_load_unifa(c, instr)) {
2697                 ntq_emit_tmu_general(c, instr, false);
2698                 c->has_general_tmu_load = true;
2699         }
2700 }
2701 
2702 static bool
2703 ntq_emit_inline_ubo_load(struct v3d_compile *c, nir_intrinsic_instr *instr)
2704 {
2705         if (c->compiler->max_inline_uniform_buffers <= 0)
2706                 return false;
2707 
2708         /* On Vulkan we use indices 1..MAX_INLINE_UNIFORM_BUFFERS for inline
2709          * uniform buffers which we want to handle more like push constants
2710          * than regular UBO. OpenGL doesn't implement this feature.
2711          */
2712         assert(c->key->environment == V3D_ENVIRONMENT_VULKAN);
2713         uint32_t index = nir_src_as_uint(instr->src[0]);
2714         if (index == 0 || index > c->compiler->max_inline_uniform_buffers)
2715                 return false;
2716 
2717         /* We scalarize general TMU access for anything that is not 32-bit */
2718         assert(nir_dest_bit_size(instr->dest) == 32 ||
2719                instr->num_components == 1);
2720 
2721         if (nir_src_is_const(instr->src[1])) {
2722                 /* Index 0 is reserved for push constants */
2723                 assert(index > 0);
2724                 uint32_t inline_index = index - 1;
2725                 int offset = nir_src_as_uint(instr->src[1]);
2726                 if (try_emit_uniform(c, offset, instr->num_components,
2727                                      &instr->dest,
2728                                      QUNIFORM_INLINE_UBO_0 + inline_index)) {
2729                         return true;
2730                 }
2731         }
2732 
2733         /* Fallback to regular UBO load */
2734         return false;
2735 }
2736 
2737 static void
2738 ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
2739 {
2740         /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset).
2741          *
2742          * Right now the driver sets PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR even
2743          * if we don't support non-uniform offsets because we also set the
2744          * lower_all_io_to_temps option in the NIR compiler. This ensures that
2745          * any indirect indexing on in/out variables is turned into indirect
2746          * indexing on temporary variables instead, that we handle by lowering
2747          * to scratch. If we implement non-uniform offset here we might be able
2748          * to avoid the temp and scratch lowering, which involves copying from
2749          * the input to the temp variable, possibly making code more optimal.
2750          */
2751         unsigned offset =
2752                 nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
2753 
2754         if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) {
2755                /* Emit the LDVPM directly now, rather than at the top
2756                 * of the shader like we did for V3D 3.x (which needs
2757                 * vpmsetup when not just taking the next offset).
2758                 *
2759                 * Note that delaying like this may introduce stalls,
2760                 * as LDVPMV takes a minimum of 1 instruction but may
2761                 * be slower if the VPM unit is busy with another QPU.
2762                 */
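               /* The scalar VPM offset of this input is: one leading slot
                * for each of instance ID, base instance and vertex ID that
                * the shader reads, plus the sizes of all preceding vertex
                * attributes, plus the component within this attribute.
                */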
2763                int index = 0;
2764                if (BITSET_TEST(c->s->info.system_values_read,
2765                                SYSTEM_VALUE_INSTANCE_ID)) {
2766                       index++;
2767                }
2768                if (BITSET_TEST(c->s->info.system_values_read,
2769                                SYSTEM_VALUE_BASE_INSTANCE)) {
2770                       index++;
2771                }
2772                if (BITSET_TEST(c->s->info.system_values_read,
2773                                SYSTEM_VALUE_VERTEX_ID)) {
2774                       index++;
2775                }
2776                for (int i = 0; i < offset; i++)
2777                       index += c->vattr_sizes[i];
2778                index += nir_intrinsic_component(instr);
2779                for (int i = 0; i < instr->num_components; i++) {
2780                       struct qreg vpm_offset = vir_uniform_ui(c, index++);
2781                       ntq_store_dest(c, &instr->dest, i,
2782                                      vir_LDVPMV_IN(c, vpm_offset));
2783                 }
2784         } else {
2785                 for (int i = 0; i < instr->num_components; i++) {
2786                         int comp = nir_intrinsic_component(instr) + i;
2787                         struct qreg input = c->inputs[offset * 4 + comp];
2788                         ntq_store_dest(c, &instr->dest, i, vir_MOV(c, input));
2789 
2790                         if (c->s->info.stage == MESA_SHADER_FRAGMENT &&
2791                             input.file == c->payload_z.file &&
2792                             input.index == c->payload_z.index) {
2793                                 c->reads_z = true;
2794                         }
2795                 }
2796         }
2797 }
2798 
2799 static void
2800 ntq_emit_per_sample_color_write(struct v3d_compile *c,
2801                                 nir_intrinsic_instr *instr)
2802 {
2803         assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d);
2804 
2805         unsigned rt = nir_src_as_uint(instr->src[1]);
2806         assert(rt < V3D_MAX_DRAW_BUFFERS);
2807 
2808         unsigned sample_idx = nir_intrinsic_base(instr);
2809         assert(sample_idx < V3D_MAX_SAMPLES);
2810 
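        /* c->sample_colors stores 4 scalar components per (render target,
         * sample) pair; 'offset' below is the start of this sample's block.
         */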
2811         unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4;
2812         for (int i = 0; i < instr->num_components; i++) {
2813                 c->sample_colors[offset + i] =
2814                         vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2815         }
2816 }
2817 
2818 static void
2819 ntq_emit_color_write(struct v3d_compile *c,
2820                      nir_intrinsic_instr *instr)
2821 {
2822         unsigned offset = (nir_intrinsic_base(instr) +
2823                            nir_src_as_uint(instr->src[1])) * 4 +
2824                           nir_intrinsic_component(instr);
2825         for (int i = 0; i < instr->num_components; i++) {
2826                 c->outputs[offset + i] =
2827                         vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2828         }
2829 }
2830 
2831 static void
2832 emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2833 {
2834         assert(instr->num_components == 1);
2835 
2836         struct qreg offset = ntq_get_src(c, instr->src[1], 0);
2837 
2838         uint32_t base_offset = nir_intrinsic_base(instr);
2839 
2840         if (base_offset)
2841                 offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset);
2842 
2843         /* Usually, for VS or FS, we only emit outputs once at program end so
2844          * our VPM writes are never in non-uniform control flow, but this
2845          * is not true for GS, where we are emitting multiple vertices.
2846          */
2847         if (vir_in_nonuniform_control_flow(c)) {
2848                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
2849                            V3D_QPU_PF_PUSHZ);
2850         }
2851 
2852         struct qreg val = ntq_get_src(c, instr->src[0], 0);
2853 
2854         /* The offset isn’t necessarily dynamically uniform for a geometry
2855          * shader. This can happen if the shader sometimes doesn’t emit one of
2856          * the vertices. In that case subsequent vertices will be written to
2857          * different offsets in the VPM and we need to use the scatter write
2858          * instruction to have a different offset for each lane.
2859          */
2860         bool is_uniform_offset =
2861                 !vir_in_nonuniform_control_flow(c) &&
2862                 !nir_src_is_divergent(instr->src[1]);
2863         vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2864 
2865         if (vir_in_nonuniform_control_flow(c)) {
2866                 struct qinst *last_inst =
2867                         (struct qinst *)c->cur_block->instructions.prev;
2868                 vir_set_cond(last_inst, V3D_QPU_COND_IFA);
2869         }
2870 }
2871 
2872 static void
2873 emit_store_output_vs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2874 {
2875         assert(c->s->info.stage == MESA_SHADER_VERTEX);
2876         assert(instr->num_components == 1);
2877 
2878         uint32_t base = nir_intrinsic_base(instr);
2879         struct qreg val = ntq_get_src(c, instr->src[0], 0);
2880 
2881         if (nir_src_is_const(instr->src[1])) {
2882                 vir_VPM_WRITE(c, val,
2883                               base + nir_src_as_uint(instr->src[1]));
2884         } else {
2885                 struct qreg offset = vir_ADD(c,
2886                                              ntq_get_src(c, instr->src[1], 1),
2887                                              vir_uniform_ui(c, base));
2888                 bool is_uniform_offset =
2889                         !vir_in_nonuniform_control_flow(c) &&
2890                         !nir_src_is_divergent(instr->src[1]);
2891                 vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2892         }
2893 }
2894 
2895 static void
2896 ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
2897 {
2898         if (c->s->info.stage == MESA_SHADER_FRAGMENT)
2899                 ntq_emit_color_write(c, instr);
2900         else if (c->s->info.stage == MESA_SHADER_GEOMETRY)
2901                 emit_store_output_gs(c, instr);
2902         else
2903                 emit_store_output_vs(c, instr);
2904 }
2905 
2906 /**
2907  * This implementation is based on v3d_sample_{x,y}_offset() from
2908  * v3d_sample_offset.h.
2909  */
2910 static void
2911 ntq_get_sample_offset(struct v3d_compile *c, struct qreg sample_idx,
2912                       struct qreg *sx, struct qreg *sy)
2913 {
2914         sample_idx = vir_ITOF(c, sample_idx);
2915 
2916         struct qreg offset_x =
2917                 vir_FADD(c, vir_uniform_f(c, -0.125f),
2918                             vir_FMUL(c, sample_idx,
2919                                         vir_uniform_f(c, 0.5f)));
2920         vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(),
2921                                     vir_uniform_f(c, 2.0f), sample_idx),
2922                    V3D_QPU_PF_PUSHC);
2923         offset_x = vir_SEL(c, V3D_QPU_COND_IFA,
2924                               vir_FSUB(c, offset_x, vir_uniform_f(c, 1.25f)),
2925                               offset_x);
2926 
2927         struct qreg offset_y =
2928                    vir_FADD(c, vir_uniform_f(c, -0.375f),
2929                                vir_FMUL(c, sample_idx,
2930                                            vir_uniform_f(c, 0.25f)));
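        /* Roughly: sx = -0.125 + 0.5 * sample_idx (pulled back by 1.25 for
         * the larger sample indices so the offset stays inside the pixel)
         * and sy = -0.375 + 0.25 * sample_idx, matching v3d_sample_offset.h.
         */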
2931         *sx = offset_x;
2932         *sy = offset_y;
2933 }
2934 
2935 /**
2936  * This implementation is based on get_centroid_offset() from fep.c.
2937  */
2938 static void
2939 ntq_get_barycentric_centroid(struct v3d_compile *c,
2940                              struct qreg *out_x,
2941                              struct qreg *out_y)
2942 {
2943         struct qreg sample_mask;
2944         if (c->output_sample_mask_index != -1)
2945                 sample_mask = c->outputs[c->output_sample_mask_index];
2946         else
2947                 sample_mask = vir_MSF(c);
2948 
2949         struct qreg i0 = vir_uniform_ui(c, 0);
2950         struct qreg i1 = vir_uniform_ui(c, 1);
2951         struct qreg i2 = vir_uniform_ui(c, 2);
2952         struct qreg i3 = vir_uniform_ui(c, 3);
2953         struct qreg i4 = vir_uniform_ui(c, 4);
2954         struct qreg i8 = vir_uniform_ui(c, 8);
2955 
2956         /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */
2957         struct qreg F = vir_uniform_ui(c, 0);
2958         struct qreg T = vir_uniform_ui(c, ~0);
2959         struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1);
2960         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
2961         s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2962         struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2);
2963         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
2964         s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2965         struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4);
2966         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
2967         s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2968         struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8);
2969         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ);
2970         s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
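        /* Each sN above is (sample_mask & bit) ^ bit, which is zero exactly
         * when sample N is enabled; PUSHZ then sets the A flag for enabled
         * samples and the SEL converts that into an all-ones/zero boolean.
         */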
2971 
2972         /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */
2973         struct qreg sample_idx = i3;
2974         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
2975         sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i1, sample_idx);
2976         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
2977         sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i2, sample_idx);
2978         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
2979         sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i0, sample_idx);
2980 
2981         /* Get offset at selected sample index */
2982         struct qreg offset_x, offset_y;
2983         ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
2984 
2985         /* Select pixel center [offset=(0,0)] if two opposing samples (or none)
2986          * are selected.
2987          */
2988         struct qreg s0_and_s3 = vir_AND(c, s0, s3);
2989         struct qreg s1_and_s2 = vir_AND(c, s1, s2);
2990 
2991         struct qreg use_center = vir_XOR(c, sample_mask, vir_uniform_ui(c, 0));
2992         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
2993         use_center = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2994         use_center = vir_OR(c, use_center, s0_and_s3);
2995         use_center = vir_OR(c, use_center, s1_and_s2);
2996 
2997         struct qreg zero = vir_uniform_f(c, 0.0f);
2998         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
2999         offset_x = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_x);
3000         offset_y = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_y);
3001 
3002         *out_x = offset_x;
3003         *out_y = offset_y;
3004 }
3005 
3006 static struct qreg
3007 ntq_emit_load_interpolated_input(struct v3d_compile *c,
3008                                  struct qreg p,
3009                                  struct qreg C,
3010                                  struct qreg offset_x,
3011                                  struct qreg offset_y,
3012                                  unsigned mode)
3013 {
3014         if (mode == INTERP_MODE_FLAT)
3015                 return C;
3016 
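        /* For the non-flat modes the varying is rebuilt from p and its
         * screen-space derivatives as p + dFdx(p) * scaleX + dFdy(p) * scaleY,
         * where scaleX/Y move from the current sample position to the pixel
         * centre plus the requested offset. For perspective-correct inputs W
         * is interpolated the same way and multiplied in before C is added.
         */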
3017         struct qreg sample_offset_x =
3018                 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
3019         struct qreg sample_offset_y =
3020                 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
3021 
3022         struct qreg scaleX =
3023                 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_x),
3024                             offset_x);
3025         struct qreg scaleY =
3026                 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_y),
3027                             offset_y);
3028 
3029         struct qreg pInterp =
3030                 vir_FADD(c, p, vir_FADD(c, vir_FMUL(c, vir_FDX(c, p), scaleX),
3031                                            vir_FMUL(c, vir_FDY(c, p), scaleY)));
3032 
3033         if (mode == INTERP_MODE_NOPERSPECTIVE)
3034                 return vir_FADD(c, pInterp, C);
3035 
3036         struct qreg w = c->payload_w;
3037         struct qreg wInterp =
3038                 vir_FADD(c, w, vir_FADD(c, vir_FMUL(c, vir_FDX(c, w), scaleX),
3039                                            vir_FMUL(c, vir_FDY(c, w), scaleY)));
3040 
3041         return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C);
3042 }
3043 
3044 static void
3045 emit_ldunifa(struct v3d_compile *c, struct qreg *result)
3046 {
3047         struct qinst *ldunifa =
3048                 vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef);
3049         ldunifa->qpu.sig.ldunifa = true;
3050         if (result)
3051                 *result = vir_emit_def(c, ldunifa);
3052         else
3053                 vir_emit_nondef(c, ldunifa);
3054         c->current_unifa_offset += 4;
3055 }
3056 
3057 static bool
3058 ntq_emit_load_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
3059 {
3060         assert(instr->intrinsic == nir_intrinsic_load_ubo ||
3061                instr->intrinsic == nir_intrinsic_load_ssbo ||
3062                instr->intrinsic == nir_intrinsic_load_uniform);
3063 
3064         bool is_uniform = instr->intrinsic == nir_intrinsic_load_uniform;
3065         bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo;
3066         bool is_ssbo = instr->intrinsic == nir_intrinsic_load_ssbo;
3067 
3068         /* Every ldunifa auto-increments the unifa address by 4 bytes, so our
3069          * current unifa offset is 4 bytes ahead of the offset of the last load.
3070          */
3071         static const int32_t max_unifa_skip_dist =
3072                 MAX_UNIFA_SKIP_DISTANCE - 4;
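        /* Illustrative example: after a dword load at byte offset 16 the
         * tracked unifa offset is 20, so a later constant-offset load from
         * the same buffer at offset 28 only needs (28 - 20) / 4 = 2 dummy
         * ldunifa instructions instead of a fresh unifa write, provided the
         * distance does not exceed max_unifa_skip_dist.
         */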
3073 
3074         /* We can only use unifa if the offset is uniform */
3075         nir_src offset = is_uniform ? instr->src[0] : instr->src[1];
3076         if (nir_src_is_divergent(offset))
3077                 return false;
3078 
3079         /* We can only use unifa with SSBOs if they are read-only. Otherwise
3080          * ldunifa won't see the shader writes to that address (possibly
3081          * because ldunifa doesn't read from the L2T cache).
3082          */
3083         if (is_ssbo && !(nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE))
3084                 return false;
3085 
3086         /* Just as with SSBOs, we can't use ldunifa to read indirect uniforms
3087          * that may have been written to scratch using the TMU.
3088          */
3089         bool dynamic_src = !nir_src_is_const(offset);
3090         if (is_uniform && dynamic_src && c->s->scratch_size > 0)
3091                 return false;
3092 
3093         uint32_t const_offset = dynamic_src ? 0 : nir_src_as_uint(offset);
3094         if (is_uniform)
3095                 const_offset += nir_intrinsic_base(instr);
3096 
3097         /* ldunifa is a 32-bit load instruction so we can only use it with
3098          * 32-bit aligned addresses. We always produce 32-bit aligned addresses
3099          * except for types smaller than 32-bit, so in these cases we can only
3100          * use ldunifa if we can verify alignment, which we can only do for
3101          * loads with a constant offset.
3102          */
3103         uint32_t bit_size = nir_dest_bit_size(instr->dest);
3104         uint32_t value_skips = 0;
3105         if (bit_size < 32) {
3106                 if (dynamic_src) {
3107                         return false;
3108                 } else if (const_offset % 4 != 0) {
3109                         /* If we are loading from an unaligned offset, fix
3110                          * alignment and skip over unused elements in result.
3111                          */
3112                         value_skips = (const_offset % 4) / (bit_size / 8);
3113                         const_offset &= ~0x3;
3114                 }
3115         }
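        /* Illustrative example of the fixup above: a 16-bit load at
         * const_offset 6 becomes an aligned load at offset 4 with
         * value_skips = (6 % 4) / (16 / 8) = 1, i.e. the low half-word of
         * the 32-bit ldunifa result is skipped.
         */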
3116 
3117         assert((bit_size == 32 && value_skips == 0) ||
3118                (bit_size == 16 && value_skips <= 1) ||
3119                (bit_size == 8  && value_skips <= 3));
3120 
3121         /* Both Vulkan and OpenGL reserve index 0 for uniforms / push
3122          * constants.
3123          */
3124         uint32_t index = is_uniform ? 0 : nir_src_as_uint(instr->src[0]);
3125 
3126         /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index
3127          * shifted up by 1 (0 is gallium's constant buffer 0).
3128          */
3129         if (is_ubo && c->key->environment == V3D_ENVIRONMENT_OPENGL)
3130                 index++;
3131 
3132         /* We can only keep track of the last unifa address we used with
3133          * constant offset loads. If the new load targets the same buffer and
3134          * is close enough to the previous load, we can skip the unifa register
3135          * write by emitting dummy ldunifa instructions to update the unifa
3136          * address.
3137          */
3138         bool skip_unifa = false;
3139         uint32_t ldunifa_skips = 0;
3140         if (dynamic_src) {
3141                 c->current_unifa_block = NULL;
3142         } else if (c->cur_block == c->current_unifa_block &&
3143                    c->current_unifa_is_ubo == !is_ssbo &&
3144                    c->current_unifa_index == index &&
3145                    c->current_unifa_offset <= const_offset &&
3146                    c->current_unifa_offset + max_unifa_skip_dist >= const_offset) {
3147                 skip_unifa = true;
3148                 ldunifa_skips = (const_offset - c->current_unifa_offset) / 4;
3149         } else {
3150                 c->current_unifa_block = c->cur_block;
3151                 c->current_unifa_is_ubo = !is_ssbo;
3152                 c->current_unifa_index = index;
3153                 c->current_unifa_offset = const_offset;
3154         }
3155 
3156         if (!skip_unifa) {
3157                 struct qreg base_offset = !is_ssbo ?
3158                         vir_uniform(c, QUNIFORM_UBO_ADDR,
3159                                     v3d_unit_data_create(index, const_offset)) :
3160                         vir_uniform(c, QUNIFORM_SSBO_OFFSET, index);
3161 
3162                 struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
3163                 if (!dynamic_src) {
3164                         if (!is_ssbo) {
3165                                 vir_MOV_dest(c, unifa, base_offset);
3166                         } else {
3167                                 vir_ADD_dest(c, unifa, base_offset,
3168                                              vir_uniform_ui(c, const_offset));
3169                         }
3170                 } else {
3171                         vir_ADD_dest(c, unifa, base_offset,
3172                                      ntq_get_src(c, offset, 0));
3173                 }
3174         } else {
3175                 for (int i = 0; i < ldunifa_skips; i++)
3176                         emit_ldunifa(c, NULL);
3177         }
3178 
3179         uint32_t num_components = nir_intrinsic_dest_components(instr);
3180         for (uint32_t i = 0; i < num_components; ) {
3181                 struct qreg data;
3182                 emit_ldunifa(c, &data);
3183 
3184                 if (bit_size == 32) {
3185                         assert(value_skips == 0);
3186                         ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data));
3187                         i++;
3188                 } else {
3189                         assert((bit_size == 16 && value_skips <= 1) ||
3190                                (bit_size ==  8 && value_skips <= 3));
3191 
3192                         /* If we have any values to skip, shift to the first
3193                          * valid value in the ldunifa result.
3194                          */
3195                         if (value_skips > 0) {
3196                                 data = vir_SHR(c, data,
3197                                                vir_uniform_ui(c, bit_size *
3198                                                                  value_skips));
3199                         }
3200 
3201                         /* Check how many valid components we have discounting
3202                          * read components to skip.
3203                          */
3204                         uint32_t valid_count = (32 / bit_size) - value_skips;
3205                         assert((bit_size == 16 && valid_count <= 2) ||
3206                                (bit_size ==  8 && valid_count <= 4));
3207                         assert(valid_count > 0);
3208 
3209                         /* Process the valid components */
3210                         do {
3211                                 struct qreg tmp;
3212                                 uint32_t mask = (1 << bit_size) - 1;
3213                                 tmp = vir_AND(c, vir_MOV(c, data),
3214                                               vir_uniform_ui(c, mask));
3215                                 ntq_store_dest(c, &instr->dest, i,
3216                                                vir_MOV(c, tmp));
3217                                 i++;
3218                                 valid_count--;
3219 
3220                                 /* Shift to next component */
3221                                 if (i < num_components && valid_count > 0) {
3222                                         data = vir_SHR(c, data,
3223                                                        vir_uniform_ui(c, bit_size));
3224                                 }
3225                         } while (i < num_components && valid_count > 0);
3226                 }
3227         }
3228 
3229         return true;
3230 }
3231 
3232 static inline struct qreg
3233 emit_load_local_invocation_index(struct v3d_compile *c)
3234 {
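        /* cs_payload[1] carries the local invocation index in its upper
         * c->local_invocation_index_bits bits, so shifting right by
         * (32 - local_invocation_index_bits) extracts just the index.
         */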
3235         return vir_SHR(c, c->cs_payload[1],
3236                        vir_uniform_ui(c, 32 - c->local_invocation_index_bits));
3237 }
3238 
3239 /* Various subgroup operations rely on the A flags, so this helper ensures that
3240  * A flags represents currently active lanes in the subgroup.
3241  */
3242 static void
3243 set_a_flags_for_subgroup(struct v3d_compile *c)
3244 {
3245         /* MSF returns 0 for disabled lanes in compute shaders so
3246          * PUSHZ will set A=1 for disabled lanes. We want the inverse
3247          * of this but we don't have any means to negate the A flags
3248          * directly, but we can do it by repeating the same operation
3249          * with NORZ (A = ~A & ~Z).
3250          */
3251         assert(c->s->info.stage == MESA_SHADER_COMPUTE);
3252         vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
3253         vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ);
3254 
3255         /* If we are under non-uniform control flow we also need to
3256          * AND the A flags with the current execute mask.
3257          */
3258         if (vir_in_nonuniform_control_flow(c)) {
3259                 const uint32_t bidx = c->cur_block->index;
3260                 vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(),
3261                                            c->execute,
3262                                            vir_uniform_ui(c, bidx)),
3263                            V3D_QPU_UF_ANDZ);
3264         }
3265 }
3266 
3267 static void
3268 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
3269 {
3270         switch (instr->intrinsic) {
3271         case nir_intrinsic_load_uniform:
3272                 ntq_emit_load_uniform(c, instr);
3273                 break;
3274 
3275         case nir_intrinsic_load_ubo:
3276                 if (ntq_emit_inline_ubo_load(c, instr))
3277                         break;
3278                 FALLTHROUGH;
3279         case nir_intrinsic_load_ssbo:
3280                 if (!ntq_emit_load_unifa(c, instr)) {
3281                         ntq_emit_tmu_general(c, instr, false);
3282                         c->has_general_tmu_load = true;
3283                 }
3284                 break;
3285 
3286         case nir_intrinsic_ssbo_atomic_add:
3287         case nir_intrinsic_ssbo_atomic_imin:
3288         case nir_intrinsic_ssbo_atomic_umin:
3289         case nir_intrinsic_ssbo_atomic_imax:
3290         case nir_intrinsic_ssbo_atomic_umax:
3291         case nir_intrinsic_ssbo_atomic_and:
3292         case nir_intrinsic_ssbo_atomic_or:
3293         case nir_intrinsic_ssbo_atomic_xor:
3294         case nir_intrinsic_ssbo_atomic_exchange:
3295         case nir_intrinsic_ssbo_atomic_comp_swap:
3296         case nir_intrinsic_store_ssbo:
3297                 ntq_emit_tmu_general(c, instr, false);
3298                 break;
3299 
3300         case nir_intrinsic_shared_atomic_add:
3301         case nir_intrinsic_shared_atomic_imin:
3302         case nir_intrinsic_shared_atomic_umin:
3303         case nir_intrinsic_shared_atomic_imax:
3304         case nir_intrinsic_shared_atomic_umax:
3305         case nir_intrinsic_shared_atomic_and:
3306         case nir_intrinsic_shared_atomic_or:
3307         case nir_intrinsic_shared_atomic_xor:
3308         case nir_intrinsic_shared_atomic_exchange:
3309         case nir_intrinsic_shared_atomic_comp_swap:
3310         case nir_intrinsic_store_shared:
3311         case nir_intrinsic_store_scratch:
3312                 ntq_emit_tmu_general(c, instr, true);
3313                 break;
3314 
3315         case nir_intrinsic_load_scratch:
3316         case nir_intrinsic_load_shared:
3317                 ntq_emit_tmu_general(c, instr, true);
3318                 c->has_general_tmu_load = true;
3319                 break;
3320 
3321         case nir_intrinsic_image_store:
3322         case nir_intrinsic_image_atomic_add:
3323         case nir_intrinsic_image_atomic_imin:
3324         case nir_intrinsic_image_atomic_umin:
3325         case nir_intrinsic_image_atomic_imax:
3326         case nir_intrinsic_image_atomic_umax:
3327         case nir_intrinsic_image_atomic_and:
3328         case nir_intrinsic_image_atomic_or:
3329         case nir_intrinsic_image_atomic_xor:
3330         case nir_intrinsic_image_atomic_exchange:
3331         case nir_intrinsic_image_atomic_comp_swap:
3332                 v3d40_vir_emit_image_load_store(c, instr);
3333                 break;
3334 
3335         case nir_intrinsic_image_load:
3336                 v3d40_vir_emit_image_load_store(c, instr);
3337                 /* Not really a general TMU load, but we only use this flag
3338                  * for NIR scheduling and we do schedule these under the same
3339                  * policy as general TMU.
3340                  */
3341                 c->has_general_tmu_load = true;
3342                 break;
3343 
3344         case nir_intrinsic_get_ssbo_size:
3345                 ntq_store_dest(c, &instr->dest, 0,
3346                                vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
3347                                            nir_src_comp_as_uint(instr->src[0], 0)));
3348                 break;
3349 
3350         case nir_intrinsic_get_ubo_size:
3351                 ntq_store_dest(c, &instr->dest, 0,
3352                                vir_uniform(c, QUNIFORM_GET_UBO_SIZE,
3353                                            nir_src_comp_as_uint(instr->src[0], 0)));
3354                 break;
3355 
3356         case nir_intrinsic_load_user_clip_plane:
3357                 for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
3358                         ntq_store_dest(c, &instr->dest, i,
3359                                        vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
3360                                                    nir_intrinsic_ucp_id(instr) *
3361                                                    4 + i));
3362                 }
3363                 break;
3364 
3365         case nir_intrinsic_load_viewport_x_scale:
3366                 ntq_store_dest(c, &instr->dest, 0,
3367                                vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
3368                 break;
3369 
3370         case nir_intrinsic_load_viewport_y_scale:
3371                 ntq_store_dest(c, &instr->dest, 0,
3372                                vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
3373                 break;
3374 
3375         case nir_intrinsic_load_viewport_z_scale:
3376                 ntq_store_dest(c, &instr->dest, 0,
3377                                vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
3378                 break;
3379 
3380         case nir_intrinsic_load_viewport_z_offset:
3381                 ntq_store_dest(c, &instr->dest, 0,
3382                                vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
3383                 break;
3384 
3385         case nir_intrinsic_load_line_coord:
3386                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x));
3387                 break;
3388 
3389         case nir_intrinsic_load_line_width:
3390                 ntq_store_dest(c, &instr->dest, 0,
3391                                vir_uniform(c, QUNIFORM_LINE_WIDTH, 0));
3392                 break;
3393 
3394         case nir_intrinsic_load_aa_line_width:
3395                 ntq_store_dest(c, &instr->dest, 0,
3396                                vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0));
3397                 break;
3398 
3399         case nir_intrinsic_load_sample_mask_in:
3400                 ntq_store_dest(c, &instr->dest, 0, vir_MSF(c));
3401                 break;
3402 
3403         case nir_intrinsic_load_helper_invocation:
3404                 vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
3405                 struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
3406                 ntq_store_dest(c, &instr->dest, 0, qdest);
3407                 break;
3408 
3409         case nir_intrinsic_load_front_face:
3410                 /* The register contains 0 (front) or 1 (back), and we need to
3411                  * turn it into a NIR bool where true means front.
3412                  */
3413                 ntq_store_dest(c, &instr->dest, 0,
3414                                vir_ADD(c,
3415                                        vir_uniform_ui(c, -1),
3416                                        vir_REVF(c)));
3417                 break;
3418 
3419         case nir_intrinsic_load_base_instance:
3420                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid));
3421                 break;
3422 
3423         case nir_intrinsic_load_instance_id:
3424                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid));
3425                 break;
3426 
3427         case nir_intrinsic_load_vertex_id:
3428                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
3429                 break;
3430 
3431         case nir_intrinsic_load_tlb_color_v3d:
3432                 vir_emit_tlb_color_read(c, instr);
3433                 break;
3434 
3435         case nir_intrinsic_load_input:
3436                 ntq_emit_load_input(c, instr);
3437                 break;
3438 
3439         case nir_intrinsic_store_tlb_sample_color_v3d:
3440                 ntq_emit_per_sample_color_write(c, instr);
3441                 break;
3442 
3443         case nir_intrinsic_store_output:
3444                 ntq_emit_store_output(c, instr);
3445                 break;
3446 
3447         case nir_intrinsic_image_size:
3448                 ntq_emit_image_size(c, instr);
3449                 break;
3450 
3451         case nir_intrinsic_discard:
3452                 ntq_flush_tmu(c);
3453 
3454                 if (vir_in_nonuniform_control_flow(c)) {
3455                         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3456                                    V3D_QPU_PF_PUSHZ);
3457                         vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
3458                                                      vir_uniform_ui(c, 0)),
3459                                 V3D_QPU_COND_IFA);
3460                 } else {
3461                         vir_SETMSF_dest(c, vir_nop_reg(),
3462                                         vir_uniform_ui(c, 0));
3463                 }
3464                 break;
3465 
3466         case nir_intrinsic_discard_if: {
3467                 ntq_flush_tmu(c);
3468 
3469                 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);
3470 
3471                 if (vir_in_nonuniform_control_flow(c)) {
3472                         struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(),
3473                                                                c->execute);
3474                         if (cond == V3D_QPU_COND_IFA) {
3475                                 vir_set_uf(c, exec_flag, V3D_QPU_UF_ANDZ);
3476                         } else {
3477                                 vir_set_uf(c, exec_flag, V3D_QPU_UF_NORNZ);
3478                                 cond = V3D_QPU_COND_IFA;
3479                         }
3480                 }
3481 
3482                 vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
3483                                              vir_uniform_ui(c, 0)), cond);
3484 
3485                 break;
3486         }
3487 
3488         case nir_intrinsic_memory_barrier:
3489         case nir_intrinsic_memory_barrier_buffer:
3490         case nir_intrinsic_memory_barrier_image:
3491         case nir_intrinsic_memory_barrier_shared:
3492         case nir_intrinsic_memory_barrier_tcs_patch:
3493         case nir_intrinsic_group_memory_barrier:
3494                 /* We don't do any instruction scheduling of these NIR
3495                  * instructions between each other, so we just need to make
3496                  * sure that the TMU operations before the barrier are flushed
3497                  * before the ones after the barrier.
3498                  */
3499                 ntq_flush_tmu(c);
3500                 break;
3501 
3502         case nir_intrinsic_control_barrier:
3503                 /* Emit a TSY op to get all invocations in the workgroup
3504                  * (actually supergroup) to block until the last invocation
3505                  * reaches the TSY op.
3506                  */
3507                 ntq_flush_tmu(c);
3508 
3509                 if (c->devinfo->ver >= 42) {
3510                         vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
3511                                                       V3D_QPU_WADDR_SYNCB));
3512                 } else {
3513                         struct qinst *sync =
3514                                 vir_BARRIERID_dest(c,
3515                                                    vir_reg(QFILE_MAGIC,
3516                                                            V3D_QPU_WADDR_SYNCU));
3517                         sync->uniform =
3518                                 vir_get_uniform_index(c, QUNIFORM_CONSTANT,
3519                                                       0xffffff00 |
3520                                                       V3D_TSY_WAIT_INC_CHECK);
3521 
3522                 }
3523 
3524                 /* The blocking of a TSY op only happens at the next thread
3525                  * switch.  No texturing may be outstanding at the time of a
3526                  * TSY blocking operation.
3527                  */
3528                 vir_emit_thrsw(c);
3529                 break;
3530 
3531         case nir_intrinsic_load_num_workgroups:
3532                 for (int i = 0; i < 3; i++) {
3533                         ntq_store_dest(c, &instr->dest, i,
3534                                        vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
3535                                                    i));
3536                 }
3537                 break;
3538 
3539         case nir_intrinsic_load_workgroup_id: {
3540                 struct qreg x = vir_AND(c, c->cs_payload[0],
3541                                          vir_uniform_ui(c, 0xffff));
3542 
3543                 struct qreg y = vir_SHR(c, c->cs_payload[0],
3544                                          vir_uniform_ui(c, 16));
3545 
3546                 struct qreg z = vir_AND(c, c->cs_payload[1],
3547                                          vir_uniform_ui(c, 0xffff));
3548 
3549                 /* We only support dispatch base in Vulkan */
3550                 if (c->key->environment == V3D_ENVIRONMENT_VULKAN) {
3551                         x = vir_ADD(c, x,
3552                                     vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0));
3553                         y = vir_ADD(c, y,
3554                                     vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1));
3555                         z = vir_ADD(c, z,
3556                                     vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2));
3557                 }
3558 
3559                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x));
3560                 ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y));
3561                 ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z));
3562                 break;
3563         }
3564 
3565         case nir_intrinsic_load_local_invocation_index:
3566                 ntq_store_dest(c, &instr->dest, 0,
3567                                emit_load_local_invocation_index(c));
3568                 break;
3569 
3570         case nir_intrinsic_load_subgroup_id: {
3571                 /* This is basically the batch index, which is the Local
3572                  * Invocation Index divided by the SIMD width.
3573                  */
3574                 STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS));
3575                 const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1;
3576                 struct qreg lii = emit_load_local_invocation_index(c);
3577                 ntq_store_dest(c, &instr->dest, 0,
3578                                vir_SHR(c, lii,
3579                                        vir_uniform_ui(c, divide_shift)));
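                /* e.g. with a 16-lane QPU (V3D_CHANNELS == 16) this is
                 * simply local_invocation_index >> 4.
                 */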
3580                 break;
3581         }
3582 
3583         case nir_intrinsic_load_per_vertex_input: {
3584                 /* The vertex shader writes all its used outputs into
3585                  * consecutive VPM offsets, so if any output component is
3586                  * unused, its VPM offset is used by the next used
3587                  * component. This means that we can't assume that each
3588                  * location will use 4 consecutive scalar offsets in the VPM
3589                  * and we need to compute the VPM offset for each input by
3590                  * going through the inputs and finding the one that matches
3591                  * our location and component.
3592                  *
3593                  * col: vertex index, row = varying index
3594                  */
3595                 assert(nir_src_is_const(instr->src[1]));
3596                 uint32_t location =
3597                         nir_intrinsic_io_semantics(instr).location +
3598                         nir_src_as_uint(instr->src[1]);
3599                 uint32_t component = nir_intrinsic_component(instr);
3600 
3601                 int32_t row_idx = -1;
3602                 for (int i = 0; i < c->num_inputs; i++) {
3603                         struct v3d_varying_slot slot = c->input_slots[i];
3604                         if (v3d_slot_get_slot(slot) == location &&
3605                             v3d_slot_get_component(slot) == component) {
3606                                 row_idx = i;
3607                                 break;
3608                         }
3609                 }
3610 
3611                 assert(row_idx != -1);
3612 
3613                 struct qreg col = ntq_get_src(c, instr->src[0], 0);
3614                 for (int i = 0; i < instr->num_components; i++) {
3615                         struct qreg row = vir_uniform_ui(c, row_idx++);
3616                         ntq_store_dest(c, &instr->dest, i,
3617                                        vir_LDVPMG_IN(c, row, col));
3618                 }
3619                 break;
3620         }
3621 
3622         case nir_intrinsic_emit_vertex:
3623         case nir_intrinsic_end_primitive:
3624                 unreachable("Should have been lowered in v3d_nir_lower_io");
3625                 break;
3626 
3627         case nir_intrinsic_load_primitive_id: {
3628                 /* gl_PrimitiveIdIn is written by the GBG in the first word of
3629                  * VPM output header. According to docs, we should read this
3630                  * using ldvpm(v,d)_in (See Table 71).
3631                  */
3632                 assert(c->s->info.stage == MESA_SHADER_GEOMETRY);
3633                 ntq_store_dest(c, &instr->dest, 0,
3634                                vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
3635                 break;
3636         }
3637 
3638         case nir_intrinsic_load_invocation_id:
3639                 ntq_store_dest(c, &instr->dest, 0, vir_IID(c));
3640                 break;
3641 
3642         case nir_intrinsic_load_fb_layers_v3d:
3643                 ntq_store_dest(c, &instr->dest, 0,
3644                                vir_uniform(c, QUNIFORM_FB_LAYERS, 0));
3645                 break;
3646 
3647         case nir_intrinsic_load_sample_id:
3648                 ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c));
3649                 break;
3650 
3651         case nir_intrinsic_load_sample_pos:
3652                 ntq_store_dest(c, &instr->dest, 0,
3653                                vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))));
3654                 ntq_store_dest(c, &instr->dest, 1,
3655                                vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))));
3656                 break;
3657 
3658         case nir_intrinsic_load_barycentric_at_offset:
3659                 ntq_store_dest(c, &instr->dest, 0,
3660                                vir_MOV(c, ntq_get_src(c, instr->src[0], 0)));
3661                 ntq_store_dest(c, &instr->dest, 1,
3662                                vir_MOV(c, ntq_get_src(c, instr->src[0], 1)));
3663                 break;
3664 
3665         case nir_intrinsic_load_barycentric_pixel:
3666                 ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f));
3667                 ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f));
3668                 break;
3669 
3670         case nir_intrinsic_load_barycentric_at_sample: {
3671                 if (!c->fs_key->msaa) {
3672                         ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f));
3673                         ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f));
3674                         return;
3675                 }
3676 
3677                 struct qreg offset_x, offset_y;
3678                 struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0);
3679                 ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
3680 
3681                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x));
3682                 ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y));
3683                 break;
3684         }
3685 
3686         case nir_intrinsic_load_barycentric_sample: {
3687                 struct qreg offset_x =
3688                         vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
3689                 struct qreg offset_y =
3690                         vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
3691 
3692                 ntq_store_dest(c, &instr->dest, 0,
3693                                   vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f)));
3694                 ntq_store_dest(c, &instr->dest, 1,
3695                                   vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f)));
3696                 break;
3697         }
3698 
3699         case nir_intrinsic_load_barycentric_centroid: {
3700                 struct qreg offset_x, offset_y;
3701                 ntq_get_barycentric_centroid(c, &offset_x, &offset_y);
3702                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x));
3703                 ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y));
3704                 break;
3705         }
3706 
3707         case nir_intrinsic_load_interpolated_input: {
3708                 assert(nir_src_is_const(instr->src[1]));
3709                 const uint32_t offset = nir_src_as_uint(instr->src[1]);
3710 
3711                 for (int i = 0; i < instr->num_components; i++) {
3712                         const uint32_t input_idx =
3713                                 (nir_intrinsic_base(instr) + offset) * 4 +
3714                                 nir_intrinsic_component(instr) + i;
3715 
3716                         /* If we are not in MSAA or if we are not interpolating
3717                          * a user varying, just return the pre-computed
3718                          * interpolated input.
3719                          */
3720                         if (!c->fs_key->msaa ||
3721                             c->interp[input_idx].vp.file == QFILE_NULL) {
3722                                 ntq_store_dest(c, &instr->dest, i,
3723                                                vir_MOV(c, c->inputs[input_idx]));
3724                                 continue;
3725                         }
3726 
3727                         /* Otherwise compute interpolation at the specified
3728                          * offset.
3729                          */
3730                         struct qreg p = c->interp[input_idx].vp;
3731                         struct qreg C = c->interp[input_idx].C;
3732                         unsigned interp_mode = c->interp[input_idx].mode;
3733 
3734                         struct qreg offset_x = ntq_get_src(c, instr->src[0], 0);
3735                         struct qreg offset_y = ntq_get_src(c, instr->src[0], 1);
3736 
3737                         struct qreg result =
3738                               ntq_emit_load_interpolated_input(c, p, C,
3739                                                                offset_x, offset_y,
3740                                                                interp_mode);
3741                         ntq_store_dest(c, &instr->dest, i, result);
3742                 }
3743                 break;
3744         }
3745 
3746         case nir_intrinsic_load_subgroup_size:
3747                 ntq_store_dest(c, &instr->dest, 0,
3748                                vir_uniform_ui(c, V3D_CHANNELS));
3749                 break;
3750 
3751         case nir_intrinsic_load_subgroup_invocation:
3752                 ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
3753                 break;
3754 
3755         case nir_intrinsic_elect: {
3756                 set_a_flags_for_subgroup(c);
3757                 struct qreg first = vir_FLAFIRST(c);
3758 
3759                 /* Produce a boolean result from Flafirst */
3760                 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3761                                            first, vir_uniform_ui(c, 1)),
3762                                            V3D_QPU_PF_PUSHZ);
3763                 struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
3764                 ntq_store_dest(c, &instr->dest, 0, result);
3765                 break;
3766         }
3767 
3768         case nir_intrinsic_load_num_subgroups:
3769                 unreachable("Should have been lowered");
3770                 break;
3771 
3772         case nir_intrinsic_load_view_index:
3773                 ntq_store_dest(c, &instr->dest, 0,
3774                                vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
3775                 break;
3776 
3777         default:
3778                 fprintf(stderr, "Unknown intrinsic: ");
3779                 nir_print_instr(&instr->instr, stderr);
3780                 fprintf(stderr, "\n");
3781                 break;
3782         }
3783 }
3784 
3785 /* Clears (activates) the execute flags for any channels whose jump target
3786  * matches this block.
3787  *
3788  * XXX perf: Could we be using flpush/flpop somehow for our execution channel
3789  * enabling?
3791  */
3792 static void
3793 ntq_activate_execute_for_block(struct v3d_compile *c)
3794 {
3795         vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3796                                 c->execute, vir_uniform_ui(c, c->cur_block->index)),
3797                    V3D_QPU_PF_PUSHZ);
3798 
3799         vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
3800 }
3801 
3802 static void
3803 ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
3804 {
3805         nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
3806         bool empty_else_block =
3807                 (nir_else_block == nir_if_last_else_block(if_stmt) &&
3808                  exec_list_is_empty(&nir_else_block->instr_list));
3809 
3810         struct qblock *then_block = vir_new_block(c);
3811         struct qblock *after_block = vir_new_block(c);
3812         struct qblock *else_block;
3813         if (empty_else_block)
3814                 else_block = after_block;
3815         else
3816                 else_block = vir_new_block(c);
3817 
3818         /* Check if this if statement is really just a conditional jump with
3819          * the form:
3820          *
3821          * if (cond) {
3822          *    break/continue;
3823          * } else {
3824          * }
3825          *
3826          * In which case we can skip the jump to ELSE we emit before the THEN
3827          * block and instead just emit the break/continue directly.
3828          */
3829         nir_jump_instr *conditional_jump = NULL;
3830         if (empty_else_block) {
3831                 nir_block *nir_then_block = nir_if_first_then_block(if_stmt);
3832                 struct nir_instr *inst = nir_block_first_instr(nir_then_block);
3833                 if (inst && inst->type == nir_instr_type_jump)
3834                         conditional_jump = nir_instr_as_jump(inst);
3835         }
3836 
3837         /* Set up the flags for the IF condition (taking the THEN branch). */
3838         enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
3839 
3840         if (!conditional_jump) {
3841                 /* Jump to ELSE. */
3842                 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
3843                            V3D_QPU_BRANCH_COND_ANYNA :
3844                            V3D_QPU_BRANCH_COND_ANYA);
3845                 /* Pixels that were not dispatched or have been discarded
3846                  * should not contribute to the ANYA/ANYNA condition.
3847                  */
3848                 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
3849 
3850                 vir_link_blocks(c->cur_block, else_block);
3851                 vir_link_blocks(c->cur_block, then_block);
3852 
3853                 /* Process the THEN block. */
3854                 vir_set_emit_block(c, then_block);
3855                 ntq_emit_cf_list(c, &if_stmt->then_list);
3856 
3857                 if (!empty_else_block) {
3858                         /* At the end of the THEN block, jump to ENDIF, unless
3859                          * the block ended in a break or continue.
3860                          */
3861                         if (!c->cur_block->branch_emitted) {
3862                                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
3863                                 vir_link_blocks(c->cur_block, after_block);
3864                         }
3865 
3866                         /* Emit the else block. */
3867                         vir_set_emit_block(c, else_block);
3868                         ntq_emit_cf_list(c, &if_stmt->else_list);
3869                 }
3870         } else {
3871                 /* Emit the conditional jump directly.
3872                  *
3873                  * Use ALL with breaks and ANY with continues to ensure that
3874                  * we always break and never continue when all lanes have been
3875                  * disabled (for example because of discards) to prevent
3876                  * infinite loops.
3877                  */
3878                 assert(conditional_jump &&
3879                        (conditional_jump->type == nir_jump_continue ||
3880                         conditional_jump->type == nir_jump_break));
3881 
3882                 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
3883                            (conditional_jump->type == nir_jump_break ?
3884                             V3D_QPU_BRANCH_COND_ALLA :
3885                             V3D_QPU_BRANCH_COND_ANYA) :
3886                            (conditional_jump->type == nir_jump_break ?
3887                             V3D_QPU_BRANCH_COND_ALLNA :
3888                             V3D_QPU_BRANCH_COND_ANYNA));
3889                 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
3890 
3891                 vir_link_blocks(c->cur_block,
3892                                 conditional_jump->type == nir_jump_break ?
3893                                         c->loop_break_block :
3894                                         c->loop_cont_block);
3895         }
3896 
3897         vir_link_blocks(c->cur_block, after_block);
3898 
3899         vir_set_emit_block(c, after_block);
3900 }
3901 
3902 static void
3903 ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
3904 {
3905         nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
3906         bool empty_else_block =
3907                 (nir_else_block == nir_if_last_else_block(if_stmt) &&
3908                  exec_list_is_empty(&nir_else_block->instr_list));
3909 
3910         struct qblock *then_block = vir_new_block(c);
3911         struct qblock *after_block = vir_new_block(c);
3912         struct qblock *else_block;
3913         if (empty_else_block)
3914                 else_block = after_block;
3915         else
3916                 else_block = vir_new_block(c);
3917 
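        /* If we are entering non-uniform control flow from uniform code,
         * start with every channel active (execute == 0).
         */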
3918         bool was_uniform_control_flow = false;
3919         if (!vir_in_nonuniform_control_flow(c)) {
3920                 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
3921                 was_uniform_control_flow = true;
3922         }
3923 
3924         /* Set up the flags for the IF condition (taking the THEN branch). */
3925         enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
3926 
3927         /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
3928          * was previously active (execute Z)" for updating the exec flags.
3929          */
3930         if (was_uniform_control_flow) {
3931                 cond = v3d_qpu_cond_invert(cond);
3932         } else {
3933                 struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute);
3934                 if (cond == V3D_QPU_COND_IFA) {
3935                         vir_set_uf(c, inst, V3D_QPU_UF_NORNZ);
3936                 } else {
3937                         vir_set_uf(c, inst, V3D_QPU_UF_ANDZ);
3938                         cond = V3D_QPU_COND_IFA;
3939                 }
3940         }
3941 
3942         vir_MOV_cond(c, cond,
3943                      c->execute,
3944                      vir_uniform_ui(c, else_block->index));
3945 
3946         /* Jump to ELSE if nothing is active for THEN, otherwise fall
3947          * through.
3948          */
3949         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
3950         vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
3951         vir_link_blocks(c->cur_block, else_block);
3952         vir_link_blocks(c->cur_block, then_block);
3953 
3954         /* Process the THEN block. */
3955         vir_set_emit_block(c, then_block);
3956         ntq_emit_cf_list(c, &if_stmt->then_list);
3957 
3958         if (!empty_else_block) {
3959                 /* Handle the end of the THEN block.  First, all currently
3960                  * active channels update their execute flags to point to
3961                  * ENDIF
3962                  */
3963                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3964                            V3D_QPU_PF_PUSHZ);
3965                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
3966                              vir_uniform_ui(c, after_block->index));
3967 
3968                 /* If everything points at ENDIF, then jump there immediately. */
3969                 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3970                                         c->execute,
3971                                         vir_uniform_ui(c, after_block->index)),
3972                            V3D_QPU_PF_PUSHZ);
3973                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
3974                 vir_link_blocks(c->cur_block, after_block);
3975                 vir_link_blocks(c->cur_block, else_block);
3976 
3977                 vir_set_emit_block(c, else_block);
3978                 ntq_activate_execute_for_block(c);
3979                 ntq_emit_cf_list(c, &if_stmt->else_list);
3980         }
3981 
3982         vir_link_blocks(c->cur_block, after_block);
3983 
3984         vir_set_emit_block(c, after_block);
3985         if (was_uniform_control_flow)
3986                 c->execute = c->undef;
3987         else
3988                 ntq_activate_execute_for_block(c);
3989 }
3990 
3991 static void
3992 ntq_emit_if(struct v3d_compile *c, nir_if *nif)
3993 {
3994         bool was_in_control_flow = c->in_control_flow;
3995         c->in_control_flow = true;
3996         if (!vir_in_nonuniform_control_flow(c) &&
3997             !nir_src_is_divergent(nif->condition)) {
3998                 ntq_emit_uniform_if(c, nif);
3999         } else {
4000                 ntq_emit_nonuniform_if(c, nif);
4001         }
4002         c->in_control_flow = was_in_control_flow;
4003 }
4004 
4005 static void
4006 ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
4007 {
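        /* In non-uniform control flow we don't branch for break/continue;
         * instead, the channels taking the jump park themselves by setting
         * their execute value to the index of the break/continue block.
         */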
4008         switch (jump->type) {
4009         case nir_jump_break:
4010                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4011                            V3D_QPU_PF_PUSHZ);
4012                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4013                              vir_uniform_ui(c, c->loop_break_block->index));
4014                 break;
4015 
4016         case nir_jump_continue:
4017                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
4018                            V3D_QPU_PF_PUSHZ);
4019                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
4020                              vir_uniform_ui(c, c->loop_cont_block->index));
4021                 break;
4022 
4023         case nir_jump_return:
4024                 unreachable("All returns should be lowered\n");
4025                 break;
4026 
4027         case nir_jump_halt:
4028         case nir_jump_goto:
4029         case nir_jump_goto_if:
4030                 unreachable("not supported\n");
4031                 break;
4032         }
4033 }
4034 
4035 static void
4036 ntq_emit_uniform_jump(struct v3d_compile *c, nir_jump_instr *jump)
4037 {
4038         switch (jump->type) {
4039         case nir_jump_break:
4040                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4041                 vir_link_blocks(c->cur_block, c->loop_break_block);
4042                 c->cur_block->branch_emitted = true;
4043                 break;
4044         case nir_jump_continue:
4045                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4046                 vir_link_blocks(c->cur_block, c->loop_cont_block);
4047                 c->cur_block->branch_emitted = true;
4048                 break;
4049 
4050         case nir_jump_return:
4051                 unreachable("All returns should be lowered\n");
4052                 break;
4053 
4054         case nir_jump_halt:
4055         case nir_jump_goto:
4056         case nir_jump_goto_if:
4057                 unreachable("not supported\n");
4058                 break;
4059         }
4060 }
4061 
4062 static void
4063 ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
4064 {
4065         switch (instr->type) {
4066         case nir_instr_type_alu:
4067                 ntq_emit_alu(c, nir_instr_as_alu(instr));
4068                 break;
4069 
4070         case nir_instr_type_intrinsic:
4071                 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
4072                 break;
4073 
4074         case nir_instr_type_load_const:
4075                 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
4076                 break;
4077 
4078         case nir_instr_type_ssa_undef:
4079                 unreachable("Should've been lowered by nir_lower_undef_to_zero");
4080                 break;
4081 
4082         case nir_instr_type_tex:
4083                 ntq_emit_tex(c, nir_instr_as_tex(instr));
4084                 break;
4085 
4086         case nir_instr_type_jump:
4087                 /* Always flush TMU before jumping to another block, for the
4088                  * same reasons as in ntq_emit_block.
4089                  */
4090                 ntq_flush_tmu(c);
4091                 if (vir_in_nonuniform_control_flow(c))
4092                         ntq_emit_jump(c, nir_instr_as_jump(instr));
4093                 else
4094                         ntq_emit_uniform_jump(c, nir_instr_as_jump(instr));
4095                 break;
4096 
4097         default:
4098                 fprintf(stderr, "Unknown NIR instr type: ");
4099                 nir_print_instr(instr, stderr);
4100                 fprintf(stderr, "\n");
4101                 abort();
4102         }
4103 }
4104 
4105 static void
4106 ntq_emit_block(struct v3d_compile *c, nir_block *block)
4107 {
4108         nir_foreach_instr(instr, block) {
4109                 ntq_emit_instr(c, instr);
4110         }
4111 
4112         /* Always process pending TMU operations in the same block they were
4113          * emitted: we can't emit TMU operations in a block and then emit a
4114          * thread switch and LDTMU/TMUWT for them in another block, possibly
4115          * under control flow.
4116          */
4117         ntq_flush_tmu(c);
4118 }
4119 
4120 static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
4121 
4122 static void
4123 ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop)
4124 {
4125         bool was_uniform_control_flow = false;
4126         if (!vir_in_nonuniform_control_flow(c)) {
4127                 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
4128                 was_uniform_control_flow = true;
4129         }
4130 
4131         c->loop_cont_block = vir_new_block(c);
4132         c->loop_break_block = vir_new_block(c);
4133 
4134         vir_link_blocks(c->cur_block, c->loop_cont_block);
4135         vir_set_emit_block(c, c->loop_cont_block);
4136         ntq_activate_execute_for_block(c);
4137 
4138         ntq_emit_cf_list(c, &loop->body);
4139 
4140         /* Re-enable any previous continues now, so our ANYA check below
4141          * works.
4142          *
4143          * XXX: Use the .ORZ flags update, instead.
4144          */
4145         vir_set_pf(c, vir_XOR_dest(c,
4146                                 vir_nop_reg(),
4147                                 c->execute,
4148                                 vir_uniform_ui(c, c->loop_cont_block->index)),
4149                    V3D_QPU_PF_PUSHZ);
4150         vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
4151 
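        /* Loop back to the top if any channel is still active
         * (execute == 0); otherwise fall through to the break block.
         */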
4152         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
4153 
4154         struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
4155         /* Pixels that were not dispatched or have been discarded should not
4156          * contribute to looping again.
4157          */
4158         branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
4159         vir_link_blocks(c->cur_block, c->loop_cont_block);
4160         vir_link_blocks(c->cur_block, c->loop_break_block);
4161 
4162         vir_set_emit_block(c, c->loop_break_block);
4163         if (was_uniform_control_flow)
4164                 c->execute = c->undef;
4165         else
4166                 ntq_activate_execute_for_block(c);
4167 }
4168 
4169 static void
4170 ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop)
4171 {
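        /* Uniform loops need no execute-mask bookkeeping: all channels run
         * in lockstep, so we just emit the body into the continue block and
         * branch back to it unconditionally unless the body already ended in
         * its own break/continue branch.
         */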
4173         c->loop_cont_block = vir_new_block(c);
4174         c->loop_break_block = vir_new_block(c);
4175 
4176         vir_link_blocks(c->cur_block, c->loop_cont_block);
4177         vir_set_emit_block(c, c->loop_cont_block);
4178 
4179         ntq_emit_cf_list(c, &loop->body);
4180 
4181         if (!c->cur_block->branch_emitted) {
4182                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
4183                 vir_link_blocks(c->cur_block, c->loop_cont_block);
4184         }
4185 
4186         vir_set_emit_block(c, c->loop_break_block);
4187 }
4188 
4189 static void
4190 ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
4191 {
4192         bool was_in_control_flow = c->in_control_flow;
4193         c->in_control_flow = true;
4194 
4195         struct qblock *save_loop_cont_block = c->loop_cont_block;
4196         struct qblock *save_loop_break_block = c->loop_break_block;
4197 
4198         if (vir_in_nonuniform_control_flow(c) || loop->divergent) {
4199                 ntq_emit_nonuniform_loop(c, loop);
4200         } else {
4201                 ntq_emit_uniform_loop(c, loop);
4202         }
4203 
4204         c->loop_break_block = save_loop_break_block;
4205         c->loop_cont_block = save_loop_cont_block;
4206 
4207         c->loops++;
4208 
4209         c->in_control_flow = was_in_control_flow;
4210 }
4211 
4212 static void
4213 ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
4214 {
4215         fprintf(stderr, "FUNCTIONS not handled.\n");
4216         abort();
4217 }
4218 
4219 static void
4220 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
4221 {
4222         foreach_list_typed(nir_cf_node, node, node, list) {
4223                 switch (node->type) {
4224                 case nir_cf_node_block:
4225                         ntq_emit_block(c, nir_cf_node_as_block(node));
4226                         break;
4227 
4228                 case nir_cf_node_if:
4229                         ntq_emit_if(c, nir_cf_node_as_if(node));
4230                         break;
4231 
4232                 case nir_cf_node_loop:
4233                         ntq_emit_loop(c, nir_cf_node_as_loop(node));
4234                         break;
4235 
4236                 case nir_cf_node_function:
4237                         ntq_emit_function(c, nir_cf_node_as_function(node));
4238                         break;
4239 
4240                 default:
4241                         fprintf(stderr, "Unknown NIR node type\n");
4242                         abort();
4243                 }
4244         }
4245 }
4246 
4247 static void
4248 ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
4249 {
4250         ntq_setup_registers(c, &impl->registers);
4251         ntq_emit_cf_list(c, &impl->body);
4252 }
4253 
4254 static void
4255 nir_to_vir(struct v3d_compile *c)
4256 {
4257         switch (c->s->info.stage) {
4258         case MESA_SHADER_FRAGMENT:
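                /* Copy the fragment payload (center W, centroid W and Z),
                 * which the hardware leaves in the first register-file
                 * entries, into temporaries before they can be clobbered.
                 */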
4259                 c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
4260                 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
4261                 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
4262 
4263                 /* V3D 4.x can disable implicit varyings if they are not used */
4264                 c->fs_uses_primitive_id =
4265                         nir_find_variable_with_location(c->s, nir_var_shader_in,
4266                                                         VARYING_SLOT_PRIMITIVE_ID);
4267                 if (c->fs_uses_primitive_id && !c->fs_key->has_gs) {
4268                        c->primitive_id =
4269                                emit_fragment_varying(c, NULL, -1, 0, 0);
4270                 }
4271 
4272                 if (c->fs_key->is_points &&
4273                     (c->devinfo->ver < 40 || program_reads_point_coord(c))) {
4274                         c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0);
4275                         c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0);
4276                         c->uses_implicit_point_line_varyings = true;
4277                 } else if (c->fs_key->is_lines &&
4278                            (c->devinfo->ver < 40 ||
4279                             BITSET_TEST(c->s->info.system_values_read,
4280                                         SYSTEM_VALUE_LINE_COORD))) {
4281                         c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0);
4282                         c->uses_implicit_point_line_varyings = true;
4283                 }
4284 
4285                 c->force_per_sample_msaa =
4286                    c->s->info.fs.uses_sample_qualifier ||
4287                    BITSET_TEST(c->s->info.system_values_read,
4288                                SYSTEM_VALUE_SAMPLE_ID) ||
4289                    BITSET_TEST(c->s->info.system_values_read,
4290                                SYSTEM_VALUE_SAMPLE_POS);
4291                 break;
4292         case MESA_SHADER_COMPUTE:
4293                 /* Set up the TSO for barriers, assuming we do some. */
4294                 if (c->devinfo->ver < 42) {
4295                         vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
4296                                                       V3D_QPU_WADDR_SYNC));
4297                 }
4298 
4299                 c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
4300                 c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
4301 
4302                 /* Set up the division between gl_LocalInvocationIndex and
4303                  * wg_in_mem in the payload reg.
4304                  */
4305                 int wg_size = (c->s->info.workgroup_size[0] *
4306                                c->s->info.workgroup_size[1] *
4307                                c->s->info.workgroup_size[2]);
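                /* Number of bits needed to hold gl_LocalInvocationIndex:
                 * log2 of the workgroup size rounded up to a power of two,
                 * with a minimum of 64 invocations (6 bits).
                 */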
4308                 c->local_invocation_index_bits =
4309                         ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1;
4310                 assert(c->local_invocation_index_bits <= 8);
4311 
4312                 if (c->s->info.shared_size) {
4313                         struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1],
4314                                                         vir_uniform_ui(c, 16));
4315                         if (c->s->info.workgroup_size[0] != 1 ||
4316                             c->s->info.workgroup_size[1] != 1 ||
4317                             c->s->info.workgroup_size[2] != 1) {
4318                                 int wg_bits = (16 -
4319                                                c->local_invocation_index_bits);
4320                                 int wg_mask = (1 << wg_bits) - 1;
4321                                 wg_in_mem = vir_AND(c, wg_in_mem,
4322                                                     vir_uniform_ui(c, wg_mask));
4323                         }
4324                         struct qreg shared_per_wg =
4325                                 vir_uniform_ui(c, c->s->info.shared_size);
4326 
4327                         c->cs_shared_offset =
4328                                 vir_ADD(c,
4329                                         vir_uniform(c, QUNIFORM_SHARED_OFFSET, 0),
4330                                         vir_UMUL(c, wg_in_mem, shared_per_wg));
4331                 }
4332                 break;
4333         default:
4334                 break;
4335         }
4336 
4337         if (c->s->scratch_size) {
4338                 v3d_setup_spill_base(c);
4339                 c->spill_size += V3D_CHANNELS * c->s->scratch_size;
4340         }
4341 
4342         switch (c->s->info.stage) {
4343         case MESA_SHADER_VERTEX:
4344                 ntq_setup_vs_inputs(c);
4345                 break;
4346         case MESA_SHADER_GEOMETRY:
4347                 ntq_setup_gs_inputs(c);
4348                 break;
4349         case MESA_SHADER_FRAGMENT:
4350                 ntq_setup_fs_inputs(c);
4351                 break;
4352         case MESA_SHADER_COMPUTE:
4353                 break;
4354         default:
4355                 unreachable("unsupported shader stage");
4356         }
4357 
4358         ntq_setup_outputs(c);
4359 
4360         /* Find the main function and emit the body. */
4361         nir_foreach_function(function, c->s) {
4362                 assert(function->is_entrypoint);
4363                 assert(function->impl);
4364                 ntq_emit_impl(c, function->impl);
4365         }
4366 }
4367 
4368 /**
4369  * When demoting a shader down to single-threaded, removes the THRSW
4370  * instructions (one will still be inserted at v3d_vir_to_qpu() for the
4371  * program end).
4372  */
4373 static void
4374 vir_remove_thrsw(struct v3d_compile *c)
4375 {
4376         vir_for_each_block(block, c) {
4377                 vir_for_each_inst_safe(inst, block) {
4378                         if (inst->qpu.sig.thrsw)
4379                                 vir_remove_instruction(c, inst);
4380                 }
4381         }
4382 
4383         c->last_thrsw = NULL;
4384 }
4385 
4386 /**
4387  * This makes sure we have a top-level last thread switch which signals the
4388  * start of the last thread section, which may include adding a new thrsw
4389  * instruction if needed. We don't allow spilling in the last thread section, so
4390  * if we need to do any spills that inject additional thread switches later on,
4391  * we ensure this thread switch will still be the last thread switch in the
4392  * program, which makes last thread switch signalling a lot easier when we have
4393  * spilling. If in the end we don't need to spill to compile the program and we
4394  * injected a new thread switch instruction here only for that, we will
4395  * eventually restore the previous last thread switch and remove the one we
4396  * added here.
4397  */
4398 static void
4399 vir_emit_last_thrsw(struct v3d_compile *c,
4400                     struct qinst **restore_last_thrsw,
4401                     bool *restore_scoreboard_lock)
4402 {
4403         *restore_last_thrsw = c->last_thrsw;
4404 
4405         /* On V3D before 4.1, we need a TMU op to be outstanding when thread
4406          * switching, so disable threads if we didn't do any TMU ops (each of
4407          * which would have emitted a THRSW).
4408          */
4409         if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
4410                 c->threads = 1;
4411                 if (c->last_thrsw)
4412                         vir_remove_thrsw(c);
4413                 *restore_last_thrsw = NULL;
4414         }
4415 
4416         /* If we're threaded and the last THRSW was in conditional code, then
4417          * we need to emit another one so that we can flag it as the last
4418          * thrsw.
4419          */
4420         if (c->last_thrsw && !c->last_thrsw_at_top_level) {
4421                 assert(c->devinfo->ver >= 41);
4422                 vir_emit_thrsw(c);
4423         }
4424 
4425         /* If we're threaded, then we need to mark the last THRSW instruction
4426          * so we can emit a pair of them at QPU emit time.
4427          *
4428          * For V3D 4.x, we can spawn the non-fragment shaders already in the
4429          * post-last-THRSW state, so we can skip this.
4430          */
4431         if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
4432                 assert(c->devinfo->ver >= 41);
4433                 vir_emit_thrsw(c);
4434         }
4435 
4436         /* If we have not inserted a last thread switch yet, do it now to ensure
4437          * any potential spilling we do happens before this. If we don't spill
4438          * in the end, we will restore the previous one.
4439          */
4440         if (*restore_last_thrsw == c->last_thrsw) {
4441                 if (*restore_last_thrsw)
4442                         (*restore_last_thrsw)->is_last_thrsw = false;
4443                 *restore_scoreboard_lock = c->lock_scoreboard_on_first_thrsw;
4444                 vir_emit_thrsw(c);
4445         } else {
4446                 *restore_last_thrsw = c->last_thrsw;
4447         }
4448 
4449         assert(c->last_thrsw);
4450         c->last_thrsw->is_last_thrsw = true;
4451 }
4452 
4453 static void
4454 vir_restore_last_thrsw(struct v3d_compile *c,
4455                        struct qinst *thrsw,
4456                        bool scoreboard_lock)
4457 {
4458         assert(c->last_thrsw);
4459         vir_remove_instruction(c, c->last_thrsw);
4460         c->last_thrsw = thrsw;
4461         if (c->last_thrsw)
4462                 c->last_thrsw->is_last_thrsw = true;
4463         c->lock_scoreboard_on_first_thrsw = scoreboard_lock;
4464 }
4465 
4466 /* There's a flag in the shader for "center W is needed for reasons other than
4467  * non-centroid varyings", so we just walk the program after VIR optimization
4468  * to see if it's used.  It should be harmless to set even if we only use
4469  * center W for varyings.
4470  */
4471 static void
4472 vir_check_payload_w(struct v3d_compile *c)
4473 {
4474         if (c->s->info.stage != MESA_SHADER_FRAGMENT)
4475                 return;
4476 
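        /* Fragment payload W was loaded from QFILE_REG index 0 in
         * nir_to_vir(), so any instruction sourcing that register means
         * center W is used.
         */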
4477         vir_for_each_inst_inorder(inst, c) {
4478                 for (int i = 0; i < vir_get_nsrc(inst); i++) {
4479                         if (inst->src[i].file == QFILE_REG &&
4480                             inst->src[i].index == 0) {
4481                                 c->uses_center_w = true;
4482                                 return;
4483                         }
4484                 }
4485         }
4486 }
4487 
4488 void
4489 v3d_nir_to_vir(struct v3d_compile *c)
4490 {
4491         if (V3D_DEBUG & (V3D_DEBUG_NIR |
4492                          v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
4493                 fprintf(stderr, "%s prog %d/%d NIR:\n",
4494                         vir_get_stage_name(c),
4495                         c->program_id, c->variant_id);
4496                 nir_print_shader(c->s, stderr);
4497         }
4498 
4499         nir_to_vir(c);
4500 
4501         bool restore_scoreboard_lock = false;
4502         struct qinst *restore_last_thrsw;
4503 
4504         /* Emit the last THRSW before STVPM and TLB writes. */
4505         vir_emit_last_thrsw(c,
4506                             &restore_last_thrsw,
4507                             &restore_scoreboard_lock);
4508 
4509 
4510         switch (c->s->info.stage) {
4511         case MESA_SHADER_FRAGMENT:
4512                 emit_frag_end(c);
4513                 break;
4514         case MESA_SHADER_GEOMETRY:
4515                 emit_geom_end(c);
4516                 break;
4517         case MESA_SHADER_VERTEX:
4518                 emit_vert_end(c);
4519                 break;
4520         case MESA_SHADER_COMPUTE:
4521                 break;
4522         default:
4523                 unreachable("bad stage");
4524         }
4525 
4526         if (V3D_DEBUG & (V3D_DEBUG_VIR |
4527                          v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
4528                 fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
4529                         vir_get_stage_name(c),
4530                         c->program_id, c->variant_id);
4531                 vir_dump(c);
4532                 fprintf(stderr, "\n");
4533         }
4534 
4535         vir_optimize(c);
4536 
4537         vir_check_payload_w(c);
4538 
4539         /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
4540          * We used that on that platform to pipeline TMU writes and reduce the
4541          * number of thread switches, as well as try (mostly successfully) to
4542          * reduce maximum register pressure to allow more threads.  We should
4543          * do something of that sort for V3D -- either instruction scheduling
4544          * here, or delay the THRSW and LDTMUs from our texture
4545          * instructions until the results are needed.
4546          */
4547 
4548         if (V3D_DEBUG & (V3D_DEBUG_VIR |
4549                          v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
4550                 fprintf(stderr, "%s prog %d/%d VIR:\n",
4551                         vir_get_stage_name(c),
4552                         c->program_id, c->variant_id);
4553                 vir_dump(c);
4554                 fprintf(stderr, "\n");
4555         }
4556 
4557         /* Attempt to allocate registers for the temporaries.  If we fail,
4558          * reduce thread count and try again.
4559          */
4560         int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
4561         struct qpu_reg *temp_registers;
4562         while (true) {
4563                 temp_registers = v3d_register_allocate(c);
4564                 if (temp_registers) {
4565                         assert(c->spills + c->fills <= c->max_tmu_spills);
4566                         break;
4567                 }
4568 
4569                 if (c->threads == min_threads &&
4570                     (V3D_DEBUG & V3D_DEBUG_RA)) {
4571                         fprintf(stderr,
4572                                 "Failed to register allocate using %s\n",
4573                                 c->fallback_scheduler ? "the fallback scheduler" :
4574                                 "the normal scheduler");
4575 
4576                         vir_dump(c);
4577 
4578                         char *shaderdb;
4579                         int ret = v3d_shaderdb_dump(c, &shaderdb);
4580                         if (ret > 0) {
4581                                 fprintf(stderr, "%s\n", shaderdb);
4582                                 free(shaderdb);
4583                         }
4584                 }
4585 
4586                 if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) {
4587                         if (V3D_DEBUG & V3D_DEBUG_PERF) {
4588                                 fprintf(stderr,
4589                                         "Failed to register allocate %s "
4590                                         "prog %d/%d at %d threads.\n",
4591                                         vir_get_stage_name(c),
4592                                         c->program_id, c->variant_id, c->threads);
4593                         }
4594                         c->compilation_result =
4595                                 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION;
4596                         return;
4597                 }
4598 
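                /* Retry register allocation with half the threads, which
                 * doubles the number of physical registers available to
                 * each thread.
                 */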
4599                 c->spills = 0;
4600                 c->fills = 0;
4601                 c->threads /= 2;
4602 
4603                 if (c->threads == 1)
4604                         vir_remove_thrsw(c);
4605         }
4606 
4607         /* If we didn't spill, then remove the last thread switch we injected
4608          * artificially (if any) and restore the previous one.
4609          */
4610         if (!c->spills && c->last_thrsw != restore_last_thrsw)
4611                 vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock);
4612 
4613         if (c->spills &&
4614             (V3D_DEBUG & (V3D_DEBUG_VIR |
4615                           v3d_debug_flag_for_shader_stage(c->s->info.stage)))) {
4616                 fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
4617                         vir_get_stage_name(c),
4618                         c->program_id, c->variant_id);
4619                 vir_dump(c);
4620                 fprintf(stderr, "\n");
4621         }
4622 
4623         v3d_vir_to_qpu(c, temp_registers);
4624 }
4625