1 /*
2  * Copyright © 2016 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <inttypes.h>
25 #include "util/format/u_format.h"
26 #include "util/u_helpers.h"
27 #include "util/u_math.h"
28 #include "util/u_memory.h"
29 #include "util/ralloc.h"
30 #include "util/hash_table.h"
31 #include "compiler/nir/nir.h"
32 #include "compiler/nir/nir_builder.h"
33 #include "common/v3d_device_info.h"
34 #include "v3d_compiler.h"
35 
36 /* We don't do any address packing. */
37 #define __gen_user_data void
38 #define __gen_address_type uint32_t
39 #define __gen_address_offset(reloc) (*reloc)
40 #define __gen_emit_reloc(cl, reloc)
41 #include "cle/v3d_packet_v41_pack.h"
42 
43 #define GENERAL_TMU_LOOKUP_PER_QUAD                 (0 << 7)
44 #define GENERAL_TMU_LOOKUP_PER_PIXEL                (1 << 7)
45 #define GENERAL_TMU_LOOKUP_TYPE_8BIT_I              (0 << 0)
46 #define GENERAL_TMU_LOOKUP_TYPE_16BIT_I             (1 << 0)
47 #define GENERAL_TMU_LOOKUP_TYPE_VEC2                (2 << 0)
48 #define GENERAL_TMU_LOOKUP_TYPE_VEC3                (3 << 0)
49 #define GENERAL_TMU_LOOKUP_TYPE_VEC4                (4 << 0)
50 #define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI             (5 << 0)
51 #define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI            (6 << 0)
52 #define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI            (7 << 0)
53 
54 #define V3D_TSY_SET_QUORUM          0
55 #define V3D_TSY_INC_WAITERS         1
56 #define V3D_TSY_DEC_WAITERS         2
57 #define V3D_TSY_INC_QUORUM          3
58 #define V3D_TSY_DEC_QUORUM          4
59 #define V3D_TSY_FREE_ALL            5
60 #define V3D_TSY_RELEASE             6
61 #define V3D_TSY_ACQUIRE             7
62 #define V3D_TSY_WAIT                8
63 #define V3D_TSY_WAIT_INC            9
64 #define V3D_TSY_WAIT_CHECK          10
65 #define V3D_TSY_WAIT_INC_CHECK      11
66 #define V3D_TSY_WAIT_CV             12
67 #define V3D_TSY_INC_SEMAPHORE       13
68 #define V3D_TSY_DEC_SEMAPHORE       14
69 #define V3D_TSY_SET_QUORUM_FREE_ALL 15
70 
71 enum v3d_tmu_op_type
72 {
73         V3D_TMU_OP_TYPE_REGULAR,
74         V3D_TMU_OP_TYPE_ATOMIC,
75         V3D_TMU_OP_TYPE_CACHE
76 };
77 
78 static enum v3d_tmu_op_type
v3d_tmu_get_type_from_op(uint32_t tmu_op, bool is_write)
80 {
81         switch(tmu_op) {
82         case V3D_TMU_OP_WRITE_ADD_READ_PREFETCH:
83         case V3D_TMU_OP_WRITE_SUB_READ_CLEAR:
84         case V3D_TMU_OP_WRITE_XCHG_READ_FLUSH:
85         case V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH:
86         case V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR:
87                 return is_write ? V3D_TMU_OP_TYPE_ATOMIC : V3D_TMU_OP_TYPE_CACHE;
88         case V3D_TMU_OP_WRITE_UMAX:
89         case V3D_TMU_OP_WRITE_SMIN:
90         case V3D_TMU_OP_WRITE_SMAX:
91                 assert(is_write);
92                 FALLTHROUGH;
93         case V3D_TMU_OP_WRITE_AND_READ_INC:
94         case V3D_TMU_OP_WRITE_OR_READ_DEC:
95         case V3D_TMU_OP_WRITE_XOR_READ_NOT:
96                 return V3D_TMU_OP_TYPE_ATOMIC;
97         case V3D_TMU_OP_REGULAR:
98                 return V3D_TMU_OP_TYPE_REGULAR;
99 
100         default:
101                 unreachable("Unknown tmu_op\n");
102         }
103 }
104 static void
105 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
106 
107 static void
resize_qreg_array(struct v3d_compile *c,
                  struct qreg **regs,
                  uint32_t *size,
                  uint32_t decl_size)
112 {
113         if (*size >= decl_size)
114                 return;
115 
116         uint32_t old_size = *size;
117         *size = MAX2(*size * 2, decl_size);
118         *regs = reralloc(c, *regs, struct qreg, *size);
119         if (!*regs) {
120                 fprintf(stderr, "Malloc failure\n");
121                 abort();
122         }
123 
124         for (uint32_t i = old_size; i < *size; i++)
125                 (*regs)[i] = c->undef;
126 }
127 
128 static void
resize_interp_array(struct v3d_compile *c,
                    struct v3d_interp_input **regs,
                    uint32_t *size,
                    uint32_t decl_size)
133 {
134         if (*size >= decl_size)
135                 return;
136 
137         uint32_t old_size = *size;
138         *size = MAX2(*size * 2, decl_size);
139         *regs = reralloc(c, *regs, struct v3d_interp_input, *size);
140         if (!*regs) {
141                 fprintf(stderr, "Malloc failure\n");
142                 abort();
143         }
144 
145         for (uint32_t i = old_size; i < *size; i++) {
146                 (*regs)[i].vp = c->undef;
147                 (*regs)[i].C = c->undef;
148         }
149 }
150 
151 void
vir_emit_thrsw(struct v3d_compile *c)
153 {
154         if (c->threads == 1)
155                 return;
156 
157         /* Always thread switch after each texture operation for now.
158          *
159          * We could do better by batching a bunch of texture fetches up and
160          * then doing one thread switch and collecting all their results
161          * afterward.
162          */
163         c->last_thrsw = vir_NOP(c);
164         c->last_thrsw->qpu.sig.thrsw = true;
165         c->last_thrsw_at_top_level = !c->in_control_flow;
166 
        /* We need to lock the scoreboard before any TLB access happens. If
         * this thread switch comes after we have emitted a TLB load, then we
         * can no longer lock on the last thread switch.
         */
171         if (c->emitted_tlb_load)
172                 c->lock_scoreboard_on_first_thrsw = true;
173 }
174 
175 uint32_t
v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
177 {
178         if (nir_src_is_const(instr->src[src])) {
179                 int64_t add_val = nir_src_as_int(instr->src[src]);
180                 if (add_val == 1)
181                         return V3D_TMU_OP_WRITE_AND_READ_INC;
182                 else if (add_val == -1)
183                         return V3D_TMU_OP_WRITE_OR_READ_DEC;
184         }
185 
186         return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH;
187 }
188 
189 static uint32_t
v3d_general_tmu_op(nir_intrinsic_instr *instr)
191 {
192         switch (instr->intrinsic) {
193         case nir_intrinsic_load_ssbo:
194         case nir_intrinsic_load_ubo:
195         case nir_intrinsic_load_uniform:
196         case nir_intrinsic_load_shared:
197         case nir_intrinsic_load_scratch:
198         case nir_intrinsic_store_ssbo:
199         case nir_intrinsic_store_shared:
200         case nir_intrinsic_store_scratch:
201                 return V3D_TMU_OP_REGULAR;
202         case nir_intrinsic_ssbo_atomic_add:
203                 return v3d_get_op_for_atomic_add(instr, 2);
204         case nir_intrinsic_shared_atomic_add:
205                 return v3d_get_op_for_atomic_add(instr, 1);
206         case nir_intrinsic_ssbo_atomic_imin:
207         case nir_intrinsic_shared_atomic_imin:
208                 return V3D_TMU_OP_WRITE_SMIN;
209         case nir_intrinsic_ssbo_atomic_umin:
210         case nir_intrinsic_shared_atomic_umin:
211                 return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
212         case nir_intrinsic_ssbo_atomic_imax:
213         case nir_intrinsic_shared_atomic_imax:
214                 return V3D_TMU_OP_WRITE_SMAX;
215         case nir_intrinsic_ssbo_atomic_umax:
216         case nir_intrinsic_shared_atomic_umax:
217                 return V3D_TMU_OP_WRITE_UMAX;
218         case nir_intrinsic_ssbo_atomic_and:
219         case nir_intrinsic_shared_atomic_and:
220                 return V3D_TMU_OP_WRITE_AND_READ_INC;
221         case nir_intrinsic_ssbo_atomic_or:
222         case nir_intrinsic_shared_atomic_or:
223                 return V3D_TMU_OP_WRITE_OR_READ_DEC;
224         case nir_intrinsic_ssbo_atomic_xor:
225         case nir_intrinsic_shared_atomic_xor:
226                 return V3D_TMU_OP_WRITE_XOR_READ_NOT;
227         case nir_intrinsic_ssbo_atomic_exchange:
228         case nir_intrinsic_shared_atomic_exchange:
229                 return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
230         case nir_intrinsic_ssbo_atomic_comp_swap:
231         case nir_intrinsic_shared_atomic_comp_swap:
232                 return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
233         default:
234                 unreachable("unknown intrinsic op");
235         }
236 }
237 
/**
 * Checks if pipelining a new TMU operation requiring 'components' LDTMUs
 * would overflow the Output TMU fifo.
 *
 * Overflowing the Output fifo is not allowed, but the Input and Config fifos
 * may overflow. Doing so stalls the shader only for as long as it needs to be
 * able to continue, so for pipelining it is better to let the QPU stall on
 * these when needed than to try to emit TMU flushes from the driver.
 */
248 bool
ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components)
250 {
251         if (c->tmu.flush_count >= MAX_TMU_QUEUE_SIZE)
252                 return true;
253 
254         return components > 0 &&
255                c->tmu.output_fifo_size + components > 16 / c->threads;
256 }
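
/* Worked example of the check above (illustrative numbers only): with
 * c->threads == 4 each thread gets 16 / 4 = 4 output fifo slots, so three
 * outstanding LDTMU results (output_fifo_size == 3) plus a new 2-component
 * lookup would exceed that budget and the caller has to flush first.
 */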
257 
258 /**
259  * Emits the thread switch and LDTMU/TMUWT for all outstanding TMU operations,
260  * popping all TMU fifo entries.
261  */
262 void
ntq_flush_tmu(struct v3d_compile *c)
264 {
265         if (c->tmu.flush_count == 0)
266                 return;
267 
268         vir_emit_thrsw(c);
269 
270         bool emitted_tmuwt = false;
271         for (int i = 0; i < c->tmu.flush_count; i++) {
272                 if (c->tmu.flush[i].component_mask > 0) {
273                         nir_dest *dest = c->tmu.flush[i].dest;
274                         assert(dest);
275 
276                         for (int j = 0; j < 4; j++) {
277                                 if (c->tmu.flush[i].component_mask & (1 << j)) {
278                                         ntq_store_dest(c, dest, j,
279                                                        vir_MOV(c, vir_LDTMU(c)));
280                                 }
281                         }
282                 } else if (!emitted_tmuwt) {
283                         vir_TMUWT(c);
284                         emitted_tmuwt = true;
285                 }
286         }
287 
288         c->tmu.output_fifo_size = 0;
289         c->tmu.flush_count = 0;
290         _mesa_set_clear(c->tmu.outstanding_regs, NULL);
291 }
292 
/**
 * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller
 * is responsible for ensuring that doing this doesn't overflow the TMU fifos,
 * and more specifically the output fifo, since that one can't stall.
 */
298 void
ntq_add_pending_tmu_flush(struct v3d_compile *c,
                          nir_dest *dest,
                          uint32_t component_mask)
302 {
303         const uint32_t num_components = util_bitcount(component_mask);
304         assert(!ntq_tmu_fifo_overflow(c, num_components));
305 
306         if (num_components > 0) {
307                 c->tmu.output_fifo_size += num_components;
308                 if (!dest->is_ssa)
309                         _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg);
310         }
311 
312         c->tmu.flush[c->tmu.flush_count].dest = dest;
313         c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
314         c->tmu.flush_count++;
315 
316         if (c->disable_tmu_pipelining)
317                 ntq_flush_tmu(c);
318         else if (c->tmu.flush_count > 1)
319                 c->pipelined_any_tmu = true;
320 }
321 
322 enum emit_mode {
323     MODE_COUNT = 0,
324     MODE_EMIT,
325     MODE_LAST,
326 };
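
/* The helpers below are driven in a two-pass fashion. A sketch of the loop
 * that calls them (mirroring ntq_emit_tmu_general further down):
 *
 *     for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
 *             // MODE_COUNT: count TMU register writes, flush dependencies
 *             // MODE_EMIT: actually emit the TMUD/TMUA register writes
 *     }
 */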
327 
328 /**
329  * For a TMU general store instruction:
330  *
331  * In MODE_COUNT mode, records the number of TMU writes required and flushes
332  * any outstanding TMU operations the instruction depends on, but it doesn't
333  * emit any actual register writes.
334  *
335  * In MODE_EMIT mode, emits the data register writes required by the
336  * instruction.
337  */
338 static void
emit_tmu_general_store_writes(struct v3d_compile *c,
                              enum emit_mode mode,
                              nir_intrinsic_instr *instr,
                              uint32_t base_const_offset,
                              uint32_t *writemask,
                              uint32_t *const_offset,
                              uint32_t *tmu_writes)
346 {
347         struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
348 
349         /* Find the first set of consecutive components that
350          * are enabled in the writemask and emit the TMUD
351          * instructions for them.
352          */
353         assert(*writemask != 0);
354         uint32_t first_component = ffs(*writemask) - 1;
355         uint32_t last_component = first_component;
356         while (*writemask & BITFIELD_BIT(last_component + 1))
357                 last_component++;
358 
359         assert(first_component <= last_component &&
360                last_component < instr->num_components);
361 
362         for (int i = first_component; i <= last_component; i++) {
363                 struct qreg data = ntq_get_src(c, instr->src[0], i);
364                 if (mode == MODE_COUNT)
365                         (*tmu_writes)++;
366                 else
367                         vir_MOV_dest(c, tmud, data);
368         }
369 
370         if (mode == MODE_EMIT) {
                /* Update the offset for the TMU write based on the first
                 * component we are writing.
                 */
374                 *const_offset = base_const_offset + first_component * 4;
375 
376                 /* Clear these components from the writemask */
377                 uint32_t written_mask =
378                         BITFIELD_RANGE(first_component, *tmu_writes);
379                 (*writemask) &= ~written_mask;
380         }
381 }
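
/* Illustrative example for the helper above: a store with writemask 0b0110
 * emits TMUD writes for components 1 and 2, bumps const_offset by 1 * 4 bytes
 * so the TMU address points at the first written component, and clears the
 * written bits from the writemask. nir_lower_wrmasks guarantees that the set
 * bits are consecutive, so a single pass consumes the whole mask (see the
 * assert at the end of ntq_emit_tmu_general).
 */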
382 
383 /**
384  * For a TMU general atomic instruction:
385  *
386  * In MODE_COUNT mode, records the number of TMU writes required and flushes
387  * any outstanding TMU operations the instruction depends on, but it doesn't
388  * emit any actual register writes.
389  *
390  * In MODE_EMIT mode, emits the data register writes required by the
391  * instruction.
392  */
393 static void
emit_tmu_general_atomic_writes(struct v3d_compile *c,
                               enum emit_mode mode,
                               nir_intrinsic_instr *instr,
                               uint32_t tmu_op,
                               bool has_index,
                               uint32_t *tmu_writes)
400 {
401         struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);
402 
403         struct qreg data = ntq_get_src(c, instr->src[1 + has_index], 0);
404         if (mode == MODE_COUNT)
405                 (*tmu_writes)++;
406         else
407                 vir_MOV_dest(c, tmud, data);
408 
409         if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
410                 data = ntq_get_src(c, instr->src[2 + has_index], 0);
411                 if (mode == MODE_COUNT)
412                         (*tmu_writes)++;
413                 else
414                         vir_MOV_dest(c, tmud, data);
415         }
416 }
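
/* Note on the helper above: plain atomics write a single TMUD operand, while
 * CMPXCHG writes two (the compare value followed by the value to swap in),
 * which is why it gets the extra TMUD write.
 */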
417 
418 /**
419  * For any TMU general instruction:
420  *
421  * In MODE_COUNT mode, records the number of TMU writes required to emit the
422  * address parameter and flushes any outstanding TMU operations the instruction
423  * depends on, but it doesn't emit any actual register writes.
424  *
425  * In MODE_EMIT mode, emits register writes required to emit the address.
426  */
427 static void
emit_tmu_general_address_write(struct v3d_compile *c,
                               enum emit_mode mode,
                               nir_intrinsic_instr *instr,
                               uint32_t config,
                               bool dynamic_src,
                               int offset_src,
                               struct qreg base_offset,
                               uint32_t const_offset,
                               uint32_t *tmu_writes)
437 {
438         if (mode == MODE_COUNT) {
439                 (*tmu_writes)++;
440                 if (dynamic_src)
441                         ntq_get_src(c, instr->src[offset_src], 0);
442                 return;
443         }
444 
445         if (vir_in_nonuniform_control_flow(c)) {
446                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
447                            V3D_QPU_PF_PUSHZ);
448         }
449 
450         struct qreg tmua;
451         if (config == ~0)
452                 tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
453         else
454                 tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);
455 
456         struct qinst *tmu;
457         if (dynamic_src) {
458                 struct qreg offset = base_offset;
459                 if (const_offset != 0) {
460                         offset = vir_ADD(c, offset,
461                                          vir_uniform_ui(c, const_offset));
462                 }
463                 struct qreg data = ntq_get_src(c, instr->src[offset_src], 0);
464                 tmu = vir_ADD_dest(c, tmua, offset, data);
465         } else {
466                 if (const_offset != 0) {
467                         tmu = vir_ADD_dest(c, tmua, base_offset,
468                                            vir_uniform_ui(c, const_offset));
469                 } else {
470                         tmu = vir_MOV_dest(c, tmua, base_offset);
471                 }
472         }
473 
474         if (config != ~0) {
475                 tmu->uniform =
476                         vir_get_uniform_index(c, QUNIFORM_CONSTANT, config);
477         }
478 
479         if (vir_in_nonuniform_control_flow(c))
480                 vir_set_cond(tmu, V3D_QPU_COND_IFA);
481 }
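
/* Reading the helper above: when a per-lookup configuration word is needed
 * (config != ~0) the address is written to TMUAU and the config is attached
 * as a QUNIFORM_CONSTANT uniform; otherwise plain TMUA is used. In
 * non-uniform control flow the address write is predicated on the execution
 * mask so inactive channels don't trigger the lookup.
 */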
482 
483 /**
484  * Implements indirect uniform loads and SSBO accesses through the TMU general
485  * memory access interface.
486  */
487 static void
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                     bool is_shared_or_scratch)
490 {
491         uint32_t tmu_op = v3d_general_tmu_op(instr);
492 
        /* If we were able to replace an atomic_add with an inc/dec, then we
         * need to (and can) do things slightly differently, such as not
         * loading the amount to add/sub, since that is implicit in the
         * operation.
         */
497         bool atomic_add_replaced =
498                 ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
499                   instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
500                  (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
501                   tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));
502 
503         bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
504                          instr->intrinsic == nir_intrinsic_store_scratch ||
505                          instr->intrinsic == nir_intrinsic_store_shared);
506 
507         bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
508                         instr->intrinsic == nir_intrinsic_load_ubo ||
509                         instr->intrinsic == nir_intrinsic_load_ssbo ||
510                         instr->intrinsic == nir_intrinsic_load_scratch ||
511                         instr->intrinsic == nir_intrinsic_load_shared);
512 
513         if (!is_load)
514                 c->tmu_dirty_rcl = true;
515 
516         bool has_index = !is_shared_or_scratch;
517 
518         int offset_src;
519         if (instr->intrinsic == nir_intrinsic_load_uniform) {
520                 offset_src = 0;
521         } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
522                    instr->intrinsic == nir_intrinsic_load_ubo ||
523                    instr->intrinsic == nir_intrinsic_load_scratch ||
524                    instr->intrinsic == nir_intrinsic_load_shared ||
525                    atomic_add_replaced) {
526                 offset_src = 0 + has_index;
527         } else if (is_store) {
528                 offset_src = 1 + has_index;
529         } else {
530                 offset_src = 0 + has_index;
531         }
532 
533         bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
534         uint32_t const_offset = 0;
535         if (!dynamic_src)
536                 const_offset = nir_src_as_uint(instr->src[offset_src]);
537 
538         struct qreg base_offset;
539         if (instr->intrinsic == nir_intrinsic_load_uniform) {
540                 const_offset += nir_intrinsic_base(instr);
541                 base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
542                                           v3d_unit_data_create(0, const_offset));
543                 const_offset = 0;
544         } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
545                 uint32_t index = nir_src_as_uint(instr->src[0]);
546                 /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index
547                  * shifted up by 1 (0 is gallium's constant buffer 0).
548                  */
549                 if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
550                         index++;
551 
552                 base_offset =
553                         vir_uniform(c, QUNIFORM_UBO_ADDR,
554                                     v3d_unit_data_create(index, const_offset));
555                 const_offset = 0;
556         } else if (is_shared_or_scratch) {
557                 /* Shared and scratch variables have no buffer index, and all
558                  * start from a common base that we set up at the start of
559                  * dispatch.
560                  */
561                 if (instr->intrinsic == nir_intrinsic_load_scratch ||
562                     instr->intrinsic == nir_intrinsic_store_scratch) {
563                         base_offset = c->spill_base;
564                 } else {
565                         base_offset = c->cs_shared_offset;
566                         const_offset += nir_intrinsic_base(instr);
567                 }
568         } else {
569                 base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
570                                           nir_src_as_uint(instr->src[is_store ?
571                                                                       1 : 0]));
572         }
573 
        /* We are ready to emit TMU register writes now, but before we actually
         * emit them we need to flush outstanding TMU operations if any of our
         * writes read from the result of an outstanding TMU operation, since
         * otherwise the flush could happen in the middle of the TMU sequence
         * we are about to emit, which is illegal. To do this we run this logic
         * twice: the first pass counts the required register writes and
         * flushes pending TMU requests if a dependency makes it necessary, and
         * the second pass emits the actual TMU writes.
         */
584         const uint32_t dest_components = nir_intrinsic_dest_components(instr);
585         uint32_t base_const_offset = const_offset;
586         uint32_t writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
587         uint32_t tmu_writes = 0;
588         for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
589                 assert(mode == MODE_COUNT || tmu_writes > 0);
590 
591                 if (is_store) {
592                         emit_tmu_general_store_writes(c, mode, instr,
593                                                       base_const_offset,
594                                                       &writemask,
595                                                       &const_offset,
596                                                       &tmu_writes);
597                 } else if (!is_load && !atomic_add_replaced) {
598                          emit_tmu_general_atomic_writes(c, mode, instr,
599                                                         tmu_op, has_index,
600                                                         &tmu_writes);
601                 }
602 
                /* For atomics we use 32-bit, except for CMPXCHG, which needs
                 * VEC2. For the rest of the cases we use the number of TMUD
                 * writes we did to decide the type. For cache operations the
                 * type is ignored.
                 */
608                 uint32_t config = 0;
609                 if (mode == MODE_EMIT) {
610                         uint32_t num_components;
611                         if (is_load || atomic_add_replaced) {
612                                 num_components = instr->num_components;
613                         } else {
614                                 assert(tmu_writes > 0);
615                                 num_components = tmu_writes - 1;
616                         }
617                         bool is_atomic =
618                                 v3d_tmu_get_type_from_op(tmu_op, !is_load) ==
619                                 V3D_TMU_OP_TYPE_ATOMIC;
620 
621                         uint32_t perquad =
622                                 is_load && !vir_in_nonuniform_control_flow(c)
623                                 ? GENERAL_TMU_LOOKUP_PER_QUAD
624                                 : GENERAL_TMU_LOOKUP_PER_PIXEL;
625                         config = 0xffffff00 | tmu_op << 3 | perquad;
626 
627                         if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) {
628                                 config |= GENERAL_TMU_LOOKUP_TYPE_VEC2;
629                         } else if (is_atomic || num_components == 1) {
630                                 config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
631                         } else {
632                                 config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
633                                           num_components - 2;
634                         }
635                 }
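
                /* Illustrative example of the config word built above: a
                 * 3-component UBO load in uniform control flow would use
                 * 0xffffff00 | (tmu_op << 3) | GENERAL_TMU_LOOKUP_PER_QUAD |
                 * GENERAL_TMU_LOOKUP_TYPE_VEC3, the VEC3 type coming from
                 * GENERAL_TMU_LOOKUP_TYPE_VEC2 + 3 - 2.
                 */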
636 
637                 emit_tmu_general_address_write(c, mode, instr, config,
638                                                dynamic_src, offset_src,
639                                                base_offset, const_offset,
640                                                &tmu_writes);
641 
642                 assert(tmu_writes > 0);
643                 if (mode == MODE_COUNT) {
644                         /* Make sure we won't exceed the 16-entry TMU
645                          * fifo if each thread is storing at the same
646                          * time.
647                          */
648                         while (tmu_writes > 16 / c->threads)
649                                 c->threads /= 2;
650 
651                         /* If pipelining this TMU operation would
652                          * overflow TMU fifos, we need to flush.
653                          */
654                         if (ntq_tmu_fifo_overflow(c, dest_components))
655                                 ntq_flush_tmu(c);
656                 } else {
657                         /* Delay emission of the thread switch and
658                          * LDTMU/TMUWT until we really need to do it to
659                          * improve pipelining.
660                          */
661                         const uint32_t component_mask =
662                                 (1 << dest_components) - 1;
663                         ntq_add_pending_tmu_flush(c, &instr->dest,
664                                                   component_mask);
665                 }
666         }
667 
668         /* nir_lower_wrmasks should've ensured that any writemask on a store
669          * operation only has consecutive bits set, in which case we should've
670          * processed the full writemask above.
671          */
672         assert(writemask == 0);
673 }
674 
675 static struct qreg *
ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
677 {
678         struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
679                                           def->num_components);
680         _mesa_hash_table_insert(c->def_ht, def, qregs);
681         return qregs;
682 }
683 
684 static bool
is_ld_signal(const struct v3d_qpu_sig *sig)
686 {
687         return (sig->ldunif ||
688                 sig->ldunifa ||
689                 sig->ldunifrf ||
690                 sig->ldunifarf ||
691                 sig->ldtmu ||
692                 sig->ldvary ||
693                 sig->ldvpm ||
694                 sig->ldtlb ||
695                 sig->ldtlbu);
696 }
697 
698 static inline bool
is_ldunif_signal(const struct v3d_qpu_sig *sig)
700 {
701         return sig->ldunif || sig->ldunifrf;
702 }
703 
704 /**
705  * This function is responsible for getting VIR results into the associated
706  * storage for a NIR instruction.
707  *
708  * If it's a NIR SSA def, then we just set the associated hash table entry to
709  * the new result.
710  *
711  * If it's a NIR reg, then we need to update the existing qreg assigned to the
712  * NIR destination with the incoming value.  To do that without introducing
713  * new MOVs, we require that the incoming qreg either be a uniform, or be
714  * SSA-defined by the previous VIR instruction in the block and rewritable by
715  * this function.  That lets us sneak ahead and insert the SF flag beforehand
716  * (knowing that the previous instruction doesn't depend on flags) and rewrite
717  * its destination to be the NIR reg's destination
718  */
719 void
ntq_store_dest(struct v3d_compile * c,nir_dest * dest,int chan,struct qreg result)720 ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
721                struct qreg result)
722 {
723         struct qinst *last_inst = NULL;
724         if (!list_is_empty(&c->cur_block->instructions))
725                 last_inst = (struct qinst *)c->cur_block->instructions.prev;
726 
727         bool is_reused_uniform =
728                 is_ldunif_signal(&c->defs[result.index]->qpu.sig) &&
729                 last_inst != c->defs[result.index];
730 
731         assert(result.file == QFILE_TEMP && last_inst &&
732                (last_inst == c->defs[result.index] || is_reused_uniform));
733 
734         if (dest->is_ssa) {
735                 assert(chan < dest->ssa.num_components);
736 
737                 struct qreg *qregs;
738                 struct hash_entry *entry =
739                         _mesa_hash_table_search(c->def_ht, &dest->ssa);
740 
741                 if (entry)
742                         qregs = entry->data;
743                 else
744                         qregs = ntq_init_ssa_def(c, &dest->ssa);
745 
746                 qregs[chan] = result;
747         } else {
748                 nir_register *reg = dest->reg.reg;
749                 assert(dest->reg.base_offset == 0);
750                 assert(reg->num_array_elems == 0);
751                 struct hash_entry *entry =
752                         _mesa_hash_table_search(c->def_ht, reg);
753                 struct qreg *qregs = entry->data;
754 
755                 /* If the previous instruction can't be predicated for
756                  * the store into the nir_register, then emit a MOV
757                  * that can be.
758                  */
759                 if (is_reused_uniform ||
760                     (vir_in_nonuniform_control_flow(c) &&
761                      is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig))) {
762                         result = vir_MOV(c, result);
763                         last_inst = c->defs[result.index];
764                 }
765 
766                 /* We know they're both temps, so just rewrite index. */
767                 c->defs[last_inst->dst.index] = NULL;
768                 last_inst->dst.index = qregs[chan].index;
769 
770                 /* If we're in control flow, then make this update of the reg
771                  * conditional on the execution mask.
772                  */
773                 if (vir_in_nonuniform_control_flow(c)) {
774                         last_inst->dst.index = qregs[chan].index;
775 
776                         /* Set the flags to the current exec mask.
777                          */
778                         c->cursor = vir_before_inst(last_inst);
779                         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
780                                    V3D_QPU_PF_PUSHZ);
781                         c->cursor = vir_after_inst(last_inst);
782 
783                         vir_set_cond(last_inst, V3D_QPU_COND_IFA);
784                 }
785         }
786 }
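
/* Hypothetical example of the rewrite performed above (temp names made up for
 * illustration): if the previous VIR instruction was "add t12, t3, t4" and the
 * NIR register destination lives in t7, the add is retargeted to write t7
 * directly (and predicated on the execution mask in non-uniform control flow),
 * avoiding an extra MOV.
 */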
787 
/**
 * This looks up the qreg associated with a particular ssa/reg used as a source
 * in any instruction.
 *
 * It is expected that the definition for any NIR value read as a source has
 * been emitted by a previous instruction. However, to improve pipelining, for
 * TMU operations we may have postponed emission of the thread switch and
 * LDTMUs required to read the TMU results until the results are actually
 * used. In that case we won't find the value here (for SSA defs), or we will
 * find it in the list of registers awaiting a TMU flush (for registers), which
 * means we need to flush outstanding TMU operations to read the correct value.
 */
801 struct qreg
ntq_get_src(struct v3d_compile *c, nir_src src, int i)
803 {
804         struct hash_entry *entry;
805         if (src.is_ssa) {
806                 assert(i < src.ssa->num_components);
807 
808                 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
809                 if (!entry) {
810                         ntq_flush_tmu(c);
811                         entry = _mesa_hash_table_search(c->def_ht, src.ssa);
812                 }
813         } else {
814                 nir_register *reg = src.reg.reg;
815                 assert(reg->num_array_elems == 0);
816                 assert(src.reg.base_offset == 0);
817                 assert(i < reg->num_components);
818 
819                 if (_mesa_set_search(c->tmu.outstanding_regs, reg))
820                         ntq_flush_tmu(c);
821                 entry = _mesa_hash_table_search(c->def_ht, reg);
822         }
823         assert(entry);
824 
825         struct qreg *qregs = entry->data;
826         return qregs[i];
827 }
828 
829 static struct qreg
ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
                unsigned src)
832 {
833         assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
834         unsigned chan = ffs(instr->dest.write_mask) - 1;
835         struct qreg r = ntq_get_src(c, instr->src[src].src,
836                                     instr->src[src].swizzle[chan]);
837 
838         assert(!instr->src[src].abs);
839         assert(!instr->src[src].negate);
840 
841         return r;
842 };
843 
844 static struct qreg
ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
846 {
847         return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
848 }
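
/* ntq_minify computes max(size >> level, 1): e.g. a 16-wide texture queried
 * at LOD 2 yields 4, and at LOD 5 it clamps to 1.
 */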
849 
850 static void
ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
852 {
853         unsigned unit = instr->texture_index;
854         int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
855         int dest_size = nir_tex_instr_dest_size(instr);
856 
857         struct qreg lod = c->undef;
858         if (lod_index != -1)
859                 lod = ntq_get_src(c, instr->src[lod_index].src, 0);
860 
861         for (int i = 0; i < dest_size; i++) {
862                 assert(i < 3);
863                 enum quniform_contents contents;
864 
865                 if (instr->is_array && i == dest_size - 1)
866                         contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
867                 else
868                         contents = QUNIFORM_TEXTURE_WIDTH + i;
869 
870                 struct qreg size = vir_uniform(c, contents, unit);
871 
872                 switch (instr->sampler_dim) {
873                 case GLSL_SAMPLER_DIM_1D:
874                 case GLSL_SAMPLER_DIM_2D:
875                 case GLSL_SAMPLER_DIM_MS:
876                 case GLSL_SAMPLER_DIM_3D:
877                 case GLSL_SAMPLER_DIM_CUBE:
878                 case GLSL_SAMPLER_DIM_BUF:
879                         /* Don't minify the array size. */
880                         if (!(instr->is_array && i == dest_size - 1)) {
881                                 size = ntq_minify(c, size, lod);
882                         }
883                         break;
884 
885                 case GLSL_SAMPLER_DIM_RECT:
886                         /* There's no LOD field for rects */
887                         break;
888 
889                 default:
890                         unreachable("Bad sampler type");
891                 }
892 
893                 ntq_store_dest(c, &instr->dest, i, size);
894         }
895 }
896 
897 static void
ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
899 {
900         unsigned unit = instr->texture_index;
901 
902         /* Since each texture sampling op requires uploading uniforms to
903          * reference the texture, there's no HW support for texture size and
904          * you just upload uniforms containing the size.
905          */
906         switch (instr->op) {
907         case nir_texop_query_levels:
908                 ntq_store_dest(c, &instr->dest, 0,
909                                vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
910                 return;
911         case nir_texop_texture_samples:
912                 ntq_store_dest(c, &instr->dest, 0,
913                                vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit));
914                 return;
915         case nir_texop_txs:
916                 ntq_emit_txs(c, instr);
917                 return;
918         default:
919                 break;
920         }
921 
922         if (c->devinfo->ver >= 40)
923                 v3d40_vir_emit_tex(c, instr);
924         else
925                 v3d33_vir_emit_tex(c, instr);
926 }
927 
928 static struct qreg
ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
930 {
931         struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
932         if (is_cos)
933                 input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
934 
935         struct qreg periods = vir_FROUND(c, input);
936         struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods));
937         return vir_XOR(c, sin_output, vir_SHL(c,
938                                               vir_FTOIN(c, periods),
939                                               vir_uniform_ui(c, -1)));
940 }
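
/* Sketch of the range reduction above, assuming the SFU SIN op evaluates
 * sin(pi * x) for small x: with input = src / pi (plus 0.5 for cosine) and
 * periods = round(input), sin(pi * input) = (-1)^periods * sin(pi * (input -
 * periods)), and the final XOR with periods shifted into the sign bit flips
 * the result's sign exactly when periods is odd.
 */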
941 
942 static struct qreg
ntq_fsign(struct v3d_compile *c, struct qreg src)
944 {
945         struct qreg t = vir_get_temp(c);
946 
947         vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
948         vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);
949         vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
950         vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);
951         vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
952         return vir_MOV(c, t);
953 }
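
/* The flag dance above computes sign(src) without branches: t starts at 0.0,
 * is set to 1.0 when the FMOV of src doesn't push Z (src != 0), and is then
 * overwritten with -1.0 when the FMOV pushes N (src < 0).
 */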
954 
955 static void
emit_fragcoord_input(struct v3d_compile *c, int attr)
957 {
958         c->inputs[attr * 4 + 0] = vir_FXCD(c);
959         c->inputs[attr * 4 + 1] = vir_FYCD(c);
960         c->inputs[attr * 4 + 2] = c->payload_z;
961         c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
962 }
963 
964 static struct qreg
emit_smooth_varying(struct v3d_compile *c,
                    struct qreg vary, struct qreg w, struct qreg r5)
967 {
968         return vir_FADD(c, vir_FMUL(c, vary, w), r5);
969 }
970 
971 static struct qreg
emit_noperspective_varying(struct v3d_compile *c,
                           struct qreg vary, struct qreg r5)
974 {
975         return vir_FADD(c, vir_MOV(c, vary), r5);
976 }
977 
978 static struct qreg
emit_flat_varying(struct v3d_compile *c,
                  struct qreg vary, struct qreg r5)
981 {
982         vir_MOV_dest(c, c->undef, vary);
983         return vir_MOV(c, r5);
984 }
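
/* The three helpers above implement the varying equation with the C
 * coefficient delivered in r5 by the ldvary sequence: smooth varyings compute
 * vary * W + C, noperspective ones compute vary + C, and flat ones just take
 * C (the vary component is read and discarded).
 */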
985 
986 static struct qreg
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
                      int8_t input_idx, uint8_t swizzle, int array_index)
989 {
990         struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
991         struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
992 
993         struct qinst *ldvary = NULL;
994         struct qreg vary;
995         if (c->devinfo->ver >= 41) {
996                 ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
997                                       c->undef, c->undef);
998                 ldvary->qpu.sig.ldvary = true;
999                 vary = vir_emit_def(c, ldvary);
1000         } else {
1001                 vir_NOP(c)->qpu.sig.ldvary = true;
1002                 vary = r3;
1003         }
1004 
1005         /* Store the input value before interpolation so we can implement
1006          * GLSL's interpolateAt functions if the shader uses them.
1007          */
1008         if (input_idx >= 0) {
1009                 assert(var);
1010                 c->interp[input_idx].vp = vary;
1011                 c->interp[input_idx].C = vir_MOV(c, r5);
1012                 c->interp[input_idx].mode = var->data.interpolation;
1013         }
1014 
1015         /* For gl_PointCoord input or distance along a line, we'll be called
1016          * with no nir_variable, and we don't count toward VPM size so we
1017          * don't track an input slot.
1018          */
1019         if (!var) {
1020                 assert(input_idx < 0);
1021                 return emit_smooth_varying(c, vary, c->payload_w, r5);
1022         }
1023 
1024         int i = c->num_inputs++;
1025         c->input_slots[i] =
1026                 v3d_slot_from_slot_and_component(var->data.location +
1027                                                  array_index, swizzle);
1028 
1029         struct qreg result;
1030         switch (var->data.interpolation) {
1031         case INTERP_MODE_NONE:
1032         case INTERP_MODE_SMOOTH:
1033                 if (var->data.centroid) {
1034                         BITSET_SET(c->centroid_flags, i);
1035                         result = emit_smooth_varying(c, vary,
1036                                                      c->payload_w_centroid, r5);
1037                 } else {
1038                         result = emit_smooth_varying(c, vary, c->payload_w, r5);
1039                 }
1040                 break;
1041 
1042         case INTERP_MODE_NOPERSPECTIVE:
1043                 BITSET_SET(c->noperspective_flags, i);
1044                 result = emit_noperspective_varying(c, vary, r5);
1045                 break;
1046 
1047         case INTERP_MODE_FLAT:
1048                 BITSET_SET(c->flat_shade_flags, i);
1049                 result = emit_flat_varying(c, vary, r5);
1050                 break;
1051 
1052         default:
1053                 unreachable("Bad interp mode");
1054         }
1055 
1056         if (input_idx >= 0)
1057                 c->inputs[input_idx] = result;
1058         return result;
1059 }
1060 
1061 static void
emit_fragment_input(struct v3d_compile *c, int base_attr, nir_variable *var,
                    int array_index, unsigned nelem)
1064 {
1065         for (int i = 0; i < nelem ; i++) {
1066                 int chan = var->data.location_frac + i;
1067                 int input_idx = (base_attr + array_index) * 4 + chan;
1068                 emit_fragment_varying(c, var, input_idx, chan, array_index);
1069         }
1070 }
1071 
1072 static void
emit_compact_fragment_input(struct v3d_compile *c, int attr, nir_variable *var,
                            int array_index)
1075 {
1076         /* Compact variables are scalar arrays where each set of 4 elements
1077          * consumes a single location.
1078          */
1079         int loc_offset = array_index / 4;
1080         int chan = var->data.location_frac + array_index % 4;
1081         int input_idx = (attr + loc_offset) * 4  + chan;
1082         emit_fragment_varying(c, var, input_idx, chan, loc_offset);
1083 }
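
/* Example of the compact indexing above: array_index 5 maps to loc_offset 1
 * and channel var->data.location_frac + 1, i.e. the second component of the
 * variable's second location.
 */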
1084 
1085 static void
add_output(struct v3d_compile *c,
           uint32_t decl_offset,
           uint8_t slot,
           uint8_t swizzle)
1090 {
1091         uint32_t old_array_size = c->outputs_array_size;
1092         resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
1093                           decl_offset + 1);
1094 
1095         if (old_array_size != c->outputs_array_size) {
1096                 c->output_slots = reralloc(c,
1097                                            c->output_slots,
1098                                            struct v3d_varying_slot,
1099                                            c->outputs_array_size);
1100         }
1101 
1102         c->output_slots[decl_offset] =
1103                 v3d_slot_from_slot_and_component(slot, swizzle);
1104 }
1105 
/**
 * If compare_instr is a valid comparison instruction, emits the comparison,
 * stores the condition code the caller should predicate on in *out_cond, and
 * returns true. Returns false if compare_instr is not a supported comparison.
 */
1111 static bool
ntq_emit_comparison(struct v3d_compile *c,
                    nir_alu_instr *compare_instr,
                    enum v3d_qpu_cond *out_cond)
1115 {
1116         struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
1117         struct qreg src1;
1118         if (nir_op_infos[compare_instr->op].num_inputs > 1)
1119                 src1 = ntq_get_alu_src(c, compare_instr, 1);
1120         bool cond_invert = false;
1121         struct qreg nop = vir_nop_reg();
1122 
1123         switch (compare_instr->op) {
1124         case nir_op_feq32:
1125         case nir_op_seq:
1126                 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1127                 break;
1128         case nir_op_ieq32:
1129                 vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1130                 break;
1131 
1132         case nir_op_fneu32:
1133         case nir_op_sne:
1134                 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1135                 cond_invert = true;
1136                 break;
1137         case nir_op_ine32:
1138                 vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
1139                 cond_invert = true;
1140                 break;
1141 
1142         case nir_op_fge32:
1143         case nir_op_sge:
1144                 vir_set_pf(c, vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1145                 break;
1146         case nir_op_ige32:
1147                 vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1148                 cond_invert = true;
1149                 break;
1150         case nir_op_uge32:
1151                 vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
1152                 cond_invert = true;
1153                 break;
1154 
1155         case nir_op_slt:
1156         case nir_op_flt32:
1157                 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);
1158                 break;
1159         case nir_op_ilt32:
1160                 vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
1161                 break;
1162         case nir_op_ult32:
1163                 vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
1164                 break;
1165 
1166         case nir_op_i2b32:
1167                 vir_set_pf(c, vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
1168                 cond_invert = true;
1169                 break;
1170 
1171         case nir_op_f2b32:
1172                 vir_set_pf(c, vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
1173                 cond_invert = true;
1174                 break;
1175 
1176         default:
1177                 return false;
1178         }
1179 
1180         *out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;
1181 
1182         return true;
1183 }
1184 
1185 /* Finds an ALU instruction that generates our src value that could
1186  * (potentially) be greedily emitted in the consuming instruction.
1187  */
1188 static struct nir_alu_instr *
ntq_get_alu_parent(nir_src src)
1190 {
1191         if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu)
1192                 return NULL;
1193         nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
1194         if (!instr)
1195                 return NULL;
1196 
1197         /* If the ALU instr's srcs are non-SSA, then we would have to avoid
1198          * moving emission of the ALU instr down past another write of the
1199          * src.
1200          */
1201         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1202                 if (!instr->src[i].src.is_ssa)
1203                         return NULL;
1204         }
1205 
1206         return instr;
1207 }
1208 
1209 /* Turns a NIR bool into a condition code to predicate on. */
1210 static enum v3d_qpu_cond
ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)
1212 {
1213         struct qreg qsrc = ntq_get_src(c, src, 0);
1214         /* skip if we already have src in the flags */
1215         if (qsrc.file == QFILE_TEMP && c->flags_temp == qsrc.index)
1216                 return c->flags_cond;
1217 
1218         nir_alu_instr *compare = ntq_get_alu_parent(src);
1219         if (!compare)
1220                 goto out;
1221 
1222         enum v3d_qpu_cond cond;
1223         if (ntq_emit_comparison(c, compare, &cond))
1224                 return cond;
1225 
1226 out:
1227 
1228         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),
1229                    V3D_QPU_PF_PUSHZ);
1230         return V3D_QPU_COND_IFNA;
1231 }
1232 
1233 static struct qreg
ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond)
1235 {
1236         struct qreg result =
1237                 vir_MOV(c, vir_SEL(c, cond,
1238                                    vir_uniform_ui(c, ~0),
1239                                    vir_uniform_ui(c, 0)));
1240         c->flags_temp = result.index;
1241         c->flags_cond = cond;
1242         return result;
1243 }
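
/* Sketch of how the two helpers above are meant to combine (see the
 * comparison handling in ntq_emit_alu below):
 *
 *     enum v3d_qpu_cond cond;
 *     if (ntq_emit_comparison(c, compare, &cond))
 *             result = ntq_emit_cond_to_bool(c, cond);
 */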
1244 
1245 static void
ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
1247 {
1248         /* This should always be lowered to ALU operations for V3D. */
1249         assert(!instr->dest.saturate);
1250 
1251         /* Vectors are special in that they have non-scalarized writemasks,
1252          * and just take the first swizzle channel for each argument in order
1253          * into each writemask channel.
1254          */
1255         if (instr->op == nir_op_vec2 ||
1256             instr->op == nir_op_vec3 ||
1257             instr->op == nir_op_vec4) {
1258                 struct qreg srcs[4];
1259                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1260                         srcs[i] = ntq_get_src(c, instr->src[i].src,
1261                                               instr->src[i].swizzle[0]);
1262                 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1263                         ntq_store_dest(c, &instr->dest.dest, i,
1264                                        vir_MOV(c, srcs[i]));
1265                 return;
1266         }
1267 
1268         /* General case: We can just grab the one used channel per src. */
1269         struct qreg src[nir_op_infos[instr->op].num_inputs];
1270         for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1271                 src[i] = ntq_get_alu_src(c, instr, i);
1272         }
1273 
1274         struct qreg result;
1275 
1276         switch (instr->op) {
1277         case nir_op_mov:
1278                 result = vir_MOV(c, src[0]);
1279                 break;
1280 
1281         case nir_op_fneg:
1282                 result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31));
1283                 break;
1284         case nir_op_ineg:
1285                 result = vir_NEG(c, src[0]);
1286                 break;
1287 
1288         case nir_op_fmul:
1289                 result = vir_FMUL(c, src[0], src[1]);
1290                 break;
1291         case nir_op_fadd:
1292                 result = vir_FADD(c, src[0], src[1]);
1293                 break;
1294         case nir_op_fsub:
1295                 result = vir_FSUB(c, src[0], src[1]);
1296                 break;
1297         case nir_op_fmin:
1298                 result = vir_FMIN(c, src[0], src[1]);
1299                 break;
1300         case nir_op_fmax:
1301                 result = vir_FMAX(c, src[0], src[1]);
1302                 break;
1303 
1304         case nir_op_f2i32: {
1305                 nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src);
1306                 if (src0_alu && src0_alu->op == nir_op_fround_even) {
1307                         result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0));
1308                 } else {
1309                         result = vir_FTOIZ(c, src[0]);
1310                 }
1311                 break;
1312         }
1313 
1314         case nir_op_f2u32:
1315                 result = vir_FTOUZ(c, src[0]);
1316                 break;
1317         case nir_op_i2f32:
1318                 result = vir_ITOF(c, src[0]);
1319                 break;
1320         case nir_op_u2f32:
1321                 result = vir_UTOF(c, src[0]);
1322                 break;
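        /* NIR booleans are 0 or ~0, so masking with the bit pattern of 1.0f
         * (or with integer 1) yields 0.0/1.0 (or 0/1) directly.
         */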
1323         case nir_op_b2f32:
1324                 result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
1325                 break;
1326         case nir_op_b2i32:
1327                 result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
1328                 break;
1329 
1330         case nir_op_iadd:
1331                 result = vir_ADD(c, src[0], src[1]);
1332                 break;
1333         case nir_op_ushr:
1334                 result = vir_SHR(c, src[0], src[1]);
1335                 break;
1336         case nir_op_isub:
1337                 result = vir_SUB(c, src[0], src[1]);
1338                 break;
1339         case nir_op_ishr:
1340                 result = vir_ASR(c, src[0], src[1]);
1341                 break;
1342         case nir_op_ishl:
1343                 result = vir_SHL(c, src[0], src[1]);
1344                 break;
1345         case nir_op_imin:
1346                 result = vir_MIN(c, src[0], src[1]);
1347                 break;
1348         case nir_op_umin:
1349                 result = vir_UMIN(c, src[0], src[1]);
1350                 break;
1351         case nir_op_imax:
1352                 result = vir_MAX(c, src[0], src[1]);
1353                 break;
1354         case nir_op_umax:
1355                 result = vir_UMAX(c, src[0], src[1]);
1356                 break;
1357         case nir_op_iand:
1358                 result = vir_AND(c, src[0], src[1]);
1359                 break;
1360         case nir_op_ior:
1361                 result = vir_OR(c, src[0], src[1]);
1362                 break;
1363         case nir_op_ixor:
1364                 result = vir_XOR(c, src[0], src[1]);
1365                 break;
1366         case nir_op_inot:
1367                 result = vir_NOT(c, src[0]);
1368                 break;
1369 
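        /* ufind_msb is built on the CLZ op: the index of the highest set bit
         * is 31 - CLZ(x), e.g. 31 - CLZ(0x00008000) = 31 - 16 = 15.
         */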
1370         case nir_op_ufind_msb:
1371                 result = vir_SUB(c, vir_uniform_ui(c, 31), vir_CLZ(c, src[0]));
1372                 break;
1373 
1374         case nir_op_imul:
1375                 result = vir_UMUL(c, src[0], src[1]);
1376                 break;
1377 
1378         case nir_op_seq:
1379         case nir_op_sne:
1380         case nir_op_sge:
1381         case nir_op_slt: {
1382                 enum v3d_qpu_cond cond;
1383                 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1384                 assert(ok);
1385                 result = vir_MOV(c, vir_SEL(c, cond,
1386                                             vir_uniform_f(c, 1.0),
1387                                             vir_uniform_f(c, 0.0)));
1388                 c->flags_temp = result.index;
1389                 c->flags_cond = cond;
1390                 break;
1391         }
1392 
1393         case nir_op_i2b32:
1394         case nir_op_f2b32:
1395         case nir_op_feq32:
1396         case nir_op_fneu32:
1397         case nir_op_fge32:
1398         case nir_op_flt32:
1399         case nir_op_ieq32:
1400         case nir_op_ine32:
1401         case nir_op_ige32:
1402         case nir_op_uge32:
1403         case nir_op_ilt32:
1404         case nir_op_ult32: {
1405                 enum v3d_qpu_cond cond;
1406                 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond);
1407                 assert(ok);
1408                 result = ntq_emit_cond_to_bool(c, cond);
1409                 break;
1410         }
1411 
1412         case nir_op_b32csel:
1413                 result = vir_MOV(c,
1414                                  vir_SEL(c,
1415                                          ntq_emit_bool_to_cond(c, instr->src[0].src),
1416                                          src[1], src[2]));
1417                 break;
1418 
1419         case nir_op_fcsel:
1420                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), src[0]),
1421                            V3D_QPU_PF_PUSHZ);
1422                 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
1423                                             src[1], src[2]));
1424                 break;
1425 
1426         case nir_op_frcp:
1427                 result = vir_RECIP(c, src[0]);
1428                 break;
1429         case nir_op_frsq:
1430                 result = vir_RSQRT(c, src[0]);
1431                 break;
1432         case nir_op_fexp2:
1433                 result = vir_EXP(c, src[0]);
1434                 break;
1435         case nir_op_flog2:
1436                 result = vir_LOG(c, src[0]);
1437                 break;
1438 
1439         case nir_op_fceil:
1440                 result = vir_FCEIL(c, src[0]);
1441                 break;
1442         case nir_op_ffloor:
1443                 result = vir_FFLOOR(c, src[0]);
1444                 break;
1445         case nir_op_fround_even:
1446                 result = vir_FROUND(c, src[0]);
1447                 break;
1448         case nir_op_ftrunc:
1449                 result = vir_FTRUNC(c, src[0]);
1450                 break;
1451 
1452         case nir_op_fsin:
1453                 result = ntq_fsincos(c, src[0], false);
1454                 break;
1455         case nir_op_fcos:
1456                 result = ntq_fsincos(c, src[0], true);
1457                 break;
1458 
1459         case nir_op_fsign:
1460                 result = ntq_fsign(c, src[0]);
1461                 break;
1462 
1463         case nir_op_fabs: {
1464                 result = vir_FMOV(c, src[0]);
1465                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
1466                 break;
1467         }
1468 
1469         case nir_op_iabs:
1470                 result = vir_MAX(c, src[0], vir_NEG(c, src[0]));
1471                 break;
1472 
1473         case nir_op_fddx:
1474         case nir_op_fddx_coarse:
1475         case nir_op_fddx_fine:
1476                 result = vir_FDX(c, src[0]);
1477                 break;
1478 
1479         case nir_op_fddy:
1480         case nir_op_fddy_coarse:
1481         case nir_op_fddy_fine:
1482                 result = vir_FDY(c, src[0]);
1483                 break;
1484 
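        /* The add below is done purely for its flags: PUSHC pushes the
         * unsigned carry-out, and the A condition then yields ~0/0 (a sketch
         * of the intent as we read it).
         */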
1485         case nir_op_uadd_carry:
1486                 vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
1487                            V3D_QPU_PF_PUSHC);
1488                 result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
1489                 break;
1490 
1491         case nir_op_pack_half_2x16_split:
1492                 result = vir_VFPACK(c, src[0], src[1]);
1493                 break;
1494 
1495         case nir_op_unpack_half_2x16_split_x:
1496                 result = vir_FMOV(c, src[0]);
1497                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
1498                 break;
1499 
1500         case nir_op_unpack_half_2x16_split_y:
1501                 result = vir_FMOV(c, src[0]);
1502                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);
1503                 break;
1504 
1505         case nir_op_fquantize2f16: {
1506                 /* F32 -> F16 -> F32 conversion */
1507                 struct qreg tmp = vir_FMOV(c, src[0]);
1508                 vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L);
1509                 tmp = vir_FMOV(c, tmp);
1510                 vir_set_unpack(c->defs[tmp.index], 0, V3D_QPU_UNPACK_L);
1511 
1512                 /* Check for denorm */
1513                 struct qreg abs_src = vir_FMOV(c, src[0]);
1514                 vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS);
1515                 struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14));
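                /* 2^-14 is the smallest normal half-float magnitude, so
                 * anything with a smaller absolute value is a denorm and is
                 * flushed to signed zero below.
                 */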
1516                 vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold),
1517                                          V3D_QPU_PF_PUSHC);
1518 
1519                 /* Return +/-0 for denorms */
1520                 struct qreg zero =
1521                         vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000));
1522                 result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero));
1523                 break;
1524         }
1525 
1526         default:
1527                 fprintf(stderr, "unknown NIR ALU inst: ");
1528                 nir_print_instr(&instr->instr, stderr);
1529                 fprintf(stderr, "\n");
1530                 abort();
1531         }
1532 
1533         /* We have a scalar result, so the instruction should only have a
1534          * single channel written to.
1535          */
1536         assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
1537         ntq_store_dest(c, &instr->dest.dest,
1538                        ffs(instr->dest.write_mask) - 1, result);
1539 }
1540 
1541 /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
1542  * specifier.  They come from a register that's preloaded with 0xffffffff
1543  * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
1544  * 8 bits are shifted off the bottom and 0xff shifted in from the top.
1545  */
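/* In this file the config dwords are built by starting from 0xffffff00 and
 * ORing the descriptor bits into the low byte, so any bytes the shader does
 * not consume keep the default 0xff (vec4 f16 RT0) behaviour described above.
 */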
1546 #define TLB_TYPE_F16_COLOR         (3 << 6)
1547 #define TLB_TYPE_I32_COLOR         (1 << 6)
1548 #define TLB_TYPE_F32_COLOR         (0 << 6)
1549 #define TLB_RENDER_TARGET_SHIFT    3 /* Reversed!  7 = RT 0, 0 = RT 7. */
1550 #define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
1551 #define TLB_SAMPLE_MODE_PER_PIXEL  (1 << 2)
1552 #define TLB_F16_SWAP_HI_LO         (1 << 1)
1553 #define TLB_VEC_SIZE_4_F16         (1 << 0)
1554 #define TLB_VEC_SIZE_2_F16         (0 << 0)
1555 #define TLB_VEC_SIZE_MINUS_1_SHIFT 0
1556 
1557 /* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
1558  * flag is set.
1559  */
1560 #define TLB_TYPE_DEPTH             ((2 << 6) | (0 << 4))
1561 #define TLB_DEPTH_TYPE_INVARIANT   (0 << 2) /* Unmodified sideband input used */
1562 #define TLB_DEPTH_TYPE_PER_PIXEL   (1 << 2) /* QPU result used */
1563 #define TLB_V42_DEPTH_TYPE_INVARIANT   (0 << 3) /* Unmodified sideband input used */
1564 #define TLB_V42_DEPTH_TYPE_PER_PIXEL   (1 << 3) /* QPU result used */
1565 
1566 /* Stencil is a single 32-bit write. */
1567 #define TLB_TYPE_STENCIL_ALPHA     ((2 << 6) | (1 << 4))
1568 
1569 static void
1570 vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
1571 {
1572         if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
1573                 return;
1574 
1575         struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
1576         struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1577 
1578         nir_variable *var = c->output_color_var[rt];
1579         int num_components = glsl_get_vector_elements(var->type);
1580         uint32_t conf = 0xffffff00;
1581         struct qinst *inst;
1582 
1583         conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :
1584                                             TLB_SAMPLE_MODE_PER_PIXEL;
1585         conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
1586 
1587         if (c->fs_key->swap_color_rb & (1 << rt))
1588                 num_components = MAX2(num_components, 3);
1589         assert(num_components != 0);
1590 
1591         enum glsl_base_type type = glsl_get_base_type(var->type);
1592         bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT;
1593         bool is_32b_tlb_format = is_int_format ||
1594                                  (c->fs_key->f32_color_rb & (1 << rt));
1595 
1596         if (is_int_format) {
1597                 /* The F32 vs I32 distinction was dropped in 4.2. */
1598                 if (c->devinfo->ver < 42)
1599                         conf |= TLB_TYPE_I32_COLOR;
1600                 else
1601                         conf |= TLB_TYPE_F32_COLOR;
1602                 conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT);
1603         } else {
1604                 if (c->fs_key->f32_color_rb & (1 << rt)) {
1605                         conf |= TLB_TYPE_F32_COLOR;
1606                         conf |= ((num_components - 1) <<
1607                                 TLB_VEC_SIZE_MINUS_1_SHIFT);
1608                 } else {
1609                         conf |= TLB_TYPE_F16_COLOR;
1610                         conf |= TLB_F16_SWAP_HI_LO;
1611                         if (num_components >= 3)
1612                                 conf |= TLB_VEC_SIZE_4_F16;
1613                         else
1614                                 conf |= TLB_VEC_SIZE_2_F16;
1615                 }
1616         }
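        /* For reference: a plain per-pixel vec4 f16 write to RT 0 sets every
         * bit in the low byte (0xc0 | 0x38 | 0x04 | 0x02 | 0x01), giving
         * conf == 0xffffffff, which is why the TLBU config write below can be
         * skipped when conf == ~0.
         */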
1617 
1618         int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1;
1619         for (int i = 0; i < num_samples; i++) {
1620                 struct qreg *color = c->msaa_per_sample_output ?
1621                         &c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] :
1622                         &c->outputs[var->data.driver_location * 4];
1623 
1624                 struct qreg r = color[0];
1625                 struct qreg g = color[1];
1626                 struct qreg b = color[2];
1627                 struct qreg a = color[3];
1628 
1629                 if (c->fs_key->swap_color_rb & (1 << rt))  {
1630                         r = color[2];
1631                         b = color[0];
1632                 }
1633 
1634                 if (c->fs_key->sample_alpha_to_one)
1635                         a = vir_uniform_f(c, 1.0);
1636 
1637                 if (is_32b_tlb_format) {
1638                         if (i == 0) {
1639                                 inst = vir_MOV_dest(c, tlbu_reg, r);
1640                                 inst->uniform =
1641                                         vir_get_uniform_index(c,
1642                                                               QUNIFORM_CONSTANT,
1643                                                               conf);
1644                         } else {
1645                                 vir_MOV_dest(c, tlb_reg, r);
1646                         }
1647 
1648                         if (num_components >= 2)
1649                                 vir_MOV_dest(c, tlb_reg, g);
1650                         if (num_components >= 3)
1651                                 vir_MOV_dest(c, tlb_reg, b);
1652                         if (num_components >= 4)
1653                                 vir_MOV_dest(c, tlb_reg, a);
1654                 } else {
1655                         inst = vir_VFPACK_dest(c, tlb_reg, r, g);
1656                         if (conf != ~0 && i == 0) {
1657                                 inst->dst = tlbu_reg;
1658                                 inst->uniform =
1659                                         vir_get_uniform_index(c,
1660                                                               QUNIFORM_CONSTANT,
1661                                                               conf);
1662                         }
1663 
1664                         if (num_components >= 3)
1665                                 vir_VFPACK_dest(c, tlb_reg, b, a);
1666                 }
1667         }
1668 }
1669 
1670 static void
1671 emit_frag_end(struct v3d_compile *c)
1672 {
1673         /* If the shader has no non-TLB side effects and doesn't write Z
1674          * we can promote it to enabling early_fragment_tests even
1675          * if the user didn't.
1676          */
1677         if (c->output_position_index == -1 &&
1678             !(c->s->info.num_images || c->s->info.num_ssbos)) {
1679                 c->s->info.fs.early_fragment_tests = true;
1680         }
1681 
1682         if (c->output_sample_mask_index != -1) {
1683                 vir_SETMSF_dest(c, vir_nop_reg(),
1684                                 vir_AND(c,
1685                                         vir_MSF(c),
1686                                         c->outputs[c->output_sample_mask_index]));
1687         }
1688 
1689         bool has_any_tlb_color_write = false;
1690         for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
1691                 if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt])
1692                         has_any_tlb_color_write = true;
1693         }
1694 
1695         if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) {
1696                 struct nir_variable *var = c->output_color_var[0];
1697                 struct qreg *color = &c->outputs[var->data.driver_location * 4];
1698 
1699                 vir_SETMSF_dest(c, vir_nop_reg(),
1700                                 vir_AND(c,
1701                                         vir_MSF(c),
1702                                         vir_FTOC(c, color[3])));
1703         }
1704 
1705         struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
1706         if (c->output_position_index != -1 &&
1707             !c->s->info.fs.early_fragment_tests) {
1708                 struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
1709                                                   c->outputs[c->output_position_index]);
1710                 uint8_t tlb_specifier = TLB_TYPE_DEPTH;
1711 
1712                 if (c->devinfo->ver >= 42) {
1713                         tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
1714                                           TLB_SAMPLE_MODE_PER_PIXEL);
1715                 } else
1716                         tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL;
1717 
1718                 inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
1719                                                       tlb_specifier |
1720                                                       0xffffff00);
1721                 c->writes_z = true;
1722         } else if (c->s->info.fs.uses_discard ||
1723                    !c->s->info.fs.early_fragment_tests ||
1724                    c->fs_key->sample_alpha_to_coverage ||
1725                    !has_any_tlb_color_write) {
1726                 /* Emit passthrough Z if it needed to be delayed until shader
1727                  * end due to potential discards.
1728                  *
1729                  * Since (single-threaded) fragment shaders always need a TLB
1730                  * write, emit passthrough Z if we didn't have any color
1731                  * buffers and flag us as potentially discarding, so that we
1732                  * can use Z as the TLB write.
1733                  */
1734                 c->s->info.fs.uses_discard = true;
1735 
1736                 struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
1737                                                   vir_nop_reg());
1738                 uint8_t tlb_specifier = TLB_TYPE_DEPTH;
1739 
1740                 if (c->devinfo->ver >= 42) {
1741                         /* The spec says the PER_PIXEL flag is ignored for
1742                          * invariant writes, but the simulator demands it.
1743                          */
1744                         tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |
1745                                           TLB_SAMPLE_MODE_PER_PIXEL);
1746                 } else {
1747                         tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT;
1748                 }
1749 
1750                 inst->uniform = vir_get_uniform_index(c,
1751                                                       QUNIFORM_CONSTANT,
1752                                                       tlb_specifier |
1753                                                       0xffffff00);
1754                 c->writes_z = true;
1755         }
1756 
1757         /* XXX: Performance improvement: Merge Z write and color writes TLB
1758          * uniform setup
1759          */
1760         for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++)
1761                 vir_emit_tlb_color_write(c, rt);
1762 }
1763 
1764 static inline void
1765 vir_VPM_WRITE_indirect(struct v3d_compile *c,
1766                        struct qreg val,
1767                        struct qreg vpm_index,
1768                        bool uniform_vpm_index)
1769 {
1770         assert(c->devinfo->ver >= 40);
1771         if (uniform_vpm_index)
1772                 vir_STVPMV(c, vpm_index, val);
1773         else
1774                 vir_STVPMD(c, vpm_index, val);
1775 }
1776 
1777 static void
1778 vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
1779 {
1780         if (c->devinfo->ver >= 40) {
1781                 vir_VPM_WRITE_indirect(c, val,
1782                                        vir_uniform_ui(c, vpm_index), true);
1783         } else {
1784                 /* XXX: v3d33_vir_vpm_write_setup(c); */
1785                 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
1786         }
1787 }
1788 
1789 static void
1790 emit_vert_end(struct v3d_compile *c)
1791 {
1792         /* GFXH-1684: VPM writes need to be complete by the end of the shader.
1793          */
1794         if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
1795                 vir_VPMWT(c);
1796 }
1797 
1798 static void
1799 emit_geom_end(struct v3d_compile *c)
1800 {
1801         /* GFXH-1684: VPM writes need to be complete by the end of the shader.
1802          */
1803         if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
1804                 vir_VPMWT(c);
1805 }
1806 
1807 static bool
1808 mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
1809                        unsigned bit_size,
1810                        unsigned num_components,
1811                        nir_intrinsic_instr *low,
1812                        nir_intrinsic_instr *high,
1813                        void *data)
1814 {
1815         /* Our backend is 32-bit only at present */
1816         if (bit_size != 32)
1817                 return false;
1818 
1819         if (align_mul % 4 != 0 || align_offset % 4 != 0)
1820                 return false;
1821 
1822         /* Vector accesses wrap at 16-byte boundaries so we can't vectorize
1823          * if the resulting vector crosses a 16-byte boundary.
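         *
         * With align_mul clamped to 16 and align_offset reduced mod 16 below,
         * the worst-case start of the access within a 16-byte chunk is
         * 16 - align_mul + align_offset.  For example, align_mul = 8,
         * align_offset = 4 and num_components = 2 gives 16 - 8 + 4 + 8 = 20,
         * which crosses a boundary and is rejected, while a 16-byte-aligned
         * vec4 sums to exactly 16 and is allowed.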
1824          */
1825         assert(util_is_power_of_two_nonzero(align_mul));
1826         align_mul = MIN2(align_mul, 16);
1827         align_offset &= 0xf;
1828         if (16 - align_mul + align_offset + num_components * 4 > 16)
1829                 return false;
1830 
1831         return true;
1832 }
1833 
1834 void
1835 v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
1836 {
1837         bool progress;
1838         unsigned lower_flrp =
1839                 (s->options->lower_flrp16 ? 16 : 0) |
1840                 (s->options->lower_flrp32 ? 32 : 0) |
1841                 (s->options->lower_flrp64 ? 64 : 0);
1842 
1843         do {
1844                 progress = false;
1845 
1846                 NIR_PASS_V(s, nir_lower_vars_to_ssa);
1847                 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
1848                 NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
1849                 NIR_PASS(progress, s, nir_copy_prop);
1850                 NIR_PASS(progress, s, nir_opt_remove_phis);
1851                 NIR_PASS(progress, s, nir_opt_dce);
1852                 NIR_PASS(progress, s, nir_opt_dead_cf);
1853                 NIR_PASS(progress, s, nir_opt_cse);
1854                 NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
1855                 NIR_PASS(progress, s, nir_opt_algebraic);
1856                 NIR_PASS(progress, s, nir_opt_constant_folding);
1857 
1858                 nir_load_store_vectorize_options vectorize_opts = {
1859                         .modes = nir_var_mem_ssbo | nir_var_mem_ubo |
1860                                  nir_var_mem_push_const | nir_var_mem_shared |
1861                                  nir_var_mem_global,
1862                         .callback = mem_vectorize_callback,
1863                         .robust_modes = 0,
1864                 };
1865                 NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
1866 
1867                 if (lower_flrp != 0) {
1868                         bool lower_flrp_progress = false;
1869 
1870                         NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
1871                                  lower_flrp,
1872                                  false /* always_precise */);
1873                         if (lower_flrp_progress) {
1874                                 NIR_PASS(progress, s, nir_opt_constant_folding);
1875                                 progress = true;
1876                         }
1877 
1878                         /* Nothing should rematerialize any flrps, so we only
1879                          * need to do this lowering once.
1880                          */
1881                         lower_flrp = 0;
1882                 }
1883 
1884                 NIR_PASS(progress, s, nir_opt_undef);
1885                 NIR_PASS(progress, s, nir_lower_undef_to_zero);
1886 
1887                 if (c && !c->disable_loop_unrolling &&
1888                     s->options->max_unroll_iterations > 0) {
1889                        bool local_progress = false;
1890                        NIR_PASS(local_progress, s, nir_opt_loop_unroll);
1891                        c->unrolled_any_loops |= local_progress;
1892                        progress |= local_progress;
1893                 }
1894         } while (progress);
1895 
1896         nir_move_options sink_opts =
1897                 nir_move_const_undef | nir_move_comparisons | nir_move_copies |
1898                 nir_move_load_ubo;
1899         NIR_PASS(progress, s, nir_opt_sink, sink_opts);
1900 
1901         NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo);
1902 }
1903 
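/* Comparator used with nir_sort_variables_with_modes() to walk shader inputs
 * in increasing driver_location order, breaking ties by component
 * (location_frac).
 */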
1904 static int
1905 driver_location_compare(const nir_variable *a, const nir_variable *b)
1906 {
1907         return a->data.driver_location == b->data.driver_location ?
1908                a->data.location_frac - b->data.location_frac :
1909                a->data.driver_location - b->data.driver_location;
1910 }
1911 
1912 static struct qreg
1913 ntq_emit_vpm_read(struct v3d_compile *c,
1914                   uint32_t *num_components_queued,
1915                   uint32_t *remaining,
1916                   uint32_t vpm_index)
1917 {
1918         struct qreg vpm = vir_reg(QFILE_VPM, vpm_index);
1919 
1920         if (c->devinfo->ver >= 40) {
1921                 return vir_LDVPMV_IN(c,
1922                                      vir_uniform_ui(c,
1923                                                     (*num_components_queued)++));
1924         }
1925 
1926         if (*num_components_queued != 0) {
1927                 (*num_components_queued)--;
1928                 return vir_MOV(c, vpm);
1929         }
1930 
1931         uint32_t num_components = MIN2(*remaining, 32);
1932 
1933         v3d33_vir_vpm_read_setup(c, num_components);
1934 
1935         *num_components_queued = num_components - 1;
1936         *remaining -= num_components;
1937 
1938         return vir_MOV(c, vpm);
1939 }
1940 
1941 static void
1942 ntq_setup_vs_inputs(struct v3d_compile *c)
1943 {
1944         /* Figure out how many components of each vertex attribute the shader
1945          * uses.  Each variable should have been split to individual
1946          * components and unused ones DCEed.  The vertex fetcher will load
1947          * from the start of the attribute to the number of components we
1948          * declare we need in c->vattr_sizes[].
1949          *
1950          * BGRA vertex attributes are a bit special: since we implement these
1951          * as RGBA swapping R/B components we always need at least 3 components
1952          * if component 0 is read.
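         *
         * For example, a vec2 variable with location_frac 1 makes us request
         * 3 components (.xyz), and a swapped (BGRA) attribute whose variable
         * only reads component 0 still gets at least 3 so the R/B swap has
         * data to work with.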
1953          */
1954         nir_foreach_shader_in_variable(var, c->s) {
1955                 /* No VS attribute array support. */
1956                 assert(MAX2(glsl_get_length(var->type), 1) == 1);
1957 
1958                 unsigned loc = var->data.driver_location;
1959                 int start_component = var->data.location_frac;
1960                 int num_components = glsl_get_components(var->type);
1961 
1962                 c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc],
1963                                            start_component + num_components);
1964 
1965                 /* Handle BGRA inputs */
1966                 if (start_component == 0 &&
1967                     c->vs_key->va_swap_rb_mask & (1 << var->data.location)) {
1968                         c->vattr_sizes[loc] = MAX2(3, c->vattr_sizes[loc]);
1969                 }
1970         }
1971 
1972         unsigned num_components = 0;
1973         uint32_t vpm_components_queued = 0;
1974         bool uses_iid = BITSET_TEST(c->s->info.system_values_read,
1975                                     SYSTEM_VALUE_INSTANCE_ID) ||
1976                         BITSET_TEST(c->s->info.system_values_read,
1977                                     SYSTEM_VALUE_INSTANCE_INDEX);
1978         bool uses_biid = BITSET_TEST(c->s->info.system_values_read,
1979                                      SYSTEM_VALUE_BASE_INSTANCE);
1980         bool uses_vid = BITSET_TEST(c->s->info.system_values_read,
1981                                     SYSTEM_VALUE_VERTEX_ID) ||
1982                         BITSET_TEST(c->s->info.system_values_read,
1983                                     SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
1984 
1985         num_components += uses_iid;
1986         num_components += uses_biid;
1987         num_components += uses_vid;
1988 
1989         for (int i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++)
1990                 num_components += c->vattr_sizes[i];
1991 
1992         if (uses_iid) {
1993                 c->iid = ntq_emit_vpm_read(c, &vpm_components_queued,
1994                                            &num_components, ~0);
1995         }
1996 
1997         if (uses_biid) {
1998                 c->biid = ntq_emit_vpm_read(c, &vpm_components_queued,
1999                                             &num_components, ~0);
2000         }
2001 
2002         if (uses_vid) {
2003                 c->vid = ntq_emit_vpm_read(c, &vpm_components_queued,
2004                                            &num_components, ~0);
2005         }
2006 
2007         /* The actual loads will happen directly in nir_intrinsic_load_input
2008          * on newer versions.
2009          */
2010         if (c->devinfo->ver >= 40)
2011                 return;
2012 
2013         for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) {
2014                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2015                                   (loc + 1) * 4);
2016 
2017                 for (int i = 0; i < c->vattr_sizes[loc]; i++) {
2018                         c->inputs[loc * 4 + i] =
2019                                 ntq_emit_vpm_read(c,
2020                                                   &vpm_components_queued,
2021                                                   &num_components,
2022                                                   loc * 4 + i);
2023 
2024                 }
2025         }
2026 
2027         if (c->devinfo->ver >= 40) {
2028                 assert(vpm_components_queued == num_components);
2029         } else {
2030                 assert(vpm_components_queued == 0);
2031                 assert(num_components == 0);
2032         }
2033 }
2034 
2035 static bool
2036 program_reads_point_coord(struct v3d_compile *c)
2037 {
2038         nir_foreach_shader_in_variable(var, c->s) {
2039                 if (util_varying_is_point_coord(var->data.location,
2040                                                 c->fs_key->point_sprite_mask)) {
2041                         return true;
2042                 }
2043         }
2044 
2045         return false;
2046 }
2047 
2048 static void
2049 ntq_setup_gs_inputs(struct v3d_compile *c)
2050 {
2051         nir_sort_variables_with_modes(c->s, driver_location_compare,
2052                                       nir_var_shader_in);
2053 
2054         nir_foreach_shader_in_variable(var, c->s) {
2055                 /* All GS inputs are arrays with as many entries as vertices
2056                  * in the input primitive, but here we only care about the
2057                  * per-vertex input type.
2058                  */
2059                 assert(glsl_type_is_array(var->type));
2060                 const struct glsl_type *type = glsl_get_array_element(var->type);
2061                 unsigned array_len = MAX2(glsl_get_length(type), 1);
2062                 unsigned loc = var->data.driver_location;
2063 
2064                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2065                                   (loc + array_len) * 4);
2066 
2067                 if (var->data.compact) {
2068                         for (unsigned j = 0; j < array_len; j++) {
2069                                 unsigned input_idx = c->num_inputs++;
2070                                 unsigned loc_frac = var->data.location_frac + j;
2071                                 unsigned loc = var->data.location + loc_frac / 4;
2072                                 unsigned comp = loc_frac % 4;
2073                                 c->input_slots[input_idx] =
2074                                         v3d_slot_from_slot_and_component(loc, comp);
2075                         }
2076                        continue;
2077                 }
2078 
2079                 for (unsigned j = 0; j < array_len; j++) {
2080                         unsigned num_elements = glsl_get_vector_elements(type);
2081                         for (unsigned k = 0; k < num_elements; k++) {
2082                                 unsigned chan = var->data.location_frac + k;
2083                                 unsigned input_idx = c->num_inputs++;
2084                                 struct v3d_varying_slot slot =
2085                                         v3d_slot_from_slot_and_component(var->data.location + j, chan);
2086                                 c->input_slots[input_idx] = slot;
2087                         }
2088                 }
2089         }
2090 }
2091 
2092 
2093 static void
2094 ntq_setup_fs_inputs(struct v3d_compile *c)
2095 {
2096         nir_sort_variables_with_modes(c->s, driver_location_compare,
2097                                       nir_var_shader_in);
2098 
2099         nir_foreach_shader_in_variable(var, c->s) {
2100                 unsigned var_len = glsl_count_vec4_slots(var->type, false, false);
2101                 unsigned loc = var->data.driver_location;
2102 
2103                 uint32_t inputs_array_size = c->inputs_array_size;
2104                 uint32_t inputs_array_required_size = (loc + var_len) * 4;
2105                 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
2106                                   inputs_array_required_size);
2107                 resize_interp_array(c, &c->interp, &inputs_array_size,
2108                                     inputs_array_required_size);
2109 
2110                 if (var->data.location == VARYING_SLOT_POS) {
2111                         emit_fragcoord_input(c, loc);
2112                 } else if (var->data.location == VARYING_SLOT_PRIMITIVE_ID &&
2113                            !c->fs_key->has_gs) {
2114                         /* If the fragment shader reads gl_PrimitiveID and we
2115                          * don't have a geometry shader in the pipeline to write
2116                          * it then we program the hardware to inject it as
2117                          * an implicit varying. Take it from there.
2118                          */
2119                         c->inputs[loc * 4] = c->primitive_id;
2120                 } else if (util_varying_is_point_coord(var->data.location,
2121                                                        c->fs_key->point_sprite_mask)) {
2122                         c->inputs[loc * 4 + 0] = c->point_x;
2123                         c->inputs[loc * 4 + 1] = c->point_y;
2124                 } else if (var->data.compact) {
2125                         for (int j = 0; j < var_len; j++)
2126                                 emit_compact_fragment_input(c, loc, var, j);
2127                 } else if (glsl_type_is_struct(var->type)) {
2128                         for (int j = 0; j < var_len; j++) {
2129                            emit_fragment_input(c, loc, var, j, 4);
2130                         }
2131                 } else {
2132                         for (int j = 0; j < var_len; j++) {
2133                                 emit_fragment_input(c, loc, var, j, glsl_get_vector_elements(var->type));
2134                         }
2135                 }
2136         }
2137 }
2138 
2139 static void
2140 ntq_setup_outputs(struct v3d_compile *c)
2141 {
2142         if (c->s->info.stage != MESA_SHADER_FRAGMENT)
2143                 return;
2144 
2145         nir_foreach_shader_out_variable(var, c->s) {
2146                 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
2147                 unsigned loc = var->data.driver_location * 4;
2148 
2149                 assert(array_len == 1);
2150                 (void)array_len;
2151 
2152                 for (int i = 0; i < 4 - var->data.location_frac; i++) {
2153                         add_output(c, loc + var->data.location_frac + i,
2154                                    var->data.location,
2155                                    var->data.location_frac + i);
2156                 }
2157 
2158                 switch (var->data.location) {
2159                 case FRAG_RESULT_COLOR:
2160                         c->output_color_var[0] = var;
2161                         c->output_color_var[1] = var;
2162                         c->output_color_var[2] = var;
2163                         c->output_color_var[3] = var;
2164                         break;
2165                 case FRAG_RESULT_DATA0:
2166                 case FRAG_RESULT_DATA1:
2167                 case FRAG_RESULT_DATA2:
2168                 case FRAG_RESULT_DATA3:
2169                         c->output_color_var[var->data.location -
2170                                             FRAG_RESULT_DATA0] = var;
2171                         break;
2172                 case FRAG_RESULT_DEPTH:
2173                         c->output_position_index = loc;
2174                         break;
2175                 case FRAG_RESULT_SAMPLE_MASK:
2176                         c->output_sample_mask_index = loc;
2177                         break;
2178                 }
2179         }
2180 }
2181 
2182 /**
2183  * Sets up the mapping from nir_register to struct qreg *.
2184  *
2185  * Each nir_register gets a struct qreg per 32-bit component being stored.
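 *
 * For example, a nir_register with num_array_elems == 4 and
 * num_components == 2 is backed by 4 * 2 = 8 VIR temporaries.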
2186  */
2187 static void
2188 ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
2189 {
2190         foreach_list_typed(nir_register, nir_reg, node, list) {
2191                 unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
2192                 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
2193                                                   array_len *
2194                                                   nir_reg->num_components);
2195 
2196                 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
2197 
2198                 for (int i = 0; i < array_len * nir_reg->num_components; i++)
2199                         qregs[i] = vir_get_temp(c);
2200         }
2201 }
2202 
2203 static void
2204 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
2205 {
2206         /* XXX perf: Experiment with using immediate loads to avoid having
2207          * these end up in the uniform stream.  Watch out for breaking the
2208          * small immediates optimization in the process!
2209          */
2210         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
2211         for (int i = 0; i < instr->def.num_components; i++)
2212                 qregs[i] = vir_uniform_ui(c, instr->value[i].u32);
2213 
2214         _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
2215 }
2216 
2217 static void
2218 ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
2219 {
2220         unsigned image_index = nir_src_as_uint(instr->src[0]);
2221         bool is_array = nir_intrinsic_image_array(instr);
2222 
2223         assert(nir_src_as_uint(instr->src[1]) == 0);
2224 
2225         ntq_store_dest(c, &instr->dest, 0,
2226                        vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
2227         if (instr->num_components > 1) {
2228                 ntq_store_dest(c, &instr->dest, 1,
2229                                vir_uniform(c,
2230                                            instr->num_components == 2 && is_array ?
2231                                                    QUNIFORM_IMAGE_ARRAY_SIZE :
2232                                                    QUNIFORM_IMAGE_HEIGHT,
2233                                            image_index));
2234         }
2235         if (instr->num_components > 2) {
2236                 ntq_store_dest(c, &instr->dest, 2,
2237                                vir_uniform(c,
2238                                            is_array ?
2239                                            QUNIFORM_IMAGE_ARRAY_SIZE :
2240                                            QUNIFORM_IMAGE_DEPTH,
2241                                            image_index));
2242         }
2243 }
2244 
2245 static void
2246 vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr)
2247 {
2248         assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
2249 
2250         int rt = nir_src_as_uint(instr->src[0]);
2251         assert(rt < V3D_MAX_DRAW_BUFFERS);
2252 
2253         int sample_index = nir_intrinsic_base(instr);
2254         assert(sample_index < V3D_MAX_SAMPLES);
2255 
2256         int component = nir_intrinsic_component(instr);
2257         assert(component < 4);
2258 
2259         /* We need to emit our TLB reads after we have acquired the scoreboard
2260          * lock, or the GPU will hang. Usually, we do our scoreboard locking on
2261          * the last thread switch to improve parallelism; however, that is only
2262          * guaranteed to happen before the tlb color writes.
2263          *
2264          * To fix that, we make sure we always emit a thread switch before the
2265          * first tlb color read. If that happens to be the last thread switch
2266          * we emit, then everything is fine, but otherwise, if any code after
2267          * this point needs to emit additional thread switches, then we will
2268          * switch the strategy to locking the scoreboard on the first thread
2269          * switch instead -- see vir_emit_thrsw().
2270          */
2271         if (!c->emitted_tlb_load) {
2272                 if (!c->last_thrsw_at_top_level) {
2273                         assert(c->devinfo->ver >= 41);
2274                         vir_emit_thrsw(c);
2275                 }
2276 
2277                 c->emitted_tlb_load = true;
2278         }
2279 
2280         struct qreg *color_reads_for_sample =
2281                 &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
2282 
2283         if (color_reads_for_sample[component].file == QFILE_NULL) {
2284                 enum pipe_format rt_format = c->fs_key->color_fmt[rt].format;
2285                 int num_components =
2286                         util_format_get_nr_components(rt_format);
2287 
2288                 const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt);
2289                 if (swap_rb)
2290                         num_components = MAX2(num_components, 3);
2291 
2292                 nir_variable *var = c->output_color_var[rt];
2293                 enum glsl_base_type type = glsl_get_base_type(var->type);
2294 
2295                 bool is_int_format = type == GLSL_TYPE_INT ||
2296                                      type == GLSL_TYPE_UINT;
2297 
2298                 bool is_32b_tlb_format = is_int_format ||
2299                                          (c->fs_key->f32_color_rb & (1 << rt));
2300 
2301                 int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1;
2302 
2303                 uint32_t conf = 0xffffff00;
2304                 conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE :
2305                                           TLB_SAMPLE_MODE_PER_PIXEL;
2306                 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
2307 
2308                 if (is_32b_tlb_format) {
2309                         /* The F32 vs I32 distinction was dropped in 4.2. */
2310                         conf |= (c->devinfo->ver < 42 && is_int_format) ?
2311                                 TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR;
2312 
2313                         conf |= ((num_components - 1) <<
2314                                  TLB_VEC_SIZE_MINUS_1_SHIFT);
2315                 } else {
2316                         conf |= TLB_TYPE_F16_COLOR;
2317                         conf |= TLB_F16_SWAP_HI_LO;
2318 
2319                         if (num_components >= 3)
2320                                 conf |= TLB_VEC_SIZE_4_F16;
2321                         else
2322                                 conf |= TLB_VEC_SIZE_2_F16;
2323                 }
2324 
2325 
2326                 for (int i = 0; i < num_samples; i++) {
2327                         struct qreg r, g, b, a;
2328                         if (is_32b_tlb_format) {
2329                                 r = conf != 0xffffffff && i == 0 ?
2330                                         vir_TLBU_COLOR_READ(c, conf) :
2331                                         vir_TLB_COLOR_READ(c);
2332                                 if (num_components >= 2)
2333                                         g = vir_TLB_COLOR_READ(c);
2334                                 if (num_components >= 3)
2335                                         b = vir_TLB_COLOR_READ(c);
2336                                 if (num_components >= 4)
2337                                         a = vir_TLB_COLOR_READ(c);
2338                         } else {
2339                                 struct qreg rg = conf != 0xffffffff && i == 0 ?
2340                                         vir_TLBU_COLOR_READ(c, conf) :
2341                                         vir_TLB_COLOR_READ(c);
2342                                 r = vir_FMOV(c, rg);
2343                                 vir_set_unpack(c->defs[r.index], 0,
2344                                                V3D_QPU_UNPACK_L);
2345                                 g = vir_FMOV(c, rg);
2346                                 vir_set_unpack(c->defs[g.index], 0,
2347                                                V3D_QPU_UNPACK_H);
2348 
2349                                 if (num_components > 2) {
2350                                     struct qreg ba = vir_TLB_COLOR_READ(c);
2351                                     b = vir_FMOV(c, ba);
2352                                     vir_set_unpack(c->defs[b.index], 0,
2353                                                    V3D_QPU_UNPACK_L);
2354                                     a = vir_FMOV(c, ba);
2355                                     vir_set_unpack(c->defs[a.index], 0,
2356                                                    V3D_QPU_UNPACK_H);
2357                                 }
2358                         }
2359 
2360                         struct qreg *color_reads =
2361                                 &c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4];
2362 
2363                         color_reads[0] = swap_rb ? b : r;
2364                         if (num_components >= 2)
2365                                 color_reads[1] = g;
2366                         if (num_components >= 3)
2367                                 color_reads[2] = swap_rb ? r : b;
2368                         if (num_components >= 4)
2369                                 color_reads[3] = a;
2370                 }
2371         }
2372 
2373         assert(color_reads_for_sample[component].file != QFILE_NULL);
2374         ntq_store_dest(c, &instr->dest, 0,
2375                        vir_MOV(c, color_reads_for_sample[component]));
2376 }
2377 
2378 static void
2379 ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
2380 {
2381         if (nir_src_is_const(instr->src[0])) {
2382                 int offset = (nir_intrinsic_base(instr) +
2383                              nir_src_as_uint(instr->src[0]));
2384                 assert(offset % 4 == 0);
2385                 /* We need dwords */
2386                 offset = offset / 4;
2387                 for (int i = 0; i < instr->num_components; i++) {
2388                         ntq_store_dest(c, &instr->dest, i,
2389                                        vir_uniform(c, QUNIFORM_UNIFORM,
2390                                                    offset + i));
2391                 }
2392         } else {
2393                ntq_emit_tmu_general(c, instr, false);
2394         }
2395 }
2396 
2397 static void
2398 ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr)
2399 {
2400         /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset).
2401          *
2402          * Right now the driver sets PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR even
2403          * if we don't support non-uniform offsets because we also set the
2404          * lower_all_io_to_temps option in the NIR compiler. This ensures that
2405          * any indirect indexing on in/out variables is turned into indirect
2406          * indexing on temporary variables instead, that we handle by lowering
2407          * to scratch. If we implement non-uniform offset here we might be able
2408          * to avoid the temp and scratch lowering, which involves copying from
2409          * the input to the temp variable, possibly making code more optimal.
2410          */
2411         unsigned offset =
2412                 nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]);
2413 
2414         if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) {
2415                /* Emit the LDVPM directly now, rather than at the top
2416                 * of the shader like we did for V3D 3.x (which needs
2417                 * vpmsetup when not just taking the next offset).
2418                 *
2419                 * Note that delaying like this may introduce stalls,
2420                 * as LDVPMV takes a minimum of 1 instruction but may
2421                 * be slower if the VPM unit is busy with another QPU.
2422                 */
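               /* The VPM segment puts any used system values (instance ID,
                * base instance, vertex ID, in that order) first, followed by
                * each attribute's components, so compute this input's row by
                * summing everything that precedes it.
                */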
2423                int index = 0;
2424                if (BITSET_TEST(c->s->info.system_values_read,
2425                                SYSTEM_VALUE_INSTANCE_ID)) {
2426                       index++;
2427                }
2428                if (BITSET_TEST(c->s->info.system_values_read,
2429                                SYSTEM_VALUE_BASE_INSTANCE)) {
2430                       index++;
2431                }
2432                if (BITSET_TEST(c->s->info.system_values_read,
2433                                SYSTEM_VALUE_VERTEX_ID)) {
2434                       index++;
2435                }
2436                for (int i = 0; i < offset; i++)
2437                       index += c->vattr_sizes[i];
2438                index += nir_intrinsic_component(instr);
2439                for (int i = 0; i < instr->num_components; i++) {
2440                       struct qreg vpm_offset = vir_uniform_ui(c, index++);
2441                       ntq_store_dest(c, &instr->dest, i,
2442                                      vir_LDVPMV_IN(c, vpm_offset));
2443                 }
2444         } else {
2445                 for (int i = 0; i < instr->num_components; i++) {
2446                         int comp = nir_intrinsic_component(instr) + i;
2447                         ntq_store_dest(c, &instr->dest, i,
2448                                        vir_MOV(c, c->inputs[offset * 4 + comp]));
2449                 }
2450         }
2451 }
2452 
2453 static void
2454 ntq_emit_per_sample_color_write(struct v3d_compile *c,
2455                                 nir_intrinsic_instr *instr)
2456 {
2457         assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d);
2458 
2459         unsigned rt = nir_src_as_uint(instr->src[1]);
2460         assert(rt < V3D_MAX_DRAW_BUFFERS);
2461 
2462         unsigned sample_idx = nir_intrinsic_base(instr);
2463         assert(sample_idx < V3D_MAX_SAMPLES);
2464 
2465         unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4;
2466         for (int i = 0; i < instr->num_components; i++) {
2467                 c->sample_colors[offset + i] =
2468                         vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2469         }
2470 }
2471 
2472 static void
2473 ntq_emit_color_write(struct v3d_compile *c,
2474                      nir_intrinsic_instr *instr)
2475 {
2476         unsigned offset = (nir_intrinsic_base(instr) +
2477                            nir_src_as_uint(instr->src[1])) * 4 +
2478                           nir_intrinsic_component(instr);
2479         for (int i = 0; i < instr->num_components; i++) {
2480                 c->outputs[offset + i] =
2481                         vir_MOV(c, ntq_get_src(c, instr->src[0], i));
2482         }
2483 }
2484 
2485 static void
2486 emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2487 {
2488         assert(instr->num_components == 1);
2489 
2490         struct qreg offset = ntq_get_src(c, instr->src[1], 0);
2491 
2492         uint32_t base_offset = nir_intrinsic_base(instr);
2493 
2494         if (base_offset)
2495                 offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset);
2496 
2497         /* Usually, for VS or FS, we only emit outputs once at program end so
2498          * our VPM writes are never in non-uniform control flow, but this
2499          * is not true for GS, where we are emitting multiple vertices.
2500          */
2501         if (vir_in_nonuniform_control_flow(c)) {
2502                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
2503                            V3D_QPU_PF_PUSHZ);
2504         }
2505 
2506         struct qreg val = ntq_get_src(c, instr->src[0], 0);
2507 
2508         /* The offset isn’t necessarily dynamically uniform for a geometry
2509          * shader. This can happen if the shader sometimes doesn’t emit one of
2510          * the vertices. In that case subsequent vertices will be written to
2511          * different offsets in the VPM and we need to use the scatter write
2512          * instruction to have a different offset for each lane.
2513          */
2514         bool is_uniform_offset =
2515                 !vir_in_nonuniform_control_flow(c) &&
2516                 !nir_src_is_divergent(instr->src[1]);
2517         vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2518 
2519         if (vir_in_nonuniform_control_flow(c)) {
2520                 struct qinst *last_inst =
2521                         (struct qinst *)c->cur_block->instructions.prev;
2522                 vir_set_cond(last_inst, V3D_QPU_COND_IFA);
2523         }
2524 }
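
/* A hedged illustration (hypothetical GLSL, not from any spec) of why the
 * scatter form above may be needed: if a GS does
 *
 *         if (some_per_primitive_condition)
 *                 EmitVertex();
 *         EmitVertex();
 *
 * then after the first conditional EmitVertex() lanes disagree on how many
 * vertices they have emitted, so the VPM offset in src[1] diverges and each
 * lane needs its own write address.
 */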
2525 
2526 static void
2527 emit_store_output_vs(struct v3d_compile *c, nir_intrinsic_instr *instr)
2528 {
2529         assert(c->s->info.stage == MESA_SHADER_VERTEX);
2530         assert(instr->num_components == 1);
2531 
2532         uint32_t base = nir_intrinsic_base(instr);
2533         struct qreg val = ntq_get_src(c, instr->src[0], 0);
2534 
2535         if (nir_src_is_const(instr->src[1])) {
2536                 vir_VPM_WRITE(c, val,
2537                               base + nir_src_as_uint(instr->src[1]));
2538         } else {
2539                 struct qreg offset = vir_ADD(c,
2540                                              ntq_get_src(c, instr->src[1], 0),
2541                                              vir_uniform_ui(c, base));
2542                 bool is_uniform_offset =
2543                         !vir_in_nonuniform_control_flow(c) &&
2544                         !nir_src_is_divergent(instr->src[1]);
2545                 vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset);
2546         }
2547 }
2548 
2549 static void
2550 ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr)
2551 {
2552         if (c->s->info.stage == MESA_SHADER_FRAGMENT)
2553                 ntq_emit_color_write(c, instr);
2554         else if (c->s->info.stage == MESA_SHADER_GEOMETRY)
2555                 emit_store_output_gs(c, instr);
2556         else
2557                 emit_store_output_vs(c, instr);
2558 }
2559 
2560 /**
2561  * This implementation is based on v3d_sample_{x,y}_offset() from
2562  * v3d_sample_offset.h.
2563  */
2564 static void
2565 ntq_get_sample_offset(struct v3d_compile *c, struct qreg sample_idx,
2566                       struct qreg *sx, struct qreg *sy)
2567 {
2568         sample_idx = vir_ITOF(c, sample_idx);
2569 
2570         struct qreg offset_x =
2571                 vir_FADD(c, vir_uniform_f(c, -0.125f),
2572                             vir_FMUL(c, sample_idx,
2573                                         vir_uniform_f(c, 0.5f)));
2574         vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(),
2575                                     vir_uniform_f(c, 2.0f), sample_idx),
2576                    V3D_QPU_PF_PUSHC);
2577         offset_x = vir_SEL(c, V3D_QPU_COND_IFA,
2578                               vir_FSUB(c, offset_x, vir_uniform_f(c, 1.25f)),
2579                               offset_x);
2580 
2581         struct qreg offset_y =
2582                    vir_FADD(c, vir_uniform_f(c, -0.375f),
2583                                vir_FMUL(c, sample_idx,
2584                                            vir_uniform_f(c, 0.25f)));
2585         *sx = offset_x;
2586         *sy = offset_y;
2587 }
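
/* Worked example (a sketch, assuming the FCMP/PUSHC above ends up selecting
 * sample_idx >= 2): offset_x = -0.125 + 0.5 * idx, minus 1.25 for the upper
 * two samples, and offset_y = -0.375 + 0.25 * idx, giving offsets from the
 * pixel center of
 *
 *         sample 0: (-0.125, -0.375)
 *         sample 1: ( 0.375, -0.125)
 *         sample 2: (-0.375,  0.125)
 *         sample 3: ( 0.125,  0.375)
 */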
2588 
2589 /**
2590  * This implementation is based on get_centroid_offset() from fep.c.
2591  */
2592 static void
2593 ntq_get_barycentric_centroid(struct v3d_compile *c,
2594                              struct qreg *out_x,
2595                              struct qreg *out_y)
2596 {
2597         struct qreg sample_mask;
2598         if (c->output_sample_mask_index != -1)
2599                 sample_mask = c->outputs[c->output_sample_mask_index];
2600         else
2601                 sample_mask = vir_MSF(c);
2602 
2603         struct qreg i0 = vir_uniform_ui(c, 0);
2604         struct qreg i1 = vir_uniform_ui(c, 1);
2605         struct qreg i2 = vir_uniform_ui(c, 2);
2606         struct qreg i3 = vir_uniform_ui(c, 3);
2607         struct qreg i4 = vir_uniform_ui(c, 4);
2608         struct qreg i8 = vir_uniform_ui(c, 8);
2609 
2610         /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */
2611         struct qreg F = vir_uniform_ui(c, 0);
2612         struct qreg T = vir_uniform_ui(c, ~0);
2613         struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1);
2614         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
2615         s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2616         struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2);
2617         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
2618         s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2619         struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4);
2620         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
2621         s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2622         struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8);
2623         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ);
2624         s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2625 
2626         /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */
2627         struct qreg sample_idx = i3;
2628         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ);
2629         sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i1, sample_idx);
2630         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ);
2631         sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i2, sample_idx);
2632         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ);
2633         sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i0, sample_idx);
2634 
2635         /* Get offset at selected sample index */
2636         struct qreg offset_x, offset_y;
2637         ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
2638 
2639         /* Select pixel center [offset=(0,0)] if two opposing samples (or none)
2640          * are selected.
2641          */
2642         struct qreg s0_and_s3 = vir_AND(c, s0, s3);
2643         struct qreg s1_and_s2 = vir_AND(c, s1, s2);
2644 
2645         struct qreg use_center = vir_XOR(c, sample_mask, vir_uniform_ui(c, 0));
2646         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
2647         use_center = vir_SEL(c, V3D_QPU_COND_IFA, T, F);
2648         use_center = vir_OR(c, use_center, s0_and_s3);
2649         use_center = vir_OR(c, use_center, s1_and_s2);
2650 
2651         struct qreg zero = vir_uniform_f(c, 0.0f);
2652         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ);
2653         offset_x = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_x);
2654         offset_y = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_y);
2655 
2656         *out_x = offset_x;
2657         *out_y = offset_y;
2658 }
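
/* Illustrative cases (hypothetical sample masks): with sample_mask = 0x2
 * only s1 is set, so the chain "s0 ? 0 : s2 ? 2 : s1 ? 1 : 3" picks sample 1
 * and the centroid is that sample's offset.  With sample_mask = 0x6 both s1
 * and s2 are set, s1_and_s2 makes use_center true and the centroid falls
 * back to the pixel center (0, 0), as it also does for an empty mask.
 */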
2659 
2660 static struct qreg
2661 ntq_emit_load_interpolated_input(struct v3d_compile *c,
2662                                  struct qreg p,
2663                                  struct qreg C,
2664                                  struct qreg offset_x,
2665                                  struct qreg offset_y,
2666                                  unsigned mode)
2667 {
2668         if (mode == INTERP_MODE_FLAT)
2669                 return C;
2670 
2671         struct qreg sample_offset_x =
2672                 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
2673         struct qreg sample_offset_y =
2674                 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
2675 
2676         struct qreg scaleX =
2677                 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_x),
2678                             offset_x);
2679         struct qreg scaleY =
2680                 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_y),
2681                             offset_y);
2682 
2683         struct qreg pInterp =
2684                 vir_FADD(c, p, vir_FADD(c, vir_FMUL(c, vir_FDX(c, p), scaleX),
2685                                            vir_FMUL(c, vir_FDY(c, p), scaleY)));
2686 
2687         if (mode == INTERP_MODE_NOPERSPECTIVE)
2688                 return vir_FADD(c, pInterp, C);
2689 
2690         struct qreg w = c->payload_w;
2691         struct qreg wInterp =
2692                 vir_FADD(c, w, vir_FADD(c, vir_FMUL(c, vir_FDX(c, w), scaleX),
2693                                            vir_FMUL(c, vir_FDY(c, w), scaleY)));
2694 
2695         return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C);
2696 }
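
/* A sketch of the math above (an interpretation, not an authoritative
 * derivation): scaleX = (0.5 - sample_offset_x) + offset_x is the distance
 * in x from the current sample position to the requested offset measured
 * from the pixel center, and likewise for scaleY, so
 *
 *         pInterp = p + dFdx(p) * scaleX + dFdy(p) * scaleY
 *
 * extrapolates the varying to that point using screen-space derivatives.
 * Noperspective varyings just add the C coefficient, while the default
 * (perspective) path applies the same extrapolation to c->payload_w and
 * multiplies the two results before adding C.
 */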
2697 
2698 static void
2699 emit_ldunifa(struct v3d_compile *c, struct qreg *result)
2700 {
2701         struct qinst *ldunifa =
2702                 vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef);
2703         ldunifa->qpu.sig.ldunifa = true;
2704         if (result)
2705                 *result = vir_emit_def(c, ldunifa);
2706         else
2707                 vir_emit_nondef(c, ldunifa);
2708         c->current_unifa_offset += 4;
2709 }
2710 
2711 static void
2712 ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
2713 {
2714         /* Every ldunifa auto-increments the unifa address by 4 bytes, so our
2715          * current unifa offset is 4 bytes ahead of the offset of the last load.
2716          */
2717         static const int32_t max_unifa_skip_dist =
2718                 MAX_UNIFA_SKIP_DISTANCE - 4;
2719 
2720         bool dynamic_src = !nir_src_is_const(instr->src[1]);
2721         uint32_t const_offset =
2722                 dynamic_src ? 0 : nir_src_as_uint(instr->src[1]);
2723 
2724         /* In OpenGL, QUNIFORM_UBO_ADDR takes a UBO index
2725          * shifted up by 1 (0 is gallium's constant buffer 0).
2726          */
2727         uint32_t index = nir_src_as_uint(instr->src[0]);
2728         if (c->key->environment == V3D_ENVIRONMENT_OPENGL)
2729                 index++;
2730 
2731         /* We can only keep track of the last unifa address we used with
2732          * constant offset loads. If the new load targets the same UBO and
2733          * is close enough to the previous load, we can skip the unifa register
2734          * write by emitting dummy ldunifa instructions to update the unifa
2735          * address.
2736          */
2737         bool skip_unifa = false;
2738         uint32_t ldunifa_skips = 0;
2739         if (dynamic_src) {
2740                 c->current_unifa_block = NULL;
2741         } else if (c->cur_block == c->current_unifa_block &&
2742                    c->current_unifa_index == index &&
2743                    c->current_unifa_offset <= const_offset &&
2744                    c->current_unifa_offset + max_unifa_skip_dist >= const_offset) {
2745                 skip_unifa = true;
2746                 ldunifa_skips = (const_offset - c->current_unifa_offset) / 4;
2747         } else {
2748                 c->current_unifa_block = c->cur_block;
2749                 c->current_unifa_index = index;
2750                 c->current_unifa_offset = const_offset;
2751         }
2752 
2753         if (!skip_unifa) {
2754                 struct qreg base_offset =
2755                         vir_uniform(c, QUNIFORM_UBO_ADDR,
2756                                     v3d_unit_data_create(index, const_offset));
2757 
2758                 struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA);
2759                 if (!dynamic_src) {
2760                         vir_MOV_dest(c, unifa, base_offset);
2761                 } else {
2762                         vir_ADD_dest(c, unifa, base_offset,
2763                                      ntq_get_src(c, instr->src[1], 0));
2764                 }
2765         } else {
2766                 for (int i = 0; i < ldunifa_skips; i++)
2767                         emit_ldunifa(c, NULL);
2768         }
2769 
2770         for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) {
2771                 struct qreg data;
2772                 emit_ldunifa(c, &data);
2773                 ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data));
2774         }
2775 }
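
/* Worked example (hypothetical numbers): if the previous constant-offset
 * load in this block targeted the same UBO index and left
 * c->current_unifa_offset at 8, a new constant load at offset 24 takes the
 * skip_unifa path: (24 - 8) / 4 = 4 dummy ldunifa instructions advance the
 * auto-incrementing unifa address to 24, and then the per-component
 * ldunifa reads happen there, saving a unifa register write.
 */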
2776 
2777 static inline struct qreg
2778 emit_load_local_invocation_index(struct v3d_compile *c)
2779 {
2780         return vir_SHR(c, c->cs_payload[1],
2781                        vir_uniform_ui(c, 32 - c->local_invocation_index_bits));
2782 }
2783 
2784 /* Various subgroup operations rely on the A flags, so this helper ensures
2785  * that the A flags represent the currently active lanes in the subgroup.
2786  */
2787 static void
2788 set_a_flags_for_subgroup(struct v3d_compile *c)
2789 {
2790         /* MSF returns 0 for disabled lanes in compute shaders so
2791          * PUSHZ will set A=1 for disabled lanes. We want the inverse
2792          * of this, and since we don't have any means to negate the A
2793          * flags directly, we get it by repeating the same operation
2794          * with NORZ (A = ~A & ~Z).
2795          */
2796         assert(c->s->info.stage == MESA_SHADER_COMPUTE);
2797         vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
2798         vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ);
2799 
2800         /* If we are under non-uniform control flow we also need to
2801          * AND the A flags with the current execute mask.
2802          */
2803         if (vir_in_nonuniform_control_flow(c)) {
2804                 const uint32_t bidx = c->cur_block->index;
2805                 vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(),
2806                                            c->execute,
2807                                            vir_uniform_ui(c, bidx)),
2808                            V3D_QPU_UF_ANDZ);
2809         }
2810 }
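
/* Illustrative trace of the two MSF updates above: an active lane has a
 * non-zero MSF, so PUSHZ leaves A = 0 and Z = 0 and NORZ then gives
 * A = ~A & ~Z = 1; a disabled lane has MSF = 0, so PUSHZ sets A = 1 and
 * Z = 1 and NORZ gives A = 0.  The result is A = 1 exactly for the active
 * lanes, and under non-uniform control flow the ANDZ above further
 * restricts that to lanes whose execute value matches the current block.
 */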
2811 
2812 static void
2813 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
2814 {
2815         switch (instr->intrinsic) {
2816         case nir_intrinsic_load_uniform:
2817                 ntq_emit_load_uniform(c, instr);
2818                 break;
2819 
2820         case nir_intrinsic_load_ubo:
2821                 if (!nir_src_is_divergent(instr->src[1]))
2822                         ntq_emit_load_ubo_unifa(c, instr);
2823                 else
2824                         ntq_emit_tmu_general(c, instr, false);
2825                 break;
2826 
2827         case nir_intrinsic_ssbo_atomic_add:
2828         case nir_intrinsic_ssbo_atomic_imin:
2829         case nir_intrinsic_ssbo_atomic_umin:
2830         case nir_intrinsic_ssbo_atomic_imax:
2831         case nir_intrinsic_ssbo_atomic_umax:
2832         case nir_intrinsic_ssbo_atomic_and:
2833         case nir_intrinsic_ssbo_atomic_or:
2834         case nir_intrinsic_ssbo_atomic_xor:
2835         case nir_intrinsic_ssbo_atomic_exchange:
2836         case nir_intrinsic_ssbo_atomic_comp_swap:
2837         case nir_intrinsic_load_ssbo:
2838         case nir_intrinsic_store_ssbo:
2839                 ntq_emit_tmu_general(c, instr, false);
2840                 break;
2841 
2842         case nir_intrinsic_shared_atomic_add:
2843         case nir_intrinsic_shared_atomic_imin:
2844         case nir_intrinsic_shared_atomic_umin:
2845         case nir_intrinsic_shared_atomic_imax:
2846         case nir_intrinsic_shared_atomic_umax:
2847         case nir_intrinsic_shared_atomic_and:
2848         case nir_intrinsic_shared_atomic_or:
2849         case nir_intrinsic_shared_atomic_xor:
2850         case nir_intrinsic_shared_atomic_exchange:
2851         case nir_intrinsic_shared_atomic_comp_swap:
2852         case nir_intrinsic_load_shared:
2853         case nir_intrinsic_store_shared:
2854         case nir_intrinsic_load_scratch:
2855         case nir_intrinsic_store_scratch:
2856                 ntq_emit_tmu_general(c, instr, true);
2857                 break;
2858 
2859         case nir_intrinsic_image_load:
2860         case nir_intrinsic_image_store:
2861         case nir_intrinsic_image_atomic_add:
2862         case nir_intrinsic_image_atomic_imin:
2863         case nir_intrinsic_image_atomic_umin:
2864         case nir_intrinsic_image_atomic_imax:
2865         case nir_intrinsic_image_atomic_umax:
2866         case nir_intrinsic_image_atomic_and:
2867         case nir_intrinsic_image_atomic_or:
2868         case nir_intrinsic_image_atomic_xor:
2869         case nir_intrinsic_image_atomic_exchange:
2870         case nir_intrinsic_image_atomic_comp_swap:
2871                 v3d40_vir_emit_image_load_store(c, instr);
2872                 break;
2873 
2874         case nir_intrinsic_get_ssbo_size:
2875                 ntq_store_dest(c, &instr->dest, 0,
2876                                vir_uniform(c, QUNIFORM_GET_SSBO_SIZE,
2877                                            nir_src_comp_as_uint(instr->src[0], 0)));
2878                 break;
2879 
2880         case nir_intrinsic_get_ubo_size:
2881                 ntq_store_dest(c, &instr->dest, 0,
2882                                vir_uniform(c, QUNIFORM_GET_UBO_SIZE,
2883                                            nir_src_comp_as_uint(instr->src[0], 0)));
2884                 break;
2885 
2886         case nir_intrinsic_load_user_clip_plane:
2887                 for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) {
2888                         ntq_store_dest(c, &instr->dest, i,
2889                                        vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
2890                                                    nir_intrinsic_ucp_id(instr) *
2891                                                    4 + i));
2892                 }
2893                 break;
2894 
2895         case nir_intrinsic_load_viewport_x_scale:
2896                 ntq_store_dest(c, &instr->dest, 0,
2897                                vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
2898                 break;
2899 
2900         case nir_intrinsic_load_viewport_y_scale:
2901                 ntq_store_dest(c, &instr->dest, 0,
2902                                vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
2903                 break;
2904 
2905         case nir_intrinsic_load_viewport_z_scale:
2906                 ntq_store_dest(c, &instr->dest, 0,
2907                                vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
2908                 break;
2909 
2910         case nir_intrinsic_load_viewport_z_offset:
2911                 ntq_store_dest(c, &instr->dest, 0,
2912                                vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
2913                 break;
2914 
2915         case nir_intrinsic_load_line_coord:
2916                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x));
2917                 break;
2918 
2919         case nir_intrinsic_load_line_width:
2920                 ntq_store_dest(c, &instr->dest, 0,
2921                                vir_uniform(c, QUNIFORM_LINE_WIDTH, 0));
2922                 break;
2923 
2924         case nir_intrinsic_load_aa_line_width:
2925                 ntq_store_dest(c, &instr->dest, 0,
2926                                vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0));
2927                 break;
2928 
2929         case nir_intrinsic_load_sample_mask_in:
2930                 ntq_store_dest(c, &instr->dest, 0, vir_MSF(c));
2931                 break;
2932 
2933         case nir_intrinsic_load_helper_invocation:
2934                 vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
2935                 struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
2936                 ntq_store_dest(c, &instr->dest, 0, qdest);
2937                 break;
2938 
2939         case nir_intrinsic_load_front_face:
2940                 /* The register contains 0 (front) or 1 (back), and we need to
2941                  * turn it into a NIR bool where true means front.
2942                  */
2943                 ntq_store_dest(c, &instr->dest, 0,
2944                                vir_ADD(c,
2945                                        vir_uniform_ui(c, -1),
2946                                        vir_REVF(c)));
2947                 break;
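
                /* For illustration: the reverse flag is 0 for front-facing,
                 * so -1 + 0 = ~0 (NIR true), while back-facing gives
                 * -1 + 1 = 0 (NIR false).
                 */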
2948 
2949         case nir_intrinsic_load_base_instance:
2950                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid));
2951                 break;
2952 
2953         case nir_intrinsic_load_instance_id:
2954                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid));
2955                 break;
2956 
2957         case nir_intrinsic_load_vertex_id:
2958                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
2959                 break;
2960 
2961         case nir_intrinsic_load_tlb_color_v3d:
2962                 vir_emit_tlb_color_read(c, instr);
2963                 break;
2964 
2965         case nir_intrinsic_load_input:
2966                 ntq_emit_load_input(c, instr);
2967                 break;
2968 
2969         case nir_intrinsic_store_tlb_sample_color_v3d:
2970                 ntq_emit_per_sample_color_write(c, instr);
2971                 break;
2972
2973         case nir_intrinsic_store_output:
2974                 ntq_emit_store_output(c, instr);
2975                 break;
2976 
2977         case nir_intrinsic_image_size:
2978                 ntq_emit_image_size(c, instr);
2979                 break;
2980 
2981         case nir_intrinsic_discard:
2982                 ntq_flush_tmu(c);
2983 
2984                 if (vir_in_nonuniform_control_flow(c)) {
2985                         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
2986                                    V3D_QPU_PF_PUSHZ);
2987                         vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
2988                                                      vir_uniform_ui(c, 0)),
2989                                 V3D_QPU_COND_IFA);
2990                 } else {
2991                         vir_SETMSF_dest(c, vir_nop_reg(),
2992                                         vir_uniform_ui(c, 0));
2993                 }
2994                 break;
2995 
2996         case nir_intrinsic_discard_if: {
2997                 ntq_flush_tmu(c);
2998 
2999                 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);
3000 
3001                 if (vir_in_nonuniform_control_flow(c)) {
3002                         struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(),
3003                                                                c->execute);
3004                         if (cond == V3D_QPU_COND_IFA) {
3005                                 vir_set_uf(c, exec_flag, V3D_QPU_UF_ANDZ);
3006                         } else {
3007                                 vir_set_uf(c, exec_flag, V3D_QPU_UF_NORNZ);
3008                                 cond = V3D_QPU_COND_IFA;
3009                         }
3010                 }
3011 
3012                 vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
3013                                              vir_uniform_ui(c, 0)), cond);
3014 
3015                 break;
3016         }
3017 
3018         case nir_intrinsic_memory_barrier:
3019         case nir_intrinsic_memory_barrier_buffer:
3020         case nir_intrinsic_memory_barrier_image:
3021         case nir_intrinsic_memory_barrier_shared:
3022         case nir_intrinsic_memory_barrier_tcs_patch:
3023         case nir_intrinsic_group_memory_barrier:
3024                 /* We don't do any instruction scheduling of these NIR
3025                  * instructions between each other, so we just need to make
3026                  * sure that the TMU operations before the barrier are flushed
3027                  * before the ones after the barrier.
3028                  */
3029                 ntq_flush_tmu(c);
3030                 break;
3031 
3032         case nir_intrinsic_control_barrier:
3033                 /* Emit a TSY op to get all invocations in the workgroup
3034                  * (actually supergroup) to block until the last invocation
3035                  * reaches the TSY op.
3036                  */
3037                 ntq_flush_tmu(c);
3038 
3039                 if (c->devinfo->ver >= 42) {
3040                         vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
3041                                                       V3D_QPU_WADDR_SYNCB));
3042                 } else {
3043                         struct qinst *sync =
3044                                 vir_BARRIERID_dest(c,
3045                                                    vir_reg(QFILE_MAGIC,
3046                                                            V3D_QPU_WADDR_SYNCU));
3047                         sync->uniform =
3048                                 vir_get_uniform_index(c, QUNIFORM_CONSTANT,
3049                                                       0xffffff00 |
3050                                                       V3D_TSY_WAIT_INC_CHECK);
3051 
3052                 }
3053 
3054                 /* The blocking of a TSY op only happens at the next thread
3055                  * switch.  No texturing may be outstanding at the time of a
3056                  * TSY blocking operation.
3057                  */
3058                 vir_emit_thrsw(c);
3059                 break;
3060 
3061         case nir_intrinsic_load_num_workgroups:
3062                 for (int i = 0; i < 3; i++) {
3063                         ntq_store_dest(c, &instr->dest, i,
3064                                        vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
3065                                                    i));
3066                 }
3067                 break;
3068 
3069         case nir_intrinsic_load_workgroup_id: {
3070                 struct qreg x = vir_AND(c, c->cs_payload[0],
3071                                          vir_uniform_ui(c, 0xffff));
3072 
3073                 struct qreg y = vir_SHR(c, c->cs_payload[0],
3074                                          vir_uniform_ui(c, 16));
3075 
3076                 struct qreg z = vir_AND(c, c->cs_payload[1],
3077                                          vir_uniform_ui(c, 0xffff));
3078 
3079                 /* We only support dispatch base in Vulkan */
3080                 if (c->key->environment == V3D_ENVIRONMENT_VULKAN) {
3081                         x = vir_ADD(c, x,
3082                                     vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0));
3083                         y = vir_ADD(c, y,
3084                                     vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1));
3085                         z = vir_ADD(c, z,
3086                                     vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2));
3087                 }
3088 
3089                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x));
3090                 ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y));
3091                 ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z));
3092                 break;
3093         }
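
        /* Worked example (hypothetical payload values): with
         * cs_payload[0] = 0x00050003 and cs_payload[1] = 0x00000002 the
         * unpacking above yields a workgroup ID of (3, 5, 2); the top bits
         * of cs_payload[1] carry the local invocation index extracted in
         * emit_load_local_invocation_index().
         */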
3094 
3095         case nir_intrinsic_load_local_invocation_index:
3096                 ntq_store_dest(c, &instr->dest, 0,
3097                                emit_load_local_invocation_index(c));
3098                 break;
3099 
3100         case nir_intrinsic_load_subgroup_id: {
3101                 /* This is basically the batch index, which is the Local
3102                  * Invocation Index divided by the SIMD width.
3103                  */
3104                 STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS));
3105                 const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1;
3106                 struct qreg lii = emit_load_local_invocation_index(c);
3107                 ntq_store_dest(c, &instr->dest, 0,
3108                                vir_SHR(c, lii,
3109                                        vir_uniform_ui(c, divide_shift)));
3110                 break;
3111         }
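
        /* For illustration, assuming V3D_CHANNELS is 16: the shift is
         * ffs(16) - 1 = 4, so local invocation indices 0..15 map to
         * subgroup 0, 16..31 to subgroup 1, and so on.
         */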
3112 
3113         case nir_intrinsic_load_per_vertex_input: {
3114                 /* The vertex shader writes all its used outputs into
3115                  * consecutive VPM offsets, so if any output component is
3116                  * unused, its VPM offset is used by the next used
3117                  * component. This means that we can't assume that each
3118                  * location will use 4 consecutive scalar offsets in the VPM
3119                  * and we need to compute the VPM offset for each input by
3120                  * going through the inputs and finding the one that matches
3121                  * our location and component.
3122                  *
3123                  * col = vertex index, row = varying index
3124                  */
3125                 assert(nir_src_is_const(instr->src[1]));
3126                 uint32_t location =
3127                         nir_intrinsic_io_semantics(instr).location +
3128                         nir_src_as_uint(instr->src[1]);
3129                 uint32_t component = nir_intrinsic_component(instr);
3130 
3131                 int32_t row_idx = -1;
3132                 for (int i = 0; i < c->num_inputs; i++) {
3133                         struct v3d_varying_slot slot = c->input_slots[i];
3134                         if (v3d_slot_get_slot(slot) == location &&
3135                             v3d_slot_get_component(slot) == component) {
3136                                 row_idx = i;
3137                                 break;
3138                         }
3139                 }
3140 
3141                 assert(row_idx != -1);
3142 
3143                 struct qreg col = ntq_get_src(c, instr->src[0], 0);
3144                 for (int i = 0; i < instr->num_components; i++) {
3145                         struct qreg row = vir_uniform_ui(c, row_idx++);
3146                         ntq_store_dest(c, &instr->dest, i,
3147                                        vir_LDVPMG_IN(c, row, col));
3148                 }
3149                 break;
3150         }
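
        /* Hypothetical layout example: if the VS only wrote POS.xyzw and
         * VAR0.xz, c->input_slots holds those six (slot, component) pairs
         * back to back, so reading VAR0.z resolves to row_idx 5 rather than
         * the location * 4 + component it would be in a padded layout, and
         * col still selects which vertex of the primitive to read.
         */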
3151 
3152         case nir_intrinsic_emit_vertex:
3153         case nir_intrinsic_end_primitive:
3154                 unreachable("Should have been lowered in v3d_nir_lower_io");
3155                 break;
3156 
3157         case nir_intrinsic_load_primitive_id: {
3158                 /* gl_PrimitiveIdIn is written by the GBG in the first word of
3159                  * the VPM output header. According to the docs, we should read
3160                  * this using ldvpm(v,d)_in (see Table 71).
3161                  */
3162                 assert(c->s->info.stage == MESA_SHADER_GEOMETRY);
3163                 ntq_store_dest(c, &instr->dest, 0,
3164                                vir_LDVPMV_IN(c, vir_uniform_ui(c, 0)));
3165                 break;
3166         }
3167 
3168         case nir_intrinsic_load_invocation_id:
3169                 ntq_store_dest(c, &instr->dest, 0, vir_IID(c));
3170                 break;
3171 
3172         case nir_intrinsic_load_fb_layers_v3d:
3173                 ntq_store_dest(c, &instr->dest, 0,
3174                                vir_uniform(c, QUNIFORM_FB_LAYERS, 0));
3175                 break;
3176 
3177         case nir_intrinsic_load_sample_id:
3178                 ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c));
3179                 break;
3180 
3181         case nir_intrinsic_load_sample_pos:
3182                 ntq_store_dest(c, &instr->dest, 0,
3183                                vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))));
3184                 ntq_store_dest(c, &instr->dest, 1,
3185                                vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))));
3186                 break;
3187 
3188         case nir_intrinsic_load_barycentric_at_offset:
3189                 ntq_store_dest(c, &instr->dest, 0,
3190                                vir_MOV(c, ntq_get_src(c, instr->src[0], 0)));
3191                 ntq_store_dest(c, &instr->dest, 1,
3192                                vir_MOV(c, ntq_get_src(c, instr->src[0], 1)));
3193                 break;
3194 
3195         case nir_intrinsic_load_barycentric_pixel:
3196                 ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f));
3197                 ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f));
3198                 break;
3199 
3200         case nir_intrinsic_load_barycentric_at_sample: {
3201                 if (!c->fs_key->msaa) {
3202                         ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f));
3203                         ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f));
3204                         return;
3205                 }
3206 
3207                 struct qreg offset_x, offset_y;
3208                 struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0);
3209                 ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y);
3210 
3211                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x));
3212                 ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y));
3213                 break;
3214         }
3215 
3216         case nir_intrinsic_load_barycentric_sample: {
3217                 struct qreg offset_x =
3218                         vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)));
3219                 struct qreg offset_y =
3220                         vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)));
3221 
3222                 ntq_store_dest(c, &instr->dest, 0,
3223                                   vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f)));
3224                 ntq_store_dest(c, &instr->dest, 1,
3225                                   vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f)));
3226                 break;
3227         }
3228 
3229         case nir_intrinsic_load_barycentric_centroid: {
3230                 struct qreg offset_x, offset_y;
3231                 ntq_get_barycentric_centroid(c, &offset_x, &offset_y);
3232                 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x));
3233                 ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y));
3234                 break;
3235         }
3236 
3237         case nir_intrinsic_load_interpolated_input: {
3238                 assert(nir_src_is_const(instr->src[1]));
3239                 const uint32_t offset = nir_src_as_uint(instr->src[1]);
3240 
3241                 for (int i = 0; i < instr->num_components; i++) {
3242                         const uint32_t input_idx =
3243                                 (nir_intrinsic_base(instr) + offset) * 4 +
3244                                 nir_intrinsic_component(instr) + i;
3245 
3246                         /* If we are not in MSAA or if we are not interpolating
3247                          * a user varying, just return the pre-computed
3248                          * interpolated input.
3249                          */
3250                         if (!c->fs_key->msaa ||
3251                             c->interp[input_idx].vp.file == QFILE_NULL) {
3252                                 ntq_store_dest(c, &instr->dest, i,
3253                                                vir_MOV(c, c->inputs[input_idx]));
3254                                 continue;
3255                         }
3256 
3257                         /* Otherwise compute interpolation at the specified
3258                          * offset.
3259                          */
3260                         struct qreg p = c->interp[input_idx].vp;
3261                         struct qreg C = c->interp[input_idx].C;
3262                         unsigned interp_mode = c->interp[input_idx].mode;
3263 
3264                         struct qreg offset_x = ntq_get_src(c, instr->src[0], 0);
3265                         struct qreg offset_y = ntq_get_src(c, instr->src[0], 1);
3266 
3267                         struct qreg result =
3268                               ntq_emit_load_interpolated_input(c, p, C,
3269                                                                offset_x, offset_y,
3270                                                                interp_mode);
3271                         ntq_store_dest(c, &instr->dest, i, result);
3272                 }
3273                 break;
3274         }
3275 
3276         case nir_intrinsic_load_subgroup_size:
3277                 ntq_store_dest(c, &instr->dest, 0,
3278                                vir_uniform_ui(c, V3D_CHANNELS));
3279                 break;
3280 
3281         case nir_intrinsic_load_subgroup_invocation:
3282                 ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
3283                 break;
3284 
3285         case nir_intrinsic_elect: {
3286                 set_a_flags_for_subgroup(c);
3287                 struct qreg first = vir_FLAFIRST(c);
3288 
3289                 /* Produce a boolean result from Flafirst */
3290                 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3291                                            first, vir_uniform_ui(c, 1)),
3292                                            V3D_QPU_PF_PUSHZ);
3293                 struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA);
3294                 ntq_store_dest(c, &instr->dest, 0, result);
3295                 break;
3296         }
3297 
3298         case nir_intrinsic_load_num_subgroups:
3299                 unreachable("Should have been lowered");
3300                 break;
3301 
3302         case nir_intrinsic_load_view_index:
3303                 ntq_store_dest(c, &instr->dest, 0,
3304                                vir_uniform(c, QUNIFORM_VIEW_INDEX, 0));
3305                 break;
3306 
3307         default:
3308                 fprintf(stderr, "Unknown intrinsic: ");
3309                 nir_print_instr(&instr->instr, stderr);
3310                 fprintf(stderr, "\n");
3311                 break;
3312         }
3313 }
3314 
3315 /* Clears (activates) the execute flags for any channels whose jump target
3316  * matches this block.
3317  *
3318  * XXX perf: Could we be using flpush/flpop somehow for our execution channel
3319  * enabling?
3320  *
3321  */
3322 static void
3323 ntq_activate_execute_for_block(struct v3d_compile *c)
3324 {
3325         vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3326                                 c->execute, vir_uniform_ui(c, c->cur_block->index)),
3327                    V3D_QPU_PF_PUSHZ);
3328 
3329         vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
3330 }
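
/* A brief sketch of the execute-mask convention used above and in the
 * non-uniform if/jump/loop emission below: c->execute holds, per channel,
 * the index of the block that channel is waiting to run, with 0 meaning
 * "currently active".  XORing execute with the current block index and
 * pushing Z therefore sets A exactly for the channels targeting this
 * block, and the conditional MOV re-activates them by writing 0 back.
 */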
3331 
3332 static void
3333 ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt)
3334 {
3335         nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
3336         bool empty_else_block =
3337                 (nir_else_block == nir_if_last_else_block(if_stmt) &&
3338                  exec_list_is_empty(&nir_else_block->instr_list));
3339 
3340         struct qblock *then_block = vir_new_block(c);
3341         struct qblock *after_block = vir_new_block(c);
3342         struct qblock *else_block;
3343         if (empty_else_block)
3344                 else_block = after_block;
3345         else
3346                 else_block = vir_new_block(c);
3347 
3348         /* Check if this if statement is really just a conditional jump with
3349          * the form:
3350          *
3351          * if (cond) {
3352          *    break/continue;
3353          * } else {
3354          * }
3355          *
3356          * In that case we can skip the jump to ELSE that we emit before the
3357          * THEN block and instead just emit the break/continue directly.
3358          */
3359         nir_jump_instr *conditional_jump = NULL;
3360         if (empty_else_block) {
3361                 nir_block *nir_then_block = nir_if_first_then_block(if_stmt);
3362                 struct nir_instr *inst = nir_block_first_instr(nir_then_block);
3363                 if (inst && inst->type == nir_instr_type_jump)
3364                         conditional_jump = nir_instr_as_jump(inst);
3365         }
3366 
3367         /* Set up the flags for the IF condition (taking the THEN branch). */
3368         enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
3369 
3370         if (!conditional_jump) {
3371                 /* Jump to ELSE. */
3372                 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
3373                            V3D_QPU_BRANCH_COND_ANYNA :
3374                            V3D_QPU_BRANCH_COND_ANYA);
3375                 /* Pixels that were not dispatched or have been discarded
3376                  * should not contribute to the ANYA/ANYNA condition.
3377                  */
3378                 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
3379 
3380                 vir_link_blocks(c->cur_block, else_block);
3381                 vir_link_blocks(c->cur_block, then_block);
3382 
3383                 /* Process the THEN block. */
3384                 vir_set_emit_block(c, then_block);
3385                 ntq_emit_cf_list(c, &if_stmt->then_list);
3386 
3387                 if (!empty_else_block) {
3388                         /* At the end of the THEN block, jump to ENDIF, unless
3389                          * the block ended in a break or continue.
3390                          */
3391                         if (!c->cur_block->branch_emitted) {
3392                                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
3393                                 vir_link_blocks(c->cur_block, after_block);
3394                         }
3395 
3396                         /* Emit the else block. */
3397                         vir_set_emit_block(c, else_block);
3398                         ntq_emit_cf_list(c, &if_stmt->else_list);
3399                 }
3400         } else {
3401                 /* Emit the conditional jump directly.
3402                  *
3403                  * Use ALL with breaks and ANY with continues to ensure that
3404                  * we always break and never continue when all lanes have been
3405                  * disabled (for example because of discards) to prevent
3406                  * infinite loops.
3407                  */
3408                 assert(conditional_jump &&
3409                        (conditional_jump->type == nir_jump_continue ||
3410                         conditional_jump->type == nir_jump_break));
3411 
3412                 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ?
3413                            (conditional_jump->type == nir_jump_break ?
3414                             V3D_QPU_BRANCH_COND_ALLA :
3415                             V3D_QPU_BRANCH_COND_ANYA) :
3416                            (conditional_jump->type == nir_jump_break ?
3417                             V3D_QPU_BRANCH_COND_ALLNA :
3418                             V3D_QPU_BRANCH_COND_ANYNA));
3419                 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
3420 
3421                 vir_link_blocks(c->cur_block,
3422                                 conditional_jump->type == nir_jump_break ?
3423                                         c->loop_break_block :
3424                                         c->loop_cont_block);
3425         }
3426 
3427         vir_link_blocks(c->cur_block, after_block);
3428 
3429         vir_set_emit_block(c, after_block);
3430 }
3431 
3432 static void
3433 ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt)
3434 {
3435         nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
3436         bool empty_else_block =
3437                 (nir_else_block == nir_if_last_else_block(if_stmt) &&
3438                  exec_list_is_empty(&nir_else_block->instr_list));
3439 
3440         struct qblock *then_block = vir_new_block(c);
3441         struct qblock *after_block = vir_new_block(c);
3442         struct qblock *else_block;
3443         if (empty_else_block)
3444                 else_block = after_block;
3445         else
3446                 else_block = vir_new_block(c);
3447 
3448         bool was_uniform_control_flow = false;
3449         if (!vir_in_nonuniform_control_flow(c)) {
3450                 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
3451                 was_uniform_control_flow = true;
3452         }
3453 
3454         /* Set up the flags for the IF condition (taking the THEN branch). */
3455         enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition);
3456 
3457         /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and
3458          * was previously active (execute Z)" for updating the exec flags.
3459          */
3460         if (was_uniform_control_flow) {
3461                 cond = v3d_qpu_cond_invert(cond);
3462         } else {
3463                 struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute);
3464                 if (cond == V3D_QPU_COND_IFA) {
3465                         vir_set_uf(c, inst, V3D_QPU_UF_NORNZ);
3466                 } else {
3467                         vir_set_uf(c, inst, V3D_QPU_UF_ANDZ);
3468                         cond = V3D_QPU_COND_IFA;
3469                 }
3470         }
3471 
3472         vir_MOV_cond(c, cond,
3473                      c->execute,
3474                      vir_uniform_ui(c, else_block->index));
3475 
3476         /* Jump to ELSE if nothing is active for THEN, otherwise fall
3477          * through.
3478          */
3479         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
3480         vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
3481         vir_link_blocks(c->cur_block, else_block);
3482         vir_link_blocks(c->cur_block, then_block);
3483 
3484         /* Process the THEN block. */
3485         vir_set_emit_block(c, then_block);
3486         ntq_emit_cf_list(c, &if_stmt->then_list);
3487 
3488         if (!empty_else_block) {
3489                 /* Handle the end of the THEN block.  First, all currently
3490                  * active channels update their execute flags to point to
3491                  * ENDIF.
3492                  */
3493                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3494                            V3D_QPU_PF_PUSHZ);
3495                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
3496                              vir_uniform_ui(c, after_block->index));
3497 
3498                 /* If everything points at ENDIF, then jump there immediately. */
3499                 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(),
3500                                         c->execute,
3501                                         vir_uniform_ui(c, after_block->index)),
3502                            V3D_QPU_PF_PUSHZ);
3503                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
3504                 vir_link_blocks(c->cur_block, after_block);
3505                 vir_link_blocks(c->cur_block, else_block);
3506 
3507                 vir_set_emit_block(c, else_block);
3508                 ntq_activate_execute_for_block(c);
3509                 ntq_emit_cf_list(c, &if_stmt->else_list);
3510         }
3511 
3512         vir_link_blocks(c->cur_block, after_block);
3513 
3514         vir_set_emit_block(c, after_block);
3515         if (was_uniform_control_flow)
3516                 c->execute = c->undef;
3517         else
3518                 ntq_activate_execute_for_block(c);
3519 }
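
/* Illustrative trace (assuming we enter from uniform control flow): execute
 * starts at 0 for every channel, the condition is inverted, and channels
 * that fail the IF get execute = else_block->index.  If that leaves no
 * channel with execute == 0, the ALLNA branch skips straight to the ELSE
 * block; otherwise the THEN block runs with the reduced active set,
 * re-points its channels at ENDIF when there is an ELSE block, and the
 * ELSE block re-activates its own channels via
 * ntq_activate_execute_for_block().
 */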
3520 
3521 static void
3522 ntq_emit_if(struct v3d_compile *c, nir_if *nif)
3523 {
3524         bool was_in_control_flow = c->in_control_flow;
3525         c->in_control_flow = true;
3526         if (!vir_in_nonuniform_control_flow(c) &&
3527             !nir_src_is_divergent(nif->condition)) {
3528                 ntq_emit_uniform_if(c, nif);
3529         } else {
3530                 ntq_emit_nonuniform_if(c, nif);
3531         }
3532         c->in_control_flow = was_in_control_flow;
3533 }
3534 
3535 static void
3536 ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
3537 {
3538         switch (jump->type) {
3539         case nir_jump_break:
3540                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3541                            V3D_QPU_PF_PUSHZ);
3542                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
3543                              vir_uniform_ui(c, c->loop_break_block->index));
3544                 break;
3545 
3546         case nir_jump_continue:
3547                 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute),
3548                            V3D_QPU_PF_PUSHZ);
3549                 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
3550                              vir_uniform_ui(c, c->loop_cont_block->index));
3551                 break;
3552 
3553         case nir_jump_return:
3554                 unreachable("All returns should be lowered\n");
3555                 break;
3556 
3557         case nir_jump_halt:
3558         case nir_jump_goto:
3559         case nir_jump_goto_if:
3560                 unreachable("not supported\n");
3561                 break;
3562         }
3563 }
3564 
3565 static void
3566 ntq_emit_uniform_jump(struct v3d_compile *c, nir_jump_instr *jump)
3567 {
3568         switch (jump->type) {
3569         case nir_jump_break:
3570                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
3571                 vir_link_blocks(c->cur_block, c->loop_break_block);
3572                 c->cur_block->branch_emitted = true;
3573                 break;
3574         case nir_jump_continue:
3575                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
3576                 vir_link_blocks(c->cur_block, c->loop_cont_block);
3577                 c->cur_block->branch_emitted = true;
3578                 break;
3579 
3580         case nir_jump_return:
3581                 unreachable("All returns should be lowered\n");
3582                 break;
3583 
3584         case nir_jump_halt:
3585         case nir_jump_goto:
3586         case nir_jump_goto_if:
3587                 unreachable("not supported\n");
3588                 break;
3589         }
3590 }
3591 
3592 static void
3593 ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
3594 {
3595         switch (instr->type) {
3596         case nir_instr_type_alu:
3597                 ntq_emit_alu(c, nir_instr_as_alu(instr));
3598                 break;
3599 
3600         case nir_instr_type_intrinsic:
3601                 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
3602                 break;
3603 
3604         case nir_instr_type_load_const:
3605                 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
3606                 break;
3607 
3608         case nir_instr_type_ssa_undef:
3609                 unreachable("Should've been lowered by nir_lower_undef_to_zero");
3610                 break;
3611 
3612         case nir_instr_type_tex:
3613                 ntq_emit_tex(c, nir_instr_as_tex(instr));
3614                 break;
3615 
3616         case nir_instr_type_jump:
3617                 /* Always flush TMU before jumping to another block, for the
3618                  * same reasons as in ntq_emit_block.
3619                  */
3620                 ntq_flush_tmu(c);
3621                 if (vir_in_nonuniform_control_flow(c))
3622                         ntq_emit_jump(c, nir_instr_as_jump(instr));
3623                 else
3624                         ntq_emit_uniform_jump(c, nir_instr_as_jump(instr));
3625                 break;
3626 
3627         default:
3628                 fprintf(stderr, "Unknown NIR instr type: ");
3629                 nir_print_instr(instr, stderr);
3630                 fprintf(stderr, "\n");
3631                 abort();
3632         }
3633 }
3634 
3635 static void
3636 ntq_emit_block(struct v3d_compile *c, nir_block *block)
3637 {
3638         nir_foreach_instr(instr, block) {
3639                 ntq_emit_instr(c, instr);
3640         }
3641 
3642         /* Always process pending TMU operations in the same block they were
3643          * emitted: we can't emit TMU operations in a block and then emit a
3644          * thread switch and LDTMU/TMUWT for them in another block, possibly
3645          * under control flow.
3646          */
3647         ntq_flush_tmu(c);
3648 }
3649 
3650 static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
3651 
3652 static void
3653 ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop)
3654 {
3655         bool was_uniform_control_flow = false;
3656         if (!vir_in_nonuniform_control_flow(c)) {
3657                 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
3658                 was_uniform_control_flow = true;
3659         }
3660 
3661         c->loop_cont_block = vir_new_block(c);
3662         c->loop_break_block = vir_new_block(c);
3663 
3664         vir_link_blocks(c->cur_block, c->loop_cont_block);
3665         vir_set_emit_block(c, c->loop_cont_block);
3666         ntq_activate_execute_for_block(c);
3667 
3668         ntq_emit_cf_list(c, &loop->body);
3669 
3670         /* Re-enable any previous continues now, so our ANYA check below
3671          * works.
3672          *
3673          * XXX: Use the .ORZ flags update, instead.
3674          */
3675         vir_set_pf(c, vir_XOR_dest(c,
3676                                 vir_nop_reg(),
3677                                 c->execute,
3678                                 vir_uniform_ui(c, c->loop_cont_block->index)),
3679                    V3D_QPU_PF_PUSHZ);
3680         vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
3681 
3682         vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ);
3683 
3684         struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
3685         /* Pixels that were not dispatched or have been discarded should not
3686          * contribute to looping again.
3687          */
3688         branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
3689         vir_link_blocks(c->cur_block, c->loop_cont_block);
3690         vir_link_blocks(c->cur_block, c->loop_break_block);
3691 
3692         vir_set_emit_block(c, c->loop_break_block);
3693         if (was_uniform_control_flow)
3694                 c->execute = c->undef;
3695         else
3696                 ntq_activate_execute_for_block(c);
3697 }
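
/* Illustrative trace: channels that executed a "continue" have execute set
 * to loop_cont_block->index and channels that hit a "break" point at
 * loop_break_block->index (see ntq_emit_jump).  At the bottom of the body
 * the continue channels are re-activated (execute = 0) and the ANYA branch
 * loops back while any channel is still active; once every channel has
 * broken out, execution falls through to the break block.
 */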
3698 
3699 static void
3700 ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop)
3701 {
3702 
3703         c->loop_cont_block = vir_new_block(c);
3704         c->loop_break_block = vir_new_block(c);
3705 
3706         vir_link_blocks(c->cur_block, c->loop_cont_block);
3707         vir_set_emit_block(c, c->loop_cont_block);
3708 
3709         ntq_emit_cf_list(c, &loop->body);
3710 
3711         if (!c->cur_block->branch_emitted) {
3712                 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
3713                 vir_link_blocks(c->cur_block, c->loop_cont_block);
3714         }
3715 
3716         vir_set_emit_block(c, c->loop_break_block);
3717 }
3718 
3719 static void
3720 ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
3721 {
3722         bool was_in_control_flow = c->in_control_flow;
3723         c->in_control_flow = true;
3724 
3725         struct qblock *save_loop_cont_block = c->loop_cont_block;
3726         struct qblock *save_loop_break_block = c->loop_break_block;
3727 
3728         if (vir_in_nonuniform_control_flow(c) || loop->divergent) {
3729                 ntq_emit_nonuniform_loop(c, loop);
3730         } else {
3731                 ntq_emit_uniform_loop(c, loop);
3732         }
3733 
3734         c->loop_break_block = save_loop_break_block;
3735         c->loop_cont_block = save_loop_cont_block;
3736 
3737         c->loops++;
3738 
3739         c->in_control_flow = was_in_control_flow;
3740 }
3741 
3742 static void
ntq_emit_function(struct v3d_compile * c,nir_function_impl * func)3743 ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
3744 {
3745         fprintf(stderr, "FUNCTIONS not handled.\n");
3746         abort();
3747 }
3748 
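/* Walks a NIR control-flow list and emits VIR for each node.  Function nodes
 * are not supported and abort in ntq_emit_function().
 */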
static void
ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
{
        foreach_list_typed(nir_cf_node, node, node, list) {
                switch (node->type) {
                case nir_cf_node_block:
                        ntq_emit_block(c, nir_cf_node_as_block(node));
                        break;

                case nir_cf_node_if:
                        ntq_emit_if(c, nir_cf_node_as_if(node));
                        break;

                case nir_cf_node_loop:
                        ntq_emit_loop(c, nir_cf_node_as_loop(node));
                        break;

                case nir_cf_node_function:
                        ntq_emit_function(c, nir_cf_node_as_function(node));
                        break;

                default:
                        fprintf(stderr, "Unknown NIR node type\n");
                        abort();
                }
        }
}

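/* Emits VIR for a NIR function implementation: local registers first, then
 * the body's control-flow list.
 */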
static void
ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
{
        ntq_setup_registers(c, &impl->registers);
        ntq_emit_cf_list(c, &impl->body);
}

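/* Emits the stage-specific payload and input/output setup, then translates
 * the NIR main function into VIR.
 */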
static void
nir_to_vir(struct v3d_compile *c)
{
        switch (c->s->info.stage) {
        case MESA_SHADER_FRAGMENT:
                c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
                c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
                c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));

                /* V3D 4.x can disable implicit varyings if they are not used */
                c->fs_uses_primitive_id =
                        nir_find_variable_with_location(c->s, nir_var_shader_in,
                                                        VARYING_SLOT_PRIMITIVE_ID);
                if (c->fs_uses_primitive_id && !c->fs_key->has_gs) {
                        c->primitive_id =
                                emit_fragment_varying(c, NULL, -1, 0, 0);
                }

                if (c->fs_key->is_points &&
                    (c->devinfo->ver < 40 || program_reads_point_coord(c))) {
                        c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0);
                        c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0);
                        c->uses_implicit_point_line_varyings = true;
                } else if (c->fs_key->is_lines &&
                           (c->devinfo->ver < 40 ||
                            BITSET_TEST(c->s->info.system_values_read,
                                        SYSTEM_VALUE_LINE_COORD))) {
                        c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0);
                        c->uses_implicit_point_line_varyings = true;
                }

                c->force_per_sample_msaa =
                   c->s->info.fs.uses_sample_qualifier ||
                   BITSET_TEST(c->s->info.system_values_read,
                               SYSTEM_VALUE_SAMPLE_ID) ||
                   BITSET_TEST(c->s->info.system_values_read,
                               SYSTEM_VALUE_SAMPLE_POS);
                break;
        case MESA_SHADER_COMPUTE:
                /* Set up the TSO for barriers, assuming we do some. */
                if (c->devinfo->ver < 42) {
                        vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
                                                      V3D_QPU_WADDR_SYNC));
                }

                c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
                c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));

                /* Set up the division between gl_LocalInvocationIndex and
                 * wg_in_mem in the payload reg.
                 */
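                /* For example, with a hypothetical 8x4x1 workgroup: wg_size is
                 * 32, MAX2(32, 64) rounds it up to 64, so
                 * local_invocation_index_bits = ffs(64) - 1 = 6 and the
                 * payload bits above those select the workgroup's slot
                 * (wg_in_mem).
                 */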
                int wg_size = (c->s->info.workgroup_size[0] *
                               c->s->info.workgroup_size[1] *
                               c->s->info.workgroup_size[2]);
                c->local_invocation_index_bits =
                        ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1;
                assert(c->local_invocation_index_bits <= 8);

                if (c->s->info.shared_size) {
                        struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1],
                                                        vir_uniform_ui(c, 16));
                        if (c->s->info.workgroup_size[0] != 1 ||
                            c->s->info.workgroup_size[1] != 1 ||
                            c->s->info.workgroup_size[2] != 1) {
                                int wg_bits = (16 -
                                               c->local_invocation_index_bits);
                                int wg_mask = (1 << wg_bits) - 1;
                                wg_in_mem = vir_AND(c, wg_in_mem,
                                                    vir_uniform_ui(c, wg_mask));
                        }
                        struct qreg shared_per_wg =
                                vir_uniform_ui(c, c->s->info.shared_size);

                        c->cs_shared_offset =
                                vir_ADD(c,
                                        vir_uniform(c, QUNIFORM_SHARED_OFFSET, 0),
                                        vir_UMUL(c, wg_in_mem, shared_per_wg));
                }
                break;
        default:
                break;
        }

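        /* NIR scratch (per-channel private storage) is presumably backed by
         * the spill buffer here, so reserve scratch_size bytes for each of
         * the V3D_CHANNELS channels.
         */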
        if (c->s->scratch_size) {
                v3d_setup_spill_base(c);
                c->spill_size += V3D_CHANNELS * c->s->scratch_size;
        }

        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                ntq_setup_vs_inputs(c);
                break;
        case MESA_SHADER_GEOMETRY:
                ntq_setup_gs_inputs(c);
                break;
        case MESA_SHADER_FRAGMENT:
                ntq_setup_fs_inputs(c);
                break;
        case MESA_SHADER_COMPUTE:
                break;
        default:
                unreachable("unsupported shader stage");
        }

        ntq_setup_outputs(c);

        /* Find the main function and emit the body. */
        nir_foreach_function(function, c->s) {
                assert(strcmp(function->name, "main") == 0);
                assert(function->impl);
                ntq_emit_impl(c, function->impl);
        }
}

/**
 * When demoting a shader down to single-threaded, removes the THRSW
 * instructions (one will still be inserted at v3d_vir_to_qpu() for the
 * program end).
 */
static void
vir_remove_thrsw(struct v3d_compile *c)
{
        vir_for_each_block(block, c) {
                vir_for_each_inst_safe(inst, block) {
                        if (inst->qpu.sig.thrsw)
                                vir_remove_instruction(c, inst);
                }
        }

        c->last_thrsw = NULL;
}

/**
 * Makes sure we have a top-level last thread switch signalling the start of
 * the last thread section, adding a new THRSW instruction if needed.
 *
 * Spilling is not allowed in the last thread section, so if spilling later
 * injects additional thread switches, keeping this one as the last thread
 * switch in the program makes last-thread-switch signalling much easier.
 * If in the end the program doesn't need to spill and we only injected a new
 * thread switch here for that purpose, we will eventually restore the
 * previous last thread switch and remove the one added here.
 */
static void
vir_emit_last_thrsw(struct v3d_compile *c,
                    struct qinst **restore_last_thrsw,
                    bool *restore_scoreboard_lock)
{
        *restore_last_thrsw = c->last_thrsw;

        /* On V3D before 4.1, we need a TMU op to be outstanding when thread
         * switching, so disable threads if we didn't do any TMU ops (each of
         * which would have emitted a THRSW).
         */
        if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
                c->threads = 1;
                if (c->last_thrsw)
                        vir_remove_thrsw(c);
                *restore_last_thrsw = NULL;
        }

        /* If we're threaded and the last THRSW was in conditional code, then
         * we need to emit another one so that we can flag it as the last
         * thrsw.
         */
        if (c->last_thrsw && !c->last_thrsw_at_top_level) {
                assert(c->devinfo->ver >= 41);
                vir_emit_thrsw(c);
        }

        /* If we're threaded, then we need to mark the last THRSW instruction
         * so we can emit a pair of them at QPU emit time.
         *
         * For V3D 4.x, we can spawn the non-fragment shaders already in the
         * post-last-THRSW state, so we can skip this.
         */
        if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
                assert(c->devinfo->ver >= 41);
                vir_emit_thrsw(c);
        }

        /* If we have not inserted a last thread switch yet, do it now to
         * ensure any potential spilling we do happens before this. If we
         * don't spill in the end, we will restore the previous one.
         */
        if (*restore_last_thrsw == c->last_thrsw) {
                if (*restore_last_thrsw)
                        (*restore_last_thrsw)->is_last_thrsw = false;
                *restore_scoreboard_lock = c->lock_scoreboard_on_first_thrsw;
                vir_emit_thrsw(c);
        } else {
                *restore_last_thrsw = c->last_thrsw;
        }

        assert(c->last_thrsw);
        c->last_thrsw->is_last_thrsw = true;
}

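/* Removes the last thread switch injected in vir_emit_last_thrsw() and
 * restores the previously recorded one (if any) together with its scoreboard
 * lock state, for the case where no spilling happened after all.
 */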
static void
vir_restore_last_thrsw(struct v3d_compile *c,
                       struct qinst *thrsw,
                       bool scoreboard_lock)
{
        assert(c->last_thrsw);
        vir_remove_instruction(c, c->last_thrsw);
        c->last_thrsw = thrsw;
        if (c->last_thrsw)
                c->last_thrsw->is_last_thrsw = true;
        c->lock_scoreboard_on_first_thrsw = scoreboard_lock;
}

/* There's a flag in the shader for "center W is needed for reasons other than
 * non-centroid varyings", so we just walk the program after VIR optimization
 * to see if it's used.  It should be harmless to set even if we only use
 * center W for varyings.
 */
static void
vir_check_payload_w(struct v3d_compile *c)
{
        if (c->s->info.stage != MESA_SHADER_FRAGMENT)
                return;

        vir_for_each_inst_inorder(inst, c) {
                for (int i = 0; i < vir_get_nsrc(inst); i++) {
                        if (inst->src[i].file == QFILE_REG &&
                            inst->src[i].index == 0) {
                                c->uses_center_w = true;
                                return;
                        }
                }
        }
}

void
v3d_nir_to_vir(struct v3d_compile *c)
{
        if (V3D_DEBUG & (V3D_DEBUG_NIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d NIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                nir_print_shader(c->s, stderr);
        }

        nir_to_vir(c);

        bool restore_scoreboard_lock = false;
        struct qinst *restore_last_thrsw;

        /* Emit the last THRSW before STVPM and TLB writes. */
        vir_emit_last_thrsw(c,
                            &restore_last_thrsw,
                            &restore_scoreboard_lock);

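        /* Emit the stage epilogue: TLB color/depth writes for fragment
         * shaders, VPM stores for vertex and geometry shaders.  Compute
         * shaders have no fixed epilogue.
         */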
        switch (c->s->info.stage) {
        case MESA_SHADER_FRAGMENT:
                emit_frag_end(c);
                break;
        case MESA_SHADER_GEOMETRY:
                emit_geom_end(c);
                break;
        case MESA_SHADER_VERTEX:
                emit_vert_end(c);
                break;
        case MESA_SHADER_COMPUTE:
                break;
        default:
                unreachable("bad stage");
        }

        if (V3D_DEBUG & (V3D_DEBUG_VIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        vir_optimize(c);

        vir_check_payload_w(c);

        /* XXX perf: On VC4, we do VIR-level instruction scheduling here.  We
         * used that on that platform to pipeline TMU writes and reduce the
         * number of thread switches, as well as to try (mostly successfully)
         * to reduce maximum register pressure to allow more threads.  We
         * should do something of that sort for V3D -- either instruction
         * scheduling here, or delaying the THRSW and LDTMUs from our texture
         * instructions until the results are needed.
         */

        if (V3D_DEBUG & (V3D_DEBUG_VIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        /* Attempt to allocate registers for the temporaries.  If we fail,
         * reduce thread count and try again.
         */
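        /* Each failed attempt halves the thread count, down to the hardware
         * minimum for register allocation (2 threads on V3D 4.1+, 1 before
         * that); dropping to a single thread also requires removing the
         * thread switches we emitted.
         */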
        int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
        struct qpu_reg *temp_registers;
        while (true) {
                bool spilled;
                temp_registers = v3d_register_allocate(c, &spilled);
                if (spilled)
                        continue;

                if (temp_registers)
                        break;

                if (c->threads == min_threads &&
                    (V3D_DEBUG & V3D_DEBUG_RA)) {
                        fprintf(stderr,
                                "Failed to register allocate using %s\n",
                                c->fallback_scheduler ? "the fallback scheduler" :
                                "the normal scheduler");

                        vir_dump(c);

                        char *shaderdb;
                        int ret = v3d_shaderdb_dump(c, &shaderdb);
                        if (ret > 0) {
                                fprintf(stderr, "%s\n", shaderdb);
                                free(shaderdb);
                        }
                }

                if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) {
                        if (V3D_DEBUG & V3D_DEBUG_PERF) {
                                fprintf(stderr,
                                        "Failed to register allocate %s at "
                                        "%d threads.\n", vir_get_stage_name(c),
                                        c->threads);
                        }
                        c->compilation_result =
                                V3D_COMPILATION_FAILED_REGISTER_ALLOCATION;
                        return;
                }

                c->spill_count = 0;
                c->threads /= 2;

                if (c->threads == 1)
                        vir_remove_thrsw(c);
        }

        /* If we didn't spill, then remove the last thread switch we injected
         * artificially (if any) and restore the previous one.
         */
        if (!c->spills && c->last_thrsw != restore_last_thrsw)
                vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock);

        if (c->spills &&
            (V3D_DEBUG & (V3D_DEBUG_VIR |
                          v3d_debug_flag_for_shader_stage(c->s->info.stage)))) {
                fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        v3d_vir_to_qpu(c, temp_registers);
}