/*
 * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3.h"

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "util/bitscan.h"
#include "util/half_float.h"
#include "util/ralloc.h"
#include "util/u_math.h"

#include "instr-a3xx.h"
#include "ir3_shader.h"

/* simple allocator to carve allocations out of an up-front allocated heap,
 * so that we can free everything easily in one shot.
 */
void *
ir3_alloc(struct ir3 *shader, int sz)
{
   return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
}

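/* Create a new ir3 shader IR, parented (via ralloc) to the variant so that
 * freeing the variant frees the IR along with it.
 */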
struct ir3 *
ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
{
   struct ir3 *shader = rzalloc(v, struct ir3);

   shader->compiler = compiler;
   shader->type = v->type;

   list_inithead(&shader->block_list);
   list_inithead(&shader->array_list);

   return shader;
}

void
ir3_destroy(struct ir3 *shader)
{
   ralloc_free(shader);
}

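/* Track the max register/const footprint touched by a single register
 * access, taking (rptN) repeats and relative addressing into account.
 */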
static void
collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
                 struct ir3_info *info)
{
   struct ir3_shader_variant *v = info->data;
   unsigned repeat = instr->repeat;

   if (reg->flags & IR3_REG_IMMED) {
      /* nothing to do */
      return;
   }

   if (!(reg->flags & IR3_REG_R)) {
      repeat = 0;
   }

   unsigned components;
   int16_t max;

   if (reg->flags & IR3_REG_RELATIV) {
      components = reg->size;
      max = (reg->array.base + components - 1);
   } else {
      components = util_last_bit(reg->wrmask);
      max = (reg->num + repeat + components - 1);
   }

   if (reg->flags & IR3_REG_CONST) {
      info->max_const = MAX2(info->max_const, max >> 2);
   } else if (max < regid(48, 0)) {
      if (reg->flags & IR3_REG_HALF) {
         if (v->mergedregs) {
            /* starting w/ a6xx, half regs conflict with full regs: */
            info->max_reg = MAX2(info->max_reg, max >> 3);
         } else {
            info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
         }
      } else {
         info->max_reg = MAX2(info->max_reg, max >> 2);
      }
   }
}

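/* Decide whether the doubled threadsize (eg. 128 threads/wave instead of
 * 64 on a6xx) should be used, given the shader stage and its register
 * footprint.
 */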
bool
ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
{
   const struct ir3_compiler *compiler = v->shader->compiler;

   /* We can't support more than compiler->branchstack_size diverging threads
    * in a wave. Thus, doubling the threadsize is only possible if we don't
    * exceed the branchstack size limit.
    */
   if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
       compiler->branchstack_size) {
      return false;
   }

   switch (v->type) {
   case MESA_SHADER_COMPUTE: {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];

      /* For a5xx, if the workgroup size is greater than the maximum number
       * of threads per core with 32 threads per wave (512) then we have to
       * use the doubled threadsize because otherwise the workgroup wouldn't
       * fit. For smaller workgroup sizes, we follow the blob and use the
       * smaller threadsize.
       */
      if (compiler->gen < 6) {
         return v->local_size_variable ||
                threads_per_wg >
                   compiler->threadsize_base * compiler->max_waves;
      }

      /* On a6xx, we prefer the larger threadsize unless the workgroup is
       * small enough that it would be useless. Note that because
       * threadsize_base is bumped to 64, we don't have to worry about the
       * workgroup fitting, unlike the a5xx case.
       */
      if (!v->local_size_variable) {
         if (threads_per_wg <= compiler->threadsize_base)
            return false;
      }
   }
      FALLTHROUGH;
   case MESA_SHADER_FRAGMENT: {
      /* Check that doubling the threadsize wouldn't exceed the regfile size */
      return regs_count * 2 <= compiler->reg_size_vec4;
   }

   default:
      /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
       * stages - the bit doesn't exist. The blob never used it for the VS
       * on earlier gens anyway.
       */
      return false;
   }
}

/* Get the maximum number of waves that could be used even if this shader
 * didn't use any registers.
 */
unsigned
ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                  bool double_threadsize)
{
   const struct ir3_compiler *compiler = v->shader->compiler;
   unsigned max_waves = compiler->max_waves;

   /* If this is a compute shader, compute the limit based on shared size */
   if (v->type == MESA_SHADER_COMPUTE) {
      /* Shared is allocated in chunks of 1k */
      unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
      if (shared_per_wg > 0 && !v->local_size_variable) {
         unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;
         unsigned threads_per_wg =
            v->local_size[0] * v->local_size[1] * v->local_size[2];
         unsigned waves_per_wg =
            DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
                                            (double_threadsize ? 2 : 1) *
                                            compiler->wave_granularity);
         max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
                                        compiler->wave_granularity);
      }
   }

   /* Compute the limit based on branchstack */
   if (v->branchstack > 0) {
      unsigned branchstack_max_waves = compiler->branchstack_size /
                                       v->branchstack *
                                       compiler->wave_granularity;
      max_waves = MIN2(max_waves, branchstack_max_waves);
   }

   return max_waves;
}

/* Get the maximum number of waves that could be launched limited by reg size.
 */
unsigned
ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                unsigned reg_count, bool double_threadsize)
{
   return reg_count ? (compiler->reg_size_vec4 /
                       (reg_count * (double_threadsize ? 2 : 1)) *
                       compiler->wave_granularity)
                    : compiler->max_waves;
}

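/* Walk the final IR and collect per-shader statistics: instruction/nop
 * counts, register and const footprint, and the resulting max_waves
 * estimate.
 */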
void
ir3_collect_info(struct ir3_shader_variant *v)
{
   struct ir3_info *info = &v->info;
   struct ir3 *shader = v->ir;
   const struct ir3_compiler *compiler = v->shader->compiler;

   memset(info, 0, sizeof(*info));
   info->data = v;
   info->max_reg = -1;
   info->max_half_reg = -1;
   info->max_const = -1;
   info->multi_dword_ldp_stp = false;

   uint32_t instr_count = 0;
   foreach_block (block, &shader->block_list) {
      foreach_instr (instr, &block->instr_list) {
         instr_count++;
      }
   }

   v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);

   /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
    * doesn't try to decode the following data as instructions (such as the
    * next stage's shader in turnip)
    */
   info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
   info->sizedwords = info->size / 4;

   foreach_block (block, &shader->block_list) {
      int sfu_delay = 0;

      foreach_instr (instr, &block->instr_list) {

         foreach_src (reg, instr) {
            collect_reg_info(instr, reg, info);
         }

         foreach_dst (reg, instr) {
            if (is_dest_gpr(reg)) {
               collect_reg_info(instr, reg, info);
            }
         }

         if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) {
            unsigned components = instr->srcs[2]->uim_val;
            if (components * type_size(instr->cat6.type) > 32) {
               info->multi_dword_ldp_stp = true;
            }

            if (instr->opc == OPC_STP)
               info->stp_count += components;
            else
               info->ldp_count += components;
         }

         if ((instr->opc == OPC_BARY_F) && (instr->dsts[0]->flags & IR3_REG_EI))
            info->last_baryf = info->instrs_count;

         unsigned instrs_count = 1 + instr->repeat + instr->nop;
         unsigned nops_count = instr->nop;

         if (instr->opc == OPC_NOP) {
            nops_count = 1 + instr->repeat;
            info->instrs_per_cat[0] += nops_count;
         } else {
            info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
            info->instrs_per_cat[0] += nops_count;
         }

         if (instr->opc == OPC_MOV) {
            if (instr->cat1.src_type == instr->cat1.dst_type) {
               info->mov_count += 1 + instr->repeat;
            } else {
               info->cov_count += 1 + instr->repeat;
            }
         }

         info->instrs_count += instrs_count;
         info->nops_count += nops_count;

         if (instr->flags & IR3_INSTR_SS) {
            info->ss++;
            info->sstall += sfu_delay;
            sfu_delay = 0;
         }

         if (instr->flags & IR3_INSTR_SY)
            info->sy++;

         if (is_sfu(instr)) {
            sfu_delay = 10;
         } else {
            int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
            sfu_delay -= n;
         }
      }
   }

   /* TODO: for a5xx and below, is there a separate regfile for
    * half-registers?
    */
   unsigned regs_count =
      info->max_reg + 1 +
      (compiler->gen >= 6 ? ((info->max_half_reg + 2) / 2) : 0);

   info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
   unsigned reg_independent_max_waves =
      ir3_get_reg_independent_max_waves(v, info->double_threadsize);
   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
      compiler, regs_count, info->double_threadsize);
   info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
   assert(info->max_waves <= v->shader->compiler->max_waves);
}

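/* Allocate a register from the shader's heap. wrmask defaults to 0x1
 * (single component); callers widen it as needed.
 */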
static struct ir3_register *
reg_create(struct ir3 *shader, int num, int flags)
{
   struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
   reg->wrmask = 1;
   reg->flags = flags;
   reg->num = num;
   return reg;
}

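/* Append an instruction at the end of a block, assigning it a serialno
 * for debugging. Input instructions are additionally collected into the
 * shader's baryfs array.
 */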
static void
insert_instr(struct ir3_block *block, struct ir3_instruction *instr)
{
   struct ir3 *shader = block->shader;

   instr->serialno = ++shader->instr_count;

   list_addtail(&instr->node, &block->instr_list);

   if (is_input(instr))
      array_insert(shader, shader->baryfs, instr);
}

struct ir3_block *
ir3_block_create(struct ir3 *shader)
{
   struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
#ifdef DEBUG
   block->serialno = ++shader->block_count;
#endif
   block->shader = shader;
   list_inithead(&block->node);
   list_inithead(&block->instr_list);
   return block;
}

void
ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   array_insert(block, block->predecessors, pred);
}

void
ir3_block_add_physical_predecessor(struct ir3_block *block,
                                   struct ir3_block *pred)
{
   array_insert(block, block->physical_predecessors, pred);
}

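/* Remove a predecessor by swapping the last array element into its slot;
 * note that this does not preserve the order of the remaining
 * predecessors.
 */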
void
ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         if (i < block->predecessors_count - 1) {
            block->predecessors[i] =
               block->predecessors[block->predecessors_count - 1];
         }

         block->predecessors_count--;
         return;
      }
   }
}

void
ir3_block_remove_physical_predecessor(struct ir3_block *block,
                                      struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
      if (block->physical_predecessors[i] == pred) {
         if (i < block->physical_predecessors_count - 1) {
            block->physical_predecessors[i] =
               block->physical_predecessors[block->physical_predecessors_count - 1];
         }

         block->physical_predecessors_count--;
         return;
      }
   }
}

unsigned
ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         return i;
      }
   }

   unreachable("ir3_block_get_pred_index() invalid predecessor");
}

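/* Allocate an instruction and its dst/src pointer arrays with a single
 * allocation: the two arrays of ir3_register pointers are laid out
 * immediately after the ir3_instruction itself.
 */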
static struct ir3_instruction *
instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   /* Add extra sources for array destinations and the address reg */
   if (1 <= opc_cat(opc))
      nsrc += 2;
   struct ir3_instruction *instr;
   unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
                 (nsrc * sizeof(instr->srcs[0]));
   char *ptr = ir3_alloc(block->shader, sz);

   instr = (struct ir3_instruction *)ptr;
   ptr += sizeof(*instr);
   instr->dsts = (struct ir3_register **)ptr;
   instr->srcs = instr->dsts + ndst;

#ifdef DEBUG
   instr->dsts_max = ndst;
   instr->srcs_max = nsrc;
#endif

   return instr;
}

struct ir3_instruction *
ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
   instr->block = block;
   instr->opc = opc;
   insert_instr(block, instr);
   return instr;
}

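/* Make a shallow copy of an instruction, cloning its dst/src registers,
 * and append the copy to the same block.
 */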
struct ir3_instruction *
ir3_instr_clone(struct ir3_instruction *instr)
{
   struct ir3_instruction *new_instr = instr_create(
      instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
   struct ir3_register **dsts, **srcs;

   dsts = new_instr->dsts;
   srcs = new_instr->srcs;
   *new_instr = *instr;
   new_instr->dsts = dsts;
   new_instr->srcs = srcs;

   insert_instr(instr->block, new_instr);

   /* clone registers: */
   new_instr->dsts_count = 0;
   new_instr->srcs_count = 0;
   foreach_dst (reg, instr) {
      struct ir3_register *new_reg =
         ir3_dst_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
      if (new_reg->instr)
         new_reg->instr = new_instr;
   }
   foreach_src (reg, instr) {
      struct ir3_register *new_reg =
         ir3_src_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
   }

   if (instr->address) {
      assert(instr->srcs_count > 0);
      new_instr->address = new_instr->srcs[instr->srcs_count - 1];
   }

   return new_instr;
}

/* Add a false dependency to instruction, to ensure it is scheduled first: */
void
ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
{
   for (unsigned i = 0; i < instr->deps_count; i++) {
      if (instr->deps[i] == dep)
         return;
   }

   array_insert(instr, instr->deps, dep);
}

struct ir3_register *
ir3_src_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#ifdef DEBUG
   debug_assert(instr->srcs_count < instr->srcs_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->srcs[instr->srcs_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#ifdef DEBUG
   debug_assert(instr->dsts_count < instr->dsts_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->dsts[instr->dsts_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
{
   struct ir3_register *new_reg = reg_create(shader, 0, 0);
   *new_reg = *reg;
   return new_reg;
}

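/* Duplicate an array access as an extra (tied) source whose def points at
 * the previous write of the array, keeping the dependency on the last
 * array write explicit.
 */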
void
ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
                       struct ir3_register *last_write)
{
   assert(reg->flags & IR3_REG_ARRAY);
   struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
   *new_reg = *reg;
   new_reg->def = last_write;
   ir3_reg_tie(reg, new_reg);
}

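/* Record the address register (a0.x/a1.x) that a relative access depends
 * on: the address becomes an extra source, and the instruction is tracked
 * in the shader's a0/a1 user lists.
 */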
void
ir3_instr_set_address(struct ir3_instruction *instr,
                      struct ir3_instruction *addr)
{
   if (!instr->address) {
      struct ir3 *ir = instr->block->shader;

      debug_assert(instr->block == addr->block);

      instr->address =
         ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
      instr->address->def = addr->dsts[0];
      debug_assert(reg_num(addr->dsts[0]) == REG_A0);
      unsigned comp = reg_comp(addr->dsts[0]);
      if (comp == 0) {
         array_insert(ir, ir->a0_users, instr);
      } else {
         debug_assert(comp == 1);
         array_insert(ir, ir->a1_users, instr);
      }
   } else {
      debug_assert(instr->address->def->instr == addr);
   }
}

void
ir3_block_clear_mark(struct ir3_block *block)
{
   foreach_instr (instr, &block->instr_list)
      instr->flags &= ~IR3_INSTR_MARK;
}

void
ir3_clear_mark(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      ir3_block_clear_mark(block);
   }
}

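/* Number all instructions sequentially, recording each block's start/end
 * ip. Note that ips start at 1, presumably so that 0 can serve as a
 * sentinel.
 */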
unsigned
ir3_count_instructions(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt;
   }
   return cnt;
}

/* When counting instructions for RA, we insert extra fake instructions at the
 * beginning of each block, where values become live, and at the end where
 * values die. This prevents problems where values live-in at the beginning or
 * live-out at the end of a block from being treated as if they were
 * live-in/live-out at the first/last instruction, which would be incorrect.
 * In ir3_legalize these ips are assumed to be actual ips of the final
 * program, so it would be incorrect to use this everywhere.
 */

unsigned
ir3_count_instructions_ra(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt++;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt++;
   }
   return cnt;
}

struct ir3_array *
ir3_lookup_array(struct ir3 *ir, unsigned id)
{
   foreach_array (arr, &ir->array_list)
      if (arr->id == id)
         return arr;
   return NULL;
}

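/* Rebuild the use-sets (instr->uses) for all SSA sources in the shader;
 * false dependencies are only included if falsedeps is set.
 */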
void
ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
{
   /* We could do this in a single pass if we can assume instructions
    * are always sorted, which currently might not always be true.
    * (In particular after ir3_group pass, but maybe other places.)
    */
   foreach_block (block, &ir->block_list)
      foreach_instr (instr, &block->instr_list)
         instr->uses = NULL;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         foreach_ssa_src_n (src, n, instr) {
            if (__is_false_dep(instr, n) && !falsedeps)
               continue;
            if (!src->uses)
               src->uses = _mesa_pointer_set_create(mem_ctx);
            _mesa_set_add(src->uses, instr);
         }
      }
   }
}

/**
 * Set the destination type of an instruction, for example if a
 * conversion is folded in, handling the special cases where the
 * instruction's dest type or opcode needs to be fixed up.
 */
void
ir3_set_dst_type(struct ir3_instruction *instr, bool half)
{
   if (half) {
      instr->dsts[0]->flags |= IR3_REG_HALF;
   } else {
      instr->dsts[0]->flags &= ~IR3_REG_HALF;
   }

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (half) {
         instr->cat1.dst_type = half_type(instr->cat1.dst_type);
      } else {
         instr->cat1.dst_type = full_type(instr->cat1.dst_type);
      }
      break;
   case 4:
      if (half) {
         instr->opc = cat4_half_opc(instr->opc);
      } else {
         instr->opc = cat4_full_opc(instr->opc);
      }
      break;
   case 5:
      if (half) {
         instr->cat5.type = half_type(instr->cat5.type);
      } else {
         instr->cat5.type = full_type(instr->cat5.type);
      }
      break;
   }
}

/**
 * One-time fixup for instruction src-types.  Other than cov's that
 * are folded, an instruction's src type does not change.
 */
void
ir3_fixup_src_type(struct ir3_instruction *instr)
{
   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->cat1.src_type = half_type(instr->cat1.src_type);
      } else {
         instr->cat1.src_type = full_type(instr->cat1.src_type);
      }
      break;
   case 3:
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->opc = cat3_half_opc(instr->opc);
      } else {
         instr->opc = cat3_full_opc(instr->opc);
      }
      break;
   }
}

/**
 * Map a floating point immed to FLUT (float lookup table) value,
 * returns negative for immediates that cannot be mapped.
 */
int
ir3_flut(struct ir3_register *src_reg)
{
   static const struct {
      uint32_t f32;
      uint16_t f16;
   } flut[] = {
         { .f32 = 0x00000000, .f16 = 0x0000 },    /* 0.0 */
         { .f32 = 0x3f000000, .f16 = 0x3800 },    /* 0.5 */
         { .f32 = 0x3f800000, .f16 = 0x3c00 },    /* 1.0 */
         { .f32 = 0x40000000, .f16 = 0x4000 },    /* 2.0 */
         { .f32 = 0x402df854, .f16 = 0x4170 },    /* e */
         { .f32 = 0x40490fdb, .f16 = 0x4248 },    /* pi */
         { .f32 = 0x3ea2f983, .f16 = 0x3518 },    /* 1/pi */
         { .f32 = 0x3f317218, .f16 = 0x398c },    /* 1/log2(e) */
         { .f32 = 0x3fb8aa3b, .f16 = 0x3dc5 },    /* log2(e) */
         { .f32 = 0x3e9a209b, .f16 = 0x34d1 },    /* 1/log2(10) */
         { .f32 = 0x40549a78, .f16 = 0x42a5 },    /* log2(10) */
         { .f32 = 0x40800000, .f16 = 0x4400 },    /* 4.0 */
   };

   if (src_reg->flags & IR3_REG_HALF) {
      /* Note that half-float immeds are already lowered to 16b in nir: */
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f16 == imm) {
            return i;
         }
      }
   } else {
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f32 == imm) {
            return i;
         }
      }
   }

   return -1;
}

static unsigned
cp_flags(unsigned flags)
{
   /* only considering these flags (at least for now): */
   flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
             IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV |
             IR3_REG_SHARED);
   return flags;
}

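/* Check whether the register flags 'flags' would be legal for src n of
 * instr; used (among other places) by copy propagation to decide whether
 * a source modifier, const, immed, etc can be folded into an instruction.
 */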
bool
ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
{
   struct ir3_compiler *compiler = instr->block->shader->compiler;
   unsigned valid_flags;

   if ((flags & IR3_REG_SHARED) && opc_cat(instr->opc) > 3)
      return false;

   flags = cp_flags(flags);

   /* If destination is indirect, then source cannot be.. at least
    * I don't think so..
    */
   if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
       (flags & IR3_REG_RELATIV))
      return false;

   if (flags & IR3_REG_RELATIV) {
      /* TODO need to test on earlier gens.. pretty sure the earlier
       * problem was just that we didn't check that the src was from
       * same block (since we can't propagate address register values
       * across blocks currently)
       */
      if (compiler->gen < 6)
         return false;

      /* NOTE in the special try_swap_mad_two_srcs() case we can be
       * called on a src that has already had an indirect load folded
       * in, in which case ssa() returns NULL
       */
      if (instr->srcs[n]->flags & IR3_REG_SSA) {
         struct ir3_instruction *src = ssa(instr->srcs[n]);
         if (src->address->def->instr->block != instr->block)
            return false;
      }
   }

   if (is_meta(instr)) {
      /* collect and phi nodes support const/immed sources, which will be
       * turned into move instructions, but not anything else.
       */
      if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
         return false;

      if ((flags & IR3_REG_SHARED) && !(instr->dsts[0]->flags & IR3_REG_SHARED))
         return false;

      return true;
   }

   switch (opc_cat(instr->opc)) {
   case 0: /* end, chmask */
      return flags == 0;
   case 1:
      switch (instr->opc) {
      case OPC_MOVMSK:
      case OPC_SWZ:
      case OPC_SCT:
      case OPC_GAT:
         valid_flags = IR3_REG_SHARED;
         break;
      default:
         valid_flags =
            IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;
      }
      if (flags & ~valid_flags)
         return false;
      break;
   case 2:
      valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
                    IR3_REG_RELATIV | IR3_REG_IMMED | IR3_REG_SHARED;

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
         unsigned m = n ^ 1;
         /* cannot deal w/ const or shared in both srcs:
          * (note that some cat2 actually only have a single src)
          */
         if (m < instr->srcs_count) {
            struct ir3_register *reg = instr->srcs[m];
            if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
                (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
               return false;
            if ((flags & IR3_REG_IMMED) && reg->flags & (IR3_REG_IMMED))
               return false;
         }
      }
      break;
   case 3:
      valid_flags =
         ir3_cat3_absneg(instr->opc) | IR3_REG_RELATIV | IR3_REG_SHARED;

      if (instr->opc == OPC_SHLG_B16) {
         valid_flags |= IR3_REG_IMMED;
         /* shlg.b16 can be RELATIV+CONST but not CONST: */
         if (flags & IR3_REG_RELATIV)
            valid_flags |= IR3_REG_CONST;
      } else {
         valid_flags |= IR3_REG_CONST;
      }

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_SHARED | IR3_REG_RELATIV)) {
         /* cannot deal w/ const/shared/relativ in 2nd src: */
         if (n == 1)
            return false;
      }

      break;
   case 4:
      /* seems like blob compiler avoids const as src.. */
      /* TODO double check if this is still the case on a4xx */
      if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
         return false;
      if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
         return false;
      break;
   case 5:
      /* no flags allowed */
      if (flags)
         return false;
      break;
   case 6:
      valid_flags = IR3_REG_IMMED;
      if (flags & ~valid_flags)
         return false;

      if (flags & IR3_REG_IMMED) {
         /* doesn't seem like we can have immediate src for store
          * instructions:
          *
          * TODO this restriction could also apply to load instructions,
          * but for load instructions this arg is the address (and not
          * really sure any good way to test a hard-coded immed addr src)
          */
         if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
            return false;

         if ((instr->opc == OPC_LDL) && (n == 0))
            return false;

         if ((instr->opc == OPC_STL) && (n != 2))
            return false;

         if ((instr->opc == OPC_LDP) && (n == 0))
            return false;

         if ((instr->opc == OPC_STP) && (n != 2))
            return false;

         if (instr->opc == OPC_STLW && n == 0)
            return false;

         if (instr->opc == OPC_LDLW && n == 0)
            return false;

         /* disallow immediates in anything but the SSBO slot argument for
          * cat6 instructions:
          */
         if (is_atomic(instr->opc) && (n != 0))
            return false;

         if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
            return false;

         if (instr->opc == OPC_STG && (n == 2))
            return false;

         if (instr->opc == OPC_STG_A && (n == 4))
            return false;

         /* as with atomics, these cat6 instrs can only have an immediate
          * for SSBO/IBO slot argument
          */
         switch (instr->opc) {
         case OPC_LDIB:
         case OPC_STIB:
         case OPC_RESINFO:
            if (n != 0)
               return false;
            break;
         default:
            break;
         }
      }

      break;
   }

   return true;
}

bool
ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed)
{
   if (instr->opc == OPC_MOV || is_meta(instr))
      return true;

   if (is_mem(instr)) {
      switch (instr->opc) {
      /* Some load/store instructions have a 13-bit offset and size which must
       * always be an immediate and the rest of the sources cannot be
       * immediates, so the frontend is responsible for checking the size:
       */
      case OPC_LDL:
      case OPC_STL:
      case OPC_LDP:
      case OPC_STP:
      case OPC_LDG:
      case OPC_STG:
      case OPC_SPILL_MACRO:
      case OPC_RELOAD_MACRO:
      case OPC_LDG_A:
      case OPC_STG_A:
      case OPC_LDLW:
      case OPC_STLW:
      case OPC_LDLV:
         return true;
      default:
         /* most cat6 src immediates can only encode 8 bits: */
         return !(immed & ~0xff);
      }
   }

   /* Other than cat1 (mov) we can only encode up to 10 bits, sign-extended: */
   return !(immed & ~0x1ff) || !(-immed & ~0x1ff);
}