/*
 * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3.h"

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "util/bitscan.h"
#include "util/half_float.h"
#include "util/ralloc.h"
#include "util/u_math.h"

#include "instr-a3xx.h"
#include "ir3_shader.h"

/* simple allocator to carve allocations out of an up-front allocated heap,
 * so that we can free everything easily in one shot.
 */
void *
ir3_alloc(struct ir3 *shader, int sz)
{
   return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
}
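
/* An illustrative sketch of the resulting lifecycle (not upstream
 * documentation): because everything is ralloc'd against the ir3 (or the
 * variant that owns it), there is no per-object free:
 *
 *    struct ir3 *ir = ir3_create(compiler, v);
 *    ...build blocks and instructions, all carved out via ir3_alloc()...
 *    ir3_destroy(ir);   // a single ralloc_free() releases the whole heap
 */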

struct ir3 *
ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
{
   struct ir3 *shader = rzalloc(v, struct ir3);

   shader->compiler = compiler;
   shader->type = v->type;

   list_inithead(&shader->block_list);
   list_inithead(&shader->array_list);

   return shader;
}

void
ir3_destroy(struct ir3 *shader)
{
   ralloc_free(shader);
}

static void
collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
                 struct ir3_info *info)
{
   struct ir3_shader_variant *v = info->data;
   unsigned repeat = instr->repeat;

   if (reg->flags & IR3_REG_IMMED) {
      /* nothing to do */
      return;
   }

   if (!(reg->flags & IR3_REG_R)) {
      repeat = 0;
   }

   unsigned components;
   int16_t max;

   if (reg->flags & IR3_REG_RELATIV) {
      components = reg->size;
      max = (reg->array.base + components - 1);
   } else {
      components = util_last_bit(reg->wrmask);
      max = (reg->num + repeat + components - 1);
   }

   if (reg->flags & IR3_REG_CONST) {
      info->max_const = MAX2(info->max_const, max >> 2);
   } else if (max < regid(48, 0)) {
      if (reg->flags & IR3_REG_HALF) {
         if (v->mergedregs) {
            /* starting w/ a6xx, half regs conflict with full regs: */
            info->max_reg = MAX2(info->max_reg, max >> 3);
         } else {
            info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
         }
      } else {
         info->max_reg = MAX2(info->max_reg, max >> 2);
      }
   }
}
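
/* A note on the shifts above (an explanatory sketch inferred from the
 * encoding, not authoritative documentation): scalar register numbers pack
 * the component into the low two bits, roughly
 *
 *    num = (reg << 2) | comp       // e.g. r1.z -> (1 << 2) | 2 == 6
 *
 * so "max >> 2" recovers the highest touched vec4 register.  With merged
 * registers on a6xx+, one full vec4 register holds eight half components,
 * hence "max >> 3" in the half case.
 */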

bool
ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
{
   const struct ir3_compiler *compiler = v->shader->compiler;

   /* If the user forced a particular wavesize, respect that. */
   if (v->shader->real_wavesize == IR3_SINGLE_ONLY)
      return false;
   if (v->shader->real_wavesize == IR3_DOUBLE_ONLY)
      return true;

   /* We can't support more than compiler->branchstack_size diverging threads
    * in a wave. Thus, doubling the threadsize is only possible if we don't
    * exceed the branchstack size limit.
    */
   if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
       compiler->branchstack_size) {
      return false;
   }

   switch (v->type) {
   case MESA_SHADER_KERNEL:
   case MESA_SHADER_COMPUTE: {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];

      /* For a5xx, if the workgroup size is greater than the maximum number
       * of threads per core with 32 threads per wave (512) then we have to
       * use the doubled threadsize because otherwise the workgroup wouldn't
       * fit. For smaller workgroup sizes, we follow the blob and use the
       * smaller threadsize.
       */
      if (compiler->gen < 6) {
         return v->local_size_variable ||
                threads_per_wg >
                   compiler->threadsize_base * compiler->max_waves;
      }

      /* On a6xx, we prefer the larger threadsize unless the workgroup is
       * small enough that it would be useless. Note that because
       * threadsize_base is bumped to 64, we don't have to worry about the
       * workgroup fitting, unlike the a5xx case.
       */
      if (!v->local_size_variable) {
         if (threads_per_wg <= compiler->threadsize_base)
            return false;
      }
   }
      FALLTHROUGH;
   case MESA_SHADER_FRAGMENT: {
      /* Check that doubling the threadsize wouldn't exceed the regfile size */
      return regs_count * 2 <= compiler->reg_size_vec4;
   }

   default:
      /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
       * stages - the bit doesn't exist. The blob never used it for the VS
       * on earlier gens anyway.
       */
      return false;
   }
}
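
/* A worked example (hypothetical numbers): on an a6xx-style part with
 * threadsize_base = 64, an 8x8x1 compute workgroup is 64 threads, which is
 * <= threadsize_base, so we return false and a single-size wave covers the
 * whole workgroup.  A 16x16x1 workgroup (256 threads) instead doubles the
 * threadsize, provided 2 * regs_count still fits in reg_size_vec4.
 */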

/* Get the maximum number of waves that could be used even if this shader
 * didn't use any registers.
 */
unsigned
ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                  bool double_threadsize)
{
   const struct ir3_compiler *compiler = v->shader->compiler;
   unsigned max_waves = compiler->max_waves;

   /* Compute the limit based on branchstack */
   if (v->branchstack > 0) {
      unsigned branchstack_max_waves = compiler->branchstack_size /
                                       v->branchstack *
                                       compiler->wave_granularity;
      max_waves = MIN2(max_waves, branchstack_max_waves);
   }

   /* If this is a compute shader, compute the limit based on shared size */
   if ((v->type == MESA_SHADER_COMPUTE) ||
       (v->type == MESA_SHADER_KERNEL)) {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];
      unsigned waves_per_wg =
         DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
                                         (double_threadsize ? 2 : 1) *
                                         compiler->wave_granularity);

      /* Shared is allocated in chunks of 1k */
      unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
      if (shared_per_wg > 0 && !v->local_size_variable) {
         unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;

         max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
                                        compiler->wave_granularity);
      }

      /* If we have a compute shader with a big workgroup, a barrier, and a
       * branchstack which limits max_waves, we may end up unable to run all
       * waves of the workgroup concurrently, which would lead to a hang.
       *
       * TODO: Could we spill the branchstack, or is there another way around
       * this? The blob just explodes in such a case.
       */
      if (v->has_barrier && (max_waves < waves_per_wg)) {
         mesa_loge(
            "Compute shader (%s:%s) which has a workgroup barrier cannot be "
            "used because it's impossible to have enough concurrent waves.",
            v->shader->nir->info.name, v->shader->nir->info.label);
         exit(1);
      }
   }

   return max_waves;
}
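
/* For instance (illustrative numbers only): with branchstack_size = 64 and
 * wave_granularity = 2, a shader with v->branchstack = 16 is limited to
 * (64 / 16) * 2 = 8 waves by the branchstack alone.  If that shader were a
 * compute shader whose workgroup needs waves_per_wg = 12 waves and it
 * contains a barrier, the workgroup could never be fully resident and we
 * bail out above.
 */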

/* Get the maximum number of waves that could be launched limited by reg size.
 */
unsigned
ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                unsigned reg_count, bool double_threadsize)
{
   return reg_count ? (compiler->reg_size_vec4 /
                       (reg_count * (double_threadsize ? 2 : 1)) *
                       compiler->wave_granularity)
                    : compiler->max_waves;
}
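
/* E.g. (made-up numbers): with reg_size_vec4 = 96 and wave_granularity = 2,
 * a shader using 8 vec4 registers at double threadsize gets
 * 96 / (8 * 2) * 2 = 12 waves, while a shader using no registers at all is
 * bounded only by compiler->max_waves.
 */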

void
ir3_collect_info(struct ir3_shader_variant *v)
{
   struct ir3_info *info = &v->info;
   struct ir3 *shader = v->ir;
   const struct ir3_compiler *compiler = v->shader->compiler;

   memset(info, 0, sizeof(*info));
   info->data = v;
   info->max_reg = -1;
   info->max_half_reg = -1;
   info->max_const = -1;
   info->multi_dword_ldp_stp = false;

   uint32_t instr_count = 0;
   foreach_block (block, &shader->block_list) {
      foreach_instr (instr, &block->instr_list) {
         instr_count++;
      }
   }

   v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);

   /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
    * doesn't try to decode the following data as instructions (such as the
    * next stage's shader in turnip)
    */
   info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
   info->sizedwords = info->size / 4;
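
   /* The "* 8" reflects that each instruction is 64 bits (two dwords).  As
    * a sketch with an assumed instr_align of 16: a shader of 37 instructions
    * gets instrlen = 3, so size = MAX2(3 * 16, 37 + 4) * 8 = 384 bytes and
    * sizedwords = 96.
    */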

   bool in_preamble = false;

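   /* A rough model of the (ss)/(sy) stall accounting below (a heuristic,
    * not an exact cycle count): when an SFU or memory producer is seen we
    * record its soft delay, tick it down for each subsequent instruction,
    * and whatever remains when a consumer with the (ss)/(sy) bit arrives is
    * charged to sstall/systall as estimated wait cycles.
    */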
   foreach_block (block, &shader->block_list) {
      int sfu_delay = 0, mem_delay = 0;

      foreach_instr (instr, &block->instr_list) {

         foreach_src (reg, instr) {
            collect_reg_info(instr, reg, info);
         }

         foreach_dst (reg, instr) {
            if (is_dest_gpr(reg)) {
               collect_reg_info(instr, reg, info);
            }
         }

         if (instr->opc == OPC_STP || instr->opc == OPC_LDP) {
            unsigned components = instr->srcs[2]->uim_val;
            if (components * type_size(instr->cat6.type) > 32) {
               info->multi_dword_ldp_stp = true;
            }

            if (instr->opc == OPC_STP)
               info->stp_count += components;
            else
               info->ldp_count += components;
         }

         if ((instr->opc == OPC_BARY_F || instr->opc == OPC_FLAT_B) &&
             (instr->dsts[0]->flags & IR3_REG_EI))
            info->last_baryf = info->instrs_count;

         if (instr->opc == OPC_SHPS)
            in_preamble = true;

         /* Don't count instructions in the preamble for instruction-count type
          * stats, because their effect should be much smaller.
          * TODO: we should probably have separate stats for preamble
          * instructions, but that would blow up the amount of stats...
          */
         if (!in_preamble) {
            unsigned instrs_count = 1 + instr->repeat + instr->nop;
            unsigned nops_count = instr->nop;

            if (instr->opc == OPC_NOP) {
               nops_count = 1 + instr->repeat;
               info->instrs_per_cat[0] += nops_count;
            } else {
               info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
               info->instrs_per_cat[0] += nops_count;
            }

            if (instr->opc == OPC_MOV) {
               if (instr->cat1.src_type == instr->cat1.dst_type) {
                  info->mov_count += 1 + instr->repeat;
               } else {
                  info->cov_count += 1 + instr->repeat;
               }
            }

            info->instrs_count += instrs_count;
            info->nops_count += nops_count;

            if (instr->flags & IR3_INSTR_SS) {
               info->ss++;
               info->sstall += sfu_delay;
               sfu_delay = 0;
            }

            if (instr->flags & IR3_INSTR_SY) {
               info->sy++;
               info->systall += mem_delay;
               mem_delay = 0;
            }

            if (is_ss_producer(instr)) {
               sfu_delay = soft_ss_delay(instr);
            } else {
               int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
               sfu_delay -= n;
            }

            if (is_sy_producer(instr)) {
               mem_delay = soft_sy_delay(instr, shader);
            } else {
               int n = MIN2(mem_delay, 1 + instr->repeat + instr->nop);
               mem_delay -= n;
            }
         }

         if (instr->opc == OPC_SHPE)
            in_preamble = false;
      }
   }

   /* TODO: for a5xx and below, is there a separate regfile for
    * half-registers?
    */
   unsigned regs_count =
      info->max_reg + 1 +
      (compiler->gen >= 6 ? ((info->max_half_reg + 2) / 2) : 0);

   info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
   unsigned reg_independent_max_waves =
      ir3_get_reg_independent_max_waves(v, info->double_threadsize);
   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
      compiler, regs_count, info->double_threadsize);
   info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
   assert(info->max_waves <= v->shader->compiler->max_waves);
}

static struct ir3_register *
reg_create(struct ir3 *shader, int num, int flags)
{
   struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
   reg->wrmask = 1;
   reg->flags = flags;
   reg->num = num;
   return reg;
}

static void
insert_instr(struct ir3_block *block, struct ir3_instruction *instr)
{
   struct ir3 *shader = block->shader;

   instr->serialno = ++shader->instr_count;

   list_addtail(&instr->node, &block->instr_list);

   if (is_input(instr))
      array_insert(shader, shader->baryfs, instr);
}

struct ir3_block *
ir3_block_create(struct ir3 *shader)
{
   struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
#ifdef DEBUG
   block->serialno = ++shader->block_count;
#endif
   block->shader = shader;
   list_inithead(&block->node);
   list_inithead(&block->instr_list);
   return block;
}

void
ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   array_insert(block, block->predecessors, pred);
}

void
ir3_block_add_physical_predecessor(struct ir3_block *block,
                                   struct ir3_block *pred)
{
   array_insert(block, block->physical_predecessors, pred);
}

void
ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         if (i < block->predecessors_count - 1) {
            block->predecessors[i] =
               block->predecessors[block->predecessors_count - 1];
         }

         block->predecessors_count--;
         return;
      }
   }
}

void
ir3_block_remove_physical_predecessor(struct ir3_block *block,
                                      struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
      if (block->physical_predecessors[i] == pred) {
         if (i < block->physical_predecessors_count - 1) {
            block->physical_predecessors[i] =
               block->physical_predecessors[block->physical_predecessors_count - 1];
         }

         block->physical_predecessors_count--;
         return;
      }
   }
}

unsigned
ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         return i;
      }
   }

   unreachable("ir3_block_get_pred_index() invalid predecessor");
}

static struct ir3_instruction *
instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   /* Add extra sources for array destinations and the address reg */
   if (opc_cat(opc) >= 1)
      nsrc += 2;
   struct ir3_instruction *instr;
   unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
                 (nsrc * sizeof(instr->srcs[0]));
   char *ptr = ir3_alloc(block->shader, sz);

   instr = (struct ir3_instruction *)ptr;
   ptr += sizeof(*instr);
   instr->dsts = (struct ir3_register **)ptr;
   instr->srcs = instr->dsts + ndst;

#ifdef DEBUG
   instr->dsts_max = ndst;
   instr->srcs_max = nsrc;
#endif

   return instr;
}

struct ir3_instruction *
ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
   instr->block = block;
   instr->opc = opc;
   insert_instr(block, instr);
   return instr;
}
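
/* An illustrative sketch of building an instruction with these helpers
 * (opcode and register numbers are arbitrary examples):
 *
 *    struct ir3_instruction *add = ir3_instr_create(block, OPC_ADD_F, 1, 2);
 *    ir3_dst_create(add, regid(0, 0), 0);              // r0.x
 *    ir3_src_create(add, regid(1, 1), 0);              // r1.y
 *    ir3_src_create(add, regid(2, 2), IR3_REG_CONST);  // c2.z
 *
 * The ndst/nsrc arguments to ir3_instr_create() only size the trailing
 * register arrays; the *_create() calls then fill them in.
 */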

struct ir3_instruction *
ir3_instr_clone(struct ir3_instruction *instr)
{
   struct ir3_instruction *new_instr = instr_create(
      instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
   struct ir3_register **dsts, **srcs;

   dsts = new_instr->dsts;
   srcs = new_instr->srcs;
   *new_instr = *instr;
   new_instr->dsts = dsts;
   new_instr->srcs = srcs;

   insert_instr(instr->block, new_instr);

   /* clone registers: */
   new_instr->dsts_count = 0;
   new_instr->srcs_count = 0;
   foreach_dst (reg, instr) {
      struct ir3_register *new_reg =
         ir3_dst_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
      if (new_reg->instr)
         new_reg->instr = new_instr;
   }
   foreach_src (reg, instr) {
      struct ir3_register *new_reg =
         ir3_src_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
   }

   if (instr->address) {
      assert(instr->srcs_count > 0);
      new_instr->address = new_instr->srcs[instr->srcs_count - 1];
   }

   return new_instr;
}

/* Add a false dependency to an instruction, to ensure that the dependency
 * is scheduled before it:
 */
void
ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
{
   for (unsigned i = 0; i < instr->deps_count; i++) {
      if (instr->deps[i] == dep)
         return;
   }

   array_insert(instr, instr->deps, dep);
}
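
/* For example (a hypothetical use, mirroring how false deps are used for
 * memory ordering): if a load may alias an earlier store,
 *
 *    ir3_instr_add_dep(load, store);
 *
 * keeps the store scheduled before the load even though no SSA def/use
 * edge connects them.
 */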

struct ir3_register *
ir3_src_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#ifdef DEBUG
   debug_assert(instr->srcs_count < instr->srcs_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->srcs[instr->srcs_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#ifdef DEBUG
   debug_assert(instr->dsts_count < instr->dsts_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->dsts[instr->dsts_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
{
   struct ir3_register *new_reg = reg_create(shader, 0, 0);
   *new_reg = *reg;
   return new_reg;
}

void
ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
                       struct ir3_register *last_write)
{
   assert(reg->flags & IR3_REG_ARRAY);
   struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
   *new_reg = *reg;
   new_reg->def = last_write;
   ir3_reg_tie(reg, new_reg);
}

void
ir3_instr_set_address(struct ir3_instruction *instr,
                      struct ir3_instruction *addr)
{
   if (!instr->address) {
      struct ir3 *ir = instr->block->shader;

      debug_assert(instr->block == addr->block);

      instr->address =
         ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
      instr->address->def = addr->dsts[0];
      debug_assert(reg_num(addr->dsts[0]) == REG_A0);
      unsigned comp = reg_comp(addr->dsts[0]);
      if (comp == 0) {
         array_insert(ir, ir->a0_users, instr);
      } else {
         debug_assert(comp == 1);
         array_insert(ir, ir->a1_users, instr);
      }
   } else {
      debug_assert(instr->address->def->instr == addr);
   }
}

void
ir3_block_clear_mark(struct ir3_block *block)
{
   foreach_instr (instr, &block->instr_list)
      instr->flags &= ~IR3_INSTR_MARK;
}

void
ir3_clear_mark(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      ir3_block_clear_mark(block);
   }
}

unsigned
ir3_count_instructions(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt;
   }
   return cnt;
}

/* When counting instructions for RA, we insert extra fake instructions at the
 * beginning of each block, where values become live, and at the end where
 * values die. This prevents values that are live-in at the beginning or
 * live-out at the end of a block from being treated as if they were
 * live-in/live-out at the first/last instruction, which would be incorrect.
 * In ir3_legalize these ips are assumed to be the actual ips of the final
 * program, so it would be incorrect to use this everywhere.
 */

unsigned
ir3_count_instructions_ra(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt++;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt++;
   }
   return cnt;
}
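
/* To make the difference concrete: for a single block with two
 * instructions, ir3_count_instructions() assigns ips 1 and 2 with
 * start_ip = 1 and end_ip = 3, whereas ir3_count_instructions_ra()
 * reserves ip 1 for the fake block start, assigns ips 2 and 3 to the
 * instructions, and uses ip 4 as the fake block end.
 */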

struct ir3_array *
ir3_lookup_array(struct ir3 *ir, unsigned id)
{
   foreach_array (arr, &ir->array_list)
      if (arr->id == id)
         return arr;
   return NULL;
}

void
ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
{
   /* We could do this in a single pass if we could assume that instructions
    * are always sorted, which currently isn't always true (in particular
    * after the ir3_group pass, but maybe other places too).
    */
   foreach_block (block, &ir->block_list)
      foreach_instr (instr, &block->instr_list)
         instr->uses = NULL;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         foreach_ssa_src_n (src, n, instr) {
            if (__is_false_dep(instr, n) && !falsedeps)
               continue;
            if (!src->uses)
               src->uses = _mesa_pointer_set_create(mem_ctx);
            _mesa_set_add(src->uses, instr);
         }
      }
   }
}

/**
 * Set the destination type of an instruction, for example if a
 * conversion is folded in, handling the special cases where the
 * instruction's dest type or opcode needs to be fixed up.
 */
void
ir3_set_dst_type(struct ir3_instruction *instr, bool half)
{
   if (half) {
      instr->dsts[0]->flags |= IR3_REG_HALF;
   } else {
      instr->dsts[0]->flags &= ~IR3_REG_HALF;
   }

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (half) {
         instr->cat1.dst_type = half_type(instr->cat1.dst_type);
      } else {
         instr->cat1.dst_type = full_type(instr->cat1.dst_type);
      }
      break;
   case 4:
      if (half) {
         instr->opc = cat4_half_opc(instr->opc);
      } else {
         instr->opc = cat4_full_opc(instr->opc);
      }
      break;
   case 5:
      if (half) {
         instr->cat5.type = half_type(instr->cat5.type);
      } else {
         instr->cat5.type = full_type(instr->cat5.type);
      }
      break;
   }
}
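
/* As a concrete illustration (assuming the usual type helpers): calling
 * ir3_set_dst_type(mov, true) on a cat1 mov with dst_type TYPE_F32 sets
 * IR3_REG_HALF on the dst and rewrites dst_type to TYPE_F16, while for a
 * cat4 instruction the opcode itself is swapped via cat4_half_opc().
 */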

/**
 * One-time fixup for instruction src-types.  Other than cov's that
 * are folded, an instruction's src type does not change.
 */
void
ir3_fixup_src_type(struct ir3_instruction *instr)
{
   if (instr->srcs_count == 0)
      return;

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->cat1.src_type = half_type(instr->cat1.src_type);
      } else {
         instr->cat1.src_type = full_type(instr->cat1.src_type);
      }
      break;
   case 3:
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->opc = cat3_half_opc(instr->opc);
      } else {
         instr->opc = cat3_full_opc(instr->opc);
      }
      break;
   }
}

/**
 * Map a floating-point immed to a FLUT (float lookup table) value;
 * returns negative for immediates that cannot be mapped.
 */
int
ir3_flut(struct ir3_register *src_reg)
{
   static const struct {
      uint32_t f32;
      uint16_t f16;
   } flut[] = {
         { .f32 = 0x00000000, .f16 = 0x0000 },    /* 0.0 */
         { .f32 = 0x3f000000, .f16 = 0x3800 },    /* 0.5 */
         { .f32 = 0x3f800000, .f16 = 0x3c00 },    /* 1.0 */
         { .f32 = 0x40000000, .f16 = 0x4000 },    /* 2.0 */
         { .f32 = 0x402df854, .f16 = 0x4170 },    /* e */
         { .f32 = 0x40490fdb, .f16 = 0x4248 },    /* pi */
         { .f32 = 0x3ea2f983, .f16 = 0x3518 },    /* 1/pi */
         { .f32 = 0x3f317218, .f16 = 0x398c },    /* 1/log2(e) */
         { .f32 = 0x3fb8aa3b, .f16 = 0x3dc5 },    /* log2(e) */
         { .f32 = 0x3e9a209b, .f16 = 0x34d1 },    /* 1/log2(10) */
         { .f32 = 0x40549a78, .f16 = 0x42a5 },    /* log2(10) */
         { .f32 = 0x40800000, .f16 = 0x4400 },    /* 4.0 */
   };

   if (src_reg->flags & IR3_REG_HALF) {
      /* Note that half-float immeds are already lowered to 16b in nir: */
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f16 == imm) {
            return i;
         }
      }
   } else {
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f32 == imm) {
            return i;
         }
      }
   }

   return -1;
}
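
/* For example, an immediate with uim_val == 0x3f800000 (1.0f) maps to FLUT
 * index 2, while 0x3f800001 returns -1 and must be encoded as an ordinary
 * immediate.  The FLUT exists so that common float constants can be encoded
 * compactly in the instruction word.
 */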

static unsigned
cp_flags(unsigned flags)
{
   /* only considering these flags (at least for now): */
   flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
             IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV |
             IR3_REG_SHARED);
   return flags;
}

bool
ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
{
   struct ir3_compiler *compiler = instr->block->shader->compiler;
   unsigned valid_flags;

   if ((flags & IR3_REG_SHARED) && opc_cat(instr->opc) > 3)
      return false;

   flags = cp_flags(flags);

   /* If the destination is indirect, then the source cannot be...
    * at least I don't think so.
    */
   if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
       (flags & IR3_REG_RELATIV))
      return false;

   if (flags & IR3_REG_RELATIV) {
      /* TODO: need to test on earlier gens.. pretty sure the earlier
       * problem was just that we didn't check that the src was from the
       * same block (since we can't currently propagate address register
       * values across blocks)
       */
      if (compiler->gen < 6)
         return false;

      /* NOTE in the special try_swap_mad_two_srcs() case we can be
       * called on a src that has already had an indirect load folded
       * in, in which case ssa() returns NULL
       */
      if (instr->srcs[n]->flags & IR3_REG_SSA) {
         struct ir3_instruction *src = ssa(instr->srcs[n]);
         if (src->address->def->instr->block != instr->block)
            return false;
      }
   }

   if (is_meta(instr)) {
      /* collect and phi nodes support const/immed sources, which will be
       * turned into move instructions, but not anything else.
       */
      if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
         return false;

      if ((flags & IR3_REG_SHARED) && !(instr->dsts[0]->flags & IR3_REG_SHARED))
         return false;

      return true;
   }

   switch (opc_cat(instr->opc)) {
   case 0: /* end, chmask */
      return flags == 0;
   case 1:
      switch (instr->opc) {
      case OPC_MOVMSK:
      case OPC_SWZ:
      case OPC_SCT:
      case OPC_GAT:
         valid_flags = IR3_REG_SHARED;
         break;
      case OPC_SCAN_MACRO:
         return flags == 0;
      default:
         valid_flags =
            IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;
      }
      if (flags & ~valid_flags)
         return false;
      break;
   case 2:
      valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
                    IR3_REG_RELATIV | IR3_REG_IMMED | IR3_REG_SHARED;

      if (flags & ~valid_flags)
         return false;

      /* Allow an immediate src1 for flat.b, since it's ignored */
      if (instr->opc == OPC_FLAT_B &&
          n == 1 && flags == IR3_REG_IMMED)
         return true;

      if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
         unsigned m = n ^ 1;
         /* cannot deal w/ const or shared in both srcs:
          * (note that some cat2 actually only have a single src)
          */
         if (m < instr->srcs_count) {
            struct ir3_register *reg = instr->srcs[m];
            if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
                (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
               return false;
            if ((flags & IR3_REG_IMMED) && (reg->flags & IR3_REG_IMMED))
               return false;
         }
      }
      break;
   case 3:
      valid_flags =
         ir3_cat3_absneg(instr->opc) | IR3_REG_RELATIV | IR3_REG_SHARED;

      switch (instr->opc) {
      case OPC_SHRM:
      case OPC_SHLM:
      case OPC_SHRG:
      case OPC_SHLG:
      case OPC_ANDG: {
         valid_flags |= IR3_REG_IMMED;
         /* Can be RELATIV+CONST but not CONST: */
         if (flags & IR3_REG_RELATIV)
            valid_flags |= IR3_REG_CONST;
         break;
      }
      case OPC_WMM:
      case OPC_WMM_ACCU: {
         valid_flags = IR3_REG_SHARED;
         if (n == 2)
            valid_flags = IR3_REG_CONST;
         break;
      }
      case OPC_DP2ACC:
      case OPC_DP4ACC:
         break;
      default:
         valid_flags |= IR3_REG_CONST;
      }

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_SHARED | IR3_REG_RELATIV)) {
         /* cannot deal w/ const/shared/relativ in 2nd src: */
         if (n == 1)
            return false;
      }

      break;
   case 4:
      /* The blob compiler seems to avoid const as src..
       * TODO: double-check whether this is still the case on a4xx.
       */
      if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
         return false;
      if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
         return false;
      break;
   case 5:
      /* no flags allowed */
      if (flags)
         return false;
      break;
   case 6:
      valid_flags = IR3_REG_IMMED;
      if (flags & ~valid_flags)
         return false;

      if (flags & IR3_REG_IMMED) {
         /* it doesn't seem like we can have an immediate src for store
          * instructions:
          *
          * TODO: this restriction could also apply to load instructions,
          * but for load instructions this arg is the address (and there's
          * no obvious way to test a hard-coded immed addr src)
          */
         if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
            return false;

         if ((instr->opc == OPC_LDL) && (n == 0))
            return false;

         if ((instr->opc == OPC_STL) && (n != 2))
            return false;

         if ((instr->opc == OPC_LDP) && (n == 0))
            return false;

         if ((instr->opc == OPC_STP) && (n != 2))
            return false;

         if (instr->opc == OPC_STLW && n == 0)
            return false;

         if (instr->opc == OPC_LDLW && n == 0)
            return false;

         /* disallow immediates in anything but the SSBO slot argument for
          * cat6 instructions:
          */
         if (is_global_a3xx_atomic(instr->opc) && (n != 0))
            return false;

         if (is_local_atomic(instr->opc) || is_global_a6xx_atomic(instr->opc) ||
             is_bindless_atomic(instr->opc))
            return false;

         if (instr->opc == OPC_STG && (n == 2))
            return false;

         if (instr->opc == OPC_STG_A && (n == 4))
            return false;

         if (instr->opc == OPC_LDG && (n == 0))
            return false;

         if (instr->opc == OPC_LDG_A && (n < 2))
            return false;

         /* as with atomics, these cat6 instrs can only have an immediate
          * for SSBO/IBO slot argument
          */
         switch (instr->opc) {
         case OPC_LDIB:
         case OPC_STIB:
         case OPC_RESINFO:
            if (n != 0)
               return false;
            break;
         default:
            break;
         }
      }

      break;
   }

   return true;
}
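
/* A couple of illustrative queries (hypothetical instructions): for a cat2
 * add.f, ir3_valid_flags(add, 0, IR3_REG_CONST) is true, but becomes false
 * if the other src is already const/shared, since cat2 can only fetch one
 * const-bus operand.  For any cat5 (texture) instruction, every nonzero
 * flag combination is rejected.
 */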

bool
ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed)
{
   if (instr->opc == OPC_MOV || is_meta(instr))
      return true;

   if (is_mem(instr)) {
      switch (instr->opc) {
      /* Some load/store instructions have a 13-bit offset and size which must
       * always be an immediate and the rest of the sources cannot be
       * immediates, so the frontend is responsible for checking the size:
       */
      case OPC_LDL:
      case OPC_STL:
      case OPC_LDP:
      case OPC_STP:
      case OPC_LDG:
      case OPC_STG:
      case OPC_SPILL_MACRO:
      case OPC_RELOAD_MACRO:
      case OPC_LDG_A:
      case OPC_STG_A:
      case OPC_LDLW:
      case OPC_STLW:
      case OPC_LDLV:
         return true;
      default:
         /* most cat6 src immediates can only encode 8 bits: */
         return !(immed & ~0xff);
      }
   }

   /* Other than cat1 (mov) we can only encode up to 10 bits, sign-extended: */
   return !(immed & ~0x1ff) || !(-immed & ~0x1ff);
}
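
/* The final check accepts exactly the range [-511, 511]: either the value
 * itself or its negation must fit in the low 9 bits, with the tenth bit
 * acting as sign.  So 511 and -511 are encodable here, while 512 (and,
 * conservatively, -512) are not.
 */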