1 /*
2  * Copyright (C) 2019 Google, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors:
24  *    Rob Clark <robclark@freedesktop.org>
25  */
26 
27 #include "ir3.h"
28 
29 /* The maximum number of nop's we may need to insert between two instructions.
30  */
31 #define MAX_NOPS 6
32 
/* The soft delay for approximating the cost of (ss). On a6xx, the number of
 * delay slots it takes to get an SFU result back (ie. using nop's instead of
 * (ss)) is:
36  *
37  *     8 - single warp
38  *     9 - two warps
39  *    10 - four warps
40  *
41  * and so on. Not quite sure where it tapers out (ie. how many warps share an
42  * SFU unit). But 10 seems like a reasonable # to choose:
43  */
44 #define SOFT_SS_NOPS 10
45 
46 /*
47  * Helpers to figure out the necessary delay slots between instructions.  Used
48  * both in scheduling pass(es) and the final pass to insert any required nop's
49  * so that the shader program is valid.
50  *
51  * Note that this needs to work both pre and post RA, so we can't assume ssa
52  * src iterators work.
53  */
54 
55 /* calculate required # of delay slots between the instruction that
56  * assigns a value and the one that consumes
57  */
58 int
ir3_delayslots(struct ir3_instruction * assigner,struct ir3_instruction * consumer,unsigned n,bool soft)59 ir3_delayslots(struct ir3_instruction *assigner,
60                struct ir3_instruction *consumer, unsigned n, bool soft)
61 {
62    /* generally don't count false dependencies, since this can just be
63     * something like a barrier, or SSBO store.
64     */
65    if (__is_false_dep(consumer, n))
66       return 0;
67 
68    /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
69     * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
70     * handled with sync bits
71     */
72 
73    if (is_meta(assigner) || is_meta(consumer))
74       return 0;
75 
76    if (writes_addr0(assigner) || writes_addr1(assigner))
77       return 6;
78 
79    if (soft && is_sfu(assigner))
80       return SOFT_SS_NOPS;
81 
82    /* handled via sync flags: */
83    if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
84       return 0;
85 
86    /* As far as we know, shader outputs don't need any delay. */
87    if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
88       return 0;
89 
90    /* assigner must be alu: */
91    if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
92        is_mem(consumer) || (assigner->dsts[0]->flags & IR3_REG_SHARED)) {
93       return 6;
94    } else {
95       /* In mergedregs mode, there is an extra 2-cycle penalty when half of
96        * a full-reg is read as a half-reg or when a half-reg is read as a
97        * full-reg.
98        */
99       bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
100                              (consumer->srcs[n]->flags & IR3_REG_HALF);
101       unsigned penalty = mismatched_half ? 3 : 0;
102       if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 2)) {
103          /* special case, 3rd src to cat3 not required on first cycle */
104          return 1 + penalty;
105       } else {
106          return 3 + penalty;
107       }
108    }
109 }
110 
111 static bool
count_instruction(struct ir3_instruction * n)112 count_instruction(struct ir3_instruction *n)
113 {
114    /* NOTE: don't count branch/jump since we don't know yet if they will
115     * be eliminated later in resolve_jumps().. really should do that
116     * earlier so we don't have this constraint.
117     */
118    return is_alu(n) ||
119           (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
120 }
121 
122 static unsigned
distance(struct ir3_block * block,struct ir3_instruction * instr,unsigned maxd)123 distance(struct ir3_block *block, struct ir3_instruction *instr, unsigned maxd)
124 {
125    unsigned d = 0;
126 
127    /* Note that this relies on incrementally building up the block's
128     * instruction list.. but this is how scheduling and nopsched
129     * work.
130     */
131    foreach_instr_rev (n, &block->instr_list) {
132       if ((n == instr) || (d >= maxd))
133          return MIN2(maxd, d + n->nop);
134       if (count_instruction(n))
135          d = MIN2(maxd, d + 1 + n->repeat + n->nop);
136    }
137 
138    return maxd;
139 }
140 
141 static unsigned
delay_calc_srcn_prera(struct ir3_block * block,struct ir3_instruction * assigner,struct ir3_instruction * consumer,unsigned srcn)142 delay_calc_srcn_prera(struct ir3_block *block, struct ir3_instruction *assigner,
143                       struct ir3_instruction *consumer, unsigned srcn)
144 {
145    unsigned delay = 0;
146 
147    if (assigner->opc == OPC_META_PHI)
148       return 0;
149 
150    if (is_meta(assigner)) {
151       foreach_src_n (src, n, assigner) {
152          unsigned d;
153 
154          if (!src->def)
155             continue;
156 
157          d = delay_calc_srcn_prera(block, src->def->instr, consumer, srcn);
158          delay = MAX2(delay, d);
159       }
160    } else {
161       delay = ir3_delayslots(assigner, consumer, srcn, false);
162       delay -= distance(block, assigner, delay);
163    }
164 
165    return delay;
166 }
167 
168 /**
169  * Calculate delay for instruction before register allocation, using SSA
170  * source pointers. This can't handle inter-block dependencies.
171  */
172 unsigned
ir3_delay_calc_prera(struct ir3_block * block,struct ir3_instruction * instr)173 ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr)
174 {
175    unsigned delay = 0;
176 
177    foreach_src_n (src, i, instr) {
178       unsigned d = 0;
179 
180       if (src->def && src->def->instr->block == block) {
181          d = delay_calc_srcn_prera(block, src->def->instr, instr, i);
182       }
183 
184       delay = MAX2(delay, d);
185    }
186 
187    return delay;
188 }
189 
190 /* Post-RA, we don't have arrays any more, so we have to be a bit careful here
191  * and have to handle relative accesses specially.
192  */
193 
194 static unsigned
post_ra_reg_elems(struct ir3_register * reg)195 post_ra_reg_elems(struct ir3_register *reg)
196 {
197    if (reg->flags & IR3_REG_RELATIV)
198       return reg->size;
199    return reg_elems(reg);
200 }
201 
202 static unsigned
post_ra_reg_num(struct ir3_register * reg)203 post_ra_reg_num(struct ir3_register *reg)
204 {
205    if (reg->flags & IR3_REG_RELATIV)
206       return reg->array.base;
207    return reg->num;
208 }
209 
/* Post-RA delay between assigner's dst (index assigner_n) and consumer's
 * src (index consumer_n), based on whether the physical registers actually
 * overlap.  Returns 0 when they don't interfere.  Takes (rpt) into account
 * where possible, since a repeated assigner/consumer writes/reads its
 * components on successive cycles.
 */
static unsigned
delay_calc_srcn_postra(struct ir3_instruction *assigner,
                       struct ir3_instruction *consumer, unsigned assigner_n,
                       unsigned consumer_n, bool soft, bool mergedregs)
{
   struct ir3_register *src = consumer->srcs[consumer_n];
   struct ir3_register *dst = assigner->dsts[assigner_n];
   bool mismatched_half =
      (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);

   /* In the mergedregs case or when the register is a special register,
    * half-registers do not alias with full registers.
    */
   if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
       mismatched_half)
      return 0;

   /* Compute the byte ranges covered by src and dst to test for overlap: */
   unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
   unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
   unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
   unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);

   if (dst_start >= src_end || src_start >= dst_end)
      return 0;

   unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, soft);

   /* Without (rpt) on either side there is nothing further to refine: */
   if (assigner->repeat == 0 && consumer->repeat == 0)
      return delay;

   /* If either side is a relative access, we can't really apply most of the
    * reasoning below because we don't know which component aliases which.
    * Just bail in this case.
    */
   if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
      return delay;

   /* MOVMSK seems to require that all users wait until the entire
    * instruction is finished, so just bail here.
    */
   if (assigner->opc == OPC_MOVMSK)
      return delay;

   /* TODO: Handle the combination of (rpt) and different component sizes
    * better like below. This complicates things significantly because the
    * components don't line up.
    */
   if (mismatched_half)
      return delay;

   /* If an instruction has a (rpt), then it acts as a sequence of
    * instructions, reading its non-(r) sources at each cycle. First, get the
    * register num for the first instruction where they interfere:
    */

   unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);

   /* Now, for that first conflicting half/full register, figure out the
    * sub-instruction within assigner/consumer it corresponds to. For (r)
    * sources, this should already return the correct answer of 0. However we
    * have to special-case the multi-mov instructions, where the
    * sub-instructions sometimes come from the src/dst indices instead.
    */
   unsigned first_src_instr;
   if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
      first_src_instr = consumer_n;
   else
      first_src_instr = first_num - src->num;

   unsigned first_dst_instr;
   if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
      first_dst_instr = assigner_n;
   else
      first_dst_instr = first_num - dst->num;

   /* The delay we return is relative to the *end* of assigner and the
    * *beginning* of consumer, because it's the number of nops (or other
    * things) needed between them. Any instructions after first_dst_instr
    * subtract from the delay, and so do any instructions before
    * first_src_instr. Calculate an offset to subtract from the non-rpt-aware
    * delay to account for that.
    *
    * Now, a priori, we need to go through this process for every
    * conflicting regnum and take the minimum of the offsets to make sure
    * that the appropriate number of nop's is inserted for every conflicting
    * pair of sub-instructions. However, as we go to the next conflicting
    * regnum (if any), the number of instructions after first_dst_instr
    * decreases by 1 and the number of source instructions before
    * first_src_instr correspondingly increases by 1, so the offset stays the
    * same for all conflicting registers.
    */
   unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
   return offset > delay ? 0 : delay - offset;
}
304 
305 static unsigned
delay_calc_postra(struct ir3_block * block,struct ir3_instruction * start,struct ir3_instruction * consumer,unsigned distance,bool soft,bool pred,bool mergedregs)306 delay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,
307                   struct ir3_instruction *consumer, unsigned distance,
308                   bool soft, bool pred, bool mergedregs)
309 {
310    unsigned delay = 0;
311    /* Search backwards starting at the instruction before start, unless it's
312     * NULL then search backwards from the block end.
313     */
314    struct list_head *start_list =
315       start ? start->node.prev : block->instr_list.prev;
316    list_for_each_entry_from_rev (struct ir3_instruction, assigner, start_list,
317                                  &block->instr_list, node) {
318       if (count_instruction(assigner))
319          distance += assigner->nop;
320 
321       if (distance + delay >= (soft ? SOFT_SS_NOPS : MAX_NOPS))
322          return delay;
323 
324       if (is_meta(assigner))
325          continue;
326 
327       unsigned new_delay = 0;
328 
329       foreach_dst_n (dst, dst_n, assigner) {
330          if (dst->wrmask == 0)
331             continue;
332          foreach_src_n (src, src_n, consumer) {
333             if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
334                continue;
335 
336             unsigned src_delay = delay_calc_srcn_postra(
337                assigner, consumer, dst_n, src_n, soft, mergedregs);
338             new_delay = MAX2(new_delay, src_delay);
339          }
340       }
341 
342       new_delay = new_delay > distance ? new_delay - distance : 0;
343       delay = MAX2(delay, new_delay);
344 
345       if (count_instruction(assigner))
346          distance += 1 + assigner->repeat;
347    }
348 
349    /* Note: this allows recursion into "block" if it has already been
350     * visited, but *not* recursion into its predecessors. We may have to
351     * visit the original block twice, for the loop case where we have to
352     * consider definititons in an earlier iterations of the same loop:
353     *
354     * while (...) {
355     *		mov.u32u32 ..., r0.x
356     *		...
357     *		mov.u32u32 r0.x, ...
358     * }
359     *
360     * However any other recursion would be unnecessary.
361     */
362 
363    if (pred && block->data != block) {
364       block->data = block;
365 
366       for (unsigned i = 0; i < block->predecessors_count; i++) {
367          struct ir3_block *pred = block->predecessors[i];
368          unsigned pred_delay = delay_calc_postra(pred, NULL, consumer, distance,
369                                                  soft, pred, mergedregs);
370          delay = MAX2(delay, pred_delay);
371       }
372 
373       block->data = NULL;
374    }
375 
376    return delay;
377 }
378 
/**
 * Calculate delay for post-RA scheduling based on physical registers but not
 * exact (i.e. don't recurse into predecessors, and make it possible to
 * estimate impact of sync flags).
 *
 * @soft:  If true, add additional delay for situations where they
 *    would not be strictly required because a sync flag would be
 *    used (but scheduler would prefer to schedule some other
 *    instructions first to avoid stalling on sync flag)
 * @mergedregs: True if mergedregs is enabled.
 */
unsigned
ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,
                      bool soft, bool mergedregs)
{
   /* pred=false: estimate only, no predecessor-block recursion. */
   return delay_calc_postra(block, NULL, instr, 0, soft, false, mergedregs);
}
396 
/**
 * Calculate delay for nop insertion. This must exactly match hardware
 * requirements, including recursing into predecessor blocks.
 */
unsigned
ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,
                     bool mergedregs)
{
   /* soft=false: exact hw requirement; pred=true: walk predecessors. */
   return delay_calc_postra(block, NULL, instr, 0, false, true, mergedregs);
}
407 
408 /**
409  * Remove nop instructions.  The scheduler can insert placeholder nop's
410  * so that ir3_delay_calc() can account for nop's that won't be needed
411  * due to nop's triggered by a previous instruction.  However, before
412  * legalize, we want to remove these.  The legalize pass can insert
413  * some nop's if needed to hold (for example) sync flags.  This final
414  * remaining nops are inserted by legalize after this.
415  */
416 void
ir3_remove_nops(struct ir3 * ir)417 ir3_remove_nops(struct ir3 *ir)
418 {
419    foreach_block (block, &ir->block_list) {
420       foreach_instr_safe (instr, &block->instr_list) {
421          if (instr->opc == OPC_NOP) {
422             list_del(&instr->node);
423          }
424       }
425    }
426 }
427