1 /*
2  * Copyright © 2017 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file brw_fs_bank_conflicts.cpp
25  *
26  * This file contains a GRF bank conflict mitigation pass.  The pass is
27  * intended to be run after register allocation and works by rearranging the
28  * layout of the GRF space (without altering the semantics of the program) in
29  * a way that minimizes the number of GRF bank conflicts incurred by ternary
30  * instructions.
31  *
32  * Unfortunately there is close to no information about bank conflicts in the
33  * hardware spec, but experimentally on Gfx7-Gfx9 ternary instructions seem to
34  * incur an average bank conflict penalty of one cycle per SIMD8 op whenever
35  * the second and third source are stored in the same GRF bank (\sa bank_of()
36  * for the exact bank layout) which cannot be fetched during the same cycle by
37  * the EU, unless the EU logic manages to optimize out the read cycle of a
38  * duplicate source register (\sa is_conflict_optimized_out()).
39  *
40  * The asymptotic run-time of the algorithm is dominated by the
41  * shader_conflict_weight_matrix() computation below, which is O(n) on the
42  * number of instructions in the program, however for small and medium-sized
43  * programs the run-time is likely to be dominated by
44  * optimize_reg_permutation() which is O(m^3) on the number of GRF atoms of
45  * the program (\sa partitioning), which is bounded (since the program uses a
46  * bounded number of registers post-regalloc) and of the order of 100.  For
47  * that reason optimize_reg_permutation() is vectorized in order to keep the
48  * cubic term within reasonable bounds for m close to its theoretical maximum.
49  */
50 
51 #include "brw_fs.h"
52 #include "brw_cfg.h"
53 
54 #ifdef __SSE2__
55 
56 #include <emmintrin.h>
57 
58 /**
59  * Thin layer around vector intrinsics so they can be easily replaced with
60  * e.g. the fall-back scalar path, an implementation with different vector
61  * width or using different SIMD architectures (AVX-512?!).
62  *
63  * This implementation operates on pairs of independent SSE2 integer vectors à
64  * la SIMD16 for somewhat improved throughput.  SSE2 is supported by virtually
65  * all platforms that care about bank conflicts, so this path should almost
66  * always be available in practice.
67  */
68 namespace {
69    /**
70     * SIMD integer vector data type.
71     */
72    struct vector_type {
73       __m128i v[2];
74    };
75 
76    /**
77     * Scalar data type matching the representation of a single component of \p
78     * vector_type.
79     */
80    typedef int16_t scalar_type;
81 
82    /**
83     * Maximum integer value representable as a \p scalar_type.
84     */
85    const scalar_type max_scalar = INT16_MAX;
86 
87    /**
88     * Number of components of a \p vector_type.
89     */
90    const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type);
91 
92    /**
93     * Set the i-th component of vector \p v to \p x.
94     */
95    void
set(vector_type & v,unsigned i,scalar_type x)96    set(vector_type &v, unsigned i, scalar_type x)
97    {
98       assert(i < vector_width);
99       memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x));
100    }
101 
102    /**
103     * Get the i-th component of vector \p v.
104     */
105    scalar_type
get(const vector_type & v,unsigned i)106    get(const vector_type &v, unsigned i)
107    {
108       assert(i < vector_width);
109       scalar_type x;
110       memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x));
111       return x;
112    }
113 
114    /**
115     * Add two vectors with saturation.
116     */
117    vector_type
adds(const vector_type & v,const vector_type & w)118    adds(const vector_type &v, const vector_type &w)
119    {
120       const vector_type u = {{
121             _mm_adds_epi16(v.v[0], w.v[0]),
122             _mm_adds_epi16(v.v[1], w.v[1])
123          }};
124       return u;
125    }
126 
127    /**
128     * Subtract two vectors with saturation.
129     */
130    vector_type
subs(const vector_type & v,const vector_type & w)131    subs(const vector_type &v, const vector_type &w)
132    {
133       const vector_type u = {{
134             _mm_subs_epi16(v.v[0], w.v[0]),
135             _mm_subs_epi16(v.v[1], w.v[1])
136          }};
137       return u;
138    }
139 
140    /**
141     * Compute the bitwise conjunction of two vectors.
142     */
143    vector_type
mask(const vector_type & v,const vector_type & w)144    mask(const vector_type &v, const vector_type &w)
145    {
146       const vector_type u = {{
147             _mm_and_si128(v.v[0], w.v[0]),
148             _mm_and_si128(v.v[1], w.v[1])
149          }};
150       return u;
151    }
152 
153    /**
154     * Reduce the components of a vector using saturating addition.
155     */
156    scalar_type
sums(const vector_type & v)157    sums(const vector_type &v)
158    {
159       const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]);
160       const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e));
161       const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1));
162       const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1));
163       return _mm_extract_epi16(v1, 0);
164    }
165 }
166 
167 #else
168 
169 /**
170  * Thin layer around vector intrinsics so they can be easily replaced with
171  * e.g. the fall-back scalar path, an implementation with different vector
172  * width or using different SIMD architectures (AVX-512?!).
173  *
174  * This implementation operates on scalar values and doesn't rely on
175  * any vector extensions.  This is mainly intended for debugging and
176  * to keep this file building on exotic platforms.
177  */
178 namespace {
179    /**
180     * SIMD integer vector data type.
181     */
182    typedef int16_t vector_type;
183 
184    /**
185     * Scalar data type matching the representation of a single component of \p
186     * vector_type.
187     */
188    typedef int16_t scalar_type;
189 
190    /**
191     * Maximum integer value representable as a \p scalar_type.
192     */
193    const scalar_type max_scalar = INT16_MAX;
194 
195    /**
196     * Number of components of a \p vector_type.
197     */
198    const unsigned vector_width = 1;
199 
200    /**
201     * Set the i-th component of vector \p v to \p x.
202     */
203    void
set(vector_type & v,unsigned i,scalar_type x)204    set(vector_type &v, unsigned i, scalar_type x)
205    {
206       assert(i < vector_width);
207       v = x;
208    }
209 
210    /**
211     * Get the i-th component of vector \p v.
212     */
213    scalar_type
get(const vector_type & v,unsigned i)214    get(const vector_type &v, unsigned i)
215    {
216       assert(i < vector_width);
217       return v;
218    }
219 
220    /**
221     * Add two vectors with saturation.
222     */
223    vector_type
adds(vector_type v,vector_type w)224    adds(vector_type v, vector_type w)
225    {
226       return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w));
227    }
228 
229    /**
230     * Substract two vectors with saturation.
231     */
232    vector_type
subs(vector_type v,vector_type w)233    subs(vector_type v, vector_type w)
234    {
235       return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w));
236    }
237 
238    /**
239     * Compute the bitwise conjunction of two vectors.
240     */
241    vector_type
mask(vector_type v,vector_type w)242    mask(vector_type v, vector_type w)
243    {
244       return v & w;
245    }
246 
247    /**
248     * Reduce the components of a vector using saturating addition.
249     */
250    scalar_type
sums(vector_type v)251    sums(vector_type v)
252    {
253       return v;
254    }
255 }
256 
257 #endif
258 
259 /**
260  * Swap \p x and \p y.
261  */
262 #define SWAP(x, y) do {                          \
263       __typeof(y) _swap_tmp = y;                 \
264       y = x;                                     \
265       x = _swap_tmp;                             \
266    } while (0)
267 
268 namespace {
269    /**
270     * Variable-length vector type intended to represent cycle-count costs for
271     * arbitrary atom-to-bank assignments.  It's indexed by a pair of integers
272     * (i, p), where i is an atom index and p in {0, 1} indicates the parity of
273     * the conflict (respectively, whether the cost is incurred whenever the
274     * atoms are assigned the same bank b or opposite-parity banks b and b^1).
275     * \sa shader_conflict_weight_matrix()
276     */
277    struct weight_vector_type {
weight_vector_type__anonf3320bf40311::weight_vector_type278       weight_vector_type() : v(NULL), size(0) {}
279 
weight_vector_type__anonf3320bf40311::weight_vector_type280       weight_vector_type(unsigned n) : v(alloc(n)), size(n) {}
281 
weight_vector_type__anonf3320bf40311::weight_vector_type282       weight_vector_type(const weight_vector_type &u) :
283          v(alloc(u.size)), size(u.size)
284       {
285          memcpy(v, u.v,
286                 DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type));
287       }
288 
~weight_vector_type__anonf3320bf40311::weight_vector_type289       ~weight_vector_type()
290       {
291          free(v);
292       }
293 
294       weight_vector_type &
operator =__anonf3320bf40311::weight_vector_type295       operator=(weight_vector_type u)
296       {
297          SWAP(v, u.v);
298          SWAP(size, u.size);
299          return *this;
300       }
301 
302       vector_type *v;
303       unsigned size;
304 
305    private:
306       static vector_type *
alloc__anonf3320bf40311::weight_vector_type307       alloc(unsigned n)
308       {
309          const unsigned align = MAX2(sizeof(void *), __alignof__(vector_type));
310          const unsigned size = DIV_ROUND_UP(n, vector_width) * sizeof(vector_type);
311          void *p;
312          if (posix_memalign(&p, align, size))
313             return NULL;
314          memset(p, 0, size);
315          return reinterpret_cast<vector_type *>(p);
316       }
317    };
318 
319    /**
320     * Set the (i, p)-th component of weight vector \p v to \p x.
321     */
322    void
set(weight_vector_type & v,unsigned i,unsigned p,scalar_type x)323    set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x)
324    {
325       set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x);
326    }
327 
328    /**
329     * Get the (i, p)-th component of weight vector \p v.
330     */
331    scalar_type
get(const weight_vector_type & v,unsigned i,unsigned p)332    get(const weight_vector_type &v, unsigned i, unsigned p)
333    {
334       return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width);
335    }
336 
337    /**
338     * Swap the (i, p)-th and (j, q)-th components of weight vector \p v.
339     */
340    void
swap(weight_vector_type & v,unsigned i,unsigned p,unsigned j,unsigned q)341    swap(weight_vector_type &v,
342         unsigned i, unsigned p,
343         unsigned j, unsigned q)
344    {
345       const scalar_type tmp = get(v, i, p);
346       set(v, i, p, get(v, j, q));
347       set(v, j, q, tmp);
348    }
349 }
350 
351 namespace {
352    /**
353     * Object that represents the partitioning of an arbitrary register space
354     * into indivisible units (referred to as atoms below) that can potentially
355     * be rearranged independently from other registers.  The partitioning is
356     * inferred from a number of contiguity requirements specified using
357     * require_contiguous().  This allows efficient look-up of the atom index a
358     * given register address belongs to, or conversely the range of register
359     * addresses that belong to a given atom.
360     */
361    struct partitioning {
362       /**
363        * Create a (for the moment unrestricted) partitioning of a register
364        * file of size \p n.  The units are arbitrary.
365        */
partitioning__anonf3320bf40411::partitioning366       partitioning(unsigned n) :
367          max_reg(n),
368          offsets(new unsigned[n + num_terminator_atoms]),
369          atoms(new unsigned[n + num_terminator_atoms])
370       {
371          for (unsigned i = 0; i < n + num_terminator_atoms; i++) {
372             offsets[i] = i;
373             atoms[i] = i;
374          }
375       }
376 
partitioning__anonf3320bf40411::partitioning377       partitioning(const partitioning &p) :
378          max_reg(p.max_reg),
379          offsets(new unsigned[p.num_atoms() + num_terminator_atoms]),
380          atoms(new unsigned[p.max_reg + num_terminator_atoms])
381       {
382          memcpy(offsets, p.offsets,
383                 sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms));
384          memcpy(atoms, p.atoms,
385                 sizeof(unsigned) * (p.max_reg + num_terminator_atoms));
386       }
387 
~partitioning__anonf3320bf40411::partitioning388       ~partitioning()
389       {
390          delete[] offsets;
391          delete[] atoms;
392       }
393 
394       partitioning &
operator =__anonf3320bf40411::partitioning395       operator=(partitioning p)
396       {
397          SWAP(max_reg, p.max_reg);
398          SWAP(offsets, p.offsets);
399          SWAP(atoms, p.atoms);
400          return *this;
401       }
402 
403       /**
404        * Require register range [reg, reg + n[ to be considered part of the
405        * same atom.
406        */
407       void
require_contiguous__anonf3320bf40411::partitioning408       require_contiguous(unsigned reg, unsigned n)
409       {
410          unsigned r = atoms[reg];
411 
412          /* Renumber atoms[reg...] = { r... } and their offsets[r...] for the
413           * case that the specified contiguity requirement leads to the fusion
414           * (yay) of one or more existing atoms.
415           */
416          for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) {
417             if (offsets[atoms[reg1]] < reg + n) {
418                atoms[reg1] = r;
419             } else {
420                if (offsets[atoms[reg1 - 1]] != offsets[atoms[reg1]])
421                   r++;
422 
423                offsets[r] = offsets[atoms[reg1]];
424                atoms[reg1] = r;
425             }
426          }
427       }
428 
429       /**
430        * Get the atom index register address \p reg belongs to.
431        */
432       unsigned
atom_of_reg__anonf3320bf40411::partitioning433       atom_of_reg(unsigned reg) const
434       {
435          return atoms[reg];
436       }
437 
438       /**
439        * Get the base register address that belongs to atom \p r.
440        */
441       unsigned
reg_of_atom__anonf3320bf40411::partitioning442       reg_of_atom(unsigned r) const
443       {
444          return offsets[r];
445       }
446 
447       /**
448        * Get the size of atom \p r in register address units.
449        */
450       unsigned
size_of_atom__anonf3320bf40411::partitioning451       size_of_atom(unsigned r) const
452       {
453          assert(r < num_atoms());
454          return reg_of_atom(r + 1) - reg_of_atom(r);
455       }
456 
457       /**
458        * Get the number of atoms the whole register space is partitioned into.
459        */
460       unsigned
num_atoms__anonf3320bf40411::partitioning461       num_atoms() const
462       {
463          return atoms[max_reg];
464       }
465 
466    private:
467       /**
468        * Number of trailing atoms inserted for convenience so among other
469        * things we don't need to special-case the last element in
470        * size_of_atom().
471        */
472       static const unsigned num_terminator_atoms = 1;
473       unsigned max_reg;
474       unsigned *offsets;
475       unsigned *atoms;
476    };
477 
478    /**
479     * Only GRF sources (whether they have been register-allocated or not) can
480     * possibly incur bank conflicts.
481     */
482    bool
is_grf(const fs_reg & r)483    is_grf(const fs_reg &r)
484    {
485       return r.file == VGRF || r.file == FIXED_GRF;
486    }
487 
488    /**
489     * Register offset of \p r in GRF units.  Useful because the representation
490     * of GRFs post-register allocation is somewhat inconsistent and depends on
491     * whether the register already had a fixed GRF offset prior to register
492     * allocation or whether it was part of a VGRF allocation.
493     */
494    unsigned
reg_of(const fs_reg & r)495    reg_of(const fs_reg &r)
496    {
497       assert(is_grf(r));
498       if (r.file == VGRF)
499          return r.nr + r.offset / REG_SIZE;
500       else
501          return reg_offset(r) / REG_SIZE;
502    }
503 
504    /**
505     * Calculate the finest partitioning of the GRF space compatible with the
506     * register contiguity requirements derived from all instructions part of
507     * the program.
508     */
509    partitioning
shader_reg_partitioning(const fs_visitor * v)510    shader_reg_partitioning(const fs_visitor *v)
511    {
512       partitioning p(BRW_MAX_GRF);
513 
514       foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
515          if (is_grf(inst->dst))
516             p.require_contiguous(reg_of(inst->dst), regs_written(inst));
517 
518          for (int i = 0; i < inst->sources; i++) {
519             if (is_grf(inst->src[i]))
520                p.require_contiguous(reg_of(inst->src[i]), regs_read(inst, i));
521          }
522       }
523 
524       return p;
525    }
526 
527    /**
528     * Return the set of GRF atoms that should be left untouched at their
529     * original location to avoid violating hardware or software assumptions.
530     */
531    bool *
shader_reg_constraints(const fs_visitor * v,const partitioning & p)532    shader_reg_constraints(const fs_visitor *v, const partitioning &p)
533    {
534       bool *constrained = new bool[p.num_atoms()]();
535 
536       /* These are read implicitly by some send-message instructions without
537        * any indication at the IR level.  Assume they are unsafe to move
538        * around.
539        */
540       for (unsigned reg = 0; reg < 2; reg++)
541          constrained[p.atom_of_reg(reg)] = true;
542 
543       /* At Intel Broadwell PRM, vol 07, section "Instruction Set Reference",
544        * subsection "EUISA Instructions", Send Message (page 990):
545        *
546        * "r127 must not be used for return address when there is a src and
547        * dest overlap in send instruction."
548        *
549        * Register allocation ensures that, so don't move 127 around to avoid
550        * breaking that property.
551        */
552       if (v->devinfo->ver >= 8)
553          constrained[p.atom_of_reg(127)] = true;
554 
555       foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
556          /* Assume that anything referenced via fixed GRFs is baked into the
557           * hardware's fixed-function logic and may be unsafe to move around.
558           * Also take into account the source GRF restrictions of EOT
559           * send-message instructions.
560           */
561          if (inst->dst.file == FIXED_GRF)
562             constrained[p.atom_of_reg(reg_of(inst->dst))] = true;
563 
564          for (int i = 0; i < inst->sources; i++) {
565             if (inst->src[i].file == FIXED_GRF ||
566                 (is_grf(inst->src[i]) && inst->eot))
567                constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
568          }
569 
570          /* Preserve the original allocation of VGRFs used by the barycentric
571           * source of the LINTERP instruction on Gfx6, since pair-aligned
572           * barycentrics allow the PLN instruction to be used.
573           */
574          if (v->devinfo->has_pln && v->devinfo->ver <= 6 &&
575              inst->opcode == FS_OPCODE_LINTERP)
576             constrained[p.atom_of_reg(reg_of(inst->src[0]))] = true;
577 
578          /* The location of the Gfx7 MRF hack registers is hard-coded in the
579           * rest of the compiler back-end.  Don't attempt to move them around.
580           */
581          if (v->devinfo->ver >= 7) {
582             assert(inst->dst.file != MRF);
583 
584             for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
585                const unsigned reg = GFX7_MRF_HACK_START + inst->base_mrf + i;
586                constrained[p.atom_of_reg(reg)] = true;
587             }
588          }
589       }
590 
591       return constrained;
592    }
593 
594    /**
595     * Return whether the hardware will be able to prevent a bank conflict by
596     * optimizing out the read cycle of a source register.  The formula was
597     * found experimentally.
598     */
599    bool
is_conflict_optimized_out(const intel_device_info * devinfo,const fs_inst * inst)600    is_conflict_optimized_out(const intel_device_info *devinfo,
601                              const fs_inst *inst)
602    {
603       return devinfo->ver >= 9 &&
604          ((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
605                                     reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
606           reg_of(inst->src[1]) == reg_of(inst->src[2]));
607    }
608 
609    /**
610     * Return a matrix that allows reasonably efficient computation of the
611     * cycle-count cost of bank conflicts incurred throughout the whole program
612     * for any given atom-to-bank assignment.
613     *
614     * More precisely, if C_r_s_p is the result of this function, the total
615     * cost of all bank conflicts involving any given atom r can be readily
616     * recovered as follows:
617     *
618     *  S(B) = Sum_s_p(d_(p^B_r)_(B_s) * C_r_s_p)
619     *
620     * where d_i_j is the Kronecker delta, and B_r indicates the bank
621     * assignment of r.  \sa delta_conflicts() for a vectorized implementation
622     * of the expression above.
623     *
624     * FINISHME: Teach this about the Gfx10+ bank conflict rules, which are
625     *           somewhat more relaxed than on previous generations.  In the
626     *           meantime optimizing based on Gfx9 weights is likely to be more
627     *           helpful than not optimizing at all.
628     */
629    weight_vector_type *
shader_conflict_weight_matrix(const fs_visitor * v,const partitioning & p)630    shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
631    {
632       weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()];
633       for (unsigned r = 0; r < p.num_atoms(); r++)
634          conflicts[r] = weight_vector_type(2 * p.num_atoms());
635 
636       /* Crude approximation of the number of times the current basic block
637        * will be executed at run-time.
638        */
639       unsigned block_scale = 1;
640 
641       foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
642          if (inst->opcode == BRW_OPCODE_DO) {
643             block_scale *= 10;
644 
645          } else if (inst->opcode == BRW_OPCODE_WHILE) {
646             block_scale /= 10;
647 
648          } else if (inst->is_3src(v->devinfo) &&
649                     is_grf(inst->src[1]) && is_grf(inst->src[2])) {
650             const unsigned r = p.atom_of_reg(reg_of(inst->src[1]));
651             const unsigned s = p.atom_of_reg(reg_of(inst->src[2]));
652 
653             /* Estimate of the cycle-count cost of incurring a bank conflict
654              * for this instruction.  This is only true on the average, for a
655              * sequence of back-to-back ternary instructions, since the EU
656              * front-end only seems to be able to issue a new instruction at
657              * an even cycle.  The cost of a bank conflict incurred by an
658              * isolated ternary instruction may be higher.
659              */
660             const unsigned exec_size = inst->dst.component_size(inst->exec_size);
661             const unsigned cycle_scale = block_scale * DIV_ROUND_UP(exec_size,
662                                                                     REG_SIZE);
663 
664             /* Neglect same-atom conflicts (since they're either trivial or
665              * impossible to avoid without splitting the atom), and conflicts
666              * known to be optimized out by the hardware.
667              */
668             if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
669                /* Calculate the parity of the sources relative to the start of
670                 * their respective atoms.  If their parity is the same (and
671                 * none of the atoms straddle the 2KB mark), the instruction
672                 * will incur a conflict iff both atoms are assigned the same
673                 * bank b.  If their parity is opposite, the instruction will
674                 * incur a conflict iff they are assigned opposite banks (b and
675                 * b^1).
676                 */
677                const bool p_r = 1 & (reg_of(inst->src[1]) - p.reg_of_atom(r));
678                const bool p_s = 1 & (reg_of(inst->src[2]) - p.reg_of_atom(s));
679                const unsigned p = p_r ^ p_s;
680 
681                /* Calculate the updated cost of a hypothetical conflict
682                 * between atoms r and s.  Note that the weight matrix is
683                 * symmetric with respect to indices r and s by construction.
684                 */
685                const scalar_type w = MIN2(unsigned(max_scalar),
686                                           get(conflicts[r], s, p) + cycle_scale);
687                set(conflicts[r], s, p, w);
688                set(conflicts[s], r, p, w);
689             }
690          }
691       }
692 
693       return conflicts;
694    }
695 
696    /**
697     * Return the set of GRF atoms that could potentially lead to bank
698     * conflicts if laid out unfavorably in the GRF space according to
699     * the specified \p conflicts matrix (\sa
700     * shader_conflict_weight_matrix()).
701     */
702    bool *
have_any_conflicts(const partitioning & p,const weight_vector_type * conflicts)703    have_any_conflicts(const partitioning &p,
704                       const weight_vector_type *conflicts)
705    {
706       bool *any_conflicts = new bool[p.num_atoms()]();
707 
708       for (unsigned r = 0; r < p.num_atoms(); r++) {
709          const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width);
710          for (unsigned s = 0; s < m; s++)
711             any_conflicts[r] |= sums(conflicts[r].v[s]);
712       }
713 
714       return any_conflicts;
715    }
716 
717    /**
718     * Calculate the difference between two S(B) cost estimates as defined
719     * above (\sa shader_conflict_weight_matrix()).  This represents the
720     * (partial) cycle-count benefit from moving an atom r from bank p to n.
721     * The respective bank assignments Bp and Bn are encoded as the \p
722     * bank_mask_p and \p bank_mask_n bitmasks for efficient computation,
723     * according to the formula:
724     *
725     *  bank_mask(B)_s_p = -d_(p^B_r)_(B_s)
726     *
727     * Notice the similarity with the delta function in the S(B) expression
728     * above, and how bank_mask(B) can be precomputed for every possible
729     * selection of r since bank_mask(B) only depends on it via B_r that may
730     * only assume one of four different values, so the caller can keep every
731     * possible bank_mask(B) vector in memory without much hassle (\sa
732     * bank_characteristics()).
733     */
734    int
delta_conflicts(const weight_vector_type & bank_mask_p,const weight_vector_type & bank_mask_n,const weight_vector_type & conflicts)735    delta_conflicts(const weight_vector_type &bank_mask_p,
736                    const weight_vector_type &bank_mask_n,
737                    const weight_vector_type &conflicts)
738    {
739       const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width);
740       vector_type s_p = {}, s_n = {};
741 
742       for (unsigned r = 0; r < m; r++) {
743          s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r]));
744          s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r]));
745       }
746 
747       return sums(subs(s_p, s_n));
748    }
749 
750    /**
751     * Register atom permutation, represented as the start GRF offset each atom
752     * is mapped into.
753     */
754    struct permutation {
permutation__anonf3320bf40411::permutation755       permutation() : v(NULL), size(0) {}
756 
permutation__anonf3320bf40411::permutation757       permutation(unsigned n) :
758          v(new unsigned[n]()), size(n) {}
759 
permutation__anonf3320bf40411::permutation760       permutation(const permutation &p) :
761          v(new unsigned[p.size]), size(p.size)
762       {
763          memcpy(v, p.v, p.size * sizeof(unsigned));
764       }
765 
~permutation__anonf3320bf40411::permutation766       ~permutation()
767       {
768          delete[] v;
769       }
770 
771       permutation &
operator =__anonf3320bf40411::permutation772       operator=(permutation p)
773       {
774          SWAP(v, p.v);
775          SWAP(size, p.size);
776          return *this;
777       }
778 
779       unsigned *v;
780       unsigned size;
781    };
782 
783    /**
784     * Return an identity permutation of GRF atoms.
785     */
786    permutation
identity_reg_permutation(const partitioning & p)787    identity_reg_permutation(const partitioning &p)
788    {
789       permutation map(p.num_atoms());
790 
791       for (unsigned r = 0; r < map.size; r++)
792          map.v[r] = p.reg_of_atom(r);
793 
794       return map;
795    }
796 
797    /**
798     * Return the bank index of GRF address \p reg, numbered according to the
799     * table:
800     *        Even Odd
801     *    Lo    0   1
802     *    Hi    2   3
803     */
804    unsigned
bank_of(unsigned reg)805    bank_of(unsigned reg)
806    {
807       return (reg & 0x40) >> 5 | (reg & 1);
808    }
809 
810    /**
811     * Return bitmasks suitable for use as bank mask arguments for the
812     * delta_conflicts() computation.  Note that this is just the (negative)
813     * characteristic function of each bank, if you regard it as a set
814     * containing all atoms assigned to it according to the \p map array.
815     */
816    weight_vector_type *
bank_characteristics(const permutation & map)817    bank_characteristics(const permutation &map)
818    {
819       weight_vector_type *banks = new weight_vector_type[4];
820 
821       for (unsigned b = 0; b < 4; b++) {
822          banks[b] = weight_vector_type(2 * map.size);
823 
824          for (unsigned j = 0; j < map.size; j++) {
825             for (unsigned p = 0; p < 2; p++)
826                set(banks[b], j, p,
827                    (b ^ p) == bank_of(map.v[j]) ? -1 : 0);
828          }
829       }
830 
831       return banks;
832    }
833 
834    /**
835     * Return an improved permutation of GRF atoms based on \p map attempting
836     * to reduce the total cycle-count cost of bank conflicts greedily.
837     *
838     * Note that this doesn't attempt to merge multiple atoms into one, which
839     * may allow it to do a better job in some cases -- It simply reorders
840     * existing atoms in the GRF space without affecting their identity.
841     */
842    permutation
optimize_reg_permutation(const partitioning & p,const bool * constrained,const weight_vector_type * conflicts,permutation map)843    optimize_reg_permutation(const partitioning &p,
844                             const bool *constrained,
845                             const weight_vector_type *conflicts,
846                             permutation map)
847    {
848       const bool *any_conflicts = have_any_conflicts(p, conflicts);
849       weight_vector_type *banks = bank_characteristics(map);
850 
851       for (unsigned r = 0; r < map.size; r++) {
852          const unsigned bank_r = bank_of(map.v[r]);
853 
854          if (!constrained[r]) {
855             unsigned best_s = r;
856             int best_benefit = 0;
857 
858             for (unsigned s = 0; s < map.size; s++) {
859                const unsigned bank_s = bank_of(map.v[s]);
860 
861                if (bank_r != bank_s && !constrained[s] &&
862                    p.size_of_atom(r) == p.size_of_atom(s) &&
863                    (any_conflicts[r] || any_conflicts[s])) {
864                   const int benefit =
865                      delta_conflicts(banks[bank_r], banks[bank_s], conflicts[r]) +
866                      delta_conflicts(banks[bank_s], banks[bank_r], conflicts[s]);
867 
868                   if (benefit > best_benefit) {
869                      best_s = s;
870                      best_benefit = benefit;
871                   }
872                }
873             }
874 
875             if (best_s != r) {
876                for (unsigned b = 0; b < 4; b++) {
877                   for (unsigned p = 0; p < 2; p++)
878                      swap(banks[b], r, p, best_s, p);
879                }
880 
881                SWAP(map.v[r], map.v[best_s]);
882             }
883          }
884       }
885 
886       delete[] banks;
887       delete[] any_conflicts;
888       return map;
889    }
890 
891    /**
892     * Apply the GRF atom permutation given by \p map to register \p r and
893     * return the result.
894     */
895    fs_reg
transform(const partitioning & p,const permutation & map,fs_reg r)896    transform(const partitioning &p, const permutation &map, fs_reg r)
897    {
898       if (r.file == VGRF) {
899          const unsigned reg = reg_of(r);
900          const unsigned s = p.atom_of_reg(reg);
901          r.nr = map.v[s] + reg - p.reg_of_atom(s);
902          r.offset = r.offset % REG_SIZE;
903       }
904 
905       return r;
906    }
907 }
908 
909 bool
opt_bank_conflicts()910 fs_visitor::opt_bank_conflicts()
911 {
912    assert(grf_used || !"Must be called after register allocation");
913 
914    /* No ternary instructions -- No bank conflicts. */
915    if (devinfo->ver < 6)
916       return false;
917 
918    const partitioning p = shader_reg_partitioning(this);
919    const bool *constrained = shader_reg_constraints(this, p);
920    const weight_vector_type *conflicts =
921       shader_conflict_weight_matrix(this, p);
922    const permutation map =
923       optimize_reg_permutation(p, constrained, conflicts,
924                                identity_reg_permutation(p));
925 
926    foreach_block_and_inst(block, fs_inst, inst, cfg) {
927       inst->dst = transform(p, map, inst->dst);
928 
929       for (int i = 0; i < inst->sources; i++)
930          inst->src[i] = transform(p, map, inst->src[i]);
931    }
932 
933    delete[] conflicts;
934    delete[] constrained;
935    return true;
936 }
937 
938 /**
939  * Return whether the instruction incurs GRF bank conflict cycles.
940  *
941  * Note that this is only accurate after register allocation because otherwise
942  * we don't know which bank each VGRF is going to end up aligned to.
943  */
944 bool
has_bank_conflict(const intel_device_info * devinfo,const fs_inst * inst)945 has_bank_conflict(const intel_device_info *devinfo, const fs_inst *inst)
946 {
947    return inst->is_3src(devinfo) &&
948           is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
949           bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
950           !is_conflict_optimized_out(devinfo, inst);
951 }
952