1 /*
2  * Copyright © 2018 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_fs.h"
25 #include "brw_cfg.h"
26 #include "brw_fs_builder.h"
27 
28 using namespace brw;
29 
30 namespace {
31    /* From the SKL PRM Vol 2a, "Move":
32     *
33     * "A mov with the same source and destination type, no source modifier,
34     *  and no saturation is a raw move. A packed byte destination region (B
35     *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
36     *  using raw move."
37     */
38    bool
is_byte_raw_mov(const fs_inst * inst)39    is_byte_raw_mov(const fs_inst *inst)
40    {
41       return type_sz(inst->dst.type) == 1 &&
42              inst->opcode == BRW_OPCODE_MOV &&
43              inst->src[0].type == inst->dst.type &&
44              !inst->saturate &&
45              !inst->src[0].negate &&
46              !inst->src[0].abs;
47    }
48 
49    /*
50     * Return an acceptable byte stride for the destination of an instruction
51     * that requires it to have some particular alignment.
52     */
53    unsigned
required_dst_byte_stride(const fs_inst * inst)54    required_dst_byte_stride(const fs_inst *inst)
55    {
56       if (inst->dst.is_accumulator()) {
57          /* If the destination is an accumulator, insist that we leave the
58           * stride alone.  We cannot "fix" accumulator destinations by writing
59           * to a temporary and emitting a MOV into the original destination.
60           * For multiply instructions (our one use of the accumulator), the
61           * MUL writes the full 66 bits of the accumulator whereas the MOV we
62           * would emit only writes 33 bits and leaves the top 33 bits
63           * undefined.
64           *
65           * It's safe to just require the original stride here because the
66           * lowering pass will detect the mismatch in has_invalid_src_region
67           * and fix the sources of the multiply instead of the destination.
68           */
69          return inst->dst.stride * type_sz(inst->dst.type);
70       } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
71           !is_byte_raw_mov(inst)) {
72          return get_exec_type_size(inst);
73       } else {
74          /* Calculate the maximum byte stride and the minimum/maximum type
75           * size across all source and destination operands we are required to
76           * lower.
77           */
78          unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
79          unsigned min_size = type_sz(inst->dst.type);
80          unsigned max_size = type_sz(inst->dst.type);
81 
82          for (unsigned i = 0; i < inst->sources; i++) {
83             if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
84                const unsigned size = type_sz(inst->src[i].type);
85                max_stride = MAX2(max_stride, inst->src[i].stride * size);
86                min_size = MIN2(min_size, size);
87                max_size = MAX2(max_size, size);
88             }
89          }
90 
91          /* All operands involved in lowering need to fit in the calculated
92           * stride.
93           */
94          assert(max_size <= 4 * min_size);
95 
96          /* Attempt to use the largest byte stride among all present operands,
97           * but never exceed a stride of 4 since that would lead to illegal
98           * destination regions during lowering.
99           */
100          return MIN2(max_stride, 4 * min_size);
101       }
102    }
103 
104    /*
105     * Return an acceptable byte sub-register offset for the destination of an
106     * instruction that requires it to be aligned to the sub-register offset of
107     * the sources.
108     */
109    unsigned
required_dst_byte_offset(const fs_inst * inst)110    required_dst_byte_offset(const fs_inst *inst)
111    {
112       for (unsigned i = 0; i < inst->sources; i++) {
113          if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
114             if (reg_offset(inst->src[i]) % REG_SIZE !=
115                 reg_offset(inst->dst) % REG_SIZE)
116                return 0;
117       }
118 
119       return reg_offset(inst->dst) % REG_SIZE;
120    }
121 
122    /*
123     * Return whether the instruction has an unsupported channel bit layout
124     * specified for the i-th source region.
125     */
126    bool
has_invalid_src_region(const intel_device_info * devinfo,const fs_inst * inst,unsigned i)127    has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
128                           unsigned i)
129    {
130       if (is_unordered(inst) || inst->is_control_source(i))
131          return false;
132 
133       /* Empirical testing shows that Broadwell has a bug affecting half-float
134        * MAD instructions when any of its sources has a non-zero offset, such
135        * as:
136        *
137        * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
138        *
139        * We used to generate code like this for SIMD8 executions where we
140        * used to pack components Y and W of a vector at offset 16B of a SIMD
141        * register. The problem doesn't occur if the stride of the source is 0.
142        */
143       if (devinfo->ver == 8 &&
144           inst->opcode == BRW_OPCODE_MAD &&
145           inst->src[i].type == BRW_REGISTER_TYPE_HF &&
146           reg_offset(inst->src[i]) % REG_SIZE > 0 &&
147           inst->src[i].stride != 0) {
148          return true;
149       }
150 
151       const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
152       const unsigned src_byte_stride = inst->src[i].stride *
153          type_sz(inst->src[i].type);
154       const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
155       const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;
156 
157       return has_dst_aligned_region_restriction(devinfo, inst) &&
158              !is_uniform(inst->src[i]) &&
159              (src_byte_stride != dst_byte_stride ||
160               src_byte_offset != dst_byte_offset);
161    }
162 
163    /*
164     * Return whether the instruction has an unsupported channel bit layout
165     * specified for the destination region.
166     */
167    bool
has_invalid_dst_region(const intel_device_info * devinfo,const fs_inst * inst)168    has_invalid_dst_region(const intel_device_info *devinfo,
169                           const fs_inst *inst)
170    {
171       if (is_unordered(inst)) {
172          return false;
173       } else {
174          const brw_reg_type exec_type = get_exec_type(inst);
175          const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
176          const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
177          const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
178             type_sz(inst->dst.type) < type_sz(exec_type);
179 
180          return (has_dst_aligned_region_restriction(devinfo, inst) &&
181                  (required_dst_byte_stride(inst) != dst_byte_stride ||
182                   required_dst_byte_offset(inst) != dst_byte_offset)) ||
183                 (is_narrowing_conversion &&
184                  required_dst_byte_stride(inst) != dst_byte_stride);
185       }
186    }
187 
188    /**
189     * Return a non-zero value if the execution type of the instruction is
190     * unsupported.  The destination and sources matching the returned mask
191     * will be bit-cast to an integer type of appropriate size, lowering any
192     * source or destination modifiers into separate MOV instructions.
193     */
194    unsigned
has_invalid_exec_type(const intel_device_info * devinfo,const fs_inst * inst)195    has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
196    {
197       switch (inst->opcode) {
198       case SHADER_OPCODE_SHUFFLE:
199       case SHADER_OPCODE_QUAD_SWIZZLE:
200          return has_dst_aligned_region_restriction(devinfo, inst) ?
201                 0x1 : 0;
202 
203       case SHADER_OPCODE_BROADCAST:
204       case SHADER_OPCODE_MOV_INDIRECT:
205          return (((devinfo->verx10 == 70) ||
206                   devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) ||
207                   devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
208                 (devinfo->verx10 >= 125 &&
209                  brw_reg_type_is_floating_point(inst->src[0].type)) ?
210                 0x1 : 0;
211 
212       default:
213          return 0;
214       }
215    }
216 
217    /*
218     * Return whether the instruction has unsupported source modifiers
219     * specified for the i-th source region.
220     */
221    bool
has_invalid_src_modifiers(const intel_device_info * devinfo,const fs_inst * inst,unsigned i)222    has_invalid_src_modifiers(const intel_device_info *devinfo,
223                              const fs_inst *inst, unsigned i)
224    {
225       return (!inst->can_do_source_mods(devinfo) &&
226               (inst->src[i].negate || inst->src[i].abs)) ||
227              ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
228               (inst->src[i].negate || inst->src[i].abs ||
229                inst->src[i].type != get_exec_type(inst)));
230    }
231 
232    /*
233     * Return whether the instruction has an unsupported type conversion
234     * specified for the destination.
235     */
236    bool
has_invalid_conversion(const intel_device_info * devinfo,const fs_inst * inst)237    has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
238    {
239       switch (inst->opcode) {
240       case BRW_OPCODE_MOV:
241          return false;
242       case BRW_OPCODE_SEL:
243          return inst->dst.type != get_exec_type(inst);
244       default:
245          /* FIXME: We assume the opcodes not explicitly mentioned before just
246           * work fine with arbitrary conversions, unless they need to be
247           * bit-cast.
248           */
249          return has_invalid_exec_type(devinfo, inst) &&
250                 inst->dst.type != get_exec_type(inst);
251       }
252    }
253 
254    /**
255     * Return whether the instruction has unsupported destination modifiers.
256     */
257    bool
has_invalid_dst_modifiers(const intel_device_info * devinfo,const fs_inst * inst)258    has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
259    {
260       return (has_invalid_exec_type(devinfo, inst) &&
261               (inst->saturate || inst->conditional_mod)) ||
262              has_invalid_conversion(devinfo, inst);
263    }
264 
265    /**
266     * Return whether the instruction has non-standard semantics for the
267     * conditional mod which don't cause the flag register to be updated with
268     * the comparison result.
269     */
270    bool
has_inconsistent_cmod(const fs_inst * inst)271    has_inconsistent_cmod(const fs_inst *inst)
272    {
273       return inst->opcode == BRW_OPCODE_SEL ||
274              inst->opcode == BRW_OPCODE_CSEL ||
275              inst->opcode == BRW_OPCODE_IF ||
276              inst->opcode == BRW_OPCODE_WHILE;
277    }
278 
   /* Forward declaration: the lowering helpers defined before
    * lower_instruction() call it on the MOV instructions they emit, so it
    * has to be declared ahead of its definition further down in this file.
    */
   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
281 }
282 
283 namespace brw {
284    /**
285     * Remove any modifiers from the \p i-th source region of the instruction,
286     * including negate, abs and any implicit type conversion to the execution
287     * type.  Instead any source modifiers will be implemented as a separate
288     * MOV instruction prior to the original instruction.
289     */
290    bool
lower_src_modifiers(fs_visitor * v,bblock_t * block,fs_inst * inst,unsigned i)291    lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
292    {
293       assert(inst->components_read(i) == 1);
294       assert(v->devinfo->has_integer_dword_mul ||
295              inst->opcode != BRW_OPCODE_MUL ||
296              brw_reg_type_is_floating_point(get_exec_type(inst)) ||
297              MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||
298              type_sz(inst->src[i].type) == get_exec_type_size(inst));
299 
300       const fs_builder ibld(v, block, inst);
301       const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
302 
303       lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
304       inst->src[i] = tmp;
305 
306       return true;
307    }
308 }
309 
310 namespace {
311    /**
312     * Remove any modifiers from the destination region of the instruction,
313     * including saturate, conditional mod and any implicit type conversion
314     * from the execution type.  Instead any destination modifiers will be
315     * implemented as a separate MOV instruction after the original
316     * instruction.
317     */
318    bool
lower_dst_modifiers(fs_visitor * v,bblock_t * block,fs_inst * inst)319    lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
320    {
321       const fs_builder ibld(v, block, inst);
322       const brw_reg_type type = get_exec_type(inst);
323       /* Not strictly necessary, but if possible use a temporary with the same
324        * channel alignment as the current destination in order to avoid
325        * violating the restrictions enforced later on by lower_src_region()
326        * and lower_dst_region(), which would introduce additional copy
327        * instructions into the program unnecessarily.
328        */
329       const unsigned stride =
330          type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
331          type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
332       fs_reg tmp = ibld.vgrf(type, stride);
333       ibld.UNDEF(tmp);
334       tmp = horiz_stride(tmp, stride);
335 
336       /* Emit a MOV taking care of all the destination modifiers. */
337       fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
338       mov->saturate = inst->saturate;
339       if (!has_inconsistent_cmod(inst))
340          mov->conditional_mod = inst->conditional_mod;
341       if (inst->opcode != BRW_OPCODE_SEL) {
342          mov->predicate = inst->predicate;
343          mov->predicate_inverse = inst->predicate_inverse;
344       }
345       mov->flag_subreg = inst->flag_subreg;
346       lower_instruction(v, block, mov);
347 
348       /* Point the original instruction at the temporary, and clean up any
349        * destination modifiers.
350        */
351       assert(inst->size_written == inst->dst.component_size(inst->exec_size));
352       inst->dst = tmp;
353       inst->size_written = inst->dst.component_size(inst->exec_size);
354       inst->saturate = false;
355       if (!has_inconsistent_cmod(inst))
356          inst->conditional_mod = BRW_CONDITIONAL_NONE;
357 
358       assert(!inst->flags_written(v->devinfo) || !mov->predicate);
359       return true;
360    }
361 
362    /**
363     * Remove any non-trivial shuffling of data from the \p i-th source region
364     * of the instruction.  Instead implement the region as a series of integer
365     * copies into a temporary with the same channel layout as the destination.
366     */
367    bool
lower_src_region(fs_visitor * v,bblock_t * block,fs_inst * inst,unsigned i)368    lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
369    {
370       assert(inst->components_read(i) == 1);
371       const fs_builder ibld(v, block, inst);
372       const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
373                               type_sz(inst->src[i].type);
374       assert(stride > 0);
375       fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
376       ibld.UNDEF(tmp);
377       tmp = horiz_stride(tmp, stride);
378 
379       /* Emit a series of 32-bit integer copies with any source modifiers
380        * cleaned up (because their semantics are dependent on the type).
381        */
382       const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
383                                                  false);
384       const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
385       fs_reg raw_src = inst->src[i];
386       raw_src.negate = false;
387       raw_src.abs = false;
388 
389       for (unsigned j = 0; j < n; j++)
390          ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));
391 
392       /* Point the original instruction at the temporary, making sure to keep
393        * any source modifiers in the instruction.
394        */
395       fs_reg lower_src = tmp;
396       lower_src.negate = inst->src[i].negate;
397       lower_src.abs = inst->src[i].abs;
398       inst->src[i] = lower_src;
399 
400       return true;
401    }
402 
403    /**
404     * Remove any non-trivial shuffling of data from the destination region of
405     * the instruction.  Instead implement the region as a series of integer
406     * copies from a temporary with a channel layout compatible with the
407     * sources.
408     */
409    bool
lower_dst_region(fs_visitor * v,bblock_t * block,fs_inst * inst)410    lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
411    {
412       /* We cannot replace the result of an integer multiply which writes the
413        * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
414        * value whereas the MOV will act on only 32 or 33 bits of the
415        * accumulator.
416        */
417       assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
418              brw_reg_type_is_floating_point(inst->dst.type));
419 
420       const fs_builder ibld(v, block, inst);
421       const unsigned stride = required_dst_byte_stride(inst) /
422                               type_sz(inst->dst.type);
423       assert(stride > 0);
424       fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
425       ibld.UNDEF(tmp);
426       tmp = horiz_stride(tmp, stride);
427 
428       /* Emit a series of 32-bit integer copies from the temporary into the
429        * original destination.
430        */
431       const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
432                                                  false);
433       const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
434 
435       if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
436          /* Note that in general we cannot simply predicate the copies on the
437           * same flag register as the original instruction, since it may have
438           * been overwritten by the instruction itself.  Instead initialize
439           * the temporary with the previous contents of the destination
440           * register.
441           */
442          for (unsigned j = 0; j < n; j++)
443             ibld.MOV(subscript(tmp, raw_type, j),
444                      subscript(inst->dst, raw_type, j));
445       }
446 
447       for (unsigned j = 0; j < n; j++)
448          ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
449                                         subscript(tmp, raw_type, j));
450 
451       /* Point the original instruction at the temporary, making sure to keep
452        * any destination modifiers in the instruction.
453        */
454       assert(inst->size_written == inst->dst.component_size(inst->exec_size));
455       inst->dst = tmp;
456       inst->size_written = inst->dst.component_size(inst->exec_size);
457 
458       return true;
459    }
460 
461    /**
462     * Bit-cast sources and destination of the instruction to an appropriate
463     * integer type, to be used in cases where the instruction doesn't support
464     * some other execution type.
465     */
466    bool
lower_exec_type(fs_visitor * v,bblock_t * block,fs_inst * inst)467    lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
468    {
469       assert(inst->dst.type == get_exec_type(inst));
470       const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
471       const brw_reg_type raw_type = brw_int_type(type_sz(inst->dst.type), false);
472 
473       for (unsigned i = 0; i < inst->sources; i++) {
474          if (mask & (1u << i)) {
475             assert(inst->src[i].type == inst->dst.type);
476             inst->src[i].type = raw_type;
477          }
478       }
479 
480       inst->dst.type = raw_type;
481 
482       return true;
483    }
484 
485    /**
486     * Legalize the source and destination regioning controls of the specified
487     * instruction.
488     */
489    bool
lower_instruction(fs_visitor * v,bblock_t * block,fs_inst * inst)490    lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
491    {
492       const intel_device_info *devinfo = v->devinfo;
493       bool progress = false;
494 
495       if (has_invalid_dst_modifiers(devinfo, inst))
496          progress |= lower_dst_modifiers(v, block, inst);
497 
498       if (has_invalid_dst_region(devinfo, inst))
499          progress |= lower_dst_region(v, block, inst);
500 
501       for (unsigned i = 0; i < inst->sources; i++) {
502          if (has_invalid_src_modifiers(devinfo, inst, i))
503             progress |= lower_src_modifiers(v, block, inst, i);
504 
505          if (has_invalid_src_region(devinfo, inst, i))
506             progress |= lower_src_region(v, block, inst, i);
507       }
508 
509       if (has_invalid_exec_type(devinfo, inst))
510          progress |= lower_exec_type(v, block, inst);
511 
512       return progress;
513    }
514 }
515 
516 bool
lower_regioning()517 fs_visitor::lower_regioning()
518 {
519    bool progress = false;
520 
521    foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
522       progress |= lower_instruction(this, block, inst);
523 
524    if (progress)
525       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
526 
527    return progress;
528 }
529