1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_fs.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 
28 /** @file brw_fs_cmod_propagation.cpp
29  *
30  * Implements a pass that propagates the conditional modifier from a CMP x 0.0
31  * instruction into the instruction that generated x. For instance, in this
32  * sequence
33  *
34  *    add(8)          g70<1>F    g69<8,8,1>F    4096F
35  *    cmp.ge.f0(8)    null       g70<8,8,1>F    0F
36  *
37  * we can do the comparison as part of the ADD instruction directly:
38  *
39  *    add.ge.f0(8)    g70<1>F    g69<8,8,1>F    4096F
40  *
41  * If there had been a use of the flag register and another CMP using g70
42  *
43  *    add.ge.f0(8)    g70<1>F    g69<8,8,1>F    4096F
44  *    (+f0) sel(8)    g71<1>F    g72<8,8,1>F    g73<8,8,1>F
45  *    cmp.ge.f0(8)    null       g70<8,8,1>F    0F
46  *
47  * we can recognize that the CMP is generating the flag value that already
48  * exists and therefore remove the instruction.
49  */
50 
51 using namespace brw;
52 
53 static bool
cmod_propagate_cmp_to_add(const intel_device_info * devinfo,bblock_t * block,fs_inst * inst)54 cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block,
55                           fs_inst *inst)
56 {
57    bool read_flag = false;
58    const unsigned flags_written = inst->flags_written(devinfo);
59 
60    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
61       if (scan_inst->opcode == BRW_OPCODE_ADD &&
62           !scan_inst->is_partial_write() &&
63           scan_inst->exec_size == inst->exec_size) {
64          bool negate;
65 
66          /* A CMP is basically a subtraction.  The result of the
67           * subtraction must be the same as the result of the addition.
68           * This means that one of the operands must be negated.  So (a +
69           * b) vs (a == -b) or (a + -b) vs (a == b).
70           */
71          if ((inst->src[0].equals(scan_inst->src[0]) &&
72               inst->src[1].negative_equals(scan_inst->src[1])) ||
73              (inst->src[0].equals(scan_inst->src[1]) &&
74               inst->src[1].negative_equals(scan_inst->src[0]))) {
75             negate = false;
76          } else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
77                      inst->src[1].equals(scan_inst->src[1])) ||
78                     (inst->src[0].negative_equals(scan_inst->src[1]) &&
79                      inst->src[1].equals(scan_inst->src[0]))) {
80             negate = true;
81          } else {
82             goto not_match;
83          }
84 
85          /* If the scan instruction writes a different flag register than the
86           * instruction we're trying to propagate from, bail.
87           *
88           * FINISHME: The second part of the condition may be too strong.
89           * Perhaps (scan_inst->flags_written() & flags_written) !=
90           * flags_written?
91           */
92          if (scan_inst->flags_written(devinfo) != 0 &&
93              scan_inst->flags_written(devinfo) != flags_written)
94             goto not_match;
95 
96          /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
97           *
98           *    * Note that the [post condition signal] bits generated at
99           *      the output of a compute are before the .sat.
100           *
101           * Paragraph about post_zero does not mention saturation, but
102           * testing it on actual GPUs shows that conditional modifiers
103           * are applied after saturation.
104           *
105           *    * post_zero bit: This bit reflects whether the final
106           *      result is zero after all the clamping, normalizing,
107           *      or format conversion logic.
108           *
109           * For signed types we don't care about saturation: it won't
110           * change the result of conditional modifier.
111           *
112           * For floating and unsigned types there two special cases,
113           * when we can remove inst even if scan_inst is saturated: G
114           * and LE. Since conditional modifiers are just comparations
115           * against zero, saturating positive values to the upper
116           * limit never changes the result of comparation.
117           *
118           * For negative values:
119           * (sat(x) >  0) == (x >  0) --- false
120           * (sat(x) <= 0) == (x <= 0) --- true
121           */
122          const enum brw_conditional_mod cond =
123             negate ? brw_swap_cmod(inst->conditional_mod)
124             : inst->conditional_mod;
125 
126          if (scan_inst->saturate &&
127              (brw_reg_type_is_floating_point(scan_inst->dst.type) ||
128               brw_reg_type_is_unsigned_integer(scan_inst->dst.type)) &&
129              (cond != BRW_CONDITIONAL_G &&
130               cond != BRW_CONDITIONAL_LE))
131             goto not_match;
132 
133          /* Otherwise, try propagating the conditional. */
134          if (scan_inst->can_do_cmod() &&
135              ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
136               scan_inst->conditional_mod == cond)) {
137             scan_inst->conditional_mod = cond;
138             scan_inst->flag_subreg = inst->flag_subreg;
139             inst->remove(block, true);
140             return true;
141          }
142          break;
143       }
144 
145    not_match:
146       if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
147          break;
148 
149       read_flag = read_flag ||
150                   (scan_inst->flags_read(devinfo) & flags_written) != 0;
151    }
152 
153    return false;
154 }
155 
156 /**
157  * Propagate conditional modifiers from NOT instructions
158  *
159  * Attempt to convert sequences like
160  *
161  *    or(8)           g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
162  *    ...
163  *    not.nz.f0(8)    null            g78<8,8,1>UD
164  *
165  * into
166  *
167  *    or.z.f0(8)      g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
168  */
169 static bool
cmod_propagate_not(const intel_device_info * devinfo,bblock_t * block,fs_inst * inst)170 cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block,
171                    fs_inst *inst)
172 {
173    const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
174    bool read_flag = false;
175    const unsigned flags_written = inst->flags_written(devinfo);
176 
177    if (cond != BRW_CONDITIONAL_Z && cond != BRW_CONDITIONAL_NZ)
178       return false;
179 
180    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
181       if (regions_overlap(scan_inst->dst, scan_inst->size_written,
182                           inst->src[0], inst->size_read(0))) {
183          if (scan_inst->opcode != BRW_OPCODE_OR &&
184              scan_inst->opcode != BRW_OPCODE_AND)
185             break;
186 
187          if (scan_inst->is_partial_write() ||
188              scan_inst->dst.offset != inst->src[0].offset ||
189              scan_inst->exec_size != inst->exec_size)
190             break;
191 
192          /* If the scan instruction writes a different flag register than the
193           * instruction we're trying to propagate from, bail.
194           *
195           * FINISHME: The second part of the condition may be too strong.
196           * Perhaps (scan_inst->flags_written() & flags_written) !=
197           * flags_written?
198           */
199          if (scan_inst->flags_written(devinfo) != 0 &&
200              scan_inst->flags_written(devinfo) != flags_written)
201             break;
202 
203          if (scan_inst->can_do_cmod() &&
204              ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
205               scan_inst->conditional_mod == cond)) {
206             scan_inst->conditional_mod = cond;
207             scan_inst->flag_subreg = inst->flag_subreg;
208             inst->remove(block, true);
209             return true;
210          }
211          break;
212       }
213 
214       if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
215          break;
216 
217       read_flag = read_flag ||
218                   (scan_inst->flags_read(devinfo) & flags_written) != 0;
219    }
220 
221    return false;
222 }
223 
224 static bool
opt_cmod_propagation_local(const intel_device_info * devinfo,bblock_t * block)225 opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
226 {
227    bool progress = false;
228    int ip = block->end_ip + 1;
229 
230    foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
231       ip--;
232 
233       if ((inst->opcode != BRW_OPCODE_AND &&
234            inst->opcode != BRW_OPCODE_CMP &&
235            inst->opcode != BRW_OPCODE_MOV &&
236            inst->opcode != BRW_OPCODE_NOT) ||
237           inst->predicate != BRW_PREDICATE_NONE ||
238           !inst->dst.is_null() ||
239           (inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
240            inst->src[0].file != UNIFORM))
241          continue;
242 
243       /* An ABS source modifier can only be handled when processing a compare
244        * with a value other than zero.
245        */
246       if (inst->src[0].abs &&
247           (inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero()))
248          continue;
249 
250       /* Only an AND.NZ can be propagated.  Many AND.Z instructions are
251        * generated (for ir_unop_not in fs_visitor::emit_bool_to_cond_code).
252        * Propagating those would require inverting the condition on the CMP.
253        * This changes both the flag value and the register destination of the
254        * CMP.  That result may be used elsewhere, so we can't change its value
255        * on a whim.
256        */
257       if (inst->opcode == BRW_OPCODE_AND &&
258           !(inst->src[1].is_one() &&
259             inst->conditional_mod == BRW_CONDITIONAL_NZ &&
260             !inst->src[0].negate))
261          continue;
262 
263       /* A CMP with a second source of zero can match with anything.  A CMP
264        * with a second source that is not zero can only match with an ADD
265        * instruction.
266        *
267        * Only apply this optimization to float-point sources.  It can fail for
268        * integers.  For inputs a = 0x80000000, b = 4, int(0x80000000) < 4, but
269        * int(0x80000000) - 4 overflows and results in 0x7ffffffc.  that's not
270        * less than zero, so the flags get set differently than for (a < b).
271        */
272       if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
273          if (brw_reg_type_is_floating_point(inst->src[0].type) &&
274              cmod_propagate_cmp_to_add(devinfo, block, inst))
275             progress = true;
276 
277          continue;
278       }
279 
280       if (inst->opcode == BRW_OPCODE_NOT) {
281          progress = cmod_propagate_not(devinfo, block, inst) || progress;
282          continue;
283       }
284 
285       bool read_flag = false;
286       const unsigned flags_written = inst->flags_written(devinfo);
287       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
288          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
289                              inst->src[0], inst->size_read(0))) {
290             /* If the scan instruction writes a different flag register than
291              * the instruction we're trying to propagate from, bail.
292              *
293              * FINISHME: The second part of the condition may be too strong.
294              * Perhaps (scan_inst->flags_written() & flags_written) !=
295              * flags_written?
296              */
297             if (scan_inst->flags_written(devinfo) != 0 &&
298                 scan_inst->flags_written(devinfo) != flags_written)
299                break;
300 
301             if (scan_inst->is_partial_write() ||
302                 scan_inst->dst.offset != inst->src[0].offset ||
303                 scan_inst->exec_size != inst->exec_size)
304                break;
305 
306             /* CMP's result is the same regardless of dest type. */
307             if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
308                 scan_inst->opcode == BRW_OPCODE_CMP &&
309                 brw_reg_type_is_integer(inst->dst.type)) {
310                inst->remove(block, true);
311                progress = true;
312                break;
313             }
314 
315             /* If the AND wasn't handled by the previous case, it isn't safe
316              * to remove it.
317              */
318             if (inst->opcode == BRW_OPCODE_AND)
319                break;
320 
321             if (inst->opcode == BRW_OPCODE_MOV) {
322                if (brw_reg_type_is_floating_point(scan_inst->dst.type)) {
323                   /* If the destination type of scan_inst is floating-point,
324                    * then:
325                    *
326                    * - The source of the MOV instruction must be the same
327                    *   type.
328                    *
329                    * - The destination of the MOV instruction must be float
330                    *   point with a size at least as large as the destination
331                    *   of inst.  Size-reducing f2f conversions could cause
332                    *   non-zero values to become zero, etc.
333                    */
334                   if (scan_inst->dst.type != inst->src[0].type)
335                      break;
336 
337                   if (!brw_reg_type_is_floating_point(inst->dst.type))
338                      break;
339 
340                   if (type_sz(scan_inst->dst.type) > type_sz(inst->dst.type))
341                      break;
342                } else {
343                   /* If the destination type of scan_inst is integer, then:
344                    *
345                    * - The source of the MOV instruction must be integer with
346                    *   the same size.
347                    *
348                    * - If the conditional modifier is Z or NZ, then the
349                    *   destination type of inst must either be floating point
350                    *   (of any size) or integer with a size at least as large
351                    *   as the destination of inst.
352                    *
353                    * - If the conditional modifier is neither Z nor NZ, then the
354                    *   destination type of inst must either be floating point
355                    *   (of any size) or integer with a size at least as large
356                    *   as the destination of inst and the same signedness.
357                    */
358                   if (!brw_reg_type_is_integer(inst->src[0].type) ||
359                       type_sz(scan_inst->dst.type) != type_sz(inst->src[0].type))
360                      break;
361 
362                   if (brw_reg_type_is_integer(inst->dst.type)) {
363                      if (type_sz(inst->dst.type) < type_sz(scan_inst->dst.type))
364                         break;
365 
366                      if (inst->conditional_mod != BRW_CONDITIONAL_Z &&
367                          inst->conditional_mod != BRW_CONDITIONAL_NZ &&
368                          brw_reg_type_is_unsigned_integer(inst->dst.type) !=
369                          brw_reg_type_is_unsigned_integer(scan_inst->dst.type))
370                         break;
371                   }
372                }
373             } else {
374                /* Not safe to use inequality operators if the types are
375                 * different.
376                 */
377                if (scan_inst->dst.type != inst->src[0].type &&
378                    inst->conditional_mod != BRW_CONDITIONAL_Z &&
379                    inst->conditional_mod != BRW_CONDITIONAL_NZ)
380                   break;
381 
382                /* Comparisons operate differently for ints and floats */
383                if (scan_inst->dst.type != inst->dst.type) {
384                   /* Comparison result may be altered if the bit-size changes
385                    * since that affects range, denorms, etc
386                    */
387                   if (type_sz(scan_inst->dst.type) != type_sz(inst->dst.type))
388                      break;
389 
390                   if (brw_reg_type_is_floating_point(scan_inst->dst.type) !=
391                       brw_reg_type_is_floating_point(inst->dst.type))
392                      break;
393                }
394             }
395 
396             /* Knowing following:
397              * - CMP writes to flag register the result of
398              *   applying cmod to the `src0 - src1`.
399              *   After that it stores the same value to dst.
400              *   Other instructions first store their result to
401              *   dst, and then store cmod(dst) to the flag
402              *   register.
403              * - inst is either CMP or MOV
404              * - inst->dst is null
405              * - inst->src[0] overlaps with scan_inst->dst
406              * - inst->src[1] is zero
407              * - scan_inst wrote to a flag register
408              *
409              * There can be three possible paths:
410              *
411              * - scan_inst is CMP:
412              *
413              *   Considering that src0 is either 0x0 (false),
414              *   or 0xffffffff (true), and src1 is 0x0:
415              *
416              *   - If inst's cmod is NZ, we can always remove
417              *     scan_inst: NZ is invariant for false and true. This
418              *     holds even if src0 is NaN: .nz is the only cmod,
419              *     that returns true for NaN.
420              *
421              *   - .g is invariant if src0 has a UD type
422              *
423              *   - .l is invariant if src0 has a D type
424              *
425              * - scan_inst and inst have the same cmod:
426              *
427              *   If scan_inst is anything than CMP, it already
428              *   wrote the appropriate value to the flag register.
429              *
430              * - else:
431              *
432              *   We can change cmod of scan_inst to that of inst,
433              *   and remove inst. It is valid as long as we make
434              *   sure that no instruction uses the flag register
435              *   between scan_inst and inst.
436              */
437             if (!inst->src[0].negate &&
438                 scan_inst->flags_written(devinfo)) {
439                if (scan_inst->opcode == BRW_OPCODE_CMP) {
440                   if ((inst->conditional_mod == BRW_CONDITIONAL_NZ) ||
441                       (inst->conditional_mod == BRW_CONDITIONAL_G &&
442                        inst->src[0].type == BRW_REGISTER_TYPE_UD) ||
443                       (inst->conditional_mod == BRW_CONDITIONAL_L &&
444                        inst->src[0].type == BRW_REGISTER_TYPE_D)) {
445                      inst->remove(block, true);
446                      progress = true;
447                      break;
448                   }
449                } else if (scan_inst->conditional_mod == inst->conditional_mod) {
450                   /* On Gfx4 and Gfx5 sel.cond will dirty the flags, but the
451                    * flags value is not based on the result stored in the
452                    * destination.  On all other platforms sel.cond will not
453                    * write the flags, so execution will not get to this point.
454                    */
455                   if (scan_inst->opcode == BRW_OPCODE_SEL) {
456                      assert(devinfo->ver <= 5);
457                   } else {
458                      inst->remove(block, true);
459                      progress = true;
460                   }
461 
462                   break;
463                } else if (!read_flag && scan_inst->can_do_cmod()) {
464                   scan_inst->conditional_mod = inst->conditional_mod;
465                   scan_inst->flag_subreg = inst->flag_subreg;
466                   inst->remove(block, true);
467                   progress = true;
468                   break;
469                }
470             }
471 
472             /* The conditional mod of the CMP/CMPN instructions behaves
473              * specially because the flag output is not calculated from the
474              * result of the instruction, but the other way around, which
475              * means that even if the condmod to propagate and the condmod
476              * from the CMP instruction are the same they will in general give
477              * different results because they are evaluated based on different
478              * inputs.
479              */
480             if (scan_inst->opcode == BRW_OPCODE_CMP ||
481                 scan_inst->opcode == BRW_OPCODE_CMPN)
482                break;
483 
484             /* From the Sky Lake PRM, Vol 2a, "Multiply":
485              *
486              *    "When multiplying integer data types, if one of the sources
487              *     is a DW, the resulting full precision data is stored in
488              *     the accumulator. However, if the destination data type is
489              *     either W or DW, the low bits of the result are written to
490              *     the destination register and the remaining high bits are
491              *     discarded. This results in undefined Overflow and Sign
492              *     flags. Therefore, conditional modifiers and saturation
493              *     (.sat) cannot be used in this case."
494              *
495              * We just disallow cmod propagation on all integer multiplies.
496              */
497             if (!brw_reg_type_is_floating_point(scan_inst->dst.type) &&
498                 scan_inst->opcode == BRW_OPCODE_MUL)
499                break;
500 
501             enum brw_conditional_mod cond =
502                inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
503                                    : inst->conditional_mod;
504 
505             /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
506              *
507              *    * Note that the [post condition signal] bits generated at
508              *      the output of a compute are before the .sat.
509              *
510              * Paragraph about post_zero does not mention saturation, but
511              * testing it on actual GPUs shows that conditional modifiers are
512              * applied after saturation.
513              *
514              *    * post_zero bit: This bit reflects whether the final
515              *      result is zero after all the clamping, normalizing,
516              *      or format conversion logic.
517              *
518              * For this reason, no additional restrictions are necessary on
519              * instructions with saturate.
520              */
521 
522             /* Otherwise, try propagating the conditional. */
523             if (scan_inst->can_do_cmod() &&
524                 ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
525                  scan_inst->conditional_mod == cond)) {
526                scan_inst->conditional_mod = cond;
527                scan_inst->flag_subreg = inst->flag_subreg;
528                inst->remove(block, true);
529                progress = true;
530             }
531             break;
532          }
533 
534          if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
535             break;
536 
537          read_flag = read_flag ||
538                      (scan_inst->flags_read(devinfo) & flags_written) != 0;
539       }
540    }
541 
542    /* There is progress if and only if instructions were removed. */
543    assert(progress == (block->end_ip_delta != 0));
544 
545    return progress;
546 }
547 
548 bool
opt_cmod_propagation()549 fs_visitor::opt_cmod_propagation()
550 {
551    bool progress = false;
552 
553    foreach_block_reverse(block, cfg) {
554       progress = opt_cmod_propagation_local(devinfo, block) || progress;
555    }
556 
557    if (progress) {
558       cfg->adjust_block_ips();
559 
560       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
561    }
562 
563    return progress;
564 }
565