1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_fs.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 
28 /** @file brw_fs_cmod_propagation.cpp
29  *
30  * Implements a pass that propagates the conditional modifier from a CMP x 0.0
31  * instruction into the instruction that generated x. For instance, in this
32  * sequence
33  *
34  *    add(8)          g70<1>F    g69<8,8,1>F    4096F
35  *    cmp.ge.f0(8)    null       g70<8,8,1>F    0F
36  *
37  * we can do the comparison as part of the ADD instruction directly:
38  *
39  *    add.ge.f0(8)    g70<1>F    g69<8,8,1>F    4096F
40  *
41  * If there had been a use of the flag register and another CMP using g70
42  *
43  *    add.ge.f0(8)    g70<1>F    g69<8,8,1>F    4096F
44  *    (+f0) sel(8)    g71<F>     g72<8,8,1>F    g73<8,8,1>F
45  *    cmp.ge.f0(8)    null       g70<8,8,1>F    0F
46  *
47  * we can recognize that the CMP is generating the flag value that already
48  * exists and therefore remove the instruction.
49  */
50 
51 using namespace brw;
52 
53 static bool
cmod_propagate_cmp_to_add(const gen_device_info * devinfo,bblock_t * block,fs_inst * inst)54 cmod_propagate_cmp_to_add(const gen_device_info *devinfo, bblock_t *block,
55                           fs_inst *inst)
56 {
57    bool read_flag = false;
58    const unsigned flags_written = inst->flags_written();
59 
60    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
61       if (scan_inst->opcode == BRW_OPCODE_ADD &&
62           !scan_inst->is_partial_write() &&
63           scan_inst->exec_size == inst->exec_size) {
64          bool negate;
65 
66          /* A CMP is basically a subtraction.  The result of the
67           * subtraction must be the same as the result of the addition.
68           * This means that one of the operands must be negated.  So (a +
69           * b) vs (a == -b) or (a + -b) vs (a == b).
70           */
71          if ((inst->src[0].equals(scan_inst->src[0]) &&
72               inst->src[1].negative_equals(scan_inst->src[1])) ||
73              (inst->src[0].equals(scan_inst->src[1]) &&
74               inst->src[1].negative_equals(scan_inst->src[0]))) {
75             negate = false;
76          } else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
77                      inst->src[1].equals(scan_inst->src[1])) ||
78                     (inst->src[0].negative_equals(scan_inst->src[1]) &&
79                      inst->src[1].equals(scan_inst->src[0]))) {
80             negate = true;
81          } else {
82             goto not_match;
83          }
84 
85          /* If the scan instruction writes a different flag register than the
86           * instruction we're trying to propagate from, bail.
87           *
88           * FINISHME: The second part of the condition may be too strong.
89           * Perhaps (scan_inst->flags_written() & flags_written) !=
90           * flags_written?
91           */
92          if (scan_inst->flags_written() != 0 &&
93              scan_inst->flags_written() != flags_written)
94             goto not_match;
95 
96          /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
97           *
98           *    * Note that the [post condition signal] bits generated at
99           *      the output of a compute are before the .sat.
100           *
101           * Paragraph about post_zero does not mention saturation, but
102           * testing it on actual GPUs shows that conditional modifiers
103           * are applied after saturation.
104           *
105           *    * post_zero bit: This bit reflects whether the final
106           *      result is zero after all the clamping, normalizing,
107           *      or format conversion logic.
108           *
109           * For signed types we don't care about saturation: it won't
110           * change the result of conditional modifier.
111           *
112           * For floating and unsigned types there two special cases,
113           * when we can remove inst even if scan_inst is saturated: G
114           * and LE. Since conditional modifiers are just comparations
115           * against zero, saturating positive values to the upper
116           * limit never changes the result of comparation.
117           *
118           * For negative values:
119           * (sat(x) >  0) == (x >  0) --- false
120           * (sat(x) <= 0) == (x <= 0) --- true
121           */
122          const enum brw_conditional_mod cond =
123             negate ? brw_swap_cmod(inst->conditional_mod)
124             : inst->conditional_mod;
125 
126          if (scan_inst->saturate &&
127              (brw_reg_type_is_floating_point(scan_inst->dst.type) ||
128               type_is_unsigned_int(scan_inst->dst.type)) &&
129              (cond != BRW_CONDITIONAL_G &&
130               cond != BRW_CONDITIONAL_LE))
131             goto not_match;
132 
133          /* Otherwise, try propagating the conditional. */
134          if (scan_inst->can_do_cmod() &&
135              ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
136               scan_inst->conditional_mod == cond)) {
137             scan_inst->conditional_mod = cond;
138             inst->remove(block);
139             return true;
140          }
141          break;
142       }
143 
144    not_match:
145       if ((scan_inst->flags_written() & flags_written) != 0)
146          break;
147 
148       read_flag = read_flag ||
149                   (scan_inst->flags_read(devinfo) & flags_written) != 0;
150    }
151 
152    return false;
153 }
154 
155 /**
156  * Propagate conditional modifiers from NOT instructions
157  *
158  * Attempt to convert sequences like
159  *
160  *    or(8)           g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
161  *    ...
162  *    not.nz.f0(8)    null            g78<8,8,1>UD
163  *
164  * into
165  *
166  *    or.z.f0(8)      g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
167  */
168 static bool
cmod_propagate_not(const gen_device_info * devinfo,bblock_t * block,fs_inst * inst)169 cmod_propagate_not(const gen_device_info *devinfo, bblock_t *block,
170                    fs_inst *inst)
171 {
172    const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
173    bool read_flag = false;
174    const unsigned flags_written = inst->flags_written();
175 
176    if (cond != BRW_CONDITIONAL_Z && cond != BRW_CONDITIONAL_NZ)
177       return false;
178 
179    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
180       if (regions_overlap(scan_inst->dst, scan_inst->size_written,
181                           inst->src[0], inst->size_read(0))) {
182          if (scan_inst->opcode != BRW_OPCODE_OR &&
183              scan_inst->opcode != BRW_OPCODE_AND)
184             break;
185 
186          if (scan_inst->is_partial_write() ||
187              scan_inst->dst.offset != inst->src[0].offset ||
188              scan_inst->exec_size != inst->exec_size)
189             break;
190 
191          /* If the scan instruction writes a different flag register than the
192           * instruction we're trying to propagate from, bail.
193           *
194           * FINISHME: The second part of the condition may be too strong.
195           * Perhaps (scan_inst->flags_written() & flags_written) !=
196           * flags_written?
197           */
198          if (scan_inst->flags_written() != 0 &&
199              scan_inst->flags_written() != flags_written)
200             break;
201 
202          if (scan_inst->can_do_cmod() &&
203              ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
204               scan_inst->conditional_mod == cond)) {
205             scan_inst->conditional_mod = cond;
206             inst->remove(block);
207             return true;
208          }
209          break;
210       }
211 
212       if ((scan_inst->flags_written() & flags_written) != 0)
213          break;
214 
215       read_flag = read_flag ||
216                   (scan_inst->flags_read(devinfo) & flags_written) != 0;
217    }
218 
219    return false;
220 }
221 
222 static bool
opt_cmod_propagation_local(const gen_device_info * devinfo,bblock_t * block)223 opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
224 {
225    bool progress = false;
226    int ip = block->end_ip + 1;
227 
228    foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
229       ip--;
230 
231       if ((inst->opcode != BRW_OPCODE_AND &&
232            inst->opcode != BRW_OPCODE_CMP &&
233            inst->opcode != BRW_OPCODE_MOV &&
234            inst->opcode != BRW_OPCODE_NOT) ||
235           inst->predicate != BRW_PREDICATE_NONE ||
236           !inst->dst.is_null() ||
237           (inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
238            inst->src[0].file != UNIFORM))
239          continue;
240 
241       /* An ABS source modifier can only be handled when processing a compare
242        * with a value other than zero.
243        */
244       if (inst->src[0].abs &&
245           (inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero()))
246          continue;
247 
248       /* Only an AND.NZ can be propagated.  Many AND.Z instructions are
249        * generated (for ir_unop_not in fs_visitor::emit_bool_to_cond_code).
250        * Propagating those would require inverting the condition on the CMP.
251        * This changes both the flag value and the register destination of the
252        * CMP.  That result may be used elsewhere, so we can't change its value
253        * on a whim.
254        */
255       if (inst->opcode == BRW_OPCODE_AND &&
256           !(inst->src[1].is_one() &&
257             inst->conditional_mod == BRW_CONDITIONAL_NZ &&
258             !inst->src[0].negate))
259          continue;
260 
261       if (inst->opcode == BRW_OPCODE_MOV &&
262           inst->conditional_mod != BRW_CONDITIONAL_NZ)
263          continue;
264 
265       /* A CMP with a second source of zero can match with anything.  A CMP
266        * with a second source that is not zero can only match with an ADD
267        * instruction.
268        *
269        * Only apply this optimization to float-point sources.  It can fail for
270        * integers.  For inputs a = 0x80000000, b = 4, int(0x80000000) < 4, but
271        * int(0x80000000) - 4 overflows and results in 0x7ffffffc.  that's not
272        * less than zero, so the flags get set differently than for (a < b).
273        */
274       if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
275          if (brw_reg_type_is_floating_point(inst->src[0].type) &&
276              cmod_propagate_cmp_to_add(devinfo, block, inst))
277             progress = true;
278 
279          continue;
280       }
281 
282       if (inst->opcode == BRW_OPCODE_NOT) {
283          progress = cmod_propagate_not(devinfo, block, inst) || progress;
284          continue;
285       }
286 
287       bool read_flag = false;
288       const unsigned flags_written = inst->flags_written();
289       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
290          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
291                              inst->src[0], inst->size_read(0))) {
292             /* If the scan instruction writes a different flag register than
293              * the instruction we're trying to propagate from, bail.
294              *
295              * FINISHME: The second part of the condition may be too strong.
296              * Perhaps (scan_inst->flags_written() & flags_written) !=
297              * flags_written?
298              */
299             if (scan_inst->flags_written() != 0 &&
300                 scan_inst->flags_written() != flags_written)
301                break;
302 
303             if (scan_inst->is_partial_write() ||
304                 scan_inst->dst.offset != inst->src[0].offset ||
305                 scan_inst->exec_size != inst->exec_size)
306                break;
307 
308             /* CMP's result is the same regardless of dest type. */
309             if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
310                 scan_inst->opcode == BRW_OPCODE_CMP &&
311                 brw_reg_type_is_integer(inst->dst.type)) {
312                inst->remove(block);
313                progress = true;
314                break;
315             }
316 
317             /* If the AND wasn't handled by the previous case, it isn't safe
318              * to remove it.
319              */
320             if (inst->opcode == BRW_OPCODE_AND)
321                break;
322 
323             /* Not safe to use inequality operators if the types are different
324              */
325             if (scan_inst->dst.type != inst->src[0].type &&
326                 inst->conditional_mod != BRW_CONDITIONAL_Z &&
327                 inst->conditional_mod != BRW_CONDITIONAL_NZ)
328                break;
329 
330             /* Comparisons operate differently for ints and floats */
331             if (scan_inst->dst.type != inst->dst.type) {
332                /* Comparison result may be altered if the bit-size changes
333                 * since that affects range, denorms, etc
334                 */
335                if (type_sz(scan_inst->dst.type) != type_sz(inst->dst.type))
336                   break;
337 
338                /* We should propagate from a MOV to another instruction in a
339                 * sequence like:
340                 *
341                 *    and(16)         g31<1>UD       g20<8,8,1>UD   g22<8,8,1>UD
342                 *    mov.nz.f0(16)   null<1>F       g31<8,8,1>D
343                 */
344                if (inst->opcode == BRW_OPCODE_MOV) {
345                   if ((inst->src[0].type != BRW_REGISTER_TYPE_D &&
346                        inst->src[0].type != BRW_REGISTER_TYPE_UD) ||
347                       (scan_inst->dst.type != BRW_REGISTER_TYPE_D &&
348                        scan_inst->dst.type != BRW_REGISTER_TYPE_UD)) {
349                      break;
350                   }
351                } else if (brw_reg_type_is_floating_point(scan_inst->dst.type) !=
352                           brw_reg_type_is_floating_point(inst->dst.type)) {
353                   break;
354                }
355             }
356 
357             /* Knowing following:
358              * - CMP writes to flag register the result of
359              *   applying cmod to the `src0 - src1`.
360              *   After that it stores the same value to dst.
361              *   Other instructions first store their result to
362              *   dst, and then store cmod(dst) to the flag
363              *   register.
364              * - inst is either CMP or MOV
365              * - inst->dst is null
366              * - inst->src[0] overlaps with scan_inst->dst
367              * - inst->src[1] is zero
368              * - scan_inst wrote to a flag register
369              *
370              * There can be three possible paths:
371              *
372              * - scan_inst is CMP:
373              *
374              *   Considering that src0 is either 0x0 (false),
375              *   or 0xffffffff (true), and src1 is 0x0:
376              *
377              *   - If inst's cmod is NZ, we can always remove
378              *     scan_inst: NZ is invariant for false and true. This
379              *     holds even if src0 is NaN: .nz is the only cmod,
380              *     that returns true for NaN.
381              *
382              *   - .g is invariant if src0 has a UD type
383              *
384              *   - .l is invariant if src0 has a D type
385              *
386              * - scan_inst and inst have the same cmod:
387              *
388              *   If scan_inst is anything than CMP, it already
389              *   wrote the appropriate value to the flag register.
390              *
391              * - else:
392              *
393              *   We can change cmod of scan_inst to that of inst,
394              *   and remove inst. It is valid as long as we make
395              *   sure that no instruction uses the flag register
396              *   between scan_inst and inst.
397              */
398             if (!inst->src[0].negate &&
399                 scan_inst->flags_written()) {
400                if (scan_inst->opcode == BRW_OPCODE_CMP) {
401                   if ((inst->conditional_mod == BRW_CONDITIONAL_NZ) ||
402                       (inst->conditional_mod == BRW_CONDITIONAL_G &&
403                        inst->src[0].type == BRW_REGISTER_TYPE_UD) ||
404                       (inst->conditional_mod == BRW_CONDITIONAL_L &&
405                        inst->src[0].type == BRW_REGISTER_TYPE_D)) {
406                      inst->remove(block);
407                      progress = true;
408                      break;
409                   }
410                } else if (scan_inst->conditional_mod == inst->conditional_mod) {
411                   inst->remove(block);
412                   progress = true;
413                   break;
414                } else if (!read_flag) {
415                   scan_inst->conditional_mod = inst->conditional_mod;
416                   inst->remove(block);
417                   progress = true;
418                   break;
419                }
420             }
421 
422             /* The conditional mod of the CMP/CMPN instructions behaves
423              * specially because the flag output is not calculated from the
424              * result of the instruction, but the other way around, which
425              * means that even if the condmod to propagate and the condmod
426              * from the CMP instruction are the same they will in general give
427              * different results because they are evaluated based on different
428              * inputs.
429              */
430             if (scan_inst->opcode == BRW_OPCODE_CMP ||
431                 scan_inst->opcode == BRW_OPCODE_CMPN)
432                break;
433 
434             /* From the Sky Lake PRM, Vol 2a, "Multiply":
435              *
436              *    "When multiplying integer data types, if one of the sources
437              *     is a DW, the resulting full precision data is stored in
438              *     the accumulator. However, if the destination data type is
439              *     either W or DW, the low bits of the result are written to
440              *     the destination register and the remaining high bits are
441              *     discarded. This results in undefined Overflow and Sign
442              *     flags. Therefore, conditional modifiers and saturation
443              *     (.sat) cannot be used in this case."
444              *
445              * We just disallow cmod propagation on all integer multiplies.
446              */
447             if (!brw_reg_type_is_floating_point(scan_inst->dst.type) &&
448                 scan_inst->opcode == BRW_OPCODE_MUL)
449                break;
450 
451             enum brw_conditional_mod cond =
452                inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
453                                    : inst->conditional_mod;
454 
455             /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods":
456              *
457              *    * Note that the [post condition signal] bits generated at
458              *      the output of a compute are before the .sat.
459              *
460              * This limits the cases where we can propagate the conditional
461              * modifier.  If scan_inst has a saturate modifier, then we can
462              * only propagate from inst if inst is 'scan_inst <= 0',
463              * 'scan_inst == 0', 'scan_inst != 0', or 'scan_inst > 0'.  If
464              * inst is 'scan_inst == 0', the conditional modifier must be
465              * replace with LE.  Likewise, if inst is 'scan_inst != 0', the
466              * conditional modifier must be replace with G.
467              *
468              * The only other cases are 'scan_inst < 0' (which is a
469              * contradiction) and 'scan_inst >= 0' (which is a tautology).
470              */
471             if (scan_inst->saturate) {
472                if (scan_inst->dst.type != BRW_REGISTER_TYPE_F)
473                   break;
474 
475                if (cond != BRW_CONDITIONAL_Z &&
476                    cond != BRW_CONDITIONAL_NZ &&
477                    cond != BRW_CONDITIONAL_LE &&
478                    cond != BRW_CONDITIONAL_G)
479                   break;
480 
481                if (inst->opcode != BRW_OPCODE_MOV &&
482                    inst->opcode != BRW_OPCODE_CMP)
483                   break;
484 
485                /* inst->src[1].is_zero() was tested before, but be safe
486                 * against possible future changes in this code.
487                 */
488                assert(inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero());
489 
490                if (cond == BRW_CONDITIONAL_Z)
491                   cond = BRW_CONDITIONAL_LE;
492                else if (cond == BRW_CONDITIONAL_NZ)
493                   cond = BRW_CONDITIONAL_G;
494             }
495 
496             /* Otherwise, try propagating the conditional. */
497             if (scan_inst->can_do_cmod() &&
498                 ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
499                  scan_inst->conditional_mod == cond)) {
500                scan_inst->conditional_mod = cond;
501                scan_inst->flag_subreg = inst->flag_subreg;
502                inst->remove(block);
503                progress = true;
504             }
505             break;
506          }
507 
508          if ((scan_inst->flags_written() & flags_written) != 0)
509             break;
510 
511          read_flag = read_flag ||
512                      (scan_inst->flags_read(devinfo) & flags_written) != 0;
513       }
514    }
515 
516    return progress;
517 }
518 
519 bool
opt_cmod_propagation()520 fs_visitor::opt_cmod_propagation()
521 {
522    bool progress = false;
523 
524    foreach_block_reverse(block, cfg) {
525       progress = opt_cmod_propagation_local(devinfo, block) || progress;
526    }
527 
528    if (progress)
529       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
530 
531    return progress;
532 }
533