1 /*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_fs.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27
28 /** @file brw_fs_cmod_propagation.cpp
29 *
30 * Implements a pass that propagates the conditional modifier from a CMP x 0.0
31 * instruction into the instruction that generated x. For instance, in this
32 * sequence
33 *
34 * add(8) g70<1>F g69<8,8,1>F 4096F
35 * cmp.ge.f0(8) null g70<8,8,1>F 0F
36 *
37 * we can do the comparison as part of the ADD instruction directly:
38 *
39 * add.ge.f0(8) g70<1>F g69<8,8,1>F 4096F
40 *
41 * If there had been a use of the flag register and another CMP using g70
42 *
43 * add.ge.f0(8) g70<1>F g69<8,8,1>F 4096F
44 * (+f0) sel(8) g71<1>F g72<8,8,1>F g73<8,8,1>F
45 * cmp.ge.f0(8) null g70<8,8,1>F 0F
46 *
47 * we can recognize that the CMP is generating the flag value that already
48 * exists and therefore remove the instruction.
49 */
50
51 using namespace brw;
52
53 static bool
cmod_propagate_cmp_to_add(const intel_device_info * devinfo,bblock_t * block,fs_inst * inst)54 cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block,
55 fs_inst *inst)
56 {
57 bool read_flag = false;
58 const unsigned flags_written = inst->flags_written(devinfo);
59
60 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
61 if (scan_inst->opcode == BRW_OPCODE_ADD &&
62 !scan_inst->is_partial_write() &&
63 scan_inst->exec_size == inst->exec_size) {
64 bool negate;
65
66 /* A CMP is basically a subtraction. The result of the
67 * subtraction must be the same as the result of the addition.
68 * This means that one of the operands must be negated. So (a +
69 * b) vs (a == -b) or (a + -b) vs (a == b).
70 */
71 if ((inst->src[0].equals(scan_inst->src[0]) &&
72 inst->src[1].negative_equals(scan_inst->src[1])) ||
73 (inst->src[0].equals(scan_inst->src[1]) &&
74 inst->src[1].negative_equals(scan_inst->src[0]))) {
75 negate = false;
76 } else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
77 inst->src[1].equals(scan_inst->src[1])) ||
78 (inst->src[0].negative_equals(scan_inst->src[1]) &&
79 inst->src[1].equals(scan_inst->src[0]))) {
80 negate = true;
81 } else {
82 goto not_match;
83 }
84
85 /* If the scan instruction writes a different flag register than the
86 * instruction we're trying to propagate from, bail.
87 *
88 * FINISHME: The second part of the condition may be too strong.
89 * Perhaps (scan_inst->flags_written() & flags_written) !=
90 * flags_written?
91 */
92 if (scan_inst->flags_written(devinfo) != 0 &&
93 scan_inst->flags_written(devinfo) != flags_written)
94 goto not_match;
95
96 /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
97 *
98 * * Note that the [post condition signal] bits generated at
99 * the output of a compute are before the .sat.
100 *
101 * Paragraph about post_zero does not mention saturation, but
102 * testing it on actual GPUs shows that conditional modifiers
103 * are applied after saturation.
104 *
105 * * post_zero bit: This bit reflects whether the final
106 * result is zero after all the clamping, normalizing,
107 * or format conversion logic.
108 *
109 * For signed types we don't care about saturation: it won't
110 * change the result of conditional modifier.
111 *
112 * For floating and unsigned types there two special cases,
113 * when we can remove inst even if scan_inst is saturated: G
114 * and LE. Since conditional modifiers are just comparations
115 * against zero, saturating positive values to the upper
116 * limit never changes the result of comparation.
117 *
118 * For negative values:
119 * (sat(x) > 0) == (x > 0) --- false
120 * (sat(x) <= 0) == (x <= 0) --- true
121 */
122 const enum brw_conditional_mod cond =
123 negate ? brw_swap_cmod(inst->conditional_mod)
124 : inst->conditional_mod;
125
126 if (scan_inst->saturate &&
127 (brw_reg_type_is_floating_point(scan_inst->dst.type) ||
128 brw_reg_type_is_unsigned_integer(scan_inst->dst.type)) &&
129 (cond != BRW_CONDITIONAL_G &&
130 cond != BRW_CONDITIONAL_LE))
131 goto not_match;
132
133 /* Otherwise, try propagating the conditional. */
134 if (scan_inst->can_do_cmod() &&
135 ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
136 scan_inst->conditional_mod == cond)) {
137 scan_inst->conditional_mod = cond;
138 scan_inst->flag_subreg = inst->flag_subreg;
139 inst->remove(block, true);
140 return true;
141 }
142 break;
143 }
144
145 not_match:
146 if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
147 break;
148
149 read_flag = read_flag ||
150 (scan_inst->flags_read(devinfo) & flags_written) != 0;
151 }
152
153 return false;
154 }
155
156 /**
157 * Propagate conditional modifiers from NOT instructions
158 *
159 * Attempt to convert sequences like
160 *
161 * or(8) g78<8,8,1> g76<8,8,1>UD g77<8,8,1>UD
162 * ...
163 * not.nz.f0(8) null g78<8,8,1>UD
164 *
165 * into
166 *
167 * or.z.f0(8) g78<8,8,1> g76<8,8,1>UD g77<8,8,1>UD
168 */
169 static bool
cmod_propagate_not(const intel_device_info * devinfo,bblock_t * block,fs_inst * inst)170 cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block,
171 fs_inst *inst)
172 {
173 const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
174 bool read_flag = false;
175 const unsigned flags_written = inst->flags_written(devinfo);
176
177 if (cond != BRW_CONDITIONAL_Z && cond != BRW_CONDITIONAL_NZ)
178 return false;
179
180 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
181 if (regions_overlap(scan_inst->dst, scan_inst->size_written,
182 inst->src[0], inst->size_read(0))) {
183 if (scan_inst->opcode != BRW_OPCODE_OR &&
184 scan_inst->opcode != BRW_OPCODE_AND)
185 break;
186
187 if (scan_inst->is_partial_write() ||
188 scan_inst->dst.offset != inst->src[0].offset ||
189 scan_inst->exec_size != inst->exec_size)
190 break;
191
192 /* If the scan instruction writes a different flag register than the
193 * instruction we're trying to propagate from, bail.
194 *
195 * FINISHME: The second part of the condition may be too strong.
196 * Perhaps (scan_inst->flags_written() & flags_written) !=
197 * flags_written?
198 */
199 if (scan_inst->flags_written(devinfo) != 0 &&
200 scan_inst->flags_written(devinfo) != flags_written)
201 break;
202
203 if (scan_inst->can_do_cmod() &&
204 ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
205 scan_inst->conditional_mod == cond)) {
206 scan_inst->conditional_mod = cond;
207 scan_inst->flag_subreg = inst->flag_subreg;
208 inst->remove(block, true);
209 return true;
210 }
211 break;
212 }
213
214 if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
215 break;
216
217 read_flag = read_flag ||
218 (scan_inst->flags_read(devinfo) & flags_written) != 0;
219 }
220
221 return false;
222 }
223
224 static bool
opt_cmod_propagation_local(const intel_device_info * devinfo,bblock_t * block)225 opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
226 {
227 bool progress = false;
228 int ip = block->end_ip + 1;
229
230 foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
231 ip--;
232
233 if ((inst->opcode != BRW_OPCODE_AND &&
234 inst->opcode != BRW_OPCODE_CMP &&
235 inst->opcode != BRW_OPCODE_MOV &&
236 inst->opcode != BRW_OPCODE_NOT) ||
237 inst->predicate != BRW_PREDICATE_NONE ||
238 !inst->dst.is_null() ||
239 (inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
240 inst->src[0].file != UNIFORM))
241 continue;
242
243 /* An ABS source modifier can only be handled when processing a compare
244 * with a value other than zero.
245 */
246 if (inst->src[0].abs &&
247 (inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero()))
248 continue;
249
250 /* Only an AND.NZ can be propagated. Many AND.Z instructions are
251 * generated (for ir_unop_not in fs_visitor::emit_bool_to_cond_code).
252 * Propagating those would require inverting the condition on the CMP.
253 * This changes both the flag value and the register destination of the
254 * CMP. That result may be used elsewhere, so we can't change its value
255 * on a whim.
256 */
257 if (inst->opcode == BRW_OPCODE_AND &&
258 !(inst->src[1].is_one() &&
259 inst->conditional_mod == BRW_CONDITIONAL_NZ &&
260 !inst->src[0].negate))
261 continue;
262
263 /* A CMP with a second source of zero can match with anything. A CMP
264 * with a second source that is not zero can only match with an ADD
265 * instruction.
266 *
267 * Only apply this optimization to float-point sources. It can fail for
268 * integers. For inputs a = 0x80000000, b = 4, int(0x80000000) < 4, but
269 * int(0x80000000) - 4 overflows and results in 0x7ffffffc. that's not
270 * less than zero, so the flags get set differently than for (a < b).
271 */
272 if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
273 if (brw_reg_type_is_floating_point(inst->src[0].type) &&
274 cmod_propagate_cmp_to_add(devinfo, block, inst))
275 progress = true;
276
277 continue;
278 }
279
280 if (inst->opcode == BRW_OPCODE_NOT) {
281 progress = cmod_propagate_not(devinfo, block, inst) || progress;
282 continue;
283 }
284
285 bool read_flag = false;
286 const unsigned flags_written = inst->flags_written(devinfo);
287 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
288 if (regions_overlap(scan_inst->dst, scan_inst->size_written,
289 inst->src[0], inst->size_read(0))) {
290 /* If the scan instruction writes a different flag register than
291 * the instruction we're trying to propagate from, bail.
292 *
293 * FINISHME: The second part of the condition may be too strong.
294 * Perhaps (scan_inst->flags_written() & flags_written) !=
295 * flags_written?
296 */
297 if (scan_inst->flags_written(devinfo) != 0 &&
298 scan_inst->flags_written(devinfo) != flags_written)
299 break;
300
301 if (scan_inst->is_partial_write() ||
302 scan_inst->dst.offset != inst->src[0].offset ||
303 scan_inst->exec_size != inst->exec_size)
304 break;
305
306 /* CMP's result is the same regardless of dest type. */
307 if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
308 scan_inst->opcode == BRW_OPCODE_CMP &&
309 brw_reg_type_is_integer(inst->dst.type)) {
310 inst->remove(block, true);
311 progress = true;
312 break;
313 }
314
315 /* If the AND wasn't handled by the previous case, it isn't safe
316 * to remove it.
317 */
318 if (inst->opcode == BRW_OPCODE_AND)
319 break;
320
321 if (inst->opcode == BRW_OPCODE_MOV) {
322 if (brw_reg_type_is_floating_point(scan_inst->dst.type)) {
323 /* If the destination type of scan_inst is floating-point,
324 * then:
325 *
326 * - The source of the MOV instruction must be the same
327 * type.
328 *
329 * - The destination of the MOV instruction must be float
330 * point with a size at least as large as the destination
331 * of inst. Size-reducing f2f conversions could cause
332 * non-zero values to become zero, etc.
333 */
334 if (scan_inst->dst.type != inst->src[0].type)
335 break;
336
337 if (!brw_reg_type_is_floating_point(inst->dst.type))
338 break;
339
340 if (type_sz(scan_inst->dst.type) > type_sz(inst->dst.type))
341 break;
342 } else {
343 /* If the destination type of scan_inst is integer, then:
344 *
345 * - The source of the MOV instruction must be integer with
346 * the same size.
347 *
348 * - If the conditional modifier is Z or NZ, then the
349 * destination type of inst must either be floating point
350 * (of any size) or integer with a size at least as large
351 * as the destination of inst.
352 *
353 * - If the conditional modifier is neither Z nor NZ, then the
354 * destination type of inst must either be floating point
355 * (of any size) or integer with a size at least as large
356 * as the destination of inst and the same signedness.
357 */
358 if (!brw_reg_type_is_integer(inst->src[0].type) ||
359 type_sz(scan_inst->dst.type) != type_sz(inst->src[0].type))
360 break;
361
362 if (brw_reg_type_is_integer(inst->dst.type)) {
363 if (type_sz(inst->dst.type) < type_sz(scan_inst->dst.type))
364 break;
365
366 if (inst->conditional_mod != BRW_CONDITIONAL_Z &&
367 inst->conditional_mod != BRW_CONDITIONAL_NZ &&
368 brw_reg_type_is_unsigned_integer(inst->dst.type) !=
369 brw_reg_type_is_unsigned_integer(scan_inst->dst.type))
370 break;
371 }
372 }
373 } else {
374 /* Not safe to use inequality operators if the types are
375 * different.
376 */
377 if (scan_inst->dst.type != inst->src[0].type &&
378 inst->conditional_mod != BRW_CONDITIONAL_Z &&
379 inst->conditional_mod != BRW_CONDITIONAL_NZ)
380 break;
381
382 /* Comparisons operate differently for ints and floats */
383 if (scan_inst->dst.type != inst->dst.type) {
384 /* Comparison result may be altered if the bit-size changes
385 * since that affects range, denorms, etc
386 */
387 if (type_sz(scan_inst->dst.type) != type_sz(inst->dst.type))
388 break;
389
390 if (brw_reg_type_is_floating_point(scan_inst->dst.type) !=
391 brw_reg_type_is_floating_point(inst->dst.type))
392 break;
393 }
394 }
395
396 /* Knowing following:
397 * - CMP writes to flag register the result of
398 * applying cmod to the `src0 - src1`.
399 * After that it stores the same value to dst.
400 * Other instructions first store their result to
401 * dst, and then store cmod(dst) to the flag
402 * register.
403 * - inst is either CMP or MOV
404 * - inst->dst is null
405 * - inst->src[0] overlaps with scan_inst->dst
406 * - inst->src[1] is zero
407 * - scan_inst wrote to a flag register
408 *
409 * There can be three possible paths:
410 *
411 * - scan_inst is CMP:
412 *
413 * Considering that src0 is either 0x0 (false),
414 * or 0xffffffff (true), and src1 is 0x0:
415 *
416 * - If inst's cmod is NZ, we can always remove
417 * scan_inst: NZ is invariant for false and true. This
418 * holds even if src0 is NaN: .nz is the only cmod,
419 * that returns true for NaN.
420 *
421 * - .g is invariant if src0 has a UD type
422 *
423 * - .l is invariant if src0 has a D type
424 *
425 * - scan_inst and inst have the same cmod:
426 *
427 * If scan_inst is anything than CMP, it already
428 * wrote the appropriate value to the flag register.
429 *
430 * - else:
431 *
432 * We can change cmod of scan_inst to that of inst,
433 * and remove inst. It is valid as long as we make
434 * sure that no instruction uses the flag register
435 * between scan_inst and inst.
436 */
437 if (!inst->src[0].negate &&
438 scan_inst->flags_written(devinfo)) {
439 if (scan_inst->opcode == BRW_OPCODE_CMP) {
440 if ((inst->conditional_mod == BRW_CONDITIONAL_NZ) ||
441 (inst->conditional_mod == BRW_CONDITIONAL_G &&
442 inst->src[0].type == BRW_REGISTER_TYPE_UD) ||
443 (inst->conditional_mod == BRW_CONDITIONAL_L &&
444 inst->src[0].type == BRW_REGISTER_TYPE_D)) {
445 inst->remove(block, true);
446 progress = true;
447 break;
448 }
449 } else if (scan_inst->conditional_mod == inst->conditional_mod) {
450 /* On Gfx4 and Gfx5 sel.cond will dirty the flags, but the
451 * flags value is not based on the result stored in the
452 * destination. On all other platforms sel.cond will not
453 * write the flags, so execution will not get to this point.
454 */
455 if (scan_inst->opcode == BRW_OPCODE_SEL) {
456 assert(devinfo->ver <= 5);
457 } else {
458 inst->remove(block, true);
459 progress = true;
460 }
461
462 break;
463 } else if (!read_flag && scan_inst->can_do_cmod()) {
464 scan_inst->conditional_mod = inst->conditional_mod;
465 scan_inst->flag_subreg = inst->flag_subreg;
466 inst->remove(block, true);
467 progress = true;
468 break;
469 }
470 }
471
472 /* The conditional mod of the CMP/CMPN instructions behaves
473 * specially because the flag output is not calculated from the
474 * result of the instruction, but the other way around, which
475 * means that even if the condmod to propagate and the condmod
476 * from the CMP instruction are the same they will in general give
477 * different results because they are evaluated based on different
478 * inputs.
479 */
480 if (scan_inst->opcode == BRW_OPCODE_CMP ||
481 scan_inst->opcode == BRW_OPCODE_CMPN)
482 break;
483
484 /* From the Sky Lake PRM, Vol 2a, "Multiply":
485 *
486 * "When multiplying integer data types, if one of the sources
487 * is a DW, the resulting full precision data is stored in
488 * the accumulator. However, if the destination data type is
489 * either W or DW, the low bits of the result are written to
490 * the destination register and the remaining high bits are
491 * discarded. This results in undefined Overflow and Sign
492 * flags. Therefore, conditional modifiers and saturation
493 * (.sat) cannot be used in this case."
494 *
495 * We just disallow cmod propagation on all integer multiplies.
496 */
497 if (!brw_reg_type_is_floating_point(scan_inst->dst.type) &&
498 scan_inst->opcode == BRW_OPCODE_MUL)
499 break;
500
501 enum brw_conditional_mod cond =
502 inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
503 : inst->conditional_mod;
504
505 /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
506 *
507 * * Note that the [post condition signal] bits generated at
508 * the output of a compute are before the .sat.
509 *
510 * Paragraph about post_zero does not mention saturation, but
511 * testing it on actual GPUs shows that conditional modifiers are
512 * applied after saturation.
513 *
514 * * post_zero bit: This bit reflects whether the final
515 * result is zero after all the clamping, normalizing,
516 * or format conversion logic.
517 *
518 * For this reason, no additional restrictions are necessary on
519 * instructions with saturate.
520 */
521
522 /* Otherwise, try propagating the conditional. */
523 if (scan_inst->can_do_cmod() &&
524 ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
525 scan_inst->conditional_mod == cond)) {
526 scan_inst->conditional_mod = cond;
527 scan_inst->flag_subreg = inst->flag_subreg;
528 inst->remove(block, true);
529 progress = true;
530 }
531 break;
532 }
533
534 if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
535 break;
536
537 read_flag = read_flag ||
538 (scan_inst->flags_read(devinfo) & flags_written) != 0;
539 }
540 }
541
542 /* There is progress if and only if instructions were removed. */
543 assert(progress == (block->end_ip_delta != 0));
544
545 return progress;
546 }
547
548 bool
opt_cmod_propagation()549 fs_visitor::opt_cmod_propagation()
550 {
551 bool progress = false;
552
553 foreach_block_reverse(block, cfg) {
554 progress = opt_cmod_propagation_local(devinfo, block) || progress;
555 }
556
557 if (progress) {
558 cfg->adjust_block_ips();
559
560 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
561 }
562
563 return progress;
564 }
565