1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 /** @file brw_vec4_cmod_propagation.cpp
26  *
27  * Really similar to brw_fs_cmod_propagation but adapted to vec4 needs. Check
28  * brw_fs_cmod_propagation for further details on the rationale behind this
29  * optimization.
30  */
31 
32 #include "brw_vec4.h"
33 #include "brw_cfg.h"
34 #include "brw_eu.h"
35 
36 namespace brw {
37 
38 static bool
writemasks_incompatible(const vec4_instruction * earlier,const vec4_instruction * later)39 writemasks_incompatible(const vec4_instruction *earlier,
40                         const vec4_instruction *later)
41 {
42    return (earlier->dst.writemask != WRITEMASK_X &&
43            earlier->dst.writemask != WRITEMASK_XYZW) ||
44           (earlier->dst.writemask == WRITEMASK_XYZW &&
45            later->src[0].swizzle != BRW_SWIZZLE_XYZW) ||
46           (later->dst.writemask & ~earlier->dst.writemask) != 0;
47 }
48 
/**
 * Run conditional-modifier propagation over a single basic block.
 *
 * The instructions of \p block are visited in reverse.  Each candidate (an
 * AND, CMP or MOV that is unpredicated, writes the null register and reads
 * its first source from a VGRF, ATTR or UNIFORM) starts a backwards scan for
 * an earlier instruction whose result it tests.  When all of the checks below
 * pass, the candidate is removed from the block and, where needed, its
 * conditional modifier is set on the earlier instruction instead.
 *
 * \param block  basic block to process
 * \param v      visitor; used here for devinfo, for constructing a temporary
 *               src_reg and for building a MOV
 * \return true if at least one instruction was removed
 */
static bool
opt_cmod_propagation_local(bblock_t *block, vec4_visitor *v)
{
   bool progress = false;
   /* Starts one past the block's last ip and is decremented once per visited
    * instruction.  It is only updated, never read, within this function.
    */
   int ip = block->end_ip + 1;

   foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) {
      ip--;

      /* Only AND/CMP/MOV instructions that are unpredicated, write to the
       * null register and read src0 from a VGRF, ATTR or UNIFORM are
       * candidates for removal.
       */
      if ((inst->opcode != BRW_OPCODE_AND &&
           inst->opcode != BRW_OPCODE_CMP &&
           inst->opcode != BRW_OPCODE_MOV) ||
          inst->predicate != BRW_PREDICATE_NONE ||
          !inst->dst.is_null() ||
          (inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
           inst->src[0].file != UNIFORM))
         continue;

      /* An ABS source modifier can only be handled when processing a compare
       * with a value other than zero.
       */
      if (inst->src[0].abs &&
          (inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero()))
         continue;

      /* An AND is only considered when its second source is one, its
       * conditional modifier is .nz and its first source is not negated.
       */
      if (inst->opcode == BRW_OPCODE_AND &&
          !(inst->src[1].is_one() &&
            inst->conditional_mod == BRW_CONDITIONAL_NZ &&
            !inst->src[0].negate))
         continue;

      /* A MOV is only considered with the .nz conditional modifier. */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->conditional_mod != BRW_CONDITIONAL_NZ)
         continue;

      /* Becomes true once a scanned instruction reads the flag.  After that,
       * a scan_inst without a conditional modifier is no longer given one;
       * only a scan_inst that already has the wanted modifier can match.
       */
      bool read_flag = false;
      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst, inst) {
         /* A CMP with a second source of zero can match with anything.  A CMP
          * with a second source that is not zero can only match with an ADD
          * instruction.
          */
         if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
            /* Whether the ADD computes the negation of the CMP's implied
             * subtraction, in which case the condition has to be swapped.
             */
            bool negate;

            if (scan_inst->opcode != BRW_OPCODE_ADD)
               goto not_match;

            if (writemasks_incompatible(scan_inst, inst))
               goto not_match;

            /* A CMP is basically a subtraction.  The result of the
             * subtraction must be the same as the result of the addition.
             * This means that one of the operands must be negated.  So (a +
             * b) vs (a == -b) or (a + -b) vs (a == b).
             */
            if ((inst->src[0].equals(scan_inst->src[0]) &&
                 inst->src[1].negative_equals(scan_inst->src[1])) ||
                (inst->src[0].equals(scan_inst->src[1]) &&
                 inst->src[1].negative_equals(scan_inst->src[0]))) {
               negate = false;
            } else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
                        inst->src[1].equals(scan_inst->src[1])) ||
                       (inst->src[0].negative_equals(scan_inst->src[1]) &&
                        inst->src[1].equals(scan_inst->src[0]))) {
               negate = true;
            } else {
               goto not_match;
            }

            /* Both instructions must have the same exec_size and group. */
            if (scan_inst->exec_size != inst->exec_size ||
                scan_inst->group != inst->group)
               goto not_match;

            /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods":
             *
             *    * Note that the [post condition signal] bits generated at
             *      the output of a compute are before the .sat.
             *
             * So we don't have to bail if scan_inst has saturate.
             */

            /* Otherwise, try propagating the conditional. */
            const enum brw_conditional_mod cond =
               negate ? brw_swap_cmod(inst->conditional_mod)
                      : inst->conditional_mod;

            if (scan_inst->can_do_cmod() &&
                ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
                 scan_inst->conditional_mod == cond)) {
               scan_inst->conditional_mod = cond;
               inst->remove(block);
               progress = true;
            }
            /* The scan ends here whether or not the propagation happened. */
            break;
         }

         /* scan_inst writes a region that overlaps the one inst reads
          * through src0.
          */
         if (regions_overlap(inst->src[0], inst->size_read(0),
                             scan_inst->dst, scan_inst->size_written)) {
            /* Give up on a predicated writer (other than SEL), on a
             * mismatched offset, or on a different exec_size or group.
             */
            if ((scan_inst->predicate && scan_inst->opcode != BRW_OPCODE_SEL) ||
                scan_inst->dst.offset != inst->src[0].offset ||
                scan_inst->exec_size != inst->exec_size ||
                scan_inst->group != inst->group) {
               break;
            }

            /* If scan_inst is a CMP that produces a single value and inst is
             * a CMP.NZ that consumes only that value, remove inst.
             */
            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
                (inst->src[0].type == BRW_REGISTER_TYPE_D ||
                 inst->src[0].type == BRW_REGISTER_TYPE_UD) &&
                (inst->opcode == BRW_OPCODE_CMP ||
                 inst->opcode == BRW_OPCODE_MOV) &&
                scan_inst->opcode == BRW_OPCODE_CMP &&
                ((inst->src[0].swizzle == BRW_SWIZZLE_XXXX &&
                  scan_inst->dst.writemask == WRITEMASK_X) ||
                 (inst->src[0].swizzle == BRW_SWIZZLE_YYYY &&
                  scan_inst->dst.writemask == WRITEMASK_Y) ||
                 (inst->src[0].swizzle == BRW_SWIZZLE_ZZZZ &&
                  scan_inst->dst.writemask == WRITEMASK_Z) ||
                 (inst->src[0].swizzle == BRW_SWIZZLE_WWWW &&
                  scan_inst->dst.writemask == WRITEMASK_W))) {
               /* When the two writemasks differ, scan_inst is redirected to
                * a temporary using inst's writemask, and a MOV copies the
                * value back to scan_inst's original destination.
                */
               if (inst->dst.writemask != scan_inst->dst.writemask) {
                  src_reg temp(v, glsl_type::vec4_type, 1);

                  /* Given a sequence like:
                   *
                   *    cmp.ge.f0(8)  g21<1>.zF      g20<4>.xF      g18<4>.xF
                   *    ...
                   *    cmp.nz.f0(8)  null<1>D       g21<4>.zD      0D
                   *
                   * Replace it with something like:
                   *
                   *    cmp.ge.f0(8)  g22<1>.zF      g20<4>.xF      g18<4>.xF
                   *    mov(8)        g21<1>.xF      g22<1>.zzzzF
                   *
                   * The added MOV will most likely be removed later.  In the
                   * worst case, it should be cheaper to schedule.
                   */
                  temp.swizzle = brw_swizzle_for_mask(inst->dst.writemask);
                  temp.type = scan_inst->src[0].type;

                  /* The MOV is built against scan_inst's original
                   * destination, before that destination is replaced below.
                   */
                  vec4_instruction *mov = v->MOV(scan_inst->dst, temp);

                  /* Modify the source swizzles on scan_inst.  If scan_inst
                   * was
                   *
                   *    cmp.ge.f0(8)  g21<1>.zF      g20<4>.wzyxF   g18<4>.yxwzF
                   *
                   * replace it with
                   *
                   *    cmp.ge.f0(8)  g21<1>.zF      g20<4>.yyyyF   g18<4>.wwwwF
                   */
                  unsigned src0_chan;
                  unsigned src1_chan;
                  /* Pick the source channel that feeds the single channel
                   * scan_inst writes.
                   */
                  switch (scan_inst->dst.writemask) {
                  case WRITEMASK_X:
                     src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 0);
                     src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 0);
                     break;
                  case WRITEMASK_Y:
                     src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 1);
                     src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 1);
                     break;
                  case WRITEMASK_Z:
                     src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 2);
                     src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 2);
                     break;
                  case WRITEMASK_W:
                     src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 3);
                     src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 3);
                     break;
                  default:
                     unreachable("Impossible writemask");
                  }

                  scan_inst->src[0].swizzle = BRW_SWIZZLE4(src0_chan,
                                                           src0_chan,
                                                           src0_chan,
                                                           src0_chan);

                  /* There's no swizzle on immediate value sources. */
                  if (scan_inst->src[1].file != IMM) {
                     scan_inst->src[1].swizzle = BRW_SWIZZLE4(src1_chan,
                                                              src1_chan,
                                                              src1_chan,
                                                              src1_chan);
                  }

                  scan_inst->dst = dst_reg(temp);
                  scan_inst->dst.writemask = inst->dst.writemask;

                  scan_inst->insert_after(block, mov);
               }

               inst->remove(block);
               progress = true;
               break;
            }

            if (writemasks_incompatible(scan_inst, inst))
               break;

            /* CMP's result is the same regardless of dest type. */
            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
                scan_inst->opcode == BRW_OPCODE_CMP &&
                (inst->dst.type == BRW_REGISTER_TYPE_D ||
                 inst->dst.type == BRW_REGISTER_TYPE_UD)) {
               inst->remove(block);
               progress = true;
               break;
            }

            /* If the AND wasn't handled by the previous case, it isn't safe
             * to remove it.
             */
            if (inst->opcode == BRW_OPCODE_AND)
               break;

            /* Comparisons operate differently for ints and floats */
            if (scan_inst->dst.type != inst->dst.type &&
                (scan_inst->dst.type == BRW_REGISTER_TYPE_F ||
                 inst->dst.type == BRW_REGISTER_TYPE_F))
               break;

            /* If the instruction generating inst's source also wrote the
             * flag, and inst is doing a simple .nz comparison, then inst
             * is redundant - the appropriate value is already in the flag
             * register.  Delete inst.
             */
            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
                !inst->src[0].negate &&
                scan_inst->writes_flag(v->devinfo)) {
               inst->remove(block);
               progress = true;
               break;
            }

            /* The conditional mod of the CMP/CMPN instructions behaves
             * specially because the flag output is not calculated from the
             * result of the instruction, but the other way around, which
             * means that even if the condmod to propagate and the condmod
             * from the CMP instruction are the same they will in general give
             * different results because they are evaluated based on different
             * inputs.
             */
            if (scan_inst->opcode == BRW_OPCODE_CMP ||
                scan_inst->opcode == BRW_OPCODE_CMPN)
               break;

            /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods":
             *
             *    * Note that the [post condition signal] bits generated at
             *      the output of a compute are before the .sat.
             */
            if (scan_inst->saturate)
               break;

            /* From the Sky Lake PRM, Vol 2a, "Multiply":
             *
             *    "When multiplying integer data types, if one of the sources
             *    is a DW, the resulting full precision data is stored in
             *    the accumulator. However, if the destination data type is
             *    either W or DW, the low bits of the result are written to
             *    the destination register and the remaining high bits are
             *    discarded. This results in undefined Overflow and Sign
             *    flags. Therefore, conditional modifiers and saturation
             *    (.sat) cannot be used in this case.
             *
             * We just disallow cmod propagation on all integer multiplies.
             */
            if (!brw_reg_type_is_floating_point(scan_inst->dst.type) &&
                scan_inst->opcode == BRW_OPCODE_MUL)
               break;

            /* Otherwise, try propagating the conditional. */
            enum brw_conditional_mod cond =
               inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
                                   : inst->conditional_mod;

            if (scan_inst->can_do_cmod() &&
                ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
                 scan_inst->conditional_mod == cond)) {
               scan_inst->conditional_mod = cond;
               inst->remove(block);
               progress = true;
            }
            /* The first overlapping writer ends the scan in every case. */
            break;
         }

      not_match:
         /* scan_inst was not usable for inst.  Stop scanning at any
          * instruction that writes the flag; otherwise remember whether the
          * flag has been read and keep scanning.
          */
         if (scan_inst->writes_flag(v->devinfo))
            break;

         read_flag = read_flag || scan_inst->reads_flag();
      }
   }

   return progress;
}
349 
350 bool
opt_cmod_propagation()351 vec4_visitor::opt_cmod_propagation()
352 {
353    bool progress = false;
354 
355    foreach_block_reverse(block, cfg) {
356       progress = opt_cmod_propagation_local(block, this) || progress;
357    }
358 
359    if (progress)
360       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
361 
362    return progress;
363 }
364 
365 } /* namespace brw */
366