/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_instr_set.h"
#include "nir_search_helpers.h"
#include "nir_builder.h"
#include "util/u_vector.h"

/* Partial redundancy elimination of compares
 *
 * Searches for comparisons of the form 'a cmp b' that dominate arithmetic
 * instructions like 'b - a'.  The comparison is replaced by the arithmetic
 * instruction, and the result is compared with zero.  For example,
 *
 *       vec1 32 ssa_111 = flt 0.37, ssa_110.w
 *       if ssa_111 {
 *               block block_1:
 *              vec1 32 ssa_112 = fadd ssa_110.w, -0.37
 *              ...
 *
 * becomes
 *
 *       vec1 32 ssa_111 = fadd ssa_110.w, -0.37
 *       vec1 32 ssa_112 = flt 0.0, ssa_111
 *       if ssa_112 {
 *               block block_1:
 *              ...
 */
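
/* Typical usage (an illustrative sketch, not taken verbatim from any
 * driver): like other NIR optimizations, this pass reports whether it made
 * progress, so callers normally run it in an optimization loop until the
 * shader reaches a fixed point.
 *
 *    bool progress;
 *    do {
 *       progress = false;
 *       progress |= nir_opt_comparison_pre(shader);
 *       // ... other nir_opt_* passes ...
 *    } while (progress);
 */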

struct block_queue {
   /**
    * Stack of blocks from the current location in the CFG to the entry point
    * of the function.
    *
    * This is sort of a poor man's dominator tree.
    */
   struct exec_list blocks;

   /** List of freed block_instructions structures that can be reused. */
   struct exec_list reusable_blocks;
};

struct block_instructions {
   struct exec_node node;

   /**
    * Set of comparison instructions from the block that are candidates for
    * being replaced by add instructions.
    */
   struct u_vector instructions;
};
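
/* Note on how these structures are used: comparison_pre_block() pushes a
 * block_instructions entry when it enters a block and pops it after
 * recursing into the block's dominator-tree children, so bq->blocks always
 * holds exactly the blocks on the path from the function entry to the
 * current block.
 */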

static void
block_queue_init(struct block_queue *bq)
{
   exec_list_make_empty(&bq->blocks);
   exec_list_make_empty(&bq->reusable_blocks);
}

static void
block_queue_finish(struct block_queue *bq)
{
   struct block_instructions *n;

   while ((n = (struct block_instructions *) exec_list_pop_head(&bq->blocks)) != NULL) {
      u_vector_finish(&n->instructions);
      free(n);
   }

   while ((n = (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks)) != NULL) {
      free(n);
   }
}

static struct block_instructions *
push_block(struct block_queue *bq)
{
   struct block_instructions *bi =
      (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks);

   if (bi == NULL) {
      bi = calloc(1, sizeof(struct block_instructions));

      if (bi == NULL)
         return NULL;
   }

   if (!u_vector_init_pow2(&bi->instructions, 8, sizeof(nir_alu_instr *))) {
      free(bi);
      return NULL;
   }

   exec_list_push_tail(&bq->blocks, &bi->node);

   return bi;
}

static void
pop_block(struct block_queue *bq, struct block_instructions *bi)
{
   u_vector_finish(&bi->instructions);
   exec_node_remove(&bi->node);
   exec_list_push_head(&bq->reusable_blocks, &bi->node);
}

static void
add_instruction_for_block(struct block_instructions *bi,
                          nir_alu_instr *alu)
{
   nir_alu_instr **data =
      u_vector_add(&bi->instructions);

   *data = alu;
}

static void
rewrite_compare_instruction(nir_builder *bld, nir_alu_instr *orig_cmp,
                            nir_alu_instr *orig_add, bool zero_on_left)
{
   bld->cursor = nir_before_instr(&orig_cmp->instr);

   /* This is somewhat tricky.  The compare instruction may be something like
    * (fcmp, a, b) while the add instruction is something like (fadd, fneg(a),
    * b).  This is problematic because the SSA value for the fneg(a) may not
    * exist yet at the compare instruction.
    *
    * We fabricate the operands of the new add.  This is done using
    * information provided by zero_on_left.  If zero_on_left is true, we know
    * the resulting compare instruction is (fcmp, 0.0, (fadd, x, y)).  If the
    * original compare instruction was (fcmp, a, b), x = b and y = -a.  If
    * zero_on_left is false, the resulting compare instruction is (fcmp,
    * (fadd, x, y), 0.0) and x = a and y = -b.
    */
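   /* Concrete example: if the original compare is (flt, a, b) and
    * zero_on_left is true, this builds fadd = (fadd, b, fneg(a)) and
    * cmp = (flt, 0.0, fadd).
    */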
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, orig_cmp, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, orig_cmp, 1);

   nir_ssa_def *const fadd = zero_on_left
      ? nir_fadd(bld, b, nir_fneg(bld, a))
      : nir_fadd(bld, a, nir_fneg(bld, b));

   nir_ssa_def *const zero =
      nir_imm_floatN_t(bld, 0.0, orig_add->dest.dest.ssa.bit_size);

   nir_ssa_def *const cmp = zero_on_left
      ? nir_build_alu(bld, orig_cmp->op, zero, fadd, NULL, NULL)
      : nir_build_alu(bld, orig_cmp->op, fadd, zero, NULL, NULL);

   /* Generating extra moves of the results is the easy way to make sure the
    * writemasks match the original instructions.  Later optimization passes
    * will clean these up.  This is similar to nir_replace_instr (in
    * nir_search.c).
    */
   nir_alu_instr *mov_add = nir_alu_instr_create(bld->shader, nir_op_mov);
   mov_add->dest.write_mask = orig_add->dest.write_mask;
   nir_ssa_dest_init(&mov_add->instr, &mov_add->dest.dest,
                     orig_add->dest.dest.ssa.num_components,
                     orig_add->dest.dest.ssa.bit_size, NULL);
   mov_add->src[0].src = nir_src_for_ssa(fadd);

   nir_builder_instr_insert(bld, &mov_add->instr);

   nir_alu_instr *mov_cmp = nir_alu_instr_create(bld->shader, nir_op_mov);
   mov_cmp->dest.write_mask = orig_cmp->dest.write_mask;
   nir_ssa_dest_init(&mov_cmp->instr, &mov_cmp->dest.dest,
                     orig_cmp->dest.dest.ssa.num_components,
                     orig_cmp->dest.dest.ssa.bit_size, NULL);
   mov_cmp->src[0].src = nir_src_for_ssa(cmp);

   nir_builder_instr_insert(bld, &mov_cmp->instr);

   nir_ssa_def_rewrite_uses(&orig_cmp->dest.dest.ssa,
                            &mov_cmp->dest.dest.ssa);
   nir_ssa_def_rewrite_uses(&orig_add->dest.dest.ssa,
                            &mov_add->dest.dest.ssa);

   /* We know these have no more uses because we just rewrote them all, so we
    * can remove them.
    */
   nir_instr_remove(&orig_cmp->instr);
   nir_instr_remove(&orig_add->instr);
}

static bool
comparison_pre_block(nir_block *block, struct block_queue *bq, nir_builder *bld)
{
   bool progress = false;

   struct block_instructions *bi = push_block(bq);
   if (bi == NULL)
      return false;

   /* Starting with the current block, examine each instruction.  If the
    * instruction is a comparison that matches the '±a cmp ±b' pattern, add it
    * to the block_instructions::instructions set.  If the instruction is an
    * add instruction, walk up the block queue looking at the stored
    * instructions.  If a matching comparison is found, move the addition and
    * replace the comparison with a different comparison based on the result
    * of the addition.  All of the blocks in the queue are guaranteed to be
    * dominators of the current block.
    *
    * After processing the current block, recurse into the blocks dominated by
    * the current block.
    */
   nir_foreach_instr_safe(instr, block) {
      if (instr->type != nir_instr_type_alu)
         continue;

      nir_alu_instr *const alu = nir_instr_as_alu(instr);

      if (alu->dest.dest.ssa.num_components != 1)
         continue;

      if (alu->dest.saturate)
         continue;

      static const uint8_t swizzle[NIR_MAX_VEC_COMPONENTS] = {0};

      switch (alu->op) {
      case nir_op_fadd: {
         /* If the instruction is fadd, check it against comparison
          * instructions that dominate it.
          */
         struct block_instructions *b =
            (struct block_instructions *) exec_list_get_head_raw(&bq->blocks);

         while (b->node.next != NULL) {
            nir_alu_instr **a;
            bool rewrote_compare = false;

            u_vector_foreach(a, &b->instructions) {
               nir_alu_instr *const cmp = *a;

               if (cmp == NULL)
                  continue;

               /* The operands of both instructions are, with some liberty,
                * commutative.  Check all four permutations.  The third and
                * fourth permutations are negations of the first two.
                */
               if ((nir_alu_srcs_equal(cmp, alu, 0, 0) &&
                    nir_alu_srcs_negative_equal(cmp, alu, 1, 1)) ||
                   (nir_alu_srcs_equal(cmp, alu, 0, 1) &&
                    nir_alu_srcs_negative_equal(cmp, alu, 1, 0))) {
                  /* These are the cases where (A cmp B) matches either (A +
                   * -B) or (-B + A)
                   *
                   *    A cmp B <=> A + -B cmp 0
                   */
                  rewrite_compare_instruction(bld, cmp, alu, false);

                  *a = NULL;
                  rewrote_compare = true;
                  break;
               } else if ((nir_alu_srcs_equal(cmp, alu, 1, 0) &&
                           nir_alu_srcs_negative_equal(cmp, alu, 0, 1)) ||
                          (nir_alu_srcs_equal(cmp, alu, 1, 1) &&
                           nir_alu_srcs_negative_equal(cmp, alu, 0, 0))) {
                  /* This is the case where (A cmp B) matches (B + -A) or (-A
                   * + B).
                   *
                   *    A cmp B <=> 0 cmp B + -A
                   */
                  rewrite_compare_instruction(bld, cmp, alu, true);

                  *a = NULL;
                  rewrote_compare = true;
                  break;
               }
            }

            /* Bail after a compare in the most dominating block is found.
             * This is necessary because 'alu' has been removed from the
             * instruction stream.  Should there be a matching compare in
             * another block, calling rewrite_compare_instruction again will
             * try to operate on a node that is not in the list as if it were
             * in the list.
             *
             * FINISHME: There may be opportunity for additional optimization
             * here.  I discovered this problem due to a shader in Guacamelee.
             * It may be possible to rewrite the matching compares that are
             * encountered later to reuse the result from the compare that was
             * first rewritten.  It's also possible that this is just taken
             * care of by calling the optimization pass repeatedly.
             */
            if (rewrote_compare) {
               progress = true;
               break;
            }

            b = (struct block_instructions *) b->node.next;
         }

         break;
      }

      case nir_op_flt:
      case nir_op_fge:
      case nir_op_fneu:
      case nir_op_feq:
         /* If the instruction is a comparison that is used by an if-statement
          * and neither operand is immediate value 0, add it to the set.
          * Comparisons with zero are skipped, presumably because the rewrite
          * always produces a comparison with zero as one operand, so allowing
          * them would let the pass churn on its own output.
          */
         if (is_used_by_if(alu) &&
             is_not_const_zero(NULL, alu, 0, 1, swizzle) &&
             is_not_const_zero(NULL, alu, 1, 1, swizzle))
            add_instruction_for_block(bi, alu);

         break;

      default:
         break;
      }
   }

   for (unsigned i = 0; i < block->num_dom_children; i++) {
      nir_block *child = block->dom_children[i];

      if (comparison_pre_block(child, bq, bld))
         progress = true;
   }

   pop_block(bq, bi);

   return progress;
}

bool
nir_opt_comparison_pre_impl(nir_function_impl *impl)
{
   struct block_queue bq;
   nir_builder bld;

   block_queue_init(&bq);
   nir_builder_init(&bld, impl);

   nir_metadata_require(impl, nir_metadata_dominance);

   const bool progress =
      comparison_pre_block(nir_start_block(impl), &bq, &bld);

   block_queue_finish(&bq);

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}

bool
nir_opt_comparison_pre(nir_shader *shader)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= nir_opt_comparison_pre_impl(function->impl);
   }

   return progress;
}