/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2019 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"  /* FIXME: for insn_data */
#include "params.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"


/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   FINAL_P is true if we have vectorized the instance or if we have
   made a final decision not to vectorize the statements in any way.  */

static void
vect_free_slp_tree (slp_tree node, bool final_p)
{
  int i;
  slp_tree child;

  if (--node->refcnt != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_free_slp_tree (child, final_p);

  /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
     Some statements might no longer exist, after having been
     removed by vect_transform_stmt.  Updating the remaining
     statements would be redundant.  */
  if (!final_p)
    {
      stmt_vec_info stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
        {
          gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
          STMT_VINFO_NUM_SLP_USES (stmt_info)--;
        }
    }

  SLP_TREE_CHILDREN (node).release ();
  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_VEC_STMTS (node).release ();
  SLP_TREE_LOAD_PERMUTATION (node).release ();

  free (node);
}

/* Free the memory allocated for the SLP instance.  FINAL_P is true if we
   have vectorized the instance or if we have made a final decision not
   to vectorize the statements in any way.  */

void
vect_free_slp_instance (slp_instance instance, bool final_p)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
  SLP_INSTANCE_LOADS (instance).release ();
  free (instance);
}


/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
{
  slp_tree node;
  stmt_vec_info stmt_info = scalar_stmts[0];
  unsigned int nops;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  node = XNEW (struct _slp_tree);
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_VEC_STMTS (node).create (0);
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
  SLP_TREE_TWO_OPERATORS (node) = false;
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  node->refcnt = 1;
  node->max_nunits = 1;

  unsigned i;
  FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
    STMT_VINFO_NUM_SLP_USES (stmt_info)++;

  return node;
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Information about the first statement: its vector def-type, the type
     of its operand, and whether any of the defs is a pattern stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;
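
/* As an illustration (a sketch, not code from this file): for the
   two-statement SLP group

     x0 = a0 + b0;
     x1 = a1 + b1;

   two slp_oprnd_info entries are built: entry 0 collects the def stmts
   of a0 and a1, entry 1 those of b0 and b1, each remembering the
   def-type and operand type seen in the first statement so the later
   stmts can be checked against it.  */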


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
                                      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
        return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
        result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
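
/* For example (a sketch, assuming DR_GROUP_GAP of a non-first element
   counts the distance from the previous element, 1 meaning
   consecutive): for an interleaving chain loading a[0], a[1] and a[3],
   the places returned for the three loads would be 0, 1 and 3, the
   skipped a[2] being accounted for by the gap of 2 recorded on the
   a[3] element.  */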

/* Check whether it is possible to load COUNT elements of type ELT_MODE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
                                unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
  poly_int64 nelts;
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (multiple_p (current_vector_size, elt_bytes, &nelts)
          && int_mode_for_size (elt_bits, 0).exists (&int_mode))
        {
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type = build_vector_type (int_type, nelts);
          if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
            {
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);
              poly_int64 half_nelts = exact_div (nelts, 2);
              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
                  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
    }
}
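
/* As an illustration (a sketch, not code from this file): with
   8-element vectors (nelts == 8), the two encoded masks expand to

     indices1 = { 0, 8, 1, 9, 2, 10, 3, 11 }
     indices2 = { 4, 12, 5, 13, 6, 14, 7, 15 }

   i.e. the classic low-half and high-half interleave of two input
   vectors, each mask described by two strided patterns of three
   explicit elements so that the encoding also works for
   variable-length vectors.  */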

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
   If there is any operand swap in this function, *SWAP is set to a non-zero
   value.
   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of this node's parent, return 1; if everything is
   ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  int first_op_idx = 1;
  unsigned int commutative_op = -1U;
  bool first_op_cond = false;
  bool first = stmt_num == 0;

  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      number_of_oprnds = gimple_call_num_args (stmt);
      first_op_idx = 3;
      if (gimple_call_internal_p (stmt))
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);
        }
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      number_of_oprnds = gimple_num_ops (stmt) - 1;
      /* Swapping can only be done for cond_expr if asked to, otherwise we
         could end up with a different comparison code from the first stmt.  */
      if (gimple_assign_rhs_code (stmt) == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
        {
          first_op_cond = true;
          number_of_oprnds++;
        }
      else
        commutative_op = commutative_tree_code (code) ? 0U : -1U;
    }
  else
    return -1;

  bool swapped = (*swap != 0);
  gcc_assert (!swapped || first_op_cond);
  for (i = 0; i < number_of_oprnds; i++)
    {
    again:
      if (first_op_cond)
        {
          /* Map indicating how operands of cond_expr should be swapped.  */
          int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
          int *map = maps[*swap];

          if (i < 2)
            oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
                                             first_op_idx), map[i]);
          else
            oprnd = gimple_op (stmt_info->stmt, map[i]);
        }
      else
        oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: can't analyze def for %T\n",
                             oprnd);

          return -1;
        }

      if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
        oprnd_info->any_pattern = true;

      if (first)
        {
          oprnd_info->first_dt = dt;
          oprnd_info->first_op_type = TREE_TYPE (oprnd);
        }
      else
        {
          /* Not first stmt of the group, check that the def-stmt/s match
             the def-stmt/s of the first stmt.  Allow different definition
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
          tree type = TREE_TYPE (oprnd);
          if ((oprnd_info->first_dt != dt
               && !(oprnd_info->first_dt == vect_reduction_def
                    && dt == vect_internal_def)
               && !((oprnd_info->first_dt == vect_external_def
                     || oprnd_info->first_dt == vect_constant_def)
                    && (dt == vect_external_def
                        || dt == vect_constant_def)))
              || !types_compatible_p (oprnd_info->first_op_type, type))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == commutative_op && !swapped)
                {
                  swapped = true;
                  goto again;
                }

              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");

              return 1;
            }
          if ((dt == vect_constant_def
               || dt == vect_external_def)
              && !current_vector_size.is_constant ()
              && (TREE_CODE (type) == BOOLEAN_TYPE
                  || !can_duplicate_and_interleave_p (stmts.length (),
                                                      TYPE_MODE (type))))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: invalid type of def "
                                 "for variable-length SLP %T\n", oprnd);
              return -1;
            }
        }

      /* Check the types of the definitions.  */
      switch (dt)
        {
        case vect_constant_def:
        case vect_external_def:
          break;

        case vect_reduction_def:
        case vect_induction_def:
        case vect_internal_def:
          oprnd_info->def_stmts.quick_push (def_stmt_info);
          break;

        default:
          /* FORNOW: Not supported.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: illegal type of def %T\n",
                             oprnd);

          return -1;
        }
    }

  /* Swap operands.  */
  if (swapped)
    {
      /* If there are already uses of this stmt in an SLP instance then
         we've committed to the operand order and can't swap it.  */
      if (STMT_VINFO_NUM_SLP_USES (stmt_info) != 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: cannot swap operands of "
                             "shared stmt %G", stmt_info->stmt);
          return -1;
        }

      if (first_op_cond)
        {
          gassign *stmt = as_a <gassign *> (stmt_info->stmt);
          tree cond = gimple_assign_rhs1 (stmt);
          enum tree_code code = TREE_CODE (cond);

          /* Swap.  */
          if (*swap == 1)
            {
              swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
                                 &TREE_OPERAND (cond, 1));
              TREE_SET_CODE (cond, swap_tree_comparison (code));
            }
          /* Invert.  */
          else
            {
              swap_ssa_operands (stmt, gimple_assign_rhs2_ptr (stmt),
                                 gimple_assign_rhs3_ptr (stmt));
              bool honor_nans = HONOR_NANS (TREE_OPERAND (cond, 0));
              code = invert_tree_comparison (TREE_CODE (cond), honor_nans);
              gcc_assert (code != ERROR_MARK);
              TREE_SET_CODE (cond, code);
            }
        }
      else
        {
          unsigned int op = commutative_op + first_op_idx;
          swap_ssa_operands (stmt_info->stmt,
                             gimple_op_ptr (stmt_info->stmt, op),
                             gimple_op_ptr (stmt_info->stmt, op + 1));
        }
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
                         stmt_info->stmt);
    }

  *swap = swapped;
  return 0;
}
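
/* For example (an illustrative sketch): given the SLP group

     x0 = a0 + b0;
     x1 = 5 + a1;

   matching operand 0 of the second stmt (the constant 5) against the
   internal def a0 fails, but PLUS_EXPR is commutative, so the operands
   of the second stmt are re-read in swapped order, the match succeeds,
   and the stmt is rewritten as x1 = a1 + 5 by the code above.  */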

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

static bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
        return false;
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
          return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
        return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
        return false;
    }
  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unsupported data-type in %G\n",
                         stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned HOST_WIDE_INT const_nunits;
  if (STMT_VINFO_BB_VINFO (stmt_info)
      && (!nunits.is_constant (&const_nunits)
          || const_nunits > group_size))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required "
                         "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
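
/* For instance (a sketch): in basic-block SLP, a group of four 'short'
   stores considered with a V8HI vector type would need unrolling
   (const_nunits == 8 > group_size == 4), which is impossible within a
   single basic block, so the build is rejected above before
   *max_nunits is updated.  */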

/* STMTS is a group of GROUP_SIZE SLP statements in which some
   statements do the same operation as the first statement and in which
   the others do ALT_STMT_CODE.  Return true if we can take one vector
   of the first operation and one vector of the second and permute them
   to get the required result.  VECTYPE is the type of the vector that
   would be permuted.  */

static bool
vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
                               unsigned int group_size, tree vectype,
                               tree_code alt_stmt_code)
{
  unsigned HOST_WIDE_INT count;
  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
    return false;

  vec_perm_builder sel (count, count, 1);
  for (unsigned int i = 0; i < count; ++i)
    {
      unsigned int elt = i;
      gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
      if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
        elt += count;
      sel.quick_push (elt);
    }
  vec_perm_indices indices (sel, 2, count);
  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
}
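
/* For example (a sketch): for the two-statement addsub group

     x0 = a0 + b0;
     x1 = a1 - b1;

   and a 4-element vector type, the mask built above is { 0, 5, 2, 7 }:
   even elements come from the PLUS vector (input 1), odd elements from
   the MINUS vector (input 2), and the target must support that
   constant permutation.  */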

/* Verify that the scalar stmts STMTS are isomorphic, do not require data
   permutation, and are not of unsupported operation types.  Return
   true if so, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
   to B1 <= A1 ? X1 : Y1, or inverted to A1 < B1 ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  enum tree_code first_stmt_code = ERROR_MARK;
  enum tree_code alt_stmt_code = ERROR_MARK;
  enum tree_code rhs_code = ERROR_MARK;
  enum tree_code first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  optab optab;
  int icode;
  machine_mode optab_op2_mode;
  machine_mode vec_mode;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unvectorizable statement %G",
                             stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: not GIMPLE_ASSIGN nor "
                             "GIMPLE_CALL %G", stmt);
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
                                           &nunits_vectype)
          || (nunits_vectype
              && !vect_record_max_nunits (stmt_info, group_size,
                                          nunits_vectype, max_nunits)))
        {
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      gcc_assert (vectype);

      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
          if ((gimple_call_internal_p (call_stmt)
               && (!vectorizable_internal_fn_p
                     (gimple_call_internal_fn (call_stmt))))
              || gimple_call_tail_p (call_stmt)
              || gimple_call_noreturn_p (call_stmt)
              || !gimple_call_nothrow_p (call_stmt)
              || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: unsupported call type %G",
                                 call_stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else
        rhs_code = gimple_assign_rhs_code (stmt);

      /* Check the operation.  */
      if (i == 0)
        {
          first_stmt_code = rhs_code;

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              if (vectype == boolean_type_node)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: shift of a"
                                     " boolean.\n");
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }

              vec_mode = TYPE_MODE (vectype);

              /* First see if we have a vector/vector shift.  */
              optab = optab_for_tree_code (rhs_code, vectype,
                                           optab_vector);

              if (!optab
                  || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  optab = optab_for_tree_code (rhs_code, vectype,
                                               optab_scalar);

                  if (!optab)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: no optab.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  icode = (int) optab_handler (optab, vec_mode);
                  if (icode == CODE_FOR_nothing)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  optab_op2_mode = insn_data[icode].operand[2].mode;
                  if (!VECTOR_MODE_P (optab_op2_mode))
                    {
                      need_same_oprnds = true;
                      first_op1 = gimple_assign_rhs2 (stmt);
                    }
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
        }
      else
        {
          if (first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
              /* Handle mismatches in plus/minus by computing both
                 and merging the results.  */
              && !((first_stmt_code == PLUS_EXPR
                    || first_stmt_code == MINUS_EXPR)
                   && (alt_stmt_code == PLUS_EXPR
                       || alt_stmt_code == MINUS_EXPR)
                   && rhs_code == alt_stmt_code)
              && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
                   && (first_stmt_code == ARRAY_REF
                       || first_stmt_code == BIT_FIELD_REF
                       || first_stmt_code == INDIRECT_REF
                       || first_stmt_code == COMPONENT_REF
                       || first_stmt_code == MEM_REF)))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt %G", stmt);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt %G", first_stmt_info->stmt);
                }
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds
              && !operand_equal_p (first_op1, gimple_assign_rhs2 (stmt), 0))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different shift "
                                 "arguments in %G", stmt);
              /* Mismatch.  */
              continue;
            }

          if (rhs_code == CALL_EXPR)
            {
              if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                                       as_a <gcall *> (stmt)))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different calls in %G",
                                     stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          if (REFERENCE_CLASS_P (lhs))
            {
              /* Store.  */
              ;
            }
          else
            {
              /* Load.  */
              first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: different "
                                         "interleaving chains in one node %G",
                                         stmt);
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        } /* Grouped access.  */
      else
        {
          if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
            {
              /* Non-grouped load.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: not grouped load %G", stmt);

              /* FORNOW: Non-grouped loads are not supported.  */
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          /* Not a memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
              && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && TREE_CODE_CLASS (rhs_code) != tcc_comparison
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: operation unsupported %G",
                                 stmt);
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphism can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphism can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different"
                                     " operation %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
    {
      if (vectype == boolean_type_node
          || !vect_two_operations_perm_ok_p (stmts, group_size,
                                             vectype, alt_stmt_code))
        {
          for (i = 0; i < group_size; ++i)
            if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
              {
                matches[i] = false;
                if (dump_enabled_p ())
                  {
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different operation "
                                     "in stmt %G", stmts[i]->stmt);
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "original stmt %G", first_stmt_info->stmt);
                  }
              }
          return false;
        }
      *two_operators = true;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <gimple *>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       unsigned max_tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<stmt_vec_info> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits,
                     bool *matches, unsigned *npermutes, unsigned *tree_size,
                     unsigned max_tree_size,
                     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
                         *leader ? "" : "failed ", *leader);
      if (*leader)
        {
          (*leader)->refcnt++;
          vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
        }
      return *leader;
    }
  poly_uint64 this_max_nunits = 1;
  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
                                        &this_max_nunits,
                                        matches, npermutes, tree_size,
                                        max_tree_size, bst_map);
  if (res)
    {
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      res->refcnt++;
    }
  bst_map->put (stmts.copy (), res);
  return res;
}
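
/* As an explanatory note (a sketch): when two parents reach the same
   vector of scalar stmts, the second call finds the cached node above,
   bumps its refcnt and re-uses it, so diamond-shaped SLP graphs share
   a single subtree instead of rebuilding it.  A cached NULL records a
   build that already failed and keeps us from retrying it.  */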

/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not
   isomorphic, require data permutation or are of unsupported types of
   operation.  Otherwise, return the built SLP tree; on failure MATCHES
   records which stmts failed to match the first one.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *npermutes, unsigned *tree_size,
                       unsigned max_tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;
  slp_tree node;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    nops = gimple_call_num_args (stmt);
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      nops = gimple_num_ops (stmt) - 1;
      if (gimple_assign_rhs_code (stmt) == COND_EXPR)
        nops++;
    }
  else if (is_a <gphi *> (stmt_info->stmt))
    nops = 0;
  else
    return NULL;

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    {
      tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
      tree vectype = get_vectype_for_scalar_type (scalar_type);
      if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits))
        return NULL;

      vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
      /* Induction from different IVs is not supported.  */
      if (def_type == vect_induction_def)
        {
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            if (stmt_info != other_info)
              return NULL;
        }
      else if (def_type == vect_reduction_def
               || def_type == vect_double_reduction_def
               || def_type == vect_nested_cycle)
        {
          /* Else def types have to match.  */
          stmt_vec_info other_info;
          FOR_EACH_VEC_ELT (stmts, i, other_info)
            {
              /* But for reduction chains only check on the first stmt.  */
              if (!STMT_VINFO_DATA_REF (other_info)
                  && REDUC_GROUP_FIRST_ELEMENT (other_info)
                  && REDUC_GROUP_FIRST_ELEMENT (other_info) != stmt_info)
                continue;
              if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
                return NULL;
            }
        }
      else
        return NULL;
      node = vect_create_new_slp_node (stmts);
      return node;
    }


  bool two_operators = false;
  unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
  if (!vect_build_slp_tree_1 (swap, stmts, group_size,
                              &this_max_nunits, matches, &two_operators))
    return NULL;

  /* If the SLP node is a load, terminate the recursion.  */
  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    {
      *max_nunits = this_max_nunits;
      node = vect_create_new_slp_node (stmts);
      return node;
    }

  /* Get at the operands, verifying they are compatible.  */
  vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
  slp_oprnd_info oprnd_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
                                             stmts, i, &oprnds_info);
      if (res != 0)
        matches[(res == -1) ? 0 : i] = false;
      if (!matches[0])
        break;
    }
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      {
        vect_free_oprnd_info (oprnds_info);
        return NULL;
      }

  auto_vec<slp_tree, 4> children;

  stmt_info = stmts[0];

  if (tree_size)
    max_tree_size -= *tree_size;

  /* Create SLP_TREE nodes for the definition node/s.  */
  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      slp_tree child;
      unsigned old_tree_size = this_tree_size;
      unsigned int j;

      if (oprnd_info->first_dt != vect_internal_def
          && oprnd_info->first_dt != vect_reduction_def
          && oprnd_info->first_dt != vect_induction_def)
        continue;

      if (++this_tree_size > max_tree_size)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                             vect_location,
                             "Build SLP failed: SLP tree too large\n");
          FOR_EACH_VEC_ELT (children, j, child)
            vect_free_slp_tree (child, false);
          vect_free_oprnd_info (oprnds_info);
          return NULL;
        }

      if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                        group_size, &this_max_nunits,
                                        matches, npermutes,
                                        &this_tree_size,
                                        max_tree_size, bst_map)) != NULL)
        {
          /* If all children of this child were built up from scalars then
             just throw that away and build this node up from scalars
             instead.  */
          if (!SLP_TREE_CHILDREN (child).is_empty ()
              /* ??? Rejecting patterns this way doesn't work.  We'd have to
                 do extra work to cancel the pattern so the uses see the
                 scalar version.  */
              && !oprnd_info->any_pattern)
            {
              slp_tree grandchild;

              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
                  break;
              if (!grandchild)
                {
                  /* Roll back.  */
                  this_tree_size = old_tree_size;
                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    vect_free_slp_tree (grandchild, false);
                  SLP_TREE_CHILDREN (child).truncate (0);

                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Building parent vector operands from "
                                     "scalars instead\n");
                  oprnd_info->def_stmts = vNULL;
                  SLP_TREE_DEF_TYPE (child) = vect_external_def;
                  children.safe_push (child);
                  continue;
                }
            }

          oprnd_info->def_stmts = vNULL;
          children.safe_push (child);
          continue;
        }

      /* If the SLP build failed fatally and we analyze a basic-block
         simply treat nodes we fail to build as externally defined
         (and thus build vectors from the scalar defs).
         The cost model will reject outright expensive cases.
         ??? This doesn't treat cases where permutation ultimately
         fails (or we don't try permutation below).  Ideally we'd
         even compute a permutation that will end up with the maximum
         SLP tree size...  */
      if (is_a <bb_vec_info> (vinfo)
          && !matches[0]
          /* ??? Rejecting patterns this way doesn't work.  We'd have to
             do extra work to cancel the pattern so the uses see the
             scalar version.  */
          && !is_pattern_stmt_p (stmt_info)
          && !oprnd_info->any_pattern)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Building vector operands from scalars\n");
          child = vect_create_new_slp_node (oprnd_info->def_stmts);
          SLP_TREE_DEF_TYPE (child) = vect_external_def;
          children.safe_push (child);
          oprnd_info->def_stmts = vNULL;
          continue;
        }

      /* If the SLP build for operand zero failed and operand zero
         and one can be commuted try that for the scalar stmts
         that failed the match.  */
      if (i == 0
          /* A first scalar stmt mismatch signals a fatal mismatch.  */
          && matches[0]
          /* ??? For COND_EXPRs we can swap the comparison operands
             as well as the arms under some constraints.  */
          && nops == 2
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt_info->stmt)
          /* Swapping operands for reductions breaks assumptions later on.  */
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
          && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
          /* Do so only if the number of unsuccessful permutes was no more
             than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
          /* See whether we can swap the matching or the non-matching
             stmt operands.  */
          bool swap_not_matching = true;
          do
            {
              for (j = 0; j < group_size; ++j)
                {
                  if (matches[j] != !swap_not_matching)
                    continue;
                  stmt_vec_info stmt_info = stmts[j];
                  /* Verify if we can swap operands of this stmt.  */
                  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
                  if (!stmt
                      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
                    {
                      if (!swap_not_matching)
                        goto fail;
                      swap_not_matching = false;
                      break;
                    }
                  /* Verify if we can safely swap or if we committed to a
                     specific operand order already.
                     ??? Instead of modifying GIMPLE stmts here we could
                     record whether we want to swap operands in the SLP
                     node and temporarily do that when processing it
                     (or wrap operand accessors in a helper).  */
                  else if (swap[j] != 0
                           || STMT_VINFO_NUM_SLP_USES (stmt_info))
                    {
                      if (!swap_not_matching)
                        {
                          if (dump_enabled_p ())
                            dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                             vect_location,
                                             "Build SLP failed: cannot swap "
                                             "operands of shared stmt %G",
                                             stmts[j]->stmt);
                          goto fail;
                        }
                      swap_not_matching = false;
                      break;
                    }
                }
            }
          while (j != group_size);

          /* Swap mismatched definition stmts.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Re-trying with swapped operands of stmts ");
          for (j = 0; j < group_size; ++j)
            if (matches[j] == !swap_not_matching)
              {
                std::swap (oprnds_info[0]->def_stmts[j],
                           oprnds_info[1]->def_stmts[j]);
                if (dump_enabled_p ())
                  dump_printf (MSG_NOTE, "%d ", j);
              }
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "\n");
          /* And try again with scratch 'matches' ...  */
          bool *tem = XALLOCAVEC (bool, group_size);
          if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
                                            group_size, &this_max_nunits,
                                            tem, npermutes,
                                            &this_tree_size,
                                            max_tree_size, bst_map)) != NULL)
            {
              /* ... so if successful we can apply the operand swapping
                 to the GIMPLE IL.  This is necessary because for example
                 vect_get_slp_defs uses operand indexes and thus expects
                 canonical operand order.  This is also necessary even
                 if we end up building the operand from scalars as
                 we'll continue to process swapped operand two.  */
              for (j = 0; j < group_size; ++j)
                gimple_set_plf (stmts[j]->stmt, GF_PLF_1, false);
              for (j = 0; j < group_size; ++j)
                if (matches[j] == !swap_not_matching)
                  {
                    gassign *stmt = as_a <gassign *> (stmts[j]->stmt);
                    /* Avoid swapping operands twice.  */
                    if (gimple_plf (stmt, GF_PLF_1))
                      continue;
                    swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
                                       gimple_assign_rhs2_ptr (stmt));
                    gimple_set_plf (stmt, GF_PLF_1, true);
                  }
              /* Verify we swap all duplicates or none.  */
              if (flag_checking)
                for (j = 0; j < group_size; ++j)
                  gcc_assert (gimple_plf (stmts[j]->stmt, GF_PLF_1)
                              == (matches[j] == !swap_not_matching));

              /* If all children of this child were built up from scalars then
                 just throw that away and build this node up from scalars
                 instead.  */
              if (!SLP_TREE_CHILDREN (child).is_empty ()
                  /* ??? Rejecting patterns this way doesn't work.  We'd have
                     to do extra work to cancel the pattern so the uses see the
                     scalar version.  */
                  && !oprnd_info->any_pattern)
                {
                  unsigned int j;
                  slp_tree grandchild;

                  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                    if (SLP_TREE_DEF_TYPE (grandchild) == vect_internal_def)
                      break;
                  if (!grandchild)
                    {
                      /* Roll back.  */
                      this_tree_size = old_tree_size;
                      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
                        vect_free_slp_tree (grandchild, false);
                      SLP_TREE_CHILDREN (child).truncate (0);

                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_NOTE, vect_location,
                                         "Building parent vector operands from "
                                         "scalars instead\n");
                      oprnd_info->def_stmts = vNULL;
                      SLP_TREE_DEF_TYPE (child) = vect_external_def;
                      children.safe_push (child);
                      continue;
                    }
                }

              oprnd_info->def_stmts = vNULL;
              children.safe_push (child);
              continue;
            }

          ++*npermutes;
        }

fail:
      gcc_assert (child == NULL);
      FOR_EACH_VEC_ELT (children, j, child)
        vect_free_slp_tree (child, false);
      vect_free_oprnd_info (oprnds_info);
      return NULL;
    }

  vect_free_oprnd_info (oprnds_info);

  if (tree_size)
    *tree_size += this_tree_size;
  *max_nunits = this_max_nunits;

  node = vect_create_new_slp_node (stmts);
  SLP_TREE_TWO_OPERATORS (node) = two_operators;
  SLP_TREE_CHILDREN (node).splice (children);
  return node;
}
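
/* As an illustration (a sketch): for the basic block

     tmp0 = b[0] + c[0];  a[0] = tmp0;
     tmp1 = b[1] + c[1];  a[1] = tmp1;

   the recursion above yields a root node holding the two stores, with
   one child holding the two PLUS_EXPR stmts, whose own children are
   the load nodes for b[] and c[] at which the recursion
   terminated.  */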

/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u)\n",
                   SLP_TREE_DEF_TYPE (node) != vect_internal_def
                   ? " (external)" : "", node,
                   estimated_poly_value (node->max_nunits));
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    dump_printf_loc (metadata, user_loc, "\tstmt %d %G", i, stmt_info->stmt);
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "\n");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_print_slp_tree (dump_kind, loc, child, visited);
}

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                     slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_print_slp_tree (dump_kind, loc, node, visited);
}

/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    STMT_SLP_TYPE (stmt_info) = pure_slp;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts (child, visited);
}

static void
vect_mark_slp_stmts (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (node, visited);
}

/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
                  || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
      STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}


/* Rearrange the statements of NODE according to PERMUTATION.  */

static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
                          vec<unsigned> permutation,
                          hash_set<slp_tree> &visited)
{
  stmt_vec_info stmt_info;
  vec<stmt_vec_info> tmp_stmts;
  unsigned int i;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_slp_rearrange_stmts (child, group_size, permutation, visited);

  gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
  tmp_stmts.create (group_size);
  tmp_stmts.quick_grow_cleared (group_size);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    tmp_stmts[permutation[i]] = stmt_info;

  SLP_TREE_SCALAR_STMTS (node).release ();
  SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}
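
/* For example (a sketch): with PERMUTATION = { 2, 0, 1 } the stmt at
   position 0 moves to position 2, position 1 to position 0 and
   position 2 to position 1, since each stmt I is written to
   tmp_stmts[permutation[I]]; the recursion applies the same
   reordering to every node of the instance.  */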


/* Attempt to reorder stmts in a reduction chain so that we don't
   require any load permutation.  Return true if that was possible,
   otherwise return false.  */

static bool
vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j;
  unsigned int lidx;
  slp_tree node, load;

  /* Compare all the permutation sequences to the first one.  We know
     that at least one load is permuted.  */
  node = SLP_INSTANCE_LOADS (slp_instn)[0];
  if (!node->load_permutation.exists ())
    return false;
  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
    {
      if (!load->load_permutation.exists ())
        return false;
      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
        if (lidx != node->load_permutation[j])
          return false;
    }

  /* Check that the loads in the first sequence are different and there
     are no gaps between them.  */
  auto_sbitmap load_index (group_size);
  bitmap_clear (load_index);
  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
    {
      if (lidx >= group_size)
        return false;
      if (bitmap_bit_p (load_index, lidx))
        return false;

      bitmap_set_bit (load_index, lidx);
    }
  for (i = 0; i < group_size; i++)
    if (!bitmap_bit_p (load_index, i))
      return false;

  /* This permutation is valid for reduction.  Since the order of the
     statements in the nodes is not important unless they are memory
     accesses, we can rearrange the statements in all the nodes
     according to the order of the loads.  */
  hash_set<slp_tree> visited;
  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
                            node->load_permutation, visited);

  /* We are done, no actual permutations need to be generated.  */
  poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    {
      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
      /* But we have to keep those permutations that are required because
         of handling of gaps.  */
      if (known_eq (unrolling_factor, 1U)
          || (group_size == DR_GROUP_SIZE (first_stmt_info)
              && DR_GROUP_GAP (first_stmt_info) == 0))
        SLP_TREE_LOAD_PERMUTATION (node).release ();
      else
        for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
          SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
    }

  return true;
}
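
/* For instance (a sketch): for a reduction whose load nodes all carry
   the permutation { 2, 0, 3, 1 } with group_size 4, the permutation
   covers every index exactly once, so the scalar stmts of all nodes
   are reordered accordingly and the load permutations can then
   usually be released.  */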

/* Gather loads in the SLP graph NODE and populate the INST loads array.  */

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node,
                       hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  if (SLP_TREE_CHILDREN (node).length () == 0)
    {
      stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
      if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
          && STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
        SLP_INSTANCE_LOADS (inst).safe_push (node);
    }
  else
    {
      unsigned i;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
        vect_gather_slp_loads (inst, child, visited);
    }
}

static void
vect_gather_slp_loads (slp_instance inst, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_gather_slp_loads (inst, node, visited);
}

/* Check if the required load permutations in the SLP instance
   SLP_INSTN are supported.  */

static bool
vect_supported_load_permutation_p (slp_instance slp_instn)
{
  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
  unsigned int i, j, k, next;
  slp_tree node;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
        if (node->load_permutation.exists ())
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
          for (k = 0; k < group_size; ++k)
            dump_printf (MSG_NOTE, "%d ", k);
      dump_printf (MSG_NOTE, "\n");
    }

  /* In case of reduction every load permutation is allowed, since the order
     of the reduction statements is not important (as opposed to the case of
     grouped stores).  The only condition we need to check is that all the
     load nodes are of the same size and have the same permutation (and then
     rearrange all the nodes of the SLP instance according to this
     permutation).  */

  /* Check that all the load nodes are of the same size.  */
  /* ??? Can't we assert this?  */
  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
    if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
      return false;

  node = SLP_INSTANCE_TREE (slp_instn);
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];

  /* Reduction (there are no data-refs in the root).
     In reduction chain the order of the loads is not important.  */
  if (!STMT_VINFO_DATA_REF (stmt_info)
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    vect_attempt_slp_rearrange_stmts (slp_instn);
1737
1738 /* In basic block vectorization we allow any subchain of an interleaving
1739 chain.
1740 FORNOW: not supported in loop SLP because of realignment complications. */
1741 if (STMT_VINFO_BB_VINFO (stmt_info))
1742 {
1743 /* Check whether the loads in an instance form a subchain and thus
1744 no permutation is necessary. */
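/* Hypothetical example of a subchain: given an interleaving chain
   b[0], b[1], b[2], b[3], an SLP node loading b[1], b[2] (consecutive
   elements, each with DR_GROUP_GAP of 1) needs no permutation, while
   a node loading b[2], b[1] or b[0], b[2] does not form a subchain.  */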
1745 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1746 {
1747 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1748 continue;
1749 bool subchain_p = true;
1750 stmt_vec_info next_load_info = NULL;
1751 stmt_vec_info load_info;
1752 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1753 {
1754 if (j != 0
1755 && (next_load_info != load_info
1756 || DR_GROUP_GAP (load_info) != 1))
1757 {
1758 subchain_p = false;
1759 break;
1760 }
1761 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
1762 }
1763 if (subchain_p)
1764 SLP_TREE_LOAD_PERMUTATION (node).release ();
1765 else
1766 {
1767 stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
1768 group_info = DR_GROUP_FIRST_ELEMENT (group_info);
1769 unsigned HOST_WIDE_INT nunits;
1770 unsigned k, maxk = 0;
1771 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
1772 if (k > maxk)
1773 maxk = k;
1774 /* In BB vectorization we may not actually use a loaded vector
1775 accessing elements in excess of DR_GROUP_SIZE. */
1776 tree vectype = STMT_VINFO_VECTYPE (group_info);
1777 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
1778 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
1779 {
1780 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1782 "BB vectorization with gaps at the end of "
1783 "a load is not supported\n");
1784 return false;
1785 }
1786
1787 /* Verify the permutation can be generated. */
1788 vec<tree> tem;
1789 unsigned n_perms;
1790 if (!vect_transform_slp_perm_load (node, tem, NULL,
1791 1, slp_instn, true, &n_perms))
1792 {
1793 if (dump_enabled_p ())
1794 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1795 vect_location,
1796 "unsupported load permutation\n");
1797 return false;
1798 }
1799 }
1800 }
1801 return true;
1802 }
1803
1804 /* For loop vectorization verify we can generate the permutation. Be
1805 conservative about the vectorization factor, there are permutations
1806 that will use three vector inputs only starting from a specific factor
1807 and the vectorization factor is not yet final.
1808 ??? The SLP instance unrolling factor might not be the maximum one. */
1809 unsigned n_perms;
1810 poly_uint64 test_vf
1811 = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
1812 LOOP_VINFO_VECT_FACTOR
1813 (STMT_VINFO_LOOP_VINFO (stmt_info)));
1814 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1815 if (node->load_permutation.exists ()
1816 && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
1817 slp_instn, true, &n_perms))
1818 return false;
1819
1820 return true;
1821 }
1822
1823
1824 /* Find the last store in SLP INSTANCE. */
1825
1826 stmt_vec_info
1827 vect_find_last_scalar_stmt_in_slp (slp_tree node)
1828 {
1829 stmt_vec_info last = NULL;
1830 stmt_vec_info stmt_vinfo;
1831
1832 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
1833 {
1834 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
1835 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
1836 }
1837
1838 return last;
1839 }
1840
1841 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
1842 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
1843 (also containing the first GROUP1_SIZE stmts, since stores are
1844 consecutive), the second containing the remainder.
1845 Return the first stmt in the second group. */
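/* Worked example with made-up numbers: splitting a store group of
   DR_GROUP_SIZE 8 with GROUP1_SIZE 4 leaves a first group of the
   stores to a[0..3] (its DR_GROUP_GAP grows by 4 to skip the second
   group) and returns a second group of the stores to a[4..7] whose
   DR_GROUP_GAP is the original gap plus the 4 skipped elements.  */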
1846
1847 static stmt_vec_info
1848 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
1849 {
1850 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
1851 gcc_assert (group1_size > 0);
1852 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
1853 gcc_assert (group2_size > 0);
1854 DR_GROUP_SIZE (first_vinfo) = group1_size;
1855
1856 stmt_vec_info stmt_info = first_vinfo;
1857 for (unsigned i = group1_size; i > 1; i--)
1858 {
1859 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
1860 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1861 }
1862 /* STMT is now the last element of the first group. */
1863 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
1864 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
1865
1866 DR_GROUP_SIZE (group2) = group2_size;
1867 for (stmt_info = group2; stmt_info;
1868 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
1869 {
1870 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
1871 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
1872 }
1873
1874 /* For the second group, the DR_GROUP_GAP is that before the original group,
1875 plus skipping over the first vector. */
1876 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
1877
1878 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
1879 DR_GROUP_GAP (first_vinfo) += group2_size;
1880
1881 if (dump_enabled_p ())
1882 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
1883 group1_size, group2_size);
1884
1885 return group2;
1886 }
1887
1888 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
1889 statements and a vector of NUNITS elements. */
1890
1891 static poly_uint64
1892 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
1893 {
1894 return exact_div (common_multiple (nunits, group_size), group_size);
1895 }
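/* For example (illustrative numbers only): with NUNITS 4 and
   GROUP_SIZE 6 the common multiple is 12, so the unrolling factor is
   12 / 6 = 2: two scalar group iterations exactly fill three
   4-element vectors.  With GROUP_SIZE 8 and NUNITS 4 it is 1.  */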
1896
1897 /* Analyze an SLP instance starting from a group of grouped stores. Call
1898 vect_build_slp_tree to build a tree of packed stmts if possible.
1899 Return FALSE if it's impossible to SLP any stmt in the loop. */
1900
1901 static bool
1902 vect_analyze_slp_instance (vec_info *vinfo,
1903 stmt_vec_info stmt_info, unsigned max_tree_size)
1904 {
1905 slp_instance new_instance;
1906 slp_tree node;
1907 unsigned int group_size;
1908 tree vectype, scalar_type = NULL_TREE;
1909 unsigned int i;
1910 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
1911 vec<stmt_vec_info> scalar_stmts;
1912
1913 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1914 {
1915 scalar_type = TREE_TYPE (DR_REF (dr));
1916 vectype = get_vectype_for_scalar_type (scalar_type);
1917 group_size = DR_GROUP_SIZE (stmt_info);
1918 }
1919 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1920 {
1921 gcc_assert (is_a <loop_vec_info> (vinfo));
1922 vectype = STMT_VINFO_VECTYPE (stmt_info);
1923 group_size = REDUC_GROUP_SIZE (stmt_info);
1924 }
1925 else
1926 {
1927 gcc_assert (is_a <loop_vec_info> (vinfo));
1928 vectype = STMT_VINFO_VECTYPE (stmt_info);
1929 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
1930 }
1931
1932 if (!vectype)
1933 {
1934 if (dump_enabled_p ())
1935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1936 "Build SLP failed: unsupported data-type %T\n",
1937 scalar_type);
1938
1939 return false;
1940 }
1941 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1942
1943 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
1944 scalar_stmts.create (group_size);
1945 stmt_vec_info next_info = stmt_info;
1946 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1947 {
1948 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
1949 while (next_info)
1950 {
1951 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
1952 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
1953 }
1954 }
1955 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1956 {
1957 /* Collect the reduction stmts and store them in
1958 SLP_TREE_SCALAR_STMTS. */
1959 while (next_info)
1960 {
1961 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
1962 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
1963 }
1964 /* Mark the first element of the reduction chain as reduction to properly
1965 transform the node. In the reduction analysis phase only the last
1966 element of the chain is marked as reduction. */
1967 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
1968 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
1969 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
1970 }
1971 else
1972 {
1973 /* Collect reduction statements. */
1974 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
1975 for (i = 0; reductions.iterate (i, &next_info); i++)
1976 scalar_stmts.safe_push (next_info);
1977 }
1978
1979 /* Build the tree for the SLP instance. */
1980 bool *matches = XALLOCAVEC (bool, group_size);
1981 unsigned npermutes = 0;
1982 scalar_stmts_to_slp_tree_map_t *bst_map
1983 = new scalar_stmts_to_slp_tree_map_t ();
1984 poly_uint64 max_nunits = nunits;
1985 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
1986 &max_nunits, matches, &npermutes,
1987 NULL, max_tree_size, bst_map);
1988 /* The map keeps a reference on SLP nodes built, release that. */
1989 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1990 it != bst_map->end (); ++it)
1991 if ((*it).second)
1992 vect_free_slp_tree ((*it).second, false);
1993 delete bst_map;
1994 if (node != NULL)
1995 {
1996 /* Calculate the unrolling factor based on the smallest type. */
1997 poly_uint64 unrolling_factor
1998 = calculate_unrolling_factor (max_nunits, group_size);
1999
2000 if (maybe_ne (unrolling_factor, 1U)
2001 && is_a <bb_vec_info> (vinfo))
2002 {
2003 unsigned HOST_WIDE_INT const_max_nunits;
2004 if (!max_nunits.is_constant (&const_max_nunits)
2005 || const_max_nunits > group_size)
2006 {
2007 if (dump_enabled_p ())
2008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2009 "Build SLP failed: store group "
2010 "size not a multiple of the vector size "
2011 "in basic block SLP\n");
2012 vect_free_slp_tree (node, false);
2013 return false;
2014 }
2015 /* Fatal mismatch. */
2016 matches[group_size / const_max_nunits * const_max_nunits] = false;
2017 vect_free_slp_tree (node, false);
2018 }
2019 else
2020 {
2021 /* Create a new SLP instance. */
2022 new_instance = XNEW (struct _slp_instance);
2023 SLP_INSTANCE_TREE (new_instance) = node;
2024 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2025 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2026 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2027 vect_gather_slp_loads (new_instance, node);
2028
2029 /* Compute the load permutation. */
2030 slp_tree load_node;
2031 bool loads_permuted = false;
2032 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2033 {
2034 vec<unsigned> load_permutation;
2035 int j;
2036 stmt_vec_info load_info;
2037 bool this_load_permuted = false;
2038 load_permutation.create (group_size);
2039 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
2040 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2041 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2042 {
2043 int load_place = vect_get_place_in_interleaving_chain
2044 (load_info, first_stmt_info);
2045 gcc_assert (load_place != -1);
2046 if (load_place != j)
2047 this_load_permuted = true;
2048 load_permutation.safe_push (load_place);
2049 }
2050 if (!this_load_permuted
2051 /* The load requires permutation when unrolling exposes
2052 a gap either because the group is larger than the SLP
2053 group-size or because there is a gap between the groups. */
2054 && (known_eq (unrolling_factor, 1U)
2055 || (group_size == DR_GROUP_SIZE (first_stmt_info)
2056 && DR_GROUP_GAP (first_stmt_info) == 0)))
2057 {
2058 load_permutation.release ();
2059 continue;
2060 }
2061 SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
2062 loads_permuted = true;
2063 }
2064
2065 if (loads_permuted)
2066 {
2067 if (!vect_supported_load_permutation_p (new_instance))
2068 {
2069 if (dump_enabled_p ())
2070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2071 "Build SLP failed: unsupported load "
2072 "permutation %G", stmt_info->stmt);
2073 vect_free_slp_instance (new_instance, false);
2074 return false;
2075 }
2076 }
2077
2078 /* If the loads and stores can be handled with load/store-lanes
2079 instructions do not generate this SLP instance. */
2080 if (is_a <loop_vec_info> (vinfo)
2081 && loads_permuted
2082 && dr && vect_store_lanes_supported (vectype, group_size, false))
2083 {
2084 slp_tree load_node;
2085 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2086 {
2087 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2088 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2089 /* Use SLP for strided accesses (or if we can't load-lanes). */
2090 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2091 || ! vect_load_lanes_supported
2092 (STMT_VINFO_VECTYPE (stmt_vinfo),
2093 DR_GROUP_SIZE (stmt_vinfo), false))
2094 break;
2095 }
2096 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2097 {
2098 if (dump_enabled_p ())
2099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2100 "Built SLP cancelled: can use "
2101 "load/store-lanes\n");
2102 vect_free_slp_instance (new_instance, false);
2103 return false;
2104 }
2105 }
2106
2107 vinfo->slp_instances.safe_push (new_instance);
2108
2109 if (dump_enabled_p ())
2110 {
2111 dump_printf_loc (MSG_NOTE, vect_location,
2112 "Final SLP tree for instance:\n");
2113 vect_print_slp_tree (MSG_NOTE, vect_location, node);
2114 }
2115
2116 return true;
2117 }
2118 }
2119 else
2120 {
2121 /* Failed to SLP. */
2122 /* Free the allocated memory. */
2123 scalar_stmts.release ();
2124 }
2125
2126 /* For basic block SLP, try to break the group up into multiples of the
2127 vector size. */
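/* Worked example (hypothetical numbers): for a store group of
   GROUP_SIZE 7 with CONST_NUNITS 4, a first mismatch at i == 4 splits
   off stmts 0..3 as one group and re-analyzes stmts 4..6 separately.
   Had the first mismatch been mid-vector, at i == 5, GROUP1_SIZE would
   still be 5 & ~3 == 4 and the rest of that vector would be skipped.  */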
2128 unsigned HOST_WIDE_INT const_nunits;
2129 if (is_a <bb_vec_info> (vinfo)
2130 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2131 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2132 && nunits.is_constant (&const_nunits))
2133 {
2134 /* We consider breaking the group only on VF boundaries from the existing
2135 start. */
2136 for (i = 0; i < group_size; i++)
2137 if (!matches[i]) break;
2138
2139 if (i >= const_nunits && i < group_size)
2140 {
2141 /* Split into two groups at the first vector boundary before i. */
2142 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2143 unsigned group1_size = i & ~(const_nunits - 1);
2144
2145 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2146 group1_size);
2147 bool res = vect_analyze_slp_instance (vinfo, stmt_info,
2148 max_tree_size);
2149 /* If the first non-match was in the middle of a vector,
2150 skip the rest of that vector. */
2151 if (group1_size < i)
2152 {
2153 i = group1_size + const_nunits;
2154 if (i < group_size)
2155 rest = vect_split_slp_store_group (rest, const_nunits);
2156 }
2157 if (i < group_size)
2158 res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size);
2159 return res;
2160 }
2161 /* Even though the first vector did not all match, we might be able to SLP
2162 (some) of the remainder. FORNOW ignore this possibility. */
2163 }
2164
2165 return false;
2166 }
2167
2168
2169 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2170 trees of packed scalar stmts if SLP is possible. */
2171
2172 opt_result
2173 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2174 {
2175 unsigned int i;
2176 stmt_vec_info first_element;
2177
2178 DUMP_VECT_SCOPE ("vect_analyze_slp");
2179
2180 /* Find SLP sequences starting from groups of grouped stores. */
2181 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2182 vect_analyze_slp_instance (vinfo, first_element, max_tree_size);
2183
2184 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2185 {
2186 if (loop_vinfo->reduction_chains.length () > 0)
2187 {
2188 /* Find SLP sequences starting from reduction chains. */
2189 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2190 if (! vect_analyze_slp_instance (vinfo, first_element,
2191 max_tree_size))
2192 {
2193 /* Dissolve reduction chain group. */
2194 stmt_vec_info vinfo = first_element;
2195 while (vinfo)
2196 {
2197 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2198 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2199 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2200 vinfo = next;
2201 }
2202 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2203 }
2204 }
2205
2206 /* Find SLP sequences starting from groups of reductions. */
2207 if (loop_vinfo->reductions.length () > 1)
2208 vect_analyze_slp_instance (vinfo, loop_vinfo->reductions[0],
2209 max_tree_size);
2210 }
2211
2212 return opt_result::success ();
2213 }
2214
2215
2216 /* For each possible SLP instance decide whether to SLP it and calculate
2217 the overall unrolling factor needed to SLP the loop. Return TRUE if we
2218 decided to SLP at least one instance. */
2219
2220 bool
2221 vect_make_slp_decision (loop_vec_info loop_vinfo)
2222 {
2223 unsigned int i;
2224 poly_uint64 unrolling_factor = 1;
2225 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2226 slp_instance instance;
2227 int decided_to_slp = 0;
2228
2229 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2230
2231 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2232 {
2233 /* FORNOW: SLP if you can. */
2234 /* All unroll factors have the form current_vector_size * X for some
2235 rational X, so they must have a common multiple. */
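/* E.g. (hypothetical factors): instances with unrolling factors 2 and
   3 combine to force_common_multiple (2, 3) == 6 for the loop.  */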
2236 unrolling_factor
2237 = force_common_multiple (unrolling_factor,
2238 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2239
2240 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2241 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2242 loop-based vectorization. Such stmts will be marked as HYBRID. */
2243 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2244 decided_to_slp++;
2245 }
2246
2247 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2248
2249 if (decided_to_slp && dump_enabled_p ())
2250 {
2251 dump_printf_loc (MSG_NOTE, vect_location,
2252 "Decided to SLP %d instances. Unrolling factor ",
2253 decided_to_slp);
2254 dump_dec (MSG_NOTE, unrolling_factor);
2255 dump_printf (MSG_NOTE, "\n");
2256 }
2257
2258 return (decided_to_slp > 0);
2259 }
2260
2261
2262 /* Private data for vect_detect_hybrid_slp. */
2263 struct vdhs_data
2264 {
2265 loop_vec_info loop_vinfo;
2266 vec<stmt_vec_info> *worklist;
2267 };
2268
2269 /* Walker for walk_gimple_op. */
2270
2271 static tree
2272 vect_detect_hybrid_slp (tree *tp, int *, void *data)
2273 {
2274 walk_stmt_info *wi = (walk_stmt_info *)data;
2275 vdhs_data *dat = (vdhs_data *)wi->info;
2276
2277 if (wi->is_lhs)
2278 return NULL_TREE;
2279
2280 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
2281 if (!def_stmt_info)
2282 return NULL_TREE;
2283 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
2284 if (PURE_SLP_STMT (def_stmt_info))
2285 {
2286 if (dump_enabled_p ())
2287 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2288 def_stmt_info->stmt);
2289 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2290 dat->worklist->safe_push (def_stmt_info);
2291 }
2292
2293 return NULL_TREE;
2294 }
2295
2296 /* Find stmts that must be both vectorized and SLPed. */
2297
2298 void
2299 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2300 {
2301 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2302
2303 /* All stmts participating in SLP are marked pure_slp, all other
2304 stmts are loop_vect.
2305 First collect all loop_vect stmts into a worklist. */
2306 auto_vec<stmt_vec_info> worklist;
2307 for (unsigned i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2308 {
2309 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2310 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2311 gsi_next (&gsi))
2312 {
2313 gphi *phi = gsi.phi ();
2314 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
2315 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2316 worklist.safe_push (stmt_info);
2317 }
2318 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2319 gsi_next (&gsi))
2320 {
2321 gimple *stmt = gsi_stmt (gsi);
2322 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2323 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2324 {
2325 for (gimple_stmt_iterator gsi2
2326 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2327 !gsi_end_p (gsi2); gsi_next (&gsi2))
2328 {
2329 stmt_vec_info patt_info
2330 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
2331 if (!STMT_SLP_TYPE (patt_info)
2332 && STMT_VINFO_RELEVANT (patt_info))
2333 worklist.safe_push (patt_info);
2334 }
2335 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2336 }
2337 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
2338 worklist.safe_push (stmt_info);
2339 }
2340 }
2341
2342 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
2343 mark any SLP vectorized stmt as hybrid.
2344 ??? We're visiting def stmts N times (once for each non-SLP and
2345 once for each hybrid-SLP use). */
2346 walk_stmt_info wi;
2347 vdhs_data dat;
2348 dat.worklist = &worklist;
2349 dat.loop_vinfo = loop_vinfo;
2350 memset (&wi, 0, sizeof (wi));
2351 wi.info = (void *)&dat;
2352 while (!worklist.is_empty ())
2353 {
2354 stmt_vec_info stmt_info = worklist.pop ();
2355 /* Since SSA operands are not set up for pattern stmts we need
2356 to use walk_gimple_op. */
2357 wi.is_lhs = 0;
2358 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
2359 }
2360 }
2361
2362
2363 /* Initialize a bb_vec_info struct for the statements between
2364 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2365
2366 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2367 gimple_stmt_iterator region_end_in,
2368 vec_info_shared *shared)
2369 : vec_info (vec_info::bb, init_cost (NULL), shared),
2370 bb (gsi_bb (region_begin_in)),
2371 region_begin (region_begin_in),
2372 region_end (region_end_in)
2373 {
2374 gimple_stmt_iterator gsi;
2375
2376 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2377 gsi_next (&gsi))
2378 {
2379 gimple *stmt = gsi_stmt (gsi);
2380 gimple_set_uid (stmt, 0);
2381 add_stmt (stmt);
2382 }
2383
2384 bb->aux = this;
2385 }
2386
2387
2388 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2389 stmts in the basic block. */
2390
2391 _bb_vec_info::~_bb_vec_info ()
2392 {
2393 for (gimple_stmt_iterator si = region_begin;
2394 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2395 /* Reset region marker. */
2396 gimple_set_uid (gsi_stmt (si), -1);
2397
2398 bb->aux = NULL;
2399 }
2400
2401 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2402 given that child nodes have already been processed, and that
2403 their def types currently match their SLP node's def type. */
2404
2405 static bool
2406 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2407 slp_instance node_instance,
2408 stmt_vector_for_cost *cost_vec)
2409 {
2410 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2411 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2412
2413 /* For BB vectorization vector types are assigned here.
2414 Memory accesses already got their vector type assigned
2415 in vect_analyze_data_refs. */
2416 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2417 if (bb_vinfo
2418 && ! STMT_VINFO_DATA_REF (stmt_info))
2419 {
2420 tree vectype, nunits_vectype;
2421 if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
2422 &nunits_vectype))
2423 /* We checked this when building the node. */
2424 gcc_unreachable ();
2425 if (vectype == boolean_type_node)
2426 {
2427 vectype = vect_get_mask_type_for_stmt (stmt_info);
2428 if (!vectype)
2429 /* vect_get_mask_type_for_stmt has already explained the
2430 failure. */
2431 return false;
2432 }
2433
2434 stmt_vec_info sstmt_info;
2435 unsigned int i;
2436 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, sstmt_info)
2437 STMT_VINFO_VECTYPE (sstmt_info) = vectype;
2438 }
2439
2440 /* Calculate the number of vector statements to be created for the
2441 scalar stmts in this node. For SLP reductions it is equal to the
2442 number of vector statements in the children (which has already been
2443 calculated by the recursive call). Otherwise it is the number of
2444 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2445 VF divided by the number of elements in a vector. */
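/* For example (illustrative numbers): with VF 4, a DR_GROUP_SIZE of 2
   and 8-element vectors this is vect_get_num_vectors (4 * 2, vectype)
   == 1 vector statement per node.  */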
2446 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2447 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2448 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2449 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
2450 else
2451 {
2452 poly_uint64 vf;
2453 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2454 vf = loop_vinfo->vectorization_factor;
2455 else
2456 vf = 1;
2457 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2458 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2459 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2460 = vect_get_num_vectors (vf * group_size, vectype);
2461 }
2462
2463 bool dummy;
2464 return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec);
2465 }
2466
2467 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2468 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
2469
2470 Return true if the operations are supported. */
2471
2472 static bool
2473 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2474 slp_instance node_instance,
2475 scalar_stmts_to_slp_tree_map_t *visited,
2476 scalar_stmts_to_slp_tree_map_t *lvisited,
2477 stmt_vector_for_cost *cost_vec)
2478 {
2479 int i, j;
2480 slp_tree child;
2481
2482 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2483 return true;
2484
2485 /* If we already analyzed the exact same set of scalar stmts we're done.
2486 We share the generated vector stmts for those. */
2487 slp_tree *leader;
2488 if ((leader = visited->get (SLP_TREE_SCALAR_STMTS (node)))
2489 || (leader = lvisited->get (SLP_TREE_SCALAR_STMTS (node))))
2490 {
2491 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2492 = SLP_TREE_NUMBER_OF_VEC_STMTS (*leader);
2493 return true;
2494 }
2495
2496 /* The SLP graph is acyclic so not caching whether we failed or succeeded
2497 doesn't result in any issue since we throw away the lvisited set
2498 when we fail. */
2499 lvisited->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
2500
2501 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2502 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2503 visited, lvisited, cost_vec))
2504 return false;
2505
2506 /* ??? We have to catch the case late where two first scalar stmts appear
2507 in multiple SLP children with different def type and fail. Remember
2508 original def types first since SLP_TREE_DEF_TYPE doesn't necessarily
2509 match it when that is vect_internal_def. */
2510 auto_vec<vect_def_type, 4> dt;
2511 dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2512 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2513 dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]);
2514
2515 /* Push SLP node def-type to stmt operands. */
2516 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2517 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2518 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2519 = SLP_TREE_DEF_TYPE (child);
2520
2521 /* Check everything worked out. */
2522 bool res = true;
2523 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2524 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2525 {
2526 if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2527 != SLP_TREE_DEF_TYPE (child))
2528 res = false;
2529 }
2530 else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) != dt[j])
2531 res = false;
2532 if (!res && dump_enabled_p ())
2533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2534 "not vectorized: same operand with different "
2535 "def type in stmt.\n");
2536
2537 if (res)
2538 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2539 cost_vec);
2540
2541 /* Restore def-types. */
2542 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2543 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
2544
2545 return res;
2546 }
2547
2548
2549 /* Analyze statements in SLP instances of VINFO. Return true if the
2550 operations are supported. */
2551
2552 bool
2553 vect_slp_analyze_operations (vec_info *vinfo)
2554 {
2555 slp_instance instance;
2556 int i;
2557
2558 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2559
2560 scalar_stmts_to_slp_tree_map_t *visited
2561 = new scalar_stmts_to_slp_tree_map_t ();
2562 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2563 {
2564 scalar_stmts_to_slp_tree_map_t lvisited;
2565 stmt_vector_for_cost cost_vec;
2566 cost_vec.create (2);
2567 if (!vect_slp_analyze_node_operations (vinfo,
2568 SLP_INSTANCE_TREE (instance),
2569 instance, visited, &lvisited,
2570 &cost_vec))
2571 {
2572 slp_tree node = SLP_INSTANCE_TREE (instance);
2573 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2574 if (dump_enabled_p ())
2575 dump_printf_loc (MSG_NOTE, vect_location,
2576 "removing SLP instance operations starting from: %G",
2577 stmt_info->stmt);
2578 vect_free_slp_instance (instance, false);
2579 vinfo->slp_instances.ordered_remove (i);
2580 cost_vec.release ();
2581 }
2582 else
2583 {
2584 for (scalar_stmts_to_slp_tree_map_t::iterator x = lvisited.begin();
2585 x != lvisited.end(); ++x)
2586 visited->put ((*x).first.copy (), (*x).second);
2587 i++;
2588
2589 add_stmt_costs (vinfo->target_cost_data, &cost_vec);
2590 cost_vec.release ();
2591 }
2592 }
2593 delete visited;
2594
2595 return !vinfo->slp_instances.is_empty ();
2596 }
2597
2598
2599 /* Compute the scalar cost of the SLP node NODE and its children
2600 and record it in COST_VEC. Do not account defs that are marked
2601 in LIFE and update LIFE according to uses of NODE. */
2602
2603 static void
2604 vect_bb_slp_scalar_cost (basic_block bb,
2605 slp_tree node, vec<bool, va_heap> *life,
2606 stmt_vector_for_cost *cost_vec,
2607 hash_set<slp_tree> &visited)
2608 {
2609 unsigned i;
2610 stmt_vec_info stmt_info;
2611 slp_tree child;
2612
2613 if (visited.add (node))
2614 return;
2615
2616 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2617 {
2618 gimple *stmt = stmt_info->stmt;
2619 vec_info *vinfo = stmt_info->vinfo;
2620 ssa_op_iter op_iter;
2621 def_operand_p def_p;
2622
2623 if ((*life)[i])
2624 continue;
2625
2626 /* If there is a non-vectorized use of the defs then the scalar
2627 stmt is kept live in which case we do not account it or any
2628 required defs in the SLP children in the scalar cost. This
2629 way we make the vectorization more costly when compared to
2630 the scalar cost. */
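/* Hypothetical example:

     tmp = a[0] + 1;   <-- scalar stmt in the SLP node
     foo (tmp);        <-- non-vectorized use outside the SLP graph

   the add is kept live by the call, so it is skipped below and its
   cost stays out of the scalar cost of the vectorized part.  */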
2631 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
2632 {
2633 imm_use_iterator use_iter;
2634 gimple *use_stmt;
2635 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
2636 if (!is_gimple_debug (use_stmt))
2637 {
2638 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
2639 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
2640 {
2641 (*life)[i] = true;
2642 BREAK_FROM_IMM_USE_STMT (use_iter);
2643 }
2644 }
2645 }
2646 if ((*life)[i])
2647 continue;
2648
2649 /* Count scalar stmts only once. */
2650 if (gimple_visited_p (stmt))
2651 continue;
2652 gimple_set_visited (stmt, true);
2653
2654 vect_cost_for_stmt kind;
2655 if (STMT_VINFO_DATA_REF (stmt_info))
2656 {
2657 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2658 kind = scalar_load;
2659 else
2660 kind = scalar_store;
2661 }
2662 else
2663 kind = scalar_stmt;
2664 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
2665 }
2666
2667 auto_vec<bool, 20> subtree_life;
2668 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2669 {
2670 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2671 {
2672 /* Do not directly pass LIFE to the recursive call, copy it to
2673 confine changes in the callee to the current child/subtree. */
2674 subtree_life.safe_splice (*life);
2675 vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec,
2676 visited);
2677 subtree_life.truncate (0);
2678 }
2679 }
2680 }
2681
2682 static void
2683 vect_bb_slp_scalar_cost (basic_block bb,
2684 slp_tree node, vec<bool, va_heap> *life,
2685 stmt_vector_for_cost *cost_vec)
2686 {
2687 hash_set<slp_tree> visited;
2688 vect_bb_slp_scalar_cost (bb, node, life, cost_vec, visited);
2689 }
2690
2691 /* Check if vectorization of the basic block is profitable. */
2692
2693 static bool
2694 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
2695 {
2696 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
2697 slp_instance instance;
2698 int i;
2699 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
2700 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
2701
2702 /* Calculate scalar cost. */
2703 stmt_vector_for_cost scalar_costs;
2704 scalar_costs.create (0);
2705 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2706 {
2707 auto_vec<bool, 20> life;
2708 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
2709 vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
2710 SLP_INSTANCE_TREE (instance),
2711 &life, &scalar_costs);
2712 }
2713 void *target_cost_data = init_cost (NULL);
2714 add_stmt_costs (target_cost_data, &scalar_costs);
2715 scalar_costs.release ();
2716 unsigned dummy;
2717 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
2718 destroy_cost_data (target_cost_data);
2719
2720 /* Unset visited flag. */
2721 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
2722 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
2723 gimple_set_visited (gsi_stmt (gsi), false);
2724
2725 /* Complete the target-specific cost calculation. */
2726 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
2727 &vec_inside_cost, &vec_epilogue_cost);
2728
2729 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
2730
2731 if (dump_enabled_p ())
2732 {
2733 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2734 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
2735 vec_inside_cost);
2736 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
2737 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
2738 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
2739 }
2740
2741 /* Vectorization is profitable if its cost is no more than the cost of the
2742 scalar version. Note that we err on the vector side for equal cost because
2743 the cost estimate is otherwise quite pessimistic (constant uses are
2744 free on the scalar side but cost a load on the vector side for
2745 example). */
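/* E.g. (made-up costs): vec_inside_cost 4 plus vec_outside_cost 2
   against scalar_cost 6 is considered profitable because ties go to
   the vector side; 4 plus 3 against 6 is not.  */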
2746 if (vec_outside_cost + vec_inside_cost > scalar_cost)
2747 return false;
2748
2749 return true;
2750 }
2751
2752 /* Check if the basic block can be vectorized. Returns a bb_vec_info
2753 if so and sets fatal to true if failure is independent of
2754 current_vector_size. */
2755
2756 static bb_vec_info
2757 vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
2758 gimple_stmt_iterator region_end,
2759 vec<data_reference_p> datarefs, int n_stmts,
2760 bool &fatal, vec_info_shared *shared)
2761 {
2762 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
2763
2764 bb_vec_info bb_vinfo;
2765 slp_instance instance;
2766 int i;
2767 poly_uint64 min_vf = 2;
2768
2769 /* The first group of checks is independent of the vector size. */
2770 fatal = true;
2771
2772 if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
2773 {
2774 if (dump_enabled_p ())
2775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2776 "not vectorized: too many instructions in "
2777 "basic block.\n");
2778 free_data_refs (datarefs);
2779 return NULL;
2780 }
2781
2782 bb_vinfo = new _bb_vec_info (region_begin, region_end, shared);
2783 if (!bb_vinfo)
2784 return NULL;
2785
2786 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
2787 bb_vinfo->shared->save_datarefs ();
2788
2789 /* Analyze the data references. */
2790
2791 if (!vect_analyze_data_refs (bb_vinfo, &min_vf))
2792 {
2793 if (dump_enabled_p ())
2794 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2795 "not vectorized: unhandled data-ref in basic "
2796 "block.\n");
2797
2798 delete bb_vinfo;
2799 return NULL;
2800 }
2801
2802 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
2803 {
2804 if (dump_enabled_p ())
2805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2806 "not vectorized: not enough data-refs in "
2807 "basic block.\n");
2808
2809 delete bb_vinfo;
2810 return NULL;
2811 }
2812
2813 if (!vect_analyze_data_ref_accesses (bb_vinfo))
2814 {
2815 if (dump_enabled_p ())
2816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2817 "not vectorized: unhandled data access in "
2818 "basic block.\n");
2819
2820 delete bb_vinfo;
2821 return NULL;
2822 }
2823
2824 /* If there are no grouped stores in the region there is no need
2825 to continue with pattern recog as vect_analyze_slp will fail
2826 anyway. */
2827 if (bb_vinfo->grouped_stores.is_empty ())
2828 {
2829 if (dump_enabled_p ())
2830 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2831 "not vectorized: no grouped stores in "
2832 "basic block.\n");
2833
2834 delete bb_vinfo;
2835 return NULL;
2836 }
2837
2838 /* The rest of the analysis below depends on the vector size in some way. */
2839 fatal = false;
2840
2841 vect_pattern_recog (bb_vinfo);
2842
2843 /* Check the SLP opportunities in the basic block, analyze and build SLP
2844 trees. */
2845 if (!vect_analyze_slp (bb_vinfo, n_stmts))
2846 {
2847 if (dump_enabled_p ())
2848 {
2849 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2850 "Failed to SLP the basic block.\n");
2851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2852 "not vectorized: failed to find SLP opportunities "
2853 "in basic block.\n");
2854 }
2855
2856 delete bb_vinfo;
2857 return NULL;
2858 }
2859
2860 vect_record_base_alignments (bb_vinfo);
2861
2862 /* Analyze and verify the alignment of data references and the
2863 dependence in the SLP instances. */
2864 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
2865 {
2866 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
2867 || ! vect_slp_analyze_instance_dependence (instance))
2868 {
2869 slp_tree node = SLP_INSTANCE_TREE (instance);
2870 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2871 if (dump_enabled_p ())
2872 dump_printf_loc (MSG_NOTE, vect_location,
2873 "removing SLP instance operations starting from: %G",
2874 stmt_info->stmt);
2875 vect_free_slp_instance (instance, false);
2876 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
2877 continue;
2878 }
2879
2880 /* Mark all the statements that we want to vectorize as pure SLP and
2881 relevant. */
2882 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2883 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
2884
2885 i++;
2886 }
2887 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
2888 {
2889 delete bb_vinfo;
2890 return NULL;
2891 }
2892
2893 if (!vect_slp_analyze_operations (bb_vinfo))
2894 {
2895 if (dump_enabled_p ())
2896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2897 "not vectorized: bad operation in basic block.\n");
2898
2899 delete bb_vinfo;
2900 return NULL;
2901 }
2902
2903 /* Cost model: check if the vectorization is worthwhile. */
2904 if (!unlimited_cost_model (NULL)
2905 && !vect_bb_vectorization_profitable_p (bb_vinfo))
2906 {
2907 if (dump_enabled_p ())
2908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2909 "not vectorized: vectorization is not "
2910 "profitable.\n");
2911
2912 delete bb_vinfo;
2913 return NULL;
2914 }
2915
2916 if (dump_enabled_p ())
2917 dump_printf_loc (MSG_NOTE, vect_location,
2918 "Basic block will be vectorized using SLP\n");
2919
2920 return bb_vinfo;
2921 }
2922
2923
2924 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
2925 true if anything in the basic-block was vectorized. */
2926
2927 bool
2928 vect_slp_bb (basic_block bb)
2929 {
2930 bb_vec_info bb_vinfo;
2931 gimple_stmt_iterator gsi;
2932 bool any_vectorized = false;
2933 auto_vector_sizes vector_sizes;
2934
2935 /* Autodetect first vector size we try. */
2936 current_vector_size = 0;
2937 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2938 unsigned int next_size = 0;
2939
2940 gsi = gsi_start_bb (bb);
2941
2942 poly_uint64 autodetected_vector_size = 0;
2943 while (1)
2944 {
2945 if (gsi_end_p (gsi))
2946 break;
2947
2948 gimple_stmt_iterator region_begin = gsi;
2949 vec<data_reference_p> datarefs = vNULL;
2950 int insns = 0;
2951
2952 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2953 {
2954 gimple *stmt = gsi_stmt (gsi);
2955 if (is_gimple_debug (stmt))
2956 continue;
2957 insns++;
2958
2959 if (gimple_location (stmt) != UNKNOWN_LOCATION)
2960 vect_location = stmt;
2961
2962 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
2963 break;
2964 }
2965
2966 /* Skip leading unhandled stmts. */
2967 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
2968 {
2969 gsi_next (&gsi);
2970 continue;
2971 }
2972
2973 gimple_stmt_iterator region_end = gsi;
2974
2975 bool vectorized = false;
2976 bool fatal = false;
2977 vec_info_shared shared;
2978 bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
2979 datarefs, insns, fatal, &shared);
2980 if (bb_vinfo
2981 && dbg_cnt (vect_slp))
2982 {
2983 if (dump_enabled_p ())
2984 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
2985
2986 bb_vinfo->shared->check_datarefs ();
2987 vect_schedule_slp (bb_vinfo);
2988
2989 unsigned HOST_WIDE_INT bytes;
2990 if (dump_enabled_p ())
2991 {
2992 if (current_vector_size.is_constant (&bytes))
2993 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
2994 "basic block part vectorized using %wu byte "
2995 "vectors\n", bytes);
2996 else
2997 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
2998 "basic block part vectorized using variable "
2999 "length vectors\n");
3000 }
3001
3002 vectorized = true;
3003 }
3004 delete bb_vinfo;
3005
3006 any_vectorized |= vectorized;
3007
3008 if (next_size == 0)
3009 autodetected_vector_size = current_vector_size;
3010
3011 if (next_size < vector_sizes.length ()
3012 && known_eq (vector_sizes[next_size], autodetected_vector_size))
3013 next_size += 1;
3014
3015 if (vectorized
3016 || next_size == vector_sizes.length ()
3017 || known_eq (current_vector_size, 0U)
3018 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3019 vector sizes will fail do not bother iterating. */
3020 || fatal)
3021 {
3022 if (gsi_end_p (region_end))
3023 break;
3024
3025 /* Skip the unhandled stmt. */
3026 gsi_next (&gsi);
3027
3028 /* And reset vector sizes. */
3029 current_vector_size = 0;
3030 next_size = 0;
3031 }
3032 else
3033 {
3034 /* Try the next biggest vector size. */
3035 current_vector_size = vector_sizes[next_size++];
3036 if (dump_enabled_p ())
3037 {
3038 dump_printf_loc (MSG_NOTE, vect_location,
3039 "***** Re-trying analysis with "
3040 "vector size ");
3041 dump_dec (MSG_NOTE, current_vector_size);
3042 dump_printf (MSG_NOTE, "\n");
3043 }
3044
3045 /* Start over. */
3046 gsi = region_begin;
3047 }
3048 }
3049
3050 return any_vectorized;
3051 }
3052
3053
3054 /* Return true if the constant mask operand of STMT_VINFO needs a boolean vector type. */
3055
3056 static bool
3057 vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo)
3058 {
3059 enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
3060 tree op, vectype;
3061 enum vect_def_type dt;
3062
3063 /* For comparisons and COND_EXPR the type is chosen depending
3064 on the other, non-constant comparison operand. */
3065 if (TREE_CODE_CLASS (code) == tcc_comparison)
3066 {
3067 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3068 op = gimple_assign_rhs1 (stmt);
3069
3070 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3071 gcc_unreachable ();
3072
3073 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3074 }
3075
3076 if (code == COND_EXPR)
3077 {
3078 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3079 tree cond = gimple_assign_rhs1 (stmt);
3080
3081 if (TREE_CODE (cond) == SSA_NAME)
3082 op = cond;
3083 else
3084 op = TREE_OPERAND (cond, 0);
3085
3086 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3087 gcc_unreachable ();
3088
3089 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3090 }
3091
3092 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3093 }
3094
3095 /* Build a variable-length vector in which the elements in ELTS are repeated
3096 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3097 RESULTS and add any new instructions to SEQ.
3098
3099 The approach we use is:
3100
3101 (1) Find a vector mode VM with integer elements of mode IM.
3102
3103 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3104 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3105 from small vectors to IM.
3106
3107 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3108
3109 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3110 correct byte contents.
3111
3112 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3113
3114 We try to find the largest IM for which this sequence works, in order
3115 to cut down on the number of interleaves. */
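/* Worked example (hypothetical modes): to repeat four HImode elements
   {a, b, c, d} across VNx8HI vectors, step (1) might pick VM = VNx2DI
   with IM = DImode; step (2) view-converts {a, b} and {c, d} into two
   DImode scalars AB and CD; step (3) broadcasts them to {AB, AB, ...}
   and {CD, CD, ...}; step (4) interleaves those into
   {AB, CD, AB, CD, ...}; and step (5) view-converts the result back
   to VNx8HI, i.e. {a, b, c, d, a, b, c, d, ...}.  */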
3116
3117 void
3118 duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
3119 unsigned int nresults, vec<tree> &results)
3120 {
3121 unsigned int nelts = elts.length ();
3122 tree element_type = TREE_TYPE (vector_type);
3123
3124 /* (1) Find a vector mode VM with integer elements of mode IM. */
3125 unsigned int nvectors = 1;
3126 tree new_vector_type;
3127 tree permutes[2];
3128 if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
3129 &nvectors, &new_vector_type,
3130 permutes))
3131 gcc_unreachable ();
3132
3133 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3134 unsigned int partial_nelts = nelts / nvectors;
3135 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3136
3137 tree_vector_builder partial_elts;
3138 auto_vec<tree, 32> pieces (nvectors * 2);
3139 pieces.quick_grow (nvectors * 2);
3140 for (unsigned int i = 0; i < nvectors; ++i)
3141 {
3142 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3143 ELTS' has mode IM. */
3144 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3145 for (unsigned int j = 0; j < partial_nelts; ++j)
3146 partial_elts.quick_push (elts[i * partial_nelts + j]);
3147 tree t = gimple_build_vector (seq, &partial_elts);
3148 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3149 TREE_TYPE (new_vector_type), t);
3150
3151 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3152 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3153 }
3154
3155 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3156 correct byte contents.
3157
3158 We need to repeat the following operation log2(nvectors) times:
3159
3160 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3161 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3162
3163 However, if each input repeats every N elements and the VF is
3164 a multiple of N * 2, the HI result is the same as the LO. */
3165 unsigned int in_start = 0;
3166 unsigned int out_start = nvectors;
3167 unsigned int hi_start = nvectors / 2;
3168 /* A bound on the number of outputs needed to produce NRESULTS results
3169 in the final iteration. */
3170 unsigned int noutputs_bound = nvectors * nresults;
3171 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3172 {
3173 noutputs_bound /= 2;
3174 unsigned int limit = MIN (noutputs_bound, nvectors);
3175 for (unsigned int i = 0; i < limit; ++i)
3176 {
3177 if ((i & 1) != 0
3178 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3179 2 * in_repeat))
3180 {
3181 pieces[out_start + i] = pieces[out_start + i - 1];
3182 continue;
3183 }
3184
3185 tree output = make_ssa_name (new_vector_type);
3186 tree input1 = pieces[in_start + (i / 2)];
3187 tree input2 = pieces[in_start + (i / 2) + hi_start];
3188 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3189 input1, input2,
3190 permutes[i & 1]);
3191 gimple_seq_add_stmt (seq, stmt);
3192 pieces[out_start + i] = output;
3193 }
3194 std::swap (in_start, out_start);
3195 }
3196
3197 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3198 results.reserve (nresults);
3199 for (unsigned int i = 0; i < nresults; ++i)
3200 if (i < nvectors)
3201 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3202 pieces[in_start + i]));
3203 else
3204 results.quick_push (results[i - nvectors]);
3205 }
3206
3207
3208 /* For constant and loop invariant defs of SLP_NODE this function returns
3209 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3210 OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
3211 scalar stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
3214
3215 static void
3216 vect_get_constant_vectors (tree op, slp_tree slp_node,
3217 vec<tree> *vec_oprnds,
3218 unsigned int op_num, unsigned int number_of_vectors)
3219 {
3220 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3221 stmt_vec_info stmt_vinfo = stmts[0];
3222 gimple *stmt = stmt_vinfo->stmt;
3223 unsigned HOST_WIDE_INT nunits;
3224 tree vec_cst;
3225 unsigned j, number_of_places_left_in_vector;
3226 tree vector_type;
3227 tree vop;
3228 int group_size = stmts.length ();
3229 unsigned int vec_num, i;
3230 unsigned number_of_copies = 1;
3231 vec<tree> voprnds;
3232 voprnds.create (number_of_vectors);
3233 bool constant_p, is_store;
3234 tree neutral_op = NULL;
3235 enum tree_code code = gimple_expr_code (stmt);
3236 gimple_seq ctor_seq = NULL;
3237 auto_vec<tree, 16> permute_results;
3238
3239 /* Check if vector type is a boolean vector. */
3240 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3241 && vect_mask_constant_operand_p (stmt_vinfo))
3242 vector_type
3243 = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
3244 else
3245 vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
3246
3247 if (STMT_VINFO_DATA_REF (stmt_vinfo))
3248 {
3249 is_store = true;
3250 op = gimple_assign_rhs1 (stmt);
3251 }
3252 else
3253 is_store = false;
3254
3255 gcc_assert (op);
3256
3257 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3258 created vectors. It is greater than 1 if unrolling is performed.
3259
3260 For example, we have two scalar operands, s1 and s2 (e.g., group of
3261 strided accesses of size two), while NUNITS is four (i.e., four scalars
3262 of this type can be packed in a vector). The output vector will contain
3263 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3264 will be 2).
3265
3266 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3267 containing the operands.
3268
3269 For example, NUNITS is four as before, and the group size is 8
3270 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3271 {s5, s6, s7, s8}. */
3272
3273 /* When using duplicate_and_interleave, we just need one element for
3274 each scalar statement. */
3275 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3276 nunits = group_size;
3277
3278 number_of_copies = nunits * number_of_vectors / group_size;
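/* E.g. (illustrative): NUNITS 4, NUMBER_OF_VECTORS 1 and GROUP_SIZE 2
   give NUMBER_OF_COPIES 2, producing {s1, s2, s1, s2} as in the
   example above.  */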
3279
3280 number_of_places_left_in_vector = nunits;
3281 constant_p = true;
3282 tree_vector_builder elts (vector_type, nunits, 1);
3283 elts.quick_grow (nunits);
3284 bool place_after_defs = false;
3285 for (j = 0; j < number_of_copies; j++)
3286 {
3287 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
3288 {
3289 stmt = stmt_vinfo->stmt;
3290 if (is_store)
3291 op = gimple_assign_rhs1 (stmt);
3292 else
3293 {
3294 switch (code)
3295 {
3296 case COND_EXPR:
3297 {
3298 tree cond = gimple_assign_rhs1 (stmt);
3299 if (TREE_CODE (cond) == SSA_NAME)
3300 op = gimple_op (stmt, op_num + 1);
3301 else if (op_num == 0 || op_num == 1)
3302 op = TREE_OPERAND (cond, op_num);
3303 else
3304 {
3305 if (op_num == 2)
3306 op = gimple_assign_rhs2 (stmt);
3307 else
3308 op = gimple_assign_rhs3 (stmt);
3309 }
3310 }
3311 break;
3312
3313 case CALL_EXPR:
3314 op = gimple_call_arg (stmt, op_num);
3315 break;
3316
3317 case LSHIFT_EXPR:
3318 case RSHIFT_EXPR:
3319 case LROTATE_EXPR:
3320 case RROTATE_EXPR:
3321 op = gimple_op (stmt, op_num + 1);
3322 /* Unlike the other binary operators, shifts/rotates have
3323 the shift count being int, instead of the same type as
3324 the lhs, so make sure the scalar is the right type if
3325 we are dealing with vectors of
3326 long long/long/short/char. */
3327 if (op_num == 1 && TREE_CODE (op) == INTEGER_CST)
3328 op = fold_convert (TREE_TYPE (vector_type), op);
3329 break;
3330
3331 default:
3332 op = gimple_op (stmt, op_num + 1);
3333 break;
3334 }
3335 }
3336
3337 /* Create 'vect_ = {op0,op1,...,opn}'. */
3338 number_of_places_left_in_vector--;
3339 tree orig_op = op;
3340 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3341 {
3342 if (CONSTANT_CLASS_P (op))
3343 {
3344 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3345 {
3346 /* Can't use VIEW_CONVERT_EXPR for booleans because
3347 of possibly different sizes of scalar value and
3348 vector element. */
3349 if (integer_zerop (op))
3350 op = build_int_cst (TREE_TYPE (vector_type), 0);
3351 else if (integer_onep (op))
3352 op = build_all_ones_cst (TREE_TYPE (vector_type));
3353 else
3354 gcc_unreachable ();
3355 }
3356 else
3357 op = fold_unary (VIEW_CONVERT_EXPR,
3358 TREE_TYPE (vector_type), op);
3359 gcc_assert (op && CONSTANT_CLASS_P (op));
3360 }
3361 else
3362 {
3363 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3364 gimple *init_stmt;
3365 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3366 {
3367 tree true_val
3368 = build_all_ones_cst (TREE_TYPE (vector_type));
3369 tree false_val
3370 = build_zero_cst (TREE_TYPE (vector_type));
3371 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3372 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3373 op, true_val,
3374 false_val);
3375 }
3376 else
3377 {
3378 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3379 op);
3380 init_stmt
3381 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3382 op);
3383 }
3384 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3385 op = new_temp;
3386 }
3387 }
3388 elts[number_of_places_left_in_vector] = op;
3389 if (!CONSTANT_CLASS_P (op))
3390 constant_p = false;
3391 if (TREE_CODE (orig_op) == SSA_NAME
3392 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3393 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3394 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3395 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3396 place_after_defs = true;
3397
3398 if (number_of_places_left_in_vector == 0)
3399 {
3400 if (constant_p
3401 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3402 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3403 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3404 else
3405 {
3406 if (vec_oprnds->is_empty ())
3407 duplicate_and_interleave (&ctor_seq, vector_type, elts,
3408 number_of_vectors,
3409 permute_results);
3410 vec_cst = permute_results[number_of_vectors - j - 1];
3411 }
3412 tree init;
3413 gimple_stmt_iterator gsi;
3414 if (place_after_defs)
3415 {
3416 stmt_vec_info last_stmt_info
3417 = vect_find_last_scalar_stmt_in_slp (slp_node);
3418 gsi = gsi_for_stmt (last_stmt_info->stmt);
3419 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3420 &gsi);
3421 }
3422 else
3423 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3424 NULL);
3425 if (ctor_seq != NULL)
3426 {
3427 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3428 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3429 ctor_seq = NULL;
3430 }
3431 voprnds.quick_push (init);
3432 place_after_defs = false;
3433 number_of_places_left_in_vector = nunits;
3434 constant_p = true;
3435 elts.new_vector (vector_type, nunits, 1);
3436 elts.quick_grow (nunits);
3437 }
3438 }
3439 }
3440
3441   /* Since the vectors were created in reverse order, reverse them here so
3442      that they end up in the original order.  */
3443 vec_num = voprnds.length ();
3444 for (j = vec_num; j != 0; j--)
3445 {
3446 vop = voprnds[j - 1];
3447 vec_oprnds->quick_push (vop);
3448 }
3449
3450 voprnds.release ();
3451
3452   /* If VF is greater than the unrolling factor needed for the SLP group of
3453      stmts, NUMBER_OF_VECTORS to be created is greater than
3454      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, so we have to
3455      replicate the vectors.  */
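  /* E.g. if four vectors are needed but only two could be built from the
     scalar stmts, push each of the two again; with a NEUTRAL_OP, push
     vectors of the neutral value instead.  */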
3456   tree neutral_vec = NULL_TREE;
3457   while (number_of_vectors > vec_oprnds->length ())
3458     {
3459
3460 if (neutral_op)
3461 {
3462 if (!neutral_vec)
3463 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3464
3465 vec_oprnds->quick_push (neutral_vec);
3466 }
3467 else
3468 {
3469 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3470 vec_oprnds->quick_push (vop);
3471 }
3472 }
3473 }
3474
3475
3476 /* Get vectorized definitions from SLP_NODE that contains corresponding
3477 vectorized def-stmts. */
3478
3479 static void
3480 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3481 {
3482 tree vec_oprnd;
3483 stmt_vec_info vec_def_stmt_info;
3484 unsigned int i;
3485
3486 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3487
3488 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
3489 {
3490 gcc_assert (vec_def_stmt_info);
3491 if (gphi *vec_def_phi = dyn_cast <gphi *> (vec_def_stmt_info->stmt))
3492 vec_oprnd = gimple_phi_result (vec_def_phi);
3493 else
3494 vec_oprnd = gimple_get_lhs (vec_def_stmt_info->stmt);
3495 vec_oprnds->quick_push (vec_oprnd);
3496 }
3497 }
3498
3499
3500 /* Get vectorized definitions for SLP_NODE.
3501    If the scalar definitions are loop invariants or constants, collect them
3502    and call vect_get_constant_vectors () to create vector stmts.
3503    Otherwise, the def-stmts must already be vectorized and the vectorized
3504    stmts stored in the corresponding child of SLP_NODE, in which case we
3505    call vect_get_slp_vect_defs () to retrieve them.  */
3506
3507 void
3508 vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
3509 vec<vec<tree> > *vec_oprnds)
3510 {
3511 int number_of_vects = 0, i;
3512 unsigned int child_index = 0;
3513 HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
3514 slp_tree child = NULL;
3515 vec<tree> vec_defs;
3516 tree oprnd;
3517 bool vectorized_defs;
3518
3519 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3520 FOR_EACH_VEC_ELT (ops, i, oprnd)
3521 {
3522       /* For each operand we check whether it has vectorized definitions in
3523 	 a child node or whether we need to create them (for invariants and
3524 	 constants).  We check if the LHS of the first stmt of the next child
3525 	 matches OPRND.  If it does, we found the correct child.  Otherwise,
3526 	 we call vect_get_constant_vectors () and do not advance CHILD_INDEX,
3527 	 so that this child node is checked again for the next operand.  */
3528 vectorized_defs = false;
3529 if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
3530 {
3531 child = SLP_TREE_CHILDREN (slp_node)[child_index];
3532
3533 /* We have to check both pattern and original def, if available. */
3534 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3535 {
3536 stmt_vec_info first_def_info = SLP_TREE_SCALAR_STMTS (child)[0];
3537 stmt_vec_info related = STMT_VINFO_RELATED_STMT (first_def_info);
3538 tree first_def_op;
3539
3540 if (gphi *first_def = dyn_cast <gphi *> (first_def_info->stmt))
3541 first_def_op = gimple_phi_result (first_def);
3542 else
3543 first_def_op = gimple_get_lhs (first_def_info->stmt);
3544 if (operand_equal_p (oprnd, first_def_op, 0)
3545 || (related
3546 && operand_equal_p (oprnd,
3547 gimple_get_lhs (related->stmt), 0)))
3548 {
3549 /* The number of vector defs is determined by the number of
3550 vector statements in the node from which we get those
3551 statements. */
3552 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
3553 vectorized_defs = true;
3554 child_index++;
3555 }
3556 }
3557 else
3558 child_index++;
3559 }
3560
3561 if (!vectorized_defs)
3562 {
3563 if (i == 0)
3564 {
3565 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3566 /* Number of vector stmts was calculated according to LHS in
3567 vect_schedule_slp_instance (), fix it by replacing LHS with
3568 RHS, if necessary. See vect_get_smallest_scalar_type () for
3569 details. */
3570 vect_get_smallest_scalar_type (first_stmt_info, &lhs_size_unit,
3571 &rhs_size_unit);
3572 if (rhs_size_unit != lhs_size_unit)
3573 {
3574 number_of_vects *= rhs_size_unit;
3575 number_of_vects /= lhs_size_unit;
3576 }
3577 }
3578 }
3579
3580 /* Allocate memory for vectorized defs. */
3581 vec_defs = vNULL;
3582 vec_defs.create (number_of_vects);
3583
3584 /* For reduction defs we call vect_get_constant_vectors (), since we are
3585 looking for initial loop invariant values. */
3586 if (vectorized_defs)
3587 /* The defs are already vectorized. */
3588 vect_get_slp_vect_defs (child, &vec_defs);
3589 else
3590 /* Build vectors from scalar defs. */
3591 vect_get_constant_vectors (oprnd, slp_node, &vec_defs, i,
3592 number_of_vects);
3593
3594 vec_oprnds->quick_push (vec_defs);
3595 }
3596 }
3597
3598 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3599 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3600 permute statements for the SLP node NODE of the SLP instance
3601 SLP_NODE_INSTANCE. */
3602
3603 bool
3604 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3605 gimple_stmt_iterator *gsi, poly_uint64 vf,
3606 slp_instance slp_node_instance, bool analyze_only,
3607 unsigned *n_perms)
3608 {
3609 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3610 vec_info *vinfo = stmt_info->vinfo;
3611 int vec_index = 0;
3612 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3613 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3614 unsigned int mask_element;
3615 machine_mode mode;
3616
3617 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3618 return false;
3619
3620 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3621
3622 mode = TYPE_MODE (vectype);
3623 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3624
3625 /* Initialize the vect stmts of NODE to properly insert the generated
3626 stmts later. */
3627 if (! analyze_only)
3628 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3629 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3630 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3631
3632 /* Generate permutation masks for every NODE. Number of masks for each NODE
3633 is equal to GROUP_SIZE.
3634 E.g., we have a group of three nodes with three loads from the same
3635      location in each node, and the vector size is 4.  I.e., we have an
3636      a0b0c0a1b1c1... sequence and we need to create the following vectors:
3637 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3638 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3639 ...
3640
3641 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3642      The last mask is illegal since we assume two operands for a permute
3643      operation, and the mask element values can't be outside that range.
3644 Hence, the last mask must be converted into {2,5,5,5}.
3645 For the first two permutations we need the first and the second input
3646 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3647 we need the second and the third vectors: {b1,c1,a2,b2} and
3648 {c2,a3,b3,c3}. */
3649
3650 int vect_stmts_counter = 0;
3651 unsigned int index = 0;
3652 int first_vec_index = -1;
3653 int second_vec_index = -1;
3654 bool noop_p = true;
3655 *n_perms = 0;
3656
3657 vec_perm_builder mask;
3658 unsigned int nelts_to_build;
3659 unsigned int nvectors_per_build;
3660 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3661 && multiple_p (nunits, group_size));
3662 if (repeating_p)
3663 {
3664 /* A single vector contains a whole number of copies of the node, so:
3665 (a) all permutes can use the same mask; and
3666 (b) the permutes only need a single vector input. */
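      /* For example, with GROUP_SIZE 2 and a load permutation {1, 0}, the
	 encoded mask comes out as {1, 0, 3, 2, 5, 4, ...}, swapping
	 adjacent pairs within each input vector.  */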
3667 mask.new_vector (nunits, group_size, 3);
3668 nelts_to_build = mask.encoded_nelts ();
3669 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
3670 }
3671 else
3672 {
3673 /* We need to construct a separate mask for each vector statement. */
3674 unsigned HOST_WIDE_INT const_nunits, const_vf;
3675 if (!nunits.is_constant (&const_nunits)
3676 || !vf.is_constant (&const_vf))
3677 return false;
3678 mask.new_vector (const_nunits, const_nunits, 1);
3679 nelts_to_build = const_vf * group_size;
3680 nvectors_per_build = 1;
3681 }
3682
3683 unsigned int count = mask.encoded_nelts ();
3684 mask.quick_grow (count);
3685 vec_perm_indices indices;
3686
3687 for (unsigned int j = 0; j < nelts_to_build; j++)
3688 {
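      /* Map output element J back to its position I in the interleaved
	 input sequence via the node's load permutation.  */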
3689 unsigned int iter_num = j / group_size;
3690 unsigned int stmt_num = j % group_size;
3691 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
3692 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
3693 if (repeating_p)
3694 {
3695 first_vec_index = 0;
3696 mask_element = i;
3697 }
3698 else
3699 {
3700 /* Enforced before the loop when !repeating_p. */
3701 unsigned int const_nunits = nunits.to_constant ();
3702 vec_index = i / const_nunits;
3703 mask_element = i % const_nunits;
3704 if (vec_index == first_vec_index
3705 || first_vec_index == -1)
3706 {
3707 first_vec_index = vec_index;
3708 }
3709 else if (vec_index == second_vec_index
3710 || second_vec_index == -1)
3711 {
3712 second_vec_index = vec_index;
3713 mask_element += const_nunits;
3714 }
3715 else
3716 {
3717 if (dump_enabled_p ())
3718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3719 "permutation requires at "
3720 "least three vectors %G",
3721 stmt_info->stmt);
3722 gcc_assert (analyze_only);
3723 return false;
3724 }
3725
3726 gcc_assert (mask_element < 2 * const_nunits);
3727 }
3728
3729 if (mask_element != index)
3730 noop_p = false;
3731 mask[index++] = mask_element;
3732
3733 if (index == count && !noop_p)
3734 {
3735 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
3736 if (!can_vec_perm_const_p (mode, indices))
3737 {
3738 if (dump_enabled_p ())
3739 {
3740 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
3741 vect_location,
3742 "unsupported vect permute { ");
3743 for (i = 0; i < count; ++i)
3744 {
3745 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
3746 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
3747 }
3748 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
3749 }
3750 gcc_assert (analyze_only);
3751 return false;
3752 }
3753
3754 ++*n_perms;
3755 }
3756
3757 if (index == count)
3758 {
3759 if (!analyze_only)
3760 {
3761 tree mask_vec = NULL_TREE;
3762
3763 if (! noop_p)
3764 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
3765
3766 if (second_vec_index == -1)
3767 second_vec_index = first_vec_index;
3768
3769 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
3770 {
3771 /* Generate the permute statement if necessary. */
3772 tree first_vec = dr_chain[first_vec_index + ri];
3773 tree second_vec = dr_chain[second_vec_index + ri];
3774 stmt_vec_info perm_stmt_info;
3775 if (! noop_p)
3776 {
3777 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3778 tree perm_dest
3779 = vect_create_destination_var (gimple_assign_lhs (stmt),
3780 vectype);
3781 perm_dest = make_ssa_name (perm_dest);
3782 gassign *perm_stmt
3783 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
3784 first_vec, second_vec,
3785 mask_vec);
3786 perm_stmt_info
3787 = vect_finish_stmt_generation (stmt_info, perm_stmt,
3788 gsi);
3789 }
3790 else
3791 /* If mask was NULL_TREE generate the requested
3792 identity transform. */
3793 perm_stmt_info = vinfo->lookup_def (first_vec);
3794
3795 /* Store the vector statement in NODE. */
3796 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
3797 = perm_stmt_info;
3798 }
3799 }
3800
3801 index = 0;
3802 first_vec_index = -1;
3803 second_vec_index = -1;
3804 noop_p = true;
3805 }
3806 }
3807
3808 return true;
3809 }
3810
3811 /* Vectorize SLP instance tree in postorder. */
3812
3813 static void
3814 vect_schedule_slp_instance (slp_tree node, slp_instance instance,
3815 scalar_stmts_to_slp_tree_map_t *bst_map)
3816 {
3817 gimple_stmt_iterator si;
3818 stmt_vec_info stmt_info;
3819 unsigned int group_size;
3820 tree vectype;
3821 int i, j;
3822 slp_tree child;
3823
3824 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3825 return;
3826
3827 /* See if we have already vectorized the node in the graph of the
3828 SLP instance. */
3829 if (SLP_TREE_VEC_STMTS (node).exists ())
3830 return;
3831
3832 /* See if we have already vectorized the same set of stmts and reuse their
3833 vectorized stmts across instances. */
3834 if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
3835 {
3836 SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
3837 return;
3838 }
3839
3840 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
3841 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3842 vect_schedule_slp_instance (child, instance, bst_map);
3843
3844 /* Push SLP node def-type to stmts. */
3845 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3846 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3847 {
3848 stmt_vec_info child_stmt_info;
3849 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
3850 STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
3851 }
3852
3853 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3854
3855 /* VECTYPE is the type of the destination. */
3856 vectype = STMT_VINFO_VECTYPE (stmt_info);
3857 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3858 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
3859
3860 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
3861 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
3862
3863 if (dump_enabled_p ())
3864 dump_printf_loc (MSG_NOTE, vect_location,
3865 "------>vectorizing SLP node starting from: %G",
3866 stmt_info->stmt);
3867
3868   /* Vectorized stmts go before the last scalar stmt, which is the point
3869      at which all the values they use are ready.  */
3870 stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
3871 si = gsi_for_stmt (last_stmt_info->stmt);
3872
3873   /* Mark the first element of the reduction chain as a reduction to properly
3874      transform the node.  In the analysis phase only the last element of the
3875      chain is marked as a reduction.  */
3876 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3877 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
3878 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
3879 {
3880 STMT_VINFO_DEF_TYPE (stmt_info) = vect_reduction_def;
3881 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3882 }
3883
3884 /* Handle two-operation SLP nodes by vectorizing the group with
3885 both operations and then performing a merge. */
3886 if (SLP_TREE_TWO_OPERATORS (node))
3887 {
3888 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
3889 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3890 enum tree_code ocode = ERROR_MARK;
3891 stmt_vec_info ostmt_info;
3892 vec_perm_builder mask (group_size, group_size, 1);
3893 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
3894 {
3895 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
3896 if (gimple_assign_rhs_code (ostmt) != code0)
3897 {
3898 mask.quick_push (1);
3899 ocode = gimple_assign_rhs_code (ostmt);
3900 }
3901 else
3902 mask.quick_push (0);
3903 }
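      /* MASK now records, per group element, which of the two results to
	 pick: 0 for CODE0, 1 for OCODE.  E.g. an addsub group a+b, a-b,
	 a+b, a-b gives the mask {0, 1, 0, 1}.  */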
3904 if (ocode != ERROR_MARK)
3905 {
3906 vec<stmt_vec_info> v0;
3907 vec<stmt_vec_info> v1;
3908 unsigned j;
3909 tree tmask = NULL_TREE;
3910 vect_transform_stmt (stmt_info, &si, node, instance);
3911 v0 = SLP_TREE_VEC_STMTS (node).copy ();
3912 SLP_TREE_VEC_STMTS (node).truncate (0);
3913 gimple_assign_set_rhs_code (stmt, ocode);
3914 vect_transform_stmt (stmt_info, &si, node, instance);
3915 gimple_assign_set_rhs_code (stmt, code0);
3916 v1 = SLP_TREE_VEC_STMTS (node).copy ();
3917 SLP_TREE_VEC_STMTS (node).truncate (0);
3918 tree meltype = build_nonstandard_integer_type
3919 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
3920 tree mvectype = get_same_sized_vectype (meltype, vectype);
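	  /* The permute mask must be an integer vector with one index per
	     lane; build it with the same element width as VECTYPE so that
	     get_same_sized_vectype yields a matching vector mode.  */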
3921 unsigned k = 0, l;
3922 for (j = 0; j < v0.length (); ++j)
3923 {
3924 /* Enforced by vect_build_slp_tree, which rejects variable-length
3925 vectors for SLP_TREE_TWO_OPERATORS. */
3926 unsigned int const_nunits = nunits.to_constant ();
3927 tree_vector_builder melts (mvectype, const_nunits, 1);
3928 for (l = 0; l < const_nunits; ++l)
3929 {
3930 if (k >= group_size)
3931 k = 0;
3932 tree t = build_int_cst (meltype,
3933 mask[k++] * const_nunits + l);
3934 melts.quick_push (t);
3935 }
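	      /* Each index selects lane L of the CODE0 result (value L) or
		 the same lane of the OCODE result (CONST_NUNITS + L).  */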
3936 tmask = melts.build ();
3937
3938 /* ??? Not all targets support a VEC_PERM_EXPR with a
3939 constant mask that would translate to a vec_merge RTX
3940 (with their vec_perm_const_ok). We can either not
3941 vectorize in that case or let veclower do its job.
3942 		 Unfortunately that isn't too great and at least for
3943 		 plus/minus we'd eventually like to match targets'
3944 		 vector addsub instructions.  */
3945 gimple *vstmt;
3946 vstmt = gimple_build_assign (make_ssa_name (vectype),
3947 VEC_PERM_EXPR,
3948 gimple_assign_lhs (v0[j]->stmt),
3949 gimple_assign_lhs (v1[j]->stmt),
3950 tmask);
3951 SLP_TREE_VEC_STMTS (node).quick_push
3952 (vect_finish_stmt_generation (stmt_info, vstmt, &si));
3953 }
3954 v0.release ();
3955 v1.release ();
3956 return;
3957 }
3958 }
3959 vect_transform_stmt (stmt_info, &si, node, instance);
3960
3961 /* Restore stmt def-types. */
3962 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3963 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
3964 {
3965 stmt_vec_info child_stmt_info;
3966 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
3967 STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
3968 }
3969 }
3970
3971 /* Replace the scalar calls in SLP node NODE with assignments of zero to
3972    their lhs.  For loop vectorization this is done in vectorizable_call,
3973    but for SLP it needs to be deferred until the end of vect_schedule_slp,
3974    because multiple SLP instances may refer to the same scalar stmt.  */
3975
3976 static void
3977 vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited)
3978 {
3979 gimple *new_stmt;
3980 gimple_stmt_iterator gsi;
3981 int i;
3982 slp_tree child;
3983 tree lhs;
3984 stmt_vec_info stmt_info;
3985
3986 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3987 return;
3988
3989 if (visited.add (node))
3990 return;
3991
3992 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3993 vect_remove_slp_scalar_calls (child, visited);
3994
3995 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3996 {
3997 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
3998 if (!stmt || gimple_bb (stmt) == NULL)
3999 continue;
4000 if (is_pattern_stmt_p (stmt_info)
4001 || !PURE_SLP_STMT (stmt_info))
4002 continue;
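      /* All SLP uses of this call have been vectorized, so the scalar
	 result is no longer needed; replace the call with a cheap dummy
	 assignment that keeps LHS defined until DCE removes it.  */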
4003 lhs = gimple_call_lhs (stmt);
4004 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4005 gsi = gsi_for_stmt (stmt);
4006 stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4007 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4008 }
4009 }
4010
4011 static void
4012 vect_remove_slp_scalar_calls (slp_tree node)
4013 {
4014 hash_set<slp_tree> visited;
4015 vect_remove_slp_scalar_calls (node, visited);
4016 }
4017
4018 /* Generate vector code for all SLP instances in the loop/basic block. */
4019
4020 void
4021 vect_schedule_slp (vec_info *vinfo)
4022 {
4023 vec<slp_instance> slp_instances;
4024 slp_instance instance;
4025 unsigned int i;
4026
4027 scalar_stmts_to_slp_tree_map_t *bst_map
4028 = new scalar_stmts_to_slp_tree_map_t ();
4029 slp_instances = vinfo->slp_instances;
4030 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4031 {
4032 /* Schedule the tree of INSTANCE. */
4033 vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance),
4034 instance, bst_map);
4035 if (dump_enabled_p ())
4036 dump_printf_loc (MSG_NOTE, vect_location,
4037 "vectorizing stmts using SLP.\n");
4038 }
4039 delete bst_map;
4040
4041 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4042 {
4043 slp_tree root = SLP_INSTANCE_TREE (instance);
4044 stmt_vec_info store_info;
4045 unsigned int j;
4046
4047 /* Remove scalar call stmts. Do not do this for basic-block
4048 vectorization as not all uses may be vectorized.
4049 ??? Why should this be necessary? DCE should be able to
4050 remove the stmts itself.
4051 ??? For BB vectorization we can as well remove scalar
4052 stmts starting from the SLP tree root if they have no
4053 uses. */
4054 if (is_a <loop_vec_info> (vinfo))
4055 vect_remove_slp_scalar_calls (root);
4056
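      /* Remove the scalar stores of this instance: the vector stores
	 emitted during scheduling have taken over their job.  */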
4057 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
4058 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4059 {
4060 if (!STMT_VINFO_DATA_REF (store_info))
4061 break;
4062
4063 store_info = vect_orig_stmt (store_info);
4064 /* Free the attached stmt_vec_info and remove the stmt. */
4065 vinfo->remove_stmt (store_info);
4066 }
4067 }
4068 }
4069