1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h" /* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47
48
49 /* Recursively free the memory allocated for the SLP tree rooted at NODE.
50 FINAL_P is true if we have vectorized the instance or if we have
51 made a final decision not to vectorize the statements in any way. */
52
53 static void
54 vect_free_slp_tree (slp_tree node, bool final_p)
55 {
56 int i;
57 slp_tree child;
58
59 if (--node->refcnt != 0)
60 return;
61
62 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
63 vect_free_slp_tree (child, final_p);
64
65 /* Don't update STMT_VINFO_NUM_SLP_USES if it isn't relevant.
66 Some statements might no longer exist, after having been
67 removed by vect_transform_stmt. Updating the remaining
68 statements would be redundant. */
69 if (!final_p)
70 {
71 stmt_vec_info stmt_info;
72 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
73 {
74 gcc_assert (STMT_VINFO_NUM_SLP_USES (stmt_info) > 0);
75 STMT_VINFO_NUM_SLP_USES (stmt_info)--;
76 }
77 }
78
79 SLP_TREE_CHILDREN (node).release ();
80 SLP_TREE_SCALAR_STMTS (node).release ();
81 SLP_TREE_SCALAR_OPS (node).release ();
82 SLP_TREE_VEC_STMTS (node).release ();
83 SLP_TREE_LOAD_PERMUTATION (node).release ();
84
85 free (node);
86 }
87
88 /* Free the memory allocated for the SLP instance. FINAL_P is true if we
89 have vectorized the instance or if we have made a final decision not
90 to vectorize the statements in any way. */
91
92 void
93 vect_free_slp_instance (slp_instance instance, bool final_p)
94 {
95 vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
96 SLP_INSTANCE_LOADS (instance).release ();
97 free (instance);
98 }
99
100
101 /* Create an SLP node for SCALAR_STMTS. */
102
103 static slp_tree
104 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
105 {
106 slp_tree node;
107 stmt_vec_info stmt_info = scalar_stmts[0];
108 unsigned int nops;
109
110 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
111 nops = gimple_call_num_args (stmt);
112 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
113 {
114 nops = gimple_num_ops (stmt) - 1;
115 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
116 nops++;
117 }
118 else if (is_a <gphi *> (stmt_info->stmt))
119 nops = 0;
120 else
121 return NULL;
122
123 node = XNEW (struct _slp_tree);
124 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
125 SLP_TREE_SCALAR_OPS (node) = vNULL;
126 SLP_TREE_VEC_STMTS (node).create (0);
127 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
128 SLP_TREE_CHILDREN (node).create (nops);
129 SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
130 SLP_TREE_TWO_OPERATORS (node) = false;
131 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
132 node->refcnt = 1;
133 node->max_nunits = 1;
134
135 unsigned i;
136 FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
137 STMT_VINFO_NUM_SLP_USES (stmt_info)++;
138
139 return node;
140 }
141
142 /* Create an SLP node for OPS. */
143
144 static slp_tree
145 vect_create_new_slp_node (vec<tree> ops)
146 {
147 slp_tree node;
148
149 node = XNEW (struct _slp_tree);
150 SLP_TREE_SCALAR_STMTS (node) = vNULL;
151 SLP_TREE_SCALAR_OPS (node) = ops;
152 SLP_TREE_VEC_STMTS (node).create (0);
153 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = 0;
154 SLP_TREE_CHILDREN (node) = vNULL;
155 SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
156 SLP_TREE_TWO_OPERATORS (node) = false;
157 SLP_TREE_DEF_TYPE (node) = vect_external_def;
158 node->refcnt = 1;
159 node->max_nunits = 1;
160
161 return node;
162 }
163
164
165 /* This structure is used in creation of an SLP tree. Each instance
166 corresponds to the same operand in a group of scalar stmts in an SLP
167 node. */
168 typedef struct _slp_oprnd_info
169 {
170 /* Def-stmts for the operands. */
171 vec<stmt_vec_info> def_stmts;
172 /* Operands. */
173 vec<tree> ops;
174 /* Information about the first statement: its vector def-type, the type
175 of the operand, and whether any def-stmt in this operand slot is a
176 pattern stmt. */
177 tree first_op_type;
178 enum vect_def_type first_dt;
179 bool any_pattern;
180 } *slp_oprnd_info;
181
182
183 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
184 operand. */
185 static vec<slp_oprnd_info>
186 vect_create_oprnd_info (int nops, int group_size)
187 {
188 int i;
189 slp_oprnd_info oprnd_info;
190 vec<slp_oprnd_info> oprnds_info;
191
192 oprnds_info.create (nops);
193 for (i = 0; i < nops; i++)
194 {
195 oprnd_info = XNEW (struct _slp_oprnd_info);
196 oprnd_info->def_stmts.create (group_size);
197 oprnd_info->ops.create (group_size);
198 oprnd_info->first_dt = vect_uninitialized_def;
199 oprnd_info->first_op_type = NULL_TREE;
200 oprnd_info->any_pattern = false;
201 oprnds_info.quick_push (oprnd_info);
202 }
203
204 return oprnds_info;
205 }
206
207
208 /* Free operands info. */
209
210 static void
211 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
212 {
213 int i;
214 slp_oprnd_info oprnd_info;
215
216 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
217 {
218 oprnd_info->def_stmts.release ();
219 oprnd_info->ops.release ();
220 XDELETE (oprnd_info);
221 }
222
223 oprnds_info.release ();
224 }
225
226
227 /* Return true if STMTS contains a pattern statement. */
228
229 static bool
230 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
231 {
232 stmt_vec_info stmt_info;
233 unsigned int i;
234 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
235 if (is_pattern_stmt_p (stmt_info))
236 return true;
237 return false;
238 }
239
240 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
241 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
242 of the chain. */
243
244 int
245 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
246 stmt_vec_info first_stmt_info)
247 {
248 stmt_vec_info next_stmt_info = first_stmt_info;
249 int result = 0;
250
251 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
252 return -1;
253
254 do
255 {
256 if (next_stmt_info == stmt_info)
257 return result;
258 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
259 if (next_stmt_info)
260 result += DR_GROUP_GAP (next_stmt_info);
261 }
262 while (next_stmt_info);
263
264 return -1;
265 }
266
267 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
268 using the method implemented by duplicate_and_interleave. Return true
269 if so, returning the number of intermediate vectors in *NVECTORS_OUT
270 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
271 (if nonnull). */
272
273 bool
274 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
275 tree elt_type, unsigned int *nvectors_out,
276 tree *vector_type_out,
277 tree *permutes)
278 {
279 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
280 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
281 return false;
282
283 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
284 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
285 unsigned int nvectors = 1;
286 for (;;)
287 {
288 scalar_int_mode int_mode;
289 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
290 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
291 {
292 /* Get the natural vector type for this SLP group size. */
293 tree int_type = build_nonstandard_integer_type
294 (GET_MODE_BITSIZE (int_mode), 1);
295 tree vector_type
296 = get_vectype_for_scalar_type (vinfo, int_type, count);
297 if (vector_type
298 && VECTOR_MODE_P (TYPE_MODE (vector_type))
299 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
300 GET_MODE_SIZE (base_vector_mode)))
301 {
302 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
303 together into elements of type INT_TYPE and using the result
304 to build NVECTORS vectors. */
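/* As an illustration: with a 4-element VECTOR_TYPE the two permutations
   built below are { 0, 4, 1, 5 } and { 2, 6, 3, 7 }, i.e. the usual
   low/high interleave of the two input vectors.  */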
305 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
306 vec_perm_builder sel1 (nelts, 2, 3);
307 vec_perm_builder sel2 (nelts, 2, 3);
308 poly_int64 half_nelts = exact_div (nelts, 2);
309 for (unsigned int i = 0; i < 3; ++i)
310 {
311 sel1.quick_push (i);
312 sel1.quick_push (i + nelts);
313 sel2.quick_push (half_nelts + i);
314 sel2.quick_push (half_nelts + i + nelts);
315 }
316 vec_perm_indices indices1 (sel1, 2, nelts);
317 vec_perm_indices indices2 (sel2, 2, nelts);
318 if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
319 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
320 {
321 if (nvectors_out)
322 *nvectors_out = nvectors;
323 if (vector_type_out)
324 *vector_type_out = vector_type;
325 if (permutes)
326 {
327 permutes[0] = vect_gen_perm_mask_checked (vector_type,
328 indices1);
329 permutes[1] = vect_gen_perm_mask_checked (vector_type,
330 indices2);
331 }
332 return true;
333 }
334 }
335 }
336 if (!multiple_p (elt_bytes, 2, &elt_bytes))
337 return false;
338 nvectors *= 2;
339 }
340 }
341
342 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
343 they are of a valid type and that they match the defs of the first stmt of
344 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
345 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero *SWAP
346 indicates swap is required for cond_expr stmts. Specifically, *SWAP
347 is 1 if STMT is cond and operands of comparison need to be swapped;
348 *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
349 If there is any operand swap in this function, *SWAP is set to non-zero
350 value.
351 If there was a fatal error return -1; if the error could be corrected by
352 swapping operands of the parent node of this one, return 1; if everything
353 is OK return 0. */
354 static int
355 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
356 vec<stmt_vec_info> stmts, unsigned stmt_num,
357 vec<slp_oprnd_info> *oprnds_info)
358 {
359 stmt_vec_info stmt_info = stmts[stmt_num];
360 tree oprnd;
361 unsigned int i, number_of_oprnds;
362 enum vect_def_type dt = vect_uninitialized_def;
363 slp_oprnd_info oprnd_info;
364 int first_op_idx = 1;
365 unsigned int commutative_op = -1U;
366 bool first_op_cond = false;
367 bool first = stmt_num == 0;
368
369 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
370 {
371 number_of_oprnds = gimple_call_num_args (stmt);
372 first_op_idx = 3;
373 if (gimple_call_internal_p (stmt))
374 {
375 internal_fn ifn = gimple_call_internal_fn (stmt);
376 commutative_op = first_commutative_argument (ifn);
377
378 /* Masked load, only look at mask. */
379 if (ifn == IFN_MASK_LOAD)
380 {
381 number_of_oprnds = 1;
382 /* Mask operand index. */
383 first_op_idx = 5;
384 }
385 }
386 }
387 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
388 {
389 enum tree_code code = gimple_assign_rhs_code (stmt);
390 number_of_oprnds = gimple_num_ops (stmt) - 1;
391 /* Swap can only be done for cond_expr if asked to, otherwise we
392 could end up with a different comparison code from the first stmt. */
393 if (code == COND_EXPR
394 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
395 {
396 first_op_cond = true;
397 number_of_oprnds++;
398 }
399 else
400 commutative_op = commutative_tree_code (code) ? 0U : -1U;
401 }
402 else
403 return -1;
404
405 bool swapped = (*swap != 0);
406 gcc_assert (!swapped || first_op_cond);
407 for (i = 0; i < number_of_oprnds; i++)
408 {
409 again:
410 if (first_op_cond)
411 {
412 /* Map indicating how operands of cond_expr should be swapped. */
413 int maps[3][4] = {{0, 1, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2}};
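/* maps[0] leaves the operands in place, maps[1] swaps the two operands
   of the embedded comparison (*SWAP == 1), and maps[2] swaps the two
   arms of the COND_EXPR, matching a stmt whose comparison code is the
   inverse of the first stmt's (*SWAP == 2).  */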
414 int *map = maps[*swap];
415
416 if (i < 2)
417 oprnd = TREE_OPERAND (gimple_op (stmt_info->stmt,
418 first_op_idx), map[i]);
419 else
420 oprnd = gimple_op (stmt_info->stmt, map[i]);
421 }
422 else
423 oprnd = gimple_op (stmt_info->stmt, first_op_idx + (swapped ? !i : i));
424 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
425 oprnd = TREE_OPERAND (oprnd, 0);
426
427 oprnd_info = (*oprnds_info)[i];
428
429 stmt_vec_info def_stmt_info;
430 if (!vect_is_simple_use (oprnd, vinfo, &dt, &def_stmt_info))
431 {
432 if (dump_enabled_p ())
433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
434 "Build SLP failed: can't analyze def for %T\n",
435 oprnd);
436
437 return -1;
438 }
439
440 if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
441 oprnd_info->any_pattern = true;
442
443 if (first)
444 {
445 /* For the swapping logic below force vect_reduction_def
446 for the reduction op in a SLP reduction group. */
447 if (!STMT_VINFO_DATA_REF (stmt_info)
448 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
449 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
450 && def_stmt_info)
451 dt = vect_reduction_def;
452 oprnd_info->first_dt = dt;
453 oprnd_info->first_op_type = TREE_TYPE (oprnd);
454 }
455 else
456 {
457 /* Not first stmt of the group, check that the def-stmt/s match
458 the def-stmt/s of the first stmt. Allow different definition
459 types for reduction chains: the first stmt must be a
460 vect_reduction_def (a phi node), and the rest
461 end in the reduction chain. */
462 tree type = TREE_TYPE (oprnd);
463 if ((oprnd_info->first_dt != dt
464 && !(oprnd_info->first_dt == vect_reduction_def
465 && !STMT_VINFO_DATA_REF (stmt_info)
466 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
467 && def_stmt_info
468 && !STMT_VINFO_DATA_REF (def_stmt_info)
469 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
470 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
471 && !((oprnd_info->first_dt == vect_external_def
472 || oprnd_info->first_dt == vect_constant_def)
473 && (dt == vect_external_def
474 || dt == vect_constant_def)))
475 || !types_compatible_p (oprnd_info->first_op_type, type)
476 || (!STMT_VINFO_DATA_REF (stmt_info)
477 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
478 && ((!def_stmt_info
479 || STMT_VINFO_DATA_REF (def_stmt_info)
480 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
481 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
482 != (oprnd_info->first_dt != vect_reduction_def))))
483 {
484 /* Try swapping operands if we got a mismatch. */
485 if (i == commutative_op && !swapped)
486 {
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location,
489 "trying swapped operands\n");
490 swapped = true;
491 goto again;
492 }
493
494 if (dump_enabled_p ())
495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
496 "Build SLP failed: different types\n");
497
498 return 1;
499 }
500 if ((dt == vect_constant_def
501 || dt == vect_external_def)
502 && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
503 && (TREE_CODE (type) == BOOLEAN_TYPE
504 || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
505 type)))
506 {
507 if (dump_enabled_p ())
508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
509 "Build SLP failed: invalid type of def "
510 "for variable-length SLP %T\n", oprnd);
511 return -1;
512 }
513 }
514
515 /* Check the types of the definitions. */
516 switch (dt)
517 {
518 case vect_external_def:
519 /* Make sure to demote the overall operand to external. */
520 oprnd_info->first_dt = vect_external_def;
521 /* Fallthru. */
522 case vect_constant_def:
523 oprnd_info->def_stmts.quick_push (NULL);
524 oprnd_info->ops.quick_push (oprnd);
525 break;
526
527 case vect_internal_def:
528 case vect_reduction_def:
529 if (oprnd_info->first_dt == vect_reduction_def
530 && !STMT_VINFO_DATA_REF (stmt_info)
531 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
532 && !STMT_VINFO_DATA_REF (def_stmt_info)
533 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
534 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
535 {
536 /* For a SLP reduction chain we want to duplicate the
537 reduction to each of the chain members. That gets
538 us a sane SLP graph (still the stmts are not 100%
539 correct wrt the initial values). */
540 gcc_assert (!first);
541 oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]);
542 oprnd_info->ops.quick_push (oprnd_info->ops[0]);
543 break;
544 }
545 /* Fallthru. */
546 case vect_induction_def:
547 oprnd_info->def_stmts.quick_push (def_stmt_info);
548 oprnd_info->ops.quick_push (oprnd);
549 break;
550
551 default:
552 /* FORNOW: Not supported. */
553 if (dump_enabled_p ())
554 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
555 "Build SLP failed: illegal type of def %T\n",
556 oprnd);
557
558 return -1;
559 }
560 }
561
562 /* Swap operands. */
563 if (swapped)
564 {
565 if (dump_enabled_p ())
566 dump_printf_loc (MSG_NOTE, vect_location,
567 "swapped operands to match def types in %G",
568 stmt_info->stmt);
569 }
570
571 *swap = swapped;
572 return 0;
573 }
574
575 /* Try to assign vector type VECTYPE to STMT_INFO for BB vectorization.
576 Return true if we can, meaning that this choice doesn't conflict with
577 existing SLP nodes that use STMT_INFO. */
578
579 static bool
580 vect_update_shared_vectype (stmt_vec_info stmt_info, tree vectype)
581 {
582 tree old_vectype = STMT_VINFO_VECTYPE (stmt_info);
583 if (old_vectype && useless_type_conversion_p (vectype, old_vectype))
584 return true;
585
586 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
587 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
588 {
589 /* We maintain the invariant that if any statement in the group is
590 used, all other members of the group have the same vector type. */
591 stmt_vec_info first_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
592 stmt_vec_info member_info = first_info;
593 for (; member_info; member_info = DR_GROUP_NEXT_ELEMENT (member_info))
594 if (STMT_VINFO_NUM_SLP_USES (member_info) > 0
595 || is_pattern_stmt_p (member_info))
596 break;
597
598 if (!member_info)
599 {
600 for (member_info = first_info; member_info;
601 member_info = DR_GROUP_NEXT_ELEMENT (member_info))
602 STMT_VINFO_VECTYPE (member_info) = vectype;
603 return true;
604 }
605 }
606 else if (STMT_VINFO_NUM_SLP_USES (stmt_info) == 0
607 && !is_pattern_stmt_p (stmt_info))
608 {
609 STMT_VINFO_VECTYPE (stmt_info) = vectype;
610 return true;
611 }
612
613 if (dump_enabled_p ())
614 {
615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
616 "Build SLP failed: incompatible vector"
617 " types for: %G", stmt_info->stmt);
618 dump_printf_loc (MSG_NOTE, vect_location,
619 " old vector type: %T\n", old_vectype);
620 dump_printf_loc (MSG_NOTE, vect_location,
621 " new vector type: %T\n", vectype);
622 }
623 return false;
624 }
625
626 /* Try to infer and assign a vector type to all the statements in STMTS.
627 Used only for BB vectorization. */
628
629 static bool
630 vect_update_all_shared_vectypes (vec<stmt_vec_info> stmts)
631 {
632 tree vectype, nunits_vectype;
633 if (!vect_get_vector_types_for_stmt (stmts[0], &vectype,
634 &nunits_vectype, stmts.length ()))
635 return false;
636
637 stmt_vec_info stmt_info;
638 unsigned int i;
639 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
640 if (!vect_update_shared_vectype (stmt_info, vectype))
641 return false;
642
643 return true;
644 }
645
646 /* Return true if call statements CALL1 and CALL2 are similar enough
647 to be combined into the same SLP group. */
648
649 static bool
650 compatible_calls_p (gcall *call1, gcall *call2)
651 {
652 unsigned int nargs = gimple_call_num_args (call1);
653 if (nargs != gimple_call_num_args (call2))
654 return false;
655
656 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
657 return false;
658
659 if (gimple_call_internal_p (call1))
660 {
661 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
662 TREE_TYPE (gimple_call_lhs (call2))))
663 return false;
664 for (unsigned int i = 0; i < nargs; ++i)
665 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
666 TREE_TYPE (gimple_call_arg (call2, i))))
667 return false;
668 }
669 else
670 {
671 if (!operand_equal_p (gimple_call_fn (call1),
672 gimple_call_fn (call2), 0))
673 return false;
674
675 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
676 return false;
677 }
678 return true;
679 }
680
681 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
682 caller's attempt to find the vector type in STMT_INFO with the narrowest
683 element type. Return true if VECTYPE is nonnull and if it is valid
684 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
685 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
686 vect_build_slp_tree. */
687
688 static bool
689 vect_record_max_nunits (stmt_vec_info stmt_info, unsigned int group_size,
690 tree vectype, poly_uint64 *max_nunits)
691 {
692 if (!vectype)
693 {
694 if (dump_enabled_p ())
695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
696 "Build SLP failed: unsupported data-type in %G\n",
697 stmt_info->stmt);
698 /* Fatal mismatch. */
699 return false;
700 }
701
702 /* If populating the vector type requires unrolling then fail
703 before adjusting *max_nunits for basic-block vectorization. */
704 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
705 unsigned HOST_WIDE_INT const_nunits;
706 if (STMT_VINFO_BB_VINFO (stmt_info)
707 && (!nunits.is_constant (&const_nunits)
708 || const_nunits > group_size))
709 {
710 if (dump_enabled_p ())
711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
712 "Build SLP failed: unrolling required "
713 "in basic block SLP\n");
714 /* Fatal mismatch. */
715 return false;
716 }
717
718 /* In case of multiple types we need to detect the smallest type. */
719 vect_update_max_nunits (max_nunits, vectype);
720 return true;
721 }
722
723 /* STMTS is a group of GROUP_SIZE SLP statements in which some
724 statements do the same operation as the first statement and in which
725 the others do ALT_STMT_CODE. Return true if we can take one vector
726 of the first operation and one vector of the second and permute them
727 to get the required result. VECTYPE is the type of the vector that
728 would be permuted. */
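/* As a sketch of the intent: for the group { a0 + b0, a1 - b1, a2 + b2,
   a3 - b3 } with a 4-lane VECTYPE one vector is computed with PLUS and
   one with MINUS, and the lanes are blended with the permutation
   { 0, 5, 2, 7 }; this function only checks that such a constant
   permutation is supported by the target.  */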
729
730 static bool
731 vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
732 unsigned int group_size, tree vectype,
733 tree_code alt_stmt_code)
734 {
735 unsigned HOST_WIDE_INT count;
736 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
737 return false;
738
739 vec_perm_builder sel (count, count, 1);
740 for (unsigned int i = 0; i < count; ++i)
741 {
742 unsigned int elt = i;
743 gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
744 if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
745 elt += count;
746 sel.quick_push (elt);
747 }
748 vec_perm_indices indices (sel, 2, count);
749 return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
750 }
751
752 /* Verify if the scalar stmts STMTS are isomorphic, require data
753 permutation or are of unsupported types of operation. Return
754 true if they are, otherwise return false and indicate in *MATCHES
755 which stmts are not isomorphic to the first one. If MATCHES[0]
756 is false then this indicates the comparison could not be
757 carried out or the stmts will never be vectorized by SLP.
758
759 Note COND_EXPR is possibly isomorphic to another one after swapping its
760 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
761 the first stmt by swapping the two operands of comparison; set SWAP[i]
762 to 2 if stmt I is isomorphic to the first stmt by inverting the code
763 of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
764 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
765
766 static bool
767 vect_build_slp_tree_1 (unsigned char *swap,
768 vec<stmt_vec_info> stmts, unsigned int group_size,
769 poly_uint64 *max_nunits, bool *matches,
770 bool *two_operators)
771 {
772 unsigned int i;
773 stmt_vec_info first_stmt_info = stmts[0];
774 enum tree_code first_stmt_code = ERROR_MARK;
775 enum tree_code alt_stmt_code = ERROR_MARK;
776 enum tree_code rhs_code = ERROR_MARK;
777 enum tree_code first_cond_code = ERROR_MARK;
778 tree lhs;
779 bool need_same_oprnds = false;
780 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
781 optab optab;
782 int icode;
783 machine_mode optab_op2_mode;
784 machine_mode vec_mode;
785 stmt_vec_info first_load = NULL, prev_first_load = NULL;
786 bool load_p = false;
787
788 /* For every stmt in NODE find its def stmt/s. */
789 stmt_vec_info stmt_info;
790 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
791 {
792 vec_info *vinfo = stmt_info->vinfo;
793 gimple *stmt = stmt_info->stmt;
794 swap[i] = 0;
795 matches[i] = false;
796
797 if (dump_enabled_p ())
798 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
799
800 /* Fail to vectorize statements marked as unvectorizable. */
801 if (!STMT_VINFO_VECTORIZABLE (stmt_info))
802 {
803 if (dump_enabled_p ())
804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
805 "Build SLP failed: unvectorizable statement %G",
806 stmt);
807 /* Fatal mismatch. */
808 matches[0] = false;
809 return false;
810 }
811
812 lhs = gimple_get_lhs (stmt);
813 if (lhs == NULL_TREE)
814 {
815 if (dump_enabled_p ())
816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
817 "Build SLP failed: not GIMPLE_ASSIGN nor "
818 "GIMPLE_CALL %G", stmt);
819 /* Fatal mismatch. */
820 matches[0] = false;
821 return false;
822 }
823
824 tree nunits_vectype;
825 if (!vect_get_vector_types_for_stmt (stmt_info, &vectype,
826 &nunits_vectype, group_size)
827 || (nunits_vectype
828 && !vect_record_max_nunits (stmt_info, group_size,
829 nunits_vectype, max_nunits)))
830 {
831 /* Fatal mismatch. */
832 matches[0] = false;
833 return false;
834 }
835
836 gcc_assert (vectype);
837
838 if (is_a <bb_vec_info> (vinfo)
839 && !vect_update_shared_vectype (stmt_info, vectype))
840 continue;
841
842 gcall *call_stmt = dyn_cast <gcall *> (stmt);
843 if (call_stmt)
844 {
845 rhs_code = CALL_EXPR;
846
847 if (gimple_call_internal_p (stmt, IFN_MASK_LOAD))
848 load_p = true;
849 else if ((gimple_call_internal_p (call_stmt)
850 && (!vectorizable_internal_fn_p
851 (gimple_call_internal_fn (call_stmt))))
852 || gimple_call_tail_p (call_stmt)
853 || gimple_call_noreturn_p (call_stmt)
854 || !gimple_call_nothrow_p (call_stmt)
855 || gimple_call_chain (call_stmt))
856 {
857 if (dump_enabled_p ())
858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
859 "Build SLP failed: unsupported call type %G",
860 call_stmt);
861 /* Fatal mismatch. */
862 matches[0] = false;
863 return false;
864 }
865 }
866 else
867 {
868 rhs_code = gimple_assign_rhs_code (stmt);
869 load_p = TREE_CODE_CLASS (rhs_code) == tcc_reference;
870 }
871
872 /* Check the operation. */
873 if (i == 0)
874 {
875 first_stmt_code = rhs_code;
876
877 /* Shift arguments should be equal in all the packed stmts for a
878 vector shift with scalar shift operand. */
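/* E.g. { x0 << 3, x1 << 3 } can use a single vector/scalar shift,
   whereas { x0 << 3, x1 << 5 } is only possible with a vector/vector
   shift.  */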
879 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
880 || rhs_code == LROTATE_EXPR
881 || rhs_code == RROTATE_EXPR)
882 {
883 vec_mode = TYPE_MODE (vectype);
884
885 /* First see if we have a vector/vector shift. */
886 optab = optab_for_tree_code (rhs_code, vectype,
887 optab_vector);
888
889 if (!optab
890 || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
891 {
892 /* No vector/vector shift, try for a vector/scalar shift. */
893 optab = optab_for_tree_code (rhs_code, vectype,
894 optab_scalar);
895
896 if (!optab)
897 {
898 if (dump_enabled_p ())
899 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
900 "Build SLP failed: no optab.\n");
901 /* Fatal mismatch. */
902 matches[0] = false;
903 return false;
904 }
905 icode = (int) optab_handler (optab, vec_mode);
906 if (icode == CODE_FOR_nothing)
907 {
908 if (dump_enabled_p ())
909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
910 "Build SLP failed: "
911 "op not supported by target.\n");
912 /* Fatal mismatch. */
913 matches[0] = false;
914 return false;
915 }
916 optab_op2_mode = insn_data[icode].operand[2].mode;
917 if (!VECTOR_MODE_P (optab_op2_mode))
918 {
919 need_same_oprnds = true;
920 first_op1 = gimple_assign_rhs2 (stmt);
921 }
922 }
923 }
924 else if (rhs_code == WIDEN_LSHIFT_EXPR)
925 {
926 need_same_oprnds = true;
927 first_op1 = gimple_assign_rhs2 (stmt);
928 }
929 else if (call_stmt
930 && gimple_call_internal_p (call_stmt, IFN_DIV_POW2))
931 {
932 need_same_oprnds = true;
933 first_op1 = gimple_call_arg (call_stmt, 1);
934 }
935 }
936 else
937 {
938 if (first_stmt_code != rhs_code
939 && alt_stmt_code == ERROR_MARK)
940 alt_stmt_code = rhs_code;
941 if (first_stmt_code != rhs_code
942 && (first_stmt_code != IMAGPART_EXPR
943 || rhs_code != REALPART_EXPR)
944 && (first_stmt_code != REALPART_EXPR
945 || rhs_code != IMAGPART_EXPR)
946 /* Handle mismatches in plus/minus by computing both
947 and merging the results. */
948 && !((first_stmt_code == PLUS_EXPR
949 || first_stmt_code == MINUS_EXPR)
950 && (alt_stmt_code == PLUS_EXPR
951 || alt_stmt_code == MINUS_EXPR)
952 && rhs_code == alt_stmt_code)
953 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
954 && (first_stmt_code == ARRAY_REF
955 || first_stmt_code == BIT_FIELD_REF
956 || first_stmt_code == INDIRECT_REF
957 || first_stmt_code == COMPONENT_REF
958 || first_stmt_code == MEM_REF)))
959 {
960 if (dump_enabled_p ())
961 {
962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
963 "Build SLP failed: different operation "
964 "in stmt %G", stmt);
965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
966 "original stmt %G", first_stmt_info->stmt);
967 }
968 /* Mismatch. */
969 continue;
970 }
971
972 if (need_same_oprnds)
973 {
974 tree other_op1 = (call_stmt
975 ? gimple_call_arg (call_stmt, 1)
976 : gimple_assign_rhs2 (stmt));
977 if (!operand_equal_p (first_op1, other_op1, 0))
978 {
979 if (dump_enabled_p ())
980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
981 "Build SLP failed: different shift "
982 "arguments in %G", stmt);
983 /* Mismatch. */
984 continue;
985 }
986 }
987
988 if (!load_p && rhs_code == CALL_EXPR)
989 {
990 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
991 as_a <gcall *> (stmt)))
992 {
993 if (dump_enabled_p ())
994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
995 "Build SLP failed: different calls in %G",
996 stmt);
997 /* Mismatch. */
998 continue;
999 }
1000 }
1001 }
1002
1003 /* Grouped store or load. */
1004 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1005 {
1006 if (REFERENCE_CLASS_P (lhs))
1007 {
1008 /* Store. */
1009 ;
1010 }
1011 else
1012 {
1013 /* Load. */
1014 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1015 if (prev_first_load)
1016 {
1017 /* Check that there are no loads from different interleaving
1018 chains in the same node. */
1019 if (prev_first_load != first_load)
1020 {
1021 if (dump_enabled_p ())
1022 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1023 vect_location,
1024 "Build SLP failed: different "
1025 "interleaving chains in one node %G",
1026 stmt);
1027 /* Mismatch. */
1028 continue;
1029 }
1030 }
1031 else
1032 prev_first_load = first_load;
1033 }
1034 } /* Grouped access. */
1035 else
1036 {
1037 if (load_p)
1038 {
1039 /* Not grouped load. */
1040 if (dump_enabled_p ())
1041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1042 "Build SLP failed: not grouped load %G", stmt);
1043
1044 /* FORNOW: Non-grouped loads are not supported. */
1045 /* Fatal mismatch. */
1046 matches[0] = false;
1047 return false;
1048 }
1049
1050 /* Not a memory operation. */
1051 if (TREE_CODE_CLASS (rhs_code) != tcc_binary
1052 && TREE_CODE_CLASS (rhs_code) != tcc_unary
1053 && TREE_CODE_CLASS (rhs_code) != tcc_expression
1054 && TREE_CODE_CLASS (rhs_code) != tcc_comparison
1055 && rhs_code != VIEW_CONVERT_EXPR
1056 && rhs_code != CALL_EXPR)
1057 {
1058 if (dump_enabled_p ())
1059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1060 "Build SLP failed: operation unsupported %G",
1061 stmt);
1062 /* Fatal mismatch. */
1063 matches[0] = false;
1064 return false;
1065 }
1066
1067 if (rhs_code == COND_EXPR)
1068 {
1069 tree cond_expr = gimple_assign_rhs1 (stmt);
1070 enum tree_code cond_code = TREE_CODE (cond_expr);
1071 enum tree_code swap_code = ERROR_MARK;
1072 enum tree_code invert_code = ERROR_MARK;
1073
1074 if (i == 0)
1075 first_cond_code = TREE_CODE (cond_expr);
1076 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1077 {
1078 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1079 swap_code = swap_tree_comparison (cond_code);
1080 invert_code = invert_tree_comparison (cond_code, honor_nans);
1081 }
1082
1083 if (first_cond_code == cond_code)
1084 ;
1085 /* Isomorphic can be achieved by swapping. */
1086 else if (first_cond_code == swap_code)
1087 swap[i] = 1;
1088 /* Isomorphic can be achieved by inverting. */
1089 else if (first_cond_code == invert_code)
1090 swap[i] = 2;
1091 else
1092 {
1093 if (dump_enabled_p ())
1094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1095 "Build SLP failed: different"
1096 " operation %G", stmt);
1097 /* Mismatch. */
1098 continue;
1099 }
1100 }
1101 }
1102
1103 matches[i] = true;
1104 }
1105
1106 for (i = 0; i < group_size; ++i)
1107 if (!matches[i])
1108 return false;
1109
1110 /* If we allowed a two-operation SLP node verify the target can cope
1111 with the permute we are going to use. */
1112 if (alt_stmt_code != ERROR_MARK
1113 && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
1114 {
1115 if (!vect_two_operations_perm_ok_p (stmts, group_size,
1116 vectype, alt_stmt_code))
1117 {
1118 for (i = 0; i < group_size; ++i)
1119 if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
1120 {
1121 matches[i] = false;
1122 if (dump_enabled_p ())
1123 {
1124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1125 "Build SLP failed: different operation "
1126 "in stmt %G", stmts[i]->stmt);
1127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1128 "original stmt %G", first_stmt_info->stmt);
1129 }
1130 }
1131 return false;
1132 }
1133 *two_operators = true;
1134 }
1135
1136 return true;
1137 }
1138
1139 /* Traits for the hash tables used to cache SLP builds (successful or
1140 failed) for a stmt set. Note we never remove apart from at destruction
1141 time so we do not need a special value for deleted that differs from empty. */
1142 struct bst_traits
1143 {
1144 typedef vec <stmt_vec_info> value_type;
1145 typedef vec <stmt_vec_info> compare_type;
1146 static inline hashval_t hash (value_type);
1147 static inline bool equal (value_type existing, value_type candidate);
1148 static inline bool is_empty (value_type x) { return !x.exists (); }
1149 static inline bool is_deleted (value_type x) { return !x.exists (); }
1150 static const bool empty_zero_p = true;
1151 static inline void mark_empty (value_type &x) { x.release (); }
1152 static inline void mark_deleted (value_type &x) { x.release (); }
1153 static inline void remove (value_type &x) { x.release (); }
1154 };
1155 inline hashval_t
1156 bst_traits::hash (value_type x)
1157 {
1158 inchash::hash h;
1159 for (unsigned i = 0; i < x.length (); ++i)
1160 h.add_int (gimple_uid (x[i]->stmt));
1161 return h.end ();
1162 }
1163 inline bool
1164 bst_traits::equal (value_type existing, value_type candidate)
1165 {
1166 if (existing.length () != candidate.length ())
1167 return false;
1168 for (unsigned i = 0; i < existing.length (); ++i)
1169 if (existing[i] != candidate[i])
1170 return false;
1171 return true;
1172 }
1173
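/* Maps the vector of scalar stmts that would form an SLP node to the
   node already built for it, or to NULL if building that node failed,
   so identical stmt groups are analyzed only once and can share a
   subtree.  */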
1174 typedef hash_map <vec <gimple *>, slp_tree,
1175 simple_hashmap_traits <bst_traits, slp_tree> >
1176 scalar_stmts_to_slp_tree_map_t;
1177
1178 static slp_tree
1179 vect_build_slp_tree_2 (vec_info *vinfo,
1180 vec<stmt_vec_info> stmts, unsigned int group_size,
1181 poly_uint64 *max_nunits,
1182 bool *matches, unsigned *npermutes, unsigned *tree_size,
1183 scalar_stmts_to_slp_tree_map_t *bst_map);
1184
1185 static slp_tree
1186 vect_build_slp_tree (vec_info *vinfo,
1187 vec<stmt_vec_info> stmts, unsigned int group_size,
1188 poly_uint64 *max_nunits,
1189 bool *matches, unsigned *npermutes, unsigned *tree_size,
1190 scalar_stmts_to_slp_tree_map_t *bst_map)
1191 {
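/* Re-use a previously analyzed stmt group if possible: a cache hit
   (including a recorded failure) avoids re-building, and a live node
   gets its refcount bumped so the SLP graph can share subtrees.  */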
1192 if (slp_tree *leader = bst_map->get (stmts))
1193 {
1194 if (dump_enabled_p ())
1195 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1196 *leader ? "" : "failed ", *leader);
1197 if (*leader)
1198 {
1199 (*leader)->refcnt++;
1200 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1201 }
1202 return *leader;
1203 }
1204 poly_uint64 this_max_nunits = 1;
1205 slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size,
1206 &this_max_nunits,
1207 matches, npermutes, tree_size, bst_map);
1208 if (res)
1209 {
1210 res->max_nunits = this_max_nunits;
1211 vect_update_max_nunits (max_nunits, this_max_nunits);
1212 /* Keep a reference for the bst_map use. */
1213 res->refcnt++;
1214 }
1215 bst_map->put (stmts.copy (), res);
1216 return res;
1217 }
1218
1219 /* Recursively build an SLP tree for the group of scalar stmts STMTS.
1220 Fail (and return NULL) if the def-stmts are not isomorphic, require
1221 data permutation or are of unsupported types of operation, recording
1222 in MATCHES which stmts are not isomorphic to the first one.
1223 Otherwise return the built SLP tree. */
1224
1225
1226 static slp_tree
1227 vect_build_slp_tree_2 (vec_info *vinfo,
1228 vec<stmt_vec_info> stmts, unsigned int group_size,
1229 poly_uint64 *max_nunits,
1230 bool *matches, unsigned *npermutes, unsigned *tree_size,
1231 scalar_stmts_to_slp_tree_map_t *bst_map)
1232 {
1233 unsigned nops, i, this_tree_size = 0;
1234 poly_uint64 this_max_nunits = *max_nunits;
1235 slp_tree node;
1236
1237 matches[0] = false;
1238
1239 stmt_vec_info stmt_info = stmts[0];
1240 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1241 nops = gimple_call_num_args (stmt);
1242 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
1243 {
1244 nops = gimple_num_ops (stmt) - 1;
1245 if (gimple_assign_rhs_code (stmt) == COND_EXPR)
1246 nops++;
1247 }
1248 else if (is_a <gphi *> (stmt_info->stmt))
1249 nops = 0;
1250 else
1251 return NULL;
1252
1253 /* If the SLP node is a PHI (induction or reduction), terminate
1254 the recursion. */
1255 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1256 {
1257 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1258 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
1259 if (!vect_record_max_nunits (stmt_info, group_size, vectype, max_nunits))
1260 return NULL;
1261
1262 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1263 /* Induction from different IVs is not supported. */
1264 if (def_type == vect_induction_def)
1265 {
1266 stmt_vec_info other_info;
1267 FOR_EACH_VEC_ELT (stmts, i, other_info)
1268 if (stmt_info != other_info)
1269 return NULL;
1270 }
1271 else if (def_type == vect_reduction_def
1272 || def_type == vect_double_reduction_def
1273 || def_type == vect_nested_cycle)
1274 {
1275 /* Else def types have to match. */
1276 stmt_vec_info other_info;
1277 FOR_EACH_VEC_ELT (stmts, i, other_info)
1278 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1279 return NULL;
1280 }
1281 else
1282 return NULL;
1283 (*tree_size)++;
1284 node = vect_create_new_slp_node (stmts);
1285 return node;
1286 }
1287
1288
1289 bool two_operators = false;
1290 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1291 if (!vect_build_slp_tree_1 (swap, stmts, group_size,
1292 &this_max_nunits, matches, &two_operators))
1293 return NULL;
1294
1295 /* If the SLP node is a load, terminate the recursion unless masked. */
1296 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1297 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1298 {
1299 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1300 {
1301 /* Masked load. */
1302 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
1303 nops = 1;
1304 }
1305 else
1306 {
1307 *max_nunits = this_max_nunits;
1308 (*tree_size)++;
1309 node = vect_create_new_slp_node (stmts);
1310 /* And compute the load permutation. Whether it is actually
1311 a permutation depends on the unrolling factor which is
1312 decided later. */
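/* For instance, if the interleaving chain is the contiguous group
   a[0] ... a[3] and the loads appear in this node in the order
   a[2], a[0], a[3], a[1], the permutation recorded is { 2, 0, 3, 1 }.  */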
1313 vec<unsigned> load_permutation;
1314 int j;
1315 stmt_vec_info load_info;
1316 load_permutation.create (group_size);
1317 stmt_vec_info first_stmt_info
1318 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1319 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1320 {
1321 int load_place = vect_get_place_in_interleaving_chain
1322 (load_info, first_stmt_info);
1323 gcc_assert (load_place != -1);
1324 load_permutation.safe_push (load_place);
1325 }
1326 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1327 return node;
1328 }
1329 }
1330
1331 /* Get at the operands, verifying they are compatible. */
1332 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
1333 slp_oprnd_info oprnd_info;
1334 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1335 {
1336 int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
1337 stmts, i, &oprnds_info);
1338 if (res != 0)
1339 matches[(res == -1) ? 0 : i] = false;
1340 if (!matches[0])
1341 break;
1342 }
1343 for (i = 0; i < group_size; ++i)
1344 if (!matches[i])
1345 {
1346 vect_free_oprnd_info (oprnds_info);
1347 return NULL;
1348 }
1349
1350 auto_vec<slp_tree, 4> children;
1351
1352 stmt_info = stmts[0];
1353
1354 /* Create SLP_TREE nodes for the definition node/s. */
1355 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
1356 {
1357 slp_tree child;
1358 unsigned old_tree_size = this_tree_size;
1359 unsigned int j;
1360
1361 if (oprnd_info->first_dt == vect_uninitialized_def)
1362 {
1363 /* COND_EXPRs have one operand slot too many here if the condition
1364 is an SSA name rather than an embedded comparison. */
1365 gcc_assert (i == 3 && nops == 4);
1366 continue;
1367 }
1368
1369 if (oprnd_info->first_dt != vect_internal_def
1370 && oprnd_info->first_dt != vect_reduction_def
1371 && oprnd_info->first_dt != vect_induction_def)
1372 {
1373 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
1374 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
1375 oprnd_info->ops = vNULL;
1376 children.safe_push (invnode);
1377 continue;
1378 }
1379
1380 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1381 group_size, &this_max_nunits,
1382 matches, npermutes,
1383 &this_tree_size, bst_map)) != NULL)
1384 {
1385 /* If we have all children of a non-unary child built up from
1386 scalars then just throw that away and build it up this node
1387 from scalars. */
1388 if (is_a <bb_vec_info> (vinfo)
1389 && SLP_TREE_CHILDREN (child).length () > 1
1390 /* ??? Rejecting patterns this way doesn't work. We'd have to
1391 do extra work to cancel the pattern so the uses see the
1392 scalar version. */
1393 && !oprnd_info->any_pattern)
1394 {
1395 slp_tree grandchild;
1396
1397 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1398 if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
1399 break;
1400 if (!grandchild
1401 && vect_update_all_shared_vectypes (oprnd_info->def_stmts))
1402 {
1403 /* Roll back. */
1404 this_tree_size = old_tree_size;
1405 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1406 vect_free_slp_tree (grandchild, false);
1407 SLP_TREE_CHILDREN (child).truncate (0);
1408
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_NOTE, vect_location,
1411 "Building parent vector operands from "
1412 "scalars instead\n");
1413 oprnd_info->def_stmts = vNULL;
1414 SLP_TREE_DEF_TYPE (child) = vect_external_def;
1415 SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
1416 oprnd_info->ops = vNULL;
1417 ++this_tree_size;
1418 children.safe_push (child);
1419 continue;
1420 }
1421 }
1422
1423 oprnd_info->def_stmts = vNULL;
1424 children.safe_push (child);
1425 continue;
1426 }
1427
1428 /* If the SLP build failed fatally and we analyze a basic-block
1429 simply treat nodes we fail to build as externally defined
1430 (and thus build vectors from the scalar defs).
1431 The cost model will reject outright expensive cases.
1432 ??? This doesn't treat cases where permutation ultimately
1433 fails (or we don't try permutation below). Ideally we'd
1434 even compute a permutation that will end up with the maximum
1435 SLP tree size... */
1436 if (is_a <bb_vec_info> (vinfo)
1437 && !matches[0]
1438 /* ??? Rejecting patterns this way doesn't work. We'd have to
1439 do extra work to cancel the pattern so the uses see the
1440 scalar version. */
1441 && !is_pattern_stmt_p (stmt_info)
1442 && !oprnd_info->any_pattern
1443 && vect_update_all_shared_vectypes (oprnd_info->def_stmts))
1444 {
1445 if (dump_enabled_p ())
1446 dump_printf_loc (MSG_NOTE, vect_location,
1447 "Building vector operands from scalars\n");
1448 this_tree_size++;
1449 child = vect_create_new_slp_node (oprnd_info->def_stmts);
1450 SLP_TREE_DEF_TYPE (child) = vect_external_def;
1451 SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
1452 children.safe_push (child);
1453 oprnd_info->ops = vNULL;
1454 oprnd_info->def_stmts = vNULL;
1455 continue;
1456 }
1457
1458 /* If the SLP build for operand zero failed and operand zero
1459 and one can be commutated try that for the scalar stmts
1460 that failed the match. */
1461 if (i == 0
1462 /* A first scalar stmt mismatch signals a fatal mismatch. */
1463 && matches[0]
1464 /* ??? For COND_EXPRs we can swap the comparison operands
1465 as well as the arms under some constraints. */
1466 && nops == 2
1467 && oprnds_info[1]->first_dt == vect_internal_def
1468 && is_gimple_assign (stmt_info->stmt)
1469 /* Swapping operands for reductions breaks assumptions later on. */
1470 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
1471 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
1472 /* Do so only if the number of unsuccessful permutes was not more
1473 than a cut-off, as re-trying the recursive match on
1474 possibly each level of the tree would expose exponential
1475 behavior. */
1476 && *npermutes < 4)
1477 {
1478 /* See whether we can swap the matching or the non-matching
1479 stmt operands. */
1480 bool swap_not_matching = true;
1481 do
1482 {
1483 for (j = 0; j < group_size; ++j)
1484 {
1485 if (matches[j] != !swap_not_matching)
1486 continue;
1487 stmt_vec_info stmt_info = stmts[j];
1488 /* Verify if we can swap operands of this stmt. */
1489 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
1490 if (!stmt
1491 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
1492 {
1493 if (!swap_not_matching)
1494 goto fail;
1495 swap_not_matching = false;
1496 break;
1497 }
1498 }
1499 }
1500 while (j != group_size);
1501
1502 /* Swap mismatched definition stmts. */
1503 if (dump_enabled_p ())
1504 dump_printf_loc (MSG_NOTE, vect_location,
1505 "Re-trying with swapped operands of stmts ");
1506 for (j = 0; j < group_size; ++j)
1507 if (matches[j] == !swap_not_matching)
1508 {
1509 std::swap (oprnds_info[0]->def_stmts[j],
1510 oprnds_info[1]->def_stmts[j]);
1511 std::swap (oprnds_info[0]->ops[j],
1512 oprnds_info[1]->ops[j]);
1513 if (dump_enabled_p ())
1514 dump_printf (MSG_NOTE, "%d ", j);
1515 }
1516 if (dump_enabled_p ())
1517 dump_printf (MSG_NOTE, "\n");
1518 /* And try again with scratch 'matches' ... */
1519 bool *tem = XALLOCAVEC (bool, group_size);
1520 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
1521 group_size, &this_max_nunits,
1522 tem, npermutes,
1523 &this_tree_size, bst_map)) != NULL)
1524 {
1525 /* If we have all children of a non-unary child built up from
1526 scalars then just throw that away and build it up this node
1527 from scalars. */
1528 if (is_a <bb_vec_info> (vinfo)
1529 && SLP_TREE_CHILDREN (child).length () > 1
1530 /* ??? Rejecting patterns this way doesn't work. We'd have
1531 to do extra work to cancel the pattern so the uses see the
1532 scalar version. */
1533 && !oprnd_info->any_pattern)
1534 {
1535 unsigned int j;
1536 slp_tree grandchild;
1537
1538 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1539 if (SLP_TREE_DEF_TYPE (grandchild) != vect_external_def)
1540 break;
1541 if (!grandchild
1542 && (vect_update_all_shared_vectypes
1543 (oprnd_info->def_stmts)))
1544 {
1545 /* Roll back. */
1546 this_tree_size = old_tree_size;
1547 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
1548 vect_free_slp_tree (grandchild, false);
1549 SLP_TREE_CHILDREN (child).truncate (0);
1550
1551 if (dump_enabled_p ())
1552 dump_printf_loc (MSG_NOTE, vect_location,
1553 "Building parent vector operands from "
1554 "scalars instead\n");
1555 oprnd_info->def_stmts = vNULL;
1556 SLP_TREE_DEF_TYPE (child) = vect_external_def;
1557 SLP_TREE_SCALAR_OPS (child) = oprnd_info->ops;
1558 oprnd_info->ops = vNULL;
1559 ++this_tree_size;
1560 children.safe_push (child);
1561 continue;
1562 }
1563 }
1564
1565 oprnd_info->def_stmts = vNULL;
1566 children.safe_push (child);
1567 continue;
1568 }
1569
1570 ++*npermutes;
1571 }
1572
1573 fail:
1574 gcc_assert (child == NULL);
1575 FOR_EACH_VEC_ELT (children, j, child)
1576 vect_free_slp_tree (child, false);
1577 vect_free_oprnd_info (oprnds_info);
1578 return NULL;
1579 }
1580
1581 vect_free_oprnd_info (oprnds_info);
1582
1583 *tree_size += this_tree_size + 1;
1584 *max_nunits = this_max_nunits;
1585
1586 node = vect_create_new_slp_node (stmts);
1587 SLP_TREE_TWO_OPERATORS (node) = two_operators;
1588 SLP_TREE_CHILDREN (node).splice (children);
1589 return node;
1590 }
1591
1592 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
1593
1594 static void
1595 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1596 slp_tree node, hash_set<slp_tree> &visited)
1597 {
1598 unsigned i, j;
1599 stmt_vec_info stmt_info;
1600 slp_tree child;
1601 tree op;
1602
1603 if (visited.add (node))
1604 return;
1605
1606 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
1607 dump_user_location_t user_loc = loc.get_user_location ();
1608 dump_printf_loc (metadata, user_loc, "node%s %p (max_nunits=%u, refcnt=%u)\n",
1609 SLP_TREE_DEF_TYPE (node) == vect_external_def
1610 ? " (external)"
1611 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
1612 ? " (constant)"
1613 : ""), node,
1614 estimated_poly_value (node->max_nunits), node->refcnt);
1615 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1616 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1617 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
1618 else
1619 {
1620 dump_printf_loc (metadata, user_loc, "\t{ ");
1621 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1622 dump_printf (metadata, "%T%s ", op,
1623 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
1624 dump_printf (metadata, "}\n");
1625 }
1626 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1627 {
1628 dump_printf_loc (metadata, user_loc, "\tload permutation {");
1629 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
1630 dump_printf (dump_kind, " %u", j);
1631 dump_printf (dump_kind, " }\n");
1632 }
1633 if (SLP_TREE_CHILDREN (node).is_empty ())
1634 return;
1635 dump_printf_loc (metadata, user_loc, "\tchildren");
1636 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1637 dump_printf (dump_kind, " %p", (void *)child);
1638 dump_printf (dump_kind, "\n");
1639 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1640 vect_print_slp_tree (dump_kind, loc, child, visited);
1641 }
1642
1643 static void
1644 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
1645 slp_tree node)
1646 {
1647 hash_set<slp_tree> visited;
1648 vect_print_slp_tree (dump_kind, loc, node, visited);
1649 }
1650
1651 /* Mark the tree rooted at NODE with PURE_SLP. */
1652
1653 static void
1654 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
1655 {
1656 int i;
1657 stmt_vec_info stmt_info;
1658 slp_tree child;
1659
1660 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1661 return;
1662
1663 if (visited.add (node))
1664 return;
1665
1666 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1667 STMT_SLP_TYPE (stmt_info) = pure_slp;
1668
1669 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1670 vect_mark_slp_stmts (child, visited);
1671 }
1672
1673 static void
1674 vect_mark_slp_stmts (slp_tree node)
1675 {
1676 hash_set<slp_tree> visited;
1677 vect_mark_slp_stmts (node, visited);
1678 }
1679
1680 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
1681
1682 static void
1683 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
1684 {
1685 int i;
1686 stmt_vec_info stmt_info;
1687 slp_tree child;
1688
1689 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1690 return;
1691
1692 if (visited.add (node))
1693 return;
1694
1695 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1696 {
1697 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
1698 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
1699 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
1700 }
1701
1702 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1703 vect_mark_slp_stmts_relevant (child, visited);
1704 }
1705
1706 static void
1707 vect_mark_slp_stmts_relevant (slp_tree node)
1708 {
1709 hash_set<slp_tree> visited;
1710 vect_mark_slp_stmts_relevant (node, visited);
1711 }
1712
1713 /* Copy the SLP subtree rooted at NODE. */
1714
1715 static slp_tree
1716 slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
1717 {
1718 unsigned i;
1719
1720 bool existed_p;
1721 slp_tree &copy_ref = map.get_or_insert (node, &existed_p);
1722 if (existed_p)
1723 return copy_ref;
1724
1725 copy_ref = XNEW (_slp_tree);
1726 slp_tree copy = copy_ref;
1727 memcpy (copy, node, sizeof (_slp_tree));
1728 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1729 {
1730 SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy ();
1731 stmt_vec_info stmt_info;
1732 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1733 STMT_VINFO_NUM_SLP_USES (stmt_info)++;
1734 }
1735 if (SLP_TREE_SCALAR_OPS (node).exists ())
1736 SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
1737 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
1738 SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
1739 if (SLP_TREE_CHILDREN (node).exists ())
1740 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
1741 gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());
1742 copy->refcnt = 0;
1743
1744 slp_tree child;
1745 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child)
1746 {
1747 SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map);
1748 SLP_TREE_CHILDREN (copy)[i]->refcnt++;
1749 }
1750 return copy;
1751 }
1752
1753 /* Rearrange the statements of NODE according to PERMUTATION. */
1754
1755 static void
1756 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
1757 vec<unsigned> permutation,
1758 hash_set<slp_tree> &visited)
1759 {
1760 unsigned int i;
1761 slp_tree child;
1762
1763 if (visited.add (node))
1764 return;
1765
1766 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1767 vect_slp_rearrange_stmts (child, group_size, permutation, visited);
1768
1769 if (SLP_TREE_SCALAR_STMTS (node).exists ())
1770 {
1771 gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
1772 /* ??? Computation nodes are isomorphic and need no rearrangement.
1773 This is a quick hack to cover those where rearrangement breaks
1774 semantics because only the first stmt is guaranteed to have the
1775 correct operation code due to others being swapped or inverted. */
1776 stmt_vec_info first = SLP_TREE_SCALAR_STMTS (node)[0];
1777 if (is_gimple_assign (first->stmt)
1778 && gimple_assign_rhs_code (first->stmt) == COND_EXPR)
1779 return;
1780 vec<stmt_vec_info> tmp_stmts;
1781 tmp_stmts.create (group_size);
1782 tmp_stmts.quick_grow (group_size);
1783 stmt_vec_info stmt_info;
1784 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
1785 tmp_stmts[permutation[i]] = stmt_info;
1786 SLP_TREE_SCALAR_STMTS (node).release ();
1787 SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
1788 }
1789 if (SLP_TREE_SCALAR_OPS (node).exists ())
1790 {
1791 gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ());
1792 vec<tree> tmp_ops;
1793 tmp_ops.create (group_size);
1794 tmp_ops.quick_grow (group_size);
1795 tree op;
1796 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
1797 tmp_ops[permutation[i]] = op;
1798 SLP_TREE_SCALAR_OPS (node).release ();
1799 SLP_TREE_SCALAR_OPS (node) = tmp_ops;
1800 }
1801 }
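
/* As a hypothetical illustration of the rearrangement above: with
   GROUP_SIZE == 4 and PERMUTATION == {2, 0, 3, 1}, element 0 is moved
   to slot 2, element 1 to slot 0, element 2 to slot 3 and element 3 to
   slot 1, so scalar stmts {S0, S1, S2, S3} end up as {S1, S3, S0, S2};
   the same reordering is applied to the scalar ops of a node.  */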
1802
1803
1804 /* Attempt to reorder stmts in a reduction chain so that we don't
1805 require any load permutation. Return true if that was possible,
1806 otherwise return false. */
1807
1808 static bool
1809 vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
1810 {
1811 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1812 unsigned int i, j;
1813 unsigned int lidx;
1814 slp_tree node, load;
1815
1816 /* Compare all the permutation sequences to the first one. We know
1817 that at least one load is permuted. */
1818 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1819 if (!node->load_permutation.exists ())
1820 return false;
1821 for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
1822 {
1823 if (!load->load_permutation.exists ())
1824 return false;
1825 FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
1826 if (lidx != node->load_permutation[j])
1827 return false;
1828 }
1829
1830 /* Check that the loads in the first sequence are different and there
1831 are no gaps between them. */
1832 auto_sbitmap load_index (group_size);
1833 bitmap_clear (load_index);
1834 FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
1835 {
1836 if (lidx >= group_size)
1837 return false;
1838 if (bitmap_bit_p (load_index, lidx))
1839 return false;
1840
1841 bitmap_set_bit (load_index, lidx);
1842 }
1843 for (i = 0; i < group_size; i++)
1844 if (!bitmap_bit_p (load_index, i))
1845 return false;
1846
1847 /* This permutation is valid for reduction. Since the order of the
1848 statements in the nodes is not important unless they are memory
1849 accesses, we can rearrange the statements in all the nodes
1850 according to the order of the loads. */
1851
1852 /* We have to unshare the SLP tree we modify. */
1853 hash_map<slp_tree, slp_tree> map;
1854 slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map);
1855 vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn), false);
1856 unshared->refcnt++;
1857 SLP_INSTANCE_TREE (slp_instn) = unshared;
1858 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1859 SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node);
1860 node = SLP_INSTANCE_LOADS (slp_instn)[0];
1861
1862 /* Do the actual re-arrangement. */
1863 hash_set<slp_tree> visited;
1864 vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
1865 node->load_permutation, visited);
1866
1867 /* We are done, no actual permutations need to be generated. */
1868 poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
1869 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1870 {
1871 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1872 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
1873 /* But we have to keep those permutations that are required because
1874 of handling of gaps. */
1875 if (known_eq (unrolling_factor, 1U)
1876 || (group_size == DR_GROUP_SIZE (first_stmt_info)
1877 && DR_GROUP_GAP (first_stmt_info) == 0))
1878 SLP_TREE_LOAD_PERMUTATION (node).release ();
1879 else
1880 for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j)
1881 SLP_TREE_LOAD_PERMUTATION (node)[j] = j;
1882 }
1883
1884 return true;
1885 }
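
/* For example, in a two-lane reduction whose load nodes all carry the
   same permutation {1, 0}, the permutation covers every group element
   exactly once, so the scalar stmts of every node in the (unshared)
   instance are reordered by {1, 0} and the load permutations can
   normally be dropped, the order of the reduction lanes being
   irrelevant.  */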
1886
1887 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
1888
1889 static void
1890 vect_gather_slp_loads (slp_instance inst, slp_tree node,
1891 hash_set<slp_tree> &visited)
1892 {
1893 if (visited.add (node))
1894 return;
1895
1896 if (SLP_TREE_CHILDREN (node).length () == 0)
1897 {
1898 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
1899 return;
1900 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1901 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1902 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1903 SLP_INSTANCE_LOADS (inst).safe_push (node);
1904 }
1905 else
1906 {
1907 unsigned i;
1908 slp_tree child;
1909 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
1910 vect_gather_slp_loads (inst, child, visited);
1911 }
1912 }
1913
1914 static void
1915 vect_gather_slp_loads (slp_instance inst, slp_tree node)
1916 {
1917 hash_set<slp_tree> visited;
1918 vect_gather_slp_loads (inst, node, visited);
1919 }
1920
1921 /* Check if the required load permutations in the SLP instance
1922 SLP_INSTN are supported. */
1923
1924 static bool
1925 vect_supported_load_permutation_p (slp_instance slp_instn)
1926 {
1927 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
1928 unsigned int i, j, k, next;
1929 slp_tree node;
1930
1931 if (dump_enabled_p ())
1932 {
1933 dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
1934 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1935 if (node->load_permutation.exists ())
1936 FOR_EACH_VEC_ELT (node->load_permutation, j, next)
1937 dump_printf (MSG_NOTE, "%d ", next);
1938 else
1939 for (k = 0; k < group_size; ++k)
1940 dump_printf (MSG_NOTE, "%d ", k);
1941 dump_printf (MSG_NOTE, "\n");
1942 }
1943
1944 /* In case of reduction every load permutation is allowed, since the order
1945 of the reduction statements is not important (as opposed to the case of
1946 grouped stores). The only condition we need to check is that all the
1947 load nodes are of the same size and have the same permutation (and then
1948 rearrange all the nodes of the SLP instance according to this
1949 permutation). */
1950
1951 /* Check that all the load nodes are of the same size. */
1952 /* ??? Can't we assert this? */
1953 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1954 if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
1955 return false;
1956
1957 node = SLP_INSTANCE_TREE (slp_instn);
1958 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
1959
1960 /* Reduction (there are no data-refs in the root).
1961 In reduction chain the order of the loads is not important. */
1962 if (!STMT_VINFO_DATA_REF (stmt_info)
1963 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1964 vect_attempt_slp_rearrange_stmts (slp_instn);
1965
1966 /* In basic block vectorization we allow any subchain of an interleaving
1967 chain.
1968 FORNOW: not supported in loop SLP because of realignment complications. */
1969 if (STMT_VINFO_BB_VINFO (stmt_info))
1970 {
1971 /* Check whether the loads in an instance form a subchain and thus
1972 no permutation is necessary. */
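/* E.g. for an interleaving group of loads from a[0]..a[3] (names purely
   illustrative), an SLP node whose scalar stmts are the loads of a[2]
   and a[3] (consecutive group members with DR_GROUP_GAP 1) forms such a
   subchain, and its load permutation can be released below.  */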
1973 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
1974 {
1975 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
1976 continue;
1977 bool subchain_p = true;
1978 stmt_vec_info next_load_info = NULL;
1979 stmt_vec_info load_info;
1980 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1981 {
1982 if (j != 0
1983 && (next_load_info != load_info
1984 || DR_GROUP_GAP (load_info) != 1))
1985 {
1986 subchain_p = false;
1987 break;
1988 }
1989 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
1990 }
1991 if (subchain_p)
1992 SLP_TREE_LOAD_PERMUTATION (node).release ();
1993 else
1994 {
1995 stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
1996 group_info = DR_GROUP_FIRST_ELEMENT (group_info);
1997 unsigned HOST_WIDE_INT nunits;
1998 unsigned k, maxk = 0;
1999 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
2000 if (k > maxk)
2001 maxk = k;
2002 /* In BB vectorization we may not actually use a loaded vector
2003 accessing elements in excess of DR_GROUP_SIZE. */
2004 tree vectype = STMT_VINFO_VECTYPE (group_info);
2005 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
2006 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
2007 {
2008 if (dump_enabled_p ())
2009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2010 "BB vectorization with gaps at the end of "
2011 "a load is not supported\n");
2012 return false;
2013 }
2014
2015 /* Verify the permutation can be generated. */
2016 vec<tree> tem;
2017 unsigned n_perms;
2018 if (!vect_transform_slp_perm_load (node, tem, NULL,
2019 1, slp_instn, true, &n_perms))
2020 {
2021 if (dump_enabled_p ())
2022 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
2023 vect_location,
2024 "unsupported load permutation\n");
2025 return false;
2026 }
2027 }
2028 }
2029 return true;
2030 }
2031
2032 /* For loop vectorization verify we can generate the permutation. Be
2033 conservative about the vectorization factor, there are permutations
2034 that will use three vector inputs only starting from a specific factor
2035 and the vectorization factor is not yet final.
2036 ??? The SLP instance unrolling factor might not be the maximum one. */
2037 unsigned n_perms;
2038 poly_uint64 test_vf
2039 = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
2040 LOOP_VINFO_VECT_FACTOR
2041 (STMT_VINFO_LOOP_VINFO (stmt_info)));
2042 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
2043 if (node->load_permutation.exists ()
2044 && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
2045 slp_instn, true, &n_perms))
2046 return false;
2047
2048 return true;
2049 }
2050
2051
2052 /* Find the last scalar statement in the SLP tree rooted at NODE. */
2053
2054 stmt_vec_info
2055 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2056 {
2057 stmt_vec_info last = NULL;
2058 stmt_vec_info stmt_vinfo;
2059
2060 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2061 {
2062 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2063 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2064 }
2065
2066 return last;
2067 }
2068
2069 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2070 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2071 (also containing the first GROUP1_SIZE stmts, since stores are
2072 consecutive), the second containing the remainder.
2073 Return the first stmt in the second group. */
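
/* As a hypothetical example, splitting a group of 7 stores with
   GROUP1_SIZE == 4 and an original DR_GROUP_GAP of 0 leaves the first
   group with DR_GROUP_SIZE 4 and DR_GROUP_GAP 3 (skipping over the new
   second group), while the second group of 3 starts at the fifth store
   with DR_GROUP_GAP 4 (skipping over the first group).  */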
2074
2075 static stmt_vec_info
2076 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2077 {
2078 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2079 gcc_assert (group1_size > 0);
2080 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2081 gcc_assert (group2_size > 0);
2082 DR_GROUP_SIZE (first_vinfo) = group1_size;
2083
2084 stmt_vec_info stmt_info = first_vinfo;
2085 for (unsigned i = group1_size; i > 1; i--)
2086 {
2087 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2088 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2089 }
2090 /* STMT is now the last element of the first group. */
2091 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2092 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2093
2094 DR_GROUP_SIZE (group2) = group2_size;
2095 for (stmt_info = group2; stmt_info;
2096 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2097 {
2098 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2099 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2100 }
2101
2102 /* For the second group, the DR_GROUP_GAP is that before the original group,
2103 plus skipping over the first vector. */
2104 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2105
2106 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2107 DR_GROUP_GAP (first_vinfo) += group2_size;
2108
2109 if (dump_enabled_p ())
2110 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2111 group1_size, group2_size);
2112
2113 return group2;
2114 }
2115
2116 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2117 statements and a vector of NUNITS elements. */
2118
2119 static poly_uint64
2120 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2121 {
2122 return exact_div (common_multiple (nunits, group_size), group_size);
2123 }
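
/* For instance, with NUNITS == 4 and GROUP_SIZE == 6 the common multiple
   is 12, giving an unrolling factor of 2: two copies of the 6-stmt group
   fill exactly three 4-element vectors.  */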
2124
2125 /* Analyze an SLP instance starting from a group of grouped stores. Call
2126 vect_build_slp_tree to build a tree of packed stmts if possible.
2127 Return FALSE if it's impossible to SLP any stmt in the loop. */
2128
2129 static bool
2130 vect_analyze_slp_instance (vec_info *vinfo,
2131 scalar_stmts_to_slp_tree_map_t *bst_map,
2132 stmt_vec_info stmt_info, unsigned max_tree_size)
2133 {
2134 slp_instance new_instance;
2135 slp_tree node;
2136 unsigned int group_size;
2137 tree vectype, scalar_type = NULL_TREE;
2138 unsigned int i;
2139 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2140 vec<stmt_vec_info> scalar_stmts;
2141 bool constructor = false;
2142
2143 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2144 {
2145 scalar_type = TREE_TYPE (DR_REF (dr));
2146 group_size = DR_GROUP_SIZE (stmt_info);
2147 vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
2148 }
2149 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2150 {
2151 gcc_assert (is_a <loop_vec_info> (vinfo));
2152 vectype = STMT_VINFO_VECTYPE (stmt_info);
2153 group_size = REDUC_GROUP_SIZE (stmt_info);
2154 }
2155 else if (is_gimple_assign (stmt_info->stmt)
2156 && gimple_assign_rhs_code (stmt_info->stmt) == CONSTRUCTOR)
2157 {
2158 vectype = TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt));
2159 group_size = CONSTRUCTOR_NELTS (gimple_assign_rhs1 (stmt_info->stmt));
2160 constructor = true;
2161 }
2162 else
2163 {
2164 gcc_assert (is_a <loop_vec_info> (vinfo));
2165 vectype = STMT_VINFO_VECTYPE (stmt_info);
2166 group_size = as_a <loop_vec_info> (vinfo)->reductions.length ();
2167 }
2168
2169 if (!vectype)
2170 {
2171 if (dump_enabled_p ())
2172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2173 "Build SLP failed: unsupported data-type %T\n",
2174 scalar_type);
2175
2176 return false;
2177 }
2178 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2179
2180 /* Create a node (a root of the SLP tree) for the packed grouped stores. */
2181 scalar_stmts.create (group_size);
2182 stmt_vec_info next_info = stmt_info;
2183 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2184 {
2185 /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
2186 while (next_info)
2187 {
2188 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2189 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
2190 }
2191 }
2192 else if (!dr && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2193 {
2194 /* Collect the reduction stmts and store them in
2195 SLP_TREE_SCALAR_STMTS. */
2196 while (next_info)
2197 {
2198 scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info));
2199 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
2200 }
2201 /* Mark the first element of the reduction chain as reduction to properly
2202 transform the node. In the reduction analysis phase only the last
2203 element of the chain is marked as reduction. */
2204 STMT_VINFO_DEF_TYPE (stmt_info)
2205 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
2206 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
2207 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
2208 }
2209 else if (constructor)
2210 {
2211 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
2212 tree val;
2213 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
2214 {
2215 if (TREE_CODE (val) == SSA_NAME)
2216 {
2217 gimple* def = SSA_NAME_DEF_STMT (val);
2218 stmt_vec_info def_info = vinfo->lookup_stmt (def);
2219 /* Value is defined in another basic block. */
2220 if (!def_info)
2221 return false;
2222 def_info = vect_stmt_to_vectorize (def_info);
2223 scalar_stmts.safe_push (def_info);
2224 }
2225 else
2226 return false;
2227 }
2228 if (dump_enabled_p ())
2229 dump_printf_loc (MSG_NOTE, vect_location,
2230 "Analyzing vectorizable constructor: %G\n",
2231 stmt_info->stmt);
2232 }
2233 else
2234 {
2235 /* Collect reduction statements. */
2236 vec<stmt_vec_info> reductions = as_a <loop_vec_info> (vinfo)->reductions;
2237 for (i = 0; reductions.iterate (i, &next_info); i++)
2238 scalar_stmts.safe_push (next_info);
2239 }
2240
2241 /* Build the tree for the SLP instance. */
2242 bool *matches = XALLOCAVEC (bool, group_size);
2243 unsigned npermutes = 0;
2244 poly_uint64 max_nunits = nunits;
2245 unsigned tree_size = 0;
2246 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
2247 &max_nunits, matches, &npermutes,
2248 &tree_size, bst_map);
2249 if (node != NULL)
2250 {
2251 /* Calculate the unrolling factor based on the smallest type. */
2252 poly_uint64 unrolling_factor
2253 = calculate_unrolling_factor (max_nunits, group_size);
2254
2255 if (maybe_ne (unrolling_factor, 1U)
2256 && is_a <bb_vec_info> (vinfo))
2257 {
2258 unsigned HOST_WIDE_INT const_max_nunits;
2259 if (!max_nunits.is_constant (&const_max_nunits)
2260 || const_max_nunits > group_size)
2261 {
2262 if (dump_enabled_p ())
2263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2264 "Build SLP failed: store group "
2265 "size not a multiple of the vector size "
2266 "in basic block SLP\n");
2267 vect_free_slp_tree (node, false);
2268 return false;
2269 }
2270 /* Fatal mismatch. */
2271 matches[group_size / const_max_nunits * const_max_nunits] = false;
2272 vect_free_slp_tree (node, false);
2273 }
2274 else
2275 {
2276 /* Create a new SLP instance. */
2277 new_instance = XNEW (class _slp_instance);
2278 SLP_INSTANCE_TREE (new_instance) = node;
2279 SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
2280 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
2281 SLP_INSTANCE_LOADS (new_instance) = vNULL;
2282 SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
2283
2284 vect_gather_slp_loads (new_instance, node);
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_NOTE, vect_location,
2287 "SLP size %u vs. limit %u.\n",
2288 tree_size, max_tree_size);
2289
2290 /* Compute the load permutation. */
2291 slp_tree load_node;
2292 bool loads_permuted = false;
2293 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2294 {
2295 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2296 continue;
2297 unsigned j;
2298 stmt_vec_info load_info;
2299 bool this_load_permuted = false;
2300 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
2301 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2302 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2303 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2304 {
2305 this_load_permuted = true;
2306 break;
2307 }
2308 if (!this_load_permuted
2309 /* The load requires permutation when unrolling exposes
2310 a gap either because the group is larger than the SLP
2311 group-size or because there is a gap between the groups. */
2312 && (known_eq (unrolling_factor, 1U)
2313 || (group_size == DR_GROUP_SIZE (first_stmt_info)
2314 && DR_GROUP_GAP (first_stmt_info) == 0)))
2315 {
2316 SLP_TREE_LOAD_PERMUTATION (load_node).release ();
2317 continue;
2318 }
2319 loads_permuted = true;
2320 }
2321
2322 if (loads_permuted)
2323 {
2324 if (!vect_supported_load_permutation_p (new_instance))
2325 {
2326 if (dump_enabled_p ())
2327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2328 "Build SLP failed: unsupported load "
2329 "permutation %G", stmt_info->stmt);
2330 vect_free_slp_instance (new_instance, false);
2331 return false;
2332 }
2333 }
2334
2335 /* If the loads and stores can be handled with load/store-lanes
2336 instructions, do not generate this SLP instance. */
2337 if (is_a <loop_vec_info> (vinfo)
2338 && loads_permuted
2339 && dr && vect_store_lanes_supported (vectype, group_size, false))
2340 {
2341 slp_tree load_node;
2342 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
2343 {
2344 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2345 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2346 /* Use SLP for strided accesses (or if we can't load-lanes). */
2347 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2348 || ! vect_load_lanes_supported
2349 (STMT_VINFO_VECTYPE (stmt_vinfo),
2350 DR_GROUP_SIZE (stmt_vinfo), false))
2351 break;
2352 }
2353 if (i == SLP_INSTANCE_LOADS (new_instance).length ())
2354 {
2355 if (dump_enabled_p ())
2356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2357 "Built SLP cancelled: can use "
2358 "load/store-lanes\n");
2359 vect_free_slp_instance (new_instance, false);
2360 return false;
2361 }
2362 }
2363
2364 /* If this is a reduction chain with a conversion in front
2365 amend the SLP tree with a node for that. */
2366 if (!dr
2367 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
2368 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
2369 {
2370 /* Get at the conversion stmt - we know it's the single use
2371 of the last stmt of the reduction chain. */
2372 gimple *tem = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
2373 use_operand_p use_p;
2374 gimple *use_stmt;
2375 bool r = single_imm_use (gimple_assign_lhs (tem),
2376 &use_p, &use_stmt);
2377 gcc_assert (r);
2378 next_info = vinfo->lookup_stmt (use_stmt);
2379 next_info = vect_stmt_to_vectorize (next_info);
2380 scalar_stmts = vNULL;
2381 scalar_stmts.create (group_size);
2382 for (unsigned i = 0; i < group_size; ++i)
2383 scalar_stmts.quick_push (next_info);
2384 slp_tree conv = vect_create_new_slp_node (scalar_stmts);
2385 SLP_TREE_CHILDREN (conv).quick_push (node);
2386 SLP_INSTANCE_TREE (new_instance) = conv;
2387 /* We also have to fake this conversion stmt as SLP reduction
2388 group so we don't have to mess with too much code
2389 elsewhere. */
2390 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
2391 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
2392 }
2393
2394 vinfo->slp_instances.safe_push (new_instance);
2395
2396 if (dump_enabled_p ())
2397 {
2398 dump_printf_loc (MSG_NOTE, vect_location,
2399 "Final SLP tree for instance:\n");
2400 vect_print_slp_tree (MSG_NOTE, vect_location,
2401 SLP_INSTANCE_TREE (new_instance));
2402 }
2403
2404 return true;
2405 }
2406 }
2407 else
2408 {
2409 /* Failed to SLP. */
2410 /* Free the allocated memory. */
2411 scalar_stmts.release ();
2412 }
2413
2414 /* For basic block SLP, try to break the group up into multiples of the
2415 vector size. */
2416 unsigned HOST_WIDE_INT const_nunits;
2417 if (is_a <bb_vec_info> (vinfo)
2418 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2419 && DR_GROUP_FIRST_ELEMENT (stmt_info)
2420 && nunits.is_constant (&const_nunits))
2421 {
2422 /* We consider breaking the group only on VF boundaries from the existing
2423 start. */
2424 for (i = 0; i < group_size; i++)
2425 if (!matches[i]) break;
2426
2427 if (i >= const_nunits && i < group_size)
2428 {
2429 /* Split into two groups at the first vector boundary before i. */
2430 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
2431 unsigned group1_size = i & ~(const_nunits - 1);
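/* A worked illustration, assuming const_nunits == 4, group_size == 16
   and the first mismatch at i == 6: group1_size becomes 6 & ~3 == 4, so
   the store group is split into a group of 4 (analyzed recursively) and
   a group of 12; since 4 < 6 the vector-sized chunk containing the
   mismatch is then skipped and analysis resumes with the remaining
   8 stmts.  */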
2432
2433 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
2434 group1_size);
2435 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
2436 max_tree_size);
2437 /* If the first non-match was in the middle of a vector,
2438 skip the rest of that vector. */
2439 if (group1_size < i)
2440 {
2441 i = group1_size + const_nunits;
2442 if (i < group_size)
2443 rest = vect_split_slp_store_group (rest, const_nunits);
2444 }
2445 if (i < group_size)
2446 res |= vect_analyze_slp_instance (vinfo, bst_map,
2447 rest, max_tree_size);
2448 return res;
2449 }
2450 /* Even though the first vector did not all match, we might be able to SLP
2451 (some) of the remainder. FORNOW ignore this possibility. */
2452 }
2453
2454 return false;
2455 }
2456
2457
2458 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
2459 trees of packed scalar stmts if SLP is possible. */
2460
2461 opt_result
2462 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
2463 {
2464 unsigned int i;
2465 stmt_vec_info first_element;
2466
2467 DUMP_VECT_SCOPE ("vect_analyze_slp");
2468
2469 scalar_stmts_to_slp_tree_map_t *bst_map
2470 = new scalar_stmts_to_slp_tree_map_t ();
2471
2472 /* Find SLP sequences starting from groups of grouped stores. */
2473 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
2474 vect_analyze_slp_instance (vinfo, bst_map, first_element, max_tree_size);
2475
2476 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2477 {
2478 if (loop_vinfo->reduction_chains.length () > 0)
2479 {
2480 /* Find SLP sequences starting from reduction chains. */
2481 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
2482 if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
2483 max_tree_size))
2484 {
2485 /* Dissolve reduction chain group. */
2486 stmt_vec_info vinfo = first_element;
2487 stmt_vec_info last = NULL;
2488 while (vinfo)
2489 {
2490 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
2491 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
2492 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2493 last = vinfo;
2494 vinfo = next;
2495 }
2496 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
2497 /* It can still be vectorized as part of an SLP reduction. */
2498 loop_vinfo->reductions.safe_push (last);
2499 }
2500 }
2501
2502 /* Find SLP sequences starting from groups of reductions. */
2503 if (loop_vinfo->reductions.length () > 1)
2504 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
2505 max_tree_size);
2506 }
2507
2508 /* The map keeps a reference on SLP nodes built, release that. */
2509 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
2510 it != bst_map->end (); ++it)
2511 if ((*it).second)
2512 vect_free_slp_tree ((*it).second, false);
2513 delete bst_map;
2514
2515 return opt_result::success ();
2516 }
2517
2518
2519 /* For each possible SLP instance decide whether to SLP it and calculate overall
2520 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
2521 least one instance. */
2522
2523 bool
2524 vect_make_slp_decision (loop_vec_info loop_vinfo)
2525 {
2526 unsigned int i;
2527 poly_uint64 unrolling_factor = 1;
2528 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2529 slp_instance instance;
2530 int decided_to_slp = 0;
2531
2532 DUMP_VECT_SCOPE ("vect_make_slp_decision");
2533
2534 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2535 {
2536 /* FORNOW: SLP if you can. */
2537 /* All unroll factors have the form:
2538
2539 GET_MODE_SIZE (vinfo->vector_mode) * X
2540
2541 for some rational X, so they must have a common multiple. */
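/* E.g. instance unrolling factors of 2 and 4 combine to an overall
   factor of 4, and factors of 2 and 3 would combine to 6; for constant
   factors this is simply the least common multiple.  */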
2542 unrolling_factor
2543 = force_common_multiple (unrolling_factor,
2544 SLP_INSTANCE_UNROLLING_FACTOR (instance));
2545
2546 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
2547 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
2548 loop-based vectorization. Such stmts will be marked as HYBRID. */
2549 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
2550 decided_to_slp++;
2551 }
2552
2553 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
2554
2555 if (decided_to_slp && dump_enabled_p ())
2556 {
2557 dump_printf_loc (MSG_NOTE, vect_location,
2558 "Decided to SLP %d instances. Unrolling factor ",
2559 decided_to_slp);
2560 dump_dec (MSG_NOTE, unrolling_factor);
2561 dump_printf (MSG_NOTE, "\n");
2562 }
2563
2564 return (decided_to_slp > 0);
2565 }
2566
2567
2568 /* Find stmts that must be both vectorized and SLPed (since they feed stmts that
2569 can't be SLPed) in the tree rooted at NODE. Mark such stmts as HYBRID. */
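
/* A hypothetical example (variable names only illustrative): if the adds
   feeding the grouped stores
     a[2*i]   = b[2*i]   + c[2*i];
     a[2*i+1] = b[2*i+1] + c[2*i+1];
   form an SLP instance but the loop also contains the non-SLP reduction
     s += b[2*i];
   then the load of b[2*i] is a pure-SLP def with a use in a non-SLP
   stmt, so it is marked HYBRID and has to be vectorized both ways.  */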
2570
2571 static void
2572 vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype,
2573 hash_map<slp_tree, unsigned> &visited)
2574 {
2575 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (node)[i];
2576 imm_use_iterator imm_iter;
2577 gimple *use_stmt;
2578 stmt_vec_info use_vinfo;
2579 slp_tree child;
2580 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2581 int j;
2582
2583 /* We need to union stype over the incoming graph edges but we still
2584 want to limit recursion to stay O(N+E). */
2585 unsigned visited_cnt = ++visited.get_or_insert (node);
2586 gcc_assert (visited_cnt <= node->refcnt);
2587 bool only_edge = (visited_cnt != node->refcnt);
2588
2589 /* Propagate hybrid down the SLP tree. */
2590 if (stype == hybrid)
2591 ;
2592 else if (HYBRID_SLP_STMT (stmt_vinfo))
2593 stype = hybrid;
2594 else if (!only_edge)
2595 {
2596 /* Check if a pure SLP stmt has uses in non-SLP stmts. */
2597 gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
2598 /* If we get a pattern stmt here we have to use the LHS of the
2599 original stmt for immediate uses. */
2600 gimple *stmt = vect_orig_stmt (stmt_vinfo)->stmt;
2601 tree def;
2602 if (gimple_code (stmt) == GIMPLE_PHI)
2603 def = gimple_phi_result (stmt);
2604 else
2605 def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF);
2606 if (def)
2607 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, def)
2608 {
2609 use_vinfo = loop_vinfo->lookup_stmt (use_stmt);
2610 if (!use_vinfo)
2611 continue;
2612 use_vinfo = vect_stmt_to_vectorize (use_vinfo);
2613 if (!STMT_SLP_TYPE (use_vinfo)
2614 && (STMT_VINFO_RELEVANT (use_vinfo)
2615 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2616 && !(gimple_code (use_stmt) == GIMPLE_PHI
2617 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2618 {
2619 if (dump_enabled_p ())
2620 dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
2621 "def in non-SLP stmt: %G", use_stmt);
2622 stype = hybrid;
2623 }
2624 }
2625 }
2626
2627 if (stype == hybrid
2628 && !HYBRID_SLP_STMT (stmt_vinfo))
2629 {
2630 if (dump_enabled_p ())
2631 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2632 stmt_vinfo->stmt);
2633 STMT_SLP_TYPE (stmt_vinfo) = hybrid;
2634 }
2635
2636 if (!only_edge)
2637 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2638 if (SLP_TREE_DEF_TYPE (child) != vect_external_def
2639 && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
2640 vect_detect_hybrid_slp_stmts (child, i, stype, visited);
2641 }
2642
2643 /* Helpers for vect_detect_hybrid_slp walking pattern stmt uses. */
2644
2645 static tree
2646 vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
2647 {
2648 walk_stmt_info *wi = (walk_stmt_info *)data;
2649 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2650
2651 if (wi->is_lhs)
2652 return NULL_TREE;
2653
2654 stmt_vec_info def_stmt_info = loop_vinfo->lookup_def (*tp);
2655 if (def_stmt_info && PURE_SLP_STMT (def_stmt_info))
2656 {
2657 if (dump_enabled_p ())
2658 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
2659 def_stmt_info->stmt);
2660 STMT_SLP_TYPE (def_stmt_info) = hybrid;
2661 }
2662
2663 return NULL_TREE;
2664 }
2665
2666 static tree
2667 vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
2668 walk_stmt_info *wi)
2669 {
2670 loop_vec_info loop_vinfo = (loop_vec_info) wi->info;
2671 stmt_vec_info use_vinfo = loop_vinfo->lookup_stmt (gsi_stmt (*gsi));
2672 /* If the stmt is in an SLP instance then this isn't a reason
2673 to mark use definitions in other SLP instances as hybrid. */
2674 if (! STMT_SLP_TYPE (use_vinfo)
2675 && (STMT_VINFO_RELEVANT (use_vinfo)
2676 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
2677 && ! (gimple_code (gsi_stmt (*gsi)) == GIMPLE_PHI
2678 && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
2679 ;
2680 else
2681 *handled = true;
2682 return NULL_TREE;
2683 }
2684
2685 /* Find stmts that must be both vectorized and SLPed. */
2686
2687 void
2688 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
2689 {
2690 unsigned int i;
2691 vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2692 slp_instance instance;
2693
2694 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
2695
2696 /* First walk all pattern stmts in the loop and mark defs of uses as
2697 hybrid because immediate uses in them are not recorded. */
2698 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2699 {
2700 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2701 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2702 gsi_next (&gsi))
2703 {
2704 gimple *stmt = gsi_stmt (gsi);
2705 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
2706 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2707 {
2708 walk_stmt_info wi;
2709 memset (&wi, 0, sizeof (wi));
2710 wi.info = loop_vinfo;
2711 gimple_stmt_iterator gsi2
2712 = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info)->stmt);
2713 walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
2714 vect_detect_hybrid_slp_1, &wi);
2715 walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
2716 vect_detect_hybrid_slp_2,
2717 vect_detect_hybrid_slp_1, &wi);
2718 }
2719 }
2720 }
2721
2722 /* Then walk the SLP instance trees marking stmts with uses in
2723 non-SLP stmts as hybrid, also propagating hybrid down the
2724 SLP tree, collecting the above info on-the-fly. */
2725 for (unsigned j = 0;; ++j)
2726 {
2727 hash_map<slp_tree, unsigned> visited;
2728 bool any = false;
2729 FOR_EACH_VEC_ELT (slp_instances, i, instance)
2730 if (j < SLP_INSTANCE_GROUP_SIZE (instance))
2731 {
2732 any = true;
2733 vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
2734 j, pure_slp, visited);
2735 }
2736 if (!any)
2737 break;
2738 }
2739 }
2740
2741
2742 /* Initialize a bb_vec_info struct for the statements between
2743 REGION_BEGIN_IN (inclusive) and REGION_END_IN (exclusive). */
2744
2745 _bb_vec_info::_bb_vec_info (gimple_stmt_iterator region_begin_in,
2746 gimple_stmt_iterator region_end_in,
2747 vec_info_shared *shared)
2748 : vec_info (vec_info::bb, init_cost (NULL), shared),
2749 bb (gsi_bb (region_begin_in)),
2750 region_begin (region_begin_in),
2751 region_end (region_end_in)
2752 {
2753 gimple_stmt_iterator gsi;
2754
2755 for (gsi = region_begin; gsi_stmt (gsi) != gsi_stmt (region_end);
2756 gsi_next (&gsi))
2757 {
2758 gimple *stmt = gsi_stmt (gsi);
2759 gimple_set_uid (stmt, 0);
2760 add_stmt (stmt);
2761 }
2762
2763 bb->aux = this;
2764 }
2765
2766
2767 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
2768 stmts in the basic block. */
2769
2770 _bb_vec_info::~_bb_vec_info ()
2771 {
2772 for (gimple_stmt_iterator si = region_begin;
2773 gsi_stmt (si) != gsi_stmt (region_end); gsi_next (&si))
2774 /* Reset region marker. */
2775 gimple_set_uid (gsi_stmt (si), -1);
2776
2777 bb->aux = NULL;
2778 }
2779
2780 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
2781 given that child nodes have already been processed, and that
2782 their def types currently match their SLP node's def type. */
2783
2784 static bool
2785 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
2786 slp_instance node_instance,
2787 stmt_vector_for_cost *cost_vec)
2788 {
2789 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2790 gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
2791
2792 /* Calculate the number of vector statements to be created for the
2793 scalar stmts in this node. For SLP reductions it is equal to the
2794 number of vector statements in the children (which has already been
2795 calculated by the recursive call). Otherwise it is the number of
2796 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
2797 VF divided by the number of elements in a vector. */
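
/* For example, for a group of 4 scalar stmts, a vectorization factor
   of 4 and a vector type holding 8 elements, 4 * 4 / 8 == 2 vector
   statements are created for the node.  */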
2798 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2799 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
2800 {
2801 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
2802 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
2803 {
2804 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2805 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
2806 break;
2807 }
2808 }
2809 else
2810 {
2811 poly_uint64 vf;
2812 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2813 vf = loop_vinfo->vectorization_factor;
2814 else
2815 vf = 1;
2816 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (node_instance);
2817 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2818 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
2819 = vect_get_num_vectors (vf * group_size, vectype);
2820 }
2821
2822 bool dummy;
2823 return vect_analyze_stmt (stmt_info, &dummy, node, node_instance, cost_vec);
2824 }
2825
2826 /* Try to build NODE from scalars, returning true on success.
2827 NODE_INSTANCE is the SLP instance that contains NODE. */
2828
2829 static bool
2830 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
2831 slp_instance node_instance)
2832 {
2833 stmt_vec_info stmt_info;
2834 unsigned int i;
2835
2836 if (!is_a <bb_vec_info> (vinfo)
2837 || node == SLP_INSTANCE_TREE (node_instance)
2838 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
2839 return false;
2840
2841 if (dump_enabled_p ())
2842 dump_printf_loc (MSG_NOTE, vect_location,
2843 "Building vector operands from scalars instead\n");
2844
2845 /* Don't remove and free the child nodes here, since they could be
2846 referenced by other structures. The analysis and scheduling phases
2847 (need to) ignore child nodes of anything that isn't vect_internal_def. */
2848 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
2849 SLP_TREE_DEF_TYPE (node) = vect_external_def;
2850 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size);
2851 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2852 {
2853 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
2854 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
2855 }
2856 return true;
2857 }
2858
2859 /* Analyze statements contained in SLP tree NODE after recursively analyzing
2860 the subtree. NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
2861
2862 Return true if the operations are supported. */
2863
2864 static bool
2865 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
2866 slp_instance node_instance,
2867 hash_set<slp_tree> &visited,
2868 hash_set<slp_tree> &lvisited,
2869 stmt_vector_for_cost *cost_vec)
2870 {
2871 int i, j;
2872 slp_tree child;
2873
2874 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2875 return true;
2876
2877 /* If we already analyzed the exact same set of scalar stmts we're done.
2878 We share the generated vector stmts for those.
2879 The SLP graph is acyclic so not caching whether we failed or succeeded
2880 doesn't result in any issue since we throw away the lvisited set
2881 when we fail. */
2882 if (visited.contains (node)
2883 || lvisited.add (node))
2884 return true;
2885
2886 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2887 if (!vect_slp_analyze_node_operations (vinfo, child, node_instance,
2888 visited, lvisited, cost_vec))
2889 return false;
2890
2891 /* ??? We have to catch the case late where two first scalar stmts appear
2892 in multiple SLP children with different def type and fail. Remember
2893 original def types first since SLP_TREE_DEF_TYPE doesn't necessarily
2894 match it when that is vect_internal_def. */
2895 auto_vec<vect_def_type, 4> dt;
2896 dt.safe_grow (SLP_TREE_CHILDREN (node).length ());
2897 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2898 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2899 dt[j] = STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]);
2900
2901 /* Push SLP node def-type to stmt operands. */
2902 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2903 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def
2904 && SLP_TREE_SCALAR_STMTS (child).length () != 0)
2905 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2906 = SLP_TREE_DEF_TYPE (child);
2907
2908 /* Check everything worked out. */
2909 bool res = true;
2910 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2911 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2912 {
2913 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
2914 {
2915 if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2916 != SLP_TREE_DEF_TYPE (child))
2917 res = false;
2918 }
2919 else if (STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0])
2920 != dt[j])
2921 res = false;
2922 }
2923 if (!res && dump_enabled_p ())
2924 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2925 "not vectorized: same operand with different "
2926 "def type in stmt.\n");
2927
2928 if (res)
2929 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
2930 cost_vec);
2931
2932 /* Restore def-types. */
2933 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
2934 if (SLP_TREE_SCALAR_STMTS (child).length () != 0)
2935 STMT_VINFO_DEF_TYPE (SLP_TREE_SCALAR_STMTS (child)[0]) = dt[j];
2936
2937 /* If this node can't be vectorized, try pruning the tree here rather
2938 than felling the whole thing. */
2939 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
2940 res = true;
2941
2942 return res;
2943 }
2944
2945
2946 /* Analyze statements in SLP instances of VINFO. Return true if the
2947 operations are supported. */
2948
2949 bool
2950 vect_slp_analyze_operations (vec_info *vinfo)
2951 {
2952 slp_instance instance;
2953 int i;
2954
2955 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
2956
2957 hash_set<slp_tree> visited;
2958 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
2959 {
2960 hash_set<slp_tree> lvisited;
2961 stmt_vector_for_cost cost_vec;
2962 cost_vec.create (2);
2963 if (!vect_slp_analyze_node_operations (vinfo,
2964 SLP_INSTANCE_TREE (instance),
2965 instance, visited, lvisited,
2966 &cost_vec)
2967 /* Instances with a root stmt require vectorized defs for the
2968 SLP tree root. */
2969 || (SLP_INSTANCE_ROOT_STMT (instance)
2970 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
2971 != vect_internal_def)))
2972 {
2973 slp_tree node = SLP_INSTANCE_TREE (instance);
2974 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2975 if (dump_enabled_p ())
2976 dump_printf_loc (MSG_NOTE, vect_location,
2977 "removing SLP instance operations starting from: %G",
2978 stmt_info->stmt);
2979 vect_free_slp_instance (instance, false);
2980 vinfo->slp_instances.ordered_remove (i);
2981 cost_vec.release ();
2982 }
2983 else
2984 {
2985 for (hash_set<slp_tree>::iterator x = lvisited.begin();
2986 x != lvisited.end(); ++x)
2987 visited.add (*x);
2988 i++;
2989
2990 add_stmt_costs (vinfo->target_cost_data, &cost_vec);
2991 cost_vec.release ();
2992 }
2993 }
2994
2995 return !vinfo->slp_instances.is_empty ();
2996 }
2997
2998
2999 /* Compute the scalar cost of the SLP node NODE and its children
3000 and record it in COST_VEC. Do not account defs that are marked in LIFE and
3001 update LIFE according to uses of NODE. */
3002
3003 static void
3004 vect_bb_slp_scalar_cost (basic_block bb,
3005 slp_tree node, vec<bool, va_heap> *life,
3006 stmt_vector_for_cost *cost_vec,
3007 hash_set<slp_tree> &visited)
3008 {
3009 unsigned i;
3010 stmt_vec_info stmt_info;
3011 slp_tree child;
3012
3013 if (visited.add (node))
3014 return;
3015
3016 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3017 {
3018 gimple *stmt = stmt_info->stmt;
3019 vec_info *vinfo = stmt_info->vinfo;
3020 ssa_op_iter op_iter;
3021 def_operand_p def_p;
3022
3023 if ((*life)[i])
3024 continue;
3025
3026 /* If there is a non-vectorized use of the defs then the scalar
3027 stmt is kept live in which case we do not account it or any
3028 required defs in the SLP children in the scalar cost. This
3029 way we make the vectorization more costly when compared to
3030 the scalar cost. */
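/* E.g. if the value computed by a pure-SLP stmt is also needed by a stmt
   outside any SLP instance (or outside the region), the scalar stmt must
   be kept regardless, so its cost is not credited to the scalar side and
   vectorization looks comparatively more expensive.  */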
3031 FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
3032 {
3033 imm_use_iterator use_iter;
3034 gimple *use_stmt;
3035 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
3036 if (!is_gimple_debug (use_stmt))
3037 {
3038 stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
3039 if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
3040 {
3041 (*life)[i] = true;
3042 BREAK_FROM_IMM_USE_STMT (use_iter);
3043 }
3044 }
3045 }
3046 if ((*life)[i])
3047 continue;
3048
3049 /* Count scalar stmts only once. */
3050 if (gimple_visited_p (stmt))
3051 continue;
3052 gimple_set_visited (stmt, true);
3053
3054 vect_cost_for_stmt kind;
3055 if (STMT_VINFO_DATA_REF (stmt_info))
3056 {
3057 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3058 kind = scalar_load;
3059 else
3060 kind = scalar_store;
3061 }
3062 else if (vect_nop_conversion_p (stmt_info))
3063 continue;
3064 else
3065 kind = scalar_stmt;
3066 record_stmt_cost (cost_vec, 1, kind, stmt_info, 0, vect_body);
3067 }
3068
3069 auto_vec<bool, 20> subtree_life;
3070 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3071 {
3072 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3073 {
3074 /* Do not directly pass LIFE to the recursive call, copy it to
3075 confine changes in the callee to the current child/subtree. */
3076 subtree_life.safe_splice (*life);
3077 vect_bb_slp_scalar_cost (bb, child, &subtree_life, cost_vec,
3078 visited);
3079 subtree_life.truncate (0);
3080 }
3081 }
3082 }
3083
3084 /* Check if vectorization of the basic block is profitable. */
3085
3086 static bool
3087 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
3088 {
3089 vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
3090 slp_instance instance;
3091 int i;
3092 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
3093 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
3094
3095 /* Calculate scalar cost. */
3096 stmt_vector_for_cost scalar_costs;
3097 scalar_costs.create (0);
3098 hash_set<slp_tree> visited;
3099 FOR_EACH_VEC_ELT (slp_instances, i, instance)
3100 {
3101 auto_vec<bool, 20> life;
3102 life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
3103 vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
3104 SLP_INSTANCE_TREE (instance),
3105 &life, &scalar_costs, visited);
3106 }
3107 void *target_cost_data = init_cost (NULL);
3108 add_stmt_costs (target_cost_data, &scalar_costs);
3109 scalar_costs.release ();
3110 unsigned dummy;
3111 finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
3112 destroy_cost_data (target_cost_data);
3113
3114 /* Unset visited flag. */
3115 for (gimple_stmt_iterator gsi = bb_vinfo->region_begin;
3116 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3117 gimple_set_visited (gsi_stmt (gsi), false);
3118
3119 /* Complete the target-specific cost calculation. */
3120 finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
3121 &vec_inside_cost, &vec_epilogue_cost);
3122
3123 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
3124
3125 if (dump_enabled_p ())
3126 {
3127 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3128 dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
3129 vec_inside_cost);
3130 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
3131 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
3132 dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
3133 }
3134
3135 /* Vectorization is profitable if its cost is no greater than the cost of the
3136 scalar version. Note that we err on the vector side for equal cost because
3137 the cost estimate is otherwise quite pessimistic (constant uses are
3138 free on the scalar side but cost a load on the vector side for
3139 example). */
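/* For instance, a vector inside cost of 4 with prologue cost 1 and
   epilogue cost 0 against a scalar cost of 6 passes (5 <= 6), whereas a
   scalar cost of 4 would fail the check below (5 > 4).  */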
3140 if (vec_outside_cost + vec_inside_cost > scalar_cost)
3141 return false;
3142
3143 return true;
3144 }
3145
3146 /* Find any vectorizable constructors and add them to the grouped_store
3147 array. */
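
/* A hypothetical candidate (SSA names purely illustrative):
     v_4 = {_1, _2, _3, _5};
   where the CONSTRUCTOR has exactly as many scalar SSA-name elements as
   the vector type has lanes and is not a uniform vector; such roots are
   queued like grouped stores so vect_analyze_slp can build an SLP tree
   from the element definitions.  */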
3148
3149 static void
3150 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
3151 {
3152 gimple_stmt_iterator gsi;
3153
3154 for (gsi = bb_vinfo->region_begin;
3155 gsi_stmt (gsi) != gsi_stmt (bb_vinfo->region_end); gsi_next (&gsi))
3156 {
3157 gassign *stmt = dyn_cast <gassign *> (gsi_stmt (gsi));
3158 if (!stmt || gimple_assign_rhs_code (stmt) != CONSTRUCTOR)
3159 continue;
3160
3161 tree rhs = gimple_assign_rhs1 (stmt);
3162 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
3163 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
3164 CONSTRUCTOR_NELTS (rhs))
3165 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
3166 || uniform_vector_p (rhs))
3167 continue;
3168
3169 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (stmt);
3170 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3171 }
3172 }
3173
3174 /* Check if the region described by BB_VINFO can be vectorized, returning
3175 true if so. When returning false, set FATAL to true if the same failure
3176 would prevent vectorization at other vector sizes, false if it is still
3177 worth trying other sizes. N_STMTS is the number of statements in the
3178 region. */
3179
3180 static bool
3181 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
3182 {
3183 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
3184
3185 slp_instance instance;
3186 int i;
3187 poly_uint64 min_vf = 2;
3188
3189 /* The first group of checks is independent of the vector size. */
3190 fatal = true;
3191
3192 /* Analyze the data references. */
3193
3194 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
3195 {
3196 if (dump_enabled_p ())
3197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3198 "not vectorized: unhandled data-ref in basic "
3199 "block.\n");
3200 return false;
3201 }
3202
3203 if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
3204 {
3205 if (dump_enabled_p ())
3206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3207 "not vectorized: not enough data-refs in "
3208 "basic block.\n");
3209 return false;
3210 }
3211
3212 if (!vect_analyze_data_ref_accesses (bb_vinfo))
3213 {
3214 if (dump_enabled_p ())
3215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3216 "not vectorized: unhandled data access in "
3217 "basic block.\n");
3218 return false;
3219 }
3220
3221 vect_slp_check_for_constructors (bb_vinfo);
3222
3223 /* If there are no grouped stores in the region there is no need
3224 to continue with pattern recog as vect_analyze_slp will fail
3225 anyway. */
3226 if (bb_vinfo->grouped_stores.is_empty ())
3227 {
3228 if (dump_enabled_p ())
3229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3230 "not vectorized: no grouped stores in "
3231 "basic block.\n");
3232 return false;
3233 }
3234
3235 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not fatal. */
3236 fatal = false;
3237
3238 vect_pattern_recog (bb_vinfo);
3239
3240 /* Check the SLP opportunities in the basic block, analyze and build SLP
3241 trees. */
3242 if (!vect_analyze_slp (bb_vinfo, n_stmts))
3243 {
3244 if (dump_enabled_p ())
3245 {
3246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3247 "Failed to SLP the basic block.\n");
3248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3249 "not vectorized: failed to find SLP opportunities "
3250 "in basic block.\n");
3251 }
3252 return false;
3253 }
3254
3255 vect_record_base_alignments (bb_vinfo);
3256
3257 /* Analyze and verify the alignment of data references and the
3258 dependence in the SLP instances. */
3259 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
3260 {
3261 if (! vect_slp_analyze_and_verify_instance_alignment (instance)
3262 || ! vect_slp_analyze_instance_dependence (instance))
3263 {
3264 slp_tree node = SLP_INSTANCE_TREE (instance);
3265 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3266 if (dump_enabled_p ())
3267 dump_printf_loc (MSG_NOTE, vect_location,
3268 "removing SLP instance operations starting from: %G",
3269 stmt_info->stmt);
3270 vect_free_slp_instance (instance, false);
3271 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
3272 continue;
3273 }
3274
3275 /* Mark all the statements that we want to vectorize as pure SLP and
3276 relevant. */
3277 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
3278 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
3279 if (SLP_INSTANCE_ROOT_STMT (instance))
3280 STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
3281
3282 i++;
3283 }
3284 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
3285 return false;
3286
3287 if (!vect_slp_analyze_operations (bb_vinfo))
3288 {
3289 if (dump_enabled_p ())
3290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3291 "not vectorized: bad operation in basic block.\n");
3292 return false;
3293 }
3294
3295 /* Cost model: check if the vectorization is worthwhile. */
3296 if (!unlimited_cost_model (NULL)
3297 && !vect_bb_vectorization_profitable_p (bb_vinfo))
3298 {
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3301 "not vectorized: vectorization is not "
3302 "profitable.\n");
3303 return false;
3304 }
3305
3306 if (dump_enabled_p ())
3307 dump_printf_loc (MSG_NOTE, vect_location,
3308 "Basic block will be vectorized using SLP\n");
3309 return true;
3310 }
3311
3312 /* Subroutine of vect_slp_bb. Try to vectorize the statements between
3313 REGION_BEGIN (inclusive) and REGION_END (exclusive), returning true
3314 on success. The region has N_STMTS statements and has the datarefs
3315 given by DATAREFS. */
3316
3317 static bool
3318 vect_slp_bb_region (gimple_stmt_iterator region_begin,
3319 gimple_stmt_iterator region_end,
3320 vec<data_reference_p> datarefs,
3321 unsigned int n_stmts)
3322 {
3323 bb_vec_info bb_vinfo;
3324 auto_vector_modes vector_modes;
3325
3326 /* Autodetect first vector size we try. */
3327 machine_mode next_vector_mode = VOIDmode;
3328 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
3329 unsigned int mode_i = 0;
3330
3331 vec_info_shared shared;
3332
3333 machine_mode autodetected_vector_mode = VOIDmode;
3334 while (1)
3335 {
3336 bool vectorized = false;
3337 bool fatal = false;
3338 bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
3339
3340 bool first_time_p = shared.datarefs.is_empty ();
3341 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
3342 if (first_time_p)
3343 bb_vinfo->shared->save_datarefs ();
3344 else
3345 bb_vinfo->shared->check_datarefs ();
3346 bb_vinfo->vector_mode = next_vector_mode;
3347
3348 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
3349 && dbg_cnt (vect_slp))
3350 {
3351 if (dump_enabled_p ())
3352 {
3353 dump_printf_loc (MSG_NOTE, vect_location,
3354 "***** Analysis succeeded with vector mode"
3355 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
3356 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
3357 }
3358
3359 bb_vinfo->shared->check_datarefs ();
3360 vect_schedule_slp (bb_vinfo);
3361
3362 unsigned HOST_WIDE_INT bytes;
3363 if (dump_enabled_p ())
3364 {
3365 if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
3366 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3367 "basic block part vectorized using %wu byte "
3368 "vectors\n", bytes);
3369 else
3370 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3371 "basic block part vectorized using variable "
3372 "length vectors\n");
3373 }
3374
3375 vectorized = true;
3376 }
3377 else
3378 {
3379 if (dump_enabled_p ())
3380 dump_printf_loc (MSG_NOTE, vect_location,
3381 "***** Analysis failed with vector mode %s\n",
3382 GET_MODE_NAME (bb_vinfo->vector_mode));
3383 }
3384
3385 if (mode_i == 0)
3386 autodetected_vector_mode = bb_vinfo->vector_mode;
3387
3388 if (!fatal)
3389 while (mode_i < vector_modes.length ()
3390 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
3391 {
3392 if (dump_enabled_p ())
3393 dump_printf_loc (MSG_NOTE, vect_location,
3394 "***** The result for vector mode %s would"
3395 " be the same\n",
3396 GET_MODE_NAME (vector_modes[mode_i]));
3397 mode_i += 1;
3398 }
3399
3400 delete bb_vinfo;
3401
3402 if (mode_i < vector_modes.length ()
3403 && VECTOR_MODE_P (autodetected_vector_mode)
3404 && (related_vector_mode (vector_modes[mode_i],
3405 GET_MODE_INNER (autodetected_vector_mode))
3406 == autodetected_vector_mode)
3407 && (related_vector_mode (autodetected_vector_mode,
3408 GET_MODE_INNER (vector_modes[mode_i]))
3409 == vector_modes[mode_i]))
3410 {
3411 if (dump_enabled_p ())
3412 dump_printf_loc (MSG_NOTE, vect_location,
3413 "***** Skipping vector mode %s, which would"
3414 " repeat the analysis for %s\n",
3415 GET_MODE_NAME (vector_modes[mode_i]),
3416 GET_MODE_NAME (autodetected_vector_mode));
3417 mode_i += 1;
3418 }
3419
3420 if (vectorized
3421 || mode_i == vector_modes.length ()
3422 || autodetected_vector_mode == VOIDmode
3423 /* If vect_slp_analyze_bb_1 signaled that analysis for all
3424 vector sizes will fail do not bother iterating. */
3425 || fatal)
3426 return vectorized;
3427
3428 /* Try the next biggest vector size. */
3429 next_vector_mode = vector_modes[mode_i++];
3430 if (dump_enabled_p ())
3431 dump_printf_loc (MSG_NOTE, vect_location,
3432 "***** Re-trying analysis with vector mode %s\n",
3433 GET_MODE_NAME (next_vector_mode));
3434 }
3435 }
3436
3437 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
3438 true if anything in the basic-block was vectorized. */
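/* For illustration (a sketch, not a particular testcase): the walk below
   ends the current region at any statement whose data references cannot be
   analyzed, e.g. a volatile access; it then tries to vectorize the
   statements gathered so far as one region, skips the offending statement
   and opens a new region after it.  */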
3439
3440 bool
3441 vect_slp_bb (basic_block bb)
3442 {
3443 gimple_stmt_iterator gsi;
3444 bool any_vectorized = false;
3445
3446 gsi = gsi_start_bb (bb);
3447 while (!gsi_end_p (gsi))
3448 {
3449 gimple_stmt_iterator region_begin = gsi;
3450 vec<data_reference_p> datarefs = vNULL;
3451 int insns = 0;
3452
3453 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3454 {
3455 gimple *stmt = gsi_stmt (gsi);
3456 if (is_gimple_debug (stmt))
3457 continue;
3458 insns++;
3459
3460 if (gimple_location (stmt) != UNKNOWN_LOCATION)
3461 vect_location = stmt;
3462
3463 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
3464 break;
3465 }
3466
3467 /* Skip leading unhandled stmts. */
3468 if (gsi_stmt (region_begin) == gsi_stmt (gsi))
3469 {
3470 gsi_next (&gsi);
3471 continue;
3472 }
3473
3474 gimple_stmt_iterator region_end = gsi;
3475
3476 if (insns > param_slp_max_insns_in_bb)
3477 {
3478 if (dump_enabled_p ())
3479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3480 "not vectorized: too many instructions in "
3481 "basic block.\n");
3482 }
3483 else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
3484 any_vectorized = true;
3485
3486 if (gsi_end_p (region_end))
3487 break;
3488
3489 /* Skip the unhandled stmt. */
3490 gsi_next (&gsi);
3491 }
3492
3493 return any_vectorized;
3494 }
3495
3496
3497 /* Return true if constant operand OP_NUM of STMT_VINFO requires a boolean vector type.  */
3498
3499 static bool
3500 vect_mask_constant_operand_p (stmt_vec_info stmt_vinfo, unsigned op_num)
3501 {
3502 enum tree_code code = gimple_expr_code (stmt_vinfo->stmt);
3503 tree op, vectype;
3504 enum vect_def_type dt;
3505
3506 /* For comparisons and COND_EXPRs the type is chosen based on
3507 the other, non-constant comparison operand.  */
3508 if (TREE_CODE_CLASS (code) == tcc_comparison)
3509 {
3510 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3511 op = gimple_assign_rhs1 (stmt);
3512
3513 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3514 gcc_unreachable ();
3515
3516 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3517 }
3518
3519 if (code == COND_EXPR)
3520 {
3521 gassign *stmt = as_a <gassign *> (stmt_vinfo->stmt);
3522 tree cond = gimple_assign_rhs1 (stmt);
3523
3524 if (TREE_CODE (cond) == SSA_NAME)
3525 {
3526 if (op_num > 0)
3527 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3528 op = cond;
3529 }
3530 else
3531 {
3532 if (op_num > 1)
3533 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3534 op = TREE_OPERAND (cond, 0);
3535 }
3536
3537 if (!vect_is_simple_use (op, stmt_vinfo->vinfo, &dt, &vectype))
3538 gcc_unreachable ();
3539
3540 return !vectype || VECTOR_BOOLEAN_TYPE_P (vectype);
3541 }
3542
3543 return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
3544 }
3545
3546 /* Build a variable-length vector in which the elements in ELTS are repeated
3547 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
3548 RESULTS and add any new instructions to SEQ.
3549
3550 The approach we use is:
3551
3552 (1) Find a vector mode VM with integer elements of mode IM.
3553
3554 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3555 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
3556 from small vectors to IM.
3557
3558 (3) Duplicate each ELTS'[I] into a vector of mode VM.
3559
3560 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
3561 correct byte contents.
3562
3563 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
3564
3565 We try to find the largest IM for which this sequence works, in order
3566 to cut down on the number of interleaves. */
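/* A concrete sketch (the modes shown are hypothetical; the actual choice is
   whatever can_duplicate_and_interleave_p finds for the target): with
   ELTS = { a, b, c, d } of SImode and a variable-length VECTOR_TYPE, step
   (1) might pick IM = DImode, so NELTS' = 2.  Step (2) builds
   ab = VIEW_CONVERT <DI> ({a, b}) and cd = VIEW_CONVERT <DI> ({c, d}),
   step (3) splats them into { ab, ab, ... } and { cd, cd, ... }, step (4)
   interleaves those into { ab, cd, ab, cd, ... } and step (5) view-converts
   the result back, giving the repeating pattern { a, b, c, d, a, b, ... }.  */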
3567
3568 void
3569 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
3570 vec<tree> elts, unsigned int nresults,
3571 vec<tree> &results)
3572 {
3573 unsigned int nelts = elts.length ();
3574 tree element_type = TREE_TYPE (vector_type);
3575
3576 /* (1) Find a vector mode VM with integer elements of mode IM. */
3577 unsigned int nvectors = 1;
3578 tree new_vector_type;
3579 tree permutes[2];
3580 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
3581 &nvectors, &new_vector_type,
3582 permutes))
3583 gcc_unreachable ();
3584
3585 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
3586 unsigned int partial_nelts = nelts / nvectors;
3587 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
3588
3589 tree_vector_builder partial_elts;
3590 auto_vec<tree, 32> pieces (nvectors * 2);
3591 pieces.quick_grow (nvectors * 2);
3592 for (unsigned int i = 0; i < nvectors; ++i)
3593 {
3594 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
3595 ELTS' has mode IM. */
3596 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
3597 for (unsigned int j = 0; j < partial_nelts; ++j)
3598 partial_elts.quick_push (elts[i * partial_nelts + j]);
3599 tree t = gimple_build_vector (seq, &partial_elts);
3600 t = gimple_build (seq, VIEW_CONVERT_EXPR,
3601 TREE_TYPE (new_vector_type), t);
3602
3603 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
3604 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
3605 }
3606
3607 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
3608 correct byte contents.
3609
3610 We need to repeat the following operation log2(nvectors) times:
3611
3612 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
3613 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
3614
3615 However, if each input repeats every N elements and the VF is
3616 a multiple of N * 2, the HI result is the same as the LO. */
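  /* Illustration (a sketch only): for NVECTORS == 4 with splat inputs of
     e0..e3, the first pass produces { e0, e2, e0, e2, ... } and
     { e1, e3, e1, e3, ... } (their HI counterparts just duplicate them when
     the vector length allows), and the second pass interleaves those two
     into { e0, e1, e2, e3, e0, ... }.  */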
3617 unsigned int in_start = 0;
3618 unsigned int out_start = nvectors;
3619 unsigned int hi_start = nvectors / 2;
3620 /* A bound on the number of outputs needed to produce NRESULTS results
3621 in the final iteration. */
3622 unsigned int noutputs_bound = nvectors * nresults;
3623 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
3624 {
3625 noutputs_bound /= 2;
3626 unsigned int limit = MIN (noutputs_bound, nvectors);
3627 for (unsigned int i = 0; i < limit; ++i)
3628 {
3629 if ((i & 1) != 0
3630 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
3631 2 * in_repeat))
3632 {
3633 pieces[out_start + i] = pieces[out_start + i - 1];
3634 continue;
3635 }
3636
3637 tree output = make_ssa_name (new_vector_type);
3638 tree input1 = pieces[in_start + (i / 2)];
3639 tree input2 = pieces[in_start + (i / 2) + hi_start];
3640 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
3641 input1, input2,
3642 permutes[i & 1]);
3643 gimple_seq_add_stmt (seq, stmt);
3644 pieces[out_start + i] = output;
3645 }
3646 std::swap (in_start, out_start);
3647 }
3648
3649 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
3650 results.reserve (nresults);
3651 for (unsigned int i = 0; i < nresults; ++i)
3652 if (i < nvectors)
3653 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
3654 pieces[in_start + i]));
3655 else
3656 results.quick_push (results[i - nvectors]);
3657 }
3658
3659
3660 /* For constant and loop invariant defs of SLP_NODE this function returns
3661 (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
3662 OP_NUM selects the child node of SLP_NODE that holds the scalar
3663 operands.  */
3664
3665 static void
3666 vect_get_constant_vectors (slp_tree slp_node, unsigned op_num,
3667 vec<tree> *vec_oprnds)
3668 {
3669 slp_tree op_node = SLP_TREE_CHILDREN (slp_node)[op_num];
3670 stmt_vec_info stmt_vinfo = SLP_TREE_SCALAR_STMTS (slp_node)[0];
3671 vec_info *vinfo = stmt_vinfo->vinfo;
3672 unsigned HOST_WIDE_INT nunits;
3673 tree vec_cst;
3674 unsigned j, number_of_places_left_in_vector;
3675 tree vector_type;
3676 tree vop;
3677 int group_size = op_node->ops.length ();
3678 unsigned int vec_num, i;
3679 unsigned number_of_copies = 1;
3680 bool constant_p;
3681 tree neutral_op = NULL;
3682 gimple_seq ctor_seq = NULL;
3683 auto_vec<tree, 16> permute_results;
3684
3685 /* ??? SLP analysis should compute the vector type for the
3686 constant / invariant and store it in the SLP node. */
3687 tree op = op_node->ops[0];
3688 /* Check if vector type is a boolean vector. */
3689 tree stmt_vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3690 if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
3691 && vect_mask_constant_operand_p (stmt_vinfo, op_num))
3692 vector_type = truth_type_for (stmt_vectype);
3693 else
3694 vector_type = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op), op_node);
3695
3696 /* ??? For lane-reducing ops we should also have the required number
3697 of vector stmts initialized rather than second-guessing here. */
3698 unsigned int number_of_vectors;
3699 if (is_gimple_assign (stmt_vinfo->stmt)
3700 && (gimple_assign_rhs_code (stmt_vinfo->stmt) == SAD_EXPR
3701 || gimple_assign_rhs_code (stmt_vinfo->stmt) == DOT_PROD_EXPR
3702 || gimple_assign_rhs_code (stmt_vinfo->stmt) == WIDEN_SUM_EXPR))
3703 number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
3704 else
3705 number_of_vectors
3706 = vect_get_num_vectors (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
3707 * TYPE_VECTOR_SUBPARTS (stmt_vectype),
3708 vector_type);
3709 vec_oprnds->create (number_of_vectors);
3710 auto_vec<tree> voprnds (number_of_vectors);
3711
3712 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3713 created vectors. It is greater than 1 if unrolling is performed.
3714
3715 For example, we have two scalar operands, s1 and s2 (e.g., group of
3716 strided accesses of size two), while NUNITS is four (i.e., four scalars
3717 of this type can be packed in a vector). The output vector will contain
3718 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3719 will be 2).
3720
3721 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
3722 containing the operands.
3723
3724 For example, NUNITS is four as before, and the group size is 8
3725 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3726 {s5, s6, s7, s8}. */
3727
3728 /* When using duplicate_and_interleave, we just need one element for
3729 each scalar statement. */
3730 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3731 nunits = group_size;
3732
3733 number_of_copies = nunits * number_of_vectors / group_size;
3734
3735 number_of_places_left_in_vector = nunits;
3736 constant_p = true;
3737 tree_vector_builder elts (vector_type, nunits, 1);
3738 elts.quick_grow (nunits);
3739 bool place_after_defs = false;
3740 for (j = 0; j < number_of_copies; j++)
3741 {
3742 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
3743 {
3744 /* Create 'vect_ = {op0,op1,...,opn}'. */
3745 number_of_places_left_in_vector--;
3746 tree orig_op = op;
3747 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
3748 {
3749 if (CONSTANT_CLASS_P (op))
3750 {
3751 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3752 {
3753 /* Can't use VIEW_CONVERT_EXPR for booleans because
3754 of possibly different sizes of scalar value and
3755 vector element. */
3756 if (integer_zerop (op))
3757 op = build_int_cst (TREE_TYPE (vector_type), 0);
3758 else if (integer_onep (op))
3759 op = build_all_ones_cst (TREE_TYPE (vector_type));
3760 else
3761 gcc_unreachable ();
3762 }
3763 else
3764 op = fold_unary (VIEW_CONVERT_EXPR,
3765 TREE_TYPE (vector_type), op);
3766 gcc_assert (op && CONSTANT_CLASS_P (op));
3767 }
3768 else
3769 {
3770 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
3771 gimple *init_stmt;
3772 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
3773 {
3774 tree true_val
3775 = build_all_ones_cst (TREE_TYPE (vector_type));
3776 tree false_val
3777 = build_zero_cst (TREE_TYPE (vector_type));
3778 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
3779 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
3780 op, true_val,
3781 false_val);
3782 }
3783 else
3784 {
3785 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
3786 op);
3787 init_stmt
3788 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
3789 op);
3790 }
3791 gimple_seq_add_stmt (&ctor_seq, init_stmt);
3792 op = new_temp;
3793 }
3794 }
3795 elts[number_of_places_left_in_vector] = op;
3796 if (!CONSTANT_CLASS_P (op))
3797 constant_p = false;
3798 if (TREE_CODE (orig_op) == SSA_NAME
3799 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
3800 && STMT_VINFO_BB_VINFO (stmt_vinfo)
3801 && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
3802 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
3803 place_after_defs = true;
3804
3805 if (number_of_places_left_in_vector == 0)
3806 {
3807 if (constant_p
3808 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3809 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3810 vec_cst = gimple_build_vector (&ctor_seq, &elts);
3811 else
3812 {
3813 if (permute_results.is_empty ())
3814 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
3815 elts, number_of_vectors,
3816 permute_results);
3817 vec_cst = permute_results[number_of_vectors - j - 1];
3818 }
3819 tree init;
3820 gimple_stmt_iterator gsi;
3821 if (place_after_defs)
3822 {
3823 stmt_vec_info last_stmt_info
3824 = vect_find_last_scalar_stmt_in_slp (slp_node);
3825 gsi = gsi_for_stmt (last_stmt_info->stmt);
3826 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3827 &gsi);
3828 }
3829 else
3830 init = vect_init_vector (stmt_vinfo, vec_cst, vector_type,
3831 NULL);
3832 if (ctor_seq != NULL)
3833 {
3834 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
3835 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
3836 ctor_seq = NULL;
3837 }
3838 voprnds.quick_push (init);
3839 place_after_defs = false;
3840 number_of_places_left_in_vector = nunits;
3841 constant_p = true;
3842 elts.new_vector (vector_type, nunits, 1);
3843 elts.quick_grow (nunits);
3844 }
3845 }
3846 }
3847
3848 /* Since the vectors are created in the reverse order, we should invert
3849 them. */
3850 vec_num = voprnds.length ();
3851 for (j = vec_num; j != 0; j--)
3852 {
3853 vop = voprnds[j - 1];
3854 vec_oprnds->quick_push (vop);
3855 }
3856
3857 /* If the vectorization factor is greater than the unrolling factor needed
3858 for the SLP group of stmts, the NUMBER_OF_VECTORS to be created exceeds
3859 the number of vector operands built above, and hence we have to
3860 replicate the vectors.  */
3861 while (number_of_vectors > vec_oprnds->length ())
3862 {
3863 tree neutral_vec = NULL;
3864
3865 if (neutral_op)
3866 {
3867 if (!neutral_vec)
3868 neutral_vec = build_vector_from_val (vector_type, neutral_op);
3869
3870 vec_oprnds->quick_push (neutral_vec);
3871 }
3872 else
3873 {
3874 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
3875 vec_oprnds->quick_push (vop);
3876 }
3877 }
3878 }
3879
3880
3881 /* Get the vectorized definitions from SLP_NODE, which already contains the
3882 corresponding vectorized def-stmts.  */
3883
3884 static void
3885 vect_get_slp_vect_defs (slp_tree slp_node, vec<tree> *vec_oprnds)
3886 {
3887 stmt_vec_info vec_def_stmt_info;
3888 unsigned int i;
3889
3890 gcc_assert (SLP_TREE_VEC_STMTS (slp_node).exists ());
3891
3892 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt_info)
3893 vec_oprnds->quick_push (gimple_get_lhs (vec_def_stmt_info->stmt));
3894 }
3895
3896
3897 /* Get N vectorized definitions for SLP_NODE.
3898 If the scalar definitions are loop invariants or constants, collect them and
3899 call vect_get_constant_vectors() to create vector stmts.
3900 Otherwise, the def-stmts must be already vectorized and the vectorized stmts
3901 must be stored in the corresponding child of SLP_NODE, and we call
3902 vect_get_slp_vect_defs () to retrieve them. */
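/* E.g. (illustrative): for a node whose first child is an internal def and
   whose second child holds constants, VEC_OPRNDS receives one vector with
   the lhs's of the first child's vector stmts and one vector with constant
   vectors built by vect_get_constant_vectors.  */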
3903
3904 void
3905 vect_get_slp_defs (slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
3906 {
3907 if (n == -1U)
3908 n = SLP_TREE_CHILDREN (slp_node).length ();
3909
3910 for (unsigned i = 0; i < n; ++i)
3911 {
3912 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
3913
3914 vec<tree> vec_defs = vNULL;
3915
3916 /* For each operand we check if it has vectorized definitions in a child
3917 node or we need to create them (for invariants and constants). */
3918 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3919 {
3920 vec_defs.create (SLP_TREE_NUMBER_OF_VEC_STMTS (child));
3921 vect_get_slp_vect_defs (child, &vec_defs);
3922 }
3923 else
3924 vect_get_constant_vectors (slp_node, i, &vec_defs);
3925
3926 vec_oprnds->quick_push (vec_defs);
3927 }
3928 }
3929
3930 /* Generate vector permute statements from a list of loads in DR_CHAIN.
3931 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
3932 permute statements for the SLP node NODE of the SLP instance
3933 SLP_NODE_INSTANCE. */
3934
3935 bool
3936 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
3937 gimple_stmt_iterator *gsi, poly_uint64 vf,
3938 slp_instance slp_node_instance, bool analyze_only,
3939 unsigned *n_perms)
3940 {
3941 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3942 vec_info *vinfo = stmt_info->vinfo;
3943 int vec_index = 0;
3944 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3945 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
3946 unsigned int mask_element;
3947 machine_mode mode;
3948
3949 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
3950 return false;
3951
3952 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
3953
3954 mode = TYPE_MODE (vectype);
3955 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3956
3957 /* Initialize the vect stmts of NODE to properly insert the generated
3958 stmts later. */
3959 if (! analyze_only)
3960 for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
3961 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
3962 SLP_TREE_VEC_STMTS (node).quick_push (NULL);
3963
3964 /* Generate permutation masks for every NODE. Number of masks for each NODE
3965 is equal to GROUP_SIZE.
3966 E.g., we have a group of three nodes with three loads from the same
3967 location in each node, and the vector size is 4. I.e., we have an
3968 a0b0c0a1b1c1... sequence and we need to create the following vectors:
3969 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
3970 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
3971 ...
3972
3973 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
3974 The last mask is illegal since we assume two operands for permute
3975 operation, and the mask element values can't be outside that range.
3976 Hence, the last mask must be converted into {2,5,5,5}.
3977 For the first two permutations we need the first and the second input
3978 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
3979 we need the second and the third vectors: {b1,c1,a2,b2} and
3980 {c2,a3,b3,c3}. */
3981
3982 int vect_stmts_counter = 0;
3983 unsigned int index = 0;
3984 int first_vec_index = -1;
3985 int second_vec_index = -1;
3986 bool noop_p = true;
3987 *n_perms = 0;
3988
3989 vec_perm_builder mask;
3990 unsigned int nelts_to_build;
3991 unsigned int nvectors_per_build;
3992 bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
3993 && multiple_p (nunits, group_size));
3994 if (repeating_p)
3995 {
3996 /* A single vector contains a whole number of copies of the node, so:
3997 (a) all permutes can use the same mask; and
3998 (b) the permutes only need a single vector input. */
3999 mask.new_vector (nunits, group_size, 3);
4000 nelts_to_build = mask.encoded_nelts ();
4001 nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
4002 }
4003 else
4004 {
4005 /* We need to construct a separate mask for each vector statement. */
4006 unsigned HOST_WIDE_INT const_nunits, const_vf;
4007 if (!nunits.is_constant (&const_nunits)
4008 || !vf.is_constant (&const_vf))
4009 return false;
4010 mask.new_vector (const_nunits, const_nunits, 1);
4011 nelts_to_build = const_vf * group_size;
4012 nvectors_per_build = 1;
4013 }
4014
4015 unsigned int count = mask.encoded_nelts ();
4016 mask.quick_grow (count);
4017 vec_perm_indices indices;
4018
4019 for (unsigned int j = 0; j < nelts_to_build; j++)
4020 {
4021 unsigned int iter_num = j / group_size;
4022 unsigned int stmt_num = j % group_size;
4023 unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
4024 + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
4025 if (repeating_p)
4026 {
4027 first_vec_index = 0;
4028 mask_element = i;
4029 }
4030 else
4031 {
4032 /* Enforced before the loop when !repeating_p. */
4033 unsigned int const_nunits = nunits.to_constant ();
4034 vec_index = i / const_nunits;
4035 mask_element = i % const_nunits;
4036 if (vec_index == first_vec_index
4037 || first_vec_index == -1)
4038 {
4039 first_vec_index = vec_index;
4040 }
4041 else if (vec_index == second_vec_index
4042 || second_vec_index == -1)
4043 {
4044 second_vec_index = vec_index;
4045 mask_element += const_nunits;
4046 }
4047 else
4048 {
4049 if (dump_enabled_p ())
4050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4051 "permutation requires at "
4052 "least three vectors %G",
4053 stmt_info->stmt);
4054 gcc_assert (analyze_only);
4055 return false;
4056 }
4057
4058 gcc_assert (mask_element < 2 * const_nunits);
4059 }
4060
4061 if (mask_element != index)
4062 noop_p = false;
4063 mask[index++] = mask_element;
4064
4065 if (index == count && !noop_p)
4066 {
4067 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
4068 if (!can_vec_perm_const_p (mode, indices))
4069 {
4070 if (dump_enabled_p ())
4071 {
4072 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
4073 vect_location,
4074 "unsupported vect permute { ");
4075 for (i = 0; i < count; ++i)
4076 {
4077 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
4078 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
4079 }
4080 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
4081 }
4082 gcc_assert (analyze_only);
4083 return false;
4084 }
4085
4086 ++*n_perms;
4087 }
4088
4089 if (index == count)
4090 {
4091 if (!analyze_only)
4092 {
4093 tree mask_vec = NULL_TREE;
4094
4095 if (! noop_p)
4096 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
4097
4098 if (second_vec_index == -1)
4099 second_vec_index = first_vec_index;
4100
4101 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
4102 {
4103 /* Generate the permute statement if necessary. */
4104 tree first_vec = dr_chain[first_vec_index + ri];
4105 tree second_vec = dr_chain[second_vec_index + ri];
4106 stmt_vec_info perm_stmt_info;
4107 if (! noop_p)
4108 {
4109 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4110 tree perm_dest
4111 = vect_create_destination_var (gimple_assign_lhs (stmt),
4112 vectype);
4113 perm_dest = make_ssa_name (perm_dest);
4114 gassign *perm_stmt
4115 = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
4116 first_vec, second_vec,
4117 mask_vec);
4118 perm_stmt_info
4119 = vect_finish_stmt_generation (stmt_info, perm_stmt,
4120 gsi);
4121 }
4122 else
4123 /* If mask was NULL_TREE generate the requested
4124 identity transform. */
4125 perm_stmt_info = vinfo->lookup_def (first_vec);
4126
4127 /* Store the vector statement in NODE. */
4128 SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
4129 = perm_stmt_info;
4130 }
4131 }
4132
4133 index = 0;
4134 first_vec_index = -1;
4135 second_vec_index = -1;
4136 noop_p = true;
4137 }
4138 }
4139
4140 return true;
4141 }
4142
4143 /* Vectorize SLP instance tree in postorder. */
4144
4145 static void
4146 vect_schedule_slp_instance (slp_tree node, slp_instance instance)
4147 {
4148 gimple_stmt_iterator si;
4149 stmt_vec_info stmt_info;
4150 unsigned int group_size;
4151 tree vectype;
4152 int i, j;
4153 slp_tree child;
4154
4155 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4156 return;
4157
4158 /* See if we have already vectorized the node in the graph of the
4159 SLP instance. */
4160 if (SLP_TREE_VEC_STMTS (node).exists ())
4161 return;
4162
4163 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4164 vect_schedule_slp_instance (child, instance);
4165
4166 /* Push SLP node def-type to stmts. */
4167 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4168 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4169 {
4170 stmt_vec_info child_stmt_info;
4171 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4172 STMT_VINFO_DEF_TYPE (child_stmt_info) = SLP_TREE_DEF_TYPE (child);
4173 }
4174
4175 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4176
4177 /* VECTYPE is the type of the destination. */
4178 vectype = STMT_VINFO_VECTYPE (stmt_info);
4179 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
4180 group_size = SLP_INSTANCE_GROUP_SIZE (instance);
4181
4182 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
4183 SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4184
4185 if (dump_enabled_p ())
4186 dump_printf_loc (MSG_NOTE, vect_location,
4187 "------>vectorizing SLP node starting from: %G",
4188 stmt_info->stmt);
4189
4190 /* Vectorized stmts go before the last scalar stmt which is where
4191 all uses are ready. */
4192 stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
4193 si = gsi_for_stmt (last_stmt_info->stmt);
4194
4195 /* Handle two-operation SLP nodes by vectorizing the group with
4196 both operations and then performing a merge. */
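  /* A sketch (illustrative lane numbers, assuming a group of four ints in a
     single V4SI vector): for the scalar group
       a[0] = b[0] + c[0];  a[1] = b[1] - c[1];
       a[2] = b[2] + c[2];  a[3] = b[3] - c[3];
     the node is vectorized once with PLUS_EXPR and once with MINUS_EXPR and
     the two results are blended with a VEC_PERM_EXPR whose mask picks the
     matching lanes, here { 0, 5, 2, 7 }.  */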
4197 bool done_p = false;
4198 if (SLP_TREE_TWO_OPERATORS (node))
4199 {
4200 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
4201 enum tree_code code0 = gimple_assign_rhs_code (stmt);
4202 enum tree_code ocode = ERROR_MARK;
4203 stmt_vec_info ostmt_info;
4204 vec_perm_builder mask (group_size, group_size, 1);
4205 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
4206 {
4207 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
4208 if (gimple_assign_rhs_code (ostmt) != code0)
4209 {
4210 mask.quick_push (1);
4211 ocode = gimple_assign_rhs_code (ostmt);
4212 }
4213 else
4214 mask.quick_push (0);
4215 }
4216 if (ocode != ERROR_MARK)
4217 {
4218 vec<stmt_vec_info> v0;
4219 vec<stmt_vec_info> v1;
4220 unsigned j;
4221 tree tmask = NULL_TREE;
4222 vect_transform_stmt (stmt_info, &si, node, instance);
4223 v0 = SLP_TREE_VEC_STMTS (node).copy ();
4224 SLP_TREE_VEC_STMTS (node).truncate (0);
4225 gimple_assign_set_rhs_code (stmt, ocode);
4226 vect_transform_stmt (stmt_info, &si, node, instance);
4227 gimple_assign_set_rhs_code (stmt, code0);
4228 v1 = SLP_TREE_VEC_STMTS (node).copy ();
4229 SLP_TREE_VEC_STMTS (node).truncate (0);
4230 tree meltype = build_nonstandard_integer_type
4231 (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
4232 tree mvectype = get_same_sized_vectype (meltype, vectype);
4233 unsigned k = 0, l;
4234 for (j = 0; j < v0.length (); ++j)
4235 {
4236 /* Enforced by vect_build_slp_tree, which rejects variable-length
4237 vectors for SLP_TREE_TWO_OPERATORS. */
4238 unsigned int const_nunits = nunits.to_constant ();
4239 tree_vector_builder melts (mvectype, const_nunits, 1);
4240 for (l = 0; l < const_nunits; ++l)
4241 {
4242 if (k >= group_size)
4243 k = 0;
4244 tree t = build_int_cst (meltype,
4245 mask[k++] * const_nunits + l);
4246 melts.quick_push (t);
4247 }
4248 tmask = melts.build ();
4249
4250 /* ??? Not all targets support a VEC_PERM_EXPR with a
4251 constant mask that would translate to a vec_merge RTX
4252 (with their vec_perm_const_ok). We can either not
4253 vectorize in that case or let veclower do its job.
4254 Unfortunately that isn't too great and at least for
4255 plus/minus we'd eventually like to match targets
4256 vector addsub instructions. */
4257 gimple *vstmt;
4258 vstmt = gimple_build_assign (make_ssa_name (vectype),
4259 VEC_PERM_EXPR,
4260 gimple_assign_lhs (v0[j]->stmt),
4261 gimple_assign_lhs (v1[j]->stmt),
4262 tmask);
4263 SLP_TREE_VEC_STMTS (node).quick_push
4264 (vect_finish_stmt_generation (stmt_info, vstmt, &si));
4265 }
4266 v0.release ();
4267 v1.release ();
4268 done_p = true;
4269 }
4270 }
4271 if (!done_p)
4272 vect_transform_stmt (stmt_info, &si, node, instance);
4273
4274 /* Restore stmt def-types. */
4275 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4276 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
4277 {
4278 stmt_vec_info child_stmt_info;
4279 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (child), j, child_stmt_info)
4280 STMT_VINFO_DEF_TYPE (child_stmt_info) = vect_internal_def;
4281 }
4282 }
4283
4284 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
4285 For loop vectorization this is done in vectorizable_call, but for SLP
4286 it needs to be deferred until end of vect_schedule_slp, because multiple
4287 SLP instances may refer to the same scalar stmt. */
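/* For example (hypothetical SSA names): once the SLP copies of a call such
   as x_1 = powf (y_2, z_3) have been generated, the scalar statement is
   rewritten to x_1 = 0.0, leaving a trivially dead assignment instead of
   the call.  */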
4288
4289 static void
4290 vect_remove_slp_scalar_calls (slp_tree node, hash_set<slp_tree> &visited)
4291 {
4292 gimple *new_stmt;
4293 gimple_stmt_iterator gsi;
4294 int i;
4295 slp_tree child;
4296 tree lhs;
4297 stmt_vec_info stmt_info;
4298
4299 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
4300 return;
4301
4302 if (visited.add (node))
4303 return;
4304
4305 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4306 vect_remove_slp_scalar_calls (child, visited);
4307
4308 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4309 {
4310 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4311 if (!stmt || gimple_bb (stmt) == NULL)
4312 continue;
4313 if (is_pattern_stmt_p (stmt_info)
4314 || !PURE_SLP_STMT (stmt_info))
4315 continue;
4316 lhs = gimple_call_lhs (stmt);
4317 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
4318 gsi = gsi_for_stmt (stmt);
4319 stmt_info->vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
4320 SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
4321 }
4322 }
4323
4324 static void
4325 vect_remove_slp_scalar_calls (slp_tree node)
4326 {
4327 hash_set<slp_tree> visited;
4328 vect_remove_slp_scalar_calls (node, visited);
4329 }
4330
4331 /* Vectorize the instance root. */
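/* For illustration (hypothetical SSA names): if the root statement is the
   vector constructor x_1 = { a_2, b_3, c_4, d_5 } and the whole group fits
   in a single vector statement with lhs vect_6, the root simply becomes
   x_1 = vect_6 (through a VIEW_CONVERT_EXPR if the types differ); with
   several vector statements, their lhs's are collected into a new
   CONSTRUCTOR instead.  */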
4332
4333 void
4334 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
4335 {
4336 gassign *rstmt = NULL;
4337
4338 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
4339 {
4340 stmt_vec_info child_stmt_info;
4341 int j;
4342
4343 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
4344 {
4345 tree vect_lhs = gimple_get_lhs (child_stmt_info->stmt);
4346 tree root_lhs = gimple_get_lhs (instance->root_stmt->stmt);
4347 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
4348 TREE_TYPE (vect_lhs)))
4349 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
4350 vect_lhs);
4351 rstmt = gimple_build_assign (root_lhs, vect_lhs);
4352 break;
4353 }
4354 }
4355 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
4356 {
4357 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
4358 stmt_vec_info child_stmt_info;
4359 int j;
4360 vec<constructor_elt, va_gc> *v;
4361 vec_alloc (v, nelts);
4362
4363 FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt_info)
4364 {
4365 CONSTRUCTOR_APPEND_ELT (v,
4366 NULL_TREE,
4367 gimple_get_lhs (child_stmt_info->stmt));
4368 }
4369 tree lhs = gimple_get_lhs (instance->root_stmt->stmt);
4370 tree rtype = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmt->stmt));
4371 tree r_constructor = build_constructor (rtype, v);
4372 rstmt = gimple_build_assign (lhs, r_constructor);
4373 }
4374
4375 gcc_assert (rstmt);
4376
4377 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmt->stmt);
4378 gsi_replace (&rgsi, rstmt, true);
4379 }
4380
4381 /* Generate vector code for all SLP instances in the loop/basic block. */
4382
4383 void
4384 vect_schedule_slp (vec_info *vinfo)
4385 {
4386 vec<slp_instance> slp_instances;
4387 slp_instance instance;
4388 unsigned int i;
4389
4390 slp_instances = vinfo->slp_instances;
4391 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4392 {
4393 slp_tree node = SLP_INSTANCE_TREE (instance);
4394 /* Schedule the tree of INSTANCE. */
4395 vect_schedule_slp_instance (node, instance);
4396
4397 if (SLP_INSTANCE_ROOT_STMT (instance))
4398 vectorize_slp_instance_root_stmt (node, instance);
4399
4400 if (dump_enabled_p ())
4401 dump_printf_loc (MSG_NOTE, vect_location,
4402 "vectorizing stmts using SLP.\n");
4403 }
4404
4405 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4406 {
4407 slp_tree root = SLP_INSTANCE_TREE (instance);
4408 stmt_vec_info store_info;
4409 unsigned int j;
4410
4411 /* Remove scalar call stmts. Do not do this for basic-block
4412 vectorization as not all uses may be vectorized.
4413 ??? Why should this be necessary? DCE should be able to
4414 remove the stmts itself.
4415 ??? For BB vectorization we can as well remove scalar
4416 stmts starting from the SLP tree root if they have no
4417 uses. */
4418 if (is_a <loop_vec_info> (vinfo))
4419 vect_remove_slp_scalar_calls (root);
4420
4421 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info)
4422 && j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
4423 {
4424 if (!STMT_VINFO_DATA_REF (store_info))
4425 break;
4426
4427 if (SLP_INSTANCE_ROOT_STMT (instance))
4428 continue;
4429
4430 store_info = vect_orig_stmt (store_info);
4431 /* Free the attached stmt_vec_info and remove the stmt. */
4432 vinfo->remove_stmt (store_info);
4433 }
4434 }
4435 }
4436