/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2022 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "cfganal.h"
#include "tree-eh.h"
#include "tree-cfg.h"
#include "alloc-pool.h"

static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
					  slp_tree, stmt_vector_for_cost *);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);

static object_allocator<_slp_tree> *slp_tree_pool;
static slp_tree slp_first_node;

void
vect_slp_init (void)
{
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}

void
vect_slp_fini (void)
{
  while (slp_first_node)
    delete slp_first_node;
  delete slp_tree_pool;
  slp_tree_pool = NULL;
}

void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}

void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}

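/* SLP nodes are pool-allocated and additionally chained into a
   doubly-linked list headed by slp_first_node; the constructor and
   destructor below keep that list up to date so vect_slp_fini can
   reclaim whatever nodes are still live when the pass finishes.  */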

/* Initialize a SLP node.  */

_slp_tree::_slp_tree ()
{
  this->prev_node = NULL;
  if (slp_first_node)
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_STMTS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_REF_COUNT (this) = 1;
  this->failed = NULL;
  this->max_nunits = 1;
  this->lanes = 0;
}

/* Tear down a SLP node.  */

_slp_tree::~_slp_tree ()
{
  if (this->prev_node)
    this->prev_node->next_node = this->next_node;
  else
    slp_first_node = this->next_node;
  if (this->next_node)
    this->next_node->prev_node = this->prev_node;
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_STMTS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
  if (this->failed)
    free (failed);
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  /* If the node defines any SLP only patterns then those patterns are no
     longer valid and should be removed.  */
  stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
    {
      stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
      STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
      STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
    }

  delete node;
}

/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (!root_stmts.is_empty ())
    return root_stmts[0]->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}


/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_ROOT_STMTS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}


/* Create a new SLP node with NOPS children and operation code CODE.  */

slp_tree
vect_create_new_slp_node (unsigned nops, tree_code code)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_CODE (node) = code;
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node,
			  vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node, vec<tree> ops)
{
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  return vect_create_new_slp_node (new _slp_tree, ops);
}


/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}


/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
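
/* For example (illustrative only, assuming consecutive group members
   carry a DR_GROUP_GAP of 1): for an interleaving chain of accesses to
   a[0], a[1] and a[3], the element for a[3] carries a gap of 2 and the
   places returned for the three statements are 0, 1 and 3.  */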

/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode)))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);
	      poly_int64 half_nelts = exact_div (nelts, 2);
	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
    }
}
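
/* As a concrete illustration (not from the sources): with a constant
   NELTS of 4 the loop above builds the selectors { 0, 4, 1, 5 } and
   { 2, 6, 3, 7 }, i.e. the classic interleave-low and interleave-high
   permutes of two input vectors.  */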

/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
	  || ((dta == vect_external_def || dta == vect_constant_def)
	      && (dtb == vect_external_def || dtb == vect_constant_def)));
}

static const int cond_expr_maps[3][5] = {
  { 4, -1, -2, 1, 2 },
  { 4, -2, -1, 1, 2 },
  { 4, -1, -2, 2, 1 }
};
static const int arg1_map[] = { 1, 1 };
static const int arg2_map[] = { 1, 2 };
static const int arg1_arg4_map[] = { 2, 1, 4 };

/* For most SLP statements, there is a one-to-one mapping between
   gimple arguments and child nodes.  If that is not true for STMT,
   return an array that contains:

   - the number of child nodes, followed by
   - for each child node, the index of the argument associated with that node.
     The special index -1 denotes the first operand of an embedded comparison
     and the special index -2 its second operand.

   SWAP is as for vect_get_and_check_slp_defs.  */

static const int *
vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
{
  if (auto assign = dyn_cast<const gassign *> (stmt))
    {
      if (gimple_assign_rhs_code (assign) == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
	return cond_expr_maps[swap];
    }
  gcc_assert (!swap);
  if (auto call = dyn_cast<const gcall *> (stmt))
    {
      if (gimple_call_internal_p (call))
	switch (gimple_call_internal_fn (call))
	  {
	  case IFN_MASK_LOAD:
	    return arg2_map;

	  case IFN_GATHER_LOAD:
	    return arg1_map;

	  case IFN_MASK_GATHER_LOAD:
	    return arg1_arg4_map;

	  default:
	    break;
	  }
    }
  return nullptr;
}
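
/* For example (illustrative only): for a call
     _1 = .MASK_LOAD (ptr, align, mask);
   the map returned is arg2_map, { 1, 2 }: one child node, associated
   with call argument 2 (the mask).  The pointer and alignment
   arguments never become SLP children.  */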

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
   indicates swap is required for cond_expr stmts.  Specifically, SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   SWAP is 2 if STMT is cond and code of comparison needs to be inverted.

   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of father node of this one, return 1; if everything is
   ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
			     bool *skip_args,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  unsigned int commutative_op = -1U;
  bool first = stmt_num == 0;

  if (!is_a<gcall *> (stmt_info->stmt)
      && !is_a<gassign *> (stmt_info->stmt)
      && !is_a<gphi *> (stmt_info->stmt))
    return -1;

  number_of_oprnds = gimple_num_args (stmt_info->stmt);
  const int *map = vect_get_operand_map (stmt_info->stmt, swap);
  if (map)
    number_of_oprnds = *map++;
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
	commutative_op = 0;
    }

  bool swapped = (swap != 0);
  bool backedge = false;
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      int opno = map ? map[i] : int (i);
      if (opno < 0)
	oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
      else
	{
	  oprnd = gimple_arg (stmt_info->stmt, opno);
	  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
	    backedge = dominated_by_p (CDI_DOMINATORS,
				       gimple_phi_arg_edge (stmt, opno)->src,
				       gimple_bb (stmt_info->stmt));
	}
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      oprnd_info = (*oprnds_info)[i];

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (skip_args[i])
	{
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (NULL_TREE);
	  oprnd_info->first_dt = vect_uninitialized_def;
	  continue;
	}

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (def_stmt_info
	  && is_pattern_stmt_p (def_stmt_info))
	{
	  if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
	      != def_stmt_info)
	    oprnd_info->any_pattern = true;
	  else
	    /* If we promote this to external use the original stmt def.  */
	    oprnd_info->ops.last ()
	      = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
	}

      /* If there's an extern def on a backedge make sure we can
	 code-generate at the region start.
	 ??? This is another case that could be fixed by adjusting
	 how we split the function but at the moment we'd have conflicting
	 goals there.  */
      if (backedge
	  && dts[i] == vect_external_def
	  && is_a <bb_vec_info> (vinfo)
	  && TREE_CODE (oprnd) == SSA_NAME
	  && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
	  && !dominated_by_p (CDI_DOMINATORS,
			      as_a <bb_vec_info> (vinfo)->bbs[0],
			      gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: extern def %T only defined "
			     "on backedge\n", oprnd);
	  return -1;
	}

      if (first)
	{
	  tree type = TREE_TYPE (oprnd);
	  dt = dts[i];
	  if ((dt == vect_constant_def
	       || dt == vect_external_def)
	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
	      && (TREE_CODE (type) == BOOLEAN_TYPE
		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
						      type)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: invalid type of def "
				 "for variable-length SLP %T\n", oprnd);
	      return -1;
	    }

	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in a SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dts[i] = dt = vect_reduction_def;

	  /* Check the types of the definition.  */
	  switch (dt)
	    {
	    case vect_external_def:
	    case vect_constant_def:
	    case vect_internal_def:
	    case vect_reduction_def:
	    case vect_induction_def:
	    case vect_nested_cycle:
	      break;

	    default:
	      /* FORNOW: Not supported.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: illegal type of def %T\n",
				 oprnd);
	      return -1;
	    }

	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = type;
	}
    }
  if (first)
    return 0;

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      if (skip_args[i])
	{
	  ++i;
	  continue;
	}

      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: different operand types\n");
	  return 1;
	}

      /* Not first stmt of the group, check that the def-stmt/s match
	 the def-stmt/s of the first stmt.  Allow different definition
	 types for reduction chains: the first stmt must be a
	 vect_reduction_def (a phi node), and the rest
	 end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
	   && !(oprnd_info->first_dt == vect_reduction_def
		&& !STMT_VINFO_DATA_REF (stmt_info)
		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		&& def_stmt_info
		&& !STMT_VINFO_DATA_REF (def_stmt_info)
		&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	  || (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && ((!def_stmt_info
		   || STMT_VINFO_DATA_REF (def_stmt_info)
		   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		  != (oprnd_info->first_dt != vect_reduction_def))))
	{
	  /* Try swapping operands if we got a mismatch.  For BB
	     vectorization only in case it will clearly improve things.  */
	  if (i == commutative_op && !swapped
	      && (!is_a <bb_vec_info> (vinfo)
		  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
					     dts[i+1])
		      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
			  || vect_def_types_match
			       ((*oprnds_info)[i+1]->first_dt, dts[i])))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "trying swapped operands\n");
	      std::swap (dts[i], dts[i+1]);
	      std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
			 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
	      std::swap ((*oprnds_info)[i]->ops[stmt_num],
			 (*oprnds_info)[i+1]->ops[stmt_num]);
	      swapped = true;
	      continue;
	    }

	  if (is_a <bb_vec_info> (vinfo)
	      && !oprnd_info->any_pattern)
	    {
	      /* Now for commutative ops we should see whether we can
		 make the other operand match.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "treating operand as external\n");
	      oprnd_info->first_dt = dt = vect_external_def;
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");
	      return 1;
	    }
	}

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
	oprnd_info->first_dt = vect_external_def;
      /* For a SLP reduction chain we want to duplicate the reduction to
	 each of the chain members.  That gets us a sane SLP graph (still
	 the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
		|| dt == vect_reduction_def)
	       && oprnd_info->first_dt == vect_reduction_def
	       && !STMT_VINFO_DATA_REF (stmt_info)
	       && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	       && !STMT_VINFO_DATA_REF (def_stmt_info)
	       && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	{
	  oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
	  oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
	}

      ++i;
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  return 0;
}
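
/* An illustrative example (not from the sources): when matching the
   two-lane group
     x1 = a1 + 5;
     x2 = 6 + a2;
   lane 1 has a constant where lane 0 has an internal def.  Since
   PLUS_EXPR is commutative the code above retries with the operands
   of lane 1 swapped, after which both operands match the def types
   recorded for lane 0.  */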

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }

  /* Check that any unvectorized arguments are equal.  */
  if (const int *map = vect_get_operand_map (call1))
    {
      unsigned int nkept = *map++;
      unsigned int mapi = 0;
      for (unsigned int i = 0; i < nargs; ++i)
	if (mapi < nkept && map[mapi] == int (i))
	  mapi += 1;
	else if (!operand_equal_p (gimple_call_arg (call1, i),
				   gimple_call_arg (call2, i)))
	  return false;
    }

  return true;
}
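
/* For example (illustrative only): two calls
     _1 = __builtin_cosf (x_1);
     _2 = __builtin_cosf (x_2);
   have the same combined function and compatible argument types and
   can therefore be grouped, while calls with differing fntypes or
   differing unvectorized arguments cannot.  */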

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  if (is_a <bb_vec_info> (vinfo)
      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* Verify whether the scalar stmts STMTS are isomorphic and of a
   supported type of operation.  Return true if they are, otherwise
   return false and indicate in *MATCHES which stmts are not isomorphic
   to the first one.  If MATCHES[0] is false then this indicates that
   the comparison could not be carried out or that the stmts will never
   be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  code_helper first_stmt_code = ERROR_MARK;
  code_helper alt_stmt_code = ERROR_MARK;
  code_helper rhs_code = ERROR_MARK;
  code_helper first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_load_p = false, load_p = false;
  bool first_stmt_phi_p = false, phi_p = false;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, that can
	 throw or that have volatile operands.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commute operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size))
	{
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}
      /* Record nunits required but continue analysis, producing matches[]
	 as if nunits was not an issue.  This allows splitting of groups
	 to happen.  */
      if (nunits_vectype
	  && !vect_record_max_nunits (vinfo, stmt_info, group_size,
				      nunits_vectype, max_nunits))
	{
	  gcc_assert (is_a <bb_vec_info> (vinfo));
	  maybe_soft_fail = true;
	  soft_fail_nunits_vectype = nunits_vectype;
	}

      gcc_assert (vectype);

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      if (call_stmt)
	{
	  combined_fn cfn = gimple_call_combined_fn (call_stmt);
	  if (cfn != CFN_LAST)
	    rhs_code = cfn;
	  else
	    rhs_code = CALL_EXPR;

	  if (cfn == CFN_MASK_LOAD
	      || cfn == CFN_GATHER_LOAD
	      || cfn == CFN_MASK_GATHER_LOAD)
	    load_p = true;
	  else if ((internal_fn_p (cfn)
		    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  rhs_code = ERROR_MARK;
	  phi_p = true;
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  load_p = gimple_vuse (stmt);
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;
	  first_stmt_load_p = load_p;
	  first_stmt_phi_p = phi_p;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      /* First see if we have a vector/vector shift.  */
	      if (!directly_supported_p (rhs_code, vectype, optab_vector))
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  need_same_oprnds = true;
		  first_op1 = gimple_assign_rhs2 (stmt);
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!load_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (!is_a <bb_vec_info> (vinfo)
		  || TREE_CODE (vec) != SSA_NAME
		  || !operand_equal_p (TYPE_SIZE (vectype),
				       TYPE_SIZE (TREE_TYPE (vec))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (rhs_code == CFN_DIV_POW2)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((first_stmt_code == PLUS_EXPR
		     || first_stmt_code == MINUS_EXPR)
		    && (alt_stmt_code == PLUS_EXPR
			|| alt_stmt_code == MINUS_EXPR)
		    && rhs_code == alt_stmt_code)
	       && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		    && (first_stmt_code == ARRAY_REF
			|| first_stmt_code == BIT_FIELD_REF
			|| first_stmt_code == INDIRECT_REF
			|| first_stmt_code == COMPONENT_REF
			|| first_stmt_code == MEM_REF)
		    && (rhs_code == ARRAY_REF
			|| rhs_code == BIT_FIELD_REF
			|| rhs_code == INDIRECT_REF
			|| rhs_code == COMPONENT_REF
			|| rhs_code == MEM_REF)))
	      || first_stmt_load_p != load_p
	      || first_stmt_phi_p != phi_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (!load_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       call_stmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
	      && (gimple_bb (first_stmt_info->stmt)
		  != gimple_bb (stmt_info->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BB for PHI "
				 "or possibly trapping operation in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = gimple_arg (stmt, 1);
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (!types_compatible_p (vectype, *node_vectype))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      /* Store.  */
	      ;
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	} /* Grouped access.  */
      else
	{
	  if (load_p
	      && rhs_code != CFN_GATHER_LOAD
	      && rhs_code != CFN_MASK_GATHER_LOAD)
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G",
				 stmt);

	      /* FORNOW: Not grouped loads are not supported.  */
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  /* Not memory operation.  */
	  if (!phi_p
	      && rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && (!alt_stmt_code.is_tree_code ()
	  || TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference))
    {
      *two_operators = true;
    }

  if (maybe_soft_fail)
    {
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
	    (soft_fail_nunits_vectype).is_constant (&const_nunits)
	  || const_nunits > group_size)
	matches[0] = false;
      else
	{
	  /* With constant vector elements simulate a mismatch at the
	     point we need to split.  */
	  unsigned tail = group_size & (const_nunits - 1);
	  memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
	}
      return false;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (gimple_uid (x[i]->stmt));
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

/* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
   but then vec::insert does memmove and that's not compatible with
   std::pair.  */
struct chain_op_t
{
  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
      : code (code_), dt (dt_), op (op_) {}
  tree_code code;
  vect_def_type dt;
  tree op;
};

/* Comparator for sorting associatable chains.  */

static int
dt_sort_cmp (const void *op1_, const void *op2_, void *)
{
  auto *op1 = (const chain_op_t *) op1_;
  auto *op2 = (const chain_op_t *) op2_;
  if (op1->dt != op2->dt)
    return (int)op1->dt - (int)op2->dt;
  return (int)op1->code - (int)op2->code;
}

/* Linearize the associatable expression chain at START with the
   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
   filling CHAIN with the result and using WORKLIST as intermediate storage.
   CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
   or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
   stmts, starting with START.  */

static void
vect_slp_linearize_chain (vec_info *vinfo,
			  vec<std::pair<tree_code, gimple *> > &worklist,
			  vec<chain_op_t> &chain,
			  enum tree_code code, gimple *start,
			  gimple *&code_stmt, gimple *&alt_code_stmt,
			  vec<gimple *> *chain_stmts)
{
  /* For each lane linearize the addition/subtraction (or other
     uniform associatable operation) expression tree.  */
  worklist.safe_push (std::make_pair (code, start));
  while (!worklist.is_empty ())
    {
      auto entry = worklist.pop ();
      gassign *stmt = as_a <gassign *> (entry.second);
      enum tree_code in_code = entry.first;
      enum tree_code this_code = gimple_assign_rhs_code (stmt);
      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
      if (!code_stmt
	  && gimple_assign_rhs_code (stmt) == code)
	code_stmt = stmt;
      else if (!alt_code_stmt
	       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
	alt_code_stmt = stmt;
      if (chain_stmts)
	chain_stmts->safe_push (stmt);
      for (unsigned opnum = 1; opnum <= 2; ++opnum)
	{
	  tree op = gimple_op (stmt, opnum);
	  vect_def_type dt;
	  stmt_vec_info def_stmt_info;
	  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
	  gcc_assert (res);
	  if (dt == vect_internal_def
	      && is_pattern_stmt_p (def_stmt_info))
	    op = gimple_get_lhs (def_stmt_info->stmt);
	  gimple *use_stmt;
	  use_operand_p use_p;
	  if (dt == vect_internal_def
	      && single_imm_use (op, &use_p, &use_stmt)
	      && is_gimple_assign (def_stmt_info->stmt)
	      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
		  || (code == PLUS_EXPR
		      && (gimple_assign_rhs_code (def_stmt_info->stmt)
			  == MINUS_EXPR))))
	    {
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      worklist.safe_push (std::make_pair (op_def_code,
						  def_stmt_info->stmt));
	    }
	  else
	    {
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      chain.safe_push (chain_op_t (op_def_code, dt, op));
	    }
	}
    }
}
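
/* For example (illustrative only): linearizing
     t = a - b;
     x = t + c;
   starting at X with CODE == PLUS_EXPR records the chain entries
   (PLUS_EXPR, c), (PLUS_EXPR, a) and (MINUS_EXPR, b), i.e. the
   expression is flattened to a + c - b, with T's statement picked
   up in ALT_CODE_STMT as the first MINUS_EXPR statement.  */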

typedef hash_map <vec <stmt_vec_info>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
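
/* This map keys SLP nodes on their vector of scalar stmts.
   vect_build_slp_tree uses it both as a cache of successful discoveries,
   re-using the node and bumping its reference count, and as a record of
   failed ones via the node's FAILED member, so repeated attempts on the
   same stmt set are answered without re-analysis.  */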

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *limit, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *limit, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 !(*leader)->failed ? "" : "failed ", *leader);
      if (!(*leader)->failed)
	{
	  SLP_TREE_REF_COUNT (*leader)++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	  stmts.release ();
	  return *leader;
	}
      memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
      return NULL;
    }

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

  if (*limit == 0)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery limit exceeded\n");
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      memset (res->failed, 0, sizeof (bool) * group_size);
      memset (matches, 0, sizeof (bool) * group_size);
      return NULL;
    }
  --*limit;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "starting SLP discovery for node %p\n", res);

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
					 &this_max_nunits,
					 matches, limit, tree_size, bst_map);
  if (!res_)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p failed\n", res);
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      if (flag_checking)
	{
	  unsigned i;
	  for (i = 0; i < group_size; ++i)
	    if (!matches[i])
	      break;
	  gcc_assert (i < group_size);
	}
      memcpy (res->failed, matches, sizeof (bool) * group_size);
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p succeeded\n", res);
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  return res_;
}

/* Helper for building an associated SLP node chain.  */

static void
vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
				   slp_tree op0, slp_tree op1,
				   stmt_vec_info oper1, stmt_vec_info oper2,
				   vec<std::pair<unsigned, unsigned> > lperm)
{
  unsigned group_size = SLP_TREE_LANES (op1);

  slp_tree child1 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
  SLP_TREE_VECTYPE (child1) = vectype;
  SLP_TREE_LANES (child1) = group_size;
  SLP_TREE_CHILDREN (child1).create (2);
  SLP_TREE_CHILDREN (child1).quick_push (op0);
  SLP_TREE_CHILDREN (child1).quick_push (op1);
  SLP_TREE_REPRESENTATIVE (child1) = oper1;

  slp_tree child2 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
  SLP_TREE_VECTYPE (child2) = vectype;
  SLP_TREE_LANES (child2) = group_size;
  SLP_TREE_CHILDREN (child2).create (2);
  SLP_TREE_CHILDREN (child2).quick_push (op0);
  SLP_TREE_REF_COUNT (op0)++;
  SLP_TREE_CHILDREN (child2).quick_push (op1);
  SLP_TREE_REF_COUNT (op1)++;
  SLP_TREE_REPRESENTATIVE (child2) = oper2;

  SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
  SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
  SLP_TREE_VECTYPE (perm) = vectype;
  SLP_TREE_LANES (perm) = group_size;
  /* ??? We should set this NULL but that's not expected.  */
  SLP_TREE_REPRESENTATIVE (perm) = oper1;
  SLP_TREE_LANE_PERMUTATION (perm) = lperm;
  SLP_TREE_CHILDREN (perm).quick_push (child1);
  SLP_TREE_CHILDREN (perm).quick_push (child2);
}
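
/* For example (illustrative only): for the two-lane group
     x0 = a0 + b0;
     x1 = a1 - b1;
   the helper above builds one node computing the PLUS on all lanes and
   one computing the MINUS, and makes PERM a VEC_PERM_EXPR that selects
   lane 0 from the first child and lane 1 from the second via LPERM.  */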

/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation.  Otherwise
   return the built SLP tree; MATCHES indicates which lanes of the
   group matched.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *limit, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map)
{
  unsigned nops, i, this_tree_size = 0;
  poly_uint64 this_max_nunits = *max_nunits;

  matches[0] = false;

  stmt_vec_info stmt_info = stmts[0];
  if (!is_a<gcall *> (stmt_info->stmt)
      && !is_a<gassign *> (stmt_info->stmt)
      && !is_a<gphi *> (stmt_info->stmt))
    return NULL;

  nops = gimple_num_args (stmt_info->stmt);
  if (const int *map = vect_get_operand_map (stmt_info->stmt))
    nops = map[0];

  /* If the SLP node is a PHI (induction or reduction), terminate
     the recursion.  */
  bool *skip_args = XALLOCAVEC (bool, nops);
  memset (skip_args, 0, sizeof (bool) * nops);
  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
      {
	tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
	tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
						    group_size);
	if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
				     max_nunits))
	  return NULL;

	vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
	if (def_type == vect_induction_def)
	  {
	    /* Induction PHIs are not cycles but walk the initial
	       value.  Only for inner loops though, for outer loops
	       we need to pick up the value from the actual PHIs
	       to more easily support peeling and epilogue vectorization.  */
	    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
	    if (!nested_in_vect_loop_p (loop, stmt_info))
	      skip_args[loop_preheader_edge (loop)->dest_idx] = true;
	    else
	      loop = loop->inner;
	    skip_args[loop_latch_edge (loop)->dest_idx] = true;
	  }
	else if (def_type == vect_reduction_def
		 || def_type == vect_double_reduction_def
		 || def_type == vect_nested_cycle)
	  {
	    /* Else def types have to match.  */
	    stmt_vec_info other_info;
	    bool all_same = true;
	    FOR_EACH_VEC_ELT (stmts, i, other_info)
	      {
		if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
		  return NULL;
		if (other_info != stmt_info)
		  all_same = false;
	      }
	    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
	    /* Reduction initial values are not explicitly represented.  */
	    if (!nested_in_vect_loop_p (loop, stmt_info))
	      skip_args[loop_preheader_edge (loop)->dest_idx] = true;
	    /* Reduction chain backedge defs are filled manually.
	       ??? Need a better way to identify a SLP reduction chain PHI.
	       Or a better overall way to SLP match those.  */
	    if (all_same && def_type == vect_reduction_def)
	      skip_args[loop_latch_edge (loop)->dest_idx] = true;
	  }
	else if (def_type != vect_internal_def)
	  return NULL;
      }
1711
1712
1713 bool two_operators = false;
1714 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1715 tree vectype = NULL_TREE;
1716 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1717 &this_max_nunits, matches, &two_operators,
1718 &vectype))
1719 return NULL;
1720
1721 /* If the SLP node is a load, terminate the recursion unless masked. */
1722 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1723 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1724 {
1725 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1726 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1727 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1728 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1729 else
1730 {
1731 *max_nunits = this_max_nunits;
1732 (*tree_size)++;
1733 node = vect_create_new_slp_node (node, stmts, 0);
1734 SLP_TREE_VECTYPE (node) = vectype;
1735 /* And compute the load permutation. Whether it is actually
1736 a permutation depends on the unrolling factor which is
1737 decided later. */
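/* For example (illustrative only), for the load group
     x0 = a[1];  x1 = a[0];  x2 = a[3];  x3 = a[2];
   where DR_GROUP_FIRST_ELEMENT is the load of a[0], the loop
   below records the load permutation { 1, 0, 3, 2 }.  */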
1738 vec<unsigned> load_permutation;
1739 int j;
1740 stmt_vec_info load_info;
1741 load_permutation.create (group_size);
1742 stmt_vec_info first_stmt_info
1743 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1744 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1745 {
1746 int load_place = vect_get_place_in_interleaving_chain
1747 (load_info, first_stmt_info);
1748 gcc_assert (load_place != -1);
1749 load_permutation.safe_push (load_place);
1750 }
1751 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1752 return node;
1753 }
1754 }
1755 else if (gimple_assign_single_p (stmt_info->stmt)
1756 && !gimple_vuse (stmt_info->stmt)
1757 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1758 {
1759 /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1760 the same SSA name vector of a type compatible with VECTYPE.  */
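/* For instance (illustrative only), for
     x0 = BIT_FIELD_REF <v, 32, 0>;
     x1 = BIT_FIELD_REF <v, 32, 64>;
   with 32-bit vector elements each lane is offset / size, so the
   loop below records the lane permutation { 0[0], 0[2] }.  */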
1761 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1762 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1763 stmt_vec_info estmt_info;
1764 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1765 {
1766 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1767 tree bfref = gimple_assign_rhs1 (estmt);
1768 HOST_WIDE_INT lane;
1769 if (!known_eq (bit_field_size (bfref),
1770 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1771 || !constant_multiple_p (bit_field_offset (bfref),
1772 bit_field_size (bfref), &lane))
1773 {
1774 lperm.release ();
1775 matches[0] = false;
1776 return NULL;
1777 }
1778 lperm.safe_push (std::make_pair (0, (unsigned)lane));
1779 }
1780 slp_tree vnode = vect_create_new_slp_node (vNULL);
1781 /* ??? We record vectype here but we hide eventually necessary
1782 punning and instead rely on code generation to materialize
1783 VIEW_CONVERT_EXPRs as necessary.  We should instead make
1784 this explicit somehow. */
1785 SLP_TREE_VECTYPE (vnode) = vectype;
1786 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1787 /* We are always building a permutation node even if it is an identity
1788 permute to shield the rest of the vectorizer from the odd node
1789 representing an actual vector without any scalar ops.
1790 ??? We could hide it completely by making the permute node
1791 external? */
1792 node = vect_create_new_slp_node (node, stmts, 1);
1793 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1794 SLP_TREE_LANE_PERMUTATION (node) = lperm;
1795 SLP_TREE_VECTYPE (node) = vectype;
1796 SLP_TREE_CHILDREN (node).quick_push (vnode);
1797 return node;
1798 }
1799 /* When discovery reaches an associatable operation, see whether we can
1800 improve that to match up lanes in a way superior to the operand
1801 swapping code which at most looks at two defs.
1802 ??? For BB vectorization we cannot do the brute-force search
1803 for matching as we can succeed by means of builds from scalars
1804 and have no good way to "cost" one build against another. */
1805 else if (is_a <loop_vec_info> (vinfo)
1806 /* ??? We don't handle !vect_internal_def defs below. */
1807 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1808 && is_gimple_assign (stmt_info->stmt)
1809 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1810 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1811 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1812 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1813 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1814 {
1815 /* See if we have a chain of (mixed) adds or subtracts or other
1816 associatable ops. */
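/* For example (illustrative only), the two lanes
     r0 = (a0 + b0) - c0;
     r1 = (a1 - c1) + b1;
   both linearize to length-3 chains of the form { +a, +b, -c },
   modulo operand order; the sorting and swapping below then match
   the chain entries up across lanes.  */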
1817 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1818 if (code == MINUS_EXPR)
1819 code = PLUS_EXPR;
1820 stmt_vec_info other_op_stmt_info = NULL;
1821 stmt_vec_info op_stmt_info = NULL;
1822 unsigned chain_len = 0;
1823 auto_vec<chain_op_t> chain;
1824 auto_vec<std::pair<tree_code, gimple *> > worklist;
1825 auto_vec<vec<chain_op_t> > chains (group_size);
1826 auto_vec<slp_tree, 4> children;
1827 bool hard_fail = true;
1828 for (unsigned lane = 0; lane < group_size; ++lane)
1829 {
1830 /* For each lane linearize the addition/subtraction (or other
1831 uniform associatable operation) expression tree. */
1832 gimple *op_stmt = NULL, *other_op_stmt = NULL;
1833 vect_slp_linearize_chain (vinfo, worklist, chain, code,
1834 stmts[lane]->stmt, op_stmt, other_op_stmt,
1835 NULL);
1836 if (!op_stmt_info && op_stmt)
1837 op_stmt_info = vinfo->lookup_stmt (op_stmt);
1838 if (!other_op_stmt_info && other_op_stmt)
1839 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1840 if (chain.length () == 2)
1841 {
1842 /* In a chain of just two elements resort to the regular
1843 operand swapping scheme. If we run into a length
1844 mismatch still hard-FAIL. */
1845 if (chain_len == 0)
1846 hard_fail = false;
1847 else
1848 {
1849 matches[lane] = false;
1850 /* ??? We might want to process the other lanes, but
1851 make sure to not give false matching hints to the
1852 caller for lanes we did not process. */
1853 if (lane != group_size - 1)
1854 matches[0] = false;
1855 }
1856 break;
1857 }
1858 else if (chain_len == 0)
1859 chain_len = chain.length ();
1860 else if (chain.length () != chain_len)
1861 {
1862 /* ??? Here we could slip in magic to compensate with
1863 neutral operands. */
1864 matches[lane] = false;
1865 if (lane != group_size - 1)
1866 matches[0] = false;
1867 break;
1868 }
1869 chains.quick_push (chain.copy ());
1870 chain.truncate (0);
1871 }
1872 if (chains.length () == group_size)
1873 {
1874 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
1875 if (!op_stmt_info)
1876 {
1877 hard_fail = false;
1878 goto out;
1879 }
1880 /* Now we have a set of chains with the same length. */
1881 /* 1. pre-sort according to def_type and operation. */
1882 for (unsigned lane = 0; lane < group_size; ++lane)
1883 chains[lane].stablesort (dt_sort_cmp, vinfo);
1884 if (dump_enabled_p ())
1885 {
1886 dump_printf_loc (MSG_NOTE, vect_location,
1887 "pre-sorted chains of %s\n",
1888 get_tree_code_name (code));
1889 for (unsigned lane = 0; lane < group_size; ++lane)
1890 {
1891 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1892 dump_printf (MSG_NOTE, "%s %T ",
1893 get_tree_code_name (chains[lane][opnum].code),
1894 chains[lane][opnum].op);
1895 dump_printf (MSG_NOTE, "\n");
1896 }
1897 }
1898 /* 2. try to build children nodes, associating as necessary. */
1899 for (unsigned n = 0; n < chain_len; ++n)
1900 {
1901 vect_def_type dt = chains[0][n].dt;
1902 unsigned lane;
1903 for (lane = 0; lane < group_size; ++lane)
1904 if (chains[lane][n].dt != dt)
1905 {
1906 if (dt == vect_constant_def
1907 && chains[lane][n].dt == vect_external_def)
1908 dt = vect_external_def;
1909 else if (dt == vect_external_def
1910 && chains[lane][n].dt == vect_constant_def)
1911 ;
1912 else
1913 break;
1914 }
1915 if (lane != group_size)
1916 {
1917 if (dump_enabled_p ())
1918 dump_printf_loc (MSG_NOTE, vect_location,
1919 "giving up on chain due to mismatched "
1920 "def types\n");
1921 matches[lane] = false;
1922 if (lane != group_size - 1)
1923 matches[0] = false;
1924 goto out;
1925 }
1926 if (dt == vect_constant_def
1927 || dt == vect_external_def)
1928 {
1929 /* Check whether we can build the invariant. If we can't
1930 we will never be able to.  */
1931 tree type = TREE_TYPE (chains[0][n].op);
1932 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
1933 && (TREE_CODE (type) == BOOLEAN_TYPE
1934 || !can_duplicate_and_interleave_p (vinfo, group_size,
1935 type)))
1936 {
1937 matches[0] = false;
1938 goto out;
1939 }
1940 vec<tree> ops;
1941 ops.create (group_size);
1942 for (lane = 0; lane < group_size; ++lane)
1943 ops.quick_push (chains[lane][n].op);
1944 slp_tree child = vect_create_new_slp_node (ops);
1945 SLP_TREE_DEF_TYPE (child) = dt;
1946 children.safe_push (child);
1947 }
1948 else if (dt != vect_internal_def)
1949 {
1950 /* Not sure, we might need something special.
1951 gcc.dg/vect/pr96854.c,
1952 gfortran.dg/vect/fast-math-pr37021.f90
1953 and gfortran.dg/vect/pr61171.f trigger this.  */
1954 /* Soft-fail for now. */
1955 hard_fail = false;
1956 goto out;
1957 }
1958 else
1959 {
1960 vec<stmt_vec_info> op_stmts;
1961 op_stmts.create (group_size);
1962 slp_tree child = NULL;
1963 /* Brute-force our way. We have to consider a lane
1964 failing after fixing an earlier fail higher up in the
1965 SLP discovery recursion. So track the current
1966 permute per lane. */
1967 unsigned *perms = XALLOCAVEC (unsigned, group_size);
1968 memset (perms, 0, sizeof (unsigned) * group_size);
1969 do
1970 {
1971 op_stmts.truncate (0);
1972 for (lane = 0; lane < group_size; ++lane)
1973 op_stmts.quick_push
1974 (vinfo->lookup_def (chains[lane][n].op));
1975 child = vect_build_slp_tree (vinfo, op_stmts,
1976 group_size, &this_max_nunits,
1977 matches, limit,
1978 &this_tree_size, bst_map);
1979 /* ??? We're likely getting too many fatal mismatches
1980 here so maybe we want to ignore them (but then we
1981 have no idea which lanes fatally mismatched). */
1982 if (child || !matches[0])
1983 break;
1984 /* Swap another lane we have not yet matched up into
1985 lanes that did not match. If we run out of
1986 permute possibilities for a lane terminate the
1987 search. */
1988 bool term = false;
1989 for (lane = 1; lane < group_size; ++lane)
1990 if (!matches[lane])
1991 {
1992 if (n + perms[lane] + 1 == chain_len)
1993 {
1994 term = true;
1995 break;
1996 }
1997 std::swap (chains[lane][n],
1998 chains[lane][n + perms[lane] + 1]);
1999 perms[lane]++;
2000 }
2001 if (term)
2002 break;
2003 }
2004 while (1);
2005 if (!child)
2006 {
2007 if (dump_enabled_p ())
2008 dump_printf_loc (MSG_NOTE, vect_location,
2009 "failed to match up op %d\n", n);
2010 op_stmts.release ();
2011 if (lane != group_size - 1)
2012 matches[0] = false;
2013 else
2014 matches[lane] = false;
2015 goto out;
2016 }
2017 if (dump_enabled_p ())
2018 {
2019 dump_printf_loc (MSG_NOTE, vect_location,
2020 "matched up op %d to\n", n);
2021 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2022 }
2023 children.safe_push (child);
2024 }
2025 }
2026 /* 3. build SLP nodes to combine the chain. */
2027 for (unsigned lane = 0; lane < group_size; ++lane)
2028 if (chains[lane][0].code != code)
2029 {
2030 /* See if there's any alternate all-PLUS entry. */
2031 unsigned n;
2032 for (n = 1; n < chain_len; ++n)
2033 {
2034 for (lane = 0; lane < group_size; ++lane)
2035 if (chains[lane][n].code != code)
2036 break;
2037 if (lane == group_size)
2038 break;
2039 }
2040 if (n != chain_len)
2041 {
2042 /* Swap that in at first position. */
2043 std::swap (children[0], children[n]);
2044 for (lane = 0; lane < group_size; ++lane)
2045 std::swap (chains[lane][0], chains[lane][n]);
2046 }
2047 else
2048 {
2049 /* ??? When this triggers and we end up with two
2050 vect_constant/external_def up-front, things break (ICE)
2051 spectacularly when finding an insertion place for the
2052 all-constant op. We should have a fully
2053 vect_internal_def operand though(?) so we can swap
2054 that into first place and then prepend the all-zero
2055 constant. */
2056 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_NOTE, vect_location,
2058 "inserting constant zero to compensate "
2059 "for (partially) negated first "
2060 "operand\n");
2061 chain_len++;
2062 for (lane = 0; lane < group_size; ++lane)
2063 chains[lane].safe_insert
2064 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2065 vec<tree> zero_ops;
2066 zero_ops.create (group_size);
2067 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2068 for (lane = 1; lane < group_size; ++lane)
2069 zero_ops.quick_push (zero_ops[0]);
2070 slp_tree zero = vect_create_new_slp_node (zero_ops);
2071 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2072 children.safe_insert (0, zero);
2073 }
2074 break;
2075 }
2076 for (unsigned i = 1; i < children.length (); ++i)
2077 {
2078 slp_tree op0 = children[i - 1];
2079 slp_tree op1 = children[i];
2080 bool this_two_op = false;
2081 for (unsigned lane = 0; lane < group_size; ++lane)
2082 if (chains[lane][i].code != chains[0][i].code)
2083 {
2084 this_two_op = true;
2085 break;
2086 }
2087 slp_tree child;
2088 if (i == children.length () - 1)
2089 child = vect_create_new_slp_node (node, stmts, 2);
2090 else
2091 child = vect_create_new_slp_node (2, ERROR_MARK);
2092 if (this_two_op)
2093 {
2094 vec<std::pair<unsigned, unsigned> > lperm;
2095 lperm.create (group_size);
2096 for (unsigned lane = 0; lane < group_size; ++lane)
2097 lperm.quick_push (std::make_pair
2098 (chains[lane][i].code != chains[0][i].code, lane));
2099 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2100 (chains[0][i].code == code
2101 ? op_stmt_info
2102 : other_op_stmt_info),
2103 (chains[0][i].code == code
2104 ? other_op_stmt_info
2105 : op_stmt_info),
2106 lperm);
2107 }
2108 else
2109 {
2110 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2111 SLP_TREE_VECTYPE (child) = vectype;
2112 SLP_TREE_LANES (child) = group_size;
2113 SLP_TREE_CHILDREN (child).quick_push (op0);
2114 SLP_TREE_CHILDREN (child).quick_push (op1);
2115 SLP_TREE_REPRESENTATIVE (child)
2116 = (chains[0][i].code == code
2117 ? op_stmt_info : other_op_stmt_info);
2118 }
2119 children[i] = child;
2120 }
2121 *tree_size += this_tree_size + 1;
2122 *max_nunits = this_max_nunits;
2123 while (!chains.is_empty ())
2124 chains.pop ().release ();
2125 return node;
2126 }
2127 out:
2128 while (!children.is_empty ())
2129 vect_free_slp_tree (children.pop ());
2130 while (!chains.is_empty ())
2131 chains.pop ().release ();
2132 /* Hard-fail, otherwise we might run into quadratic processing of the
2133 chains, restarting discovery one stmt into the chain again.  */
2134 if (hard_fail)
2135 return NULL;
2136 /* Fall thru to normal processing. */
2137 }
2138
2139 /* Get at the operands, verifying they are compatible. */
2140 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2141 slp_oprnd_info oprnd_info;
2142 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2143 {
2144 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2145 stmts, i, &oprnds_info);
2146 if (res != 0)
2147 matches[(res == -1) ? 0 : i] = false;
2148 if (!matches[0])
2149 break;
2150 }
2151 for (i = 0; i < group_size; ++i)
2152 if (!matches[i])
2153 {
2154 vect_free_oprnd_info (oprnds_info);
2155 return NULL;
2156 }
2157 swap = NULL;
2158
2159 auto_vec<slp_tree, 4> children;
2160
2161 stmt_info = stmts[0];
2162
2163 /* Create SLP_TREE nodes for the definition node/s. */
2164 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2165 {
2166 slp_tree child;
2167 unsigned int j;
2168
2169 /* We're skipping certain operands from processing, for example
2170 outer loop reduction initial defs. */
2171 if (skip_args[i])
2172 {
2173 children.safe_push (NULL);
2174 continue;
2175 }
2176
2177 if (oprnd_info->first_dt == vect_uninitialized_def)
2178 {
2179 /* COND_EXPRs eventually have one operand too many if the condition
2180 is an SSA name.  */
2181 gcc_assert (i == 3 && nops == 4);
2182 continue;
2183 }
2184
2185 if (is_a <bb_vec_info> (vinfo)
2186 && oprnd_info->first_dt == vect_internal_def
2187 && !oprnd_info->any_pattern)
2188 {
2189 /* For BB vectorization, if all defs are the same do not
2190 bother to continue the build along the single-lane
2191 graph but use a splat of the scalar value. */
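/* E.g. (illustrative only), when every lane of the group uses the
   result of the same scalar stmt t = x + y as this operand, the
   operand is demoted to vect_external_def below and a splat of T
   is built instead of recursing on a single-lane graph.  */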
2192 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2193 for (j = 1; j < group_size; ++j)
2194 if (oprnd_info->def_stmts[j] != first_def)
2195 break;
2196 if (j == group_size
2197 /* But avoid doing this for loads where we may be
2198 able to CSE things, unless the stmt is not
2199 vectorizable. */
2200 && (!STMT_VINFO_VECTORIZABLE (first_def)
2201 || !gimple_vuse (first_def->stmt)))
2202 {
2203 if (dump_enabled_p ())
2204 dump_printf_loc (MSG_NOTE, vect_location,
2205 "Using a splat of the uniform operand %G",
2206 first_def->stmt);
2207 oprnd_info->first_dt = vect_external_def;
2208 }
2209 }
2210
2211 if (oprnd_info->first_dt == vect_external_def
2212 || oprnd_info->first_dt == vect_constant_def)
2213 {
2214 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2215 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2216 oprnd_info->ops = vNULL;
2217 children.safe_push (invnode);
2218 continue;
2219 }
2220
2221 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2222 group_size, &this_max_nunits,
2223 matches, limit,
2224 &this_tree_size, bst_map)) != NULL)
2225 {
2226 oprnd_info->def_stmts = vNULL;
2227 children.safe_push (child);
2228 continue;
2229 }
2230
2231 /* If the SLP build for operand zero failed and operands zero
2232 and one can be commuted, try that for the scalar stmts
2233 that failed the match.  */
2234 if (i == 0
2235 /* A first scalar stmt mismatch signals a fatal mismatch. */
2236 && matches[0]
2237 /* ??? For COND_EXPRs we can swap the comparison operands
2238 as well as the arms under some constraints. */
2239 && nops == 2
2240 && oprnds_info[1]->first_dt == vect_internal_def
2241 && is_gimple_assign (stmt_info->stmt)
2242 /* Swapping operands for reductions breaks assumptions later on. */
2243 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2244 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2245 {
2246 /* See whether we can swap the matching or the non-matching
2247 stmt operands. */
2248 bool swap_not_matching = true;
2249 do
2250 {
2251 for (j = 0; j < group_size; ++j)
2252 {
2253 if (matches[j] != !swap_not_matching)
2254 continue;
2255 stmt_vec_info stmt_info = stmts[j];
2256 /* Verify if we can swap operands of this stmt. */
2257 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2258 if (!stmt
2259 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2260 {
2261 if (!swap_not_matching)
2262 goto fail;
2263 swap_not_matching = false;
2264 break;
2265 }
2266 }
2267 }
2268 while (j != group_size);
2269
2270 /* Swap mismatched definition stmts. */
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_NOTE, vect_location,
2273 "Re-trying with swapped operands of stmts ");
2274 for (j = 0; j < group_size; ++j)
2275 if (matches[j] == !swap_not_matching)
2276 {
2277 std::swap (oprnds_info[0]->def_stmts[j],
2278 oprnds_info[1]->def_stmts[j]);
2279 std::swap (oprnds_info[0]->ops[j],
2280 oprnds_info[1]->ops[j]);
2281 if (dump_enabled_p ())
2282 dump_printf (MSG_NOTE, "%d ", j);
2283 }
2284 if (dump_enabled_p ())
2285 dump_printf (MSG_NOTE, "\n");
2286 /* After swapping some operands we lost track whether an
2287 operand has any pattern defs so be conservative here. */
2288 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2289 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2290 /* And try again with scratch 'matches' ... */
2291 bool *tem = XALLOCAVEC (bool, group_size);
2292 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2293 group_size, &this_max_nunits,
2294 tem, limit,
2295 &this_tree_size, bst_map)) != NULL)
2296 {
2297 oprnd_info->def_stmts = vNULL;
2298 children.safe_push (child);
2299 continue;
2300 }
2301 }
2302 fail:
2303
2304 /* If the SLP build failed and we analyze a basic-block
2305 simply treat nodes we fail to build as externally defined
2306 (and thus build vectors from the scalar defs).
2307 The cost model will reject outright expensive cases.
2308 ??? This doesn't treat cases where permutation ultimately
2309 fails (or we don't try permutation below). Ideally we'd
2310 even compute a permutation that will end up with the maximum
2311 SLP tree size... */
2312 if (is_a <bb_vec_info> (vinfo)
2313 /* ??? Rejecting patterns this way doesn't work. We'd have to
2314 do extra work to cancel the pattern so the uses see the
2315 scalar version. */
2316 && !is_pattern_stmt_p (stmt_info)
2317 && !oprnd_info->any_pattern)
2318 {
2319 /* But if there's a leading vector-sized set of matching stmts
2320 fail here so we can split the group. This matches the condition
2321 vect_analyze_slp_instance uses. */
2322 /* ??? We might want to split here and combine the results to support
2323 multiple vector sizes better. */
2324 for (j = 0; j < group_size; ++j)
2325 if (!matches[j])
2326 break;
2327 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2328 {
2329 if (dump_enabled_p ())
2330 dump_printf_loc (MSG_NOTE, vect_location,
2331 "Building vector operands from scalars\n");
2332 this_tree_size++;
2333 child = vect_create_new_slp_node (oprnd_info->ops);
2334 children.safe_push (child);
2335 oprnd_info->ops = vNULL;
2336 continue;
2337 }
2338 }
2339
2340 gcc_assert (child == NULL);
2341 FOR_EACH_VEC_ELT (children, j, child)
2342 if (child)
2343 vect_free_slp_tree (child);
2344 vect_free_oprnd_info (oprnds_info);
2345 return NULL;
2346 }
2347
2348 vect_free_oprnd_info (oprnds_info);
2349
2350 /* If all children of a node are built up from uniform scalars, or
2351 if the node does more than one possibly expensive vector construction,
2352 just throw that away, causing it to be built up from scalars instead.
2353 The exception is the SLP node for the vector store.  */
2354 if (is_a <bb_vec_info> (vinfo)
2355 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2356 /* ??? Rejecting patterns this way doesn't work. We'd have to
2357 do extra work to cancel the pattern so the uses see the
2358 scalar version. */
2359 && !is_pattern_stmt_p (stmt_info))
2360 {
2361 slp_tree child;
2362 unsigned j;
2363 bool all_uniform_p = true;
2364 unsigned n_vector_builds = 0;
2365 FOR_EACH_VEC_ELT (children, j, child)
2366 {
2367 if (!child)
2368 ;
2369 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2370 all_uniform_p = false;
2371 else if (!vect_slp_tree_uniform_p (child))
2372 {
2373 all_uniform_p = false;
2374 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2375 n_vector_builds++;
2376 }
2377 }
2378 if (all_uniform_p
2379 || n_vector_builds > 1
2380 || (n_vector_builds == children.length ()
2381 && is_a <gphi *> (stmt_info->stmt)))
2382 {
2383 /* Roll back. */
2384 matches[0] = false;
2385 FOR_EACH_VEC_ELT (children, j, child)
2386 if (child)
2387 vect_free_slp_tree (child);
2388
2389 if (dump_enabled_p ())
2390 dump_printf_loc (MSG_NOTE, vect_location,
2391 "Building parent vector operands from "
2392 "scalars instead\n");
2393 return NULL;
2394 }
2395 }
2396
2397 *tree_size += this_tree_size + 1;
2398 *max_nunits = this_max_nunits;
2399
2400 if (two_operators)
2401 {
2402 /* ??? We'd likely want to either cache in bst_map sth like
2403 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2404 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2405 explicit stmts to put in so the keying on 'stmts' doesn't
2406 work (but we have the same issue with nodes that use 'ops'). */
2407 slp_tree one = new _slp_tree;
2408 slp_tree two = new _slp_tree;
2409 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2410 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2411 SLP_TREE_VECTYPE (one) = vectype;
2412 SLP_TREE_VECTYPE (two) = vectype;
2413 SLP_TREE_CHILDREN (one).safe_splice (children);
2414 SLP_TREE_CHILDREN (two).safe_splice (children);
2415 slp_tree child;
2416 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2417 SLP_TREE_REF_COUNT (child)++;
2418
2419 /* Here we record the original defs since this
2420 node represents the final lane configuration. */
2421 node = vect_create_new_slp_node (node, stmts, 2);
2422 SLP_TREE_VECTYPE (node) = vectype;
2423 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2424 SLP_TREE_CHILDREN (node).quick_push (one);
2425 SLP_TREE_CHILDREN (node).quick_push (two);
2426 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2427 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2428 enum tree_code ocode = ERROR_MARK;
2429 stmt_vec_info ostmt_info;
2430 unsigned j = 0;
2431 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2432 {
2433 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2434 if (gimple_assign_rhs_code (ostmt) != code0)
2435 {
2436 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2437 ocode = gimple_assign_rhs_code (ostmt);
2438 j = i;
2439 }
2440 else
2441 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2442 }
2443 SLP_TREE_CODE (one) = code0;
2444 SLP_TREE_CODE (two) = ocode;
2445 SLP_TREE_LANES (one) = stmts.length ();
2446 SLP_TREE_LANES (two) = stmts.length ();
2447 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2448 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2449 return node;
2450 }
2451
2452 node = vect_create_new_slp_node (node, stmts, nops);
2453 SLP_TREE_VECTYPE (node) = vectype;
2454 SLP_TREE_CHILDREN (node).splice (children);
2455 return node;
2456 }
2457
2458 /* Dump a single SLP tree NODE. */
2459
2460 static void
2461 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2462 slp_tree node)
2463 {
2464 unsigned i, j;
2465 slp_tree child;
2466 stmt_vec_info stmt_info;
2467 tree op;
2468
2469 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2470 dump_user_location_t user_loc = loc.get_user_location ();
2471 dump_printf_loc (metadata, user_loc,
2472 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2473 ", refcnt=%u)",
2474 SLP_TREE_DEF_TYPE (node) == vect_external_def
2475 ? " (external)"
2476 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2477 ? " (constant)"
2478 : ""), node,
2479 estimated_poly_value (node->max_nunits),
2480 SLP_TREE_REF_COUNT (node));
2481 if (SLP_TREE_VECTYPE (node))
2482 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2483 dump_printf (metadata, "\n");
2484 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2485 {
2486 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2487 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2488 else
2489 dump_printf_loc (metadata, user_loc, "op template: %G",
2490 SLP_TREE_REPRESENTATIVE (node)->stmt);
2491 }
2492 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2493 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2494 dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2495 else
2496 {
2497 dump_printf_loc (metadata, user_loc, "\t{ ");
2498 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2499 dump_printf (metadata, "%T%s ", op,
2500 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2501 dump_printf (metadata, "}\n");
2502 }
2503 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2504 {
2505 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2506 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2507 dump_printf (dump_kind, " %u", j);
2508 dump_printf (dump_kind, " }\n");
2509 }
2510 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2511 {
2512 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2513 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2514 dump_printf (dump_kind, " %u[%u]",
2515 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2516 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2517 dump_printf (dump_kind, " }\n");
2518 }
2519 if (SLP_TREE_CHILDREN (node).is_empty ())
2520 return;
2521 dump_printf_loc (metadata, user_loc, "\tchildren");
2522 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2523 dump_printf (dump_kind, " %p", (void *)child);
2524 dump_printf (dump_kind, "\n");
2525 }
2526
2527 DEBUG_FUNCTION void
2528 debug (slp_tree node)
2529 {
2530 debug_dump_context ctx;
2531 vect_print_slp_tree (MSG_NOTE,
2532 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2533 node);
2534 }
2535
2536 /* Recursive helper for the dot producer below. */
2537
2538 static void
2539 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2540 {
2541 if (visited.add (node))
2542 return;
2543
2544 fprintf (f, "\"%p\" [label=\"", (void *)node);
2545 vect_print_slp_tree (MSG_NOTE,
2546 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2547 node);
2548 fprintf (f, "\"];\n");
2549
2550
2551 for (slp_tree child : SLP_TREE_CHILDREN (node))
2552 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2553
2554 for (slp_tree child : SLP_TREE_CHILDREN (node))
2555 if (child)
2556 dot_slp_tree (f, child, visited);
2557 }
2558
2559 DEBUG_FUNCTION void
2560 dot_slp_tree (const char *fname, slp_tree node)
2561 {
2562 FILE *f = fopen (fname, "w");
2563 fprintf (f, "digraph {\n");
2564 fflush (f);
2565 {
2566 debug_dump_context ctx (f);
2567 hash_set<slp_tree> visited;
2568 dot_slp_tree (f, node, visited);
2569 }
2570 fflush (f);
2571 fprintf (f, "}\n");
2572 fclose (f);
2573 }
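/* Usage sketch (assumed debugger session, for illustration): this
   entry point is meant to be called from a debugger, e.g.
     (gdb) call dot_slp_tree ("/tmp/slp.dot", node)
   after which the file can be rendered with GraphViz:
     $ dot -Tpdf /tmp/slp.dot -o slp.pdf  */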
2574
2575 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2576
2577 static void
2578 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2579 slp_tree node, hash_set<slp_tree> &visited)
2580 {
2581 unsigned i;
2582 slp_tree child;
2583
2584 if (visited.add (node))
2585 return;
2586
2587 vect_print_slp_tree (dump_kind, loc, node);
2588
2589 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2590 if (child)
2591 vect_print_slp_graph (dump_kind, loc, child, visited);
2592 }
2593
2594 static void
2595 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2596 slp_tree entry)
2597 {
2598 hash_set<slp_tree> visited;
2599 vect_print_slp_graph (dump_kind, loc, entry, visited);
2600 }
2601
2602 /* Mark the tree rooted at NODE with PURE_SLP. */
2603
2604 static void
2605 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2606 {
2607 int i;
2608 stmt_vec_info stmt_info;
2609 slp_tree child;
2610
2611 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2612 return;
2613
2614 if (visited.add (node))
2615 return;
2616
2617 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2618 STMT_SLP_TYPE (stmt_info) = pure_slp;
2619
2620 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2621 if (child)
2622 vect_mark_slp_stmts (child, visited);
2623 }
2624
2625 static void
2626 vect_mark_slp_stmts (slp_tree node)
2627 {
2628 hash_set<slp_tree> visited;
2629 vect_mark_slp_stmts (node, visited);
2630 }
2631
2632 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2633
2634 static void
2635 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2636 {
2637 int i;
2638 stmt_vec_info stmt_info;
2639 slp_tree child;
2640
2641 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2642 return;
2643
2644 if (visited.add (node))
2645 return;
2646
2647 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2648 {
2649 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2650 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2651 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2652 }
2653
2654 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2655 if (child)
2656 vect_mark_slp_stmts_relevant (child, visited);
2657 }
2658
2659 static void
2660 vect_mark_slp_stmts_relevant (slp_tree node)
2661 {
2662 hash_set<slp_tree> visited;
2663 vect_mark_slp_stmts_relevant (node, visited);
2664 }
2665
2666
2667 /* Gather loads in the SLP graph NODE and populate the LOADS array.  */
2668
2669 static void
2670 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2671 hash_set<slp_tree> &visited)
2672 {
2673 if (!node || visited.add (node))
2674 return;
2675
2676 if (SLP_TREE_CHILDREN (node).length () == 0)
2677 {
2678 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2679 return;
2680 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2681 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2682 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2683 loads.safe_push (node);
2684 }
2685 else
2686 {
2687 unsigned i;
2688 slp_tree child;
2689 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2690 vect_gather_slp_loads (loads, child, visited);
2691 }
2692 }
2693
2694
2695 /* Find the last scalar stmt in NODE.  */
2696
2697 stmt_vec_info
2698 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2699 {
2700 stmt_vec_info last = NULL;
2701 stmt_vec_info stmt_vinfo;
2702
2703 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2704 {
2705 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2706 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2707 }
2708
2709 return last;
2710 }
2711
2712 /* Find the first stmt in NODE. */
2713
2714 stmt_vec_info
2715 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2716 {
2717 stmt_vec_info first = NULL;
2718 stmt_vec_info stmt_vinfo;
2719
2720 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2721 {
2722 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2723 if (!first
2724 || get_later_stmt (stmt_vinfo, first) == first)
2725 first = stmt_vinfo;
2726 }
2727
2728 return first;
2729 }
2730
2731 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2732 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2733 (also containing the first GROUP1_SIZE stmts, since stores are
2734 consecutive), the second containing the remainder.
2735 Return the first stmt in the second group. */
2736
2737 static stmt_vec_info
2738 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2739 {
2740 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2741 gcc_assert (group1_size > 0);
2742 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2743 gcc_assert (group2_size > 0);
2744 DR_GROUP_SIZE (first_vinfo) = group1_size;
2745
2746 stmt_vec_info stmt_info = first_vinfo;
2747 for (unsigned i = group1_size; i > 1; i--)
2748 {
2749 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2750 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2751 }
2752 /* STMT is now the last element of the first group. */
2753 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2754 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2755
2756 DR_GROUP_SIZE (group2) = group2_size;
2757 for (stmt_info = group2; stmt_info;
2758 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2759 {
2760 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2761 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2762 }
2763
2764 /* For the second group, the DR_GROUP_GAP is that before the original group,
2765 plus skipping over the first vector. */
2766 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2767
2768 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
2769 DR_GROUP_GAP (first_vinfo) += group2_size;
2770
2771 if (dump_enabled_p ())
2772 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2773 group1_size, group2_size);
2774
2775 return group2;
2776 }
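/* Worked example (illustrative only): splitting a store group of
   size 7 with GROUP1_SIZE 4 leaves stmts 0..3 in the first group and
   makes stmt 4 the leader of a new group of size 3.  The new group's
   DR_GROUP_GAP becomes the original gap plus 4 and the first group's
   gap is increased by 3, so each group skips the elements now owned
   by the other.  */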
2777
2778 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2779 statements and a vector of NUNITS elements. */
2780
2781 static poly_uint64
2782 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2783 {
2784 return exact_div (common_multiple (nunits, group_size), group_size);
2785 }
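/* For example, with NUNITS = 4 and GROUP_SIZE = 6 we have
   common_multiple (4, 6) = 12 and thus an unrolling factor of
   12 / 6 = 2: two copies of the group exactly fill three
   4-element vectors.  */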
2786
2787 /* Helper that checks to see if a node is a load node. */
2788
2789 static inline bool
2790 vect_is_slp_load_node (slp_tree root)
2791 {
2792 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2793 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2794 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2795 }
2796
2797
2798 /* Helper function of optimize_load_redistribution that performs the operation
2799 recursively. */
2800
2801 static slp_tree
2802 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2803 vec_info *vinfo, unsigned int group_size,
2804 hash_map<slp_tree, slp_tree> *load_map,
2805 slp_tree root)
2806 {
2807 if (slp_tree *leader = load_map->get (root))
2808 return *leader;
2809
2810 slp_tree node;
2811 unsigned i;
2812
2813 /* For now, we don't know anything about externals so do not do anything. */
2814 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2815 return NULL;
2816 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2817 {
2818 /* First convert this node into a load node and add it to the leaves
2819 list and flatten the permute from a lane to a load one. If it's
2820 unneeded it will be elided later. */
2821 vec<stmt_vec_info> stmts;
2822 stmts.create (SLP_TREE_LANES (root));
2823 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2824 for (unsigned j = 0; j < lane_perm.length (); j++)
2825 {
2826 std::pair<unsigned, unsigned> perm = lane_perm[j];
2827 node = SLP_TREE_CHILDREN (root)[perm.first];
2828
2829 if (!vect_is_slp_load_node (node)
2830 || SLP_TREE_CHILDREN (node).exists ())
2831 {
2832 stmts.release ();
2833 goto next;
2834 }
2835
2836 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2837 }
2838
2839 if (dump_enabled_p ())
2840 dump_printf_loc (MSG_NOTE, vect_location,
2841 "converting stmts on permute node %p\n", root);
2842
2843 bool *matches = XALLOCAVEC (bool, group_size);
2844 poly_uint64 max_nunits = 1;
2845 unsigned tree_size = 0, limit = 1;
2846 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2847 matches, &limit, &tree_size, bst_map);
2848 if (!node)
2849 stmts.release ();
2850
2851 load_map->put (root, node);
2852 return node;
2853 }
2854
2855 next:
2856 load_map->put (root, NULL);
2857
2858 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2859 {
2860 slp_tree value
2861 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2862 node);
2863 if (value)
2864 {
2865 SLP_TREE_REF_COUNT (value)++;
2866 SLP_TREE_CHILDREN (root)[i] = value;
2867 /* ??? We know the original leaves of the replaced nodes will
2868 be referenced by bst_map, only the permutes created by
2869 pattern matching are not. */
2870 if (SLP_TREE_REF_COUNT (node) == 1)
2871 load_map->remove (node);
2872 vect_free_slp_tree (node);
2873 }
2874 }
2875
2876 return NULL;
2877 }
2878
2879 /* Temporary workaround for loads not being CSEd during SLP build.  This
2880 function will traverse the SLP tree rooted in ROOT and find VEC_PERM
2881 nodes that blend vectors from multiple nodes that all read from the
2882 same DR such that the final operation is equal to a permuted load.  Such
2883 nodes are then directly converted into loads themselves.  The nodes are
2884 CSEd using BST_MAP.  */
2885
2886 static void
2887 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2888 vec_info *vinfo, unsigned int group_size,
2889 hash_map<slp_tree, slp_tree> *load_map,
2890 slp_tree root)
2891 {
2892 slp_tree node;
2893 unsigned i;
2894
2895 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2896 {
2897 slp_tree value
2898 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2899 node);
2900 if (value)
2901 {
2902 SLP_TREE_REF_COUNT (value)++;
2903 SLP_TREE_CHILDREN (root)[i] = value;
2904 /* ??? We know the original leaves of the replaced nodes will
2905 be referenced by bst_map, only the permutes created by
2906 pattern matching are not. */
2907 if (SLP_TREE_REF_COUNT (node) == 1)
2908 load_map->remove (node);
2909 vect_free_slp_tree (node);
2910 }
2911 }
2912 }
2913
2914 /* Helper function of vect_match_slp_patterns.
2915
2916 Attempts to match patterns against the slp tree rooted in REF_NODE using
2917 VINFO. Patterns are matched in post-order traversal.
2918
2919 If matching is successful the value in REF_NODE is updated and true is
2920 returned; otherwise false is returned and REF_NODE is left unchanged.  */
2921
2922 static bool
2923 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2924 slp_tree_to_load_perm_map_t *perm_cache,
2925 slp_compat_nodes_map_t *compat_cache,
2926 hash_set<slp_tree> *visited)
2927 {
2928 unsigned i;
2929 slp_tree node = *ref_node;
2930 bool found_p = false;
2931 if (!node || visited->add (node))
2932 return false;
2933
2934 slp_tree child;
2935 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2936 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2937 vinfo, perm_cache, compat_cache,
2938 visited);
2939
2940 for (unsigned x = 0; x < num__slp_patterns; x++)
2941 {
2942 vect_pattern *pattern
2943 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
2944 if (pattern)
2945 {
2946 pattern->build (vinfo);
2947 delete pattern;
2948 found_p = true;
2949 }
2950 }
2951
2952 return found_p;
2953 }
2954
2955 /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
2956 vec_info VINFO.
2957
2958 Returns true if any pattern matched; the tree is modified in place.
2959 Patterns are tried in order and multiple patterns may match.  */
2960
2961 static bool
2962 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2963 hash_set<slp_tree> *visited,
2964 slp_tree_to_load_perm_map_t *perm_cache,
2965 slp_compat_nodes_map_t *compat_cache)
2966 {
2967 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2968 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2969
2970 if (dump_enabled_p ())
2971 dump_printf_loc (MSG_NOTE, vect_location,
2972 "Analyzing SLP tree %p for patterns\n",
2973 SLP_INSTANCE_TREE (instance));
2974
2975 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
2976 visited);
2977 }
2978
2979 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
2980 splitting into two, with the first split group having size NEW_GROUP_SIZE.
2981 Return true if we could use IFN_STORE_LANES instead and if that appears
2982 to be the better approach. */
2983
2984 static bool
2985 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
2986 unsigned int group_size,
2987 unsigned int new_group_size)
2988 {
2989 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2990 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
2991 if (!vectype)
2992 return false;
2993 /* Allow the split if one of the two new groups would operate on full
2994 vectors *within* rather than across one scalar loop iteration.
2995 This is purely a heuristic, but it should work well for group
2996 sizes of 3 and 4, where the possible splits are:
2997
2998 3->2+1: OK if the vector has exactly two elements
2999 4->2+2: Likewise
3000 4->3+1: Less clear-cut. */
3001 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3002 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3003 return false;
3004 return vect_store_lanes_supported (vectype, group_size, false);
3005 }
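/* For instance (illustrative only), for GROUP_SIZE 3 and
   NEW_GROUP_SIZE 2 with a 2-element vector multiple_p (2, 2) holds,
   so the 2+1 split is allowed and false is returned.  With a
   4-element vector neither new group fills a whole vector and the
   answer reduces to whether store-lanes is supported for the
   group.  */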
3006
3007 /* Analyze an SLP instance starting from a group of grouped stores. Call
3008 vect_build_slp_tree to build a tree of packed stmts if possible.
3009 Return FALSE if it's impossible to SLP any stmt in the loop. */
3010
3011 static bool
3012 vect_analyze_slp_instance (vec_info *vinfo,
3013 scalar_stmts_to_slp_tree_map_t *bst_map,
3014 stmt_vec_info stmt_info, slp_instance_kind kind,
3015 unsigned max_tree_size, unsigned *limit);
3016
3017 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3018 of KIND. Return true if successful. */
3019
3020 static bool
3021 vect_build_slp_instance (vec_info *vinfo,
3022 slp_instance_kind kind,
3023 vec<stmt_vec_info> &scalar_stmts,
3024 vec<stmt_vec_info> &root_stmt_infos,
3025 unsigned max_tree_size, unsigned *limit,
3026 scalar_stmts_to_slp_tree_map_t *bst_map,
3027 /* ??? We need stmt_info for group splitting. */
3028 stmt_vec_info stmt_info_)
3029 {
3030 if (dump_enabled_p ())
3031 {
3032 dump_printf_loc (MSG_NOTE, vect_location,
3033 "Starting SLP discovery for\n");
3034 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3035 dump_printf_loc (MSG_NOTE, vect_location,
3036 " %G", scalar_stmts[i]->stmt);
3037 }
3038
3039 /* Build the tree for the SLP instance. */
3040 unsigned int group_size = scalar_stmts.length ();
3041 bool *matches = XALLOCAVEC (bool, group_size);
3042 poly_uint64 max_nunits = 1;
3043 unsigned tree_size = 0;
3044 unsigned i;
3045 slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3046 &max_nunits, matches, limit,
3047 &tree_size, bst_map);
3048 if (node != NULL)
3049 {
3050 /* Calculate the unrolling factor based on the smallest type. */
3051 poly_uint64 unrolling_factor
3052 = calculate_unrolling_factor (max_nunits, group_size);
3053
3054 if (maybe_ne (unrolling_factor, 1U)
3055 && is_a <bb_vec_info> (vinfo))
3056 {
3057 unsigned HOST_WIDE_INT const_max_nunits;
3058 if (!max_nunits.is_constant (&const_max_nunits)
3059 || const_max_nunits > group_size)
3060 {
3061 if (dump_enabled_p ())
3062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3063 "Build SLP failed: store group "
3064 "size not a multiple of the vector size "
3065 "in basic block SLP\n");
3066 vect_free_slp_tree (node);
3067 return false;
3068 }
3069 /* Fatal mismatch. */
3070 if (dump_enabled_p ())
3071 dump_printf_loc (MSG_NOTE, vect_location,
3072 "SLP discovery succeeded but node needs "
3073 "splitting\n");
3074 memset (matches, true, group_size);
3075 matches[group_size / const_max_nunits * const_max_nunits] = false;
3076 vect_free_slp_tree (node);
3077 }
3078 else
3079 {
3080 /* Create a new SLP instance. */
3081 slp_instance new_instance = XNEW (class _slp_instance);
3082 SLP_INSTANCE_TREE (new_instance) = node;
3083 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3084 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3085 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3086 SLP_INSTANCE_KIND (new_instance) = kind;
3087 new_instance->reduc_phis = NULL;
3088 new_instance->cost_vec = vNULL;
3089 new_instance->subgraph_entries = vNULL;
3090
3091 if (dump_enabled_p ())
3092 dump_printf_loc (MSG_NOTE, vect_location,
3093 "SLP size %u vs. limit %u.\n",
3094 tree_size, max_tree_size);
3095
3096 /* Fixup SLP reduction chains. */
3097 if (kind == slp_inst_kind_reduc_chain)
3098 {
3099 /* If this is a reduction chain with a conversion in front
3100 amend the SLP tree with a node for that. */
3101 gimple *scalar_def
3102 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3103 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3104 {
3105 /* Get at the conversion stmt - we know it's the single use
3106 of the last stmt of the reduction chain. */
3107 use_operand_p use_p;
3108 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3109 &use_p, &scalar_def);
3110 gcc_assert (r);
3111 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3112 next_info = vect_stmt_to_vectorize (next_info);
3113 scalar_stmts = vNULL;
3114 scalar_stmts.create (group_size);
3115 for (unsigned i = 0; i < group_size; ++i)
3116 scalar_stmts.quick_push (next_info);
3117 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3118 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3119 SLP_TREE_CHILDREN (conv).quick_push (node);
3120 SLP_INSTANCE_TREE (new_instance) = conv;
3121 /* We also have to fake this conversion stmt as SLP reduction
3122 group so we don't have to mess with too much code
3123 elsewhere. */
3124 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3125 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3126 }
3127 /* Fill the backedge child of the PHI SLP node. The
3128 general matching code cannot find it because the
3129 scalar code does not reflect how we vectorize the
3130 reduction. */
3131 use_operand_p use_p;
3132 imm_use_iterator imm_iter;
3133 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3134 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3135 gimple_get_lhs (scalar_def))
3136 /* There are exactly two non-debug uses, the reduction
3137 PHI and the loop-closed PHI node. */
3138 if (!is_gimple_debug (USE_STMT (use_p))
3139 && gimple_bb (USE_STMT (use_p)) == loop->header)
3140 {
3141 auto_vec<stmt_vec_info, 64> phis (group_size);
3142 stmt_vec_info phi_info
3143 = vinfo->lookup_stmt (USE_STMT (use_p));
3144 for (unsigned i = 0; i < group_size; ++i)
3145 phis.quick_push (phi_info);
3146 slp_tree *phi_node = bst_map->get (phis);
3147 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3148 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3149 = SLP_INSTANCE_TREE (new_instance);
3150 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3151 }
3152 }
3153
3154 vinfo->slp_instances.safe_push (new_instance);
3155
3156 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3157 the number of scalar stmts in the root in a few places.
3158 Verify that assumption holds. */
3159 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3160 .length () == group_size);
3161
3162 if (dump_enabled_p ())
3163 {
3164 dump_printf_loc (MSG_NOTE, vect_location,
3165 "Final SLP tree for instance %p:\n", new_instance);
3166 vect_print_slp_graph (MSG_NOTE, vect_location,
3167 SLP_INSTANCE_TREE (new_instance));
3168 }
3169
3170 return true;
3171 }
3172 }
3173 else
3174 {
3175 /* Failed to SLP. */
3176 /* Free the allocated memory. */
3177 scalar_stmts.release ();
3178 }
3179
3180 stmt_vec_info stmt_info = stmt_info_;
3181 /* Try to break the group up into pieces. */
3182 if (kind == slp_inst_kind_store)
3183 {
3184 /* ??? We could delay all the actual splitting of store-groups
3185 until after SLP discovery of the original group completed.
3186 Then we can recurse to vect_build_slp_instance directly. */
3187 for (i = 0; i < group_size; i++)
3188 if (!matches[i])
3189 break;
3190
3191 /* For basic block SLP, try to break the group up into multiples of
3192 a vector size. */
3193 if (is_a <bb_vec_info> (vinfo)
3194 && (i > 1 && i < group_size))
3195 {
3196 tree scalar_type
3197 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3198 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3199 1 << floor_log2 (i));
3200 unsigned HOST_WIDE_INT const_nunits;
3201 if (vectype
3202 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3203 {
3204 /* Split into two groups at the first vector boundary. */
3205 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3206 unsigned group1_size = i & ~(const_nunits - 1);
3207
3208 if (dump_enabled_p ())
3209 dump_printf_loc (MSG_NOTE, vect_location,
3210 "Splitting SLP group at stmt %u\n", i);
3211 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3212 group1_size);
3213 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3214 kind, max_tree_size,
3215 limit);
3216 /* Split the rest at the failure point and possibly
3217 re-analyze the remaining matching part if it has
3218 at least two lanes. */
3219 if (group1_size < i
3220 && (i + 1 < group_size
3221 || i - group1_size > 1))
3222 {
3223 stmt_vec_info rest2 = rest;
3224 rest = vect_split_slp_store_group (rest, i - group1_size);
3225 if (i - group1_size > 1)
3226 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3227 kind, max_tree_size,
3228 limit);
3229 }
3230 /* Re-analyze the non-matching tail if it has at least
3231 two lanes. */
3232 if (i + 1 < group_size)
3233 res |= vect_analyze_slp_instance (vinfo, bst_map,
3234 rest, kind, max_tree_size,
3235 limit);
3236 return res;
3237 }
3238 }
3239
3240 /* For loop vectorization split into arbitrary pieces of size > 1. */
3241 if (is_a <loop_vec_info> (vinfo)
3242 && (i > 1 && i < group_size)
3243 && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3244 {
3245 unsigned group1_size = i;
3246
3247 if (dump_enabled_p ())
3248 dump_printf_loc (MSG_NOTE, vect_location,
3249 "Splitting SLP group at stmt %u\n", i);
3250
3251 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3252 group1_size);
3253 /* Loop vectorization cannot handle gaps in stores, make sure
3254 the split group appears as strided. */
3255 STMT_VINFO_STRIDED_P (rest) = 1;
3256 DR_GROUP_GAP (rest) = 0;
3257 STMT_VINFO_STRIDED_P (stmt_info) = 1;
3258 DR_GROUP_GAP (stmt_info) = 0;
3259
3260 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3261 kind, max_tree_size, limit);
3262 if (i + 1 < group_size)
3263 res |= vect_analyze_slp_instance (vinfo, bst_map,
3264 rest, kind, max_tree_size, limit);
3265
3266 return res;
3267 }
3268
3269 /* Even though the first vector did not all match, we might be able to SLP
3270 (some) of the remainder. FORNOW ignore this possibility. */
3271 }
3272
3273 /* Failed to SLP. */
3274 if (dump_enabled_p ())
3275 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3276 return false;
3277 }
3278
3279
3280 /* Analyze an SLP instance starting from a group of grouped stores. Call
3281 vect_build_slp_tree to build a tree of packed stmts if possible.
3282 Return FALSE if it's impossible to SLP any stmt in the loop. */
3283
3284 static bool
3285 vect_analyze_slp_instance (vec_info *vinfo,
3286 scalar_stmts_to_slp_tree_map_t *bst_map,
3287 stmt_vec_info stmt_info,
3288 slp_instance_kind kind,
3289 unsigned max_tree_size, unsigned *limit)
3290 {
3291 unsigned int i;
3292 vec<stmt_vec_info> scalar_stmts;
3293
3294 if (is_a <bb_vec_info> (vinfo))
3295 vect_location = stmt_info->stmt;
3296
3297 stmt_vec_info next_info = stmt_info;
3298 if (kind == slp_inst_kind_store)
3299 {
3300 /* Collect the stores and store them in scalar_stmts. */
3301 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3302 while (next_info)
3303 {
3304 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3305 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3306 }
3307 }
3308 else if (kind == slp_inst_kind_reduc_chain)
3309 {
3310 /* Collect the reduction stmts and store them in scalar_stmts. */
3311 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3312 while (next_info)
3313 {
3314 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3315 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3316 }
3317 /* Mark the first element of the reduction chain as reduction to properly
3318 transform the node. In the reduction analysis phase only the last
3319 element of the chain is marked as reduction. */
3320 STMT_VINFO_DEF_TYPE (stmt_info)
3321 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3322 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3323 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3324 }
3325 else if (kind == slp_inst_kind_ctor)
3326 {
3327 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3328 tree val;
3329 scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3330 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3331 {
3332 stmt_vec_info def_info = vinfo->lookup_def (val);
3333 def_info = vect_stmt_to_vectorize (def_info);
3334 scalar_stmts.quick_push (def_info);
3335 }
3336 if (dump_enabled_p ())
3337 dump_printf_loc (MSG_NOTE, vect_location,
3338 "Analyzing vectorizable constructor: %G\n",
3339 stmt_info->stmt);
3340 }
3341 else if (kind == slp_inst_kind_reduc_group)
3342 {
3343 /* Collect reduction statements. */
3344 const vec<stmt_vec_info> &reductions
3345 = as_a <loop_vec_info> (vinfo)->reductions;
3346 scalar_stmts.create (reductions.length ());
3347 for (i = 0; reductions.iterate (i, &next_info); i++)
3348 if ((STMT_VINFO_RELEVANT_P (next_info)
3349 || STMT_VINFO_LIVE_P (next_info))
3350 /* ??? Make sure we didn't skip a conversion around a reduction
3351 path. In that case we'd have to reverse engineer that conversion
3352 stmt following the chain using reduc_idx and from the PHI
3353 using reduc_def. */
3354 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3355 scalar_stmts.quick_push (next_info);
3356 /* If fewer than two were relevant/live there's nothing to SLP. */
3357 if (scalar_stmts.length () < 2)
3358 return false;
3359 }
3360 else
3361 gcc_unreachable ();
3362
3363 vec<stmt_vec_info> roots = vNULL;
3364 if (kind == slp_inst_kind_ctor)
3365 {
3366 roots.create (1);
3367 roots.quick_push (stmt_info);
3368 }
3369 /* Build the tree for the SLP instance. */
3370 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3371 roots,
3372 max_tree_size, limit, bst_map,
3373 kind == slp_inst_kind_store
3374 ? stmt_info : NULL);
3375 if (!res)
3376 roots.release ();
3377
3378 /* ??? If this is slp_inst_kind_store and the above succeeded, here's
3379 where we should do store group splitting. */
3380
3381 return res;
3382 }
3383
3384 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
3385 trees of packed scalar stmts if SLP is possible. */
3386
3387 opt_result
3388 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3389 {
3390 unsigned int i;
3391 stmt_vec_info first_element;
3392 slp_instance instance;
3393
3394 DUMP_VECT_SCOPE ("vect_analyze_slp");
3395
3396 unsigned limit = max_tree_size;
3397
3398 scalar_stmts_to_slp_tree_map_t *bst_map
3399 = new scalar_stmts_to_slp_tree_map_t ();
3400
3401 /* Find SLP sequences starting from groups of grouped stores. */
3402 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3403 vect_analyze_slp_instance (vinfo, bst_map, first_element,
3404 STMT_VINFO_GROUPED_ACCESS (first_element)
3405 ? slp_inst_kind_store : slp_inst_kind_ctor,
3406 max_tree_size, &limit);
3407
3408 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3409 {
3410 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3411 {
3412 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3413 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3414 bb_vinfo->roots[i].stmts,
3415 bb_vinfo->roots[i].roots,
3416 max_tree_size, &limit, bst_map, NULL))
3417 {
3418 bb_vinfo->roots[i].stmts = vNULL;
3419 bb_vinfo->roots[i].roots = vNULL;
3420 }
3421 }
3422 }
3423
3424 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3425 {
3426 /* Find SLP sequences starting from reduction chains. */
3427 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3428 if (! STMT_VINFO_RELEVANT_P (first_element)
3429 && ! STMT_VINFO_LIVE_P (first_element))
3430 ;
3431 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3432 slp_inst_kind_reduc_chain,
3433 max_tree_size, &limit))
3434 {
3435 /* Dissolve reduction chain group. */
3436 stmt_vec_info vinfo = first_element;
3437 stmt_vec_info last = NULL;
3438 while (vinfo)
3439 {
3440 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3441 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3442 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3443 last = vinfo;
3444 vinfo = next;
3445 }
3446 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3447 /* It can still be vectorized as part of an SLP reduction. */
3448 loop_vinfo->reductions.safe_push (last);
3449 }
3450
3451 /* Find SLP sequences starting from groups of reductions. */
3452 if (loop_vinfo->reductions.length () > 1)
3453 vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3454 slp_inst_kind_reduc_group, max_tree_size,
3455 &limit);
3456 }
3457
3458 hash_set<slp_tree> visited_patterns;
3459 slp_tree_to_load_perm_map_t perm_cache;
3460 slp_compat_nodes_map_t compat_cache;
3461
3462 /* See if any patterns can be found in the SLP tree. */
3463 bool pattern_found = false;
3464 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3465 pattern_found |= vect_match_slp_patterns (instance, vinfo,
3466 &visited_patterns, &perm_cache,
3467 &compat_cache);
3468
3469 /* If any were found, optimize permutations of loads. */
3470 if (pattern_found)
3471 {
3472 hash_map<slp_tree, slp_tree> load_map;
3473 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3474 {
3475 slp_tree root = SLP_INSTANCE_TREE (instance);
3476 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3477 &load_map, root);
3478 }
3479 }
3480
3481
3482
3483 /* The map keeps a reference on SLP nodes built; release that. */
3484 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3485 it != bst_map->end (); ++it)
3486 if ((*it).second)
3487 vect_free_slp_tree ((*it).second);
3488 delete bst_map;
3489
3490 if (pattern_found && dump_enabled_p ())
3491 {
3492 dump_printf_loc (MSG_NOTE, vect_location,
3493 "Pattern matched SLP tree\n");
3494 hash_set<slp_tree> visited;
3495 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3496 vect_print_slp_graph (MSG_NOTE, vect_location,
3497 SLP_INSTANCE_TREE (instance), visited);
3498 }
3499
3500 return opt_result::success ();
3501 }
3502
3503 struct slpg_vertex
3504 {
3505 slpg_vertex (slp_tree node_)
3506 : node (node_), perm_in (-1), perm_out (-1) {}
3507
3508 int get_perm_materialized () const
3509 { return perm_in != perm_out ? perm_in : 0; }
3510
3511 slp_tree node;
3512 /* The common permutation on the incoming lanes (towards SLP children). */
3513 int perm_in;
3514 /* The permutation on the outgoing lanes (towards SLP parents). When
3515 the node is a materialization point for a permute this differs
3516 from perm_in (and is then usually zero). Materialization happens
3517 on the input side. */
3518 int perm_out;
3519 };
3520
3521 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3522
3523 static void
3524 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
3525 vec<slpg_vertex> &vertices, vec<int> &leafs)
3526 {
3527 unsigned i;
3528 slp_tree child;
3529
3530 if (visited.add (node))
3531 return;
3532
3533 node->vertex = vertices.length ();
3534 vertices.safe_push (slpg_vertex (node));
3535
3536 bool leaf = true;
3537 bool force_leaf = false;
3538 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3539 if (child)
3540 {
3541 leaf = false;
3542 vect_slp_build_vertices (visited, child, vertices, leafs);
3543 }
3544 else
3545 force_leaf = true;
3546 /* Since SLP discovery works along use-def edges all cycles have an
3547 entry - but there's the exception of cycles where we do not handle
3548 the entry explicitly (but with a NULL SLP node), like some reductions
3549 and inductions. Force those SLP PHIs to act as leafs to make them
3550 backwards reachable. */
3551 if (leaf || force_leaf)
3552 leafs.safe_push (node->vertex);
3553 }
3554
3555 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
3556
3557 static void
3558 vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
3559 vec<int> &leafs)
3560 {
3561 hash_set<slp_tree> visited;
3562 unsigned i;
3563 slp_instance instance;
3564 FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
3565 vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
3566 leafs);
3567 }
3568
3569 /* Apply (reverse) bijective PERM to VEC. */
3570
3571 template <class T>
3572 static void
3573 vect_slp_permute (vec<unsigned> perm,
3574 vec<T> &vec, bool reverse)
3575 {
3576 auto_vec<T, 64> saved;
3577 saved.create (vec.length ());
3578 for (unsigned i = 0; i < vec.length (); ++i)
3579 saved.quick_push (vec[i]);
3580
3581 if (reverse)
3582 {
3583 for (unsigned i = 0; i < vec.length (); ++i)
3584 vec[perm[i]] = saved[i];
3585 for (unsigned i = 0; i < vec.length (); ++i)
3586 gcc_assert (vec[perm[i]] == saved[i]);
3587 }
3588 else
3589 {
3590 for (unsigned i = 0; i < vec.length (); ++i)
3591 vec[i] = saved[perm[i]];
3592 for (unsigned i = 0; i < vec.length (); ++i)
3593 gcc_assert (vec[i] == saved[perm[i]]);
3594 }
3595 }
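/* A worked example of the above: with PERM = { 2, 0, 1 } and
   VEC = { a, b, c }, the forward direction computes vec[i] = saved[perm[i]]
   and yields { c, a, b }, while REVERSE computes vec[perm[i]] = saved[i]
   and yields { b, c, a }.  */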
3596
3597 /* Return whether permutations PERM_A and PERM_B as recorded in the
3598 PERMS vector are equal. */
3599
3600 static bool
3601 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3602 int perm_a, int perm_b)
3603 {
3604 return (perm_a == perm_b
3605 || (perm_a != -1 && perm_b != -1
3606 && perms[perm_a].length () == perms[perm_b].length ()
3607 && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3608 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3609 }
3610
3611 /* Optimize the SLP graph of VINFO. */
3612
3613 void
3614 vect_optimize_slp (vec_info *vinfo)
3615 {
3616 if (vinfo->slp_instances.is_empty ())
3617 return;
3618
3619 slp_tree node;
3620 unsigned i;
3621 auto_vec<slpg_vertex> vertices;
3622 auto_vec<int> leafs;
3623 vect_slp_build_vertices (vinfo, vertices, leafs);
3624
3625 struct graph *slpg = new_graph (vertices.length ());
3626 for (slpg_vertex &v : vertices)
3627 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
3628 if (child)
3629 add_edge (slpg, v.node->vertex, child->vertex);
3630
3631 /* Compute (reverse) postorder on the inverted graph. */
3632 auto_vec<int> ipo;
3633 graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3634
3635 auto_vec<vec<unsigned> > perms;
3636 perms.safe_push (vNULL); /* zero is no permute */
3637
3638 /* Produce initial permutations. */
3639 for (i = 0; i < leafs.length (); ++i)
3640 {
3641 int idx = leafs[i];
3642 slp_tree node = vertices[idx].node;
3643
3644 /* Handle externals and constants optimistically throughout the
3645 iteration. But treat existing vectors as fixed since we
3646 do not handle permuting them below. */
3647 if ((SLP_TREE_DEF_TYPE (node) == vect_external_def
3648 && !SLP_TREE_VEC_DEFS (node).exists ())
3649 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3650 continue;
3651
3652 /* Leafs do not change across iterations. Note leafs also double
3653 as entries to the reverse graph. */
3654 if (!slpg->vertices[idx].succ)
3655 {
3656 vertices[idx].perm_in = 0;
3657 vertices[idx].perm_out = 0;
3658 }
3659
3660 /* Loads are the only thing generating permutes. */
3661 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3662 continue;
3663
3664 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3665 node unpermuted, record this permute. */
3666 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3667 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3668 continue;
3669 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3670 unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3671 bool any_permute = false;
3672 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3673 {
3674 unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3675 imin = MIN (imin, idx);
3676 imax = MAX (imax, idx);
3677 if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3678 any_permute = true;
3679 }
3680 /* If there's no permute no need to split one out. */
3681 if (!any_permute)
3682 continue;
3683 /* If the span doesn't match we'd disrupt VF computation; avoid
3684 that for now. */
3685 if (imax - imin + 1 != SLP_TREE_LANES (node))
3686 continue;
3687
3688 /* For now only handle true permutes, like
3689 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
3690 when permuting constants and invariants, keeping the permute
3691 bijective. */
3692 auto_sbitmap load_index (SLP_TREE_LANES (node));
3693 bitmap_clear (load_index);
3694 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3695 bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3696 unsigned j;
3697 for (j = 0; j < SLP_TREE_LANES (node); ++j)
3698 if (!bitmap_bit_p (load_index, j))
3699 break;
3700 if (j != SLP_TREE_LANES (node))
3701 continue;
3702
3703 vec<unsigned> perm = vNULL;
3704 perm.safe_grow (SLP_TREE_LANES (node), true);
3705 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3706 perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3707 perms.safe_push (perm);
3708 vertices[idx].perm_in = perms.length () - 1;
3709 vertices[idx].perm_out = perms.length () - 1;
3710 }
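  /* Illustration: a load node with load permutation { 3, 2, 1, 0 } has
     imin == 0, spans exactly its four lanes and covers each lane once,
     so the bijective permute { 3, 2, 1, 0 } is recorded in PERMS and
     becomes the node's initial perm_in/perm_out.  A permutation such as
     { 0, 0, 1, 2 } fails the span/bijectivity checks above and gets no
     initial permute.  */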
3711
3712 /* In addition to the above we have to mark outgoing permutes facing
3713 non-reduction graph entries, which are not otherwise represented, as
3714 to be materialized. */
3715 for (slp_instance instance : vinfo->slp_instances)
3716 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
3717 {
3718 /* Just setting perm_out isn't enough for the propagation to
3719 pick this up. */
3720 vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_in = 0;
3721 vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_out = 0;
3722 }
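  /* The propagation below operates on a small lattice per vertex:
     perm_in/perm_out of -1 means "not yet known" (treated optimistically),
     0 means "no permute" and a positive value is an index into PERMS.
     We iterate over the inverse postorder until no value changes, then
     switch on materialization decisions and iterate again until that
     converges as well.  */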
3723
3724 /* Propagate permutes along the graph and compute materialization points. */
3725 bool changed;
3726 bool do_materialization = false;
3727 unsigned iteration = 0;
3728 do
3729 {
3730 changed = false;
3731 ++iteration;
3732
3733 if (dump_enabled_p ())
3734 dump_printf_loc (MSG_NOTE, vect_location,
3735 "SLP optimize iteration %d\n", iteration);
3736
3737 for (i = vertices.length (); i > 0 ; --i)
3738 {
3739 int idx = ipo[i-1];
3740 slp_tree node = vertices[idx].node;
3741
3742 /* Handle externals and constants optimistically throughout the
3743 iteration. */
3744 if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3745 || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3746 continue;
3747
3748 /* We still eventually have failed backedge SLP nodes in the
3749 graph; those are only cancelled when analyzing operations.
3750 Simply treat them as transparent ops, propagating permutes
3751 through them. */
3752 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3753 {
3754 /* We do not handle stores with a permutation, so all
3755 incoming permutes must have been materialized. */
3756 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
3757 if (STMT_VINFO_DATA_REF (rep)
3758 && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
3759 {
3760 /* ??? We're forcing materialization in place
3761 of the child here, we'd need special handling
3762 in materialization to leave perm_in -1 here. */
3763 vertices[idx].perm_in = 0;
3764 vertices[idx].perm_out = 0;
3765 }
3766 /* We cannot move a permute across an operation that does not
3767 operate independently on each lane. Note this is an explicit
3768 negative list since that's much shorter than the respective
3769 positive one, but it's critical to keep maintaining it. */
3770 if (is_gimple_call (STMT_VINFO_STMT (rep)))
3771 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
3772 {
3773 case CFN_COMPLEX_ADD_ROT90:
3774 case CFN_COMPLEX_ADD_ROT270:
3775 case CFN_COMPLEX_MUL:
3776 case CFN_COMPLEX_MUL_CONJ:
3777 case CFN_VEC_ADDSUB:
3778 case CFN_VEC_FMADDSUB:
3779 case CFN_VEC_FMSUBADD:
3780 vertices[idx].perm_in = 0;
3781 vertices[idx].perm_out = 0;
3782 default:;
3783 }
3784 }
3785
3786 if (!slpg->vertices[idx].succ)
3787 /* Pick up pre-computed leaf values. */
3788 ;
3789 else
3790 {
3791 bool any_succ_perm_out_m1 = false;
3792 int perm_in = vertices[idx].perm_in;
3793 for (graph_edge *succ = slpg->vertices[idx].succ;
3794 succ; succ = succ->succ_next)
3795 {
3796 int succ_idx = succ->dest;
3797 int succ_perm = vertices[succ_idx].perm_out;
3798 /* Handle unvisited (and constant) nodes optimistically. */
3799 /* ??? But for constants once we want to handle
3800 non-bijective permutes we have to verify the permute,
3801 when unifying lanes, will not unify different constants.
3802 For example see gcc.dg/vect/bb-slp-14.c for a case
3803 that would break. */
3804 if (succ_perm == -1)
3805 {
3806 /* When we handled a non-leaf optimistically, note
3807 that so we can adjust its outgoing permute below. */
3808 slp_tree succ_node = vertices[succ_idx].node;
3809 if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3810 && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3811 any_succ_perm_out_m1 = true;
3812 continue;
3813 }
3814 if (perm_in == -1)
3815 perm_in = succ_perm;
3816 else if (succ_perm == 0
3817 || !vect_slp_perms_eq (perms, perm_in, succ_perm))
3818 {
3819 perm_in = 0;
3820 break;
3821 }
3822 }
3823
3824 /* Adjust any incoming permutes we treated optimistically. */
3825 if (perm_in != -1 && any_succ_perm_out_m1)
3826 {
3827 for (graph_edge *succ = slpg->vertices[idx].succ;
3828 succ; succ = succ->succ_next)
3829 {
3830 slp_tree succ_node = vertices[succ->dest].node;
3831 if (vertices[succ->dest].perm_out == -1
3832 && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3833 && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3834 {
3835 vertices[succ->dest].perm_out = perm_in;
3836 /* And ensure this propagates. */
3837 if (vertices[succ->dest].perm_in == -1)
3838 vertices[succ->dest].perm_in = perm_in;
3839 }
3840 }
3841 changed = true;
3842 }
3843
3844 if (!vect_slp_perms_eq (perms, perm_in,
3845 vertices[idx].perm_in))
3846 {
3847 /* Make sure we eventually converge. */
3848 gcc_checking_assert (vertices[idx].perm_in == -1
3849 || perm_in == 0);
3850 vertices[idx].perm_in = perm_in;
3851
3852 /* While we can handle VEC_PERM nodes as transparent
3853 pass-through they can also be a cheap materialization
3854 point. In addition they can act as the source
3855 of an arbitrary permutation.
3856 The following ensures that former materialization
3857 points that now have zero incoming permutes no
3858 longer appear as such and that former "any" permutes
3859 get pass-through. We keep VEC_PERM nodes optimistic
3860 as "any" outgoing permute though. */
3861 if (vertices[idx].perm_out != 0
3862 && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3863 vertices[idx].perm_out = perm_in;
3864 changed = true;
3865 }
3866 }
3867
3868 /* Elide pruning at materialization points in the first
3869 iteration phase. */
3870 if (!do_materialization)
3871 continue;
3872
3873 int perm = vertices[idx].perm_out;
3874 if (perm == 0 || perm == -1)
3875 continue;
3876
3877 /* Decide on permute materialization. Look whether there's
3878 a use (pred) edge that is permuted differently than us.
3879 In that case mark ourselves so the permutation is applied. */
3880 bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
3881 if (all_preds_permuted)
3882 for (graph_edge *pred = slpg->vertices[idx].pred;
3883 pred; pred = pred->pred_next)
3884 {
3885 int pred_perm = vertices[pred->src].perm_in;
3886 gcc_checking_assert (pred_perm != -1);
3887 if (!vect_slp_perms_eq (perms, perm, pred_perm))
3888 {
3889 all_preds_permuted = false;
3890 break;
3891 }
3892 }
3893 if (!all_preds_permuted)
3894 {
3895 vertices[idx].perm_out = 0;
3896 changed = true;
3897 }
3898 }
3899
3900 /* If the initial propagation converged, switch on materialization
3901 and re-propagate. */
3902 if (!changed && !do_materialization)
3903 {
3904 do_materialization = true;
3905 changed = true;
3906 }
3907 }
3908 while (changed);
3909 statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration);
3910
3911 /* Materialize. */
3912 for (i = 0; i < vertices.length (); ++i)
3913 {
3914 int perm_in = vertices[i].perm_in;
3915 slp_tree node = vertices[i].node;
3916
3917 /* First permute invariant/external original successors; we handle
3918 those optimistically during propagation and duplicate them if
3919 they are used with different permutations. */
3920 unsigned j;
3921 slp_tree child;
3922 if (perm_in > 0)
3923 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3924 {
3925 if (!child
3926 || (SLP_TREE_DEF_TYPE (child) != vect_constant_def
3927 && SLP_TREE_DEF_TYPE (child) != vect_external_def))
3928 continue;
3929
3930 /* If the vector is uniform there's nothing to do. */
3931 if (vect_slp_tree_uniform_p (child))
3932 continue;
3933
3934 /* We can end up sharing some externals via two_operator
3935 handling. Be prepared to unshare those. */
3936 if (child->refcnt != 1)
3937 {
3938 gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3939 SLP_TREE_CHILDREN (node)[j] = child
3940 = vect_create_new_slp_node
3941 (SLP_TREE_SCALAR_OPS (child).copy ());
3942 }
3943 vect_slp_permute (perms[perm_in],
3944 SLP_TREE_SCALAR_OPS (child), true);
3945 }
3946
3947 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3948 {
3949 /* Apply the common permutes to the input vectors. */
3950 if (perm_in > 0)
3951 {
3952 /* If the node is already a permute node we can apply
3953 the permutation to the lane selection, effectively
3954 materializing it on the incoming vectors. */
3955 if (dump_enabled_p ())
3956 dump_printf_loc (MSG_NOTE, vect_location,
3957 "simplifying permute node %p\n",
3958 node);
3959 for (unsigned k = 0;
3960 k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3961 SLP_TREE_LANE_PERMUTATION (node)[k].second
3962 = perms[perm_in][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3963 }
3964 /* Apply the anticipated output permute to the permute and
3965 stmt vectors. */
3966 int perm_out = vertices[i].perm_out;
3967 if (perm_out > 0)
3968 {
3969 vect_slp_permute (perms[perm_out],
3970 SLP_TREE_SCALAR_STMTS (node), true);
3971 vect_slp_permute (perms[perm_out],
3972 SLP_TREE_LANE_PERMUTATION (node), true);
3973 }
3974 }
3975 else if (vertices[i].get_perm_materialized () != 0)
3976 {
3977 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3978 /* For loads simply drop the permutation; the load permutation
3979 already performs the desired permutation. */
3980 ;
3981 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3982 gcc_unreachable ();
3983 else
3984 {
3985 if (dump_enabled_p ())
3986 dump_printf_loc (MSG_NOTE, vect_location,
3987 "inserting permute node in place of %p\n",
3988 node);
3989
3990 /* Make a copy of NODE and in-place change it to a
3991 VEC_PERM node to permute the lanes of the copy. */
3992 slp_tree copy = new _slp_tree;
3993 SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3994 SLP_TREE_CHILDREN (node) = vNULL;
3995 SLP_TREE_SCALAR_STMTS (copy)
3996 = SLP_TREE_SCALAR_STMTS (node).copy ();
3997 vect_slp_permute (perms[perm_in],
3998 SLP_TREE_SCALAR_STMTS (copy), true);
3999 gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
4000 SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
4001 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
4002 SLP_TREE_LANE_PERMUTATION (copy)
4003 = SLP_TREE_LANE_PERMUTATION (node);
4004 SLP_TREE_LANE_PERMUTATION (node) = vNULL;
4005 SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
4006 copy->refcnt = 1;
4007 copy->max_nunits = node->max_nunits;
4008 SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
4009 SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
4010 SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
4011
4012 /* Now turn NODE into a VEC_PERM. */
4013 SLP_TREE_CHILDREN (node).safe_push (copy);
4014 SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
4015 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4016 SLP_TREE_LANE_PERMUTATION (node)
4017 .quick_push (std::make_pair (0, perms[perm_in][j]));
4018 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
4019 }
4020 }
4021 else if (perm_in > 0) /* perm_in == perm_out */
4022 {
4023 /* Apply the reverse permutation to our stmts. */
4024 vect_slp_permute (perms[perm_in],
4025 SLP_TREE_SCALAR_STMTS (node), true);
4026 /* And to the lane/load permutation, which we can simply
4027 make regular by design. */
4028 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4029 {
4030 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
4031 /* ??? When we handle non-bijective permutes the idea
4032 is that we can force the load-permutation to be
4033 { min, min + 1, min + 2, ... max }. But then the
4034 scalar defs might no longer match the lane content
4035 which means wrong-code with live lane vectorization.
4036 So we possibly have to have NULL entries for those. */
4037 vect_slp_permute (perms[perm_in],
4038 SLP_TREE_LOAD_PERMUTATION (node), true);
4039 }
4040 else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
4041 gcc_unreachable ();
4042 }
4043 }
4044
4045 /* Elide any permutations at BB reduction roots. */
4046 if (is_a <bb_vec_info> (vinfo))
4047 {
4048 for (slp_instance instance : vinfo->slp_instances)
4049 {
4050 if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
4051 continue;
4052 slp_tree old = SLP_INSTANCE_TREE (instance);
4053 if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
4054 && SLP_TREE_CHILDREN (old).length () == 1)
4055 {
4056 slp_tree child = SLP_TREE_CHILDREN (old)[0];
4057 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
4058 {
4059 /* Preserve the special VEC_PERM we use to shield existing
4060 vector defs from the rest. But make it a no-op. */
4061 unsigned i = 0;
4062 for (std::pair<unsigned, unsigned> &p
4063 : SLP_TREE_LANE_PERMUTATION (old))
4064 p.second = i++;
4065 }
4066 else
4067 {
4068 SLP_INSTANCE_TREE (instance) = child;
4069 SLP_TREE_REF_COUNT (child)++;
4070 vect_free_slp_tree (old);
4071 }
4072 }
4073 else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
4074 && SLP_TREE_REF_COUNT (old) == 1
4075 && vertices[old->vertex].get_perm_materialized () != 0)
4076 {
4077 /* ??? For loads the situation is more complex since
4078 we can't modify the permute in place in case the
4079 node is used multiple times. In fact for loads this
4080 should be somehow handled in the propagation engine. */
4081 /* Apply the reverse permutation to our stmts. */
4082 int perm = vertices[old->vertex].get_perm_materialized ();
4083 vect_slp_permute (perms[perm],
4084 SLP_TREE_SCALAR_STMTS (old), true);
4085 vect_slp_permute (perms[perm],
4086 SLP_TREE_LOAD_PERMUTATION (old), true);
4087 }
4088 }
4089 }
4090
4091 /* Free the perms vector used for propagation. */
4092 while (!perms.is_empty ())
4093 perms.pop ().release ();
4094 free_graph (slpg);
4095
4096
4097 /* Now elide load permutations that are not necessary. */
4098 for (i = 0; i < leafs.length (); ++i)
4099 {
4100 node = vertices[leafs[i]].node;
4101 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
4102 continue;
4103
4104 /* In basic block vectorization we allow any subchain of an interleaving
4105 chain.
4106 FORNOW: not in loop SLP because of realignment complications. */
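      /* For example, a node loading lanes { 2, 3 } of a four-lane
         interleaving chain is such a subchain: each scalar load is
         followed by the next group element with a gap of one, so the
         load permutation can be dropped.  */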
4107 if (is_a <bb_vec_info> (vinfo))
4108 {
4109 bool subchain_p = true;
4110 stmt_vec_info next_load_info = NULL;
4111 stmt_vec_info load_info;
4112 unsigned j;
4113 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4114 {
4115 if (j != 0
4116 && (next_load_info != load_info
4117 || DR_GROUP_GAP (load_info) != 1))
4118 {
4119 subchain_p = false;
4120 break;
4121 }
4122 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
4123 }
4124 if (subchain_p)
4125 {
4126 SLP_TREE_LOAD_PERMUTATION (node).release ();
4127 continue;
4128 }
4129 }
4130 else
4131 {
4132 stmt_vec_info load_info;
4133 bool this_load_permuted = false;
4134 unsigned j;
4135 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4136 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
4137 {
4138 this_load_permuted = true;
4139 break;
4140 }
4141 stmt_vec_info first_stmt_info
4142 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
4143 if (!this_load_permuted
4144 /* The load requires permutation when unrolling exposes
4145 a gap either because the group is larger than the SLP
4146 group-size or because there is a gap between the groups. */
4147 && (known_eq (LOOP_VINFO_VECT_FACTOR
4148 (as_a <loop_vec_info> (vinfo)), 1U)
4149 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
4150 && DR_GROUP_GAP (first_stmt_info) == 0)))
4151 {
4152 SLP_TREE_LOAD_PERMUTATION (node).release ();
4153 continue;
4154 }
4155 }
4156 }
4157 }
4158
4159 /* Gather loads reachable from the individual SLP graph entries. */
4160
4161 void
4162 vect_gather_slp_loads (vec_info *vinfo)
4163 {
4164 unsigned i;
4165 slp_instance instance;
4166 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
4167 {
4168 hash_set<slp_tree> visited;
4169 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
4170 SLP_INSTANCE_TREE (instance), visited);
4171 }
4172 }
4173
4174
4175 /* For each possible SLP instance decide whether to SLP it and calculate overall
4176 unrolling factor needed to SLP the loop. Return TRUE if we decided to SLP at
4177 least one instance. */
4178
4179 bool
4180 vect_make_slp_decision (loop_vec_info loop_vinfo)
4181 {
4182 unsigned int i;
4183 poly_uint64 unrolling_factor = 1;
4184 const vec<slp_instance> &slp_instances
4185 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
4186 slp_instance instance;
4187 int decided_to_slp = 0;
4188
4189 DUMP_VECT_SCOPE ("vect_make_slp_decision");
4190
4191 FOR_EACH_VEC_ELT (slp_instances, i, instance)
4192 {
4193 /* FORNOW: SLP if you can. */
4194 /* All unroll factors have the form:
4195
4196 GET_MODE_SIZE (vinfo->vector_mode) * X
4197
4198 for some rational X, so they must have a common multiple. */
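      /* As a worked illustration (numbers hypothetical): an instance with
         group size two using four-lane vectors has unrolling factor two;
         combined with an instance of factor four, the common multiple, and
         thus the loop's SLP unrolling factor, is four.  */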
4199 unrolling_factor
4200 = force_common_multiple (unrolling_factor,
4201 SLP_INSTANCE_UNROLLING_FACTOR (instance));
4202
4203 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
4204 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
4205 loop-based vectorization. Such stmts will be marked as HYBRID. */
4206 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4207 decided_to_slp++;
4208 }
4209
4210 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
4211
4212 if (decided_to_slp && dump_enabled_p ())
4213 {
4214 dump_printf_loc (MSG_NOTE, vect_location,
4215 "Decided to SLP %d instances. Unrolling factor ",
4216 decided_to_slp);
4217 dump_dec (MSG_NOTE, unrolling_factor);
4218 dump_printf (MSG_NOTE, "\n");
4219 }
4220
4221 return (decided_to_slp > 0);
4222 }
4223
4224 /* Private data for vect_detect_hybrid_slp. */
4225 struct vdhs_data
4226 {
4227 loop_vec_info loop_vinfo;
4228 vec<stmt_vec_info> *worklist;
4229 };
4230
4231 /* Walker for walk_gimple_op. */
4232
4233 static tree
4234 vect_detect_hybrid_slp (tree *tp, int *, void *data)
4235 {
4236 walk_stmt_info *wi = (walk_stmt_info *)data;
4237 vdhs_data *dat = (vdhs_data *)wi->info;
4238
4239 if (wi->is_lhs)
4240 return NULL_TREE;
4241
4242 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
4243 if (!def_stmt_info)
4244 return NULL_TREE;
4245 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
4246 if (PURE_SLP_STMT (def_stmt_info))
4247 {
4248 if (dump_enabled_p ())
4249 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
4250 def_stmt_info->stmt);
4251 STMT_SLP_TYPE (def_stmt_info) = hybrid;
4252 dat->worklist->safe_push (def_stmt_info);
4253 }
4254
4255 return NULL_TREE;
4256 }
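/* For instance (hypothetical scenario): if a PURE_SLP multiplication also
   feeds an address computation that is only loop-vectorized, the walker
   above re-marks the multiplication as hybrid so it is vectorized both as
   part of the SLP graph and for the loop-based use.  */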
4257
4258 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
4259 if so; otherwise push it to WORKLIST. */
4260
4261 static void
4262 maybe_push_to_hybrid_worklist (vec_info *vinfo,
4263 vec<stmt_vec_info> &worklist,
4264 stmt_vec_info stmt_info)
4265 {
4266 if (dump_enabled_p ())
4267 dump_printf_loc (MSG_NOTE, vect_location,
4268 "Processing hybrid candidate : %G", stmt_info->stmt);
4269 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
4270 imm_use_iterator iter2;
4271 ssa_op_iter iter1;
4272 use_operand_p use_p;
4273 def_operand_p def_p;
4274 bool any_def = false;
4275 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
4276 {
4277 any_def = true;
4278 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
4279 {
4280 if (is_gimple_debug (USE_STMT (use_p)))
4281 continue;
4282 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
4283 /* An out-of-loop use means this is a loop_vect sink. */
4284 if (!use_info)
4285 {
4286 if (dump_enabled_p ())
4287 dump_printf_loc (MSG_NOTE, vect_location,
4288 "Found loop_vect sink: %G", stmt_info->stmt);
4289 worklist.safe_push (stmt_info);
4290 return;
4291 }
4292 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
4293 {
4294 if (dump_enabled_p ())
4295 dump_printf_loc (MSG_NOTE, vect_location,
4296 "Found loop_vect use: %G", use_info->stmt);
4297 worklist.safe_push (stmt_info);
4298 return;
4299 }
4300 }
4301 }
4302 /* No def means this is a loop_vect sink. */
4303 if (!any_def)
4304 {
4305 if (dump_enabled_p ())
4306 dump_printf_loc (MSG_NOTE, vect_location,
4307 "Found loop_vect sink: %G", stmt_info->stmt);
4308 worklist.safe_push (stmt_info);
4309 return;
4310 }
4311 if (dump_enabled_p ())
4312 dump_printf_loc (MSG_NOTE, vect_location,
4313 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
4314 STMT_SLP_TYPE (stmt_info) = pure_slp;
4315 }
4316
4317 /* Find stmts that must be both vectorized and SLPed. */
4318
4319 void
4320 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
4321 {
4322 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
4323
4324 /* All stmts participating in SLP are marked pure_slp, all other
4325 stmts are loop_vect.
4326 First collect all loop_vect stmts into a worklist.
4327 SLP patterns cause not all original scalar stmts to appear in
4328 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
4329 Rectify this here by doing a backward walk over the IL, considering
4330 stmts as loop_vect only when they are used by a loop_vect stmt, and
4331 otherwise marking them as pure_slp. */
4332 auto_vec<stmt_vec_info> worklist;
4333 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
4334 {
4335 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
4336 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
4337 gsi_next (&gsi))
4338 {
4339 gphi *phi = gsi.phi ();
4340 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
4341 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4342 maybe_push_to_hybrid_worklist (loop_vinfo,
4343 worklist, stmt_info);
4344 }
4345 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
4346 gsi_prev (&gsi))
4347 {
4348 gimple *stmt = gsi_stmt (gsi);
4349 if (is_gimple_debug (stmt))
4350 continue;
4351 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
4352 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
4353 {
4354 for (gimple_stmt_iterator gsi2
4355 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
4356 !gsi_end_p (gsi2); gsi_next (&gsi2))
4357 {
4358 stmt_vec_info patt_info
4359 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
4360 if (!STMT_SLP_TYPE (patt_info)
4361 && STMT_VINFO_RELEVANT (patt_info))
4362 maybe_push_to_hybrid_worklist (loop_vinfo,
4363 worklist, patt_info);
4364 }
4365 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4366 }
4367 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4368 maybe_push_to_hybrid_worklist (loop_vinfo,
4369 worklist, stmt_info);
4370 }
4371 }
4372
4373 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
4374 mark any SLP vectorized stmt as hybrid.
4375 ??? We're visiting def stmts N times (once for each non-SLP and
4376 once for each hybrid-SLP use). */
4377 walk_stmt_info wi;
4378 vdhs_data dat;
4379 dat.worklist = &worklist;
4380 dat.loop_vinfo = loop_vinfo;
4381 memset (&wi, 0, sizeof (wi));
4382 wi.info = (void *)&dat;
4383 while (!worklist.is_empty ())
4384 {
4385 stmt_vec_info stmt_info = worklist.pop ();
4386 /* Since SSA operands are not set up for pattern stmts we need
4387 to use walk_gimple_op. */
4388 wi.is_lhs = 0;
4389 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
4390 /* For gather/scatter make sure to walk the offset operand, which
4391 can be a scaling and conversion away. */
4392 gather_scatter_info gs_info;
4393 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
4394 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
4395 {
4396 int dummy;
4397 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
4398 }
4399 }
4400 }
4401
4402
4403 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
4404
4405 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
4406 : vec_info (vec_info::bb, shared),
4407 bbs (_bbs),
4408 roots (vNULL)
4409 {
4410 for (unsigned i = 0; i < bbs.length (); ++i)
4411 {
4412 if (i != 0)
4413 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4414 gsi_next (&si))
4415 {
4416 gphi *phi = si.phi ();
4417 gimple_set_uid (phi, 0);
4418 add_stmt (phi);
4419 }
4420 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4421 !gsi_end_p (gsi); gsi_next (&gsi))
4422 {
4423 gimple *stmt = gsi_stmt (gsi);
4424 gimple_set_uid (stmt, 0);
4425 if (is_gimple_debug (stmt))
4426 continue;
4427 add_stmt (stmt);
4428 }
4429 }
4430 }
4431
4432
4433 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
4434 stmts in the basic block. */
4435
4436 _bb_vec_info::~_bb_vec_info ()
4437 {
4438 /* Reset region marker. */
4439 for (unsigned i = 0; i < bbs.length (); ++i)
4440 {
4441 if (i != 0)
4442 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4443 gsi_next (&si))
4444 {
4445 gphi *phi = si.phi ();
4446 gimple_set_uid (phi, -1);
4447 }
4448 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4449 !gsi_end_p (gsi); gsi_next (&gsi))
4450 {
4451 gimple *stmt = gsi_stmt (gsi);
4452 gimple_set_uid (stmt, -1);
4453 }
4454 }
4455
4456 for (unsigned i = 0; i < roots.length (); ++i)
4457 {
4458 roots[i].stmts.release ();
4459 roots[i].roots.release ();
4460 }
4461 roots.release ();
4462 }
4463
4464 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
4465 given that child nodes have already been processed, and that
4466 their def types currently match their SLP node's def type. */
4467
4468 static bool
4469 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
4470 slp_instance node_instance,
4471 stmt_vector_for_cost *cost_vec)
4472 {
4473 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4474
4475 /* Calculate the number of vector statements to be created for the
4476 scalar stmts in this node. For SLP reductions it is equal to the
4477 number of vector statements in the children (which has already been
4478 calculated by the recursive call). Otherwise it is the number of
4479 scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
4480 VF divided by the number of elements in a vector. */
4481 if (!STMT_VINFO_DATA_REF (stmt_info)
4482 && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
4483 {
4484 for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
4485 if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
4486 {
4487 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4488 = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
4489 break;
4490 }
4491 }
4492 else
4493 {
4494 poly_uint64 vf;
4495 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4496 vf = loop_vinfo->vectorization_factor;
4497 else
4498 vf = 1;
4499 unsigned int group_size = SLP_TREE_LANES (node);
4500 tree vectype = SLP_TREE_VECTYPE (node);
4501 SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4502 = vect_get_num_vectors (vf * group_size, vectype);
4503 }
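  /* E.g. (illustrative numbers): with a vectorization factor of four,
     two lanes and V4SI vectors this computes (4 * 2) / 4 == 2 vector
     statements.  */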
4504
4505 /* Handle purely internal nodes. */
4506 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4507 {
4508 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
4509 return false;
4510
4511 stmt_vec_info slp_stmt_info;
4512 unsigned int i;
4513 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
4514 {
4515 if (STMT_VINFO_LIVE_P (slp_stmt_info)
4516 && !vectorizable_live_operation (vinfo,
4517 slp_stmt_info, NULL, node,
4518 node_instance, i,
4519 false, cost_vec))
4520 return false;
4521 }
4522 return true;
4523 }
4524
4525 bool dummy;
4526 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
4527 node, node_instance, cost_vec);
4528 }
4529
4530 /* Try to build NODE from scalars, returning true on success.
4531 NODE_INSTANCE is the SLP instance that contains NODE. */
4532
4533 static bool
4534 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
4535 slp_instance node_instance)
4536 {
4537 stmt_vec_info stmt_info;
4538 unsigned int i;
4539
4540 if (!is_a <bb_vec_info> (vinfo)
4541 || node == SLP_INSTANCE_TREE (node_instance)
4542 || !SLP_TREE_SCALAR_STMTS (node).exists ()
4543 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
4544 return false;
4545
4546 if (dump_enabled_p ())
4547 dump_printf_loc (MSG_NOTE, vect_location,
4548 "Building vector operands of %p from scalars instead\n", node);
4549
4550 /* Don't remove and free the child nodes here, since they could be
4551 referenced by other structures. The analysis and scheduling phases
4552 (need to) ignore child nodes of anything that isn't vect_internal_def. */
4553 unsigned int group_size = SLP_TREE_LANES (node);
4554 SLP_TREE_DEF_TYPE (node) = vect_external_def;
4555 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
4556 SLP_TREE_LOAD_PERMUTATION (node).release ();
4557 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4558 {
4559 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
4560 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
4561 }
4562 return true;
4563 }
4564
4565 /* Return true if all elements of the slice are the same. */
4566 bool
4567 vect_scalar_ops_slice::all_same_p () const
4568 {
4569 for (unsigned int i = 1; i < length; ++i)
4570 if (!operand_equal_p (op (0), op (i)))
4571 return false;
4572 return true;
4573 }
4574
4575 hashval_t
4576 vect_scalar_ops_slice_hash::hash (const value_type &s)
4577 {
4578 hashval_t hash = 0;
4579 for (unsigned i = 0; i < s.length; ++i)
4580 hash = iterative_hash_expr (s.op (i), hash);
4581 return hash;
4582 }
4583
4584 bool
4585 vect_scalar_ops_slice_hash::equal (const value_type &s1,
4586 const compare_type &s2)
4587 {
4588 if (s1.length != s2.length)
4589 return false;
4590 for (unsigned i = 0; i < s1.length; ++i)
4591 if (!operand_equal_p (s1.op (i), s2.op (i)))
4592 return false;
4593 return true;
4594 }
4595
4596 /* Compute the prologue cost for invariant or constant operands represented
4597 by NODE. */
4598
4599 static void
4600 vect_prologue_cost_for_slp (slp_tree node,
4601 stmt_vector_for_cost *cost_vec)
4602 {
4603 /* There's a special case of an existing vector, which costs nothing. */
4604 if (SLP_TREE_SCALAR_OPS (node).length () == 0
4605 && !SLP_TREE_VEC_DEFS (node).is_empty ())
4606 return;
4607 /* Without looking at the actual initializer a vector of
4608 constants can be implemented as load from the constant pool.
4609 When all elements are the same we can use a splat. */
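  /* For example (illustrative numbers): a group of three scalar ops
     filling V4SI vectors has const_nunits 4, which is not a multiple of
     the group size, so each vector's four-element slice is hashed below
     and only distinct slices are costed.  With a group size of two every
     vector repeats the whole group and a single construction cost
     suffices.  */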
4610 tree vectype = SLP_TREE_VECTYPE (node);
4611 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
4612 unsigned HOST_WIDE_INT const_nunits;
4613 unsigned nelt_limit;
4614 auto ops = &SLP_TREE_SCALAR_OPS (node);
4615 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4616 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
4617 && ! multiple_p (const_nunits, group_size))
4618 {
4619 nelt_limit = const_nunits;
4620 hash_set<vect_scalar_ops_slice_hash> vector_ops;
4621 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
4622 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
4623 starts.quick_push (i * const_nunits);
4624 }
4625 else
4626 {
4627 /* If either the vector has variable length or the vectors
4628 are composed of repeated whole groups we only need to
4629 cost construction once. All vectors will be the same. */
4630 nelt_limit = group_size;
4631 starts.quick_push (0);
4632 }
4633 /* ??? We're just tracking whether vectors in a single node are the same.
4634 Ideally we'd do something more global. */
4635 for (unsigned int start : starts)
4636 {
4637 vect_cost_for_stmt kind;
4638 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
4639 kind = vector_load;
4640 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
4641 kind = scalar_to_vec;
4642 else
4643 kind = vec_construct;
4644 record_stmt_cost (cost_vec, 1, kind, node, vectype, 0, vect_prologue);
4645 }
4646 }
4647
4648 /* Analyze statements contained in SLP tree NODE after recursively analyzing
4649 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
4650
4651 Return true if the operations are supported. */
4652
4653 static bool
4654 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
4655 slp_instance node_instance,
4656 hash_set<slp_tree> &visited_set,
4657 vec<slp_tree> &visited_vec,
4658 stmt_vector_for_cost *cost_vec)
4659 {
4660 int i, j;
4661 slp_tree child;
4662
4663 /* Assume we can code-generate all invariants. */
4664 if (!node
4665 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
4666 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4667 return true;
4668
4669 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
4670 {
4671 if (dump_enabled_p ())
4672 dump_printf_loc (MSG_NOTE, vect_location,
4673 "Failed cyclic SLP reference in %p\n", node);
4674 return false;
4675 }
4676 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
4677
4678 /* If we already analyzed the exact same set of scalar stmts we're done.
4679 We share the generated vector stmts for those. */
4680 if (visited_set.add (node))
4681 return true;
4682 visited_vec.safe_push (node);
4683
4684 bool res = true;
4685 unsigned visited_rec_start = visited_vec.length ();
4686 unsigned cost_vec_rec_start = cost_vec->length ();
4687 bool seen_non_constant_child = false;
4688 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4689 {
4690 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
4691 visited_set, visited_vec,
4692 cost_vec);
4693 if (!res)
4694 break;
4695 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
4696 seen_non_constant_child = true;
4697 }
4698 /* We're having difficulties scheduling nodes with just constant
4699 operands and no scalar stmts since we then cannot compute a stmt
4700 insertion place. */
4701 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
4702 {
4703 if (dump_enabled_p ())
4704 dump_printf_loc (MSG_NOTE, vect_location,
4705 "Cannot vectorize all-constant op node %p\n", node);
4706 res = false;
4707 }
4708
4709 if (res)
4710 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
4711 cost_vec);
4712 /* If analysis failed we have to pop all recursive visited nodes
4713 plus ourselves. */
4714 if (!res)
4715 {
4716 while (visited_vec.length () >= visited_rec_start)
4717 visited_set.remove (visited_vec.pop ());
4718 cost_vec->truncate (cost_vec_rec_start);
4719 }
4720
4721 /* When the node can be vectorized, cost invariant nodes it references.
4722 This is not done in DFS order to allow the referring node's
4723 vectorizable_* calls to nail down the invariant nodes' vector type
4724 and possibly unshare it if it needs a different vector type than
4725 other referrers. */
4726 if (res)
4727 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
4728 if (child
4729 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
4730 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
4731 /* Perform usual caching; note code-generation still
4732 code-gens these nodes multiple times but we expect
4733 to CSE them later. */
4734 && !visited_set.add (child))
4735 {
4736 visited_vec.safe_push (child);
4737 /* ??? After auditing more code paths make a "default"
4738 and push the vector type from NODE to all children
4739 if it is not already set. */
4740 /* Compute the number of vectors to be generated. */
4741 tree vector_type = SLP_TREE_VECTYPE (child);
4742 if (!vector_type)
4743 {
4744 /* For shifts with a scalar argument we don't need
4745 to cost or code-generate anything.
4746 ??? Represent this more explicitly. */
4747 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4748 == shift_vec_info_type)
4749 && j == 1);
4750 continue;
4751 }
4752 unsigned group_size = SLP_TREE_LANES (child);
4753 poly_uint64 vf = 1;
4754 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4755 vf = loop_vinfo->vectorization_factor;
4756 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
4757 = vect_get_num_vectors (vf * group_size, vector_type);
4758 /* And cost them. */
4759 vect_prologue_cost_for_slp (child, cost_vec);
4760 }
4761
4762 /* If this node or any of its children can't be vectorized, try pruning
4763 the tree here rather than felling the whole thing. */
4764 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
4765 {
4766 /* We'll need to revisit this for invariant costing and for
4767 setting the number of vectorized stmts. */
4768 res = true;
4769 }
4770
4771 return res;
4772 }
4773
4774 /* Mark lanes of NODE that are live outside of the basic-block vectorized
4775 region and that can be vectorized using vectorizable_live_operation
4776 with STMT_VINFO_LIVE_P. Live operations that are not handled will cause
4777 the scalar code computing them to be retained. */
4778
4779 static void
4780 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
4781 slp_instance instance,
4782 stmt_vector_for_cost *cost_vec,
4783 hash_set<stmt_vec_info> &svisited,
4784 hash_set<slp_tree> &visited)
4785 {
4786 if (visited.add (node))
4787 return;
4788
4789 unsigned i;
4790 stmt_vec_info stmt_info;
4791 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
4792 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4793 {
4794 if (svisited.contains (stmt_info))
4795 continue;
4796 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4797 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
4798 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
4799 /* Only the pattern root stmt computes the original scalar value. */
4800 continue;
4801 bool mark_visited = true;
4802 gimple *orig_stmt = orig_stmt_info->stmt;
4803 ssa_op_iter op_iter;
4804 def_operand_p def_p;
4805 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4806 {
4807 imm_use_iterator use_iter;
4808 gimple *use_stmt;
4809 stmt_vec_info use_stmt_info;
4810 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4811 if (!is_gimple_debug (use_stmt))
4812 {
4813 use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
4814 if (!use_stmt_info
4815 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4816 {
4817 STMT_VINFO_LIVE_P (stmt_info) = true;
4818 if (vectorizable_live_operation (bb_vinfo, stmt_info,
4819 NULL, node, instance, i,
4820 false, cost_vec))
4821 /* ??? So we know we can vectorize the live stmt
4822 from one SLP node. If we cannot do so from all
4823 or none consistently we'd have to record which
4824 SLP node (and lane) we want to use for the live
4825 operation. So make sure we can code-generate
4826 from all nodes. */
4827 mark_visited = false;
4828 else
4829 STMT_VINFO_LIVE_P (stmt_info) = false;
4830 break;
4831 }
4832 }
4833 /* We have to verify whether we can insert the lane extract
4834 before all uses. The following is a conservative approximation.
4835 We cannot put this into vectorizable_live_operation because
4836 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4837 doesn't work.
4838 Note that while the fact that we emit code for loads at the
4839 first load should make this a non-problem, leafs we construct
4840 from scalars are vectorized after the last scalar def.
4841 ??? If we'd actually compute the insert location during
4842 analysis we could use sth less conservative than the last
4843 scalar stmt in the node for the dominance check. */
4844 /* ??? What remains is "live" uses in vector CTORs in the same
4845 SLP graph which is where those uses can end up code-generated
4846 right after their definition instead of close to their original
4847 use. But that would restrict us to code-generate lane-extracts
4848 from the latest stmt in a node. So we compensate for this
4849 during code-generation, simply not replacing uses for those
4850 hopefully rare cases. */
4851 if (STMT_VINFO_LIVE_P (stmt_info))
4852 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4853 if (!is_gimple_debug (use_stmt)
4854 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4855 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4856 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4857 {
4858 if (dump_enabled_p ())
4859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4860 "Cannot determine insertion place for "
4861 "lane extract\n");
4862 STMT_VINFO_LIVE_P (stmt_info) = false;
4863 mark_visited = true;
4864 }
4865 }
4866 if (mark_visited)
4867 svisited.add (stmt_info);
4868 }
4869
4870 slp_tree child;
4871 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4872 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4873 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4874 cost_vec, svisited, visited);
4875 }
4876
4877 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
4878
4879 static bool
4880 vectorizable_bb_reduc_epilogue (slp_instance instance,
4881 stmt_vector_for_cost *cost_vec)
4882 {
4883 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
4884 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
4885 if (reduc_code == MINUS_EXPR)
4886 reduc_code = PLUS_EXPR;
4887 internal_fn reduc_fn;
4888 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
4889 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
4890 || reduc_fn == IFN_LAST
4891 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
4892 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
4893 TREE_TYPE (vectype)))
4894 return false;
4895
4896 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
4897 cost log2 vector operations plus shuffles and one extraction. */
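  /* For instance, with V8SI vect_nunits_for_cost is 8, so we record
     floor_log2 (8) == 3 vector_stmt and three vec_perm costs plus a
     single vec_to_scalar cost for the final extraction.  */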
4898 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
4899 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
4900 vectype, 0, vect_body);
4901 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
4902 vectype, 0, vect_body);
4903 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
4904 vectype, 0, vect_body);
4905 return true;
4906 }
4907
4908 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
4909 and recurse to children. */
4910
4911 static void
4912 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
4913 hash_set<slp_tree> &visited)
4914 {
4915 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
4916 || visited.add (node))
4917 return;
4918
4919 stmt_vec_info stmt;
4920 unsigned i;
4921 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4922 roots.remove (vect_orig_stmt (stmt));
4923
4924 slp_tree child;
4925 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4926 if (child)
4927 vect_slp_prune_covered_roots (child, roots, visited);
4928 }
4929
4930 /* Analyze statements in SLP instances of VINFO. Return true if the
4931 operations are supported. */
4932
4933 bool
4934 vect_slp_analyze_operations (vec_info *vinfo)
4935 {
4936 slp_instance instance;
4937 int i;
4938
4939 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4940
4941 hash_set<slp_tree> visited;
4942 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4943 {
4944 auto_vec<slp_tree> visited_vec;
4945 stmt_vector_for_cost cost_vec;
4946 cost_vec.create (2);
4947 if (is_a <bb_vec_info> (vinfo))
4948 vect_location = instance->location ();
4949 if (!vect_slp_analyze_node_operations (vinfo,
4950 SLP_INSTANCE_TREE (instance),
4951 instance, visited, visited_vec,
4952 &cost_vec)
4953 /* CTOR instances require vectorized defs for the SLP tree root. */
4954 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
4955 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
4956 != vect_internal_def
4957 /* Make sure we vectorized with the expected type. */
4958 || !useless_type_conversion_p
4959 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
4960 (instance->root_stmts[0]->stmt))),
4961 TREE_TYPE (SLP_TREE_VECTYPE
4962 (SLP_INSTANCE_TREE (instance))))))
4963 /* Check we can vectorize the reduction. */
4964 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
4965 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
4966 {
4967 slp_tree node = SLP_INSTANCE_TREE (instance);
4968 stmt_vec_info stmt_info;
4969 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
4970 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
4971 else
4972 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4973 if (dump_enabled_p ())
4974 dump_printf_loc (MSG_NOTE, vect_location,
4975 "removing SLP instance operations starting from: %G",
4976 stmt_info->stmt);
4977 vect_free_slp_instance (instance);
4978 vinfo->slp_instances.ordered_remove (i);
4979 cost_vec.release ();
4980 while (!visited_vec.is_empty ())
4981 visited.remove (visited_vec.pop ());
4982 }
4983 else
4984 {
4985 i++;
4986 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
4987 {
4988 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
4989 cost_vec.release ();
4990 }
4991 else
4992 /* For BB vectorization remember the SLP graph entry
4993 cost for later. */
4994 instance->cost_vec = cost_vec;
4995 }
4996 }
4997
4998 /* Now look for SLP instances with a root that are covered by other
4999 instances and remove them. */
5000 hash_set<stmt_vec_info> roots;
5001 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5002 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5003 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
5004 if (!roots.is_empty ())
5005 {
5006 visited.empty ();
5007 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5008 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
5009 visited);
5010 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
5011 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
5012 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
5013 {
5014 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
5015 if (dump_enabled_p ())
5016 dump_printf_loc (MSG_NOTE, vect_location,
5017 "removing SLP instance operations starting "
5018 "from: %G", root->stmt);
5019 vect_free_slp_instance (instance);
5020 vinfo->slp_instances.ordered_remove (i);
5021 }
5022 else
5023 ++i;
5024 }
5025
5026 /* Compute vectorizable live stmts. */
5027 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
5028 {
5029 hash_set<stmt_vec_info> svisited;
5030 hash_set<slp_tree> visited;
5031 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5032 {
5033 vect_location = instance->location ();
5034 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
5035 instance, &instance->cost_vec, svisited,
5036 visited);
5037 }
5038 }
5039
5040 return !vinfo->slp_instances.is_empty ();
5041 }
5042
/* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
   transitively collapsing the chain of leader entries along the way.  */
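/* For example, with leader entries A -> B, B -> C and C -> C, calling this
   for A returns C and rewrites the entries for A and B to point directly
   at C, i.e. union-find style path compression.  */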
5045
5046 static slp_instance
get_ultimate_leader (slp_instance instance,
		     hash_map<slp_instance, slp_instance> &instance_leader)
5049 {
5050 auto_vec<slp_instance *, 8> chain;
5051 slp_instance *tem;
5052 while (*(tem = instance_leader.get (instance)) != instance)
5053 {
5054 chain.safe_push (tem);
5055 instance = *tem;
5056 }
5057 while (!chain.is_empty ())
5058 *chain.pop () = instance;
5059 return instance;
5060 }
5061
5062 /* Worker of vect_bb_partition_graph, recurse on NODE. */
5063
5064 static void
vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
			   slp_instance instance, slp_tree node,
			   hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
			   hash_map<slp_instance, slp_instance> &instance_leader,
			   hash_set<slp_tree> &visited)
5070 {
5071 stmt_vec_info stmt_info;
5072 unsigned i;
5073
5074 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5075 {
5076 bool existed_p;
5077 slp_instance &stmt_instance
5078 = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
5079 if (!existed_p)
5080 ;
5081 else if (stmt_instance != instance)
5082 {
	  /* If we're running into a previously marked stmt make the
	     current instance the leader of the stmt's current ultimate
	     leader.  This keeps the leader chain acyclic and works even
	     when the current instance connects two previously independent
	     graph parts.  */
5087 slp_instance stmt_leader
5088 = get_ultimate_leader (stmt_instance, instance_leader);
5089 if (stmt_leader != instance)
5090 instance_leader.put (stmt_leader, instance);
5091 }
5092 stmt_instance = instance;
5093 }
5094
5095 if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
5096 return;
5097
5098 slp_tree child;
5099 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5100 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5101 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
5102 instance_leader, visited);
5103 }
5104
5105 /* Partition the SLP graph into pieces that can be costed independently. */
5106
5107 static void
vect_bb_partition_graph (bb_vec_info bb_vinfo)
5109 {
5110 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
5111
  /* First walk the SLP graph assigning each involved scalar stmt a
     corresponding SLP graph entry and upon visiting a previously
     marked stmt, make the stmt's leader the current SLP graph entry.  */
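  /* For example, when two instances share a scalar stmt, visiting that stmt
     again from the second instance links the first instance's ultimate
     leader to the second instance, so both end up in the same subgraph
     below.  */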
5115 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
5116 hash_map<slp_instance, slp_instance> instance_leader;
5117 hash_set<slp_tree> visited;
5118 slp_instance instance;
5119 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5120 {
5121 instance_leader.put (instance, instance);
5122 vect_bb_partition_graph_r (bb_vinfo,
5123 instance, SLP_INSTANCE_TREE (instance),
5124 stmt_to_instance, instance_leader,
5125 visited);
5126 }
5127
5128 /* Then collect entries to each independent subgraph. */
5129 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5130 {
5131 slp_instance leader = get_ultimate_leader (instance, instance_leader);
5132 leader->subgraph_entries.safe_push (instance);
5133 if (dump_enabled_p ()
5134 && leader != instance)
5135 dump_printf_loc (MSG_NOTE, vect_location,
5136 "instance %p is leader of %p\n",
5137 leader, instance);
5138 }
5139 }
5140
5141 /* Compute the set of scalar stmts participating in internal and external
5142 nodes. */
5143
5144 static void
vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
					 hash_set<slp_tree> &visited,
					 hash_set<stmt_vec_info> &vstmts,
					 hash_set<stmt_vec_info> &estmts)
5149 {
5150 int i;
5151 stmt_vec_info stmt_info;
5152 slp_tree child;
5153
5154 if (visited.add (node))
5155 return;
5156
5157 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
5158 {
5159 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5160 vstmts.add (stmt_info);
5161
5162 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5163 if (child)
5164 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
5165 vstmts, estmts);
5166 }
5167 else
5168 for (tree def : SLP_TREE_SCALAR_OPS (node))
5169 {
5170 stmt_vec_info def_stmt = vinfo->lookup_def (def);
5171 if (def_stmt)
5172 estmts.add (def_stmt);
5173 }
5174 }
5175
5176
/* Compute the scalar cost of the SLP node NODE and its children
   and record it in COST_VEC.  Do not account defs that are marked in
   LIFE and update LIFE according to uses of NODE.  */
5180
5181 static void
vect_bb_slp_scalar_cost (vec_info *vinfo,
			 slp_tree node, vec<bool, va_heap> *life,
			 stmt_vector_for_cost *cost_vec,
			 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
			 hash_set<slp_tree> &visited)
5187 {
5188 unsigned i;
5189 stmt_vec_info stmt_info;
5190 slp_tree child;
5191
5192 if (visited.add (node))
5193 return;
5194
5195 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5196 {
5197 ssa_op_iter op_iter;
5198 def_operand_p def_p;
5199
5200 if ((*life)[i])
5201 continue;
5202
5203 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5204 gimple *orig_stmt = orig_stmt_info->stmt;
5205
      /* If there is a non-vectorized use of the defs then the scalar
	 stmt is kept live, in which case we do not account it or any
	 required defs in the SLP children in the scalar cost.  This
	 way we make the vectorization look more costly compared to
	 the scalar cost.  */
5211 if (!STMT_VINFO_LIVE_P (stmt_info))
5212 {
5213 auto_vec<gimple *, 8> worklist;
5214 hash_set<gimple *> *worklist_visited = NULL;
5215 worklist.quick_push (orig_stmt);
5216 do
5217 {
5218 gimple *work_stmt = worklist.pop ();
5219 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
5220 {
5221 imm_use_iterator use_iter;
5222 gimple *use_stmt;
5223 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
5224 DEF_FROM_PTR (def_p))
5225 if (!is_gimple_debug (use_stmt))
5226 {
5227 stmt_vec_info use_stmt_info
5228 = vinfo->lookup_stmt (use_stmt);
5229 if (!use_stmt_info
5230 || !vectorized_scalar_stmts.contains (use_stmt_info))
5231 {
5232 if (use_stmt_info
5233 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
5234 {
5235 /* For stmts participating in patterns we have
5236 to check its uses recursively. */
5237 if (!worklist_visited)
5238 worklist_visited = new hash_set<gimple *> ();
5239 if (!worklist_visited->add (use_stmt))
5240 worklist.safe_push (use_stmt);
5241 continue;
5242 }
5243 (*life)[i] = true;
5244 goto next_lane;
5245 }
5246 }
5247 }
5248 }
5249 while (!worklist.is_empty ());
5250 next_lane:
5251 if (worklist_visited)
5252 delete worklist_visited;
5253 if ((*life)[i])
5254 continue;
5255 }
5256
5257 /* Count scalar stmts only once. */
5258 if (gimple_visited_p (orig_stmt))
5259 continue;
5260 gimple_set_visited (orig_stmt, true);
5261
5262 vect_cost_for_stmt kind;
5263 if (STMT_VINFO_DATA_REF (orig_stmt_info))
5264 {
5265 if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
5266 kind = scalar_load;
5267 else
5268 kind = scalar_store;
5269 }
5270 else if (vect_nop_conversion_p (orig_stmt_info))
5271 continue;
5272 /* For single-argument PHIs assume coalescing which means zero cost
5273 for the scalar and the vector PHIs. This avoids artificially
5274 favoring the vector path (but may pessimize it in some cases). */
5275 else if (is_a <gphi *> (orig_stmt_info->stmt)
5276 && gimple_phi_num_args
5277 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
5278 continue;
5279 else
5280 kind = scalar_stmt;
5281 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
5282 SLP_TREE_VECTYPE (node), 0, vect_body);
5283 }
5284
5285 auto_vec<bool, 20> subtree_life;
5286 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5287 {
5288 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5289 {
5290 /* Do not directly pass LIFE to the recursive call, copy it to
5291 confine changes in the callee to the current child/subtree. */
5292 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5293 {
5294 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
5295 for (unsigned j = 0;
5296 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
5297 {
5298 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
5299 if (perm.first == i)
5300 subtree_life[perm.second] = (*life)[j];
5301 }
5302 }
5303 else
5304 {
5305 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
5306 subtree_life.safe_splice (*life);
5307 }
5308 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
5309 vectorized_scalar_stmts, visited);
5310 subtree_life.truncate (0);
5311 }
5312 }
5313 }
5314
5315 /* Comparator for the loop-index sorted cost vectors. */
5316
5317 static int
li_cost_vec_cmp (const void *a_, const void *b_)
5319 {
5320 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
5321 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
5322 if (a->first < b->first)
5323 return -1;
5324 else if (a->first == b->first)
5325 return 0;
5326 return 1;
5327 }
5328
5329 /* Check if vectorization of the basic block is profitable for the
5330 subgraph denoted by SLP_INSTANCES. */
5331
5332 static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
				    vec<slp_instance> slp_instances,
				    loop_p orig_loop)
5336 {
5337 slp_instance instance;
5338 int i;
5339 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
5340 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
5341
5342 if (dump_enabled_p ())
5343 {
5344 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
5345 hash_set<slp_tree> visited;
5346 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5347 vect_print_slp_graph (MSG_NOTE, vect_location,
5348 SLP_INSTANCE_TREE (instance), visited);
5349 }
5350
5351 /* Compute the set of scalar stmts we know will go away 'locally' when
5352 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
5353 not accurate for nodes promoted extern late or for scalar stmts that
5354 are used both in extern defs and in vectorized defs. */
5355 hash_set<stmt_vec_info> vectorized_scalar_stmts;
5356 hash_set<stmt_vec_info> scalar_stmts_in_externs;
5357 hash_set<slp_tree> visited;
5358 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5359 {
5360 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
5361 SLP_INSTANCE_TREE (instance),
5362 visited,
5363 vectorized_scalar_stmts,
5364 scalar_stmts_in_externs);
5365 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
5366 vectorized_scalar_stmts.add (rstmt);
5367 }
  /* Scalar stmts used as defs in external nodes need to be preserved, so
     remove them from vectorized_scalar_stmts.  */
5370 for (stmt_vec_info stmt : scalar_stmts_in_externs)
5371 vectorized_scalar_stmts.remove (stmt);
5372
5373 /* Calculate scalar cost and sum the cost for the vector stmts
5374 previously collected. */
5375 stmt_vector_for_cost scalar_costs = vNULL;
5376 stmt_vector_for_cost vector_costs = vNULL;
5377 visited.empty ();
5378 FOR_EACH_VEC_ELT (slp_instances, i, instance)
5379 {
5380 auto_vec<bool, 20> life;
5381 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
5382 true);
5383 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5384 record_stmt_cost (&scalar_costs,
5385 SLP_INSTANCE_ROOT_STMTS (instance).length (),
5386 scalar_stmt,
5387 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
5388 vect_bb_slp_scalar_cost (bb_vinfo,
5389 SLP_INSTANCE_TREE (instance),
5390 &life, &scalar_costs, vectorized_scalar_stmts,
5391 visited);
5392 vector_costs.safe_splice (instance->cost_vec);
5393 instance->cost_vec.release ();
5394 }
5395
5396 if (dump_enabled_p ())
5397 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5398
  /* When costing non-loop vectorization we need to consider each covered
     loop independently and make sure vectorization is profitable.  For
     now we assume a loop may not be entered or may execute an arbitrary
     number of iterations (???  static information could provide more
     precise info here), which means we can simply cost each containing
     loop's stmts separately.  */
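  /* For example, a subgraph with stmts both in a loop and in its preheader
     is costed as two independent parts, each of which has to be profitable
     on its own.  */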
5405
5406 /* First produce cost vectors sorted by loop index. */
5407 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5408 li_scalar_costs (scalar_costs.length ());
5409 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5410 li_vector_costs (vector_costs.length ());
5411 stmt_info_for_cost *cost;
5412 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5413 {
5414 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5415 li_scalar_costs.quick_push (std::make_pair (l, cost));
5416 }
  /* Use an arbitrary loop from the scalar costs as a fallback in case the
     first vector_costs entry does not have a stmt_info associated with
     it.  */
5419 unsigned l = li_scalar_costs[0].first;
5420 FOR_EACH_VEC_ELT (vector_costs, i, cost)
5421 {
      /* We inherit the loop from the previous COST; invariants, externals
	 and extracts immediately follow the cost for the related stmt.  */
5424 if (cost->stmt_info)
5425 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5426 li_vector_costs.quick_push (std::make_pair (l, cost));
5427 }
5428 li_scalar_costs.qsort (li_cost_vec_cmp);
5429 li_vector_costs.qsort (li_cost_vec_cmp);
5430
5431 /* Now cost the portions individually. */
5432 unsigned vi = 0;
5433 unsigned si = 0;
5434 bool profitable = true;
5435 while (si < li_scalar_costs.length ()
5436 && vi < li_vector_costs.length ())
5437 {
5438 unsigned sl = li_scalar_costs[si].first;
5439 unsigned vl = li_vector_costs[vi].first;
5440 if (sl != vl)
5441 {
5442 if (dump_enabled_p ())
5443 dump_printf_loc (MSG_NOTE, vect_location,
5444 "Scalar %d and vector %d loop part do not "
5445 "match up, skipping scalar part\n", sl, vl);
5446 /* Skip the scalar part, assuming zero cost on the vector side. */
5447 do
5448 {
5449 si++;
5450 }
5451 while (si < li_scalar_costs.length ()
5452 && li_scalar_costs[si].first == sl);
5453 continue;
5454 }
5455
5456 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
5457 do
5458 {
5459 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
5460 si++;
5461 }
5462 while (si < li_scalar_costs.length ()
5463 && li_scalar_costs[si].first == sl);
5464 unsigned dummy;
5465 finish_cost (scalar_target_cost_data, nullptr,
5466 &dummy, &scalar_cost, &dummy);
5467
5468 /* Complete the target-specific vector cost calculation. */
5469 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
5470 do
5471 {
5472 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
5473 vi++;
5474 }
5475 while (vi < li_vector_costs.length ()
5476 && li_vector_costs[vi].first == vl);
5477 finish_cost (vect_target_cost_data, scalar_target_cost_data,
5478 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
5479 delete scalar_target_cost_data;
5480 delete vect_target_cost_data;
5481
5482 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
5483
5484 if (dump_enabled_p ())
5485 {
5486 dump_printf_loc (MSG_NOTE, vect_location,
5487 "Cost model analysis for part in loop %d:\n", sl);
5488 dump_printf (MSG_NOTE, " Vector cost: %d\n",
5489 vec_inside_cost + vec_outside_cost);
5490 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
5491 }
5492
      /* Vectorization is profitable if its cost is less than or equal to
	 the cost of the scalar version.  Note that we err on the vector
	 side for equal cost because the cost estimate is otherwise quite
	 pessimistic (constant uses are free on the scalar side but cost a
	 load on the vector side for example).  */
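      /* For example, a part with vector cost 10 against scalar cost 10 is
	 kept, while one with vector cost 11 against scalar cost 10 makes
	 the whole subgraph unprofitable.  */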
5498 if (vec_outside_cost + vec_inside_cost > scalar_cost)
5499 {
5500 profitable = false;
5501 break;
5502 }
5503 }
5504 if (profitable && vi < li_vector_costs.length ())
5505 {
5506 if (dump_enabled_p ())
5507 dump_printf_loc (MSG_NOTE, vect_location,
5508 "Excess vector cost for part in loop %d:\n",
5509 li_vector_costs[vi].first);
5510 profitable = false;
5511 }
5512
  /* Unset the visited flag.  Clearing is delayed when the subgraph is
     profitable and we later process the loop for remaining unvectorized
     if-converted code.  */
5515 if (!orig_loop || !profitable)
5516 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5517 gimple_set_visited (cost->stmt_info->stmt, false);
5518
5519 scalar_costs.release ();
5520 vector_costs.release ();
5521
5522 return profitable;
5523 }
5524
5525 /* qsort comparator for lane defs. */
5526
5527 static int
vld_cmp (const void *a_, const void *b_)
5529 {
5530 auto *a = (const std::pair<unsigned, tree> *)a_;
5531 auto *b = (const std::pair<unsigned, tree> *)b_;
5532 return a->first - b->first;
5533 }
5534
/* Return true if USE_STMT is a vector lane insert into VEC and set
   *THIS_LANE to the lane number being inserted.  */
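/* For example, with 32-bit vector elements the stmt
     vec2_3 = BIT_INSERT_EXPR <vec_1, val_2, 64>;
   inserts val_2 into lane 64 / 32 == 2 of vec_1.  */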
5537
5538 static bool
vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
5540 {
5541 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
5542 if (!use_ass
5543 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
5544 || (vec
5545 ? gimple_assign_rhs1 (use_ass) != vec
5546 : ((vec = gimple_assign_rhs1 (use_ass)), false))
5547 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
5548 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
5549 || !constant_multiple_p
5550 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
5551 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
5552 this_lane))
5553 return false;
5554 return true;
5555 }
5556
5557 /* Find any vectorizable constructors and add them to the grouped_store
5558 array. */
5559
5560 static void
vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
5562 {
5563 for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
5564 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
5565 !gsi_end_p (gsi); gsi_next (&gsi))
5566 {
5567 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
5568 if (!assign)
5569 continue;
5570
5571 tree rhs = gimple_assign_rhs1 (assign);
5572 enum tree_code code = gimple_assign_rhs_code (assign);
5573 use_operand_p use_p;
5574 gimple *use_stmt;
5575 if (code == CONSTRUCTOR)
5576 {
5577 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5578 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
5579 CONSTRUCTOR_NELTS (rhs))
5580 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
5581 || uniform_vector_p (rhs))
5582 continue;
5583
5584 unsigned j;
5585 tree val;
5586 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
5587 if (TREE_CODE (val) != SSA_NAME
5588 || !bb_vinfo->lookup_def (val))
5589 break;
5590 if (j != CONSTRUCTOR_NELTS (rhs))
5591 continue;
5592
5593 stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
5594 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
5595 }
5596 else if (code == BIT_INSERT_EXPR
5597 && VECTOR_TYPE_P (TREE_TYPE (rhs))
5598 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
5599 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
5600 && integer_zerop (gimple_assign_rhs3 (assign))
5601 && useless_type_conversion_p
5602 (TREE_TYPE (TREE_TYPE (rhs)),
5603 TREE_TYPE (gimple_assign_rhs2 (assign)))
5604 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
5605 {
	  /* We start matching on an insert to lane zero, but since the
	     inserts need not be ordered we have to search both
	     the def and the use chains.  */
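	  /* For example, for a 4-lane vector the chain may look like
	       _1 = BIT_INSERT_EXPR <v_0, a_2, 0>;
	       _3 = BIT_INSERT_EXPR <_1, b_4, 32>;
	       _5 = BIT_INSERT_EXPR <_3, c_6, 64>;
	       _7 = BIT_INSERT_EXPR <_5, d_8, 96>;
	     but only the insert to lane zero is matched here, so the
	     remaining lanes are collected by walking single uses forward
	     and, if lanes remain, the rhs1 def chain backward.  */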
5609 tree vectype = TREE_TYPE (rhs);
5610 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5611 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
5612 auto_sbitmap lanes (nlanes);
5613 bitmap_clear (lanes);
5614 bitmap_set_bit (lanes, 0);
5615 tree def = gimple_assign_lhs (assign);
5616 lane_defs.quick_push
5617 (std::make_pair (0, gimple_assign_rhs2 (assign)));
5618 unsigned lanes_found = 1;
5619 /* Start with the use chains, the last stmt will be the root. */
5620 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
5621 vec<stmt_vec_info> roots = vNULL;
5622 roots.safe_push (last);
5623 do
5624 {
5625 use_operand_p use_p;
5626 gimple *use_stmt;
5627 if (!single_imm_use (def, &use_p, &use_stmt))
5628 break;
5629 unsigned this_lane;
5630 if (!bb_vinfo->lookup_stmt (use_stmt)
5631 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
5632 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
5633 break;
5634 if (bitmap_bit_p (lanes, this_lane))
5635 break;
5636 lanes_found++;
5637 bitmap_set_bit (lanes, this_lane);
5638 gassign *use_ass = as_a <gassign *> (use_stmt);
5639 lane_defs.quick_push (std::make_pair
5640 (this_lane, gimple_assign_rhs2 (use_ass)));
5641 last = bb_vinfo->lookup_stmt (use_ass);
5642 roots.safe_push (last);
5643 def = gimple_assign_lhs (use_ass);
5644 }
5645 while (lanes_found < nlanes);
5646 if (roots.length () > 1)
5647 std::swap(roots[0], roots[roots.length () - 1]);
5648 if (lanes_found < nlanes)
5649 {
5650 /* Now search the def chain. */
5651 def = gimple_assign_rhs1 (assign);
5652 do
5653 {
5654 if (TREE_CODE (def) != SSA_NAME
5655 || !has_single_use (def))
5656 break;
5657 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
5658 unsigned this_lane;
5659 if (!bb_vinfo->lookup_stmt (def_stmt)
5660 || !vect_slp_is_lane_insert (def_stmt,
5661 NULL_TREE, &this_lane)
5662 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
5663 break;
5664 if (bitmap_bit_p (lanes, this_lane))
5665 break;
5666 lanes_found++;
5667 bitmap_set_bit (lanes, this_lane);
5668 lane_defs.quick_push (std::make_pair
5669 (this_lane,
5670 gimple_assign_rhs2 (def_stmt)));
5671 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
5672 def = gimple_assign_rhs1 (def_stmt);
5673 }
5674 while (lanes_found < nlanes);
5675 }
5676 if (lanes_found == nlanes)
5677 {
	      /* Sort lane_defs by lane index and register the root.  */
5679 lane_defs.qsort (vld_cmp);
5680 vec<stmt_vec_info> stmts;
5681 stmts.create (nlanes);
5682 for (unsigned i = 0; i < nlanes; ++i)
5683 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
5684 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
5685 stmts, roots));
5686 }
5687 else
5688 roots.release ();
5689 }
5690 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5691 && (associative_tree_code (code) || code == MINUS_EXPR)
5692 /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
5693 checks pessimize a two-element reduction. PR54400.
5694 ??? In-order reduction could be handled if we only
5695 traverse one operand chain in vect_slp_linearize_chain. */
5696 && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
5697 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
5698 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
5699 /* Ops with constants at the tail can be stripped here. */
5700 && TREE_CODE (rhs) == SSA_NAME
5701 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
5702 /* Should be the chain end. */
5703 && (!single_imm_use (gimple_assign_lhs (assign),
5704 &use_p, &use_stmt)
5705 || !is_gimple_assign (use_stmt)
5706 || (gimple_assign_rhs_code (use_stmt) != code
5707 && ((code != PLUS_EXPR && code != MINUS_EXPR)
5708 || (gimple_assign_rhs_code (use_stmt)
5709 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
5710 {
5711 /* We start the match at the end of a possible association
5712 chain. */
5713 auto_vec<chain_op_t> chain;
5714 auto_vec<std::pair<tree_code, gimple *> > worklist;
5715 auto_vec<gimple *> chain_stmts;
5716 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
5717 if (code == MINUS_EXPR)
5718 code = PLUS_EXPR;
5719 internal_fn reduc_fn;
5720 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
5721 || reduc_fn == IFN_LAST)
5722 continue;
5723 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
5724 /* ??? */
5725 code_stmt, alt_code_stmt, &chain_stmts);
5726 if (chain.length () > 1)
5727 {
5728 /* Sort the chain according to def_type and operation. */
5729 chain.sort (dt_sort_cmp, bb_vinfo);
5730 /* ??? Now we'd want to strip externals and constants
5731 but record those to be handled in the epilogue. */
5732 /* ??? For now do not allow mixing ops or externs/constants. */
5733 bool invalid = false;
5734 for (unsigned i = 0; i < chain.length (); ++i)
5735 if (chain[i].dt != vect_internal_def
5736 || chain[i].code != code)
5737 invalid = true;
5738 if (!invalid)
5739 {
5740 vec<stmt_vec_info> stmts;
5741 stmts.create (chain.length ());
5742 for (unsigned i = 0; i < chain.length (); ++i)
5743 stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
5744 vec<stmt_vec_info> roots;
5745 roots.create (chain_stmts.length ());
5746 for (unsigned i = 0; i < chain_stmts.length (); ++i)
5747 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
5748 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
5749 stmts, roots));
5750 }
5751 }
5752 }
5753 }
5754 }
5755
5756 /* Walk the grouped store chains and replace entries with their
5757 pattern variant if any. */
5758
5759 static void
vect_fixup_store_groups_with_patterns (vec_info *vinfo)
5761 {
5762 stmt_vec_info first_element;
5763 unsigned i;
5764
5765 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5766 {
5767 /* We also have CTORs in this array. */
5768 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
5769 continue;
5770 if (STMT_VINFO_IN_PATTERN_P (first_element))
5771 {
5772 stmt_vec_info orig = first_element;
5773 first_element = STMT_VINFO_RELATED_STMT (first_element);
5774 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
5775 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
5776 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
5777 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
5778 vinfo->grouped_stores[i] = first_element;
5779 }
5780 stmt_vec_info prev = first_element;
5781 while (DR_GROUP_NEXT_ELEMENT (prev))
5782 {
5783 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
5784 if (STMT_VINFO_IN_PATTERN_P (elt))
5785 {
5786 stmt_vec_info orig = elt;
5787 elt = STMT_VINFO_RELATED_STMT (elt);
5788 DR_GROUP_NEXT_ELEMENT (prev) = elt;
5789 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
5790 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
5791 }
5792 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
5793 prev = elt;
5794 }
5795 }
5796 }
5797
5798 /* Check if the region described by BB_VINFO can be vectorized, returning
5799 true if so. When returning false, set FATAL to true if the same failure
5800 would prevent vectorization at other vector sizes, false if it is still
5801 worth trying other sizes. N_STMTS is the number of statements in the
5802 region. */
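/* For example, an unhandled data reference in the region is a failure
   independent of the vector size and thus FATAL, while an unsupported
   operation may become supported with a different vector size and is
   not.  */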
5803
5804 static bool
vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
		       vec<int> *dataref_groups)
5807 {
5808 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
5809
5810 slp_instance instance;
5811 int i;
5812 poly_uint64 min_vf = 2;
5813
5814 /* The first group of checks is independent of the vector size. */
5815 fatal = true;
5816
5817 /* Analyze the data references. */
5818
5819 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
5820 {
5821 if (dump_enabled_p ())
5822 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5823 "not vectorized: unhandled data-ref in basic "
5824 "block.\n");
5825 return false;
5826 }
5827
5828 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
5829 {
5830 if (dump_enabled_p ())
5831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5832 "not vectorized: unhandled data access in "
5833 "basic block.\n");
5834 return false;
5835 }
5836
5837 vect_slp_check_for_constructors (bb_vinfo);
5838
5839 /* If there are no grouped stores and no constructors in the region
5840 there is no need to continue with pattern recog as vect_analyze_slp
5841 will fail anyway. */
5842 if (bb_vinfo->grouped_stores.is_empty ()
5843 && bb_vinfo->roots.is_empty ())
5844 {
5845 if (dump_enabled_p ())
5846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5847 "not vectorized: no grouped stores in "
5848 "basic block.\n");
5849 return false;
5850 }
5851
  /* The rest of the analysis below depends on the vector size in one way
     or another, so failures are no longer necessarily fatal.  */
5853 fatal = false;
5854
5855 vect_pattern_recog (bb_vinfo);
5856
5857 /* Update store groups from pattern processing. */
5858 vect_fixup_store_groups_with_patterns (bb_vinfo);
5859
5860 /* Check the SLP opportunities in the basic block, analyze and build SLP
5861 trees. */
5862 if (!vect_analyze_slp (bb_vinfo, n_stmts))
5863 {
5864 if (dump_enabled_p ())
5865 {
5866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5867 "Failed to SLP the basic block.\n");
5868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5869 "not vectorized: failed to find SLP opportunities "
5870 "in basic block.\n");
5871 }
5872 return false;
5873 }
5874
5875 /* Optimize permutations. */
5876 vect_optimize_slp (bb_vinfo);
5877
5878 /* Gather the loads reachable from the SLP graph entries. */
5879 vect_gather_slp_loads (bb_vinfo);
5880
5881 vect_record_base_alignments (bb_vinfo);
5882
5883 /* Analyze and verify the alignment of data references and the
5884 dependence in the SLP instances. */
5885 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
5886 {
5887 vect_location = instance->location ();
5888 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
5889 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
5890 {
5891 slp_tree node = SLP_INSTANCE_TREE (instance);
5892 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5893 if (dump_enabled_p ())
5894 dump_printf_loc (MSG_NOTE, vect_location,
5895 "removing SLP instance operations starting from: %G",
5896 stmt_info->stmt);
5897 vect_free_slp_instance (instance);
5898 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
5899 continue;
5900 }
5901
5902 /* Mark all the statements that we want to vectorize as pure SLP and
5903 relevant. */
5904 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5905 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
5906 unsigned j;
5907 stmt_vec_info root;
5908 /* Likewise consider instance root stmts as vectorized. */
5909 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
5910 STMT_SLP_TYPE (root) = pure_slp;
5911
5912 i++;
5913 }
5914 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
5915 return false;
5916
5917 if (!vect_slp_analyze_operations (bb_vinfo))
5918 {
5919 if (dump_enabled_p ())
5920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5921 "not vectorized: bad operation in basic block.\n");
5922 return false;
5923 }
5924
5925 vect_bb_partition_graph (bb_vinfo);
5926
5927 return true;
5928 }
5929
/* Subroutine of vect_slp_bbs.  Try to vectorize the statements for all
   basic blocks in BBS, returning true on success.
   The region has N_STMTS statements and has the datarefs given by DATAREFS.  */
5933
5934 static bool
vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
		 vec<int> *dataref_groups, unsigned int n_stmts,
		 loop_p orig_loop)
5938 {
5939 bb_vec_info bb_vinfo;
5940 auto_vector_modes vector_modes;
5941
5942 /* Autodetect first vector size we try. */
5943 machine_mode next_vector_mode = VOIDmode;
5944 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
5945 unsigned int mode_i = 0;
5946
5947 vec_info_shared shared;
5948
5949 machine_mode autodetected_vector_mode = VOIDmode;
5950 while (1)
5951 {
5952 bool vectorized = false;
5953 bool fatal = false;
5954 bb_vinfo = new _bb_vec_info (bbs, &shared);
5955
5956 bool first_time_p = shared.datarefs.is_empty ();
5957 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
5958 if (first_time_p)
5959 bb_vinfo->shared->save_datarefs ();
5960 else
5961 bb_vinfo->shared->check_datarefs ();
5962 bb_vinfo->vector_mode = next_vector_mode;
5963
5964 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
5965 {
5966 if (dump_enabled_p ())
5967 {
5968 dump_printf_loc (MSG_NOTE, vect_location,
5969 "***** Analysis succeeded with vector mode"
5970 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
5971 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
5972 }
5973
5974 bb_vinfo->shared->check_datarefs ();
5975
5976 auto_vec<slp_instance> profitable_subgraphs;
5977 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
5978 {
5979 if (instance->subgraph_entries.is_empty ())
5980 continue;
5981
5982 vect_location = instance->location ();
5983 if (!unlimited_cost_model (NULL)
5984 && !vect_bb_vectorization_profitable_p
5985 (bb_vinfo, instance->subgraph_entries, orig_loop))
5986 {
5987 if (dump_enabled_p ())
5988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5989 "not vectorized: vectorization is not "
5990 "profitable.\n");
5991 continue;
5992 }
5993
5994 if (!dbg_cnt (vect_slp))
5995 continue;
5996
5997 profitable_subgraphs.safe_push (instance);
5998 }
5999
6000 /* When we're vectorizing an if-converted loop body make sure
6001 we vectorized all if-converted code. */
6002 if (!profitable_subgraphs.is_empty ()
6003 && orig_loop)
6004 {
6005 gcc_assert (bb_vinfo->bbs.length () == 1);
6006 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
6007 !gsi_end_p (gsi); gsi_next (&gsi))
6008 {
6009 /* The costing above left us with DCEable vectorized scalar
6010 stmts having the visited flag set on profitable
6011 subgraphs. Do the delayed clearing of the flag here. */
6012 if (gimple_visited_p (gsi_stmt (gsi)))
6013 {
6014 gimple_set_visited (gsi_stmt (gsi), false);
6015 continue;
6016 }
6017 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
6018 continue;
6019
6020 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
6021 if (gimple_assign_rhs_code (ass) == COND_EXPR)
6022 {
6023 if (!profitable_subgraphs.is_empty ()
6024 && dump_enabled_p ())
6025 dump_printf_loc (MSG_NOTE, vect_location,
6026 "not profitable because of "
6027 "unprofitable if-converted scalar "
6028 "code\n");
6029 profitable_subgraphs.truncate (0);
6030 }
6031 }
6032 }
6033
6034 /* Finally schedule the profitable subgraphs. */
6035 for (slp_instance instance : profitable_subgraphs)
6036 {
6037 if (!vectorized && dump_enabled_p ())
6038 dump_printf_loc (MSG_NOTE, vect_location,
6039 "Basic block will be vectorized "
6040 "using SLP\n");
6041 vectorized = true;
6042
6043 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
6044
6045 unsigned HOST_WIDE_INT bytes;
6046 if (dump_enabled_p ())
6047 {
6048 if (GET_MODE_SIZE
6049 (bb_vinfo->vector_mode).is_constant (&bytes))
6050 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
6051 "basic block part vectorized using %wu "
6052 "byte vectors\n", bytes);
6053 else
6054 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
6055 "basic block part vectorized using "
6056 "variable length vectors\n");
6057 }
6058 }
6059 }
6060 else
6061 {
6062 if (dump_enabled_p ())
6063 dump_printf_loc (MSG_NOTE, vect_location,
6064 "***** Analysis failed with vector mode %s\n",
6065 GET_MODE_NAME (bb_vinfo->vector_mode));
6066 }
6067
6068 if (mode_i == 0)
6069 autodetected_vector_mode = bb_vinfo->vector_mode;
6070
6071 if (!fatal)
6072 while (mode_i < vector_modes.length ()
6073 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
6074 {
6075 if (dump_enabled_p ())
6076 dump_printf_loc (MSG_NOTE, vect_location,
6077 "***** The result for vector mode %s would"
6078 " be the same\n",
6079 GET_MODE_NAME (vector_modes[mode_i]));
6080 mode_i += 1;
6081 }
6082
6083 delete bb_vinfo;
6084
6085 if (mode_i < vector_modes.length ()
6086 && VECTOR_MODE_P (autodetected_vector_mode)
6087 && (related_vector_mode (vector_modes[mode_i],
6088 GET_MODE_INNER (autodetected_vector_mode))
6089 == autodetected_vector_mode)
6090 && (related_vector_mode (autodetected_vector_mode,
6091 GET_MODE_INNER (vector_modes[mode_i]))
6092 == vector_modes[mode_i]))
6093 {
6094 if (dump_enabled_p ())
6095 dump_printf_loc (MSG_NOTE, vect_location,
6096 "***** Skipping vector mode %s, which would"
6097 " repeat the analysis for %s\n",
6098 GET_MODE_NAME (vector_modes[mode_i]),
6099 GET_MODE_NAME (autodetected_vector_mode));
6100 mode_i += 1;
6101 }
6102
6103 if (vectorized
6104 || mode_i == vector_modes.length ()
6105 || autodetected_vector_mode == VOIDmode
6106 /* If vect_slp_analyze_bb_1 signaled that analysis for all
6107 vector sizes will fail do not bother iterating. */
6108 || fatal)
6109 return vectorized;
6110
6111 /* Try the next biggest vector size. */
6112 next_vector_mode = vector_modes[mode_i++];
6113 if (dump_enabled_p ())
6114 dump_printf_loc (MSG_NOTE, vect_location,
6115 "***** Re-trying analysis with vector mode %s\n",
6116 GET_MODE_NAME (next_vector_mode));
6117 }
6118 }
6119
6120
/* Main entry for the BB vectorizer.  Analyze and transform BBS, returning
   true if anything in the basic blocks was vectorized.  */
6123
6124 static bool
vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
6126 {
6127 vec<data_reference_p> datarefs = vNULL;
6128 auto_vec<int> dataref_groups;
6129 int insns = 0;
6130 int current_group = 0;
6131
6132 for (unsigned i = 0; i < bbs.length (); i++)
6133 {
6134 basic_block bb = bbs[i];
6135 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
6136 gsi_next (&gsi))
6137 {
6138 gimple *stmt = gsi_stmt (gsi);
6139 if (is_gimple_debug (stmt))
6140 continue;
6141
6142 insns++;
6143
6144 if (gimple_location (stmt) != UNKNOWN_LOCATION)
6145 vect_location = stmt;
6146
6147 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
6148 &dataref_groups, current_group))
6149 ++current_group;
6150 }
6151 /* New BBs always start a new DR group. */
6152 ++current_group;
6153 }
6154
6155 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
6156 }
6157
/* Special entry for the BB vectorizer.  Analyze and transform a single
   if-converted BB with ORIG_LOOP's body being the non-if-converted
   representation.  Returns true if anything in the basic block was
   vectorized.  */
6162
6163 bool
vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
6165 {
6166 auto_vec<basic_block> bbs;
6167 bbs.safe_push (bb);
6168 return vect_slp_bbs (bbs, orig_loop);
6169 }
6170
/* Main entry for the BB vectorizer.  Analyze and transform the basic
   blocks of FUN, returning true if anything was vectorized.  */
6173
6174 bool
vect_slp_function (function *fun)
6176 {
6177 bool r = false;
6178 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
6179 unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
6180
  /* For the moment split the function into pieces to avoid making
     the iteration over vector modes moot.  Split at points we know
     we do not handle well, which are CFG merges (SLP discovery doesn't
     handle non-loop-header PHIs) and loop exits.  Since pattern
     recog requires reverse iteration to visit uses before defs,
     simply chop the RPO into pieces.  */
6187 auto_vec<basic_block> bbs;
6188 for (unsigned i = 0; i < n; i++)
6189 {
6190 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
6191 bool split = false;
6192
6193 /* Split when a BB is not dominated by the first block. */
6194 if (!bbs.is_empty ()
6195 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
6196 {
6197 if (dump_enabled_p ())
6198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6199 "splitting region at dominance boundary bb%d\n",
6200 bb->index);
6201 split = true;
6202 }
6203 /* Split when the loop determined by the first block
6204 is exited. This is because we eventually insert
6205 invariants at region begin. */
6206 else if (!bbs.is_empty ()
6207 && bbs[0]->loop_father != bb->loop_father
6208 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
6209 {
6210 if (dump_enabled_p ())
6211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6212 "splitting region at loop %d exit at bb%d\n",
6213 bbs[0]->loop_father->num, bb->index);
6214 split = true;
6215 }
6216
6217 if (split && !bbs.is_empty ())
6218 {
6219 r |= vect_slp_bbs (bbs, NULL);
6220 bbs.truncate (0);
6221 }
6222
      /* We need to be able to insert at the head of the region, which
	 we cannot do for a region starting with a returns-twice call.  */
6225 if (bbs.is_empty ())
6226 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
6227 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
6228 {
6229 if (dump_enabled_p ())
6230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6231 "skipping bb%d as start of region as it "
6232 "starts with returns-twice call\n",
6233 bb->index);
6234 continue;
6235 }
6236
6237 bbs.safe_push (bb);
6238
      /* When a stmt ends this block and defines a value, inserting
	 after it for a vector containing its definition would require
	 inserting on edges.  Avoid this for now.  */
6242 if (gimple *last = last_stmt (bb))
6243 if (gimple_get_lhs (last)
6244 && is_ctrl_altering_stmt (last))
6245 {
6246 if (dump_enabled_p ())
6247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6248 "splitting region at control altering "
6249 "definition %G", last);
6250 r |= vect_slp_bbs (bbs, NULL);
6251 bbs.truncate (0);
6252 }
6253 }
6254
6255 if (!bbs.is_empty ())
6256 r |= vect_slp_bbs (bbs, NULL);
6257
6258 free (rpo);
6259
6260 return r;
6261 }
6262
/* Build a variable-length vector in which the elements in ELTS are repeated
   to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   RESULTS and add any new instructions to SEQ.
6266
6267 The approach we use is:
6268
6269 (1) Find a vector mode VM with integer elements of mode IM.
6270
6271 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6272 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
6273 from small vectors to IM.
6274
6275 (3) Duplicate each ELTS'[I] into a vector of mode VM.
6276
6277 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
6278 correct byte contents.
6279
6280 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
6281
6282 We try to find the largest IM for which this sequence works, in order
6283 to cut down on the number of interleaves. */
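/* As a sketch, to fill VNx8HI results with the eight HImode elements
   { a, b, ..., h } repeated we can pick IM = DImode and VM = VNx2DI:
   view { a, b, c, d } and { e, f, g, h } as two DImode values, duplicate
   each into a VNx2DI vector, and a single interleaving VEC_PERM_EXPR then
   produces the repeating pattern { a, ..., h, a, ..., h, ... }, which is
   view-converted to the required VNx8HI type.  */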
6284
6285 void
duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
			  const vec<tree> &elts, unsigned int nresults,
			  vec<tree> &results)
6289 {
6290 unsigned int nelts = elts.length ();
6291 tree element_type = TREE_TYPE (vector_type);
6292
6293 /* (1) Find a vector mode VM with integer elements of mode IM. */
6294 unsigned int nvectors = 1;
6295 tree new_vector_type;
6296 tree permutes[2];
6297 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
6298 &nvectors, &new_vector_type,
6299 permutes))
6300 gcc_unreachable ();
6301
6302 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
6303 unsigned int partial_nelts = nelts / nvectors;
6304 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
6305
6306 tree_vector_builder partial_elts;
6307 auto_vec<tree, 32> pieces (nvectors * 2);
6308 pieces.quick_grow_cleared (nvectors * 2);
6309 for (unsigned int i = 0; i < nvectors; ++i)
6310 {
6311 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6312 ELTS' has mode IM. */
6313 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
6314 for (unsigned int j = 0; j < partial_nelts; ++j)
6315 partial_elts.quick_push (elts[i * partial_nelts + j]);
6316 tree t = gimple_build_vector (seq, &partial_elts);
6317 t = gimple_build (seq, VIEW_CONVERT_EXPR,
6318 TREE_TYPE (new_vector_type), t);
6319
6320 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
6321 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
6322 }
6323
6324 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
6325 correct byte contents.
6326
6327 Conceptually, we need to repeat the following operation log2(nvectors)
6328 times, where hi_start = nvectors / 2:
6329
6330 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
6331 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
6332
6333 However, if each input repeats every N elements and the VF is
6334 a multiple of N * 2, the HI result is the same as the LO result.
6335 This will be true for the first N1 iterations of the outer loop,
6336 followed by N2 iterations for which both the LO and HI results
6337 are needed. I.e.:
6338
6339 N1 + N2 = log2(nvectors)
6340
6341 Each "N1 iteration" doubles the number of redundant vectors and the
6342 effect of the process as a whole is to have a sequence of nvectors/2**N1
6343 vectors that repeats 2**N1 times. Rather than generate these redundant
6344 vectors, we halve the number of vectors for each N1 iteration. */
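  /* For example, for nvectors == 4 there are log2 (4) == 2 rounds; when the
     round-one HI results would equal the LO results (N1 == 1), round one
     keeps just two output vectors and only round two produces both the LO
     and HI outputs.  */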
6345 unsigned int in_start = 0;
6346 unsigned int out_start = nvectors;
6347 unsigned int new_nvectors = nvectors;
6348 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
6349 {
6350 unsigned int hi_start = new_nvectors / 2;
6351 unsigned int out_i = 0;
6352 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
6353 {
6354 if ((in_i & 1) != 0
6355 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
6356 2 * in_repeat))
6357 continue;
6358
6359 tree output = make_ssa_name (new_vector_type);
6360 tree input1 = pieces[in_start + (in_i / 2)];
6361 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
6362 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
6363 input1, input2,
6364 permutes[in_i & 1]);
6365 gimple_seq_add_stmt (seq, stmt);
6366 pieces[out_start + out_i] = output;
6367 out_i += 1;
6368 }
6369 std::swap (in_start, out_start);
6370 new_nvectors = out_i;
6371 }
6372
6373 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
6374 results.reserve (nresults);
6375 for (unsigned int i = 0; i < nresults; ++i)
6376 if (i < new_nvectors)
6377 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
6378 pieces[in_start + i]));
6379 else
6380 results.quick_push (results[i - new_nvectors]);
6381 }
6382
6383
6384 /* For constant and loop invariant defs in OP_NODE this function creates
6385 vector defs that will be used in the vectorized stmts and stores them
6386 to SLP_TREE_VEC_DEFS of OP_NODE. */
6387
6388 static void
vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
6390 {
6391 unsigned HOST_WIDE_INT nunits;
6392 tree vec_cst;
6393 unsigned j, number_of_places_left_in_vector;
6394 tree vector_type;
6395 tree vop;
6396 int group_size = op_node->ops.length ();
6397 unsigned int vec_num, i;
6398 unsigned number_of_copies = 1;
6399 bool constant_p;
6400 gimple_seq ctor_seq = NULL;
6401 auto_vec<tree, 16> permute_results;
6402
6403 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
6404 vector_type = SLP_TREE_VECTYPE (op_node);
6405
6406 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
6407 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
6408 auto_vec<tree> voprnds (number_of_vectors);
6409
6410 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
6411 created vectors. It is greater than 1 if unrolling is performed.
6412
6413 For example, we have two scalar operands, s1 and s2 (e.g., group of
6414 strided accesses of size two), while NUNITS is four (i.e., four scalars
6415 of this type can be packed in a vector). The output vector will contain
6416 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
6417 will be 2).
6418
6419 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
6420 containing the operands.
6421
6422 For example, NUNITS is four as before, and the group size is 8
6423 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
6424 {s5, s6, s7, s8}. */
6425
6426 /* When using duplicate_and_interleave, we just need one element for
6427 each scalar statement. */
6428 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
6429 nunits = group_size;
6430
6431 number_of_copies = nunits * number_of_vectors / group_size;
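  /* For example, in the first case above this is 4 * 1 / 2 == 2 copies and
     in the second 4 * 2 / 8 == 1.  */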
6432
6433 number_of_places_left_in_vector = nunits;
6434 constant_p = true;
6435 tree_vector_builder elts (vector_type, nunits, 1);
6436 elts.quick_grow (nunits);
6437 stmt_vec_info insert_after = NULL;
6438 for (j = 0; j < number_of_copies; j++)
6439 {
6440 tree op;
6441 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
6442 {
6443 /* Create 'vect_ = {op0,op1,...,opn}'. */
6444 number_of_places_left_in_vector--;
6445 tree orig_op = op;
6446 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
6447 {
6448 if (CONSTANT_CLASS_P (op))
6449 {
6450 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6451 {
6452 /* Can't use VIEW_CONVERT_EXPR for booleans because
6453 of possibly different sizes of scalar value and
6454 vector element. */
6455 if (integer_zerop (op))
6456 op = build_int_cst (TREE_TYPE (vector_type), 0);
6457 else if (integer_onep (op))
6458 op = build_all_ones_cst (TREE_TYPE (vector_type));
6459 else
6460 gcc_unreachable ();
6461 }
6462 else
6463 op = fold_unary (VIEW_CONVERT_EXPR,
6464 TREE_TYPE (vector_type), op);
6465 gcc_assert (op && CONSTANT_CLASS_P (op));
6466 }
6467 else
6468 {
6469 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
6470 gimple *init_stmt;
6471 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6472 {
6473 tree true_val
6474 = build_all_ones_cst (TREE_TYPE (vector_type));
6475 tree false_val
6476 = build_zero_cst (TREE_TYPE (vector_type));
6477 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
6478 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
6479 op, true_val,
6480 false_val);
6481 }
6482 else
6483 {
6484 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
6485 op);
6486 init_stmt
6487 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
6488 op);
6489 }
6490 gimple_seq_add_stmt (&ctor_seq, init_stmt);
6491 op = new_temp;
6492 }
6493 }
6494 elts[number_of_places_left_in_vector] = op;
6495 if (!CONSTANT_CLASS_P (op))
6496 constant_p = false;
6497 /* For BB vectorization we have to compute an insert location
6498 when a def is inside the analyzed region since we cannot
6499 simply insert at the BB start in this case. */
6500 stmt_vec_info opdef;
6501 if (TREE_CODE (orig_op) == SSA_NAME
6502 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
6503 && is_a <bb_vec_info> (vinfo)
6504 && (opdef = vinfo->lookup_def (orig_op)))
6505 {
6506 if (!insert_after)
6507 insert_after = opdef;
6508 else
6509 insert_after = get_later_stmt (insert_after, opdef);
6510 }
6511
6512 if (number_of_places_left_in_vector == 0)
6513 {
6514 if (constant_p
6515 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
6516 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
6517 vec_cst = gimple_build_vector (&ctor_seq, &elts);
6518 else
6519 {
6520 if (permute_results.is_empty ())
6521 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
6522 elts, number_of_vectors,
6523 permute_results);
6524 vec_cst = permute_results[number_of_vectors - j - 1];
6525 }
6526 if (!gimple_seq_empty_p (ctor_seq))
6527 {
6528 if (insert_after)
6529 {
6530 gimple_stmt_iterator gsi;
6531 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
6532 {
6533 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
6534 gsi_insert_seq_before (&gsi, ctor_seq,
6535 GSI_CONTINUE_LINKING);
6536 }
6537 else if (!stmt_ends_bb_p (insert_after->stmt))
6538 {
6539 gsi = gsi_for_stmt (insert_after->stmt);
6540 gsi_insert_seq_after (&gsi, ctor_seq,
6541 GSI_CONTINUE_LINKING);
6542 }
6543 else
6544 {
		      /* When we want to insert after a def whose defining
			 stmt throws, insert on the fallthru edge
			 instead.  */
                          edge e = find_fallthru_edge
                            (gimple_bb (insert_after->stmt)->succs);
                          basic_block new_bb
                            = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
                          gcc_assert (!new_bb);
                        }
                    }
                  else
                    vinfo->insert_seq_on_entry (NULL, ctor_seq);
                  ctor_seq = NULL;
                }
              voprnds.quick_push (vec_cst);
              insert_after = NULL;
              number_of_places_left_in_vector = nunits;
              constant_p = true;
              elts.new_vector (vector_type, nunits, 1);
              elts.quick_grow (nunits);
            }
        }
    }

  /* The vectors were created in reverse order, so push them back in the
     correct order. */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
    }

  /* When the VF is greater than the unrolling factor needed for the SLP
     group of stmts, the NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we
     have to replicate the vectors. */
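  /* For example, if only the two vector defs { v0, v1 } were built but
     NUMBER_OF_VECTORS is 4, the loop below re-pushes the first VEC_NUM
     defs to yield { v0, v1, v0, v1 }. */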
  while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
    for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
         i++)
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
}

/* Get the Ith vectorized definition from SLP_NODE. */

tree
vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
{
  if (SLP_TREE_VEC_STMTS (slp_node).exists ())
    return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
  else
    return SLP_TREE_VEC_DEFS (slp_node)[i];
}

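/* A short usage sketch for the accessors here and below, mirroring the
   existing callers in this file: collect the vectorized defs of an SLP
   child into a local vec and release it when done.

     vec<tree> vec_defs = vNULL;
     vect_get_slp_defs (child, &vec_defs);
     ... consume the vector defs ...
     vec_defs.release ();  */
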
/* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */

void
vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
{
  vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
  if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
    {
      unsigned j;
      gimple *vec_def_stmt;
      FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
        vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
    }
  else
    vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
}

/* Get N vectorized definitions for SLP_NODE. */

void
vect_get_slp_defs (vec_info *,
                   slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
{
  if (n == -1U)
    n = SLP_TREE_CHILDREN (slp_node).length ();

  for (unsigned i = 0; i < n; ++i)
    {
      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
      vec<tree> vec_defs = vNULL;
      vect_get_slp_defs (child, &vec_defs);
      vec_oprnds->quick_push (vec_defs);
    }
}

/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE. Store the number of vector
   permute instructions in *N_PERMS and the number of vector load
   instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
   that were not needed. */

bool
vect_transform_slp_perm_load (vec_info *vinfo,
                              slp_tree node, const vec<tree> &dr_chain,
                              gimple_stmt_iterator *gsi, poly_uint64 vf,
                              bool analyze_only, unsigned *n_perms,
                              unsigned int *n_loads, bool dce_chain)
{
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
  int vec_index = 0;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
  unsigned int mask_element;
  machine_mode mode;

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    return false;

  stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);

  mode = TYPE_MODE (vectype);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later. */
  if (! analyze_only)
    for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
         i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
      SLP_TREE_VEC_STMTS (node).quick_push (NULL);

  /* Generate permutation masks for every NODE. Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4. I.e., we have a
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}. */

  int vect_stmts_counter = 0;
  unsigned int index = 0;
  int first_vec_index = -1;
  int second_vec_index = -1;
  bool noop_p = true;
  *n_perms = 0;

  vec_perm_builder mask;
  unsigned int nelts_to_build;
  unsigned int nvectors_per_build;
  unsigned int in_nlanes;
  bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
                      && multiple_p (nunits, group_size));
  if (repeating_p)
    {
      /* A single vector contains a whole number of copies of the node, so:
         (a) all permutes can use the same mask; and
         (b) the permutes only need a single vector input. */
      mask.new_vector (nunits, group_size, 3);
      nelts_to_build = mask.encoded_nelts ();
      nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
      in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
    }
  else
    {
      /* We need to construct a separate mask for each vector statement. */
      unsigned HOST_WIDE_INT const_nunits, const_vf;
      if (!nunits.is_constant (&const_nunits)
          || !vf.is_constant (&const_vf))
        return false;
      mask.new_vector (const_nunits, const_nunits, 1);
      nelts_to_build = const_vf * group_size;
      nvectors_per_build = 1;
      in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
    }
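  /* A worked example of the repeating case, assuming a load permutation of
     { 1, 0 } with GROUP_SIZE == DR_GROUP_SIZE == 2: the loop below computes
     the encoded mask elements { 1, 0, 3, 2, 5, 4 }, i.e. two patterns of
     three elements each stepping by 2, which the stepped encoding extends
     to a full-vector mask swapping adjacent lanes. */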
  auto_sbitmap used_in_lanes (in_nlanes);
  bitmap_clear (used_in_lanes);
  auto_bitmap used_defs;

  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;

  for (unsigned int j = 0; j < nelts_to_build; j++)
    {
      unsigned int iter_num = j / group_size;
      unsigned int stmt_num = j % group_size;
      unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
                        + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
      bitmap_set_bit (used_in_lanes, i);
      if (repeating_p)
        {
          first_vec_index = 0;
          mask_element = i;
        }
      else
        {
          /* Enforced before the loop when !repeating_p. */
          unsigned int const_nunits = nunits.to_constant ();
          vec_index = i / const_nunits;
          mask_element = i % const_nunits;
          if (vec_index == first_vec_index
              || first_vec_index == -1)
            {
              first_vec_index = vec_index;
            }
          else if (vec_index == second_vec_index
                   || second_vec_index == -1)
            {
              second_vec_index = vec_index;
              mask_element += const_nunits;
            }
          else
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "permutation requires at "
                                 "least three vectors %G",
                                 stmt_info->stmt);
              gcc_assert (analyze_only);
              return false;
            }

          gcc_assert (mask_element < 2 * const_nunits);
        }

      if (mask_element != index)
        noop_p = false;
      mask[index++] = mask_element;

      if (index == count && !noop_p)
        {
          indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
          if (!can_vec_perm_const_p (mode, indices))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported vect permute { ");
                  for (i = 0; i < count; ++i)
                    {
                      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
                      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
                    }
                  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                }
              gcc_assert (analyze_only);
              return false;
            }

          ++*n_perms;
        }

      if (index == count)
        {
          if (!analyze_only)
            {
              tree mask_vec = NULL_TREE;

              if (! noop_p)
                mask_vec = vect_gen_perm_mask_checked (vectype, indices);

              if (second_vec_index == -1)
                second_vec_index = first_vec_index;

              for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
                {
                  /* Generate the permute statement if necessary. */
                  tree first_vec = dr_chain[first_vec_index + ri];
                  tree second_vec = dr_chain[second_vec_index + ri];
                  gimple *perm_stmt;
                  if (! noop_p)
                    {
                      gassign *stmt = as_a <gassign *> (stmt_info->stmt);
                      tree perm_dest
                        = vect_create_destination_var (gimple_assign_lhs (stmt),
                                                       vectype);
                      perm_dest = make_ssa_name (perm_dest);
                      perm_stmt
                        = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
                                               first_vec, second_vec,
                                               mask_vec);
                      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
                                                   gsi);
                      if (dce_chain)
                        {
                          bitmap_set_bit (used_defs, first_vec_index + ri);
                          bitmap_set_bit (used_defs, second_vec_index + ri);
                        }
                    }
                  else
                    {
                      /* If mask was NULL_TREE generate the requested
                         identity transform. */
                      perm_stmt = SSA_NAME_DEF_STMT (first_vec);
                      if (dce_chain)
                        bitmap_set_bit (used_defs, first_vec_index + ri);
                    }

                  /* Store the vector statement in NODE. */
                  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
                }
            }

          index = 0;
          first_vec_index = -1;
          second_vec_index = -1;
          noop_p = true;
        }
    }

  if (n_loads)
    {
      if (repeating_p)
        *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
      else
        {
          /* Enforced above when !repeating_p. */
          unsigned int const_nunits = nunits.to_constant ();
          *n_loads = 0;
          bool load_seen = false;
          for (unsigned i = 0; i < in_nlanes; ++i)
            {
              if (i % const_nunits == 0)
                {
                  if (load_seen)
                    *n_loads += 1;
                  load_seen = false;
                }
              if (bitmap_bit_p (used_in_lanes, i))
                load_seen = true;
            }
          if (load_seen)
            *n_loads += 1;
        }
    }

  if (dce_chain)
    for (unsigned i = 0; i < dr_chain.length (); ++i)
      if (!bitmap_bit_p (used_defs, i))
        {
          gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
          gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
          gsi_remove (&rgsi, true);
          release_defs (stmt);
        }

  return true;
}

/* Produce the next vector result for SLP permutation NODE by adding a vector
   statement at GSI. If MASK_VEC is nonnull, add:

   <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>

   otherwise add:

   <new SSA name> = FIRST_DEF. */

static void
vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
                          slp_tree node, tree first_def, tree second_def,
                          tree mask_vec)
{
  tree vectype = SLP_TREE_VECTYPE (node);

  /* ??? We SLP match existing vector element extracts but
     allow punning which we need to re-instantiate at uses
     but have no good way of explicitly representing. */
  if (!types_compatible_p (TREE_TYPE (first_def), vectype))
    {
      gassign *conv_stmt
        = gimple_build_assign (make_ssa_name (vectype),
                               build1 (VIEW_CONVERT_EXPR, vectype, first_def));
      vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
      first_def = gimple_assign_lhs (conv_stmt);
    }
  gassign *perm_stmt;
  tree perm_dest = make_ssa_name (vectype);
  if (mask_vec)
    {
      if (!types_compatible_p (TREE_TYPE (second_def), vectype))
        {
          gassign *conv_stmt
            = gimple_build_assign (make_ssa_name (vectype),
                                   build1 (VIEW_CONVERT_EXPR,
                                           vectype, second_def));
          vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
          second_def = gimple_assign_lhs (conv_stmt);
        }
      perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
                                       first_def, second_def,
                                       mask_vec);
    }
  else
    /* We need a copy here in case the def was external. */
    perm_stmt = gimple_build_assign (perm_dest, first_def);
  vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
  /* Store the vector statement in NODE. */
  SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
}

/* Vectorize the SLP permutations in NODE as specified
   in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
   child number and lane number.
   Interleaving of two two-lane two-child SLP subtrees (not supported):
     [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
   A blend of two four-lane two-child SLP subtrees:
     [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   Highpart of a four-lane one-child SLP subtree (not supported):
     [ { 0, 2 }, { 0, 3 } ]
   Currently only a subset of this is supported by the code generation
   below. */

static bool
vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
                              slp_tree node, stmt_vector_for_cost *cost_vec)
{
  tree vectype = SLP_TREE_VECTYPE (node);

  /* ??? We currently only support all same vector input and output types
     while the SLP IL should really do a concat + select and thus accept
     arbitrary mismatches. */
  slp_tree child;
  unsigned i;
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (!vect_maybe_update_slp_op_vectype (child, vectype)
          || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Unsupported lane permutation\n");
          return false;
        }
      if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
        repeating_p = false;
    }

  vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
  gcc_assert (perm.length () == SLP_TREE_LANES (node));
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "vectorizing permutation");
      for (unsigned i = 0; i < perm.length (); ++i)
        dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
      if (repeating_p)
        dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
      dump_printf (MSG_NOTE, "\n");
    }

  /* REPEATING_P is true if every output vector is guaranteed to use the
     same permute vector. We can handle that case for both variable-length
     and constant-length vectors, but we only handle other cases for
     constant-length vectors.

     Set:

     - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
       mask vector that we want to build.

     - NCOPIES to the number of copies of PERM that we need in order
       to build the necessary permute mask vectors.

     - NOUTPUTS_PER_MASK to the number of output vectors we want to create
       for each permute mask vector. This is only relevant when GSI is
       nonnull. */
  uint64_t npatterns;
  unsigned nelts_per_pattern;
  uint64_t ncopies;
  unsigned noutputs_per_mask;
  if (repeating_p)
    {
      /* We need a single permute mask vector that has the form:

         { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }

         In other words, the original n-element permute in PERM is
         "unrolled" to fill a full vector. The stepped vector encoding
         that we use for permutes requires 3n elements. */
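      /* For instance, with n == 2 the encoded elements are the six values
         { X1, X2, X1 + 2, X2 + 2, X1 + 4, X2 + 4 }: two patterns of three
         elements each, from which the stepped encoding extrapolates the
         mask to any number of lanes. */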
      npatterns = SLP_TREE_LANES (node);
      nelts_per_pattern = ncopies = 3;
      noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
    }
  else
    {
      /* Calculate every element of every permute mask vector explicitly,
         instead of relying on the pattern described above. */
      if (!nunits.is_constant (&npatterns))
        return false;
      nelts_per_pattern = ncopies = 1;
      if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
        if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
          return false;
      noutputs_per_mask = 1;
    }
  unsigned olanes = ncopies * SLP_TREE_LANES (node);
  gcc_assert (repeating_p || multiple_p (olanes, nunits));

  /* Compute the { { SLP operand, vector index }, lane } permutation sequence
     from the { SLP operand, scalar lane } permutation as recorded in the
     SLP node as an intermediate step. This part should already work
     with SLP children with an arbitrary number of lanes. */
  auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
  auto_vec<unsigned> active_lane;
  vperm.create (olanes);
  active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
  for (unsigned i = 0; i < ncopies; ++i)
    {
      for (unsigned pi = 0; pi < perm.length (); ++pi)
        {
          std::pair<unsigned, unsigned> p = perm[pi];
          tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
          if (repeating_p)
            vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
          else
            {
              /* We checked above that the vectors are constant-length. */
              unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
              unsigned vi = (active_lane[p.first] + p.second) / vnunits;
              unsigned vl = (active_lane[p.first] + p.second) % vnunits;
              vperm.quick_push ({{p.first, vi}, vl});
            }
        }
      /* Advance to the next group. */
      for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
        active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "as");
      for (unsigned i = 0; i < vperm.length (); ++i)
        {
          if (i != 0
              && (repeating_p
                  ? multiple_p (i, npatterns)
                  : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
            dump_printf (MSG_NOTE, ",");
          dump_printf (MSG_NOTE, " vops%u[%u][%u]",
                       vperm[i].first.first, vperm[i].first.second,
                       vperm[i].second);
        }
      dump_printf (MSG_NOTE, "\n");
    }

  /* We can only handle two-vector permutes; everything else should
     be lowered on the SLP level. The following is closely inspired
     by vect_transform_slp_perm_load and is supposed to eventually
     replace it.
     ??? As an intermediate step, do code-gen in the SLP tree
     representation somehow? */
  std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
  std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
  unsigned int index = 0;
  poly_uint64 mask_element;
  vec_perm_builder mask;
  mask.new_vector (nunits, npatterns, nelts_per_pattern);
  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;
  unsigned nperms = 0;
  for (unsigned i = 0; i < vperm.length (); ++i)
    {
      mask_element = vperm[i].second;
      if (first_vec.first == -1U
          || first_vec == vperm[i].first)
        first_vec = vperm[i].first;
      else if (second_vec.first == -1U
               || second_vec == vperm[i].first)
        {
          second_vec = vperm[i].first;
          mask_element += nunits;
        }
      else
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "permutation requires at "
                             "least three vectors\n");
          gcc_assert (!gsi);
          return false;
        }

      mask[index++] = mask_element;

      if (index == count)
        {
          indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
          bool identity_p = indices.series_p (0, 1, 0, 1);
          if (!identity_p
              && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
                                   "unsupported vect permute { ");
                  for (i = 0; i < count; ++i)
                    {
                      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
                      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
                    }
                  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
                }
              gcc_assert (!gsi);
              return false;
            }

          if (!identity_p)
            nperms++;
          if (gsi)
            {
              if (second_vec.first == -1U)
                second_vec = first_vec;

              slp_tree
                first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
                second_node = SLP_TREE_CHILDREN (node)[second_vec.first];

              tree mask_vec = NULL_TREE;
              if (!identity_p)
                mask_vec = vect_gen_perm_mask_checked (vectype, indices);

              for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
                {
                  tree first_def
                    = vect_get_slp_vect_def (first_node,
                                             first_vec.second + vi);
                  tree second_def
                    = vect_get_slp_vect_def (second_node,
                                             second_vec.second + vi);
                  vect_add_slp_permutation (vinfo, gsi, node, first_def,
                                            second_def, mask_vec);
                }
            }

          index = 0;
          first_vec = std::make_pair (-1U, -1U);
          second_vec = std::make_pair (-1U, -1U);
        }
    }

  if (!gsi)
    record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);

  return true;
}

/* Vectorize SLP NODE. */

static void
vect_schedule_slp_node (vec_info *vinfo,
                        slp_tree node, slp_instance instance)
{
  gimple_stmt_iterator si;
  int i;
  slp_tree child;

  /* For existing vectors there's nothing to do. */
  if (SLP_TREE_VEC_DEFS (node).exists ())
    return;

  gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());

  /* Vectorize externals and constants. */
  if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    {
      /* ??? vectorizable_shift can end up using a scalar operand which is
         currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
         node in this case. */
      if (!SLP_TREE_VECTYPE (node))
        return;

      vect_create_constant_vectors (vinfo, node);
      return;
    }

  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);

  gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
  SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "------>vectorizing SLP node starting from: %G",
                     stmt_info->stmt);

  if (STMT_VINFO_DATA_REF (stmt_info)
      && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
    {
      /* Vectorized loads go before the first scalar load to make it
         ready early; vectorized stores go before the last scalar stmt,
         which is where all uses are ready. */
      stmt_vec_info last_stmt_info = NULL;
      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
        last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
      else /* DR_IS_WRITE */
        last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
      si = gsi_for_stmt (last_stmt_info->stmt);
    }
  else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
            || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
            || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
           && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
    {
      /* For PHI node vectorization we do not use the insertion iterator. */
      si = gsi_none ();
    }
  else
    {
      /* Emit other stmts after the children's vectorized defs, which is
         the earliest insertion point possible. */
      gimple *last_stmt = NULL;
      bool seen_vector_def = false;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
        if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
          {
            /* For fold-left reductions we are retaining the scalar
               reduction PHI but we still have SLP_TREE_NUMBER_OF_VEC_STMTS
               set, so the representation isn't perfect. Resort to the
               last scalar def here. */
            if (SLP_TREE_VEC_STMTS (child).is_empty ())
              {
                gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
                            == cycle_phi_info_type);
                gphi *phi = as_a <gphi *>
                  (vect_find_last_scalar_stmt_in_slp (child)->stmt);
                if (!last_stmt
                    || vect_stmt_dominates_stmt_p (last_stmt, phi))
                  last_stmt = phi;
              }
            /* We are emitting all vectorized stmts in the same place and
               the last one is the last.
               ??? Unless we have a load permutation applied and that
               figures to re-use an earlier generated load. */
            unsigned j;
            gimple *vstmt;
            FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
              if (!last_stmt
                  || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
                last_stmt = vstmt;
          }
        else if (!SLP_TREE_VECTYPE (child))
          {
            /* For externals without a vector type, look at all the
               scalar defs. */
            unsigned j;
            tree def;
            FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
              if (TREE_CODE (def) == SSA_NAME
                  && !SSA_NAME_IS_DEFAULT_DEF (def))
                {
                  gimple *stmt = SSA_NAME_DEF_STMT (def);
                  if (!last_stmt
                      || vect_stmt_dominates_stmt_p (last_stmt, stmt))
                    last_stmt = stmt;
                }
          }
        else
          {
            /* For externals we have to look at all defs since their
               insertion place is decided per vector. But beware
               of pre-existing vectors where we need to make sure
               we do not insert before the region boundary. */
            if (SLP_TREE_SCALAR_OPS (child).is_empty ()
                && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
              seen_vector_def = true;
            else
              {
                unsigned j;
                tree vdef;
                FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
                  if (TREE_CODE (vdef) == SSA_NAME
                      && !SSA_NAME_IS_DEFAULT_DEF (vdef))
                    {
                      gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
                      if (!last_stmt
                          || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
                        last_stmt = vstmt;
                    }
              }
          }
      /* This can happen when all children are pre-existing vectors or
         constants. */
      if (!last_stmt)
        last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
      if (!last_stmt)
        {
          gcc_assert (seen_vector_def);
          si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
        }
      else if (is_ctrl_altering_stmt (last_stmt))
        {
          /* We split regions to vectorize at control altering stmts
             with a definition so this must be an external which
             we can insert at the start of the region. */
          si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
        }
      else if (is_a <bb_vec_info> (vinfo)
               && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
               && gimple_could_trap_p (stmt_info->stmt))
        {
          /* We've constrained possibly trapping operations to all come
             from the same basic-block; if vectorized defs would allow
             earlier scheduling, still force the vectorized stmts to the
             original block. This is only necessary for BB vectorization
             since for loop vect all operations are in a single BB and
             scalar stmt based placement doesn't play well with epilogue
             vectorization. */
          gcc_assert (dominated_by_p (CDI_DOMINATORS,
                                      gimple_bb (stmt_info->stmt),
                                      gimple_bb (last_stmt)));
          si = gsi_after_labels (gimple_bb (stmt_info->stmt));
        }
      else if (is_a <gphi *> (last_stmt))
        si = gsi_after_labels (gimple_bb (last_stmt));
      else
        {
          si = gsi_for_stmt (last_stmt);
          gsi_next (&si);
        }
    }

  /* Handle purely internal nodes. */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    {
      /* ??? The transform kind is stored to STMT_VINFO_TYPE which might
         be shared with different SLP nodes (but usually it's the same
         operation apart from the case the stmt is only there for denoting
         the actual scalar lane defs ...). So do not call vect_transform_stmt
         but open-code it here (partly). */
      bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
      gcc_assert (done);
      stmt_vec_info slp_stmt_info;
      unsigned int i;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
        if (STMT_VINFO_LIVE_P (slp_stmt_info))
          {
            done = vectorizable_live_operation (vinfo,
                                                slp_stmt_info, &si, node,
                                                instance, i, true, NULL);
            gcc_assert (done);
          }
    }
  else
    vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
}

/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   For loop vectorization this is done in vectorizable_call, but for SLP
   it needs to be deferred until end of vect_schedule_slp, because multiple
   SLP instances may refer to the same scalar stmt. */

static void
vect_remove_slp_scalar_calls (vec_info *vinfo,
                              slp_tree node, hash_set<slp_tree> &visited)
{
  gimple *new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (vinfo, child, visited);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
      if (!stmt || gimple_bb (stmt) == NULL)
        continue;
      if (is_pattern_stmt_p (stmt_info)
          || !PURE_SLP_STMT (stmt_info))
        continue;
      lhs = gimple_call_lhs (stmt);
      new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      gsi = gsi_for_stmt (stmt);
      vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
      SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
    }
}

static void
vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_remove_slp_scalar_calls (vinfo, node, visited);
}

/* Vectorize the instance root. */

void
vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
{
  gassign *rstmt = NULL;

  if (instance->kind == slp_inst_kind_ctor)
    {
      if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
        {
          gimple *child_stmt = SLP_TREE_VEC_STMTS (node)[0];
          tree vect_lhs = gimple_get_lhs (child_stmt);
          tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
          if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
                                          TREE_TYPE (vect_lhs)))
            vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
                               vect_lhs);
          rstmt = gimple_build_assign (root_lhs, vect_lhs);
        }
      else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
        {
          int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
          gimple *child_stmt;
          int j;
          vec<constructor_elt, va_gc> *v;
          vec_alloc (v, nelts);

          /* A CTOR can handle V16HI composition from VNx8HI, so we do
             not need to convert vector elements if the types do not
             match. */
          FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
            CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
                                    gimple_get_lhs (child_stmt));
          tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
          tree rtype
            = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
          tree r_constructor = build_constructor (rtype, v);
          rstmt = gimple_build_assign (lhs, r_constructor);
        }
    }
  else if (instance->kind == slp_inst_kind_bb_reduc)
    {
      /* Largely inspired by reduction chain epilogue handling in
         vect_create_epilog_for_reduction. */
      vec<tree> vec_defs = vNULL;
      vect_get_slp_defs (node, &vec_defs);
      enum tree_code reduc_code
        = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
      /* ??? We actually have to reflect signs somewhere. */
      if (reduc_code == MINUS_EXPR)
        reduc_code = PLUS_EXPR;
      gimple_seq epilogue = NULL;
      /* We may end up with more than one vector result; reduce them
         to one vector. */
      tree vec_def = vec_defs[0];
      for (unsigned i = 1; i < vec_defs.length (); ++i)
        vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
                                vec_def, vec_defs[i]);
      vec_defs.release ();
      /* ??? Support other schemes than direct internal fn. */
      internal_fn reduc_fn;
      if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
          || reduc_fn == IFN_LAST)
        gcc_unreachable ();
      tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
                                      TREE_TYPE (TREE_TYPE (vec_def)), vec_def);

      gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
      gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
      gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
      update_stmt (gsi_stmt (rgsi));
      return;
    }
  else
    gcc_unreachable ();

  gcc_assert (rstmt);

  gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
  gsi_replace (&rgsi, rstmt, true);
}

struct slp_scc_info
{
  bool on_stack;
  int dfs;
  int lowlink;
};

/* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
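
/* This is in essence Tarjan's SCC algorithm: DFS and LOWLINK in
   slp_scc_info are the classic discovery index and low-link values, and
   a node closes an SCC once LOWLINK == DFS; the twist is that completed
   SCCs are scheduled immediately rather than merely recorded. */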

static void
vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
                   hash_map<slp_tree, slp_scc_info> &scc_info,
                   int &maxdfs, vec<slp_tree> &stack)
{
  bool existed_p;
  slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
  gcc_assert (!existed_p);
  info->dfs = maxdfs;
  info->lowlink = maxdfs;
  maxdfs++;

  /* Leaf. */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    {
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      return;
    }

  info->on_stack = true;
  stack.safe_push (node);

  unsigned i;
  slp_tree child;
  /* DFS recurse. */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (!child)
        continue;
      slp_scc_info *child_info = scc_info.get (child);
      if (!child_info)
        {
          vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
          /* Recursion might have re-allocated the node. */
          info = scc_info.get (node);
          child_info = scc_info.get (child);
          info->lowlink = MIN (info->lowlink, child_info->lowlink);
        }
      else if (child_info->on_stack)
        info->lowlink = MIN (info->lowlink, child_info->dfs);
    }
  if (info->lowlink != info->dfs)
    return;

  auto_vec<slp_tree, 4> phis_to_fixup;

  /* Singleton. */
  if (stack.last () == node)
    {
      stack.pop ();
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
          && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
        phis_to_fixup.quick_push (node);
    }
  else
    {
      /* SCC. */
      int last_idx = stack.length () - 1;
      while (stack[last_idx] != node)
        last_idx--;
      /* We can break the cycle at PHIs which have at least one child
         already code generated. Then we could re-start the DFS walk
         until all nodes in the SCC are covered (we might have new
         entries for only back-reachable nodes). But it's simpler to
         just iterate and schedule those that are ready. */
      unsigned todo = stack.length () - last_idx;
      do
        {
          for (int idx = stack.length () - 1; idx >= last_idx; --idx)
            {
              slp_tree entry = stack[idx];
              if (!entry)
                continue;
              bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
                          && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
              bool ready = !phi;
              FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
                if (!child)
                  {
                    gcc_assert (phi);
                    ready = true;
                    break;
                  }
                else if (scc_info.get (child)->on_stack)
                  {
                    if (!phi)
                      {
                        ready = false;
                        break;
                      }
                  }
                else
                  {
                    if (phi)
                      {
                        ready = true;
                        break;
                      }
                  }
              if (ready)
                {
                  vect_schedule_slp_node (vinfo, entry, instance);
                  scc_info.get (entry)->on_stack = false;
                  stack[idx] = NULL;
                  todo--;
                  if (phi)
                    phis_to_fixup.safe_push (entry);
                }
            }
        }
      while (todo != 0);

      /* Pop the SCC. */
      stack.truncate (last_idx);
    }

  /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
  slp_tree phi_node;
  FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
    {
      gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
      edge_iterator ei;
      edge e;
      FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
        {
          unsigned dest_idx = e->dest_idx;
          child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
          if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
            continue;
          /* Simply fill all args. */
          for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
            add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
                         vect_get_slp_vect_def (child, i),
                         e, gimple_phi_arg_location (phi, dest_idx));
        }
    }
}

/* Generate vector code for SLP_INSTANCES in the loop/basic block. */

void
vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
{
  slp_instance instance;
  unsigned int i;

  hash_map<slp_tree, slp_scc_info> scc_info;
  int maxdfs = 0;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree node = SLP_INSTANCE_TREE (instance);
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Vectorizing SLP tree:\n");
          /* ??? Dump all? */
          if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
            dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
                             SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
          vect_print_slp_graph (MSG_NOTE, vect_location,
                                SLP_INSTANCE_TREE (instance));
        }
      /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
         have a PHI be the node breaking the cycle. */
      auto_vec<slp_tree> stack;
      if (!scc_info.get (node))
        vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);

      if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
        vectorize_slp_instance_root_stmt (node, instance);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      stmt_vec_info store_info;
      unsigned int j;

      /* Remove scalar call stmts. Do not do this for basic-block
         vectorization as not all uses may be vectorized.
         ??? Why should this be necessary? DCE should be able to
         remove the stmts itself.
         ??? For BB vectorization we can as well remove scalar
         stmts starting from the SLP tree root if they have no
         uses. */
      if (is_a <loop_vec_info> (vinfo))
        vect_remove_slp_scalar_calls (vinfo, root);

      /* Remove the original scalar stmts of vectorized stores. */
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
        {
          if (!STMT_VINFO_DATA_REF (store_info)
              || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
            break;

          store_info = vect_orig_stmt (store_info);
          /* Free the attached stmt_vec_info and remove the stmt. */
          vinfo->remove_stmt (store_info);

          /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
             so as not to crash in vect_free_slp_tree later. */
          if (SLP_TREE_REPRESENTATIVE (root) == store_info)
            SLP_TREE_REPRESENTATIVE (root) = NULL;
        }
    }
}