1 /* SLP - Basic Block Vectorization
2    Copyright (C) 2007-2022 Free Software Foundation, Inc.
3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
4    and Ira Rosen <irar@il.ibm.com>
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "tree-pass.h"
31 #include "ssa.h"
32 #include "optabs-tree.h"
33 #include "insn-config.h"
34 #include "recog.h"		/* FIXME: for insn_data */
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "gimple-iterator.h"
38 #include "cfgloop.h"
39 #include "tree-vectorizer.h"
40 #include "langhooks.h"
41 #include "gimple-walk.h"
42 #include "dbgcnt.h"
43 #include "tree-vector-builder.h"
44 #include "vec-perm-indices.h"
45 #include "gimple-fold.h"
46 #include "internal-fn.h"
47 #include "dump-context.h"
48 #include "cfganal.h"
49 #include "tree-eh.h"
50 #include "tree-cfg.h"
51 #include "alloc-pool.h"
52 
53 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
54 					  slp_tree, stmt_vector_for_cost *);
55 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
56 
57 static object_allocator<_slp_tree> *slp_tree_pool;
58 static slp_tree slp_first_node;
59 
60 void
vect_slp_init(void)61 vect_slp_init (void)
62 {
63   slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
64 }
65 
66 void
vect_slp_fini(void)67 vect_slp_fini (void)
68 {
69   while (slp_first_node)
70     delete slp_first_node;
71   delete slp_tree_pool;
72   slp_tree_pool = NULL;
73 }
74 
75 void *
operator new(size_t n)76 _slp_tree::operator new (size_t n)
77 {
78   gcc_assert (n == sizeof (_slp_tree));
79   return slp_tree_pool->allocate_raw ();
80 }
81 
82 void
operator delete(void * node,size_t n)83 _slp_tree::operator delete (void *node, size_t n)
84 {
85   gcc_assert (n == sizeof (_slp_tree));
86   slp_tree_pool->remove_raw (node);
87 }
88 
89 
90 /* Initialize a SLP node.  */
91 
_slp_tree()92 _slp_tree::_slp_tree ()
93 {
94   this->prev_node = NULL;
95   if (slp_first_node)
96     slp_first_node->prev_node = this;
97   this->next_node = slp_first_node;
98   slp_first_node = this;
99   SLP_TREE_SCALAR_STMTS (this) = vNULL;
100   SLP_TREE_SCALAR_OPS (this) = vNULL;
101   SLP_TREE_VEC_STMTS (this) = vNULL;
102   SLP_TREE_VEC_DEFS (this) = vNULL;
103   SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
104   SLP_TREE_CHILDREN (this) = vNULL;
105   SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
106   SLP_TREE_LANE_PERMUTATION (this) = vNULL;
107   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
108   SLP_TREE_CODE (this) = ERROR_MARK;
109   SLP_TREE_VECTYPE (this) = NULL_TREE;
110   SLP_TREE_REPRESENTATIVE (this) = NULL;
111   SLP_TREE_REF_COUNT (this) = 1;
112   this->failed = NULL;
113   this->max_nunits = 1;
114   this->lanes = 0;
115 }
116 
117 /* Tear down a SLP node.  */
118 
~_slp_tree()119 _slp_tree::~_slp_tree ()
120 {
121   if (this->prev_node)
122     this->prev_node->next_node = this->next_node;
123   else
124     slp_first_node = this->next_node;
125   if (this->next_node)
126     this->next_node->prev_node = this->prev_node;
127   SLP_TREE_CHILDREN (this).release ();
128   SLP_TREE_SCALAR_STMTS (this).release ();
129   SLP_TREE_SCALAR_OPS (this).release ();
130   SLP_TREE_VEC_STMTS (this).release ();
131   SLP_TREE_VEC_DEFS (this).release ();
132   SLP_TREE_LOAD_PERMUTATION (this).release ();
133   SLP_TREE_LANE_PERMUTATION (this).release ();
134   if (this->failed)
135     free (failed);
136 }
137 
138 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
139 
140 void
vect_free_slp_tree(slp_tree node)141 vect_free_slp_tree (slp_tree node)
142 {
143   int i;
144   slp_tree child;
145 
146   if (--SLP_TREE_REF_COUNT (node) != 0)
147     return;
148 
149   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
150     if (child)
151       vect_free_slp_tree (child);
152 
153   /* If the node defines any SLP only patterns then those patterns are no
154      longer valid and should be removed.  */
155   stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
156   if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
157     {
158       stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
159       STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
160       STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
161     }
162 
163   delete node;
164 }
165 
166 /* Return a location suitable for dumpings related to the SLP instance.  */
167 
168 dump_user_location_t
location() const169 _slp_instance::location () const
170 {
171   if (!root_stmts.is_empty ())
172     return root_stmts[0]->stmt;
173   else
174     return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
175 }
176 
177 
178 /* Free the memory allocated for the SLP instance.  */
179 
180 void
vect_free_slp_instance(slp_instance instance)181 vect_free_slp_instance (slp_instance instance)
182 {
183   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
184   SLP_INSTANCE_LOADS (instance).release ();
185   SLP_INSTANCE_ROOT_STMTS (instance).release ();
186   instance->subgraph_entries.release ();
187   instance->cost_vec.release ();
188   free (instance);
189 }
190 
191 
192 /* Create an SLP node for SCALAR_STMTS.  */
193 
194 slp_tree
vect_create_new_slp_node(unsigned nops,tree_code code)195 vect_create_new_slp_node (unsigned nops, tree_code code)
196 {
197   slp_tree node = new _slp_tree;
198   SLP_TREE_SCALAR_STMTS (node) = vNULL;
199   SLP_TREE_CHILDREN (node).create (nops);
200   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
201   SLP_TREE_CODE (node) = code;
202   return node;
203 }
204 /* Create an SLP node for SCALAR_STMTS.  */
205 
206 static slp_tree
vect_create_new_slp_node(slp_tree node,vec<stmt_vec_info> scalar_stmts,unsigned nops)207 vect_create_new_slp_node (slp_tree node,
208 			  vec<stmt_vec_info> scalar_stmts, unsigned nops)
209 {
210   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
211   SLP_TREE_CHILDREN (node).create (nops);
212   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
213   SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
214   SLP_TREE_LANES (node) = scalar_stmts.length ();
215   return node;
216 }
217 
218 /* Create an SLP node for SCALAR_STMTS.  */
219 
220 static slp_tree
vect_create_new_slp_node(vec<stmt_vec_info> scalar_stmts,unsigned nops)221 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
222 {
223   return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
224 }
225 
226 /* Create an SLP node for OPS.  */
227 
228 static slp_tree
vect_create_new_slp_node(slp_tree node,vec<tree> ops)229 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
230 {
231   SLP_TREE_SCALAR_OPS (node) = ops;
232   SLP_TREE_DEF_TYPE (node) = vect_external_def;
233   SLP_TREE_LANES (node) = ops.length ();
234   return node;
235 }
236 
237 /* Create an SLP node for OPS.  */
238 
239 static slp_tree
vect_create_new_slp_node(vec<tree> ops)240 vect_create_new_slp_node (vec<tree> ops)
241 {
242   return vect_create_new_slp_node (new _slp_tree, ops);
243 }
244 
245 
246 /* This structure is used in creation of an SLP tree.  Each instance
247    corresponds to the same operand in a group of scalar stmts in an SLP
248    node.  */
249 typedef struct _slp_oprnd_info
250 {
251   /* Def-stmts for the operands.  */
252   vec<stmt_vec_info> def_stmts;
253   /* Operands.  */
254   vec<tree> ops;
255   /* Information about the first statement, its vector def-type, type, the
256      operand itself in case it's constant, and an indication if it's a pattern
257      stmt.  */
258   tree first_op_type;
259   enum vect_def_type first_dt;
260   bool any_pattern;
261 } *slp_oprnd_info;
262 
263 
264 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
265    operand.  */
266 static vec<slp_oprnd_info>
vect_create_oprnd_info(int nops,int group_size)267 vect_create_oprnd_info (int nops, int group_size)
268 {
269   int i;
270   slp_oprnd_info oprnd_info;
271   vec<slp_oprnd_info> oprnds_info;
272 
273   oprnds_info.create (nops);
274   for (i = 0; i < nops; i++)
275     {
276       oprnd_info = XNEW (struct _slp_oprnd_info);
277       oprnd_info->def_stmts.create (group_size);
278       oprnd_info->ops.create (group_size);
279       oprnd_info->first_dt = vect_uninitialized_def;
280       oprnd_info->first_op_type = NULL_TREE;
281       oprnd_info->any_pattern = false;
282       oprnds_info.quick_push (oprnd_info);
283     }
284 
285   return oprnds_info;
286 }
287 
288 
289 /* Free operands info.  */
290 
291 static void
vect_free_oprnd_info(vec<slp_oprnd_info> & oprnds_info)292 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
293 {
294   int i;
295   slp_oprnd_info oprnd_info;
296 
297   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
298     {
299       oprnd_info->def_stmts.release ();
300       oprnd_info->ops.release ();
301       XDELETE (oprnd_info);
302     }
303 
304   oprnds_info.release ();
305 }
306 
307 
308 /* Return true if STMTS contains a pattern statement.  */
309 
310 static bool
vect_contains_pattern_stmt_p(vec<stmt_vec_info> stmts)311 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
312 {
313   stmt_vec_info stmt_info;
314   unsigned int i;
315   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
316     if (is_pattern_stmt_p (stmt_info))
317       return true;
318   return false;
319 }
320 
321 /* Return true when all lanes in the external or constant NODE have
322    the same value.  */
323 
324 static bool
vect_slp_tree_uniform_p(slp_tree node)325 vect_slp_tree_uniform_p (slp_tree node)
326 {
327   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
328 	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);
329 
330   /* Pre-exsting vectors.  */
331   if (SLP_TREE_SCALAR_OPS (node).is_empty ())
332     return false;
333 
334   unsigned i;
335   tree op, first = NULL_TREE;
336   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
337     if (!first)
338       first = op;
339     else if (!operand_equal_p (first, op, 0))
340       return false;
341 
342   return true;
343 }
344 
345 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
346    that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
347    of the chain.  */
348 
349 int
vect_get_place_in_interleaving_chain(stmt_vec_info stmt_info,stmt_vec_info first_stmt_info)350 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
351 				      stmt_vec_info first_stmt_info)
352 {
353   stmt_vec_info next_stmt_info = first_stmt_info;
354   int result = 0;
355 
356   if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
357     return -1;
358 
359   do
360     {
361       if (next_stmt_info == stmt_info)
362 	return result;
363       next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
364       if (next_stmt_info)
365 	result += DR_GROUP_GAP (next_stmt_info);
366     }
367   while (next_stmt_info);
368 
369   return -1;
370 }
371 
372 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
373    using the method implemented by duplicate_and_interleave.  Return true
374    if so, returning the number of intermediate vectors in *NVECTORS_OUT
375    (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
376    (if nonnull).  */
377 
378 bool
can_duplicate_and_interleave_p(vec_info * vinfo,unsigned int count,tree elt_type,unsigned int * nvectors_out,tree * vector_type_out,tree * permutes)379 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
380 				tree elt_type, unsigned int *nvectors_out,
381 				tree *vector_type_out,
382 				tree *permutes)
383 {
384   tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
385   if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
386     return false;
387 
388   machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
389   poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
390   unsigned int nvectors = 1;
391   for (;;)
392     {
393       scalar_int_mode int_mode;
394       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
395       if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
396 	{
397 	  /* Get the natural vector type for this SLP group size.  */
398 	  tree int_type = build_nonstandard_integer_type
399 	    (GET_MODE_BITSIZE (int_mode), 1);
400 	  tree vector_type
401 	    = get_vectype_for_scalar_type (vinfo, int_type, count);
402 	  if (vector_type
403 	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
404 	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
405 			   GET_MODE_SIZE (base_vector_mode)))
406 	    {
407 	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
408 		 together into elements of type INT_TYPE and using the result
409 		 to build NVECTORS vectors.  */
410 	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
411 	      vec_perm_builder sel1 (nelts, 2, 3);
412 	      vec_perm_builder sel2 (nelts, 2, 3);
413 	      poly_int64 half_nelts = exact_div (nelts, 2);
414 	      for (unsigned int i = 0; i < 3; ++i)
415 		{
416 		  sel1.quick_push (i);
417 		  sel1.quick_push (i + nelts);
418 		  sel2.quick_push (half_nelts + i);
419 		  sel2.quick_push (half_nelts + i + nelts);
420 		}
421 	      vec_perm_indices indices1 (sel1, 2, nelts);
422 	      vec_perm_indices indices2 (sel2, 2, nelts);
423 	      if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
424 		  && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
425 		{
426 		  if (nvectors_out)
427 		    *nvectors_out = nvectors;
428 		  if (vector_type_out)
429 		    *vector_type_out = vector_type;
430 		  if (permutes)
431 		    {
432 		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
433 								indices1);
434 		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
435 								indices2);
436 		    }
437 		  return true;
438 		}
439 	    }
440 	}
441       if (!multiple_p (elt_bytes, 2, &elt_bytes))
442 	return false;
443       nvectors *= 2;
444     }
445 }
446 
447 /* Return true if DTA and DTB match.  */
448 
449 static bool
vect_def_types_match(enum vect_def_type dta,enum vect_def_type dtb)450 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
451 {
452   return (dta == dtb
453 	  || ((dta == vect_external_def || dta == vect_constant_def)
454 	      && (dtb == vect_external_def || dtb == vect_constant_def)));
455 }
456 
457 static const int cond_expr_maps[3][5] = {
458   { 4, -1, -2, 1, 2 },
459   { 4, -2, -1, 1, 2 },
460   { 4, -1, -2, 2, 1 }
461 };
462 static const int arg1_map[] = { 1, 1 };
463 static const int arg2_map[] = { 1, 2 };
464 static const int arg1_arg4_map[] = { 2, 1, 4 };
465 
466 /* For most SLP statements, there is a one-to-one mapping between
467    gimple arguments and child nodes.  If that is not true for STMT,
468    return an array that contains:
469 
470    - the number of child nodes, followed by
471    - for each child node, the index of the argument associated with that node.
472      The special index -1 is the first operand of an embedded comparison and
473      the special index -2 is the second operand of an embedded comparison.
474 
475    SWAP is as for vect_get_and_check_slp_defs.  */
476 
477 static const int *
vect_get_operand_map(const gimple * stmt,unsigned char swap=0)478 vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
479 {
480   if (auto assign = dyn_cast<const gassign *> (stmt))
481     {
482       if (gimple_assign_rhs_code (assign) == COND_EXPR
483 	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
484 	return cond_expr_maps[swap];
485     }
486   gcc_assert (!swap);
487   if (auto call = dyn_cast<const gcall *> (stmt))
488     {
489       if (gimple_call_internal_p (call))
490 	switch (gimple_call_internal_fn (call))
491 	  {
492 	  case IFN_MASK_LOAD:
493 	    return arg2_map;
494 
495 	  case IFN_GATHER_LOAD:
496 	    return arg1_map;
497 
498 	  case IFN_MASK_GATHER_LOAD:
499 	    return arg1_arg4_map;
500 
501 	  default:
502 	    break;
503 	  }
504     }
505   return nullptr;
506 }
507 
508 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
509    they are of a valid type and that they match the defs of the first stmt of
510    the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
511    by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
512    indicates swap is required for cond_expr stmts.  Specifically, SWAP
513    is 1 if STMT is cond and operands of comparison need to be swapped;
514    SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
515 
516    If there was a fatal error return -1; if the error could be corrected by
517    swapping operands of father node of this one, return 1; if everything is
518    ok return 0.  */
519 static int
vect_get_and_check_slp_defs(vec_info * vinfo,unsigned char swap,bool * skip_args,vec<stmt_vec_info> stmts,unsigned stmt_num,vec<slp_oprnd_info> * oprnds_info)520 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
521 			     bool *skip_args,
522 			     vec<stmt_vec_info> stmts, unsigned stmt_num,
523 			     vec<slp_oprnd_info> *oprnds_info)
524 {
525   stmt_vec_info stmt_info = stmts[stmt_num];
526   tree oprnd;
527   unsigned int i, number_of_oprnds;
528   enum vect_def_type dt = vect_uninitialized_def;
529   slp_oprnd_info oprnd_info;
530   unsigned int commutative_op = -1U;
531   bool first = stmt_num == 0;
532 
533   if (!is_a<gcall *> (stmt_info->stmt)
534       && !is_a<gassign *> (stmt_info->stmt)
535       && !is_a<gphi *> (stmt_info->stmt))
536     return -1;
537 
538   number_of_oprnds = gimple_num_args (stmt_info->stmt);
539   const int *map = vect_get_operand_map (stmt_info->stmt, swap);
540   if (map)
541     number_of_oprnds = *map++;
542   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
543     {
544       if (gimple_call_internal_p (stmt))
545 	{
546 	  internal_fn ifn = gimple_call_internal_fn (stmt);
547 	  commutative_op = first_commutative_argument (ifn);
548 	}
549     }
550   else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
551     {
552       if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
553 	commutative_op = 0;
554     }
555 
556   bool swapped = (swap != 0);
557   bool backedge = false;
558   enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
559   for (i = 0; i < number_of_oprnds; i++)
560     {
561       int opno = map ? map[i] : int (i);
562       if (opno < 0)
563 	oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
564       else
565 	{
566 	  oprnd = gimple_arg (stmt_info->stmt, opno);
567 	  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
568 	    backedge = dominated_by_p (CDI_DOMINATORS,
569 				       gimple_phi_arg_edge (stmt, opno)->src,
570 				       gimple_bb (stmt_info->stmt));
571 	}
572       if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
573 	oprnd = TREE_OPERAND (oprnd, 0);
574 
575       oprnd_info = (*oprnds_info)[i];
576 
577       stmt_vec_info def_stmt_info;
578       if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
579 	{
580 	  if (dump_enabled_p ())
581 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
582 			     "Build SLP failed: can't analyze def for %T\n",
583 			     oprnd);
584 
585 	  return -1;
586 	}
587 
588       if (skip_args[i])
589 	{
590 	  oprnd_info->def_stmts.quick_push (NULL);
591 	  oprnd_info->ops.quick_push (NULL_TREE);
592 	  oprnd_info->first_dt = vect_uninitialized_def;
593 	  continue;
594 	}
595 
596       oprnd_info->def_stmts.quick_push (def_stmt_info);
597       oprnd_info->ops.quick_push (oprnd);
598 
599       if (def_stmt_info
600 	  && is_pattern_stmt_p (def_stmt_info))
601 	{
602 	  if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
603 	      != def_stmt_info)
604 	    oprnd_info->any_pattern = true;
605 	  else
606 	    /* If we promote this to external use the original stmt def.  */
607 	    oprnd_info->ops.last ()
608 	      = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
609 	}
610 
611       /* If there's a extern def on a backedge make sure we can
612 	 code-generate at the region start.
613 	 ???  This is another case that could be fixed by adjusting
614 	 how we split the function but at the moment we'd have conflicting
615 	 goals there.  */
616       if (backedge
617 	  && dts[i] == vect_external_def
618 	  && is_a <bb_vec_info> (vinfo)
619 	  && TREE_CODE (oprnd) == SSA_NAME
620 	  && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
621 	  && !dominated_by_p (CDI_DOMINATORS,
622 			      as_a <bb_vec_info> (vinfo)->bbs[0],
623 			      gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
624 	{
625 	  if (dump_enabled_p ())
626 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627 			     "Build SLP failed: extern def %T only defined "
628 			     "on backedge\n", oprnd);
629 	  return -1;
630 	}
631 
632       if (first)
633 	{
634 	  tree type = TREE_TYPE (oprnd);
635 	  dt = dts[i];
636 	  if ((dt == vect_constant_def
637 	       || dt == vect_external_def)
638 	      && !GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
639 	      && (TREE_CODE (type) == BOOLEAN_TYPE
640 		  || !can_duplicate_and_interleave_p (vinfo, stmts.length (),
641 						      type)))
642 	    {
643 	      if (dump_enabled_p ())
644 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
645 				 "Build SLP failed: invalid type of def "
646 				 "for variable-length SLP %T\n", oprnd);
647 	      return -1;
648 	    }
649 
650 	  /* For the swapping logic below force vect_reduction_def
651 	     for the reduction op in a SLP reduction group.  */
652 	  if (!STMT_VINFO_DATA_REF (stmt_info)
653 	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
654 	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
655 	      && def_stmt_info)
656 	    dts[i] = dt = vect_reduction_def;
657 
658 	  /* Check the types of the definition.  */
659 	  switch (dt)
660 	    {
661 	    case vect_external_def:
662 	    case vect_constant_def:
663 	    case vect_internal_def:
664 	    case vect_reduction_def:
665 	    case vect_induction_def:
666 	    case vect_nested_cycle:
667 	      break;
668 
669 	    default:
670 	      /* FORNOW: Not supported.  */
671 	      if (dump_enabled_p ())
672 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
673 				 "Build SLP failed: illegal type of def %T\n",
674 				 oprnd);
675 	      return -1;
676 	    }
677 
678 	  oprnd_info->first_dt = dt;
679 	  oprnd_info->first_op_type = type;
680 	}
681     }
682   if (first)
683     return 0;
684 
685   /* Now match the operand definition types to that of the first stmt.  */
686   for (i = 0; i < number_of_oprnds;)
687     {
688       if (skip_args[i])
689 	{
690 	  ++i;
691 	  continue;
692 	}
693 
694       oprnd_info = (*oprnds_info)[i];
695       dt = dts[i];
696       stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
697       oprnd = oprnd_info->ops[stmt_num];
698       tree type = TREE_TYPE (oprnd);
699 
700       if (!types_compatible_p (oprnd_info->first_op_type, type))
701 	{
702 	  if (dump_enabled_p ())
703 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
704 			     "Build SLP failed: different operand types\n");
705 	  return 1;
706 	}
707 
708       /* Not first stmt of the group, check that the def-stmt/s match
709 	 the def-stmt/s of the first stmt.  Allow different definition
710 	 types for reduction chains: the first stmt must be a
711 	 vect_reduction_def (a phi node), and the rest
712 	 end in the reduction chain.  */
713       if ((!vect_def_types_match (oprnd_info->first_dt, dt)
714 	   && !(oprnd_info->first_dt == vect_reduction_def
715 		&& !STMT_VINFO_DATA_REF (stmt_info)
716 		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
717 		&& def_stmt_info
718 		&& !STMT_VINFO_DATA_REF (def_stmt_info)
719 		&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
720 		    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
721 	  || (!STMT_VINFO_DATA_REF (stmt_info)
722 	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
723 	      && ((!def_stmt_info
724 		   || STMT_VINFO_DATA_REF (def_stmt_info)
725 		   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
726 		       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
727 		  != (oprnd_info->first_dt != vect_reduction_def))))
728 	{
729 	  /* Try swapping operands if we got a mismatch.  For BB
730 	     vectorization only in case it will clearly improve things.  */
731 	  if (i == commutative_op && !swapped
732 	      && (!is_a <bb_vec_info> (vinfo)
733 		  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
734 					     dts[i+1])
735 		      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
736 			  || vect_def_types_match
737 			       ((*oprnds_info)[i+1]->first_dt, dts[i])))))
738 	    {
739 	      if (dump_enabled_p ())
740 		dump_printf_loc (MSG_NOTE, vect_location,
741 				 "trying swapped operands\n");
742 	      std::swap (dts[i], dts[i+1]);
743 	      std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
744 			 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
745 	      std::swap ((*oprnds_info)[i]->ops[stmt_num],
746 			 (*oprnds_info)[i+1]->ops[stmt_num]);
747 	      swapped = true;
748 	      continue;
749 	    }
750 
751 	  if (is_a <bb_vec_info> (vinfo)
752 	      && !oprnd_info->any_pattern)
753 	    {
754 	      /* Now for commutative ops we should see whether we can
755 		 make the other operand matching.  */
756 	      if (dump_enabled_p ())
757 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 				 "treating operand as external\n");
759 	      oprnd_info->first_dt = dt = vect_external_def;
760 	    }
761 	  else
762 	    {
763 	      if (dump_enabled_p ())
764 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
765 				 "Build SLP failed: different types\n");
766 	      return 1;
767 	    }
768 	}
769 
770       /* Make sure to demote the overall operand to external.  */
771       if (dt == vect_external_def)
772 	oprnd_info->first_dt = vect_external_def;
773       /* For a SLP reduction chain we want to duplicate the reduction to
774 	 each of the chain members.  That gets us a sane SLP graph (still
775 	 the stmts are not 100% correct wrt the initial values).  */
776       else if ((dt == vect_internal_def
777 		|| dt == vect_reduction_def)
778 	       && oprnd_info->first_dt == vect_reduction_def
779 	       && !STMT_VINFO_DATA_REF (stmt_info)
780 	       && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
781 	       && !STMT_VINFO_DATA_REF (def_stmt_info)
782 	       && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
783 		   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
784 	{
785 	  oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
786 	  oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
787 	}
788 
789       ++i;
790     }
791 
792   /* Swap operands.  */
793   if (swapped)
794     {
795       if (dump_enabled_p ())
796 	dump_printf_loc (MSG_NOTE, vect_location,
797 			 "swapped operands to match def types in %G",
798 			 stmt_info->stmt);
799     }
800 
801   return 0;
802 }
803 
804 /* Return true if call statements CALL1 and CALL2 are similar enough
805    to be combined into the same SLP group.  */
806 
807 bool
compatible_calls_p(gcall * call1,gcall * call2)808 compatible_calls_p (gcall *call1, gcall *call2)
809 {
810   unsigned int nargs = gimple_call_num_args (call1);
811   if (nargs != gimple_call_num_args (call2))
812     return false;
813 
814   if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
815     return false;
816 
817   if (gimple_call_internal_p (call1))
818     {
819       if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
820 			       TREE_TYPE (gimple_call_lhs (call2))))
821 	return false;
822       for (unsigned int i = 0; i < nargs; ++i)
823 	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
824 				 TREE_TYPE (gimple_call_arg (call2, i))))
825 	  return false;
826     }
827   else
828     {
829       if (!operand_equal_p (gimple_call_fn (call1),
830 			    gimple_call_fn (call2), 0))
831 	return false;
832 
833       if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
834 	return false;
835     }
836 
837   /* Check that any unvectorized arguments are equal.  */
838   if (const int *map = vect_get_operand_map (call1))
839     {
840       unsigned int nkept = *map++;
841       unsigned int mapi = 0;
842       for (unsigned int i = 0; i < nargs; ++i)
843 	if (mapi < nkept && map[mapi] == int (i))
844 	  mapi += 1;
845 	else if (!operand_equal_p (gimple_call_arg (call1, i),
846 				   gimple_call_arg (call2, i)))
847 	  return false;
848     }
849 
850   return true;
851 }
852 
853 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
854    caller's attempt to find the vector type in STMT_INFO with the narrowest
855    element type.  Return true if VECTYPE is nonnull and if it is valid
856    for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
857    number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
858    vect_build_slp_tree.  */
859 
860 static bool
vect_record_max_nunits(vec_info * vinfo,stmt_vec_info stmt_info,unsigned int group_size,tree vectype,poly_uint64 * max_nunits)861 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
862 			unsigned int group_size,
863 			tree vectype, poly_uint64 *max_nunits)
864 {
865   if (!vectype)
866     {
867       if (dump_enabled_p ())
868 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
869 			 "Build SLP failed: unsupported data-type in %G\n",
870 			 stmt_info->stmt);
871       /* Fatal mismatch.  */
872       return false;
873     }
874 
875   /* If populating the vector type requires unrolling then fail
876      before adjusting *max_nunits for basic-block vectorization.  */
877   if (is_a <bb_vec_info> (vinfo)
878       && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
879     {
880       if (dump_enabled_p ())
881 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
882 			 "Build SLP failed: unrolling required "
883 			 "in basic block SLP\n");
884       /* Fatal mismatch.  */
885       return false;
886     }
887 
888   /* In case of multiple types we need to detect the smallest type.  */
889   vect_update_max_nunits (max_nunits, vectype);
890   return true;
891 }
892 
893 /* Verify if the scalar stmts STMTS are isomorphic, require data
894    permutation or are of unsupported types of operation.  Return
895    true if they are, otherwise return false and indicate in *MATCHES
896    which stmts are not isomorphic to the first one.  If MATCHES[0]
897    is false then this indicates the comparison could not be
898    carried out or the stmts will never be vectorized by SLP.
899 
900    Note COND_EXPR is possibly isomorphic to another one after swapping its
901    operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
902    the first stmt by swapping the two operands of comparison; set SWAP[i]
903    to 2 if stmt I is isormorphic to the first stmt by inverting the code
904    of comparison.  Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
905    to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
906 
907 static bool
vect_build_slp_tree_1(vec_info * vinfo,unsigned char * swap,vec<stmt_vec_info> stmts,unsigned int group_size,poly_uint64 * max_nunits,bool * matches,bool * two_operators,tree * node_vectype)908 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
909 		       vec<stmt_vec_info> stmts, unsigned int group_size,
910 		       poly_uint64 *max_nunits, bool *matches,
911 		       bool *two_operators, tree *node_vectype)
912 {
913   unsigned int i;
914   stmt_vec_info first_stmt_info = stmts[0];
915   code_helper first_stmt_code = ERROR_MARK;
916   code_helper alt_stmt_code = ERROR_MARK;
917   code_helper rhs_code = ERROR_MARK;
918   code_helper first_cond_code = ERROR_MARK;
919   tree lhs;
920   bool need_same_oprnds = false;
921   tree vectype = NULL_TREE, first_op1 = NULL_TREE;
922   stmt_vec_info first_load = NULL, prev_first_load = NULL;
923   bool first_stmt_load_p = false, load_p = false;
924   bool first_stmt_phi_p = false, phi_p = false;
925   bool maybe_soft_fail = false;
926   tree soft_fail_nunits_vectype = NULL_TREE;
927 
928   /* For every stmt in NODE find its def stmt/s.  */
929   stmt_vec_info stmt_info;
930   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
931     {
932       gimple *stmt = stmt_info->stmt;
933       swap[i] = 0;
934       matches[i] = false;
935 
936       if (dump_enabled_p ())
937 	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
938 
939       /* Fail to vectorize statements marked as unvectorizable, throw
940 	 or are volatile.  */
941       if (!STMT_VINFO_VECTORIZABLE (stmt_info)
942 	  || stmt_can_throw_internal (cfun, stmt)
943 	  || gimple_has_volatile_ops (stmt))
944         {
945           if (dump_enabled_p ())
946 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
947 			     "Build SLP failed: unvectorizable statement %G",
948 			     stmt);
949 	  /* ???  For BB vectorization we want to commutate operands in a way
950 	     to shuffle all unvectorizable defs into one operand and have
951 	     the other still vectorized.  The following doesn't reliably
952 	     work for this though but it's the easiest we can do here.  */
953 	  if (is_a <bb_vec_info> (vinfo) && i != 0)
954 	    continue;
955 	  /* Fatal mismatch.  */
956 	  matches[0] = false;
957           return false;
958         }
959 
960       lhs = gimple_get_lhs (stmt);
961       if (lhs == NULL_TREE)
962 	{
963 	  if (dump_enabled_p ())
964 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
965 			     "Build SLP failed: not GIMPLE_ASSIGN nor "
966 			     "GIMPLE_CALL %G", stmt);
967 	  if (is_a <bb_vec_info> (vinfo) && i != 0)
968 	    continue;
969 	  /* Fatal mismatch.  */
970 	  matches[0] = false;
971 	  return false;
972 	}
973 
974       tree nunits_vectype;
975       if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
976 					   &nunits_vectype, group_size))
977 	{
978 	  if (is_a <bb_vec_info> (vinfo) && i != 0)
979 	    continue;
980 	  /* Fatal mismatch.  */
981 	  matches[0] = false;
982 	  return false;
983 	}
984       /* Record nunits required but continue analysis, producing matches[]
985 	 as if nunits was not an issue.  This allows splitting of groups
986 	 to happen.  */
987       if (nunits_vectype
988 	  && !vect_record_max_nunits (vinfo, stmt_info, group_size,
989 				      nunits_vectype, max_nunits))
990 	{
991 	  gcc_assert (is_a <bb_vec_info> (vinfo));
992 	  maybe_soft_fail = true;
993 	  soft_fail_nunits_vectype = nunits_vectype;
994 	}
995 
996       gcc_assert (vectype);
997 
998       gcall *call_stmt = dyn_cast <gcall *> (stmt);
999       if (call_stmt)
1000 	{
1001 	  combined_fn cfn = gimple_call_combined_fn (call_stmt);
1002 	  if (cfn != CFN_LAST)
1003 	    rhs_code = cfn;
1004 	  else
1005 	    rhs_code = CALL_EXPR;
1006 
1007 	  if (cfn == CFN_MASK_LOAD
1008 	      || cfn == CFN_GATHER_LOAD
1009 	      || cfn == CFN_MASK_GATHER_LOAD)
1010 	    load_p = true;
1011 	  else if ((internal_fn_p (cfn)
1012 		    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1013 		   || gimple_call_tail_p (call_stmt)
1014 		   || gimple_call_noreturn_p (call_stmt)
1015 		   || gimple_call_chain (call_stmt))
1016 	    {
1017 	      if (dump_enabled_p ())
1018 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1019 				 "Build SLP failed: unsupported call type %G",
1020 				 call_stmt);
1021 	      if (is_a <bb_vec_info> (vinfo) && i != 0)
1022 		continue;
1023 	      /* Fatal mismatch.  */
1024 	      matches[0] = false;
1025 	      return false;
1026 	    }
1027 	}
1028       else if (gimple_code (stmt) == GIMPLE_PHI)
1029 	{
1030 	  rhs_code = ERROR_MARK;
1031 	  phi_p = true;
1032 	}
1033       else
1034 	{
1035 	  rhs_code = gimple_assign_rhs_code (stmt);
1036 	  load_p = gimple_vuse (stmt);
1037 	}
1038 
1039       /* Check the operation.  */
1040       if (i == 0)
1041 	{
1042 	  *node_vectype = vectype;
1043 	  first_stmt_code = rhs_code;
1044 	  first_stmt_load_p = load_p;
1045 	  first_stmt_phi_p = phi_p;
1046 
1047 	  /* Shift arguments should be equal in all the packed stmts for a
1048 	     vector shift with scalar shift operand.  */
1049 	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1050 	      || rhs_code == LROTATE_EXPR
1051 	      || rhs_code == RROTATE_EXPR)
1052 	    {
1053 	      /* First see if we have a vector/vector shift.  */
1054 	      if (!directly_supported_p (rhs_code, vectype, optab_vector))
1055 		{
1056 		  /* No vector/vector shift, try for a vector/scalar shift.  */
1057 		  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1058 		    {
1059 		      if (dump_enabled_p ())
1060 			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1061 					 "Build SLP failed: "
1062 					 "op not supported by target.\n");
1063 		      if (is_a <bb_vec_info> (vinfo) && i != 0)
1064 			continue;
1065 		      /* Fatal mismatch.  */
1066 		      matches[0] = false;
1067 		      return false;
1068 		    }
1069 		  need_same_oprnds = true;
1070 		  first_op1 = gimple_assign_rhs2 (stmt);
1071 		}
1072 	    }
1073 	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
1074             {
1075               need_same_oprnds = true;
1076               first_op1 = gimple_assign_rhs2 (stmt);
1077             }
1078 	  else if (!load_p
1079 		   && rhs_code == BIT_FIELD_REF)
1080 	    {
1081 	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1082 	      if (!is_a <bb_vec_info> (vinfo)
1083 		  || TREE_CODE (vec) != SSA_NAME
1084 		  || !operand_equal_p (TYPE_SIZE (vectype),
1085 				       TYPE_SIZE (TREE_TYPE (vec))))
1086 		{
1087 		  if (dump_enabled_p ())
1088 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1089 				     "Build SLP failed: "
1090 				     "BIT_FIELD_REF not supported\n");
1091 		  /* Fatal mismatch.  */
1092 		  matches[0] = false;
1093 		  return false;
1094 		}
1095 	    }
1096 	  else if (rhs_code == CFN_DIV_POW2)
1097 	    {
1098 	      need_same_oprnds = true;
1099 	      first_op1 = gimple_call_arg (call_stmt, 1);
1100 	    }
1101 	}
1102       else
1103 	{
1104 	  if (first_stmt_code != rhs_code
1105 	      && alt_stmt_code == ERROR_MARK)
1106 	    alt_stmt_code = rhs_code;
1107 	  if ((first_stmt_code != rhs_code
1108 	       && (first_stmt_code != IMAGPART_EXPR
1109 		   || rhs_code != REALPART_EXPR)
1110 	       && (first_stmt_code != REALPART_EXPR
1111 		   || rhs_code != IMAGPART_EXPR)
1112 	       /* Handle mismatches in plus/minus by computing both
1113 		  and merging the results.  */
1114 	       && !((first_stmt_code == PLUS_EXPR
1115 		     || first_stmt_code == MINUS_EXPR)
1116 		    && (alt_stmt_code == PLUS_EXPR
1117 			|| alt_stmt_code == MINUS_EXPR)
1118 		    && rhs_code == alt_stmt_code)
1119 	       && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1120 		    && (first_stmt_code == ARRAY_REF
1121 			|| first_stmt_code == BIT_FIELD_REF
1122 			|| first_stmt_code == INDIRECT_REF
1123 			|| first_stmt_code == COMPONENT_REF
1124 			|| first_stmt_code == MEM_REF)
1125 		    && (rhs_code == ARRAY_REF
1126 			|| rhs_code == BIT_FIELD_REF
1127 			|| rhs_code == INDIRECT_REF
1128 			|| rhs_code == COMPONENT_REF
1129 			|| rhs_code == MEM_REF)))
1130 	      || first_stmt_load_p != load_p
1131 	      || first_stmt_phi_p != phi_p)
1132 	    {
1133 	      if (dump_enabled_p ())
1134 		{
1135 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1136 				   "Build SLP failed: different operation "
1137 				   "in stmt %G", stmt);
1138 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1139 				   "original stmt %G", first_stmt_info->stmt);
1140 		}
1141 	      /* Mismatch.  */
1142 	      continue;
1143 	    }
1144 
1145 	  if (!load_p
1146 	      && first_stmt_code == BIT_FIELD_REF
1147 	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1148 		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1149 	    {
1150 	      if (dump_enabled_p ())
1151 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1152 				 "Build SLP failed: different BIT_FIELD_REF "
1153 				 "arguments in %G", stmt);
1154 	      /* Mismatch.  */
1155 	      continue;
1156 	    }
1157 
1158 	  if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
1159 	    {
1160 	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1161 				       call_stmt))
1162 		{
1163 		  if (dump_enabled_p ())
1164 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1165 				     "Build SLP failed: different calls in %G",
1166 				     stmt);
1167 		  /* Mismatch.  */
1168 		  continue;
1169 		}
1170 	    }
1171 
1172 	  if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1173 	      && (gimple_bb (first_stmt_info->stmt)
1174 		  != gimple_bb (stmt_info->stmt)))
1175 	    {
1176 	      if (dump_enabled_p ())
1177 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1178 				 "Build SLP failed: different BB for PHI "
1179 				 "or possibly trapping operation in %G", stmt);
1180 	      /* Mismatch.  */
1181 	      continue;
1182 	    }
1183 
1184 	  if (need_same_oprnds)
1185 	    {
1186 	      tree other_op1 = gimple_arg (stmt, 1);
1187 	      if (!operand_equal_p (first_op1, other_op1, 0))
1188 		{
1189 		  if (dump_enabled_p ())
1190 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191 				     "Build SLP failed: different shift "
1192 				     "arguments in %G", stmt);
1193 		  /* Mismatch.  */
1194 		  continue;
1195 		}
1196 	    }
1197 
1198 	  if (!types_compatible_p (vectype, *node_vectype))
1199 	    {
1200 	      if (dump_enabled_p ())
1201 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1202 				 "Build SLP failed: different vector type "
1203 				 "in %G", stmt);
1204 	      /* Mismatch.  */
1205 	      continue;
1206 	    }
1207 	}
1208 
1209       /* Grouped store or load.  */
1210       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1211 	{
1212 	  if (REFERENCE_CLASS_P (lhs))
1213 	    {
1214 	      /* Store.  */
1215 	      ;
1216 	    }
1217 	  else
1218 	    {
1219 	      /* Load.  */
1220 	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1221               if (prev_first_load)
1222                 {
1223                   /* Check that there are no loads from different interleaving
1224                      chains in the same node.  */
1225                   if (prev_first_load != first_load)
1226                     {
1227                       if (dump_enabled_p ())
1228 			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1229 					 vect_location,
1230 					 "Build SLP failed: different "
1231 					 "interleaving chains in one node %G",
1232 					 stmt);
1233 		      /* Mismatch.  */
1234 		      continue;
1235                     }
1236                 }
1237               else
1238                 prev_first_load = first_load;
1239            }
1240         } /* Grouped access.  */
1241       else
1242 	{
1243 	  if (load_p
1244 	      && rhs_code != CFN_GATHER_LOAD
1245 	      && rhs_code != CFN_MASK_GATHER_LOAD)
1246 	    {
1247 	      /* Not grouped load.  */
1248 	      if (dump_enabled_p ())
1249 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250 				 "Build SLP failed: not grouped load %G", stmt);
1251 
1252 	      /* FORNOW: Not grouped loads are not supported.  */
1253 	      if (is_a <bb_vec_info> (vinfo) && i != 0)
1254 		continue;
1255 	      /* Fatal mismatch.  */
1256 	      matches[0] = false;
1257 	      return false;
1258 	    }
1259 
1260 	  /* Not memory operation.  */
1261 	  if (!phi_p
1262 	      && rhs_code.is_tree_code ()
1263 	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1264 	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1265 	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1266 	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1267 	      && rhs_code != VIEW_CONVERT_EXPR
1268 	      && rhs_code != CALL_EXPR
1269 	      && rhs_code != BIT_FIELD_REF)
1270 	    {
1271 	      if (dump_enabled_p ())
1272 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1273 				 "Build SLP failed: operation unsupported %G",
1274 				 stmt);
1275 	      if (is_a <bb_vec_info> (vinfo) && i != 0)
1276 		continue;
1277 	      /* Fatal mismatch.  */
1278 	      matches[0] = false;
1279 	      return false;
1280 	    }
1281 
1282 	  if (rhs_code == COND_EXPR)
1283 	    {
1284 	      tree cond_expr = gimple_assign_rhs1 (stmt);
1285 	      enum tree_code cond_code = TREE_CODE (cond_expr);
1286 	      enum tree_code swap_code = ERROR_MARK;
1287 	      enum tree_code invert_code = ERROR_MARK;
1288 
1289 	      if (i == 0)
1290 		first_cond_code = TREE_CODE (cond_expr);
1291 	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1292 		{
1293 		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1294 		  swap_code = swap_tree_comparison (cond_code);
1295 		  invert_code = invert_tree_comparison (cond_code, honor_nans);
1296 		}
1297 
1298 	      if (first_cond_code == cond_code)
1299 		;
1300 	      /* Isomorphic can be achieved by swapping.  */
1301 	      else if (first_cond_code == swap_code)
1302 		swap[i] = 1;
1303 	      /* Isomorphic can be achieved by inverting.  */
1304 	      else if (first_cond_code == invert_code)
1305 		swap[i] = 2;
1306 	      else
1307 		{
1308 		  if (dump_enabled_p ())
1309 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1310 				     "Build SLP failed: different"
1311 				     " operation %G", stmt);
1312 		  /* Mismatch.  */
1313 		  continue;
1314 		}
1315 	    }
1316 	}
1317 
1318       matches[i] = true;
1319     }
1320 
1321   for (i = 0; i < group_size; ++i)
1322     if (!matches[i])
1323       return false;
1324 
1325   /* If we allowed a two-operation SLP node verify the target can cope
1326      with the permute we are going to use.  */
1327   if (alt_stmt_code != ERROR_MARK
1328       && (!alt_stmt_code.is_tree_code ()
1329 	  || TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference))
1330     {
1331       *two_operators = true;
1332     }
1333 
1334   if (maybe_soft_fail)
1335     {
1336       unsigned HOST_WIDE_INT const_nunits;
1337       if (!TYPE_VECTOR_SUBPARTS
1338 	    (soft_fail_nunits_vectype).is_constant (&const_nunits)
1339 	  || const_nunits > group_size)
1340 	matches[0] = false;
1341       else
1342 	{
1343 	  /* With constant vector elements simulate a mismatch at the
1344 	     point we need to split.  */
1345 	  unsigned tail = group_size & (const_nunits - 1);
1346 	  memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1347 	}
1348       return false;
1349     }
1350 
1351   return true;
1352 }
1353 
1354 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1355    Note we never remove apart from at destruction time so we do not
1356    need a special value for deleted that differs from empty.  */
1357 struct bst_traits
1358 {
1359   typedef vec <stmt_vec_info> value_type;
1360   typedef vec <stmt_vec_info> compare_type;
1361   static inline hashval_t hash (value_type);
1362   static inline bool equal (value_type existing, value_type candidate);
is_emptybst_traits1363   static inline bool is_empty (value_type x) { return !x.exists (); }
is_deletedbst_traits1364   static inline bool is_deleted (value_type x) { return !x.exists (); }
1365   static const bool empty_zero_p = true;
mark_emptybst_traits1366   static inline void mark_empty (value_type &x) { x.release (); }
mark_deletedbst_traits1367   static inline void mark_deleted (value_type &x) { x.release (); }
removebst_traits1368   static inline void remove (value_type &x) { x.release (); }
1369 };
1370 inline hashval_t
hash(value_type x)1371 bst_traits::hash (value_type x)
1372 {
1373   inchash::hash h;
1374   for (unsigned i = 0; i < x.length (); ++i)
1375     h.add_int (gimple_uid (x[i]->stmt));
1376   return h.end ();
1377 }
1378 inline bool
equal(value_type existing,value_type candidate)1379 bst_traits::equal (value_type existing, value_type candidate)
1380 {
1381   if (existing.length () != candidate.length ())
1382     return false;
1383   for (unsigned i = 0; i < existing.length (); ++i)
1384     if (existing[i] != candidate[i])
1385       return false;
1386   return true;
1387 }
1388 
1389 /* ???  This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1390    but then vec::insert does memmove and that's not compatible with
1391    std::pair.  */
1392 struct chain_op_t
1393 {
chain_op_tchain_op_t1394   chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1395       : code (code_), dt (dt_), op (op_) {}
1396   tree_code code;
1397   vect_def_type dt;
1398   tree op;
1399 };
1400 
1401 /* Comparator for sorting associatable chains.  */
1402 
1403 static int
dt_sort_cmp(const void * op1_,const void * op2_,void *)1404 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1405 {
1406   auto *op1 = (const chain_op_t *) op1_;
1407   auto *op2 = (const chain_op_t *) op2_;
1408   if (op1->dt != op2->dt)
1409     return (int)op1->dt - (int)op2->dt;
1410   return (int)op1->code - (int)op2->code;
1411 }
1412 
1413 /* Linearize the associatable expression chain at START with the
1414    associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1415    filling CHAIN with the result and using WORKLIST as intermediate storage.
1416    CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1417    or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
1418    stmts, starting with START.  */
1419 
1420 static void
vect_slp_linearize_chain(vec_info * vinfo,vec<std::pair<tree_code,gimple * >> & worklist,vec<chain_op_t> & chain,enum tree_code code,gimple * start,gimple * & code_stmt,gimple * & alt_code_stmt,vec<gimple * > * chain_stmts)1421 vect_slp_linearize_chain (vec_info *vinfo,
1422 			  vec<std::pair<tree_code, gimple *> > &worklist,
1423 			  vec<chain_op_t> &chain,
1424 			  enum tree_code code, gimple *start,
1425 			  gimple *&code_stmt, gimple *&alt_code_stmt,
1426 			  vec<gimple *> *chain_stmts)
1427 {
1428   /* For each lane linearize the addition/subtraction (or other
1429      uniform associatable operation) expression tree.  */
1430   worklist.safe_push (std::make_pair (code, start));
1431   while (!worklist.is_empty ())
1432     {
1433       auto entry = worklist.pop ();
1434       gassign *stmt = as_a <gassign *> (entry.second);
1435       enum tree_code in_code = entry.first;
1436       enum tree_code this_code = gimple_assign_rhs_code (stmt);
1437       /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
1438       if (!code_stmt
1439 	  && gimple_assign_rhs_code (stmt) == code)
1440 	code_stmt = stmt;
1441       else if (!alt_code_stmt
1442 	       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1443 	alt_code_stmt = stmt;
1444       if (chain_stmts)
1445 	chain_stmts->safe_push (stmt);
1446       for (unsigned opnum = 1; opnum <= 2; ++opnum)
1447 	{
1448 	  tree op = gimple_op (stmt, opnum);
1449 	  vect_def_type dt;
1450 	  stmt_vec_info def_stmt_info;
1451 	  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1452 	  gcc_assert (res);
1453 	  if (dt == vect_internal_def
1454 	      && is_pattern_stmt_p (def_stmt_info))
1455 	    op = gimple_get_lhs (def_stmt_info->stmt);
1456 	  gimple *use_stmt;
1457 	  use_operand_p use_p;
1458 	  if (dt == vect_internal_def
1459 	      && single_imm_use (op, &use_p, &use_stmt)
1460 	      && is_gimple_assign (def_stmt_info->stmt)
1461 	      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1462 		  || (code == PLUS_EXPR
1463 		      && (gimple_assign_rhs_code (def_stmt_info->stmt)
1464 			  == MINUS_EXPR))))
1465 	    {
1466 	      tree_code op_def_code = this_code;
1467 	      if (op_def_code == MINUS_EXPR && opnum == 1)
1468 		op_def_code = PLUS_EXPR;
1469 	      if (in_code == MINUS_EXPR)
1470 		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1471 	      worklist.safe_push (std::make_pair (op_def_code,
1472 						  def_stmt_info->stmt));
1473 	    }
1474 	  else
1475 	    {
1476 	      tree_code op_def_code = this_code;
1477 	      if (op_def_code == MINUS_EXPR && opnum == 1)
1478 		op_def_code = PLUS_EXPR;
1479 	      if (in_code == MINUS_EXPR)
1480 		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1481 	      chain.safe_push (chain_op_t (op_def_code, dt, op));
1482 	    }
1483 	}
1484     }
1485 }
1486 
1487 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1488 		  simple_hashmap_traits <bst_traits, slp_tree> >
1489   scalar_stmts_to_slp_tree_map_t;
1490 
1491 static slp_tree
1492 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1493 		       vec<stmt_vec_info> stmts, unsigned int group_size,
1494 		       poly_uint64 *max_nunits,
1495 		       bool *matches, unsigned *limit, unsigned *tree_size,
1496 		       scalar_stmts_to_slp_tree_map_t *bst_map);
1497 
1498 static slp_tree
vect_build_slp_tree(vec_info * vinfo,vec<stmt_vec_info> stmts,unsigned int group_size,poly_uint64 * max_nunits,bool * matches,unsigned * limit,unsigned * tree_size,scalar_stmts_to_slp_tree_map_t * bst_map)1499 vect_build_slp_tree (vec_info *vinfo,
1500 		     vec<stmt_vec_info> stmts, unsigned int group_size,
1501 		     poly_uint64 *max_nunits,
1502 		     bool *matches, unsigned *limit, unsigned *tree_size,
1503 		     scalar_stmts_to_slp_tree_map_t *bst_map)
1504 {
1505   if (slp_tree *leader = bst_map->get (stmts))
1506     {
1507       if (dump_enabled_p ())
1508 	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1509 			 !(*leader)->failed ? "" : "failed ", *leader);
1510       if (!(*leader)->failed)
1511 	{
1512 	  SLP_TREE_REF_COUNT (*leader)++;
1513 	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1514 	  stmts.release ();
1515 	  return *leader;
1516 	}
1517       memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1518       return NULL;
1519     }
1520 
1521   /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1522      so we can pick up backedge destinations during discovery.  */
1523   slp_tree res = new _slp_tree;
1524   SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1525   SLP_TREE_SCALAR_STMTS (res) = stmts;
1526   bst_map->put (stmts.copy (), res);
1527 
1528   if (*limit == 0)
1529     {
1530       if (dump_enabled_p ())
1531 	dump_printf_loc (MSG_NOTE, vect_location,
1532 			 "SLP discovery limit exceeded\n");
1533       /* Mark the node invalid so we can detect those when still in use
1534 	 as backedge destinations.  */
1535       SLP_TREE_SCALAR_STMTS (res) = vNULL;
1536       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1537       res->failed = XNEWVEC (bool, group_size);
1538       memset (res->failed, 0, sizeof (bool) * group_size);
1539       memset (matches, 0, sizeof (bool) * group_size);
1540       return NULL;
1541     }
1542   --*limit;
1543 
1544   if (dump_enabled_p ())
1545     dump_printf_loc (MSG_NOTE, vect_location,
1546 		     "starting SLP discovery for node %p\n", res);
1547 
1548   poly_uint64 this_max_nunits = 1;
1549   slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1550 					&this_max_nunits,
1551 					matches, limit, tree_size, bst_map);
1552   if (!res_)
1553     {
1554       if (dump_enabled_p ())
1555 	dump_printf_loc (MSG_NOTE, vect_location,
1556 			 "SLP discovery for node %p failed\n", res);
1557       /* Mark the node invalid so we can detect those when still in use
1558 	 as backedge destinations.  */
1559       SLP_TREE_SCALAR_STMTS (res) = vNULL;
1560       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1561       res->failed = XNEWVEC (bool, group_size);
1562       if (flag_checking)
1563 	{
1564 	  unsigned i;
1565 	  for (i = 0; i < group_size; ++i)
1566 	    if (!matches[i])
1567 	      break;
1568 	  gcc_assert (i < group_size);
1569 	}
1570       memcpy (res->failed, matches, sizeof (bool) * group_size);
1571     }
1572   else
1573     {
1574       if (dump_enabled_p ())
1575 	dump_printf_loc (MSG_NOTE, vect_location,
1576 			 "SLP discovery for node %p succeeded\n", res);
1577       gcc_assert (res_ == res);
1578       res->max_nunits = this_max_nunits;
1579       vect_update_max_nunits (max_nunits, this_max_nunits);
1580       /* Keep a reference for the bst_map use.  */
1581       SLP_TREE_REF_COUNT (res)++;
1582     }
1583   return res_;
1584 }
1585 
1586 /* Helper for building an associated SLP node chain: build two internal
   nodes using OPER1 and OPER2 as representatives, both with children OP0
   and OP1, and make PERM a VEC_PERM_EXPR node of vector type VECTYPE
   blending their lanes according to LPERM.  */
1587 
1588 static void
1589 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1590 				   slp_tree op0, slp_tree op1,
1591 				   stmt_vec_info oper1, stmt_vec_info oper2,
1592 				   vec<std::pair<unsigned, unsigned> > lperm)
1593 {
1594   unsigned group_size = SLP_TREE_LANES (op1);
1595 
1596   slp_tree child1 = new _slp_tree;
1597   SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1598   SLP_TREE_VECTYPE (child1) = vectype;
1599   SLP_TREE_LANES (child1) = group_size;
1600   SLP_TREE_CHILDREN (child1).create (2);
1601   SLP_TREE_CHILDREN (child1).quick_push (op0);
1602   SLP_TREE_CHILDREN (child1).quick_push (op1);
1603   SLP_TREE_REPRESENTATIVE (child1) = oper1;
1604 
1605   slp_tree child2 = new _slp_tree;
1606   SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1607   SLP_TREE_VECTYPE (child2) = vectype;
1608   SLP_TREE_LANES (child2) = group_size;
1609   SLP_TREE_CHILDREN (child2).create (2);
1610   SLP_TREE_CHILDREN (child2).quick_push (op0);
1611   SLP_TREE_REF_COUNT (op0)++;
1612   SLP_TREE_CHILDREN (child2).quick_push (op1);
1613   SLP_TREE_REF_COUNT (op1)++;
1614   SLP_TREE_REPRESENTATIVE (child2) = oper2;
1615 
1616   SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1617   SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1618   SLP_TREE_VECTYPE (perm) = vectype;
1619   SLP_TREE_LANES (perm) = group_size;
1620   /* ???  We should set this NULL but that's not expected.  */
1621   SLP_TREE_REPRESENTATIVE (perm) = oper1;
1622   SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1623   SLP_TREE_CHILDREN (perm).quick_push (child1);
1624   SLP_TREE_CHILDREN (perm).quick_push (child2);
1625 }
1626 
1627 /* Recursively build an SLP tree starting from NODE.
1628    Fail (and return NULL) if the def-stmts are not isomorphic, require a
1629    data permutation that cannot be handled, or are of an unsupported kind
1630    of operation.  Otherwise return the SLP node built for the group;
1631    MATCHES records for each lane whether its scalar stmt matched
1632    during discovery.  */
1633 
1634 static slp_tree
1635 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1636 		       vec<stmt_vec_info> stmts, unsigned int group_size,
1637 		       poly_uint64 *max_nunits,
1638 		       bool *matches, unsigned *limit, unsigned *tree_size,
1639 		       scalar_stmts_to_slp_tree_map_t *bst_map)
1640 {
1641   unsigned nops, i, this_tree_size = 0;
1642   poly_uint64 this_max_nunits = *max_nunits;
1643 
1644   matches[0] = false;
1645 
1646   stmt_vec_info stmt_info = stmts[0];
1647   if (!is_a<gcall *> (stmt_info->stmt)
1648       && !is_a<gassign *> (stmt_info->stmt)
1649       && !is_a<gphi *> (stmt_info->stmt))
1650     return NULL;
1651 
1652   nops = gimple_num_args (stmt_info->stmt);
1653   if (const int *map = vect_get_operand_map (stmt_info->stmt))
1654     nops = map[0];
1655 
1656   /* If the SLP node is a PHI (induction or reduction), terminate
1657      the recursion.  */
1658   bool *skip_args = XALLOCAVEC (bool, nops);
1659   memset (skip_args, 0, sizeof (bool) * nops);
1660   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1661     if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1662       {
1663 	tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1664 	tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1665 						    group_size);
1666 	if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1667 				     max_nunits))
1668 	  return NULL;
1669 
1670 	vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1671 	if (def_type == vect_induction_def)
1672 	  {
1673 	    /* Induction PHIs are not cycles but walk the initial
1674 	       value.  Only for inner loops though; for outer loops
1675 	       we need to pick up the value from the actual PHIs
1676 	       to more easily support peeling and epilogue vectorization.  */
1677 	    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1678 	    if (!nested_in_vect_loop_p (loop, stmt_info))
1679 	      skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1680 	    else
1681 	      loop = loop->inner;
1682 	    skip_args[loop_latch_edge (loop)->dest_idx] = true;
1683 	  }
1684 	else if (def_type == vect_reduction_def
1685 		 || def_type == vect_double_reduction_def
1686 		 || def_type == vect_nested_cycle)
1687 	  {
1688 	    /* Else def types have to match.  */
1689 	    stmt_vec_info other_info;
1690 	    bool all_same = true;
1691 	    FOR_EACH_VEC_ELT (stmts, i, other_info)
1692 	      {
1693 		if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1694 		  return NULL;
1695 		if (other_info != stmt_info)
1696 		  all_same = false;
1697 	      }
1698 	    class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1699 	    /* Reduction initial values are not explicitly represented.  */
1700 	    if (!nested_in_vect_loop_p (loop, stmt_info))
1701 	      skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1702 	    /* Reduction chain backedge defs are filled manually.
1703 	       ???  Need a better way to identify a SLP reduction chain PHI.
1704 	       Or a better overall way to SLP match those.  */
1705 	    if (all_same && def_type == vect_reduction_def)
1706 	      skip_args[loop_latch_edge (loop)->dest_idx] = true;
1707 	  }
1708 	else if (def_type != vect_internal_def)
1709 	  return NULL;
1710       }
1711 
1712 
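  /* Verify the stmts in the group are isomorphic, determine a common
     vector type and whether the group mixes two operation codes
     (e.g. plus and minus).  */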
1713   bool two_operators = false;
1714   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1715   tree vectype = NULL_TREE;
1716   if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1717 			      &this_max_nunits, matches, &two_operators,
1718 			      &vectype))
1719     return NULL;
1720 
1721   /* If the SLP node is a load, terminate the recursion unless masked.  */
1722   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1723       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1724     {
1725       if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1726 	gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1727 		    || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1728 		    || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
1729       else
1730 	{
1731 	  *max_nunits = this_max_nunits;
1732 	  (*tree_size)++;
1733 	  node = vect_create_new_slp_node (node, stmts, 0);
1734 	  SLP_TREE_VECTYPE (node) = vectype;
1735 	  /* And compute the load permutation.  Whether it is actually
1736 	     a permutation depends on the unrolling factor which is
1737 	     decided later.  */
1738 	  vec<unsigned> load_permutation;
1739 	  int j;
1740 	  stmt_vec_info load_info;
1741 	  load_permutation.create (group_size);
1742 	  stmt_vec_info first_stmt_info
1743 	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1744 	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1745 	    {
1746 	      int load_place = vect_get_place_in_interleaving_chain
1747 		  (load_info, first_stmt_info);
1748 	      gcc_assert (load_place != -1);
1749 	      load_permutation.safe_push (load_place);
1750 	    }
1751 	  SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1752 	  return node;
1753 	}
1754     }
1755   else if (gimple_assign_single_p (stmt_info->stmt)
1756 	   && !gimple_vuse (stmt_info->stmt)
1757 	   && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1758     {
1759       /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
1760 	 the same SSA name vector of a type compatible with vectype.  */
1761       vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1762       tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1763       stmt_vec_info estmt_info;
1764       FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1765 	{
1766 	  gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1767 	  tree bfref = gimple_assign_rhs1 (estmt);
1768 	  HOST_WIDE_INT lane;
1769 	  if (!known_eq (bit_field_size (bfref),
1770 			 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1771 	      || !constant_multiple_p (bit_field_offset (bfref),
1772 				       bit_field_size (bfref), &lane))
1773 	    {
1774 	      lperm.release ();
1775 	      matches[0] = false;
1776 	      return NULL;
1777 	    }
1778 	  lperm.safe_push (std::make_pair (0, (unsigned)lane));
1779 	}
1780       slp_tree vnode = vect_create_new_slp_node (vNULL);
1781       /* ???  We record vectype here but we hide eventually necessary
1782 	 punning and instead rely on code generation to materialize
1783 	 VIEW_CONVERT_EXPRs as necessary.  We instead should make
1784 	 this explicit somehow.  */
1785       SLP_TREE_VECTYPE (vnode) = vectype;
1786       SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
1787       /* We are always building a permutation node even if it is an identity
1788 	 permute to shield the rest of the vectorizer from the odd node
1789 	 representing an actual vector without any scalar ops.
1790 	 ???  We could hide it completely with making the permute node
1791 	 external?  */
1792       node = vect_create_new_slp_node (node, stmts, 1);
1793       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
1794       SLP_TREE_LANE_PERMUTATION (node) = lperm;
1795       SLP_TREE_VECTYPE (node) = vectype;
1796       SLP_TREE_CHILDREN (node).quick_push (vnode);
1797       return node;
1798     }
1799   /* When discovery reaches an associatable operation see whether we can
1800      improve that to match up lanes in a way superior to the operand
1801      swapping code which at most looks at two defs.
1802      ???  For BB vectorization we cannot do the brute-force search
1803      for matching as we can succeed by means of builds from scalars
1804      and have no good way to "cost" one build against another.  */
1805   else if (is_a <loop_vec_info> (vinfo)
1806 	   /* ???  We don't handle !vect_internal_def defs below.  */
1807 	   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1808 	   && is_gimple_assign (stmt_info->stmt)
1809 	   && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
1810 	       || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
1811 	   && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
1812 	       || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
1813 		   && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
1814     {
1815       /* See if we have a chain of (mixed) adds or subtracts or other
1816 	 associatable ops.  */
1817       enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
1818       if (code == MINUS_EXPR)
1819 	code = PLUS_EXPR;
1820       stmt_vec_info other_op_stmt_info = NULL;
1821       stmt_vec_info op_stmt_info = NULL;
1822       unsigned chain_len = 0;
1823       auto_vec<chain_op_t> chain;
1824       auto_vec<std::pair<tree_code, gimple *> > worklist;
1825       auto_vec<vec<chain_op_t> > chains (group_size);
1826       auto_vec<slp_tree, 4> children;
1827       bool hard_fail = true;
1828       for (unsigned lane = 0; lane < group_size; ++lane)
1829 	{
1830 	  /* For each lane linearize the addition/subtraction (or other
1831 	     uniform associatable operation) expression tree.  */
1832 	  gimple *op_stmt = NULL, *other_op_stmt = NULL;
1833 	  vect_slp_linearize_chain (vinfo, worklist, chain, code,
1834 				    stmts[lane]->stmt, op_stmt, other_op_stmt,
1835 				    NULL);
1836 	  if (!op_stmt_info && op_stmt)
1837 	    op_stmt_info = vinfo->lookup_stmt (op_stmt);
1838 	  if (!other_op_stmt_info && other_op_stmt)
1839 	    other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
1840 	  if (chain.length () == 2)
1841 	    {
1842 	      /* In a chain of just two elements resort to the regular
1843 		 operand swapping scheme.  If we run into a length
1844 		 mismatch still hard-FAIL.  */
1845 	      if (chain_len == 0)
1846 		hard_fail = false;
1847 	      else
1848 		{
1849 		  matches[lane] = false;
1850 		  /* ???  We might want to process the other lanes, but
1851 		     make sure to not give false matching hints to the
1852 		     caller for lanes we did not process.  */
1853 		  if (lane != group_size - 1)
1854 		    matches[0] = false;
1855 		}
1856 	      break;
1857 	    }
1858 	  else if (chain_len == 0)
1859 	    chain_len = chain.length ();
1860 	  else if (chain.length () != chain_len)
1861 	    {
1862 	      /* ???  Here we could slip in magic to compensate with
1863 		 neutral operands.  */
1864 	      matches[lane] = false;
1865 	      if (lane != group_size - 1)
1866 		matches[0] = false;
1867 	      break;
1868 	    }
1869 	  chains.quick_push (chain.copy ());
1870 	  chain.truncate (0);
1871 	}
1872       if (chains.length () == group_size)
1873 	{
1874 	  /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
1875 	  if (!op_stmt_info)
1876 	    {
1877 	      hard_fail = false;
1878 	      goto out;
1879 	    }
1880 	  /* Now we have a set of chains with the same length.  */
1881 	  /* 1. pre-sort according to def_type and operation.  */
1882 	  for (unsigned lane = 0; lane < group_size; ++lane)
1883 	    chains[lane].stablesort (dt_sort_cmp, vinfo);
1884 	  if (dump_enabled_p ())
1885 	    {
1886 	      dump_printf_loc (MSG_NOTE, vect_location,
1887 			       "pre-sorted chains of %s\n",
1888 			       get_tree_code_name (code));
1889 	      for (unsigned lane = 0; lane < group_size; ++lane)
1890 		{
1891 		  for (unsigned opnum = 0; opnum < chain_len; ++opnum)
1892 		    dump_printf (MSG_NOTE, "%s %T ",
1893 				 get_tree_code_name (chains[lane][opnum].code),
1894 				 chains[lane][opnum].op);
1895 		  dump_printf (MSG_NOTE, "\n");
1896 		}
1897 	    }
1898 	  /* 2. try to build children nodes, associating as necessary.  */
1899 	  for (unsigned n = 0; n < chain_len; ++n)
1900 	    {
1901 	      vect_def_type dt = chains[0][n].dt;
1902 	      unsigned lane;
1903 	      for (lane = 0; lane < group_size; ++lane)
1904 		if (chains[lane][n].dt != dt)
1905 		  {
1906 		    if (dt == vect_constant_def
1907 			&& chains[lane][n].dt == vect_external_def)
1908 		      dt = vect_external_def;
1909 		    else if (dt == vect_external_def
1910 			     && chains[lane][n].dt == vect_constant_def)
1911 		      ;
1912 		    else
1913 		      break;
1914 		  }
1915 	      if (lane != group_size)
1916 		{
1917 		  if (dump_enabled_p ())
1918 		    dump_printf_loc (MSG_NOTE, vect_location,
1919 				     "giving up on chain due to mismatched "
1920 				     "def types\n");
1921 		  matches[lane] = false;
1922 		  if (lane != group_size - 1)
1923 		    matches[0] = false;
1924 		  goto out;
1925 		}
1926 	      if (dt == vect_constant_def
1927 		  || dt == vect_external_def)
1928 		{
1929 		  /* Check whether we can build the invariant.  If we can't
1930 		     we never will be able to.  */
1931 		  tree type = TREE_TYPE (chains[0][n].op);
1932 		  if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
1933 		      && (TREE_CODE (type) == BOOLEAN_TYPE
1934 			  || !can_duplicate_and_interleave_p (vinfo, group_size,
1935 							      type)))
1936 		    {
1937 		      matches[0] = false;
1938 		      goto out;
1939 		    }
1940 		  vec<tree> ops;
1941 		  ops.create (group_size);
1942 		  for (lane = 0; lane < group_size; ++lane)
1943 		    ops.quick_push (chains[lane][n].op);
1944 		  slp_tree child = vect_create_new_slp_node (ops);
1945 		  SLP_TREE_DEF_TYPE (child) = dt;
1946 		  children.safe_push (child);
1947 		}
1948 	      else if (dt != vect_internal_def)
1949 		{
1950 		  /* Not sure, we might need sth special.
1951 		     gcc.dg/vect/pr96854.c,
1952 		     gfortran.dg/vect/fast-math-pr37021.f90
1953 		     and gfortran.dg/vect/pr61171.f trigger.  */
1954 		  /* Soft-fail for now.  */
1955 		  hard_fail = false;
1956 		  goto out;
1957 		}
1958 	      else
1959 		{
1960 		  vec<stmt_vec_info> op_stmts;
1961 		  op_stmts.create (group_size);
1962 		  slp_tree child = NULL;
1963 		  /* Brute-force our way.  We have to consider a lane
1964 		     failing after fixing an earlier fail up in the
1965 		     SLP discovery recursion.  So track the current
1966 		     permute per lane.  */
1967 		  unsigned *perms = XALLOCAVEC (unsigned, group_size);
1968 		  memset (perms, 0, sizeof (unsigned) * group_size);
1969 		  do
1970 		    {
1971 		      op_stmts.truncate (0);
1972 		      for (lane = 0; lane < group_size; ++lane)
1973 			op_stmts.quick_push
1974 			  (vinfo->lookup_def (chains[lane][n].op));
1975 		      child = vect_build_slp_tree (vinfo, op_stmts,
1976 						   group_size, &this_max_nunits,
1977 						   matches, limit,
1978 						   &this_tree_size, bst_map);
1979 		      /* ???  We're likely getting too many fatal mismatches
1980 			 here so maybe we want to ignore them (but then we
1981 			 have no idea which lanes fatally mismatched).  */
1982 		      if (child || !matches[0])
1983 			break;
1984 		      /* Swap another lane we have not yet matched up into
1985 			 lanes that did not match.  If we run out of
1986 			 permute possibilities for a lane terminate the
1987 			 search.  */
1988 		      bool term = false;
1989 		      for (lane = 1; lane < group_size; ++lane)
1990 			if (!matches[lane])
1991 			  {
1992 			    if (n + perms[lane] + 1 == chain_len)
1993 			      {
1994 				term = true;
1995 				break;
1996 			      }
1997 			    std::swap (chains[lane][n],
1998 				       chains[lane][n + perms[lane] + 1]);
1999 			    perms[lane]++;
2000 			  }
2001 		      if (term)
2002 			break;
2003 		    }
2004 		  while (1);
2005 		  if (!child)
2006 		    {
2007 		      if (dump_enabled_p ())
2008 			dump_printf_loc (MSG_NOTE, vect_location,
2009 					 "failed to match up op %d\n", n);
2010 		      op_stmts.release ();
2011 		      if (lane != group_size - 1)
2012 			matches[0] = false;
2013 		      else
2014 			matches[lane] = false;
2015 		      goto out;
2016 		    }
2017 		  if (dump_enabled_p ())
2018 		    {
2019 		      dump_printf_loc (MSG_NOTE, vect_location,
2020 				       "matched up op %d to\n", n);
2021 		      vect_print_slp_tree (MSG_NOTE, vect_location, child);
2022 		    }
2023 		  children.safe_push (child);
2024 		}
2025 	    }
2026 	  /* 3. build SLP nodes to combine the chain.  */
2027 	  for (unsigned lane = 0; lane < group_size; ++lane)
2028 	    if (chains[lane][0].code != code)
2029 	      {
2030 		/* See if there's any alternate all-PLUS entry.  */
2031 		unsigned n;
2032 		for (n = 1; n < chain_len; ++n)
2033 		  {
2034 		    for (lane = 0; lane < group_size; ++lane)
2035 		      if (chains[lane][n].code != code)
2036 			break;
2037 		    if (lane == group_size)
2038 		      break;
2039 		  }
2040 		if (n != chain_len)
2041 		  {
2042 		    /* Swap that in at first position.  */
2043 		    std::swap (children[0], children[n]);
2044 		    for (lane = 0; lane < group_size; ++lane)
2045 		      std::swap (chains[lane][0], chains[lane][n]);
2046 		  }
2047 		else
2048 		  {
2049 		    /* ???  When this triggers and we end up with two
2050 		       vect_constant/external_def up-front things break (ICE)
2051 		       spectacularly finding an insertion place for the
2052 		       all-constant op.  We should have a fully
2053 		       vect_internal_def operand though(?) so we can swap
2054 		       that into first place and then prepend the all-zero
2055 		       constant.  */
2056 		    if (dump_enabled_p ())
2057 		      dump_printf_loc (MSG_NOTE, vect_location,
2058 				       "inserting constant zero to compensate "
2059 				       "for (partially) negated first "
2060 				       "operand\n");
2061 		    chain_len++;
2062 		    for (lane = 0; lane < group_size; ++lane)
2063 		      chains[lane].safe_insert
2064 			(0, chain_op_t (code, vect_constant_def, NULL_TREE));
2065 		    vec<tree> zero_ops;
2066 		    zero_ops.create (group_size);
2067 		    zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2068 		    for (lane = 1; lane < group_size; ++lane)
2069 		      zero_ops.quick_push (zero_ops[0]);
2070 		    slp_tree zero = vect_create_new_slp_node (zero_ops);
2071 		    SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2072 		    children.safe_insert (0, zero);
2073 		  }
2074 		break;
2075 	      }
2076 	  for (unsigned i = 1; i < children.length (); ++i)
2077 	    {
2078 	      slp_tree op0 = children[i - 1];
2079 	      slp_tree op1 = children[i];
2080 	      bool this_two_op = false;
2081 	      for (unsigned lane = 0; lane < group_size; ++lane)
2082 		if (chains[lane][i].code != chains[0][i].code)
2083 		  {
2084 		    this_two_op = true;
2085 		    break;
2086 		  }
2087 	      slp_tree child;
2088 	      if (i == children.length () - 1)
2089 		child = vect_create_new_slp_node (node, stmts, 2);
2090 	      else
2091 		child = vect_create_new_slp_node (2, ERROR_MARK);
2092 	      if (this_two_op)
2093 		{
2094 		  vec<std::pair<unsigned, unsigned> > lperm;
2095 		  lperm.create (group_size);
2096 		  for (unsigned lane = 0; lane < group_size; ++lane)
2097 		    lperm.quick_push (std::make_pair
2098 		      (chains[lane][i].code != chains[0][i].code, lane));
2099 		  vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2100 						     (chains[0][i].code == code
2101 						      ? op_stmt_info
2102 						      : other_op_stmt_info),
2103 						     (chains[0][i].code == code
2104 						      ? other_op_stmt_info
2105 						      : op_stmt_info),
2106 						     lperm);
2107 		}
2108 	      else
2109 		{
2110 		  SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2111 		  SLP_TREE_VECTYPE (child) = vectype;
2112 		  SLP_TREE_LANES (child) = group_size;
2113 		  SLP_TREE_CHILDREN (child).quick_push (op0);
2114 		  SLP_TREE_CHILDREN (child).quick_push (op1);
2115 		  SLP_TREE_REPRESENTATIVE (child)
2116 		    = (chains[0][i].code == code
2117 		       ? op_stmt_info : other_op_stmt_info);
2118 		}
2119 	      children[i] = child;
2120 	    }
2121 	  *tree_size += this_tree_size + 1;
2122 	  *max_nunits = this_max_nunits;
2123 	  while (!chains.is_empty ())
2124 	    chains.pop ().release ();
2125 	  return node;
2126 	}
2127 out:
2128       while (!children.is_empty ())
2129 	vect_free_slp_tree (children.pop ());
2130       while (!chains.is_empty ())
2131 	chains.pop ().release ();
2132       /* Hard-fail, otherwise we might run into quadratic processing of the
2133 	 chains starting one stmt into the chain again.  */
2134       if (hard_fail)
2135 	return NULL;
2136       /* Fall thru to normal processing.  */
2137     }
2138 
2139   /* Get at the operands, verifying they are compatible.  */
2140   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2141   slp_oprnd_info oprnd_info;
2142   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2143     {
2144       int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2145 					     stmts, i, &oprnds_info);
2146       if (res != 0)
2147 	matches[(res == -1) ? 0 : i] = false;
2148       if (!matches[0])
2149 	break;
2150     }
2151   for (i = 0; i < group_size; ++i)
2152     if (!matches[i])
2153       {
2154 	vect_free_oprnd_info (oprnds_info);
2155 	return NULL;
2156       }
2157   swap = NULL;
2158 
2159   auto_vec<slp_tree, 4> children;
2160 
2161   stmt_info = stmts[0];
2162 
2163   /* Create SLP_TREE nodes for the definition node/s.  */
2164   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2165     {
2166       slp_tree child;
2167       unsigned int j;
2168 
2169       /* We're skipping certain operands from processing, for example
2170 	 outer loop reduction initial defs.  */
2171       if (skip_args[i])
2172 	{
2173 	  children.safe_push (NULL);
2174 	  continue;
2175 	}
2176 
2177       if (oprnd_info->first_dt == vect_uninitialized_def)
2178 	{
2179 	  /* COND_EXPRs have one operand too many here when the condition
2180 	     is an SSA name.  */
2181 	  gcc_assert (i == 3 && nops == 4);
2182 	  continue;
2183 	}
2184 
2185       if (is_a <bb_vec_info> (vinfo)
2186 	  && oprnd_info->first_dt == vect_internal_def
2187 	  && !oprnd_info->any_pattern)
2188 	{
2189 	  /* For BB vectorization, if all defs are the same do not
2190 	     bother to continue the build along the single-lane
2191 	     graph but use a splat of the scalar value.  */
2192 	  stmt_vec_info first_def = oprnd_info->def_stmts[0];
2193 	  for (j = 1; j < group_size; ++j)
2194 	    if (oprnd_info->def_stmts[j] != first_def)
2195 	      break;
2196 	  if (j == group_size
2197 	      /* But avoid doing this for loads where we may be
2198 		 able to CSE things, unless the stmt is not
2199 		 vectorizable.  */
2200 	      && (!STMT_VINFO_VECTORIZABLE (first_def)
2201 		  || !gimple_vuse (first_def->stmt)))
2202 	    {
2203 	      if (dump_enabled_p ())
2204 		dump_printf_loc (MSG_NOTE, vect_location,
2205 				 "Using a splat of the uniform operand %G",
2206 				 first_def->stmt);
2207 	      oprnd_info->first_dt = vect_external_def;
2208 	    }
2209 	}
2210 
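      /* Constant and external operands become invariant child nodes built
	 directly from the collected scalar operands.  */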
2211       if (oprnd_info->first_dt == vect_external_def
2212 	  || oprnd_info->first_dt == vect_constant_def)
2213 	{
2214 	  slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2215 	  SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2216 	  oprnd_info->ops = vNULL;
2217 	  children.safe_push (invnode);
2218 	  continue;
2219 	}
2220 
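      /* Otherwise recurse to build the SLP subtree for this operand.  */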
2221       if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2222 					group_size, &this_max_nunits,
2223 					matches, limit,
2224 					&this_tree_size, bst_map)) != NULL)
2225 	{
2226 	  oprnd_info->def_stmts = vNULL;
2227 	  children.safe_push (child);
2228 	  continue;
2229 	}
2230 
2231       /* If the SLP build for operand zero failed and operand zero
2232 	 and one can be commuted try that for the scalar stmts
2233 	 that failed the match.  */
2234       if (i == 0
2235 	  /* A first scalar stmt mismatch signals a fatal mismatch.  */
2236 	  && matches[0]
2237 	  /* ???  For COND_EXPRs we can swap the comparison operands
2238 	     as well as the arms under some constraints.  */
2239 	  && nops == 2
2240 	  && oprnds_info[1]->first_dt == vect_internal_def
2241 	  && is_gimple_assign (stmt_info->stmt)
2242 	  /* Swapping operands for reductions breaks assumptions later on.  */
2243 	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2244 	  && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2245 	{
2246 	  /* See whether we can swap the matching or the non-matching
2247 	     stmt operands.  */
2248 	  bool swap_not_matching = true;
2249 	  do
2250 	    {
2251 	      for (j = 0; j < group_size; ++j)
2252 		{
2253 		  if (matches[j] != !swap_not_matching)
2254 		    continue;
2255 		  stmt_vec_info stmt_info = stmts[j];
2256 		  /* Verify if we can swap operands of this stmt.  */
2257 		  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2258 		  if (!stmt
2259 		      || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2260 		    {
2261 		      if (!swap_not_matching)
2262 			goto fail;
2263 		      swap_not_matching = false;
2264 		      break;
2265 		    }
2266 		}
2267 	    }
2268 	  while (j != group_size);
2269 
2270 	  /* Swap mismatched definition stmts.  */
2271 	  if (dump_enabled_p ())
2272 	    dump_printf_loc (MSG_NOTE, vect_location,
2273 			     "Re-trying with swapped operands of stmts ");
2274 	  for (j = 0; j < group_size; ++j)
2275 	    if (matches[j] == !swap_not_matching)
2276 	      {
2277 		std::swap (oprnds_info[0]->def_stmts[j],
2278 			   oprnds_info[1]->def_stmts[j]);
2279 		std::swap (oprnds_info[0]->ops[j],
2280 			   oprnds_info[1]->ops[j]);
2281 		if (dump_enabled_p ())
2282 		  dump_printf (MSG_NOTE, "%d ", j);
2283 	      }
2284 	  if (dump_enabled_p ())
2285 	    dump_printf (MSG_NOTE, "\n");
2286 	  /* After swapping some operands we lost track whether an
2287 	     operand has any pattern defs so be conservative here.  */
2288 	  if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2289 	    oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2290 	  /* And try again with scratch 'matches' ... */
2291 	  bool *tem = XALLOCAVEC (bool, group_size);
2292 	  if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2293 					    group_size, &this_max_nunits,
2294 					    tem, limit,
2295 					    &this_tree_size, bst_map)) != NULL)
2296 	    {
2297 	      oprnd_info->def_stmts = vNULL;
2298 	      children.safe_push (child);
2299 	      continue;
2300 	    }
2301 	}
2302 fail:
2303 
2304       /* If the SLP build failed and we analyze a basic-block
2305 	 simply treat nodes we fail to build as externally defined
2306 	 (and thus build vectors from the scalar defs).
2307 	 The cost model will reject outright expensive cases.
2308 	 ???  This doesn't treat cases where permutation ultimately
2309 	 fails (or we don't try permutation below).  Ideally we'd
2310 	 even compute a permutation that will end up with the maximum
2311 	 SLP tree size...  */
2312       if (is_a <bb_vec_info> (vinfo)
2313 	  /* ???  Rejecting patterns this way doesn't work.  We'd have to
2314 	     do extra work to cancel the pattern so the uses see the
2315 	     scalar version.  */
2316 	  && !is_pattern_stmt_p (stmt_info)
2317 	  && !oprnd_info->any_pattern)
2318 	{
2319 	  /* But if there's a leading vector sized set of matching stmts
2320 	     fail here so we can split the group.  This matches the condition
2321 	     vect_analyze_slp_instance uses.  */
2322 	  /* ???  We might want to split here and combine the results to support
2323 	     multiple vector sizes better.  */
2324 	  for (j = 0; j < group_size; ++j)
2325 	    if (!matches[j])
2326 	      break;
2327 	  if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2328 	    {
2329 	      if (dump_enabled_p ())
2330 		dump_printf_loc (MSG_NOTE, vect_location,
2331 				 "Building vector operands from scalars\n");
2332 	      this_tree_size++;
2333 	      child = vect_create_new_slp_node (oprnd_info->ops);
2334 	      children.safe_push (child);
2335 	      oprnd_info->ops = vNULL;
2336 	      continue;
2337 	    }
2338 	}
2339 
2340       gcc_assert (child == NULL);
2341       FOR_EACH_VEC_ELT (children, j, child)
2342 	if (child)
2343 	  vect_free_slp_tree (child);
2344       vect_free_oprnd_info (oprnds_info);
2345       return NULL;
2346     }
2347 
2348   vect_free_oprnd_info (oprnds_info);
2349 
2350   /* If all children of this node are built up from uniform scalars,
2351      or if building them requires more than one possibly expensive
2352      vector construction, throw the node away so it is built up from
2353      scalars instead.  The exception is the SLP node for the vector store.  */
2354   if (is_a <bb_vec_info> (vinfo)
2355       && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2356       /* ???  Rejecting patterns this way doesn't work.  We'd have to
2357 	 do extra work to cancel the pattern so the uses see the
2358 	 scalar version.  */
2359       && !is_pattern_stmt_p (stmt_info))
2360     {
2361       slp_tree child;
2362       unsigned j;
2363       bool all_uniform_p = true;
2364       unsigned n_vector_builds = 0;
2365       FOR_EACH_VEC_ELT (children, j, child)
2366 	{
2367 	  if (!child)
2368 	    ;
2369 	  else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2370 	    all_uniform_p = false;
2371 	  else if (!vect_slp_tree_uniform_p (child))
2372 	    {
2373 	      all_uniform_p = false;
2374 	      if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2375 		n_vector_builds++;
2376 	    }
2377 	}
2378       if (all_uniform_p
2379 	  || n_vector_builds > 1
2380 	  || (n_vector_builds == children.length ()
2381 	      && is_a <gphi *> (stmt_info->stmt)))
2382 	{
2383 	  /* Roll back.  */
2384 	  matches[0] = false;
2385 	  FOR_EACH_VEC_ELT (children, j, child)
2386 	    if (child)
2387 	      vect_free_slp_tree (child);
2388 
2389 	  if (dump_enabled_p ())
2390 	    dump_printf_loc (MSG_NOTE, vect_location,
2391 			     "Building parent vector operands from "
2392 			     "scalars instead\n");
2393 	  return NULL;
2394 	}
2395     }
2396 
2397   *tree_size += this_tree_size + 1;
2398   *max_nunits = this_max_nunits;
2399 
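  /* For a two-operator group (e.g. mixed plus/minus) build one node per
     operation code sharing the operand children and blend their lanes
     with a VEC_PERM_EXPR node recording the original lane assignment.  */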
2400   if (two_operators)
2401     {
2402       /* ???  We'd likely want to either cache in bst_map sth like
2403 	 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2404 	 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2405 	 explicit stmts to put in so the keying on 'stmts' doesn't
2406 	 work (but we have the same issue with nodes that use 'ops').  */
2407       slp_tree one = new _slp_tree;
2408       slp_tree two = new _slp_tree;
2409       SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2410       SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2411       SLP_TREE_VECTYPE (one) = vectype;
2412       SLP_TREE_VECTYPE (two) = vectype;
2413       SLP_TREE_CHILDREN (one).safe_splice (children);
2414       SLP_TREE_CHILDREN (two).safe_splice (children);
2415       slp_tree child;
2416       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2417 	SLP_TREE_REF_COUNT (child)++;
2418 
2419       /* Here we record the original defs since this
2420 	 node represents the final lane configuration.  */
2421       node = vect_create_new_slp_node (node, stmts, 2);
2422       SLP_TREE_VECTYPE (node) = vectype;
2423       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2424       SLP_TREE_CHILDREN (node).quick_push (one);
2425       SLP_TREE_CHILDREN (node).quick_push (two);
2426       gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2427       enum tree_code code0 = gimple_assign_rhs_code (stmt);
2428       enum tree_code ocode = ERROR_MARK;
2429       stmt_vec_info ostmt_info;
2430       unsigned j = 0;
2431       FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2432 	{
2433 	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2434 	  if (gimple_assign_rhs_code (ostmt) != code0)
2435 	    {
2436 	      SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2437 	      ocode = gimple_assign_rhs_code (ostmt);
2438 	      j = i;
2439 	    }
2440 	  else
2441 	    SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2442 	}
2443       SLP_TREE_CODE (one) = code0;
2444       SLP_TREE_CODE (two) = ocode;
2445       SLP_TREE_LANES (one) = stmts.length ();
2446       SLP_TREE_LANES (two) = stmts.length ();
2447       SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2448       SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2449       return node;
2450     }
2451 
2452   node = vect_create_new_slp_node (node, stmts, nops);
2453   SLP_TREE_VECTYPE (node) = vectype;
2454   SLP_TREE_CHILDREN (node).splice (children);
2455   return node;
2456 }
2457 
2458 /* Dump a single SLP tree NODE.  */
2459 
2460 static void
2461 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2462 		     slp_tree node)
2463 {
2464   unsigned i, j;
2465   slp_tree child;
2466   stmt_vec_info stmt_info;
2467   tree op;
2468 
2469   dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2470   dump_user_location_t user_loc = loc.get_user_location ();
2471   dump_printf_loc (metadata, user_loc,
2472 		   "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2473 		   ", refcnt=%u)",
2474 		   SLP_TREE_DEF_TYPE (node) == vect_external_def
2475 		   ? " (external)"
2476 		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2477 		      ? " (constant)"
2478 		      : ""), node,
2479 		   estimated_poly_value (node->max_nunits),
2480 					 SLP_TREE_REF_COUNT (node));
2481   if (SLP_TREE_VECTYPE (node))
2482     dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2483   dump_printf (metadata, "\n");
2484   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2485     {
2486       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2487 	dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2488       else
2489 	dump_printf_loc (metadata, user_loc, "op template: %G",
2490 			 SLP_TREE_REPRESENTATIVE (node)->stmt);
2491     }
2492   if (SLP_TREE_SCALAR_STMTS (node).exists ())
2493     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2494       dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2495   else
2496     {
2497       dump_printf_loc (metadata, user_loc, "\t{ ");
2498       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2499 	dump_printf (metadata, "%T%s ", op,
2500 		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2501       dump_printf (metadata, "}\n");
2502     }
2503   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2504     {
2505       dump_printf_loc (metadata, user_loc, "\tload permutation {");
2506       FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2507 	dump_printf (dump_kind, " %u", j);
2508       dump_printf (dump_kind, " }\n");
2509     }
2510   if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2511     {
2512       dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2513       for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2514 	dump_printf (dump_kind, " %u[%u]",
2515 		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
2516 		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
2517       dump_printf (dump_kind, " }\n");
2518     }
2519   if (SLP_TREE_CHILDREN (node).is_empty ())
2520     return;
2521   dump_printf_loc (metadata, user_loc, "\tchildren");
2522   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2523     dump_printf (dump_kind, " %p", (void *)child);
2524   dump_printf (dump_kind, "\n");
2525 }
2526 
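/* Dump SLP tree NODE; meant to be called from the debugger.  */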
2527 DEBUG_FUNCTION void
2528 debug (slp_tree node)
2529 {
2530   debug_dump_context ctx;
2531   vect_print_slp_tree (MSG_NOTE,
2532 		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
2533 		       node);
2534 }
2535 
2536 /* Recursive helper for the dot producer below.  */
2537 
2538 static void
2539 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2540 {
2541   if (visited.add (node))
2542     return;
2543 
2544   fprintf (f, "\"%p\" [label=\"", (void *)node);
2545   vect_print_slp_tree (MSG_NOTE,
2546 		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
2547 		       node);
2548   fprintf (f, "\"];\n");
2549 
2550 
2551   for (slp_tree child : SLP_TREE_CHILDREN (node))
2552     fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2553 
2554   for (slp_tree child : SLP_TREE_CHILDREN (node))
2555     if (child)
2556       dot_slp_tree (f, child, visited);
2557 }
2558 
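/* Write a GraphViz dot representation of the SLP graph rooted at NODE
   to the file FNAME.  */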
2559 DEBUG_FUNCTION void
2560 dot_slp_tree (const char *fname, slp_tree node)
2561 {
2562   FILE *f = fopen (fname, "w");
2563   fprintf (f, "digraph {\n");
2564   fflush (f);
2565     {
2566       debug_dump_context ctx (f);
2567       hash_set<slp_tree> visited;
2568       dot_slp_tree (f, node, visited);
2569     }
2570   fflush (f);
2571   fprintf (f, "}\n");
2572   fclose (f);
2573 }
2574 
2575 /* Dump a slp tree NODE using flags specified in DUMP_KIND.  */
2576 
2577 static void
2578 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2579 		      slp_tree node, hash_set<slp_tree> &visited)
2580 {
2581   unsigned i;
2582   slp_tree child;
2583 
2584   if (visited.add (node))
2585     return;
2586 
2587   vect_print_slp_tree (dump_kind, loc, node);
2588 
2589   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2590     if (child)
2591       vect_print_slp_graph (dump_kind, loc, child, visited);
2592 }
2593 
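/* Dump the SLP graph rooted at ENTRY using flags specified in DUMP_KIND.  */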
2594 static void
2595 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2596 		      slp_tree entry)
2597 {
2598   hash_set<slp_tree> visited;
2599   vect_print_slp_graph (dump_kind, loc, entry, visited);
2600 }
2601 
2602 /* Mark the tree rooted at NODE with PURE_SLP.  */
2603 
2604 static void
2605 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2606 {
2607   int i;
2608   stmt_vec_info stmt_info;
2609   slp_tree child;
2610 
2611   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2612     return;
2613 
2614   if (visited.add (node))
2615     return;
2616 
2617   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2618     STMT_SLP_TYPE (stmt_info) = pure_slp;
2619 
2620   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2621     if (child)
2622       vect_mark_slp_stmts (child, visited);
2623 }
2624 
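/* Mark the tree rooted at NODE with PURE_SLP, starting with an empty
   visited set.  */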
2625 static void
2626 vect_mark_slp_stmts (slp_tree node)
2627 {
2628   hash_set<slp_tree> visited;
2629   vect_mark_slp_stmts (node, visited);
2630 }
2631 
2632 /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
2633 
2634 static void
2635 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2636 {
2637   int i;
2638   stmt_vec_info stmt_info;
2639   slp_tree child;
2640 
2641   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2642     return;
2643 
2644   if (visited.add (node))
2645     return;
2646 
2647   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2648     {
2649       gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2650                   || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2651       STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2652     }
2653 
2654   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2655     if (child)
2656       vect_mark_slp_stmts_relevant (child, visited);
2657 }
2658 
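/* Mark the statements of the tree rooted at NODE as relevant, starting
   with an empty visited set.  */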
2659 static void
2660 vect_mark_slp_stmts_relevant (slp_tree node)
2661 {
2662   hash_set<slp_tree> visited;
2663   vect_mark_slp_stmts_relevant (node, visited);
2664 }
2665 
2666 
2667 /* Gather loads in the SLP graph NODE and populate the INST loads array.  */
2668 
2669 static void
2670 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2671 		       hash_set<slp_tree> &visited)
2672 {
2673   if (!node || visited.add (node))
2674     return;
2675 
2676   if (SLP_TREE_CHILDREN (node).length () == 0)
2677     {
2678       if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2679 	return;
2680       stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2681       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2682 	  && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2683 	loads.safe_push (node);
2684     }
2685   else
2686     {
2687       unsigned i;
2688       slp_tree child;
2689       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2690 	vect_gather_slp_loads (loads, child, visited);
2691     }
2692 }
2693 
2694 
2695 /* Find the last scalar stmt in NODE, going by the position of the
   original (pre-pattern) stmts.  */
2696 
2697 stmt_vec_info
2698 vect_find_last_scalar_stmt_in_slp (slp_tree node)
2699 {
2700   stmt_vec_info last = NULL;
2701   stmt_vec_info stmt_vinfo;
2702 
2703   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2704     {
2705       stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2706       last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2707     }
2708 
2709   return last;
2710 }
2711 
2712 /* Find the first stmt in NODE.  */
2713 
2714 stmt_vec_info
2715 vect_find_first_scalar_stmt_in_slp (slp_tree node)
2716 {
2717   stmt_vec_info first = NULL;
2718   stmt_vec_info stmt_vinfo;
2719 
2720   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2721     {
2722       stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2723       if (!first
2724 	  || get_later_stmt (stmt_vinfo, first) == first)
2725 	first = stmt_vinfo;
2726     }
2727 
2728   return first;
2729 }
2730 
2731 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2732    two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2733    (also containing the first GROUP1_SIZE stmts, since stores are
2734    consecutive), the second containing the remainder.
2735    Return the first stmt in the second group.  */
2736 
2737 static stmt_vec_info
2738 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
2739 {
2740   gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
2741   gcc_assert (group1_size > 0);
2742   int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
2743   gcc_assert (group2_size > 0);
2744   DR_GROUP_SIZE (first_vinfo) = group1_size;
2745 
2746   stmt_vec_info stmt_info = first_vinfo;
2747   for (unsigned i = group1_size; i > 1; i--)
2748     {
2749       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
2750       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2751     }
2752   /* STMT is now the last element of the first group.  */
2753   stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
2754   DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
2755 
2756   DR_GROUP_SIZE (group2) = group2_size;
2757   for (stmt_info = group2; stmt_info;
2758        stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
2759     {
2760       DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
2761       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
2762     }
2763 
2764   /* For the second group, the DR_GROUP_GAP is that before the original group,
2765      plus skipping over the first vector.  */
2766   DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
2767 
2768   /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
2769   DR_GROUP_GAP (first_vinfo) += group2_size;
2770 
2771   if (dump_enabled_p ())
2772     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
2773 		     group1_size, group2_size);
2774 
2775   return group2;
2776 }
2777 
2778 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
2779    statements and a vector of NUNITS elements.  */
2780 
2781 static poly_uint64
2782 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
2783 {
2784   return exact_div (common_multiple (nunits, group_size), group_size);
2785 }
2786 
2787 /* Helper that checks to see if a node is a load node.  */
2788 
2789 static inline bool
2790 vect_is_slp_load_node (slp_tree root)
2791 {
2792   return SLP_TREE_DEF_TYPE (root) == vect_internal_def
2793 	 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
2794 	 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
2795 }
2796 
2797 
2798 /* Helper function of optimize_load_redistribution that performs the operation
2799    recursively.  */
2800 
2801 static slp_tree
2802 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
2803 				vec_info *vinfo, unsigned int group_size,
2804 				hash_map<slp_tree, slp_tree> *load_map,
2805 				slp_tree root)
2806 {
2807   if (slp_tree *leader = load_map->get (root))
2808     return *leader;
2809 
2810   slp_tree node;
2811   unsigned i;
2812 
2813   /* For now, we don't know anything about externals so do not do anything.  */
2814   if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
2815     return NULL;
2816   else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
2817     {
2818       /* First convert this node into a load node, add it to the leaves
2819 	 list and flatten the lane permute into a load permutation.  If it's
2820 	 unneeded it will be elided later.  */
2821       vec<stmt_vec_info> stmts;
2822       stmts.create (SLP_TREE_LANES (root));
2823       lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
2824       for (unsigned j = 0; j < lane_perm.length (); j++)
2825 	{
2826 	  std::pair<unsigned, unsigned> perm = lane_perm[j];
2827 	  node = SLP_TREE_CHILDREN (root)[perm.first];
2828 
2829 	  if (!vect_is_slp_load_node (node)
2830 	      || SLP_TREE_CHILDREN (node).exists ())
2831 	    {
2832 	      stmts.release ();
2833 	      goto next;
2834 	    }
2835 
2836 	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
2837 	}
2838 
2839       if (dump_enabled_p ())
2840 	dump_printf_loc (MSG_NOTE, vect_location,
2841 			 "converting stmts on permute node %p\n", root);
2842 
2843       bool *matches = XALLOCAVEC (bool, group_size);
2844       poly_uint64 max_nunits = 1;
2845       unsigned tree_size = 0, limit = 1;
2846       node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
2847 				  matches, &limit, &tree_size, bst_map);
2848       if (!node)
2849 	stmts.release ();
2850 
2851       load_map->put (root, node);
2852       return node;
2853     }
2854 
2855 next:
2856   load_map->put (root, NULL);
2857 
2858   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2859     {
2860       slp_tree value
2861 	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2862 					  node);
2863       if (value)
2864 	{
2865 	  SLP_TREE_REF_COUNT (value)++;
2866 	  SLP_TREE_CHILDREN (root)[i] = value;
2867 	  /* ???  We know the original leaves of the replaced nodes will
2868 	     be referenced by bst_map, only the permutes created by
2869 	     pattern matching are not.  */
2870 	  if (SLP_TREE_REF_COUNT (node) == 1)
2871 	    load_map->remove (node);
2872 	  vect_free_slp_tree (node);
2873 	}
2874     }
2875 
2876   return NULL;
2877 }
2878 
2879 /* Temporary workaround for loads not being CSEd during SLP build.  This
2880    function will traverse the SLP tree rooted in ROOT for INSTANCE and find
2881    VEC_PERM nodes that blend vectors from multiple nodes that all read from the
2882    same DR such that the final operation is equal to a permuted load.  Such
2883    NODES are then directly converted into LOADS themselves.  The nodes are
2884    CSEd using BST_MAP.  */
2885 
2886 static void
2887 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
2888 			      vec_info *vinfo, unsigned int group_size,
2889 			      hash_map<slp_tree, slp_tree> *load_map,
2890 			      slp_tree root)
2891 {
2892   slp_tree node;
2893   unsigned i;
2894 
2895   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
2896     {
2897       slp_tree value
2898 	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
2899 					  node);
2900       if (value)
2901 	{
2902 	  SLP_TREE_REF_COUNT (value)++;
2903 	  SLP_TREE_CHILDREN (root)[i] = value;
2904 	  /* ???  We know the original leaves of the replaced nodes will
2905 	     be referenced by bst_map, only the permutes created by
2906 	     pattern matching are not.  */
2907 	  if (SLP_TREE_REF_COUNT (node) == 1)
2908 	    load_map->remove (node);
2909 	  vect_free_slp_tree (node);
2910 	}
2911     }
2912 }
2913 
2914 /* Helper function of vect_match_slp_patterns.
2915 
2916    Attempts to match patterns against the slp tree rooted in REF_NODE using
2917    VINFO.  Patterns are matched in post-order traversal.
2918 
2919    If any pattern matches, the node referenced by REF_NODE is replaced in
2920    place and true is returned; otherwise false is returned and the tree
   is left unchanged.  */
2921 
2922 static bool
2923 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
2924 			   slp_tree_to_load_perm_map_t *perm_cache,
2925 			   slp_compat_nodes_map_t *compat_cache,
2926 			   hash_set<slp_tree> *visited)
2927 {
2928   unsigned i;
2929   slp_tree node = *ref_node;
2930   bool found_p = false;
2931   if (!node || visited->add (node))
2932     return false;
2933 
2934   slp_tree child;
2935   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2936     found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
2937 					  vinfo, perm_cache, compat_cache,
2938 					  visited);
2939 
2940   for (unsigned x = 0; x < num__slp_patterns; x++)
2941     {
2942       vect_pattern *pattern
2943 	= slp_patterns[x] (perm_cache, compat_cache, ref_node);
2944       if (pattern)
2945 	{
2946 	  pattern->build (vinfo);
2947 	  delete pattern;
2948 	  found_p = true;
2949 	}
2950     }
2951 
2952   return found_p;
2953 }
2954 
2955 /* Applies pattern matching to the SLP tree of INSTANCE using vec_info
2956    VINFO.
2957 
2958    Returns true if any pattern matched and modified the tree.  Patterns are
2959    tried in order and multiple patterns may match.  */
2960 
2961 static bool
2962 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
2963 			 hash_set<slp_tree> *visited,
2964 			 slp_tree_to_load_perm_map_t *perm_cache,
2965 			 slp_compat_nodes_map_t *compat_cache)
2966 {
2967   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
2968   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
2969 
2970   if (dump_enabled_p ())
2971     dump_printf_loc (MSG_NOTE, vect_location,
2972 		     "Analyzing SLP tree %p for patterns\n",
2973 		     SLP_INSTANCE_TREE (instance));
2974 
2975   return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
2976 				    visited);
2977 }
2978 
2979 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
2980    splitting into two, with the first split group having size NEW_GROUP_SIZE.
2981    Return true if we could use IFN_STORE_LANES instead and if that appears
2982    to be the better approach.  */
2983 
2984 static bool
2985 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
2986 			       unsigned int group_size,
2987 			       unsigned int new_group_size)
2988 {
2989   tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
2990   tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
2991   if (!vectype)
2992     return false;
2993   /* Allow the split if one of the two new groups would operate on full
2994      vectors *within* rather than across one scalar loop iteration.
2995      This is purely a heuristic, but it should work well for group
2996      sizes of 3 and 4, where the possible splits are:
2997 
2998        3->2+1:  OK if the vector has exactly two elements
2999        4->2+2:  Likewise
3000        4->3+1:  Less clear-cut.  */
3001   if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3002       || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3003     return false;
3004   return vect_store_lanes_supported (vectype, group_size, false);
3005 }
3006 
3007 /* Analyze an SLP instance starting from a group of grouped stores.  Call
3008    vect_build_slp_tree to build a tree of packed stmts if possible.
3009    Return FALSE if it's impossible to SLP any stmt in the loop.  */
3010 
3011 static bool
3012 vect_analyze_slp_instance (vec_info *vinfo,
3013 			   scalar_stmts_to_slp_tree_map_t *bst_map,
3014 			   stmt_vec_info stmt_info, slp_instance_kind kind,
3015 			   unsigned max_tree_size, unsigned *limit);
3016 
3017 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3018    of KIND.  Return true if successful.  */
3019 
3020 static bool
3021 vect_build_slp_instance (vec_info *vinfo,
3022 			 slp_instance_kind kind,
3023 			 vec<stmt_vec_info> &scalar_stmts,
3024 			 vec<stmt_vec_info> &root_stmt_infos,
3025 			 unsigned max_tree_size, unsigned *limit,
3026 			 scalar_stmts_to_slp_tree_map_t *bst_map,
3027 			 /* ???  We need stmt_info for group splitting.  */
3028 			 stmt_vec_info stmt_info_)
3029 {
3030   if (dump_enabled_p ())
3031     {
3032       dump_printf_loc (MSG_NOTE, vect_location,
3033 		       "Starting SLP discovery for\n");
3034       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3035 	dump_printf_loc (MSG_NOTE, vect_location,
3036 			 "  %G", scalar_stmts[i]->stmt);
3037     }
3038 
3039   /* Build the tree for the SLP instance.  */
3040   unsigned int group_size = scalar_stmts.length ();
3041   bool *matches = XALLOCAVEC (bool, group_size);
3042   poly_uint64 max_nunits = 1;
3043   unsigned tree_size = 0;
3044   unsigned i;
3045   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3046 				       &max_nunits, matches, limit,
3047 				       &tree_size, bst_map);
3048   if (node != NULL)
3049     {
3050       /* Calculate the unrolling factor based on the smallest type.  */
3051       poly_uint64 unrolling_factor
3052 	= calculate_unrolling_factor (max_nunits, group_size);
3053 
3054       if (maybe_ne (unrolling_factor, 1U)
3055 	  && is_a <bb_vec_info> (vinfo))
3056 	{
3057 	  unsigned HOST_WIDE_INT const_max_nunits;
3058 	  if (!max_nunits.is_constant (&const_max_nunits)
3059 	      || const_max_nunits > group_size)
3060 	    {
3061 	      if (dump_enabled_p ())
3062 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3063 				 "Build SLP failed: store group "
3064 				 "size not a multiple of the vector size "
3065 				 "in basic block SLP\n");
3066 	      vect_free_slp_tree (node);
3067 	      return false;
3068 	    }
3069 	  /* Fatal mismatch.  */
3070 	  if (dump_enabled_p ())
3071 	    dump_printf_loc (MSG_NOTE, vect_location,
3072 			     "SLP discovery succeeded but node needs "
3073 			     "splitting\n");
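	  /* Mark all lanes as matching and force a split point at the last
	     full-vector boundary; e.g. with group_size 6 and const_max_nunits
	     4 this clears matches[4] so the group is later split into 4 + 2
	     (illustrative numbers only).  */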
3074 	  memset (matches, true, group_size);
3075 	  matches[group_size / const_max_nunits * const_max_nunits] = false;
3076 	  vect_free_slp_tree (node);
3077 	}
3078       else
3079 	{
3080 	  /* Create a new SLP instance.  */
3081 	  slp_instance new_instance = XNEW (class _slp_instance);
3082 	  SLP_INSTANCE_TREE (new_instance) = node;
3083 	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3084 	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
3085 	  SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3086 	  SLP_INSTANCE_KIND (new_instance) = kind;
3087 	  new_instance->reduc_phis = NULL;
3088 	  new_instance->cost_vec = vNULL;
3089 	  new_instance->subgraph_entries = vNULL;
3090 
3091 	  if (dump_enabled_p ())
3092 	    dump_printf_loc (MSG_NOTE, vect_location,
3093 			     "SLP size %u vs. limit %u.\n",
3094 			     tree_size, max_tree_size);
3095 
3096 	  /* Fixup SLP reduction chains.  */
3097 	  if (kind == slp_inst_kind_reduc_chain)
3098 	    {
3099 	      /* If this is a reduction chain with a conversion in front
3100 		 amend the SLP tree with a node for that.  */
3101 	      gimple *scalar_def
3102 		= vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3103 	      if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3104 		{
3105 		  /* Get at the conversion stmt - we know it's the single use
3106 		     of the last stmt of the reduction chain.  */
3107 		  use_operand_p use_p;
3108 		  bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3109 					   &use_p, &scalar_def);
3110 		  gcc_assert (r);
3111 		  stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3112 		  next_info = vect_stmt_to_vectorize (next_info);
3113 		  scalar_stmts = vNULL;
3114 		  scalar_stmts.create (group_size);
3115 		  for (unsigned i = 0; i < group_size; ++i)
3116 		    scalar_stmts.quick_push (next_info);
3117 		  slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3118 		  SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3119 		  SLP_TREE_CHILDREN (conv).quick_push (node);
3120 		  SLP_INSTANCE_TREE (new_instance) = conv;
3121 		  /* We also have to fake this conversion stmt as SLP reduction
3122 		     group so we don't have to mess with too much code
3123 		     elsewhere.  */
3124 		  REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3125 		  REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3126 		}
3127 	      /* Fill the backedge child of the PHI SLP node.  The
3128 		 general matching code cannot find it because the
3129 		 scalar code does not reflect how we vectorize the
3130 		 reduction.  */
3131 	      use_operand_p use_p;
3132 	      imm_use_iterator imm_iter;
3133 	      class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3134 	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3135 				     gimple_get_lhs (scalar_def))
3136 		/* There are exactly two non-debug uses, the reduction
3137 		   PHI and the loop-closed PHI node.  */
3138 		if (!is_gimple_debug (USE_STMT (use_p))
3139 		    && gimple_bb (USE_STMT (use_p)) == loop->header)
3140 		  {
3141 		    auto_vec<stmt_vec_info, 64> phis (group_size);
3142 		    stmt_vec_info phi_info
3143 		      = vinfo->lookup_stmt (USE_STMT (use_p));
3144 		    for (unsigned i = 0; i < group_size; ++i)
3145 		      phis.quick_push (phi_info);
3146 		    slp_tree *phi_node = bst_map->get (phis);
3147 		    unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3148 		    SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3149 		      = SLP_INSTANCE_TREE (new_instance);
3150 		    SLP_INSTANCE_TREE (new_instance)->refcnt++;
3151 		  }
3152 	    }
3153 
3154 	  vinfo->slp_instances.safe_push (new_instance);
3155 
3156 	  /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3157 	     the number of scalar stmts in the root in a few places.
3158 	     Verify that assumption holds.  */
3159 	  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3160 			.length () == group_size);
3161 
3162 	  if (dump_enabled_p ())
3163 	    {
3164 	      dump_printf_loc (MSG_NOTE, vect_location,
3165 			       "Final SLP tree for instance %p:\n", new_instance);
3166 	      vect_print_slp_graph (MSG_NOTE, vect_location,
3167 				    SLP_INSTANCE_TREE (new_instance));
3168 	    }
3169 
3170 	  return true;
3171 	}
3172     }
3173   else
3174     {
3175       /* Failed to SLP.  */
3176       /* Free the allocated memory.  */
3177       scalar_stmts.release ();
3178     }
3179 
3180   stmt_vec_info stmt_info = stmt_info_;
3181   /* Try to break the group up into pieces.  */
3182   if (kind == slp_inst_kind_store)
3183     {
3184       /* ???  We could delay all the actual splitting of store-groups
3185 	 until after SLP discovery of the original group completed.
3186 	 Then we can recurse to vect_build_slp_instance directly.  */
3187       for (i = 0; i < group_size; i++)
3188 	if (!matches[i])
3189 	  break;
3190 
3191       /* For basic block SLP, try to break the group up into multiples of
3192 	 a vector size.  */
3193       if (is_a <bb_vec_info> (vinfo)
3194 	  && (i > 1 && i < group_size))
3195 	{
3196 	  tree scalar_type
3197 	    = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3198 	  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3199 						      1 << floor_log2 (i));
3200 	  unsigned HOST_WIDE_INT const_nunits;
3201 	  if (vectype
3202 	      && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3203 	    {
3204 	      /* Split into two groups at the first vector boundary.  */
3205 	      gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3206 	      unsigned group1_size = i & ~(const_nunits - 1);
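	      /* For example (illustrative only), if discovery matched
		 i = 5 lanes and the chosen vector has const_nunits = 4,
		 group1_size becomes 4: the first four stores form one
		 group and the tail is re-analyzed below.  */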
3207 
3208 	      if (dump_enabled_p ())
3209 		dump_printf_loc (MSG_NOTE, vect_location,
3210 				 "Splitting SLP group at stmt %u\n", i);
3211 	      stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3212 							       group1_size);
3213 	      bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3214 						    kind, max_tree_size,
3215 						    limit);
3216 	      /* Split the rest at the failure point and possibly
3217 		 re-analyze the remaining matching part if it has
3218 		 at least two lanes.  */
3219 	      if (group1_size < i
3220 		  && (i + 1 < group_size
3221 		      || i - group1_size > 1))
3222 		{
3223 		  stmt_vec_info rest2 = rest;
3224 		  rest = vect_split_slp_store_group (rest, i - group1_size);
3225 		  if (i - group1_size > 1)
3226 		    res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3227 						      kind, max_tree_size,
3228 						      limit);
3229 		}
3230 	      /* Re-analyze the non-matching tail if it has at least
3231 		 two lanes.  */
3232 	      if (i + 1 < group_size)
3233 		res |= vect_analyze_slp_instance (vinfo, bst_map,
3234 						  rest, kind, max_tree_size,
3235 						  limit);
3236 	      return res;
3237 	    }
3238 	}
3239 
3240       /* For loop vectorization split into arbitrary pieces of size > 1.  */
3241       if (is_a <loop_vec_info> (vinfo)
3242 	  && (i > 1 && i < group_size)
3243 	  && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3244 	{
3245 	  unsigned group1_size = i;
3246 
3247 	  if (dump_enabled_p ())
3248 	    dump_printf_loc (MSG_NOTE, vect_location,
3249 			     "Splitting SLP group at stmt %u\n", i);
3250 
3251 	  stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3252 							   group1_size);
3253 	  /* Loop vectorization cannot handle gaps in stores, make sure
3254 	     the split group appears as strided.  */
3255 	  STMT_VINFO_STRIDED_P (rest) = 1;
3256 	  DR_GROUP_GAP (rest) = 0;
3257 	  STMT_VINFO_STRIDED_P (stmt_info) = 1;
3258 	  DR_GROUP_GAP (stmt_info) = 0;
3259 
3260 	  bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3261 						kind, max_tree_size, limit);
3262 	  if (i + 1 < group_size)
3263 	    res |= vect_analyze_slp_instance (vinfo, bst_map,
3264 					      rest, kind, max_tree_size, limit);
3265 
3266 	  return res;
3267 	}
3268 
3269       /* Even though the first vector did not all match, we might be able to SLP
3270 	 (some) of the remainder.  FORNOW ignore this possibility.  */
3271     }
3272 
3273   /* Failed to SLP.  */
3274   if (dump_enabled_p ())
3275     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3276   return false;
3277 }
3278 
3279 
3280 /* Analyze an SLP instance starting from a group of grouped stores.  Call
3281    vect_build_slp_tree to build a tree of packed stmts if possible.
3282    Return FALSE if it's impossible to SLP any stmt in the loop.  */
3283 
3284 static bool
3285 vect_analyze_slp_instance (vec_info *vinfo,
3286 			   scalar_stmts_to_slp_tree_map_t *bst_map,
3287 			   stmt_vec_info stmt_info,
3288 			   slp_instance_kind kind,
3289 			   unsigned max_tree_size, unsigned *limit)
3290 {
3291   unsigned int i;
3292   vec<stmt_vec_info> scalar_stmts;
3293 
3294   if (is_a <bb_vec_info> (vinfo))
3295     vect_location = stmt_info->stmt;
3296 
3297   stmt_vec_info next_info = stmt_info;
3298   if (kind == slp_inst_kind_store)
3299     {
3300       /* Collect the stores and store them in scalar_stmts.  */
3301       scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3302       while (next_info)
3303 	{
3304 	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3305 	  next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3306 	}
3307     }
3308   else if (kind == slp_inst_kind_reduc_chain)
3309     {
3310       /* Collect the reduction stmts and store them in scalar_stmts.  */
3311       scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3312       while (next_info)
3313 	{
3314 	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3315 	  next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3316 	}
3317       /* Mark the first element of the reduction chain as reduction to properly
3318 	 transform the node.  In the reduction analysis phase only the last
3319 	 element of the chain is marked as reduction.  */
3320       STMT_VINFO_DEF_TYPE (stmt_info)
3321 	= STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3322       STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3323 	= STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3324     }
3325   else if (kind == slp_inst_kind_ctor)
3326     {
3327       tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
3328       tree val;
3329       scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
3330       FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
3331 	{
3332 	  stmt_vec_info def_info = vinfo->lookup_def (val);
3333 	  def_info = vect_stmt_to_vectorize (def_info);
3334 	  scalar_stmts.quick_push (def_info);
3335 	}
3336       if (dump_enabled_p ())
3337 	dump_printf_loc (MSG_NOTE, vect_location,
3338 			 "Analyzing vectorizable constructor: %G\n",
3339 			 stmt_info->stmt);
3340     }
3341   else if (kind == slp_inst_kind_reduc_group)
3342     {
3343       /* Collect reduction statements.  */
3344       const vec<stmt_vec_info> &reductions
3345 	= as_a <loop_vec_info> (vinfo)->reductions;
3346       scalar_stmts.create (reductions.length ());
3347       for (i = 0; reductions.iterate (i, &next_info); i++)
3348 	if ((STMT_VINFO_RELEVANT_P (next_info)
3349 	     || STMT_VINFO_LIVE_P (next_info))
3350 	    /* ???  Make sure we didn't skip a conversion around a reduction
3351 	       path.  In that case we'd have to reverse engineer that conversion
3352 	       stmt following the chain using reduc_idx and from the PHI
3353 	       using reduc_def.  */
3354 	    && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3355 	  scalar_stmts.quick_push (next_info);
3356       /* If less than two were relevant/live there's nothing to SLP.  */
3357       if (scalar_stmts.length () < 2)
3358 	return false;
3359     }
3360   else
3361     gcc_unreachable ();
3362 
3363   vec<stmt_vec_info> roots = vNULL;
3364   if (kind == slp_inst_kind_ctor)
3365     {
3366       roots.create (1);
3367       roots.quick_push (stmt_info);
3368     }
3369   /* Build the tree for the SLP instance.  */
3370   bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3371 				      roots,
3372 				      max_tree_size, limit, bst_map,
3373 				      kind == slp_inst_kind_store
3374 				      ? stmt_info : NULL);
3375   if (!res)
3376     roots.release ();
3377 
3378   /* ???  If this is slp_inst_kind_store and the above succeeded here's
3379      where we should do store group splitting.  */
3380 
3381   return res;
3382 }
3383 
3384 /* Check if there are stmts in the loop that can be vectorized using SLP.
3385    Build SLP trees of packed scalar stmts if SLP is possible.  */
3386 
3387 opt_result
3388 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3389 {
3390   unsigned int i;
3391   stmt_vec_info first_element;
3392   slp_instance instance;
3393 
3394   DUMP_VECT_SCOPE ("vect_analyze_slp");
3395 
3396   unsigned limit = max_tree_size;
3397 
3398   scalar_stmts_to_slp_tree_map_t *bst_map
3399     = new scalar_stmts_to_slp_tree_map_t ();
3400 
3401   /* Find SLP sequences starting from groups of grouped stores.  */
3402   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3403     vect_analyze_slp_instance (vinfo, bst_map, first_element,
3404 			       STMT_VINFO_GROUPED_ACCESS (first_element)
3405 			       ? slp_inst_kind_store : slp_inst_kind_ctor,
3406 			       max_tree_size, &limit);
3407 
3408   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3409     {
3410       for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3411 	{
3412 	  vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3413 	  if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3414 				       bb_vinfo->roots[i].stmts,
3415 				       bb_vinfo->roots[i].roots,
3416 				       max_tree_size, &limit, bst_map, NULL))
3417 	    {
3418 	      bb_vinfo->roots[i].stmts = vNULL;
3419 	      bb_vinfo->roots[i].roots = vNULL;
3420 	    }
3421 	}
3422     }
3423 
3424   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3425     {
3426       /* Find SLP sequences starting from reduction chains.  */
3427       FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3428 	if (! STMT_VINFO_RELEVANT_P (first_element)
3429 	    && ! STMT_VINFO_LIVE_P (first_element))
3430 	  ;
3431 	else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3432 					      slp_inst_kind_reduc_chain,
3433 					      max_tree_size, &limit))
3434 	  {
3435 	    /* Dissolve reduction chain group.  */
3436 	    stmt_vec_info vinfo = first_element;
3437 	    stmt_vec_info last = NULL;
3438 	    while (vinfo)
3439 	      {
3440 		stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3441 		REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3442 		REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3443 		last = vinfo;
3444 		vinfo = next;
3445 	      }
3446 	    STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3447 	    /* It can still be vectorized as part of an SLP reduction.  */
3448 	    loop_vinfo->reductions.safe_push (last);
3449 	  }
3450 
3451       /* Find SLP sequences starting from groups of reductions.  */
3452       if (loop_vinfo->reductions.length () > 1)
3453 	vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3454 				   slp_inst_kind_reduc_group, max_tree_size,
3455 				   &limit);
3456     }
3457 
3458   hash_set<slp_tree> visited_patterns;
3459   slp_tree_to_load_perm_map_t perm_cache;
3460   slp_compat_nodes_map_t compat_cache;
3461 
3462   /* See if any patterns can be found in the SLP tree.  */
3463   bool pattern_found = false;
3464   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3465     pattern_found |= vect_match_slp_patterns (instance, vinfo,
3466 					      &visited_patterns, &perm_cache,
3467 					      &compat_cache);
3468 
3469   /* If any were found, optimize permutations of loads.  */
3470   if (pattern_found)
3471     {
3472       hash_map<slp_tree, slp_tree> load_map;
3473       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3474 	{
3475 	  slp_tree root = SLP_INSTANCE_TREE (instance);
3476 	  optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3477 					&load_map, root);
3478 	}
3479     }
3480 
3481 
3482 
3483   /* The map keeps a reference on the SLP nodes built; release that.  */
3484   for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3485        it != bst_map->end (); ++it)
3486     if ((*it).second)
3487       vect_free_slp_tree ((*it).second);
3488   delete bst_map;
3489 
3490   if (pattern_found && dump_enabled_p ())
3491     {
3492       dump_printf_loc (MSG_NOTE, vect_location,
3493 		       "Pattern matched SLP tree\n");
3494       hash_set<slp_tree> visited;
3495       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3496 	vect_print_slp_graph (MSG_NOTE, vect_location,
3497 			      SLP_INSTANCE_TREE (instance), visited);
3498     }
3499 
3500   return opt_result::success ();
3501 }
3502 
3503 struct slpg_vertex
3504 {
3505   slpg_vertex (slp_tree node_)
3506     : node (node_), perm_in (-1), perm_out (-1) {}
3507 
3508   int get_perm_materialized () const
3509     { return perm_in != perm_out ? perm_in : 0; }
3510 
3511   slp_tree node;
3512   /* The common permutation on the incoming lanes (towards SLP children).  */
3513   int perm_in;
3514   /* The permutation on the outgoing lanes (towards SLP parents).  When
3515      the node is a materialization point for a permute this differs
3516      from perm_in (and is then usually zero).  Materialization happens
3517      on the input side.  */
3518   int perm_out;
3519 };
3520 
3521 /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
3522 
3523 static void
3524 vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node,
3525 			 vec<slpg_vertex> &vertices, vec<int> &leafs)
3526 {
3527   unsigned i;
3528   slp_tree child;
3529 
3530   if (visited.add (node))
3531     return;
3532 
3533   node->vertex = vertices.length ();
3534   vertices.safe_push (slpg_vertex (node));
3535 
3536   bool leaf = true;
3537   bool force_leaf = false;
3538   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3539     if (child)
3540       {
3541 	leaf = false;
3542 	vect_slp_build_vertices (visited, child, vertices, leafs);
3543       }
3544     else
3545       force_leaf = true;
3546   /* Since SLP discovery works along use-def edges all cycles have an
3547      entry - but there's the exception of cycles where we do not handle
3548      the entry explicitly (but with a NULL SLP node), like some reductions
3549      and inductions.  Force those SLP PHIs to act as leafs to make them
3550      backwards reachable.  */
3551   if (leaf || force_leaf)
3552     leafs.safe_push (node->vertex);
3553 }
3554 
3555 /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
3556 
3557 static void
3558 vect_slp_build_vertices (vec_info *info, vec<slpg_vertex> &vertices,
3559 			 vec<int> &leafs)
3560 {
3561   hash_set<slp_tree> visited;
3562   unsigned i;
3563   slp_instance instance;
3564   FOR_EACH_VEC_ELT (info->slp_instances, i, instance)
3565     vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices,
3566 			     leafs);
3567 }
3568 
3569 /* Apply (reverse) bijective PERM to VEC.  */
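/* For example (illustrative only): with PERM = { 2, 0, 1 } and
   VEC = { a, b, c } the forward direction produces { c, a, b }, while
   REVERSE produces { b, c, a }, i.e. it applies the inverse permutation.  */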
3570 
3571 template <class T>
3572 static void
3573 vect_slp_permute (vec<unsigned> perm,
3574 		  vec<T> &vec, bool reverse)
3575 {
3576   auto_vec<T, 64> saved;
3577   saved.create (vec.length ());
3578   for (unsigned i = 0; i < vec.length (); ++i)
3579     saved.quick_push (vec[i]);
3580 
3581   if (reverse)
3582     {
3583       for (unsigned i = 0; i < vec.length (); ++i)
3584 	vec[perm[i]] = saved[i];
3585       for (unsigned i = 0; i < vec.length (); ++i)
3586 	gcc_assert (vec[perm[i]] == saved[i]);
3587     }
3588   else
3589     {
3590       for (unsigned i = 0; i < vec.length (); ++i)
3591 	vec[i] = saved[perm[i]];
3592       for (unsigned i = 0; i < vec.length (); ++i)
3593 	gcc_assert (vec[i] == saved[perm[i]]);
3594     }
3595 }
3596 
3597 /* Return whether permutations PERM_A and PERM_B as recorded in the
3598    PERMS vector are equal.  */
3599 
3600 static bool
3601 vect_slp_perms_eq (const vec<vec<unsigned> > &perms,
3602 		   int perm_a, int perm_b)
3603 {
3604   return (perm_a == perm_b
3605 	  || (perm_a != -1 && perm_b != -1
3606 	      && perms[perm_a].length () == perms[perm_b].length ()
3607 	      && memcmp (&perms[perm_a][0], &perms[perm_b][0],
3608 			 sizeof (unsigned) * perms[perm_a].length ()) == 0));
3609 }
3610 
3611 /* Optimize the SLP graph of VINFO.  */
3612 
3613 void
3614 vect_optimize_slp (vec_info *vinfo)
3615 {
3616   if (vinfo->slp_instances.is_empty ())
3617     return;
3618 
3619   slp_tree node;
3620   unsigned i;
3621   auto_vec<slpg_vertex> vertices;
3622   auto_vec<int> leafs;
3623   vect_slp_build_vertices (vinfo, vertices, leafs);
3624 
3625   struct graph *slpg = new_graph (vertices.length ());
3626   for (slpg_vertex &v : vertices)
3627     for (slp_tree child : SLP_TREE_CHILDREN (v.node))
3628       if (child)
3629 	add_edge (slpg, v.node->vertex, child->vertex);
3630 
3631   /* Compute (reverse) postorder on the inverted graph.  */
3632   auto_vec<int> ipo;
3633   graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
3634 
3635   auto_vec<vec<unsigned> > perms;
3636   perms.safe_push (vNULL); /* zero is no permute */
3637 
3638   /* Produce initial permutations.  */
3639   for (i = 0; i < leafs.length (); ++i)
3640     {
3641       int idx = leafs[i];
3642       slp_tree node = vertices[idx].node;
3643 
3644       /* Handle externals and constants optimistically throughout the
3645 	 iteration.  But treat existing vectors as fixed since we
3646 	 do not handle permuting them below.  */
3647       if ((SLP_TREE_DEF_TYPE (node) == vect_external_def
3648 	   && !SLP_TREE_VEC_DEFS (node).exists ())
3649 	  || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3650 	continue;
3651 
3652       /* Leafs do not change across iterations.  Note leafs also double
3653 	 as entries to the reverse graph.  */
3654       if (!slpg->vertices[idx].succ)
3655 	{
3656 	  vertices[idx].perm_in = 0;
3657 	  vertices[idx].perm_out = 0;
3658 	}
3659 
3660       /* Loads are the only thing generating permutes.  */
3661       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
3662 	continue;
3663 
3664       /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the
3665 	 node unpermuted, record this permute.  */
3666       stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
3667       if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
3668 	continue;
3669       dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
3670       unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
3671       bool any_permute = false;
3672       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3673 	{
3674 	  unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
3675 	  imin = MIN (imin, idx);
3676 	  imax = MAX (imax, idx);
3677 	  if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
3678 	    any_permute = true;
3679 	}
3680       /* If there's no permute there's no need to split one out.  */
3681       if (!any_permute)
3682 	continue;
3683       /* If the span doesn't match we'd disrupt VF computation; avoid
3684 	 that for now.  */
3685       if (imax - imin + 1 != SLP_TREE_LANES (node))
3686 	continue;
3687 
3688       /* For now only handle true permutes, like
3689 	 vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
3690 	 when permuting constants and invariants keeping the permute
3691 	 bijective.  */
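      /* E.g. a load permutation { 5, 4, 6, 7 } (imin 4) is recorded as the
	 bijective permute { 1, 0, 2, 3 }, while { 4, 4, 6, 7 } covers the
	 same span but is not bijective and is rejected below
	 (illustrative values only).  */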
3692       auto_sbitmap load_index (SLP_TREE_LANES (node));
3693       bitmap_clear (load_index);
3694       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3695 	bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
3696       unsigned j;
3697       for (j = 0; j < SLP_TREE_LANES (node); ++j)
3698 	if (!bitmap_bit_p (load_index, j))
3699 	  break;
3700       if (j != SLP_TREE_LANES (node))
3701 	continue;
3702 
3703       vec<unsigned> perm = vNULL;
3704       perm.safe_grow (SLP_TREE_LANES (node), true);
3705       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
3706 	perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
3707       perms.safe_push (perm);
3708       vertices[idx].perm_in = perms.length () - 1;
3709       vertices[idx].perm_out = perms.length () - 1;
3710     }
3711 
3712   /* In addition to the above we have to mark, as to be materialized,
3713      outgoing permutes facing non-reduction graph entries that are not
3714      otherwise represented.  */
3715   for (slp_instance instance : vinfo->slp_instances)
3716     if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
3717       {
3718 	/* Just setting perm_out isn't enough for the propagation to
3719 	   pick this up.  */
3720 	vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_in = 0;
3721 	vertices[SLP_INSTANCE_TREE (instance)->vertex].perm_out = 0;
3722       }
3723 
3724   /* Propagate permutes along the graph and compute materialization points.  */
3725   bool changed;
3726   bool do_materialization = false;
3727   unsigned iteration = 0;
3728   do
3729     {
3730       changed = false;
3731       ++iteration;
3732 
3733       if (dump_enabled_p ())
3734 	dump_printf_loc (MSG_NOTE, vect_location,
3735 			 "SLP optimize iteration %d\n", iteration);
3736 
3737       for (i = vertices.length (); i > 0 ; --i)
3738 	{
3739 	  int idx = ipo[i-1];
3740 	  slp_tree node = vertices[idx].node;
3741 
3742 	  /* Handle externals and constants optimistically throughout the
3743 	     iteration.  */
3744 	  if (SLP_TREE_DEF_TYPE (node) == vect_external_def
3745 	      || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
3746 	    continue;
3747 
3748 	  /* We still eventually have failed backedge SLP nodes in the
3749 	     graph, those are only cancelled when analyzing operations.
3750 	     Simply treat them as transparent ops, propagating permutes
3751 	     through them.  */
3752 	  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3753 	    {
3754 	      /* We do not handle stores with a permutation, so all
3755 		 incoming permutes must have been materialized.  */
3756 	      stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
3757 	      if (STMT_VINFO_DATA_REF (rep)
3758 		  && DR_IS_WRITE (STMT_VINFO_DATA_REF (rep)))
3759 		{
3760 		  /* ???  We're forcing materialization in place
3761 		     of the child here, we'd need special handling
3762 		     in materialization to leave perm_in -1 here.  */
3763 		  vertices[idx].perm_in = 0;
3764 		  vertices[idx].perm_out = 0;
3765 		}
3766 	      /* We cannot move a permute across an operation that is
3767 		 not independent of the lanes.  Note this is an explicit
3768 		 negative list since that's much shorter than the respective
3769 		 positive one but it's critical to keep maintaining it.  */
3770 	      if (is_gimple_call (STMT_VINFO_STMT (rep)))
3771 		switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
3772 		  {
3773 		  case CFN_COMPLEX_ADD_ROT90:
3774 		  case CFN_COMPLEX_ADD_ROT270:
3775 		  case CFN_COMPLEX_MUL:
3776 		  case CFN_COMPLEX_MUL_CONJ:
3777 		  case CFN_VEC_ADDSUB:
3778 		  case CFN_VEC_FMADDSUB:
3779 		  case CFN_VEC_FMSUBADD:
3780 		    vertices[idx].perm_in = 0;
3781 		    vertices[idx].perm_out = 0;
3782 		  default:;
3783 		  }
3784 	    }
3785 
3786 	  if (!slpg->vertices[idx].succ)
3787 	    /* Pick up pre-computed leaf values.  */
3788 	    ;
3789 	  else
3790 	    {
3791 	      bool any_succ_perm_out_m1 = false;
3792 	      int perm_in = vertices[idx].perm_in;
3793 	      for (graph_edge *succ = slpg->vertices[idx].succ;
3794 		   succ; succ = succ->succ_next)
3795 		{
3796 		  int succ_idx = succ->dest;
3797 		  int succ_perm = vertices[succ_idx].perm_out;
3798 		  /* Handle unvisited (and constant) nodes optimistically.  */
3799 		  /* ???  But for constants once we want to handle
3800 		     non-bijective permutes we have to verify the permute,
3801 		     when unifying lanes, will not unify different constants.
3802 		     For example see gcc.dg/vect/bb-slp-14.c for a case
3803 		     that would break.  */
3804 		  if (succ_perm == -1)
3805 		    {
3806 		      /* When we handled a non-leaf optimistically, note
3807 			 that so we can adjust its outgoing permute below.  */
3808 		      slp_tree succ_node = vertices[succ_idx].node;
3809 		      if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3810 			  && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3811 			any_succ_perm_out_m1 = true;
3812 		      continue;
3813 		    }
3814 		  if (perm_in == -1)
3815 		    perm_in = succ_perm;
3816 		  else if (succ_perm == 0
3817 			   || !vect_slp_perms_eq (perms, perm_in, succ_perm))
3818 		    {
3819 		      perm_in = 0;
3820 		      break;
3821 		    }
3822 		}
3823 
3824 	      /* Adjust any incoming permutes we treated optimistically.  */
3825 	      if (perm_in != -1 && any_succ_perm_out_m1)
3826 		{
3827 		  for (graph_edge *succ = slpg->vertices[idx].succ;
3828 		       succ; succ = succ->succ_next)
3829 		    {
3830 		      slp_tree succ_node = vertices[succ->dest].node;
3831 		      if (vertices[succ->dest].perm_out == -1
3832 			  && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
3833 			  && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
3834 			{
3835 			  vertices[succ->dest].perm_out = perm_in;
3836 			  /* And ensure this propagates.  */
3837 			  if (vertices[succ->dest].perm_in == -1)
3838 			    vertices[succ->dest].perm_in = perm_in;
3839 			}
3840 		    }
3841 		  changed = true;
3842 		}
3843 
3844 	      if (!vect_slp_perms_eq (perms, perm_in,
3845 				      vertices[idx].perm_in))
3846 		{
3847 		  /* Make sure we eventually converge.  */
3848 		  gcc_checking_assert (vertices[idx].perm_in == -1
3849 				       || perm_in == 0);
3850 		  vertices[idx].perm_in = perm_in;
3851 
3852 		  /* While we can handle VEC_PERM nodes as transparent
3853 		     pass-through they can be a cheap materialization
3854 		     point as well.  In addition they can act as source
3855 		     of a random permutation as well.
3856 		     The following ensures that former materialization
3857 		     points that now have zero incoming permutes no
3858 		     longer appear as such and that former "any" permutes
3859 		     get pass-through.  We keep VEC_PERM nodes optimistic
3860 		     as "any" outgoing permute though.  */
3861 		  if (vertices[idx].perm_out != 0
3862 		      && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3863 		    vertices[idx].perm_out = perm_in;
3864 		  changed = true;
3865 		}
3866 	    }
3867 
3868 	  /* Elide pruning at materialization points in the first
3869 	     iteration phase.  */
3870 	  if (!do_materialization)
3871 	    continue;
3872 
3873 	  int perm = vertices[idx].perm_out;
3874 	  if (perm == 0 || perm == -1)
3875 	    continue;
3876 
3877 	  /* Decide on permute materialization.  Look whether there's
3878 	     a use (pred) edge that is permuted differently from us.
3879 	     In that case mark ourselves so the permutation is applied.  */
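	  /* Keep the permute unmaterialized only if every user expects
	     exactly this permutation on its inputs; otherwise reset
	     perm_out to zero so that perm_in != perm_out and the permute
	     is materialized at this node.  */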
3880 	  bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
3881 	  if (all_preds_permuted)
3882 	    for (graph_edge *pred = slpg->vertices[idx].pred;
3883 		 pred; pred = pred->pred_next)
3884 	      {
3885 		int pred_perm = vertices[pred->src].perm_in;
3886 		gcc_checking_assert (pred_perm != -1);
3887 		if (!vect_slp_perms_eq (perms, perm, pred_perm))
3888 		  {
3889 		    all_preds_permuted = false;
3890 		    break;
3891 		  }
3892 	      }
3893 	  if (!all_preds_permuted)
3894 	    {
3895 	      vertices[idx].perm_out = 0;
3896 	      changed = true;
3897 	    }
3898 	}
3899 
3900       /* If the initial propagation converged, switch on materialization
3901 	 and re-propagate.  */
3902       if (!changed && !do_materialization)
3903 	{
3904 	  do_materialization = true;
3905 	  changed = true;
3906 	}
3907     }
3908   while (changed);
3909   statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration);
3910 
3911   /* Materialize.  */
3912   for (i = 0; i < vertices.length (); ++i)
3913     {
3914       int perm_in = vertices[i].perm_in;
3915       slp_tree node = vertices[i].node;
3916 
3917       /* First permute invariant/external original successors; we handle
3918 	 those optimistically during propagation and duplicate them if
3919 	 they are used with different permutations.  */
3920       unsigned j;
3921       slp_tree child;
3922       if (perm_in > 0)
3923 	FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
3924 	  {
3925 	    if (!child
3926 		|| (SLP_TREE_DEF_TYPE (child) != vect_constant_def
3927 		    && SLP_TREE_DEF_TYPE (child) != vect_external_def))
3928 	      continue;
3929 
3930 	    /* If the vector is uniform there's nothing to do.  */
3931 	    if (vect_slp_tree_uniform_p (child))
3932 	      continue;
3933 
3934 	    /* We can end up sharing some externals via two_operator
3935 	       handling.  Be prepared to unshare those.  */
3936 	    if (child->refcnt != 1)
3937 	      {
3938 		gcc_assert (slpg->vertices[child->vertex].pred->pred_next);
3939 		SLP_TREE_CHILDREN (node)[j] = child
3940 		  = vect_create_new_slp_node
3941 		      (SLP_TREE_SCALAR_OPS (child).copy ());
3942 	      }
3943 	    vect_slp_permute (perms[perm_in],
3944 			      SLP_TREE_SCALAR_OPS (child), true);
3945 	  }
3946 
3947       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3948 	{
3949 	  /* Apply the common permutes to the input vectors.  */
3950 	  if (perm_in > 0)
3951 	    {
3952 	      /* If the node is already a permute node we can apply
3953 		 the permutation to the lane selection, effectively
3954 		 materializing it on the incoming vectors.  */
3955 	      if (dump_enabled_p ())
3956 		dump_printf_loc (MSG_NOTE, vect_location,
3957 				 "simplifying permute node %p\n",
3958 				 node);
3959 	      for (unsigned k = 0;
3960 		   k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
3961 		SLP_TREE_LANE_PERMUTATION (node)[k].second
3962 		  = perms[perm_in][SLP_TREE_LANE_PERMUTATION (node)[k].second];
3963 	    }
3964 	  /* Apply the anticipated output permute to the permute and
3965 	     stmt vectors.  */
3966 	  int perm_out = vertices[i].perm_out;
3967 	  if (perm_out > 0)
3968 	    {
3969 	      vect_slp_permute (perms[perm_out],
3970 				SLP_TREE_SCALAR_STMTS (node), true);
3971 	      vect_slp_permute (perms[perm_out],
3972 				SLP_TREE_LANE_PERMUTATION (node), true);
3973 	    }
3974 	}
3975       else if (vertices[i].get_perm_materialized () != 0)
3976 	{
3977 	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3978 	    /* For loads simply drop the permutation, the load permutation
3979 	       already performs the desired permutation.  */
3980 	    ;
3981 	  else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3982 	    gcc_unreachable ();
3983 	  else
3984 	    {
3985 	      if (dump_enabled_p ())
3986 		dump_printf_loc (MSG_NOTE, vect_location,
3987 				 "inserting permute node in place of %p\n",
3988 				 node);
3989 
3990 	      /* Make a copy of NODE and in-place change it to a
3991 		 VEC_PERM node to permute the lanes of the copy.  */
3992 	      slp_tree copy = new _slp_tree;
3993 	      SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node);
3994 	      SLP_TREE_CHILDREN (node) = vNULL;
3995 	      SLP_TREE_SCALAR_STMTS (copy)
3996 		= SLP_TREE_SCALAR_STMTS (node).copy ();
3997 	      vect_slp_permute (perms[perm_in],
3998 				SLP_TREE_SCALAR_STMTS (copy), true);
3999 	      gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ());
4000 	      SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node);
4001 	      gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ());
4002 	      SLP_TREE_LANE_PERMUTATION (copy)
4003 		= SLP_TREE_LANE_PERMUTATION (node);
4004 	      SLP_TREE_LANE_PERMUTATION (node) = vNULL;
4005 	      SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node);
4006 	      copy->refcnt = 1;
4007 	      copy->max_nunits = node->max_nunits;
4008 	      SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node);
4009 	      SLP_TREE_LANES (copy) = SLP_TREE_LANES (node);
4010 	      SLP_TREE_CODE (copy) = SLP_TREE_CODE (node);
4011 
4012 	      /* Now turn NODE into a VEC_PERM.  */
4013 	      SLP_TREE_CHILDREN (node).safe_push (copy);
4014 	      SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node));
4015 	      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4016 		SLP_TREE_LANE_PERMUTATION (node)
4017 		  .quick_push (std::make_pair (0, perms[perm_in][j]));
4018 	      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
4019 	    }
4020 	}
4021       else if (perm_in > 0) /* perm_in == perm_out */
4022 	{
4023 	  /* Apply the reverse permutation to our stmts.  */
4024 	  vect_slp_permute (perms[perm_in],
4025 			    SLP_TREE_SCALAR_STMTS (node), true);
4026 	  /* And to the lane/load permutation, which we can simply
4027 	     make regular by design.  */
4028 	  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4029 	    {
4030 	      gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
4031 	      /* ???  When we handle non-bijective permutes the idea
4032 		 is that we can force the load-permutation to be
4033 		 { min, min + 1, min + 2, ... max }.  But then the
4034 		 scalar defs might no longer match the lane content
4035 		 which means wrong-code with live lane vectorization.
4036 		 So we possibly have to have NULL entries for those.  */
4037 	      vect_slp_permute (perms[perm_in],
4038 				SLP_TREE_LOAD_PERMUTATION (node), true);
4039 	    }
4040 	  else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
4041 	    gcc_unreachable ();
4042 	}
4043     }
4044 
4045   /* Elide any permutations at BB reduction roots.  */
4046   if (is_a <bb_vec_info> (vinfo))
4047     {
4048       for (slp_instance instance : vinfo->slp_instances)
4049 	{
4050 	  if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
4051 	    continue;
4052 	  slp_tree old = SLP_INSTANCE_TREE (instance);
4053 	  if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
4054 	      && SLP_TREE_CHILDREN (old).length () == 1)
4055 	    {
4056 	      slp_tree child = SLP_TREE_CHILDREN (old)[0];
4057 	      if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
4058 		{
4059 		  /* Preserve the special VEC_PERM we use to shield existing
4060 		     vector defs from the rest.  But make it a no-op.  */
4061 		  unsigned i = 0;
4062 		  for (std::pair<unsigned, unsigned> &p
4063 		       : SLP_TREE_LANE_PERMUTATION (old))
4064 		    p.second = i++;
4065 		}
4066 	      else
4067 		{
4068 		  SLP_INSTANCE_TREE (instance) = child;
4069 		  SLP_TREE_REF_COUNT (child)++;
4070 		  vect_free_slp_tree (old);
4071 		}
4072 	    }
4073 	  else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
4074 		   && SLP_TREE_REF_COUNT (old) == 1
4075 		   && vertices[old->vertex].get_perm_materialized () != 0)
4076 	    {
4077 	      /* ???  For loads the situation is more complex since
4078 		 we can't modify the permute in place in case the
4079 		 node is used multiple times.  In fact for loads this
4080 		 should be somehow handled in the propagation engine.  */
4081 	      /* Apply the reverse permutation to our stmts.  */
4082 	      int perm = vertices[old->vertex].get_perm_materialized ();
4083 	      vect_slp_permute (perms[perm],
4084 				SLP_TREE_SCALAR_STMTS (old), true);
4085 	      vect_slp_permute (perms[perm],
4086 				SLP_TREE_LOAD_PERMUTATION (old), true);
4087 	    }
4088 	}
4089     }
4090 
4091   /* Free the perms vector used for propagation.  */
4092   while (!perms.is_empty ())
4093     perms.pop ().release ();
4094   free_graph (slpg);
4095 
4096 
4097   /* Now elide load permutations that are not necessary.  */
4098   for (i = 0; i < leafs.length (); ++i)
4099     {
4100       node = vertices[leafs[i]].node;
4101       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
4102 	continue;
4103 
4104       /* In basic block vectorization we allow any subchain of an interleaving
4105 	 chain.
4106 	 FORNOW: not in loop SLP because of realignment complications.  */
4107       if (is_a <bb_vec_info> (vinfo))
4108 	{
4109 	  bool subchain_p = true;
4110 	  stmt_vec_info next_load_info = NULL;
4111 	  stmt_vec_info load_info;
4112 	  unsigned j;
4113 	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4114 	    {
4115 	      if (j != 0
4116 		  && (next_load_info != load_info
4117 		      || DR_GROUP_GAP (load_info) != 1))
4118 		{
4119 		  subchain_p = false;
4120 		  break;
4121 		}
4122 	      next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
4123 	    }
4124 	  if (subchain_p)
4125 	    {
4126 	      SLP_TREE_LOAD_PERMUTATION (node).release ();
4127 	      continue;
4128 	    }
4129 	}
4130       else
4131 	{
4132 	  stmt_vec_info load_info;
4133 	  bool this_load_permuted = false;
4134 	  unsigned j;
4135 	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
4136 	    if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
4137 	      {
4138 		this_load_permuted = true;
4139 		break;
4140 	      }
4141 	  stmt_vec_info first_stmt_info
4142 	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
4143 	  if (!this_load_permuted
4144 	      /* The load requires permutation when unrolling exposes
4145 		 a gap either because the group is larger than the SLP
4146 		 group-size or because there is a gap between the groups.  */
4147 	      && (known_eq (LOOP_VINFO_VECT_FACTOR
4148 			      (as_a <loop_vec_info> (vinfo)), 1U)
4149 		  || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
4150 		      && DR_GROUP_GAP (first_stmt_info) == 0)))
4151 	    {
4152 	      SLP_TREE_LOAD_PERMUTATION (node).release ();
4153 	      continue;
4154 	    }
4155 	}
4156     }
4157 }
4158 
4159 /* Gather loads reachable from the individual SLP graph entries.  */
4160 
4161 void
4162 vect_gather_slp_loads (vec_info *vinfo)
4163 {
4164   unsigned i;
4165   slp_instance instance;
4166   FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
4167     {
4168       hash_set<slp_tree> visited;
4169       vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
4170 			     SLP_INSTANCE_TREE (instance), visited);
4171     }
4172 }
4173 
4174 
4175 /* For each possible SLP instance decide whether to SLP it and calculate the
4176    overall unrolling factor needed to SLP the loop.  Return TRUE if decided
4177    to SLP at least one instance.  */
4178 
4179 bool
4180 vect_make_slp_decision (loop_vec_info loop_vinfo)
4181 {
4182   unsigned int i;
4183   poly_uint64 unrolling_factor = 1;
4184   const vec<slp_instance> &slp_instances
4185     = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
4186   slp_instance instance;
4187   int decided_to_slp = 0;
4188 
4189   DUMP_VECT_SCOPE ("vect_make_slp_decision");
4190 
4191   FOR_EACH_VEC_ELT (slp_instances, i, instance)
4192     {
4193       /* FORNOW: SLP if you can.  */
4194       /* All unroll factors have the form:
4195 
4196 	   GET_MODE_SIZE (vinfo->vector_mode) * X
4197 
4198 	 for some rational X, so they must have a common multiple.  */
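      /* E.g. instance unrolling factors of 2 and 3 force a common loop
	 unrolling factor of 6 (illustrative values only).  */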
4199       unrolling_factor
4200 	= force_common_multiple (unrolling_factor,
4201 				 SLP_INSTANCE_UNROLLING_FACTOR (instance));
4202 
4203       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
4204 	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
4205 	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
4206       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
4207       decided_to_slp++;
4208     }
4209 
4210   LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
4211 
4212   if (decided_to_slp && dump_enabled_p ())
4213     {
4214       dump_printf_loc (MSG_NOTE, vect_location,
4215 		       "Decided to SLP %d instances. Unrolling factor ",
4216 		       decided_to_slp);
4217       dump_dec (MSG_NOTE, unrolling_factor);
4218       dump_printf (MSG_NOTE, "\n");
4219     }
4220 
4221   return (decided_to_slp > 0);
4222 }
4223 
4224 /* Private data for vect_detect_hybrid_slp.  */
4225 struct vdhs_data
4226 {
4227   loop_vec_info loop_vinfo;
4228   vec<stmt_vec_info> *worklist;
4229 };
4230 
4231 /* Walker for walk_gimple_op.  */
4232 
4233 static tree
4234 vect_detect_hybrid_slp (tree *tp, int *, void *data)
4235 {
4236   walk_stmt_info *wi = (walk_stmt_info *)data;
4237   vdhs_data *dat = (vdhs_data *)wi->info;
4238 
4239   if (wi->is_lhs)
4240     return NULL_TREE;
4241 
4242   stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
4243   if (!def_stmt_info)
4244     return NULL_TREE;
4245   def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
4246   if (PURE_SLP_STMT (def_stmt_info))
4247     {
4248       if (dump_enabled_p ())
4249 	dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
4250 			 def_stmt_info->stmt);
4251       STMT_SLP_TYPE (def_stmt_info) = hybrid;
4252       dat->worklist->safe_push (def_stmt_info);
4253     }
4254 
4255   return NULL_TREE;
4256 }
4257 
4258 /* Check whether STMT_INFO is (indirectly) consumed by SLP and mark it
4259    pure_slp if so, otherwise push it to WORKLIST.  */
4260 
4261 static void
4262 maybe_push_to_hybrid_worklist (vec_info *vinfo,
4263 			       vec<stmt_vec_info> &worklist,
4264 			       stmt_vec_info stmt_info)
4265 {
4266   if (dump_enabled_p ())
4267     dump_printf_loc (MSG_NOTE, vect_location,
4268 		     "Processing hybrid candidate : %G", stmt_info->stmt);
4269   stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
4270   imm_use_iterator iter2;
4271   ssa_op_iter iter1;
4272   use_operand_p use_p;
4273   def_operand_p def_p;
4274   bool any_def = false;
4275   FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
4276     {
4277       any_def = true;
4278       FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
4279 	{
4280 	  if (is_gimple_debug (USE_STMT (use_p)))
4281 	    continue;
4282 	  stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
4283 	  /* An out-of-loop use means this is a loop_vect sink.  */
4284 	  if (!use_info)
4285 	    {
4286 	      if (dump_enabled_p ())
4287 		dump_printf_loc (MSG_NOTE, vect_location,
4288 				 "Found loop_vect sink: %G", stmt_info->stmt);
4289 	      worklist.safe_push (stmt_info);
4290 	      return;
4291 	    }
4292 	  else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
4293 	    {
4294 	      if (dump_enabled_p ())
4295 		dump_printf_loc (MSG_NOTE, vect_location,
4296 				 "Found loop_vect use: %G", use_info->stmt);
4297 	      worklist.safe_push (stmt_info);
4298 	      return;
4299 	    }
4300 	}
4301     }
4302   /* No def means this is a loop_vect sink.  */
4303   if (!any_def)
4304     {
4305       if (dump_enabled_p ())
4306 	dump_printf_loc (MSG_NOTE, vect_location,
4307 			 "Found loop_vect sink: %G", stmt_info->stmt);
4308       worklist.safe_push (stmt_info);
4309       return;
4310     }
4311   if (dump_enabled_p ())
4312     dump_printf_loc (MSG_NOTE, vect_location,
4313 		     "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
4314   STMT_SLP_TYPE (stmt_info) = pure_slp;
4315 }
4316 
4317 /* Find stmts that must be both vectorized and SLPed.  */
4318 
4319 void
4320 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
4321 {
4322   DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
4323 
4324   /* All stmts participating in SLP are marked pure_slp, all other
4325      stmts are loop_vect.
4326      First collect all loop_vect stmts into a worklist.
4327      SLP patterns cause not all original scalar stmts to appear in
4328      SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
4329      Rectify this here and do a backward walk over the IL only considering
4330      stmts as loop_vect when they are used by a loop_vect stmt and otherwise
4331      mark them as pure_slp.  */
4332   auto_vec<stmt_vec_info> worklist;
4333   for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
4334     {
4335       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
4336       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
4337 	   gsi_next (&gsi))
4338 	{
4339 	  gphi *phi = gsi.phi ();
4340 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
4341 	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4342 	    maybe_push_to_hybrid_worklist (loop_vinfo,
4343 					   worklist, stmt_info);
4344 	}
4345       for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
4346 	   gsi_prev (&gsi))
4347 	{
4348 	  gimple *stmt = gsi_stmt (gsi);
4349 	  if (is_gimple_debug (stmt))
4350 	    continue;
4351 	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
4352 	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
4353 	    {
4354 	      for (gimple_stmt_iterator gsi2
4355 		     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
4356 		   !gsi_end_p (gsi2); gsi_next (&gsi2))
4357 		{
4358 		  stmt_vec_info patt_info
4359 		    = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
4360 		  if (!STMT_SLP_TYPE (patt_info)
4361 		      && STMT_VINFO_RELEVANT (patt_info))
4362 		    maybe_push_to_hybrid_worklist (loop_vinfo,
4363 						   worklist, patt_info);
4364 		}
4365 	      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4366 	    }
4367 	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
4368 	    maybe_push_to_hybrid_worklist (loop_vinfo,
4369 					   worklist, stmt_info);
4370 	}
4371     }
4372 
4373   /* Now we have a worklist of non-SLP stmts, follow use->def chains and
4374      mark any SLP vectorized stmt as hybrid.
4375      ???  We're visiting def stmts N times (once for each non-SLP and
4376      once for each hybrid-SLP use).  */
4377   walk_stmt_info wi;
4378   vdhs_data dat;
4379   dat.worklist = &worklist;
4380   dat.loop_vinfo = loop_vinfo;
4381   memset (&wi, 0, sizeof (wi));
4382   wi.info = (void *)&dat;
4383   while (!worklist.is_empty ())
4384     {
4385       stmt_vec_info stmt_info = worklist.pop ();
4386       /* Since SSA operands are not set up for pattern stmts we need
4387 	 to use walk_gimple_op.  */
4388       wi.is_lhs = 0;
4389       walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
4390       /* For gather/scatter make sure to walk the offset operand, which
4391 	 can be a scaling and conversion away.  */
4392       gather_scatter_info gs_info;
4393       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
4394 	  && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
4395 	{
4396 	  int dummy;
4397 	  vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
4398 	}
4399     }
4400 }
4401 
4402 
4403 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks.  */
4404 
4405 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
4406   : vec_info (vec_info::bb, shared),
4407     bbs (_bbs),
4408     roots (vNULL)
4409 {
4410   for (unsigned i = 0; i < bbs.length (); ++i)
4411     {
4412       if (i != 0)
4413 	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4414 	     gsi_next (&si))
4415 	  {
4416 	    gphi *phi = si.phi ();
4417 	    gimple_set_uid (phi, 0);
4418 	    add_stmt (phi);
4419 	  }
4420       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4421 	   !gsi_end_p (gsi); gsi_next (&gsi))
4422 	{
4423 	  gimple *stmt = gsi_stmt (gsi);
4424 	  gimple_set_uid (stmt, 0);
4425 	  if (is_gimple_debug (stmt))
4426 	    continue;
4427 	  add_stmt (stmt);
4428 	}
4429     }
4430 }
4431 
4432 
4433 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
4434    stmts in the basic block.  */
4435 
4436 _bb_vec_info::~_bb_vec_info ()
4437 {
4438   /* Reset region marker.  */
4439   for (unsigned i = 0; i < bbs.length (); ++i)
4440     {
4441       if (i != 0)
4442 	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
4443 	     gsi_next (&si))
4444 	  {
4445 	    gphi *phi = si.phi ();
4446 	    gimple_set_uid (phi, -1);
4447 	  }
4448       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
4449 	   !gsi_end_p (gsi); gsi_next (&gsi))
4450 	{
4451 	  gimple *stmt = gsi_stmt (gsi);
4452 	  gimple_set_uid (stmt, -1);
4453 	}
4454     }
4455 
4456   for (unsigned i = 0; i < roots.length (); ++i)
4457     {
4458       roots[i].stmts.release ();
4459       roots[i].roots.release ();
4460     }
4461   roots.release ();
4462 }
4463 
4464 /* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
4465    given that its child nodes have already been processed, and that
4466    their def types currently match their SLP node's def type.  */
4467 
4468 static bool
4469 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
4470 				    slp_instance node_instance,
4471 				    stmt_vector_for_cost *cost_vec)
4472 {
4473   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
4474 
4475   /* Calculate the number of vector statements to be created for the
4476      scalar stmts in this node.  For SLP reductions it is equal to the
4477      number of vector statements in the children (which has already been
4478      calculated by the recursive call).  Otherwise it is the number of
4479      scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
4480      VF divided by the number of elements in a vector.  */
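  /* For example (illustrative numbers only), with a vectorization factor
     of 4, two lanes and a four-element vector type this computes
     4 * 2 / 4 = 2 vector statements.  */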
4481   if (!STMT_VINFO_DATA_REF (stmt_info)
4482       && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
4483     {
4484       for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
4485 	if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
4486 	  {
4487 	    SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4488 	      = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
4489 	    break;
4490 	  }
4491     }
4492   else
4493     {
4494       poly_uint64 vf;
4495       if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4496 	vf = loop_vinfo->vectorization_factor;
4497       else
4498 	vf = 1;
4499       unsigned int group_size = SLP_TREE_LANES (node);
4500       tree vectype = SLP_TREE_VECTYPE (node);
4501       SLP_TREE_NUMBER_OF_VEC_STMTS (node)
4502 	= vect_get_num_vectors (vf * group_size, vectype);
4503     }
4504 
4505   /* Handle purely internal nodes.  */
4506   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
4507     {
4508       if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
4509 	return false;
4510 
4511       stmt_vec_info slp_stmt_info;
4512       unsigned int i;
4513       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
4514 	{
4515 	  if (STMT_VINFO_LIVE_P (slp_stmt_info)
4516 	      && !vectorizable_live_operation (vinfo,
4517 					       slp_stmt_info, NULL, node,
4518 					       node_instance, i,
4519 					       false, cost_vec))
4520 	    return false;
4521 	}
4522       return true;
4523     }
4524 
4525   bool dummy;
4526   return vect_analyze_stmt (vinfo, stmt_info, &dummy,
4527 			    node, node_instance, cost_vec);
4528 }
4529 
4530 /* Try to build NODE from scalars, returning true on success.
4531    NODE_INSTANCE is the SLP instance that contains NODE.  */
4532 
4533 static bool
4534 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
4535 			      slp_instance node_instance)
4536 {
4537   stmt_vec_info stmt_info;
4538   unsigned int i;
4539 
4540   if (!is_a <bb_vec_info> (vinfo)
4541       || node == SLP_INSTANCE_TREE (node_instance)
4542       || !SLP_TREE_SCALAR_STMTS (node).exists ()
4543       || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node)))
4544     return false;
4545 
4546   if (dump_enabled_p ())
4547     dump_printf_loc (MSG_NOTE, vect_location,
4548 		     "Building vector operands of %p from scalars instead\n", node);
4549 
4550   /* Don't remove and free the child nodes here, since they could be
4551      referenced by other structures.  The analysis and scheduling phases
4552      (need to) ignore child nodes of anything that isn't vect_internal_def.  */
4553   unsigned int group_size = SLP_TREE_LANES (node);
4554   SLP_TREE_DEF_TYPE (node) = vect_external_def;
4555   SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
4556   SLP_TREE_LOAD_PERMUTATION (node).release ();
4557   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4558     {
4559       tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
4560       SLP_TREE_SCALAR_OPS (node)[i] = lhs;
4561     }
4562   return true;
4563 }
4564 
4565 /* Return true if all elements of the slice are the same.  */
4566 bool
4567 vect_scalar_ops_slice::all_same_p () const
4568 {
4569   for (unsigned int i = 1; i < length; ++i)
4570     if (!operand_equal_p (op (0), op (i)))
4571       return false;
4572   return true;
4573 }
4574 
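/* Compute a hash over all scalar operands of the slice S.  */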
4575 hashval_t
4576 vect_scalar_ops_slice_hash::hash (const value_type &s)
4577 {
4578   hashval_t hash = 0;
4579   for (unsigned i = 0; i < s.length; ++i)
4580     hash = iterative_hash_expr (s.op (i), hash);
4581   return hash;
4582 }
4583 
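/* Return true if the scalar operand slices S1 and S2 are element-wise
   equal.  */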
4584 bool
4585 vect_scalar_ops_slice_hash::equal (const value_type &s1,
4586 				   const compare_type &s2)
4587 {
4588   if (s1.length != s2.length)
4589     return false;
4590   for (unsigned i = 0; i < s1.length; ++i)
4591     if (!operand_equal_p (s1.op (i), s2.op (i)))
4592       return false;
4593   return true;
4594 }
4595 
4596 /* Compute the prologue cost for invariant or constant operands represented
4597    by NODE.  */
4598 
4599 static void
4600 vect_prologue_cost_for_slp (slp_tree node,
4601 			    stmt_vector_for_cost *cost_vec)
4602 {
4603   /* There's a special case of an existing vector, which costs nothing.  */
4604   if (SLP_TREE_SCALAR_OPS (node).length () == 0
4605       && !SLP_TREE_VEC_DEFS (node).is_empty ())
4606     return;
4607   /* Without looking at the actual initializer a vector of
4608      constants can be implemented as a load from the constant pool.
4609      When all elements are the same we can use a splat.  */
4610   tree vectype = SLP_TREE_VECTYPE (node);
4611   unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
4612   unsigned HOST_WIDE_INT const_nunits;
4613   unsigned nelt_limit;
4614   auto ops = &SLP_TREE_SCALAR_OPS (node);
4615   auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
4616   if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
4617       && ! multiple_p (const_nunits, group_size))
4618     {
4619       nelt_limit = const_nunits;
4620       hash_set<vect_scalar_ops_slice_hash> vector_ops;
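      /* Record the start of each vector whose slice of scalar operands was
	 not seen before; only those unique vectors are costed.  */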
4621       for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
4622 	if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
4623 	  starts.quick_push (i * const_nunits);
4624     }
4625   else
4626     {
4627       /* If either the vector has variable length or the vectors
4628 	 are composed of repeated whole groups we only need to
4629 	 cost construction once.  All vectors will be the same.  */
4630       nelt_limit = group_size;
4631       starts.quick_push (0);
4632     }
4633   /* ???  We're just tracking whether vectors in a single node are the same.
4634      Ideally we'd do something more global.  */
4635   for (unsigned int start : starts)
4636     {
4637       vect_cost_for_stmt kind;
4638       if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
4639 	kind = vector_load;
4640       else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
4641 	kind = scalar_to_vec;
4642       else
4643 	kind = vec_construct;
4644       record_stmt_cost (cost_vec, 1, kind, node, vectype, 0, vect_prologue);
4645     }
4646 }
4647 
4648 /* Analyze statements contained in SLP tree NODE after recursively analyzing
4649    the subtree.  NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
4650 
4651    Return true if the operations are supported.  */
4652 
4653 static bool
4654 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
4655 				  slp_instance node_instance,
4656 				  hash_set<slp_tree> &visited_set,
4657 				  vec<slp_tree> &visited_vec,
4658 				  stmt_vector_for_cost *cost_vec)
4659 {
4660   int i, j;
4661   slp_tree child;
4662 
4663   /* Assume we can code-generate all invariants.  */
4664   if (!node
4665       || SLP_TREE_DEF_TYPE (node) == vect_constant_def
4666       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
4667     return true;
4668 
4669   if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
4670     {
4671       if (dump_enabled_p ())
4672 	dump_printf_loc (MSG_NOTE, vect_location,
4673 			 "Failed cyclic SLP reference in %p\n", node);
4674       return false;
4675     }
4676   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
4677 
4678   /* If we already analyzed the exact same set of scalar stmts we're done.
4679      We share the generated vector stmts for those.  */
4680   if (visited_set.add (node))
4681     return true;
4682   visited_vec.safe_push (node);
4683 
4684   bool res = true;
4685   unsigned visited_rec_start = visited_vec.length ();
4686   unsigned cost_vec_rec_start = cost_vec->length ();
4687   bool seen_non_constant_child = false;
4688   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4689     {
4690       res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
4691 					      visited_set, visited_vec,
4692 					      cost_vec);
4693       if (!res)
4694 	break;
4695       if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
4696 	seen_non_constant_child = true;
4697     }
4698   /* We're having difficulties scheduling nodes with just constant
4699      operands and no scalar stmts since we then cannot compute a stmt
4700      insertion place.  */
4701   if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
4702     {
4703       if (dump_enabled_p ())
4704 	dump_printf_loc (MSG_NOTE, vect_location,
4705 			 "Cannot vectorize all-constant op node %p\n", node);
4706       res = false;
4707     }
4708 
4709   if (res)
4710     res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
4711 					      cost_vec);
4712   /* If analysis failed we have to pop all recursive visited nodes
4713      plus ourselves.  */
4714   if (!res)
4715     {
4716       while (visited_vec.length () >= visited_rec_start)
4717 	visited_set.remove (visited_vec.pop ());
4718       cost_vec->truncate (cost_vec_rec_start);
4719     }
4720 
4721   /* When the node can be vectorized, cost the invariant nodes it references.
4722      This is not done in DFS order to allow the referring node
4723      vectorizable_* calls to nail down the invariant nodes vector type
4724      and possibly unshare it if it needs a different vector type than
4725      other referrers.  */
4726   if (res)
4727     FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
4728       if (child
4729 	  && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
4730 	      || SLP_TREE_DEF_TYPE (child) == vect_external_def)
4731 	  /* Perform usual caching, note code-generation still
4732 	     code-gens these nodes multiple times but we expect
4733 	     to CSE them later.  */
4734 	  && !visited_set.add (child))
4735 	{
4736 	  visited_vec.safe_push (child);
4737 	  /* ???  After auditing more code paths make a "default"
4738 	     and push the vector type from NODE to all children
4739 	     if it is not already set.  */
4740 	  /* Compute the number of vectors to be generated.  */
4741 	  tree vector_type = SLP_TREE_VECTYPE (child);
4742 	  if (!vector_type)
4743 	    {
4744 	      /* For shifts with a scalar argument we don't need
4745 		 to cost or code-generate anything.
4746 		 ???  Represent this more explicitly.  */
4747 	      gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
4748 			   == shift_vec_info_type)
4749 			  && j == 1);
4750 	      continue;
4751 	    }
4752 	  unsigned group_size = SLP_TREE_LANES (child);
4753 	  poly_uint64 vf = 1;
4754 	  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4755 	    vf = loop_vinfo->vectorization_factor;
4756 	  SLP_TREE_NUMBER_OF_VEC_STMTS (child)
4757 	    = vect_get_num_vectors (vf * group_size, vector_type);
4758 	  /* And cost them.  */
4759 	  vect_prologue_cost_for_slp (child, cost_vec);
4760 	}
4761 
4762   /* If this node or any of its children can't be vectorized, try pruning
4763      the tree here rather than felling the whole thing.  */
4764   if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
4765     {
4766       /* We'll need to revisit this for invariant costing and number
4767 	 of vectorized stmt setting.   */
4768       res = true;
4769     }
4770 
4771   return res;
4772 }
4773 
4774 /* Mark lanes of NODE that are live outside of the basic-block vectorized
4775    region and that can be vectorized using vectorizable_live_operation
4776    with STMT_VINFO_LIVE_P.  Live operations that are not handled will cause
4777    the scalar code computing them to be retained.  */
4778 
4779 static void
4780 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
4781 			     slp_instance instance,
4782 			     stmt_vector_for_cost *cost_vec,
4783 			     hash_set<stmt_vec_info> &svisited,
4784 			     hash_set<slp_tree> &visited)
4785 {
4786   if (visited.add (node))
4787     return;
4788 
4789   unsigned i;
4790   stmt_vec_info stmt_info;
4791   stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
4792   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
4793     {
4794       if (svisited.contains (stmt_info))
4795 	continue;
4796       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4797       if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
4798 	  && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
4799 	/* Only the pattern root stmt computes the original scalar value.  */
4800 	continue;
4801       bool mark_visited = true;
4802       gimple *orig_stmt = orig_stmt_info->stmt;
4803       ssa_op_iter op_iter;
4804       def_operand_p def_p;
4805       FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
4806 	{
4807 	  imm_use_iterator use_iter;
4808 	  gimple *use_stmt;
4809 	  stmt_vec_info use_stmt_info;
4810 	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4811 	    if (!is_gimple_debug (use_stmt))
4812 	      {
4813 		use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
4814 		if (!use_stmt_info
4815 		    || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4816 		  {
4817 		    STMT_VINFO_LIVE_P (stmt_info) = true;
4818 		    if (vectorizable_live_operation (bb_vinfo, stmt_info,
4819 						     NULL, node, instance, i,
4820 						     false, cost_vec))
4821 		      /* ???  So we know we can vectorize the live stmt
4822 			 from one SLP node.  If we cannot do so from all
4823 			 or none consistently we'd have to record which
4824 			 SLP node (and lane) we want to use for the live
4825 			 operation.  So make sure we can code-generate
4826 			 from all nodes.  */
4827 		      mark_visited = false;
4828 		    else
4829 		      STMT_VINFO_LIVE_P (stmt_info) = false;
4830 		    break;
4831 		  }
4832 	      }
4833 	  /* We have to verify whether we can insert the lane extract
4834 	     before all uses.  The following is a conservative approximation.
4835 	     We cannot put this into vectorizable_live_operation because
4836 	     iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
4837 	     doesn't work.
4838 	     Note that while the fact that we emit code for loads at the
4839 	     first load should make this a non-problem, leaves we construct
4840 	     from scalars are vectorized after the last scalar def.
4841 	     ???  If we'd actually compute the insert location during
4842 	     analysis we could use sth less conservative than the last
4843 	     scalar stmt in the node for the dominance check.  */
4844 	  /* ???  What remains is "live" uses in vector CTORs in the same
4845 	     SLP graph which is where those uses can end up code-generated
4846 	     right after their definition instead of close to their original
4847 	     use.  But that would restrict us to code-generate lane-extracts
4848 	     from the latest stmt in a node.  So we compensate for this
4849 	     during code-generation, simply not replacing uses for those
4850 	     hopefully rare cases.  */
4851 	  if (STMT_VINFO_LIVE_P (stmt_info))
4852 	    FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
4853 	      if (!is_gimple_debug (use_stmt)
4854 		  && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
4855 		      || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
4856 		  && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
4857 		{
4858 		  if (dump_enabled_p ())
4859 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4860 				     "Cannot determine insertion place for "
4861 				     "lane extract\n");
4862 		  STMT_VINFO_LIVE_P (stmt_info) = false;
4863 		  mark_visited = true;
4864 		}
4865 	}
4866       if (mark_visited)
4867 	svisited.add (stmt_info);
4868     }
4869 
4870   slp_tree child;
4871   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4872     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
4873       vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
4874 				   cost_vec, svisited, visited);
4875 }
4876 
4877 /* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */
4878 
4879 static bool
4880 vectorizable_bb_reduc_epilogue (slp_instance instance,
4881 				stmt_vector_for_cost *cost_vec)
4882 {
4883   gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
4884   enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
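  /* For a chain ending in a MINUS_EXPR query target support for a
     PLUS_EXPR reduction instead.  */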
4885   if (reduc_code == MINUS_EXPR)
4886     reduc_code = PLUS_EXPR;
4887   internal_fn reduc_fn;
4888   tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
4889   if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
4890       || reduc_fn == IFN_LAST
4891       || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
4892       || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
4893 				     TREE_TYPE (vectype)))
4894     return false;
4895 
4896   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
4897      cost log2 vector operations plus shuffles and one extraction.  */
4898   unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
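  /* For example a V8SI reduction is costed as three vector stmts, three
     permutes and a single lane extraction.  */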
4899   record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
4900 		    vectype, 0, vect_body);
4901   record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
4902 		    vectype, 0, vect_body);
4903   record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
4904 		    vectype, 0, vect_body);
4905   return true;
4906 }
4907 
4908 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
4909    and recurse to children.  */
4910 
4911 static void
4912 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
4913 			      hash_set<slp_tree> &visited)
4914 {
4915   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
4916       || visited.add (node))
4917     return;
4918 
4919   stmt_vec_info stmt;
4920   unsigned i;
4921   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
4922     roots.remove (vect_orig_stmt (stmt));
4923 
4924   slp_tree child;
4925   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4926     if (child)
4927       vect_slp_prune_covered_roots (child, roots, visited);
4928 }
4929 
4930 /* Analyze statements in SLP instances of VINFO.  Return true if the
4931    operations are supported. */
4932 
4933 bool
4934 vect_slp_analyze_operations (vec_info *vinfo)
4935 {
4936   slp_instance instance;
4937   int i;
4938 
4939   DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
4940 
4941   hash_set<slp_tree> visited;
4942   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
4943     {
4944       auto_vec<slp_tree> visited_vec;
4945       stmt_vector_for_cost cost_vec;
4946       cost_vec.create (2);
4947       if (is_a <bb_vec_info> (vinfo))
4948 	vect_location = instance->location ();
4949       if (!vect_slp_analyze_node_operations (vinfo,
4950 					     SLP_INSTANCE_TREE (instance),
4951 					     instance, visited, visited_vec,
4952 					     &cost_vec)
4953 	  /* CTOR instances require vectorized defs for the SLP tree root.  */
4954 	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
4955 	      && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
4956 		  != vect_internal_def
4957 		  /* Make sure we vectorized with the expected type.  */
4958 		  || !useless_type_conversion_p
4959 			(TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
4960 					      (instance->root_stmts[0]->stmt))),
4961 			 TREE_TYPE (SLP_TREE_VECTYPE
4962 					    (SLP_INSTANCE_TREE (instance))))))
4963 	  /* Check we can vectorize the reduction.  */
4964 	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
4965 	      && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
4966         {
4967 	  slp_tree node = SLP_INSTANCE_TREE (instance);
4968 	  stmt_vec_info stmt_info;
4969 	  if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
4970 	    stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
4971 	  else
4972 	    stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
4973 	  if (dump_enabled_p ())
4974 	    dump_printf_loc (MSG_NOTE, vect_location,
4975 			     "removing SLP instance operations starting from: %G",
4976 			     stmt_info->stmt);
4977 	  vect_free_slp_instance (instance);
4978           vinfo->slp_instances.ordered_remove (i);
4979 	  cost_vec.release ();
4980 	  while (!visited_vec.is_empty ())
4981 	    visited.remove (visited_vec.pop ());
4982 	}
4983       else
4984 	{
4985 	  i++;
4986 	  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
4987 	    {
4988 	      add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
4989 	      cost_vec.release ();
4990 	    }
4991 	  else
4992 	    /* For BB vectorization remember the SLP graph entry
4993 	       cost for later.  */
4994 	    instance->cost_vec = cost_vec;
4995 	}
4996     }
4997 
4998   /* Now look for SLP instances with a root that are covered by other
4999      instances and remove them.  */
5000   hash_set<stmt_vec_info> roots;
5001   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5002     if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5003       roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
5004   if (!roots.is_empty ())
5005     {
5006       visited.empty ();
5007       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5008 	vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
5009 				      visited);
5010       for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
5011 	if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
5012 	    && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
5013 	  {
5014 	    stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
5015 	    if (dump_enabled_p ())
5016 	      dump_printf_loc (MSG_NOTE, vect_location,
5017 			       "removing SLP instance operations starting "
5018 			       "from: %G", root->stmt);
5019 	    vect_free_slp_instance (instance);
5020 	    vinfo->slp_instances.ordered_remove (i);
5021 	  }
5022 	else
5023 	  ++i;
5024     }
5025 
5026   /* Compute vectorizable live stmts.  */
5027   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
5028     {
5029       hash_set<stmt_vec_info> svisited;
5030       hash_set<slp_tree> visited;
5031       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
5032 	{
5033 	  vect_location = instance->location ();
5034 	  vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
5035 				       instance, &instance->cost_vec, svisited,
5036 				       visited);
5037 	}
5038     }
5039 
5040   return !vinfo->slp_instances.is_empty ();
5041 }
5042 
5043 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
5044    transitively collapsing any chain of intermediate leaders.  */
5045 
5046 static slp_instance
5047 get_ultimate_leader (slp_instance instance,
5048 		     hash_map<slp_instance, slp_instance> &instance_leader)
5049 {
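  /* Walk up the leader chain, remembering the visited links so they can be
     redirected to the final leader below (union-find style path
     compression).  */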
5050   auto_vec<slp_instance *, 8> chain;
5051   slp_instance *tem;
5052   while (*(tem = instance_leader.get (instance)) != instance)
5053     {
5054       chain.safe_push (tem);
5055       instance = *tem;
5056     }
5057   while (!chain.is_empty ())
5058     *chain.pop () = instance;
5059   return instance;
5060 }
5061 
5062 /* Worker of vect_bb_partition_graph, recurse on NODE.  */
5063 
5064 static void
5065 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
5066 			   slp_instance instance, slp_tree node,
5067 			   hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
5068 			   hash_map<slp_instance, slp_instance> &instance_leader,
5069 			   hash_set<slp_tree> &visited)
5070 {
5071   stmt_vec_info stmt_info;
5072   unsigned i;
5073 
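  /* Record INSTANCE as the graph entry covering each scalar stmt of NODE,
     merging with a previously recorded instance via the leader map when a
     stmt turns out to be shared.  */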
5074   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5075     {
5076       bool existed_p;
5077       slp_instance &stmt_instance
5078 	= stmt_to_instance.get_or_insert (stmt_info, &existed_p);
5079       if (!existed_p)
5080 	;
5081       else if (stmt_instance != instance)
5082 	{
5083 	  /* If we're running into a previously marked stmt make us the
5084 	     leader of the current ultimate leader.  This keeps the
5085 	     leader chain acyclic and works even when the current instance
5086 	     connects two previously independent graph parts.  */
5087 	  slp_instance stmt_leader
5088 	    = get_ultimate_leader (stmt_instance, instance_leader);
5089 	  if (stmt_leader != instance)
5090 	    instance_leader.put (stmt_leader, instance);
5091 	}
5092       stmt_instance = instance;
5093     }
5094 
5095   if (!SLP_TREE_SCALAR_STMTS (node).is_empty () && visited.add (node))
5096     return;
5097 
5098   slp_tree child;
5099   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5100     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5101       vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
5102 				 instance_leader, visited);
5103 }
5104 
5105 /* Partition the SLP graph into pieces that can be costed independently.  */
5106 
5107 static void
5108 vect_bb_partition_graph (bb_vec_info bb_vinfo)
5109 {
5110   DUMP_VECT_SCOPE ("vect_bb_partition_graph");
5111 
5112   /* First walk the SLP graph assigning each involved scalar stmt a
5113      corresponding SLP graph entry and upon visiting a previously
5114      marked stmt, make the stmt's leader the current SLP graph entry.  */
5115   hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
5116   hash_map<slp_instance, slp_instance> instance_leader;
5117   hash_set<slp_tree> visited;
5118   slp_instance instance;
5119   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5120     {
5121       instance_leader.put (instance, instance);
5122       vect_bb_partition_graph_r (bb_vinfo,
5123 				 instance, SLP_INSTANCE_TREE (instance),
5124 				 stmt_to_instance, instance_leader,
5125 				 visited);
5126     }
5127 
5128   /* Then collect entries to each independent subgraph.  */
5129   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
5130     {
5131       slp_instance leader = get_ultimate_leader (instance, instance_leader);
5132       leader->subgraph_entries.safe_push (instance);
5133       if (dump_enabled_p ()
5134 	  && leader != instance)
5135 	dump_printf_loc (MSG_NOTE, vect_location,
5136 			 "instance %p is leader of %p\n",
5137 			 leader, instance);
5138     }
5139 }
5140 
5141 /* Compute the set of scalar stmts participating in internal and external
5142    nodes.  */
5143 
5144 static void
5145 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
5146 					 hash_set<slp_tree> &visited,
5147 					 hash_set<stmt_vec_info> &vstmts,
5148 					 hash_set<stmt_vec_info> &estmts)
5149 {
5150   int i;
5151   stmt_vec_info stmt_info;
5152   slp_tree child;
5153 
5154   if (visited.add (node))
5155     return;
5156 
5157   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
5158     {
5159       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5160 	vstmts.add (stmt_info);
5161 
5162       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5163 	if (child)
5164 	  vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
5165 						   vstmts, estmts);
5166     }
5167   else
5168     for (tree def : SLP_TREE_SCALAR_OPS (node))
5169       {
5170 	stmt_vec_info def_stmt = vinfo->lookup_def (def);
5171 	if (def_stmt)
5172 	  estmts.add (def_stmt);
5173       }
5174 }
5175 
5176 
5177 /* Compute the scalar cost of the SLP node NODE and its children
5178    and record it in COST_VEC.  Do not account defs that are marked in LIFE and
5179    update LIFE according to uses of NODE.  */
5180 
5181 static void
5182 vect_bb_slp_scalar_cost (vec_info *vinfo,
5183 			 slp_tree node, vec<bool, va_heap> *life,
5184 			 stmt_vector_for_cost *cost_vec,
5185 			 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
5186 			 hash_set<slp_tree> &visited)
5187 {
5188   unsigned i;
5189   stmt_vec_info stmt_info;
5190   slp_tree child;
5191 
5192   if (visited.add (node))
5193     return;
5194 
5195   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
5196     {
5197       ssa_op_iter op_iter;
5198       def_operand_p def_p;
5199 
5200       if ((*life)[i])
5201 	continue;
5202 
5203       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5204       gimple *orig_stmt = orig_stmt_info->stmt;
5205 
5206       /* If there is a non-vectorized use of the defs then the scalar
5207          stmt is kept live in which case we do not account it or any
5208 	 required defs in the SLP children in the scalar cost.  This
5209 	 way we make the vectorization more costly when compared to
5210 	 the scalar cost.  */
5211       if (!STMT_VINFO_LIVE_P (stmt_info))
5212 	{
5213 	  auto_vec<gimple *, 8> worklist;
5214 	  hash_set<gimple *> *worklist_visited = NULL;
5215 	  worklist.quick_push (orig_stmt);
5216 	  do
5217 	    {
5218 	      gimple *work_stmt = worklist.pop ();
5219 	      FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
5220 		{
5221 		  imm_use_iterator use_iter;
5222 		  gimple *use_stmt;
5223 		  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
5224 					 DEF_FROM_PTR (def_p))
5225 		    if (!is_gimple_debug (use_stmt))
5226 		      {
5227 			stmt_vec_info use_stmt_info
5228 			  = vinfo->lookup_stmt (use_stmt);
5229 			if (!use_stmt_info
5230 			    || !vectorized_scalar_stmts.contains (use_stmt_info))
5231 			  {
5232 			    if (use_stmt_info
5233 				&& STMT_VINFO_IN_PATTERN_P (use_stmt_info))
5234 			      {
5235 				/* For stmts participating in patterns we have
5236 				   to check its uses recursively.  */
5237 				if (!worklist_visited)
5238 				  worklist_visited = new hash_set<gimple *> ();
5239 				if (!worklist_visited->add (use_stmt))
5240 				  worklist.safe_push (use_stmt);
5241 				continue;
5242 			      }
5243 			    (*life)[i] = true;
5244 			    goto next_lane;
5245 			  }
5246 		      }
5247 		}
5248 	    }
5249 	  while (!worklist.is_empty ());
5250 next_lane:
5251 	  if (worklist_visited)
5252 	    delete worklist_visited;
5253 	  if ((*life)[i])
5254 	    continue;
5255 	}
5256 
5257       /* Count scalar stmts only once.  */
5258       if (gimple_visited_p (orig_stmt))
5259 	continue;
5260       gimple_set_visited (orig_stmt, true);
5261 
5262       vect_cost_for_stmt kind;
5263       if (STMT_VINFO_DATA_REF (orig_stmt_info))
5264 	{
5265 	  if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
5266 	    kind = scalar_load;
5267 	  else
5268 	    kind = scalar_store;
5269 	}
5270       else if (vect_nop_conversion_p (orig_stmt_info))
5271 	continue;
5272       /* For single-argument PHIs assume coalescing which means zero cost
5273 	 for the scalar and the vector PHIs.  This avoids artificially
5274 	 favoring the vector path (but may pessimize it in some cases).  */
5275       else if (is_a <gphi *> (orig_stmt_info->stmt)
5276 	       && gimple_phi_num_args
5277 		    (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
5278 	continue;
5279       else
5280 	kind = scalar_stmt;
5281       record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
5282 			SLP_TREE_VECTYPE (node), 0, vect_body);
5283     }
5284 
5285   auto_vec<bool, 20> subtree_life;
5286   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5287     {
5288       if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
5289 	{
5290 	  /* Do not directly pass LIFE to the recursive call, copy it to
5291 	     confine changes in the callee to the current child/subtree.  */
5292 	  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5293 	    {
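	      /* Lane J of a VEC_PERM node is sourced from lane PERM.SECOND
		 of child PERM.FIRST, so transfer the life flags along the
		 permutation.  */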
5294 	      subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
5295 	      for (unsigned j = 0;
5296 		   j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
5297 		{
5298 		  auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
5299 		  if (perm.first == i)
5300 		    subtree_life[perm.second] = (*life)[j];
5301 		}
5302 	    }
5303 	  else
5304 	    {
5305 	      gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
5306 	      subtree_life.safe_splice (*life);
5307 	    }
5308 	  vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
5309 				   vectorized_scalar_stmts, visited);
5310 	  subtree_life.truncate (0);
5311 	}
5312     }
5313 }
5314 
5315 /* Comparator for the loop-index sorted cost vectors.  */
5316 
5317 static int
5318 li_cost_vec_cmp (const void *a_, const void *b_)
5319 {
5320   auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
5321   auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
5322   if (a->first < b->first)
5323     return -1;
5324   else if (a->first == b->first)
5325     return 0;
5326   return 1;
5327 }
5328 
5329 /* Check if vectorization of the basic block is profitable for the
5330    subgraph denoted by SLP_INSTANCES.  */
5331 
5332 static bool
5333 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
5334 				    vec<slp_instance> slp_instances,
5335 				    loop_p orig_loop)
5336 {
5337   slp_instance instance;
5338   int i;
5339   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
5340   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
5341 
5342   if (dump_enabled_p ())
5343     {
5344       dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
5345       hash_set<slp_tree> visited;
5346       FOR_EACH_VEC_ELT (slp_instances, i, instance)
5347 	vect_print_slp_graph (MSG_NOTE, vect_location,
5348 			      SLP_INSTANCE_TREE (instance), visited);
5349     }
5350 
5351   /* Compute the set of scalar stmts we know will go away 'locally' when
5352      vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
5353      not accurate for nodes promoted extern late or for scalar stmts that
5354      are used both in extern defs and in vectorized defs.  */
5355   hash_set<stmt_vec_info> vectorized_scalar_stmts;
5356   hash_set<stmt_vec_info> scalar_stmts_in_externs;
5357   hash_set<slp_tree> visited;
5358   FOR_EACH_VEC_ELT (slp_instances, i, instance)
5359     {
5360       vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
5361 					       SLP_INSTANCE_TREE (instance),
5362 					       visited,
5363 					       vectorized_scalar_stmts,
5364 					       scalar_stmts_in_externs);
5365       for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
5366 	vectorized_scalar_stmts.add (rstmt);
5367     }
5368   /* Scalar stmts used as defs in external nodes need to be preserved, so
5369      remove them from vectorized_scalar_stmts.  */
5370   for (stmt_vec_info stmt : scalar_stmts_in_externs)
5371     vectorized_scalar_stmts.remove (stmt);
5372 
5373   /* Calculate scalar cost and sum the cost for the vector stmts
5374      previously collected.  */
5375   stmt_vector_for_cost scalar_costs = vNULL;
5376   stmt_vector_for_cost vector_costs = vNULL;
5377   visited.empty ();
5378   FOR_EACH_VEC_ELT (slp_instances, i, instance)
5379     {
5380       auto_vec<bool, 20> life;
5381       life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
5382 			      true);
5383       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
5384 	record_stmt_cost (&scalar_costs,
5385 			  SLP_INSTANCE_ROOT_STMTS (instance).length (),
5386 			  scalar_stmt,
5387 			  SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
5388       vect_bb_slp_scalar_cost (bb_vinfo,
5389 			       SLP_INSTANCE_TREE (instance),
5390 			       &life, &scalar_costs, vectorized_scalar_stmts,
5391 			       visited);
5392       vector_costs.safe_splice (instance->cost_vec);
5393       instance->cost_vec.release ();
5394     }
5395 
5396   if (dump_enabled_p ())
5397     dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5398 
5399   /* When costing non-loop vectorization we need to consider each covered
5400      loop independently and make sure vectorization is profitable.  For
5401      now we assume a loop may be not entered or executed an arbitrary
5402      number of iterations (???  static information can provide more
5403      precise info here) which means we can simply cost each containing
5404      loop's stmts separately.  */
5405 
5406   /* First produce cost vectors sorted by loop index.  */
5407   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5408     li_scalar_costs (scalar_costs.length ());
5409   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
5410     li_vector_costs (vector_costs.length ());
5411   stmt_info_for_cost *cost;
5412   FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5413     {
5414       unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5415       li_scalar_costs.quick_push (std::make_pair (l, cost));
5416     }
5417   /* Use an arbitrary loop from the scalar costs as a fallback in case the
5418      first vector_costs entry does not have a stmt_info associated with it.  */
5419   unsigned l = li_scalar_costs[0].first;
5420   FOR_EACH_VEC_ELT (vector_costs, i, cost)
5421     {
5422       /* We inherit from the previous COST, invariants, externals and
5423 	 extracts immediately follow the cost for the related stmt.  */
5424       if (cost->stmt_info)
5425 	l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
5426       li_vector_costs.quick_push (std::make_pair (l, cost));
5427     }
5428   li_scalar_costs.qsort (li_cost_vec_cmp);
5429   li_vector_costs.qsort (li_cost_vec_cmp);
5430 
5431   /* Now cost the portions individually.  */
5432   unsigned vi = 0;
5433   unsigned si = 0;
5434   bool profitable = true;
5435   while (si < li_scalar_costs.length ()
5436 	 && vi < li_vector_costs.length ())
5437     {
5438       unsigned sl = li_scalar_costs[si].first;
5439       unsigned vl = li_vector_costs[vi].first;
5440       if (sl != vl)
5441 	{
5442 	  if (dump_enabled_p ())
5443 	    dump_printf_loc (MSG_NOTE, vect_location,
5444 			     "Scalar %d and vector %d loop part do not "
5445 			     "match up, skipping scalar part\n", sl, vl);
5446 	  /* Skip the scalar part, assuming zero cost on the vector side.  */
5447 	  do
5448 	    {
5449 	      si++;
5450 	    }
5451 	  while (si < li_scalar_costs.length ()
5452 		 && li_scalar_costs[si].first == sl);
5453 	  continue;
5454 	}
5455 
5456       class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
5457       do
5458 	{
5459 	  add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
5460 	  si++;
5461 	}
5462       while (si < li_scalar_costs.length ()
5463 	     && li_scalar_costs[si].first == sl);
5464       unsigned dummy;
5465       finish_cost (scalar_target_cost_data, nullptr,
5466 		   &dummy, &scalar_cost, &dummy);
5467 
5468       /* Complete the target-specific vector cost calculation.  */
5469       class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
5470       do
5471 	{
5472 	  add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
5473 	  vi++;
5474 	}
5475       while (vi < li_vector_costs.length ()
5476 	     && li_vector_costs[vi].first == vl);
5477       finish_cost (vect_target_cost_data, scalar_target_cost_data,
5478 		   &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
5479       delete scalar_target_cost_data;
5480       delete vect_target_cost_data;
5481 
5482       vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
5483 
5484       if (dump_enabled_p ())
5485 	{
5486 	  dump_printf_loc (MSG_NOTE, vect_location,
5487 			   "Cost model analysis for part in loop %d:\n", sl);
5488 	  dump_printf (MSG_NOTE, "  Vector cost: %d\n",
5489 		       vec_inside_cost + vec_outside_cost);
5490 	  dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
5491 	}
5492 
5493       /* Vectorization is profitable if its cost is no more than the cost of the
5494 	 scalar version.  Note that we err on the vector side for equal cost because
5495 	 the cost estimate is otherwise quite pessimistic (constant uses are
5496 	 free on the scalar side but cost a load on the vector side for
5497 	 example).  */
5498       if (vec_outside_cost + vec_inside_cost > scalar_cost)
5499 	{
5500 	  profitable = false;
5501 	  break;
5502 	}
5503     }
5504   if (profitable && vi < li_vector_costs.length ())
5505     {
5506       if (dump_enabled_p ())
5507 	dump_printf_loc (MSG_NOTE, vect_location,
5508 			 "Excess vector cost for part in loop %d:\n",
5509 			 li_vector_costs[vi].first);
5510       profitable = false;
5511     }
5512 
5513   /* Unset visited flag.  This is delayed when the subgraph is profitable
5514      and we process the loop for remaining unvectorized if-converted code.  */
5515   if (!orig_loop || !profitable)
5516     FOR_EACH_VEC_ELT (scalar_costs, i, cost)
5517       gimple_set_visited  (cost->stmt_info->stmt, false);
5518 
5519   scalar_costs.release ();
5520   vector_costs.release ();
5521 
5522   return profitable;
5523 }
5524 
5525 /* qsort comparator for lane defs.  */
5526 
5527 static int
5528 vld_cmp (const void *a_, const void *b_)
5529 {
5530   auto *a = (const std::pair<unsigned, tree> *)a_;
5531   auto *b = (const std::pair<unsigned, tree> *)b_;
5532   return a->first - b->first;
5533 }
5534 
5535 /* Return true if USE_STMT is a vector lane insert into VEC and set
5536    *THIS_LANE to the lane number that is set.  */
5537 
5538 static bool
5539 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
5540 {
5541   gassign *use_ass = dyn_cast <gassign *> (use_stmt);
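  /* Match lhs = BIT_INSERT_EXPR <VEC, scalar, lane-position>.  If VEC is
     NULL_TREE any vector operand is accepted and used for the remaining
     checks.  */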
5542   if (!use_ass
5543       || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
5544       || (vec
5545 	  ? gimple_assign_rhs1 (use_ass) != vec
5546 	  : ((vec = gimple_assign_rhs1 (use_ass)), false))
5547       || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
5548 				     TREE_TYPE (gimple_assign_rhs2 (use_ass)))
5549       || !constant_multiple_p
5550 	    (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
5551 	     tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
5552 	     this_lane))
5553     return false;
5554   return true;
5555 }
5556 
5557 /* Find any vectorizable constructors and add them to the grouped_stores
5558    array.  */
5559 
5560 static void
5561 vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
5562 {
5563   for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
5564     for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
5565 	 !gsi_end_p (gsi); gsi_next (&gsi))
5566     {
5567       gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
5568       if (!assign)
5569 	continue;
5570 
5571       tree rhs = gimple_assign_rhs1 (assign);
5572       enum tree_code code = gimple_assign_rhs_code (assign);
5573       use_operand_p use_p;
5574       gimple *use_stmt;
5575       if (code == CONSTRUCTOR)
5576 	{
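	  /* Only a CONSTRUCTOR filling a whole non-uniform vector from
	     scalar SSA defs inside the region is a candidate, for example
	     v_5 = {a_1, b_2, c_3, d_4}.  */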
5577 	  if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5578 	      || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
5579 			   CONSTRUCTOR_NELTS (rhs))
5580 	      || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
5581 	      || uniform_vector_p (rhs))
5582 	    continue;
5583 
5584 	  unsigned j;
5585 	  tree val;
5586 	  FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
5587 	      if (TREE_CODE (val) != SSA_NAME
5588 		  || !bb_vinfo->lookup_def (val))
5589 		break;
5590 	  if (j != CONSTRUCTOR_NELTS (rhs))
5591 	    continue;
5592 
5593 	  stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
5594 	  BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
5595 	}
5596       else if (code == BIT_INSERT_EXPR
5597 	       && VECTOR_TYPE_P (TREE_TYPE (rhs))
5598 	       && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
5599 	       && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
5600 	       && integer_zerop (gimple_assign_rhs3 (assign))
5601 	       && useless_type_conversion_p
5602 		    (TREE_TYPE (TREE_TYPE (rhs)),
5603 		     TREE_TYPE (gimple_assign_rhs2 (assign)))
5604 	       && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
5605 	{
5606 	  /* We start to match on insert to lane zero but since the
5607 	     inserts need not be ordered we'd have to search both
5608 	     the def and the use chains.  */
5609 	  tree vectype = TREE_TYPE (rhs);
5610 	  unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5611 	  auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
5612 	  auto_sbitmap lanes (nlanes);
5613 	  bitmap_clear (lanes);
5614 	  bitmap_set_bit (lanes, 0);
5615 	  tree def = gimple_assign_lhs (assign);
5616 	  lane_defs.quick_push
5617 		      (std::make_pair (0, gimple_assign_rhs2 (assign)));
5618 	  unsigned lanes_found = 1;
5619 	  /* Start with the use chains, the last stmt will be the root.  */
5620 	  stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
5621 	  vec<stmt_vec_info> roots = vNULL;
5622 	  roots.safe_push (last);
5623 	  do
5624 	    {
5625 	      use_operand_p use_p;
5626 	      gimple *use_stmt;
5627 	      if (!single_imm_use (def, &use_p, &use_stmt))
5628 		break;
5629 	      unsigned this_lane;
5630 	      if (!bb_vinfo->lookup_stmt (use_stmt)
5631 		  || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
5632 		  || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
5633 		break;
5634 	      if (bitmap_bit_p (lanes, this_lane))
5635 		break;
5636 	      lanes_found++;
5637 	      bitmap_set_bit (lanes, this_lane);
5638 	      gassign *use_ass = as_a <gassign *> (use_stmt);
5639 	      lane_defs.quick_push (std::make_pair
5640 				     (this_lane, gimple_assign_rhs2 (use_ass)));
5641 	      last = bb_vinfo->lookup_stmt (use_ass);
5642 	      roots.safe_push (last);
5643 	      def = gimple_assign_lhs (use_ass);
5644 	    }
5645 	  while (lanes_found < nlanes);
5646 	  if (roots.length () > 1)
5647 	    std::swap(roots[0], roots[roots.length () - 1]);
5648 	  if (lanes_found < nlanes)
5649 	    {
5650 	      /* Now search the def chain.  */
5651 	      def = gimple_assign_rhs1 (assign);
5652 	      do
5653 		{
5654 		  if (TREE_CODE (def) != SSA_NAME
5655 		      || !has_single_use (def))
5656 		    break;
5657 		  gimple *def_stmt = SSA_NAME_DEF_STMT (def);
5658 		  unsigned this_lane;
5659 		  if (!bb_vinfo->lookup_stmt (def_stmt)
5660 		      || !vect_slp_is_lane_insert (def_stmt,
5661 						   NULL_TREE, &this_lane)
5662 		      || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
5663 		    break;
5664 		  if (bitmap_bit_p (lanes, this_lane))
5665 		    break;
5666 		  lanes_found++;
5667 		  bitmap_set_bit (lanes, this_lane);
5668 		  lane_defs.quick_push (std::make_pair
5669 					  (this_lane,
5670 					   gimple_assign_rhs2 (def_stmt)));
5671 		  roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
5672 		  def = gimple_assign_rhs1 (def_stmt);
5673 		}
5674 	      while (lanes_found < nlanes);
5675 	    }
5676 	  if (lanes_found == nlanes)
5677 	    {
5678 	      /* Sort lane_defs after the lane index and register the root.  */
5679 	      lane_defs.qsort (vld_cmp);
5680 	      vec<stmt_vec_info> stmts;
5681 	      stmts.create (nlanes);
5682 	      for (unsigned i = 0; i < nlanes; ++i)
5683 		stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
5684 	      bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
5685 						   stmts, roots));
5686 	    }
5687 	  else
5688 	    roots.release ();
5689 	}
5690       else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
5691 	       && (associative_tree_code (code) || code == MINUS_EXPR)
5692 	       /* ???  The flag_associative_math and TYPE_OVERFLOW_WRAPS
5693 		  checks pessimize a two-element reduction.  PR54400.
5694 		  ???  In-order reduction could be handled if we only
5695 		  traverse one operand chain in vect_slp_linearize_chain.  */
5696 	       && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
5697 		   || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
5698 		       && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
5699 	       /* Ops with constants at the tail can be stripped here.  */
5700 	       && TREE_CODE (rhs) == SSA_NAME
5701 	       && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
5702 	       /* Should be the chain end.  */
5703 	       && (!single_imm_use (gimple_assign_lhs (assign),
5704 				    &use_p, &use_stmt)
5705 		   || !is_gimple_assign (use_stmt)
5706 		   || (gimple_assign_rhs_code (use_stmt) != code
5707 		       && ((code != PLUS_EXPR && code != MINUS_EXPR)
5708 			   || (gimple_assign_rhs_code (use_stmt)
5709 			       != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
5710 	{
5711 	  /* We start the match at the end of a possible association
5712 	     chain.  */
5713 	  auto_vec<chain_op_t> chain;
5714 	  auto_vec<std::pair<tree_code, gimple *> > worklist;
5715 	  auto_vec<gimple *> chain_stmts;
5716 	  gimple *code_stmt = NULL, *alt_code_stmt = NULL;
5717 	  if (code == MINUS_EXPR)
5718 	    code = PLUS_EXPR;
5719 	  internal_fn reduc_fn;
5720 	  if (!reduction_fn_for_scalar_code (code, &reduc_fn)
5721 	      || reduc_fn == IFN_LAST)
5722 	    continue;
5723 	  vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
5724 				    /* ??? */
5725 				    code_stmt, alt_code_stmt, &chain_stmts);
5726 	  if (chain.length () > 1)
5727 	    {
5728 	      /* Sort the chain according to def_type and operation.  */
5729 	      chain.sort (dt_sort_cmp, bb_vinfo);
5730 	      /* ???  Now we'd want to strip externals and constants
5731 		 but record those to be handled in the epilogue.  */
5732 	      /* ???  For now do not allow mixing ops or externs/constants.  */
5733 	      bool invalid = false;
5734 	      for (unsigned i = 0; i < chain.length (); ++i)
5735 		if (chain[i].dt != vect_internal_def
5736 		    || chain[i].code != code)
5737 		  invalid = true;
5738 	      if (!invalid)
5739 		{
5740 		  vec<stmt_vec_info> stmts;
5741 		  stmts.create (chain.length ());
5742 		  for (unsigned i = 0; i < chain.length (); ++i)
5743 		    stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
5744 		  vec<stmt_vec_info> roots;
5745 		  roots.create (chain_stmts.length ());
5746 		  for (unsigned i = 0; i < chain_stmts.length (); ++i)
5747 		    roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
5748 		  bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
5749 						       stmts, roots));
5750 		}
5751 	    }
5752 	}
5753     }
5754 }
5755 
5756 /* Walk the grouped store chains and replace entries with their
5757    pattern variant if any.  */
5758 
5759 static void
5760 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
5761 {
5762   stmt_vec_info first_element;
5763   unsigned i;
5764 
5765   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5766     {
5767       /* We also have CTORs in this array.  */
5768       if (!STMT_VINFO_GROUPED_ACCESS (first_element))
5769 	continue;
5770       if (STMT_VINFO_IN_PATTERN_P (first_element))
5771 	{
5772 	  stmt_vec_info orig = first_element;
5773 	  first_element = STMT_VINFO_RELATED_STMT (first_element);
5774 	  DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
5775 	  DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
5776 	  DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
5777 	  DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
5778 	  vinfo->grouped_stores[i] = first_element;
5779 	}
5780       stmt_vec_info prev = first_element;
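      /* Walk the remaining group elements, replacing each one that has a
	 pattern variant and re-linking the DR group chain accordingly.  */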
5781       while (DR_GROUP_NEXT_ELEMENT (prev))
5782 	{
5783 	  stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
5784 	  if (STMT_VINFO_IN_PATTERN_P (elt))
5785 	    {
5786 	      stmt_vec_info orig = elt;
5787 	      elt = STMT_VINFO_RELATED_STMT (elt);
5788 	      DR_GROUP_NEXT_ELEMENT (prev) = elt;
5789 	      DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
5790 	      DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
5791 	    }
5792 	  DR_GROUP_FIRST_ELEMENT (elt) = first_element;
5793 	  prev = elt;
5794 	}
5795     }
5796 }
5797 
5798 /* Check if the region described by BB_VINFO can be vectorized, returning
5799    true if so.  When returning false, set FATAL to true if the same failure
5800    would prevent vectorization at other vector sizes, false if it is still
5801    worth trying other sizes.  N_STMTS is the number of statements in the
5802    region.  */
5803 
5804 static bool
5805 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
5806 		       vec<int> *dataref_groups)
5807 {
5808   DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
5809 
5810   slp_instance instance;
5811   int i;
5812   poly_uint64 min_vf = 2;
5813 
5814   /* The first group of checks is independent of the vector size.  */
5815   fatal = true;
5816 
5817   /* Analyze the data references.  */
5818 
5819   if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
5820     {
5821       if (dump_enabled_p ())
5822         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5823 			 "not vectorized: unhandled data-ref in basic "
5824 			 "block.\n");
5825       return false;
5826     }
5827 
5828   if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
5829     {
5830       if (dump_enabled_p ())
5831 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5832 			 "not vectorized: unhandled data access in "
5833 			 "basic block.\n");
5834       return false;
5835     }
5836 
5837   vect_slp_check_for_constructors (bb_vinfo);
5838 
5839   /* If there are no grouped stores and no constructors in the region
5840      there is no need to continue with pattern recog as vect_analyze_slp
5841      will fail anyway.  */
5842   if (bb_vinfo->grouped_stores.is_empty ()
5843       && bb_vinfo->roots.is_empty ())
5844     {
5845       if (dump_enabled_p ())
5846 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5847 			 "not vectorized: no grouped stores in "
5848 			 "basic block.\n");
5849       return false;
5850     }
5851 
5852   /* The rest of the analysis below depends on the vector size in some way.  */
5853   fatal = false;
5854 
5855   vect_pattern_recog (bb_vinfo);
5856 
5857   /* Update store groups from pattern processing.  */
5858   vect_fixup_store_groups_with_patterns (bb_vinfo);
5859 
5860   /* Check the SLP opportunities in the basic block, analyze and build SLP
5861      trees.  */
5862   if (!vect_analyze_slp (bb_vinfo, n_stmts))
5863     {
5864       if (dump_enabled_p ())
5865 	{
5866 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5867 			   "Failed to SLP the basic block.\n");
5868 	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5869 			   "not vectorized: failed to find SLP opportunities "
5870 			   "in basic block.\n");
5871 	}
5872       return false;
5873     }
5874 
5875   /* Optimize permutations.  */
5876   vect_optimize_slp (bb_vinfo);
5877 
5878   /* Gather the loads reachable from the SLP graph entries.  */
5879   vect_gather_slp_loads (bb_vinfo);
5880 
5881   vect_record_base_alignments (bb_vinfo);
5882 
5883   /* Analyze and verify the alignment of data references and the
5884      dependence in the SLP instances.  */
5885   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
5886     {
5887       vect_location = instance->location ();
5888       if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
5889 	  || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
5890 	{
5891 	  slp_tree node = SLP_INSTANCE_TREE (instance);
5892 	  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
5893 	  if (dump_enabled_p ())
5894 	    dump_printf_loc (MSG_NOTE, vect_location,
5895 			     "removing SLP instance operations starting from: %G",
5896 			     stmt_info->stmt);
5897 	  vect_free_slp_instance (instance);
5898 	  BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
5899 	  continue;
5900 	}
5901 
5902       /* Mark all the statements that we want to vectorize as pure SLP and
5903 	 relevant.  */
5904       vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5905       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
5906       unsigned j;
5907       stmt_vec_info root;
5908       /* Likewise consider instance root stmts as vectorized.  */
5909       FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
5910 	STMT_SLP_TYPE (root) = pure_slp;
5911 
5912       i++;
5913     }
5914   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
5915     return false;
5916 
5917   if (!vect_slp_analyze_operations (bb_vinfo))
5918     {
5919       if (dump_enabled_p ())
5920         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5921 			 "not vectorized: bad operation in basic block.\n");
5922       return false;
5923     }
5924 
5925   vect_bb_partition_graph (bb_vinfo);
5926 
5927   return true;
5928 }
5929 
5930 /* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
5931    basic blocks in BBS, returning true on success.
5932    The region has N_STMTS statements and has the datarefs given by DATAREFS.  */
5933 
5934 static bool
5935 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
5936 		 vec<int> *dataref_groups, unsigned int n_stmts,
5937 		 loop_p orig_loop)
5938 {
5939   bb_vec_info bb_vinfo;
5940   auto_vector_modes vector_modes;
5941 
5942   /* Autodetect first vector size we try.  */
5943   machine_mode next_vector_mode = VOIDmode;
5944   targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
5945   unsigned int mode_i = 0;
5946 
5947   vec_info_shared shared;
5948 
5949   machine_mode autodetected_vector_mode = VOIDmode;
5950   while (1)
5951     {
5952       bool vectorized = false;
5953       bool fatal = false;
5954       bb_vinfo = new _bb_vec_info (bbs, &shared);
5955 
5956       bool first_time_p = shared.datarefs.is_empty ();
5957       BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
5958       if (first_time_p)
5959 	bb_vinfo->shared->save_datarefs ();
5960       else
5961 	bb_vinfo->shared->check_datarefs ();
5962       bb_vinfo->vector_mode = next_vector_mode;
5963 
5964       if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
5965 	{
5966 	  if (dump_enabled_p ())
5967 	    {
5968 	      dump_printf_loc (MSG_NOTE, vect_location,
5969 			       "***** Analysis succeeded with vector mode"
5970 			       " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
5971 	      dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
5972 	    }
5973 
5974 	  bb_vinfo->shared->check_datarefs ();
5975 
5976 	  auto_vec<slp_instance> profitable_subgraphs;
5977 	  for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
5978 	    {
5979 	      if (instance->subgraph_entries.is_empty ())
5980 		continue;
5981 
5982 	      vect_location = instance->location ();
5983 	      if (!unlimited_cost_model (NULL)
5984 		  && !vect_bb_vectorization_profitable_p
5985 			(bb_vinfo, instance->subgraph_entries, orig_loop))
5986 		{
5987 		  if (dump_enabled_p ())
5988 		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5989 				     "not vectorized: vectorization is not "
5990 				     "profitable.\n");
5991 		  continue;
5992 		}
5993 
5994 	      if (!dbg_cnt (vect_slp))
5995 		continue;
5996 
5997 	      profitable_subgraphs.safe_push (instance);
5998 	    }
5999 
6000 	  /* When we're vectorizing an if-converted loop body make sure
6001 	     we vectorized all if-converted code.  */
6002 	  if (!profitable_subgraphs.is_empty ()
6003 	      && orig_loop)
6004 	    {
6005 	      gcc_assert (bb_vinfo->bbs.length () == 1);
6006 	      for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
6007 		   !gsi_end_p (gsi); gsi_next (&gsi))
6008 		{
6009 		  /* The costing above left us with DCEable vectorized scalar
6010 		     stmts having the visited flag set on profitable
6011 		     subgraphs.  Do the delayed clearing of the flag here.  */
6012 		  if (gimple_visited_p (gsi_stmt (gsi)))
6013 		    {
6014 		      gimple_set_visited (gsi_stmt (gsi), false);
6015 		      continue;
6016 		    }
6017 		  if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
6018 		    continue;
6019 
6020 		  if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
6021 		    if (gimple_assign_rhs_code (ass) == COND_EXPR)
6022 		      {
6023 			if (!profitable_subgraphs.is_empty ()
6024 			    && dump_enabled_p ())
6025 			  dump_printf_loc (MSG_NOTE, vect_location,
6026 					   "not profitable because of "
6027 					   "unprofitable if-converted scalar "
6028 					   "code\n");
6029 			profitable_subgraphs.truncate (0);
6030 		      }
6031 		}
6032 	    }
6033 
6034 	  /* Finally schedule the profitable subgraphs.  */
6035 	  for (slp_instance instance : profitable_subgraphs)
6036 	    {
6037 	      if (!vectorized && dump_enabled_p ())
6038 		dump_printf_loc (MSG_NOTE, vect_location,
6039 				 "Basic block will be vectorized "
6040 				 "using SLP\n");
6041 	      vectorized = true;
6042 
6043 	      vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
6044 
6045 	      unsigned HOST_WIDE_INT bytes;
6046 	      if (dump_enabled_p ())
6047 		{
6048 		  if (GET_MODE_SIZE
6049 			(bb_vinfo->vector_mode).is_constant (&bytes))
6050 		    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
6051 				     "basic block part vectorized using %wu "
6052 				     "byte vectors\n", bytes);
6053 		  else
6054 		    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
6055 				     "basic block part vectorized using "
6056 				     "variable length vectors\n");
6057 		}
6058 	    }
6059 	}
6060       else
6061 	{
6062 	  if (dump_enabled_p ())
6063 	    dump_printf_loc (MSG_NOTE, vect_location,
6064 			     "***** Analysis failed with vector mode %s\n",
6065 			     GET_MODE_NAME (bb_vinfo->vector_mode));
6066 	}
6067 
6068       if (mode_i == 0)
6069 	autodetected_vector_mode = bb_vinfo->vector_mode;
6070 
6071       if (!fatal)
6072 	while (mode_i < vector_modes.length ()
6073 	       && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
6074 	  {
6075 	    if (dump_enabled_p ())
6076 	      dump_printf_loc (MSG_NOTE, vect_location,
6077 			       "***** The result for vector mode %s would"
6078 			       " be the same\n",
6079 			       GET_MODE_NAME (vector_modes[mode_i]));
6080 	    mode_i += 1;
6081 	  }
6082 
6083       delete bb_vinfo;
6084 
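      /* If the next mode in the list is merely the autodetected mode seen
	 with a different element width (related_vector_mode round-trips
	 between the two), analyzing it would repeat the same work, so
	 skip it.  */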
6085       if (mode_i < vector_modes.length ()
6086 	  && VECTOR_MODE_P (autodetected_vector_mode)
6087 	  && (related_vector_mode (vector_modes[mode_i],
6088 				   GET_MODE_INNER (autodetected_vector_mode))
6089 	      == autodetected_vector_mode)
6090 	  && (related_vector_mode (autodetected_vector_mode,
6091 				   GET_MODE_INNER (vector_modes[mode_i]))
6092 	      == vector_modes[mode_i]))
6093 	{
6094 	  if (dump_enabled_p ())
6095 	    dump_printf_loc (MSG_NOTE, vect_location,
6096 			     "***** Skipping vector mode %s, which would"
6097 			     " repeat the analysis for %s\n",
6098 			     GET_MODE_NAME (vector_modes[mode_i]),
6099 			     GET_MODE_NAME (autodetected_vector_mode));
6100 	  mode_i += 1;
6101 	}
6102 
6103       if (vectorized
6104 	  || mode_i == vector_modes.length ()
6105 	  || autodetected_vector_mode == VOIDmode
6106 	  /* If vect_slp_analyze_bb_1 signaled that analysis for all
6107 	     vector sizes will fail do not bother iterating.  */
6108 	  || fatal)
6109 	return vectorized;
6110 
6111       /* Try the next vector mode in the list.  */
6112       next_vector_mode = vector_modes[mode_i++];
6113       if (dump_enabled_p ())
6114 	dump_printf_loc (MSG_NOTE, vect_location,
6115 			 "***** Re-trying analysis with vector mode %s\n",
6116 			 GET_MODE_NAME (next_vector_mode));
6117     }
6118 }
6119 
6120 
6121 /* Main worker for the BB vectorizer.  Analyze and transform the basic
6122    blocks in BBS, returning true if anything in them was vectorized.  */
6123 
6124 static bool
6125 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
6126 {
6127   vec<data_reference_p> datarefs = vNULL;
6128   auto_vec<int> dataref_groups;
6129   int insns = 0;
6130   int current_group = 0;
6131 
6132   for (unsigned i = 0; i < bbs.length (); i++)
6133     {
6134       basic_block bb = bbs[i];
6135       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
6136 	   gsi_next (&gsi))
6137 	{
6138 	  gimple *stmt = gsi_stmt (gsi);
6139 	  if (is_gimple_debug (stmt))
6140 	    continue;
6141 
6142 	  insns++;
6143 
6144 	  if (gimple_location (stmt) != UNKNOWN_LOCATION)
6145 	    vect_location = stmt;
6146 
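	  /* A stmt that has no data reference we can handle terminates
	     the current group of data references.  */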
6147 	  if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
6148 					      &dataref_groups, current_group))
6149 	    ++current_group;
6150 	}
6151       /* New BBs always start a new DR group.  */
6152       ++current_group;
6153     }
6154 
6155   return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
6156 }
6157 
6158 /* Special entry for the BB vectorizer.  Analyze and transform a single
6159    if-converted BB with ORIG_LOOP's body being the non-if-converted
6160    representation.  Returns true if anything in the basic block was
6161    vectorized.  */
6162 
6163 bool
6164 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
6165 {
6166   auto_vec<basic_block> bbs;
6167   bbs.safe_push (bb);
6168   return vect_slp_bbs (bbs, orig_loop);
6169 }
6170 
6171 /* Main entry for the BB vectorizer.  Analyze and transform the basic
6172    blocks of FUN, returning true if anything in them was vectorized.  */
6173 
6174 bool
6175 vect_slp_function (function *fun)
6176 {
6177   bool r = false;
6178   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
6179   unsigned n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
6180 
6181   /* For the moment split the function into pieces to avoid making
6182      the iteration on the vector mode moot.  Split at points we know
6183      we do not handle well, which are CFG merges (SLP discovery doesn't
6184      handle non-loop-header PHIs) and loop exits.  Since pattern
6185      recog requires reverse iteration to visit uses before defs,
6186      simply chop the RPO into pieces.  */
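  /* As a hypothetical illustration: for a diamond bb2 -> { bb3, bb4 } -> bb5
     where the current region was started at bb3, bb4 is not dominated by
     bb3 and therefore starts a new region; likewise a region started inside
     a loop ends at the first block visited after the loop is exited.  */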
6187   auto_vec<basic_block> bbs;
6188   for (unsigned i = 0; i < n; i++)
6189     {
6190       basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
6191       bool split = false;
6192 
6193       /* Split when a BB is not dominated by the first block.  */
6194       if (!bbs.is_empty ()
6195 	  && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
6196 	{
6197 	  if (dump_enabled_p ())
6198 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6199 			     "splitting region at dominance boundary bb%d\n",
6200 			     bb->index);
6201 	  split = true;
6202 	}
6203       /* Split when the loop determined by the first block
6204 	 is exited.  This is because we eventually insert
6205 	 invariants at the start of the region.  */
6206       else if (!bbs.is_empty ()
6207 	       && bbs[0]->loop_father != bb->loop_father
6208 	       && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
6209 	{
6210 	  if (dump_enabled_p ())
6211 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6212 			     "splitting region at loop %d exit at bb%d\n",
6213 			     bbs[0]->loop_father->num, bb->index);
6214 	  split = true;
6215 	}
6216 
6217       if (split && !bbs.is_empty ())
6218 	{
6219 	  r |= vect_slp_bbs (bbs, NULL);
6220 	  bbs.truncate (0);
6221 	}
6222 
6223       /* We need to be able to insert at the head of the region,
6224 	 which we cannot do for a region starting with a returns-twice call.  */
6225       if (bbs.is_empty ())
6226 	if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
6227 	  if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
6228 	    {
6229 	      if (dump_enabled_p ())
6230 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6231 				 "skipping bb%d as start of region as it "
6232 				 "starts with returns-twice call\n",
6233 				 bb->index);
6234 	      continue;
6235 	    }
6236 
6237       bbs.safe_push (bb);
6238 
6239       /* When a stmt ending this block also defines a value, inserting
6240 	 a vector containing its definition after it would require
6241 	 inserting on edges.  Avoid this for now.  */
6242       if (gimple *last = last_stmt (bb))
6243 	if (gimple_get_lhs (last)
6244 	    && is_ctrl_altering_stmt (last))
6245 	  {
6246 	    if (dump_enabled_p ())
6247 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6248 			       "splitting region at control altering "
6249 			       "definition %G", last);
6250 	    r |= vect_slp_bbs (bbs, NULL);
6251 	    bbs.truncate (0);
6252 	  }
6253     }
6254 
6255   if (!bbs.is_empty ())
6256     r |= vect_slp_bbs (bbs, NULL);
6257 
6258   free (rpo);
6259 
6260   return r;
6261 }
6262 
6263 /* Build a variable-length vector in which the elements in ELTS are repeated
6264    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
6265    RESULTS and add any new instructions to SEQ.
6266 
6267    The approach we use is:
6268 
6269    (1) Find a vector mode VM with integer elements of mode IM.
6270 
6271    (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6272        ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
6273        from small vectors to IM.
6274 
6275    (3) Duplicate each ELTS'[I] into a vector of mode VM.
6276 
6277    (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
6278        correct byte contents.
6279 
6280    (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
6281 
6282    We try to find the largest IM for which this sequence works, in order
6283    to cut down on the number of interleaves.  */
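/* As one possible illustration of the scheme above (the modes actually
   chosen depend on can_duplicate_and_interleave_p): with ELTS = { a, b, c, d }
   of 16-bit elements and IM a 32-bit integer mode, step (2) view-converts
   { a, b } and { c, d } into two IM values AB and CD, step (3) duplicates
   them into { AB, AB, ... } and { CD, CD, ... }, and step (4) interleaves
   those into { AB, CD, AB, CD, ... }, which viewed as the original element
   type is the repeating sequence { a, b, c, d, a, b, c, d, ... }.  */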
6284 
6285 void
6286 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
6287 			  const vec<tree> &elts, unsigned int nresults,
6288 			  vec<tree> &results)
6289 {
6290   unsigned int nelts = elts.length ();
6291   tree element_type = TREE_TYPE (vector_type);
6292 
6293   /* (1) Find a vector mode VM with integer elements of mode IM.  */
6294   unsigned int nvectors = 1;
6295   tree new_vector_type;
6296   tree permutes[2];
6297   if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
6298 				       &nvectors, &new_vector_type,
6299 				       permutes))
6300     gcc_unreachable ();
6301 
6302   /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
6303   unsigned int partial_nelts = nelts / nvectors;
6304   tree partial_vector_type = build_vector_type (element_type, partial_nelts);
6305 
6306   tree_vector_builder partial_elts;
6307   auto_vec<tree, 32> pieces (nvectors * 2);
6308   pieces.quick_grow_cleared (nvectors * 2);
6309   for (unsigned int i = 0; i < nvectors; ++i)
6310     {
6311       /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
6312 	     ELTS' has mode IM.  */
6313       partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
6314       for (unsigned int j = 0; j < partial_nelts; ++j)
6315 	partial_elts.quick_push (elts[i * partial_nelts + j]);
6316       tree t = gimple_build_vector (seq, &partial_elts);
6317       t = gimple_build (seq, VIEW_CONVERT_EXPR,
6318 			TREE_TYPE (new_vector_type), t);
6319 
6320       /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
6321       pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
6322     }
6323 
6324   /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
6325 	 correct byte contents.
6326 
6327      Conceptually, we need to repeat the following operation log2(nvectors)
6328      times, where hi_start = nvectors / 2:
6329 
6330 	out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
6331 	out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
6332 
6333      However, if each input repeats every N elements and the VF is
6334      a multiple of N * 2, the HI result is the same as the LO result.
6335      This will be true for the first N1 iterations of the outer loop,
6336      followed by N2 iterations for which both the LO and HI results
6337      are needed.  I.e.:
6338 
6339 	N1 + N2 = log2(nvectors)
6340 
6341      Each "N1 iteration" doubles the number of redundant vectors and the
6342      effect of the process as a whole is to have a sequence of nvectors/2**N1
6343      vectors that repeats 2**N1 times.  Rather than generate these redundant
6344      vectors, we halve the number of vectors for each N1 iteration.  */
6345   unsigned int in_start = 0;
6346   unsigned int out_start = nvectors;
6347   unsigned int new_nvectors = nvectors;
6348   for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
6349     {
6350       unsigned int hi_start = new_nvectors / 2;
6351       unsigned int out_i = 0;
6352       for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
6353 	{
6354 	  if ((in_i & 1) != 0
6355 	      && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
6356 			     2 * in_repeat))
6357 	    continue;
6358 
6359 	  tree output = make_ssa_name (new_vector_type);
6360 	  tree input1 = pieces[in_start + (in_i / 2)];
6361 	  tree input2 = pieces[in_start + (in_i / 2) + hi_start];
6362 	  gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
6363 					       input1, input2,
6364 					       permutes[in_i & 1]);
6365 	  gimple_seq_add_stmt (seq, stmt);
6366 	  pieces[out_start + out_i] = output;
6367 	  out_i += 1;
6368 	}
6369       std::swap (in_start, out_start);
6370       new_nvectors = out_i;
6371     }
6372 
6373   /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
6374   results.reserve (nresults);
6375   for (unsigned int i = 0; i < nresults; ++i)
6376     if (i < new_nvectors)
6377       results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
6378 					pieces[in_start + i]));
6379     else
6380       results.quick_push (results[i - new_nvectors]);
6381 }
6382 
6383 
6384 /* For constant and loop invariant defs in OP_NODE this function creates
6385    vector defs that will be used in the vectorized stmts and stores them
6386    to SLP_TREE_VEC_DEFS of OP_NODE.  */
6387 
6388 static void
6389 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
6390 {
6391   unsigned HOST_WIDE_INT nunits;
6392   tree vec_cst;
6393   unsigned j, number_of_places_left_in_vector;
6394   tree vector_type;
6395   tree vop;
6396   int group_size = op_node->ops.length ();
6397   unsigned int vec_num, i;
6398   unsigned number_of_copies = 1;
6399   bool constant_p;
6400   gimple_seq ctor_seq = NULL;
6401   auto_vec<tree, 16> permute_results;
6402 
6403   /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
6404   vector_type = SLP_TREE_VECTYPE (op_node);
6405 
6406   unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
6407   SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
6408   auto_vec<tree> voprnds (number_of_vectors);
6409 
6410   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
6411      created vectors. It is greater than 1 if unrolling is performed.
6412 
6413      For example, we have two scalar operands, s1 and s2 (e.g., group of
6414      strided accesses of size two), while NUNITS is four (i.e., four scalars
6415      of this type can be packed in a vector).  The output vector will contain
6416      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
6417      will be 2).
6418 
6419      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
6420      containing the operands.
6421 
6422      For example, NUNITS is four as before, and the group size is 8
6423      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
6424      {s5, s6, s7, s8}.  */
6425 
6426   /* When using duplicate_and_interleave, we just need one element for
6427      each scalar statement.  */
6428   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
6429     nunits = group_size;
6430 
6431   number_of_copies = nunits * number_of_vectors / group_size;
6432 
6433   number_of_places_left_in_vector = nunits;
6434   constant_p = true;
6435   tree_vector_builder elts (vector_type, nunits, 1);
6436   elts.quick_grow (nunits);
6437   stmt_vec_info insert_after = NULL;
6438   for (j = 0; j < number_of_copies; j++)
6439     {
6440       tree op;
6441       for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
6442         {
6443           /* Create 'vect_ = {op0,op1,...,opn}'.  */
6444           number_of_places_left_in_vector--;
6445 	  tree orig_op = op;
6446 	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
6447 	    {
6448 	      if (CONSTANT_CLASS_P (op))
6449 		{
6450 		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6451 		    {
6452 		      /* Can't use VIEW_CONVERT_EXPR for booleans because
6453 			 of possibly different sizes of scalar value and
6454 			 vector element.  */
6455 		      if (integer_zerop (op))
6456 			op = build_int_cst (TREE_TYPE (vector_type), 0);
6457 		      else if (integer_onep (op))
6458 			op = build_all_ones_cst (TREE_TYPE (vector_type));
6459 		      else
6460 			gcc_unreachable ();
6461 		    }
6462 		  else
6463 		    op = fold_unary (VIEW_CONVERT_EXPR,
6464 				     TREE_TYPE (vector_type), op);
6465 		  gcc_assert (op && CONSTANT_CLASS_P (op));
6466 		}
6467 	      else
6468 		{
6469 		  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
6470 		  gimple *init_stmt;
6471 		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
6472 		    {
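		      /* As with the constant case above, the scalar boolean
			 may not match the vector element size, so build a
			 COND_EXPR selecting an all-ones or zero vector
			 element instead of a VIEW_CONVERT_EXPR.  */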
6473 		      tree true_val
6474 			= build_all_ones_cst (TREE_TYPE (vector_type));
6475 		      tree false_val
6476 			= build_zero_cst (TREE_TYPE (vector_type));
6477 		      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
6478 		      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
6479 						       op, true_val,
6480 						       false_val);
6481 		    }
6482 		  else
6483 		    {
6484 		      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
6485 				   op);
6486 		      init_stmt
6487 			= gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
6488 					       op);
6489 		    }
6490 		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
6491 		  op = new_temp;
6492 		}
6493 	    }
6494 	  elts[number_of_places_left_in_vector] = op;
6495 	  if (!CONSTANT_CLASS_P (op))
6496 	    constant_p = false;
6497 	  /* For BB vectorization we have to compute an insert location
6498 	     when a def is inside the analyzed region since we cannot
6499 	     simply insert at the BB start in this case.  */
6500 	  stmt_vec_info opdef;
6501 	  if (TREE_CODE (orig_op) == SSA_NAME
6502 	      && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
6503 	      && is_a <bb_vec_info> (vinfo)
6504 	      && (opdef = vinfo->lookup_def (orig_op)))
6505 	    {
6506 	      if (!insert_after)
6507 		insert_after = opdef;
6508 	      else
6509 		insert_after = get_later_stmt (insert_after, opdef);
6510 	    }
6511 
6512           if (number_of_places_left_in_vector == 0)
6513             {
6514 	      if (constant_p
6515 		  ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
6516 		  : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
6517 		vec_cst = gimple_build_vector (&ctor_seq, &elts);
6518 	      else
6519 		{
6520 		  if (permute_results.is_empty ())
6521 		    duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
6522 					      elts, number_of_vectors,
6523 					      permute_results);
6524 		  vec_cst = permute_results[number_of_vectors - j - 1];
6525 		}
6526 	      if (!gimple_seq_empty_p (ctor_seq))
6527 		{
6528 		  if (insert_after)
6529 		    {
6530 		      gimple_stmt_iterator gsi;
6531 		      if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
6532 			{
6533 			  gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
6534 			  gsi_insert_seq_before (&gsi, ctor_seq,
6535 						 GSI_CONTINUE_LINKING);
6536 			}
6537 		      else if (!stmt_ends_bb_p (insert_after->stmt))
6538 			{
6539 			  gsi = gsi_for_stmt (insert_after->stmt);
6540 			  gsi_insert_seq_after (&gsi, ctor_seq,
6541 						GSI_CONTINUE_LINKING);
6542 			}
6543 		      else
6544 			{
6545 			  /* When we want to insert after a def whose
6546 			     defining stmt throws, insert on the fallthru
6547 			     edge instead.  */
6548 			  edge e = find_fallthru_edge
6549 				     (gimple_bb (insert_after->stmt)->succs);
6550 			  basic_block new_bb
6551 			    = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
6552 			  gcc_assert (!new_bb);
6553 			}
6554 		    }
6555 		  else
6556 		    vinfo->insert_seq_on_entry (NULL, ctor_seq);
6557 		  ctor_seq = NULL;
6558 		}
6559 	      voprnds.quick_push (vec_cst);
6560 	      insert_after = NULL;
6561               number_of_places_left_in_vector = nunits;
6562 	      constant_p = true;
6563 	      elts.new_vector (vector_type, nunits, 1);
6564 	      elts.quick_grow (nunits);
6565             }
6566         }
6567     }
6568 
6569   /* Since the vectors are created in the reverse order, we should invert
6570      them.  */
6571   vec_num = voprnds.length ();
6572   for (j = vec_num; j != 0; j--)
6573     {
6574       vop = voprnds[j - 1];
6575       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6576     }
6577 
6578   /* If the VF is greater than the unrolling factor needed for the SLP
6579      group of stmts, NUMBER_OF_VECTORS to be created is greater than
6580      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
6581      to replicate the vectors.  */
6582   while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
6583     for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
6584 	 i++)
6585       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
6586 }
6587 
6588 /* Get the Ith vectorized definition from SLP_NODE.  */
6589 
6590 tree
6591 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
6592 {
6593   if (SLP_TREE_VEC_STMTS (slp_node).exists ())
6594     return gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]);
6595   else
6596     return SLP_TREE_VEC_DEFS (slp_node)[i];
6597 }
6598 
6599 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS.  */
6600 
6601 void
6602 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
6603 {
6604   vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
6605   if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
6606     {
6607       unsigned j;
6608       gimple *vec_def_stmt;
6609       FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (slp_node), j, vec_def_stmt)
6610 	vec_defs->quick_push (gimple_get_lhs (vec_def_stmt));
6611     }
6612   else
6613     vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
6614 }
6615 
6616 /* Get N vectorized definitions for SLP_NODE.  */
6617 
6618 void
6619 vect_get_slp_defs (vec_info *,
6620 		   slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
6621 {
6622   if (n == -1U)
6623     n = SLP_TREE_CHILDREN (slp_node).length ();
6624 
6625   for (unsigned i = 0; i < n; ++i)
6626     {
6627       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
6628       vec<tree> vec_defs = vNULL;
6629       vect_get_slp_defs (child, &vec_defs);
6630       vec_oprnds->quick_push (vec_defs);
6631     }
6632 }
6633 
6634 /* Generate vector permute statements from a list of loads in DR_CHAIN.
6635    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
6636    permute statements for the SLP node NODE.  Store the number of vector
6637    permute instructions in *N_PERMS and the number of vector load
6638    instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
6639    that were not needed.  */
6640 
6641 bool
6642 vect_transform_slp_perm_load (vec_info *vinfo,
6643 			      slp_tree node, const vec<tree> &dr_chain,
6644 			      gimple_stmt_iterator *gsi, poly_uint64 vf,
6645 			      bool analyze_only, unsigned *n_perms,
6646 			      unsigned int *n_loads, bool dce_chain)
6647 {
6648   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6649   int vec_index = 0;
6650   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6651   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
6652   unsigned int mask_element;
6653   machine_mode mode;
6654 
6655   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
6656     return false;
6657 
6658   stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6659 
6660   mode = TYPE_MODE (vectype);
6661   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6662 
6663   /* Initialize the vect stmts of NODE to properly insert the generated
6664      stmts later.  */
6665   if (! analyze_only)
6666     for (unsigned i = SLP_TREE_VEC_STMTS (node).length ();
6667 	 i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
6668       SLP_TREE_VEC_STMTS (node).quick_push (NULL);
6669 
6670   /* Generate permutation masks for every NODE. Number of masks for each NODE
6671      is equal to GROUP_SIZE.
6672      E.g., we have a group of three nodes with three loads from the same
6673      location in each node, and the vector size is 4.  I.e., we have an
6674      a0b0c0a1b1c1... sequence and we need to create the following vectors:
6675      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
6676      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
6677      ...
6678 
6679      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
6680      The last mask is illegal since we assume two operands for a permute
6681      operation, and the mask element values can't be outside that range.
6682      Hence, the last mask must be converted into {2,5,5,5}.
6683      For the first two permutations we need the first and the second input
6684      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
6685      we need the second and the third vectors: {b1,c1,a2,b2} and
6686      {c2,a3,b3,c3}.  */
6687 
6688   int vect_stmts_counter = 0;
6689   unsigned int index = 0;
6690   int first_vec_index = -1;
6691   int second_vec_index = -1;
6692   bool noop_p = true;
6693   *n_perms = 0;
6694 
6695   vec_perm_builder mask;
6696   unsigned int nelts_to_build;
6697   unsigned int nvectors_per_build;
6698   unsigned int in_nlanes;
6699   bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
6700 		      && multiple_p (nunits, group_size));
6701   if (repeating_p)
6702     {
6703       /* A single vector contains a whole number of copies of the node, so:
6704 	 (a) all permutes can use the same mask; and
6705 	 (b) the permutes only need a single vector input.  */
6706       mask.new_vector (nunits, group_size, 3);
6707       nelts_to_build = mask.encoded_nelts ();
6708       nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
6709       in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
6710     }
6711   else
6712     {
6713       /* We need to construct a separate mask for each vector statement.  */
6714       unsigned HOST_WIDE_INT const_nunits, const_vf;
6715       if (!nunits.is_constant (&const_nunits)
6716 	  || !vf.is_constant (&const_vf))
6717 	return false;
6718       mask.new_vector (const_nunits, const_nunits, 1);
6719       nelts_to_build = const_vf * group_size;
6720       nvectors_per_build = 1;
6721       in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
6722     }
6723   auto_sbitmap used_in_lanes (in_nlanes);
6724   bitmap_clear (used_in_lanes);
6725   auto_bitmap used_defs;
6726 
6727   unsigned int count = mask.encoded_nelts ();
6728   mask.quick_grow (count);
6729   vec_perm_indices indices;
6730 
6731   for (unsigned int j = 0; j < nelts_to_build; j++)
6732     {
6733       unsigned int iter_num = j / group_size;
6734       unsigned int stmt_num = j % group_size;
6735       unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
6736 			+ SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
6737       bitmap_set_bit (used_in_lanes, i);
6738       if (repeating_p)
6739 	{
6740 	  first_vec_index = 0;
6741 	  mask_element = i;
6742 	}
6743       else
6744 	{
6745 	  /* Enforced before the loop when !repeating_p.  */
6746 	  unsigned int const_nunits = nunits.to_constant ();
6747 	  vec_index = i / const_nunits;
6748 	  mask_element = i % const_nunits;
6749 	  if (vec_index == first_vec_index
6750 	      || first_vec_index == -1)
6751 	    {
6752 	      first_vec_index = vec_index;
6753 	    }
6754 	  else if (vec_index == second_vec_index
6755 		   || second_vec_index == -1)
6756 	    {
6757 	      second_vec_index = vec_index;
6758 	      mask_element += const_nunits;
6759 	    }
6760 	  else
6761 	    {
6762 	      if (dump_enabled_p ())
6763 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6764 				 "permutation requires at "
6765 				 "least three vectors %G",
6766 				 stmt_info->stmt);
6767 	      gcc_assert (analyze_only);
6768 	      return false;
6769 	    }
6770 
6771 	  gcc_assert (mask_element < 2 * const_nunits);
6772 	}
6773 
6774       if (mask_element != index)
6775 	noop_p = false;
6776       mask[index++] = mask_element;
6777 
6778       if (index == count && !noop_p)
6779 	{
6780 	  indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
6781 	  if (!can_vec_perm_const_p (mode, indices))
6782 	    {
6783 	      if (dump_enabled_p ())
6784 		{
6785 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
6786 				   vect_location,
6787 				   "unsupported vect permute { ");
6788 		  for (i = 0; i < count; ++i)
6789 		    {
6790 		      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
6791 		      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
6792 		    }
6793 		  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
6794 		}
6795 	      gcc_assert (analyze_only);
6796 	      return false;
6797 	    }
6798 
6799 	  ++*n_perms;
6800 	}
6801 
6802       if (index == count)
6803 	{
6804 	  if (!analyze_only)
6805 	    {
6806 	      tree mask_vec = NULL_TREE;
6807 
6808 	      if (! noop_p)
6809 		mask_vec = vect_gen_perm_mask_checked (vectype, indices);
6810 
6811 	      if (second_vec_index == -1)
6812 		second_vec_index = first_vec_index;
6813 
6814 	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
6815 		{
6816 		  /* Generate the permute statement if necessary.  */
6817 		  tree first_vec = dr_chain[first_vec_index + ri];
6818 		  tree second_vec = dr_chain[second_vec_index + ri];
6819 		  gimple *perm_stmt;
6820 		  if (! noop_p)
6821 		    {
6822 		      gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6823 		      tree perm_dest
6824 			= vect_create_destination_var (gimple_assign_lhs (stmt),
6825 						       vectype);
6826 		      perm_dest = make_ssa_name (perm_dest);
6827 		      perm_stmt
6828 			= gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6829 					       first_vec, second_vec,
6830 					       mask_vec);
6831 		      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
6832 						   gsi);
6833 		      if (dce_chain)
6834 			{
6835 			  bitmap_set_bit (used_defs, first_vec_index + ri);
6836 			  bitmap_set_bit (used_defs, second_vec_index + ri);
6837 			}
6838 		    }
6839 		  else
6840 		    {
6841 		      /* If mask was NULL_TREE generate the requested
6842 			 identity transform.  */
6843 		      perm_stmt = SSA_NAME_DEF_STMT (first_vec);
6844 		      if (dce_chain)
6845 			bitmap_set_bit (used_defs, first_vec_index + ri);
6846 		    }
6847 
6848 		  /* Store the vector statement in NODE.  */
6849 		  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt;
6850 		}
6851 	    }
6852 
6853 	  index = 0;
6854 	  first_vec_index = -1;
6855 	  second_vec_index = -1;
6856 	  noop_p = true;
6857 	}
6858     }
6859 
6860   if (n_loads)
6861     {
6862       if (repeating_p)
6863 	*n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
6864       else
6865 	{
6866 	  /* Enforced above when !repeating_p.  */
6867 	  unsigned int const_nunits = nunits.to_constant ();
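	  /* Count the input vectors that have at least one used lane;
	     vectors none of whose lanes are used need not be loaded.  */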
6868 	  *n_loads = 0;
6869 	  bool load_seen = false;
6870 	  for (unsigned i = 0; i < in_nlanes; ++i)
6871 	    {
6872 	      if (i % const_nunits == 0)
6873 		{
6874 		  if (load_seen)
6875 		    *n_loads += 1;
6876 		  load_seen = false;
6877 		}
6878 	      if (bitmap_bit_p (used_in_lanes, i))
6879 		load_seen = true;
6880 	    }
6881 	  if (load_seen)
6882 	    *n_loads += 1;
6883 	}
6884     }
6885 
6886   if (dce_chain)
6887     for (unsigned i = 0; i < dr_chain.length (); ++i)
6888       if (!bitmap_bit_p (used_defs, i))
6889 	{
6890 	  gimple *stmt = SSA_NAME_DEF_STMT (dr_chain[i]);
6891 	  gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
6892 	  gsi_remove (&rgsi, true);
6893 	  release_defs (stmt);
6894 	}
6895 
6896   return true;
6897 }
6898 
6899 /* Produce the next vector result for SLP permutation NODE by adding a vector
6900    statement at GSI.  If MASK_VEC is nonnull, add:
6901 
6902       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
6903 
6904    otherwise add:
6905 
6906       <new SSA name> = FIRST_DEF.  */
6907 
6908 static void
6909 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6910 			  slp_tree node, tree first_def, tree second_def,
6911 			  tree mask_vec)
6912 {
6913   tree vectype = SLP_TREE_VECTYPE (node);
6914 
6915   /* ???  We SLP match existing vector element extracts but
6916      allow punning which we need to re-instantiate at uses
6917      but have no good way of explicitly representing.  */
6918   if (!types_compatible_p (TREE_TYPE (first_def), vectype))
6919     {
6920       gassign *conv_stmt
6921 	= gimple_build_assign (make_ssa_name (vectype),
6922 			       build1 (VIEW_CONVERT_EXPR, vectype, first_def));
6923       vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6924       first_def = gimple_assign_lhs (conv_stmt);
6925     }
6926   gassign *perm_stmt;
6927   tree perm_dest = make_ssa_name (vectype);
6928   if (mask_vec)
6929     {
6930       if (!types_compatible_p (TREE_TYPE (second_def), vectype))
6931 	{
6932 	  gassign *conv_stmt
6933 	    = gimple_build_assign (make_ssa_name (vectype),
6934 				   build1 (VIEW_CONVERT_EXPR,
6935 					   vectype, second_def));
6936 	  vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
6937 	  second_def = gimple_assign_lhs (conv_stmt);
6938 	}
6939       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
6940 				       first_def, second_def,
6941 				       mask_vec);
6942     }
6943   else
6944     /* We need a copy here in case the def was external.  */
6945     perm_stmt = gimple_build_assign (perm_dest, first_def);
6946   vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
6947   /* Store the vector statement in NODE.  */
6948   SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
6949 }
6950 
6951 /* Vectorize the SLP permutations in NODE as specified
6952    in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
6953    child number and lane number.
6954    Interleaving of two two-lane two-child SLP subtrees (not supported):
6955      [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
6956    A blend of two four-lane two-child SLP subtrees:
6957      [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
6958    Highpart of a four-lane one-child SLP subtree (not supported):
6959      [ { 0, 2 }, { 0, 3 } ]
6960    Where currently only a subset is supported by code generating below.  */
6961 
6962 static bool
6963 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
6964 			      slp_tree node, stmt_vector_for_cost *cost_vec)
6965 {
6966   tree vectype = SLP_TREE_VECTYPE (node);
6967 
6968   /* ???  We currently only support all same vector input and output types
6969      while the SLP IL should really do a concat + select and thus accept
6970      arbitrary mismatches.  */
6971   slp_tree child;
6972   unsigned i;
6973   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6974   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
6975   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6976     {
6977       if (!vect_maybe_update_slp_op_vectype (child, vectype)
6978 	  || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
6979 	{
6980 	  if (dump_enabled_p ())
6981 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6982 			     "Unsupported lane permutation\n");
6983 	  return false;
6984 	}
6985       if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
6986 	repeating_p = false;
6987     }
6988 
6989   vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
6990   gcc_assert (perm.length () == SLP_TREE_LANES (node));
6991   if (dump_enabled_p ())
6992     {
6993       dump_printf_loc (MSG_NOTE, vect_location,
6994 		       "vectorizing permutation");
6995       for (unsigned i = 0; i < perm.length (); ++i)
6996 	dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
6997       if (repeating_p)
6998 	dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
6999       dump_printf (MSG_NOTE, "\n");
7000     }
7001 
7002   /* REPEATING_P is true if every output vector is guaranteed to use the
7003      same permute vector.  We can handle that case for both variable-length
7004      and constant-length vectors, but we only handle other cases for
7005      constant-length vectors.
7006 
7007      Set:
7008 
7009      - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
7010        mask vector that we want to build.
7011 
7012      - NCOPIES to the number of copies of PERM that we need in order
7013        to build the necessary permute mask vectors.
7014 
7015      - NOUTPUTS_PER_MASK to the number of output vectors we want to create
7016        for each permute mask vector.  This is only relevant when GSI is
7017        nonnull.  */
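  /* A hypothetical example: a two-lane node with lane permutation
     { op0[1], op0[0] } and REPEATING_P true gives NPATTERNS = 2 and
     NELTS_PER_PATTERN = 3, encoding the mask { 1, 0, 3, 2, 5, 4, ... },
     i.e. a stepped series that swaps each pair of lanes across the
     whole vector.  */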
7018   uint64_t npatterns;
7019   unsigned nelts_per_pattern;
7020   uint64_t ncopies;
7021   unsigned noutputs_per_mask;
7022   if (repeating_p)
7023     {
7024       /* We need a single permute mask vector that has the form:
7025 
7026 	   { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
7027 
7028 	 In other words, the original n-element permute in PERM is
7029 	 "unrolled" to fill a full vector.  The stepped vector encoding
7030 	 that we use for permutes requires 3n elements.  */
7031       npatterns = SLP_TREE_LANES (node);
7032       nelts_per_pattern = ncopies = 3;
7033       noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
7034     }
7035   else
7036     {
7037       /* Calculate every element of every permute mask vector explicitly,
7038 	 instead of relying on the pattern described above.  */
7039       if (!nunits.is_constant (&npatterns))
7040 	return false;
7041       nelts_per_pattern = ncopies = 1;
7042       if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
7043 	if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
7044 	  return false;
7045       noutputs_per_mask = 1;
7046     }
7047   unsigned olanes = ncopies * SLP_TREE_LANES (node);
7048   gcc_assert (repeating_p || multiple_p (olanes, nunits));
7049 
7050   /* Compute the { { SLP operand, vector index}, lane } permutation sequence
7051      from the { SLP operand, scalar lane } permutation as recorded in the
7052      SLP node as intermediate step.  This part should already work
7053      with SLP children with arbitrary number of lanes.  */
7054   auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
7055   auto_vec<unsigned> active_lane;
7056   vperm.create (olanes);
7057   active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
7058   for (unsigned i = 0; i < ncopies; ++i)
7059     {
7060       for (unsigned pi = 0; pi < perm.length (); ++pi)
7061 	{
7062 	  std::pair<unsigned, unsigned> p = perm[pi];
7063 	  tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
7064 	  if (repeating_p)
7065 	    vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
7066 	  else
7067 	    {
7068 	      /* We checked above that the vectors are constant-length.  */
7069 	      unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
7070 	      unsigned vi = (active_lane[p.first] + p.second) / vnunits;
7071 	      unsigned vl = (active_lane[p.first] + p.second) % vnunits;
7072 	      vperm.quick_push ({{p.first, vi}, vl});
7073 	    }
7074 	}
7075       /* Advance to the next group.  */
7076       for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
7077 	active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
7078     }
7079 
7080   if (dump_enabled_p ())
7081     {
7082       dump_printf_loc (MSG_NOTE, vect_location, "as");
7083       for (unsigned i = 0; i < vperm.length (); ++i)
7084 	{
7085 	  if (i != 0
7086 	      && (repeating_p
7087 		  ? multiple_p (i, npatterns)
7088 		  : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
7089 	    dump_printf (MSG_NOTE, ",");
7090 	  dump_printf (MSG_NOTE, " vops%u[%u][%u]",
7091 		       vperm[i].first.first, vperm[i].first.second,
7092 		       vperm[i].second);
7093 	}
7094       dump_printf (MSG_NOTE, "\n");
7095     }
7096 
7097   /* We can only handle two-vector permutes, everything else should
7098      be lowered on the SLP level.  The following is closely inspired
7099      by vect_transform_slp_perm_load and is supposed to eventually
7100      replace it.
7101      ???   As intermediate step do code-gen in the SLP tree representation
7102      somehow?  */
7103   std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
7104   std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
7105   unsigned int index = 0;
7106   poly_uint64 mask_element;
7107   vec_perm_builder mask;
7108   mask.new_vector (nunits, npatterns, nelts_per_pattern);
7109   unsigned int count = mask.encoded_nelts ();
7110   mask.quick_grow (count);
7111   vec_perm_indices indices;
7112   unsigned nperms = 0;
7113   for (unsigned i = 0; i < vperm.length (); ++i)
7114     {
7115       mask_element = vperm[i].second;
7116       if (first_vec.first == -1U
7117 	  || first_vec == vperm[i].first)
7118 	first_vec = vperm[i].first;
7119       else if (second_vec.first == -1U
7120 	       || second_vec == vperm[i].first)
7121 	{
7122 	  second_vec = vperm[i].first;
7123 	  mask_element += nunits;
7124 	}
7125       else
7126 	{
7127 	  if (dump_enabled_p ())
7128 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7129 			     "permutation requires at "
7130 			     "least three vectors\n");
7131 	  gcc_assert (!gsi);
7132 	  return false;
7133 	}
7134 
7135       mask[index++] = mask_element;
7136 
7137       if (index == count)
7138 	{
7139 	  indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
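	  /* An identity series 0, 1, 2, ... needs no VEC_PERM_EXPR;
	     only a copy of the first input is emitted in that case.  */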
7140 	  bool identity_p = indices.series_p (0, 1, 0, 1);
7141 	  if (!identity_p
7142 	      && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
7143 	    {
7144 	      if (dump_enabled_p ())
7145 		{
7146 		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
7147 				   vect_location,
7148 				   "unsupported vect permute { ");
7149 		  for (i = 0; i < count; ++i)
7150 		    {
7151 		      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
7152 		      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
7153 		    }
7154 		  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
7155 		}
7156 	      gcc_assert (!gsi);
7157 	      return false;
7158 	    }
7159 
7160 	  if (!identity_p)
7161 	    nperms++;
7162 	  if (gsi)
7163 	    {
7164 	      if (second_vec.first == -1U)
7165 		second_vec = first_vec;
7166 
7167 	      slp_tree
7168 		first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
7169 		second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
7170 
7171 	      tree mask_vec = NULL_TREE;
7172 	      if (!identity_p)
7173 		mask_vec = vect_gen_perm_mask_checked (vectype, indices);
7174 
7175 	      for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
7176 		{
7177 		  tree first_def
7178 		    = vect_get_slp_vect_def (first_node,
7179 					     first_vec.second + vi);
7180 		  tree second_def
7181 		    = vect_get_slp_vect_def (second_node,
7182 					     second_vec.second + vi);
7183 		  vect_add_slp_permutation (vinfo, gsi, node, first_def,
7184 					    second_def, mask_vec);
7185 		}
7186 	    }
7187 
7188 	  index = 0;
7189 	  first_vec = std::make_pair (-1U, -1U);
7190 	  second_vec = std::make_pair (-1U, -1U);
7191 	}
7192     }
7193 
7194   if (!gsi)
7195     record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
7196 
7197   return true;
7198 }
7199 
7200 /* Vectorize SLP NODE.  */
7201 
7202 static void
7203 vect_schedule_slp_node (vec_info *vinfo,
7204 			slp_tree node, slp_instance instance)
7205 {
7206   gimple_stmt_iterator si;
7207   int i;
7208   slp_tree child;
7209 
7210   /* For existing vectors there's nothing to do.  */
7211   if (SLP_TREE_VEC_DEFS (node).exists ())
7212     return;
7213 
7214   gcc_assert (SLP_TREE_VEC_STMTS (node).is_empty ());
7215 
7216   /* Vectorize externals and constants.  */
7217   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
7218       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
7219     {
7220       /* ???  vectorizable_shift can end up using a scalar operand which is
7221 	 currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
7222 	 node in this case.  */
7223       if (!SLP_TREE_VECTYPE (node))
7224 	return;
7225 
7226       vect_create_constant_vectors (vinfo, node);
7227       return;
7228     }
7229 
7230   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7231 
7232   gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
7233   SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
7234 
7235   if (dump_enabled_p ())
7236     dump_printf_loc (MSG_NOTE, vect_location,
7237 		     "------>vectorizing SLP node starting from: %G",
7238 		     stmt_info->stmt);
7239 
7240   if (STMT_VINFO_DATA_REF (stmt_info)
7241       && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
7242     {
7243       /* Vectorized loads go before the first scalar load to make it
7244 	 ready early, vectorized stores go before the last scalar
7245 	 stmt which is where all uses are ready.  */
7246       stmt_vec_info last_stmt_info = NULL;
7247       if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
7248 	last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
7249       else /* DR_IS_WRITE */
7250 	last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
7251       si = gsi_for_stmt (last_stmt_info->stmt);
7252     }
7253   else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
7254 	    || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
7255 	    || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
7256 	   && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
7257     {
7258       /* For PHI node vectorization we do not use the insertion iterator.  */
7259       si = gsi_none ();
7260     }
7261   else
7262     {
7263       /* Emit other stmts after the children's vectorized defs, which is
7264 	 the earliest possible place.  */
7265       gimple *last_stmt = NULL;
7266       bool seen_vector_def = false;
7267       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7268 	if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7269 	  {
7270 	    /* For fold-left reductions we are retaining the scalar
7271 	       reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
7272 	       set so the representation isn't perfect.  Resort to the
7273 	       last scalar def here.  */
7274 	    if (SLP_TREE_VEC_STMTS (child).is_empty ())
7275 	      {
7276 		gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
7277 			    == cycle_phi_info_type);
7278 		gphi *phi = as_a <gphi *>
7279 			      (vect_find_last_scalar_stmt_in_slp (child)->stmt);
7280 		if (!last_stmt
7281 		    || vect_stmt_dominates_stmt_p (last_stmt, phi))
7282 		  last_stmt = phi;
7283 	      }
7284 	    /* We are emitting all vectorized stmts in the same place and
7285 	       the last one is the last.
7286 	       ???  Unless we have a load permutation applied and that
7287 	       figures to re-use an earlier generated load.  */
7288 	    unsigned j;
7289 	    gimple *vstmt;
7290 	    FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (child), j, vstmt)
7291 	      if (!last_stmt
7292 		  || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
7293 		last_stmt = vstmt;
7294 	  }
7295 	else if (!SLP_TREE_VECTYPE (child))
7296 	  {
7297 	    /* For externals used unvectorized we look at all their scalar defs.  */
7298 	    unsigned j;
7299 	    tree def;
7300 	    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
7301 	      if (TREE_CODE (def) == SSA_NAME
7302 		  && !SSA_NAME_IS_DEFAULT_DEF (def))
7303 		{
7304 		  gimple *stmt = SSA_NAME_DEF_STMT (def);
7305 		  if (!last_stmt
7306 		      || vect_stmt_dominates_stmt_p (last_stmt, stmt))
7307 		    last_stmt = stmt;
7308 		}
7309 	  }
7310 	else
7311 	  {
7312 	    /* For externals we have to look at all defs since their
7313 	       insertion place is decided per vector.  But beware
7314 	       of pre-existing vectors where we need to make sure
7315 	       we do not insert before the region boundary.  */
7316 	    if (SLP_TREE_SCALAR_OPS (child).is_empty ()
7317 		&& !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
7318 	      seen_vector_def = true;
7319 	    else
7320 	      {
7321 		unsigned j;
7322 		tree vdef;
7323 		FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
7324 		  if (TREE_CODE (vdef) == SSA_NAME
7325 		      && !SSA_NAME_IS_DEFAULT_DEF (vdef))
7326 		    {
7327 		      gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
7328 		      if (!last_stmt
7329 			  || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
7330 			last_stmt = vstmt;
7331 		    }
7332 	      }
7333 	  }
7334       /* This can happen when all children are pre-existing vectors or
7335 	 constants.  */
7336       if (!last_stmt)
7337 	last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
7338       if (!last_stmt)
7339 	{
7340 	  gcc_assert (seen_vector_def);
7341 	  si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
7342 	}
7343       else if (is_ctrl_altering_stmt (last_stmt))
7344 	{
7345 	  /* We split regions to vectorize at control altering stmts
7346 	     with a definition so this must be an external which
7347 	     we can insert at the start of the region.  */
7348 	  si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
7349 	}
7350       else if (is_a <bb_vec_info> (vinfo)
7351 	       && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
7352 	       && gimple_could_trap_p (stmt_info->stmt))
7353 	{
7354 	  /* We've constrained possibly trapping operations to all come
7355 	     from the same basic-block, if vectorized defs would allow earlier
7356 	     scheduling still force vectorized stmts to the original block.
7357 	     This is only necessary for BB vectorization since for loop vect
7358 	     all operations are in a single BB and scalar stmt based
7359 	     placement doesn't play well with epilogue vectorization.  */
7360 	  gcc_assert (dominated_by_p (CDI_DOMINATORS,
7361 				      gimple_bb (stmt_info->stmt),
7362 				      gimple_bb (last_stmt)));
7363 	  si = gsi_after_labels (gimple_bb (stmt_info->stmt));
7364 	}
7365       else if (is_a <gphi *> (last_stmt))
7366 	si = gsi_after_labels (gimple_bb (last_stmt));
7367       else
7368 	{
7369 	  si = gsi_for_stmt (last_stmt);
7370 	  gsi_next (&si);
7371 	}
7372     }
7373 
7374   /* Handle purely internal nodes.  */
7375   if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7376     {
7377       /* ???  the transform kind is stored to STMT_VINFO_TYPE which might
7378 	 be shared with different SLP nodes (but usually it's the same
7379 	 operation apart from the case the stmt is only there for denoting
7380 	 the actual scalar lane defs ...).  So do not call vect_transform_stmt
7381 	 but open-code it here (partly).  */
7382       bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
7383       gcc_assert (done);
7384       stmt_vec_info slp_stmt_info;
7385       unsigned int i;
7386       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
7387 	if (STMT_VINFO_LIVE_P (slp_stmt_info))
7388 	  {
7389 	    done = vectorizable_live_operation (vinfo,
7390 						slp_stmt_info, &si, node,
7391 						instance, i, true, NULL);
7392 	    gcc_assert (done);
7393 	  }
7394     }
7395   else
7396     vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
7397 }
7398 
7399 /* Replace scalar calls from SLP node NODE by setting their lhs to zero.
7400    For loop vectorization this is done in vectorizable_call, but for SLP
7401    it needs to be deferred until the end of vect_schedule_slp, because
7402    multiple SLP instances may refer to the same scalar stmt.  */
7403 
7404 static void
7405 vect_remove_slp_scalar_calls (vec_info *vinfo,
7406 			      slp_tree node, hash_set<slp_tree> &visited)
7407 {
7408   gimple *new_stmt;
7409   gimple_stmt_iterator gsi;
7410   int i;
7411   slp_tree child;
7412   tree lhs;
7413   stmt_vec_info stmt_info;
7414 
7415   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7416     return;
7417 
7418   if (visited.add (node))
7419     return;
7420 
7421   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7422     vect_remove_slp_scalar_calls (vinfo, child, visited);
7423 
7424   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7425     {
7426       gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
7427       if (!stmt || gimple_bb (stmt) == NULL)
7428 	continue;
7429       if (is_pattern_stmt_p (stmt_info)
7430 	  || !PURE_SLP_STMT (stmt_info))
7431 	continue;
7432       lhs = gimple_call_lhs (stmt);
7433       new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
7434       gsi = gsi_for_stmt (stmt);
7435       vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
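      /* Make sure the SSA definition of the reused lhs now points at
	 the replacement assignment.  */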
7436       SSA_NAME_DEF_STMT (gimple_assign_lhs (new_stmt)) = new_stmt;
7437     }
7438 }
7439 
7440 static void
7441 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
7442 {
7443   hash_set<slp_tree> visited;
7444   vect_remove_slp_scalar_calls (vinfo, node, visited);
7445 }
7446 
7447 /* Vectorize the instance root.  */
7448 
7449 void
7450 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
7451 {
7452   gassign *rstmt = NULL;
7453 
7454   if (instance->kind == slp_inst_kind_ctor)
7455     {
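      /* A single vector statement covering the whole constructor can be
	 assigned to the root lhs directly, with a VIEW_CONVERT_EXPR if
	 the vector types differ; multiple vector statements are
	 re-assembled with a CONSTRUCTOR below.  */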
7456       if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
7457 	{
7458 	  gimple *child_stmt = SLP_TREE_VEC_STMTS (node)[0];
7459 	  tree vect_lhs = gimple_get_lhs (child_stmt);
7460 	  tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7461 	  if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
7462 					  TREE_TYPE (vect_lhs)))
7463 	    vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
7464 			       vect_lhs);
7465 	  rstmt = gimple_build_assign (root_lhs, vect_lhs);
7466 	}
7467       else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
7468 	{
7469 	  int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
7470 	  gimple *child_stmt;
7471 	  int j;
7472 	  vec<constructor_elt, va_gc> *v;
7473 	  vec_alloc (v, nelts);
7474 
7475 	  /* A CTOR can handle V16HI composition from VNx8HI, so we
7476 	     do not need to convert vector elements if the types
7477 	     do not match.  */
7478 	  FOR_EACH_VEC_ELT (SLP_TREE_VEC_STMTS (node), j, child_stmt)
7479 	    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
7480 				    gimple_get_lhs (child_stmt));
7481 	  tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
7482 	  tree rtype
7483 	    = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
7484 	  tree r_constructor = build_constructor (rtype, v);
7485 	  rstmt = gimple_build_assign (lhs, r_constructor);
7486 	}
7487     }
7488   else if (instance->kind == slp_inst_kind_bb_reduc)
7489     {
7490       /* Largely inspired by reduction chain epilogue handling in
7491 	 vect_create_epilog_for_reduction.  */
7492       vec<tree> vec_defs = vNULL;
7493       vect_get_slp_defs (node, &vec_defs);
7494       enum tree_code reduc_code
7495 	= gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
7496       /* ???  We actually have to reflect signs somewhere.  */
7497       if (reduc_code == MINUS_EXPR)
7498 	reduc_code = PLUS_EXPR;
7499       gimple_seq epilogue = NULL;
7500       /* We may end up with more than one vector result; reduce them
7501 	 to one vector.  */
7502       tree vec_def = vec_defs[0];
7503       for (unsigned i = 1; i < vec_defs.length (); ++i)
7504 	vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
7505 				vec_def, vec_defs[i]);
7506       vec_defs.release ();
7507       /* ???  Support other schemes than direct internal fn.  */
7508       internal_fn reduc_fn;
7509       if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
7510 	  || reduc_fn == IFN_LAST)
7511 	gcc_unreachable ();
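      /* Reduce the single remaining vector to a scalar with the
	 target's reduction internal function.  */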
7512       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
7513 				      TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
7514 
7515       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7516       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
7517       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
7518       update_stmt (gsi_stmt (rgsi));
7519       return;
7520     }
7521   else
7522     gcc_unreachable ();
7523 
7524   gcc_assert (rstmt);
7525 
7526   gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
7527   gsi_replace (&rgsi, rstmt, true);
7528 }
7529 
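/* Per-node state of the Tarjan SCC discovery performed by
   vect_schedule_scc below: the DFS pre-order number, the low-link
   value and whether the node is still on the DFS stack.  */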
7530 struct slp_scc_info
7531 {
7532   bool on_stack;
7533   int dfs;
7534   int lowlink;
7535 };
7536 
7537 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.  */
7538 
7539 static void
7540 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
7541 		   hash_map<slp_tree, slp_scc_info> &scc_info,
7542 		   int &maxdfs, vec<slp_tree> &stack)
7543 {
7544   bool existed_p;
7545   slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
7546   gcc_assert (!existed_p);
7547   info->dfs = maxdfs;
7548   info->lowlink = maxdfs;
7549   maxdfs++;
7550 
7551   /* Leaf.  */
7552   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7553     {
7554       info->on_stack = false;
7555       vect_schedule_slp_node (vinfo, node, instance);
7556       return;
7557     }
7558 
7559   info->on_stack = true;
7560   stack.safe_push (node);
7561 
7562   unsigned i;
7563   slp_tree child;
7564   /* DFS recurse.  */
7565   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7566     {
7567       if (!child)
7568 	continue;
7569       slp_scc_info *child_info = scc_info.get (child);
7570       if (!child_info)
7571 	{
7572 	  vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
7573 	  /* Recursion might have re-allocated the hash map, invalidating INFO.  */
7574 	  info = scc_info.get (node);
7575 	  child_info = scc_info.get (child);
7576 	  info->lowlink = MIN (info->lowlink, child_info->lowlink);
7577 	}
7578       else if (child_info->on_stack)
7579 	info->lowlink = MIN (info->lowlink, child_info->dfs);
7580     }
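  /* NODE is not the root of an SCC - leave it on the stack, it will be
     scheduled together with the rest of its SCC once the root is
     reached.  */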
7581   if (info->lowlink != info->dfs)
7582     return;
7583 
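  /* NODE is the root of an SCC (possibly a singleton).  Schedule its
     members and collect vectorized PHIs whose backedge arguments still
     need to be filled in afterwards.  */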
7584   auto_vec<slp_tree, 4> phis_to_fixup;
7585 
7586   /* Singleton.  */
7587   if (stack.last () == node)
7588     {
7589       stack.pop ();
7590       info->on_stack = false;
7591       vect_schedule_slp_node (vinfo, node, instance);
7592       if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
7593 	  && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
7594 	phis_to_fixup.quick_push (node);
7595     }
7596   else
7597     {
7598       /* SCC.  */
7599       int last_idx = stack.length () - 1;
7600       while (stack[last_idx] != node)
7601 	last_idx--;
7602       /* We can break the cycle at PHIs that have at least one
7603 	 code-generated child.  Then we could re-start the DFS walk until
7604 	 all nodes in the SCC are covered (we might have new entries
7605 	 for only back-reachable nodes).  But it's simpler to just
7606 	 iterate and schedule those that are ready.  */
7607       unsigned todo = stack.length () - last_idx;
7608       do
7609 	{
7610 	  for (int idx = stack.length () - 1; idx >= last_idx; --idx)
7611 	    {
7612 	      slp_tree entry = stack[idx];
7613 	      if (!entry)
7614 		continue;
7615 	      bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
7616 			  && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
7617 	      bool ready = !phi;
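	      /* A non-PHI is ready once none of its children are still
		 on the stack; a PHI is ready as soon as at least one
		 child has been scheduled (or is absent); its remaining
		 arguments are filled in by the backedge fixup below.  */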
7618 	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
7619 		  if (!child)
7620 		    {
7621 		      gcc_assert (phi);
7622 		      ready = true;
7623 		      break;
7624 		    }
7625 		  else if (scc_info.get (child)->on_stack)
7626 		    {
7627 		      if (!phi)
7628 			{
7629 			  ready = false;
7630 			  break;
7631 			}
7632 		    }
7633 		  else
7634 		    {
7635 		      if (phi)
7636 			{
7637 			  ready = true;
7638 			  break;
7639 			}
7640 		    }
7641 	      if (ready)
7642 		{
7643 		  vect_schedule_slp_node (vinfo, entry, instance);
7644 		  scc_info.get (entry)->on_stack = false;
7645 		  stack[idx] = NULL;
7646 		  todo--;
7647 		  if (phi)
7648 		    phis_to_fixup.safe_push (entry);
7649 		}
7650 	    }
7651 	}
7652       while (todo != 0);
7653 
7654       /* Pop the SCC.  */
7655       stack.truncate (last_idx);
7656     }
7657 
7658   /* Now fix up the backedge defs of the vectorized PHIs in this SCC.  */
7659   slp_tree phi_node;
7660   FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
7661     {
7662       gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
7663       edge_iterator ei;
7664       edge e;
7665       FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
7666 	{
7667 	  unsigned dest_idx = e->dest_idx;
7668 	  child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
7669 	  if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
7670 	    continue;
7671 	  /* Simply fill all args.  */
7672 	  for (unsigned i = 0; i < SLP_TREE_VEC_STMTS (phi_node).length (); ++i)
7673 	    add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[i]),
7674 			 vect_get_slp_vect_def (child, i),
7675 			 e, gimple_phi_arg_location (phi, dest_idx));
7676 	}
7677     }
7678 }
7679 
7680 /* Generate vector code for SLP_INSTANCES in the loop/basic block.  */
7681 
7682 void
7683 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
7684 {
7685   slp_instance instance;
7686   unsigned int i;
7687 
7688   hash_map<slp_tree, slp_scc_info> scc_info;
7689   int maxdfs = 0;
7690   FOR_EACH_VEC_ELT (slp_instances, i, instance)
7691     {
7692       slp_tree node = SLP_INSTANCE_TREE (instance);
7693       if (dump_enabled_p ())
7694 	{
7695 	  dump_printf_loc (MSG_NOTE, vect_location,
7696 			   "Vectorizing SLP tree:\n");
7697 	  /* ???  Dump all?  */
7698 	  if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7699 	    dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
7700 			 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
7701 	  vect_print_slp_graph (MSG_NOTE, vect_location,
7702 				SLP_INSTANCE_TREE (instance));
7703 	}
7704       /* Schedule the tree of INSTANCE, scheduling SCCs so that a
7705 	 PHI is the node that breaks the cycle.  */
7706       auto_vec<slp_tree> stack;
7707       if (!scc_info.get (node))
7708 	vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
7709 
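      /* Code generate the root of the instance (a constructor or an
	 in-BB reduction epilogue) now that all of its operands are
	 available.  */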
7710       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7711 	vectorize_slp_instance_root_stmt (node, instance);
7712 
7713       if (dump_enabled_p ())
7714 	dump_printf_loc (MSG_NOTE, vect_location,
7715                          "vectorizing stmts using SLP.\n");
7716     }
7717 
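  /* Finally purge scalar stmts made redundant by the vector code:
     scalar calls (for loop vectorization only) and the original stmts
     of vectorized stores.  */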
7718   FOR_EACH_VEC_ELT (slp_instances, i, instance)
7719     {
7720       slp_tree root = SLP_INSTANCE_TREE (instance);
7721       stmt_vec_info store_info;
7722       unsigned int j;
7723 
7724       /* Remove scalar call stmts.  Do not do this for basic-block
7725 	 vectorization as not all uses may be vectorized.
7726 	 ???  Why should this be necessary?  DCE should be able to
7727 	 remove the stmts itself.
7728 	 ???  For BB vectorization we could as well remove scalar
7729 	 stmts starting from the SLP tree root if they have no
7730 	 uses.  */
7731       if (is_a <loop_vec_info> (vinfo))
7732 	vect_remove_slp_scalar_calls (vinfo, root);
7733 
7734       /* Remove the vectorized stores' original scalar stmts.  */
7735       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
7736         {
7737 	  if (!STMT_VINFO_DATA_REF (store_info)
7738 	      || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
7739 	    break;
7740 
7741 	  store_info = vect_orig_stmt (store_info);
7742 	  /* Free the attached stmt_vec_info and remove the stmt.  */
7743 	  vinfo->remove_stmt (store_info);
7744 
7745 	  /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
7746 	     so we do not crash in vect_free_slp_tree later.  */
7747 	  if (SLP_TREE_REPRESENTATIVE (root) == store_info)
7748 	    SLP_TREE_REPRESENTATIVE (root) = NULL;
7749         }
7750     }
7751 }
7752